From 0595979e1f0514b3da28762f336b3b9ac9eec5c5 Mon Sep 17 00:00:00 2001
From: Philipp Takacs <philipp@bureaucracy.de>
Date: Sat, 19 Sep 2015 23:25:57 +0200
Subject: [PATCH] port encode_rfc2047 from nmh

Now a user can put any non-ASCII characters in header fields.
mhbuild now encodes them according to RFC 2047.

Thanks Ken Hornstein <kenh@pobox.com>
---
 h/prototypes.h       |  17 ++
 man/mhbuild.man1     |   2 +-
 sbr/Makefile.in      |   2 +-
 sbr/encode_rfc2047.c | 646 +++++++++++++++++++++++++++++++++++++++++++
 uip/mhbuild.c        |  15 +
 5 files changed, 680 insertions(+), 2 deletions(-)
 create mode 100644 sbr/encode_rfc2047.c

diff --git a/h/prototypes.h b/h/prototypes.h
index 42767f57..e6e7dc14 100644
--- a/h/prototypes.h
+++ b/h/prototypes.h
@@ -177,3 +177,20 @@ int is_readonly(struct msgs *);
 void set_readonly(struct msgs *);
 int other_files(struct msgs *);
 void set_other_files(struct msgs *);
+
+/*
+ * Encode a message header using RFC 2047 encoding.  If the message contains
+ * no non-ASCII characters, then leave the header as-is.
+ *
+ * Arguments include:
+ *
+ * name     - Message header name
+ * value    - Message header content; must point to allocated memory
+ *        (may be changed if encoding is necessary)
+ * charset  - Charset used for encoding.  If NULL, obtain from system
+ *        locale.
+ *
+ * Returns 0 on success, any other value on failure.
+ */
+
+int encode_rfc2047(const char *name, char **value, const char *charset);
diff --git a/man/mhbuild.man1 b/man/mhbuild.man1
index 62d01be5..98fb39da 100644
--- a/man/mhbuild.man1
+++ b/man/mhbuild.man1
@@ -24,7 +24,7 @@ creates multi-media messages as specified in RFC\-2045
 thru RFC\-2049.  Currently
 .B mhbuild
 only supports encodings in
-message bodies, and does not support the encoding of message headers as
+message bodies, and also supports the encoding of message headers as
 specified in RFC\-2047.
 .PP
 If you specify the name of the composition file as `-',
diff --git a/sbr/Makefile.in b/sbr/Makefile.in
index 4d2b91b7..304b50b0 100644
--- a/sbr/Makefile.in
+++ b/sbr/Makefile.in
@@ -70,7 +70,7 @@ SRCS = addrsbr.c ambigsw.c brkstring.c  \
 	smatch.c snprintb.c strcasecmp.c  \
 	strindex.c trim.c trimcpy.c uprf.c vfgets.c fmt_def.c  \
 	mf.c utils.c m_mktemp.c seq_msgstats.c \
-	unquote.c
+	unquote.c encode_rfc2047.c
 
 OBJS =  $(SRCS:.c=.o)
 
diff --git a/sbr/encode_rfc2047.c b/sbr/encode_rfc2047.c
new file mode 100644
index 00000000..b4027d8b
--- /dev/null
+++ b/sbr/encode_rfc2047.c
@@ -0,0 +1,646 @@
+/*
+ * Routines to encode message headers using RFC 2047-encoding.
+ *
+ * This code is Copyright (c) 2002, by the authors of nmh.  See the
+ * COPYRIGHT file in the root directory of the nmh distribution for
+ * complete copyright information.
+ */
+
+#include <h/mh.h>
+#include <h/mhparse.h>
+#include <h/addrsbr.h>
+#include <h/utils.h>
+
+#include <ctype.h>
+
+/*
+ * List of headers that contain addresses and as a result require special
+ * handling
+ */
+
+static char *address_headers[] = {
+    "To",
+    "From",
+    "cc",
+    "Bcc",
+    "Reply-To",
+    "Sender",
+    "Resent-To",
+    "Resent-From",
+    "Resent-cc",
+    "Resent-Bcc",
+    "Resent-Reply-To",
+    "Resent-Sender",
+    NULL,
+};
+
+/*
+ * Macros we use for parsing headers
+ */
+
+#define is_fws(c) (c == '\t' || c == ' ' || c == '\n')
+
+#define qphrasevalid(c) ((c >= '0' && c <= '9') || (c >= 'A' && c <= 'Z') || \
+			 (c >= 'a' && c <= 'z') || \
+			 c == '!' || c == '*' || c == '+' || c == '-' || \
+			 c == '/' || c == '=' || c == '_')
+#define qpspecial(c) (c < ' ' || c == '=' || c == '?' || c == '_')
+
+#define base64len(n) ((((n) + 2) / 3 ) * 4)	/* String len to base64 len */
+#define strbase64(n) ((n) / 4 * 3)		/* Chars that fit in base64 */
+
+#define ENCODELINELIMIT	76
+
+static void unfold_header(char **, int);
+static int field_encode_address(const char *, char **, const char *);
+static int field_encode_quoted(const char *, char **, const char *, int,
+			       int, int);
+static int scanstring(const char *, int *, int *, int *);
+static int utf8len(const char *);
+/*static int pref_encoding(int, int, int);*/
+
+/*
+ * Encode a message header using RFC 2047 encoding.  We make the assumption
+ * that all characters < 128 are ASCII and as a consequence don't need any
+ * encoding.
+ */
+
+int
+encode_rfc2047(const char *name, char **value, const char *charset)
+{
+    int i, asciicount = 0, eightbitcount = 0, qpspecialcount = 0;
+    char *p;
+
+    /*
+     * First, check to see if we even need to encode the header
+     */
+
+    for (p = *value; *p != '\0'; p++) {
+	if (isascii((unsigned char) *p)) {
+	    asciicount++;
+	    if (qpspecial((unsigned char) *p))
+	    	qpspecialcount++;
+	} else
+	    eightbitcount++;
+    }
+
+    if (eightbitcount == 0)
+    	return 0;
+
+    /*
+     * Some rules from RFC 2047:
+     *
+     * - Encoded words cannot be more than 75 characters long
+     * - Multiple "long" encoded words must be on new lines.
+     *
+     * Also, we're not permitted to encode email addresses, so
+     * we need to actually _parse_ email addresses and only encode
+     * the right bits.  
+     */
+
+    /*
+     * If charset was NULL, then get the value from the locale.  But
+     * we reject it if it returns US-ASCII
+     */
+
+	if (charset == NULL)
+		charset = write_charset_8bit();
+
+	if (strcasecmp(charset, "US-ASCII") == 0) {
+		advise(NULL, "Cannot use US-ASCII with 8 bit characters in header");
+		return 1;
+	}
+
+    /*
+     * If we have an address header, then we need to parse the addresses
+     * and only encode the names or comments.  Otherwise, handle it normally.
+     */
+
+	for (i = 0; address_headers[i]; i++) {
+		if (strcasecmp(name, address_headers[i]) == 0) {
+			return field_encode_address(name, value, charset);
+		}
+	}
+
+    /*
+     * On the encoding we choose, and the specifics of encoding:
+     *
+     * - If a specified encoding is passed in, we use that.
+     * - Otherwise, pick which encoding is shorter.
+     *
+     * We don't quite handle continuation right here, but it should be
+     * pretty close.
+     */
+
+    unfold_header(value, asciicount + eightbitcount);
+
+	return field_encode_quoted(name, value, charset, asciicount,
+				   eightbitcount + qpspecialcount, 0);
+}
+
+/*
+ * Encode our specified header (or field) using quoted-printable
+ */
+
+static int
+field_encode_quoted(const char *name, char **value, const char *charset,
+		    int ascii, int encoded, int phraserules)
+{
+    int prefixlen = name ? strlen(name) + 2: 0, outlen = 0, column, newline = 1;
+    int charsetlen = strlen(charset), utf8;
+    char *output = NULL, *p, *q = NULL;
+
+    /*
+     * Right now we just encode the whole thing.  Maybe later on we'll
+     * only encode things on a per-atom basis.
+     */
+
+    p = *value;
+
+    column = prefixlen + 2;	/* Header name plus ": " */
+
+    utf8 = strcasecmp(charset, "UTF-8") == 0;
+
+    while (*p != '\0') {
+    	/*
+	 * Start a new line, if it's time
+	 */
+    	if (newline) {
+	    /*
+	     * If it's the start of the header, we don't need to pad it
+	     *
+	     * The length of the output string is ...
+	     * =?charset?Q?...?=  so that's 7+strlen(charset) + 2 for \n NUL
+	     *
+	     * plus 1 for every ASCII character and 3 for every eight bit
+	     * or special character (eight bit characters are written as =XX).
+	     *
+	     */
+
+	    int tokenlen;
+
+	    outlen += 9 + charsetlen + ascii + 3 * encoded;
+
+	    /*
+	     * If output is set, then we're continuing the header.  Otherwise
+	     * do the initial allocation.
+	     */
+
+	    if (output) {
+	        int curlen = q - output, i;
+		outlen += prefixlen + 1;	/* Header plus \n ": " */
+		output = mh_xrealloc(output, outlen);
+		q = output + curlen;
+		*q++ = '?';
+		*q++ = '=';
+		*q++ = '\n';
+		for (i = 0; i < prefixlen; i++)
+		    *q++ = ' ';
+	    } else {
+	    	/*
+		 * A bit of a hack here; the header can contain multiple
+		 * spaces (probably at least one) until we get to the
+		 * actual text.  Copy until we get to a non-space.
+		 */
+	    	output = mh_xmalloc(outlen);
+		q = output;
+		while (is_fws(*p))
+		    *q++ = *p++;
+	    }
+
+	    tokenlen = snprintf(q, outlen - (q - output), "=?%s?Q?", charset);
+	    q += tokenlen;
+	    column = prefixlen + tokenlen;
+	    newline = 0;
+	}
+
+	/*
+	 * Process each character, encoding if necessary
+	 *
+	 * Note that we have a different set of rules if we're processing
+	 * RFC 5322 'phrase' (something you'd see in an address header).
+	 */
+
+	column++;
+
+	if (*p == ' ') {
+	    *q++ = '_';
+	    ascii--;
+	} else if (isascii((unsigned char) *p) &&
+		   (phraserules ? qphrasevalid((unsigned char) *p) :
+		   			!qpspecial((unsigned char) *p))) {
+	    *q++ = *p;
+	    ascii--;
+	} else {
+	    snprintf(q, outlen - (q - output), "=%02X", (unsigned char) *p);
+	    q += 3;
+	    column += 2;	/* column already incremented by 1 above */
+	    encoded--;
+	}
+
+	p++;
+
+	/*
+	 * We're not allowed more than ENCODELINELIMIT characters per line,
+	 * so reserve some room for the final ?=.
+	 *
+	 * If prefixlen == 0, we haven't been passed in a header name, so
+	 * don't ever wrap the field (we're likely doing an address).
+	 */
+
+	if (prefixlen == 0)
+	    continue;
+
+	if (column >= ENCODELINELIMIT - 2) {
+	    newline = 1;
+	} else if (utf8) {
+	    /*
+	     * Okay, this is a bit weird, but to explain a bit more ...
+	     *
+	     * RFC 2047 prohibits the splitting of multibyte characters
+	     * across encoded words.  Right now we only handle the case
+	     * of UTF-8, the most common multibyte encoding.
+	     *
+	     * p is now pointing at the next input character.  If we're
+	     * using UTF-8 _and_ we'd go over ENCODELINELIMIT given the
+	     * length of the complete character, then trigger a newline
+	     * now.  Note that we check the length * 3 since we have to
+	     * allow for the encoded output.
+	     */
+	    if (column + (utf8len(p) * 3) > ENCODELINELIMIT - 2) {
+		newline = 1;
+	    }
+	}
+    }
+
+    if (q == NULL) {
+	/* This should never happen, but just in case.  Found by
+	   clang static analyzer. */
+	admonish (NULL, "null output encoding for %s", *value);
+	return 1;
+    }
+    *q++ = '?';
+    *q++ = '=';
+
+    if (prefixlen)
+	*q++ = '\n';
+
+    *q = '\0';
+
+    free(*value);
+
+    *value = output;
+
+    return 0;
+}
+
+/*
+ * Calculate the length of a UTF-8 character.
+ *
+ * If it's not a UTF-8 character (or we're in the middle of a multibyte
+ * character) then simply return 0.
+ */
+
+static int
+utf8len(const char *p)
+{
+    int len = 1;
+
+    if (*p == '\0')
+    	return 0;
+
+    if (isascii((unsigned char) *p) || (((unsigned char) *p) & 0xc0) == 0x80)
+    	return 0;
+
+    p++;
+    while ((((unsigned char) *p++) & 0xc0) == 0x80)
+    	len++;
+
+    return len;
+}
+
+/*
+ * "Unfold" a header, making it a single line (without continuation)
+ *
+ * We cheat a bit here; we never make the string longer, so using the
+ * original length here is fine.
+ */
+
+static void
+unfold_header(char **value, int len)
+{
+    char *str = mh_xmalloc(len + 1);
+    char *p = str, *q = *value;
+
+    while (*q != '\0') {
+    	if (*q == '\n') {
+	    /*
+	     * When we get a newline, skip to the next non-whitespace
+	     * character and add a space to replace all of the whitespace
+	     *
+	     * This has the side effect of stripping off the final newline
+	     * for the header; we put it back in the encoding routine.
+	     */
+	    while (is_fws(*q))
+	    	q++;
+	    if (*q == '\0')
+	    	break;
+
+	    *p++ = ' ';
+	} else {
+	    *p++ = *q++;
+	}
+    }
+
+    *p = '\0';
+
+    free(*value);
+    *value = str;
+}
+
+/*
+ * Encode a header containing addresses.  This means we have to parse
+ * each address and only encode the display-name or comment field.
+ */
+
+static int
+field_encode_address(const char *name, char **value, const char *charset)
+{
+    int prefixlen = strlen(name) + 2, column = prefixlen, groupflag;
+    int asciichars, specialchars, eightbitchars, reformat = 0, errflag = 0;
+    size_t len;
+    char *mp, *cp = NULL, *output = NULL;
+    char *tmpbuf = NULL;
+    size_t tmpbufsize = 0;
+    struct mailname *mn;
+    char errbuf[BUFSIZ];
+
+    /*
+     * Because these are addresses, we need to handle them individually.
+     *
+     * Break them down and process them one by one.  This means we have to
+     * rewrite the whole header, but that's unavoidable.
+     */
+
+    /*
+     * The output headers always have to start with a space first; this
+     * is just the way the API works right now.
+     */
+
+    output = add(" ", output);
+
+    for (groupflag = 0; (mp = getname(*value)); ) {
+    	if ((mn = getm(mp, NULL, 0, AD_HOST, errbuf)) == NULL) {
+	    advise(NULL, "%s: %s", errbuf, mp);
+	    errflag++;
+	    continue;
+	}
+
+	reformat = 0;
+
+	/*
+	 * We only care if the phrase (m_pers) or any trailing comment
+	 * (m_note) have 8-bit characters.  If doing q-p, we also need
+	 * to encode anything marked as qspecial().  Unquote it first
+	 * so the specialchars count is right.
+	 */
+
+	if (! mn->m_pers)
+	    goto check_note;
+
+	if ((len = strlen(mn->m_pers)) + 1 > tmpbufsize) {
+	    tmpbuf = mh_xrealloc(tmpbuf, tmpbufsize = len + 1);
+	}
+
+	unquote_string(mn->m_pers, tmpbuf);
+
+	if (scanstring(tmpbuf, &asciichars, &eightbitchars,
+		       &specialchars)) {
+	    /*
+	     * If we have 8-bit characters, encode it.
+	     */
+
+	    /*
+	     * This is okay, because the output of unquote_string will be either
+	     * equal or shorter than the original.
+	     */
+
+		strcpy(mn->m_pers, tmpbuf);
+
+		if (field_encode_quoted(NULL, &mn->m_pers, charset, asciichars,
+					eightbitchars + specialchars, 1)) {
+			errflag++;
+		    goto out;
+		}
+
+		 reformat++;
+	}
+
+	check_note:
+
+	/*
+	 * The "note" field is generally a comment at the end of the address,
+	 * at least as how it's implemented here.  Notes are always surrounded
+	 * by parenthesis (since they're comments).  Strip them out and
+	 * then put them back when we format the final field, but they do
+	 * not get encoded.
+	 */
+
+	if (! mn->m_note)
+	    goto do_reformat;
+
+	if ((len = strlen(mn->m_note)) + 1 > tmpbufsize) {
+	    tmpbuf = mh_xrealloc(tmpbuf, tmpbufsize = len + 1);
+	}
+
+	if (mn->m_note[0] != '(' || mn->m_note[len - 1] != ')') {
+	    advise(NULL, "Internal error: Invalid note field \"%s\"",
+	    	   mn->m_note);
+	    errflag++;
+	    goto out;
+	}
+
+	strncpy(tmpbuf, mn->m_note + 1, len - 1);
+	tmpbuf[len - 2] = '\0';
+
+	if (scanstring(tmpbuf, &asciichars, &eightbitchars,
+		       &specialchars)) {
+	    /*
+	     * If we have 8-bit characters, encode it.
+	     */
+
+	    	if (field_encode_quoted(NULL, &tmpbuf, charset, asciichars,
+					eightbitchars + specialchars, 1)) {
+		    errflag++;
+		    goto out;
+		}
+
+	    reformat++;
+
+	    /*
+	     * Make sure the size of tmpbuf is correct (it always gets
+	     * reallocated in the above functions).
+	     */
+
+	    tmpbufsize = strlen(tmpbuf) + 1;
+
+	    /*
+	     * Put the note field back surrounded by parenthesis.
+	     */
+
+	    mn->m_note = mh_xrealloc(mn->m_note, tmpbufsize + 2);
+
+	    snprintf(mn->m_note, tmpbufsize + 2, "(%s)", tmpbuf);
+	}
+
+do_reformat:
+
+	/*
+	 * So, some explanation is in order.
+	 *
+	 * We know we need to rewrite at least one address in the header,
+	 * otherwise we wouldn't be here.  If we had to reformat this
+	 * particular address, then run it through adrformat().  Otherwise
+	 * we can use m_text directly.
+	 */
+
+	/*
+	 * If we were in a group but are no longer, make sure we add a
+	 * semicolon (which needs to be FIRST, as it needs to be at the end
+	 * of the last address).
+	 */
+
+	if (groupflag && ! mn->m_ingrp) {
+	    output = add(";", output);
+	    column += 1;
+	}
+
+	groupflag = mn->m_ingrp;
+
+	if (mn->m_gname) {
+	    cp = add(mn->m_gname, NULL);
+	}
+
+	if (reformat) {
+	    cp = add(adrformat(mn), cp);
+	} else {
+	    cp = add(mn->m_text, cp);
+	}
+
+	len = strlen(cp);
+
+	/*
+	 * If we're not at the beginning of the line, add a comma and
+	 * either a space or a newline.
+	 */
+
+	if (column != prefixlen) {
+	    if (len + column + 2 > OUTPUTLINELEN) {
+		/* ",\n" + prefixlen pad spaces + NUL needs prefixlen + 3 bytes */
+	    	if ((size_t) (prefixlen + 3) > tmpbufsize)
+		    tmpbuf = mh_xrealloc(tmpbuf, tmpbufsize = prefixlen + 3);
+		/* tmpbuf may be NULL or too small here, so grow it (never shrink) */
+		snprintf(tmpbuf, tmpbufsize, ",\n%*s", column = prefixlen, "");
+		output = add(tmpbuf, output);
+	    } else {
+	    	output = add(", ", output);
+		column += 2;
+	    }
+	}
+
+	/*
+	 * Finally add the address
+	 */
+
+	output = add(cp, output);
+	column += len;
+	free(cp);
+	cp = NULL;
+    }
+
+    /*
+     * Just in case we're at the end of a list
+     */
+
+    if (groupflag) {
+	output = add(";", output);
+    }
+
+    output = add("\n", output);
+
+    free(*value);
+    *value = output;
+    output = NULL;
+
+out:
+
+    if (tmpbuf)
+    	free(tmpbuf);
+    if (output)
+    	free(output);
+
+    return errflag > 0;
+}
+
+/*
+ * Scan a string, check for characters that need to be encoded
+ */
+
+static int
+scanstring(const char *string, int *asciilen, int *eightbitchars,
+	   int *specialchars)
+{
+    *asciilen = 0;
+    *eightbitchars = 0;
+    *specialchars = 0;
+
+    for (; *string != '\0'; string++) {
+    	if ((isascii((unsigned char) *string))) {
+	    (*asciilen)++;
+	    /*
+	     * So, a space is not a valid phrase character, but we're counting
+	     * an exception here, because in q-p a space can be directly
+	     * encoded as an underscore.
+	     */
+	    if (!qphrasevalid((unsigned char) *string) && *string != ' ')
+	    	(*specialchars)++;
+	} else {
+	    (*eightbitchars)++;
+	}
+    }
+
+    return *eightbitchars > 0;
+}
+
+/*
+ * This function is to be used to decide which encoding algorithm we should
+ * use if one is not given.  Basically, we pick whichever one is the shorter
+ * of the two.
+ *
+ * Arguments are:
+ *
+ * ascii	- Number of ASCII characters in to-be-encoded string.
+ * specials	- Number of ASCII characters in to-be-encoded string that
+ *		  still require encoding under quoted-printable.  Note that
+ *		  these are included in the "ascii" total.
+ * eightbit	- Eight-bit characters in the to-be-encoded string.
+ *
+ * Returns one of CE_BASE64 or CE_QUOTED.
+ */
+/*
+static int
+pref_encoding(int ascii, int specials, int eightbits)
+{ */
+    /*
+     * The length of the q-p encoding is:
+     *
+     * ascii - specials + (specials + eightbits) * 3.
+     *
+     * The length of the base64 encoding is:
+     *
+     * base64len(ascii + eightbits)	(See macro for details)
+     */
+/*
+    return base64len(ascii + eightbits) < (ascii - specials +
+    			(specials + eightbits) * 3) ? CE_BASE64 : CE_QUOTED;
+}*/
diff --git a/uip/mhbuild.c b/uip/mhbuild.c
index 140cfdb3..d636cf7c 100644
--- a/uip/mhbuild.c
+++ b/uip/mhbuild.c
@@ -410,6 +410,18 @@ finish_field:
 		break;
 	}
 
+	/*
+	 * Iterate through the list of headers and call the function to MIME-ify
+	 * them if required.
+	 */
+
+	HF hp;
+	for (hp = ct->c_first_hf; hp != NULL; hp = hp->next) {
+		if (encode_rfc2047(hp->name, &hp->value, NULL)) {
+			adios(EX_DATAERR, NULL, "Unable to encode header \"%s\"", hp->name);
+		}
+	}
+
 	/*
 	** Now add the MIME-Version header field
 	** to the list of header fields.
@@ -1529,6 +1541,9 @@ build_headers(CT ct)
 	if (ct->c_descr) {
 		np = getcpy(DESCR_FIELD);
 		vp = concat(" ", ct->c_descr, NULL);
+		if (encode_rfc2047(DESCR_FIELD, &vp, NULL)) {
+			adios(EX_DATAERR, NULL, "Unable to encode %s header", DESCR_FIELD);
+		}
 		add_header(ct, np, vp);
 	}
 
-- 
2.39.5