From 0595979e1f0514b3da28762f336b3b9ac9eec5c5 Mon Sep 17 00:00:00 2001 From: Philipp Takacs Date: Sat, 19 Sep 2015 23:25:57 +0200 Subject: [PATCH] port encode_rfc2047 from nmh Now a user can put any non-ASCII characters in header fields. mhbuild now encodes them according to RFC 2047. Thanks Ken Hornstein --- h/prototypes.h | 17 ++ man/mhbuild.man1 | 2 +- sbr/Makefile.in | 2 +- sbr/encode_rfc2047.c | 646 ++++++++++++++++++++++++++++++++++++++++++++++++++ uip/mhbuild.c | 15 ++ 5 files changed, 680 insertions(+), 2 deletions(-) create mode 100644 sbr/encode_rfc2047.c diff --git a/h/prototypes.h b/h/prototypes.h index 42767f5..e6e7dc1 100644 --- a/h/prototypes.h +++ b/h/prototypes.h @@ -177,3 +177,20 @@ int is_readonly(struct msgs *); void set_readonly(struct msgs *); int other_files(struct msgs *); void set_other_files(struct msgs *); + +/* + * Encode a message header using RFC 2047 encoding. If the header contains + * no non-ASCII characters, then leave the header as-is. + * + * Arguments include: + * + * name - Message header name + * value - Message header content; must point to allocated memory + * (may be changed if encoding is necessary) + * charset - Charset used for encoding. If NULL, obtain from system + * locale. + * + * Returns 0 on success, any other value on failure. + */ + +int encode_rfc2047(const char *name, char **value, const char *charset); diff --git a/man/mhbuild.man1 b/man/mhbuild.man1 index 62d01be..98fb39d 100644 --- a/man/mhbuild.man1 +++ b/man/mhbuild.man1 @@ -24,7 +24,7 @@ creates multi-media messages as specified in RFC\-2045 thru RFC\-2049. Currently .B mhbuild only supports encodings in -message bodies, and does not support the encoding of message headers as +message bodies and in message headers, the latter as specified in RFC\-2047. 
.PP If you specify the name of the composition file as `-',
diff --git a/sbr/Makefile.in b/sbr/Makefile.in
index 4d2b91b..304b50b 100644
--- a/sbr/Makefile.in
+++ b/sbr/Makefile.in
@@ -70,7 +70,7 @@ SRCS = addrsbr.c ambigsw.c brkstring.c \
 	smatch.c snprintb.c strcasecmp.c \
 	strindex.c trim.c trimcpy.c uprf.c vfgets.c fmt_def.c \
 	mf.c utils.c m_mktemp.c seq_msgstats.c \
-	unquote.c
+	unquote.c encode_rfc2047.c
 
 OBJS = $(SRCS:.c=.o)
 
diff --git a/sbr/encode_rfc2047.c b/sbr/encode_rfc2047.c
new file mode 100644
index 0000000..b4027d8
--- /dev/null
+++ b/sbr/encode_rfc2047.c
@@ -0,0 +1,646 @@
+/*
+ * Routines to encode message headers using RFC 2047-encoding.
+ *
+ * This code is Copyright (c) 2002, by the authors of nmh.  See the
+ * COPYRIGHT file in the root directory of the nmh distribution for
+ * complete copyright information.
+ */
+
+#include
+#include
+#include
+#include
+
+#include
+
+/*
+ * List of headers that contain addresses and as a result require special
+ * handling.  The table is NULL-terminated and is only ever read (it is
+ * matched case-insensitively against the header name), so both the
+ * strings and the array itself are const.
+ */
+
+static const char *const address_headers[] = {
+    "To",
+    "From",
+    "cc",
+    "Bcc",
+    "Reply-To",
+    "Sender",
+    "Resent-To",
+    "Resent-From",
+    "Resent-cc",
+    "Resent-Bcc",
+    "Resent-Reply-To",
+    "Resent-Sender",
+    NULL,
+};
+
+/*
+ * Macros we use for parsing headers
+ *
+ * Each macro evaluates its argument more than once, so never pass an
+ * expression with side effects.  Arguments are parenthesized so that any
+ * expression can be passed safely.
+ */
+
+/* Folding whitespace: tab, space or newline */
+#define is_fws(c) ((c) == '\t' || (c) == ' ' || (c) == '\n')
+
+/* Characters that may be copied unencoded when encoding a phrase */
+#define qphrasevalid(c) (((c) >= '0' && (c) <= '9') || \
+                         ((c) >= 'A' && (c) <= 'Z') || \
+                         ((c) >= 'a' && (c) <= 'z') || \
+                         (c) == '!' || (c) == '*' || (c) == '+' || (c) == '-' || \
+                         (c) == '/' || (c) == '=' || (c) == '_')
+#define qpspecial(c) (c < ' ' || c == '=' || c == '?' 
|| c == '_')
+
+#define base64len(n) ((((n) + 2) / 3 ) * 4) /* String len to base64 len */
+#define strbase64(n) ((n) / 4 * 3) /* Chars that fit in base64 */
+
+#define ENCODELINELIMIT 76
+
+static void unfold_header(char **, int);
+static int field_encode_address(const char *, char **, const char *);
+static int field_encode_quoted(const char *, char **, const char *, int,
+                               int, int);
+static int scanstring(const char *, int *, int *, int *);
+static int utf8len(const char *);
+/*static int pref_encoding(int, int, int);*/
+
+/*
+ * Encode a message header using RFC 2047 encoding.  We make the assumption
+ * that all characters < 128 are ASCII and as a consequence don't need any
+ * encoding.
+ *
+ * name    - header field name; matched case-insensitively against
+ *           address_headers[] to select the address-aware encoder.
+ * value   - in/out pointer to the header text.  When encoding takes place
+ *           the old string is freed and *value is pointed at a newly
+ *           allocated one.
+ * charset - charset used to label the encoded words; when NULL the result
+ *           of write_charset_8bit() is used instead.
+ *
+ * Returns 0 when the value contains no non-ASCII bytes (it is then left
+ * untouched), 1 when the charset turns out to be US-ASCII, and otherwise
+ * whatever the selected field encoder returns (0 on success).
+ */
+
+int
+encode_rfc2047(const char *name, char **value, const char *charset)
+{
+    int i, asciicount = 0, eightbitcount = 0, qpspecialcount = 0;
+    char *p;
+
+    /*
+     * First, check to see if we even need to encode the header.
+     * Note that qpspecialcount is a subset of asciicount.
+     */
+
+    for (p = *value; *p != '\0'; p++) {
+        if (isascii((unsigned char) *p)) {
+            asciicount++;
+            if (qpspecial((unsigned char) *p))
+                qpspecialcount++;
+        } else
+            eightbitcount++;
+    }
+
+    if (eightbitcount == 0)
+        return 0;
+
+    /*
+     * Some rules from RFC 2047:
+     *
+     * - Encoded words cannot be more than 75 characters long
+     * - Multiple "long" encoded words must be on new lines.
+     *
+     * Also, we're not permitted to encode email addresses, so
+     * we need to actually _parse_ email addresses and only encode
+     * the right bits.
+     */
+
+    /*
+     * If charset was NULL, then get the value from the locale.  But
+     * we reject it if it returns US-ASCII
+     */
+
+    if (charset == NULL)
+        charset = write_charset_8bit();
+
+    if (strcasecmp(charset, "US-ASCII") == 0) {
+        advise(NULL, "Cannot use US-ASCII with 8 bit characters in header");
+        return 1;
+    }
+
+    /*
+     * If we have an address header, then we need to parse the addresses
+     * and only encode the names or comments.  Otherwise, handle it normally.
+     */
+
+    for (i = 0; address_headers[i]; i++) {
+        if (strcasecmp(name, address_headers[i]) == 0) {
+            return field_encode_address(name, value, charset);
+        }
+    }
+
+    /*
+     * On the encoding we choose, and the specifics of encoding:
+     *
+     * - Only the "Q" (quoted-printable) encoding is implemented in this
+     *   file; the helper that would pick between Q and base64
+     *   (pref_encoding) is commented out, so field_encode_quoted() is
+     *   always used.
+     *
+     * We don't quite handle continuation right here, but it should be
+     * pretty close.
+     */
+
+    /* asciicount + eightbitcount is the length of *value in bytes */
+    unfold_header(value, asciicount + eightbitcount);
+
+    return field_encode_quoted(name, value, charset, asciicount,
+                               eightbitcount + qpspecialcount, 0);
+}
+
+/*
+ * Encode our specified header (or field) using quoted-printable
+ *
+ * name        - header name, used only for its length: continuation lines
+ *               are indented by strlen(name) + 2 spaces.  When NULL the
+ *               output is never wrapped and no trailing newline is added.
+ * value       - in/out; on success the old string is freed and *value is
+ *               pointed at the newly allocated encoded string.
+ * charset     - charset name written into every "=?charset?Q?" token.
+ * ascii       - number of characters budgeted at one output byte each.
+ * encoded     - number of characters budgeted at three output bytes
+ *               ("=XX") each.  Together with ascii this sizes the buffer.
+ * phraserules - non-zero: only qphrasevalid() characters are copied
+ *               literally; zero: everything that is not qpspecial() is.
+ *
+ * Returns 0 on success and 1 if *value was empty (nothing was written).
+ */
+
+static int
+field_encode_quoted(const char *name, char **value, const char *charset,
+                    int ascii, int encoded, int phraserules)
+{
+    int prefixlen = name ? strlen(name) + 2: 0, outlen = 0, column, newline = 1;
+    int charsetlen = strlen(charset), utf8;
+    char *output = NULL, *p, *q = NULL;
+
+    /*
+     * Right now we just encode the whole thing.  Maybe later on we'll
+     * only encode things on a per-atom basis.
+     */
+
+    p = *value;
+
+    /*
+     * This initial value is never read: newline starts out as 1, so column
+     * is reassigned when the first "=?charset?Q?" token is written below.
+     */
+    column = prefixlen + 2; /* Header name plus ": " */
+
+    utf8 = strcasecmp(charset, "UTF-8") == 0;
+
+    while (*p != '\0') {
+        /*
+         * Start a new line, if it's time
+         */
+        if (newline) {
+            /*
+             * If it's the start of the header, we don't need to pad it
+             *
+             * The length of the output string is ...
+             * =?charset?Q?...?= so that's 7+strlen(charset) + 2 for \n NUL
+             *
+             * plus 1 for every ASCII character and 3 for every eight bit
+             * or special character (eight bit characters are written as =XX).
+             *
+             * ascii and encoded are counted down as characters are
+             * emitted, so on a continuation they hold what is still left.
+             */
+
+            int tokenlen;
+
+            outlen += 9 + charsetlen + ascii + 3 * encoded;
+
+            /*
+             * If output is set, then we're continuing the header.  Otherwise
+             * do the initial allocation.
+             */
+
+            if (output) {
+                /* Remember the write offset; realloc may move the buffer */
+                int curlen = q - output, i;
+                outlen += prefixlen + 1; /* Header plus \n ": " */
+                output = mh_xrealloc(output, outlen);
+                q = output + curlen;
+                /* Close the current encoded word and indent the next line */
+                *q++ = '?';
+                *q++ = '=';
+                *q++ = '\n';
+                for (i = 0; i < prefixlen; i++)
+                    *q++ = ' ';
+            } else {
+                /*
+                 * A bit of a hack here; the header can contain multiple
+                 * spaces (probably at least one) until we get to the
+                 * actual text.  Copy until we get to a non-space.
+                 */
+                output = mh_xmalloc(outlen);
+                q = output;
+                while (is_fws(*p))
+                    *q++ = *p++;
+            }
+
+            tokenlen = snprintf(q, outlen - (q - output), "=?%s?Q?", charset);
+            q += tokenlen;
+            column = prefixlen + tokenlen;
+            newline = 0;
+        }
+
+        /*
+         * Process each character, encoding if necessary
+         *
+         * Note that we have a different set of rules if we're processing
+         * RFC 5322 'phrase' (something you'd see in an address header).
+         */
+
+        column++;
+
+        if (*p == ' ') {
+            /* A space is written as an underscore */
+            *q++ = '_';
+            ascii--;
+        } else if (isascii((unsigned char) *p) &&
+                   (phraserules ? qphrasevalid((unsigned char) *p) :
+                    !qpspecial((unsigned char) *p))) {
+            *q++ = *p;
+            ascii--;
+        } else {
+            snprintf(q, outlen - (q - output), "=%02X", (unsigned char) *p);
+            q += 3;
+            column += 2; /* column already incremented by 1 above */
+            encoded--;
+        }
+
+        p++;
+
+        /*
+         * We're not allowed more than ENCODELINELIMIT characters per line,
+         * so reserve some room for the final ?=.
+         *
+         * If prefixlen == 0, we haven't been passed in a header name, so
+         * don't ever wrap the field (we're likely doing an address).
+         */
+
+        if (prefixlen == 0)
+            continue;
+
+        if (column >= ENCODELINELIMIT - 2) {
+            newline = 1;
+        } else if (utf8) {
+            /*
+             * Okay, this is a bit weird, but to explain a bit more ...
+             *
+             * RFC 2047 prohibits the splitting of multibyte characters
+             * across encoded words.  Right now we only handle the case
+             * of UTF-8, the most common multibyte encoding.
+             *
+             * p is now pointing at the next input character.  If we're
+             * using UTF-8 _and_ we'd go over ENCODELINELIMIT given the
+             * length of the complete character, then trigger a newline
+             * now.  Note that we check the length * 3 since we have to
+             * allow for the encoded output.
+             */
+            if (column + (utf8len(p) * 3) > ENCODELINELIMIT - 2) {
+                newline = 1;
+            }
+        }
+    }
+
+    if (q == NULL) {
+        /* This should never happen, but just in case.  Found by
+           clang static analyzer. */
+        admonish (NULL, "null output encoding for %s", *value);
+        return 1;
+    }
+    /* Terminate the last encoded word */
+    *q++ = '?';
+    *q++ = '=';
+
+    /* Only a complete header line (name given) gets its newline back */
+    if (prefixlen)
+        *q++ = '\n';
+
+    *q = '\0';
+
+    free(*value);
+
+    *value = output;
+
+    return 0;
+}
+
+/*
+ * Calculate the length of a UTF-8 character.
+ *
+ * Returns 0 at the end of the string, for an ASCII byte and for a
+ * continuation byte (10xxxxxx), i.e. whenever p is not at the start of a
+ * multibyte character.  Otherwise returns 1 plus the number of
+ * continuation bytes that directly follow the lead byte.
+ */
+
+static int
+utf8len(const char *p)
+{
+    int len = 1;
+
+    if (*p == '\0')
+        return 0;
+
+    if (isascii((unsigned char) *p) || (((unsigned char) *p) & 0xc0) == 0x80)
+        return 0;
+
+    p++;
+    while ((((unsigned char) *p++) & 0xc0) == 0x80)
+        len++;
+
+    return len;
+}
+
+/*
+ * "Unfold" a header, making it a single line (without continuation)
+ *
+ * We cheat a bit here; we never make the string longer, so using the
+ * original length here is fine.
+ *
+ * value - in/out; the old string is freed and replaced by the unfolded copy.
+ * len   - length of *value in bytes; len + 1 bytes are allocated for the
+ *         copy, so it must not be smaller than the string.
+ */
+
+static void
+unfold_header(char **value, int len)
+{
+    char *str = mh_xmalloc(len + 1);
+    char *p = str, *q = *value;
+
+    while (*q != '\0') {
+        if (*q == '\n') {
+            /*
+             * When we get a newline, skip to the next non-whitespace
+             * character and add a space to replace all of the whitespace
+             *
+             * This has the side effect of stripping off the final newline
+             * for the header; we put it back in the encoding routine.
+             */
+            while (is_fws(*q))
+                q++;
+            if (*q == '\0')
+                break;
+
+            *p++ = ' ';
+        } else {
+            *p++ = *q++;
+        }
+    }
+
+    *p = '\0';
+
+    free(*value);
+    *value = str;
+}
+
+/*
+ * Encode a header containing addresses.  This means we have to parse
+ * each address and only encode the display-name or comment field.
+ *
+ * name    - header field name; strlen(name) + 2 is the indent used for
+ *           continuation lines.
+ * value   - in/out; unless we bail out early through "out", the old string
+ *           is freed and replaced by the rewritten header (leading space,
+ *           the address list, trailing newline).
+ * charset - charset handed on to field_encode_quoted().
+ *
+ * Returns 0 on success and 1 if any address could not be parsed or encoded.
+ */
+
+static int
+field_encode_address(const char *name, char **value, const char *charset)
+{
+    int prefixlen = strlen(name) + 2, column = prefixlen, groupflag;
+    int asciichars, specialchars, eightbitchars, reformat = 0, errflag = 0;
+    size_t len;
+    char *mp, *cp = NULL, *output = NULL;
+    char *tmpbuf = NULL;
+    size_t tmpbufsize = 0;
+    struct mailname *mn;
+    char errbuf[BUFSIZ];
+
+    /*
+     * Because these are addresses, we need to handle them individually.
+     *
+     * Break them down and process them one by one.  This means we have to
+     * rewrite the whole header, but that's unavoidable.
+     */
+
+    /*
+     * The output headers always have to start with a space first; this
+     * is just the way the API works right now.
+     */
+
+    output = add(" ", output);
+
+    for (groupflag = 0; (mp = getname(*value)); ) {
+        if ((mn = getm(mp, NULL, 0, AD_HOST, errbuf)) == NULL) {
+            advise(NULL, "%s: %s", errbuf, mp);
+            errflag++;
+            continue;
+        }
+
+        reformat = 0;
+
+        /*
+         * We only care if the phrase (m_pers) or any trailing comment
+         * (m_note) have 8-bit characters.  When we do encode, every ASCII
+         * character that is not qphrasevalid() (other than space) has to
+         * be encoded as well.  Unquote it first so the specialchars count
+         * is right.
+         */
+
+        if (! mn->m_pers)
+            goto check_note;
+
+        if ((len = strlen(mn->m_pers)) + 1 > tmpbufsize) {
+            tmpbuf = mh_xrealloc(tmpbuf, tmpbufsize = len + 1);
+        }
+
+        unquote_string(mn->m_pers, tmpbuf);
+
+        if (scanstring(tmpbuf, &asciichars, &eightbitchars,
+                       &specialchars)) {
+            /*
+             * If we have 8-bit characters, encode it.
+             */
+
+            /*
+             * This is okay, because the output of unquote_string will be either
+             * equal or shorter than the original.
+             */
+
+            strcpy(mn->m_pers, tmpbuf);
+
+            if (field_encode_quoted(NULL, &mn->m_pers, charset, asciichars,
+                                    eightbitchars + specialchars, 1)) {
+                errflag++;
+                goto out;
+            }
+
+            reformat++;
+        }
+
+check_note:
+
+        /*
+         * The "note" field is generally a comment at the end of the address,
+         * at least as how it's implemented here.  Notes are always surrounded
+         * by parentheses (since they're comments).  Strip them out and
+         * then put them back when we format the final field, but they do
+         * not get encoded.
+         */
+
+        if (! mn->m_note)
+            goto do_reformat;
+
+        if ((len = strlen(mn->m_note)) + 1 > tmpbufsize) {
+            tmpbuf = mh_xrealloc(tmpbuf, tmpbufsize = len + 1);
+        }
+
+        if (mn->m_note[0] != '(' || mn->m_note[len - 1] != ')') {
+            advise(NULL, "Internal error: Invalid note field \"%s\"",
+                   mn->m_note);
+            errflag++;
+            goto out;
+        }
+
+        /*
+         * Copy everything after the opening parenthesis, then overwrite
+         * the closing one with the terminator (len >= 2 is guaranteed by
+         * the check above).
+         */
+        strncpy(tmpbuf, mn->m_note + 1, len - 1);
+        tmpbuf[len - 2] = '\0';
+
+        if (scanstring(tmpbuf, &asciichars, &eightbitchars,
+                       &specialchars)) {
+            /*
+             * If we have 8-bit characters, encode it.
+             */
+
+            if (field_encode_quoted(NULL, &tmpbuf, charset, asciichars,
+                                    eightbitchars + specialchars, 1)) {
+                errflag++;
+                goto out;
+            }
+
+            reformat++;
+
+            /*
+             * Make sure the size of tmpbuf is correct (it always gets
+             * reallocated in the above functions).
+             */
+
+            tmpbufsize = strlen(tmpbuf) + 1;
+
+            /*
+             * Put the note field back surrounded by parentheses.
+             */
+
+            mn->m_note = mh_xrealloc(mn->m_note, tmpbufsize + 2);
+
+            snprintf(mn->m_note, tmpbufsize + 2, "(%s)", tmpbuf);
+        }
+
+do_reformat:
+
+        /*
+         * So, some explanation is in order.
+         *
+         * We know we need to rewrite at least one address in the header,
+         * otherwise we wouldn't be here.  If we had to reformat this
+         * particular address, then run it through adrformat().  Otherwise
+         * we can use m_text directly.
+         */
+
+        /*
+         * If we were in a group but are no longer, make sure we add a
+         * semicolon (which needs to be FIRST, as it needs to be at the end
+         * of the last address).
+         */
+
+        if (groupflag && ! mn->m_ingrp) {
+            output = add(";", output);
+            column += 1;
+        }
+
+        groupflag = mn->m_ingrp;
+
+        if (mn->m_gname) {
+            cp = add(mn->m_gname, NULL);
+        }
+
+        if (reformat) {
+            cp = add(adrformat(mn), cp);
+        } else {
+            cp = add(mn->m_text, cp);
+        }
+
+        len = strlen(cp);
+
+        /*
+         * If we're not at the beginning of the line, add a comma and
+         * either a space or a newline.
+         */
+
+        if (column != prefixlen) {
+            if (len + column + 2 > OUTPUTLINELEN) {
+                /*
+                 * The separator is ",\n" followed by prefixlen spaces:
+                 * prefixlen + 2 characters plus the terminating NUL.  Grow
+                 * the scratch buffer whenever it is smaller than that (it
+                 * is still unallocated if no address so far had a phrase
+                 * or a note).  The test used to be inverted ("<"), which
+                 * never grew a too-small buffer and so truncated or
+                 * dropped the separator.
+                 */
+                if ((size_t) (prefixlen + 3) > tmpbufsize)
+                    tmpbuf = mh_xrealloc(tmpbuf, tmpbufsize = prefixlen + 3);
+
+                column = prefixlen;
+                snprintf(tmpbuf, tmpbufsize, ",\n%*s", prefixlen, "");
+                output = add(tmpbuf, output);
+            } else {
+                output = add(", ", output);
+                column += 2;
+            }
+        }
+
+        /*
+         * Finally add the address
+         */
+
+        output = add(cp, output);
+        column += len;
+        free(cp);
+        cp = NULL;
+    }
+
+    /*
+     * Just in case we're at the end of a list
+     */
+
+    if (groupflag) {
+        output = add(";", output);
+    }
+
+    output = add("\n", output);
+
+    free(*value);
+    *value = output;
+    output = NULL;
+
+out:
+
+    /* free(NULL) is a no-op, so no guards are needed here */
+    free(tmpbuf);
+    free(output);
+
+    return errflag > 0;
+}
+
+/*
+ * Scan a string, check for characters that need to be encoded
+ *
+ * On return *asciilen holds the number of ASCII characters,
+ * *eightbitchars the number of non-ASCII bytes and *specialchars the
+ * number of ASCII characters, other than space, that are not
+ * qphrasevalid() (these are also included in *asciilen).
+ *
+ * Returns non-zero if the string contains at least one non-ASCII byte.
+ */
+
+static int
+scanstring(const char *string, int *asciilen, int *eightbitchars,
+           int *specialchars)
+{
+    *asciilen = 0;
+    *eightbitchars = 0;
+    *specialchars = 0;
+
+    for (; *string != '\0'; string++) {
+        if ((isascii((unsigned char) *string))) {
+            (*asciilen)++;
+            /*
+             * So, a space is not a valid phrase character, but we're counting
+             * an exception here, because in q-p a space can be directly
+             * encoded as an underscore.
+             */
+            if (!qphrasevalid((unsigned char) *string) && *string != ' ')
+                (*specialchars)++;
+        } else {
+            (*eightbitchars)++;
+        }
+    }
+
+    return *eightbitchars > 0;
+}
+
+/*
+ * This function is to be used to decide which encoding algorithm we should
+ * use if one is not given.  Basically, we pick whichever one is the shorter
+ * of the two.
+ * + * Arguments are: + * + * ascii - Number of ASCII characters in to-be-encoded string. + * specials - Number of ASCII characters in to-be-encoded string that + * still require encoding under quoted-printable. Note that + * these are included in the "ascii" total. + * eightbit - Eight-bit characters in the to-be-encoded string. + * + * Returns one of CE_BASE64 or CE_QUOTED. + */ +/* +static int +pref_encoding(int ascii, int specials, int eightbits) +{ */ + /* + * The length of the q-p encoding is: + * + * ascii - specials + (specials + eightbits) * 3. + * + * The length of the base64 encoding is: + * + * base64len(ascii + eightbits) (See macro for details) + */ +/* + return base64len(ascii + eightbits) < (ascii - specials + + (specials + eightbits) * 3) ? CE_BASE64 : CE_QUOTED; +}*/ diff --git a/uip/mhbuild.c b/uip/mhbuild.c index 140cfdb..d636cf7 100644 --- a/uip/mhbuild.c +++ b/uip/mhbuild.c @@ -411,6 +411,18 @@ finish_field: } /* + * Iterate through the list of headers and call the function to MIME-ify + * them if required. + */ + + HF hp; + for (hp = ct->c_first_hf; hp != NULL; hp = hp->next) { + if (encode_rfc2047(hp->name, &hp->value, NULL)) { + adios(EX_DATAERR, NULL, "Unable to encode header \"%s\"", hp->name); + } + } + + /* ** Now add the MIME-Version header field ** to the list of header fields. */ @@ -1529,6 +1541,9 @@ build_headers(CT ct) if (ct->c_descr) { np = getcpy(DESCR_FIELD); vp = concat(" ", ct->c_descr, NULL); + if (encode_rfc2047(DESCR_FIELD, &vp, NULL)) { + adios(EX_DATAERR, NULL, "Unable to encode %s header", DESCR_FIELD); + } add_header(ct, np, vp); } -- 1.7.10.4