--- /dev/null
+/*
+** Routines to encode message headers using RFC 2047-encoding.
+**
+** This code is Copyright (c) 2002, by the authors of nmh. See the
+** COPYRIGHT file in the root directory of the nmh distribution for
+** complete copyright information.
+*/
+
+#include <h/mh.h>
+#include <h/mhparse.h>
+#include <h/addrsbr.h>
+#include <h/utils.h>
+
+#include <ctype.h>
+
+/*
+** List of headers that contain addresses and as a result require special
+** handling
+*/
+
+/*
+** NULL-terminated, read-only lookup table. Header names are compared
+** against it case-insensitively (strcasecmp), so the capitalization used
+** here does not matter.
+*/
+static const char *const address_headers[] = {
+	"To",
+	"From",
+	"cc",
+	"Bcc",
+	"Reply-To",
+	"Sender",
+	"Resent-To",
+	"Resent-From",
+	"Resent-cc",
+	"Resent-Bcc",
+	"Resent-Reply-To",
+	"Resent-Sender",
+	NULL,
+};
+
+/*
+** Macros we use for parsing headers
+**
+** Todo: convert the macros to functions
+*/
+
+/*
+** Character-class predicates.
+**
+** Every use of the argument is parenthesized so that an arbitrary
+** expression can be passed, but the argument is still evaluated more than
+** once: never pass an expression with side effects.
+*/
+
+/* Folding whitespace: tab, space or newline */
+#define is_fws(c) ((c) == '\t' || (c) == ' ' || (c) == '\n')
+
+/*
+** ASCII characters that may be copied literally into a Q-encoded word
+** that replaces a word in an RFC 5322 'phrase' (RFC 2047 section 5 (3)).
+**
+** '=' and '_' are deliberately NOT in this set even though they may
+** appear in such an encoded word: in the Q encoding '=' introduces a
+** hex escape and '_' stands for a space, so a literal '=' or '_' in the
+** input has to be written as =3D or =5F (RFC 2047 section 4.2).
+*/
+#define qphrasevalid(c) (((c) >= '0' && (c) <= '9') || \
+	((c) >= 'A' && (c) <= 'Z') || \
+	((c) >= 'a' && (c) <= 'z') || \
+	(c) == '!' || (c) == '*' || (c) == '+' || (c) == '-' || \
+	(c) == '/')
+
+/* ASCII characters that must be hex-escaped in unstructured Q-encoded text */
+#define qpspecial(c) ((c) < ' ' || (c) == '=' || (c) == '?' || (c) == '_')
+
+#define base64len(n) ((((n) + 2) / 3) * 4) /* String len to base64 len */
+#define strbase64(n) ((n) / 4 * 3) /* Chars that fit in base64 */
+
+/* Maximum length of an output line holding an encoded word */
+#define ENCODELINELIMIT 76
+
+/* Collapse a folded (multi-line) header value onto a single line */
+static void unfold_header(char **, int);
+/* Encode only the display names and comments of an address header */
+static int field_encode_address(const char *, char **, const char *);
+/* Q-encode a whole header value, or a single phrase or comment */
+static int field_encode_quoted(const char *, char **, const char *, int,
+ int, int);
+/* Count the ASCII, eight-bit and must-be-escaped characters of a string */
+static int scanstring(const char *, int *, int *, int *);
+/* Byte length of the multibyte UTF-8 character starting at the pointer */
+static int utf8len(const char *);
+/* Disabled together with its definition, which sits inside #if 0 */
+/*static int pref_encoding(int, int, int);*/
+
+/*
+** Encode the body of a message header according to RFC 2047.
+**
+** name    - header field name; selects address-aware encoding when it
+**           matches an entry of address_headers
+** value   - in/out: the header body. When encoding takes place the old
+**           string is free()d and *value points at the new text.
+** charset - character set used to label the encoded words, or NULL to
+**           use whatever write_charset_8bit() returns
+**
+** Every byte below 128 is assumed to be ASCII; a value consisting only
+** of such bytes is left alone.
+**
+** Returns 0 on success (including "nothing to encode"), nonzero on error.
+*/
+int
+encode_rfc2047(const char *name, char **value, const char *charset)
+{
+	int nascii = 0, neightbit = 0, nspecial = 0;
+	const unsigned char *scan;
+	int idx;
+
+	/* Classify every byte to find out whether encoding is needed at all */
+	for (scan = (const unsigned char *) *value; *scan != '\0'; scan++) {
+		if (!isascii(*scan)) {
+			neightbit++;
+			continue;
+		}
+		nascii++;
+		if (qpspecial(*scan)) {
+			nspecial++;
+		}
+	}
+
+	if (neightbit == 0) {
+		return 0;
+	}
+
+	/*
+	** Rules from RFC 2047 to keep in mind from here on:
+	**
+	** - An encoded word is at most 75 characters long.
+	** - Multiple "long" encoded words go on separate lines.
+	** - Email addresses themselves must not be encoded, which is
+	**   why address headers are parsed and handled separately.
+	*/
+
+	/* Fall back to the locale's character set; plain ASCII cannot work */
+	if (!charset) {
+		charset = write_charset_8bit();
+	}
+	if (strcasecmp(charset, "US-ASCII") == 0) {
+		advise(NULL, "Cannot use US-ASCII with 8 bit characters in header");
+		return 1;
+	}
+
+	/* Address headers: only display names and comments get encoded */
+	idx = 0;
+	while (address_headers[idx] != NULL) {
+		if (strcasecmp(name, address_headers[idx]) == 0) {
+			return field_encode_address(name, value, charset);
+		}
+		idx++;
+	}
+
+	/*
+	** Everything else is unfolded to one line and Q-encoded as a
+	** whole. Continuation is not handled perfectly, but it should
+	** be pretty close.
+	*/
+	unfold_header(value, nascii + neightbit);
+
+	return field_encode_quoted(name, value, charset, nascii,
+			neightbit + nspecial, 0);
+}
+
+/*
+** Encode our specified header (or field) using quoted-printable
+** (the RFC 2047 "Q" encoding).
+**
+** name        - header name; only its length is used, to indent
+**               continuation lines and to track the output column. If
+**               NULL, the output is never wrapped and no trailing newline
+**               is appended.
+** value       - in/out: the text to encode. On success the old string is
+**               free()d and *value is set to the newly allocated result.
+** charset     - character set name written into every encoded word
+** ascii       - number of ASCII characters in *value
+** encoded     - number of characters that will be written as =XX
+** phraserules - if nonzero, an ASCII character is copied literally only
+**               if qphrasevalid() accepts it; otherwise only if
+**               qpspecial() does not flag it
+**
+** ascii and encoded are used solely to size the output buffer; they are
+** counted down as characters are consumed so that each continuation line
+** only reserves room for what is left.
+**
+** Returns 0 on success, 1 if *value was empty and nothing was produced.
+*/
+
+static int
+field_encode_quoted(const char *name, char **value, const char *charset,
+ int ascii, int encoded, int phraserules)
+{
+ int prefixlen = name ? strlen(name) + 2: 0;
+ int outlen = 0, column, newline = 1, utf8;
+ int charsetlen = strlen(charset);
+ char *output = NULL, *p, *q = NULL;
+
+ /*
+ ** Right now we just encode the whole thing. Maybe later on we'll
+ ** only encode things on a per-atom basis.
+ */
+
+ p = *value;
+
+ /*
+ ** NOTE(review): prefixlen already includes the 2 for ": ", and this
+ ** value is overwritten on the first loop iteration anyway (newline
+ ** starts out as 1), so this assignment has no effect.
+ */
+ column = prefixlen + 2; /* Header name plus ": " */
+
+ utf8 = strcasecmp(charset, "UTF-8") == 0;
+
+ while (*p != '\0') {
+ /* Start a new line, if it's time */
+ if (newline) {
+ int tokenlen;
+
+ /*
+ ** If it's the start of the header, we don't need
+ ** to pad it
+ **
+ ** The length of the output string is ...
+ ** =?charset?Q?...?= so that's
+ ** 7+strlen(charset) + 2 for \n NUL
+ **
+ ** plus 1 for every ASCII character and 3 for
+ ** every eight bit or special character (eight
+ ** bit characters are written as =XX).
+ */
+ outlen += 9 + charsetlen + ascii + 3 * encoded;
+
+ if (output) {
+ /* continue the header */
+ /* q is saved as an offset because realloc may move the buffer */
+ int curlen = q - output, i;
+ outlen += prefixlen + 1; /* Room for the indentation of the new line */
+ output = mh_xrealloc(output, outlen);
+ q = output + curlen;
+ /* Close the current encoded word and indent the next line */
+ *q++ = '?';
+ *q++ = '=';
+ *q++ = '\n';
+ for (i = 0; i < prefixlen; i++) {
+ *q++ = ' ';
+ }
+ } else {
+ /* do the initial allocation */
+ /*
+ ** A bit of a hack here; the header can
+ ** contain multiple spaces (probably at
+ ** least one) until we get to the actual
+ ** text. Copy until we get to a non-space.
+ */
+ output = mh_xmalloc(outlen);
+ q = output;
+ while (is_fws(*p)) {
+ *q++ = *p++;
+ }
+ }
+
+ /* Open a new encoded word and restart the column count */
+ tokenlen = snprintf(q, outlen - (q - output),
+ "=?%s?Q?", charset);
+ q += tokenlen;
+ column = prefixlen + tokenlen;
+ newline = 0;
+ }
+
+ /*
+ ** Process each character, encoding if necessary
+ **
+ ** Note that we have a different set of rules if we're
+ ** processing RFC 5322 'phrase' (something you'd see in
+ ** an address header).
+ */
+
+ column++;
+
+ if (*p == ' ') {
+ /* In the Q encoding an underscore stands for a space */
+ *q++ = '_';
+ ascii--;
+ } else if (isascii((unsigned char) *p) && (phraserules ?
+ qphrasevalid((unsigned char) *p)
+ : !qpspecial((unsigned char) *p))) {
+ *q++ = *p;
+ ascii--;
+ } else {
+ /* Everything else is written as a three character =XX escape */
+ snprintf(q, outlen - (q - output), "=%02X",
+ (unsigned char) *p);
+ q += 3;
+ column += 2; /* column already incremented by 1 above */
+ encoded--;
+ }
+
+ p++;
+
+ if (prefixlen == 0) {
+ /*
+ ** We haven't been passed in a header name,
+ ** so don't ever wrap the field (we're likely
+ ** doing an address).
+ */
+ continue;
+ }
+ /*
+ ** We're not allowed more than ENCODELINELIMIT characters
+ ** per line, so reserve some room for the final ?=.
+ **
+ ** NOTE(review): this test runs after a character has been
+ ** written. If column is ENCODELINELIMIT - 3 here and the
+ ** next character needs an =XX escape that the UTF-8 check
+ ** below does not anticipate (an escaped ASCII character, or
+ ** any charset other than UTF-8), the line appears to end up
+ ** longer than the limit -- confirm whether that matters.
+ */
+ if (column >= ENCODELINELIMIT - 2) {
+ newline = 1;
+ } else if (utf8) {
+ /*
+ ** Okay, this is a bit weird, but to explain a
+ ** bit more ...
+ **
+ ** RFC 2047 prohibits the splitting of multibyte
+ ** characters across encoded words. Right now
+ ** we only handle the case of UTF-8, the most
+ ** common multibyte encoding.
+ **
+ ** p is now pointing at the next input character.
+ ** If we're using UTF-8 _and_ we'd go over
+ ** ENCODELINELIMIT given the length of the
+ ** complete character, then trigger a newline now.
+ ** Note that we check the length * 3 since we
+ ** have to allow for the encoded output.
+ */
+ if (column + (utf8len(p)*3) > ENCODELINELIMIT - 2) {
+ newline = 1;
+ }
+ }
+ }
+
+ if (q == NULL) {
+ /*
+ ** This should never happen, but just in case.
+ ** Found by clang static analyzer.
+ **
+ ** q is only still NULL if the loop body never ran, i.e.
+ ** *value was an empty string.
+ */
+ admonish (NULL, "null output encoding for %s", *value);
+ return 1;
+ }
+ /* Close the last encoded word */
+ *q++ = '?';
+ *q++ = '=';
+
+ /* A complete header line (as opposed to a bare field) ends in a newline */
+ if (prefixlen) {
+ *q++ = '\n';
+ }
+ *q = '\0';
+
+ /* Hand the new string to the caller in place of the old one */
+ free(*value);
+ *value = output;
+
+ return 0;
+}
+
+/*
+** Calculate the length, in bytes, of the multibyte UTF-8 character that
+** starts at p: the lead byte plus every continuation byte (10xxxxxx)
+** that directly follows it.
+**
+** Returns 0 if p points at the end of the string, at an ASCII character
+** or into the middle of a multibyte character (at a continuation byte).
+*/
+static int
+utf8len(const char *p)
+{
+	const unsigned char *bytes = (const unsigned char *) p;
+	int count;
+
+	/* None of these can start a multibyte character */
+	if (bytes[0] == '\0' || isascii(bytes[0]) ||
+			(bytes[0] & 0xc0) == 0x80) {
+		return 0;
+	}
+
+	/* Start at 1 for the lead byte, then count the continuation bytes */
+	for (count = 1; (bytes[count] & 0xc0) == 0x80; count++) {
+		continue;
+	}
+
+	return count;
+}
+
+/*
+** "Unfold" a header, turning it into a single line without continuation.
+**
+** value - in/out: the header body; the old string is free()d and
+**         replaced by the unfolded copy
+** len   - length of *value; the result is never longer than the input,
+**         so this is enough room for the copy
+**
+** Every newline, together with all the whitespace that follows it, is
+** replaced by one space. If nothing but whitespace follows the newline
+** it is dropped altogether, which strips the final newline of the header
+** (the encoding routine puts it back).
+*/
+static void
+unfold_header(char **value, int len)
+{
+	char *unfolded = mh_xmalloc(len + 1);
+	char *dst = unfolded;
+	const char *src = *value;
+
+	while (*src != '\0') {
+		if (*src != '\n') {
+			*dst++ = *src++;
+			continue;
+		}
+
+		/* Skip the newline and the run of whitespace behind it */
+		do {
+			src++;
+		} while (is_fws(*src));
+
+		if (*src == '\0') {
+			break;
+		}
+		*dst++ = ' ';
+	}
+	*dst = '\0';
+
+	free(*value);
+	*value = unfolded;
+}
+
+/*
+** Encode a header containing addresses. This means we have to parse
+** each address and only encode the display-name or comment field.
+**
+** name    - header name; only its length is used, for line wrapping
+** value   - in/out: the header body. It is replaced by the rebuilt header
+**           unless an encoding error aborts processing, in which case it
+**           is left untouched.
+** charset - character set used to label the encoded words
+**
+** Returns 0 on success, 1 if any address could not be parsed or encoded.
+** An address that fails to parse is reported and left out of the rebuilt
+** header; the remaining addresses are still processed.
+*/
+static int
+field_encode_address(const char *name, char **value, const char *charset)
+{
+	int prefixlen = strlen(name) + 2;
+	int column = prefixlen, groupflag;
+	int asciichars, specialchars, eightbitchars;
+	int reformat = 0, errflag = 0;
+	size_t len;
+	char *mp, *cp = NULL, *output = NULL;
+	char *tmpbuf = NULL;
+	size_t tmpbufsize = 0;
+	struct mailname *mn;
+	char errbuf[BUFSIZ];
+
+	/*
+	** Because these are addresses, we need to handle them individually.
+	**
+	** Break them down and process them one by one. This means we
+	** have to rewrite the whole header, but that's unavoidable.
+	*/
+
+	/*
+	** The output headers always have to start with a space first;
+	** this is just the way the API works right now.
+	*/
+
+	output = add(" ", output);
+
+	/*
+	** NOTE(review): mn is not released anywhere in this function, and
+	** the error exits leave the loop before getname() has returned
+	** NULL -- confirm whether the address parser needs cleanup calls.
+	*/
+	for (groupflag = 0; (mp = getname(*value)); ) {
+		if ((mn = getm(mp, NULL, 0, AD_HOST, errbuf)) == NULL) {
+			advise(NULL, "%s: %s", errbuf, mp);
+			errflag++;
+			continue;
+		}
+
+		reformat = 0;
+
+		/*
+		** We only care if the phrase (m_pers) or any trailing
+		** comment (m_note) have 8-bit characters. If doing q-p,
+		** we also need to encode anything marked as qspecial().
+		** Unquote it first so the specialchars count is right.
+		*/
+
+		if (! mn->m_pers) {
+			goto check_note;
+		}
+
+		if ((len = strlen(mn->m_pers)) + 1 > tmpbufsize) {
+			tmpbuf = mh_xrealloc(tmpbuf, tmpbufsize = len + 1);
+		}
+
+		unquote_string(mn->m_pers, tmpbuf);
+
+		if (scanstring(tmpbuf, &asciichars, &eightbitchars,
+				&specialchars)) {
+			/*
+			** If we have 8-bit characters, encode it.
+			*/
+
+			/*
+			** This is okay, because the output of
+			** unquote_string will be either equal or shorter
+			** than the original.
+			*/
+			strcpy(mn->m_pers, tmpbuf);
+
+			if (field_encode_quoted(NULL, &mn->m_pers, charset,
+					asciichars,
+					eightbitchars + specialchars, 1)) {
+				errflag++;
+				goto out;
+			}
+
+			reformat++;
+		}
+
+check_note:
+
+		/*
+		** The "note" field is generally a comment at the end
+		** of the address, at least as how it's implemented here.
+		** Notes are always surrounded by parenthesis (since they're
+		** comments). Strip them out and then put them back when
+		** we format the final field, but they do not get encoded.
+		*/
+
+		if (! mn->m_note) {
+			goto do_reformat;
+		}
+
+		if ((len = strlen(mn->m_note)) + 1 > tmpbufsize) {
+			tmpbuf = mh_xrealloc(tmpbuf, tmpbufsize = len + 1);
+		}
+
+		/*
+		** A valid note is at least "()". Checking the length first
+		** keeps m_note[len - 1] from being read when the note is
+		** an empty string.
+		*/
+		if (len < 2 || mn->m_note[0] != '(' ||
+				mn->m_note[len - 1] != ')') {
+			advise(NULL, "Internal error: Invalid note field \"%s\"",
+					mn->m_note);
+			errflag++;
+			goto out;
+		}
+
+		/* Copy the note without its surrounding parentheses */
+		strncpy(tmpbuf, mn->m_note + 1, len - 1);
+		tmpbuf[len - 2] = '\0';
+
+		if (scanstring(tmpbuf, &asciichars, &eightbitchars,
+				&specialchars)) {
+			/*
+			** If we have 8-bit characters, encode it.
+			*/
+
+			if (field_encode_quoted(NULL, &tmpbuf, charset,
+					asciichars,
+					eightbitchars + specialchars, 1)) {
+				errflag++;
+				goto out;
+			}
+
+			reformat++;
+
+			/*
+			** Make sure the size of tmpbuf is correct (it
+			** always gets reallocated in the above functions).
+			*/
+
+			tmpbufsize = strlen(tmpbuf) + 1;
+
+			/*
+			** Put the note field back surrounded by
+			** parenthesis.
+			*/
+
+			mn->m_note = mh_xrealloc(mn->m_note, tmpbufsize + 2);
+
+			snprintf(mn->m_note, tmpbufsize + 2, "(%s)", tmpbuf);
+		}
+
+do_reformat:
+
+		/*
+		** So, some explanation is in order.
+		**
+		** We know we need to rewrite at least one address in the
+		** header, otherwise we wouldn't be here. If we had to
+		** reformat this particular address, then run it through
+		** adrformat(). Otherwise we can use m_text directly.
+		*/
+
+		/*
+		** If we were in a group but are no longer, make sure we
+		** add a semicolon (which needs to be FIRST, as it needs
+		** to be at the end of the last address).
+		*/
+
+		if (groupflag && ! mn->m_ingrp) {
+			output = add(";", output);
+			column += 1;
+		}
+
+		groupflag = mn->m_ingrp;
+
+		if (mn->m_gname) {
+			cp = add(mn->m_gname, NULL);
+		}
+
+		if (reformat) {
+			cp = add(adrformat(mn), cp);
+		} else {
+			cp = add(mn->m_text, cp);
+		}
+
+		len = strlen(cp);
+
+		/*
+		** If we're not at the beginning of the line, add a
+		** comma and either a space or a newline.
+		*/
+
+		if (column != prefixlen) {
+			if (len + column + 2 > OUTPUTLINELEN) {
+				/*
+				** The separator is ",\n" followed by
+				** prefixlen spaces, plus the NUL. Grow the
+				** scratch buffer if it is too small for
+				** that (it may not even exist yet if no
+				** address so far had a name or a note).
+				*/
+				size_t seplen = (size_t) prefixlen + 3;
+
+				if (seplen > tmpbufsize) {
+					tmpbuf = mh_xrealloc(tmpbuf,
+							tmpbufsize = seplen);
+				}
+
+				snprintf(tmpbuf, tmpbufsize, ",\n%*s",
+						column = prefixlen, "");
+				output = add(tmpbuf, output);
+			} else {
+				output = add(", ", output);
+				column += 2;
+			}
+		}
+
+		/*
+		** Finally add the address
+		*/
+
+		output = add(cp, output);
+		column += len;
+		free(cp);
+		cp = NULL;
+	}
+
+	/*
+	** Just in case we're at the end of a list
+	*/
+
+	if (groupflag) {
+		output = add(";", output);
+	}
+
+	output = add("\n", output);
+
+	/* Success: ownership of the rebuilt header moves to the caller */
+	free(*value);
+	*value = output;
+	output = NULL;
+
+out:
+
+	/* free(NULL) is a no-op, so no guards are needed here */
+	free(tmpbuf);
+	free(output);
+
+	return errflag > 0;
+}
+
+/*
+** Scan a string and count the characters relevant to Q-encoding it as a
+** 'phrase'.
+**
+** string        - the NUL-terminated text to examine
+** asciilen      - out: number of ASCII characters
+** eightbitchars - out: number of non-ASCII characters
+** specialchars  - out: number of ASCII characters, other than a space,
+**                 that qphrasevalid() rejects; these are included in the
+**                 asciilen total as well
+**
+** Returns nonzero if the string contains at least one non-ASCII
+** character, i.e. if it needs to be encoded.
+*/
+
+static int
+scanstring(const char *string, int *asciilen, int *eightbitchars,
+		int *specialchars)
+{
+	int nascii = 0, neightbit = 0, nspecial = 0;
+	const unsigned char *s;
+
+	for (s = (const unsigned char *) string; *s != '\0'; s++) {
+		if (!isascii(*s)) {
+			neightbit++;
+			continue;
+		}
+		nascii++;
+		/*
+		** A space is not a valid phrase character, but it is no
+		** special either: in q-p it can be written directly as
+		** an underscore.
+		*/
+		if (*s != ' ' && !qphrasevalid(*s)) {
+			nspecial++;
+		}
+	}
+
+	*asciilen = nascii;
+	*eightbitchars = neightbit;
+	*specialchars = nspecial;
+
+	return neightbit > 0;
+}
+
+#if 0
+
+/*
+** Pick the encoding algorithm to use when none is given: whichever of
+** quoted-printable and base64 produces the shorter output.
+**
+** Arguments are:
+**
+** ascii     - Number of ASCII characters in to-be-encoded string.
+** specials  - Number of ASCII characters in to-be-encoded string that
+**             still require encoding under quoted-printable. Note that
+**             these are included in the "ascii" total.
+** eightbits - Eight-bit characters in the to-be-encoded string.
+**
+** Returns one of CE_BASE64 or CE_QUOTED; CE_QUOTED wins a tie.
+**/
+static int
+pref_encoding(int ascii, int specials, int eightbits)
+{
+	/* q-p: one character per plain ASCII, three per escaped character */
+	int qplen = ascii - specials + (specials + eightbits) * 3;
+	/* base64: see the base64len macro for details */
+	int b64len = base64len(ascii + eightbits);
+
+	if (b64len < qplen) {
+		return CE_BASE64;
+	}
+	return CE_QUOTED;
+}
+
+#endif