2 * Routines to encode message headers using RFC 2047-encoding.
4 * This code is Copyright (c) 2002, by the authors of nmh. See the
5 * COPYRIGHT file in the root directory of the nmh distribution for
6 * complete copyright information.
10 #include <h/mhparse.h>
11 #include <h/addrsbr.h>
17 * List of headers that contain addresses and as a result require special
21 static char *address_headers[] = {
38 * Macros we use for parsing headers
/*
 * Character-class and size helpers used while parsing and encoding headers.
 *
 * Every macro parameter is parenthesized so that an argument containing a
 * low-precedence operator (for example a conditional expression) is
 * classified as a whole.  The character-class macros still evaluate their
 * argument more than once, so only pass expressions without side effects.
 */

/* True if c is a tab, a space or a newline. */
#define is_fws(c) ((c) == '\t' || (c) == ' ' || (c) == '\n')

/* True if c is in '0'-'9', 'A'-'Z', 'a'-'z', or is one of ! * + - / = _ */
#define qphrasevalid(c) (((c) >= '0' && (c) <= '9') || \
                         ((c) >= 'A' && (c) <= 'Z') || \
                         ((c) >= 'a' && (c) <= 'z') || \
                         (c) == '!' || (c) == '*' || (c) == '+' || (c) == '-' || \
                         (c) == '/' || (c) == '=' || (c) == '_')

/* True if c sorts below ' ' or is one of = ? _ */
#define qpspecial(c) ((c) < ' ' || (c) == '=' || (c) == '?' || (c) == '_')

#define base64len(n) ((((n) + 2) / 3 ) * 4) /* String len to base64 len */
#define strbase64(n) ((n) / 4 * 3) /* Chars that fit in base64 */
52 #define ENCODELINELIMIT 76
54 static void unfold_header(char **, int);
55 static int field_encode_address(const char *, char **, const char *);
56 static int field_encode_quoted(const char *, char **, const char *, int,
58 static int scanstring(const char *, int *, int *, int *);
59 static int utf8len(const char *);
60 /*static int pref_encoding(int, int, int);*/
63 * Encode a message header using RFC 2047 encoding. We make the assumption
64 * that all characters < 128 are ASCII and as a consequence don't need any
69 encode_rfc2047(const char *name, char **value, const char *charset)
71 int i, asciicount = 0, eightbitcount = 0, qpspecialcount = 0;
75 * First, check to see if we even need to encode the header
78 for (p = *value; *p != '\0'; p++) {
79 if (isascii((unsigned char) *p)) {
81 if (qpspecial((unsigned char) *p))
87 if (eightbitcount == 0)
91 * Some rules from RFC 2047:
93 * - Encoded words cannot be more than 75 characters long
94 * - Multiple "long" encoded words must be on new lines.
96 * Also, we're not permitted to encode email addresses, so
97 * we need to actually _parse_ email addresses and only encode
102 * If charset was NULL, then get the value from the locale. But
103 * we reject it if it returns US-ASCII
107 charset = write_charset_8bit();
109 if (strcasecmp(charset, "US-ASCII") == 0) {
110 advise(NULL, "Cannot use US-ASCII with 8 bit characters in header");
115 * If we have an address header, then we need to parse the addresses
116 * and only encode the names or comments. Otherwise, handle it normally.
119 for (i = 0; address_headers[i]; i++) {
120 if (strcasecmp(name, address_headers[i]) == 0) {
121 return field_encode_address(name, value, charset);
126 * On the encoding we choose, and the specifics of encoding:
128 * - If a specified encoding is passed in, we use that.
129 * - Otherwise, pick which encoding is shorter.
131 * We don't quite handle continuation right here, but it should be
135 unfold_header(value, asciicount + eightbitcount);
137 return field_encode_quoted(name, value, charset, asciicount,
138 eightbitcount + qpspecialcount, 0);
142 * Encode our specified header (or field) using quoted-printable
146 field_encode_quoted(const char *name, char **value, const char *charset,
147 int ascii, int encoded, int phraserules)
149 int prefixlen = name ? strlen(name) + 2: 0, outlen = 0, column, newline = 1;
150 int charsetlen = strlen(charset), utf8;
151 char *output = NULL, *p, *q = NULL;
154 * Right now we just encode the whole thing. Maybe later on we'll
155 * only encode things on a per-atom basis.
160 column = prefixlen + 2; /* Header name plus ": " */
162 utf8 = strcasecmp(charset, "UTF-8") == 0;
166 * Start a new line, if it's time
170 * If it's the start of the header, we don't need to pad it
172 * The length of the output string is ...
173 * =?charset?Q?...?= so that's 7+strlen(charset) + 2 for \n NUL
175 * plus 1 for every ASCII character and 3 for every eight bit
176 * or special character (eight bit characters are written as =XX).
182 outlen += 9 + charsetlen + ascii + 3 * encoded;
185 * If output is set, then we're continuing the header. Otherwise
186 * do the initial allocation.
190 int curlen = q - output, i;
191 outlen += prefixlen + 1; /* Header plus \n ": " */
192 output = mh_xrealloc(output, outlen);
197 for (i = 0; i < prefixlen; i++)
201 * A bit of a hack here; the header can contain multiple
202 * spaces (probably at least one) until we get to the
203 * actual text. Copy until we get to a non-space.
205 output = mh_xmalloc(outlen);
211 tokenlen = snprintf(q, outlen - (q - output), "=?%s?Q?", charset);
213 column = prefixlen + tokenlen;
218 * Process each character, encoding if necessary
220 * Note that we have a different set of rules if we're processing
221 * RFC 5322 'phrase' (something you'd see in an address header).
229 } else if (isascii((unsigned char) *p) &&
230 (phraserules ? qphrasevalid((unsigned char) *p) :
231 !qpspecial((unsigned char) *p))) {
235 snprintf(q, outlen - (q - output), "=%02X", (unsigned char) *p);
237 column += 2; /* column already incremented by 1 above */
244 * We're not allowed more than ENCODELINELIMIT characters per line,
245 * so reserve some room for the final ?=.
247 * If prefixlen == 0, we haven't been passed in a header name, so
248 * don't ever wrap the field (we're likely doing an address).
254 if (column >= ENCODELINELIMIT - 2) {
258 * Okay, this is a bit weird, but to explain a bit more ...
260 * RFC 2047 prohibits the splitting of multibyte characters
261 * across encoded words. Right now we only handle the case
262 * of UTF-8, the most common multibyte encoding.
264 * p is now pointing at the next input character. If we're
265 * using UTF-8 _and_ we'd go over ENCODELINELIMIT given the
266 * length of the complete character, then trigger a newline
267 * now. Note that we check the length * 3 since we have to
268 * allow for the encoded output.
270 if (column + (utf8len(p) * 3) > ENCODELINELIMIT - 2) {
277 /* This should never happen, but just in case. Found by
278 clang static analyzer. */
279 admonish (NULL, "null output encoding for %s", *value);
298 * Calculate the length of a UTF-8 character.
300 * If it's not a UTF-8 character (or we're in the middle of a multibyte
301 * character) then simply return 0.
305 utf8len(const char *p)
312 if (isascii((unsigned char) *p) || (((unsigned char) *p) & 0xc0) == 0x80)
316 while ((((unsigned char) *p++) & 0xc0) == 0x80)
323 * "Unfold" a header, making it a single line (without continuation)
325 * We cheat a bit here; we never make the string longer, so using the
326 * original length here is fine.
330 unfold_header(char **value, int len)
332 char *str = mh_xmalloc(len + 1);
333 char *p = str, *q = *value;
338 * When we get a newline, skip to the next non-whitespace
339 * character and add a space to replace all of the whitespace
341 * This has the side effect of stripping off the final newline
342 * for the header; we put it back in the encoding routine.
362 * Encode a header containing addresses. This means we have to parse
363 * each address and only encode the display-name or comment field.
367 field_encode_address(const char *name, char **value, const char *charset)
369 int prefixlen = strlen(name) + 2, column = prefixlen, groupflag;
370 int asciichars, specialchars, eightbitchars, reformat = 0, errflag = 0;
372 char *mp, *cp = NULL, *output = NULL;
374 size_t tmpbufsize = 0;
379 * Because these are addresses, we need to handle them individually.
381 * Break them down and process them one by one. This means we have to
382 * rewrite the whole header, but that's unavoidable.
386 * The output headers always have to start with a space first; this
387 * is just the way the API works right now.
390 output = add(" ", output);
392 for (groupflag = 0; (mp = getname(*value)); ) {
393 if ((mn = getm(mp, NULL, 0, AD_HOST, errbuf)) == NULL) {
394 advise(NULL, "%s: %s", errbuf, mp);
402 * We only care if the phrase (m_pers) or any trailing comment
403 * (m_note) have 8-bit characters. If doing q-p, we also need
404 * to encode anything marked as qpspecial(). Unquote it first
405 * so the specialchars count is right.
411 if ((len = strlen(mn->m_pers)) + 1 > tmpbufsize) {
412 tmpbuf = mh_xrealloc(tmpbuf, tmpbufsize = len + 1);
415 unquote_string(mn->m_pers, tmpbuf);
417 if (scanstring(tmpbuf, &asciichars, &eightbitchars,
420 * If we have 8-bit characters, encode it.
424 * This is okay, because the output of unquote_string will be either
425 * equal or shorter than the original.
428 strcpy(mn->m_pers, tmpbuf);
430 if (field_encode_quoted(NULL, &mn->m_pers, charset, asciichars,
431 eightbitchars + specialchars, 1)) {
442 * The "note" field is generally a comment at the end of the address,
443 * at least as how it's implemented here. Notes are always surrounded
444 * by parentheses (since they're comments). Strip them out and
445 * then put them back when we format the final field, but they do
452 if ((len = strlen(mn->m_note)) + 1 > tmpbufsize) {
453 tmpbuf = mh_xrealloc(tmpbuf, tmpbufsize = len + 1);
456 if (mn->m_note[0] != '(' || mn->m_note[len - 1] != ')') {
457 advise(NULL, "Internal error: Invalid note field \"%s\"",
463 strncpy(tmpbuf, mn->m_note + 1, len - 1);
464 tmpbuf[len - 2] = '\0';
466 if (scanstring(tmpbuf, &asciichars, &eightbitchars,
469 * If we have 8-bit characters, encode it.
472 if (field_encode_quoted(NULL, &tmpbuf, charset, asciichars,
473 eightbitchars + specialchars, 1)) {
481 * Make sure the size of tmpbuf is correct (it always gets
482 * reallocated in the above functions).
485 tmpbufsize = strlen(tmpbuf) + 1;
488 * Put the note field back surrounded by parentheses.
491 mn->m_note = mh_xrealloc(mn->m_note, tmpbufsize + 2);
493 snprintf(mn->m_note, tmpbufsize + 2, "(%s)", tmpbuf);
499 * So, some explanation is in order.
501 * We know we need to rewrite at least one address in the header,
502 * otherwise we wouldn't be here. If we had to reformat this
503 * particular address, then run it through adrformat(). Otherwise
504 * we can use m_text directly.
508 * If we were in a group but are no longer, make sure we add a
509 * semicolon (which needs to be FIRST, as it needs to be at the end
510 * of the last address).
513 if (groupflag && ! mn->m_ingrp) {
514 output = add(";", output);
518 groupflag = mn->m_ingrp;
521 cp = add(mn->m_gname, NULL);
525 cp = add(adrformat(mn), cp);
527 cp = add(mn->m_text, cp);
533 * If we're not at the beginning of the line, add a comma and
534 * either a space or a newline.
537 if (column != prefixlen) {
538 if (len + column + 2 > OUTPUTLINELEN) {
540 if ((size_t) (prefixlen + 3) < tmpbufsize)
541 tmpbuf = mh_xrealloc(tmpbuf, tmpbufsize = prefixlen + 3);
543 snprintf(tmpbuf, tmpbufsize, ",\n%*s", column = prefixlen, "");
544 output = add(tmpbuf, output);
546 output = add(", ", output);
552 * Finally add the address
555 output = add(cp, output);
562 * Just in case we're at the end of a list
566 output = add(";", output);
569 output = add("\n", output);
586 * Scan a string, check for characters that need to be encoded
590 scanstring(const char *string, int *asciilen, int *eightbitchars,
597 for (; *string != '\0'; string++) {
598 if ((isascii((unsigned char) *string))) {
601 * So, a space is not a valid phrase character, but we're counting
602 * an exception here, because in q-p a space can be directly
603 * encoded as an underscore.
605 if (!qphrasevalid((unsigned char) *string) && *string != ' ')
612 return *eightbitchars > 0;
616 * This function is to be used to decide which encoding algorithm we should
617 * use if one is not given. Basically, we pick whichever one is the shorter
622 * ascii - Number of ASCII characters in to-be-encoded string.
623 * specials - Number of ASCII characters in to-be-encoded string that
624 * still require encoding under quoted-printable. Note that
625 * these are included in the "ascii" total.
626 * eightbits - Eight-bit characters in the to-be-encoded string.
628 * Returns one of CE_BASE64 or CE_QUOTED.
632 pref_encoding(int ascii, int specials, int eightbits)
635 * The length of the q-p encoding is:
637 * ascii - specials + (specials + eightbits) * 3.
639 * The length of the base64 encoding is:
641 * base64len(ascii + eightbits) (See macro for details)
644 return base64len(ascii + eightbits) < (ascii - specials +
645 (specials + eightbits) * 3) ? CE_BASE64 : CE_QUOTED;