2 ** Routines to encode message headers using RFC 2047-encoding.
4 ** This code is Copyright (c) 2002, by the authors of nmh. See the
5 ** COPYRIGHT file in the root directory of the nmh distribution for
6 ** complete copyright information.
10 #include <h/mhparse.h>
11 #include <h/addrsbr.h>
17 ** List of headers that contain addresses and as a result require special
21 static char *address_headers[] = {
38 ** Macros we use for parsing headers
40 ** Todo: convert the macros to functions
/*
** is_fws(c): non-zero when c is a tab, a space or a newline.
**
** The argument is parenthesized at every use so that an expression
** argument (for example `ch & 0x7f`) is compared as a whole instead of
** being split by operator precedence.  The argument is still evaluated
** up to three times, so it must not have side effects.
*/
#define is_fws(c) ((c) == '\t' || (c) == ' ' || (c) == '\n')
/*
** qphrasevalid(c): non-zero when c may be emitted as itself inside a
** Q-encoded word that replaces a 'phrase' (display name or comment in
** an address header): ASCII letters, digits and '!', '*', '+', '-', '/'.
**
** '=' and '_' are deliberately NOT accepted.  RFC 2047 allows those two
** characters to appear in a phrase encoded-word only in their encoding
** roles ('=' introduces a hex escape, '_' stands for a space).  A
** literal '=' or '_' in the input therefore has to be written as =3D or
** =5F; passing it through unchanged would decode to different text.
** Both users of this macro in this file (the phrase scanner and the
** phrase branch of the Q encoder) consult it, so the count of
** characters needing encoding and the encoder itself stay in agreement.
**
** The argument is parenthesized at every use so expression arguments
** bind correctly.  It is evaluated several times, so it must not have
** side effects.
*/
#define qphrasevalid(c) (((c) >= '0' && (c) <= '9') || \
	((c) >= 'A' && (c) <= 'Z') || \
	((c) >= 'a' && (c) <= 'z') || \
	(c) == '!' || (c) == '*' || (c) == '+' || (c) == '-' || \
	(c) == '/')
/*
** qpspecial(c): non-zero when an ASCII character cannot stand for itself
** in Q-encoded unstructured text: anything below space, plus '=', '?'
** and '_'.
**
** Callers are expected to pass an unsigned value (the visible callers
** cast to unsigned char); a negative argument would satisfy the
** "below space" test.  The argument is parenthesized at every use so
** expression arguments bind correctly; it is evaluated up to four
** times, so it must not have side effects.
*/
#define qpspecial(c) ((c) < ' ' || (c) == '=' || (c) == '?' || (c) == '_')
/*
** base64len(n): n rounded up to a whole number of 3-byte groups, times
** four output characters per group.
*/
51 #define base64len(n) ((((n) + 2) / 3) * 4) /* String len to base64 len */
/*
** strbase64(n): three input bytes for every complete group of four in
** n; a trailing partial group of fewer than four is ignored.
*/
52 #define strbase64(n) ((n) / 4 * 3) /* Chars that fit in base64 */
/*
** Column limit checked while emitting encoded words; the encoder starts
** a new line before a line would exceed it (two columns are held back
** for the closing "?=").
*/
54 #define ENCODELINELIMIT 76
56 static void unfold_header(char **, int);
57 static int field_encode_address(const char *, char **, const char *);
58 static int field_encode_quoted(const char *, char **, const char *, int,
60 static int scanstring(const char *, int *, int *, int *);
61 static int utf8len(const char *);
62 /*static int pref_encoding(int, int, int);*/
65 ** Encode a message header using RFC 2047 encoding. We make the assumption
66 ** that all characters < 128 are ASCII and as a consequence don't need any
70 encode_rfc2047(const char *name, char **value, const char *charset)
72 int i, asciicount = 0, eightbitcount = 0, qpspecialcount = 0;
75 /* First, check to see if we even need to encode the header */
77 for (p = *value; *p != '\0'; p++) {
78 if (isascii((unsigned char) *p)) {
80 if (qpspecial((unsigned char) *p)) {
88 if (eightbitcount == 0) {
93 ** Some rules from RFC 2047:
95 ** - Encoded words cannot be more than 75 characters long
96 ** - Multiple "long" encoded words must be on new lines.
98 ** Also, we're not permitted to encode email addresses, so
99 ** we need to actually _parse_ email addresses and only encode
104 ** If charset was NULL, then get the value from the locale. But
105 ** we reject it if it returns US-ASCII
108 if (charset == NULL) {
109 charset = write_charset_8bit();
111 if (strcasecmp(charset, "US-ASCII") == 0) {
112 advise(NULL, "Cannot use US-ASCII with 8 bit characters in header");
117 ** If we have an address header, then we need to parse the addresses
118 ** and only encode the names or comments. Otherwise, handle it
122 for (i = 0; address_headers[i]; i++) {
123 if (strcasecmp(name, address_headers[i]) == 0) {
124 return field_encode_address(name, value, charset);
129 ** On the encoding we choose, and the specifics of encoding:
131 ** - If a specified encoding is passed in, we use that.
132 ** - Otherwise, pick which encoding is shorter.
134 ** We don't quite handle continuation right here, but it should be
138 unfold_header(value, asciicount + eightbitcount);
140 return field_encode_quoted(name, value, charset, asciicount,
141 eightbitcount + qpspecialcount, 0);
145 ** Encode our specified header (or field) using quoted-printable
149 field_encode_quoted(const char *name, char **value, const char *charset,
150 int ascii, int encoded, int phraserules)
152 int prefixlen = name ? strlen(name) + 2: 0;
153 int outlen = 0, column, newline = 1, utf8;
154 int charsetlen = strlen(charset);
155 char *output = NULL, *p, *q = NULL;
158 ** Right now we just encode the whole thing. Maybe later on we'll
159 ** only encode things on a per-atom basis.
164 column = prefixlen + 2; /* Header name plus ": " */
166 utf8 = strcasecmp(charset, "UTF-8") == 0;
169 /* Start a new line, if it's time */
174 ** If it's the start of the header, we don't need
177 ** The length of the output string is ...
178 ** =?charset?Q?...?= so that's
179 ** 7+strlen(charset) + 2 for \n NUL
181 ** plus 1 for every ASCII character and 3 for
182 ** every eight bit or special character (eight
183 ** bit characters are written as =XX).
185 outlen += 9 + charsetlen + ascii + 3 * encoded;
188 /* continue the header */
189 int curlen = q - output, i;
190 outlen += prefixlen + 1; /* Header plus \n ": " */
191 output = mh_xrealloc(output, outlen);
196 for (i = 0; i < prefixlen; i++) {
200 /* do the initial allocation */
202 ** A bit of a hack here; the header can
203 ** contain multiple spaces (probably at
204 ** least one) until we get to the actual
205 ** text. Copy until we get to a non-space.
207 output = mh_xmalloc(outlen);
214 tokenlen = snprintf(q, outlen - (q - output),
217 column = prefixlen + tokenlen;
222 ** Process each character, encoding if necessary
224 ** Note that we have a different set of rules if we're
225 ** processing RFC 5322 'phrase' (something you'd see in
226 ** an address header).
234 } else if (isascii((unsigned char) *p) && (phraserules ?
235 qphrasevalid((unsigned char) *p)
236 : !qpspecial((unsigned char) *p))) {
240 snprintf(q, outlen - (q - output), "=%02X",
243 column += 2; /* column already incremented by 1 above */
249 if (prefixlen == 0) {
251 ** We haven't been passed in a header name,
252 ** so don't ever wrap the field (we're likely
253 ** doing an address).
258 ** We're not allowed more than ENCODELINELIMIT characters
259 ** per line, so reserve some room for the final ?=.
261 if (column >= ENCODELINELIMIT - 2) {
265 ** Okay, this is a bit weird, but to explain a
268 ** RFC 2047 prohibits the splitting of multibyte
269 ** characters across encoded words. Right now
270 ** we only handle the case of UTF-8, the most
271 ** common multibyte encoding.
273 ** p is now pointing at the next input character.
274 ** If we're using UTF-8 _and_ we'd go over
275 ** ENCODELINELIMIT given the length of the
276 ** complete character, then trigger a newline now.
277 ** Note that we check the length * 3 since we
278 ** have to allow for the encoded output.
280 if (column + (utf8len(p)*3) > ENCODELINELIMIT - 2) {
288 ** This should never happen, but just in case.
289 ** Found by clang static analyzer.
291 admonish (NULL, "null output encoding for %s", *value);
309 ** Calculate the length of a UTF-8 character.
311 ** If it's not a UTF-8 character (or we're in the middle of a multibyte
312 ** character) then simply return 0.
315 utf8len(const char *p)
322 if (isascii((unsigned char) *p) ||
323 (((unsigned char) *p) & 0xc0) == 0x80) {
327 while ((((unsigned char) *p++) & 0xc0) == 0x80) {
335 ** "Unfold" a header, making it a single line (without continuation)
337 ** We cheat a bit here; we never make the string longer, so using the
338 ** original length here is fine.
341 unfold_header(char **value, int len)
343 char *str = mh_xmalloc(len + 1);
344 char *p = str, *q = *value;
349 ** When we get a newline, skip to the next
350 ** non-whitespace character and add a space to
351 ** replace all of the whitespace
353 ** This has the side effect of stripping off the
354 ** final newline for the header; we put it back
355 ** in the encoding routine.
375 ** Encode a header containing addresses. This means we have to parse
376 ** each address and only encode the display-name or comment field.
379 field_encode_address(const char *name, char **value, const char *charset)
381 int prefixlen = strlen(name) + 2;
382 int column = prefixlen, groupflag;
383 int asciichars, specialchars, eightbitchars;
384 int reformat = 0, errflag = 0;
386 char *mp, *cp = NULL, *output = NULL;
388 size_t tmpbufsize = 0;
393 ** Because these are addresses, we need to handle them individually.
395 ** Break them down and process them one by one. This means we
396 ** have to rewrite the whole header, but that's unavoidable.
400 ** The output headers always have to start with a space first;
401 ** this is just the way the API works right now.
404 output = add(" ", output);
406 for (groupflag = 0; (mp = getname(*value)); ) {
407 if ((mn = getm(mp, NULL, 0, AD_HOST, errbuf)) == NULL) {
408 advise(NULL, "%s: %s", errbuf, mp);
416 ** We only care if the phrase (m_pers) or any trailing
417 ** comment (m_note) have 8-bit characters. If doing q-p,
418 ** we also need to encode anything marked as qspecial().
419 ** Unquote it first so the specialchars count is right.
426 if ((len = strlen(mn->m_pers)) + 1 > tmpbufsize) {
427 tmpbuf = mh_xrealloc(tmpbuf, tmpbufsize = len + 1);
430 unquote_string(mn->m_pers, tmpbuf);
432 if (scanstring(tmpbuf, &asciichars, &eightbitchars,
435 ** If we have 8-bit characters, encode it.
439 ** This is okay, because the output of
440 ** unquote_string will be either equal or shorter
441 ** than the original.
443 strcpy(mn->m_pers, tmpbuf);
445 if (field_encode_quoted(NULL, &mn->m_pers, charset,
447 eightbitchars + specialchars, 1)) {
458 ** The "note" field is generally a comment at the end
459 ** of the address, at least as how it's implemented here.
460 ** Notes are always surrounded by parentheses (since they're
461 ** comments). Strip them out and then put them back when
462 ** we format the final field, but they do not get encoded.
469 if ((len = strlen(mn->m_note)) + 1 > tmpbufsize) {
470 tmpbuf = mh_xrealloc(tmpbuf, tmpbufsize = len + 1);
473 if (mn->m_note[0] != '(' || mn->m_note[len - 1] != ')') {
474 advise(NULL, "Internal error: Invalid note field \"%s\"",
480 strncpy(tmpbuf, mn->m_note + 1, len - 1);
481 tmpbuf[len - 2] = '\0';
483 if (scanstring(tmpbuf, &asciichars, &eightbitchars,
486 ** If we have 8-bit characters, encode it.
489 if (field_encode_quoted(NULL, &tmpbuf, charset,
491 eightbitchars + specialchars, 1)) {
499 ** Make sure the size of tmpbuf is correct (it
500 ** always gets reallocated in the above functions).
503 tmpbufsize = strlen(tmpbuf) + 1;
506 ** Put the note field back surrounded by
510 mn->m_note = mh_xrealloc(mn->m_note, tmpbufsize + 2);
512 snprintf(mn->m_note, tmpbufsize + 2, "(%s)", tmpbuf);
518 ** So, some explanation is in order.
520 ** We know we need to rewrite at least one address in the
521 ** header, otherwise we wouldn't be here. If we had to
522 ** reformat this particular address, then run it through
523 ** adrformat(). Otherwise we can use m_text directly.
527 ** If we were in a group but are no longer, make sure we
528 ** add a semicolon (which needs to be FIRST, as it needs
529 ** to be at the end of the last address).
532 if (groupflag && ! mn->m_ingrp) {
533 output = add(";", output);
537 groupflag = mn->m_ingrp;
540 cp = add(mn->m_gname, NULL);
544 cp = add(adrformat(mn), cp);
546 cp = add(mn->m_text, cp);
552 ** If we're not at the beginning of the line, add a
553 ** comma and either a space or a newline.
556 if (column != prefixlen) {
557 if (len + column + 2 > OUTPUTLINELEN) {
559 if ((size_t) (prefixlen + 3) < tmpbufsize) {
560 tmpbuf = mh_xrealloc(tmpbuf, tmpbufsize = prefixlen + 3);
563 snprintf(tmpbuf, tmpbufsize, ",\n%*s", column = prefixlen, "");
564 output = add(tmpbuf, output);
566 output = add(", ", output);
572 ** Finally add the address
575 output = add(cp, output);
582 ** Just in case we're at the end of a list
586 output = add(";", output);
589 output = add("\n", output);
608 ** Scan a string, check for characters that need to be encoded
612 scanstring(const char *string, int *asciilen, int *eightbitchars,
619 for (; *string != '\0'; string++) {
620 if ((isascii((unsigned char) *string))) {
623 ** So, a space is not a valid phrase character, but
624 ** we're counting an exception here, because in q-p
625 ** a space can be directly encoded as an underscore.
627 if (!qphrasevalid((unsigned char) *string) &&
636 return *eightbitchars > 0;
642 ** This function is to be used to decide which encoding algorithm we should
643 ** use if one is not given. Basically, we pick whichever one is the shorter
648 ** ascii - Number of ASCII characters in to-be-encoded string.
649 ** specials - Number of ASCII characters in to-be-encoded string that
650 ** still require encoding under quoted-printable. Note that
651 ** these are included in the "ascii" total.
652 ** eightbit - Eight-bit characters in the to-be-encoded string.
654 ** Returns one of CE_BASE64 or CE_QUOTED.
657 pref_encoding(int ascii, int specials, int eightbits)
660 ** The length of the q-p encoding is:
662 ** ascii - specials + (specials + eightbits) * 3.
664 ** The length of the base64 encoding is:
666 ** base64len(ascii + eightbits) (See macro for details)
668 return base64len(ascii + eightbits) < (ascii - specials +
669 (specials + eightbits) * 3) ? CE_BASE64 : CE_QUOTED;