git.marmaro.de Git - mmh/blob - sbr/encode_rfc2047.c

   1 /*
   2  * Routines to encode message headers using RFC 2047-encoding.
   3  *
   4  * This code is Copyright (c) 2002, by the authors of nmh.  See the
   5  * COPYRIGHT file in the root directory of the nmh distribution for
   6  * complete copyright information.
   7  */
   8
   9 #include <h/mh.h>
  10 #include <h/mhparse.h>
  11 #include <h/addrsbr.h>
  12 #include <h/utils.h>
  13
  14 #include <ctype.h>
  15
  16 /*
  17  * List of headers that contain addresses and as a result require special
  18  * handling
  19  */
  20
  21 static char *address_headers[] = {
  22     "To",
  23     "From",
  24     "cc",
  25     "Bcc",
  26     "Reply-To",
  27     "Sender",
  28     "Resent-To",
  29     "Resent-From",
  30     "Resent-cc",
  31     "Resent-Bcc",
  32     "Resent-Reply-To",
  33     "Resent-Sender",
  34     NULL,
  35 };
  36
  37 /*
  38  * Macros we use for parsing headers
  39  */
  40
  41 #define is_fws(c) (c == '\t' || c == ' ' || c == '\n')
  42
  43 #define qphrasevalid(c) ((c >= '0' && c <= '9') || (c >= 'A' && c <= 'Z') || \
  44                          (c >= 'a' && c <= 'z') || \
  45                          c == '!' || c == '*' || c == '+' || c == '-' || \
  46                          c == '/' || c == '=' || c == '_')
  47 #define qpspecial(c) (c < ' ' || c == '=' || c == '?' || c == '_')
  48
  49 #define base64len(n) ((((n) + 2) / 3 ) * 4)     /* String len to base64 len */
  50 #define strbase64(n) ((n) / 4 * 3)              /* Chars that fit in base64 */
  51
  52 #define ENCODELINELIMIT 76
  53
  54 static void unfold_header(char **, int);
  55 static int field_encode_address(const char *, char **, const char *);
  56 static int field_encode_quoted(const char *, char **, const char *, int,
  57                                int, int);
  58 static int scanstring(const char *, int *, int *, int *);
  59 static int utf8len(const char *);
  60 /*static int pref_encoding(int, int, int);*/
  61
  62 /*
  63  * Encode a message header using RFC 2047 encoding.  We make the assumption
  64  * that all characters < 128 are ASCII and as a consequence don't need any
  65  * encoding.
  66  */
  67
  68 int
  69 encode_rfc2047(const char *name, char **value, const char *charset)
  70 {
  71     int i, asciicount = 0, eightbitcount = 0, qpspecialcount = 0;
  72     char *p;
  73
  74     /*
  75      * First, check to see if we even need to encode the header
  76      */
  77
  78     for (p = *value; *p != '\0'; p++) {
  79         if (isascii((unsigned char) *p)) {
  80             asciicount++;
  81             if (qpspecial((unsigned char) *p))
  82                 qpspecialcount++;
  83         } else
  84             eightbitcount++;
  85     }
  86
  87     if (eightbitcount == 0)
  88         return 0;
  89
  90     /*
  91      * Some rules from RFC 2047:
  92      *
  93      * - Encoded words cannot be more than 75 characters long
  94      * - Multiple "long" encoded words must be on new lines.
  95      *
  96      * Also, we're not permitted to encode email addresses, so
  97      * we need to actually _parse_ email addresses and only encode
  98      * the right bits.
  99      */
 100
 101     /*
 102      * If charset was NULL, then get the value from the locale.  But
 103      * we reject it if it returns US-ASCII
 104      */
 105
 106         if (charset == NULL)
 107                 charset = write_charset_8bit();
 108
 109         if (strcasecmp(charset, "US-ASCII") == 0) {
 110                 advise(NULL, "Cannot use US-ASCII with 8 bit characters in header");
 111                 return 1;
 112         }
 113
 114     /*
 115      * If we have an address header, then we need to parse the addresses
 116      * and only encode the names or comments.  Otherwise, handle it normally.
 117      */
 118
 119         for (i = 0; address_headers[i]; i++) {
 120                 if (strcasecmp(name, address_headers[i]) == 0) {
 121                         return field_encode_address(name, value, charset);
 122                 }
 123         }
 124
 125     /*
 126      * On the encoding we choose, and the specifics of encoding:
 127      *
 128      * - If a specified encoding is passed in, we use that.
 129      * - Otherwise, pick which encoding is shorter.
 130      *
 131      * We don't quite handle continuation right here, but it should be
 132      * pretty close.
 133      */
 134
 135     unfold_header(value, asciicount + eightbitcount);
 136
 137         return field_encode_quoted(name, value, charset, asciicount,
 138                                    eightbitcount + qpspecialcount, 0);
 139 }
 140
 141 /*
 142  * Encode our specified header (or field) using quoted-printable
 143  */
 144
 145 static int
 146 field_encode_quoted(const char *name, char **value, const char *charset,
 147                     int ascii, int encoded, int phraserules)
 148 {
 149     int prefixlen = name ? strlen(name) + 2: 0, outlen = 0, column, newline = 1;
 150     int charsetlen = strlen(charset), utf8;
 151     char *output = NULL, *p, *q = NULL;
 152
 153     /*
 154      * Right now we just encode the whole thing.  Maybe later on we'll
 155      * only encode things on a per-atom basis.
 156      */
 157
 158     p = *value;
 159
 160     column = prefixlen + 2;     /* Header name plus ": " */
 161
 162     utf8 = strcasecmp(charset, "UTF-8") == 0;
 163
 164     while (*p != '\0') {
 165         /*
 166          * Start a new line, if it's time
 167          */
 168         if (newline) {
 169             /*
 170              * If it's the start of the header, we don't need to pad it
 171              *
 172              * The length of the output string is ...
 173              * =?charset?Q?...?=  so that's 7+strlen(charset) + 2 for \n NUL
 174              *
 175              * plus 1 for every ASCII character and 3 for every eight bit
 176              * or special character (eight bit characters are written as =XX).
 177              *
 178              */
 179
 180             int tokenlen;
 181
 182             outlen += 9 + charsetlen + ascii + 3 * encoded;
 183
 184             /*
 185              * If output is set, then we're continuing the header.  Otherwise
 186              * do the initial allocation.
 187              */
 188
 189             if (output) {
 190                 int curlen = q - output, i;
 191                 outlen += prefixlen + 1;        /* Header plus \n ": " */
 192                 output = mh_xrealloc(output, outlen);
 193                 q = output + curlen;
 194                 *q++ = '?';
 195                 *q++ = '=';
 196                 *q++ = '\n';
 197                 for (i = 0; i < prefixlen; i++)
 198                     *q++ = ' ';
 199             } else {
 200                 /*
 201                  * A bit of a hack here; the header can contain multiple
 202                  * spaces (probably at least one) until we get to the
 203                  * actual text.  Copy until we get to a non-space.
 204                  */
 205                 output = mh_xmalloc(outlen);
 206                 q = output;
 207                 while (is_fws(*p))
 208                     *q++ = *p++;
 209             }
 210
 211             tokenlen = snprintf(q, outlen - (q - output), "=?%s?Q?", charset);
 212             q += tokenlen;
 213             column = prefixlen + tokenlen;
 214             newline = 0;
 215         }
 216
 217         /*
 218          * Process each character, encoding if necessary
 219          *
 220          * Note that we have a different set of rules if we're processing
 221          * RFC 5322 'phrase' (something you'd see in an address header).
 222          */
 223
 224         column++;
 225
 226         if (*p == ' ') {
 227             *q++ = '_';
 228             ascii--;
 229         } else if (isascii((unsigned char) *p) &&
 230                    (phraserules ? qphrasevalid((unsigned char) *p) :
 231                                         !qpspecial((unsigned char) *p))) {
 232             *q++ = *p;
 233             ascii--;
 234         } else {
 235             snprintf(q, outlen - (q - output), "=%02X", (unsigned char) *p);
 236             q += 3;
 237             column += 2;        /* column already incremented by 1 above */
 238             encoded--;
 239         }
 240
 241         p++;
 242
 243         /*
 244          * We're not allowed more than ENCODELINELIMIT characters per line,
 245          * so reserve some room for the final ?=.
 246          *
 247          * If prefixlen == 0, we haven't been passed in a header name, so
 248          * don't ever wrap the field (we're likely doing an address).
 249          */
 250
 251         if (prefixlen == 0)
 252             continue;
 253
 254         if (column >= ENCODELINELIMIT - 2) {
 255             newline = 1;
 256         } else if (utf8) {
 257             /*
 258              * Okay, this is a bit weird, but to explain a bit more ...
 259              *
 260              * RFC 2047 prohibits the splitting of multibyte characters
 261              * across encoded words.  Right now we only handle the case
 262              * of UTF-8, the most common multibyte encoding.
 263              *
 264              * p is now pointing at the next input character.  If we're
 265              * using UTF-8 _and_ we'd go over ENCODELINELIMIT given the
 266              * length of the complete character, then trigger a newline
 267              * now.  Note that we check the length * 3 since we have to
 268              * allow for the encoded output.
 269              */
 270             if (column + (utf8len(p) * 3) > ENCODELINELIMIT - 2) {
 271                 newline = 1;
 272             }
 273         }
 274     }
 275
 276     if (q == NULL) {
 277         /* This should never happen, but just in case.  Found by
 278            clang static analyzer. */
 279         admonish (NULL, "null output encoding for %s", *value);
 280         return 1;
 281     }
 282     *q++ = '?';
 283     *q++ = '=';
 284
 285     if (prefixlen)
 286         *q++ = '\n';
 287
 288     *q = '\0';
 289
 290     free(*value);
 291
 292     *value = output;
 293
 294     return 0;
 295 }
 296
 297 /*
 298  * Calculate the length of a UTF-8 character.
 299  *
 300  * If it's not a UTF-8 character (or we're in the middle of a multibyte
 301  * character) then simply return 0.
 302  */
 303
 304 static int
 305 utf8len(const char *p)
 306 {
 307     int len = 1;
 308
 309     if (*p == '\0')
 310         return 0;
 311
 312     if (isascii((unsigned char) *p) || (((unsigned char) *p) & 0xc0) == 0x80)
 313         return 0;
 314
 315     p++;
 316     while ((((unsigned char) *p++) & 0xc0) == 0x80)
 317         len++;
 318
 319     return len;
 320 }
 321
 322 /*
 323  * "Unfold" a header, making it a single line (without continuation)
 324  *
 325  * We cheat a bit here; we never make the string longer, so using the
 326  * original length here is fine.
 327  */
 328
 329 static void
 330 unfold_header(char **value, int len)
 331 {
 332     char *str = mh_xmalloc(len + 1);
 333     char *p = str, *q = *value;
 334
 335     while (*q != '\0') {
 336         if (*q == '\n') {
 337             /*
 338              * When we get a newline, skip to the next non-whitespace
 339              * character and add a space to replace all of the whitespace
 340              *
 341              * This has the side effect of stripping off the final newline
 342              * for the header; we put it back in the encoding routine.
 343              */
 344             while (is_fws(*q))
 345                 q++;
 346             if (*q == '\0')
 347                 break;
 348
 349             *p++ = ' ';
 350         } else {
 351             *p++ = *q++;
 352         }
 353     }
 354
 355     *p = '\0';
 356
 357     free(*value);
 358     *value = str;
 359 }
 360
 361 /*
 362  * Decode a header containing addresses.  This means we have to parse
 363  * each address and only encode the display-name or comment field.
 364  */
 365
 366 static int
 367 field_encode_address(const char *name, char **value, const char *charset)
 368 {
 369     int prefixlen = strlen(name) + 2, column = prefixlen, groupflag;
 370     int asciichars, specialchars, eightbitchars, reformat = 0, errflag = 0;
 371     size_t len;
 372     char *mp, *cp = NULL, *output = NULL;
 373     char *tmpbuf = NULL;
 374     size_t tmpbufsize = 0;
 375     struct mailname *mn;
 376     char errbuf[BUFSIZ];
 377
 378     /*
 379      * Because these are addresses, we need to handle them individually.
 380      *
 381      * Break them down and process them one by one.  This means we have to
 382      * rewrite the whole header, but that's unavoidable.
 383      */
 384
 385     /*
 386      * The output headers always have to start with a space first; this
 387      * is just the way the API works right now.
 388      */
 389
 390     output = add(" ", output);
 391
 392     for (groupflag = 0; (mp = getname(*value)); ) {
 393         if ((mn = getm(mp, NULL, 0, AD_HOST, errbuf)) == NULL) {
 394             advise(NULL, "%s: %s", errbuf, mp);
 395             errflag++;
 396             continue;
 397         }
 398
 399         reformat = 0;
 400
 401         /*
 402          * We only care if the phrase (m_pers) or any trailing comment
 403          * (m_note) have 8-bit characters.  If doing q-p, we also need
 404          * to encode anything marked as qspecial().  Unquote it first
 405          * so the specialchars count is right.
 406          */
 407
 408         if (! mn->m_pers)
 409             goto check_note;
 410
 411         if ((len = strlen(mn->m_pers)) + 1 > tmpbufsize) {
 412             tmpbuf = mh_xrealloc(tmpbuf, tmpbufsize = len + 1);
 413         }
 414
 415         unquote_string(mn->m_pers, tmpbuf);
 416
 417         if (scanstring(tmpbuf, &asciichars, &eightbitchars,
 418                        &specialchars)) {
 419             /*
 420              * If we have 8-bit characters, encode it.
 421              */
 422
 423             /*
 424              * This is okay, because the output of unquote_string will be either
 425              * equal or shorter than the original.
 426              */
 427
 428                 strcpy(mn->m_pers, tmpbuf);
 429
 430                 if (field_encode_quoted(NULL, &mn->m_pers, charset, asciichars,
 431                                         eightbitchars + specialchars, 1)) {
 432                         errflag++;
 433                     goto out;
 434                 }
 435
 436                  reformat++;
 437         }
 438
 439         check_note:
 440
 441         /*
 442          * The "note" field is generally a comment at the end of the address,
 443          * at least as how it's implemented here.  Notes are always surrounded
 444          * by parenthesis (since they're comments).  Strip them out and
 445          * then put them back when we format the final field, but they do
 446          * not get encoded.
 447          */
 448
 449         if (! mn->m_note)
 450             goto do_reformat;
 451
 452         if ((len = strlen(mn->m_note)) + 1 > tmpbufsize) {
 453             tmpbuf = mh_xrealloc(tmpbuf, tmpbufsize = len + 1);
 454         }
 455
 456         if (mn->m_note[0] != '(' || mn->m_note[len - 1] != ')') {
 457             advise(NULL, "Internal error: Invalid note field \"%s\"",
 458                    mn->m_note);
 459             errflag++;
 460             goto out;
 461         }
 462
 463         strncpy(tmpbuf, mn->m_note + 1, len - 1);
 464         tmpbuf[len - 2] = '\0';
 465
 466         if (scanstring(tmpbuf, &asciichars, &eightbitchars,
 467                        &specialchars)) {
 468             /*
 469              * If we have 8-bit characters, encode it.
 470              */
 471
 472                 if (field_encode_quoted(NULL, &tmpbuf, charset, asciichars,
 473                                         eightbitchars + specialchars, 1)) {
 474                     errflag++;
 475                     goto out;
 476                 }
 477
 478             reformat++;
 479
 480             /*
 481              * Make sure the size of tmpbuf is correct (it always gets
 482              * reallocated in the above functions).
 483              */
 484
 485             tmpbufsize = strlen(tmpbuf) + 1;
 486
 487             /*
 488              * Put the note field back surrounded by parenthesis.
 489              */
 490
 491             mn->m_note = mh_xrealloc(mn->m_note, tmpbufsize + 2);
 492
 493             snprintf(mn->m_note, tmpbufsize + 2, "(%s)", tmpbuf);
 494         }
 495
 496 do_reformat:
 497
 498         /*
 499          * So, some explanation is in order.
 500          *
 501          * We know we need to rewrite at least one address in the header,
 502          * otherwise we wouldn't be here.  If we had to reformat this
 503          * particular address, then run it through adrformat().  Otherwise
 504          * we can use m_text directly.
 505          */
 506
 507         /*
 508          * If we were in a group but are no longer, make sure we add a
 509          * semicolon (which needs to be FIRST, as it needs to be at the end
 510          * of the last address).
 511          */
 512
 513         if (groupflag && ! mn->m_ingrp) {
 514             output = add(";", output);
 515             column += 1;
 516         }
 517
 518         groupflag = mn->m_ingrp;
 519
 520         if (mn->m_gname) {
 521             cp = add(mn->m_gname, NULL);
 522         }
 523
 524         if (reformat) {
 525             cp = add(adrformat(mn), cp);
 526         } else {
 527             cp = add(mn->m_text, cp);
 528         }
 529
 530         len = strlen(cp);
 531
 532         /*
 533          * If we're not at the beginning of the line, add a command and
 534          * either a space or a newline.
 535          */
 536
 537         if (column != prefixlen) {
 538             if (len + column + 2 > OUTPUTLINELEN) {
 539
 540                 if ((size_t) (prefixlen + 3) < tmpbufsize)
 541                     tmpbuf = mh_xrealloc(tmpbuf, tmpbufsize = prefixlen + 3);
 542
 543                 snprintf(tmpbuf, tmpbufsize, ",\n%*s", column = prefixlen, "");
 544                 output = add(tmpbuf, output);
 545             } else {
 546                 output = add(", ", output);
 547                 column += 2;
 548             }
 549         }
 550
 551         /*
 552          * Finally add the address
 553          */
 554
 555         output = add(cp, output);
 556         column += len;
 557         free(cp);
 558         cp = NULL;
 559     }
 560
 561     /*
 562      * Just in case we're at the end of a list
 563      */
 564
 565     if (groupflag) {
 566         output = add(";", output);
 567     }
 568
 569     output = add("\n", output);
 570
 571     free(*value);
 572     *value = output;
 573     output = NULL;
 574
 575 out:
 576
 577     if (tmpbuf)
 578         free(tmpbuf);
 579     if (output)
 580         free(output);
 581
 582     return errflag > 0;
 583 }
 584
 585 /*
 586  * Scan a string, check for characters that need to be encoded
 587  */
 588
 589 static int
 590 scanstring(const char *string, int *asciilen, int *eightbitchars,
 591            int *specialchars)
 592 {
 593     *asciilen = 0;
 594     *eightbitchars = 0;
 595     *specialchars = 0;
 596
 597     for (; *string != '\0'; string++) {
 598         if ((isascii((unsigned char) *string))) {
 599             (*asciilen)++;
 600             /*
 601              * So, a space is not a valid phrase character, but we're counting
 602              * an exception here, because in q-p a space can be directly
 603              * encoded as an underscore.
 604              */
 605             if (!qphrasevalid((unsigned char) *string) && *string != ' ')
 606                 (*specialchars)++;
 607         } else {
 608             (*eightbitchars)++;
 609         }
 610     }
 611
 612     return *eightbitchars > 0;
 613 }
 614
 615 /*
 616  * This function is to be used to decide which encoding algorithm we should
 617  * use if one is not given.  Basically, we pick whichever one is the shorter
 618  * of the two.
 619  *
 620  * Arguments are:
 621  *
 622  * ascii        - Number of ASCII characters in to-be-encoded string.
 623  * specials     - Number of ASCII characters in to-be-encoded string that
 624  *                still require encoding under quoted-printable.  Note that
 625  *                these are included in the "ascii" total.
 626  * eightbit     - Eight-bit characters in the to-be-encoded string.
 627  *
 628  * Returns one of CE_BASE64 or CE_QUOTED.
 629  */
 630 /*
 631 static int
 632 pref_encoding(int ascii, int specials, int eightbits)
 633 { */
 634     /*
 635      * The length of the q-p encoding is:
 636      *
 637      * ascii - specials + (specials + eightbits) * 3.
 638      *
 639      * The length of the base64 encoding is:
 640      *
 641      * base64len(ascii + eightbits)     (See macro for details)
 642      */
 643 /*
 644     return base64len(ascii + eightbits) < (ascii - specials +
 645                         (specials + eightbits) * 3) ? CE_BASE64 : CE_QUOTED;
 646 }*/