git.marmaro.de Git - mmh/blob - sbr/encode_rfc2047.c

   1 /*
   2 ** Routines to encode message headers using RFC 2047-encoding.
   3 **
   4 ** This code is Copyright (c) 2002, by the authors of nmh.  See the
   5 ** COPYRIGHT file in the root directory of the nmh distribution for
   6 ** complete copyright information.
   7 */
   8
   9 #include <h/mh.h>
  10 #include <h/mhparse.h>
  11 #include <h/addrsbr.h>
  12 #include <h/utils.h>
  13
  14 #include <ctype.h>
  15
  16 /*
  17 ** List of headers that contain addresses and as a result require special
  18 ** handling
  19 */
  20
  21 static char *address_headers[] = {
  22         "To",
  23         "From",
  24         "cc",
  25         "Bcc",
  26         "Reply-To",
  27         "Sender",
  28         "Resent-To",
  29         "Resent-From",
  30         "Resent-cc",
  31         "Resent-Bcc",
  32         "Resent-Reply-To",
  33         "Resent-Sender",
  34         NULL,
  35 };
  36
  37 /*
  38 ** Macros we use for parsing headers
  39 **
  40 ** Todo: convert the macros to functions
  41 */
  42
  43 #define is_fws(c) (c == '\t' || c == ' ' || c == '\n')
  44
  45 #define qphrasevalid(c) ((c >= '0' && c <= '9') || (c >= 'A' && c <= 'Z') || \
  46                         (c >= 'a' && c <= 'z') || \
  47                         c == '!' || c == '*' || c == '+' || c == '-' || \
  48                         c == '/' || c == '=' || c == '_')
  49 #define qpspecial(c) (c < ' ' || c == '=' || c == '?' || c == '_')
  50
  51 #define base64len(n) ((((n) + 2) / 3) * 4)    /* String len to base64 len */
  52 #define strbase64(n) ((n) / 4 * 3)            /* Chars that fit in base64 */
  53
  54 #define ENCODELINELIMIT 76
  55
/*
** Forward declarations for the file-local helpers defined further down.
*/
static void unfold_header(char **, int);
static int field_encode_address(const char *, char **, const char *);
static int field_encode_quoted(const char *, char **, const char *, int,
		int, int);
static int scanstring(const char *, int *, int *, int *);
static int utf8len(const char *);
/* pref_encoding() is compiled out (#if 0 below), hence no prototype. */
/*static int pref_encoding(int, int, int);*/
  63
  64 /*
  65 ** Encode a message header using RFC 2047 encoding.  We make the assumption
  66 ** that all characters < 128 are ASCII and as a consequence don't need any
  67 ** encoding.
  68 */
  69 int
  70 encode_rfc2047(const char *name, char **value, const char *charset)
  71 {
  72         int i, asciicount = 0, eightbitcount = 0, qpspecialcount = 0;
  73         char *p;
  74
  75         /* First, check to see if we even need to encode the header */
  76
  77         for (p = *value; *p != '\0'; p++) {
  78                 if (isascii((unsigned char) *p)) {
  79                         asciicount++;
  80                         if (qpspecial((unsigned char) *p)) {
  81                                 qpspecialcount++;
  82                         }
  83                 } else {
  84                         eightbitcount++;
  85                 }
  86         }
  87
  88         if (eightbitcount == 0) {
  89                 return 0;
  90         }
  91
  92         /*
  93         ** Some rules from RFC 2047:
  94         **
  95         ** - Encoded words cannot be more than 75 characters long
  96         ** - Multiple "long" encoded words must be on new lines.
  97         **
  98         ** Also, we're not permitted to encode email addresses, so
  99         ** we need to actually _parse_ email addresses and only encode
 100         ** the right bits.
 101         */
 102
 103         /*
 104         ** If charset was NULL, then get the value from the locale.  But
 105         ** we reject it if it returns US-ASCII
 106         */
 107
 108         if (charset == NULL) {
 109                 charset = write_charset_8bit();
 110         }
 111         if (strcasecmp(charset, "US-ASCII") == 0) {
 112                 advise(NULL, "Cannot use US-ASCII with 8 bit characters in header");
 113                 return 1;
 114         }
 115
 116         /*
 117         ** If we have an address header, then we need to parse the addresses
 118         ** and only encode the names or comments.  Otherwise, handle it
 119         ** normally.
 120         */
 121
 122         for (i = 0; address_headers[i]; i++) {
 123                 if (strcasecmp(name, address_headers[i]) == 0) {
 124                         return field_encode_address(name, value, charset);
 125                 }
 126         }
 127
 128         /*
 129         ** On the encoding we choose, and the specifics of encoding:
 130         **
 131         ** - If a specified encoding is passed in, we use that.
 132         ** - Otherwise, pick which encoding is shorter.
 133         **
 134         ** We don't quite handle continuation right here, but it should be
 135         ** pretty close.
 136         */
 137
 138         unfold_header(value, asciicount + eightbitcount);
 139
 140         return field_encode_quoted(name, value, charset, asciicount,
 141                         eightbitcount + qpspecialcount, 0);
 142 }
 143
 144 /*
 145 ** Encode our specified header (or field) using quoted-printable
 146 */
 147
 148 static int
 149 field_encode_quoted(const char *name, char **value, const char *charset,
 150                 int ascii, int encoded, int phraserules)
 151 {
 152         int prefixlen = name ? strlen(name) + 2: 0;
 153         int outlen = 0, column, newline = 1, utf8;
 154         int charsetlen = strlen(charset);
 155         char *output = NULL, *p, *q = NULL;
 156
 157         /*
 158         ** Right now we just encode the whole thing.  Maybe later on we'll
 159         ** only encode things on a per-atom basis.
 160         */
 161
 162         p = *value;
 163
 164         column = prefixlen + 2;    /* Header name plus ": " */
 165
 166         utf8 = strcasecmp(charset, "UTF-8") == 0;
 167
 168         while (*p != '\0') {
 169                 /* Start a new line, if it's time */
 170                 if (newline) {
 171                         int tokenlen;
 172
 173                         /*
 174                         ** If it's the start of the header, we don't need
 175                         ** to pad it
 176                         **
 177                         ** The length of the output string is ...
 178                         ** =?charset?Q?...?=  so that's
 179                         ** 7+strlen(charset) + 2 for \n NUL
 180                         **
 181                         ** plus 1 for every ASCII character and 3 for
 182                         ** every eight bit or special character (eight
 183                         ** bit characters are written as =XX).
 184                         */
 185                         outlen += 9 + charsetlen + ascii + 3 * encoded;
 186
 187                         if (output) {
 188                                 /* continue the header */
 189                                 int curlen = q - output, i;
 190                                 outlen += prefixlen + 1; /* Header plus \n ": " */
 191                                 output = mh_xrealloc(output, outlen);
 192                                 q = output + curlen;
 193                                 *q++ = '?';
 194                                 *q++ = '=';
 195                                 *q++ = '\n';
 196                                 for (i = 0; i < prefixlen; i++) {
 197                                         *q++ = ' ';
 198                                 }
 199                         } else {
 200                                 /* do the initial allocation */
 201                                 /*
 202                                 ** A bit of a hack here; the header can
 203                                 ** contain multiple spaces (probably at
 204                                 ** least one) until we get to the actual
 205                                 ** text. Copy until we get to a non-space.
 206                                 */
 207                                 output = mh_xcalloc(outlen, sizeof(char));
 208                                 q = output;
 209                                 while (is_fws(*p)) {
 210                                         *q++ = *p++;
 211                                 }
 212                         }
 213
 214                         tokenlen = snprintf(q, outlen - (q - output),
 215                                         "=?%s?Q?", charset);
 216                         q += tokenlen;
 217                         column = prefixlen + tokenlen;
 218                         newline = 0;
 219                 }
 220
 221                 /*
 222                 ** Process each character, encoding if necessary
 223                 **
 224                 ** Note that we have a different set of rules if we're
 225                 ** processing RFC 5322 'phrase' (something you'd see in
 226                 ** an address header).
 227                 */
 228
 229                 column++;
 230
 231                 if (*p == ' ') {
 232                         *q++ = '_';
 233                         ascii--;
 234                 } else if (isascii((unsigned char) *p) && (phraserules ?
 235                                 qphrasevalid((unsigned char) *p)
 236                                 : !qpspecial((unsigned char) *p))) {
 237                         *q++ = *p;
 238                         ascii--;
 239                 } else {
 240                         snprintf(q, outlen - (q - output), "=%02X",
 241                                         (unsigned char) *p);
 242                         q += 3;
 243                         column += 2;   /* column already incremented by 1 above */
 244                         encoded--;
 245                 }
 246
 247                 p++;
 248
 249                 if (prefixlen == 0) {
 250                         /*
 251                         ** We haven't been passed in a header name,
 252                         ** so don't ever wrap the field (we're likely
 253                         ** doing an address).
 254                         */
 255                         continue;
 256                 }
 257                 /*
 258                 ** We're not allowed more than ENCODELINELIMIT characters
 259                 ** per line, so reserve some room for the final ?=.
 260                 */
 261                 if (column >= ENCODELINELIMIT - 2) {
 262                         newline = 1;
 263                 } else if (utf8) {
 264                         /*
 265                         ** Okay, this is a bit weird, but to explain a
 266                         ** bit more ...
 267                         **
 268                         ** RFC 2047 prohibits the splitting of multibyte
 269                         ** characters across encoded words.  Right now
 270                         ** we only handle the case of UTF-8, the most
 271                         ** common multibyte encoding.
 272                         **
 273                         ** p is now pointing at the next input character.
 274                         ** If we're using UTF-8 _and_ we'd go over
 275                         ** ENCODELINELIMIT given the length of the
 276                         ** complete character, then trigger a newline now.
 277                         ** Note that we check the length * 3 since we
 278                         ** have to allow for the encoded output.
 279                         */
 280                         if (column + (utf8len(p)*3) > ENCODELINELIMIT - 2) {
 281                                 newline = 1;
 282                         }
 283                 }
 284         }
 285
 286         if (q == NULL) {
 287                 /*
 288                 ** This should never happen, but just in case.
 289                 ** Found by clang static analyzer.
 290                 */
 291                 admonish (NULL, "null output encoding for %s", *value);
 292                 return 1;
 293         }
 294         *q++ = '?';
 295         *q++ = '=';
 296
 297         if (prefixlen) {
 298                 *q++ = '\n';
 299         }
 300         *q = '\0';
 301
 302         mh_free0(value);
 303         *value = output;
 304
 305         return 0;
 306 }
 307
 308 /*
 309 ** Calculate the length of a UTF-8 character.
 310 **
 311 ** If it's not a UTF-8 character (or we're in the middle of a multibyte
 312 ** character) then simply return 0.
 313 */
 314 static int
 315 utf8len(const char *p)
 316 {
 317         int len = 1;
 318
 319         if (*p == '\0') {
 320                 return 0;
 321         }
 322         if (isascii((unsigned char) *p) ||
 323                         (((unsigned char) *p) & 0xc0) == 0x80) {
 324                 return 0;
 325         }
 326         p++;
 327         while ((((unsigned char) *p++) & 0xc0) == 0x80) {
 328                 len++;
 329         }
 330
 331         return len;
 332 }
 333
 334 /*
 335 ** "Unfold" a header, making it a single line (without continuation)
 336 **
 337 ** We cheat a bit here; we never make the string longer, so using the
 338 ** original length here is fine.
 339 */
 340 static void
 341 unfold_header(char **value, int len)
 342 {
 343         char *str = mh_xcalloc(len + 1, sizeof(char));
 344         char *p = str, *q = *value;
 345
 346         while (*q != '\0') {
 347                 if (*q == '\n') {
 348                         /*
 349                         ** When we get a newline, skip to the next
 350                         ** non-whitespace character and add a space to
 351                         ** replace all of the whitespace
 352                         **
 353                         ** This has the side effect of stripping off the
 354                         ** final newline for the header; we put it back
 355                         ** in the encoding routine.
 356                         */
 357                         while (is_fws(*q)) {
 358                                 q++;
 359                         }
 360                         if (*q == '\0') {
 361                                 break;
 362                         }
 363                         *p++ = ' ';
 364                 } else {
 365                         *p++ = *q++;
 366                 }
 367         }
 368         *p = '\0';
 369
 370         mh_free0(value);
 371         *value = str;
 372 }
 373
 374 /*
 375 ** Decode a header containing addresses. This means we have to parse
 376 ** each address and only encode the display-name or comment field.
 377 */
 378 static int
 379 field_encode_address(const char *name, char **value, const char *charset)
 380 {
 381         int prefixlen = strlen(name) + 2;
 382         int column = prefixlen, groupflag;
 383         int asciichars, specialchars, eightbitchars;
 384         int reformat = 0, errflag = 0;
 385         size_t len;
 386         char *mp, *cp = NULL, *output = NULL;
 387         char *tmpbuf = NULL;
 388         size_t tmpbufsize = 0;
 389         struct mailname *mn;
 390         char errbuf[BUFSIZ];
 391
 392         /*
 393         ** Because these are addresses, we need to handle them individually.
 394         **
 395         ** Break them down and process them one by one.  This means we
 396         ** have to rewrite the whole header, but that's unavoidable.
 397         */
 398
 399         /*
 400         ** The output headers always have to start with a space first;
 401         ** this is just the way the API works right now.
 402         */
 403
 404         output = add(" ", output);
 405
 406         for (groupflag = 0; (mp = getname(*value)); ) {
 407                 if ((mn = getm(mp, NULL, 0, AD_HOST, errbuf)) == NULL) {
 408                         advise(NULL, "%s: %s", errbuf, mp);
 409                         errflag++;
 410                         continue;
 411                 }
 412
 413                 reformat = 0;
 414
 415                 /*
 416                 ** We only care if the phrase (m_pers) or any trailing
 417                 ** comment (m_note) have 8-bit characters.  If doing q-p,
 418                 ** we also need to encode anything marked as qspecial().
 419                 ** Unquote it first so the specialchars count is right.
 420                 */
 421
 422                 if (! mn->m_pers) {
 423                         goto check_note;
 424                 }
 425
 426                 if ((len = strlen(mn->m_pers)) + 1 > tmpbufsize) {
 427                         tmpbuf = mh_xrealloc(tmpbuf, tmpbufsize = len + 1);
 428                 }
 429
 430                 unquote_string(mn->m_pers, tmpbuf);
 431
 432                 if (scanstring(tmpbuf, &asciichars, &eightbitchars,
 433                                 &specialchars)) {
 434                         /*
 435                         ** If we have 8-bit characters, encode it.
 436                         */
 437
 438                         /*
 439                         ** This is okay, because the output of
 440                         ** unquote_string will be either equal or shorter
 441                         ** than the original.
 442                         */
 443                         strcpy(mn->m_pers, tmpbuf);
 444
 445                         if (field_encode_quoted(NULL, &mn->m_pers, charset,
 446                                         asciichars,
 447                                         eightbitchars + specialchars, 1)) {
 448                                 errflag++;
 449                                 goto out;
 450                         }
 451
 452                         reformat++;
 453                 }
 454
 455                 check_note:
 456
 457                 /*
 458                 ** The "note" field is generally a comment at the end
 459                 ** of the address, at least as how it's implemented here.
 460                 ** Notes are always surrounded by parenthesis (since they're
 461                 ** comments).  Strip them out and then put them back when
 462                 ** we format the final field, but they do not get encoded.
 463                 */
 464
 465                 if (! mn->m_note) {
 466                         goto do_reformat;
 467                 }
 468
 469                 if ((len = strlen(mn->m_note)) + 1 > tmpbufsize) {
 470                         tmpbuf = mh_xrealloc(tmpbuf, tmpbufsize = len + 1);
 471                 }
 472
 473                 if (mn->m_note[0] != '(' || mn->m_note[len - 1] != ')') {
 474                         advise(NULL, "Internal error: Invalid note field \"%s\"",
 475                                         mn->m_note);
 476                         errflag++;
 477                         goto out;
 478                 }
 479
 480                 strncpy(tmpbuf, mn->m_note + 1, len - 1);
 481                 tmpbuf[len - 2] = '\0';
 482
 483                 if (scanstring(tmpbuf, &asciichars, &eightbitchars,
 484                                 &specialchars)) {
 485                         /*
 486                         ** If we have 8-bit characters, encode it.
 487                         */
 488
 489                         if (field_encode_quoted(NULL, &tmpbuf, charset,
 490                                         asciichars,
 491                                         eightbitchars + specialchars, 1)) {
 492                                 errflag++;
 493                                 goto out;
 494                         }
 495
 496                         reformat++;
 497
 498                         /*
 499                         ** Make sure the size of tmpbuf is correct (it
 500                         ** always gets reallocated in the above functions).
 501                         */
 502
 503                         tmpbufsize = strlen(tmpbuf) + 1;
 504
 505                         /*
 506                         ** Put the note field back surrounded by
 507                         ** parenthesis.
 508                         */
 509
 510                         mn->m_note = mh_xrealloc(mn->m_note, tmpbufsize + 2);
 511
 512                         snprintf(mn->m_note, tmpbufsize + 2, "(%s)", tmpbuf);
 513                 }
 514
 515 do_reformat:
 516
 517                 /*
 518                 ** So, some explanation is in order.
 519                 **
 520                 ** We know we need to rewrite at least one address in the
 521                 ** header, otherwise we wouldn't be here.  If we had to
 522                 ** reformat this particular address, then run it through
 523                 ** adrformat().  Otherwise we can use m_text directly.
 524                 */
 525
 526                 /*
 527                 ** If we were in a group but are no longer, make sure we
 528                 ** add a semicolon (which needs to be FIRST, as it needs
 529                 ** to be at the end of the last address).
 530                 */
 531
 532                 if (groupflag && ! mn->m_ingrp) {
 533                         output = add(";", output);
 534                         column += 1;
 535                 }
 536
 537                 groupflag = mn->m_ingrp;
 538
 539                 if (mn->m_gname) {
 540                         cp = add(mn->m_gname, NULL);
 541                 }
 542
 543                 if (reformat) {
 544                         cp = add(adrformat(mn), cp);
 545                 } else {
 546                         cp = add(mn->m_text, cp);
 547                 }
 548
 549                 len = strlen(cp);
 550
 551                 /*
 552                 ** If we're not at the beginning of the line, add a
 553                 ** command and either a space or a newline.
 554                 */
 555
 556                 if (column != prefixlen) {
 557                         if (len + column + 2 > OUTPUTLINELEN) {
 558
 559                                 if ((size_t) (prefixlen + 3) < tmpbufsize) {
 560                                         tmpbuf = mh_xrealloc(tmpbuf, tmpbufsize = prefixlen + 3);
 561                                 }
 562
 563                                 snprintf(tmpbuf, tmpbufsize, ",\n%*s", column = prefixlen, "");
 564                                 output = add(tmpbuf, output);
 565                         } else {
 566                                 output = add(", ", output);
 567                                 column += 2;
 568                         }
 569                 }
 570
 571                 /*
 572                 ** Finally add the address
 573                 */
 574
 575                 output = add(cp, output);
 576                 column += len;
 577                 mh_free0(&cp);
 578         }
 579
 580         /*
 581         ** Just in case we're at the end of a list
 582         */
 583
 584         if (groupflag) {
 585                 output = add(";", output);
 586         }
 587
 588         output = add("\n", output);
 589
 590         mh_free0(value);
 591         *value = output;
 592         output = NULL;
 593
 594 out:
 595
 596         if (tmpbuf) {
 597                 mh_free0(&tmpbuf);
 598         }
 599         if (output) {
 600                 mh_free0(&output);
 601         }
 602
 603         return errflag > 0;
 604 }
 605
 606 /*
 607 ** Scan a string, check for characters that need to be encoded
 608 */
 609
 610 static int
 611 scanstring(const char *string, int *asciilen, int *eightbitchars,
 612                 int *specialchars)
 613 {
 614         *asciilen = 0;
 615         *eightbitchars = 0;
 616         *specialchars = 0;
 617
 618         for (; *string != '\0'; string++) {
 619                 if ((isascii((unsigned char) *string))) {
 620                         (*asciilen)++;
 621                         /*
 622                         ** So, a space is not a valid phrase character, but
 623                         ** we're counting an exception here, because in q-p
 624                         ** a space can be directly encoded as an underscore.
 625                         */
 626                         if (!qphrasevalid((unsigned char) *string) &&
 627                                         *string != ' ') {
 628                                 (*specialchars)++;
 629                         }
 630                 } else {
 631                         (*eightbitchars)++;
 632                 }
 633         }
 634
 635         return *eightbitchars > 0;
 636 }
 637
 638 #if 0
 639
 640 /*
 641 ** This function is to be used to decide which encoding algorithm we should
 642 ** use if one is not given.  Basically, we pick whichever one is the shorter
 643 ** of the two.
 644 **
 645 ** Arguments are:
 646 **
 647 ** ascii        - Number of ASCII characters in to-be-encoded string.
 648 ** specials     - Number of ASCII characters in to-be-encoded string that
 649 **                still require encoding under quoted-printable.  Note that
 650 **                these are included in the "ascii" total.
 651 ** eightbit     - Eight-bit characters in the to-be-encoded string.
 652 **
 653 ** Returns one of CE_BASE64 or CE_QUOTED.
 654 **/
 655 static int
 656 pref_encoding(int ascii, int specials, int eightbits)
 657 {
 658         /*
 659         ** The length of the q-p encoding is:
 660         **
 661         ** ascii - specials + (specials + eightbits) * 3.
 662         **
 663         ** The length of the base64 encoding is:
 664         **
 665         ** base64len(ascii + eightbits) (See macro for details)
 666         */
 667         return base64len(ascii + eightbits) < (ascii - specials +
 668                         (specials + eightbits) * 3) ? CE_BASE64 : CE_QUOTED;
 669 }
 670
 671 #endif