git.marmaro.de Git - mmh/blob - sbr/encode_rfc2047.c

   1 /*
   2 ** Routines to encode message headers using RFC 2047-encoding.
   3 **
   4 ** This code is Copyright (c) 2002, by the authors of nmh.  See the
   5 ** COPYRIGHT file in the root directory of the nmh distribution for
   6 ** complete copyright information.
   7 */
   8
   9 #include <h/mh.h>
  10 #include <h/mhparse.h>
  11 #include <h/addrsbr.h>
  12 #include <h/utils.h>
  13
  14 #include <ctype.h>
  15
  16 /*
  17 ** List of headers that contain addresses and as a result require special
  18 ** handling
  19 */
  20
  21 static char *address_headers[] = {
  22         "To",
  23         "From",
  24         "cc",
  25         "Bcc",
  26         "Reply-To",
  27         "Sender",
  28         "Resent-To",
  29         "Resent-From",
  30         "Resent-cc",
  31         "Resent-Bcc",
  32         "Resent-Reply-To",
  33         "Resent-Sender",
  34         NULL,
  35 };
  36
  37 /*
  38 ** Macros we use for parsing headers
  39 **
  40 ** Todo: convert the macros to functions
  41 */
  42
  43 #define is_fws(c) (c == '\t' || c == ' ' || c == '\n')
  44
  45 #define qphrasevalid(c) ((c >= '0' && c <= '9') || (c >= 'A' && c <= 'Z') || \
  46                         (c >= 'a' && c <= 'z') || \
  47                         c == '!' || c == '*' || c == '+' || c == '-' || \
  48                         c == '/' || c == '=' || c == '_')
  49 #define qpspecial(c) (c < ' ' || c == '=' || c == '?' || c == '_')
  50
  51 #define ENCODELINELIMIT 76
  52
  53 static void unfold_header(char **, int);
  54 static int field_encode_address(const char *, char **, const char *);
  55 static int field_encode_quoted(const char *, char **, const char *, int,
  56                 int, int);
  57 static int scanstring(const char *, int *, int *, int *);
  58 static int utf8len(const char *);
  59
  60 /*
  61 ** Encode a message header using RFC 2047 encoding.  We make the assumption
  62 ** that all characters < 128 are ASCII and as a consequence don't need any
  63 ** encoding.
  64 */
  65 int
  66 encode_rfc2047(const char *name, char **value, const char *charset)
  67 {
  68         int i, asciicount = 0, eightbitcount = 0, qpspecialcount = 0;
  69         char *p;
  70
  71         /* First, check to see if we even need to encode the header */
  72
  73         for (p = *value; *p != '\0'; p++) {
  74                 if (isascii((unsigned char) *p)) {
  75                         asciicount++;
  76                         if (qpspecial((unsigned char) *p)) {
  77                                 qpspecialcount++;
  78                         }
  79                 } else {
  80                         eightbitcount++;
  81                 }
  82         }
  83
  84         if (eightbitcount == 0) {
  85                 return 0;
  86         }
  87
  88         /*
  89         ** Some rules from RFC 2047:
  90         **
  91         ** - Encoded words cannot be more than 75 characters long
  92         ** - Multiple "long" encoded words must be on new lines.
  93         **
  94         ** Also, we're not permitted to encode email addresses, so
  95         ** we need to actually _parse_ email addresses and only encode
  96         ** the right bits.
  97         */
  98
  99         /*
 100         ** If charset was NULL, then get the value from the locale.  But
 101         ** we reject it if it returns US-ASCII
 102         */
 103
 104         if (charset == NULL) {
 105                 charset = write_charset_8bit();
 106         }
 107         if (strcasecmp(charset, "US-ASCII") == 0) {
 108                 advise(NULL, "Cannot use US-ASCII with 8 bit characters in header");
 109                 return 1;
 110         }
 111
 112         /*
 113         ** If we have an address header, then we need to parse the addresses
 114         ** and only encode the names or comments.  Otherwise, handle it
 115         ** normally.
 116         */
 117
 118         for (i = 0; address_headers[i]; i++) {
 119                 if (strcasecmp(name, address_headers[i]) == 0) {
 120                         return field_encode_address(name, value, charset);
 121                 }
 122         }
 123
 124         /*
 125         ** On the encoding we choose, and the specifics of encoding:
 126         **
 127         ** - If a specified encoding is passed in, we use that.
 128         ** - Otherwise, pick which encoding is shorter.
 129         **
 130         ** We don't quite handle continuation right here, but it should be
 131         ** pretty close.
 132         */
 133
 134         unfold_header(value, asciicount + eightbitcount);
 135
 136         return field_encode_quoted(name, value, charset, asciicount,
 137                         eightbitcount + qpspecialcount, 0);
 138 }
 139
 140 /*
 141 ** Encode our specified header (or field) using quoted-printable
 142 */
 143
 144 static int
 145 field_encode_quoted(const char *name, char **value, const char *charset,
 146                 int ascii, int encoded, int phraserules)
 147 {
 148         int prefixlen = name ? strlen(name) + 2: 0;
 149         int outlen = 0, column, newline = 1, utf8;
 150         int charsetlen = strlen(charset);
 151         char *output = NULL, *p, *q = NULL;
 152
 153         /*
 154         ** Right now we just encode the whole thing.  Maybe later on we'll
 155         ** only encode things on a per-atom basis.
 156         */
 157
 158         p = *value;
 159
 160         column = prefixlen + 2;    /* Header name plus ": " */
 161
 162         utf8 = strcasecmp(charset, "UTF-8") == 0;
 163
 164         while (*p != '\0') {
 165                 /* Start a new line, if it's time */
 166                 if (newline) {
 167                         int tokenlen;
 168
 169                         /*
 170                         ** If it's the start of the header, we don't need
 171                         ** to pad it
 172                         **
 173                         ** The length of the output string is ...
 174                         ** =?charset?Q?...?=  so that's
 175                         ** 7+strlen(charset) + 2 for \n NUL
 176                         **
 177                         ** plus 1 for every ASCII character and 3 for
 178                         ** every eight bit or special character (eight
 179                         ** bit characters are written as =XX).
 180                         */
 181                         outlen += 9 + charsetlen + ascii + 3 * encoded;
 182
 183                         if (output) {
 184                                 /* continue the header */
 185                                 int curlen = q - output, i;
 186                                 outlen += prefixlen + 1; /* Header plus \n ": " */
 187                                 output = mh_xrealloc(output, outlen);
 188                                 q = output + curlen;
 189                                 *q++ = '?';
 190                                 *q++ = '=';
 191                                 *q++ = '\n';
 192                                 for (i = 0; i < prefixlen; i++) {
 193                                         *q++ = ' ';
 194                                 }
 195                         } else {
 196                                 /* do the initial allocation */
 197                                 /*
 198                                 ** A bit of a hack here; the header can
 199                                 ** contain multiple spaces (probably at
 200                                 ** least one) until we get to the actual
 201                                 ** text. Copy until we get to a non-space.
 202                                 */
 203                                 output = mh_xcalloc(outlen, sizeof(char));
 204                                 q = output;
 205                                 while (is_fws(*p)) {
 206                                         *q++ = *p++;
 207                                 }
 208                         }
 209
 210                         tokenlen = snprintf(q, outlen - (q - output),
 211                                         "=?%s?Q?", charset);
 212                         q += tokenlen;
 213                         column = prefixlen + tokenlen;
 214                         newline = 0;
 215                 }
 216
 217                 /*
 218                 ** Process each character, encoding if necessary
 219                 **
 220                 ** Note that we have a different set of rules if we're
 221                 ** processing RFC 5322 'phrase' (something you'd see in
 222                 ** an address header).
 223                 */
 224
 225                 column++;
 226
 227                 if (*p == ' ') {
 228                         *q++ = '_';
 229                         ascii--;
 230                 } else if (isascii((unsigned char) *p) && (phraserules ?
 231                                 qphrasevalid((unsigned char) *p)
 232                                 : !qpspecial((unsigned char) *p))) {
 233                         *q++ = *p;
 234                         ascii--;
 235                 } else {
 236                         snprintf(q, outlen - (q - output), "=%02X",
 237                                         (unsigned char) *p);
 238                         q += 3;
 239                         column += 2;   /* column already incremented by 1 above */
 240                         encoded--;
 241                 }
 242
 243                 p++;
 244
 245                 if (prefixlen == 0) {
 246                         /*
 247                         ** We haven't been passed in a header name,
 248                         ** so don't ever wrap the field (we're likely
 249                         ** doing an address).
 250                         */
 251                         continue;
 252                 }
 253                 /*
 254                 ** We're not allowed more than ENCODELINELIMIT characters
 255                 ** per line, so reserve some room for the final ?=.
 256                 */
 257                 if (column >= ENCODELINELIMIT - 2) {
 258                         newline = 1;
 259                 } else if (utf8) {
 260                         /*
 261                         ** Okay, this is a bit weird, but to explain a
 262                         ** bit more ...
 263                         **
 264                         ** RFC 2047 prohibits the splitting of multibyte
 265                         ** characters across encoded words.  Right now
 266                         ** we only handle the case of UTF-8, the most
 267                         ** common multibyte encoding.
 268                         **
 269                         ** p is now pointing at the next input character.
 270                         ** If we're using UTF-8 _and_ we'd go over
 271                         ** ENCODELINELIMIT given the length of the
 272                         ** complete character, then trigger a newline now.
 273                         ** Note that we check the length * 3 since we
 274                         ** have to allow for the encoded output.
 275                         */
 276                         if (column + (utf8len(p)*3) > ENCODELINELIMIT - 2) {
 277                                 newline = 1;
 278                         }
 279                 }
 280         }
 281
 282         if (q == NULL) {
 283                 /*
 284                 ** This should never happen, but just in case.
 285                 ** Found by clang static analyzer.
 286                 */
 287                 admonish (NULL, "null output encoding for %s", *value);
 288                 return 1;
 289         }
 290         *q++ = '?';
 291         *q++ = '=';
 292
 293         if (prefixlen) {
 294                 *q++ = '\n';
 295         }
 296         *q = '\0';
 297
 298         mh_free0(value);
 299         *value = output;
 300
 301         return 0;
 302 }
 303
 304 /*
 305 ** Calculate the length of a UTF-8 character.
 306 **
 307 ** If it's not a UTF-8 character (or we're in the middle of a multibyte
 308 ** character) then simply return 0.
 309 */
 310 static int
 311 utf8len(const char *p)
 312 {
 313         int len = 1;
 314
 315         if (*p == '\0') {
 316                 return 0;
 317         }
 318         if (isascii((unsigned char) *p) ||
 319                         (((unsigned char) *p) & 0xc0) == 0x80) {
 320                 return 0;
 321         }
 322         p++;
 323         while ((((unsigned char) *p++) & 0xc0) == 0x80) {
 324                 len++;
 325         }
 326
 327         return len;
 328 }
 329
 330 /*
 331 ** "Unfold" a header, making it a single line (without continuation)
 332 **
 333 ** We cheat a bit here; we never make the string longer, so using the
 334 ** original length here is fine.
 335 */
 336 static void
 337 unfold_header(char **value, int len)
 338 {
 339         char *str = mh_xcalloc(len + 1, sizeof(char));
 340         char *p = str, *q = *value;
 341
 342         while (*q != '\0') {
 343                 if (*q == '\n') {
 344                         /*
 345                         ** When we get a newline, skip to the next
 346                         ** non-whitespace character and add a space to
 347                         ** replace all of the whitespace
 348                         **
 349                         ** This has the side effect of stripping off the
 350                         ** final newline for the header; we put it back
 351                         ** in the encoding routine.
 352                         */
 353                         while (is_fws(*q)) {
 354                                 q++;
 355                         }
 356                         if (*q == '\0') {
 357                                 break;
 358                         }
 359                         *p++ = ' ';
 360                 } else {
 361                         *p++ = *q++;
 362                 }
 363         }
 364         *p = '\0';
 365
 366         mh_free0(value);
 367         *value = str;
 368 }
 369
 370 /*
 371 ** Decode a header containing addresses. This means we have to parse
 372 ** each address and only encode the display-name or comment field.
 373 */
 374 static int
 375 field_encode_address(const char *name, char **value, const char *charset)
 376 {
 377         int prefixlen = strlen(name) + 2;
 378         int column = prefixlen, groupflag;
 379         int asciichars, specialchars, eightbitchars;
 380         int reformat = 0, errflag = 0;
 381         size_t len;
 382         char *mp, *cp = NULL, *output = NULL;
 383         char *tmpbuf = NULL;
 384         size_t tmpbufsize = 0;
 385         struct mailname *mn;
 386         char errbuf[BUFSIZ];
 387
 388         /*
 389         ** Because these are addresses, we need to handle them individually.
 390         **
 391         ** Break them down and process them one by one.  This means we
 392         ** have to rewrite the whole header, but that's unavoidable.
 393         */
 394
 395         /*
 396         ** The output headers always have to start with a space first;
 397         ** this is just the way the API works right now.
 398         */
 399
 400         output = add(" ", output);
 401
 402         for (groupflag = 0; (mp = getname(*value)); ) {
 403                 if ((mn = getm(mp, NULL, 0, AD_HOST, errbuf)) == NULL) {
 404                         advise(NULL, "%s: %s", errbuf, mp);
 405                         errflag++;
 406                         continue;
 407                 }
 408
 409                 reformat = 0;
 410
 411                 /*
 412                 ** We only care if the phrase (m_pers) or any trailing
 413                 ** comment (m_note) have 8-bit characters.  If doing q-p,
 414                 ** we also need to encode anything marked as qspecial().
 415                 ** Unquote it first so the specialchars count is right.
 416                 */
 417
 418                 if (! mn->m_pers) {
 419                         goto check_note;
 420                 }
 421
 422                 if ((len = strlen(mn->m_pers)) + 1 > tmpbufsize) {
 423                         tmpbuf = mh_xrealloc(tmpbuf, tmpbufsize = len + 1);
 424                 }
 425
 426                 unquote_string(mn->m_pers, tmpbuf);
 427
 428                 if (scanstring(tmpbuf, &asciichars, &eightbitchars,
 429                                 &specialchars)) {
 430                         /*
 431                         ** If we have 8-bit characters, encode it.
 432                         */
 433
 434                         /*
 435                         ** This is okay, because the output of
 436                         ** unquote_string will be either equal or shorter
 437                         ** than the original.
 438                         */
 439                         strcpy(mn->m_pers, tmpbuf);
 440
 441                         if (field_encode_quoted(NULL, &mn->m_pers, charset,
 442                                         asciichars,
 443                                         eightbitchars + specialchars, 1)) {
 444                                 errflag++;
 445                                 goto out;
 446                         }
 447
 448                         reformat++;
 449                 }
 450
 451                 check_note:
 452
 453                 /*
 454                 ** The "note" field is generally a comment at the end
 455                 ** of the address, at least as how it's implemented here.
 456                 ** Notes are always surrounded by parenthesis (since they're
 457                 ** comments).  Strip them out and then put them back when
 458                 ** we format the final field, but they do not get encoded.
 459                 */
 460
 461                 if (! mn->m_note) {
 462                         goto do_reformat;
 463                 }
 464
 465                 if ((len = strlen(mn->m_note)) + 1 > tmpbufsize) {
 466                         tmpbuf = mh_xrealloc(tmpbuf, tmpbufsize = len + 1);
 467                 }
 468
 469                 if (mn->m_note[0] != '(' || mn->m_note[len - 1] != ')') {
 470                         advise(NULL, "Internal error: Invalid note field \"%s\"",
 471                                         mn->m_note);
 472                         errflag++;
 473                         goto out;
 474                 }
 475
 476                 strncpy(tmpbuf, mn->m_note + 1, len - 1);
 477                 tmpbuf[len - 2] = '\0';
 478
 479                 if (scanstring(tmpbuf, &asciichars, &eightbitchars,
 480                                 &specialchars)) {
 481                         /*
 482                         ** If we have 8-bit characters, encode it.
 483                         */
 484
 485                         if (field_encode_quoted(NULL, &tmpbuf, charset,
 486                                         asciichars,
 487                                         eightbitchars + specialchars, 1)) {
 488                                 errflag++;
 489                                 goto out;
 490                         }
 491
 492                         reformat++;
 493
 494                         /*
 495                         ** Make sure the size of tmpbuf is correct (it
 496                         ** always gets reallocated in the above functions).
 497                         */
 498
 499                         tmpbufsize = strlen(tmpbuf) + 1;
 500
 501                         /*
 502                         ** Put the note field back surrounded by
 503                         ** parenthesis.
 504                         */
 505
 506                         mn->m_note = mh_xrealloc(mn->m_note, tmpbufsize + 2);
 507
 508                         snprintf(mn->m_note, tmpbufsize + 2, "(%s)", tmpbuf);
 509                 }
 510
 511 do_reformat:
 512
 513                 /*
 514                 ** So, some explanation is in order.
 515                 **
 516                 ** We know we need to rewrite at least one address in the
 517                 ** header, otherwise we wouldn't be here.  If we had to
 518                 ** reformat this particular address, then run it through
 519                 ** adrformat().  Otherwise we can use m_text directly.
 520                 */
 521
 522                 /*
 523                 ** If we were in a group but are no longer, make sure we
 524                 ** add a semicolon (which needs to be FIRST, as it needs
 525                 ** to be at the end of the last address).
 526                 */
 527
 528                 if (groupflag && ! mn->m_ingrp) {
 529                         output = add(";", output);
 530                         column += 1;
 531                 }
 532
 533                 groupflag = mn->m_ingrp;
 534
 535                 if (mn->m_gname) {
 536                         cp = add(mn->m_gname, NULL);
 537                 }
 538
 539                 if (reformat) {
 540                         cp = add(adrformat(mn), cp);
 541                 } else {
 542                         cp = add(mn->m_text, cp);
 543                 }
 544
 545                 len = strlen(cp);
 546
 547                 /*
 548                 ** If we're not at the beginning of the line, add a
 549                 ** command and either a space or a newline.
 550                 */
 551
 552                 if (column != prefixlen) {
 553                         if (len + column + 2 > OUTPUTLINELEN) {
 554
 555                                 if ((size_t) (prefixlen + 3) < tmpbufsize) {
 556                                         tmpbuf = mh_xrealloc(tmpbuf, tmpbufsize = prefixlen + 3);
 557                                 }
 558
 559                                 snprintf(tmpbuf, tmpbufsize, ",\n%*s", column = prefixlen, "");
 560                                 output = add(tmpbuf, output);
 561                         } else {
 562                                 output = add(", ", output);
 563                                 column += 2;
 564                         }
 565                 }
 566
 567                 /*
 568                 ** Finally add the address
 569                 */
 570
 571                 output = add(cp, output);
 572                 column += len;
 573                 mh_free0(&cp);
 574         }
 575
 576         /*
 577         ** Just in case we're at the end of a list
 578         */
 579
 580         if (groupflag) {
 581                 output = add(";", output);
 582         }
 583
 584         output = add("\n", output);
 585
 586         mh_free0(value);
 587         *value = output;
 588         output = NULL;
 589
 590 out:
 591
 592         if (tmpbuf) {
 593                 mh_free0(&tmpbuf);
 594         }
 595         if (output) {
 596                 mh_free0(&output);
 597         }
 598
 599         return errflag > 0;
 600 }
 601
 602 /*
 603 ** Scan a string, check for characters that need to be encoded
 604 */
 605
 606 static int
 607 scanstring(const char *string, int *asciilen, int *eightbitchars,
 608                 int *specialchars)
 609 {
 610         *asciilen = 0;
 611         *eightbitchars = 0;
 612         *specialchars = 0;
 613
 614         for (; *string != '\0'; string++) {
 615                 if ((isascii((unsigned char) *string))) {
 616                         (*asciilen)++;
 617                         /*
 618                         ** So, a space is not a valid phrase character, but
 619                         ** we're counting an exception here, because in q-p
 620                         ** a space can be directly encoded as an underscore.
 621                         */
 622                         if (!qphrasevalid((unsigned char) *string) &&
 623                                         *string != ' ') {
 624                                 (*specialchars)++;
 625                         }
 626                 } else {
 627                         (*eightbitchars)++;
 628                 }
 629         }
 630
 631         return *eightbitchars > 0;
 632 }