git.marmaro.de Git - mmh/blob - docs/m_getfld.c.humor

   1 This is the pre-mmh version of sbr/m_getfld.c (dated 2008-12-26).
   2 The current version is still unbearable, but this one is original.
   3 Enjoy! :-)      -- 2012-04-01 markus schnalke <meillo@marmaro.de>
   4
   5 /*
   6  * m_getfld.c -- read/parse a message
   7  *
   8  * This code is Copyright (c) 2002, by the authors of nmh.  See the
   9  * COPYRIGHT file in the root directory of the nmh distribution for
  10  * complete copyright information.
  11  */
  12
  13 #include <h/mh.h>
  14 #include <h/mts.h>
  15 #include <h/utils.h>
  16
  17 /* This module has a long and checkered history.  First, it didn't burst
  18    maildrops correctly because it considered two CTRL-A:s in a row to be
  19    an inter-message delimiter.  It really is four CTRL-A:s followed by a
  20    newline.  Unfortunately, MMDF will convert this delimiter *inside* a
  21    message to a CTRL-B followed by three CTRL-A:s and a newline.  This
  22    caused the old version of m_getfld() to declare eom prematurely.  The
  23    fix was a lot slower than
  24
  25                 c == '\001' && peekc (iob) == '\001'
  26
  27    but it worked, and to increase generality, MBOX style maildrops could
  28    be parsed as well.  Unfortunately the speed issue finally caught up with
  29    us since this routine is at the very heart of MH.
  30
  31    To speed things up considerably, the routine Eom() was made an auxiliary
  32    function called by the macro eom().  Unless we are bursting a maildrop,
  33    the eom() macro returns FALSE saying we aren't at the end of the
  34    message.
  35
  36    The next thing to do is to read the mts.conf file and initialize
  37    delimiter[] and delimlen accordingly...
  38
  39    After mhl was made a built-in in msh, m_getfld() worked just fine
  40    (using m_unknown() at startup).  Until one day: a message which was
  41    the result of a bursting was shown. Then, since the burst boundaries
  42    aren't CTRL-A:s, m_getfld() would blindly plunge on past the boundary.
  43    Very sad.  The solution: introduce m_eomsbr().  This hook gets called
  44    after the end of each line (since testing for eom involves an fseek()).
  45    This worked fine, until one day: a message with no body portion arrived.
  46    Then the
  47
  48                    while (eom (c = Getc (iob), iob))
  49                         continue;
  50
  51    loop caused m_getfld() to return FMTERR.  So, that logic was changed to
  52    check for (*eom_action) and act accordingly.
  53
  54    This worked fine, until one day: someone didn't use four CTRL:A's as
  55    their delimiters.  So, the bullet got bit and we read mts.h and
  56    continue to struggle on.  It's not that bad though, since the only time
  57    the code gets executed is when inc (or msh) calls it, and both of these
  58    have already called mts_init().
  59
  60    ------------------------
  61    (Written by Van Jacobson for the mh6 m_getfld, January, 1986):
  62
  63    This routine was accounting for 60% of the cpu time used by most mh
  64    programs.  I spent a bit of time tuning and it now accounts for <10%
  65    of the time used.  Like any heavily tuned routine, it's a bit
  66    complex and you want to be sure you understand everything that it's
  67    doing before you start hacking on it.  Let me try to emphasize
  68    that:  every line in this atrocity depends on every other line,
  69    sometimes in subtle ways.  You should understand it all, in detail,
  70    before trying to change any part.  If you do change it, test the
  71    result thoroughly (I use a hand-constructed test file that exercises
  72    all the ways a header name, header body, header continuation,
  73    header-body separator, body line and body eom can align themselves
  74    with respect to a buffer boundary).  "Minor" bugs in this routine
  75    result in garbaged or lost mail.
  76
  77    If you hack on this and slow it down, I, my children and my
  78    children's children will curse you.
  79
  80    This routine gets used on three different types of files: normal,
  81    single msg files, "packed" unix or mmdf mailboxes (when used by inc)
  82    and packed, directoried bulletin board files (when used by msh).
  83    The biggest impact of different file types is in "eom" testing.  The
  84    code has been carefully organized to test for eom at appropriate
  85    times and at no other times (since the check is quite expensive).
  86    I have tried to arrange things so that the eom check need only be
  87    done on entry to this routine.  Since an eom can only occur after a
  88    newline, this is easy to manage for header fields.  For the msg
  89    body, we try to efficiently search the input buffer to see if
  90    it contains the eom delimiter.  If it does, we take up to the
  91    delimiter, otherwise we take everything in the buffer.  (The change
  92    to the body eom/copy processing produced the most noticeable
  93    performance difference, particularly for "inc" and "show".)
  94
  95    There are three qualitatively different things this routine busts
  96    out of a message: field names, field text and msg bodies.  Field
  97    names are typically short (~8 char) and the loop that extracts them
  98    might terminate on a colon, newline or max width.  I considered
  99    using a Vax "scanc" to locate the end of the field followed by a
 100    "bcopy" but the routine call overhead on a Vax is too large for this
 101    to work on short names.  If Berkeley ever makes "inline" part of the
 102    C optimiser (so things like "scanc" turn into inline instructions) a
 103    change here would be worthwhile.
 104
 105    Field text is typically 60 - 100 characters so there's (barely)
 106    a win in doing a routine call to something that does a "locc"
 107    followed by a "bmove".  About 30% of the fields have continuations
 108    (usually the 822 "received:" lines) and each continuation generates
 109    another routine call.  "Inline" would be a big win here, as well.
 110
 111    Messages, as of this writing, seem to come in two flavors: small
 112    (~1K) and long (>2K).  Most messages have 400 - 600 bytes of headers
 113    so message bodies average at least a few hundred characters.
 114    Assuming your system uses reasonably sized stdio buffers (1K or
 115    more), this routine should be able to remove the body in large
 116    (>500 byte) chunks.  This makes the cost of a call to "bcopy"
 117    small but there is a premium on checking for the eom in packed
 118    maildrops.  The eom pattern is always a simple string so we can
 119    construct an efficient pattern matcher for it (e.g., a Vax "matchc"
 120    instruction).  Some thought went into recognizing the start of
 121    an eom that has been split across two buffers.
 122
 123    This routine wants to deal with large chunks of data so, rather
 124    than "getc" into a local buffer, it uses stdio's buffer.  If
 125    you try to use it on a non-buffered file, you'll get what you
 126    deserve.  This routine "knows" that struct FILEs have a _ptr
 127    and a _cnt to describe the current state of the buffer and
 128    it knows that _filbuf ignores the _ptr & _cnt and simply fills
 129    the buffer.  If stdio on your system doesn't work this way, you
 130    may have to make small changes in this routine.
 131
 132    This routine also "knows" that an EOF indication on a stream is
 133    "sticky" (i.e., you will keep getting EOF until you reposition the
 134    stream).  If your system doesn't work this way it is broken and you
 135    should complain to the vendor.  As a consequence of the sticky
 136    EOF, this routine will never return any kind of EOF status when
 137    there is data in "name" or "buf".
 138   */
 139
 140
 141 /*
 142  * static prototypes
 143  */
 144 static int m_Eom (int, FILE *);
 145 static unsigned char *matchc(int, char *, int, char *);
 146 static unsigned char *locc(int, unsigned char *, unsigned char);
 147
 148 #define Getc(iob)       getc(iob)
 149 #define eom(c,iob)      (msg_style != MS_DEFAULT && \
 150                          (((c) == *msg_delim && m_Eom(c,iob)) ||\
 151                           (eom_action && (*eom_action)(c))))
 152
/*
 * Table of 256 pointers indexed by byte value, allocated and filled in
 * m_unknown().  A non-NULL entry points at a position of that byte inside
 * the delimiter string; the BODY case of m_getfld() uses it to detect a
 * partial delimiter at the end of the input buffer.
 */
static unsigned char **pat_map;

/*
 * defined in sbr/m_msgdef.c = 0
 * This is a disgusting hack for "inc" so it can know how many
 * characters were stuffed in the buffer on the last call
 * (see comments in uip/scansbr.c).
 */
extern int msg_count;

/*
 * defined in sbr/m_msgdef.c = MS_DEFAULT
 * Set to MS_MBOX or MS_MMDF by m_unknown() and to MS_MSH or MS_MMDF by
 * m_eomsbr(); the eom() macro is a no-op while it is MS_DEFAULT.
 */
extern int msg_style;

/*
 * The "full" delimiter string for a packed maildrop consists
 * of a newline followed by the actual delimiter.  E.g., the
 * full string for a Unix maildrop would be: "\n\nFrom ".
 * "Fdelim" points to the start of the full string and is used
 * in the BODY case of the main routine to search the buffer for
 * a possible eom.  Msg_delim points to the first character of
 * the actual delim. string (i.e., fdelim+1).  Edelim
 * points to the 2nd character of actual delimiter string.  It
 * is used in m_Eom because the first character of the string
 * has been read and matched before m_Eom is called.
 *
 * Fdelimlen is the length of the full string (delimiter length + 1)
 * and edelimlen is the length of the part edelim points at
 * (delimiter length - 1).  Delimend is msg_delim + edelimlen, i.e.
 * the last character of the delimiter.
 */
extern char *msg_delim;         /* defined in sbr/m_msgdef.c = "" */
static unsigned char *fdelim;
static unsigned char *delimend;
static int fdelimlen;
static unsigned char *edelim;
static int edelimlen;

/*
 * Optional end-of-message hook installed by m_eomsbr(); consulted by
 * the eom() macro with the character that was just read.
 */
static int (*eom_action)(int) = NULL;
 188
/*
 * m_getfld() pokes directly at the stdio buffer through the names
 * _ptr, _cnt and _filbuf.  The sections below map those names onto
 * whatever the local stdio implementation calls them.
 */
#ifdef _FSTDIO
# define _ptr    _p             /* Gag   */
# define _cnt    _r             /* Retch */
# define _filbuf __srget        /* Puke  */
# define DEFINED__FILBUF_TO_SOMETHING_SPECIFIC
#endif

#ifdef SCO_5_STDIO
# define _ptr  __ptr
# define _cnt  __cnt
# define _base __base
# define _filbuf(fp)  ((fp)->__cnt = 0, __filbuf(fp))
# define DEFINED__FILBUF_TO_SOMETHING_SPECIFIC
#endif

/* No mapping was chosen above: declare the traditional refill routine. */
#ifndef DEFINED__FILBUF_TO_SOMETHING_SPECIFIC
extern int  _filbuf(FILE*);
#endif
 207
 208
 209 int
 210 m_getfld (int state, unsigned char *name, unsigned char *buf,
 211           int bufsz, FILE *iob)
 212 {
 213     register unsigned char  *bp, *cp, *ep, *sp;
 214     register int cnt, c, i, j;
 215
 216     if ((c = Getc(iob)) < 0) {
 217         msg_count = 0;
 218         *buf = 0;
 219         return FILEEOF;
 220     }
 221     if (eom (c, iob)) {
 222         if (! eom_action) {
 223             /* flush null messages */
 224             while ((c = Getc(iob)) >= 0 && eom (c, iob))
 225                 ;
 226             if (c >= 0)
 227                 ungetc(c, iob);
 228         }
 229         msg_count = 0;
 230         *buf = 0;
 231         return FILEEOF;
 232     }
 233
 234     switch (state) {
 235         case FLDEOF:
 236         case BODYEOF:
 237         case FLD:
 238             if (c == '\n' || c == '-') {
 239                 /* we hit the header/body separator */
 240                 while (c != '\n' && (c = Getc(iob)) >= 0)
 241                     ;
 242
 243                 if (c < 0 || (c = Getc(iob)) < 0 || eom (c, iob)) {
 244                     if (! eom_action) {
 245                         /* flush null messages */
 246                         while ((c = Getc(iob)) >= 0 && eom (c, iob))
 247                             ;
 248                         if (c >= 0)
 249                             ungetc(c, iob);
 250                     }
 251                     msg_count = 0;
 252                     *buf = 0;
 253                     return FILEEOF;
 254                 }
 255                 state = BODY;
 256                 goto body;
 257             }
 258             /*
 259              * get the name of this component.  take characters up
 260              * to a ':', a newline or NAMESZ-1 characters, whichever
 261              * comes first.
 262              */
 263             cp = name;
 264             i = NAMESZ - 1;
 265             for (;;) {
 266 #ifdef LINUX_STDIO
 267                 bp = sp = (unsigned char *) iob->_IO_read_ptr - 1;
 268                 j = (cnt = ((long) iob->_IO_read_end -
 269                         (long) iob->_IO_read_ptr)  + 1) < i ? cnt : i;
 270 #elif defined(__DragonFly__)
 271                 bp = sp = (unsigned char *) ((struct __FILE_public *)iob)->_p - 1;
 272                 j = (cnt = ((struct __FILE_public *)iob)->_r+1) < i ? cnt : i;
 273 #else
 274                 bp = sp = (unsigned char *) iob->_ptr - 1;
 275                 j = (cnt = iob->_cnt+1) < i ? cnt : i;
 276 #endif
 277                 while (--j >= 0 && (c = *bp++) != ':' && c != '\n')
 278                     *cp++ = c;
 279
 280                 j = bp - sp;
 281                 if ((cnt -= j) <= 0) {
 282 #ifdef LINUX_STDIO
 283                     iob->_IO_read_ptr = iob->_IO_read_end;
 284                     if (__underflow(iob) == EOF) {
 285 #elif defined(__DragonFly__)
 286                     if (__srget(iob) == EOF) {
 287 #else
 288                     if (_filbuf(iob) == EOF) {
 289 #endif
 290                         *cp = *buf = 0;
 291                         advise (NULL, "eof encountered in field \"%s\"", name);
 292                         return FMTERR;
 293                     }
 294 #ifdef LINUX_STDIO
 295                 iob->_IO_read_ptr++; /* NOT automatic in __underflow()! */
 296 #endif
 297                 } else {
 298 #ifdef LINUX_STDIO
 299                     iob->_IO_read_ptr = bp + 1;
 300 #elif defined(__DragonFly__)
 301                     ((struct __FILE_public *)iob)->_p = bp + 1;
 302                     ((struct __FILE_public *)iob)->_r = cnt - 1;
 303 #else
 304                     iob->_ptr = bp + 1;
 305                     iob->_cnt = cnt - 1;
 306 #endif
 307                 }
 308                 if (c == ':')
 309                     break;
 310
 311                 /*
 312                  * something went wrong.  possibilities are:
 313                  *  . hit a newline (error)
 314                  *  . got more than namesz chars. (error)
 315                  *  . hit the end of the buffer. (loop)
 316                  */
 317                 if (c == '\n') {
 318                     /* We hit the end of the line without seeing ':' to
 319                      * terminate the field name.  This is usually (always?)
 320                      * spam.  But, blowing up is lame, especially when
 321                      * scan(1)ing a folder with such messages.  Pretend such
 322                      * lines are the first of the body (at least mutt also
 323                      * handles it this way). */
 324
 325                     /* See if buf can hold this line, since we were assuming
 326                      * we had a buffer of NAMESZ, not bufsz. */
 327                     /* + 1 for the newline */
 328                     if (bufsz < j + 1) {
 329                         /* No, it can't.  Oh well, guess we'll blow up. */
 330                         *cp = *buf = 0;
 331                         advise (NULL, "eol encountered in field \"%s\"", name);
 332                         state = FMTERR;
 333                         goto finish;
 334                     }
 335                     memcpy (buf, name, j - 1);
 336                     buf[j - 1] = '\n';
 337                     buf[j] = '\0';
 338                     /* mhparse.c:get_content wants to find the position of the
 339                      * body start, but it thinks there's a blank line between
 340                      * the header and the body (naturally!), so seek back so
 341                      * that things line up even though we don't have that
 342                      * blank line in this case.  Simpler parsers (e.g. mhl)
 343                      * get extra newlines, but that should be harmless enough,
 344                      * right?  This is a corrupt message anyway. */
 345                     fseek (iob, ftell (iob) - 2, SEEK_SET);
 346                     return BODY;
 347                 }
 348                 if ((i -= j) <= 0) {
 349                     *cp = *buf = 0;
 350                     advise (NULL, "field name \"%s\" exceeds %d bytes", name, NAMESZ - 2);
 351                     state = LENERR;
 352                     goto finish;
 353                 }
 354             }
 355
 356             while (isspace (*--cp) && cp >= name)
 357                 ;
 358             *++cp = 0;
 359             /* fall through */
 360
 361         case FLDPLUS:
 362             /*
 363              * get (more of) the text of a field.  take
 364              * characters up to the end of this field (newline
 365              * followed by non-blank) or bufsz-1 characters.
 366              */
 367             cp = buf; i = bufsz-1;
 368             for (;;) {
 369 #ifdef LINUX_STDIO
 370                 cnt = (long) iob->_IO_read_end - (long) iob->_IO_read_ptr;
 371                 bp = (unsigned char *) --iob->_IO_read_ptr;
 372 #elif defined(__DragonFly__)
 373                 cnt = ((struct __FILE_public *)iob)->_r++;
 374                 bp = (unsigned char *) --((struct __FILE_public *)iob)->_p;
 375 #else
 376                 cnt = iob->_cnt++;
 377                 bp = (unsigned char *) --iob->_ptr;
 378 #endif
 379                 c = cnt < i ? cnt : i;
 380                 while ((ep = locc( c, bp, '\n' ))) {
 381                     /*
 382                      * if we hit the end of this field, return.
 383                      */
 384                     if ((j = *++ep) != ' ' && j != '\t') {
 385 #ifdef LINUX_STDIO
 386                         j = ep - (unsigned char *) iob->_IO_read_ptr;
 387                         memcpy (cp, iob->_IO_read_ptr, j);
 388                         iob->_IO_read_ptr = ep;
 389 #elif defined(__DragonFly__)
 390                         j = ep - (unsigned char *) ((struct __FILE_public *)iob)->_p;
 391                         memcpy (cp, ((struct __FILE_public *)iob)->_p, j);
 392                         ((struct __FILE_public *)iob)->_p = ep;
 393                         ((struct __FILE_public *)iob)->_r -= j;
 394 #else
 395                         j = ep - (unsigned char *) iob->_ptr;
 396                         memcpy (cp, iob->_ptr, j);
 397                         iob->_ptr = ep;
 398                         iob->_cnt -= j;
 399 #endif
 400                         cp += j;
 401                         state = FLD;
 402                         goto finish;
 403                     }
 404                     c -= ep - bp;
 405                     bp = ep;
 406                 }
 407                 /*
 408                  * end of input or dest buffer - copy what we've found.
 409                  */
 410 #ifdef LINUX_STDIO
 411                 c += bp - (unsigned char *) iob->_IO_read_ptr;
 412                 memcpy( cp, iob->_IO_read_ptr, c);
 413 #elif defined(__DragonFly__)
 414                 c += bp - (unsigned char *) ((struct __FILE_public *)iob)->_p;
 415                 memcpy( cp, ((struct __FILE_public *)iob)->_p, c);
 416 #else
 417                 c += bp - (unsigned char *) iob->_ptr;
 418                 memcpy( cp, iob->_ptr, c);
 419 #endif
 420                 i -= c;
 421                 cp += c;
 422                 if (i <= 0) {
 423                     /* the dest buffer is full */
 424 #ifdef LINUX_STDIO
 425                     iob->_IO_read_ptr += c;
 426 #elif defined(__DragonFly__)
 427                     ((struct __FILE_public *)iob)->_r -= c;
 428                     ((struct __FILE_public *)iob)->_p += c;
 429 #else
 430                     iob->_cnt -= c;
 431                     iob->_ptr += c;
 432 #endif
 433                     state = FLDPLUS;
 434                     break;
 435                 }
 436                 /*
 437                  * There's one character left in the input buffer.
 438                  * Copy it & fill the buffer.  If the last char
 439                  * was a newline and the next char is not whitespace,
 440                  * this is the end of the field.  Otherwise loop.
 441                  */
 442                 --i;
 443 #ifdef LINUX_STDIO
 444                 *cp++ = j = *(iob->_IO_read_ptr + c);
 445                 iob->_IO_read_ptr = iob->_IO_read_end;
 446                 c = __underflow(iob);
 447                 iob->_IO_read_ptr++;    /* NOT automatic! */
 448 #elif defined(__DragonFly__)
 449                 *cp++ =j = *(((struct __FILE_public *)iob)->_p + c);
 450                 c = __srget(iob);
 451 #else
 452                 *cp++ = j = *(iob->_ptr + c);
 453                 c = _filbuf(iob);
 454 #endif
 455                 if (c == EOF ||
 456                   ((j == '\0' || j == '\n') && c != ' ' && c != '\t')) {
 457                     if (c != EOF) {
 458 #ifdef LINUX_STDIO
 459                         --iob->_IO_read_ptr;
 460 #elif defined(__DragonFly__)
 461                         --((struct __FILE_public *)iob)->_p;
 462                         ++((struct __FILE_public *)iob)->_r;
 463 #else
 464                         --iob->_ptr;
 465                         ++iob->_cnt;
 466 #endif
 467                     }
 468                     state = FLD;
 469                     break;
 470                 }
 471             }
 472             break;
 473
 474         case BODY:
 475         body:
 476             /*
 477              * get the message body up to bufsz characters or the
 478              * end of the message.  Sleazy hack: if bufsz is negative
 479              * we assume that we were called to copy directly into
 480              * the output buffer and we don't add an eos.
 481              */
 482             i = (bufsz < 0) ? -bufsz : bufsz-1;
 483 #ifdef LINUX_STDIO
 484             bp = (unsigned char *) --iob->_IO_read_ptr;
 485             cnt = (long) iob->_IO_read_end - (long) iob->_IO_read_ptr;
 486 #elif defined(__DragonFly__)
 487             bp = (unsigned char *) --((struct __FILE_public *)iob)->_p;
 488             cnt = ++((struct __FILE_public *)iob)->_r;
 489 #else
 490             bp = (unsigned char *) --iob->_ptr;
 491             cnt = ++iob->_cnt;
 492 #endif
 493             c = (cnt < i ? cnt : i);
 494             if (msg_style != MS_DEFAULT && c > 1) {
 495                 /*
 496                  * packed maildrop - only take up to the (possible)
 497                  * start of the next message.  This "matchc" should
 498                  * probably be a Boyer-Moore matcher for non-vaxen,
 499                  * particularly since we have the alignment table
 500                  * all built for the end-of-buffer test (next).
 501                  * But our vax timings indicate that the "matchc"
 502                  * instruction is 50% faster than a carefully coded
 503                  * B.M. matcher for most strings.  (So much for elegant
 504                  * algorithms vs. brute force.)  Since I (currently)
 505                  * run MH on a vax, we use the matchc instruction. --vj
 506                  */
 507                 if ((ep = matchc( fdelimlen, fdelim, c, bp )))
 508                     c = ep - bp + 1;
 509                 else {
 510                     /*
 511                      * There's no delim in the buffer but there may be
 512                      * a partial one at the end.  If so, we want to leave
 513                      * it so the "eom" check on the next call picks it up.
 514                      * Use a modified Boyer-Moore matcher to make this
 515                      * check relatively cheap.  The first "if" figures
 516                      * out what position in the pattern matches the last
 517                      * character in the buffer.  The inner "while" matches
 518                      * the pattern against the buffer, backwards starting
 519                      * at that position.  Note that unless the buffer
 520                      * ends with one of the characters in the pattern
 521                      * (excluding the first and last), we do only one test.
 522                      */
 523                     ep = bp + c - 1;
 524                     if ((sp = pat_map[*ep])) {
 525                         do {
 526                             /* This if() is true unless (a) the buffer is too
 527                              * small to contain this delimiter prefix, or
 528                              * (b) it contains exactly enough chars for the
 529                              * delimiter prefix.
 530                              * For case (a) obviously we aren't going to match.
 531                              * For case (b), if the buffer really contained exactly
 532                              * a delim prefix, then the m_eom call at entry
 533                              * should have found it.  Thus it's not a delim
 534                              * and we know we won't get a match.
 535                              */
 536                             if (((sp - fdelim) + 2) <= c) {
 537                                 cp = sp;
 538                                 /* Unfortunately although fdelim has a preceding NUL
 539                                  * we can't use this as a sentinel in case the buffer
 540                                  * contains a NUL in exactly the wrong place (this
 541                                  * would cause us to run off the front of fdelim).
 542                                  */
 543                                 while (*--ep == *--cp)
 544                                     if (cp < fdelim)
 545                                         break;
 546                                 if (cp < fdelim) {
 547                                     /* we matched the entire delim prefix,
 548                                      * so only take the buffer up to there.
 549                                      * we know ep >= bp -- check above prevents underrun
 550                                      */
 551                                     c = (ep - bp) + 2;
 552                                     break;
 553                                 }
 554                             }
 555                             /* try matching one less char of delim string */
 556                             ep = bp + c - 1;
 557                         } while (--sp > fdelim);
 558                     }
 559                 }
 560             }
 561             memcpy( buf, bp, c );
 562 #ifdef LINUX_STDIO
 563             iob->_IO_read_ptr += c;
 564 #elif defined(__DragonFly__)
 565             ((struct __FILE_public *)iob)->_r -= c;
 566             ((struct __FILE_public *)iob)->_p += c;
 567 #else
 568             iob->_cnt -= c;
 569             iob->_ptr += c;
 570 #endif
 571             if (bufsz < 0) {
 572                 msg_count = c;
 573                 return (state);
 574             }
 575             cp = buf + c;
 576             break;
 577
 578         default:
 579             adios (NULL, "m_getfld() called with bogus state of %d", state);
 580     }
 581 finish:
 582     *cp = 0;
 583     msg_count = cp - buf;
 584     return (state);
 585 }
 586
 587
#ifdef RPATHS
/*
 * Remainder of the most recently seen mbox "From " line (the text that
 * follows "From ", without the newline).  Filled in by m_unknown() and
 * m_Eom(), consumed and cleared by get_returnpath().
 */
static char unixbuf[BUFSIZ] = "";
#endif /* RPATHS */
 591
 592 void
 593 m_unknown(FILE *iob)
 594 {
 595     register int c;
 596     register long pos;
 597     char text[10];
 598     register char *cp;
 599     register char *delimstr;
 600
 601 /*
 602  * Figure out what the message delimitter string is for this
 603  * maildrop.  (This used to be part of m_Eom but I didn't like
 604  * the idea of an "if" statement that could only succeed on the
 605  * first call to m_Eom getting executed on each call, i.e., at
 606  * every newline in the message).
 607  *
 608  * If the first line of the maildrop is a Unix "From " line, we
 609  * say the style is MBOX and eat the rest of the line.  Otherwise
 610  * we say the style is MMDF and look for the delimiter string
 611  * specified when nmh was built (or from the mts.conf file).
 612  */
 613
 614     msg_style = MS_UNKNOWN;
 615
 616     pos = ftell (iob);
 617     if (fread (text, sizeof(*text), 5, iob) == 5
 618             && strncmp (text, "From ", 5) == 0) {
 619         msg_style = MS_MBOX;
 620         delimstr = "\nFrom ";
 621 #ifndef RPATHS
 622         while ((c = getc (iob)) != '\n' && c >= 0)
 623             ;
 624 #else /* RPATHS */
 625         cp = unixbuf;
 626         while ((c = getc (iob)) != '\n' && cp - unixbuf < BUFSIZ - 1)
 627             *cp++ = c;
 628         *cp = 0;
 629 #endif /* RPATHS */
 630     } else {
 631         /* not a Unix style maildrop */
 632         fseek (iob, pos, SEEK_SET);
 633         if (mmdlm2 == NULL || *mmdlm2 == 0)
 634             mmdlm2 = "\001\001\001\001\n";
 635         delimstr = mmdlm2;
 636         msg_style = MS_MMDF;
 637     }
 638     c = strlen (delimstr);
 639     fdelim = (unsigned char *) mh_xmalloc((size_t) (c + 3));
 640     *fdelim++ = '\0';
 641     *fdelim = '\n';
 642     msg_delim = (char *)fdelim+1;
 643     edelim = (unsigned char *)msg_delim+1;
 644     fdelimlen = c + 1;
 645     edelimlen = c - 1;
 646     strcpy (msg_delim, delimstr);
 647     delimend = (unsigned char *)msg_delim + edelimlen;
 648     if (edelimlen <= 1)
 649         adios (NULL, "maildrop delimiter must be at least 2 bytes");
 650     /*
 651      * build a Boyer-Moore end-position map for the matcher in m_getfld.
 652      * N.B. - we don't match just the first char (since it's the newline
 653      * separator) or the last char (since the matchc would have found it
 654      * if it was a real delim).
 655      */
 656     pat_map = (unsigned char **) calloc (256, sizeof(unsigned char *));
 657
 658     for (cp = (char *) fdelim + 1; cp < (char *) delimend; cp++ )
 659         pat_map[(unsigned char)*cp] = (unsigned char *) cp;
 660
 661     if (msg_style == MS_MMDF) {
 662         /* flush extra msg hdrs */
 663         while ((c = Getc(iob)) >= 0 && eom (c, iob))
 664             ;
 665         if (c >= 0)
 666             ungetc(c, iob);
 667     }
 668 }
 669
 670
 671 void
 672 m_eomsbr (int (*action)(int))
 673 {
 674     if ((eom_action = action)) {
 675         msg_style = MS_MSH;
 676         *msg_delim = 0;
 677         fdelimlen = 1;
 678         delimend = fdelim;
 679     } else {
 680         msg_style = MS_MMDF;
 681         msg_delim = (char *)fdelim + 1;
 682         fdelimlen = strlen((char *)fdelim);
 683         delimend = (unsigned char *)(msg_delim + edelimlen);
 684     }
 685 }
 686
 687
 688 /*
 689  * test for msg delimiter string
 690  */
 691
 692 static int
 693 m_Eom (int c, FILE *iob)
 694 {
 695     register long pos = 0L;
 696     register int i;
 697     char text[10];
 698 #ifdef RPATHS
 699     register char *cp;
 700 #endif /* RPATHS */
 701
 702     pos = ftell (iob);
 703     if ((i = fread (text, sizeof *text, edelimlen, iob)) != edelimlen
 704             || strncmp (text, (char *)edelim, edelimlen)) {
 705         if (i == 0 && msg_style == MS_MBOX)
 706             /* the final newline in the (brain damaged) unix-format
 707              * maildrop is part of the delimitter - delete it.
 708              */
 709             return 1;
 710
 711 #if 0
 712         fseek (iob, pos, SEEK_SET);
 713 #endif
 714
 715         fseek (iob, (long)(pos-1), SEEK_SET);
 716         getc (iob);             /* should be OK */
 717         return 0;
 718     }
 719
 720     if (msg_style == MS_MBOX) {
 721 #ifndef RPATHS
 722         while ((c = getc (iob)) != '\n')
 723             if (c < 0)
 724                 break;
 725 #else /* RPATHS */
 726         cp = unixbuf;
 727         while ((c = getc (iob)) != '\n' && c >= 0 && cp - unixbuf < BUFSIZ - 1)
 728             *cp++ = c;
 729         *cp = 0;
 730 #endif /* RPATHS */
 731     }
 732
 733     return 1;
 734 }
 735
 736
 737 #ifdef RPATHS
 738 /*
 739  * Return the Return-Path and Delivery-Date
 740  * header information.
 741  *
 742  * Currently, I'm assuming that the "From " line
 743  * takes one of the following forms.
 744  *
 745  * From sender date remote from host   (for UUCP delivery)
 746  * From sender@host  date              (for sendmail delivery)
 747  */
 748
 749 int
 750 get_returnpath (char *rp, int rplen, char *dd, int ddlen)
 751 {
 752     char *ap, *bp, *cp, *dp;
 753
 754     ap = unixbuf;
 755     if (!(bp = cp = strchr(ap, ' ')))
 756         return 0;
 757
 758     /*
 759      * Check for "remote from" in envelope to see
 760      * if this message uses UUCP style addressing
 761      */
 762     while ((cp = strchr(++cp, 'r'))) {
 763         if (strncmp (cp, "remote from", 11) == 0) {
 764             cp = strrchr (cp, ' ');
 765             break;
 766         }
 767     }
 768
 769     /*
 770      * Get the Return-Path information from
 771      * the "From " envelope.
 772      */
 773     if (cp) {
 774         /* return path for UUCP style addressing */
 775         dp = strchr (++cp, '\n');
 776         snprintf (rp, rplen, "%.*s!%.*s\n", (int)(dp - cp), cp, (int)(bp - ap), ap);
 777     } else {
 778         /* return path for standard domain addressing */
 779         snprintf (rp, rplen, "%.*s\n", (int)(bp - ap), ap);
 780     }
 781
 782     /*
 783      * advance over the spaces to get to
 784      * delivery date on envelope
 785      */
 786     while (*bp == ' ')
 787         bp++;
 788
 789     /* Now get delivery date from envelope */
 790     snprintf (dd, ddlen, "%.*s\n", 24, bp);
 791
 792     unixbuf[0] = 0;
 793     return 1;
 794 }
 795 #endif /* RPATHS */
 796
 797
 798 static unsigned char *
 799 matchc(int patln, char *pat, int strln, char *str)
 800 {
 801         register char *es = str + strln - patln;
 802         register char *sp;
 803         register char *pp;
 804         register char *ep = pat + patln;
 805         register char pc = *pat++;
 806
 807         for(;;) {
 808                 while (pc != *str++)
 809                         if (str > es)
 810                                 return 0;
 811                 if (str > es+1)
 812                         return 0;
 813                 sp = str; pp = pat;
 814                 while (pp < ep && *sp++ == *pp)
 815                         pp++;
 816                 if (pp >= ep)
 817                         return ((unsigned char *)--str);
 818         }
 819 }
 820
 821
 822 /*
 823  * Locate character "term" in the next "cnt" characters of "src".
 824  * If found, return its address, otherwise return 0.
 825  */
 826
 827 static unsigned char *
 828 locc(int cnt, unsigned char *src, unsigned char term)
 829 {
 830     while (*src++ != term && --cnt > 0);
 831
 832     return (cnt > 0 ? --src : (unsigned char *)0);
 833 }
 834