git.marmaro.de Git - mmh/blob - sbr/m_getfld.c

   1
   2 /*
   3  * m_getfld.c -- read/parse a message
   4  *
   5  * This code is Copyright (c) 2002, by the authors of nmh.  See the
   6  * COPYRIGHT file in the root directory of the nmh distribution for
   7  * complete copyright information.
   8  */
   9
  10 #include <h/mh.h>
  11 #include <h/mts.h>
  12 #include <h/utils.h>
  13
  14 /* This module has a long and checkered history.  First, it didn't burst
  15    maildrops correctly because it considered two CTRL-A:s in a row to be
  16    an inter-message delimiter.  It really is four CTRL-A:s followed by a
  17    newline.  Unfortunately, MMDF will convert this delimiter *inside* a
  18    message to a CTRL-B followed by three CTRL-A:s and a newline.  This
  19    caused the old version of m_getfld() to declare eom prematurely.  The
  20    fix was a lot slower than
  21
  22                 c == '\001' && peekc (iob) == '\001'
  23
  24    but it worked, and to increase generality, MBOX style maildrops could
  25    be parsed as well.  Unfortunately the speed issue finally caught up with
  26    us since this routine is at the very heart of MH.
  27
  28    To speed things up considerably, the routine Eom() was made an auxiliary
  29    function called by the macro eom().  Unless we are bursting a maildrop,
  30    the eom() macro returns FALSE saying we aren't at the end of the
  31    message.
  32
  33    The next thing to do is to read the mts.conf file and initialize
  34    delimiter[] and delimlen accordingly...
  35
  36    After mhl was made a built-in in msh, m_getfld() worked just fine
  37    (using m_unknown() at startup).  Until one day: a message which was
  38    the result of a bursting was shown. Then, since the burst boundaries
  39    aren't CTRL-A:s, m_getfld() would blindly plunge on past the boundary.
  40    Very sad.  The solution: introduce m_eomsbr().  This hook gets called
  41    after the end of each line (since testing for eom involves an fseek()).
  42    This worked fine, until one day: a message with no body portion arrived.
  43    Then the
  44
  45                    while (eom (c = Getc (iob), iob))
  46                         continue;
  47
  48    loop caused m_getfld() to return FMTERR.  So, that logic was changed to
  49    check for (*eom_action) and act accordingly.
  50
  51    This worked fine, until one day: someone didn't use four CTRL-A:s as
  52    their delimiters.  So, the bullet got bit and we read mts.h and
  53    continue to struggle on.  It's not that bad though, since the only time
  54    the code gets executed is when inc (or msh) calls it, and both of these
  55    have already called mts_init().
  56
  57    ------------------------
  58    (Written by Van Jacobson for the mh6 m_getfld, January, 1986):
  59
  60    This routine was accounting for 60% of the cpu time used by most mh
  61    programs.  I spent a bit of time tuning and it now accounts for <10%
  62    of the time used.  Like any heavily tuned routine, it's a bit
  63    complex and you want to be sure you understand everything that it's
  64    doing before you start hacking on it.  Let me try to emphasize
  65    that:  every line in this atrocity depends on every other line,
  66    sometimes in subtle ways.  You should understand it all, in detail,
  67    before trying to change any part.  If you do change it, test the
  68    result thoroughly (I use a hand-constructed test file that exercises
  69    all the ways a header name, header body, header continuation,
  70    header-body separator, body line and body eom can align themselves
  71    with respect to a buffer boundary).  "Minor" bugs in this routine
  72    result in garbaged or lost mail.
  73
  74    If you hack on this and slow it down, I, my children and my
  75    children's children will curse you.
  76
  77    This routine gets used on three different types of files: normal,
  78    single msg files, "packed" unix or mmdf mailboxes (when used by inc)
  79    and packed, directoried bulletin board files (when used by msh).
  80    The biggest impact of different file types is in "eom" testing.  The
  81    code has been carefully organized to test for eom at appropriate
  82    times and at no other times (since the check is quite expensive).
  83    I have tried to arrange things so that the eom check need only be
  84    done on entry to this routine.  Since an eom can only occur after a
  85    newline, this is easy to manage for header fields.  For the msg
  86    body, we try to efficiently search the input buffer to see if it
  87    contains the eom delimiter.  If it does, we take up to the
  88    delimiter, otherwise we take everything in the buffer.  (The change
  89    to the body eom/copy processing produced the most noticeable
  90    performance difference, particularly for "inc" and "show".)
  91
  92    There are three qualitatively different things this routine busts
  93    out of a message: field names, field text and msg bodies.  Field
  94    names are typically short (~8 char) and the loop that extracts them
  95    might terminate on a colon, newline or max width.  I considered
  96    using a Vax "scanc" to locate the end of the field followed by a
  97    "bcopy" but the routine call overhead on a Vax is too large for this
  98    to work on short names.  If Berkeley ever makes "inline" part of the
  99    C optimiser (so things like "scanc" turn into inline instructions) a
 100    change here would be worthwhile.
 101
 102    Field text is typically 60 - 100 characters so there's (barely)
 103    a win in doing a routine call to something that does a "locc"
 104    followed by a "bmove".  About 30% of the fields have continuations
 105    (usually the 822 "received:" lines) and each continuation generates
 106    another routine call.  "Inline" would be a big win here, as well.
 107
 108    Messages, as of this writing, seem to come in two flavors: small
 109    (~1K) and long (>2K).  Most messages have 400 - 600 bytes of headers
 110    so message bodies average at least a few hundred characters.
 111    Assuming your system uses reasonably sized stdio buffers (1K or
 112    more), this routine should be able to remove the body in large
 113    (>500 byte) chunks.  This makes the cost of a call to "bcopy"
 114    small but there is a premium on checking for the eom in packed
 115    maildrops.  The eom pattern is always a simple string so we can
 116    construct an efficient pattern matcher for it (e.g., a Vax "matchc"
 117    instruction).  Some thought went into recognizing the start of
 118    an eom that has been split across two buffers.
 119
 120    This routine wants to deal with large chunks of data so, rather
 121    than "getc" into a local buffer, it uses stdio's buffer.  If
 122    you try to use it on a non-buffered file, you'll get what you
 123    deserve.  This routine "knows" that struct FILEs have a _ptr
 124    and a _cnt to describe the current state of the buffer and
 125    it knows that _filbuf ignores the _ptr & _cnt and simply fills
 126    the buffer.  If stdio on your system doesn't work this way, you
 127    may have to make small changes in this routine.
 128
 129    This routine also "knows" that an EOF indication on a stream is
 130    "sticky" (i.e., you will keep getting EOF until you reposition the
 131    stream).  If your system doesn't work this way it is broken and you
 132    should complain to the vendor.  As a consequence of the sticky
 133    EOF, this routine will never return any kind of EOF status when
 134    there is data in "name" or "buf".
 135   */
 136
 137
 138 /*
 139  * static prototypes
      *
      * m_Eom()  reads ahead on the stream to check for the remainder of
      *          the message delimiter; in this file it is only reached
      *          through the eom() macro below.
      * matchc() searches a counted buffer for a counted pattern.
      * locc()   searches a counted buffer for a single character.
 140  */
 141 static int m_Eom (int, FILE *);
 142 static unsigned char *matchc(int, char *, int, char *);
 143 static unsigned char *locc(int, unsigned char *, unsigned char);
 144 /*
      * Getc(iob) is a plain alias for getc(iob).
      *
      * eom(c,iob) is non-zero when the character c, just read from iob,
      * marks the end of the current message.  It is always zero while
      * msg_style is MS_DEFAULT.  Otherwise it is true when c equals the
      * first byte of msg_delim and m_Eom() confirms the rest of the
      * delimiter, or when an eom_action hook is installed and returns
      * non-zero for c.  Note that the macro evaluates c more than once.
      */
 145 #define Getc(iob)       getc(iob)
 146 #define eom(c,iob)      (msg_style != MS_DEFAULT && \
 147                          (((c) == *msg_delim && m_Eom(c,iob)) ||\
 148                           (eom_action && (*eom_action)(c))))
 149
 150 static unsigned char **pat_map; /* 256-entry table allocated by m_unknown():
                                      * indexed by byte value, each entry is NULL
                                      * or points at the last position of that
                                      * byte among the interior bytes of the full
                                      * delimiter (first and last byte excluded).
                                      * Used by the BODY case of m_getfld(). */
 151
 152 /*
 153  * defined in sbr/m_msgdef.c = 0
 154  * This is a disgusting hack for "inc" so it can know how many
 155  * characters were stuffed in the buffer on the last call
 156  * (see comments in uip/scansbr.c).
 157  */
 158 extern int msg_count;
 159
 160 /*
 161  * defined in sbr/m_msgdef.c = MS_DEFAULT
      * Values assigned in this file: MS_UNKNOWN, MS_MBOX and MS_MMDF
      * (by m_unknown()), MS_MSH and MS_MMDF (by m_eomsbr()).
 162  */
 163 extern int msg_style;
 164
 165 /*
 166  * The "full" delimiter string for a packed maildrop consists
 167  * of a newline followed by the actual delimiter.  E.g., the
 168  * full string for a Unix maildrop would be: "\n\nFrom ".
 169  * "fdelim" points to the start of the full string and is used
 170  * in the BODY case of the main routine to search the buffer for
 171  * a possible eom.  "msg_delim" points to the first character of
 172  * the actual delimiter string (i.e., fdelim+1).  "edelim"
 173  * points to the 2nd character of the actual delimiter string.  It
 174  * is used in m_Eom because the first character of the string
 175  * has been read and matched before m_Eom is called.
 176  */
 177 extern char *msg_delim;         /* defined in sbr/m_msgdef.c = "" */
 178 static unsigned char *fdelim;   /* full delimiter; the byte before it is NUL */
 179 static unsigned char *delimend; /* m_unknown() sets this to the last byte of
                                      * the delimiter (msg_delim + edelimlen) */
 180 static int fdelimlen;           /* length of the full delimiter string */
 181 static unsigned char *edelim;   /* msg_delim + 1 */
 182 static int edelimlen;           /* delimiter length minus its first byte */
 183
 184 static int (*eom_action)(int) = NULL; /* optional end-of-message hook,
                                            * installed by m_eomsbr() */
 185
 186 #ifdef _FSTDIO /* map the classic stdio member names used by m_getfld()
                     * onto the names this stdio implementation uses */
 187 # define _ptr    _p             /* current position in the stdio buffer */
 188 # define _cnt    _r             /* count used together with _ptr to describe the buffer */
 189 # define _filbuf __srget        /* routine that refills the stdio buffer */
 190 # define DEFINED__FILBUF_TO_SOMETHING_SPECIFIC
 191
 192 # if defined __CYGWIN__
 193   /* Cygwin's stdio.h does not declare __srget(). */
 194   int __srget(FILE *);
 195 # endif /* __CYGWIN__ */
 196 #endif
 197
 198 #ifndef DEFINED__FILBUF_TO_SOMETHING_SPECIFIC /* no mapping above: declare the classic refill routine */
 199 extern int  _filbuf(FILE*);
 200 #endif
 201
 202
 203 int
 204 m_getfld (int state, unsigned char *name, unsigned char *buf,
 205           int bufsz, FILE *iob)
 206 {
 207     register unsigned char  *bp, *cp, *ep, *sp;
 208     register int cnt, c, i, j;
 209
 210     if ((c = Getc(iob)) < 0) {
 211         msg_count = 0;
 212         *buf = 0;
 213         return FILEEOF;
 214     }
 215     if (eom (c, iob)) {
 216         if (! eom_action) {
 217             /* flush null messages */
 218             while ((c = Getc(iob)) >= 0 && eom (c, iob))
 219                 ;
 220             if (c >= 0)
 221                 ungetc(c, iob);
 222         }
 223         msg_count = 0;
 224         *buf = 0;
 225         return FILEEOF;
 226     }
 227
 228     switch (state) {
 229         case FLDEOF:
 230         case BODYEOF:
 231         case FLD:
 232             if (c == '\n' || c == '-') {
 233                 /* we hit the header/body separator */
 234                 while (c != '\n' && (c = Getc(iob)) >= 0)
 235                     ;
 236
 237                 if (c < 0 || (c = Getc(iob)) < 0 || eom (c, iob)) {
 238                     if (! eom_action) {
 239                         /* flush null messages */
 240                         while ((c = Getc(iob)) >= 0 && eom (c, iob))
 241                             ;
 242                         if (c >= 0)
 243                             ungetc(c, iob);
 244                     }
 245                     msg_count = 0;
 246                     *buf = 0;
 247                     return FILEEOF;
 248                 }
 249                 state = BODY;
 250                 goto body;
 251             }
 252             /*
 253              * get the name of this component.  take characters up
 254              * to a ':', a newline or NAMESZ-1 characters, whichever
 255              * comes first.
 256              */
 257             cp = name;
 258             i = NAMESZ - 1;
 259             for (;;) {
 260 #ifdef LINUX_STDIO
 261                 bp = sp = (unsigned char *) iob->_IO_read_ptr - 1;
 262                 j = (cnt = ((long) iob->_IO_read_end -
 263                         (long) iob->_IO_read_ptr)  + 1) < i ? cnt : i;
 264 #elif defined(__DragonFly__)
 265                 bp = sp = (unsigned char *) ((struct __FILE_public *)iob)->_p - 1;
 266                 j = (cnt = ((struct __FILE_public *)iob)->_r+1) < i ? cnt : i;
 267 #else
 268                 bp = sp = (unsigned char *) iob->_ptr - 1;
 269                 j = (cnt = iob->_cnt+1) < i ? cnt : i;
 270 #endif
 271                 while (--j >= 0 && (c = *bp++) != ':' && c != '\n')
 272                     *cp++ = c;
 273
 274                 j = bp - sp;
 275                 if ((cnt -= j) <= 0) {
 276 #ifdef LINUX_STDIO
 277                     iob->_IO_read_ptr = iob->_IO_read_end;
 278                     if (__underflow(iob) == EOF) {
 279 #elif defined(__DragonFly__)
 280                     if (__srget(iob) == EOF) {
 281 #else
 282                     if (_filbuf(iob) == EOF) {
 283 #endif
 284                         *cp = *buf = 0;
 285                         advise (NULL, "eof encountered in field \"%s\"", name);
 286                         return FMTERR;
 287                     }
 288 #ifdef LINUX_STDIO
 289                 iob->_IO_read_ptr++; /* NOT automatic in __underflow()! */
 290 #endif
 291                 } else {
 292 #ifdef LINUX_STDIO
 293                     iob->_IO_read_ptr = bp + 1;
 294 #elif defined(__DragonFly__)
 295                     ((struct __FILE_public *)iob)->_p = bp + 1;
 296                     ((struct __FILE_public *)iob)->_r = cnt - 1;
 297 #else
 298                     iob->_ptr = bp + 1;
 299                     iob->_cnt = cnt - 1;
 300 #endif
 301                 }
 302                 if (c == ':')
 303                     break;
 304
 305                 /*
 306                  * something went wrong.  possibilities are:
 307                  *  . hit a newline (error)
 308                  *  . got more than namesz chars. (error)
 309                  *  . hit the end of the buffer. (loop)
 310                  */
 311                 if (c == '\n') {
 312                     /* We hit the end of the line without seeing ':' to
 313                      * terminate the field name.  This is usually (always?)
 314                      * spam.  But, blowing up is lame, especially when
 315                      * scan(1)ing a folder with such messages.  Pretend such
 316                      * lines are the first of the body (at least mutt also
 317                      * handles it this way). */
 318
 319                     /* See if buf can hold this line, since we were assuming
 320                      * we had a buffer of NAMESZ, not bufsz. */
 321                     /* + 1 for the newline */
 322                     if (bufsz < j + 1) {
 323                         /* No, it can't.  Oh well, guess we'll blow up. */
 324                         *cp = *buf = 0;
 325                         advise (NULL, "eol encountered in field \"%s\"", name);
 326                         state = FMTERR;
 327                         goto finish;
 328                     }
 329                     memcpy (buf, name, j - 1);
 330                     buf[j - 1] = '\n';
 331                     buf[j] = '\0';
 332                     /* mhparse.c:get_content wants to find the position of the
 333                      * body start, but it thinks there's a blank line between
 334                      * the header and the body (naturally!), so seek back so
 335                      * that things line up even though we don't have that
 336                      * blank line in this case.  Simpler parsers (e.g. mhl)
 337                      * get extra newlines, but that should be harmless enough,
 338                      * right?  This is a corrupt message anyway. */
 339                     fseek (iob, ftell (iob) - 2, SEEK_SET);
 340                     return BODY;
 341                 }
 342                 if ((i -= j) <= 0) {
 343                     *cp = *buf = 0;
 344                     advise (NULL, "field name \"%s\" exceeds %d bytes", name, NAMESZ - 2);
 345                     state = LENERR;
 346                     goto finish;
 347                 }
 348             }
 349
 350             while (isspace (*--cp) && cp >= name)
 351                 ;
 352             *++cp = 0;
 353             /* fall through */
 354
 355         case FLDPLUS:
 356             /*
 357              * get (more of) the text of a field.  take
 358              * characters up to the end of this field (newline
 359              * followed by non-blank) or bufsz-1 characters.
 360              */
 361             cp = buf; i = bufsz-1;
 362             for (;;) {
 363 #ifdef LINUX_STDIO
 364                 cnt = (long) iob->_IO_read_end - (long) iob->_IO_read_ptr;
 365                 bp = (unsigned char *) --iob->_IO_read_ptr;
 366 #elif defined(__DragonFly__)
 367                 cnt = ((struct __FILE_public *)iob)->_r++;
 368                 bp = (unsigned char *) --((struct __FILE_public *)iob)->_p;
 369 #else
 370                 cnt = iob->_cnt++;
 371                 bp = (unsigned char *) --iob->_ptr;
 372 #endif
 373                 c = cnt < i ? cnt : i;
 374                 while ((ep = locc( c, bp, '\n' ))) {
 375                     /*
 376                      * if we hit the end of this field, return.
 377                      */
 378                     if ((j = *++ep) != ' ' && j != '\t') {
 379 #ifdef LINUX_STDIO
 380                         j = ep - (unsigned char *) iob->_IO_read_ptr;
 381                         memcpy (cp, iob->_IO_read_ptr, j);
 382                         iob->_IO_read_ptr = ep;
 383 #elif defined(__DragonFly__)
 384                         j = ep - (unsigned char *) ((struct __FILE_public *)iob)->_p;
 385                         memcpy (cp, ((struct __FILE_public *)iob)->_p, j);
 386                         ((struct __FILE_public *)iob)->_p = ep;
 387                         ((struct __FILE_public *)iob)->_r -= j;
 388 #else
 389                         j = ep - (unsigned char *) iob->_ptr;
 390                         memcpy (cp, iob->_ptr, j);
 391                         iob->_ptr = ep;
 392                         iob->_cnt -= j;
 393 #endif
 394                         cp += j;
 395                         state = FLD;
 396                         goto finish;
 397                     }
 398                     c -= ep - bp;
 399                     bp = ep;
 400                 }
 401                 /*
 402                  * end of input or dest buffer - copy what we've found.
 403                  */
 404 #ifdef LINUX_STDIO
 405                 c += bp - (unsigned char *) iob->_IO_read_ptr;
 406                 memcpy( cp, iob->_IO_read_ptr, c);
 407 #elif defined(__DragonFly__)
 408                 c += bp - (unsigned char *) ((struct __FILE_public *)iob)->_p;
 409                 memcpy( cp, ((struct __FILE_public *)iob)->_p, c);
 410 #else
 411                 c += bp - (unsigned char *) iob->_ptr;
 412                 memcpy( cp, iob->_ptr, c);
 413 #endif
 414                 i -= c;
 415                 cp += c;
 416                 if (i <= 0) {
 417                     /* the dest buffer is full */
 418 #ifdef LINUX_STDIO
 419                     iob->_IO_read_ptr += c;
 420 #elif defined(__DragonFly__)
 421                     ((struct __FILE_public *)iob)->_r -= c;
 422                     ((struct __FILE_public *)iob)->_p += c;
 423 #else
 424                     iob->_cnt -= c;
 425                     iob->_ptr += c;
 426 #endif
 427                     state = FLDPLUS;
 428                     break;
 429                 }
 430                 /*
 431                  * There's one character left in the input buffer.
 432                  * Copy it & fill the buffer.  If the last char
 433                  * was a newline and the next char is not whitespace,
 434                  * this is the end of the field.  Otherwise loop.
 435                  */
 436                 --i;
 437 #ifdef LINUX_STDIO
 438                 *cp++ = j = *(iob->_IO_read_ptr + c);
 439                 iob->_IO_read_ptr = iob->_IO_read_end;
 440                 c = __underflow(iob);
 441                 iob->_IO_read_ptr++;    /* NOT automatic! */
 442 #elif defined(__DragonFly__)
 443                 *cp++ =j = *(((struct __FILE_public *)iob)->_p + c);
 444                 c = __srget(iob);
 445 #else
 446                 *cp++ = j = *(iob->_ptr + c);
 447                 c = _filbuf(iob);
 448 #endif
 449                 if (c == EOF ||
 450                   ((j == '\0' || j == '\n') && c != ' ' && c != '\t')) {
 451                     if (c != EOF) {
 452 #ifdef LINUX_STDIO
 453                         --iob->_IO_read_ptr;
 454 #elif defined(__DragonFly__)
 455                         --((struct __FILE_public *)iob)->_p;
 456                         ++((struct __FILE_public *)iob)->_r;
 457 #else
 458                         --iob->_ptr;
 459                         ++iob->_cnt;
 460 #endif
 461                     }
 462                     state = FLD;
 463                     break;
 464                 }
 465             }
 466             break;
 467
 468         case BODY:
 469         body:
 470             /*
 471              * get the message body up to bufsz characters or the
 472              * end of the message.  Sleazy hack: if bufsz is negative
 473              * we assume that we were called to copy directly into
 474              * the output buffer and we don't add an eos.
 475              */
 476             i = (bufsz < 0) ? -bufsz : bufsz-1;
 477 #ifdef LINUX_STDIO
 478             bp = (unsigned char *) --iob->_IO_read_ptr;
 479             cnt = (long) iob->_IO_read_end - (long) iob->_IO_read_ptr;
 480 #elif defined(__DragonFly__)
 481             bp = (unsigned char *) --((struct __FILE_public *)iob)->_p;
 482             cnt = ++((struct __FILE_public *)iob)->_r;
 483 #else
 484             bp = (unsigned char *) --iob->_ptr;
 485             cnt = ++iob->_cnt;
 486 #endif
 487             c = (cnt < i ? cnt : i);
 488             if (msg_style != MS_DEFAULT && c > 1) {
 489                 /*
 490                  * packed maildrop - only take up to the (possible)
 491                  * start of the next message.  This "matchc" should
 492                  * probably be a Boyer-Moore matcher for non-vaxen,
 493                  * particularly since we have the alignment table
 494                  * all built for the end-of-buffer test (next).
 495                  * But our vax timings indicate that the "matchc"
 496                  * instruction is 50% faster than a carefully coded
 497                  * B.M. matcher for most strings.  (So much for elegant
 498                  * algorithms vs. brute force.)  Since I (currently)
 499                  * run MH on a vax, we use the matchc instruction. --vj
 500                  */
 501                 if ((ep = matchc( fdelimlen, fdelim, c, bp )))
 502                     c = ep - bp + 1;
 503                 else {
 504                     /*
 505                      * There's no delim in the buffer but there may be
 506                      * a partial one at the end.  If so, we want to leave
 507                      * it so the "eom" check on the next call picks it up.
 508                      * Use a modified Boyer-Moore matcher to make this
 509                      * check relatively cheap.  The first "if" figures
 510                      * out what position in the pattern matches the last
 511                      * character in the buffer.  The inner "while" matches
 512                      * the pattern against the buffer, backwards starting
 513                      * at that position.  Note that unless the buffer
 514                      * ends with one of the characters in the pattern
 515                      * (excluding the first and last), we do only one test.
 516                      */
 517                     ep = bp + c - 1;
 518                     if ((sp = pat_map[*ep])) {
 519                         do {
 520                             /* This if() is true unless (a) the buffer is too
 521                              * small to contain this delimiter prefix, or
 522                              * (b) it contains exactly enough chars for the
 523                              * delimiter prefix.
 524                              * For case (a) obviously we aren't going to match.
 525                              * For case (b), if the buffer really contained exactly
 526                              * a delim prefix, then the m_eom call at entry
 527                              * should have found it.  Thus it's not a delim
 528                              * and we know we won't get a match.
 529                              */
 530                             if (((sp - fdelim) + 2) <= c) {
 531                                 cp = sp;
 532                                 /* Unfortunately although fdelim has a preceding NUL
 533                                  * we can't use this as a sentinel in case the buffer
 534                                  * contains a NUL in exactly the wrong place (this
 535                                  * would cause us to run off the front of fdelim).
 536                                  */
 537                                 while (*--ep == *--cp)
 538                                     if (cp < fdelim)
 539                                         break;
 540                                 if (cp < fdelim) {
 541                                     /* we matched the entire delim prefix,
 542                                      * so only take the buffer up to there.
 543                                      * we know ep >= bp -- check above prevents underrun
 544                                      */
 545                                     c = (ep - bp) + 2;
 546                                     break;
 547                                 }
 548                             }
 549                             /* try matching one less char of delim string */
 550                             ep = bp + c - 1;
 551                         } while (--sp > fdelim);
 552                     }
 553                 }
 554             }
 555             memcpy( buf, bp, c );
 556 #ifdef LINUX_STDIO
 557             iob->_IO_read_ptr += c;
 558 #elif defined(__DragonFly__)
 559             ((struct __FILE_public *)iob)->_r -= c;
 560             ((struct __FILE_public *)iob)->_p += c;
 561 #else
 562             iob->_cnt -= c;
 563             iob->_ptr += c;
 564 #endif
 565             if (bufsz < 0) {
 566                 msg_count = c;
 567                 return (state);
 568             }
 569             cp = buf + c;
 570             break;
 571
 572         default:
 573             adios (NULL, "m_getfld() called with bogus state of %d", state);
 574     }
 575 finish:
 576     *cp = 0;
 577     msg_count = cp - buf;
 578     return (state);
 579 }
 580
 581
 582 void
 583 m_unknown(FILE *iob)
 584 {
 585     register int c;
 586     register long pos;
 587     char text[10];
 588     register char *cp;
 589     register char *delimstr;
 590
 591 /*
 592  * Figure out what the message delimitter string is for this
 593  * maildrop.  (This used to be part of m_Eom but I didn't like
 594  * the idea of an "if" statement that could only succeed on the
 595  * first call to m_Eom getting executed on each call, i.e., at
 596  * every newline in the message).
 597  *
 598  * If the first line of the maildrop is a Unix "From " line, we
 599  * say the style is MBOX and eat the rest of the line.  Otherwise
 600  * we say the style is MMDF and look for the delimiter string
 601  * specified when nmh was built (or from the mts.conf file).
 602  */
 603
 604     msg_style = MS_UNKNOWN;
 605
 606     pos = ftell (iob);
 607     if (fread (text, sizeof(*text), 5, iob) == 5
 608             && strncmp (text, "From ", 5) == 0) {
 609         msg_style = MS_MBOX;
 610         delimstr = "\nFrom ";
 611         while ((c = getc (iob)) != '\n' && c >= 0)
 612             ;
 613     } else {
 614         /* not a Unix style maildrop */
 615         fseek (iob, pos, SEEK_SET);
 616         if (mmdlm2 == NULL || *mmdlm2 == 0)
 617             mmdlm2 = "\001\001\001\001\n";
 618         delimstr = mmdlm2;
 619         msg_style = MS_MMDF;
 620     }
 621     c = strlen (delimstr);
 622     fdelim = (unsigned char *) mh_xmalloc((size_t) (c + 3));
 623     *fdelim++ = '\0';
 624     *fdelim = '\n';
 625     msg_delim = (char *)fdelim+1;
 626     edelim = (unsigned char *)msg_delim+1;
 627     fdelimlen = c + 1;
 628     edelimlen = c - 1;
 629     strcpy (msg_delim, delimstr);
 630     delimend = (unsigned char *)msg_delim + edelimlen;
 631     if (edelimlen <= 1)
 632         adios (NULL, "maildrop delimiter must be at least 2 bytes");
 633     /*
 634      * build a Boyer-Moore end-position map for the matcher in m_getfld.
 635      * N.B. - we don't match just the first char (since it's the newline
 636      * separator) or the last char (since the matchc would have found it
 637      * if it was a real delim).
 638      */
 639     pat_map = (unsigned char **) calloc (256, sizeof(unsigned char *));
 640
 641     for (cp = (char *) fdelim + 1; cp < (char *) delimend; cp++ )
 642         pat_map[(unsigned char)*cp] = (unsigned char *) cp;
 643
 644     if (msg_style == MS_MMDF) {
 645         /* flush extra msg hdrs */
 646         while ((c = Getc(iob)) >= 0 && eom (c, iob))
 647             ;
 648         if (c >= 0)
 649             ungetc(c, iob);
 650     }
 651 }
 652
 653
 654 void
 655 m_eomsbr (int (*action)(int))
 656 {
 657     if ((eom_action = action)) {
 658         msg_style = MS_MSH;
 659         *msg_delim = 0;
 660         fdelimlen = 1;
 661         delimend = fdelim;
 662     } else {
 663         msg_style = MS_MMDF;
 664         msg_delim = (char *)fdelim + 1;
 665         fdelimlen = strlen((char *)fdelim);
 666         delimend = (unsigned char *)(msg_delim + edelimlen);
 667     }
 668 }
 669
 670
 671 /*
 672  * test for msg delimiter string
 673  */
 674
 675 static int
 676 m_Eom (int c, FILE *iob)
 677 {
 678     register long pos = 0L;
 679     register int i;
 680     char text[10];
 681
 682     pos = ftell (iob);
 683     if ((i = fread (text, sizeof *text, edelimlen, iob)) != edelimlen
 684             || strncmp (text, (char *)edelim, edelimlen)) {
 685         if (i == 0 && msg_style == MS_MBOX)
 686             /* the final newline in the (brain damaged) unix-format
 687              * maildrop is part of the delimitter - delete it.
 688              */
 689             return 1;
 690
 691 #if 0
 692         fseek (iob, pos, SEEK_SET);
 693 #endif
 694
 695         fseek (iob, (long)(pos-1), SEEK_SET);
 696         getc (iob);             /* should be OK */
 697         return 0;
 698     }
 699
 700     if (msg_style == MS_MBOX) {
 701         while ((c = getc (iob)) != '\n')
 702             if (c < 0)
 703                 break;
 704     }
 705
 706     return 1;
 707 }
 708
 709
 710 static unsigned char *
 711 matchc(int patln, char *pat, int strln, char *str)
 712 {
 713         register char *es = str + strln - patln;
 714         register char *sp;
 715         register char *pp;
 716         register char *ep = pat + patln;
 717         register char pc = *pat++;
 718
 719         for(;;) {
 720                 while (pc != *str++)
 721                         if (str > es)
 722                                 return 0;
 723                 if (str > es+1)
 724                         return 0;
 725                 sp = str; pp = pat;
 726                 while (pp < ep && *sp++ == *pp)
 727                         pp++;
 728                 if (pp >= ep)
 729                         return ((unsigned char *)--str);
 730         }
 731 }
 732
 733
 734 /*
 735  * Locate character "term" in the next "cnt" characters of "src".
 736  * If found, return its address, otherwise return 0.
 737  */
 738
 739 static unsigned char *
 740 locc(int cnt, unsigned char *src, unsigned char term)
 741 {
 742     while (*src++ != term && --cnt > 0);
 743
 744     return (cnt > 0 ? --src : (unsigned char *)0);
 745 }
 746