git.marmaro.de Git - mmh/blob - sbr/m_getfld.c

   1
   2 /*
   3  * m_getfld.c -- read/parse a message
   4  *
   5  * This code is Copyright (c) 2002, by the authors of nmh.  See the
   6  * COPYRIGHT file in the root directory of the nmh distribution for
   7  * complete copyright information.
   8  */
   9
  10 #include <h/mh.h>
  11 #include <h/mts.h>
  12 #include <h/utils.h>
  13
  14 /* This module has a long and checkered history.  First, it didn't burst
  15    maildrops correctly because it considered two CTRL-A:s in a row to be
  16    an inter-message delimiter.  It really is four CTRL-A:s followed by a
  17    newline.  Unfortunately, MMDF will convert this delimiter *inside* a
  18    message to a CTRL-B followed by three CTRL-A:s and a newline.  This
  19    caused the old version of m_getfld() to declare eom prematurely.  The
  20    fix was a lot slower than
  21
  22                 c == '\001' && peekc (iob) == '\001'
  23
  24    but it worked, and to increase generality, MBOX style maildrops could
  25    be parsed as well.  Unfortunately the speed issue finally caught up with
  26    us since this routine is at the very heart of MH.
  27
  28    To speed things up considerably, the routine Eom() was made an auxiliary
  29    function called by the macro eom().  Unless we are bursting a maildrop,
  30    the eom() macro returns FALSE saying we aren't at the end of the
  31    message.
  32
  33    The next thing to do is to read the mts.conf file and initialize
  34    delimiter[] and delimlen accordingly...
  35
  36    After mhl was made a built-in in msh, m_getfld() worked just fine
  37    (using m_unknown() at startup).  Until one day: a message which was
  38    the result of a bursting was shown. Then, since the burst boundaries
  39    aren't CTRL-A:s, m_getfld() would blindly plunge on past the boundary.
  40    Very sad.  The solution: introduce m_eomsbr().  This hook gets called
  41    after the end of each line (since testing for eom involves an fseek()).
  42    This worked fine, until one day: a message with no body portion arrived.
  43    Then the
  44
  45                    while (eom (c = Getc (iob), iob))
  46                         continue;
  47
  48    loop caused m_getfld() to return FMTERR.  So, that logic was changed to
  49    check for (*eom_action) and act accordingly.
  50
  51    This worked fine, until one day: someone didn't use four CTRL-A:s as
  52    their delimiters.  So, the bullet got bit and we read mts.h and
  53    continue to struggle on.  It's not that bad though, since the only time
  54    the code gets executed is when inc (or msh) calls it, and both of these
  55    have already called mts_init().
  56
  57    ------------------------
  58    (Written by Van Jacobson for the mh6 m_getfld, January, 1986):
  59
  60    This routine was accounting for 60% of the cpu time used by most mh
  61    programs.  I spent a bit of time tuning and it now accounts for <10%
  62    of the time used.  Like any heavily tuned routine, it's a bit
  63    complex and you want to be sure you understand everything that it's
  64    doing before you start hacking on it.  Let me try to emphasize
  65    that:  every line in this atrocity depends on every other line,
  66    sometimes in subtle ways.  You should understand it all, in detail,
  67    before trying to change any part.  If you do change it, test the
  68    result thoroughly (I use a hand-constructed test file that exercises
  69    all the ways a header name, header body, header continuation,
  70    header-body separator, body line and body eom can align themselves
  71    with respect to a buffer boundary).  "Minor" bugs in this routine
  72    result in garbaged or lost mail.
  73
  74    If you hack on this and slow it down, I, my children and my
  75    children's children will curse you.
  76
  77    This routine gets used on three different types of files: normal,
  78    single msg files, "packed" unix or mmdf mailboxes (when used by inc)
  79    and packed, directoried bulletin board files (when used by msh).
  80    The biggest impact of different file types is in "eom" testing.  The
  81    code has been carefully organized to test for eom at appropriate
  82    times and at no other times (since the check is quite expensive).
  83    I have tried to arrange things so that the eom check need only be
  84    done on entry to this routine.  Since an eom can only occur after a
  85    newline, this is easy to manage for header fields.  For the msg
  86    body, we try to efficiently search the input buffer to see if
  87    it contains the eom delimiter.  If it does, we take up to the
  88    delimiter, otherwise we take everything in the buffer.  (The change
  89    to the body eom/copy processing produced the most noticeable
  90    performance difference, particularly for "inc" and "show".)
  91
  92    There are three qualitatively different things this routine busts
  93    out of a message: field names, field text and msg bodies.  Field
  94    names are typically short (~8 char) and the loop that extracts them
  95    might terminate on a colon, newline or max width.  I considered
  96    using a Vax "scanc" to locate the end of the field followed by a
  97    "bcopy" but the routine call overhead on a Vax is too large for this
  98    to work on short names.  If Berkeley ever makes "inline" part of the
  99    C optimiser (so things like "scanc" turn into inline instructions) a
 100    change here would be worthwhile.
 101
 102    Field text is typically 60 - 100 characters so there's (barely)
 103    a win in doing a routine call to something that does a "locc"
 104    followed by a "bmove".  About 30% of the fields have continuations
 105    (usually the 822 "received:" lines) and each continuation generates
 106    another routine call.  "Inline" would be a big win here, as well.
 107
 108    Messages, as of this writing, seem to come in two flavors: small
 109    (~1K) and long (>2K).  Most messages have 400 - 600 bytes of headers
 110    so message bodies average at least a few hundred characters.
 111    Assuming your system uses reasonably sized stdio buffers (1K or
 112    more), this routine should be able to remove the body in large
 113    (>500 byte) chunks.  This makes the cost of a call to "bcopy"
 114    small but there is a premium on checking for the eom in packed
 115    maildrops.  The eom pattern is always a simple string so we can
 116    construct an efficient pattern matcher for it (e.g., a Vax "matchc"
 117    instruction).  Some thought went into recognizing the start of
 118    an eom that has been split across two buffers.
 119
 120    This routine wants to deal with large chunks of data so, rather
 121    than "getc" into a local buffer, it uses stdio's buffer.  If
 122    you try to use it on a non-buffered file, you'll get what you
 123    deserve.  This routine "knows" that struct FILEs have a _ptr
 124    and a _cnt to describe the current state of the buffer and
 125    it knows that _filbuf ignores the _ptr & _cnt and simply fills
 126    the buffer.  If stdio on your system doesn't work this way, you
 127    may have to make small changes in this routine.
 128
 129    This routine also "knows" that an EOF indication on a stream is
 130    "sticky" (i.e., you will keep getting EOF until you reposition the
 131    stream).  If your system doesn't work this way it is broken and you
 132    should complain to the vendor.  As a consequence of the sticky
 133    EOF, this routine will never return any kind of EOF status when
 134    there is data in "name" or "buf".
 135   */
 136
 137 /*
 138 Purpose
 139 =======
 140 Reads an Internet message (RFC 5322), or one or more messages stored in a
 141 maildrop in mbox (RFC 4155) or MMDF format, from a file stream.  Each call
 142 to m_getfld() reads one header field, or a portion of the body, in sequence.
 143
 144 Inputs
 145 ======
 146 state:  message parse state
 147 bufsz:  maximum number of characters to load into buf
 148 iob:  input file stream
 149
 150 Outputs
 151 =======
 152 name:  header field name (array of size NAMESZ=999)
 153 buf:  either a header field body or message body
 154 (return value):  message parse state on return from function
 155 (global) int msg_count:  number of characters loaded into buf
 156
 157 Functions (part of Inputs, really)
 158 =========
 159 void m_unknown(FILE *iob):  Determines the message delimiter string for the
 160   maildrop.  Called by inc, scan, and msh when reading from a maildrop file.
 161
 162 void m_eomsbr (int (*action)(int)):  Sets the hook to check for end of
 163   message in a maildrop.  Called only by msh.
 164
 165 Those functions save state in the State variables listed below.
 166
 167 Definitions
 168 ===========
 169 state is one of:
 170   FLD      // Field returned
 171   FLDPLUS  // Field returned with more to come
 172   FLDEOF   // Field returned ending at eom
 173   BODY     // Body  returned with more to come
 174   BODYEOF  // Body  returned ending at eom
 175   FILEEOF  // Reached end of input file
 176   FMTERR   // Message Format error
 177   LENERR   // Name too long error from getfld
 178
 179 msg_style is maildrop style, one of:
 180   MS_UNKNOWN // type not known yet
 181   MS_DEFAULT // default (one msg per file)
 182   MS_MBOX    // Unix-style "from" lines
 183   MS_MMDF    // string mmdlm2
 184   MS_MSH     // whacko msh
 185
 186 State variables (part of Outputs)
 187 ===============
 188 m_getfld() retains state internally between calls in some state variables.
 189
 190 These two variables are global, but only used internally by m_getfld.c:
 191 int msg_style
 192 char *msg_delim
 193
 194 These are used for the end-of-message matcher when reading maildrops:
 195 static unsigned char **pat_map
 196 static unsigned char *fdelim
 197 static unsigned char *delimend
 198 static int fdelimlen
 199 static unsigned char *edelim
 200 static int edelimlen
 201
 202 Restriction
 203 ===========
 204 m_getfld() is restricted to operate on one file stream at a time because of
 205 the retained state (see "State variables" above).
 206
 207 Current usage
 208 =============
 209 The first call to m_getfld() on a file stream is with a state of FLD.
 210 Subsequent calls provide the state returned by the previous call.
 211
 212 Along the way, I thought of these possible interface changes that we
 213 might want to consider before rototilling the internals:
 214
 215 1) To improve interface documentation:
 216    Change type of name argument from unsigned char * to unsigned char[NAMESZ].
 217    This would also be a step toward allowing the compiler to check for array
 218    size consistency.
 219
 220 2) To remove globals that don't need to be:
 221    Change msg_style and msg_delim to be file static.
 222
 223 3) To remove a global:
 224    Change bufsz to be in-out instead of in, and therefore int * instead of
 225    int, and use that instead of global msg_count.  There are only 3 call
 226    sites that use msg_count so it wouldn't take much effort to remove use of
 227    it.  Of course, all call sites would have to change to provide an int *
 228    instead of an int.  Some now pass constants.
 229
 230 4) To remove the state argument from the signature:
 231    Given the Current usage and Restriction above, the state variable could
 232    be removed from the signature and just retained internally.
 233
 234 5) To remove the Restriction above:
 235    One approach would be for m_getfld() to retain multiple copies of that
 236    state, one per iob that it sees.  Another approach would be for the
 237    caller to store it in an opaque struct, the address of which is passed
 238    through the interface.
 239 */
 240
/*
 * static prototypes
 *
 * m_Eom() is only reached through the eom() macro below, after the
 * first character of the delimiter has already been matched.
 * matchc() is used by the BODY case of m_getfld() to look for the full
 * delimiter (fdelim) inside the stdio buffer; locc() is used by the
 * FLDPLUS case to find the next newline.  Both results are tested for
 * non-NULL by their callers.
 */
static int m_Eom (int, FILE *);
static unsigned char *matchc(int, char *, int, char *);
static unsigned char *locc(int, unsigned char *, unsigned char);

/* Read one character from the stream (plain getc()). */
#define Getc(iob)       getc(iob)
/*
 * True when character c, just read from iob, marks the end of a message.
 * Always false for MS_DEFAULT (one message per file).  Otherwise true if
 * c equals the first delimiter character and m_Eom() confirms the rest,
 * or if an eom_action hook is installed and returns non-zero for c.
 * NOTE: c may be evaluated up to three times, so pass a plain variable,
 * never an expression with side effects.
 */
#define eom(c,iob)      (msg_style != MS_DEFAULT && \
                         (((c) == *msg_delim && m_Eom(c,iob)) ||\
                          (eom_action && (*eom_action)(c))))
 252
/*
 * End-position map for the partial-delimiter check in the BODY case of
 * m_getfld().  Built by m_unknown() as a 256-entry table indexed by byte
 * value: an entry points at a position inside the delimiter string where
 * that byte occurs, or is NULL if the byte is not one of the mapped
 * delimiter characters.
 */
static unsigned char **pat_map;

/*
 * defined in sbr/m_msgdef.c = 0
 * This is a disgusting hack for "inc" so it can know how many
 * characters were stuffed in the buffer on the last call
 * (see comments in uip/scansbr.c).
 */
extern int msg_count;

/*
 * defined in sbr/m_msgdef.c = MS_DEFAULT
 */
extern int msg_style;

/*
 * The "full" delimiter string for a packed maildrop consists
 * of a newline followed by the actual delimiter.  E.g., the
 * full string for a Unix maildrop would be: "\n\nFrom ".
 * "Fdelim" points to the start of the full string and is used
 * in the BODY case of the main routine to search the buffer for
 * a possible eom.  Msg_delim points to the first character of
 * the actual delim. string (i.e., fdelim+1).  Edelim
 * points to the 2nd character of actual delimiter string.  It
 * is used in m_Eom because the first character of the string
 * has been read and matched before m_Eom is called.
 */
extern char *msg_delim;         /* defined in sbr/m_msgdef.c = "" */
static unsigned char *fdelim;
static unsigned char *delimend;
static int fdelimlen;
static unsigned char *edelim;
static int edelimlen;

/*
 * Optional end-of-message hook, installed through m_eomsbr().  NULL means
 * no hook; when set, the eom() macro calls it with the character just read.
 */
static int (*eom_action)(int) = NULL;
 288
/*
 * m_getfld() reaches into struct FILE through the names _ptr, _cnt and
 * _filbuf.  When _FSTDIO is defined those go by the names _p, _r and
 * __srget instead, so map the former onto the latter.
 */
#ifdef _FSTDIO
# define _ptr    _p             /* Gag   */
# define _cnt    _r             /* Retch */
# define _filbuf __srget        /* Puke  */
# define DEFINED__FILBUF_TO_SOMETHING_SPECIFIC

# if defined __CYGWIN__
  /* Cygwin's stdio.h does not declare __srget(). */
  int __srget(FILE *);
# endif /* __CYGWIN__ */
#endif

/*
 * No mapping was set up above, so declare the buffer-refill routine
 * ourselves.
 */
#ifndef DEFINED__FILBUF_TO_SOMETHING_SPECIFIC
extern int  _filbuf(FILE*);
#endif
 304
 305
 306 int
 307 m_getfld (int state, unsigned char *name, unsigned char *buf,
 308           int bufsz, FILE *iob)
 309 {
 310     register unsigned char  *bp, *cp, *ep, *sp;
 311     register int cnt, c, i, j;
 312
 313     if ((c = Getc(iob)) < 0) {
 314         msg_count = 0;
 315         *buf = 0;
 316         return FILEEOF;
 317     }
 318     if (eom (c, iob)) {
 319         if (! eom_action) {
 320             /* flush null messages */
 321             while ((c = Getc(iob)) >= 0 && eom (c, iob))
 322                 ;
 323             if (c >= 0)
 324                 ungetc(c, iob);
 325         }
 326         msg_count = 0;
 327         *buf = 0;
 328         return FILEEOF;
 329     }
 330
 331     switch (state) {
 332         case FLDEOF:
 333         case BODYEOF:
 334         case FLD:
 335             if (c == '\n' || c == '-') {
 336                 /* we hit the header/body separator */
 337                 while (c != '\n' && (c = Getc(iob)) >= 0)
 338                     ;
 339
 340                 if (c < 0 || (c = Getc(iob)) < 0 || eom (c, iob)) {
 341                     if (! eom_action) {
 342                         /* flush null messages */
 343                         while ((c = Getc(iob)) >= 0 && eom (c, iob))
 344                             ;
 345                         if (c >= 0)
 346                             ungetc(c, iob);
 347                     }
 348                     msg_count = 0;
 349                     *buf = 0;
 350                     return FILEEOF;
 351                 }
 352                 state = BODY;
 353                 goto body;
 354             }
 355             /*
 356              * get the name of this component.  take characters up
 357              * to a ':', a newline or NAMESZ-1 characters, whichever
 358              * comes first.
 359              */
 360             cp = name;
 361             i = NAMESZ - 1;
 362             for (;;) {
 363 #ifdef LINUX_STDIO
 364                 bp = sp = (unsigned char *) iob->_IO_read_ptr - 1;
 365                 j = (cnt = ((long) iob->_IO_read_end -
 366                         (long) iob->_IO_read_ptr)  + 1) < i ? cnt : i;
 367 #elif defined(__DragonFly__)
 368                 bp = sp = (unsigned char *) ((struct __FILE_public *)iob)->_p - 1;
 369                 j = (cnt = ((struct __FILE_public *)iob)->_r+1) < i ? cnt : i;
 370 #else
 371                 bp = sp = (unsigned char *) iob->_ptr - 1;
 372                 j = (cnt = iob->_cnt+1) < i ? cnt : i;
 373 #endif
 374                 while (--j >= 0 && (c = *bp++) != ':' && c != '\n')
 375                     *cp++ = c;
 376
 377                 j = bp - sp;
 378                 if ((cnt -= j) <= 0) {
 379 #ifdef LINUX_STDIO
 380                     iob->_IO_read_ptr = iob->_IO_read_end;
 381                     if (__underflow(iob) == EOF) {
 382 #elif defined(__DragonFly__)
 383                     if (__srget(iob) == EOF) {
 384 #else
 385                     if (_filbuf(iob) == EOF) {
 386 #endif
 387                         *cp = *buf = 0;
 388                         advise (NULL, "eof encountered in field \"%s\"", name);
 389                         return FMTERR;
 390                     }
 391 #ifdef LINUX_STDIO
 392                 iob->_IO_read_ptr++; /* NOT automatic in __underflow()! */
 393 #endif
 394                 } else {
 395 #ifdef LINUX_STDIO
 396                     iob->_IO_read_ptr = bp + 1;
 397 #elif defined(__DragonFly__)
 398                     ((struct __FILE_public *)iob)->_p = bp + 1;
 399                     ((struct __FILE_public *)iob)->_r = cnt - 1;
 400 #else
 401                     iob->_ptr = bp + 1;
 402                     iob->_cnt = cnt - 1;
 403 #endif
 404                 }
 405                 if (c == ':')
 406                     break;
 407
 408                 /*
 409                  * something went wrong.  possibilities are:
 410                  *  . hit a newline (error)
 411                  *  . got more than namesz chars. (error)
 412                  *  . hit the end of the buffer. (loop)
 413                  */
 414                 if (c == '\n') {
 415                     /* We hit the end of the line without seeing ':' to
 416                      * terminate the field name.  This is usually (always?)
 417                      * spam.  But, blowing up is lame, especially when
 418                      * scan(1)ing a folder with such messages.  Pretend such
 419                      * lines are the first of the body (at least mutt also
 420                      * handles it this way). */
 421
 422                     /* See if buf can hold this line, since we were assuming
 423                      * we had a buffer of NAMESZ, not bufsz. */
 424                     /* + 1 for the newline */
 425                     if (bufsz < j + 1) {
 426                         /* No, it can't.  Oh well, guess we'll blow up. */
 427                         *cp = *buf = 0;
 428                         advise (NULL, "eol encountered in field \"%s\"", name);
 429                         state = FMTERR;
 430                         goto finish;
 431                     }
 432                     memcpy (buf, name, j - 1);
 433                     buf[j - 1] = '\n';
 434                     buf[j] = '\0';
 435                     /* mhparse.c:get_content wants to find the position of the
 436                      * body start, but it thinks there's a blank line between
 437                      * the header and the body (naturally!), so seek back so
 438                      * that things line up even though we don't have that
 439                      * blank line in this case.  Simpler parsers (e.g. mhl)
 440                      * get extra newlines, but that should be harmless enough,
 441                      * right?  This is a corrupt message anyway. */
 442                     fseek (iob, ftell (iob) - 2, SEEK_SET);
 443                     return BODY;
 444                 }
 445                 if ((i -= j) <= 0) {
 446                     *cp = *buf = 0;
 447                     advise (NULL, "field name \"%s\" exceeds %d bytes", name, NAMESZ - 2);
 448                     state = LENERR;
 449                     goto finish;
 450                 }
 451             }
 452
 453             while (isspace (*--cp) && cp >= name)
 454                 ;
 455             *++cp = 0;
 456             /* fall through */
 457
 458         case FLDPLUS:
 459             /*
 460              * get (more of) the text of a field.  take
 461              * characters up to the end of this field (newline
 462              * followed by non-blank) or bufsz-1 characters.
 463              */
 464             cp = buf; i = bufsz-1;
 465             for (;;) {
 466 #ifdef LINUX_STDIO
 467                 cnt = (long) iob->_IO_read_end - (long) iob->_IO_read_ptr;
 468                 bp = (unsigned char *) --iob->_IO_read_ptr;
 469 #elif defined(__DragonFly__)
 470                 cnt = ((struct __FILE_public *)iob)->_r++;
 471                 bp = (unsigned char *) --((struct __FILE_public *)iob)->_p;
 472 #else
 473                 cnt = iob->_cnt++;
 474                 bp = (unsigned char *) --iob->_ptr;
 475 #endif
 476                 c = cnt < i ? cnt : i;
 477                 while ((ep = locc( c, bp, '\n' ))) {
 478                     /*
 479                      * if we hit the end of this field, return.
 480                      */
 481                     if ((j = *++ep) != ' ' && j != '\t') {
 482 #ifdef LINUX_STDIO
 483                         j = ep - (unsigned char *) iob->_IO_read_ptr;
 484                         memcpy (cp, iob->_IO_read_ptr, j);
 485                         iob->_IO_read_ptr = ep;
 486 #elif defined(__DragonFly__)
 487                         j = ep - (unsigned char *) ((struct __FILE_public *)iob)->_p;
 488                         memcpy (cp, ((struct __FILE_public *)iob)->_p, j);
 489                         ((struct __FILE_public *)iob)->_p = ep;
 490                         ((struct __FILE_public *)iob)->_r -= j;
 491 #else
 492                         j = ep - (unsigned char *) iob->_ptr;
 493                         memcpy (cp, iob->_ptr, j);
 494                         iob->_ptr = ep;
 495                         iob->_cnt -= j;
 496 #endif
 497                         cp += j;
 498                         state = FLD;
 499                         goto finish;
 500                     }
 501                     c -= ep - bp;
 502                     bp = ep;
 503                 }
 504                 /*
 505                  * end of input or dest buffer - copy what we've found.
 506                  */
 507 #ifdef LINUX_STDIO
 508                 c += bp - (unsigned char *) iob->_IO_read_ptr;
 509                 memcpy( cp, iob->_IO_read_ptr, c);
 510 #elif defined(__DragonFly__)
 511                 c += bp - (unsigned char *) ((struct __FILE_public *)iob)->_p;
 512                 memcpy( cp, ((struct __FILE_public *)iob)->_p, c);
 513 #else
 514                 c += bp - (unsigned char *) iob->_ptr;
 515                 memcpy( cp, iob->_ptr, c);
 516 #endif
 517                 i -= c;
 518                 cp += c;
 519                 if (i <= 0) {
 520                     /* the dest buffer is full */
 521 #ifdef LINUX_STDIO
 522                     iob->_IO_read_ptr += c;
 523 #elif defined(__DragonFly__)
 524                     ((struct __FILE_public *)iob)->_r -= c;
 525                     ((struct __FILE_public *)iob)->_p += c;
 526 #else
 527                     iob->_cnt -= c;
 528                     iob->_ptr += c;
 529 #endif
 530                     state = FLDPLUS;
 531                     break;
 532                 }
 533                 /*
 534                  * There's one character left in the input buffer.
 535                  * Copy it & fill the buffer.  If the last char
 536                  * was a newline and the next char is not whitespace,
 537                  * this is the end of the field.  Otherwise loop.
 538                  */
 539                 --i;
 540 #ifdef LINUX_STDIO
 541                 *cp++ = j = *(iob->_IO_read_ptr + c);
 542                 iob->_IO_read_ptr = iob->_IO_read_end;
 543                 c = __underflow(iob);
 544                 iob->_IO_read_ptr++;    /* NOT automatic! */
 545 #elif defined(__DragonFly__)
 546                 *cp++ =j = *(((struct __FILE_public *)iob)->_p + c);
 547                 c = __srget(iob);
 548 #else
 549                 *cp++ = j = *(iob->_ptr + c);
 550                 c = _filbuf(iob);
 551 #endif
 552                 if (c == EOF ||
 553                   ((j == '\0' || j == '\n') && c != ' ' && c != '\t')) {
 554                     if (c != EOF) {
 555 #ifdef LINUX_STDIO
 556                         --iob->_IO_read_ptr;
 557 #elif defined(__DragonFly__)
 558                         --((struct __FILE_public *)iob)->_p;
 559                         ++((struct __FILE_public *)iob)->_r;
 560 #else
 561                         --iob->_ptr;
 562                         ++iob->_cnt;
 563 #endif
 564                     }
 565                     state = FLD;
 566                     break;
 567                 }
 568             }
 569             break;
 570
 571         case BODY:
 572         body:
 573             /*
 574              * get the message body up to bufsz characters or the
 575              * end of the message.  Sleazy hack: if bufsz is negative
 576              * we assume that we were called to copy directly into
 577              * the output buffer and we don't add an eos.
 578              */
 579             i = (bufsz < 0) ? -bufsz : bufsz-1;
 580 #ifdef LINUX_STDIO
 581             bp = (unsigned char *) --iob->_IO_read_ptr;
 582             cnt = (long) iob->_IO_read_end - (long) iob->_IO_read_ptr;
 583 #elif defined(__DragonFly__)
 584             bp = (unsigned char *) --((struct __FILE_public *)iob)->_p;
 585             cnt = ++((struct __FILE_public *)iob)->_r;
 586 #else
 587             bp = (unsigned char *) --iob->_ptr;
 588             cnt = ++iob->_cnt;
 589 #endif
 590             c = (cnt < i ? cnt : i);
 591             if (msg_style != MS_DEFAULT && c > 1) {
 592                 /*
 593                  * packed maildrop - only take up to the (possible)
 594                  * start of the next message.  This "matchc" should
 595                  * probably be a Boyer-Moore matcher for non-vaxen,
 596                  * particularly since we have the alignment table
 597                  * all built for the end-of-buffer test (next).
 598                  * But our vax timings indicate that the "matchc"
 599                  * instruction is 50% faster than a carefully coded
 600                  * B.M. matcher for most strings.  (So much for elegant
 601                  * algorithms vs. brute force.)  Since I (currently)
 602                  * run MH on a vax, we use the matchc instruction. --vj
 603                  */
 604                 if ((ep = matchc( fdelimlen, fdelim, c, bp )))
 605                     c = ep - bp + 1;
 606                 else {
 607                     /*
 608                      * There's no delim in the buffer but there may be
 609                      * a partial one at the end.  If so, we want to leave
 610                      * it so the "eom" check on the next call picks it up.
 611                      * Use a modified Boyer-Moore matcher to make this
 612                      * check relatively cheap.  The first "if" figures
 613                      * out what position in the pattern matches the last
 614                      * character in the buffer.  The inner "while" matches
 615                      * the pattern against the buffer, backwards starting
 616                      * at that position.  Note that unless the buffer
 617                      * ends with one of the characters in the pattern
 618                      * (excluding the first and last), we do only one test.
 619                      */
 620                     ep = bp + c - 1;
 621                     if ((sp = pat_map[*ep])) {
 622                         do {
 623                             /* This if() is true unless (a) the buffer is too
 624                              * small to contain this delimiter prefix, or
 625                              * (b) it contains exactly enough chars for the
 626                              * delimiter prefix.
 627                              * For case (a) obviously we aren't going to match.
 628                              * For case (b), if the buffer really contained exactly
 629                              * a delim prefix, then the m_eom call at entry
 630                              * should have found it.  Thus it's not a delim
 631                              * and we know we won't get a match.
 632                              */
 633                             if (((sp - fdelim) + 2) <= c) {
 634                                 cp = sp;
 635                                 /* Unfortunately although fdelim has a preceding NUL
 636                                  * we can't use this as a sentinel in case the buffer
 637                                  * contains a NUL in exactly the wrong place (this
 638                                  * would cause us to run off the front of fdelim).
 639                                  */
 640                                 while (*--ep == *--cp)
 641                                     if (cp < fdelim)
 642                                         break;
 643                                 if (cp < fdelim) {
 644                                     /* we matched the entire delim prefix,
 645                                      * so only take the buffer up to there.
 646                                      * we know ep >= bp -- check above prevents underrun
 647                                      */
 648                                     c = (ep - bp) + 2;
 649                                     break;
 650                                 }
 651                             }
 652                             /* try matching one less char of delim string */
 653                             ep = bp + c - 1;
 654                         } while (--sp > fdelim);
 655                     }
 656                 }
 657             }
 658             memcpy( buf, bp, c );
 659 #ifdef LINUX_STDIO
 660             iob->_IO_read_ptr += c;
 661 #elif defined(__DragonFly__)
 662             ((struct __FILE_public *)iob)->_r -= c;
 663             ((struct __FILE_public *)iob)->_p += c;
 664 #else
 665             iob->_cnt -= c;
 666             iob->_ptr += c;
 667 #endif
 668             if (bufsz < 0) {
 669                 msg_count = c;
 670                 return (state);
 671             }
 672             cp = buf + c;
 673             break;
 674
 675         default:
 676             adios (NULL, "m_getfld() called with bogus state of %d", state);
 677     }
 678 finish:
 679     *cp = 0;
 680     msg_count = cp - buf;
 681     return (state);
 682 }
 683
 684
 685 void
 686 m_unknown(FILE *iob)
 687 {
 688     register int c;
 689     register long pos;
 690     char text[10];
 691     register char *cp;
 692     register char *delimstr;
 693
 694 /*
 695  * Figure out what the message delimitter string is for this
 696  * maildrop.  (This used to be part of m_Eom but I didn't like
 697  * the idea of an "if" statement that could only succeed on the
 698  * first call to m_Eom getting executed on each call, i.e., at
 699  * every newline in the message).
 700  *
 701  * If the first line of the maildrop is a Unix "From " line, we
 702  * say the style is MBOX and eat the rest of the line.  Otherwise
 703  * we say the style is MMDF and look for the delimiter string
 704  * specified when nmh was built (or from the mts.conf file).
 705  */
 706
 707     msg_style = MS_UNKNOWN;
 708
 709     pos = ftell (iob);
 710     if (fread (text, sizeof(*text), 5, iob) == 5
 711             && strncmp (text, "From ", 5) == 0) {
 712         msg_style = MS_MBOX;
 713         delimstr = "\nFrom ";
 714         while ((c = getc (iob)) != '\n' && c >= 0)
 715             ;
 716     } else {
 717         /* not a Unix style maildrop */
 718         fseek (iob, pos, SEEK_SET);
 719         if (mmdlm2 == NULL || *mmdlm2 == 0)
 720             mmdlm2 = "\001\001\001\001\n";
 721         delimstr = mmdlm2;
 722         msg_style = MS_MMDF;
 723     }
 724     c = strlen (delimstr);
 725     fdelim = (unsigned char *) mh_xmalloc((size_t) (c + 3));
 726     *fdelim++ = '\0';
 727     *fdelim = '\n';
 728     msg_delim = (char *)fdelim+1;
 729     edelim = (unsigned char *)msg_delim+1;
 730     fdelimlen = c + 1;
 731     edelimlen = c - 1;
 732     strcpy (msg_delim, delimstr);
 733     delimend = (unsigned char *)msg_delim + edelimlen;
 734     if (edelimlen <= 1)
 735         adios (NULL, "maildrop delimiter must be at least 2 bytes");
 736     /*
 737      * build a Boyer-Moore end-position map for the matcher in m_getfld.
 738      * N.B. - we don't match just the first char (since it's the newline
 739      * separator) or the last char (since the matchc would have found it
 740      * if it was a real delim).
 741      */
 742     pat_map = (unsigned char **) calloc (256, sizeof(unsigned char *));
 743
 744     for (cp = (char *) fdelim + 1; cp < (char *) delimend; cp++ )
 745         pat_map[(unsigned char)*cp] = (unsigned char *) cp;
 746
 747     if (msg_style == MS_MMDF) {
 748         /* flush extra msg hdrs */
 749         while ((c = Getc(iob)) >= 0 && eom (c, iob))
 750             ;
 751         if (c >= 0)
 752             ungetc(c, iob);
 753     }
 754 }
 755
 756
 757 void
 758 m_eomsbr (int (*action)(int))
 759 {
 760     if ((eom_action = action)) {
 761         msg_style = MS_MSH;
 762         *msg_delim = 0;
 763         fdelimlen = 1;
 764         delimend = fdelim;
 765     } else {
 766         msg_style = MS_MMDF;
 767         msg_delim = (char *)fdelim + 1;
 768         fdelimlen = strlen((char *)fdelim);
 769         delimend = (unsigned char *)(msg_delim + edelimlen);
 770     }
 771 }
 772
 773
 774 /*
 775  * test for msg delimiter string
 776  */
 777
 778 static int
 779 m_Eom (int c, FILE *iob)
 780 {
 781     register long pos = 0L;
 782     register int i;
 783     char text[10];
 784
 785     pos = ftell (iob);
 786     if ((i = fread (text, sizeof *text, edelimlen, iob)) != edelimlen
 787             || strncmp (text, (char *)edelim, edelimlen)) {
 788         if (i == 0 && msg_style == MS_MBOX)
 789             /* the final newline in the (brain damaged) unix-format
 790              * maildrop is part of the delimitter - delete it.
 791              */
 792             return 1;
 793
 794 #if 0
 795         fseek (iob, pos, SEEK_SET);
 796 #endif
 797
 798         fseek (iob, (long)(pos-1), SEEK_SET);
 799         getc (iob);             /* should be OK */
 800         return 0;
 801     }
 802
 803     if (msg_style == MS_MBOX) {
 804         while ((c = getc (iob)) != '\n')
 805             if (c < 0)
 806                 break;
 807     }
 808
 809     return 1;
 810 }
 811
 812
 813 static unsigned char *
 814 matchc(int patln, char *pat, int strln, char *str)
 815 {
 816         register char *es = str + strln - patln;
 817         register char *sp;
 818         register char *pp;
 819         register char *ep = pat + patln;
 820         register char pc = *pat++;
 821
 822         for(;;) {
 823                 while (pc != *str++)
 824                         if (str > es)
 825                                 return 0;
 826                 if (str > es+1)
 827                         return 0;
 828                 sp = str; pp = pat;
 829                 while (pp < ep && *sp++ == *pp)
 830                         pp++;
 831                 if (pp >= ep)
 832                         return ((unsigned char *)--str);
 833         }
 834 }
 835
 836
 837 /*
 838  * Locate character "term" in the next "cnt" characters of "src".
 839  * If found, return its address, otherwise return 0.
 840  */
 841
 842 static unsigned char *
 843 locc(int cnt, unsigned char *src, unsigned char term)
 844 {
 845     while (*src++ != term && --cnt > 0);
 846
 847     return (cnt > 0 ? --src : (unsigned char *)0);
 848 }
 849