3 * m_getfld.c -- read/parse a message
5 * This code is Copyright (c) 2002, by the authors of nmh. See the
6 * COPYRIGHT file in the root directory of the nmh distribution for
7 * complete copyright information.
14 /* This module has a long and checkered history. First, it didn't burst
15 maildrops correctly because it considered two CTRL-A:s in a row to be
16 an inter-message delimiter. It really is four CTRL-A:s followed by a
17 newline. Unfortunately, MMDF will convert this delimiter *inside* a
18 message to a CTRL-B followed by three CTRL-A:s and a newline. This
19 caused the old version of m_getfld() to declare eom prematurely. The
20 fix was a lot slower than
22 c == '\001' && peekc (iob) == '\001'
24 but it worked, and to increase generality, MBOX style maildrops could
25 be parsed as well. Unfortunately the speed issue finally caught up with
26 us since this routine is at the very heart of MH.
28 To speed things up considerably, the routine Eom() was made an auxiliary
29 function called by the macro eom(). Unless we are bursting a maildrop,
30 the eom() macro returns FALSE saying we aren't at the end of the message.
33 The next thing to do is to read the mts.conf file and initialize
34 delimiter[] and delimlen accordingly...
36 After mhl was made a built-in in msh, m_getfld() worked just fine
37 (using m_unknown() at startup). Until one day: a message which was
38 the result of a bursting was shown. Then, since the burst boundaries
39 aren't CTRL-A:s, m_getfld() would blindly plunge on past the boundary.
40 Very sad. The solution: introduce m_eomsbr(). This hook gets called
41 after the end of each line (since testing for eom involves an fseek()).
42 This worked fine, until one day: a message with no body portion arrived.
45 while (eom (c = Getc (iob), iob))
48 loop caused m_getfld() to return FMTERR. So, that logic was changed to
49 check for (*eom_action) and act accordingly.
51 This worked fine, until one day: someone didn't use four CTRL-A:s as
52 their delimiters. So, the bullet got bit and we read mts.h and
53 continue to struggle on. It's not that bad though, since the only time
54 the code gets executed is when inc (or msh) calls it, and both of these
55 have already called mts_init().
57 ------------------------
58 (Written by Van Jacobson for the mh6 m_getfld, January, 1986):
60 This routine was accounting for 60% of the cpu time used by most mh
61 programs. I spent a bit of time tuning and it now accounts for <10%
62 of the time used. Like any heavily tuned routine, it's a bit
63 complex and you want to be sure you understand everything that it's
64 doing before you start hacking on it. Let me try to emphasize
65 that: every line in this atrocity depends on every other line,
66 sometimes in subtle ways. You should understand it all, in detail,
67 before trying to change any part. If you do change it, test the
68 result thoroughly (I use a hand-constructed test file that exercises
69 all the ways a header name, header body, header continuation,
70 header-body separator, body line and body eom can align themselves
71 with respect to a buffer boundary). "Minor" bugs in this routine
72 result in garbaged or lost mail.
74 If you hack on this and slow it down, I, my children and my
75 children's children will curse you.
77 This routine gets used on three different types of files: normal,
78 single msg files, "packed" unix or mmdf mailboxes (when used by inc)
79 and packed, directoried bulletin board files (when used by msh).
80 The biggest impact of different file types is in "eom" testing. The
81 code has been carefully organized to test for eom at appropriate
82 times and at no other times (since the check is quite expensive).
83 I have tried to arrange things so that the eom check need only be
84 done on entry to this routine. Since an eom can only occur after a
85 newline, this is easy to manage for header fields. For the msg
86 body, we try to efficiently search the input buffer to see if it
87 contains the eom delimiter. If it does, we take up to the
88 delimiter, otherwise we take everything in the buffer. (The change
89 to the body eom/copy processing produced the most noticeable
90 performance difference, particularly for "inc" and "show".)
92 There are three qualitatively different things this routine busts
93 out of a message: field names, field text and msg bodies. Field
94 names are typically short (~8 char) and the loop that extracts them
95 might terminate on a colon, newline or max width. I considered
96 using a Vax "scanc" to locate the end of the field followed by a
97 "bcopy" but the routine call overhead on a Vax is too large for this
98 to work on short names. If Berkeley ever makes "inline" part of the
99 C optimiser (so things like "scanc" turn into inline instructions) a
100 change here would be worthwhile.
102 Field text is typically 60 - 100 characters so there's (barely)
103 a win in doing a routine call to something that does a "locc"
104 followed by a "bmove". About 30% of the fields have continuations
105 (usually the 822 "received:" lines) and each continuation generates
106 another routine call. "Inline" would be a big win here, as well.
108 Messages, as of this writing, seem to come in two flavors: small
109 (~1K) and long (>2K). Most messages have 400 - 600 bytes of headers
110 so message bodies average at least a few hundred characters.
111 Assuming your system uses reasonably sized stdio buffers (1K or
112 more), this routine should be able to remove the body in large
113 (>500 byte) chunks. This makes the cost of a call to "bcopy"
114 small but there is a premium on checking for the eom in packed
115 maildrops. The eom pattern is always a simple string so we can
116 construct an efficient pattern matcher for it (e.g., a Vax "matchc"
117 instruction). Some thought went into recognizing the start of
118 an eom that has been split across two buffers.
120 This routine wants to deal with large chunks of data so, rather
121 than "getc" into a local buffer, it uses stdio's buffer. If
122 you try to use it on a non-buffered file, you'll get what you
123 deserve. This routine "knows" that struct FILEs have a _ptr
124 and a _cnt to describe the current state of the buffer and
125 it knows that _filbuf ignores the _ptr & _cnt and simply fills
126 the buffer. If stdio on your system doesn't work this way, you
127 may have to make small changes in this routine.
129 This routine also "knows" that an EOF indication on a stream is
130 "sticky" (i.e., you will keep getting EOF until you reposition the
131 stream). If your system doesn't work this way it is broken and you
132 should complain to the vendor. As a consequence of the sticky
133 EOF, this routine will never return any kind of EOF status when
134 there is data in "name" or "buf".
140 Reads an Internet message (RFC 5322), or one or more messages stored in a
141 maildrop in mbox (RFC 4155) or MMDF format, from a file stream. Each call
142 to m_getfld() reads one header field, or a portion of the body, in sequence.
146 state: message parse state
147 bufsz: maximum number of characters to load into buf
148 iob: input file stream
152 name: header field name (array of size NAMESZ=999)
153 buf: either a header field body or message body
154 (return value): message parse state on return from function
155 (global) int msg_count: number of characters loaded into buf
157 Functions (part of Inputs, really)
159 void m_unknown(FILE *iob): Determines the message delimiter string for the
160 maildrop. Called by inc, scan, and msh when reading from a maildrop file.
162 void m_eomsbr (int (*action)(int)): Sets the hook to check for end of
163 message in a maildrop. Called only by msh.
165 Those functions save state in the State variables listed below.
170 FLD // Field returned
171 FLDPLUS // Field returned with more to come
172 FLDEOF // Field returned ending at eom
173 BODY // Body returned with more to come
174 BODYEOF // Body returned ending at eom
175 FILEEOF // Reached end of input file
176 FMTERR // Message Format error
177 LENERR // Name too long error from getfld
179 msg_style is maildrop style, one of:
180 MS_UNKNOWN // type not known yet
181 MS_DEFAULT // default (one msg per file)
182 MS_MBOX // Unix-style "from" lines
183 MS_MMDF // string mmdlm2
186 State variables (part of Outputs)
188 m_getfld() retains state internally between calls in some state variables.
190 These two variables are global, but only used internally by m_getfld.c:
194 These are used for the end-of-message matcher when reading maildrops:
195 static unsigned char **pat_map
196 static unsigned char *fdelim
197 static unsigned char *delimend
199 static unsigned char *edelim
204 m_getfld() is restricted to operate on one file stream at a time because of
205 the retained state (see "State variables" above).
209 The first call to m_getfld() on a file stream is with a state of FLD.
210 Subsequent calls provide the state returned by the previous call.
212 Along the way, I thought of these possible interface changes that we
213 might want to consider before rototilling the internals:
215 1) To improve interface documentation:
216 Change type of name argument from unsigned char * to unsigned char[NAMESZ].
217 This would also be a step toward allowing the compiler to check for array
220 2) To remove globals that don't need to be:
221 Change msg_style and msg_delim to be file static.
223 3) To remove a global:
224 Change bufsz to be in-out instead of in, and therefore int * instead of
225 int, and use that instead of global msg_count. There are only 3 call
226 sites that use msg_count so it wouldn't take much effort to remove use of
227 it. Of course, all call sites would have to change to provide an int *
228 instead of an int. Some now pass constants.
230 4) To remove the state argument from the signature:
231 Given the Current usage and Restriction above, the state variable could
232 be removed from the signature and just retained internally.
234 5) To remove the Restriction above:
235 One approach would be for m_getfld() to retain multiple copies of that
236 state, one per iob that it sees. Another approach would be for the
237 caller to store it in an opaque struct, the address of which is passed
238 through the interface.
244 static int m_Eom (int, FILE *);
245 static unsigned char *matchc(int, char *, int, char *);
246 static unsigned char *locc(int, unsigned char *, unsigned char);
248 #define Getc(iob) getc(iob)
249 #define eom(c,iob) (msg_style != MS_DEFAULT && \
250 (((c) == *msg_delim && m_Eom(c,iob)) ||\
251 (eom_action && (*eom_action)(c))))
253 static unsigned char **pat_map;
256 * defined in sbr/m_msgdef.c = 0
257 * This is a disgusting hack for "inc" so it can know how many
258 * characters were stuffed in the buffer on the last call
259 * (see comments in uip/scansbr.c).
261 extern int msg_count;
264 * defined in sbr/m_msgdef.c = MS_DEFAULT
266 extern int msg_style;
269 * The "full" delimiter string for a packed maildrop consists
270 * of a newline followed by the actual delimiter. E.g., the
271 * full string for a Unix maildrop would be: "\n\nFrom ".
272 * "Fdelim" points to the start of the full string and is used
273 * in the BODY case of the main routine to search the buffer for
274 * a possible eom. Msg_delim points to the first character of
275 * the actual delim. string (i.e., fdelim+1). Edelim
276 * points to the 2nd character of actual delimiter string. It
277 * is used in m_Eom because the first character of the string
278 * has been read and matched before m_Eom is called.
280 extern char *msg_delim; /* defined in sbr/m_msgdef.c = "" */
281 static unsigned char *fdelim;
282 static unsigned char *delimend;
283 static int fdelimlen;
284 static unsigned char *edelim;
285 static int edelimlen;
287 static int (*eom_action)(int) = NULL;
290 # define _ptr _p /* Gag */
291 # define _cnt _r /* Retch */
292 # define _filbuf __srget /* Puke */
293 # define DEFINED__FILBUF_TO_SOMETHING_SPECIFIC
295 # if defined __CYGWIN__
296 /* Cygwin's stdio.h does not declare __srget(). */
298 # endif /* __CYGWIN__ */
301 #ifndef DEFINED__FILBUF_TO_SOMETHING_SPECIFIC
302 extern int _filbuf(FILE*);
307 m_getfld (int state, unsigned char *name, unsigned char *buf,
308 int bufsz, FILE *iob)
310 register unsigned char *bp, *cp, *ep, *sp;
311 register int cnt, c, i, j;
313 if ((c = Getc(iob)) < 0) {
320 /* flush null messages */
321 while ((c = Getc(iob)) >= 0 && eom (c, iob))
335 if (c == '\n' || c == '-') {
336 /* we hit the header/body separator */
337 while (c != '\n' && (c = Getc(iob)) >= 0)
340 if (c < 0 || (c = Getc(iob)) < 0 || eom (c, iob)) {
342 /* flush null messages */
343 while ((c = Getc(iob)) >= 0 && eom (c, iob))
356 * get the name of this component. take characters up
357 * to a ':', a newline or NAMESZ-1 characters, whichever
364 bp = sp = (unsigned char *) iob->_IO_read_ptr - 1;
365 j = (cnt = ((long) iob->_IO_read_end -
366 (long) iob->_IO_read_ptr) + 1) < i ? cnt : i;
367 #elif defined(__DragonFly__)
368 bp = sp = (unsigned char *) ((struct __FILE_public *)iob)->_p - 1;
369 j = (cnt = ((struct __FILE_public *)iob)->_r+1) < i ? cnt : i;
371 bp = sp = (unsigned char *) iob->_ptr - 1;
372 j = (cnt = iob->_cnt+1) < i ? cnt : i;
374 while (--j >= 0 && (c = *bp++) != ':' && c != '\n')
378 if ((cnt -= j) <= 0) {
380 iob->_IO_read_ptr = iob->_IO_read_end;
381 if (__underflow(iob) == EOF) {
382 #elif defined(__DragonFly__)
383 if (__srget(iob) == EOF) {
385 if (_filbuf(iob) == EOF) {
388 advise (NULL, "eof encountered in field \"%s\"", name);
392 iob->_IO_read_ptr++; /* NOT automatic in __underflow()! */
396 iob->_IO_read_ptr = bp + 1;
397 #elif defined(__DragonFly__)
398 ((struct __FILE_public *)iob)->_p = bp + 1;
399 ((struct __FILE_public *)iob)->_r = cnt - 1;
409 * something went wrong. possibilities are:
410 * . hit a newline (error)
411 * . got more than namesz chars. (error)
412 * . hit the end of the buffer. (loop)
415 /* We hit the end of the line without seeing ':' to
416 * terminate the field name. This is usually (always?)
417 * spam. But, blowing up is lame, especially when
418 * scan(1)ing a folder with such messages. Pretend such
419 * lines are the first of the body (at least mutt also
420 * handles it this way). */
422 /* See if buf can hold this line, since we were assuming
423 * we had a buffer of NAMESZ, not bufsz. */
424 /* + 1 for the newline */
426 /* No, it can't. Oh well, guess we'll blow up. */
428 advise (NULL, "eol encountered in field \"%s\"", name);
432 memcpy (buf, name, j - 1);
435 /* mhparse.c:get_content wants to find the position of the
436 * body start, but it thinks there's a blank line between
437 * the header and the body (naturally!), so seek back so
438 * that things line up even though we don't have that
439 * blank line in this case. Simpler parsers (e.g. mhl)
440 * get extra newlines, but that should be harmless enough,
441 * right? This is a corrupt message anyway. */
442 fseek (iob, ftell (iob) - 2, SEEK_SET);
447 advise (NULL, "field name \"%s\" exceeds %d bytes", name, NAMESZ - 2);
453 while (isspace (*--cp) && cp >= name)
460 * get (more of) the text of a field. take
461 * characters up to the end of this field (newline
462 * followed by non-blank) or bufsz-1 characters.
464 cp = buf; i = bufsz-1;
467 cnt = (long) iob->_IO_read_end - (long) iob->_IO_read_ptr;
468 bp = (unsigned char *) --iob->_IO_read_ptr;
469 #elif defined(__DragonFly__)
470 cnt = ((struct __FILE_public *)iob)->_r++;
471 bp = (unsigned char *) --((struct __FILE_public *)iob)->_p;
474 bp = (unsigned char *) --iob->_ptr;
476 c = cnt < i ? cnt : i;
477 while ((ep = locc( c, bp, '\n' ))) {
479 * if we hit the end of this field, return.
481 if ((j = *++ep) != ' ' && j != '\t') {
483 j = ep - (unsigned char *) iob->_IO_read_ptr;
484 memcpy (cp, iob->_IO_read_ptr, j);
485 iob->_IO_read_ptr = ep;
486 #elif defined(__DragonFly__)
487 j = ep - (unsigned char *) ((struct __FILE_public *)iob)->_p;
488 memcpy (cp, ((struct __FILE_public *)iob)->_p, j);
489 ((struct __FILE_public *)iob)->_p = ep;
490 ((struct __FILE_public *)iob)->_r -= j;
492 j = ep - (unsigned char *) iob->_ptr;
493 memcpy (cp, iob->_ptr, j);
505 * end of input or dest buffer - copy what we've found.
508 c += bp - (unsigned char *) iob->_IO_read_ptr;
509 memcpy( cp, iob->_IO_read_ptr, c);
510 #elif defined(__DragonFly__)
511 c += bp - (unsigned char *) ((struct __FILE_public *)iob)->_p;
512 memcpy( cp, ((struct __FILE_public *)iob)->_p, c);
514 c += bp - (unsigned char *) iob->_ptr;
515 memcpy( cp, iob->_ptr, c);
520 /* the dest buffer is full */
522 iob->_IO_read_ptr += c;
523 #elif defined(__DragonFly__)
524 ((struct __FILE_public *)iob)->_r -= c;
525 ((struct __FILE_public *)iob)->_p += c;
534 * There's one character left in the input buffer.
535 * Copy it & fill the buffer. If the last char
536 * was a newline and the next char is not whitespace,
537 * this is the end of the field. Otherwise loop.
541 *cp++ = j = *(iob->_IO_read_ptr + c);
542 iob->_IO_read_ptr = iob->_IO_read_end;
543 c = __underflow(iob);
544 iob->_IO_read_ptr++; /* NOT automatic! */
545 #elif defined(__DragonFly__)
546 *cp++ =j = *(((struct __FILE_public *)iob)->_p + c);
549 *cp++ = j = *(iob->_ptr + c);
553 ((j == '\0' || j == '\n') && c != ' ' && c != '\t')) {
557 #elif defined(__DragonFly__)
558 --((struct __FILE_public *)iob)->_p;
559 ++((struct __FILE_public *)iob)->_r;
574 * get the message body up to bufsz characters or the
575 * end of the message. Sleazy hack: if bufsz is negative
576 * we assume that we were called to copy directly into
577 * the output buffer and we don't add an eos.
579 i = (bufsz < 0) ? -bufsz : bufsz-1;
581 bp = (unsigned char *) --iob->_IO_read_ptr;
582 cnt = (long) iob->_IO_read_end - (long) iob->_IO_read_ptr;
583 #elif defined(__DragonFly__)
584 bp = (unsigned char *) --((struct __FILE_public *)iob)->_p;
585 cnt = ++((struct __FILE_public *)iob)->_r;
587 bp = (unsigned char *) --iob->_ptr;
590 c = (cnt < i ? cnt : i);
591 if (msg_style != MS_DEFAULT && c > 1) {
593 * packed maildrop - only take up to the (possible)
594 * start of the next message. This "matchc" should
595 * probably be a Boyer-Moore matcher for non-vaxen,
596 * particularly since we have the alignment table
597 * all built for the end-of-buffer test (next).
598 * But our vax timings indicate that the "matchc"
599 * instruction is 50% faster than a carefully coded
600 * B.M. matcher for most strings. (So much for elegant
601 * algorithms vs. brute force.) Since I (currently)
602 * run MH on a vax, we use the matchc instruction. --vj
604 if ((ep = matchc( fdelimlen, fdelim, c, bp )))
608 * There's no delim in the buffer but there may be
609 * a partial one at the end. If so, we want to leave
610 * it so the "eom" check on the next call picks it up.
611 * Use a modified Boyer-Moore matcher to make this
612 * check relatively cheap. The first "if" figures
613 * out what position in the pattern matches the last
614 * character in the buffer. The inner "while" matches
615 * the pattern against the buffer, backwards starting
616 * at that position. Note that unless the buffer
617 * ends with one of the characters in the pattern
618 * (excluding the first and last), we do only one test.
621 if ((sp = pat_map[*ep])) {
623 /* This if() is true unless (a) the buffer is too
624 * small to contain this delimiter prefix, or
625 * (b) it contains exactly enough chars for the
627 * For case (a) obviously we aren't going to match.
628 * For case (b), if the buffer really contained exactly
629 * a delim prefix, then the m_eom call at entry
630 * should have found it. Thus it's not a delim
631 * and we know we won't get a match.
633 if (((sp - fdelim) + 2) <= c) {
635 /* Unfortunately although fdelim has a preceding NUL
636 * we can't use this as a sentinel in case the buffer
637 * contains a NUL in exactly the wrong place (this
638 * would cause us to run off the front of fdelim).
640 while (*--ep == *--cp)
644 /* we matched the entire delim prefix,
645 * so only take the buffer up to there.
646 * we know ep >= bp -- check above prevents underrun
652 /* try matching one less char of delim string */
654 } while (--sp > fdelim);
658 memcpy( buf, bp, c );
660 iob->_IO_read_ptr += c;
661 #elif defined(__DragonFly__)
662 ((struct __FILE_public *)iob)->_r -= c;
663 ((struct __FILE_public *)iob)->_p += c;
676 adios (NULL, "m_getfld() called with bogus state of %d", state);
680 msg_count = cp - buf;
692 register char *delimstr;
695 * Figure out what the message delimiter string is for this
696 * maildrop. (This used to be part of m_Eom but I didn't like
697 * the idea of an "if" statement that could only succeed on the
698 * first call to m_Eom getting executed on each call, i.e., at
699 * every newline in the message).
701 * If the first line of the maildrop is a Unix "From " line, we
702 * say the style is MBOX and eat the rest of the line. Otherwise
703 * we say the style is MMDF and look for the delimiter string
704 * specified when nmh was built (or from the mts.conf file).
707 msg_style = MS_UNKNOWN;
710 if (fread (text, sizeof(*text), 5, iob) == 5
711 && strncmp (text, "From ", 5) == 0) {
713 delimstr = "\nFrom ";
714 while ((c = getc (iob)) != '\n' && c >= 0)
717 /* not a Unix style maildrop */
718 fseek (iob, pos, SEEK_SET);
719 if (mmdlm2 == NULL || *mmdlm2 == 0)
720 mmdlm2 = "\001\001\001\001\n";
724 c = strlen (delimstr);
725 fdelim = (unsigned char *) mh_xmalloc((size_t) (c + 3));
728 msg_delim = (char *)fdelim+1;
729 edelim = (unsigned char *)msg_delim+1;
732 strcpy (msg_delim, delimstr);
733 delimend = (unsigned char *)msg_delim + edelimlen;
735 adios (NULL, "maildrop delimiter must be at least 2 bytes");
737 * build a Boyer-Moore end-position map for the matcher in m_getfld.
738 * N.B. - we don't match just the first char (since it's the newline
739 * separator) or the last char (since the matchc would have found it
740 * if it was a real delim).
742 pat_map = (unsigned char **) calloc (256, sizeof(unsigned char *));
744 for (cp = (char *) fdelim + 1; cp < (char *) delimend; cp++ )
745 pat_map[(unsigned char)*cp] = (unsigned char *) cp;
747 if (msg_style == MS_MMDF) {
748 /* flush extra msg hdrs */
749 while ((c = Getc(iob)) >= 0 && eom (c, iob))
758 m_eomsbr (int (*action)(int))
760 if ((eom_action = action)) {
767 msg_delim = (char *)fdelim + 1;
768 fdelimlen = strlen((char *)fdelim);
769 delimend = (unsigned char *)(msg_delim + edelimlen);
775 * test for msg delimiter string
779 m_Eom (int c, FILE *iob)
781 register long pos = 0L;
786 if ((i = fread (text, sizeof *text, edelimlen, iob)) != edelimlen
787 || strncmp (text, (char *)edelim, edelimlen)) {
788 if (i == 0 && msg_style == MS_MBOX)
789 /* the final newline in the (brain damaged) unix-format
790 * maildrop is part of the delimiter - delete it.
795 fseek (iob, pos, SEEK_SET);
798 fseek (iob, (long)(pos-1), SEEK_SET);
799 getc (iob); /* should be OK */
803 if (msg_style == MS_MBOX) {
804 while ((c = getc (iob)) != '\n')
813 static unsigned char *
814 matchc(int patln, char *pat, int strln, char *str)
816 register char *es = str + strln - patln;
819 register char *ep = pat + patln;
820 register char pc = *pat++;
829 while (pp < ep && *sp++ == *pp)
832 return ((unsigned char *)--str);
838 * Locate character "term" in the next "cnt" characters of "src".
839 * If found, return its address, otherwise return 0.
842 static unsigned char *
843 locc(int cnt, unsigned char *src, unsigned char term)
845 while (*src++ != term && --cnt > 0);
847 return (cnt > 0 ? --src : (unsigned char *)0);