3 * m_getfld.c -- read/parse a message
7 * This code is Copyright (c) 2002, by the authors of nmh. See the
8 * COPYRIGHT file in the root directory of the nmh distribution for
9 * complete copyright information.
15 /* This module has a long and checkered history. First, it didn't burst
16 maildrops correctly because it considered two CTRL-A:s in a row to be
17 an inter-message delimiter. It really is four CTRL-A:s followed by a
18 newline. Unfortunately, MMDF will convert this delimiter *inside* a
19 message to a CTRL-B followed by three CTRL-A:s and a newline. This
20 caused the old version of m_getfld() to declare eom prematurely. The
21 fix was a lot slower than
23 c == '\001' && peekc (iob) == '\001'
25 but it worked, and to increase generality, MBOX style maildrops could
26 be parsed as well. Unfortunately the speed issue finally caught up with
27 us since this routine is at the very heart of MH.
29 To speed things up considerably, the routine Eom() was made an auxiliary
30 function called by the macro eom(). Unless we are bursting a maildrop,
31 the eom() macro returns FALSE saying we aren't at the end of the
34 The next thing to do is to read the mts.conf file and initialize
35 delimiter[] and delimlen accordingly...
37 After mhl was made a built-in in msh, m_getfld() worked just fine
38 (using m_unknown() at startup). Until one day: a message which was
39 the result of a bursting was shown. Then, since the burst boundaries
40 aren't CTRL-A:s, m_getfld() would blindly plunge on past the boundary.
41 Very sad. The solution: introduce m_eomsbr(). This hook gets called
42 after the end of each line (since testing for eom involves an fseek()).
43 This worked fine, until one day: a message with no body portion arrived.
46 while (eom (c = Getc (iob), iob))
49 loop caused m_getfld() to return FMTERR. So, that logic was changed to
50 check for (*eom_action) and act accordingly.
52 This worked fine, until one day: someone didn't use four CTRL-A:s as
53 their delimiters. So, the bullet got bit and we read mts.h and
54 continue to struggle on. It's not that bad though, since the only time
55 the code gets executed is when inc (or msh) calls it, and both of these
56 have already called mts_init().
58 ------------------------
59 (Written by Van Jacobson for the mh6 m_getfld, January, 1986):
61 This routine was accounting for 60% of the cpu time used by most mh
62 programs. I spent a bit of time tuning and it now accounts for <10%
63 of the time used. Like any heavily tuned routine, it's a bit
64 complex and you want to be sure you understand everything that it's
65 doing before you start hacking on it. Let me try to emphasize
66 that: every line in this atrocity depends on every other line,
67 sometimes in subtle ways. You should understand it all, in detail,
68 before trying to change any part. If you do change it, test the
69 result thoroughly (I use a hand-constructed test file that exercises
70 all the ways a header name, header body, header continuation,
71 header-body separator, body line and body eom can align themselves
72 with respect to a buffer boundary). "Minor" bugs in this routine
73 result in garbaged or lost mail.
75 If you hack on this and slow it down, I, my children and my
76 children's children will curse you.
78 This routine gets used on three different types of files: normal,
79 single msg files, "packed" unix or mmdf mailboxes (when used by inc)
80 and packed, directoried bulletin board files (when used by msh).
81 The biggest impact of different file types is in "eom" testing. The
82 code has been carefully organized to test for eom at appropriate
83 times and at no other times (since the check is quite expensive).
84 I have tried to arrange things so that the eom check need only be
85 done on entry to this routine. Since an eom can only occur after a
86 newline, this is easy to manage for header fields. For the msg
87 body, we try to efficiently search the input buffer to see if it
88 contains the eom delimiter. If it does, we take up to the
89 delimiter, otherwise we take everything in the buffer. (The change
90 to the body eom/copy processing produced the most noticeable
91 performance difference, particularly for "inc" and "show".)
93 There are three qualitatively different things this routine busts
94 out of a message: field names, field text and msg bodies. Field
95 names are typically short (~8 char) and the loop that extracts them
96 might terminate on a colon, newline or max width. I considered
97 using a Vax "scanc" to locate the end of the field followed by a
98 "bcopy" but the routine call overhead on a Vax is too large for this
99 to work on short names. If Berkeley ever makes "inline" part of the
100 C optimiser (so things like "scanc" turn into inline instructions) a
101 change here would be worthwhile.
103 Field text is typically 60 - 100 characters so there's (barely)
104 a win in doing a routine call to something that does a "locc"
105 followed by a "bmove". About 30% of the fields have continuations
106 (usually the 822 "received:" lines) and each continuation generates
107 another routine call. "Inline" would be a big win here, as well.
109 Messages, as of this writing, seem to come in two flavors: small
110 (~1K) and long (>2K). Most messages have 400 - 600 bytes of headers
111 so message bodies average at least a few hundred characters.
112 Assuming your system uses reasonably sized stdio buffers (1K or
113 more), this routine should be able to remove the body in large
114 (>500 byte) chunks. This makes the cost of a call to "bcopy"
115 small but there is a premium on checking for the eom in packed
116 maildrops. The eom pattern is always a simple string so we can
117 construct an efficient pattern matcher for it (e.g., a Vax "matchc"
118 instruction). Some thought went into recognizing the start of
119 an eom that has been split across two buffers.
121 This routine wants to deal with large chunks of data so, rather
122 than "getc" into a local buffer, it uses stdio's buffer. If
123 you try to use it on a non-buffered file, you'll get what you
124 deserve. This routine "knows" that struct FILEs have a _ptr
125 and a _cnt to describe the current state of the buffer and
126 it knows that _filbuf ignores the _ptr & _cnt and simply fills
127 the buffer. If stdio on your system doesn't work this way, you
128 may have to make small changes in this routine.
130 This routine also "knows" that an EOF indication on a stream is
131 "sticky" (i.e., you will keep getting EOF until you reposition the
132 stream). If your system doesn't work this way it is broken and you
133 should complain to the vendor. As a consequence of the sticky
134 EOF, this routine will never return any kind of EOF status when
135 there is data in "name" or "buf".
142 static int m_Eom (int, FILE *);
143 static unsigned char *matchc(int, char *, int, char *);
144 static unsigned char *locc(int, unsigned char *, unsigned char);
146 #define Getc(iob) getc(iob)
147 #define eom(c,iob) (msg_style != MS_DEFAULT && \
148 (((c) == *msg_delim && m_Eom(c,iob)) ||\
149 (eom_action && (*eom_action)(c))))
151 static unsigned char **pat_map;
154 * defined in sbr/m_msgdef.c = 0
155 * This is a disgusting hack for "inc" so it can know how many
156 * characters were stuffed in the buffer on the last call
157 * (see comments in uip/scansbr.c).
159 extern int msg_count;
162 * defined in sbr/m_msgdef.c = MS_DEFAULT
164 extern int msg_style;
167 * The "full" delimiter string for a packed maildrop consists
168 * of a newline followed by the actual delimiter. E.g., the
169 * full string for a Unix maildrop would be: "\n\nFrom ".
170 * "Fdelim" points to the start of the full string and is used
171 * in the BODY case of the main routine to search the buffer for
172 * a possible eom. Msg_delim points to the first character of
173 * the actual delim. string (i.e., fdelim+1). Edelim
174 * points to the 2nd character of actual delimiter string. It
175 * is used in m_Eom because the first character of the string
176 * has been read and matched before m_Eom is called.
178 extern char *msg_delim; /* defined in sbr/m_msgdef.c = "" */
179 static unsigned char *fdelim;
180 static unsigned char *delimend;
181 static int fdelimlen;
182 static unsigned char *edelim;
183 static int edelimlen;
185 static int (*eom_action)() = NULL;
188 # define _ptr _p /* Gag */
189 # define _cnt _r /* Retch */
190 # define _filbuf __srget /* Puke */
191 # define DEFINED__FILBUF_TO_SOMETHING_SPECIFIC
197 # define _base __base
198 # define _filbuf(fp) ((fp)->__cnt = 0, __filbuf(fp))
199 # define DEFINED__FILBUF_TO_SOMETHING_SPECIFIC
202 #ifndef DEFINED__FILBUF_TO_SOMETHING_SPECIFIC
203 extern int _filbuf(FILE*);
208 m_getfld (int state, unsigned char *name, unsigned char *buf,
209 int bufsz, FILE *iob)
211 register unsigned char *bp, *cp, *ep, *sp;
212 register int cnt, c, i, j;
214 if ((c = Getc(iob)) < 0) {
221 /* flush null messages */
222 while ((c = Getc(iob)) >= 0 && eom (c, iob))
236 if (c == '\n' || c == '-') {
237 /* we hit the header/body separator */
238 while (c != '\n' && (c = Getc(iob)) >= 0)
241 if (c < 0 || (c = Getc(iob)) < 0 || eom (c, iob)) {
243 /* flush null messages */
244 while ((c = Getc(iob)) >= 0 && eom (c, iob))
257 * get the name of this component. take characters up
258 * to a ':', a newline or NAMESZ-1 characters, whichever
265 bp = sp = (unsigned char *) iob->_IO_read_ptr - 1;
266 j = (cnt = ((long) iob->_IO_read_end -
267 (long) iob->_IO_read_ptr) + 1) < i ? cnt : i;
269 bp = sp = (unsigned char *) iob->_ptr - 1;
270 j = (cnt = iob->_cnt+1) < i ? cnt : i;
272 while (--j >= 0 && (c = *bp++) != ':' && c != '\n')
276 if ((cnt -= j) <= 0) {
278 iob->_IO_read_ptr = iob->_IO_read_end;
279 if (__underflow(iob) == EOF) {
281 if (_filbuf(iob) == EOF) {
284 advise (NULL, "eof encountered in field \"%s\"", name);
288 iob->_IO_read_ptr++; /* NOT automatic in __underflow()! */
292 iob->_IO_read_ptr = bp + 1;
302 * something went wrong. possibilities are:
303 * . hit a newline (error)
304 * . got more than namesz chars. (error)
305 * . hit the end of the buffer. (loop)
309 advise (NULL, "eol encountered in field \"%s\"", name);
315 advise (NULL, "field name \"%s\" exceeds %d bytes", name, NAMESZ - 1);
321 while (isspace (*--cp) && cp >= name)
328 * get (more of) the text of a field. take
329 * characters up to the end of this field (newline
330 * followed by non-blank) or bufsz-1 characters.
332 cp = buf; i = bufsz-1;
335 cnt = (long) iob->_IO_read_end - (long) iob->_IO_read_ptr;
336 bp = (unsigned char *) --iob->_IO_read_ptr;
339 bp = (unsigned char *) --iob->_ptr;
341 c = cnt < i ? cnt : i;
342 while ((ep = locc( c, bp, '\n' ))) {
344 * if we hit the end of this field, return.
346 if ((j = *++ep) != ' ' && j != '\t') {
348 j = ep - (unsigned char *) iob->_IO_read_ptr;
349 memcpy (cp, iob->_IO_read_ptr, j);
350 iob->_IO_read_ptr = ep;
352 j = ep - (unsigned char *) iob->_ptr;
353 memcpy (cp, iob->_ptr, j);
365 * end of input or dest buffer - copy what we've found.
368 c += bp - (unsigned char *) iob->_IO_read_ptr;
369 memcpy( cp, iob->_IO_read_ptr, c);
371 c += bp - (unsigned char *) iob->_ptr;
372 memcpy( cp, iob->_ptr, c);
377 /* the dest buffer is full */
379 iob->_IO_read_ptr += c;
388 * There's one character left in the input buffer.
389 * Copy it & fill the buffer. If the last char
390 * was a newline and the next char is not whitespace,
391 * this is the end of the field. Otherwise loop.
395 *cp++ = j = *(iob->_IO_read_ptr + c);
396 iob->_IO_read_ptr = iob->_IO_read_end;
397 c = __underflow(iob);
398 iob->_IO_read_ptr++; /* NOT automatic! */
400 *cp++ = j = *(iob->_ptr + c);
404 ((j == '\0' || j == '\n') && c != ' ' && c != '\t')) {
422 * get the message body up to bufsz characters or the
423 * end of the message. Sleazy hack: if bufsz is negative
424 * we assume that we were called to copy directly into
425 * the output buffer and we don't add an eos.
427 i = (bufsz < 0) ? -bufsz : bufsz-1;
429 bp = (unsigned char *) --iob->_IO_read_ptr;
430 cnt = (long) iob->_IO_read_end - (long) iob->_IO_read_ptr;
432 bp = (unsigned char *) --iob->_ptr;
435 c = (cnt < i ? cnt : i);
436 if (msg_style != MS_DEFAULT && c > 1) {
438 * packed maildrop - only take up to the (possible)
439 * start of the next message. This "matchc" should
440 * probably be a Boyer-Moore matcher for non-vaxen,
441 * particularly since we have the alignment table
442 * all built for the end-of-buffer test (next).
443 * But our vax timings indicate that the "matchc"
444 * instruction is 50% faster than a carefully coded
445 * B.M. matcher for most strings. (So much for elegant
446 * algorithms vs. brute force.) Since I (currently)
447 * run MH on a vax, we use the matchc instruction. --vj
449 if ((ep = matchc( fdelimlen, fdelim, c, bp )))
453 * There's no delim in the buffer but there may be
454 * a partial one at the end. If so, we want to leave
455 * it so the "eom" check on the next call picks it up.
456 * Use a modified Boyer-Moore matcher to make this
457 * check relatively cheap. The first "if" figures
458 * out what position in the pattern matches the last
459 * character in the buffer. The inner "while" matches
460 * the pattern against the buffer, backwards starting
461 * at that position. Note that unless the buffer
462 * ends with one of the characters in the pattern
463 * (excluding the first and last), we do only one test.
466 if ((sp = pat_map[*ep])) {
469 while (*--ep == *--cp)
474 * ep < bp means that all the buffer
475 * contains is a prefix of delim.
476 * If this prefix is really a delim, the
477 * m_eom call at entry should have found
478 * it. Thus it's not a delim and we can
484 /* try matching one less char of delim string */
486 } while (--sp > fdelim);
490 memcpy( buf, bp, c );
492 iob->_IO_read_ptr += c;
505 adios (NULL, "m_getfld() called with bogus state of %d", state);
509 msg_count = cp - buf;
515 static char unixbuf[BUFSIZ] = "";
525 register char *delimstr;
528 * Figure out what the message delimiter string is for this
529 * maildrop. (This used to be part of m_Eom but I didn't like
530 * the idea of an "if" statement that could only succeed on the
531 * first call to m_Eom getting executed on each call, i.e., at
532 * every newline in the message).
534 * If the first line of the maildrop is a Unix "From " line, we
535 * say the style is MBOX and eat the rest of the line. Otherwise
536 * we say the style is MMDF and look for the delimiter string
537 * specified when nmh was built (or from the mts.conf file).
540 msg_style = MS_UNKNOWN;
543 if (fread (text, sizeof(*text), 5, iob) == 5
544 && strncmp (text, "From ", 5) == 0) {
546 delimstr = "\nFrom ";
548 while ((c = getc (iob)) != '\n' && c >= 0)
552 while ((c = getc (iob)) != '\n' && cp - unixbuf < BUFSIZ - 1)
557 /* not a Unix style maildrop */
558 fseek (iob, pos, SEEK_SET);
559 if (mmdlm2 == NULL || *mmdlm2 == 0)
560 mmdlm2 = "\001\001\001\001\n";
564 c = strlen (delimstr);
565 fdelim = (unsigned char *) malloc((size_t) (c + 3));
568 msg_delim = (char *)fdelim+1;
569 edelim = (unsigned char *)msg_delim+1;
572 strcpy (msg_delim, delimstr);
573 delimend = (unsigned char *)msg_delim + edelimlen;
575 adios (NULL, "maildrop delimiter must be at least 2 bytes");
577 * build a Boyer-Moore end-position map for the matcher in m_getfld.
578 * N.B. - we don't match just the first char (since it's the newline
579 * separator) or the last char (since the matchc would have found it
580 * if it was a real delim).
582 pat_map = (unsigned char **) calloc (256, sizeof(unsigned char *));
584 for (cp = (char *) fdelim + 1; cp < (char *) delimend; cp++ )
585 pat_map[(unsigned char)*cp] = (unsigned char *) cp;
587 if (msg_style == MS_MMDF) {
588 /* flush extra msg hdrs */
589 while ((c = Getc(iob)) >= 0 && eom (c, iob))
598 m_eomsbr (int (*action)())
600 if ((eom_action = action)) {
607 msg_delim = (char *)fdelim + 1;
608 fdelimlen = strlen((char *)fdelim);
609 delimend = (unsigned char *)(msg_delim + edelimlen);
615 * test for msg delimiter string
619 m_Eom (int c, FILE *iob)
621 register long pos = 0L;
629 if ((i = fread (text, sizeof *text, edelimlen, iob)) != edelimlen
630 || strncmp (text, (char *)edelim, edelimlen)) {
631 if (i == 0 && msg_style == MS_MBOX)
632 /* the final newline in the (brain damaged) unix-format
633 * maildrop is part of the delimiter - delete it.
638 fseek (iob, pos, SEEK_SET);
641 fseek (iob, (long)(pos-1), SEEK_SET);
642 getc (iob); /* should be OK */
646 if (msg_style == MS_MBOX) {
648 while ((c = getc (iob)) != '\n')
653 while ((c = getc (iob)) != '\n' && c >= 0 && cp - unixbuf < BUFSIZ - 1)
665 * Return the Return-Path and Delivery-Date
666 * header information.
668 * Currently, I'm assuming that the "From " line
669 * takes one of the following forms.
671 * From sender date remote from host (for UUCP delivery)
672 * From sender@host date (for sendmail delivery)
676 get_returnpath (char *rp, int rplen, char *dd, int ddlen)
678 char *ap, *bp, *cp, *dp;
681 if (!(bp = cp = strchr(ap, ' ')))
685 * Check for "remote from" in envelope to see
686 * if this message uses UUCP style addressing
688 while ((cp = strchr(++cp, 'r'))) {
689 if (strncmp (cp, "remote from", 11) == 0) {
690 cp = strrchr (cp, ' ');
696 * Get the Return-Path information from
697 * the "From " envelope.
700 /* return path for UUCP style addressing */
701 dp = strchr (++cp, '\n');
702 snprintf (rp, rplen, "%.*s!%.*s\n", dp - cp, cp, bp - ap, ap);
704 /* return path for standard domain addressing */
705 snprintf (rp, rplen, "%.*s\n", bp - ap, ap);
709 * advance over the spaces to get to
710 * delivery date on envelope
715 /* Now get delivery date from envelope */
716 snprintf (dd, ddlen, "%.*s\n", 24, bp);
724 static unsigned char *
725 matchc(int patln, char *pat, int strln, char *str)
727 register char *es = str + strln - patln;
730 register char *ep = pat + patln;
731 register char pc = *pat++;
740 while (pp < ep && *sp++ == *pp)
743 return ((unsigned char *)--str);
749 * Locate character "term" in the next "cnt" characters of "src".
750 * If found, return its address, otherwise return 0.
753 static unsigned char *
754 locc(int cnt, unsigned char *src, unsigned char term)
756 while (*src++ != term && --cnt > 0);
758 return (cnt > 0 ? --src : (unsigned char *)0);