3 * m_getfld.c -- read/parse a message
7 * This code is Copyright (c) 2002, by the authors of nmh. See the
8 * COPYRIGHT file in the root directory of the nmh distribution for
9 * complete copyright information.
16 /* This module has a long and checkered history. First, it didn't burst
17 maildrops correctly because it considered two CTRL-A:s in a row to be
18 an inter-message delimiter. It really is four CTRL-A:s followed by a
19 newline. Unfortunately, MMDF will convert this delimiter *inside* a
20 message to a CTRL-B followed by three CTRL-A:s and a newline. This
21 caused the old version of m_getfld() to declare eom prematurely. The
22 fix was a lot slower than
24 c == '\001' && peekc (iob) == '\001'
26 but it worked, and to increase generality, MBOX style maildrops could
27 be parsed as well. Unfortunately the speed issue finally caught up with
28 us since this routine is at the very heart of MH.
30 To speed things up considerably, the routine Eom() was made an auxiliary
31 function called by the macro eom(). Unless we are bursting a maildrop,
32 the eom() macro returns FALSE saying we aren't at the end of the
35 The next thing to do is to read the mts.conf file and initialize
36 delimiter[] and delimlen accordingly...
38 After mhl was made a built-in in msh, m_getfld() worked just fine
39 (using m_unknown() at startup). Until one day: a message which was
40 the result of a bursting was shown. Then, since the burst boundaries
41 aren't CTRL-A:s, m_getfld() would blindly plunge on past the boundary.
42 Very sad. The solution: introduce m_eomsbr(). This hook gets called
43 after the end of each line (since testing for eom involves an fseek()).
44 This worked fine, until one day: a message with no body portion arrived.
47 while (eom (c = Getc (iob), iob))
50 loop caused m_getfld() to return FMTERR. So, that logic was changed to
51 check for (*eom_action) and act accordingly.
53 This worked fine, until one day: someone didn't use four CTRL-A:s as
54 their delimiters. So, the bullet got bit and we read mts.h and
55 continue to struggle on. It's not that bad though, since the only time
56 the code gets executed is when inc (or msh) calls it, and both of these
57 have already called mts_init().
59 ------------------------
60 (Written by Van Jacobson for the mh6 m_getfld, January, 1986):
62 This routine was accounting for 60% of the cpu time used by most mh
63 programs. I spent a bit of time tuning and it now accounts for <10%
64 of the time used. Like any heavily tuned routine, it's a bit
65 complex and you want to be sure you understand everything that it's
66 doing before you start hacking on it. Let me try to emphasize
67 that: every line in this atrocity depends on every other line,
68 sometimes in subtle ways. You should understand it all, in detail,
69 before trying to change any part. If you do change it, test the
70 result thoroughly (I use a hand-constructed test file that exercises
71 all the ways a header name, header body, header continuation,
72 header-body separator, body line and body eom can align themselves
73 with respect to a buffer boundary). "Minor" bugs in this routine
74 result in garbaged or lost mail.
76 If you hack on this and slow it down, I, my children and my
77 children's children will curse you.
79 This routine gets used on three different types of files: normal,
80 single msg files, "packed" unix or mmdf mailboxes (when used by inc)
81 and packed, directoried bulletin board files (when used by msh).
82 The biggest impact of different file types is in "eom" testing. The
83 code has been carefully organized to test for eom at appropriate
84 times and at no other times (since the check is quite expensive).
85 I have tried to arrange things so that the eom check need only be
86 done on entry to this routine. Since an eom can only occur after a
87 newline, this is easy to manage for header fields. For the msg
88 body, we try to efficiently search the input buffer to see if it
89 contains the eom delimiter. If it does, we take up to the
90 delimiter, otherwise we take everything in the buffer. (The change
91 to the body eom/copy processing produced the most noticeable
92 performance difference, particularly for "inc" and "show".)
94 There are three qualitatively different things this routine busts
95 out of a message: field names, field text and msg bodies. Field
96 names are typically short (~8 char) and the loop that extracts them
97 might terminate on a colon, newline or max width. I considered
98 using a Vax "scanc" to locate the end of the field followed by a
99 "bcopy" but the routine call overhead on a Vax is too large for this
100 to work on short names. If Berkeley ever makes "inline" part of the
101 C optimiser (so things like "scanc" turn into inline instructions) a
102 change here would be worthwhile.
104 Field text is typically 60 - 100 characters so there's (barely)
105 a win in doing a routine call to something that does a "locc"
106 followed by a "bmove". About 30% of the fields have continuations
107 (usually the 822 "received:" lines) and each continuation generates
108 another routine call. "Inline" would be a big win here, as well.
110 Messages, as of this writing, seem to come in two flavors: small
111 (~1K) and long (>2K). Most messages have 400 - 600 bytes of headers
112 so message bodies average at least a few hundred characters.
113 Assuming your system uses reasonably sized stdio buffers (1K or
114 more), this routine should be able to remove the body in large
115 (>500 byte) chunks. This makes the cost of a call to "bcopy"
116 small but there is a premium on checking for the eom in packed
117 maildrops. The eom pattern is always a simple string so we can
118 construct an efficient pattern matcher for it (e.g., a Vax "matchc"
119 instruction). Some thought went into recognizing the start of
120 an eom that has been split across two buffers.
122 This routine wants to deal with large chunks of data so, rather
123 than "getc" into a local buffer, it uses stdio's buffer. If
124 you try to use it on a non-buffered file, you'll get what you
125 deserve. This routine "knows" that struct FILEs have a _ptr
126 and a _cnt to describe the current state of the buffer and
127 it knows that _filbuf ignores the _ptr & _cnt and simply fills
128 the buffer. If stdio on your system doesn't work this way, you
129 may have to make small changes in this routine.
131 This routine also "knows" that an EOF indication on a stream is
132 "sticky" (i.e., you will keep getting EOF until you reposition the
133 stream). If your system doesn't work this way it is broken and you
134 should complain to the vendor. As a consequence of the sticky
135 EOF, this routine will never return any kind of EOF status when
136 there is data in "name" or "buf".
143 static int m_Eom (int, FILE *);
144 static unsigned char *matchc(int, char *, int, char *);
145 static unsigned char *locc(int, unsigned char *, unsigned char);
147 #define Getc(iob) getc(iob)
148 #define eom(c,iob) (msg_style != MS_DEFAULT && \
149 (((c) == *msg_delim && m_Eom(c,iob)) ||\
150 (eom_action && (*eom_action)(c))))
152 static unsigned char **pat_map;
155 * defined in sbr/m_msgdef.c = 0
156 * This is a disgusting hack for "inc" so it can know how many
157 * characters were stuffed in the buffer on the last call
158 * (see comments in uip/scansbr.c).
160 extern int msg_count;
163 * defined in sbr/m_msgdef.c = MS_DEFAULT
165 extern int msg_style;
168 * The "full" delimiter string for a packed maildrop consists
169 * of a newline followed by the actual delimiter. E.g., the
170 * full string for a Unix maildrop would be: "\n\nFrom ".
171 * "Fdelim" points to the start of the full string and is used
172 * in the BODY case of the main routine to search the buffer for
173 * a possible eom. Msg_delim points to the first character of
174 * the actual delim. string (i.e., fdelim+1). Edelim
175 * points to the 2nd character of actual delimiter string. It
176 * is used in m_Eom because the first character of the string
177 * has been read and matched before m_Eom is called.
179 extern char *msg_delim; /* defined in sbr/m_msgdef.c = "" */
180 static unsigned char *fdelim;
181 static unsigned char *delimend;
182 static int fdelimlen;
183 static unsigned char *edelim;
184 static int edelimlen;
186 static int (*eom_action)(int) = NULL;
189 # define _ptr _p /* Gag */
190 # define _cnt _r /* Retch */
191 # define _filbuf __srget /* Puke */
192 # define DEFINED__FILBUF_TO_SOMETHING_SPECIFIC
198 # define _base __base
199 # define _filbuf(fp) ((fp)->__cnt = 0, __filbuf(fp))
200 # define DEFINED__FILBUF_TO_SOMETHING_SPECIFIC
203 #ifndef DEFINED__FILBUF_TO_SOMETHING_SPECIFIC
204 extern int _filbuf(FILE*);
209 m_getfld (int state, unsigned char *name, unsigned char *buf,
210 int bufsz, FILE *iob)
212 register unsigned char *bp, *cp, *ep, *sp;
213 register int cnt, c, i, j;
215 if ((c = Getc(iob)) < 0) {
222 /* flush null messages */
223 while ((c = Getc(iob)) >= 0 && eom (c, iob))
237 if (c == '\n' || c == '-') {
238 /* we hit the header/body separator */
239 while (c != '\n' && (c = Getc(iob)) >= 0)
242 if (c < 0 || (c = Getc(iob)) < 0 || eom (c, iob)) {
244 /* flush null messages */
245 while ((c = Getc(iob)) >= 0 && eom (c, iob))
258 * get the name of this component. take characters up
259 * to a ':', a newline or NAMESZ-1 characters, whichever
266 bp = sp = (unsigned char *) iob->_IO_read_ptr - 1;
267 j = (cnt = ((long) iob->_IO_read_end -
268 (long) iob->_IO_read_ptr) + 1) < i ? cnt : i;
269 #elif defined(__DragonFly__)
270 bp = sp = (unsigned char *) ((struct __FILE_public *)iob)->_p - 1;
271 j = (cnt = ((struct __FILE_public *)iob)->_r+1) < i ? cnt : i;
273 bp = sp = (unsigned char *) iob->_ptr - 1;
274 j = (cnt = iob->_cnt+1) < i ? cnt : i;
276 while (--j >= 0 && (c = *bp++) != ':' && c != '\n')
280 if ((cnt -= j) <= 0) {
282 iob->_IO_read_ptr = iob->_IO_read_end;
283 if (__underflow(iob) == EOF) {
284 #elif defined(__DragonFly__)
285 if (__srget(iob) == EOF) {
287 if (_filbuf(iob) == EOF) {
290 advise (NULL, "eof encountered in field \"%s\"", name);
294 iob->_IO_read_ptr++; /* NOT automatic in __underflow()! */
298 iob->_IO_read_ptr = bp + 1;
299 #elif defined(__DragonFly__)
300 ((struct __FILE_public *)iob)->_p = bp + 1;
301 ((struct __FILE_public *)iob)->_r = cnt - 1;
311 * something went wrong. possibilities are:
312 * . hit a newline (error)
313 * . got more than namesz chars. (error)
314 * . hit the end of the buffer. (loop)
317 /* We hit the end of the line without seeing ':' to
318 * terminate the field name. This is usually (always?)
319 * spam. But, blowing up is lame, especially when
320 * scan(1)ing a folder with such messages. Pretend such
321 * lines are the first of the body (at least mutt also
322 * handles it this way). */
324 /* See if buf can hold this line, since we were assuming
325 * we had a buffer of NAMESZ, not bufsz. */
326 /* + 1 for the newline */
328 /* No, it can't. Oh well, guess we'll blow up. */
330 advise (NULL, "eol encountered in field \"%s\"", name);
334 memcpy (buf, name, j - 1);
337 /* mhparse.c:get_content wants to find the position of the
338 * body start, but it thinks there's a blank line between
339 * the header and the body (naturally!), so seek back so
340 * that things line up even though we don't have that
341 * blank line in this case. Simpler parsers (e.g. mhl)
342 * get extra newlines, but that should be harmless enough,
343 * right? This is a corrupt message anyway. */
344 fseek (iob, ftell (iob) - 2, SEEK_SET);
349 advise (NULL, "field name \"%s\" exceeds %d bytes", name, NAMESZ - 2);
355 while (isspace (*--cp) && cp >= name)
362 * get (more of) the text of a field. take
363 * characters up to the end of this field (newline
364 * followed by non-blank) or bufsz-1 characters.
366 cp = buf; i = bufsz-1;
369 cnt = (long) iob->_IO_read_end - (long) iob->_IO_read_ptr;
370 bp = (unsigned char *) --iob->_IO_read_ptr;
371 #elif defined(__DragonFly__)
372 cnt = ((struct __FILE_public *)iob)->_r++;
373 bp = (unsigned char *) --((struct __FILE_public *)iob)->_p;
376 bp = (unsigned char *) --iob->_ptr;
378 c = cnt < i ? cnt : i;
379 while ((ep = locc( c, bp, '\n' ))) {
381 * if we hit the end of this field, return.
383 if ((j = *++ep) != ' ' && j != '\t') {
385 j = ep - (unsigned char *) iob->_IO_read_ptr;
386 memcpy (cp, iob->_IO_read_ptr, j);
387 iob->_IO_read_ptr = ep;
388 #elif defined(__DragonFly__)
389 j = ep - (unsigned char *) ((struct __FILE_public *)iob)->_p;
390 memcpy (cp, ((struct __FILE_public *)iob)->_p, j);
391 ((struct __FILE_public *)iob)->_p = ep;
392 ((struct __FILE_public *)iob)->_r -= j;
394 j = ep - (unsigned char *) iob->_ptr;
395 memcpy (cp, iob->_ptr, j);
407 * end of input or dest buffer - copy what we've found.
410 c += bp - (unsigned char *) iob->_IO_read_ptr;
411 memcpy( cp, iob->_IO_read_ptr, c);
412 #elif defined(__DragonFly__)
413 c += bp - (unsigned char *) ((struct __FILE_public *)iob)->_p;
414 memcpy( cp, ((struct __FILE_public *)iob)->_p, c);
416 c += bp - (unsigned char *) iob->_ptr;
417 memcpy( cp, iob->_ptr, c);
422 /* the dest buffer is full */
424 iob->_IO_read_ptr += c;
425 #elif defined(__DragonFly__)
426 ((struct __FILE_public *)iob)->_r -= c;
427 ((struct __FILE_public *)iob)->_p += c;
436 * There's one character left in the input buffer.
437 * Copy it & fill the buffer. If the last char
438 * was a newline and the next char is not whitespace,
439 * this is the end of the field. Otherwise loop.
443 *cp++ = j = *(iob->_IO_read_ptr + c);
444 iob->_IO_read_ptr = iob->_IO_read_end;
445 c = __underflow(iob);
446 iob->_IO_read_ptr++; /* NOT automatic! */
447 #elif defined(__DragonFly__)
448 *cp++ =j = *(((struct __FILE_public *)iob)->_p + c);
451 *cp++ = j = *(iob->_ptr + c);
455 ((j == '\0' || j == '\n') && c != ' ' && c != '\t')) {
459 #elif defined(__DragonFly__)
460 --((struct __FILE_public *)iob)->_p;
461 ++((struct __FILE_public *)iob)->_r;
476 * get the message body up to bufsz characters or the
477 * end of the message. Sleazy hack: if bufsz is negative
478 * we assume that we were called to copy directly into
479 * the output buffer and we don't add an eos.
481 i = (bufsz < 0) ? -bufsz : bufsz-1;
483 bp = (unsigned char *) --iob->_IO_read_ptr;
484 cnt = (long) iob->_IO_read_end - (long) iob->_IO_read_ptr;
485 #elif defined(__DragonFly__)
486 bp = (unsigned char *) --((struct __FILE_public *)iob)->_p;
487 cnt = ++((struct __FILE_public *)iob)->_r;
489 bp = (unsigned char *) --iob->_ptr;
492 c = (cnt < i ? cnt : i);
493 if (msg_style != MS_DEFAULT && c > 1) {
495 * packed maildrop - only take up to the (possible)
496 * start of the next message. This "matchc" should
497 * probably be a Boyer-Moore matcher for non-vaxen,
498 * particularly since we have the alignment table
499 * all built for the end-of-buffer test (next).
500 * But our vax timings indicate that the "matchc"
501 * instruction is 50% faster than a carefully coded
502 * B.M. matcher for most strings. (So much for elegant
503 * algorithms vs. brute force.) Since I (currently)
504 * run MH on a vax, we use the matchc instruction. --vj
506 if ((ep = matchc( fdelimlen, fdelim, c, bp )))
510 * There's no delim in the buffer but there may be
511 * a partial one at the end. If so, we want to leave
512 * it so the "eom" check on the next call picks it up.
513 * Use a modified Boyer-Moore matcher to make this
514 * check relatively cheap. The first "if" figures
515 * out what position in the pattern matches the last
516 * character in the buffer. The inner "while" matches
517 * the pattern against the buffer, backwards starting
518 * at that position. Note that unless the buffer
519 * ends with one of the characters in the pattern
520 * (excluding the first and last), we do only one test.
523 if ((sp = pat_map[*ep])) {
526 while (*--ep == *--cp)
531 * ep < bp means that all the buffer
532 * contains is a prefix of delim.
533 * If this prefix is really a delim, the
534 * m_eom call at entry should have found
535 * it. Thus it's not a delim and we can
541 /* try matching one less char of delim string */
543 } while (--sp > fdelim);
547 memcpy( buf, bp, c );
549 iob->_IO_read_ptr += c;
550 #elif defined(__DragonFly__)
551 ((struct __FILE_public *)iob)->_r -= c;
552 ((struct __FILE_public *)iob)->_p += c;
565 adios (NULL, "m_getfld() called with bogus state of %d", state);
569 msg_count = cp - buf;
575 static char unixbuf[BUFSIZ] = "";
585 register char *delimstr;
588 * Figure out what the message delimiter string is for this
589 * maildrop. (This used to be part of m_Eom but I didn't like
590 * the idea of an "if" statement that could only succeed on the
591 * first call to m_Eom getting executed on each call, i.e., at
592 * every newline in the message).
594 * If the first line of the maildrop is a Unix "From " line, we
595 * say the style is MBOX and eat the rest of the line. Otherwise
596 * we say the style is MMDF and look for the delimiter string
597 * specified when nmh was built (or from the mts.conf file).
600 msg_style = MS_UNKNOWN;
603 if (fread (text, sizeof(*text), 5, iob) == 5
604 && strncmp (text, "From ", 5) == 0) {
606 delimstr = "\nFrom ";
608 while ((c = getc (iob)) != '\n' && c >= 0)
612 while ((c = getc (iob)) != '\n' && cp - unixbuf < BUFSIZ - 1)
617 /* not a Unix style maildrop */
618 fseek (iob, pos, SEEK_SET);
619 if (mmdlm2 == NULL || *mmdlm2 == 0)
620 mmdlm2 = "\001\001\001\001\n";
624 c = strlen (delimstr);
625 fdelim = (unsigned char *) mh_xmalloc((size_t) (c + 3));
628 msg_delim = (char *)fdelim+1;
629 edelim = (unsigned char *)msg_delim+1;
632 strcpy (msg_delim, delimstr);
633 delimend = (unsigned char *)msg_delim + edelimlen;
635 adios (NULL, "maildrop delimiter must be at least 2 bytes");
637 * build a Boyer-Moore end-position map for the matcher in m_getfld.
638 * N.B. - we don't match just the first char (since it's the newline
639 * separator) or the last char (since the matchc would have found it
640 * if it was a real delim).
642 pat_map = (unsigned char **) calloc (256, sizeof(unsigned char *));
644 for (cp = (char *) fdelim + 1; cp < (char *) delimend; cp++ )
645 pat_map[(unsigned char)*cp] = (unsigned char *) cp;
647 if (msg_style == MS_MMDF) {
648 /* flush extra msg hdrs */
649 while ((c = Getc(iob)) >= 0 && eom (c, iob))
658 m_eomsbr (int (*action)(int))
660 if ((eom_action = action)) {
667 msg_delim = (char *)fdelim + 1;
668 fdelimlen = strlen((char *)fdelim);
669 delimend = (unsigned char *)(msg_delim + edelimlen);
675 * test for msg delimiter string
679 m_Eom (int c, FILE *iob)
681 register long pos = 0L;
689 if ((i = fread (text, sizeof *text, edelimlen, iob)) != edelimlen
690 || strncmp (text, (char *)edelim, edelimlen)) {
691 if (i == 0 && msg_style == MS_MBOX)
692 /* the final newline in the (brain damaged) unix-format
693 * maildrop is part of the delimiter - delete it.
698 fseek (iob, pos, SEEK_SET);
701 fseek (iob, (long)(pos-1), SEEK_SET);
702 getc (iob); /* should be OK */
706 if (msg_style == MS_MBOX) {
708 while ((c = getc (iob)) != '\n')
713 while ((c = getc (iob)) != '\n' && c >= 0 && cp - unixbuf < BUFSIZ - 1)
725 * Return the Return-Path and Delivery-Date
726 * header information.
728 * Currently, I'm assuming that the "From " line
729 * takes one of the following forms.
731 * From sender date remote from host (for UUCP delivery)
732 * From sender@host date (for sendmail delivery)
736 get_returnpath (char *rp, int rplen, char *dd, int ddlen)
738 char *ap, *bp, *cp, *dp;
741 if (!(bp = cp = strchr(ap, ' ')))
745 * Check for "remote from" in envelope to see
746 * if this message uses UUCP style addressing
748 while ((cp = strchr(++cp, 'r'))) {
749 if (strncmp (cp, "remote from", 11) == 0) {
750 cp = strrchr (cp, ' ');
756 * Get the Return-Path information from
757 * the "From " envelope.
760 /* return path for UUCP style addressing */
761 dp = strchr (++cp, '\n');
762 snprintf (rp, rplen, "%.*s!%.*s\n", (int)(dp - cp), cp, (int)(bp - ap), ap);
764 /* return path for standard domain addressing */
765 snprintf (rp, rplen, "%.*s\n", (int)(bp - ap), ap);
769 * advance over the spaces to get to
770 * delivery date on envelope
775 /* Now get delivery date from envelope */
776 snprintf (dd, ddlen, "%.*s\n", 24, bp);
784 static unsigned char *
785 matchc(int patln, char *pat, int strln, char *str)
787 register char *es = str + strln - patln;
790 register char *ep = pat + patln;
791 register char pc = *pat++;
800 while (pp < ep && *sp++ == *pp)
803 return ((unsigned char *)--str);
809 * Locate character "term" in the next "cnt" characters of "src".
810 * If found, return its address, otherwise return 0.
static unsigned char *
locc(int cnt, unsigned char *src, unsigned char term)
{
    /*
     * An empty (or negative) range cannot contain "term"; return
     * before touching "src" so no byte outside the range is read.
     */
    if (cnt <= 0)
        return NULL;

    /* memchr yields the first match within cnt bytes, or NULL if none. */
    return memchr(src, term, (size_t) cnt);
}