1 /* m_getfld.c - read/parse a message */
3 static char ident[] = "@(#)$Id: m_getfld.c,v 1.15 1993/02/26 21:57:14 jromine Exp $";
8 #include "../zotnet/mts.h"
12 /* This module has a long and checkered history. First, it didn't burst
13 maildrops correctly because it considered two CTRL-A:s in a row to be
14 an inter-message delimiter. It really is four CTRL-A:s followed by a
15 newline. Unfortunately, MMDF will convert this delimiter *inside* a
16 message to a CTRL-B followed by three CTRL-A:s and a newline. This
17 caused the old version of m_getfld() to declare eom prematurely. The
18 fix was a lot slower than
20 c == '\001' && peekc (iob) == '\001'
22 but it worked, and to increase generality, UUCP style maildrops could
23 be parsed as well. Unfortunately the speed issue finally caught up with
24 us since this routine is at the very heart of MH.
26 To speed things up considerably, the routine Eom() was made an auxiliary
27 function called by the macro eom(). Unless we are bursting a maildrop,
28 the eom() macro returns FALSE saying we aren't at the end of the
31 The next thing to do is to read the mtstailor file and initialize
32 delimiter[] and delimlen accordingly...
34 After mhl was made a built-in in msh, m_getfld() worked just fine
35 (using m_unknown() at startup). Until one day: a message which was
36 the result of a bursting was shown. Then, since the burst boundaries
37 aren't CTRL-A:s, m_getfld() would blindly plunge on past the boundary.
38 Very sad. The solution: introduce m_eomsbr(). This hook gets called
39 after the end of each line (since testing for eom involves an fseek()).
40 This worked fine, until one day: a message with no body portion arrived.
43 while (eom (c = Getc (iob), iob))
46 loop caused m_getfld() to return FMTERR. So, that logic was changed to
47 check for (*eom_action) and act accordingly.
49 This worked fine, until one day: someone didn't use four CTRL-A's as
50 their delimiters. So, the bullet got bit and we read mts.h and
51 continue to struggle on. It's not that bad though, since the only time
52 the code gets executed is when inc (or msh) calls it, and both of these
53 have already called mts_init().
55 ------------------------
56 (Written by Van Jacobson for the mh6 m_getfld, January, 1986):
58 This routine was accounting for 60% of the cpu time used by most mh
59 programs. I spent a bit of time tuning and it now accounts for <10%
60 of the time used. Like any heavily tuned routine, it's a bit
61 complex and you want to be sure you understand everything that it's
62 doing before you start hacking on it. Let me try to emphasize
63 that: every line in this atrocity depends on every other line,
64 sometimes in subtle ways. You should understand it all, in detail,
65 before trying to change any part. If you do change it, test the
66 result thoroughly (I use a hand-constructed test file that exercises
67 all the ways a header name, header body, header continuation,
68 header-body separator, body line and body eom can align themselves
69 with respect to a buffer boundary). "Minor" bugs in this routine
70 result in garbaged or lost mail.
72 If you hack on this and slow it down, I, my children and my
73 children's children will curse you.
75 This routine gets used on three different types of files: normal,
76 single msg files, "packed" unix or mmdf mailboxes (when used by inc)
77 and packed, directoried bulletin board files (when used by msh).
78 The biggest impact of different file types is in "eom" testing. The
79 code has been carefully organized to test for eom at appropriate
80 times and at no other times (since the check is quite expensive).
81 I have tried to arrange things so that the eom check need only be
82 done on entry to this routine. Since an eom can only occur after a
83 newline, this is easy to manage for header fields. For the msg
84 body, we try to efficiently search the input buffer to see if it
85 contains the eom delimiter. If it does, we take up to the
86 delimiter, otherwise we take everything in the buffer. (The change
87 to the body eom/copy processing produced the most noticeable
88 performance difference, particularly for "inc" and "show".)
90 There are three qualitatively different things this routine busts
91 out of a message: field names, field text and msg bodies. Field
92 names are typically short (~8 char) and the loop that extracts them
93 might terminate on a colon, newline or max width. I considered
94 using a Vax "scanc" to locate the end of the field followed by a
95 "bcopy" but the routine call overhead on a Vax is too large for this
96 to work on short names. If Berkeley ever makes "inline" part of the
97 C optimiser (so things like "scanc" turn into inline instructions) a
98 change here would be worthwhile.
100 Field text is typically 60 - 100 characters so there's (barely)
101 a win in doing a routine call to something that does a "locc"
102 followed by a "bmove". About 30% of the fields have continuations
103 (usually the 822 "received:" lines) and each continuation generates
104 another routine call. "Inline" would be a big win here, as well.
106 Messages, as of this writing, seem to come in two flavors: small
107 (~1K) and long (>2K). Most messages have 400 - 600 bytes of headers
108 so message bodies average at least a few hundred characters.
109 Assuming your system uses reasonably sized stdio buffers (1K or
110 more), this routine should be able to remove the body in large
111 (>500 byte) chunks. This makes the cost of a call to "bcopy"
112 small but there is a premium on checking for the eom in packed
113 maildrops. The eom pattern is always a simple string so we can
114 construct an efficient pattern matcher for it (e.g., a Vax "matchc"
115 instruction). Some thought went into recognizing the start of
116 an eom that has been split across two buffers.
118 This routine wants to deal with large chunks of data so, rather
119 than "getc" into a local buffer, it uses stdio's buffer. If
120 you try to use it on a non-buffered file, you'll get what you
121 deserve. This routine "knows" that struct FILEs have a _ptr
122 and a _cnt to describe the current state of the buffer and
123 it knows that _filbuf ignores the _ptr & _cnt and simply fills
124 the buffer. If stdio on your system doesn't work this way, you
125 may have to make small changes in this routine.
127 This routine also "knows" that an EOF indication on a stream is
128 "sticky" (i.e., you will keep getting EOF until you reposition the
129 stream). If your system doesn't work this way it is broken and you
130 should complain to the vendor. As a consequence of the sticky
131 EOF, this routine will never return any kind of EOF status when
132 there is data in "name" or "buf".
136 #define Getc(iob) getc(iob)
137 #define eom(c,iob) (msg_style != MS_DEFAULT && \
138 (((c) == *msg_delim && m_Eom(c,iob)) ||\
139 (eom_action && (*eom_action)(c))))
141 static unsigned char *matchc();
142 static unsigned char *locc();
144 static unsigned char **pat_map;
146 extern int msg_count; /* defined in sbr/m_msgdef.c = 0
147 * disgusting hack for "inc" so it can
148 * know how many characters were stuffed
149 * in the buffer on the last call (see
150 * comments in uip/scansbr.c) */
152 extern int msg_style; /* defined in sbr/m_msgdef.c = MS_DEFAULT */
154 * The "full" delimiter string for a packed maildrop consists
155 * of a newline followed by the actual delimiter. E.g., the
156 * full string for a Unix maildrop would be: "\n\nFrom ".
157 * "Fdelim" points to the start of the full string and is used
158 * in the BODY case of the main routine to search the buffer for
159 * a possible eom. Msg_delim points to the first character of
160 * the actual delim. string (i.e., fdelim+1). Edelim
161 * points to the 2nd character of actual delimiter string. It
162 * is used in m_Eom because the first character of the string
163 * has been read and matched before m_Eom is called.
165 extern char *msg_delim; /* defined in sbr/m_msgdef.c = "" */
166 static unsigned char *fdelim;
167 static unsigned char *delimend;
168 static int fdelimlen;
169 static unsigned char *edelim;
170 static int edelimlen;
172 static int (*eom_action) () = NULL;
175 #define _ptr _p /* Gag */
176 #define _cnt _r /* Retch */
177 #define _filbuf __srget /* Puke */
182 m_getfld (state, name, buf, bufsz, iob)
189 register unsigned char *cp;
190 register unsigned char *bp;
191 register unsigned char *ep;
192 register unsigned char *sp;
198 if ((c = Getc(iob)) < 0) {
205 /* flush null messages */
206 while ((c = Getc(iob)) >= 0 && eom (c, iob))
209 (void) ungetc(c, iob);
220 if (c == '\n' || c == '-') {
221 /* we hit the header/body separator */
222 while (c != '\n' && (c = Getc(iob)) >= 0)
225 if (c < 0 || (c = Getc(iob)) < 0 || eom (c, iob)) {
227 /* flush null messages */
228 while ((c = Getc(iob)) >= 0 && eom (c, iob))
231 (void) ungetc(c, iob);
241 * get the name of this component. take characters up
242 * to a ':', a newline or NAMESZ-1 characters, whichever
245 cp = name; i = NAMESZ - 1;
247 bp = sp = (unsigned char *) iob->_ptr - 1;
248 j = (cnt = iob->_cnt+1) < i ? cnt : i;
249 while ((c = *bp++) != ':' && c != '\n' && --j >= 0)
253 if ((cnt -= j) <= 0) {
254 if (_filbuf(iob) == EOF) {
256 advise (NULLCP, "eof encountered in field \"%s\"",
268 * something went wrong. possibilities are:
269 * . hit a newline (error)
270 * . got more than namesz chars. (error)
271 * . hit the end of the buffer. (loop)
275 advise (NULLCP, "eol encountered in field \"%s\"", name);
281 advise (NULLCP, "field name \"%s\" exceeds %d bytes",
288 while (isspace (*--cp) && cp >= name)
295 * get (more of) the text of a field. take
296 * characters up to the end of this field (newline
297 * followed by non-blank) or bufsz-1 characters.
299 cp = buf; i = bufsz-1;
301 cnt = iob->_cnt++; bp = (unsigned char *) --iob->_ptr;
302 c = cnt < i ? cnt : i;
303 while (ep = locc( c, bp, '\n' )) {
305 * if we hit the end of this field, return.
307 if ((j = *++ep) != ' ' && j != '\t') {
308 j = ep - (unsigned char *) iob->_ptr;
309 (void) bcopy( iob->_ptr, cp, j);
310 iob->_ptr = ep; iob->_cnt -= j;
315 c -= ep - bp; bp = ep;
318 * end of input or dest buffer - copy what we've found.
320 c += bp - (unsigned char *) iob->_ptr;
321 (void) bcopy( iob->_ptr, cp, c);
324 /* the dest buffer is full */
325 iob->_cnt -= c; iob->_ptr += c;
330 * There's one character left in the input buffer.
331 * Copy it & fill the buffer. If the last char
332 * was a newline and the next char is not whitespace,
333 * this is the end of the field. Otherwise loop.
336 *cp++ = j = *(iob->_ptr + c);
338 if ((j == '\0' || j == '\n') && c != ' ' && c != '\t') {
340 --iob->_ptr, ++iob->_cnt;
350 * get the message body up to bufsz characters or the
351 * end of the message. Sleazy hack: if bufsz is negative
352 * we assume that we were called to copy directly into
353 * the output buffer and we don't add an eos.
355 i = (bufsz < 0) ? -bufsz : bufsz-1;
356 bp = (unsigned char *) --iob->_ptr; cnt = ++iob->_cnt;
357 c = (cnt < i ? cnt : i);
358 if (msg_style != MS_DEFAULT && c > 1) {
360 * packed maildrop - only take up to the (possible)
361 * start of the next message. This "matchc" should
362 * probably be a Boyer-Moore matcher for non-vaxen,
363 * particularly since we have the alignment table
364 * all built for the end-of-buffer test (next).
365 * But our vax timings indicate that the "matchc"
366 * instruction is 50% faster than a carefully coded
367 * B.M. matcher for most strings. (So much for elegant
368 * algorithms vs. brute force.) Since I (currently)
369 * run MH on a vax, we use the matchc instruction. --vj
371 if (ep = matchc( fdelimlen, fdelim, c, bp ) )
375 * There's no delim in the buffer but there may be
376 * a partial one at the end. If so, we want to leave
377 * it so the "eom" check on the next call picks it up.
378 * Use a modified Boyer-Moore matcher to make this
379 * check relatively cheap. The first "if" figures
380 * out what position in the pattern matches the last
381 * character in the buffer. The inner "while" matches
382 * the pattern against the buffer, backwards starting
383 * at that position. Note that unless the buffer
384 * ends with one of the characters in the pattern
385 * (excluding the first and last), we do only one test.
388 if (sp = pat_map[*ep]) {
391 while (*--ep == *--cp)
396 * ep < bp means that all the buffer
397 * contains is a prefix of delim.
398 * If this prefix is really a delim, the
399 * m_eom call at entry should have found
400 * it. Thus it's not a delim and we can
406 /* try matching one less char of delim string */
408 } while (--sp > fdelim);
412 (void) bcopy( bp, buf, c );
423 adios (NULLCP, "m_getfld() called with bogus state of %d", state);
427 msg_count = cp - buf;
434 static char unixbuf[BUFSIZ] = "";
445 register char *delimstr;
447 msg_style = MS_UNKNOWN;
449 /* Figure out what the message delimiter string is for this
450 * maildrop. (This used to be part of m_Eom but I didn't like
451 * the idea of an "if" statement that could only succeed on the
452 * first call to m_Eom getting executed on each call, i.e., at
453 * every newline in the message).
455 * If the first line of the maildrop is a Unix "from" line, we say the
456 * style is UUCP and eat the rest of the line. Otherwise we say the style
457 * is MMDF & look for the delimiter string specified when MH was built
458 * (or from the mtstailor file).
461 if (fread (text, sizeof *text, 5, iob) == 5
462 && strncmp (text, "From ", 5) == 0) {
464 delimstr = "\nFrom ";
466 while ((c = getc (iob)) != '\n' && c >= 0)
470 while ((c = getc (iob)) != '\n')
475 /* not a Unix style maildrop */
476 (void) fseek (iob, pos, 0);
477 if (mmdlm2 == NULLCP || *mmdlm2 == 0)
478 mmdlm2 = "\001\001\001\001\n";
482 c = strlen (delimstr);
483 fdelim = (unsigned char *)malloc((unsigned)c + 3);
486 msg_delim = (char *)fdelim+1;
487 edelim = (unsigned char *)msg_delim+1;
490 (void)strcpy(msg_delim, delimstr);
491 delimend = (unsigned char *)msg_delim + edelimlen;
493 adios (NULLCP, "maildrop delimiter must be at least 2 bytes");
495 * build a Boyer-Moore end-position map for the matcher in m_getfld.
496 * N.B. - we don't match just the first char (since it's the newline
497 * separator) or the last char (since the matchc would have found it
498 * if it was a real delim).
500 pat_map = (unsigned char **) calloc (256, sizeof (unsigned char *));
502 for (cp = (char *)fdelim + 1; cp < (char *)delimend; cp++ )
503 pat_map[*cp] = (unsigned char *)cp;
505 if (msg_style == MS_MMDF) {
506 /* flush extra msg hdrs */
507 while ((c = Getc(iob)) >= 0 && eom (c, iob))
510 (void) ungetc(c, iob);
515 void m_eomsbr (action)
518 if (eom_action = action) {
525 msg_delim = (char *)fdelim + 1;
526 fdelimlen = strlen((char *)fdelim);
527 delimend = (unsigned char *)(msg_delim + edelimlen);
533 /* test for msg delimiter string */
539 register long pos = 0L;
547 if ((i = fread (text, sizeof *text, edelimlen, iob)) != edelimlen
548 || strncmp (text, (char *)edelim, edelimlen)) {
549 if (i == 0 && msg_style == MS_UUCP)
550 /* the final newline in the (brain damaged) unix-format
551 * maildrop is part of the delimiter - delete it.
556 (void) fseek (iob, pos, 0);
558 (void) fseek (iob, (long)(pos-1), 0);
559 (void) getc (iob); /* should be OK */
564 if (msg_style == MS_UUCP) {
566 while ((c = getc (iob)) != '\n')
571 while ((c = getc (iob)) != '\n' && c >= 0)
587 static char unixfrom[BUFSIZ];
590 if (cp = dp = index (unixbuf, ' ')) {
591 while (cp = index (cp + 1, 'r'))
592 if (strncmp (cp, "remote from ", 12) == 0) {
594 (void) sprintf (pp, "%s!", cp + 12);
599 cp = unixbuf + strlen (unixbuf);
600 if ((cp -= 25) >= dp)
604 (void) sprintf (pp, "%s\n", unixbuf);
614 asm("_matchc: .word 0");
615 asm(" movq 4(ap),r0");
616 asm(" movq 12(ap),r2");
617 asm(" matchc r0,(r1),r2,(r3)");
619 asm(" movl 4(ap),r3");
620 asm("1: subl3 4(ap),r3,r0");
623 static unsigned char *
624 matchc( patln, pat, strln, str )
630 register char *es = str + strln - patln;
633 register char *ep = pat + patln;
634 register char pc = *pat++;
642 while (pp < ep && *sp++ == *pp)
645 return ((unsigned char *)--str);
653 * Locate character "term" in the next "cnt" characters of "src".
654 * If found, return its address, otherwise return 0.
658 asm("_locc: .word 0");
659 asm(" movq 4(ap),r0");
660 asm(" locc 12(ap),r0,(r1)");
665 static unsigned char *
666 locc( cnt, src, term )
668 register unsigned char *src;
669 register unsigned char term;
671 while (*src++ != term && --cnt > 0);
673 return (cnt > 0 ? --src : (unsigned char *)0);
679 #if !defined (BSD42) && !defined (bcopy)
680 int bcmp (b1, b2, length)
693 bcopy (b1, b2, length)
710 #endif /* not BSD42 */