1 /* rfc822parse.c - Simple mail and MIME parser
2 * Copyright (C) 1999, 2000 Werner Koch, Duesseldorf
3 * Copyright (C) 2003, 2004 g10 Code GmbH
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Lesser General Public License
7 * as published by the Free Software Foundation; either version 3 of
8 * the License, or (at your option) any later version.
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU Lesser General Public License for more details.
15 * You should have received a copy of the GNU Lesser General Public
16 * License along with this program; if not, see <https://www.gnu.org/licenses/>.
20 /* According to RFC822 binary zeroes are allowed at many places. We do
21 * not handle this correctly, especially in the field parsing code. It
22 * should be easy to fix and the API provides an interface which
23 * returns the length but in addition makes sure that returned strings
24 * are always ended by a \0.
26 * Furthermore, the case of field names is changed and thus it is not
27 * always a good idea to use these modified header
28 * lines (e.g. signatures may break).
42 #include "rfc822parse.h"
53 /* For now we directly use our TOKEN as the parse context */
54 typedef struct rfc822parse_field_context *TOKEN;
55 struct rfc822parse_field_context
61 unsigned int lowered:1;
63 /*TOKEN owner_pantry; */
69 struct hdr_line *next;
70 int cont; /* This is a continuation of the previous line. */
71 unsigned char line[1];
74 typedef struct hdr_line *HDR_LINE;
79 struct part *right; /* The next part. */
80 struct part *down; /* A contained part. */
81 HDR_LINE hdr_lines; /* Header lines os that part. */
82 HDR_LINE *hdr_lines_tail; /* Helper for adding lines. */
83 char *boundary; /* Only used in the first part. */
85 typedef struct part *part_t;
87 struct rfc822parse_context
89 rfc822parse_cb_t callback;
93 int in_preamble; /* Wether we are before the first boundary. */
94 part_t parts; /* The tree of parts. */
95 part_t current_part; /* Whom we are processing (points into parts). */
96 const char *boundary; /* Current boundary. */
99 static HDR_LINE find_header (rfc822parse_t msg, const char *name,
100 int which, HDR_LINE * rprev);
104 length_sans_trailing_ws (const unsigned char *line, size_t len)
106 const unsigned char *p, *mark;
109 for (mark=NULL, p=line, n=0; n < len; n++, p++)
111 if (strchr (" \t\r\n", *p ))
127 lowercase_string (unsigned char *string)
129 for (; *string; string++)
130 if (*string >= 'A' && *string <= 'Z')
131 *string = *string - 'A' + 'a';
134 /* Transform a header name into a standard capitalized format; i.e.
135 "Content-Type". Conversion stops at the colon. As usual we don't
136 use the localized versions of ctype.h.
139 capitalize_header_name (unsigned char *name)
143 for (; *name && *name != ':'; name++)
148 if (*name >= 'a' && *name <= 'z')
149 *name = *name - 'a' + 'A';
152 else if (*name >= 'A' && *name <= 'Z')
153 *name = *name - 'A' + 'a';
158 stpcpy (char *a,const char *b)
169 /* If a callback has been registered, call it for the event of type
172 do_callback (rfc822parse_t msg, rfc822parse_event_t event)
176 if (!msg->callback || msg->callback_error)
178 rc = msg->callback (msg->callback_value, event, msg);
180 msg->callback_error = rc;
189 part = calloc (1, sizeof *part);
192 part->hdr_lines_tail = &part->hdr_lines;
199 release_part (part_t part)
204 for (; part; part = tmp)
208 release_part (part->down);
209 for (hdr = part->hdr_lines; hdr; hdr = hdr2)
214 free (part->boundary);
221 release_handle_data (rfc822parse_t msg)
223 release_part (msg->parts);
225 msg->current_part = NULL;
226 msg->boundary = NULL;
230 /* Create a new parsing context for an entire rfc822 message and
231 return it. CB and CB_VALUE may be given to callback for certain
232 events. NULL is returned on error with errno set appropriately. */
234 rfc822parse_open (rfc822parse_cb_t cb, void *cb_value)
236 rfc822parse_t msg = calloc (1, sizeof *msg);
239 msg->parts = msg->current_part = new_part ();
248 msg->callback_value = cb_value;
249 if (do_callback (msg, RFC822PARSE_OPEN))
251 release_handle_data (msg);
262 rfc822parse_cancel (rfc822parse_t msg)
266 do_callback (msg, RFC822PARSE_CANCEL);
267 release_handle_data (msg);
274 rfc822parse_close (rfc822parse_t msg)
278 do_callback (msg, RFC822PARSE_CLOSE);
279 release_handle_data (msg);
285 find_parent (part_t tree, part_t target)
289 for (part = tree->down; part; part = part->right)
292 return tree; /* Found. */
295 part_t tmp = find_parent (part, target);
304 set_current_part_to_parent (rfc822parse_t msg)
308 assert (msg->current_part);
309 parent = find_parent (msg->parts, msg->current_part);
311 return; /* Already at the top. */
316 for (part = parent->down; part; part = part->right)
317 if (part == msg->current_part)
322 msg->current_part = parent;
324 parent = find_parent (msg->parts, parent);
325 msg->boundary = parent? parent->boundary: NULL;
331 * We have read in all header lines and are about to receive the body
332 * part. The delimiter line has already been processed.
334 * FIXME: we'd better return an error in case of memory failures.
337 transition_to_body (rfc822parse_t msg)
339 rfc822parse_field_t ctx;
342 rc = do_callback (msg, RFC822PARSE_T2BODY);
345 /* Store the boundary if we have multipart type. */
346 ctx = rfc822parse_parse_field (msg, "Content-Type", -1);
351 s = rfc822parse_query_media_type (ctx, NULL);
352 if (s && !strcmp (s,"multipart"))
354 s = rfc822parse_query_parameter (ctx, "boundary", 0);
357 assert (!msg->current_part->boundary);
358 msg->current_part->boundary = malloc (strlen (s) + 1);
359 if (msg->current_part->boundary)
363 strcpy (msg->current_part->boundary, s);
364 msg->boundary = msg->current_part->boundary;
368 int save_errno = errno;
369 rfc822parse_release_field (ctx);
373 rc = do_callback (msg, RFC822PARSE_LEVEL_DOWN);
374 assert (!msg->current_part->down);
375 msg->current_part->down = part;
376 msg->current_part = part;
377 msg->in_preamble = 1;
381 rfc822parse_release_field (ctx);
388 /* We have just passed a MIME boundary and need to prepare for new part.
391 transition_to_header (rfc822parse_t msg)
395 assert (msg->current_part);
396 assert (!msg->current_part->right);
402 msg->current_part->right = part;
403 msg->current_part = part;
409 insert_header (rfc822parse_t msg, const unsigned char *line, size_t length)
413 assert (msg->current_part);
417 return transition_to_body (msg);
420 if (!msg->current_part->hdr_lines)
421 do_callback (msg, RFC822PARSE_BEGIN_HEADER);
423 length = length_sans_trailing_ws (line, length);
424 hdr = malloc (sizeof (*hdr) + length);
428 hdr->cont = (*line == ' ' || *line == '\t');
429 memcpy (hdr->line, line, length);
430 hdr->line[length] = 0; /* Make it a string. */
432 /* Transform a field name into canonical format. */
433 if (!hdr->cont && strchr (line, ':'))
434 capitalize_header_name (hdr->line);
436 *msg->current_part->hdr_lines_tail = hdr;
437 msg->current_part->hdr_lines_tail = &hdr->next;
439 /* Let's help the caller to prevent mail loops and issue an event for
440 * every Received header. */
441 if (length >= 9 && !memcmp (line, "Received:", 9))
442 do_callback (msg, RFC822PARSE_RCVD_SEEN);
448 * Note: We handle the body transparently to allow binary zeroes in it.
451 insert_body (rfc822parse_t msg, const unsigned char *line, size_t length)
455 if (length > 2 && *line == '-' && line[1] == '-' && msg->boundary)
457 size_t blen = strlen (msg->boundary);
459 if (length == blen + 2
460 && !memcmp (line+2, msg->boundary, blen))
462 rc = do_callback (msg, RFC822PARSE_BOUNDARY);
464 if (!rc && !msg->in_preamble)
465 rc = transition_to_header (msg);
466 msg->in_preamble = 0;
468 else if (length == blen + 4
469 && line[length-2] =='-' && line[length-1] == '-'
470 && !memcmp (line+2, msg->boundary, blen))
472 rc = do_callback (msg, RFC822PARSE_LAST_BOUNDARY);
473 msg->boundary = NULL; /* No current boundary anymore. */
474 set_current_part_to_parent (msg);
476 /* Fixme: The next should actually be sent right before the
477 next boundary, so that we can mark the epilogue. */
479 rc = do_callback (msg, RFC822PARSE_LEVEL_UP);
482 if (msg->in_preamble && !rc)
483 rc = do_callback (msg, RFC822PARSE_PREAMBLE);
488 /* Insert the next line into the parser. Return 0 on success or true
489 on error with errno set appropriately. */
491 rfc822parse_insert (rfc822parse_t msg, const unsigned char *line, size_t length)
494 ? insert_body (msg, line, length)
495 : insert_header (msg, line, length));
499 /* Tell the parser that we have finished the message. */
501 rfc822parse_finish (rfc822parse_t msg)
503 return do_callback (msg, RFC822PARSE_FINISH);
509 * Get a copy of a header line. The line is returned as one long
510 * string with LF to separate the continuation line. Caller must free
511 * the return buffer. WHICH may be used to enumerate over all lines.
512 * Wildcards are allowed. This function works on the current headers;
513 * i.e. the regular mail headers or the MIME headers of the current
516 * WHICH gives the mode:
517 * -1 := Take the last occurrence
518 * n := Take the n-th one.
520 * Returns a newly allocated buffer or NULL on error. errno is set in
521 * case of a memory failure or set to 0 if the requested field is not
524 * If VALUEOFF is not NULL it will receive the offset of the first non
525 * space character in the value part of the line (i.e. after the first
529 rfc822parse_get_field (rfc822parse_t msg, const char *name, int which,
536 h = find_header (msg, name, which, NULL);
540 return NULL; /* no such field */
543 n = strlen (h->line) + 1;
544 for (h2 = h->next; h2 && h2->cont; h2 = h2->next)
545 n += strlen (h2->line) + 1;
547 buf = p = malloc (n);
550 p = stpcpy (p, h->line);
552 for (h2 = h->next; h2 && h2->cont; h2 = h2->next)
554 p = stpcpy (p, h2->line);
562 p = strchr (buf, ':');
564 *valueoff = 0; /* Oops: should never happen. */
568 while (*p == ' ' || *p == '\t' || *p == '\r' || *p == '\n')
579 * Enumerate all header lines. Caller has to provide the address of a pointer
580 * which has to be initialized to NULL; the caller should then never change this
581 * pointer until he has closed the enumeration by passing again the address
582 * of the pointer but with msg set to NULL.
583 * The function returns pointers to all the header lines or NULL when
584 * all lines have been enumerated or no headers are available.
587 rfc822parse_enum_header_lines (rfc822parse_t msg, void **context)
591 if (!msg) /* Close. */
594 if (*context == msg || !msg->current_part)
597 l = *context ? (HDR_LINE) *context : msg->current_part->hdr_lines;
601 *context = l->next ? (void *) (l->next) : (void *) msg;
604 *context = msg; /* Mark end of list. */
611 * Find a header field. If the Name does end in an asterisk this is meant
614 * which -1 : Retrieve the last field
615 * >0 : Retrieve the n-th field
617 * RPREV may be used to return the predecessor of the returned field;
618 * which may be NULL for the very first one. It has to be initialized
619 * to either NULL in which case the search start at the first header line,
620 * or it may point to a headerline, where the search should start
623 find_header (rfc822parse_t msg, const char *name, int which, HDR_LINE *rprev)
625 HDR_LINE hdr, prev = NULL, mark = NULL;
631 if (!msg->current_part)
634 namelen = strlen (name);
635 if (namelen && name[namelen - 1] == '*')
641 hdr = msg->current_part->hdr_lines;
644 /* spool forward to the requested starting place.
645 * we cannot simply set this as we have to return
646 * the previous list element too */
647 for (; hdr && hdr != *rprev; prev = hdr, hdr = hdr->next)
651 for (; hdr; prev = hdr, hdr = hdr->next)
655 if (!(p = strchr (hdr->line, ':')))
656 continue; /* invalid header, just skip it. */
659 continue; /* invalid name */
660 if ((glob ? (namelen <= n) : (namelen == n))
661 && !memcmp (hdr->line, name, namelen))
666 else if (found == which)
682 skip_ws (const char *s)
684 while (*s == ' ' || *s == '\t' || *s == '\r' || *s == '\n')
691 release_token_list (TOKEN t)
696 /* fixme: If we have owner_pantry, put the token back to
697 * this pantry so that it can be reused later */
705 new_token (enum token_type type, const char *buf, size_t length)
709 /* fixme: look through our pantries to find a suitable
711 t = malloc (sizeof *t + length);
716 memset (&t->flags, 0, sizeof (t->flags));
720 memcpy (t->data, buf, length);
721 t->data[length] = 0; /* Make sure it is a C string. */
730 append_to_token (TOKEN old, const char *buf, size_t length)
732 size_t n = strlen (old->data);
735 t = malloc (sizeof *t + n + length);
740 t->flags = old->flags;
741 memcpy (t->data, old->data, n);
742 memcpy (t->data + n, buf, length);
743 t->data[n + length] = 0;
745 release_token_list (old);
753 Parse a field into tokens as defined by rfc822.
756 parse_field (HDR_LINE hdr)
758 static const char specials[] = "<>@.,;:\\[]\"()";
759 static const char specials2[] = "<>@.,;:";
760 static const char tspecials[] = "/?=<>@,;:\\[]\"()";
761 static const char tspecials2[] = "/?=<>@.,;:"; /* FIXME: really
765 const unsigned char *name;
767 } tspecial_header[] = {
768 { "Content-Type", 12},
769 { "Content-Transfer-Encoding", 25},
770 { "Content-Disposition", 19},
773 const char *delimiters;
774 const char *delimiters2;
775 const unsigned char *line, *s, *s2;
778 TOKEN t, tok, *tok_tail;
788 if (!(s = strchr (line, ':')))
789 return NULL; /* oops */
793 return NULL; /* oops: invalid name */
795 delimiters = specials;
796 delimiters2 = specials2;
797 for (i = 0; tspecial_header[i].name; i++)
799 if (n == tspecial_header[i].namelen
800 && !memcmp (line, tspecial_header[i].name, n))
802 delimiters = tspecials;
803 delimiters2 = tspecials2;
808 s++; /* Move over the colon. */
813 if (!hdr->next || !hdr->next->cont)
814 return tok; /* Ready. */
816 /* Next item is a header continuation line. */
831 if (!hdr->next || !hdr->next->cont)
833 /* Next item is a header continuation line. */
842 else if (*s == '\\' && s[1]) /* what about continuation? */
857 ; /* Actually this is an error, but we don't care about it. */
861 else if (*s == '\"' || *s == '[')
863 /* We do not check for non-allowed nesting of domainliterals */
864 int term = *s == '\"' ? '\"' : ']';
871 for (s2 = s; *s2; s2++)
875 else if (*s2 == '\\' && s2[1]) /* what about continuation? */
880 ? append_to_token (t, s, s2 - s)
881 : new_token (term == '\"'? tQUOTED : tDOMAINLIT, s, s2 - s));
885 if (*s2 || !hdr->next || !hdr->next->cont)
887 /* Next item is a header continuation line. */
895 s++; /* skip the delimiter */
897 else if ((s2 = strchr (delimiters2, *s)))
898 { /* Special characters which are not handled above. */
900 t = new_token (tSPECIAL, s, 1);
907 else if (*s == ' ' || *s == '\t' || *s == '\r' || *s == '\n')
912 else if (*s > 0x20 && !(*s & 128))
915 for (s2 = s + 1; *s2 > 0x20
916 && !(*s2 & 128) && !strchr (delimiters, *s2); s2++)
918 t = new_token (tATOM, s, s2 - s);
926 { /* Invalid character. */
928 { /* For parsing we assume only one space. */
929 t = new_token (tSPACE, NULL, 0);
944 release_token_list (tok);
954 * Find and parse a header field.
955 * WHICH indicates what to do if there are multiple instances of the same
956 * field (like "Received"); the following values are defined:
957 * -1 := Take the last occurrence
959 * n := Take the n-th one.
960 * Returns a handle for further operations on the parse context of the field
961 * or NULL if the field was not found.
964 rfc822parse_parse_field (rfc822parse_t msg, const char *name, int which)
971 hdr = find_header (msg, name, which, NULL);
974 return parse_field (hdr);
978 rfc822parse_release_field (rfc822parse_field_t ctx)
981 release_token_list (ctx);
987 * Check whether T points to a parameter.
988 * A parameter starts with a semicolon and it is assumed that t
989 * points to exactly this one.
992 is_parameter (TOKEN t)
995 if (!t || t->type != tATOM)
998 if (!t || !(t->type == tSPECIAL && t->data[0] == '='))
1002 return 1; /* We assume that an non existing value is an empty one. */
1003 return t->type == tQUOTED || t->type == tATOM;
1007 Some headers (Content-type) have a special syntax where attribute=value
1008 pairs are used after a leading semicolon. The parse_field code
1009 knows about these fields and changes the parsing to the one defined
1011 Returns a pointer to the value which is valid as long as the
1012 parse context is valid; NULL is returned in case that attr is not
1013 defined in the header; a missing value is represented by an empty string.
1015 With LOWER_VALUE set to true, a matching field value will be
1018 Note, that ATTR should be lowercase.
1021 rfc822parse_query_parameter (rfc822parse_field_t ctx, const char *attr,
1026 for (t = ctx; t; t = t->next)
1028 /* skip to the next semicolon */
1029 for (; t && !(t->type == tSPECIAL && t->data[0] == ';'); t = t->next)
1033 if (is_parameter (t))
1034 { /* Look closer. */
1035 a = t->next; /* We know that this is an atom */
1036 if ( !a->flags.lowered )
1038 lowercase_string (a->data);
1039 a->flags.lowered = 1;
1041 if (!strcmp (a->data, attr))
1044 /* Either T is now an atom, a quoted string or NULL in
1045 * which case we return an empty string. */
1047 if ( lower_value && t && !t->flags.lowered )
1049 lowercase_string (t->data);
1050 t->flags.lowered = 1;
1052 return t ? t->data : "";
1060 * This function may be used for the Content-Type header to figure out
1061 * the media type and subtype. Note, that the returned strings are
1062 * guaranteed to be lowercase as required by MIME.
1064 * Returns: a pointer to the media type and if subtype is not NULL,
1065 * a pointer to the subtype.
1068 rfc822parse_query_media_type (rfc822parse_field_t ctx, const char **subtype)
1073 if (t->type != tATOM)
1075 if (!t->flags.lowered)
1077 lowercase_string (t->data);
1078 t->flags.lowered = 1;
1082 if (!t || t->type != tSPECIAL || t->data[0] != '/')
1085 if (!t || t->type != tATOM)
1090 if (!t->flags.lowered)
1092 lowercase_string (t->data);
1093 t->flags.lowered = 1;
1106 /* Internal debug function to print the structure of the message. */
1108 dump_structure (rfc822parse_t msg, part_t part, int indent)
1112 printf ("*** Structure of this message:\n");
1116 for (; part; part = part->right)
1118 rfc822parse_field_t ctx;
1119 part_t save_part; /* ugly hack - we should have a function to
1120 get part information. */
1123 save_part = msg->current_part;
1124 msg->current_part = part;
1125 ctx = rfc822parse_parse_field (msg, "Content-Type", -1);
1126 msg->current_part = save_part;
1129 const char *s1, *s2;
1130 s1 = rfc822parse_query_media_type (ctx, &s2);
1132 printf ("*** %*s %s/%s", indent*2, "", s1, s2);
1134 printf ("*** %*s [not found]", indent*2, "");
1136 s = rfc822parse_query_parameter (ctx, "boundary", 0);
1138 printf (" (boundary=\"%s\")", s);
1139 rfc822parse_release_field (ctx);
1142 printf ("*** %*s text/plain [assumed]", indent*2, "");
1146 dump_structure (msg, part->down, indent + 1);
1154 show_param (rfc822parse_field_t ctx, const char *name)
1160 s = rfc822parse_query_parameter (ctx, name, 0);
1162 printf ("*** %s: '%s'\n", name, s);
1168 show_event (rfc822parse_event_t event)
1174 case RFC822PARSE_OPEN: s= "Open"; break;
1175 case RFC822PARSE_CLOSE: s= "Close"; break;
1176 case RFC822PARSE_CANCEL: s= "Cancel"; break;
1177 case RFC822PARSE_T2BODY: s= "T2Body"; break;
1178 case RFC822PARSE_FINISH: s= "Finish"; break;
1179 case RFC822PARSE_RCVD_SEEN: s= "Rcvd_Seen"; break;
1180 case RFC822PARSE_LEVEL_DOWN: s= "Level_Down"; break;
1181 case RFC822PARSE_LEVEL_UP: s= "Level_Up"; break;
1182 case RFC822PARSE_BOUNDARY: s= "Boundary"; break;
1183 case RFC822PARSE_LAST_BOUNDARY: s= "Last_Boundary"; break;
1184 case RFC822PARSE_BEGIN_HEADER: s= "Begin_Header"; break;
1185 case RFC822PARSE_PREAMBLE: s= "Preamble"; break;
1186 case RFC822PARSE_EPILOGUE: s= "Epilogue"; break;
1187 default: s= "***invalid event***"; break;
1189 printf ("*** got RFC822 event %s\n", s);
1193 msg_cb (void *dummy_arg, rfc822parse_event_t event, rfc822parse_t msg)
1196 if (event == RFC822PARSE_T2BODY)
1198 rfc822parse_field_t ctx;
1202 for (ectx=NULL; (line = rfc822parse_enum_header_lines (msg, &ectx)); )
1204 printf ("*** HDR: %s\n", line);
1206 rfc822parse_enum_header_lines (NULL, &ectx); /* Close enumerator. */
1208 ctx = rfc822parse_parse_field (msg, "Content-Type", -1);
1211 const char *s1, *s2;
1212 s1 = rfc822parse_query_media_type (ctx, &s2);
1214 printf ("*** media: '%s/%s'\n", s1, s2);
1216 printf ("*** media: [not found]\n");
1217 show_param (ctx, "boundary");
1218 show_param (ctx, "protocol");
1219 rfc822parse_release_field (ctx);
1222 printf ("*** media: text/plain [assumed]\n");
1233 main (int argc, char **argv)
1239 msg = rfc822parse_open (msg_cb, NULL);
1243 while (fgets (line, sizeof (line), stdin))
1245 length = strlen (line);
1246 if (length && line[length - 1] == '\n')
1248 if (length && line[length - 1] == '\r')
1250 if (rfc822parse_insert (msg, line, length))
1254 dump_structure (msg, NULL, 0);
1256 rfc822parse_close (msg);
1263 compile-command: "gcc -Wall -Wno-pointer-sign -g -DTESTING -o rfc822parse rfc822parse.c"