[PATCH v3 5/6] lib: replace the header parser with gmime
Austin Clements
amdragon at MIT.EDU
Mon Feb 3 13:31:13 PST 2014
Quoth Jani Nikula on Feb 03 at 9:51 pm:
> The notmuch library includes a full blown message header parser. Yet
> the same message headers are parsed by gmime during indexing. Switch
> to gmime parsing completely.
>
> These are the main differences between the parsers:
>
> * Gmime stops header parsing at the first invalid header, and presumes
> the message body starts from there. The current parser is quite
> liberal in accepting broken headers. The change means we will be
> much pickier about accepting invalid messages. (Note that the
> headers after the invalid header would not have been indexed before
> this change anyway, as we use gmime for that.)
>
> * The current parser converts tabs used in header folding to
> spaces. Gmime preserve the tabs. Due to a broken python library used
> in mailman, there are plenty of mailing lists that produce headers
> with tabs in header folding, and we'll see plenty of tabs. (This
> change has been mitigated in preparatory patches.)
>
> * For pure header parsing, the current parser is likely faster than
> gmime, which parses the whole message rather than just the
> headers. Since we use gmime for indexing anyway, we can drop an
> extra header parsing round (this is done in a follow-up patch).
>
> At this step, we only switch the header parsing to gmime.
> ---
> lib/database.cc | 4 +
> lib/index.cc | 11 --
> lib/message-file.c | 349 ++++++++++++++++----------------------------------
> lib/message.cc | 6 +
> lib/notmuch-private.h | 4 +
> 5 files changed, 125 insertions(+), 249 deletions(-)
>
> diff --git a/lib/database.cc b/lib/database.cc
> index f395061..d1bea88 100644
> --- a/lib/database.cc
> +++ b/lib/database.cc
> @@ -1940,6 +1940,10 @@ notmuch_database_add_message (notmuch_database_t *notmuch,
> "to",
> (char *) NULL);
>
> + ret = notmuch_message_file_parse (message_file);
> + if (ret)
> + goto DONE;
> +
> try {
> /* Before we do any real work, (especially before doing a
> * potential SHA-1 computation on the entire file's contents),
> diff --git a/lib/index.cc b/lib/index.cc
> index 78c18cf..976e49f 100644
> --- a/lib/index.cc
> +++ b/lib/index.cc
> @@ -437,7 +437,6 @@ _notmuch_message_index_file (notmuch_message_t *message,
> static int initialized = 0;
> char from_buf[5];
> bool is_mbox = false;
> - static bool mbox_warning = false;
>
> if (! initialized) {
> g_mime_init (GMIME_ENABLE_RFC2047_WORKAROUNDS);
> @@ -471,16 +470,6 @@ _notmuch_message_index_file (notmuch_message_t *message,
> ret = NOTMUCH_STATUS_FILE_NOT_EMAIL;
> goto DONE;
> }
> - /* For historical reasons, we support single-message mboxes,
> - * but this behavior is likely to change in the future, so
> - * warn. */
> - if (!mbox_warning) {
> - mbox_warning = true;
> - fprintf (stderr, "\
> -Warning: %s is an mbox containing a single message,\n\
> -likely caused by misconfigured mail delivery. Support for single-message\n\
> -mboxes is deprecated and may be removed in the future.\n", filename);
> - }
> }
>
> from = g_mime_message_get_sender (mime_message);
> diff --git a/lib/message-file.c b/lib/message-file.c
> index a2850c2..33f6468 100644
> --- a/lib/message-file.c
> +++ b/lib/message-file.c
> @@ -26,30 +26,19 @@
>
> #include <glib.h> /* GHashTable */
>
> -typedef struct {
> - char *str;
> - size_t size;
> - size_t len;
> -} header_value_closure_t;
> -
> struct _notmuch_message_file {
> /* File object */
> FILE *file;
> + char *filename;
>
> /* Header storage */
> int restrict_headers;
> GHashTable *headers;
> - int broken_headers;
> - int good_headers;
> - size_t header_size; /* Length of full message header in bytes. */
> -
> - /* Parsing state */
> - char *line;
> - size_t line_size;
> - header_value_closure_t value;
>
> - int parsing_started;
> - int parsing_finished;
> + GMimeStream *stream;
> + GMimeParser *parser;
> + GMimeMessage *message;
> + notmuch_bool_t parsed;
> };
>
> static int
> @@ -76,16 +65,18 @@ strcase_hash (const void *ptr)
> static int
> _notmuch_message_file_destructor (notmuch_message_file_t *message)
> {
> - if (message->line)
> - free (message->line);
> -
> - if (message->value.size)
> - free (message->value.str);
> -
> if (message->headers)
> g_hash_table_destroy (message->headers);
>
> - if (message->file)
> + if (message->message)
> + g_object_unref (message->message);
> +
> + if (message->parser)
> + g_object_unref (message->parser);
> +
> + if (message->stream)
> + g_object_unref (message->stream);
> + else if (message->file) /* GMimeStream takes over the FILE* */
> fclose (message->file);
>
> return 0;
> @@ -102,19 +93,21 @@ _notmuch_message_file_open_ctx (void *ctx, const char *filename)
> if (unlikely (message == NULL))
> return NULL;
>
> + /* only needed for error messages during parsing */
> + message->filename = talloc_strdup (message, filename);
> + if (message->filename == NULL)
> + goto FAIL;
> +
> talloc_set_destructor (message, _notmuch_message_file_destructor);
>
> message->file = fopen (filename, "r");
> if (message->file == NULL)
> goto FAIL;
>
> - message->headers = g_hash_table_new_full (strcase_hash,
> - strcase_equal,
> - free,
> - g_free);
> -
> - message->parsing_started = 0;
> - message->parsing_finished = 0;
> + message->headers = g_hash_table_new_full (strcase_hash, strcase_equal,
> + free, g_free);
> + if (message->headers == NULL)
> + goto FAIL;
>
> return message;
>
> @@ -143,7 +136,7 @@ notmuch_message_file_restrict_headersv (notmuch_message_file_t *message,
> {
> char *header;
>
> - if (message->parsing_started)
> + if (message->parsed)
> INTERNAL_ERROR ("notmuch_message_file_restrict_headers called after parsing has started");
>
> while (1) {
> @@ -167,234 +160,114 @@ notmuch_message_file_restrict_headers (notmuch_message_file_t *message, ...)
> notmuch_message_file_restrict_headersv (message, va_headers);
> }
>
> -static void
> -copy_header_unfolding (header_value_closure_t *value,
> - const char *chunk)
> +/*
> + * gmime does not provide access to all Received: headers the way we
> + * want, so we'll to use the parser header callback to gather them
> + * into a hash table.
> + */
> +static void header_cb (unused(GMimeParser *parser), const char *header,
> + const char *value, unused(gint64 offset),
> + gpointer user_data)
> {
> - char *last;
> + notmuch_message_file_t *message = (notmuch_message_file_t *) user_data;
> + char *existing = NULL;
> + notmuch_bool_t found;
>
> - if (chunk == NULL)
> + found = g_hash_table_lookup_extended (message->headers, header,
> + NULL, (gpointer *) &existing);
> + if (! found && message->restrict_headers)
> return;
>
> - while (*chunk == ' ' || *chunk == '\t')
> - chunk++;
> -
> - if (value->len + 1 + strlen (chunk) + 1 > value->size) {
> - unsigned int new_size = value->size;
> - if (value->size == 0)
> - new_size = strlen (chunk) + 1;
> - else
> - while (value->len + 1 + strlen (chunk) + 1 > new_size)
> - new_size *= 2;
> - value->str = xrealloc (value->str, new_size);
> - value->size = new_size;
> - }
> -
> - last = value->str + value->len;
> - if (value->len) {
> - *last = ' ';
> - last++;
> - value->len++;
> - }
> -
> - strcpy (last, chunk);
> - value->len += strlen (chunk);
> -
> - last = value->str + value->len - 1;
> - if (*last == '\n') {
> - *last = '\0';
> - value->len--;
> + if (existing == NULL) {
> + /* No value, add one */
> + char *decoded = g_mime_utils_header_decode_text (value);
> + g_hash_table_insert (message->headers, xstrdup (header), decoded);
> + } else if (strcasecmp (header, "received") == 0) {
> + /* Concat all of the Received: headers we encounter. */
> + char *combined, *decoded;
> + size_t combined_size;
> +
> + decoded = g_mime_utils_header_decode_text (value);
> +
> + combined_size = strlen(existing) + strlen(decoded) + 2;
> + combined = g_malloc (combined_size);
> + snprintf (combined, combined_size, "%s %s", existing, decoded);
> + g_free (decoded);
> + g_hash_table_insert (message->headers, xstrdup (header), combined);
I spent a while staring at this line trying to figure out whether
header or its duplicate was leaked. It's clear enough that this line
is correct from the documentation of g_hash_table_insert, but it might
be worth adding a comment here mentioning that g_hash_table_insert
will free this temporary copy of the key, to save future readers the
eye strain.
> }
> }
>
> -/* As a special-case, a value of NULL for header_desired will force
> - * the entire header to be parsed if it is not parsed already. This is
> - * used by the _notmuch_message_file_get_headers_end function.
> - * Another special case is the Received: header. For this header we
> - * want to concatenate all instances of the header instead of just
> - * hashing the first instance as we use this when analyzing the path
> - * the mail has taken from sender to recipient.
> - */
> -const char *
> -notmuch_message_file_get_header (notmuch_message_file_t *message,
> - const char *header_desired)
> +notmuch_status_t
> +notmuch_message_file_parse (notmuch_message_file_t *message)
> {
> - int contains;
> - char *header, *decoded_value, *header_sofar, *combined_header;
> - const char *s, *colon;
> - int match, newhdr, hdrsofar, is_received;
> static int initialized = 0;
> -
> - is_received = (strcmp(header_desired,"received") == 0);
> + char from_buf[5];
> + notmuch_bool_t is_mbox = FALSE;
> + static notmuch_bool_t mbox_warning = FALSE;
>
> if (! initialized) {
> g_mime_init (GMIME_ENABLE_RFC2047_WORKAROUNDS);
> initialized = 1;
> }
>
> - message->parsing_started = 1;
> -
> - if (header_desired == NULL)
> - contains = 0;
> - else
> - contains = g_hash_table_lookup_extended (message->headers,
> - header_desired, NULL,
> - (gpointer *) &decoded_value);
> -
> - if (contains && decoded_value)
> - return decoded_value;
> -
> - if (message->parsing_finished)
> - return "";
> -
> -#define NEXT_HEADER_LINE(closure) \
> - while (1) { \
> - ssize_t bytes_read = getline (&message->line, \
> - &message->line_size, \
> - message->file); \
> - if (bytes_read == -1) { \
> - message->parsing_finished = 1; \
> - break; \
> - } \
> - if (*message->line == '\n') { \
> - message->parsing_finished = 1; \
> - break; \
> - } \
> - if (closure && \
> - (*message->line == ' ' || *message->line == '\t')) \
> - { \
> - copy_header_unfolding ((closure), message->line); \
> - } \
> - if (*message->line == ' ' || *message->line == '\t') \
> - message->header_size += strlen (message->line); \
> - else \
> - break; \
> - }
> -
> - if (message->line == NULL)
> - NEXT_HEADER_LINE (NULL);
> -
> - while (1) {
> -
> - if (message->parsing_finished)
> - break;
> -
> - colon = strchr (message->line, ':');
> -
> - if (colon == NULL) {
> - message->broken_headers++;
> - /* A simple heuristic for giving up on things that just
> - * don't look like mail messages. */
> - if (message->broken_headers >= 10 &&
> - message->good_headers < 5)
> - {
> - message->parsing_finished = 1;
> - break;
> - }
> - NEXT_HEADER_LINE (NULL);
> - continue;
> + /* Is this mbox? */
> + if (fread (from_buf, sizeof (from_buf), 1, message->file) == 1 &&
> + strncmp (from_buf, "From ", 5) == 0)
> + is_mbox = TRUE;
> + rewind (message->file);
> +
> + message->stream = g_mime_stream_file_new (message->file);
Given that GMimeStream steals message->file, would it make sense to
set it to NULL here? That would also simplify
_notmuch_message_file_destructor a bit and move the knowledge of this
stealing behavior to where it happens.
> + message->parser = g_mime_parser_new_with_stream (message->stream);
> + g_mime_parser_set_scan_from (message->parser, is_mbox);
> + g_mime_parser_set_header_regex (message->parser, ".*", header_cb,
> + (gpointer) message);
Interesting.
> +
> + message->message = g_mime_parser_construct_message (message->parser);
> + if (! message->message)
> + return NOTMUCH_STATUS_FILE_NOT_EMAIL;
> +
> + if (is_mbox) {
> + if (! g_mime_parser_eos (message->parser)) {
> + /* This is a multi-message mbox. */
> + return NOTMUCH_STATUS_FILE_NOT_EMAIL;
> }
> -
> - message->header_size += strlen (message->line);
> -
> - message->good_headers++;
> -
> - header = xstrndup (message->line, colon - message->line);
> -
> - if (message->restrict_headers &&
> - ! g_hash_table_lookup_extended (message->headers,
> - header, NULL, NULL))
> - {
> - free (header);
> - NEXT_HEADER_LINE (NULL);
> - continue;
> - }
> -
> - s = colon + 1;
> - while (*s == ' ' || *s == '\t')
> - s++;
> -
> - message->value.len = 0;
> - copy_header_unfolding (&message->value, s);
> -
> - NEXT_HEADER_LINE (&message->value);
> -
> - if (header_desired == NULL)
> - match = 0;
> - else
> - match = (strcasecmp (header, header_desired) == 0);
> -
> - decoded_value = g_mime_utils_header_decode_text (message->value.str);
> - header_sofar = (char *)g_hash_table_lookup (message->headers, header);
> - /* we treat the Received: header special - we want to concat ALL of
> - * the Received: headers we encounter.
> - * for everything else we return the first instance of a header */
> - if (strcasecmp(header, "received") == 0) {
> - if (header_sofar == NULL) {
> - /* first Received: header we encountered; just add it */
> - g_hash_table_insert (message->headers, header, decoded_value);
> - } else {
> - /* we need to add the header to those we already collected */
> - newhdr = strlen(decoded_value);
> - hdrsofar = strlen(header_sofar);
> - combined_header = g_malloc(hdrsofar + newhdr + 2);
> - strncpy(combined_header,header_sofar,hdrsofar);
> - *(combined_header+hdrsofar) = ' ';
> - strncpy(combined_header+hdrsofar+1,decoded_value,newhdr+1);
> - g_free (decoded_value);
> - g_hash_table_insert (message->headers, header, combined_header);
> - }
> - } else {
> - if (header_sofar == NULL) {
> - /* Only insert if we don't have a value for this header, yet. */
> - g_hash_table_insert (message->headers, header, decoded_value);
> - } else {
> - free (header);
> - g_free (decoded_value);
> - decoded_value = header_sofar;
> - }
> + /*
> + * For historical reasons, we support single-message mboxes,
> + * but this behavior is likely to change in the future, so
> + * warn.
> + */
> + if (! mbox_warning) {
> + mbox_warning = TRUE;
> + fprintf (stderr, "\
> +Warning: %s is an mbox containing a single message,\n\
> +likely caused by misconfigured mail delivery. Support for single-message\n\
> +mboxes is deprecated and may be removed in the future.\n", message->filename);
> }
Why move this warning here? Currently we obviously only print this
once, at index time. Will this cause us to print the warning in more
situations?
> - /* if we found a match we can bail - unless of course we are
> - * collecting all the Received: headers */
> - if (match && !is_received)
> - return decoded_value;
> }
>
> - if (message->parsing_finished) {
> - fclose (message->file);
> - message->file = NULL;
> - }
> + message->parsed = TRUE;
>
> - if (message->line)
> - free (message->line);
> - message->line = NULL;
> + return NOTMUCH_STATUS_SUCCESS;
> +}
>
> - if (message->value.size) {
> - free (message->value.str);
> - message->value.str = NULL;
> - message->value.size = 0;
> - message->value.len = 0;
> - }
> +/* return NULL on errors, empty string for non-existing headers */
For consistency, the comment should be capitalized.
Also, this comment used to explain how Received headers were handled.
It seems like we lost that.
> +const char *
> +notmuch_message_file_get_header (notmuch_message_file_t *message,
> + const char *header)
> +{
> + const char *value = NULL;
> + notmuch_bool_t found;
>
> - /* For the Received: header we actually might end up here even
> - * though we found the header (as we force continued parsing
> - * in that case). So let's check if that's the header we were
> - * looking for and return the value that we found (if any)
> - */
> - if (is_received)
> - return (char *)g_hash_table_lookup (message->headers, "received");
> -
> - /* We've parsed all headers and never found the one we're looking
> - * for. It's probably just not there, but let's check that we
> - * didn't make a mistake preventing us from seeing it. */
> - if (message->restrict_headers && header_desired &&
> - ! g_hash_table_lookup_extended (message->headers,
> - header_desired, NULL, NULL))
> - {
> - INTERNAL_ERROR ("Attempt to get header \"%s\" which was not\n"
> - "included in call to notmuch_message_file_restrict_headers\n",
> - header_desired);
> - }
> + /* the caller shouldn't ask for headers before parsing */
> + if (! message->parsed)
> + return NULL;
Previously, calling notmuch_message_file_get_header triggered parsing
of the message, which ensured it didn't happen unless necessary. Is
there a reason to change that behavior?
If it were still done lazily, you wouldn't need the addition below to
_notmuch_message_ensure_message_file, or the new prototype in
notmuch-private.h (and notmuch_message_file_parse could be static).
(Ignore this comment if it's justified by the next patch; I haven't
gotten there yet.)
> +
> + found = g_hash_table_lookup_extended (message->headers, header,
> + NULL, (gpointer *) &value);
> +
> + /* the caller shouldn't ask for non-restricted headers */
> + if (! found && message->restrict_headers)
> + return NULL;
>
> - return "";
> + return value ? value : "";
> }
> diff --git a/lib/message.cc b/lib/message.cc
> index c91f3a5..9a22d36 100644
> --- a/lib/message.cc
> +++ b/lib/message.cc
> @@ -407,6 +407,12 @@ _notmuch_message_ensure_message_file (notmuch_message_t *message)
> return;
>
> message->message_file = _notmuch_message_file_open_ctx (message, filename);
> +
> + /* XXX: better return value handling */
> + if (notmuch_message_file_parse (message->message_file)) {
> + notmuch_message_file_close (message->message_file);
> + message->message_file = NULL;
> + }
> }
>
> const char *
> diff --git a/lib/notmuch-private.h b/lib/notmuch-private.h
> index af185c7..7277df1 100644
> --- a/lib/notmuch-private.h
> +++ b/lib/notmuch-private.h
> @@ -345,6 +345,10 @@ notmuch_message_file_open (const char *filename);
> notmuch_message_file_t *
> _notmuch_message_file_open_ctx (void *ctx, const char *filename);
>
> +/* Parse the message */
> +notmuch_status_t
> +notmuch_message_file_parse (notmuch_message_file_t *message);
> +
> /* Close a notmuch message previously opened with notmuch_message_open. */
> void
> notmuch_message_file_close (notmuch_message_file_t *message);
More information about the notmuch
mailing list