[PATCH v3 5/6] lib: replace the header parser with gmime

Austin Clements amdragon at MIT.EDU
Mon Feb 3 13:31:13 PST 2014


Quoth Jani Nikula on Feb 03 at  9:51 pm:
> The notmuch library includes a full blown message header parser. Yet
> the same message headers are parsed by gmime during indexing. Switch
> to gmime parsing completely.
> 
> These are the main differences between the parsers:
> 
> * Gmime stops header parsing at the first invalid header, and presumes
>   the message body starts from there. The current parser is quite
>   liberal in accepting broken headers. The change means we will be
>   much pickier about accepting invalid messages. (Note that the
>   headers after the invalid header would not have been indexed before
>   this change anyway, as we use gmime for that.)
> 
> * The current parser converts tabs used in header folding to
>   spaces. Gmime preserve the tabs. Due to a broken python library used
>   in mailman, there are plenty of mailing lists that produce headers
>   with tabs in header folding, and we'll see plenty of tabs. (This
>   change has been mitigated in preparatory patches.)
> 
> * For pure header parsing, the current parser is likely faster than
>   gmime, which parses the whole message rather than just the
>   headers. Since we use gmime for indexing anyway, we can drop an
>   extra header parsing round (this is done in a follow-up patch).
> 
> At this step, we only switch the header parsing to gmime.
> ---
>  lib/database.cc       |   4 +
>  lib/index.cc          |  11 --
>  lib/message-file.c    | 349 ++++++++++++++++----------------------------------
>  lib/message.cc        |   6 +
>  lib/notmuch-private.h |   4 +
>  5 files changed, 125 insertions(+), 249 deletions(-)
> 
> diff --git a/lib/database.cc b/lib/database.cc
> index f395061..d1bea88 100644
> --- a/lib/database.cc
> +++ b/lib/database.cc
> @@ -1940,6 +1940,10 @@ notmuch_database_add_message (notmuch_database_t *notmuch,
>  					   "to",
>  					   (char *) NULL);
>  
> +    ret = notmuch_message_file_parse (message_file);
> +    if (ret)
> +	goto DONE;
> +
>      try {
>  	/* Before we do any real work, (especially before doing a
>  	 * potential SHA-1 computation on the entire file's contents),
> diff --git a/lib/index.cc b/lib/index.cc
> index 78c18cf..976e49f 100644
> --- a/lib/index.cc
> +++ b/lib/index.cc
> @@ -437,7 +437,6 @@ _notmuch_message_index_file (notmuch_message_t *message,
>      static int initialized = 0;
>      char from_buf[5];
>      bool is_mbox = false;
> -    static bool mbox_warning = false;
>  
>      if (! initialized) {
>  	g_mime_init (GMIME_ENABLE_RFC2047_WORKAROUNDS);
> @@ -471,16 +470,6 @@ _notmuch_message_index_file (notmuch_message_t *message,
>  	    ret = NOTMUCH_STATUS_FILE_NOT_EMAIL;
>  	    goto DONE;
>  	}
> -	/* For historical reasons, we support single-message mboxes,
> -	 * but this behavior is likely to change in the future, so
> -	 * warn. */
> -	if (!mbox_warning) {
> -	    mbox_warning = true;
> -	    fprintf (stderr, "\
> -Warning: %s is an mbox containing a single message,\n\
> -likely caused by misconfigured mail delivery.  Support for single-message\n\
> -mboxes is deprecated and may be removed in the future.\n", filename);
> -	}
>      }
>  
>      from = g_mime_message_get_sender (mime_message);
> diff --git a/lib/message-file.c b/lib/message-file.c
> index a2850c2..33f6468 100644
> --- a/lib/message-file.c
> +++ b/lib/message-file.c
> @@ -26,30 +26,19 @@
>  
>  #include <glib.h> /* GHashTable */
>  
> -typedef struct {
> -    char *str;
> -    size_t size;
> -    size_t len;
> -} header_value_closure_t;
> -
>  struct _notmuch_message_file {
>      /* File object */
>      FILE *file;
> +    char *filename;
>  
>      /* Header storage */
>      int restrict_headers;
>      GHashTable *headers;
> -    int broken_headers;
> -    int good_headers;
> -    size_t header_size; /* Length of full message header in bytes. */
> -
> -    /* Parsing state */
> -    char *line;
> -    size_t line_size;
> -    header_value_closure_t value;
>  
> -    int parsing_started;
> -    int parsing_finished;
> +    GMimeStream *stream;
> +    GMimeParser *parser;
> +    GMimeMessage *message;
> +    notmuch_bool_t parsed;
>  };
>  
>  static int
> @@ -76,16 +65,18 @@ strcase_hash (const void *ptr)
>  static int
>  _notmuch_message_file_destructor (notmuch_message_file_t *message)
>  {
> -    if (message->line)
> -	free (message->line);
> -
> -    if (message->value.size)
> -	free (message->value.str);
> -
>      if (message->headers)
>  	g_hash_table_destroy (message->headers);
>  
> -    if (message->file)
> +    if (message->message)
> +	g_object_unref (message->message);
> +
> +    if (message->parser)
> +	g_object_unref (message->parser);
> +
> +    if (message->stream)
> +	g_object_unref (message->stream);
> +    else if (message->file) /* GMimeStream takes over the FILE* */
>  	fclose (message->file);
>  
>      return 0;
> @@ -102,19 +93,21 @@ _notmuch_message_file_open_ctx (void *ctx, const char *filename)
>      if (unlikely (message == NULL))
>  	return NULL;
>  
> +    /* only needed for error messages during parsing */
> +    message->filename = talloc_strdup (message, filename);
> +    if (message->filename == NULL)
> +	goto FAIL;
> +
>      talloc_set_destructor (message, _notmuch_message_file_destructor);
>  
>      message->file = fopen (filename, "r");
>      if (message->file == NULL)
>  	goto FAIL;
>  
> -    message->headers = g_hash_table_new_full (strcase_hash,
> -					      strcase_equal,
> -					      free,
> -					      g_free);
> -
> -    message->parsing_started = 0;
> -    message->parsing_finished = 0;
> +    message->headers = g_hash_table_new_full (strcase_hash, strcase_equal,
> +					      free, g_free);
> +    if (message->headers == NULL)
> +	goto FAIL;
>  
>      return message;
>  
> @@ -143,7 +136,7 @@ notmuch_message_file_restrict_headersv (notmuch_message_file_t *message,
>  {
>      char *header;
>  
> -    if (message->parsing_started)
> +    if (message->parsed)
>  	INTERNAL_ERROR ("notmuch_message_file_restrict_headers called after parsing has started");
>  
>      while (1) {
> @@ -167,234 +160,114 @@ notmuch_message_file_restrict_headers (notmuch_message_file_t *message, ...)
>      notmuch_message_file_restrict_headersv (message, va_headers);
>  }
>  
> -static void
> -copy_header_unfolding (header_value_closure_t *value,
> -		       const char *chunk)
> +/*
> + * gmime does not provide access to all Received: headers the way we
> + * want, so we'll to use the parser header callback to gather them
> + * into a hash table.
> + */
> +static void header_cb (unused(GMimeParser *parser), const char *header,
> +		       const char *value, unused(gint64 offset),
> +		       gpointer user_data)
>  {
> -    char *last;
> +    notmuch_message_file_t *message = (notmuch_message_file_t *) user_data;
> +    char *existing = NULL;
> +    notmuch_bool_t found;
>  
> -    if (chunk == NULL)
> +    found = g_hash_table_lookup_extended (message->headers, header,
> +					  NULL, (gpointer *) &existing);
> +    if (! found && message->restrict_headers)
>  	return;
>  
> -    while (*chunk == ' ' || *chunk == '\t')
> -	chunk++;
> -
> -    if (value->len + 1 + strlen (chunk) + 1 > value->size) {
> -	unsigned int new_size = value->size;
> -	if (value->size == 0)
> -	    new_size = strlen (chunk) + 1;
> -	else
> -	    while (value->len + 1 + strlen (chunk) + 1 > new_size)
> -		new_size *= 2;
> -	value->str = xrealloc (value->str, new_size);
> -	value->size = new_size;
> -    }
> -
> -    last = value->str + value->len;
> -    if (value->len) {
> -	*last = ' ';
> -	last++;
> -	value->len++;
> -    }
> -
> -    strcpy (last, chunk);
> -    value->len += strlen (chunk);
> -
> -    last = value->str + value->len - 1;
> -    if (*last == '\n') {
> -	*last = '\0';
> -	value->len--;
> +    if (existing == NULL) {
> +	/* No value, add one */
> +	char *decoded = g_mime_utils_header_decode_text (value);
> +	g_hash_table_insert (message->headers, xstrdup (header), decoded);
> +    } else if (strcasecmp (header, "received") == 0) {
> +	/* Concat all of the Received: headers we encounter. */
> +	char *combined, *decoded;
> +	size_t combined_size;
> +
> +	decoded = g_mime_utils_header_decode_text (value);
> +
> +	combined_size = strlen(existing) + strlen(decoded) + 2;
> +	combined = g_malloc (combined_size);
> +	snprintf (combined, combined_size, "%s %s", existing, decoded);
> +	g_free (decoded);
> +	g_hash_table_insert (message->headers, xstrdup (header), combined);

I spent a while staring at this line trying to figure out whether
header or its duplicate was leaked.  It's clear enough that this line
is correct from the documentation of g_hash_table_insert, but it might
be worth adding a comment here mentioning that g_hash_table_insert
will free this temporary copy of the key, to save future readers the
eye strain.

>      }
>  }
>  
> -/* As a special-case, a value of NULL for header_desired will force
> - * the entire header to be parsed if it is not parsed already. This is
> - * used by the _notmuch_message_file_get_headers_end function.
> - * Another special case is the Received: header. For this header we
> - * want to concatenate all instances of the header instead of just
> - * hashing the first instance as we use this when analyzing the path
> - * the mail has taken from sender to recipient.
> - */
> -const char *
> -notmuch_message_file_get_header (notmuch_message_file_t *message,
> -				 const char *header_desired)
> +notmuch_status_t
> +notmuch_message_file_parse (notmuch_message_file_t *message)
>  {
> -    int contains;
> -    char *header, *decoded_value, *header_sofar, *combined_header;
> -    const char *s, *colon;
> -    int match, newhdr, hdrsofar, is_received;
>      static int initialized = 0;
> -
> -    is_received = (strcmp(header_desired,"received") == 0);
> +    char from_buf[5];
> +    notmuch_bool_t is_mbox = FALSE;
> +    static notmuch_bool_t mbox_warning = FALSE;
>  
>      if (! initialized) {
>  	g_mime_init (GMIME_ENABLE_RFC2047_WORKAROUNDS);
>  	initialized = 1;
>      }
>  
> -    message->parsing_started = 1;
> -
> -    if (header_desired == NULL)
> -	contains = 0;
> -    else
> -	contains = g_hash_table_lookup_extended (message->headers,
> -						 header_desired, NULL,
> -						 (gpointer *) &decoded_value);
> -
> -    if (contains && decoded_value)
> -	return decoded_value;
> -
> -    if (message->parsing_finished)
> -	return "";
> -
> -#define NEXT_HEADER_LINE(closure)				\
> -    while (1) {							\
> -	ssize_t bytes_read = getline (&message->line,		\
> -				      &message->line_size,	\
> -				      message->file);		\
> -	if (bytes_read == -1) {					\
> -	    message->parsing_finished = 1;			\
> -	    break;						\
> -	}							\
> -	if (*message->line == '\n') {				\
> -	    message->parsing_finished = 1;			\
> -	    break;						\
> -	}							\
> -	if (closure &&						\
> -	    (*message->line == ' ' || *message->line == '\t'))	\
> -	{							\
> -	    copy_header_unfolding ((closure), message->line);	\
> -	}							\
> -	if (*message->line == ' ' || *message->line == '\t')	\
> -	    message->header_size += strlen (message->line);	\
> -	else							\
> -	    break;						\
> -    }
> -
> -    if (message->line == NULL)
> -	NEXT_HEADER_LINE (NULL);
> -
> -    while (1) {
> -
> -	if (message->parsing_finished)
> -	    break;
> -
> -	colon = strchr (message->line, ':');
> -
> -	if (colon == NULL) {
> -	    message->broken_headers++;
> -	    /* A simple heuristic for giving up on things that just
> -	     * don't look like mail messages. */
> -	    if (message->broken_headers >= 10 &&
> -		message->good_headers < 5)
> -	    {
> -		message->parsing_finished = 1;
> -		break;
> -	    }
> -	    NEXT_HEADER_LINE (NULL);
> -	    continue;
> +    /* Is this mbox? */
> +    if (fread (from_buf, sizeof (from_buf), 1, message->file) == 1 &&
> +	strncmp (from_buf, "From ", 5) == 0)
> +	is_mbox = TRUE;
> +    rewind (message->file);
> +
> +    message->stream = g_mime_stream_file_new (message->file);

Given that GMimeStream steals message->file, would it make sense to
set it to NULL here?  That would also simplify
_notmuch_message_file_destructor a bit and move the knowledge of this
stealing behavior to where it happens.

> +    message->parser = g_mime_parser_new_with_stream (message->stream);
> +    g_mime_parser_set_scan_from (message->parser, is_mbox);
> +    g_mime_parser_set_header_regex (message->parser, ".*", header_cb,
> +				    (gpointer) message);

Interesting.

> +
> +    message->message = g_mime_parser_construct_message (message->parser);
> +    if (! message->message)
> +	return NOTMUCH_STATUS_FILE_NOT_EMAIL;
> +
> +    if (is_mbox) {
> +	if (! g_mime_parser_eos (message->parser)) {
> +	    /* This is a multi-message mbox. */
> +	    return NOTMUCH_STATUS_FILE_NOT_EMAIL;
>  	}
> -
> -	message->header_size += strlen (message->line);
> -
> -	message->good_headers++;
> -
> -	header = xstrndup (message->line, colon - message->line);
> -
> -	if (message->restrict_headers &&
> -	    ! g_hash_table_lookup_extended (message->headers,
> -					    header, NULL, NULL))
> -	{
> -	    free (header);
> -	    NEXT_HEADER_LINE (NULL);
> -	    continue;
> -	}
> -
> -	s = colon + 1;
> -	while (*s == ' ' || *s == '\t')
> -	    s++;
> -
> -	message->value.len = 0;
> -	copy_header_unfolding (&message->value, s);
> -
> -	NEXT_HEADER_LINE (&message->value);
> -
> -	if (header_desired == NULL)
> -	    match = 0;
> -	else
> -	    match = (strcasecmp (header, header_desired) == 0);
> -
> -	decoded_value = g_mime_utils_header_decode_text (message->value.str);
> -	header_sofar = (char *)g_hash_table_lookup (message->headers, header);
> -	/* we treat the Received: header special - we want to concat ALL of 
> -	 * the Received: headers we encounter.
> -	 * for everything else we return the first instance of a header */
> -	if (strcasecmp(header, "received") == 0) {
> -	    if (header_sofar == NULL) {
> -		/* first Received: header we encountered; just add it */
> -		g_hash_table_insert (message->headers, header, decoded_value);
> -	    } else {
> -		/* we need to add the header to those we already collected */
> -		newhdr = strlen(decoded_value);
> -		hdrsofar = strlen(header_sofar);
> -		combined_header = g_malloc(hdrsofar + newhdr + 2);
> -		strncpy(combined_header,header_sofar,hdrsofar);
> -		*(combined_header+hdrsofar) = ' ';
> -		strncpy(combined_header+hdrsofar+1,decoded_value,newhdr+1);
> -		g_free (decoded_value);
> -		g_hash_table_insert (message->headers, header, combined_header);
> -	    }
> -	} else {
> -	    if (header_sofar == NULL) {
> -		/* Only insert if we don't have a value for this header, yet. */
> -		g_hash_table_insert (message->headers, header, decoded_value);
> -	    } else {
> -		free (header);
> -		g_free (decoded_value);
> -		decoded_value = header_sofar;
> -	    }
> +	/*
> +	 * For historical reasons, we support single-message mboxes,
> +	 * but this behavior is likely to change in the future, so
> +	 * warn.
> +	 */
> +	if (! mbox_warning) {
> +	    mbox_warning = TRUE;
> +	    fprintf (stderr, "\
> +Warning: %s is an mbox containing a single message,\n\
> +likely caused by misconfigured mail delivery.  Support for single-message\n\
> +mboxes is deprecated and may be removed in the future.\n", message->filename);
>  	}

Why move this warning here?  Currently we obviously only print this
once, at index time.  Will this cause us to print the warning in more
situations?

> -	/* if we found a match we can bail - unless of course we are
> -	 * collecting all the Received: headers */
> -	if (match && !is_received)
> -	    return decoded_value;
>      }
>  
> -    if (message->parsing_finished) {
> -        fclose (message->file);
> -        message->file = NULL;
> -    }
> +    message->parsed = TRUE;
>  
> -    if (message->line)
> -	free (message->line);
> -    message->line = NULL;
> +    return NOTMUCH_STATUS_SUCCESS;
> +}
>  
> -    if (message->value.size) {
> -	free (message->value.str);
> -	message->value.str = NULL;
> -	message->value.size = 0;
> -	message->value.len = 0;
> -    }
> +/* return NULL on errors, empty string for non-existing headers */

For consistency, the comment should be capitalized.

Also, this comment used to explain how Received headers were handled.
It seems like we lost that.

> +const char *
> +notmuch_message_file_get_header (notmuch_message_file_t *message,
> +				 const char *header)
> +{
> +    const char *value = NULL;
> +    notmuch_bool_t found;
>  
> -    /* For the Received: header we actually might end up here even
> -     * though we found the header (as we force continued parsing
> -     * in that case). So let's check if that's the header we were
> -     * looking for and return the value that we found (if any)
> -     */
> -    if (is_received)
> -	return (char *)g_hash_table_lookup (message->headers, "received");
> -
> -    /* We've parsed all headers and never found the one we're looking
> -     * for. It's probably just not there, but let's check that we
> -     * didn't make a mistake preventing us from seeing it. */
> -    if (message->restrict_headers && header_desired &&
> -	! g_hash_table_lookup_extended (message->headers,
> -					header_desired, NULL, NULL))
> -    {
> -	INTERNAL_ERROR ("Attempt to get header \"%s\" which was not\n"
> -			"included in call to notmuch_message_file_restrict_headers\n",
> -			header_desired);
> -    }
> +    /* the caller shouldn't ask for headers before parsing */
> +    if (! message->parsed)
> +	return NULL;

Previously, calling notmuch_message_file_get_header triggered parsing
of the message, which ensured it didn't happen unless necessary.  Is
there a reason to change that behavior?

If it were still done lazily, you wouldn't need the addition below to
_notmuch_message_ensure_message_file, or the new prototype in
notmuch-private.h (and notmuch_message_file_parse could be static).

(Ignore this comment if it's justified by the next patch; I haven't
gotten there yet.)

> +
> +    found = g_hash_table_lookup_extended (message->headers, header,
> +					  NULL, (gpointer *) &value);
> +
> +    /* the caller shouldn't ask for non-restricted headers */
> +    if (! found && message->restrict_headers)
> +	return NULL;
>  
> -    return "";
> +    return value ? value : "";
>  }
> diff --git a/lib/message.cc b/lib/message.cc
> index c91f3a5..9a22d36 100644
> --- a/lib/message.cc
> +++ b/lib/message.cc
> @@ -407,6 +407,12 @@ _notmuch_message_ensure_message_file (notmuch_message_t *message)
>  	return;
>  
>      message->message_file = _notmuch_message_file_open_ctx (message, filename);
> +
> +    /* XXX: better return value handling */
> +    if (notmuch_message_file_parse (message->message_file)) {
> +	notmuch_message_file_close (message->message_file);
> +	message->message_file = NULL;
> +    }
>  }
>  
>  const char *
> diff --git a/lib/notmuch-private.h b/lib/notmuch-private.h
> index af185c7..7277df1 100644
> --- a/lib/notmuch-private.h
> +++ b/lib/notmuch-private.h
> @@ -345,6 +345,10 @@ notmuch_message_file_open (const char *filename);
>  notmuch_message_file_t *
>  _notmuch_message_file_open_ctx (void *ctx, const char *filename);
>  
> +/* Parse the message */
> +notmuch_status_t
> +notmuch_message_file_parse (notmuch_message_file_t *message);
> +
>  /* Close a notmuch message previously opened with notmuch_message_open. */
>  void
>  notmuch_message_file_close (notmuch_message_file_t *message);


More information about the notmuch mailing list