[PATCH 4/6] lib: replace the header parser with gmime

Jani Nikula jani at nikula.org
Wed Oct 16 12:00:11 PDT 2013


The notmuch library includes a full blown message header parser. Yet
the same message headers are parsed by gmime during indexing. Switch
to gmime parsing completely.

These are the main differences between the parsers:

* Gmime stops header parsing at the first invalid header, and presumes
  the message body starts from there. The current parser is quite
  liberal in accepting broken headers. The change means we will be
  much pickier about accepting invalid messages. For example the
  performance test suite contains plenty of email that does not pass
  as valid email.

* The current parser converts tabs used in header folding to
  spaces. Gmime preserve the tabs. Due to a broken python library used
  in mailman, there are plenty of mailing lists that produce headers
  with tabs in header folding, we'll see plenty of tabs.

At this step, we only switch the header parsing to gmime.
---
 lib/database.cc       |   4 +
 lib/message-file.c    | 346 ++++++++++++++++----------------------------------
 lib/message.cc        |   6 +
 lib/notmuch-private.h |   4 +
 4 files changed, 122 insertions(+), 238 deletions(-)

diff --git a/lib/database.cc b/lib/database.cc
index 06f1c0a..45a3987 100644
--- a/lib/database.cc
+++ b/lib/database.cc
@@ -1907,6 +1907,10 @@ notmuch_database_add_message (notmuch_database_t *notmuch,
 					   "to",
 					   (char *) NULL);
 
+    ret = notmuch_message_file_parse (message_file);
+    if (ret)
+	goto DONE;
+
     try {
 	/* Before we do any real work, (especially before doing a
 	 * potential SHA-1 computation on the entire file's contents),
diff --git a/lib/message-file.c b/lib/message-file.c
index a2850c2..9d5a3b9 100644
--- a/lib/message-file.c
+++ b/lib/message-file.c
@@ -26,30 +26,19 @@
 
 #include <glib.h> /* GHashTable */
 
-typedef struct {
-    char *str;
-    size_t size;
-    size_t len;
-} header_value_closure_t;
-
 struct _notmuch_message_file {
     /* File object */
     FILE *file;
+    char *filename;
 
     /* Header storage */
     int restrict_headers;
     GHashTable *headers;
-    int broken_headers;
-    int good_headers;
-    size_t header_size; /* Length of full message header in bytes. */
-
-    /* Parsing state */
-    char *line;
-    size_t line_size;
-    header_value_closure_t value;
 
-    int parsing_started;
-    int parsing_finished;
+    GMimeStream *stream;
+    GMimeParser *parser;
+    GMimeMessage *message;
+    notmuch_bool_t parsed;
 };
 
 static int
@@ -76,16 +65,18 @@ strcase_hash (const void *ptr)
 static int
 _notmuch_message_file_destructor (notmuch_message_file_t *message)
 {
-    if (message->line)
-	free (message->line);
-
-    if (message->value.size)
-	free (message->value.str);
-
     if (message->headers)
 	g_hash_table_destroy (message->headers);
 
-    if (message->file)
+    if (message->message)
+	g_object_unref (message->message);
+
+    if (message->parser)
+	g_object_unref (message->parser);
+
+    if (message->stream)
+	g_object_unref (message->stream);
+    else if (message->file) /* GMimeStream takes over the FILE* */
 	fclose (message->file);
 
     return 0;
@@ -102,19 +93,21 @@ _notmuch_message_file_open_ctx (void *ctx, const char *filename)
     if (unlikely (message == NULL))
 	return NULL;
 
+    /* only needed for error messages during parsing */
+    message->filename = talloc_strdup (message, filename);
+    if (message->filename == NULL)
+	goto FAIL;
+
     talloc_set_destructor (message, _notmuch_message_file_destructor);
 
     message->file = fopen (filename, "r");
     if (message->file == NULL)
 	goto FAIL;
 
-    message->headers = g_hash_table_new_full (strcase_hash,
-					      strcase_equal,
-					      free,
-					      g_free);
-
-    message->parsing_started = 0;
-    message->parsing_finished = 0;
+    message->headers = g_hash_table_new_full (strcase_hash, strcase_equal,
+					      free, g_free);
+    if (message->headers == NULL)
+	goto FAIL;
 
     return message;
 
@@ -143,7 +136,7 @@ notmuch_message_file_restrict_headersv (notmuch_message_file_t *message,
 {
     char *header;
 
-    if (message->parsing_started)
+    if (message->parsed)
 	INTERNAL_ERROR ("notmuch_message_file_restrict_headers called after parsing has started");
 
     while (1) {
@@ -167,234 +160,111 @@ notmuch_message_file_restrict_headers (notmuch_message_file_t *message, ...)
     notmuch_message_file_restrict_headersv (message, va_headers);
 }
 
-static void
-copy_header_unfolding (header_value_closure_t *value,
-		       const char *chunk)
+/*
+ * gmime does not provide access to all headers, so we need to use the
+ * parser header callback to gather them into a hash table.
+ */
+static void header_cb (unused(GMimeParser *parser), const char *header,
+		       const char *value, unused(gint64 offset),
+		       gpointer user_data)
 {
-    char *last;
+    notmuch_message_file_t *message = (notmuch_message_file_t *) user_data;
+    char *existing = NULL;
+    notmuch_bool_t found;
 
-    if (chunk == NULL)
+    found = g_hash_table_lookup_extended (message->headers, header,
+					  NULL, (gpointer *) &existing);
+    if (! found && message->restrict_headers)
 	return;
 
-    while (*chunk == ' ' || *chunk == '\t')
-	chunk++;
-
-    if (value->len + 1 + strlen (chunk) + 1 > value->size) {
-	unsigned int new_size = value->size;
-	if (value->size == 0)
-	    new_size = strlen (chunk) + 1;
-	else
-	    while (value->len + 1 + strlen (chunk) + 1 > new_size)
-		new_size *= 2;
-	value->str = xrealloc (value->str, new_size);
-	value->size = new_size;
-    }
-
-    last = value->str + value->len;
-    if (value->len) {
-	*last = ' ';
-	last++;
-	value->len++;
-    }
-
-    strcpy (last, chunk);
-    value->len += strlen (chunk);
-
-    last = value->str + value->len - 1;
-    if (*last == '\n') {
-	*last = '\0';
-	value->len--;
+    if (existing == NULL) {
+	/* No value, add one */
+	char *decoded = g_mime_utils_header_decode_text (value);
+	g_hash_table_insert (message->headers, xstrdup (header), decoded);
+    } else if (strcasecmp (header, "received") == 0) {
+	/* Concat all of the Received: headers we encounter. */
+	char *combined, *decoded;
+	size_t combined_size;
+
+	decoded = g_mime_utils_header_decode_text (value);
+
+	combined_size = strlen(existing) + strlen(decoded) + 2;
+	combined = g_malloc (combined_size);
+	snprintf (combined, combined_size, "%s %s", existing, decoded);
+	g_free (decoded);
+	g_hash_table_insert (message->headers, xstrdup (header), combined);
     }
 }
 
-/* As a special-case, a value of NULL for header_desired will force
- * the entire header to be parsed if it is not parsed already. This is
- * used by the _notmuch_message_file_get_headers_end function.
- * Another special case is the Received: header. For this header we
- * want to concatenate all instances of the header instead of just
- * hashing the first instance as we use this when analyzing the path
- * the mail has taken from sender to recipient.
- */
-const char *
-notmuch_message_file_get_header (notmuch_message_file_t *message,
-				 const char *header_desired)
+notmuch_status_t
+notmuch_message_file_parse (notmuch_message_file_t *message)
 {
-    int contains;
-    char *header, *decoded_value, *header_sofar, *combined_header;
-    const char *s, *colon;
-    int match, newhdr, hdrsofar, is_received;
     static int initialized = 0;
-
-    is_received = (strcmp(header_desired,"received") == 0);
+    char from_buf[5];
+    notmuch_bool_t is_mbox = FALSE;
+    static notmuch_bool_t mbox_warning = FALSE;
 
     if (! initialized) {
 	g_mime_init (GMIME_ENABLE_RFC2047_WORKAROUNDS);
 	initialized = 1;
     }
 
-    message->parsing_started = 1;
-
-    if (header_desired == NULL)
-	contains = 0;
-    else
-	contains = g_hash_table_lookup_extended (message->headers,
-						 header_desired, NULL,
-						 (gpointer *) &decoded_value);
-
-    if (contains && decoded_value)
-	return decoded_value;
-
-    if (message->parsing_finished)
-	return "";
-
-#define NEXT_HEADER_LINE(closure)				\
-    while (1) {							\
-	ssize_t bytes_read = getline (&message->line,		\
-				      &message->line_size,	\
-				      message->file);		\
-	if (bytes_read == -1) {					\
-	    message->parsing_finished = 1;			\
-	    break;						\
-	}							\
-	if (*message->line == '\n') {				\
-	    message->parsing_finished = 1;			\
-	    break;						\
-	}							\
-	if (closure &&						\
-	    (*message->line == ' ' || *message->line == '\t'))	\
-	{							\
-	    copy_header_unfolding ((closure), message->line);	\
-	}							\
-	if (*message->line == ' ' || *message->line == '\t')	\
-	    message->header_size += strlen (message->line);	\
-	else							\
-	    break;						\
-    }
-
-    if (message->line == NULL)
-	NEXT_HEADER_LINE (NULL);
-
-    while (1) {
-
-	if (message->parsing_finished)
-	    break;
-
-	colon = strchr (message->line, ':');
-
-	if (colon == NULL) {
-	    message->broken_headers++;
-	    /* A simple heuristic for giving up on things that just
-	     * don't look like mail messages. */
-	    if (message->broken_headers >= 10 &&
-		message->good_headers < 5)
-	    {
-		message->parsing_finished = 1;
-		break;
-	    }
-	    NEXT_HEADER_LINE (NULL);
-	    continue;
+    /* Is this mbox? */
+    if (fread (from_buf, sizeof (from_buf), 1, message->file) == 1 &&
+	strncmp (from_buf, "From ", 5) == 0)
+	is_mbox = TRUE;
+    rewind (message->file);
+
+    message->stream = g_mime_stream_file_new (message->file);
+    message->parser = g_mime_parser_new_with_stream (message->stream);
+    g_mime_parser_set_scan_from (message->parser, is_mbox);
+    g_mime_parser_set_header_regex (message->parser, ".*", header_cb,
+				    (gpointer) message);
+
+    message->message = g_mime_parser_construct_message (message->parser);
+    if (! message->message)
+	return NOTMUCH_STATUS_FILE_NOT_EMAIL;
+
+    if (is_mbox) {
+	if (!g_mime_parser_eos (message->parser)) {
+	    /* This is a multi-message mbox. */
+	    return NOTMUCH_STATUS_FILE_NOT_EMAIL;
 	}
-
-	message->header_size += strlen (message->line);
-
-	message->good_headers++;
-
-	header = xstrndup (message->line, colon - message->line);
-
-	if (message->restrict_headers &&
-	    ! g_hash_table_lookup_extended (message->headers,
-					    header, NULL, NULL))
-	{
-	    free (header);
-	    NEXT_HEADER_LINE (NULL);
-	    continue;
-	}
-
-	s = colon + 1;
-	while (*s == ' ' || *s == '\t')
-	    s++;
-
-	message->value.len = 0;
-	copy_header_unfolding (&message->value, s);
-
-	NEXT_HEADER_LINE (&message->value);
-
-	if (header_desired == NULL)
-	    match = 0;
-	else
-	    match = (strcasecmp (header, header_desired) == 0);
-
-	decoded_value = g_mime_utils_header_decode_text (message->value.str);
-	header_sofar = (char *)g_hash_table_lookup (message->headers, header);
-	/* we treat the Received: header special - we want to concat ALL of 
-	 * the Received: headers we encounter.
-	 * for everything else we return the first instance of a header */
-	if (strcasecmp(header, "received") == 0) {
-	    if (header_sofar == NULL) {
-		/* first Received: header we encountered; just add it */
-		g_hash_table_insert (message->headers, header, decoded_value);
-	    } else {
-		/* we need to add the header to those we already collected */
-		newhdr = strlen(decoded_value);
-		hdrsofar = strlen(header_sofar);
-		combined_header = g_malloc(hdrsofar + newhdr + 2);
-		strncpy(combined_header,header_sofar,hdrsofar);
-		*(combined_header+hdrsofar) = ' ';
-		strncpy(combined_header+hdrsofar+1,decoded_value,newhdr+1);
-		g_free (decoded_value);
-		g_hash_table_insert (message->headers, header, combined_header);
-	    }
-	} else {
-	    if (header_sofar == NULL) {
-		/* Only insert if we don't have a value for this header, yet. */
-		g_hash_table_insert (message->headers, header, decoded_value);
-	    } else {
-		free (header);
-		g_free (decoded_value);
-		decoded_value = header_sofar;
-	    }
+	/* For historical reasons, we support single-message mboxes,
+	 * but this behavior is likely to change in the future, so
+	 * warn. */
+	if (!mbox_warning) {
+	    mbox_warning = TRUE;
+	    fprintf (stderr, "\
+Warning: %s is an mbox containing a single message,\n\
+likely caused by misconfigured mail delivery.  Support for single-message\n\
+mboxes is deprecated and may be removed in the future.\n", message->filename);
 	}
-	/* if we found a match we can bail - unless of course we are
-	 * collecting all the Received: headers */
-	if (match && !is_received)
-	    return decoded_value;
     }
 
-    if (message->parsing_finished) {
-        fclose (message->file);
-        message->file = NULL;
-    }
+    message->parsed = TRUE;
 
-    if (message->line)
-	free (message->line);
-    message->line = NULL;
+    return NOTMUCH_STATUS_SUCCESS;
+}
 
-    if (message->value.size) {
-	free (message->value.str);
-	message->value.str = NULL;
-	message->value.size = 0;
-	message->value.len = 0;
-    }
+/* return NULL on errors, empty string for non-existing headers */
+const char *
+notmuch_message_file_get_header (notmuch_message_file_t *message,
+				 const char *header)
+{
+    const char *value = NULL;
+    notmuch_bool_t found;
 
-    /* For the Received: header we actually might end up here even
-     * though we found the header (as we force continued parsing
-     * in that case). So let's check if that's the header we were
-     * looking for and return the value that we found (if any)
-     */
-    if (is_received)
-	return (char *)g_hash_table_lookup (message->headers, "received");
-
-    /* We've parsed all headers and never found the one we're looking
-     * for. It's probably just not there, but let's check that we
-     * didn't make a mistake preventing us from seeing it. */
-    if (message->restrict_headers && header_desired &&
-	! g_hash_table_lookup_extended (message->headers,
-					header_desired, NULL, NULL))
-    {
-	INTERNAL_ERROR ("Attempt to get header \"%s\" which was not\n"
-			"included in call to notmuch_message_file_restrict_headers\n",
-			header_desired);
-    }
+    /* the caller shouldn't ask for headers before parsing */
+    if (! message->parsed)
+	return NULL;
+
+    found = g_hash_table_lookup_extended (message->headers, header,
+					  NULL, (gpointer *) &value);
+
+    /* the caller shouldn't ask for non-restricted headers */
+    if (! found && message->restrict_headers)
+	return NULL;
 
-    return "";
+    return value ? value : "";
 }
diff --git a/lib/message.cc b/lib/message.cc
index 1b46379..1c7b91f 100644
--- a/lib/message.cc
+++ b/lib/message.cc
@@ -407,6 +407,12 @@ _notmuch_message_ensure_message_file (notmuch_message_t *message)
 	return;
 
     message->message_file = _notmuch_message_file_open_ctx (message, filename);
+
+    /* XXX: better return value handling */
+    if (notmuch_message_file_parse (message->message_file)) {
+	notmuch_message_file_close (message->message_file);
+	message->message_file = NULL;
+    }
 }
 
 const char *
diff --git a/lib/notmuch-private.h b/lib/notmuch-private.h
index af185c7..7277df1 100644
--- a/lib/notmuch-private.h
+++ b/lib/notmuch-private.h
@@ -345,6 +345,10 @@ notmuch_message_file_open (const char *filename);
 notmuch_message_file_t *
 _notmuch_message_file_open_ctx (void *ctx, const char *filename);
 
+/* Parse the message */
+notmuch_status_t
+notmuch_message_file_parse (notmuch_message_file_t *message);
+
 /* Close a notmuch message previously opened with notmuch_message_open. */
 void
 notmuch_message_file_close (notmuch_message_file_t *message);
-- 
1.8.4.rc3



More information about the notmuch mailing list