[PATCH v2 6/9] cli: change the data structure for notmuch address deduplication

Jani Nikula jani at nikula.org
Thu Sep 3 12:40:02 PDT 2015


Currently we key the address hash table with the case sensitive "name
<address>". Switch to case insensitive keying with just address, and
store the case sensitive name and address in linked lists. This will
be helpful in adding support for different deduplication schemes in
the future.

There will be a slight performance penalty for the current full case
sensitive name + address deduplication, but this is simpler as a whole
when other deduplication schemes are added, and I expect the schemes
to be added to become more popular than the current default.

Aparet from the possible performance penalty, the only user visible
change should be the change in the output ordering for
--output=count. The order is not guaranteed (and is based on hash
table traversal) currently anyway, so this should be of no
consequence.
---
 notmuch-client.h |  1 +
 notmuch-search.c | 84 ++++++++++++++++++++++++++++++++++++++++++++++----------
 2 files changed, 70 insertions(+), 15 deletions(-)

diff --git a/notmuch-client.h b/notmuch-client.h
index 882aa30563df..97d68d1158ac 100644
--- a/notmuch-client.h
+++ b/notmuch-client.h
@@ -48,6 +48,7 @@ typedef GMimeCryptoContext notmuch_crypto_context_t;
 #include <dirent.h>
 #include <errno.h>
 #include <signal.h>
+#include <ctype.h>
 
 #include "talloc-extra.h"
 
diff --git a/notmuch-search.c b/notmuch-search.c
index 66404b561679..7c51d5df6bd4 100644
--- a/notmuch-search.c
+++ b/notmuch-search.c
@@ -265,30 +265,74 @@ static mailbox_t *new_mailbox (void *ctx, const char *name, const char *addr)
     return mailbox;
 }
 
+static int mailbox_compare (const void *v1, const void *v2)
+{
+    const mailbox_t *m1 = v1, *m2 = v2;
+    int v;
+
+    if (m1->name && m2->name)
+	v = strcmp (m1->name, m2->name);
+    else
+	v = !!m1->name - !!m2->name;
+
+    if (! v)
+	v = strcmp (m1->addr, m2->addr);
+
+    return v;
+}
+
 /* Returns TRUE iff name and addr is duplicate. If not, stores the
  * name/addr pair in order to detect subsequent duplicates. */
 static notmuch_bool_t
 is_duplicate (const search_context_t *ctx, const char *name, const char *addr)
 {
     char *key;
+    GList *list, *l;
     mailbox_t *mailbox;
 
-    key = talloc_asprintf (ctx->format, "%s <%s>", name, addr);
-    if (! key)
-	return FALSE;
+    list = g_hash_table_lookup (ctx->addresses, addr);
+    if (list) {
+	mailbox_t find = {
+	    .name = name,
+	    .addr = addr,
+	};
+
+	l = g_list_find_custom (list, &find, mailbox_compare);
+	if (l) {
+	    mailbox = l->data;
+	    mailbox->count++;
+	    return TRUE;
+	}
 
-    mailbox = g_hash_table_lookup (ctx->addresses, key);
-    if (mailbox) {
-	mailbox->count++;
-	talloc_free (key);
-	return TRUE;
+	mailbox = new_mailbox (ctx->format, name, addr);
+	if (! mailbox)
+	    return FALSE;
+
+	/*
+	 * XXX: It would be more efficient to prepend to the list, but
+	 * then we'd have to store the changed list head back to the
+	 * hash table. This check is here just to avoid the compiler
+	 * warning for unused result.
+	 */
+	if (list != g_list_append (list, mailbox))
+	    INTERNAL_ERROR ("appending to list changed list head\n");
+
+	return FALSE;
     }
 
+    key = talloc_strdup (ctx->format, addr);
+    if (! key)
+	return FALSE;
+
     mailbox = new_mailbox (ctx->format, name, addr);
     if (! mailbox)
 	return FALSE;
 
-    g_hash_table_insert (ctx->addresses, key, mailbox);
+    list = g_list_append (NULL, mailbox);
+    if (! list)
+	return FALSE;
+
+    g_hash_table_insert (ctx->addresses, key, list);
 
     return FALSE;
 }
@@ -401,12 +445,21 @@ _talloc_free_for_g_hash (void *ptr)
 }
 
 static void
-print_hash_value (unused (gpointer key), gpointer value, gpointer user_data)
+_list_free_for_g_hash (void *ptr)
+{
+    g_list_free_full (ptr, _talloc_free_for_g_hash);
+}
+
+static void
+print_list_value (void *mailbox, void *context)
 {
-    const mailbox_t *mailbox = value;
-    search_context_t *ctx = user_data;
+    print_mailbox (context, mailbox);
+}
 
-    print_mailbox (ctx, mailbox);
+static void
+print_hash_value (unused (void *key), void *list, void *context)
+{
+    g_list_foreach (list, print_list_value, context);
 }
 
 static int
@@ -792,8 +845,9 @@ notmuch_address_command (notmuch_config_t *config, int argc, char *argv[])
 				 argc - opt_index, argv + opt_index))
 	return EXIT_FAILURE;
 
-    ctx->addresses = g_hash_table_new_full (g_str_hash, g_str_equal,
-					    _talloc_free_for_g_hash, _talloc_free_for_g_hash);
+    ctx->addresses = g_hash_table_new_full (strcase_hash, strcase_equal,
+					    _talloc_free_for_g_hash,
+					    _list_free_for_g_hash);
 
     ret = do_search_messages (ctx);
 
-- 
2.1.4



More information about the notmuch mailing list