[RFC][PATCH] notmuch-search: add file size search feature

Ioan-Adrian Ratiu adi at adirat.com
Fri May 5 20:09:19 PDT 2017


I need to keep track of my maildir file sizes, and for a while the
parsers/scripts I wrote for this were problematic, especially for
viewing and interacting with large collections of email.

Since I started using notmuch, I have wanted a way to filter my email
search results by file size. Combined with notmuch's incremental
indexing/parsing, that would be really helpful.

So this commit adds the following capability (sizes are expressed in
bytes because that's what I'm interested in; other inputs can be added
later on). Change the range argument to get mails of a specific size:

notmuch search --output=files --sort=biggest-first  filesize:1000..100000 | xargs du -s

I didn't think of another front-end for this other than creating
saved searches from the Emacs GUI, which work well enough for me.

One thing I'm not sure how to handle in code is duplicate files
for a message ID (only the MsgID determines whether files are
"duplicates", right?). Any ideas how best to store/handle duplicates?

Signed-off-by: Ioan-Adrian Ratiu <adi at adirat.com>
---
 lib/database-private.h |  1 +
 lib/database.cc        |  6 ++++++
 lib/index.cc           | 10 ++++++++++
 lib/message-file.c     | 18 +++++++++++++++++-
 lib/message.cc         | 29 +++++++++++++++++++++++++++++
 lib/notmuch-private.h  | 16 ++++++++++++++++
 lib/notmuch.h          | 14 ++++++++++++++
 lib/query.cc           |  6 ++++++
 notmuch-search.c       |  2 ++
 9 files changed, 101 insertions(+), 1 deletion(-)

diff --git a/lib/database-private.h b/lib/database-private.h
index ab3d9691..a7e0a020 100644
--- a/lib/database-private.h
+++ b/lib/database-private.h
@@ -217,6 +217,7 @@ struct _notmuch_database {
     Xapian::ValueRangeProcessor *value_range_processor;
     Xapian::ValueRangeProcessor *date_range_processor;
     Xapian::ValueRangeProcessor *last_mod_range_processor;
+    Xapian::ValueRangeProcessor *filesize_range_processor;
 };
 
 /* Prior to database version 3, features were implied by the database
diff --git a/lib/database.cc b/lib/database.cc
index 5bc131a3..e6d5dd11 100644
--- a/lib/database.cc
+++ b/lib/database.cc
@@ -290,6 +290,7 @@ prefix_t prefix_table[] = {
     { "subject",		"XSUBJECT",	NOTMUCH_FIELD_EXTERNAL |
 						NOTMUCH_FIELD_PROBABILISTIC |
 						NOTMUCH_FIELD_PROCESSOR},
+    { "filesize",		"XFILESIZE",	NOTMUCH_FIELD_EXTERNAL },
 };
 
 static void
@@ -1076,6 +1077,7 @@ notmuch_database_open_verbose (const char *path,
 	notmuch->value_range_processor = new Xapian::NumberValueRangeProcessor (NOTMUCH_VALUE_TIMESTAMP);
 	notmuch->date_range_processor = new ParseTimeValueRangeProcessor (NOTMUCH_VALUE_TIMESTAMP);
 	notmuch->last_mod_range_processor = new Xapian::NumberValueRangeProcessor (NOTMUCH_VALUE_LAST_MOD, "lastmod:");
+	notmuch->filesize_range_processor = new Xapian::NumberValueRangeProcessor (NOTMUCH_VALUE_FILESIZE, "filesize:");
 
 	notmuch->query_parser->set_default_op (Xapian::Query::OP_AND);
 	notmuch->query_parser->set_database (*notmuch->xapian_db);
@@ -1084,6 +1086,7 @@ notmuch_database_open_verbose (const char *path,
 	notmuch->query_parser->add_valuerangeprocessor (notmuch->value_range_processor);
 	notmuch->query_parser->add_valuerangeprocessor (notmuch->date_range_processor);
 	notmuch->query_parser->add_valuerangeprocessor (notmuch->last_mod_range_processor);
+	notmuch->query_parser->add_valuerangeprocessor (notmuch->filesize_range_processor);
 
 	for (i = 0; i < ARRAY_SIZE (prefix_table); i++) {
 	    const prefix_t *prefix = &prefix_table[i];
@@ -1160,6 +1163,8 @@ notmuch_database_close (notmuch_database_t *notmuch)
     notmuch->date_range_processor = NULL;
     delete notmuch->last_mod_range_processor;
     notmuch->last_mod_range_processor = NULL;
+    delete notmuch->filesize_range_processor;
+    notmuch->filesize_range_processor = NULL;
 
     return status;
 }
@@ -2557,6 +2562,7 @@ notmuch_database_add_message (notmuch_database_t *notmuch,
 	}
 
 	_notmuch_message_add_filename (message, filename);
+	_notmuch_message_add_filesize (message, message_file);
 
 	/* Is this a newly created message object or a ghost
 	 * message?  We have to be slightly careful: if this is a
diff --git a/lib/index.cc b/lib/index.cc
index 8c145540..e8655bc1 100644
--- a/lib/index.cc
+++ b/lib/index.cc
@@ -441,6 +441,8 @@ _notmuch_message_index_file (notmuch_message_t *message,
     InternetAddressList *addresses;
     const char *from, *subject;
     notmuch_status_t status;
+    unsigned long filesize;
+    char *filesize_str;
 
     status = _notmuch_message_file_get_mime_message (message_file,
 						     &mime_message);
@@ -464,6 +466,14 @@ _notmuch_message_index_file (notmuch_message_t *message,
     subject = g_mime_message_get_subject (mime_message);
     _notmuch_message_gen_terms (message, "subject", subject);
 
+    filesize = _notmuch_message_file_get_size (message_file);
+    filesize_str = talloc_asprintf(NULL, "%lu", filesize);
+    if (! filesize_str)
+	return NOTMUCH_STATUS_OUT_OF_MEMORY;
+
+    _notmuch_message_add_term (message, "filesize", filesize_str);
+    talloc_free (filesize_str);
+
     _index_mime_part (message, g_mime_message_get_mime_part (mime_message));
 
     return NOTMUCH_STATUS_SUCCESS;
diff --git a/lib/message-file.c b/lib/message-file.c
index db18b163..f75593e3 100644
--- a/lib/message-file.c
+++ b/lib/message-file.c
@@ -26,10 +26,13 @@
 
 #include <glib.h> /* GHashTable */
 
+#include <glib/gstdio.h>
+
 struct _notmuch_message_file {
     /* File object */
     FILE *file;
     char *filename;
+    unsigned long filesize; /* in bytes */
 
     /* Cache for decoded headers */
     GHashTable *headers;
@@ -64,7 +67,7 @@ _notmuch_message_file_open_ctx (notmuch_database_t *notmuch,
     if (unlikely (message == NULL))
 	return NULL;
 
-    /* Only needed for error messages during parsing. */
+    /* Only needed during parsing */
     message->filename = talloc_strdup (message, filename);
     if (message->filename == NULL)
 	goto FAIL;
@@ -98,6 +101,12 @@ _notmuch_message_file_close (notmuch_message_file_t *message)
     talloc_free (message);
 }
 
+unsigned long
+_notmuch_message_file_get_size (notmuch_message_file_t *message) /* accessor for the cached file size, in bytes */
+{
+    return message->filesize; /* assigned from g_stat in _notmuch_message_file_parse; left untouched if g_stat fails */
+}
+
 static notmuch_bool_t
 _is_mbox (FILE *file)
 {
@@ -122,6 +131,8 @@ _notmuch_message_file_parse (notmuch_message_file_t *message)
     notmuch_status_t status = NOTMUCH_STATUS_SUCCESS;
     static int initialized = 0;
     notmuch_bool_t is_mbox;
+    GStatBuf statResult;
+    int ret;
 
     if (message->message)
 	return NOTMUCH_STATUS_SUCCESS;
@@ -133,6 +144,11 @@ _notmuch_message_file_parse (notmuch_message_file_t *message)
 	initialized = 1;
     }
 
+    /* filesize defaults to zero which is ignored */
+    ret = g_stat(message->filename, &statResult);
+    if (! ret)
+	message->filesize = statResult.st_size;
+
     message->headers = g_hash_table_new_full (strcase_hash, strcase_equal,
 					      free, g_free);
     if (! message->headers)
diff --git a/lib/message.cc b/lib/message.cc
index c2721191..83b4c4db 100644
--- a/lib/message.cc
+++ b/lib/message.cc
@@ -988,6 +988,26 @@ notmuch_message_get_date (notmuch_message_t *message)
     return Xapian::sortable_unserialise (value);
 }
 
+unsigned long
+notmuch_message_get_filesize (notmuch_message_t *message) /* returns the stored file size value, or 0 when it cannot be read */
+{
+    std::string value;
+
+    try {
+	value = message->doc.get_value (NOTMUCH_VALUE_FILESIZE); /* slot written by _notmuch_message_add_filesize */
+    } catch (Xapian::Error &error) {
+	_notmuch_database_log(_notmuch_message_database (message), "A Xapian exception occurred when reading filesize: %s\n",
+		 error.get_msg().c_str());
+	message->notmuch->exception_reported = TRUE;
+	return 0; /* the Xapian error is logged and flagged; the caller only sees a size of 0 */
+    }
+
+    if (value.empty ())
+	/* sortable_unserialise is undefined on empty string */
+	return 0;
+    return Xapian::sortable_unserialise (value); /* unserialised number is implicitly converted to unsigned long */
+}
+
 notmuch_tags_t *
 notmuch_message_get_tags (notmuch_message_t *message)
 {
@@ -1208,6 +1228,15 @@ _notmuch_message_close (notmuch_message_t *message)
     }
 }
 
+void
+_notmuch_message_add_filesize (notmuch_message_t *message,
+			       notmuch_message_file_t *message_file) /* records message_file's size on message's document */
+{ /* NOTE(review): only message->doc is touched here; confirm the caller syncs the document afterwards */
+    unsigned long filesize = _notmuch_message_file_get_size(message_file); /* size in bytes as cached on message_file */
+    message->doc.add_value (NOTMUCH_VALUE_FILESIZE,
+			    Xapian::sortable_serialise (filesize)); /* same slot the filesize: range processor and size sorts read */
+}
+
 /* Add a name:value term to 'message', (the actual term will be
  * encoded by prefixing the value with a short prefix). See
  * NORMAL_PREFIX and BOOLEAN_PREFIX arrays for the mapping of term
diff --git a/lib/notmuch-private.h b/lib/notmuch-private.h
index 8587e86c..3c15ed3d 100644
--- a/lib/notmuch-private.h
+++ b/lib/notmuch-private.h
@@ -110,6 +110,7 @@ typedef enum {
     NOTMUCH_VALUE_FROM,
     NOTMUCH_VALUE_SUBJECT,
     NOTMUCH_VALUE_LAST_MOD,
+    NOTMUCH_VALUE_FILESIZE,
 } notmuch_value_t;
 
 /* Xapian (with flint backend) complains if we provide a term longer
@@ -400,6 +401,21 @@ _notmuch_message_file_close (notmuch_message_file_t *message);
 notmuch_status_t
 _notmuch_message_file_parse (notmuch_message_file_t *message);
 
+/*
+ * Get the filesize of a message file
+ *
+ * This filesize member is read during file parsing.
+ */
+unsigned long
+_notmuch_message_file_get_size (notmuch_message_file_t *message);
+
+/*
+ * Set the message filesize to the size of the message_file
+ */
+void
+_notmuch_message_add_filesize (notmuch_message_t *message,
+			       notmuch_message_file_t *message_file);
+
 /* Get the gmime message of a message file.
  *
  * The message file is parsed as necessary.
diff --git a/lib/notmuch.h b/lib/notmuch.h
index d374dc96..8cff214f 100644
--- a/lib/notmuch.h
+++ b/lib/notmuch.h
@@ -741,6 +741,14 @@ typedef enum {
      */
     NOTMUCH_SORT_MESSAGE_ID,
     /**
+     * Smallest first.
+     */
+    NOTMUCH_SORT_SMALLEST_FIRST,
+    /**
+     * Biggest first
+     */
+    NOTMUCH_SORT_BIGGEST_FIRST,
+    /**
      * Do not sort.
      */
     NOTMUCH_SORT_UNSORTED
@@ -1411,6 +1419,12 @@ time_t
 notmuch_message_get_date  (notmuch_message_t *message);
 
 /**
+ * Get the filesize in bytes of 'message'.
+ */
+unsigned long
+notmuch_message_get_filesize  (notmuch_message_t *message);
+
+/**
  * Get the value of the specified header from 'message' as a UTF-8 string.
  *
  * Common headers are stored in the database when the message is
diff --git a/lib/query.cc b/lib/query.cc
index 212e27f0..6b0b6dd7 100644
--- a/lib/query.cc
+++ b/lib/query.cc
@@ -330,6 +330,12 @@ _notmuch_query_search_documents (notmuch_query_t *query,
 	case NOTMUCH_SORT_MESSAGE_ID:
 	    enquire.set_sort_by_value (NOTMUCH_VALUE_MESSAGE_ID, FALSE);
 	    break;
+	case NOTMUCH_SORT_SMALLEST_FIRST:
+	    enquire.set_sort_by_value (NOTMUCH_VALUE_FILESIZE, FALSE);
+	    break;
+	case NOTMUCH_SORT_BIGGEST_FIRST:
+	    enquire.set_sort_by_value (NOTMUCH_VALUE_FILESIZE, TRUE);
+	    break;
 	case NOTMUCH_SORT_UNSORTED:
 	    break;
 	}
diff --git a/notmuch-search.c b/notmuch-search.c
index 019e14ee..65ecfaab 100644
--- a/notmuch-search.c
+++ b/notmuch-search.c
@@ -778,6 +778,8 @@ static const notmuch_opt_desc_t common_options[] = {
     { NOTMUCH_OPT_KEYWORD, &search_context.sort, "sort", 's',
       (notmuch_keyword_t []){ { "oldest-first", NOTMUCH_SORT_OLDEST_FIRST },
 			      { "newest-first", NOTMUCH_SORT_NEWEST_FIRST },
+			      { "smallest-first", NOTMUCH_SORT_SMALLEST_FIRST },
+			      { "biggest-first", NOTMUCH_SORT_BIGGEST_FIRST },
 			      { 0, 0 } } },
     { NOTMUCH_OPT_KEYWORD, &search_context.format_sel, "format", 'f',
       (notmuch_keyword_t []){ { "json", NOTMUCH_FORMAT_JSON },
-- 
2.12.2



More information about the notmuch mailing list