[PATCH 1/3] notmuch-search: implement 'filesize' search

Ioan-Adrian Ratiu adi at adirat.com
Sat May 6 13:45:28 PDT 2017


I need to keep track of my mail file sizes and writing custom
parsers/scripts and running them on the maildir is problematic
especially for managing large collections of email.

Having a way to search email based on file sizes would be very useful,
also copled with notmuch's incremental maildir indexing/parsing, rapid
result navigation/sorting via Emacs makes my life a lot easier.

So this commit adds the following capability (sizes are in bytes
because that's what I'm interested in, other inputs can be added
I guess). Replace the range param to get mails of a specific size:

notmuch search --sort=biggest-first filesize:1000..100000

I didn't think of another front-end for this other than the command
line and saved searches from the Emacs GUI, which work well enough for
me with this patch series, but it would be simple to also expose the
filesize info on the screen somewhere in the results/show buffer.

Another interesting idea would be to offer some optional statistics
in the UI buffers like we do for saved searches `query-count` to show
stuff like disk space occupied by mail files maching a search, etc.

Signed-off-by: Ioan-Adrian Ratiu <adi at adirat.com>
---
 lib/database-private.h |  1 +
 lib/database.cc        |  6 ++++++
 lib/index.cc           | 10 ++++++++++
 lib/message-file.c     | 18 +++++++++++++++++-
 lib/message.cc         | 29 +++++++++++++++++++++++++++++
 lib/notmuch-private.h  | 16 ++++++++++++++++
 lib/notmuch.h          | 14 ++++++++++++++
 lib/query.cc           |  6 ++++++
 notmuch-search.c       |  2 ++
 9 files changed, 101 insertions(+), 1 deletion(-)

diff --git a/lib/database-private.h b/lib/database-private.h
index ab3d9691..a7e0a020 100644
--- a/lib/database-private.h
+++ b/lib/database-private.h
@@ -217,6 +217,7 @@ struct _notmuch_database {
     Xapian::ValueRangeProcessor *value_range_processor;
     Xapian::ValueRangeProcessor *date_range_processor;
     Xapian::ValueRangeProcessor *last_mod_range_processor;
+    Xapian::ValueRangeProcessor *filesize_range_processor;
 };
 
 /* Prior to database version 3, features were implied by the database
diff --git a/lib/database.cc b/lib/database.cc
index 5bc131a3..e6d5dd11 100644
--- a/lib/database.cc
+++ b/lib/database.cc
@@ -290,6 +290,7 @@ prefix_t prefix_table[] = {
     { "subject",		"XSUBJECT",	NOTMUCH_FIELD_EXTERNAL |
 						NOTMUCH_FIELD_PROBABILISTIC |
 						NOTMUCH_FIELD_PROCESSOR},
+    { "filesize",		"XFILESIZE",	NOTMUCH_FIELD_EXTERNAL },
 };
 
 static void
@@ -1076,6 +1077,7 @@ notmuch_database_open_verbose (const char *path,
 	notmuch->value_range_processor = new Xapian::NumberValueRangeProcessor (NOTMUCH_VALUE_TIMESTAMP);
 	notmuch->date_range_processor = new ParseTimeValueRangeProcessor (NOTMUCH_VALUE_TIMESTAMP);
 	notmuch->last_mod_range_processor = new Xapian::NumberValueRangeProcessor (NOTMUCH_VALUE_LAST_MOD, "lastmod:");
+	notmuch->filesize_range_processor = new Xapian::NumberValueRangeProcessor (NOTMUCH_VALUE_FILESIZE, "filesize:");
 
 	notmuch->query_parser->set_default_op (Xapian::Query::OP_AND);
 	notmuch->query_parser->set_database (*notmuch->xapian_db);
@@ -1084,6 +1086,7 @@ notmuch_database_open_verbose (const char *path,
 	notmuch->query_parser->add_valuerangeprocessor (notmuch->value_range_processor);
 	notmuch->query_parser->add_valuerangeprocessor (notmuch->date_range_processor);
 	notmuch->query_parser->add_valuerangeprocessor (notmuch->last_mod_range_processor);
+	notmuch->query_parser->add_valuerangeprocessor (notmuch->filesize_range_processor);
 
 	for (i = 0; i < ARRAY_SIZE (prefix_table); i++) {
 	    const prefix_t *prefix = &prefix_table[i];
@@ -1160,6 +1163,8 @@ notmuch_database_close (notmuch_database_t *notmuch)
     notmuch->date_range_processor = NULL;
     delete notmuch->last_mod_range_processor;
     notmuch->last_mod_range_processor = NULL;
+    delete notmuch->filesize_range_processor;
+    notmuch->filesize_range_processor = NULL;
 
     return status;
 }
@@ -2557,6 +2562,7 @@ notmuch_database_add_message (notmuch_database_t *notmuch,
 	}
 
 	_notmuch_message_add_filename (message, filename);
+	_notmuch_message_add_filesize (message, message_file);
 
 	/* Is this a newly created message object or a ghost
 	 * message?  We have to be slightly careful: if this is a
diff --git a/lib/index.cc b/lib/index.cc
index 8c145540..e8655bc1 100644
--- a/lib/index.cc
+++ b/lib/index.cc
@@ -441,6 +441,8 @@ _notmuch_message_index_file (notmuch_message_t *message,
     InternetAddressList *addresses;
     const char *from, *subject;
     notmuch_status_t status;
+    unsigned long filesize;
+    char *filesize_str;
 
     status = _notmuch_message_file_get_mime_message (message_file,
 						     &mime_message);
@@ -464,6 +466,14 @@ _notmuch_message_index_file (notmuch_message_t *message,
     subject = g_mime_message_get_subject (mime_message);
     _notmuch_message_gen_terms (message, "subject", subject);
 
+    filesize = _notmuch_message_file_get_size (message_file);
+    filesize_str = talloc_asprintf(NULL, "%lu", filesize);
+    if (! filesize_str)
+	return NOTMUCH_STATUS_OUT_OF_MEMORY;
+
+    _notmuch_message_add_term (message, "filesize", filesize_str);
+    talloc_free (filesize_str);
+
     _index_mime_part (message, g_mime_message_get_mime_part (mime_message));
 
     return NOTMUCH_STATUS_SUCCESS;
diff --git a/lib/message-file.c b/lib/message-file.c
index db18b163..f75593e3 100644
--- a/lib/message-file.c
+++ b/lib/message-file.c
@@ -26,10 +26,13 @@
 
 #include <glib.h> /* GHashTable */
 
+#include <glib/gstdio.h>
+
 struct _notmuch_message_file {
     /* File object */
     FILE *file;
     char *filename;
+    unsigned long filesize; /* in bytes */
 
     /* Cache for decoded headers */
     GHashTable *headers;
@@ -64,7 +67,7 @@ _notmuch_message_file_open_ctx (notmuch_database_t *notmuch,
     if (unlikely (message == NULL))
 	return NULL;
 
-    /* Only needed for error messages during parsing. */
+    /* Only needed during parsing */
     message->filename = talloc_strdup (message, filename);
     if (message->filename == NULL)
 	goto FAIL;
@@ -98,6 +101,12 @@ _notmuch_message_file_close (notmuch_message_file_t *message)
     talloc_free (message);
 }
 
+unsigned long
+_notmuch_message_file_get_size (notmuch_message_file_t *message)
+{
+    return message->filesize;
+}
+
 static notmuch_bool_t
 _is_mbox (FILE *file)
 {
@@ -122,6 +131,8 @@ _notmuch_message_file_parse (notmuch_message_file_t *message)
     notmuch_status_t status = NOTMUCH_STATUS_SUCCESS;
     static int initialized = 0;
     notmuch_bool_t is_mbox;
+    GStatBuf statResult;
+    int ret;
 
     if (message->message)
 	return NOTMUCH_STATUS_SUCCESS;
@@ -133,6 +144,11 @@ _notmuch_message_file_parse (notmuch_message_file_t *message)
 	initialized = 1;
     }
 
+    /* filesize defaults to zero which is ignored */
+    ret = g_stat(message->filename, &statResult);
+    if (! ret)
+	message->filesize = statResult.st_size;
+
     message->headers = g_hash_table_new_full (strcase_hash, strcase_equal,
 					      free, g_free);
     if (! message->headers)
diff --git a/lib/message.cc b/lib/message.cc
index c2721191..83b4c4db 100644
--- a/lib/message.cc
+++ b/lib/message.cc
@@ -988,6 +988,26 @@ notmuch_message_get_date (notmuch_message_t *message)
     return Xapian::sortable_unserialise (value);
 }
 
+unsigned long
+notmuch_message_get_filesize (notmuch_message_t *message)
+{
+    std::string value;
+
+    try {
+	value = message->doc.get_value (NOTMUCH_VALUE_FILESIZE);
+    } catch (Xapian::Error &error) {
+	_notmuch_database_log(_notmuch_message_database (message), "A Xapian exception occurred when reading filesize: %s\n",
+		 error.get_msg().c_str());
+	message->notmuch->exception_reported = TRUE;
+	return 0;
+    }
+
+    if (value.empty ())
+	/* sortable_unserialise is undefined on empty string */
+	return 0;
+    return Xapian::sortable_unserialise (value);
+}
+
 notmuch_tags_t *
 notmuch_message_get_tags (notmuch_message_t *message)
 {
@@ -1208,6 +1228,15 @@ _notmuch_message_close (notmuch_message_t *message)
     }
 }
 
+void
+_notmuch_message_add_filesize (notmuch_message_t *message,
+			       notmuch_message_file_t *message_file)
+{
+    unsigned long filesize = _notmuch_message_file_get_size(message_file);
+    message->doc.add_value (NOTMUCH_VALUE_FILESIZE,
+			    Xapian::sortable_serialise (filesize));
+}
+
 /* Add a name:value term to 'message', (the actual term will be
  * encoded by prefixing the value with a short prefix). See
  * NORMAL_PREFIX and BOOLEAN_PREFIX arrays for the mapping of term
diff --git a/lib/notmuch-private.h b/lib/notmuch-private.h
index 8587e86c..3c15ed3d 100644
--- a/lib/notmuch-private.h
+++ b/lib/notmuch-private.h
@@ -110,6 +110,7 @@ typedef enum {
     NOTMUCH_VALUE_FROM,
     NOTMUCH_VALUE_SUBJECT,
     NOTMUCH_VALUE_LAST_MOD,
+    NOTMUCH_VALUE_FILESIZE,
 } notmuch_value_t;
 
 /* Xapian (with flint backend) complains if we provide a term longer
@@ -400,6 +401,21 @@ _notmuch_message_file_close (notmuch_message_file_t *message);
 notmuch_status_t
 _notmuch_message_file_parse (notmuch_message_file_t *message);
 
+/*
+ * Get the filesize of a message file
+ *
+ * This filesize member is read during file parsing.
+ */
+unsigned long
+_notmuch_message_file_get_size (notmuch_message_file_t *message);
+
+/*
+ * Set the message filesize to the size of the message_file
+ */
+void
+_notmuch_message_add_filesize (notmuch_message_t *message,
+			       notmuch_message_file_t *message_file);
+
 /* Get the gmime message of a message file.
  *
  * The message file is parsed as necessary.
diff --git a/lib/notmuch.h b/lib/notmuch.h
index d374dc96..8cff214f 100644
--- a/lib/notmuch.h
+++ b/lib/notmuch.h
@@ -741,6 +741,14 @@ typedef enum {
      */
     NOTMUCH_SORT_MESSAGE_ID,
     /**
+     * Smallest first.
+     */
+    NOTMUCH_SORT_SMALLEST_FIRST,
+    /**
+     * Biggest first
+     */
+    NOTMUCH_SORT_BIGGEST_FIRST,
+    /**
      * Do not sort.
      */
     NOTMUCH_SORT_UNSORTED
@@ -1411,6 +1419,12 @@ time_t
 notmuch_message_get_date  (notmuch_message_t *message);
 
 /**
+ * Get the filesize in bytes of 'message'.
+ */
+unsigned long
+notmuch_message_get_filesize  (notmuch_message_t *message);
+
+/**
  * Get the value of the specified header from 'message' as a UTF-8 string.
  *
  * Common headers are stored in the database when the message is
diff --git a/lib/query.cc b/lib/query.cc
index 212e27f0..6b0b6dd7 100644
--- a/lib/query.cc
+++ b/lib/query.cc
@@ -330,6 +330,12 @@ _notmuch_query_search_documents (notmuch_query_t *query,
 	case NOTMUCH_SORT_MESSAGE_ID:
 	    enquire.set_sort_by_value (NOTMUCH_VALUE_MESSAGE_ID, FALSE);
 	    break;
+	case NOTMUCH_SORT_SMALLEST_FIRST:
+	    enquire.set_sort_by_value (NOTMUCH_VALUE_FILESIZE, FALSE);
+	    break;
+	case NOTMUCH_SORT_BIGGEST_FIRST:
+	    enquire.set_sort_by_value (NOTMUCH_VALUE_FILESIZE, TRUE);
+	    break;
 	case NOTMUCH_SORT_UNSORTED:
 	    break;
 	}
diff --git a/notmuch-search.c b/notmuch-search.c
index 019e14ee..65ecfaab 100644
--- a/notmuch-search.c
+++ b/notmuch-search.c
@@ -778,6 +778,8 @@ static const notmuch_opt_desc_t common_options[] = {
     { NOTMUCH_OPT_KEYWORD, &search_context.sort, "sort", 's',
       (notmuch_keyword_t []){ { "oldest-first", NOTMUCH_SORT_OLDEST_FIRST },
 			      { "newest-first", NOTMUCH_SORT_NEWEST_FIRST },
+			      { "smallest-first", NOTMUCH_SORT_SMALLEST_FIRST },
+			      { "biggest-first", NOTMUCH_SORT_BIGGEST_FIRST },
 			      { 0, 0 } } },
     { NOTMUCH_OPT_KEYWORD, &search_context.format_sel, "format", 'f',
       (notmuch_keyword_t []){ { "json", NOTMUCH_FORMAT_JSON },
-- 
2.12.2



More information about the notmuch mailing list