[notmuch] [PATCH v2] Store the size of the file for each message

James Westby jw+debian at jameswestby.net
Fri Dec 18 16:11:48 PST 2009


When indexing a message, store the file size along with it so that,
when we store all the filenames for a message-id, we can cheaply
tell whether any of them have different content.

The value stored is defined to be the largest filesize of any
of the files for that message.

This changes the API for efficiency reasons. The size is often
known to the caller, and so we save a second stat by asking them
to provide it. If they don't know it they can pass -1 and the
stat will be done for them.

We store the filesize such that we can query a range. Thus it
would be possible to query "filesize:0..100" if you somehow
knew the raw message was less than 100 bytes.
---

  With new, improved, working, filesize:.. search.

 lib/database.cc       |    7 +++++++
 lib/message.cc        |   25 +++++++++++++++++++++++++
 lib/notmuch-private.h |    8 +++++++-
 lib/notmuch.h         |    5 +++++
 notmuch-new.c         |    2 +-
 5 files changed, 45 insertions(+), 2 deletions(-)

diff --git a/lib/database.cc b/lib/database.cc
index b6c4d07..d834d94 100644
--- a/lib/database.cc
+++ b/lib/database.cc
@@ -463,6 +463,8 @@ notmuch_database_open (const char *path,
     struct stat st;
     int err;
     unsigned int i;
+    Xapian::NumberValueRangeProcessor *filesize_proc = new Xapian::NumberValueRangeProcessor (NOTMUCH_VALUE_FILESIZE,
+			 "filesize:", true);
 
     if (asprintf (&notmuch_path, "%s/%s", path, ".notmuch") == -1) {
 	notmuch_path = NULL;
@@ -508,6 +510,7 @@ notmuch_database_open (const char *path,
 	notmuch->query_parser->set_stemmer (Xapian::Stem ("english"));
 	notmuch->query_parser->set_stemming_strategy (Xapian::QueryParser::STEM_SOME);
 	notmuch->query_parser->add_valuerangeprocessor (notmuch->value_range_processor);
+	notmuch->query_parser->add_valuerangeprocessor (filesize_proc);
 
 	for (i = 0; i < ARRAY_SIZE (BOOLEAN_PREFIX_EXTERNAL); i++) {
 	    prefix_t *prefix = &BOOLEAN_PREFIX_EXTERNAL[i];
@@ -889,6 +892,7 @@ _notmuch_database_link_message (notmuch_database_t *notmuch,
 notmuch_status_t
 notmuch_database_add_message (notmuch_database_t *notmuch,
 			      const char *filename,
+			      const off_t size,
 			      notmuch_message_t **message_ret)
 {
     notmuch_message_file_t *message_file;
@@ -992,6 +996,9 @@ notmuch_database_add_message (notmuch_database_t *notmuch,
 	if (private_status == NOTMUCH_PRIVATE_STATUS_NO_DOCUMENT_FOUND) {
 	    _notmuch_message_set_filename (message, filename);
 	    _notmuch_message_add_term (message, "type", "mail");
+	    ret = _notmuch_message_set_filesize (message, filename, size);
+	    if (ret)
+		goto DONE;
 	} else {
 	    ret = NOTMUCH_STATUS_DUPLICATE_MESSAGE_ID;
 	    goto DONE;
diff --git a/lib/message.cc b/lib/message.cc
index 49519f1..2bfc5ed 100644
--- a/lib/message.cc
+++ b/lib/message.cc
@@ -426,6 +426,31 @@ _notmuch_message_set_filename (notmuch_message_t *message,
     message->doc.set_data (s);
 }
 
+notmuch_status_t
+_notmuch_message_set_filesize (notmuch_message_t *message,
+			       const char *filename,
+			       const off_t size)
+{
+    struct stat st;
+    off_t realsize = size;
+    notmuch_status_t ret = NOTMUCH_STATUS_SUCCESS;
+
+    if (realsize < 0) {
+	if (stat (filename, &st)) {
+	    ret = NOTMUCH_STATUS_FILE_ERROR;
+	    goto DONE;
+	} else {
+	    realsize = st.st_size;
+	}
+    }
+
+    message->doc.add_value (NOTMUCH_VALUE_FILESIZE,
+			 Xapian::sortable_serialise (realsize));
+
+  DONE:
+    return ret;
+}
+
 const char *
 notmuch_message_get_filename (notmuch_message_t *message)
 {
diff --git a/lib/notmuch-private.h b/lib/notmuch-private.h
index 116f63d..1ba3055 100644
--- a/lib/notmuch-private.h
+++ b/lib/notmuch-private.h
@@ -100,7 +100,8 @@ _internal_error (const char *format, ...) PRINTF_ATTRIBUTE (1, 2);
 
 typedef enum {
     NOTMUCH_VALUE_TIMESTAMP = 0,
-    NOTMUCH_VALUE_MESSAGE_ID
+    NOTMUCH_VALUE_MESSAGE_ID,
+    NOTMUCH_VALUE_FILESIZE
 } notmuch_value_t;
 
 /* Xapian (with flint backend) complains if we provide a term longer
@@ -193,6 +194,11 @@ void
 _notmuch_message_set_filename (notmuch_message_t *message,
 			       const char *filename);
 
+notmuch_status_t
+_notmuch_message_set_filesize (notmuch_message_t *message,
+			       const char *filename,
+			       const off_t size);
+
 void
 _notmuch_message_ensure_thread_id (notmuch_message_t *message);
 
diff --git a/lib/notmuch.h b/lib/notmuch.h
index 60834fb..5d0d224 100644
--- a/lib/notmuch.h
+++ b/lib/notmuch.h
@@ -32,6 +32,7 @@
 NOTMUCH_BEGIN_DECLS
 
 #include <time.h>
+#include <stdlib.h>
 
 #ifndef FALSE
 #define FALSE 0
@@ -241,6 +242,9 @@ notmuch_database_get_timestamp (notmuch_database_t *database,
  * notmuch database will reference the filename, and will not copy the
  * entire contents of the file.
  *
+ * 'size' should be the number of bytes in the file, or -1 if you are
+ * not sure.
+ *
  * If 'message' is not NULL, then, on successful return '*message'
  * will be initialized to a message object that can be used for things
  * such as adding tags to the just-added message. The user should call
@@ -265,6 +269,7 @@ notmuch_database_get_timestamp (notmuch_database_t *database,
 notmuch_status_t
 notmuch_database_add_message (notmuch_database_t *database,
 			      const char *filename,
+			      const off_t size,
 			      notmuch_message_t **message);
 
 /* Find a message with the given message_id.
diff --git a/notmuch-new.c b/notmuch-new.c
index 9d20616..cea66c2 100644
--- a/notmuch-new.c
+++ b/notmuch-new.c
@@ -235,7 +235,7 @@ add_files_recursive (notmuch_database_t *notmuch,
 		    fflush (stdout);
 		}
 
-		status = notmuch_database_add_message (notmuch, next, &message);
+		status = notmuch_database_add_message (notmuch, next, st->st_size, &message);
 		switch (status) {
 		    /* success */
 		    case NOTMUCH_STATUS_SUCCESS:
-- 
1.6.3.3



More information about the notmuch mailing list