[notmuch] [PATCH] Reindex larger files that duplicate ids we have

James Westby jw+debian at jameswestby.net
Fri Dec 18 17:29:09 PST 2009


When we see a message whose message ID is already stored
in the database, check whether the new file is larger. If
it is, re-index the message and update the stored file size
and filename to those of the new file.
---

  Here's the (quite simple) patch to implement indexing the
  largest copy of each mail that we have.

  Does the re-indexing replace the old terms? If it does not,
  then in the case where you had a collision with different
  text, a search could return mails that don't contain that
  text. I don't think it's a big issue though, even if that
  is the case.

  Thanks,

  James

 lib/database.cc       |    4 +++-
 lib/index.cc          |   27 +++++++++++++++++++++++++++
 lib/message.cc        |   31 ++++++++++++++++++++++++++-----
 lib/notmuch-private.h |   13 +++++++++++++
 lib/notmuch.h         |    5 +++--
 5 files changed, 72 insertions(+), 8 deletions(-)

diff --git a/lib/database.cc b/lib/database.cc
index d834d94..64f29b9 100644
--- a/lib/database.cc
+++ b/lib/database.cc
@@ -1000,7 +1000,9 @@ notmuch_database_add_message (notmuch_database_t *notmuch,
 	    if (ret)
 		goto DONE;
 	} else {
-	    ret = NOTMUCH_STATUS_DUPLICATE_MESSAGE_ID;
+	    ret = _notmuch_message_possibly_reindex (message, filename, size);
+	    if (!ret)
+		ret = NOTMUCH_STATUS_DUPLICATE_MESSAGE_ID;
 	    goto DONE;
 	}
 
diff --git a/lib/index.cc b/lib/index.cc
index 125fa6c..14c3268 100644
--- a/lib/index.cc
+++ b/lib/index.cc
@@ -312,3 +312,30 @@ _notmuch_message_index_file (notmuch_message_t *message,
 
     return ret;
 }
+
+notmuch_status_t
+_notmuch_message_possibly_reindex (notmuch_message_t *message,
+			     const char *filename,
+			     const off_t size)
+{
+    off_t realsize = size;
+    off_t stored_size;
+    notmuch_status_t ret = NOTMUCH_STATUS_SUCCESS;
+
+    ret = _notmuch_message_size_on_disk (message, filename, &realsize);
+    if (ret)
+        goto DONE;
+    stored_size = _notmuch_message_get_filesize (message);
+    if (realsize > stored_size) {
+	ret = _notmuch_message_index_file (message, filename);
+	if (ret)
+	    goto DONE;
+	ret = _notmuch_message_set_filesize (message, filename, realsize);
+	_notmuch_message_set_filename (message, filename);
+	_notmuch_message_sync (message);
+    }
+
+  DONE:
+    return ret;
+
+}
diff --git a/lib/message.cc b/lib/message.cc
index 2bfc5ed..cc32741 100644
--- a/lib/message.cc
+++ b/lib/message.cc
@@ -427,23 +427,38 @@ _notmuch_message_set_filename (notmuch_message_t *message,
 }
 
 notmuch_status_t
-_notmuch_message_set_filesize (notmuch_message_t *message,
+_notmuch_message_size_on_disk (notmuch_message_t *message,
 			       const char *filename,
-			       const off_t size)
+			       off_t *size)
 {
     struct stat st;
-    off_t realsize = size;
     notmuch_status_t ret = NOTMUCH_STATUS_SUCCESS;
 
-    if (realsize < 0) {
+    if (*size < 0) {
 	if (stat (filename, &st)) {
 	    ret = NOTMUCH_STATUS_FILE_ERROR;
 	    goto DONE;
 	} else {
-	    realsize = st.st_size;
+	    *size = st.st_size;
 	}
     }
 
+  DONE:
+    return ret;
+}
+
+notmuch_status_t
+_notmuch_message_set_filesize (notmuch_message_t *message,
+			       const char *filename,
+			       const off_t size)
+{
+    off_t realsize = size;
+    notmuch_status_t ret = NOTMUCH_STATUS_SUCCESS;
+
+    ret = _notmuch_message_size_on_disk (message, filename, &realsize);
+    if (ret)
+        goto DONE;
+
     message->doc.add_value (NOTMUCH_VALUE_FILESIZE,
 			 Xapian::sortable_serialise (realsize));
 
@@ -451,6 +466,12 @@ _notmuch_message_set_filesize (notmuch_message_t *message,
     return ret;
 }
 
+off_t
+_notmuch_message_get_filesize (notmuch_message_t *message)
+{
+    return Xapian::sortable_unserialise (message->doc.get_value (NOTMUCH_VALUE_FILESIZE));
+}
+
 const char *
 notmuch_message_get_filename (notmuch_message_t *message)
 {
diff --git a/lib/notmuch-private.h b/lib/notmuch-private.h
index 1ba3055..cf65fd9 100644
--- a/lib/notmuch-private.h
+++ b/lib/notmuch-private.h
@@ -199,6 +199,14 @@ _notmuch_message_set_filesize (notmuch_message_t *message,
 			       const char *filename,
 			       const off_t size);
 
+off_t
+_notmuch_message_get_filesize (notmuch_message_t *message);
+
+notmuch_status_t
+_notmuch_message_size_on_disk (notmuch_message_t *message,
+			       const char *filename,
+			       off_t *size);
+
 void
 _notmuch_message_ensure_thread_id (notmuch_message_t *message);
 
@@ -218,6 +226,11 @@ notmuch_status_t
 _notmuch_message_index_file (notmuch_message_t *message,
 			     const char *filename);
 
+notmuch_status_t
+_notmuch_message_possibly_reindex (notmuch_message_t *message,
+			     const char *filename,
+			     const off_t size);
+
 /* message-file.c */
 
 /* XXX: I haven't decided yet whether these will actually get exported
diff --git a/lib/notmuch.h b/lib/notmuch.h
index 5d0d224..892e420 100644
--- a/lib/notmuch.h
+++ b/lib/notmuch.h
@@ -256,8 +256,9 @@ notmuch_database_get_timestamp (notmuch_database_t *database,
  * NOTMUCH_STATUS_SUCCESS: Message successfully added to database.
  *
  * NOTMUCH_STATUS_DUPLICATE_MESSAGE_ID: Message has the same message
- *	ID as another message already in the database. Nothing added
- *	to the database.
+ *	ID as another message already in the database. This may have
+ *	caused some further indexing to be done, but it is not an entirely
+ *	new message.
  *
  * NOTMUCH_STATUS_FILE_ERROR: an error occurred trying to open the
  *	file, (such as permission denied, or file not found,
-- 
1.6.3.3



More information about the notmuch mailing list