[PATCH] Add notmuch-remove-duplicates.py script to contrib.
Michal Nazarewicz
mina86 at mina86.com
Tue Sep 4 12:43:53 PDT 2012
On Tue, Sep 04 2012, Dmitry Kurochkin wrote:
> The script removes duplicate message files. It takes no options.
>
> Files are assumed duplicates if their content is the same except for
> ignored headers. Currently, the only ignored header is Received:.
> ---
> contrib/notmuch-remove-duplicates.py | 95 ++++++++++++++++++++++++++++++++++
> 1 file changed, 95 insertions(+)
> create mode 100755 contrib/notmuch-remove-duplicates.py
>
> diff --git a/contrib/notmuch-remove-duplicates.py b/contrib/notmuch-remove-duplicates.py
> new file mode 100755
> index 0000000..dbe2e25
> --- /dev/null
> +++ b/contrib/notmuch-remove-duplicates.py
> @@ -0,0 +1,95 @@
> +#!/usr/bin/env python
> +
> +import sys
> +
> +IGNORED_HEADERS = [ "Received:" ]
> +
> +if len(sys.argv) != 1:
> + print "Usage: %s" % sys.argv[0]
> + print
> + print "The script removes duplicate message files. Takes no options."
> + print "Requires notmuch python module."
> + print
> + print "Files are assumed duplicates if their content is the same"
> + print "except for the following headers: %s." % ", ".join(IGNORED_HEADERS)
> + exit(1)
This would be much better placed inside a main() function, which is then
called only if the script is run directly.
> +
> +import notmuch
> +import os
> +import time
> +
> +class MailComparator:
> + """Checks if mail files are duplicates."""
> + def __init__(self, filename):
> + self.filename = filename
> + self.mail = self.readFile(self.filename)
> +
> + def isDuplicate(self, filename):
> + return self.mail == self.readFile(filename)
> +
> + @staticmethod
> + def readFile(filename):
> + with open(filename) as f:
> + data = ""
> + while True:
> + line = f.readline()
> + for header in IGNORED_HEADERS:
> + if line.startswith(header):
Header names are case-insensitive, but this comparison is case-sensitive.
> + # skip header continuation lines
> + while True:
> + line = f.readline()
> + if len(line) == 0 or line[0] not in [" ", "\t"]:
> + break
> + break
This will ignore the line just after the ignored header.
> + else:
> + data += line
> + if line == "\n":
> + break
> + data += f.read()
> + return data
> +
> +db = notmuch.Database()
> +query = db.create_query('*')
> +print "Number of messages: %s" % query.count_messages()
> +
> +files_count = 0
> +for root, dirs, files in os.walk(db.get_path()):
> + if not root.startswith(os.path.join(db.get_path(), ".notmuch/")):
> + files_count += len(files)
> +print "Number of files: %s" % files_count
> +print "Estimated number of duplicates: %s" % (files_count - query.count_messages())
> +
> +msgs = query.search_messages()
> +msg_count = 0
> +suspected_duplicates_count = 0
> +duplicates_count = 0
> +timestamp = time.time()
> +for msg in msgs:
> + msg_count += 1
> + if len(msg.get_filenames()) > 1:
> + filenames = msg.get_filenames()
> + comparator = MailComparator(filenames.next())
> + for filename in filenames:
Strictly speaking, you need to compare each file to each file, and not
just every file to the first file.
> + if os.path.realpath(comparator.filename) == os.path.realpath(filename):
> + print "Message '%s' has filenames pointing to the
> same file: '%s' '%s'" % (msg.get_message_id(), comparator.filename,
> filename)
So why aren't those removed?
> + elif comparator.isDuplicate(filename):
> + os.remove(filename)
> + duplicates_count += 1
> + else:
> + #print "Potential duplicates: %s" % msg.get_message_id()
> + suspected_duplicates_count += 1
> +
> + new_timestamp = time.time()
> + if new_timestamp - timestamp > 1:
> + timestamp = new_timestamp
> + sys.stdout.write("\rProcessed %s messages, removed %s duplicates..." % (msg_count, duplicates_count))
> + sys.stdout.flush()
> +
> +print "\rFinished. Processed %s messages, removed %s duplicates." % (msg_count, duplicates_count)
> +if duplicates_count > 0:
> + print "You might want to run 'notmuch new' now."
> +
> +if suspected_duplicates_count > 0:
> + print
> + print "Found %s messages with duplicate IDs but different content." % suspected_duplicates_count
> + print "Perhaps we should ignore more headers."
Please consider the following instead (not tested):
#!/usr/bin/env python
import collections
import notmuch
import os
import re
import sys
import time
# Header names (without the trailing colon) whose lines are dropped before
# the contents of two message files are compared.
IGNORED_HEADERS = ['Received']

# Matches a header line whose field name is one of IGNORED_HEADERS.  The
# match is case-insensitive, and every name is escaped so that characters
# such as '.' or '+' in a header name are always matched literally.
isIgnoredHeadersLine = re.compile(
    r'^(?:%s)\s*:' % '|'.join(re.escape(name) for name in IGNORED_HEADERS),
    re.IGNORECASE).search

# Matches a header continuation line, i.e. one starting with a space or a
# tab.  A bare r'^\s' would also match the empty line that separates the
# headers from the body, because '\n' itself is whitespace.
doesStartWithWS = re.compile(r'^[ \t]').search
def usage(argv0):
    """Print a short usage message for the script to standard output.

    argv0 is the name the script was invoked as; it is interpolated into
    the first line of the message.  The list of ignored headers is taken
    from the module-level IGNORED_HEADERS.
    """
    # A single parenthesised argument is valid both as a Python 2 print
    # statement and as a Python 3 print() call.
    print("""Usage: %s [<query-string>]
The script removes duplicate message files. Takes no options.
Requires notmuch python module.
Files are assumed duplicates if their content is the same
except for the following headers: %s.""" % (argv0, ', '.join(IGNORED_HEADERS)))
def readMailFile(filename):
    """Return the content of a mail file with the ignored headers removed.

    The header section is read line by line.  Every header accepted by
    isIgnoredHeadersLine is dropped together with its continuation lines
    (lines accepted by doesStartWithWS).  All other header lines, the
    blank separator line and the whole body are kept unchanged.

    filename is the path of the message file.  Returns the filtered
    content as a single string.
    """
    with open(filename) as fd:
        data = []
        skip_header = False
        for line in fd:
            # Test for the header/body separator first: a whitespace
            # matcher may also accept a bare newline, which would hide
            # the end of the header section.
            if line in ('\n', '\r\n'):
                data.append(line)
                break
            if doesStartWithWS(line):
                # Continuation line: it shares the fate of the header it
                # belongs to.
                if not skip_header:
                    data.append(line)
                continue
            # A new header starts here, so the decision is taken afresh;
            # otherwise one ignored header would also swallow the
            # continuation lines of every header that follows it.
            skip_header = bool(isIgnoredHeadersLine(line))
            if not skip_header:
                data.append(line)
        # Keep using the iterator for the body.  On Python 2, file
        # iteration uses a read-ahead buffer, so a plain fd.read() here
        # could miss data that has already been buffered.
        data.extend(fd)
    return ''.join(data)
def dedupMessage(msg):
    """Report the duplicate files of a single message.

    msg must provide get_filenames(), yielding the paths of the files
    that belong to the message.  Files are grouped first by real path
    and then by their content as returned by readMailFile().

    Returns a (duplicates, suspected) tuple.  duplicates is the number
    of redundant file names found: copies with identical content plus
    extra names resolving to the same real file.  suspected is the
    number of files with distinct content beyond the first one.

    NOTE: nothing is deleted; the os.remove() calls are left commented
    out, so this function only reports what it finds.
    """
    # get_filenames() yields an iterator; take a list so that it can be
    # measured and then still be iterated.
    filenames = list(msg.get_filenames())
    if len(filenames) <= 1:
        return (0, 0)

    realpaths = collections.defaultdict(list)   # real path -> names
    contents = collections.defaultdict(list)    # content -> real paths
    for filename in filenames:
        real = os.path.realpath(filename)
        names = realpaths[real]
        names.append(filename)
        # Read each real file only once, however many names point at it.
        if len(names) == 1:
            contents[readMailFile(real)].append(real)

    duplicates = 0
    for same_content in contents.values():
        if len(same_content) > 1:
            print('Files with the same content:')
            # The popped file is the copy that would be kept.
            print('  %s' % same_content.pop())
            duplicates += len(same_content)
            for real in same_content:
                del realpaths[real]
                # os.remove(real)

    for names in realpaths.values():
        if len(names) > 1:
            print('Files pointing to the same message:')
            print('  %s' % names.pop())
            duplicates += len(names)
            # for name in names:
            #     os.remove(name)

    # One entry per distinct content is left in realpaths at this point.
    return (duplicates, len(realpaths) - 1)
def dedupQuery(query):
    """Run the duplicate check over every message matched by query.

    query must provide count_messages() and search_messages().  Each
    message is handed to dedupMessage() and the counts it returns are
    accumulated.  Progress is written to standard output at most once
    per second, followed by a summary.  Returns None.
    """
    print('Number of messages: %s' % query.count_messages())
    msg_count = 0
    suspected_count = 0
    duplicates_count = 0
    timestamp = time.time()
    for msg in query.search_messages():
        msg_count += 1
        d, s = dedupMessage(msg)
        duplicates_count += d
        # Add the suspected count (s), not the duplicates count again.
        suspected_count += s
        new_timestamp = time.time()
        if new_timestamp - timestamp > 1:
            timestamp = new_timestamp
            # '\r' without a newline rewrites the same terminal line.
            sys.stdout.write('\rProcessed %s messages, removed %s duplicates...'
                             % (msg_count, duplicates_count))
            sys.stdout.flush()
    print('\rFinished. Processed %s messages, removed %s duplicates.' % (
        msg_count, duplicates_count))
    if duplicates_count > 0:
        print('You might want to run "notmuch new" now.')
    if suspected_count > 0:
        print("""
Found %d messages with duplicate IDs but different content.
Perhaps we should ignore more headers.""" % suspected_count)
def main(argv):
    """Entry point of the script.

    argv is the full argument vector including the program name.  With
    no extra argument every message ('*') is examined; with exactly one,
    that argument is used as the notmuch query string.  Any other number
    of arguments prints the usage message.

    Returns the process exit status: 0 on success, 1 on a usage error.
    """
    if len(argv) == 1:
        query_string = '*'
    elif len(argv) == 2:
        query_string = argv[1]
    else:
        usage(argv[0])
        return 1
    # db stays referenced for as long as the query created from it is used.
    db = notmuch.Database()
    # dedupQuery() takes the query object only.
    dedupQuery(db.create_query(query_string))
    return 0


# Run only when executed as a script, not when imported as a module.
if __name__ == '__main__':
    sys.exit(main(sys.argv))
--
Best regards, _ _
.o. | Liege of Serenely Enlightened Majesty of o' \,=./ `o
..o | Computer Science, Michał “mina86” Nazarewicz (o o)
ooo +----<email/xmpp: mpn at google.com>--------------ooO--(_)--Ooo--
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 835 bytes
Desc: not available
URL: <http://notmuchmail.org/pipermail/notmuch/attachments/20120904/dca6723b/attachment.pgp>
More information about the notmuch
mailing list