[PATCH] Add notmuch-remove-duplicates.py script to contrib.

Tue Sep 4 12:43:53 PDT 2012

On Tue, Sep 04 2012, Dmitry Kurochkin wrote:
> The script removes duplicate message files.  It takes no options.
>
> Files are assumed duplicates if their content is the same except for
> ignored headers.  Currently, the only ignored header is Received:.
> ---
>  contrib/notmuch-remove-duplicates.py |   95 ++++++++++++++++++++++++++++++++++
>  1 file changed, 95 insertions(+)
>  create mode 100755 contrib/notmuch-remove-duplicates.py
>
> diff --git a/contrib/notmuch-remove-duplicates.py b/contrib/notmuch-remove-duplicates.py
> new file mode 100755
> index 0000000..dbe2e25
> --- /dev/null
> +++ b/contrib/notmuch-remove-duplicates.py
> @@ -0,0 +1,95 @@
> +#!/usr/bin/env python
> +
> +import sys
> +
> +IGNORED_HEADERS = [ "Received:" ]
> +
> +if len(sys.argv) != 1:
> +    print "Usage: %s" % sys.argv[0]
> +    print
> +    print "The script removes duplicate message files.  Takes no options."
> +    print "Requires notmuch python module."
> +    print
> +    print "Files are assumed duplicates if their content is the same"
> +    print "except for the following headers: %s." % ", ".join(IGNORED_HEADERS)
> +    exit(1)

This would be much better placed inside a main() function, which is then
called only if the script is run directly.

> +
> +import notmuch
> +import os
> +import time
> +
> +class MailComparator:
> +    """Checks if mail files are duplicates."""
> +    def __init__(self, filename):
> +        self.filename = filename
> +        self.mail = self.readFile(self.filename)
> +
> +    def isDuplicate(self, filename):
> +        return self.mail == self.readFile(filename)
> +
> +    @staticmethod
> +    def readFile(filename):
> +        with open(filename) as f:
> +            data = ""
> +            while True:
> +                line = f.readline()
> +                for header in IGNORED_HEADERS:
> +                    if line.startswith(header):

Header names are case-insensitive, but this comparison is case-sensitive.

> +                        # skip header continuation lines
> +                        while True:
> +                            line = f.readline()
> +                            if len(line) == 0 or line[0] not in [" ", "\t"]:
> +                                break
> +                        break

This will also drop the line just after the ignored header: that line has
already been read when the inner loop ends, but it is never added to data.

> +                else:
> +                    data += line
> +                    if line == "\n":
> +                        break
> +            data += f.read()
> +            return data
> +
> +db = notmuch.Database()
> +query = db.create_query('*')
> +print "Number of messages: %s" % query.count_messages()
> +
> +files_count = 0
> +for root, dirs, files in os.walk(db.get_path()):
> +    if not root.startswith(os.path.join(db.get_path(), ".notmuch/")):
> +        files_count += len(files)
> +print "Number of files: %s" % files_count
> +print "Estimated number of duplicates: %s" % (files_count - query.count_messages())
> +
> +msgs = query.search_messages()
> +msg_count = 0
> +suspected_duplicates_count = 0
> +duplicates_count = 0
> +timestamp = time.time()
> +for msg in msgs:
> +    msg_count += 1
> +    if len(msg.get_filenames()) > 1:
> +        filenames = msg.get_filenames()
> +        comparator = MailComparator(filenames.next())
> +        for filename in filenames:

Strictly speaking, you need to compare each file to each file, and not
just every file to the first file.

> +            if os.path.realpath(comparator.filename) == os.path.realpath(filename):
> +                print "Message '%s' has filenames pointing to the
> same file: '%s' '%s'" % (msg.get_message_id(), comparator.filename,
> filename)

So why aren't those removed?

> +            elif comparator.isDuplicate(filename):
> +                os.remove(filename)
> +                duplicates_count += 1
> +            else:
> +                #print "Potential duplicates: %s" % msg.get_message_id()
> +                suspected_duplicates_count += 1
> +
> +    new_timestamp = time.time()
> +    if new_timestamp - timestamp > 1:
> +        timestamp = new_timestamp
> +        sys.stdout.write("\rProcessed %s messages, removed %s duplicates..." % (msg_count, duplicates_count))
> +        sys.stdout.flush()
> +
> +print "\rFinished. Processed %s messages, removed %s duplicates." % (msg_count, duplicates_count)
> +if duplicates_count > 0:
> +    print "You might want to run 'notmuch new' now."
> +
> +if suspected_duplicates_count > 0:
> +    print
> +    print "Found %s messages with duplicate IDs but different content." % suspected_duplicates_count
> +    print "Perhaps we should ignore more headers."

Please consider the following instead (not tested):

#!/usr/bin/env python

import collections
import notmuch
import os
import re
import sys
import time

# Names (without the trailing colon) of the headers that are left out when
# the contents of two message files are compared.
IGNORED_HEADERS = ['Received']

# Matches a header line whose field name is one of IGNORED_HEADERS.  Header
# field names are case-insensitive, hence re.IGNORECASE.  The names are
# escaped so that one containing a regex metacharacter is matched literally.
isIgnoredHeadersLine = re.compile(
    r'^(?:%s)\s*:' % '|'.join(re.escape(name) for name in IGNORED_HEADERS),
    re.IGNORECASE).search

# Matches a header continuation (folded) line, i.e. one that starts with a
# space or a tab.  This is deliberately not r'^\s': that pattern also matches
# the blank line ('\n') separating the headers from the body, which would
# then be mistaken for a continuation line.
doesStartWithWS = re.compile(r'^[ \t]').search

def usage(argv0):
    """Print the command-line help text to standard output.

    argv0 -- program name shown in the "Usage:" line (normally sys.argv[0]).
    """
    # The help text is one triple-quoted literal, so its lines must not carry
    # closing '"' characters of their own: they would be printed verbatim.
    print("""Usage: %s [<query-string>]

The script removes duplicate message files.  Takes no options.
Requires notmuch python module.

Files are assumed duplicates if their content is the same
except for the following headers: %s.""" % (argv0, ', '.join(IGNORED_HEADERS)))

def readMailFile(filename):
    """Return the content of a message file with the ignored headers removed.

    The header section is read line by line.  Every header recognised by
    isIgnoredHeadersLine() is dropped together with its continuation lines;
    all other header lines are kept.  Everything from the blank separator
    line onwards (the body) is returned unchanged.

    filename -- path of the message file to read.

    Returns the filtered content as a single string.
    """
    with open(filename) as fd:
        data = []
        skip_header = False
        # readline() is used instead of iterating over fd because the body is
        # fetched with fd.read() afterwards: Python 2 file objects refuse to
        # mix iteration (which uses a read-ahead buffer) with read().
        for line in iter(fd.readline, ''):
            # Test for the end of the headers first; a bare newline must
            # never be treated as a continuation line.
            if line in ('\n', '\r\n'):
                data.append(line)
                break
            if doesStartWithWS(line):
                # Continuation line: it shares the fate of the header it
                # belongs to.
                if not skip_header:
                    data.append(line)
            elif isIgnoredHeadersLine(line):
                skip_header = True
            else:
                # A new, non-ignored header starts here, so its continuation
                # lines must be kept again.
                skip_header = False
                data.append(line)
        data.append(fd.read())
        return ''.join(data)

def dedupMessage(msg):
    """Look for duplicate files among the files of a single message.

    Names that resolve to the same real path are grouped together, and each
    distinct real file is read once with readMailFile() so that files with
    equal filtered content can be grouped as well.  For every group with
    more than one member a short report naming one member is printed.

    Nothing is deleted: the os.remove() calls are still commented out.

    msg -- message object providing get_filenames().

    Returns a (duplicates, suspected) tuple.  duplicates counts the
    redundant entries: extra real files with the same content plus extra
    names resolving to the same file.  suspected is the number of distinct
    contents found beyond the first one.
    """
    # get_filenames() hands out a one-shot iterator, so it is materialised
    # before its length is taken; calling len() on the iterator itself and
    # then looping over it would leave nothing to loop over.
    filenames = list(msg.get_filenames())
    if len(filenames) <= 1:
        return (0, 0)

    realpaths = collections.defaultdict(list)  # real path -> names seen
    contents = collections.defaultdict(list)   # filtered content -> real paths
    for filename in filenames:
        real = os.path.realpath(filename)
        names = realpaths[real]
        names.append(filename)
        # Read each real file only once, however many names point at it.
        if len(names) == 1:
            contents[readMailFile(real)].append(real)

    duplicates = 0

    for same_content in contents.values():
        if len(same_content) > 1:
            print('Files with the same content:')
            print('  %s' % same_content.pop())
            duplicates += len(same_content)
            for real in same_content:
                # Forget the redundant copy so that realpaths ends up with
                # exactly one entry per distinct content.
                del realpaths[real]
            #     os.remove(real)

    for real, names in realpaths.items():
        if len(names) > 1:
            print('Files pointing to the same message:')
            print('  %s' % names.pop())
            duplicates += len(names)
            # for filename in names:
            #     os.remove(filename)

    return (duplicates, len(realpaths) - 1)

def dedupQuery(query):
    """Run dedupMessage() over every message matched by a query and report.

    Prints the number of matched messages, a progress line that is refreshed
    at most once per second while the messages are processed, and a final
    summary.

    query -- query object providing count_messages() and search_messages().
    """
    print('Number of messages: %s' % query.count_messages())
    msg_count = 0
    suspected_count = 0
    duplicates_count = 0
    timestamp = time.time()
    for msg in query.search_messages():
        msg_count += 1
        duplicates, suspected = dedupMessage(msg)
        duplicates_count += duplicates
        # Accumulate the second element of the result here, not the
        # duplicates count a second time.
        suspected_count += suspected

        new_timestamp = time.time()
        if new_timestamp - timestamp > 1:
            timestamp = new_timestamp
            # '\r' returns to the start of the line so that the progress
            # message overwrites the previous one.
            sys.stdout.write('\rProcessed %s messages, removed %s duplicates...'
                             % (msg_count, duplicates_count))
            sys.stdout.flush()

    print('\rFinished. Processed %s messages, removed %s duplicates.' % (
        msg_count, duplicates_count))
    if duplicates_count > 0:
        print('You might want to run "notmuch new" now.')

    if suspected_count > 0:
        print("""
Found %d messages with duplicate IDs but different content.
Perhaps we should ignore more headers.""" % suspected_count)

def main(argv):
    """Run the duplicate search for the query given on the command line.

    argv -- full argument vector, program name first.  With no further
            argument every message ('*') is examined; with one further
            argument that argument is used as the query string.

    Returns the exit status: 0 after a run, 1 when the usage text was
    printed because more than one argument was given.
    """
    if len(argv) == 1:
        query_string = '*'
    elif len(argv) == 2:
        query_string = argv[1]
    else:
        usage(argv[0])
        return 1

    db = notmuch.Database()
    query = db.create_query(query_string)
    # dedupQuery() accepts the query only; also passing db raised TypeError.
    dedupQuery(query)
    return 0

# Run main() only when the file is executed as a script; its return value
# becomes the exit status of the process.
if __name__ == '__main__':
    exit_status = main(sys.argv)
    sys.exit(exit_status)

-- 
Best regards,                                         _     _
.o. | Liege of Serenely Enlightened Majesty of      o' \,=./ `o
..o | Computer Science,  Michał “mina86” Nazarewicz    (o o)
ooo +----<email/xmpp: mpn at google.com>--------------ooO--(_)--Ooo--
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 835 bytes
Desc: not available
URL: <http://notmuchmail.org/pipermail/notmuch/attachments/20120904/dca6723b/attachment.pgp>