Return-Path: X-Original-To: notmuch@notmuchmail.org Delivered-To: notmuch@notmuchmail.org Received: from localhost (localhost [127.0.0.1]) by olra.theworths.org (Postfix) with ESMTP id 6E5B8431FB6 for ; Tue, 4 Sep 2012 13:12:45 -0700 (PDT) X-Virus-Scanned: Debian amavisd-new at olra.theworths.org X-Spam-Flag: NO X-Spam-Score: -0.799 X-Spam-Level: X-Spam-Status: No, score=-0.799 tagged_above=-999 required=5 tests=[DKIM_SIGNED=0.1, DKIM_VALID=-0.1, DKIM_VALID_AU=-0.1, FREEMAIL_FROM=0.001, RCVD_IN_DNSWL_LOW=-0.7] autolearn=disabled Received: from olra.theworths.org ([127.0.0.1]) by localhost (olra.theworths.org [127.0.0.1]) (amavisd-new, port 10024) with ESMTP id MuUKKbfYHxTC for ; Tue, 4 Sep 2012 13:12:44 -0700 (PDT) Received: from mail-ey0-f181.google.com (mail-ey0-f181.google.com [209.85.215.181]) (using TLSv1 with cipher RC4-SHA (128/128 bits)) (No client certificate requested) by olra.theworths.org (Postfix) with ESMTPS id 28AD1431FAF for ; Tue, 4 Sep 2012 13:12:44 -0700 (PDT) Received: by eaan10 with SMTP id n10so2473248eaa.26 for ; Tue, 04 Sep 2012 13:12:42 -0700 (PDT) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gmail.com; s=20120113; h=from:to:subject:in-reply-to:references:user-agent:date:message-id :mime-version:content-type:content-transfer-encoding; bh=/0V4lbozO3w2BdidaTB3mWL6HxCXqSb8xE6rnngN3Q0=; b=seUZ7gDgf8JS4Rv2bF3TOI/qaxc3yH5sX7Npn+QtNkQO3LjAIBMSpX32iRDz3kyLUG +7DqAflJe5Tl49XYXjCaD2HO1jSRZV1gqAUpdQIBZ4OEdjORVuGeG5plgQhAXrelQgIw D32eUB8NqR7jJdCz2YBcp5TK31fGx/z2aWkQGCkF9Miry0l+D5zt2sS7V3yNVSwvv0it 8seg7YW2pco+PoUwHIjZI1bAsu+IuHxaiqfRePOLrbF+PWw/YJTQIj1UX3hx7VyhC21M G7ILw1MB9FviVkgPTVjFsfBxGpSn1naShaIO5fvTW3SSS23Lqq1QKbQHoB54j3/D4RGS 3GJw== Received: by 10.14.224.4 with SMTP id w4mr27959330eep.21.1346789562767; Tue, 04 Sep 2012 13:12:42 -0700 (PDT) Received: from localhost ([2001:470:1f0b:14dd:224:d7ff:fee2:c588]) by mx.google.com with ESMTPS id u8sm48089016eel.11.2012.09.04.13.12.41 (version=TLSv1/SSLv3 cipher=OTHER); Tue, 04 Sep 2012 
13:12:41 -0700 (PDT) From: Dmitry Kurochkin To: Michal Nazarewicz , notmuch@notmuchmail.org Subject: Re: [PATCH] Add notmuch-remove-duplicates.py script to contrib. In-Reply-To: References: <1346784785-19746-1-git-send-email-dmitry.kurochkin@gmail.com> User-Agent: Notmuch/0.14+18~g79a73cd (http://notmuchmail.org) Emacs/23.4.1 (x86_64-pc-linux-gnu) Date: Wed, 05 Sep 2012 00:12:39 +0400 Message-ID: <87d321sg20.fsf@gmail.com> MIME-Version: 1.0 Content-Type: text/plain; charset=utf-8 Content-Transfer-Encoding: quoted-printable X-BeenThere: notmuch@notmuchmail.org X-Mailman-Version: 2.1.13 Precedence: list List-Id: "Use and development of the notmuch mail system." List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Tue, 04 Sep 2012 20:12:45 -0000 Hi Michal. Michal Nazarewicz writes: > On Tue, Sep 04 2012, Dmitry Kurochkin wrote: >> The script removes duplicate message files. It takes no options. >> >> Files are assumed duplicates if their content is the same except for >> ignored headers. Currently, the only ignored header is Received:. >> --- >> contrib/notmuch-remove-duplicates.py | 95 +++++++++++++++++++++++++++= +++++++ >> 1 file changed, 95 insertions(+) >> create mode 100755 contrib/notmuch-remove-duplicates.py >> >> diff --git a/contrib/notmuch-remove-duplicates.py b/contrib/notmuch-remo= ve-duplicates.py >> new file mode 100755 >> index 0000000..dbe2e25 >> --- /dev/null >> +++ b/contrib/notmuch-remove-duplicates.py >> @@ -0,0 +1,95 @@ >> +#!/usr/bin/env python >> + >> +import sys >> + >> +IGNORED_HEADERS =3D [ "Received:" ] >> + >> +if len(sys.argv) !=3D 1: >> + print "Usage: %s" % sys.argv[0] >> + print >> + print "The script removes duplicate message files. Takes no option= s." >> + print "Requires notmuch python module." >> + print >> + print "Files are assumed duplicates if their content is the same" >> + print "except for the following headers: %s." 
% ", ".join(IGNORED_H= EADERS) >> + exit(1) > > It's much better put inside a main() function, which is then called only > if the script is run directly. > Good point. My python skill is pretty low :) >> + >> +import notmuch >> +import os >> +import time >> + >> +class MailComparator: >> + """Checks if mail files are duplicates.""" >> + def __init__(self, filename): >> + self.filename =3D filename >> + self.mail =3D self.readFile(self.filename) >> + >> + def isDuplicate(self, filename): >> + return self.mail =3D=3D self.readFile(filename) >> + >> + @staticmethod >> + def readFile(filename): >> + with open(filename) as f: >> + data =3D "" >> + while True: >> + line =3D f.readline() >> + for header in IGNORED_HEADERS: >> + if line.startswith(header): > > Case of headers should be ignored, but this does not ignore it. > It does. >> + # skip header continuation lines >> + while True: >> + line =3D f.readline() >> + if len(line) =3D=3D 0 or line[0] not in [" = ", "\t"]: >> + break >> + break > > This will ignore the line just after the ignored header. > The first header line is ignored as well because the line is added to data in the else block. 
>> + else: >> + data +=3D line >> + if line =3D=3D "\n": >> + break >> + data +=3D f.read() >> + return data >> + >> +db =3D notmuch.Database() >> +query =3D db.create_query('*') >> +print "Number of messages: %s" % query.count_messages() >> + >> +files_count =3D 0 >> +for root, dirs, files in os.walk(db.get_path()): >> + if not root.startswith(os.path.join(db.get_path(), ".notmuch/")): >> + files_count +=3D len(files) >> +print "Number of files: %s" % files_count >> +print "Estimated number of duplicates: %s" % (files_count - query.count= _messages()) >> + >> +msgs =3D query.search_messages() >> +msg_count =3D 0 >> +suspected_duplicates_count =3D 0 >> +duplicates_count =3D 0 >> +timestamp =3D time.time() >> +for msg in msgs: >> + msg_count +=3D 1 >> + if len(msg.get_filenames()) > 1: >> + filenames =3D msg.get_filenames() >> + comparator =3D MailComparator(filenames.next()) >> + for filename in filenames: > > Strictly speaking, you need to compare each file to each file, and not > just every file to the first file. > >> + if os.path.realpath(comparator.filename) =3D=3D os.path.rea= lpath(filename): >> + print "Message '%s' has filenames pointing to the >> same file: '%s' '%s'" % (msg.get_message_id(), comparator.filename, >> filename) > > So why aren't those removed? > Because it is the same file indexed twice (probably because of symlinks). We do not want to remove the only message file. >> + elif comparator.isDuplicate(filename): >> + os.remove(filename) >> + duplicates_count +=3D 1 >> + else: >> + #print "Potential duplicates: %s" % msg.get_message_id() >> + suspected_duplicates_count +=3D 1 >> + >> + new_timestamp =3D time.time() >> + if new_timestamp - timestamp > 1: >> + timestamp =3D new_timestamp >> + sys.stdout.write("\rProcessed %s messages, removed %s duplicate= s..." % (msg_count, duplicates_count)) >> + sys.stdout.flush() >> + >> +print "\rFinished. Processed %s messages, removed %s duplicates." 
% (ms= g_count, duplicates_count) >> +if duplicates_count > 0: >> + print "You might want to run 'notmuch new' now." >> + >> +if suspected_duplicates_count > 0: >> + print >> + print "Found %s messages with duplicate IDs but different content."= % suspected_duplicates_count >> + print "Perhaps we should ignore more headers." > > Please consider the following instead (not tested): > Thanks for reviewing my poor python code :) I am afraid I do not have enough interest in improving it. I just implemented a simple solution for my problem. Though it looks like you already took time to rewrite the script. Would be great if you send it as a proper patch obsoleting this one. Regards, Dmitry > > #!/usr/bin/env python > > import collections > import notmuch > import os > import re > import sys > import time > > > IGNORED_HEADERS =3D [ 'Received' ] > > > isIgnoredHeadersLine =3D re.compile( > r'^(?:%s)\s*:' % '|'.join(IGNORED_HEADERS), > re.IGNORECASE).search > > doesStartWithWS =3D re.compile(r'^\s').search > > > def usage(argv0): > print """Usage: %s [] > > The script removes duplicate message files. Takes no options." > Requires notmuch python module." 
> > Files are assumed duplicates if their content is the same" > except for the following headers: %s.""" % (argv0, ', '.join(IGNORED_HEAD= ERS)) > > > def readMailFile(filename): > with open(filename) as fd: > data =3D [] > skip_header =3D False > for line in fd: > if doesStartWithWS(line): > if not skip_header: > data.append(line) > elif isIgnoredHeadersLine(line): > skip_header =3D True > else: > data.append(line) > if line =3D=3D '\n': > break > data.append(fd.read()) > return ''.join(data) > > > def dedupMessage(msg): > filenames =3D msg.get_filenames() > if len(filenames) <=3D 1: > return (0, 0) > > realpaths =3D collections.defaultdict(list) > contents =3D collections.defaultdict(list) > for filename in filenames: > real =3D os.path.realpath(filename) > lst =3D realpaths[real] > lst.append(filename) > if len(lst) =3D=3D 1: > contents[readMailFile(real)].append(real) > > duplicates =3D 0 > > for filenames in contents.itervalues(): > if len(filenames) > 1: > print 'Files with the same content:' > print ' ', filenames.pop() > duplicates +=3D len(filenames) > for filename in filenames: > del realpaths[filename] > # os.remane(filename) > > for real, filenames in realpaths.iteritems(): > if len(filenames) > 1: > print 'Files pointing to the same message:' > print ' ', filenames.pop() > duplicates +=3D len(filenames) > # for filename in filenames: > # os.remane(filename) > > return (duplicates, len(realpaths) - 1) > > > def dedupQuery(query): > print 'Number of messages: %s' % query.count_messages() > msg_count =3D 0 > suspected_count =3D 0 > duplicates_count =3D 0 > timestamp =3D time.time() > msgs =3D query.search_messages() > for msg in msgs: > msg_count +=3D 1 > d, s =3D dedupMessage(msg) > duplicates_count +=3D d > suspected_count +=3D d > > new_timestamp =3D time.time() > if new_timestamp - timestamp > 1: > timestamp =3D new_timestamp > sys.stdout.write('\rProcessed %s messages, removed %s duplica= tes...' 
> % (msg_count, duplicates_count)) > sys.stdout.flush() > > print '\rFinished. Processed %s messages, removed %s duplicates.' % ( > msg_count, duplicates_count) > if duplicates_count > 0: > print 'You might want to run "notmuch new" now.' > > if suspected_duplicates_count > 0: > print """ > Found %d messages with duplicate IDs but different content. > Perhaps we should ignore more headers.""" % suspected_count > > > def main(argv): > if len(argv) =3D=3D 1: > query =3D '*' > elif len(argv) =3D=3D 2: > query =3D argv[1] > else: > usage(argv[0]) > return 1 > > db =3D notmuch.Database() > query =3D db.create_query(query) > dedupQuery(db, query) > return 0 > > > if __name__ =3D=3D '__main__': > sys.exit(main(sys.argv)) > > > > --=20 > Best regards, _ _ > .o. | Liege of Serenely Enlightened Majesty of o' \,=3D./ `o > ..o | Computer Science, Micha=C5=82 =E2=80=9Cmina86=E2=80=9D Nazarewicz = (o o) > ooo +------------------ooO--(_)--Ooo--