1 Return-Path: <dmitry.kurochkin@gmail.com>
\r
2 X-Original-To: notmuch@notmuchmail.org
\r
3 Delivered-To: notmuch@notmuchmail.org
\r
4 Received: from localhost (localhost [127.0.0.1])
\r
5 by olra.theworths.org (Postfix) with ESMTP id 6E5B8431FB6
\r
6 for <notmuch@notmuchmail.org>; Tue, 4 Sep 2012 13:12:45 -0700 (PDT)
\r
7 X-Virus-Scanned: Debian amavisd-new at olra.theworths.org
\r
11 X-Spam-Status: No, score=-0.799 tagged_above=-999 required=5
\r
12 tests=[DKIM_SIGNED=0.1, DKIM_VALID=-0.1, DKIM_VALID_AU=-0.1,
\r
13 FREEMAIL_FROM=0.001, RCVD_IN_DNSWL_LOW=-0.7] autolearn=disabled
\r
14 Received: from olra.theworths.org ([127.0.0.1])
\r
15 by localhost (olra.theworths.org [127.0.0.1]) (amavisd-new, port 10024)
\r
16 with ESMTP id MuUKKbfYHxTC for <notmuch@notmuchmail.org>;
\r
17 Tue, 4 Sep 2012 13:12:44 -0700 (PDT)
\r
18 Received: from mail-ey0-f181.google.com (mail-ey0-f181.google.com
\r
19 [209.85.215.181]) (using TLSv1 with cipher RC4-SHA (128/128 bits))
\r
20 (No client certificate requested)
\r
21 by olra.theworths.org (Postfix) with ESMTPS id 28AD1431FAF
\r
22 for <notmuch@notmuchmail.org>; Tue, 4 Sep 2012 13:12:44 -0700 (PDT)
\r
23 Received: by eaan10 with SMTP id n10so2473248eaa.26
\r
24 for <notmuch@notmuchmail.org>; Tue, 04 Sep 2012 13:12:42 -0700 (PDT)
\r
25 DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gmail.com; s=20120113;
\r
26 h=from:to:subject:in-reply-to:references:user-agent:date:message-id
\r
27 :mime-version:content-type:content-transfer-encoding;
\r
28 bh=/0V4lbozO3w2BdidaTB3mWL6HxCXqSb8xE6rnngN3Q0=;
\r
29 b=seUZ7gDgf8JS4Rv2bF3TOI/qaxc3yH5sX7Npn+QtNkQO3LjAIBMSpX32iRDz3kyLUG
\r
30 +7DqAflJe5Tl49XYXjCaD2HO1jSRZV1gqAUpdQIBZ4OEdjORVuGeG5plgQhAXrelQgIw
\r
31 D32eUB8NqR7jJdCz2YBcp5TK31fGx/z2aWkQGCkF9Miry0l+D5zt2sS7V3yNVSwvv0it
\r
32 8seg7YW2pco+PoUwHIjZI1bAsu+IuHxaiqfRePOLrbF+PWw/YJTQIj1UX3hx7VyhC21M
\r
33 G7ILw1MB9FviVkgPTVjFsfBxGpSn1naShaIO5fvTW3SSS23Lqq1QKbQHoB54j3/D4RGS
\r
35 Received: by 10.14.224.4 with SMTP id w4mr27959330eep.21.1346789562767;
\r
36 Tue, 04 Sep 2012 13:12:42 -0700 (PDT)
\r
37 Received: from localhost ([2001:470:1f0b:14dd:224:d7ff:fee2:c588])
\r
38 by mx.google.com with ESMTPS id u8sm48089016eel.11.2012.09.04.13.12.41
\r
39 (version=TLSv1/SSLv3 cipher=OTHER);
\r
40 Tue, 04 Sep 2012 13:12:41 -0700 (PDT)
\r
41 From: Dmitry Kurochkin <dmitry.kurochkin@gmail.com>
\r
42 To: Michal Nazarewicz <mina86@mina86.com>, notmuch@notmuchmail.org
\r
43 Subject: Re: [PATCH] Add notmuch-remove-duplicates.py script to contrib.
\r
44 In-Reply-To: <xa1tligpk1za.fsf@mina86.com>
\r
45 References: <1346784785-19746-1-git-send-email-dmitry.kurochkin@gmail.com>
\r
46 <xa1tligpk1za.fsf@mina86.com>
\r
47 User-Agent: Notmuch/0.14+18~g79a73cd (http://notmuchmail.org) Emacs/23.4.1
\r
48 (x86_64-pc-linux-gnu)
\r
49 Date: Wed, 05 Sep 2012 00:12:39 +0400
\r
50 Message-ID: <87d321sg20.fsf@gmail.com>
\r
52 Content-Type: text/plain; charset=utf-8
\r
53 Content-Transfer-Encoding: quoted-printable
\r
54 X-BeenThere: notmuch@notmuchmail.org
\r
55 X-Mailman-Version: 2.1.13
\r
57 List-Id: "Use and development of the notmuch mail system."
\r
58 <notmuch.notmuchmail.org>
\r
59 List-Unsubscribe: <http://notmuchmail.org/mailman/options/notmuch>,
\r
60 <mailto:notmuch-request@notmuchmail.org?subject=unsubscribe>
\r
61 List-Archive: <http://notmuchmail.org/pipermail/notmuch>
\r
62 List-Post: <mailto:notmuch@notmuchmail.org>
\r
63 List-Help: <mailto:notmuch-request@notmuchmail.org?subject=help>
\r
64 List-Subscribe: <http://notmuchmail.org/mailman/listinfo/notmuch>,
\r
65 <mailto:notmuch-request@notmuchmail.org?subject=subscribe>
\r
66 X-List-Received-Date: Tue, 04 Sep 2012 20:12:45 -0000
\r
70 Michal Nazarewicz <mina86@mina86.com> writes:
\r
72 > On Tue, Sep 04 2012, Dmitry Kurochkin wrote:
\r
73 >> The script removes duplicate message files. It takes no options.
\r
75 >> Files are assumed duplicates if their content is the same except for
\r
76 >> ignored headers. Currently, the only ignored header is Received:.
\r
78 >> contrib/notmuch-remove-duplicates.py | 95 +++++++++++++++++++++++++++=
\r
80 >> 1 file changed, 95 insertions(+)
\r
81 >> create mode 100755 contrib/notmuch-remove-duplicates.py
\r
83 >> diff --git a/contrib/notmuch-remove-duplicates.py b/contrib/notmuch-remo=
\r
85 >> new file mode 100755
\r
86 >> index 0000000..dbe2e25
\r
88 >> +++ b/contrib/notmuch-remove-duplicates.py
\r
90 >> +#!/usr/bin/env python
\r
94 >> +IGNORED_HEADERS =3D [ "Received:" ]
\r
96 >> +if len(sys.argv) !=3D 1:
\r
97 >> + print "Usage: %s" % sys.argv[0]
\r
99 >> + print "The script removes duplicate message files. Takes no option=
\r
101 >> + print "Requires notmuch python module."
\r
103 >> + print "Files are assumed duplicates if their content is the same"
\r
104 >> + print "except for the following headers: %s." % ", ".join(IGNORED_H=
\r
108 > It's much better put inside a main() function, which is then called only
\r
109 > if the script is run directly.
\r
112 Good point. My Python skill is pretty low :)
\r
119 >> +class MailComparator:
\r
120 >> + """Checks if mail files are duplicates."""
\r
121 >> + def __init__(self, filename):
\r
122 >> + self.filename =3D filename
\r
123 >> + self.mail =3D self.readFile(self.filename)
\r
125 >> + def isDuplicate(self, filename):
\r
126 >> + return self.mail =3D=3D self.readFile(filename)
\r
129 >> + def readFile(filename):
\r
130 >> + with open(filename) as f:
\r
133 >> + line =3D f.readline()
\r
134 >> + for header in IGNORED_HEADERS:
\r
135 >> + if line.startswith(header):
\r
137 > Case of headers should be ignored, but this does not ignore it.
\r
142 >> + # skip header continuation lines
\r
144 >> + line =3D f.readline()
\r
145 >> + if len(line) =3D=3D 0 or line[0] not in [" =
\r
150 > This will ignore the line just after the ignored header.
\r
153 The first header line is ignored as well because line is added to data
\r
157 >> + data +=3D line
\r
158 >> + if line =3D=3D "\n":
\r
160 >> + data +=3D f.read()
\r
163 >> +db =3D notmuch.Database()
\r
164 >> +query =3D db.create_query('*')
\r
165 >> +print "Number of messages: %s" % query.count_messages()
\r
167 >> +files_count =3D 0
\r
168 >> +for root, dirs, files in os.walk(db.get_path()):
\r
169 >> + if not root.startswith(os.path.join(db.get_path(), ".notmuch/")):
\r
170 >> + files_count +=3D len(files)
\r
171 >> +print "Number of files: %s" % files_count
\r
172 >> +print "Estimated number of duplicates: %s" % (files_count - query.count=
\r
175 >> +msgs =3D query.search_messages()
\r
176 >> +msg_count =3D 0
\r
177 >> +suspected_duplicates_count =3D 0
\r
178 >> +duplicates_count =3D 0
\r
179 >> +timestamp =3D time.time()
\r
180 >> +for msg in msgs:
\r
181 >> + msg_count +=3D 1
\r
182 >> + if len(msg.get_filenames()) > 1:
\r
183 >> + filenames =3D msg.get_filenames()
\r
184 >> + comparator =3D MailComparator(filenames.next())
\r
185 >> + for filename in filenames:
\r
187 > Strictly speaking, you need to compare each file to each file, and not
\r
188 > just every file to the first file.
\r
190 >> + if os.path.realpath(comparator.filename) =3D=3D os.path.rea=
\r
192 >> + print "Message '%s' has filenames pointing to the
\r
193 >> same file: '%s' '%s'" % (msg.get_message_id(), comparator.filename,
\r
196 > So why aren't those removed?
\r
199 Because it is the same file indexed twice (probably because of
\r
200 symlinks). We do not want to remove the only message file.
\r
202 >> + elif comparator.isDuplicate(filename):
\r
203 >> + os.remove(filename)
\r
204 >> + duplicates_count +=3D 1
\r
206 >> + #print "Potential duplicates: %s" % msg.get_message_id()
\r
207 >> + suspected_duplicates_count +=3D 1
\r
209 >> + new_timestamp =3D time.time()
\r
210 >> + if new_timestamp - timestamp > 1:
\r
211 >> + timestamp =3D new_timestamp
\r
212 >> + sys.stdout.write("\rProcessed %s messages, removed %s duplicate=
\r
213 s..." % (msg_count, duplicates_count))
\r
214 >> + sys.stdout.flush()
\r
216 >> +print "\rFinished. Processed %s messages, removed %s duplicates." % (ms=
\r
217 g_count, duplicates_count)
\r
218 >> +if duplicates_count > 0:
\r
219 >> + print "You might want to run 'notmuch new' now."
\r
221 >> +if suspected_duplicates_count > 0:
\r
223 >> + print "Found %s messages with duplicate IDs but different content."=
\r
224 % suspected_duplicates_count
\r
225 >> + print "Perhaps we should ignore more headers."
\r
227 > Please consider the following instead (not tested):
\r
230 Thanks for reviewing my poor Python code :) I am afraid I do not have
\r
231 enough interest in improving it. I just implemented a simple solution
\r
232 for my problem. Though it looks like you already took time to rewrite
\r
233 the script. Would be great if you send it as a proper patch obsoleting
\r
240 > #!/usr/bin/env python
\r
242 > import collections
\r
250 > IGNORED_HEADERS =3D [ 'Received' ]
\r
253 > isIgnoredHeadersLine =3D re.compile(
\r
254 > r'^(?:%s)\s*:' % '|'.join(IGNORED_HEADERS),
\r
255 > re.IGNORECASE).search
\r
257 > doesStartWithWS =3D re.compile(r'^\s').search
\r
260 > def usage(argv0):
\r
261 > print """Usage: %s [<query-string>]
\r
263 > The script removes duplicate message files. Takes no options."
\r
264 > Requires notmuch python module."
\r
266 > Files are assumed duplicates if their content is the same"
\r
267 > except for the following headers: %s.""" % (argv0, ', '.join(IGNORED_HEAD=
\r
271 > def readMailFile(filename):
\r
272 > with open(filename) as fd:
\r
274 > skip_header =3D False
\r
276 > if doesStartWithWS(line):
\r
277 > if not skip_header:
\r
278 > data.append(line)
\r
279 > elif isIgnoredHeadersLine(line):
\r
280 > skip_header =3D True
\r
282 > data.append(line)
\r
283 > if line =3D=3D '\n':
\r
285 > data.append(fd.read())
\r
286 > return ''.join(data)
\r
289 > def dedupMessage(msg):
\r
290 > filenames =3D msg.get_filenames()
\r
291 > if len(filenames) <=3D 1:
\r
294 > realpaths =3D collections.defaultdict(list)
\r
295 > contents =3D collections.defaultdict(list)
\r
296 > for filename in filenames:
\r
297 > real =3D os.path.realpath(filename)
\r
298 > lst =3D realpaths[real]
\r
299 > lst.append(filename)
\r
300 > if len(lst) =3D=3D 1:
\r
301 > contents[readMailFile(real)].append(real)
\r
305 > for filenames in contents.itervalues():
\r
306 > if len(filenames) > 1:
\r
307 > print 'Files with the same content:'
\r
308 > print ' ', filenames.pop()
\r
309 > duplicates +=3D len(filenames)
\r
310 > for filename in filenames:
\r
311 > del realpaths[filename]
\r
312 > # os.remane(filename)
\r
314 > for real, filenames in realpaths.iteritems():
\r
315 > if len(filenames) > 1:
\r
316 > print 'Files pointing to the same message:'
\r
317 > print ' ', filenames.pop()
\r
318 > duplicates +=3D len(filenames)
\r
319 > # for filename in filenames:
\r
320 > # os.remane(filename)
\r
322 > return (duplicates, len(realpaths) - 1)
\r
325 > def dedupQuery(query):
\r
326 > print 'Number of messages: %s' % query.count_messages()
\r
328 > suspected_count =3D 0
\r
329 > duplicates_count =3D 0
\r
330 > timestamp =3D time.time()
\r
331 > msgs =3D query.search_messages()
\r
334 > d, s =3D dedupMessage(msg)
\r
335 > duplicates_count +=3D d
\r
336 > suspected_count +=3D d
\r
338 > new_timestamp =3D time.time()
\r
339 > if new_timestamp - timestamp > 1:
\r
340 > timestamp =3D new_timestamp
\r
341 > sys.stdout.write('\rProcessed %s messages, removed %s duplica=
\r
343 > % (msg_count, duplicates_count))
\r
344 > sys.stdout.flush()
\r
346 > print '\rFinished. Processed %s messages, removed %s duplicates.' % (
\r
347 > msg_count, duplicates_count)
\r
348 > if duplicates_count > 0:
\r
349 > print 'You might want to run "notmuch new" now.'
\r
351 > if suspected_duplicates_count > 0:
\r
353 > Found %d messages with duplicate IDs but different content.
\r
354 > Perhaps we should ignore more headers.""" % suspected_count
\r
358 > if len(argv) =3D=3D 1:
\r
360 > elif len(argv) =3D=3D 2:
\r
361 > query =3D argv[1]
\r
366 > db =3D notmuch.Database()
\r
367 > query =3D db.create_query(query)
\r
368 > dedupQuery(db, query)
\r
372 > if __name__ =3D=3D '__main__':
\r
373 > sys.exit(main(sys.argv))
\r
378 > Best regards, _ _
\r
379 > .o. | Liege of Serenely Enlightened Majesty of o' \,=3D./ `o
\r
380 > ..o | Computer Science, Micha=C5=82 =E2=80=9Cmina86=E2=80=9D Nazarewicz =
\r
382 > ooo +----<email/xmpp: mpn@google.com>--------------ooO--(_)--Ooo--
\r