3d/e9948133aef2d1ca93625ff15818ac11380ef2

   1 Return-Path: <dmitry.kurochkin@gmail.com>\r
   2 X-Original-To: notmuch@notmuchmail.org\r
   3 Delivered-To: notmuch@notmuchmail.org\r
   4 Received: from localhost (localhost [127.0.0.1])\r
   5         by olra.theworths.org (Postfix) with ESMTP id 6E5B8431FB6\r
   6         for <notmuch@notmuchmail.org>; Tue,  4 Sep 2012 13:12:45 -0700 (PDT)\r
   7 X-Virus-Scanned: Debian amavisd-new at olra.theworths.org\r
   8 X-Spam-Flag: NO\r
   9 X-Spam-Score: -0.799\r
  10 X-Spam-Level: \r
  11 X-Spam-Status: No, score=-0.799 tagged_above=-999 required=5\r
  12         tests=[DKIM_SIGNED=0.1, DKIM_VALID=-0.1, DKIM_VALID_AU=-0.1,\r
  13         FREEMAIL_FROM=0.001, RCVD_IN_DNSWL_LOW=-0.7] autolearn=disabled\r
  14 Received: from olra.theworths.org ([127.0.0.1])\r
  15         by localhost (olra.theworths.org [127.0.0.1]) (amavisd-new, port 10024)\r
  16         with ESMTP id MuUKKbfYHxTC for <notmuch@notmuchmail.org>;\r
  17         Tue,  4 Sep 2012 13:12:44 -0700 (PDT)\r
  18 Received: from mail-ey0-f181.google.com (mail-ey0-f181.google.com\r
  19         [209.85.215.181]) (using TLSv1 with cipher RC4-SHA (128/128 bits))\r
  20         (No client certificate requested)\r
  21         by olra.theworths.org (Postfix) with ESMTPS id 28AD1431FAF\r
  22         for <notmuch@notmuchmail.org>; Tue,  4 Sep 2012 13:12:44 -0700 (PDT)\r
  23 Received: by eaan10 with SMTP id n10so2473248eaa.26\r
  24         for <notmuch@notmuchmail.org>; Tue, 04 Sep 2012 13:12:42 -0700 (PDT)\r
  25 DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gmail.com; s=20120113;\r
  26         h=from:to:subject:in-reply-to:references:user-agent:date:message-id\r
  27         :mime-version:content-type:content-transfer-encoding;\r
  28         bh=/0V4lbozO3w2BdidaTB3mWL6HxCXqSb8xE6rnngN3Q0=;\r
  29         b=seUZ7gDgf8JS4Rv2bF3TOI/qaxc3yH5sX7Npn+QtNkQO3LjAIBMSpX32iRDz3kyLUG\r
  30         +7DqAflJe5Tl49XYXjCaD2HO1jSRZV1gqAUpdQIBZ4OEdjORVuGeG5plgQhAXrelQgIw\r
  31         D32eUB8NqR7jJdCz2YBcp5TK31fGx/z2aWkQGCkF9Miry0l+D5zt2sS7V3yNVSwvv0it\r
  32         8seg7YW2pco+PoUwHIjZI1bAsu+IuHxaiqfRePOLrbF+PWw/YJTQIj1UX3hx7VyhC21M\r
  33         G7ILw1MB9FviVkgPTVjFsfBxGpSn1naShaIO5fvTW3SSS23Lqq1QKbQHoB54j3/D4RGS\r
  34         3GJw==\r
  35 Received: by 10.14.224.4 with SMTP id w4mr27959330eep.21.1346789562767;\r
  36         Tue, 04 Sep 2012 13:12:42 -0700 (PDT)\r
  37 Received: from localhost ([2001:470:1f0b:14dd:224:d7ff:fee2:c588])\r
  38         by mx.google.com with ESMTPS id u8sm48089016eel.11.2012.09.04.13.12.41\r
  39         (version=TLSv1/SSLv3 cipher=OTHER);\r
  40         Tue, 04 Sep 2012 13:12:41 -0700 (PDT)\r
  41 From: Dmitry Kurochkin <dmitry.kurochkin@gmail.com>\r
  42 To: Michal Nazarewicz <mina86@mina86.com>, notmuch@notmuchmail.org\r
  43 Subject: Re: [PATCH] Add notmuch-remove-duplicates.py script to contrib.\r
  44 In-Reply-To: <xa1tligpk1za.fsf@mina86.com>\r
  45 References: <1346784785-19746-1-git-send-email-dmitry.kurochkin@gmail.com>\r
  46         <xa1tligpk1za.fsf@mina86.com>\r
  47 User-Agent: Notmuch/0.14+18~g79a73cd (http://notmuchmail.org) Emacs/23.4.1\r
  48         (x86_64-pc-linux-gnu)\r
  49 Date: Wed, 05 Sep 2012 00:12:39 +0400\r
  50 Message-ID: <87d321sg20.fsf@gmail.com>\r
  51 MIME-Version: 1.0\r
  52 Content-Type: text/plain; charset=utf-8\r
  53 Content-Transfer-Encoding: quoted-printable\r
  54 X-BeenThere: notmuch@notmuchmail.org\r
  55 X-Mailman-Version: 2.1.13\r
  56 Precedence: list\r
  57 List-Id: "Use and development of the notmuch mail system."\r
  58         <notmuch.notmuchmail.org>\r
  59 List-Unsubscribe: <http://notmuchmail.org/mailman/options/notmuch>,\r
  60         <mailto:notmuch-request@notmuchmail.org?subject=unsubscribe>\r
  61 List-Archive: <http://notmuchmail.org/pipermail/notmuch>\r
  62 List-Post: <mailto:notmuch@notmuchmail.org>\r
  63 List-Help: <mailto:notmuch-request@notmuchmail.org?subject=help>\r
  64 List-Subscribe: <http://notmuchmail.org/mailman/listinfo/notmuch>,\r
  65         <mailto:notmuch-request@notmuchmail.org?subject=subscribe>\r
  66 X-List-Received-Date: Tue, 04 Sep 2012 20:12:45 -0000\r
  67 \r
  68 Hi Michal.\r
  69 \r
  70 Michal Nazarewicz <mina86@mina86.com> writes:\r
  71 \r
  72 > On Tue, Sep 04 2012, Dmitry Kurochkin wrote:\r
  73 >> The script removes duplicate message files.  It takes no options.\r
  74 >>\r
  75 >> Files are assumed duplicates if their content is the same except for\r
  76 >> ignored headers.  Currently, the only ignored header is Received:.\r
  77 >> ---\r
  78 >>  contrib/notmuch-remove-duplicates.py |   95 +++++++++++++++++++++++++++=\r
  79 +++++++\r
  80 >>  1 file changed, 95 insertions(+)\r
  81 >>  create mode 100755 contrib/notmuch-remove-duplicates.py\r
  82 >>\r
  83 >> diff --git a/contrib/notmuch-remove-duplicates.py b/contrib/notmuch-remo=\r
  84 ve-duplicates.py\r
  85 >> new file mode 100755\r
  86 >> index 0000000..dbe2e25\r
  87 >> --- /dev/null\r
  88 >> +++ b/contrib/notmuch-remove-duplicates.py\r
  89 >> @@ -0,0 +1,95 @@\r
  90 >> +#!/usr/bin/env python\r
  91 >> +\r
  92 >> +import sys\r
  93 >> +\r
  94 >> +IGNORED_HEADERS =3D [ "Received:" ]\r
  95 >> +\r
  96 >> +if len(sys.argv) !=3D 1:\r
  97 >> +    print "Usage: %s" % sys.argv[0]\r
  98 >> +    print\r
  99 >> +    print "The script removes duplicate message files.  Takes no option=\r
 100 s."\r
 101 >> +    print "Requires notmuch python module."\r
 102 >> +    print\r
 103 >> +    print "Files are assumed duplicates if their content is the same"\r
 104 >> +    print "except for the following headers: %s." % ", ".join(IGNORED_H=\r
 105 EADERS)\r
 106 >> +    exit(1)\r
 107 >\r
 108 > It's much better put inside a main() function, which is then called only\r
 109 > if the script is run directly.\r
 110 >\r
 111 \r
 112 Good point.  My python skill is pretty low :)\r
 113 \r
 114 >> +\r
 115 >> +import notmuch\r
 116 >> +import os\r
 117 >> +import time\r
 118 >> +\r
 119 >> +class MailComparator:\r
 120 >> +    """Checks if mail files are duplicates."""\r
 121 >> +    def __init__(self, filename):\r
 122 >> +        self.filename =3D filename\r
 123 >> +        self.mail =3D self.readFile(self.filename)\r
 124 >> +\r
 125 >> +    def isDuplicate(self, filename):\r
 126 >> +        return self.mail =3D=3D self.readFile(filename)\r
 127 >> +\r
 128 >> +    @staticmethod\r
 129 >> +    def readFile(filename):\r
 130 >> +        with open(filename) as f:\r
 131 >> +            data =3D ""\r
 132 >> +            while True:\r
 133 >> +                line =3D f.readline()\r
 134 >> +                for header in IGNORED_HEADERS:\r
 135 >> +                    if line.startswith(header):\r
 136 >\r
 137 > Case of headers should be ignored, but this does not ignore it.\r
 138 >\r
 139 \r
 140 It does.\r
 141 \r
 142 >> +                        # skip header continuation lines\r
 143 >> +                        while True:\r
 144 >> +                            line =3D f.readline()\r
 145 >> +                            if len(line) =3D=3D 0 or line[0] not in [" =\r
 146 ", "\t"]:\r
 147 >> +                                break\r
 148 >> +                        break\r
 149 >\r
 150 > This will ignore line just after the ignored header.\r
 151 >\r
 152 \r
 153 The first header line is ignored as well, because a line is added to\r
 154 data only in the else block.\r
 155 \r
 156 >> +                else:\r
 157 >> +                    data +=3D line\r
 158 >> +                    if line =3D=3D "\n":\r
 159 >> +                        break\r
 160 >> +            data +=3D f.read()\r
 161 >> +            return data\r
 162 >> +\r
 163 >> +db =3D notmuch.Database()\r
 164 >> +query =3D db.create_query('*')\r
 165 >> +print "Number of messages: %s" % query.count_messages()\r
 166 >> +\r
 167 >> +files_count =3D 0\r
 168 >> +for root, dirs, files in os.walk(db.get_path()):\r
 169 >> +    if not root.startswith(os.path.join(db.get_path(), ".notmuch/")):\r
 170 >> +        files_count +=3D len(files)\r
 171 >> +print "Number of files: %s" % files_count\r
 172 >> +print "Estimated number of duplicates: %s" % (files_count - query.count=\r
 173 _messages())\r
 174 >> +\r
 175 >> +msgs =3D query.search_messages()\r
 176 >> +msg_count =3D 0\r
 177 >> +suspected_duplicates_count =3D 0\r
 178 >> +duplicates_count =3D 0\r
 179 >> +timestamp =3D time.time()\r
 180 >> +for msg in msgs:\r
 181 >> +    msg_count +=3D 1\r
 182 >> +    if len(msg.get_filenames()) > 1:\r
 183 >> +        filenames =3D msg.get_filenames()\r
 184 >> +        comparator =3D MailComparator(filenames.next())\r
 185 >> +        for filename in filenames:\r
 186 >\r
 187 > Strictly speaking, you need to compare each file to each file, and not\r
 188 > just every file to the first file.\r
 189 >\r
 190 >> +            if os.path.realpath(comparator.filename) =3D=3D os.path.rea=\r
 191 lpath(filename):\r
 192 >> +                print "Message '%s' has filenames pointing to the\r
 193 >> same file: '%s' '%s'" % (msg.get_message_id(), comparator.filename,\r
 194 >> filename)\r
 195 >\r
 196 > So why aren't those removed?\r
 197 >\r
 198 \r
 199 Because it is the same file indexed twice (probably because of\r
 200 symlinks).  We do not want to remove the only message file.\r
 201 \r
 202 >> +            elif comparator.isDuplicate(filename):\r
 203 >> +                os.remove(filename)\r
 204 >> +                duplicates_count +=3D 1\r
 205 >> +            else:\r
 206 >> +                #print "Potential duplicates: %s" % msg.get_message_id()\r
 207 >> +                suspected_duplicates_count +=3D 1\r
 208 >> +\r
 209 >> +    new_timestamp =3D time.time()\r
 210 >> +    if new_timestamp - timestamp > 1:\r
 211 >> +        timestamp =3D new_timestamp\r
 212 >> +        sys.stdout.write("\rProcessed %s messages, removed %s duplicate=\r
 213 s..." % (msg_count, duplicates_count))\r
 214 >> +        sys.stdout.flush()\r
 215 >> +\r
 216 >> +print "\rFinished. Processed %s messages, removed %s duplicates." % (ms=\r
 217 g_count, duplicates_count)\r
 218 >> +if duplicates_count > 0:\r
 219 >> +    print "You might want to run 'notmuch new' now."\r
 220 >> +\r
 221 >> +if suspected_duplicates_count > 0:\r
 222 >> +    print\r
 223 >> +    print "Found %s messages with duplicate IDs but different content."=\r
 224  % suspected_duplicates_count\r
 225 >> +    print "Perhaps we should ignore more headers."\r
 226 >\r
 227 > Please consider the following instead (not tested):\r
 228 >\r
 229 \r
 230 Thanks for reviewing my poor python code :) I am afraid I do not have\r
 231 enough interest in improving it.  I just implemented a simple solution\r
 232 for my problem.  Though it looks like you already took time to rewrite\r
 233 the script.  It would be great if you sent it as a proper patch\r
 234 obsoleting this one.\r
 235 \r
 236 Regards,\r
 237   Dmitry\r
 238 \r
 239 >\r
 240 > #!/usr/bin/env python\r
 241 >\r
 242 > import collections\r
 243 > import notmuch\r
 244 > import os\r
 245 > import re\r
 246 > import sys\r
 247 > import time\r
 248 >\r
 249 >\r
 250 > IGNORED_HEADERS =3D [ 'Received' ]\r
 251 >\r
 252 >\r
 253 > isIgnoredHeadersLine =3D re.compile(\r
 254 >     r'^(?:%s)\s*:' % '|'.join(IGNORED_HEADERS),\r
 255 >     re.IGNORECASE).search\r
 256 >\r
 257 > doesStartWithWS =3D re.compile(r'^\s').search\r
 258 >\r
 259 >\r
 260 > def usage(argv0):\r
 261 >     print """Usage: %s [<query-string>]\r
 262 >\r
 263 > The script removes duplicate message files.  Takes no options."\r
 264 > Requires notmuch python module."\r
 265 >\r
 266 > Files are assumed duplicates if their content is the same"\r
 267 > except for the following headers: %s.""" % (argv0, ', '.join(IGNORED_HEAD=\r
 268 ERS))\r
 269 >\r
 270 >\r
 271 > def readMailFile(filename):\r
 272 >     with open(filename) as fd:\r
 273 >         data =3D []\r
 274 >         skip_header =3D False\r
 275 >         for line in fd:\r
 276 >             if doesStartWithWS(line):\r
 277 >                 if not skip_header:\r
 278 >                     data.append(line)\r
 279 >             elif isIgnoredHeadersLine(line):\r
 280 >                 skip_header =3D True\r
 281 >             else:\r
 282 >                 data.append(line)\r
 283 >                 if line =3D=3D '\n':\r
 284 >                     break\r
 285 >         data.append(fd.read())\r
 286 >         return ''.join(data)\r
 287 >\r
 288 >\r
 289 > def dedupMessage(msg):\r
 290 >     filenames =3D msg.get_filenames()\r
 291 >     if len(filenames) <=3D 1:\r
 292 >         return (0, 0)\r
 293 >\r
 294 >     realpaths =3D collections.defaultdict(list)\r
 295 >     contents =3D collections.defaultdict(list)\r
 296 >     for filename in filenames:\r
 297 >         real =3D os.path.realpath(filename)\r
 298 >         lst =3D realpaths[real]\r
 299 >         lst.append(filename)\r
 300 >         if len(lst) =3D=3D 1:\r
 301 >             contents[readMailFile(real)].append(real)\r
 302 >\r
 303 >     duplicates =3D 0\r
 304 >\r
 305 >     for filenames in contents.itervalues():\r
 306 >         if len(filenames) > 1:\r
 307 >             print 'Files with the same content:'\r
 308 >             print ' ', filenames.pop()\r
 309 >             duplicates +=3D len(filenames)\r
 310 >             for filename in filenames:\r
 311 >                 del realpaths[filename]\r
 312 >             #     os.remane(filename)\r
 313 >\r
 314 >     for real, filenames in realpaths.iteritems():\r
 315 >         if len(filenames) > 1:\r
 316 >             print 'Files pointing to the same message:'\r
 317 >             print ' ', filenames.pop()\r
 318 >             duplicates +=3D len(filenames)\r
 319 >             # for filename in filenames:\r
 320 >             #     os.remane(filename)\r
 321 >\r
 322 >     return (duplicates, len(realpaths) - 1)\r
 323 >\r
 324 >\r
 325 > def dedupQuery(query):\r
 326 >     print 'Number of messages: %s' % query.count_messages()\r
 327 >     msg_count =3D 0\r
 328 >     suspected_count =3D 0\r
 329 >     duplicates_count =3D 0\r
 330 >     timestamp =3D time.time()\r
 331 >     msgs =3D query.search_messages()\r
 332 >     for msg in msgs:\r
 333 >         msg_count +=3D 1\r
 334 >         d, s =3D dedupMessage(msg)\r
 335 >         duplicates_count +=3D d\r
 336 >         suspected_count +=3D d\r
 337 >\r
 338 >         new_timestamp =3D time.time()\r
 339 >         if new_timestamp - timestamp > 1:\r
 340 >             timestamp =3D new_timestamp\r
 341 >             sys.stdout.write('\rProcessed %s messages, removed %s duplica=\r
 342 tes...'\r
 343 >                              % (msg_count, duplicates_count))\r
 344 >             sys.stdout.flush()\r
 345 >\r
 346 >     print '\rFinished. Processed %s messages, removed %s duplicates.' % (\r
 347 >         msg_count, duplicates_count)\r
 348 >     if duplicates_count > 0:\r
 349 >         print 'You might want to run "notmuch new" now.'\r
 350 >\r
 351 >     if suspected_duplicates_count > 0:\r
 352 >         print """\r
 353 > Found %d messages with duplicate IDs but different content.\r
 354 > Perhaps we should ignore more headers.""" % suspected_count\r
 355 >\r
 356 >\r
 357 > def main(argv):\r
 358 >     if len(argv) =3D=3D 1:\r
 359 >         query =3D '*'\r
 360 >     elif len(argv) =3D=3D 2:\r
 361 >         query =3D argv[1]\r
 362 >     else:\r
 363 >         usage(argv[0])\r
 364 >         return 1\r
 365 >\r
 366 >     db =3D notmuch.Database()\r
 367 >     query =3D db.create_query(query)\r
 368 >     dedupQuery(db, query)\r
 369 >     return 0\r
 370 >\r
 371 >\r
 372 > if __name__ =3D=3D '__main__':\r
 373 >     sys.exit(main(sys.argv))\r
 374 >\r
 375 >\r
 376 >\r
 377 > --=20\r
 378 > Best regards,                                         _     _\r
 379 > .o. | Liege of Serenely Enlightened Majesty of      o' \,=3D./ `o\r
 380 > ..o | Computer Science,  Micha=C5=82 =E2=80=9Cmina86=E2=80=9D Nazarewicz =\r
 381    (o o)\r
 382 > ooo +----<email/xmpp: mpn@google.com>--------------ooO--(_)--Ooo--\r