Add find_duplicates post.
author	W. Trevor King <wking@drexel.edu>
Sun, 13 Mar 2011 03:43:20 +0000 (22:43 -0500)
committer	W. Trevor King <wking@drexel.edu>
Sun, 13 Mar 2011 03:46:18 +0000 (22:46 -0500)
posts/find_duplicates.mdwn [new file with mode: 0644]
posts/find_duplicates/find_duplicates.py [new file with mode: 0755]

diff --git a/posts/find_duplicates.mdwn b/posts/find_duplicates.mdwn
new file mode 100644 (file)
index 0000000..1902c2b
--- /dev/null
@@ -0,0 +1,15 @@
+[[!meta  title="find_duplicates"]]
+
+I've reorganized my music a few times, and it took me a while to get
+organized enough to want a single directory tree that held everything
+I owned.  Every once in a while I go through the junk drawer and move
+a few more songs into the "official" tree, checking their metadata and
+whatnot.  I was getting annoyed at finding duplicate songs in several
+junk drawers, so I wrote up a little script, [[find_duplicates.py]],
+which compares SHA1s for files in two trees and can optionally
+remove duplicates from the lesser tree.  Now there is a lot less junk
+to merge into my official tree.
+
+[[!tag tags/code]]
+[[!tag tags/fun]]
+[[!tag tags/python]]
diff --git a/posts/find_duplicates/find_duplicates.py b/posts/find_duplicates/find_duplicates.py
new file mode 100755 (executable)
index 0000000..18b6f21
--- /dev/null
@@ -0,0 +1,102 @@
+#!/usr/bin/env python
+#
+# Copyright (C) 2011 W. Trevor King <wking@drexel.edu>
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this program.  If not, see
+# <http://www.gnu.org/licenses/>.
+
+"""Search two directory trees for duplicate files.
+
+The command line script can optionally remove duplicates from the
+lesser tree.
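+
+For example, to list duplicates between an official tree and a junk
+drawer, and then prune the junk drawer (paths here are hypothetical):
+
+    find_duplicates.py ~/music/official ~/music/junk_drawer
+    find_duplicates.py --remove ~/music/official ~/music/junk_drawer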
+"""
+
+import os
+import os.path
+from hashlib import sha1
+import sys
+
+
+def hash_file(filename):
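+    """Return the SHA1 hexdigest of a file's contents."""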
+    print >> sys.stderr, '    hashing', filename
+    return sha1(open(filename, 'rb').read()).hexdigest()
+
+def duplicates(dir_a, dir_b):
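+    """Return (aa_duplicates, ab_duplicates), both keyed by SHA1.
+
+    aa_duplicates lists groups of duplicate files within dir_a;
+    ab_duplicates lists files under dir_b that also occur under
+    dir_a.  The first path in each group lives under dir_a.
+    """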
+    aa_duplicates = {}
+    ab_duplicates = {}
+    hashes = {}
+    for dirpath,dirnames,filenames in os.walk(dir_a):
+        for filename in filenames:
+            path = os.path.join(dirpath, filename)
+            h = hash_file(path)
+            if h in hashes:
+                if h in aa_duplicates:
+                    aa_duplicates[h].append(path)
+                else:
+                    aa_duplicates[h] = [hashes[h], path]
+            else:
+                hashes[h] = path
+    for dirpath,dirnames,filenames in os.walk(dir_b):
+        for filename in filenames:
+            path = os.path.join(dirpath, filename)
+            h = hash_file(path)
+            if h in hashes:
+                if h in ab_duplicates:
+                    ab_duplicates[h].append(path)
+                else:
+                    ab_duplicates[h] = [hashes[h], path]
+    return (aa_duplicates, ab_duplicates)
+
+
+if __name__ == '__main__':
+    from optparse import OptionParser
+
+    p = OptionParser(usage='%prog [options] dir_a dir_b')
+    p.add_option('-r', '--remove', help='remove duplicates from dir_b',
+                 action='store_true')
+
+    options,arguments = p.parse_args()
+
+    if len(arguments) != 2:
+        p.error('expected exactly two directory arguments')
+    dir_a,dir_b = arguments
+
+    aa_duplicates,ab_duplicates = duplicates(dir_a, dir_b)
+    path_groups = []
+    path_groups.extend(aa_duplicates.itervalues())
+    path_groups.extend(ab_duplicates.itervalues())
+    for path_group in path_groups:
+        print path_group[0]
+        for dup in path_group[1:]:
+            print '  ', dup
+    if options.remove:
+        print ''
+        for path_group in ab_duplicates.itervalues():
+            print 'removing duplicates of', path_group[0]
+            for dup in path_group[1:]:
+                print '  ', dup
+                os.remove(dup)
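
For reference, a minimal sketch of using the duplicates() helper from
another Python script; the directory paths are hypothetical, and it
assumes the script above is importable as find_duplicates:

    from find_duplicates import duplicates

    aa, ab = duplicates('/srv/music/official', '/srv/music/junk_drawer')
    for paths in ab.itervalues():
        # paths[0] lives under the first tree; the rest duplicate it in the second
        print paths[0], '->', paths[1:]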