--- /dev/null
+[[!meta title="find_duplicates"]]
+I've reorganized my music a few times, and it took me a while to get
+organized enough to want a single directory tree that held everything
+I owned. Every once in a while I go through the junk drawer and move
+a few more songs into the "official" tree, checking their metadata and
+whatnot. I was getting annoyed at finding duplicate songs in several
+junk drawers, so I wrote up a little script, [[find_duplicates.py]],
+which can compare SHA1s for files in two trees and remove any
+duplicates from the lesser tree. Now there is a lot less junk to merge
+into my official tree.
+[[!tag tags/code]]
+[[!tag tags/fun]]
+[[!tag tags/python]]
--- /dev/null
+#!/usr/bin/env python
+# Copyright (C) 2011 W. Trevor King <wking@drexel.edu>
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+# You should have received a copy of the GNU Lesser General Public
+# License along with this program. If not, see
+# <http://www.gnu.org/licenses/>.
+"""Search two directory trees for duplicate files.
+
+The command line script can optionally remove duplicates from the
+lesser tree (`dir_b`).
+"""
+import os
+import os.path
+from hashlib import sha1
+import sys
+def hash_file(filename):
+ print >> sys.stderr, ' hashing', filename
+ return sha1(open(filename, 'r').read()).hexdigest()
+def duplicates(dir_a, dir_b):
+ aa_duplicates = {}
+ ab_duplicates = {}
+ hashes = {}
+ for dirpath,dirnames,filenames in os.walk(dir_a):
+ for filename in filenames:
+ path = os.path.join(dirpath, filename)
+ h = hash_file(path)
+ if h in hashes:
+ if h in ab_duplicates:
+ aa_duplicates[h].append(path)
+ else:
+ aa_duplicates[h] = [hashes[h], path]
+ else:
+ hashes[h] = path
+ for dirpath,dirnames,filenames in os.walk(dir_b):
+ for filename in filenames:
+ path = os.path.join(dirpath, filename)
+ h = hash_file(path)
+ if h in hashes:
+ if h in ab_duplicates:
+ ab_duplicates[h].append(path)
+ else:
+ ab_duplicates[h] = [hashes[h], path]
+ return (aa_duplicates, ab_duplicates)
+if __name__ == '__main__':
+ from optparse import OptionParser
+ p = OptionParser(usage='%prog [options] dir_a dir_b')
+ p.add_option('-r', '--remove', help='remove duplicates from dir_b',
+ action='store_true')
+ options,arguments = p.parse_args()
+ dir_a,dir_b = arguments
+ aa_duplicates,ab_duplicates = duplicates(dir_a, dir_b)
+ path_groups = []
+ path_groups.extend(aa_duplicates.itervalues())
+ path_groups.extend(ab_duplicates.itervalues())
+ for path_group in path_groups:
+ print path_group[0]
+ for dup in path_group[1:]:
+ print ' ', dup
+ if options.remove:
+ print ''
+ for path_group in ab_duplicates.itervalues():
+ print 'removing duplicates of', path_group[0]
+ for dup in path_group[1:]:
+ print ' ', dup
+ os.remove(dup)