duplicates from the lesser tree. Now there is a lot less junk to merge
into my official tree.
+Sometimes you want to do something more effective than printing
+duplicates, but more subtle than removing them. To give you this
+flexibility, I've added the `--one-line` option, which you can use
+along these lines:
+
+ $ find_duplicates.py dir_a dir_b --one-line | while read -a DUPS; do mv "${DUPS[1]}" "${DUPS[0]}".dup; done
+
[[!tag tags/code]]
[[!tag tags/fun]]
[[!tag tags/python]]
return sha1(open(filename, 'r').read()).hexdigest()
def duplicates(dir_a, dir_b):
- aa_duplicates = {}
- ab_duplicates = {}
- hashes = {}
+ hashes = {} # first occurrence of hash in dir_a
+ aa_duplicates = {} # hash found multiple times in dir_a
+ ab_duplicates = {} # hash found once in dir_a and 1+ times in dir_b
for dirpath,dirnames,filenames in os.walk(dir_a):
for filename in filenames:
path = os.path.join(dirpath, filename)
h = hash_file(path)
if h in hashes:
- if h in ab_duplicates:
+ if h in aa_duplicates:
aa_duplicates[h].append(path)
else:
aa_duplicates[h] = [hashes[h], path]
from optparse import OptionParser
p = OptionParser(usage='%prog [options] dir_a dir_b')
- p.add_option('-r', '--remove', help='remove duplicates from dir_b',
- action='store_true')
+ p.add_option('-r', '--remove', action='store_true',
+ help='remove duplicates from dir_b')
+ p.add_option('--one-line', action='store_true',
+ help=('print tab-delimited duplicates on a single line '
+ '(for easier post-processing)'))
options,arguments = p.parse_args()
path_groups.extend(aa_duplicates.itervalues())
path_groups.extend(ab_duplicates.itervalues())
for path_group in path_groups:
- print path_group[0]
- for dup in path_group[1:]:
- print ' ', dup
+ if options.one_line:
+ print('\t'.join(path_group))
+ else:
+ print path_group[0]
+ for dup in path_group[1:]:
+ print ' ', dup
if options.remove:
print ''
for path_group in ab_duplicates.itervalues():