From: W. Trevor King Date: Thu, 8 Dec 2011 15:57:41 +0000 (-0500) Subject: Add --one-line option to find_duplicates.py. X-Git-Url: http://git.tremily.us/?a=commitdiff_plain;h=a339769e77c74747ea068a007fce107a483049bc;p=blog.git Add --one-line option to find_duplicates.py. Also add comments explaining dictionary contents and fix ab_duplicates -> aa_duplicates typo. --- diff --git a/posts/find_duplicates.mdwn b/posts/find_duplicates.mdwn index 1902c2b..887bd83 100644 --- a/posts/find_duplicates.mdwn +++ b/posts/find_duplicates.mdwn @@ -10,6 +10,13 @@ which can compare SHA1s for files in two trees and remove any duplicates from the lesser tree. Now there is a lot less junk to merge into my official tree. +Sometimes you want to do something more effective than printing +duplicates, but more subtle than removing them. To give you this +flexibility, I've added the `--one-line` option, which you can use +along these lines: + + $ find_duplicates.py dir_a dir_b --one-line | while read -a DUPS; do mv "${DUPS[1]}" "${DUPS[0]}".dup; done + [[!tag tags/code]] [[!tag tags/fun]] [[!tag tags/python]] diff --git a/posts/find_duplicates/find_duplicates.py b/posts/find_duplicates/find_duplicates.py index 18b6f21..5767780 100755 --- a/posts/find_duplicates/find_duplicates.py +++ b/posts/find_duplicates/find_duplicates.py @@ -33,15 +33,15 @@ def hash_file(filename): return sha1(open(filename, 'r').read()).hexdigest() def duplicates(dir_a, dir_b): - aa_duplicates = {} - ab_duplicates = {} - hashes = {} + hashes = {} # first occurrence of hash in dir_a + aa_duplicates = {} # hash found multiple times in dir_a + ab_duplicates = {} # hash found once in dir_a and 1+ times in dir_b for dirpath,dirnames,filenames in os.walk(dir_a): for filename in filenames: path = os.path.join(dirpath, filename) h = hash_file(path) if h in hashes: - if h in ab_duplicates: + if h in aa_duplicates: aa_duplicates[h].append(path) else: aa_duplicates[h] = [hashes[h], path] @@ -63,8 +63,11 @@ if __name__ == 
'__main__': from optparse import OptionParser p = OptionParser(usage='%prog [options] dir_a dir_b') - p.add_option('-r', '--remove', help='remove duplicates from dir_b', - action='store_true') + p.add_option('-r', '--remove', action='store_true', + help='remove duplicates from dir_b') + p.add_option('--one-line', action='store_true', + help=('print tab-delimited duplicates on a single line ' + '(for easier post-processing)')) options,arguments = p.parse_args() @@ -75,9 +78,12 @@ if __name__ == '__main__': path_groups.extend(aa_duplicates.itervalues()) path_groups.extend(ab_duplicates.itervalues()) for path_group in path_groups: - print path_group[0] - for dup in path_group[1:]: - print ' ', dup + if options.one_line: + print('\t'.join(path_group)) + else: + print path_group[0] + for dup in path_group[1:]: + print ' ', dup if options.remove: print '' for path_group in ab_duplicates.itervalues():