Add --one-line option to find_duplicates.py.
author     W. Trevor King <wking@drexel.edu>
           Thu, 8 Dec 2011 15:57:41 +0000 (10:57 -0500)
committer  W. Trevor King <wking@drexel.edu>
           Thu, 8 Dec 2011 15:57:41 +0000 (10:57 -0500)
Also add comments explaining dictionary contents and fix an
ab_duplicates -> aa_duplicates typo.

posts/find_duplicates.mdwn
posts/find_duplicates/find_duplicates.py

diff --git a/posts/find_duplicates.mdwn b/posts/find_duplicates.mdwn
index 1902c2b532bcabb53b9b4065407cf3b8fd5fd369..887bd83a3e06fb0474af9de6a787cb5ddf5f5db0 100644
@@ -10,6 +10,13 @@ which can compare SHA1s for files in two trees and remove any
 duplicates from the lesser tree.  Now there is a lot less junk to merge
 into my official tree.
 
+Sometimes you want to do something more effective than printing
+duplicates, but more subtle than removing them.  To give you this
+flexibility, I've added the `--one-line` option, which you can use
+along these lines:
+
+    $ find_duplicates.py dir_a dir_b --one-line | while read -a DUPS; do mv "${DUPS[1]}" "${DUPS[0]}".dup; done
+
 [[!tag tags/code]]
 [[!tag tags/fun]]
 [[!tag tags/python]]
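
The shell loop above splits on whitespace, so it only behaves when the
paths are free of spaces.  A rough Python equivalent of the same
post-processing (illustrative only, not part of this patch; it assumes
`find_duplicates.py` is on your `PATH` and that `dir_a`/`dir_b` stand in
for your real trees) would be:

    # rough Python equivalent of the shell one-liner above
    import os
    import subprocess

    output = subprocess.check_output(
        ['find_duplicates.py', 'dir_a', 'dir_b', '--one-line'])
    for line in output.splitlines():
        paths = line.split('\t')        # first path, then its duplicates
        os.rename(paths[1], paths[0] + '.dup')

As with the shell version, this assumes none of the paths themselves
contain tabs or newlines.
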
diff --git a/posts/find_duplicates/find_duplicates.py b/posts/find_duplicates/find_duplicates.py
index 18b6f2108119d2071a80a4e95e4e46df527e72b4..5767780e02e44d2b6e20d0cdbd654372f9807933 100755
@@ -33,15 +33,15 @@ def hash_file(filename):
     return sha1(open(filename, 'r').read()).hexdigest()
 
 def duplicates(dir_a, dir_b):
-    aa_duplicates = {}
-    ab_duplicates = {}
-    hashes = {}
+    hashes = {}        # first occurrence of hash in dir_a
+    aa_duplicates = {} # hash found multiple times in dir_a
+    ab_duplicates = {} # hash found once in dir_a and 1+ times in dir_b
     for dirpath,dirnames,filenames in os.walk(dir_a):
         for filename in filenames:
             path = os.path.join(dirpath, filename)
             h = hash_file(path)
             if h in hashes:
-                if h in ab_duplicates:
+                if h in aa_duplicates:
                     aa_duplicates[h].append(path)
                 else:
                     aa_duplicates[h] = [hashes[h], path]
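
For a concrete picture of what those three dictionaries end up holding,
here is a hypothetical snapshot after a scan (the hash keys and file
names are made up; only the shape matters):

    hashes = {
        'sha1-of-x': 'dir_a/x',    # first place each hash was seen in dir_a
        'sha1-of-z': 'dir_a/z',
    }
    aa_duplicates = {
        'sha1-of-x': ['dir_a/x', 'dir_a/y'],   # same content twice in dir_a
    }
    ab_duplicates = {
        'sha1-of-z': ['dir_a/z', 'dir_b/z2'],  # dir_a content found again in dir_b
    }
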
@@ -63,8 +63,11 @@ if __name__ == '__main__':
     from optparse import OptionParser
 
     p = OptionParser(usage='%prog [options] dir_a dir_b')
-    p.add_option('-r', '--remove', help='remove duplicates from dir_b',
-                 action='store_true')
+    p.add_option('-r', '--remove', action='store_true',
+                 help='remove duplicates from dir_b')
+    p.add_option('--one-line', action='store_true',
+                 help=('print tab-delimited duplicates on a single line '
+                       '(for easier post-processing)'))
 
     options,arguments = p.parse_args()
 
@@ -75,9 +78,12 @@ if __name__ == '__main__':
     path_groups.extend(aa_duplicates.itervalues())
     path_groups.extend(ab_duplicates.itervalues())
     for path_group in path_groups:
-        print path_group[0]
-        for dup in path_group[1:]:
-            print '  ', dup
+        if options.one_line:
+            print('\t'.join(path_group))
+        else:
+            print path_group[0]
+            for dup in path_group[1:]:
+                print '  ', dup
     if options.remove:
         print ''
         for path_group in ab_duplicates.itervalues():
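
If you later want the groups back inside Python, the `--one-line`
format round-trips easily; a minimal sketch, assuming the script's
output was captured to a hypothetical `dups.txt`:

    # read a saved --one-line listing back into a list of path groups
    path_groups = [line.rstrip('\n').split('\t')
                   for line in open('dups.txt')]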