Add --one-line option to find_duplicates.py.

author W. Trevor King <wking@drexel.edu>

Thu, 8 Dec 2011 15:57:41 +0000 (10:57 -0500)

committer W. Trevor King <wking@drexel.edu>

Thu, 8 Dec 2011 15:57:41 +0000 (10:57 -0500)
author W. Trevor King <wking@drexel.edu>
Thu, 8 Dec 2011 15:57:41 +0000 (10:57 -0500)
committer W. Trevor King <wking@drexel.edu>
Thu, 8 Dec 2011 15:57:41 +0000 (10:57 -0500)
diff --git a/posts/find_duplicates.mdwn b/posts/find_duplicates.mdwn

index 1902c2b532bcabb53b9b4065407cf3b8fd5fd369..887bd83a3e06fb0474af9de6a787cb5ddf5f5db0 100644 (file)
--- a/posts/find_duplicates.mdwn
+++ b/posts/find_duplicates.mdwn
@@ -10,6 +10,13 @@ which can compare SHA1s for files in two trees and remove any
  duplicates from the lesser tree.  Now there is a lot less junk to merge
  into my official tree.
  
+Sometimes you want to do something more effective than printing
+duplicates, but more subtle than removing them.  To give you this
+flexibility, I've added the `--one-line` option, which you can use
+along these lines:
+
+    $ find_duplicates.py dir_a dir_b --one-line | while read -a DUPS; do mv "${DUPS[1]}" "${DUPS[0]}".dup; done
+
  [[!tag tags/code]]
  [[!tag tags/fun]]
  [[!tag tags/python]]
diff --git a/posts/find_duplicates/find_duplicates.py b/posts/find_duplicates/find_duplicates.py

index 18b6f2108119d2071a80a4e95e4e46df527e72b4..5767780e02e44d2b6e20d0cdbd654372f9807933 100755 (executable)
--- a/posts/find_duplicates/find_duplicates.py
+++ b/posts/find_duplicates/find_duplicates.py
@@ -33,15 +33,15 @@ def hash_file(filename):
      return sha1(open(filename, 'r').read()).hexdigest()
  
  def duplicates(dir_a, dir_b):
-    aa_duplicates = {}
-    ab_duplicates = {}
-    hashes = {}
+    hashes = {}        # first occurrence of hash in dir_a
+    aa_duplicates = {} # hash found multiple times in dir_a
+    ab_duplicates = {} # hash found once in dir_a and 1+ times in dir_b
      for dirpath,dirnames,filenames in os.walk(dir_a):
          for filename in filenames:
              path = os.path.join(dirpath, filename)
              h = hash_file(path)
              if h in hashes:
-                if h in ab_duplicates:
+                if h in aa_duplicates:
                      aa_duplicates[h].append(path)
                  else:
                      aa_duplicates[h] = [hashes[h], path]
@@ -63,8 +63,11 @@ if __name__ == '__main__':
      from optparse import OptionParser
  
      p = OptionParser(usage='%prog [options] dir_a dir_b')
-    p.add_option('-r', '--remove', help='remove duplicates from dir_b',
-                 action='store_true')
+    p.add_option('-r', '--remove', action='store_true',
+                 help='remove duplicates from dir_b')
+    p.add_option('--one-line', action='store_true',
+                 help=('print tab-delimited duplicates on a single line '
+                       '(for easier post-processing)'))
  
      options,arguments = p.parse_args()
  
@@ -75,9 +78,12 @@ if __name__ == '__main__':
      path_groups.extend(aa_duplicates.itervalues())
      path_groups.extend(ab_duplicates.itervalues())
      for path_group in path_groups:
-        print path_group[0]
-        for dup in path_group[1:]:
-            print '  ', dup
+        if options.one_line:
+            print('\t'.join(path_group))
+        else:
+            print path_group[0]
+            for dup in path_group[1:]:
+                print '  ', dup
      if options.remove:
          print ''
          for path_group in ab_duplicates.itervalues():
author	W. Trevor King <wking@drexel.edu>
	Thu, 8 Dec 2011 15:57:41 +0000 (10:57 -0500)
committer	W. Trevor King <wking@drexel.edu>
	Thu, 8 Dec 2011 15:57:41 +0000 (10:57 -0500)
posts/find_duplicates.mdwn		patch \| blob \| history
posts/find_duplicates/find_duplicates.py		patch \| blob \| history