From: W. Trevor King
Date: Sun, 13 Mar 2011 03:43:20 +0000 (-0500)
Subject: Add find_duplicates post.
X-Git-Url: http://git.tremily.us/?a=commitdiff_plain;h=80a5dd22235417de2bbda47d5e19b6a6418bcd0e;p=mw2txt.git

Add find_duplicates post.
---

diff --git a/posts/find_duplicates.mdwn b/posts/find_duplicates.mdwn
new file mode 100644
index 0000000..1902c2b
--- /dev/null
+++ b/posts/find_duplicates.mdwn
@@ -0,0 +1,15 @@
+[[!meta title="find_duplicates"]]
+
+I've reorganized my music a few times, and it took me a while to get
+organized enough to want a single directory tree that held everything
+I owned.  Every once in a while I go through the junk drawer and move
+a few more songs into the "official" tree, checking their metadata and
+whatnot.  I was getting annoyed at finding duplicate songs in several
+junk drawers, so I wrote up a little script, [[find_duplicates.py]],
+which can compare SHA1s for files in two trees and remove any
+duplicates from the lesser tree.  Now there is a lot less junk to
+merge into my official tree.
+
+[[!tag tags/code]]
+[[!tag tags/fun]]
+[[!tag tags/python]]
diff --git a/posts/find_duplicates/find_duplicates.py b/posts/find_duplicates/find_duplicates.py
new file mode 100755
index 0000000..18b6f21
--- /dev/null
+++ b/posts/find_duplicates/find_duplicates.py
@@ -0,0 +1,92 @@
+#!/usr/bin/env python
+#
+# Copyright (C) 2011 W. Trevor King
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this program.  If not, see
+# <http://www.gnu.org/licenses/>.
+
+"""Search two directory trees for duplicate files.
+
+The command line script can optionally remove duplicates from the
+lesser tree.
+""" + +import os +import os.path +from hashlib import sha1 +import sys + + +def hash_file(filename): + print >> sys.stderr, ' hashing', filename + return sha1(open(filename, 'r').read()).hexdigest() + +def duplicates(dir_a, dir_b): + aa_duplicates = {} + ab_duplicates = {} + hashes = {} + for dirpath,dirnames,filenames in os.walk(dir_a): + for filename in filenames: + path = os.path.join(dirpath, filename) + h = hash_file(path) + if h in hashes: + if h in ab_duplicates: + aa_duplicates[h].append(path) + else: + aa_duplicates[h] = [hashes[h], path] + else: + hashes[h] = path + for dirpath,dirnames,filenames in os.walk(dir_b): + for filename in filenames: + path = os.path.join(dirpath, filename) + h = hash_file(path) + if h in hashes: + if h in ab_duplicates: + ab_duplicates[h].append(path) + else: + ab_duplicates[h] = [hashes[h], path] + return (aa_duplicates, ab_duplicates) + + +if __name__ == '__main__': + from optparse import OptionParser + + p = OptionParser(usage='%prog [options] dir_a dir_b') + p.add_option('-r', '--remove', help='remove duplicates from dir_b', + action='store_true') + + options,arguments = p.parse_args() + + dir_a,dir_b = arguments + + aa_duplicates,ab_duplicates = duplicates(dir_a, dir_b) + path_groups = [] + path_groups.extend(aa_duplicates.itervalues()) + path_groups.extend(ab_duplicates.itervalues()) + for path_group in path_groups: + print path_group[0] + for dup in path_group[1:]: + print ' ', dup + if options.remove: + print '' + for path_group in ab_duplicates.itervalues(): + print 'removing duplicates of', path_group[0] + for dup in path_group[1:]: + print ' ', dup + os.remove(dup)