Support multiple hash algorithms for duplicate detection

This commit is contained in:
Darko Poljak 2014-02-06 08:48:18 +01:00
parent 7aaeb3e98f
commit 395f2234d5

View file

@ -2,9 +2,9 @@
# Author: Darko Poljak <darko.poljak@gmail.com> # Author: Darko Poljak <darko.poljak@gmail.com>
# License: GPLv3 # License: GPLv3
"""sweeper 0.4.1 """{0} {1}
Usage: sweeper.py [options] [<directory>...] Usage: {0} [options] [<directory>...]
Arguments: Arguments:
<directory> directory path to scan for files <directory> directory path to scan for files
@ -14,7 +14,10 @@ Options:
-v, --version show version and exit -v, --version show version and exit
-b <blocksize>, --block-size=<blocksize> size of block used when reading -b <blocksize>, --block-size=<blocksize> size of block used when reading
file's content [default: 4096] file's content [default: 4096]
-d <hashalg>, --digest-alg=<hashalg> secure hash algorithm [default: md5] -d <hashalgs>, --digest-algs=<hashalgs> secure hash algorithm comma separated
list [default: md5]
note that multiple hashes will slow
down sweeper
-a <action>, --action=<action> action on duplicate files (pprint, -a <action>, --action=<action> action on duplicate files (pprint,
print, remove, move) print, remove, move)
[default: pprint] [default: pprint]
@ -48,7 +51,7 @@ Options:
from __future__ import print_function from __future__ import print_function
__author__ = 'Darko Poljak <darko.poljak@gmail.com>' __author__ = 'Darko Poljak <darko.poljak@gmail.com>'
__version__ = '0.4.0' __version__ = '0.4.1'
__license__ = 'GPLv3' __license__ = 'GPLv3'
__all__ = [ __all__ = [
@ -67,10 +70,16 @@ from functools import partial
if sys.version_info[0] == 3: if sys.version_info[0] == 3:
def _dict_iter_items(d): def _dict_iter_items(d):
return d.items() return d.items()
def _dict_iter_keys(d):
return d.keys()
else: else:
def _dict_iter_items(d): def _dict_iter_items(d):
return d.iteritems() return d.iteritems()
def _dict_iter_keys(d):
return d.iterkeys()
def _filehash(filepath, hashalg, block_size): def _filehash(filepath, hashalg, block_size):
"""Calculate secure hash for given file content using """Calculate secure hash for given file content using
@ -122,7 +131,7 @@ def _files_iter_from_disk(topdirs):
yield fpath yield fpath
def file_dups(topdirs=['./'], hashalg='md5', block_size=4096, verbose=False): def file_dups(topdirs=['./'], hashalgs=['md5'], block_size=4096, verbose=False):
"""Find duplicate files in directory list. Return dictionary """Find duplicate files in directory list. Return dictionary
with keys equal to file hash value and value as list of with keys equal to file hash value and value as list of
file paths whose content is the same. file paths whose content is the same.
@ -144,7 +153,8 @@ def file_dups(topdirs=['./'], hashalg='md5', block_size=4096, verbose=False):
end='') end='')
sys.stdout.flush() sys.stdout.flush()
current += 1 current += 1
hexmd = _filehash(fpath, hashalg, block_size) hexmds = [_filehash(fpath, h, block_size) for h in hashalgs]
hexmd = tuple(hexmds)
dups[hexmd].append(fpath) dups[hexmd].append(fpath)
if verbose: if verbose:
@ -157,9 +167,9 @@ def file_dups(topdirs=['./'], hashalg='md5', block_size=4096, verbose=False):
return result return result
def _extract_files_for_action(topdirs, hashalg, block_size, keep_prefix, def _extract_files_for_action(topdirs, hashalgs, block_size, keep_prefix,
verbose): verbose):
for files in iter_file_dups(topdirs=topdirs, hashalg=hashalg, for files in iter_file_dups(topdirs=topdirs, hashalgs=hashalgs,
block_size=block_size, verbose=verbose): block_size=block_size, verbose=verbose):
found = False found = False
if keep_prefix: if keep_prefix:
@ -174,7 +184,7 @@ def _extract_files_for_action(topdirs, hashalg, block_size, keep_prefix,
yield (files, result) yield (files, result)
def rm_file_dups(topdirs=['./'], hashalg='md5', block_size=4096, def rm_file_dups(topdirs=['./'], hashalgs=['md5'], block_size=4096,
simulate=False, keep_prefix=None, verbose=False): simulate=False, keep_prefix=None, verbose=False):
"""Remove duplicate files found in specified directory list. """Remove duplicate files found in specified directory list.
If keep_prefix is specified then first file with that path If keep_prefix is specified then first file with that path
@ -183,9 +193,9 @@ def rm_file_dups(topdirs=['./'], hashalg='md5', block_size=4096,
If simulate is True then only print the action, do not actually If simulate is True then only print the action, do not actually
perform it. perform it.
""" """
for dups, extracted in _extract_files_for_action(topdirs, hashalg, for dups, extracted in _extract_files_for_action(topdirs, hashalgs,
block_size, block_size, keep_prefix,
keep_prefix, verbose): verbose):
if simulate or verbose: if simulate or verbose:
print('found duplicates: \n{}'.format(dups)) print('found duplicates: \n{}'.format(dups))
for f in extracted: for f in extracted:
@ -195,7 +205,7 @@ def rm_file_dups(topdirs=['./'], hashalg='md5', block_size=4096,
os.remove(f) os.remove(f)
def mv_file_dups(topdirs=['./'], hashalg='md5', block_size=4096, def mv_file_dups(topdirs=['./'], hashalgs=['md5'], block_size=4096,
dest_dir='dups', simulate=False, keep_prefix=None, dest_dir='dups', simulate=False, keep_prefix=None,
verbose=False): verbose=False):
"""Move duplicate files found in specified directory list. """Move duplicate files found in specified directory list.
@ -210,9 +220,9 @@ def mv_file_dups(topdirs=['./'], hashalg='md5', block_size=4096,
if not os.path.isdir(dest_dir): if not os.path.isdir(dest_dir):
raise OSError('{} is not a directory'.format(dest_dir)) raise OSError('{} is not a directory'.format(dest_dir))
import shutil import shutil
for dups, extracted in _extract_files_for_action(topdirs, hashalg, for dups, extracted in _extract_files_for_action(topdirs, hashalgs,
block_size, block_size, keep_prefix,
keep_prefix, verbose): verbose):
if simulate or verbose: if simulate or verbose:
print('found duplicates: \n{}'.format(dups)) print('found duplicates: \n{}'.format(dups))
for f in extracted: for f in extracted:
@ -222,13 +232,13 @@ def mv_file_dups(topdirs=['./'], hashalg='md5', block_size=4096,
shutil.move(f, dest_dir) shutil.move(f, dest_dir)
def iter_file_dups(topdirs=['./'], rethash=False, hashalg='md5', def iter_file_dups(topdirs=['./'], rethash=False, hashalgs=['md5'],
block_size=4096, verbose=False): block_size=4096, verbose=False):
"""Yield duplicate files when found in specified directory list. """Yield duplicate files when found in specified directory list.
If rethash is True then tuple hash value and duplicate paths list is If rethash is True then tuple hash value and duplicate paths list is
returned, otherwise duplicate paths list is returned. returned, otherwise duplicate paths list is returned.
""" """
dups = file_dups(topdirs, hashalg, block_size, verbose) dups = file_dups(topdirs, hashalgs, block_size, verbose)
for hash_, fpaths in _dict_iter_items(dups): for hash_, fpaths in _dict_iter_items(dups):
if rethash: if rethash:
yield (hash_, fpaths) yield (hash_, fpaths)
@ -236,13 +246,26 @@ def iter_file_dups(topdirs=['./'], rethash=False, hashalg='md5',
yield fpaths yield fpaths
def _remap_keys_to_str(d):
    """Yield ``(key, value)`` pairs of ``d`` with sequence keys flattened.

    A key that is a tuple or a list is replaced by its elements joined
    with commas; any other key is passed through unchanged. Values are
    looked up in ``d`` and yielded as they are.
    """
    for original_key in _dict_iter_keys(d):
        is_sequence = isinstance(original_key, (tuple, list))
        flat_key = ','.join(original_key) if is_sequence else original_key
        yield (flat_key, d[original_key])
def main(): def main():
"""Main when used as script. See usage (--help). """Main when used as script. See usage (--help).
""" """
import json import json
from docopt import docopt from docopt import docopt
args = docopt(__doc__, version=" ".join(('sweeper', __version__))) args = docopt(__doc__.format(sys.argv[0], __version__),
version=" ".join(('sweeper', __version__)))
topdirs = args['<directory>'] topdirs = args['<directory>']
if not topdirs: if not topdirs:
@ -258,35 +281,45 @@ def main():
except ValueError: except ValueError:
print('Invalid block size "{}"'.format(args['--block-size'])) print('Invalid block size "{}"'.format(args['--block-size']))
sys.exit(1) sys.exit(1)
hashalgs = args['--digest-algs'].split(',')
hashalgs_uniq = _uniq_list(hashalgs)
if len(hashalgs) != len(hashalgs_uniq):
print('Duplicate hash algorithms specified')
sys.exit(1)
block_size = args['--block-size']
simulate = args['--simulate']
keep_prefix = args['--keep']
dest_dir = args['--move']
if action == 'print' or action == 'pprint': if action == 'print' or action == 'pprint':
dups = file_dups(topdirs=topdirs, dups = file_dups(topdirs=topdirs,
hashalg=args['--digest-alg'], hashalgs=hashalgs,
block_size=args['--block-size'], block_size=block_size,
verbose=verbose) verbose=verbose)
# defaultdict(list) -> dict # defaultdict(list) -> dict
spam = dict(dups) spam = dict(dups)
if spam: if spam:
if action == 'pprint': if action == 'pprint':
for h, fpaths in _dict_iter_items(spam): for _, fpaths in _dict_iter_items(spam):
for path in fpaths: for path in fpaths:
print(path) print(path)
if fpaths: if fpaths:
print('') print('')
else: else:
print(json.dumps(spam, indent=4)) print(json.dumps({k: v for k, v in _remap_keys_to_str(spam)},
indent=4))
elif action == 'move': elif action == 'move':
mv_file_dups(topdirs=topdirs, hashalg=args['--digest-alg'], mv_file_dups(topdirs=topdirs, hashalgs=hashalgs,
block_size=args['--block-size'], block_size=block_size,
dest_dir=args['--move'], dest_dir=dest_dir,
simulate=args['--simulate'], simulate=simulate,
keep_prefix=args['--keep'], keep_prefix=keep_prefix,
verbose=verbose) verbose=verbose)
elif action == 'remove': elif action == 'remove':
rm_file_dups(topdirs=topdirs, hashalg=args['--digest-alg'], rm_file_dups(topdirs=topdirs, hashalgs=hashalgs,
block_size=args['--block-size'], block_size=block_size,
simulate=args['--simulate'], simulate=simulate,
keep_prefix=args['--keep'], keep_prefix=keep_prefix,
verbose=verbose) verbose=verbose)
else: else:
print('Invalid action "{}"'.format(action)) print('Invalid action "{}"'.format(action))