Multiple hash algorithms support for duplicates detection
parent 7aaeb3e98f
commit 395f2234d5
1 changed file with 65 additions and 32 deletions
@@ -2,9 +2,9 @@
 # Author: Darko Poljak <darko.poljak@gmail.com>
 # License: GPLv3

-"""sweeper 0.4.1
+"""{0} {1}

-Usage: sweeper.py [options] [<directory>...]
+Usage: {0} [options] [<directory>...]

 Arguments:
     <directory> directory path to scan for files
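The module docstring is now a template: main() fills in the program name and version before handing the text to docopt (see the hunk at line 258 below). Roughly, and using a hypothetical intermediate name `usage`:

    # what main() now does before parsing arguments
    usage = __doc__.format(sys.argv[0], __version__)   # '{0} {1}' becomes e.g. 'sweeper.py 0.4.1'
    args = docopt(usage, version=' '.join(('sweeper', __version__)))
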
@@ -14,7 +14,10 @@ Options:
     -v, --version                             show version and exit
     -b <blocksize>, --block-size=<blocksize>  size of block used when reading
                                               file's content [default: 4096]
-    -d <hashalg>, --digest-alg=<hashalg>      secure hash algorithm [default: md5]
+    -d <hashalgs>, --digest-algs=<hashalgs>   secure hash algorithm comma separated
+                                              list [default: md5]
+                                              note that multiple hashes will slow
+                                              down sweeper
     -a <action>, --action=<action>            action on duplicate files (pprint,
                                               print, remove, move)
                                               [default: pprint]
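The renamed --digest-algs option takes a comma separated list of algorithm names instead of a single one; every listed digest is computed per file, so extra algorithms trade scan speed for a lower chance of false duplicate matches. A hypothetical invocation of the new interface (the directory path and algorithm choice are illustrative, not taken from the diff):

    python sweeper.py --digest-algs=md5,sha1 --action=pprint /some/photo/dir
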
@@ -48,7 +51,7 @@ Options:
 from __future__ import print_function

 __author__ = 'Darko Poljak <darko.poljak@gmail.com>'
-__version__ = '0.4.0'
+__version__ = '0.4.1'
 __license__ = 'GPLv3'

 __all__ = [
@@ -67,10 +70,16 @@ from functools import partial
 if sys.version_info[0] == 3:
     def _dict_iter_items(d):
         return d.items()

+    def _dict_iter_keys(d):
+        return d.keys()
 else:
     def _dict_iter_items(d):
         return d.iteritems()
+
+    def _dict_iter_keys(d):
+        return d.iterkeys()
+

 def _filehash(filepath, hashalg, block_size):
     """Calculate secure hash for given file content using
@@ -122,7 +131,7 @@ def _files_iter_from_disk(topdirs):
             yield fpath


-def file_dups(topdirs=['./'], hashalg='md5', block_size=4096, verbose=False):
+def file_dups(topdirs=['./'], hashalgs=['md5'], block_size=4096, verbose=False):
     """Find duplicate files in directory list. Return directory
     with keys equal to file hash value and value as list of
     file paths whose content is the same.
@@ -144,7 +153,8 @@ def file_dups(topdirs=['./'], hashalg='md5', block_size=4096, verbose=False):
                   end='')
             sys.stdout.flush()
         current += 1
-        hexmd = _filehash(fpath, hashalg, block_size)
+        hexmds = [_filehash(fpath, h, block_size) for h in hashalgs]
+        hexmd = tuple(hexmds)
         dups[hexmd].append(fpath)

     if verbose:
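With several algorithms, file_dups now keys the duplicates dictionary on the tuple of all digests, so two files are grouped only when every digest matches. A minimal standalone sketch of the same keying idea (names and paths are illustrative; unlike sweeper, which hashes the file once per algorithm, this variant feeds all hashers in a single read pass):

    import hashlib
    import sys
    from collections import defaultdict

    def multi_digest(path, algs=('md5', 'sha1'), block_size=4096):
        # one hashlib object per requested algorithm, fed from one read pass
        hashers = [hashlib.new(a) for a in algs]
        with open(path, 'rb') as f:
            for block in iter(lambda: f.read(block_size), b''):
                for h in hashers:
                    h.update(block)
        # a tuple of hex digests works as a dictionary key
        return tuple(h.hexdigest() for h in hashers)

    dups = defaultdict(list)
    for path in sys.argv[1:]:  # pass candidate file paths on the command line
        dups[multi_digest(path)].append(path)
    for digests, paths in dups.items():
        if len(paths) > 1:
            print(digests, paths)
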
@@ -157,9 +167,9 @@ def file_dups(topdirs=['./'], hashalg='md5', block_size=4096, verbose=False):
     return result


-def _extract_files_for_action(topdirs, hashalg, block_size, keep_prefix,
+def _extract_files_for_action(topdirs, hashalgs, block_size, keep_prefix,
                               verbose):
-    for files in iter_file_dups(topdirs=topdirs, hashalg=hashalg,
+    for files in iter_file_dups(topdirs=topdirs, hashalgs=hashalgs,
                                 block_size=block_size, verbose=verbose):
         found = False
         if keep_prefix:
@@ -174,7 +184,7 @@ def _extract_files_for_action(topdirs, hashalg, block_size, keep_prefix,
         yield (files, result)


-def rm_file_dups(topdirs=['./'], hashalg='md5', block_size=4096,
+def rm_file_dups(topdirs=['./'], hashalgs=['md5'], block_size=4096,
                  simulate=False, keep_prefix=None, verbose=False):
     """Remove duplicate files found in specified directory list.
     If keep_prefix is specified then first file with that path
@@ -183,9 +193,9 @@ def rm_file_dups(topdirs=['./'], hashalg='md5', block_size=4096,
     If simulate is True then only print the action, do not actually
     perform it.
     """
-    for dups, extracted in _extract_files_for_action(topdirs, hashalg,
-                                                     block_size,
-                                                     keep_prefix, verbose):
+    for dups, extracted in _extract_files_for_action(topdirs, hashalgs,
+                                                     block_size, keep_prefix,
+                                                     verbose):
         if simulate or verbose:
             print('found duplicates: \n{}'.format(dups))
             for f in extracted:
@@ -195,7 +205,7 @@ def rm_file_dups(topdirs=['./'], hashalg='md5', block_size=4096,
                 os.remove(f)


-def mv_file_dups(topdirs=['./'], hashalg='md5', block_size=4096,
+def mv_file_dups(topdirs=['./'], hashalgs=['md5'], block_size=4096,
                  dest_dir='dups', simulate=False, keep_prefix=None,
                  verbose=False):
     """Move duplicate files found in specified directory list.
@@ -210,9 +220,9 @@ def mv_file_dups(topdirs=['./'], hashalg='md5', block_size=4096,
     if not os.path.isdir(dest_dir):
        raise OSError('{} is not a directory'.format(dest_dir))
     import shutil
-    for dups, extracted in _extract_files_for_action(topdirs, hashalg,
-                                                     block_size,
-                                                     keep_prefix, verbose):
+    for dups, extracted in _extract_files_for_action(topdirs, hashalgs,
+                                                     block_size, keep_prefix,
+                                                     verbose):
         if simulate or verbose:
             print('found duplicates: \n{}'.format(dups))
             for f in extracted:
@@ -222,13 +232,13 @@ def mv_file_dups(topdirs=['./'], hashalg='md5', block_size=4096,
                 shutil.move(f, dest_dir)


-def iter_file_dups(topdirs=['./'], rethash=False, hashalg='md5',
+def iter_file_dups(topdirs=['./'], rethash=False, hashalgs=['md5'],
                    block_size=4096, verbose=False):
     """Yield duplicate files when found in specified directory list.
     If rethash is True then tuple hash value and duplicate paths list is
     returned, otherwise duplicate paths list is returned.
     """
-    dups = file_dups(topdirs, hashalg, block_size, verbose)
+    dups = file_dups(topdirs, hashalgs, block_size, verbose)
     for hash_, fpaths in _dict_iter_items(dups):
         if rethash:
             yield (hash_, fpaths)
@@ -236,13 +246,26 @@ def iter_file_dups(topdirs=['./'], rethash=False, hashalg='md5',
             yield fpaths


+def _remap_keys_to_str(d):
+    '''Iterator that remaps dictionary keys to string in case keys are tuple
+    or list. Leave it unchanged otherwise.
+    '''
+    for k in _dict_iter_keys(d):
+        if isinstance(k, tuple) or isinstance(k, list):
+            key = ','.join(k)
+        else:
+            key = k
+        yield (key, d[k])
+
+
 def main():
     """Main when used as script. See usage (--help).
     """
     import json
     from docopt import docopt

-    args = docopt(__doc__, version=" ".join(('sweeper', __version__)))
+    args = docopt(__doc__.format(sys.argv[0], __version__),
+                  version=" ".join(('sweeper', __version__)))

     topdirs = args['<directory>']
     if not topdirs:
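The new _remap_keys_to_str helper exists because json.dumps rejects dictionaries whose keys are tuples, which is exactly what file_dups now produces. A small illustration of the failure and of the remapping it performs (the digests are made up and truncated):

    import json

    # made-up, truncated digests; real keys are tuples of full hexdigests
    dups = {('d41d8cd9', 'da39a3ee'): ['a.txt', 'b.txt']}
    try:
        json.dumps(dups)
    except TypeError as err:
        print('tuple keys are not JSON serializable:', err)

    # what _remap_keys_to_str yields: comma-joined string keys
    remapped = {','.join(k): v for k, v in dups.items()}
    print(json.dumps(remapped, indent=4))
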
@@ -258,35 +281,45 @@ def main():
         except ValueError:
             print('Invalid block size "{}"'.format(args['--block-size']))
             sys.exit(1)
+    hashalgs = args['--digest-algs'].split(',')
+    hashalgs_uniq = _uniq_list(hashalgs)
+    if len(hashalgs) != len(hashalgs_uniq):
+        print('Duplicate hash algorithms specified')
+        sys.exit(1)
+    block_size = args['--block-size']
+    simulate = args['--simulate']
+    keep_prefix = args['--keep']
+    dest_dir = args['--move']

     if action == 'print' or action == 'pprint':
         dups = file_dups(topdirs=topdirs,
-                         hashalg=args['--digest-alg'],
-                         block_size=args['--block-size'],
+                         hashalgs=hashalgs,
+                         block_size=block_size,
                          verbose=verbose)
         # defaultdict(list) -> dict
         spam = dict(dups)
         if spam:
             if action == 'pprint':
-                for h, fpaths in _dict_iter_items(spam):
+                for _, fpaths in _dict_iter_items(spam):
                     for path in fpaths:
                         print(path)
                     if fpaths:
                         print('')
             else:
-                print(json.dumps(spam, indent=4))
+                print(json.dumps({k: v for k, v in _remap_keys_to_str(spam)},
+                                 indent=4))
     elif action == 'move':
-        mv_file_dups(topdirs=topdirs, hashalg=args['--digest-alg'],
-                     block_size=args['--block-size'],
-                     dest_dir=args['--move'],
-                     simulate=args['--simulate'],
-                     keep_prefix=args['--keep'],
+        mv_file_dups(topdirs=topdirs, hashalgs=hashalgs,
+                     block_size=block_size,
+                     dest_dir=dest_dir,
+                     simulate=simulate,
+                     keep_prefix=keep_prefix,
                      verbose=verbose)
     elif action == 'remove':
-        rm_file_dups(topdirs=topdirs, hashalg=args['--digest-alg'],
-                     block_size=args['--block-size'],
-                     simulate=args['--simulate'],
-                     keep_prefix=args['--keep'],
+        rm_file_dups(topdirs=topdirs, hashalgs=hashalgs,
+                     block_size=block_size,
+                     simulate=simulate,
+                     keep_prefix=keep_prefix,
                      verbose=verbose)
     else:
         print('Invalid action "{}"'.format(action))
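For library use, the reworked keyword is hashalgs (a list) rather than hashalg across file_dups, iter_file_dups, rm_file_dups and mv_file_dups. A hedged sketch of calling the new API, assuming the sweeper module is importable and using an illustrative directory:

    from sweeper import file_dups

    # keys are tuples of hex digests, values are lists of same-content paths
    dups = file_dups(topdirs=['/some/photo/dir'], hashalgs=['md5', 'sha1'],
                     block_size=4096, verbose=False)
    for digests, paths in dups.items():
        print(digests, paths)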