diff --git a/TODO b/TODO index 827d609..0bb0c21 100644 --- a/TODO +++ b/TODO @@ -1 +1,2 @@ add logging, status file examined, etc.? +add argument mv, rm action: preserve directory prefix diff --git a/setup.py b/setup.py index 19c5681..ee49c69 100644 --- a/setup.py +++ b/setup.py @@ -1,9 +1,11 @@ import os from setuptools import setup + def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() + setup( name='sweeper', version='0.4.0', @@ -30,4 +32,3 @@ setup( "Topic :: Software Development :: Libraries :: Python Modules", ], ) - diff --git a/sweeper/sweeper.py b/sweeper/sweeper.py index 7da22a3..1960077 100644 --- a/sweeper/sweeper.py +++ b/sweeper/sweeper.py @@ -19,10 +19,12 @@ Options: print, remove, move) [default: pprint] -remove removes duplicate files - except first found + except first or first with specified + directory prefix found -move moves duplicate files to duplicates driectory, except first - found + or first with specified directory + prefix found -print prints result directory where keys are hash values and values are list of duplicate file paths @@ -32,6 +34,15 @@ Options: -m , --move= move duplicate files to directory (used with move action) [default: ./dups] +-k , --keep= directory prefix for remove and move + actions +-s, --simulate if action is remove or move just + simulate action by printing, do not + actually perform the action +-V, --verbose print more info, note that verbosity + will slow down sweeper due to text + printing and additional information + gathering """ __author__ = 'Darko Poljak ' @@ -49,16 +60,11 @@ from collections import defaultdict # some differences in python versions +# we prefer iter methods if sys.version_info[0] == 3: - def _do_encode(buf): - return buf - def _dict_iter_items(d): return d.items() else: - def _do_encode(buf): - return buf - def _dict_iter_items(d): return d.iteritems() @@ -71,58 +77,126 @@ def _filehash(filepath, hashalg, block_size): md = hashlib.new(hashalg) with open(filepath, "rb") as f: for buf in iter(lambda: f.read(block_size), b''): - md.update(_do_encode(buf)) + md.update(buf) return md.hexdigest() -def file_dups(topdirs=['./'], hashalg='md5', block_size=4096): +def _uniq_list(list_): + result = [] + for foo in list_: + if foo not in result: + result.append(foo) + return result + + +def file_dups(topdirs=['./'], hashalg='md5', block_size=4096, verbose=False): """Find duplicate files in directory list. Return directory with keys equal to file hash value and value as list of file paths whose content is the same. """ dups = defaultdict(list) + if verbose: + print('counting...', end='') + sys.stdout.flush() + count = 0 + for topdir in topdirs: + for _, _, filenames in os.walk(topdir): + count += len(filenames) + current = 1 + print(count) for topdir in topdirs: for dirpath, dirnames, filenames in os.walk(topdir): for fname in filenames: + if verbose: + print('\rprocessing file {0}/{1}'.format(current, count), + end='') + sys.stdout.flush() + current += 1 fpath = os.path.join(dirpath, fname) hexmd = _filehash(fpath, hashalg, block_size) dups[hexmd].append(fpath) - result = {k: v for k, v in _dict_iter_items(dups) if len(v) > 1} + if verbose: + print('') + result = {} + for k, v in _dict_iter_items(dups): + uniq_v = _uniq_list(v) + if len(uniq_v) > 1: + result[k] = uniq_v return result -def rm_file_dups(topdirs=['./'], hashalg='md5', block_size=4096): +def _extract_files_for_action(topdirs, hashalg, block_size, keep_prefix, + verbose): + for files in iter_file_dups(topdirs=topdirs, hashalg=hashalg, + block_size=block_size, verbose=verbose): + found = False + if keep_prefix: + result = [] + for f in files: + if f.startswith(keep_prefix) and not found: + found = True + else: + result.append(f) + if not found: + result = files[1:] + yield (files, result) + + +def rm_file_dups(topdirs=['./'], hashalg='md5', block_size=4096, + simulate=False, keep_prefix=None, verbose=False): """Remove duplicate files found in specified directory list. - First file in list is kept. + If keep_prefix is specified then first file with that path + prefix found is kept in the original directory. + Otherwise first file in list is kept in the original directory. + If simulate is True then only print the action, do not actually + perform it. """ - for files in do_with_file_dups(topdirs, hashalg, block_size): - for f in files: - os.remove(f) + for dups, extracted in _extract_files_for_action(topdirs, hashalg, + block_size, + keep_prefix, verbose): + if simulate or verbose: + print('found duplicates: \n{}'.format(dups)) + for f in extracted: + if simulate or verbose: + print('rm {}'.format(f)) + if not simulate: + os.remove(f) def mv_file_dups(topdirs=['./'], hashalg='md5', block_size=4096, - dest_dir='dups'): + dest_dir='dups', simulate=False, keep_prefix=None, + verbose=False): """Move duplicate files found in specified directory list. - First file in list is kept in the original directory. + If keep_prefix is specified then first file with that path + prefix found is kept in the original directory. + Otherwise first file in list is kept in the original directory. + If simulate is True then only print the action, do not actually + perform it. """ if not os.path.exists(dest_dir): os.mkdir(dest_dir) if not os.path.isdir(dest_dir): raise OSError('{} is not a directory'.format(dest_dir)) import shutil - for files in do_with_file_dups(topdirs, hashalg, block_size): - for i, f in enumerate(files): - if i > 0: + for dups, extracted in _extract_files_for_action(topdirs, hashalg, + block_size, + keep_prefix, verbose): + if simulate or verbose: + print('found duplicates: \n{}'.format(dups)) + for f in extracted: + if simulate or verbose: + print('mv {0} to {1}'.format(f, dest_dir)) + if not simulate: shutil.move(f, dest_dir) def iter_file_dups(topdirs=['./'], rethash=False, hashalg='md5', - block_size=4096): + block_size=4096, verbose=False): """Yield duplicate files when found in specified directory list. If rethash is True then tuple hash value and duplicate paths list is returned, otherwise duplicate paths list is returned. """ - dups = file_dups(topdirs, hashalg, block_size) + dups = file_dups(topdirs, hashalg, block_size, verbose) for hash_, fpaths in _dict_iter_items(dups): if rethash: yield (hash_, fpaths) @@ -136,13 +210,16 @@ def main(): import json from docopt import docopt - args = docopt(__doc__) + args = docopt(__doc__, version=" ".join(('sweeper', __version__))) topdirs = args[''] if not topdirs: topdirs = ['./'] action = args['--action'] + verbose = args['--verbose'] + + # set block size as int try: bs = int(args['--block-size']) args['--block-size'] = bs @@ -150,30 +227,38 @@ def main(): print('Invalid block size "{}"'.format(args['--block-size'])) sys.exit(1) - if args['--version']: - print("sweeper {}".format(__version__)) - return - if action == 'print' or action == 'pprint': - dups = file_dups(topdirs, args['--digest-alg'], args['--block-size']) + dups = file_dups(topdirs=topdirs, + hashalg=args['--digest-alg'], + block_size=args['--block-size'], + verbose=verbose) + # defaultdict(list) -> dict spam = dict(dups) if spam: if action == 'pprint': for h, fpaths in _dict_iter_items(spam): for path in fpaths: print(path) - print('') + if fpaths: + print('') else: print(json.dumps(spam, indent=4)) elif action == 'move': - mv_file_dups(topdirs, args['--digest-alg'], args['--block-size'], - args['--move']) + mv_file_dups(topdirs=topdirs, hashalg=args['--digest-alg'], + block_size=args['--block-size'], + dest_dir=args['--move'], + simulate=args['--simulate'], + keep_prefix=args['--keep'], + verbose=verbose) elif action == 'remove': - rm_file_dups(topdirs, args['--digest-alg'], args['--block-size']) + rm_file_dups(topdirs=topdirs, hashalg=args['--digest-alg'], + block_size=args['--block-size'], + simulate=args['--simulate'], + keep_prefix=args['--keep'], + verbose=verbose) else: print('Invalid action "{}"'.format(action)) -# if used as script call main function if __name__ == '__main__': main() diff --git a/test/test_sweeper.py b/test/test_sweeper.py index d75797a..54b0a82 100644 --- a/test/test_sweeper.py +++ b/test/test_sweeper.py @@ -11,7 +11,7 @@ mydir = os.path.dirname(os.path.realpath(__file__)) class TestSweeper(unittest.TestCase): def test_file_dups_dups(self): - dups = file_dups([os.path.join(mydir, 'testfiles_dups')], 'md5') + dups = file_dups([os.path.join(mydir, 'testfiles_dups')]) dups_exist = False for h, flist in dups.items(): if len(flist) > 1: @@ -19,7 +19,7 @@ class TestSweeper(unittest.TestCase): self.assertTrue(dups_exist) def test_file_dups_nodups(self): - dups = file_dups([os.path.join(mydir, 'testfiles_nodups')], 'md5') + dups = file_dups([os.path.join(mydir, 'testfiles_nodups')]) for h, flist in dups.items(): self.assertTrue(len(flist) == 1)