lots of improvements

This commit is contained in:
darko-poljak 2014-01-29 21:10:30 +01:00
parent 8cc0897926
commit 1be4e6ce4a
4 changed files with 124 additions and 37 deletions

1
TODO
View File

@ -1 +1,2 @@
add logging, status file examined, etc.? add logging, status file examined, etc.?
add argument mv, rm action: preserve directory prefix

View File

@ -1,9 +1,11 @@
import os import os
from setuptools import setup from setuptools import setup
def read(fname): def read(fname):
return open(os.path.join(os.path.dirname(__file__), fname)).read() return open(os.path.join(os.path.dirname(__file__), fname)).read()
setup( setup(
name='sweeper', name='sweeper',
version='0.4.0', version='0.4.0',
@ -30,4 +32,3 @@ setup(
"Topic :: Software Development :: Libraries :: Python Modules", "Topic :: Software Development :: Libraries :: Python Modules",
], ],
) )

View File

@ -19,10 +19,12 @@ Options:
print, remove, move) print, remove, move)
[default: pprint] [default: pprint]
-remove removes duplicate files -remove removes duplicate files
except first found except first or first with specified
directory prefix found
-move moves duplicate files to -move moves duplicate files to
duplicates directory, except first duplicates directory, except first
found or first with specified directory
prefix found
-print prints result dictionary where -print prints result dictionary where
keys are hash values and values are keys are hash values and values are
list of duplicate file paths list of duplicate file paths
@ -32,6 +34,15 @@ Options:
-m <directory>, --move=<directory> move duplicate files to directory -m <directory>, --move=<directory> move duplicate files to directory
(used with move action) (used with move action)
[default: ./dups] [default: ./dups]
-k <dirprefix>, --keep=<dirprefix> directory prefix for remove and move
actions
-s, --simulate if action is remove or move just
simulate action by printing, do not
actually perform the action
-V, --verbose print more info, note that verbosity
will slow down sweeper due to text
printing and additional information
gathering
""" """
__author__ = 'Darko Poljak <darko.poljak@gmail.com>' __author__ = 'Darko Poljak <darko.poljak@gmail.com>'
@ -49,16 +60,11 @@ from collections import defaultdict
# some differences in python versions # some differences in python versions
# we prefer iter methods
if sys.version_info[0] == 3: if sys.version_info[0] == 3:
def _do_encode(buf):
return buf
def _dict_iter_items(d): def _dict_iter_items(d):
return d.items() return d.items()
else: else:
def _do_encode(buf):
return buf
def _dict_iter_items(d): def _dict_iter_items(d):
return d.iteritems() return d.iteritems()
@ -71,58 +77,126 @@ def _filehash(filepath, hashalg, block_size):
md = hashlib.new(hashalg) md = hashlib.new(hashalg)
with open(filepath, "rb") as f: with open(filepath, "rb") as f:
for buf in iter(lambda: f.read(block_size), b''): for buf in iter(lambda: f.read(block_size), b''):
md.update(_do_encode(buf)) md.update(buf)
return md.hexdigest() return md.hexdigest()
def file_dups(topdirs=['./'], hashalg='md5', block_size=4096): def _uniq_list(list_):
result = []
for foo in list_:
if foo not in result:
result.append(foo)
return result
def file_dups(topdirs=['./'], hashalg='md5', block_size=4096, verbose=False):
"""Find duplicate files in directory list. Return dictionary """Find duplicate files in directory list. Return dictionary
with keys equal to file hash value and value as list of with keys equal to file hash value and value as list of
file paths whose content is the same. file paths whose content is the same.
""" """
dups = defaultdict(list) dups = defaultdict(list)
if verbose:
print('counting...', end='')
sys.stdout.flush()
count = 0
for topdir in topdirs:
for _, _, filenames in os.walk(topdir):
count += len(filenames)
current = 1
print(count)
for topdir in topdirs: for topdir in topdirs:
for dirpath, dirnames, filenames in os.walk(topdir): for dirpath, dirnames, filenames in os.walk(topdir):
for fname in filenames: for fname in filenames:
if verbose:
print('\rprocessing file {0}/{1}'.format(current, count),
end='')
sys.stdout.flush()
current += 1
fpath = os.path.join(dirpath, fname) fpath = os.path.join(dirpath, fname)
hexmd = _filehash(fpath, hashalg, block_size) hexmd = _filehash(fpath, hashalg, block_size)
dups[hexmd].append(fpath) dups[hexmd].append(fpath)
result = {k: v for k, v in _dict_iter_items(dups) if len(v) > 1} if verbose:
print('')
result = {}
for k, v in _dict_iter_items(dups):
uniq_v = _uniq_list(v)
if len(uniq_v) > 1:
result[k] = uniq_v
return result return result
def rm_file_dups(topdirs=['./'], hashalg='md5', block_size=4096): def _extract_files_for_action(topdirs, hashalg, block_size, keep_prefix,
verbose):
for files in iter_file_dups(topdirs=topdirs, hashalg=hashalg,
block_size=block_size, verbose=verbose):
found = False
if keep_prefix:
result = []
for f in files:
if f.startswith(keep_prefix) and not found:
found = True
else:
result.append(f)
if not found:
result = files[1:]
yield (files, result)
def rm_file_dups(topdirs=['./'], hashalg='md5', block_size=4096,
simulate=False, keep_prefix=None, verbose=False):
"""Remove duplicate files found in specified directory list. """Remove duplicate files found in specified directory list.
First file in list is kept. If keep_prefix is specified then first file with that path
prefix found is kept in the original directory.
Otherwise first file in list is kept in the original directory.
If simulate is True then only print the action, do not actually
perform it.
""" """
for files in do_with_file_dups(topdirs, hashalg, block_size): for dups, extracted in _extract_files_for_action(topdirs, hashalg,
for f in files: block_size,
os.remove(f) keep_prefix, verbose):
if simulate or verbose:
print('found duplicates: \n{}'.format(dups))
for f in extracted:
if simulate or verbose:
print('rm {}'.format(f))
if not simulate:
os.remove(f)
def mv_file_dups(topdirs=['./'], hashalg='md5', block_size=4096, def mv_file_dups(topdirs=['./'], hashalg='md5', block_size=4096,
dest_dir='dups'): dest_dir='dups', simulate=False, keep_prefix=None,
verbose=False):
"""Move duplicate files found in specified directory list. """Move duplicate files found in specified directory list.
First file in list is kept in the original directory. If keep_prefix is specified then first file with that path
prefix found is kept in the original directory.
Otherwise first file in list is kept in the original directory.
If simulate is True then only print the action, do not actually
perform it.
""" """
if not os.path.exists(dest_dir): if not os.path.exists(dest_dir):
os.mkdir(dest_dir) os.mkdir(dest_dir)
if not os.path.isdir(dest_dir): if not os.path.isdir(dest_dir):
raise OSError('{} is not a directory'.format(dest_dir)) raise OSError('{} is not a directory'.format(dest_dir))
import shutil import shutil
for files in do_with_file_dups(topdirs, hashalg, block_size): for dups, extracted in _extract_files_for_action(topdirs, hashalg,
for i, f in enumerate(files): block_size,
if i > 0: keep_prefix, verbose):
if simulate or verbose:
print('found duplicates: \n{}'.format(dups))
for f in extracted:
if simulate or verbose:
print('mv {0} to {1}'.format(f, dest_dir))
if not simulate:
shutil.move(f, dest_dir) shutil.move(f, dest_dir)
def iter_file_dups(topdirs=['./'], rethash=False, hashalg='md5', def iter_file_dups(topdirs=['./'], rethash=False, hashalg='md5',
block_size=4096): block_size=4096, verbose=False):
"""Yield duplicate files when found in specified directory list. """Yield duplicate files when found in specified directory list.
If rethash is True then tuple hash value and duplicate paths list is If rethash is True then tuple hash value and duplicate paths list is
returned, otherwise duplicate paths list is returned. returned, otherwise duplicate paths list is returned.
""" """
dups = file_dups(topdirs, hashalg, block_size) dups = file_dups(topdirs, hashalg, block_size, verbose)
for hash_, fpaths in _dict_iter_items(dups): for hash_, fpaths in _dict_iter_items(dups):
if rethash: if rethash:
yield (hash_, fpaths) yield (hash_, fpaths)
@ -136,13 +210,16 @@ def main():
import json import json
from docopt import docopt from docopt import docopt
args = docopt(__doc__) args = docopt(__doc__, version=" ".join(('sweeper', __version__)))
topdirs = args['<directory>'] topdirs = args['<directory>']
if not topdirs: if not topdirs:
topdirs = ['./'] topdirs = ['./']
action = args['--action'] action = args['--action']
verbose = args['--verbose']
# set block size as int
try: try:
bs = int(args['--block-size']) bs = int(args['--block-size'])
args['--block-size'] = bs args['--block-size'] = bs
@ -150,30 +227,38 @@ def main():
print('Invalid block size "{}"'.format(args['--block-size'])) print('Invalid block size "{}"'.format(args['--block-size']))
sys.exit(1) sys.exit(1)
if args['--version']:
print("sweeper {}".format(__version__))
return
if action == 'print' or action == 'pprint': if action == 'print' or action == 'pprint':
dups = file_dups(topdirs, args['--digest-alg'], args['--block-size']) dups = file_dups(topdirs=topdirs,
hashalg=args['--digest-alg'],
block_size=args['--block-size'],
verbose=verbose)
# defaultdict(list) -> dict
spam = dict(dups) spam = dict(dups)
if spam: if spam:
if action == 'pprint': if action == 'pprint':
for h, fpaths in _dict_iter_items(spam): for h, fpaths in _dict_iter_items(spam):
for path in fpaths: for path in fpaths:
print(path) print(path)
print('') if fpaths:
print('')
else: else:
print(json.dumps(spam, indent=4)) print(json.dumps(spam, indent=4))
elif action == 'move': elif action == 'move':
mv_file_dups(topdirs, args['--digest-alg'], args['--block-size'], mv_file_dups(topdirs=topdirs, hashalg=args['--digest-alg'],
args['--move']) block_size=args['--block-size'],
dest_dir=args['--move'],
simulate=args['--simulate'],
keep_prefix=args['--keep'],
verbose=verbose)
elif action == 'remove': elif action == 'remove':
rm_file_dups(topdirs, args['--digest-alg'], args['--block-size']) rm_file_dups(topdirs=topdirs, hashalg=args['--digest-alg'],
block_size=args['--block-size'],
simulate=args['--simulate'],
keep_prefix=args['--keep'],
verbose=verbose)
else: else:
print('Invalid action "{}"'.format(action)) print('Invalid action "{}"'.format(action))
# if used as script call main function
if __name__ == '__main__': if __name__ == '__main__':
main() main()

View File

@ -11,7 +11,7 @@ mydir = os.path.dirname(os.path.realpath(__file__))
class TestSweeper(unittest.TestCase): class TestSweeper(unittest.TestCase):
def test_file_dups_dups(self): def test_file_dups_dups(self):
dups = file_dups([os.path.join(mydir, 'testfiles_dups')], 'md5') dups = file_dups([os.path.join(mydir, 'testfiles_dups')])
dups_exist = False dups_exist = False
for h, flist in dups.items(): for h, flist in dups.items():
if len(flist) > 1: if len(flist) > 1:
@ -19,7 +19,7 @@ class TestSweeper(unittest.TestCase):
self.assertTrue(dups_exist) self.assertTrue(dups_exist)
def test_file_dups_nodups(self): def test_file_dups_nodups(self):
dups = file_dups([os.path.join(mydir, 'testfiles_nodups')], 'md5') dups = file_dups([os.path.join(mydir, 'testfiles_nodups')])
for h, flist in dups.items(): for h, flist in dups.items():
self.assertTrue(len(flist) == 1) self.assertTrue(len(flist) == 1)