Rewritten as class Sweeper, with code improvements and optimizations.

This commit is contained in:
Darko Poljak 2014-08-10 00:06:32 +02:00
parent 41cd0fe6c6
commit 0c04f67b93
5 changed files with 243 additions and 264 deletions

View File

@ -10,28 +10,31 @@ Print duplicates
.. code:: python
from sweeper import file_dups
dups = file_dups(['images1', 'images2'])
from sweeper import Sweeper
swp = Sweeper(['images1', 'images2'])
dups = swp.file_dups()
print(dups)
Remove duplicate files
.. code:: python
from sweeper import rm_file_dups
rm_file_dups(['images'])
from sweeper import Sweeper
swp = Sweeper(['images1', 'images2'])
swp.rm()
Perform custom action
.. code:: python
from sweeper import iter_file_dups
for f, h, dups in iter_file_dups(['images']):
from sweeper import Sweeper
swp = Sweeper(['images'])
for f, h, dups in swp:
print('encountered {} which duplicates with already found duplicate files {} with hash {}'.format(f, dups, h))
As script::
python sweeper.py --help
python -m sweeper.sweeper --help
As installed console script::

2
TODO
View File

@ -1,2 +0,0 @@
* Play it safe and add byte by byte comparison option for hash dup files?
Or use one more, different, hash algorithm?

View File

@ -1,4 +1,4 @@
from __future__ import absolute_import
from .sweeper import file_dups, mv_file_dups, rm_file_dups, iter_file_dups
from .sweeper import Sweeper
__all__ = ['file_dups', 'mv_file_dups', 'rm_file_dups', 'iter_file_dups']
__all__ = ['Sweeper']

View File

@ -57,21 +57,18 @@ Options:
from __future__ import print_function
__author__ = 'Darko Poljak <darko.poljak@gmail.com>'
__version__ = '0.6.0'
__version__ = '0.9.0'
__license__ = 'GPLv3'
__all__ = [
'file_dups', 'rm_file_dups', 'mv_file_dups', 'iter_file_dups'
]
__all__ = ['Sweeper']
import sys
import hashlib
import os
from collections import defaultdict
from functools import partial
import hashlib
DEF_HASHALGS = ['sha1']
# some differences in python versions
# we prefer iter methods
if sys.version_info[0] == 3:
@ -109,40 +106,7 @@ def _filehash(filepath, hashalg, block_size):
def _uniq_list(list_):
return set(list_)
def _gather_file_list(dirs):
'''Gather file paths in directory list dirs.
Return tuple (count, files) where count is files
list length and files is list of file paths in
specified directories.
'''
count = 0
files = []
for dir_ in dirs:
for dirpath, dirnames, filenames in os.walk(dir_):
count += len(filenames)
# replace fpath with realpath value (eliminate symbolic links)
files.extend([os.path.realpath(os.path.join(dirpath, fname))
for fname in filenames])
return (count, files)
# iter through file paths in files list
def _files_iter_from_list(files):
for fpath in files:
yield fpath
# iter through file paths by os.walking
def _files_iter_from_disk(topdirs):
for topdir in topdirs:
for dirpath, dirnames, filenames in os.walk(topdir):
for fname in filenames:
# replace fpath with realpath value (eliminate symbolic links)
fpath = os.path.realpath(os.path.join(dirpath, fname))
yield fpath
return list(set(list_))
def _fbequal(fpath1, fpath2):
@ -166,8 +130,63 @@ def _fbequal(fpath1, fpath2):
return False
def file_dups(topdirs=['./'], hashalgs=DEF_HASHALGS, block_size=4096, verbose=False,
safe_mode=False):
def _remap_keys_to_str(d):
    '''Yield (key, value) pairs of d with sequence keys turned into strings.

    A key that is a tuple or a list is replaced by its items joined
    with ','. Any other key is passed through unchanged. Values are
    looked up in d and yielded as they are.
    '''
    for orig_key in _dict_iter_keys(d):
        is_sequence = isinstance(orig_key, (tuple, list))
        str_key = ','.join(orig_key) if is_sequence else orig_key
        yield (str_key, d[orig_key])
def _gather_file_list(dirs):
    '''Gather file paths in directory list dirs.

    Walk every directory in dirs recursively and return a list holding
    the real path (symbolic links resolved) of each file found.
    Only the list is returned; its length is the file count.
    An empty dirs yields an empty list.
    '''
    files = []
    for dir_ in dirs:
        for dirpath, _dirnames, filenames in os.walk(dir_):
            # replace fpath with realpath value (eliminate symbolic links)
            files.extend(os.path.realpath(os.path.join(dirpath, fname))
                         for fname in filenames)
    return files
class Sweeper(object):
    """Find duplicate files under a list of top level directories.

    Attributes set by the constructor:
        topdirs    -- list of directory paths with symbolic links resolved
        hashalgs   -- list of hash algorithm names
        block_size -- block size handed to the file hashing helper
        verbose    -- flag, stored as given
        safe_mode  -- flag, stored as given
    """

    DEF_HASHALGS = ['sha1']

    def __init__(self, topdirs=['./'], hashalgs=DEF_HASHALGS,
                 block_size=4096, verbose=False, safe_mode=False):
        """Store the sweep configuration.

        topdirs is any iterable of directory paths. Neither topdirs nor
        hashalgs is modified or kept by reference, so the list defaults
        in the signature are never mutated.
        """
        # replace dir paths with realpath value (eliminate symbolic links);
        # a new list is built, the caller's list stays untouched
        self.topdirs = [os.path.realpath(topdir) for topdir in topdirs]
        # copy: otherwise the instance would share the class level
        # DEF_HASHALGS list and changing one would change the other
        self.hashalgs = list(hashalgs)
        self.block_size = block_size
        self.verbose = verbose
        self.safe_mode = safe_mode

    # iter through file paths in files list
    def _files_iter_from_list(self, files):
        """Return an iterator over the already gathered paths in files."""
        return iter(files)

    # iter through file paths by os.walking
    def _files_iter_from_disk(self):
        """Yield the real path of every file found below self.topdirs."""
        for topdir in self.topdirs:
            for dirpath, _dirnames, filenames in os.walk(topdir):
                for fname in filenames:
                    # replace fpath with realpath value
                    # (eliminate symbolic links)
                    yield os.path.realpath(os.path.join(dirpath, fname))
def file_dups(self):
"""Find duplicate files in directory list. Return directory
with keys equal to file hash value and value as list of
file paths whose content is the same.
@ -175,55 +194,54 @@ def file_dups(topdirs=['./'], hashalgs=DEF_HASHALGS, block_size=4096, verbose=Fa
by byte comparison for hash duplicate files.
"""
dups = defaultdict(list)
# replace dir paths with realpath value (eliminate symbolic links)
for i in range(len(topdirs)):
topdirs[i] = os.path.realpath(topdirs[i])
if verbose:
if safe_mode:
if self.verbose:
if self.safe_mode:
print('safe mode is on')
print('gathering and counting files...', end='')
sys.stdout.flush()
count, files = _gather_file_list(topdirs)
files = _gather_file_list(self.topdirs)
count = len(files)
current = 1
print(count)
_files_iter = partial(_files_iter_from_list, files)
_files_iter = partial(self._files_iter_from_list, files)
else:
_files_iter = partial(_files_iter_from_disk, topdirs)
_files_iter = self._files_iter_from_disk
for fpath in _files_iter():
if verbose:
if self.verbose:
print('\rprocessing file {0}/{1}: calc hash'.format(current,
count),
end='')
sys.stdout.flush()
hexmds = [_filehash(fpath, h, block_size) for h in hashalgs]
hexmds = [_filehash(fpath, h, self.block_size)
for h in self.hashalgs]
hexmd = tuple(hexmds)
dup_files = dups[hexmd]
files_equals = False
if safe_mode:
if self.safe_mode:
if dup_files:
if verbose:
print('\rprocessing file {0}/{1}: byte cmp'.format(current,
count),
end='')
if self.verbose:
print('\rprocessing file {0}/{1}: byte cmp'.format(
current, count), end='')
sys.stdout.flush()
for f in dup_files:
if _fbequal(f, fpath):
files_equals = True
break
if verbose and not files_equals:
print('\nsame hash value {} but not same bytes for file {}'
' with files {}'.format(hexmd, fpath, dup_files))
if self.verbose and not files_equals:
print('\nsame hash value {} but not same bytes for'
' file {} with files {}'.format(
hexmd, fpath, dup_files))
else: # when list is empty in safe mode
files_equals = True
else:
files_equals = True # when safe mode is off
if verbose:
if self.verbose:
current += 1
if files_equals:
dups[hexmd].append(fpath)
if verbose:
if self.verbose:
print('')
# make result dict with unique file paths list
result = {}
@ -233,9 +251,7 @@ def file_dups(topdirs=['./'], hashalgs=DEF_HASHALGS, block_size=4096, verbose=Fa
result[k] = uniq_v
return result
def iter_file_dups(topdirs=['./'], hashalgs=DEF_HASHALGS, block_size=4096,
safe_mode=False):
def __iter__(self):
"""Find duplicate files in directory list.
Yield tuple of file path, hash tuple and list of duplicate files
as soon as duplicate file is found.
@ -249,17 +265,15 @@ def iter_file_dups(topdirs=['./'], hashalgs=DEF_HASHALGS, block_size=4096,
"""
# internally, file dups dict is still maintained
dups = defaultdict(list)
# replace dir paths with realpath value (eliminate symbolic links)
for i in range(len(topdirs)):
topdirs[i] = os.path.realpath(topdirs[i])
_files_iter = partial(_files_iter_from_disk, topdirs)
_files_iter = self._files_iter_from_disk
for fpath in _files_iter():
hexmds = [_filehash(fpath, h, block_size) for h in hashalgs]
hexmds = [_filehash(fpath, h, self.block_size)
for h in self.hashalgs]
hexmd = tuple(hexmds)
dup_files = dups[hexmd]
# there were dup list elements (used for yield)
if safe_mode and dup_files:
if self.safe_mode and dup_files:
# compare only with first file in dup_files
# all files in dup_files list are already content equal
files_equals = _fbequal(dup_files[0], fpath)
@ -272,12 +286,9 @@ def iter_file_dups(topdirs=['./'], hashalgs=DEF_HASHALGS, block_size=4096,
# finally append newly found file to dup list
dups[hexmd].append(fpath)
def _extract_files_for_action(topdirs, hashalgs, block_size, keep_prefix,
verbose, safe_mode):
for files in _dict_iter_values(file_dups(topdirs=topdirs,
hashalgs=hashalgs, block_size=block_size,
verbose=verbose, safe_mode=safe_mode)):
def _extract_files_for_action(self, keep_prefix):
dups = self.file_dups()
for files in _dict_iter_values(dups):
found = False
if keep_prefix:
result = []
@ -287,13 +298,20 @@ def _extract_files_for_action(topdirs, hashalgs, block_size, keep_prefix,
else:
result.append(f)
if not found:
result = files[1:]
result = list(files)[1:]
yield (files, result)
def _do_action(self, simulate, keep_prefix, action, action_str):
    """Apply action to every duplicate file selected for processing.

    For each pair produced by _extract_files_for_action(keep_prefix)
    the whole duplicate group is reported first. Then, file by file,
    action_str formatted with the file path is reported and
    action(file) is called. Reporting happens when simulate or
    self.verbose is true; action is called only when simulate is false.
    """
    groups = self._extract_files_for_action(keep_prefix)
    for all_files, selected in groups:
        if simulate or self.verbose:
            print('found duplicates: \n{}'.format(all_files))
        for fpath in selected:
            if simulate or self.verbose:
                message = action_str.format(fpath)
                print(message)
            if simulate:
                # simulation only reports, nothing is touched
                continue
            action(fpath)
def rm_file_dups(topdirs=['./'], hashalgs=DEF_HASHALGS, block_size=4096,
simulate=False, keep_prefix=None, verbose=False,
safe_mode=False):
def rm(self, simulate=False, keep_prefix=None):
"""Remove duplicate files found in specified directory list.
If keep_prefix is specified then first file with that path
prefix found is kept in the original directory.
@ -303,21 +321,9 @@ def rm_file_dups(topdirs=['./'], hashalgs=DEF_HASHALGS, block_size=4096,
If safe_mode is true then do byte by byte comparison for
hash duplicate files.
"""
for dups, extracted in _extract_files_for_action(topdirs, hashalgs,
block_size, keep_prefix,
verbose, safe_mode):
if simulate or verbose:
print('found duplicates: \n{}'.format(dups))
for f in extracted:
if simulate or verbose:
print('rm {}'.format(f))
if not simulate:
os.remove(f)
self._do_action(simulate, keep_prefix, os.remove, 'rm {}')
def mv_file_dups(topdirs=['./'], hashalgs=DEF_HASHALGS, block_size=4096,
dest_dir='dups', simulate=False, keep_prefix=None,
verbose=False, safe_mode=False):
def mv(self, dest_dir='dups', simulate=False, keep_prefix=None):
"""Move duplicate files found in specified directory list.
If keep_prefix is specified then first file with that path
prefix found is kept in the original directory.
@ -340,28 +346,9 @@ def mv_file_dups(topdirs=['./'], hashalgs=DEF_HASHALGS, block_size=4096,
print('would raise:', errmsg)
else:
raise OSError(errmsg)
for dups, extracted in _extract_files_for_action(topdirs, hashalgs,
block_size, keep_prefix,
verbose, safe_mode):
if simulate or verbose:
print('found duplicates: \n{}'.format(dups))
for f in extracted:
if simulate or verbose:
print('mv {0} to {1}'.format(f, dest_dir))
if not simulate:
shutil.move(f, dest_dir)
def _remap_keys_to_str(d):
'''Iterator that remaps dictionary keys to string in case keys are tuple
or list. Leave it unchanged otherwise.
'''
for k in _dict_iter_keys(d):
if isinstance(k, tuple) or isinstance(k, list):
key = ','.join(k)
else:
key = k
yield (key, d[k])
self._do_action(simulate, keep_prefix,
partial(shutil.move, dst=dest_dir),
'mv {0} to ' + dest_dir)
def main():
@ -398,12 +385,11 @@ def main():
dest_dir = args['--move']
safe_mode = args['--safe-mode']
if action == 'print' or action == 'pprint':
dups = file_dups(topdirs=topdirs,
hashalgs=hashalgs,
block_size=block_size,
verbose=verbose,
sweeper = Sweeper(topdirs=topdirs, hashalgs=hashalgs,
block_size=block_size, verbose=verbose,
safe_mode=safe_mode)
if action == 'print' or action == 'pprint':
dups = sweeper.file_dups()
# defaultdict(list) -> dict
spam = dict(dups)
if spam:
@ -417,20 +403,9 @@ def main():
print(json.dumps({k: v for k, v in _remap_keys_to_str(spam)},
indent=4))
elif action == 'move':
mv_file_dups(topdirs=topdirs, hashalgs=hashalgs,
block_size=block_size,
dest_dir=dest_dir,
simulate=simulate,
keep_prefix=keep_prefix,
verbose=verbose,
safe_mode=safe_mode)
sweeper.mv(dest_dir, simulate, keep_prefix)
elif action == 'remove':
rm_file_dups(topdirs=topdirs, hashalgs=hashalgs,
block_size=block_size,
simulate=simulate,
keep_prefix=keep_prefix,
verbose=verbose,
safe_mode=safe_mode)
sweeper.rm(simulate, keep_prefix)
else:
print('Invalid action "{}"'.format(action))

View File

@ -3,7 +3,7 @@
# License: GPLv3
import unittest
from sweeper import file_dups, iter_file_dups
from sweeper import Sweeper
import os
mydir = os.path.dirname(os.path.realpath(__file__))
@ -11,7 +11,8 @@ mydir = os.path.dirname(os.path.realpath(__file__))
class TestSweeper(unittest.TestCase):
def test_file_dups_dups(self):
dups = file_dups([os.path.join(mydir, 'testfiles_dups')])
swp = Sweeper(topdirs=[os.path.join(mydir, 'testfiles_dups')])
dups = swp.file_dups()
dups_exist = False
for h, flist in dups.items():
if len(flist) > 1:
@ -19,24 +20,26 @@ class TestSweeper(unittest.TestCase):
self.assertTrue(dups_exist)
def test_file_dups_nodups(self):
dups = file_dups([os.path.join(mydir, 'testfiles_nodups')])
swp = Sweeper(topdirs=[os.path.join(mydir, 'testfiles_nodups')])
dups = swp.file_dups()
for h, flist in dups.items():
self.assertTrue(len(flist) == 1)
# does not actually test safe_mode, we would need to find
# hash collision
def test_file_dups_safe_mode(self):
dups = file_dups([os.path.join(mydir, 'testfiles_dups')],
swp = Sweeper(topdirs=[os.path.join(mydir, 'testfiles_dups')],
safe_mode=True)
dups = swp.file_dups()
for h, flist in dups.items():
if len(flist) > 1:
dups_exist = True
self.assertTrue(dups_exist)
def test_iter_file_dups_dups(self):
it = iter_file_dups([os.path.join(mydir, 'testfiles_dups')])
swp = Sweeper(topdirs=[os.path.join(mydir, 'testfiles_dups')])
dups_exist = False
for x in it:
for x in swp:
dups_exist = True
filepath, h, dups = x
self.assertNotIn(filepath, dups)
@ -44,9 +47,9 @@ class TestSweeper(unittest.TestCase):
self.assertTrue(dups_exist)
def test_iter_file_dups_nodups(self):
it = iter_file_dups([os.path.join(mydir, 'testfiles_nodups')])
swp = Sweeper([os.path.join(mydir, 'testfiles_nodups')])
dups_exist = False
for x in it:
for x in swp:
dups_exist = True
break
self.assertFalse(dups_exist)