Rewritten as class Sweeper with code improvements and optimizations.

This commit is contained in:
Darko Poljak 2014-08-10 00:06:32 +02:00
parent 41cd0fe6c6
commit 0c04f67b93
5 changed files with 243 additions and 264 deletions


@@ -10,28 +10,31 @@ Print duplicates

 .. code:: python

-    from sweeper import file_dups
-    dups = file_dups(['images1', 'images2'])
+    from sweeper import Sweeper
+    swp = Sweeper(['images1', 'images2'])
+    dups = swp.file_dups()
     print(dups)

 Remove duplicate files

 .. code:: python

-    from sweeper import rm_file_dups
-    rm_file_dups(['images'])
+    from sweeper import Sweeper
+    swp = Sweeper(['images1', 'images2'])
+    swp.rm()

 Perform custom action

 .. code:: python

-    from sweeper import iter_file_dups
-    for f, h, dups in iter_file_dups(['images']):
+    from sweeper import Sweeper
+    swp = Sweeper(['images'])
+    for f, h, dups in swp:
         print('encountered {} which duplicates with already found duplicate files {} with hash {}'.format(f, dups, h))

 As script::

-    python sweeper.py --help
+    python -m sweeper/sweeper --help

 As installed console script::
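
Taken together, the README examples now route everything through a single object. A minimal end-to-end sketch of the new API, assuming the constructor defaults shown in the sweeper.py diff further below:

.. code:: python

    from sweeper import Sweeper

    # Constructor arguments mirror the old module-level function parameters.
    swp = Sweeper(['images1', 'images2'],
                  hashalgs=['sha1'],  # one hex digest per algorithm
                  block_size=4096,    # read size used while hashing
                  safe_mode=True)     # confirm hash matches byte by byte
    dups = swp.file_dups()            # dict: hash tuple -> list of paths
    swp.rm(simulate=True)             # dry run: only prints 'rm <path>'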

TODO

@@ -1,2 +0,0 @@
-* Play it safe and add byte by byte comparison option for hash dup files?
-  Or use one more, different, hash algorithm?


@@ -1,4 +1,4 @@
 from __future__ import absolute_import
-from .sweeper import file_dups, mv_file_dups, rm_file_dups, iter_file_dups
-__all__ = ['file_dups', 'mv_file_dups', 'rm_file_dups', 'iter_file_dups']
+from .sweeper import Sweeper
+__all__ = ['Sweeper']


@@ -57,21 +57,18 @@ Options:
 from __future__ import print_function

 __author__ = 'Darko Poljak <darko.poljak@gmail.com>'
-__version__ = '0.6.0'
+__version__ = '0.9.0'
 __license__ = 'GPLv3'

-__all__ = [
-    'file_dups', 'rm_file_dups', 'mv_file_dups', 'iter_file_dups'
-]
+__all__ = ['Sweeper']

 import sys
-import hashlib
 import os
 from collections import defaultdict
 from functools import partial
+import hashlib

-DEF_HASHALGS = ['sha1']
 # some differences in python versions
 # we prefer iter methods
 if sys.version_info[0] == 3:
@@ -109,40 +106,7 @@ def _filehash(filepath, hashalg, block_size):

 def _uniq_list(list_):
-    return set(list_)
+    return list(set(list_))

-def _gather_file_list(dirs):
-    '''Gather file paths in directory list dirs.
-       Return tuple (count, files) where count is files
-       list length and files is list of file paths in
-       specified directories.
-    '''
-    count = 0
-    files = []
-    for dir_ in dirs:
-        for dirpath, dirnames, filenames in os.walk(dir_):
-            count += len(filenames)
-            # replace fpath with realpath value (eliminate symbolic links)
-            files.extend([os.path.realpath(os.path.join(dirpath, fname))
-                          for fname in filenames])
-    return (count, files)

-# iter through file paths in files list
-def _files_iter_from_list(files):
-    for fpath in files:
-        yield fpath

-# iter through file paths by os.walking
-def _files_iter_from_disk(topdirs):
-    for topdir in topdirs:
-        for dirpath, dirnames, filenames in os.walk(topdir):
-            for fname in filenames:
-                # replace fpath with realpath value (eliminate symbolic links)
-                fpath = os.path.realpath(os.path.join(dirpath, fname))
-                yield fpath

 def _fbequal(fpath1, fpath2):
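
Only the tail of _fbequal appears in this diff. For orientation, a minimal sketch of such a byte-by-byte file comparison helper (an assumption; the real body is not shown here) could look like:

.. code:: python

    def _fbequal(fpath1, fpath2, block_size=4096):
        '''Return True only if the two files have identical contents.'''
        with open(fpath1, 'rb') as f1, open(fpath2, 'rb') as f2:
            while True:
                b1 = f1.read(block_size)
                b2 = f2.read(block_size)
                if b1 != b2:   # differing block: contents differ
                    return False
                if not b1:     # both files exhausted at the same point
                    return True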
@@ -166,8 +130,63 @@ def _fbequal(fpath1, fpath2):
     return False


-def file_dups(topdirs=['./'], hashalgs=DEF_HASHALGS, block_size=4096, verbose=False,
-              safe_mode=False):
+def _remap_keys_to_str(d):
+    '''Iterator that remaps dictionary keys to string in case keys are tuple
+       or list. Leave it unchanged otherwise.
+       Yields string key, value pairs.
+    '''
+    for k in _dict_iter_keys(d):
+        if isinstance(k, tuple) or isinstance(k, list):
+            key = ','.join(k)
+        else:
+            key = k
+        yield (key, d[k])
+
+
+def _gather_file_list(dirs):
+    '''Gather file paths in directory list dirs.
+       Return the list of file paths found in the
+       specified directories.
+    '''
+    files = []
+    for dir_ in dirs:
+        for dirpath, dirnames, filenames in os.walk(dir_):
+            # replace fpath with realpath value (eliminate symbolic links)
+            files.extend([os.path.realpath(os.path.join(dirpath, fname))
+                          for fname in filenames])
+    return files
+
+
+class Sweeper(object):
+    DEF_HASHALGS = ['sha1']
+
+    def __init__(self, topdirs=['./'], hashalgs=DEF_HASHALGS,
+                 block_size=4096, verbose=False, safe_mode=False):
+        # replace dir paths with realpath value (eliminate symbolic links)
+        self.topdirs = []
+        for i in range(len(topdirs)):
+            self.topdirs.append(os.path.realpath(topdirs[i]))
+        self.hashalgs = hashalgs
+        self.block_size = block_size
+        self.verbose = verbose
+        self.safe_mode = safe_mode
+
+    # iter through file paths in files list
+    def _files_iter_from_list(self, files):
+        return (fpath for fpath in files)
+
+    # iter through file paths by os.walking
+    def _files_iter_from_disk(self):
+        for topdir in self.topdirs:
+            for dirpath, dirnames, filenames in os.walk(topdir):
+                for fname in filenames:
+                    # replace fpath with realpath value
+                    # (eliminate symbolic links)
+                    fpath = os.path.realpath(os.path.join(dirpath, fname))
+                    yield fpath
+
+    def file_dups(self):
         """Find duplicate files in directory list. Return dictionary
         with keys equal to file hash value and value as list of
         file paths whose content is the same.
@@ -175,55 +194,54 @@ def file_dups(topdirs=['./'], hashalgs=DEF_HASHALGS, block_size=4096, verbose=False,
        by byte comparison for hash duplicate files.
        """
        dups = defaultdict(list)
-       # replace dir paths with realpath value (eliminate symbolic links)
-       for i in range(len(topdirs)):
-           topdirs[i] = os.path.realpath(topdirs[i])
-       if verbose:
-           if safe_mode:
+       if self.verbose:
+           if self.safe_mode:
                print('safe mode is on')
            print('gathering and counting files...', end='')
            sys.stdout.flush()
-           count, files = _gather_file_list(topdirs)
+           files = _gather_file_list(self.topdirs)
+           count = len(files)
            current = 1
            print(count)
-           _files_iter = partial(_files_iter_from_list, files)
+           _files_iter = partial(self._files_iter_from_list, files)
        else:
-           _files_iter = partial(_files_iter_from_disk, topdirs)
+           _files_iter = self._files_iter_from_disk

        for fpath in _files_iter():
-           if verbose:
+           if self.verbose:
                print('\rprocessing file {0}/{1}: calc hash'.format(current,
                                                                    count),
                      end='')
                sys.stdout.flush()
-           hexmds = [_filehash(fpath, h, block_size) for h in hashalgs]
+           hexmds = [_filehash(fpath, h, self.block_size)
+                     for h in self.hashalgs]
            hexmd = tuple(hexmds)
            dup_files = dups[hexmd]
            files_equals = False
-           if safe_mode:
+           if self.safe_mode:
                if dup_files:
-                   if verbose:
-                       print('\rprocessing file {0}/{1}: byte cmp'.format(current,
-                                                                          count),
-                             end='')
+                   if self.verbose:
+                       print('\rprocessing file {0}/{1}: byte cmp'.format(
+                           current, count), end='')
                        sys.stdout.flush()
                    for f in dup_files:
                        if _fbequal(f, fpath):
                            files_equals = True
                            break
-                   if verbose and not files_equals:
-                       print('\nsame hash value {} but not same bytes for file {}'
-                             ' with files {}'.format(hexmd, fpath, dup_files))
+                   if self.verbose and not files_equals:
+                       print('\nsame hash value {} but not same bytes for'
+                             ' file {} with files {}'.format(
+                                 hexmd, fpath, dup_files))
                else:  # when list is empty in safe mode
                    files_equals = True
            else:
                files_equals = True  # when safe mode is off
-           if verbose:
+           if self.verbose:
                current += 1
            if files_equals:
                dups[hexmd].append(fpath)
-       if verbose:
+       if self.verbose:
            print('')

        # make result dict with unique file paths list
        result = {}
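
The two branches above end up behind a single zero-argument callable: in verbose mode the file list is gathered up front so a total count can be printed, otherwise the disk is walked lazily. A standalone sketch of the pattern, with placeholder paths:

.. code:: python

    from functools import partial

    def files_iter_from_list(files):
        return (fpath for fpath in files)

    def files_iter_from_disk():
        # stand-in for the os.walk-based generator in the diff
        for fpath in ('/tmp/a', '/tmp/b'):
            yield fpath

    verbose = True
    if verbose:
        files = ['/tmp/a', '/tmp/b']  # gathered up front, len() is known
        files_iter = partial(files_iter_from_list, files)
    else:
        files_iter = files_iter_from_disk
    for fpath in files_iter():        # same call shape either way
        print(fpath)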
@@ -233,9 +251,7 @@ def file_dups(topdirs=['./'], hashalgs=DEF_HASHALGS, block_size=4096, verbose=False,
            result[k] = uniq_v
        return result

-def iter_file_dups(topdirs=['./'], hashalgs=DEF_HASHALGS, block_size=4096,
-                   safe_mode=False):
+    def __iter__(self):
        """Find duplicate files in directory list.
        Yield tuple of file path, hash tuple and list of duplicate files
        as soon as duplicate file is found.

@@ -249,17 +265,15 @@ def iter_file_dups(topdirs=['./'], hashalgs=DEF_HASHALGS, block_size=4096,
        """
        # internally, file dups dict is still maintained
        dups = defaultdict(list)
-       # replace dir paths with realpath value (eliminate symbolic links)
-       for i in range(len(topdirs)):
-           topdirs[i] = os.path.realpath(topdirs[i])
-       _files_iter = partial(_files_iter_from_disk, topdirs)
+       _files_iter = self._files_iter_from_disk
        for fpath in _files_iter():
-           hexmds = [_filehash(fpath, h, block_size) for h in hashalgs]
+           hexmds = [_filehash(fpath, h, self.block_size)
+                     for h in self.hashalgs]
            hexmd = tuple(hexmds)
            dup_files = dups[hexmd]
            # there were dup list elements (used for yield)
-           if safe_mode and dup_files:
+           if self.safe_mode and dup_files:
                # compare only with first file in dup_files
                # all files in dup_files list are already content equal
                files_equals = _fbequal(dup_files[0], fpath)
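
Since hexmd is a tuple holding one digest per configured algorithm, both the file_dups() keys and the tuples yielded by __iter__ carry hash tuples rather than plain strings. A sketch of the result shape, with a hypothetical digest:

.. code:: python

    # hypothetical file_dups() result for hashalgs=['sha1']
    dups = {
        ('da39a3ee5e6b4b0d3255bfef95601890afd80709',): [
            '/images1/a.jpg',
            '/images2/copy_of_a.jpg',
        ],
    }
    for hexmd, paths in dups.items():
        if len(paths) > 1:
            print('hash {}: duplicate files {}'.format(hexmd, paths))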
@@ -272,12 +286,9 @@ def iter_file_dups(topdirs=['./'], hashalgs=DEF_HASHALGS, block_size=4096,
            # finally append newly found file to dup list
            dups[hexmd].append(fpath)

-def _extract_files_for_action(topdirs, hashalgs, block_size, keep_prefix,
-                              verbose, safe_mode):
-    for files in _dict_iter_values(file_dups(topdirs=topdirs,
-                                   hashalgs=hashalgs, block_size=block_size,
-                                   verbose=verbose, safe_mode=safe_mode)):
+    def _extract_files_for_action(self, keep_prefix):
+        dups = self.file_dups()
+        for files in _dict_iter_values(dups):
            found = False
            if keep_prefix:
                result = []
@@ -287,13 +298,20 @@ def _extract_files_for_action(topdirs, hashalgs, block_size, keep_prefix,
            else:
                result.append(f)
        if not found:
-           result = files[1:]
+           result = list(files)[1:]
        yield (files, result)

+    def _do_action(self, simulate, keep_prefix, action, action_str):
+        for dups, extracted in self._extract_files_for_action(keep_prefix):
+            if simulate or self.verbose:
+                print('found duplicates: \n{}'.format(dups))
+            for f in extracted:
+                if simulate or self.verbose:
+                    print(action_str.format(f))
+                if not simulate:
+                    action(f)

-def rm_file_dups(topdirs=['./'], hashalgs=DEF_HASHALGS, block_size=4096,
-                 simulate=False, keep_prefix=None, verbose=False,
-                 safe_mode=False):
+    def rm(self, simulate=False, keep_prefix=None):
        """Remove duplicate files found in specified directory list.
        If keep_prefix is specified then first file with that path
        prefix found is kept in the original directory.
@@ -303,21 +321,9 @@ def rm_file_dups(topdirs=['./'], hashalgs=DEF_HASHALGS, block_size=4096,
        If safe_mode is true then do byte by byte comparison for
        hash duplicate files.
        """
-    for dups, extracted in _extract_files_for_action(topdirs, hashalgs,
-                                                     block_size, keep_prefix,
-                                                     verbose, safe_mode):
-        if simulate or verbose:
-            print('found duplicates: \n{}'.format(dups))
-        for f in extracted:
-            if simulate or verbose:
-                print('rm {}'.format(f))
-            if not simulate:
-                os.remove(f)
+        self._do_action(simulate, keep_prefix, os.remove, 'rm {}')

-def mv_file_dups(topdirs=['./'], hashalgs=DEF_HASHALGS, block_size=4096,
-                 dest_dir='dups', simulate=False, keep_prefix=None,
-                 verbose=False, safe_mode=False):
+    def mv(self, dest_dir='dups', simulate=False, keep_prefix=None):
        """Move duplicate files found in specified directory list.
        If keep_prefix is specified then first file with that path
        prefix found is kept in the original directory.
@@ -340,28 +346,9 @@ def mv_file_dups(topdirs=['./'], hashalgs=DEF_HASHALGS, block_size=4096,
                print('would raise:', errmsg)
            else:
                raise OSError(errmsg)
-    for dups, extracted in _extract_files_for_action(topdirs, hashalgs,
-                                                     block_size, keep_prefix,
-                                                     verbose, safe_mode):
-        if simulate or verbose:
-            print('found duplicates: \n{}'.format(dups))
-        for f in extracted:
-            if simulate or verbose:
-                print('mv {0} to {1}'.format(f, dest_dir))
-            if not simulate:
-                shutil.move(f, dest_dir)
+        self._do_action(simulate, keep_prefix,
+                        partial(shutil.move, dst=dest_dir),
+                        'mv {0} to ' + dest_dir)

-def _remap_keys_to_str(d):
-    '''Iterator that remaps dictionary keys to string in case keys are tuple
-       or list. Leave it unchanged otherwise.
-    '''
-    for k in _dict_iter_keys(d):
-        if isinstance(k, tuple) or isinstance(k, list):
-            key = ','.join(k)
-        else:
-            key = k
-        yield (key, d[k])

 def main():
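
The new _do_action method factors the identical simulate/verbose/apply loop out of rm and mv: the action is any callable taking a single path, and mv binds its destination with functools.partial. A standalone sketch of the pattern (simulate is True, so nothing touches the filesystem):

.. code:: python

    from functools import partial
    import os
    import shutil

    def do_action(extracted, simulate, action, action_str):
        # generic "print or apply" loop, mirroring Sweeper._do_action
        for f in extracted:
            if simulate:
                print(action_str.format(f))
            else:
                action(f)

    do_action(['a.txt', 'b.txt'], True, os.remove, 'rm {}')
    do_action(['a.txt'], True,
              partial(shutil.move, dst='dups'), 'mv {0} to dups')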
@@ -398,12 +385,11 @@ def main():
     dest_dir = args['--move']
     safe_mode = args['--safe-mode']

-    if action == 'print' or action == 'pprint':
-        dups = file_dups(topdirs=topdirs,
-                         hashalgs=hashalgs,
-                         block_size=block_size,
-                         verbose=verbose,
-                         safe_mode=safe_mode)
+    sweeper = Sweeper(topdirs=topdirs, hashalgs=hashalgs,
+                      block_size=block_size, verbose=verbose,
+                      safe_mode=safe_mode)
+    if action == 'print' or action == 'pprint':
+        dups = sweeper.file_dups()
         # defaultdict(list) -> dict
         spam = dict(dups)
         if spam:
@@ -417,20 +403,9 @@ def main():
             print(json.dumps({k: v for k, v in _remap_keys_to_str(spam)},
                              indent=4))
     elif action == 'move':
-        mv_file_dups(topdirs=topdirs, hashalgs=hashalgs,
-                     block_size=block_size,
-                     dest_dir=dest_dir,
-                     simulate=simulate,
-                     keep_prefix=keep_prefix,
-                     verbose=verbose,
-                     safe_mode=safe_mode)
+        sweeper.mv(dest_dir, simulate, keep_prefix)
     elif action == 'remove':
-        rm_file_dups(topdirs=topdirs, hashalgs=hashalgs,
-                     block_size=block_size,
-                     simulate=simulate,
-                     keep_prefix=keep_prefix,
-                     verbose=verbose,
-                     safe_mode=safe_mode)
+        sweeper.rm(simulate, keep_prefix)
     else:
         print('Invalid action "{}"'.format(action))
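
main() feeds the duplicates dict through _remap_keys_to_str before json.dumps because tuple keys are not valid JSON. A self-contained sketch with hypothetical digest strings:

.. code:: python

    import json

    def remap_keys_to_str(d):
        # mirrors the _remap_keys_to_str helper above
        for k in d:
            yield (','.join(k) if isinstance(k, (tuple, list)) else k, d[k])

    spam = {('sha1hex', 'md5hex'): ['/images1/a.jpg', '/images2/a.jpg']}
    print(json.dumps({k: v for k, v in remap_keys_to_str(spam)}, indent=4))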


@@ -3,7 +3,7 @@
 # License: GPLv3

 import unittest
-from sweeper import file_dups, iter_file_dups
+from sweeper import Sweeper
 import os

 mydir = os.path.dirname(os.path.realpath(__file__))

@@ -11,7 +11,8 @@ mydir = os.path.dirname(os.path.realpath(__file__))
 class TestSweeper(unittest.TestCase):
     def test_file_dups_dups(self):
-        dups = file_dups([os.path.join(mydir, 'testfiles_dups')])
+        swp = Sweeper(topdirs=[os.path.join(mydir, 'testfiles_dups')])
+        dups = swp.file_dups()
         dups_exist = False
         for h, flist in dups.items():
             if len(flist) > 1:

@@ -19,24 +20,26 @@ class TestSweeper(unittest.TestCase):
         self.assertTrue(dups_exist)

     def test_file_dups_nodups(self):
-        dups = file_dups([os.path.join(mydir, 'testfiles_nodups')])
+        swp = Sweeper(topdirs=[os.path.join(mydir, 'testfiles_nodups')])
+        dups = swp.file_dups()
         for h, flist in dups.items():
             self.assertTrue(len(flist) == 1)

     # does not actually test safe_mode, we would need to find
     # a hash collision
     def test_file_dups_safe_mode(self):
-        dups = file_dups([os.path.join(mydir, 'testfiles_dups')],
-                         safe_mode=True)
+        swp = Sweeper(topdirs=[os.path.join(mydir, 'testfiles_dups')],
+                      safe_mode=True)
+        dups = swp.file_dups()
         for h, flist in dups.items():
             if len(flist) > 1:
                 dups_exist = True
         self.assertTrue(dups_exist)

     def test_iter_file_dups_dups(self):
-        it = iter_file_dups([os.path.join(mydir, 'testfiles_dups')])
+        swp = Sweeper(topdirs=[os.path.join(mydir, 'testfiles_dups')])
         dups_exist = False
-        for x in it:
+        for x in swp:
             dups_exist = True
             filepath, h, dups = x
             self.assertNotIn(filepath, dups)

@@ -44,9 +47,9 @@ class TestSweeper(unittest.TestCase):
         self.assertTrue(dups_exist)

     def test_iter_file_dups_nodups(self):
-        it = iter_file_dups([os.path.join(mydir, 'testfiles_nodups')])
+        swp = Sweeper([os.path.join(mydir, 'testfiles_nodups')])
         dups_exist = False
-        for x in it:
+        for x in swp:
             dups_exist = True
             break
         self.assertFalse(dups_exist)