Compare commits

...

37 Commits

Author SHA1 Message Date
Darko Poljak 0c04f67b93 Rewriten as class Sweeper with code improvements and optimizations. 2014-08-10 00:06:32 +02:00
Darko Poljak 41cd0fe6c6 Small fixes. 2014-08-09 20:59:23 +02:00
Darko Poljak 499b85bcfa Small changes. 2014-08-09 20:56:09 +02:00
Darko Poljak 2f5af0f2fb Version inc. 2014-08-09 20:54:02 +02:00
Darko Poljak a5cf8d66df Some code improvements. 2014-08-09 20:52:53 +02:00
darko-poljak b41a4afa8b change default hash alg to sha1 2014-02-08 20:27:30 +01:00
darko-poljak ea547d83c5 deleted unwanted pyc file tracking 2014-02-08 19:17:10 +01:00
darko-poljak 867cab7f25 make with nested to work with 2.6 2014-02-08 19:11:55 +01:00
darko-poljak 40ddd90e9b add ignore entries 2014-02-07 19:21:33 +01:00
darko-poljak 24a0ee4c04 update authors file 2014-02-07 19:21:11 +01:00
Darko Poljak 4d015b6be8 fix example for new iter_file_dups function 2014-02-07 13:04:37 +01:00
Darko Poljak c38d74feb7 cleanups and version inc 2014-02-07 08:30:48 +01:00
Darko Poljak 4a45525d00 cleanups and version inc 2014-02-07 08:30:18 +01:00
Darko Poljak d2186f1f26 cleanups and version inc 2014-02-07 08:29:54 +01:00
Darko Poljak 9e714732fb added file_dups_immediate function 2014-02-06 14:43:54 +01:00
Darko Poljak 5040c33f96 added file_dups_immediate function 2014-02-06 14:43:32 +01:00
Darko Poljak 3cc886f057 added file_dups_immediate function 2014-02-06 14:42:51 +01:00
Darko Poljak 12df9e5a7e added missing verbose if condition 2014-02-06 14:22:43 +01:00
Darko Poljak ff0c1a2895 use __version__ from sweeper.py 2014-02-06 14:05:24 +01:00
Darko Poljak c2d0a49d4f use os.path.realpath to eliminate symbolic links 2014-02-06 11:24:29 +01:00
Darko Poljak 19b8349e8c added --safe-mode test (see method comment :)) 2014-02-06 10:11:06 +01:00
Darko Poljak 6d57adc215 added --safe-mode 2014-02-06 10:09:52 +01:00
Darko Poljak 395f2234d5 multiple hash algs support for dups detecting 2014-02-06 08:48:18 +01:00
Darko Poljak 7aaeb3e98f cache file paths when counting in verbose mode 2014-02-06 08:07:06 +01:00
darko-poljak bd88491cb0 added authors file (added matthewi) 2014-02-05 21:37:57 +01:00
darko-poljak cccadf41f7 updated TODO file 2014-02-05 21:34:56 +01:00
Matthew Isaacs 17fa4e7279 import py3k print function from __future to maintain compat with python 2.6+ 2014-02-05 11:33:09 -06:00
darko-poljak 43ebde240d done with current TODO list 2014-01-31 22:45:54 +01:00
darko-poljak cff9388a4e switch to v0.4.1 2014-01-29 21:14:15 +01:00
darko-poljak 1be4e6ce4a lots of improvements 2014-01-29 21:10:30 +01:00
darko-poljak 8cc0897926 added TODO file 2014-01-28 23:38:29 +01:00
darko-poljak beb9734330 Merge branch 'master' of https://github.com/darko-poljak/sweeper 2014-01-28 22:39:27 +01:00
darko-poljak 1f4cb3e177 synchronization 2014-01-28 22:39:07 +01:00
Darko Poljak a3acd60556 replaced % operator with .format() 2014-01-28 12:51:51 +01:00
Darko Poljak ef57c4a04d replaced % operator with .format() 2014-01-28 12:51:29 +01:00
Darko Poljak c7f34614dd switch to version 0.4.0, added --version parameter 2014-01-28 09:09:31 +01:00
Darko Poljak 55279e1aee switch to version 0.4.0 2014-01-28 09:08:38 +01:00
9 changed files with 375 additions and 100 deletions

.gitignore

@@ -1 +1,3 @@
*.pyc
dist/
*.egg-info/

AUTHORS (new file)

@@ -0,0 +1,5 @@
Author:
Darko Poljak <darko.poljak@gmail.com>
Contributors:
matthewi

README.rst

@@ -10,29 +10,31 @@ Print duplicates
.. code:: python
from sweeper import file_dups
dups = file_dups(['images1', 'images2'])
from sweeper import Sweeper
swp = Sweeper(['images1', 'images2'])
dups = swp.file_dups()
print(dups)
Remove duplicate files
.. code:: python
from sweeper import rm_file_dups
rm_file_dups(['images'])
from sweeper import Sweeper
swp = Sweeper(['images1', 'images2'])
swp.rm()
Perform custom action
.. code:: python
from sweeper import iter_file_dups
for files in iter_file_dups(['images']):
for fname in files:
print('found duplicate file with name: %s' % fname)
from sweeper import Sweeper
swp = Sweeper(['images'])
for f, h, dups in swp:
print('encountered {} which duplicates with already found duplicate files {} with hash {}'.format(f, dups, h))
As script::
python sweeper.py --help
python -m sweeper/sweeper --help
As installed console script::
@@ -61,5 +63,5 @@ https://github.com/darko-poljak/sweeper
Tested With
===========
Python2.7.6, Python3.3.3
Python2.7, Python3
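
For reference, here is a minimal sketch of the class-based API these commits introduce (argument names are taken from the Sweeper constructor and the rm()/mv() signatures in the sweeper/sweeper.py diff below; the directory paths are placeholders)::

    from sweeper import Sweeper

    # scan two directory trees, hashing with both sha1 and md5, and
    # verify hash matches byte by byte (safe_mode)
    swp = Sweeper(['images1', 'images2'], hashalgs=['sha1', 'md5'],
                  safe_mode=True)

    # dict mapping a tuple of hash digests to a list of duplicate paths
    dups = swp.file_dups()

    # duplicates can also be consumed as they are found
    for fpath, hashes, earlier_dups in swp:
        print('{} duplicates {}'.format(fpath, earlier_dups))

    # simulate moving duplicates to ./dups, keeping the copy under images1
    swp.mv(dest_dir='./dups', simulate=True, keep_prefix='images1')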

TODO (new file)

setup.py

@@ -1,12 +1,17 @@
import os
from setuptools import setup
def read(fname):
return open(os.path.join(os.path.dirname(__file__), fname)).read()
import sweeper.sweeper as sw
setup(
name='sweeper',
version='0.3.0',
version=sw.__version__,
author='Darko Poljak',
author_email='darko.poljak@gmail.com',
description='Find duplicate files and perform action.',
@@ -30,4 +35,3 @@ setup(
"Topic :: Software Development :: Libraries :: Python Modules",
],
)

sweeper/__init__.py

@@ -1,4 +1,4 @@
from __future__ import absolute_import
from .sweeper import file_dups, mv_file_dups, rm_file_dups, iter_file_dups
from .sweeper import Sweeper
__all__ = ['file_dups', 'mv_file_dups', 'rm_file_dups', 'iter_file_dups']
__all__ = ['Sweeper']

sweeper/sweeper.py

@@ -2,26 +2,32 @@
# Author: Darko Poljak <darko.poljak@gmail.com>
# License: GPLv3
"""Sweeper.
"""{0} {1}
Usage: sweeper.py [options] [<directory>...]
Usage: {0} [options] [<directory>...]
Arguments:
<directory> directory path to scan for files
Options:
-h, --help show this screen
-v, --version show version and exit
-b <blocksize>, --block-size=<blocksize> size of block used when reading
file's content [default: 4096]
-d <hashalg>, --digest-alg=<hashalg> secure hash algorithm [default: md5]
-d <hashalgs>, --digest-algs=<hashalgs> secure hash algorithm comma separated
list [default: sha1]
note that multiple hashes will slow
down sweeper
-a <action>, --action=<action> action on duplicate files (pprint,
print, remove, move)
[default: pprint]
-remove removes duplicate files
except first found
except first or first with specified
directory prefix found
-move moves duplicate files to
duplicates directory, except first
found
or first with specified directory
prefix found
-print prints result dictionary where
keys are hash values and values are
list of duplicate file paths
@@ -31,36 +37,61 @@ Options:
-m <directory>, --move=<directory> move duplicate files to directory
(used with move action)
[default: ./dups]
-k <dirprefix>, --keep=<dirprefix> directory prefix for remove and move
actions
-s, --simulate if action is remove or move just
simulate action by printing, do not
actually perform the action
-V, --verbose print more info
note that verbosity will slow down
sweeper due to text printing and
gathering additional information
-S, --safe-mode enable safe mode: compare hash
duplicate files byte by byte too
note that it will further slow down
sweeper but will overcome hash
collisions (although this is
unlikely)
"""
from __future__ import print_function
__author__ = 'Darko Poljak <darko.poljak@gmail.com>'
__version__ = '0.3.0'
__version__ = '0.9.0'
__license__ = 'GPLv3'
__all__ = [
'file_dups', 'rm_file_dups', 'mv_file_dups', 'iter_file_dups'
]
__all__ = ['Sweeper']
import sys
import hashlib
import os
from collections import defaultdict
from functools import partial
import hashlib
# some differences in python versions
# we prefer iter methods
if sys.version_info[0] == 3:
def _do_encode(buf):
return buf
def _dict_iter_items(d):
return d.items()
else:
def _do_encode(buf):
return buf
def _dict_iter_keys(d):
return d.keys()
def _dict_iter_values(d):
return d.values()
else:
def _dict_iter_items(d):
return d.iteritems()
def _dict_iter_keys(d):
return d.iterkeys()
def _dict_iter_values(d):
return d.itervalues()
range = xrange
def _filehash(filepath, hashalg, block_size):
"""Calculate secure hash for given file content using
@@ -70,63 +101,254 @@ def _filehash(filepath, hashalg, block_size):
md = hashlib.new(hashalg)
with open(filepath, "rb") as f:
for buf in iter(lambda: f.read(block_size), b''):
md.update(_do_encode(buf))
md.update(buf)
return md.hexdigest()
def file_dups(topdirs=['./'], hashalg='md5', block_size=4096):
"""Find duplicate files in directory list. Return directory
with keys equal to file hash value and value as list of
file paths whose content is the same.
"""
dups = defaultdict(list)
for topdir in topdirs:
for dirpath, dirnames, filenames in os.walk(topdir):
for fname in filenames:
fpath = os.path.join(dirpath, fname)
hexmd = _filehash(fpath, hashalg, block_size)
dups[hexmd].append(fpath)
result = {k: v for k, v in _dict_iter_items(dups) if len(v) > 1}
return result
def _uniq_list(list_):
return list(set(list_))
def rm_file_dups(topdirs=['./'], hashalg='md5', block_size=4096):
"""Remove duplicate files found in specified directory list.
First file in list is kept.
"""
for files in do_with_file_dups(topdirs, hashalg, block_size):
for f in files:
os.remove(f)
def _fbequal(fpath1, fpath2):
'''Compare files byte by byte. If files are equal return True,
False otherwise.
fpath1 and fpath2 are file paths.
'''
# nested to work with 2.6
with open(fpath1, "rb") as f1:
with open(fpath2, "rb") as f2:
while True:
b1 = f1.read(1)
b2 = f2.read(1)
if b1 != b2: # different bytes
return False
if not b1 or not b2: # end in one or both files
break
if not b1 and not b2: # end in both files, files are equal
return True
# end in one file but not in the other, files aren't equal
return False
def mv_file_dups(topdirs=['./'], hashalg='md5', block_size=4096,
dest_dir='dups'):
"""Move duplicate files found in specified directory list.
First file in list is kept in the original directory.
"""
if not os.path.exists(dest_dir):
os.mkdir(dest_dir)
if not os.path.isdir(dest_dir):
raise OSError('%s is not a directory' % dest_dir)
import shutil
for files in do_with_file_dups(topdirs, hashalg, block_size):
for i, f in enumerate(files):
if i > 0:
shutil.move(f, dest_dir)
def iter_file_dups(topdirs=['./'], rethash=False, hashalg='md5',
block_size=4096):
"""Yield duplicate files when found in specified directory list.
If rethash is True then tuple hash value and duplicate paths list is
returned, otherwise duplicate paths list is returned.
"""
dups = file_dups(topdirs, hashalg, block_size)
for hash_, fpaths in _dict_iter_items(dups):
if rethash:
yield (hash_, fpaths)
def _remap_keys_to_str(d):
'''Iterator that remaps dictionary keys to string in case keys are tuple
or list. Leave it unchanged otherwise.
Yields string key, value pairs.
'''
for k in _dict_iter_keys(d):
if isinstance(k, tuple) or isinstance(k, list):
key = ','.join(k)
else:
yield fpaths
key = k
yield (key, d[k])
def _gather_file_list(dirs):
'''Gather file paths in directory list dirs.
Return tuple (count, files) where count is files
list length and files is list of file paths in
specified directories.
'''
files = []
for dir_ in dirs:
for dirpath, dirnames, filenames in os.walk(dir_):
# replace fpath with realpath value (eliminate symbolic links)
files.extend([os.path.realpath(os.path.join(dirpath, fname))
for fname in filenames])
return files
class Sweeper(object):
DEF_HASHALGS = ['sha1']
def __init__(self, topdirs=['./'], hashalgs=DEF_HASHALGS,
block_size=4096, verbose=False, safe_mode=False):
# replace dir paths with realpath value (eliminate symbolic links)
self.topdirs = []
for i in range(len(topdirs)):
self.topdirs.append(os.path.realpath(topdirs[i]))
self.hashalgs = hashalgs
self.block_size = block_size
self.verbose = verbose
self.safe_mode = safe_mode
# iter through file paths in files list
def _files_iter_from_list(self, files):
return (fpath for fpath in files)
# iter through file paths by os.walking
def _files_iter_from_disk(self):
for topdir in self.topdirs:
for dirpath, dirnames, filenames in os.walk(topdir):
for fname in filenames:
# replace fpath with realpath value
# (eliminate symbolic links)
fpath = os.path.realpath(os.path.join(dirpath, fname))
yield fpath
def file_dups(self):
"""Find duplicate files in directory list. Return directory
with keys equal to file hash value and value as list of
file paths whose content is the same.
If safe_mode is true then you want to play safe: do byte
by byte comparison for hash duplicate files.
"""
dups = defaultdict(list)
if self.verbose:
if self.safe_mode:
print('safe mode is on')
print('gathering and counting files...', end='')
sys.stdout.flush()
files = _gather_file_list(self.topdirs)
count = len(files)
current = 1
print(count)
_files_iter = partial(self._files_iter_from_list, files)
else:
_files_iter = self._files_iter_from_disk
for fpath in _files_iter():
if self.verbose:
print('\rprocessing file {0}/{1}: calc hash'.format(current,
count),
end='')
sys.stdout.flush()
hexmds = [_filehash(fpath, h, self.block_size)
for h in self.hashalgs]
hexmd = tuple(hexmds)
dup_files = dups[hexmd]
files_equals = False
if self.safe_mode:
if dup_files:
if self.verbose:
print('\rprocessing file {0}/{1}: byte cmp'.format(
current, count), end='')
sys.stdout.flush()
for f in dup_files:
if _fbequal(f, fpath):
files_equals = True
break
if self.verbose and not files_equals:
print('\nsame hash value {} but not same bytes for'
' file {} with files {}'.format(
hexmd, fpath, dup_files))
else: # when list is empty in safe mode
files_equals = True
else:
files_equals = True # when safe mode is off
if self.verbose:
current += 1
if files_equals:
dups[hexmd].append(fpath)
if self.verbose:
print('')
# make result dict with unique file paths list
result = {}
for k, v in _dict_iter_items(dups):
uniq_v = _uniq_list(v)
if len(uniq_v) > 1:
result[k] = uniq_v
return result
def __iter__(self):
"""Find duplicate files in directory list.
Yield tuple of file path, hash tuple and list of duplicate files
as soon as duplicate file is found.
Newly found file is not included in the list at the yield time,
but is appended later before next yield.
This means that not all duplicate files are returned with any
return value. Same hash value and sublist could be returned later
if file with same content is found.
If safe_mode is true then you want to play safe: do byte
by byte comparison for hash duplicate files.
"""
# internally, file dups dict is still maintained
dups = defaultdict(list)
_files_iter = self._files_iter_from_disk
for fpath in _files_iter():
hexmds = [_filehash(fpath, h, self.block_size)
for h in self.hashalgs]
hexmd = tuple(hexmds)
dup_files = dups[hexmd]
# there were dup list elements (used for yield)
if self.safe_mode and dup_files:
# compare only with first file in dup_files
# all files in dup_files list are already content equal
files_equals = _fbequal(dup_files[0], fpath)
else: # when list is empty in safe mode or when safe mode is off
files_equals = True
if files_equals:
# yield only if current dup files list isn't empty
if dup_files:
yield (fpath, hexmd, dups[hexmd])
# finally append newly found file to dup list
dups[hexmd].append(fpath)
def _extract_files_for_action(self, keep_prefix):
dups = self.file_dups()
for files in _dict_iter_values(dups):
found = False
if keep_prefix:
result = []
for f in files:
if f.startswith(keep_prefix) and not found:
found = True
else:
result.append(f)
if not found:
result = list(files)[1:]
yield (files, result)
def _do_action(self, simulate, keep_prefix, action, action_str):
for dups, extracted in self._extract_files_for_action(keep_prefix):
if simulate or self.verbose:
print('found duplicates: \n{}'.format(dups))
for f in extracted:
if simulate or self.verbose:
print(action_str.format(f))
if not simulate:
action(f)
def rm(self, simulate=False, keep_prefix=None):
"""Remove duplicate files found in specified directory list.
If keep_prefix is specified then first file with that path
prefix found is kept in the original directory.
Otherwise first file in list is kept in the original directory.
If simulate is True then only print the action, do not actually
perform it.
If safe_mode is true then do byte by byte comparison for
hash duplicate files.
"""
self._do_action(simulate, keep_prefix, os.remove, 'rm {}')
def mv(self, dest_dir='dups', simulate=False, keep_prefix=None):
"""Move duplicate files found in specified directory list.
If keep_prefix is specified then first file with that path
prefix found is kept in the original directory.
Otherwise first file in list is kept in the original directory.
If simulate is True then only print the action, do not actually
perform it.
If safe_mode is true then do byte by byte comparison for
hash duplicate files.
"""
import shutil
if not os.path.exists(dest_dir):
if simulate:
print('mkdir {}'.format(dest_dir))
else:
os.mkdir(dest_dir)
elif not os.path.isdir(dest_dir):
errmsg = '{} is not a directory'.format(dest_dir)
if simulate:
print('would raise:', errmsg)
else:
raise OSError(errmsg)
self._do_action(simulate, keep_prefix,
partial(shutil.move, dst=dest_dir),
'mv {0} to ' + dest_dir)
def main():
@@ -135,40 +357,58 @@ def main():
import json
from docopt import docopt
args = docopt(__doc__)
args = docopt(__doc__.format(sys.argv[0], __version__),
version=" ".join(('sweeper', __version__)))
topdirs = args['<directory>']
if not topdirs:
topdirs = ['./']
action = args['--action']
verbose = args['--verbose']
# set block size as int
try:
bs = int(args['--block-size'])
args['--block-size'] = bs
except ValueError:
print('Invalid block size "%s"' % args['--block-size'])
print('Invalid block size "{}"'.format(args['--block-size']))
sys.exit(1)
hashalgs = args['--digest-algs'].split(',')
hashalgs_uniq = _uniq_list(hashalgs)
if len(hashalgs) != len(hashalgs_uniq):
print('Duplicate hash algorithms specified')
sys.exit(1)
block_size = args['--block-size']
simulate = args['--simulate']
keep_prefix = args['--keep']
dest_dir = args['--move']
safe_mode = args['--safe-mode']
sweeper = Sweeper(topdirs=topdirs, hashalgs=hashalgs,
block_size=block_size, verbose=verbose,
safe_mode=safe_mode)
if action == 'print' or action == 'pprint':
dups = file_dups(topdirs, args['--digest-alg'], args['--block-size'])
dups = sweeper.file_dups()
# defaultdict(list) -> dict
spam = dict(dups)
if spam:
if action == 'pprint':
for h, fpaths in _dict_iter_items(spam):
for _, fpaths in _dict_iter_items(spam):
for path in fpaths:
print(path)
print('')
if fpaths:
print('')
else:
print(json.dumps(spam, indent=4))
print(json.dumps({k: v for k, v in _remap_keys_to_str(spam)},
indent=4))
elif action == 'move':
mv_file_dups(topdirs, args['--digest-alg'], args['--block-size'],
args['--move'])
sweeper.mv(dest_dir, simulate, keep_prefix)
elif action == 'remove':
rm_file_dups(topdirs, args['--digest-alg'], args['--block-size'])
sweeper.rm(simulate, keep_prefix)
else:
print('Invalid action "%s"' % action)
print('Invalid action "{}"'.format(action))
# if used as script call main function
if __name__ == '__main__':
main()
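
For orientation, the new command-line options above combine into invocations like the following sketch (the script path is illustrative; the flags are the ones documented in the module docstring)::

    # dry run: report which duplicates would be removed, hashing with
    # sha1 and md5 and double-checking matches byte by byte
    python sweeper/sweeper.py --digest-algs=sha1,md5 --safe-mode \
        --action=remove --keep=images1 --simulate images1 images2

    # move duplicates into ./dups, keeping the copy whose path starts
    # with images1
    python sweeper/sweeper.py --action=move --move=./dups --keep=images1 \
        images1 images2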

Binary file not shown.

test_sweeper.py

@@ -3,7 +3,7 @@
# License: GPLv3
import unittest
from sweeper import file_dups, iter_file_dups
from sweeper import Sweeper
import os
mydir = os.path.dirname(os.path.realpath(__file__))
@@ -11,7 +11,8 @@ mydir = os.path.dirname(os.path.realpath(__file__))
class TestSweeper(unittest.TestCase):
def test_file_dups_dups(self):
dups = file_dups([os.path.join(mydir, 'testfiles_dups')], 'md5')
swp = Sweeper(topdirs=[os.path.join(mydir, 'testfiles_dups')])
dups = swp.file_dups()
dups_exist = False
for h, flist in dups.items():
if len(flist) > 1:
@@ -19,18 +20,39 @@ class TestSweeper(unittest.TestCase):
self.assertTrue(dups_exist)
def test_file_dups_nodups(self):
dups = file_dups([os.path.join(mydir, 'testfiles_nodups')], 'md5')
swp = Sweeper(topdirs=[os.path.join(mydir, 'testfiles_nodups')])
dups = swp.file_dups()
for h, flist in dups.items():
self.assertTrue(len(flist) == 1)
def test_iter_fule_dups_rethash(self):
for item in iter_file_dups([os.path.join(mydir, 'testfiles_dups')],
rethash=True):
self.assertTrue(type(item).__name__ == 'tuple')
# does not actually test safe_mode, we would need to find
# hash collision
def test_file_dups_safe_mode(self):
swp = Sweeper(topdirs=[os.path.join(mydir, 'testfiles_dups')],
safe_mode=True)
dups = swp.file_dups()
for h, flist in dups.items():
if len(flist) > 1:
dups_exist = True
self.assertTrue(dups_exist)
def test_iter_fule_dups_norethash(self):
for item in iter_file_dups([os.path.join(mydir, 'testfiles_dups')]):
self.assertTrue(type(item).__name__ == 'list')
def test_iter_file_dups_dups(self):
swp = Sweeper(topdirs=[os.path.join(mydir, 'testfiles_dups')])
dups_exist = False
for x in swp:
dups_exist = True
filepath, h, dups = x
self.assertNotIn(filepath, dups)
self.assertTrue(len(dups) > 0)
self.assertTrue(dups_exist)
def test_iter_file_dups_nodups(self):
swp = Sweeper([os.path.join(mydir, 'testfiles_nodups')])
dups_exist = False
for x in swp:
dups_exist = True
break
self.assertFalse(dups_exist)
if __name__ == '__main__':