sweeper/sweeper/sweeper.py
#!/usr/bin/env python
# Author: Darko Poljak <darko.poljak@gmail.com>
# License: GPLv3
"""sweeper 0.4.0
Usage: sweeper.py [options] [<directory>...]

Arguments:
    <directory>  directory path to scan for files

Options:
    -h, --help                                show this screen
    -v, --version                             show version and exit
    -b <blocksize>, --block-size=<blocksize>  size of the block used when
                                              reading a file's content
                                              [default: 4096]
    -d <hashalg>, --digest-alg=<hashalg>      secure hash algorithm
                                              [default: md5]
    -a <action>, --action=<action>            action on duplicate files
                                              (pprint, print, remove, move)
                                              [default: pprint]
                                              - remove removes duplicate
                                                files except the first one
                                                found
                                              - move moves duplicate files
                                                to the duplicates directory,
                                                except the first one found
                                              - print prints the result
                                                dictionary where keys are
                                                hash values and values are
                                                lists of duplicate file
                                                paths
                                              - pprint prints sets of
                                                duplicate file paths, each
                                                on its own line, with sets
                                                separated by a blank line
    -m <directory>, --move=<directory>        move duplicate files to the
                                              given directory (used with the
                                              move action) [default: ./dups]
"""
__author__ = 'Darko Poljak <darko.poljak@gmail.com>'
__version__ = '0.4.0'
__license__ = 'GPLv3'
__all__ = [
    'file_dups', 'rm_file_dups', 'mv_file_dups', 'iter_file_dups'
]
import sys
import hashlib
import os
from collections import defaultdict


# Compatibility shims for differences between Python 2 and 3.
if sys.version_info[0] == 3:
    def _do_encode(buf):
        # Files are read in binary mode, so the buffer is already bytes.
        return buf

    def _dict_iter_items(d):
        return d.items()
else:
    def _do_encode(buf):
        return buf

    def _dict_iter_items(d):
        return d.iteritems()


def _filehash(filepath, hashalg, block_size):
    """Calculate a secure hash of the given file's content using the
    specified hash algorithm, reading the file in blocks of
    block_size bytes.
    """
    md = hashlib.new(hashalg)
    with open(filepath, "rb") as f:
        for buf in iter(lambda: f.read(block_size), b''):
            md.update(_do_encode(buf))
    return md.hexdigest()
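
# Example (illustrative): hash a file in 4 KiB blocks with md5.
# _filehash('/etc/hostname', 'md5', 4096) returns a hex digest string;
# for an empty file the md5 digest is 'd41d8cd98f00b204e9800998ecf8427e'.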


def file_dups(topdirs=['./'], hashalg='md5', block_size=4096):
    """Find duplicate files in the given directory list. Return a
    dictionary whose keys are file hash values and whose values are
    lists of paths to files with identical content.
    """
    dups = defaultdict(list)
    for topdir in topdirs:
        for dirpath, dirnames, filenames in os.walk(topdir):
            for fname in filenames:
                fpath = os.path.join(dirpath, fname)
                hexmd = _filehash(fpath, hashalg, block_size)
                dups[hexmd].append(fpath)
    # Keep only hashes that map to more than one file.
    result = {k: v for k, v in _dict_iter_items(dups) if len(v) > 1}
    return result
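
# Example result shape (hypothetical paths, illustrative hash value):
# file_dups(['./photos']) ->
#     {'9e107d9d372bb6826bd81d3542a419d6': ['./photos/a.jpg',
#                                           './photos/backup/a.jpg']}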


def rm_file_dups(topdirs=['./'], hashalg='md5', block_size=4096):
    """Remove duplicate files found in the specified directory list.
    The first file in each list of duplicates is kept.
    """
    for files in iter_file_dups(topdirs, hashalg=hashalg,
                                block_size=block_size):
        # Skip the first path so that one copy survives.
        for f in files[1:]:
            os.remove(f)
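
# Usage sketch (hypothetical directory): delete every duplicate found
# under ./downloads except the first copy of each set:
# rm_file_dups(['./downloads'])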


def mv_file_dups(topdirs=['./'], hashalg='md5', block_size=4096,
                 dest_dir='dups'):
    """Move duplicate files found in the specified directory list.
    The first file in each list of duplicates is kept in its original
    directory.
    """
    if not os.path.exists(dest_dir):
        os.mkdir(dest_dir)
    if not os.path.isdir(dest_dir):
        raise OSError('{} is not a directory'.format(dest_dir))
    import shutil
    for files in iter_file_dups(topdirs, hashalg=hashalg,
                                block_size=block_size):
        for i, f in enumerate(files):
            if i > 0:
                shutil.move(f, dest_dir)
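
# Usage sketch (hypothetical directories): move every duplicate except
# the first copy of each set into ./dups:
# mv_file_dups(['./photos', './backup'], dest_dir='./dups')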


def iter_file_dups(topdirs=['./'], rethash=False, hashalg='md5',
                   block_size=4096):
    """Yield duplicate files found in the specified directory list.
    If rethash is True, yield a (hash value, duplicate paths list)
    tuple; otherwise yield only the duplicate paths list.
    """
    dups = file_dups(topdirs, hashalg, block_size)
    for hash_, fpaths in _dict_iter_items(dups):
        if rethash:
            yield (hash_, fpaths)
        else:
            yield fpaths
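
# Usage sketch (hypothetical directory):
# for fpaths in iter_file_dups(['./photos']):
#     print(fpaths)           # list of paths with identical content
# for hash_, fpaths in iter_file_dups(['./photos'], rethash=True):
#     print(hash_, fpaths)    # hash value plus the duplicate paths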


def main():
    """Main when used as a script. See usage (--help)."""
    import json
    from docopt import docopt

    args = docopt(__doc__)

    if args['--version']:
        print("sweeper {}".format(__version__))
        return
    topdirs = args['<directory>']
    if not topdirs:
        topdirs = ['./']
    action = args['--action']
    try:
        bs = int(args['--block-size'])
        args['--block-size'] = bs
    except ValueError:
        print('Invalid block size "{}"'.format(args['--block-size']))
        sys.exit(1)
    if action == 'print' or action == 'pprint':
        dups = file_dups(topdirs, args['--digest-alg'], args['--block-size'])
        if dups:
            if action == 'pprint':
                # Print each set of duplicate paths, separated by a
                # blank line.
                for h, fpaths in _dict_iter_items(dups):
                    for path in fpaths:
                        print(path)
                    print('')
            else:
                print(json.dumps(dups, indent=4))
    elif action == 'move':
        mv_file_dups(topdirs, args['--digest-alg'], args['--block-size'],
                     args['--move'])
    elif action == 'remove':
        rm_file_dups(topdirs, args['--digest-alg'], args['--block-size'])
    else:
        print('Invalid action "{}"'.format(action))
        sys.exit(1)


# If used as a script, call the main function.
if __name__ == '__main__':
    main()