Commit a074cfb9 authored by darko-poljak's avatar darko-poljak

Initial commit. First working version.

parent 49518efc
......@@ -2,3 +2,32 @@ sweeper
=======
Find duplicate files and perform an action on them (print, remove, or move).
Constructor
~~~~~~~~~~~
Installation
============
from source::
python setup.py install
or from PyPI::
pip install sweeper
Documentation
=============
See this README.rst, the code itself, and its docstrings.
Sweeper can be found on GitHub at:
https://github.com/darko-poljak/sweeper
Tested With
===========
Python 2.7.6, Python 3.3.3
import os
from setuptools import setup
def read(fname):
    """Return the content of *fname*, resolved against this file's directory.

    The file is read inside a ``with`` block so the handle is closed as
    soon as the content has been read, instead of being left open until
    the garbage collector gets to it.
    """
    path = os.path.join(os.path.dirname(__file__), fname)
    with open(path) as f:
        return f.read()
# Package metadata handed to setuptools. The long description is read
# from README.rst at build time.
setup(
    # Identity
    name="sweeper",
    version="0.1.0",
    author="Darko Poljak",
    author_email="darko.poljak@gmail.com",
    license="GPLv3",
    # Description
    description="Find duplicate files in directory tree.",
    long_description=read("README.rst"),
    keywords=["find duplicate files"],
    url="https://github.com/darko-poljak/sweeper",
    download_url="https://github.com/darko-poljak/sweeper",
    # Packaging
    packages=["sweeper"],
    install_requires=["docopt"],
    platforms="OS Independent",
    classifiers=[
        "Development Status :: 4 - Beta",
        "License :: OSI Approved :: GNU General Public License v3 (GPLv3)",
        "Operating System :: OS Independent",
        "Topic :: Software Development :: Libraries :: Python Modules",
    ],
)
from __future__ import absolute_import
# Import every name advertised in ``__all__``: a name listed there but
# missing from the package namespace makes ``from sweeper import *`` fail
# with AttributeError.
from .sweeper import (
    do_with_file_dups,
    file_dups,
    mv_file_dups,
    rm_file_dups,
)

__all__ = ['file_dups', 'mv_file_dups', 'rm_file_dups', 'do_with_file_dups']
#!/usr/bin/env python
# Author: Darko Poljak <darko.poljak@gmail.com>
# License: GPLv3
"""Sweeper.
Usage: sweeper.py [options] [<directory>...]
Arguments:
<directory> directory path to scan for files
Options:
-h, --help show this screen
-b <blocksize>, --block-size=<blocksize> size of block used when reading file's
content [default: 4096]
-d <hashalg>, --digest-alg=<hashalg> secure hash algorithm [default: md5]
-a <action>, --action=<action> action on duplicate files
(print, remove, move) [default: print]
-m <directory>, --move=<directory> move duplicate files to directory
(used with move action) [default: ./dups]
"""
# Module metadata.
__author__ = 'Darko Poljak <darko.poljak@gmail.com>'
__version__ = '0.1.0'
__license__ = 'GPLv3'
# Public API: the names exported by ``from <this module> import *``.
__all__ = [
    'file_dups', 'rm_file_dups', 'mv_file_dups', 'do_with_file_dups'
]
import sys
import hashlib
import os
from collections import defaultdict
if sys.version_info[0] == 3:
def _do_encode(buf):
return buf
def _dict_iter_items(d):
return d.items()
else:
def _do_encode(buf):
return buf
def _dict_iter_items(d):
return d.iteritems()
def _filehash(filepath, hashalg, block_size):
md = hashlib.new(hashalg)
with open(filepath, "rb") as f:
for buf in iter(lambda: f.read(block_size), b''):
md.update(_do_encode(buf))
return md.hexdigest()
def file_dups(topdirs=['./'], hashalg='md5', block_size=4096):
    """Find files with identical content under the given directories.

    Every directory in *topdirs* is walked recursively; each file found is
    hashed with *hashalg*, reading *block_size* bytes at a time, and the
    file paths are grouped by digest.

    Returns:
        A dict mapping hex digest to the list of paths that produced it,
        restricted to digests shared by more than one path.
    """
    by_digest = defaultdict(list)
    for root in topdirs:
        for parent, _subdirs, names in os.walk(root):
            for name in names:
                path = os.path.join(parent, name)
                by_digest[_filehash(path, hashalg, block_size)].append(path)
    # Keep only the groups that really contain duplicates.
    return dict(
        (digest, paths)
        for digest, paths in _dict_iter_items(by_digest)
        if len(paths) > 1
    )
def rm_file_dups(topdirs=['./'], hashalg='md5', block_size=4096):
    """Remove duplicate files, keeping one copy of each content.

    For every group of paths with identical content found under
    *topdirs*, the first path of the group is kept and all the others are
    deleted with ``os.remove``. This matches ``mv_file_dups``, which also
    leaves the first file of each group in place.

    Args:
        topdirs: directories to scan.
        hashalg: hash algorithm name used to compare file content.
        block_size: number of bytes read per chunk while hashing.
    """
    for files in do_with_file_dups(topdirs, hashalg, block_size):
        # Skip the first entry: deleting every path in the group would
        # destroy the content altogether instead of de-duplicating it.
        for f in files[1:]:
            os.remove(f)
def mv_file_dups(topdirs=['./'], hashalg='md5', block_size=4096,
                 dest_dir='dups'):
    """Move duplicate files into *dest_dir*, keeping one copy in place.

    For every group of paths with identical content found under
    *topdirs*, the first path is left where it is and the others are
    moved into *dest_dir*. If a name is already taken in *dest_dir*
    (duplicates frequently share a base name), a numeric suffix is
    appended so the move does not fail half-way through.

    Args:
        topdirs: directories to scan.
        hashalg: hash algorithm name used to compare file content.
        block_size: number of bytes read per chunk while hashing.
        dest_dir: directory receiving the duplicates; created, including
            missing parent directories, if it does not exist.

    Raises:
        OSError: if *dest_dir* exists but is not a directory.
    """
    import shutil

    if not os.path.exists(dest_dir):
        os.makedirs(dest_dir)
    if not os.path.isdir(dest_dir):
        raise OSError('%s is not a directory' % dest_dir)
    for files in do_with_file_dups(topdirs, hashalg, block_size):
        for f in files[1:]:
            target = _free_dest_path(dest_dir, os.path.basename(f))
            shutil.move(f, target)


def _free_dest_path(dest_dir, name):
    """Return a path for *name* inside *dest_dir* that does not exist yet."""
    candidate = os.path.join(dest_dir, name)
    suffix = 0
    while os.path.lexists(candidate):
        suffix += 1
        candidate = os.path.join(dest_dir, '%s.%d' % (name, suffix))
    return candidate
def do_with_file_dups(topdirs=['./'], hashalg='md5', block_size=4096):
    """Yield each group of duplicate file paths found under *topdirs*.

    Args:
        topdirs: directories to scan.
        hashalg: hash algorithm name used to compare file content.
        block_size: number of bytes read per chunk while hashing.

    Yields:
        One list of paths per group of files with identical content.
    """
    dups = file_dups(topdirs, hashalg, block_size)
    # values() exists on both Python 2 and Python 3 dicts; itervalues()
    # is Python 2 only and raises AttributeError on Python 3.
    for fpaths in dups.values():
        yield fpaths
def main(args):
    """Run the action selected by the parsed command-line options.

    Args:
        args: mapping of option name to value. The keys read here are
            '<directory>', '--action', '--block-size', '--digest-alg'
            and '--move'. '--block-size' is replaced in place by its
            integer value.

    Exits with status 1 when the block size is not a positive integer or
    when the action is not one of 'print', 'move', 'remove'.
    """
    import json

    topdirs = args['<directory>']
    if not topdirs:
        topdirs = ['./']

    action = args['--action']

    raw_bs = args['--block-size']
    try:
        bs = int(raw_bs)
    except ValueError:
        bs = None
    # A block size of 0 would make every read return nothing, so all files
    # would hash alike and be treated as duplicates; reject it up front
    # together with negative values.
    if bs is None or bs <= 0:
        print('Invalid block size "%s"' % raw_bs)
        sys.exit(1)
    args['--block-size'] = bs

    hashalg = args['--digest-alg']
    if action == 'print':
        dups = file_dups(topdirs, hashalg, bs)
        print(json.dumps(dict(dups), indent=4))
    elif action == 'move':
        mv_file_dups(topdirs, hashalg, bs, args['--move'])
    elif action == 'remove':
        rm_file_dups(topdirs, hashalg, bs)
    else:
        print('Invalid action "%s"' % action)
        # Signal failure like the invalid block size path does.
        sys.exit(1)
if __name__ == '__main__':
    from docopt import docopt

    # Parse the command line against the module docstring and dispatch.
    main(docopt(__doc__))
#!/usr/bin/env python
# Author: Darko Poljak <darko.poljak@gmail.com>
# License: GPLv3
import unittest
from sweeper import file_dups
import os
# Directory containing this test module; the fixture directories used by
# the tests below are resolved against it.
mydir = os.path.dirname(os.path.realpath(__file__))
class TestSweeper(unittest.TestCase):
    """Checks ``file_dups`` against the fixture directories next to this file."""

    def test_file_dups_dups(self):
        """The 'testfiles_dups' fixture yields at least one duplicate group."""
        found = file_dups([os.path.join(mydir, 'testfiles_dups')], 'md5')
        self.assertTrue(any(len(paths) > 1 for paths in found.values()))

    def test_file_dups_nodups(self):
        """No group reported for the 'testfiles_nodups' fixture has length other than 1."""
        found = file_dups([os.path.join(mydir, 'testfiles_nodups')], 'md5')
        for paths in found.values():
            self.assertTrue(len(paths) == 1)
if __name__ == '__main__':
unittest.main()
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment