Compare commits
No commits in common. "master" and "v0.2.0" have entirely different histories.
9 changed files with 88 additions and 400 deletions
.gitignore (vendored): 2 changes

@@ -1,3 +1 @@
 *.pyc
-dist/
-*.egg-info/
AUTHORS: 5 changes (file deleted)

@@ -1,5 +0,0 @@
-Author:
-Darko Poljak <darko.poljak@gmail.com>
-
-Contributors:
-matthewi
README.rst: 22 changes

@@ -10,31 +10,29 @@ Print duplicates

 .. code:: python

-    from sweeper import Sweeper
-    swp = Sweeper(['images1', 'images2'])
-    dups = swp.file_dups()
+    from sweeper import file_dups
+    dups = file_dups(['images1', 'images2'])
     print(dups)

 Remove duplicate files

 .. code:: python

-    from sweeper import Sweeper
-    swp = Sweeper(['images1', 'images2'])
-    swp.rm()
+    from sweeper import file_dups
+    rm_file_dups(['images'])

 Perform custom action

 .. code:: python

-    from sweeper import Sweeper
-    swp = Sweeper(['images'])
-    for f, h, dups in swp:
-        print('encountered {} which duplicates with already found duplicate files {} with hash {}'.format(f, dups, h))
+    from sweeper import file_dups
+    for files in iter_file_dups(['images']):
+        for fname in files:
+            print('found duplicate file with name: %s' % fname)

 As script::

-    python -m sweeper/sweeper --help
+    python sweeper.py --help

 As installed console script::

@@ -63,5 +61,5 @@ https://github.com/darko-poljak/sweeper
 Tested With
 ===========

-Python2.7, Python3
+Python2.7.6, Python3.3.3
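Reviewer note: the new README's removal snippet imports only file_dups but then calls rm_file_dups, so at least one of the two lines must be off. A minimal sketch of the v0.2.0-style usage with the import widened to cover everything the snippets call; the wider import assumes the package root re-exports these names (see the sweeper/__init__.py diff below):

.. code:: python

    # Sketch only; assumes rm_file_dups and iter_file_dups are importable
    # from the package root, which the captured __init__.py only partly shows.
    from sweeper import file_dups, rm_file_dups, iter_file_dups

    # dict mapping hash value -> list of paths with identical content
    dups = file_dups(['images1', 'images2'], hashalg='md5')
    print(dups)

    # walk each group of duplicates
    for files in iter_file_dups(['images']):
        for fname in files:
            print('found duplicate file with name: %s' % fname)

    # remove duplicates (per the docstring, the first file of each group
    # is kept)
    rm_file_dups(['images'])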
TODO: 0 changes (empty file)
setup.py: 8 changes

@@ -1,17 +1,12 @@
 import os
 from setuptools import setup


 def read(fname):
     return open(os.path.join(os.path.dirname(__file__), fname)).read()


-import sweeper.sweeper as sw
-
-
 setup(
     name='sweeper',
-    version=sw.__version__,
+    version='0.2.0',
     author='Darko Poljak',
     author_email='darko.poljak@gmail.com',
     description='Find duplicate files and perform action.',

@@ -35,3 +30,4 @@ setup(
         "Topic :: Software Development :: Libraries :: Python Modules",
     ],
 )
+
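Reviewer note: importing the package at install time (master's `import sweeper.sweeper as sw`) executes the module's top-level code before the package is installed; hardcoding the string, as v0.2.0 does, sidesteps that but must be kept in sync by hand. A common middle ground, sketched here purely as an illustration and not something this repository does, is to read the version out of the source without importing it:

.. code:: python

    # Hypothetical helper, not part of this repository: pull __version__
    # out of sweeper/sweeper.py with a regex so setup.py never imports
    # the package.
    import os
    import re

    def read_version(fname='sweeper/sweeper.py'):
        here = os.path.dirname(os.path.abspath(__file__))
        with open(os.path.join(here, fname)) as f:
            match = re.search(r"^__version__ = '([^']+)'", f.read(), re.M)
        if match is None:
            raise RuntimeError('no __version__ found in %s' % fname)
        return match.group(1)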
sweeper/__init__.py

@@ -1,4 +1,4 @@
 from __future__ import absolute_import
-from .sweeper import Sweeper
+from .sweeper import file_dups

-__all__ = ['Sweeper']
+__all__ = ['file_dups', 'mv_file_dups', 'rm_file_dups', 'iter_file_dups']
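Reviewer note: as captured, the new line imports only file_dups while __all__ advertises four names, so `from sweeper import *` would fail on the other three. A sketch of an __init__.py consistent with that __all__ list; the wider import line is an assumption, since the diff shows only file_dups being imported:

.. code:: python

    # Sketch: import everything __all__ promises to export.
    from __future__ import absolute_import
    from .sweeper import (file_dups, mv_file_dups, rm_file_dups,
                          iter_file_dups)

    __all__ = ['file_dups', 'mv_file_dups', 'rm_file_dups', 'iter_file_dups']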
sweeper/sweeper.py

@@ -2,96 +2,52 @@
 # Author: Darko Poljak <darko.poljak@gmail.com>
 # License: GPLv3

-"""{0} {1}
+"""Sweeper.

-Usage: {0} [options] [<directory>...]
+Usage: sweeper.py [options] [<directory>...]

 Arguments:
     <directory> directory path to scan for files

 Options:
     -h, --help                                show this screen
     -v, --version                             show version and exit
-    -b <blocksize>, --block-size=<blocksize>  size of block used when reading
-                                              file's content [default: 4096]
-    -d <hashalgs>, --digest-algs=<hashalgs>   secure hash algorithm comma separated
-                                              list [default: sha1]
-                                              note that multiple hashes will slow
-                                              down sweeper
-    -a <action>, --action=<action>            action on duplicate files (pprint,
-                                              print, remove, move)
-                                              [default: pprint]
-                                              -remove removes duplicate files
-                                              except first or first with specified
-                                              directory prefix found
-                                              -move moves duplicate files to
-                                              duplicates directory, except first
-                                              or first with specified directory
-                                              prefix found
-                                              -print prints result dictionary where
-                                              keys are hash values and values are
-                                              list of duplicate file paths
-                                              -pprint prints sets of duplicate file
-                                              paths each in its line where sets
-                                              are separated by blank newline
+    -b <blocksize>, --block-size=<blocksize>  size of block used when reading file's
+                                              content [default: 4096]
+    -d <hashalg>, --digest-alg=<hashalg>      secure hash algorithm [default: md5]
+    -a <action>, --action=<action>            action on duplicate files
+                                              (print, remove, move) [default: print]
     -m <directory>, --move=<directory>        move duplicate files to directory
-                                              (used with move action)
-                                              [default: ./dups]
-    -k <dirprefix>, --keep=<dirprefix>        directory prefix for remove and move
-                                              actions
-    -s, --simulate                            if action is remove or move just
-                                              simulate action by printing, do not
-                                              actually perform the action
-    -V, --verbose                             print more info
-                                              note that verbosity will slow down
-                                              sweeper due to text printing and
-                                              gathering additional information
-    -S, --safe-mode                           enable safe mode: compare hash
-                                              duplicate files byte by byte too
-                                              note that it will further slow down
-                                              sweeper but will overcome hash
-                                              collisions (although this is
-                                              unlikely)
+                                              (used with move action) [default: ./dups]
 """

 from __future__ import print_function

 __author__ = 'Darko Poljak <darko.poljak@gmail.com>'
-__version__ = '0.9.0'
+__version__ = '0.2.0'
 __license__ = 'GPLv3'

-__all__ = ['Sweeper']
+__all__ = [
+    'file_dups', 'rm_file_dups', 'mv_file_dups', 'iter_file_dups'
+]

 import sys
+import hashlib
 import os
 from collections import defaultdict
-from functools import partial
-import hashlib


 # some differences in python versions
 # we prefer iter methods
 if sys.version_info[0] == 3:
     def _do_encode(buf):
         return buf

     def _dict_iter_items(d):
         return d.items()

     def _dict_iter_keys(d):
         return d.keys()

     def _dict_iter_values(d):
         return d.values()
 else:
     def _do_encode(buf):
         return buf

     def _dict_iter_items(d):
         return d.iteritems()

     def _dict_iter_keys(d):
         return d.iterkeys()

     def _dict_iter_values(d):
         return d.itervalues()

     range = xrange


 def _filehash(filepath, hashalg, block_size):
     """Calculate secure hash for given file content using
@@ -101,254 +57,56 @@ def _filehash(filepath, hashalg, block_size):
     md = hashlib.new(hashalg)
     with open(filepath, "rb") as f:
         for buf in iter(lambda: f.read(block_size), b''):
-            md.update(buf)
+            md.update(_do_encode(buf))
     return md.hexdigest()


-def _uniq_list(list_):
-    return list(set(list_))
-
-
-def _fbequal(fpath1, fpath2):
-    '''Compare files byte by byte. If files are equal return True,
-    False otherwise.
-    fpath1 and fpath2 are file paths.
-    '''
-    # nested to work with 2.6
-    with open(fpath1, "rb") as f1:
-        with open(fpath2, "rb") as f2:
-            while True:
-                b1 = f1.read(1)
-                b2 = f2.read(1)
-                if b1 != b2:  # different bytes
-                    return False
-                if not b1 or not b2:  # end in one or both files
-                    break
-    if not b1 and not b2:  # end in both files, files are equal
-        return True
-    # end in one file but not in the other, files aren't equal
-    return False
-
-
-def _remap_keys_to_str(d):
-    '''Iterator that remaps dictionary keys to string in case keys are tuple
-    or list. Leave it unchanged otherwise.
-    Yields string key, value pairs.
-    '''
-    for k in _dict_iter_keys(d):
-        if isinstance(k, tuple) or isinstance(k, list):
-            key = ','.join(k)
-        else:
-            key = k
-        yield (key, d[k])
-
-
-def _gather_file_list(dirs):
-    '''Gather file paths in directory list dirs.
-    Return tuple (count, files) where count is files
-    list length and files is list of file paths in
-    specified directories.
-    '''
-    files = []
-    for dir_ in dirs:
-        for dirpath, dirnames, filenames in os.walk(dir_):
-            # replace fpath with realpath value (eliminate symbolic links)
-            files.extend([os.path.realpath(os.path.join(dirpath, fname))
-                          for fname in filenames])
-    return files
-
-
-class Sweeper(object):
-    DEF_HASHALGS = ['sha1']
-
-    def __init__(self, topdirs=['./'], hashalgs=DEF_HASHALGS,
-                 block_size=4096, verbose=False, safe_mode=False):
-        # replace dir paths with realpath value (eliminate symbolic links)
-        self.topdirs = []
-        for i in range(len(topdirs)):
-            self.topdirs.append(os.path.realpath(topdirs[i]))
-        self.hashalgs = hashalgs
-        self.block_size = block_size
-        self.verbose = verbose
-        self.safe_mode = safe_mode
-
-    # iter through file paths in files list
-    def _files_iter_from_list(self, files):
-        return (fpath for fpath in files)
-
-    # iter through file paths by os.walking
-    def _files_iter_from_disk(self):
-        for topdir in self.topdirs:
-            for dirpath, dirnames, filenames in os.walk(topdir):
-                for fname in filenames:
-                    # replace fpath with realpath value
-                    # (eliminate symbolic links)
-                    fpath = os.path.realpath(os.path.join(dirpath, fname))
-                    yield fpath
-
-    def file_dups(self):
-        """Find duplicate files in directory list. Return dictionary
-        with keys equal to file hash value and value as list of
-        file paths whose content is the same.
-        If safe_mode is true then you want to play safe: do byte
-        by byte comparison for hash duplicate files.
-        """
-        dups = defaultdict(list)
-        if self.verbose:
-            if self.safe_mode:
-                print('safe mode is on')
-            print('gathering and counting files...', end='')
-            sys.stdout.flush()
-            files = _gather_file_list(self.topdirs)
-            count = len(files)
-            current = 1
-            print(count)
-            _files_iter = partial(self._files_iter_from_list, files)
-        else:
-            _files_iter = self._files_iter_from_disk
-
-        for fpath in _files_iter():
-            if self.verbose:
-                print('\rprocessing file {0}/{1}: calc hash'.format(current,
-                                                                    count),
-                      end='')
-                sys.stdout.flush()
-            hexmds = [_filehash(fpath, h, self.block_size)
-                      for h in self.hashalgs]
-            hexmd = tuple(hexmds)
-            dup_files = dups[hexmd]
-            files_equals = False
-            if self.safe_mode:
-                if dup_files:
-                    if self.verbose:
-                        print('\rprocessing file {0}/{1}: byte cmp'.format(
-                            current, count), end='')
-                        sys.stdout.flush()
-                    for f in dup_files:
-                        if _fbequal(f, fpath):
-                            files_equals = True
-                            break
-                    if self.verbose and not files_equals:
-                        print('\nsame hash value {} but not same bytes for'
-                              ' file {} with files {}'.format(
-                                  hexmd, fpath, dup_files))
-                else:  # when list is empty in safe mode
-                    files_equals = True
-            else:
-                files_equals = True  # when safe mode is off
-            if self.verbose:
-                current += 1
-            if files_equals:
+def file_dups(topdirs=['./'], hashalg='md5', block_size=4096):
+    """Find duplicate files in directory list. Return dictionary
+    with keys equal to file hash value and value as list of
+    file paths whose content is the same.
+    """
+    dups = defaultdict(list)
+    for topdir in topdirs:
+        for dirpath, dirnames, filenames in os.walk(topdir):
+            for fname in filenames:
+                fpath = os.path.join(dirpath, fname)
+                hexmd = _filehash(fpath, hashalg, block_size)
                 dups[hexmd].append(fpath)
+    result = {k: v for k, v in _dict_iter_items(dups) if len(v) > 1}
+    return result

-        if self.verbose:
-            print('')
-        # make result dict with unique file paths list
-        result = {}
-        for k, v in _dict_iter_items(dups):
-            uniq_v = _uniq_list(v)
-            if len(uniq_v) > 1:
-                result[k] = uniq_v
-        return result
-
-    def __iter__(self):
-        """Find duplicate files in directory list.
-        Yield tuple of file path, hash tuple and list of duplicate files
-        as soon as duplicate file is found.
-        Newly found file is not included in the list at the yield time,
-        but is appended later before next yield.
-        This means that not all duplicate files are returned with any
-        return value. Same hash value and sublist could be returned later
-        if file with same content is found.
-        If safe_mode is true then you want to play safe: do byte
-        by byte comparison for hash duplicate files.
-        """
-        # internally, file dups dict is still maintained
-        dups = defaultdict(list)
-        _files_iter = self._files_iter_from_disk

+def rm_file_dups(topdirs=['./'], hashalg='md5', block_size=4096):
+    """Remove duplicate files found in specified directory list.
+    First file in list is kept.
+    """
+    for files in do_with_file_dups(topdirs, hashalg, block_size):
+        for f in files:
+            os.remove(f)

-        for fpath in _files_iter():
-            hexmds = [_filehash(fpath, h, self.block_size)
-                      for h in self.hashalgs]
-            hexmd = tuple(hexmds)
-            dup_files = dups[hexmd]
-            # there were dup list elements (used for yield)
-            if self.safe_mode and dup_files:
-                # compare only with first file in dup_files
-                # all files in dup_files list are already content equal
-                files_equals = _fbequal(dup_files[0], fpath)
-            else:  # when list is empty in safe mode or when safe mode is off
-                files_equals = True
-            if files_equals:
-                # yield only if current dup files list isn't empty
-                if dup_files:
-                    yield (fpath, hexmd, dups[hexmd])
-                # finally append newly found file to dup list
-                dups[hexmd].append(fpath)
-
-    def _extract_files_for_action(self, keep_prefix):
-        dups = self.file_dups()
-        for files in _dict_iter_values(dups):
-            found = False
-            if keep_prefix:
-                result = []
-                for f in files:
-                    if f.startswith(keep_prefix) and not found:
-                        found = True
-                    else:
-                        result.append(f)
-            if not found:
-                result = list(files)[1:]
-            yield (files, result)

+def mv_file_dups(topdirs=['./'], hashalg='md5', block_size=4096, dest_dir='dups'):
+    """Move duplicate files found in specified directory list.
+    First file in list is kept in the original directory.
+    """
+    if not os.path.exists(dest_dir):
+        os.mkdir(dest_dir)
+    if not os.path.isdir(dest_dir):
+        raise OSError('%s is not a directory' % dest_dir)
+    import shutil
+    for files in do_with_file_dups(topdirs, hashalg, block_size):
+        for i, f in enumerate(files):
+            if i > 0:
+                shutil.move(f, dest_dir)

-    def _do_action(self, simulate, keep_prefix, action, action_str):
-        for dups, extracted in self._extract_files_for_action(keep_prefix):
-            if simulate or self.verbose:
-                print('found duplicates: \n{}'.format(dups))
-            for f in extracted:
-                if simulate or self.verbose:
-                    print(action_str.format(f))
-                if not simulate:
-                    action(f)
-
-    def rm(self, simulate=False, keep_prefix=None):
-        """Remove duplicate files found in specified directory list.
-        If keep_prefix is specified then first file with that path
-        prefix found is kept in the original directory.
-        Otherwise first file in list is kept in the original directory.
-        If simulate is True then only print the action, do not actually
-        perform it.
-        If safe_mode is true then do byte by byte comparison for
-        hash duplicate files.
-        """
-        self._do_action(simulate, keep_prefix, os.remove, 'rm {}')
-
-    def mv(self, dest_dir='dups', simulate=False, keep_prefix=None):
-        """Move duplicate files found in specified directory list.
-        If keep_prefix is specified then first file with that path
-        prefix found is kept in the original directory.
-        Otherwise first file in list is kept in the original directory.
-        If simulate is True then only print the action, do not actually
-        perform it.
-        If safe_mode is true then do byte by byte comparison for
-        hash duplicate files.
-        """
-        import shutil
-
-        if not os.path.exists(dest_dir):
-            if simulate:
-                print('mkdir {}'.format(dest_dir))
-            else:
-                os.mkdir(dest_dir)
-        elif not os.path.isdir(dest_dir):
-            errmsg = '{} is not a directory'.format(dest_dir)
-            if simulate:
-                print('would raise:', errmsg)
-            else:
-                raise OSError(errmsg)
-        self._do_action(simulate, keep_prefix,
-                        partial(shutil.move, dst=dest_dir),
-                        'mv {0} to ' + dest_dir)

+def iter_file_dups(topdirs=['./'], hashalg='md5', block_size=4096):
+    """Yield list of duplicate files when found in specified directory list.
+    """
+    dups = file_dups(topdirs, hashalg, block_size)
+    for fpaths in dups.itervalues():
+        yield fpaths


 def main():
@@ -357,58 +115,32 @@ def main():
     import json
     from docopt import docopt

-    args = docopt(__doc__.format(sys.argv[0], __version__),
-                  version=" ".join(('sweeper', __version__)))
+    args = docopt(__doc__)

     topdirs = args['<directory>']
     if not topdirs:
         topdirs = ['./']

     action = args['--action']
-    verbose = args['--verbose']

     # set block size as int
     try:
         bs = int(args['--block-size'])
         args['--block-size'] = bs
     except ValueError:
-        print('Invalid block size "{}"'.format(args['--block-size']))
+        print('Invalid block size "%s"' % args['--block-size'])
         sys.exit(1)
-    hashalgs = args['--digest-algs'].split(',')
-    hashalgs_uniq = _uniq_list(hashalgs)
-    if len(hashalgs) != len(hashalgs_uniq):
-        print('Duplicate hash algorithms specified')
-        sys.exit(1)
-    block_size = args['--block-size']
-    simulate = args['--simulate']
-    keep_prefix = args['--keep']
-    dest_dir = args['--move']
-    safe_mode = args['--safe-mode']
-
-    sweeper = Sweeper(topdirs=topdirs, hashalgs=hashalgs,
-                      block_size=block_size, verbose=verbose,
-                      safe_mode=safe_mode)
-    if action == 'print' or action == 'pprint':
-        dups = sweeper.file_dups()
-        # defaultdict(list) -> dict
-        spam = dict(dups)
-        if spam:
-            if action == 'pprint':
-                for _, fpaths in _dict_iter_items(spam):
-                    for path in fpaths:
-                        print(path)
-                    if fpaths:
-                        print('')
-            else:
-                print(json.dumps({k: v for k, v in _remap_keys_to_str(spam)},
-                                 indent=4))
+    if action == 'print':
+        dups = file_dups(topdirs, args['--digest-alg'], args['--block-size'])
+        print(json.dumps(dict(dups), indent=4))
     elif action == 'move':
-        sweeper.mv(dest_dir, simulate, keep_prefix)
+        mv_file_dups(topdirs, args['--digest-alg'], args['--block-size'],
+                     args['--move'])
     elif action == 'remove':
-        sweeper.rm(simulate, keep_prefix)
+        rm_file_dups(topdirs, args['--digest-alg'], args['--block-size'])
     else:
-        print('Invalid action "{}"'.format(action))
+        print('Invalid action "%s"' % action)


 # if used as script call main function
 if __name__ == '__main__':
     main()
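Reviewer note: on the v0.2.0 side, rm_file_dups and mv_file_dups call do_with_file_dups, which is not defined anywhere in this diff, and iter_file_dups relies on dups.itervalues(), which only exists on Python 2. A sketch of what such a helper would have to look like; it is hypothetical, and it assumes full duplicate groups are yielded, which matches mv_file_dups's enumerate-and-skip logic (rm_file_dups, as written, would then also remove the first file of each group despite its docstring):

.. code:: python

    # Hypothetical helper, not present in the diff: yield each group of
    # identical files in full and let callers decide which member to keep.
    # _dict_iter_values() keeps it portable across Python 2 and 3, unlike
    # the itervalues() call in iter_file_dups above.
    def do_with_file_dups(topdirs=['./'], hashalg='md5', block_size=4096):
        dups = file_dups(topdirs, hashalg, block_size)
        for fpaths in _dict_iter_values(dups):
            yield fpaths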
BIN  test/__init__.pyc  (new file)
Binary file not shown.
test/test_sweeper.py

@@ -3,7 +3,7 @@
 # License: GPLv3

 import unittest
-from sweeper import Sweeper
+from sweeper import file_dups
 import os

 mydir = os.path.dirname(os.path.realpath(__file__))

@@ -11,8 +11,7 @@ mydir = os.path.dirname(os.path.realpath(__file__))

 class TestSweeper(unittest.TestCase):
     def test_file_dups_dups(self):
-        swp = Sweeper(topdirs=[os.path.join(mydir, 'testfiles_dups')])
-        dups = swp.file_dups()
+        dups = file_dups([os.path.join(mydir, 'testfiles_dups')], 'md5')
         dups_exist = False
         for h, flist in dups.items():
             if len(flist) > 1:

@@ -20,40 +19,10 @@ class TestSweeper(unittest.TestCase):
         self.assertTrue(dups_exist)

     def test_file_dups_nodups(self):
-        swp = Sweeper(topdirs=[os.path.join(mydir, 'testfiles_nodups')])
-        dups = swp.file_dups()
+        dups = file_dups([os.path.join(mydir, 'testfiles_nodups')], 'md5')
         for h, flist in dups.items():
             self.assertTrue(len(flist) == 1)

-    # does not actually test safe_mode, we would need to find
-    # hash collision
-    def test_file_dups_safe_mode(self):
-        swp = Sweeper(topdirs=[os.path.join(mydir, 'testfiles_dups')],
-                      safe_mode=True)
-        dups = swp.file_dups()
-        for h, flist in dups.items():
-            if len(flist) > 1:
-                dups_exist = True
-        self.assertTrue(dups_exist)
-
-    def test_iter_file_dups_dups(self):
-        swp = Sweeper(topdirs=[os.path.join(mydir, 'testfiles_dups')])
-        dups_exist = False
-        for x in swp:
-            dups_exist = True
-            filepath, h, dups = x
-            self.assertNotIn(filepath, dups)
-            self.assertTrue(len(dups) > 0)
-        self.assertTrue(dups_exist)
-
-    def test_iter_file_dups_nodups(self):
-        swp = Sweeper([os.path.join(mydir, 'testfiles_nodups')])
-        dups_exist = False
-        for x in swp:
-            dups_exist = True
-            break
-        self.assertFalse(dups_exist)
-

 if __name__ == '__main__':
     unittest.main()
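Reviewer note: a way to run this test module on either side of the diff, sketched under the assumption that the testfiles_dups/ and testfiles_nodups/ fixture directories sit next to the test file, as the os.path.join(mydir, ...) calls imply:

.. code:: python

    # Discover and run the tests under test/; assumes the fixture
    # directories are present alongside test_sweeper.py.
    import unittest

    suite = unittest.defaultTestLoader.discover('test', pattern='test_*.py')
    unittest.TextTestRunner(verbosity=2).run(suite)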