Commit 1f4cb3e1 authored by darko-poljak's avatar darko-poljak


parent 30620884
......@@ -18,17 +18,17 @@ Remove duplicate files
.. code:: python
from sweeper import file_dups
from sweeper import rm_file_dups
Perform custom action
.. code:: python
from sweeper import file_dups
from sweeper import iter_file_dups
for files in iter_file_dups(['images']):
for fname in files:
print('found duplicate file with name: %s' % fname)
print('found duplicate file with name: {}'.format(fname))
As script::
......@@ -6,7 +6,7 @@ def read(fname):
author='Darko Poljak',
description='Find duplicate files and perform action.',
from __future__ import absolute_import
from .sweeper import file_dups
from .sweeper import file_dups, mv_file_dups, rm_file_dups, iter_file_dups
__all__ = ['file_dups', 'mv_file_dups', 'rm_file_dups', 'iter_file_dups']
......@@ -2,7 +2,7 @@
# Author: Darko Poljak <>
# License: GPLv3
"""sweeper 0.4.0
Usage: [options] [<directory>...]
......@@ -11,18 +11,31 @@ Arguments:
-h, --help show this screen
-v, --version show version and exit
-b <blocksize>, --block-size=<blocksize> size of block used when reading
file's content [default: 4096]
-d <hashalg>, --digest-alg=<hashalg> secure hash algorithm [default: md5]
-a <action>, --action=<action> action on duplicate files (print,
remove, move) [default: print]
-a <action>, --action=<action> action on duplicate files (pprint,
print, remove, move)
[default: pprint]
-remove removes duplicate files
except first found
-move moves duplicate files to
                                             duplicates directory, except first
    -print                                   prints result dictionary where
                                             keys are hash values and values are
                                             lists of duplicate file paths
    -pprint                                  prints sets of duplicate file
                                             paths, each on its own line, with
                                             sets separated by a blank line
-m <directory>, --move=<directory> move duplicate files to directory
(used with move action)
[default: ./dups]
__author__ = 'Darko Poljak <>'
__version__ = '0.3.0'
__version__ = '0.4.0'
__license__ = 'GPLv3'
__all__ = [
......@@ -95,7 +108,7 @@ def mv_file_dups(topdirs=['./'], hashalg='md5', block_size=4096,
if not os.path.exists(dest_dir):
if not os.path.isdir(dest_dir):
raise OSError('%s is not a directory' % dest_dir)
raise OSError('{} is not a directory'.format(dest_dir))
import shutil
for files in do_with_file_dups(topdirs, hashalg, block_size):
for i, f in enumerate(files):
......@@ -103,12 +116,18 @@ def mv_file_dups(topdirs=['./'], hashalg='md5', block_size=4096,
shutil.move(f, dest_dir)
def iter_file_dups(topdirs=['./'], hashalg='md5', block_size=4096):
"""Yield list of duplicate files when found in specified directory list.
def iter_file_dups(topdirs=['./'], rethash=False, hashalg='md5',
"""Yield duplicate files when found in specified directory list.
If rethash is True then tuple hash value and duplicate paths list is
returned, otherwise duplicate paths list is returned.
dups = file_dups(topdirs, hashalg, block_size)
for fpaths in dups.itervalues():
yield fpaths
for hash_, fpaths in _dict_iter_items(dups):
if rethash:
yield (hash_, fpaths)
yield fpaths
def main():
......@@ -128,21 +147,31 @@ def main():
bs = int(args['--block-size'])
args['--block-size'] = bs
except ValueError:
print('Invalid block size "%s"' % args['--block-size'])
print('Invalid block size "{}"'.format(args['--block-size']))
if action == 'print':
if args['--version']:
print("sweeper {}".format(__version__))
if action == 'print' or action == 'pprint':
dups = file_dups(topdirs, args['--digest-alg'], args['--block-size'])
spam = dict(dups)
if spam:
print(json.dumps(spam, indent=4))
if action == 'pprint':
for h, fpaths in _dict_iter_items(spam):
for path in fpaths:
print(json.dumps(spam, indent=4))
elif action == 'move':
mv_file_dups(topdirs, args['--digest-alg'], args['--block-size'],
elif action == 'remove':
rm_file_dups(topdirs, args['--digest-alg'], args['--block-size'])
print('Invalid action "%s"' % action)
print('Invalid action "{}"'.format(action))
# if used as script call main function
......@@ -3,7 +3,7 @@
# License: GPLv3
import unittest
from sweeper import file_dups
from sweeper import file_dups, iter_file_dups
import os
mydir = os.path.dirname(os.path.realpath(__file__))
......@@ -23,6 +23,15 @@ class TestSweeper(unittest.TestCase):
for h, flist in dups.items():
self.assertTrue(len(flist) == 1)
def test_iter_fule_dups_rethash(self):
for item in iter_file_dups([os.path.join(mydir, 'testfiles_dups')],
self.assertTrue(type(item).__name__ == 'tuple')
def test_iter_fule_dups_norethash(self):
for item in iter_file_dups([os.path.join(mydir, 'testfiles_dups')]):
self.assertTrue(type(item).__name__ == 'list')
if __name__ == '__main__':
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment