From 9e714732fbceabec9d77085085437f6d01f02ba8 Mon Sep 17 00:00:00 2001
From: Darko Poljak
Date: Thu, 6 Feb 2014 14:43:54 +0100
Subject: [PATCH] added file_dups_immediate function

---
 sweeper/sweeper.py | 45 ++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 44 insertions(+), 1 deletion(-)

diff --git a/sweeper/sweeper.py b/sweeper/sweeper.py
index 9a1ef9d..67e5eb0 100644
--- a/sweeper/sweeper.py
+++ b/sweeper/sweeper.py
@@ -61,7 +61,8 @@
 __version__ = '0.4.1'
 __license__ = 'GPLv3'
 __all__ = [
-    'file_dups', 'rm_file_dups', 'mv_file_dups', 'iter_file_dups'
+    'file_dups', 'rm_file_dups', 'mv_file_dups', 'iter_file_dups',
+    'file_dups_immediate'
 ]
 
 import sys
@@ -228,6 +229,48 @@ def file_dups(topdirs=['./'], hashalgs=['md5'], block_size=4096, verbose=False,
     return result
 
 
+def file_dups_immediate(topdirs=['./'], hashalgs=['md5'], block_size=4096,
+                        safe_mode=False):
+    """Find duplicate files in the given directory list, as an iterator.
+    Yield a (file path, hash tuple, duplicate files list) tuple as soon
+    as a duplicate file is found (the newly found file is included in
+    the list).
+    This means that not all duplicates of a file are yielded at once:
+    the same hash tuple, with a longer list, may be yielded again later
+    when another file with the same content is found.
+    If safe_mode is true then play it safe: do a byte-by-byte
+    comparison of files whose hash values match.
+    """
+    # internally, the file dups dict is still maintained
+    dups = defaultdict(list)
+    # replace dir paths with realpath values (eliminate symbolic links);
+    # build a new list so the caller's list/default is not mutated
+    topdirs = [os.path.realpath(d) for d in topdirs]
+    _files_iter = partial(_files_iter_from_disk, topdirs)
+
+    for fpath in _files_iter():
+        hexmds = [_filehash(fpath, h, block_size) for h in hashalgs]
+        hexmd = tuple(hexmds)
+        dup_files = dups[hexmd]
+        # remember whether duplicates were already present (used for yield)
+        had_dup_list = bool(dup_files)
+        files_equal = False
+        if safe_mode:
+            if dup_files:
+                for f in dup_files:
+                    if _fbequal(f, fpath):
+                        files_equal = True
+                        break
+            else:  # the list is empty in safe mode
+                files_equal = True
+        else:
+            files_equal = True  # safe mode is off
+        if files_equal:
+            dups[hexmd].append(fpath)
+        if files_equal and had_dup_list:
+            yield (fpath, hexmd, dups[hexmd])
+
+
 def _extract_files_for_action(topdirs, hashalgs, block_size, keep_prefix,
                               verbose, safe_mode):
     for files in iter_file_dups(topdirs=topdirs, hashalgs=hashalgs,
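
For reference, a minimal consumer of the new generator might look like the
sketch below. This is an illustration, not part of the patch: it assumes the
function is imported straight from the patched sweeper.sweeper module and
that topdirs names readable directories.

    from sweeper.sweeper import file_dups_immediate

    # Each yielded tuple reports a duplicate the moment it is detected;
    # the same hash tuple can show up again later with a longer list
    # once further copies are found.
    for fpath, hexmd, dup_list in file_dups_immediate(topdirs=['./'],
                                                      hashalgs=['md5'],
                                                      safe_mode=True):
        # dup_list includes fpath itself as its last element
        print('%s duplicates %s (hash: %s)' % (fpath, dup_list[:-1], hexmd))

Because results stream out while the tree walk is still in progress, a
caller can act on early duplicates without waiting for the whole scan, at
the cost of seeing the same hash group more than once.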
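
The safe_mode branch leans on the module's _fbequal helper, which this hunk
does not show. As a rough idea of what such a byte-by-byte check does, here
is a hypothetical stand-in (an assumption about its behavior, not sweeper's
actual code):

    def files_byte_equal(path1, path2, block_size=4096):
        # Hypothetical stand-in for _fbequal: compare two files block by
        # block and report True only when their contents are identical.
        with open(path1, 'rb') as f1, open(path2, 'rb') as f2:
            while True:
                b1 = f1.read(block_size)
                b2 = f2.read(block_size)
                if b1 != b2:
                    return False
                if not b1:  # both files exhausted at the same point
                    return True

This is why safe_mode costs extra reads: every hash collision triggers a
content comparison against the already-collected duplicates, stopping at
the first match.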