added file_dups_immediate function
Commit 9e714732fb (parent 5040c33f96)
1 changed file with 44 additions and 1 deletion
@@ -61,7 +61,8 @@ __version__ = '0.4.1'
 __license__ = 'GPLv3'
 
 __all__ = [
-    'file_dups', 'rm_file_dups', 'mv_file_dups', 'iter_file_dups'
+    'file_dups', 'rm_file_dups', 'mv_file_dups', 'iter_file_dups',
+    'file_dups_immediate'
 ]
 
 import sys
@@ -228,6 +229,48 @@ def file_dups(topdirs=['./'], hashalgs=['md5'], block_size=4096, verbose=False,
     return result
 
 
+def file_dups_immediate(topdirs=['./'], hashalgs=['md5'], block_size=4096,
+                        safe_mode=False):
+    """Iterate over the directory list and find duplicate files.
+    Yield a tuple of (file path, hash tuple, list of duplicate files)
+    as soon as a duplicate file is found (the newly found file is
+    included in the list).
+    This means that not all duplicate files are returned at once:
+    the same hash value and a grown sublist may be yielded again
+    later if another file with the same content is found.
+    If safe_mode is true, play it safe: compare files whose hashes
+    match byte by byte.
+    """
+    # internally, the file dups dict is still maintained
+    dups = defaultdict(list)
+    # replace dir paths with their realpath value (eliminates symbolic links)
+    for i in range(len(topdirs)):
+        topdirs[i] = os.path.realpath(topdirs[i])
+    _files_iter = partial(_files_iter_from_disk, topdirs)
+
+    for fpath in _files_iter():
+        hexmds = [_filehash(fpath, h, block_size) for h in hashalgs]
+        hexmd = tuple(hexmds)
+        dup_files = dups[hexmd]
+        # whether the dup list already had elements (used for the yield)
+        had_dup_list = bool(dup_files)
+        files_equals = False
+        if safe_mode:
+            if dup_files:
+                for f in dup_files:
+                    if _fbequal(f, fpath):
+                        files_equals = True
+                        break
+            else:  # when the list is empty in safe mode
+                files_equals = True
+        else:
+            files_equals = True  # when safe mode is off
+        if files_equals:
+            dups[hexmd].append(fpath)
+        if files_equals and had_dup_list:
+            yield (fpath, hexmd, dups[hexmd])
+
+
 def _extract_files_for_action(topdirs, hashalgs, block_size, keep_prefix,
                               verbose, safe_mode):
     for files in iter_file_dups(topdirs=topdirs, hashalgs=hashalgs,
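For context, not part of the commit: file_dups_immediate is a generator, so callers can act on duplicates while the directory scan is still running instead of waiting for the full file_dups result dict. A minimal usage sketch; the module name dups and the example directory are assumptions:

# Usage sketch (not from this commit); importing as 'dups' is assumed.
from dups import file_dups_immediate

for fpath, hexmd, dup_files in file_dups_immediate(topdirs=['./photos'],
                                                   safe_mode=True):
    # dup_files already includes fpath; the same hash may be yielded
    # again later with a longer list if more identical files turn up.
    print('%s matches %d earlier file(s), hash %s'
          % (fpath, len(dup_files) - 1, hexmd[0]))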
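The new function relies on two helpers defined elsewhere in the module and not shown in this diff: _filehash, which hashes a single file, and _fbequal, which does the byte-by-byte comparison used in safe mode. Minimal sketches, inferred only from how the helpers are called above; the bodies below are assumptions, not the module's actual code:

import hashlib


def _filehash(fpath, hashalg, block_size):
    # Sketch: hash the file in block_size chunks so large files never
    # have to be loaded into memory in one piece.
    md = hashlib.new(hashalg)
    with open(fpath, 'rb') as f:
        for block in iter(lambda: f.read(block_size), b''):
            md.update(block)
    return md.hexdigest()


def _fbequal(fpath1, fpath2):
    # Sketch: byte-by-byte comparison, True only when both files have
    # exactly the same content (this rules out hash collisions).
    with open(fpath1, 'rb') as f1, open(fpath2, 'rb') as f2:
        while True:
            b1 = f1.read(4096)
            b2 = f2.read(4096)
            if b1 != b2:
                return False
            if not b1:  # both files exhausted at the same offset
                return True

The standard library's filecmp.cmp(fpath1, fpath2, shallow=False) would serve the same purpose as the _fbequal sketch.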