cleanups and version inc

This commit is contained in:
Darko Poljak 2014-02-07 08:29:54 +01:00
parent 9e714732fb
commit d2186f1f26
1 changed files with 43 additions and 46 deletions

View File

@ -57,12 +57,11 @@ Options:
from __future__ import print_function
__author__ = 'Darko Poljak <darko.poljak@gmail.com>'
__version__ = '0.4.1'
__version__ = '0.5.0'
__license__ = 'GPLv3'
__all__ = [
'file_dups', 'rm_file_dups', 'mv_file_dups', 'iter_file_dups',
'file_dups_immediate'
'file_dups', 'rm_file_dups', 'mv_file_dups', 'iter_file_dups'
]
import sys
@ -80,6 +79,9 @@ if sys.version_info[0] == 3:
def _dict_iter_keys(d):
return d.keys()
def _dict_iter_values(d):
return d.values()
else:
def _dict_iter_items(d):
return d.iteritems()
@ -87,6 +89,9 @@ else:
def _dict_iter_keys(d):
return d.iterkeys()
def _dict_iter_values(d):
return d.itervalues()
range = xrange
@ -221,6 +226,7 @@ def file_dups(topdirs=['./'], hashalgs=['md5'], block_size=4096, verbose=False,
if verbose:
print('')
# make result dict with unique file paths list
result = {}
for k, v in _dict_iter_items(dups):
uniq_v = _uniq_list(v)
@ -229,14 +235,15 @@ def file_dups(topdirs=['./'], hashalgs=['md5'], block_size=4096, verbose=False,
return result
def file_dups_immediate(topdirs=['./'], hashalgs=['md5'], block_size=4096,
safe_mode=False):
"""Find duplicate files in directory list iterator.
def iter_file_dups(topdirs=['./'], hashalgs=['md5'], block_size=4096,
safe_mode=False):
"""Find duplicate files in directory list.
Yield tuple of file path, hash tuple and list of duplicate files
as soon as duplicate file is found (newly found file is
included in the list).
This means that not all duplicate files are returned.
Same hash value and sublist could be returned later
as soon as duplicate file is found.
Newly found file is not included in the list at the yield time,
but is appended later before next yield.
This means that not all duplicate files are returned with any
return value. Same hash value and sublist could be returned later
if file with same content is found.
If safe_mode is true then you want to play safe: do byte
by byte comparison for hash duplicate files.
@ -253,29 +260,25 @@ def file_dups_immediate(topdirs=['./'], hashalgs=['md5'], block_size=4096,
hexmd = tuple(hexmds)
dup_files = dups[hexmd]
# there were dup list elements (used for yield)
had_dup_list = True if dup_files else False
files_equals = False
if safe_mode:
if dup_files:
for f in dup_files:
if _fbequal(f, fpath):
files_equals = True
break
else: # when list is empty in safe mode
files_equals = True
else:
files_equals = True # when safe mode is off
if safe_mode and dup_files:
# compare only with first file in dup_files
# all files in dup_files list are already content equal
files_equals = _fbequal(dup_files[0], fpath)
else: # when list is emtpy in safe mode or when safe mode is off
files_equals = True
if files_equals:
# yield only if current dup files list isn't empty
if dup_files:
yield (fpath, hexmd, dups[hexmd])
# finally append newly found file to dup list
dups[hexmd].append(fpath)
if files_equals and had_dup_list:
yield (fpath, hexmd, dups[hexmd])
def _extract_files_for_action(topdirs, hashalgs, block_size, keep_prefix,
verbose, safe_mode):
for files in iter_file_dups(topdirs=topdirs, hashalgs=hashalgs,
block_size=block_size, verbose=verbose,
safe_mode=safe_mode):
for files in _dict_iter_values(file_dups(topdirs=topdirs,
hashalgs=hashalgs, block_size=block_size,
verbose=verbose, safe_mode=safe_mode)):
found = False
if keep_prefix:
result = []
@ -325,11 +328,19 @@ def mv_file_dups(topdirs=['./'], hashalgs=['md5'], block_size=4096,
If safe_mode is true then do byte by byte comparison for
hash duplicate files.
"""
if not os.path.exists(dest_dir):
os.mkdir(dest_dir)
if not os.path.isdir(dest_dir):
raise OSError('{} is not a directory'.format(dest_dir))
import shutil
if not os.path.exists(dest_dir):
if simulate:
print('mkdir {}'.format(dest_dir))
else:
os.mkdir(dest_dir)
elif not os.path.isdir(dest_dir):
errmsg = '{} is not a directory'.format(dest_dir)
if simulate:
print('would raise:', errmsg)
else:
raise OSError(errmsg)
for dups, extracted in _extract_files_for_action(topdirs, hashalgs,
block_size, keep_prefix,
verbose, safe_mode):
@ -342,20 +353,6 @@ def mv_file_dups(topdirs=['./'], hashalgs=['md5'], block_size=4096,
shutil.move(f, dest_dir)
def iter_file_dups(topdirs=['./'], rethash=False, hashalgs=['md5'],
block_size=4096, verbose=False, safe_mode=False):
"""Yield duplicate files when found in specified directory list.
If rethash is True then tuple hash value and duplicate paths list is
returned, otherwise duplicate paths list is returned.
"""
dups = file_dups(topdirs, hashalgs, block_size, verbose, safe_mode)
for hash_, fpaths in _dict_iter_items(dups):
if rethash:
yield (hash_, fpaths)
else:
yield fpaths
def _remap_keys_to_str(d):
'''Iterator that remaps dictionary keys to string in case keys are tuple
or list. Leave it unchanged otherwise.
@ -432,7 +429,7 @@ def main():
rm_file_dups(topdirs=topdirs, hashalgs=hashalgs,
block_size=block_size,
simulate=simulate,
keep_prefix=-keep_prefix,
keep_prefix=keep_prefix,
verbose=verbose,
safe_mode=safe_mode)
else: