cleanups and version inc

This commit is contained in:
Darko Poljak 2014-02-07 08:29:54 +01:00
parent 9e714732fb
commit d2186f1f26
1 changed file with 43 additions and 46 deletions

View File

@ -57,12 +57,11 @@ Options:
from __future__ import print_function from __future__ import print_function
__author__ = 'Darko Poljak <darko.poljak@gmail.com>' __author__ = 'Darko Poljak <darko.poljak@gmail.com>'
__version__ = '0.4.1' __version__ = '0.5.0'
__license__ = 'GPLv3' __license__ = 'GPLv3'
__all__ = [ __all__ = [
'file_dups', 'rm_file_dups', 'mv_file_dups', 'iter_file_dups', 'file_dups', 'rm_file_dups', 'mv_file_dups', 'iter_file_dups'
'file_dups_immediate'
] ]
import sys import sys
@ -80,6 +79,9 @@ if sys.version_info[0] == 3:
def _dict_iter_keys(d): def _dict_iter_keys(d):
return d.keys() return d.keys()
def _dict_iter_values(d):
return d.values()
else: else:
def _dict_iter_items(d): def _dict_iter_items(d):
return d.iteritems() return d.iteritems()
@ -87,6 +89,9 @@ else:
def _dict_iter_keys(d): def _dict_iter_keys(d):
return d.iterkeys() return d.iterkeys()
def _dict_iter_values(d):
return d.itervalues()
range = xrange range = xrange
@ -221,6 +226,7 @@ def file_dups(topdirs=['./'], hashalgs=['md5'], block_size=4096, verbose=False,
if verbose: if verbose:
print('') print('')
# make result dict with unique file paths list
result = {} result = {}
for k, v in _dict_iter_items(dups): for k, v in _dict_iter_items(dups):
uniq_v = _uniq_list(v) uniq_v = _uniq_list(v)
@ -229,14 +235,15 @@ def file_dups(topdirs=['./'], hashalgs=['md5'], block_size=4096, verbose=False,
return result return result
def file_dups_immediate(topdirs=['./'], hashalgs=['md5'], block_size=4096, def iter_file_dups(topdirs=['./'], hashalgs=['md5'], block_size=4096,
safe_mode=False): safe_mode=False):
"""Find duplicate files in directory list iterator. """Find duplicate files in directory list.
Yield tuple of file path, hash tuple and list of duplicate files Yield tuple of file path, hash tuple and list of duplicate files
as soon as duplicate file is found (newly found file is as soon as duplicate file is found.
included in the list). Newly found file is not included in the list at the yield time,
This means that not all duplicate files are returned. but is appended later before next yield.
Same hash value and sublist could be returned later This means that not all duplicate files are returned with any
return value. Same hash value and sublist could be returned later
if file with same content is found. if file with same content is found.
If safe_mode is true then you want to play safe: do byte If safe_mode is true then you want to play safe: do byte
by byte comparison for hash duplicate files. by byte comparison for hash duplicate files.
@ -253,29 +260,25 @@ def file_dups_immediate(topdirs=['./'], hashalgs=['md5'], block_size=4096,
hexmd = tuple(hexmds) hexmd = tuple(hexmds)
dup_files = dups[hexmd] dup_files = dups[hexmd]
# there were dup list elements (used for yield) # there were dup list elements (used for yield)
had_dup_list = True if dup_files else False if safe_mode and dup_files:
files_equals = False # compare only with first file in dup_files
if safe_mode: # all files in dup_files list are already content equal
if dup_files: files_equals = _fbequal(dup_files[0], fpath)
for f in dup_files: else: # when list is emtpy in safe mode or when safe mode is off
if _fbequal(f, fpath):
files_equals = True files_equals = True
break
else: # when list is empty in safe mode
files_equals = True
else:
files_equals = True # when safe mode is off
if files_equals: if files_equals:
dups[hexmd].append(fpath) # yield only if current dup files list isn't empty
if files_equals and had_dup_list: if dup_files:
yield (fpath, hexmd, dups[hexmd]) yield (fpath, hexmd, dups[hexmd])
# finally append newly found file to dup list
dups[hexmd].append(fpath)
def _extract_files_for_action(topdirs, hashalgs, block_size, keep_prefix, def _extract_files_for_action(topdirs, hashalgs, block_size, keep_prefix,
verbose, safe_mode): verbose, safe_mode):
for files in iter_file_dups(topdirs=topdirs, hashalgs=hashalgs, for files in _dict_iter_values(file_dups(topdirs=topdirs,
block_size=block_size, verbose=verbose, hashalgs=hashalgs, block_size=block_size,
safe_mode=safe_mode): verbose=verbose, safe_mode=safe_mode)):
found = False found = False
if keep_prefix: if keep_prefix:
result = [] result = []
@ -325,11 +328,19 @@ def mv_file_dups(topdirs=['./'], hashalgs=['md5'], block_size=4096,
If safe_mode is true then do byte by byte comparison for If safe_mode is true then do byte by byte comparison for
hash duplicate files. hash duplicate files.
""" """
if not os.path.exists(dest_dir):
os.mkdir(dest_dir)
if not os.path.isdir(dest_dir):
raise OSError('{} is not a directory'.format(dest_dir))
import shutil import shutil
if not os.path.exists(dest_dir):
if simulate:
print('mkdir {}'.format(dest_dir))
else:
os.mkdir(dest_dir)
elif not os.path.isdir(dest_dir):
errmsg = '{} is not a directory'.format(dest_dir)
if simulate:
print('would raise:', errmsg)
else:
raise OSError(errmsg)
for dups, extracted in _extract_files_for_action(topdirs, hashalgs, for dups, extracted in _extract_files_for_action(topdirs, hashalgs,
block_size, keep_prefix, block_size, keep_prefix,
verbose, safe_mode): verbose, safe_mode):
@ -342,20 +353,6 @@ def mv_file_dups(topdirs=['./'], hashalgs=['md5'], block_size=4096,
shutil.move(f, dest_dir) shutil.move(f, dest_dir)
def iter_file_dups(topdirs=['./'], rethash=False, hashalgs=['md5'],
block_size=4096, verbose=False, safe_mode=False):
"""Yield duplicate files when found in specified directory list.
If rethash is True then tuple hash value and duplicate paths list is
returned, otherwise duplicate paths list is returned.
"""
dups = file_dups(topdirs, hashalgs, block_size, verbose, safe_mode)
for hash_, fpaths in _dict_iter_items(dups):
if rethash:
yield (hash_, fpaths)
else:
yield fpaths
def _remap_keys_to_str(d): def _remap_keys_to_str(d):
'''Iterator that remaps dictionary keys to string in case keys are tuple '''Iterator that remaps dictionary keys to string in case keys are tuple
or list. Leave it unchanged otherwise. or list. Leave it unchanged otherwise.
@ -432,7 +429,7 @@ def main():
rm_file_dups(topdirs=topdirs, hashalgs=hashalgs, rm_file_dups(topdirs=topdirs, hashalgs=hashalgs,
block_size=block_size, block_size=block_size,
simulate=simulate, simulate=simulate,
keep_prefix=-keep_prefix, keep_prefix=keep_prefix,
verbose=verbose, verbose=verbose,
safe_mode=safe_mode) safe_mode=safe_mode)
else: else: