cache file paths when counting in verbose mode

This commit is contained in:
Darko Poljak 2014-02-06 08:07:06 +01:00
parent bd88491cb0
commit 7aaeb3e98f
1 changed files with 46 additions and 16 deletions

View File

@ -59,6 +59,7 @@ import sys
import hashlib
import os
from collections import defaultdict
from functools import partial
# some differences in python versions
@ -91,6 +92,36 @@ def _uniq_list(list_):
return result
def _gather_file_list(dirs):
'''Gather file paths in directory list dirs.
Return tuple (count, files) where count is files
list length and files is list of file paths in
specified directories.
'''
count = 0
files = []
for dir_ in dirs:
for dirpath, dirnames, filenames in os.walk(dir_):
count += len(filenames)
files += [os.path.join(dirpath, fname) for fname in filenames]
return (count, files)
# iter through file paths in files list
def _files_iter_from_list(files):
for fpath in files:
yield fpath
# iter through file paths by os.walking
def _files_iter_from_disk(topdirs):
for topdir in topdirs:
for dirpath, dirnames, filenames in os.walk(topdir):
for fname in filenames:
fpath = os.path.join(dirpath, fname)
yield fpath
def file_dups(topdirs=['./'], hashalg='md5', block_size=4096, verbose=False):
"""Find duplicate files in directory list. Return directory
with keys equal to file hash value and value as list of
@ -98,25 +129,24 @@ def file_dups(topdirs=['./'], hashalg='md5', block_size=4096, verbose=False):
"""
dups = defaultdict(list)
if verbose:
print('counting...', end='')
print('gathering and counting files...', end='')
sys.stdout.flush()
count = 0
for topdir in topdirs:
for _, _, filenames in os.walk(topdir):
count += len(filenames)
count, files = _gather_file_list(topdirs)
current = 1
print(count)
for topdir in topdirs:
for dirpath, dirnames, filenames in os.walk(topdir):
for fname in filenames:
if verbose:
print('\rprocessing file {0}/{1}'.format(current, count),
end='')
sys.stdout.flush()
current += 1
fpath = os.path.join(dirpath, fname)
hexmd = _filehash(fpath, hashalg, block_size)
dups[hexmd].append(fpath)
_files_iter = partial(_files_iter_from_list, files)
else:
_files_iter = partial(_files_iter_from_disk, topdirs)
for fpath in _files_iter():
if verbose:
print('\rprocessing file {0}/{1}'.format(current, count),
end='')
sys.stdout.flush()
current += 1
hexmd = _filehash(fpath, hashalg, block_size)
dups[hexmd].append(fpath)
if verbose:
print('')
result = {}