cache file paths when counting in verbose mode

This commit is contained in:
Darko Poljak 2014-02-06 08:07:06 +01:00
parent bd88491cb0
commit 7aaeb3e98f

View file

@ -59,6 +59,7 @@ import sys
import hashlib import hashlib
import os import os
from collections import defaultdict from collections import defaultdict
from functools import partial
# some differences in python versions # some differences in python versions
@ -91,6 +92,36 @@ def _uniq_list(list_):
return result return result
def _gather_file_list(dirs):
    """Gather file paths in directory list dirs.

    Every directory in dirs is traversed recursively with os.walk and
    the path of each file found is built by joining the directory that
    contains it with the file name.

    Return tuple (count, files) where files is the list of gathered
    file paths, in the order os.walk produced them, and count is the
    length of that list. A directory that is reachable from more than
    one entry of dirs contributes its files once per entry.
    """
    files = []
    for dir_ in dirs:
        # sub-directory names are not needed: os.walk descends by itself
        for dirpath, _, filenames in os.walk(dir_):
            files.extend(os.path.join(dirpath, fname) for fname in filenames)
    # derive the count from the list so the two can never disagree
    return (len(files), files)
# iter through file paths in files list
def _files_iter_from_list(files):
    """Generator over an already gathered collection of file paths.

    Yields the items of files one at a time, unchanged and in the
    order in which files provides them.
    """
    for path in files:
        yield path
# iter through file paths by os.walking
def _files_iter_from_disk(topdirs):
    """Generator over file paths found by walking the file system.

    Each directory in topdirs is traversed recursively with os.walk,
    and for every file name reported the containing directory joined
    with that name is yielded. Nothing is collected up front: paths
    are produced as the walk proceeds.
    """
    for root in topdirs:
        for parent, _subdirs, names in os.walk(root):
            for name in names:
                yield os.path.join(parent, name)
def file_dups(topdirs=['./'], hashalg='md5', block_size=4096, verbose=False): def file_dups(topdirs=['./'], hashalg='md5', block_size=4096, verbose=False):
"""Find duplicate files in directory list. Return directory """Find duplicate files in directory list. Return directory
with keys equal to file hash value and value as list of with keys equal to file hash value and value as list of
@ -98,25 +129,24 @@ def file_dups(topdirs=['./'], hashalg='md5', block_size=4096, verbose=False):
""" """
dups = defaultdict(list) dups = defaultdict(list)
if verbose: if verbose:
print('counting...', end='') print('gathering and counting files...', end='')
sys.stdout.flush() sys.stdout.flush()
count = 0 count, files = _gather_file_list(topdirs)
for topdir in topdirs:
for _, _, filenames in os.walk(topdir):
count += len(filenames)
current = 1 current = 1
print(count) print(count)
for topdir in topdirs: _files_iter = partial(_files_iter_from_list, files)
for dirpath, dirnames, filenames in os.walk(topdir): else:
for fname in filenames: _files_iter = partial(_files_iter_from_disk, topdirs)
for fpath in _files_iter():
if verbose: if verbose:
print('\rprocessing file {0}/{1}'.format(current, count), print('\rprocessing file {0}/{1}'.format(current, count),
end='') end='')
sys.stdout.flush() sys.stdout.flush()
current += 1 current += 1
fpath = os.path.join(dirpath, fname)
hexmd = _filehash(fpath, hashalg, block_size) hexmd = _filehash(fpath, hashalg, block_size)
dups[hexmd].append(fpath) dups[hexmd].append(fpath)
if verbose: if verbose:
print('') print('')
result = {} result = {}