cache file paths when counting in verbose mode
parent bd88491cb0
commit 7aaeb3e98f
1 changed file with 46 additions and 16 deletions
@@ -59,6 +59,7 @@ import sys
 import hashlib
 import os
 from collections import defaultdict
+from functools import partial


 # some differences in python versions
@@ -91,6 +92,36 @@ def _uniq_list(list_):
     return result


+def _gather_file_list(dirs):
+    '''Gather file paths in directory list dirs.
+    Return tuple (count, files) where count is files
+    list length and files is list of file paths in
+    specified directories.
+    '''
+    count = 0
+    files = []
+    for dir_ in dirs:
+        for dirpath, dirnames, filenames in os.walk(dir_):
+            count += len(filenames)
+            files += [os.path.join(dirpath, fname) for fname in filenames]
+    return (count, files)
+
+
+# iter through file paths in files list
+def _files_iter_from_list(files):
+    for fpath in files:
+        yield fpath
+
+
+# iter through file paths by os.walking
+def _files_iter_from_disk(topdirs):
+    for topdir in topdirs:
+        for dirpath, dirnames, filenames in os.walk(topdir):
+            for fname in filenames:
+                fpath = os.path.join(dirpath, fname)
+                yield fpath
+
+
 def file_dups(topdirs=['./'], hashalg='md5', block_size=4096, verbose=False):
     """Find duplicate files in directory list. Return directory
     with keys equal to file hash value and value as list of
@@ -98,25 +129,24 @@ def file_dups(topdirs=['./'], hashalg='md5', block_size=4096, verbose=False):
     """
     dups = defaultdict(list)
     if verbose:
-        print('counting...', end='')
+        print('gathering and counting files...', end='')
         sys.stdout.flush()
-        count = 0
-        for topdir in topdirs:
-            for _, _, filenames in os.walk(topdir):
-                count += len(filenames)
+        count, files = _gather_file_list(topdirs)
         current = 1
         print(count)
-    for topdir in topdirs:
-        for dirpath, dirnames, filenames in os.walk(topdir):
-            for fname in filenames:
-                if verbose:
-                    print('\rprocessing file {0}/{1}'.format(current, count),
-                          end='')
-                    sys.stdout.flush()
-                    current += 1
-                fpath = os.path.join(dirpath, fname)
-                hexmd = _filehash(fpath, hashalg, block_size)
-                dups[hexmd].append(fpath)
+        _files_iter = partial(_files_iter_from_list, files)
+    else:
+        _files_iter = partial(_files_iter_from_disk, topdirs)
+
+    for fpath in _files_iter():
+        if verbose:
+            print('\rprocessing file {0}/{1}'.format(current, count),
+                  end='')
+            sys.stdout.flush()
+            current += 1
+        hexmd = _filehash(fpath, hashalg, block_size)
+        dups[hexmd].append(fpath)
+
     if verbose:
         print('')
     result = {}
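A minimal usage sketch of the patched function, for context only (not part of the commit): the module name filedups and the example directories are assumptions made for illustration; only file_dups and its parameters come from the diff above.

# a minimal sketch, assuming the patched module is importable as 'filedups'
# (hypothetical name) and that 'photos' and 'backup' are existing directories
from filedups import file_dups

dups = file_dups(topdirs=['photos', 'backup'], hashalg='md5',
                 block_size=4096, verbose=True)
# with verbose=True the new code walks each tree once via _gather_file_list,
# prints the total, and hashes the cached paths instead of calling os.walk
# a second time
for hexdigest, paths in dups.items():
    print(hexdigest, paths)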