cache file paths when counting in verbose mode

2014-02-06 08:07:06 +01:00 · 2014-02-06 08:07:06 +01:00 · 7aaeb3e98f
commit 7aaeb3e98f
parent bd88491cb0
1 changed files with 46 additions and 16 deletions
--- a/sweeper/sweeper.py
+++ b/sweeper/sweeper.py
@ -59,6 +59,7 @@ import sys
 import hashlib
 import os
 from collections import defaultdict
 from functools import partial
 # some differences in python versions
@ -91,6 +92,36 @@ def _uniq_list(list_):
    return result
 def _gather_file_list(dirs):
    '''Gather file paths in directory list dirs.
       Return tuple (count, files) where count is files
       list length and files is list of file paths in
       specified directories.
    '''
    count = 0
    files = []
    for dir_ in dirs:
        for dirpath, dirnames, filenames in os.walk(dir_):
            count += len(filenames)
            files += [os.path.join(dirpath, fname) for fname in filenames]
    return (count, files)
 # iter through file paths in files list
 def _files_iter_from_list(files):
    for fpath in files:
        yield fpath
 # iter through file paths by os.walking
 def _files_iter_from_disk(topdirs):
    for topdir in topdirs:
        for dirpath, dirnames, filenames in os.walk(topdir):
            for fname in filenames:
                fpath = os.path.join(dirpath, fname)
                yield fpath
 def file_dups(topdirs=['./'], hashalg='md5', block_size=4096, verbose=False):
    """Find duplicate files in directory list. Return directory
       with keys equal to file hash value and value as list of
@ -98,25 +129,24 @@ def file_dups(topdirs=['./'], hashalg='md5', block_size=4096, verbose=False):
    """
    dups = defaultdict(list)
    if verbose:
-        print('counting...', end='')
+        print('gathering and counting files...', end='')
        sys.stdout.flush()
-        count = 0
+        count, files = _gather_file_list(topdirs)
        for topdir in topdirs:
            for _, _, filenames in os.walk(topdir):
                count += len(filenames)
        current = 1
        print(count)
-    for topdir in topdirs:
+        _files_iter = partial(_files_iter_from_list, files)
-        for dirpath, dirnames, filenames in os.walk(topdir):
+    else:
-            for fname in filenames:
+        _files_iter = partial(_files_iter_from_disk, topdirs)
    for fpath in _files_iter():
        if verbose:
            print('\rprocessing file {0}/{1}'.format(current, count),
                  end='')
            sys.stdout.flush()
            current += 1
                fpath = os.path.join(dirpath, fname)
        hexmd = _filehash(fpath, hashalg, block_size)
        dups[hexmd].append(fpath)
    if verbose:
        print('')
    result = {}