Added comments.

darko-poljak 2014-01-27 07:58:10 +01:00
parent 660d6bd342
commit c5fdf029e2


@@ -34,6 +34,7 @@ import os
 from collections import defaultdict
+# some differences in python versions
 if sys.version_info[0] == 3:
     def _do_encode(buf):
         return buf
@@ -49,6 +50,10 @@ else:
 def _filehash(filepath, hashalg, block_size):
+    """Calculate secure hash for given file content using
+    specified hash algorithm. Read file content in blocks
+    of block_size bytes.
+    """
     md = hashlib.new(hashalg)
     with open(filepath, "rb") as f:
         for buf in iter(lambda: f.read(block_size), b''):
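For context, the block-wise hashing pattern used here can be sketched standalone; 'example.txt' and the md5/4096 choices below are illustrative, not taken from the commit:

import hashlib

# Hash a file in fixed-size blocks so large files never load fully into memory.
md = hashlib.new('md5')
with open('example.txt', 'rb') as f:
    for buf in iter(lambda: f.read(4096), b''):
        md.update(buf)
print(md.hexdigest())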
@@ -57,6 +62,10 @@ def _filehash(filepath, hashalg, block_size):
 def file_dups(topdirs=['./'], hashalg='md5', block_size=4096):
+    """Find duplicate files in directory list. Return dictionary
+    with keys equal to file hash value and values as lists of
+    file paths whose content is the same.
+    """
     dups = defaultdict(list)
     for topdir in topdirs:
         for dirpath, dirnames, filenames in os.walk(topdir):
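A usage sketch for file_dups; the module name 'sweeper' and the './photos' path are assumptions, not taken from this diff:

from sweeper import file_dups

# Map each content hash to the list of files carrying it; entries with
# more than one path are duplicates.
dups = file_dups(topdirs=['./photos'], hashalg='md5', block_size=4096)
for hashval, paths in dups.items():
    if len(paths) > 1:
        print(hashval, paths)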
@@ -69,12 +78,18 @@ def file_dups(topdirs=['./'], hashalg='md5', block_size=4096):
 def rm_file_dups(topdirs=['./'], hashalg='md5', block_size=4096):
+    """Remove duplicate files found in specified directory list.
+    First file in list is kept.
+    """
     for files in do_with_file_dups(topdirs, hashalg, block_size):
         # skip the first file so one copy survives, as the docstring states
         for f in files[1:]:
             os.remove(f)

 def mv_file_dups(topdirs=['./'], hashalg='md5', block_size=4096, dest_dir='dups'):
+    """Move duplicate files found in specified directory list.
+    First file in list is kept in the original directory.
+    """
     if not os.path.exists(dest_dir):
         os.mkdir(dest_dir)
     if not os.path.isdir(dest_dir):
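A sketch of moving duplicates aside rather than deleting them, again under the assumed 'sweeper' module name; 'dups' is the function's own default destination:

from sweeper import mv_file_dups

# Every copy after the first is moved into dest_dir; the first stays put.
mv_file_dups(topdirs=['./photos'], hashalg='md5', dest_dir='dups')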
@@ -87,12 +102,16 @@ def mv_file_dups(topdirs=['./'], hashalg='md5', block_size=4096, dest_dir='dups'
 def do_with_file_dups(topdirs=['./'], hashalg='md5', block_size=4096):
+    """Yield list of duplicate files when found in specified directory list.
+    """
     dups = file_dups(topdirs, hashalg, block_size)
     # values() works under Python 2 and 3; itervalues() exists only in Python 2
     for fpaths in dups.values():
         yield fpaths

 def main(args):
+    """Main when used as script. See usage (--help).
+    """
     import json
     topdirs = args['<directory>']
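do_with_file_dups lends itself to custom handling of each duplicate group; a minimal sketch under the same 'sweeper' import assumption:

from sweeper import do_with_file_dups

# Each yielded item is a list of paths whose contents hash identically.
for files in do_with_file_dups(topdirs=['./photos']):
    if len(files) > 1:  # groups of size one have no duplicates
        print('identical content:', files)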
@@ -119,6 +138,7 @@ def main(args):
         print('Invalid action "%s"' % action)

+# if used as script call main function
 if __name__ == '__main__':
     from docopt import docopt
     arguments = docopt(__doc__)
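The docopt pattern at the bottom is the standard one: the module docstring serves as the usage spec and docopt() parses the command line against it. A minimal illustration (the usage text here is invented, not the project's actual one):

"""Usage: sweeper.py [--hashalg=<alg>] [<directory>...]"""
from docopt import docopt

# docopt parses sys.argv against the usage string in __doc__
# and returns a dict of option and argument values.
arguments = docopt(__doc__)
print(arguments)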