#!/usr/bin/env python
# -*- coding: iso-8859-2 -*-
# $Date: 2008-11-08 23:08:41 $, $Revision: 1.17 $
#
# Simple utility to maintain a list of non-local files, stored
# for example on CDs or DVDs; it makes it easy to find which files
# have already been archived (burned).
# 
# Author: Wojciech Muła
# e-mail: wojciech_mula@poczta.onet.pl
# www:    http://0x80.pl
#
# License: BSD
#

import os
import sys
import glob
import time
import md5

from os import walk, makedirs

from os.path import exists, isdir, getsize, getctime, getmtime
from os.path import join, normpath, abspath, dirname
from time    import strftime, localtime

# Short usage summary.  main() replaces every "progname" with the real
# program name and prints the text (through die()) when the arguments
# are missing or the action is unknown.
usage_text = """
usage:
	progname scan-dir [!] dbname directory_path
	progname update       dbname directory_path

	progname find-dups    dbnames paths
	progname list-dups    dbnames paths
	progname list-unique  dbnames paths

	progname dump-details dbnames
	progname dump dbnames
	progname size dbnames

	progname list-db
	progname clear-md5-cache
	progname help action_name
"""

# Detailed help texts printed by "progname help action_name", keyed by
# action name.  main() replaces every "progname" with the real program
# name before printing.  A trailing backslash inside a text suppresses
# the final newline of that text.
help = {
"scan-dir": \
"""
progname scan-dir [!] dbname directory_path

Create listing of directory_path named dbname.
Following informations are stored:
* path
* file size
* creation date & time
* md5 sum

If dbname exists, it won't be overwritten.  Put "!" after
action name, i.e. "progname scan-dir ! ...", to turn off
protection.\
""",

"update": \
"""
progname update dbname directory_path

Update database (if new file, if del file, if file
size change).
""",

"list-db": \
"""
progname list-db

Print names of all existing databases.
Each name is printed in new line.\
""",

"size":\
"""
progname size dbnames

Print total size of all files listed in given database(s).
""",

"dump":\
"""
progname dump dbnames

Print all paths stored in given database(s).  If more
then one database is used, then path listing is preceded
by "*** database_name" line.

If dbnames is "-", then all existing databases are used.
Many names may be passed as separate shell arguments
or as a single shell argument, where elements are
separated with comma.\
""",

"dump-details":\
"""
progname dump-details dbnames

Print all data stored in given database(s).  If more
then one database is used, then listing is preceded
by "*** database_name" line.

If dbnames is "-", then all existing databases are used.
Many names may be passed as separate shell arguments
or as a single shell argument, where elements are
separated with comma.\
""",

"find-dups":\
"""
progname find-dups dbnames paths

Find files that are already listed in database(s).
This option print detailed information, see also
"progname help list-dups".

Files are given as list of paths; path may point regular
file or directory.  All files from directory and its
subdirectories are considered.

Databases are given as list of comma-separated names.
If dbnames is "-", then all existing databases are used.

Syntax of report is simple:

Full path of duplicated file from 'paths' list
* database name (where duplicate were located)
** path in database

Of course one file may occur many times in many databases
and many times in one database.

Example:

$ progname find-dups mailarch.2007,backup.2007 /home/user/mail/
/home/user/mail/readed-01.2007
* mailarch.2007
** backups/readed-01.2007
* backup.2007
** mail/readed-01.2007
/home/user/mail/readed-02.2007
* mailarch.2007
** backups/readed-02.2007
/home/user/mail/readed-03.2007
* mailarch.2007
** backups/readed-03.2007
""",

"list-dups":\
"""
progname list-dups dbnames paths

Find files that are already listed in database(s).

Acts like find-dups (read "progname help find-dups" for details)
but prints only full path of duplicates from 'paths' list.

$ progname list-dups mailarch.2007 /home/user/mail/
/home/user/mail/readed-01.2007
/home/user/mail/readed-02.2007
/home/user/mail/readed-03.2007\
""",

"list-unique":\
"""
progname list-unique dbnames paths

Find files that are not present in any listed database(s).
It is complement for list-dups option (see "progname help list-dups")

Files are given as list of paths; path may point regular
file or directory.  All files from directory and its
subdirectories are considered.

Databases are given as list of comma-separated names.
If dbnames is "-", then all existing databases are used.\
""",

"clear-md5-cache":\
"""
progname clear-md5-cache

Clear md5 cache.

Program progname utilize md5 sums to find duplicates.
Because calculating of such sums are not very fast,
md5 sums of processed files are cached, i.e. saved
somewhere in subdirectory of CDDB_PATH.  With this
option you can clear this subdirectory.
"""
}

class GlobalOptions:
	"""Plain attribute container for program-wide settings.

	main() stores ``db_directory`` and ``md5sum_cache_path`` on the
	shared instance created below.
	"""
	pass

# The single settings object read by the helper functions in this file.
options = GlobalOptions()

def main():
	"""Command-line entry point: parse sys.argv and run the requested action.

	The database directory is taken from the CDDB_PATH environment
	variable and is created when it does not exist yet.  The first
	argument selects the action (see usage_text); the remaining ones are
	consumed by that action.  Invalid or missing arguments end the
	program through die() or usage(), i.e. with exit status 1.
	"""
	env_var = "CDDB_PATH"
	if env_var not in os.environ:
		die("Please set environment variable", env_var, "- path of main database directory.")

	options.db_directory = os.path.expanduser(os.environ[env_var])
	if not exists(options.db_directory):
		warning(options.db_directory, "doesn't exists, it will be created.")
		makedirs(options.db_directory)

	options.md5sum_cache_path = normpath(join(options.db_directory, ".md5cache"))

	# args is sys.argv itself (not a copy): popping consumes the real argv
	args = sys.argv
	progname = os.path.basename(args.pop(0))

	def usage():
		die(usage_text.replace("progname", progname))

	def pop_arg(default=None):
		# next argument, or default; without a default a missing argument
		# prints the usage text and exits
		try:
			return args.pop(0)
		except IndexError:
			if default is not None:
				return default
			usage()

	def say(*parts):
		# one output line on stdout, items separated by single spaces
		# (same result as a print statement, valid on any Python version)
		sys.stdout.write(join_args(*parts) + "\n")

	def dbnames_from_args():
		# union of the databases named by all remaining arguments
		names = set()
		for arg in args:
			names.update(parse_dbnames(arg))
		if not names:
			die("No databases specified (or no database exists)")
		return names

	def for_each_db(names, handler):
		# a single database is shown bare, several get a "*** name"
		# header line each
		if len(names) == 1:
			handler(names.pop())
		else:
			for dbname in sorted(names):
				say("***", dbname)
				handler(dbname)

	def pop_directory(action_name):
		# the one directory argument of scan-dir/update; extra arguments
		# are reported and ignored
		if len(args) == 0:
			usage()
		if len(args) > 1:
			warning(action_name, "require just one path, other arguments ignored")
		path = args.pop(0)
		if not isdir(path):
			die(path, "is not a directory")
		return path

	action = pop_arg().lower()

	if action == "python":
		# easter egg: importing "this" prints the Zen of Python
		import this

	elif action == "help":
		if not args:
			die("Action name needed, pick one of:", ", ".join(sorted(help)))

		action_name = pop_arg()
		try:
			text = help[action_name]
		except KeyError:
			die(action_name, "is not valid, try one of:", ", ".join(sorted(help)))
		say(text.replace("progname", progname))

	elif action == "list-db":
		say("\n".join(sorted(get_all_dbnames())))

	elif action == "size":
		for dbname in sorted(dbnames_from_args()):
			# details[1] is the size column of a database entry
			size = sum(int(details[1]) for (details, _) in iter_db_entries(dbname))
			if size > 1024*1024:
				say("%s: %0.1fMiB (%d)" % (dbname, size/(1024.0*1024), size))
			elif size > 1024:
				say("%s: %0.2fkiB (%d)" % (dbname, size/(1024.0), size))
			else:
				say("%s: %dB (%d)" % (dbname, size, size))

	elif action == "dump":
		for_each_db(dbnames_from_args(), dump_db)

	elif action == "dump-details":
		def send2stdout(dbname):
			# copy the raw database file to stdout in 8 KiB chunks
			filename = join(options.db_directory, dbname) + ".ls"
			infile = open(filename, "r")
			try:
				while True:
					buf = infile.read(8192)
					if not buf:
						break
					sys.stdout.write(buf)
				sys.stdout.flush()
			finally:
				# close the database even when writing to stdout fails
				infile.close()

		for_each_db(dbnames_from_args(), send2stdout)

	elif action == "scan-dir":
		tmp = pop_arg()
		if tmp == "!":
			overwrite = True
			dbname = pop_arg()
		else:
			overwrite = False
			dbname = tmp

		if not overwrite and dbname in get_all_dbnames():
			die("Database", dbname, "already exists, call program with 'scan-dir !' action to overwrite db.")

		scan_dir(pop_directory("scan-dir"), dbname)

	elif action == "update":
		dbname = pop_arg()
		if dbname not in get_all_dbnames():
			die("Database", dbname, "not exists, create this first (use scan-dir)")

		# the warning used to name "scan-dir" here (copy-paste slip)
		update(pop_directory("update"), dbname)

	elif action in ["find-dups", "list-dups", "list-unique"]:
		dbnames = parse_dbnames(pop_arg())
		if not dbnames:
			die("No databases specified (or no database exists).")

		paths = args
		if not paths:
			warning("No paths specified, defaults to current directory.")
			paths = ["."]

		all_dbs = {}
		all_sizes = set()
		for dbname in dbnames:
			db, sizes = load_db(dbname)
			all_dbs[dbname] = db
			all_sizes.update(sizes)

		if action == "list-unique":
			for path in iter_paths(paths, True):
				# locate() returns None for unreadable files; those are
				# reported by locate() itself and not listed as unique
				if getsize(path) not in all_sizes or locate(path, all_dbs) == []:
					say(path)
		else:
			for path in iter_paths(paths):
				if getsize(path) not in all_sizes:
					continue # there is no file with such size

				locations = locate(path, all_dbs)
				if not locations:
					continue # file not found, or can't be read

				say(path)
				if action == "find-dups":
					for dbname, stored_paths in locations:
						say("*", dbname)
						for stored in stored_paths:
							say("**", stored)

	elif action == "clear-md5-cache":
		# bottom-up walk: a directory is empty by the time it is removed
		for root, dirs, files in walk(options.md5sum_cache_path, topdown=False):
			for name in files:
				os.remove(join(root, name))

			for name in dirs:
				os.rmdir(join(root, name))

	else:
		usage()
#main
		
			

def join_args(*args):
	"Convert every argument with str() and glue the pieces with single spaces"
	pieces = [str(item) for item in args]
	return " ".join(pieces)


def warning(*args):
	"Write the space-joined arguments as one line to standard error"
	message = join_args(*args)
	sys.stderr.write("%s\n" % message)


def die(*args):
	"Write the space-joined arguments to standard error, then exit with status 1"
	message = join_args(*args)
	sys.stderr.write("%s\n" % message)
	raise SystemExit(1)


last_msg_len = 0
def status(*args):
	"""Show a progress message on stdout that the next call overwrites.

	Does nothing unless stdout is a terminal.  The message ends with a
	carriage return instead of a newline, so the following message is
	drawn over it.  The special message "\\n" ends the current status
	line.
	"""
	global last_msg_len

	out = sys.stdout
	if not out.isatty():
		return

	text = join_args(*args)
	if text == "\n":
		out.write("\n")
		last_msg_len = 0
		return

	# pad with blanks up to the previous length so that no tail of a
	# longer previous message stays visible
	out.write(text.ljust(last_msg_len))
	out.write("\r")
	out.flush()

	last_msg_len = len(text)


def iter_paths(paths, sort_files=False):
	"""Yield the path of every existing file named by, or found below, paths.

	paths      -- iterable of file and directory paths; directories are
	              walked recursively
	sort_files -- when true, the files of each directory are yielded in
	              sorted order and subdirectories are visited in sorted
	              order

	A path from the list that does not exist is reported with warning()
	and skipped.  Broken symbolic links found while walking a directory
	are skipped silently, like scan_dir() does.
	"""
	for path in paths:
		if not exists(path):
			warning(path, "doesn't exists or is a broken link.")
		elif isdir(path):
			for root, dirs, files in walk(path):
				if sort_files:
					files.sort()
					# sorting dirs in place also decides the order in
					# which walk() descends
					dirs.sort()

				for name in files:
					full_path = join(root, name)
					# walk() lists broken symlinks among the files, but
					# callers call getsize() on every yielded path,
					# which raises OSError for them
					if exists(full_path):
						yield full_path
		else:
			yield path
	


def parse_dbnames(string):
	"""Turn a database specification into a set of existing database names.

	"-" or "--" selects every existing database.  Anything else is read
	as a comma-separated list of names; a name without a database is
	reported with warning() and left out.
	"""
	known = get_all_dbnames()
	if string in ["-", "--"]:
		return known

	selected = set()
	for name in string.split(","):
		if name not in known:
			warning("Database", name, "doesn't exists")
			continue
		selected.add(name)

	return selected


def get_all_dbnames():
	"""Return the set of database names found in options.db_directory.

	A database is any existing "*.ls" entry of that directory; its name
	is the file name without the ".ls" extension.
	"""
	extension = ".ls"
	pattern = join(normpath(options.db_directory), "*" + extension)
	db_list = set()

	for filename in glob.glob(pattern):
		if not exists(filename):
			continue # broken link

		# Take the name from the basename rather than slicing off
		# len(directory)+1 characters: that slice eats the first letter
		# of every name when the normalized directory already ends with
		# a separator (e.g. "/").
		db_list.add(os.path.basename(filename)[:-len(extension)])

	return db_list


def scan_dir(rootdir, dbname):
	"""Write a fresh listing of rootdir to database dbname.

	The database file (options.db_directory/dbname.ls) is overwritten.
	Every directory gets a "./relative/dir:" header line followed by one
	"md5 size date time filename" line per file.  The date and time come
	from os.path.getctime().  Broken symbolic links are skipped.  When a
	file can't be read, a warning is printed and 32 zeros are stored in
	place of its MD5 sum.

	rootdir -- directory to scan (recursively)
	dbname  -- database name, without the ".ls" extension
	"""
	outfile_path = join(options.db_directory, dbname) + ".ls"
	outfile = open(outfile_path, "w")
	try:
		rootdir = normpath(rootdir)
		# length of the "rootdir/" prefix, cut off to get relative paths
		l = len(rootdir) + 1

		# first pass: collect directories, so that the total number of
		# files is known before progress is shown in percents
		directories = []
		totalfiles  = 0

		status("\n")
		for root, dirs, files in walk(rootdir):
			status("Scanning (%s):" % totalfiles, root[l:])
			directories.append( (root, files[:]) )
			totalfiles = totalfiles + len(files)

		# second pass: gather info about all files and save it
		n = 0

		for root, files in directories:
			outfile.write("\n./%s:\n" % root[l:])
			for name in files:
				path = join(root, name)
				if not exists(path): # broken symlink
					continue

				n = n + 1
				status("%4.1f%% %s" % ((100.0*n)/totalfiles, path[l:]))

				ctime = strftime("%d.%m.%Y %H:%M:%S", localtime(getctime(path)))
				size  = getsize(path)
				try:
					csum = md5sum(path, True)
				except IOError:
					# sys.exc_info() rather than "except IOError, err",
					# which is a syntax error on newer Python versions
					err = sys.exc_info()[1]
					warning("Can't calc MD5 sum for", path, "-", err.strerror)
					csum = "0"*32

				outfile.write("%s %10s %s %s\n" % (csum, size, ctime, name))

		status("\n")
	finally:
		# close the database even when the scan is interrupted
		outfile.close()


def iter_db_entries(dbname):
	"""Yield (fields, path) for every file listed in database dbname.

	fields -- tuple (md5, size, date, time, filename); every item is a
	          string as stored in the database
	path   -- filename joined with the directory taken from the last
	          "./dirname:" header line seen before the entry

	Calls die() with the line number when an entry line does not consist
	of these five fields.
	"""
	infile_path = join(options.db_directory, dbname) + ".ls"

	# Read the listing up front: the file gets closed even if the caller
	# stops iterating early or die() ends the program.
	infile = open(infile_path, "r")
	try:
		lines = infile.readlines()
	finally:
		infile.close()

	current_dir = ""
	for n, line in enumerate(lines):
		line = line.strip()
		if not line:
			continue

		if line.startswith("./") and line[-1] == ":": # ./dirname:
			current_dir = line[2:-1]
			continue

		# md5 size date time filename
		#
		# Entries are written as "%s %10s %s %s": exactly one space
		# follows each field and only the size is left-padded.  Splitting
		# on those spaces (instead of cutting at column 64) keeps working
		# when a size has more than 10 digits, and still preserves
		# leading blanks of the file name.
		try:
			csum, rest  = line.split(" ", 1)
			size, rest  = rest.lstrip(" ").split(" ", 1)
			sdate, rest = rest.split(" ", 1)
			stime, name = rest.split(" ", 1)
		except ValueError:
			die("Wrong file format, error in line", n+1)

		yield (csum, size, sdate, stime, name), join(current_dir, name)


def update(rootdir, dbname):
	"""Rewrite database dbname so that it matches the contents of rootdir.

	The listing is regenerated like scan_dir() does, but for a file whose
	relative path is already in the database with the same size, the
	stored MD5 sum, date and time are reused instead of being recomputed.
	Files that no longer exist drop out of the listing.

	rootdir -- directory to scan (recursively)
	dbname  -- name of an existing database, without the ".ls" extension
	"""
	# load what the database currently knows, keyed by relative path
	existing_data = {}
	for (csum, size, sdate, stime, _), path in iter_db_entries(dbname):
		existing_data[path] = (csum, size, sdate, stime)

	outfile_path = join(options.db_directory, dbname) + ".ls"
	# NOTE: opening with "w" truncates the old listing right away
	outfile = open(outfile_path, "w")
	try:
		rootdir = normpath(rootdir)
		# length of the "rootdir/" prefix, cut off to get relative paths
		l = len(rootdir) + 1

		# first pass: collect directories, so that the total number of
		# files is known before progress is shown in percents
		directories = []
		totalfiles  = 0

		status("\n")
		for root, dirs, files in walk(rootdir):
			status("Scanning (%s):" % totalfiles, root[l:])
			directories.append( (root, files[:]) )
			totalfiles = totalfiles + len(files)

		# second pass: gather info about all files and save it
		n = 0

		for root, files in directories:
			outfile.write("\n./%s:\n" % root[l:])
			for name in files:
				path = join(root, name)
				if not exists(path): # broken symlink
					continue

				n = n + 1
				status("%4.1f%% %s" % ((100.0*n)/totalfiles, path[l:]))
				size = getsize(path)

				# unchanged size: keep the stored entry as it is
				known = existing_data.get(path[l:])
				if known is not None and int(known[1]) == size:
					outfile.write("%s %10s %s %s %s\n" % (known + (name,)))
					continue

				ctime = strftime("%d.%m.%Y %H:%M:%S", localtime(getctime(path)))
				try:
					csum = md5sum(path, True)
				except IOError:
					# sys.exc_info() rather than "except IOError, err",
					# which is a syntax error on newer Python versions
					err = sys.exc_info()[1]
					warning("Can't calc MD5 sum for", path, "-", err.strerror)
					csum = "0"*32

				outfile.write("%s %10s %s %s\n" % (csum, size, ctime, name))

		status("\n")
	finally:
		# close the database even when the scan is interrupted
		outfile.close()


def dump_db(dbname):
	"Write every path stored in database dbname to stdout, one per line"
	write = sys.stdout.write
	for entry in iter_db_entries(dbname):
		# entry is (fields, path); only the path is shown
		write("%s\n" % entry[1])


def load_db(dbname):
	"""Load database dbname into a lookup structure.

	Returns the pair (db, sizes):
	db    -- dictionary: size (int) -> dictionary: md5 sum -> list of
	         paths having that size and sum
	sizes -- set of all file sizes (int) present in the database
	"""
	by_size = {}

	for fields, path in iter_db_entries(dbname):
		# fields is (md5, size, date, time, filename)
		csum = fields[0]
		size = int(fields[1])
		by_size.setdefault(size, {}).setdefault(csum, []).append(path)

	# every size seen became a key above
	return by_size, set(by_size)


def locate(path, dbs):
	"""Find the databases that list a file with the size and MD5 sum of path.

	path -- file to look for
	dbs  -- dictionary: database name -> lookup dictionary in the form
	        returned by load_db() (size -> md5 sum -> list of paths)

	Returns a list of (dbname, paths) pairs, one per database that has a
	match; the list is empty when there is none.  Returns None, after
	printing a warning, when the MD5 sum can't be obtained because of an
	IOError.
	"""
	try:
		csum = md5sum(path)
	except IOError:
		# sys.exc_info() rather than "except IOError, err", which is a
		# syntax error on newer Python versions
		err = sys.exc_info()[1]
		# name the file, like scan_dir() and update() do; the bare
		# strerror did not tell which file was the problem
		warning("Can't calc MD5 sum for", path, "-", err.strerror)
		return None

	size   = getsize(path)
	result = []
	for dbname, db in dbs.items():
		try:
			result.append( (dbname, db[size][csum]) )
		except KeyError:
			pass # this database has no such file

	return result


def md5sum(filename, nocache=False):
	"""Return the MD5 sum of the contents of filename as 32 hex digits.

	filename -- path of the file to sum
	nocache  -- when true, the sum is always computed and the cache is
	            neither read nor written

	With caching enabled the sum is kept in a file located at
	options.md5sum_cache_path followed by the absolute path of filename.
	A cached sum is used only if the cache file is not older than
	filename and holds exactly 32 characters; otherwise the sum is
	computed and the cache file is (re)written.

	Raises IOError when filename or the cache file can't be opened or
	read/written.
	"""
	try:
		from hashlib import md5 as new_md5
	except ImportError: # Python older than 2.5 has just the md5 module
		from md5 import new as new_md5

	filename = abspath(filename)

	cache = None
	if not nocache:
		# filename is absolute, so plain concatenation puts the cache
		# entry below the cache directory (assumes POSIX-style paths)
		cache = options.md5sum_cache_path + filename

		if exists(cache) and getmtime(filename) <= getmtime(cache):
			cached = open(cache, 'r')
			try:
				digest = cached.read()
			finally:
				cached.close()
			if len(digest) == 32:
				return digest
			# anything else is a damaged entry: recompute below

	bufsize = 8192
	hasher  = new_md5()
	infile  = open(filename, 'rb')
	try:
		while True:
			buf = infile.read(bufsize)
			if not buf:
				break
			hasher.update(buf)
	finally:
		# close the file even when reading fails half-way
		infile.close()

	digest = hasher.hexdigest()

	if cache is not None:
		cache_dir = dirname(cache)
		if not exists(cache_dir):
			makedirs(cache_dir)

		outfile = open(cache, "w")
		try:
			outfile.write(digest)
		finally:
			outfile.close()

	return digest


# Run the command-line interface only when the file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
	main()

# vim: ts=4 sw=4 nowrap noexpandtab

