#!/bin/sh
: ; exec klone $0 "$@"
; The above line finds the klone executable in the $PATH

;; TODO:
;; default: verbose + option -q quiet
;; generated script applies function parametrable (symlinks...)
;; able to specify generated files names 
;; print list of duplicates in a file, sorted by date one per line, 
;;     separated by blank lines
;; print total bytes duplicated
;; priority system for remove script: wildcards on names, regexp on paths

(setq args (getopts "USAGE: %0 [options] dirs..
finds duplicated names in dirs (files with same contents)
Uses the external program \"cksum\"
If no arg, recurses in current dir
Prints all duplicates on the same line
Creates into /tmp/remove-duplicates-$USER a shell script to remove the
duplicates, to be run manually afterwards optionally.
This script is very robust and can process large numbers of files"
    ("-q" () quote-results "quote results when printing")
    ("-i" () ignore-dir-errors "ignore \"cannot go into dir\" errors, otherwise aborts")
    ("-f" () dont-compare "fast: dont compare file, just compares size & checksum")
    ("-v" () verbose "verbose operation")
))

;; principe of operation:
;; 1st pass: recurses on all files. Stores in a hash table indexed on
;;     checksums a list of vectors describing a file:
;;     [name dir size]
;;     On each checksum is a list of list of equal files
;; 2nd pass: for each same-checksum group, find duplicates and prints them

(setq re-cksum (regcomp "^[ \t]*([0-9]+)[ \t]+([0-9]+)[ \t]"))
(setq cksum-ht (Hashtable ()))
(setq resform (if quote-results "%r0 " "%n0 "))
(setq max-command-line-length 60000)
(setq max-arguments 2000)

(defun do-file-or-dir (file dir filelist)
  (catch 'ERROR			;errors abort only current file
    (setq ftype (file-type file))
    (if (= 'file ftype)
      (lappend filelist file)		;deferred treatment
      (= 'directory ftype)
      (do-dir file dir)
)))

(defun do-dir (dir parent &aux
    (*current-directory* ".")
    (filelist (list))
  )
  (verbose? "===> entering directory %0" *current-directory*)
  (catch 'Done
    (if (not (trap-error 'ALL (setq *current-directory* dir)))
      (if ignore-dir-errors
	(throw 'Done (print-format *standard-error* 
	    "Warning: cannot go into dir %1%0 (ignored)\n" dir parent)
	)
	(fatal-error 1 "ERROR: Cannot go into dir %1/%0\n" dir parent)
    ))
    (dolist (file (directory))
      (do-file-or-dir file (+ parent "/" dir) filelist)
    )
    (if filelist (do-files filelist dir)) ;call once cksum on all the files 
  )
)

(defun do-files (filelist dir &aux out line comlen)
  (catch 'Done
    (when (> (length filelist) max-arguments) 
      (do-files-aux (subseq filelist 0 max-arguments) dir)
      (do-files (subseq filelist max-arguments) dir)
      (throw 'Done)
    )
    (with (i 0)
      (setq comlen (length "cksum"))
      (dolist (file filelist)
	(incf comlen (+ 1 (length file)))
	(if (> comlen max-command-line-length)
	  (progn
	    (do-files-aux (subseq filelist 0 i) dir)
	    (do-files (subseq filelist i) dir)
	    (throw 'Done)
	  )
	  (incf i)
      ))
      (do-files-aux filelist dir)
)))

(defun do-files-aux (filelist dir &aux out line)
  (incf nfiles (length filelist))
  (verbose? "   | cksumming %0 files of %1" (length filelist) 
    *current-directory*)
  (system (+ '("cksum") filelist) :output 'out)
  (select out)		;wait for stream to be ready
  (catch 'EOF
    (dolist (file filelist)
      (setq line (read-line out))
      (if (and (regexec re-cksum line)
	  (= (substring-ptr line #[re-cksum 0 1]) file)
	)
	(do-file file dir (regsub re-cksum 0) (Int (regsub re-cksum 1)))
	(fatal-error 1 "ERROR: cksum failed on %1/%0, aborting\nI cannot parse cksum result:\n    %2\n" file dir line)
  )))
  (close out)
)

(defun do-file (file dir checksum size &aux 
    old
    (files (get cksum-ht checksum (list)))
  ) 
  (if verbose
    (if files (progn
	(print-format "   |> file %0 has same checksum as:" file)
	(dolist (fl files)
	  (dolist (f fl)
	    (print-format " %0" (reldir curdir (+ (1 f) (0 f))))
	))
	(print-format "\n")
  )))
  (catch 'EQUAL
    (dolist (ofile files)
      (if (and 
	  (setq old (getn ofile 0)) ;first of the equivalent files
	  (= size #[old 2])
	  (or dont-compare
	    (= 0 (wait (system (list cmp file (+ #[old 1] "/" #[old 0]))
		  :output "/dev/null" :error "/dev/null"))
	)))
	;; file is equal! add it to the current list
	(throw 'EQUAL
	  (verbose? "   # duplicate: %0 of: %1/%2" 
	    file #[old 1] #[old 0])
	  (lappend ofile (vector file *current-directory* size))
	)
	;; file is not equal, continue...
    ))
    ;; we didnt find it, create a new list
    (put cksum-ht checksum (+ files 
	(list (list (vector file *current-directory* size)))
    ))
))

(if (not args) (setq args '(".")))

;; finds an executable in PATH. returns it (string) if found, or
;; nil if not found

(defun find-in-PATH (name &key not-found &aux 
    (path (getenv "PATH"))
    (start 0)
    (end 0)
    dir
    file
    mode
  )
  (catch 'FOUND
    (while (setq end (seek path #\: start))
      (if (= start end) (setq dir ".")
	(setq dir (subseq path start end))
      )
      (setq file (+ dir (if (= #\/ (getn dir -1)) "" "/") name))
      (setq mode (getn (file-stats file) 'mode))
      (if (and mode
	  (= S_IFREG (logand S_IFMT mode))
	  (logand S_IOEXEC mode)
	)
	(throw 'FOUND file)
      )
      (setq start (+ end 1))
    )      
    ;; not found case
    (if not-found
      not-found
      (progn (print-format *standard-error*
	  "ERROR: %0 not found in PATH, aborting!\n" name)
	(exit 1)
))))

;; print only relative path if possible

(defun reldir (pwd-re path)
  (if (regexec pwd-re path)
    (regsub pwd-re 1)
    path
  )
)

(defun main (&aux
    (curdir (regcomp (+ "^" (quote-string-for-regexp *current-directory*) 
	  "[/](.+)$")))
  )
  (setq cksum (find-in-PATH "cksum"))
  (setq cmp (find-in-PATH "cmp"))
  (setq duplicates 0)
  (setq entries 0)
  (setq check-coll 0)
  (setq nfiles 0)
  (setq scriptname (+ "/tmp/remove-duplicates-" (getenv "USER")))
  (setq script-fd (open scriptname :direction :output :if-exists :supersede))
  (PF script-fd "#!/bin/sh\n")

  ;; 1st pass: collect info
  (dolist (rootdir args)
    (with (*current-directory* rootdir filelist (list))
      (dolist (file (directory))
	(do-file-or-dir file rootdir filelist)
      )
      (if filelist (do-files filelist rootdir))
  ))

  ;; 2nd pass: print results
  (dohash (cks files cksum-ht)
    (incf check-coll (- (length files) 1))
    (dolist (file files)
      (if (> (length file) 1) (progn
	  (setq first t)
	  ;; to print: sort longest names first, and quote chars if needed
	  (sort file (lambda (i1 i2)
	      (compare (- (length #[i1 0])) (- (length #[i2 0])))
	  ))
	  (dolist (instance file)
	    (print-format resform 
	      (reldir curdir (+ #[instance 1] "/" #[instance 0]))
	    )
	    (if first (progn
		(PF script-fd "\n### Removing duplicates of: %r0\n" 
		  (+ #[instance 1] "/" #[instance 0])
		) 
		(incf entries)
		(setq first ())
	      ) (progn
		(PF script-fd "rm -f -- %r0\n"
		  (+ #[instance 1] "/" #[instance 0]))
		(incf duplicates)
	      )
	  ))
	  (print-format "\n")
  ))))

  (verbose? "SUMMARY: found %0 duplicates for %1 files" duplicates entries)
  (verbose? "         there was %0 checksum collisions for %1 files"
    check-coll nfiles)
  (if (/= 0 entries)
    (verbose? "You can run the %0 script file to remove the duplicates" 
      scriptname)
  )
  (close script-fd)
  (sh chmod a+x ,scriptname)
)

(main)

;;; EMACS MODES
;;; Local Variables: ***
;;; mode:lisp ***
;;; End: ***
