#!/bin/sh
: ; exec klone $0 "$@"
; The above line finds the klone executable in the $PATH
(setq html t)
;(setq debug t)
(defun PD (&rest args) (if debug (apply print-format args)))
;(kdb t)					;(stack-dump-on-error t)
(load 'defstruct)
(load 'print-margin)
(load 'setqn)

;; send a mail for statistics of usage
(if (file-stats "/usr/lib/sendmail") (progn
    (system (list "/usr/lib/sendmail" "koala-inbox@sophia.inria.fr")
      :nohup t :input 'in :output "/dev/null" :error "/dev/null")
    (print-format in "Subject: koala-inbox.LOG\n\nLOG: k-archie%0\n"
      (with (res (copy ""))
	(dolist (arg *arguments*) (nconc res " " arg))
	res
      )
    )
    (flush in)
    (close in)
))
    
    

(if (or (< (length *arguments*) 2) (= #[*arguments* 1] "-?")
    (= #[*arguments* 1] "-help")
  ) (progn
    (print-format *standard-error* (+ "USAGE: k-archie [archie-options] string
k-archie launches an \"archie\" process on each known host, and prints a 
summary of the results in formatted ascii and HTML.
This is useful, since archie sites are often inconsistent or not accessible
By defaults, uses the -s (substring) mode of archie, which search 
case-insentively for the string as a substring. but other archie modes can
be given (-r, -e, -c, -s)

k-archie interactively lists the results in raw, unformatted form, but
maintain a formatted sorted base in ~/.k-archie (and /tmp/k-archie.PNUM/RESULT
as well as pending sites and errors). This formatted database will be listed
on stdout at the end of the search, and a HTML page be created for direct ftp 
by mosaic/netscape.

^C can be typed anytime to abort seach and keep already downloaded infos, that 
will be sorted and formatted normally. Thus you can browse ~/.k-archie while 
k-archie is still busy, and hit ^C when you have your info, but still have a 
well formatted html page.
NOTE: it is recommended you run k-archie once, and view afterwards the URL
    file://localhost/" (expand-filename "~/.k-archie.html") "
and save its location in your hotlist or bookmarks

k-archie timesout after 5mn, this can be changed via -timeout minutes

k-archie also saves results in a directory in /tmp which is cleaned after
1 hour, timeout changeable by -cleaner-timeout hours. Directory is named
/tmp/k-archie.XXX where XXX is process number, and contains:
    QUERY      the text of the query
    ERRORS     the list of archie sites which returned an error
    RESULT     the results
    RESULT.html same thing in html format

AUTHORS: Colas.Nahaboo@inria.fr, Vincent.Bouthors@inria.fr (html page)
"))
    (exit 1)
))

(setq timeout 300000)
(if (setq timeout-pos (seek *arguments* "-timeout")) (progn
    (delete *arguments* timeout-pos)
    (setq timeout-opt (getn *arguments* timeout-pos))
    (delete *arguments* timeout-pos)
    (if timeout-opt 
      (setq timeout (* 60000 (Int timeout-opt)))
    )
    (if (< timeout 1000) (setq timeout 1000)) ;at least 1s
))
(setq time-of-timeout (+ (get-internal-run-time) timeout))

(setq cleaner-timeout 3600)
(if (setq cleaner-timeout-pos (seek *arguments* "-cleaner-timeout")) (progn
    (delete *arguments* cleaner-timeout-pos)
    (setq cleaner-timeout-opt (getn *arguments* cleaner-timeout-pos))
    (delete *arguments* cleaner-timeout-pos)
    (if cleaner-timeout-opt 
      (setq cleaner-timeout (* 3600 (Int cleaner-timeout-opt)))
    )
    (if (< cleaner-timeout 3600) (setq cleaner-timeout 3600)) ;at least 1h
))
(setq time-of-cleaner (+ (get-internal-run-time) (* cleaner-timeout 1000)))

(defstruct Son				;forked son (C archie client)
  name					;name of the archie host queried
  pid					;process ID of the process
  out					;the file containing its stdout
  err					;the file containing its stderr
  retcode				;() when still alive, then retcode
)

(setq archie-dir (+ "/tmp/k-archie." (String *current-process-id*)))
(setq archie-log (+ archie-dir "/RESULT"))
(setq archie-query (+ archie-dir "/QUERY"))
(setq archie-errors (+ archie-dir "/ERRORS"))
(setq archie-log-html (+ archie-log ".html"))
(setq current-date (cur-date))

(setq archie-command (list "archie" "-l" "-m9999"))
(catch 'mode-set
  (dolist (arg (subseq *arguments* 1))
    (if (match "^-.*[cers].*$" arg) (throw 'mode-set))
  )
  (lappend archie-command "-s")
)

(defun remaining-time (time &aux 
    (seconds (/ (- time (get-internal-run-time)) 1000))
  )
  (if (< seconds 0) (setq seconds 0))
  (print-format String "%0%1%2s"
    (if (>= seconds 3600)
      (+ (String (/ seconds 3600)) "h ") ""
    )
    (if (>= seconds 60)
      (+ (String (/ (mod seconds 3600) 60)) "mn ") ""
    )
    (mod seconds 60)
  )
)

(setq int-handler (+ 
    "trap \"\" 2 1
echo
echo '**************************** SUMMARY ****************************'
cat " archie-log ";"
    "cp " archie-log " " (expand-filename "~/.k-archie;")
    "cp " archie-log-html " " (expand-filename "~/.k-archie.html;")
    "cd " archie-dir ";"
    "for i in *.ERR;do if test -s $i;then echo -n \"$i: \" >>ERRORS;cat $i >>ERRORS;fi;done;"
    "rm -f *.OUT *.ERR;"
    "echo '*****************************************************************'
echo 'results: text in ~/.k-archie'
echo '         html in file://localhost/" (expand-filename "~/.k-archie.html") "'
echo 'and, on machine " *hostname*
" (will be erased in " (remaining-time time-of-cleaner) "):'
echo '         text in " archie-log "'
echo '         html in " archie-log-html "'"
))

(defun main (&aux
    (N 0)
    (re-res (regcomp
	"^([0-9][0-9][0-9][0-9])([0-9][0-9])([0-9][0-9])[^ ]+[ ]+([0-9]+) ([^ ]+)[ ](.*)$"
    ))
    (lines (list))			;results
    (sons (list))
    file-out file-err
    son
  )
  (setq host-list (update-host-list))	;get active sites
  (sh mkdir $archie-dir)
  (with (fd (open archie-query :direction :output :if-exists :supersede))
    (dolist (arg (subseq *arguments* 0))
      (print-format fd "%r0 " arg)
  ))    
  
  (dohash (host comment host-list)		;forks a client per site
    (setq file-out (+ archie-dir "/" host ".OUT"))
    (setq file-err (+ archie-dir "/" host ".ERR"))
    (setq pid (system 
	(+ archie-command (list "-h" host) (subseq *arguments* 1))
	:output file-out :error file-err :input "/dev/null"
    ))
    (lappend sons (make-Son :name host :pid pid :out file-out :err file-err))
  )
					;forks the timeouter
  (setq timeout-pid (system (list "sleep" (String (/ timeout 1000)))))
					;forks the cleaner
  (system (+ "sleep " (String (+ cleaner-timeout (/ timeout 1000)))
      ";rm -rf " archie-dir)
    :nohup t)
					;main loop: wait for answers
  (print-format *standard-error* 
    "Querying %0 hosts by archie...(timeout in %1mn, or ^C any time)\nTemporary results stored in %2/RESULT\n"
    (length sons) (/ timeout 60000) archie-dir
  )
  (setq sons-length (length sons))
  (setq sons-answered 0)
  (while (progn  (trap-signal 2 int-handler 1)		;interruptible
      (and timeout-pid (setq dead (wait () :blocking 1)))
    )
    (trap-signal 2 t)			;do not interrupt
    (if (= #[dead 0] timeout-pid)
      (progn
	(setq timeout-pid ())		;timeout, stop
	(print-format 
	  "*** timout reached, writing results! (%0 sites answered out of %1)\n"
	  sons-answered sons-length
	)
      )
      (progn				;process archie output:
	(catch 'found 
	  (dolist (curson sons)
	    (if (= #[dead 0] (Son-pid curson))
	      (throw 'found (setq son curson))
	  ))
	  (setq son ())
	)
	(if son (progn
	    (with (out (open (Son-out son))) ;a stdout is ready
	      (setqn ans-lines 0 ans-new 0 new-entries (list))
	      (while (setq line (read-line out ()))
		(if (regexec re-res line)
		  (with (entry (print-format String "%0-%1-%2 %3 %4 %5"
			(regsub re-res 1) (regsub re-res 2)
			(regsub re-res 3) (regsub re-res 4) (regsub re-res 5)
			(regsub re-res 6)
		    ))
		    (incf ans-lines)
		    (with (sline (substring-ptr entry 9) ;exclude date
			nothere t
		      )
		      (dohash (line son lines)
			(if (= (substring-ptr line 9) sline) 
			  (setq nothere ())
		      ))
		      (if nothere (progn
			  (put lines entry son)
			  (incf ans-new)
			  (write-line entry)
		      ))
	      ))))
	      (Son-retcode son #[dead 1])
					;remove error log if no errors
	      (if (= 0 (get (file-stats (Son-err son)) 'size 0))
		(wait (system (list "rm" "-f" (Son-err son))))
	      )
	      (incf sons-answered)
	      (write-summary lines)
	      (wait (system (list 
		    "cp" archie-log (+ (expand-filename "~/.k-archie")))))
	      (wait (system (list 
		    "cp" archie-log-html (+ 
		      (expand-filename "~/.k-archie.html")))))
	      (flush ())
	      (print-format 
		"*** %0: [%6/%7] %1 entries (%2 new) :%3%4%5 timeout in %8\n" 
		(Son-name son) ans-lines ans-new 
		(if (/= 0 (Son-retcode son)) " (retcode: " "")
		(if (/= 0 (Son-retcode son)) (Son-retcode son) "")
		(if (/= 0 (Son-retcode son)) ")" "")
		sons-answered
		(- sons-length sons-answered)
		(remaining-time time-of-timeout)
	      )
	  ))
	  ;;(print-format "Warning: unknown process ID: %0\n" #[dead 0])
    )))
  )
  ;;clean
  (sh cd $archie-dir 
    "; for i in *.ERR;do if test -s $i;then echo -n \"$i: \";cat $i;fi;done")
  (wait (system int-handler))		;clean dir
)

(setq re-hostname (regcomp "^[ \t]*([-a-zA-Z_0-9.]+)"))

(defun update-host-list (&aux fd line (host-list (list))
  )
  ;; not done yet, TODO...
  ;;  (setq fd (open "~colas/.archie-servers"))
  (setq fd (open 
      "
    archie.univ-rennes1.fr      129.20.254.2    France
    archie.doc.ic.ac.uk         146.169.2.10    United Kingdom
    archie.hensa.ac.uk          129.12.21.25    United Kingdom
    archie.uni-linz.ac.at       140.78.3.8      Austria
    archie.univie.ac.at         131.130.1.23    Austria
    archie.funet.fi             128.214.6.102   Finland
    archie.th-darmstadt.de      130.83.128.118  Germany
    archie.ac.il                132.65.16.18    Israel
    archie.unipi.it             131.114.21.10   Italy
    archie.uninett.no           128.39.2.20     Norway
    archie.rediris.es           130.206.1.2     Spain
    archie.luth.se              130.240.12.30   Sweden
    archie.switch.ch            130.59.1.40     Switzerland
    archie.cs.mcgill.ca         132.206.51.250  Canada
    archie.uqam.ca              132.208.250.10  Canada
    archie.unl.edu              129.93.1.14     USA (NE)
    archie.internic.net         198.49.45.10    USA (NJ)
    archie.rutgers.edu          128.6.18.15     USA (NJ)
    archie.ans.net              147.225.1.10    USA (NY)
    archie.sura.net             128.167.254.179 USA (MD)
    archie.twnic.net            192.83.166.10   Taiwan
    archie.ncu.edu.tw           192.83.166.12   Taiwan
    archie.wide.ad.jp           133.4.3.6       Japan
    archie.hana.nm.kr           128.134.1.1     Korea
    archie.sogang.ac.kr         163.239.1.11    Korea
    archie.au                   139.130.4.6     Australia
" :type :string))

  (while (setq line (read-line fd ()))
    (if (regexec re-hostname line)
      (put host-list (regsub re-hostname 1) (list))
  ))
  host-list
)

(defun write-summary (lines &aux
    (fd (open archie-log :direction :output :if-exists :supersede))
    (fd-html (open archie-log-html :direction :output :if-exists :supersede))
    (entries (list))
    (re (regcomp "^([^ ]+) ([^ ]+) ([^ ]+) (.*)[/](.+)$"))
    (re-dir (regcomp "[/]$"))
    (ht (Hashtable ()))			
    ind					;index: (name size)
    val					;value: list of (date dir host)
    (l (list))
    print-margin-item:prompts
  )
  
  (print-format fd-html "
<HEADER>
<TITLE>Result of k-archie search on %0</TITLE>
</HEADER>
<H1> Results of query: %0 </H1>
made on %1 on %2
<H1> How to get files by ftp from here </H1>
The presentation of found items is a little tricky to be more useful :
<UL>
<LI> The text may be used as is with ange ftp using cut and paste.
<LI> The text is made of more than one anchored frame.  It is the trick.  Move the cursor around and look at what is the corresponding url... Clicking on:
<UL>
<LI> hostname will put you in the root dir on the host
<LI> filename will get you directly the file itself
<LI> rest (path or date) will put you in the directory of the file
</UL>
</UL>
" 
    (print-command *arguments*) 
    (with (cd current-date)
      (print-format String "%0/%1/%2 %3:%4" #[cd 0] #[cd 1] #[cd 2]
	#[cd 3] #[cd 4]
    ))
    *hostname*
  )
  (dohash (line line-son lines)
    (if (regexec re line)
      (with (name (regsub re 5) size (regsub re 2)
	  ind (list name (if (regexec re-dir name) "***directory***" size))
	  val (get ht ind '(list))
	)
	(put ht ind
	  (lappend val (list (regsub re 1) (regsub re 4) (regsub re 3))
  )))))
  (dohash (ind val ht)
    (lappend l (list ind val))
  )
  (sort l {lambda (e1 e2 &aux r)
    (if (/= 0 (setq r (compare #[e1 0 0] #[e2 0 0])))
      r
      (compare #[e2 0 1] #[e1 0 1]))
    }
  )
  (dolist (e l)    
      ;; HTML
    (print-format fd-html "<H3> %0 <EM> %1%2:</EM></H3>\n" #[e 0 0] #[e 0 1]
      (if (= #[e 0 1] "***directory***") "" " bytes")
    )
      ;; ASCII
    (print-format fd "%0    %1%2:\n" #[e 0 0] #[e 0 1]
      (if (= #[e 0 1] "***directory***") "" " bytes")
    )
    (setq lv (list))			;p-list of (host date) pairs per dir
    (dolist (val #[e 1])
      (setq lvl (get lv #[val 1] '(list)))
      (catch 'already-there
	(dolist (host-date lvl)
	  (if (= #[host-date 0] #[val 2])
	    (throw 'already-there)
	))
	(put lv #[val 1] lvl)
	(lappend lvl (list #[val 2] #[val 0]))
    ))
    
    (with (*standard-output* fd)
	;; HTML
      (progn
	(print-format fd-html "<UL>")
	(dohash (dir val lv)
	  (dolist (host-date val)
	    ;; host dir name date
	    (print-format fd-html "<LI>
<A HREF=ftp://%0/>/%0:</A><A HREF=ftp://%0%1>%1/</A><A HREF=ftp://%0%1/%2>%2</A> <A HREF=ftp://%0%1/><EM>%3</EM></A>
"
	      #[host-date 0] dir #[e 0 0] #[host-date 1]
	)))
	(print-format fd-html "</UL>")
      )
	;; ASCII
      (dohash (dir val lv)
	(setq print-margin-item:prompts
	  (list (print-format String "    %0: " dir)
	    "        " "\n" "" ", " "" ))
	(print-margin-item :begin)
	(dolist (host-date val)
	  (print-margin-item (+ #[host-date 0] " " #[host-date 1]))
	)
	(print-margin-item :end)
))))

(defun print-command (l &aux s)
  (setq s (match "^(.*[/])?([^/]+)" (get l 0 "???") 2))
  (dolist (arg (subseq l 1))
    (nconc s " " arg)
  )
  s
)

(main)

;;; EMACS MODES
;;; Local Variables: ***
;;; mode:lisp ***
;;; End: ***

