#!/bin/sh
: ; exec klone $0 "$@"
; The above line finds the klone executable in the $PATH
;; wget-fix-base: a wget cleaner
;; TODO: generate script to get included data (img/src object/data) with -x
;; first local, then external. param to exclude ext, and limit //
;; TODO: make an index.html at toplevel listing .html files not referenced
;; from other pages (root nodes)

;; things modified:
;; <base href="xxx">
;; href="xxx" or src="xxx":
;;      links to top of hierarchy: /...
;;      absolute URL to a site of the world
;;      option: lists links to outer sites

;; Elements processed: (everything from the HTML4.0 spec)
;; <BASE href="XXX"> 
;; <LINK href="XXX">
;; <BODY background="XXX">
;; <FORM action="XXX">
;; <INPUT src="XXX">
;; <A href="XXX">
;; <IMG src="XXX">
;; <APPLET codebase="XXX">
;; <AREA href="XXX">
;; <IMG lowsrc="XXX">
;; <IMG usemap="XXX">
;; <INS cite="XXX">
;; <DEL cite="XXX">
;; <INPUT usemap="XXX">
;; <OBJECT classid="XXX">
;; <OBJECT codebase="XXX">
;; <OBJECT data="XXX">
;; <OBJECT usemap="XXX">
;; <OBJECT name="XXX">
;; <Q cite="XXX">
;; <BLOCKQUOTE cite="XXX">
;; <FRAME src="XXX">
;; <IFRAME src="XXX">
;; <HEAD profile="XXX">
;; <SCRIPT src="XXX">
;; not in the 4.0 spec: <IMG lowsrc="XXX"> (netscape-specific :-(

;; This script works quite well. Its implementation uses global variables
;; rather liberally, for the sake of simplicity

;; Command-line interface: getopts parses the options below and returns the
;; remaining positional arguments (an optional target directory) in args.
;; Each option clause is (flag argument-name variable help-text [:multiple t]).
;; Typo fixes only in the help strings; the option set itself is unchanged.
(setq args (getopts "USAGE: %0 [options] [dir]
Script to fix locally mirrored HTML document links, to point to local copy 
in relative mode if available, so that the directory can be moved or put on a 
CD-ROM and browsed offline.
Must be run in a directory hierarchy after a recursive wget to transform 
links to their local form. Performs the following functions:
  * suppresses the <BASE> tags that force links to point to the internet
  * fix relative links according to the <BASE> tags
  * fix absolute local paths (/...) into relative ones
  * convert links to URLs already mirrored in the same \"world\" to relative
  * converts links to dirs to links to index.htm[l] files there if they exist
  * suppresses lowsrc of images if a src exists
  * prints a summary of remaining external links
It tolerates URLs not enclosed in quoted strings.
Must be run in the directory you made the wgets. Considers all the sites 
mirrored (directories in this dir) as belonging to the same \"world\", allowing
local links between them.
This script can be re-run several times on the same data, after mirroring 
other sites in the same \"world\", for instance. Very useful after mirroring
many linked sites with wget -r -Dxxx,yyy -H ...
To use it on sites, do a (do NOT use -k, it garbles base tags):
    wget -r -l0 --no-parent -t0 --follow-ftp SITES...
    wget-fix-base
    ...you may then create an ALIASES file and redo the wget-fix-base,
       or add other sites, and redo the wget-fix-base
"
    ("-q" () quiet 
      "quiet operation (otherwise verbose on stderr)")
    ("-n" () nowrite 
      "don't modify files")
    ("-a" alias=host host-aliases-names 
      "specify aliases for host name, e.g., -a sun.com=www.sun.com
host must be the dir name appearing in the current dir
So that, if a link refers to \"sun.com\", it can be converted to a
relative link to \"www.sun.com\"
Aliases can also be stored in an ALIASES file in the current dir
with aliases of the form alias=host, one per line" :multiple t)
    ("-s" site-dir given-sites
      "Normally considers that all subdirs in given dir or 
current-dir are mirrored hierarchy sites. This option can restrict 
which sites will be actually processed" :multiple t)
    ("-i" () ignore-errors 
      "in case of error, continue to next file, otherwise
any error aborts the run")
    ("-A" () all-files 
      "treats ALL files. Normally only *.htm, *.html, *.asp, *.shtml files
are examined and changed, this flag processes all files")
    ("-S" regexp re-suffixes 
      "specifies the regular expression that must match file names to be
processed. Defaults to \"([.]s?html?)|([.]asp)$\"")
    ("-o" () orig 
      "keeps backup copy of modified files into .orig suffixed copies.
Default is to modify in place without backups. This script's effects
can then be undone by:
    echo '#!/bin/ksh
    mv \"$1\" \"${1%.orig}\"' >/tmp/foo; chmod a+x /tmp/foo
    find . -name \\*.orig -exec /tmp/foo {} \\;
    rm -f /tmp/foo
ksh can be replaced by bash, zsh or any shell with the ${x%y} form.
Alternatively, you can backup beforehand files by a:
    tar cfvz BACKUP.tgz `find . '(' -name '*.[hH][Tt][Mm]' -o -name\\
                    '*.[hH][Tt][Mm][lL]' ')' -print`")
    ("-B" () do-backup 
      "performs the above backup command before running to backup all
html files into a BACKUP.tgz file. To be safe, does not make the
backup file if one already exists")
    ("-l" () allow-lowsrc 
      "do not remove lowsrc attributes in images")
    ("-e" () list-all-externals 
      "lists all external URLs referred to. Default is to only list hosts.")
    ("-E" file list-all-externals-file
      "lists all external URLs referred to as -e, but stores output in file"
    )
    ("-1" filename only-one-file
      "only processes one html file given in argument (debug)"
    )
    ("-D" () debug "debug mode, very verbose")
    ("-DD" () debug2 "debug2 mode, VERY verbose")
    ("-T" () do-trace-all "full trace mode, max verbosity"
    )
))

;; Verbose output (default): Items are prefixed by:
;; === when processing a file (or site), prints the file name and dir
;; *** for BASE tags
;; ### when a link is replaced
;; --> when entering a subdir
;; <-- when going back to parent dir

;;-----------------------------------------------------------------------------
;; Options postparsing 
;; If a directory argument was given, make it the working directory.
(if args (setq *current-directory* (getn args 0)))
(setq verbose (not quiet))

;; host-aliases maps alias -> real host dir name; filled from the ALIASES
;; file and from -a options, and later self-aliased for every site in main.
(setq host-aliases (list))		;plist: alias name -> host dir name
(setq aliases-re (regcomp "^[ \t]*([^= ]+)[ \t]*=[ \t]*([^= ]+)[ \t]*$"))

;; Read alias=host lines from an optional ALIASES file in the current dir.
(with (line () 
    fd (open "ALIASES" :error ()))	;open ALIASES, () on failure
  (if fd 
    (while (setq line (read-line fd ()))
      (if (match "^[ \t]*#" line) t	;comment lines begin by #
	(regexec aliases-re line)
	(put host-aliases (regsub aliases-re 1)(regsub aliases-re 2))
))))
(dolist (alias host-aliases-names)	;add aliases from -a command line opts
  (if (regexec aliases-re alias)
    (put host-aliases (regsub aliases-re 1)(regsub aliases-re 2))
))

(if do-trace-all (trace-all t))		;-T: trace everything

;; -E implies -e; external URLs go to stdout unless -E gave a file.
(if list-all-externals-file (setq list-all-externals t))
(setq list-all-externals-fd *standard-output*)
(if list-all-externals 
  (if list-all-externals-file (setq list-all-externals-fd 
      (open list-all-externals-file :direction :output :if-exists :supersede)
)))
(if debug2 (setq debug t))		;-DD implies -D

;;-----------------------------------------------------------------------------
;; Regexps: -nq form is for NonQuoted attribute values (foo=bar),
;; -sq for single-quoted values (foo='bar'), default is foo="bar"
;; Having a single RE for all cases would be too complex
;; Then, we dynamically build the RE for all the tag/attribute pairs that
;; are specified in the HTML spec to contain URLs
;; re-html: which file names get processed (user can override with -S).
(setq re-html (if re-suffixes (re-nocase re-suffixes)
    (re-nocase "([.]s?html?)|([.]asp)$")
))
;; Attribute-value character classes for the three quoting forms.
;; BUGFIX: re-url used to be "[^\"\'n]*" — a missing backslash made it
;; exclude the literal letter "n", so any quoted URL containing an "n"
;; (e.g. index.html) failed to match. Newline was clearly intended.
(setq re-url "[^\"\'\n]*")
(setq re-url-nq "[^ \t\n\"]+")
(setq re-url-sq "[^ \t\n']+")
;; Build one big alternation "<(tag1 ...attr1|tag2 ...attr2|...)" covering
;; every tag/attribute pair of the HTML spec that may hold a URL.
;; re-link-pos ends up as the index of the () group capturing the URL value.
(setq re-link (copy "<[ \t\n]*("))
(setq re-link-pos 1)
(dohash (tag attribute '(
      ;; HTML 3.2
      link href body background form action input src
      a href img src applet codebase area href img lowsrc img usemap
      ;; HTML 4.0 draft http://www.w3.org/TR/WD-html40-970708/sgml/dtd.html
      ins cite del cite input usemap object classid object codebase object data
      object usemap object name q cite blockquote cite frame src iframe src
      head profile script src
  ))
  (nconc re-link tag "[ \t\n]+([^>]+[ \t\n]+)?" attribute "|")
  (incf re-link-pos)
)
(put re-link (- (length re-link) 1) #\)) ;overwrite last "|" with ")"
(setq re-link-nq (copy re-link))
(setq re-link-sq (copy re-link))
;; Append the "=value" tail for each quoting form, then compile case-blind.
;; re-url (not re-url-sq) is right for the single-quoted tail too: it allows
;; spaces inside the quotes and excludes both quote characters.
(nconc re-link "[ \t\n]*=[ \t\n]*\"(" re-url ")\"[^>]*>")
(nconc re-link-nq "[ \t\n]*=[ \t\n]*(" re-url-nq ")[^>]*>")
(nconc re-link-sq "[ \t\n]*=[ \t\n]*'(" re-url ")'[^>]*>")
(setq re-link (re-nocase re-link))
(setq re-link-nq (re-nocase re-link-nq))
(setq re-link-sq (re-nocase re-link-sq))
(incf re-link-pos)

;;-----------------------------------------------------------------------------
;; on files we do a search and replace mechanism

;; do-file: process one HTML file in place.
;; Reads the whole file into a string, applies every Replacement rule
;; (base, lowsrc, links — each in the three quoting forms), and writes the
;; file back only if something changed (unless -n).
;; The &aux vars below are rebound per file and act as "globals" for the
;; Replacement callbacks, which see them through dynamic scoping.
(defun do-file (file curdir &aux
    ;; global vars that can be used:
    newbase
    newbasehost
    newbaseprotocol
    ;; local vars
    sep
  )
  (if (or all-files (regexec re-html file))
    (with (s (String (open file)) s-orig (copy s))
      ;; Open the file, read it in memory; keep a copy to detect changes
      (if debug (verbose? "=== FILE: %0%2%1 %3" curdir file 
	  (setq sep (if (= #\/ (getn curdir -1)) "" "/"))
	  (make-string 
	    (- 78 (length(print-format String "=== FILE: %0%2%1 %3" 
		  curdir file sep ""))) #\=
	  )
      ))
      (incf nfiles)
      ;; process <base> tags (first matching quoting form wins)
      (Replacement-do s Re-base)		;can set global var newbase
      (if (not newbase) (Replacement-do s Re-base-nq))
      (if (not newbase) (Replacement-do s Re-base-sq))
      ;; netscape-ism of low-resolution src in <img> (may be disabled by -l)
      (if (not allow-lowsrc) (progn
	  (Replacement-do s Re-lowsrc)
	  (Replacement-do s Re-lowsrc-nq)
	  (Replacement-do s Re-lowsrc-sq)
      ))
      ;;process URLS as attribute values (the main rewriting pass)
      (Replacement-do s Re-link)
      (Replacement-do s Re-link-nq)
      (Replacement-do s Re-link-sq)
      
      ;; write back the file, keeping a .orig backup when -o was given
      (if (and (not nowrite) (/= s s-orig)) (progn
	  (if orig (progn
	      (wait (system (list "mv" file (+ file ".orig"))))
	      (wait (system (list "chmod" "u+w" (+ file ".orig") 
		    :error "/dev/null" :output "/dev/null")))
	  ))
	  (with (fd (open file :direction :output :if-exists :supersede))
	    (incf nfilesdone)
	    (write s fd)
  )))))
)

;; what to search is abstracted into a Replacement structure
;; One search-and-replace rule applied to the file contents by Replacement-do.
(defstruct Replacement
  name					;rule name; not used, for info only
  re					;the compiled regexp to match
  pos					;index of the () group to replace
  func					;callback computing the replacement text
)

;; replacing is then quite simple: find regexps and replace
;; The work is done by the "func" callbacks (Re-xxx, detailed below)
;; Apply one Replacement rule to the whole buffer: every match of the
;; rule's regexp gets its selected sub-group rewritten by the rule's
;; callback (replace-string drives the callback via :npar/:quote).
(defun Replacement-do (buffer rule)
  (replace-string buffer (Replacement-re rule) (Replacement-func rule)
    :all t :npar (Replacement-pos rule) :quote t
))

;;-----------------------------------------------------------------------------
;; The search and replace cases:

;; base: fix the <base href="xxx"> statements so that we are not 
;; sent back on the network if the file is actually here
;; We remove the base statement, and if it was different than the file
;; location, set the global var newbase for converting local links
;; to this new base

;; Replacement rule for <base href="..."> (double-quoted form).
;; The base element is removed (callback returns ""); its directory is
;; recorded in the global newbase — and newbasehost/newbaseprotocol when it
;; points outside this site — so later links in the file can be rebased.
(setq Re-base (make-Replacement :name "base"
    :re (re-nocase (+
	"<[ \t\n]*base[ \t\n]+(href[ \t\n]*=[ \t\n]*\"(" re-url ")\")[^>]*>"))
    :pos 1
    :func (lambda (re &aux
	(text (regsub re 2))
	(url (parse-url text))
      )
      (setq newbase (if (/= "" (URL-reldir url)) (URL-reldir url) ()))
      (PVD "BASE" curdir file (LURL url) newbase)
      (if (and (/= "" (URL-host url)) (/= site (URL-host url))
	  (not (URL-local url))
	) (progn
	  ;; BUGFIX: format string used %0 twice with a single argument;
	  ;; now prints the file name first like the other diagnostics
	  (verbose? "*** %0: WARNING: EXTERNAL BASE: %1" file text)
	  (if (not (seek external-bases text))
	    (lappend external-bases text)
	  )
	  (setq newbasehost (URL-host url))
	  (setq newbaseprotocol 
	    (if (/= "" (URL-protocol url)) (URL-protocol url) "http:")
	  )
	)
	(setq newbasehost ())		;else: base is local, clear host
      )
      (verbose? "### %0: BASE %1 EXPANDED-TO %2" file text newbase)
      ""				; remove base element
    )
))

;; same for non-quoted 
;; Same rule for the non-quoted form <base href=...>; shares Re-base's
;; callback. NOTE(review): :pos is 0 here (vs 1 in Re-base), presumably the
;; whole match, since this regexp has no group wrapping the href attribute.
(setq Re-base-nq (make-Replacement :name "base-nq"
    :re (re-nocase (+
	"<[ \t\n]*base[ \t\n]+href[ \t\n]*=[ \t\n]*(" re-url-nq ")[^>]*>"))
    :pos 0
    :func (Replacement-func Re-base)
))

;; same for single-quoted 
;; Same rule for the single-quoted form <base href='...'>.
;; BUGFIX: now uses re-url-sq — defined for exactly this purpose but
;; previously unused — instead of re-url-nq, which does not exclude the
;; single quote and could over-match past the closing quote.
(setq Re-base-sq (make-Replacement :name "base-sq"
    :re (re-nocase (+
	"<[ \t\n]*base[ \t\n]+href[ \t\n]*=[ \t\n]*'(" re-url-sq ")'[^>]*>"))
    :pos 0
    :func (Replacement-func Re-base)
))

;; lowsrc:
;; suppresses lowsrc attributes of images if a src exists (lowsrc is not needed
;; on a local copy)

;; Detects a src attribute inside the matched <img> tag.
(setq Re-lowsrc:src (re-nocase "[ \t\n]+src[ \t\n]*="))
;; Replacement rule for <img lowsrc="..."> (double-quoted form): group 2 is
;; the whole lowsrc attribute; it is dropped when the tag also carries a src
;; attribute (lowsrc is pointless on a local copy), kept unchanged otherwise.
(setq Re-lowsrc (make-Replacement :name "lowsrc"
    :re (re-nocase (+
	"<[ \t\n]*img[ \t\n]+([^>]+[ \t\n]+)?(lowsrc[ \t\n]*=[ \t\n]*\"("
	re-url ")\")[^>]*>")
    )
    :pos 2
    :func (lambda (re &aux)
      (if (regexec Re-lowsrc:src (regsub re 0))	;tag also has a src= ?
	(progn
	  (verbose? "### %0: LOWSRC %1 SUPPRESSED" file (regsub re 2))
	  ""					;yes: drop the lowsrc attr
	)
	(regsub re 2)				;no: keep it as-is
))))

;; same for non-quoted 
;; Same lowsrc rule for the non-quoted form (lowsrc=value); shares the
;; Re-lowsrc callback.
(setq Re-lowsrc-nq (make-Replacement :name "lowsrc-nq"
    :re (re-nocase (+
	"<[ \t\n]*img[ \t\n]+([^>]+[ \t\n]+)?(lowsrc[ \t\n]*=[ \t\n]*("
	re-url-nq "))[^>]*>")
    )
    :pos 2
    :func (Replacement-func Re-lowsrc)
))

;; same for single-quoted 
;; Same lowsrc rule for the single-quoted form (lowsrc='value').
;; BUGFIX: now uses re-url-sq — defined for this purpose but previously
;; unused — instead of re-url-nq, which does not exclude the single quote.
(setq Re-lowsrc-sq (make-Replacement :name "lowsrc-sq"
    :re (re-nocase (+
	"<[ \t\n]*img[ \t\n]+([^>]+[ \t\n]+)?(lowsrc[ \t\n]*=[ \t\n]*'("
	re-url-sq ")')[^>]*>")
    )
    :pos 2
    :func (Replacement-func Re-lowsrc)
))

;; main workhorse: fix URLs in links
;;  * relative links if the base must be applied
;;  * absolute links into rel links if local copy exists

;; Main workhorse: rewrite the URL value of every link-bearing attribute.
;;  * internal #anchor links are kept as-is
;;  * links with a local mirrored copy (or rebased by <base>) become relative
;;  * links rebased to an external host get the absolute rebased form
;;  * site-absolute links (/...) without a local copy fall back to
;;    http://site/... and are recorded in internal-notfound
(setq Re-link (make-Replacement :name "link"
    :re re-link
    :pos re-link-pos
    :func (lambda (re &aux 
	res
	(dummy (if debug (verbose? "%0LINK%0\n   %1" dashes (regsub re 0))))
	(url (parse-url (regsub re re-link-pos) newbase))
      )
      (if (URL-internal url)
	(setq res (URL-internal url))
	;; else
	(progn
	  (if (and (/= "mailto:" (URL-protocol url))
	      (or (URL-local url) parse-url:relmoved)) ;local copy exists?
	    (setq res (+ (URL-relname url) (URL-anchor url)))
	    (if parse-url:absmoved
	      (setq res (+ parse-url:absmoved (URL-anchor url)))
	      ;;else keep orig text, except if was local-absolute
	      (if (= #\/ (getn (URL-text url) 0)) (progn
		  (setq res (+ "http://" site (URL-text url)))
		  ;; BUGFIX: format used %0 twice with a single argument; now
		  ;; prints the file first, like every other "###" diagnostic
		  (verbose? "### %0: WARNING: didn't find local copy of local link to %1" file (URL-text url))
		  (if (not (seek internal-notfound (URL-text url)))
		    (lappend internal-notfound (URL-text url))
		  )
		)
		(setq res (URL-text url))) 
	    )
	  )
	  (PVD (LURL url) newbase parse-url:relmoved parse-url:absmoved res)
	  (if verbose
	    (if (/= res (URL-text url))
	      (verbose? "### %0: %1 ==> %2" file (URL-text url) res)
	)))
      )
      res
)))

;; same for non-quoted 
;; Same link rule for non-quoted attribute values; shares Re-link's callback.
(setq Re-link-nq (make-Replacement :name "link-nq"
    :re re-link-nq
    :pos re-link-pos
    :func (Replacement-func Re-link)
))

;; same for single-quoted 
;; Same link rule for single-quoted attribute values.
(setq Re-link-sq (make-Replacement :name "link-sq"
    :re re-link-sq
    :pos re-link-pos
    :func (Replacement-func Re-link)
))

;;-----------------------------------------------------------------------------
;; recursion into dirs

;; do-file-or-dir: dispatch an entry to do-file or do-dir by file type.
;; With -i, errors inside one file are caught (thrown as 'ERROR) so the run
;; continues with the next entry. NB: this dialect's if takes cond-style
;; test/expr pairs, hence the (= 'directory ftype) clause.
;; NOTE(review): ftype is setq'd without an &aux declaration, so it is
;; effectively global — harmless here but worth knowing.
(defun do-file-or-dir (file curdir)
  (PVD file curdir (file-type file))
  (if ignore-errors
    (catch 'ERROR			;errors abort only current file
      (if (= 'file (setq ftype (file-type file)))
	(do-file file curdir)
	(= 'directory ftype)
	(do-dir file curdir)
    ))
    (if (= 'file (setq ftype (file-type file)))
      (do-file file curdir)
      (= 'directory ftype)
      (do-dir file curdir)
)))


;; do-dir: recurse into a subdirectory, processing its entries in sorted
;; order. curdir is the path of dir's parent relative to the mirror root.
;; Rebinding *current-directory* scopes the chdir to this call.
;; BUGFIX: removed a stray extra closing parenthesis that unbalanced the form.
(defun do-dir (dir curdir &aux
    (*current-directory* dir)
  )
  (verbose? "--> entering dir %0" (if (/= curdir "") (+ curdir "/" dir) dir))
  ;; process files in alphabetic order, easier for humans to follow
  (dolist (file (sort (directory) compare)) 
    (do-file-or-dir file (if (/= curdir "") (+ curdir "/" dir) dir)
  ))
  (verbose? "<-- back to dir  %0" curdir)
)

;;-----------------------------------------------------------------------------
;; parse-url
;; parses an URL, returns a struct URL. The main -hairy- logic is here. 
;; Take extra steps to ease the work for further processing
;; empty fields are null strings ""

;; Parsed URL, as produced by parse-url. Empty fields are null strings "".
(defstruct URL 
  :text					;original complete URL text
  :protocol				;mailto:, http:, ftp:, file:...
  :host					;host name, aliases expanded
  :dir					;directory part within the host
  :file					;file (expanded to index.htm[l] if it exists)
  :anchor				;internal anchor (#-prefixed)
  :internal				;set when link is internal to same file,
					;in which case rest of fields are nil
  :orig-host				;host name before alias expansion
  :localname				;absolute path of the local copy
  :local				;file-stats of local copy if it exists
  :relname				;relative path from here to local copy
  :reldir				;relative path to the copy's directory
  :fullform				;complete canonical URL, regenerated
)

;; Regexps for parse-url: redir splits a path into dir(2)/file(3);
;; re splits a URL into protocol(2), host(4), dir(6), file(7), #anchor(8).
(setq parse-url:redir (regcomp "^((.*)[/])?([^/]*)$"))
(setq parse-url:re (regcomp 
    "^(([a-zA-Z]*:)?)([/][/]([^/]+))?((.*)[/])?([^#]*)(#(.*))?$"))
;; parse-url: parse string s into a URL struct; newbase is the directory set
;; by a preceding <base> tag, if any. Also sets the three parse-url:* globals
;; below, and records external hosts/URLs for the final statistics.
;; Relies on dynamically bound curdir/site/root/current-site from callers.
(defun parse-url (s &optional newbase &aux 
    name relname d f a url h 
    (current-site-alias (get host-aliases current-site current-site))
  )					; global variables set:
  (setq parse-url:relmoved ())		; moved by BASE to a relative link
  (setq parse-url:absmoved ())		; moved by BASE to an external link
  (setq parse-url:internal ())		; link of the form #...
  (if (regexec parse-url:re s)
    (with (host (remove-port (regsub parse-url:re 4)))
      ;; count/list references to hosts outside the mirrored "world"
      (if (and (/= "" host) (not (getn host-aliases host)))
	(if list-all-externals 
	  (if (not (seek external-urls s)) (lappend external-urls s))
	  (put external-hosts host (+ 1 (get external-hosts host 0)))
      ))
      (setq url
	(if (/= #\# (getn s 0))		;not relative link in same file
	  (make-URL 
	    :host (setq h 
	      (get host-aliases host (if (= "" host) current-site-alias host))
	    )
	    :protocol (regsub parse-url:re 2)
	    :dir (setq d (regsub parse-url:re 6))
	    :file (setq f (regsub parse-url:re 7))
	    :anchor (setq a (regsub parse-url:re 8))
	    :text s :orig-host host
	    :localname (if (or		;absolute path if either:
		(= #\/ (getn d 0))		;path (dir) begins by /
		(/= "" host)		;host was specified
		(and (= "" d) (= "" f) (= "" a)); all empty? means url was "/"
	      )
	      (progn			;absolute: root/site/dir/file
		(setq name (+ root "/" 
		    (if (= "" host) site (get host-aliases host host))
		    d "/" f
		))
		(setq name (expand-index name a))
		(PVD "parse-url:abspath" s d f a host name)
		name
	      )
	      (progn			;otherwise, relative path
		(setq relname (if newbase
		    (+ newbase (if (= "" d) "" "/") d "/" f)
		    (+ d (if (= "" d) "" "/") f)
		))
		(setq relname (expand-index relname a))
		(if newbase 
		  (if newbasehost		
		    (setq parse-url:absmoved (+			;change text
			newbaseprotocol "//" (strip-dotdot newbase)
			(if (= #\/ (getn newbase -1)) "" "/") f
		    ))
		    (setq parse-url:relmoved t)) ;else insert relname
		)
		(PVD "parse-url:relpath" s d f a newbase relname)
		(+ root "/" curdir (if (= "" curdir) "" "/") relname)
	      )
	    )
	    :relname (if relname relname (setq relname 
		(dir-relpath curdir (subseq name (+ 1 (length root))))
	    ))
	    :reldir (match parse-url:redir relname 2)
	    :local (file-stats (if name name relname))	    
	  )
	;; internal url (#anchor in same file), simplified case
	  (make-URL 
	    :text s
	    :host (setq h 
	      (get host-aliases host (if (= "" host) current-site-alias host))
	    )
	    :protocol (regsub parse-url:re 2)
	    :dir (setq d (regsub parse-url:re 6))
	    :file (setq f (regsub parse-url:re 7))
	    :anchor (setq a (regsub parse-url:re 8))
	    :internal s
	  )
      )) ; if
      ;; regenerate a canonical full form for URLs within the mirrored world
      ;; NOTE(review): (URL-fullform url value) is presumably a setter form
      (URL-fullform url 
	(if (and (seek ["" "http:" "ftp:"] (URL-protocol url))
	    (seek sites (URL-host url))
	    (not (URL-internal url))
	  )
	  (+ (if (/= "" (URL-protocol url)) (URL-protocol url) "http:") "/" 
	    (strip-middotdot (subseq (URL-localname url) (length root)))
	  )
	  (URL-text url)
      ))
      ;; remember in-world URLs whose local copy is missing (reported at end)
      (if (URL-local url)
	(quote-% url)			;URL actually mirrored
	(when (and (not parse-url:relmoved) (not (URL-internal url)))
	  (if (and (seek sites (URL-host url)) 	;URL local to mirrored site?
	      (not (seek local-notfound-urls (URL-fullform url)))
	      (seek ["" "http:" "ftp:"] (URL-protocol url))
	    )
	    (lappend local-notfound-urls (URL-fullform url))
	  )
	)
      )
      (if debug2 (progn (PF "Parsed URL structure: ") (struct-pp url)))
      url
    ) ; with
    (error "unrecognized URL syntax: %r0\n" s)
  )
  url
)


;;-----------------------------------------------------------------------------
;; misc utils

;; quote % in paths
;; quote-%: escape literal "%" as "%25" in the path fields of a mirrored URL
;; so that the rewritten link stays a valid URL.
;; NOTE(review): replace-string appears to mutate the strings in place —
;; the modified fields are never stored back into the struct. Confirm.
(defun quote-% (url &aux name)
  (if (setq name (URL-localname url))
    (if (regexec re-% name)
      (replace-string name re-% "%25" :quote t :all t)
  ))
  (if (setq name (URL-relname url))
    (if (regexec re-% name)
      (replace-string name re-% "%25" :quote t :all t)
  ))
  (if (setq name (URL-reldir url))
    (if (regexec re-% name)
      (replace-string name re-% "%25" :quote t :all t)
  ))
)

;; compute minimal relpath from source dir to dest file, both relative
;; Splits off the first path component: group 1 = head, group 2 = rest.
(setq dir-relpath:redir (regcomp "^([^/]*)[/]?(.*)$"))
;; dir-relpath: minimal relative path from directory `source` to file `dest`
;; (both given relative to the same root): strip the common leading
;; components, then climb with one "../" per remaining source component.
(defun dir-relpath (source dest &aux
  )
  ;; drop the common prefix component by component
  (while (and (/= "" source)
      (= (match dir-relpath:redir source 1)
	(match dir-relpath:redir dest 1)
    ))
    (setq source (match dir-relpath:redir source 2))
    (setq dest (match dir-relpath:redir dest 2))
  )
  (if (= source "")
    dest
    (with (up (copy ""))
      ;; one "../" per component left in source
      (while (/= source "")
	(nconc up "../")
	(setq source (match dir-relpath:redir source 2))
      )
      (+ up dest)
  ))
)

;; strip-dotdot: drop any leading "../" components from a relative path,
;; returning the path unchanged when there are none.
(setq strip-dotdot-re (regcomp "^([.][.][/])*(.*)$"))
(defun strip-dotdot (path)
  (if (regexec strip-dotdot-re path)
    (regsub strip-dotdot-re 2)
    path
))
;; strip-middotdot: collapse "component/../" sequences in the middle of a
;; path, repeating until none are left.
(setq strip-middotdot-re (regcomp "^(.*[/])([^/]+[/][.][.][/])(.*)$"))
(defun strip-middotdot (s)
  (while (regexec strip-middotdot-re s) 
    (setq s (+ (regsub strip-middotdot-re 1) (regsub strip-middotdot-re 3)))
  )
  s
)

;; for backcompat with old klone installs with old defstruct
;; Back-compat shim for old klone installs whose defstruct generated
;; accessors named after the keyword (URL-:text): rebuild the URL struct
;; with plain-atom field names by re-evaluating a synthesized defstruct.
(if (boundp 'URL-:text) (with (call (list 'defstruct 'URL)) (progn
      ;; transform keyword into atoms for the defstruct call
      (defun k2a (obj)
	(if (typep obj Keyword) (intern (subseq obj 1)) (Atom obj))
      )
      (dolist (f (subseq URL 2))
	(lappend call (k2a f))
      )
      (eval call)
)))

;; do-only-one-file (-1 debug option): split the given path into
;; site (group 3) / curdir (group 1) / file (group 6), set up the same
;; dynamic context as the normal traversal, and process just that file.
(defun do-only-one-file (path &aux
    (re (regcomp "^((([^/]+)[/])?((.*)[/])?)([^/]*)$"))
  )
  (if (regexec re path) (progn
      (setq site (regsub re 3))
      (setq curdir (regsub re 1))
      (setq file (regsub re 6))
      (PVD "doing one file:" site curdir file)
      (with (*current-directory* curdir)
	(do-file-or-dir file curdir)
))))

(setq re-~ (regcomp "^([^~]*)~([^~]*)$"))	;detects a single ~ in a path
(setq re-% (regcomp "%"))			;detects % chars (quoted as %25)

;; expand-index: normalize a candidate local path — try the %7E-quoted form
;; of a ~ if that file exists, then, for anchor-less directory links, append
;; index.html or index.htm when present. NB: cond-style multiway if below.
(defun expand-index (path anchor)
  (if (regexec re-~ path) 	;expand ~s in paths when quoted copy exists
    (with (quoted-path (regsub re-~ "\\1%7E\\2"))
      (if (file-stats quoted-path)
	(setq path quoted-path)
  )))
  (if (/= 0 (length anchor))
    path				;anchored link: leave as is
    (= #\/ (get path -1 #\/))		;path already ends with "/" ?
    (if (file-stats (+ path "index.html")) (+ path "index.html")
      (file-stats (+ path "index.htm")) (+ path "index.htm")
      t path
    )
    t					;else try with a "/" inserted
    (if (file-stats (+ path "/index.html")) (+ path "/index.html")
      (file-stats (+ path "/index.htm")) (+ path "/index.htm")
      t path
    )
))

(setq dashes "-------------------------------------")
;; LURL: render a URL struct into a readable object by printing it to an
;; in-memory string stream and reading it back (used in PVD debug dumps).
(defun LURL (url &aux
    (fd (open (copy "") :type :string :direction :io))
  )
  (struct-print url fd)
  (file-position fd 0)
  (read fd)
)

;; remove-port: strip a ":port" suffix from a "host:port" spec; a spec
;; without a colon is returned unchanged.
(setq remove-port:re (regcomp "^([^:]*):"))
(defun remove-port (host-spec)
  (if (regexec remove-port:re host-spec) (regsub remove-port:re 1) host-spec)
)

;;-----------------------------------------------------------------------------
;; mainloop
;; main: discover the mirrored sites, optionally back everything up (-B),
;; run the traversal (or just one file with -1), then print the statistics
;; and warnings. The &aux variables are the run-wide "globals" seen by the
;; rest of the script through dynamic scoping.
(defun main (&aux
    (root *current-directory*)
    (sites (list))
    (external-hosts (list))
    (external-urls (list))
    (external-bases (list))
    (internal-notfound (list))
    (local-notfound-urls (list))
    (nfiles 0)
    (nfilesdone 0)
    current-site			;name of current site
  )
  (if given-sites (setq sites (copy given-sites))
    (dolist (dir (directory)) ;; find sites: every subdir is one
      (if (= 'directory (file-type dir))
	(lappend sites dir)
  )))
  (dolist (site sites)			;declare site as self-aliased
    (put host-aliases site site)
  )
  (PVD host-aliases)
  
  ;; -B: backup all html files first, unless a backup already exists
  (if (and do-backup (not (file-stats "BACKUP.tgz")))
    (wait (system "tar cfz BACKUP.tgz `find . '(' -name '*.[hH][Tt][Mm]' -o -name '*.[hH][Tt][Mm][lL]' ')' -print`"))
  )
  (if only-one-file
    (do-only-one-file only-one-file)
    (dolist (site sites)
      (verbose? "=== treating site: %0" site)
      (with (current-site site) (do-file-or-dir site ""))
  ))
  ;; final report (suppressed by -q), except external-bases which always print
  (when verbose
    (PF "=== DONE =========================================================\n")
    (if list-all-externals 
      (if external-urls (progn
	  (PF list-all-externals-fd "=== STATS: external URLs referred to:\n")
	  (dolist (url (sort external-urls compare))
	    (PF list-all-externals-fd "    %0\n" url)
	  )
	  (if list-all-externals-file
	    (PF "=== External URLs printed to file: %0\n" 
	      list-all-externals-file)
	))
	(PF "=== No external URLs referred to.\n")
      )
      (if external-hosts (progn
	  (PF "=== STATS: external hosts referred to, and times referred to:\n")
	  ;; sort by most usage, then by host name
	  ;; NOTE(review): ehl is setq'd, not &aux — effectively global
	  (setq ehl (list))
	  (dohash (h n external-hosts) (lappend ehl (list h n)))
	  (sort ehl (lambda (x y) (with (res (compare (getn y 1)(getn x 1)))
		(if (= res 0) (compare (getn x 0) (getn y 0)) res)
	  )))
	  (dolist (l ehl)
	    (PF "    %0   %1\n" (getn l 0) (getn l 1))
	))
	(PF "=== No external hosts referred to.\n")
    ))
    (when local-notfound-urls
      (PF list-all-externals-fd "*** WARNING: non mirrored internal URLs:\n")
      (dolist (url (sort local-notfound-urls compare))
	(PF list-all-externals-fd "    %0\n" url)
      )
      (if list-all-externals-file
	(PF "=== Not mirrored internal URLs printed to file: %0\n" 
	  list-all-externals-file)
    ))
    (when internal-notfound
      (PF "*** WARNING: site has local links that are not mirrored\n")
      (dolist (base (sort internal-notfound compare))
	(PF "    %0\n" base)
      )
    )
    (PF "=== STATS: %0 files scanned, %1 modified\n" nfiles nfilesdone)
  )
  (when external-bases 
    (PF "*** WARNING: BASE tags pointing to external hosts found!.
*** You should mirror them too (or declare them as aliases). URLs:\n")
    (dolist (base (sort external-bases compare))
      (PF "    %0\n" base)
    )
  )
)

(main)

;;; EMACS MODES
;;; Local Variables: ***
;;; mode:lisp ***
;;; End: ***

