#!/bin/sh
: ; exec klone $0 "$@"
; The above line allows not to embed the exact path of the klone executable

; script based on "psicvt" by ???

(setq args (getopts "USAGE: %0 [options] PsionSeries5WordFile
Will output the file converted to XML on stdout"
    ("-v" () verbose "verbose operation. lists tags found.")
    ("-r" () output:raw "raw output")
    ("-x" () output:xml "xml output")
    ("-T" () output:text "text output (default)")
    ("-t" xn=name renamed-tags "rename tag xn into name for xml output"
      :multiple t
    )
    ("-o" outfile outfilename "output filename (default: stdout)")
    ("-n" () nulloutput "no output")
    ("-dl" () dump_lumps "dump lumps in filename__lumpnames")
    ("-debug" () enter-debugger-on-error "enter klone debugger on error")
    ("-stackdump" () stackdump-on-error "verbose stack dump on error")
))

(if args 
  (setqn in (open (0 args)) filename (0 args)) 
  (setqn in *standard-input* filename "")
)
(if nulloutput (setq outfilename "/dev/null"))
(if outfilename
  (setq out (open outfilename :direction :output :if-exists :supersede))
  (setq out *standard-output*)
)

(if enter-debugger-on-error (kdb t))
(if stackdump-on-error (stack-dump-on-error t))

;; predefined tokens. uids are the IDs of the rootdir
(setqn MagicNumber "7\x00\x00\x10m\x00\x00\x10"
  worduid "\x7f\x00\x00\x10"
  opluid "\x85\x00\x00\x10"
  skipcount 0				;unused for text, 16 for OPL
  unknown-uid-1 "C\x02\x00\x10"
  unknown-uid-2 "\x04\x01\x00\x10"
  unknown-uid-3 "\x05\x01\x00\x10"
  streamid "\x06\x01\x00\x10"
  par-uid "C\x01\x00\x10"		;I suspect these are the paragraphs
  unknown-uid-4 "\x89\x00\x00\x10"
)
(setq uid-names (list streamid "text" par-uid "paragraphs" 
    unknown-uid-1 "UID1" unknown-uid-2 "UID2" unknown-uid-3 "UID3" 
    unknown-uid-4 "UID4"
))

;; Structure: the text lump is prefixed by its length, which can be coded on 
;; 1/2/3 bytes = 7/14/29 bits
;; In lumps, end of paragraph is ^F, line break is ^G
;; The paragraphs lump can be ommited, and seem to be a collection of:
;; 3 bytes: 01 00 {02|08}
;; 1 word giving the number (starting at 1) of the par, 
;; 1 word at 0
;; 1 word giving the par ID: h1/h2/h3 = ff/fe/fd, normal seem 00, bl fc
;; (seem to be descending order of declaration)

(defun main (&aux
    (buffer (String in))		;gobble up whole file
    rootdir				;plist of ids (strings) and offsets
    lumplengths				;plist of ids and lengths
    text
  )
  (if 
    (/= (subseq buffer 0 8) MagicNumber) ;check magic number
    (fatal-error 1 "ERROR: Input file is not a Psion Series 5 Word file!\n")
    (/= (subseq buffer 8 12) worduid)
    (fatal-error 2 "ERROR: Input file is not a Psion Series 5 Word file!\n")
  )
  (setq rootdir (read-rootdir buffer))

  (setq text (extract-text rootdir filename buffer))
  (setq paragraphs (extract-paragraphs rootdir filename buffer))
  (if dump_lumps (do_dump_lumps rootdir filename buffer))

  (if output:raw (output_raw text out)
    output:xml (output_xml text paragraphs out)
    t (output_wrapped text out)
  )
)

;; We read the "root stream" whose adress is given as bytes 16 to 20
;; (just after the header) in file
;; sets the global lumplengths
(defun read-rootdir (buffer &aux 
    rootadr
    n
    (rootdir (list))			;plist of ids (strings) and offsets
    pos end
  )
  ;; read root directory, ids of file parts
  (setq rootadr (str2word buffer 16)) ;dir position in file
  (verbose? "  at %0, rootdir" rootadr)
  (setq n (get buffer rootadr))
  (setq pos (+ rootadr 1))
  (while (> n 0)
    (put rootdir (subseq buffer pos (+ pos 4)) (str2word buffer (+ pos 4)))
    (incf pos 8)
    (incf n -2)
  )
  ;; compute lump lengths
  (setq lumplengths (list))
  (dohash (id adr rootdir)
    (setq end (length buffer))		;compute end position of lump
    (dohash (id2 adr2 rootdir)
      (if (and (> adr2 adr) (< adr2 end)) (setq end adr2))
    )
    (put lumplengths id (- end adr))
  )
  (if verbose (dohash (id adr rootdir)
      (verbose? "  at %0, lump %2 %r1, %3 bytes" adr id (lump-name id)
	(get lumplengths id)
      )
  ))

  rootdir
)

(defun str (s pos len) (subseq s pos (+ pos len)))

;;=============================================================================
;;                    word encoding/decoding utils, LSB
;;=============================================================================
;; takes next 4 bytes in string at pos, return LSB number
(defun str2word (s pos)
  (+ (get s pos 0) 
    (* 256 (get s (+ pos 1) 0))
    (* 65536 (get s (+ pos 2) 0))
    (* 16777216 (get s (+ pos 3) 0))
))

;; The opposite: take a number and return a 4-chars string
(defun word2str (n &aux (s (copy "")) n2)
  (if (>= n 0) (setq n2 (/ n 256))
    (setq n2 (+ 0x00ffffff (/ n 256)))	;avoid pb with signed numbers
  )
  (String (list
      (mod n 256)
      (mod n2 256)
      (mod (/ n2 256) 256)
      (mod (/ n2 65536) 256)
)))

(defun hexprint (s &aux (res (copy "")))
  (dolist (c (List s)) 
    (nconc res (hexprint-byte c) " ")
  )
  res
)
(setq hexprint-byte-chars '("0""1""2""3""4""5""6""7""8""9""a""b""c""d""e""f"))
(defun hexprint-byte (c)
  (PF String "%0%1" (get hexprint-byte-chars (/ c 16))
    (get hexprint-byte-chars (mod c 16))
))

;;=============================================================================
;;                    lump decoding functions
;;=============================================================================

;; computes paragraphs, returns a list of them
(defstruct Par
  type
  id
  len
)

(defun extract-paragraphs (rootdir filename buffer &aux
    end					;end of lump
    (parids (Hashtable ()))			;plist id / type
    (pars (list))
    par paradr parlen
    pos (textpos 0)
    npars npars-pos id0?
    id type len
  )
  (setq paradr (get rootdir par-uid))
  (if (/= 1 (get buffer paradr))
    (fatal-error 1 "ERROR: PAR lump do not begin with 01 but %d0 ???\n" 
      (get buffer paradr))
  )
  (setq parlen (+ (* 256 (get buffer (+ paradr 1))) (get buffer (+ paradr 2))))
  (verbose? "parlen = %0" parlen)
  (setq pos (+ paradr 3))
  (dotimes (i parlen)
    (put parids 
      (setq id (str2word buffer (+ pos (* i 13))))
      (setq type (str2word buffer (+ 8 pos (* i 13))))
    )
    (verbose? "  Par %0  %1" type id)
  )
  (setq parids (rename-parids parids))
  (setq npars-pos (+ 4 pos (* parlen 13)))
  (verbose? "  Pars end at %0" (itox (- (+ pos (* parlen 13)) paradr)))

  (setq npars (str2word buffer (- npars-pos 4)))
  (verbose? "npars is: %0 = %1"
    (hexprint (subseq buffer (- npars-pos 4) npars-pos)) npars
  )
  (dotimes (i npars)
    (setq id0? ())
    (setq len (str2word buffer npars-pos))
    (setq id (get buffer (+ npars-pos 4)))
    (when (= 0 id) ;; 0 id? we seem to have to skip 6 bytes...
      (setq id0? t)
      (incf npars-pos 6)
      (verbose? "  == Skipping 6 bytes at %0 / %1: %2"
	(- npars-pos 2) (itox (- npars-pos 2))
	(hexprint (subseq buffer (- npars-pos 2) (+ npars-pos 4)))
      )
      (setq id (get buffer (+ npars-pos 4)))
      (if (= 0 id)
	(fatal-error 1 "ERROR: Bad par id at %0 / %1\n" (+ npars-pos 4)
	  (itox (+ npars-pos 4))
	)
    ))
    (setq par (make-Par
	:type (get parids id 
	  '(fatal-error 1 "ERROR: Unknown par id at %0 / %1\n" (+ npars-pos 4)
	    (itox (+ npars-pos 4))
	))
	:id id
	:len len
    ))
    (if verbose 
      (with
	(head (PF String "  par %0 %1(%2): " (get parids id) textpos len))
	(verbose? "%0%1%2" head (make-string (- 23 (length head)))
	  (if (> len 53) (+ (subseq text textpos (+ textpos 50)) "...")
	    (subseq text textpos (+ textpos len -1))
    ))))
    (incf textpos len)
    (incf npars-pos 5)
    (if id0? (incf npars-pos 3))
    (lappend pars par)
  )  
  (verbose? "End of pars at %0" (- (+ npars-pos (* npars 5)) paradr))
  ;; verifications:
  ;; are there still unread data at end of lump?
  (setq pos (- 
      (+ (get rootdir par-uid) (get lumplengths par-uid))
      npars-pos
  ))
  (if (< pos 0)
    (fatal-error 1 "ERROR! Paragraph lump too short by %0 bytes!\n" (- pos))
    (and (= pos 4) (= "\x00\x00\x00\x00" (subseq buffer 
	    (+ npars-pos (* npars 5))
	    (+ (get rootdir par-uid) (get lumplengths par-uid))
    )))
    ()					; 4 0 bytes seem the normal mode
    (> pos 0)
    (verbose? "%0 Unused bytes in paragraph lump %1" pos
      (if (= (make-string pos 0) 
	  (subseq buffer 
	    (+ npars-pos (* npars 5))
	    (+ (get rootdir par-uid) (get lumplengths par-uid))
	))
	"(all zeros)" (hexprint (subseq buffer 
	    (+ npars-pos (* npars 5))
	    (+ (get rootdir par-uid) (get lumplengths par-uid))
    ))))
  )
  ;; check that pars match text divisions. Suppose text is in "text"
  (with (pos 0 partext () re-f (regcomp ".*\x06.*\x06"))
    (dolist (par pars)
      (setq partext (subseq text pos (+ pos (Par-len par))))
      (if (/= #\^f (get partext -1))
	(verbose? "paragraph do not match text division at %0!!!"
	  (+ pos (Par-len par))
	)
	(regexec re-f partext)
	(verbose? "paragraph contains a ^F %0-%1!!!"
	  pos (+ pos (Par-len par))
	)
      )
      (setq pos (+ pos (Par-len par)))
  ))
  pars
)

(defun rename-parids (table &aux
    (newtable (Hashtable ()))
    (aliases (Hashtable ()))
    (re-alias (regcomp "^([^=]+)=(.*)$"))
    newname
  )
  (dohash (id type table)
    (put newtable id
      (if (= type 0) "p"
	(= type 255) "h1"
	(= type 254) "h2"
	(= type 253) "h3"
	(= type 252) "li"
	t (+ "x" (String (- 252 type)))
    ))
  )
  (dolist (rename renamed-tags)
    (if (regexec re-alias rename)
      (put aliases (regsub re-alias 1) (regsub re-alias 2))
      (fatal-error 1 "ERROR: bad tag renaming syntax: %0\n" rename)
  ))
  (dohash (id name newtable)
    (if (setq newname (getn aliases name))
      (put newtable id newname)
  ))
  newtable
)

(defun extract-text (rootdir filename buffer &aux
    end					;end of lump
    textadr
    textlen
    text
  )
   ;; extract character stream
  ; a text lump is 1/2/3 bytes to code the length, and the text itself
  (setq textadr (+ skipcount (get rootdir streamid)))	;adr of charstream part
  (setq textlen (get buffer textadr))
  (incf textadr)
  (if 
    (= 0 (logand textlen 1))		;7 bit length
    (setq textlen (/ textlen 2))
    (= 0 (logand textlen 2))		;14 bit length
    (with (adr textadr)
      (incf textadr)
      (setq textlen	(+ (/ textlen 4) (logshift (get buffer adr) 6)))
    )
    (= 0  (logand textlen 4))		; 29 bit length 
    (with (adr textadr)
      (incf textadr 3)
      (setq textlen (+ (/ textlen 8) (logshift (get buffer adr) 5)
	  (logshift (get buffer (+ adr 1)) 13)
      ))
    )
    t
    (fatal-error 3 "ERROR: Cannot decode text length!\n")
  )
  (verbose? "  Text is %0 bytes at %1" textlen textadr)
  (setq text (subseq buffer textadr (+ textadr textlen)))
  text
)

;;=============================================================================
;;                    Output functions
;;=============================================================================

;;raw one: do not wrap, only one newline between pars
(defun output_raw (text fd)
  ;; replaces ^F by newline (end of paragraph)
  (with (pos 0) (while (setq pos (seek text 6)) (put text pos #\newline)))
  ;; replaces ^G by newline (line break)
  (with (pos 0) (while (setq pos (seek text 7)) (put text pos #\newline)))

  ;; remove the extra newline at the end, or add one if none is there
  (if (= "\n\n" (subseq text (- (length text) 2)))
    (setq text (subseq text 0 (- (length text) 1)))
    (/= #\newline (get text -1))
    (nconc text "\n")
  )
  (write-string text fd)
)

(setq re-nonempty (regcomp "[^ \x07\t\r\n]"))
(setq re-& (regcomp "&"))
(setq re-< (regcomp "<"))

;; standard one: wrapped text, paragraphs separated by 2 newlines
(defun output_wrapped (text fd &aux printed?)
  ;; replaces ^G by newline (line break)
  (with (pos 0) (while (setq pos (seek text 7)) (put text pos #\newline)))
  ;; then wraps each paragraph, skip empty ones
  (with (pos 0 par ())
    (while (setq end (seek text 6 pos))
      (setq par (subseq text pos end))
      (when (regexec re-nonempty par)
	(if printed? (PF fd "\n"))
	(setq printed? t)
	(wrap-paragraph par out)
      )
      (setq pos (+ end 1))		;skip par separator (^F)
  ))
)

;; wraps words, but obey newlines
(defun wrap-paragraph (par fd &aux
    (pos 0)
    (nlpos 0)
    (*standard-output* fd)
    subpar
  )
  (while (setq nlpos (seek par #\newline pos))
    (setq subpar (subseq par pos nlpos))
    (when (regexec re-nonempty subpar)
      (print-margin-words subpar ["" "" "" "" " " "-" ""])
    )
    (setq pos (+ nlpos 1))		;skip newline
  )
  (setq subpar (subseq par pos))
  (when (regexec re-nonempty subpar)
    (print-margin-words subpar ["" "" "" "" " " "-" ""])
  )  
)

(defun output_xml (text pars fd &aux 
    printed? tag
    (tag_occurences (Hashtable ()))
    in-ul?
  )
  ;; replaces ^G by newline (line break)
  (with (pos 0) (while (setq pos (seek text 7)) (put text pos #\newline)))
  ;; replaces ^P by space?
  (with (pos 0) (while (setq pos (seek text #\^p )) (put text pos #\space)))

  ;; then wraps each paragraph, skip empty ones
  (with (pos 0 partext ())
    (dolist (par pars)
      (setq partext (subseq text pos (+ pos -1 (Par-len par))))
      (when (/= "" partext)
	(if printed? (PF fd "\n"))
	(setq printed? t)
	(replace-string partext re-& "&amp;" :all t :quote t)
	(replace-string partext re-< "&lt;" :all t :quote t)
	(setq tag (Par-type par))
	(put tag_occurences tag (+ 1 (get tag_occurences tag 0)))
	(if in-ul?
	  (if (/= tag "li") (progn (PF out "</ul>\n") (setq in-ul? ())))
	  (if (= tag "li") (progn (PF out "<ul>\n") (setq in-ul? t)))
	)
	(wrap-paragraph (+ "<" tag ">" partext "</" tag ">") out)
      )
      (incf pos (Par-len par))		;skip par separator (^F)
  ))

  (if verbose
    (dohash (name count tag_occurences)
      (verbose? "  %0 occured %1 times" name count)
  ))
)

;;=============================================================================
;;                    debug functions
;;=============================================================================
(defun lump-name (id) (get uid-names id "???"))

(defun do_dump_lumps (rootdir filename buffer &aux
    end					;end of lump
  )
  (dohash (id adr rootdir)
    (setq end (length buffer))		;compute end position of lump
    (dohash (id2 adr2 rootdir)
      (if (and (> adr2 adr) (< adr2 end)) (setq end adr2))
    )
    (with (fd (open (+ filename "__" (lump-name id)) :direction :output
	  :if-exists :supersede
      ))
      (write-string (subseq buffer adr end) fd)
      (close fd)
      (PF "  lump %0, length %1 (from offset at %2)\n" 
	(+ filename "__" (lump-name id)) (- end adr) adr
      )
    )	
  )  
)

(main)

;;; EMACS MODES
;;; Local Variables: ***
;;; mode:lisp ***
;;; End: ***

