;;; -*- Base: 10; Syntax: Common-Lisp; Package: cmi; Patch-File: Yes; -*- 

;;;
;;; Patch to lengthen miscellaneous cycles for cm200 ucode.
;;; This doesn't change the location of any ui's.
;;;

(in-package 'cmi)

;;; GEN-READ-STACK-MACROINST to-reg
;;;
;;; Expands into a DEF-MIN-MIC-INTERNAL form for a routine named
;;; IMP-READ-STACK-<to-reg> (interned in cmi-package) that takes one
;;; argument, OFFSET.  TO-REG is the register that receives the word read
;;; from scratch ram.  The generated body is four ui's, each introduced by
;;; the string that precedes it: save the scratch-ram address in Q, load
;;; the counter with %IMP-SP minus the current scratch-ram word, copy the
;;; addressed scratch-ram word into TO-REG, then reload the counter with
;;; Q + 1 and return inline.
(defucmacro gen-read-stack-macroinst (to-reg)
  `(def-min-mic-internal ,(intern (format nil "IMP-READ-STACK-~a" (symbol-name to-reg))
				  cmi-package) (offset) NIL
     ((min-mic-declare
	(:DISP-ENTRY nil)
	(:IMP-INLINE-ENTRY t :NO-SAVE-DISPATCH :NO-XFER)
	(:END-RETURN nil))
      "Save counter in Q, load counter with stack-top - offset"
      ;; This is the ui lengthened by the patch: the 5-tick clock is put
      ;; here so that the *following* ui's sram -> alu -> counter path
      ;; has enough time.
      (ui (alu-simp scratch-ram-addr (Q $EXP))
	  (clk-cntl 5-tick))		; Extend CM part of this instr to fake out
					; extending sequencer part of next instr.
					; Required for cm200 because
					; sram -> alu -> counter is too long for 4 ticks.
      (ui (alu-simp (- %IMP-SP SCRATCH-RAM) (Y $EXP)) (YBUS-DEST-SEL COUNTER))
      "Read stack value into register"
      (ui (alu-simp scratch-ram (,to-reg $EXP)))
      "write saved counter back (advanced past arg), return"
      (ui (alu-simp (+ Q 1) (Y $EXP))
	  (YBUS-DEST-SEL COUNTER)
	  (imp$-macroinst-return-inline)) )))

;;; Container definition (no entry spec, no dispatch entry, no end-return)
;;; whose whole body is GEN-ALL-READ-STACK-MACROINST.  That form is defined
;;; elsewhere; presumably it expands GEN-READ-STACK-MACROINST once per
;;; target register -- confirm against its definition.
(def-imp-min-mic read-stack-macroinst ()
  (min-mic-declare (:ENTRY-SPEC nil) (:DISP-ENTRY nil) (:END-RETURN nil))
  (gen-all-read-stack-macroinst))

;;; Same shape as READ-STACK-MACROINST, but the body is
;;; GEN-ALL-CMF-READ-STACK-MACROINST (defined elsewhere; presumably the
;;; CMF register set -- TODO confirm).
(def-imp-min-mic cmf-read-stack-macroinst ()
  (min-mic-declare (:ENTRY-SPEC nil) (:DISP-ENTRY nil) (:END-RETURN nil))
  (gen-all-cmf-read-stack-macroinst))


;;; GEN-WRITE-STACK-MACROINST to-reg
;;;
;;; Expands into a DEF-MIN-MIC-INTERNAL form for a routine named
;;; IMP-WRITE-STACK-<to-reg> (interned in cmi-package) that takes one
;;; argument, OFFSET.  Despite the parameter name, TO-REG is the *source*
;;; here: its value is driven onto Y and written to scratch ram.  The
;;; generated body is four ui's: save the scratch-ram address in Q, load
;;; the counter with %IMP-SP minus the current scratch-ram word, write
;;; TO-REG to the addressed scratch-ram word, then reload the counter with
;;; Q + 1 and return inline.
(defucmacro gen-write-stack-macroinst (to-reg)
  `(def-min-mic-internal ,(intern (format nil "IMP-WRITE-STACK-~a" (symbol-name to-reg))
				  cmi-package) (offset) NIL
     ((min-mic-declare
	(:DISP-ENTRY nil)
	(:IMP-INLINE-ENTRY t :NO-SAVE-DISPATCH :NO-XFER)
	(:END-RETURN nil))
      "Save counter in Q, load counter with stack-top - offset"
      ;; This is the ui lengthened by the patch: the 5-tick clock is put
      ;; here so that the *following* ui's sram -> alu -> counter path
      ;; has enough time.
      (ui (alu-simp scratch-ram-addr (Q $EXP))
	  (clk-cntl 5-tick))		; Extend CM part of this instr to fake out
					; extending sequencer part of next instr.
					; Required for cm200 because
					; sram -> alu -> counter is too long for 4 ticks.
      (ui (alu-simp (- %IMP-SP SCRATCH-RAM) (Y $EXP)) (YBUS-DEST-SEL COUNTER))
      "Write register value into stack"
      (ui (alu-simp ,to-reg (Y $EXP))
	  (SRAM-W WRITE))
      "write saved counter back (advanced past offset arg), return"
      (ui (alu-simp (+ Q 1) (Y $EXP))
	  (YBUS-DEST-SEL COUNTER)
	  (imp$-macroinst-return-inline)))))

;;; Container definition (no entry spec, no dispatch entry, no end-return)
;;; whose whole body is GEN-ALL-WRITE-STACK-MACROINST.  That form is
;;; defined elsewhere; presumably it expands GEN-WRITE-STACK-MACROINST once
;;; per register -- confirm against its definition.
(def-imp-min-mic write-stack-macroinst ()
  (min-mic-declare (:ENTRY-SPEC nil) (:DISP-ENTRY nil) (:END-RETURN nil))
  (gen-all-write-stack-macroinst))

;;; Same shape as WRITE-STACK-MACROINST, but the body is
;;; GEN-ALL-CMF-WRITE-STACK-MACROINST (defined elsewhere; presumably the
;;; CMF register set -- TODO confirm).
(def-imp-min-mic cmf-write-stack-macroinst ()
  (min-mic-declare (:ENTRY-SPEC nil) (:DISP-ENTRY nil) (:END-RETURN nil))
  (gen-all-cmf-write-stack-macroinst))

;;; DEF-PHYSICAL-NEWS name &key conditional
;;;
;;; Expands into a DEF-NEWS-SCAN-MIN-MIC-CALLABLE form that defines NAME
;;; with the arguments (dest source length arg-instr-mask).  CONDITIONAL
;;; only selects the LOADB emitted before the transfer loop: when true the
;;; condition flag is the context flag, otherwise it is the inverted zero
;;; flag.  The generated routine returns at once when LENGTH is zero;
;;; otherwise it sends the first bit on its own to fill the pipe, then
;;; loops storing the previous bit while sending the current one, and
;;; finally stores the last bit and restores the scratch-ram address that
;;; was saved on entry.
(defmacro def-physical-news (name &key (conditional nil))
  `(def-news-scan-min-mic-callable ,name (dest source length arg-instr-mask)
     (min-mic-declare (:disp-entry nil)
		      (:end-return nil))

     ;; Expects the Q register to hold the on chip mask.  This is not usually
     ;; called directly but rather through news-dispatch-on-mask.  The caller
     ;; must ensure that the register usage matches the argument list.

     (ucblock ((temp-length '%scratch-reg1%)
	       (off-chip-vector '%scratch-reg2%))

       ;; Save the current scratch ram address. (temporarily use
       ;; temp-length).  Also test for 0 length.
       (ui (alu-simp length (temp-length scratch-ram-addr)))
       (ui (seq crtn addr-zero))

       ;; There are three arrays in scratch ram which give the sequence of
       ;; processors going off chip for any given on chip mask.  The
       ;; individual sequences are glommed together in one big array,
       ;; *news-off-chip-processors-base* (which happens to have 81
       ;; elements).  The two other arrays have 16 elements (one for each
       ;; possible mask).  The array *news-off-chip-processors-length*
       ;; gives for each mask the number of processors going off chip.
       ;; The array *news-off-chip-processors-index* gives the starting
       ;; position in *news-off-chip-processors-base* for each mask.

       ;; Put (number of processors going off chip - 1) into the dispatch register.
       ;; The first ui points the counter at the length entry for the mask
       ;; in Q; it carries the 5-tick clock added by this patch on behalf
       ;; of the ui that follows it.
       (ui (alu-simp (+ Q (constant
			    (eval
			      (uc-scratch-address
				(uc-scratch-block-struct '*news-off-chip-processors-length*))))))
	   (ybus-dest-sel counter) (ybus-src-sel alu)
	   (clk-cntl 5-tick))		; Extend CM part of this instr to fake out
					; extending sequencer part of next instr.
					; Required for cm200 because
					; sram -> alu -> dispatch is too long for 4 ticks.
       (ui (alu-simp (- scratch-ram 1) (y $exp))
	   (ybus-dest-sel dispatch))

       ;; Put the scratch ram address of the vector of processors going off
       ;; chip into off-chip-vector (and point scratch-ram there).
       (ui (alu-simp (+ Q (constant
			    (eval
			      (uc-scratch-address
				(uc-scratch-block-struct '*news-off-chip-processors-index*))))))
	   (ybus-dest-sel counter) (ybus-src-sel alu))
       
       ;; off-chip-vector <- *sram; Q <- temp-length (which is saved-sram-address).
       (ui (alu-simp temp-length (Q $exp) (off-chip-vector scratch-ram)))
       (ui (alu-simp (+ off-chip-vector
			(constant
			  (eval
			    (uc-scratch-address
			      (uc-scratch-block-struct '*news-off-chip-processors-base*)))))
		     (off-chip-vector $exp))
	   (ybus-dest-sel counter))

       ;; Shuffle the saved sram address (currently in temp-length) into Q
       ;; and put length into temp-length.
       ;;(ui (alu-simp temp-length (Q $exp)))   ; this is done above
       (ui (alu-simp length (temp-length $exp)))

       ;; Set up alu-sum function and conditionalization.
       ;; This is the only place the CONDITIONAL macro argument is used.
       (ui ,(if conditional
		`(mop::loadb :b :condition-flag :context-flag)
		`(mop::loadb :b :condition-flag :zero-flag :condition-invert t)))

       ;; The first bit is sent separately to get the pipe going.
       (ucblock-ns

	 ;; Load the first bit into the R and Cube-In latches.
	 (ui (mop-maddr (source ++))
	     (mop::rug-news))

	 ;; Send the bits.
	 (paris-news-inner-loop dest arg-instr-mask off-chip-vector)

	 ;; Decrement the length and reset scratch-ram.  Write good ECC to
	 ;; the current maddr (dest).
	 (ui (alu-simp (- temp-length 1) (temp-length $exp) (y off-chip-vector))
	     (ybus-dest-sel counter)
	     (mop::store)
	     (cube-cntl sendrodd))

	 ;; Latch the first bit from Cube-In.  It sits in the B latch
	 ;; until it is finally written out to memory in the store
	 ;; instruction after the inner loop.
	 ;; If the decrement above reached zero (length was 1), skip the
	 ;; main loop entirely.
	 (ui (mop::loada :f :bsel 1)
	     (jump news-loop-end addr-zero)))

       (label news-loop-start)

       ;; The main loop.  In each iteration the data from the previous iteration
       ;; is stored to memory and the data for the current source is
       ;; transmitted.
       (ucblock-ns

	 ;; Load the current bit into the R and Cube-In latches.
	 (ui (mop-maddr (source ++))
	     (mop::rug-news))

	 ;; Send the current bit.
	 (paris-news-inner-loop dest arg-instr-mask off-chip-vector)

	 ;; Decrement the length and reset scratch-ram.  Store the previous
	 ;; bit to memory.
	 (ui (alu-simp (- temp-length 1) (temp-length $exp) (y off-chip-vector))
	     (ybus-dest-sel counter)
	     (mop::store)
	     (cube-cntl sendrodd))

	 ;; Latch the current bit from Cube-In.  It sits in the B latch
	 ;; until it is finally written out to memory in the store
	 ;; instruction after the inner loop.
	 ;; dest is also advanced here; the loop repeats while the
	 ;; decremented length is non-zero.
	 (ui (alu-simp (+ dest 1) (dest $exp))
	     (mop::loada :f :bsel 1)
	     (jump news-loop-start not-addr-zero)))

       (label news-loop-end)

       ;; Latch and store the last bit.
       ;; Q (the scratch-ram address saved on entry) is put back into the
       ;; counter at the same time.
       (ui (mop::loada :f :bsel 1)
	   (alu-simp Q)
	   (ybus-dest-sel counter))
       (ui (mop-maddr (dest ++))
	   (mop::store)
	   (seq crtn t)))))

;;; Instantiate both variants.  The first conditionalizes on the context
;;; flag; the second uses the inverted zero flag as its condition flag.
(def-physical-news physical-news-internal :conditional t)
(def-physical-news physical-news-always-internal :conditional nil)


;;; GEN-CMIS-WRITE-CHIP-SLICE-%PX-%AX preg areg
;;;
;;; Expands into a DEF-MIN-MIC-INTERNAL form for a no-argument routine
;;; named CMIS-WRITE-CHIP-SLICE-<preg>-<areg> (interned in cmi-package).
;;; AREG supplies the data word, which is staged in the current scratch-ram
;;; word; PREG supplies the value latched into maddr.  The chip number is
;;; taken from %P4, doubled, and offset by the scratch-ram entry
;;; *UC-PHYSICAL-NUMBER-OF-CHIPS-OFFSET* before being loaded into chip-sel.
;;; Two loadi/store pairs are issued, with chip-sel incremented between
;;; them.  The scratch-ram word overwritten at the start is saved in Q and
;;; written back by the last ui.
(defucmacro gen-cmis-write-chip-slice-%px-%ax (preg areg)
  `(def-min-mic-internal ,(intern (format nil "CMIS-WRITE-CHIP-SLICE-~a-~a"
					  (symbol-name preg) (symbol-name areg))
				  cmi-package)
		     ()
     NIL	;; (cmis-config :BETA)
     ((min-mic-declare
	(:DISP-ENTRY nil)
	(:IMP-INLINE-ENTRY t :NO-SAVE-DISPATCH))
       						
      "Save current sram value in Q"
      (ui (alu-simp scratch-ram (q $exp)))
      "Write data to sram"
      (ui (alu-simp ,areg (Y $EXP))
	  (SRAM-W WRITE))
      "Set %IMP-TEMP0 and chip-sel register to chip number * 2 (beta chip number)"
      (ui (alu-simp %P4 (%IMP-TEMP0 (2* $EXP))))

      ;; Point the counter at the chips-offset entry, add that entry to
      ;; %IMP-TEMP0 and load the sum into chip-sel.  WITH-SAVED-SRAM-COUNTER
      ;; is defined elsewhere; presumably it saves and restores the counter
      ;; around these two ui's -- TODO confirm.
      (with-saved-sram-counter
	(UI (ALU-SIMP
	      (CONSTANT
		(EVAL (UC-SCRATCH-ADDRESS (UC-SCRATCH-ENTRY-STRUCT '*UC-PHYSICAL-NUMBER-OF-CHIPS-OFFSET*))))
	      (Y $EXP))
	    (YBUS-DEST-SEL COUNTER)
	    (YBUS-SRC-SEL ALU)
	    (clk-cntl 5-tick))		; Extend CM part of this instr to fake out
					; extending sequencer part of next instr.
					; Required for cm200 because
					; sram -> alu -> chip-sel is too long for 4 ticks.
	(ui (alu-simp (+ %IMP-TEMP0 scratch-ram) (%imp-temp0 $exp))
	    (YBUS-DEST-SEL CHIP-SEL)))

      ;; NOTE(review): the trailing comments on the two loadi ui's below
      ;; say the data comes "FROM THE IFIFO", but the D bus source selected
      ;; is SCRATCH-RAM / SCRATCH-RAM-SWAP, i.e. presumably the data word
      ;; staged in scratch ram above.  The comments look inherited from
      ;; another routine -- confirm.
      "Do a loadi-store of low 16 bits to first beta chip and latch maddr"
      (ui (DBUS-SRC-SEL SCRATCH-RAM)		;GET THE LOW 16 BITS OF THE CONSTANT FROM THE IFIFO
	  (YBUS-SRC-SEL BYPASS)			;LOAD THAT INTO LOW 2 BYTES OF BYPASS REGISTER
	  (ARG-INSTR T)				;FLAG = Y<3:0>,  ALU = Y<11:4>, PMODE = Y<15:12>
	  (mop::loadi :data 0)
	  (alu-simp ,preg) (LD-MADDR MADDR))
      "INC CHIP-SEL REGISTER"
      (ui (alu-simp (+ %IMP-TEMP0 1) (Y $EXP))
	  (YBUS-DEST-SEL CHIP-SEL)
	  (mop::store))

      "DO A LOADI-STORE FOR THE SECOND BETA CHIP"
      (ui (DBUS-SRC-SEL SCRATCH-RAM-SWAP)	;GET THE HIGH 16 BITS OF THE CONSTANT FROM THE IFIFO
	  (YBUS-SRC-SEL BYPASS)			;LOAD THAT INTO LOW 2 BYTES OF BYPASS REGISTER
	  (ARG-INSTR 1)				;FLAG = Y<3:0>,  ALU = Y<11:4>, PMODE = Y<15:12>
	  (mop::loadi :data 0))
      "Restore sram value from Q"
      (ui (mop::store)
	  (alu-simp q)
	  (sram-w write)))))

;;; Container definition for the :BETA configuration whose body is
;;; GEN-ALL-CMIS-WRITE-CHIP-SLICE-%PX-%AX (defined elsewhere; presumably
;;; one GEN-CMIS-WRITE-CHIP-SLICE-%PX-%AX expansion per register pair --
;;; TODO confirm).
(def-cmis-min-mic cmis-write-chip-slice-%px-%ax ()
  (cmis-config :BETA)
  (min-mic-declare (:ENTRY-SPEC nil) (:DISP-ENTRY nil) (:END-RETURN nil))
  (gen-all-cmis-write-chip-slice-%px-%ax))

;;; GEN-CMIS-WRITE-CHIP-SLICE-DIRECT-%PX-%AX preg areg
;;;
;;; Expands into a DEF-MIN-MIC-INTERNAL form for a no-argument routine
;;; named CMIS-WRITE-CHIP-SLICE-DIRECT-<preg>-<areg> (interned in
;;; cmi-package).  The ui sequence matches GEN-CMIS-WRITE-CHIP-SLICE-%PX-%AX
;;; except that the chip offset is read from the scratch-ram entry
;;; CHIP-OFFSET-FOR-THIS-UC rather than
;;; *UC-PHYSICAL-NUMBER-OF-CHIPS-OFFSET*.  AREG supplies the data word
;;; staged in scratch ram; PREG supplies the value latched into maddr.
(defucmacro gen-cmis-write-chip-slice-direct-%px-%ax (preg areg)
  `(def-min-mic-internal ,(intern (format nil "CMIS-WRITE-CHIP-SLICE-DIRECT-~a-~a"
					  (symbol-name preg) (symbol-name areg))
				  cmi-package)
		     ()
     NIL	;; (cmis-config :BETA)
     ((min-mic-declare
	(:DISP-ENTRY nil)
	(:IMP-INLINE-ENTRY t :NO-SAVE-DISPATCH))
       						
      "Save current sram value in Q"
      (ui (alu-simp scratch-ram (q $exp)))
      "Write data to sram"
      (ui (alu-simp ,areg (Y $EXP))
	  (SRAM-W WRITE))
      "Set %IMP-TEMP0 and chip-sel register to chip number * 2 (beta chip number)"
      (ui (alu-simp %P4 (%IMP-TEMP0 (2* $EXP))))

      ;; Point the counter at the chip-offset entry, add that entry to
      ;; %IMP-TEMP0 and load the sum into chip-sel.  WITH-SAVED-SRAM-COUNTER
      ;; is defined elsewhere; presumably it saves and restores the counter
      ;; around these two ui's -- TODO confirm.
      (with-saved-sram-counter
	(UI (ALU-SIMP
	      (CONSTANT
		(EVAL (UC-SCRATCH-ADDRESS (UC-SCRATCH-ENTRY-STRUCT 'chip-offset-for-this-uc))))
	      (Y $EXP))
	    (YBUS-DEST-SEL COUNTER)
	    (YBUS-SRC-SEL ALU)
	    (clk-cntl 5-tick))		; Extend CM part of this instr to fake out
					; extending sequencer part of next instr.
					; Required for cm200 because
					; sram -> alu -> chip-sel is too long for 4 ticks.
	(ui (alu-simp (+ %IMP-TEMP0 scratch-ram) (%imp-temp0 $exp))
	    (YBUS-DEST-SEL CHIP-SEL)))

      ;; NOTE(review): the trailing comments on the two loadi ui's below
      ;; say the data comes "FROM THE IFIFO", but the D bus source selected
      ;; is SCRATCH-RAM / SCRATCH-RAM-SWAP, i.e. presumably the data word
      ;; staged in scratch ram above.  The comments look inherited from
      ;; another routine -- confirm.
      "Do a loadi-store of low 16 bits to first beta chip and latch maddr"
      (ui (DBUS-SRC-SEL SCRATCH-RAM)		;GET THE LOW 16 BITS OF THE CONSTANT FROM THE IFIFO
	  (YBUS-SRC-SEL BYPASS)			;LOAD THAT INTO LOW 2 BYTES OF BYPASS REGISTER
	  (ARG-INSTR T)				;FLAG = Y<3:0>,  ALU = Y<11:4>, PMODE = Y<15:12>
	  (mop::loadi :data 0)
	  (alu-simp ,preg) (LD-MADDR MADDR))
      "INC CHIP-SEL REGISTER"
      (ui (alu-simp (+ %IMP-TEMP0 1) (Y $EXP))
	  (YBUS-DEST-SEL CHIP-SEL)
	  (mop::store))

      "DO A LOADI-STORE FOR THE SECOND BETA CHIP"
      (ui (DBUS-SRC-SEL SCRATCH-RAM-SWAP)	;GET THE HIGH 16 BITS OF THE CONSTANT FROM THE IFIFO
	  (YBUS-SRC-SEL BYPASS)			;LOAD THAT INTO LOW 2 BYTES OF BYPASS REGISTER
	  (ARG-INSTR 1)				;FLAG = Y<3:0>,  ALU = Y<11:4>, PMODE = Y<15:12>
	  (mop::loadi :data 0))
      "Restore sram value from Q"
      (ui (mop::store)
	  (alu-simp q)
	  (sram-w write)))))

;;; Container definition for the :BETA configuration whose body is
;;; GEN-ALL-CMIS-WRITE-CHIP-SLICE-DIRECT-%PX-%AX (defined elsewhere;
;;; presumably one expansion per register pair -- TODO confirm).
(def-cmis-min-mic cmis-write-chip-slice-direct-%px-%ax ()
  (cmis-config :BETA)
  (min-mic-declare (:ENTRY-SPEC nil) (:DISP-ENTRY nil) (:END-RETURN nil))
  (gen-all-cmis-write-chip-slice-direct-%px-%ax))


;;; GEN-CMIS-READ-CHIP-SLICE-%PX-%AX preg areg
;;;
;;; Expands into a DEF-MIN-MIC-INTERNAL form for a no-argument routine
;;; named CMIS-READ-CHIP-SLICE-<preg>-<areg> (interned in cmi-package).
;;; PREG supplies the value latched into maddr; AREG receives the result.
;;; The chip number comes from %P4, is doubled, and is offset by the
;;; scratch-ram entry *UC-PHYSICAL-NUMBER-OF-CHIPS-OFFSET*.  Two reads are
;;; issued (chip-sel is bumped between them) and combined into AREG from
;;; RDATA and RDATA-SWAP.  The value is then passed through SR with the
;;; global line sourcing SR for a 31-count repeat loop, so that (per the
;;; inline strings) every sequencer ends up with the same value in AREG.
(defucmacro gen-cmis-read-chip-slice-%px-%ax (preg areg)
  `(def-min-mic-internal ,(intern (format nil "CMIS-READ-CHIP-SLICE-~a-~a"
					  (symbol-name preg) (symbol-name areg))
				  cmi-package)
		()
     NIL	;; (cmis-config :BETA)
     ((min-mic-declare
	(:DISP-ENTRY nil)
	(:IMP-INLINE-ENTRY t :NO-SAVE-DISPATCH))

      "Set %IMP-TEMP0 and chip-sel register to chip number * 2 (beta chip number)"
      (ui (alu-simp %P4 (%IMP-TEMP0 (2* $EXP))))
      ;; Point the counter at the chips-offset entry, add that entry to
      ;; %IMP-TEMP0 and load the sum into chip-sel.  WITH-SAVED-SRAM-COUNTER
      ;; is defined elsewhere; presumably it saves and restores the counter
      ;; around these two ui's -- TODO confirm.
      (with-saved-sram-counter
	(UI (ALU-SIMP
	      (CONSTANT
		(EVAL (UC-SCRATCH-ADDRESS (UC-SCRATCH-ENTRY-STRUCT '*UC-PHYSICAL-NUMBER-OF-CHIPS-OFFSET*))))
	      (Y $EXP))
	    (YBUS-DEST-SEL COUNTER)
	    (YBUS-SRC-SEL ALU)
	    (clk-cntl 5-tick))		; Extend CM part of this instr to fake out
					; extending sequencer part of next instr.
					; Required for cm200 because
					; sram -> alu -> chip-sel is too long for 4 ticks.
	(ui (alu-simp (+ %IMP-TEMP0 scratch-ram) (%imp-temp0 $exp))
	    (YBUS-DEST-SEL CHIP-SEL)))
      "Set address and READ"
      (ui (alu-simp ,preg (Y $EXP)) (LD-MADDR MADDR)
	  (CM-OP-SF READ))
      "Bump chip sel"
      (ui (alu-simp (+ %IMP-TEMP0 1) (Y $EXP))
	  (YBUS-DEST-SEL CHIP-SEL))
      "Start next read"
      (ui (CM-OP-SF READ))
      "write the low 16 into %An"
      (ui (alu-simp RDATA (,areg $EXP)))
      "NOP for readback path"
      (ui)
      "transfer the high 16 bits to the top of %An"
      (ui (alu-simp (or ,areg RDATA-SWAP) (,areg $EXP)))

      "At this point %An in the selected sequencer contains the correct value while
      all other sequencers have all 1's."

     "Load complement of %An into SR.  Selected sequencer gets complement of value;
     others get 0.  Set the global line to source SR."
     (ui (alu-simp (not (xor ,areg zero)))
	 (ybus-dest-sel sr)
	 (seq ldct 31)
	 (enc-fld-b global-out-sr))
     "Do a global or on each bit of the SR."
     ;; Single-ui loop: shift SR and repeat on the count loaded by the
     ;; `seq ldct 31' above.
     (ucblock-ns
       (label loop-top)
       (ui (enc-fld-b shift)
	   (seq rpct loop-top)))
     "Now all sequencers have complement of value in SR.  Load complement of SR in to
     %An and reset the global line to source the CM global line."
     (ui (alu-simp (not (xor sr zero)) (,areg $exp))
	 (enc-fld-b global-out-cm)))))

;;; Container definition for the :BETA configuration whose body is
;;; GEN-ALL-CMIS-READ-CHIP-SLICE-%PX-%AX (defined elsewhere; presumably
;;; one GEN-CMIS-READ-CHIP-SLICE-%PX-%AX expansion per register pair --
;;; TODO confirm).
(def-cmis-min-mic cmis-read-chip-slice-%px-%ax ()
  (cmis-config :BETA)
  (min-mic-declare (:ENTRY-SPEC nil) (:DISP-ENTRY nil) (:END-RETURN nil))
  (gen-all-cmis-read-chip-slice-%px-%ax))

;;; GEN-CMIS-READ-CHIP-SLICE-DIRECT-%PX-%AX preg areg
;;;
;;; Expands into a DEF-MIN-MIC-INTERNAL form for a no-argument routine
;;; named CMIS-READ-CHIP-SLICE-DIRECT-<preg>-<areg> (interned in
;;; cmi-package).  PREG supplies the value latched into maddr; AREG
;;; receives the result, assembled from RDATA and RDATA-SWAP over two
;;; reads.  Compared with GEN-CMIS-READ-CHIP-SLICE-%PX-%AX, the chip offset
;;; comes from the scratch-ram entry CHIP-OFFSET-FOR-THIS-UC, and there is
;;; no SR / global-line pass after the reads.
(defucmacro gen-cmis-read-chip-slice-direct-%px-%ax (preg areg)
  `(def-min-mic-internal ,(intern (format nil "CMIS-READ-CHIP-SLICE-DIRECT-~a-~a"
					  (symbol-name preg) (symbol-name areg))
				  cmi-package)
		()
     NIL	;; (cmis-config :BETA)
     ((min-mic-declare
	(:DISP-ENTRY nil)
	(:IMP-INLINE-ENTRY t :NO-SAVE-DISPATCH))

      "Set %IMP-TEMP0 and chip-sel register to chip number * 2 (beta chip number)"
      (ui (alu-simp %P4 (%IMP-TEMP0 (2* $EXP))))
      ;; Point the counter at the chip-offset entry, add that entry to
      ;; %IMP-TEMP0 and load the sum into chip-sel.  WITH-SAVED-SRAM-COUNTER
      ;; is defined elsewhere; presumably it saves and restores the counter
      ;; around these two ui's -- TODO confirm.
      (with-saved-sram-counter
	(UI (ALU-SIMP
	      (CONSTANT
		(EVAL (UC-SCRATCH-ADDRESS (UC-SCRATCH-ENTRY-STRUCT 'chip-offset-for-this-uc))))
	      (Y $EXP))
	    (YBUS-DEST-SEL COUNTER)
	    (YBUS-SRC-SEL ALU)
	    (clk-cntl 5-tick))		; Extend CM part of this instr to fake out
					; extending sequencer part of next instr.
					; Required for cm200 because
					; sram -> alu -> chip-sel is too long for 4 ticks.
	(ui (alu-simp (+ %IMP-TEMP0 scratch-ram) (%imp-temp0 $exp))
	    (YBUS-DEST-SEL CHIP-SEL)))
      "Set address and READ"
      (ui (alu-simp ,preg (Y $EXP)) (LD-MADDR MADDR)
	  (CM-OP-SF READ))
      "Bump chip sel"
      (ui (alu-simp (+ %IMP-TEMP0 1) (Y $EXP))
	  (YBUS-DEST-SEL CHIP-SEL))
      "Start next read"
      (ui (CM-OP-SF READ))
      "write the low 16 into %An"
      (ui (alu-simp RDATA (,areg $EXP)))
      "NOP for readback path"
      (ui)
      "transfer the high 16 bits to the top of %An"
      (ui (alu-simp (or ,areg RDATA-SWAP) (,areg $EXP)))
      )))

;;; Container definition for the :BETA configuration whose body is
;;; GEN-ALL-CMIS-READ-CHIP-SLICE-DIRECT-%PX-%AX (defined elsewhere;
;;; presumably one expansion per register pair -- TODO confirm).
(def-cmis-min-mic cmis-read-chip-slice-direct-%px-%ax ()
  (cmis-config :BETA)
  (min-mic-declare (:ENTRY-SPEC nil) (:DISP-ENTRY nil) (:END-RETURN nil))
  (gen-all-cmis-read-chip-slice-direct-%px-%ax))


;;; GEN-CMIS-SEND-CHIP-SLICE-%PX preg
;;;
;;; Expands into a DEF-MIN-MIC-INTERNAL form for a no-argument routine
;;; named CMIS-SEND-CHIP-SLICE-<preg> (interned in cmi-package).  PREG
;;; supplies the value latched into maddr.  The chip number comes from
;;; %P4, is doubled, and is offset by the scratch-ram entry
;;; *UC-PHYSICAL-NUMBER-OF-CHIPS-OFFSET*.  After WAIT-FOR-OFIFO, two reads
;;; are issued (chip-sel is bumped between them); RDATA is latched into
;;; OFIFO-LOW and RDATA-SWAP into OFIFO-HIGH.
(defucmacro gen-cmis-send-chip-slice-%px (preg)
  `(def-min-mic-internal ,(intern (format nil "CMIS-SEND-CHIP-SLICE-~a"
					  (symbol-name preg))
				  cmi-package)
		()
     NIL	;; (cmis-config :BETA)
     ((min-mic-declare
	(:DISP-ENTRY nil)
	(:IMP-INLINE-ENTRY t :NO-SAVE-DISPATCH))

      "Set %IMP-TEMP0 to chip number * 2 (beta chip number)"
      (ui (alu-simp %P4 (%IMP-TEMP0 (2* $EXP))))
      "Load beta chip into chip-sel register"
      ;; WITH-SAVED-SRAM-COUNTER is defined elsewhere; presumably it saves
      ;; and restores the counter around these two ui's -- TODO confirm.
      (with-saved-sram-counter
	(UI (ALU-SIMP
	      (CONSTANT
		(EVAL (UC-SCRATCH-ADDRESS (UC-SCRATCH-ENTRY-STRUCT '*UC-PHYSICAL-NUMBER-OF-CHIPS-OFFSET*))))
	      (Y $EXP))
	    (YBUS-DEST-SEL COUNTER)
	    (YBUS-SRC-SEL ALU)
	    (clk-cntl 5-tick))		; Extend CM part of this instr to fake out
					; extending sequencer part of next instr.
					; Required for cm200 because
					; sram -> alu -> chip-sel is too long for 4 ticks.
	(ui (alu-simp (+ %IMP-TEMP0 scratch-ram) (%imp-temp0 $exp))
	    (ybus-dest-sel chip-sel)))

      ;;Spin waiting for OFIFO
      (wait-for-ofifo)

      "Set address and READ"
      (ui (alu-simp ,preg)  
	  (LD-MADDR MADDR)
	  (CM-OP-SF READ))
    
      "NOP for readback path, get next chip-sel"
      (ui)

      "Start next read"
      (ui (cm-op-sf read)
	  (alu-simp (+ %IMP-TEMP0 1) (Y $EXP))
	  (YBUS-DEST-SEL CHIP-SEL))

      "Latch the low 16 in ofifo-low"
      (ui (YBUS-DEST-SEL OFIFO-LOW)
	  (YBUS-SRC-SEL BYPASS)
	  (DBUS-SRC-SEL RDATA))
    
      "NOP for readback path"
      (ui)

      "Latch the high 16 in ofifo-high"
      (ui (YBUS-DEST-SEL OFIFO-HIGH)      
	  (YBUS-SRC-SEL BYPASS)
	  (DBUS-SRC-SEL RDATA-SWAP)))))

;;; Container definition for the :BETA configuration whose body is
;;; GEN-ALL-CMIS-SEND-CHIP-SLICE-%PX (defined elsewhere; presumably one
;;; GEN-CMIS-SEND-CHIP-SLICE-%PX expansion per register -- TODO confirm).
(def-cmis-min-mic cmis-send-chip-slice-%px ()
  (cmis-config :BETA)
  (min-mic-declare (:ENTRY-SPEC nil) (:DISP-ENTRY nil) (:END-RETURN nil))
  (gen-all-cmis-send-chip-slice-%px))


;;; CMIS-READ-PROC-INTERNAL (:sprint configuration)
;;;
;;; Reads one processor's field of up to 32 bits into %imp-temp1 and
;;; returns with `seq crtn t'.  The register contract is listed in the
;;; comments at the top of the body.  Stages, in order: load the field
;;; into transposer A, pad the rest with the sign bit or zeros, write the
;;; transposed slice to *cm-temp-bit-0*, read it back through chip-sel in
;;; two 16-bit halves, then pass it through SR with the global line
;;; sourcing SR so every sequencer gets the value.
(def-cmis-min-mic cmis-read-proc-internal ()
  (cmis-config :sprint)

  (min-mic-declare
    (:disp-entry nil)
    (:imp-inline-entry t :no-xfer :no-save-dispatch)
    (:end-return nil))

  ;; Reads a processorwise value into %imp-temp1 (thus limited to 32
  ;; bits).  The basic idea is to transpose the source field into a 32
  ;; bit slice (appropriately extending sub 32 bit fields).  The slice
  ;; can be directly read back to one sequencer.  This value must then
  ;; be propagated to all sequencers.  This takes 64 + 23 short cycles, or
  ;; 13 usec.

  ;; Physical processor in %p4.
  ;; Maddr in %imp-temp0.
  ;; Number of bits in %C4.
  ;; Signed or unsigned in Q.  (Signed is nonzero, unsigned is zero.)
  ;; Returns with value in %imp-temp1.

  ;; Folded through this first section of code are the instructions:
  ;;
  ;;  (alu-simp %p4 (%imp-temp1 (half $exp)))
  ;;  (alu-simp %imp-temp1 (%imp-temp1 (half $exp)))
  ;;  (alu-simp %imp-temp1 (%imp-temp1 (half $exp)))
  ;;  (alu-simp %imp-temp1 (%imp-temp1 (half $exp)))
  ;;
  ;; which compute the relative chip address for the processor into %imp-temp1.

  ;; Load %c4 bits into transposer A, starting at %imp-temp0.
  ;; (%c4 - 1) goes to the dispatch register as the loop count; the loop
  ;; is skipped when the result is negative.
  (ui (alu-simp (- %c4 1))
      (ybus-dest-sel dispatch)
      (sop :memory-bus :transposer-a
	   :memory-bus-direction :write
	   :memory-bus-pointer-control :post-clear))
  (ui (alu-simp %p4 (%imp-temp1 (half $exp)))	; Folded instruction 1
      (jump choose-extend-bit addr-neg))
  (ui (seq push t) (br-bus-src-sel dispatch))
  (ui (mop-maddr (%imp-temp0 ++))
      (sop :memory-bus :transposer-a
	   :memory-bus-direction :write
	   :memory-bus-pointer-control :post-add)
      (seq rfct))

  ;; Conditionally (based on Q) load the maddr latch with either the
  ;; sign bit address or the address of a slice containing all zeros.
  (label choose-extend-bit)
  (ui (alu-simp q))
  (ui (mop-maddr (- %imp-temp0 1))
      (jump extend-loop not-addr-zero))
  (ui (mop-maddr (constant (eval *floating-point-add-subtract-identity*))))

  (label extend-loop)

  ;; Load the rest of transposer a from the latched maddr.
  ;; (31 - %c4) goes to the dispatch register as the loop count; the loop
  ;; is skipped when the result is negative.
  (ui (alu-simp (- (constant 31) %c4))
      (ybus-dest-sel dispatch))
  (ui (alu-simp %imp-temp1 (%imp-temp1 (half $exp)))	; Folded instruction 2.
      (jump write-value addr-neg))
  (ui (seq push t) (br-bus-src-sel dispatch))
  (ui (sop :memory-bus :transposer-a
	   :memory-bus-direction :write
	   :memory-bus-pointer-control :post-add)
      (seq rfct))

  (label write-value)

  ;; Write the value to memory slicewise.  The particular slot of the
  ;; transposer to use is given by the low 5 bits of the physical
  ;; processor address.
  (ui (alu-simp (and %p4 (constant #x1f)))
      (lbl)
      (arg-instr 1))
  (ui (alu-simp %imp-temp1 (%imp-temp1 (half $exp)))	; Folded instruction 3.
      (sprint-rug :rug-read-write :write :register :pointer))
  (ui (alu-simp %imp-temp1 (%imp-temp1 (half $exp))))	; Required nop between sprint-rug and sop
						        ; Folded instruction 4.
  (ui (mop-maddr (constant (eval *cm-temp-bit-0*)))
      (sop :memory-bus :transposer-a
	   :memory-bus-direction :read
	   :memory-bus-pointer-control :post-clear)
      (we-cntl hi-lo))

  ;; From here in, the code is identical to that for cmis-read-slice (in
  ;; mi-read-write).  Folded into the code above we have computed the
  ;; relative beta chip number into %imp-temp1.  Correct for physical
  ;; offset in the machine and load the corrected value into chip
  ;; select.  Note that of the two beta chips connected to a sprint chip
  ;; we always read from chip 0 and then from chip 1.  The correction
  ;; below zeroes the low order bit of the relative chip address which
  ;; ensures this order.
  ;; The read-time constant below is (ldb (byte 32 0) (ash -1 1)),
  ;; i.e. #xFFFFFFFE.
  (ui (alu-simp (logand %imp-temp1 (constant #.(ldb (byte 32 0) (ash -1 1)))) (%imp-temp1 $exp)))
  ;; WITH-SAVED-SRAM-COUNTER is defined elsewhere; presumably it saves and
  ;; restores the counter around these two ui's -- TODO confirm.
  (with-saved-sram-counter
    (UI (ALU-SIMP
	  (CONSTANT
	    (EVAL (UC-SCRATCH-ADDRESS (UC-SCRATCH-ENTRY-STRUCT '*UC-PHYSICAL-NUMBER-OF-CHIPS-OFFSET*))))
	  (Y $EXP))
	(YBUS-DEST-SEL COUNTER)
	(YBUS-SRC-SEL ALU)
	(clk-cntl 5-tick))		; Extend CM part of this instr to fake out
					; extending sequencer part of next instr.
					; Required for cm200 because
					; sram -> alu -> chip-sel is too long for 4 ticks.
    (ui (alu-simp (+ %imp-temp1 scratch-ram) (%imp-temp1 $exp))
	(ybus-dest-sel chip-sel)))
  ;; Start read of low 16 bits.  The maddr was latched 2 cycles ago.
  (ui (cm-op-sf read))
  ;; Bump chip select to pick chip holding high 16 bits.
  (ui (alu-simp (+ %imp-temp1 1))
      (ybus-dest-sel chip-sel))
  ;; Start read of high 16 bits.
  (ui (cm-op-sf read)
      ;; Get low 16 bits.
      (alu-simp rdata (%imp-temp1 $exp)))
  ;; Nop for readback.
  (ui)
  ;; Get high 16 bits.
  (ui (alu-simp (or %imp-temp1 rdata-swap) (%imp-temp1 $exp)))

  ;; At this point %imp-temp1 in the selected sequencer contains the
  ;; correct value while all other sequencers have all 1's.
 
  ;; Load complement of %imp-temp1 into SR.  Selected sequencer gets
  ;; complement of value; others get 0.  Set the global line to source
  ;; SR.
  (ui (alu-simp (not (xor %imp-temp1 zero)))
      (ybus-dest-sel sr)
      (seq ldct 31)
      (enc-fld-b global-out-sr))
  ;; Do a global or on each bit of the SR.
  (ucblock-ns
    (label loop-top)
    (ui (enc-fld-b shift)
	(seq rpct loop-top)))
  ;; Now all sequencers have complement of value in SR.  Load complement
  ;; of SR in to  %imp-temp1 and reset the global line to source the CM
  ;; global line.
  (ui (alu-simp (not (xor sr zero)) (%imp-temp1 $exp))
      (enc-fld-b global-out-cm)
      (seq crtn t)))

;;; Container definition for the :BETA configuration whose body is
;;; GEN-ALL-CMIS-READ-PROC (defined elsewhere; its expansion is not
;;; visible in this file).
(def-cmis-min-mic cmis-read-proc ()
  (cmis-config :BETA)
  (min-mic-declare (:ENTRY-SPEC nil) (:DISP-ENTRY nil) (:END-RETURN nil))
  (gen-all-cmis-read-proc))


;;; CMIS-SELECT-PROCESSOR (:BETA configuration)
;;;
;;; Sets the hardware context flag to 1 in the physical processor named by
;;; %P4 and to 0 everywhere else, then returns inline with :warp.
(def-cmis-min-mic CMI::CMIS-SELECT-PROCESSOR ()
  (cmis-config :BETA)

  (min-mic-declare
    (:disp-entry nil)
    (:imp-inline-entry t :no-xfer :no-save-dispatch)
    (:end-return nil))

  ;; Load the hardware context flag with a 1 in the given physical
  ;; processor (%P4) and 0's everywhere else.

  ;; PROCESSOR-TO-MASK-AND-CHIP is defined elsewhere; presumably it leaves
  ;; the relative chip number in %imp-temp1 and the processor mask in
  ;; %imp-temp0, since those are the registers consumed below -- TODO
  ;; confirm.
  (ui (seq cjs t processor-to-mask-and-chip))

  ;; Load the absolute chip select.
  (with-saved-sram-counter
    (UI (ALU-SIMP
	  (CONSTANT
	    (EVAL (UC-SCRATCH-ADDRESS (UC-SCRATCH-ENTRY-STRUCT '*UC-PHYSICAL-NUMBER-OF-CHIPS-OFFSET*))))
	  (Y $EXP))
	(YBUS-DEST-SEL COUNTER)
	(YBUS-SRC-SEL ALU)
	(clk-cntl 5-tick))		; Extend CM part of this instr to fake out
					; extending sequencer part of next instr.
					; Required for cm200 because
					; sram -> alu -> chip-sel is too long for 4 ticks.
    (ui (alu-simp (+ %imp-temp1 scratch-ram))
	(ybus-dest-sel chip-sel)))

  ;; Load a zero into the context flag, and load the maddr latch with
  ;; the address of a temp.  In the last two instructions fold in the
  ;; code to restore the dispatch register.
  ;; NOTE(review): no explicit dispatch-register restore is visible in
  ;; the ui's below; it may be stale text or handled inside
  ;; imp$-macroinst-return-inline -- confirm.
  (ui (mop-maddr (constant (eval *cm-temp-bit-0*)))
      (mop::loada :f :read-flag :zero-flag))
  (ui (mop::loadb :f :condition-flag :zero-flag :condition-invert t))
  (ui (mop::store :write-flag :context-flag))

  ;; Now loadi the mask.  Use the latched maddr to write the C latch.
  (ui (mop::loada :a))
  (ui (mop::loadi :data 0)
      (alu-simp %imp-temp0)
      (arg-instr t))

  ;; Loadi sets the cond latch on in the selected chip, and off
  ;; everywhere else.  Thus the following store modifies the context
  ;; flag in only the selected processor.  There it writes the mask to
  ;; the context flag.
  (ui (mop::store :write-flag :context-flag)
      (imp$-macroinst-return-inline :warp)))


;;; CMIS-SELECT-PROCESSOR-INTERNAL (:BETA configuration)
;;;
;;; Writes, at the memory address held in Q, a 0 in every processor except
;;; the one named by %P4, then returns inline with :warp.
(def-cmis-min-mic cmis-select-processor-internal ()
  (cmis-config :BETA)

  (min-mic-declare
    (:disp-entry nil)
    (:imp-inline-entry t :no-xfer :no-save-dispatch)
    (:end-return nil))

  ;; Load the given maddr (in Q) with 0 everywhere except for the
  ;; selected processor (in %P4).

  ;; PROCESSOR-TO-MASK-AND-CHIP is defined elsewhere; presumably it leaves
  ;; the relative chip number in %imp-temp1 and the processor mask in
  ;; %imp-temp0, since those are the registers consumed below -- TODO
  ;; confirm.
  (ui (seq cjs t processor-to-mask-and-chip))

  ;; Load the absolute chip select.
  (with-saved-sram-counter
    (UI (ALU-SIMP
	  (CONSTANT
	    (EVAL (UC-SCRATCH-ADDRESS (UC-SCRATCH-ENTRY-STRUCT '*UC-PHYSICAL-NUMBER-OF-CHIPS-OFFSET*))))
	  (Y $EXP))
	(YBUS-DEST-SEL COUNTER)
	(YBUS-SRC-SEL ALU)
	(clk-cntl 5-tick))		; Extend CM part of this instr to fake out
					; extending sequencer part of next instr.
					; Required for cm200 because
					; sram -> alu -> chip-sel is too long for 4 ticks.
    (ui (alu-simp (+ %imp-temp1 scratch-ram))
	(ybus-dest-sel chip-sel)))

  ;; Load the maddr latch with the address of a slice of all 0's.
  (ui (mop-maddr (constant (eval *floating-point-add-subtract-identity*))))

  ;; Loadi the mask.  This loads the mask into the A latch, loads all
  ;; 0's into the C latch, loads all 0's into the cond latch of all but
  ;; the selected chip, where all 1's are loaded.
  (ui (mop::loadi :data 0)
      (alu-simp %imp-temp0)
      (arg-instr t))
  
  ;; This store writes the mask in the selected chip, and 0's everywhere
  ;; else (due to conditionalization).
  (ui (mop-maddr q)
      (mop::store)
      (imp$-macroinst-return-inline :warp)))


;;; DEF-CMIS-NEWS name &key conditional
;;;
;;; Expands into a no-argument DEF-CMIS-MIN-MIC definition of NAME for the
;;; :beta configuration.  CONDITIONAL only selects the LOADB emitted
;;; before the transfer loop: when true the condition flag is the context
;;; flag, otherwise it is the inverted zero flag.  The generated routine
;;; reads from %p1 and stores through %p2, with the length in %IMP-TEMP0.
;;; It sends the first bit on its own to fill the pipe, loops storing the
;;; previous bit while sending the current one, then stores the last bit
;;; and returns.  The scratch-ram address is kept in %disp-save for the
;;; duration.
(defmacro def-cmis-news (name &key (conditional nil))
  `(def-cmis-min-mic ,name ()
     (cmis-config :beta)
     (min-mic-declare
       (:disp-entry nil)
       (:end-return nil))

     ;; Expects the following registers to be set up:
     ;;  %IMP-TEMP0 holds the length
     ;;  %N1 holds the on-chip-mask
     ;;  %N2 holds the xor value

     ;; Save the current scratch ram address.
     (ui (alu-simp scratch-ram-addr (%disp-save $exp)))

     ;; There are three arrays in scratch ram which give the sequence of
     ;; processors going off chip for any given on chip mask.  The
     ;; individual sequences are glommed together in one big array,
     ;; *news-off-chip-processors-base* (which happens to have 81
     ;; elements).  The two other arrays have 16 elements (one for each
     ;; possible mask).  The array *news-off-chip-processors-length*
     ;; gives for each mask the number of processors going off chip.
     ;; The array *news-off-chip-processors-index* gives the starting
     ;; position in *news-off-chip-processors-base* for each mask.

     ;; Put (number of processors going off chip - 1) into the dispatch register.
     ;; The first ui points the counter at the length entry for the mask
     ;; in %n1; it carries the 5-tick clock added by this patch on behalf
     ;; of the ui that follows it.
     (ui (alu-simp (+ %n1 (constant
			    (eval
			      (uc-scratch-address
				(uc-scratch-block-struct '*news-off-chip-processors-length*))))))
	 (ybus-dest-sel counter) (ybus-src-sel alu)
	 (clk-cntl 5-tick))		; Extend CM part of this instr to fake out
					; extending sequencer part of next instr.
					; Required for cm200 because
					; sram -> alu -> dispatch is too long for 4 ticks.
     (ui (alu-simp (- scratch-ram 1) (y $exp))
	 (ybus-dest-sel dispatch))

     ;; Put the scratch ram address of the vector of processors going off
     ;; chip into %IMP-TEMP1 register (and point scratch-ram there).
     (ui (alu-simp (+ %n1 (constant
			    (eval
			      (uc-scratch-address
				(uc-scratch-block-struct '*news-off-chip-processors-index*))))))
	 (ybus-dest-sel counter) (ybus-src-sel alu))
     (ui (alu-simp scratch-ram (%imp-temp1 $exp)))
     (ui (alu-simp (+ %imp-temp1
		      (constant
			(eval
			  (uc-scratch-address
			    (uc-scratch-block-struct '*news-off-chip-processors-base*)))))
		   (%imp-temp1 $exp))
	 (ybus-dest-sel counter))
		    
     ;; Set up alu-sum function and conditionalization.
     ;; This is the only place the CONDITIONAL macro argument is used.
     (ui ,(if conditional
	      `(mop::loadb :b :condition-flag :context-flag)
	      `(mop::loadb :b :condition-flag :zero-flag :condition-invert t)))

     ;; The first bit is sent separately to get the pipe going.
     (ucblock-ns

       ;; Load the first bit into the R and Cube-In latches.
       (ui (mop-maddr (%p1 ++))
	   (mop::rug-news))

       ;; Send the bits.
       (cmis-news-inner-loop)

       ;; Decrement the length and reset scratch-ram.  Write good ECC to the
       ;; current maddr (%p2).
       (ui (alu-simp (- %imp-temp0 1) (%imp-temp0 $exp) (y %imp-temp1))
	   (ybus-dest-sel counter)
	   (mop::store)
	   (cube-cntl sendrodd))

       ;; Latch the first bit from Cube-In.  It sits in the B latch
       ;; until it is finally written out to memory in the store
       ;; instruction after the inner loop.
       ;; If the decrement above reached zero (length was 1), skip the
       ;; main loop entirely.
       (ui (mop::loada :f :bsel 1)
	   (jump news-loop-end addr-zero)))

     (label news-loop-start)

     ;; The main loop.  In each iteration the data from the previous iteration
     ;; is stored to memory and the data for the current source is
     ;; transmitted.
     (ucblock-ns

       ;; Load the current bit into the R and Cube-In latches.
       (ui (mop-maddr (%p1 ++))
	   (mop::rug-news))

       ;; Send the current bit.
       (cmis-news-inner-loop)

       ;; Decrement the length and reset scratch-ram.  Store the previous
       ;; bit to memory.
       (ui (alu-simp (- %imp-temp0 1) (%imp-temp0 $exp) (y %imp-temp1))
	   (ybus-dest-sel counter)
	   (mop::store)
	   (cube-cntl sendrodd))

       ;; Latch the current bit from Cube-In.  It sits in the B latch
       ;; until it is finally written out to memory in the store
       ;; instruction after the inner loop.
       ;; %p2 is also advanced here; the loop repeats while the
       ;; decremented length is non-zero.
       (ui (alu-simp (+ %p2 1) (%p2 $exp))
	   (mop::loada :f :bsel 1)
	   (jump news-loop-start not-addr-zero)))

     (label news-loop-end)

     ;; Latch and store the last bit and do a modified return.
     ;; The counter is first set to (%disp-save - 1).  The final ui loads
     ;; the dispatch register from the scratch-ram word at that address and
     ;; increments the counter, which brings it back to the address saved
     ;; on entry.
     (ui (mop::loada :f :bsel 1)
	 (alu-simp (- %disp-save 1))
	 (ybus-dest-sel counter) (ybus-src-sel alu))
     (ui (mop-maddr (%p2 ++))
	 (mop::store)
	 (dbus-src-sel scratch-ram)
	 (ybus-dest-sel dispatch) (ybus-src-sel bypass)
	 (enc-fld-a inc-counter)
	 (seq crtn t))))

;;; Instantiate both variants.  The "always" routine uses the inverted
;;; zero flag as its condition flag; the "cond" routine conditionalizes on
;;; the context flag.
(def-cmis-news cmis-news-%p1-to-%p2-always-internal :conditional nil)
(def-cmis-news cmis-news-%p1-to-%p2-cond-internal :conditional t)

;;; =========================================================================
;;; CMIS-STENCIL2-LOAD-S
;;;
;;; D=data
;;; W=total width of edge, P=number of data columns (at least 2)
;;;
;;; Pipeline:
;;;	MBUS		FBUS		FPU dynamic
;;;	mwb D<1>						      \
;;;	mwb D<2+j>	frb D<1+j>	load reg	0 <= j < P-1  | 0 <= k < W
;;;			frb D<P>	load reg		      /
;;;
;;; (No attempt is made to dovetail the various copies of the inner loop.
;;; This costs us a few cycles, but so what?  This is not critical to the
;;; innermost loop of the overall stencil code.)

(def-cmis-min-mic CMIS-STENCIL2-LOAD-S (useless-product edge-width number-of-data-cols)
  ;; Loads stencil data through the MBUS -> FBUS -> FPU pipeline pictured in
  ;; the header comment, for the STENCIL2 case.
  ;;
  ;; Overall flow:
  ;;   1. Read one scratch-RAM word into group-counter.
  ;;   2. Compute (address of label dispatch-1) - (next scratch-RAM word) and
  ;;      put it in both data-dispatch and the DISPATCH register.
  ;;   3. Subtract %pipeline-stride from %sweep-stride once.
  ;;   4. First sweep: read at the unmodified %data-pointer, then jump through
  ;;      DISPATCH into the unrolled run of reads that ends at dispatch-1.
  ;;   5. Rebase the dispatch address to dispatch-2 and run the remaining
  ;;      sweeps in the loop, advancing %data-pointer by %sweep-stride at the
  ;;      start of each one.
  ;;
  ;; NOTE(review): the two scratch-RAM words are presumably edge-width and
  ;; number-of-data-cols, in that order -- confirm against the argument
  ;; passing convention of def-cmis-min-mic.
  ;;
  ;; The number and order of ui's is significant: DISPATCH holds an address
  ;; computed relative to the labels, so instructions must not be added,
  ;; removed, or reordered between the dispatching read and its label.
  (cmis-config :WTL3164)
  (min-mic-declare
   (:DISP-ENTRY nil)   ;sorry, no FE calls (though it would work) allows WARP return
   (:ARG1-COUNTER t :COUNTED-ARGS 1)
   (:IMP-INLINE-ENTRY t :NO-XFER))
  (with-stencil-registers
    ;; group-counter <- scratch-RAM word; INC-COUNTER then steps the counter.
    (ui (alu-simp scratch-ram (group-counter $EXP))
	(ENC-FLD-A INC-COUNTER))
    ;; data-dispatch <- literal taken from the na-sf field (label dispatch-1).
    (ui (alu-simp lit (data-dispatch $EXP))
	(lit-vert lit-20)
	(na-sf dispatch-1)
	(clk-cntl 5-tick))		; Extend CM part of this instr to fake out
					; extending sequencer part of next instr.
					; Required for cm200 because
					; sram -> alu -> dispatch is too long for 4 ticks.
    (ui (alu-simp (- data-dispatch SCRATCH-RAM) ;calculate dispatch address
		  (data-dispatch $EXP)) ; and save in data-dispatch
	(YBUS-DEST-SEL DISPATCH)	;also put in Dispatch reg for first use
	(wtl3164-static-instruction :func :monadic :main 0 :mbin 0 :aain 0 :abin 0);nop
	(ENC-FLD-A INC-COUNTER))
    ;; It is STENCIL2, so subtract 2-1=1 time, so no loop is needed.
    ;; Imps called after this one depend on this subtraction.
    (ui (alu-simp (- %sweep-stride %pipeline-stride) (%sweep-stride $EXP)))
    "Read first data element from memory; dispatch on number remaining"
    ;; NOTE(review): the opcode below is CJP and this routine contains no
    ;; return, so the "jump-to-subroutine"/"pushes PC" wording may be stale.
    ;; If the sequencer follows Am2910-style semantics, CJP is a plain jump
    ;; and control simply falls through to dispatch-1 -- confirm.
    (UI (alu-simp %data-pointer (Y $EXP)) (LD-MADDR MADDR)
	(cmis$-sprint-bypass :mwb)
	;;jump-to-subroutine via the current DISPATCH contents
	(SEQ CJP t DISPATCH))		;pushes PC, jumps to address that is
					; label "dispatch-1" minus number of data elts
    "Some number of these will be executed to read remaining data elements"
    ;; First sweep: each read addresses memory at preg without modifying it.
    ;; NOTE(review): presumably the macro expands its body once per
    ;; pointer/stride register pair, binding preg and sreg -- confirm.
    (cmis$-stencil-pointer-stride-regs-reversed 0 (preg sreg)
      (UI (alu-simp preg (Y $EXP)) (LD-MADDR MADDR)
	  (cmis$-sprint-bypass :frb :mwb)
	  (cmis$-dynamic-from-sram)))
    ;; Need one copy of the following instruction (numbered 2).
    (UI (alu-simp (+ %data-pointer %pipeline-stride) (Y $EXP) (%data-pointer $EXP)) (LD-MADDR MADDR)  ;2
	(cmis$-sprint-bypass :frb :mwb)
	(cmis$-dynamic-from-sram))
    ;; Final pipeline stage of the sweep: :frb only, no new memory address.
    ;; The ALU is free, so it decrements group-counter here.
    (UI (alu-simp (- group-counter 1) (group-counter $EXP))
	(cmis$-sprint-bypass :frb)
	(cmis$-dynamic-from-sram))
    (label dispatch-1)
    ;; Exit when ADDR-ZERO holds.
    ;; NOTE(review): presumably ADDR-ZERO reflects the group-counter
    ;; decrement just above reaching zero -- confirm.
    (UI (SEQ CJP ADDR-ZERO done))
    "Adjust dispatch address"
    ;; Rebase data-dispatch from label dispatch-1 to label dispatch-2 (subtract
    ;; one literal, add the other) and reload the DISPATCH register, so the
    ;; loop below enters its own copy of the unrolled reads at the same depth.
    (ui (alu-simp (- data-dispatch lit) (data-dispatch $EXP))
	(lit-vert lit-20)
	(na-sf dispatch-1))
    (ui (alu-simp (+ data-dispatch lit) (data-dispatch $EXP))
	(YBUS-DEST-SEL DISPATCH)	;also put in Dispatch reg for first use
	(lit-vert lit-20)
	(na-sf dispatch-2))
    "Now a loop that reads the rest with pre-incrementing"
    (label loop)
    "Read first data element from memory; dispatch on number remaining"
    ;; Advance %data-pointer by %sweep-stride before the first read of the sweep.
    ;; NOTE(review): same CJP vs. "pushes PC" question as in the first sweep.
    (UI (alu-simp (+ %data-pointer %sweep-stride) (Y $EXP) (%data-pointer $EXP)) (LD-MADDR MADDR)
	(cmis$-sprint-bypass :mwb)
	;;jump-to-subroutine via the current DISPATCH contents
	(SEQ CJP t DISPATCH))	;pushes PC, jumps to address that is
				 ; label "dispatch-2" minus number of data elts
    "Some number of these will be executed to read remaining data elements"
    ;; Unlike the first sweep, each read here first adds sreg to preg and
    ;; writes the sum back to preg.
    (cmis$-stencil-pointer-stride-regs-reversed 0 (preg sreg)
      (UI (alu-simp (+ preg sreg) (Y $EXP) (preg $EXP)) (LD-MADDR MADDR)
	  (cmis$-sprint-bypass :frb :mwb)
	  (cmis$-dynamic-from-sram)))
    ;; Need one copy of the following instruction (numbered 2).
    (UI (alu-simp (+ %data-pointer %pipeline-stride) (Y $EXP) (%data-pointer $EXP)) (LD-MADDR MADDR)  ;2
	(cmis$-sprint-bypass :frb :mwb)
	(cmis$-dynamic-from-sram))
    ;; Final pipeline stage of the sweep; decrement group-counter.
    (UI (alu-simp (- group-counter 1) (group-counter $EXP))
	(cmis$-sprint-bypass :frb)
	(cmis$-dynamic-from-sram))
    (label dispatch-2)
    ;; Go round again while NOT-ADDR-ZERO holds; otherwise fall through to done.
    (UI (SEQ CJP NOT-ADDR-ZERO loop))
    (label done)
    ))

;;; =========================================================================
;;; CMIS-STENCIL4-LOAD-S
;;;
;;; D=data
;;; W=total width of edge, P=number of data columns (at least 4)
;;;
;;; Pipeline:
;;;	MBUS		FBUS		FPU dynamic
;;;	mwb D<1>						      \
;;;	mwb D<2+j>	frb D<1+j>	load reg	0 <= j < P-1  | 0 <= k < W
;;;			frb D<P>	load reg		      /
;;;
;;; (No attempt is made to dovetail the various copies of the inner loop.
;;; This costs us a few cycles, but so what?  This is not critical to the
;;; innermost loop of the overall stencil code.)

(def-cmis-min-mic CMIS-STENCIL4-LOAD-S (useless-product edge-width number-of-data-cols)
  ;; Loads stencil data through the MBUS -> FBUS -> FPU pipeline pictured in
  ;; the header comment, for the STENCIL4 case.
  ;;
  ;; Overall flow:
  ;;   1. Read one scratch-RAM word into group-counter.
  ;;   2. Compute (address of label dispatch-1) - (next scratch-RAM word) and
  ;;      put it in both data-dispatch and the DISPATCH register.
  ;;   3. Subtract %pipeline-stride from %sweep-stride three times
  ;;      (PUSH/RFCT repeat).
  ;;   4. First sweep: read at the unmodified %data-pointer, then jump through
  ;;      DISPATCH into the unrolled run of reads that ends at dispatch-1.
  ;;   5. Rebase the dispatch address to dispatch-2 and run the remaining
  ;;      sweeps in the loop, advancing %data-pointer by %sweep-stride at the
  ;;      start of each one.
  ;;
  ;; NOTE(review): the two scratch-RAM words are presumably edge-width and
  ;; number-of-data-cols, in that order -- confirm against the argument
  ;; passing convention of def-cmis-min-mic.
  ;;
  ;; The number and order of ui's is significant: DISPATCH holds an address
  ;; computed relative to the labels, so instructions must not be added,
  ;; removed, or reordered between the dispatching read and its label.
  (cmis-config :WTL3164)
  (min-mic-declare
   (:DISP-ENTRY nil)   ;sorry, no FE calls (though it would work) allows WARP return
   (:ARG1-COUNTER t :COUNTED-ARGS 1)
   (:IMP-INLINE-ENTRY t :NO-XFER))
  (with-stencil-registers
    ;; group-counter <- scratch-RAM word; INC-COUNTER then steps the counter.
    (ui (alu-simp scratch-ram (group-counter $EXP))
	(ENC-FLD-A INC-COUNTER))
    ;; data-dispatch <- literal taken from the na-sf field (label dispatch-1).
    (ui (alu-simp lit (data-dispatch $EXP))
	(lit-vert lit-20)
	(na-sf dispatch-1)
	(clk-cntl 5-tick))		; Extend CM part of this instr to fake out
					; extending sequencer part of next instr.
					; Required for cm200 because
					; sram -> alu -> dispatch is too long for 4 ticks.
    (ui (alu-simp (- data-dispatch SCRATCH-RAM) ;calculate dispatch address
		  (data-dispatch $EXP)) ; and save in data-dispatch
	(YBUS-DEST-SEL DISPATCH)	;also put in Dispatch reg for first use
	(wtl3164-static-instruction :func :monadic :main 0 :mbin 0 :aain 0 :abin 0);nop
	(ENC-FLD-A INC-COUNTER))
    ;; It is STENCIL4, so subtract 4-1=3 times, so count is 3-1=2.
    ;; Imps called after this one depend on this subtraction.
    ;; NOTE(review): presumably PUSH loads the sequencer repeat count and the
    ;; RFCT on the following ui re-executes that ui until the count is
    ;; exhausted (count+1 executions, Am2910-style) -- confirm.
    (ui (SEQ PUSH t 2))
    (ui (alu-simp (- %sweep-stride %pipeline-stride) (%sweep-stride $EXP)) (SEQ RFCT))
    "Read first data element from memory; dispatch on number remaining"
    ;; NOTE(review): the opcode below is CJP and this routine contains no
    ;; return, so the "jump-to-subroutine"/"pushes PC" wording may be stale.
    ;; If the sequencer follows Am2910-style semantics, CJP is a plain jump
    ;; and control simply falls through to dispatch-1 -- confirm.
    (UI (alu-simp %data-pointer (Y $EXP)) (LD-MADDR MADDR)
	(cmis$-sprint-bypass :mwb)
	;;jump-to-subroutine via the current DISPATCH contents
	(SEQ CJP t DISPATCH))		;pushes PC, jumps to address that is
					; label "dispatch-1" minus number of data elts
    "Some number of these will be executed to read remaining data elements"
    ;; First sweep: each read addresses memory at preg without modifying it.
    ;; NOTE(review): presumably the macro expands its body once per
    ;; pointer/stride register pair, binding preg and sreg -- confirm.
    (cmis$-stencil-pointer-stride-regs-reversed 0 (preg sreg)
      (UI (alu-simp preg (Y $EXP)) (LD-MADDR MADDR)
	  (cmis$-sprint-bypass :frb :mwb)
	  (cmis$-dynamic-from-sram)))
    ;; Need three copies of the following instruction (numbered 2 to 4).
    (UI (alu-simp (+ %data-pointer %pipeline-stride) (Y $EXP) (%data-pointer $EXP)) (LD-MADDR MADDR)  ;2
	(cmis$-sprint-bypass :frb :mwb)
	(cmis$-dynamic-from-sram))
    (UI (alu-simp (+ %data-pointer %pipeline-stride) (Y $EXP) (%data-pointer $EXP)) (LD-MADDR MADDR)  ;3
	(cmis$-sprint-bypass :frb :mwb)
	(cmis$-dynamic-from-sram))
    (UI (alu-simp (+ %data-pointer %pipeline-stride) (Y $EXP) (%data-pointer $EXP)) (LD-MADDR MADDR)  ;4
	(cmis$-sprint-bypass :frb :mwb)
	(cmis$-dynamic-from-sram))
    ;; Final pipeline stage of the sweep: :frb only, no new memory address.
    ;; The ALU is free, so it decrements group-counter here.
    (UI (alu-simp (- group-counter 1) (group-counter $EXP))
	(cmis$-sprint-bypass :frb)
	(cmis$-dynamic-from-sram))
    (label dispatch-1)
    ;; Exit when ADDR-ZERO holds.
    ;; NOTE(review): presumably ADDR-ZERO reflects the group-counter
    ;; decrement just above reaching zero -- confirm.
    (UI (SEQ CJP ADDR-ZERO done))
    "Adjust dispatch address"
    ;; Rebase data-dispatch from label dispatch-1 to label dispatch-2 (subtract
    ;; one literal, add the other) and reload the DISPATCH register, so the
    ;; loop below enters its own copy of the unrolled reads at the same depth.
    (ui (alu-simp (- data-dispatch lit) (data-dispatch $EXP))
	(lit-vert lit-20)
	(na-sf dispatch-1))
    (ui (alu-simp (+ data-dispatch lit) (data-dispatch $EXP))
	(YBUS-DEST-SEL DISPATCH)	;also put in Dispatch reg for first use
	(lit-vert lit-20)
	(na-sf dispatch-2))
    "Now a loop that reads the rest with pre-incrementing"
    (label loop)
    "Read first data element from memory; dispatch on number remaining"
    ;; Advance %data-pointer by %sweep-stride before the first read of the sweep.
    ;; NOTE(review): same CJP vs. "pushes PC" question as in the first sweep.
    (UI (alu-simp (+ %data-pointer %sweep-stride) (Y $EXP) (%data-pointer $EXP)) (LD-MADDR MADDR)
	(cmis$-sprint-bypass :mwb)
	;;jump-to-subroutine via the current DISPATCH contents
	(SEQ CJP t DISPATCH))	;pushes PC, jumps to address that is
				 ; label "dispatch-2" minus number of data elts
    "Some number of these will be executed to read remaining data elements"
    ;; Unlike the first sweep, each read here first adds sreg to preg and
    ;; writes the sum back to preg.
    (cmis$-stencil-pointer-stride-regs-reversed 0 (preg sreg)
      (UI (alu-simp (+ preg sreg) (Y $EXP) (preg $EXP)) (LD-MADDR MADDR)
	  (cmis$-sprint-bypass :frb :mwb)
	  (cmis$-dynamic-from-sram)))
    ;; Need three copies of the following instruction (numbered 2 to 4).
    (UI (alu-simp (+ %data-pointer %pipeline-stride) (Y $EXP) (%data-pointer $EXP)) (LD-MADDR MADDR)  ;2
	(cmis$-sprint-bypass :frb :mwb)
	(cmis$-dynamic-from-sram))
    (UI (alu-simp (+ %data-pointer %pipeline-stride) (Y $EXP) (%data-pointer $EXP)) (LD-MADDR MADDR)  ;3
	(cmis$-sprint-bypass :frb :mwb)
	(cmis$-dynamic-from-sram))
    (UI (alu-simp (+ %data-pointer %pipeline-stride) (Y $EXP) (%data-pointer $EXP)) (LD-MADDR MADDR)  ;4
	(cmis$-sprint-bypass :frb :mwb)
	(cmis$-dynamic-from-sram))
    ;; Final pipeline stage of the sweep; decrement group-counter.
    (UI (alu-simp (- group-counter 1) (group-counter $EXP))
	(cmis$-sprint-bypass :frb)
	(cmis$-dynamic-from-sram))
    (label dispatch-2)
    ;; Go round again while NOT-ADDR-ZERO holds; otherwise fall through to done.
    (UI (SEQ CJP NOT-ADDR-ZERO loop))
    (label done)
    ))


;;; =========================================================================
;;; CMIS-STENCIL8-LOAD-S
;;;
;;; D=data
;;; W=total width of edge, P=number of data columns (at least 8)
;;;
;;; Pipeline:
;;;	MBUS		FBUS		FPU dynamic
;;;	mwb D<1>						      \
;;;	mwb D<2+j>	frb D<1+j>	load reg	0 <= j < P-1  | 0 <= k < W
;;;			frb D<P>	load reg		      /
;;;
;;; (No attempt is made to dovetail the various copies of the inner loop.
;;; This costs us a few cycles, but so what?  This is not critical to the
;;; innermost loop of the overall stencil code.)

(def-cmis-min-mic CMIS-STENCIL8-LOAD-S (useless-product edge-width number-of-data-cols)
  ;; Loads stencil data through the MBUS -> FBUS -> FPU pipeline pictured in
  ;; the header comment, for the STENCIL8 case.
  ;;
  ;; Overall flow:
  ;;   1. Read one scratch-RAM word into group-counter.
  ;;   2. Compute (address of label dispatch-1) - (next scratch-RAM word) and
  ;;      put it in both data-dispatch and the DISPATCH register.
  ;;   3. Subtract %pipeline-stride from %sweep-stride seven times
  ;;      (PUSH/RFCT repeat).
  ;;   4. First sweep: read at the unmodified %data-pointer, then jump through
  ;;      DISPATCH into the unrolled run of reads that ends at dispatch-1.
  ;;   5. Rebase the dispatch address to dispatch-2 and run the remaining
  ;;      sweeps in the loop, advancing %data-pointer by %sweep-stride at the
  ;;      start of each one.
  ;;
  ;; NOTE(review): the two scratch-RAM words are presumably edge-width and
  ;; number-of-data-cols, in that order -- confirm against the argument
  ;; passing convention of def-cmis-min-mic.
  ;;
  ;; The number and order of ui's is significant: DISPATCH holds an address
  ;; computed relative to the labels, so instructions must not be added,
  ;; removed, or reordered between the dispatching read and its label.
  (cmis-config :WTL3164)
  (min-mic-declare
   (:DISP-ENTRY nil)   ;sorry, no FE calls (though it would work) allows WARP return
   (:ARG1-COUNTER t :COUNTED-ARGS 1)
   (:IMP-INLINE-ENTRY t :NO-XFER))
  (with-stencil-registers
    ;; group-counter <- scratch-RAM word; INC-COUNTER then steps the counter.
    (ui (alu-simp scratch-ram (group-counter $EXP))
	(ENC-FLD-A INC-COUNTER))
    ;; data-dispatch <- literal taken from the na-sf field (label dispatch-1).
    (ui (alu-simp lit (data-dispatch $EXP))
	(lit-vert lit-20)
	(na-sf dispatch-1)
	(clk-cntl 5-tick))		; Extend CM part of this instr to fake out
					; extending sequencer part of next instr.
					; Required for cm200 because
					; sram -> alu -> dispatch is too long for 4 ticks.
    (ui (alu-simp (- data-dispatch SCRATCH-RAM) ;calculate dispatch address
		  (data-dispatch $EXP)) ; and save in data-dispatch
	(YBUS-DEST-SEL DISPATCH)	;also put in Dispatch reg for first use
	(wtl3164-static-instruction :func :monadic :main 0 :mbin 0 :aain 0 :abin 0);nop
	(ENC-FLD-A INC-COUNTER))
    ;; It is STENCIL8, so subtract 8-1=7 times, so count is 7-1=6.
    ;; Imps called after this one depend on this subtraction.
    ;; NOTE(review): presumably PUSH loads the sequencer repeat count and the
    ;; RFCT on the following ui re-executes that ui until the count is
    ;; exhausted (count+1 executions, Am2910-style) -- confirm.
    (ui (SEQ PUSH t 6))
    (ui (alu-simp (- %sweep-stride %pipeline-stride) (%sweep-stride $EXP)) (SEQ RFCT))
    "Read first data element from memory; dispatch on number remaining"
    ;; NOTE(review): the opcode below is CJP and this routine contains no
    ;; return, so the "jump-to-subroutine"/"pushes PC" wording may be stale.
    ;; If the sequencer follows Am2910-style semantics, CJP is a plain jump
    ;; and control simply falls through to dispatch-1 -- confirm.
    (UI (alu-simp %data-pointer (Y $EXP)) (LD-MADDR MADDR)
	(cmis$-sprint-bypass :mwb)
	;;jump-to-subroutine via the current DISPATCH contents
	(SEQ CJP t DISPATCH))		;pushes PC, jumps to address that is
					; label "dispatch-1" minus number of data elts
    "Some number of these will be executed to read remaining data elements"
    ;; First sweep: each read addresses memory at preg without modifying it.
    ;; NOTE(review): presumably the macro expands its body once per
    ;; pointer/stride register pair, binding preg and sreg -- confirm.
    (cmis$-stencil-pointer-stride-regs-reversed 0 (preg sreg)
      (UI (alu-simp preg (Y $EXP)) (LD-MADDR MADDR)
	  (cmis$-sprint-bypass :frb :mwb)
	  (cmis$-dynamic-from-sram)))
    ;; Need seven copies of the following instruction (numbered 2 to 8).
    (UI (alu-simp (+ %data-pointer %pipeline-stride) (Y $EXP) (%data-pointer $EXP)) (LD-MADDR MADDR)  ;2
	(cmis$-sprint-bypass :frb :mwb)
	(cmis$-dynamic-from-sram))
    (UI (alu-simp (+ %data-pointer %pipeline-stride) (Y $EXP) (%data-pointer $EXP)) (LD-MADDR MADDR)  ;3
	(cmis$-sprint-bypass :frb :mwb)
	(cmis$-dynamic-from-sram))
    (UI (alu-simp (+ %data-pointer %pipeline-stride) (Y $EXP) (%data-pointer $EXP)) (LD-MADDR MADDR)  ;4
	(cmis$-sprint-bypass :frb :mwb)
	(cmis$-dynamic-from-sram))
    (UI (alu-simp (+ %data-pointer %pipeline-stride) (Y $EXP) (%data-pointer $EXP)) (LD-MADDR MADDR)  ;5
	(cmis$-sprint-bypass :frb :mwb)
	(cmis$-dynamic-from-sram))
    (UI (alu-simp (+ %data-pointer %pipeline-stride) (Y $EXP) (%data-pointer $EXP)) (LD-MADDR MADDR)  ;6
	(cmis$-sprint-bypass :frb :mwb)
	(cmis$-dynamic-from-sram))
    (UI (alu-simp (+ %data-pointer %pipeline-stride) (Y $EXP) (%data-pointer $EXP)) (LD-MADDR MADDR)  ;7
	(cmis$-sprint-bypass :frb :mwb)
	(cmis$-dynamic-from-sram))
    (UI (alu-simp (+ %data-pointer %pipeline-stride) (Y $EXP) (%data-pointer $EXP)) (LD-MADDR MADDR)  ;8
	(cmis$-sprint-bypass :frb :mwb)
	(cmis$-dynamic-from-sram))
    ;; Final pipeline stage of the sweep: :frb only, no new memory address.
    ;; The ALU is free, so it decrements group-counter here.
    (UI (alu-simp (- group-counter 1) (group-counter $EXP))
	(cmis$-sprint-bypass :frb)
	(cmis$-dynamic-from-sram))
    (label dispatch-1)
    ;; Exit when ADDR-ZERO holds.
    ;; NOTE(review): presumably ADDR-ZERO reflects the group-counter
    ;; decrement just above reaching zero -- confirm.
    (UI (SEQ CJP ADDR-ZERO done))
    "Adjust dispatch address"
    ;; Rebase data-dispatch from label dispatch-1 to label dispatch-2 (subtract
    ;; one literal, add the other) and reload the DISPATCH register, so the
    ;; loop below enters its own copy of the unrolled reads at the same depth.
    (ui (alu-simp (- data-dispatch lit) (data-dispatch $EXP))
	(lit-vert lit-20)
	(na-sf dispatch-1))
    (ui (alu-simp (+ data-dispatch lit) (data-dispatch $EXP))
	(YBUS-DEST-SEL DISPATCH)	;also put in Dispatch reg for first use
	(lit-vert lit-20)
	(na-sf dispatch-2))
    "Now a loop that reads the rest with pre-incrementing"
    (label loop)
    "Read first data element from memory; dispatch on number remaining"
    ;; Advance %data-pointer by %sweep-stride before the first read of the sweep.
    ;; NOTE(review): same CJP vs. "pushes PC" question as in the first sweep.
    (UI (alu-simp (+ %data-pointer %sweep-stride) (Y $EXP) (%data-pointer $EXP)) (LD-MADDR MADDR)
	(cmis$-sprint-bypass :mwb)
	;;jump-to-subroutine via the current DISPATCH contents
	(SEQ CJP t DISPATCH))	;pushes PC, jumps to address that is
				 ; label "dispatch-2" minus number of data elts
    "Some number of these will be executed to read remaining data elements"
    ;; Unlike the first sweep, each read here first adds sreg to preg and
    ;; writes the sum back to preg.
    (cmis$-stencil-pointer-stride-regs-reversed 0 (preg sreg)
      (UI (alu-simp (+ preg sreg) (Y $EXP) (preg $EXP)) (LD-MADDR MADDR)
	  (cmis$-sprint-bypass :frb :mwb)
	  (cmis$-dynamic-from-sram)))
    ;; Need seven copies of the following instruction (numbered 2 to 8).
    (UI (alu-simp (+ %data-pointer %pipeline-stride) (Y $EXP) (%data-pointer $EXP)) (LD-MADDR MADDR)  ;2
	(cmis$-sprint-bypass :frb :mwb)
	(cmis$-dynamic-from-sram))
    (UI (alu-simp (+ %data-pointer %pipeline-stride) (Y $EXP) (%data-pointer $EXP)) (LD-MADDR MADDR)  ;3
	(cmis$-sprint-bypass :frb :mwb)
	(cmis$-dynamic-from-sram))
    (UI (alu-simp (+ %data-pointer %pipeline-stride) (Y $EXP) (%data-pointer $EXP)) (LD-MADDR MADDR)  ;4
	(cmis$-sprint-bypass :frb :mwb)
	(cmis$-dynamic-from-sram))
    (UI (alu-simp (+ %data-pointer %pipeline-stride) (Y $EXP) (%data-pointer $EXP)) (LD-MADDR MADDR)  ;5
	(cmis$-sprint-bypass :frb :mwb)
	(cmis$-dynamic-from-sram))
    (UI (alu-simp (+ %data-pointer %pipeline-stride) (Y $EXP) (%data-pointer $EXP)) (LD-MADDR MADDR)  ;6
	(cmis$-sprint-bypass :frb :mwb)
	(cmis$-dynamic-from-sram))
    (UI (alu-simp (+ %data-pointer %pipeline-stride) (Y $EXP) (%data-pointer $EXP)) (LD-MADDR MADDR)  ;7
	(cmis$-sprint-bypass :frb :mwb)
	(cmis$-dynamic-from-sram))
    (UI (alu-simp (+ %data-pointer %pipeline-stride) (Y $EXP) (%data-pointer $EXP)) (LD-MADDR MADDR)  ;8
	(cmis$-sprint-bypass :frb :mwb)
	(cmis$-dynamic-from-sram))
    ;; Final pipeline stage of the sweep; decrement group-counter.
    (UI (alu-simp (- group-counter 1) (group-counter $EXP))
	(cmis$-sprint-bypass :frb)
	(cmis$-dynamic-from-sram))
    (label dispatch-2)
    ;; Go round again while NOT-ADDR-ZERO holds; otherwise fall through to done.
    (UI (SEQ CJP NOT-ADDR-ZERO loop))
    (label done)
    ))

;;; Patch epilogue: set *need-to-relink* to t and call
;;; cmi::increment-patch-level with 13.
;;; NOTE(review): presumably the flag tells the loader that the microcode
;;; must be relinked after these redefinitions, and 13 is this patch's
;;; level number -- confirm against the patch system.
(setq *need-to-relink* t)

(cmi::increment-patch-level 13)
