*===============================================================================
*
*	TEXAS INSTRUMENTS, INC.
*
*	VITIS54 - VITERBI CHANNEL DECODER (IS54)
*
*	Revision Date: 06/13/97
*
*	USAGE This routine is C callable and can be called as
*
*		void vitis54(int n, short old[], short new_s[], 
*			    int trans[], short m[], short sd[])
*			
*		n 	--- decoder input length 		(input)
*		old[]	--- state metrics at previous instant	(input)
*		new_s[] --- state metrics at current instant	(output)
*		trans[] --- path transition of each state	(output)
*		m[]	--- local variable array to store path metrics
*		sd[]	--- soft decision values of input bit stream (input)
*			    
*	C CODE
*		This is the C equivalent of the Assembly Code without the
*		assumptions listed below. Note that the assembly code is hand
*		optimized and assumptions apply.
*
*		void vitis54(int n, short old[], short new_s[], int trans[],
*			     short m[], short sd[])
*		{
*			int             i, j, k, a, b, m0, m1, mj;
*			short          *tmp;
*			for (i = 0; i < n; i++) {
*				m0 = sd[2 * i] + sd[2 * i + 1];
*				m1 = sd[2 * i] - sd[2 * i + 1];
*				m[0] = m[5] = m[11] = m[14] = m0;
*				m[2] = m[7] = m[9] = m[12] = -m0;
*				m[1] = m[4] = m[10] = m[15] = m1;
*				m[3] = m[6] = m[8] = m[13] = -m1;
*				for (j = 0; j < 16; j++) {
*					mj = m[j];
*					for (k = 0; k < 32; k += 16) {
*						a = old[2 * j] + mj;
*						b = old[2 * j + 1] - mj;
*						new_s[j + k] = (b > a) ? b : a;
*						trans[i] = (trans[i]<<1)|(b>a);
*						mj = -mj;
*					}
*				}
*				tmp = old;
*				old = new_s;
*				new_s = tmp;
*			}
*		}
*
*
*	DESCRIPTION
*		This routine is used to decode the convolutional code 
*		employed in IS-54 full rate system, with Viterbi algorithm
*		and soft decision.
*		The convolutional code is of rate-1/2 and of constraint
*		length K = 6. The generator polynomials are
*			G0(D) = 1 + D^1 + D^3 + D^5
*			G1(D) = 1 + D^2 + D^3 + D^4 + D^5
*		and the output sequence of the convolutional encoder
*		is grouped as G0G1 G0G1 ...
*		The input of the convolutional decoder is a sequence of
*		soft decision values generated by the demodulator. The
*		values are 16-bit values between -1 and +1, with -1 for
*		transmitted 1 and +1 for transmitted 0.
*
*	TECHNIQUES
*		Completely unroll the k loop.
*
*	MEMORY NOTE:
*		m[] must be offset from both new_s[] and old[] by an odd number of
*		halfwords to avoid a conflict in cycle three of JLOOP.  Also, a
*		2x interleave is used to avoid conflicts in JLOOP and ILOOP.
*		There will be one conflict every other time through ILOOP
*		resulting in 0.5*n cycles.
*
*	CYCLES		66 * n + 16 + 0.5 * n
*
*===============================================================================
; Export the C-callable entry point and place the routine in .text.
	.global _vitis54
	.text

;-------------------------------------------------------------------------------
; _vitis54 -- one call processes n trellis steps (see the C CODE header above).
;
; Incoming arguments, as consumed by the code below:
;	A4 = n		B4 = old	A6 = new_s
;	B6 = trans	A8 = m		B8 = sd
;	B3 = return address (parked in A13 while the loops run)
;
; Register roles inside the loops:
;	B2	outer (i) pass counter, counts down from n; also predicates
;		the ILOOP stores/branches
;	B1	JLOOP branch counter, loaded with 16 and counted down
;	A2	priming count (3); JLOOP stores and the tr update are
;		suppressed while it is non-zero
;	B9	running pointer into m[] (pre-incremented by the mj load)
;	B4/A5	running pointers to old[2*j] / old[2*j+1]
;	A6	running pointer into new_s[]
;	B8/A9	running pointers to sd[2*i] / sd[2*i+1]
;	B14	tr, the transition word stored to trans[i]
;	A3/B13	constants 78 and 36 used to rewind A5 and B9 after JLOOP
;
; A10-A13 and B10-B14 are pushed in the prologue and popped after B_END.
;
; NOTE(review): every "||" group is one execute packet and the schedule
; relies on branch and load delay slots -- do not reorder or insert
; instructions without re-deriving the timing.
;-------------------------------------------------------------------------------
_vitis54:
*** BEGIN Benchmark Timing ***
B_START

; Prologue: the branch to ILOOP is issued first; the register pushes and
; the first sd loads are done in the packets that follow it.
	B	.S1	ILOOP			; branch to setup code		
||	ADD	.L1X	2,	B8,	A9	; A9 = sd + 2 bytes = &sd[1]
||	STW	.D2	B14,	*B15--		; push B14 on stack

	LDH	.D2	*B8++[2],	B7	; sd0 = sd[2 * i]
||	LDH	.D1	*A9++[2],	A0	; sd1 = sd[2 * i + 1]
||	MV	.L1X	B15,	A7		; copy stack pointer (second push pointer)

; From here two registers are pushed per packet: B-side registers through
; A7, A-side registers through B15.
	STW	.D1	B13,	*--A7		; push B13 on stack
||	STW	.D2	A13,	*B15--[2]	; push A13 on stack

	STW	.D1	B12,	*--A7[2]	; push B12 on stack
||	STW	.D2	A12,	*B15--[2]	; push A12 on stack

	STW	.D1	B11,	*--A7[2]	; push B11 on stack
||	STW	.D2	A11,	*B15--[2]	; push A11 on stack
||	MV	.L2X	A4,	B2		; move n (B2 = outer pass counter)

	STW	.D1	B10,	*--A7[2]	; push B10 on stack
||	STW	.D2	A10,	*B15--[2]	; push A10 on stack
||	MVK	.S1	78,	A3		; old->new reset constant
||	MVK	.S2	36,	B13		; m reset constant
||	MV	.L2X	A8,	B9		; copy m
||	MV	.L1X	B3,	A13		; move return address

; JLOOP: three execute packets per pass. The number of '*' in a comment is
; the original author's marker; presumably it identifies which overlapped
; iteration of the software pipeline the instruction belongs to.
; "MPY 1, x, y" is used throughout as a register copy on the .M unit.
JLOOP:
  [B1]	B	.S1	JLOOP			; for (taken while B1 != 0)
||[B1]	SUB	.S2	B1,	1,	B1	; j++ (B1 counts down)
||[!A2]	STH	.D1	B12,	*+A6[16]	; new[j+16] = a16
||[!A2]	ADD	.D2	B0,	B14,	B14	; tr  |= t16
||	CMPGT	.L1	A11,	A10,	A1	;* t0 = (b0 > a0)
||	CMPGT	.L2	B11,	B10,	B0	;* t16 = (b16 > a16)
||	MPY	.M1X	1,	B5,	A4	;** copy mj

  [A2]	SUB	.S1	A2,	1,	A2	; decrement priming
||[!A2]	STH	.D1	A12,	*A6++		; new[j] = a0
||[A1]	ADD	.S2	2,	B0,	B0	;* t16	|= (t0 << 1)
||[B0]	MPY	.M2	1,	B11,	B12	;* if (t16) a16 = b16
||	MPY	.M1	1,	A10,	A12	;* copy a0
||	SUB	.L2X	A7,	B5,	B10	;** a16 = old0 - mj
||	LDH	.D2	*++B9,	B5		;**** mj = m[j]

	SHL	.S2	B14,	2,	B14	;* tr <<= 2
||[A1]	MPY	.M1	1,	A11,	A12	;* if (t0) a0 = b0
||	ADD	.S1	A7,	A4,	A10	;** a0 = old0 + mj
||	SUB	.L1X	B3,	A4,	A11	;** b0 = old1 - mj
||	ADD	.L2	B3,	B5,	B11	;** b16 = old1 + mj
||	MPY	.M2	1,	B10,	B12	;** copy a16
||	LDH	.D2	*B4++[2],	A7	;**** old0 = old[2*j]
||	LDH	.D1	*A5++[2],	B3	;**** old1 = old[2*j+1]
; End of JLOOP

; JLOOP drain: store the last pair of metrics and the transition word, then
; rewind the pointers so that old and new_s exchange roles for the next pass.
	STH	.D1	B12,	*+A6[16]	; new[j+16] = a16
||	SUB	.D2	B9,	B13,	B9	; reset m (B9 -= 36 bytes),	i
||	ADD	.S2	B0,	B14,	B14	; tr  |= t16
||	ADD	.L2X	-15,	A6,	B4	; swap new_s, old (B4 = A6 - 15),	i

	STH	.D1	A12,	*A6		; new[j] = a0
||	SUB	.L1	A5,	A3,	A6	; swap new (A6 = A5 - 78 bytes)
||	STW	.D2	B14,	*B6++		; trans[i] = tr,	i+1
||	ADD	.L2	-15,	B4,	B4	; swap new_s, old (second -15, total -30 bytes),	i

; ILOOP: per-pass setup. Builds m[0..15] from the current sd pair, starts
; the next sd loads and launches JLOOP. Every store and branch here is
; predicated on B2, so when B2 == 0 nothing is written and execution falls
; through to B_END.
; NOTE(review): the sd and old loads below are unconditional, so they also
; execute on the final fall-through pass and read ahead of the pair being
; processed -- confirm the caller's buffers tolerate the read-ahead.
ILOOP:
	ADD	.L2X	B7,	A0,	B10	; m0 = sd0 + sd1,	i+1	
||	SUB	.L1X	B7,	A0,	A10	; m1 = sd0 - sd1,	i+1	
||	LDH	.D2	*B8++[2],	B7	; sd0 = sd[2 * i],	i+1
||	LDH	.D1	*A9++[2],	A0	; sd1 = sd[2 * i + 1],	i+1
	
  [B2]	STH	.D1	B10,	*+A8[0]		; m[0] =  m0,		i+1
||[B2]	STH	.D2	A10,	*+B9[1]		; m[1] =  m1,		i+1
	
  [B2]	STH	.D1	B10,	*+A8[5]		; m[5] =  m0,		i+1
||[B2]	STH	.D2	A10,	*+B9[4]		; m[4] =  m1,		i+1

  [B2]	STH	.D1	B10,	*+A8[11]	; m[11] = m0,		i+1
||[B2]	STH	.D2	A10,	*+B9[10]	; m[10] = m1,		i+1

  [B2]	STH	.D1	B10,	*+A8[14]	; m[14] = m0,		i+1
||[B2]	STH	.D2	A10,	*+B9[15]	; m[15] = m1,		i+1
||	SUB	.L2	0,	B10,	B10	; negate for store,	i+1
||	SUB	.S1	0,	A10,	A10	; negate for store,	i+1
||	MPY	.M2	1,	B10,	B5	; for first pass (B5 = m0, read before the negate)
||[B2]	B	.S2	JLOOP			; for j,		i+1

  [B2]	STH	.D1	B10,	*+A8[2]		; m[2] = -m0,		i+1
||[B2]	STH	.D2	A10,	*+B9[3]		; m[3] = -m1,		i+1

  [B2]	STH	.D1	B10,	*+A8[7]		; m[7] = -m0,		i+1
||[B2]	STH	.D2	A10,	*+B9[6]		; m[6] = -m1,		i+1

  [B2]	STH	.D1	B10,	*+A8[9]		; m[9] = -m0,		i+1
||[B2]	STH	.D2	A10,	*+B9[8]		; m[8] = -m1,		i+1
||	ADD	.L1X	2,	B4,	A5	; copy old (A5 = old + 2 bytes = &old[1])
||[B2]	B	.S2	JLOOP			; for j (second branch to JLOOP),	i+1

  [B2]	STH	.D1	B10,	*+A8[12]	; m[12] = -m0,		i+1
||[B2]	STH	.D2	A10,	*+B9[13]	; m[13] = -m1,		i+1
||	MVK	.S2	16,	B1		; j = 0 (B1 = 16, JLOOP branch count),	i+1
||	SUB	.L2	B2,	1,	B2	; i++ (B2 counts down)

	LDH	.D2	*B4++[2],	A7	;**** old0 = old[2*j]
||	LDH	.D1	*A5++[2],	B3	;**** old1 = old[2*j+1]
||	MVK	.S1	3,	A2		; setup priming count,	i+1
; End of ILOOP

B_END:
*** END Benchmark Timing ***

; Epilogue: restore the saved registers in the reverse of the push order,
; again two per packet (B-side via A8, A-side via B15), and return through
; B2, which receives the return address parked in A13.
	LDW	.D2	*++B15,	B10		; pop B10 off stack
||	MV	.L1X	B15,	A8		; copy stack pointer
||	MV	.L2X	A13,	B2		; move return address

	LDW	.D1	*++A8[3],	B11	; pop B11 off stack
||	LDW	.D2	*++B15, 	A10	; pop A10 off stack

	LDW	.D1	*++A8[2],	B12	; pop B12 off stack
||	LDW	.D2	*++B15[2], 	A11	; pop A11 off stack

	LDW	.D1	*++A8[2],	B13	; pop B13 off stack
||	LDW	.D2	*++B15[2], 	A12	; pop A12 off stack
||	B	.S2	B2			; return

	LDW	.D1	*++A8,		A13	; pop A13 off stack
||	LDW	.D2	*++B15[3], 	B14	; pop B14 off stack

	NOP	4				; wait out the return branch and the last pops
