*===============================================================================
*
*	TEXAS INSTRUMENTS, INC.
*
*	GSM RATE-1/2 CONVOLUTIONAL DECODER (VITERBI ALGORITHM, SOFT DECISION)
*
*	Revision Date: 04/22/97
*
*	USAGE This routine is C callable and can be called as
*
*		void vitgsm(int n, short old[], short new_s[], 
*			    short trans[], short m[], short sd[])
*			
*		n 	--- decoder input length 		(input)
*		old[]	--- state metrics at previous instant	(input)
*		new_s[] --- state metrics at current instant	(output)
*		trans[] --- path transition of each state	(output)
*		m[]	--- local variable array to store path metrics
*		sd[]	--- soft decision values of input bit stream (input)
*			    
*	C CODE
*		This is the C equivalent of the Assembly Code without the
*		assumptions listed below. Note that the assembly code is hand
*		optimized and assumptions apply.
*	
*		void vitgsm(int n, short old[], short new_s[], 
*			    short trans[], short m[], short sd[])
*		{
*		  int   i, j, k, a, b, m0, m1, mj;
*		  short *tmp;
*		  for (i = 0; i < n; i++) {
*		    m0 = sd[2 * i] + sd[2 * i + 1];
*		    m1 = sd[2 * i] - sd[2 * i + 1];
*		    m[0] = m[2] = m0;
*		    m[1] = m[3] = -m0;
*		    m[4] = m[6] = m1;
*		    m[5] = m[7] = -m1;
*		    for (j = 0; j < 8; j++) {
*		      mj = m[j];
*		      for (k = 0; k < 16; k += 8) {
*			a = old[2 * j] + mj;
*			b = old[2 * j + 1] - mj;
*			new_s[j + k] = (b > a) ? b : a;
*			trans[i] = (trans[i] << 1) | (b > a);
*			mj = -mj;
*		      }
*		    }
*		    tmp = old;
*		    old = new_s;
*		    new_s = tmp;
*		  }
*		}
*
*	DESCRIPTION
*		This routine is used to decode the convolutional code 
*		employed in GSM full rate system, with Viterbi algorithm
*		and soft decision.
*		The convolutional code is of rate-1/2 and of constraint
*		length K = 5. The generator polynomials are
*			G0(D) = 1 + D^3 + D^4
*			G1(D) = 1 + D + D^3 + D^4
*		and the output sequence of the convolutional encoder
*		is grouped as G0G1 G0G1 ...
*		The input of the convolutional decoder is a sequence of
*		soft decision values generated by the demodulator. The
*		values are 16-bit values between -1 and +1, with -1 for
*		transmitted 1 and +1 for transmitted 0.
*
*	TECHNIQUES
*		Completely unroll the k loop.
*
*	PERFORMANCE COMMENTS
*
*		Limited by 6 ALUs/cycle.
*
*	ASSUMPTIONS
*		1. Rate - 1/2 convolutional decoding
*		2. K = 5
*		3. With the generator polynomials in DESCRIPTION
*
*	MEMORY NOTE:
*		m must be offset from new_s by an odd number of halfwords 
*		to avoid a conflict in j loop. 
*		Memory bank hits still exist once every 4 iterations of 
*		the i loop (i.e. ceil(n/4) cycles.)
*
*	CYCLES		38*n + 12 + n/4
*
*===============================================================================
	.global _vitgsm
	.text

_vitgsm:
*-------------------------------------------------------------------------------
*	Entry registers as consumed below:
*		A4 = n, B4 = old, A6 = new_s, B6 = trans, A8 = m, B8 = sd,
*		B3 = return address, B15 = stack pointer.
*
*	A10-A12 and B10-B14 are saved on the stack in the prolog and reloaded
*	in the epilog.  Eight words are used, from the word at the incoming
*	B15 down to 28 bytes below it; B15 leaves with its incoming value.
*	Stack layout (byte offsets from the incoming B15):
*		  0: B14   -4: B13   -8: A12  -12: B12
*		-16: A11  -20: B11  -24: A10  -28: B10
*
*	n <= 0 is treated as "no work": the i counter is clamped to 0, no
*	stores to new_s/trans/m are made, and the saved registers are
*	restored as usual.
*
*	NOTE(review): the pipelined loads run ahead of the data actually
*	consumed (sd is fetched one pair ahead of each ILOOP pass, and every
*	JLOOP pass, priming and drain included, issues loads from old and m),
*	so these arrays are read somewhat past their last used element.
*	Confirm that callers keep that memory readable.
*-------------------------------------------------------------------------------

*** BEGIN Benchmark Timing ***
B_START:

* The branch below has five delay slots; the five execute packets that follow
* it (register saves and first sd loads) run before control reaches ILOOP.
	B	.S1	ILOOP		; branch to setup code
||	ADD	.L1X	2,B8,A9		; A9 = &sd[1] (odd-sample pointer)
||	STW	.D2	B14,*B15--	; push B14 on stack
||	CMPLT	.L2X	0,A4,B2		; B2 = (n > 0), gates the copy of n

	LDH	.D2	*B8++[2],B7	; load sd0  = sd[2 * i]
||	LDH	.D1	*A9++[2],A0	; load sd1  = sd[2 * i + 1]
||[B2]	MV	.L2X	A4,B2		; i count = n if n > 0, else stays 0
||	SUB	.L1X	B15,4,A7	; A-side stack pointer, one word lower

	MV	.L2X	A8,B9		; copy m
||	STW	.D1	A12,*A7--[2]	; push A12 on stack
||	STW	.D2	B13,*B15--[2]	; push B13 on stack

	STW	.D1	A11,*A7--[2]	; push A11 on stack
||	STW	.D2	B12,*B15--[2]	; push B12 on stack

	MVK	.S1	46,A3		; old->new swap constant
||	STW	.D1	A10,*A7		; push A10 on stack
||	STW	.D2	B11,*B15--[2]	; push B11 on stack

	STW	.D2	B10,*B15	; push B10 on stack

* Software-pipelined j loop, three execute packets per pass.  Passes are
* started by the two branches issued in ILOOP and by the B1-counted branch
* below.  While the priming count A2 is non-zero the new_s stores and the
* tr accumulation are suppressed.
JLOOP:
 [B1]	B	.S1	JLOOP		;** for j
||[B1]	SUB	.S2	B1,1,B1		; count down remaining j branches
||[!A2]	STH	.D1	B12,*+A6[8]	; store new[j+8] = a8
||[!A2]	ADD	.D2	B0,B14,B14	; tr  |= t8
||	CMPGT	.L1	A11,A10,A1	; t0 = (b0 > a0)
||	CMPGT	.L2	B11,B10,B0	; t8 = (b8 > a8)
||	MPY	.M1X	1,B5,A4		; copy mj

 [A2]	SUB	.S1	A2,1,A2		; decrement priming
||[!A2]	STH	.D1	A12,*A6++	; store new[j] = a0
||[A1]	ADD	.S2	2,B0,B0		; t8  |= (t0 << 1)
||[B0]	MPY	.M2	1,B11,B12	; if (t8) a8 = b8
||	MPY	.M1	1,A10,A12	; copy a0
||	SUB	.L2X	A7,B5,B10	; a8 = old0 - mj
||	LDH	.D2	*++B9,B5	; load mj = m[j]

	SHL	.S2	B14,2,B14	; tr <<= 2
||[A1]	MPY	.M1	1,A11,A12	; if (t0) a0 = b0
||	ADD	.S1	A7,A4,A10	; a0 = old0 + mj
||	SUB	.L1X	B13,A4,A11	; b0 = old1 - mj
||	ADD	.L2	B13,B5,B11	; b8 = old1 + mj
||	MPY	.M2	1,B10,B12	; copy a8
||	LDH	.D2	*B4++[2],A7	;* load old0 = old[2*j]
||	LDH	.D1	*A5++[2],B13	;* load old1 = old[2*j+1]
; end of JLOOP

* Drain of the j loop: last pair of new_s stores, trans[i] store, and the
* old/new_s pointer swap for the next i pass.
	STH	.D1	B12,*+A6[8]	; new[j+8] = a8
||	SUB	.D2	B9,20,B9	; rewind m pointer to m[0]
||	ADD	.S2	B0,B14,B14	; tr  |= t8
||	SUB	.L2X	A6,14,B4	; old = base of new_s just written

	STH	.D1	A12,*A6		; new[j] = a0
||	SUB	.L1	A5,A3,A6	; new_s = base of old just read
||	STH	.D2	B14,*B6++	; trans[i] = tr, advance to trans[i+1]

* Per-i setup.  B2 holds the number of i passes still to run; when it is 0
* the branches and m[] stores are skipped and control falls through to the
* epilog at B_END.
ILOOP:
 [B2]	B	.S2	JLOOP		; for j 
||	ADD	.L1X	B7,A0,A10	; m0 = sd0 + sd1	
||	SUB	.L2X	A0,B7,B10	; -m1 = sd1 - sd0	
||	LDH	.D2	*B8++[2],B7	;* load sd0  = sd[2 * i]	
||	LDH	.D1	*A9++[2],A0	;* load sd1  = sd[2 * i + 1]

	MPY	.M2X	1,A10,B5	; mj = m0 for first j pass
||[B2]	STH	.D1	A10,*+A8[0]	; store m[0] =  m0
||[B2]	STH	.D2	B10,*+B9[7]	; store m[7] = -m1
	
 [B2]	STH	.D1	A10,*+A8[2]	; store m[2] =  m0
||[B2]	STH	.D2	B10,*+B9[5]	; store m[5] = -m1
||	SUB	.L1	0,A10,A10	; negate for store
||	SUB	.L2	0,B10,B10	; negate for store

 [B2]	B	.S2	JLOOP		;* for j
||[B2]	STH	.D1	A10,*+A8[1]	; store m[1] = -m0
||[B2]	STH	.D2	B10,*+B9[6]	; store m[6] =  m1
||	ADD	.L1X	2,B4,A5		; A5 = &old[1]

 [B2]	STH	.D1	A10,*+A8[3]	; store m[3] = -m0
||[B2]	STH	.D2	B10,*+B9[4]	; store m[4] =  m1
||	MVK	.S2	8,B1		; j branch count = 8
||	SUB	.L2	B2,1,B2		; one fewer i pass remaining

	LDH	.D2	*B4++[2],A7	; load old0 = old[2*j]
||	LDH	.D1	*A5++[2],B13	; load old1 = old[2*j+1]
||	MVK	.S1	3,A2		; setup priming count
; end of ILOOP

B_END:
*** END Benchmark Timing ***

* Epilog.  The A-side restore pointer is derived from B15 here, on every
* path, so the A10-A12 reloads never depend on how many i passes ran.
* The ADD reads B15 before the post-increment of the parallel LDW.
	LDW	.D2	*B15++[2],B10	; pop B10 off stack
||	ADD	.L1X	B15,4,A8	; A8 = address of saved A10

	LDW	.D2	*B15++[2],B11	; pop B11 off stack
||	LDW	.D1	*A8++[2],A10	; pop A10 off stack

	LDW	.D2	*B15++[2],B12	; pop B12 off stack
||	LDW	.D1	*A8++[2],A11	; pop A11 off stack

	LDW	.D2	*B15++,B13	; pop B13 off stack
||	LDW	.D1	*A8,A12		; pop A12 off stack
||	B	.S2	B3		; return

	LDW	.D2	*B15,B14	; pop B14 off stack

	NOP	4			; fill remaining branch delay slots
