*==============================================================================
*
*	TEXAS INSTRUMENTS, INC.		
*
*	CODEBOOK SEARCH - FOR VSELP
*
*	Revision Date:  08/06/97
*	
*	USAGE	This routine is C Callable and can be called as:
*		
*		int v_srch(int numBasis, short *R, short *wiPtr,short *TABLE,
*			   short * wBasisPtr,short *D)
*
*		numBasis   --- number of weighted basis vectors
*		*R	   --- array of Rm values, the cross correlations 
*	                       between weighted speech and weighted basis 
*                              vectors
*		*wiPtr     --- weighted speech vectors
*		*TABLE     --- table of codewords 
*		*wBasisPtr --- weighted basis vectors
*		*D	   --- matrix of Dmj values, the cross correlations 
*                              between the weighted basis vectors
*
*		If the routine is not called from C, the caller must load
*		every argument into the register that the compiler's calling
*		convention assigns to it before branching here, because the
*		code reads its inputs directly from those registers (refer
*		to the C compiler reference guide).
*
*	C CODE
*
*		The C source code for this was written by Motorola Systems 
*		Research Laboratories and is authorized by Motorola for the
*		use of development of North American digital cellular 
*		standards.  As such, the C code cannot be shown here.
*
*	DESCRIPTION
*
*		Performs VSELP vocoder codebook search. This routine performs 
*		the entire v_srch.c function as written by Motorola. It 
*		involves calculating correlations between weighted basis 
*		vectors and weighted speech vector (Rm's), C0, and 
*		0.25 * sum of Djj for G0.  It then calculates all Dmj and 
*		finishes calculating G0. It then initializes the best vector
*		to be code vector zero and performs search by finding the 
*		vector that produces the highest C^2/G value.
*		
*	TECHNIQUES
*	
*		The loops of the code are unrolled.
*
*	MEMORY NOTE
*
*		Vectors wiPtr and wBasisPtr have to be aligned on opposite word 
*		boundaries to avoid memory hits.
*
*	CYCLES
*
*		loop 1		loop 2		loop 3
*
*		  342		  639		 2087
*
*		Total Cycles = 3068
*
*==============================================================================

	.global _v_srch
	.text
*------------------------------------------------------------------------------
* _v_srch -- VSELP codebook search (hand-scheduled, software pipelined)
*
* Register interface as used by the code below:
*	A4 = numBasis	B4 = R		A6 = wiPtr
*	B6 = TABLE	A8 = wBasisPtr	B8 = D
*	B3 = return address, B15 = stack pointer
*	Result (wordSave) is left in A4.
*
* Saved and restored here: A10-A15, B10-B14 and B3 (12 words).
* Frame layout after the prologue, as byte offsets from the entry value
* of B15 (the B15 chain holds the B-side registers, the A9 chain the
* A-side registers):
*
*	 -4 B13	  -8 A10   -12 B14   -16 A11   -20 B11   -24 A12
*	-28 B12	 -32 A13   -36 B10   -40 A15   -44 A14   -48 B3
*
* The epilogue must read every register back from the slot listed above;
* its loads are overlapped with the tail of the last OUTLOOP3 iteration,
* so their destination registers and packet positions are timing critical.
*
* Hard-coded immediates: the inner trip counts (18, 8), the OUTLOOP3
* count (62), the 80-byte vector stride and the inversion mask 127 are
* literals, so the numBasis argument only partly parameterizes the
* routine; the mask value 127 corresponds to numBasis = 7.
*------------------------------------------------------------------------------
_v_srch:

	ADD	.L1X	B15, 0, A9	; A9 = entry SP (second push pointer)
||	STW	.D2	B13,  *--B15	; push B13 onto stack	(SP-4)
			
	STW	.D2	B14, *--B15[2]	; push B14 onto stack	(SP-12)
||	STW	.D1	A10, *--A9[2]	; push A10 onto stack	(SP-8)

	STW	.D2	B11, *--B15[2]	; push B11 onto stack	(SP-20)
||	STW	.D1	A11, *--A9[2]	; push A11 onto stack	(SP-16)

	STW	.D2	B12, *--B15[2]	; push B12 onto stack	(SP-28)
||	STW	.D1	A12, *--A9[2]	; push A12 onto stack	(SP-24)

	STW	.D1	A13, *--A9[2]	; push A13 onto stack	(SP-32)
||	STW	.D2	B10,*--B15[2]	; push B10 onto stack	(SP-36)
*** BEGIN Benchmark Timing ***
B_START:

	LDH	.D2	*++B4,B0	; R++; R_tmp = *R;
||	MV	.L1	A8,A10		; tmpPtr = wBasisPtr
||	MVK	.S1	0,A3		; C = 0.0
||	ADD	.L2X	0,A6,B14	; tmpPtr2 = wiPtr

	LDW	.D1	*A10++,A0	; *tmpPtr++,
||	LDW	.D2	*B14++,B5	; *tmpPtr2++,

	MVK	.S2	0,B1		; p0 = 0
||	B	.S1	LOOP1		; for (endPtr2 = tmpPtr2 + S_LEN;
||	STW	.D1	A15,*--A9[2]	; push A15 onto stack	(SP-40)
||	ZERO	.L1	A5		; G = 0.0
||	MV	.L2X	A4,B2		; outer lp cntr

	LDW	.D1	*A10++,A0	; *tmpPtr++,
||	LDW	.D2	*B14++,B5	; *tmpPtr2++,
||	ADD	.S2	-1,B2,B2	; dec outer lp cntr


	MVK	.S1	18,A1		; lp cntr =18 for 21 iterations of loop
||	B	.S2	LOOP1		; for (endPtr2 = tmpPtr2 + S_LEN;
||	STW	.D2	B3,*--B15[3]	; push B3 onto stack	(SP-48)
||	STW	.D1	A14,*--A9	; push A14 onto stack	(SP-44)
||	MPY	.M1	A13,0,A13	; p1 = 0
||	ADD	.L1	1,A4,A7		; Ddim = numBasis + 1

	LDW	.D1	*A10++,A0	; *tmpPtr++,
||	LDW	.D2	*B14++,B5	; *tmpPtr2++,
||	SHL	.S2	B0,15,B0	; R_tmp = (*R << 15);

LOOP1:					; LOOP1A BEGINS HERE
	MPYH	.M2X	A0,B5,B1	; p0 = *tmpPtr * *tmpPtr2,
||	MPYH	.M1	A0,A0,A13	; p1 = *tmpPtr * *tmpPtr,
|| [B1] ADD	.L2	B0,B1,B0	; R_tmp += p0,
||	ADD	.L1	A5,A13,A5	; G += p1,
|| [A1] B	.S1	LOOP1		; for (endPtr2 = tmpPtr2 + S_LEN;

	LDW	.D1	*A10++,A0	; *tmpPtr++,
||	LDW	.D2	*B14++,B5	; *tmpPtr2++,
||	MPY	.M2X	A0,B5,B1	; p0 = *tmpPtr * *tmpPtr2,
||	MPY	.M1	A0,A0,A13	; p1 = *tmpPtr * *tmpPtr,
|| 	ADD	.L2	B0,B1,B0	; R_tmp += p0,
||	ADD	.L1	A5,A13,A5	; G += p1,
|| [A1]	ADD	.S1	-1,A1,A1
; end of LOOP1

	SHR	.S2	B0,14,B0	; *R = (R_tmp >> 14);
|| [B2] B	.S1	LOOP1		; for (endPtr = tmpPtr+numBasis * S_LEN
|| [B2] LDH	.D2	*+B4[1],B0	; R_tmp = *R
||	ADD	.L2X	0,A6,B14	; tmpPtr2 = wiPtr
||	SUB	.D1	A10,16,A10	; rewind tmpPtr by 16 bytes (4 words);
					; presumably backs out the words fetched
					; ahead by the pipelined loads
||	SUB	.L1X	A3,B0,A3	; C -= R_tmp;

   [B2] LDW	.D1	*A10++,A0	; *tmpPtr++,
|| [B2] LDW	.D2	*B14++,B5	; *tmpPtr2++,

	ZERO	.L1	A13		; p1 = 0
||	ZERO	.L2	B1		; p0 = 0
|| [B2] B	.S1	LOOP1		; for (endPtr2 = tmpPtr2 + S_LEN;
||	STH	.D2	B0,*B4++	; *R = R_tmp;  R++;
||[!B2]	MVK	.S2	8,B9		; used to calculate new D + m*Ddim + j 
					; in START2


   [B2] LDW	.D1	*A10++,A0	; *tmpPtr++,
|| [B2] LDW	.D2	*B14++,B5	; *tmpPtr2++,
||	MVK	.S1	18,A1		; lp cntr = 18 for 21 iterations of loop

   [B2] B	.S1	LOOP1		; for (endPtr2 = tmpPtr2 + S_LEN;
||[!B2] MVK	.S2	80,B0		; used for indexing S_LEN in next loop
||[!B2]	ADD	.L1	0,A8,A10	; tmpPtr = wBasisPtr  +	 (m-1) * S_LEN 
					; for next loop
   
   [B2] LDW	.D1	*A10++,A0	; *tmpPtr++,
|| [B2] LDW	.D2	*B14++,B5	; *tmpPtr2++,
|| [B2]	ADD	.L2	-1,B2,B2	; dec outer lp cntr
|| [B2] SHL	.S2	B0,15,B0	; R_tmp = (*R << 15);
||	ADD	.L1	1,A4,A7		; Ddim = numBasis + 1

START2:
	LDW	.D1	*A10++,A13	; *tmpPtr,
||	SUB	.L1X	A10,B0,A15	; A15 = wBasisPtr  +  (m-2) * S_LEN
||	SHL	.S1	A7,1,A13	; used to calculate D + Ddim
||	ADD	.L2X	A8,B0,B14	; tmpPtr2 = wBasisPtr  +  (j-1) * S_LEN for next loop


	LDW	.D1	*A10++,A0	; *tmpPtr,
||	LDW	.D2	*B14++,B10	; *tmpPtr2,
||	SUB	.L2X	B4,A13,B4	; R -= Ddim
||	MV	.S2	B14,B12		; B12 = wBasisPtr  +  (j-1) * S_LEN

	LDW	.D1	*A10++,A13	; *tmpPtr,
||	LDW	.D2	*B14++,B2	; *tmpPtr2,
||	ADD	.L2X	B8,A13,B11	; D + m*Ddim
||	B	.S1	LOOP2		; for LOOP2
||	MVK	.S2	2,B13		; used for j lp cntr
||	ZERO	.L1	A11		; p0 = 0
||	MPY	.M1X	B0,1,A14	; used for indexing S_LEN in next loop

	LDW	.D1	*A10++,A0	; *tmpPtr,
||	LDW	.D2	*B14++,B10	; *tmpPtr2,
||	ADD	.S2	4,B11,B11	; D + m*Ddim + j = D + 8 + 2
||	MV	.L2X	A4,B5		; move numBasis to other reg file
||	ZERO	.L1	A12		; Dcurrent0 = 0
||	MVK	.S1	8,A9		; used to  calculate new D + m*Ddim + j

	LDW	.D1	*A10++,A13	; *tmpPtr,
||	LDW	.D2	*B14++,B2	; *tmpPtr2,
||	ADD	.L1	-1,A4,A2	; for m lp cntr = numBasis-1
||	SUB	.L2X	A4,B13,B1	; for j lp cntr = numBasis-2
||	MVK	.S2	0,B7		; Dcurrent1 = 0
||	B	.S1	LOOP2		; for LOOP2

LOOP2B:
	LDW	.D1	*A10++,A0	; *tmpPtr,
||	LDW	.D2	*B14++,B10	; *tmpPtr2,
|| [A2] MVK	.S1	8,A1		; lp cntr = 8 for 11 iterations of loop
||[!A2] SUB	.L1	A1,A1,A1	; avoid last Branch to LOOP2
||	MVK	.S2	0,B5		; p1 = 0
||[!B1] ADD	.L2	1,B13,B13	; used for j lp cntr

LOOP2:					; LOOP BEGINS HERE
	LDW	.D1	*A10++,A13	; *tmpPtr,
||	LDW	.D2	*B14++,B2	; *tmpPtr2,
||	MPY	.M1X	A13,B10,A11	; p0 = *tmpPtr * *tmpPtr2,
||	MPYH	.M2X	A13,B10,B5	; p1 = *tmpPtr * *tmpPtr2,
||	ADD	.L1	A12,A11,A12	; Dcurrent0 += p0,
||	ADD	.L2	B7,B5,B7	; Dcurrent1 += p1,
|| [A1] B	.S1	LOOP2		; for endPtr = tmpPtr+S_LEN

	LDW	.D1	*A10++,A0	; *tmpPtr,
||	LDW	.D2	*B14++,B10	; *tmpPtr2,
||	MPY	.M1X	A0,B2,A11	; p0 = *tmpPtr * *tmpPtr2,
||	MPYH	.M2X	A0,B2,B5	; p1 = *tmpPtr * *tmpPtr2,
||	ADD	.L1	A12,A11,A12	; Dcurrent0 += p0,
||	ADD	.L2	B7,B5,B7	; Dcurrent1 += p1,
|| [A1] ADD	.S1	-1,A1,A1	; i--

; end of LOOP2

	ADD	.L1X	A12,B7,A12	; Dcurrent0 += Dcurrent1,
||	ADD	.L2X	A12,B7,B3	; Dcurrent0 += Dcurrent1,
||	ADD	.D1	A15,A14,A10	; tmpPtr = wBasisPtr  +	 (m-1) * S_LEN
||	ADD	.D2	B12,B0,B14	; tmpPtr2 = wBasisPtr  +  (j-1) * S_LEN

	LDW	.D1	*A10++,A13	; *tmpPtr,
|| [A2] B	.S1	LOOP2B		; for j = m+1 OR for m = 1

	SHL	.S1	A12,1,A12	; 2.0 * Dcurrent
||	SHR	.S2	B3,13,B3	; scale 4.0 * Dcurrent
||	LDW	.D2	*B14++,B10	; *tmpPtr2,
||	LDW	.D1	*A10++,A0	; *tmpPtr,

	ADD	.L1	A5,A12,A5	; G += 2.0 * Dcurrent
||	STH	.D2	B3,*B11		; *(D + m*Ddim + j) = 4.0 * Dcurrent	
|| [B1] ADD	.S2	2,B11,B11	; D + m*Ddim + j  (inc by 1 16 bit wd)
||[!B1] ADD	.S1	-1,A2,A2	; for m lp cntr = numBasis-1
||[!B1] ADD	.L2X	B11,A9,B11	; D + m*Ddim + j  (inc by 1 16 bit wd)
||[!B1] ADD	.D1	2,A9,A9		; inc index for D + m*Ddim + j calc

	LDW	.D1	*A10++,A13	; *tmpPtr,
||	LDW	.D2	*B14++,B2	; *tmpPtr2,
||	MPY	.M1	0,A11,A11	; p0 = 0
||	MPY	.M2	0,B5,B5		; p1 = 0
|| [A2] B	.S1	LOOP2		; for LOOP2
||[!B1] SUB	.L2X	A4,B13,B1	; for j lp cntr = numBasis-B13
|| [B1] ADD	.S2	-1,B1,B1	; dec for j lp cntr

	LDW	.D1	*A10++,A0	; *tmpPtr,
||	LDW	.D2	*B14++,B10	; *tmpPtr2,
||	MPY	.M1	0,A12,A12	; Dcurrent0 = 0
||	MPY	.M2	0,B7,B7		; Dcurrent1 = 0
|| [B1] ADD	.S2	-8,B14,B12	; update B12 pointer
||[!B1] ADD	.L1	-12,A10,A15	; update A15 pointer
||[!A2] B	.S1	START3		; for m = 1 complete

	LDW	.D1	*A10++,A13	; *tmpPtr,
||	LDW	.D2	*B14++,B2	; *tmpPtr2,
||[!B1] ADD	.L2X	B0,A15,B12	; update B12 pointer
|| [A2] B	.S1	LOOP2		; for LOOP2

; end of OUTLOOP2


START3:
	ADD	.L1X	6,B6,A10	; A10 = TABLE + 3 halfwords: address of
					; the next bitChanged entry (TABLE[3])
||	SHR	.S1	A3,16,A15	; C>>16
||	LDH	.D2	*+B6[1],A13	; bitChanged = *++intPtr
||	MV	.L2X	A4,B12		; keep numBasis in B12 (A4 is reused
					; for wordSave from here on)

	LDH	.D2	*B6++[2],B13	; codeWord = *intPtr
||	MPY	.M1	A15,A15,A8	; cSqrdBest = (C>>16) * (C>>16)
||	SHR	.S1	A5,16,A5	; G = G>>16
||	ZERO	.D1	A4		; wordSave = 0;

	MV	.L2X	A3,B14		; cSave = C
||	MVK	.S1	1,A1		; 
||	MV	.L1	A5,A15		; gBest = G


	MVK	.S1	62,A0		; set outer loop3 counter

	ADD	.L1X	2,B4,A12	; R + 1
||	SHR	.S1	A8,16,A8	; cSqrdBest = ((C>>16) * (C>>16)>>16)

	SHL	.S1	A1,A13,A14	; mask = 0x1 << bitChanged
||	LDH	.D1	*+A12[A13],A11	; *(R + bitChanged)
||	ADD	.L1	1,A13,A12	; bitChanged++

OUTLOOP3:

	AND	.L2X	B13,A14,B2	; codeWord & mask
||	ADD	.L1	-1,A12,A1	; loop counter = bitChanged - 1
||	ADD	.D1	A7,A12,A12	; j*Ddim + bitChanged

	SHL	.S1	A12,1,A12	; used to scale offset
||	ADD	.L1	1,A13,A13	; bitChanged++
||	MV	.L2X	A7,B10		; copy Ddim to other reg file
|| [B2]	MVK	.S2	1,B2		; theta = !(!(codeWord & mask)) 

	ADD	.L2X	B8,A12,B7	; D + j*Ddim + bitChanged
|| [A1]	ADD	.D1	-1,A1,A1	; decrement counter
||[!A1] SUB	.L1	A2,A2,A2	; prevents first LD from executing
|| [A1]	MVK	.S1	1,A2		; allows first LD to execute

   [A2]	LDH	.D2	*B7++[B10],B9	; *(D + j*Ddim + bitChanged)
||	MVK	.S1	1,A2		; tmpMask = 1;
|| [A1] B	.S2	LOOP3A		; for j=1
||	MV	.L2	B2,B5		; theta

	AND	.L2X	B13,A2,B0	; codeWord & tmpMask
|| [A1]	ADD	.D1	-1,A1,A1	; decrement counter
||	MVK	.S2	0,B9		; zero initial load value
||	SHL	.S1	A2,2,A2		; tmpMask <<= 1

   [B2]	ADD	.D1	A3,A11,A3	; C += *(R + bitChanged)
||[!B2] SUB	.L1	A3,A11,A3	; C -= *(R + bitChanged)
||	LDH	.D2	*B7++[B10],B9	; *(D + j*Ddim + bitChanged)
|| [B0]	MVK	.S2	1,B2		; !(!(codeWord & tmpMask))
||[!B0] SUB	.L2	B2,B2,B2	; !(!(codeWord & tmpMask))
|| [A1]	B	.S1	LOOP3A		; for j=1
||	MPY	.M1	A13,A7,A12	; bitChanged * Ddim

	AND	.L2	B13,2,B0	; codeWord & tmpMask
|| [A1]	ADD	.D1	-1,A1,A1	; decrement counter

LOOP3A:
	LDH	.D2	*B7++[B10],B9	; *(D + j*Ddim + bitChanged)
|| [B0] MVK	.S2	1,B2		; !(!(codeWord & tmpMask))
||[!B0] MPY	.M2	0,B2,B2		; !(!(codeWord & tmpMask))
||	CMPEQ	.L2	B5,B2,B1	; (theta == !(!(codeWord & tmpMask)))
|| [B1] ADD	.L1X	A5,B9,A5	; G += *(D + bitChanged*Ddim + j)
|| [A1] B	.S1	LOOP3A		; for j=1

	SHL	.S1	A2,1,A2		; tmpMask <<= 1
||[!B1] SUB	.L1X	A5,B9,A5	; G -= *(D + bitChanged*Ddim + j)
||	AND	.L2X	B13,A2,B0	; codeWord & tmpMask
|| [A1] ADD	.D1	-1,A1,A1	; decrement counter

; end of LOOP3A

	ADD	.L2X	1,A13,B7	; j = bitChanged + 1
||	SHL	.S1	A12,1,A12	; used to scale offset
|| [B1] ADD	.L1X	A5,B9,A5	; G += *(D + bitChanged*Ddim + j)
||	MVK	.S2	0,B1		; initialize condreg

	ADDAH	.D2	B8,B7,B7	; j + D
||	SUB	.L1X	B12,A13,A1	; loop cntr = numBasis - bitChanged

	ADD	.L2X	B7,A12,B7	; D + bitChanged*Ddim + j
|| [A1]	ADD	.D1	-1,A1,A1	; loop cntr = numBasis - (bitChanged+1)
||	SHR	.S1	A3,16,A12	; C>>16

	LDH	.D2	*B7++,B9	; *(D + bitChanged*Ddim + j)
|| [A1]	B	.S2	LOOP3B		; for bitChanged+1
||	MPY	.M1	A12,A12,A11	; (C>>16) * (C>>16)

	SHL	.S1	A14,1,A2	; tmpMask = mask << 1;
|| [A1]	ADD	.D1	-1,A1,A1	; decrement counter
||	MPY	.M2X	A0,1,B2		; move outer lp cntr to B2

	LDH	.D2	*B7++,B9	; *(D + bitChanged*Ddim + j)
||	AND	.L2X	B13,A2,B0	; codeWord & tmpMask
|| [A1]	B	.S2	LOOP3B		; for j = bitChanged+1
||	SHR	.S1	A11,16,A11	; (C>>16) * (C>>16) >> 16
||[!B2]	ADD	.L1X	4, B15, A9	; last pass: A9 = B15 + 4 = address of
					; the saved A14 slot (SP-44)

   [B0]	MVK	.S2	1,B0		; !(!(codeWord & tmpMask))
||	MPY	.M1	A11,A15,A12	; ((C>>16) * (C>>16) >>16) * gBest
|| [A1]	ADD	.D1	-1,A1,A1	; decrement counter
||	SHL	.S1	A2,1,A2		; tmpMask <<= 1

LOOP3B:
	LDH	.D2	*B7++,B9	; *(D + bitChanged*Ddim + j)
||	SUB	.S2	B5,B0,B1	; (theta == !(!(codeWord & tmpMask)))
||	AND	.L2X	B13,A2,B0	; codeWord & tmpMask
|| [B1]	SUB	.L1X	A5,B9,A5	; G -= *(D + bitChanged*Ddim + j)
|| [A1]	B	.S1	LOOP3B		; for j = bitChanged+1

   [B0]	MVK	.S2	1,B0		; !(!(codeWord & tmpMask))
||[!B1] ADD	.L1X	A5,B9,A5	; G += *(D + bitChanged*Ddim + j)
|| [A1]	ADD	.D1	-1,A1,A1	; decrement counter
||	SHL	.S1	A2,1,A2		; tmpMask <<= 1


; end of LOOP3B

; The register pops below are folded into the last OUTLOOP3 pass ([!B2]).
; B15 walks the B-side save chain and A9 walks the A-side save chain, so
; each register is reloaded from the slot the prologue stored it in.  The
; destination register of every load and the packet it sits in are kept
; fixed: the loop code still reads or writes several of these registers
; while the loads are in flight.

   [B2]	B	.S2	OUTLOOP3	; for iePtr = intPtr
|| [B2] LDH	.D2	*B6++[2],B13	; codeWord = *intPtr
|| [B2] LDH	.D1	*A10++[2],A13	; bitChanged = *++intPtr
|| [B1]	SUB	.L1X	A5,B9,A5	; G -= *(D + bitChanged*Ddim + j)

	MPY	.M1	A8,A5,A6	; cSqrdBest * G
||[!B2]	LDW	.D2	*B15++[3], B3	; pop B3 off stack	(SP-48)
||[!B2]	LDW	.D1	*+A9[1],   A15	; pop A15 off stack	(SP-40), A9 kept

	ADD	.L1	-1,A0,A0	; dec OUTLOOP3 counter
||	MVK	.S2	127,B11		; load (1<<numBasis) - 1
||	MVK	.S1	1,A14		;
||[!B2]	LDW	.D2	*B15++[2], B10	; pop B10 off stack	(SP-36)
||[!B2]	LDW	.D1	*A9++[3],  A14	; pop A14 off stack	(SP-44)

	ADD	.S1X	2,B4,A12	; R + 1
||	CMPLT	.L1	A6,A12,A1	; (((C>>16) * (C>>16)) >>16) * gBest
					;  >  cSqrdBest * G)

   [A1] MPY	.M1	A11,1,A8  	; cSqrdBest =(((C>>16) * (C>>16)) >> 16)
|| [A1] MV	.L1X	B13,A4		; wordSave = codeWord
|| [A1]	MV	.S1	A5,A15		; gBest = G
|| [A1]	MV	.S2X	A3,B14		; cSave = C
||[!B2]	LDW	.D2	*B15++[2], B12	; pop B12 off stack	(SP-28)
||[!B2]	LDW	.D1	*A9++[2],  A13	; pop A13 off stack	(SP-32)

	SHL	.S1	A14,A13,A14	; mask = 0x1 << bitChanged
|| 	LDH	.D1	*+A12[A13],A11	; *(R + bitChanged)
||	ADD	.L1	1,A13,A12	; bitChanged++
||	CMPGT	.L2	0,B14,B0	; if (cSave < 0.0)

; end of OUTLOOP3

   [B0]	XOR	.L1X	A4,B11,A4	; wordSave ^=  (1<<numBasis) - 1
||[!B2]	LDW	.D2	*B15++[2], B11	; pop B11 off stack	(SP-20)
||[!B2]	LDW	.D1	*A9++[2],  A12	; pop A12 off stack	(SP-24)

B_END:
*** END Benchmark Timing ***
	LDW	.D2	*B15++[2], B14	; pop B14 off stack	(SP-12)
||	LDW	.D1	*A9++[2],  A11	; pop A11 off stack	(SP-16)
||	B	.S2	B3		; return; the packet and NOP 4 below
					; fill its 5 delay slots

	LDW	.D2	*B15++,    B13	; pop B13 off stack	(SP-4), B15 = SP
||	LDW	.D1	*A9++[2],  A10	; pop A10 off stack	(SP-8)
	NOP	4			; lets the last two loads complete
	

