*===============================================================================
*
*	TEXAS INSTRUMENTS, INC.		
*
*	DOTP_SQR
*
*	Revision Date:  05/07/97
*	
*	USAGE	This routine is C Callable and can be called as:
*		
*		int dotp_sqr(int G,short *wiPtr,short *tmpPtr,int *R, int S_LEN)
*
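*		wiPtr   --- first vector array (S_LEN 16-bit elements)
*		tmpPtr  --- second vector array (S_LEN 16-bit elements)
*		G       --- initial value of G; the square of every tmpPtr
*		            element is added to it and the sum is returned
*		R       --- pointer to the accumulator to which the dot product
*		            of wiPtr and tmpPtr is added (*R is updated in place)
*		S_LEN   --- number of elements in each vector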
*
*		If routine is not to be used as a C callable function then
*		you need to initialize values for all of the values passed
*		as these are assumed to be in registers as defined by the 
*		calling convention of the compiler, (refer to the C compiler
*		reference guide).
*
*
*	C CODE
*		This is the C equivalent of the assembly code.  Note that
*		the assembly code is hand optimized and restrictions may
*		apply.
*
*
*		int dotp_sqr(int G,short *wiPtr,short *tmpPtr,int *R, int S_LEN)
*		{
*		short *tmpPtr2;
*		short *endPtr2;
*
*		tmpPtr2 = wiPtr;
*		for (endPtr2 = tmpPtr2 + S_LEN; tmpPtr2 < endPtr2; tmpPtr2++){
*			*R += *tmpPtr * *tmpPtr2;
*			G += *tmpPtr * *tmpPtr;
*			tmpPtr++;
*			}
*			return(G);
*		}
*
*	DESCRIPTION
*
*		This routine computes an N element dot product of wiPtr and
*		tmpPtr and adds it to the value that R points to.  It also
*		squares each element of tmpPtr and accumulates the squares
*		in G.  The updated G is passed back to the calling function
*		in A4.  G is used in the VSELP coder.
*
*		
*	TECHNIQUES
*
*		Load words are used to load two 16-bit values at a time
*		The loop is unrolled once
*	
*	ASSUMPTIONS
*
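*		n (S_LEN) is an even number and at least 2; the loop counter
*		starts at n - 2 and must count down by 2 to exactly zero.
*		Both vectors are word (32-bit) aligned, because elements are
*		fetched two at a time with LDW.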
*		
*	MEMORY NOTE
*
*		Vectors wiPtr and tmpPtr should be aligned on opposite word
*		boundaries to avoid memory hits.
*
*
*	CYCLES	n + 8	(measured between B_START and B_END)
*		for n = 40 -> 48 cycles or 240 nsec (at a 5 nsec cycle time)
*
*===============================================================================

	.global _dotp_sqr
	.text
*-------------------------------------------------------------------------------
* _dotp_sqr
*
* Adds the dot product of the wiPtr and tmpPtr vectors to *R and adds the sum
* of squares of the tmpPtr vector to G.  Each LDW fetches one word, i.e. two
* 16-bit elements; MPYH handles the upper halves and MPY the lower halves.
*
* Register roles (as used by the code below):
*	A4	G, running sum of squares; holds the return value on exit
*	B4	wiPtr (named tmpPtr2 in the line comments), post-incremented
*	A6	tmpPtr, post-incremented
*	B6	R, address of the dot-product accumulator
*	A8	S_LEN
*	B3	return address
*	A0/B5	most recently arrived word from tmpPtr / wiPtr
*	A5/B1	p1 (square) / p0 (cross product) coming out of the multipliers
*	B0	R_tmp, working copy of *R
*	A1	loop counter: starts at S_LEN - 2, steps by -2, sticks at zero
*	B2	0 until the first LOOP packet has run, 1 afterwards
*
* Scheduling notes:
*	Every group of lines joined by || is one execute packet (one cycle).
*	A branch reaches its target six packets after it is issued, a loaded
*	value is usable five packets after its LDW, and a product is usable
*	two packets after its MPY/MPYH.  The two-packet LOOP body therefore
*	multiplies the word loaded three passes earlier and adds the products
*	of the previous pass, while the branch issued in one pass starts the
*	pass three later.  The body runs S_LEN/2 + 1 times: the extra pass
*	only drains the last products, and its own products are never added.
*
*	Because the loads run ahead of the arithmetic, words beyond the end
*	of each vector are read and ignored (four per vector, counting the
*	LDW packets below).
*
*	NOTE(review): with S_LEN = 0 the counter starts at -2 and does not
*	count down to zero, so S_LEN must be even and at least 2.
*-------------------------------------------------------------------------------
_dotp_sqr:

*** BEGIN Benchmark Timing ***
B_START:

* Packet 1: issue the first pair of loads and set up the loop counter.
	LDW	.D1	*A6++,A0	; A0 = *tmpPtr++ (one word = two elements)
||	LDW	.D2	*B4++,B5	; B5 = *tmpPtr2++ (tmpPtr2 walks wiPtr)
||	SUB     .S1	A8, 2, A1       ; loop cntr = S_LEN - 2

* Packet 2: fetch *R, clear both product registers, issue the branch that
* starts the second pass through LOOP.
	LDW	.D2	*B6,B0		; R_tmp = *R (arrives after the 1st LOOP packet)
||	MVK	.S2	0,B1		; p0 = 0, so early R_tmp adds contribute nothing
||	SUB	.L1	A5,A5,A5	; p1 = 0, so early G adds contribute nothing
||	B	.S1	LOOP		; unconditional: second pass always runs

* Packet 3: second pair of loads.
	LDW	.D1	*A6++,A0	; A0 = *tmpPtr++
||	LDW	.D2	*B4++,B5	; B5 = *tmpPtr2++
||	MVK	.S2	0,B2		; B2 = 0: suppress R_tmp add in 1st LOOP packet

* Packet 4: branch that starts the third pass, only if elements remain.
  [A1]	B	.S2	LOOP		; taken while loop cntr != 0

* Packet 5: third pair of loads.
	LDW	.D1	*A6++,A0	; A0 = *tmpPtr++
||	LDW	.D2	*B4++,B5	; B5 = *tmpPtr2++
||[A1] ADD	.L1	-2,A1,A1	; loop cntr -= 2; left at 0 once it gets there

* LOOP packet 1: multiply upper halves, add products issued one pass earlier.
LOOP:					; LOOP BEGINS HERE
	MPYH	.M2X	A0,B5,B1	; p0 = upper half of A0 * upper half of B5
||	MPYH	.M1	A0,A0,A5	; p1 = (upper half of A0) squared
|| [B2] ADD	.L2	B0,B1,B0	; R_tmp += p0; skipped on 1st pass (*R not in B0 yet)
||  	ADD	.L1	A4,A5,A4	; G += p1 (p1 is still 0 on the 1st pass)
||	MVK	.S2	1,B2		; B2 = 1: R_tmp add enabled from the next pass on
|| [A1] B	.S1	LOOP		; start a pass three passes ahead while cntr != 0
|| [A1] ADD	.D1	-2,A1,A1	; loop cntr -= 2 (the branch above tests the old value)


* LOOP packet 2: multiply lower halves of the same word, issue the next loads.
   	LDW	.D1	*A6++,A0	; A0 = *tmpPtr++ (used three passes later)
|| 	LDW	.D2	*B4++,B5	; B5 = *tmpPtr2++ (used three passes later)
||	MPY	.M2X	A0,B5,B1	; p0 = lower half of A0 * lower half of B5
||	MPY	.M1	A0,A0,A5	; p1 = (lower half of A0) squared
|| 	ADD	.L2	B0,B1,B0	; R_tmp += p0 from the previous pass's MPY
|| 	ADD	.L1	A4,A5,A4	; G += p1 from the previous pass's MPY
; LOOP ENDS HERE

* Epilog: write the accumulated dot product back and return; G is in A4.
	STW	.D2	B0,*B6		; *R = R_tmp
||	B	.S2	B3		; return to caller

B_END:
*** END Benchmark Timing ***

	NOP	5		; fill the delay slots of the return branch
