*================================================================================
*
*	TEXAS INSTRUMENTS, INC.		
*
*	DOT PRODUCT
*
*	Revision Date:  04/07/97
*	
*	USAGE	This routine is C Callable and can be called as:
*		
*		int dotprod(short a[], short b[], int N)
*
*		a[] --- first vector array 
*               b[] --- second vector array
*		N   --- number of elements of vector
*
*		If routine is not to be used as a C callable function then
*		you need to initialize values for all of the values passed
*		as these are assumed to be in registers as defined by the 
*		calling convention of the compiler, (refer to the C compiler
*		reference guide).
*
*	C CODE
*		This is the C equivalent of the assembly code.  Note that
*		the assembly code is hand optimized and restrictions may
*		apply.
*
*		int dotprod(short a[],short b[], int N)
*		{
*		int sum;
*		int i;
*
*		sum = 0;
*		for(i=0; i<N; i++){
*			sum += (a[i] * b[i]);
*			}
*		return(sum);
*		}
*
*
*	DESCRIPTION
*
*		This routine takes two vectors and calculates their dot
*		(scalar) product.  The inputs are 16-bit numbers, and the
*		result is a 32-bit number.
*		
*	TECHNIQUES
*
*		1.  Load words are used to load two 16-bit values at a time
*		2.  The loop is unrolled once
*	
*	ASSUMPTIONS
*
*         	1.  N is an even number greater than 2 
*               2.  Vectors a and b should be aligned on word boundaries
*		
*	MEMORY NOTE
*
*		Vectors a and b should be aligned on opposite word
*		boundaries to avoid memory hits.
*
*       ARGUMENTS PASSED
*
* 		a[]  ->  A4
*               b[]  ->  B4
*               N    ->  A6
*
*
*	CYCLES
*
*		N/2 + 8
*
*================================================================================
	.global _dotprod
	.text

*--------------------------------------------------------------------------------
*	_dotprod -- int dotprod(short a[], short b[], int N)
*
*	In:	A4 = a[], B4 = b[], A6 = N, B3 = return address
*	Out:	A4 = sum of a[i]*b[i] for i = 0 .. N-1
*	Writes:	A0, A1, A2, A4, A5, B0, B1, B4, B5
*
*	Schedule (relies on the C6x delay slots: LDW 4, B 5, MPY 1):
*	  - The five prologue packets only start loads and branches; the first
*	    loaded word pair is consumed by the first pass of LOOP.
*	  - LOOP is a single execute packet.  It runs once by fall-through,
*	    once for the unconditional branch and once for every conditional
*	    branch that is taken (N/2 of them), i.e. N/2 + 2 times.
*	  - Each ADD consumes the product started two passes earlier, so the
*	    first two passes add the zeroed A1/B1, and the products started in
*	    the last two passes are never summed.
*
*	Restrictions:
*	  - N must be even and non-negative: the counter in A2 steps by 2 and
*	    branching stops only when A2 is exactly 0.
*	  - N/2 + 7 words are loaded from each vector although only N/2 of
*	    them are summed, so 7 words past the end of a[] and of b[] are
*	    read (those values never reach the result).
*	  - N == 0 returns 0.  Every decrement of A2, including the first one,
*	    is predicated on A2 so that a zero count is never decremented to
*	    -2, which would keep the loop branching until the counter wraps.
*--------------------------------------------------------------------------------
_dotprod:

*** BEGIN Benchmark Timing ***
B_START:

* Prologue packet 1: start the first load pair and the first (unconditional)
* branch to LOOP, and clear the product and sum registers.
	LDW	.D1	*A4++,A0	; start load of a[0] & a[1]
||	LDW	.D2	*B4++,B0	; start load of b[0] & b[1]
||	B	.S2	LOOP		; unconditional: LOOP always runs at least twice
||	ZERO	.L1	A1		; A1 = 0, added by the first two passes of LOOP
||      ZERO    .L2     B1              ; B1 = 0, added by the first two passes of LOOP
||	MPY	.M1	A5,0,A5		; Sum1 = 0, cleared on .M1 (.L1/.S1 are busy)
||	MPY	.M2	B5,0,B5		; Sum2 = 0, cleared on .M2 (.L2/.S2 are busy)
||	MV	.S1	A6,A2		; A2 = N, the remaining-element counter

* Prologue packets 2-5: keep one load pair and one branch in flight per cycle.
* A branch is issued, and the counter decremented, only while A2 != 0.
	LDW	.D1	*A4++,A0	; start load of a[2] & a[3]
||	LDW	.D2	*B4++,B0	; start load of b[2] & b[3]
||[A2]	B	.S2	LOOP		; one more pass of LOOP if elements remain
||[A2]	SUB	.L1	A2,2,A2		; A2 -= 2, skipped when N == 0

	LDW	.D1	*A4++,A0	; start load of a[4] & a[5]
||	LDW	.D2	*B4++,B0	; start load of b[4] & b[5]
||[A2]	B	.S2	LOOP		; one more pass of LOOP if elements remain
||[A2]	SUB	.S1	A2,2,A2		; A2 -= 2, stops at 0

	LDW	.D1	*A4++,A0	; start load of a[6] & a[7]
||	LDW	.D2	*B4++,B0	; start load of b[6] & b[7]
||[A2]	B	.S2	LOOP		; one more pass of LOOP if elements remain
||[A2]	SUB	.S1	A2,2,A2		; A2 -= 2, stops at 0

	LDW	.D1	*A4++,A0	; start load of a[8] & a[9]
||	LDW	.D2	*B4++,B0	; start load of b[8] & b[9]
||[A2]	B	.S2	LOOP		; one more pass of LOOP if elements remain
||[A2]	SUB	.S1	A2,2,A2		; A2 -= 2, stops at 0

* Kernel: one execute packet per pair of elements.  A0/B0 hold the word pair
* whose load was started five packets earlier; A1/B1 hold the products that
* were started two passes earlier.
LOOP:

	LDW	.D1	*A4++,A0	; start next load of a[] (first pass: a[10] & a[11])
||	LDW	.D2	*B4++,B0	; start next load of b[] (first pass: b[10] & b[11])
||	MPY	.M1X	A0,B0,A1	; A1 = low halfword of A0 * low halfword of B0
||	MPYH	.M2X	A0,B0,B1	; B1 = high halfword of A0 * high halfword of B0
||	ADD	.L1	A1,A5,A5	; Sum1 += product from two passes ago
||	ADD	.L2	B1,B5,B5	; Sum2 += product from two passes ago
||[A2]	SUB	.S1	A2,2,A2		; A2 -= 2, stops at 0
||[A2]	B	.S2	LOOP		; one more pass of LOOP if elements remain

* Reached once no branch is pending any more: combine the two partial sums.
	ADD	.L1X	A5,B5,A4	; return value A4 = Sum1 + Sum2

B_END:
*** END Benchmark Timing ***

	B	.S2	B3		; return to the caller
  	NOP	5			; fill the five branch delay slots

