*===============================================================================
*
*	TEXAS INSTRUMENTS, INC.		
*
*	C67X FLOATING POINT DOT PRODUCT (HAND-CODED ASSEMBLY VERSION)
*
*	Revision Date:	03/11/98
*	
*	USAGE	This routine is C Callable and can be called as:
*		
*		float dotp(const float *x, const float *y, const int count);
*
*		x is pointer to array holding the first floating point vector
*		y is pointer to array holding the second floating point vector
*		count is the number of values in the x & y vectors
*
*		If the routine is not to be used as a C callable function,
*		then you need to initialize values for all of the parameters
*		passed to the function since these are assumed to be in
*		registers as defined by the calling convention of the
*		compiler, (refer to the TMS320C6x Optimizing C Compiler
*		User's Guide).
*
*	C CODE
*		This is the C equivalent for the assembly code.  Note that
*		the assembly code is hand optimized and restrictions may
*		apply.
*
*		float dotp(const float *x, const float *y, const int count)
*		{
*		   int i;
*		   float sum = 0;
*
*		   for (i=0; i < count; i++)
*		   {
*		      sum += x[i] * y[i];
*		   }
*		   return sum;
*		}
*
*	DESCRIPTION
*
*		This routine calculates the dot product of 2 vectors.
*
*	TECHNIQUES
*
*		1.  LDDW instructions are used to load two SP floating point 
*		    values at a time for the x and y arrays.
*		2.  The loop is unrolled once and software pipelined.
*		3.  Since the ADDSP and MPYSP instructions take 4 cycles,
*		    A8, B8, A0, and B0 multiplex different variables to save
*		    on register usage.
*		    This multiple assignment is possible since the variables
*		    are always read just once on the first cycle that they
*		    are available.
*		4.  The loop is primed to reduce the prolog by 4 cycles
*		    (14 words) with no increase in cycle time.
*		5.  The load counter is used as the loop counter which
*		    requires a 3 cycle (6 word) epilog to finish the
*		    calculations.  This does not increase the cycle time.
*		    The advantage is that the routine does not perform
*		    extraneous loads.
*
*	ASSUMPTIONS
*
*		1.  Little Endian is assumed for LDDW instructions.
*		2.  The value of count must be greater than or equal
*		    to 10 and even (10, 12, 14, ...).
*		3.  Since single assignment of registers is not used,
*		    interrupts should be disabled before this function is
*		    called.
*		
*	MEMORY NOTE
*
*		The arrays x and y should be aligned on opposite
*		(even and odd) double word boundaries to avoid memory
*		bank hits.  This routine does not perform extraneous loads.
*
*       ARGUMENTS PASSED
*
*		x	 ->  A4 = ptr_x
*		y	 ->  B4 = ptr_y
*		count	 ->  A6
*
*	CYCLES
*
*	N/2 + 24	with C overhead
*	N/2 + 24	without C overhead
*	(N = count, the number of elements in each vector)
*
*===============================================================================
	.global _dotp
	.text
_dotp:
* float dotp(const float *x, const float *y, const int count)
*
* Register roles inside this routine:
*	A4 / B4		running pointers into x / y (post-incremented by one
*			double word per LDDW); A4 also carries the return value
*	A6		count on entry, then reused as x0 (low half of A7:A6)
*	A7:A6, B7:B6	x1:x0 and y1:y0 loaded as pairs
*	A5 / B5		prod0 / prod1 (reused as temporaries in the final sum)
*	A8 / B8		sum0 / sum1 accumulators
*	B0		lcntr, the load/loop counter (reused in the final sum)
*	A0		temporary in the final sum
*	B3		return address (target of the final branch)
*
* Scheduling below relies on C67x instruction latencies (LDDW and branch
* delay slots, 4-cycle MPYSP/ADDSP); do not reorder or insert instructions
* without re-deriving the pipeline.

*** BEGIN Benchmark Timing ***
B_START:
* Prolog Begins ****************************************************************
* Five execute packets.  Each one starts a load of the next x/y pair and
* issues a branch to LOOP, so that the single-packet loop body is re-entered
* on consecutive cycles once the branches start completing.  This primes the
* pipeline with the first 10 elements before any loop-controlled load runs.
	ZERO	.L1	A5		; prod0 = 0
||	ZERO	.L2	B5		; prod1 = 0
||	LDDW	.D1	*A4++[1],A7:A6	; load x1:x0 from memory
||	LDDW	.D2	*B4++[1],B7:B6	; load y1:y0 from memory
||	B	.S1	LOOP		; branch to loop

* A6 still holds count in this packet: the LDDW issued in the previous packet
* has not yet written A7:A6 (load delay slots).  The 10 subtracted here
* accounts for the 5 double-word loads issued by the prolog.
	ZERO	.L1	A8		; sum0 = 0
||	ZERO	.L2	B8		; sum1 = 0
||	LDDW	.D1	*A4++[1],A7:A6	; @ load x1:x0 from memory
||	LDDW	.D2	*B4++[1],B7:B6	; @ load y1:y0 from memory
||	SUB	.S2X	A6,10,B0	; lcntr = count - 10
||	B	.S1	LOOP		; branch to loop

	LDDW	.D1	*A4++[1],A7:A6	; @@ load x1:x0 from memory
||	LDDW	.D2	*B4++[1],B7:B6	; @@ load y1:y0 from memory
||	B	.S1	LOOP		; branch to loop

	LDDW	.D1	*A4++[1],A7:A6	; @@@ load x1:x0 from memory
||	LDDW	.D2	*B4++[1],B7:B6	; @@@ load y1:y0 from memory
||	B	.S1	LOOP		; branch to loop

	LDDW	.D1	*A4++[1],A7:A6	; @@@@ load x1:x0 from memory
||	LDDW	.D2	*B4++[1],B7:B6	; @@@@ load y1:y0 from memory
||	B	.S1	LOOP		; branch to loop
****** Loop Begins *************************************************************
* The whole loop body is one execute packet (one cycle per pass, two elements
* per pass).  Loads, the branch and the counter decrement are predicated on
* B0 so they stop once lcntr reaches 0; the MPYSP/ADDSP pairs are
* unconditional.  The first passes add the zeroed prod0/prod1 until the first
* MPYSP results arrive.  Because ADDSP takes 4 cycles and one is issued every
* cycle, A8 and B8 each carry several independent running sums in flight.
LOOP:
  [B0]	LDDW	.D1	*A4++[1],A7:A6	; @@@@@ if(lcntr) load x1:x0 from memory
||[B0]	LDDW	.D2	*B4++[1],B7:B6	; @@@@@ if(lcntr) load y1:y0 from memory
||	MPYSP	.M1X	A6,B6,A5	; prod0 = x0 * y0
||	MPYSP	.M2X	A7,B7,B5	; prod1 = x1 * y1
||	ADDSP	.L1	A5,A8,A8	; sum0 = prod0 + sum0
||	ADDSP	.L2	B5,B8,B8	; sum1 = prod1 + sum1
||[B0]	B	.S1	LOOP		; if(lcntr) branch to loop
||[B0]	SUB	.S2	B0,2,B0 	; if(lcntr) lcntr -= 2
****** Epilog Begins ***********************************************************
* Three more ADDSP pairs accumulate the products still in flight from the
* last MPYSPs issued in the loop.
	ADDSP	.L1	A5,A8,A8	; sum0 = prod0 + sum0
||	ADDSP	.L2	B5,B8,B8	; sum1 = prod1 + sum1

	ADDSP	.L1	A5,A8,A8	; sum0 = prod0 + sum0
||	ADDSP	.L2	B5,B8,B8	; sum1 = prod1 + sum1

	ADDSP	.L1	A5,A8,A8	; sum0 = prod0 + sum0
||	ADDSP	.L2	B5,B8,B8	; sum1 = prod1 + sum1
* Epilog Ends ******************************************************************
* Final reduction.  A8 and B8 each deliver a different partial sum on each of
* the next four cycles, so four back-to-back cross-path ADDSPs pair them up,
* alternating the destination between A0 and B0.  Those four results are then
* folded pairwise into A5 and B5, and finally into the return register A4.
* The NOPs pad for ADDSP latency; the return branch is issued early so that
* its delay slots overlap the last additions.
	ADDSP	.L1X	A8,B8,A0	; A0 = sum0 + sum1 (1st partial pair)
	ADDSP	.L2X	A8,B8,B0	; B0 = sum0 + sum1 (2nd partial pair)
	ADDSP	.L1X	A8,B8,A0	; A0 = sum0 + sum1 (3rd partial pair)
	ADDSP	.L2X	A8,B8,B0	; B0 = sum0 + sum1 (4th partial pair)
	NOP				; wait for B0
	ADDSP	.L1X	A0,B0,A5	; A5 = A0 + B0 (1st + 2nd pairs)
	NOP				; wait for next B0
	ADDSP	.L2X	A0,B0,B5	; B5 = A0 + B0 (3rd + 4th pairs)
	NOP				; wait for B5
	B	.S2	B3		; return from function
	NOP				; wait for B5
	ADDSP	.L1X	A5,B5,A4	; return A4 = total dot product
	NOP		3		; wait for A4 and branch
B_END:
*** END Benchmark Timing ***
