*===============================================================================
*
*	TEXAS INSTRUMENTS, INC.		
*
*	WEIGHTED VECTOR SUM (floating point)
*
*	Revision Date:	02/26/98
*	
*	USAGE	This routine is C Callable and can be called as:
*		
*               void w_vec(const float *x, const float *y, const float m, 
*					float *c, const short count);
*
*		x is pointer to array holding the floating point vector being weighted
*		y is pointer to array holding the floating point summation vector		
*		m is floating point weighting factor
*		c is pointer to array which will be floating point output vector
*		count is length of the vectors
*
*		If the routine is not to be used as a C callable function,
*		then you need to initialize values for all of the parameters
*		passed to the function since these are assumed to be in
*		registers as defined by the calling convention of the
*		compiler, (refer to the TMS320C6x Optimizing C Compiler
*		User's Guide).
*
*	C CODE
*		This is the C equivalent for the assembly code.  Note that
*		the assembly code is hand optimized and restrictions may
*		apply.
*
*		void w_vec(const float *x, const float *y, const float m, float *c, const short n)
*		{
*			short i;
*
*			for (i=0; i<n; i++) {
*				c[i] = (m * x[i]) + y[i];
*			}
*		}
*
*	DESCRIPTION
*
*		This routine is used to obtain the weighted vector sum.  Vector 
*		x is weighted by a factor of m and then added to vector y. Both 
*		the inputs and output are floating point vectors.
*
*	TECHNIQUES
*
*		1.  LDDW instructions are used to load two floating point 
*		    values at a time from each of the x and y arrays.
*		2.  The loop is unrolled once and software pipelined.
*
*	ASSUMPTIONS
*
*		1.  Little Endian is assumed for LDDW instructions.
*		2.  The value of count (and the number of entries in the 
*		    array x) must be greater than or equal to 12 and 
*		    even (i.e. 12, 14, 16, ...).
*		3.  Since single assignment of registers is not used,
*		    interrupts should be disabled before this function is 
*                   called.
*		
*	MEMORY NOTE
*
*		The arrays x and y should be aligned on opposite
*		(even and odd) double word boundaries to avoid memory
*		bank hits.  This routine does not perform extraneous loads.
*
*       ARGUMENTS PASSED
*
*		*x	 ->  A4
*		*y	 ->  B4
*		m	 ->  A6
*		*c	 ->  B6	
*		n	 ->  A8
*
*	CYCLES
*		n + 12
*	
*===============================================================================
	.global _w_vec
	.text

* ------------------------------------------------------------------------------
* _w_vec:  c[i] = m * x[i] + y[i]   for i = 0 .. n-1, two elements per iteration
*
* Inputs (C calling convention):
*	A4 = x,  B4 = y,  A6 = m,  B6 = c,  A8 = n,  B3 = return address
*
* Register usage.  Only save-on-call registers (A0-A9 / B0-B9) are written, so
* nothing needs to be saved or restored on behalf of a C caller:
*
*	A4	x pointer, post-incremented by 8 bytes per LDDW
*	B4	y pointer, post-incremented by 8 bytes per LDDW
*	A6	m for the A-side multiply;  B5 = copy of m for the B side
*	B6	store pointer for c[i];     A3 = store pointer for c[i+1]
*	A9:A8	x[i+1]:x[i]   (A8 is reused only after n has been read from it)
*	B9:B8	y[i+1]:y[i]
*	A0, B0	prod2, prod1
*	A5, B1	sum2, sum1
*	B2	branch counter  = n - 6
*	A2	y-load counter  = n - 8   (4 y LDDWs are issued in the prolog)
*	A1	x-load counter  = n - 12  (6 x LDDWs are issued in the prolog)
*
* The schedule relies on the C67x delay slots (LDDW 4, MPYSP 3, ADDSP 3,
* branch 5); execute packets must not be inserted, removed or reordered.
* ------------------------------------------------------------------------------

_w_vec:
*** BEGIN Benchmark Timing ***
** --------------------------------------------------------------------------*
* PIPED LOOP PROLOG

	LDDW	.D1T1	*A4++(8),A9:A8		;@	load x[i+1], x[i]
||	MV	.L2X	A6,B5			;	B-side copy of weighting term m
||	ADD	.S1X	B6,4,A3			;	A3 = &c[i+1]

	; n is still intact in A8 here: the LDDW issued in the previous packet
	; does not write A9:A8 until its delay slots have elapsed.
	SUB	.L2X	A8,6,B2			;	cntr  = n - 6  (3 branches are already in
						;	flight when counting starts, so the
						;	kernel runs (n-6)/2 + 3 = n/2 times)
||	SUB	.L1	A8,8,A2			;	ycnt  = n - 8  (y loads left for kernel)
||	SUB	.S1	A8,12,A1		;	xcnt  = n - 12 (x loads left for kernel)

	LDDW	.D1T1	*A4++(8),A9:A8		;@@	load x[i+1], x[i]

	NOP					;	keeps the loads on alternate cycles

	LDDW	.D1T1	*A4++(8),A9:A8		;@@@	load x[i+1], x[i]
||	LDDW	.D2T2	*B4++(8),B9:B8		;@	load y[i+1], y[i]

	MPYSP	.M1	A9,A6,A0		;@	prod2 = x[i+1] * m
||	MPYSP	.M2X	A8,B5,B0		;@	prod1 = x[i] * m

	LDDW	.D1T1	*A4++(8),A9:A8		;@@@@	load x[i+1], x[i]
||	LDDW	.D2T2	*B4++(8),B9:B8		;@@	load y[i+1], y[i]

	MPYSP	.M1	A9,A6,A0		;@@	prod2 = x[i+1] * m
||	MPYSP	.M2X	A8,B5,B0		;@@	prod1 = x[i] * m

	LDDW	.D1T1	*A4++(8),A9:A8		;@@@@@	load x[i+1], x[i]
||	LDDW	.D2T2	*B4++(8),B9:B8		;@@@	load y[i+1], y[i]
||	B	.S1	LOOP			;@	unconditional: first branch back to kernel

	MPYSP	.M1	A9,A6,A0		;@@@	prod2 = x[i+1] * m
||	MPYSP	.M2X	A8,B5,B0		;@@@	prod1 = x[i] * m
||	ADDSP	.L1X	A0,B9,A5		;@	sum2 = prod2 + y[i+1]
||	ADDSP	.L2	B0,B8,B1		;@	sum1 = prod1 + y[i]

	LDDW	.D1T1	*A4++(8),A9:A8		;@@@@@@	load x[i+1], x[i]
||	LDDW	.D2T2	*B4++(8),B9:B8		;@@@@	load y[i+1], y[i]
||	B	.S1	LOOP			;@@	unconditional: second branch back to kernel

	MPYSP	.M1	A9,A6,A0		;@@@@	prod2 = x[i+1] * m
||	MPYSP	.M2X	A8,B5,B0		;@@@@	prod1 = x[i] * m
||	ADDSP	.L1X	A0,B9,A5		;@@	sum2 = prod2 + y[i+1]
||	ADDSP	.L2	B0,B8,B1		;@@	sum1 = prod1 + y[i]

;** --------------------------------------------------------------------------*
LOOP:     ; PIPED LOOP KERNEL (2 cycles, 2 results per pass)

	; The x stream runs two iterations ahead of the y stream, so each load
	; has its own guard; both stop exactly at the end of their array.
   [A1]	LDDW	.D1T1	*A4++(8),A9:A8		;@@@@@@@ if (xcnt) load x[i+1], x[i]
|| [A2]	LDDW	.D2T2	*B4++(8),B9:B8		;@@@@@	if (ycnt) load y[i+1], y[i]
|| [B2]	B	.S1	LOOP			;@@@	if (cntr) branch to loop
|| [B2]	SUB	.S2	B2,2,B2			;	if (cntr) cntr = cntr - 2
|| [A1]	SUB	.L1	A1,2,A1			;	if (xcnt) xcnt = xcnt - 2

	MPYSP	.M1	A9,A6,A0		;@@@@@	prod2 = x[i+1] * m
||	MPYSP	.M2X	A8,B5,B0		;@@@@@	prod1 = x[i] * m
||	ADDSP	.L1X	A0,B9,A5		;@@@	sum2 = prod2 + y[i+1]
||	ADDSP	.L2	B0,B8,B1		;@@@	sum1 = prod1 + y[i]
||	STW	.D1	A5,*A3++(8)		;@	store sum2 into c[i+1]
||	STW	.D2	B1,*B6++(8)		;@	store sum1 into c[i]
|| [A2] SUB	.S1	A2,2,A2			;	if (ycnt) ycnt = ycnt - 2

*** --------------------------------------------------------------------------*
	; No epilog is needed: every store is issued from the kernel, and the
	; products/sums still in flight here belong to iterations past n.

	B	.S2	B3			;	return from function
B_END:
	NOP		5			;	fill the branch delay slots
*** END Benchmark Timing ***