*===============================================================================
*
*	TEXAS INSTRUMENTS, INC.		
*
*	VECTOR SUM OF SQUARES  (floating point)
*
*	Revision Date:	02/18/98
*	
*	USAGE	This routine is C Callable and can be called as:
*		
*		float vecsumsq(float *x, short count);
*
*		x is pointer to array holding the floating point vector
*		count is the number of values in the x vector
*
*		If the routine is not to be used as a C callable function,
*		then you need to initialize values for all of the parameters
*		passed to the function since these are assumed to be in
*		registers as defined by the calling convention of the
*		compiler, (refer to the TMS320C6x Optimizing C Compiler
*		User's Guide).
*
*	C CODE
*		This is the C equivalent for the assembly code.  Note that
*		the assembly code is hand optimized and restrictions may
*		apply.
*
*		float vecsumsq(float x[], int n)
*		{
*			int i;
*			float sum=0;
*			for(i=0; i<n; i++)
*			{
*				sum += x[i]*x[i];
*			}
*			return(sum);
*		}
*
*	DESCRIPTION
*
*		This routine calculates the sum of squares of a vector.
*
*	TECHNIQUES
*
*		1.  Two LDW instructions are used to simultaneously load
*		    x[i] and x[i+1]	
*		2.  The loop is unrolled once and software pipelined.
*		3.  The loop is primed to reduce code size and ease the array 
*			size restriction
*	ASSUMPTIONS
*
*		1.  Little Endian is assumed for LDW instructions.
*		2.  The value of count (and the number of entries in the 
*		    array x) must be greater than or equal to 12 and 
*		    even (i.e. 12, 14, 16, ...).
*		3.  Since single assignment of registers is not used,
*		    interrupts should be disabled before this function is 
*                   called.
*		
*	MEMORY NOTE
*
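*		This routine performs extraneous loads (it keeps loading
*		past the end of the x array) since it has no epilog.  The
*		epilog can be enabled by replacing the first instruction
*		(MVK 4,B0) with the commented-out MVK 22,B0 shown below and
*		by removing the leading (*) from the commented-out epilog
*		instruction lines.  With the epilog enabled the loop counter
*		starts at count - 22, so count must then be at least 22.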
*
*       ARGUMENTS PASSED
*
*		x	 ->  A4
*		count	 ->  B4
*
*	CYCLES
*
*	N/2 + 25	(N = count, the number of values in x)
*
*===============================================================================
	.global _vecsumsq
	.text

_vecsumsq:
; -----------------------------------------------------------------------------
; Returns the sum of x[i]*x[i] over the vector x.
;
; Register usage (as established by the code below):
;	A4	in:  pointer to x[]; walks x[i], post-incremented by 2 words
;		out: the returned sum
;	B4	in:  count
;	B8	pointer to x[i+1] (A4 + 4), post-incremented by 2 words
;	A7/B7	most recently loaded x[i] / x[i+1]
;	A0/B0	prod1 / prod2 (B0 first holds the constant subtracted from count)
;	A5/B5	sum1 / sum2 accumulators (even / odd elements)
;	B2	cntr: loop counter, reduced by 2 per kernel pass while nonzero
;	B3	return address (target of the final branch)
;
; Every instruction position below is cycle-exact: results of LDW, MPYSP,
; ADDSP and B are consumed a fixed number of cycles after issue, so no
; instruction may be moved, added or removed without rescheduling.
; -----------------------------------------------------------------------------

*** BEGIN Benchmark Timing ***

;	MVK	.S2	22,B0		; B0 = 22 (if Epilog used)

	MVK	.S2	4,B0		; B0 = 4 (if Epilog NOT used); subtracted from count below
||	ADD	.L2X	A4,4,B8		; make B8 pointer that is x[i+1]
					;      A4 is x[i]

;** --------------------------------------------------------------------------*
; PIPED LOOP PROLOG (primed)
;
; Five execute packets, each starting one load pair and one UNCONDITIONAL
; branch to LOOP.  These primed branches, together with the fall-through into
; LOOP, keep the single-packet kernel executing back-to-back until the
; kernel's own conditional branches take over.  The accumulators and product
; registers are zeroed here so that the ADDSPs issued before the first real
; products arrive only add zeros.

	LDW	.D1T1	*A4++[2],A7	;5	load x[i] from memory
||	LDW	.D2T2	*B8++[2],B7	;	load x[i+1] from memory
||	B	.S1	LOOP		;	unconditional branch to LOOP (primes the kernel)
||	ZERO	.L1	A5		;	sum1 = 0
||	ZERO	.L2	B5		;	sum2 = 0
||	SUB	.S2	B4,B0,B2	;	cntr = count - B0 (count - 4 without epilog)

	LDW	.D1T1	*A4++[2],A7	;6	load x[i] from memory
||	LDW	.D2T2	*B8++[2],B7	;	load x[i+1] from memory
||	B	.S1	LOOP		;	unconditional branch to LOOP (primes the kernel)
||	ZERO	.L1	A0		;	prod1 = 0
||	ZERO	.L2	B0		;	prod2 = 0 (B0 constant already consumed by SUB)

	LDW	.D1T1	*A4++[2],A7	;7	load x[i] from memory
||	LDW	.D2T2	*B8++[2],B7	;	load x[i+1] from memory
||	B	.S1	LOOP		;	unconditional branch to LOOP (primes the kernel)

	LDW	.D1T1	*A4++[2],A7	;8	load x[i] from memory
||	LDW	.D2T2	*B8++[2],B7	;	load x[i+1] from memory
||	B	.S1	LOOP		;	unconditional branch to LOOP (primes the kernel)

	LDW	.D1T1	*A4++[2],A7	;9	load x[i] from memory
||	LDW	.D2T2	*B8++[2],B7	;	load x[i+1] from memory
||	B	.S1	LOOP		;	unconditional branch to LOOP (primes the kernel)

;** --------------------------------------------------------------------------*
; Single-packet kernel.  Each pass, in parallel:
;   - starts the loads of a new x[i], x[i+1] pair,
;   - squares the pair currently present in A7/B7 (loaded on an earlier pass),
;   - adds the products currently present in A0/B0 into sum1/sum2,
;   - while cntr (B2) is nonzero, issues another branch to LOOP and
;     reduces cntr by 2.
; Because there is no epilog, the loads and multiplies keep running until the
; last accumulation has been issued; the surplus loads read beyond x[count-1]
; and their products are never added.
LOOP:     ; PIPED LOOP KERNEL

	LDW	.D1T1	*A4++[2],A7	;10	load x[i] from memory
||	LDW	.D2T2	*B8++[2],B7	;	load x[i+1] from memory
||	MPYSP	.M1	A7,A7,A0	;	prod1 = x[i]*x[i]
||	MPYSP	.M2	B7,B7,B0	;	prod2 = x[i+1]*x[i+1]
||	ADDSP	.L1	A0,A5,A5	;	sum1 = sum1 + prod1
||	ADDSP	.L2	B0,B5,B5	;	sum2 = sum2 + prod2
|| [B2] B	.S1	LOOP		; 	if(cntr) branch to loop
|| [B2]	SUB	.S2	B2,2,B2		;	if(cntr)  cntr = cntr - 2

;** --------------------------------------------------------------------------* 
; PIPED LOOP EPILOG  (not used)                                                  	
*
*	MPYSP	.M1	A7,A7,A0	;11	prod1 = x[i]*x[i]
*||	MPYSP	.M2	B7,B7,B0	;	prod2 = x[i+1]*x[i+1]
*||	ADDSP	.L1	A0,A5,A5	;	sum1 = sum1 + prod1
*||	ADDSP	.L2	B0,B5,B5     	;	sum2 = sum2 + prod2
*
*	MPYSP	.M1	A7,A7,A0	;12	prod1 = x[i]*x[i]
*||	MPYSP	.M2	B7,B7,B0    	;	prod2 = x[i+1]*x[i+1]
*||	ADDSP	.L1	A0,A5,A5	;	sum1 = sum1 + prod1
*||	ADDSP	.L2	B0,B5,B5       	;	sum2 = sum2 + prod2
*
*	MPYSP	.M1	A7,A7,A0	;13	prod1 = x[i]*x[i]
*||	MPYSP	.M2	B7,B7,B0    	;	prod2 = x[i+1]*x[i+1]
*||	ADDSP	.L1	A0,A5,A5	;	sum1 = sum1 + prod1
*||	ADDSP	.L2	B0,B5,B5       	;	sum2 = sum2 + prod2
*
*	MPYSP	.M1	A7,A7,A0	;14	prod1 = x[i]*x[i]
*||	MPYSP	.M2	B7,B7,B0    	;	prod2 = x[i+1]*x[i+1]
*||	ADDSP	.L1	A0,A5,A5	;	sum1 = sum1 + prod1
*||	ADDSP	.L2	B0,B5,B5       	;	sum2 = sum2 + prod2
*
*	MPYSP	.M1	A7,A7,A0	;15	prod1 = x[i]*x[i]
*||	MPYSP	.M2	B7,B7,B0    	;	prod2 = x[i+1]*x[i+1]
*||	ADDSP	.L1	A0,A5,A5	;	sum1 = sum1 + prod1
*||	ADDSP	.L2	B0,B5,B5       	;	sum2 = sum2 + prod2
*
*	ADDSP	.L1	A0,A5,A5	;16	sum1 = sum1 + prod1
*||	ADDSP	.L2	B0,B5,B5       	;	sum2 = sum2 + prod2
*    
*	ADDSP	.L1	A0,A5,A5	;17	sum1 = sum1 + prod1
*||	ADDSP	.L2	B0,B5,B5	;	sum2 = sum2 + prod2
*   
*	ADDSP	.L1	A0,A5,A5	;18	sum1 = sum1 + prod1
*||	ADDSP	.L2	B0,B5,B5	;	sum2 = sum2 + prod2
*              
*	ADDSP	.L1	A0,A5,A5	;19	sum1 = sum1 + prod1
*||	ADDSP	.L2	B0,B5,B5	;	sum2 = sum2 + prod2
*** --------------------------------------------------------------------------*           

; Final reduction.  The ADDSPs issued on the last kernel passes are still
; completing, so A5 and B5 change value on each of the next cycles.  The four
; back-to-back ADDSPs below each sample the A5/B5 pair visible on their own
; cycle, alternating the destination between A0 and B0 so that the four
; partial results are not overwritten before they are consumed.

	ADDSP	.L1X	A5,B5,A0	;20	A0 = sum1 + sum2  (1st in-flight pair)
	ADDSP	.L2X	A5,B5,B0	;21	B0 = sum1 + sum2  (2nd in-flight pair)
	ADDSP	.L1X	A5,B5,A0	;22	A0 = sum1 + sum2  (3rd in-flight pair)
	ADDSP	.L2X	A5,B5,B0	;23	B0 = sum1 + sum2  (4th in-flight pair)

	NOP		1		;	wait for B0 (result of ;21)
	ADDSP	.L1X	A0,B0,A5	;	A5 = A0 + B0  (results of ;20 and ;21)
	NOP		1		;	wait for second B0 (result of ;23)
	ADDSP	.L2X	A0,B0,B5	;	B5 = A0 + B0  (results of ;22 and ;23)
	NOP		1		;	spacing: keeps the return branch and final ADDSP on schedule
	B 	.S2	B3		;	return from function (takes effect after its delay slots)
	NOP		1		;	wait for B5
	ADDSP	.L1X	A5,B5,A4	;	return (A5 + B5), issued in the branch delay slots
	NOP		3		;	wait for A4 and branch
B_END:					; end-of-routine marker; not referenced in this listing
*** END Benchmark Timing ***                                                 