*=============================================================================
*
*	TEXAS INSTRUMENTS, INC.		
*
*	Vector Sum
*
*	Revision Date: 07/02/98
*	
*	USAGE	This routine is C Callable and can be called as:
*		
*		void vecsum(const float *a, const float *b, float *c, const short n);
*
*		a is pointer to array holding the first floating point vector
*		b is pointer to array holding the second floating point vector
*		c is pointer to array holding the result vector of adding a & b vectors 
*		n is the number of values in the a, b & c vectors
*
*		If the routine is not to be used as a C callable function,
*		then you need to initialize values for all of the parameters
*		passed to the function since these are assumed to be in
*		registers as defined by the calling convention of the
*		compiler, (refer to the TMS320C6x Optimizing C Compiler
*		User's Guide).
*
*	C CODE
*		This is the C equivalent for the assembly code.  Note that
*		the assembly code is hand optimized and restrictions may
*		apply.
*
*		void vecsum(const float *a, const float *b, float *c, const short n)
*		{
*			short i;
*
*			_nassert(n >=30);	/* NOTE: the assembly below only requires n >= 6 and even (see ASSUMPTIONS) */
*
*			for (i=0; i<n; i++) {
*				c[i] = a[i] + b[i];
*			}
*		}
*
*	DESCRIPTION
*
*		This routine calculates the vector sum of 2 vectors.
*
*	TECHNIQUES
*
*		1.  LDDW instructions are used to load two SP floating point 
*           values at a time.
*		2.  The loop is unrolled once and software pipelined.
*		3.  LDDW and STW instructions necessitate a 2 cycle kernel
*	
*	ASSUMPTIONS
*
*		1.  Little Endian is assumed for LDDW instructions.
*		2.  The value of n must be greater than or equal
*		    to 6 and even (6, 8, 10, 12 ...).
*		
*	MEMORY NOTE
*
*		The arrays a and b should be aligned on opposite
*		(even and odd) double word boundaries to avoid memory
*		bank hits.  
*
*       ARGUMENTS PASSED
*
*		a	->	A4
*		b	->	B4
*		c	->	A6
*		n	->	B6
*
*	CYCLES
*
*	N + 8
*
*=============================================================================
B_START:
		.global _vecsum
		.text
;-----------------------------------------------------------------------------
; _vecsum: c[i] = a[i] + b[i] for i = 0 .. n-1, two elements per kernel pass.
;
; In:   A4 = a, B4 = b, A6 = c, B6 = n
; Uses: A0, A1, A5, B0, B1, B2, B7, B8 as scratch; A4, B4 and A6 are
;       post-incremented and do not hold their entry values on return.
; Ret:  branches to the address held in B3.
;
; Software pipeline (one LDDW pair is issued every 2 cycles):
;   load  pair k : A1:A0 = a[2k+1]:a[2k],  B1:B0 = b[2k+1]:b[2k]
;   add   pair k : issued 5 cycles after its loads   -> B8 (even), A5 (odd)
;   store pair k : issued 4 cycles after its adds    -> c[2k], c[2k+1]
; The prolog issues the loads for pairs 0..3 and the adds for pairs 0..1.
; Kernel pass j then issues the loads for pair j+4, the adds for pair j+2
; and the stores for pair j.
; (Spacing follows the C67x latencies: LDDW 4 delay slots, ADDSP 3, B 5 --
;  see the CPU and Instruction Set Reference Guide.)
;
; Loop control: the first kernel pass is reached by fall-through and the two
; unconditional prolog branches land on the second and third passes, so three
; passes (6 elements) always run.  B2 = n - 6 counts the elements still to be
; produced beyond those, two per additional pass; hence n must be even
; and >= 6.
;
; NOTE(review): loads run ahead of the adds, so 4 + n/2 LDDWs are issued per
; input array, i.e. 4 double words past the end of a and of b are read and
; discarded.  Callers must make sure that memory is readable.
; NOTE(review): LDDW presumably requires a and b to be double-word (8-byte)
; aligned -- confirm against the CPU guide.
;-----------------------------------------------------------------------------
_vecsum:
*** BEGIN Benchmark Timing ***

		LDDW	.D1T1	*A4++,A1:A0		; Prolog: load pair 0 of a (a[0], a[1])
||		LDDW	.D2T2	*B4++,B1:B0		; Prolog: load pair 0 of b (b[0], b[1])

		ADD	.L2X	0x4,A6,B7		; B7 = c + 4 bytes: store pointer for odd elements (c[1], c[3], ...)
||		SUB	.D2	B6,6,B2			; B2 = n - 6: elements left after the 3 guaranteed kernel passes
		
		LDDW	.D1T1	*A4++,A1:A0		; Prolog: load pair 1 of a (a[2], a[3])
||		LDDW	.D2T2	*B4++,B1:B0		; Prolog: load pair 1 of b (b[2], b[3])

		NOP		1			; Idle cycle: keeps the load pairs two cycles apart

  		B	.S1	LOOP         		; Lands on LOOP for the 2nd kernel pass (after the branch delay slots)
||		LDDW	.D1T1	*A4++,A1:A0		; Prolog: load pair 2 of a (a[4], a[5])
||		LDDW	.D2T2	*B4++,B1:B0		; Prolog: load pair 2 of b (b[4], b[5])

		ADDSP	.L1X	A1,B1,A5		; A5 = a[1] + b[1] (pair 0, odd element)
||		ADDSP	.L2X	A0,B0,B8		; B8 = a[0] + b[0] (pair 0, even element)

  		B	.S1	LOOP			; Lands on LOOP for the 3rd kernel pass
||		LDDW	.D1T1	*A4++,A1:A0		; Prolog: load pair 3 of a (a[6], a[7])
||		LDDW	.D2T2	*B4++,B1:B0		; Prolog: load pair 3 of b (b[6], b[7])

		ADDSP	.L1X	A1,B1,A5		; A5 = a[3] + b[3] (pair 1, odd element)
||		ADDSP	.L2X	A0,B0,B8		; B8 = a[2] + b[2] (pair 1, even element)

;** ------ Loop Begins ------------------------------------*
; Two-cycle kernel.  Pass j (j = 0, 1, 2, ...) loads pair j+4, adds pair j+2
; and stores pair j.  The first pass is entered by fall-through from the
; prolog; later passes are entered by previously issued branches.
LOOP:		; LOOP KERNEL

  [B2]		B	.S1	LOOP			; Schedule one more pass while B2 != 0
||		LDDW	.D1T1	*A4++,A1:A0		; Load next pair of a (runs ahead of the adds)
||		LDDW	.D2T2	*B4++,B1:B0		; Load next pair of b (runs ahead of the adds)

		STW	.D1T2	B8,*A6++(8)		; Store even result c[2j]; A6 advances 8 bytes (2 floats)
||		STW	.D2T1	A5,*B7++(8)		; Store odd result c[2j+1]; B7 advances 8 bytes (2 floats)
||		ADDSP	.L1X	A1,B1,A5		; A5 = odd-element sum for the pair whose load has arrived
||		ADDSP	.L2X	A0,B0,B8		; B8 = even-element sum for the pair whose load has arrived
||[B2]		SUB	.S2	B2,0x2,B2		; B2 -= 2 only while non-zero, so it stops at 0

;** ------ Loop Ends --------------------------------------*

B_END:
*** END Benchmark Timing ***

		B	.S2	B3			; Return to the caller (return address in B3)
		NOP		5			; Fill the delay slots of the return branch

		; BRANCH OCCURS
