*===============================================================================
*
*	TEXAS INSTRUMENTS, INC.		
*
*	CIRCULAR ADDRESSED FIR FILTER (floating point version)
*
*	Revision Date:	06/05/98
*	
*	USAGE	This routine is C Callable and can be called as:
*		
*		fircirc(float *x, float *h, float *y, int hsize, int ysize, int size, int index);
*
*		x 	= input array
*		h 	= filter coefficients array
*		y 	= output array
*		hsize 	= number of filter coefficients (multiple of 2, >= 4)
*		ysize	= size of output array (multiple of 4, >= 4)
*		size 	= circular buffer size code: the block is 2^(size+1) bytes, i.e. 2^(size-1) floats
*		index	= offset by which to start reading from the input array
*
*		If the routine is not to be used as a C callable function,
*		then you need to initialize values for all of the parameters
*		passed to the function since these are assumed to be in
*		registers as defined by the calling convention of the
*		compiler, (refer to the TMS320C6x Optimizing C Compiler
*		User's Guide).
*
*	C CODE
*		This is the C equivalent for the assembly code.  Note that
*		the assembly code is hand optimized and restrictions may
*		apply.
*
* 		void fircirc(float x[], float h[], float y[], int hsize, int ysize, int size, int index)
* 		{
* 			int             i, j;
* 			for (j = 0; j < ysize; j++) {
* 				float y0 = 0;
* 				for (i = 0; i < hsize; i++)
* 					y0 += x[(i+j+index) % (1 << (size-1))] * h[i];
* 				y[j] = y0;
* 				}
* 		}
*
*	DESCRIPTION
*
*		This routine implements a circularly addressed FIR filter.  
*		hsize is the number of filter coefficients.  ysize is the number 
*		of the output samples.  
*
*	TECHNIQUES
*
*		1.  LDDW instructions are used to load two SP floating point
*		    values simultaneously for the x and h arrays.
*		2.  The outer loop is unrolled 4 times.
*		3.  The inner loop is unrolled 2 times.
*		4.  The variables prod1, prod3, prod5 and prod7 share A9.
*		    The variables prod0, prod2, prod4 and prod6 share B6.
*		    The variables sum1, sum3, sum5 and sum7 share A7.
*		    The variables sum0, sum2, sum4 and sum6 share B8.
*		    This multiple assignment is possible since the variables
*		    are always read just once on the first cycle that they are	
*		    available.
*		5.  A load counter is used so that an epilog is not needed.  No 
*		    extraneous loads are performed.
*
*	ASSUMPTIONS
*		1.  Little Endian is assumed for LDDW instructions.
*		2.  The number of coefficients  (hsize) must be a multiple of 2
*		    and greater than or equal to 4.
*		3.  The number of outputs (ysize) must be a multiple of 4 and greater
*		    than or equal to 4.
*		4.  Since single assignment of registers is not used, interrupts 
*		    should be disabled before this function is called.
*		
*	MEMORY NOTE
*		The x, h, and y arrays should be placed on the same double-word 
*		boundaries (i.e. y, x, and h on even double-word boundaries) to 
*		prevent internal data memory bank hits.
*
*       ARGUMENTS PASSED
*
*		x[]	 ->  A4
*		h[]	 ->  B4
*		y[]	 ->  A6 
*		hsize	 ->  B6
*		ysize	 ->  A8
*		size	 ->  B8
*		index	 ->  A10
*
*	CYCLES
*		((2*hsize) + 10)*(ysize/4) + 9
*
*===============================================================================
	.global _fircirc
	.text

* _fircirc: entry point.  Sets up pointers, counters and the AMR, then primes
* the software pipeline (first x/h loads and first four multiplies) before
* falling through into OLOOP.
*
* Register roles established here (from the instructions below):
*	A4  = x + index on entry adjust, then reused as h0 load target
*	A8  = ptr_h (copy of B4)	B7 = ptr_x (copy of A4 after first load)
*	A0  = ireset = hsize - 4	B9 = xreset = (hsize << 1) - 8
*	A1  = load counter (lcntr)	B2 = inner loop counter (icntr)
*	A2  = outer loop counter (ocntr) = ysize - 4
*	B1:B0, B5:B4, A3 = x values;	A5:A4 = h1:h0
*	A9 / B6 = products (A9 also briefly holds the AMR value)
_fircirc:	

*** BEGIN Benchmark Timing ***

		ADDAW	.D1	A4,A10,A4		; x += index (word-scaled add)

*** begin pipelining inner loop	

		LDDW	.D1	*A4++[1],B1:B0		; load x1:x0 from mem, then A4 += one double word
||		SUB	.S1	A8,4,A2			; f ocntr = numY - 4
||		SHL	.S2	B6,1,B9			; f B9 = (numH) << 1
||		MV	.L1X	B4,A8			; f ptr_h = h

		LDDW	.D1	*A8++[1],A5:A4		; load h1:h0 from mem
||		MV	.L2X	A4,B7			; f ptr_x = x (already advanced by the load above)
||		SUB	.L1X	B6,4,A0			; f ireset = numH - 4
||		SHL	.S2	B8,16,B5		; shift size up 16 bits (BK0 field, per original comment)
||		MVK	.S1	0x00004000,A5		; AMR mode bits into A5; presumably circular mode for B7 - confirm

		LDDW	.D2	*B7,B5:B4		; load x3:x2 from mem
||		MV	.L2X	A0,B2			; icntr = ireset
||		MV	.L1	A0,A1			; lcntr = ireset (A1 predicates the loads in OLOOP/LOOP)
||		SUB	.S2	B9,8,B9			; f xreset = B9 - 8
||		OR	.S1X	B5,A5,A9		; A9 = size field | mode bits = AMR value

* NOTE(review): the MVC below names A9 as source on .S2 without the X
* (cross path) suffix - confirm the assembler accepts this spelling.
		LDW	.D2	*+B7[2],A3		; load x4 from mem
||		MVC	.S2	A9,AMR			; move A9 into AMR

		LDDW	.D2	*B7++[1],B1:B0		; @ load x1:x0 from mem

		LDDW	.D1	*A8++[1],A5:A4		; @ load h1:h0 from mem

		LDDW	.D2	*B7,B5:B4		; @ load x3:x2 from mem
||		MPYSP	.M1X	B1,A5,A9		; prod1 = x1 * h1
||		MPYSP	.M2X	B0,A4,B6		; prod0 = x0 * h0	

		LDW	.D2	*+B7[2],A3		; @ load x4 from mem
||		MPYSP	.M1X	B4,A5,A9		; prod3 = x2 * h1
||		MPYSP	.M2X	B1,A4,B6		; prod2 = x1 * h0

* OLOOP: top of the outer loop.  Issues the last two pairs of multiplies for
* the first tap pair, starts the next x/h loads (only while the load counter
* A1 is non-zero), clears the accumulators A7/B8 and branches to LOOP.
OLOOP:

	[A1]	LDDW	.D2	*B7++[1],B1:B0		; @@ load x1:x0 from mem
||		MPYSP	.M1X	B5,A5,A9		; prod5 = x3 * h1
||		MPYSP	.M2X	B4,A4,B6		; prod4 = x2 * h0
||		B	.S1	LOOP			; branch to LOOP (unpredicated, always taken)

* NOTE(review): the .M2 multiply below reads A4 (A-side) without the X
* suffix; the same multiply inside LOOP is written .M2X - confirm.
	[A1]	LDDW	.D1	*A8++[1],A5:A4		; @@ load h1:h0 from mem
||		MPYSP	.M1	A3,A5,A9		; prod7 = x4 * h1
||		MPYSP	.M2	B5,A4,B6		; prod6 = x3 * h0
||		ZERO	.S1	A7			; sum1 = 0 (A7 is shared by sum1/3/5/7)
||		ZERO	.S2	B8			; sum0 = 0 (B8 is shared by sum0/2/4/6)

*****************loop begins*************************	
* LOOP: four-packet inner loop kernel.  Each pass accumulates the eight
* products issued earlier into A7 (odd sums) and B8 (even sums), issues the
* eight multiplies for the next tap pair, and - while the load counter A1 is
* non-zero - loads the following x and h values.  B2 (icntr) controls the
* loop-back branch; when B2 is zero the x and h pointers are pulled back by
* xreset (B9).
LOOP:

	[A1]	LDDW	.D2	*B7,B5:B4		; @@ load x3:x2 from mem
||		MPYSP	.M1X	B1,A5,A9		; @ prod1 = x1 * h1
||		MPYSP	.M2X	B0,A4,B6		; @ prod0 = x0 * h0
||		ADDSP	.L1	A7,A9,A7		; sum1 = prod1 + sum1	
||		ADDSP	.L2	B8,B6,B8		; sum0 = prod0 + sum0

	[A1]	LDW	.D2	*+B7[2],A3		; @@ load x4
||		MPYSP	.M1X	B4,A5,A9		; @ prod3 = x2 * h1
||		MPYSP	.M2X	B1,A4,B6		; @ prod2 = x1 * h0
||		ADDSP	.L1	A7,A9,A7		; sum3 = prod3 + sum3
||		ADDSP	.L2	B8,B6,B8		; sum2 = prod2 + sum2
||	[A1]	SUB	.S1	A1,2,A1 		; if(lcntr) lcntr -= 2 (A1 gates the loads)

	[A1]	LDDW	.D2	*B7++[1],B1:B0		; @@@ load x1:x0 from mem
||		MPYSP	.M1X	B5,A5,A9		; @ prod5 = x3 * h1
||		MPYSP	.M2X	B4,A4,B6		; @ prod4 = x2 * h0
||		ADDSP	.L1	A7,A9,A7		; sum5 = prod5 + sum5	
||		ADDSP	.L2	B8,B6,B8		; sum4 = prod4 + sum4	
||	[B2]	B	.S1	LOOP			; if(icntr) branch to LOOP

	[A1]	LDDW	.D1	*A8++[1],A5:A4		; @@@ load h1:h0 from mem
||		MPYSP	.M1	A3,A5,A9		; @ prod7 = x4 * h1
||		MPYSP	.M2X	B5,A4,B6		; @ prod6 = x3 * h0 
||		ADDSP	.L1	A7,A9,A7		; sum7 = prod7 + sum7	
||		ADDSP	.L2	B8,B6,B8		; sum6 = prod6 + sum6	
||	[B2]	SUB	.D2	B2,2,B2 		; if(icntr) icntr -= 2
||	[!B2]	SUB	.S2	B7,B9,B7		; o if(!icntr) ptr_x -= xreset
||	[!B2]	SUB	.S1X	A8,B9,A8		; o if(!icntr) ptr_h -= xreset

******************loop ends**************************

* Outer loop tail: combines the even (B8) and odd (A7) partial sums into four
* outputs, stores them through A6, and - while ocntr (A2) is non-zero -
* re-primes the pipeline (loads, counters, first multiplies) for the next
* four outputs before branching back to OLOOP.  When A2 is zero it returns
* through B3 and writes zero to AMR.
* Comment prefixes kept from the original: "o" lines finish the current
* outputs, "p" lines prime the next pass.
		ADDSP	.L1X	B8,A7,A7		; o temp1 = sum0 + sum1	
||		SUB	.D1	A8,16,A8		; ptr_h -= 16	
||		LDDW	.D2	*B7++[1],B1:B0		; p load x1:x0 from mem (no predicate, unlike the loads below)

		ADDSP	.L2X	B8,A7,B8		; o temp2 = sum2 + sum3
||	[A2]	LDDW	.D1	*A8++[1],A5:A4		; p load h1:h0 from mem

		ADDSP	.L1X	B8,A7,A7		; o temp3 = sum4 + sum5
||	[A2]	B	.S1	OLOOP			; o if(ocntr) branch to OLOOP
||	[!A2]	B	.S2	B3			; f if(!ocntr) return
||	[A2]	LDDW	.D2	*B7,B5:B4		; p load x3:x2 from mem

		ADDSP	.L2X	B8,A7,B8		; o temp4 = sum6 + sum7
||	[A2]	LDW	.D2	*+B7[2],A3		; p load x4 from memory

		STW	.D1	A7,*A6++[2]		; o store temp1, then A6 += 2 words
||	[A2]	LDDW	.D2	*B7++[1],B1:B0		; p load x1:x0 from memory
||		MV	.S2X	A6,B6			; o b6 = a6 (B-side copy of the output pointer)

		STW	.D2	B8,*+B6[1]		; o store temp2 at word offset 1 from B6
|| 	[A2]	LDDW	.D1	*A8++[1],A5:A4		; p load h1:h0 from memory
||	[A2]	MV	.S1	A0,A1			; p lcntr = ireset
||	[A2]	MV	.S2X	A0,B2			; p icntr = ireset

		STW	.D1	A7,*A6++[1]		; o store temp3
||	[A2]	LDDW	.D2	*B7,B5:B4		; p load x3:x2 from memory
||	[A2]	MPYSP	.M1X	B1,A5,A9		; p prod1 = x1 * h1
||	[A2]	MPYSP	.M2X	B0,A4,B6		; p prod0 = x0 * h0
||	[!A2]	ZERO	.S2	B6			; if (!ocntr) B6 = 0 (value written to AMR below)

		STW	.D1	B8,*A6++[1]		; o store temp4
||	[A2]	SUB	.S1	A2,4,A2			; o if(ocntr) ocntr -= 4
||	[A2]	LDW	.D2	*+B7[2],A3		; p load x4 from memory
||	[A2]	MPYSP	.M1X	B4,A5,A9		; p prod3 = x2 * h1
||	[A2]	MPYSP	.M2X	B1,A4,B6		; p prod2 = x1 * h0
||	[!A2]	MVC	.S2	B6,AMR			; if (!ocntr) move B6 (zero) into AMR

**************** outer loop ends *********************
			
B_END:

*** END Benchmark Timing ***	
