*===============================================================================
*
*	TEXAS INSTRUMENTS, INC.
*
*	Complex FIR
*
*	Revision Date: 2/3/97
*
*	USAGE	This routine is C Callable and can be called as:
*
*		void fircx(short *x, short *h, short *y, int N, int M)
*		
*		x = input array
*		h = coefficient array
*		y = output array
*		N = number of coefficients (N EVEN >= 2)
*		M = number of output samples (M >= 1)
*
*		If routine is not to be used as a C callable function
*		then all instructions relating to stack should be removed.
*		Refer to comments of individual instructions.  You will also
*		need to initialize values for all of the values passed as these
*		are assumed to be in registers as defined by the calling 
*		convention of the compiler, (refer to the C compiler reference
*		guide).
*
*	C Code 	This is the C equivalent of the Assembly Code without
*		restrictions.  Note that the assembly code is hand optimized and
*		restrictions may apply
*
*		void fircx(short *x, short *h, short *y, short N, short M)
*		{
*		short i,j;
*		int imag, real;
*		
*		for (i = 0; i < 2*M; i += 2){
*			imag = 0;
*			real = 0;
*			for (j = 0; j < 2*N; j += 2){
*				real += h[j] * x[i-j]   - h[j+1] * x[i+1-j];
*				imag += h[j] * x[i+1-j] + h[j+1] * x[i-j];
*				}
*			y[i] = (real >> 15);
*			y[i+1] = (imag >> 15);
*			}
*		}
*
*	DESCRIPTION
*		This complex FIR assumes the number of filter coefficients is a
*		multiple of 2 and the number of output samples times the number
*		of coefficients is at least 4 (see ASSUMPTIONS: N*M >= 4).
*
*		It operates on 16-bit data with a 32-bit accumulate.  This
*		routine has no memory hits regardless of where x, h, and y 
*		arrays are located in memory.  The filter is M output samples
*		and N coefficients.  Each array consists of an even and odd term
*		with even terms representing the real part of the element and
*		the odd terms the imaginary.  
*
*		It is assumed that x points to the Nth element of some complex
*		array (2N shorts) upon entry to the function.
*
*
*	TECHNIQUES
*		The inner loop is unrolled two times thus the number of 
*		filter coefficients must be a multiple of two
*
*		The outer loop is conditionally executed in parallel with the
*		inner loop.  This allows for a zero overhead outer loop.
*
*
*	ASSUMPTIONS
*		N MULTIPLE of 2 >= 2
*		M >= 1
*		N*M >= 4
*
*
*	MEMORY NOTE
*		This code has no memory hits regardless of where x and h are
*		located in memory.
*
*	CYCLES	2*M*N + 10
*
*===============================================================================
	.global _fircx
	.text
*-------------------------------------------------------------------------------
* _fircx -- complex FIR, C callable.
*
* Arguments as read by the code below:
*	A4 = x (input ptr)   B4 = h (coefficient ptr)   A6 = y (output ptr)
*	B6 = N (coefficients)   A8 = M (output samples)
*
* Working registers set up in the prologue:
*	B10     = N << 2, subtracted from B4 to rewind the h pointer
*	A0, B9  = (N << 2) + 4, added to A4 / B11 to re-position the x pointers
*	B11     = second x pointer (starts at A4 - 4)
*	B0      = M*N, outer loop counter (decremented by 4 once, then by 2 per
*	          pass through LOOP)
*	A7      = N;  A1 = inner (per-output) counter, reloaded with N - 2
*	A10     = 0, or N - 4 when N > 4
*	A2      = A1 - A10; the final shift and the stores execute when A2 == 0
*	B6      = y + 2 bytes once N has been consumed (imaginary output ptr)
*	A8/B8, A9/B1 = real/imag partial sums
*
* Registers saved and restored on the stack: B10, A10, B2, B11, A11 (5 words).
* B11 is written by this routine and belongs to the B10-B15 group that the
* C6000 calling convention requires a called function to preserve, so it is
* saved here.  B2 is still saved so that callers which relied on it being
* preserved keep working.
*
* The '*' marks at the start of some comments appear to tag instructions that
* belong to later software-pipeline iterations (one '*' per iteration ahead).
*
* NOTE(review): the last push does not move B15, so B15 points at the word
* holding A11 while the routine runs.  This presumably relies on interrupts
* being disabled around the call -- confirm against the caller.
*-------------------------------------------------------------------------------
_fircx:

	STW	.D2	B10,*B15--	; push register (for c-callable func)

	STW	.D2	A10,*B15--	; push register (for c-callable func)

	STW	.D2	B2,*B15--	; push register (for c-callable func)

	STW	.D2	B11,*B15--	; push B11: callee-preserved, used as x ptr

	STW	.D2	A11,*B15	; push register (for c-callable func)

*** BEGIN Benchmark Timing ***
B_START

	SHL	.S2	B6,2,B10	; used to reset the pointer
||	LDW	.D2	*B4++[2],B5	; h[j] & h[j+1] (real & imag)
||	LDW	.D1	*A4--[2],A5	; x[i-j] & x[i+1-j]
||	SUB	.L2	B6,2,B1		; N - 2

	ADD	.S1X	B10,4,A0	; used to reset the pointer
||	CMPGT	.L2	B6,4,B2		; N > 4?
||	MPY	.M2X	A8,B6,B0	; M*N, loop counter
||	ADD	.S2	B10,4,B9	; used to reset the pointer

  [!B1]	SUB	.S2	B4,B10,B4	; reset the ptr
||	LDW	.D2	*-B4[1],A3	; h[j+2] & h[j+3] (real & imag)
||	LDW	.D1	*+A4[1],B2	; x[i-2-j] & x[i-1-j]
||[!B1]	ADD	.L1	A4,A0,A4	; reset the ptr
||	MV	.S1X	B6,A7		; N
||	MPY	.M1	A10,0,A10	; zero reset counter value
||	SUB	.L2X	A4,4,B11	; &x[i-2-j]

  [!B1]	ADD	.S2	B11,B9,B11	; reset the ptr
||[B1]	SUB	.L1X	B1,2,A1		; decrement inner loop counter
||[!B1]	SUB	.S1	A7,2,A1		;* reset inner loop counter
||	LDW	.D2	*B4++[2],B5	;* h[j] & h[j+1] (real & imag)

	LDW	.D2	*B11--[2],B2	;* x[i-2-j] & x[i-1-j]
||	LDW	.D1	*A4--[2],A5	;* x[i-j] & x[i+1-j]
||[B2]	SUB	.S1	A7,4,A10	; setup reset counter offset
||	ADD	.L2X	A6,2,B6		; set up imOut ptr (y + 2 bytes) in B file; N no longer needed in B6

	MPYLH	.M2X	A5,B5,B8	; x[i-j] * h[j+1] (real * imag)
||	MPY	.M1X	A5,B5,A8	; x[i-j] * h[j]   (real * real)
||[!A1]	ADD	.S1	A4,A0,A4	;* reset the ptr

	MPYHL	.M2X	A5,B5,B7	; x[i+1-j] * h[j]   (imag * real)
||	MPYH	.M1X	A5,B5,A11	; x[i+1-j] * h[j+1] (imag * imag)
||[B0]	B	.S2	LOOP		; branch to the loop
||[!A1]	SUB	.L2	B4,B10,B4	;* reset the ptr
||	LDW	.D2	*-B4[1],A3	;* h[j+2] & h[j+3] (real & imag)

	MPYLH	.M2X	B2,A3,B7	; x[i-2-j] * h[j+3] (real * imag)
||	MPY	.M1X	B2,A3,A11	; x[i-2-j] * h[j+2] (real * real)
||	MVK	.S1	1,A2		; prevent first stores from executing
||[B0]	SUB	.S2	B0,4,B0		; decrement loop counter
||[!A1]	ADD	.L2	B11,B9,B11	;* reset the ptr
||[A1]	SUB	.D1	A1,2,A1		;* decrement inner loop counter
||[!A1]	SUB	.L1	A7,2,A1		;** reset inner loop counter
||	LDW	.D2	*B4++[2],B5	;** h[j] & h[j+1] (real & imag)

LOOP:

	ADD	.L2	B8,B7,B8	; imag
||	SUB	.L1	A8,A11,A8	; real
||	MPYHL	.M2X	B2,A3,B7	; x[i-1-j] * h[j+2] (imag * real)
||	MPYH	.M1X	B2,A3,A11	; x[i-1-j] * h[j+3] (imag * imag)
||	LDW	.D1	*A4--[2],A5	;** x[i-j] & x[i+1-j]
||	LDW	.D2	*B11--[2],B2	;** x[i-2-j] & x[i-1-j]
||[!A2]	SHR	.S2	B1,15,B1	; final y[i+1]
||[!A2]	SHR	.S1	A9,15,A9	; final y[i]

	ADD	.L2	B8,B7,B1	; imag
||	ADD	.L1	A8,A11,A9	; real
||	MPYLH	.M2X	A5,B5,B8	;* x[i-j] * h[j+1] (real * imag)
||	MPY	.M1X	A5,B5,A8	;* x[i-j] * h[j]   (real * real)
||[!A1]	ADD	.S2	B11,B9,B11	;** reset the pointer
||[!A1]	ADD	.S1	A4,A0,A4	;** reset the ptr
||[!A2]	STH	.D2	B1,*B6++[2]	; store imag output, advance 2 halfwords
||[!A2]	STH	.D1	A9,*A6++[2]	; store real output, advance 2 halfwords

	ADD	.L2	B1,B7,B1	; imag
||	SUB	.L1	A9,A11,A9	; real
||	SUB	.D1	A1,A10,A2	; A2 = inner counter - reset offset; 0 enables shift/store
||	MPYHL	.M2X	A5,B5,B7	;* x[i+1-j] * h[j]   (imag * real)
||	MPYH	.M1X	A5,B5,A11	;* x[i+1-j] * h[j+1] (imag * imag)
||[B0]	B	.S1	LOOP		;* branch to the loop
||[!A1]	SUB	.S2	B4,B10,B4	;** reset the ptr
||	LDW	.D2	*-B4[1],A3	;** h[j+2] & h[j+3] (real & imag)

  [A2]	ADD	.L2	B1,B8,B8	; imag (carry sum forward unless output is complete)
||[A2]	ADD	.L1	A9,A8,A8	; real (carry sum forward unless output is complete)
||	MPYLH	.M2X	B2,A3,B7	;* x[i-2-j] * h[j+3] (real * imag)
||	MPY	.M1X	B2,A3,A11	;* x[i-2-j] * h[j+2] (real * real)
||[B0]	SUB	.S2	B0,2,B0		;** decrement loop counter
||[A1]	SUB	.D1	A1,2,A1		;** decrement inner loop counter
||[!A1]	SUB	.S1	A7,2,A1		;*** reset inner loop counter
||	LDW	.D2	*B4++[2],B5	;*** h[j] & h[j+1] (real & imag)
	; Loop ends here

	SHR	.S2	B1,15,B1	; final y[i+1]
||	SHR	.S1	A9,15,A9	; final y[i]
||	LDW	.D2	*B15++,A11	; pop register (for c-callable func)

	STH	.D2	B1,*B6++	; store last imag output
||	STH	.D1	A9,*A6++	; store last real output

B_END:
*** END Benchmark Timing ***

	LDW	.D2	*B15++,B11	; pop B11 (no further use of B11 below)

	LDW	.D2	*B15++,B2	; pop register (for c-callable func)

	LDW	.D2	*B15++,A10	; pop register (for c-callable func)
||	B	.S2	B3		; return

	LDW	.D2	*B15,B10	; pop register; lands within the branch delay slots

	NOP	4

