*===============================================================================
*
*	TEXAS INSTRUMENTS, INC.
*
*	FIR
*
*	Revision Date: 2/3/97
*
*	USAGE	This routine is C Callable and can be called as:
*
*		void fir8(short *x, short *h, short *y, int N, int M)
*		
*		x = input array
*		h = coefficient array
*		y = output array
*		N = number of coefficients (MULTIPLE of 8 >= 8)
*		M = number of output samples (EVEN >= 2)
*
*		If routine is not to be used as a C callable function
*		then all instructions relating to stack should be removed.
*		Refer to comments of individual instructions.  You will also
*		need to initialize all of the argument values yourself, since
*		they are assumed to be in the registers defined by the calling
*		convention of the compiler (refer to the C compiler reference
*		guide).
*
*	C Code 	This is the C equivalent of the Assembly Code without 
*		restrictions.  Note that the assembly code is hand optimized and
*		restrictions may apply
*
*		void fir8(short x[], short h[], short y[], int N, int M)
*		{
*			int i, j, sum;
*			
*			for (j = 0; j < M; j++) {
*				sum = 0;
*				for (i = 0; i < N; i++)
*					sum += x[i + j] * h[i];
*				y[j] = sum >> 15;
*			}
*		}
*
*	DESCRIPTION
*		This FIR assumes the number of filter coefficients is a multiple
*		of 8 and the number of output samples is a multiple of 2.  It
*		operates on 16-bit data with a 32-bit accumulate.  This
*		routine has no memory hits regardless of where x, h, and y 
*		arrays are located in memory.  The filter is M output samples
*		and N coefficients.  The assembly routine performs 2 output
*		samples at a time.
*
*
*	TECHNIQUES
*		The inner loop is unrolled eight times thus the number of 
*		filter coefficients must be a multiple of eight.  The outer
*		loop is unrolled twice so the number of output samples must
*		be a multiple of 2.
*
*		If an odd number of output samples is needed or possible, the
*		final store can either be removed or conditionally executed 
*		depending on whether M is even or odd.  This code would have to 
*		be added to the existing code.
*
*		The outer loop is conditionally executed in parallel with the
*		inner loop.  This allows for a zero overhead outer loop.
*
*		Refer to FIR example in the optimizing assembly chapter of
*		the programmer's guide for more information.
*
*
*	ASSUMPTIONS
*		N MULTIPLE of 8 >= 8
*		M EVEN >= 2
*
*
*	MEMORY NOTE
*		This code has no memory hits regardless of where x and h are
*		located in memory.
*
*	CYCLES	M*N/2+13
*
*===============================================================================
	.global _fir8
	.text
;-------------------------------------------------------------------------------
; _fir8 -- 16-bit FIR filter, two output samples per accumulation pass.
;
; Inputs, as the setup code below consumes them:
;	A4 = x (input array)		B4 = h (coefficient array)
;	A6 = y (output array)		B6 = N (number of coefficients)
;	A8 = M (number of output samples)
; The return branch goes through B3.
;
; Preserved:   A10-A15 and B10-B14 are pushed below the incoming B15 on entry
;	       and reloaded before returning; B15 ends at its entry value.
; Overwritten: A0-A9 and B0-B9, except B3.
;
; Loop bookkeeping registers:
;	B0 = passes remaining, (N/8)*(M/2); predicate of the LOOP branch
;	B2 = priming count, starts at 2; while it is nonzero the B0 decrement
;	     is held off and both accumulators are forced to zero
;	A1 = passes left until the x/h pointers are rewound (reloaded from
;	     A5 = N/8)
;	A2 = store counter; the STH pair and the >>15 shifts fire when it is 0
;	A3 = 2*N-4 and B5 = 2*N, byte amounts used to rewind the pointers
;
; NOTE(review): comments starting with ";*" / ";**" appear to tag instructions
; that belong to a later iteration of the software pipeline -- confirm against
; the programmer's guide before reordering anything inside LOOP.
;-------------------------------------------------------------------------------
_fir8:
; Prologue: B15 and A15 walk down the stack 4 bytes apart in 8-byte steps, so
; one B-side and one A-side register are stored in every cycle.
	STW	.D2	A15,*B15--	; push A15 at the incoming stack pointer
||	SUB	.L1X	B15,8,A15	; A15 = incoming B15 - 8 (A-side push ptr)

	STW	.D2	B14,*B15--[2]	; push register (for c-callable func)
||	STW	.D1	A14,*A15--[2]	; push register (for c-callable func)

	STW	.D2	B13,*B15--[2]	; push register (for c-callable func)
||	STW	.D1	A13,*A15--[2]	; push register (for c-callable func)

	STW	.D2	B12,*B15--[2]	; push register (for c-callable func)
||	STW	.D1	A12,*A15--[2]	; push register (for c-callable func)

*** BEGIN Benchmark Timing ***
B_START

; Setup, sharing execute packets with the last two pairs of pushes.
	MPY	.M2	B6,2,B5		; B5 = 2*N bytes, used to reset h ptr (16*N/8)
||	SHR	.S1X	B6,3,A5		; set pointer reset lp cntr (N/8)
||	ADD	.L2X	A6,2,B6		; point to y[j+1]

	SHR	.S2X	A8,1,B0		; M/2
||	STW	.D2	B11,*B15--[2]	; push register (for c-callable func)
||	STW	.D1	A11,*A15--[2]	; push register (for c-callable func)
||	ZERO	.S1	A2		; clear store lp cntr (set to N/8+1 inside LOOP)

	MV	.L1X	B4,A0		; point to h[0] & h[1]
||	ADD	.S2	B4,4,B14	; point to h[2] & h[3]
||	MV	.L2X	A4,B1		; point to x[j] & x[j+1]
||	ADD	.S1	A4,4,A4		; point to x[j+2] & x[j+3]
||	STW	.D2	B10,*B15--[2]	; push register (for c-callable func)
||	STW	.D1	A10,*A15--[2]	; push register (for c-callable func)

  	LDW	.D1	*A4++[2],B9	; x[j+i+2] & x[j+i+3]
||	LDW	.D2	*B1++[2],A10	; x[j+i+0] & x[j+i+1]
||	MV	.L1	A5,A1		; set pointer reset lp cntr (N/8)
||	MPY	.M2X	A5,B0,B0	; set up loop counter ((N/8)*(M/2))
||	SUB	.S1X	B5,4,A3		; used to reset x ptr (16*N/8-4)
||	ZERO	.L2	B11		; zero out initial accumulator
||	MVK	.S2	2,B2		; initialize loop priming count

; Kernel: eight execute packets per pass. sum0 (A side) accumulates y[j] and
; sum1 (B side) accumulates y[j+1]. The branch back to LOOP is issued from the
; third packet.
LOOP:
  [!A2]	SHR	.S1	A10,15,A12	; (Asum0 >> 15)
||	MPYH	.M2	B7,B9,B13	; p03 = h[i+3]*x[j+i+3]
||[A2]	ADD	.L1	A7,A10,A7	; sum0(p00) = p00 + sum0
||	MPYHL	.M1X	B7,A11,A10	; p13 = h[i+3]*x[j+i+4]
||	ADD	.L2X	A14,B4,B7	; sum1 += p11
||	LDW	.D2	*B14++[2],B7	;* h[i+2] & h[i+3]
||	LDW	.D1	*A0++[2],A8	;* h[i+0] & h[i+1]
||[B2]	SUB	.S2	B2,1,B2		; dec loop priming count

  	ADD	.L1	A10,A7,A13	; sum0 += p01
||	MPYHL	.M2X	A9,B10,B12	; p15 = h[i+5]*x[j+i+6]
||	MPYLH	.M1	A9,A11,A10	; p14 = h[i+4]*x[j+i+5]
||	ADD	.L2	B13,B7,B7	; sum1 +=  p12
||	LDW	.D2	*B1++[2],A11	;* x[j+i+4] & x[j+i+5]
||	LDW	.D1	*A4++[2],B10	;* x[j+i+6] & x[j+i+7]
||[A1]	SUB	.S1	A1,1,A1		;* dec pointer reset lp cntr
||[!B2]	SUB	.S2	B0,1,B0		; dec outer lp cntr (only once priming is done)

  [B0]	B	.S2	LOOP		; Branch outer loop
||	MPY	.M1	A9,A11,A11	; p04 = h[i+4]*x[j+i+4]
||	ADD	.L1X	B9,A13,A13	; sum0 += p02
||	MPYLH	.M2	B8,B10,B13	; p16 = h[i+6]*x[j+i+7]
||	ADD	.L2X	A10,B7,B7	; sum1 += p13
||	LDW	.D1	*A0++[2],A9	;* h[i+4] & h[i+5]
||	LDW	.D2	*B14++[2],B8	;* h[i+6] & h[i+7]
||[!A1]	SUB	.S1	A4,A3,A4	;* reset x ptr (back 2*N-4 bytes)

  	MPY	.M2	B8,B10,B11	; p06 = h[i+6]*x[j+i+6]
||	MPYH	.M1	A9,A11,A11	; p05 = h[i+5]*x[j+i+5]
||	ADD	.L1X	B13,A13,A9	; sum0 += p03
||	ADD	.L2X	A10,B7,B7	; sum1 += p14
||[!A1]	SUB	.S2	B1,B5,B1	;* reset x ptr (back 2*N bytes, +4 added below)
||[!A1]	SUB	.S1	A0,A3,A0	;* reset h ptr (back 2*N-4 bytes, -4 more below)
||	LDH	.D2	*B1,A8		;* x[j+i+8]
||[B2]	ADD	.D1	A5,1,A2		; set store lp cntr (N/8+1)

  [!A2]	MV	.S1	A5,A2		; reset store lp cntr (N/8)
||	MPYH	.M2	B8,B10,B13	; p07 = h[i+7]*x[j+i+7]
||	ADD	.L1	A11,A9,A9	; sum0 += p04
||	MPYHL	.M1X	B8,A8,A9	; p17 = h[i+7]*x[j+i+8]
||	ADD	.S2	B12,B7,B10	; sum1 += p15
||[!A2]	STH	.D2	B11,*B6++[2]	; y[j+1] = (Bsum1 >> 15)
||[!A2]	STH	.D1	A12,*A6++[2]	; y[j] = (Asum0 >> 15)
||	ADD	.L2X	A10,0,B8	;* move to other reg file

  	ADD	.L1	A11,A9,A12	; sum0 += p05
||	ADD	.L2	B13,B10,B8	; sum1 += p16
||	MPYLH	.M2X	A8,B8,B4	;* p10 = h[i+0]*x[j+i+1]
||[!A1]	SUB	.D2	B14,B5,B14	;* reset h ptr
||	MPYHL	.M1X	A8,B9,A14	;* p11 = h[i+1]*x[j+i+2]
||[!A1]	ADD	.S2	B1,4,B1		;* reset x ptr (net +4 bytes after the SUB above)
||[!A1]	SUB	.S1	A0,4,A0		;* reset h ptr (A0 walks h, not x)

  [!B2]	ADD	.L2X	A9,B8,B11	; sum1 += p17
||	ADD	.L1X	B11,A12,A12	; sum0 += p06
||	MPY	.M1	A8,A10,A7	;* p00 = h[i+0]*x[j+i+0]
||	MPYLH	.M2	B7,B9,B13	;* p12 = h[i+2]*x[j+i+3]
||[A2]	SUB	.D1	A2,1,A2		;* dec store lp cntr
||[!A1]	MV	.S1	A5,A1		;* reset pointer reset lp cntr (N/8)
||[B2]	ZERO	.D2	B11		; zero out initial accumulator

  [!B2]	ADD	.L1X	B13,A12,A10	; sum0 += p07
||[!A2]	SHR	.S2	B11,15,B11	;* (Bsum1 >> 15)
||	MPY	.M2	B7,B9,B9	;* p02 = h[i+2]*x[j+i+2]
||	MPYH	.M1	A8,A10,A10	;* p01 = h[i+1]*x[j+i+1]
||[A2]	ADD	.L2	B4,B11,B4	;* sum1(p10) = p10 + sum1
||	LDW	.D1	*A4++[2],B9	;** x[j+i+2] & x[j+i+3]
||	LDW	.D2	*B1++[2],A10	;** x[j+i+0] & x[j+i+1]
||[B2]	ZERO	.S1	A10		; zero out initial accumulator
	; Loop ends here

; Epilogue: shift and store the last output pair, then pop the saved registers
; in the reverse order of the pushes, alternating between B15 and A15.
	SHR	.S1	A10,15,A12	; (Asum0 >> 15)
||	LDW	.D2	*++B15,A10	; pop register (for c-callable func)
||	MV	.L1X	B15,A15		; move stack pointer to A reg file

	STH	.D2	B11,*B6++[2]	; y[j+1] = (Bsum1 >> 15)
||	STH	.D1	A12,*A6++[2]	; y[j] = (Asum0 >> 15)

B_END:
*** END Benchmark Timing ***


	LDW	.D1	*++A15[2],B10	; pop register (for c-callable func)
||	LDW	.D2	*++B15[2],A11	; pop register (for c-callable func)

	LDW	.D1	*++A15[2],B11	; pop register (for c-callable func)
||	LDW	.D2	*++B15[2],A12	; pop register (for c-callable func)

	LDW	.D1	*++A15[2],B12	; pop register (for c-callable func)
||	LDW	.D2	*++B15[2],A13	; pop register (for c-callable func)

; The return branch is issued here; the last pop packet and the NOP 4 below
; occupy the five cycles that follow it (the branch delay slots).
	LDW	.D1	*++A15[2],B13	; pop register (for c-callable func)
||	LDW	.D2	*++B15[2],A14	; pop register (for c-callable func)
||	B	.S2	B3		; return

	LDW	.D1	*++A15[2],B14	; pop register (for c-callable func)
||	LDW	.D2	*++B15[2],A15	; pop A15; B15 is back at its entry value

	NOP 4


