*===============================================================================
*
*	TEXAS INSTRUMENTS, INC.
*
*	FIRLMS2
*
*	Revision Date: 5/5/97
*
*	USAGE	This routine is C Callable and can be called as:
*
*		Long40 firlms2(short h[], short x[], short b, int n)
*
*		h = Coefficient Array
*		x = Input Array
*		b = Error from the previous FIR
*		n = Number of coefficients
*
*		If routine is not to be used as a C callable function
*		then all instructions relating to stack should be removed.
*		Refer to comments of individual instructions.  You will also
*		need to initialize values for all of the values passed as these
*		are assumed to be in registers as defined by the calling 
*		convention of the compiler, (refer to the C compiler reference
*		guide).
*
*	C Code 	This is the C equivalent of the Assembly Code without 
*		restrictions.
*
*		Note that the assembly code is hand optimized and restrictions 
*		may apply
*
*		Long40 firlms2(short h[], short x[], short b, int N)
*		{
*			int             i;
*			Long40          y = 0;
*			for (i = 0; i < N; i++) {
*				h[i] += (x[i] * b) >> 16;
*				y += x[i + 1] * h[i];
*			}
*			return y;
*		}
*
*	DESCRIPTION
*		This is a Least Mean Squares (LMS) adaptive FIR filter.  Given
*		the error from the previous sample and a pointer to the next
*		sample, it computes an update of the coefficients and then performs
*		the FIR for the given input.  This routine has no memory hits
*		regardless of where x and h arrays are placed in memory.  This
*		routine assumes 16-bit input and output.
*
*	TECHNIQUES
*		The loop is unrolled once and the number of coefficients must be
*		a multiple of 2.
*
*	MEMORY NOTE
*		This code has no memory hits regardless of where x and h are
*		located in memory.
*
*	CYCLES		1.5*N + 16
*
*******************************************************************************
	.global _firlms2
	.text

_firlms2:
		STW	.D2	B10,	*B15--[2]	; push B10 on the stack
||		MV	.L1X	B15,	A8		; copy stack pointer

*** BEGIN Benchmark Timing ***
B_START

		STW	.D2	A10,	*B15--[2]	; push A10 on the stack
||		STW	.D1	B11,	*--A8		; push B11 on the stack
||		MV	.L1X	B3,	A1		; move return address
||		MV	.L2X	A6,	B5		; copy b

		B	.S1	LOOP			; for i
||		MVK	.S2	4,	B1		; setup priming
||		ADD	.L1X	2,	B4,	A3	; copy x
||		LDH	.D2	*B4++[2],	A0	;**** x0  = *x++, j=1

		ADD	.L2X	A4,	2,	B3	; copy h
||		SHR	.S2	B6,	1,	B0	; n / 2
||		STW	.D2	A11,	*B15		;push A11 on the stack
||		STW	.D1	B12,	*--A8[2]	; push B12 on the stack

		ADD	.S2	1,	B0,	B0	; n/2 + 1
||		SUB	.L1	A10,	A10,	A11:A10 ; y = 0
||		SUB	.L2	B9,	B9,	B9:B8	; y = 0
||		LDH	.D1	*A3++[2],	B2	;**** x0  = *x++, j=0
||		LDH	.D2	*B4++[2],	A0	;**** x0  = *x++, j=1
LOOP:
	[B0]	B	.S1	LOOP			;* for i
||		MV	.L1X	B2,	A5		;* copy x0, j=0
||		MPY	.M2X	1,	A0,	B6	;* copy x0, j=1
||		SHR	.S2	B10,	16,	B10	;* e  = f >> 16, j=1
||		MPY	.M1	A0,	A6,	A9	;** f  = x0 * b, j=0
||		LDH	.D1	*A4++[2],	A2	;*** h0	 = *h++, j=0
||		LDH	.D2	*B3++[2],	B12	;*** h0	 = *h++, j=1
||	[B1]	SUB	.L2	B1,	1,	B1	;* priming count

	[!B1]	STH	.D1	A7,	*-A4[8]		; h[-1] = h1, j=0
||	[!B1]	STH	.D2	B7,	*-B3[8]		; h[-1] = h1, j=1
||		ADD	.S1	A9,	A2,	A7	;* h1  = h0 + e, j=0
||		ADD	.S2	B10,	B12,	B7	;* h1  = h0 + e, j=1
||		MPY	.M2	B2,	B5,	B10	;** f  = x0 * b, j=1

	[B0]	SUB	.S2	B0,	1,	B0	; i++
||	[!B1]	ADD	.L1	A8,	A11:A10,A11:A10 ; y += p, j=0
||	[!B1]	ADD	.L2	B11,	B9:B8,	B9:B8	; y += p, j=1
||		MPY	.M1	A5,	A7,	A8	;* p  = x0 * h1, j=0
||		MPY	.M2	B6,	B7,	B11	;* p  = x0 * h1, j=1
||		SHR	.S1	A9,	16,	A9	;** e  = f >> 16, j=0
||		LDH	.D1	*A3++[2],	B2	;**** x0  = *x++, j=0
||		LDH	.D2	*B4++[2],	A0	;**** x0  = *x++, j=1
; end of LOOP

		LDW	.D2	*B15++,	A11		; pop A11 off the stack
||		MV	.L2X	A1,	B3		; move return address
||		MV	.L1X	B8,	A4		;

		ADD	.L1X	A11,	B9,	A5	; sum sums
||		LDW	.D2	*B15++,	B12		; pop B12 off the stack

		ADDU	.L1	A10,	A5:A4,	A5:A4	; sum sums
||		LDW	.D2	*B15++,	A10		; pop A10 off the stack

B_END:
*** END Benchmark Timing ***

		LDW	.D2	*B15++,	B11		; pop B11 off the stack
||		B	.S2	B3

		LDW	.D2	*B15,	B10		; pop B12 off the stack

		NOP	4
