*===============================================================================
*
*      TEXAS INSTRUMENTS, INC.
*
*      BIQUAD (Cascaded 2nd Order IIR filters)
*
*      Revision Date:  02/26/98
*	
*      USAGE   This routine is C Callable and can be called as:
*		
*      float biquad(int numBiquad, float *c, float *d, float y)
*
*		numBiquad is the number of cascaded 2nd order IIR filters.
*		c is pointer to an array holding the filter coefficients.
*		d is pointer to an array holding the biquad delayed states.
*		y is the original input value
*
*		If the routine is not to be used as a C callable function,
*		then you need to initialize values for all of the parameters
*		passed to the function since these are assumed to be in
*		registers as defined by the calling convention of the
*		compiler, (refer to the TMS320C6x Optimizing C Compiler
*		User's Guide).
*
*      C CODE
*		This is the C equivalent for the assembly code.  Note that
*		the assembly code is hand optimized and restrictions may
*		apply.
*
*
*	float biquad(int numBiquad, float *c, float *d, float y)
*	{
*	   int i;
*	   float sum0;
*	   float y0;
*	   float d0, d1, c0, c1, c2, c3;
*
*	   for(i=0; i<numBiquad; i++)
*	   {
*	      d0 = d[2*i+0];
*	      d1 = d[2*i+1];
*	      c0 = c[4*i+0];
*	      c1 = c[4*i+1];
*	      c2 = c[4*i+2];
*	      c3 = c[4*i+3];
*
*	      sum0 = c1*d1  + c0*d0   + y;
*	      y   = c3*d1   + c2*d0   + sum0;
*
*	      d[2*i+1] = d0;
*	      d[2*i+0] = sum0;
*	   }
*	   return y;
*	}
*
*      DESCRIPTION
*
*		This routine implements "numBiquad" cascaded 2nd order IIR
*		filters (Biquads).  There are four filter coefficients per
*		biquad that are stored in the 'c' array in the following order
*		{-a1, -a2, b1, b2, -a1', -a2', b1', b2', ...}.  There are two
*		delayed states per biquad that are stored in the 'd' array in
*		the following order {d(n-1), d(n-2), d'(n-1), d'(n-2), ...}.
*		For the 'd' and 'c' array, the first set of values are for
*		the first biquad.  There is one x(n) input to the first biquad.
*		The output y(n) of the first biquad is the input to the
*		next biquad, etc.  The function returns the single y'(n)
*		output of the last cascaded biquad.  The new delayed states
*		are saved back to the 'd' array for each biquad so that the
*		next input value x(n+1) can be processed with another call
*		to the biquad function {d(n), d(n-1), d'(n), d'(n-1), ...}.
*
*		The routine uses the following formulas for the first two
*		biquads:
*		d(n) = - a2*d(n-2) - a1*d(n-1) + x(n)
*		y(n) =	 b2*d(n-2) + b1*d(n-1) + b0*d(n)
*
*		d'(n) = - a2'*d'(n-2) - a1'*d'(n-1) + y(n)
*		y'(n) =   b2'*d'(n-2) + b1'*d'(n-1) + b0'*d'(n)
*
*		This routine assumes that all b0=b0'=a0=a0'=1 which will be
*		the case if your original IIR filter has a0=b0=1 before
*		you factor it into multiple 2nd order IIR filters.  If
*		a0 and b0 are not 1, divide the other coefficients
*		by b0.
*
*		a0*y(n) = b0*x(n) + b1*x(n-1) + ... +
*				  - a1*y(n-1) - ...
*
*		to get the following new coefficients:
*
*		y'(n) = a0/b0*y(n) = x(n) + b1'*x(n-1) + ... +
*					  - a1'*y(n-1) - ...
*
*		This means that your final answer y'(n) needs to be scaled
*		by b0/a0 to get the correct y(n).
*
*      TECHNIQUES
*
*		1.  LDDW instructions are used to load two SP floating point 
*		    values simultaneously.
*		2.  The loop is software pipelined.
*		3.  The variables prod1 and sum0 share a register.
*		    The variables c0 and ntemp0 share a register.
*		    The variables prod2 and tmp0 share a register.
*		    The variables c2 and temp1 share a register.
*		4.  The function prolog is scheduled in parallel with the
*		    loop prolog.
*		5.  A load counter is used so that an epilog is not needed.
*		    No extraneous loads are performed.
*		6.  A move instruction is used to solve a live-too-long
*		    problem with "temp0".
*		7.  The algorithm is changed to eliminate the cyclical
*		    dependency between "sum0" and "y0".  This adds an
*		    additional ADDSP instruction, but allows a 4 cycle
*		    kernel (dependent on the running sum of "y0") rather
*		    than an 8 cycle kernel.
*		8.  Two address pointers are used into the 'c' array
*		    to allow loading 'c0' and 'c1' in parallel with
*		    'c2' and 'c3'.
*		9.  Since the store of 'd0' is dependent on the load
*		    counter, one last store is required after the loop
*		    completes.	This also requires the use of a second
*		    address pointer into 'd' to store 'sum0'.
*	       10.  The prolog size is reduced by priming the loop.  This
*		    requires that a store counter be used for the store
*		    of sum0.
*	
*      ASSUMPTIONS
*
*		1.  Little Endian is assumed for LDDW instructions.
*		2.  The numBiquad must be greater than or equal to 2
*		    (2, 3, 4, ...).
*		3.  Since single assignment of registers is not used,
*		    interrupts should be disabled before this function is 
*                   called.
*		
*      MEMORY NOTE
*
*		There are no memory bank hits regardless of where the
*		function parameters are placed in memory.
*
*		The 'c' and 'd' arrays must be aligned on double word (64-bit)
*		boundaries since LDDW instructions are used to load
*		two SP floating point values at a time.
*
*      ARGUMENTS PASSED
*
*		numBiquad ->  A4
*		c	  ->  B4 = ptr_c
*		d	  ->  A6 = ptr_d
*		y	  ->  B6 = y0
*
*	CYCLES
*
*		4*(numBiquad) + 29     with C overhead
*		4*(numBiquad) + 29     without C overhead
*
*	NOTATIONS
*
*		f = Function Prolog or Epilog
*
*===============================================================================
	.global _biquad
	.text

_biquad:

; ------------------------------------------------------------------------------
; _biquad - cascaded 2nd-order IIR sections, one input sample per call.
;
; On entry:  A4 = numBiquad, B4 = c, A6 = d, B6 = y
; On exit:   A4 = y0 (filter output); the 'd' array holds the updated states.
;
; Register roles (as named by the per-instruction comments below):
;   A6     ptr_d   load pointer into d; also the base for the d0 stores
;   A7     ptr_d2  store pointer into d for sum0
;   A0     ptr_ca  pointer used for the c1:c0 loads
;   B7     ptr_cb  pointer used for the c3:c2 loads (starts at ptr_c + 8)
;   A1     cntr    loop-branch counter  (numBiquad + 2)
;   B1     lcntr   load counter         (numBiquad - 1)
;   B0     scntr   store counter for sum0 (starts at 4)
;   B9:B8  d1:d0     A3:A2  c1:c0     B3:B2  c3:c2
;   A9 prod0, A5 prod1 and sum0, B4 prod2 and tmp0, B5 prod3
;   A8 temp0, A2 also ntemp0, B2 also temp1, B6 y0
;   A4     numBiquad on entry, then a copy of B3, finally the return value
;
; The code is software pipelined and several registers are deliberately
; shared between values, so instruction order and execute-packet grouping
; must not be changed without re-deriving the schedule.
; NOTE(review): the "@" marks in the comments look like pipeline-iteration
; tags; they are not defined in this file's NOTATIONS section - confirm.
; ------------------------------------------------------------------------------
*** BEGIN Benchmark Timing ***
B_START:
* Prolog Begins ****************************************************************
; First d pair is requested while the three working pointers are derived
; from the incoming argument registers.
	LDDW	.D1	*A6++[1],B9:B8	; load d1:d0
||	MV	.S1	A6,A7		; f ptr_d2 = ptr_d
||	MV	.L1	B4,A0		; f ptr_ca = ptr_c
||	ADD	.L2	B4,8,B7 	; f ptr_cb = ptr_c + 8

; Both coefficient pairs are requested in parallel (one load per data unit);
; numBiquad in A4 is consumed here to set up the two counters.
	LDDW	.D1	*A0++[2],A3:A2	; load c1:c0
||	LDDW	.D2	*B7++[2],B3:B2	; load c3:c2
||	ADD	.L1	A4,2,A1 	; f cntr  = numBiquad + 2
||	SUB	.L2X	A4,1,B1 	; f lcntr = numBiquad - 1

; B3 is the high half of the c3:c2 load destination above, so its incoming
; value is copied to A4 (no longer needed for numBiquad) for the return branch.
	MV	.S1X	B3,A4		; f A4 = B3 (return address)

; The sum0 store in the kernel is suppressed while scntr is non-zero.
	MVK	.S2	4,B0		; f scntr = 4

; Second set of loads (next biquad's state and coefficients).
	LDDW	.D1	*A6++[1],B9:B8	; @ load d1:d0

	LDDW	.D1	*A0++[2],A3:A2	; @ load c1:c0
||	LDDW	.D2	*B7++[2],B3:B2	; @ load c3:c2
||	B	.S2	LOOP		; branch to LOOP
||	ZERO	.S1	A8		; prime the loop; temp0 = 0

; First multiplies are issued while every register that feeds a not-yet-valid
; pipeline stage is cleared to zero.
; NOTE(review): A9 and B4 are targeted by both a ZERO and an MPYSP in this
; packet; this presumably relies on the multiply result arriving later than
; the ZERO result - confirm against the C6x pipeline documentation.
	MPYSP	.M1X	A2,B8,A9	; prod0 = c0 * d0
||	MPYSP	.M2	B2,B8,B4	; prod2 = c2 * d0
||	ZERO	.D1	A2		; prime the loop; ntemp0 = 0
||	ZERO	.L1	A5		; prime the loop; p1 = sum0 = 0
||	ZERO	.S1	A9		; prime the loop; p0 = 0
||	ZERO	.D2	B2		; prime the loop; temp1 = 0
||	ZERO	.L2	B4		; prime the loop; p2 = tmp0 = 0
||	ZERO	.S2	B5		; prime the loop; p3 = 0
******* Loop Begins ************************************************************
LOOP:
; Kernel packet 1: write the old d0 into the d1 slot (gated by lcntr), issue
; the d1 multiplies, and add the product pairs into temp0 / temp1.
   [B1] STW	.D1	B8,*-A6[3]	; @@@@ if(lcntr) store d0
|| [B1] SUB	.D2	B1,1,B1 	; if(lcntr) lcntr--
||	MPYSP	.M1X	A3,B9,A5	; @@@@ prod1 = c1 * d1
||	MPYSP	.M2	B3,B9,B5	; @@@@ prod3 = c3 * d1
||	ADDSP	.L1	A9,A5,A8	; @@@ temp0 = prod0 + prod1
||	ADDSP	.L2	B4,B5,B2	; @@@ temp1 = prod2 + prod3

; Kernel packet 2: request the next d pair only while lcntr is non-zero, so
; no load is issued past the end of the 'd' array.
   [B1] LDDW	.D1	*A6++[1],B9:B8	; @@@@@@ if(lcntr) load d1:d0
||	ADDSP	.L2X	A8,B2,B4	; @@ tmp0 = temp0 + temp1

; Kernel packet 3: request the next coefficient pairs (also gated by lcntr),
; form sum0 and update the running y0, and issue the loop-back branch.
   [B1] LDDW	.D1	*A0++[2],A3:A2	; @@@@@@ if(lcntr) load c1:c0
|| [B1] LDDW	.D2	*B7++[2],B3:B2	; @@@@@@ if(lcntr) load c3:c2
||	ADDSP	.L1X	A2,B6,A5	; @ sum0 = ntemp0 + y0
||	ADDSP	.L2	B4,B6,B6	; @ y0 += tmp0
|| [A1] SUB	.S1	A1,1,A1 	; if(cntr) cntr--
|| [A1] B	.S2	LOOP		; if(cntr) branch to LOOP

; Kernel packet 4: store sum0 as the new d0 once scntr has counted down to
; zero, issue the d0 multiplies, and copy temp0 to ntemp0 (A2, the register
; shared with c0).
  [!B0] STW	.D1	A5,*A7++[2]	; if(!scntr) store sum0
||	MPYSP	.M1X	A2,B8,A9	; @@@@@ prod0 = c0 * d0
||	MPYSP	.M2	B2,B8,B4	; @@@@@ prod2 = c2 * d0
||	MV	.S1	A8,A2		; @@ ntemp0 = temp0
|| [B0] SUB	.S2	B0,1,B0 	; if(scntr) scntr--
******* Loop Ends **************************************************************
* We want the second to last B6 value.	Therefore, we need to copy B6 to
* A4 before the last "ADDSP.L2 B4,B6,B6" writes to B6.
********************************************************************************
; The branch and the MV share one execute packet: the branch uses the value
; parked in A4 by the prolog, while the MV replaces A4 with the result.
; NOTE(review): relies on operands being read before results are written
; within an execute packet - confirm against the C6x CPU reference.
; The STW writes B8 (the last biquad's d0) one word below the load pointer,
; i.e. into that biquad's d1 slot; the in-loop d0 store is gated by lcntr and
; does not cover the final biquad.
	B	.S2	A4		; f return from function
||	STW	.D1	B8,*-A6[1]	; f store last d0 as new d1 (d[2*i+1] = d0)
||	MV	.L1X	B6,A4		; f A4 = y0 (return y0)
	NOP		5		; f wait for return
B_END:
*** END Benchmark Timing ***
