*===============================================================================
*
*	TEXAS INSTRUMENTS, INC.		
*
*	CROSS-CORRELATION
*
*	Revision Date: 3/13/98
*	
*	USAGE
*
*		This routine is C Callable and can be called as:
*
*		void crosscor(float *a, float *b, float *r, short nb, short nr)
*
*		a =	pointer to real input vector of size = nr+nb-1
*			a typically contains input data (x) padded with 
*			nb - 1 consecutive zeros at the beginning and end.
*		b =	pointer to real input vector of size nb in forward order. 
*			b typically contains the filter coefs (h)
*		r =	pointer to real output vector of size nr
*		nb=	number of elements in vector b. NOTE: nb <= nr. nb is 
*			typically denoted m in convolution formulas. nb must be a 
*			MULTIPLE of 2
*		nr=	number of elements in vector r. nr must be a MULTIPLE of 4
*
*		If routine is not to be used as a C callable function then
*		you need to initialize values for all of the values passed
*		as these are assumed to be in registers as defined by the 
*		calling convention of the compiler, (refer to the C compiler
*		reference guide).
*
*		ARGUMENTS PASSED   ->   REGISTER USED
*		-------------------------------------
*		a                  ->   A4
*		b                  ->   B4
*		r                  ->   A6
*		nb                 ->   B6
*		nr                 ->   A8
*
*	C CODE
*
*		void crosscor(float *a, float *b, float *r, short nb, short nr)
*		{
*			short	i, j;
*			float	accum;
*		
*			for (i = 0; i < nr ; i++)
*			{
*				accum = 0 ;
*		
*				for (j = 0; j < nb; j++)
*					accum += b[j] * a[i+j];
*		
*				r[i] = accum ;
*			}
*		}
*
*		This is the C equivalent of the assembly code.  Note that
*		the assembly code is hand optimized and restrictions may
*		apply.
*
*	DESCRIPTION
*
*	  	This function calculates the full-length biased cross-correlation
*		of real vectors a and b using time-domain techniques. The result
*		is placed in real vector r.
*		
*	TECHNIQUES
*
*		The inner loop is unrolled twice and software pipelined and the 
*		outer loop is unrolled 4 times.
*
*		Input vector a is assumed to be padded with zeros which provides 
*		symmetry and allows unrolling of inner loop.
*
*		This routine computes biased (raw) cross-correlation.
*
*		Registers are shared by variables whenever possible to save on
*		register usage.
*
*	ASSUMPTIONS
*
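*		nb is a multiple of 2 and greater than or equal to 2 (2,4,6,...)
*		NOTE(review): the inner-loop counter is initialised to nb - 4 and
*		stepped down by 2 until it reaches zero, so nb = 2 (counter = -2)
*		does not appear to terminate as intended; the code seems to
*		need nb >= 4 -- confirm before relying on nb = 2.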
*		nr is a multiple of 4 and greater than or equal to 4 (4,8,12,...)
*
*		It is assumed that input vector a is padded with nb-1 consecutive 
*		zeros at the beginning and end.
*		
*		Arrays a,b and r are aligned on the same (odd/even) word boundary
*
*	MEMORY NOTE
*
*		No memory bank hits if array alignment restrictions hold
*
*	CYCLES
*
*		(nb/2)*nr + (nr/2)*5 + 8
*
*================================================================================

		.global	_crosscor
		.text
;-------------------------------------------------------------------------------
; _crosscor -- r[i] = sum over j of b[j] * a[i+j], for i = 0 .. nr-1
;
; Four consecutive outputs r[i..i+3] are produced per outer-loop pass; the
; inner loop consumes two b coefficients (one LDDW of b) per kernel pass.
;
; Register roles (names are those used in the per-line comments):
;   A4      a        start of the a window for the NEXT outer pass
;   A3      a_save   running read pointer into a for the current outer pass
;   B4      b        base of b (never modified)
;   A6      b_save   running read pointer into b (r on entry, copied to B0
;                    in the first packet, then overwritten with b)
;   B0      r1       running write pointer into r
;   B6      nb       (never modified)
;   A8      nr on entry; afterwards reused as aa2
;   A2      ocntr    outer counter, nr-4 stepping by 4; also predicates all
;                    "next outer pass" prolog work in the epilogue
;   B2      icntr    inner counter, nb-4 stepping by 2; predicates the
;                    kernel back-branch
;   B1      lcntr    load counter, nb-4 stepping by 2; predicates the
;                    look-ahead loads so they stop once all of b is fetched
;   A1:A0, A9:A8, A10   aa0..aa4  five consecutive a values
;   B9:B8               bb0, bb1  two consecutive b values
;   A7, B7              products (A side uses bb0, B side uses bb1)
;   A5, B5              accumulators (A side / B side)
;   A10 is saved on the stack at entry and restored before returning.
;
; Markers in the per-line comments: "p" = pipeline prolog, "o" = outer-loop
; epilogue, "f" = one-time function setup, "@" count = how many kernel
; passes ahead the load is being issued for.
;
; NOTE(review): the schedule depends on instruction latencies (loads, MPYSP,
; ADDSP and branches all deliver their effect several packets after issue).
; A5/B5 and A7/B7 each carry four independent values in flight, one per
; kernel packet, which is presumably why a single register can stand for
; acc1..acc4 (resp. acc5..acc8).  Do not reorder or regroup packets without
; re-deriving the timing -- confirm latencies against the CPU reference.
;
; NOTE(review): icntr/lcntr start at nb-4 and are only stepped while non-zero,
; so nb = 2 (start value -2) never reaches zero; nb >= 4 appears to be the
; real minimum -- confirm.
;-------------------------------------------------------------------------------
_crosscor:

; Push A10 (used below as aa4) onto the stack addressed by B15.
	STW	.D2T1	A10,*B15--(4)

* BEGIN BENCHMARK TIMING

; One-time setup packet.  The two MVs are in the same packet, so the second
; one still reads the incoming r from A6 while the first replaces it with b.
	LDDW	.D1	*A4++,A1:A0	; p @ aa1:aa0 = *a++
||	MV	.L1X	B4,A6		; f b_save = b
||	MV	.L2X	A6,B0		; f r1 = r

; a_save is taken AFTER the post-increment above, i.e. it points at a[2].
	LDDW	.D1T2	*A6++,B9:B8	; p @ bb1:bb0 = *b_save++
||	MV	.L1	A4,A3		; p a_save = a

; NOTE(review): this load targets A9:A8 while A8 still holds nr; nr is read
; by the SUB four packets below, presumably before the loaded data lands.
	LDDW	.D1	*A3,A9:A8	; p @ aa3:aa2 = *a_save

	LDW	.D1	*+A3[2],A10	; p @ aa4 = *+a_save[2]

; Second look-ahead set of loads (data for the first kernel pass).
	LDDW	.D1	*A3++,A1:A0	; p @@ aa1:aa0 = *a_save++

	LDDW	.D1T2	*A6++,B9:B8	; p @@ bb1:bb0 = *b_save++

; First products for outputs 0..3 start here; a is advanced so that, together
; with the LDDW post-increment above, it points 4 elements past the start.
	LDDW	.D1	*A3,A9:A8	; p @@ aa3:aa2 = *a_save
||	MPYSP	.M1X	A0,B8,A7	; p prod1 = aa0 * bb0
||	MPYSP	.M2X	A1,B9,B7	; p prod5 = aa1 * bb1
||	ADD	.L1	A4,8,A4		; p a = a + 8
||	SUB	.S1	A8,4,A2		; f ocntr = nr - 4

	LDW	.D1	*+A3[2],A10	; p @@ aa4 = *+a_save[2]
||	MPYSP	.M1X	A1,B8,A7	; p prod2 = aa1 * bb0
||	MPYSP	.M2X	A8,B9,B7	; p prod6 = aa2 * bb1
||	SUB	.L2	B6,4,B1		; f lcntr = nb - 4

;**	
; Outer loop entry: one pass per group of four outputs.  The unconditional
; branch to iloop issued here is what sends control back to iloop after the
; first kernel pass (NOTE(review): so the kernel body appears to execute at
; least twice per outer pass -- confirm against branch delay slots).
oloop:

  [B1]	LDDW	.D1	*A3++,A1:A0	; p @@@ aa1:aa0 = *a_save++
||	MPYSP	.M1X	A8,B8,A7	; p prod3 = aa2 * bb0
||	MPYSP	.M2X	A9,B9,B7	; p prod7 = aa3 * bb1
||	B	.S2	iloop		; p Branch to inner loop
||	ZERO	.L1	A5		; p acc1=acc2=acc3=acc4 = 0
||	ZERO	.L2	B5		; p acc5=acc6=acc7=acc8 = 0

  [B1]	LDDW	.D1T2	*A6++,B9:B8	; p @@@ bb1:bb0 = *b_save++
||	MPYSP	.M1X	A9,B8,A7	; p prod4 = aa3 * bb0
||	MPYSP	.M2X	A10,B9,B7	; p prod8 = aa4 * bb1
||	SUB	.L2	B6,4,B2		; p icntr = nb - 4

; Kernel Loop Begins
; Four packets per pass.  Each packet issues two multiplies for the current
; b pair and two adds that fold in products issued earlier; packet k of the
; pass works on output k (A side: bb0 term, B side: bb1 term).

iloop:
  [B1]	LDDW	.D1	*A3,A9:A8	; @@@ aa3:aa2 = *a_save
||	MPYSP	.M1X	A0,B8,A7	; prod1 = aa0 * bb0
||	MPYSP	.M2X	A1,B9,B7	; prod5 = aa1 * bb1
||	ADDSP	.L1	A5,A7,A5	; acc1 = acc1 + prod1
||	ADDSP	.L2	B5,B7,B5	; acc5 = acc5 + prod5

; lcntr is decremented in the same packet as a load it predicates; that load
; still sees the value from before the decrement.
  [B1]	LDW	.D1	*+A3[2],A10	; @@@ aa4 = *+a_save[2]
||	MPYSP	.M1X	A1,B8,A7	; prod2 = aa1 * bb0
||	MPYSP	.M2X	A8,B9,B7	; prod6 = aa2 * bb1
||	ADDSP	.L1	A5,A7,A5	; acc2 = acc2 + prod2
||	ADDSP	.L2	B5,B7,B5	; acc6 = acc6 + prod6
||[B1]	SUB	.S2	B1,2,B1		; lcntr = lcntr - 2


; Back-branch and icntr update share a packet, so the branch tests icntr
; before it is decremented.
  [B1]	LDDW	.D1	*A3++,A1:A0	; @@@@ aa1:aa0 = *a_save++
||	MPYSP	.M1X	A8,B8,A7	; prod3 = aa2 * bb0
||	MPYSP	.M2X	A9,B9,B7	; prod7 = aa3 * bb1
||	ADDSP	.L1	A5,A7,A5	; acc3 = acc3 + prod3
||	ADDSP	.L2	B5,B7,B5	; acc7 = acc7 + prod7
||[B2]	B	.S1	iloop		; branch to inner loop
||[B2]	SUB	.S2	B2,2,B2		; icntr = icntr - 2

; Once icntr has reached zero, a_save is rewound to the start of the next
; outer pass's window so the epilogue can begin loading for it.
  [B1]	LDDW	.D1T2	*A6++,B9:B8	; @@@@ bb1:bb0 = *b_save++
||	MPYSP	.M1X	A9,B8,A7	; prod4 = aa3 * bb0
||	MPYSP	.M2X	A10,B9,B7	; prod8 = aa4 * bb1
||	ADDSP	.L1	A5,A7,A5	; acc4 = acc4 + prod4 
||	ADDSP	.L2	B5,B7,B5	; acc8 = acc8 + prod8
||[!B2]	MV	.S1	A4,A3		; a_save = a


; Kernel Loop Ends 
; Outer-loop epilogue: combine the A-side and B-side partial sums for each of
; the four outputs, store them to r, and -- only while ocntr (A2) is non-zero
; -- overlap the prolog (loads and first products) of the next outer pass.

	ADDSP	.L2X	A5,B5,B5	; o acc5 = acc1 + acc5
||[A2]	LDDW	.D1	*A3++,A1:A0	; p @ aa1:aa0 = *a_save++
||[A2]	MV	.S1X	B4,A6		; p b_save = b

; This sum is written to A5 (not B5); it is the one stored via the T1 path
; in the second STW below.
	ADDSP	.L1X	A5,B5,A5	; o acc2 = acc2 + acc6
||[A2]	LDDW	.D1T2	*A6++,B9:B8	; p @ bb1:bb0 = *b_save++
||[A2]	SUB	.S2	B6,4,B1		; p lcntr = nb - 4

; The outer back-branch is issued here so that it takes effect after the last
; STW packet below.
	ADDSP	.L2X	A5,B5,B5	; o acc7 = acc3 + acc7
||[A2]	B	.S2	oloop		; branch to outer loop
||[A2]	LDDW	.D1	*A3,A9:A8	; p @ aa3:aa2 = *a_save

; a is advanced by 8 bytes here and again in the next packet: 16 bytes
; (four floats) per outer pass in total.
	ADDSP	.L2X	A5,B5,B5	; o acc8 = acc4 + acc8
||[A2]	LDW	.D1	*+A3[2],A10	; p @ aa4 = *+a_save[2]
||	ADD	.S1	A4,8,A4		; p a = a + 8

	STW	.D2	B5,*B0++	; o *r1++ = acc5
||[A2]	LDDW	.D1	*A3++,A1:A0	; p @@ aa1:aa0 = *a_save++
||	ADD	.S1	A4,8,A4		; p a = a + 8

	STW	.D2T1	A5,*B0++	; o *r1++ = acc2
||[A2]	LDDW	.D1T2	*A6++,B9:B8	; p @@ bb1:bb0 = *b_save++

	STW	.D2	B5,*B0++	; o *r1++ = acc7
||[A2]	LDDW	.D1	*A3,A9:A8	; p @@ aa3:aa2 = *a_save
||[A2]	MPYSP	.M1X	A0,B8,A7	; p prod1 = aa0 * bb0
||[A2]	MPYSP	.M2X	A1,B9,B7	; p prod5 = aa1 * bb1

; ocntr is only decremented while non-zero, so it stops at zero on the final
; outer pass and all [A2]-predicated work above is skipped from then on.
	STW	.D2	B5,*B0++	; o *r1++ = acc8
||[A2]	SUB	.L1	A2,4,A2		; o ocntr = ocntr - 4
||[A2]	LDW	.D1	*+A3[2],A10	; p @@ aa4 = *+a_save[2]
||[A2]	MPYSP	.M1X	A1,B8,A7	; p prod2 = aa1 * bb0
||[A2]	MPYSP	.M2X	A8,B9,B7	; p prod6 = aa2 * bb1

; Branch to outer loop occurs here 
* END BENCHMARK TIMING

; Return through B3 and restore A10 in the same packet; the NOP 5 fills the
; packets before the branch takes effect.
	B       .S2     B3
||	LDW	.D2T1	*++B15(4),A10	; Pop A10
	NOP		5