*******************************************************************************
*
*	TEXAS INSTRUMENTS INC.
*
*	COMPLEX FFT (Radix 4)
*
*	Revision Date: 05/28/97
*
*	USAGE This routine is C callable and can be called as
* 		
*	void radix4(int n, short x[], short w[])
*
*		n    --- FFT size (power of 4)			(input)		
*		x[]  --- input and output sequences (dim-n) 	(input/output)
*		w[]  --- FFT coefficients (dim-n)		(input)
* 
*		If the routine is not to be used as a C callable function,
*		then all instructions relating to stack should be removed.
*		Refer to comments of individual instructions. You will also
*		need to initialize values for all the values passed as these
*		are assumed to be in registers as defined by the calling
*		convention of the compiler, (refer to the C compiler reference
*		guide.)
*
*	C CODE
*
*		This is the C equivalent of the Assembly Code without the 
*		assumptions listed below. Note that the assembly code is hand
*		optimized and assumptions apply.
*
*		SOURCE:Burrus, Parks p .113
*
*	void radix4(int n, short x[], short w[])
*	{
*		int             n1, n2, ie, ia1, ia2, ia3, i0, i1, i2, i3, j, k;
*		short           t, r1, r2, s1, s2, co1, co2, co3, si1, si2, si3;
*	
*		n2 = n;
*		ie = 1;
*		for (k = n; k > 1; k >>= 2) {
*			n1 = n2;
*			n2 >>= 2;
*			ia1 = 0;
*			for (j = 0; j < n2; j++) {
*				ia2 = ia1 + ia1;
*				ia3 = ia2 + ia1;
*				co1 = w[ia1 * 2 + 1];
*				si1 = w[ia1 * 2];
*				co2 = w[ia2 * 2 + 1];
*				si2 = w[ia2 * 2];
*				co3 = w[ia3 * 2 + 1];
*				si3 = w[ia3 * 2];
*				ia1 = ia1 + ie;
*				for (i0 = j; i0 < n; i0 += n1) {
*					i1 = i0 + n2;
*					i2 = i1 + n2;
*					i3 = i2 + n2;
*					r1 = x[2 * i0] + x[2 * i2];
*					r2 = x[2 * i0] - x[2 * i2];
*					t = x[2 * i1] + x[2 * i3];
*					x[2 * i0] = r1 + t;
*					r1 = r1 - t;
*					s1 = x[2 * i0 + 1] + x[2 * i2 + 1];
*					s2 = x[2 * i0 + 1] - x[2 * i2 + 1];
*					t = x[2 * i1 + 1] + x[2 * i3 + 1];
*					x[2 * i0 + 1] = s1 + t;
*					s1 = s1 - t;
*					x[2 * i2] = (r1 * co2 + s1 * si2) >> 15;
*					x[2 * i2 + 1] = (s1 * co2-r1 * si2)>>15;
*					t = x[2 * i1 + 1] - x[2 * i3 + 1];
*					r1 = r2 + t;
*					r2 = r2 - t;
*					t = x[2 * i1] - x[2 * i3];
*					s1 = s2 - t;
*					s2 = s2 + t;
*					x[2 * i1] = (r1 * co1 + s1 * si1)  >>15;
*					x[2 * i1 + 1] = (s1 * co1-r1 * si1)>>15;
*					x[2 * i3] = (r2 * co3 + s2 * si3)  >>15;
*					x[2 * i3 + 1] = (s2 * co3-r2 * si3)>>15;
*				}
*			}
*			ie <<= 2;
*		}
*	}
*	
*	DESCRIPTION
*
*		This routine is used to compute FFT of a complex sequence
*		of size n, a power of 4, with "decimation-in-frequency
*		decomposition" method. The output is in digit-reversed 
*		order. Each complex value is stored as interleaved 16-bit real
*		and imaginary parts.
*
*	TECHNIQUES
*	     1. Loading input x as well as coefficient w in word.
*	     2. Both loops j and i0 shown in the C code are placed in the
*	        INNERLOOP of the assembly code.
*
*	ASSUMPTIONS
*		4 <= n <= 65536
*		x is aligned on a 4*N Byte (N*word) boundary
*		w is aligned on an odd word boundary
*		x data is stored in the order real[0], imag[0], real[1], ...
*		w coef is stored in the order k*sin[0*delta], k*cos[0*delta], 
*			k*sin[1*delta], ...  where delta = 2*PI/N, k = 32767
*
*	MEMORY NOTE
*		x must be aligned on a 4*N Byte (N*word) boundary for circular
*		buffering.  w should be aligned on an odd word boundary to
*		minimize memory bank hits.  There are N/4 memory bank hits total
*		
*	CYCLES
*		LOGBASE4(N) * (10 * N/4 + 33) + 7 + N/4
*
*******************************************************************************
	.global _radix4				; make the entry point visible to the linker
	.bss	stack,52			; reserve space for stack
; The 52 bytes reserved above are 13 words.  Slot 0 receives B3 and slots
; 1..12 receive A10,B10,A11,B11,...,A15,B15 (see the STW offsets below and
; in B_START).  This is one fixed area in .bss rather than the C run-time
; stack, so a second, overlapping call would overwrite the saved values.
; NOTE(review): the loops below load data into B14 and B15 and park x in
; IRP; presumably the routine must run with interrupts disabled -- confirm.
	.text

; void radix4(int n, short x[], short w[])
; Going by the operand comments further down, the code expects n in A4,
; x in B4 and w in A6 on entry.
_radix4:
; Build the 32-bit address of the save area in both A0 and B1 (MVK supplies
; the low half, MVKH the high half) so that the .D1 and .D2 units can each
; store one register per cycle.
	MVK	.S1	stack,	A0		; new stack pointer in A0 and B1
||	MVK	.S2	stack,	B1

	MVKH	.S1	stack,	A0		; new stack pointer in A0 and B1
||	MVKH	.S2	stack,	B1

; Slot 0: return address.
	STW	.D2	B3,	*B1		; push return address on stack

; Slots 1 and 2.  The remaining pushes are interleaved with the setup code
; in B_START.
	STW	.D1	A10,	*+A0[1]		; push A10 on stack
||	STW	.D2	B10,	*+B1[2]		; push B10 on stack

*** BEGIN Benchmark Timing ***
B_START:
; One-time setup, overlapped with the remaining register pushes.
; Values left for K_LOOP (per the operand comments):
;   B6  = n2 = n / 4        A7 = B9 = 2 * n2        B8 = ie = 1
;   B7  = mask = 0          B0 = n / 4 - 1          IRP = x
;   AMR = circular addressing (block size field BK0) for A5 and B5

	MVK	.S1	32,	A1		; A1 = 32
||	LMBD	.L1	1,	A4,	A2	; 31 - log2(n)
||	SHR	.S2X	A4,	2,	B6	; n2 = n / 4
||	ZERO	.L2	B7			; mask
||	STW	.D1	A11,	*+A0[3]		; push A11 on stack
||	STW	.D2	B11,	*+B1[4]		; push B11 on stack

; All instructions of an execute packet read their sources before any
; result is written, so the two SHRs below still see A4 = n while the SUB
; replaces A4 with 32 - (31 - log2(n)) = log2(n) + 1.
	SUB	.L1	A1,	A2,	A4	; log2(n)+1 (circ buff size in bytes)
||	SHR	.S1	A4,	1,	A7	; 2 * n2 = n / 2, a-side
||	SHR	.S2X	A4,	1,	B9	; 2 * n2 = n / 2, b-side
||	MV	.L2	B6,	B0		; n / 4
||	STW	.D1	A12,	*+A0[5]		; push A12 on stack
||	STW	.D2	B12,	*+B1[6]		; push B12 on stack

; B4 (x) is parked in IRP because B4 is reused as a scratch register inside
; LOOP; it is read back at the end of every stage.
	SHL	.S1	A4,	16,	A4	; shift into BK0 field
||	MVC	.S2	B4,	IRP		; save off x
||	STW	.D1	A13,	*+A0[7]		; push A13 on stack
||	STW	.D2	B13,	*+B1[8]		; push B13 on stack

; A4 holds (log2(n)+1) << 16.  Adding 0404h sets bit 2 and bit 10, which the
; operand comment identifies as the circular-mode (BK0) selection for A5
; and B5.
	ADDK	.S1	0404h,	A4		; A5, B5 set circular mode on BK0
||	MVK	.S2	1,	B8		; ie = 1
||	STW	.D1	A14,	*+A0[9]		; push A14 on stack
||	STW	.D2	B14,	*+B1[10]	; push B14 on stack

; From here on A5 and B5 address x circularly.  A0 and B1 are not covered
; by the mode bits set above, so the last two pushes are unaffected.
	MVC	.S2X	A4,	AMR		; load AMR
||	STW	.D1	A15,	*+A0[11]	; push A15 on stack
||	STW	.D2	B15,	*+B1[12]	; push B15 on stack
||	SUB	.L2	B0,	1,	B0	; loop counter = n / 4 - 1

K_LOOP:
; Start of one radix-4 stage (one pass of the k loop in the C reference).
; This part resets the pointers, then issues the loads and the first
; add/subtract step of the first butterfly before falling into LOOP.
	MV	.L2	B4,	B5		; reset X load pointer
||	MV	.L1X	B4,	A5		; reset X store pointer
||	ADD	.D2	B0,	1,	B1	; i = loop counter + 1
||	MV	.D1	A6,	A1		; setup twiddle factor pointer

; A5 is moved back by A7 words twice (here and in the next packet).  Inside
; LOOP every iteration advances A5 by A7 words twice (an ADDAW and the
; pre-increment of the STW), presumably so that the first STW lands on the
; start of x.
	ZERO	.S1	A4			; j = 0
||	SUBAW	.D1	A5,	A7,	A5	; setup for first preincrement
||	AND	.S2	B1,	B7,	B1	; j loop twiddle reload test

; B2 = (loop counter + 1) & mask, passed through a multiply by 1.  The
; [!B2] predicates below load the twiddle words and step j only when B2 is
; zero.  B7 (mask) is zero during the first stage, so B2 is zero there.
	SUBAW	.D1	A5,	A7,	A5	; setup for first preincrement
||	MPY	.M2	B1,	1,	B2	; j loop twiddle reload test

; Four word loads walk x through the circular pointer B5 with a stride of
; B6 (= n2) words.  Per the file header each word holds one complex sample
; as two 16-bit halves.
	LDW	.D2	*B5++[B6],	B10	; xi0=xt[0*n2],yi0 = yt[0*n2+1]

; A1 is pre-incremented by A4 words before each twiddle load, so the three
; loads read from w + A4, w + 2*A4 and w + 3*A4 (word offsets).
	LDW	.D2	*B5++[B6],	A8	; xi1=xt[2*n2],yi1 = yt[2*n2+1]
||[!B2]	LDW	.D1	*++A1[A4],	B15	; si1 = w[2 * j], co1 = w[2*j+1]

	LDW	.D2	*B5++[B6],	B11	; xi2=xt[4*n2],yi2 = yt[4*n2+1]
||[!B2]	LDW	.D1	*++A1[A4],	A3	; si2 = w[4*j], co2 = w[4*j+1]

	LDW	.D2	*B5++[B6],	A9	; xi3=xt[6*n2],yi3 = yt[6*n2+1]

; Idle while the loads issued above complete.
	NOP 		2

; The load uses A4 as its offset before the parallel ADD updates it.
  [!B2] LDW	.D1	*++A1[A4],	A13	; si3 = w[6*j], co3 = w[6*j+1]
||[!B2]	ADD	.L1X	A4,	B8,	A4	; j += ie

; SUB2/ADD2 operate on both 16-bit halves of a register at once.
	SUB2	.S2	B10,	B11,	B3	; r2a=xi0 - xi2,s2a = yi0 - yi2
||	ADD	.L2	B0,	0,	B1	;* i = loop counter
||	MV	.L1	A6,	A1		; reset w

	SUB2	.S1	A8,	A9,	A10	; t3=xi1 - xi3,  t1 = yi1 - yi3
||	AND	.S2	B1,	B7,	B1	;* j loop twiddle reload test

; B1 is read here (by the MPY and as the [!B1] predicate) as the masked
; counter from the previous packet and, in the same packet, overwritten with
; r1a/s1a.  B0 becomes n/4 (it was n/4 - 1).  A2 = 0 tells the [A2]-guarded
; instructions in LOOP that no results of a previous iteration exist yet.
; NOTE(review): the conditional -1 word on A5 appears to be cancelled by the
; conditional +1 word in the first packet of LOOP -- confirm.
	ADD2	.S1	A8,	A9,	A8	; t0=xi1 + xi3,  t2 = yi1 + yi3
||	ADD2	.S2	B10,	B11,	B1	; r1a=xi0 + xi2,s1a = yi0 + yi2
||	MPY	.M2	B1,	1,	B2	;* j loop twiddle reload test
||[!B1]	ADDAW	.D2	B5,	1,	B5	;* reset x input, (circular)
||[!B2]	SUBAW	.D1	A5,	1,	A5	;
||	ADD	.L2	B0,	1,	B0	;
||	ZERO	.L1	A2			; first pass cond. init to zero

LOOP:
; Software-pipelined butterfly kernel: 10 execute packets per iteration.
; The comment prefixes appear to mark the pipeline stage of an instruction:
;   (none) : shift/store results of the previous iteration (guarded by A2)
;   *      : arithmetic for the current butterfly
;   **     : loads and first add/subtract step for the next butterfly
;   ***    : loop counter and twiddle-reload test for the one after that
; B0 is n/4 on entry and is tested by the branch before it is decremented,
; so the body runs n/4 + 1 times.  On the last pass [B0] is false: the STW
; of xo0 is skipped and only the results of the previous butterfly are
; written out.
; Do not reorder: several registers (e.g. A9, A11, B1, B10) are read and
; rewritten within a few cycles and rely on the exact instruction latencies.

; Packet 1.  B10 is written by the SHR (t1) and is also the destination of
; the LDW; the loaded value arrives later and is consumed near the end of
; the body.
	SHR	.S1X	B3,	16,	A9	;* extract s2a
||	SHR	.S2X	A10,	16,	B10	;* extract t1
||[!B2]	ADDAW	.D1	A5,	1,	A5	;* reset x output, (circular)
||	LDW	.D2	*B5++[B6],	B10	;** xi0=xt[0*n2], yi0=yt[0*n2+1]
||	MV	.L1	A6,	A1		;** reset w

; Packet 2.
	ADD	.L2	B3,	B10,	B11	;* r1c = r2a + t1 
||	SUB	.L1	A9,	A10,	A12	;* s1c = s2a - t3
||	SUB2	.S2X	B1,	A8,	B1	;* r1b=r1a - t0, s1b = s1a - t2
||	ADD2	.S1X	B1,	A8,	A8	;* xo0=r1a + t0, yo0 = s1a + t2
||	LDW	.D2	*B5++[B6],	A8	;** xi1=xt[2*n2], yi1=yt[2*n2+1]
||[!B2]	LDW	.D1	*++A1[A4],	B15	;** si1 = w[2*j], co1 = w[2*j+1]

; Packet 3.  B3 = A11 + 2 bytes: the B-side store pointer addresses the
; second halfword of the word A11 points to.
  [A2]	ADD	.S2X	A11,	2,	B3	; copy B-side x store pointer
||[A2]	SHR	.S1	A14,	15,	A14	; xo2 = xa2 >> 15
||	SUB	.L2	B3,	B10,	B12	;* r2c = r2a - t1 
||	ADD	.L1	A9,	A10,	A9	;* s2c = s2a + t3
||	MPY	.M1X	A12,	B15,	A10	;* ss1 = s1c * si1
||	MPYLH	.M2	B11,	B15,	B10	;* rc1 = r1c * co1
||	LDW	.D2	*B5++[B6],	B11	;** xi2=xt[4*n2], yi2=yt[4*n2+1]
||[!B2]	LDW	.D1	*++A1[A4],	A3	;** si2 = w[4*j], co2 = w[4*j+1]

; Packet 4.  A5 advances by A7 words; the STW in the next packet
; pre-increments it by another A7 words.
  [A2]	SHR	.S2	B13,	15,	B13	; yo1 = ya1 >> 15
||[A2]	SHR	.S1	A15,	15,	A15	; xo3 = xa3 >> 15
||	MPYLH	.M1X	B1,	A3,	A10	;* rc2 = r1b * co2
||	MPYLH	.M2X	A12,	B15,	B11	;* sc1 = s1c * co1
||	LDW	.D2	*B5++[B6],	A9	;** xi3=xt[6*n2], yi3=yt[6*n2+1]
||	ADDAW	.D1	A5,	A7,	A5	; 

; Packet 5.  The branch is issued here; the five packets that follow form
; the rest of the body and execute before control transfers.  The STW
; writes xo0/yo0 as one word and is suppressed on the final pass (B0 == 0).
  [A2]	SHR	.S2	B14,	15,	B14	; yo2 = ya2 >> 15
||[B0]	B	.S1	LOOP			; for i
||	MPY	.M1	A9,	A13,	A12	;* ss3 = s2c * si3
||	ADD	.L1X	B10,	A10,	A8	;* xa1 = rc1 + ss1
||	MPY	.M2	B11,	B15,	B13	;* rs1 = r1c * si1
||[B0]	STW	.D1	A8,	*++A5[A7]	;* xt[0*n2]=xo0,  yt[0*n2+1]=yo0

; Packet 6.  The STH of A0 stores the xo1 value computed on the previous
; iteration; the SHR in this packet produces the new A0.  A11 steps through
; the first halfwords with stride A7, B3 through the second halfwords with
; stride B9.
  [A2]	STH	.D2	B13,	*B3++[B9]	; yt[2 * n2 + 1] = yo1
||[A2]	SHR	.S2	B4,	15,	B4	; yo3 = ya3 >> 15
||[A2]	STH	.D1	A0,	*A11++[A7]	; xt[2 * n2] = xo1
||	SHR	.S1	A8,	15,	A0	;* xo1 = xa1 >> 15
||	MPYH	.M2X	B1,	A3,	B14	;* sc2 = s1b * co2
||	MPYHL	.M1X	B1,	A3,	A9	;* ss2 = s1b * si2

; Packet 7.  The loop counter B0 is decremented after the branch in
; packet 5 has already tested it.
  [A2]	STH	.D2	B14,	*B3++[B9]	; yt[4 * n2 + 1] = yo2
||	MPYLH	.M1	A9,	A13,	A1	;* sc3 = s2c * co3
||	MPY	.M2X	B1,	A3,	B12	;* rs2 = r1b * si2
||	SUB	.L2	B11,	B13,	B13	;* ya1 = sc1 - rs1
||[!B2]	LDW	.D1	*++A1[A4],	A13	;** si3 = w[6*j], co3 = w[6*j+1]
||[!B2]	ADD	.L1X	A4,	B8,	A4	;** j += ie
||	SUB	.S2	B0,	1,	B0	;*** generate loop counter

; Packet 8.  A11 is both the post-incremented store pointer and the
; destination of the MPYLH (rc3).
  [A2]	STH	.D2	B4,	*B3		; yt[6 * n2 + 1] = yo3
||[A2]	STH	.D1	A14,	*A11++[A7]	; xt[4 * n2] = xo2
||	MPY	.M2X	B12,	A13,	B12	;* rs3 = r2c * si3
||	MPYLH	.M1X	B12,	A13,	A11	;* rc3 = r2c * co3
||	ADD	.L1	A10,	A9,	A14	;* xa2 = rc2 + ss2
||	SUB2	.S2	B10,	B11,	B3	;** r2a = xi0-xi2, s2a = yi0-yi2
||	SUB	.L2	B0,	1,	B1	;*** i = loop counter - 1

; Packet 9.  A2 is set to 1 on the first pass and stays set, enabling the
; [A2]-guarded shifts and stores on all later passes.
  [A2]	STH	.D1	A15,	*A11		; xt[6 * n2] = xo3
||	SUB	.L2	B14,	B12,	B14	;* ya2 = sc2 - rs2
||	SUB2	.S1	A8,	A9,	A10	;** t3=xi1 - xi3,  t1 = yi1-yi3
||	AND	.S2	B1,	B7,	B1	;*** j loop twiddle reload test
||[!A2] ADD	.L1	A2,	1,	A2	; First Pass Done Set Cond. Reg

; Packet 10.  A11 is read as rc3 by the ADD and, in the same packet,
; replaced by the next store pointer (A5 + A7 halfwords).  B1 is likewise
; read as the masked counter and overwritten with r1a/s1a.
	ADDAH	.D1	A5,	A7,	A11	;* copy A-side x store pointer
||	SUB	.L2X	A1,	B12,	B4	;* ya3 = sc3 - rs3
||	ADD	.L1	A11,	A12,	A15	;* xa3 = rc3 + ss3
||	ADD2	.S1	A8,	A9,	A8	;** t0=xi1 + xi3,  t2 = yi1+yi3
||	ADD2	.S2	B10,	B11,	B1	;** r1a = xi0+xi2, s1a = yi0+yi2
||	MPY	.M2	B1,	1,	B2	;*** j loop twiddle reload test
||[!B1]	ADDAW	.D2	B5,	1,	B5	;*** reset x input, (circular)

; LOOP ends here

	; End of one stage: update the per-stage parameters and decide whether
	; another stage is needed.
	; B0 is recomputed from n2 * ie using the values of B6 and B8 from the
	; stage just finished (both are rescaled further down).
	SHL	.S2	B7,	2,	B7	; mask   <<= 2
||	MPY	.M2	B6,	B8,	B0	; n/4 = n2 * ie

	; Together with the SHL above: mask = 4 * mask + 3 (0, 3, 15, ...).
	SHR	.S1	A7,	2,	A7	; 2 * n2 >>= 2
||	SHR	.S2	B9,	2,	B9	; 2 * n2 >>= 2
||	ADD	.L2	B7,	3,	B7	; mask    += 3

	SHR	.S2	B6,	2,	B6	; n2     >>= 2
||	SUB	.L2	B0,	1,	B0	; loop counter = n/4 - 1

	; The outer loop continues while mask <= n/4 - 1.
	CMPGT	.L2	B7,	B0,	B1	; kcond    = mask > n / 4 - 1

  [!B1]	B	.S1	K_LOOP			; if (!kcond) do loop
||	SHL	.S2	B8,	2,	B8	; ie     <<= 2

	; B4 was used as a scratch register (ya3) inside LOOP, so x is read
	; back from IRP.  This MVC and the NOP 4 occupy the branch delay slots.
	MVC	.S2	IRP,	B4		; reload x

	NOP	4

; K_LOOP ends here

B_END:
*** END Benchmark Timing ***
; Epilogue: restore the saved registers from the static save area, write
; zero back to AMR and return through B3.


; A0 and B0 were used as scratch / loop counter inside the loops, so the
; address of the save area is rebuilt in both.
	MVK	.S1	stack,	A0		; new stack pointer in A0 and B0
||	MVK	.S2	stack,	B0		; new stack pointer in A0 and B0

	MVKH	.S1	stack,	A0		; new stack pointer in A0 and B0
||	MVKH	.S2	stack,	B0		; new stack pointer in A0 and B0

; B2 = 0 is the value written to AMR in the next packet.
	LDW	.D2	*B0,	B3		; pop return address off stack
||	ZERO	.L2	B2

	LDW	.D1	*+A0[1],	A10	; pop A10 off stack
||	LDW	.D2	*+B0[2],	B10	; pop B10 off stack
||	MVC	.S2	B2,	AMR		; reset AMR

	LDW	.D1	*+A0[3],	A11	; pop A11 off stack
||	LDW	.D2	*+B0[4],	B11	; pop B11 off stack

	LDW	.D1	*+A0[5],	A12	; pop A12 off stack
||	LDW	.D2	*+B0[6],	B12	; pop B12 off stack

	LDW	.D1	*+A0[7],	A13	; pop A13 off stack
||	LDW	.D2	*+B0[8],	B13	; pop B13 off stack

; The load of B3 was issued five packets before this one, so the return
; address is available to the branch.  The final pair of pops and the NOP 4
; execute in the branch delay slots.
	LDW	.D1	*+A0[9],	A14	; pop A14 off stack
||	LDW	.D2	*+B0[10],	B14	; pop B14 off stack
||	B	.S2	B3

	LDW	.D1	*+A0[11],	A15	; pop A15 off stack
||	LDW	.D2	*+B0[12],	B15	; pop B15 off stack

	NOP		4			; wait 4 cycles for the last pop 
						; to occur before returning
