*==============================================================================*
*     TEXAS INSTRUMENTS, INC.                                                  *
*                                                                              *
*     NAME                                                                     *
*     Complex Radix4 FFT (Inplace FFT w/ special sequence of coeffs)           *
*                                                                              *
*     REVISION HISTORY                                                         *
*                                                                              *
*    26 Apr 96      Original version                                           *
*    19 Apr 99      Last Modified                                              *
*                                                                              *
*     USAGE                                                                    *
*           This routine is C-callable and can be called as:                   *
*                                                                              *
*           void radix4_h(int n, short ptr_x[], short ptr_w[]);                *
*                                                                              *
*           n = length of fft in complex samples                               *
*           ptr_x = pointer to complex data input                              *
*           ptr_w = pointer to complex twiddle factors                         *
*                                                                              *
*           (See the C compiler reference guide.)                              *
*                                                                              *
*     DESCRIPTION                                                              *
*           The benchmark performs a radix 4 fft inplace using                 *
*           a special sequence of coefficients generated in the following      *
*           way:                                                               *
*                                                                              *
*            int i,j,k,offset;                                                 *
*            double theta1, theta2, theta3, x_t, y_t;                          *
*            const double M = 32768.0;                                         *
*            for (j=1, k=0; j <= n>>2; j = j<<2)                               *
*            {                                                                 *
*              for (i=0; i < n>>2; i+=j)                                       *
*              {                                                               *
*                theta1 = 2*PI*i/n;                                            *
*                x_t = M*cos(theta1);                                          *
*                y_t = M*sin(theta1);                                          *
*                w[k]   = (short) x_t;                                         *
*                if (x_t >= M) w[k  ] = 0x7fff;                                *
*                w[k+1] = (short) y_t;                                         *
*                if (y_t >= M) w[k+1] = 0x7fff;                                *
*                                                                              *
*                theta2 = 4*PI*i/n;                                            *
*                x_t = M*cos(theta2);                                          *
*                y_t = M*sin(theta2);                                          *
*                w[k+2] = (short) x_t;                                         *
*                if (x_t >= M) w[k+2] = 0x7fff;                                *
*                w[k+3] = (short) y_t;                                         *
*                if (y_t >= M) w[k+3] = 0x7fff;                                *
*                                                                              *
*                theta3 = 6*PI*i/n;                                            *
*                x_t = M*cos(theta3);                                          *
*                y_t = M*sin(theta3);                                          *
*                w[k+4] = (short) x_t;                                         *
*                if (x_t >= M) w[k+4] = 0x7fff;                                *
*                w[k+5] = (short) y_t;                                         *
*                if (y_t >= M) w[k+5] = 0x7fff;                                *
*                k+=6;                                                         *
*              }                                                               *
*            }                                                                 *
*                                                                              *
*     ASSUMPTIONS:                                                             *
*          n must be a power of 4 and greater than or equal to 4               *
*          and less than 32536                                                 *
*          FFT data x are aligned on a word boundary, in real/imag pairs       *
*          FFT twiddle factors w are aligned on a word boundary in             *
*          real/imaginary pairs.                                               *
*          Input FFT coeffs  are in signed 1.15 format word aligned            *
*          Configuration is LITTLE ENDIAN                                      *
*          The complex data will be returned in digit reversed order           *
*          This code is uninterruptible; interrupts are disabled               *
*          on entry to the function and it is interrupt tolerant               *
*                                                                              *
*     MEMORY NOTE:                                                             *
*          No bank hits occur in this code                                     *
*                                                                              *
*     TECHNIQUES                                                               *
*          A special sequence of coeffs. used as generated above               *
*          produces the fft. This collapses the inner 2 loops in the           *
*          traditional Burrus and Parks implementation Fortran Code            *
*     CYCLES                                                                   *
*          cycles = 10 * log4(N) * (0.25 * N + 3) + 22                         *
*          e.g. N = 1024,  cycles = 12972                                      *
*          e.g. N = 256,   cycles = 2702                                       *
*          e.g. N = 64,    cycles = 592                                        *
*          e.g. N = 16,    cycles = 162                                        *
*     NOTES                                                                    *
*                                                                              *
*     CODESIZE                                                                 *
*          768 bytes                                                           *
*                                                                              *
*     C CODE                                                                   *
*           This is the C equivalent of the assembly code without restrictions:*
*           Note that the assembly code is hand optimized and restrictions may *
*           apply.                                                             *
*                                                                              *
*      void radix4_h(int n, short ptr_x[], short ptr_w[])                     *
*      {                                                                       *
*          int  i, j, l1, l2, h2, predj;                                       *
*          int  l1p1,l2p1,h2p1, tw_offset, stride, fft_jmp;                    *
*          short xt0, yt0, xt1, yt1, xt2, yt2;                                 *
*          short si10,si20,si30,co10,co20,co30;                                *
*          short xh0,xh1,xh20,xh21,xl0,xl1,xl20,xl21;                          *
*          short x_0, x_1, x_l1, x_l1p1, x_h2 , x_h2p1, x_l2, x_l2p1;          *
*          short *x,*w,*x2,*x0;                                                *
*          short * tmp[1];                                                     *
*                                                                              *
*          stride = n; /* n is the number of complex samples */                *
*          tw_offset = 0;                                                      *
*          while (stride > 1)                                                  *
*          {                                                                   *
*              j = 0;                                                          *
*              fft_jmp = stride + (stride>>1);                                 *
*              h2 = stride>>1;                                                 *
*              l1 = stride;                                                    *
*              l2 = stride + (stride>>1);                                      *
*              x = ptr_x;                                                      *
*              w = ptr_w + tw_offset;                                          *
*                                                                              *
*              for (i = 0; i < n>>1; i += 2)                                   *
*              {                                                               *
*                  si10 = w[j]; j++;                                           *
*                  co10 = w[j]; j++;                                           *
*                  si20 = w[j]; j++;                                           *
*                  co20 = w[j]; j++;                                           *
*                  si30 = w[j]; j++;                                           *
*                  co30 = w[j]; j++;                                           *
*                                                                              *
*                  x_0    = x[0];                                              *
*                  x_1    = x[1];                                              *
*                  x_l1   = x[l1];                                             *
*                  x_l1p1 = x[l1+1];                                           *
*                  x_h2   = x[h2];                                             *
*                  x_h2p1 = x[h2+1];                                           *
*                  x_l2   = x[l2];                                             *
*                  x_l2p1 = x[l2+1];                                           *
*                                                                              *
*                  xh0  = x_0 + x_l1; xh1  = x_1 + x_l1p1;                     *
*                  xl0  = x_0 - x_l1; xl1  = x_1 - x_l1p1;                     *
*                  xh20 = x_h2 + x_l2; xh21 = x_h2p1 + x_l2p1;                 *
*                  xl20 = x_h2 - x_l2; xl21 = x_h2p1 - x_l2p1;                 *
*                                                                              *
*                  x0 = x;                                                     *
*                  x0[0] = xh0 + xh20;  x0[1] = xh1 + xh21;                    *
*                                                                              *
*                  tmp[0] = x;                                                 *
*                  x2 = tmp[0];                                                *
*                                                                              *
*                  x += 2;                                                     *
*                  predj = (j - fft_jmp);                                      *
*                  if (!predj) x += fft_jmp;                                   *
*                  if (!predj) j = 0;                                          *
*                                                                              *
*                  xt0  = xh0 - xh20;  yt0  = xh1 - xh21;                      *
*                  xt1  = xl0 + xl21;  yt2  = xl1 + xl20;                      *
*                  xt2  = xl0 - xl21;  yt1  = xl1 - xl20;                      *
*                                                                              *
*                  x2[l1] = (si20*yt0+co20*xt0)>>15;                           *
*                  x2[h2] = (si10*yt1+co10*xt1)>>15;                           *
*                  x2[l2] = (si30*yt2+co30*xt2)>>15;                           *
*                                                                              *
*                  l1p1 = l1+1;                                                *
*                  h2p1 = h2+1;                                                *
*                  l2p1 = l2+1;                                                *
*                                                                              *
*                  x2[l1p1] = (co20*yt0-si20*xt0)>>15;                         *
*                  x2[h2p1] = (co10*yt1-si10*xt1)>>15;                         *
*                  x2[l2p1] = (co30*yt2-si30*xt2)>>15;                         *
*              }                                                               *
*              tw_offset += fft_jmp;                                           *
*              stride = stride>>2;                                             *
*          }/* end while */                                                    *
*      }                                                                       *
*                                                                              *
*==============================================================================*
*      Copyright (C) 1997-1999 Texas Instruments Incorporated.                 *
*                      All Rights Reserved                                     *
*==============================================================================*
*====================== SYMBOLIC REGISTER ASSIGNMENTS =========================*
        ; Substitution symbols: each .asg binds a descriptive name used by the
        ; code below to one physical register.  Many registers carry several
        ; names (for example B15 is both B_SP and B_fft_jmp, A4 is A_n, A_h2,
        ; A_xl20xl21, A_p6, A_p8 and A_pa); names that share a register must
        ; never be live at the same point in the schedule.
        ;
        ; Incoming arguments: A_n (A4), B_ptr_x (B4), A_ptr_w (A6).
        ; A_SP (A15) holds a copy of the frame address because B15 is reused
        ; as B_fft_jmp inside the loops.
        .asg            B15,        B_SP                        ; Preferred
        .asg            A4,         A_n                         ; Preferred
        .asg            B4,         B_ptr_x                     ; Preferred
        .asg            A6,         A_ptr_w                     ; Preferred
        .asg            B2,         B_pro                       ; Preferred
        .asg            A15,        A_SP                        ; Preferred
        .asg            B4,         B_x                         ; Preferred
        .asg            B14,        B_w0                        ; Preferred
        .asg            B13,        B_stride_2                  ; Preferred
        .asg            A14,        A_fft_jmp                   ; Preferred
        .asg            B15,        B_fft_jmp                   ; Preferred
        .asg            A2,         A_i                         ; Preferred
        .asg            A13,        A_j                         ; Preferred
        .asg            B6,         B_j                         ; Preferred
        ; Outer-loop (per stage) state.
        .asg            A1,         A_tw_offset
        .asg            B1,         B_stride
        .asg            A5,         A_stride_1
        .asg            A3,         A_w0
        ; The next eight assignments repeat earlier ones with the same
        ; register, so they do not change any binding.
        .asg            A2,         A_i
        .asg            A13,        A_j
        .asg            B6,         B_j
        .asg            B13,        B_stride_2
        .asg            B4,         B_x
        .asg            B14,        B_w0
        .asg            A14,        A_fft_jmp
        .asg            B15,        B_fft_jmp
        ; Load indices and packed input / butterfly intermediates.
        .asg            B10,         B_l1w
        .asg            B8,         B_l2w
        .asg            B3,         B_x1x0
        .asg            A7,         A_xh2p1xh2
        .asg            B7,         B_xl1p1xl1
        .asg            A10,        A_xl2p1xl2
        .asg            B8,         B_xh1xh0
        .asg            B0,         B_xl1xl0
        .asg            A6,         A_xh21xh20
        .asg            A5,         A_xl21xl20
        .asg            B3,         B_y1y0
        ; Data and twiddle pointers, packed twiddle pairs, wrap predicate.
        .asg            B11,        B_x0
        .asg            A12,        A_x1
        .asg            B10,        B_x1
        .asg            A3,         A_w1
        .asg            B8,         B_w2
        .asg            A11,        A_si10co10
        .asg            B12,        B_si20co20
        .asg            A7,         A_si30co30
        .asg            A1,         A_predj
        .asg            B5,         B_yt0xt0
        .asg            A4,         A_xl20xl21
        .asg            A3,         A_l21
        .asg            A9,         A_yt2xt1
        .asg            B0,         B_yt1xt2
        .asg            A8,         A_yt1xt2
        ; Halfword store indices for the three twiddled outputs.
        .asg            A4,         A_h2
        .asg            B9,         B_l1
        .asg            B6,         B_l2
        .asg            A3,         A_h2p1
        .asg            B3,         B_l1p1
        .asg            A8,         A_l2p1
        ; Products, their sums/differences, and the >>15 results.
        .asg            A9,         A_p4
        .asg            A3,         A_p5
        .asg            A3,         A_p40
        .asg            A0,         A_p41
        .asg            A4,         A_p6
        .asg            A0,         A_p7
        .asg            A5,         A_p60
        .asg            A5,         A_p61
        .asg            B1,         B_p0
        .asg            B5,         B_p1
        .asg            B0,         B_p00
        .asg            B5,         B_p01
        .asg            B3,         B_p2
        .asg            B1,         B_p3
        .asg            B7,         B_p20
        .asg            B5,         B_p21
        .asg            A4,         A_p8
        .asg            B3,         B_p9
        .asg            B0,         B_p80
        .asg            B0,         B_p81
        .asg            A4,         A_pa
        .asg            A5,         A_pb
        .asg            A6,         A_pa0
        .asg            A6,         A_pa1
        ; CSR handling and the outer-loop condition.  B_csr shares B5 with
        ; butterfly temporaries and B_while shares B2 with B_pro.
        .asg            B5,         B_csr
        .asg            B6,         B_no_gie
        .asg            B2,         B_while
*==============================================================================*
*      Copyright (C) 1997-1999 Texas Instruments Incorporated.                 *
*                            All Rights Reserved                               *
*==============================================================================*
        .global     _radix4_h

*------------------------------------------------------------------------------*
* _radix4_h: entry and context save.
*   In:  A_n (A4) = n, B_ptr_x (B4) = ptr_x, A_ptr_w (A6) = ptr_w.
*   Opens a 20-word frame with the pre-decrement store, copies the frame
*   address into A_SP (A15), and writes CSR back with bit 0 cleared so that
*   interrupts are disabled for the rest of the routine.
*   Frame layout (word index from the new stack pointer):
*     [0] A15   [1] A14   [2] A13   [3] A12   [4] A11   [5] A10
*     [6] B14   [7] B13   [8] B12   [9] B11  [10] B10  [11] B3
*    [12] n    [13] ptr_x [14] ptr_w [15] stride (initially n)
*    [16] tw_offset (initially 0)    [17] original CSR
*   On fall-through into LOOP_WHILE, B_stride = n and A_tw_offset = 0.
*------------------------------------------------------------------------------*
_radix4_h:
        STW     .D2T1 A15,        *--B_SP[20]  ;[13,0] 
||      MVC     .S2   CSR,        B_csr        ; Get current GIE bit

        STW     .D2T1 A10,        *B_SP[5]     ;[ 4,0] 
||      ZERO    .L1   A_tw_offset              ;tw_offset=0
||      MV      .S1X  B_SP,       A_SP         ;[ 3,0] 
||      AND     .L2   B_csr, -2,  B_no_gie     ; Clear GIE

        STW     .D2T1 A11,        *B_SP[4]     ;[ 5,0] 
||      STW     .D1T2 B_csr,      *A_SP[17]    ;save original CSR

        STW     .D2T2 B10,        *B_SP[10]    ;[ 6,0] 
||      STW     .D1T1 A12,        *A_SP[3]     ;[ 6,0] 
||      MVC     .S2   B_no_gie,   CSR          ; Disable interrupts

        STW     .D2T2 B11,        *B_SP[9]     ;[ 7,0] 
||      STW     .D1T1 A13,        *A_SP[2]     ;[ 7,0] 

        STW     .D2T2 B12,        *B_SP[8]     ;[ 8,0] 
||      STW     .D1T1 A14,        *A_SP[1]     ;[ 8,0] 
||      MV      .S2X  A_n,        B_stride     ;[15,0] 

        STW     .D2T2 B13,        *B_SP[7]     ;[ 9,0] 
||      STW     .D1T1 A_ptr_w,    *A_SP[14]    ;[ 9,0] 

        STW     .D2T2 B3,         *B_SP[11]    ;[12,0] 
||      STW     .D1T1 A_n,        *A_SP[12]    ;[10,0] 

        STW     .D2T2 B_ptr_x,    *B_SP[13]    ;[11,0] 
||      STW     .D1T1 A_tw_offset,*A_SP[16]    ;[11,0] 

        STW     .D2T2 B14,        *B_SP[6]     ;[10,0] 
||      STW     .D1T1 A_n,        *A_SP[15]    ;[12,0] stride=n
*============================= PIPE LOOP PROLOG ===============================*
LOOP_WHILE:
* Per-stage setup interleaved with the pipeline fill for LOOP_FOR.
*   B_stride_2 = stride >> 2; also written to frame slot 15, which is where
*                the epilog reloads B_stride from for the next pass.
*   A_fft_jmp  = stride + (stride >> 1); it is later halved and reduced by 3
*                before the kernel compares it against A_j.
*   B_fft_jmp  = (stride + (stride >> 1)) << 1.  This overwrites B15, so
*                B_SP must not be used as a stack pointer until the epilog
*                copies A_SP back into it.
*   A_w0/B_w0  = ptr_w + tw_offset (halfword-scaled by ADDAH); the updated
*                tw_offset (+= fft_jmp) is saved to frame slot 16.
*   B_l1w / B_l2w = 2 * B_stride_2 and 3 * B_stride_2, used as LDW indices.
*   A_i = n >> 1, B_pro = 1, A_j = B_pro + 2, B_j = 0.
* The instruction order here is the schedule: results of MPY and LDW are
* consumed a fixed number of packets after issue, so packets must not be
* reordered or split.
        SHR     .S1X  B_stride,   1,        A_stride_1 ;[20,0] 
||      SHRU    .S2   B_stride,   2,        B_stride_2 ;[20,0] 

        ADD     .L1X  A_stride_1, B_stride,  A_fft_jmp ;fft_jmp=1.5*stride
||      STW     .D1T2 B_stride_2, *A_SP[15]            ;[21,0] 

        ADDAH   .D1   A_ptr_w,    A_tw_offset, A_w0    ;w = ptr_w + tw_offset
||      ADD     .L1   A_tw_offset,A_fft_jmp,A_tw_offset;tw_offset+=fft_jmp
||      MPY     .M2   3,          B_stride_2, B_l2w    ;[ 2,1] 
||      SHL     .S2X  A_fft_jmp,  1,         B_fft_jmp ;[23,0] 

        STW     .D1T1 A_tw_offset,           *A_SP[16] ;[23,0] 
||      SHRU    .S1   A_fft_jmp,  1,         A_fft_jmp ;[23,0] 
||      MPY     .M2   2,          B_stride_2, B_l1w    ;[ 1,1] 

        SUB     .D1   A_fft_jmp,  3,         A_fft_jmp ;[24,0] 
||      MV      .L2X  A_w0,       B_w0                 ;[24,0] 
||      LDW     .D2T1 *B_x[B_l2w],         A_xl2p1xl2  ;[ 5,1] x[l2] (3N/4)

        LDW     .D2T2 *B_x[B_l1w],         B_xl1p1xl1  ;[ 6,1] x[l1] (N/2)

        SHRU    .S1   A_n,        1,          A_i      ;[22,0] n>>1

        SUB     .L1   A_fft_jmp,  0,        A_predj    ;predj = (j - fft_jmp)
||      LDW     .D2T1 *B_x[B_stride_2],    A_xh2p1xh2  ;[ 8,1] x[h2] (N/4)

* B_pro = 1 makes the kernel skip its [!B_pro] stores until B_pro is zeroed.
        MVK     .S2   1,          B_pro                ;[24,0] prologue start

        LDW     .D2T2 *B_x,       B_x1x0               ;[10,1] x[0] (0)

        MPY     .M2   2,          B_stride_2, B_l1w    ;[ 1,2] 

        ADD     .L1X  B_w0,       4,          A_w1     ;[12,1] j += 1
||      ADD     .L2   B_w0,       8,          B_w2     ;[12,1] j += 1
||      MPY     .M2   3,          B_stride_2, B_l2w    ;[ 2,2] 

        SUB2    .S1   A_xh2p1xh2,A_xl2p1xl2,A_xl21xl20 ;xl20=x[h2] -x[l2]
||      MV      .L2   B_x,        B_x0                 ;[13,1] 
||[!A_predj]ADD .S2   B_x,        B_fft_jmp,  B_x      ;if(!predj)x +=fft_jmp
||      LDW     .D2T1 *B_w2[0], A_si30co30             ;si30=w[j+5]co30=w[j+4]

        ADD     .L2   B_x,        4,          B_x      ;[14,1] x +=2;
||      LDW     .D1T2 *A_w1[0], B_si20co20             ;si20=w[j+3]co20=w[j+2]
||      LDW     .D2T1 *B_w0[0], A_si10co10             ;si10=w[j+1]co10=w[j  ]

* A_j starts at B_pro + 2 = 3; the kernel then advances it by 3 per pass.
        ADD2    .S2   B_x1x0,     B_xl1p1xl1, B_xh1xh0 ;[15,1] xh0=x[0]+x[l1]
||      SHRU    .S1   A_xl21xl20, 16,         A_l21    ;[15,1] 
||      ADD     .L1X  B_pro,        2,          A_j    ;[15,1] j += 1
||      LDW     .D2T1 *B_x[B_l2w],         A_xl2p1xl2  ;[ 5,2] x[l2] (3N/4)

        SHL     .S1   A_xl21xl20, 16,      A_xl20xl21  ;[16,1] 
||[!A_predj]ZERO.D1   A_j                              ;[16,1]if (!predj) j=0
||      LDW     .D2T2 *B_x[B_l1w],         B_xl1p1xl1  ;[ 6,2] x[l1] (N/2)
||      ZERO    .S2   B_j                              ;[24,0] j = 0
*============================= PIPE LOOP KERNEL ===============================*
LOOP_FOR:
* Software-pipelined kernel: ten execute packets per pass.  The [A_i] branch
* sits in the fifth packet and takes effect after the tenth (see the
* "BRANCH OCCURS HERE" marker), so the five packets after it always run.
* The [a,b] tags in the comments appear to be schedule-slot / iteration
* markers: each packet mixes work belonging to different butterflies.
*   - Every STH of a result is predicated on !B_pro.  B_pro is zeroed inside
*     the kernel, so the stores are skipped only while B_pro still holds the
*     1 written by the prolog.
*   - A_i is decremented by 2 while it is nonzero and also gates the branch.
*   - A_j advances by 3 per pass and is cleared when A_predj
*     (A_fft_jmp - A_j) is zero; under the same predicate B_fft_jmp is added
*     to B_x in addition to the unconditional +4.
*   - B_l1 / B_l2 / A_h2 are 4x, 6x and 2x B_stride_2, the halfword indices
*     used by the STH stores through B_x1 / A_x1.
* Packets must stay exactly as scheduled; operands are consumed a fixed
* number of cycles after the producing MPY/LDW is issued.
  [!B_pro]STH   .D2T2 B_p81,     *B_x1[B_l2]           ;[27,1]x[l2]
||      MPY     .M2   B_si20co20, B_yt0xt0,   B_p1     ;[27,1] 
||[!B_pro]STH   .D1T1 A_pa1,     *A_x1[A_l2p1]         ;[27,1]x[l2+1]
||      SUB2    .S2   B_x1x0,     B_xl1p1xl1, B_xl1xl0 ;xl0=x[0]-x[l1]
||      ADD2    .S1   A_xh2p1xh2, A_xl2p1xl2, A_xh21xh20;xh20=x[h2]+x[l2]
||      ADD     .L1   A_l21,      A_xl20xl21, A_xl20xl21;[17,2] 
||      MV      .L2X  A_j,        B_j                  ;[17,2] 

        MPY     .M1X  2,          B_stride_2, A_h2     ;[28,1] 
||      ADD     .D1   A_p4,       A_p5,       A_p40    ;(si10*yt1+co10*xt1)
||      SHR     .S1   A_p60,      15,         A_p61    ;[28,1] y[h2p1]>>=15
||      SUB     .L2   B_p2,       B_p3,       B_p20    ;(co20*yt0-si20*xt0)
||      MPYH    .M2   B_si20co20, B_yt0xt0,   B_p0     ;[28,1] 
||      ADD2    .S2X  B_xh1xh0,   A_xh21xh20, B_y1y0   ;[18,2]y[0]=xh0+xh20
||      SUB     .L1   A_fft_jmp,  A_j,        A_predj  ;predj=(j - fft_jmp)
||      LDW     .D2T1 *B_x[B_stride_2],    A_xh2p1xh2  ;[ 8,3] x[h2] (N/4)

  [ A_i]SUB     .L1   A_i,        2,          A_i      ;[29,1] 
||      ADD     .L2   B_l1,       1,          B_l1p1   ;[29,1] 
||      ADD2    .S1X  A_xl20xl21, B_xl1xl0,   A_yt2xt1 ;xt1=xl0+xl21yt2=xl1+xl20
||      SUB2    .S2X  B_xl1xl0,   A_xl20xl21, B_yt1xt2 ;xt2=xl0-xl21yt1=xl1-xl20
||      STW     .D2T2 B_y1y0,     *B_x0                ;[19,2] 

        ADD     .D1   A_h2,       1,          A_h2p1   ;[30,1] 
||      SHR     .S1   A_p40,      15,         A_p41    ;[30,1] y[h2]>>=15
||      SHR     .S2   B_p20,      15,         B_p21    ;[30,1] y[l1p1]>>=15
||      ADD     .L2   B_p0,       B_p1,       B_p00    ;(si20*yt0+co20*xt0)
||      MPY     .M2X  B_yt1xt2,   A_si30co30, B_p9     ;[20,2] 
||      MPYLH   .M1   A_si30co30, A_yt2xt1,   A_pa     ;[20,2] 
||      MV      .L1X  B_yt1xt2,   A_yt1xt2             ;[20,2] 
||      LDW     .D2T2 *B_x,       B_x1x0               ;[10,3] x[0]  (0)

  [ A_i]B       .S1   LOOP_FOR                         ;[31,1] }
||[!B_pro]STH   .D1T1 A_p41,      *A_x1[A_h2]          ;[31,1] x[h2]
||[!B_pro]STH   .D2T2 B_p21,      *B_x1[B_l1p1]        ;[31,1] x[l1+1]
||      SHR     .S2   B_p00,      15,         B_p01    ;[31,1] y[l1]>>=15
||      MPYHL   .M1   A_si30co30, A_yt1xt2,   A_pb     ;[21,2] 
||      MPY     .M2   2,          B_stride_2, B_l1w    ;[ 1,4] 

  [!B_pro]STH   .D1T1 A_p61,      *A_x1[A_h2p1]        ;[32,1] x[h2+1]
||[!B_pro]STH   .D2T2 B_p01,      *B_x1[B_l1]          ;[32,1] x[l1]
||      MPYH    .M1   A_yt2xt1,   A_si30co30, A_p8     ;[22,2] 
||      SUB2    .S2X  B_xh1xh0,   A_xh21xh20, B_yt0xt0 ;xt0=xh0-xh20yt0=xh1-xh21
||      ADD     .L1X  B_w0,       4,          A_w1     ;[12,3] j += 1
||      ADD     .L2   B_w0,       8,          B_w2     ;[12,3] j += 1
||      MPY     .M2   3,          B_stride_2, B_l2w    ;[ 2,4] 

        MPY     .M2   6,          B_stride_2, B_l2     ;[23,2] 
||      MPYLH   .M1   A_si10co10, A_yt1xt2,   A_p6     ;[23,2] 
||      MV      .L1X  B_x0,       A_x1                 ;[23,2] 
||      SUB     .D1   A_pa,       A_pb,       A_pa0    ;(co30*yt2-si30*xt2)
||      SUB2    .S1   A_xh2p1xh2,A_xl2p1xl2,A_xl21xl20 ;xl20=x[h2]  -x[l2]
||      MV      .L2   B_x,        B_x0                 ;[13,3] 
||[!A_predj]ADD .S2   B_x,        B_fft_jmp,  B_x      ;if(!predj)x +=fft_jmp
||      LDW     .D2T1 *B_w2[B_j], A_si30co30           ;si30=w[j+5]co30=w[j+4]

        MPYHL   .M1   A_si10co10, A_yt2xt1,   A_p7     ;[24,2] 
||      ADD     .S2X  B_p9,       A_p8,       B_p80    ;(si30*yt2+co30*xt2)
||      MPY     .M2   4,          B_stride_2, B_l1     ;[24,2] 
||      SHR     .S1   A_pa0,      15,         A_pa1    ;[24,2] y[l2p1]>>=15
||      ADD     .L2   B_x,        4,          B_x      ;[14,3] x +=2;
||      LDW     .D1T2 *A_w1[A_j], B_si20co20           ;si20=w[j+3]co20=w[j+2]
||      LDW     .D2T1 *B_w0[B_j], A_si10co10           ;si10=w[j+1]co10=w[j]

* Clearing B_pro here enables the [!B_pro] stores from the next pass onward.
        ZERO    .L2   B_pro                            ;[35,1] 
||      MPYH    .M1   A_si10co10, A_yt1xt2,   A_p4     ;[25,2] 
||      ADD     .L1X  B_l2,       1,          A_l2p1   ;[25,2] 
||      MPYHL   .M2   B_si20co20, B_yt0xt0,   B_p3     ;[25,2] 
||      ADD2    .S2   B_x1x0,   B_xl1p1xl1, B_xh1xh0   ;xh0=x[0]+x[l1]
||      SHRU    .S1   A_xl21xl20, 16,         A_l21    ;[15,3] 
||      ADD     .D1   A_j,        3,          A_j      ;[15,3] j += 1
||      LDW     .D2T1 *B_x[B_l2w],       A_xl2p1xl2    ;[ 5,4] x[l2] (3N/4)

        MPY     .M1   A_si10co10, A_yt2xt1,   A_p5     ;[26,2] 
||      SUB     .L1   A_p6,       A_p7,       A_p60    ;(co10*yt1-si10*xt1)
||      MV      .L2X  A_x1,       B_x1                 ;[26,2] 
||      SHR     .S2   B_p80,      15,         B_p81    ;[26,2] y[l2]>>=15
||      MPYLH   .M2   B_si20co20, B_yt0xt0,   B_p2     ;[26,2] 
||      SHL     .S1   A_xl21xl20, 16,   A_xl20xl21     ;[16,3] 
||[!A_predj]ZERO.D1   A_j                              ;if (!predj) j = 0
||      LDW     .D2T2 *B_x[B_l1w], B_xl1p1xl1          ;[6,4]x[l1](N/2)
        ; BRANCH OCCURS HERE
*============================= PIPE LOOP EPILOG ===============================*
        ; Pipeline drain: finishes the arithmetic still in flight when the
        ; kernel exits and issues the six STH stores of the last butterfly
        ; unconditionally (no B_pro predicate here).  B_stride is reloaded
        ; from frame slot 15, i.e. the stride >> 2 stored by the prolog.
        STH     .D2T2 B_p81,    *B_x1[B_l2]    ;[27,4]x[l2]
||      MPY     .M2   B_si20co20,B_yt0xt0,B_p1 ;[27,4] 

        MPY     .M1X  2,       B_stride_2,A_h2 ;[28,4] 
||      ADD     .L1   A_p4,    A_p5,   A_p40   ;y[h2]=(si10*yt1+co10*xt1)
||      SHR     .S1   A_p60,   15,     A_p61   ;[28,4]y[h2p1]>>=15
||      SUB     .L2   B_p2,    B_p3,   B_p20   ;[28,4](co20*yt0-si20*xt0)
||      MPYH    .M2   B_si20co20,B_yt0xt0,B_p0 ;[28,4] 
||      LDW     .D1T2 *A_SP[15], B_stride

        ADD     .L2   B_l1,      1,    B_l1p1  ;[29,4] 
||      STH     .D1T1 A_pa1,    *A_x1[A_l2p1]  ;[27,4]x[l2+1]

        ADD     .D1   A_h2,      1,    A_h2p1  ;[30,4] 
||      SHR     .S1   A_p40,     15,   A_p41   ;[30,4] y[h2]>>=15
||      SHR     .S2   B_p20,     15,   B_p21   ;[30,4] y[l1p1]>>=15
||      ADD     .L2   B_p0,      B_p1, B_p00   ;y[l1]=(si20*yt0+co20*xt0)

        STH     .D1T1 A_p41,    *A_x1[A_h2]    ;[31,4] x[h2]
||      STH     .D2T2 B_p21,    *B_x1[B_l1p1]  ;[31,4] x[l1+1]
||      SHR     .S2   B_p00,     15,      B_p01;[31,4] y[l1]>>=15

        STH     .D1T1 A_p61,    *A_x1[A_h2p1]  ;[32,4] x[h2+1]
||      STH     .D2T2 B_p01,    *B_x1[B_l1]    ;[32,4] x[l1]

* Outer-loop test: branch back to LOOP_WHILE while the reloaded stride is
* greater than 1 (unsigned).  The MV in the branch packet puts the frame
* address back into B15, which held B_fft_jmp during the stage.
* The five packets that follow run whether or not the branch is taken.  The
* loads marked @@@@@ refresh B_stride, A_tw_offset, A_ptr_w, A_n and B_x from
* the frame for the next stage; the other loads start restoring saved
* registers (A13, B3, B11, A11) for the exit path.
        CMPGTU  .L2   B_stride,1,B_while       ;
 [B_while]B     .S1   LOOP_WHILE               ;
||      MV      .S2X  A_SP, B_SP

        LDW     .D1T2 *A_SP[15], B_stride      ;@@@@@
||      LDW     .D2T1 *B_SP[2],  A13           ;

        LDW     .D1T1 *A_SP[16], A_tw_offset   ;@@@@@
||      LDW     .D2T2 *B_SP[11], B3            ;return original stack state

        LDW     .D1T1 *A_SP[14], A_ptr_w       ;@@@@@
||      LDW     .D2T2 *B_SP[9],B11             ;

        LDW     .D1T1 *A_SP[12], A_n           ;@@@@@

        LDW     .D1T2 *A_SP[13], B_x           ;@@@@@
||      LDW     .D2T1 *B_SP[4],A11             ;
        ; BRANCH OCCURS HERE
*==============================================================================*
        ; Function exit: restore the remaining saved registers from the
        ; frame, release the 20-word frame, restore the caller's CSR and
        ; return through B3 (reloaded from frame slot 11 before this point).
        ;
        ; Timing: LDW delivers its result four delay slots after issue and
        ; the CPU does not interlock.  B_csr is loaded in the branch packet,
        ; so the earliest packet that can read the loaded value is the fifth
        ; (last) branch delay slot.  The MVC to CSR is therefore placed
        ; there; issuing it any earlier would write whatever butterfly
        ; temporary is still sitting in B5 into CSR.
        LDW     .D2T2 *B_SP[7],B13             ;

        LDW     .D1T1 *A_SP[5],A10             ;
||      LDW     .D2T2 *B_SP[10],B10            ;

        LDW     .D1T1 *A_SP[3],A12             ;
||      LDW     .D2T2 *B_SP[8],B12             ;

        B       .S2   B3                       ;return, 5 delay slots follow
||      LDW     .D1T1 *A_SP[0],A15             ;
||      LDW     .D2T2 *B_SP[17], B_csr         ;load original CSR

        LDW     .D1T1 *A_SP[1],A14             ;delay slot 1
||      LDW     .D2T2 *B_SP[6],B14             ;

        ADDAW   .D2   B_SP, 20, B_SP           ;delay slot 2: release frame

        NOP     2                              ;delay slots 3-4

        MVC     .S2   B_csr,    CSR            ;delay slot 5: restore CSR

        ;BRANCH OCCURS HERE
*==============================================================================*
* End of radix4 assembly code                                                  *
*==============================================================================*
*      Copyright (C) 1997-1999 Texas Instruments Incorporated.                 *
*                            All Rights Reserved                               *
*==============================================================================*
