* ============================================================================ *
*                                                                              *
*     TEXAS INSTRUMENTS, INC.                                                  *
*                                                                              *
*     NAME                                                                     *
*           fftSPxSP                                                           *
*                                                                              *
*     USAGE                                                                    *
*           This routine is C-callable and can be called as:                   *
*                                                                              *
*           void fftSPxSP_asm(                                                 *
*               int N, float * ptr_x, float * ptr_w, float * ptr_y,            *
*               unsigned char brev[], int n_min, int offset, int n_max);       *
*                                                                              *
*           N = length of fft in complex samples, power of 2 < 8192            *
*           ptr_x = pointer to complex data input                              *
*           ptr_w = pointer to complex twiddle factor (see below)              *
*           brev = pointer to bit reverse table containing 64 entries          *
*           n_min = smallest fft butterfly used in computation                 *
*                   used for decomposing fft into subffts, see notes           *
*           offset = index in complex samples of sub-fft from start of main fft*
*           n_max = size of main fft in complex samples                        *
*                                                                              *
*           (See the C compiler reference guide.)                              *
*                                                                              *
*     DESCRIPTION                                                              *
*          The benchmark performs a mixed radix forwards fft using             *
*          a special sequence of coefficients generated in the following       *
*          way:                                                                *
*                                                                              *
*            /* generate vector of twiddle factors for optimized algorithm */  *
*           void tw_gen(float * w, int N)                                      *
*           {                                                                  *
*             int i, j, k;                                                     *
*             double x_t, y_t, theta1, theta2, theta3;                         *
*             const double PI = 3.141592654;                                   *
*                                                                              *
*             for (j=1, k=0; j <= N>>2; j = j<<2)                              *
*             {                                                                *
*                 for (i=0; i < N>>2; i+=j)                                    *
*                 {                                                            *
*                     theta1 = 2*PI*i/N;                                       *
*                     x_t = cos(theta1);                                       *
*                     y_t = sin(theta1);                                       *
*                     w[k]   =  (float)x_t;                                    *
*                     w[k+1] =  (float)y_t;                                    *
*                                                                              *
*                     theta2 = 4*PI*i/N;                                       *
*                     x_t = cos(theta2);                                       *
*                     y_t = sin(theta2);                                       *
*                     w[k+2] =  (float)x_t;                                    *
*                     w[k+3] =  (float)y_t;                                    *
*                                                                              *
*                     theta3 = 6*PI*i/N;                                       *
*                     x_t = cos(theta3);                                       *
*                     y_t = sin(theta3);                                       *
*                     w[k+4] =  (float)x_t;                                    *
*                     w[k+5] =  (float)y_t;                                    *
*                     k+=6;                                                    *
*                 }                                                            *
*             }                                                                *
*           }                                                                  *
*          This redundant set of twiddle factors is size 2*N float samples.    *
*          The function is accurate to about 130dB of signal to noise ratio    *
*          to the DFT function below:                                          *
*                                                                              *
*           void dft(int n, float x[], float y[])                              *
*           {                                                                  *
*              int k,i, index;                                                 *
*              const float PI = 3.141592654;                                   *
*              float * p_x;                                                    *
*              float arg, fx_0, fx_1, fy_0, fy_1, co, si;                      *
*                                                                              *
*              for(k = 0; k<n; k++)                                            *
*              {                                                               *
*                p_x = x;                                                      *
*                fy_0 = 0;                                                     *
*                fy_1 = 0;                                                     *
*                for(i=0; i<n; i++)                                            *
*                {                                                             *
*                  fx_0 = p_x[0];                                              *
*                  fx_1 = p_x[1];                                              *
*                  p_x += 2;                                                   *
*                  index = (i*k) % n;                                          *
*                  arg = 2*PI*index/n;                                         *
*                  co = cos(arg);                                              *
*                  si = -sin(arg);                                             *
*                  fy_0 += ((fx_0 * co) - (fx_1 * si));                        *
*                  fy_1 += ((fx_1 * co) + (fx_0 * si));                        *
*                }                                                             *
*                y[2*k] = fy_0;                                                *
*                y[2*k+1] = fy_1;                                              *
*              }                                                               *
*           }                                                                  *
*          The function takes the table and input data and calculates the fft  *
*          producing the frequency domain data in the Y array.                 *
*          As the fft allows every input point to affect every output point in *
*          a cache based system such as the c6711, this causes cache thrashing.*
*          This is mitigated by allowing the main fft of size N to be divided  *
*          into several steps, allowing as much data reuse as possible.        *
*                                                                              *
*          For example the following function:                                 *
*                                                                              *
*          fftSPxSP_asm(1024, &x_asm[0],&w[0],y_asm,brev,4,  0,1024);          *
*                                                                              *
*          is equivalent to:                                                   *
*                                                                              *
*          fftSPxSP_asm(1024,&x_asm[0],    &w[0],    y_asm,brev,256,  0,1024); *
*          fftSPxSP_asm(256, &x_asm[2*0],  &w[2*768],y_asm,brev,4,    0,1024); *
*          fftSPxSP_asm(256, &x_asm[2*256],&w[2*768],y_asm,brev,4,  256,1024); *
*          fftSPxSP_asm(256, &x_asm[2*512],&w[2*768],y_asm,brev,4,  512,1024); *
*          fftSPxSP_asm(256, &x_asm[2*768],&w[2*768],y_asm,brev,4,  768,1024); *
*                                                                              *
*          Notice how the 1st fft function is called on the entire 1K data set *
*          it covers the 1st pass of the fft until the butterfly size is 256.  *
*          The following 4 ffts do 256 pt ffts 25% of the size. These continue *
*          down to the end when the butterfly is of size 4. They use an index  *
*          to the main twiddle factor array of 0.75*2*N. This is because the   *
*          twiddle factor array is composed of successively decimated versions *
*          of the main array.                                                  *
*                                                                              *
*          N not equal to a power of 4 can be used, i.e. 512. In this case to  *
*          decompose the fft the following would be needed :                   *
*                                                                              *
*          fftSPxSP_asm(512, &x_asm[0],&w[0],y_asm,brev,2,  0,512);            *
*                                                                              *
*          is equivalent to:                                                   *
*                                                                              *
*          fftSPxSP_asm(512, &x_asm[0],    &w[0],    y_asm,brev,128,  0,512);  *
*          fftSPxSP_asm(128, &x_asm[2*0],  &w[2*384],y_asm,brev,2,    0,512);  *
*          fftSPxSP_asm(128, &x_asm[2*128],&w[2*384],y_asm,brev,2,  128,512);  *
*          fftSPxSP_asm(128, &x_asm[2*256],&w[2*384],y_asm,brev,2,  256,512);  *
*          fftSPxSP_asm(128, &x_asm[2*384],&w[2*384],y_asm,brev,2,  384,512);  *
*                                                                              *
*          The twiddle factor array is composed of log4(N) sets of twiddle     *
*          factors, (3/4)*N, (3/16)*N, (3/64)*N, etc.  The index into this     *
*          array for each stage of the fft is calculated by summing these      *
*          indices up appropriately.                                           *
*          For multiple ffts they can share the same table by calling the small*
*          ffts from further down in the twiddle factor array. In the same way *
*          as the decomposition works for more data reuse.                     *
*                                                                              *
*     ASSUMPTIONS:                                                             *
*          n must be a power of 2 and n >= 8  n <= 16384 points.               *
*          Complex time data x and twiddle factors w are aligned on double     *
*          word boundaries. Real values are stored in even word positions and  *
*          imaginary values in odd positions.                                  *
*                                                                              *
*          All data is in single precision floating point format. The complex  *
*          frequency data will be returned in linear order.                    *
*          This code is interrupt tolerant: interrupts are disabled on entry   *
*          to the function and re-enabled on exit.                             *
*                                                                              *
*     MEMORY NOTE:                                                             *
*          Configuration is BI ENDIAN either tools flag -me or none will give  *
*          the same results. No memory bank hits occur in this code.           *
*                                                                              *
*     TECHNIQUES                                                               *
*          A special sequence of coeffs. used as generated above               *
*          produces the fft. This collapses the inner 2 loops in the           *
*          traditional Burrus and Parks implementation Fortran Code.           *
*                                                                              *
*          The revised FFT uses a redundant sequence of twiddle factors to     *
*          allow a linear access through the data. This linear access enables  *
*          data and instruction level parallelism.                             *
*          The data produced by the fftSPxSP fft is in normal form, the        *
*          whole data array is written into a new output buffer.               *
*                                                                              *
*          The fftSPxSP butterfly is bit reversed, i.e. the inner 2 points of  *
*          the butterfly are crossed over, this has the effect of making the   *
*          data come out in bit reversed rather than fftSPxSP digit reversed   *
*          order. This simplifies the last pass of the loop. A simple table    *
*          is used to do the bit reversal out of place.                        *
*                                                                              *
*              unsigned char brev[64] = {                                      *
*                    0x0, 0x20, 0x10, 0x30, 0x8, 0x28, 0x18, 0x38,             *
*                    0x4, 0x24, 0x14, 0x34, 0xc, 0x2c, 0x1c, 0x3c,             *
*                    0x2, 0x22, 0x12, 0x32, 0xa, 0x2a, 0x1a, 0x3a,             *
*                    0x6, 0x26, 0x16, 0x36, 0xe, 0x2e, 0x1e, 0x3e,             *
*                    0x1, 0x21, 0x11, 0x31, 0x9, 0x29, 0x19, 0x39,             *
*                    0x5, 0x25, 0x15, 0x35, 0xd, 0x2d, 0x1d, 0x3d,             *
*                    0x3, 0x23, 0x13, 0x33, 0xb, 0x2b, 0x1b, 0x3b,             *
*                    0x7, 0x27, 0x17, 0x37, 0xf, 0x2f, 0x1f, 0x3f              *
*              };                                                              *
*                                                                              *
*     C CODE                                                                   *
*           This is the C equivalent of the assembly code without restrictions:*
*           Note that the assembly code is hand optimized and restrictions may *
*           apply.                                                             *
*                                                                              *
*                                                                              *
*         void fftSPxSP_co(int n, float ptr_x[], float ptr_w[], float ptr_y[], *
*                      unsigned char brev[], int n_min, int offset, int n_max) *
*         {                                                                    *
*            int  i, j, k, l1, l2, h2, predj;                                  *
*            int  tw_offset, stride, fft_jmp;                                  *
*                                                                              *
*            float x0, x1, x2, x3,x4,x5,x6,x7;                                 *
*            float xt0, yt0, xt1, yt1, xt2, yt2, yt3;                          *
*            float yt4, yt5, yt6, yt7;                                         *
*            float si1,si2,si3,co1,co2,co3;                                    *
*            float xh0,xh1,xh20,xh21,xl0,xl1,xl20,xl21;                        *
*            float x_0, x_1, x_l1, x_l1p1, x_h2 , x_h2p1, x_l2, x_l2p1;        *
*            float xl0_0, xl1_0, xl0_1, xl1_1;                                 *
*            float xh0_0, xh1_0, xh0_1, xh1_1;                                 *
*            float *x,*w;                                                      *
*            int   k0, k1, j0, j1, l0, radix;                                  *
*            float * y0, * ptr_x0, * ptr_x2;                                   *
*                                                                              *
*            radix = n_min;                                                    *
*                                                                              *
*            stride = n; /* n is the number of complex samples */              *
*            tw_offset = 0;                                                    *
*            while (stride > radix)                                            *
*            {                                                                 *
*                j = 0;                                                        *
*                fft_jmp = stride + (stride>>1);                               *
*                h2 = stride>>1;                                               *
*                l1 = stride;                                                  *
*                l2 = stride + (stride>>1);                                    *
*                x = ptr_x;                                                    *
*                w = ptr_w + tw_offset;                                        *
*                                                                              *
*                for (i = 0; i < n; i += 4)                                    *
*                {                                                             *
*                    co1 = w[j];                                               *
*                    si1 = w[j+1];                                             *
*                    co2 = w[j+2];                                             *
*                    si2 = w[j+3];                                             *
*                    co3 = w[j+4];                                             *
*                    si3 = w[j+5];                                             *
*                                                                              *
*                    x_0    = x[0];                                            *
*                    x_1    = x[1];                                            *
*                    x_h2   = x[h2];                                           *
*                    x_h2p1 = x[h2+1];                                         *
*                    x_l1   = x[l1];                                           *
*                    x_l1p1 = x[l1+1];                                         *
*                    x_l2   = x[l2];                                           *
*                    x_l2p1 = x[l2+1];                                         *
*                                                                              *
*                    xh0  = x_0    + x_l1;                                     *
*                    xh1  = x_1    + x_l1p1;                                   *
*                    xl0  = x_0    - x_l1;                                     *
*                    xl1  = x_1    - x_l1p1;                                   *
*                                                                              *
*                    xh20 = x_h2   + x_l2;                                     *
*                    xh21 = x_h2p1 + x_l2p1;                                   *
*                    xl20 = x_h2   - x_l2;                                     *
*                    xl21 = x_h2p1 - x_l2p1;                                   *
*                                                                              *
*                    ptr_x0 = x;                                               *
*                    ptr_x0[0] = xh0 + xh20;                                   *
*                    ptr_x0[1] = xh1 + xh21;                                   *
*                                                                              *
*                    ptr_x2 = ptr_x0;                                          *
*                    x += 2;                                                   *
*                    j += 6;                                                   *
*                    predj = (j - fft_jmp);                                    *
*                    if (!predj) x += fft_jmp;                                 *
*                    if (!predj) j = 0;                                        *
*                                                                              *
*                    xt0 = xh0 - xh20;                                         *
*                    yt0 = xh1 - xh21;                                         *
*                    xt1 = xl0 + xl21;                                         *
*                    yt2 = xl1 + xl20;                                         *
*                    xt2 = xl0 - xl21;                                         *
*                    yt1 = xl1 - xl20;                                         *
*                                                                              *
*                    ptr_x2[l1  ] = xt1 * co1 + yt1 * si1;                     *
*                    ptr_x2[l1+1] = yt1 * co1 - xt1 * si1;                     *
*                    ptr_x2[h2  ] = xt0 * co2 + yt0 * si2;                     *
*                    ptr_x2[h2+1] = yt0 * co2 - xt0 * si2;                     *
*                    ptr_x2[l2  ] = xt2 * co3 + yt2 * si3;                     *
*                    ptr_x2[l2+1] = yt2 * co3 - xt2 * si3;                     *
*                }                                                             *
*                tw_offset += fft_jmp;                                         *
*                stride = stride>>2;                                           *
*            }/* end while */                                                  *
*                                                                              *
*            j = 0;                                                            *
*                                                                              *
*            ptr_x0 = ptr_x;                                                   *
*            y0 = ptr_y;                                                       *
*            l0 = _norm(n) - 17;   /* get size of fft */                       *
*                                                                              *
*            if (radix <= 4) for (i = 0; i < n; i += 4)                        *
*            {                                                                 *
*                    /* reversal computation */                                *
*                                                                              *
*                    j0 = (j     ) & 0x3F;                                     *
*                    j1 = (j >> 6) & 0x3F;                                     *
*                    k0 = brev[j0];                                            *
*                    k1 = brev[j1];                                            *
*                    k = (k0 << 6) |  k1;                                      *
*                    k = k >> l0;                                              *
*                    j++;        /* multiple of 4 index */                     *
*                                                                              *
*                    x0   = ptr_x0[0];  x1 = ptr_x0[1];                        *
*                    x2   = ptr_x0[2];  x3 = ptr_x0[3];                        *
*                    x4   = ptr_x0[4];  x5 = ptr_x0[5];                        *
*                    x6   = ptr_x0[6];  x7 = ptr_x0[7];                        *
*                    ptr_x0 += 8;                                              *
*                                                                              *
*                    xh0_0  = x0 + x4;                                         *
*                    xh1_0  = x1 + x5;                                         *
*                    xh0_1  = x2 + x6;                                         *
*                    xh1_1  = x3 + x7;                                         *
*                                                                              *
*                    if (radix == 2) {                                         *
*                      xh0_0 = x0;                                             *
*                      xh1_0 = x1;                                             *
*                      xh0_1 = x2;                                             *
*                      xh1_1 = x3;                                             *
*                    }                                                         *
*                                                                              *
*                    yt0  = xh0_0 + xh0_1;                                     *
*                    yt1  = xh1_0 + xh1_1;                                     *
*                    yt4  = xh0_0 - xh0_1;                                     *
*                    yt5  = xh1_0 - xh1_1;                                     *
*                                                                              *
*                    xl0_0  = x0 - x4;                                         *
*                    xl1_0  = x1 - x5;                                         *
*                    xl0_1  = x2 - x6;                                         *
*                    xl1_1  = x3 - x7;                                         *
*                                                                              *
*                    if (radix == 2) {                                         *
*                      xl0_0 = x4;                                             *
*                      xl1_0 = x5;                                             *
*                      xl1_1 = x6;                                             *
*                      xl0_1 = x7;                                             *
*                    }                                                         *
*                                                                              *
*                    yt2  = xl0_0 + xl1_1;                                     *
*                    yt3  = xl1_0 - xl0_1;                                     *
*                    yt6  = xl0_0 - xl1_1;                                     *
*                    yt7  = xl1_0 + xl0_1;                                     *
*                                                                              *
*                    if (radix == 2) {                                         *
*                      yt7  = xl1_0 - xl0_1;                                   *
*                      yt3  = xl1_0 + xl0_1;                                   *
*                    }                                                         *
*                                                                              *
*                    y0[k] = yt0; y0[k+1] = yt1;                               *
*                    k += n>>1;                                                *
*                    y0[k] = yt2; y0[k+1] = yt3;                               *
*                    k += n>>1;                                                *
*                    y0[k] = yt4; y0[k+1] = yt5;                               *
*                    k += n>>1;                                                *
*                    y0[k] = yt6; y0[k+1] = yt7;                               *
*            }                                                                 *
*        }                                                                     *
*     CYCLES                                                                   *
*          cycles = 3.25 * ceil(log4(N)-1) * N  + 3*N + 179                    *
*          e.g. N = 1024,  cycles = 16563                                      *
*          e.g. N = 512,   cycles = 8371                                       *
*          e.g. N = 256,   cycles = 3443                                       *
*          e.g. N = 128,   cycles = 1811                                       *
*          e.g. N = 64,    cycles = 787                                        *
*                                                                              *
*     REFERENCES                                                               *
*     [1] C. S. Burrus and T.W. Parks (1985) "DFT/FFT and Convolution Algos -  *
*         Theory and Implementation", J. Wiley.                                *
*     [2] Implementation of Various Precision Fast Fourier Transforms on the   *
*         TMS320C6400 processor - David J. Hoyle, ESC 2000                     *
*     [3] Burrus - Paper on converting radix4 to radix2 digit reversal.        *
*                                                                              *
*     CODESIZE                                                                 *
*          1312 bytes                                                          *
*                                                                              *
*==============================================================================*
*      Copyright (C) 1997-2000 Texas Instruments Incorporated.                 *
*                      All Rights Reserved                                     *
*==============================================================================*
                .sect   ".data:copyright_h"         ; copyright text goes in its own data subsection
_Copyright:     .string "Copyright (C) 2000 Texas Instruments Incorporated."
                .string "All Rights Reserved."      ; emitted directly after the string above
                .global _fftSPxSP_asm               ; make the entry-point symbol externally visible
                .sect   ".text:hand"                ; statements after this go in .text:hand
_fftSPxSP_asm:
        ; Entry point of fftSPxSP_asm(N, ptr_x, ptr_w, ptr_y, brev, n_min,
        ; offset, n_max).  The aliases below bind the eight arguments to
        ; A4, B4, A6, B6, A8, B8, A10, B10 and name every register used by
        ; the first (LOOP_WHILE / LOOP_FOR_A) part of the routine.
        ; Many physical registers carry several aliases (e.g. A3, A7, B1,
        ; B2); which alias is "live" depends on the position in the
        ; hand-built schedule, so treat the names as documentation only.
* ====================== SYMBOLIC REGISTER ASSIGNMENTS ======================= *
        .asg    B15,        B_SP         ;stack pointer
        .asg    A7,         A_SP         ;stack pointer copy
        .asg    A5,         A_csr        ;original CSR
        .asg    B5,         B_csr        ;original CSR
        .asg    A3,         A_csr_no_gie ;CSR with interrupts disabled
        .asg    A4,         A_n          ;number of points to transform
        .asg    B4,         B_ptr_x      ;pointer to time domain data
        .asg    A6,         A_ptr_w      ;pointer to twiddle factor array
        .asg    B6,         B_ptr_y      ;pointer to output frequency domain
        .asg    A8,         A_brev       ;pointer to bit reverse table
        .asg    B8,         B_radix      ;smallest butterfly size
        .asg    A10,        A_offset     ;offset of fft in the output array
        .asg    B10,        B_n_max      ;largest fft used
        .asg    B14,        B_stride     ;size of butterfly
        .asg    A3,         A_tw_offset  ;offset into twiddle array
        .asg    A9,         A_j          ;index into twiddle table
        .asg    B7,         B_x          ;copy of pointer to transformed data
        .asg    A7,         A_fft_jmp    ;correction factor in index
        .asg    A8,         A_w0         ;current pointer to twiddle factors
        .asg    A2,         A_i          ;loop counter
        .asg    B2,         B_while      ;while loop condition for end
        .asg    B12,        B_xp0        ;x[2*i]
        .asg    B13,        B_xp1        ;x[2*i+1]
        .asg    B0,         B_x_         ;copy of pointer to transformed data
        .asg    A12,        A_xh2p0      ;x[N/2  +i*2  ]
        .asg    A13,        A_xh2p1      ;x[N/2  +i*2+1]
        .asg    B8,         B_xl1p0      ;x[N    +i*2  ]
        .asg    B9,         B_xl1p1      ;x[N    +i*2+1]
        .asg    A10,        A_xl2p0      ;x[3*N/2+i*2  ]
        .asg    A11,        A_xl2p1      ;x[3*N/2+i*2+1]
        .asg    B9,         B_xh0        ;intermediate butterfly calculation
        .asg    B8,         B_xh1        ;intermediate butterfly calculation
        .asg    B2,         B_xl0        ;intermediate butterfly calculation
        .asg    B1,         B_xl1        ;intermediate butterfly calculation
        .asg    A0,         A_xh20       ;intermediate butterfly calculation
        .asg    A12,        A_xh21       ;intermediate butterfly calculation
        .asg    A3,         A_xl20       ;intermediate butterfly calculation
        .asg    A5,         A_xl21       ;intermediate butterfly calculation
        .asg    B9,         B_y0         ;new x[2*i]
        .asg    B4,         B_y1         ;new x[2*i+1]
        .asg    B6,         B_ptr_x0     ;copy of pointer to data
        .asg    B5,         B_ptr_x1     ;copy of pointer to data
        .asg    A4,         A_ptr_x1     ;copy of pointer to data
        .asg    A15,        A_si10       ;sin(2*PI*i/N)
        .asg    A14,        A_co10       ;cos(2*PI*i/N)
        .asg    B11,        B_si20       ;sin(4*PI*i/N)
        .asg    B10,        B_co20       ;cos(4*PI*i/N)
        .asg    A11,        A_si30       ;sin(6*PI*i/N)
        .asg    A10,        A_co30       ;cos(6*PI*i/N)
        .asg    A7,         A_fft_jmp    ;copy of fft_jmp (repeats the A7 alias above)
        .asg    B3,         B_fft_jmp    ;copy of fft_jmp
        .asg    A3,         A_prj_       ;predicate to test for new butterfly
        .asg    A1,         A_prj        ;copy of prj_
        .asg    B1,         B_xt0        ;intermediate butterfly calculation
        .asg    B1,         B_yt0        ;intermediate butterfly calculation
        .asg    A5,         A_yt2        ;intermediate butterfly calculation
        .asg    A3,         A_xt1        ;intermediate butterfly calculation
        .asg    B2,         B_xt2        ;intermediate butterfly calculation
        .asg    B2,         B_yt1        ;intermediate butterfly calculation
        .asg    B4,         B_h2_0       ;index into data N/4
        .asg    A6,         A_h2_0       ;index into data N/4
        .asg    B4,         B_p0         ;intermediate butterfly calculation
        .asg    B3,         B_p1         ;intermediate butterfly calculation
        .asg    B1,         B_p00        ;new x[N  +i*2  ]
        .asg    B1,         B_p2         ;intermediate butterfly calculation
        .asg    B6,         B_p3         ;intermediate butterfly calculation
        .asg    B3,         B_p20        ;new x[N  +i*2+1]
        .asg    A6,         A_p4         ;intermediate butterfly calculation
        .asg    A0,         A_p5         ;intermediate butterfly calculation
        .asg    A11,        A_p40        ;new x[N/2  +i*2  ]
        .asg    A7,         A_p6         ;intermediate butterfly calculation
        .asg    A3,         A_p7         ;intermediate butterfly calculation
        .asg    A10,        A_p60        ;new x[N/2  +i*2+1]
        .asg    A3,         A_p8         ;intermediate butterfly calculation
        .asg    B12,        B_p9         ;intermediate butterfly calculation
        .asg    A14,        A_p80        ;new x[3*N/2 +i*2  ] (stored via A_ptr_x1)
        .asg    A3,         A_pa         ;intermediate butterfly calculation
        .asg    B10,        B_pb         ;intermediate butterfly calculation
        .asg    A0,         A_pa0        ;new x[3*N/2 +i*2+1]
        .asg    B2,         B_return     ;early return decision variable
* ============================ PIPE LOOP PROLOG ============================== *
        ; Spill registers and incoming arguments, mask interrupts, and
        ; issue the first loads of the first butterfly pass.
        ; Word slots used below B_SP (B_SP itself is never adjusted):
        ;   [1]  A15   [2]  B14   [3]  A14   [4]  B13   [5]  A13
        ;   [6]  B12   [7]  A12   [8]  B11   [9]  A11
        ;   [10] B10 (n_max)   [11] A10 (offset)  [12] B8 (n_min)
        ;   [13] A8  (brev)    [14] B6  (ptr_y)   [15] A6 (ptr_w)
        ;   [16] B4  (ptr_x)   [17] A4  (n)       [18] original CSR
        ;   [19] B3            [20] running tw_offset
        ; NOTE(review): the first stores below B_SP are issued before the
        ; MVC that writes the masked CSR; an interrupt taken in that window
        ; whose handler uses the stack could overwrite these slots.
        ; Confirm that callers tolerate this.
        MV    .L1X  B_SP,      A_SP                  ;copy stack pointer
||      STW   .D2T1 A15,       *-B_SP[1]             ;save A15

        STW   .D2T2 B14,       *-B_SP[2]             ;save B14
||      STW   .D1T1 A14,       *-A_SP[3]             ;save A14

        STW   .D2T2 B13,       *-B_SP[4]             ;save B13
||      STW   .D1T1 A13,       *-A_SP[5]             ;save A13

        STW   .D2T2 B12,       *-B_SP[6]             ;save B12
||      STW   .D1T1 A12,       *-A_SP[7]             ;save A12

        STW   .D2T2 B11,       *-B_SP[8]             ;save B11
||      STW   .D1T1 A11,       *-A_SP[9]             ;save A11

        STW   .D2T2 B_n_max,   *-B_SP[10]            ;save B10
||      STW   .D1T1 A_offset,  *-A_SP[11]            ;save A10

        STW   .D2T2 B_radix,   *-B_SP[12]            ;save B8
||      STW   .D1T1 A_brev,    *-A_SP[13]            ;save A8
||      MVC   .S2   CSR,       B_csr                 ;read current CSR

        STW   .D2T2 B_ptr_y,   *-B_SP[14]            ;save B6
||      STW   .D1T1 A_ptr_w,   *-A_SP[15]            ;save A6
||      AND   .L1X  B_csr,     -2,    A_csr_no_gie   ;clear bit 0: interrupt disable

        STW   .D2T2 B_ptr_x,   *-B_SP[16]            ;save B4
||      STW   .D1T1 A_n,       *-A_SP[17]            ;save A4
||      MVC   .S2X  A_csr_no_gie, CSR                ;disable interrupts
||      MV    .L1X  B_csr,     A_csr                 ;copy csr

        STW   .D2T2 B3,        *-B_SP[19]            ;save B3
||      STW   .D1T1 A_csr,     *-A_SP[18]            ;save original CSR
||      ZERO  .S1   A_tw_offset                      ;[2,0]tw_offset=0
||      MV    .S2X  A_n,       B_stride              ;[2,0]stride=n
||      MV    .L2   B_ptr_x,   B_x_                  ;[6,0]x_ = ptr_x

        LDDW  .D2T2 *B_ptr_x[0],B_xp1:B_xp0          ;[3,1]x[2*i], x[2*i+1]  
||      MV    .L2   B_ptr_x,   B_x                   ;[3,1]x = ptr_x 
||      SHRU  .S2   B_stride,  2,          B_stride  ;[6,0]stride>>=2

        LDDW  .D2T1 *++B_x_[B_stride],A_xh2p1:A_xh2p0;[4,1]X[i+N/4]

        LDDW  .D2T2 *++B_x_[B_stride],B_xl1p1:B_xl1p0;[5,1]X[i+N/2]
||      MPYSU .M1X  12,        B_stride,   A_fft_jmp ;[4,0]12*stride: twiddle step (halfwords, see ADDAH)

        LDDW  .D2T1 *++B_x_[B_stride],A_xl2p1:A_xl2p0;[6,1]X[3N/4+i]
||      ADDAH .D1   A_ptr_w,   A_tw_offset,A_w0      ;[5,0]w=ptr_w+tw_offset

        MPYSU .M2   12,        B_stride,   B_fft_jmp ;[7,1]12*stride: data jump used by ADDAH on B_x
||      SUB   .S1   A_n,       0,          A_i       ;[6,0]i = n (counts down by 4)
||      ADD   .L1   A_tw_offset,A_fft_jmp,A_tw_offset;[6,0]tw_offset+=fft_jmp

        STW   .D2T1 A_tw_offset, *-B_SP[20]          ;save tw_offset for the next pass
LOOP_WHILE:
        ; Top of one butterfly pass (one value of stride).  This section is
        ; the collapsed software-pipeline prolog for LOOP_FOR_A.
        ; The [c,i] tags in the comments appear to be (schedule cycle,
        ; pipeline iteration).
        ; "Prolog collapse": the six branches below are issued on
        ; consecutive cycles and land, one per cycle, on PREF8..PREF12 and
        ; then LOOP_FOR_A.  The +8/+4 byte offsets enter each kernel packet
        ; past its leading STW instruction(s) (two at PREF8, one at PREF9,
        ; PREF11 and PREF12, none at PREF10), so the kernel's tail is reused
        ; as prolog without storing results that do not exist yet.
        MPYSU .M1X  3,         B_stride,   A_fft_jmp ;[1,1]fft_jmp=3*stride 

        ADDSP .L2   B_xp1,     B_xl1p1,    B_xh1     ;[10,1]xh1=x[1]+x[l1+1]
||      ZERO  .D1   A_j                              ;[6,0]j=0

        ADDSP .L1   A_xh2p1,   A_xl2p1,    A_xh21    ;[11,1]xh21=x[h2+1]+x[l2+1]
||      ADDSP .L2   B_xp0,     B_xl1p0,    B_xh0     ;[11,1]xh0=x[0]+x[l1]

        SUB   .S1   A_fft_jmp, A_j,        A_prj_    ;[12,1]prj_ = fft_jmp - j
||      SUBSP .L1   A_xh2p0,   A_xl2p0,    A_xl20    ;[12,1]xl20=x[h2]-x[l2]
||      SUBSP .L2   B_xp1,     B_xl1p1,    B_xl1     ;[12,1]xl1=x[1]-x[l1+1]

        SUB   .D1   A_prj_,    3,          A_prj     ;[13,1]prj = prj_ - 3 (0 => last j of group)
||      SUBSP .L1   A_xh2p1,   A_xl2p1,    A_xl21    ;[13,1]xl21=x[h2+1]-x[l2+1]
||      SUBSP .L2   B_xp0,     B_xl1p0,    B_xl0     ;[13,1]xl0=x[0]-x[l1]

        ADDSP .L1   A_xh2p0,   A_xl2p0,    A_xh20    ;[14,1]xh20=x[h2]+x[l2]
||      MV    .S2   B_x,       B_ptr_x0              ;[14,1]ptr_x0 = x
||      LDDW  .D1T1 *A_w0[A_j],A_si10:A_co10         ;[14,1]si1=w[j+1]co1=w[j]
||      ADD   .S1   A_w0,      8,          A_w0      ;[14,1]w0 += 8 bytes
||[!A_prj]ADDAH.D2  B_x,       B_fft_jmp,  B_x       ;[14,1]if(!predj)x+=fft_jmp
||      MPYSU .M1X  3,         B_stride,   A_fft_jmp ;[1,2]fft_jmp = 3*stride 

        ADD   .S2   B_x,       8,          B_x       ;[15,1]x += 8 bytes
||      LDDW  .D1T2 *A_w0[A_j],B_si20:B_co20         ;[15,1]si2=w[j+3]co2=w[j+2]
||      B     .S1   PREF8 + 8                        ;prolog collapse

        LDDW  .D2T2 *B_x[0],   B_xp1:B_xp0           ;[3,2]X[i]
||      MV    .S2   B_x,       B_x_                  ;[3,2]x_ = x 
||      B     .S1   PREF9 + 4                        ;prolog collapse

        SUBSP .L2X  B_xh1,     A_xh21,     B_yt0     ;[17,1]yt0=xh1-xh21
||      ADDSP .L1X  A_xl21,    B_xl0,      A_xt1     ;[17,1]xt1=xl0+xl21
||      LDDW  .D2T1 *++B_x_[B_stride],A_xh2p1:A_xh2p0;[4,2]X[N/4 + i]
||      B     .S1   PREF10                           ;prolog collapse

        ADDSP .L1X  A_xl20,    B_xl1,      A_yt2     ;[18,1]yt2=xl1+xl20
||      SUBSP .L2X  B_xl1,     A_xl20,     B_yt1     ;[18,1]yt1=xl1-xl20
||      LDDW  .D2T2 *++B_x_[B_stride],B_xl1p1:B_xl1p0;[ 5,2]x[N/2+i]
||      B     .S1   PREF11 + 4                       ;prolog collapse

        SUBSP .L2X  B_xh0,     A_xh20,     B_xt0     ;[19,1]xt0=xh0-xh20 
||      LDDW  .D2T1 *++B_x_[B_stride],A_xl2p1:A_xl2p0;[ 6,2]X[3N/4+i]
||      B     .S1   PREF12 + 4                       ;prolog collapse

        SUBSP .L2X  B_xl0,     A_xl21,     B_xt2     ;[20,1]xt2=xl0-xl21 
||      MPYSU .M2   12,        B_stride,   B_fft_jmp ;[ 7,2] fft_jmp = 12*stride (data jump)
||      B     .S2   LOOP_FOR_A                       ;prolog collapse
* ============================ PIPE LOOP KERNEL ============================== *
        ; Butterfly kernel: 13 execute packets per pass, with work from up
        ; to three iterations overlapped (see the [cycle,iteration] tags).
        ; A_i counts down by 4 per pass.  The loop branch sits in the 8th
        ; packet; the five packets PREF8..PREF12 fill its delay slots.
        ; Instructions that start work for a later iteration are predicated
        ; on A_i, so on the final pass (A_i == 0) only the unpredicated
        ; stores of the draining iteration still execute.
        ; The PREFn labels are also entry points for the collapsed prolog.
LOOP_FOR_A:
        ADD   .S1   A_w0,      -16,        A_w0      ;[26,1]w0 -= 16 bytes
||      STW   .D2T2 B_y0,      *B_ptr_x0[0]          ;[26,1]write x[i] 
||      SUB   .D1   A_prj_,    3,          A_prj     ;[13,2]prj = prj_ - 3
||      SUBSP .L1   A_xh2p1,   A_xl2p1,    A_xl21    ;[13,2]xl21=x[h2+1]-x[l2+1]
||      SUBSP .L2   B_xp0,     B_xl1p0,    B_xl0     ;[13,2]xl0=x[0]-x[l1]

        MPYSU .M2   2,         B_stride,   B_h2_0    ;[27,1]h2_0 = 2*stride (word index step)
||      SUBSP .L2   B_p2,      B_p3,       B_p20     ;[27,1]co20*yt0-si20*xt0
||      ADDSP .L1   A_xh2p0,   A_xl2p0,    A_xh20    ;[14,2]xh20=x[h2]+x[l2]
||      MV    .S2   B_x,       B_ptr_x0              ;[14,2]ptr_x0 = x 
||      LDDW  .D1T1 *A_w0[A_j],A_si10:A_co10         ;[14,2]si1=w[j+1]co1=w[j]
||      ADD   .S1   A_w0,      8,          A_w0      ;[14,2]w0 += 8 bytes
||[!A_prj]ADDAH.D2  B_x,       B_fft_jmp,  B_x       ;[14,2]if(!predj)x+=fft_jmp
||      MPYSU .M1X  3,         B_stride,   A_fft_jmp ;[ 1,3]fft_jmp = 3*stride

        ADD   .S1X  B_ptr_x1,  -4,         A_ptr_x1  ;[28,1]A_ptr_x1 = B_ptr_x1 - 4 bytes
||      MPYSP .M1   A_co30,    A_yt2,      A_pa      ;[28,1]pa = co30*yt2 
||      MPYSP .M2X  A_si30,    B_xt2,      B_pb      ;[28,1]pb = si30*xt2 
||      SUBSP .L1   A_p6,      A_p7,       A_p60     ;[28,1]co10*yt1-si10*xt1
||      ADDSP .L2   B_p0,      B_p1,       B_p00     ;[28,1]si20*yt0+co20*xt0
||      ADD   .S2   B_x,       8,          B_x       ;[15,2]x += 8 bytes
||      LDDW  .D1T2 *A_w0[A_j],B_si20:B_co20         ;[15,2]si2=w[j+3]co2=w[j+2]

        MPYSP .M2X  B_xt2,     A_co30,     B_p9      ;[29,1]p9=xt2*co30 
||      MPYSP .M1   A_yt2,     A_si30,     A_p8      ;[29,1]p8=yt2*si30 
||      ADDSP .L1   A_p4,      A_p5,       A_p40     ;[29,1](si10*yt1+co10*xt1)
||      MV    .S1X  B_h2_0,    A_h2_0                ;[29,1]copy h2_0 
||      LDDW  .D2T2 *B_x[0],   B_xp1:B_xp0           ;[ 3,3]X[i]
||      MV    .S2   B_x,       B_x_                  ;[ 3,3]x_ = x 

        SUBSP .L2X  B_xh1,     A_xh21,     B_yt0     ;[17,2]yt0=xh1-xh21
||      ADDSP .L1X  A_xl21,    B_xl0,      A_xt1     ;[17,2]xt1=xl0+xl21
||      LDDW  .D2T1 *++B_x_[B_stride],A_xh2p1:A_xh2p0;[ 4,3]X[N/4+i]

        ADDSP .L1X  A_xl20,    B_xl1,      A_yt2     ;[18,2]yt2=xl1+xl20
||      SUBSP .L2X  B_xl1,     A_xl20,     B_yt1     ;[18,2]yt1=xl1-xl20
||      LDDW  .D2T2 *++B_x_[B_stride],B_xl1p1:B_xl1p0;[ 5,3]x[N/2+i]

        SUB   .D1   A_i,       4,          A_i       ;[32,1]i -= 4
||      SUBSP .L1X  A_pa,      B_pb,       A_pa0     ;[32,1]co30*yt2-si30*xt2
||      SUBSP .L2X  B_xh0,     A_xh20,     B_xt0     ;[19,2]xt0=xh0-xh20 
||      LDDW  .D2T1 *++B_x_[B_stride],A_xl2p1:A_xl2p0;[ 6,3]x[3N/4+i]

  [ A_i]B     .S2   LOOP_FOR_A                       ;[33,1]}/* end for */
||      ADDSP .L1X  B_p9,      A_p8,       A_p80     ;[33,1]si30*yt2+co30*xt2
||      STW   .D1T2 B_p00,     *++A_ptr_x1[A_h2_0]   ;[33,1]save x[N/4+i]
||[ A_i]SUBSP .L2X  B_xl0,     A_xl21,     B_xt2     ;[20,2]xt2=xl0-xl21 
||[ A_i]MPYSU .M2   12,        B_stride,   B_fft_jmp ;[ 7,3]fft_jmp = 12*stride (data jump)
PREF8:
        STW   .D2T2 B_p20,     *++B_ptr_x1[B_h2_0]   ;[34,1]save x[N/4+i+1]
||      STW   .D1T1 A_p40,     *++A_ptr_x1[A_h2_0]   ;[34,1]save x[N/2+i]
||[ A_i]MPYSP .M2   B_co20,    B_yt0,      B_p2      ;[21,2]p2=co20*yt0 
||[ A_i]ADDSP .L2X  B_xh1,     A_xh21,     B_y1      ;[21,2]y1=xh1+xh21
||[ A_i]MPYSP .M1   A_co10,    A_xt1,      A_p5      ;[21,2]p5 = co10*xt1 
PREF9:
        STW   .D2T1 A_p60,     *++B_ptr_x1[B_h2_0]   ;[35,1]x[3*N/4+i]
||[ A_i]ADD   .D1   A_w0,      8,          A_w0      ;[22,2]w0 += 8 bytes
||[ A_i]ADDSP .L2X  B_xh0,     A_xh20,     B_y0      ;[22,2]y0=xh0+xh20
||[ A_i]MPYSP .M2   B_si20,    B_yt0,      B_p0      ;[22,2]p0=si20*yt0 
||[ A_i]MPYSP .M1X  A_si10,    B_yt1,      A_p4      ;[22,2]p4=si10*yt1 
PREF10:
  [ A_i]MPYSP .M2   B_si20,    B_xt0,      B_p3      ;[23,2]p3 = si20*xt0 
||[ A_i]ADD   .S1   A_j,       3,          A_j       ;[23,2]j += 3
||[ A_i]LDDW  .D1T1 *A_w0[A_j],A_si30:A_co30         ;[23,2]si3=w[j+5]co3=w[j+4]
||[ A_i]MPYSP .M1X  A_co10,    B_yt1,      A_p6      ;[23,2]p6 = co10*yt1 
||[ A_i]ADDSP .L2   B_xp1,     B_xl1p1,    B_xh1     ;[10,3]xh1=x[1]+x[l1+1]
PREF11:
        STW   .D2T1 A_pa0,     *++B_ptr_x1[B_h2_0]   ;[37,1]save x[3*N/4+i+1]
||[ A_i]MPYSP .M2   B_co20,    B_xt0,      B_p1      ;[24,2]p1=co20*xt0
||[!A_prj]ZERO.S1   A_j                              ;[24,2]if(!predj)j = 0
||[ A_i]MPYSP .M1   A_si10,    A_xt1,      A_p7      ;[24,2]p7=si10*xt1 
||[ A_i]ADDSP .L1   A_xh2p1,   A_xl2p1,    A_xh21    ;[11,3]xh21=x[h2p1]+x[l2p1]
||[ A_i]ADDSP .L2   B_xp0,     B_xl1p0,    B_xh0     ;[11,3]xh0=x[0]+x[l1]
PREF12:
        STW   .D1T1 A_p80,     *++A_ptr_x1[A_h2_0]   ;[38,1]save x[3*N/4+i]
||[ A_i]ADD   .S2   B_ptr_x0,  4,          B_ptr_x1  ;[25,2]ptr_x1 = ptr_x0 + 4 bytes
||[ A_i]STW   .D2T2 B_y1,      *B_ptr_x0[1]          ;[25,2]save x[i+1]
||[ A_i]SUB   .S1   A_fft_jmp, A_j,        A_prj_    ;[12,3]prj_ = fft_jmp - j
||[ A_i]SUBSP .L1   A_xh2p0,   A_xl2p0,    A_xl20    ;[12,3]xl20=x[h2]-x[l2]
||[ A_i]SUBSP .L2   B_xp1,     B_xl1p1,    B_xl1     ;[12,3]xl1=x[1]-x[l1+1]
* ============================ PIPE LOOP EPILOG ============================== *
        ; End of one butterfly pass.  Reload the arguments that were
        ; overwritten inside the kernel (their registers are aliased to
        ; temporaries), decide whether another pass is needed, and if so
        ; start the next pass's loads in the delay slots of the branch back
        ; to LOOP_WHILE.  A7 served as A_fft_jmp/A_p6 in the loop, so the
        ; stack-pointer copy is rebuilt first.
        MV    .L1X  B_SP,      A_SP                  ;copy stack pntr
||      LDW   .D2T2 *-B_SP[12],B_radix               ;reload n_min (B8) from slot 12

        LDW   .D2T2 *-B_SP[16],B_ptr_x               ;restore ptr_x

        LDW   .D1T1 *-A_SP[17],A_n                   ;restore A_n

        LDW   .D1T1 *-A_SP[20],A_tw_offset           ;restore tw_offset

        LDW   .D1T1 *-A_SP[15],A_ptr_w               ;restore ptr_w

        CMPGTU.L2   B_stride,  B_radix,    B_while   ;another pass while stride > n_min (unsigned)
||      SHRU  .S2   B_stride,  2,          B_stride  ;[6,0]stride=stride>>2

  [B_while]B  .S1   LOOP_WHILE                       ;}/* end while */
||[B_while]LDDW.D2T2 *B_ptr_x[0],B_xp1:B_xp0         ;[3,1] X[i]
||[B_while]MV .L2   B_ptr_x,   B_x                   ;[3,1]x = ptr_x
||      MV    .S2   B_ptr_x,   B_x_                  ;[6,0]x_ = ptr_x

 [B_while]LDDW.D2T1 *++B_x_[B_stride],A_xh2p1:A_xh2p0;[4,1]x[N/4+i]

 [B_while]LDDW.D2T2 *++B_x_[B_stride],B_xl1p1:B_xl1p0;[5,1]x[N/2+i]
||[B_while]MPYSU.M1X 12,       B_stride,   A_fft_jmp ;[4,0]12*stride: twiddle step (halfwords)

 [B_while]LDDW.D2T1 *++B_x_[B_stride],A_xl2p1:A_xl2p0;[6,1]x[3N/4+i]
||[B_while]ADDAH.D1 A_ptr_w,   A_tw_offset,A_w0      ;[5,0]w=ptr_w+tw_offset

 [B_while]MPYSU.M2  12,        B_stride,   B_fft_jmp ;[7,1]12*stride: data jump
||[B_while]SUB.S1   A_n,       0,          A_i       ;[6,0]i = n (counts down by 4)
||[B_while]ADD.L1   A_tw_offset,A_fft_jmp,A_tw_offset;[6,0]tw_offset+=fft_jmp

  [B_while]STW.D2T1 A_tw_offset,*-B_SP[20]           ;save tw_offset 
        ;BRANCH OCCURS

        ; All passes done.  If n_min > 4 (unsigned) skip the final
        ; radix-2/4 + bit-reversal pass and go straight to the restore code.
        CMPGTU.L2   B_radix,   4,          B_return  ;check for early exit

 [B_return]B  .S2   EARLY_EXIT                       ;early exit for cache

        ; The loads and NOP 3 below occupy the branch delay slots; they
        ; execute whether or not the branch is taken.
        LDW   .D2T2 *-B_SP[10],            B_n_max   ;restore n_max
||      LDW   .D1T1 *-A_SP[11],            A_offset  ;restore offset

        LDW   .D2T2 *-B_SP[14],            B_ptr_y   ;restore ptr_y
||      LDW   .D1T1 *-A_SP[13],            A_brev    ;restore brev table

        NOP             3                            ;pipeline latency

        NOP             1                            ;pipeline latency
* ====================== SYMBOLIC REGISTER ASSIGNMENTS ======================= *
        ; Aliases for the final pass (LOOP_FOR_B).  Several names from the
        ; first table are rebound here (e.g. B_x is now B8, B_y0 is now B5,
        ; A_yt2 is now A3); B_SP, A_SP and B_csr keep their earlier binding.
        .asg            A4,         A_n       ;number of points in transform
        .asg            B4,         B_ptr_x   ;pntr to in data partial transform
        .asg            B6,         B_ptr_y   ;pntr to final output data
        .asg            A8,         A_brev    ;pointer to bit reverse table
        .asg            B8,         B_radix   ;smallest butterfly size radix=2,4
        .asg            A10,        A_offset  ;index into main fft array
        .asg            B10,        B_n_max   ;maximum size of all ffts
        .asg            A2,         A_r2      ;condition whether radix2
        .asg            B8,         B_x       ;pointer to data
        .asg            A7,         A_x       ;pointer to data
        .asg            A11,        A_y0      ;even output data pointer
        .asg            B5,         B_y0      ;odd output data pointer
        .asg            B12,        B_n2      ;n/4
        .asg            A6,         A_n2      ;copy of n2
        .asg            B2,         B_i       ;loop counter
        .asg            B9,         B_l0      ;shift for index bit reverse
        .asg            A1,         A_pro     ;prolog counter
        .asg            A9,         A_j       ;index into data
        .asg            A12,        A_cx3f    ;mask for bit reverse calc.
        .asg            A3,         A_j0      ;lower 6 bits of j
        .asg            A5,         A_j1      ;upper 6 bits of j
        .asg            A0,         A_k0      ;reversed j0
        .asg            A10,        A_k1      ;reversed j1
        .asg            B4,         B_k0_     ;k0 << 6
        .asg            B4,         B_k_      ;k0_ + k1
        .asg            B6,         B_k       ;k_ >> l0
        .asg            A3,         A_k       ;copy of k
        .asg            A5,         A_ptr_y0  ;copy of A_y0
        .asg            B7,         B_ptr_y1  ;copy of B_y0
        .asg            B11,        B_x1      ;partial sums
        .asg            B10,        B_x0      ;partial sums
        .asg            A15,        A_x3      ;partial sums
        .asg            A14,        A_x2      ;partial sums
        .asg            B1,         B_x5      ;partial sums
        .asg            B0,         B_x4      ;partial sums
        .asg            A13,        A_x7      ;partial sums
        .asg            A12,        A_x6      ;partial sums
        .asg            B1,         B_xh0_0   ;partial sums
        .asg            B7,         B_xh1_0   ;partial sums
        .asg            A4,         A_xh0_1   ;partial sums
        .asg            A13,        A_xh1_1   ;partial sums
        .asg            A12,        A_yt0     ;partial sums
        .asg            A5,         A_yt1     ;partial sums
        .asg            B3,         B_yt4     ;partial sums
        .asg            B6,         B_yt5     ;partial sums
        .asg            B3,         B_xl0_0   ;partial sums
        .asg            B4,         B_xl1_0   ;partial sums
        .asg            A5,         A_xl0_1   ;partial sums
        .asg            A14,        A_xl1_1   ;partial sums
        .asg            A3,         A_yt2     ;partial sums
        .asg            B4,         B_yt3     ;partial sums
        .asg            B5,         B_yt6     ;partial sums
        .asg            A4,         A_yt7     ;partial sums
* ============================ PIPE LOOP PROLOG ============================== *
        ; Set-up for the final radix-2/radix-4 pass with bit-reversed
        ; output.  Computes the first table lookups from offset, the shift
        ; l0 from n_max, the two input pointers (B_x = ptr_x and
        ; A_x = ptr_x + 8 bytes), and the loop count i = n + 4.  A_pro = 1
        ; suppresses the stores on the first trip through the kernel, which
        ; is presumably why the count carries one extra step of 4.
        SHRU  .S1   A_offset,   8,          A_j1    ;[ 1,1]j1 =offset >> 8 
||      NORM  .L2   B_n_max,    B_l0                ;[ 2,0]l0=norm(n_max) 

        LDBU  .D1T1 *A_brev[A_j1],          A_k1    ;[ 2,1]k1=brev[j1] 
||      SHRU  .S1   A_offset,   2,          A_j     ;[ 4,0]j=offset>>2 

        MVK   .S1   03Fh,       A_cx3f              ;[ 2,1]const = 0x3f 

        AND   .S1   A_j,        A_cx3f,     A_j0    ;[ 3,1]j0 = lo6bit of j 

        LDBU  .D1T1 *A_brev[A_j0],          A_k0    ;[ 5,1]k0 = brev[j0]

        ADD   .L2   B_l0,       -16,        B_l0    ;[ 3,0]l0 -=16
||      MV    .S1X  B_ptr_y,    A_y0                ;[ 3,0]y0 = ptr_y

        ADD   .S2   B_l0,       -1,         B_l0    ;[ 4,0]l0=norm(n_max) - 17
||      ADD   .L1X  B_ptr_x,    8,          A_x     ;[ 4,0]A_x = ptr_x + 8 bytes

        MVK   .S1   1,          A_pro               ;[ 5,0]pro = 1 
||      ADD   .L2X  A_n,        4,          B_i     ;[ 5,0]i = n + 4 (counts down by 4)
||      SHRU  .S2   B_n_max,    1,          B_n2    ;[ 5,0]n2 = n_max >> 1
||      MV    .D2   B_ptr_x,    B_x                 ;[ 5,0]x=ptr_x 
||      SUB   .L1X  B_radix,    2,          A_r2    ;[ 5,0]r2 = n_min - 2 (0 => radix 2)
* ============================ PIPE LOOP KERNEL ============================== *
        ; Final-pass kernel: 12 execute packets per trip, overlapping up to
        ; three iterations (see the [cycle,iteration] tags).  Each trip
        ; loads eight floats (x0..x7) through B_x and A_x, forms the
        ; sums/differences, and stores eight results through
        ; B_ptr_y1/A_ptr_y0 at an index k derived from brev[] lookups.
        ; [!A_r2] instructions run only when n_min == 2 and replace the
        ; radix-4 operands with the raw inputs.  [!A_pro] gates every store
        ; off until A_pro is cleared at the end of the first trip.  [ B_i]
        ; gates work for later iterations off on the final trip.  The loop
        ; branch is in the 7th packet; the remaining five packets fill its
        ; delay slots, after which execution falls into EARLY_EXIT.
LOOP_FOR_B:
        MV    .S1X  B_n2,       A_n2                ;[21,1]copy n2
||[!A_r2]MV   .S2   B_x1,       B_xh1_0             ;[21,1]if(!r2)xh1_0=x1
||[!A_r2]MV   .D1   A_x7,       A_xl0_1             ;[21,1]if(!r2)xl0_1=x7
||      LDDW  .D2T2 *B_x++[2],  B_x1:B_x0           ;[ 9,2]load x0,x1; B_x += 16 bytes

        ADDSP .L1X  B_xl1_0,    A_xl0_1,    A_yt7   ;[22,1]yt7=xl1_0+xl0_1
||[!A_r2]MV   .S2   B_x0,       B_xh0_0             ;[22,1]if(!r2)xh0_0=x0
||      SUBSP .L2X  B_xl1_0,    A_xl0_1,    B_yt3   ;[22,1]yt3=xl1_0-xl0_1
||[!A_r2]MV   .S1   A_x3,       A_xh1_1             ;[22,1]if(!r2)xh1_1=x3
||      LDDW  .D1T1 *A_x++[2],  A_x3:A_x2           ;[10,2]load x2,x3; A_x += 16 bytes

  [!A_r2]ADDSP.L2X  B_xl1_0,    A_xl0_1,    B_yt3   ;[23,1](!r2)yt3=xl1_0+xl0_1
||      ADDSP .L1X  B_xh1_0,    A_xh1_1,    A_yt1   ;[23,1]yt1=xh1_0+xh1_1
||[!A_r2]MV   .S1   A_x2,       A_xh0_1             ;[23,1]if(!r2)xh0_1=x2
||      ADD   .D1   A_j,        1,          A_j     ;[11,2] j+=1
||      LDDW  .D2T2 *B_x++[2],  B_x5:B_x4           ;[11,2]load x4,x5; B_x += 16 bytes

  [!A_r2]MV   .S1   A_x6,       A_xl1_1             ;[24,1]if(!r2)xl1_1=x6
||[!A_r2]MV   .D2   B_x4,       B_xl0_0             ;[24,1]if(!r2)xl0_0=x4
||      ADDSP .L1X  B_xh0_0,    A_xh0_1,    A_yt0   ;[24,1]yt0=xh0_0+xh0_1
||      SUBSP .L2X  B_xh0_0,    A_xh0_1,    B_yt4   ;[24,1]yt4=xh0_0-xh0_1
||      LDDW  .D1T1 *A_x++[2],  A_x7:A_x6           ;[12,2]load x6,x7; A_x += 16 bytes

        SUBSP .L2X  B_xl0_0,    A_xl1_1,    B_yt6   ;[25,1]yt6=xl0_0-xl1_1
||[!A_r2]SUBSP.L1X  B_xl1_0,    A_xl0_1,    A_yt7   ;[25,1](!r2)yt7=xl1_0-xl0_1
||      SHRU  .S1   A_j,        6,          A_j1    ;[ 1,3]j1 = j >> 6 

        SUB   .S2   B_i,        4,          B_i     ;[26,1]i -= 4
||      ADDAW .D2   B_y0,       B_k,        B_ptr_y1;[26,1]ptr_y1 = y0+k 
||      ADDSP .L1X  B_xl0_0,    A_xl1_1,    A_yt2   ;[26,1]yt2=xl0_0+xl1_1
||      SUBSP .L2X  B_xh1_0,    A_xh1_1,    B_yt5   ;[26,1]yt5=xh1_0-xh1_1
||      LDBU  .D1T1 *A_brev[A_j1],          A_k1    ;[ 2,3]k1 = brev[j1] 
||      MVK   .S1   03Fh,       A_cx3f              ;[ 2,3]const = 0x3f 

        ADDAW .D1   A_y0,       A_k,        A_ptr_y0;[27,1]ptr_y0 = y0 + k 
||[!A_pro]STW .D2T1 A_yt1,      *B_ptr_y1++[B_n2]   ;[27,1]store x[1] 
||[ B_i]B     .S2   LOOP_FOR_B                      ;[27,1]} /* end for */
||[ B_i]AND   .S1   A_j,        A_cx3f,     A_j0    ;[ 3,3]j0 = j & 0x3f

  [!A_pro]STW .D2T2 B_yt3,      *B_ptr_y1++[B_n2]   ;[28,1]store x[3]
||[!A_pro]STW .D1T1 A_yt0,      *A_ptr_y0++[A_n2]   ;[28,1]store x[0] 
||[ B_i]SHL   .S2X  A_k0,       6,          B_k0_   ;[16,2]k0_ = k0 << 6
||[ B_i]SUBSP .L2   B_x1,       B_x5,       B_xl1_0 ;[16,2]xl1_0=x[1]-x[5]

  [ B_i]ADD   .S2X  B_k0_,      A_k1,       B_k_    ;[17,2]k_=k0_+k1 
||[ B_i]SUBSP .L1   A_x2,       A_x6,       A_xl0_1 ;[17,2]xl0_1=x[2]-x[6]
||[ B_i]ADDSP .L2   B_x1,       B_x5,       B_xh1_0 ;[17,2]xh1_0=x[1]+x[5]
||[ B_i]LDBU  .D1T1 *A_brev[A_j0],          A_k0    ;[ 5,3]k0=brev[j0] 

  [!A_pro]STW .D1T1 A_yt2,      *A_ptr_y0++[A_n2]   ;[30,1]store x[2]
||[!A_pro]STW .D2T2 B_yt5,      *B_ptr_y1++[B_n2]   ;[30,1]store x[5]
||[ B_i]ADDSP .L1   A_x3,       A_x7,       A_xh1_1 ;[18,2]xh1_1=x[3]+x[7]
||[ B_i]ADDSP .L2   B_x0,       B_x4,       B_xh0_0 ;[18,2]xh0_0=x[0]+x[4]
||[ B_i]SHRU  .S2   B_k_,       B_l0,       B_k     ;[18,2]k=k_ >> l0 

  [!A_pro]STW .D2T1 A_yt7,      *B_ptr_y1[0]        ;[31,1]store x[7]
||[!A_pro]STW .D1T2 B_yt4,      *A_ptr_y0++[A_n2]   ;[31,1]store x[4]
||[ B_i]MV    .S1X  B_k,        A_k                 ;[19,2]duplicate k
||[ B_i]ADDSP .L1   A_x2,       A_x6,       A_xh0_1 ;[19,2]xh0_1=x[2]+x[6]

  [ B_i]ZERO  .S1   A_pro                           ;[32,1]pro = 0 
||[!A_pro]STW .D1T2 B_yt6,      *A_ptr_y0[0]        ;[32,1]store x[6]
||[ B_i]ADD   .S2X  A_y0,       4,          B_y0    ;[20,2]B_y0 = A_y0 + 4 bytes
||[ B_i]SUBSP .L2   B_x0,       B_x4,       B_xl0_0 ;[20,2]xl0_0=x[0]-x[4]
||[ B_i]SUBSP .L1   A_x3,       A_x7,       A_xl1_1 ;[20,2]xl1_1=x[3]-x[7]
||[!A_r2]MV   .D2   B_x5,       B_xl1_0             ;[20,2]if(!r2)xl1_0=x5
* ============================ PIPE LOOP EPILOG ============================== *
        ; Common exit path, reached by falling out of LOOP_FOR_B or by the
        ; early-exit branch.  Reloads B3, A10-A15 and B10-B14 from the
        ; slots below B_SP, then writes the saved CSR back.  The return
        ; branch is issued with the A11/B11 reloads; the last reload pair,
        ; the CSR write and NOP 3 fill its delay slots.
EARLY_EXIT:
        MV    .L1X  B_SP,       A_SP                ;copy stack pointer
||      LDW   .D2T2 *-B_SP[19], B3                  ;restore B3

        LDW   .D2T1 *-B_SP[1],  A15                 ;restore A15
||      LDW   .D1T2 *-A_SP[18], B_csr               ;load original CSR

        LDW   .D2T2 *-B_SP[2],  B14                 ;restore B14
||      LDW   .D1T1 *-A_SP[3],  A14                 ;restore A14

        LDW   .D2T2 *-B_SP[4],  B13                 ;restore B13
||      LDW   .D1T1 *-A_SP[5],  A13                 ;restore A13

        LDW   .D2T2 *-B_SP[6],  B12                 ;restore B12
||      LDW   .D1T1 *-A_SP[7],  A12                 ;restore A12

        LDW   .D2T2 *-B_SP[8],  B11                 ;restore B11
||      LDW   .D1T1 *-A_SP[9],  A11                 ;restore A11
||      B     .S2   B3                              ;return to caller
 
        LDW   .D2T2 *-B_SP[10], B10                 ;restore B10    
||      LDW   .D1T1 *-A_SP[11], A10                 ;restore A10

        MVC   .S2   B_csr,      CSR                 ;write back the saved CSR (interrupt state restored)

        NOP         3                               ;wait for branch delay
        ;BRANCH OCCURS HERE
* ============================================================================ *
* End of fftSPxSP assembly code                                                *
*==============================================================================*
*      Copyright (C) 1997-2000 Texas Instruments Incorporated.                 *
*                            All Rights Reserved                               *
*==============================================================================*
