*===============================================================================*
*       TEXAS INSTRUMENTS, INC.
*
*       SYMMETRIC FIR
*
*       Revision Date: 7/22/98   by C. J. Chen
*
*       USAGE   This routine is C Callable and can be called as:
*
*               void
*               sym_fir(short x[], short h[], short y[], int n, int m, int s)
*
*               x = input array
*               h = coefficient array (2n+1 taps)
*               y = output array
*               n = number of coefficients before the center tap h[n]
*                   (multiple of 2, >= 2)
*               m = number of output samples (even >= 2)
*               s = number of insignificant low-order bits to truncate
*                   (right-shift count applied to the accumulator)
*
*               If routine is not to be used as a C callable function
*               then all instructions relating to stack should be removed.
*               Refer to comments of individual instructions.  You will also
*               need to initialize values for all of the values passed as these
*               are assumed to be in registers as defined by the calling  
*               convention of the compiler, (refer to the C compiler reference
*               guide).
*
*       C Code  This is the C equivalent of the Assembly Code without
*               restrictions.  Note that the assembly code is hand optimized
*               and restrictions may apply.
*
*               Original C code:
*               void
*               sym_fir(short x[], short h[], short y[], int n, int m, int s)
*               {
*                       int             i, j;
*                       Long40          y0;
*                       Long40          round = (Long40) 1 << (s - 1);
*                       for (j = 0; j < m; j++) {
*                          y0 = round;
*                          for (i = 0; i < n; i++)
*                            y0 += (short) (x[j + i] + x[j + 2 * n - i]) * h[i];
*                          y0 += x[j + n] * h[n];
*                          y[j] = (int) (y0 >> s);
*                       }
*               }
*
*               Unroll two times C code:
*               void
*               sym_fir_x(short x[], short h[], short y[], int n, int m, int s)
*               {
*                       short   *xa, *xb, x0a, x0b, x1a, x1b;
*                       short   x00, x10, x01, x11, h0, h1;
*                       int     i, j, p00, p01, p10, p11;
*                       Long40  y0, y1;
*                       Long40  round = (Long40) 1 << (s - 1);
*                       xa = x;
*                       xb = x + 2 * n + 1;
*                       for (j = 0; j < m; j += 2) {
*                               y0 = round;
*                               y1 = round;
*                               x0a = *xa++;
*                               x1b = *xb--;
*                               for (i = 0; i < n; i += 2) {
*                                       x1a = *xa++;
*                                       x0b = *xb--;
*                                       x00 = x0a + x0b;
*                                       x10 = x1a + x1b;
*                                       h0 = *h++;
*                                       p00 = x00 * h0;
*                                       p10 = x10 * h0;
*                                       y0 += p00;
*                                       y1 += p10;
*
*                                       x0a = *xa++;
*                                       x1b = *xb--;
*                                       x01 = x1a + x1b;
*                                       x11 = x0a + x0b;
*                                       h1 = *h++;
*                                       p01 = x01 * h1;
*                                       p11 = x11 * h1;
*                                       y0 += p01;
*                                       y1 += p11;
*                               }
*                               h0 = *h;
*                               p00 = x0a * h0;
*                               p10 = x1b * h0;
*                               y0 += p00;
*                               y1 += p10;
*
*                               y0 >>= s;
*                               y1 >>= s;
*                               y[j] = (int) y0;
*                               y[j + 1] = (int) y1;
*
*                               xa += 2 - (n + 1);
*                               xb += 2 + (n + 1);
*                               h -= n;
*                       }
*               }
*
*       DESCRIPTION
*               This SYMMETRIC FIR assumes the filter has 2n + 1 taps and
*               that the number of output samples is a multiple of 2.  It
*               operates on 16-bit data with a 40-bit accumulation.  This
*               routine has no memory hits.  However, h should be
*               word-aligned.  The filter produces m output samples from
*               2n+1 coefficients.  The assembly routine computes 2 output
*               samples at a time.
*
*
*       TECHNIQUES
*               The inner loop is unrolled two times.  Thus, the number of
*               filter coefficients must be 2n + 1 where n is even.  Also,
*               the number of output samples must be a multiple of 2.
*
*               If an odd number of output samples is needed or possible, the
*               final store can either be removed or conditionally executed
*               depending on whether m is even or odd.  This code would have to
*               be added to the existing code.
*
*       ASSUMPTIONS
*               h word_aligned
*               n multiple of 2 >= 2
*               m even >= 2
*
*       MEMORY NOTE
*               This code has no memory hits regardless of where x and h are
*               located in memory.
*
*       CYCLES  4 + (3n/2+ 10)m/2 + 2
*
*       SIZE    320 bytes.
*
*       Notes:  The register pushes and pops are not included in the cycle
*               count or the code size.
*
*===============================================================================

        .global _sym_fir
        .text
 
; _sym_fir: C-callable entry point of the symmetric FIR.
; Argument registers as they are used below (per the compiler's calling
; convention): A4 = x, B4 = h, A6 = y, B6 = n, A8 = m, B8 = s.
; B15 is the stack pointer and B3 holds the return address.
;
; Prologue: A10-A15 and B10-B14 are pushed.  A15 is turned into a second
; stack pointer (B15 - 8) so that .D1 and .D2 can each push one register
; per cycle; both pointers step by two words, so their slots interleave.
_sym_fir:
 
        STW     .D2  A15,*B15--      ; push A15 (for c-callable func)
||      SUB     .L1X B15,8,A15       ; copy stack pointer to A reg file
 
        STW     .D2  B14,*B15--[2]   ; push B14 (for c-callable func)
||      STW     .D1  A14,*A15--[2]   ; push A14 (for c-callable func)
 
        STW     .D2  B13,*B15--[2]   ; push B13 (for c-callable func)
||      STW     .D1  A13,*A15--[2]   ; push A13 (for c-callable func)
 
        STW     .D2  B12,*B15--[2]   ; push B12 (for c-callable func)
||      STW     .D1  A12,*A15--[2]   ; push A12 (for c-callable func)
 
 
 
*** BEGIN Benchmark Timing ***
 
; Setup, overlapped with the remaining pushes.  Register roles established
; here and relied on by the loop:
;   A11:A10 = y0 accumulator, B11:B10 = y1 accumulator, both preset to round
;   A5      = copy of round (low word), used to re-arm the accumulators
;   A14     = s (A-side copy of the shift count), B7 = s - 1
;   A2      = inner loop counter, A8 = outer loop counter
;   A1      = outer-loop branch predicate (cleared here)
;   B0      = 1 until the first output pair exists (gates the [!B0] stores)
;   B2      = 3, counted down while the pipeline fills (gates the [!B2] adds)
;   B1      = 1, multiplier used to extract h1 from the packed coefficient word
        MVK     .S2  1,B0               ; set output sync bit
||      SUB     .L2  B8,1,B7            ; s -= 1
||      SUB     .L1X B6,0,A2            ; set inner loop counter (= n)
||      MPY     .M2  B6,2,B9            ; 2 * n
||      STW     .D2  B11,*B15--[2]      ; push B11 (for c-callable func)
||      STW     .D1  A11,*A15--[2]      ; push A11 (for c-callable func)
 
; The branch issued here is the one that re-enters "loop" for the second
; pass of the kernel; the first pass is reached by falling through.
        SHL     .S2  B0,B7,B11:B10      ; y1 = round = (Long40) 1 << (s - 1)
||      B       .S1  loop
||      MV      .L2X A4,B5              ; *xa
||      SUB     .L1  A8,2,A8            ; set outer loop counter (=m-2)
||      MPY     .M1  A1,0,A1            ; clear compare for outer loop branch
||      MPY     .M2  B0,1,B1            ; = 1
||      STW     .D2  B10,*B15--[2]      ; push B10 (for c-callable func)
||      STW     .D1  A10,*A15--[2]      ; push A10 (for c-callable func)
 
; NOTE(review): the ADD below reads A1 in the cycle right after the MPY that
; clears it was issued.  MPY results normally arrive one cycle later than
; that on this core, so A1 may still hold its value from the caller here --
; TODO confirm A1 is guaranteed to be zero at this point.
        ADD     .L1X B10,A1,A11:A10     ; y0 = round;
||      ADD     .S2  B9,1,B9            ; 2 * n + 1
||      LDW     .D2  *B4,A3             ; h1_h0 = *h
 
        LDH     .D1  *A4,A9             ; x0a = *xa
||      LDH     .D2  *++B5[B9],B9       ; x1b = *xb, xb = x + 2n + 1 (pre-incremented)
||      MVK     .S2  3,B2               ; set sync for y's counter (= 3)
||      ADD     .L1X B7,1,A14           ; s
||      ADD     .S1  A10,0,A5           ; round;
 
; Software-pipelined kernel.  The outer loop repeats m/2 times and produces
; two outputs per pass (y0 in A11:A10, y1 in B11:B10); the inner loop
; repeats n/2+2 times.
; Comment tags, as far as can be told from their use here (TODO confirm):
;   @ = instruction belongs to an earlier-started pass of the pipeline,
;   e = epilogue of the current output pair, p = prologue of the next pair,
;   o = outer-loop bookkeeping.

; Cycle 1 of 3: loads of the next x samples and the branch back to "loop"
; are predicated on the inner counter A2.
loop:
   [A2] LDH     .D1  *++A4,B7           ;@@@ x1a = *++xa
|| [A2] LDH     .D2  *--B5,A7           ;@@@ x0b = *--xb
||      ADD     .L1  A9,A7,A12          ;@ x11 = x0a + x0b
||      ADD     .L2  B7,B9,B14          ;@ x01 = x1a + x1b
||      MPY     .M1  A12,A3,A13         ;@ p00 = x00 * h0
||      MPY     .M2X B14,A3,B13         ;@ p10 = x10 * h0
|| [A2] B       .S2  loop
||      MV      .S1  A3,A7              ; = h1_h0 (old)
 
; Cycle 2 of 3: the long adds fold products into y0/y1 only once B2 has
; counted down to zero; [A1] B is the outer-loop branch.
   [A2] LDH     .D1  *++A4,A9           ;@@@ x0a = *++xa
|| [A2] LDH     .D2  *--B5,B9           ;@@@ x1b = *--xb
||      MPY     .M2X A12,B7,B13         ;@ p11 = x11 * h1 (old, held in B7)
||      MPYLH   .M1X B14,A7,A13         ;@ p01 = x01 * h1 (old)
||[!B2] ADD     .L1  A11:A10,A13,A11:A10        ; y0 += p01
||[!B2] ADD     .L2  B11:B10,B13,B11:B10        ; y1 += p11
|| [B2] SUB     .S2  B2,1,B2            ; decrement sync for y's
||[A1]  B       .S1  loop               ; outer loop branch
 
; Cycle 3 of 3: fetch the next packed coefficient pair, form the next sums
; and count the inner loop down by 2.  Once A2 is zero, A1 receives the
; outer counter A8 (MPY by 1), so a non-zero A8 requests another outer pass.
   [A2] LDW     .D2  *++B4,A3           ;@@@@ h1_h0 = *++h
||      ADD     .D1  A9,A7,A12          ;@@ x00 = x0a + x0b
||      ADD     .S2  B7,B9,B14          ;@@ x10 = x1a + x1b
||[!B2] ADD     .L1  A11:A10,A13,A11:A10        ;@ y0 += p00
||[!B2] ADD     .L2  B11:B10,B13,B11:B10        ;@ y1 += p10
|| [A2] SUB     .S1  A2,2,A2            ; inner_loop count -= 2
||[!A2] MPY     .M1  A8,1,A1            ; compare for outer loop branch
||      MPYHL   .M2X A3,B1,B7           ; = h1 (old)
        ; inner_loop_end
 
; Drain of the current output pair, overlapped with setup of the next one:
; center-tap products, final accumulation into A3:A2 / B13:B12, pointer
; rewind and counter reload.  The [!B0] stores write A0/B12 as left by the
; shifts at the end of the previous outer pass; they are suppressed on the
; first pass while B0 is still 1.
        MPY     .M1  A9,A3,A7           ;e p00 = x0a * h0
||      MPY     .M2X B9,A3,B7           ;e p10 = x1b * h0
||      SHR     .S2  B6,1,B14           ;p B14 = n/2
||      ADD     .L1  A11:A10,A13,A3:A2  ;e (2) y0 += p01
||      SUB     .S1x B6,2,A9            ;p A9 = n-2
||      ADD     .L2  B6,2,B9            ;p B9 = n+2
||[!B0] STH     .D1  B12,*+A6[1]        ;e y[j+1]=(int) d_y1
||      ADD     .D2  B2,3,B2            ;p reset y's counter (= 3)
 
  [!B0] STH     .D1  A0,*A6++[2]        ;e y[j] = (int) d_y0
||      ADD     .L2  B11:B10,B13,B13:B12        ;e (2) y1 += p11
||      MV      .L1  A5,A10             ;p y0 = (L)round;
||      SUB     .S1  A8,2,A8            ;o decrement outer loop counter (-=2)
|| [A1] B       .S2  loop
||      SUB     .D2  B0,B0,B0           ;o clear sync for output
||      MPY     .M2  B11,0,B11          ;p y1 = (H)round;
||      MPY     .M1  A11,0,A11          ;p y0 = (H)round;
 
        LDH     .D1  *--A4[A9],A9       ;o (2) x0a = *xa, xa->xa-=n-2
||      LDH     .D2  *++B5[B9],B9       ;o (2) x1b = *xb, xb->xb+=n+2
||      ADD     .L1  A3:A2,A7,A3:A2     ;e (final) y0 += p00
||      ADD     .L2  B13:B12,B7,B13:B12 ;e (final) y1 += p10
||      MV      .S2X A5,B10             ;p y1 = (L)round;
||      MPY     .M1X B6,1,A2            ;p set inner loop counter (=n)
 
; NOTE(review): y0 is shifted with SHRU (logical) while y1 uses SHR
; (arithmetic); only the low halfword of each result is stored.  SHRU also
; writes A1, the predicate of the [A1] branches -- presumably this is what
; returns A1 to zero for the next pass, which would need the top 8 bits of
; the shifted y0 to be zero.  TODO confirm the intended range of s.
        LDW     .D2  *--B4[B14],A3      ;p h1_h0 = *h[1,0]  (h rewound by n/2 words)
||      SHRU    .S1  A3:A2,A14,A1:A0    ;e d_y0 = y0 >> s
||      SHR     .S2  B13:B12,B8,B13:B12 ;e d_y1 = y1 >> s
        ; outer_loop_end
 
; Reached only when no outer-loop branch is pending: store the last pair.
        STH     .D1  A0,*A6++           ;e y[j] = (int) d_y0
 
        STH     .D1  B12,*A6++          ;e y[j+1]=(int) d_y1
; Epilogue: restore the saved registers in the reverse order of the
; prologue pushes, using A15 and B15 as the two stack pointers, and return
; through B3.  The return branch is issued together with the last paired
; pops; the final LDW (which reloads A15 and brings B15 back to its entry
; value) and the NOP 4 fill the cycles that follow the branch.
B_END:
*** END Benchmark Timing ***
        LDW     .D1     *++A15[2],A10   ; pop A10 (for c-callable func)
||      LDW     .D2     *++B15[2],B10   ; pop B10 (for c-callable func)
 
        LDW     .D1     *++A15[2],A11   ; pop A11 (for c-callable func)
||      LDW     .D2     *++B15[2],B11   ; pop B11 (for c-callable func)
 
        LDW     .D1     *++A15[2],A12   ; pop A12 (for c-callable func)
||      LDW     .D2     *++B15[2],B12   ; pop B12 (for c-callable func)
 
        LDW     .D1     *++A15[2],A13   ; pop A13 (for c-callable func)
||      LDW     .D2     *++B15[2],B13   ; pop B13 (for c-callable func)
 
        LDW     .D1     *++A15[2],A14   ; pop A14 (for c-callable func)
||      LDW     .D2     *++B15[2],B14   ; pop B14 (for c-callable func)
||      B       .S2     B3              ; return
 
        LDW     .D2     *++B15,A15      ; pop A15 (for c-callable func)
 
        NOP 4
 