* ========================================================================= *
*   NAME                                                                    *
*       idct_8x8 -- IEEE-1180 Compliant IDCT, Little Endian.                *
*                                                                           *
*   REVISION HISTORY                                                        *
*       30-Apr-1999 Reduce codesize                                         *
*       11-May-1999 New release version                                     *
*                                                                           *
*   USAGE                                                                   *
*       This routine is C callable, and has the following C prototype:      *
*                                                                           *
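*           void idct_8x8_asm(short idct_data[], unsigned num_idcts)        *
*                                                                           *
*       (The exported entry symbol is _idct_8x8_asm; this assumes the C     *
*       compiler prefixes C names with an underscore.)                      *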
*                                                                           *
*       The idct_8x8 routine accepts a list of 8x8 DCT coefficient blocks   *
*       and performs IDCTs on each.  The array should be aligned to a       *
*       32-bit boundary, and be laid out equivalently to the C array        *
*       idct_data[num_idcts+1][8][8].                                       *
*                                                                           *
*       The routine requires one 8x8-block's worth of extra storage at      *
*       the end of the list of DCT blocks.  When processing 'num_idcts'     *
*       blocks, an area of 'num_idcts + 1' blocks must be provided.  The    *
*       contents of the extra block are ignored and overwritten with        *
*       intermediate results by idct_8x8().                                 *
*                                                                           *
*       This code requires '62 + 168 * num_idcts' cycles to process         *
*       'num_idcts' blocks, including 6 cycles of function call overhead.   *
*       When 'num_idcts' is zero, an early exit is taken and the function   *
*       runs for only 35 cycles (again, including overhead).                *
*                                                                           *
*   DESCRIPTION                                                             *
*       The idct_8x8 algorithm performs an IEEE-1180 compliant IDCT,        *
*       complete with rounding and saturation to signed 9-bit quantities.   *
*       The input coefficients are assumed to be signed 12-bit cosine       *
*       terms.                                                              *
*                                                                           *
*       void idct_8x8(short *idct_data, unsigned num_dcts)                  *
*       {                                                                   *
*         const short c1 = 0x0B19, c2 = 0x0A74, c3 = 0x0968;                *
*         const short c5 = 0x0649, c6 = 0x0454, c7 = 0x0235;                *
*         const int   c4_shift = 11;                                        *
*         const int   round1 = 256, round2 = 32768;                         *
*         const int   trunc1 = 9, trunc2 = 16;                              *
*         const short *i_ptr;                                               *
*         short       *o_ptr;                                               *
*         unsigned    i, j;                                                 *
*         short X0, X1, X2, X3, X4, X5, X6, X7;   /* Freq domain terms  */  *
*         int   P0, P1, p0, p1, r0, r1;           /* Even-half temp     */  *
*         int   g0, g1, h1, h0;                   /* Even-half result   */  *
*         int   g2, g3, h3, h2;                   /* Odd-half result    */  *
*         int   x0, x1, x2, x3, x4, x5, x6, x7;   /* Resulting samples  */  *
*         int   x0t,x1t,x2t,x3t,x4t,x5t,x6t,x7t;  /* Truncated result   */  *
*         int   x0s,x1s,x2s,x3s,x4s,x5s,x6s,x7s;  /* Saturated result   */  *
*                                                                           *
*         /* ---------------------------------------------------------- */  *
*         /*  Avoid running the code if we don't have any IDCTs to do.  */  *
*         /* ---------------------------------------------------------- */  *
*         if (!num_dcts) return;                                            *
*                                                                           *
*         /* ---------------------------------------------------------- */  *
*         /*  Set up pointers.                                          */  *
*         /* ---------------------------------------------------------- */  *
*         i_ptr = idct_data + num_dcts * 64 - 8;                            *
*         o_ptr = idct_data + num_dcts * 64 + 7;                            *
*                                                                           *
*         for (j = 0; j < num_dcts; j++)                                    *
*         {                                                                 *
*           /* -------------------------------------------------------- */  *
*           /*  Perform Horizontal 1-D IDCT on each 8x8 block.  Store   */  *
*           /*  out the results transposed.                             */  *
*           /* -------------------------------------------------------- */  *
*           for (i = 0; i < 8; i++)                                         *
*           {                                                               *
*               /* ---------------------------------------------------- */  *
*               /*  Load the freq-domain coefficients.                  */  *
*               /* ---------------------------------------------------- */  *
*               X0 = i_ptr[0];                                              *
*               X1 = i_ptr[1];                                              *
*               X2 = i_ptr[2];                                              *
*               X3 = i_ptr[3];                                              *
*               X4 = i_ptr[4];                                              *
*               X5 = i_ptr[5];                                              *
*               X6 = i_ptr[6];                                              *
*               X7 = i_ptr[7];                                              *
*                                                                           *
*               i_ptr -= 8;             /* decr pointer to next row     */  *
*                                                                           *
*               /* ---------------------------------------------------- */  *
*               /*  Even part of decomp.  Add rounding to DC term.      */  *
*               /* ---------------------------------------------------- */  *
*               P0 = (((int)X0) << c4_shift) + round1;                      *
*               P1 = (((int)X4) << c4_shift);                               *
*                                                                           *
*               p0 = P0 + P1;                                               *
*               p1 = P0 - P1;                                               *
*                                                                           *
*               r1 = X2*c6 - X6*c2;                                         *
*               r0 = X2*c2 + X6*c6;                                         *
*                                                                           *
*               g0 = p0 + r0;                                               *
*               g1 = p1 + r1;                                               *
*               h1 = p1 - r1;                                               *
*               h0 = p0 - r0;                                               *
*                                                                           *
*               /* ---------------------------------------------------- */  *
*               /*  Odd part of decomp.                                 */  *
*               /* ---------------------------------------------------- */  *
*               g2 = (X1*c7 - X3*c5) + (X5*c3 - X7*c1);                     *
*               g3 = (X1*c5 - X3*c1) + (X5*c7 + X7*c3);                     *
*               h3 = (X1*c3 - X3*c7) - (X5*c1 + X7*c5);                     *
*               h2 = (X1*c1 + X3*c3) + (X5*c5 + X7*c7);                     *
*                                                                           *
*               /* ---------------------------------------------------- */  *
*               /*  Final butterfly.                                    */  *
*               /* ---------------------------------------------------- */  *
*               x0 = g0 + h2;                                               *
*               x1 = g1 + h3;                                               *
*               x2 = h1 + g3;                                               *
*               x3 = h0 + g2;                                               *
*               x4 = h0 - g2;                                               *
*               x5 = h1 - g3;                                               *
*               x6 = g1 - h3;                                               *
*               x7 = g0 - h2;                                               *
*                                                                           *
*               /* ---------------------------------------------------- */  *
*               /*  Truncate to fit back into 16 bits.                  */  *
*               /* ---------------------------------------------------- */  *
*               x0t = x0 >> trunc1;                                         *
*               x1t = x1 >> trunc1;                                         *
*               x2t = x2 >> trunc1;                                         *
*               x3t = x3 >> trunc1;                                         *
*               x4t = x4 >> trunc1;                                         *
*               x5t = x5 >> trunc1;                                         *
*               x6t = x6 >> trunc1;                                         *
*               x7t = x7 >> trunc1;                                         *
*                                                                           *
*               /* ---------------------------------------------------- */  *
*               /*  Store the results transposed.                       */  *
*               /* ---------------------------------------------------- */  *
*               o_ptr[ 0] = x0t;                                            *
*               o_ptr[ 8] = x1t;                                            *
*               o_ptr[16] = x2t;                                            *
*               o_ptr[24] = x3t;                                            *
*               o_ptr[32] = x4t;                                            *
*               o_ptr[40] = x5t;                                            *
*               o_ptr[48] = x6t;                                            *
*               o_ptr[56] = x7t;                                            *
*                                                                           *
*               o_ptr--;                /* decrement ptr to next column */  *
*           }                                                               *
*                                                                           *
*           /* -------------------------------------------------------- */  *
*           /*  Update output pointer to point to next block.           */  *
*           /* -------------------------------------------------------- */  *
*                                                                           *
*           o_ptr = o_ptr + 8 - 64;                                         *
*         }                                                                 *
*                                                                           *
*         /* ---------------------------------------------------------- */  *
*         /*  Reset our pointers for the vertical pass.                 */  *
*         /* ---------------------------------------------------------- */  *
*         i_ptr = idct_data + 64;                                           *
*         o_ptr = idct_data;                                                *
*                                                                           *
*         for (j = 0; j < num_dcts; j++)                                    *
*         {                                                                 *
*           /* -------------------------------------------------------- */  *
*           /*  Perform Vertical 1-D IDCT on each 8x8 block.  Store     */  *
*           /*  out the results transposed.                             */  *
*           /* -------------------------------------------------------- */  *
*           for (i = 0; i < 8; i++)                                         *
*           {                                                               *
*               /* ---------------------------------------------------- */  *
*               /*  Load the freq-domain coefficients.                  */  *
*               /* ---------------------------------------------------- */  *
*               X0 = i_ptr[0];                                              *
*               X1 = i_ptr[1];                                              *
*               X2 = i_ptr[2];                                              *
*               X3 = i_ptr[3];                                              *
*               X4 = i_ptr[4];                                              *
*               X5 = i_ptr[5];                                              *
*               X6 = i_ptr[6];                                              *
*               X7 = i_ptr[7];                                              *
*               i_ptr += 8;             /* increment ptr to next row    */  *
*                                                                           *
*               /* ---------------------------------------------------- */  *
*               /*  Even part of decomp.  Add rounding term to DC.      */  *
*               /* ---------------------------------------------------- */  *
*               P0 = (((int)X0) << c4_shift) + round2; /* c4 is a shift */  *
*               P1 = (((int)X4) << c4_shift);          /* c4 is a shift */  *
*                                                                           *
*               p0 = P0 + P1;                                               *
*               p1 = P0 - P1;                                               *
*                                                                           *
*               r1 = X2*c6 - X6*c2;                                         *
*               r0 = X2*c2 + X6*c6;                                         *
*                                                                           *
*               g0 = p0 + r0;                                               *
*               g1 = p1 + r1;                                               *
*               h1 = p1 - r1;                                               *
*               h0 = p0 - r0;                                               *
*                                                                           *
*               /* ---------------------------------------------------- */  *
*               /*  Odd part of decomp.                                 */  *
*               /* ---------------------------------------------------- */  *
*               g2 = (X1*c7 - X3*c5) + (X5*c3 - X7*c1);                     *
*               g3 = (X1*c5 - X3*c1) + (X5*c7 + X7*c3);                     *
*               h3 = (X1*c3 - X3*c7) - (X5*c1 + X7*c5);                     *
*               h2 = (X1*c1 + X3*c3) + (X5*c5 + X7*c7);                     *
*                                                                           *
*               /* ---------------------------------------------------- */  *
*               /*  Final butterfly.                                    */  *
*               /* ---------------------------------------------------- */  *
*               x0 = g0 + h2;                                               *
*               x1 = g1 + h3;                                               *
*               x2 = h1 + g3;                                               *
*               x3 = h0 + g2;                                               *
*               x4 = h0 - g2;                                               *
*               x5 = h1 - g3;                                               *
*               x6 = g1 - h3;                                               *
*               x7 = g0 - h2;                                               *
*                                                                           *
*               /* ---------------------------------------------------- */  *
*               /*  Truncate and saturate final results.                */  *
*               /* ---------------------------------------------------- */  *
*               x0t = x0 >> trunc2;                                         *
*               x1t = x1 >> trunc2;                                         *
*               x2t = x2 >> trunc2;                                         *
*               x3t = x3 >> trunc2;                                         *
*               x4t = x4 >> trunc2;                                         *
*               x5t = x5 >> trunc2;                                         *
*               x6t = x6 >> trunc2;                                         *
*               x7t = x7 >> trunc2;                                         *
*                                                                           *
*               x0s = x0t < -256 ? -256 : x0t > 255 ? 255 : x0t;            *
*               x1s = x1t < -256 ? -256 : x1t > 255 ? 255 : x1t;            *
*               x2s = x2t < -256 ? -256 : x2t > 255 ? 255 : x2t;            *
*               x3s = x3t < -256 ? -256 : x3t > 255 ? 255 : x3t;            *
*               x4s = x4t < -256 ? -256 : x4t > 255 ? 255 : x4t;            *
*               x5s = x5t < -256 ? -256 : x5t > 255 ? 255 : x5t;            *
*               x6s = x6t < -256 ? -256 : x6t > 255 ? 255 : x6t;            *
*               x7s = x7t < -256 ? -256 : x7t > 255 ? 255 : x7t;            *
*                                                                           *
*               /* ---------------------------------------------------- */  *
*               /*  Store the results transposed in the result area.    */  *
*               /* ---------------------------------------------------- */  *
*               o_ptr[ 0] = x0s;                                            *
*               o_ptr[ 8] = x1s;                                            *
*               o_ptr[16] = x2s;                                            *
*               o_ptr[24] = x3s;                                            *
*               o_ptr[32] = x4s;                                            *
*               o_ptr[40] = x5s;                                            *
*               o_ptr[48] = x6s;                                            *
*               o_ptr[56] = x7s;                                            *
*                                                                           *
*               o_ptr++;                /* increment ptr to next column */  *
*           }                                                               *
*           /* -------------------------------------------------------- */  *
*           /*  Update output pointer to point to next block.           */  *
*           /* -------------------------------------------------------- */  *
*           o_ptr = o_ptr - 8 + 64;                                         *
*         }                                                                 *
*       }                                                                   *
*                                                                           *
*                                                                           *
*       Note:  This code guarantees correct operation, even in the case     *
*       that 'num_idcts == 0'.  In that case, the function runs for only    *
*       35 cycles (counting 6 cycles of function-call overhead), due to     *
*       early-exit code.  The early-exit case performs no accesses to the   *
*       idct_data[] array.                                                  *
*                                                                           *
*   TECHNIQUES                                                              *
*       All levels of looping are collapsed into single loops which are     *
*       pipelined.  The outer loop focuses on 8-pt IDCTs, whereas the       *
*       inner loop controls the column-pointer to handle jumps between      *
*       IDCT blocks.                                                        *
*                                                                           *
*       For performance, portions of the code outside the loops have been   *
*       inter-scheduled with the prolog and epilog code of the loops.       *
*       Also, twin stack-pointers are used to accelerate stack accesses.    *
*       Finally, pointer values and cosine term registers are reused        *
*       between the horizontal and vertical loops to save the need for      *
*       messy pointer and constant reinitialization.                        *
*                                                                           *
*       To save codesize, prolog and epilog collapsing have been performed  *
*       to the extent that it does not impact performance.  Also, code      *
*       outside the loops has been scheduled to pack as tightly into        *
*       fetch packets as possible to avoid alignment padding NOPs.          *
*                                                                           *
*       The IDCTs cannot be performed completely in-place due to the        *
*       transpose that each pass performs.  In order to save data memory,   *
*       the horizontal pass works from the end of the array towards the     *
*       beginning, writing its result one IDCT block later in memory,       *
*       thus performing the IDCT nearly-in-place.  The vertical pass        *
*       performs its IDCTs in the opposite direction, working from the      *
*       start of the array towards the end, writing the results in-place.   *
*       A nice side effect of this is that the pointer values at the        *
*       end of the horizontal loop are a fixed offset relative to their     *
*       required values for the vertical loop, regardless of the number     *
*       of IDCTs performed.  This makes the pointer reinitialization        *
*       exceptionally cheap.                                                *
*                                                                           *
*       Additional section-specific optimization notes are provided below.  *
*                                                                           *
*   ASSUMPTIONS                                                             *
*       The input array must be aligned on a word boundary, and one         *
*       extra block's worth of storage must be present after the list       *
*       of IDCT input blocks.                                               *
*                                                                           *
*   MEMORY NOTE                                                             *
*       No bank conflicts occur.  The code requires 16 words of stack       *
*       space to save Save-On-Entry (SOE) registers, CSR, IRP, and a        *
*       spill value.  For correct operation, the input array must be        *
*       aligned to a word boundary.                                         *
*                                                                           *
*       Bank usage on C6201:                                                *
*                                                                           *
*           Horiz loop accesses: 1 of 4 banks for 80% of cycles             *
*                                4 of 4 banks for 20% of cycles             *
*                                                                           *
*           Vert loop accesses:  1 of 4 banks for 73% of cycles             *
*                                4 of 4 banks for 18% of cycles             *
*                                0 of 4 banks for  9% of cycles             *
*                                                                           *
*   NOTES                                                                   *
*       This is a LITTLE ENDIAN implementation.                             *
*                                                                           *
*       This code masks interrupts for nearly its entire duration.          *
*       Interrupts are locked out for '53 + 168 * num_idcts' cycles.  As    *
*       a result, the code is interrupt-tolerant, but not interruptible.    *
*                                                                           *
*       The cosine terms have all been scaled by sqrt(2), so that the       *
*       "c4" term is an exact power of 2 (cst_c4 = 0x0800 = 1 << 11).       *
*                                                                           *
*       The precision of the final results can be changed by modifying      *
*       the constants at the top of the code and reassembling.  Usually,    *
*       modifying the final-shift constants in the "Symbolic Constants"     *
*       section is sufficient.                                              *
*                                                                           *
*   SOURCE                                                                  *
*       The IDCT form used is the Even-Odd Decomposition IDCT.              *
*                                                                           *
* ------------------------------------------------------------------------- *
*             Copyright (c) 1999 Texas Instruments, Incorporated.           *
*                            All Rights Reserved.                           *
* ========================================================================= *
            ; Copyright notice: placed in its own ".data:copyright_h" section.
            ; The text is split across two consecutive .string directives; the
            ; first part ends with a space so the two halves read as one sentence.
            .sect       ".data:copyright_h"
_Copyright: .string     "Copyright (C) 1999 Texas Instruments Incorporated. "
            .string     "All Rights Reserved."

            ; Switch to the ".text:hand" section for the code that follows and
            ; export the routine's entry label so other modules can reference it.
            .sect       ".text:hand"
            .global     _idct_8x8_asm

_idct_8x8_asm:
; ============================ SYMBOLIC CONSTANTS ============================
;   Substitution symbols for the numeric constants used by the routine.
;   cst_c1..cst_c7 carry the same values as c1..c7 in the C model in the file
;   header.  cst_c4 (0x0800) equals 1 << q_pt, which is why the C model
;   applies c4 as a left shift by 11 instead of a multiply.
;   trunc2 and satl are both derived from 'results', so editing 'results'
;   changes the final right-shift and the saturation left-shift together.
;   NOTE(review): trunc2 evaluates to 32-9 = 23 here, not the 16 used by the
;   C model; the difference equals satl (7), so presumably the vertical loop
;   shifts left by satl before the final right shift -- confirm in that loop.
        .asg            0x0B19,     cst_c1  ; Cosine term c1
        .asg            0x0A74,     cst_c2  ; Cosine term c2
        .asg            0x0968,     cst_c3  ; Cosine term c3
        .asg            0x0800,     cst_c4  ; Cosine term c4
        .asg            0x0649,     cst_c5  ; Cosine term c5
        .asg            0x0454,     cst_c6  ; Cosine term c6
        .asg            0x0235,     cst_c7  ; Cosine term c7
        .asg            11,         q_pt    ; Q-point for calculations
        .asg            16,         kq_a    ; Extract const for c4 "mpy"
        .asg            16-q_pt,    kq_b    ; Extract const for c4 "mpy"
        .asg            9,          trunc1  ; Truncation after horizontal pass
        .asg            9,          results ; Final precision of results
        .asg            32-results, trunc2  ; Final truncation right-shift
        .asg            16-results, satl    ; Final saturation left-shift
; =============== SYMBOLIC REGISTER ASSIGNMENTS FOR HORIZ LOOP ===============
;   Each .asg below gives a descriptive name to a physical register for the
;   horizontal loop.  The A_/B_ prefix of every name matches the register
;   file (A or B) of the register it is bound to.  The value names (X*, P*,
;   p*, r*, g*, h*, x*, x*t) follow the variables of the C model in the file
;   header.
;
;   Many names deliberately share one physical register (for example, A0
;   backs A_X2c2, A_p1, A_X1c3, A_X3c3 and A_h2a), so two names bound to the
;   same register cannot hold distinct live values at the same time.  The
;   instruction schedule that keeps those lifetimes apart is not part of
;   this table.
;
;   NOTE(review): B14 and B15 are bound to working values here (B_c6c2 and
;   B_i_ptr).  Presumably these are the registers the C runtime reserves as
;   data-page and stack pointers, which would explain the interrupt masking
;   described in the file header -- confirm against the save/restore code.
        .asg            B13,        B_c7c5  ; Cosine terms c7, c5   (packed)
        .asg            A13,        A_c7c5  ; Cosine terms c7, c5   (packed)
        .asg            B12,        B_c3c1  ; Cosine terms c3, c1   (packed)
        .asg            A12,        A_c3c1  ; Cosine terms c3, c1   (packed)
        .asg            B14,        B_c6c2  ; Cosine terms c6, c2   (packed)
        .asg            A14,        A_i_ptr ; Input pointer #1
        .asg            B15,        B_i_ptr ; Input pointer #2
        .asg            A11,        A_o_ptr ; Output pointer #1
        .asg            B11,        B_o_ptr ; Output pointer #2
        .asg            B2,         B_o     ; Outer loop counter
        .asg            A5,         A_X1X0  ; Incoming coefs X1, X0 (packed)
        .asg            A10,        A_X3X2  ; Incoming coefs X3, X2 (packed)
        .asg            B7,         B_X5X4  ; Incoming coefs X5, X4 (packed)
        .asg            B10,        B_X7X6  ; Incoming coefs X7, X6 (packed)
        .asg            A7,         A_X2c6  ; X2 * c6
        .asg            B0,         B_X6c2  ; X6 * c2
        .asg            A0,         A_X2c2  ; X2 * c2
        .asg            B1,         B_X6c6  ; X6 * c6
        .asg            A6,         A_P0    ; Node P0 in signal flow graph
        .asg            B8,         B_P1    ; Node P1 in signal flow graph
        .asg            A8,         A_p0    ; Node p0 in signal flow graph
        .asg            A0,         A_p1    ; Node p1 in signal flow graph
        .asg            B0,         B_r1    ; Node r1 in signal flow graph
        .asg            B4,         B_r0    ; Node r0 in signal flow graph
        .asg            B7,         B_g0    ; Node g0 in signal flow graph
        .asg            B3,         B_g1    ; Node g1 in signal flow graph
        .asg            A15,        A_h1    ; Node h1 in signal flow graph
        .asg            A15,        A_h0    ; Node h0 in signal flow graph
        .asg            A3,         A_X1c1  ; X1 * c1
        .asg            A0,         A_X1c3  ; X1 * c3
        .asg            A3,         A_X1c5  ; X1 * c5
        .asg            A9,         A_X1c7  ; X1 * c7
        .asg            A9,         A_X3c1  ; X3 * c1
        .asg            A0,         A_X3c3  ; X3 * c3
        .asg            A5,         A_X3c5  ; X3 * c5
        .asg            A5,         A_X3c7  ; X3 * c7
        .asg            B0,         B_X5c1  ; X5 * c1
        .asg            B4,         B_X5c3  ; X5 * c3
        .asg            B3,         B_X5c5  ; X5 * c5
        .asg            B6,         B_X5c7  ; X5 * c7
        .asg            B0,         B_X7c1  ; X7 * c1
        .asg            B3,         B_X7c3  ; X7 * c3
        .asg            B9,         B_X7c5  ; X7 * c5
        .asg            B1,         B_X7c7  ; X7 * c7
        .asg            A7,         A_g2a   ; X1 * c7 - X3 * c5
        .asg            B8,         B_g2b   ; X5 * c3 - X7 * c1
        .asg            A6,         A_g2    ; Node g2 in signal flow graph
        .asg            A3,         A_g3a   ; X1 * c5 - X3 * c1
        .asg            B6,         B_g3b   ; X5 * c7 + X7 * c3
        .asg            A4,         A_g3    ; Node g3 in signal flow graph
        .asg            A6,         A_h3a   ; X1 * c3 - X3 * c7
        .asg            B7,         B_h3b   ; X5 * c1 + X7 * c5
        .asg            B5,         B_h3n   ; Node h3, negated.
        .asg            A0,         A_h2a   ; X1 * c1 + X3 * c3
        .asg            B3,         B_h2b   ; X5 * c5 + X7 * c7
        .asg            B1,         B_h2    ; Node h2 in signal flow graph
        .asg            B4,         B_x0    ; Output x0, pre-truncation
        .asg            B0,         B_x1    ; Output x1, pre-truncation
        .asg            A4,         A_x2    ; Output x2, pre-truncation
        .asg            A4,         A_x3    ; Output x3, pre-truncation
        .asg            A7,         A_x4    ; Output x4, pre-truncation
        .asg            A15,        A_x5    ; Output x5, pre-truncation
        .asg            B6,         B_x6    ; Output x6, pre-truncation
        .asg            B3,         B_x7    ; Output x7, pre-truncation
        .asg            B4,         B_x0t   ; Output x0, truncated to 16 bits
        .asg            B5,         B_x1t   ; Output x1, truncated to 16 bits
        .asg            A4,         A_x2t   ; Output x2, truncated to 16 bits
        .asg            A8,         A_x3t   ; Output x3, truncated to 16 bits
        .asg            A7,         A_x4t   ; Output x4, truncated to 16 bits
        .asg            A5,         A_x5t   ; Output x5, truncated to 16 bits
        .asg            B3,         B_x6t   ; Output x6, truncated to 16 bits
        .asg            B9,         B_x7t   ; Output x7, truncated to 16 bits
        .asg            A2,         A_i     ; Inner-loop counter.
; ============================================================================

* ========================================================================= *
*   Initialization code for horizontal loop:  Saves registers to            *
*   the stack, sets up cosine terms, pointers and loop control.             *
*                                                                           *
*   The stack frame for this code is 16 words large.  It holds the Save     *
*   on Entry (SOE) registers A10..A15, B10..B14, as well as the return      *
*   address (B3), CSR, IRP, and a single spill value.  (The loop counter    *
*   initializer is shared between both loops and so I spill it to the       *
*   stack.)  I twin the stack pointer to speed up stack accesses.  The      *
*   stack frame layout is slightly funky to avoid bank conflicts while      *
*   allowing me to get to everything when I need it most.                   *
*                                                                           *
*   The horizontal loop starts at the end of the IDCT array and works back  *
*   towards the beginning.  As a result, the input and output pointers are  *
*   initialized like so:                                                    *
*                                                                           *
*    -- A_i_ptr is set to point to the coefficients "X0" and "X1" in the    *
*       last row of the last valid IDCT block in the input.  B_i_ptr is     *
*       set to point to the coefficients "X4" and "X5" in that same row.    *
*                                                                           *
*    -- A_o_ptr is set to point to the coefficient "x4" in the rightmost    *
*       column of the scratch block I require at the end of the array.      *
*       B_o_ptr is set to point to "x3" in that same column.                *
*                                                                           *
*   The loop count is simply the number of IDCTs times 8, minus 1 to        *
*   handle the parallel iterations in the kernel.  (It would've been more,  *
*   except that I've performed some limited prolog and epilog collapsing,   *
*   so I need to iterate the kernel more times.)  A happy coincidence       *
*   gives both horizontal and vertical loops the exact same trip count,     *
*   so I spill this value to the stack and simply restore it unchanged      *
*   for the second loop, rather than recalculating it.                      *
*                                                                           *
*   Since I was able to free up a single predication register in the first  *
*   loop, I prolog-collapsed one stage of the prolog.  I use A1 as my       *
*   prolog-collapsation fuse.  To save a MVK (since this code bottlenecks   *
*   heavily on S units), I initialize it to -1 with an OR, rather than a    *
*   more traditional 1.                                                     *
*                                                                           *
*   Both loops use all 32 registers, so I have saved the stack pointer in   *
*   IRP.  This is safe since interrupts are explicitly disabled for the     *
*   entire function.                                                        *
*                                                                           *
*   Note:  This setup code could possibly be a cycle or two faster.  For    *
*   instance, I could copy B15 to A15 before the decrement and use          *
*   negative indexes for the STWs through A15, saving a whole cycle on      *
*   the stack saves.  The resulting code doesn't pack as nicely, though.    *
* ========================================================================= *

;-
; Entry/setup for the horizontal pass.
;
; Stack frame, as word offsets from B15 after the 16-word post-decrement
; performed by the first STW (A15 mirrors B15 for the A-side stores):
;    [ 1] loop-count initializer (B_o)     [ 2] return address (B3)
;    [ 3] CSR as read on entry (via A0)    [ 4] B10    [ 5] A10
;    [ 6] B11    [ 7] A11                  [ 8] B12    [ 9] A12
;    [10] A13    [11] B13                  [12] B14    [13] A14
;    [14] IRP (stored from B0 by the STW in h_prolog)
;    [16] A15 (written at the incoming B15, before the decrement)
;
; On entry A4 and B4 are used as the data pointer and the IDCT count
; (idct_data / num_idcts in the C prototype).
        STW     .D2T1   A15,        *B15--[16]      ; Save A15, get stack frame
||      MVC     .S2     CSR,        B0              ; Grab the current CSR

        AND     .L2     B0,         -2,         B1  ; Clear GIE bit in CSR
||      MV      .L1X    B15,        A15             ; Twin the stack pointer

        STW     .D1T1   A14,        *+A15 [13]      ; Save SOE reg A14
||      STW     .D2T2   B14,        *+B15 [12]      ; Save SOE reg B14
||      MV      .L1X    B0,         A0              ; Partitioning MV.
||      MVC     .S2     B1,         CSR             ; Interrupts disabled here

;-
        STW     .D1T1   A13,        *+A15 [10]      ; Save SOE reg A13
||      STW     .D2T2   B13,        *+B15 [11]      ; Save SOE reg B13

        STW     .D1T1   A12,        *+A15 [ 9]      ; Save SOE reg A12
||      STW     .D2T2   B12,        *+B15 [ 8]      ; Save SOE reg B12

        STW     .D1T1   A11,        *+A15 [ 7]      ; Save SOE reg A11
||      STW     .D2T2   B11,        *+B15 [ 6]      ; Save SOE reg B11
||      SHL     .S2     B4,         3,      B_o     ; Set up outer loop counter
||      OR      .L1     A1,         -1,     A1      ; Prolog collapse counter

;-
        STW     .D1T1   A10,        *+A15 [ 5]      ; Save SOE reg A10
||      STW     .D2T2   B10,        *+B15 [ 4]      ; Save SOE reg B10
||      SHL     .S2     B4,         7,      B4      ; Set up end-of-array ptr
||[B_o] SUB     .L2     B_o,        1,      B_o     ; Loop count = IDCTs*8 - 1

        STW     .D2T2   B3,         *+B15 [ 2]      ; Remember the return addr
||      STW     .D1T1   A0,         *+A15 [ 3]      ; Remember the CSR state
||      ADD     .L2X    A4,         B4,     B4      ; Point to scratch area
||      MVC     .S2     IRP,        B0              ; Grab IRP (saved to frame word 14 in h_prolog)

;-
        STW     .D2T2   B_o,        *+B15 [ 1]      ; Spill our loop count init
||      MVC     .S2     B15,        IRP             ; Save stack ptr in IRP
||      SUB     .L2     B4,         8,      B_i_ptr ; Point to X5X4, row 7
||      MV      .L1X    B4,         A_o_ptr         ; Scratch block base (ADDK below adds 78)
||      MVK     .S1     7,          A_i             ; Set up inner loop counter

        SUB     .L1X    B_i_ptr,    8,      A_i_ptr ; Point to X1X0, row 7
||      ADDAH   .D2     B4,         31,     B_o_ptr ; Point to x3, col 7
||      ADDK    .S1     78,         A_o_ptr         ; Point to x4, col 7
;-
; ============================ PIPE LOOP PROLOG ==============================
; Horizontal-pass prolog.
;
;  -- The four LDWs for the first row are predicated on B_o, so no input is
;     read when the loop count is zero (num_idcts == 0).  In that case the
;     [!B_o] branch to idct_8x8_abort is taken; it is issued in the first
;     packet and takes effect at the "Branch Occurs" marker below, so the
;     IRP save and the constant setup in between still execute.
;  -- The packed cosine registers are built with MVK (low half) followed by
;     MVKLH (high half); B_c3c1 is a copy of A_c3c1.
;  -- The last three packets are stages [6]..[8] of the first row, the same
;     instructions as the [6,3]..[8,3] slots of the kernel.
h_prolog:
  [ B_o]LDW     .D1T1   * A_i_ptr--[4],         A_X1X0          ;[ 1,1] 
||[ B_o]LDW     .D2T2   *+B_i_ptr[1],           B_X7X6          ;[ 1,1] 
||      MVK     .S1     cst_c1,     A_c3c1                      ; c1
||[!B_o]B       .S2     idct_8x8_abort          ; Abort if num_idcts == 0

  [ B_o]LDW     .D1T1   *+A_i_ptr[5],           A_X3X2          ;[ 2,1] 
||[ B_o]LDW     .D2T2   * B_i_ptr--[4],         B_X5X4          ;[ 2,1] 
||      MVK     .S1     cst_c5,     A_c7c5                      ; c5
||      MVK     .S2     cst_c2,     B_c6c2                      ; c2
;-
        STW     .D1T2   B0,         *A15[14]                    ; save IRP

        MVKLH   .S1     cst_c7,     A_c7c5                      ; c7
||      MVKLH   .S2     cst_c6,     B_c6c2                      ; c6

        MVKLH   .S1     cst_c3,     A_c3c1                      ; c3
||      MVK     .S2     cst_c5,     B_c7c5                      ; c5

        MPYH    .M1     A_X1X0,     A_c7c5,     A_X1c7          ;[ 6,1] 
||      MPYLH   .M2     B_X7X6,     B_c6c2,     B_X6c6          ;[ 6,1] 
||      MVKLH   .S2     cst_c7,     B_c7c5                      ; c7

; ===== Branch Occurs =====
;-
        EXT     .S1     A_X1X0,     kq_a, kq_b, A_P0            ;[ 7,1] 
||      MPY     .M1X    A_X3X2,     B_c6c2,     A_X2c2          ;[ 7,1] 
||      MPYHL   .M2     B_X7X6,     B_c7c5,     B_X7c5          ;[ 7,1] 
||      MV      .L2X    A_c3c1,     B_c3c1                      ; B-side copy of packed c3c1

        ADDK    .S1     256,        A_P0                        ;[ 8,1] 
||      EXT     .S2     B_X5X4,     kq_a, kq_b, B_P1            ;[ 8,1] 
||      MPYHL   .M1     A_X1X0,     A_c3c1,     A_X1c1          ;[ 8,1] 
||      MPYH    .M2     B_X7X6,     B_c7c5,     B_X7c7          ;[ 8,1] 
;-
; ============================ PIPE LOOP KERNEL ==============================
; Horizontal-pass kernel: ten execute packets, h_loop_0 .. h_loop_9.
;
;  -- The ;[c,i] tag on each instruction is the scheduling annotation
;     (presumably cycle-within-iteration, iteration-in-flight).  Tags with
;     i = 1, 2 and 3 appear in the kernel, i.e. three rows are overlapped.
;  -- B_o is the trip counter: it is decremented and tested in h_loop_4.
;     The branch issued there takes effect after h_loop_9 (the five packets
;     h_loop_5 .. h_loop_9 are its delay slots).
;  -- A1 is the prolog-collapse counter.  Instructions predicated [!A1]
;     (the two post-decrementing STHs and the AND on A_i) are suppressed
;     while A1 is nonzero; the [A1] ADD in h_loop_6 steps it towards zero
;     and then stops.  The setup code initializes A1 to -1.
;  -- A_i is incremented in h_loop_0 and masked with 7 in h_loop_4; when it
;     is zero the [!A_i] SUBAWs move A_o_ptr / B_o_ptr back by 28 words.
;  -- Register names are .asg aliases and several share a physical
;     register, so instruction order within and across packets matters.
h_loop:
h_loop_0:
        SUB     .L2     B_g1,       B_h3n,      B_x1            ;[19,1] 
||      STH     .D2T2   B_x0t,      *-B_o_ptr[24]               ;[19,1] 
||      ADD     .D1     A_i,        1,          A_i             ;[19,1] 
||      SHR     .S1     A_x3,       trunc1,     A_x3t           ;[19,1] 
||      ADD     .L1X    A_g3a,      B_g3b,      A_g3            ;[19,1] 
||      ADD     .S2X    A_X2c2,     B_X6c6,     B_r0            ;[ 9,2] 
||      MPYH    .M1     A_X3X2,     A_c3c1,     A_X3c3          ;[ 9,2] 
||      MPYHL   .M2     B_X5X4,     B_c7c5,     B_X5c5          ;[ 9,2] 

h_loop_1:
        ADD     .L2     B_g1,       B_h3n,      B_x6            ;[20,1] 
||[!A1] STH     .D2T1   A_x3t,      * B_o_ptr--[1]              ;[20,1] 
||      ADD     .S1     A_h1,       A_g3,       A_x2            ;[20,1] 
||      SUB     .D1     A_h1,       A_g3,       A_x5            ;[20,1] 
||      ADD     .L1X    A_P0,       B_P1,       A_p0            ;[10,2] 
||      MPYHL   .M1     A_X1X0,     A_c7c5,     A_X1c5          ;[10,2] 
||      MPYHL   .M2     B_X7X6,     B_c3c1,     B_X7c1          ;[10,2] 

h_loop_2:
        SHR     .S1     A_x5,       trunc1,     A_x5t           ;[21,1] 
||      SHR     .S2     B_x1,       trunc1,     B_x1t           ;[21,1] 
||      ADD     .L1     A_X1c1,     A_X3c3,     A_h2a           ;[11,2] 
||      ADD     .L2     B_X5c5,     B_X7c7,     B_h2b           ;[11,2] 
||      MPYH    .M1     A_X1X0,     A_c3c1,     A_X1c3          ;[11,2] 
||      MPYH    .M2     B_X5X4,     B_c7c5,     B_X5c7          ;[11,2] 
||      LDW     .D1T1   * A_i_ptr--[4],         A_X1X0          ;[ 1,3] 
||      LDW     .D2T2   *+B_i_ptr[1],           B_X7X6          ;[ 1,3] 

h_loop_3:
        SHR     .S2     B_x6,       trunc1,     B_x6t           ;[22,1] 
||      SHR     .S1     A_x2,       trunc1,     A_x2t           ;[22,1] 
||      SUB     .L1X    A_p0,       B_r0,       A_h0            ;[12,2] 
||      ADD     .L2X    A_h2a,      B_h2b,      B_h2            ;[12,2] 
||      MPYH    .M1     A_X3X2,     A_c7c5,     A_X3c7          ;[12,2] 
||      MPYH    .M2     B_X5X4,     B_c3c1,     B_X5c3          ;[12,2] 
||      LDW     .D1T1   *+A_i_ptr[5],           A_X3X2          ;[ 2,3] 
||      LDW     .D2T2   * B_i_ptr--[4],         B_X5X4          ;[ 2,3] 

h_loop_4:
  [ B_o]B       .S2     h_loop                                  ;[23,1] 
||      STH     .D1T1   A_x5t,      *+A_o_ptr[8]                ;[23,1] 
||      SHR     .S1     A_x4,       trunc1,     A_x4t           ;[23,1] 
||      ADD     .L2X    A_p0,       B_r0,       B_g0            ;[13,2] 
||[ B_o]SUB     .D2     B_o,        1,          B_o             ;[13,2] 
||[!A1] AND     .L1     A_i,        7,          A_i             ;[13,2] 
||      MPYHL   .M1     A_X3X2,     A_c7c5,     A_X3c5          ;[13,2] 
||      MPYHL   .M2     B_X5X4,     B_c3c1,     B_X5c1          ;[13,2] 

h_loop_5:
  [!A1] STH     .D1T1   A_x4t,      * A_o_ptr--[1]              ;[24,1] 
||      SUB     .S1     A_X1c3,     A_X3c7,     A_h3a           ;[14,2] 
||      SUB     .L1X    A_P0,       B_P1,       A_p1            ;[14,2] 
||      ADD     .S2     B_g0,       B_h2,       B_x0            ;[14,2] 
||      SUB     .L2     B_X5c3,     B_X7c1,     B_g2b           ;[14,2] 
||      MPYHL   .M1     A_X3X2,     A_c3c1,     A_X3c1          ;[14,2] 
||      MPY     .M2     B_X7X6,     B_c6c2,     B_X6c2          ;[14,2] 

h_loop_6:
        STH     .D1T2   B_x6t,      *+A_o_ptr[17]               ;[25,1] 
||      SUB     .D2     B_g0,       B_h2,       B_x7            ;[15,2] 
||      SHR     .S2     B_x0,       trunc1,     B_x0t           ;[15,2] 
||      SUB     .S1     A_X1c7,     A_X3c5,     A_g2a           ;[15,2] 
||      ADD     .L2     B_X5c1,     B_X7c5,     B_h3b           ;[15,2] 
||      MPYLH   .M1X    A_X3X2,     B_c6c2,     A_X2c6          ;[15,2] 
||      MPYH    .M2     B_X7X6,     B_c3c1,     B_X7c3          ;[15,2] 
||[ A1] ADD     .L1     A1,         1,          A1              ; Step collapse counter toward 0

h_loop_7:
  [!A_i]SUBAW   .D1     A_o_ptr,    28,         A_o_ptr         ;[26,1] 
||      STH     .D2T2   B_x1t,      *-B_o_ptr[15]               ;[26,1] 
||      SHR     .S2     B_x7,       trunc1,     B_x7t           ;[16,2] 
||      SUB     .L1     A_X1c5,     A_X3c1,     A_g3a           ;[16,2] 
||      SUB     .L2X    B_h3b,      A_h3a,      B_h3n           ;[16,2] 
||      ADD     .S1X    A_g2a,      B_g2b,      A_g2            ;[16,2] 
||      MPYH    .M1     A_X1X0,     A_c7c5,     A_X1c7          ;[ 6,3] 
||      MPYLH   .M2     B_X7X6,     B_c6c2,     B_X6c6          ;[ 6,3] 

h_loop_8:
        STH     .D2T1   A_x2t,      *-B_o_ptr[7]                ;[27,1] 
||      ADD     .L1     A_h0,       A_g2,       A_x3            ;[17,2] 
||      SUB     .D1     A_h0,       A_g2,       A_x4            ;[17,2] 
||      SUB     .L2X    A_X2c6,     B_X6c2,     B_r1            ;[17,2] 
||      EXT     .S1     A_X1X0,     kq_a, kq_b, A_P0            ;[ 7,3] 
||      EXT     .S2     B_X5X4,     kq_a, kq_b, B_P1            ;[ 7,3] 
||      MPY     .M1X    A_X3X2,     B_c6c2,     A_X2c2          ;[ 7,3] 
||      MPYHL   .M2     B_X7X6,     B_c7c5,     B_X7c5          ;[ 7,3] 

h_loop_9:
  [!A_i]SUBAW   .D2     B_o_ptr,    28,         B_o_ptr         ;[28,1] 
||      STH     .D1T2   B_x7t,      *+A_o_ptr[24]               ;[18,2] 
||      ADD     .S2X    A_p1,       B_r1,       B_g1            ;[18,2] 
||      SUB     .L1X    A_p1,       B_r1,       A_h1            ;[18,2] 
||      ADD     .L2     B_X5c7,     B_X7c3,     B_g3b           ;[18,2] 
||      ADDK    .S1     256,        A_P0                        ;[ 8,3] 
||      MPYHL   .M1     A_X1X0,     A_c3c1,     A_X1c1          ;[ 8,3] 
||      MPYH    .M2     B_X7X6,     B_c7c5,     B_X7c7          ;[ 8,3] 

; ============================ PIPE LOOP EPILOG ==============================
; Horizontal-pass epilog: drains stages [19]..[23] (plus the x1 store,
; tagged [26,3]) of the last row in flight.  The remaining stores of that
; row ([24], [25], [27]) are issued by the interloop code that follows.
;
; The output pointers are not post-decremented here (the kernel does that
; in its [20] and [24] stores), so the x3 store uses offset [0] and the x1
; store uses [16] where the kernel uses [15] after its decrement.  No A1 or
; B_o predication is used in the epilog.
h_epilog:
        SUB     .L2     B_g1,       B_h3n,      B_x1            ;[19,3] 
||      STH     .D2T2   B_x0t,      *-B_o_ptr[24]               ;[19,3] 
||      SHR     .S1     A_x3,       trunc1,     A_x3t           ;[19,3] 
||      ADD     .L1X    A_g3a,      B_g3b,      A_g3            ;[19,3] 

        ADD     .L2     B_g1,       B_h3n,      B_x6            ;[20,3] 
||      STH     .D2T1   A_x3t,      *+B_o_ptr[0]                ;[20,3] 
||      ADD     .S1     A_h1,       A_g3,       A_x2            ;[20,3] 
||      SUB     .D1     A_h1,       A_g3,       A_x5            ;[20,3] 
;-
        SHR     .S1     A_x5,       trunc1,     A_x5t           ;[21,3] 
||      SHR     .S2     B_x1,       trunc1,     B_x1t           ;[21,3] 

        SHR     .S2     B_x6,       trunc1,     B_x6t           ;[22,3] 
||      SHR     .S1     A_x2,       trunc1,     A_x2t           ;[22,3] 
||      STH     .D2T2   B_x1t,      *-B_o_ptr[16]               ;[26,3] 

        STH     .D1T1   A_x5t,      *+A_o_ptr[8]                ;[23,3] 
||      SHR     .S1     A_x4,       trunc1,     A_x4t           ;[23,3] 

* ========================================================================= *
*   Interloop code:  Performs remaining epilog from horizontal pass, and    *
*   begins setup of the vertical pass.                                      *
*                                                                           *
*   In order to save some time between loops, I start performing pointer    *
*   fixups and constant initializations in the epilog of the horizontal     *
*   pass loop.  The horizontal pass works from the bottom of the            *
*   IDCT list and ends at the top, whereas the vertical pass works from     *
*   the top of the list and ends up at the bottom.  As a result, the        *
*   displacement between the pointer settings required by the two loops     *
*   is fixed, regardless of the number of IDCTs processed, since the two    *
*   loops' pointers always meet at the top of the list.                     *
*                                                                           *
*   The vertical loop needs a new repacking of the cosine terms: c6c3 and   *
*   c2c1.  By playing around w/ how the cosine terms are packed,            *
*   I was able to save two whole registers in the vertical loop and thus    *
*   fit into the register file.  I do this repacking partly here, and       *
*   partly in the vertical loop's prolog.                                   *
* ========================================================================= *

        ; Last three stores of the horizontal pass ([24], [25], [27]),
        ; overlapped with the first pieces of vertical-pass setup: the
        ; input-pointer fixups, the start of the c6c3 repack, and the
        ; recovery of the stack pointer from IRP into B0.
        ;
        ; NOTE(review): the ADDKs below name A_i_ptr / B_i_ptr under the
        ; horizontal-loop .asg assignments, while the vertical-loop .asg
        ; table re-assigns those names (A14 / B15).  The fixups only carry
        ; over if both tables map them to the same registers -- confirm
        ; against the horizontal-loop assignments.
        STH     .D1T1   A_x4t,      *+A_o_ptr[0]                ;[24,3] 
;-
        STH     .D1T2   B_x6t,      *+A_o_ptr[16]               ;[25,3] 
||      ADDK    .S1     168,        A_i_ptr     ; Fixup for vert loop
||      ADDK    .S2     156,        B_i_ptr     ; Fixup for vert loop

        .asg            A15,        A_c6c3      ; Symbolic name from vert loop

        STH     .D2T1   A_x2t,      *-B_o_ptr[8]                ;[27,3] 
||      SHR     .S1     A_c3c1,     16, A_c6c3  ; Set up new cosine constant
||      MVC     .S2     IRP,        B0          ; Get SP so we can unspill A_o

; ============================================================================

; =============== SYMBOLIC REGISTER ASSIGNMENTS FOR VERT LOOP ================
; These .asg lines re-bind the symbolic names for everything that follows.
; Several names share one physical register (for example B0 is B_X2c2,
; B_g0, B_X3c1, B_X3c3, B_X3c5 and B_x2t), so the vertical-loop code depends
; on its exact instruction order.  The formulas in the comments follow the
; ADD/SUB instructions in v_prolog / v_loop.
        .asg            A14,        A_i_ptr ; Input pointer #1
        .asg            B15,        B_i_ptr ; Input pointer #2
        .asg            A11,        A_o_ptr ; Output pointer #1
        .asg            B11,        B_o_ptr ; Output pointer #2
        .asg            B13,        B_c7c5  ; Cosine terms c7, c5   (packed)
        .asg            A13,        A_c7c5  ; Cosine terms c7, c5   (packed)
        .asg            A15,        A_c6c3  ; Cosine terms c6, c3   (packed)
        .asg            B12,        B_c2c1  ; Cosine terms c2, c1   (packed)
        .asg            A4,         A_c1c4  ; Cosine term  c1, c4 (alternates)
        .asg            A2,         A_o     ; Outer loop counter
        .asg            B2,         B_i     ; Inner loop counter
        .asg            A12,        A_X7X6  ; Incoming coefs X7, X6 (packed)
        .asg            A8,         A_X5X4  ; Incoming coefs X5, X4 (packed)
        .asg            B10,        B_X3X2  ; Incoming coefs X3, X2 (packed)
        .asg            B14,        B_X1X0  ; Incoming coefs X1, X0 (packed)
        .asg            B9,         B_rnd   ; Rounding value applied to P0
        .asg            B1,         B_P0_t  ; Node P0, temporary pre-rounding
        .asg            B5,         B_P0    ; Rounded value of Node P0
        .asg            A7,         A_P1    ; Node P1 in signal flow graph
        .asg            B0,         B_X2c2  ; X2 * c2
        .asg            B4,         B_X2c6  ; X2 * c6
        .asg            A4,         A_X6c2  ; X6 * c2
        .asg            A3,         A_X6c6  ; X6 * c6
        .asg            A5,         A_p0    ; Node p0 in signal flow graph
        .asg            A8,         A_p1    ; Node p1 in signal flow graph
        .asg            B4,         B_r1    ; Node r1 in signal flow graph
        .asg            B3,         B_r0    ; Node r0 in signal flow graph
        .asg            B0,         B_g0    ; Node g0 in signal flow graph
        .asg            A1,         A_g1    ; Node g1 in signal flow graph
        .asg            B3,         B_h1    ; Node h1 in signal flow graph
        .asg            A3,         A_h0    ; Node h0 in signal flow graph
        .asg            B5,         B_X1c1  ; X1 * c1
        .asg            B1,         B_X1c3  ; X1 * c3
        .asg            B3,         B_X1c5  ; X1 * c5
        .asg            B8,         B_X1c7  ; X1 * c7
        .asg            B0,         B_X3c1  ; X3 * c1
        .asg            B0,         B_X3c3  ; X3 * c3
        .asg            B0,         B_X3c5  ; X3 * c5
        .asg            B9,         B_X3c7  ; X3 * c7
        .asg            A3,         A_X5c1  ; X5 * c1
        .asg            A1,         A_X5c3  ; X5 * c3
        .asg            A5,         A_X5c5  ; X5 * c5
        .asg            A0,         A_X5c7  ; X5 * c7
        .asg            A6,         A_X7c1  ; X7 * c1
        .asg            A7,         A_X7c3  ; X7 * c3
        .asg            A4,         A_X7c5  ; X7 * c5
        .asg            A6,         A_X7c7  ; X7 * c7
        .asg            A3,         A_h2a   ; X5 * c5 + X7 * c7
        .asg            B3,         B_h2b   ; X1 * c1 + X3 * c3
        .asg            B6,         B_h2    ; Node h2 in signal flow graph
        .asg            A4,         A_h3a   ; X5 * c1 + X7 * c5
        .asg            B1,         B_h3b   ; X1 * c3 - X3 * c7
        .asg            A3,         A_h3    ; Node h3 in signal flow graph
        .asg            A9,         A_g3a   ; X5 * c7 + X7 * c3
        .asg            B1,         B_g3b   ; X1 * c5 - X3 * c1
        .asg            B7,         B_g3    ; Node g3 in signal flow graph
        .asg            A9,         A_g2a   ; X5 * c3 - X7 * c1
        .asg            B1,         B_g2b   ; X1 * c7 - X3 * c5
        .asg            A0,         A_g2    ; Node g2 in signal flow graph
        .asg            B8,         B_x0    ; Output x0, pre-saturate/truncate
        .asg            A1,         A_x1    ; Output x1, pre-saturate/truncate
        .asg            B7,         B_x2    ; Output x2, pre-saturate/truncate
        .asg            A4,         A_x3    ; Output x3, pre-saturate/truncate
        .asg            A0,         A_x4    ; Output x4, pre-saturate/truncate
        .asg            B4,         B_x5    ; Output x5, pre-saturate/truncate
        .asg            A5,         A_x6    ; Output x6, pre-saturate/truncate
        .asg            B6,         B_x7    ; Output x7, pre-saturate/truncate
        .asg            B5,         B_x0s   ; Output x0, saturated to 9 bits
        .asg            A10,        A_x1s   ; Output x1, saturated to 9 bits
        .asg            B3,         B_x2s   ; Output x2, saturated to 9 bits
        .asg            A6,         A_x3s   ; Output x3, saturated to 9 bits
        .asg            A7,         A_x4s   ; Output x4, saturated to 9 bits
        .asg            B4,         B_x5s   ; Output x5, saturated to 9 bits
        .asg            A3,         A_x6s   ; Output x6, saturated to 9 bits
        .asg            B6,         B_x7s   ; Output x7, saturated to 9 bits
        .asg            B8,         B_x0t   ; Output x0, truncated to 9 bits
        .asg            A0,         A_x1t   ; Output x1, truncated to 9 bits
        .asg            B0,         B_x2t   ; Output x2, truncated to 9 bits
        .asg            A6,         A_x3t   ; Output x3, truncated to 9 bits
        .asg            A7,         A_x4t   ; Output x4, truncated to 9 bits
        .asg            B4,         B_x5t   ; Output x5, truncated to 9 bits
        .asg            A5,         A_x6t   ; Output x6, truncated to 9 bits
        .asg            B3,         B_x7t   ; Output x7, truncated to 9 bits
; ============================================================================
; ============================================================================

; ============================ PIPE LOOP PROLOG ==============================
; Vertical-pass prolog.
;
;  -- Reloads the loop trip count from frame word 1 through B0 (the stack
;     pointer read back from IRP just before this point), applies the -128
;     fixups to both output pointers, and finishes the cosine repacking
;     (c2 into the high half of B_c2c1, c6 into the high half of A_c6c3).
;  -- Issues stages [1]..[16] of the first iteration and the first loads of
;     the second.
;  -- Kernel entry uses five branches issued on consecutive cycles, one per
;     kernel packet v_loop_0 .. v_loop_4, each with a byte offset ("skip N").
;     Presumably N * 4 bytes steps over the first N instructions of the
;     target packet; those are the ones tagged stage [28] and above, which
;     belong to an iteration that has not started yet.
;     NOTE(review): this depends on the exact instruction order at the top
;     of those kernel packets -- re-check the offsets if they are edited.
;  -- A_c1c4 alternates between c4 and c1.  Here it is reloaded with c1 so
;     that the [14,1] X5*c1 multiply needs no cross path (the kernel's
;     [14,3] copy uses .M1X with B_c2c1 instead); that packet also issues
;     an ADD on .S1X.
v_prolog:
        LDW     .D2T1   *B0[1],     A_o         ; Unspill loop trip count
||      ADDK    .S2     -128,       B_o_ptr     ; Fixup for vert loop
;-
        LDW     .D1T1   *+A_i_ptr[1],           A_X7X6          ;[ 1,1] 
||      LDW     .D2T2   *-B_i_ptr[1],           B_X1X0          ;[ 1,1] 

        ADDK    .S1     -128,       A_o_ptr     ; Fixup for vert loop

        ; Set up modified constants for second loop
        ; Note: A_c7c5, B_c7c5 are in same regs both loops.
        ; Also, B_c2c1 reuses h_loop's B_c3c1.

        LDW     .D2T2   * B_i_ptr++[4],         B_X3X2          ;[ 3,1] 
||      LDW     .D1T1   * A_i_ptr++[4],         A_X5X4          ;[ 3,1] 

        MVKLH   .S2     cst_c2,     B_c2c1      ; c2  (B_c2c1 == B_c3c1)
||      MVKLH   .S1     cst_c6,     A_c6c3      ; c6
        
        MVK     .S2     8,          B_i         ; Inner loop counter.
;-
        MPYHL   .M1     A_X7X6,     A_c6c3,     A_X7c3          ;[ 6,1] 

        MPYH    .M1     A_X7X6,     A_c7c5,     A_X7c7          ;[ 7,1] 
||      MPYHL   .M2     B_X1X0,     B_c2c1,     B_X1c1          ;[ 7,1] 

        MVK     .S1     cst_c4,     A_c1c4                      ;[ 8,1] 
||      MPYH    .M1     A_X5X4,     A_c7c5,     A_X5c7          ;[ 8,1] 
||      MPYHL   .M2     B_X1X0,     B_c7c5,     B_X1c5          ;[ 8,1] 

        MPY     .M1     A_X5X4,     A_c1c4,     A_P1            ;[ 9,1] 
||      MPYHL   .M2     B_X3X2,     B_c2c1,     B_X3c1          ;[ 9,1] 
;-
        ADD     .D1     A_X5c7,     A_X7c3,     A_g3a           ;[10,1] 
||      MPYHL   .M1     A_X5X4,     A_c6c3,     A_X5c3          ;[10,1] 
||      MPYHL   .M2X    B_X3X2,     A_c6c3,     B_X3c3          ;[10,1] 

        SUB     .L2     B_X1c5,     B_X3c1,     B_g3b           ;[11,1] 
||      MPYHL   .M1     A_X5X4,     A_c7c5,     A_X5c5          ;[11,1] 
||      MPY     .M2X    B_X1X0,     A_c1c4,     B_P0_t          ;[11,1] 
||      MVK     .S2     -32768,     B_rnd                       ;[ 6,1] 
||      B       .S1     v_loop_0 + 8                            ; skip 2
;-
        ADD     .L2X    B_g3b,      A_g3a,      B_g3            ;[12,1] 
||      MPYHL   .M1X    A_X7X6,     B_c2c1,     A_X7c1          ;[12,1] 
||      MPYH    .M2     B_X3X2,     B_c7c5,     B_X3c7          ;[12,1] 
||      LDW     .D1T1   *+A_i_ptr[1],           A_X7X6          ;[ 1,2] 
||      LDW     .D2T2   *-B_i_ptr[1],           B_X1X0          ;[ 1,2] 
||      B       .S2     v_loop_1 + 8                            ; skip 2
;-
        SUB     .D2     B_P0_t,     B_rnd,      B_P0            ;[13,1] 
||      ADD     .L2     B_X1c1,     B_X3c3,     B_h2b           ;[13,1] 
||      ADD     .L1     A_X5c5,     A_X7c7,     A_h2a           ;[13,1] 
||      MPYLH   .M1X    A_X7X6,     B_c2c1,     A_X6c2          ;[13,1] 
||      MPYLH   .M2X    B_X3X2,     A_c6c3,     B_X2c6          ;[13,1] 
||      B       .S2     v_loop_2 + 12                           ; skip 3
||      MVKL    .S1     cst_c1,     A_c1c4                      ; c1, for the [14,1] X5*c1 multiply
;-
        SUB     .L1     A_X5c3,     A_X7c1,     A_g2a           ;[14,1] 
||      MPYHL   .M1     A_X5X4,     A_c1c4,     A_X5c1          ;[14,1]
||      MPYHL   .M2X    B_X1X0,     A_c6c3,     B_X1c3          ;[14,1] 
||      LDW     .D2T2   * B_i_ptr++[4],         B_X3X2          ;[ 3,2] 
||      LDW     .D1T1   * A_i_ptr++[4],         A_X5X4          ;[ 3,2] 
||      B       .S2     v_loop_3 + 4                            ; skip 1
||      ADD     .S1X    B_P0,       A_P1,       A_p0            ;[16,1]
;-
        ADD     .L2X    B_h2b,      A_h2a,      B_h2            ;[15,1] 
||      SUB     .L1X    B_P0,       A_P1,       A_p1            ;[15,1] 
||      MPYHL   .M1     A_X7X6,     A_c7c5,     A_X7c5          ;[15,1] 
||      MPYLH   .M2     B_X3X2,     B_c2c1,     B_X2c2          ;[15,1] 
||      B       .S2     v_loop_4 + 4                            ; skip 1

        SUB     .L2X    B_X2c6,     A_X6c2,     B_r1            ;[16,1] 
||      MPYLH   .M1     A_X7X6,     A_c6c3,     A_X6c6          ;[16,1] 
||      MPYH    .M2     B_X1X0,     B_c7c5,     B_X1c7          ;[16,1] 
;-
; ===== Branch Occurs =====
; ============================ PIPE LOOP KERNEL ==============================
v_loop:
v_loop_0:
        STH     .D1T2   B_x7t,      *+A_o_ptr[24]               ;[28,1] 
||      SHR     .S1     A_x4s,      trunc2,     A_x4t           ;[28,1] 
||      ADD     .L1     A_X5c1,     A_X7c5,     A_h3a           ;[17,2] 
||      SUB     .D2     B_X1c3,     B_X3c7,     B_h3b           ;[17,2] 
||      SUB     .L2X    A_p1,       B_r1,       B_h1            ;[17,2]
||      MPYHL   .M2     B_X3X2,     B_c7c5,     B_X3c5          ;[17,2] 
||      MVK     .S2     -32768,     B_rnd                       ;[ 6,3] 
||      MPYHL   .M1     A_X7X6,     A_c6c3,     A_X7c3          ;[ 6,3] 

v_loop_1:
        STH     .D1T1   A_x4t,      * A_o_ptr++[1]              ;[29,1] 
||      SHR     .S1     A_x1s,      trunc2,     A_x1t           ;[29,1] 
||      ADD     .S2     B_h1,       B_g3,       B_x2            ;[18,2] 
||      SUB     .D2     B_h1,       B_g3,       B_x5            ;[18,2] 
||      ADD     .L1X    A_p1,       B_r1,       A_g1            ;[18,2]
||      ADD     .L2X    B_X2c2,     A_X6c6,     B_r0            ;[18,2] 
||      MPYH    .M1     A_X7X6,     A_c7c5,     A_X7c7          ;[ 7,3] 
||      MPYHL   .M2     B_X1X0,     B_c2c1,     B_X1c1          ;[ 7,3] 

v_loop_2:
  [!B_i]ADDAW   .D1     A_o_ptr,    28,         A_o_ptr         ;[30,1] 
||      STH     .D2T1   A_x3t,      * B_o_ptr++[1]              ;[30,1] 
||      SHR     .S2     B_x0s,      trunc2,     B_x0t           ;[30,1] 
||      SUB     .L2     B_X1c7,     B_X3c5,     B_g2b           ;[19,2] 
||      SUB     .L1X    B_h3b,      A_h3a,      A_h3            ;[19,2] 
||      MVK     .S1     cst_c4,     A_c1c4                      ;[ 8,3] 
||      MPYH    .M1     A_X5X4,     A_c7c5,     A_X5c7          ;[ 8,3] 
||      MPYHL   .M2     B_X1X0,     B_c7c5,     B_X1c5          ;[ 8,3] 

v_loop_3:
        STH     .D2T1   A_x1t,      *-B_o_ptr[17]               ;[31,1] 
||      ADD     .L2X    A_p0,       B_r0,       B_g0            ;[20,2]
||      SSHL    .S2     B_x5,       satl,       B_x5s           ;[20,2] 
||      SUB     .S1X    A_p0,       B_r0,       A_h0            ;[20,2]
||      SUB     .L1     A_g1,       A_h3,       A_x6            ;[20,2] 
||      ADD     .D1     A_g1,       A_h3,       A_x1            ;[20,2] 
||      MPY     .M1     A_X5X4,     A_c1c4,     A_P1            ;[ 9,3] 
||      MPYHL   .M2     B_X3X2,     B_c2c1,     B_X3c1          ;[ 9,3] 

v_loop_4:
        STH     .D2T2   B_x0t,      *-B_o_ptr[25]               ;[32,1] 
||      SUB     .S2     B_g0,       B_h2,       B_x7            ;[21,2] 
||      ADD     .L2     B_g0,       B_h2,       B_x0            ;[21,2] 
||      ADD     .L1X    B_g2b,      A_g2a,      A_g2            ;[21,2] 
||      SSHL    .S1     A_x1,       satl,       A_x1s           ;[21,2] 
||      ADD     .D1     A_X5c7,     A_X7c3,     A_g3a           ;[10,3] 
||      MPYHL   .M1     A_X5X4,     A_c6c3,     A_X5c3          ;[10,3] 
||      MPYHL   .M2X    B_X3X2,     A_c6c3,     B_X3c3          ;[10,3] 

v_loop_5:
  [ A_o]B       .S1     v_loop                                  ;[33,1] 
||[!B_i]ADDAW   .D2     B_o_ptr,    28,         B_o_ptr         ;[33,1] 
||      SSHL    .S2     B_x2,       satl,       B_x2s           ;[22,2] 
||      ADD     .D1     A_h0,       A_g2,       A_x3            ;[22,2] 
||[ A_o]SUB     .L1     A_o,        1,          A_o             ;[22,2] 
||      SUB     .L2     B_X1c5,     B_X3c1,     B_g3b           ;[11,3] 
||      MPYHL   .M1     A_X5X4,     A_c7c5,     A_X5c5          ;[11,3] 
||      MPY     .M2X    B_X1X0,     A_c1c4,     B_P0_t          ;[11,3] 

v_loop_6:
        SHR     .S2     B_x5s,      trunc2,     B_x5t           ;[23,2] 
||      SUB     .L1     A_h0,       A_g2,       A_x4            ;[23,2] 
||      SSHL    .S1     A_x6,       satl,       A_x6s           ;[23,2] 
||      ADD     .L2X    B_g3b,      A_g3a,      B_g3            ;[12,3] 
||      MPYHL   .M1X    A_X7X6,     B_c2c1,     A_X7c1          ;[12,3] 
||      MPYH    .M2     B_X3X2,     B_c7c5,     B_X3c7          ;[12,3] 
||      LDW     .D1T1   *+A_i_ptr[1],           A_X7X6          ;[ 1,4] 
||      LDW     .D2T2   *-B_i_ptr[1],           B_X1X0          ;[ 1,4] 

v_loop_7:
        SHR     .S2     B_x2s,      trunc2,     B_x2t           ;[24,2] 
||      STH     .D1T2   B_x5t,      *+A_o_ptr[8]                ;[24,2] 
||      SHR     .S1     A_x6s,      trunc2,     A_x6t           ;[24,2] 
||      SUB     .D2     B_P0_t,     B_rnd,      B_P0            ;[13,3] 
||      ADD     .L2     B_X1c1,     B_X3c3,     B_h2b           ;[13,3] 
||      ADD     .L1     A_X5c5,     A_X7c7,     A_h2a           ;[13,3] 
||      MPYLH   .M1X    A_X7X6,     B_c2c1,     A_X6c2          ;[13,3] 
||      MPYLH   .M2X    B_X3X2,     A_c6c3,     B_X2c6          ;[13,3] 

v_loop_8:
        AND     .L2     B_i,        7,          B_i             ;[36,1] 
||      SSHL    .S2     B_x7,       satl,       B_x7s           ;[25,2] 
||      SSHL    .S1     A_x3,       satl,       A_x3s           ;[25,2] 
||      SUB     .L1     A_X5c3,     A_X7c1,     A_g2a           ;[14,3] 
||      MPYHL   .M1X    A_X5X4,     B_c2c1,     A_X5c1          ;[14,3] 
||      MPYHL   .M2X    B_X1X0,     A_c6c3,     B_X1c3          ;[14,3] 
||      LDW     .D2T2   * B_i_ptr++[4],         B_X3X2          ;[ 3,4] 
||      LDW     .D1T1   * A_i_ptr++[4],         A_X5X4          ;[ 3,4] 

v_loop_9:
        STH     .D2T2   B_x2t,      *-B_o_ptr[8]                ;[26,2] 
||      SHR     .S1     A_x3s,      trunc2,     A_x3t           ;[26,2] 
||      SHR     .S2     B_x7s,      trunc2,     B_x7t           ;[26,2] 
||      ADD     .L2X    B_h2b,      A_h2a,      B_h2            ;[15,3] 
||      SUB     .L1X    B_P0,       A_P1,       A_p1            ;[15,3] 
||      MPYHL   .M1     A_X7X6,     A_c7c5,     A_X7c5          ;[15,3] 
||      MPYLH   .M2     B_X3X2,     B_c2c1,     B_X2c2          ;[15,3] 

v_loop_a:
  [ A_o]SUB     .D2     B_i,        1,          B_i             ;[27,2] 
||      STH     .D1T1   A_x6t,      *+A_o_ptr[16]               ;[27,2] 
||      SSHL    .S2     B_x0,       satl,       B_x0s           ;[27,2] 
||      SSHL    .S1     A_x4,       satl,       A_x4s           ;[27,2] 
||      SUB     .L2X    B_X2c6,     A_X6c2,     B_r1            ;[16,3] 
||      ADD     .L1X    B_P0,       A_P1,       A_p0            ;[16,3] 
||      MPYLH   .M1     A_X7X6,     A_c6c3,     A_X6c6          ;[16,3] 
||      MPYH    .M2     B_X1X0,     B_c7c5,     B_X1c7          ;[16,3] 

; ============================ PIPE LOOP EPILOG ==============================
v_epilog:
* ========================================================================= *
*   Post-vertical loop code:  Performs remaining vertical-loop epilog,      *
*   pulls registers from the stack, restores the interrupt-enable state,    *
*   and returns to the caller.                                              *
*                                                                           *
*   For speed, I start pulling items from the stack as quickly as           *
*   possible. I pop the return address earliest, followed by the CSR        *
*   restore value and the rest of the stack frame (basically, the SOE       *
*   registers).                                                             *
*                                                                           *
*   I throw the return branch in flight nearly as soon as the return addr   *
*   arrives from the stack in order to return to the caller as soon as      *
*   possible.  I don't think it's possible to save any more time in this    *
*   epilog code.  :-)                                                       *
*                                                                           *
*   Once the stack-frame restore is complete, I allow the remainder of      *
*   the epilog (mostly shifts and stores) to complete, in the remaining     *
*   delay slots of the return branch.  Since the stack-restore loads        *
*   need to complete before this time anyway, I couldn't have used those    *
*   cycles for much else.                                                   *
*                                                                           *
*   The interrupt-enable state is not restored until the return branch      *
*   is in flight.  This implies that any pending interrupt will be taken    *
*   on arrival in the calling function, assuming it called the IDCT with    *
*   interrupts enabled.                                                     *
*                                                                           *
*   Again, this code uses twin stack-pointers for speed.                    *
*                                                                           *
*   To highlight how intertwined the epilog is with the stack frame code    *
*   I've added comments highlighting what is what.                          *
*                                                                           *
*   A trick is played in order to allow an early abort from the code.       *
*   If the loop trip count is calculated to be zero by the main setup code  *
*   at the beginning, an emergency branch is made to the abort label        *
*   below.  (The abort is triggered only if we're asked to do zero IDCTs.)  *
*   The outer loop trip count for the first loop (B_o) is stored in B2.     *
*   The second loop uses B2 for its inner loop trip count (B_i).  Under     *
*   normal operation, B2 (aka. B_o) is non-zero upon entry to this code.    *
*   However, in the case of an abort, it will be zero, since we did not     *
*   execute either loop.  Therefore we can use B_o to shut off the epilog   *
*   stores in the case of an early abort.                                   *
* ========================================================================= *
idct_8x8_abort:
; Reached either by falling out of the vertical loop (v_epilog) or, per the
; block comment above, by the early-abort branch taken when zero IDCTs are
; requested.  Every epilog STH in this routine is predicated on B_o so that
; the abort path writes nothing to the output buffer.  The remaining work is
; interleaved: finish the loop's last shifts/stores, reload the saved
; registers from the stack frame, and return.
  [ B_o]STH     .D1T2   B_x7t,      *+A_o_ptr[24]       ; epilog code
||      SHR     .S1     A_x4s,      trunc2,     A_x4t   ; epilog code
||      MVC     .S2     IRP,        B15         ; Grab the stack pointer (read back from IRP)

        LDW     .D2T2   *+ B15[ 2], B3          ; Get our return address
||      MV      .L2     B_o_ptr,    B0          ; We need this later (base for the epilog stores below).
||      SHR     .S1     A_x1s,      trunc2,     A_x1t   ; epilog code
||      MV      .L1X    B15,        A1          ; A-side copy of stack ptr, so both D units can load

        LDW     .D2T1   *+ B15[14], A2          ; IRP value to restore below
;-
        LDW     .D2T2   *+ B15[10], B1          ; A13 value to restore below
||      LDW     .D1T1   *+ A1 [13], A14         ; Restoring SOE reg A14

        LDW     .D2T2   *+ B15[12], B14         ; Restoring SOE reg B14
||      LDW     .D1T1   *+ A1 [ 3], A3          ; CSR value to restore below

        LDW     .D1T2   *+ A1 [ 4], B10         ; Restoring SOE reg B10
||      LDW     .D2T1   *+ B15[ 5], A10         ; Restoring SOE reg A10

        LDW     .D1T2   *+ A1 [ 6], B11         ; Restoring SOE reg B11
||      LDW     .D2T1   *+ B15[ 7], A11         ; Restoring SOE reg A11
;-
; The return branch is issued here; the five execute packets that follow it
; run in the branch's delay slots before control reaches the caller.
        LDW     .D1T2   *+ A1 [ 8], B12         ; Restoring SOE reg B12
||      LDW     .D2T1   *+ B15[ 9], A12         ; Restoring SOE reg A12
||      B       .S2     B3                      ; Go home!

; The pre-increment on B15 below advances the stack pointer by 16 words as
; part of the same load that fetches A15.  The A1 copy is unaffected, so the
; B13 load issued in parallel still addresses the original frame.
        LDW     .D1T2   *+ A1 [11], B13         ; Restoring SOE reg B13
||      LDW     .D2T1   *++B15[16], A15         ; Restoring SOE reg A15, B15
||      MV      .L1X    B1,         A13         ; Restoring SOE reg A13

; Remaining epilog stores go through B0 (the copy of B_o_ptr made above).
  [ B_o]STH     .D2T1   A_x3t,      * B0                ; epilog code
||      SHR     .S2     B_x0s,      trunc2,     B_x0t   ; epilog code
;-

  [ B_o]STH     .D2T1   A_x1t,      *-B0[16]            ; epilog code

  [ B_o]STH     .D2T2   B_x0t,      *-B0[24]            ; epilog code
||      MVC     .S2X    A2,         IRP         ; Restoring SOE reg IRP

; Final packet before the branch lands: CSR is written last, so the
; interrupt-enable state is only restored once the return is in flight.
  [ B_o]STH     .D2T1   A_x4t,      *+B0[8]             ; epilog code
||      MVC     .S2X    A3,         CSR         ; Restore intr enable state
;-
v_end:

* ========================================================================= *
*   End of file:  idct_8x8_h.asm                                            *
* ------------------------------------------------------------------------- *
*             Copyright (c) 1999 Texas Instruments, Incorporated.           *
*                            All Rights Reserved.                           *
* ========================================================================= *
