*===========================================================
*
*     TEXAS INSTRUMENTS, INC.
*
*     NAME:             FHT (Fast Hadamard Transform)
*
*     Revision Date:    04/01/98
*     Author:           Jelena Nikolic-Popovic, DSP Apps. Canada
*
*     USAGE
*           This routine is C-callable and can be called as:
*
*           void fht(short *data)
*
*           data = 64-point            input vector on entry
*                  64-point FHT of the input vector on exit
*                  
*     C CODE
*           This is the C equivalent of the assembly code
*           without restrictions:
*           Note that the assembly code is hand optimized
*           and restrictions may  apply.
*
*       void fht(short *data)
*       {
*         short tmp,k,i,count,j,n;
*
*         for (k=0;k<6;k++) {
*           n=32>>k;
*           for (i=0;i<n;i++) {
*             tmp=data[i]+data[i+n];
*             data[i+n]=data[i]-data[i+n];
*             data[i]=tmp;
*           }
*           count=(2<<(k-1))-1;
*           if (count>0) {
*             for (j=1;j<=count;j++) {
*               for (i=n*j*2;i<n*(j*2+1);i++) {
*                 tmp=data[i]+data[i+n];
*                 data[i+n]=data[i]-data[i+n];
*                 data[i]=tmp;
*               }
*             }   // end of for j=1:count
*           }     // endif
*         }       //  end of for k=0:5  
*       }
*
*
*     DESCRIPTION
*         The  routine  replaces the 64-point  input  data
*         vector  with  its FHT (Fast Hadamard Transform).
*         The FHT is an efficient way to multiply a vector
*         by  the Hadamard  matrix. In  IS-95, the size of
*         input vector  is 64 points  and  Hadamard matrix
*         H6 is a  64 x 64 matrix  defined recursively as:
*
*         H0 = [1]
*         H1 = H0   H0  =  1   1
*              H0  -H0     1  -1 
*
*         ...
*
*         H6 = H5   H5
*              H5  -H5
*
*
*         FHT reduces the vector * matrix multiplication
*         task to butterfly computation, similar to
*         Viterbi.
*         For multiplication by H6, there are 6 stages of
*         32 butterflies each. From one stage to the next,
*         the butterfly "span" is divided by 2 (from 32
*         points in the first stage to 2 points in the
*         last stage).
*         The algorithm requires: 
*         6 stages * 32 butterflies/stage * 2 op's/butterfly 
*                = 384 add/subtract operations
*  
*
*     TECHNIQUES
*
*         The algorithm is optimized for speed of execution
*         rather than code size.
*
*         A potential bottleneck in the C6x implementation
*         is in the requirement to store/load  intermediate
*         64-point array after each stage.
*         This was overcome by making following modification
*         to the H6 - FHT algorithm:
*
*         Multiplication of input vector d[64] by the H6
*         matrix ( 64x64 ) is broken down in two
*         multiplications by H5 (32x32 matrix) as follows:
*         d[64] = [ d1[32] d2[32] ], i.e. split the input
*         vector in two 32-point vectors
*         H6*d[64]  =         H5*d1[32] + H5*d2[32]            
*                             H5*d1[32] - H5*d2[32]
*         First, H5*d1[32] is calculated using the butterfly
*         method. Since there are only 32 intermediate
*         results of type short, they can be stored in
*         16 registers - no need to store out after each
*         stage and then load in for the next one.
*         Next, H5*d1[32] will be stored out to memory and
*         H5*d2[32] will be calculated in the same manner as
*         H5*d1[32]. Finally, H6*d[64] is computed from
*         H5*d1[32] and H5*d2[32] as shown above.
*
*     ASSUMPTIONS
*         - interrupts are disabled
*         - the range of data in the input  vector is such
*           that the data in the output vector will not
*           overflow 16 bits. Since there are 6 stages of
*           add/sub's, this means that the range of input
*           data should not be more than 10 bits.
*           ( in the worst case).
*         - input vector is aligned on 4-byte boundary.
*
*     MEMORY NOTE
*             no memory bank hits.  
*     CYCLES
*           191 cycles (including C calling overhead)
*
*===========================================================


        .global _fht
        .text
_fht:

        ; save context  

        sub.l1x    b15,4,a7             ; copy stack pointer
||      stw.d2     b3,*B15--[2]

        stw.d1     a10,*a7--[2] ; push a10 
||      stw.d2     b10,*b15--[2]; push b10 
        stw.d1     a11,*A7--[2] ; push a11 
||      stw.d2     b11,*b15--[2]; push b11 
        stw.d1     a12,*a7--[2] ; push a12 
||      stw.d2     b12,*b15--[2]; push b12 
        stw.d1     a13,*a7--[2] ; push a13 
||      stw.d2     b13,*b15--[2]; push b13 
        stw.d1     a14,*a7      ; push a14 
||      stw.d2     b14,*b15     ; push b14 

        ; intialization


        mv.l1   a4,a14          ; a14 = &d[0]       
||      mvk     32,a13          ; a13 = 32
||      mvk     32,b13          ; b13 = 32         

        mvk     2,b13           ; b13 = 00010002    
||      mvk     0ffffh,a15      ; a15 = 0000ffff   
||      add.l2  a14,b13,b14     ; b14 = &d(16)     

        mvklh   1,b13           ; b13 = 00010002   
||      mvklh   0,a15           ; a15 = 0000ffff

        ; H5 loop computes H5*d1(32) and H5*d2(32) 
        ; Notation : di(), where i=0,1,2,3,4,5,
        ; are the intermediate vectors after stage i

H5_loop:

        ldw.d1 *a14++,b0        ; d0(0)  and d0(1)    
                                            

        ldw.d2 *b14++,a0        ; d0(16) and d0(17)
||      ldw.d1 *a14++,b1        ; d0(2)  and d0(3)   

        ldw.d2 *b14++,a1        ; d0(18) and d0(19)
||      ldw.d1 *a14++,b2        ; d0(4)  and d0(5)

        ldw.d2 *b14++,a2        ; d0(20) and d0(21)
||      ldw.d1 *a14++,b3        ; d0(6)  and d0(7) 

        ldw.d2 *b14++,a3        ; d0(22) and d0(23)
||      ldw.d1 *a14++,b4        ; d0(8)  and d0(9)

        ldw.d2 *b14++,a4        ; d0(24) and d0(25)
||      ldw.d1 *a14++,b5        ; d0(10) and d0(11) 

        ldw.d2 *b14++,a5        ; d0(26) and d0(27) 
||      add2.s1 a0,b0,a0        ; d1(0)  and d1(1)  
||      sub2.s2 b0,a0,b0        ; d1(16) and d1(17) 
||      ldw.d1 *a14++,b6        ; d0(12) and d0(13) 

        ldw.d2 *b14++,a6        ; d0(28) and d0(29) 
||      add2.s1 a1,b1,a1        ; d1(2)  and d1(3)  
||      sub2.s2 b1,a1,b1        ; d1(18  and d1(19) 
||      ldw.d1 *a14++,b7        ; d0(14) and d0(15)

        ldw.d2 *b14++,a7        ; d0(30) and d0(31)
||      add2.s1 a2,b2,a2        ; d1(4)  and d1(5)  
||      sub2.s2 b2,a2,b2        ; d1(20) and d1(21)

        add2.s1 a3,b3,a3        ; d1(6)  and d1(7) 
||      sub2.s2 b3,a3,b3        ; d(22)  and d(23) 

        add2.s1 a4,b4,a4        ; d1(8)  and d1(9) 
||      sub2.s2 b4,a4,b4        ; d1(24) and d1(25)

        add2.s1 a5,b5,a5        ; d1(10) and d1(11) 
||      sub2.s2 b5,a5,b5        ; d1(26) and d1(27) 

        add2.s1 a6,b6,a6        ; d1(12) and d1(13) 
||      sub2.s2 b6,a6,b6        ; d1(28) and d1(29) 

        add2.s1 a7,b7,a7        ; d1(14) and d1(15) 
||      sub2.s2 b7,a7,b7        ; d1(30) and d1(31)

        ; End   of 1st stage 
        ; Start of 2nd stage

        sub2.s1 a0,a4,a8        ; d2(8)  and d2(9)                  
||      sub2.s2 b0,b4,b8        ; d2(24) and d2(25) 

        add2.s1 a0,a4,a0        ; d2(0)  and d2(1)   
||      add2.s2 b0,b4,b0        ; d2(16) and d2(17) 

        sub2.s1 a2,a6,a4        ; d2(12) and d2(13)      
||      sub2.s2 b2,b6,b4        ; d2(28) and d2(29)       

        add2.s1 a2,a6,a2        ; d2(4)  and d2(5)        
||      add2.s2 b2,b6,b2        ; d2(20) and d2(21)         

        sub2.s1 a1,a5,a6        ; d2(10) and d2(11)      
||      sub2.s2 b1,b5,b6        ; d2(26) and d2(27)        

        add2.s1 a1,a5,a1        ; d2(2)  and d2(3)       
||      add2.s2 b1,b5,b1        ; d2(18) and d2(19)      

        sub2.s1 a3,a7,a5        ; d2(14) and d2(15) 
||      sub2.s2 b3,b7,b5        ; d2(30) and d2(31)        

        add2.s1 a3,a7,a3        ; d2(6)  and d2(7)       
||      add2.s2 b3,b7,b3        ; d2(22) and d2(23)       


        ; End   of 2nd stage
        ; Start of 3rd stage

        sub2.s1 a0,a2,a7        ; d3(4)  and d3(5)
||      sub2.s2 b0,b2,b7        ; d3(20) and d3(21)        

        add2.s1 a0,a2,a0        ; d3(0)  and d3(1)   
||      add2.s2 b0,b2,b0        ; d3(16) and d3(17) 

        sub2.s1 a1,a3,a2        ; d3(6)  and d3(7)      
||      sub2.s2 b1,b3,b2        ; d3(22) and d3(23)        

        add2.s1 a1,a3,a1        ; d3(2)  and d3(3)       
||      add2.s2 b1,b3,b1        ; d3(18) and d3(19) 

        sub2.s1 a8,a4,a3        ; d3(12) and d3(13)                
||      sub2.s2 b8,b4,b3        ; d3(28) and d3(29)        

        add2.s1 a8,a4,a4        ; d3(8   and d3(9)     
||      add2.s2 b8,b4,b4        ; d3(24) and d3(25)  

        sub2.s1 a6,a5,a8        ; d3(14) and d3(15)                
||      sub2.s2 b6,b5,b8        ; d3(30) and d3(31)        

        add2.s1 a6,a5,a5        ; d3(10) and d3(11)  
||      add2.s2 b6,b5,b5        ; d3(26  and d3(27) 

        ; End   of 3nd stage
        ; Start of 4th stage ( pipelined with 5th stage)

        sub2.s1 a0,a1,a6        ; d4(2)  and d4(3)  
||      sub2.s2 b0,b1,b6        ; d4(18) and d4(19) 

        add2.s1 a0,a1,a0        ; d4(0)  and d4(1) 
||      add2.s2 b0,b1,b0        ; d4(16) and d4(17) 
||      sub.l1 a14,a13,a14      ; adjust store pointers
||      sub.l2 b14,a13,b14

        sub2.s1 a7,a2,a9        ; d4(6)  and d4(7)    
||      sub2.s2 b7,b2,b9        ; d4(22) and d4(23)   
||      mpyhu.m1 a0,b13,a1      ; d4(1) 
||      and.l1 a0,a15,a0        ; d4(0)  
||      mpyhu.m2 b0,b13,b1      ; d4(17) 
||      and.l2 b0,a15,b0        ; d4(16) 
         
        add2.s1 a7,a2,a2        ; d4(4)  and d4(5) 
||      add2.s2 b7,b2,b2        ; d4(20) and d4(21) 
||      mpyhu.m1 a6,b13,a7      ; d4(3)           
||      and.l1 a6,a15,a6        ; d4(2)            
||      mpyhu.m2 b6,b13,b7      ; d4(19)           
||      and.l2 b6,a15,b6        ; d4(18)           

        ; Start of 5th stage

        sub2.s1 a4,a5,a12       ; d4(10) and d4(11)
||      sub2.s2 b4,b5,b12       ; d4(26) and d4(27) 
||      add.l1 a0,a1,a0         ; d5(0)            
||      sub.d1 a0,a1,a1         ; d5(1)          
||      add.l2 b0,b1,b0         ; d5(16)        
||      sub.d2 b0,b1,b1         ; d5(17)      
||      mpyhu.m1 a2,b13,a10     ; d4(5)  
||      mpyhu.m2 b2,b13,b10     ; d4(21)         


        add2.s1 a4,a5,a4        ; d4(8)  and d4(9)  
||      add2.s2 b4,b5,b4        ; d4(24) and d4(25)
||      add.l1 a6,a7,a6         ; d5(2)          
||      sub.d1 a6,a7,a7         ; d5(3)            
||      add.l2 b6,b7,b6         ; d5(18)           
||      sub.d2 b6,b7,b7         ; d5(19)          
||      mpylhu.m1 a2,b13,a2     ; d4(4)          
||      mpylhu.m2 b2,b13,b2     ; d4(20)            

        sub2.s1 a3,a8,a5        ; d4(14) and d4(15) 
||      sub2.s2 b3,b8,b5        ; d4(30) and d4(31)  
||      mpyhu.m1 a9,b13,a11     ; d4(6)             
||      and.l1 a9,a15,a9        ; d4(7)             
||      mpyhu.m2 b9,b13,b11     ; d4(22)          
||      and.l2 b9,a15,b9        ; d4(23)             


        add2.s1 a3,a8,a3        ; d4(12) and d4(13) 
||      add2.s2 b3,b8,b3        ; d4(28) and d4(29)  
||      and.l1  a0,a15,a0       ; clear upper half of register
||      and.l2  b0,a15,b0       ; clear upper half of register

        add.l1 a2,a10,a2        ; d5(4)            
||      sub.d1 a2,a10,a10       ; d5(5)            
||      add.l2 b2,b10,b2        ; d5(20)            
||      sub.d2 b2,b10,b10       ; d5(21)           
||      mpyhu.m1 a4,b13,a8      ; d4(9)              
||      mpyhu.m2 b4,b13,b8      ; d4(25)            
||      and.s1  a6,a15,a6       ; clear upper half of register  
||      and.s2  b6,a15,b6       ; clear upper half of register


        shl.s1 a1,16,a1         ; d5(1)  in upper half
||      shl.s2 b1,16,b1         ; d5(17) in upper half
||      add.l1 a9,a11,a9        ; d5(6)            
||      sub.d1 a9,a11,a11       ; d5(7)            
||      add.l2 b9,b11,b9        ; d5(22)           
||      sub.d2 b9,b11,b11       ; d5(23)            
||      mpylhu.m1 a4,b13,a4     ; d4(8)            
||      mpylhu.m2 b4,b13,b4     ; d4(24)          

        shl.s1 a7,16,a7         ; d5(3)  in upper half
||      shl.s2 b7,16,b7         ; d5(19) in upper half
||      add.d1 a0,a1,a0         ; d5(0)  and d5(1)   
||      add.d2 b0,b1,b0         ; d5(16) and d5(17)     
||      mpyhu.m1 a12,b13,a1     ; d4(10)             
||      and.l1 a12,a15,a12      ; d4(11)            
||      mpyhu.m2 b12,b13,b1     ; d4(26)            
||      and.l2 b12,a15,b12      ; d4(27)            


        and.l1 a6,a15,a6        ; clear upper half of register
||      and.l2 b6,a15,b6        ; clear upper half of register
||      mpylhu.m1 a2,b13,a2     ; clear upper half of register
||      mpylhu.m2 b2,b13,b2     ; clear upper half of register

        shl.s1 a10,16,a10       ; d5(5)  in upper half
||      shl.s2 b10,16,b10       ; d5(21) in upper half
||      add.d1 a6,a7,a6         ; d5(2)  and d5(3) 
||      add.d2 b6,b7,b6         ; d5(18) and d5(19)  
||      mpyhu.m1 a3,b13,a7      ; d4(12) 
||      and.l1 a3,a15,a3        ; d4(13) 
||      mpyhu.m2 b3,b13,b7      ; d4(28) 
||      and.l2 b3,a15,b3        ; d4(29) 
 
        shl.s1 a11,16,a11       ; d5(7)  in upper half
||      shl.s2 b11,16,b11       ; d5(23) in upper half
||      add.d1 a2,a10,a2        ; d5(4)  and d5(5)    
||      add.d2 b2,b10,b2        ; d5(20) and d5(21)  
||      mpyhu.m1 a5,b13,a10     ; d4(14)              
||      and.l1 a5,a15,a5        ; d4(15)              
||      mpyhu.m2 b5,b13,b10     ; d4(30)             
||      and.l2 b5,a15,b5        ; d4(31)             

        ; End of 4th stage

        stw.d1 a0,*a14++        ; 
||      add.l1 a4,a8,a4         ; d5(8)
||      sub.s1 a4,a8,a8         ; d5(9)
||      add.l2 b4,b8,b4         ; d5(24)
||      sub.s2 b4,b8,b8         ; d5(25)

        shl.s1 a8,16,a8         ; d5(9)  in upper half
||      shl.s2 b8,16,b8         ; d5(25) in upper half
||      add.l1 a12,a1,a12       ; d5(10)             
||      sub.d1 a12,a1,a1        ; d5(11)             
||      add.l2 b12,b1,b12       ; d5(26)             
||      sub.d2 b12,b1,b1        ; d5(27)             

        shl.s1 a1,16,a1         ; d5(11) in upper half
||      shl.s2 b1,16,b1         ; d5(27) in upper half
||      add.l1 a3,a7,a3         ; d5(12)            
||      sub.d1 a3,a7,a7         ; d5(13)            
||      add.l2 b3,b7,b3         ; d5(28)            
||      sub.d2 b3,b7,b7         ; d5(29)            
||      mpylhu.m1 a9,b13,a9     ; clear upper half of register
||      mpylhu.m2 b9,b13,b9     ; clear upper half of register
 
        shl.s1 a7,16,a7         ; d5(13) in upper half
||      shl.s2 b7,16,b7         ; d5(29) in upper half
||      add.l1 a5,a10,a5        ; d5(14)             
||      sub.d1 a5,a10,a10       ; d5(15)             
||      add.l2 b5,b10,b5        ; d5(30)             
||      sub.d2 b5,b10,b10       ; d5(31)             
||      mpyhl.m2 b13,b13,b0     ; extract the counter in b0
  
        stw.d1 a6,*a14++        ; 
||      stw.d2 b0,*b14++        ;         
||      shl.s1 a10,16,a10       ; d5(15) in upper half
||      shl.s2 b10,16,b10       ; d5(31) in upper half
||      add.l1 a9,a11,a9        ; d5(6)  and d5(7)  
||      add.l2 b9,b11,b9        ; d5(22) and d5(23) 
 
        and.l1 a4,a15,a4        ; clear upper half of register
||      and.l2 b4,a15,b4        ; clear upper half of register
||      mpylhu.m1 a12,b13,a12   ; clear upper half of register
||      mpylhu.m2 b12,b13,b12   ; clear upper half of register

        and.l1 a3,a15,a3        ; clear upper half of register
||      and.l2 b3,a15,b3        ; clear upper half of register
||      mpylhu.m1 a5,b13,a5     ; clear upper half of register
||      mpylhu.m2 b5,b13,b5     ; clear upper half of register

        stw.d1 a2,*a14++        ; 
||      stw.d2 b6,*b14++        ; 
||      add.l1 a4,a8,a4         ; d5(8)  and d5(9)   
||      add.l2 b4,b8,b4         ; d5(24) and d5(25)  
||      sub.s2 b0,1,b0          ; decrement counter
 
        stw.d1 a9,*a14++        ; 
||      stw.d2 b2,*b14++        ; 
||      add.l1 a3,a7,a3         ; d5(12) and d5(13)  
||      add.l2 b3,b7,b3         ; d5(28) and d5(29) 
||      add.s1 a12,a1,a12       ; d5(10) and d5(11) 
||      add.s2 b12,b1,b12       ; d5(26) and d5(27) 
||      mpylh.m2 b0,b13,b13     ; move counter to b13(low)
   
        stw.d1 a4,*a14++        ; 
||      stw.d2 b9,*b14++        ; 
||      add.l1 a5,a10,a5        ; d5(14) and d5(15)  
||      add.l2 b5,b10,b5        ; d5(30) and d5(31)  
    
        stw.d1 a12,*a14++       ; 
||      stw.d2 b4,*b14++        ; 
||      mvklh.s2 1,b13          ; b13(high) = 1 ( for multiplications)
||[b0]  b.s1   H5_loop          ; branch back
   
        stw.d1 a3,*a14++        ; 
||      stw.d2 b12,*b14++       ;         
 
        stw.d1 a5,*a14++        ; 
||      stw.d2 b3,*b14++        ;         
||[!b0] mv.l2 a13,b13           ; 

        stw.d2 b5,*b14++        ;       

        ; End of 5th stage
        ; Adjust pointers                                       
                                                
   [b0] add.l1 a14,a13,a14      ; if looping back, a14 = &d(32)
|| [b0] add.l2 b14,a13,b14      ;
||[!b0] sub.s1 a14,a13,a14      ; if not looping,  a14 = &d(32) 
||[!b0] sub.s2 b14,b13,b14      ; if not looping,  b14 = &d(48)

  [!b0] sub.s1 a14,a13,a14      ; if not looping,  a14 = &d(16)
||[!b0] sub.s2 b14,b13,b14      ; if not looping,  b14 = &d(32)

  [!b0] sub.s1 a14,a13,a14      ; if not looping,  a14 = &d(0)

        ; Branch to H5_loop occurs here 

        ; H6 loop (last stage)
H6:

        mvk.s2 8,b0             ; initialize counter
||      mv     b14,a13          ; set up load pointers: 
||      add    a14,4,b14        ; a14 = &d[0]  ; b14 = &d[2]  (upper even and
odd)
||      add    b14,4,b13        ; a13 = &d[32] ; b13 = &d[34] (lower even and
odd)

        ldw.d1 *a14++[2],a4     ; upper even load
||      ldw.d2 *b14++[2],b4     ; upper odd  load
||      mv     a14,a12          ; set up store pointers
||      mv     b14,b12          ; a12 = &d[0]  ; b12 = &d[2]

        ldw.d1 *a13++[2],a3     ; lower even load
||      ldw.d2 *b13++[2],b3     ; lower odd load
||      mv     a13,a11          ; set up store pointers
||      mv     b13,b11          ; a11 = &d[0]  ; b11 = &d[2]

        nop 2

        ldw.d1 *a14++[2],a4     ; upper even load                           
||      ldw.d2 *b14++[2],b4     ; upper odd  load

        ldw.d1 *a13++[2],a3     ; lower even load                           
||      ldw.d2 *b13++[2],b3     ; lower odd load

        add2.s1 a4,a3,a2        ; upper/lower even -> upper even
||      add2.s2 b4,b3,b2        ; upper/lower odd  -> upper odd

        sub2.s1 a4,a3,a1        ; upper/lower even -> lower even
||      sub2.s2 b4,b3,b1        ; upper/lower odd  -> lower odd
||[b0]  sub.l2 b0,1,b0          ; 

        ldw.d1 *a14++[2],a4     ; upper even load                           
||      ldw.d2 *b14++[2],b4     ; upper odd  load
||[b0]  b      H6loop       

        ldw.d1 *a13++[2],a3     ; lower even load                           
||      ldw.d2 *b13++[2],b3     ; lower odd load

H6loop:
        add2.s1 a4,a3,a2        ; upper/lower even -> upper even
||      add2.s2 b4,b3,b2        ; upper/lower odd  -> upper odd
||      stw a2,*a12++[2]                
||      stw b2,*b12++[2]

        sub2.s1 a4,a3,a1        ; upper/lower even -> lower even
||      sub2.s2 b4,b3,b1        ; upper/lower odd  -> lower odd
||[b0]  sub.l2 b0,1,b0          ; 
||      stw a1,*a11++[2]
||      stw b1,*b11++[2]

        ldw.d1 *a14++[2],a4     ; upper even load                           
||      ldw.d2 *b14++[2],b4     ; upper odd  load
||[b0]  b      H6loop       

        ldw.d1 *a13++[2],a3     ; lower even load                           
||      ldw.d2 *b13++[2],b3     ; lower odd load

        ; Branch to H6loop occurs here                  
       
        ; Restore context       

        ldw.d2  *+b15[10],b3    ; pop b3 
        add.l1x b15,4,a8        ; copy stack pointer
        ldw.d2  *b15++[2],b14   ; pop b14 
||      ldw.d1  *a8++[2],a14    ; pop a14 
        ldw.d2  *b15++[2],b13   ; pop b13 
||      ldw.d1  *a8++[2],a13    ; pop a13 
        ldw.d2  *b15++[2],b12   ; pop b12 
||      ldw.d1  *a8++[2],a12    ; pop a12 
        ldw.d2  *b15++[2],b11   ; pop b11 
||      ldw.d1  *a8++[2],a11    ; pop a11 
||      b.s2     b3
        ldw.d2  *b15++[2],b10   ; pop b10 
||      ldw.d1  *a8,a10         ; pop a10 
        nop     4

        .end