/* Intel SIMD SSE2 implementation of Viterbi ACS butterflies
   for 256-state (k=9) convolutional code
   Copyright 2004 Phil Karn, KA9Q
   This code may be used under the terms of the GNU Lesser General Public License (LGPL)

   void update_viterbi29_blk_sse2(struct v29 *vp,unsigned char *syms,int nbits) ;

   Arguments are read from the stack relative to %ebp:
     8(%ebp)  vp     decoder state; if NULL, %eax is set to -1 and the
                     routine returns without touching anything else
     12(%ebp) syms   input symbols; two bytes are consumed per decoded bit
     16(%ebp) nbits  number of bits (trellis steps) to process; the stack
                     slot is used as the loop counter and is overwritten

   For every bit the routine runs 128 add-compare-select butterflies
   (8 macro invocations x 16 byte lanes), writes 256 new path metrics and
   32 bytes of decision bits, renormalizes the metrics if the first one
   exceeds 50, and swaps the old/new metric pointers.  On exit the metric
   pointers and the advanced decision pointer are stored back into *vp and
   %eax is 0.

   All metric and branch-table accesses use movdqa, so the metric buffers,
   Branchtab29_sse2 and thirtyones must be 16-byte aligned.
*/

	# SSE2 (128-bit integer SIMD) version
	# Requires Pentium 4 or better

	# These are offsets into struct v29, defined in viterbi29.h
	.set DP,512
	.set OLDMETRICS,516
	.set NEWMETRICS,520

	# Each invocation of this macro does 16 butterflies in parallel.
	# Register usage on entry:
	#   esi  -> old path metrics      edi -> new path metrics
	#   edx  -> decision words for the current bit
	#   xmm6 =  first symbol in all 16 bytes
	#   xmm5 =  second symbol in all 16 bytes
	#   xmm7 =  31 in all 16 bytes
	# Clobbers eax, ebx and xmm0-xmm4.
	# (Defining the macro emits no code; only the invocations below do.)
	.macro butterfly GROUP
	# compute branch metrics: XOR each expected symbol with the received
	# one, average the two results, then scale down to 5 bits
	movdqa Branchtab29_sse2+(16*\GROUP),%xmm4
	movdqa Branchtab29_sse2+128+(16*\GROUP),%xmm3
	pxor %xmm6,%xmm4
	pxor %xmm5,%xmm3
	pavgb %xmm3,%xmm4
	psrlw $3,%xmm4		# word shift; bits leaking across bytes are masked off next

	pand %xmm7,%xmm4	# xmm4 contains branch metrics (0..31)

	movdqa (16*\GROUP)(%esi),%xmm0		# Incoming path metric, high bit = 0
	movdqa ((16*\GROUP)+128)(%esi),%xmm3	# Incoming path metric, high bit = 1
	movdqa %xmm0,%xmm2
	movdqa %xmm3,%xmm1
	paddusb %xmm4,%xmm0
	paddusb %xmm4,%xmm3

	# invert branch metrics (31 - metric, since the metric is 5 bits)
	pxor %xmm7,%xmm4

	paddusb %xmm4,%xmm1
	paddusb %xmm4,%xmm2

	# Find survivors, leave in xmm0,xmm2
	pminub %xmm1,%xmm0
	pminub %xmm3,%xmm2
	# get decisions, leave in xmm1,xmm3
	# (0xff where the "high bit = 1" candidate equals the survivor)
	pcmpeqb %xmm0,%xmm1
	pcmpeqb %xmm2,%xmm3

	# interleave xmm0,xmm2 and store the 32 new path metrics
	movdqa %xmm0,%xmm4
	punpckhbw %xmm2,%xmm0	# interleave second 16 new metrics
	punpcklbw %xmm2,%xmm4	# interleave first 16 new metrics
	movdqa %xmm0,(32*\GROUP+16)(%edi)
	movdqa %xmm4,(32*\GROUP)(%edi)

	# interleave decisions & store as one 32-bit word
	movdqa %xmm1,%xmm4
	punpckhbw %xmm3,%xmm1
	punpcklbw %xmm3,%xmm4
	# work around bug in gas due to Intel doc error
	.byte 0x66,0x0f,0xd7,0xd9	# pmovmskb %xmm1,%ebx
	shll $16,%ebx
	.byte 0x66,0x0f,0xd7,0xc4	# pmovmskb %xmm4,%eax
	orl %eax,%ebx
	movl %ebx,(4*\GROUP)(%edx)
	.endm

	.text
	.global update_viterbi29_blk_sse2,Branchtab29_sse2
	.type update_viterbi29_blk_sse2,@function
	.align 16

update_viterbi29_blk_sse2:
	pushl %ebp
	movl %esp,%ebp
	pushl %esi
	pushl %edi
	pushl %edx
	pushl %ebx

	movl 8(%ebp),%edx	# edx = vp
	testl %edx,%edx
	jnz  0f
	# vp == NULL: return -1.  The '$' is required; without it the operand
	# is a memory reference and the load from address -1 faults.
	movl $-1,%eax
	jmp  err
0:	movl OLDMETRICS(%edx),%esi	# esi -> old metrics
	movl NEWMETRICS(%edx),%edi	# edi -> new metrics
	movl DP(%edx),%edx		# edx -> decisions

	# xmm7 = 31 in every byte.  Nothing in the loop writes xmm7, so it is
	# loaded once here rather than on every iteration.
	movdqa thirtyones,%xmm7

1:	movl 16(%ebp),%eax	# eax = nbits
	decl %eax
	jl   2f			# passed zero, we're done
	movl %eax,16(%ebp)

	xorl %eax,%eax
	movl 12(%ebp),%ebx	# ebx = syms
	movb (%ebx),%al
	movd %eax,%xmm6		# xmm6[0] = first symbol
	movb 1(%ebx),%al
	movd %eax,%xmm5		# xmm5[0] = second symbol
	addl $2,%ebx
	movl %ebx,12(%ebp)	# advance syms by the two bytes just consumed

	punpcklbw %xmm6,%xmm6	# xmm6[1] = xmm6[0]
	punpcklbw %xmm5,%xmm5
	pshuflw $0,%xmm6,%xmm6	# copy low word to all four low words
	pshuflw $0,%xmm5,%xmm5
	punpcklqdq %xmm6,%xmm6	# propagate to all 16
	punpcklqdq %xmm5,%xmm5
	# xmm6 now contains first symbol in each byte, xmm5 the second

	# invoke macro 8 times for a total of 128 butterflies
	butterfly GROUP=0
	butterfly GROUP=1
	butterfly GROUP=2
	butterfly GROUP=3
	butterfly GROUP=4
	butterfly GROUP=5
	butterfly GROUP=6
	butterfly GROUP=7

	addl $32,%edx		# bump decision pointer

	# see if we have to normalize
	movl (%edi),%eax	# extract first output metric
	andl $255,%eax
	cmp $50,%eax		# is it greater than 50?
	movl $0,%eax		# movl leaves the flags from cmp intact
	jle done		# No, no need to normalize

	# Normalize by finding smallest metric and subtracting it
	# from all metrics
	movdqa (%edi),%xmm0
	pminub 16(%edi),%xmm0
	pminub 32(%edi),%xmm0
	pminub 48(%edi),%xmm0
	pminub 64(%edi),%xmm0
	pminub 80(%edi),%xmm0
	pminub 96(%edi),%xmm0
	pminub 112(%edi),%xmm0
	pminub 128(%edi),%xmm0
	pminub 144(%edi),%xmm0
	pminub 160(%edi),%xmm0
	pminub 176(%edi),%xmm0
	pminub 192(%edi),%xmm0
	pminub 208(%edi),%xmm0
	pminub 224(%edi),%xmm0
	pminub 240(%edi),%xmm0

	# crunch down to single lowest metric (ends up in the low byte)
	movdqa %xmm0,%xmm1
	psrldq $8,%xmm0		# the count to psrldq is bytes, not bits!
	pminub %xmm1,%xmm0
	movdqa %xmm0,%xmm1
	psrlq $32,%xmm0
	pminub %xmm1,%xmm0
	movdqa %xmm0,%xmm1
	psrlq $16,%xmm0
	pminub %xmm1,%xmm0
	movdqa %xmm0,%xmm1
	psrlq $8,%xmm0
	pminub %xmm1,%xmm0

	punpcklbw %xmm0,%xmm0	# lowest 2 bytes
	pshuflw $0,%xmm0,%xmm0	# lowest 8 bytes
	punpcklqdq %xmm0,%xmm0	# all 16 bytes

	# xmm0 now contains lowest metric in all 16 bytes
	# subtract it from every output metric
	movdqa (%edi),%xmm1
	psubusb %xmm0,%xmm1
	movdqa %xmm1,(%edi)
	movdqa 16(%edi),%xmm1
	psubusb %xmm0,%xmm1
	movdqa %xmm1,16(%edi)
	movdqa 32(%edi),%xmm1
	psubusb %xmm0,%xmm1
	movdqa %xmm1,32(%edi)
	movdqa 48(%edi),%xmm1
	psubusb %xmm0,%xmm1
	movdqa %xmm1,48(%edi)
	movdqa 64(%edi),%xmm1
	psubusb %xmm0,%xmm1
	movdqa %xmm1,64(%edi)
	movdqa 80(%edi),%xmm1
	psubusb %xmm0,%xmm1
	movdqa %xmm1,80(%edi)
	movdqa 96(%edi),%xmm1
	psubusb %xmm0,%xmm1
	movdqa %xmm1,96(%edi)
	movdqa 112(%edi),%xmm1
	psubusb %xmm0,%xmm1
	movdqa %xmm1,112(%edi)
	movdqa 128(%edi),%xmm1
	psubusb %xmm0,%xmm1
	movdqa %xmm1,128(%edi)
	movdqa 144(%edi),%xmm1
	psubusb %xmm0,%xmm1
	movdqa %xmm1,144(%edi)
	movdqa 160(%edi),%xmm1
	psubusb %xmm0,%xmm1
	movdqa %xmm1,160(%edi)
	movdqa 176(%edi),%xmm1
	psubusb %xmm0,%xmm1
	movdqa %xmm1,176(%edi)
	movdqa 192(%edi),%xmm1
	psubusb %xmm0,%xmm1
	movdqa %xmm1,192(%edi)
	movdqa 208(%edi),%xmm1
	psubusb %xmm0,%xmm1
	movdqa %xmm1,208(%edi)
	movdqa 224(%edi),%xmm1
	psubusb %xmm0,%xmm1
	movdqa %xmm1,224(%edi)
	movdqa 240(%edi),%xmm1
	psubusb %xmm0,%xmm1
	movdqa %xmm1,240(%edi)

done:
	# swap metrics: this bit's output becomes the next bit's input
	movl %esi,%eax
	movl %edi,%esi
	movl %eax,%edi
	jmp 1b

2:	movl 8(%ebp),%ebx	# ebx = vp
	# stash metric pointers
	movl %esi,OLDMETRICS(%ebx)
	movl %edi,NEWMETRICS(%ebx)
	movl %edx,DP(%ebx)	# stash incremented value of vp->dp
	xorl %eax,%eax		# return 0
err:	popl %ebx
	popl %edx
	popl %edi
	popl %esi
	popl %ebp
	ret

	.data
	.align 16
	# 5-bit mask / inversion constant for the branch metrics
thirtyones:
	.byte 31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31