/* Intel SIMD SSE implementation of Viterbi ACS butterflies
   for 256-state (k=9) convolutional code
   Copyright 2004 Phil Karn, KA9Q
   This code may be used under the terms of the GNU Lesser General Public License (LGPL)

   void update_viterbi29_blk_sse(struct v29 *vp,unsigned char syms[],int nbits);

   Stack arguments (32-bit, frame pointer in %ebp):
	 8(%ebp)  vp     decoder state; only the three pointer fields at the
			 offsets DP/OLDMETRICS/NEWMETRICS below are touched
	12(%ebp)  syms   input symbols, two bytes consumed per decoded bit
	16(%ebp)  nbits  number of bits (trellis steps) to process

   For every bit the routine reads 256 one-byte path metrics from the "old"
   array, writes 256 updated metrics to the "new" array, appends 32 bytes
   (256 bits) of decisions at the decision pointer, and then swaps the roles
   of the two metric arrays.  On exit the three pointers are written back
   to *vp.

   Although the prototype above is written as void, the code leaves a status
   in %eax: 0 on normal completion, -1 when vp is NULL.

   The argument slots at 12(%ebp) and 16(%ebp) are used as scratch storage
   for the running symbol pointer and remaining bit count.
*/
	# SSE (64-bit integer SIMD) version
	# Requires Pentium III or better
	# These are offsets into struct v29, defined in viterbi29.h
	.set DP,512
	.set OLDMETRICS,516
	.set NEWMETRICS,520

	# ------------------------------------------------------------------
	# butterfly GROUP
	#
	# Each invocation of this macro does 8 butterflies in parallel.
	# Expects:
	#   %mm6  first symbol replicated into all 8 bytes
	#   %mm5  second symbol replicated into all 8 bytes
	#   %mm7  the constant 31 in all 8 bytes (thirtyones)
	#   %esi  -> old path metrics, %edi -> new path metrics
	# Produces:
	#   16 new path metrics stored at (16*GROUP)(%edi)
	#   16 decision bits OR-ed into %ebx at bit position (16*GROUP)&31
	# Clobbers %eax and %mm0-%mm4.
	# ------------------------------------------------------------------
	.macro butterfly GROUP
	# compute branch metrics
	movq Branchtab29_sse+(8*\GROUP),%mm4
	movq Branchtab29_sse+128+(8*\GROUP),%mm3
	pxor %mm6,%mm4
	pxor %mm5,%mm3
	pavgb %mm3,%mm4			# mm4 contains branch metrics
	# The word shift drags bits in from the neighbouring byte; the mask
	# with 31 discards them, leaving each byte shifted right by 3.
	psrlw $3,%mm4
	pand %mm7,%mm4

	movq (8*\GROUP)(%esi),%mm0	# Incoming path metric, high bit = 0
	movq ((8*\GROUP)+128)(%esi),%mm3 # Incoming path metric, high bit = 1
	movq %mm0,%mm2
	movq %mm3,%mm1
	paddusb %mm4,%mm0
	paddusb %mm4,%mm3

	# invert branch metrics. This works only because they're 5 bits
	pxor %mm7,%mm4

	paddusb %mm4,%mm1
	paddusb %mm4,%mm2

	# Find survivors, leave in mm0,2
	pminub %mm1,%mm0
	pminub %mm3,%mm2
	# get decisions, leave in mm1,3
	pcmpeqb %mm0,%mm1
	pcmpeqb %mm2,%mm3

	# interleave the new path metrics held in mm0,2 and store them
	movq %mm0,%mm4
	punpckhbw %mm2,%mm0		# interleave second 8 new metrics
	punpcklbw %mm2,%mm4		# interleave first 8 new metrics
	movq %mm0,(16*\GROUP+8)(%edi)
	movq %mm4,(16*\GROUP)(%edi)

	# interleave decisions, accumulate into %ebx
	movq %mm1,%mm4
	punpckhbw %mm3,%mm1
	punpcklbw %mm3,%mm4
	# Due to an error in the Intel instruction set ref (the register
	# fields are swapped), gas assembles pmovmskb incorrectly
	# See http://mail.gnu.org/pipermail/bug-gnu-utils/2000-August/002341.html
	.byte 0x0f,0xd7,0xc1		# pmovmskb %mm1,%eax
	shll $((16*\GROUP+8)&31),%eax
	orl %eax,%ebx
	.byte 0x0f,0xd7,0xc4		# pmovmskb %mm4,%eax
	shll $((16*\GROUP)&31),%eax
	orl %eax,%ebx
	.endm

	# ------------------------------------------------------------------
	# PSUBUSBM REG,MEM
	#
	# Saturating unsigned byte subtract of REG from the 8 bytes at MEM,
	# result written back to MEM.
	# Trashes %mm7
	# ------------------------------------------------------------------
	.macro PSUBUSBM REG,MEM
	movq \MEM,%mm7
	psubusb \REG,%mm7
	movq %mm7,\MEM
	.endm

	.text
	.global update_viterbi29_blk_sse,Branchtab29_sse
	.type update_viterbi29_blk_sse,@function
	.align 16

update_viterbi29_blk_sse:
	pushl %ebp
	movl %esp,%ebp
	pushl %esi
	pushl %edi
	pushl %edx
	pushl %ebx

	movl 8(%ebp),%edx		# edx = vp
	testl %edx,%edx
	jnz 0f
	# NULL vp: return -1.  The '$' is essential; without it the operand
	# is a memory reference to absolute address -1 and the load faults.
	movl $-1,%eax
	jmp err
0:	movl OLDMETRICS(%edx),%esi	# esi -> old metrics
	movl NEWMETRICS(%edx),%edi	# edi -> new metrics
	movl DP(%edx),%edx		# edx -> decisions

	# ---- top of per-bit loop ----
1:	movl 16(%ebp),%eax		# eax = nbits
	decl %eax
	jl 2f				# passed zero, we're done
	movl %eax,16(%ebp)

	xorl %eax,%eax
	movl 12(%ebp),%ebx		# ebx = syms
	movb (%ebx),%al
	movd %eax,%mm6			# mm6[0] = first symbol
	movb 1(%ebx),%al
	movd %eax,%mm5			# mm5[0] = second symbol
	addl $2,%ebx
	movl %ebx,12(%ebp)

	punpcklbw %mm6,%mm6		# mm6[1] = mm6[0]
	punpcklbw %mm5,%mm5

	# mm7 is reloaded on every pass because PSUBUSBM trashes it
	movq thirtyones,%mm7
	pshufw $0,%mm6,%mm6		# copy low word to upper 3
	pshufw $0,%mm5,%mm5
	# mm6 now contains first symbol in each byte, mm5 the second

	# invoke macro 16 times for a total of 128 butterflies;
	# every pair of invocations fills one 32-bit decision word
	xorl %ebx,%ebx			# clear decisions
	butterfly GROUP=0
	butterfly GROUP=1
	movl %ebx,(%edx)		# stash decision word 0
	xorl %ebx,%ebx
	butterfly GROUP=2
	butterfly GROUP=3
	movl %ebx,4(%edx)		# stash decision word 1
	xorl %ebx,%ebx
	butterfly GROUP=4
	butterfly GROUP=5
	movl %ebx,8(%edx)		# stash decision word 2
	xorl %ebx,%ebx
	butterfly GROUP=6
	butterfly GROUP=7
	movl %ebx,12(%edx)		# stash decision word 3
	xorl %ebx,%ebx
	butterfly GROUP=8
	butterfly GROUP=9
	movl %ebx,16(%edx)		# stash decision word 4
	xorl %ebx,%ebx
	butterfly GROUP=10
	butterfly GROUP=11
	movl %ebx,20(%edx)		# stash decision word 5
	xorl %ebx,%ebx
	butterfly GROUP=12
	butterfly GROUP=13
	movl %ebx,24(%edx)		# stash decision word 6
	xorl %ebx,%ebx
	butterfly GROUP=14
	butterfly GROUP=15
	movl %ebx,28(%edx)		# stash decision word 7

	addl $32,%edx			# bump decision pointer

	# see if we have to normalize
	movl (%edi),%eax		# extract first output metric
	andl $255,%eax
	cmp $50,%eax			# is it greater than 50?
	# mov does not modify the flags, so the jle below still tests the
	# cmp above.  %eax is overwritten at "done" before it is read again.
	movl $0,%eax
	jle done			# No, no need to normalize

	# Normalize by finding smallest metric and subtracting it
	# from all metrics
	movq (%edi),%mm0
	pminub 8(%edi),%mm0
	pminub 16(%edi),%mm0
	pminub 24(%edi),%mm0
	pminub 32(%edi),%mm0
	pminub 40(%edi),%mm0
	pminub 48(%edi),%mm0
	pminub 56(%edi),%mm0
	pminub 64(%edi),%mm0
	pminub 72(%edi),%mm0
	pminub 80(%edi),%mm0
	pminub 88(%edi),%mm0
	pminub 96(%edi),%mm0
	pminub 104(%edi),%mm0
	pminub 112(%edi),%mm0
	pminub 120(%edi),%mm0
	pminub 128(%edi),%mm0
	pminub 136(%edi),%mm0
	pminub 144(%edi),%mm0
	pminub 152(%edi),%mm0
	pminub 160(%edi),%mm0
	pminub 168(%edi),%mm0
	pminub 176(%edi),%mm0
	pminub 184(%edi),%mm0
	pminub 192(%edi),%mm0
	pminub 200(%edi),%mm0
	pminub 208(%edi),%mm0
	pminub 216(%edi),%mm0
	pminub 224(%edi),%mm0
	pminub 232(%edi),%mm0
	pminub 240(%edi),%mm0
	pminub 248(%edi),%mm0
	# mm0 contains 8 smallest metrics
	# crunch down to single lowest metric (ends up in the low byte)
	movq %mm0,%mm1
	psrlq $32,%mm0
	pminub %mm1,%mm0
	movq %mm0,%mm1
	psrlq $16,%mm0
	pminub %mm1,%mm0
	movq %mm0,%mm1
	psrlq $8,%mm0
	pminub %mm1,%mm0
	# NOTE(review): nothing reads %mm1 before the next butterfly
	# overwrites it, so this load has no effect on the result.
	# Kept as in the original.
	movq 8(%edi),%mm1		# reload
	punpcklbw %mm0,%mm0		# expand to all 8 bytes
	pshufw $0,%mm0,%mm0

	# mm0 now contains lowest metric in all 8 bytes
	# subtract it from every output metric
	PSUBUSBM %mm0,(%edi)
	PSUBUSBM %mm0,8(%edi)
	PSUBUSBM %mm0,16(%edi)
	PSUBUSBM %mm0,24(%edi)
	PSUBUSBM %mm0,32(%edi)
	PSUBUSBM %mm0,40(%edi)
	PSUBUSBM %mm0,48(%edi)
	PSUBUSBM %mm0,56(%edi)
	PSUBUSBM %mm0,64(%edi)
	PSUBUSBM %mm0,72(%edi)
	PSUBUSBM %mm0,80(%edi)
	PSUBUSBM %mm0,88(%edi)
	PSUBUSBM %mm0,96(%edi)
	PSUBUSBM %mm0,104(%edi)
	PSUBUSBM %mm0,112(%edi)
	PSUBUSBM %mm0,120(%edi)
	PSUBUSBM %mm0,128(%edi)
	PSUBUSBM %mm0,136(%edi)
	PSUBUSBM %mm0,144(%edi)
	PSUBUSBM %mm0,152(%edi)
	PSUBUSBM %mm0,160(%edi)
	PSUBUSBM %mm0,168(%edi)
	PSUBUSBM %mm0,176(%edi)
	PSUBUSBM %mm0,184(%edi)
	PSUBUSBM %mm0,192(%edi)
	PSUBUSBM %mm0,200(%edi)
	PSUBUSBM %mm0,208(%edi)
	PSUBUSBM %mm0,216(%edi)
	PSUBUSBM %mm0,224(%edi)
	PSUBUSBM %mm0,232(%edi)
	PSUBUSBM %mm0,240(%edi)
	PSUBUSBM %mm0,248(%edi)

done:
	# swap metrics
	movl %esi,%eax
	movl %edi,%esi
	movl %eax,%edi
	jmp 1b

	# ---- all bits processed ----
2:	emms				# leave MMX state so x87 code can run
	movl 8(%ebp),%ebx		# ebx = vp
	# stash metric pointers
	movl %esi,OLDMETRICS(%ebx)
	movl %edi,NEWMETRICS(%ebx)
	movl %edx,DP(%ebx)		# stash incremented value of vp->dp
	xorl %eax,%eax			# return 0
	# Common exit.  The NULL-vp path arrives here before any MMX
	# instruction has executed, so it does not need emms.
err:	popl %ebx
	popl %edx
	popl %edi
	popl %esi
	popl %ebp
	ret
	.size update_viterbi29_blk_sse,.-update_viterbi29_blk_sse

	.data
	.align 8
	# Mask that keeps the low 5 bits of each byte; also used to invert
	# the 5-bit branch metrics (x xor 31 == 31 - x for 0 <= x <= 31).
thirtyones:
	.byte 31,31,31,31,31,31,31,31

	# Mark the object as not requiring an executable stack.
	.section .note.GNU-stack,"",@progbits