Lines Matching full:b
133 * b := { w0*x1, w0*x3, w0*x5, w0*x7 }, { y0*z1, y0*z3, y0*z5, y0*z7 }
137 * Results b and c can be XORed together, as the vector elements have
149 ext t7.16b, \b64\().16b, \b64\().16b, #1
150 tbl t5.16b, {\a16\().16b}, perm.16b
151 uzp1 t7.16b, \b64\().16b, t7.16b
153 ext \b64\().16b, t4.16b, t4.16b, #15
154 eor \c64\().16b, t8.16b, t5.16b
158 ext t6.16b, t5.16b, t5.16b, #8
160 pmull t3.8h, t7.8b, t5.8b
161 pmull t4.8h, t7.8b, t6.8b
162 pmull2 t5.8h, t7.16b, t5.16b
163 pmull2 t6.8h, t7.16b, t6.16b
165 ext t8.16b, t3.16b, t3.16b, #8
166 eor t4.16b, t4.16b, t6.16b
167 ext t7.16b, t5.16b, t5.16b, #8
168 ext t6.16b, t4.16b, t4.16b, #8
169 eor t8.8b, t8.8b, t3.8b
170 eor t5.8b, t5.8b, t7.8b
171 eor t4.8b, t4.8b, t6.8b
172 ext t5.16b, t5.16b, t5.16b, #14
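The comment fragments at source lines 133 and 137 above describe how a 16x64-bit carryless multiplication is decomposed into 8x8 partial products grouped by byte rank, and why the "b" and "c" groups can simply be XORed together; the pmull/pmull2 sequence ending at source line 172 implements that combination. A minimal scalar C sketch of the same decomposition follows, as an illustration only: the names clmul8() and clmul16x64() are invented for the sketch and are not taken from the source file.

#include <stdint.h>

/* 8x8 -> 16 bit carryless (polynomial) multiply, bit by bit */
static uint16_t clmul8(uint8_t a, uint8_t b)
{
        uint16_t r = 0;

        for (int i = 0; i < 8; i++)
                if (b & (1u << i))
                        r ^= (uint16_t)a << i;
        return r;
}

/*
 * 16x64 -> 80 bit carryless multiply of w = { w0, w1 } by the bytes
 * x0..x7 of x, accumulated into out[] as ten result bytes.  The partial
 * product w0*xi lands at byte rank i and w1*xi at byte rank i + 1;
 * partial products of equal rank are combined with XOR, which is what
 * allows the "b" and "c" groups in the comment to be merged.
 */
static void clmul16x64(uint16_t w, uint64_t x, uint8_t out[10])
{
        uint8_t w0 = (uint8_t)w, w1 = (uint8_t)(w >> 8);

        for (int i = 0; i < 10; i++)
                out[i] = 0;

        for (int i = 0; i < 8; i++) {
                uint8_t xi = (uint8_t)(x >> (8 * i));
                uint16_t p0 = clmul8(w0, xi);   /* rank i:     "a"/"b" terms */
                uint16_t p1 = clmul8(w1, xi);   /* rank i + 1: "c"/"d" terms */

                out[i]     ^= (uint8_t)p0;
                out[i + 1] ^= (uint8_t)(p0 >> 8) ^ (uint8_t)p1; /* same rank */
                out[i + 2] ^= (uint8_t)(p1 >> 8);
        }
}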
184 CPU_LE( rev64 v11.16b, v11.16b )
185 CPU_LE( rev64 v12.16b, v12.16b )
189 CPU_LE( ext v11.16b, v11.16b, v11.16b, #8 )
190 CPU_LE( ext v12.16b, v12.16b, v12.16b, #8 )
192 eor \reg1\().16b, \reg1\().16b, v8.16b
193 eor \reg2\().16b, \reg2\().16b, v9.16b
194 eor \reg1\().16b, \reg1\().16b, v11.16b
195 eor \reg2\().16b, \reg2\().16b, v12.16b
204 eor \dst_reg\().16b, \dst_reg\().16b, v8.16b
205 eor \dst_reg\().16b, \dst_reg\().16b, \src_reg\().16b
212 b.lt .Lless_than_256_bytes_\@
223 CPU_LE( rev64 v0.16b, v0.16b )
224 CPU_LE( rev64 v1.16b, v1.16b )
225 CPU_LE( rev64 v2.16b, v2.16b )
226 CPU_LE( rev64 v3.16b, v3.16b )
227 CPU_LE( rev64 v4.16b, v4.16b )
228 CPU_LE( rev64 v5.16b, v5.16b )
229 CPU_LE( rev64 v6.16b, v6.16b )
230 CPU_LE( rev64 v7.16b, v7.16b )
231 CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 )
232 CPU_LE( ext v1.16b, v1.16b, v1.16b, #8 )
233 CPU_LE( ext v2.16b, v2.16b, v2.16b, #8 )
234 CPU_LE( ext v3.16b, v3.16b, v3.16b, #8 )
235 CPU_LE( ext v4.16b, v4.16b, v4.16b, #8 )
236 CPU_LE( ext v5.16b, v5.16b, v5.16b, #8 )
237 CPU_LE( ext v6.16b, v6.16b, v6.16b, #8 )
238 CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
241 movi v8.16b, #0
243 eor v0.16b, v0.16b, v8.16b
261 b.ge .Lfold_128_bytes_loop_\@
286 b.lt .Lfold_16_bytes_loop_done_\@
289 eor v7.16b, v7.16b, v8.16b
291 CPU_LE( rev64 v0.16b, v0.16b )
292 CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 )
293 eor v7.16b, v7.16b, v0.16b
295 b.ge .Lfold_16_bytes_loop_\@
301 b.eq .Lreduce_final_16_bytes_\@
313 CPU_LE( rev64 v0.16b, v0.16b )
314 CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 )
319 ld1 {v2.16b}, [x4]
320 tbl v1.16b, {v7.16b}, v2.16b
323 movi v3.16b, #0x80
324 eor v2.16b, v2.16b, v3.16b
325 tbl v3.16b, {v7.16b}, v2.16b
328 sshr v2.16b, v2.16b, #7
332 bsl v2.16b, v1.16b, v0.16b
336 eor v7.16b, v3.16b, v0.16b
337 eor v7.16b, v7.16b, v2.16b
338 b .Lreduce_final_16_bytes_\@
347 CPU_LE( rev64 v7.16b, v7.16b )
348 CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
351 movi v0.16b, #0
353 eor v7.16b, v7.16b, v0.16b
359 b.eq .Lreduce_final_16_bytes_\@ // len == 16
361 b.ge .Lfold_16_bytes_loop_\@ // 32 <= len <= 255
363 b .Lhandle_partial_segment_\@ // 17 <= len <= 31
380 zip1 perm.16b, perm.16b, perm.16b
381 zip1 perm.16b, perm.16b, perm.16b
385 CPU_LE( rev64 v7.16b, v7.16b )
386 CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
404 movi v2.16b, #0 // init zero register
412 ext v0.16b, v2.16b, v7.16b, #8
414 eor v0.16b, v0.16b, v7.16b // + low bits * x^64
418 ext v1.16b, v0.16b, v2.16b, #12 // extract high 32 bits
421 eor v0.16b, v0.16b, v1.16b // + low bits
431 eor v0.16b, v0.16b, v1.16b // + low 16 nonzero bits
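The steps above (source lines 404-431) fold the remaining 128-bit value down and finish with a Barrett-style reduction to a 16-bit CRC. Assuming this listing comes from the arm64 pmull CRC-T10DIF code (the file name is not shown in the matches), the folded result must agree with a plain bit-serial CRC-16 over the T10-DIF generator polynomial; the sketch below is such a reference, with the function name crc_t10dif_ref() and the polynomial value 0x8BB7 being assumptions of the sketch rather than facts taken from the listing.

#include <stddef.h>
#include <stdint.h>

/*
 * Bit-serial CRC-16 with G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 +
 * x^5 + x^4 + x^2 + x + 1 (0x8BB7), MSB first, no reflection, no final
 * XOR -- the scalar reference the folded/Barrett-reduced SIMD path is
 * meant to match.
 */
static uint16_t crc_t10dif_ref(uint16_t crc, const uint8_t *buf, size_t len)
{
        for (size_t i = 0; i < len; i++) {
                crc ^= (uint16_t)buf[i] << 8;   /* feed next message byte */
                for (int bit = 0; bit < 8; bit++)
                        crc = (crc & 0x8000) ? (uint16_t)((crc << 1) ^ 0x8BB7)
                                             : (uint16_t)(crc << 1);
        }
        return crc;
}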