Lines Matching +full:1 +full:b (evidently the Linux kernel's arch/arm64/crypto/crct10dif-ce-core.S; the number on each line below is its position in that file)
91 pmull2 \c64\().1q, \a16\().2d, \b64\().2d
92 pmull \b64\().1q, \a16\().1d, \b64\().1d
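These two lines are evidently the body of a pmull16x64_p64 macro: with the ARMv8 Crypto Extensions, a 16x64-bit carryless multiply is simply two 64x64-bit PMULLs, one per doubleword lane. A minimal intrinsics sketch of the same computation (names are mine, not the kernel's; build with -march=armv8-a+crypto):

    #include <arm_neon.h>

    /* Two independent 64x64 -> 128-bit carryless multiplies, as in the
     * pmull/pmull2 pair above.  Since a16 holds a 16-bit constant
     * zero-extended into each 64-bit lane, each product fits in 80 bits. */
    static inline void pmull16x64_p64_sketch(poly64x2_t a16, poly64x2_t b64,
                                             poly128_t *lo, poly128_t *hi)
    {
        *hi = vmull_high_p64(a16, b64);          /* pmull2: high lanes */
        *lo = vmull_p64(vgetq_lane_p64(a16, 0),  /* pmull: low lanes   */
                        vgetq_lane_p64(b64, 0));
    }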
114 *   1       (w0*x1 ^ w1*x0) <<  8 ^   | (y0*z1 ^ y1*z0) <<  8 ^
133 * b := { w0*x1, w0*x3, w0*x5, w0*x7 }, { y0*z1, y0*z3, y0*z5, y0*z7 }
137 * Results b and c can be XORed together, as the vector elements have
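These comment fragments come from the block that explains how a pairwise 16x64-bit carryless multiply is decomposed into 8x8-bit multiplies: each partial product w_i*x_j has a "rank" i+j that determines its byte shift, and partial-product vectors of equal rank (such as b and c above) can be combined with a single XOR. A scalar reference model of one such 16x64 multiply (illustration only, with hypothetical names):

    #include <stdint.h>

    /* 8x8 -> 16-bit carryless multiply, the primitive that pmull
     * provides per byte lane. */
    static uint16_t clmul8(uint8_t a, uint8_t b)
    {
        uint16_t r = 0;
        for (int i = 0; i < 8; i++)
            if (b & (1 << i))
                r ^= (uint16_t)a << i;
        return r;
    }

    /* 16x64 -> 80-bit carryless multiply of w = {w0,w1} by x = {x0..x7},
     * accumulating each 8x8 product at bit offset 8*(i+j), i.e. by rank. */
    static void clmul16x64(uint16_t w, uint64_t x, uint64_t *lo, uint64_t *hi)
    {
        uint8_t wb[2] = { (uint8_t)(w & 0xff), (uint8_t)(w >> 8) };

        *lo = *hi = 0;
        for (int i = 0; i < 2; i++) {
            for (int j = 0; j < 8; j++) {
                uint16_t p = clmul8(wb[i], (uint8_t)(x >> (8 * j)));
                int sh = 8 * (i + j);    /* rank i+j -> shift 8*(i+j) */

                if (sh < 64)
                    *lo ^= (uint64_t)p << sh;
                if (sh + 16 > 64)
                    *hi ^= (uint64_t)p >> (64 - sh);
            }
        }
    }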
149 ext t7.16b, \b64\().16b, \b64\().16b, #1
150 tbl t5.16b, {\a16\().16b}, perm.16b
151 uzp1 t7.16b, \b64\().16b, t7.16b
153 ext \b64\().16b, t4.16b, t4.16b, #15
154 eor \c64\().16b, t8.16b, t5.16b
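The ext/uzp1 pair above performs the input reorganization the comment block describes: rotating b64 left by one byte and then taking the even-indexed bytes of both copies yields all even-indexed bytes of b64 followed by all odd-indexed ones. The same shuffle with intrinsics (a sketch):

    #include <arm_neon.h>

    static inline uint8x16_t split_even_odd(uint8x16_t b)
    {
        uint8x16_t rot = vextq_u8(b, b, 1); /* rotate left one byte */
        return vuzp1q_u8(b, rot);           /* {b0,b2,..,b14, b1,b3,..,b15} */
    }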
158 ext t6.16b, t5.16b, t5.16b, #8
160 pmull t3.8h, t7.8b, t5.8b
161 pmull t4.8h, t7.8b, t6.8b
162 pmull2 t5.8h, t7.16b, t5.16b
163 pmull2 t6.8h, t7.16b, t6.16b
165 ext t8.16b, t3.16b, t3.16b, #8
166 eor t4.16b, t4.16b, t6.16b
167 ext t7.16b, t5.16b, t5.16b, #8
168 ext t6.16b, t4.16b, t4.16b, #8
169 eor t8.8b, t8.8b, t3.8b
170 eor t5.8b, t5.8b, t7.8b
171 eor t4.8b, t4.8b, t6.8b
172 ext t5.16b, t5.16b, t5.16b, #14
184 CPU_LE( rev64 v11.16b, v11.16b )
185 CPU_LE( rev64 v12.16b, v12.16b )
189 CPU_LE( ext v11.16b, v11.16b, v11.16b, #8 )
190 CPU_LE( ext v12.16b, v12.16b, v12.16b, #8 )
192 eor \reg1\().16b, \reg1\().16b, v8.16b
193 eor \reg2\().16b, \reg2\().16b, v9.16b
194 eor \reg1\().16b, \reg1\().16b, v11.16b
195 eor \reg2\().16b, \reg2\().16b, v12.16b
204 eor \dst_reg\().16b, \dst_reg\().16b, v8.16b
205 eor \dst_reg\().16b, \dst_reg\().16b, \src_reg\().16b
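Lines 192-195 and 204-205 are the XOR tails of the 32-byte and 16-byte folding macros. One fold step multiplies the accumulator by x^(k*128) modulo G(x), using the two 16-bit constants x^(k*128) mod G(x) and x^(k*128+64) mod G(x), then XORs in the next block of data. A scalar model, reusing the hypothetical clmul16x64() sketched earlier (data assumed already byte-swapped to MSB-first order; k is 8 for the 128-byte loop and 1 for the 16-byte loop):

    #include <stdint.h>

    struct u128 { uint64_t lo, hi; };

    /* acc' = acc.lo * (x^(k*128) mod G) ^ acc.hi * (x^(k*128+64) mod G)
     *        ^ data
     * which is congruent to acc * x^(k*128) ^ data  (mod G(x)). */
    static struct u128 fold_step(struct u128 acc, struct u128 data,
                                 uint16_t c_lo, uint16_t c_hi)
    {
        uint64_t lo_a, hi_a, lo_b, hi_b;

        clmul16x64(c_lo, acc.lo, &lo_a, &hi_a);
        clmul16x64(c_hi, acc.hi, &lo_b, &hi_b);

        acc.lo = lo_a ^ lo_b ^ data.lo;
        acc.hi = hi_a ^ hi_b ^ data.hi;
        return acc;
    }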
212 b.lt .Lless_than_256_bytes_\@
223 CPU_LE( rev64 v0.16b, v0.16b )
224 CPU_LE( rev64 v1.16b, v1.16b )
225 CPU_LE( rev64 v2.16b, v2.16b )
226 CPU_LE( rev64 v3.16b, v3.16b )
227 CPU_LE( rev64 v4.16b, v4.16b )
228 CPU_LE( rev64 v5.16b, v5.16b )
229 CPU_LE( rev64 v6.16b, v6.16b )
230 CPU_LE( rev64 v7.16b, v7.16b )
231 CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 )
232 CPU_LE( ext v1.16b, v1.16b, v1.16b, #8 )
233 CPU_LE( ext v2.16b, v2.16b, v2.16b, #8 )
234 CPU_LE( ext v3.16b, v3.16b, v3.16b, #8 )
235 CPU_LE( ext v4.16b, v4.16b, v4.16b, #8 )
236 CPU_LE( ext v5.16b, v5.16b, v5.16b, #8 )
237 CPU_LE( ext v6.16b, v6.16b, v6.16b, #8 )
238 CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
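Each CPU_LE(rev64 + ext #8) pair above is a full 16-byte byte reversal: rev64 reverses the bytes within each 64-bit lane, and ext #8 then swaps the lanes. On little-endian kernels this puts every block into the MSB-first bit order that CRC-T10DIF is defined over. The same operation with intrinsics (a sketch):

    #include <arm_neon.h>

    static inline uint8x16_t bswap128(uint8x16_t v)
    {
        v = vrev64q_u8(v);        /* reverse bytes within each lane */
        return vextq_u8(v, v, 8); /* swap the two 64-bit lanes      */
    }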
241 movi v8.16b, #0
243 eor v0.16b, v0.16b, v8.16b
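Between these two matched lines the full file presumably places the initial CRC into the top halfword of the zeroed v8 (a mov into v8.h[7] on the unmatched line in between), so the eor seeds the CRC into the first data block. In terms of the scalar fold model above:

    /* Seed the starting CRC into the top 16 bits of the first
     * (MSB-first) block; struct u128 as in fold_step(). */
    static struct u128 seed_crc(struct u128 first, uint16_t init_crc)
    {
        first.hi ^= (uint64_t)init_crc << 48;
        return first;
    }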
261 b.ge .Lfold_128_bytes_loop_\@
271 fold_16_bytes \p, v3, v7, 1
274 fold_16_bytes \p, v5, v7, 1
286 b.lt .Lfold_16_bytes_loop_done_\@
289 eor v7.16b, v7.16b, v8.16b
291 CPU_LE( rev64 v0.16b, v0.16b )
292 CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 )
293 eor v7.16b, v7.16b, v0.16b
295 b.ge .Lfold_16_bytes_loop_\@
301 b.eq .Lreduce_final_16_bytes_\@
304 // Reduce the last '16 + len' bytes where 1 <= len <= 15 and the first
313 CPU_LE( rev64 v0.16b, v0.16b )
314 CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 )
319 ld1 {v2.16b}, [x4]
320 tbl v1.16b, {v7.16b}, v2.16b
323 movi v3.16b, #0x80
324 eor v2.16b, v2.16b, v3.16b
325 tbl v3.16b, {v7.16b}, v2.16b
328 sshr v2.16b, v2.16b, #7
332 bsl v2.16b, v1.16b, v0.16b
336 eor v7.16b, v3.16b, v0.16b
337 eor v7.16b, v7.16b, v2.16b
338 b .Lreduce_final_16_bytes_\@
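Lines 313-337 handle a trailing partial block: rather than keeping a fold constant for every possible remainder length, the code re-splits the last 16+len bytes into a len-byte chunk and a 16-byte chunk, then folds the short chunk into the long one. A single index vector drives everything: tbl with it shifts left, tbl with it XORed by 0x80 shifts right (out-of-range tbl indices read as zero), and sshr #7 turns the same bits into a bsl blend mask. Sketched with intrinsics (my naming; idx would be the 16-byte window loaded from the byteshift table):

    #include <arm_neon.h>

    static void shift_and_blend(uint8x16_t v7, uint8x16_t v0, uint8x16_t idx,
                                uint8x16_t *first, uint8x16_t *second)
    {
        /* v7 shifted left by len bytes (lanes with idx >= 0x80 read 0) */
        uint8x16_t v1 = vqtbl1q_u8(v7, idx);

        /* flip the top bit: now the other lanes are out of range,
         * giving v7 shifted right by 16-len bytes */
        uint8x16_t idx2 = veorq_u8(idx, vdupq_n_u8(0x80));
        uint8x16_t v3 = vqtbl1q_u8(v7, idx2);

        /* broadcast idx2's sign bit into a per-byte blend mask */
        uint8x16_t mask = vreinterpretq_u8_s8(
                vshrq_n_s8(vreinterpretq_s8_u8(idx2), 7));

        *first = v3;                      /* the len-byte chunk        */
        *second = vbslq_u8(mask, v1, v0); /* shifted acc blended with tail */
    }

The first chunk is then multiplied by the fold constants and XORed into the second (lines 336-337).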
347 CPU_LE( rev64 v7.16b, v7.16b )
348 CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
351 movi v0.16b, #0
353 eor v7.16b, v7.16b, v0.16b
359 b.eq .Lreduce_final_16_bytes_\@ // len == 16
361 b.ge .Lfold_16_bytes_loop_\@ // 32 <= len <= 255
363 b .Lhandle_partial_segment_\@ // 17 <= len <= 31
374 frame_push 1
376 // Compose { 0,0,0,0, 8,8,8,8, 1,1,1,1, 9,9,9,9 }
378 orr perm.2s, #1, lsl #16
379 orr perm.2s, #1, lsl #24
380 zip1 perm.16b, perm.16b, perm.16b
381 zip1 perm.16b, perm.16b, perm.16b
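Lines 378-381, together with the unmatched movi just above them (which presumably sets each halfword of perm to 0x0800), build the tbl index vector { 0,0,0,0, 8,8,8,8, 1,1,1,1, 9,9,9,9 } without a memory load. A scalar walk-through of the same steps (a sketch):

    #include <stdint.h>

    static void compose_perm(uint8_t p[16])
    {
        /* movi perm.4h, #8, lsl #8: halfwords of 0x0800,
         * i.e. bytes { 0,8, 0,8, 0,8, 0,8 } in the low 64 bits */
        uint8_t v[8] = { 0, 8, 0, 8, 0, 8, 0, 8 };
        uint8_t t[16];

        v[2] |= 1; v[6] |= 1;   /* orr perm.2s, #1, lsl #16 */
        v[3] |= 1; v[7] |= 1;   /* orr perm.2s, #1, lsl #24 */

        /* each zip1 perm.16b, perm.16b, perm.16b doubles every byte
         * of the low half */
        for (int i = 0; i < 8; i++)
            t[2 * i] = t[2 * i + 1] = v[i];
        for (int i = 0; i < 8; i++)
            p[2 * i] = p[2 * i + 1] = t[i];
        /* result: { 0,0,0,0, 8,8,8,8, 1,1,1,1, 9,9,9,9 } */
    }

The p8 multiply macro uses this vector with tbl to replicate each byte of the two 16-bit fold constants four times, the layout its 8x8 pmulls need.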
385 CPU_LE( rev64 v7.16b, v7.16b )
386 CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
404 movi v2.16b, #0 // init zero register
412 ext v0.16b, v2.16b, v7.16b, #8
413 pmull2 v7.1q, v7.2d, fold_consts.2d // high bits * x^48 * (x^80 mod G(x))
414 eor v0.16b, v0.16b, v7.16b // + low bits * x^64
418 ext v1.16b, v0.16b, v2.16b, #12 // extract high 32 bits
420 pmull v1.1q, v1.1d, fold_consts.1d // high 32 bits * x^48 * (x^48 mod G(x))
421 eor v0.16b, v0.16b, v1.16b // + low bits
427 pmull2 v1.1q, v0.2d, fold_consts.2d // high 32 bits * floor(x^48 / G(x))
429 pmull v1.1q, v1.1d, fold_consts.1d // *= G(x)
431 eor v0.16b, v0.16b, v1.16b // + low 16 nonzero bits
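Lines 427-431 are Barrett reduction: rather than dividing by G(x), multiply the high bits by the precomputed floor(x^48 / G(x)) to recover the quotient q, then XOR away q*G(x), leaving the 16-bit remainder. Stripped of the x^48 lane-alignment scaling the vector constants carry (which is why the shift counts in the assembly differ), the arithmetic is (a sketch):

    #include <stdint.h>

    static uint64_t clmul64(uint64_t a, uint64_t b) /* low 64 bits only */
    {
        uint64_t r = 0;
        for (int i = 0; i < 64; i++)
            if (b & (1ULL << i))
                r ^= a << i;
        return r;
    }

    /* v: polynomial of degree < 48 to reduce; mu = floor(x^48 / G(x)),
     * g = G(x) = 0x18BB7.  Returns v mod G(x). */
    static uint16_t barrett_crc16(uint64_t v, uint64_t mu, uint64_t g)
    {
        uint64_t q = clmul64(v >> 16, mu) >> 32; /* q = floor(v / G(x)) */
        return (uint16_t)(v ^ clmul64(q, g));    /* v ^ q*G = v mod G   */
    }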
442 // G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0
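This is the CRC-T10DIF generator polynomial (0x18BB7), used by the SCSI Data Integrity Field. A bit-serial reference implementation is handy for checking any of the folded paths (a standard formulation, not the kernel's own fallback):

    #include <stddef.h>
    #include <stdint.h>

    static uint16_t crc_t10dif_ref(uint16_t crc, const uint8_t *p, size_t n)
    {
        while (n--) {
            crc ^= (uint16_t)(*p++) << 8;       /* MSB-first */
            for (int i = 0; i < 8; i++)
                crc = (crc & 0x8000)
                        ? (uint16_t)((crc << 1) ^ 0x8BB7)
                        : (uint16_t)(crc << 1);
        }
        return crc;
    }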
453 .quad 0x000000000000a010 // x^(1*128) mod G(x)
454 .quad 0x0000000000001faa // x^(1*128+64) mod G(x)
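Constants of the form x^N mod G(x), like the pair above, follow from simple modular exponentiation of x; a build-time sketch:

    #include <stdint.h>

    static uint16_t xpow_mod_g(unsigned int n)
    {
        uint32_t v = 1;                 /* x^0 */

        while (n--) {
            v <<= 1;                    /* multiply by x */
            if (v & 0x10000)
                v ^= 0x18BB7;           /* degree reached 16: fold G(x) in */
        }
        return (uint16_t)v;
    }

With the labels given in the comments, xpow_mod_g(128) and xpow_mod_g(192) should reproduce the two .quad values.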
462 // For 1 <= len <= 15, the 16-byte vector beginning at &byteshift_table[16 -
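The comment is cut off because only matching lines are listed; it describes the 32-byte table whose 16-byte windows serve as the tbl index vectors used in the partial-segment handling above (shift left by len, or, XORed with 0x80, shift right by 16-len). A generator for a table with that property (a sketch; entry 0 never falls inside any window, so its value is immaterial):

    #include <stdint.h>

    static void make_byteshift_table(uint8_t t[32])
    {
        for (int i = 0; i < 16; i++) {
            t[i] = 0x80 | i;  /* out-of-range tbl index: lane reads 0 */
            t[16 + i] = i;    /* identity index */
        }
    }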