Lines Matching full:d0

45       "vld1.32 {d0}, [%[in]]!\n"  in Pack()
46 "vaddw.u8 q8, q8, d0\n" in Pack()
47 "vst1.32 {d0}, [%[out]:64]!\n" in Pack()
52 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
57 "vmul.i32 q8, q8, d0[0]\n" in Pack()
64 : "d0", "d2", "d3", "d16", "d17", "cc", "memory"); in Pack()
90 "vld1.32 {d0}, [%[in]]!\n" in Pack()
91 "vaddw.u8 q8, q8, d0\n" in Pack()
92 "vst1.32 {d0}, [%[out]:64]!\n" in Pack()
99 "vmov.i8 d0, #0\n" in Pack()
100 "vld1.8 {d0[0]}, [%[in]]!\n" in Pack()
101 "vaddw.u8 q8, q8, d0\n" in Pack()
102 "vst1.32 {d0}, [%[out]:64]!\n" in Pack()
105 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
110 "vmul.i32 q8, q8, d0[0]\n" in Pack()
117 : "d0", "d2", "d3", "d16", "d17", "cc", "memory"); in Pack()
143 "vld1.32 {d0}, [%[in]]!\n" in Pack()
144 "vaddw.u8 q8, q8, d0\n" in Pack()
145 "vst1.32 {d0}, [%[out]:64]!\n" in Pack()
152 "vmov.i8 d0, #0\n" in Pack()
153 "vld1.16 {d0[0]}, [%[in]]!\n" in Pack()
154 "vaddw.u8 q8, q8, d0\n" in Pack()
155 "vst1.32 {d0}, [%[out]:64]!\n" in Pack()
158 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
163 "vmul.i32 q8, q8, d0[0]\n" in Pack()
170 : "d0", "d2", "d3", "d16", "d17", "cc", "memory"); in Pack()
196 "vld1.32 {d0}, [%[in]]!\n" in Pack()
197 "vaddw.u8 q8, q8, d0\n" in Pack()
198 "vst1.32 {d0}, [%[out]:64]!\n" in Pack()
205 "vmov.i8 d0, #0\n" in Pack()
206 "vld1.16 {d0[0]}, [%[in]]!\n" in Pack()
207 "vld1.8 {d0[2]}, [%[in]]!\n" in Pack()
208 "vaddw.u8 q8, q8, d0\n" in Pack()
209 "vst1.32 {d0}, [%[out]:64]!\n" in Pack()
212 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
217 "vmul.i32 q8, q8, d0[0]\n" in Pack()
224 : "d0", "d2", "d3", "d16", "d17", "cc", "memory"); in Pack()
250 "vld1.32 {d0}, [%[in]]!\n" in Pack()
251 "vaddw.u8 q8, q8, d0\n" in Pack()
252 "vst1.32 {d0}, [%[out]:64]!\n" in Pack()
259 "vmov.i8 d0, #0\n" in Pack()
260 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
261 "vaddw.u8 q8, q8, d0\n" in Pack()
262 "vst1.32 {d0}, [%[out]:64]!\n" in Pack()
265 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
270 "vmul.i32 q8, q8, d0[0]\n" in Pack()
277 : "d0", "d2", "d3", "d16", "d17", "cc", "memory"); in Pack()
303 "vld1.32 {d0}, [%[in]]!\n" in Pack()
304 "vaddw.u8 q8, q8, d0\n" in Pack()
305 "vst1.32 {d0}, [%[out]:64]!\n" in Pack()
312 "vmov.i8 d0, #0\n" in Pack()
313 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
314 "vld1.8 {d0[4]}, [%[in]]!\n" in Pack()
315 "vaddw.u8 q8, q8, d0\n" in Pack()
316 "vst1.32 {d0}, [%[out]:64]!\n" in Pack()
319 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
324 "vmul.i32 q8, q8, d0[0]\n" in Pack()
331 : "d0", "d2", "d3", "d16", "d17", "cc", "memory"); in Pack()
357 "vld1.32 {d0}, [%[in]]!\n" in Pack()
358 "vaddw.u8 q8, q8, d0\n" in Pack()
359 "vst1.32 {d0}, [%[out]:64]!\n" in Pack()
366 "vmov.i8 d0, #0\n" in Pack()
367 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
368 "vld1.16 {d0[2]}, [%[in]]!\n" in Pack()
369 "vaddw.u8 q8, q8, d0\n" in Pack()
370 "vst1.32 {d0}, [%[out]:64]!\n" in Pack()
373 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
378 "vmul.i32 q8, q8, d0[0]\n" in Pack()
385 : "d0", "d2", "d3", "d16", "d17", "cc", "memory"); in Pack()
411 "vld1.32 {d0}, [%[in]]!\n" in Pack()
412 "vaddw.u8 q8, q8, d0\n" in Pack()
413 "vst1.32 {d0}, [%[out]:64]!\n" in Pack()
420 "vmov.i8 d0, #0\n" in Pack()
421 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
422 "vld1.16 {d0[2]}, [%[in]]!\n" in Pack()
423 "vld1.8 {d0[6]}, [%[in]]!\n" in Pack()
424 "vaddw.u8 q8, q8, d0\n" in Pack()
425 "vst1.32 {d0}, [%[out]:64]!\n" in Pack()
428 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
433 "vmul.i32 q8, q8, d0[0]\n" in Pack()
440 : "d0", "d2", "d3", "d16", "d17", "cc", "memory"); in Pack()
464 "vld1.32 {d0}, [%[in]]!\n" in Pack()
466 "vaddw.u8 q8, q8, d0\n" in Pack()
468 "vst1.32 {d0, d1}, [%[out]:128]!\n" in Pack()
473 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
480 "vmul.i32 q8, q8, d0[0]\n" in Pack()
487 : "r0", "d0", "d1", "d2", "d3", "d16", "d17", "d18", "d19", "cc", in Pack()
516 "vld1.32 {d0}, [%[in]]!\n" in Pack()
518 "vaddw.u8 q8, q8, d0\n" in Pack()
520 "vst1.32 {d0, d1}, [%[out]:128]!\n" in Pack()
527 "vmov.i8 d0, #0\n" in Pack()
529 "vld1.8 {d0[0]}, [%[in]]!\n" in Pack()
531 "vaddw.u8 q8, q8, d0\n" in Pack()
533 "vst1.32 {d0, d1}, [%[out]:128]!\n" in Pack()
536 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
543 "vmul.i32 q8, q8, d0[0]\n" in Pack()
550 : "r0", "d0", "d1", "d2", "d3", "d16", "d17", "d18", "d19", "cc", in Pack()
579 "vld1.32 {d0}, [%[in]]!\n" in Pack()
581 "vaddw.u8 q8, q8, d0\n" in Pack()
583 "vst1.32 {d0, d1}, [%[out]:128]!\n" in Pack()
590 "vmov.i8 d0, #0\n" in Pack()
592 "vld1.16 {d0[0]}, [%[in]]!\n" in Pack()
594 "vaddw.u8 q8, q8, d0\n" in Pack()
596 "vst1.32 {d0, d1}, [%[out]:128]!\n" in Pack()
599 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
606 "vmul.i32 q8, q8, d0[0]\n" in Pack()
613 : "r0", "d0", "d1", "d2", "d3", "d16", "d17", "d18", "d19", "cc", in Pack()
642 "vld1.32 {d0}, [%[in]]!\n" in Pack()
644 "vaddw.u8 q8, q8, d0\n" in Pack()
646 "vst1.32 {d0, d1}, [%[out]:128]!\n" in Pack()
653 "vmov.i8 d0, #0\n" in Pack()
655 "vld1.16 {d0[0]}, [%[in]]!\n" in Pack()
656 "vld1.8 {d0[2]}, [%[in]]!\n" in Pack()
659 "vaddw.u8 q8, q8, d0\n" in Pack()
661 "vst1.32 {d0, d1}, [%[out]:128]!\n" in Pack()
664 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
671 "vmul.i32 q8, q8, d0[0]\n" in Pack()
678 : "r0", "d0", "d1", "d2", "d3", "d16", "d17", "d18", "d19", "cc", in Pack()
707 "vld1.32 {d0}, [%[in]]!\n" in Pack()
709 "vaddw.u8 q8, q8, d0\n" in Pack()
711 "vst1.32 {d0, d1}, [%[out]:128]!\n" in Pack()
718 "vmov.i8 d0, #0\n" in Pack()
720 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
722 "vaddw.u8 q8, q8, d0\n" in Pack()
724 "vst1.32 {d0, d1}, [%[out]:128]!\n" in Pack()
727 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
734 "vmul.i32 q8, q8, d0[0]\n" in Pack()
741 : "r0", "d0", "d1", "d2", "d3", "d16", "d17", "d18", "d19", "cc", in Pack()
770 "vld1.32 {d0}, [%[in]]!\n" in Pack()
772 "vaddw.u8 q8, q8, d0\n" in Pack()
774 "vst1.32 {d0, d1}, [%[out]:128]!\n" in Pack()
781 "vmov.i8 d0, #0\n" in Pack()
783 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
784 "vld1.8 {d0[4]}, [%[in]]!\n" in Pack()
787 "vaddw.u8 q8, q8, d0\n" in Pack()
789 "vst1.32 {d0, d1}, [%[out]:128]!\n" in Pack()
792 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
799 "vmul.i32 q8, q8, d0[0]\n" in Pack()
806 : "r0", "d0", "d1", "d2", "d3", "d16", "d17", "d18", "d19", "cc", in Pack()
835 "vld1.32 {d0}, [%[in]]!\n" in Pack()
837 "vaddw.u8 q8, q8, d0\n" in Pack()
839 "vst1.32 {d0, d1}, [%[out]:128]!\n" in Pack()
846 "vmov.i8 d0, #0\n" in Pack()
848 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
849 "vld1.16 {d0[2]}, [%[in]]!\n" in Pack()
852 "vaddw.u8 q8, q8, d0\n" in Pack()
854 "vst1.32 {d0, d1}, [%[out]:128]!\n" in Pack()
857 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
864 "vmul.i32 q8, q8, d0[0]\n" in Pack()
871 : "r0", "d0", "d1", "d2", "d3", "d16", "d17", "d18", "d19", "cc", in Pack()
900 "vld1.32 {d0}, [%[in]]!\n" in Pack()
902 "vaddw.u8 q8, q8, d0\n" in Pack()
904 "vst1.32 {d0, d1}, [%[out]:128]!\n" in Pack()
911 "vmov.i8 d0, #0\n" in Pack()
913 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
914 "vld1.16 {d0[2]}, [%[in]]!\n" in Pack()
915 "vld1.8 {d0[6]}, [%[in]]!\n" in Pack()
919 "vaddw.u8 q8, q8, d0\n" in Pack()
921 "vst1.32 {d0, d1}, [%[out]:128]!\n" in Pack()
924 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
931 "vmul.i32 q8, q8, d0[0]\n" in Pack()
938 : "r0", "d0", "d1", "d2", "d3", "d16", "d17", "d18", "d19", "cc", in Pack()
965 "vld1.32 {d0}, [%[in]]!\n" in Pack()
968 "vaddw.u8 q8, q8, d0\n" in Pack()
971 "vst1.32 {d0, d1, d2}, [%[out]:64]!\n" in Pack()
976 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
986 "vmul.i32 q8, q8, d0[0]\n" in Pack()
993 : "r0", "r1", "d0", "d1", "d2", "d3", "d16", "d17", "d18", "d19", "d20", in Pack()
1024 "vld1.32 {d0}, [%[in]]!\n" in Pack()
1027 "vaddw.u8 q8, q8, d0\n" in Pack()
1030 "vst1.32 {d0, d1, d2}, [%[out]:64]!\n" in Pack()
1037 "vmov.i8 d0, #0\n" in Pack()
1040 "vld1.8 {d0[0]}, [%[in]]!\n" in Pack()
1043 "vaddw.u8 q8, q8, d0\n" in Pack()
1046 "vst1.32 {d0, d1, d2}, [%[out]:64]!\n" in Pack()
1049 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
1059 "vmul.i32 q8, q8, d0[0]\n" in Pack()
1066 : "r0", "r1", "d0", "d1", "d2", "d3", "d16", "d17", "d18", "d19", "d20", in Pack()
1097 "vld1.32 {d0}, [%[in]]!\n" in Pack()
1100 "vaddw.u8 q8, q8, d0\n" in Pack()
1103 "vst1.32 {d0, d1, d2}, [%[out]:64]!\n" in Pack()
1110 "vmov.i8 d0, #0\n" in Pack()
1113 "vld1.16 {d0[0]}, [%[in]]!\n" in Pack()
1116 "vaddw.u8 q8, q8, d0\n" in Pack()
1119 "vst1.32 {d0, d1, d2}, [%[out]:64]!\n" in Pack()
1122 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
1132 "vmul.i32 q8, q8, d0[0]\n" in Pack()
1139 : "r0", "r1", "d0", "d1", "d2", "d3", "d16", "d17", "d18", "d19", "d20", in Pack()
1170 "vld1.32 {d0}, [%[in]]!\n" in Pack()
1173 "vaddw.u8 q8, q8, d0\n" in Pack()
1176 "vst1.32 {d0, d1, d2}, [%[out]:64]!\n" in Pack()
1183 "vmov.i8 d0, #0\n" in Pack()
1186 "vld1.16 {d0[0]}, [%[in]]!\n" in Pack()
1187 "vld1.8 {d0[2]}, [%[in]]!\n" in Pack()
1192 "vaddw.u8 q8, q8, d0\n" in Pack()
1195 "vst1.32 {d0, d1, d2}, [%[out]:64]!\n" in Pack()
1198 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
1208 "vmul.i32 q8, q8, d0[0]\n" in Pack()
1215 : "r0", "r1", "d0", "d1", "d2", "d3", "d16", "d17", "d18", "d19", "d20", in Pack()
1246 "vld1.32 {d0}, [%[in]]!\n" in Pack()
1249 "vaddw.u8 q8, q8, d0\n" in Pack()
1252 "vst1.32 {d0, d1, d2}, [%[out]:64]!\n" in Pack()
1259 "vmov.i8 d0, #0\n" in Pack()
1262 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
1265 "vaddw.u8 q8, q8, d0\n" in Pack()
1268 "vst1.32 {d0, d1, d2}, [%[out]:64]!\n" in Pack()
1271 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
1281 "vmul.i32 q8, q8, d0[0]\n" in Pack()
1288 : "r0", "r1", "d0", "d1", "d2", "d3", "d16", "d17", "d18", "d19", "d20", in Pack()
1319 "vld1.32 {d0}, [%[in]]!\n" in Pack()
1322 "vaddw.u8 q8, q8, d0\n" in Pack()
1325 "vst1.32 {d0, d1, d2}, [%[out]:64]!\n" in Pack()
1332 "vmov.i8 d0, #0\n" in Pack()
1335 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
1336 "vld1.8 {d0[4]}, [%[in]]!\n" in Pack()
1341 "vaddw.u8 q8, q8, d0\n" in Pack()
1344 "vst1.32 {d0, d1, d2}, [%[out]:64]!\n" in Pack()
1347 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
1357 "vmul.i32 q8, q8, d0[0]\n" in Pack()
1364 : "r0", "r1", "d0", "d1", "d2", "d3", "d16", "d17", "d18", "d19", "d20", in Pack()
1395 "vld1.32 {d0}, [%[in]]!\n" in Pack()
1398 "vaddw.u8 q8, q8, d0\n" in Pack()
1401 "vst1.32 {d0, d1, d2}, [%[out]:64]!\n" in Pack()
1408 "vmov.i8 d0, #0\n" in Pack()
1411 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
1412 "vld1.16 {d0[2]}, [%[in]]!\n" in Pack()
1417 "vaddw.u8 q8, q8, d0\n" in Pack()
1420 "vst1.32 {d0, d1, d2}, [%[out]:64]!\n" in Pack()
1423 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
1433 "vmul.i32 q8, q8, d0[0]\n" in Pack()
1440 : "r0", "r1", "d0", "d1", "d2", "d3", "d16", "d17", "d18", "d19", "d20", in Pack()
1471 "vld1.32 {d0}, [%[in]]!\n" in Pack()
1474 "vaddw.u8 q8, q8, d0\n" in Pack()
1477 "vst1.32 {d0, d1, d2}, [%[out]:64]!\n" in Pack()
1484 "vmov.i8 d0, #0\n" in Pack()
1487 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
1488 "vld1.16 {d0[2]}, [%[in]]!\n" in Pack()
1489 "vld1.8 {d0[6]}, [%[in]]!\n" in Pack()
1496 "vaddw.u8 q8, q8, d0\n" in Pack()
1499 "vst1.32 {d0, d1, d2}, [%[out]:64]!\n" in Pack()
1502 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
1512 "vmul.i32 q8, q8, d0[0]\n" in Pack()
1519 : "r0", "r1", "d0", "d1", "d2", "d3", "d16", "d17", "d18", "d19", "d20", in Pack()
1548 "vld1.32 {d0}, [%[in]]!\n" in Pack()
1552 "vaddw.u8 q8, q8, d0\n" in Pack()
1556 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
1561 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
1573 "vmul.i32 q8, q8, d0[0]\n" in Pack()
1580 : "r0", "r1", "r2", "d0", "d1", "d2", "d3", "d16", "d17", "d18", "d19", in Pack()
1613 "vld1.32 {d0}, [%[in]]!\n" in Pack()
1617 "vaddw.u8 q8, q8, d0\n" in Pack()
1621 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
1628 "vmov.i8 d0, #0\n" in Pack()
1632 "vld1.8 {d0[0]}, [%[in]]!\n" in Pack()
1636 "vaddw.u8 q8, q8, d0\n" in Pack()
1640 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
1643 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
1655 "vmul.i32 q8, q8, d0[0]\n" in Pack()
1662 : "r0", "r1", "r2", "d0", "d1", "d2", "d3", "d16", "d17", "d18", "d19", in Pack()
1695 "vld1.32 {d0}, [%[in]]!\n" in Pack()
1699 "vaddw.u8 q8, q8, d0\n" in Pack()
1703 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
1710 "vmov.i8 d0, #0\n" in Pack()
1714 "vld1.16 {d0[0]}, [%[in]]!\n" in Pack()
1718 "vaddw.u8 q8, q8, d0\n" in Pack()
1722 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
1725 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
1737 "vmul.i32 q8, q8, d0[0]\n" in Pack()
1744 : "r0", "r1", "r2", "d0", "d1", "d2", "d3", "d16", "d17", "d18", "d19", in Pack()
1777 "vld1.32 {d0}, [%[in]]!\n" in Pack()
1781 "vaddw.u8 q8, q8, d0\n" in Pack()
1785 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
1792 "vmov.i8 d0, #0\n" in Pack()
1796 "vld1.16 {d0[0]}, [%[in]]!\n" in Pack()
1797 "vld1.8 {d0[2]}, [%[in]]!\n" in Pack()
1804 "vaddw.u8 q8, q8, d0\n" in Pack()
1808 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
1811 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
1823 "vmul.i32 q8, q8, d0[0]\n" in Pack()
1830 : "r0", "r1", "r2", "d0", "d1", "d2", "d3", "d16", "d17", "d18", "d19", in Pack()
1863 "vld1.32 {d0}, [%[in]]!\n" in Pack()
1867 "vaddw.u8 q8, q8, d0\n" in Pack()
1871 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
1878 "vmov.i8 d0, #0\n" in Pack()
1882 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
1886 "vaddw.u8 q8, q8, d0\n" in Pack()
1890 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
1893 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
1905 "vmul.i32 q8, q8, d0[0]\n" in Pack()
1912 : "r0", "r1", "r2", "d0", "d1", "d2", "d3", "d16", "d17", "d18", "d19", in Pack()
1945 "vld1.32 {d0}, [%[in]]!\n" in Pack()
1949 "vaddw.u8 q8, q8, d0\n" in Pack()
1953 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
1960 "vmov.i8 d0, #0\n" in Pack()
1964 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
1965 "vld1.8 {d0[4]}, [%[in]]!\n" in Pack()
1972 "vaddw.u8 q8, q8, d0\n" in Pack()
1976 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
1979 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
1991 "vmul.i32 q8, q8, d0[0]\n" in Pack()
1998 : "r0", "r1", "r2", "d0", "d1", "d2", "d3", "d16", "d17", "d18", "d19", in Pack()
2031 "vld1.32 {d0}, [%[in]]!\n" in Pack()
2035 "vaddw.u8 q8, q8, d0\n" in Pack()
2039 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
2046 "vmov.i8 d0, #0\n" in Pack()
2050 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
2051 "vld1.16 {d0[2]}, [%[in]]!\n" in Pack()
2058 "vaddw.u8 q8, q8, d0\n" in Pack()
2062 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
2065 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
2077 "vmul.i32 q8, q8, d0[0]\n" in Pack()
2084 : "r0", "r1", "r2", "d0", "d1", "d2", "d3", "d16", "d17", "d18", "d19", in Pack()
2117 "vld1.32 {d0}, [%[in]]!\n" in Pack()
2121 "vaddw.u8 q8, q8, d0\n" in Pack()
2125 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
2132 "vmov.i8 d0, #0\n" in Pack()
2136 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
2137 "vld1.16 {d0[2]}, [%[in]]!\n" in Pack()
2138 "vld1.8 {d0[6]}, [%[in]]!\n" in Pack()
2148 "vaddw.u8 q8, q8, d0\n" in Pack()
2152 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
2155 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
2167 "vmul.i32 q8, q8, d0[0]\n" in Pack()
2174 : "r0", "r1", "r2", "d0", "d1", "d2", "d3", "d16", "d17", "d18", "d19", in Pack()
2205 "vld1.32 {d0}, [%[in]]!\n" in Pack()
2210 "vaddw.u8 q8, q8, d0\n" in Pack()
2215 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
2221 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
2236 "vmul.i32 q8, q8, d0[0]\n" in Pack()
2237 "vmul.i32 q9, q9, d0[0]\n" in Pack()
2245 : "r0", "r1", "r2", "r3", "d0", "d1", "d2", "d3", "d4", "d16", "d17", in Pack()
2280 "vld1.32 {d0}, [%[in]]!\n" in Pack()
2285 "vaddw.u8 q8, q8, d0\n" in Pack()
2290 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
2298 "vmov.i8 d0, #0\n" in Pack()
2303 "vld1.8 {d0[0]}, [%[in]]!\n" in Pack()
2308 "vaddw.u8 q8, q8, d0\n" in Pack()
2313 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
2317 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
2332 "vmul.i32 q8, q8, d0[0]\n" in Pack()
2333 "vmul.i32 q9, q9, d0[0]\n" in Pack()
2341 : "r0", "r1", "r2", "r3", "d0", "d1", "d2", "d3", "d4", "d16", "d17", in Pack()
2376 "vld1.32 {d0}, [%[in]]!\n" in Pack()
2381 "vaddw.u8 q8, q8, d0\n" in Pack()
2386 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
2394 "vmov.i8 d0, #0\n" in Pack()
2399 "vld1.16 {d0[0]}, [%[in]]!\n" in Pack()
2404 "vaddw.u8 q8, q8, d0\n" in Pack()
2409 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
2413 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
2428 "vmul.i32 q8, q8, d0[0]\n" in Pack()
2429 "vmul.i32 q9, q9, d0[0]\n" in Pack()
2437 : "r0", "r1", "r2", "r3", "d0", "d1", "d2", "d3", "d4", "d16", "d17", in Pack()
2472 "vld1.32 {d0}, [%[in]]!\n" in Pack()
2477 "vaddw.u8 q8, q8, d0\n" in Pack()
2482 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
2490 "vmov.i8 d0, #0\n" in Pack()
2495 "vld1.16 {d0[0]}, [%[in]]!\n" in Pack()
2496 "vld1.8 {d0[2]}, [%[in]]!\n" in Pack()
2505 "vaddw.u8 q8, q8, d0\n" in Pack()
2510 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
2514 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
2529 "vmul.i32 q8, q8, d0[0]\n" in Pack()
2530 "vmul.i32 q9, q9, d0[0]\n" in Pack()
2538 : "r0", "r1", "r2", "r3", "d0", "d1", "d2", "d3", "d4", "d16", "d17", in Pack()
2573 "vld1.32 {d0}, [%[in]]!\n" in Pack()
2578 "vaddw.u8 q8, q8, d0\n" in Pack()
2583 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
2591 "vmov.i8 d0, #0\n" in Pack()
2596 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
2601 "vaddw.u8 q8, q8, d0\n" in Pack()
2606 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
2610 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
2625 "vmul.i32 q8, q8, d0[0]\n" in Pack()
2626 "vmul.i32 q9, q9, d0[0]\n" in Pack()
2634 : "r0", "r1", "r2", "r3", "d0", "d1", "d2", "d3", "d4", "d16", "d17", in Pack()
2669 "vld1.32 {d0}, [%[in]]!\n" in Pack()
2674 "vaddw.u8 q8, q8, d0\n" in Pack()
2679 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
2687 "vmov.i8 d0, #0\n" in Pack()
2692 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
2693 "vld1.8 {d0[4]}, [%[in]]!\n" in Pack()
2702 "vaddw.u8 q8, q8, d0\n" in Pack()
2707 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
2711 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
2726 "vmul.i32 q8, q8, d0[0]\n" in Pack()
2727 "vmul.i32 q9, q9, d0[0]\n" in Pack()
2735 : "r0", "r1", "r2", "r3", "d0", "d1", "d2", "d3", "d4", "d16", "d17", in Pack()
2770 "vld1.32 {d0}, [%[in]]!\n" in Pack()
2775 "vaddw.u8 q8, q8, d0\n" in Pack()
2780 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
2788 "vmov.i8 d0, #0\n" in Pack()
2793 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
2794 "vld1.16 {d0[2]}, [%[in]]!\n" in Pack()
2803 "vaddw.u8 q8, q8, d0\n" in Pack()
2808 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
2812 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
2827 "vmul.i32 q8, q8, d0[0]\n" in Pack()
2828 "vmul.i32 q9, q9, d0[0]\n" in Pack()
2836 : "r0", "r1", "r2", "r3", "d0", "d1", "d2", "d3", "d4", "d16", "d17", in Pack()
2871 "vld1.32 {d0}, [%[in]]!\n" in Pack()
2876 "vaddw.u8 q8, q8, d0\n" in Pack()
2881 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
2889 "vmov.i8 d0, #0\n" in Pack()
2894 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
2895 "vld1.16 {d0[2]}, [%[in]]!\n" in Pack()
2896 "vld1.8 {d0[6]}, [%[in]]!\n" in Pack()
2909 "vaddw.u8 q8, q8, d0\n" in Pack()
2914 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
2918 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
2933 "vmul.i32 q8, q8, d0[0]\n" in Pack()
2934 "vmul.i32 q9, q9, d0[0]\n" in Pack()
2942 : "r0", "r1", "r2", "r3", "d0", "d1", "d2", "d3", "d4", "d16", "d17", in Pack()
2975 "vld1.32 {d0}, [%[in]]!\n" in Pack()
2981 "vaddw.u8 q8, q8, d0\n" in Pack()
2987 "vst1.32 {d0, d1, d2, d3}, [%[out]:128]!\n" in Pack()
2993 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
3010 "vmul.i32 q8, q8, d0[0]\n" in Pack()
3011 "vmul.i32 q9, q9, d0[0]\n" in Pack()
3019 : "r0", "r1", "r2", "r3", "r4", "d0", "d1", "d2", "d3", "d4", "d5", "d16", in Pack()
3057 "vld1.32 {d0}, [%[in]]!\n" in Pack()
3063 "vaddw.u8 q8, q8, d0\n" in Pack()
3069 "vst1.32 {d0, d1, d2, d3}, [%[out]:128]!\n" in Pack()
3077 "vmov.i8 d0, #0\n" in Pack()
3083 "vld1.8 {d0[0]}, [%[in]]!\n" in Pack()
3089 "vaddw.u8 q8, q8, d0\n" in Pack()
3095 "vst1.32 {d0, d1, d2, d3}, [%[out]:128]!\n" in Pack()
3099 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
3116 "vmul.i32 q8, q8, d0[0]\n" in Pack()
3117 "vmul.i32 q9, q9, d0[0]\n" in Pack()
3125 : "r0", "r1", "r2", "r3", "r4", "d0", "d1", "d2", "d3", "d4", "d5", "d16", in Pack()
3163 "vld1.32 {d0}, [%[in]]!\n" in Pack()
3169 "vaddw.u8 q8, q8, d0\n" in Pack()
3175 "vst1.32 {d0, d1, d2, d3}, [%[out]:128]!\n" in Pack()
3183 "vmov.i8 d0, #0\n" in Pack()
3189 "vld1.16 {d0[0]}, [%[in]]!\n" in Pack()
3195 "vaddw.u8 q8, q8, d0\n" in Pack()
3201 "vst1.32 {d0, d1, d2, d3}, [%[out]:128]!\n" in Pack()
3205 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
3222 "vmul.i32 q8, q8, d0[0]\n" in Pack()
3223 "vmul.i32 q9, q9, d0[0]\n" in Pack()
3231 : "r0", "r1", "r2", "r3", "r4", "d0", "d1", "d2", "d3", "d4", "d5", "d16", in Pack()
3269 "vld1.32 {d0}, [%[in]]!\n" in Pack()
3275 "vaddw.u8 q8, q8, d0\n" in Pack()
3281 "vst1.32 {d0, d1, d2, d3}, [%[out]:128]!\n" in Pack()
3289 "vmov.i8 d0, #0\n" in Pack()
3295 "vld1.16 {d0[0]}, [%[in]]!\n" in Pack()
3296 "vld1.8 {d0[2]}, [%[in]]!\n" in Pack()
3307 "vaddw.u8 q8, q8, d0\n" in Pack()
3313 "vst1.32 {d0, d1, d2, d3}, [%[out]:128]!\n" in Pack()
3317 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
3334 "vmul.i32 q8, q8, d0[0]\n" in Pack()
3335 "vmul.i32 q9, q9, d0[0]\n" in Pack()
3343 : "r0", "r1", "r2", "r3", "r4", "d0", "d1", "d2", "d3", "d4", "d5", "d16", in Pack()
3381 "vld1.32 {d0}, [%[in]]!\n" in Pack()
3387 "vaddw.u8 q8, q8, d0\n" in Pack()
3393 "vst1.32 {d0, d1, d2, d3}, [%[out]:128]!\n" in Pack()
3401 "vmov.i8 d0, #0\n" in Pack()
3407 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
3413 "vaddw.u8 q8, q8, d0\n" in Pack()
3419 "vst1.32 {d0, d1, d2, d3}, [%[out]:128]!\n" in Pack()
3423 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
3440 "vmul.i32 q8, q8, d0[0]\n" in Pack()
3441 "vmul.i32 q9, q9, d0[0]\n" in Pack()
3449 : "r0", "r1", "r2", "r3", "r4", "d0", "d1", "d2", "d3", "d4", "d5", "d16", in Pack()
3487 "vld1.32 {d0}, [%[in]]!\n" in Pack()
3493 "vaddw.u8 q8, q8, d0\n" in Pack()
3499 "vst1.32 {d0, d1, d2, d3}, [%[out]:128]!\n" in Pack()
3507 "vmov.i8 d0, #0\n" in Pack()
3513 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
3514 "vld1.8 {d0[4]}, [%[in]]!\n" in Pack()
3525 "vaddw.u8 q8, q8, d0\n" in Pack()
3531 "vst1.32 {d0, d1, d2, d3}, [%[out]:128]!\n" in Pack()
3535 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
3552 "vmul.i32 q8, q8, d0[0]\n" in Pack()
3553 "vmul.i32 q9, q9, d0[0]\n" in Pack()
3561 : "r0", "r1", "r2", "r3", "r4", "d0", "d1", "d2", "d3", "d4", "d5", "d16", in Pack()
3599 "vld1.32 {d0}, [%[in]]!\n" in Pack()
3605 "vaddw.u8 q8, q8, d0\n" in Pack()
3611 "vst1.32 {d0, d1, d2, d3}, [%[out]:128]!\n" in Pack()
3619 "vmov.i8 d0, #0\n" in Pack()
3625 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
3626 "vld1.16 {d0[2]}, [%[in]]!\n" in Pack()
3637 "vaddw.u8 q8, q8, d0\n" in Pack()
3643 "vst1.32 {d0, d1, d2, d3}, [%[out]:128]!\n" in Pack()
3647 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
3664 "vmul.i32 q8, q8, d0[0]\n" in Pack()
3665 "vmul.i32 q9, q9, d0[0]\n" in Pack()
3673 : "r0", "r1", "r2", "r3", "r4", "d0", "d1", "d2", "d3", "d4", "d5", "d16", in Pack()
3711 "vld1.32 {d0}, [%[in]]!\n" in Pack()
3717 "vaddw.u8 q8, q8, d0\n" in Pack()
3723 "vst1.32 {d0, d1, d2, d3}, [%[out]:128]!\n" in Pack()
3731 "vmov.i8 d0, #0\n" in Pack()
3737 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
3738 "vld1.16 {d0[2]}, [%[in]]!\n" in Pack()
3739 "vld1.8 {d0[6]}, [%[in]]!\n" in Pack()
3755 "vaddw.u8 q8, q8, d0\n" in Pack()
3761 "vst1.32 {d0, d1, d2, d3}, [%[out]:128]!\n" in Pack()
3765 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
3782 "vmul.i32 q8, q8, d0[0]\n" in Pack()
3783 "vmul.i32 q9, q9, d0[0]\n" in Pack()
3791 : "r0", "r1", "r2", "r3", "r4", "d0", "d1", "d2", "d3", "d4", "d5", "d16", in Pack()
3827 "vld1.32 {d0}, [%[in]]!\n" in Pack()
3834 "vaddw.u8 q8, q8, d0\n" in Pack()
3841 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
3849 "vmov.32 d0[0], r0\n" in Pack()
3869 "vmul.i32 q8, q8, d0[0]\n" in Pack()
3870 "vmul.i32 q9, q9, d0[0]\n" in Pack()
3878 : "r0", "r1", "r2", "r3", "r4", "r5", "d0", "d1", "d2", "d3", "d4", "d5", in Pack()
3918 "vld1.32 {d0}, [%[in]]!\n" in Pack()
3925 "vaddw.u8 q8, q8, d0\n" in Pack()
3932 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
3940 "vmov.i8 d0, #0\n" in Pack()
3947 "vld1.8 {d0[0]}, [%[in]]!\n" in Pack()
3954 "vaddw.u8 q8, q8, d0\n" in Pack()
3961 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
3967 "vmov.32 d0[0], r0\n" in Pack()
3987 "vmul.i32 q8, q8, d0[0]\n" in Pack()
3988 "vmul.i32 q9, q9, d0[0]\n" in Pack()
3996 : "r0", "r1", "r2", "r3", "r4", "r5", "d0", "d1", "d2", "d3", "d4", "d5", in Pack()
4036 "vld1.32 {d0}, [%[in]]!\n" in Pack()
4043 "vaddw.u8 q8, q8, d0\n" in Pack()
4050 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
4058 "vmov.i8 d0, #0\n" in Pack()
4065 "vld1.16 {d0[0]}, [%[in]]!\n" in Pack()
4072 "vaddw.u8 q8, q8, d0\n" in Pack()
4079 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
4085 "vmov.32 d0[0], r0\n" in Pack()
4105 "vmul.i32 q8, q8, d0[0]\n" in Pack()
4106 "vmul.i32 q9, q9, d0[0]\n" in Pack()
4114 : "r0", "r1", "r2", "r3", "r4", "r5", "d0", "d1", "d2", "d3", "d4", "d5", in Pack()
4154 "vld1.32 {d0}, [%[in]]!\n" in Pack()
4161 "vaddw.u8 q8, q8, d0\n" in Pack()
4168 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
4176 "vmov.i8 d0, #0\n" in Pack()
4183 "vld1.16 {d0[0]}, [%[in]]!\n" in Pack()
4184 "vld1.8 {d0[2]}, [%[in]]!\n" in Pack()
4197 "vaddw.u8 q8, q8, d0\n" in Pack()
4204 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
4210 "vmov.32 d0[0], r0\n" in Pack()
4230 "vmul.i32 q8, q8, d0[0]\n" in Pack()
4231 "vmul.i32 q9, q9, d0[0]\n" in Pack()
4239 : "r0", "r1", "r2", "r3", "r4", "r5", "d0", "d1", "d2", "d3", "d4", "d5", in Pack()
4279 "vld1.32 {d0}, [%[in]]!\n" in Pack()
4286 "vaddw.u8 q8, q8, d0\n" in Pack()
4293 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
4301 "vmov.i8 d0, #0\n" in Pack()
4308 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
4315 "vaddw.u8 q8, q8, d0\n" in Pack()
4322 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
4328 "vmov.32 d0[0], r0\n" in Pack()
4348 "vmul.i32 q8, q8, d0[0]\n" in Pack()
4349 "vmul.i32 q9, q9, d0[0]\n" in Pack()
4357 : "r0", "r1", "r2", "r3", "r4", "r5", "d0", "d1", "d2", "d3", "d4", "d5", in Pack()
4397 "vld1.32 {d0}, [%[in]]!\n" in Pack()
4404 "vaddw.u8 q8, q8, d0\n" in Pack()
4411 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
4419 "vmov.i8 d0, #0\n" in Pack()
4426 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
4427 "vld1.8 {d0[4]}, [%[in]]!\n" in Pack()
4440 "vaddw.u8 q8, q8, d0\n" in Pack()
4447 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
4453 "vmov.32 d0[0], r0\n" in Pack()
4473 "vmul.i32 q8, q8, d0[0]\n" in Pack()
4474 "vmul.i32 q9, q9, d0[0]\n" in Pack()
4482 : "r0", "r1", "r2", "r3", "r4", "r5", "d0", "d1", "d2", "d3", "d4", "d5", in Pack()
4522 "vld1.32 {d0}, [%[in]]!\n" in Pack()
4529 "vaddw.u8 q8, q8, d0\n" in Pack()
4536 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
4544 "vmov.i8 d0, #0\n" in Pack()
4551 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
4552 "vld1.16 {d0[2]}, [%[in]]!\n" in Pack()
4565 "vaddw.u8 q8, q8, d0\n" in Pack()
4572 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
4578 "vmov.32 d0[0], r0\n" in Pack()
4598 "vmul.i32 q8, q8, d0[0]\n" in Pack()
4599 "vmul.i32 q9, q9, d0[0]\n" in Pack()
4607 : "r0", "r1", "r2", "r3", "r4", "r5", "d0", "d1", "d2", "d3", "d4", "d5", in Pack()
4647 "vld1.32 {d0}, [%[in]]!\n" in Pack()
4654 "vaddw.u8 q8, q8, d0\n" in Pack()
4661 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
4669 "vmov.i8 d0, #0\n" in Pack()
4676 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
4677 "vld1.16 {d0[2]}, [%[in]]!\n" in Pack()
4678 "vld1.8 {d0[6]}, [%[in]]!\n" in Pack()
4697 "vaddw.u8 q8, q8, d0\n" in Pack()
4704 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
4710 "vmov.32 d0[0], r0\n" in Pack()
4730 "vmul.i32 q8, q8, d0[0]\n" in Pack()
4731 "vmul.i32 q9, q9, d0[0]\n" in Pack()
4739 : "r0", "r1", "r2", "r3", "r4", "r5", "d0", "d1", "d2", "d3", "d4", "d5", in Pack()
4777 "vld1.32 {d0}, [%[in]]!\n" in Pack()
4785 "vaddw.u8 q8, q8, d0\n" in Pack()
4793 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
4801 "vmov.32 d0[0], r0\n" in Pack()
4823 "vmul.i32 q8, q8, d0[0]\n" in Pack()
4824 "vmul.i32 q9, q9, d0[0]\n" in Pack()
4832 : "r0", "r1", "r2", "r3", "r4", "r5", "r6", "d0", "d1", "d2", "d3", "d4", in Pack()
4875 "vld1.32 {d0}, [%[in]]!\n" in Pack()
4883 "vaddw.u8 q8, q8, d0\n" in Pack()
4891 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
4899 "vmov.i8 d0, #0\n" in Pack()
4907 "vld1.8 {d0[0]}, [%[in]]!\n" in Pack()
4915 "vaddw.u8 q8, q8, d0\n" in Pack()
4923 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
4929 "vmov.32 d0[0], r0\n" in Pack()
4951 "vmul.i32 q8, q8, d0[0]\n" in Pack()
4952 "vmul.i32 q9, q9, d0[0]\n" in Pack()
4960 : "r0", "r1", "r2", "r3", "r4", "r5", "r6", "d0", "d1", "d2", "d3", "d4", in Pack()
5003 "vld1.32 {d0}, [%[in]]!\n" in Pack()
5011 "vaddw.u8 q8, q8, d0\n" in Pack()
5019 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
5027 "vmov.i8 d0, #0\n" in Pack()
5035 "vld1.16 {d0[0]}, [%[in]]!\n" in Pack()
5043 "vaddw.u8 q8, q8, d0\n" in Pack()
5051 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
5057 "vmov.32 d0[0], r0\n" in Pack()
5079 "vmul.i32 q8, q8, d0[0]\n" in Pack()
5080 "vmul.i32 q9, q9, d0[0]\n" in Pack()
5088 : "r0", "r1", "r2", "r3", "r4", "r5", "r6", "d0", "d1", "d2", "d3", "d4", in Pack()
5131 "vld1.32 {d0}, [%[in]]!\n" in Pack()
5139 "vaddw.u8 q8, q8, d0\n" in Pack()
5147 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
5155 "vmov.i8 d0, #0\n" in Pack()
5163 "vld1.16 {d0[0]}, [%[in]]!\n" in Pack()
5164 "vld1.8 {d0[2]}, [%[in]]!\n" in Pack()
5179 "vaddw.u8 q8, q8, d0\n" in Pack()
5187 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
5193 "vmov.32 d0[0], r0\n" in Pack()
5215 "vmul.i32 q8, q8, d0[0]\n" in Pack()
5216 "vmul.i32 q9, q9, d0[0]\n" in Pack()
5224 : "r0", "r1", "r2", "r3", "r4", "r5", "r6", "d0", "d1", "d2", "d3", "d4", in Pack()
5267 "vld1.32 {d0}, [%[in]]!\n" in Pack()
5275 "vaddw.u8 q8, q8, d0\n" in Pack()
5283 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
5291 "vmov.i8 d0, #0\n" in Pack()
5299 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
5307 "vaddw.u8 q8, q8, d0\n" in Pack()
5315 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
5321 "vmov.32 d0[0], r0\n" in Pack()
5343 "vmul.i32 q8, q8, d0[0]\n" in Pack()
5344 "vmul.i32 q9, q9, d0[0]\n" in Pack()
5352 : "r0", "r1", "r2", "r3", "r4", "r5", "r6", "d0", "d1", "d2", "d3", "d4", in Pack()
5395 "vld1.32 {d0}, [%[in]]!\n" in Pack()
5403 "vaddw.u8 q8, q8, d0\n" in Pack()
5411 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
5419 "vmov.i8 d0, #0\n" in Pack()
5427 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
5428 "vld1.8 {d0[4]}, [%[in]]!\n" in Pack()
5443 "vaddw.u8 q8, q8, d0\n" in Pack()
5451 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
5457 "vmov.32 d0[0], r0\n" in Pack()
5479 "vmul.i32 q8, q8, d0[0]\n" in Pack()
5480 "vmul.i32 q9, q9, d0[0]\n" in Pack()
5488 : "r0", "r1", "r2", "r3", "r4", "r5", "r6", "d0", "d1", "d2", "d3", "d4", in Pack()
5531 "vld1.32 {d0}, [%[in]]!\n" in Pack()
5539 "vaddw.u8 q8, q8, d0\n" in Pack()
5547 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
5555 "vmov.i8 d0, #0\n" in Pack()
5563 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
5564 "vld1.16 {d0[2]}, [%[in]]!\n" in Pack()
5579 "vaddw.u8 q8, q8, d0\n" in Pack()
5587 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
5593 "vmov.32 d0[0], r0\n" in Pack()
5615 "vmul.i32 q8, q8, d0[0]\n" in Pack()
5616 "vmul.i32 q9, q9, d0[0]\n" in Pack()
5624 : "r0", "r1", "r2", "r3", "r4", "r5", "r6", "d0", "d1", "d2", "d3", "d4", in Pack()
5667 "vld1.32 {d0}, [%[in]]!\n" in Pack()
5675 "vaddw.u8 q8, q8, d0\n" in Pack()
5683 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
5691 "vmov.i8 d0, #0\n" in Pack()
5699 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
5700 "vld1.16 {d0[2]}, [%[in]]!\n" in Pack()
5701 "vld1.8 {d0[6]}, [%[in]]!\n" in Pack()
5723 "vaddw.u8 q8, q8, d0\n" in Pack()
5731 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
5737 "vmov.32 d0[0], r0\n" in Pack()
5759 "vmul.i32 q8, q8, d0[0]\n" in Pack()
5760 "vmul.i32 q9, q9, d0[0]\n" in Pack()
5768 : "r0", "r1", "r2", "r3", "r4", "r5", "r6", "d0", "d1", "d2", "d3", "d4", in Pack()
5795 "vld1.8 {d0[0]}, [%[in]], %[stride]\n" in Pack()
5796 "vld1.8 {d0[1]}, [%[in]], %[stride]\n" in Pack()
5797 "vld1.8 {d0[2]}, [%[in]], %[stride]\n" in Pack()
5798 "vld1.8 {d0[3]}, [%[in]], %[stride]\n" in Pack()
5799 "vld1.8 {d0[4]}, [%[in]], %[stride]\n" in Pack()
5800 "vld1.8 {d0[5]}, [%[in]], %[stride]\n" in Pack()
5801 "vld1.8 {d0[6]}, [%[in]], %[stride]\n" in Pack()
5802 "vld1.8 {d0[7]}, [%[in]], %[stride]\n" in Pack()
5804 "vaddw.u8 q8, q8, d0\n" in Pack()
5805 "vst1.32 {d0}, [%[out]:64]!\n" in Pack()
5810 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
5815 "vmul.i32 q8, q8, d0[0]\n" in Pack()
5822 : "d0", "d2", "d3", "d16", "d17", "cc", "memory"); in Pack()
5850 "vld1.8 {d0[0]}, [%[in]], %[stride]\n" in Pack()
5851 "vld1.8 {d0[1]}, [%[in]], %[stride]\n" in Pack()
5852 "vld1.8 {d0[2]}, [%[in]], %[stride]\n" in Pack()
5853 "vld1.8 {d0[3]}, [%[in]], %[stride]\n" in Pack()
5854 "vld1.8 {d0[4]}, [%[in]], %[stride]\n" in Pack()
5855 "vld1.8 {d0[5]}, [%[in]], %[stride]\n" in Pack()
5856 "vld1.8 {d0[6]}, [%[in]], %[stride]\n" in Pack()
5857 "vld1.8 {d0[7]}, [%[in]], %[stride]\n" in Pack()
5859 "vaddw.u8 q8, q8, d0\n" in Pack()
5860 "vst1.32 {d0}, [%[out]:64]!\n" in Pack()
5867 "vmov.i8 d0, #0\n" in Pack()
5868 "vld1.8 {d0[0]}, [%[in]], %[stride]\n" in Pack()
5870 "vaddw.u8 q8, q8, d0\n" in Pack()
5871 "vst1.32 {d0}, [%[out]:64]!\n" in Pack()
5874 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
5879 "vmul.i32 q8, q8, d0[0]\n" in Pack()
5886 : "d0", "d2", "d3", "d16", "d17", "cc", "memory"); in Pack()
5914 "vld1.8 {d0[0]}, [%[in]], %[stride]\n" in Pack()
5915 "vld1.8 {d0[1]}, [%[in]], %[stride]\n" in Pack()
5916 "vld1.8 {d0[2]}, [%[in]], %[stride]\n" in Pack()
5917 "vld1.8 {d0[3]}, [%[in]], %[stride]\n" in Pack()
5918 "vld1.8 {d0[4]}, [%[in]], %[stride]\n" in Pack()
5919 "vld1.8 {d0[5]}, [%[in]], %[stride]\n" in Pack()
5920 "vld1.8 {d0[6]}, [%[in]], %[stride]\n" in Pack()
5921 "vld1.8 {d0[7]}, [%[in]], %[stride]\n" in Pack()
5923 "vaddw.u8 q8, q8, d0\n" in Pack()
5924 "vst1.32 {d0}, [%[out]:64]!\n" in Pack()
5931 "vmov.i8 d0, #0\n" in Pack()
5932 "vld1.8 {d0[0]}, [%[in]], %[stride]\n" in Pack()
5933 "vld1.8 {d0[1]}, [%[in]], %[stride]\n" in Pack()
5935 "vaddw.u8 q8, q8, d0\n" in Pack()
5936 "vst1.32 {d0}, [%[out]:64]!\n" in Pack()
5939 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
5944 "vmul.i32 q8, q8, d0[0]\n" in Pack()
5951 : "d0", "d2", "d3", "d16", "d17", "cc", "memory"); in Pack()
5979 "vld1.8 {d0[0]}, [%[in]], %[stride]\n" in Pack()
5980 "vld1.8 {d0[1]}, [%[in]], %[stride]\n" in Pack()
5981 "vld1.8 {d0[2]}, [%[in]], %[stride]\n" in Pack()
5982 "vld1.8 {d0[3]}, [%[in]], %[stride]\n" in Pack()
5983 "vld1.8 {d0[4]}, [%[in]], %[stride]\n" in Pack()
5984 "vld1.8 {d0[5]}, [%[in]], %[stride]\n" in Pack()
5985 "vld1.8 {d0[6]}, [%[in]], %[stride]\n" in Pack()
5986 "vld1.8 {d0[7]}, [%[in]], %[stride]\n" in Pack()
5988 "vaddw.u8 q8, q8, d0\n" in Pack()
5989 "vst1.32 {d0}, [%[out]:64]!\n" in Pack()
5996 "vmov.i8 d0, #0\n" in Pack()
5997 "vld1.8 {d0[0]}, [%[in]], %[stride]\n" in Pack()
5998 "vld1.8 {d0[1]}, [%[in]], %[stride]\n" in Pack()
5999 "vld1.8 {d0[2]}, [%[in]], %[stride]\n" in Pack()
6001 "vaddw.u8 q8, q8, d0\n" in Pack()
6002 "vst1.32 {d0}, [%[out]:64]!\n" in Pack()
6005 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
6010 "vmul.i32 q8, q8, d0[0]\n" in Pack()
6017 : "d0", "d2", "d3", "d16", "d17", "cc", "memory"); in Pack()
6045 "vld1.8 {d0[0]}, [%[in]], %[stride]\n" in Pack()
6046 "vld1.8 {d0[1]}, [%[in]], %[stride]\n" in Pack()
6047 "vld1.8 {d0[2]}, [%[in]], %[stride]\n" in Pack()
6048 "vld1.8 {d0[3]}, [%[in]], %[stride]\n" in Pack()
6049 "vld1.8 {d0[4]}, [%[in]], %[stride]\n" in Pack()
6050 "vld1.8 {d0[5]}, [%[in]], %[stride]\n" in Pack()
6051 "vld1.8 {d0[6]}, [%[in]], %[stride]\n" in Pack()
6052 "vld1.8 {d0[7]}, [%[in]], %[stride]\n" in Pack()
6054 "vaddw.u8 q8, q8, d0\n" in Pack()
6055 "vst1.32 {d0}, [%[out]:64]!\n" in Pack()
6062 "vmov.i8 d0, #0\n" in Pack()
6063 "vld1.8 {d0[0]}, [%[in]], %[stride]\n" in Pack()
6064 "vld1.8 {d0[1]}, [%[in]], %[stride]\n" in Pack()
6065 "vld1.8 {d0[2]}, [%[in]], %[stride]\n" in Pack()
6066 "vld1.8 {d0[3]}, [%[in]], %[stride]\n" in Pack()
6068 "vaddw.u8 q8, q8, d0\n" in Pack()
6069 "vst1.32 {d0}, [%[out]:64]!\n" in Pack()
6072 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
6077 "vmul.i32 q8, q8, d0[0]\n" in Pack()
6084 : "d0", "d2", "d3", "d16", "d17", "cc", "memory"); in Pack()
6112 "vld1.8 {d0[0]}, [%[in]], %[stride]\n" in Pack()
6113 "vld1.8 {d0[1]}, [%[in]], %[stride]\n" in Pack()
6114 "vld1.8 {d0[2]}, [%[in]], %[stride]\n" in Pack()
6115 "vld1.8 {d0[3]}, [%[in]], %[stride]\n" in Pack()
6116 "vld1.8 {d0[4]}, [%[in]], %[stride]\n" in Pack()
6117 "vld1.8 {d0[5]}, [%[in]], %[stride]\n" in Pack()
6118 "vld1.8 {d0[6]}, [%[in]], %[stride]\n" in Pack()
6119 "vld1.8 {d0[7]}, [%[in]], %[stride]\n" in Pack()
6121 "vaddw.u8 q8, q8, d0\n" in Pack()
6122 "vst1.32 {d0}, [%[out]:64]!\n" in Pack()
6129 "vmov.i8 d0, #0\n" in Pack()
6130 "vld1.8 {d0[0]}, [%[in]], %[stride]\n" in Pack()
6131 "vld1.8 {d0[1]}, [%[in]], %[stride]\n" in Pack()
6132 "vld1.8 {d0[2]}, [%[in]], %[stride]\n" in Pack()
6133 "vld1.8 {d0[3]}, [%[in]], %[stride]\n" in Pack()
6134 "vld1.8 {d0[4]}, [%[in]], %[stride]\n" in Pack()
6136 "vaddw.u8 q8, q8, d0\n" in Pack()
6137 "vst1.32 {d0}, [%[out]:64]!\n" in Pack()
6140 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
6145 "vmul.i32 q8, q8, d0[0]\n" in Pack()
6152 : "d0", "d2", "d3", "d16", "d17", "cc", "memory"); in Pack()
6180 "vld1.8 {d0[0]}, [%[in]], %[stride]\n" in Pack()
6181 "vld1.8 {d0[1]}, [%[in]], %[stride]\n" in Pack()
6182 "vld1.8 {d0[2]}, [%[in]], %[stride]\n" in Pack()
6183 "vld1.8 {d0[3]}, [%[in]], %[stride]\n" in Pack()
6184 "vld1.8 {d0[4]}, [%[in]], %[stride]\n" in Pack()
6185 "vld1.8 {d0[5]}, [%[in]], %[stride]\n" in Pack()
6186 "vld1.8 {d0[6]}, [%[in]], %[stride]\n" in Pack()
6187 "vld1.8 {d0[7]}, [%[in]], %[stride]\n" in Pack()
6189 "vaddw.u8 q8, q8, d0\n" in Pack()
6190 "vst1.32 {d0}, [%[out]:64]!\n" in Pack()
6197 "vmov.i8 d0, #0\n" in Pack()
6198 "vld1.8 {d0[0]}, [%[in]], %[stride]\n" in Pack()
6199 "vld1.8 {d0[1]}, [%[in]], %[stride]\n" in Pack()
6200 "vld1.8 {d0[2]}, [%[in]], %[stride]\n" in Pack()
6201 "vld1.8 {d0[3]}, [%[in]], %[stride]\n" in Pack()
6202 "vld1.8 {d0[4]}, [%[in]], %[stride]\n" in Pack()
6203 "vld1.8 {d0[5]}, [%[in]], %[stride]\n" in Pack()
6205 "vaddw.u8 q8, q8, d0\n" in Pack()
6206 "vst1.32 {d0}, [%[out]:64]!\n" in Pack()
6209 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
6214 "vmul.i32 q8, q8, d0[0]\n" in Pack()
6221 : "d0", "d2", "d3", "d16", "d17", "cc", "memory"); in Pack()
6249 "vld1.8 {d0[0]}, [%[in]], %[stride]\n" in Pack()
6250 "vld1.8 {d0[1]}, [%[in]], %[stride]\n" in Pack()
6251 "vld1.8 {d0[2]}, [%[in]], %[stride]\n" in Pack()
6252 "vld1.8 {d0[3]}, [%[in]], %[stride]\n" in Pack()
6253 "vld1.8 {d0[4]}, [%[in]], %[stride]\n" in Pack()
6254 "vld1.8 {d0[5]}, [%[in]], %[stride]\n" in Pack()
6255 "vld1.8 {d0[6]}, [%[in]], %[stride]\n" in Pack()
6256 "vld1.8 {d0[7]}, [%[in]], %[stride]\n" in Pack()
6258 "vaddw.u8 q8, q8, d0\n" in Pack()
6259 "vst1.32 {d0}, [%[out]:64]!\n" in Pack()
6266 "vmov.i8 d0, #0\n" in Pack()
6267 "vld1.8 {d0[0]}, [%[in]], %[stride]\n" in Pack()
6268 "vld1.8 {d0[1]}, [%[in]], %[stride]\n" in Pack()
6269 "vld1.8 {d0[2]}, [%[in]], %[stride]\n" in Pack()
6270 "vld1.8 {d0[3]}, [%[in]], %[stride]\n" in Pack()
6271 "vld1.8 {d0[4]}, [%[in]], %[stride]\n" in Pack()
6272 "vld1.8 {d0[5]}, [%[in]], %[stride]\n" in Pack()
6273 "vld1.8 {d0[6]}, [%[in]], %[stride]\n" in Pack()
6275 "vaddw.u8 q8, q8, d0\n" in Pack()
6276 "vst1.32 {d0}, [%[out]:64]!\n" in Pack()
6279 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
6284 "vmul.i32 q8, q8, d0[0]\n" in Pack()
6291 : "d0", "d2", "d3", "d16", "d17", "cc", "memory"); in Pack()
6316 "vld1.16 {d0[0]}, [%[in]], %[stride]\n" in Pack()
6317 "vld1.16 {d0[1]}, [%[in]], %[stride]\n" in Pack()
6318 "vld1.16 {d0[2]}, [%[in]], %[stride]\n" in Pack()
6319 "vld1.16 {d0[3]}, [%[in]], %[stride]\n" in Pack()
6325 "vuzp.8 d0, d1\n" in Pack()
6326 "vaddw.u8 q8, q8, d0\n" in Pack()
6328 "vst1.32 {d0, d1}, [%[out]:128]!\n" in Pack()
6333 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
6340 "vmul.i32 q8, q8, d0[0]\n" in Pack()
6347 : "d0", "d1", "d2", "d3", "d16", "d17", "d18", "d19", "cc", "memory"); in Pack()
6376 "vld1.16 {d0[0]}, [%[in]], %[stride]\n" in Pack()
6377 "vld1.16 {d0[1]}, [%[in]], %[stride]\n" in Pack()
6378 "vld1.16 {d0[2]}, [%[in]], %[stride]\n" in Pack()
6379 "vld1.16 {d0[3]}, [%[in]], %[stride]\n" in Pack()
6385 "vuzp.8 d0, d1\n" in Pack()
6386 "vaddw.u8 q8, q8, d0\n" in Pack()
6388 "vst1.32 {d0, d1}, [%[out]:128]!\n" in Pack()
6395 "vmov.i8 d0, #0\n" in Pack()
6397 "vld1.16 {d0[0]}, [%[in]], %[stride]\n" in Pack()
6399 "vuzp.8 d0, d1\n" in Pack()
6400 "vaddw.u8 q8, q8, d0\n" in Pack()
6402 "vst1.32 {d0, d1}, [%[out]:128]!\n" in Pack()
6405 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
6412 "vmul.i32 q8, q8, d0[0]\n" in Pack()
6419 : "d0", "d1", "d2", "d3", "d16", "d17", "d18", "d19", "cc", "memory"); in Pack()
6448 "vld1.16 {d0[0]}, [%[in]], %[stride]\n" in Pack()
6449 "vld1.16 {d0[1]}, [%[in]], %[stride]\n" in Pack()
6450 "vld1.16 {d0[2]}, [%[in]], %[stride]\n" in Pack()
6451 "vld1.16 {d0[3]}, [%[in]], %[stride]\n" in Pack()
6457 "vuzp.8 d0, d1\n" in Pack()
6458 "vaddw.u8 q8, q8, d0\n" in Pack()
6460 "vst1.32 {d0, d1}, [%[out]:128]!\n" in Pack()
6467 "vmov.i8 d0, #0\n" in Pack()
6469 "vld1.16 {d0[0]}, [%[in]], %[stride]\n" in Pack()
6470 "vld1.16 {d0[1]}, [%[in]], %[stride]\n" in Pack()
6472 "vuzp.8 d0, d1\n" in Pack()
6473 "vaddw.u8 q8, q8, d0\n" in Pack()
6475 "vst1.32 {d0, d1}, [%[out]:128]!\n" in Pack()
6478 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
6485 "vmul.i32 q8, q8, d0[0]\n" in Pack()
6492 : "d0", "d1", "d2", "d3", "d16", "d17", "d18", "d19", "cc", "memory"); in Pack()
6521 "vld1.16 {d0[0]}, [%[in]], %[stride]\n" in Pack()
6522 "vld1.16 {d0[1]}, [%[in]], %[stride]\n" in Pack()
6523 "vld1.16 {d0[2]}, [%[in]], %[stride]\n" in Pack()
6524 "vld1.16 {d0[3]}, [%[in]], %[stride]\n" in Pack()
6530 "vuzp.8 d0, d1\n" in Pack()
6531 "vaddw.u8 q8, q8, d0\n" in Pack()
6533 "vst1.32 {d0, d1}, [%[out]:128]!\n" in Pack()
6540 "vmov.i8 d0, #0\n" in Pack()
6542 "vld1.16 {d0[0]}, [%[in]], %[stride]\n" in Pack()
6543 "vld1.16 {d0[1]}, [%[in]], %[stride]\n" in Pack()
6544 "vld1.16 {d0[2]}, [%[in]], %[stride]\n" in Pack()
6546 "vuzp.8 d0, d1\n" in Pack()
6547 "vaddw.u8 q8, q8, d0\n" in Pack()
6549 "vst1.32 {d0, d1}, [%[out]:128]!\n" in Pack()
6552 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
6559 "vmul.i32 q8, q8, d0[0]\n" in Pack()
6566 : "d0", "d1", "d2", "d3", "d16", "d17", "d18", "d19", "cc", "memory"); in Pack()
6595 "vld1.16 {d0[0]}, [%[in]], %[stride]\n" in Pack()
6596 "vld1.16 {d0[1]}, [%[in]], %[stride]\n" in Pack()
6597 "vld1.16 {d0[2]}, [%[in]], %[stride]\n" in Pack()
6598 "vld1.16 {d0[3]}, [%[in]], %[stride]\n" in Pack()
6604 "vuzp.8 d0, d1\n" in Pack()
6605 "vaddw.u8 q8, q8, d0\n" in Pack()
6607 "vst1.32 {d0, d1}, [%[out]:128]!\n" in Pack()
6614 "vmov.i8 d0, #0\n" in Pack()
6616 "vld1.16 {d0[0]}, [%[in]], %[stride]\n" in Pack()
6617 "vld1.16 {d0[1]}, [%[in]], %[stride]\n" in Pack()
6618 "vld1.16 {d0[2]}, [%[in]], %[stride]\n" in Pack()
6619 "vld1.16 {d0[3]}, [%[in]], %[stride]\n" in Pack()
6621 "vuzp.8 d0, d1\n" in Pack()
6622 "vaddw.u8 q8, q8, d0\n" in Pack()
6624 "vst1.32 {d0, d1}, [%[out]:128]!\n" in Pack()
6627 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
6634 "vmul.i32 q8, q8, d0[0]\n" in Pack()
6641 : "d0", "d1", "d2", "d3", "d16", "d17", "d18", "d19", "cc", "memory"); in Pack()
6670 "vld1.16 {d0[0]}, [%[in]], %[stride]\n" in Pack()
6671 "vld1.16 {d0[1]}, [%[in]], %[stride]\n" in Pack()
6672 "vld1.16 {d0[2]}, [%[in]], %[stride]\n" in Pack()
6673 "vld1.16 {d0[3]}, [%[in]], %[stride]\n" in Pack()
6679 "vuzp.8 d0, d1\n" in Pack()
6680 "vaddw.u8 q8, q8, d0\n" in Pack()
6682 "vst1.32 {d0, d1}, [%[out]:128]!\n" in Pack()
6689 "vmov.i8 d0, #0\n" in Pack()
6691 "vld1.16 {d0[0]}, [%[in]], %[stride]\n" in Pack()
6692 "vld1.16 {d0[1]}, [%[in]], %[stride]\n" in Pack()
6693 "vld1.16 {d0[2]}, [%[in]], %[stride]\n" in Pack()
6694 "vld1.16 {d0[3]}, [%[in]], %[stride]\n" in Pack()
6697 "vuzp.8 d0, d1\n" in Pack()
6698 "vaddw.u8 q8, q8, d0\n" in Pack()
6700 "vst1.32 {d0, d1}, [%[out]:128]!\n" in Pack()
6703 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
6710 "vmul.i32 q8, q8, d0[0]\n" in Pack()
6717 : "d0", "d1", "d2", "d3", "d16", "d17", "d18", "d19", "cc", "memory"); in Pack()
6746 "vld1.16 {d0[0]}, [%[in]], %[stride]\n" in Pack()
6747 "vld1.16 {d0[1]}, [%[in]], %[stride]\n" in Pack()
6748 "vld1.16 {d0[2]}, [%[in]], %[stride]\n" in Pack()
6749 "vld1.16 {d0[3]}, [%[in]], %[stride]\n" in Pack()
6755 "vuzp.8 d0, d1\n" in Pack()
6756 "vaddw.u8 q8, q8, d0\n" in Pack()
6758 "vst1.32 {d0, d1}, [%[out]:128]!\n" in Pack()
6765 "vmov.i8 d0, #0\n" in Pack()
6767 "vld1.16 {d0[0]}, [%[in]], %[stride]\n" in Pack()
6768 "vld1.16 {d0[1]}, [%[in]], %[stride]\n" in Pack()
6769 "vld1.16 {d0[2]}, [%[in]], %[stride]\n" in Pack()
6770 "vld1.16 {d0[3]}, [%[in]], %[stride]\n" in Pack()
6774 "vuzp.8 d0, d1\n" in Pack()
6775 "vaddw.u8 q8, q8, d0\n" in Pack()
6777 "vst1.32 {d0, d1}, [%[out]:128]!\n" in Pack()
6780 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
6787 "vmul.i32 q8, q8, d0[0]\n" in Pack()
6794 : "d0", "d1", "d2", "d3", "d16", "d17", "d18", "d19", "cc", "memory"); in Pack()
6823 "vld1.16 {d0[0]}, [%[in]], %[stride]\n" in Pack()
6824 "vld1.16 {d0[1]}, [%[in]], %[stride]\n" in Pack()
6825 "vld1.16 {d0[2]}, [%[in]], %[stride]\n" in Pack()
6826 "vld1.16 {d0[3]}, [%[in]], %[stride]\n" in Pack()
6832 "vuzp.8 d0, d1\n" in Pack()
6833 "vaddw.u8 q8, q8, d0\n" in Pack()
6835 "vst1.32 {d0, d1}, [%[out]:128]!\n" in Pack()
6842 "vmov.i8 d0, #0\n" in Pack()
6844 "vld1.16 {d0[0]}, [%[in]], %[stride]\n" in Pack()
6845 "vld1.16 {d0[1]}, [%[in]], %[stride]\n" in Pack()
6846 "vld1.16 {d0[2]}, [%[in]], %[stride]\n" in Pack()
6847 "vld1.16 {d0[3]}, [%[in]], %[stride]\n" in Pack()
6852 "vuzp.8 d0, d1\n" in Pack()
6853 "vaddw.u8 q8, q8, d0\n" in Pack()
6855 "vst1.32 {d0, d1}, [%[out]:128]!\n" in Pack()
6858 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
6865 "vmul.i32 q8, q8, d0[0]\n" in Pack()
6872 : "d0", "d1", "d2", "d3", "d16", "d17", "d18", "d19", "cc", "memory"); in Pack()
6898 "vld3.8 {d0[0], d1[0], d2[0]}, [%[in]], %[stride]\n" in Pack()
6899 "vld3.8 {d0[1], d1[1], d2[1]}, [%[in]], %[stride]\n" in Pack()
6900 "vld3.8 {d0[2], d1[2], d2[2]}, [%[in]], %[stride]\n" in Pack()
6901 "vld3.8 {d0[3], d1[3], d2[3]}, [%[in]], %[stride]\n" in Pack()
6902 "vld3.8 {d0[4], d1[4], d2[4]}, [%[in]], %[stride]\n" in Pack()
6903 "vld3.8 {d0[5], d1[5], d2[5]}, [%[in]], %[stride]\n" in Pack()
6904 "vld3.8 {d0[6], d1[6], d2[6]}, [%[in]], %[stride]\n" in Pack()
6905 "vld3.8 {d0[7], d1[7], d2[7]}, [%[in]], %[stride]\n" in Pack()
6906 "vaddw.u8 q8, q8, d0\n" in Pack()
6909 "vst1.32 {d0, d1, d2}, [%[out]:64]!\n" in Pack()
6914 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
6924 "vmul.i32 q8, q8, d0[0]\n" in Pack()
6931 : "d0", "d1", "d2", "d3", "d16", "d17", "d18", "d19", "d20", "d21", "cc", in Pack()
6962 "vld3.8 {d0[0], d1[0], d2[0]}, [%[in]], %[stride]\n" in Pack()
6963 "vld3.8 {d0[1], d1[1], d2[1]}, [%[in]], %[stride]\n" in Pack()
6964 "vld3.8 {d0[2], d1[2], d2[2]}, [%[in]], %[stride]\n" in Pack()
6965 "vld3.8 {d0[3], d1[3], d2[3]}, [%[in]], %[stride]\n" in Pack()
6966 "vld3.8 {d0[4], d1[4], d2[4]}, [%[in]], %[stride]\n" in Pack()
6967 "vld3.8 {d0[5], d1[5], d2[5]}, [%[in]], %[stride]\n" in Pack()
6968 "vld3.8 {d0[6], d1[6], d2[6]}, [%[in]], %[stride]\n" in Pack()
6969 "vld3.8 {d0[7], d1[7], d2[7]}, [%[in]], %[stride]\n" in Pack()
6970 "vaddw.u8 q8, q8, d0\n" in Pack()
6973 "vst1.32 {d0, d1, d2}, [%[out]:64]!\n" in Pack()
6980 "vmov.i8 d0, #0\n" in Pack()
6983 "vld3.8 {d0[0], d1[0], d2[0]}, [%[in]], %[stride]\n" in Pack()
6984 "vaddw.u8 q8, q8, d0\n" in Pack()
6987 "vst1.32 {d0, d1, d2}, [%[out]:64]!\n" in Pack()
6990 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
7000 "vmul.i32 q8, q8, d0[0]\n" in Pack()
7007 : "d0", "d1", "d2", "d3", "d16", "d17", "d18", "d19", "d20", "d21", "cc", in Pack()
7038 "vld3.8 {d0[0], d1[0], d2[0]}, [%[in]], %[stride]\n" in Pack()
7039 "vld3.8 {d0[1], d1[1], d2[1]}, [%[in]], %[stride]\n" in Pack()
7040 "vld3.8 {d0[2], d1[2], d2[2]}, [%[in]], %[stride]\n" in Pack()
7041 "vld3.8 {d0[3], d1[3], d2[3]}, [%[in]], %[stride]\n" in Pack()
7042 "vld3.8 {d0[4], d1[4], d2[4]}, [%[in]], %[stride]\n" in Pack()
7043 "vld3.8 {d0[5], d1[5], d2[5]}, [%[in]], %[stride]\n" in Pack()
7044 "vld3.8 {d0[6], d1[6], d2[6]}, [%[in]], %[stride]\n" in Pack()
7045 "vld3.8 {d0[7], d1[7], d2[7]}, [%[in]], %[stride]\n" in Pack()
7046 "vaddw.u8 q8, q8, d0\n" in Pack()
7049 "vst1.32 {d0, d1, d2}, [%[out]:64]!\n" in Pack()
7056 "vmov.i8 d0, #0\n" in Pack()
7059 "vld3.8 {d0[0], d1[0], d2[0]}, [%[in]], %[stride]\n" in Pack()
7060 "vld3.8 {d0[1], d1[1], d2[1]}, [%[in]], %[stride]\n" in Pack()
7061 "vaddw.u8 q8, q8, d0\n" in Pack()
7064 "vst1.32 {d0, d1, d2}, [%[out]:64]!\n" in Pack()
7067 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
7077 "vmul.i32 q8, q8, d0[0]\n" in Pack()
7084 : "d0", "d1", "d2", "d3", "d16", "d17", "d18", "d19", "d20", "d21", "cc", in Pack()
7115 "vld3.8 {d0[0], d1[0], d2[0]}, [%[in]], %[stride]\n" in Pack()
7116 "vld3.8 {d0[1], d1[1], d2[1]}, [%[in]], %[stride]\n" in Pack()
7117 "vld3.8 {d0[2], d1[2], d2[2]}, [%[in]], %[stride]\n" in Pack()
7118 "vld3.8 {d0[3], d1[3], d2[3]}, [%[in]], %[stride]\n" in Pack()
7119 "vld3.8 {d0[4], d1[4], d2[4]}, [%[in]], %[stride]\n" in Pack()
7120 "vld3.8 {d0[5], d1[5], d2[5]}, [%[in]], %[stride]\n" in Pack()
7121 "vld3.8 {d0[6], d1[6], d2[6]}, [%[in]], %[stride]\n" in Pack()
7122 "vld3.8 {d0[7], d1[7], d2[7]}, [%[in]], %[stride]\n" in Pack()
7123 "vaddw.u8 q8, q8, d0\n" in Pack()
7126 "vst1.32 {d0, d1, d2}, [%[out]:64]!\n" in Pack()
7133 "vmov.i8 d0, #0\n" in Pack()
7136 "vld3.8 {d0[0], d1[0], d2[0]}, [%[in]], %[stride]\n" in Pack()
7137 "vld3.8 {d0[1], d1[1], d2[1]}, [%[in]], %[stride]\n" in Pack()
7138 "vld3.8 {d0[2], d1[2], d2[2]}, [%[in]], %[stride]\n" in Pack()
7139 "vaddw.u8 q8, q8, d0\n" in Pack()
7142 "vst1.32 {d0, d1, d2}, [%[out]:64]!\n" in Pack()
7145 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
7155 "vmul.i32 q8, q8, d0[0]\n" in Pack()
7162 : "d0", "d1", "d2", "d3", "d16", "d17", "d18", "d19", "d20", "d21", "cc", in Pack()
7193 "vld3.8 {d0[0], d1[0], d2[0]}, [%[in]], %[stride]\n" in Pack()
7194 "vld3.8 {d0[1], d1[1], d2[1]}, [%[in]], %[stride]\n" in Pack()
7195 "vld3.8 {d0[2], d1[2], d2[2]}, [%[in]], %[stride]\n" in Pack()
7196 "vld3.8 {d0[3], d1[3], d2[3]}, [%[in]], %[stride]\n" in Pack()
7197 "vld3.8 {d0[4], d1[4], d2[4]}, [%[in]], %[stride]\n" in Pack()
7198 "vld3.8 {d0[5], d1[5], d2[5]}, [%[in]], %[stride]\n" in Pack()
7199 "vld3.8 {d0[6], d1[6], d2[6]}, [%[in]], %[stride]\n" in Pack()
7200 "vld3.8 {d0[7], d1[7], d2[7]}, [%[in]], %[stride]\n" in Pack()
7201 "vaddw.u8 q8, q8, d0\n" in Pack()
7204 "vst1.32 {d0, d1, d2}, [%[out]:64]!\n" in Pack()
7211 "vmov.i8 d0, #0\n" in Pack()
7214 "vld3.8 {d0[0], d1[0], d2[0]}, [%[in]], %[stride]\n" in Pack()
7215 "vld3.8 {d0[1], d1[1], d2[1]}, [%[in]], %[stride]\n" in Pack()
7216 "vld3.8 {d0[2], d1[2], d2[2]}, [%[in]], %[stride]\n" in Pack()
7217 "vld3.8 {d0[3], d1[3], d2[3]}, [%[in]], %[stride]\n" in Pack()
7218 "vaddw.u8 q8, q8, d0\n" in Pack()
7221 "vst1.32 {d0, d1, d2}, [%[out]:64]!\n" in Pack()
7224 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
7234 "vmul.i32 q8, q8, d0[0]\n" in Pack()
7241 : "d0", "d1", "d2", "d3", "d16", "d17", "d18", "d19", "d20", "d21", "cc", in Pack()
7272 "vld3.8 {d0[0], d1[0], d2[0]}, [%[in]], %[stride]\n" in Pack()
7273 "vld3.8 {d0[1], d1[1], d2[1]}, [%[in]], %[stride]\n" in Pack()
7274 "vld3.8 {d0[2], d1[2], d2[2]}, [%[in]], %[stride]\n" in Pack()
7275 "vld3.8 {d0[3], d1[3], d2[3]}, [%[in]], %[stride]\n" in Pack()
7276 "vld3.8 {d0[4], d1[4], d2[4]}, [%[in]], %[stride]\n" in Pack()
7277 "vld3.8 {d0[5], d1[5], d2[5]}, [%[in]], %[stride]\n" in Pack()
7278 "vld3.8 {d0[6], d1[6], d2[6]}, [%[in]], %[stride]\n" in Pack()
7279 "vld3.8 {d0[7], d1[7], d2[7]}, [%[in]], %[stride]\n" in Pack()
7280 "vaddw.u8 q8, q8, d0\n" in Pack()
7283 "vst1.32 {d0, d1, d2}, [%[out]:64]!\n" in Pack()
7290 "vmov.i8 d0, #0\n" in Pack()
7293 "vld3.8 {d0[0], d1[0], d2[0]}, [%[in]], %[stride]\n" in Pack()
7294 "vld3.8 {d0[1], d1[1], d2[1]}, [%[in]], %[stride]\n" in Pack()
7295 "vld3.8 {d0[2], d1[2], d2[2]}, [%[in]], %[stride]\n" in Pack()
7296 "vld3.8 {d0[3], d1[3], d2[3]}, [%[in]], %[stride]\n" in Pack()
7297 "vld3.8 {d0[4], d1[4], d2[4]}, [%[in]], %[stride]\n" in Pack()
7298 "vaddw.u8 q8, q8, d0\n" in Pack()
7301 "vst1.32 {d0, d1, d2}, [%[out]:64]!\n" in Pack()
7304 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
7314 "vmul.i32 q8, q8, d0[0]\n" in Pack()
7321 : "d0", "d1", "d2", "d3", "d16", "d17", "d18", "d19", "d20", "d21", "cc", in Pack()
7352 "vld3.8 {d0[0], d1[0], d2[0]}, [%[in]], %[stride]\n" in Pack()
7353 "vld3.8 {d0[1], d1[1], d2[1]}, [%[in]], %[stride]\n" in Pack()
7354 "vld3.8 {d0[2], d1[2], d2[2]}, [%[in]], %[stride]\n" in Pack()
7355 "vld3.8 {d0[3], d1[3], d2[3]}, [%[in]], %[stride]\n" in Pack()
7356 "vld3.8 {d0[4], d1[4], d2[4]}, [%[in]], %[stride]\n" in Pack()
7357 "vld3.8 {d0[5], d1[5], d2[5]}, [%[in]], %[stride]\n" in Pack()
7358 "vld3.8 {d0[6], d1[6], d2[6]}, [%[in]], %[stride]\n" in Pack()
7359 "vld3.8 {d0[7], d1[7], d2[7]}, [%[in]], %[stride]\n" in Pack()
7360 "vaddw.u8 q8, q8, d0\n" in Pack()
7363 "vst1.32 {d0, d1, d2}, [%[out]:64]!\n" in Pack()
7370 "vmov.i8 d0, #0\n" in Pack()
7373 "vld3.8 {d0[0], d1[0], d2[0]}, [%[in]], %[stride]\n" in Pack()
7374 "vld3.8 {d0[1], d1[1], d2[1]}, [%[in]], %[stride]\n" in Pack()
7375 "vld3.8 {d0[2], d1[2], d2[2]}, [%[in]], %[stride]\n" in Pack()
7376 "vld3.8 {d0[3], d1[3], d2[3]}, [%[in]], %[stride]\n" in Pack()
7377 "vld3.8 {d0[4], d1[4], d2[4]}, [%[in]], %[stride]\n" in Pack()
7378 "vld3.8 {d0[5], d1[5], d2[5]}, [%[in]], %[stride]\n" in Pack()
7379 "vaddw.u8 q8, q8, d0\n" in Pack()
7382 "vst1.32 {d0, d1, d2}, [%[out]:64]!\n" in Pack()
7385 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
7395 "vmul.i32 q8, q8, d0[0]\n" in Pack()
7402 : "d0", "d1", "d2", "d3", "d16", "d17", "d18", "d19", "d20", "d21", "cc", in Pack()
7433 "vld3.8 {d0[0], d1[0], d2[0]}, [%[in]], %[stride]\n" in Pack()
7434 "vld3.8 {d0[1], d1[1], d2[1]}, [%[in]], %[stride]\n" in Pack()
7435 "vld3.8 {d0[2], d1[2], d2[2]}, [%[in]], %[stride]\n" in Pack()
7436 "vld3.8 {d0[3], d1[3], d2[3]}, [%[in]], %[stride]\n" in Pack()
7437 "vld3.8 {d0[4], d1[4], d2[4]}, [%[in]], %[stride]\n" in Pack()
7438 "vld3.8 {d0[5], d1[5], d2[5]}, [%[in]], %[stride]\n" in Pack()
7439 "vld3.8 {d0[6], d1[6], d2[6]}, [%[in]], %[stride]\n" in Pack()
7440 "vld3.8 {d0[7], d1[7], d2[7]}, [%[in]], %[stride]\n" in Pack()
7441 "vaddw.u8 q8, q8, d0\n" in Pack()
7444 "vst1.32 {d0, d1, d2}, [%[out]:64]!\n" in Pack()
7451 "vmov.i8 d0, #0\n" in Pack()
7454 "vld3.8 {d0[0], d1[0], d2[0]}, [%[in]], %[stride]\n" in Pack()
7455 "vld3.8 {d0[1], d1[1], d2[1]}, [%[in]], %[stride]\n" in Pack()
7456 "vld3.8 {d0[2], d1[2], d2[2]}, [%[in]], %[stride]\n" in Pack()
7457 "vld3.8 {d0[3], d1[3], d2[3]}, [%[in]], %[stride]\n" in Pack()
7458 "vld3.8 {d0[4], d1[4], d2[4]}, [%[in]], %[stride]\n" in Pack()
7459 "vld3.8 {d0[5], d1[5], d2[5]}, [%[in]], %[stride]\n" in Pack()
7460 "vld3.8 {d0[6], d1[6], d2[6]}, [%[in]], %[stride]\n" in Pack()
7461 "vaddw.u8 q8, q8, d0\n" in Pack()
7464 "vst1.32 {d0, d1, d2}, [%[out]:64]!\n" in Pack()
7467 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
7477 "vmul.i32 q8, q8, d0[0]\n" in Pack()
7484 : "d0", "d1", "d2", "d3", "d16", "d17", "d18", "d19", "d20", "d21", "cc", in Pack()
7512 "vld1.32 {d0[0]}, [%[in]], %[stride]\n" in Pack()
7516 "vld1.32 {d0[1]}, [%[in]], %[stride]\n" in Pack()
7521 "vtrn.16 d0, d2\n" in Pack()
7523 "vtrn.8 d0, d1\n" in Pack()
7525 "vaddw.u8 q8, q8, d0\n" in Pack()
7529 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
7534 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
7546 "vmul.i32 q8, q8, d0[0]\n" in Pack()
7553 : "d0", "d1", "d2", "d3", "d16", "d17", "d18", "d19", "d20", "d21", "d22", in Pack()
7585 "vld1.32 {d0[0]}, [%[in]], %[stride]\n" in Pack()
7589 "vld1.32 {d0[1]}, [%[in]], %[stride]\n" in Pack()
7594 "vtrn.16 d0, d2\n" in Pack()
7596 "vtrn.8 d0, d1\n" in Pack()
7598 "vaddw.u8 q8, q8, d0\n" in Pack()
7602 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
7609 "vmov.i8 d0, #0\n" in Pack()
7613 "vld1.32 {d0[0]}, [%[in]], %[stride]\n" in Pack()
7615 "vtrn.16 d0, d2\n" in Pack()
7617 "vtrn.8 d0, d1\n" in Pack()
7619 "vaddw.u8 q8, q8, d0\n" in Pack()
7623 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
7626 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
7638 "vmul.i32 q8, q8, d0[0]\n" in Pack()
7645 : "d0", "d1", "d2", "d3", "d16", "d17", "d18", "d19", "d20", "d21", "d22", in Pack()
7677 "vld1.32 {d0[0]}, [%[in]], %[stride]\n" in Pack()
7681 "vld1.32 {d0[1]}, [%[in]], %[stride]\n" in Pack()
7686 "vtrn.16 d0, d2\n" in Pack()
7688 "vtrn.8 d0, d1\n" in Pack()
7690 "vaddw.u8 q8, q8, d0\n" in Pack()
7694 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
7701 "vmov.i8 d0, #0\n" in Pack()
7705 "vld1.32 {d0[0]}, [%[in]], %[stride]\n" in Pack()
7708 "vtrn.16 d0, d2\n" in Pack()
7710 "vtrn.8 d0, d1\n" in Pack()
7712 "vaddw.u8 q8, q8, d0\n" in Pack()
7716 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
7719 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
7731 "vmul.i32 q8, q8, d0[0]\n" in Pack()
7738 : "d0", "d1", "d2", "d3", "d16", "d17", "d18", "d19", "d20", "d21", "d22", in Pack()
7770 "vld1.32 {d0[0]}, [%[in]], %[stride]\n" in Pack()
7774 "vld1.32 {d0[1]}, [%[in]], %[stride]\n" in Pack()
7779 "vtrn.16 d0, d2\n" in Pack()
7781 "vtrn.8 d0, d1\n" in Pack()
7783 "vaddw.u8 q8, q8, d0\n" in Pack()
7787 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
7794 "vmov.i8 d0, #0\n" in Pack()
7798 "vld1.32 {d0[0]}, [%[in]], %[stride]\n" in Pack()
7802 "vtrn.16 d0, d2\n" in Pack()
7804 "vtrn.8 d0, d1\n" in Pack()
7806 "vaddw.u8 q8, q8, d0\n" in Pack()
7810 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
7813 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
7825 "vmul.i32 q8, q8, d0[0]\n" in Pack()
7832 : "d0", "d1", "d2", "d3", "d16", "d17", "d18", "d19", "d20", "d21", "d22", in Pack()
7864 "vld1.32 {d0[0]}, [%[in]], %[stride]\n" in Pack()
7868 "vld1.32 {d0[1]}, [%[in]], %[stride]\n" in Pack()
7873 "vtrn.16 d0, d2\n" in Pack()
7875 "vtrn.8 d0, d1\n" in Pack()
7877 "vaddw.u8 q8, q8, d0\n" in Pack()
7881 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
7888 "vmov.i8 d0, #0\n" in Pack()
7892 "vld1.32 {d0[0]}, [%[in]], %[stride]\n" in Pack()
7897 "vtrn.16 d0, d2\n" in Pack()
7899 "vtrn.8 d0, d1\n" in Pack()
7901 "vaddw.u8 q8, q8, d0\n" in Pack()
7905 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
7908 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
7920 "vmul.i32 q8, q8, d0[0]\n" in Pack()
7927 : "d0", "d1", "d2", "d3", "d16", "d17", "d18", "d19", "d20", "d21", "d22", in Pack()
7959 "vld1.32 {d0[0]}, [%[in]], %[stride]\n" in Pack()
7963 "vld1.32 {d0[1]}, [%[in]], %[stride]\n" in Pack()
7968 "vtrn.16 d0, d2\n" in Pack()
7970 "vtrn.8 d0, d1\n" in Pack()
7972 "vaddw.u8 q8, q8, d0\n" in Pack()
7976 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
7983 "vmov.i8 d0, #0\n" in Pack()
7987 "vld1.32 {d0[0]}, [%[in]], %[stride]\n" in Pack()
7991 "vld1.32 {d0[1]}, [%[in]], %[stride]\n" in Pack()
7993 "vtrn.16 d0, d2\n" in Pack()
7995 "vtrn.8 d0, d1\n" in Pack()
7997 "vaddw.u8 q8, q8, d0\n" in Pack()
8001 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
8004 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
8016 "vmul.i32 q8, q8, d0[0]\n" in Pack()
8023 : "d0", "d1", "d2", "d3", "d16", "d17", "d18", "d19", "d20", "d21", "d22", in Pack()
8055 "vld1.32 {d0[0]}, [%[in]], %[stride]\n" in Pack()
8059 "vld1.32 {d0[1]}, [%[in]], %[stride]\n" in Pack()
8064 "vtrn.16 d0, d2\n" in Pack()
8066 "vtrn.8 d0, d1\n" in Pack()
8068 "vaddw.u8 q8, q8, d0\n" in Pack()
8072 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
8079 "vmov.i8 d0, #0\n" in Pack()
8083 "vld1.32 {d0[0]}, [%[in]], %[stride]\n" in Pack()
8087 "vld1.32 {d0[1]}, [%[in]], %[stride]\n" in Pack()
8090 "vtrn.16 d0, d2\n" in Pack()
8092 "vtrn.8 d0, d1\n" in Pack()
8094 "vaddw.u8 q8, q8, d0\n" in Pack()
8098 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
8101 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
8113 "vmul.i32 q8, q8, d0[0]\n" in Pack()
8120 : "d0", "d1", "d2", "d3", "d16", "d17", "d18", "d19", "d20", "d21", "d22", in Pack()
8152 "vld1.32 {d0[0]}, [%[in]], %[stride]\n" in Pack()
8156 "vld1.32 {d0[1]}, [%[in]], %[stride]\n" in Pack()
8161 "vtrn.16 d0, d2\n" in Pack()
8163 "vtrn.8 d0, d1\n" in Pack()
8165 "vaddw.u8 q8, q8, d0\n" in Pack()
8169 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
8176 "vmov.i8 d0, #0\n" in Pack()
8180 "vld1.32 {d0[0]}, [%[in]], %[stride]\n" in Pack()
8184 "vld1.32 {d0[1]}, [%[in]], %[stride]\n" in Pack()
8188 "vtrn.16 d0, d2\n" in Pack()
8190 "vtrn.8 d0, d1\n" in Pack()
8192 "vaddw.u8 q8, q8, d0\n" in Pack()
8196 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
8199 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
8211 "vmul.i32 q8, q8, d0[0]\n" in Pack()
8218 : "d0", "d1", "d2", "d3", "d16", "d17", "d18", "d19", "d20", "d21", "d22", in Pack()
8248 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
8256 "vld1.32 {d0[1]}, [%[in]]!\n" in Pack()
8265 "vtrn.16 d0, d2\n" in Pack()
8267 "vtrn.8 d0, d1\n" in Pack()
8269 "vaddw.u8 q8, q8, d0\n" in Pack()
8274 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
8280 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
8295 "vmul.i32 q8, q8, d0[0]\n" in Pack()
8296 "vmul.i32 q9, q9, d0[0]\n" in Pack()
8304 : "d0", "d1", "d2", "d3", "d4", "d16", "d17", "d18", "d19", "d20", "d21", in Pack()
8338 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
8346 "vld1.32 {d0[1]}, [%[in]]!\n" in Pack()
8355 "vtrn.16 d0, d2\n" in Pack()
8357 "vtrn.8 d0, d1\n" in Pack()
8359 "vaddw.u8 q8, q8, d0\n" in Pack()
8364 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
8372 "vmov.i8 d0, #0\n" in Pack()
8377 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
8380 "vtrn.16 d0, d2\n" in Pack()
8382 "vtrn.8 d0, d1\n" in Pack()
8384 "vaddw.u8 q8, q8, d0\n" in Pack()
8389 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
8393 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
8408 "vmul.i32 q8, q8, d0[0]\n" in Pack()
8409 "vmul.i32 q9, q9, d0[0]\n" in Pack()
8417 : "d0", "d1", "d2", "d3", "d4", "d16", "d17", "d18", "d19", "d20", "d21", in Pack()
8451 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
8459 "vld1.32 {d0[1]}, [%[in]]!\n" in Pack()
8468 "vtrn.16 d0, d2\n" in Pack()
8470 "vtrn.8 d0, d1\n" in Pack()
8472 "vaddw.u8 q8, q8, d0\n" in Pack()
8477 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
8485 "vmov.i8 d0, #0\n" in Pack()
8490 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
8495 "vtrn.16 d0, d2\n" in Pack()
8497 "vtrn.8 d0, d1\n" in Pack()
8499 "vaddw.u8 q8, q8, d0\n" in Pack()
8504 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
8508 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
8523 "vmul.i32 q8, q8, d0[0]\n" in Pack()
8524 "vmul.i32 q9, q9, d0[0]\n" in Pack()
8532 : "d0", "d1", "d2", "d3", "d4", "d16", "d17", "d18", "d19", "d20", "d21", in Pack()
8566 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
8574 "vld1.32 {d0[1]}, [%[in]]!\n" in Pack()
8583 "vtrn.16 d0, d2\n" in Pack()
8585 "vtrn.8 d0, d1\n" in Pack()
8587 "vaddw.u8 q8, q8, d0\n" in Pack()
8592 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
8600 "vmov.i8 d0, #0\n" in Pack()
8605 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
8612 "vtrn.16 d0, d2\n" in Pack()
8614 "vtrn.8 d0, d1\n" in Pack()
8616 "vaddw.u8 q8, q8, d0\n" in Pack()
8621 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
8625 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
8640 "vmul.i32 q8, q8, d0[0]\n" in Pack()
8641 "vmul.i32 q9, q9, d0[0]\n" in Pack()
8649 : "d0", "d1", "d2", "d3", "d4", "d16", "d17", "d18", "d19", "d20", "d21", in Pack()
8683 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
8691 "vld1.32 {d0[1]}, [%[in]]!\n" in Pack()
8700 "vtrn.16 d0, d2\n" in Pack()
8702 "vtrn.8 d0, d1\n" in Pack()
8704 "vaddw.u8 q8, q8, d0\n" in Pack()
8709 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
8717 "vmov.i8 d0, #0\n" in Pack()
8722 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
8731 "vtrn.16 d0, d2\n" in Pack()
8733 "vtrn.8 d0, d1\n" in Pack()
8735 "vaddw.u8 q8, q8, d0\n" in Pack()
8740 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
8744 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
8759 "vmul.i32 q8, q8, d0[0]\n" in Pack()
8760 "vmul.i32 q9, q9, d0[0]\n" in Pack()
8768 : "d0", "d1", "d2", "d3", "d4", "d16", "d17", "d18", "d19", "d20", "d21", in Pack()
8802 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
8810 "vld1.32 {d0[1]}, [%[in]]!\n" in Pack()
8819 "vtrn.16 d0, d2\n" in Pack()
8821 "vtrn.8 d0, d1\n" in Pack()
8823 "vaddw.u8 q8, q8, d0\n" in Pack()
8828 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
8836 "vmov.i8 d0, #0\n" in Pack()
8841 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
8849 "vld1.32 {d0[1]}, [%[in]]!\n" in Pack()
8852 "vtrn.16 d0, d2\n" in Pack()
8854 "vtrn.8 d0, d1\n" in Pack()
8856 "vaddw.u8 q8, q8, d0\n" in Pack()
8861 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
8865 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
8880 "vmul.i32 q8, q8, d0[0]\n" in Pack()
8881 "vmul.i32 q9, q9, d0[0]\n" in Pack()
8889 : "d0", "d1", "d2", "d3", "d4", "d16", "d17", "d18", "d19", "d20", "d21", in Pack()
8923 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
8931 "vld1.32 {d0[1]}, [%[in]]!\n" in Pack()
8940 "vtrn.16 d0, d2\n" in Pack()
8942 "vtrn.8 d0, d1\n" in Pack()
8944 "vaddw.u8 q8, q8, d0\n" in Pack()
8949 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
8957 "vmov.i8 d0, #0\n" in Pack()
8962 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
8970 "vld1.32 {d0[1]}, [%[in]]!\n" in Pack()
8975 "vtrn.16 d0, d2\n" in Pack()
8977 "vtrn.8 d0, d1\n" in Pack()
8979 "vaddw.u8 q8, q8, d0\n" in Pack()
8984 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
8988 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
9003 "vmul.i32 q8, q8, d0[0]\n" in Pack()
9004 "vmul.i32 q9, q9, d0[0]\n" in Pack()
9012 : "d0", "d1", "d2", "d3", "d4", "d16", "d17", "d18", "d19", "d20", "d21", in Pack()
9046 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
9054 "vld1.32 {d0[1]}, [%[in]]!\n" in Pack()
9063 "vtrn.16 d0, d2\n" in Pack()
9065 "vtrn.8 d0, d1\n" in Pack()
9067 "vaddw.u8 q8, q8, d0\n" in Pack()
9072 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
9080 "vmov.i8 d0, #0\n" in Pack()
9085 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
9093 "vld1.32 {d0[1]}, [%[in]]!\n" in Pack()
9100 "vtrn.16 d0, d2\n" in Pack()
9102 "vtrn.8 d0, d1\n" in Pack()
9104 "vaddw.u8 q8, q8, d0\n" in Pack()
9109 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
9113 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
9128 "vmul.i32 q8, q8, d0[0]\n" in Pack()
9129 "vmul.i32 q9, q9, d0[0]\n" in Pack()
9137 : "d0", "d1", "d2", "d3", "d4", "d16", "d17", "d18", "d19", "d20", "d21", in Pack()
9168 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
9176 "vld1.32 {d0[1]}, [%[in]]!\n" in Pack()
9185 "vtrn.16 d0, d2\n" in Pack()
9188 "vtrn.8 d0, d1\n" in Pack()
9190 "vaddw.u8 q8, q8, d0\n" in Pack()
9196 "vst1.32 {d0, d1, d2, d3}, [%[out]:128]!\n" in Pack()
9202 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
9219 "vmul.i32 q8, q8, d0[0]\n" in Pack()
9220 "vmul.i32 q9, q9, d0[0]\n" in Pack()
9228 : "d0", "d1", "d2", "d3", "d4", "d5", "d16", "d17", "d18", "d19", "d20", in Pack()
9263 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
9271 "vld1.32 {d0[1]}, [%[in]]!\n" in Pack()
9280 "vtrn.16 d0, d2\n" in Pack()
9283 "vtrn.8 d0, d1\n" in Pack()
9285 "vaddw.u8 q8, q8, d0\n" in Pack()
9291 "vst1.32 {d0, d1, d2, d3}, [%[out]:128]!\n" in Pack()
9299 "vmov.i8 d0, #0\n" in Pack()
9305 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
9308 "vtrn.16 d0, d2\n" in Pack()
9311 "vtrn.8 d0, d1\n" in Pack()
9313 "vaddw.u8 q8, q8, d0\n" in Pack()
9319 "vst1.32 {d0, d1, d2, d3}, [%[out]:128]!\n" in Pack()
9323 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
9340 "vmul.i32 q8, q8, d0[0]\n" in Pack()
9341 "vmul.i32 q9, q9, d0[0]\n" in Pack()
9349 : "d0", "d1", "d2", "d3", "d4", "d5", "d16", "d17", "d18", "d19", "d20", in Pack()
9384 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
9392 "vld1.32 {d0[1]}, [%[in]]!\n" in Pack()
9401 "vtrn.16 d0, d2\n" in Pack()
9404 "vtrn.8 d0, d1\n" in Pack()
9406 "vaddw.u8 q8, q8, d0\n" in Pack()
9412 "vst1.32 {d0, d1, d2, d3}, [%[out]:128]!\n" in Pack()
9420 "vmov.i8 d0, #0\n" in Pack()
9426 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
9431 "vtrn.16 d0, d2\n" in Pack()
9434 "vtrn.8 d0, d1\n" in Pack()
9436 "vaddw.u8 q8, q8, d0\n" in Pack()
9442 "vst1.32 {d0, d1, d2, d3}, [%[out]:128]!\n" in Pack()
9446 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
9463 "vmul.i32 q8, q8, d0[0]\n" in Pack()
9464 "vmul.i32 q9, q9, d0[0]\n" in Pack()
9472 : "d0", "d1", "d2", "d3", "d4", "d5", "d16", "d17", "d18", "d19", "d20", in Pack()
9507 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
9515 "vld1.32 {d0[1]}, [%[in]]!\n" in Pack()
9524 "vtrn.16 d0, d2\n" in Pack()
9527 "vtrn.8 d0, d1\n" in Pack()
9529 "vaddw.u8 q8, q8, d0\n" in Pack()
9535 "vst1.32 {d0, d1, d2, d3}, [%[out]:128]!\n" in Pack()
9543 "vmov.i8 d0, #0\n" in Pack()
9549 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
9556 "vtrn.16 d0, d2\n" in Pack()
9559 "vtrn.8 d0, d1\n" in Pack()
9561 "vaddw.u8 q8, q8, d0\n" in Pack()
9567 "vst1.32 {d0, d1, d2, d3}, [%[out]:128]!\n" in Pack()
9571 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
9588 "vmul.i32 q8, q8, d0[0]\n" in Pack()
9589 "vmul.i32 q9, q9, d0[0]\n" in Pack()
9597 : "d0", "d1", "d2", "d3", "d4", "d5", "d16", "d17", "d18", "d19", "d20", in Pack()
9632 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
9640 "vld1.32 {d0[1]}, [%[in]]!\n" in Pack()
9649 "vtrn.16 d0, d2\n" in Pack()
9652 "vtrn.8 d0, d1\n" in Pack()
9654 "vaddw.u8 q8, q8, d0\n" in Pack()
9660 "vst1.32 {d0, d1, d2, d3}, [%[out]:128]!\n" in Pack()
9668 "vmov.i8 d0, #0\n" in Pack()
9674 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
9683 "vtrn.16 d0, d2\n" in Pack()
9686 "vtrn.8 d0, d1\n" in Pack()
9688 "vaddw.u8 q8, q8, d0\n" in Pack()
9694 "vst1.32 {d0, d1, d2, d3}, [%[out]:128]!\n" in Pack()
9698 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
9715 "vmul.i32 q8, q8, d0[0]\n" in Pack()
9716 "vmul.i32 q9, q9, d0[0]\n" in Pack()
9724 : "d0", "d1", "d2", "d3", "d4", "d5", "d16", "d17", "d18", "d19", "d20", in Pack()
9759 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
9767 "vld1.32 {d0[1]}, [%[in]]!\n" in Pack()
9776 "vtrn.16 d0, d2\n" in Pack()
9779 "vtrn.8 d0, d1\n" in Pack()
9781 "vaddw.u8 q8, q8, d0\n" in Pack()
9787 "vst1.32 {d0, d1, d2, d3}, [%[out]:128]!\n" in Pack()
9795 "vmov.i8 d0, #0\n" in Pack()
9801 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
9809 "vld1.32 {d0[1]}, [%[in]]!\n" in Pack()
9812 "vtrn.16 d0, d2\n" in Pack()
9815 "vtrn.8 d0, d1\n" in Pack()
9817 "vaddw.u8 q8, q8, d0\n" in Pack()
9823 "vst1.32 {d0, d1, d2, d3}, [%[out]:128]!\n" in Pack()
9827 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
9844 "vmul.i32 q8, q8, d0[0]\n" in Pack()
9845 "vmul.i32 q9, q9, d0[0]\n" in Pack()
9853 : "d0", "d1", "d2", "d3", "d4", "d5", "d16", "d17", "d18", "d19", "d20", in Pack()
9888 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
9896 "vld1.32 {d0[1]}, [%[in]]!\n" in Pack()
9905 "vtrn.16 d0, d2\n" in Pack()
9908 "vtrn.8 d0, d1\n" in Pack()
9910 "vaddw.u8 q8, q8, d0\n" in Pack()
9916 "vst1.32 {d0, d1, d2, d3}, [%[out]:128]!\n" in Pack()
9924 "vmov.i8 d0, #0\n" in Pack()
9930 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
9938 "vld1.32 {d0[1]}, [%[in]]!\n" in Pack()
9943 "vtrn.16 d0, d2\n" in Pack()
9946 "vtrn.8 d0, d1\n" in Pack()
9948 "vaddw.u8 q8, q8, d0\n" in Pack()
9954 "vst1.32 {d0, d1, d2, d3}, [%[out]:128]!\n" in Pack()
9958 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
9975 "vmul.i32 q8, q8, d0[0]\n" in Pack()
9976 "vmul.i32 q9, q9, d0[0]\n" in Pack()
9984 : "d0", "d1", "d2", "d3", "d4", "d5", "d16", "d17", "d18", "d19", "d20", in Pack()
10019 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
10027 "vld1.32 {d0[1]}, [%[in]]!\n" in Pack()
10036 "vtrn.16 d0, d2\n" in Pack()
10039 "vtrn.8 d0, d1\n" in Pack()
10041 "vaddw.u8 q8, q8, d0\n" in Pack()
10047 "vst1.32 {d0, d1, d2, d3}, [%[out]:128]!\n" in Pack()
10055 "vmov.i8 d0, #0\n" in Pack()
10061 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
10069 "vld1.32 {d0[1]}, [%[in]]!\n" in Pack()
10076 "vtrn.16 d0, d2\n" in Pack()
10079 "vtrn.8 d0, d1\n" in Pack()
10081 "vaddw.u8 q8, q8, d0\n" in Pack()
10087 "vst1.32 {d0, d1, d2, d3}, [%[out]:128]!\n" in Pack()
10091 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
10108 "vmul.i32 q8, q8, d0[0]\n" in Pack()
10109 "vmul.i32 q9, q9, d0[0]\n" in Pack()
10117 : "d0", "d1", "d2", "d3", "d4", "d5", "d16", "d17", "d18", "d19", "d20", in Pack()
10149 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
10157 "vld1.32 {d0[1]}, [%[in]]!\n" in Pack()
10166 "vtrn.16 d0, d2\n" in Pack()
10168 "vtrn.8 d0, d1\n" in Pack()
10170 "vaddw.u8 q8, q8, d0\n" in Pack()
10177 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
10183 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
10203 "vmul.i32 q8, q8, d0[0]\n" in Pack()
10204 "vmul.i32 q9, q9, d0[0]\n" in Pack()
10212 : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d16", "d17", "d18", "d19", in Pack()
10249 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
10257 "vld1.32 {d0[1]}, [%[in]]!\n" in Pack()
10266 "vtrn.16 d0, d2\n" in Pack()
10268 "vtrn.8 d0, d1\n" in Pack()
10270 "vaddw.u8 q8, q8, d0\n" in Pack()
10277 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
10285 "vmov.i8 d0, #0\n" in Pack()
10292 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
10295 "vtrn.16 d0, d2\n" in Pack()
10297 "vtrn.8 d0, d1\n" in Pack()
10299 "vaddw.u8 q8, q8, d0\n" in Pack()
10306 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
10310 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
10330 "vmul.i32 q8, q8, d0[0]\n" in Pack()
10331 "vmul.i32 q9, q9, d0[0]\n" in Pack()
10339 : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d16", "d17", "d18", "d19", in Pack()
10376 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
10384 "vld1.32 {d0[1]}, [%[in]]!\n" in Pack()
10393 "vtrn.16 d0, d2\n" in Pack()
10395 "vtrn.8 d0, d1\n" in Pack()
10397 "vaddw.u8 q8, q8, d0\n" in Pack()
10404 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
10412 "vmov.i8 d0, #0\n" in Pack()
10419 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
10424 "vtrn.16 d0, d2\n" in Pack()
10426 "vtrn.8 d0, d1\n" in Pack()
10428 "vaddw.u8 q8, q8, d0\n" in Pack()
10435 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
10439 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
10459 "vmul.i32 q8, q8, d0[0]\n" in Pack()
10460 "vmul.i32 q9, q9, d0[0]\n" in Pack()
10468 : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d16", "d17", "d18", "d19", in Pack()
10505 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
10513 "vld1.32 {d0[1]}, [%[in]]!\n" in Pack()
10522 "vtrn.16 d0, d2\n" in Pack()
10524 "vtrn.8 d0, d1\n" in Pack()
10526 "vaddw.u8 q8, q8, d0\n" in Pack()
10533 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
10541 "vmov.i8 d0, #0\n" in Pack()
10548 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
10555 "vtrn.16 d0, d2\n" in Pack()
10557 "vtrn.8 d0, d1\n" in Pack()
10559 "vaddw.u8 q8, q8, d0\n" in Pack()
10566 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
10570 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
10590 "vmul.i32 q8, q8, d0[0]\n" in Pack()
10591 "vmul.i32 q9, q9, d0[0]\n" in Pack()
10599 : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d16", "d17", "d18", "d19", in Pack()
10636 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
10644 "vld1.32 {d0[1]}, [%[in]]!\n" in Pack()
10653 "vtrn.16 d0, d2\n" in Pack()
10655 "vtrn.8 d0, d1\n" in Pack()
10657 "vaddw.u8 q8, q8, d0\n" in Pack()
10664 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
10672 "vmov.i8 d0, #0\n" in Pack()
10679 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
10688 "vtrn.16 d0, d2\n" in Pack()
10690 "vtrn.8 d0, d1\n" in Pack()
10692 "vaddw.u8 q8, q8, d0\n" in Pack()
10699 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
10703 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
10723 "vmul.i32 q8, q8, d0[0]\n" in Pack()
10724 "vmul.i32 q9, q9, d0[0]\n" in Pack()
10732 : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d16", "d17", "d18", "d19", in Pack()
10769 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
10777 "vld1.32 {d0[1]}, [%[in]]!\n" in Pack()
10786 "vtrn.16 d0, d2\n" in Pack()
10788 "vtrn.8 d0, d1\n" in Pack()
10790 "vaddw.u8 q8, q8, d0\n" in Pack()
10797 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
10805 "vmov.i8 d0, #0\n" in Pack()
10812 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
10820 "vld1.32 {d0[1]}, [%[in]]!\n" in Pack()
10823 "vtrn.16 d0, d2\n" in Pack()
10825 "vtrn.8 d0, d1\n" in Pack()
10827 "vaddw.u8 q8, q8, d0\n" in Pack()
10834 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
10838 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
10858 "vmul.i32 q8, q8, d0[0]\n" in Pack()
10859 "vmul.i32 q9, q9, d0[0]\n" in Pack()
10867 : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d16", "d17", "d18", "d19", in Pack()
10904 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
10912 "vld1.32 {d0[1]}, [%[in]]!\n" in Pack()
10921 "vtrn.16 d0, d2\n" in Pack()
10923 "vtrn.8 d0, d1\n" in Pack()
10925 "vaddw.u8 q8, q8, d0\n" in Pack()
10932 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
10940 "vmov.i8 d0, #0\n" in Pack()
10947 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
10955 "vld1.32 {d0[1]}, [%[in]]!\n" in Pack()
10960 "vtrn.16 d0, d2\n" in Pack()
10962 "vtrn.8 d0, d1\n" in Pack()
10964 "vaddw.u8 q8, q8, d0\n" in Pack()
10971 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
10975 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
10995 "vmul.i32 q8, q8, d0[0]\n" in Pack()
10996 "vmul.i32 q9, q9, d0[0]\n" in Pack()
11004 : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d16", "d17", "d18", "d19", in Pack()
11041 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
11049 "vld1.32 {d0[1]}, [%[in]]!\n" in Pack()
11058 "vtrn.16 d0, d2\n" in Pack()
11060 "vtrn.8 d0, d1\n" in Pack()
11062 "vaddw.u8 q8, q8, d0\n" in Pack()
11069 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
11077 "vmov.i8 d0, #0\n" in Pack()
11084 "vld1.32 {d0[0]}, [%[in]]!\n" in Pack()
11092 "vld1.32 {d0[1]}, [%[in]]!\n" in Pack()
11099 "vtrn.16 d0, d2\n" in Pack()
11101 "vtrn.8 d0, d1\n" in Pack()
11103 "vaddw.u8 q8, q8, d0\n" in Pack()
11110 "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n" in Pack()
11114 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
11134 "vmul.i32 q8, q8, d0[0]\n" in Pack()
11135 "vmul.i32 q9, q9, d0[0]\n" in Pack()
11143 : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d16", "d17", "d18", "d19", in Pack()
11176 "vld1.32 {d0}, [%[in]], %[stride]\n" in Pack()
11185 "vtrn.8 d0, d1\n" in Pack()
11189 "vtrn.16 d0, d2\n" in Pack()
11193 "vtrn.32 d0, d4\n" in Pack()
11197 "vaddw.u8 q8, q8, d0\n" in Pack()
11205 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
11211 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
11233 "vmul.i32 q8, q8, d0[0]\n" in Pack()
11234 "vmul.i32 q9, q9, d0[0]\n" in Pack()
11242 : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d16", "d17", "d18", in Pack()
11279 "vld1.32 {d0}, [%[in]], %[stride]\n" in Pack()
11288 "vtrn.8 d0, d1\n" in Pack()
11292 "vtrn.16 d0, d2\n" in Pack()
11296 "vtrn.32 d0, d4\n" in Pack()
11300 "vaddw.u8 q8, q8, d0\n" in Pack()
11308 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
11316 "vmov.i8 d0, #0\n" in Pack()
11324 "vld1.32 {d0}, [%[in]], %[stride]\n" in Pack()
11326 "vtrn.8 d0, d1\n" in Pack()
11330 "vtrn.16 d0, d2\n" in Pack()
11334 "vtrn.32 d0, d4\n" in Pack()
11338 "vaddw.u8 q8, q8, d0\n" in Pack()
11346 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
11350 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
11372 "vmul.i32 q8, q8, d0[0]\n" in Pack()
11373 "vmul.i32 q9, q9, d0[0]\n" in Pack()
11381 : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d16", "d17", "d18", in Pack()
11418 "vld1.32 {d0}, [%[in]], %[stride]\n" in Pack()
11427 "vtrn.8 d0, d1\n" in Pack()
11431 "vtrn.16 d0, d2\n" in Pack()
11435 "vtrn.32 d0, d4\n" in Pack()
11439 "vaddw.u8 q8, q8, d0\n" in Pack()
11447 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
11455 "vmov.i8 d0, #0\n" in Pack()
11463 "vld1.32 {d0}, [%[in]], %[stride]\n" in Pack()
11466 "vtrn.8 d0, d1\n" in Pack()
11470 "vtrn.16 d0, d2\n" in Pack()
11474 "vtrn.32 d0, d4\n" in Pack()
11478 "vaddw.u8 q8, q8, d0\n" in Pack()
11486 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
11490 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
11512 "vmul.i32 q8, q8, d0[0]\n" in Pack()
11513 "vmul.i32 q9, q9, d0[0]\n" in Pack()
11521 : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d16", "d17", "d18", in Pack()
11558 "vld1.32 {d0}, [%[in]], %[stride]\n" in Pack()
11567 "vtrn.8 d0, d1\n" in Pack()
11571 "vtrn.16 d0, d2\n" in Pack()
11575 "vtrn.32 d0, d4\n" in Pack()
11579 "vaddw.u8 q8, q8, d0\n" in Pack()
11587 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
11595 "vmov.i8 d0, #0\n" in Pack()
11603 "vld1.32 {d0}, [%[in]], %[stride]\n" in Pack()
11607 "vtrn.8 d0, d1\n" in Pack()
11611 "vtrn.16 d0, d2\n" in Pack()
11615 "vtrn.32 d0, d4\n" in Pack()
11619 "vaddw.u8 q8, q8, d0\n" in Pack()
11627 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
11631 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
11653 "vmul.i32 q8, q8, d0[0]\n" in Pack()
11654 "vmul.i32 q9, q9, d0[0]\n" in Pack()
11662 : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d16", "d17", "d18", in Pack()
11699 "vld1.32 {d0}, [%[in]], %[stride]\n" in Pack()
11708 "vtrn.8 d0, d1\n" in Pack()
11712 "vtrn.16 d0, d2\n" in Pack()
11716 "vtrn.32 d0, d4\n" in Pack()
11720 "vaddw.u8 q8, q8, d0\n" in Pack()
11728 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
11736 "vmov.i8 d0, #0\n" in Pack()
11744 "vld1.32 {d0}, [%[in]], %[stride]\n" in Pack()
11749 "vtrn.8 d0, d1\n" in Pack()
11753 "vtrn.16 d0, d2\n" in Pack()
11757 "vtrn.32 d0, d4\n" in Pack()
11761 "vaddw.u8 q8, q8, d0\n" in Pack()
11769 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
11773 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
11795 "vmul.i32 q8, q8, d0[0]\n" in Pack()
11796 "vmul.i32 q9, q9, d0[0]\n" in Pack()
11804 : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d16", "d17", "d18", in Pack()
11841 "vld1.32 {d0}, [%[in]], %[stride]\n" in Pack()
11850 "vtrn.8 d0, d1\n" in Pack()
11854 "vtrn.16 d0, d2\n" in Pack()
11858 "vtrn.32 d0, d4\n" in Pack()
11862 "vaddw.u8 q8, q8, d0\n" in Pack()
11870 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
11878 "vmov.i8 d0, #0\n" in Pack()
11886 "vld1.32 {d0}, [%[in]], %[stride]\n" in Pack()
11892 "vtrn.8 d0, d1\n" in Pack()
11896 "vtrn.16 d0, d2\n" in Pack()
11900 "vtrn.32 d0, d4\n" in Pack()
11904 "vaddw.u8 q8, q8, d0\n" in Pack()
11912 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
11916 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
11938 "vmul.i32 q8, q8, d0[0]\n" in Pack()
11939 "vmul.i32 q9, q9, d0[0]\n" in Pack()
11947 : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d16", "d17", "d18", in Pack()
11984 "vld1.32 {d0}, [%[in]], %[stride]\n" in Pack()
11993 "vtrn.8 d0, d1\n" in Pack()
11997 "vtrn.16 d0, d2\n" in Pack()
12001 "vtrn.32 d0, d4\n" in Pack()
12005 "vaddw.u8 q8, q8, d0\n" in Pack()
12013 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
12021 "vmov.i8 d0, #0\n" in Pack()
12029 "vld1.32 {d0}, [%[in]], %[stride]\n" in Pack()
12036 "vtrn.8 d0, d1\n" in Pack()
12040 "vtrn.16 d0, d2\n" in Pack()
12044 "vtrn.32 d0, d4\n" in Pack()
12048 "vaddw.u8 q8, q8, d0\n" in Pack()
12056 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
12060 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
12082 "vmul.i32 q8, q8, d0[0]\n" in Pack()
12083 "vmul.i32 q9, q9, d0[0]\n" in Pack()
12091 : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d16", "d17", "d18", in Pack()
12128 "vld1.32 {d0}, [%[in]], %[stride]\n" in Pack()
12137 "vtrn.8 d0, d1\n" in Pack()
12141 "vtrn.16 d0, d2\n" in Pack()
12145 "vtrn.32 d0, d4\n" in Pack()
12149 "vaddw.u8 q8, q8, d0\n" in Pack()
12157 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
12165 "vmov.i8 d0, #0\n" in Pack()
12173 "vld1.32 {d0}, [%[in]], %[stride]\n" in Pack()
12181 "vtrn.8 d0, d1\n" in Pack()
12185 "vtrn.16 d0, d2\n" in Pack()
12189 "vtrn.32 d0, d4\n" in Pack()
12193 "vaddw.u8 q8, q8, d0\n" in Pack()
12201 "vst1.32 {d0, d1, d2, d3}, [%[out]:256]!\n" in Pack()
12205 "vmov.32 d0[0], %[multiplicative_sum_offset]\n" in Pack()
12227 "vmul.i32 q8, q8, d0[0]\n" in Pack()
12228 "vmul.i32 q9, q9, d0[0]\n" in Pack()
12236 : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d16", "d17", "d18", in Pack()