Lines Matching full:8

57 8:
58 st1 {v0.8b}, [x0], x1
59 st1 {v0.8b}, [x6], x1
61 st1 {v0.8b}, [x0], x1
62 st1 {v0.8b}, [x6], x1
63 b.gt 8b
136 ld1 {v0.8b}, [x2]
137 8:
138 st1 {v0.8b}, [x0], x1
139 st1 {v0.8b}, [x6], x1
141 st1 {v0.8b}, [x0], x1
142 st1 {v0.8b}, [x6], x1
143 b.gt 8b
206 ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7
216 8:
217 ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7
218 st1 {v3.8b}, [x0], x1
219 st1 {v2.8b}, [x6], x1
221 st1 {v1.8b}, [x0], x1
222 st1 {v0.8b}, [x6], x1
223 b.gt 8b
297 uaddlv h0, v0.8b
298 rshrn v0.8b, v0.8h, #3
299 dup v0.8b, v0.b[0]
310 ld1 {v0.8b}, [x2]
311 uaddlv h0, v0.8b
312 rshrn v0.8b, v0.8h, #3
313 dup v0.8b, v0.b[0]
314 8:
315 st1 {v0.8b}, [x0], x1
316 st1 {v0.8b}, [x6], x1
318 st1 {v0.8b}, [x0], x1
319 st1 {v0.8b}, [x6], x1
320 b.gt 8b
326 rshrn v0.8b, v0.8h, #4
342 rshrn v2.8b, v2.8h, #5
363 rshrn v4.8b, v4.8h, #6
408 uaddlv h0, v0.8b
409 rshrn v0.8b, v0.8h, #3
425 ld1 {v0.8b}, [x2]
426 uaddlv h0, v0.8b
427 rshrn v0.8b, v0.8h, #3
433 st1 {v0.8b}, [x0], x1
434 st1 {v0.8b}, [x6], x1
436 st1 {v0.8b}, [x0], x1
437 st1 {v0.8b}, [x6], x1
445 rshrn v0.8b, v0.8h, #4
465 rshrn v0.8b, v0.8h, #5
490 rshrn v0.8b, v0.8h, #6
530 dup v16.8h, w7 // width + height
541 ushr v16.8h, v16.8h, #1 // (width + height) >> 1
542 dup v17.8h, w7 // -ctz(width + height)
551 uaddlv h0, v0.8b
559 uaddlv h1, v1.8b
564 // h = 8/16
572 dup v0.8b, v0.b[0]
584 ld1 {v0.8b}, [x2], #8
585 uaddlv h0, v0.8b
590 ld1 {v1.8b}, [x2]
592 uaddlv h1, v1.8b
593 cmp w4, #8
605 dup v0.8b, v0.b[0]
607 st1 {v0.8b}, [x0], x1
608 st1 {v0.8b}, [x6], x1
610 st1 {v0.8b}, [x0], x1
611 st1 {v0.8b}, [x6], x1
630 // h = 4/8/32/64
631 tst w4, #(32+16+8) // 16 added to make a consecutive bitmask
667 // h = 8/16/64
668 cmp w4, #8
767 usubl v6.8h, v5.8b, v4.8b // top - topleft
769 ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7
772 uaddw v16.8h, v6.8h, v0.8b
773 uaddw v17.8h, v6.8h, v2.8b
774 sqxtun v16.8b, v16.8h // base
775 sqxtun2 v16.16b, v17.8h
795 usubl v6.8h, v5.8b, v4.8b // top - topleft
796 8:
797 ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7
798 uaddw v16.8h, v6.8h, v0.8b
799 uaddw v17.8h, v6.8h, v1.8b
800 uaddw v18.8h, v6.8h, v2.8b
801 uaddw v19.8h, v6.8h, v3.8b
802 sqxtun v16.8b, v16.8h // base
803 sqxtun2 v16.16b, v17.8h
804 sqxtun v18.8b, v18.8h
805 sqxtun2 v18.16b, v19.8h
829 b.gt 8b
845 usubl v6.8h, v5.8b, v4.8b // top - topleft
846 usubl2 v7.8h, v5.16b, v4.16b
847 uaddw v24.8h, v6.8h, v0.8b
848 uaddw v25.8h, v7.8h, v0.8b
849 uaddw v26.8h, v6.8h, v1.8b
850 uaddw v27.8h, v7.8h, v1.8b
851 uaddw v28.8h, v6.8h, v2.8b
852 uaddw v29.8h, v7.8h, v2.8b
853 uaddw v30.8h, v6.8h, v3.8b
854 uaddw v31.8h, v7.8h, v3.8b
855 sqxtun v17.8b, v26.8h // base
856 sqxtun2 v17.16b, v27.8h
857 sqxtun v16.8b, v24.8h
858 sqxtun2 v16.16b, v25.8h
859 sqxtun v19.8b, v30.8h
860 sqxtun2 v19.16b, v31.8h
861 sqxtun v18.8b, v28.8h
862 sqxtun2 v18.16b, v29.8h
900 b.le 8f
903 8:
954 usubl v6.8h, v6.8b, v4.8b // top-bottom
955 uxtl v7.8h, v7.8b // weights_hor
957 ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7 // left
958 ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x11], #4 // weights_ver
959 shll v20.8h, v5.8b, #8 // right*256
960 shll v21.8h, v5.8b, #8
965 shll v22.8h, v4.8b, #8 // bottom*256
966 shll v23.8h, v4.8b, #8
967 usubl v0.8h, v0.8b, v5.8b // left-right
968 usubl v1.8h, v1.8b, v5.8b
969 uxtl v16.8h, v16.8b // weights_ver
970 uxtl v18.8h, v18.8b
971 mla v20.8h, v0.8h, v7.8h // right*256 + (left-right)*weights_hor
972 mla v21.8h, v1.8h, v7.8h
973 mla v22.8h, v6.8h, v16.8h // bottom*256 + (top-bottom)*weights_ver
974 mla v23.8h, v6.8h, v18.8h
975 uhadd v20.8h, v20.8h, v22.8h
976 uhadd v21.8h, v21.8h, v23.8h
977 rshrn v20.8b, v20.8h, #8
978 rshrn v21.8b, v21.8h, #8
988 ld1 {v6.8b}, [x8] // top
989 ld1 {v7.8b}, [x10] // weights_hor
993 usubl v6.8h, v6.8b, v4.8b // top-bottom
994 uxtl v7.8h, v7.8b // weights_hor
995 8:
996 ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7 // left
997 ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x11], #4 // weights_ver
998 shll v20.8h, v5.8b, #8 // right*256
999 shll v21.8h, v5.8b, #8
1000 shll v22.8h, v5.8b, #8
1001 shll v23.8h, v5.8b, #8
1002 usubl v0.8h, v0.8b, v5.8b // left-right
1003 usubl v1.8h, v1.8b, v5.8b
1004 usubl v2.8h, v2.8b, v5.8b
1005 usubl v3.8h, v3.8b, v5.8b
1006 shll v24.8h, v4.8b, #8 // bottom*256
1007 shll v25.8h, v4.8b, #8
1008 shll v26.8h, v4.8b, #8
1009 shll v27.8h, v4.8b, #8
1010 uxtl v16.8h, v16.8b // weights_ver
1011 uxtl v17.8h, v17.8b
1012 uxtl v18.8h, v18.8b
1013 uxtl v19.8h, v19.8b
1014 mla v20.8h, v3.8h, v7.8h // right*256 + (left-right)*weights_hor
1015 mla v21.8h, v2.8h, v7.8h // (left flipped)
1016 mla v22.8h, v1.8h, v7.8h
1017 mla v23.8h, v0.8h, v7.8h
1018 mla v24.8h, v6.8h, v16.8h // bottom*256 + (top-bottom)*weights_ver
1019 mla v25.8h, v6.8h, v17.8h
1020 mla v26.8h, v6.8h, v18.8h
1021 mla v27.8h, v6.8h, v19.8h
1022 uhadd v20.8h, v20.8h, v24.8h
1023 uhadd v21.8h, v21.8h, v25.8h
1024 uhadd v22.8h, v22.8h, v26.8h
1025 uhadd v23.8h, v23.8h, v27.8h
1026 rshrn v20.8b, v20.8h, #8
1027 rshrn v21.8b, v21.8h, #8
1028 rshrn v22.8b, v22.8h, #8
1029 rshrn v23.8b, v23.8h, #8
1030 st1 {v20.8b}, [x0], x1
1031 st1 {v21.8b}, [x6], x1
1033 st1 {v22.8b}, [x0], x1
1034 st1 {v23.8b}, [x6], x1
1035 b.gt 8b
1049 ld2r {v0.8b, v1.8b}, [x2], x7 // left
1050 ld2r {v16.8b, v17.8b}, [x11], #2 // weights_ver
1051 usubl v0.8h, v0.8b, v5.8b // left-right
1052 usubl v1.8h, v1.8b, v5.8b
1053 uxtl v16.8h, v16.8b // weights_ver
1054 uxtl v17.8h, v17.8b
1058 shll v20.8h, v5.8b, #8 // right*256
1059 shll v21.8h, v5.8b, #8
1060 shll v22.8h, v5.8b, #8
1061 shll v23.8h, v5.8b, #8
1062 uxtl v6.8h, v7.8b // weights_hor
1063 uxtl2 v7.8h, v7.16b
1064 usubl v2.8h, v3.8b, v4.8b // top-bottom
1065 usubl2 v3.8h, v3.16b, v4.16b
1066 mla v20.8h, v1.8h, v6.8h // right*256 + (left-right)*weights_hor
1067 mla v21.8h, v1.8h, v7.8h // (left flipped)
1068 mla v22.8h, v0.8h, v6.8h
1069 mla v23.8h, v0.8h, v7.8h
1070 shll v24.8h, v4.8b, #8 // bottom*256
1071 shll v25.8h, v4.8b, #8
1072 shll v26.8h, v4.8b, #8
1073 shll v27.8h, v4.8b, #8
1074 mla v24.8h, v2.8h, v16.8h // bottom*256 + (top-bottom)*weights_ver
1075 mla v25.8h, v3.8h, v16.8h
1076 mla v26.8h, v2.8h, v17.8h
1077 mla v27.8h, v3.8h, v17.8h
1078 uhadd v20.8h, v20.8h, v24.8h
1079 uhadd v21.8h, v21.8h, v25.8h
1080 uhadd v22.8h, v22.8h, v26.8h
1081 uhadd v23.8h, v23.8h, v27.8h
1082 rshrn v20.8b, v20.8h, #8
1083 rshrn2 v20.16b, v21.8h, #8
1084 rshrn v22.8b, v22.8h, #8
1085 rshrn2 v22.16b, v23.8h, #8
1131 usubl v6.8h, v6.8b, v4.8b // top-bottom
1133 ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x7], #4 // weights_ver
1134 shll v22.8h, v4.8b, #8 // bottom*256
1135 shll v23.8h, v4.8b, #8
1138 uxtl v16.8h, v16.8b // weights_ver
1139 uxtl v18.8h, v18.8b
1140 mla v22.8h, v6.8h, v16.8h // bottom*256 + (top-bottom)*weights_ver
1141 mla v23.8h, v6.8h, v18.8h
1142 rshrn v22.8b, v22.8h, #8
1143 rshrn v23.8b, v23.8h, #8
1153 ld1 {v6.8b}, [x2] // top
1154 usubl v6.8h, v6.8b, v4.8b // top-bottom
1155 8:
1156 ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x7], #4 // weights_ver
1157 shll v24.8h, v4.8b, #8 // bottom*256
1158 shll v25.8h, v4.8b, #8
1159 shll v26.8h, v4.8b, #8
1160 shll v27.8h, v4.8b, #8
1161 uxtl v16.8h, v16.8b // weights_ver
1162 uxtl v17.8h, v17.8b
1163 uxtl v18.8h, v18.8b
1164 uxtl v19.8h, v19.8b
1165 mla v24.8h, v6.8h, v16.8h // bottom*256 + (top-bottom)*weights_ver
1166 mla v25.8h, v6.8h, v17.8h
1167 mla v26.8h, v6.8h, v18.8h
1168 mla v27.8h, v6.8h, v19.8h
1169 rshrn v24.8b, v24.8h, #8
1170 rshrn v25.8b, v25.8h, #8
1171 rshrn v26.8b, v26.8h, #8
1172 rshrn v27.8b, v27.8h, #8
1173 st1 {v24.8b}, [x0], x1
1174 st1 {v25.8b}, [x6], x1
1176 st1 {v26.8b}, [x0], x1
1177 st1 {v27.8b}, [x6], x1
1178 b.gt 8b
1192 ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x7], #4 // weights_ver
1193 uxtl v16.8h, v16.8b // weights_ver
1194 uxtl v17.8h, v17.8b
1195 uxtl v18.8h, v18.8b
1196 uxtl v19.8h, v19.8b
1199 shll v20.8h, v4.8b, #8 // bottom*256
1200 shll v21.8h, v4.8b, #8
1201 shll v22.8h, v4.8b, #8
1202 shll v23.8h, v4.8b, #8
1203 shll v24.8h, v4.8b, #8
1204 shll v25.8h, v4.8b, #8
1205 shll v26.8h, v4.8b, #8
1206 shll v27.8h, v4.8b, #8
1207 usubl v2.8h, v3.8b, v4.8b // top-bottom
1208 usubl2 v3.8h, v3.16b, v4.16b
1209 mla v20.8h, v2.8h, v16.8h // bottom*256 + (top-bottom)*weights_ver
1210 mla v21.8h, v3.8h, v16.8h
1211 mla v22.8h, v2.8h, v17.8h
1212 mla v23.8h, v3.8h, v17.8h
1213 mla v24.8h, v2.8h, v18.8h
1214 mla v25.8h, v3.8h, v18.8h
1215 mla v26.8h, v2.8h, v19.8h
1216 mla v27.8h, v3.8h, v19.8h
1217 rshrn v20.8b, v20.8h, #8
1218 rshrn2 v20.16b, v21.8h, #8
1219 rshrn v22.8b, v22.8h, #8
1220 rshrn2 v22.16b, v23.8h, #8
1221 rshrn v24.8b, v24.8h, #8
1222 rshrn2 v24.16b, v25.8h, #8
1223 rshrn v26.8b, v26.8h, #8
1224 rshrn2 v26.16b, v27.8h, #8
1274 uxtl v7.8h, v7.8b // weights_hor
1276 ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7 // left
1277 shll v20.8h, v5.8b, #8 // right*256
1278 shll v21.8h, v5.8b, #8
1281 usubl v0.8h, v0.8b, v5.8b // left-right
1282 usubl v1.8h, v1.8b, v5.8b
1283 mla v20.8h, v0.8h, v7.8h // right*256 + (left-right)*weights_hor
1284 mla v21.8h, v1.8h, v7.8h
1285 rshrn v20.8b, v20.8h, #8
1286 rshrn v21.8b, v21.8h, #8
1296 ld1 {v7.8b}, [x8] // weights_hor
1299 uxtl v7.8h, v7.8b // weights_hor
1300 8:
1301 ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7 // left
1302 shll v20.8h, v5.8b, #8 // right*256
1303 shll v21.8h, v5.8b, #8
1304 shll v22.8h, v5.8b, #8
1305 shll v23.8h, v5.8b, #8
1306 usubl v3.8h, v3.8b, v5.8b // left-right
1307 usubl v2.8h, v2.8b, v5.8b
1308 usubl v1.8h, v1.8b, v5.8b
1309 usubl v0.8h, v0.8b, v5.8b
1310 mla v20.8h, v3.8h, v7.8h // right*256 + (left-right)*weights_hor
1311 mla v21.8h, v2.8h, v7.8h // (left flipped)
1312 mla v22.8h, v1.8h, v7.8h
1313 mla v23.8h, v0.8h, v7.8h
1314 rshrn v20.8b, v20.8h, #8
1315 rshrn v21.8b, v21.8h, #8
1316 rshrn v22.8b, v22.8h, #8
1317 rshrn v23.8b, v23.8h, #8
1318 st1 {v20.8b}, [x0], x1
1319 st1 {v21.8b}, [x6], x1
1321 st1 {v22.8b}, [x0], x1
1322 st1 {v23.8b}, [x6], x1
1323 b.gt 8b
1339 ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7 // left
1340 usubl v0.8h, v0.8b, v5.8b // left-right
1341 usubl v1.8h, v1.8b, v5.8b
1342 usubl v2.8h, v2.8b, v5.8b
1343 usubl v3.8h, v3.8b, v5.8b
1346 shll v20.8h, v5.8b, #8 // right*256
1347 shll v21.8h, v5.8b, #8
1348 shll v22.8h, v5.8b, #8
1349 shll v23.8h, v5.8b, #8
1350 shll v24.8h, v5.8b, #8
1351 shll v25.8h, v5.8b, #8
1352 shll v26.8h, v5.8b, #8
1353 shll v27.8h, v5.8b, #8
1354 uxtl v6.8h, v7.8b // weights_hor
1355 uxtl2 v7.8h, v7.16b
1356 mla v20.8h, v3.8h, v6.8h // right*256 + (left-right)*weights_hor
1357 mla v21.8h, v3.8h, v7.8h // (left flipped)
1358 mla v22.8h, v2.8h, v6.8h
1359 mla v23.8h, v2.8h, v7.8h
1360 mla v24.8h, v1.8h, v6.8h
1361 mla v25.8h, v1.8h, v7.8h
1362 mla v26.8h, v0.8h, v6.8h
1363 mla v27.8h, v0.8h, v7.8h
1364 rshrn v20.8b, v20.8h, #8
1365 rshrn2 v20.16b, v21.8h, #8
1366 rshrn v22.8b, v22.8h, #8
1367 rshrn2 v22.16b, v23.8h, #8
1368 rshrn v24.8b, v24.8h, #8
1369 rshrn2 v24.16b, v25.8h, #8
1370 rshrn v26.8b, v26.8h, #8
1371 rshrn2 v26.16b, v27.8h, #8
1422 movi v31.8h, #9
1430 uaddl v16.8h, v4.8b, v5.8b // in[i+1] + in[i+2]
1431 uaddl2 v17.8h, v4.16b, v5.16b
1432 uaddl v18.8h, v0.8b, v6.8b // in[i+0] + in[i+3]
1433 uaddl2 v19.8h, v0.16b, v6.16b
1434 mul v16.8h, v16.8h, v31.8h // 9*(in[i+1] + in[i+2])
1435 mul v17.8h, v17.8h, v31.8h
1436 sub v16.8h, v16.8h, v18.8h
1437 sub v17.8h, v17.8h, v19.8h
1439 sqrshrun v16.8b, v16.8h, #4
1440 sqrshrun2 v16.16b, v17.8h, #4
1453 // Here, sz is 4 or 8, and we produce 2*sz+1 output elements.
1463 movi v31.8h, #9
1471 uaddl v16.8h, v0.8b, v5.8b // in[i+0] + in[i+1]
1472 uaddl v18.8h, v4.8b, v6.8b // in[i-1] + in[i+2]
1473 mul v16.8h, v16.8h, v31.8h // 9*(in[i+1] + in[i+2])
1474 sub v16.8h, v16.8h, v18.8h
1476 sqrshrun v16.8b, v16.8h, #4
1483 // In case sz=8, output one single pixel in out[16].
1490 .byte 0, 4, 8, 0
1521 umull v4.8h, v0.8b, v30.8b
1522 umlal v4.8h, v2.8b, v31.8b
1523 umlal v4.8h, v3.8b, v30.8b
1524 umull2 v5.8h, v0.16b, v30.16b
1525 umlal2 v5.8h, v2.16b, v31.16b
1526 umlal2 v5.8h, v3.16b, v30.16b
1529 rshrn v4.8b, v4.8h, #4
1530 rshrn2 v4.16b, v5.8h, #4
1552 umull v4.8h, v0.8b, v30.8b
1553 umlal v4.8h, v2.8b, v31.8b
1554 umlal v4.8h, v3.8b, v30.8b
1555 umull2 v5.8h, v0.16b, v30.16b
1556 umlal2 v5.8h, v2.16b, v31.16b
1557 umlal2 v5.8h, v3.16b, v30.16b
1559 rshrn v4.8b, v4.8h, #4
1560 rshrn2 v4.16b, v5.8h, #4
1590 umull v6.8h, v0.8b, v29.8b
1591 umlal v6.8h, v2.8b, v30.8b
1592 umlal v6.8h, v3.8b, v31.8b
1593 umlal v6.8h, v4.8b, v30.8b
1594 umlal v6.8h, v5.8b, v29.8b
1595 umull2 v7.8h, v0.16b, v29.16b
1596 umlal2 v7.8h, v2.16b, v30.16b
1597 umlal2 v7.8h, v3.16b, v31.16b
1598 umlal2 v7.8h, v4.16b, v30.16b
1599 umlal2 v7.8h, v5.16b, v29.16b
1602 rshrn v6.8b, v6.8h, #4
1603 rshrn2 v6.16b, v7.8h, #4
1628 umull v6.8h, v0.8b, v29.8b
1629 umlal v6.8h, v2.8b, v30.8b
1630 umlal v6.8h, v3.8b, v31.8b
1631 umlal v6.8h, v4.8b, v30.8b
1632 umlal v6.8h, v5.8b, v29.8b
1633 umull2 v7.8h, v0.16b, v29.16b
1634 umlal2 v7.8h, v2.16b, v30.16b
1635 umlal2 v7.8h, v3.16b, v31.16b
1636 umlal2 v7.8h, v4.16b, v30.16b
1637 umlal2 v7.8h, v5.16b, v29.16b
1641 rshrn v6.8b, v6.8h, #4
1642 rshrn2 v6.16b, v7.8h, #4
1700 ext v1.8b, v0.8b, v0.8b, #1 // top[base+1]
1701 ext v3.8b, v2.8b, v2.8b, #1
1702 usubl v6.8h, v1.8b, v0.8b // top[base+1]-top[base]
1703 usubl v7.8h, v3.8b, v2.8b
1704 ushll v16.8h, v0.8b, #6 // top[base]*64
1705 ushll v17.8h, v2.8b, #6
1708 rshrn v16.8b, v16.8h, #6
1709 rshrn v17.8b, v17.8h, #6
1726 8:
1736 dup v4.8b, w9 // frac
1737 dup v5.8b, w11
1740 dup v6.8b, w9 // 64 - frac
1741 dup v7.8b, w11
1744 umull v16.8h, v0.8b, v6.8b // top[base]*(64-frac)
1745 umlal v16.8h, v1.8b, v4.8b // + top[base+1]*frac
1746 umull v17.8h, v2.8b, v7.8b
1747 umlal v17.8h, v3.8b, v5.8b
1748 rshrn v16.8b, v16.8h, #6
1749 rshrn v17.8b, v17.8h, #6
1750 st1 {v16.8b}, [x0], x1
1753 st1 {v17.8b}, [x0], x1
1754 b.gt 8b
1758 st1 {v31.8b}, [x0], x1
1760 st1 {v31.8b}, [x0], x1
1797 umull v18.8h, v0.8b, v6.8b // top[base]*(64-frac)
1798 umlal v18.8h, v16.8b, v4.8b // + top[base+1]*frac
1799 umull2 v19.8h, v0.16b, v6.16b
1800 umlal2 v19.8h, v16.16b, v4.16b
1801 umull v20.8h, v2.8b, v7.8b
1802 umlal v20.8h, v17.8b, v5.8b
1803 umull2 v21.8h, v2.16b, v7.16b
1804 umlal2 v21.8h, v17.16b, v5.16b
1805 rshrn v16.8b, v18.8h, #6
1806 rshrn2 v16.16b, v19.8h, #6
1807 rshrn v17.8b, v20.8h, #6
1808 rshrn2 v17.16b, v21.8h, #6
1850 cmp w3, #8
1855 b.eq 8f
1869 uzp2 v1.8b, v0.8b, v0.8b // top[base+1]
1870 uzp1 v0.8b, v0.8b, v0.8b // top[base]
1871 uzp2 v3.8b, v2.8b, v2.8b
1872 uzp1 v2.8b, v2.8b, v2.8b
1873 usubl v6.8h, v1.8b, v0.8b // top[base+1]-top[base]
1874 usubl v7.8h, v3.8b, v2.8b
1875 ushll v16.8h, v0.8b, #6 // top[base]*64
1876 ushll v17.8h, v2.8b, #6
1879 rshrn v16.8b, v16.8h, #6
1880 rshrn v17.8b, v17.8h, #6
1895 8: // w == 8
1905 dup v4.8b, w9 // frac
1906 dup v5.8b, w11
1909 dup v6.8b, w9 // 64 - frac
1910 dup v7.8b, w11
1915 umull v16.8h, v1.8b, v4.8b // top[base+1]*frac
1916 umlal v16.8h, v0.8b, v6.8b // + top[base]*(64-frac)
1917 umull v17.8h, v3.8b, v5.8b
1918 umlal v17.8h, v2.8b, v7.8b
1919 rshrn v16.8b, v16.8h, #6
1920 rshrn v17.8b, v17.8h, #6
1921 st1 {v16.8b}, [x0], x1
1924 st1 {v17.8b}, [x0], x1
1925 b.gt 8b
1929 st1 {v31.8b}, [x0], x1
1931 st1 {v31.8b}, [x0], x1
1940 add x3, x0, #8
1955 .short 8, 9, 10, 11, 12, 13, 14, 15
1973 ld1 {v31.8h}, [x11] // increments
1981 movi v17.8b, #1
1987 xtn v31.8b, v31.8h // {0,1,2,3}
1995 xtn v27.8b, v30.8h // (uint8_t)ypos
1996 shrn v29.8b, v30.8h, #6 // ypos >> 6
1997 and v27.8b, v27.8b, v25.8b // frac_y
1999 add v29.8b, v29.8b, v17.8b // base_y = (ypos >> 6) + 1
2001 add v30.8b, v29.8b, v17.8b // base_y + 1
2002 add v28.8b, v29.8b, v19.8b // base_y + 2
2004 tbl v16.8b, {v0.16b}, v29.8b // left[base_y]
2008 sub v28.8b, v26.8b, v27.8b // 64 - frac_y
2015 movi v29.8b, #2
2034 tbl v17.8b, {v0.16b}, v30.8b // left[base_y+1], left[base_y+2]
2036 shrn v20.8b, v6.8h, #6 // first base_x for each row
2037 xtn v6.8b, v6.8h // (uint8_t)xpos
2039 ext v3.8b, v2.8b, v2.8b, #1 // top[base_x+1]
2040 ext v5.8b, v4.8b, v4.8b, #1
2042 and v6.8b, v6.8b, v25.8b // frac_x
2049 sub v7.8b, v26.8b, v6.8b // 64 - frac_x
2051 add v20.8b, v20.8b, v31.8b // actual base_x
2053 umull v16.8h, v16.8b, v28.8b // left[base_y]*(64-frac_y)
2054 umlal v16.8h, v17.8b, v27.8b // + left[base_y+1]*frac_y
2056 umull v22.8h, v2.8b, v7.8b // top[base_x]-*(64-frac_x)
2057 umlal v22.8h, v3.8b, v6.8b // + top[base_x+1]*frac_x
2059 cmge v20.8b, v20.8b, #0
2061 rshrn v16.8b, v16.8h, #6
2062 rshrn v22.8b, v22.8h, #6
2064 bit v16.8b, v22.8b, v20.8b
2072 ext v16.8b, v17.8b, v17.8b, #4
2073 add v30.8b, v30.8b, v29.8b // base_y += 2
2077 tbl v17.8b, {v0.16b, v1.16b}, v30.8b // left[base_y+1], left[base_y+2]
2081 umull v18.8h, v16.8b, v28.8b // left[base_y]*(64-frac_t)
2082 umlal v18.8h, v17.8b, v27.8b // + left[base_y+1]*frac_y
2083 rshrn v18.8b, v18.8h, #6
2090 ext v16.8b, v17.8b, v17.8b, #4
2091 add v30.8b, v30.8b, v29.8b // base_y += 2
2100 dup v30.8h, w7 // -dy
2101 movi v17.8b, #1
2103 mul v16.8h, v31.8h, v30.8h // {0,1,2,3,4,5,6,7}* -dy
2105 add v30.8h, v16.8h, v30.8h // -= dy
2107 xtn v31.8b, v31.8h // {0,1,2,3,4,5,6,7}
2109 // Worst case height for w=8 is 32, but we need at least h+1 elements
2115 xtn v27.8b, v30.8h // (uint8_t)ypos
2116 shrn v29.8b, v30.8h, #6 // ypos >> 6
2117 and v27.8b, v27.8b, v25.8b // frac_y
2119 add v29.8b, v29.8b, v17.8b // base_y = (ypos >> 6) + 1
2123 tbl v18.8b, {v0.16b}, v29.8b // left[base_y]
2125 add v30.8b, v29.8b, v19.8b // base_y + 2
2126 add v29.8b, v29.8b, v17.8b // base_y + 1
2128 sub v28.8b, v26.8b, v27.8b // 64 - frac_y
2132 movi v24.8b, #2 // 2
2133 8:
2135 dup v16.8h, w8 // xpos
2137 cmp w9, #-8 // base_x <= -8
2141 dup v17.8h, w8 // xpos
2149 tbl v19.8b, {v0.16b, v1.16b}, v29.8b // left[base_y+1]
2151 shrn v21.8b, v16.8h, #6 // first base_x
2152 shrn2 v21.16b, v17.8h, #6
2153 xtn v16.8b, v16.8h // (uint8_t)xpos
2154 xtn2 v16.16b, v17.8h
2156 tbl v20.8b, {v0.16b, v1.16b}, v30.8b // left[base_y+2]
2170 umull v6.8h, v18.8b, v28.8b // left[base_y]*(64-frac_y)
2171 umlal v6.8h, v19.8b, v27.8b // + left[base_y+1]*frac_y
2172 umull v17.8h, v19.8b, v28.8b
2173 umlal v17.8h, v20.8b, v27.8b
2175 umull v22.8h, v4.8b, v7.8b // top[base_x]-*(64-frac_x)
2176 umlal v22.8h, v5.8b, v16.8b // + top[base_x+1]*frac_x
2177 umull2 v23.8h, v4.16b, v7.16b
2178 umlal2 v23.8h, v5.16b, v16.16b
2182 rshrn v6.8b, v6.8h, #6
2183 rshrn2 v6.16b, v17.8h, #6
2184 rshrn v22.8b, v22.8h, #6
2185 rshrn2 v22.16b, v23.8h, #6
2195 mov v18.8b, v20.8b
2196 add v29.8b, v29.8b, v24.8b // base_y += 2
2197 add v30.8b, v30.8b, v24.8b // base_y += 2
2198 b 8b
2201 tbl v19.8b, {v0.16b, v1.16b, v2.16b}, v29.8b // left[base_y+1]
2202 tbl v20.8b, {v0.16b, v1.16b, v2.16b}, v30.8b // left[base_y+2]
2204 umull v6.8h, v18.8b, v28.8b // left[base_y]*(64-frac_y)
2205 umlal v6.8h, v19.8b, v27.8b // + left[base_y+1]*frac_y
2206 umull v17.8h, v19.8b, v28.8b
2207 umlal v17.8h, v20.8b, v27.8b
2209 rshrn v6.8b, v6.8h, #6
2210 rshrn2 v6.16b, v17.8h, #6
2217 mov v18.8b, v20.8b
2218 add v29.8b, v29.8b, v24.8b // base_y += 2
2219 add v30.8b, v30.8b, v24.8b // base_y += 2
2235 dup v18.8h, w7 // -dy
2239 ld1 {v14.8h}, [x11] // {8,9,10,11,12,13,14,15}
2241 mul v16.8h, v31.8h, v18.8h // {0,1,2,3,4,5,6,7}* -dy
2242 mul v19.8h, v14.8h, v18.8h // {8,9,10,11,12,13,14,15}* -dy
2244 add v16.8h, v16.8h, v18.8h // -= dy
2245 add v18.8h, v19.8h, v18.8h
2247 xtn v31.8b, v31.8h // {0,1,2,3,4,5,6,7}
2248 xtn2 v31.16b, v14.8h // {8,9,10,11,12,13,14,15}
2257 xtn v27.8b, v16.8h // (uint8_t)ypos
2258 xtn2 v27.16b, v18.8h
2259 shrn v29.8b, v16.8h, #6 // ypos >> 6
2260 shrn2 v29.16b, v18.8h, #6
2276 dup v16.8h, w8 // xpos
2282 dup v17.8h, w8 // xpos
2295 shrn v21.8b, v16.8h, #6 // first base_x
2296 shrn v22.8b, v17.8h, #6
2297 xtn v16.8b, v16.8h // (uint8_t)xpos
2298 xtn v17.8b, v17.8h
2313 umull v10.8h, v18.8b, v28.8b // left[base_y]*(64-frac_y)
2314 umlal v10.8h, v19.8b, v27.8b // + left[base_y+1]*frac_y
2319 umull2 v11.8h, v18.16b, v28.16b
2320 umlal2 v11.8h, v19.16b, v27.16b
2325 umull v12.8h, v19.8b, v28.8b
2326 umlal v12.8h, v20.8b, v27.8b
2327 umull2 v13.8h, v19.16b, v28.16b
2328 umlal2 v13.8h, v20.16b, v27.16b
2330 rshrn v10.8b, v10.8h, #6
2331 rshrn2 v10.16b, v11.8h, #6
2332 rshrn v11.8b, v12.8h, #6
2333 rshrn2 v11.16b, v13.8h, #6
2335 umull v12.8h, v4.8b, v8.8b // top[base_x]-*(64-frac_x)
2336 umlal v12.8h, v5.8b, v16.8b // + top[base_x+1]*frac_x
2337 umull2 v13.8h, v4.16b, v8.16b
2338 umlal2 v13.8h, v5.16b, v16.16b
2339 umull v14.8h, v6.8b, v9.8b
2340 umlal v14.8h, v7.8b, v17.8b
2341 umull2 v18.8h, v6.16b, v9.16b
2342 umlal2 v18.8h, v7.16b, v17.16b
2347 rshrn v12.8b, v12.8h, #6
2348 rshrn2 v12.16b, v13.8h, #6
2349 rshrn v13.8b, v14.8h, #6
2350 rshrn2 v13.16b, v18.8h, #6
2372 umull v4.8h, v18.8b, v28.8b // left[base_y]*(64-frac_y)
2373 umlal v4.8h, v19.8b, v27.8b // + left[base_y+1]*frac_y
2374 umull2 v5.8h, v18.16b, v28.16b
2375 umlal2 v5.8h, v19.16b, v27.16b
2376 umull v6.8h, v19.8b, v28.8b
2377 umlal v6.8h, v20.8b, v27.8b
2378 umull2 v7.8h, v19.16b, v28.16b
2379 umlal2 v7.8h, v20.16b, v27.16b
2381 rshrn v4.8b, v4.8h, #6
2382 rshrn2 v4.16b, v5.8h, #6
2383 rshrn v5.8b, v6.8h, #6
2384 rshrn2 v5.16b, v7.8h, #6
2414 dup v25.8h, w7 // -dy
2417 ld1 {v14.8h}, [x11] // {8,9,10,11,12,13,14,15}
2423 movi v11.8h, #8
2424 mul v26.8h, v31.8h, v25.8h // {0,1,2,3,4,5,6,7}* -dy
2425 add v26.8h, v26.8h, v25.8h // -= dy
2426 mul v25.8h, v25.8h, v11.8h // -8*dy
2428 xtn v31.8b, v31.8h // {0,1,2,3,4,5,6,7}
2429 xtn2 v31.16b, v14.8h // {8,9,10,11,12,13,14,15}
2442 dup v16.8h, w8 // xpos
2448 dup v17.8h, w8 // xpos
2454 sqshrn v21.8b, v16.8h, #6 // first base_x
2455 sqshrn v22.8b, v17.8h, #6
2456 xtn v16.8b, v16.8h // (uint8_t)xpos
2457 xtn v17.8b, v17.8h
2480 add v13.8h, v23.8h, v25.8h // ypos -= 8*dy
2487 xtn v27.8b, v23.8h // (uint8_t)ypos
2488 xtn2 v27.16b, v13.8h
2489 shrn v29.8b, v23.8h, #6 // ypos >> 6
2490 shrn2 v29.16b, v13.8h, #6
2498 add v23.8h, v13.8h, v25.8h // ypos -= 8*dy
2516 umull v10.8h, v18.8b, v28.8b // left[base_y]*(64-frac_y)
2517 umlal v10.8h, v19.8b, v27.8b // + left[base_y+1]*frac_y
2518 umull2 v11.8h, v18.16b, v28.16b
2519 umlal2 v11.8h, v19.16b, v27.16b
2520 umull v12.8h, v19.8b, v28.8b
2521 umlal v12.8h, v20.8b, v27.8b
2522 umull2 v13.8h, v19.16b, v28.16b
2523 umlal2 v13.8h, v20.16b, v27.16b
2528 rshrn v10.8b, v10.8h, #6
2529 rshrn2 v10.16b, v11.8h, #6
2530 rshrn v11.8b, v12.8h, #6
2531 rshrn2 v11.16b, v13.8h, #6
2533 umull v12.8h, v4.8b, v8.8b // top[base_x]-*(64-frac_x)
2534 umlal v12.8h, v18.8b, v16.8b // + top[base_x+1]*frac_x
2535 umull2 v13.8h, v4.16b, v8.16b
2536 umlal2 v13.8h, v18.16b, v16.16b
2537 umull v14.8h, v6.8b, v9.8b
2538 umlal v14.8h, v19.8b, v17.8b
2539 umull2 v20.8h, v6.16b, v9.16b
2540 umlal2 v20.8h, v19.16b, v17.16b
2545 rshrn v12.8b, v12.8h, #6
2546 rshrn2 v12.16b, v13.8h, #6
2547 rshrn v13.8b, v14.8h, #6
2548 rshrn2 v13.16b, v20.8h, #6
2568 movi v10.8h, #128
2572 add v26.8h, v26.8h, v10.8h // ypos += 2*(1<<6)
2582 umull v12.8h, v4.8b, v8.8b // top[base_x]-*(64-frac_x)
2583 umlal v12.8h, v18.8b, v16.8b // + top[base_x+1]*frac_x
2584 umull2 v13.8h, v4.16b, v8.16b
2585 umlal2 v13.8h, v18.16b, v16.16b
2586 umull v14.8h, v6.8b, v9.8b
2587 umlal v14.8h, v19.8b, v17.8b
2588 umull2 v20.8h, v6.16b, v9.16b
2589 umlal2 v20.8h, v19.16b, v17.16b
2591 rshrn v12.8b, v12.8h, #6
2592 rshrn2 v12.16b, v13.8h, #6
2593 rshrn v13.8b, v14.8h, #6
2594 rshrn2 v13.16b, v20.8h, #6
2609 add v13.8h, v23.8h, v25.8h // ypos -= 8*dy
2613 xtn v27.8b, v23.8h // (uint8_t)ypos
2614 xtn2 v27.16b, v13.8h
2615 shrn v29.8b, v23.8h, #6 // ypos >> 6
2616 shrn2 v29.16b, v13.8h, #6
2620 add v23.8h, v13.8h, v25.8h // ypos -= 8*dy
2635 umull v10.8h, v18.8b, v28.8b // left[base_y]*(64-frac_y)
2636 umlal v10.8h, v19.8b, v27.8b // + left[base_y+1]*frac_y
2637 umull2 v11.8h, v18.16b, v28.16b
2638 umlal2 v11.8h, v19.16b, v27.16b
2639 umull v12.8h, v19.8b, v28.8b
2640 umlal v12.8h, v20.8b, v27.8b
2641 umull2 v13.8h, v19.16b, v28.16b
2642 umlal2 v13.8h, v20.16b, v27.16b
2644 rshrn v10.8b, v10.8h, #6
2645 rshrn2 v10.16b, v11.8h, #6
2646 rshrn v11.8b, v12.8h, #6
2647 rshrn2 v11.16b, v13.8h, #6
2686 cmp w4, #8
2691 ld1 {v31.8h}, [x11] // increments
2697 movi v17.8b, #1
2703 xtn v31.8b, v31.8h // {0,1,2,3}
2705 // For upsample_top, w <= 8 and h <= 8; we may need up to h+1 elements
2712 xtn v27.8b, v30.8h // (uint8_t)ypos
2713 shrn v29.8b, v30.8h, #6 // ypos >> 6
2714 and v27.8b, v27.8b, v25.8b // frac_y
2716 add v29.8b, v29.8b, v17.8b // base_y = (ypos >> 6) + 1
2718 add v30.8b, v29.8b, v17.8b // base_y + 1
2719 add v28.8b, v29.8b, v19.8b // base_y + 2
2721 tbl v16.8b, {v0.16b}, v29.8b // left[base_y]
2725 sub v28.8b, v26.8b, v27.8b // 64 - frac_y
2732 movi v29.8b, #2
2733 add v31.8b, v31.8b, v31.8b // {0,2,4,6,0,2,4,6}
2738 cmp w9, #-8 // base_x <= -8
2749 tbl v17.8b, {v0.16b}, v30.8b // left[base_y+1], left[base_y+2]
2751 shrn v20.8b, v6.8h, #6 // first base_x for each row
2752 xtn v6.8b, v6.8h // (uint8_t)xpos
2754 uzp2 v3.8b, v2.8b, v4.8b // top[base_x+1]
2755 uzp1 v2.8b, v2.8b, v4.8b // top[base_x]
2757 and v6.8b, v6.8b, v25.8b // frac_x
2761 sub v7.8b, v26.8b, v6.8b // 64 - frac_x
2763 add v20.8b, v20.8b, v31.8b // actual base_x
2765 umull v16.8h, v16.8b, v28.8b // left[base_y]*(64-frac_y)
2766 umlal v16.8h, v17.8b, v27.8b // + left[base_y+1]*frac_y
2768 umull v22.8h, v2.8b, v7.8b // top[base_x]-*(64-frac_x)
2769 umlal v22.8h, v3.8b, v6.8b // + top[base_x+1]*frac_x
2771 cmge v20.8b, v20.8b, #0
2773 rshrn v16.8b, v16.8h, #6
2774 rshrn v22.8b, v22.8h, #6
2776 bit v16.8b, v22.8b, v20.8b
2784 ext v16.8b, v17.8b, v17.8b, #4
2785 add v30.8b, v30.8b, v29.8b // base_y += 2
2789 tbl v17.8b, {v0.16b}, v30.8b // left[base_y+1], left[base_y+2]
2793 umull v18.8h, v16.8b, v28.8b // left[base_y]*(64-frac_t)
2794 umlal v18.8h, v17.8b, v27.8b // + left[base_y+1]*frac_y
2795 rshrn v18.8b, v18.8h, #6
2802 ext v16.8b, v17.8b, v17.8b, #4
2803 add v30.8b, v30.8b, v29.8b // base_y += 2
2810 dup v30.8h, w7 // -dy
2811 movi v17.8b, #1
2813 mul v16.8h, v31.8h, v30.8h // {0,1,2,3,4,5,6,7}* -dy
2815 add v30.8h, v16.8h, v30.8h // -= dy
2817 xtn v31.8b, v31.8h // {0,1,2,3,4,5,6,7}
2819 // For upsample_top, w <= 8 and h <= 8; we may need up to h+1 elements
2826 xtn v27.8b, v30.8h // (uint8_t)ypos
2827 shrn v29.8b, v30.8h, #6 // ypos >> 6
2828 and v27.8b, v27.8b, v25.8b // frac_y
2830 add v29.8b, v29.8b, v17.8b // base_y = (ypos >> 6) + 1
2832 tbl v18.8b, {v0.16b}, v29.8b // left[base_y]
2834 add v30.8b, v29.8b, v19.8b // base_y + 2
2835 add v29.8b, v29.8b, v17.8b // base_y + 1
2837 sub v28.8b, v26.8b, v27.8b // 64 - frac_y
2841 movi v24.8b, #2 // 2
2842 add v31.16b, v31.16b, v31.16b // {0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14}
2843 8:
2845 dup v16.8h, w8 // xpos
2851 dup v17.8h, w8 // xpos
2856 tbl v19.8b, {v0.16b}, v29.8b // left[base_y+1]
2858 shrn v21.8b, v16.8h, #6 // first base_x
2859 shrn2 v21.16b, v17.8h, #6
2860 xtn v16.8b, v16.8h // (uint8_t)xpos
2861 xtn2 v16.16b, v17.8h
2863 tbl v20.8b, {v0.16b}, v30.8b // left[base_y+2]
2874 umull v6.8h, v18.8b, v28.8b // left[base_y]*(64-frac_y)
2875 umlal v6.8h, v19.8b, v27.8b // + left[base_y+1]*frac_y
2876 umull v17.8h, v19.8b, v28.8b
2877 umlal v17.8h, v20.8b, v27.8b
2879 umull v22.8h, v4.8b, v7.8b // top[base_x]-*(64-frac_x)
2880 umlal v22.8h, v5.8b, v16.8b // + top[base_x+1]*frac_x
2881 umull2 v23.8h, v4.16b, v7.16b
2882 umlal2 v23.8h, v5.16b, v16.16b
2886 rshrn v6.8b, v6.8h, #6
2887 rshrn2 v6.16b, v17.8h, #6
2888 rshrn v22.8b, v22.8h, #6
2889 rshrn2 v22.16b, v23.8h, #6
2899 mov v18.8b, v20.8b
2900 add v29.8b, v29.8b, v24.8b // base_y += 2
2901 add v30.8b, v30.8b, v24.8b // base_y += 2
2902 b 8b
2905 tbl v19.8b, {v0.16b}, v29.8b // left[base_y+1]
2906 tbl v20.8b, {v0.16b}, v30.8b // left[base_y+2]
2908 umull v6.8h, v18.8b, v28.8b // left[base_y]*(64-frac_y)
2909 umlal v6.8h, v19.8b, v27.8b // + left[base_y+1]*frac_y
2910 umull v17.8h, v19.8b, v28.8b
2911 umlal v17.8h, v20.8b, v27.8b
2913 rshrn v6.8b, v6.8h, #6
2914 rshrn2 v6.16b, v17.8h, #6
2921 mov v18.8b, v20.8b
2922 add v29.8b, v29.8b, v24.8b // base_y += 2
2923 add v30.8b, v30.8b, v24.8b // base_y += 2
2931 cmp w4, #8
2936 ld1 {v31.8h}, [x11] // increments
2942 movi v17.8b, #1
2948 xtn v31.8b, v31.8h // {0,1,2,3}
2950 // For upsample_left, w <= 8 and h <= 8; we may need up to 2*h+1 elements.
2956 xtn v27.8b, v30.8h // (uint8_t)ypos
2957 shrn v29.8b, v30.8h, #6 // ypos >> 6
2958 and v27.8b, v27.8b, v25.8b // frac_y
2960 add v29.8b, v29.8b, v19.8b // base_y = (ypos >> 6) + 2
2962 add v30.8b, v29.8b, v17.8b // base_y + 1
2963 add v28.8b, v29.8b, v19.8b // base_y + 2
2967 add v24.8b, v30.8b, v19.8b // base_y + 3
2972 sub v28.8b, v26.8b, v27.8b // 64 - frac_y
2977 movi v24.8b, #4
2993 tbl v16.8b, {v0.16b, v1.16b}, v29.8b // left[base_y+0], left[base_y+2]
2994 tbl v17.8b, {v0.16b, v1.16b}, v30.8b // left[base_y+1], left[base_y+3]
2996 shrn v20.8b, v6.8h, #6 // first base_x for each row
2997 xtn v6.8b, v6.8h // (uint8_t)xpos
2999 ext v3.8b, v2.8b, v2.8b, #1 // top[base_x+1]
3000 ext v5.8b, v4.8b, v4.8b, #1
3002 and v6.8b, v6.8b, v25.8b // frac_x
3007 sub v7.8b, v26.8b, v6.8b // 64 - frac_x
3009 add v20.8b, v20.8b, v31.8b // actual base_x
3011 umull v16.8h, v16.8b, v28.8b // left[base_y]*(64-frac_y)
3012 umlal v16.8h, v17.8b, v27.8b // + left[base_y+1]*frac_y
3014 umull v22.8h, v2.8b, v7.8b // top[base_x]-*(64-frac_x)
3015 umlal v22.8h, v3.8b, v6.8b // + top[base_x+1]*frac_x
3017 cmge v20.8b, v20.8b, #0
3019 rshrn v16.8b, v16.8h, #6
3020 rshrn v22.8b, v22.8h, #6
3022 bit v16.8b, v22.8b, v20.8b
3030 add v29.8b, v29.8b, v24.8b // base_y += 4
3031 add v30.8b, v30.8b, v24.8b // base_y += 4
3035 tbl v16.8b, {v0.16b, v1.16b}, v29.8b // left[base_y+0], left[base_y+2]
3036 tbl v17.8b, {v0.16b, v1.16b}, v30.8b // left[base_y+1], left[base_y+3]
3038 umull v18.8h, v16.8b, v28.8b // left[base_y]*(64-frac_t)
3039 umlal v18.8h, v17.8b, v27.8b // + left[base_y+1]*frac_y
3040 rshrn v18.8b, v18.8h, #6
3047 add v29.8b, v29.8b, v24.8b // base_y += 4
3048 add v30.8b, v30.8b, v24.8b // base_y += 4
3055 dup v30.8h, w7 // -dy
3056 movi v17.8b, #1
3058 mul v16.8h, v31.8h, v30.8h // {0,1,2,3,4,5,6,7}* -dy
3060 add v30.8h, v16.8h, v30.8h // -= dy
3062 xtn v31.8b, v31.8h // {0,1,2,3,4,5,6,7}
3064 // For upsample_left, w <= 8 and h <= 8; we may need up to 2*h+1 elements.
3070 xtn v27.8b, v30.8h // (uint8_t)ypos
3071 shrn v29.8b, v30.8h, #6 // ypos >> 6
3072 and v27.8b, v27.8b, v25.8b // frac_y
3074 add v29.8b, v29.8b, v19.8b // base_y = (ypos >> 6) + 2
3076 add v28.8b, v29.8b, v17.8b // base_y + 1
3077 add v30.8b, v29.8b, v19.8b // base_y + 2
3080 add v24.8b, v28.8b, v19.8b // base_y + 3
3085 sub v28.8b, v26.8b, v27.8b // 64 - frac_y
3091 8:
3093 dup v16.8h, w8 // xpos
3095 cmp w9, #-8 // base_x <= -8
3099 dup v17.8h, w8 // xpos
3107 shrn v21.8b, v16.8h, #6 // first base_x
3108 shrn2 v21.16b, v17.8h, #6
3109 xtn v16.8b, v16.8h // (uint8_t)xpos
3110 xtn2 v16.16b, v17.8h
3124 umull v6.8h, v18.8b, v28.8b // left[base_y]*(64-frac_y)
3125 umlal v6.8h, v19.8b, v27.8b // + left[base_y+1]*frac_y
3126 umull2 v17.8h, v18.16b, v28.16b
3127 umlal2 v17.8h, v19.16b, v27.16b
3129 umull v22.8h, v4.8b, v7.8b // top[base_x]-*(64-frac_x)
3130 umlal v22.8h, v5.8b, v16.8b // + top[base_x+1]*frac_x
3131 umull2 v23.8h, v4.16b, v7.16b
3132 umlal2 v23.8h, v5.16b, v16.16b
3136 rshrn v6.8b, v6.8h, #6
3137 rshrn2 v6.16b, v17.8h, #6
3138 rshrn v22.8b, v22.8h, #6
3139 rshrn2 v22.16b, v23.8h, #6
3151 b 8b
3157 umull v6.8h, v18.8b, v28.8b // left[base_y]*(64-frac_y)
3158 umlal v6.8h, v19.8b, v27.8b // + left[base_y+1]*frac_y
3159 umull2 v17.8h, v18.16b, v28.16b
3160 umlal2 v17.8h, v19.16b, v27.16b
3162 rshrn v6.8b, v6.8h, #6
3163 rshrn2 v6.16b, v17.8h, #6
3193 ld1 {v30.8h}, [x11] // increments
3213 xtn v24.8b, v30.8h // (uint8_t)ypos
3214 uqshrn v26.8b, v30.8h, #6 // base
3215 and v24.8b, v24.8b, v23.8b // frac
3217 mov v4.8b, v31.8b
3218 uqadd v27.8b, v26.8b, v20.8b // base + 1
3219 uqadd v28.8b, v26.8b, v21.8b // base + 2
3220 sub v25.8b, v22.8b, v24.8b // 64 - frac
3222 tbx v4.8b, {v0.16b, v1.16b}, v26.8b // left[base]
3228 mov v5.8b, v31.8b
3229 tbx v5.8b, {v0.16b, v1.16b}, v27.8b // left[base+1], left[base+2]
3233 umull v16.8h, v4.8b, v25.8b // left[base]*(64-frac)
3234 umlal v16.8h, v5.8b, v24.8b // + left[base+1]*frac
3235 rshrn v16.8b, v16.8h, #6
3241 ext v4.8b, v5.8b, v5.8b, #4
3242 uqadd v27.8b, v27.8b, v21.8b // base += 2
3250 dup v29.8h, w5 // dy
3252 mul v30.8h, v30.8h, v29.8h // {0,1,2,3,4,5,6,7}*dy
3255 // Worst case max_base_y is width+height-1, for w=8, h=32, <= 48
3257 add v30.8h, v29.8h, v30.8h // ypos
3263 xtn v24.8b, v30.8h // (uint8_t)ypos
3264 uqshrn v26.8b, v30.8h, #6 // base
3265 and v24.8b, v24.8b, v23.8b // frac
3267 mov v4.8b, v31.8b
3268 uqadd v27.8b, v26.8b, v20.8b // base + 1
3269 uqadd v28.8b, v26.8b, v21.8b // base + 2
3270 sub v25.8b, v22.8b, v24.8b // 64 - frac
3272 tbx v4.8b, {v0.16b, v1.16b, v2.16b}, v26.8b // left[base]
3274 mov v5.8b, v31.8b
3275 mov v6.8b, v31.8b
3276 tbx v5.8b, {v0.16b, v1.16b, v2.16b}, v27.8b // left[base+1]
3277 tbx v6.8b, {v0.16b, v1.16b, v2.16b}, v28.8b // left[base+2]
3279 umull v16.8h, v4.8b, v25.8b // left[base]*(64-frac)
3280 umlal v16.8h, v5.8b, v24.8b // + left[base+1]*frac
3281 umull v17.8h, v5.8b, v25.8b
3282 umlal v17.8h, v6.8b, v24.8b
3283 rshrn v16.8b, v16.8h, #6
3284 rshrn v17.8b, v17.8h, #6
3285 st1 {v16.8b}, [x0], x1
3287 st1 {v17.8b}, [x0], x1
3290 mov v4.8b, v6.8b
3291 uqadd v27.8b, v27.8b, v21.8b // base += 2
3292 uqadd v28.8b, v28.8b, v21.8b // base += 2
3300 dup v28.8h, w5 // dy
3302 shl v29.8h, v28.8h, #3 // 8*dy
3303 mul v30.8h, v30.8h, v28.8h // {0,1,2,3,4,5,6,7}*dy
3308 add v28.8h, v28.8h, v30.8h // ypos
3314 add v29.8h, v28.8h, v29.8h // ypos + 8*dy
3316 xtn v24.8b, v28.8h // (uint8_t)ypos
3317 xtn2 v24.16b, v29.8h
3318 uqshrn v26.8b, v28.8h, #6 // base
3319 uqshrn2 v26.16b, v29.8h, #6
3334 umull v16.8h, v4.8b, v25.8b // left[base]*(64-frac)
3335 umlal v16.8h, v5.8b, v24.8b // + left[base+1]*frac
3336 umull2 v17.8h, v4.16b, v25.16b
3337 umlal2 v17.8h, v5.16b, v24.16b
3338 umull v18.8h, v5.8b, v25.8b
3339 umlal v18.8h, v6.8b, v24.8b
3340 umull2 v19.8h, v5.16b, v25.16b
3341 umlal2 v19.8h, v6.16b, v24.16b
3342 rshrn v16.8b, v16.8h, #6
3343 rshrn2 v16.16b, v17.8h, #6
3344 rshrn v17.8b, v18.8h, #6
3345 rshrn2 v17.16b, v19.8h, #6
3361 dup v28.8h, w5 // dy
3366 shl v29.8h, v28.8h, #3 // 8*dy
3367 mul v30.8h, v30.8h, v28.8h // {0,1,2,3,4,5,6,7}*dy
3372 add v30.8h, v28.8h, v30.8h // ypos
3385 add v27.8h, v26.8h, v29.8h // ypos + 8*dy
3386 uqshrn v16.8b, v26.8h, #6 // base
3387 uqshrn2 v16.16b, v27.8h, #6
3388 xtn v24.8b, v26.8h // (uint8_t)ypos
3389 xtn2 v24.16b, v27.8h
3408 umull v16.8h, v4.8b, v25.8b // left[base]*(64-frac)
3409 umlal v16.8h, v5.8b, v24.8b // + left[base+1]*frac
3410 umull2 v17.8h, v4.16b, v25.16b
3411 umlal2 v17.8h, v5.16b, v24.16b
3412 umull v18.8h, v5.8b, v25.8b
3413 umlal v18.8h, v6.8b, v24.8b
3414 umull2 v19.8h, v5.16b, v25.16b
3415 umlal2 v19.8h, v6.16b, v24.16b
3416 rshrn v16.8b, v16.8h, #6
3417 rshrn2 v16.16b, v17.8h, #6
3418 rshrn v17.8b, v18.8h, #6
3419 rshrn2 v17.16b, v19.8h, #6
3423 add v26.8h, v27.8h, v29.8h // ypos += 16*dy
3429 movi v16.8h, #128
3432 add v30.8h, v30.8h, v16.8h // ypos = dy + y*(1<<6)*2
3478 umull v18.8h, v16.8b, v4.8b // left[base+1]*frac
3479 umlal v18.8h, v0.8b, v6.8b // + left[base]*(64-frac)
3480 umull2 v19.8h, v16.16b, v4.16b
3481 umlal2 v19.8h, v0.16b, v6.16b
3482 umull v20.8h, v17.8b, v5.8b
3483 umlal v20.8h, v2.8b, v7.8b
3484 umull2 v21.8h, v17.16b, v5.16b
3485 umlal2 v21.8h, v2.16b, v7.16b
3486 rshrn v16.8b, v18.8h, #6
3487 rshrn2 v16.16b, v19.8h, #6
3488 rshrn v17.8b, v20.8h, #6
3489 rshrn2 v17.16b, v21.8h, #6
3597 8:
3598 st1 {v31.8b}, [x0], x1
3600 st1 {v31.8b}, [x13], x1
3601 st1 {v31.8b}, [x0], x1
3602 st1 {v31.8b}, [x13], x1
3603 b.gt 8b
3604 subs w3, w3, #8
3610 add x0, x0, #8
3611 add x13, x13, #8
3678 cmp w3, #8
3682 ld1 {v30.8h}, [x11] // increments
3700 xtn v24.8b, v30.8h // (uint8_t)ypos
3701 uqshrn v26.8b, v30.8h, #6 // base
3702 and v24.8b, v24.8b, v23.8b // frac
3704 uqadd v27.8b, v26.8b, v20.8b // base + 1
3705 uqadd v28.8b, v26.8b, v21.8b // base + 2
3706 sub v25.8b, v22.8b, v24.8b // 64 - frac
3707 uqadd v29.8b, v27.8b, v21.8b // base + 3
3716 mov v4.8b, v31.8b
3717 mov v5.8b, v31.8b
3718 tbx v4.8b, {v0.16b, v1.16b}, v26.8b // left[base], left[base+2]
3719 tbx v5.8b, {v0.16b, v1.16b}, v27.8b // left[base+1], left[base+3]
3721 umull v16.8h, v4.8b, v25.8b // left[base]*(64-frac)
3722 umlal v16.8h, v5.8b, v24.8b // + left[base+1]*frac
3723 rshrn v16.8b, v16.8h, #6
3729 uqadd v26.8b, v26.8b, v21.8b // base += 4
3730 uqadd v27.8b, v27.8b, v21.8b // base += 4
3736 80: // w == 8
3737 dup v29.8h, w5 // dy
3739 mul v30.8h, v30.8h, v29.8h // {0,1,2,3,4,5,6,7}*dy
3745 add v30.8h, v29.8h, v30.8h // ypos
3751 xtn v24.8b, v30.8h // (uint8_t)ypos
3752 uqshrn v26.8b, v30.8h, #6 // base
3753 and v24.8b, v24.8b, v23.8b // frac
3755 uqadd v27.8b, v26.8b, v20.8b // base + 1
3756 uqadd v28.8b, v26.8b, v21.8b // base + 2
3757 sub v25.8b, v22.8b, v24.8b // 64 - frac
3758 uqadd v29.8b, v27.8b, v21.8b // base + 3
3772 umull v16.8h, v4.8b, v25.8b // left[base]*(64-frac)
3773 umlal v16.8h, v5.8b, v24.8b // + left[base+1]*frac
3774 umull2 v17.8h, v4.16b, v25.16b
3775 umlal2 v17.8h, v5.16b, v24.16b
3776 rshrn v16.8b, v16.8h, #6
3777 rshrn v17.8b, v17.8h, #6
3778 st1 {v16.8b}, [x0], x1
3780 st1 {v17.8b}, [x0], x1
3801 ld1 {v16.8b, v17.8b, v18.8b, v19.8b}, [x6], #32
3804 ld1 {v20.8b, v21.8b, v22.8b}, [x6]
3807 sxtl v16.8h, v16.8b
3808 sxtl v17.8h, v17.8b
3810 sxtl v18.8h, v18.8b
3811 sxtl v19.8h, v19.8b
3814 sxtl v20.8h, v20.8b
3815 sxtl v21.8h, v21.8b
3816 sxtl v22.8h, v22.8b
3823 uxtl v0.8h, v0.8b // top (0-3)
3826 mul v2.8h, v17.8h, v0.h[0] // p1(top[0]) * filter(1)
3827 mla v2.8h, v18.8h, v0.h[1] // p2(top[1]) * filter(2)
3828 mla v2.8h, v19.8h, v0.h[2] // p3(top[2]) * filter(3)
3829 uxtl v1.8h, v1.8b // left (0-1) + topleft (2)
3830 mla v2.8h, v20.8h, v0.h[3] // p4(top[3]) * filter(4)
3831 mla v2.8h, v16.8h, v1.h[2] // p0(topleft) * filter(0)
3832 mla v2.8h, v21.8h, v1.h[1] // p5(left[0]) * filter(5)
3833 mla v2.8h, v22.8h, v1.h[0] // p6(left[1]) * filter(6)
3834 sqrshrun v2.8b, v2.8h, #4
3837 uxtl v0.8h, v2.8b
3839 ext v0.16b, v0.16b, v0.16b, #8 // move top from [4-7] to [0-3]
3847 uxtl v0.8h, v0.8b // top (0-7)
3848 8:
3850 mul v2.8h, v17.8h, v0.h[0] // p1(top[0]) * filter(1)
3851 mla v2.8h, v18.8h, v0.h[1] // p2(top[1]) * filter(2)
3852 mla v2.8h, v19.8h, v0.h[2] // p3(top[2]) * filter(3)
3853 uxtl v1.8h, v1.8b // left (0-1) + topleft (2)
3854 mla v2.8h, v20.8h, v0.h[3] // p4(top[3]) * filter(4)
3855 mla v2.8h, v16.8h, v1.h[2] // p0(topleft) * filter(0)
3856 mla v2.8h, v21.8h, v1.h[1] // p5(left[0]) * filter(5)
3857 mla v2.8h, v22.8h, v1.h[0] // p6(left[1]) * filter(6)
3858 mul v3.8h, v17.8h, v0.h[4] // p1(top[0]) * filter(1)
3859 mla v3.8h, v18.8h, v0.h[5] // p2(top[1]) * filter(2)
3860 mla v3.8h, v19.8h, v0.h[6] // p3(top[2]) * filter(3)
3861 sqrshrun v2.8b, v2.8h, #4
3862 uxtl v1.8h, v2.8b // first block, in 16 bit
3863 mla v3.8h, v20.8h, v0.h[7] // p4(top[3]) * filter(4)
3864 mla v3.8h, v16.8h, v0.h[3] // p0(topleft) * filter(0)
3865 mla v3.8h, v21.8h, v1.h[3] // p5(left[0]) * filter(5)
3866 mla v3.8h, v22.8h, v1.h[7] // p6(left[1]) * filter(6)
3867 sqrshrun v3.8b, v3.8h, #4
3872 uxtl v0.8h, v0.8b
3873 b.gt 8b
3886 uxtl v0.8h, v0.8b // left (0-1) + topleft (2)
3889 mul v3.8h, v16.8h, v0.h[2] // p0(topleft) * filter(0)
3890 mla v3.8h, v21.8h, v0.h[1] // p5(left[0]) * filter(5)
3891 uxtl v1.8h, v2.8b // top(0-7)
3892 uxtl2 v2.8h, v2.16b // top(8-15)
3893 mla v3.8h, v22.8h, v0.h[0] // p6(left[1]) * filter(6)
3894 mla v3.8h, v17.8h, v1.h[0] // p1(top[0]) * filter(1)
3895 mla v3.8h, v18.8h, v1.h[1] // p2(top[1]) * filter(2)
3896 mla v3.8h, v19.8h, v1.h[2] // p3(top[2]) * filter(3)
3897 mla v3.8h, v20.8h, v1.h[3] // p4(top[3]) * filter(4)
3899 mul v4.8h, v17.8h, v1.h[4] // p1(top[0]) * filter(1)
3900 mla v4.8h, v18.8h, v1.h[5] // p2(top[1]) * filter(2)
3901 mla v4.8h, v19.8h, v1.h[6] // p3(top[2]) * filter(3)
3902 sqrshrun v3.8b, v3.8h, #4
3903 uxtl v0.8h, v3.8b // first block, in 16 bit
3904 mla v4.8h, v20.8h, v1.h[7] // p4(top[3]) * filter(4)
3905 mla v4.8h, v16.8h, v1.h[3] // p0(topleft) * filter(0)
3906 mla v4.8h, v21.8h, v0.h[3] // p5(left[0]) * filter(5)
3907 mla v4.8h, v22.8h, v0.h[7] // p6(left[1]) * filter(6)
3909 mul v5.8h, v17.8h, v2.h[0] // p1(top[0]) * filter(1)
3910 mla v5.8h, v18.8h, v2.h[1] // p2(top[1]) * filter(2)
3911 mla v5.8h, v19.8h, v2.h[2] // p3(top[2]) * filter(3)
3912 sqrshrun v4.8b, v4.8h, #4
3913 uxtl v0.8h, v4.8b // second block, in 16 bit
3914 mla v5.8h, v20.8h, v2.h[3] // p4(top[3]) * filter(4)
3915 mla v5.8h, v16.8h, v1.h[7] // p0(topleft) * filter(0)
3916 mla v5.8h, v21.8h, v0.h[3] // p5(left[0]) * filter(5)
3917 mla v5.8h, v22.8h, v0.h[7] // p6(left[1]) * filter(6)
3919 mul v6.8h, v17.8h, v2.h[4] // p1(top[0]) * filter(1)
3920 mla v6.8h, v18.8h, v2.h[5] // p2(top[1]) * filter(2)
3921 mla v6.8h, v19.8h, v2.h[6] // p3(top[2]) * filter(3)
3922 sqrshrun v5.8b, v5.8h, #4
3923 uxtl v0.8h, v5.8b // third block, in 16 bit
3924 mla v6.8h, v20.8h, v2.h[7] // p4(top[3]) * filter(4)
3925 mla v6.8h, v16.8h, v2.h[3] // p0(topleft) * filter(0)
3926 mla v6.8h, v21.8h, v0.h[3] // p5(left[0]) * filter(5)
3927 mla v6.8h, v22.8h, v0.h[7] // p6(left[1]) * filter(6)
3930 sqrshrun v6.8b, v6.8h, #4
3934 b.le 8f
3939 8:
3962 ld1 {v0.8b}, [x2]
3975 ld1 {v1.8b}, [x3], #8
3977 ushr v3.8b, v1.8b, #4
3978 and v2.8b, v1.8b, v31.8b
3989 8:
4002 b.gt 8b
4114 movi v0.8h, #128 // dc
4115 dup v1.8h, w6 // alpha
4123 ld1 {v2.8h, v3.8h}, [x5], #32
4124 mul v2.8h, v2.8h, v1.8h // diff = ac * alpha
4125 mul v3.8h, v3.8h, v1.8h
4126 cmlt v4.8h, v2.8h, #0 // sign
4127 cmlt v5.8h, v3.8h, #0
4128 add v2.8h, v2.8h, v4.8h // diff + sign
4129 add v3.8h, v3.8h, v5.8h
4130 srshr v2.8h, v2.8h, #6 // (diff + sign + 32) >> 6 = apply_sign()
4131 srshr v3.8h, v3.8h, #6
4132 add v2.8h, v2.8h, v0.8h // dc + apply_sign()
4133 add v3.8h, v3.8h, v0.8h
4134 sqxtun v2.8b, v2.8h // iclip_pixel(dc + apply_sign())
4135 sqxtun v3.8b, v3.8h
4146 ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [x5], #64
4147 mul v2.8h, v2.8h, v1.8h // diff = ac * alpha
4148 mul v3.8h, v3.8h, v1.8h
4149 mul v4.8h, v4.8h, v1.8h
4150 mul v5.8h, v5.8h, v1.8h
4151 cmlt v16.8h, v2.8h, #0 // sign
4152 cmlt v17.8h, v3.8h, #0
4153 cmlt v18.8h, v4.8h, #0
4154 cmlt v19.8h, v5.8h, #0
4155 add v2.8h, v2.8h, v16.8h // diff + sign
4156 add v3.8h, v3.8h, v17.8h
4157 add v4.8h, v4.8h, v18.8h
4158 add v5.8h, v5.8h, v19.8h
4159 srshr v2.8h, v2.8h, #6 // (diff + sign + 32) >> 6 = apply_sign()
4160 srshr v3.8h, v3.8h, #6
4161 srshr v4.8h, v4.8h, #6
4162 srshr v5.8h, v5.8h, #6
4163 add v2.8h, v2.8h, v0.8h // dc + apply_sign()
4164 add v3.8h, v3.8h, v0.8h
4165 add v4.8h, v4.8h, v0.8h
4166 add v5.8h, v5.8h, v0.8h
4167 sqxtun v2.8b, v2.8h // iclip_pixel(dc + apply_sign())
4168 sqxtun v3.8b, v3.8h
4169 sqxtun v4.8b, v4.8h
4170 sqxtun v5.8b, v5.8h
4171 st1 {v2.8b}, [x0], x1
4172 st1 {v3.8b}, [x6], x1
4174 st1 {v4.8b}, [x0], x1
4175 st1 {v5.8b}, [x6], x1
4184 ld1 {v2.8h, v3.8h}, [x5], #32
4185 ld1 {v4.8h, v5.8h}, [x7], #32
4186 mul v2.8h, v2.8h, v1.8h // diff = ac * alpha
4187 mul v3.8h, v3.8h, v1.8h
4188 mul v4.8h, v4.8h, v1.8h
4189 mul v5.8h, v5.8h, v1.8h
4190 cmlt v16.8h, v2.8h, #0 // sign
4191 cmlt v17.8h, v3.8h, #0
4192 cmlt v18.8h, v4.8h, #0
4193 cmlt v19.8h, v5.8h, #0
4194 add v2.8h, v2.8h, v16.8h // diff + sign
4195 add v3.8h, v3.8h, v17.8h
4196 add v4.8h, v4.8h, v18.8h
4197 add v5.8h, v5.8h, v19.8h
4198 srshr v2.8h, v2.8h, #6 // (diff + sign + 32) >> 6 = apply_sign()
4199 srshr v3.8h, v3.8h, #6
4200 srshr v4.8h, v4.8h, #6
4201 srshr v5.8h, v5.8h, #6
4202 add v2.8h, v2.8h, v0.8h // dc + apply_sign()
4203 add v3.8h, v3.8h, v0.8h
4204 add v4.8h, v4.8h, v0.8h
4205 add v5.8h, v5.8h, v0.8h
4206 sqxtun v2.8b, v2.8h // iclip_pixel(dc + apply_sign())
4207 sqxtun v3.8b, v3.8h
4208 sqxtun v4.8b, v4.8h
4209 sqxtun v5.8b, v5.8h
4211 st1 {v2.8b, v3.8b}, [x0], #16
4212 st1 {v4.8b, v5.8b}, [x6], #16
4241 dup v1.8h, w6 // alpha
4250 uaddlv h0, v0.8b
4252 dup v0.8h, v0.h[0]
4254 8:
4256 ld1 {v0.8b}, [x2]
4257 uaddlv h0, v0.8b
4259 dup v0.8h, v0.h[0]
4266 dup v0.8h, v0.h[0]
4275 dup v0.8h, v2.h[0]
4282 .word 8b - ipred_cfl_top_tbl
4300 dup v1.8h, w6 // alpha
4310 uaddlv h0, v0.8b
4312 dup v0.8h, v0.h[0]
4317 ld1 {v0.8b}, [x2]
4318 uaddlv h0, v0.8b
4320 dup v0.8h, v0.h[0]
4328 dup v0.8h, v0.h[0]
4338 dup v0.8h, v2.h[0]
4356 dup v1.8h, w6 // alpha
4359 dup v16.8h, w8 // width + height
4370 ushr v16.8h, v16.8h, #1 // (width + height) >> 1
4371 dup v17.8h, w8 // -ctz(width + height)
4381 uaddlv h0, v0.8b
4388 uaddlv h2, v2.8b
4393 // h = 8/16
4401 dup v0.8h, v0.h[0]
4406 ld1 {v0.8b}, [x2], #8
4407 uaddlv h0, v0.8b
4412 ld1 {v2.8b}, [x2]
4414 uaddlv h2, v2.8b
4415 cmp w4, #8
4427 dup v0.8h, v0.h[0]
4445 // h = 4/8/32
4453 dup v0.8h, v0.h[0]
4475 // h = 8/16
4483 dup v0.8h, v0.h[0]
4507 movi v16.8h, #0
4508 movi v17.8h, #0
4509 movi v18.8h, #0
4510 movi v19.8h, #0
4527 ld1 {v0.8b}, [x1], x2
4528 ld1 {v1.8b}, [x10], x2
4531 uaddlp v0.8h, v0.16b
4532 uaddlp v1.8h, v1.16b
4533 add v0.8h, v0.8h, v1.8h
4534 shl v0.8h, v0.8h, #1
4536 st1 {v0.8h}, [x0], #16
4537 add v16.8h, v16.8h, v0.8h
4545 st1 {v0.8h, v1.8h}, [x0], #32
4546 add v16.8h, v16.8h, v0.8h
4547 add v17.8h, v17.8h, v1.8h
4551 add v0.8h, v16.8h, v17.8h
4552 uaddlv s0, v0.8h // sum
4555 dup v4.8h, v4.h[0]
4557 ld1 {v0.8h, v1.8h}, [x0]
4559 sub v0.8h, v0.8h, v4.8h
4560 sub v1.8h, v1.8h, v4.8h
4561 st1 {v0.8h, v1.8h}, [x0], #32
4572 uaddlp v0.8h, v0.16b
4574 uaddlp v1.8h, v1.16b
4575 uaddlp v2.8h, v2.16b
4576 uaddlp v3.8h, v3.16b
4577 add v0.8h, v0.8h, v1.8h
4578 add v2.8h, v2.8h, v3.8h
4579 shl v0.8h, v0.8h, #1
4580 shl v1.8h, v2.8h, #1
4582 st1 {v0.8h, v1.8h}, [x0], #32
4583 add v16.8h, v16.8h, v0.8h
4584 add v17.8h, v17.8h, v1.8h
4591 ld1 {v0.8b}, [x1], x2
4592 ld1 {v1.8b}, [x10], x2
4595 uaddlp v0.8h, v0.16b
4596 uaddlp v1.8h, v1.16b
4597 add v0.8h, v0.8h, v1.8h
4598 shl v0.8h, v0.8h, #1
4616 st1 {v0.8h, v1.8h}, [x0], #32
4617 add v16.8h, v16.8h, v0.8h
4618 add v17.8h, v17.8h, v1.8h
4619 st1 {v0.8h, v1.8h}, [x0], #32
4620 add v18.8h, v18.8h, v0.8h
4621 add v19.8h, v19.8h, v1.8h
4627 add v0.8h, v16.8h, v17.8h
4628 add v2.8h, v18.8h, v19.8h
4629 uaddlp v0.4s, v0.8h
4630 uaddlp v2.4s, v2.8h
4635 dup v4.8h, v4.h[0]
4638 ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0]
4640 sub v0.8h, v0.8h, v4.8h
4641 sub v1.8h, v1.8h, v4.8h
4642 sub v2.8h, v2.8h, v4.8h
4643 sub v3.8h, v3.8h, v4.8h
4644 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
4660 uaddlp v0.8h, v0.16b
4662 uaddlp v1.8h, v1.16b
4664 uaddlp v2.8h, v2.16b
4665 uaddlp v3.8h, v3.16b
4666 uaddlp v4.8h, v4.16b
4667 uaddlp v5.8h, v5.16b
4668 uaddlp v6.8h, v6.16b
4669 uaddlp v7.8h, v7.16b
4670 add v0.8h, v0.8h, v2.8h
4671 add v1.8h, v1.8h, v3.8h
4672 add v4.8h, v4.8h, v6.8h
4673 add v5.8h, v5.8h, v7.8h
4674 shl v0.8h, v0.8h, #1
4675 shl v1.8h, v1.8h, #1
4676 shl v2.8h, v4.8h, #1
4677 shl v3.8h, v5.8h, #1
4679 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
4680 add v16.8h, v16.8h, v0.8h
4681 add v17.8h, v17.8h, v1.8h
4682 add v18.8h, v18.8h, v2.8h
4683 add v19.8h, v19.8h, v3.8h
4696 uaddlp v1.4h, v1.8b
4698 uaddlp v0.8h, v0.16b
4700 uaddlp v3.4h, v3.8b
4702 uaddlp v2.8h, v2.16b
4704 uaddlp v5.4h, v5.8b
4705 uaddlp v4.8h, v4.16b
4706 uaddlp v7.4h, v7.8b
4707 uaddlp v6.8h, v6.16b
4709 add v0.8h, v0.8h, v2.8h
4711 add v4.8h, v4.8h, v6.8h
4713 shl v0.8h, v0.8h, #1
4715 shl v2.8h, v4.8h, #1
4721 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
4722 add v16.8h, v16.8h, v0.8h
4723 add v17.8h, v17.8h, v1.8h
4724 add v18.8h, v18.8h, v2.8h
4725 add v19.8h, v19.8h, v3.8h
4733 1: // Copy and subsample input, padding 8
4737 uaddlp v0.8h, v0.16b
4739 uaddlp v2.8h, v2.16b
4740 uaddlp v4.8h, v4.16b
4741 uaddlp v6.8h, v6.16b
4742 add v0.8h, v0.8h, v2.8h
4743 add v4.8h, v4.8h, v6.8h
4744 shl v0.8h, v0.8h, #1
4745 shl v2.8h, v4.8h, #1
4746 dup v1.8h, v0.h[7]
4747 dup v3.8h, v2.h[7]
4749 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
4750 add v16.8h, v16.8h, v0.8h
4751 add v17.8h, v17.8h, v1.8h
4752 add v18.8h, v18.8h, v2.8h
4753 add v19.8h, v19.8h, v3.8h
4762 ld1 {v0.8b}, [x1], x2
4763 ld1 {v2.8b}, [x10], x2
4764 ld1 {v4.8b}, [x1], x2
4765 uaddlp v0.4h, v0.8b
4766 ld1 {v6.8b}, [x10], x2
4767 uaddlp v2.4h, v2.8b
4768 uaddlp v4.4h, v4.8b
4769 uaddlp v6.4h, v6.8b
4774 dup v1.8h, v0.h[3]
4775 dup v3.8h, v2.h[3]
4779 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
4780 add v16.8h, v16.8h, v0.8h
4781 add v17.8h, v17.8h, v1.8h
4782 add v18.8h, v18.8h, v2.8h
4783 add v19.8h, v19.8h, v3.8h
4792 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
4793 add v16.8h, v16.8h, v0.8h
4794 add v17.8h, v17.8h, v1.8h
4795 add v18.8h, v18.8h, v2.8h
4796 add v19.8h, v19.8h, v3.8h
4797 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
4798 add v16.8h, v16.8h, v0.8h
4799 add v17.8h, v17.8h, v1.8h
4800 add v18.8h, v18.8h, v2.8h
4801 add v19.8h, v19.8h, v3.8h
4832 movi v16.8h, #0
4833 movi v17.8h, #0
4834 movi v18.8h, #0
4835 movi v19.8h, #0
4852 ld1 {v0.8b}, [x1], x2
4854 ld1 {v1.8b}, [x1], x2
4856 uaddlp v0.8h, v0.16b
4857 uaddlp v1.8h, v1.16b
4858 shl v0.8h, v0.8h, #2
4859 shl v1.8h, v1.8h, #2
4861 add v16.8h, v16.8h, v0.8h
4862 add v17.8h, v17.8h, v1.8h
4863 st1 {v0.8h, v1.8h}, [x0], #32
4876 uaddlp v0.8h, v0.16b
4878 uaddlp v1.8h, v1.16b
4879 uaddlp v2.8h, v2.16b
4880 uaddlp v3.8h, v3.16b
4881 shl v0.8h, v0.8h, #2
4882 shl v1.8h, v1.8h, #2
4883 shl v2.8h, v2.8h, #2
4884 shl v3.8h, v3.8h, #2
4886 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
4887 add v16.8h, v16.8h, v0.8h
4888 add v17.8h, v17.8h, v1.8h
4889 add v18.8h, v18.8h, v2.8h
4890 add v19.8h, v19.8h, v3.8h
4898 ld1 {v0.8b}, [x1], x2
4900 ld1 {v2.8b}, [x1], x2
4902 uaddlp v0.8h, v0.16b
4903 uaddlp v2.8h, v2.16b
4904 shl v0.8h, v0.8h, #2
4905 shl v2.8h, v2.8h, #2
4907 dup v5.8h, v0.h[7]
4909 dup v7.8h, v2.h[7]
4915 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
4916 add v16.8h, v16.8h, v0.8h
4917 add v17.8h, v17.8h, v1.8h
4918 add v18.8h, v18.8h, v2.8h
4919 add v19.8h, v19.8h, v3.8h
4937 uaddlp v0.8h, v0.16b
4938 uaddlp v1.8h, v1.16b
4939 uaddlp v2.8h, v2.16b
4940 uaddlp v3.8h, v3.16b
4941 shl v0.8h, v0.8h, #2
4942 shl v1.8h, v1.8h, #2
4943 shl v2.8h, v2.8h, #2
4944 shl v3.8h, v3.8h, #2
4946 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
4947 add v16.8h, v16.8h, v0.8h
4948 add v17.8h, v17.8h, v1.8h
4949 add v18.8h, v18.8h, v2.8h
4950 add v19.8h, v19.8h, v3.8h
4963 uaddlp v1.4h, v1.8b
4964 uaddlp v0.8h, v0.16b
4965 uaddlp v3.4h, v3.8b
4966 uaddlp v2.8h, v2.16b
4968 shl v0.8h, v0.8h, #2
4970 shl v2.8h, v2.8h, #2
4976 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
4977 add v16.8h, v16.8h, v0.8h
4978 add v17.8h, v17.8h, v1.8h
4979 add v18.8h, v18.8h, v2.8h
4980 add v19.8h, v19.8h, v3.8h
4988 1: // Copy and subsample input, padding 8
4991 uaddlp v0.8h, v0.16b
4992 uaddlp v2.8h, v2.16b
4993 shl v0.8h, v0.8h, #2
4994 shl v2.8h, v2.8h, #2
4995 dup v1.8h, v0.h[7]
4996 dup v3.8h, v2.h[7]
4998 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
4999 add v16.8h, v16.8h, v0.8h
5000 add v17.8h, v17.8h, v1.8h
5001 add v18.8h, v18.8h, v2.8h
5002 add v19.8h, v19.8h, v3.8h
5011 ld1 {v0.8b}, [x1], x2
5012 ld1 {v2.8b}, [x10], x2
5013 uaddlp v0.4h, v0.8b
5014 uaddlp v2.4h, v2.8b
5017 dup v1.8h, v0.h[3]
5018 dup v3.8h, v2.h[3]
5022 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
5023 add v16.8h, v16.8h, v0.8h
5024 add v17.8h, v17.8h, v1.8h
5025 add v18.8h, v18.8h, v2.8h
5026 add v19.8h, v19.8h, v3.8h
5055 movi v16.8h, #0
5056 movi v17.8h, #0
5057 movi v18.8h, #0
5058 movi v19.8h, #0
5079 ushll v0.8h, v0.8b, #3
5080 ushll v1.8h, v1.8b, #3
5082 add v16.8h, v16.8h, v0.8h
5083 add v17.8h, v17.8h, v1.8h
5084 st1 {v0.8h, v1.8h}, [x0], #32
5093 ld1 {v0.8b}, [x1], x2
5094 ld1 {v1.8b}, [x10], x2
5095 ld1 {v2.8b}, [x1], x2
5096 ushll v0.8h, v0.8b, #3
5097 ld1 {v3.8b}, [x10], x2
5098 ushll v1.8h, v1.8b, #3
5099 ushll v2.8h, v2.8b, #3
5100 ushll v3.8h, v3.8b, #3
5102 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
5103 add v16.8h, v16.8h, v0.8h
5104 add v17.8h, v17.8h, v1.8h
5105 add v18.8h, v18.8h, v2.8h
5106 add v19.8h, v19.8h, v3.8h
5119 ushll2 v1.8h, v0.16b, #3
5120 ushll v0.8h, v0.8b, #3
5122 ushll2 v3.8h, v2.16b, #3
5123 ushll v2.8h, v2.8b, #3
5124 ushll2 v5.8h, v4.16b, #3
5125 ushll v4.8h, v4.8b, #3
5126 ushll2 v7.8h, v6.16b, #3
5127 ushll v6.8h, v6.8b, #3
5129 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
5130 add v16.8h, v16.8h, v0.8h
5131 add v17.8h, v17.8h, v1.8h
5132 add v18.8h, v18.8h, v2.8h
5133 add v19.8h, v19.8h, v3.8h
5134 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
5135 add v16.8h, v16.8h, v4.8h
5136 add v17.8h, v17.8h, v5.8h
5137 add v18.8h, v18.8h, v6.8h
5138 add v19.8h, v19.8h, v7.8h
5147 1: // Copy and expand input, padding 8
5148 ld1 {v0.8b}, [x1], x2
5149 ld1 {v2.8b}, [x10], x2
5150 ld1 {v4.8b}, [x1], x2
5151 ld1 {v6.8b}, [x10], x2
5152 ushll v0.8h, v0.8b, #3
5153 ushll v2.8h, v2.8b, #3
5154 ushll v4.8h, v4.8b, #3
5155 ushll v6.8h, v6.8b, #3
5156 dup v1.8h, v0.h[7]
5157 dup v3.8h, v2.h[7]
5158 dup v5.8h, v4.h[7]
5159 dup v7.8h, v6.h[7]
5161 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
5162 add v16.8h, v16.8h, v0.8h
5163 add v17.8h, v17.8h, v1.8h
5164 add v18.8h, v18.8h, v2.8h
5165 add v19.8h, v19.8h, v3.8h
5166 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
5167 add v16.8h, v16.8h, v4.8h
5168 add v17.8h, v17.8h, v5.8h
5169 add v18.8h, v18.8h, v6.8h
5170 add v19.8h, v19.8h, v7.8h
5191 ushll v0.8h, v2.8b, #3
5192 ushll2 v1.8h, v2.16b, #3
5193 ushll v2.8h, v3.8b, #3
5194 ushll2 v3.8h, v3.16b, #3
5195 ushll v4.8h, v6.8b, #3
5196 ushll2 v5.8h, v6.16b, #3
5197 ushll v6.8h, v7.8b, #3
5198 ushll2 v7.8h, v7.16b, #3
5200 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
5201 add v16.8h, v16.8h, v0.8h
5202 add v17.8h, v17.8h, v1.8h
5203 add v18.8h, v18.8h, v2.8h
5204 add v19.8h, v19.8h, v3.8h
5205 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
5206 add v16.8h, v16.8h, v4.8h
5207 add v17.8h, v17.8h, v5.8h
5208 add v18.8h, v18.8h, v6.8h
5209 add v19.8h, v19.8h, v7.8h
5215 1: // Copy and expand input, padding 8
5220 ushll v2.8h, v2.8b, #3
5221 ushll v0.8h, v1.8b, #3
5222 ushll2 v1.8h, v1.16b, #3
5223 ushll v6.8h, v6.8b, #3
5224 ushll v4.8h, v5.8b, #3
5225 ushll2 v5.8h, v5.16b, #3
5226 dup v3.8h, v2.h[7]
5227 dup v7.8h, v6.h[7]
5229 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
5230 add v16.8h, v16.8h, v0.8h
5231 add v17.8h, v17.8h, v1.8h
5232 add v18.8h, v18.8h, v2.8h
5233 add v19.8h, v19.8h, v3.8h
5234 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
5235 add v16.8h, v16.8h, v4.8h
5236 add v17.8h, v17.8h, v5.8h
5237 add v18.8h, v18.8h, v6.8h
5238 add v19.8h, v19.8h, v7.8h
5247 ushll v0.8h, v1.8b, #3
5248 ushll2 v1.8h, v1.16b, #3
5249 ushll v4.8h, v5.8b, #3
5250 ushll2 v5.8h, v5.16b, #3
5251 dup v2.8h, v1.h[7]
5252 dup v3.8h, v1.h[7]
5253 dup v6.8h, v5.h[7]
5254 dup v7.8h, v5.h[7]
5256 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
5257 add v16.8h, v16.8h, v0.8h
5258 add v17.8h, v17.8h, v1.8h
5259 add v18.8h, v18.8h, v2.8h
5260 add v19.8h, v19.8h, v3.8h
5261 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
5262 add v16.8h, v16.8h, v4.8h
5263 add v17.8h, v17.8h, v5.8h
5264 add v18.8h, v18.8h, v6.8h
5265 add v19.8h, v19.8h, v7.8h
5272 ld1 {v0.8b}, [x1], x2
5273 ld1 {v4.8b}, [x10], x2
5274 ushll v0.8h, v0.8b, #3
5275 ushll v4.8h, v4.8b, #3
5276 dup v1.8h, v0.h[7]
5277 dup v2.8h, v0.h[7]
5278 dup v3.8h, v0.h[7]
5279 dup v5.8h, v4.h[7]
5280 dup v6.8h, v4.h[7]
5281 dup v7.8h, v4.h[7]
5283 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
5284 add v16.8h, v16.8h, v0.8h
5285 add v17.8h, v17.8h, v1.8h
5286 add v18.8h, v18.8h, v2.8h
5287 add v19.8h, v19.8h, v3.8h
5288 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
5289 add v16.8h, v16.8h, v4.8h
5290 add v17.8h, v17.8h, v5.8h
5291 add v18.8h, v18.8h, v6.8h
5292 add v19.8h, v19.8h, v7.8h
5299 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
5300 add v16.8h, v16.8h, v4.8h
5301 add v17.8h, v17.8h, v5.8h
5302 add v18.8h, v18.8h, v6.8h
5303 add v19.8h, v19.8h, v7.8h
5304 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
5305 add v16.8h, v16.8h, v4.8h
5306 add v17.8h, v17.8h, v5.8h
5307 add v18.8h, v18.8h, v6.8h
5308 add v19.8h, v19.8h, v7.8h
5316 uaddlp v0.4s, v16.8h
5317 uaddlp v1.4s, v17.8h
5318 uaddlp v2.4s, v18.8h
5319 uaddlp v3.4s, v19.8h
5326 dup v4.8h, v4.h[0]