Lines Matching full:sub

126 #define GEN_FFT_4(ret, suffix, T, T_VEC, load, store, constant, add, sub) \  argument
134 const T_VEC w1 = sub(i0, i2); \
136 const T_VEC w3 = sub(i1, i3); \
139 store(output + 2 * stride, sub(w0, w2)); \
140 store(output + 3 * stride, sub(kWeight0, w3)); \
143 #define GEN_FFT_8(ret, suffix, T, T_VEC, load, store, constant, add, sub, mul) \ argument
156 const T_VEC w1 = sub(i0, i4); \
158 const T_VEC w3 = sub(i2, i6); \
160 const T_VEC w5 = sub(w0, w2); \
162 const T_VEC w8 = sub(i1, i5); \
164 const T_VEC w10 = sub(i3, i7); \
166 const T_VEC w12 = sub(w7, w9); \
168 store(output + 1 * stride, add(w1, mul(kWeight2, sub(w8, w10)))); \
170 store(output + 3 * stride, sub(w1, mul(kWeight2, sub(w8, w10)))); \
171 store(output + 4 * stride, sub(w4, w11)); \
173 sub(sub(kWeight0, w3), mul(kWeight2, add(w10, w8)))); \
174 store(output + 6 * stride, sub(kWeight0, w12)); \
175 store(output + 7 * stride, sub(w3, mul(kWeight2, add(w10, w8)))); \
178 #define GEN_FFT_16(ret, suffix, T, T_VEC, load, store, constant, add, sub, \ argument
202 const T_VEC w1 = sub(i0, i8); \
204 const T_VEC w3 = sub(i4, i12); \
206 const T_VEC w5 = sub(w0, w2); \
208 const T_VEC w8 = sub(i2, i10); \
210 const T_VEC w10 = sub(i6, i14); \
212 const T_VEC w12 = sub(w7, w9); \
214 const T_VEC w15 = sub(w4, w11); \
215 const T_VEC w16[2] = { add(w1, mul(kWeight2, sub(w8, w10))), \
216 sub(sub(kWeight0, w3), \
218 const T_VEC w18[2] = { sub(w1, mul(kWeight2, sub(w8, w10))), \
219 sub(w3, mul(kWeight2, add(w10, w8))) }; \
221 const T_VEC w20 = sub(i1, i9); \
223 const T_VEC w22 = sub(i5, i13); \
225 const T_VEC w24 = sub(w19, w21); \
227 const T_VEC w27 = sub(i3, i11); \
229 const T_VEC w29 = sub(i7, i15); \
231 const T_VEC w31 = sub(w26, w28); \
233 const T_VEC w34 = sub(w23, w30); \
234 const T_VEC w35[2] = { add(w20, mul(kWeight2, sub(w27, w29))), \
235 sub(sub(kWeight0, w22), \
237 const T_VEC w37[2] = { sub(w20, mul(kWeight2, sub(w27, w29))), \
238 sub(w22, mul(kWeight2, add(w29, w27))) }; \
242 store(output + 2 * stride, add(w5, mul(kWeight2, sub(w24, w31)))); \
247 add(w18[0], sub(sub(kWeight0, mul(kWeight4, w37[0])), \
249 store(output + 6 * stride, sub(w5, mul(kWeight2, sub(w24, w31)))); \
251 add(w16[0], sub(sub(kWeight0, mul(kWeight3, w35[0])), \
253 store(output + 8 * stride, sub(w14, w33)); \
255 add(w16[1], sub(mul(kWeight3, w35[1]), mul(kWeight4, w35[0])))); \
257 sub(sub(kWeight0, w12), mul(kWeight2, add(w31, w24)))); \
259 add(w18[1], sub(mul(kWeight4, w37[1]), mul(kWeight3, w37[0])))); \
260 store(output + 12 * stride, sub(kWeight0, w34)); \
262 sub(sub(kWeight0, w18[1]), \
263 sub(mul(kWeight3, w37[0]), mul(kWeight4, w37[1])))); \
264 store(output + 14 * stride, sub(w12, mul(kWeight2, add(w31, w24)))); \
266 sub(sub(kWeight0, w16[1]), \
267 sub(mul(kWeight4, w35[0]), mul(kWeight3, w35[1])))); \
270 #define GEN_FFT_32(ret, suffix, T, T_VEC, load, store, constant, add, sub, \ argument
314 const T_VEC w1 = sub(i0, i16); \
316 const T_VEC w3 = sub(i8, i24); \
318 const T_VEC w5 = sub(w0, w2); \
320 const T_VEC w8 = sub(i4, i20); \
322 const T_VEC w10 = sub(i12, i28); \
324 const T_VEC w12 = sub(w7, w9); \
326 const T_VEC w15 = sub(w4, w11); \
327 const T_VEC w16[2] = { add(w1, mul(kWeight2, sub(w8, w10))), \
328 sub(sub(kWeight0, w3), \
330 const T_VEC w18[2] = { sub(w1, mul(kWeight2, sub(w8, w10))), \
331 sub(w3, mul(kWeight2, add(w10, w8))) }; \
333 const T_VEC w20 = sub(i2, i18); \
335 const T_VEC w22 = sub(i10, i26); \
337 const T_VEC w24 = sub(w19, w21); \
339 const T_VEC w27 = sub(i6, i22); \
341 const T_VEC w29 = sub(i14, i30); \
343 const T_VEC w31 = sub(w26, w28); \
345 const T_VEC w34 = sub(w23, w30); \
346 const T_VEC w35[2] = { add(w20, mul(kWeight2, sub(w27, w29))), \
347 sub(sub(kWeight0, w22), \
349 const T_VEC w37[2] = { sub(w20, mul(kWeight2, sub(w27, w29))), \
350 sub(w22, mul(kWeight2, add(w29, w27))) }; \
352 const T_VEC w39 = sub(w14, w33); \
355 add(w16[1], sub(mul(kWeight3, w35[1]), mul(kWeight4, w35[0]))) \
357 const T_VEC w41[2] = { add(w5, mul(kWeight2, sub(w24, w31))), \
358 sub(sub(kWeight0, w12), \
362 add(w18[1], sub(mul(kWeight4, w37[1]), mul(kWeight3, w37[0]))) \
366 sub(sub(kWeight0, mul(kWeight4, w37[0])), mul(kWeight3, w37[1]))), \
367 sub(sub(kWeight0, w18[1]), \
368 sub(mul(kWeight3, w37[0]), mul(kWeight4, w37[1]))) \
370 const T_VEC w45[2] = { sub(w5, mul(kWeight2, sub(w24, w31))), \
371 sub(w12, mul(kWeight2, add(w31, w24))) }; \
374 sub(sub(kWeight0, mul(kWeight3, w35[0])), mul(kWeight4, w35[1]))), \
375 sub(sub(kWeight0, w16[1]), \
376 sub(mul(kWeight4, w35[0]), mul(kWeight3, w35[1]))) \
379 const T_VEC w48 = sub(i1, i17); \
381 const T_VEC w50 = sub(i9, i25); \
383 const T_VEC w52 = sub(w47, w49); \
385 const T_VEC w55 = sub(i5, i21); \
387 const T_VEC w57 = sub(i13, i29); \
389 const T_VEC w59 = sub(w54, w56); \
391 const T_VEC w62 = sub(w51, w58); \
392 const T_VEC w63[2] = { add(w48, mul(kWeight2, sub(w55, w57))), \
393 sub(sub(kWeight0, w50), \
395 const T_VEC w65[2] = { sub(w48, mul(kWeight2, sub(w55, w57))), \
396 sub(w50, mul(kWeight2, add(w57, w55))) }; \
398 const T_VEC w67 = sub(i3, i19); \
400 const T_VEC w69 = sub(i11, i27); \
402 const T_VEC w71 = sub(w66, w68); \
404 const T_VEC w74 = sub(i7, i23); \
406 const T_VEC w76 = sub(i15, i31); \
408 const T_VEC w78 = sub(w73, w75); \
410 const T_VEC w81 = sub(w70, w77); \
411 const T_VEC w82[2] = { add(w67, mul(kWeight2, sub(w74, w76))), \
412 sub(sub(kWeight0, w69), \
414 const T_VEC w84[2] = { sub(w67, mul(kWeight2, sub(w74, w76))), \
415 sub(w69, mul(kWeight2, add(w76, w74))) }; \
417 const T_VEC w86 = sub(w61, w80); \
420 add(w63[1], sub(mul(kWeight3, w82[1]), mul(kWeight4, w82[0]))) \
422 const T_VEC w88[2] = { add(w52, mul(kWeight2, sub(w71, w78))), \
423 sub(sub(kWeight0, w59), \
427 add(w65[1], sub(mul(kWeight4, w84[1]), mul(kWeight3, w84[0]))) \
431 sub(sub(kWeight0, mul(kWeight4, w84[0])), mul(kWeight3, w84[1]))), \
432 sub(sub(kWeight0, w65[1]), \
433 sub(mul(kWeight3, w84[0]), mul(kWeight4, w84[1]))) \
435 const T_VEC w92[2] = { sub(w52, mul(kWeight2, sub(w71, w78))), \
436 sub(w59, mul(kWeight2, add(w78, w71))) }; \
439 sub(sub(kWeight0, mul(kWeight3, w82[0])), mul(kWeight4, w82[1]))), \
440 sub(sub(kWeight0, w63[1]), \
441 sub(mul(kWeight4, w82[0]), mul(kWeight3, w82[1]))) \
450 store(output + 4 * stride, add(w15, mul(kWeight2, sub(w62, w81)))); \
459 add(w46[0], sub(sub(kWeight0, mul(kWeight6, w93[0])), \
462 add(w45[0], sub(sub(kWeight0, mul(kWeight4, w92[0])), \
465 add(w44[0], sub(sub(kWeight0, mul(kWeight8, w91[0])), \
467 store(output + 12 * stride, sub(w15, mul(kWeight2, sub(w62, w81)))); \
469 add(w42[0], sub(sub(kWeight0, mul(kWeight7, w89[0])), \
472 add(w41[0], sub(sub(kWeight0, mul(kWeight3, w88[0])), \
475 add(w40[0], sub(sub(kWeight0, mul(kWeight5, w87[0])), \
477 store(output + 16 * stride, sub(w38, w85)); \
479 add(w40[1], sub(mul(kWeight5, w87[1]), mul(kWeight6, w87[0])))); \
481 add(w41[1], sub(mul(kWeight3, w88[1]), mul(kWeight4, w88[0])))); \
483 add(w42[1], sub(mul(kWeight7, w89[1]), mul(kWeight8, w89[0])))); \
485 sub(sub(kWeight0, w34), mul(kWeight2, add(w81, w62)))); \
487 add(w44[1], sub(mul(kWeight8, w91[1]), mul(kWeight7, w91[0])))); \
489 add(w45[1], sub(mul(kWeight4, w92[1]), mul(kWeight3, w92[0])))); \
491 add(w46[1], sub(mul(kWeight6, w93[1]), mul(kWeight5, w93[0])))); \
492 store(output + 24 * stride, sub(kWeight0, w86)); \
494 sub(sub(kWeight0, w46[1]), \
495 sub(mul(kWeight5, w93[0]), mul(kWeight6, w93[1])))); \
497 sub(sub(kWeight0, w45[1]), \
498 sub(mul(kWeight3, w92[0]), mul(kWeight4, w92[1])))); \
500 sub(sub(kWeight0, w44[1]), \
501 sub(mul(kWeight7, w91[0]), mul(kWeight8, w91[1])))); \
502 store(output + 28 * stride, sub(w34, mul(kWeight2, add(w81, w62)))); \
504 sub(sub(kWeight0, w42[1]), \
505 sub(mul(kWeight8, w89[0]), mul(kWeight7, w89[1])))); \
507 sub(sub(kWeight0, w41[1]), \
508 sub(mul(kWeight4, w88[0]), mul(kWeight3, w88[1])))); \
510 sub(sub(kWeight0, w40[1]), \
511 sub(mul(kWeight6, w87[0]), mul(kWeight5, w87[1])))); \
522 #define GEN_IFFT_4(ret, suffix, T, T_VEC, load, store, constant, add, sub) \ argument
530 const T_VEC w3 = sub(i0, i2); \
531 const T_VEC w4[2] = { add(i1, i1), sub(i3, i3) }; \
532 const T_VEC w5[2] = { sub(i1, i1), sub(sub(kWeight0, i3), i3) }; \
535 store(output + 2 * stride, sub(w2, w4[0])); \
536 store(output + 3 * stride, sub(w3, w5[1])); \
539 #define GEN_IFFT_8(ret, suffix, T, T_VEC, load, store, constant, add, sub, \ argument
553 const T_VEC w7 = sub(i0, i4); \
554 const T_VEC w8[2] = { add(i2, i2), sub(i6, i6) }; \
555 const T_VEC w9[2] = { sub(i2, i2), sub(sub(kWeight0, i6), i6) }; \
557 const T_VEC w11[2] = { sub(w6, w8[0]), sub(kWeight0, w8[1]) }; \
558 const T_VEC w12[2] = { add(w7, w9[1]), sub(kWeight0, w9[0]) }; \
559 const T_VEC w13[2] = { sub(w7, w9[1]), w9[0] }; \
560 const T_VEC w14[2] = { add(i1, i3), sub(i7, i5) }; \
561 const T_VEC w15[2] = { sub(i1, i3), sub(sub(kWeight0, i5), i7) }; \
562 const T_VEC w16[2] = { add(i3, i1), sub(i5, i7) }; \
563 const T_VEC w17[2] = { sub(i3, i1), sub(sub(kWeight0, i7), i5) }; \
565 const T_VEC w19[2] = { sub(w14[0], w16[0]), sub(w14[1], w16[1]) }; \
566 const T_VEC w20[2] = { add(w15[0], w17[1]), sub(w15[1], w17[0]) }; \
567 const T_VEC w21[2] = { sub(w15[0], w17[1]), add(w15[1], w17[0]) }; \
573 sub(w13[0], mul(kWeight2, sub(w21[0], w21[1])))); \
574 store(output + 4 * stride, sub(w10[0], w18[0])); \
576 add(w12[0], sub(sub(kWeight0, mul(kWeight2, w20[0])), \
578 store(output + 6 * stride, sub(w11[0], w19[1])); \
580 add(w13[0], mul(kWeight2, sub(w21[0], w21[1])))); \
583 #define GEN_IFFT_16(ret, suffix, T, T_VEC, load, store, constant, add, sub, \ argument
607 const T_VEC w15 = sub(i0, i8); \
608 const T_VEC w16[2] = { add(i4, i4), sub(i12, i12) }; \
609 const T_VEC w17[2] = { sub(i4, i4), sub(sub(kWeight0, i12), i12) }; \
611 const T_VEC w19[2] = { sub(w14, w16[0]), sub(kWeight0, w16[1]) }; \
612 const T_VEC w20[2] = { add(w15, w17[1]), sub(kWeight0, w17[0]) }; \
613 const T_VEC w21[2] = { sub(w15, w17[1]), w17[0] }; \
614 const T_VEC w22[2] = { add(i2, i6), sub(i14, i10) }; \
615 const T_VEC w23[2] = { sub(i2, i6), sub(sub(kWeight0, i10), i14) }; \
616 const T_VEC w24[2] = { add(i6, i2), sub(i10, i14) }; \
617 const T_VEC w25[2] = { sub(i6, i2), sub(sub(kWeight0, i14), i10) }; \
619 const T_VEC w27[2] = { sub(w22[0], w24[0]), sub(w22[1], w24[1]) }; \
620 const T_VEC w28[2] = { add(w23[0], w25[1]), sub(w23[1], w25[0]) }; \
621 const T_VEC w29[2] = { sub(w23[0], w25[1]), add(w23[1], w25[0]) }; \
623 const T_VEC w31[2] = { sub(w18[0], w26[0]), sub(w18[1], w26[1]) }; \
625 add(w20[1], mul(kWeight2, sub(w28[1], w28[0]))) }; \
627 sub(sub(kWeight0, mul(kWeight2, w28[0])), \
629 add(w20[1], mul(kWeight2, sub(w28[0], w28[1]))) }; \
630 const T_VEC w34[2] = { add(w19[0], w27[1]), sub(w19[1], w27[0]) }; \
631 const T_VEC w35[2] = { sub(w19[0], w27[1]), add(w19[1], w27[0]) }; \
632 const T_VEC w36[2] = { sub(w21[0], mul(kWeight2, sub(w29[0], w29[1]))), \
633 sub(w21[1], mul(kWeight2, add(w29[1], w29[0]))) }; \
634 const T_VEC w37[2] = { add(w21[0], mul(kWeight2, sub(w29[0], w29[1]))), \
636 const T_VEC w38[2] = { add(i1, i7), sub(i15, i9) }; \
637 const T_VEC w39[2] = { sub(i1, i7), sub(sub(kWeight0, i9), i15) }; \
638 const T_VEC w40[2] = { add(i5, i3), sub(i11, i13) }; \
639 const T_VEC w41[2] = { sub(i5, i3), sub(sub(kWeight0, i13), i11) }; \
641 const T_VEC w43[2] = { sub(w38[0], w40[0]), sub(w38[1], w40[1]) }; \
642 const T_VEC w44[2] = { add(w39[0], w41[1]), sub(w39[1], w41[0]) }; \
643 const T_VEC w45[2] = { sub(w39[0], w41[1]), add(w39[1], w41[0]) }; \
644 const T_VEC w46[2] = { add(i3, i5), sub(i13, i11) }; \
645 const T_VEC w47[2] = { sub(i3, i5), sub(sub(kWeight0, i11), i13) }; \
646 const T_VEC w48[2] = { add(i7, i1), sub(i9, i15) }; \
647 const T_VEC w49[2] = { sub(i7, i1), sub(sub(kWeight0, i15), i9) }; \
649 const T_VEC w51[2] = { sub(w46[0], w48[0]), sub(w46[1], w48[1]) }; \
650 const T_VEC w52[2] = { add(w47[0], w49[1]), sub(w47[1], w49[0]) }; \
651 const T_VEC w53[2] = { sub(w47[0], w49[1]), add(w47[1], w49[0]) }; \
653 const T_VEC w55[2] = { sub(w42[0], w50[0]), sub(w42[1], w50[1]) }; \
655 add(w44[1], mul(kWeight2, sub(w52[1], w52[0]))) }; \
657 sub(sub(kWeight0, mul(kWeight2, w52[0])), \
659 add(w44[1], mul(kWeight2, sub(w52[0], w52[1]))) }; \
660 const T_VEC w58[2] = { add(w43[0], w51[1]), sub(w43[1], w51[0]) }; \
661 const T_VEC w59[2] = { sub(w43[0], w51[1]), add(w43[1], w51[0]) }; \
662 const T_VEC w60[2] = { sub(w45[0], mul(kWeight2, sub(w53[0], w53[1]))), \
663 sub(w45[1], mul(kWeight2, add(w53[1], w53[0]))) }; \
664 const T_VEC w61[2] = { add(w45[0], mul(kWeight2, sub(w53[0], w53[1]))), \
675 sub(w33[0], sub(mul(kWeight4, w57[0]), mul(kWeight3, w57[1])))); \
677 sub(w35[0], mul(kWeight2, sub(w59[0], w59[1])))); \
679 sub(w37[0], sub(mul(kWeight3, w61[0]), mul(kWeight4, w61[1])))); \
680 store(output + 8 * stride, sub(w30[0], w54[0])); \
682 add(w32[0], sub(sub(kWeight0, mul(kWeight3, w56[0])), \
685 add(w34[0], sub(sub(kWeight0, mul(kWeight2, w58[0])), \
688 add(w36[0], sub(sub(kWeight0, mul(kWeight4, w60[0])), \
690 store(output + 12 * stride, sub(w31[0], w55[1])); \
692 add(w33[0], sub(mul(kWeight4, w57[0]), mul(kWeight3, w57[1])))); \
694 add(w35[0], mul(kWeight2, sub(w59[0], w59[1])))); \
696 add(w37[0], sub(mul(kWeight3, w61[0]), mul(kWeight4, w61[1])))); \
698 #define GEN_IFFT_32(ret, suffix, T, T_VEC, load, store, constant, add, sub, \ argument
742 const T_VEC w31 = sub(i0, i16); \
743 const T_VEC w32[2] = { add(i8, i8), sub(i24, i24) }; \
744 const T_VEC w33[2] = { sub(i8, i8), sub(sub(kWeight0, i24), i24) }; \
746 const T_VEC w35[2] = { sub(w30, w32[0]), sub(kWeight0, w32[1]) }; \
747 const T_VEC w36[2] = { add(w31, w33[1]), sub(kWeight0, w33[0]) }; \
748 const T_VEC w37[2] = { sub(w31, w33[1]), w33[0] }; \
749 const T_VEC w38[2] = { add(i4, i12), sub(i28, i20) }; \
750 const T_VEC w39[2] = { sub(i4, i12), sub(sub(kWeight0, i20), i28) }; \
751 const T_VEC w40[2] = { add(i12, i4), sub(i20, i28) }; \
752 const T_VEC w41[2] = { sub(i12, i4), sub(sub(kWeight0, i28), i20) }; \
754 const T_VEC w43[2] = { sub(w38[0], w40[0]), sub(w38[1], w40[1]) }; \
755 const T_VEC w44[2] = { add(w39[0], w41[1]), sub(w39[1], w41[0]) }; \
756 const T_VEC w45[2] = { sub(w39[0], w41[1]), add(w39[1], w41[0]) }; \
758 const T_VEC w47[2] = { sub(w34[0], w42[0]), sub(w34[1], w42[1]) }; \
760 add(w36[1], mul(kWeight2, sub(w44[1], w44[0]))) }; \
762 sub(sub(kWeight0, mul(kWeight2, w44[0])), \
764 add(w36[1], mul(kWeight2, sub(w44[0], w44[1]))) }; \
765 const T_VEC w50[2] = { add(w35[0], w43[1]), sub(w35[1], w43[0]) }; \
766 const T_VEC w51[2] = { sub(w35[0], w43[1]), add(w35[1], w43[0]) }; \
767 const T_VEC w52[2] = { sub(w37[0], mul(kWeight2, sub(w45[0], w45[1]))), \
768 sub(w37[1], mul(kWeight2, add(w45[1], w45[0]))) }; \
769 const T_VEC w53[2] = { add(w37[0], mul(kWeight2, sub(w45[0], w45[1]))), \
771 const T_VEC w54[2] = { add(i2, i14), sub(i30, i18) }; \
772 const T_VEC w55[2] = { sub(i2, i14), sub(sub(kWeight0, i18), i30) }; \
773 const T_VEC w56[2] = { add(i10, i6), sub(i22, i26) }; \
774 const T_VEC w57[2] = { sub(i10, i6), sub(sub(kWeight0, i26), i22) }; \
776 const T_VEC w59[2] = { sub(w54[0], w56[0]), sub(w54[1], w56[1]) }; \
777 const T_VEC w60[2] = { add(w55[0], w57[1]), sub(w55[1], w57[0]) }; \
778 const T_VEC w61[2] = { sub(w55[0], w57[1]), add(w55[1], w57[0]) }; \
779 const T_VEC w62[2] = { add(i6, i10), sub(i26, i22) }; \
780 const T_VEC w63[2] = { sub(i6, i10), sub(sub(kWeight0, i22), i26) }; \
781 const T_VEC w64[2] = { add(i14, i2), sub(i18, i30) }; \
782 const T_VEC w65[2] = { sub(i14, i2), sub(sub(kWeight0, i30), i18) }; \
784 const T_VEC w67[2] = { sub(w62[0], w64[0]), sub(w62[1], w64[1]) }; \
785 const T_VEC w68[2] = { add(w63[0], w65[1]), sub(w63[1], w65[0]) }; \
786 const T_VEC w69[2] = { sub(w63[0], w65[1]), add(w63[1], w65[0]) }; \
788 const T_VEC w71[2] = { sub(w58[0], w66[0]), sub(w58[1], w66[1]) }; \
790 add(w60[1], mul(kWeight2, sub(w68[1], w68[0]))) }; \
792 sub(sub(kWeight0, mul(kWeight2, w68[0])), \
794 add(w60[1], mul(kWeight2, sub(w68[0], w68[1]))) }; \
795 const T_VEC w74[2] = { add(w59[0], w67[1]), sub(w59[1], w67[0]) }; \
796 const T_VEC w75[2] = { sub(w59[0], w67[1]), add(w59[1], w67[0]) }; \
797 const T_VEC w76[2] = { sub(w61[0], mul(kWeight2, sub(w69[0], w69[1]))), \
798 sub(w61[1], mul(kWeight2, add(w69[1], w69[0]))) }; \
799 const T_VEC w77[2] = { add(w61[0], mul(kWeight2, sub(w69[0], w69[1]))), \
802 const T_VEC w79[2] = { sub(w46[0], w70[0]), sub(w46[1], w70[1]) }; \
805 add(w48[1], sub(mul(kWeight3, w72[1]), mul(kWeight4, w72[0]))) \
809 sub(sub(kWeight0, mul(kWeight3, w72[0])), mul(kWeight4, w72[1]))), \
810 add(w48[1], sub(mul(kWeight4, w72[0]), mul(kWeight3, w72[1]))) \
813 add(w50[1], mul(kWeight2, sub(w74[1], w74[0]))) }; \
815 sub(sub(kWeight0, mul(kWeight2, w74[0])), \
817 add(w50[1], mul(kWeight2, sub(w74[0], w74[1]))) }; \
820 add(w52[1], sub(mul(kWeight4, w76[1]), mul(kWeight3, w76[0]))) \
824 sub(sub(kWeight0, mul(kWeight4, w76[0])), mul(kWeight3, w76[1]))), \
825 add(w52[1], sub(mul(kWeight3, w76[0]), mul(kWeight4, w76[1]))) \
827 const T_VEC w86[2] = { add(w47[0], w71[1]), sub(w47[1], w71[0]) }; \
828 const T_VEC w87[2] = { sub(w47[0], w71[1]), add(w47[1], w71[0]) }; \
830 sub(w49[0], sub(mul(kWeight4, w73[0]), mul(kWeight3, w73[1]))), \
832 sub(sub(kWeight0, mul(kWeight4, w73[1])), mul(kWeight3, w73[0]))) \
835 add(w49[0], sub(mul(kWeight4, w73[0]), mul(kWeight3, w73[1]))), \
838 const T_VEC w90[2] = { sub(w51[0], mul(kWeight2, sub(w75[0], w75[1]))), \
839 sub(w51[1], mul(kWeight2, add(w75[1], w75[0]))) }; \
840 const T_VEC w91[2] = { add(w51[0], mul(kWeight2, sub(w75[0], w75[1]))), \
843 sub(w53[0], sub(mul(kWeight3, w77[0]), mul(kWeight4, w77[1]))), \
845 sub(sub(kWeight0, mul(kWeight3, w77[1])), mul(kWeight4, w77[0]))) \
848 add(w53[0], sub(mul(kWeight3, w77[0]), mul(kWeight4, w77[1]))), \
851 const T_VEC w94[2] = { add(i1, i15), sub(i31, i17) }; \
852 const T_VEC w95[2] = { sub(i1, i15), sub(sub(kWeight0, i17), i31) }; \
853 const T_VEC w96[2] = { add(i9, i7), sub(i23, i25) }; \
854 const T_VEC w97[2] = { sub(i9, i7), sub(sub(kWeight0, i25), i23) }; \
856 const T_VEC w99[2] = { sub(w94[0], w96[0]), sub(w94[1], w96[1]) }; \
857 const T_VEC w100[2] = { add(w95[0], w97[1]), sub(w95[1], w97[0]) }; \
858 const T_VEC w101[2] = { sub(w95[0], w97[1]), add(w95[1], w97[0]) }; \
859 const T_VEC w102[2] = { add(i5, i11), sub(i27, i21) }; \
860 const T_VEC w103[2] = { sub(i5, i11), sub(sub(kWeight0, i21), i27) }; \
861 const T_VEC w104[2] = { add(i13, i3), sub(i19, i29) }; \
862 const T_VEC w105[2] = { sub(i13, i3), sub(sub(kWeight0, i29), i19) }; \
864 const T_VEC w107[2] = { sub(w102[0], w104[0]), sub(w102[1], w104[1]) }; \
865 const T_VEC w108[2] = { add(w103[0], w105[1]), sub(w103[1], w105[0]) }; \
866 const T_VEC w109[2] = { sub(w103[0], w105[1]), add(w103[1], w105[0]) }; \
868 const T_VEC w111[2] = { sub(w98[0], w106[0]), sub(w98[1], w106[1]) }; \
871 add(w100[1], mul(kWeight2, sub(w108[1], w108[0]))) \
875 sub(sub(kWeight0, mul(kWeight2, w108[0])), mul(kWeight2, w108[1]))), \
876 add(w100[1], mul(kWeight2, sub(w108[0], w108[1]))) \
878 const T_VEC w114[2] = { add(w99[0], w107[1]), sub(w99[1], w107[0]) }; \
879 const T_VEC w115[2] = { sub(w99[0], w107[1]), add(w99[1], w107[0]) }; \
881 sub(w101[0], mul(kWeight2, sub(w109[0], w109[1]))), \
882 sub(w101[1], mul(kWeight2, add(w109[1], w109[0]))) \
885 add(w101[0], mul(kWeight2, sub(w109[0], w109[1]))), \
888 const T_VEC w118[2] = { add(i3, i13), sub(i29, i19) }; \
889 const T_VEC w119[2] = { sub(i3, i13), sub(sub(kWeight0, i19), i29) }; \
890 const T_VEC w120[2] = { add(i11, i5), sub(i21, i27) }; \
891 const T_VEC w121[2] = { sub(i11, i5), sub(sub(kWeight0, i27), i21) }; \
893 const T_VEC w123[2] = { sub(w118[0], w120[0]), sub(w118[1], w120[1]) }; \
894 const T_VEC w124[2] = { add(w119[0], w121[1]), sub(w119[1], w121[0]) }; \
895 const T_VEC w125[2] = { sub(w119[0], w121[1]), add(w119[1], w121[0]) }; \
896 const T_VEC w126[2] = { add(i7, i9), sub(i25, i23) }; \
897 const T_VEC w127[2] = { sub(i7, i9), sub(sub(kWeight0, i23), i25) }; \
898 const T_VEC w128[2] = { add(i15, i1), sub(i17, i31) }; \
899 const T_VEC w129[2] = { sub(i15, i1), sub(sub(kWeight0, i31), i17) }; \
901 const T_VEC w131[2] = { sub(w126[0], w128[0]), sub(w126[1], w128[1]) }; \
902 const T_VEC w132[2] = { add(w127[0], w129[1]), sub(w127[1], w129[0]) }; \
903 const T_VEC w133[2] = { sub(w127[0], w129[1]), add(w127[1], w129[0]) }; \
905 const T_VEC w135[2] = { sub(w122[0], w130[0]), sub(w122[1], w130[1]) }; \
908 add(w124[1], mul(kWeight2, sub(w132[1], w132[0]))) \
912 sub(sub(kWeight0, mul(kWeight2, w132[0])), mul(kWeight2, w132[1]))), \
913 add(w124[1], mul(kWeight2, sub(w132[0], w132[1]))) \
915 const T_VEC w138[2] = { add(w123[0], w131[1]), sub(w123[1], w131[0]) }; \
916 const T_VEC w139[2] = { sub(w123[0], w131[1]), add(w123[1], w131[0]) }; \
918 sub(w125[0], mul(kWeight2, sub(w133[0], w133[1]))), \
919 sub(w125[1], mul(kWeight2, add(w133[1], w133[0]))) \
922 add(w125[0], mul(kWeight2, sub(w133[0], w133[1]))), \
926 const T_VEC w143[2] = { sub(w110[0], w134[0]), sub(w110[1], w134[1]) }; \
929 add(w112[1], sub(mul(kWeight3, w136[1]), mul(kWeight4, w136[0]))) \
933 sub(sub(kWeight0, mul(kWeight3, w136[0])), mul(kWeight4, w136[1]))), \
934 add(w112[1], sub(mul(kWeight4, w136[0]), mul(kWeight3, w136[1]))) \
938 add(w114[1], mul(kWeight2, sub(w138[1], w138[0]))) \
942 sub(sub(kWeight0, mul(kWeight2, w138[0])), mul(kWeight2, w138[1]))), \
943 add(w114[1], mul(kWeight2, sub(w138[0], w138[1]))) \
947 add(w116[1], sub(mul(kWeight4, w140[1]), mul(kWeight3, w140[0]))) \
951 sub(sub(kWeight0, mul(kWeight4, w140[0])), mul(kWeight3, w140[1]))), \
952 add(w116[1], sub(mul(kWeight3, w140[0]), mul(kWeight4, w140[1]))) \
954 const T_VEC w150[2] = { add(w111[0], w135[1]), sub(w111[1], w135[0]) }; \
955 const T_VEC w151[2] = { sub(w111[0], w135[1]), add(w111[1], w135[0]) }; \
957 sub(w113[0], sub(mul(kWeight4, w137[0]), mul(kWeight3, w137[1]))), \
959 sub(sub(kWeight0, mul(kWeight4, w137[1])), mul(kWeight3, w137[0]))) \
962 add(w113[0], sub(mul(kWeight4, w137[0]), mul(kWeight3, w137[1]))), \
966 sub(w115[0], mul(kWeight2, sub(w139[0], w139[1]))), \
967 sub(w115[1], mul(kWeight2, add(w139[1], w139[0]))) \
970 add(w115[0], mul(kWeight2, sub(w139[0], w139[1]))), \
974 sub(w117[0], sub(mul(kWeight3, w141[0]), mul(kWeight4, w141[1]))), \
976 sub(sub(kWeight0, mul(kWeight3, w141[1])), mul(kWeight4, w141[0]))) \
979 add(w117[0], sub(mul(kWeight3, w141[0]), mul(kWeight4, w141[1]))), \
999 sub(w81[0], sub(mul(kWeight6, w145[0]), mul(kWeight5, w145[1])))); \
1001 sub(w83[0], sub(mul(kWeight4, w147[0]), mul(kWeight3, w147[1])))); \
1003 sub(w85[0], sub(mul(kWeight8, w149[0]), mul(kWeight7, w149[1])))); \
1005 sub(w87[0], mul(kWeight2, sub(w151[0], w151[1])))); \
1007 sub(w89[0], sub(mul(kWeight7, w153[0]), mul(kWeight8, w153[1])))); \
1009 sub(w91[0], sub(mul(kWeight3, w155[0]), mul(kWeight4, w155[1])))); \
1011 sub(w93[0], sub(mul(kWeight5, w157[0]), mul(kWeight6, w157[1])))); \
1012 store(output + 16 * stride, sub(w78[0], w142[0])); \
1014 add(w80[0], sub(sub(kWeight0, mul(kWeight5, w144[0])), \
1017 add(w82[0], sub(sub(kWeight0, mul(kWeight3, w146[0])), \
1020 add(w84[0], sub(sub(kWeight0, mul(kWeight7, w148[0])), \
1023 add(w86[0], sub(sub(kWeight0, mul(kWeight2, w150[0])), \
1026 add(w88[0], sub(sub(kWeight0, mul(kWeight8, w152[0])), \
1029 add(w90[0], sub(sub(kWeight0, mul(kWeight4, w154[0])), \
1032 add(w92[0], sub(sub(kWeight0, mul(kWeight6, w156[0])), \
1034 store(output + 24 * stride, sub(w79[0], w143[1])); \
1036 add(w81[0], sub(mul(kWeight6, w145[0]), mul(kWeight5, w145[1])))); \
1038 add(w83[0], sub(mul(kWeight4, w147[0]), mul(kWeight3, w147[1])))); \
1040 add(w85[0], sub(mul(kWeight8, w149[0]), mul(kWeight7, w149[1])))); \
1042 add(w87[0], mul(kWeight2, sub(w151[0], w151[1])))); \
1044 add(w89[0], sub(mul(kWeight7, w153[0]), mul(kWeight8, w153[1])))); \
1046 add(w91[0], sub(mul(kWeight3, w155[0]), mul(kWeight4, w155[1])))); \
1048 add(w93[0], sub(mul(kWeight5, w157[0]), mul(kWeight6, w157[1])))); \