xref: /aosp_15_r20/external/libaom/aom_dsp/x86/intrapred_avx2.c (revision 77c1e3ccc04c968bd2bc212e87364f250e820521)
/*
 * Copyright (c) 2017, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
11*77c1e3ccSAndroid Build Coastguard Worker 
12*77c1e3ccSAndroid Build Coastguard Worker #include <immintrin.h>
13*77c1e3ccSAndroid Build Coastguard Worker 
14*77c1e3ccSAndroid Build Coastguard Worker #include "config/av1_rtcd.h"
15*77c1e3ccSAndroid Build Coastguard Worker #include "aom_dsp/x86/intrapred_x86.h"
16*77c1e3ccSAndroid Build Coastguard Worker #include "aom_dsp/x86/intrapred_utils.h"
17*77c1e3ccSAndroid Build Coastguard Worker #include "aom_dsp/x86/lpf_common_sse2.h"
18*77c1e3ccSAndroid Build Coastguard Worker 
// Adds up the 64 bytes starting at |ref|.
//
// The total is left in the lowest 16-bit element of each 128-bit lane of the
// returned vector; the remaining elements hold intermediate values.
static inline __m256i dc_sum_64(const uint8_t *ref) {
  const __m256i zeros = _mm256_setzero_si256();
  const __m256i first32 = _mm256_loadu_si256((const __m256i *)ref);
  const __m256i second32 = _mm256_loadu_si256((const __m256i *)(ref + 32));

  // A SAD against zero reduces each group of 8 bytes to one 64-bit sum.
  __m256i acc = _mm256_add_epi64(_mm256_sad_epu8(first32, zeros),
                                 _mm256_sad_epu8(second32, zeros));

  // Add the lane-swapped copy so both 128-bit lanes carry the same sums.
  const __m256i swapped = _mm256_permute2x128_si256(acc, acc, 1);
  acc = _mm256_add_epi64(swapped, acc);

  // Fold the upper 64-bit half of each lane onto the lower half.
  const __m256i upper = _mm256_unpackhi_epi64(acc, acc);
  return _mm256_add_epi16(acc, upper);
}
31*77c1e3ccSAndroid Build Coastguard Worker 
// Adds up the 32 bytes starting at |ref|.
//
// The total is left in the lowest 16-bit element of each 128-bit lane of the
// returned vector; the remaining elements hold intermediate values.
static inline __m256i dc_sum_32(const uint8_t *ref) {
  const __m256i pixels = _mm256_loadu_si256((const __m256i *)ref);

  // A SAD against zero reduces each group of 8 bytes to one 64-bit sum.
  __m256i acc = _mm256_sad_epu8(pixels, _mm256_setzero_si256());

  // Add the lane-swapped copy so both 128-bit lanes carry the same sums.
  const __m256i swapped = _mm256_permute2x128_si256(acc, acc, 1);
  acc = _mm256_add_epi64(swapped, acc);

  // Fold the upper 64-bit half of each lane onto the lower half.
  const __m256i upper = _mm256_unpackhi_epi64(acc, acc);
  return _mm256_add_epi16(acc, upper);
}
41*77c1e3ccSAndroid Build Coastguard Worker 
// Writes the 32-byte vector |*r| to |height| consecutive rows of |dst|,
// advancing by |stride| bytes between rows. Stores are unaligned.
static inline void row_store_32xh(const __m256i *r, int height, uint8_t *dst,
                                  ptrdiff_t stride) {
  uint8_t *out = dst;
  int rows_left = height;
  while (rows_left-- > 0) {
    _mm256_storeu_si256((__m256i *)out, *r);
    out += stride;
  }
}
49*77c1e3ccSAndroid Build Coastguard Worker 
// Writes a 64-byte row made of |*r0| (bytes 0..31) followed by |*r1|
// (bytes 32..63) to |height| consecutive rows of |dst|, advancing by |stride|
// bytes between rows. Stores are unaligned.
static inline void row_store_32x2xh(const __m256i *r0, const __m256i *r1,
                                    int height, uint8_t *dst,
                                    ptrdiff_t stride) {
  uint8_t *out = dst;
  int rows_left = height;
  while (rows_left-- > 0) {
    _mm256_storeu_si256((__m256i *)out, *r0);
    _mm256_storeu_si256((__m256i *)(out + 32), *r1);
    out += stride;
  }
}
59*77c1e3ccSAndroid Build Coastguard Worker 
// Writes a 64-byte row consisting of |*r| twice (bytes 0..31 and 32..63) to
// |height| consecutive rows of |dst|, advancing by |stride| bytes between
// rows. Stores are unaligned.
static inline void row_store_64xh(const __m256i *r, int height, uint8_t *dst,
                                  ptrdiff_t stride) {
  uint8_t *out = dst;
  int rows_left = height;
  while (rows_left-- > 0) {
    _mm256_storeu_si256((__m256i *)out, *r);
    _mm256_storeu_si256((__m256i *)(out + 32), *r);
    out += stride;
  }
}
68*77c1e3ccSAndroid Build Coastguard Worker 
69*77c1e3ccSAndroid Build Coastguard Worker #if CONFIG_AV1_HIGHBITDEPTH
70*77c1e3ccSAndroid Build Coastguard Worker static DECLARE_ALIGNED(16, uint8_t, HighbdLoadMaskx[8][16]) = {
71*77c1e3ccSAndroid Build Coastguard Worker   { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
72*77c1e3ccSAndroid Build Coastguard Worker   { 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 },
73*77c1e3ccSAndroid Build Coastguard Worker   { 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 },
74*77c1e3ccSAndroid Build Coastguard Worker   { 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 },
75*77c1e3ccSAndroid Build Coastguard Worker   { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7 },
76*77c1e3ccSAndroid Build Coastguard Worker   { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5 },
77*77c1e3ccSAndroid Build Coastguard Worker   { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3 },
78*77c1e3ccSAndroid Build Coastguard Worker   { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 },
79*77c1e3ccSAndroid Build Coastguard Worker };
80*77c1e3ccSAndroid Build Coastguard Worker 
81*77c1e3ccSAndroid Build Coastguard Worker static DECLARE_ALIGNED(16, uint8_t, HighbdEvenOddMaskx4[4][16]) = {
82*77c1e3ccSAndroid Build Coastguard Worker   { 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 },
83*77c1e3ccSAndroid Build Coastguard Worker   { 0, 1, 2, 3, 6, 7, 10, 11, 14, 15, 4, 5, 8, 9, 12, 13 },
84*77c1e3ccSAndroid Build Coastguard Worker   { 0, 1, 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 6, 7, 10, 11 },
85*77c1e3ccSAndroid Build Coastguard Worker   { 0, 1, 0, 1, 0, 1, 6, 7, 10, 11, 14, 15, 0, 1, 8, 9 }
86*77c1e3ccSAndroid Build Coastguard Worker };
87*77c1e3ccSAndroid Build Coastguard Worker 
88*77c1e3ccSAndroid Build Coastguard Worker static DECLARE_ALIGNED(16, uint8_t, HighbdEvenOddMaskx[8][32]) = {
89*77c1e3ccSAndroid Build Coastguard Worker   { 0, 1, 4, 5, 8,  9,  12, 13, 16, 17, 20, 21, 24, 25, 28, 29,
90*77c1e3ccSAndroid Build Coastguard Worker     2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 },
91*77c1e3ccSAndroid Build Coastguard Worker   { 0, 1, 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27,
92*77c1e3ccSAndroid Build Coastguard Worker     0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 },
93*77c1e3ccSAndroid Build Coastguard Worker   { 0, 1, 0, 1, 4, 5, 8,  9,  12, 13, 16, 17, 20, 21, 24, 25,
94*77c1e3ccSAndroid Build Coastguard Worker     0, 1, 0, 1, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27 },
95*77c1e3ccSAndroid Build Coastguard Worker   { 0, 1, 0, 1, 0, 1, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23,
96*77c1e3ccSAndroid Build Coastguard Worker     0, 1, 0, 1, 0, 1, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25 },
97*77c1e3ccSAndroid Build Coastguard Worker   { 0, 1, 0, 1, 0, 1, 0, 1, 8,  9,  12, 13, 16, 17, 20, 21,
98*77c1e3ccSAndroid Build Coastguard Worker     0, 1, 0, 1, 0, 1, 0, 1, 10, 11, 14, 15, 18, 19, 22, 23 },
99*77c1e3ccSAndroid Build Coastguard Worker   { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 10, 11, 14, 15, 18, 19,
100*77c1e3ccSAndroid Build Coastguard Worker     0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 12, 13, 16, 17, 20, 21 },
101*77c1e3ccSAndroid Build Coastguard Worker   { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 12, 13, 16, 17,
102*77c1e3ccSAndroid Build Coastguard Worker     0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 14, 15, 18, 19 },
103*77c1e3ccSAndroid Build Coastguard Worker   { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 14, 15,
104*77c1e3ccSAndroid Build Coastguard Worker     0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 16, 17 }
105*77c1e3ccSAndroid Build Coastguard Worker };
106*77c1e3ccSAndroid Build Coastguard Worker 
107*77c1e3ccSAndroid Build Coastguard Worker static DECLARE_ALIGNED(32, uint16_t, HighbdBaseMask[17][16]) = {
108*77c1e3ccSAndroid Build Coastguard Worker   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
109*77c1e3ccSAndroid Build Coastguard Worker   { 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
110*77c1e3ccSAndroid Build Coastguard Worker   { 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
111*77c1e3ccSAndroid Build Coastguard Worker   { 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
112*77c1e3ccSAndroid Build Coastguard Worker   { 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
113*77c1e3ccSAndroid Build Coastguard Worker   { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
114*77c1e3ccSAndroid Build Coastguard Worker   { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0,
115*77c1e3ccSAndroid Build Coastguard Worker     0 },
116*77c1e3ccSAndroid Build Coastguard Worker   { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0,
117*77c1e3ccSAndroid Build Coastguard Worker     0, 0 },
118*77c1e3ccSAndroid Build Coastguard Worker   { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0,
119*77c1e3ccSAndroid Build Coastguard Worker     0, 0, 0, 0 },
120*77c1e3ccSAndroid Build Coastguard Worker   { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0,
121*77c1e3ccSAndroid Build Coastguard Worker     0, 0, 0, 0, 0, 0 },
122*77c1e3ccSAndroid Build Coastguard Worker   { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
123*77c1e3ccSAndroid Build Coastguard Worker     0xffff, 0, 0, 0, 0, 0, 0 },
124*77c1e3ccSAndroid Build Coastguard Worker   { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
125*77c1e3ccSAndroid Build Coastguard Worker     0xffff, 0xffff, 0, 0, 0, 0, 0 },
126*77c1e3ccSAndroid Build Coastguard Worker   { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
127*77c1e3ccSAndroid Build Coastguard Worker     0xffff, 0xffff, 0xffff, 0, 0, 0, 0 },
128*77c1e3ccSAndroid Build Coastguard Worker   { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
129*77c1e3ccSAndroid Build Coastguard Worker     0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0 },
130*77c1e3ccSAndroid Build Coastguard Worker   { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
131*77c1e3ccSAndroid Build Coastguard Worker     0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0 },
132*77c1e3ccSAndroid Build Coastguard Worker   { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
133*77c1e3ccSAndroid Build Coastguard Worker     0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0 },
134*77c1e3ccSAndroid Build Coastguard Worker   { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
135*77c1e3ccSAndroid Build Coastguard Worker     0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff }
136*77c1e3ccSAndroid Build Coastguard Worker };
137*77c1e3ccSAndroid Build Coastguard Worker 
138*77c1e3ccSAndroid Build Coastguard Worker #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
// Transposes a 16-row by 4-column block of 16-bit elements.
//
// x: 16 input vectors; only the low four 16-bit elements of each are used.
// d: 8 output vectors. d[2 * c] holds column c of input rows 0..7 and
//    d[2 * c + 1] holds column c of input rows 8..15, for c = 0..3.
//
// All of x is read before anything is written to d.
static inline void highbd_transpose16x4_8x8_sse2(__m128i *x, __m128i *d) {
  __m128i pairs[8];  // rows (2k, 2k+1) interleaved element by element
  __m128i quads[8];  // four-row groups: even index = columns 0/1, odd = 2/3
  __m128i cols[8];   // same groups with columns reordered as (0,2) and (1,3)
  int k;

  for (k = 0; k < 8; ++k) {
    pairs[k] = _mm_unpacklo_epi16(x[2 * k], x[2 * k + 1]);
  }

  for (k = 0; k < 8; k += 2) {
    quads[k] = _mm_unpacklo_epi32(pairs[k], pairs[k + 1]);
    quads[k + 1] = _mm_unpackhi_epi32(pairs[k], pairs[k + 1]);
  }

  for (k = 0; k < 8; k += 2) {
    cols[k] = _mm_unpacklo_epi64(quads[k], quads[k + 1]);
    cols[k + 1] = _mm_unpackhi_epi64(quads[k], quads[k + 1]);
  }

  // cols[0..3] cover input rows 0..7 and cols[4..7] cover rows 8..15. Within
  // each, the low halves carry columns 0 and 1, the high halves columns 2, 3.
  for (k = 0; k < 2; ++k) {
    d[2 * k] = _mm_unpacklo_epi64(cols[k], cols[k + 2]);
    d[2 * k + 1] = _mm_unpacklo_epi64(cols[k + 4], cols[k + 6]);
    d[2 * k + 4] = _mm_unpackhi_epi64(cols[k], cols[k + 2]);
    d[2 * k + 5] = _mm_unpackhi_epi64(cols[k + 4], cols[k + 6]);
  }
}
182*77c1e3ccSAndroid Build Coastguard Worker 
// Rearranges four rows of sixteen 16-bit elements, working independently on
// each 128-bit lane (eight elements per row per lane).
//
// In the lane diagrams below "rc" means row r, column c within the lane.
// For k = 0..3, d[k] holds, per lane, column k of rows 0..3 followed by
// column k + 4 of rows 0..3.
//
// All of x is read before anything is written to d.
static inline void highbd_transpose4x16_avx2(__m256i *x, __m256i *d) {
  // Interleave row pairs at 16-bit granularity.
  // rows01_lo: 00 10 01 11 02 12 03 13
  // rows23_lo: 20 30 21 31 22 32 23 33
  // rows01_hi: 04 14 05 15 06 16 07 17
  // rows23_hi: 24 34 25 35 26 36 27 37
  const __m256i rows01_lo = _mm256_unpacklo_epi16(x[0], x[1]);
  const __m256i rows23_lo = _mm256_unpacklo_epi16(x[2], x[3]);
  const __m256i rows01_hi = _mm256_unpackhi_epi16(x[0], x[1]);
  const __m256i rows23_hi = _mm256_unpackhi_epi16(x[2], x[3]);

  // Combine into groups of four rows per column.
  // cols01: 00 10 20 30 01 11 21 31
  // cols45: 04 14 24 34 05 15 25 35
  // cols23: 02 12 22 32 03 13 23 33
  // cols67: 06 16 26 36 07 17 27 37
  const __m256i cols01 = _mm256_unpacklo_epi32(rows01_lo, rows23_lo);
  const __m256i cols45 = _mm256_unpacklo_epi32(rows01_hi, rows23_hi);
  const __m256i cols23 = _mm256_unpackhi_epi32(rows01_lo, rows23_lo);
  const __m256i cols67 = _mm256_unpackhi_epi32(rows01_hi, rows23_hi);

  d[0] = _mm256_unpacklo_epi64(cols01, cols45);  // 00 10 20 30 04 14 24 34
  d[1] = _mm256_unpackhi_epi64(cols01, cols45);  // 01 11 21 31 05 15 25 35
  d[2] = _mm256_unpacklo_epi64(cols23, cols67);  // 02 12 22 32 06 16 26 36
  d[3] = _mm256_unpackhi_epi64(cols23, cols67);  // 03 13 23 33 07 17 27 37
}
203*77c1e3ccSAndroid Build Coastguard Worker #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
204*77c1e3ccSAndroid Build Coastguard Worker 
// Transposes eight rows of sixteen 16-bit elements as two independent 8x8
// blocks, one per 128-bit lane.
//
// In the lane diagrams below "rc" means row r, column c within the lane.
// For k = 0..7, d[k] holds, per lane, column k of rows 0..7.
//
// d[0..3] are written before x is read for the second half of the work.
static inline void highbd_transpose8x16_16x8_avx2(__m256i *x, __m256i *d) {
  // Columns 0..3: built from the low half of every row.
  {
    const __m256i r01 = _mm256_unpacklo_epi16(x[0], x[1]);  // 00 10 01 11 ...
    const __m256i r23 = _mm256_unpacklo_epi16(x[2], x[3]);  // 20 30 21 31 ...
    const __m256i r45 = _mm256_unpacklo_epi16(x[4], x[5]);  // 40 50 41 51 ...
    const __m256i r67 = _mm256_unpacklo_epi16(x[6], x[7]);  // 60 70 61 71 ...

    // top: 00 10 20 30 01 11 21 31, bottom: 40 50 60 70 41 51 61 71
    __m256i top = _mm256_unpacklo_epi32(r01, r23);
    __m256i bottom = _mm256_unpacklo_epi32(r45, r67);
    d[0] = _mm256_unpacklo_epi64(top, bottom);  // 00 10 20 30 40 50 60 70
    d[1] = _mm256_unpackhi_epi64(top, bottom);  // 01 11 21 31 41 51 61 71

    // top: 02 12 22 32 03 13 23 33, bottom: 42 52 62 72 43 53 63 73
    top = _mm256_unpackhi_epi32(r01, r23);
    bottom = _mm256_unpackhi_epi32(r45, r67);
    d[2] = _mm256_unpacklo_epi64(top, bottom);  // 02 12 22 32 42 52 62 72
    d[3] = _mm256_unpackhi_epi64(top, bottom);  // 03 13 23 33 43 53 63 73
  }

  // Columns 4..7: built from the high half of every row.
  {
    const __m256i r01 = _mm256_unpackhi_epi16(x[0], x[1]);  // 04 14 05 15 ...
    const __m256i r23 = _mm256_unpackhi_epi16(x[2], x[3]);  // 24 34 25 35 ...
    const __m256i r45 = _mm256_unpackhi_epi16(x[4], x[5]);  // 44 54 45 55 ...
    const __m256i r67 = _mm256_unpackhi_epi16(x[6], x[7]);  // 64 74 65 75 ...

    // top: 04 14 24 34 05 15 25 35, bottom: 44 54 64 74 45 55 65 75
    __m256i top = _mm256_unpacklo_epi32(r01, r23);
    __m256i bottom = _mm256_unpacklo_epi32(r45, r67);
    d[4] = _mm256_unpacklo_epi64(top, bottom);  // 04 14 24 34 44 54 64 74
    d[5] = _mm256_unpackhi_epi64(top, bottom);  // 05 15 25 35 45 55 65 75

    // top: 06 16 26 36 07 17 27 37, bottom: 46 56 66 76 47 57 67 77
    top = _mm256_unpackhi_epi32(r01, r23);
    bottom = _mm256_unpackhi_epi32(r45, r67);
    d[6] = _mm256_unpacklo_epi64(top, bottom);  // 06 16 26 36 46 56 66 76
    d[7] = _mm256_unpackhi_epi64(top, bottom);  // 07 17 27 37 47 57 67 77
  }
}
242*77c1e3ccSAndroid Build Coastguard Worker 
// Transposes a 16x16 block of 16-bit elements: element c of output vector
// d[r] equals element r of input vector x[c].
//
// The work is done in two stages: first each group of eight rows is
// transposed per 128-bit lane into a temporary array, then the 128-bit lanes
// of matching temporaries are recombined. All of x is read before anything is
// written to d.
static inline void highbd_transpose16x16_avx2(__m256i *x, __m256i *d) {
  __m256i tmp[16];

  // Stage 1: per-lane 8x8 transposes of rows 0..7 (half 0) and 8..15 (half 1).
  for (int half = 0; half < 2; ++half) {
    const __m256i *rows = x + 8 * half;
    __m256i *out = tmp + 8 * half;
    __m256i p01, p23, p45, p67, top, bottom;

    // Lane columns 0..3 come from the low half of every row.
    p01 = _mm256_unpacklo_epi16(rows[0], rows[1]);
    p23 = _mm256_unpacklo_epi16(rows[2], rows[3]);
    p45 = _mm256_unpacklo_epi16(rows[4], rows[5]);
    p67 = _mm256_unpacklo_epi16(rows[6], rows[7]);

    top = _mm256_unpacklo_epi32(p01, p23);
    bottom = _mm256_unpacklo_epi32(p45, p67);
    out[0] = _mm256_unpacklo_epi64(top, bottom);
    out[1] = _mm256_unpackhi_epi64(top, bottom);

    top = _mm256_unpackhi_epi32(p01, p23);
    bottom = _mm256_unpackhi_epi32(p45, p67);
    out[2] = _mm256_unpacklo_epi64(top, bottom);
    out[3] = _mm256_unpackhi_epi64(top, bottom);

    // Lane columns 4..7 come from the high half of every row.
    p01 = _mm256_unpackhi_epi16(rows[0], rows[1]);
    p23 = _mm256_unpackhi_epi16(rows[2], rows[3]);
    p45 = _mm256_unpackhi_epi16(rows[4], rows[5]);
    p67 = _mm256_unpackhi_epi16(rows[6], rows[7]);

    top = _mm256_unpacklo_epi32(p01, p23);
    bottom = _mm256_unpacklo_epi32(p45, p67);
    out[4] = _mm256_unpacklo_epi64(top, bottom);
    out[5] = _mm256_unpackhi_epi64(top, bottom);

    top = _mm256_unpackhi_epi32(p01, p23);
    bottom = _mm256_unpackhi_epi32(p45, p67);
    out[6] = _mm256_unpacklo_epi64(top, bottom);
    out[7] = _mm256_unpackhi_epi64(top, bottom);
  }

  // Stage 2: output i takes the low lanes of tmp[i] and tmp[i + 8]; output
  // i + 8 takes their high lanes.
  for (int i = 0; i < 8; ++i) {
    const __m256i upper_rows = tmp[i];
    const __m256i lower_rows = tmp[i + 8];
    d[i] = _mm256_permute2x128_si256(upper_rows, lower_rows, 0x20);
    d[i + 8] = _mm256_permute2x128_si256(upper_rows, lower_rows, 0x31);
  }
}
320*77c1e3ccSAndroid Build Coastguard Worker #endif  // CONFIG_AV1_HIGHBITDEPTH
321*77c1e3ccSAndroid Build Coastguard Worker 
aom_dc_predictor_32x32_avx2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)322*77c1e3ccSAndroid Build Coastguard Worker void aom_dc_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
323*77c1e3ccSAndroid Build Coastguard Worker                                  const uint8_t *above, const uint8_t *left) {
324*77c1e3ccSAndroid Build Coastguard Worker   const __m256i sum_above = dc_sum_32(above);
325*77c1e3ccSAndroid Build Coastguard Worker   __m256i sum_left = dc_sum_32(left);
326*77c1e3ccSAndroid Build Coastguard Worker   sum_left = _mm256_add_epi16(sum_left, sum_above);
327*77c1e3ccSAndroid Build Coastguard Worker   const __m256i thirtytwo = _mm256_set1_epi16(32);
328*77c1e3ccSAndroid Build Coastguard Worker   sum_left = _mm256_add_epi16(sum_left, thirtytwo);
329*77c1e3ccSAndroid Build Coastguard Worker   sum_left = _mm256_srai_epi16(sum_left, 6);
330*77c1e3ccSAndroid Build Coastguard Worker   const __m256i zero = _mm256_setzero_si256();
331*77c1e3ccSAndroid Build Coastguard Worker   __m256i row = _mm256_shuffle_epi8(sum_left, zero);
332*77c1e3ccSAndroid Build Coastguard Worker   row_store_32xh(&row, 32, dst, stride);
333*77c1e3ccSAndroid Build Coastguard Worker }
334*77c1e3ccSAndroid Build Coastguard Worker 
aom_dc_top_predictor_32x32_avx2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)335*77c1e3ccSAndroid Build Coastguard Worker void aom_dc_top_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
336*77c1e3ccSAndroid Build Coastguard Worker                                      const uint8_t *above,
337*77c1e3ccSAndroid Build Coastguard Worker                                      const uint8_t *left) {
338*77c1e3ccSAndroid Build Coastguard Worker   __m256i sum = dc_sum_32(above);
339*77c1e3ccSAndroid Build Coastguard Worker   (void)left;
340*77c1e3ccSAndroid Build Coastguard Worker 
341*77c1e3ccSAndroid Build Coastguard Worker   const __m256i sixteen = _mm256_set1_epi16(16);
342*77c1e3ccSAndroid Build Coastguard Worker   sum = _mm256_add_epi16(sum, sixteen);
343*77c1e3ccSAndroid Build Coastguard Worker   sum = _mm256_srai_epi16(sum, 5);
344*77c1e3ccSAndroid Build Coastguard Worker   const __m256i zero = _mm256_setzero_si256();
345*77c1e3ccSAndroid Build Coastguard Worker   __m256i row = _mm256_shuffle_epi8(sum, zero);
346*77c1e3ccSAndroid Build Coastguard Worker   row_store_32xh(&row, 32, dst, stride);
347*77c1e3ccSAndroid Build Coastguard Worker }
348*77c1e3ccSAndroid Build Coastguard Worker 
aom_dc_left_predictor_32x32_avx2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)349*77c1e3ccSAndroid Build Coastguard Worker void aom_dc_left_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
350*77c1e3ccSAndroid Build Coastguard Worker                                       const uint8_t *above,
351*77c1e3ccSAndroid Build Coastguard Worker                                       const uint8_t *left) {
352*77c1e3ccSAndroid Build Coastguard Worker   __m256i sum = dc_sum_32(left);
353*77c1e3ccSAndroid Build Coastguard Worker   (void)above;
354*77c1e3ccSAndroid Build Coastguard Worker 
355*77c1e3ccSAndroid Build Coastguard Worker   const __m256i sixteen = _mm256_set1_epi16(16);
356*77c1e3ccSAndroid Build Coastguard Worker   sum = _mm256_add_epi16(sum, sixteen);
357*77c1e3ccSAndroid Build Coastguard Worker   sum = _mm256_srai_epi16(sum, 5);
358*77c1e3ccSAndroid Build Coastguard Worker   const __m256i zero = _mm256_setzero_si256();
359*77c1e3ccSAndroid Build Coastguard Worker   __m256i row = _mm256_shuffle_epi8(sum, zero);
360*77c1e3ccSAndroid Build Coastguard Worker   row_store_32xh(&row, 32, dst, stride);
361*77c1e3ccSAndroid Build Coastguard Worker }
362*77c1e3ccSAndroid Build Coastguard Worker 
// Fills a 32x32 block with the constant byte 0x80 (128). Neither |above| nor
// |left| is read.
void aom_dc_128_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const __m256i mid_grey = _mm256_set1_epi8((int8_t)0x80);
  (void)left;
  (void)above;
  row_store_32xh(&mid_grey, 32, dst, stride);
}
371*77c1e3ccSAndroid Build Coastguard Worker 
// Vertical prediction for a 32x32 block: the 32 bytes at `above` are loaded
// once (unaligned) and passed to row_store_32xh for 32 rows. `left` is not
// read.
void aom_v_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  (void)left;
  const __m256i top_row = _mm256_loadu_si256((const __m256i *)above);
  row_store_32xh(&top_row, 32, dst, stride);
}
378*77c1e3ccSAndroid Build Coastguard Worker 
// Writes eight 32-byte rows per call: four consecutive rows starting at dst
// and the four rows located 16 strides below them. For a 32-row block the
// first call therefore covers rows 0,1,2,3 and 16,17,18,19, the next call
// (with dst advanced by four rows) covers 4,5,6,7 and 20,21,22,23, and four
// calls together cover all 32 rows.
//
// For each of the four iterations, byte 0, 4, 8, 12 of each 128-bit lane of
// *row is broadcast across that lane; the low lane supplies the upper row and
// the high lane supplies the row 16 strides further down.
static inline void h_predictor_32x8line(const __m256i *row, uint8_t *dst,
                                        ptrdiff_t stride) {
  const __m256i step = _mm256_set1_epi8(4);
  __m256i select = _mm256_setzero_si256();

  for (int r = 0; r < 4; ++r) {
    // Broadcast the currently selected byte within each 128-bit lane.
    const __m256i bcast = _mm256_shuffle_epi8(*row, select);

    // imm 0x00 duplicates the low lane, imm 0x11 duplicates the high lane.
    _mm256_storeu_si256((__m256i *)dst,
                        _mm256_permute2x128_si256(bcast, bcast, 0));
    _mm256_storeu_si256((__m256i *)(dst + (stride << 4)),
                        _mm256_permute2x128_si256(bcast, bcast, 0x11));

    select = _mm256_add_epi8(select, step);
    dst += stride;
  }
}
400*77c1e3ccSAndroid Build Coastguard Worker 
// Horizontal prediction for a 32x32 block: row r of the output is filled with
// left[r]. `above` is not read.
//
// The 32 left pixels are expanded with two rounds of byte unpacking so that
// each 128-bit lane holds four pixels, each repeated four times. Because the
// unpack intrinsics work per lane, the low lane carries pixels from left[0..15]
// and the high lane the matching pixels from left[16..31]; each
// h_predictor_32x8line call then emits four rows from each half.
void aom_h_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  (void)above;
  const __m256i left_px = _mm256_loadu_si256((__m256i const *)left);

  // Each pixel doubled: lanes hold left[0..7]/left[16..23] and
  // left[8..15]/left[24..31] respectively.
  const __m256i pairs_lo = _mm256_unpacklo_epi8(left_px, left_px);
  const __m256i pairs_hi = _mm256_unpackhi_epi8(left_px, left_px);

  // Each pixel quadrupled, in the order the row groups are written.
  const __m256i quads[4] = {
    _mm256_unpacklo_epi8(pairs_lo, pairs_lo),  // rows 0-3 and 16-19
    _mm256_unpackhi_epi8(pairs_lo, pairs_lo),  // rows 4-7 and 20-23
    _mm256_unpacklo_epi8(pairs_hi, pairs_hi),  // rows 8-11 and 24-27
    _mm256_unpackhi_epi8(pairs_hi, pairs_hi),  // rows 12-15 and 28-31
  };

  for (int g = 0; g < 4; ++g) {
    h_predictor_32x8line(&quads[g], dst, stride);
    dst += stride << 2;
  }
}
425*77c1e3ccSAndroid Build Coastguard Worker 
426*77c1e3ccSAndroid Build Coastguard Worker // -----------------------------------------------------------------------------
427*77c1e3ccSAndroid Build Coastguard Worker // Rectangle
aom_dc_predictor_32x16_avx2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)428*77c1e3ccSAndroid Build Coastguard Worker void aom_dc_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
429*77c1e3ccSAndroid Build Coastguard Worker                                  const uint8_t *above, const uint8_t *left) {
430*77c1e3ccSAndroid Build Coastguard Worker   const __m128i top_sum = dc_sum_32_sse2(above);
431*77c1e3ccSAndroid Build Coastguard Worker   __m128i left_sum = dc_sum_16_sse2(left);
432*77c1e3ccSAndroid Build Coastguard Worker   left_sum = _mm_add_epi16(top_sum, left_sum);
433*77c1e3ccSAndroid Build Coastguard Worker   uint16_t sum = (uint16_t)_mm_cvtsi128_si32(left_sum);
434*77c1e3ccSAndroid Build Coastguard Worker   sum += 24;
435*77c1e3ccSAndroid Build Coastguard Worker   sum /= 48;
436*77c1e3ccSAndroid Build Coastguard Worker   const __m256i row = _mm256_set1_epi8((int8_t)sum);
437*77c1e3ccSAndroid Build Coastguard Worker   row_store_32xh(&row, 16, dst, stride);
438*77c1e3ccSAndroid Build Coastguard Worker }
439*77c1e3ccSAndroid Build Coastguard Worker 
aom_dc_predictor_32x64_avx2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)440*77c1e3ccSAndroid Build Coastguard Worker void aom_dc_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
441*77c1e3ccSAndroid Build Coastguard Worker                                  const uint8_t *above, const uint8_t *left) {
442*77c1e3ccSAndroid Build Coastguard Worker   const __m256i sum_above = dc_sum_32(above);
443*77c1e3ccSAndroid Build Coastguard Worker   __m256i sum_left = dc_sum_64(left);
444*77c1e3ccSAndroid Build Coastguard Worker   sum_left = _mm256_add_epi16(sum_left, sum_above);
445*77c1e3ccSAndroid Build Coastguard Worker   uint16_t sum = (uint16_t)_mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left));
446*77c1e3ccSAndroid Build Coastguard Worker   sum += 48;
447*77c1e3ccSAndroid Build Coastguard Worker   sum /= 96;
448*77c1e3ccSAndroid Build Coastguard Worker   const __m256i row = _mm256_set1_epi8((int8_t)sum);
449*77c1e3ccSAndroid Build Coastguard Worker   row_store_32xh(&row, 64, dst, stride);
450*77c1e3ccSAndroid Build Coastguard Worker }
451*77c1e3ccSAndroid Build Coastguard Worker 
aom_dc_predictor_64x64_avx2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)452*77c1e3ccSAndroid Build Coastguard Worker void aom_dc_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
453*77c1e3ccSAndroid Build Coastguard Worker                                  const uint8_t *above, const uint8_t *left) {
454*77c1e3ccSAndroid Build Coastguard Worker   const __m256i sum_above = dc_sum_64(above);
455*77c1e3ccSAndroid Build Coastguard Worker   __m256i sum_left = dc_sum_64(left);
456*77c1e3ccSAndroid Build Coastguard Worker   sum_left = _mm256_add_epi16(sum_left, sum_above);
457*77c1e3ccSAndroid Build Coastguard Worker   uint16_t sum = (uint16_t)_mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left));
458*77c1e3ccSAndroid Build Coastguard Worker   sum += 64;
459*77c1e3ccSAndroid Build Coastguard Worker   sum /= 128;
460*77c1e3ccSAndroid Build Coastguard Worker   const __m256i row = _mm256_set1_epi8((int8_t)sum);
461*77c1e3ccSAndroid Build Coastguard Worker   row_store_64xh(&row, 64, dst, stride);
462*77c1e3ccSAndroid Build Coastguard Worker }
463*77c1e3ccSAndroid Build Coastguard Worker 
aom_dc_predictor_64x32_avx2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)464*77c1e3ccSAndroid Build Coastguard Worker void aom_dc_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
465*77c1e3ccSAndroid Build Coastguard Worker                                  const uint8_t *above, const uint8_t *left) {
466*77c1e3ccSAndroid Build Coastguard Worker   const __m256i sum_above = dc_sum_64(above);
467*77c1e3ccSAndroid Build Coastguard Worker   __m256i sum_left = dc_sum_32(left);
468*77c1e3ccSAndroid Build Coastguard Worker   sum_left = _mm256_add_epi16(sum_left, sum_above);
469*77c1e3ccSAndroid Build Coastguard Worker   uint16_t sum = (uint16_t)_mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left));
470*77c1e3ccSAndroid Build Coastguard Worker   sum += 48;
471*77c1e3ccSAndroid Build Coastguard Worker   sum /= 96;
472*77c1e3ccSAndroid Build Coastguard Worker   const __m256i row = _mm256_set1_epi8((int8_t)sum);
473*77c1e3ccSAndroid Build Coastguard Worker   row_store_64xh(&row, 32, dst, stride);
474*77c1e3ccSAndroid Build Coastguard Worker }
475*77c1e3ccSAndroid Build Coastguard Worker 
476*77c1e3ccSAndroid Build Coastguard Worker #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
aom_dc_predictor_64x16_avx2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)477*77c1e3ccSAndroid Build Coastguard Worker void aom_dc_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
478*77c1e3ccSAndroid Build Coastguard Worker                                  const uint8_t *above, const uint8_t *left) {
479*77c1e3ccSAndroid Build Coastguard Worker   const __m256i sum_above = dc_sum_64(above);
480*77c1e3ccSAndroid Build Coastguard Worker   __m256i sum_left = _mm256_castsi128_si256(dc_sum_16_sse2(left));
481*77c1e3ccSAndroid Build Coastguard Worker   sum_left = _mm256_add_epi16(sum_left, sum_above);
482*77c1e3ccSAndroid Build Coastguard Worker   uint16_t sum = (uint16_t)_mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left));
483*77c1e3ccSAndroid Build Coastguard Worker   sum += 40;
484*77c1e3ccSAndroid Build Coastguard Worker   sum /= 80;
485*77c1e3ccSAndroid Build Coastguard Worker   const __m256i row = _mm256_set1_epi8((int8_t)sum);
486*77c1e3ccSAndroid Build Coastguard Worker   row_store_64xh(&row, 16, dst, stride);
487*77c1e3ccSAndroid Build Coastguard Worker }
488*77c1e3ccSAndroid Build Coastguard Worker #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
489*77c1e3ccSAndroid Build Coastguard Worker 
aom_dc_top_predictor_32x16_avx2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)490*77c1e3ccSAndroid Build Coastguard Worker void aom_dc_top_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
491*77c1e3ccSAndroid Build Coastguard Worker                                      const uint8_t *above,
492*77c1e3ccSAndroid Build Coastguard Worker                                      const uint8_t *left) {
493*77c1e3ccSAndroid Build Coastguard Worker   __m256i sum = dc_sum_32(above);
494*77c1e3ccSAndroid Build Coastguard Worker   (void)left;
495*77c1e3ccSAndroid Build Coastguard Worker 
496*77c1e3ccSAndroid Build Coastguard Worker   const __m256i sixteen = _mm256_set1_epi16(16);
497*77c1e3ccSAndroid Build Coastguard Worker   sum = _mm256_add_epi16(sum, sixteen);
498*77c1e3ccSAndroid Build Coastguard Worker   sum = _mm256_srai_epi16(sum, 5);
499*77c1e3ccSAndroid Build Coastguard Worker   const __m256i zero = _mm256_setzero_si256();
500*77c1e3ccSAndroid Build Coastguard Worker   __m256i row = _mm256_shuffle_epi8(sum, zero);
501*77c1e3ccSAndroid Build Coastguard Worker   row_store_32xh(&row, 16, dst, stride);
502*77c1e3ccSAndroid Build Coastguard Worker }
503*77c1e3ccSAndroid Build Coastguard Worker 
aom_dc_top_predictor_32x64_avx2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)504*77c1e3ccSAndroid Build Coastguard Worker void aom_dc_top_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
505*77c1e3ccSAndroid Build Coastguard Worker                                      const uint8_t *above,
506*77c1e3ccSAndroid Build Coastguard Worker                                      const uint8_t *left) {
507*77c1e3ccSAndroid Build Coastguard Worker   __m256i sum = dc_sum_32(above);
508*77c1e3ccSAndroid Build Coastguard Worker   (void)left;
509*77c1e3ccSAndroid Build Coastguard Worker 
510*77c1e3ccSAndroid Build Coastguard Worker   const __m256i sixteen = _mm256_set1_epi16(16);
511*77c1e3ccSAndroid Build Coastguard Worker   sum = _mm256_add_epi16(sum, sixteen);
512*77c1e3ccSAndroid Build Coastguard Worker   sum = _mm256_srai_epi16(sum, 5);
513*77c1e3ccSAndroid Build Coastguard Worker   const __m256i zero = _mm256_setzero_si256();
514*77c1e3ccSAndroid Build Coastguard Worker   __m256i row = _mm256_shuffle_epi8(sum, zero);
515*77c1e3ccSAndroid Build Coastguard Worker   row_store_32xh(&row, 64, dst, stride);
516*77c1e3ccSAndroid Build Coastguard Worker }
517*77c1e3ccSAndroid Build Coastguard Worker 
aom_dc_top_predictor_64x64_avx2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)518*77c1e3ccSAndroid Build Coastguard Worker void aom_dc_top_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
519*77c1e3ccSAndroid Build Coastguard Worker                                      const uint8_t *above,
520*77c1e3ccSAndroid Build Coastguard Worker                                      const uint8_t *left) {
521*77c1e3ccSAndroid Build Coastguard Worker   __m256i sum = dc_sum_64(above);
522*77c1e3ccSAndroid Build Coastguard Worker   (void)left;
523*77c1e3ccSAndroid Build Coastguard Worker 
524*77c1e3ccSAndroid Build Coastguard Worker   const __m256i thirtytwo = _mm256_set1_epi16(32);
525*77c1e3ccSAndroid Build Coastguard Worker   sum = _mm256_add_epi16(sum, thirtytwo);
526*77c1e3ccSAndroid Build Coastguard Worker   sum = _mm256_srai_epi16(sum, 6);
527*77c1e3ccSAndroid Build Coastguard Worker   const __m256i zero = _mm256_setzero_si256();
528*77c1e3ccSAndroid Build Coastguard Worker   __m256i row = _mm256_shuffle_epi8(sum, zero);
529*77c1e3ccSAndroid Build Coastguard Worker   row_store_64xh(&row, 64, dst, stride);
530*77c1e3ccSAndroid Build Coastguard Worker }
531*77c1e3ccSAndroid Build Coastguard Worker 
aom_dc_top_predictor_64x32_avx2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)532*77c1e3ccSAndroid Build Coastguard Worker void aom_dc_top_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
533*77c1e3ccSAndroid Build Coastguard Worker                                      const uint8_t *above,
534*77c1e3ccSAndroid Build Coastguard Worker                                      const uint8_t *left) {
535*77c1e3ccSAndroid Build Coastguard Worker   __m256i sum = dc_sum_64(above);
536*77c1e3ccSAndroid Build Coastguard Worker   (void)left;
537*77c1e3ccSAndroid Build Coastguard Worker 
538*77c1e3ccSAndroid Build Coastguard Worker   const __m256i thirtytwo = _mm256_set1_epi16(32);
539*77c1e3ccSAndroid Build Coastguard Worker   sum = _mm256_add_epi16(sum, thirtytwo);
540*77c1e3ccSAndroid Build Coastguard Worker   sum = _mm256_srai_epi16(sum, 6);
541*77c1e3ccSAndroid Build Coastguard Worker   const __m256i zero = _mm256_setzero_si256();
542*77c1e3ccSAndroid Build Coastguard Worker   __m256i row = _mm256_shuffle_epi8(sum, zero);
543*77c1e3ccSAndroid Build Coastguard Worker   row_store_64xh(&row, 32, dst, stride);
544*77c1e3ccSAndroid Build Coastguard Worker }
545*77c1e3ccSAndroid Build Coastguard Worker 
546*77c1e3ccSAndroid Build Coastguard Worker #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
aom_dc_top_predictor_64x16_avx2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)547*77c1e3ccSAndroid Build Coastguard Worker void aom_dc_top_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
548*77c1e3ccSAndroid Build Coastguard Worker                                      const uint8_t *above,
549*77c1e3ccSAndroid Build Coastguard Worker                                      const uint8_t *left) {
550*77c1e3ccSAndroid Build Coastguard Worker   __m256i sum = dc_sum_64(above);
551*77c1e3ccSAndroid Build Coastguard Worker   (void)left;
552*77c1e3ccSAndroid Build Coastguard Worker 
553*77c1e3ccSAndroid Build Coastguard Worker   const __m256i thirtytwo = _mm256_set1_epi16(32);
554*77c1e3ccSAndroid Build Coastguard Worker   sum = _mm256_add_epi16(sum, thirtytwo);
555*77c1e3ccSAndroid Build Coastguard Worker   sum = _mm256_srai_epi16(sum, 6);
556*77c1e3ccSAndroid Build Coastguard Worker   const __m256i zero = _mm256_setzero_si256();
557*77c1e3ccSAndroid Build Coastguard Worker   __m256i row = _mm256_shuffle_epi8(sum, zero);
558*77c1e3ccSAndroid Build Coastguard Worker   row_store_64xh(&row, 16, dst, stride);
559*77c1e3ccSAndroid Build Coastguard Worker }
560*77c1e3ccSAndroid Build Coastguard Worker #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
561*77c1e3ccSAndroid Build Coastguard Worker 
aom_dc_left_predictor_32x16_avx2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)562*77c1e3ccSAndroid Build Coastguard Worker void aom_dc_left_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
563*77c1e3ccSAndroid Build Coastguard Worker                                       const uint8_t *above,
564*77c1e3ccSAndroid Build Coastguard Worker                                       const uint8_t *left) {
565*77c1e3ccSAndroid Build Coastguard Worker   __m128i sum = dc_sum_16_sse2(left);
566*77c1e3ccSAndroid Build Coastguard Worker   (void)above;
567*77c1e3ccSAndroid Build Coastguard Worker 
568*77c1e3ccSAndroid Build Coastguard Worker   const __m128i eight = _mm_set1_epi16(8);
569*77c1e3ccSAndroid Build Coastguard Worker   sum = _mm_add_epi16(sum, eight);
570*77c1e3ccSAndroid Build Coastguard Worker   sum = _mm_srai_epi16(sum, 4);
571*77c1e3ccSAndroid Build Coastguard Worker   const __m128i zero = _mm_setzero_si128();
572*77c1e3ccSAndroid Build Coastguard Worker   const __m128i r = _mm_shuffle_epi8(sum, zero);
573*77c1e3ccSAndroid Build Coastguard Worker   const __m256i row = _mm256_inserti128_si256(_mm256_castsi128_si256(r), r, 1);
574*77c1e3ccSAndroid Build Coastguard Worker   row_store_32xh(&row, 16, dst, stride);
575*77c1e3ccSAndroid Build Coastguard Worker }
576*77c1e3ccSAndroid Build Coastguard Worker 
aom_dc_left_predictor_32x64_avx2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)577*77c1e3ccSAndroid Build Coastguard Worker void aom_dc_left_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
578*77c1e3ccSAndroid Build Coastguard Worker                                       const uint8_t *above,
579*77c1e3ccSAndroid Build Coastguard Worker                                       const uint8_t *left) {
580*77c1e3ccSAndroid Build Coastguard Worker   __m256i sum = dc_sum_64(left);
581*77c1e3ccSAndroid Build Coastguard Worker   (void)above;
582*77c1e3ccSAndroid Build Coastguard Worker 
583*77c1e3ccSAndroid Build Coastguard Worker   const __m256i thirtytwo = _mm256_set1_epi16(32);
584*77c1e3ccSAndroid Build Coastguard Worker   sum = _mm256_add_epi16(sum, thirtytwo);
585*77c1e3ccSAndroid Build Coastguard Worker   sum = _mm256_srai_epi16(sum, 6);
586*77c1e3ccSAndroid Build Coastguard Worker   const __m256i zero = _mm256_setzero_si256();
587*77c1e3ccSAndroid Build Coastguard Worker   __m256i row = _mm256_shuffle_epi8(sum, zero);
588*77c1e3ccSAndroid Build Coastguard Worker   row_store_32xh(&row, 64, dst, stride);
589*77c1e3ccSAndroid Build Coastguard Worker }
590*77c1e3ccSAndroid Build Coastguard Worker 
aom_dc_left_predictor_64x64_avx2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)591*77c1e3ccSAndroid Build Coastguard Worker void aom_dc_left_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
592*77c1e3ccSAndroid Build Coastguard Worker                                       const uint8_t *above,
593*77c1e3ccSAndroid Build Coastguard Worker                                       const uint8_t *left) {
594*77c1e3ccSAndroid Build Coastguard Worker   __m256i sum = dc_sum_64(left);
595*77c1e3ccSAndroid Build Coastguard Worker   (void)above;
596*77c1e3ccSAndroid Build Coastguard Worker 
597*77c1e3ccSAndroid Build Coastguard Worker   const __m256i thirtytwo = _mm256_set1_epi16(32);
598*77c1e3ccSAndroid Build Coastguard Worker   sum = _mm256_add_epi16(sum, thirtytwo);
599*77c1e3ccSAndroid Build Coastguard Worker   sum = _mm256_srai_epi16(sum, 6);
600*77c1e3ccSAndroid Build Coastguard Worker   const __m256i zero = _mm256_setzero_si256();
601*77c1e3ccSAndroid Build Coastguard Worker   __m256i row = _mm256_shuffle_epi8(sum, zero);
602*77c1e3ccSAndroid Build Coastguard Worker   row_store_64xh(&row, 64, dst, stride);
603*77c1e3ccSAndroid Build Coastguard Worker }
604*77c1e3ccSAndroid Build Coastguard Worker 
aom_dc_left_predictor_64x32_avx2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)605*77c1e3ccSAndroid Build Coastguard Worker void aom_dc_left_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
606*77c1e3ccSAndroid Build Coastguard Worker                                       const uint8_t *above,
607*77c1e3ccSAndroid Build Coastguard Worker                                       const uint8_t *left) {
608*77c1e3ccSAndroid Build Coastguard Worker   __m256i sum = dc_sum_32(left);
609*77c1e3ccSAndroid Build Coastguard Worker   (void)above;
610*77c1e3ccSAndroid Build Coastguard Worker 
611*77c1e3ccSAndroid Build Coastguard Worker   const __m256i sixteen = _mm256_set1_epi16(16);
612*77c1e3ccSAndroid Build Coastguard Worker   sum = _mm256_add_epi16(sum, sixteen);
613*77c1e3ccSAndroid Build Coastguard Worker   sum = _mm256_srai_epi16(sum, 5);
614*77c1e3ccSAndroid Build Coastguard Worker   const __m256i zero = _mm256_setzero_si256();
615*77c1e3ccSAndroid Build Coastguard Worker   __m256i row = _mm256_shuffle_epi8(sum, zero);
616*77c1e3ccSAndroid Build Coastguard Worker   row_store_64xh(&row, 32, dst, stride);
617*77c1e3ccSAndroid Build Coastguard Worker }
618*77c1e3ccSAndroid Build Coastguard Worker 
619*77c1e3ccSAndroid Build Coastguard Worker #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
aom_dc_left_predictor_64x16_avx2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)620*77c1e3ccSAndroid Build Coastguard Worker void aom_dc_left_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
621*77c1e3ccSAndroid Build Coastguard Worker                                       const uint8_t *above,
622*77c1e3ccSAndroid Build Coastguard Worker                                       const uint8_t *left) {
623*77c1e3ccSAndroid Build Coastguard Worker   __m128i sum = dc_sum_16_sse2(left);
624*77c1e3ccSAndroid Build Coastguard Worker   (void)above;
625*77c1e3ccSAndroid Build Coastguard Worker 
626*77c1e3ccSAndroid Build Coastguard Worker   const __m128i eight = _mm_set1_epi16(8);
627*77c1e3ccSAndroid Build Coastguard Worker   sum = _mm_add_epi16(sum, eight);
628*77c1e3ccSAndroid Build Coastguard Worker   sum = _mm_srai_epi16(sum, 4);
629*77c1e3ccSAndroid Build Coastguard Worker   const __m128i zero = _mm_setzero_si128();
630*77c1e3ccSAndroid Build Coastguard Worker   const __m128i r = _mm_shuffle_epi8(sum, zero);
631*77c1e3ccSAndroid Build Coastguard Worker   const __m256i row = _mm256_inserti128_si256(_mm256_castsi128_si256(r), r, 1);
632*77c1e3ccSAndroid Build Coastguard Worker   row_store_64xh(&row, 16, dst, stride);
633*77c1e3ccSAndroid Build Coastguard Worker }
634*77c1e3ccSAndroid Build Coastguard Worker #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
635*77c1e3ccSAndroid Build Coastguard Worker 
// DC_128 prediction for a 32x16 block: a 32-byte row of the constant 0x80
// (128) is handed to row_store_32xh for 16 rows. Neither neighbour array is
// read.
void aom_dc_128_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const __m256i mid_level = _mm256_set1_epi8((int8_t)0x80);
  (void)left;
  (void)above;
  row_store_32xh(&mid_level, 16, dst, stride);
}
644*77c1e3ccSAndroid Build Coastguard Worker 
// DC_128 prediction for a 32x64 block: a 32-byte row of the constant 0x80
// (128) is handed to row_store_32xh for 64 rows. Neither neighbour array is
// read.
void aom_dc_128_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const __m256i mid_level = _mm256_set1_epi8((int8_t)0x80);
  (void)left;
  (void)above;
  row_store_32xh(&mid_level, 64, dst, stride);
}
653*77c1e3ccSAndroid Build Coastguard Worker 
// DC_128 prediction for a 64x64 block: a 32-byte vector of the constant 0x80
// (128) is handed to row_store_64xh for 64 rows. Neither neighbour array is
// read.
void aom_dc_128_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const __m256i mid_level = _mm256_set1_epi8((int8_t)0x80);
  (void)left;
  (void)above;
  row_store_64xh(&mid_level, 64, dst, stride);
}
662*77c1e3ccSAndroid Build Coastguard Worker 
// DC_128 prediction for a 64x32 block: a 32-byte vector of the constant 0x80
// (128) is handed to row_store_64xh for 32 rows. Neither neighbour array is
// read.
void aom_dc_128_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const __m256i mid_level = _mm256_set1_epi8((int8_t)0x80);
  (void)left;
  (void)above;
  row_store_64xh(&mid_level, 32, dst, stride);
}
671*77c1e3ccSAndroid Build Coastguard Worker 
672*77c1e3ccSAndroid Build Coastguard Worker #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
// DC_128 prediction for a 64x16 block: a 32-byte vector of the constant 0x80
// (128) is handed to row_store_64xh for 16 rows. Neither neighbour array is
// read.
void aom_dc_128_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const __m256i mid_level = _mm256_set1_epi8((int8_t)0x80);
  (void)left;
  (void)above;
  row_store_64xh(&mid_level, 16, dst, stride);
}
681*77c1e3ccSAndroid Build Coastguard Worker #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
682*77c1e3ccSAndroid Build Coastguard Worker 
// Vertical prediction for a 32x16 block: the 32 bytes at `above` are loaded
// once (unaligned) and passed to row_store_32xh for 16 rows. `left` is not
// read.
void aom_v_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  (void)left;
  const __m256i top_row = _mm256_loadu_si256((const __m256i *)above);
  row_store_32xh(&top_row, 16, dst, stride);
}
689*77c1e3ccSAndroid Build Coastguard Worker 
// Vertical prediction for a 32x64 block: the 32 bytes at `above` are loaded
// once (unaligned) and passed to row_store_32xh for 64 rows. `left` is not
// read.
void aom_v_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  (void)left;
  const __m256i top_row = _mm256_loadu_si256((const __m256i *)above);
  row_store_32xh(&top_row, 64, dst, stride);
}
696*77c1e3ccSAndroid Build Coastguard Worker 
// Vertical prediction for a 64x64 block: the 64 bytes at `above` are loaded
// as two unaligned 32-byte halves and passed to row_store_32x2xh for 64 rows.
// `left` is not read.
void aom_v_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  (void)left;
  const __m256i top_lo = _mm256_loadu_si256((const __m256i *)above);
  const __m256i top_hi = _mm256_loadu_si256((const __m256i *)(above + 32));
  row_store_32x2xh(&top_lo, &top_hi, 64, dst, stride);
}
704*77c1e3ccSAndroid Build Coastguard Worker 
// Vertical prediction for a 64x32 block: the 64 bytes at `above` are loaded
// as two unaligned 32-byte halves and passed to row_store_32x2xh for 32 rows.
// `left` is not read.
void aom_v_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  (void)left;
  const __m256i top_lo = _mm256_loadu_si256((const __m256i *)above);
  const __m256i top_hi = _mm256_loadu_si256((const __m256i *)(above + 32));
  row_store_32x2xh(&top_lo, &top_hi, 32, dst, stride);
}
712*77c1e3ccSAndroid Build Coastguard Worker 
713*77c1e3ccSAndroid Build Coastguard Worker #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
// Vertical prediction for a 64x16 block: the 64 bytes at `above` are loaded
// as two unaligned 32-byte halves and passed to row_store_32x2xh for 16 rows.
// `left` is not read.
void aom_v_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  (void)left;
  const __m256i top_lo = _mm256_loadu_si256((const __m256i *)above);
  const __m256i top_hi = _mm256_loadu_si256((const __m256i *)(above + 32));
  row_store_32x2xh(&top_lo, &top_hi, 16, dst, stride);
}
721*77c1e3ccSAndroid Build Coastguard Worker #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
722*77c1e3ccSAndroid Build Coastguard Worker 
723*77c1e3ccSAndroid Build Coastguard Worker // -----------------------------------------------------------------------------
724*77c1e3ccSAndroid Build Coastguard Worker // PAETH_PRED
725*77c1e3ccSAndroid Build Coastguard Worker 
// Paeth selection on 16 pixels held as 16-bit lanes; returns 16 16-bit pixels
// in one __m256i.
//
// Per lane, base = top + left - topleft, and the result is whichever of
// left / top / topleft has the smallest absolute distance to base. Ties are
// resolved in favour of left first, then top.
static inline __m256i paeth_pred(const __m256i *left, const __m256i *top,
                                 const __m256i *topleft) {
  const __m256i l = *left;
  const __m256i t = *top;
  const __m256i tl = *topleft;
  const __m256i base = _mm256_sub_epi16(_mm256_add_epi16(t, l), tl);

  // Absolute distance from base to each of the three candidates.
  const __m256i dist_l = _mm256_abs_epi16(_mm256_sub_epi16(base, l));
  const __m256i dist_t = _mm256_abs_epi16(_mm256_sub_epi16(base, t));
  const __m256i dist_tl = _mm256_abs_epi16(_mm256_sub_epi16(base, tl));

  // All-ones in lanes where left loses, i.e. it is strictly farther from base
  // than top or than topleft.
  const __m256i left_loses =
      _mm256_or_si256(_mm256_cmpgt_epi16(dist_l, dist_t),
                      _mm256_cmpgt_epi16(dist_l, dist_tl));
  // All-ones in lanes where topleft beats top (top strictly farther).
  const __m256i tl_wins = _mm256_cmpgt_epi16(dist_t, dist_tl);

  // Mask-and-merge blends: first top vs. topleft, then that vs. left.
  const __m256i top_or_tl = _mm256_or_si256(_mm256_andnot_si256(tl_wins, t),
                                            _mm256_and_si256(tl_wins, tl));
  const __m256i kept_left = _mm256_andnot_si256(left_loses, l);
  const __m256i kept_other = _mm256_and_si256(left_loses, top_or_tl);

  return _mm256_or_si256(kept_other, kept_left);
}
749*77c1e3ccSAndroid Build Coastguard Worker 
750*77c1e3ccSAndroid Build Coastguard Worker // Return 16 8-bit pixels in one row (__m128i)
paeth_16x1_pred(const __m256i * left,const __m256i * top,const __m256i * topleft)751*77c1e3ccSAndroid Build Coastguard Worker static inline __m128i paeth_16x1_pred(const __m256i *left, const __m256i *top,
752*77c1e3ccSAndroid Build Coastguard Worker                                       const __m256i *topleft) {
753*77c1e3ccSAndroid Build Coastguard Worker   const __m256i p0 = paeth_pred(left, top, topleft);
754*77c1e3ccSAndroid Build Coastguard Worker   const __m256i p1 = _mm256_permute4x64_epi64(p0, 0xe);
755*77c1e3ccSAndroid Build Coastguard Worker   const __m256i p = _mm256_packus_epi16(p0, p1);
756*77c1e3ccSAndroid Build Coastguard Worker   return _mm256_castsi256_si128(p);
757*77c1e3ccSAndroid Build Coastguard Worker }
758*77c1e3ccSAndroid Build Coastguard Worker 
// Load the 16 bytes at above and zero-extend each one to a 16-bit lane, so
// lane i of the result holds above[i]. The load is an aligned 128-bit load,
// so above must be 16-byte aligned.
static inline __m256i get_top_vector(const uint8_t *above) {
  const __m128i top_u8 = _mm_load_si128((const __m128i *)above);
  return _mm256_cvtepu8_epi16(top_u8);
}
766*77c1e3ccSAndroid Build Coastguard Worker 
aom_paeth_predictor_16x8_avx2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)767*77c1e3ccSAndroid Build Coastguard Worker void aom_paeth_predictor_16x8_avx2(uint8_t *dst, ptrdiff_t stride,
768*77c1e3ccSAndroid Build Coastguard Worker                                    const uint8_t *above, const uint8_t *left) {
769*77c1e3ccSAndroid Build Coastguard Worker   __m128i x = _mm_loadl_epi64((const __m128i *)left);
770*77c1e3ccSAndroid Build Coastguard Worker   const __m256i l = _mm256_inserti128_si256(_mm256_castsi128_si256(x), x, 1);
771*77c1e3ccSAndroid Build Coastguard Worker   const __m256i tl16 = _mm256_set1_epi16((int16_t)above[-1]);
772*77c1e3ccSAndroid Build Coastguard Worker   __m256i rep = _mm256_set1_epi16((short)0x8000);
773*77c1e3ccSAndroid Build Coastguard Worker   const __m256i one = _mm256_set1_epi16(1);
774*77c1e3ccSAndroid Build Coastguard Worker   const __m256i top = get_top_vector(above);
775*77c1e3ccSAndroid Build Coastguard Worker 
776*77c1e3ccSAndroid Build Coastguard Worker   int i;
777*77c1e3ccSAndroid Build Coastguard Worker   for (i = 0; i < 8; ++i) {
778*77c1e3ccSAndroid Build Coastguard Worker     const __m256i l16 = _mm256_shuffle_epi8(l, rep);
779*77c1e3ccSAndroid Build Coastguard Worker     const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);
780*77c1e3ccSAndroid Build Coastguard Worker 
781*77c1e3ccSAndroid Build Coastguard Worker     _mm_store_si128((__m128i *)dst, row);
782*77c1e3ccSAndroid Build Coastguard Worker     dst += stride;
783*77c1e3ccSAndroid Build Coastguard Worker     rep = _mm256_add_epi16(rep, one);
784*77c1e3ccSAndroid Build Coastguard Worker   }
785*77c1e3ccSAndroid Build Coastguard Worker }
786*77c1e3ccSAndroid Build Coastguard Worker 
// Load the 16 bytes at left and return them duplicated in both 128-bit
// halves of a 256-bit vector. The load is an aligned 128-bit load, so left
// must be 16-byte aligned.
static inline __m256i get_left_vector(const uint8_t *left) {
  const __m128i left_u8 = _mm_load_si128((const __m128i *)left);
  const __m256i low_half = _mm256_castsi128_si256(left_u8);
  return _mm256_inserti128_si256(low_half, left_u8, 1);
}
791*77c1e3ccSAndroid Build Coastguard Worker 
aom_paeth_predictor_16x16_avx2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)792*77c1e3ccSAndroid Build Coastguard Worker void aom_paeth_predictor_16x16_avx2(uint8_t *dst, ptrdiff_t stride,
793*77c1e3ccSAndroid Build Coastguard Worker                                     const uint8_t *above, const uint8_t *left) {
794*77c1e3ccSAndroid Build Coastguard Worker   const __m256i l = get_left_vector(left);
795*77c1e3ccSAndroid Build Coastguard Worker   const __m256i tl16 = _mm256_set1_epi16((int16_t)above[-1]);
796*77c1e3ccSAndroid Build Coastguard Worker   __m256i rep = _mm256_set1_epi16((short)0x8000);
797*77c1e3ccSAndroid Build Coastguard Worker   const __m256i one = _mm256_set1_epi16(1);
798*77c1e3ccSAndroid Build Coastguard Worker   const __m256i top = get_top_vector(above);
799*77c1e3ccSAndroid Build Coastguard Worker 
800*77c1e3ccSAndroid Build Coastguard Worker   int i;
801*77c1e3ccSAndroid Build Coastguard Worker   for (i = 0; i < 16; ++i) {
802*77c1e3ccSAndroid Build Coastguard Worker     const __m256i l16 = _mm256_shuffle_epi8(l, rep);
803*77c1e3ccSAndroid Build Coastguard Worker     const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);
804*77c1e3ccSAndroid Build Coastguard Worker 
805*77c1e3ccSAndroid Build Coastguard Worker     _mm_store_si128((__m128i *)dst, row);
806*77c1e3ccSAndroid Build Coastguard Worker     dst += stride;
807*77c1e3ccSAndroid Build Coastguard Worker     rep = _mm256_add_epi16(rep, one);
808*77c1e3ccSAndroid Build Coastguard Worker   }
809*77c1e3ccSAndroid Build Coastguard Worker }
810*77c1e3ccSAndroid Build Coastguard Worker 
aom_paeth_predictor_16x32_avx2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)811*77c1e3ccSAndroid Build Coastguard Worker void aom_paeth_predictor_16x32_avx2(uint8_t *dst, ptrdiff_t stride,
812*77c1e3ccSAndroid Build Coastguard Worker                                     const uint8_t *above, const uint8_t *left) {
813*77c1e3ccSAndroid Build Coastguard Worker   __m256i l = get_left_vector(left);
814*77c1e3ccSAndroid Build Coastguard Worker   const __m256i tl16 = _mm256_set1_epi16((int16_t)above[-1]);
815*77c1e3ccSAndroid Build Coastguard Worker   __m256i rep = _mm256_set1_epi16((short)0x8000);
816*77c1e3ccSAndroid Build Coastguard Worker   const __m256i one = _mm256_set1_epi16(1);
817*77c1e3ccSAndroid Build Coastguard Worker   const __m256i top = get_top_vector(above);
818*77c1e3ccSAndroid Build Coastguard Worker 
819*77c1e3ccSAndroid Build Coastguard Worker   int i;
820*77c1e3ccSAndroid Build Coastguard Worker   for (i = 0; i < 16; ++i) {
821*77c1e3ccSAndroid Build Coastguard Worker     const __m256i l16 = _mm256_shuffle_epi8(l, rep);
822*77c1e3ccSAndroid Build Coastguard Worker     const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);
823*77c1e3ccSAndroid Build Coastguard Worker 
824*77c1e3ccSAndroid Build Coastguard Worker     _mm_store_si128((__m128i *)dst, row);
825*77c1e3ccSAndroid Build Coastguard Worker     dst += stride;
826*77c1e3ccSAndroid Build Coastguard Worker     rep = _mm256_add_epi16(rep, one);
827*77c1e3ccSAndroid Build Coastguard Worker   }
828*77c1e3ccSAndroid Build Coastguard Worker 
829*77c1e3ccSAndroid Build Coastguard Worker   l = get_left_vector(left + 16);
830*77c1e3ccSAndroid Build Coastguard Worker   rep = _mm256_set1_epi16((short)0x8000);
831*77c1e3ccSAndroid Build Coastguard Worker   for (i = 0; i < 16; ++i) {
832*77c1e3ccSAndroid Build Coastguard Worker     const __m256i l16 = _mm256_shuffle_epi8(l, rep);
833*77c1e3ccSAndroid Build Coastguard Worker     const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);
834*77c1e3ccSAndroid Build Coastguard Worker 
835*77c1e3ccSAndroid Build Coastguard Worker     _mm_store_si128((__m128i *)dst, row);
836*77c1e3ccSAndroid Build Coastguard Worker     dst += stride;
837*77c1e3ccSAndroid Build Coastguard Worker     rep = _mm256_add_epi16(rep, one);
838*77c1e3ccSAndroid Build Coastguard Worker   }
839*77c1e3ccSAndroid Build Coastguard Worker }
840*77c1e3ccSAndroid Build Coastguard Worker 
841*77c1e3ccSAndroid Build Coastguard Worker #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
aom_paeth_predictor_16x64_avx2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)842*77c1e3ccSAndroid Build Coastguard Worker void aom_paeth_predictor_16x64_avx2(uint8_t *dst, ptrdiff_t stride,
843*77c1e3ccSAndroid Build Coastguard Worker                                     const uint8_t *above, const uint8_t *left) {
844*77c1e3ccSAndroid Build Coastguard Worker   const __m256i tl16 = _mm256_set1_epi16((int16_t)above[-1]);
845*77c1e3ccSAndroid Build Coastguard Worker   const __m256i one = _mm256_set1_epi16(1);
846*77c1e3ccSAndroid Build Coastguard Worker   const __m256i top = get_top_vector(above);
847*77c1e3ccSAndroid Build Coastguard Worker 
848*77c1e3ccSAndroid Build Coastguard Worker   for (int j = 0; j < 4; ++j) {
849*77c1e3ccSAndroid Build Coastguard Worker     const __m256i l = get_left_vector(left + j * 16);
850*77c1e3ccSAndroid Build Coastguard Worker     __m256i rep = _mm256_set1_epi16((short)0x8000);
851*77c1e3ccSAndroid Build Coastguard Worker     for (int i = 0; i < 16; ++i) {
852*77c1e3ccSAndroid Build Coastguard Worker       const __m256i l16 = _mm256_shuffle_epi8(l, rep);
853*77c1e3ccSAndroid Build Coastguard Worker       const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);
854*77c1e3ccSAndroid Build Coastguard Worker 
855*77c1e3ccSAndroid Build Coastguard Worker       _mm_store_si128((__m128i *)dst, row);
856*77c1e3ccSAndroid Build Coastguard Worker       dst += stride;
857*77c1e3ccSAndroid Build Coastguard Worker       rep = _mm256_add_epi16(rep, one);
858*77c1e3ccSAndroid Build Coastguard Worker     }
859*77c1e3ccSAndroid Build Coastguard Worker   }
860*77c1e3ccSAndroid Build Coastguard Worker }
861*77c1e3ccSAndroid Build Coastguard Worker #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
862*77c1e3ccSAndroid Build Coastguard Worker 
863*77c1e3ccSAndroid Build Coastguard Worker // Return 32 8-bit pixels in one row (__m256i)
paeth_32x1_pred(const __m256i * left,const __m256i * top0,const __m256i * top1,const __m256i * topleft)864*77c1e3ccSAndroid Build Coastguard Worker static inline __m256i paeth_32x1_pred(const __m256i *left, const __m256i *top0,
865*77c1e3ccSAndroid Build Coastguard Worker                                       const __m256i *top1,
866*77c1e3ccSAndroid Build Coastguard Worker                                       const __m256i *topleft) {
867*77c1e3ccSAndroid Build Coastguard Worker   __m256i p0 = paeth_pred(left, top0, topleft);
868*77c1e3ccSAndroid Build Coastguard Worker   __m256i p1 = _mm256_permute4x64_epi64(p0, 0xe);
869*77c1e3ccSAndroid Build Coastguard Worker   const __m256i x0 = _mm256_packus_epi16(p0, p1);
870*77c1e3ccSAndroid Build Coastguard Worker 
871*77c1e3ccSAndroid Build Coastguard Worker   p0 = paeth_pred(left, top1, topleft);
872*77c1e3ccSAndroid Build Coastguard Worker   p1 = _mm256_permute4x64_epi64(p0, 0xe);
873*77c1e3ccSAndroid Build Coastguard Worker   const __m256i x1 = _mm256_packus_epi16(p0, p1);
874*77c1e3ccSAndroid Build Coastguard Worker 
875*77c1e3ccSAndroid Build Coastguard Worker   return _mm256_permute2x128_si256(x0, x1, 0x20);
876*77c1e3ccSAndroid Build Coastguard Worker }
877*77c1e3ccSAndroid Build Coastguard Worker 
aom_paeth_predictor_32x16_avx2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)878*77c1e3ccSAndroid Build Coastguard Worker void aom_paeth_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
879*77c1e3ccSAndroid Build Coastguard Worker                                     const uint8_t *above, const uint8_t *left) {
880*77c1e3ccSAndroid Build Coastguard Worker   const __m256i l = get_left_vector(left);
881*77c1e3ccSAndroid Build Coastguard Worker   const __m256i t0 = get_top_vector(above);
882*77c1e3ccSAndroid Build Coastguard Worker   const __m256i t1 = get_top_vector(above + 16);
883*77c1e3ccSAndroid Build Coastguard Worker   const __m256i tl = _mm256_set1_epi16((int16_t)above[-1]);
884*77c1e3ccSAndroid Build Coastguard Worker   __m256i rep = _mm256_set1_epi16((short)0x8000);
885*77c1e3ccSAndroid Build Coastguard Worker   const __m256i one = _mm256_set1_epi16(1);
886*77c1e3ccSAndroid Build Coastguard Worker 
887*77c1e3ccSAndroid Build Coastguard Worker   int i;
888*77c1e3ccSAndroid Build Coastguard Worker   for (i = 0; i < 16; ++i) {
889*77c1e3ccSAndroid Build Coastguard Worker     const __m256i l16 = _mm256_shuffle_epi8(l, rep);
890*77c1e3ccSAndroid Build Coastguard Worker 
891*77c1e3ccSAndroid Build Coastguard Worker     const __m256i r = paeth_32x1_pred(&l16, &t0, &t1, &tl);
892*77c1e3ccSAndroid Build Coastguard Worker 
893*77c1e3ccSAndroid Build Coastguard Worker     _mm256_storeu_si256((__m256i *)dst, r);
894*77c1e3ccSAndroid Build Coastguard Worker 
895*77c1e3ccSAndroid Build Coastguard Worker     dst += stride;
896*77c1e3ccSAndroid Build Coastguard Worker     rep = _mm256_add_epi16(rep, one);
897*77c1e3ccSAndroid Build Coastguard Worker   }
898*77c1e3ccSAndroid Build Coastguard Worker }
899*77c1e3ccSAndroid Build Coastguard Worker 
aom_paeth_predictor_32x32_avx2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)900*77c1e3ccSAndroid Build Coastguard Worker void aom_paeth_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
901*77c1e3ccSAndroid Build Coastguard Worker                                     const uint8_t *above, const uint8_t *left) {
902*77c1e3ccSAndroid Build Coastguard Worker   __m256i l = get_left_vector(left);
903*77c1e3ccSAndroid Build Coastguard Worker   const __m256i t0 = get_top_vector(above);
904*77c1e3ccSAndroid Build Coastguard Worker   const __m256i t1 = get_top_vector(above + 16);
905*77c1e3ccSAndroid Build Coastguard Worker   const __m256i tl = _mm256_set1_epi16((int16_t)above[-1]);
906*77c1e3ccSAndroid Build Coastguard Worker   __m256i rep = _mm256_set1_epi16((short)0x8000);
907*77c1e3ccSAndroid Build Coastguard Worker   const __m256i one = _mm256_set1_epi16(1);
908*77c1e3ccSAndroid Build Coastguard Worker 
909*77c1e3ccSAndroid Build Coastguard Worker   int i;
910*77c1e3ccSAndroid Build Coastguard Worker   for (i = 0; i < 16; ++i) {
911*77c1e3ccSAndroid Build Coastguard Worker     const __m256i l16 = _mm256_shuffle_epi8(l, rep);
912*77c1e3ccSAndroid Build Coastguard Worker 
913*77c1e3ccSAndroid Build Coastguard Worker     const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
914*77c1e3ccSAndroid Build Coastguard Worker     const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
915*77c1e3ccSAndroid Build Coastguard Worker 
916*77c1e3ccSAndroid Build Coastguard Worker     _mm_store_si128((__m128i *)dst, r0);
917*77c1e3ccSAndroid Build Coastguard Worker     _mm_store_si128((__m128i *)(dst + 16), r1);
918*77c1e3ccSAndroid Build Coastguard Worker 
919*77c1e3ccSAndroid Build Coastguard Worker     dst += stride;
920*77c1e3ccSAndroid Build Coastguard Worker     rep = _mm256_add_epi16(rep, one);
921*77c1e3ccSAndroid Build Coastguard Worker   }
922*77c1e3ccSAndroid Build Coastguard Worker 
923*77c1e3ccSAndroid Build Coastguard Worker   l = get_left_vector(left + 16);
924*77c1e3ccSAndroid Build Coastguard Worker   rep = _mm256_set1_epi16((short)0x8000);
925*77c1e3ccSAndroid Build Coastguard Worker   for (i = 0; i < 16; ++i) {
926*77c1e3ccSAndroid Build Coastguard Worker     const __m256i l16 = _mm256_shuffle_epi8(l, rep);
927*77c1e3ccSAndroid Build Coastguard Worker 
928*77c1e3ccSAndroid Build Coastguard Worker     const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
929*77c1e3ccSAndroid Build Coastguard Worker     const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
930*77c1e3ccSAndroid Build Coastguard Worker 
931*77c1e3ccSAndroid Build Coastguard Worker     _mm_store_si128((__m128i *)dst, r0);
932*77c1e3ccSAndroid Build Coastguard Worker     _mm_store_si128((__m128i *)(dst + 16), r1);
933*77c1e3ccSAndroid Build Coastguard Worker 
934*77c1e3ccSAndroid Build Coastguard Worker     dst += stride;
935*77c1e3ccSAndroid Build Coastguard Worker     rep = _mm256_add_epi16(rep, one);
936*77c1e3ccSAndroid Build Coastguard Worker   }
937*77c1e3ccSAndroid Build Coastguard Worker }
938*77c1e3ccSAndroid Build Coastguard Worker 
aom_paeth_predictor_32x64_avx2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)939*77c1e3ccSAndroid Build Coastguard Worker void aom_paeth_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
940*77c1e3ccSAndroid Build Coastguard Worker                                     const uint8_t *above, const uint8_t *left) {
941*77c1e3ccSAndroid Build Coastguard Worker   const __m256i t0 = get_top_vector(above);
942*77c1e3ccSAndroid Build Coastguard Worker   const __m256i t1 = get_top_vector(above + 16);
943*77c1e3ccSAndroid Build Coastguard Worker   const __m256i tl = _mm256_set1_epi16((int16_t)above[-1]);
944*77c1e3ccSAndroid Build Coastguard Worker   const __m256i one = _mm256_set1_epi16(1);
945*77c1e3ccSAndroid Build Coastguard Worker 
946*77c1e3ccSAndroid Build Coastguard Worker   int i, j;
947*77c1e3ccSAndroid Build Coastguard Worker   for (j = 0; j < 4; ++j) {
948*77c1e3ccSAndroid Build Coastguard Worker     const __m256i l = get_left_vector(left + j * 16);
949*77c1e3ccSAndroid Build Coastguard Worker     __m256i rep = _mm256_set1_epi16((short)0x8000);
950*77c1e3ccSAndroid Build Coastguard Worker     for (i = 0; i < 16; ++i) {
951*77c1e3ccSAndroid Build Coastguard Worker       const __m256i l16 = _mm256_shuffle_epi8(l, rep);
952*77c1e3ccSAndroid Build Coastguard Worker 
953*77c1e3ccSAndroid Build Coastguard Worker       const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
954*77c1e3ccSAndroid Build Coastguard Worker       const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
955*77c1e3ccSAndroid Build Coastguard Worker 
956*77c1e3ccSAndroid Build Coastguard Worker       _mm_store_si128((__m128i *)dst, r0);
957*77c1e3ccSAndroid Build Coastguard Worker       _mm_store_si128((__m128i *)(dst + 16), r1);
958*77c1e3ccSAndroid Build Coastguard Worker 
959*77c1e3ccSAndroid Build Coastguard Worker       dst += stride;
960*77c1e3ccSAndroid Build Coastguard Worker       rep = _mm256_add_epi16(rep, one);
961*77c1e3ccSAndroid Build Coastguard Worker     }
962*77c1e3ccSAndroid Build Coastguard Worker   }
963*77c1e3ccSAndroid Build Coastguard Worker }
964*77c1e3ccSAndroid Build Coastguard Worker 
aom_paeth_predictor_64x32_avx2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)965*77c1e3ccSAndroid Build Coastguard Worker void aom_paeth_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
966*77c1e3ccSAndroid Build Coastguard Worker                                     const uint8_t *above, const uint8_t *left) {
967*77c1e3ccSAndroid Build Coastguard Worker   const __m256i t0 = get_top_vector(above);
968*77c1e3ccSAndroid Build Coastguard Worker   const __m256i t1 = get_top_vector(above + 16);
969*77c1e3ccSAndroid Build Coastguard Worker   const __m256i t2 = get_top_vector(above + 32);
970*77c1e3ccSAndroid Build Coastguard Worker   const __m256i t3 = get_top_vector(above + 48);
971*77c1e3ccSAndroid Build Coastguard Worker   const __m256i tl = _mm256_set1_epi16((int16_t)above[-1]);
972*77c1e3ccSAndroid Build Coastguard Worker   const __m256i one = _mm256_set1_epi16(1);
973*77c1e3ccSAndroid Build Coastguard Worker 
974*77c1e3ccSAndroid Build Coastguard Worker   int i, j;
975*77c1e3ccSAndroid Build Coastguard Worker   for (j = 0; j < 2; ++j) {
976*77c1e3ccSAndroid Build Coastguard Worker     const __m256i l = get_left_vector(left + j * 16);
977*77c1e3ccSAndroid Build Coastguard Worker     __m256i rep = _mm256_set1_epi16((short)0x8000);
978*77c1e3ccSAndroid Build Coastguard Worker     for (i = 0; i < 16; ++i) {
979*77c1e3ccSAndroid Build Coastguard Worker       const __m256i l16 = _mm256_shuffle_epi8(l, rep);
980*77c1e3ccSAndroid Build Coastguard Worker 
981*77c1e3ccSAndroid Build Coastguard Worker       const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
982*77c1e3ccSAndroid Build Coastguard Worker       const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
983*77c1e3ccSAndroid Build Coastguard Worker       const __m128i r2 = paeth_16x1_pred(&l16, &t2, &tl);
984*77c1e3ccSAndroid Build Coastguard Worker       const __m128i r3 = paeth_16x1_pred(&l16, &t3, &tl);
985*77c1e3ccSAndroid Build Coastguard Worker 
986*77c1e3ccSAndroid Build Coastguard Worker       _mm_store_si128((__m128i *)dst, r0);
987*77c1e3ccSAndroid Build Coastguard Worker       _mm_store_si128((__m128i *)(dst + 16), r1);
988*77c1e3ccSAndroid Build Coastguard Worker       _mm_store_si128((__m128i *)(dst + 32), r2);
989*77c1e3ccSAndroid Build Coastguard Worker       _mm_store_si128((__m128i *)(dst + 48), r3);
990*77c1e3ccSAndroid Build Coastguard Worker 
991*77c1e3ccSAndroid Build Coastguard Worker       dst += stride;
992*77c1e3ccSAndroid Build Coastguard Worker       rep = _mm256_add_epi16(rep, one);
993*77c1e3ccSAndroid Build Coastguard Worker     }
994*77c1e3ccSAndroid Build Coastguard Worker   }
995*77c1e3ccSAndroid Build Coastguard Worker }
996*77c1e3ccSAndroid Build Coastguard Worker 
aom_paeth_predictor_64x64_avx2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)997*77c1e3ccSAndroid Build Coastguard Worker void aom_paeth_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
998*77c1e3ccSAndroid Build Coastguard Worker                                     const uint8_t *above, const uint8_t *left) {
999*77c1e3ccSAndroid Build Coastguard Worker   const __m256i t0 = get_top_vector(above);
1000*77c1e3ccSAndroid Build Coastguard Worker   const __m256i t1 = get_top_vector(above + 16);
1001*77c1e3ccSAndroid Build Coastguard Worker   const __m256i t2 = get_top_vector(above + 32);
1002*77c1e3ccSAndroid Build Coastguard Worker   const __m256i t3 = get_top_vector(above + 48);
1003*77c1e3ccSAndroid Build Coastguard Worker   const __m256i tl = _mm256_set1_epi16((int16_t)above[-1]);
1004*77c1e3ccSAndroid Build Coastguard Worker   const __m256i one = _mm256_set1_epi16(1);
1005*77c1e3ccSAndroid Build Coastguard Worker 
1006*77c1e3ccSAndroid Build Coastguard Worker   int i, j;
1007*77c1e3ccSAndroid Build Coastguard Worker   for (j = 0; j < 4; ++j) {
1008*77c1e3ccSAndroid Build Coastguard Worker     const __m256i l = get_left_vector(left + j * 16);
1009*77c1e3ccSAndroid Build Coastguard Worker     __m256i rep = _mm256_set1_epi16((short)0x8000);
1010*77c1e3ccSAndroid Build Coastguard Worker     for (i = 0; i < 16; ++i) {
1011*77c1e3ccSAndroid Build Coastguard Worker       const __m256i l16 = _mm256_shuffle_epi8(l, rep);
1012*77c1e3ccSAndroid Build Coastguard Worker 
1013*77c1e3ccSAndroid Build Coastguard Worker       const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
1014*77c1e3ccSAndroid Build Coastguard Worker       const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
1015*77c1e3ccSAndroid Build Coastguard Worker       const __m128i r2 = paeth_16x1_pred(&l16, &t2, &tl);
1016*77c1e3ccSAndroid Build Coastguard Worker       const __m128i r3 = paeth_16x1_pred(&l16, &t3, &tl);
1017*77c1e3ccSAndroid Build Coastguard Worker 
1018*77c1e3ccSAndroid Build Coastguard Worker       _mm_store_si128((__m128i *)dst, r0);
1019*77c1e3ccSAndroid Build Coastguard Worker       _mm_store_si128((__m128i *)(dst + 16), r1);
1020*77c1e3ccSAndroid Build Coastguard Worker       _mm_store_si128((__m128i *)(dst + 32), r2);
1021*77c1e3ccSAndroid Build Coastguard Worker       _mm_store_si128((__m128i *)(dst + 48), r3);
1022*77c1e3ccSAndroid Build Coastguard Worker 
1023*77c1e3ccSAndroid Build Coastguard Worker       dst += stride;
1024*77c1e3ccSAndroid Build Coastguard Worker       rep = _mm256_add_epi16(rep, one);
1025*77c1e3ccSAndroid Build Coastguard Worker     }
1026*77c1e3ccSAndroid Build Coastguard Worker   }
1027*77c1e3ccSAndroid Build Coastguard Worker }
1028*77c1e3ccSAndroid Build Coastguard Worker 
1029*77c1e3ccSAndroid Build Coastguard Worker #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
aom_paeth_predictor_64x16_avx2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)1030*77c1e3ccSAndroid Build Coastguard Worker void aom_paeth_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
1031*77c1e3ccSAndroid Build Coastguard Worker                                     const uint8_t *above, const uint8_t *left) {
1032*77c1e3ccSAndroid Build Coastguard Worker   const __m256i t0 = get_top_vector(above);
1033*77c1e3ccSAndroid Build Coastguard Worker   const __m256i t1 = get_top_vector(above + 16);
1034*77c1e3ccSAndroid Build Coastguard Worker   const __m256i t2 = get_top_vector(above + 32);
1035*77c1e3ccSAndroid Build Coastguard Worker   const __m256i t3 = get_top_vector(above + 48);
1036*77c1e3ccSAndroid Build Coastguard Worker   const __m256i tl = _mm256_set1_epi16((int16_t)above[-1]);
1037*77c1e3ccSAndroid Build Coastguard Worker   const __m256i one = _mm256_set1_epi16(1);
1038*77c1e3ccSAndroid Build Coastguard Worker 
1039*77c1e3ccSAndroid Build Coastguard Worker   int i;
1040*77c1e3ccSAndroid Build Coastguard Worker   const __m256i l = get_left_vector(left);
1041*77c1e3ccSAndroid Build Coastguard Worker   __m256i rep = _mm256_set1_epi16((short)0x8000);
1042*77c1e3ccSAndroid Build Coastguard Worker   for (i = 0; i < 16; ++i) {
1043*77c1e3ccSAndroid Build Coastguard Worker     const __m256i l16 = _mm256_shuffle_epi8(l, rep);
1044*77c1e3ccSAndroid Build Coastguard Worker 
1045*77c1e3ccSAndroid Build Coastguard Worker     const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
1046*77c1e3ccSAndroid Build Coastguard Worker     const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
1047*77c1e3ccSAndroid Build Coastguard Worker     const __m128i r2 = paeth_16x1_pred(&l16, &t2, &tl);
1048*77c1e3ccSAndroid Build Coastguard Worker     const __m128i r3 = paeth_16x1_pred(&l16, &t3, &tl);
1049*77c1e3ccSAndroid Build Coastguard Worker 
1050*77c1e3ccSAndroid Build Coastguard Worker     _mm_store_si128((__m128i *)dst, r0);
1051*77c1e3ccSAndroid Build Coastguard Worker     _mm_store_si128((__m128i *)(dst + 16), r1);
1052*77c1e3ccSAndroid Build Coastguard Worker     _mm_store_si128((__m128i *)(dst + 32), r2);
1053*77c1e3ccSAndroid Build Coastguard Worker     _mm_store_si128((__m128i *)(dst + 48), r3);
1054*77c1e3ccSAndroid Build Coastguard Worker 
1055*77c1e3ccSAndroid Build Coastguard Worker     dst += stride;
1056*77c1e3ccSAndroid Build Coastguard Worker     rep = _mm256_add_epi16(rep, one);
1057*77c1e3ccSAndroid Build Coastguard Worker   }
1058*77c1e3ccSAndroid Build Coastguard Worker }
1059*77c1e3ccSAndroid Build Coastguard Worker #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
1060*77c1e3ccSAndroid Build Coastguard Worker 
1061*77c1e3ccSAndroid Build Coastguard Worker #if CONFIG_AV1_HIGHBITDEPTH
1062*77c1e3ccSAndroid Build Coastguard Worker 
highbd_dr_prediction_z1_4xN_internal_avx2(int N,__m128i * dst,const uint16_t * above,int upsample_above,int dx)1063*77c1e3ccSAndroid Build Coastguard Worker static AOM_FORCE_INLINE void highbd_dr_prediction_z1_4xN_internal_avx2(
1064*77c1e3ccSAndroid Build Coastguard Worker     int N, __m128i *dst, const uint16_t *above, int upsample_above, int dx) {
1065*77c1e3ccSAndroid Build Coastguard Worker   const int frac_bits = 6 - upsample_above;
1066*77c1e3ccSAndroid Build Coastguard Worker   const int max_base_x = ((N + 4) - 1) << upsample_above;
1067*77c1e3ccSAndroid Build Coastguard Worker 
1068*77c1e3ccSAndroid Build Coastguard Worker   assert(dx > 0);
1069*77c1e3ccSAndroid Build Coastguard Worker   // pre-filter above pixels
1070*77c1e3ccSAndroid Build Coastguard Worker   // store in temp buffers:
1071*77c1e3ccSAndroid Build Coastguard Worker   //   above[x] * 32 + 16
1072*77c1e3ccSAndroid Build Coastguard Worker   //   above[x+1] - above[x]
1073*77c1e3ccSAndroid Build Coastguard Worker   // final pixels will be calculated as:
1074*77c1e3ccSAndroid Build Coastguard Worker   //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
1075*77c1e3ccSAndroid Build Coastguard Worker   __m256i a0, a1, a32, a16;
1076*77c1e3ccSAndroid Build Coastguard Worker   __m256i diff, c3f;
1077*77c1e3ccSAndroid Build Coastguard Worker   __m128i a_mbase_x, max_base_x128, base_inc128, mask128;
1078*77c1e3ccSAndroid Build Coastguard Worker   __m128i a0_128, a1_128;
1079*77c1e3ccSAndroid Build Coastguard Worker   a16 = _mm256_set1_epi16(16);
1080*77c1e3ccSAndroid Build Coastguard Worker   a_mbase_x = _mm_set1_epi16(above[max_base_x]);
1081*77c1e3ccSAndroid Build Coastguard Worker   max_base_x128 = _mm_set1_epi16(max_base_x);
1082*77c1e3ccSAndroid Build Coastguard Worker   c3f = _mm256_set1_epi16(0x3f);
1083*77c1e3ccSAndroid Build Coastguard Worker 
1084*77c1e3ccSAndroid Build Coastguard Worker   int x = dx;
1085*77c1e3ccSAndroid Build Coastguard Worker   for (int r = 0; r < N; r++) {
1086*77c1e3ccSAndroid Build Coastguard Worker     __m256i b, res, shift;
1087*77c1e3ccSAndroid Build Coastguard Worker     __m128i res1;
1088*77c1e3ccSAndroid Build Coastguard Worker 
1089*77c1e3ccSAndroid Build Coastguard Worker     int base = x >> frac_bits;
1090*77c1e3ccSAndroid Build Coastguard Worker     if (base >= max_base_x) {
1091*77c1e3ccSAndroid Build Coastguard Worker       for (int i = r; i < N; ++i) {
1092*77c1e3ccSAndroid Build Coastguard Worker         dst[i] = a_mbase_x;  // save 4 values
1093*77c1e3ccSAndroid Build Coastguard Worker       }
1094*77c1e3ccSAndroid Build Coastguard Worker       return;
1095*77c1e3ccSAndroid Build Coastguard Worker     }
1096*77c1e3ccSAndroid Build Coastguard Worker 
1097*77c1e3ccSAndroid Build Coastguard Worker     a0_128 = _mm_loadu_si128((__m128i *)(above + base));
1098*77c1e3ccSAndroid Build Coastguard Worker     a1_128 = _mm_loadu_si128((__m128i *)(above + base + 1));
1099*77c1e3ccSAndroid Build Coastguard Worker 
1100*77c1e3ccSAndroid Build Coastguard Worker     if (upsample_above) {
1101*77c1e3ccSAndroid Build Coastguard Worker       a0_128 = _mm_shuffle_epi8(a0_128, *(__m128i *)HighbdEvenOddMaskx4[0]);
1102*77c1e3ccSAndroid Build Coastguard Worker       a1_128 = _mm_srli_si128(a0_128, 8);
1103*77c1e3ccSAndroid Build Coastguard Worker 
1104*77c1e3ccSAndroid Build Coastguard Worker       base_inc128 = _mm_setr_epi16(base, base + 2, base + 4, base + 6, base + 8,
1105*77c1e3ccSAndroid Build Coastguard Worker                                    base + 10, base + 12, base + 14);
1106*77c1e3ccSAndroid Build Coastguard Worker       shift = _mm256_srli_epi16(
1107*77c1e3ccSAndroid Build Coastguard Worker           _mm256_and_si256(
1108*77c1e3ccSAndroid Build Coastguard Worker               _mm256_slli_epi16(_mm256_set1_epi16(x), upsample_above),
1109*77c1e3ccSAndroid Build Coastguard Worker               _mm256_set1_epi16(0x3f)),
1110*77c1e3ccSAndroid Build Coastguard Worker           1);
1111*77c1e3ccSAndroid Build Coastguard Worker     } else {
1112*77c1e3ccSAndroid Build Coastguard Worker       base_inc128 = _mm_setr_epi16(base, base + 1, base + 2, base + 3, base + 4,
1113*77c1e3ccSAndroid Build Coastguard Worker                                    base + 5, base + 6, base + 7);
1114*77c1e3ccSAndroid Build Coastguard Worker       shift = _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
1115*77c1e3ccSAndroid Build Coastguard Worker     }
1116*77c1e3ccSAndroid Build Coastguard Worker     a0 = _mm256_castsi128_si256(a0_128);
1117*77c1e3ccSAndroid Build Coastguard Worker     a1 = _mm256_castsi128_si256(a1_128);
1118*77c1e3ccSAndroid Build Coastguard Worker     diff = _mm256_sub_epi16(a1, a0);   // a[x+1] - a[x]
1119*77c1e3ccSAndroid Build Coastguard Worker     a32 = _mm256_slli_epi16(a0, 5);    // a[x] * 32
1120*77c1e3ccSAndroid Build Coastguard Worker     a32 = _mm256_add_epi16(a32, a16);  // a[x] * 32 + 16
1121*77c1e3ccSAndroid Build Coastguard Worker 
1122*77c1e3ccSAndroid Build Coastguard Worker     b = _mm256_mullo_epi16(diff, shift);
1123*77c1e3ccSAndroid Build Coastguard Worker     res = _mm256_add_epi16(a32, b);
1124*77c1e3ccSAndroid Build Coastguard Worker     res = _mm256_srli_epi16(res, 5);
1125*77c1e3ccSAndroid Build Coastguard Worker     res1 = _mm256_castsi256_si128(res);
1126*77c1e3ccSAndroid Build Coastguard Worker 
1127*77c1e3ccSAndroid Build Coastguard Worker     mask128 = _mm_cmpgt_epi16(max_base_x128, base_inc128);
1128*77c1e3ccSAndroid Build Coastguard Worker     dst[r] = _mm_blendv_epi8(a_mbase_x, res1, mask128);
1129*77c1e3ccSAndroid Build Coastguard Worker     x += dx;
1130*77c1e3ccSAndroid Build Coastguard Worker   }
1131*77c1e3ccSAndroid Build Coastguard Worker }
1132*77c1e3ccSAndroid Build Coastguard Worker 
highbd_dr_prediction_32bit_z1_4xN_internal_avx2(int N,__m128i * dst,const uint16_t * above,int upsample_above,int dx)1133*77c1e3ccSAndroid Build Coastguard Worker static AOM_FORCE_INLINE void highbd_dr_prediction_32bit_z1_4xN_internal_avx2(
1134*77c1e3ccSAndroid Build Coastguard Worker     int N, __m128i *dst, const uint16_t *above, int upsample_above, int dx) {
1135*77c1e3ccSAndroid Build Coastguard Worker   const int frac_bits = 6 - upsample_above;
1136*77c1e3ccSAndroid Build Coastguard Worker   const int max_base_x = ((N + 4) - 1) << upsample_above;
1137*77c1e3ccSAndroid Build Coastguard Worker 
1138*77c1e3ccSAndroid Build Coastguard Worker   assert(dx > 0);
1139*77c1e3ccSAndroid Build Coastguard Worker   // pre-filter above pixels
1140*77c1e3ccSAndroid Build Coastguard Worker   // store in temp buffers:
1141*77c1e3ccSAndroid Build Coastguard Worker   //   above[x] * 32 + 16
1142*77c1e3ccSAndroid Build Coastguard Worker   //   above[x+1] - above[x]
1143*77c1e3ccSAndroid Build Coastguard Worker   // final pixels will be calculated as:
1144*77c1e3ccSAndroid Build Coastguard Worker   //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
1145*77c1e3ccSAndroid Build Coastguard Worker   __m256i a0, a1, a32, a16;
1146*77c1e3ccSAndroid Build Coastguard Worker   __m256i diff;
1147*77c1e3ccSAndroid Build Coastguard Worker   __m128i a_mbase_x, max_base_x128, base_inc128, mask128;
1148*77c1e3ccSAndroid Build Coastguard Worker 
1149*77c1e3ccSAndroid Build Coastguard Worker   a16 = _mm256_set1_epi32(16);
1150*77c1e3ccSAndroid Build Coastguard Worker   a_mbase_x = _mm_set1_epi16(above[max_base_x]);
1151*77c1e3ccSAndroid Build Coastguard Worker   max_base_x128 = _mm_set1_epi32(max_base_x);
1152*77c1e3ccSAndroid Build Coastguard Worker 
1153*77c1e3ccSAndroid Build Coastguard Worker   int x = dx;
1154*77c1e3ccSAndroid Build Coastguard Worker   for (int r = 0; r < N; r++) {
1155*77c1e3ccSAndroid Build Coastguard Worker     __m256i b, res, shift;
1156*77c1e3ccSAndroid Build Coastguard Worker     __m128i res1;
1157*77c1e3ccSAndroid Build Coastguard Worker 
1158*77c1e3ccSAndroid Build Coastguard Worker     int base = x >> frac_bits;
1159*77c1e3ccSAndroid Build Coastguard Worker     if (base >= max_base_x) {
1160*77c1e3ccSAndroid Build Coastguard Worker       for (int i = r; i < N; ++i) {
1161*77c1e3ccSAndroid Build Coastguard Worker         dst[i] = a_mbase_x;  // save 4 values
1162*77c1e3ccSAndroid Build Coastguard Worker       }
1163*77c1e3ccSAndroid Build Coastguard Worker       return;
1164*77c1e3ccSAndroid Build Coastguard Worker     }
1165*77c1e3ccSAndroid Build Coastguard Worker 
1166*77c1e3ccSAndroid Build Coastguard Worker     a0 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base)));
1167*77c1e3ccSAndroid Build Coastguard Worker     a1 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 1)));
1168*77c1e3ccSAndroid Build Coastguard Worker 
1169*77c1e3ccSAndroid Build Coastguard Worker     if (upsample_above) {
1170*77c1e3ccSAndroid Build Coastguard Worker       a0 = _mm256_permutevar8x32_epi32(
1171*77c1e3ccSAndroid Build Coastguard Worker           a0, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0));
1172*77c1e3ccSAndroid Build Coastguard Worker       a1 = _mm256_castsi128_si256(_mm256_extracti128_si256(a0, 1));
1173*77c1e3ccSAndroid Build Coastguard Worker       base_inc128 = _mm_setr_epi32(base, base + 2, base + 4, base + 6);
1174*77c1e3ccSAndroid Build Coastguard Worker       shift = _mm256_srli_epi32(
1175*77c1e3ccSAndroid Build Coastguard Worker           _mm256_and_si256(
1176*77c1e3ccSAndroid Build Coastguard Worker               _mm256_slli_epi32(_mm256_set1_epi32(x), upsample_above),
1177*77c1e3ccSAndroid Build Coastguard Worker               _mm256_set1_epi32(0x3f)),
1178*77c1e3ccSAndroid Build Coastguard Worker           1);
1179*77c1e3ccSAndroid Build Coastguard Worker     } else {
1180*77c1e3ccSAndroid Build Coastguard Worker       base_inc128 = _mm_setr_epi32(base, base + 1, base + 2, base + 3);
1181*77c1e3ccSAndroid Build Coastguard Worker       shift = _mm256_srli_epi32(
1182*77c1e3ccSAndroid Build Coastguard Worker           _mm256_and_si256(_mm256_set1_epi32(x), _mm256_set1_epi32(0x3f)), 1);
1183*77c1e3ccSAndroid Build Coastguard Worker     }
1184*77c1e3ccSAndroid Build Coastguard Worker 
1185*77c1e3ccSAndroid Build Coastguard Worker     diff = _mm256_sub_epi32(a1, a0);   // a[x+1] - a[x]
1186*77c1e3ccSAndroid Build Coastguard Worker     a32 = _mm256_slli_epi32(a0, 5);    // a[x] * 32
1187*77c1e3ccSAndroid Build Coastguard Worker     a32 = _mm256_add_epi32(a32, a16);  // a[x] * 32 + 16
1188*77c1e3ccSAndroid Build Coastguard Worker 
1189*77c1e3ccSAndroid Build Coastguard Worker     b = _mm256_mullo_epi32(diff, shift);
1190*77c1e3ccSAndroid Build Coastguard Worker     res = _mm256_add_epi32(a32, b);
1191*77c1e3ccSAndroid Build Coastguard Worker     res = _mm256_srli_epi32(res, 5);
1192*77c1e3ccSAndroid Build Coastguard Worker 
1193*77c1e3ccSAndroid Build Coastguard Worker     res1 = _mm256_castsi256_si128(res);
1194*77c1e3ccSAndroid Build Coastguard Worker     res1 = _mm_packus_epi32(res1, res1);
1195*77c1e3ccSAndroid Build Coastguard Worker 
1196*77c1e3ccSAndroid Build Coastguard Worker     mask128 = _mm_cmpgt_epi32(max_base_x128, base_inc128);
1197*77c1e3ccSAndroid Build Coastguard Worker     mask128 = _mm_packs_epi32(mask128, mask128);  // goto 16 bit
1198*77c1e3ccSAndroid Build Coastguard Worker     dst[r] = _mm_blendv_epi8(a_mbase_x, res1, mask128);
1199*77c1e3ccSAndroid Build Coastguard Worker     x += dx;
1200*77c1e3ccSAndroid Build Coastguard Worker   }
1201*77c1e3ccSAndroid Build Coastguard Worker }
1202*77c1e3ccSAndroid Build Coastguard Worker 
// Produces an N-row, 4-pixel-wide prediction and writes it to dst.
//
// The rows are first computed into a local array of 16 vectors (so N must
// not exceed 16), by the 32-bit-lane kernel when bd >= 12 and by the
// 16-bit-lane kernel otherwise. The low 64 bits (four 16-bit pixels) of each
// row vector are then stored at dst + stride * row.
static void highbd_dr_prediction_z1_4xN_avx2(int N, uint16_t *dst,
                                             ptrdiff_t stride,
                                             const uint16_t *above,
                                             int upsample_above, int dx,
                                             int bd) {
  __m128i rows[16];

  if (bd >= 12) {
    highbd_dr_prediction_32bit_z1_4xN_internal_avx2(N, rows, above,
                                                    upsample_above, dx);
  } else {
    highbd_dr_prediction_z1_4xN_internal_avx2(N, rows, above, upsample_above,
                                              dx);
  }

  for (int r = 0; r < N; ++r) {
    uint16_t *const out = dst + stride * r;
    _mm_storel_epi64((__m128i *)out, rows[r]);
  }
}
1220*77c1e3ccSAndroid Build Coastguard Worker 
highbd_dr_prediction_32bit_z1_8xN_internal_avx2(int N,__m128i * dst,const uint16_t * above,int upsample_above,int dx)1221*77c1e3ccSAndroid Build Coastguard Worker static AOM_FORCE_INLINE void highbd_dr_prediction_32bit_z1_8xN_internal_avx2(
1222*77c1e3ccSAndroid Build Coastguard Worker     int N, __m128i *dst, const uint16_t *above, int upsample_above, int dx) {
1223*77c1e3ccSAndroid Build Coastguard Worker   const int frac_bits = 6 - upsample_above;
1224*77c1e3ccSAndroid Build Coastguard Worker   const int max_base_x = ((8 + N) - 1) << upsample_above;
1225*77c1e3ccSAndroid Build Coastguard Worker 
1226*77c1e3ccSAndroid Build Coastguard Worker   assert(dx > 0);
1227*77c1e3ccSAndroid Build Coastguard Worker   // pre-filter above pixels
1228*77c1e3ccSAndroid Build Coastguard Worker   // store in temp buffers:
1229*77c1e3ccSAndroid Build Coastguard Worker   //   above[x] * 32 + 16
1230*77c1e3ccSAndroid Build Coastguard Worker   //   above[x+1] - above[x]
1231*77c1e3ccSAndroid Build Coastguard Worker   // final pixels will be calculated as:
1232*77c1e3ccSAndroid Build Coastguard Worker   //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
1233*77c1e3ccSAndroid Build Coastguard Worker   __m256i a0, a1, a0_1, a1_1, a32, a16;
1234*77c1e3ccSAndroid Build Coastguard Worker   __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
1235*77c1e3ccSAndroid Build Coastguard Worker 
1236*77c1e3ccSAndroid Build Coastguard Worker   a16 = _mm256_set1_epi32(16);
1237*77c1e3ccSAndroid Build Coastguard Worker   a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
1238*77c1e3ccSAndroid Build Coastguard Worker   max_base_x256 = _mm256_set1_epi32(max_base_x);
1239*77c1e3ccSAndroid Build Coastguard Worker 
1240*77c1e3ccSAndroid Build Coastguard Worker   int x = dx;
1241*77c1e3ccSAndroid Build Coastguard Worker   for (int r = 0; r < N; r++) {
1242*77c1e3ccSAndroid Build Coastguard Worker     __m256i b, res, res1, shift;
1243*77c1e3ccSAndroid Build Coastguard Worker 
1244*77c1e3ccSAndroid Build Coastguard Worker     int base = x >> frac_bits;
1245*77c1e3ccSAndroid Build Coastguard Worker     if (base >= max_base_x) {
1246*77c1e3ccSAndroid Build Coastguard Worker       for (int i = r; i < N; ++i) {
1247*77c1e3ccSAndroid Build Coastguard Worker         dst[i] = _mm256_castsi256_si128(a_mbase_x);  // save 8 values
1248*77c1e3ccSAndroid Build Coastguard Worker       }
1249*77c1e3ccSAndroid Build Coastguard Worker       return;
1250*77c1e3ccSAndroid Build Coastguard Worker     }
1251*77c1e3ccSAndroid Build Coastguard Worker 
1252*77c1e3ccSAndroid Build Coastguard Worker     a0 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base)));
1253*77c1e3ccSAndroid Build Coastguard Worker     a1 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 1)));
1254*77c1e3ccSAndroid Build Coastguard Worker 
1255*77c1e3ccSAndroid Build Coastguard Worker     if (upsample_above) {
1256*77c1e3ccSAndroid Build Coastguard Worker       a0 = _mm256_permutevar8x32_epi32(
1257*77c1e3ccSAndroid Build Coastguard Worker           a0, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0));
1258*77c1e3ccSAndroid Build Coastguard Worker       a1 = _mm256_castsi128_si256(_mm256_extracti128_si256(a0, 1));
1259*77c1e3ccSAndroid Build Coastguard Worker 
1260*77c1e3ccSAndroid Build Coastguard Worker       a0_1 =
1261*77c1e3ccSAndroid Build Coastguard Worker           _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 8)));
1262*77c1e3ccSAndroid Build Coastguard Worker       a0_1 = _mm256_permutevar8x32_epi32(
1263*77c1e3ccSAndroid Build Coastguard Worker           a0_1, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0));
1264*77c1e3ccSAndroid Build Coastguard Worker       a1_1 = _mm256_castsi128_si256(_mm256_extracti128_si256(a0_1, 1));
1265*77c1e3ccSAndroid Build Coastguard Worker 
1266*77c1e3ccSAndroid Build Coastguard Worker       a0 = _mm256_inserti128_si256(a0, _mm256_castsi256_si128(a0_1), 1);
1267*77c1e3ccSAndroid Build Coastguard Worker       a1 = _mm256_inserti128_si256(a1, _mm256_castsi256_si128(a1_1), 1);
1268*77c1e3ccSAndroid Build Coastguard Worker       base_inc256 =
1269*77c1e3ccSAndroid Build Coastguard Worker           _mm256_setr_epi32(base, base + 2, base + 4, base + 6, base + 8,
1270*77c1e3ccSAndroid Build Coastguard Worker                             base + 10, base + 12, base + 14);
1271*77c1e3ccSAndroid Build Coastguard Worker       shift = _mm256_srli_epi32(
1272*77c1e3ccSAndroid Build Coastguard Worker           _mm256_and_si256(
1273*77c1e3ccSAndroid Build Coastguard Worker               _mm256_slli_epi32(_mm256_set1_epi32(x), upsample_above),
1274*77c1e3ccSAndroid Build Coastguard Worker               _mm256_set1_epi32(0x3f)),
1275*77c1e3ccSAndroid Build Coastguard Worker           1);
1276*77c1e3ccSAndroid Build Coastguard Worker     } else {
1277*77c1e3ccSAndroid Build Coastguard Worker       base_inc256 = _mm256_setr_epi32(base, base + 1, base + 2, base + 3,
1278*77c1e3ccSAndroid Build Coastguard Worker                                       base + 4, base + 5, base + 6, base + 7);
1279*77c1e3ccSAndroid Build Coastguard Worker       shift = _mm256_srli_epi32(
1280*77c1e3ccSAndroid Build Coastguard Worker           _mm256_and_si256(_mm256_set1_epi32(x), _mm256_set1_epi32(0x3f)), 1);
1281*77c1e3ccSAndroid Build Coastguard Worker     }
1282*77c1e3ccSAndroid Build Coastguard Worker 
1283*77c1e3ccSAndroid Build Coastguard Worker     diff = _mm256_sub_epi32(a1, a0);   // a[x+1] - a[x]
1284*77c1e3ccSAndroid Build Coastguard Worker     a32 = _mm256_slli_epi32(a0, 5);    // a[x] * 32
1285*77c1e3ccSAndroid Build Coastguard Worker     a32 = _mm256_add_epi32(a32, a16);  // a[x] * 32 + 16
1286*77c1e3ccSAndroid Build Coastguard Worker 
1287*77c1e3ccSAndroid Build Coastguard Worker     b = _mm256_mullo_epi32(diff, shift);
1288*77c1e3ccSAndroid Build Coastguard Worker     res = _mm256_add_epi32(a32, b);
1289*77c1e3ccSAndroid Build Coastguard Worker     res = _mm256_srli_epi32(res, 5);
1290*77c1e3ccSAndroid Build Coastguard Worker 
1291*77c1e3ccSAndroid Build Coastguard Worker     res1 = _mm256_packus_epi32(
1292*77c1e3ccSAndroid Build Coastguard Worker         res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1)));
1293*77c1e3ccSAndroid Build Coastguard Worker 
1294*77c1e3ccSAndroid Build Coastguard Worker     mask256 = _mm256_cmpgt_epi32(max_base_x256, base_inc256);
1295*77c1e3ccSAndroid Build Coastguard Worker     mask256 = _mm256_packs_epi32(
1296*77c1e3ccSAndroid Build Coastguard Worker         mask256, _mm256_castsi128_si256(
1297*77c1e3ccSAndroid Build Coastguard Worker                      _mm256_extracti128_si256(mask256, 1)));  // goto 16 bit
1298*77c1e3ccSAndroid Build Coastguard Worker     res1 = _mm256_blendv_epi8(a_mbase_x, res1, mask256);
1299*77c1e3ccSAndroid Build Coastguard Worker     dst[r] = _mm256_castsi256_si128(res1);
1300*77c1e3ccSAndroid Build Coastguard Worker     x += dx;
1301*77c1e3ccSAndroid Build Coastguard Worker   }
1302*77c1e3ccSAndroid Build Coastguard Worker }
1303*77c1e3ccSAndroid Build Coastguard Worker 
highbd_dr_prediction_z1_8xN_internal_avx2(int N,__m128i * dst,const uint16_t * above,int upsample_above,int dx)1304*77c1e3ccSAndroid Build Coastguard Worker static AOM_FORCE_INLINE void highbd_dr_prediction_z1_8xN_internal_avx2(
1305*77c1e3ccSAndroid Build Coastguard Worker     int N, __m128i *dst, const uint16_t *above, int upsample_above, int dx) {
1306*77c1e3ccSAndroid Build Coastguard Worker   const int frac_bits = 6 - upsample_above;
1307*77c1e3ccSAndroid Build Coastguard Worker   const int max_base_x = ((8 + N) - 1) << upsample_above;
1308*77c1e3ccSAndroid Build Coastguard Worker 
1309*77c1e3ccSAndroid Build Coastguard Worker   assert(dx > 0);
1310*77c1e3ccSAndroid Build Coastguard Worker   // pre-filter above pixels
1311*77c1e3ccSAndroid Build Coastguard Worker   // store in temp buffers:
1312*77c1e3ccSAndroid Build Coastguard Worker   //   above[x] * 32 + 16
1313*77c1e3ccSAndroid Build Coastguard Worker   //   above[x+1] - above[x]
1314*77c1e3ccSAndroid Build Coastguard Worker   // final pixels will be calculated as:
1315*77c1e3ccSAndroid Build Coastguard Worker   //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
1316*77c1e3ccSAndroid Build Coastguard Worker   __m256i a0, a1, a32, a16, c3f;
1317*77c1e3ccSAndroid Build Coastguard Worker   __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
1318*77c1e3ccSAndroid Build Coastguard Worker   __m128i a0_x128, a1_x128;
1319*77c1e3ccSAndroid Build Coastguard Worker 
1320*77c1e3ccSAndroid Build Coastguard Worker   a16 = _mm256_set1_epi16(16);
1321*77c1e3ccSAndroid Build Coastguard Worker   a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
1322*77c1e3ccSAndroid Build Coastguard Worker   max_base_x256 = _mm256_set1_epi16(max_base_x);
1323*77c1e3ccSAndroid Build Coastguard Worker   c3f = _mm256_set1_epi16(0x3f);
1324*77c1e3ccSAndroid Build Coastguard Worker 
1325*77c1e3ccSAndroid Build Coastguard Worker   int x = dx;
1326*77c1e3ccSAndroid Build Coastguard Worker   for (int r = 0; r < N; r++) {
1327*77c1e3ccSAndroid Build Coastguard Worker     __m256i b, res, res1, shift;
1328*77c1e3ccSAndroid Build Coastguard Worker 
1329*77c1e3ccSAndroid Build Coastguard Worker     int base = x >> frac_bits;
1330*77c1e3ccSAndroid Build Coastguard Worker     if (base >= max_base_x) {
1331*77c1e3ccSAndroid Build Coastguard Worker       for (int i = r; i < N; ++i) {
1332*77c1e3ccSAndroid Build Coastguard Worker         dst[i] = _mm256_castsi256_si128(a_mbase_x);  // save 8 values
1333*77c1e3ccSAndroid Build Coastguard Worker       }
1334*77c1e3ccSAndroid Build Coastguard Worker       return;
1335*77c1e3ccSAndroid Build Coastguard Worker     }
1336*77c1e3ccSAndroid Build Coastguard Worker 
1337*77c1e3ccSAndroid Build Coastguard Worker     a0_x128 = _mm_loadu_si128((__m128i *)(above + base));
1338*77c1e3ccSAndroid Build Coastguard Worker     if (upsample_above) {
1339*77c1e3ccSAndroid Build Coastguard Worker       __m128i mask, atmp0, atmp1, atmp2, atmp3;
1340*77c1e3ccSAndroid Build Coastguard Worker       a1_x128 = _mm_loadu_si128((__m128i *)(above + base + 8));
1341*77c1e3ccSAndroid Build Coastguard Worker       atmp0 = _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdEvenOddMaskx[0]);
1342*77c1e3ccSAndroid Build Coastguard Worker       atmp1 = _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdEvenOddMaskx[0]);
1343*77c1e3ccSAndroid Build Coastguard Worker       atmp2 =
1344*77c1e3ccSAndroid Build Coastguard Worker           _mm_shuffle_epi8(a0_x128, *(__m128i *)(HighbdEvenOddMaskx[0] + 16));
1345*77c1e3ccSAndroid Build Coastguard Worker       atmp3 =
1346*77c1e3ccSAndroid Build Coastguard Worker           _mm_shuffle_epi8(a1_x128, *(__m128i *)(HighbdEvenOddMaskx[0] + 16));
1347*77c1e3ccSAndroid Build Coastguard Worker       mask =
1348*77c1e3ccSAndroid Build Coastguard Worker           _mm_cmpgt_epi8(*(__m128i *)HighbdEvenOddMaskx[0], _mm_set1_epi8(15));
1349*77c1e3ccSAndroid Build Coastguard Worker       a0_x128 = _mm_blendv_epi8(atmp0, atmp1, mask);
1350*77c1e3ccSAndroid Build Coastguard Worker       mask = _mm_cmpgt_epi8(*(__m128i *)(HighbdEvenOddMaskx[0] + 16),
1351*77c1e3ccSAndroid Build Coastguard Worker                             _mm_set1_epi8(15));
1352*77c1e3ccSAndroid Build Coastguard Worker       a1_x128 = _mm_blendv_epi8(atmp2, atmp3, mask);
1353*77c1e3ccSAndroid Build Coastguard Worker 
1354*77c1e3ccSAndroid Build Coastguard Worker       base_inc256 = _mm256_setr_epi16(base, base + 2, base + 4, base + 6,
1355*77c1e3ccSAndroid Build Coastguard Worker                                       base + 8, base + 10, base + 12, base + 14,
1356*77c1e3ccSAndroid Build Coastguard Worker                                       0, 0, 0, 0, 0, 0, 0, 0);
1357*77c1e3ccSAndroid Build Coastguard Worker       shift = _mm256_srli_epi16(
1358*77c1e3ccSAndroid Build Coastguard Worker           _mm256_and_si256(
1359*77c1e3ccSAndroid Build Coastguard Worker               _mm256_slli_epi16(_mm256_set1_epi16(x), upsample_above), c3f),
1360*77c1e3ccSAndroid Build Coastguard Worker           1);
1361*77c1e3ccSAndroid Build Coastguard Worker     } else {
1362*77c1e3ccSAndroid Build Coastguard Worker       a1_x128 = _mm_loadu_si128((__m128i *)(above + base + 1));
1363*77c1e3ccSAndroid Build Coastguard Worker       base_inc256 = _mm256_setr_epi16(base, base + 1, base + 2, base + 3,
1364*77c1e3ccSAndroid Build Coastguard Worker                                       base + 4, base + 5, base + 6, base + 7, 0,
1365*77c1e3ccSAndroid Build Coastguard Worker                                       0, 0, 0, 0, 0, 0, 0);
1366*77c1e3ccSAndroid Build Coastguard Worker       shift = _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
1367*77c1e3ccSAndroid Build Coastguard Worker     }
1368*77c1e3ccSAndroid Build Coastguard Worker     a0 = _mm256_castsi128_si256(a0_x128);
1369*77c1e3ccSAndroid Build Coastguard Worker     a1 = _mm256_castsi128_si256(a1_x128);
1370*77c1e3ccSAndroid Build Coastguard Worker 
1371*77c1e3ccSAndroid Build Coastguard Worker     diff = _mm256_sub_epi16(a1, a0);   // a[x+1] - a[x]
1372*77c1e3ccSAndroid Build Coastguard Worker     a32 = _mm256_slli_epi16(a0, 5);    // a[x] * 32
1373*77c1e3ccSAndroid Build Coastguard Worker     a32 = _mm256_add_epi16(a32, a16);  // a[x] * 32 + 16
1374*77c1e3ccSAndroid Build Coastguard Worker 
1375*77c1e3ccSAndroid Build Coastguard Worker     b = _mm256_mullo_epi16(diff, shift);
1376*77c1e3ccSAndroid Build Coastguard Worker     res = _mm256_add_epi16(a32, b);
1377*77c1e3ccSAndroid Build Coastguard Worker     res = _mm256_srli_epi16(res, 5);
1378*77c1e3ccSAndroid Build Coastguard Worker 
1379*77c1e3ccSAndroid Build Coastguard Worker     mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256);
1380*77c1e3ccSAndroid Build Coastguard Worker     res1 = _mm256_blendv_epi8(a_mbase_x, res, mask256);
1381*77c1e3ccSAndroid Build Coastguard Worker     dst[r] = _mm256_castsi256_si128(res1);
1382*77c1e3ccSAndroid Build Coastguard Worker     x += dx;
1383*77c1e3ccSAndroid Build Coastguard Worker   }
1384*77c1e3ccSAndroid Build Coastguard Worker }
1385*77c1e3ccSAndroid Build Coastguard Worker 
// Produces an N-row, 8-pixel-wide prediction and writes it to dst.
//
// The rows are first computed into a local array of 32 vectors (so N must
// not exceed 32), by the 32-bit-lane kernel when bd >= 12 and by the
// 16-bit-lane kernel otherwise. Each 128-bit row vector (eight 16-bit
// pixels) is then stored, unaligned, at dst + stride * row.
static void highbd_dr_prediction_z1_8xN_avx2(int N, uint16_t *dst,
                                             ptrdiff_t stride,
                                             const uint16_t *above,
                                             int upsample_above, int dx,
                                             int bd) {
  __m128i rows[32];

  if (bd >= 12) {
    highbd_dr_prediction_32bit_z1_8xN_internal_avx2(N, rows, above,
                                                    upsample_above, dx);
  } else {
    highbd_dr_prediction_z1_8xN_internal_avx2(N, rows, above, upsample_above,
                                              dx);
  }

  for (int r = 0; r < N; ++r) {
    uint16_t *const out = dst + stride * r;
    _mm_storeu_si128((__m128i *)out, rows[r]);
  }
}
1403*77c1e3ccSAndroid Build Coastguard Worker 
highbd_dr_prediction_32bit_z1_16xN_internal_avx2(int N,__m256i * dstvec,const uint16_t * above,int upsample_above,int dx)1404*77c1e3ccSAndroid Build Coastguard Worker static AOM_FORCE_INLINE void highbd_dr_prediction_32bit_z1_16xN_internal_avx2(
1405*77c1e3ccSAndroid Build Coastguard Worker     int N, __m256i *dstvec, const uint16_t *above, int upsample_above, int dx) {
1406*77c1e3ccSAndroid Build Coastguard Worker   // here upsample_above is 0 by design of av1_use_intra_edge_upsample
1407*77c1e3ccSAndroid Build Coastguard Worker   (void)upsample_above;
1408*77c1e3ccSAndroid Build Coastguard Worker   const int frac_bits = 6;
1409*77c1e3ccSAndroid Build Coastguard Worker   const int max_base_x = ((16 + N) - 1);
1410*77c1e3ccSAndroid Build Coastguard Worker 
1411*77c1e3ccSAndroid Build Coastguard Worker   // pre-filter above pixels
1412*77c1e3ccSAndroid Build Coastguard Worker   // store in temp buffers:
1413*77c1e3ccSAndroid Build Coastguard Worker   //   above[x] * 32 + 16
1414*77c1e3ccSAndroid Build Coastguard Worker   //   above[x+1] - above[x]
1415*77c1e3ccSAndroid Build Coastguard Worker   // final pixels will be calculated as:
1416*77c1e3ccSAndroid Build Coastguard Worker   //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
1417*77c1e3ccSAndroid Build Coastguard Worker   __m256i a0, a0_1, a1, a1_1, a32, a16;
1418*77c1e3ccSAndroid Build Coastguard Worker   __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
1419*77c1e3ccSAndroid Build Coastguard Worker 
1420*77c1e3ccSAndroid Build Coastguard Worker   a16 = _mm256_set1_epi32(16);
1421*77c1e3ccSAndroid Build Coastguard Worker   a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
1422*77c1e3ccSAndroid Build Coastguard Worker   max_base_x256 = _mm256_set1_epi16(max_base_x);
1423*77c1e3ccSAndroid Build Coastguard Worker 
1424*77c1e3ccSAndroid Build Coastguard Worker   int x = dx;
1425*77c1e3ccSAndroid Build Coastguard Worker   for (int r = 0; r < N; r++) {
1426*77c1e3ccSAndroid Build Coastguard Worker     __m256i b, res[2], res1;
1427*77c1e3ccSAndroid Build Coastguard Worker 
1428*77c1e3ccSAndroid Build Coastguard Worker     int base = x >> frac_bits;
1429*77c1e3ccSAndroid Build Coastguard Worker     if (base >= max_base_x) {
1430*77c1e3ccSAndroid Build Coastguard Worker       for (int i = r; i < N; ++i) {
1431*77c1e3ccSAndroid Build Coastguard Worker         dstvec[i] = a_mbase_x;  // save 16 values
1432*77c1e3ccSAndroid Build Coastguard Worker       }
1433*77c1e3ccSAndroid Build Coastguard Worker       return;
1434*77c1e3ccSAndroid Build Coastguard Worker     }
1435*77c1e3ccSAndroid Build Coastguard Worker     __m256i shift = _mm256_srli_epi32(
1436*77c1e3ccSAndroid Build Coastguard Worker         _mm256_and_si256(_mm256_set1_epi32(x), _mm256_set1_epi32(0x3f)), 1);
1437*77c1e3ccSAndroid Build Coastguard Worker 
1438*77c1e3ccSAndroid Build Coastguard Worker     a0 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base)));
1439*77c1e3ccSAndroid Build Coastguard Worker     a1 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 1)));
1440*77c1e3ccSAndroid Build Coastguard Worker 
1441*77c1e3ccSAndroid Build Coastguard Worker     diff = _mm256_sub_epi32(a1, a0);   // a[x+1] - a[x]
1442*77c1e3ccSAndroid Build Coastguard Worker     a32 = _mm256_slli_epi32(a0, 5);    // a[x] * 32
1443*77c1e3ccSAndroid Build Coastguard Worker     a32 = _mm256_add_epi32(a32, a16);  // a[x] * 32 + 16
1444*77c1e3ccSAndroid Build Coastguard Worker     b = _mm256_mullo_epi32(diff, shift);
1445*77c1e3ccSAndroid Build Coastguard Worker 
1446*77c1e3ccSAndroid Build Coastguard Worker     res[0] = _mm256_add_epi32(a32, b);
1447*77c1e3ccSAndroid Build Coastguard Worker     res[0] = _mm256_srli_epi32(res[0], 5);
1448*77c1e3ccSAndroid Build Coastguard Worker     res[0] = _mm256_packus_epi32(
1449*77c1e3ccSAndroid Build Coastguard Worker         res[0], _mm256_castsi128_si256(_mm256_extracti128_si256(res[0], 1)));
1450*77c1e3ccSAndroid Build Coastguard Worker 
1451*77c1e3ccSAndroid Build Coastguard Worker     int mdif = max_base_x - base;
1452*77c1e3ccSAndroid Build Coastguard Worker     if (mdif > 8) {
1453*77c1e3ccSAndroid Build Coastguard Worker       a0_1 =
1454*77c1e3ccSAndroid Build Coastguard Worker           _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 8)));
1455*77c1e3ccSAndroid Build Coastguard Worker       a1_1 =
1456*77c1e3ccSAndroid Build Coastguard Worker           _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 9)));
1457*77c1e3ccSAndroid Build Coastguard Worker 
1458*77c1e3ccSAndroid Build Coastguard Worker       diff = _mm256_sub_epi32(a1_1, a0_1);  // a[x+1] - a[x]
1459*77c1e3ccSAndroid Build Coastguard Worker       a32 = _mm256_slli_epi32(a0_1, 5);     // a[x] * 32
1460*77c1e3ccSAndroid Build Coastguard Worker       a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
1461*77c1e3ccSAndroid Build Coastguard Worker       b = _mm256_mullo_epi32(diff, shift);
1462*77c1e3ccSAndroid Build Coastguard Worker 
1463*77c1e3ccSAndroid Build Coastguard Worker       res[1] = _mm256_add_epi32(a32, b);
1464*77c1e3ccSAndroid Build Coastguard Worker       res[1] = _mm256_srli_epi32(res[1], 5);
1465*77c1e3ccSAndroid Build Coastguard Worker       res[1] = _mm256_packus_epi32(
1466*77c1e3ccSAndroid Build Coastguard Worker           res[1], _mm256_castsi128_si256(_mm256_extracti128_si256(res[1], 1)));
1467*77c1e3ccSAndroid Build Coastguard Worker     } else {
1468*77c1e3ccSAndroid Build Coastguard Worker       res[1] = a_mbase_x;
1469*77c1e3ccSAndroid Build Coastguard Worker     }
1470*77c1e3ccSAndroid Build Coastguard Worker     res1 = _mm256_inserti128_si256(res[0], _mm256_castsi256_si128(res[1]),
1471*77c1e3ccSAndroid Build Coastguard Worker                                    1);  // 16 16bit values
1472*77c1e3ccSAndroid Build Coastguard Worker 
1473*77c1e3ccSAndroid Build Coastguard Worker     base_inc256 = _mm256_setr_epi16(base, base + 1, base + 2, base + 3,
1474*77c1e3ccSAndroid Build Coastguard Worker                                     base + 4, base + 5, base + 6, base + 7,
1475*77c1e3ccSAndroid Build Coastguard Worker                                     base + 8, base + 9, base + 10, base + 11,
1476*77c1e3ccSAndroid Build Coastguard Worker                                     base + 12, base + 13, base + 14, base + 15);
1477*77c1e3ccSAndroid Build Coastguard Worker     mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256);
1478*77c1e3ccSAndroid Build Coastguard Worker     dstvec[r] = _mm256_blendv_epi8(a_mbase_x, res1, mask256);
1479*77c1e3ccSAndroid Build Coastguard Worker     x += dx;
1480*77c1e3ccSAndroid Build Coastguard Worker   }
1481*77c1e3ccSAndroid Build Coastguard Worker }
1482*77c1e3ccSAndroid Build Coastguard Worker 
highbd_dr_prediction_z1_16xN_internal_avx2(int N,__m256i * dstvec,const uint16_t * above,int upsample_above,int dx)1483*77c1e3ccSAndroid Build Coastguard Worker static AOM_FORCE_INLINE void highbd_dr_prediction_z1_16xN_internal_avx2(
1484*77c1e3ccSAndroid Build Coastguard Worker     int N, __m256i *dstvec, const uint16_t *above, int upsample_above, int dx) {
1485*77c1e3ccSAndroid Build Coastguard Worker   // here upsample_above is 0 by design of av1_use_intra_edge_upsample
1486*77c1e3ccSAndroid Build Coastguard Worker   (void)upsample_above;
1487*77c1e3ccSAndroid Build Coastguard Worker   const int frac_bits = 6;
1488*77c1e3ccSAndroid Build Coastguard Worker   const int max_base_x = ((16 + N) - 1);
1489*77c1e3ccSAndroid Build Coastguard Worker 
1490*77c1e3ccSAndroid Build Coastguard Worker   // pre-filter above pixels
1491*77c1e3ccSAndroid Build Coastguard Worker   // store in temp buffers:
1492*77c1e3ccSAndroid Build Coastguard Worker   //   above[x] * 32 + 16
1493*77c1e3ccSAndroid Build Coastguard Worker   //   above[x+1] - above[x]
1494*77c1e3ccSAndroid Build Coastguard Worker   // final pixels will be calculated as:
1495*77c1e3ccSAndroid Build Coastguard Worker   //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
1496*77c1e3ccSAndroid Build Coastguard Worker   __m256i a0, a1, a32, a16, c3f;
1497*77c1e3ccSAndroid Build Coastguard Worker   __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
1498*77c1e3ccSAndroid Build Coastguard Worker 
1499*77c1e3ccSAndroid Build Coastguard Worker   a16 = _mm256_set1_epi16(16);
1500*77c1e3ccSAndroid Build Coastguard Worker   a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
1501*77c1e3ccSAndroid Build Coastguard Worker   max_base_x256 = _mm256_set1_epi16(max_base_x);
1502*77c1e3ccSAndroid Build Coastguard Worker   c3f = _mm256_set1_epi16(0x3f);
1503*77c1e3ccSAndroid Build Coastguard Worker 
1504*77c1e3ccSAndroid Build Coastguard Worker   int x = dx;
1505*77c1e3ccSAndroid Build Coastguard Worker   for (int r = 0; r < N; r++) {
1506*77c1e3ccSAndroid Build Coastguard Worker     __m256i b, res;
1507*77c1e3ccSAndroid Build Coastguard Worker 
1508*77c1e3ccSAndroid Build Coastguard Worker     int base = x >> frac_bits;
1509*77c1e3ccSAndroid Build Coastguard Worker     if (base >= max_base_x) {
1510*77c1e3ccSAndroid Build Coastguard Worker       for (int i = r; i < N; ++i) {
1511*77c1e3ccSAndroid Build Coastguard Worker         dstvec[i] = a_mbase_x;  // save 16 values
1512*77c1e3ccSAndroid Build Coastguard Worker       }
1513*77c1e3ccSAndroid Build Coastguard Worker       return;
1514*77c1e3ccSAndroid Build Coastguard Worker     }
1515*77c1e3ccSAndroid Build Coastguard Worker     __m256i shift =
1516*77c1e3ccSAndroid Build Coastguard Worker         _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
1517*77c1e3ccSAndroid Build Coastguard Worker 
1518*77c1e3ccSAndroid Build Coastguard Worker     a0 = _mm256_loadu_si256((__m256i *)(above + base));
1519*77c1e3ccSAndroid Build Coastguard Worker     a1 = _mm256_loadu_si256((__m256i *)(above + base + 1));
1520*77c1e3ccSAndroid Build Coastguard Worker 
1521*77c1e3ccSAndroid Build Coastguard Worker     diff = _mm256_sub_epi16(a1, a0);   // a[x+1] - a[x]
1522*77c1e3ccSAndroid Build Coastguard Worker     a32 = _mm256_slli_epi16(a0, 5);    // a[x] * 32
1523*77c1e3ccSAndroid Build Coastguard Worker     a32 = _mm256_add_epi16(a32, a16);  // a[x] * 32 + 16
1524*77c1e3ccSAndroid Build Coastguard Worker     b = _mm256_mullo_epi16(diff, shift);
1525*77c1e3ccSAndroid Build Coastguard Worker 
1526*77c1e3ccSAndroid Build Coastguard Worker     res = _mm256_add_epi16(a32, b);
1527*77c1e3ccSAndroid Build Coastguard Worker     res = _mm256_srli_epi16(res, 5);  // 16 16bit values
1528*77c1e3ccSAndroid Build Coastguard Worker 
1529*77c1e3ccSAndroid Build Coastguard Worker     base_inc256 = _mm256_setr_epi16(base, base + 1, base + 2, base + 3,
1530*77c1e3ccSAndroid Build Coastguard Worker                                     base + 4, base + 5, base + 6, base + 7,
1531*77c1e3ccSAndroid Build Coastguard Worker                                     base + 8, base + 9, base + 10, base + 11,
1532*77c1e3ccSAndroid Build Coastguard Worker                                     base + 12, base + 13, base + 14, base + 15);
1533*77c1e3ccSAndroid Build Coastguard Worker     mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256);
1534*77c1e3ccSAndroid Build Coastguard Worker     dstvec[r] = _mm256_blendv_epi8(a_mbase_x, res, mask256);
1535*77c1e3ccSAndroid Build Coastguard Worker     x += dx;
1536*77c1e3ccSAndroid Build Coastguard Worker   }
1537*77c1e3ccSAndroid Build Coastguard Worker }
1538*77c1e3ccSAndroid Build Coastguard Worker 
// Zone-1 directional prediction of a 16xN high-bitdepth block.
//
// Builds the N rows into a local vector array using the 32-bit kernel when
// bd >= 12 and the 16-bit kernel otherwise, then stores one 16-pixel row per
// `stride` elements starting at dst. The local array holds 64 rows.
static void highbd_dr_prediction_z1_16xN_avx2(int N, uint16_t *dst,
                                              ptrdiff_t stride,
                                              const uint16_t *above,
                                              int upsample_above, int dx,
                                              int bd) {
  __m256i rows[64];

  if (bd >= 12) {
    highbd_dr_prediction_32bit_z1_16xN_internal_avx2(N, rows, above,
                                                     upsample_above, dx);
  } else {
    highbd_dr_prediction_z1_16xN_internal_avx2(N, rows, above, upsample_above,
                                               dx);
  }

  for (int i = 0; i < N; ++i) {
    uint16_t *const row = dst + stride * i;
    _mm256_storeu_si256((__m256i *)row, rows[i]);
  }
}
1556*77c1e3ccSAndroid Build Coastguard Worker 
highbd_dr_prediction_32bit_z1_32xN_internal_avx2(int N,__m256i * dstvec,const uint16_t * above,int upsample_above,int dx)1557*77c1e3ccSAndroid Build Coastguard Worker static AOM_FORCE_INLINE void highbd_dr_prediction_32bit_z1_32xN_internal_avx2(
1558*77c1e3ccSAndroid Build Coastguard Worker     int N, __m256i *dstvec, const uint16_t *above, int upsample_above, int dx) {
1559*77c1e3ccSAndroid Build Coastguard Worker   // here upsample_above is 0 by design of av1_use_intra_edge_upsample
1560*77c1e3ccSAndroid Build Coastguard Worker   (void)upsample_above;
1561*77c1e3ccSAndroid Build Coastguard Worker   const int frac_bits = 6;
1562*77c1e3ccSAndroid Build Coastguard Worker   const int max_base_x = ((32 + N) - 1);
1563*77c1e3ccSAndroid Build Coastguard Worker 
1564*77c1e3ccSAndroid Build Coastguard Worker   // pre-filter above pixels
1565*77c1e3ccSAndroid Build Coastguard Worker   // store in temp buffers:
1566*77c1e3ccSAndroid Build Coastguard Worker   //   above[x] * 32 + 16
1567*77c1e3ccSAndroid Build Coastguard Worker   //   above[x+1] - above[x]
1568*77c1e3ccSAndroid Build Coastguard Worker   // final pixels will be calculated as:
1569*77c1e3ccSAndroid Build Coastguard Worker   //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
1570*77c1e3ccSAndroid Build Coastguard Worker   __m256i a0, a0_1, a1, a1_1, a32, a16, c3f;
1571*77c1e3ccSAndroid Build Coastguard Worker   __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
1572*77c1e3ccSAndroid Build Coastguard Worker 
1573*77c1e3ccSAndroid Build Coastguard Worker   a16 = _mm256_set1_epi32(16);
1574*77c1e3ccSAndroid Build Coastguard Worker   a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
1575*77c1e3ccSAndroid Build Coastguard Worker   max_base_x256 = _mm256_set1_epi16(max_base_x);
1576*77c1e3ccSAndroid Build Coastguard Worker   c3f = _mm256_set1_epi16(0x3f);
1577*77c1e3ccSAndroid Build Coastguard Worker 
1578*77c1e3ccSAndroid Build Coastguard Worker   int x = dx;
1579*77c1e3ccSAndroid Build Coastguard Worker   for (int r = 0; r < N; r++) {
1580*77c1e3ccSAndroid Build Coastguard Worker     __m256i b, res[2], res1;
1581*77c1e3ccSAndroid Build Coastguard Worker 
1582*77c1e3ccSAndroid Build Coastguard Worker     int base = x >> frac_bits;
1583*77c1e3ccSAndroid Build Coastguard Worker     if (base >= max_base_x) {
1584*77c1e3ccSAndroid Build Coastguard Worker       for (int i = r; i < N; ++i) {
1585*77c1e3ccSAndroid Build Coastguard Worker         dstvec[i] = a_mbase_x;  // save 32 values
1586*77c1e3ccSAndroid Build Coastguard Worker         dstvec[i + N] = a_mbase_x;
1587*77c1e3ccSAndroid Build Coastguard Worker       }
1588*77c1e3ccSAndroid Build Coastguard Worker       return;
1589*77c1e3ccSAndroid Build Coastguard Worker     }
1590*77c1e3ccSAndroid Build Coastguard Worker 
1591*77c1e3ccSAndroid Build Coastguard Worker     __m256i shift =
1592*77c1e3ccSAndroid Build Coastguard Worker         _mm256_srli_epi32(_mm256_and_si256(_mm256_set1_epi32(x), c3f), 1);
1593*77c1e3ccSAndroid Build Coastguard Worker 
1594*77c1e3ccSAndroid Build Coastguard Worker     for (int j = 0; j < 32; j += 16) {
1595*77c1e3ccSAndroid Build Coastguard Worker       int mdif = max_base_x - (base + j);
1596*77c1e3ccSAndroid Build Coastguard Worker       if (mdif <= 0) {
1597*77c1e3ccSAndroid Build Coastguard Worker         res1 = a_mbase_x;
1598*77c1e3ccSAndroid Build Coastguard Worker       } else {
1599*77c1e3ccSAndroid Build Coastguard Worker         a0 = _mm256_cvtepu16_epi32(
1600*77c1e3ccSAndroid Build Coastguard Worker             _mm_loadu_si128((__m128i *)(above + base + j)));
1601*77c1e3ccSAndroid Build Coastguard Worker         a1 = _mm256_cvtepu16_epi32(
1602*77c1e3ccSAndroid Build Coastguard Worker             _mm_loadu_si128((__m128i *)(above + base + 1 + j)));
1603*77c1e3ccSAndroid Build Coastguard Worker 
1604*77c1e3ccSAndroid Build Coastguard Worker         diff = _mm256_sub_epi32(a1, a0);   // a[x+1] - a[x]
1605*77c1e3ccSAndroid Build Coastguard Worker         a32 = _mm256_slli_epi32(a0, 5);    // a[x] * 32
1606*77c1e3ccSAndroid Build Coastguard Worker         a32 = _mm256_add_epi32(a32, a16);  // a[x] * 32 + 16
1607*77c1e3ccSAndroid Build Coastguard Worker         b = _mm256_mullo_epi32(diff, shift);
1608*77c1e3ccSAndroid Build Coastguard Worker 
1609*77c1e3ccSAndroid Build Coastguard Worker         res[0] = _mm256_add_epi32(a32, b);
1610*77c1e3ccSAndroid Build Coastguard Worker         res[0] = _mm256_srli_epi32(res[0], 5);
1611*77c1e3ccSAndroid Build Coastguard Worker         res[0] = _mm256_packus_epi32(
1612*77c1e3ccSAndroid Build Coastguard Worker             res[0],
1613*77c1e3ccSAndroid Build Coastguard Worker             _mm256_castsi128_si256(_mm256_extracti128_si256(res[0], 1)));
1614*77c1e3ccSAndroid Build Coastguard Worker         if (mdif > 8) {
1615*77c1e3ccSAndroid Build Coastguard Worker           a0_1 = _mm256_cvtepu16_epi32(
1616*77c1e3ccSAndroid Build Coastguard Worker               _mm_loadu_si128((__m128i *)(above + base + 8 + j)));
1617*77c1e3ccSAndroid Build Coastguard Worker           a1_1 = _mm256_cvtepu16_epi32(
1618*77c1e3ccSAndroid Build Coastguard Worker               _mm_loadu_si128((__m128i *)(above + base + 9 + j)));
1619*77c1e3ccSAndroid Build Coastguard Worker 
1620*77c1e3ccSAndroid Build Coastguard Worker           diff = _mm256_sub_epi32(a1_1, a0_1);  // a[x+1] - a[x]
1621*77c1e3ccSAndroid Build Coastguard Worker           a32 = _mm256_slli_epi32(a0_1, 5);     // a[x] * 32
1622*77c1e3ccSAndroid Build Coastguard Worker           a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
1623*77c1e3ccSAndroid Build Coastguard Worker           b = _mm256_mullo_epi32(diff, shift);
1624*77c1e3ccSAndroid Build Coastguard Worker 
1625*77c1e3ccSAndroid Build Coastguard Worker           res[1] = _mm256_add_epi32(a32, b);
1626*77c1e3ccSAndroid Build Coastguard Worker           res[1] = _mm256_srli_epi32(res[1], 5);
1627*77c1e3ccSAndroid Build Coastguard Worker           res[1] = _mm256_packus_epi32(
1628*77c1e3ccSAndroid Build Coastguard Worker               res[1],
1629*77c1e3ccSAndroid Build Coastguard Worker               _mm256_castsi128_si256(_mm256_extracti128_si256(res[1], 1)));
1630*77c1e3ccSAndroid Build Coastguard Worker         } else {
1631*77c1e3ccSAndroid Build Coastguard Worker           res[1] = a_mbase_x;
1632*77c1e3ccSAndroid Build Coastguard Worker         }
1633*77c1e3ccSAndroid Build Coastguard Worker         res1 = _mm256_inserti128_si256(res[0], _mm256_castsi256_si128(res[1]),
1634*77c1e3ccSAndroid Build Coastguard Worker                                        1);  // 16 16bit values
1635*77c1e3ccSAndroid Build Coastguard Worker         base_inc256 = _mm256_setr_epi16(
1636*77c1e3ccSAndroid Build Coastguard Worker             base + j, base + j + 1, base + j + 2, base + j + 3, base + j + 4,
1637*77c1e3ccSAndroid Build Coastguard Worker             base + j + 5, base + j + 6, base + j + 7, base + j + 8,
1638*77c1e3ccSAndroid Build Coastguard Worker             base + j + 9, base + j + 10, base + j + 11, base + j + 12,
1639*77c1e3ccSAndroid Build Coastguard Worker             base + j + 13, base + j + 14, base + j + 15);
1640*77c1e3ccSAndroid Build Coastguard Worker 
1641*77c1e3ccSAndroid Build Coastguard Worker         mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256);
1642*77c1e3ccSAndroid Build Coastguard Worker         res1 = _mm256_blendv_epi8(a_mbase_x, res1, mask256);
1643*77c1e3ccSAndroid Build Coastguard Worker       }
1644*77c1e3ccSAndroid Build Coastguard Worker       if (!j) {
1645*77c1e3ccSAndroid Build Coastguard Worker         dstvec[r] = res1;
1646*77c1e3ccSAndroid Build Coastguard Worker       } else {
1647*77c1e3ccSAndroid Build Coastguard Worker         dstvec[r + N] = res1;
1648*77c1e3ccSAndroid Build Coastguard Worker       }
1649*77c1e3ccSAndroid Build Coastguard Worker     }
1650*77c1e3ccSAndroid Build Coastguard Worker     x += dx;
1651*77c1e3ccSAndroid Build Coastguard Worker   }
1652*77c1e3ccSAndroid Build Coastguard Worker }
1653*77c1e3ccSAndroid Build Coastguard Worker 
highbd_dr_prediction_z1_32xN_internal_avx2(int N,__m256i * dstvec,const uint16_t * above,int upsample_above,int dx)1654*77c1e3ccSAndroid Build Coastguard Worker static AOM_FORCE_INLINE void highbd_dr_prediction_z1_32xN_internal_avx2(
1655*77c1e3ccSAndroid Build Coastguard Worker     int N, __m256i *dstvec, const uint16_t *above, int upsample_above, int dx) {
1656*77c1e3ccSAndroid Build Coastguard Worker   // here upsample_above is 0 by design of av1_use_intra_edge_upsample
1657*77c1e3ccSAndroid Build Coastguard Worker   (void)upsample_above;
1658*77c1e3ccSAndroid Build Coastguard Worker   const int frac_bits = 6;
1659*77c1e3ccSAndroid Build Coastguard Worker   const int max_base_x = ((32 + N) - 1);
1660*77c1e3ccSAndroid Build Coastguard Worker 
1661*77c1e3ccSAndroid Build Coastguard Worker   // pre-filter above pixels
1662*77c1e3ccSAndroid Build Coastguard Worker   // store in temp buffers:
1663*77c1e3ccSAndroid Build Coastguard Worker   //   above[x] * 32 + 16
1664*77c1e3ccSAndroid Build Coastguard Worker   //   above[x+1] - above[x]
1665*77c1e3ccSAndroid Build Coastguard Worker   // final pixels will be calculated as:
1666*77c1e3ccSAndroid Build Coastguard Worker   //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
1667*77c1e3ccSAndroid Build Coastguard Worker   __m256i a0, a1, a32, a16, c3f;
1668*77c1e3ccSAndroid Build Coastguard Worker   __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
1669*77c1e3ccSAndroid Build Coastguard Worker 
1670*77c1e3ccSAndroid Build Coastguard Worker   a16 = _mm256_set1_epi16(16);
1671*77c1e3ccSAndroid Build Coastguard Worker   a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
1672*77c1e3ccSAndroid Build Coastguard Worker   max_base_x256 = _mm256_set1_epi16(max_base_x);
1673*77c1e3ccSAndroid Build Coastguard Worker   c3f = _mm256_set1_epi16(0x3f);
1674*77c1e3ccSAndroid Build Coastguard Worker 
1675*77c1e3ccSAndroid Build Coastguard Worker   int x = dx;
1676*77c1e3ccSAndroid Build Coastguard Worker   for (int r = 0; r < N; r++) {
1677*77c1e3ccSAndroid Build Coastguard Worker     __m256i b, res;
1678*77c1e3ccSAndroid Build Coastguard Worker 
1679*77c1e3ccSAndroid Build Coastguard Worker     int base = x >> frac_bits;
1680*77c1e3ccSAndroid Build Coastguard Worker     if (base >= max_base_x) {
1681*77c1e3ccSAndroid Build Coastguard Worker       for (int i = r; i < N; ++i) {
1682*77c1e3ccSAndroid Build Coastguard Worker         dstvec[i] = a_mbase_x;  // save 32 values
1683*77c1e3ccSAndroid Build Coastguard Worker         dstvec[i + N] = a_mbase_x;
1684*77c1e3ccSAndroid Build Coastguard Worker       }
1685*77c1e3ccSAndroid Build Coastguard Worker       return;
1686*77c1e3ccSAndroid Build Coastguard Worker     }
1687*77c1e3ccSAndroid Build Coastguard Worker 
1688*77c1e3ccSAndroid Build Coastguard Worker     __m256i shift =
1689*77c1e3ccSAndroid Build Coastguard Worker         _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
1690*77c1e3ccSAndroid Build Coastguard Worker 
1691*77c1e3ccSAndroid Build Coastguard Worker     for (int j = 0; j < 32; j += 16) {
1692*77c1e3ccSAndroid Build Coastguard Worker       int mdif = max_base_x - (base + j);
1693*77c1e3ccSAndroid Build Coastguard Worker       if (mdif <= 0) {
1694*77c1e3ccSAndroid Build Coastguard Worker         res = a_mbase_x;
1695*77c1e3ccSAndroid Build Coastguard Worker       } else {
1696*77c1e3ccSAndroid Build Coastguard Worker         a0 = _mm256_loadu_si256((__m256i *)(above + base + j));
1697*77c1e3ccSAndroid Build Coastguard Worker         a1 = _mm256_loadu_si256((__m256i *)(above + base + 1 + j));
1698*77c1e3ccSAndroid Build Coastguard Worker 
1699*77c1e3ccSAndroid Build Coastguard Worker         diff = _mm256_sub_epi16(a1, a0);   // a[x+1] - a[x]
1700*77c1e3ccSAndroid Build Coastguard Worker         a32 = _mm256_slli_epi16(a0, 5);    // a[x] * 32
1701*77c1e3ccSAndroid Build Coastguard Worker         a32 = _mm256_add_epi16(a32, a16);  // a[x] * 32 + 16
1702*77c1e3ccSAndroid Build Coastguard Worker         b = _mm256_mullo_epi16(diff, shift);
1703*77c1e3ccSAndroid Build Coastguard Worker 
1704*77c1e3ccSAndroid Build Coastguard Worker         res = _mm256_add_epi16(a32, b);
1705*77c1e3ccSAndroid Build Coastguard Worker         res = _mm256_srli_epi16(res, 5);
1706*77c1e3ccSAndroid Build Coastguard Worker 
1707*77c1e3ccSAndroid Build Coastguard Worker         base_inc256 = _mm256_setr_epi16(
1708*77c1e3ccSAndroid Build Coastguard Worker             base + j, base + j + 1, base + j + 2, base + j + 3, base + j + 4,
1709*77c1e3ccSAndroid Build Coastguard Worker             base + j + 5, base + j + 6, base + j + 7, base + j + 8,
1710*77c1e3ccSAndroid Build Coastguard Worker             base + j + 9, base + j + 10, base + j + 11, base + j + 12,
1711*77c1e3ccSAndroid Build Coastguard Worker             base + j + 13, base + j + 14, base + j + 15);
1712*77c1e3ccSAndroid Build Coastguard Worker 
1713*77c1e3ccSAndroid Build Coastguard Worker         mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256);
1714*77c1e3ccSAndroid Build Coastguard Worker         res = _mm256_blendv_epi8(a_mbase_x, res, mask256);
1715*77c1e3ccSAndroid Build Coastguard Worker       }
1716*77c1e3ccSAndroid Build Coastguard Worker       if (!j) {
1717*77c1e3ccSAndroid Build Coastguard Worker         dstvec[r] = res;
1718*77c1e3ccSAndroid Build Coastguard Worker       } else {
1719*77c1e3ccSAndroid Build Coastguard Worker         dstvec[r + N] = res;
1720*77c1e3ccSAndroid Build Coastguard Worker       }
1721*77c1e3ccSAndroid Build Coastguard Worker     }
1722*77c1e3ccSAndroid Build Coastguard Worker     x += dx;
1723*77c1e3ccSAndroid Build Coastguard Worker   }
1724*77c1e3ccSAndroid Build Coastguard Worker }
1725*77c1e3ccSAndroid Build Coastguard Worker 
// Zone-1 directional prediction of a 32xN high-bitdepth block.
//
// Builds the rows into a local vector array using the 32-bit kernel when
// bd >= 12 and the 16-bit kernel otherwise. Entry i of the array is the left
// 16 pixels of row i and entry i + N is the right 16 pixels; both are stored
// to dst + stride * i. The local array holds 128 vectors.
static void highbd_dr_prediction_z1_32xN_avx2(int N, uint16_t *dst,
                                              ptrdiff_t stride,
                                              const uint16_t *above,
                                              int upsample_above, int dx,
                                              int bd) {
  __m256i rows[128];

  if (bd >= 12) {
    highbd_dr_prediction_32bit_z1_32xN_internal_avx2(N, rows, above,
                                                     upsample_above, dx);
  } else {
    highbd_dr_prediction_z1_32xN_internal_avx2(N, rows, above, upsample_above,
                                               dx);
  }

  for (int i = 0; i < N; ++i) {
    uint16_t *const row = dst + stride * i;
    _mm256_storeu_si256((__m256i *)row, rows[i]);
    _mm256_storeu_si256((__m256i *)(row + 16), rows[i + N]);
  }
}
1744*77c1e3ccSAndroid Build Coastguard Worker 
// Zone-1 directional prediction of a 64xN high-bitdepth block using 32-bit
// intermediate arithmetic. Rows are written straight to dst, one row every
// `stride` elements, 64 pixels per row.
//
//   N               number of rows to produce
//   dst, stride     destination buffer and its row pitch in pixels
//   above           reference row; above[max_base_x] is used as the edge value
//   upsample_above  unused
//   dx              per-row step of the sampling position, in 1/64 pixel units
//
// Each output pixel is
//   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
// with shift = (x & 0x3f) >> 1. Columns whose source index is at or beyond
// max_base_x take above[max_base_x] instead.
static void highbd_dr_prediction_32bit_z1_64xN_avx2(int N, uint16_t *dst,
                                                    ptrdiff_t stride,
                                                    const uint16_t *above,
                                                    int upsample_above,
                                                    int dx) {
  // here upsample_above is 0 by design of av1_use_intra_edge_upsample
  (void)upsample_above;
  const int frac_bits = 6;
  const int max_base_x = ((64 + N) - 1);

  const __m256i round16 = _mm256_set1_epi32(16);
  const __m256i edge_fill = _mm256_set1_epi16(above[max_base_x]);
  const __m256i limit = _mm256_set1_epi16(max_base_x);
  const __m256i lane_idx =
      _mm256_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);

  for (int r = 0, x = dx; r < N; ++r, x += dx, dst += stride) {
    const int base = x >> frac_bits;
    if (base >= max_base_x) {
      // This row and every later one start at or past the edge sample.
      for (int i = r; i < N; ++i) {
        for (int j = 0; j < 64; j += 16) {
          _mm256_storeu_si256((__m256i *)(dst + j), edge_fill);
        }
        dst += stride;
      }
      return;
    }

    // Interpolation weight (0..31), identical for all pixels of the row.
    const __m256i shift = _mm256_set1_epi32((x & 0x3f) >> 1);

    // Four groups of 16 pixels per row.
    for (int j = 0; j < 64; j += 16) {
      uint16_t *const out = dst + j;
      // Number of source columns of this group that lie before max_base_x.
      const int mdif = max_base_x - (base + j);
      if (mdif <= 0) {
        _mm256_storeu_si256((__m256i *)out, edge_fill);
        continue;
      }

      // Each group is built from two runs of 8 pixels widened to 32 bits.
      // The second run is skipped when none of its columns is in range.
      __m256i half[2];
      for (int k = 0; k < 2; ++k) {
        if (k == 1 && mdif <= 8) {
          half[k] = edge_fill;
          continue;
        }
        const uint16_t *const src = above + base + j + 8 * k;
        const __m256i left =
            _mm256_cvtepu16_epi32(_mm_loadu_si128((const __m128i *)src));
        const __m256i right =
            _mm256_cvtepu16_epi32(_mm_loadu_si128((const __m128i *)(src + 1)));

        // a[x] * 32 + 16
        const __m256i scaled =
            _mm256_add_epi32(_mm256_slli_epi32(left, 5), round16);
        // (a[x+1] - a[x]) * shift
        const __m256i delta =
            _mm256_mullo_epi32(_mm256_sub_epi32(right, left), shift);
        const __m256i wide =
            _mm256_srli_epi32(_mm256_add_epi32(scaled, delta), 5);
        // Narrow back to 16 bits; the 8 results end up in the low 128 bits.
        half[k] = _mm256_packus_epi32(
            wide, _mm256_castsi128_si256(_mm256_extracti128_si256(wide, 1)));
      }

      // 16 16bit values
      const __m256i joined = _mm256_inserti128_si256(
          half[0], _mm256_castsi256_si128(half[1]), 1);

      // Keep interpolated pixels only where the source column is in range.
      const __m256i col =
          _mm256_add_epi16(_mm256_set1_epi16(base + j), lane_idx);
      const __m256i in_range = _mm256_cmpgt_epi16(limit, col);
      _mm256_storeu_si256((__m256i *)out,
                          _mm256_blendv_epi8(edge_fill, joined, in_range));
    }
  }
}
1843*77c1e3ccSAndroid Build Coastguard Worker 
// Zone-1 directional predictor for 64-wide high-bitdepth blocks, computed
// entirely in 16-bit lanes. Writes N rows of 64 pixels to dst, consecutive
// rows being `stride` elements apart.
//
// Row r samples the above edge at position (r + 1) * dx, where the low 6 bits
// are the fractional part, and linearly interpolates between neighbouring
// edge pixels:
//   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
// Positions at or beyond max_base_x = 64 + N - 1 take above[max_base_x].
//
// Full 16-pixel vectors are loaded whenever the first lane of a column group
// is below max_base_x, so `above` must be readable up to index
// max_base_x + 15.
static void highbd_dr_prediction_z1_64xN_avx2(int N, uint16_t *dst,
                                              ptrdiff_t stride,
                                              const uint16_t *above,
                                              int upsample_above, int dx) {
  // here upsample_above is 0 by design of av1_use_intra_edge_upsample
  (void)upsample_above;
  const int frac_bits = 6;
  const int max_base_x = (64 + N) - 1;

  const __m256i rounding = _mm256_set1_epi16(16);
  const __m256i frac_mask = _mm256_set1_epi16(0x3f);
  const __m256i edge_fill = _mm256_set1_epi16(above[max_base_x]);
  const __m256i base_limit = _mm256_set1_epi16(max_base_x);
  // Lane offsets 0..15, added to the group's first edge index below.
  const __m256i lane_idx = _mm256_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
                                             11, 12, 13, 14, 15);

  int x = dx;
  for (int r = 0; r < N; r++, dst += stride, x += dx) {
    const int base = x >> frac_bits;

    if (base >= max_base_x) {
      // This row and all rows after it start past the last usable edge
      // pixel: fill them with the edge value (4 x 16 pixels each) and stop.
      for (int i = r; i < N; ++i, dst += stride) {
        for (int j = 0; j < 64; j += 16) {
          _mm256_storeu_si256((__m256i *)(dst + j), edge_fill);
        }
      }
      return;
    }

    // 5-bit interpolation weight shared by every pixel of the row.
    const __m256i shift = _mm256_srli_epi16(
        _mm256_and_si256(_mm256_set1_epi16(x), frac_mask), 1);

    for (int j = 0; j < 64; j += 16) {
      const int col_base = base + j;
      uint16_t *const out = dst + j;

      if (col_base >= max_base_x) {
        // Whole group of 16 lies past the edge limit.
        _mm256_storeu_si256((__m256i *)out, edge_fill);
        continue;
      }

      const __m256i a0 =
          _mm256_loadu_si256((const __m256i *)(above + col_base));
      const __m256i a1 =
          _mm256_loadu_si256((const __m256i *)(above + col_base + 1));

      const __m256i diff = _mm256_sub_epi16(a1, a0);  // a[x+1] - a[x]
      const __m256i scaled =
          _mm256_add_epi16(_mm256_slli_epi16(a0, 5), rounding);  // a[x]*32+16
      const __m256i interp = _mm256_srli_epi16(
          _mm256_add_epi16(scaled, _mm256_mullo_epi16(diff, shift)), 5);

      // Lanes whose edge index reached max_base_x get the edge value.
      const __m256i positions =
          _mm256_add_epi16(_mm256_set1_epi16(col_base), lane_idx);
      const __m256i in_range = _mm256_cmpgt_epi16(base_limit, positions);
      _mm256_storeu_si256((__m256i *)out,
                          _mm256_blendv_epi8(edge_fill, interp, in_range));
    }
  }
}
1916*77c1e3ccSAndroid Build Coastguard Worker 
// Directional prediction, zone 1: 0 < angle < 90
//
// High-bitdepth entry point. Selects the width-specific kernel from bw and
// passes bh as the row count; left and dy are not used. Widths 4, 8, 16 and
// 32 forward bd to their kernel. For width 64 the choice is made here:
// bd < 12 runs the 16-bit-lane kernel, any other bd the `32bit` variant.
// An unsupported bw leaves dst untouched.
void av1_highbd_dr_prediction_z1_avx2(uint16_t *dst, ptrdiff_t stride, int bw,
                                      int bh, const uint16_t *above,
                                      const uint16_t *left, int upsample_above,
                                      int dx, int dy, int bd) {
  (void)left;
  (void)dy;

  if (bw == 4) {
    highbd_dr_prediction_z1_4xN_avx2(bh, dst, stride, above, upsample_above,
                                     dx, bd);
  } else if (bw == 8) {
    highbd_dr_prediction_z1_8xN_avx2(bh, dst, stride, above, upsample_above,
                                     dx, bd);
  } else if (bw == 16) {
    highbd_dr_prediction_z1_16xN_avx2(bh, dst, stride, above, upsample_above,
                                      dx, bd);
  } else if (bw == 32) {
    highbd_dr_prediction_z1_32xN_avx2(bh, dst, stride, above, upsample_above,
                                      dx, bd);
  } else if (bw == 64) {
    if (bd >= 12) {
      highbd_dr_prediction_32bit_z1_64xN_avx2(bh, dst, stride, above,
                                              upsample_above, dx);
    } else {
      highbd_dr_prediction_z1_64xN_avx2(bh, dst, stride, above, upsample_above,
                                        dx);
    }
  }
}
1955*77c1e3ccSAndroid Build Coastguard Worker 
// Transposes one 16x16 tile of 16-bit samples. Sixteen rows of 16 values are
// read from src (rows pitchSrc elements apart), run through
// highbd_transpose16x16_avx2, and the sixteen resulting rows are written to
// dst (rows pitchDst elements apart). All accesses are unaligned.
static void highbd_transpose_TX_16X16(const uint16_t *src, ptrdiff_t pitchSrc,
                                      uint16_t *dst, ptrdiff_t pitchDst) {
  __m256i rows_in[16];
  __m256i rows_out[16];

  for (int row = 0; row < 16; ++row) {
    const uint16_t *const in = src + row * pitchSrc;
    rows_in[row] = _mm256_loadu_si256((const __m256i *)in);
  }

  highbd_transpose16x16_avx2(rows_in, rows_out);

  for (int row = 0; row < 16; ++row) {
    uint16_t *const out = dst + row * pitchDst;
    _mm256_storeu_si256((__m256i *)out, rows_out[row]);
  }
}
1968*77c1e3ccSAndroid Build Coastguard Worker 
// Transposes a block of 16-bit samples tile by tile. The loops step in
// units of 16, so each iteration hands one 16x16 tile to
// highbd_transpose_TX_16X16: the source tile at row i, column j is written to
// the destination tile at row j, column i. i runs up to `width` over source
// rows and j runs up to `height` over source columns.
static void highbd_transpose(const uint16_t *src, ptrdiff_t pitchSrc,
                             uint16_t *dst, ptrdiff_t pitchDst, int width,
                             int height) {
  for (int j = 0; j < height; j += 16) {
    for (int i = 0; i < width; i += 16) {
      const uint16_t *const src_tile = src + i * pitchSrc + j;
      uint16_t *const dst_tile = dst + j * pitchDst + i;
      highbd_transpose_TX_16X16(src_tile, pitchSrc, dst_tile, pitchDst);
    }
  }
}
1977*77c1e3ccSAndroid Build Coastguard Worker 
highbd_dr_prediction_32bit_z2_Nx4_avx2(int N,uint16_t * dst,ptrdiff_t stride,const uint16_t * above,const uint16_t * left,int upsample_above,int upsample_left,int dx,int dy)1978*77c1e3ccSAndroid Build Coastguard Worker static void highbd_dr_prediction_32bit_z2_Nx4_avx2(
1979*77c1e3ccSAndroid Build Coastguard Worker     int N, uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
1980*77c1e3ccSAndroid Build Coastguard Worker     const uint16_t *left, int upsample_above, int upsample_left, int dx,
1981*77c1e3ccSAndroid Build Coastguard Worker     int dy) {
1982*77c1e3ccSAndroid Build Coastguard Worker   const int min_base_x = -(1 << upsample_above);
1983*77c1e3ccSAndroid Build Coastguard Worker   const int min_base_y = -(1 << upsample_left);
1984*77c1e3ccSAndroid Build Coastguard Worker   const int frac_bits_x = 6 - upsample_above;
1985*77c1e3ccSAndroid Build Coastguard Worker   const int frac_bits_y = 6 - upsample_left;
1986*77c1e3ccSAndroid Build Coastguard Worker 
1987*77c1e3ccSAndroid Build Coastguard Worker   assert(dx > 0);
1988*77c1e3ccSAndroid Build Coastguard Worker   // pre-filter above pixels
1989*77c1e3ccSAndroid Build Coastguard Worker   // store in temp buffers:
1990*77c1e3ccSAndroid Build Coastguard Worker   //   above[x] * 32 + 16
1991*77c1e3ccSAndroid Build Coastguard Worker   //   above[x+1] - above[x]
1992*77c1e3ccSAndroid Build Coastguard Worker   // final pixels will be calculated as:
1993*77c1e3ccSAndroid Build Coastguard Worker   //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
1994*77c1e3ccSAndroid Build Coastguard Worker   __m256i a0_x, a1_x, a32, a16;
1995*77c1e3ccSAndroid Build Coastguard Worker   __m256i diff;
1996*77c1e3ccSAndroid Build Coastguard Worker   __m128i c3f, min_base_y128;
1997*77c1e3ccSAndroid Build Coastguard Worker 
1998*77c1e3ccSAndroid Build Coastguard Worker   a16 = _mm256_set1_epi32(16);
1999*77c1e3ccSAndroid Build Coastguard Worker   c3f = _mm_set1_epi32(0x3f);
2000*77c1e3ccSAndroid Build Coastguard Worker   min_base_y128 = _mm_set1_epi32(min_base_y);
2001*77c1e3ccSAndroid Build Coastguard Worker 
2002*77c1e3ccSAndroid Build Coastguard Worker   for (int r = 0; r < N; r++) {
2003*77c1e3ccSAndroid Build Coastguard Worker     __m256i b, res, shift;
2004*77c1e3ccSAndroid Build Coastguard Worker     __m128i resx, resy, resxy;
2005*77c1e3ccSAndroid Build Coastguard Worker     __m128i a0_x128, a1_x128;
2006*77c1e3ccSAndroid Build Coastguard Worker     int y = r + 1;
2007*77c1e3ccSAndroid Build Coastguard Worker     int base_x = (-y * dx) >> frac_bits_x;
2008*77c1e3ccSAndroid Build Coastguard Worker     int base_shift = 0;
2009*77c1e3ccSAndroid Build Coastguard Worker     if (base_x < (min_base_x - 1)) {
2010*77c1e3ccSAndroid Build Coastguard Worker       base_shift = (min_base_x - base_x - 1) >> upsample_above;
2011*77c1e3ccSAndroid Build Coastguard Worker     }
2012*77c1e3ccSAndroid Build Coastguard Worker     int base_min_diff =
2013*77c1e3ccSAndroid Build Coastguard Worker         (min_base_x - base_x + upsample_above) >> upsample_above;
2014*77c1e3ccSAndroid Build Coastguard Worker     if (base_min_diff > 4) {
2015*77c1e3ccSAndroid Build Coastguard Worker       base_min_diff = 4;
2016*77c1e3ccSAndroid Build Coastguard Worker     } else {
2017*77c1e3ccSAndroid Build Coastguard Worker       if (base_min_diff < 0) base_min_diff = 0;
2018*77c1e3ccSAndroid Build Coastguard Worker     }
2019*77c1e3ccSAndroid Build Coastguard Worker 
2020*77c1e3ccSAndroid Build Coastguard Worker     if (base_shift > 3) {
2021*77c1e3ccSAndroid Build Coastguard Worker       a0_x = _mm256_setzero_si256();
2022*77c1e3ccSAndroid Build Coastguard Worker       a1_x = _mm256_setzero_si256();
2023*77c1e3ccSAndroid Build Coastguard Worker       shift = _mm256_setzero_si256();
2024*77c1e3ccSAndroid Build Coastguard Worker     } else {
2025*77c1e3ccSAndroid Build Coastguard Worker       a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
2026*77c1e3ccSAndroid Build Coastguard Worker       if (upsample_above) {
2027*77c1e3ccSAndroid Build Coastguard Worker         a0_x128 = _mm_shuffle_epi8(a0_x128,
2028*77c1e3ccSAndroid Build Coastguard Worker                                    *(__m128i *)HighbdEvenOddMaskx4[base_shift]);
2029*77c1e3ccSAndroid Build Coastguard Worker         a1_x128 = _mm_srli_si128(a0_x128, 8);
2030*77c1e3ccSAndroid Build Coastguard Worker 
2031*77c1e3ccSAndroid Build Coastguard Worker         shift = _mm256_castsi128_si256(_mm_srli_epi32(
2032*77c1e3ccSAndroid Build Coastguard Worker             _mm_and_si128(
2033*77c1e3ccSAndroid Build Coastguard Worker                 _mm_slli_epi32(
2034*77c1e3ccSAndroid Build Coastguard Worker                     _mm_setr_epi32(-y * dx, (1 << 6) - y * dx,
2035*77c1e3ccSAndroid Build Coastguard Worker                                    (2 << 6) - y * dx, (3 << 6) - y * dx),
2036*77c1e3ccSAndroid Build Coastguard Worker                     upsample_above),
2037*77c1e3ccSAndroid Build Coastguard Worker                 c3f),
2038*77c1e3ccSAndroid Build Coastguard Worker             1));
2039*77c1e3ccSAndroid Build Coastguard Worker       } else {
2040*77c1e3ccSAndroid Build Coastguard Worker         a0_x128 =
2041*77c1e3ccSAndroid Build Coastguard Worker             _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
2042*77c1e3ccSAndroid Build Coastguard Worker         a1_x128 = _mm_srli_si128(a0_x128, 2);
2043*77c1e3ccSAndroid Build Coastguard Worker 
2044*77c1e3ccSAndroid Build Coastguard Worker         shift = _mm256_castsi128_si256(_mm_srli_epi32(
2045*77c1e3ccSAndroid Build Coastguard Worker             _mm_and_si128(_mm_setr_epi32(-y * dx, (1 << 6) - y * dx,
2046*77c1e3ccSAndroid Build Coastguard Worker                                          (2 << 6) - y * dx, (3 << 6) - y * dx),
2047*77c1e3ccSAndroid Build Coastguard Worker                           c3f),
2048*77c1e3ccSAndroid Build Coastguard Worker             1));
2049*77c1e3ccSAndroid Build Coastguard Worker       }
2050*77c1e3ccSAndroid Build Coastguard Worker       a0_x = _mm256_cvtepu16_epi32(a0_x128);
2051*77c1e3ccSAndroid Build Coastguard Worker       a1_x = _mm256_cvtepu16_epi32(a1_x128);
2052*77c1e3ccSAndroid Build Coastguard Worker     }
2053*77c1e3ccSAndroid Build Coastguard Worker     // y calc
2054*77c1e3ccSAndroid Build Coastguard Worker     __m128i a0_y, a1_y, shifty;
2055*77c1e3ccSAndroid Build Coastguard Worker     if (base_x < min_base_x) {
2056*77c1e3ccSAndroid Build Coastguard Worker       __m128i r6, c1234, dy128, y_c128, base_y_c128, mask128;
2057*77c1e3ccSAndroid Build Coastguard Worker       DECLARE_ALIGNED(32, int, base_y_c[4]);
2058*77c1e3ccSAndroid Build Coastguard Worker       r6 = _mm_set1_epi32(r << 6);
2059*77c1e3ccSAndroid Build Coastguard Worker       dy128 = _mm_set1_epi32(dy);
2060*77c1e3ccSAndroid Build Coastguard Worker       c1234 = _mm_setr_epi32(1, 2, 3, 4);
2061*77c1e3ccSAndroid Build Coastguard Worker       y_c128 = _mm_sub_epi32(r6, _mm_mullo_epi32(c1234, dy128));
2062*77c1e3ccSAndroid Build Coastguard Worker       base_y_c128 = _mm_srai_epi32(y_c128, frac_bits_y);
2063*77c1e3ccSAndroid Build Coastguard Worker       mask128 = _mm_cmpgt_epi32(min_base_y128, base_y_c128);
2064*77c1e3ccSAndroid Build Coastguard Worker       base_y_c128 = _mm_andnot_si128(mask128, base_y_c128);
2065*77c1e3ccSAndroid Build Coastguard Worker       _mm_store_si128((__m128i *)base_y_c, base_y_c128);
2066*77c1e3ccSAndroid Build Coastguard Worker 
2067*77c1e3ccSAndroid Build Coastguard Worker       a0_y = _mm_setr_epi32(left[base_y_c[0]], left[base_y_c[1]],
2068*77c1e3ccSAndroid Build Coastguard Worker                             left[base_y_c[2]], left[base_y_c[3]]);
2069*77c1e3ccSAndroid Build Coastguard Worker       a1_y = _mm_setr_epi32(left[base_y_c[0] + 1], left[base_y_c[1] + 1],
2070*77c1e3ccSAndroid Build Coastguard Worker                             left[base_y_c[2] + 1], left[base_y_c[3] + 1]);
2071*77c1e3ccSAndroid Build Coastguard Worker 
2072*77c1e3ccSAndroid Build Coastguard Worker       if (upsample_left) {
2073*77c1e3ccSAndroid Build Coastguard Worker         shifty = _mm_srli_epi32(
2074*77c1e3ccSAndroid Build Coastguard Worker             _mm_and_si128(_mm_slli_epi32(y_c128, upsample_left), c3f), 1);
2075*77c1e3ccSAndroid Build Coastguard Worker       } else {
2076*77c1e3ccSAndroid Build Coastguard Worker         shifty = _mm_srli_epi32(_mm_and_si128(y_c128, c3f), 1);
2077*77c1e3ccSAndroid Build Coastguard Worker       }
2078*77c1e3ccSAndroid Build Coastguard Worker       a0_x = _mm256_inserti128_si256(a0_x, a0_y, 1);
2079*77c1e3ccSAndroid Build Coastguard Worker       a1_x = _mm256_inserti128_si256(a1_x, a1_y, 1);
2080*77c1e3ccSAndroid Build Coastguard Worker       shift = _mm256_inserti128_si256(shift, shifty, 1);
2081*77c1e3ccSAndroid Build Coastguard Worker     }
2082*77c1e3ccSAndroid Build Coastguard Worker 
2083*77c1e3ccSAndroid Build Coastguard Worker     diff = _mm256_sub_epi32(a1_x, a0_x);  // a[x+1] - a[x]
2084*77c1e3ccSAndroid Build Coastguard Worker     a32 = _mm256_slli_epi32(a0_x, 5);     // a[x] * 32
2085*77c1e3ccSAndroid Build Coastguard Worker     a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
2086*77c1e3ccSAndroid Build Coastguard Worker 
2087*77c1e3ccSAndroid Build Coastguard Worker     b = _mm256_mullo_epi32(diff, shift);
2088*77c1e3ccSAndroid Build Coastguard Worker     res = _mm256_add_epi32(a32, b);
2089*77c1e3ccSAndroid Build Coastguard Worker     res = _mm256_srli_epi32(res, 5);
2090*77c1e3ccSAndroid Build Coastguard Worker 
2091*77c1e3ccSAndroid Build Coastguard Worker     resx = _mm256_castsi256_si128(res);
2092*77c1e3ccSAndroid Build Coastguard Worker     resx = _mm_packus_epi32(resx, resx);
2093*77c1e3ccSAndroid Build Coastguard Worker 
2094*77c1e3ccSAndroid Build Coastguard Worker     resy = _mm256_extracti128_si256(res, 1);
2095*77c1e3ccSAndroid Build Coastguard Worker     resy = _mm_packus_epi32(resy, resy);
2096*77c1e3ccSAndroid Build Coastguard Worker 
2097*77c1e3ccSAndroid Build Coastguard Worker     resxy =
2098*77c1e3ccSAndroid Build Coastguard Worker         _mm_blendv_epi8(resx, resy, *(__m128i *)HighbdBaseMask[base_min_diff]);
2099*77c1e3ccSAndroid Build Coastguard Worker     _mm_storel_epi64((__m128i *)(dst), resxy);
2100*77c1e3ccSAndroid Build Coastguard Worker     dst += stride;
2101*77c1e3ccSAndroid Build Coastguard Worker   }
2102*77c1e3ccSAndroid Build Coastguard Worker }
2103*77c1e3ccSAndroid Build Coastguard Worker 
highbd_dr_prediction_z2_Nx4_avx2(int N,uint16_t * dst,ptrdiff_t stride,const uint16_t * above,const uint16_t * left,int upsample_above,int upsample_left,int dx,int dy)2104*77c1e3ccSAndroid Build Coastguard Worker static void highbd_dr_prediction_z2_Nx4_avx2(
2105*77c1e3ccSAndroid Build Coastguard Worker     int N, uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
2106*77c1e3ccSAndroid Build Coastguard Worker     const uint16_t *left, int upsample_above, int upsample_left, int dx,
2107*77c1e3ccSAndroid Build Coastguard Worker     int dy) {
2108*77c1e3ccSAndroid Build Coastguard Worker   const int min_base_x = -(1 << upsample_above);
2109*77c1e3ccSAndroid Build Coastguard Worker   const int min_base_y = -(1 << upsample_left);
2110*77c1e3ccSAndroid Build Coastguard Worker   const int frac_bits_x = 6 - upsample_above;
2111*77c1e3ccSAndroid Build Coastguard Worker   const int frac_bits_y = 6 - upsample_left;
2112*77c1e3ccSAndroid Build Coastguard Worker 
2113*77c1e3ccSAndroid Build Coastguard Worker   assert(dx > 0);
2114*77c1e3ccSAndroid Build Coastguard Worker   // pre-filter above pixels
2115*77c1e3ccSAndroid Build Coastguard Worker   // store in temp buffers:
2116*77c1e3ccSAndroid Build Coastguard Worker   //   above[x] * 32 + 16
2117*77c1e3ccSAndroid Build Coastguard Worker   //   above[x+1] - above[x]
2118*77c1e3ccSAndroid Build Coastguard Worker   // final pixels will be calculated as:
2119*77c1e3ccSAndroid Build Coastguard Worker   //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
2120*77c1e3ccSAndroid Build Coastguard Worker   __m256i a0_x, a1_x, a32, a16;
2121*77c1e3ccSAndroid Build Coastguard Worker   __m256i diff;
2122*77c1e3ccSAndroid Build Coastguard Worker   __m128i c3f, min_base_y128;
2123*77c1e3ccSAndroid Build Coastguard Worker 
2124*77c1e3ccSAndroid Build Coastguard Worker   a16 = _mm256_set1_epi16(16);
2125*77c1e3ccSAndroid Build Coastguard Worker   c3f = _mm_set1_epi16(0x3f);
2126*77c1e3ccSAndroid Build Coastguard Worker   min_base_y128 = _mm_set1_epi16(min_base_y);
2127*77c1e3ccSAndroid Build Coastguard Worker 
2128*77c1e3ccSAndroid Build Coastguard Worker   for (int r = 0; r < N; r++) {
2129*77c1e3ccSAndroid Build Coastguard Worker     __m256i b, res, shift;
2130*77c1e3ccSAndroid Build Coastguard Worker     __m128i resx, resy, resxy;
2131*77c1e3ccSAndroid Build Coastguard Worker     __m128i a0_x128, a1_x128;
2132*77c1e3ccSAndroid Build Coastguard Worker     int y = r + 1;
2133*77c1e3ccSAndroid Build Coastguard Worker     int base_x = (-y * dx) >> frac_bits_x;
2134*77c1e3ccSAndroid Build Coastguard Worker     int base_shift = 0;
2135*77c1e3ccSAndroid Build Coastguard Worker     if (base_x < (min_base_x - 1)) {
2136*77c1e3ccSAndroid Build Coastguard Worker       base_shift = (min_base_x - base_x - 1) >> upsample_above;
2137*77c1e3ccSAndroid Build Coastguard Worker     }
2138*77c1e3ccSAndroid Build Coastguard Worker     int base_min_diff =
2139*77c1e3ccSAndroid Build Coastguard Worker         (min_base_x - base_x + upsample_above) >> upsample_above;
2140*77c1e3ccSAndroid Build Coastguard Worker     if (base_min_diff > 4) {
2141*77c1e3ccSAndroid Build Coastguard Worker       base_min_diff = 4;
2142*77c1e3ccSAndroid Build Coastguard Worker     } else {
2143*77c1e3ccSAndroid Build Coastguard Worker       if (base_min_diff < 0) base_min_diff = 0;
2144*77c1e3ccSAndroid Build Coastguard Worker     }
2145*77c1e3ccSAndroid Build Coastguard Worker 
2146*77c1e3ccSAndroid Build Coastguard Worker     if (base_shift > 3) {
2147*77c1e3ccSAndroid Build Coastguard Worker       a0_x = _mm256_setzero_si256();
2148*77c1e3ccSAndroid Build Coastguard Worker       a1_x = _mm256_setzero_si256();
2149*77c1e3ccSAndroid Build Coastguard Worker       shift = _mm256_setzero_si256();
2150*77c1e3ccSAndroid Build Coastguard Worker     } else {
2151*77c1e3ccSAndroid Build Coastguard Worker       a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
2152*77c1e3ccSAndroid Build Coastguard Worker       if (upsample_above) {
2153*77c1e3ccSAndroid Build Coastguard Worker         a0_x128 = _mm_shuffle_epi8(a0_x128,
2154*77c1e3ccSAndroid Build Coastguard Worker                                    *(__m128i *)HighbdEvenOddMaskx4[base_shift]);
2155*77c1e3ccSAndroid Build Coastguard Worker         a1_x128 = _mm_srli_si128(a0_x128, 8);
2156*77c1e3ccSAndroid Build Coastguard Worker 
2157*77c1e3ccSAndroid Build Coastguard Worker         shift = _mm256_castsi128_si256(_mm_srli_epi16(
2158*77c1e3ccSAndroid Build Coastguard Worker             _mm_and_si128(
2159*77c1e3ccSAndroid Build Coastguard Worker                 _mm_slli_epi16(_mm_setr_epi16(-y * dx, (1 << 6) - y * dx,
2160*77c1e3ccSAndroid Build Coastguard Worker                                               (2 << 6) - y * dx,
2161*77c1e3ccSAndroid Build Coastguard Worker                                               (3 << 6) - y * dx, 0, 0, 0, 0),
2162*77c1e3ccSAndroid Build Coastguard Worker                                upsample_above),
2163*77c1e3ccSAndroid Build Coastguard Worker                 c3f),
2164*77c1e3ccSAndroid Build Coastguard Worker             1));
2165*77c1e3ccSAndroid Build Coastguard Worker       } else {
2166*77c1e3ccSAndroid Build Coastguard Worker         a0_x128 =
2167*77c1e3ccSAndroid Build Coastguard Worker             _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
2168*77c1e3ccSAndroid Build Coastguard Worker         a1_x128 = _mm_srli_si128(a0_x128, 2);
2169*77c1e3ccSAndroid Build Coastguard Worker 
2170*77c1e3ccSAndroid Build Coastguard Worker         shift = _mm256_castsi128_si256(_mm_srli_epi16(
2171*77c1e3ccSAndroid Build Coastguard Worker             _mm_and_si128(
2172*77c1e3ccSAndroid Build Coastguard Worker                 _mm_setr_epi16(-y * dx, (1 << 6) - y * dx, (2 << 6) - y * dx,
2173*77c1e3ccSAndroid Build Coastguard Worker                                (3 << 6) - y * dx, 0, 0, 0, 0),
2174*77c1e3ccSAndroid Build Coastguard Worker                 c3f),
2175*77c1e3ccSAndroid Build Coastguard Worker             1));
2176*77c1e3ccSAndroid Build Coastguard Worker       }
2177*77c1e3ccSAndroid Build Coastguard Worker       a0_x = _mm256_castsi128_si256(a0_x128);
2178*77c1e3ccSAndroid Build Coastguard Worker       a1_x = _mm256_castsi128_si256(a1_x128);
2179*77c1e3ccSAndroid Build Coastguard Worker     }
2180*77c1e3ccSAndroid Build Coastguard Worker     // y calc
2181*77c1e3ccSAndroid Build Coastguard Worker     __m128i a0_y, a1_y, shifty;
2182*77c1e3ccSAndroid Build Coastguard Worker     if (base_x < min_base_x) {
2183*77c1e3ccSAndroid Build Coastguard Worker       __m128i r6, c1234, dy128, y_c128, base_y_c128, mask128;
2184*77c1e3ccSAndroid Build Coastguard Worker       DECLARE_ALIGNED(32, int16_t, base_y_c[8]);
2185*77c1e3ccSAndroid Build Coastguard Worker       r6 = _mm_set1_epi16(r << 6);
2186*77c1e3ccSAndroid Build Coastguard Worker       dy128 = _mm_set1_epi16(dy);
2187*77c1e3ccSAndroid Build Coastguard Worker       c1234 = _mm_setr_epi16(1, 2, 3, 4, 0, 0, 0, 0);
2188*77c1e3ccSAndroid Build Coastguard Worker       y_c128 = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234, dy128));
2189*77c1e3ccSAndroid Build Coastguard Worker       base_y_c128 = _mm_srai_epi16(y_c128, frac_bits_y);
2190*77c1e3ccSAndroid Build Coastguard Worker       mask128 = _mm_cmpgt_epi16(min_base_y128, base_y_c128);
2191*77c1e3ccSAndroid Build Coastguard Worker       base_y_c128 = _mm_andnot_si128(mask128, base_y_c128);
2192*77c1e3ccSAndroid Build Coastguard Worker       _mm_store_si128((__m128i *)base_y_c, base_y_c128);
2193*77c1e3ccSAndroid Build Coastguard Worker 
2194*77c1e3ccSAndroid Build Coastguard Worker       a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
2195*77c1e3ccSAndroid Build Coastguard Worker                             left[base_y_c[2]], left[base_y_c[3]], 0, 0, 0, 0);
2196*77c1e3ccSAndroid Build Coastguard Worker       a1_y = _mm_setr_epi16(left[base_y_c[0] + 1], left[base_y_c[1] + 1],
2197*77c1e3ccSAndroid Build Coastguard Worker                             left[base_y_c[2] + 1], left[base_y_c[3] + 1], 0, 0,
2198*77c1e3ccSAndroid Build Coastguard Worker                             0, 0);
2199*77c1e3ccSAndroid Build Coastguard Worker 
2200*77c1e3ccSAndroid Build Coastguard Worker       if (upsample_left) {
2201*77c1e3ccSAndroid Build Coastguard Worker         shifty = _mm_srli_epi16(
2202*77c1e3ccSAndroid Build Coastguard Worker             _mm_and_si128(_mm_slli_epi16(y_c128, upsample_left), c3f), 1);
2203*77c1e3ccSAndroid Build Coastguard Worker       } else {
2204*77c1e3ccSAndroid Build Coastguard Worker         shifty = _mm_srli_epi16(_mm_and_si128(y_c128, c3f), 1);
2205*77c1e3ccSAndroid Build Coastguard Worker       }
2206*77c1e3ccSAndroid Build Coastguard Worker       a0_x = _mm256_inserti128_si256(a0_x, a0_y, 1);
2207*77c1e3ccSAndroid Build Coastguard Worker       a1_x = _mm256_inserti128_si256(a1_x, a1_y, 1);
2208*77c1e3ccSAndroid Build Coastguard Worker       shift = _mm256_inserti128_si256(shift, shifty, 1);
2209*77c1e3ccSAndroid Build Coastguard Worker     }
2210*77c1e3ccSAndroid Build Coastguard Worker 
2211*77c1e3ccSAndroid Build Coastguard Worker     diff = _mm256_sub_epi16(a1_x, a0_x);  // a[x+1] - a[x]
2212*77c1e3ccSAndroid Build Coastguard Worker     a32 = _mm256_slli_epi16(a0_x, 5);     // a[x] * 32
2213*77c1e3ccSAndroid Build Coastguard Worker     a32 = _mm256_add_epi16(a32, a16);     // a[x] * 32 + 16
2214*77c1e3ccSAndroid Build Coastguard Worker 
2215*77c1e3ccSAndroid Build Coastguard Worker     b = _mm256_mullo_epi16(diff, shift);
2216*77c1e3ccSAndroid Build Coastguard Worker     res = _mm256_add_epi16(a32, b);
2217*77c1e3ccSAndroid Build Coastguard Worker     res = _mm256_srli_epi16(res, 5);
2218*77c1e3ccSAndroid Build Coastguard Worker 
2219*77c1e3ccSAndroid Build Coastguard Worker     resx = _mm256_castsi256_si128(res);
2220*77c1e3ccSAndroid Build Coastguard Worker     resy = _mm256_extracti128_si256(res, 1);
2221*77c1e3ccSAndroid Build Coastguard Worker     resxy =
2222*77c1e3ccSAndroid Build Coastguard Worker         _mm_blendv_epi8(resx, resy, *(__m128i *)HighbdBaseMask[base_min_diff]);
2223*77c1e3ccSAndroid Build Coastguard Worker     _mm_storel_epi64((__m128i *)(dst), resxy);
2224*77c1e3ccSAndroid Build Coastguard Worker     dst += stride;
2225*77c1e3ccSAndroid Build Coastguard Worker   }
2226*77c1e3ccSAndroid Build Coastguard Worker }
2227*77c1e3ccSAndroid Build Coastguard Worker 
highbd_dr_prediction_32bit_z2_Nx8_avx2(int N,uint16_t * dst,ptrdiff_t stride,const uint16_t * above,const uint16_t * left,int upsample_above,int upsample_left,int dx,int dy)2228*77c1e3ccSAndroid Build Coastguard Worker static void highbd_dr_prediction_32bit_z2_Nx8_avx2(
2229*77c1e3ccSAndroid Build Coastguard Worker     int N, uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
2230*77c1e3ccSAndroid Build Coastguard Worker     const uint16_t *left, int upsample_above, int upsample_left, int dx,
2231*77c1e3ccSAndroid Build Coastguard Worker     int dy) {
2232*77c1e3ccSAndroid Build Coastguard Worker   const int min_base_x = -(1 << upsample_above);
2233*77c1e3ccSAndroid Build Coastguard Worker   const int min_base_y = -(1 << upsample_left);
2234*77c1e3ccSAndroid Build Coastguard Worker   const int frac_bits_x = 6 - upsample_above;
2235*77c1e3ccSAndroid Build Coastguard Worker   const int frac_bits_y = 6 - upsample_left;
2236*77c1e3ccSAndroid Build Coastguard Worker 
2237*77c1e3ccSAndroid Build Coastguard Worker   // pre-filter above pixels
2238*77c1e3ccSAndroid Build Coastguard Worker   // store in temp buffers:
2239*77c1e3ccSAndroid Build Coastguard Worker   //   above[x] * 32 + 16
2240*77c1e3ccSAndroid Build Coastguard Worker   //   above[x+1] - above[x]
2241*77c1e3ccSAndroid Build Coastguard Worker   // final pixels will be calculated as:
2242*77c1e3ccSAndroid Build Coastguard Worker   //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
2243*77c1e3ccSAndroid Build Coastguard Worker   __m256i a0_x, a1_x, a0_y, a1_y, a32, a16, c3f, min_base_y256;
2244*77c1e3ccSAndroid Build Coastguard Worker   __m256i diff;
2245*77c1e3ccSAndroid Build Coastguard Worker   __m128i a0_x128, a1_x128;
2246*77c1e3ccSAndroid Build Coastguard Worker 
2247*77c1e3ccSAndroid Build Coastguard Worker   a16 = _mm256_set1_epi32(16);
2248*77c1e3ccSAndroid Build Coastguard Worker   c3f = _mm256_set1_epi32(0x3f);
2249*77c1e3ccSAndroid Build Coastguard Worker   min_base_y256 = _mm256_set1_epi32(min_base_y);
2250*77c1e3ccSAndroid Build Coastguard Worker 
2251*77c1e3ccSAndroid Build Coastguard Worker   for (int r = 0; r < N; r++) {
2252*77c1e3ccSAndroid Build Coastguard Worker     __m256i b, res, shift;
2253*77c1e3ccSAndroid Build Coastguard Worker     __m128i resx, resy, resxy;
2254*77c1e3ccSAndroid Build Coastguard Worker     int y = r + 1;
2255*77c1e3ccSAndroid Build Coastguard Worker     int base_x = (-y * dx) >> frac_bits_x;
2256*77c1e3ccSAndroid Build Coastguard Worker     int base_shift = 0;
2257*77c1e3ccSAndroid Build Coastguard Worker     if (base_x < (min_base_x - 1)) {
2258*77c1e3ccSAndroid Build Coastguard Worker       base_shift = (min_base_x - base_x - 1) >> upsample_above;
2259*77c1e3ccSAndroid Build Coastguard Worker     }
2260*77c1e3ccSAndroid Build Coastguard Worker     int base_min_diff =
2261*77c1e3ccSAndroid Build Coastguard Worker         (min_base_x - base_x + upsample_above) >> upsample_above;
2262*77c1e3ccSAndroid Build Coastguard Worker     if (base_min_diff > 8) {
2263*77c1e3ccSAndroid Build Coastguard Worker       base_min_diff = 8;
2264*77c1e3ccSAndroid Build Coastguard Worker     } else {
2265*77c1e3ccSAndroid Build Coastguard Worker       if (base_min_diff < 0) base_min_diff = 0;
2266*77c1e3ccSAndroid Build Coastguard Worker     }
2267*77c1e3ccSAndroid Build Coastguard Worker 
2268*77c1e3ccSAndroid Build Coastguard Worker     if (base_shift > 7) {
2269*77c1e3ccSAndroid Build Coastguard Worker       resx = _mm_setzero_si128();
2270*77c1e3ccSAndroid Build Coastguard Worker     } else {
2271*77c1e3ccSAndroid Build Coastguard Worker       a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
2272*77c1e3ccSAndroid Build Coastguard Worker       if (upsample_above) {
2273*77c1e3ccSAndroid Build Coastguard Worker         __m128i mask, atmp0, atmp1, atmp2, atmp3;
2274*77c1e3ccSAndroid Build Coastguard Worker         a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + 8 + base_shift));
2275*77c1e3ccSAndroid Build Coastguard Worker         atmp0 = _mm_shuffle_epi8(a0_x128,
2276*77c1e3ccSAndroid Build Coastguard Worker                                  *(__m128i *)HighbdEvenOddMaskx[base_shift]);
2277*77c1e3ccSAndroid Build Coastguard Worker         atmp1 = _mm_shuffle_epi8(a1_x128,
2278*77c1e3ccSAndroid Build Coastguard Worker                                  *(__m128i *)HighbdEvenOddMaskx[base_shift]);
2279*77c1e3ccSAndroid Build Coastguard Worker         atmp2 = _mm_shuffle_epi8(
2280*77c1e3ccSAndroid Build Coastguard Worker             a0_x128, *(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16));
2281*77c1e3ccSAndroid Build Coastguard Worker         atmp3 = _mm_shuffle_epi8(
2282*77c1e3ccSAndroid Build Coastguard Worker             a1_x128, *(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16));
2283*77c1e3ccSAndroid Build Coastguard Worker         mask = _mm_cmpgt_epi8(*(__m128i *)HighbdEvenOddMaskx[base_shift],
2284*77c1e3ccSAndroid Build Coastguard Worker                               _mm_set1_epi8(15));
2285*77c1e3ccSAndroid Build Coastguard Worker         a0_x128 = _mm_blendv_epi8(atmp0, atmp1, mask);
2286*77c1e3ccSAndroid Build Coastguard Worker         mask = _mm_cmpgt_epi8(*(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16),
2287*77c1e3ccSAndroid Build Coastguard Worker                               _mm_set1_epi8(15));
2288*77c1e3ccSAndroid Build Coastguard Worker         a1_x128 = _mm_blendv_epi8(atmp2, atmp3, mask);
2289*77c1e3ccSAndroid Build Coastguard Worker         shift = _mm256_srli_epi32(
2290*77c1e3ccSAndroid Build Coastguard Worker             _mm256_and_si256(
2291*77c1e3ccSAndroid Build Coastguard Worker                 _mm256_slli_epi32(
2292*77c1e3ccSAndroid Build Coastguard Worker                     _mm256_setr_epi32(-y * dx, (1 << 6) - y * dx,
2293*77c1e3ccSAndroid Build Coastguard Worker                                       (2 << 6) - y * dx, (3 << 6) - y * dx,
2294*77c1e3ccSAndroid Build Coastguard Worker                                       (4 << 6) - y * dx, (5 << 6) - y * dx,
2295*77c1e3ccSAndroid Build Coastguard Worker                                       (6 << 6) - y * dx, (7 << 6) - y * dx),
2296*77c1e3ccSAndroid Build Coastguard Worker                     upsample_above),
2297*77c1e3ccSAndroid Build Coastguard Worker                 c3f),
2298*77c1e3ccSAndroid Build Coastguard Worker             1);
2299*77c1e3ccSAndroid Build Coastguard Worker       } else {
2300*77c1e3ccSAndroid Build Coastguard Worker         a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + 1 + base_shift));
2301*77c1e3ccSAndroid Build Coastguard Worker         a0_x128 =
2302*77c1e3ccSAndroid Build Coastguard Worker             _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
2303*77c1e3ccSAndroid Build Coastguard Worker         a1_x128 =
2304*77c1e3ccSAndroid Build Coastguard Worker             _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
2305*77c1e3ccSAndroid Build Coastguard Worker 
2306*77c1e3ccSAndroid Build Coastguard Worker         shift = _mm256_srli_epi32(
2307*77c1e3ccSAndroid Build Coastguard Worker             _mm256_and_si256(
2308*77c1e3ccSAndroid Build Coastguard Worker                 _mm256_setr_epi32(-y * dx, (1 << 6) - y * dx, (2 << 6) - y * dx,
2309*77c1e3ccSAndroid Build Coastguard Worker                                   (3 << 6) - y * dx, (4 << 6) - y * dx,
2310*77c1e3ccSAndroid Build Coastguard Worker                                   (5 << 6) - y * dx, (6 << 6) - y * dx,
2311*77c1e3ccSAndroid Build Coastguard Worker                                   (7 << 6) - y * dx),
2312*77c1e3ccSAndroid Build Coastguard Worker                 c3f),
2313*77c1e3ccSAndroid Build Coastguard Worker             1);
2314*77c1e3ccSAndroid Build Coastguard Worker       }
2315*77c1e3ccSAndroid Build Coastguard Worker       a0_x = _mm256_cvtepu16_epi32(a0_x128);
2316*77c1e3ccSAndroid Build Coastguard Worker       a1_x = _mm256_cvtepu16_epi32(a1_x128);
2317*77c1e3ccSAndroid Build Coastguard Worker 
2318*77c1e3ccSAndroid Build Coastguard Worker       diff = _mm256_sub_epi32(a1_x, a0_x);  // a[x+1] - a[x]
2319*77c1e3ccSAndroid Build Coastguard Worker       a32 = _mm256_slli_epi32(a0_x, 5);     // a[x] * 32
2320*77c1e3ccSAndroid Build Coastguard Worker       a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
2321*77c1e3ccSAndroid Build Coastguard Worker 
2322*77c1e3ccSAndroid Build Coastguard Worker       b = _mm256_mullo_epi32(diff, shift);
2323*77c1e3ccSAndroid Build Coastguard Worker       res = _mm256_add_epi32(a32, b);
2324*77c1e3ccSAndroid Build Coastguard Worker       res = _mm256_srli_epi32(res, 5);
2325*77c1e3ccSAndroid Build Coastguard Worker 
2326*77c1e3ccSAndroid Build Coastguard Worker       resx = _mm256_castsi256_si128(_mm256_packus_epi32(
2327*77c1e3ccSAndroid Build Coastguard Worker           res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))));
2328*77c1e3ccSAndroid Build Coastguard Worker     }
2329*77c1e3ccSAndroid Build Coastguard Worker     // y calc
2330*77c1e3ccSAndroid Build Coastguard Worker     if (base_x < min_base_x) {
2331*77c1e3ccSAndroid Build Coastguard Worker       DECLARE_ALIGNED(32, int, base_y_c[8]);
2332*77c1e3ccSAndroid Build Coastguard Worker       __m256i r6, c256, dy256, y_c256, base_y_c256, mask256;
2333*77c1e3ccSAndroid Build Coastguard Worker       r6 = _mm256_set1_epi32(r << 6);
2334*77c1e3ccSAndroid Build Coastguard Worker       dy256 = _mm256_set1_epi32(dy);
2335*77c1e3ccSAndroid Build Coastguard Worker       c256 = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
2336*77c1e3ccSAndroid Build Coastguard Worker       y_c256 = _mm256_sub_epi32(r6, _mm256_mullo_epi32(c256, dy256));
2337*77c1e3ccSAndroid Build Coastguard Worker       base_y_c256 = _mm256_srai_epi32(y_c256, frac_bits_y);
2338*77c1e3ccSAndroid Build Coastguard Worker       mask256 = _mm256_cmpgt_epi32(min_base_y256, base_y_c256);
2339*77c1e3ccSAndroid Build Coastguard Worker       base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256);
2340*77c1e3ccSAndroid Build Coastguard Worker       _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
2341*77c1e3ccSAndroid Build Coastguard Worker 
2342*77c1e3ccSAndroid Build Coastguard Worker       a0_y = _mm256_cvtepu16_epi32(_mm_setr_epi16(
2343*77c1e3ccSAndroid Build Coastguard Worker           left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
2344*77c1e3ccSAndroid Build Coastguard Worker           left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
2345*77c1e3ccSAndroid Build Coastguard Worker           left[base_y_c[6]], left[base_y_c[7]]));
2346*77c1e3ccSAndroid Build Coastguard Worker       a1_y = _mm256_cvtepu16_epi32(_mm_setr_epi16(
2347*77c1e3ccSAndroid Build Coastguard Worker           left[base_y_c[0] + 1], left[base_y_c[1] + 1], left[base_y_c[2] + 1],
2348*77c1e3ccSAndroid Build Coastguard Worker           left[base_y_c[3] + 1], left[base_y_c[4] + 1], left[base_y_c[5] + 1],
2349*77c1e3ccSAndroid Build Coastguard Worker           left[base_y_c[6] + 1], left[base_y_c[7] + 1]));
2350*77c1e3ccSAndroid Build Coastguard Worker 
2351*77c1e3ccSAndroid Build Coastguard Worker       if (upsample_left) {
2352*77c1e3ccSAndroid Build Coastguard Worker         shift = _mm256_srli_epi32(
2353*77c1e3ccSAndroid Build Coastguard Worker             _mm256_and_si256(_mm256_slli_epi32((y_c256), upsample_left), c3f),
2354*77c1e3ccSAndroid Build Coastguard Worker             1);
2355*77c1e3ccSAndroid Build Coastguard Worker       } else {
2356*77c1e3ccSAndroid Build Coastguard Worker         shift = _mm256_srli_epi32(_mm256_and_si256(y_c256, c3f), 1);
2357*77c1e3ccSAndroid Build Coastguard Worker       }
2358*77c1e3ccSAndroid Build Coastguard Worker       diff = _mm256_sub_epi32(a1_y, a0_y);  // a[x+1] - a[x]
2359*77c1e3ccSAndroid Build Coastguard Worker       a32 = _mm256_slli_epi32(a0_y, 5);     // a[x] * 32
2360*77c1e3ccSAndroid Build Coastguard Worker       a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
2361*77c1e3ccSAndroid Build Coastguard Worker 
2362*77c1e3ccSAndroid Build Coastguard Worker       b = _mm256_mullo_epi32(diff, shift);
2363*77c1e3ccSAndroid Build Coastguard Worker       res = _mm256_add_epi32(a32, b);
2364*77c1e3ccSAndroid Build Coastguard Worker       res = _mm256_srli_epi32(res, 5);
2365*77c1e3ccSAndroid Build Coastguard Worker 
2366*77c1e3ccSAndroid Build Coastguard Worker       resy = _mm256_castsi256_si128(_mm256_packus_epi32(
2367*77c1e3ccSAndroid Build Coastguard Worker           res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))));
2368*77c1e3ccSAndroid Build Coastguard Worker     } else {
2369*77c1e3ccSAndroid Build Coastguard Worker       resy = resx;
2370*77c1e3ccSAndroid Build Coastguard Worker     }
2371*77c1e3ccSAndroid Build Coastguard Worker     resxy =
2372*77c1e3ccSAndroid Build Coastguard Worker         _mm_blendv_epi8(resx, resy, *(__m128i *)HighbdBaseMask[base_min_diff]);
2373*77c1e3ccSAndroid Build Coastguard Worker     _mm_storeu_si128((__m128i *)(dst), resxy);
2374*77c1e3ccSAndroid Build Coastguard Worker     dst += stride;
2375*77c1e3ccSAndroid Build Coastguard Worker   }
2376*77c1e3ccSAndroid Build Coastguard Worker }
2377*77c1e3ccSAndroid Build Coastguard Worker 
highbd_dr_prediction_z2_Nx8_avx2(int N,uint16_t * dst,ptrdiff_t stride,const uint16_t * above,const uint16_t * left,int upsample_above,int upsample_left,int dx,int dy)2378*77c1e3ccSAndroid Build Coastguard Worker static void highbd_dr_prediction_z2_Nx8_avx2(
2379*77c1e3ccSAndroid Build Coastguard Worker     int N, uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
2380*77c1e3ccSAndroid Build Coastguard Worker     const uint16_t *left, int upsample_above, int upsample_left, int dx,
2381*77c1e3ccSAndroid Build Coastguard Worker     int dy) {
2382*77c1e3ccSAndroid Build Coastguard Worker   const int min_base_x = -(1 << upsample_above);
2383*77c1e3ccSAndroid Build Coastguard Worker   const int min_base_y = -(1 << upsample_left);
2384*77c1e3ccSAndroid Build Coastguard Worker   const int frac_bits_x = 6 - upsample_above;
2385*77c1e3ccSAndroid Build Coastguard Worker   const int frac_bits_y = 6 - upsample_left;
2386*77c1e3ccSAndroid Build Coastguard Worker 
2387*77c1e3ccSAndroid Build Coastguard Worker   // pre-filter above pixels
2388*77c1e3ccSAndroid Build Coastguard Worker   // store in temp buffers:
2389*77c1e3ccSAndroid Build Coastguard Worker   //   above[x] * 32 + 16
2390*77c1e3ccSAndroid Build Coastguard Worker   //   above[x+1] - above[x]
2391*77c1e3ccSAndroid Build Coastguard Worker   // final pixels will be calculated as:
2392*77c1e3ccSAndroid Build Coastguard Worker   //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
2393*77c1e3ccSAndroid Build Coastguard Worker   __m128i c3f, min_base_y128;
2394*77c1e3ccSAndroid Build Coastguard Worker   __m256i a0_x, a1_x, diff, a32, a16;
2395*77c1e3ccSAndroid Build Coastguard Worker   __m128i a0_x128, a1_x128;
2396*77c1e3ccSAndroid Build Coastguard Worker 
2397*77c1e3ccSAndroid Build Coastguard Worker   a16 = _mm256_set1_epi16(16);
2398*77c1e3ccSAndroid Build Coastguard Worker   c3f = _mm_set1_epi16(0x3f);
2399*77c1e3ccSAndroid Build Coastguard Worker   min_base_y128 = _mm_set1_epi16(min_base_y);
2400*77c1e3ccSAndroid Build Coastguard Worker 
2401*77c1e3ccSAndroid Build Coastguard Worker   for (int r = 0; r < N; r++) {
2402*77c1e3ccSAndroid Build Coastguard Worker     __m256i b, res, shift;
2403*77c1e3ccSAndroid Build Coastguard Worker     __m128i resx, resy, resxy;
2404*77c1e3ccSAndroid Build Coastguard Worker     int y = r + 1;
2405*77c1e3ccSAndroid Build Coastguard Worker     int base_x = (-y * dx) >> frac_bits_x;
2406*77c1e3ccSAndroid Build Coastguard Worker     int base_shift = 0;
2407*77c1e3ccSAndroid Build Coastguard Worker     if (base_x < (min_base_x - 1)) {
2408*77c1e3ccSAndroid Build Coastguard Worker       base_shift = (min_base_x - base_x - 1) >> upsample_above;
2409*77c1e3ccSAndroid Build Coastguard Worker     }
2410*77c1e3ccSAndroid Build Coastguard Worker     int base_min_diff =
2411*77c1e3ccSAndroid Build Coastguard Worker         (min_base_x - base_x + upsample_above) >> upsample_above;
2412*77c1e3ccSAndroid Build Coastguard Worker     if (base_min_diff > 8) {
2413*77c1e3ccSAndroid Build Coastguard Worker       base_min_diff = 8;
2414*77c1e3ccSAndroid Build Coastguard Worker     } else {
2415*77c1e3ccSAndroid Build Coastguard Worker       if (base_min_diff < 0) base_min_diff = 0;
2416*77c1e3ccSAndroid Build Coastguard Worker     }
2417*77c1e3ccSAndroid Build Coastguard Worker 
2418*77c1e3ccSAndroid Build Coastguard Worker     if (base_shift > 7) {
2419*77c1e3ccSAndroid Build Coastguard Worker       a0_x = _mm256_setzero_si256();
2420*77c1e3ccSAndroid Build Coastguard Worker       a1_x = _mm256_setzero_si256();
2421*77c1e3ccSAndroid Build Coastguard Worker       shift = _mm256_setzero_si256();
2422*77c1e3ccSAndroid Build Coastguard Worker     } else {
2423*77c1e3ccSAndroid Build Coastguard Worker       a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
2424*77c1e3ccSAndroid Build Coastguard Worker       if (upsample_above) {
2425*77c1e3ccSAndroid Build Coastguard Worker         __m128i mask, atmp0, atmp1, atmp2, atmp3;
2426*77c1e3ccSAndroid Build Coastguard Worker         a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + 8 + base_shift));
2427*77c1e3ccSAndroid Build Coastguard Worker         atmp0 = _mm_shuffle_epi8(a0_x128,
2428*77c1e3ccSAndroid Build Coastguard Worker                                  *(__m128i *)HighbdEvenOddMaskx[base_shift]);
2429*77c1e3ccSAndroid Build Coastguard Worker         atmp1 = _mm_shuffle_epi8(a1_x128,
2430*77c1e3ccSAndroid Build Coastguard Worker                                  *(__m128i *)HighbdEvenOddMaskx[base_shift]);
2431*77c1e3ccSAndroid Build Coastguard Worker         atmp2 = _mm_shuffle_epi8(
2432*77c1e3ccSAndroid Build Coastguard Worker             a0_x128, *(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16));
2433*77c1e3ccSAndroid Build Coastguard Worker         atmp3 = _mm_shuffle_epi8(
2434*77c1e3ccSAndroid Build Coastguard Worker             a1_x128, *(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16));
2435*77c1e3ccSAndroid Build Coastguard Worker         mask = _mm_cmpgt_epi8(*(__m128i *)HighbdEvenOddMaskx[base_shift],
2436*77c1e3ccSAndroid Build Coastguard Worker                               _mm_set1_epi8(15));
2437*77c1e3ccSAndroid Build Coastguard Worker         a0_x128 = _mm_blendv_epi8(atmp0, atmp1, mask);
2438*77c1e3ccSAndroid Build Coastguard Worker         mask = _mm_cmpgt_epi8(*(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16),
2439*77c1e3ccSAndroid Build Coastguard Worker                               _mm_set1_epi8(15));
2440*77c1e3ccSAndroid Build Coastguard Worker         a1_x128 = _mm_blendv_epi8(atmp2, atmp3, mask);
2441*77c1e3ccSAndroid Build Coastguard Worker 
2442*77c1e3ccSAndroid Build Coastguard Worker         shift = _mm256_castsi128_si256(_mm_srli_epi16(
2443*77c1e3ccSAndroid Build Coastguard Worker             _mm_and_si128(
2444*77c1e3ccSAndroid Build Coastguard Worker                 _mm_slli_epi16(
2445*77c1e3ccSAndroid Build Coastguard Worker                     _mm_setr_epi16(-y * dx, (1 << 6) - y * dx,
2446*77c1e3ccSAndroid Build Coastguard Worker                                    (2 << 6) - y * dx, (3 << 6) - y * dx,
2447*77c1e3ccSAndroid Build Coastguard Worker                                    (4 << 6) - y * dx, (5 << 6) - y * dx,
2448*77c1e3ccSAndroid Build Coastguard Worker                                    (6 << 6) - y * dx, (7 << 6) - y * dx),
2449*77c1e3ccSAndroid Build Coastguard Worker                     upsample_above),
2450*77c1e3ccSAndroid Build Coastguard Worker                 c3f),
2451*77c1e3ccSAndroid Build Coastguard Worker             1));
2452*77c1e3ccSAndroid Build Coastguard Worker       } else {
2453*77c1e3ccSAndroid Build Coastguard Worker         a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + 1 + base_shift));
2454*77c1e3ccSAndroid Build Coastguard Worker         a0_x128 =
2455*77c1e3ccSAndroid Build Coastguard Worker             _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
2456*77c1e3ccSAndroid Build Coastguard Worker         a1_x128 =
2457*77c1e3ccSAndroid Build Coastguard Worker             _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
2458*77c1e3ccSAndroid Build Coastguard Worker 
2459*77c1e3ccSAndroid Build Coastguard Worker         shift = _mm256_castsi128_si256(_mm_srli_epi16(
2460*77c1e3ccSAndroid Build Coastguard Worker             _mm_and_si128(_mm_setr_epi16(-y * dx, (1 << 6) - y * dx,
2461*77c1e3ccSAndroid Build Coastguard Worker                                          (2 << 6) - y * dx, (3 << 6) - y * dx,
2462*77c1e3ccSAndroid Build Coastguard Worker                                          (4 << 6) - y * dx, (5 << 6) - y * dx,
2463*77c1e3ccSAndroid Build Coastguard Worker                                          (6 << 6) - y * dx, (7 << 6) - y * dx),
2464*77c1e3ccSAndroid Build Coastguard Worker                           c3f),
2465*77c1e3ccSAndroid Build Coastguard Worker             1));
2466*77c1e3ccSAndroid Build Coastguard Worker       }
2467*77c1e3ccSAndroid Build Coastguard Worker       a0_x = _mm256_castsi128_si256(a0_x128);
2468*77c1e3ccSAndroid Build Coastguard Worker       a1_x = _mm256_castsi128_si256(a1_x128);
2469*77c1e3ccSAndroid Build Coastguard Worker     }
2470*77c1e3ccSAndroid Build Coastguard Worker 
2471*77c1e3ccSAndroid Build Coastguard Worker     // y calc
2472*77c1e3ccSAndroid Build Coastguard Worker     __m128i a0_y, a1_y, shifty;
2473*77c1e3ccSAndroid Build Coastguard Worker     if (base_x < min_base_x) {
2474*77c1e3ccSAndroid Build Coastguard Worker       DECLARE_ALIGNED(32, int16_t, base_y_c[8]);
2475*77c1e3ccSAndroid Build Coastguard Worker       __m128i r6, c1234, dy128, y_c128, base_y_c128, mask128;
2476*77c1e3ccSAndroid Build Coastguard Worker       r6 = _mm_set1_epi16(r << 6);
2477*77c1e3ccSAndroid Build Coastguard Worker       dy128 = _mm_set1_epi16(dy);
2478*77c1e3ccSAndroid Build Coastguard Worker       c1234 = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
2479*77c1e3ccSAndroid Build Coastguard Worker       y_c128 = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234, dy128));
2480*77c1e3ccSAndroid Build Coastguard Worker       base_y_c128 = _mm_srai_epi16(y_c128, frac_bits_y);
2481*77c1e3ccSAndroid Build Coastguard Worker       mask128 = _mm_cmpgt_epi16(min_base_y128, base_y_c128);
2482*77c1e3ccSAndroid Build Coastguard Worker       base_y_c128 = _mm_andnot_si128(mask128, base_y_c128);
2483*77c1e3ccSAndroid Build Coastguard Worker       _mm_store_si128((__m128i *)base_y_c, base_y_c128);
2484*77c1e3ccSAndroid Build Coastguard Worker 
2485*77c1e3ccSAndroid Build Coastguard Worker       a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
2486*77c1e3ccSAndroid Build Coastguard Worker                             left[base_y_c[2]], left[base_y_c[3]],
2487*77c1e3ccSAndroid Build Coastguard Worker                             left[base_y_c[4]], left[base_y_c[5]],
2488*77c1e3ccSAndroid Build Coastguard Worker                             left[base_y_c[6]], left[base_y_c[7]]);
2489*77c1e3ccSAndroid Build Coastguard Worker       a1_y = _mm_setr_epi16(left[base_y_c[0] + 1], left[base_y_c[1] + 1],
2490*77c1e3ccSAndroid Build Coastguard Worker                             left[base_y_c[2] + 1], left[base_y_c[3] + 1],
2491*77c1e3ccSAndroid Build Coastguard Worker                             left[base_y_c[4] + 1], left[base_y_c[5] + 1],
2492*77c1e3ccSAndroid Build Coastguard Worker                             left[base_y_c[6] + 1], left[base_y_c[7] + 1]);
2493*77c1e3ccSAndroid Build Coastguard Worker 
2494*77c1e3ccSAndroid Build Coastguard Worker       if (upsample_left) {
2495*77c1e3ccSAndroid Build Coastguard Worker         shifty = _mm_srli_epi16(
2496*77c1e3ccSAndroid Build Coastguard Worker             _mm_and_si128(_mm_slli_epi16((y_c128), upsample_left), c3f), 1);
2497*77c1e3ccSAndroid Build Coastguard Worker       } else {
2498*77c1e3ccSAndroid Build Coastguard Worker         shifty = _mm_srli_epi16(_mm_and_si128(y_c128, c3f), 1);
2499*77c1e3ccSAndroid Build Coastguard Worker       }
2500*77c1e3ccSAndroid Build Coastguard Worker       a0_x = _mm256_inserti128_si256(a0_x, a0_y, 1);
2501*77c1e3ccSAndroid Build Coastguard Worker       a1_x = _mm256_inserti128_si256(a1_x, a1_y, 1);
2502*77c1e3ccSAndroid Build Coastguard Worker       shift = _mm256_inserti128_si256(shift, shifty, 1);
2503*77c1e3ccSAndroid Build Coastguard Worker     }
2504*77c1e3ccSAndroid Build Coastguard Worker 
2505*77c1e3ccSAndroid Build Coastguard Worker     diff = _mm256_sub_epi16(a1_x, a0_x);  // a[x+1] - a[x]
2506*77c1e3ccSAndroid Build Coastguard Worker     a32 = _mm256_slli_epi16(a0_x, 5);     // a[x] * 32
2507*77c1e3ccSAndroid Build Coastguard Worker     a32 = _mm256_add_epi16(a32, a16);     // a[x] * 32 + 16
2508*77c1e3ccSAndroid Build Coastguard Worker 
2509*77c1e3ccSAndroid Build Coastguard Worker     b = _mm256_mullo_epi16(diff, shift);
2510*77c1e3ccSAndroid Build Coastguard Worker     res = _mm256_add_epi16(a32, b);
2511*77c1e3ccSAndroid Build Coastguard Worker     res = _mm256_srli_epi16(res, 5);
2512*77c1e3ccSAndroid Build Coastguard Worker 
2513*77c1e3ccSAndroid Build Coastguard Worker     resx = _mm256_castsi256_si128(res);
2514*77c1e3ccSAndroid Build Coastguard Worker     resy = _mm256_extracti128_si256(res, 1);
2515*77c1e3ccSAndroid Build Coastguard Worker 
2516*77c1e3ccSAndroid Build Coastguard Worker     resxy =
2517*77c1e3ccSAndroid Build Coastguard Worker         _mm_blendv_epi8(resx, resy, *(__m128i *)HighbdBaseMask[base_min_diff]);
2518*77c1e3ccSAndroid Build Coastguard Worker     _mm_storeu_si128((__m128i *)(dst), resxy);
2519*77c1e3ccSAndroid Build Coastguard Worker     dst += stride;
2520*77c1e3ccSAndroid Build Coastguard Worker   }
2521*77c1e3ccSAndroid Build Coastguard Worker }
2522*77c1e3ccSAndroid Build Coastguard Worker 
highbd_dr_prediction_32bit_z2_HxW_avx2(int H,int W,uint16_t * dst,ptrdiff_t stride,const uint16_t * above,const uint16_t * left,int upsample_above,int upsample_left,int dx,int dy)2523*77c1e3ccSAndroid Build Coastguard Worker static void highbd_dr_prediction_32bit_z2_HxW_avx2(
2524*77c1e3ccSAndroid Build Coastguard Worker     int H, int W, uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
2525*77c1e3ccSAndroid Build Coastguard Worker     const uint16_t *left, int upsample_above, int upsample_left, int dx,
2526*77c1e3ccSAndroid Build Coastguard Worker     int dy) {
2527*77c1e3ccSAndroid Build Coastguard Worker   // here upsample_above and upsample_left are 0 by design of
2528*77c1e3ccSAndroid Build Coastguard Worker   // av1_use_intra_edge_upsample
2529*77c1e3ccSAndroid Build Coastguard Worker   const int min_base_x = -1;
2530*77c1e3ccSAndroid Build Coastguard Worker   const int min_base_y = -1;
2531*77c1e3ccSAndroid Build Coastguard Worker   (void)upsample_above;
2532*77c1e3ccSAndroid Build Coastguard Worker   (void)upsample_left;
2533*77c1e3ccSAndroid Build Coastguard Worker   const int frac_bits_x = 6;
2534*77c1e3ccSAndroid Build Coastguard Worker   const int frac_bits_y = 6;
2535*77c1e3ccSAndroid Build Coastguard Worker 
2536*77c1e3ccSAndroid Build Coastguard Worker   // pre-filter above pixels
2537*77c1e3ccSAndroid Build Coastguard Worker   // store in temp buffers:
2538*77c1e3ccSAndroid Build Coastguard Worker   //   above[x] * 32 + 16
2539*77c1e3ccSAndroid Build Coastguard Worker   //   above[x+1] - above[x]
2540*77c1e3ccSAndroid Build Coastguard Worker   // final pixels will be calculated as:
2541*77c1e3ccSAndroid Build Coastguard Worker   //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
2542*77c1e3ccSAndroid Build Coastguard Worker   __m256i a0_x, a1_x, a0_y, a1_y, a32, a0_1_x, a1_1_x, a16, c1;
2543*77c1e3ccSAndroid Build Coastguard Worker   __m256i diff, min_base_y256, c3f, dy256, c1234, c0123, c8;
2544*77c1e3ccSAndroid Build Coastguard Worker   __m128i a0_x128, a1_x128, a0_1_x128, a1_1_x128;
2545*77c1e3ccSAndroid Build Coastguard Worker   DECLARE_ALIGNED(32, int, base_y_c[16]);
2546*77c1e3ccSAndroid Build Coastguard Worker 
2547*77c1e3ccSAndroid Build Coastguard Worker   a16 = _mm256_set1_epi32(16);
2548*77c1e3ccSAndroid Build Coastguard Worker   c1 = _mm256_srli_epi32(a16, 4);
2549*77c1e3ccSAndroid Build Coastguard Worker   c8 = _mm256_srli_epi32(a16, 1);
2550*77c1e3ccSAndroid Build Coastguard Worker   min_base_y256 = _mm256_set1_epi32(min_base_y);
2551*77c1e3ccSAndroid Build Coastguard Worker   c3f = _mm256_set1_epi32(0x3f);
2552*77c1e3ccSAndroid Build Coastguard Worker   dy256 = _mm256_set1_epi32(dy);
2553*77c1e3ccSAndroid Build Coastguard Worker   c0123 = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
2554*77c1e3ccSAndroid Build Coastguard Worker   c1234 = _mm256_add_epi32(c0123, c1);
2555*77c1e3ccSAndroid Build Coastguard Worker 
2556*77c1e3ccSAndroid Build Coastguard Worker   for (int r = 0; r < H; r++) {
2557*77c1e3ccSAndroid Build Coastguard Worker     __m256i b, res, shift, ydx;
2558*77c1e3ccSAndroid Build Coastguard Worker     __m256i resx[2], resy[2];
2559*77c1e3ccSAndroid Build Coastguard Worker     __m256i resxy, j256, r6;
2560*77c1e3ccSAndroid Build Coastguard Worker     for (int j = 0; j < W; j += 16) {
2561*77c1e3ccSAndroid Build Coastguard Worker       j256 = _mm256_set1_epi32(j);
2562*77c1e3ccSAndroid Build Coastguard Worker       int y = r + 1;
2563*77c1e3ccSAndroid Build Coastguard Worker       ydx = _mm256_set1_epi32(y * dx);
2564*77c1e3ccSAndroid Build Coastguard Worker 
2565*77c1e3ccSAndroid Build Coastguard Worker       int base_x = ((j << 6) - y * dx) >> frac_bits_x;
2566*77c1e3ccSAndroid Build Coastguard Worker       int base_shift = 0;
2567*77c1e3ccSAndroid Build Coastguard Worker       if ((base_x) < (min_base_x - 1)) {
2568*77c1e3ccSAndroid Build Coastguard Worker         base_shift = (min_base_x - base_x - 1);
2569*77c1e3ccSAndroid Build Coastguard Worker       }
2570*77c1e3ccSAndroid Build Coastguard Worker       int base_min_diff = (min_base_x - base_x);
2571*77c1e3ccSAndroid Build Coastguard Worker       if (base_min_diff > 16) {
2572*77c1e3ccSAndroid Build Coastguard Worker         base_min_diff = 16;
2573*77c1e3ccSAndroid Build Coastguard Worker       } else {
2574*77c1e3ccSAndroid Build Coastguard Worker         if (base_min_diff < 0) base_min_diff = 0;
2575*77c1e3ccSAndroid Build Coastguard Worker       }
2576*77c1e3ccSAndroid Build Coastguard Worker 
2577*77c1e3ccSAndroid Build Coastguard Worker       if (base_shift > 7) {
2578*77c1e3ccSAndroid Build Coastguard Worker         resx[0] = _mm256_setzero_si256();
2579*77c1e3ccSAndroid Build Coastguard Worker       } else {
2580*77c1e3ccSAndroid Build Coastguard Worker         a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
2581*77c1e3ccSAndroid Build Coastguard Worker         a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 1));
2582*77c1e3ccSAndroid Build Coastguard Worker         a0_x128 =
2583*77c1e3ccSAndroid Build Coastguard Worker             _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
2584*77c1e3ccSAndroid Build Coastguard Worker         a1_x128 =
2585*77c1e3ccSAndroid Build Coastguard Worker             _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
2586*77c1e3ccSAndroid Build Coastguard Worker 
2587*77c1e3ccSAndroid Build Coastguard Worker         a0_x = _mm256_cvtepu16_epi32(a0_x128);
2588*77c1e3ccSAndroid Build Coastguard Worker         a1_x = _mm256_cvtepu16_epi32(a1_x128);
2589*77c1e3ccSAndroid Build Coastguard Worker 
2590*77c1e3ccSAndroid Build Coastguard Worker         r6 = _mm256_slli_epi32(_mm256_add_epi32(c0123, j256), 6);
2591*77c1e3ccSAndroid Build Coastguard Worker         shift = _mm256_srli_epi32(
2592*77c1e3ccSAndroid Build Coastguard Worker             _mm256_and_si256(_mm256_sub_epi32(r6, ydx), c3f), 1);
2593*77c1e3ccSAndroid Build Coastguard Worker 
2594*77c1e3ccSAndroid Build Coastguard Worker         diff = _mm256_sub_epi32(a1_x, a0_x);  // a[x+1] - a[x]
2595*77c1e3ccSAndroid Build Coastguard Worker         a32 = _mm256_slli_epi32(a0_x, 5);     // a[x] * 32
2596*77c1e3ccSAndroid Build Coastguard Worker         a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
2597*77c1e3ccSAndroid Build Coastguard Worker 
2598*77c1e3ccSAndroid Build Coastguard Worker         b = _mm256_mullo_epi32(diff, shift);
2599*77c1e3ccSAndroid Build Coastguard Worker         res = _mm256_add_epi32(a32, b);
2600*77c1e3ccSAndroid Build Coastguard Worker         res = _mm256_srli_epi32(res, 5);
2601*77c1e3ccSAndroid Build Coastguard Worker 
2602*77c1e3ccSAndroid Build Coastguard Worker         resx[0] = _mm256_packus_epi32(
2603*77c1e3ccSAndroid Build Coastguard Worker             res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1)));
2604*77c1e3ccSAndroid Build Coastguard Worker       }
2605*77c1e3ccSAndroid Build Coastguard Worker       int base_shift8 = 0;
2606*77c1e3ccSAndroid Build Coastguard Worker       if ((base_x + 8) < (min_base_x - 1)) {
2607*77c1e3ccSAndroid Build Coastguard Worker         base_shift8 = (min_base_x - (base_x + 8) - 1);
2608*77c1e3ccSAndroid Build Coastguard Worker       }
2609*77c1e3ccSAndroid Build Coastguard Worker       if (base_shift8 > 7) {
2610*77c1e3ccSAndroid Build Coastguard Worker         resx[1] = _mm256_setzero_si256();
2611*77c1e3ccSAndroid Build Coastguard Worker       } else {
2612*77c1e3ccSAndroid Build Coastguard Worker         a0_1_x128 =
2613*77c1e3ccSAndroid Build Coastguard Worker             _mm_loadu_si128((__m128i *)(above + base_x + base_shift8 + 8));
2614*77c1e3ccSAndroid Build Coastguard Worker         a1_1_x128 =
2615*77c1e3ccSAndroid Build Coastguard Worker             _mm_loadu_si128((__m128i *)(above + base_x + base_shift8 + 9));
2616*77c1e3ccSAndroid Build Coastguard Worker         a0_1_x128 = _mm_shuffle_epi8(a0_1_x128,
2617*77c1e3ccSAndroid Build Coastguard Worker                                      *(__m128i *)HighbdLoadMaskx[base_shift8]);
2618*77c1e3ccSAndroid Build Coastguard Worker         a1_1_x128 = _mm_shuffle_epi8(a1_1_x128,
2619*77c1e3ccSAndroid Build Coastguard Worker                                      *(__m128i *)HighbdLoadMaskx[base_shift8]);
2620*77c1e3ccSAndroid Build Coastguard Worker 
2621*77c1e3ccSAndroid Build Coastguard Worker         a0_1_x = _mm256_cvtepu16_epi32(a0_1_x128);
2622*77c1e3ccSAndroid Build Coastguard Worker         a1_1_x = _mm256_cvtepu16_epi32(a1_1_x128);
2623*77c1e3ccSAndroid Build Coastguard Worker 
2624*77c1e3ccSAndroid Build Coastguard Worker         r6 = _mm256_slli_epi32(
2625*77c1e3ccSAndroid Build Coastguard Worker             _mm256_add_epi32(c0123, _mm256_add_epi32(j256, c8)), 6);
2626*77c1e3ccSAndroid Build Coastguard Worker         shift = _mm256_srli_epi32(
2627*77c1e3ccSAndroid Build Coastguard Worker             _mm256_and_si256(_mm256_sub_epi32(r6, ydx), c3f), 1);
2628*77c1e3ccSAndroid Build Coastguard Worker 
2629*77c1e3ccSAndroid Build Coastguard Worker         diff = _mm256_sub_epi32(a1_1_x, a0_1_x);  // a[x+1] - a[x]
2630*77c1e3ccSAndroid Build Coastguard Worker         a32 = _mm256_slli_epi32(a0_1_x, 5);       // a[x] * 32
2631*77c1e3ccSAndroid Build Coastguard Worker         a32 = _mm256_add_epi32(a32, a16);         // a[x] * 32 + 16
2632*77c1e3ccSAndroid Build Coastguard Worker         b = _mm256_mullo_epi32(diff, shift);
2633*77c1e3ccSAndroid Build Coastguard Worker 
2634*77c1e3ccSAndroid Build Coastguard Worker         resx[1] = _mm256_add_epi32(a32, b);
2635*77c1e3ccSAndroid Build Coastguard Worker         resx[1] = _mm256_srli_epi32(resx[1], 5);
2636*77c1e3ccSAndroid Build Coastguard Worker         resx[1] = _mm256_packus_epi32(
2637*77c1e3ccSAndroid Build Coastguard Worker             resx[1],
2638*77c1e3ccSAndroid Build Coastguard Worker             _mm256_castsi128_si256(_mm256_extracti128_si256(resx[1], 1)));
2639*77c1e3ccSAndroid Build Coastguard Worker       }
2640*77c1e3ccSAndroid Build Coastguard Worker       resx[0] =
2641*77c1e3ccSAndroid Build Coastguard Worker           _mm256_inserti128_si256(resx[0], _mm256_castsi256_si128(resx[1]),
2642*77c1e3ccSAndroid Build Coastguard Worker                                   1);  // 16 16bit values
2643*77c1e3ccSAndroid Build Coastguard Worker 
2644*77c1e3ccSAndroid Build Coastguard Worker       // y calc
2645*77c1e3ccSAndroid Build Coastguard Worker       resy[0] = _mm256_setzero_si256();
2646*77c1e3ccSAndroid Build Coastguard Worker       if ((base_x < min_base_x)) {
2647*77c1e3ccSAndroid Build Coastguard Worker         __m256i c256, y_c256, y_c_1_256, base_y_c256, mask256;
2648*77c1e3ccSAndroid Build Coastguard Worker         r6 = _mm256_set1_epi32(r << 6);
2649*77c1e3ccSAndroid Build Coastguard Worker         c256 = _mm256_add_epi32(j256, c1234);
2650*77c1e3ccSAndroid Build Coastguard Worker         y_c256 = _mm256_sub_epi32(r6, _mm256_mullo_epi32(c256, dy256));
2651*77c1e3ccSAndroid Build Coastguard Worker         base_y_c256 = _mm256_srai_epi32(y_c256, frac_bits_y);
2652*77c1e3ccSAndroid Build Coastguard Worker         mask256 = _mm256_cmpgt_epi32(min_base_y256, base_y_c256);
2653*77c1e3ccSAndroid Build Coastguard Worker         base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256);
2654*77c1e3ccSAndroid Build Coastguard Worker         _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
2655*77c1e3ccSAndroid Build Coastguard Worker         c256 = _mm256_add_epi32(c256, c8);
2656*77c1e3ccSAndroid Build Coastguard Worker         y_c_1_256 = _mm256_sub_epi32(r6, _mm256_mullo_epi32(c256, dy256));
2657*77c1e3ccSAndroid Build Coastguard Worker         base_y_c256 = _mm256_srai_epi32(y_c_1_256, frac_bits_y);
2658*77c1e3ccSAndroid Build Coastguard Worker         mask256 = _mm256_cmpgt_epi32(min_base_y256, base_y_c256);
2659*77c1e3ccSAndroid Build Coastguard Worker         base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256);
2660*77c1e3ccSAndroid Build Coastguard Worker         _mm256_store_si256((__m256i *)(base_y_c + 8), base_y_c256);
2661*77c1e3ccSAndroid Build Coastguard Worker 
2662*77c1e3ccSAndroid Build Coastguard Worker         a0_y = _mm256_cvtepu16_epi32(_mm_setr_epi16(
2663*77c1e3ccSAndroid Build Coastguard Worker             left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
2664*77c1e3ccSAndroid Build Coastguard Worker             left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
2665*77c1e3ccSAndroid Build Coastguard Worker             left[base_y_c[6]], left[base_y_c[7]]));
2666*77c1e3ccSAndroid Build Coastguard Worker         a1_y = _mm256_cvtepu16_epi32(_mm_setr_epi16(
2667*77c1e3ccSAndroid Build Coastguard Worker             left[base_y_c[0] + 1], left[base_y_c[1] + 1], left[base_y_c[2] + 1],
2668*77c1e3ccSAndroid Build Coastguard Worker             left[base_y_c[3] + 1], left[base_y_c[4] + 1], left[base_y_c[5] + 1],
2669*77c1e3ccSAndroid Build Coastguard Worker             left[base_y_c[6] + 1], left[base_y_c[7] + 1]));
2670*77c1e3ccSAndroid Build Coastguard Worker 
2671*77c1e3ccSAndroid Build Coastguard Worker         shift = _mm256_srli_epi32(_mm256_and_si256(y_c256, c3f), 1);
2672*77c1e3ccSAndroid Build Coastguard Worker 
2673*77c1e3ccSAndroid Build Coastguard Worker         diff = _mm256_sub_epi32(a1_y, a0_y);  // a[x+1] - a[x]
2674*77c1e3ccSAndroid Build Coastguard Worker         a32 = _mm256_slli_epi32(a0_y, 5);     // a[x] * 32
2675*77c1e3ccSAndroid Build Coastguard Worker         a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
2676*77c1e3ccSAndroid Build Coastguard Worker 
2677*77c1e3ccSAndroid Build Coastguard Worker         b = _mm256_mullo_epi32(diff, shift);
2678*77c1e3ccSAndroid Build Coastguard Worker         res = _mm256_add_epi32(a32, b);
2679*77c1e3ccSAndroid Build Coastguard Worker         res = _mm256_srli_epi32(res, 5);
2680*77c1e3ccSAndroid Build Coastguard Worker 
2681*77c1e3ccSAndroid Build Coastguard Worker         resy[0] = _mm256_packus_epi32(
2682*77c1e3ccSAndroid Build Coastguard Worker             res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1)));
2683*77c1e3ccSAndroid Build Coastguard Worker 
2684*77c1e3ccSAndroid Build Coastguard Worker         a0_y = _mm256_cvtepu16_epi32(_mm_setr_epi16(
2685*77c1e3ccSAndroid Build Coastguard Worker             left[base_y_c[8]], left[base_y_c[9]], left[base_y_c[10]],
2686*77c1e3ccSAndroid Build Coastguard Worker             left[base_y_c[11]], left[base_y_c[12]], left[base_y_c[13]],
2687*77c1e3ccSAndroid Build Coastguard Worker             left[base_y_c[14]], left[base_y_c[15]]));
2688*77c1e3ccSAndroid Build Coastguard Worker         a1_y = _mm256_cvtepu16_epi32(
2689*77c1e3ccSAndroid Build Coastguard Worker             _mm_setr_epi16(left[base_y_c[8] + 1], left[base_y_c[9] + 1],
2690*77c1e3ccSAndroid Build Coastguard Worker                            left[base_y_c[10] + 1], left[base_y_c[11] + 1],
2691*77c1e3ccSAndroid Build Coastguard Worker                            left[base_y_c[12] + 1], left[base_y_c[13] + 1],
2692*77c1e3ccSAndroid Build Coastguard Worker                            left[base_y_c[14] + 1], left[base_y_c[15] + 1]));
2693*77c1e3ccSAndroid Build Coastguard Worker         shift = _mm256_srli_epi32(_mm256_and_si256(y_c_1_256, c3f), 1);
2694*77c1e3ccSAndroid Build Coastguard Worker 
2695*77c1e3ccSAndroid Build Coastguard Worker         diff = _mm256_sub_epi32(a1_y, a0_y);  // a[x+1] - a[x]
2696*77c1e3ccSAndroid Build Coastguard Worker         a32 = _mm256_slli_epi32(a0_y, 5);     // a[x] * 32
2697*77c1e3ccSAndroid Build Coastguard Worker         a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
2698*77c1e3ccSAndroid Build Coastguard Worker 
2699*77c1e3ccSAndroid Build Coastguard Worker         b = _mm256_mullo_epi32(diff, shift);
2700*77c1e3ccSAndroid Build Coastguard Worker         res = _mm256_add_epi32(a32, b);
2701*77c1e3ccSAndroid Build Coastguard Worker         res = _mm256_srli_epi32(res, 5);
2702*77c1e3ccSAndroid Build Coastguard Worker 
2703*77c1e3ccSAndroid Build Coastguard Worker         resy[1] = _mm256_packus_epi32(
2704*77c1e3ccSAndroid Build Coastguard Worker             res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1)));
2705*77c1e3ccSAndroid Build Coastguard Worker 
2706*77c1e3ccSAndroid Build Coastguard Worker         resy[0] =
2707*77c1e3ccSAndroid Build Coastguard Worker             _mm256_inserti128_si256(resy[0], _mm256_castsi256_si128(resy[1]),
2708*77c1e3ccSAndroid Build Coastguard Worker                                     1);  // 16 16bit values
2709*77c1e3ccSAndroid Build Coastguard Worker       }
2710*77c1e3ccSAndroid Build Coastguard Worker 
2711*77c1e3ccSAndroid Build Coastguard Worker       resxy = _mm256_blendv_epi8(resx[0], resy[0],
2712*77c1e3ccSAndroid Build Coastguard Worker                                  *(__m256i *)HighbdBaseMask[base_min_diff]);
2713*77c1e3ccSAndroid Build Coastguard Worker       _mm256_storeu_si256((__m256i *)(dst + j), resxy);
2714*77c1e3ccSAndroid Build Coastguard Worker     }  // for j
2715*77c1e3ccSAndroid Build Coastguard Worker     dst += stride;
2716*77c1e3ccSAndroid Build Coastguard Worker   }
2717*77c1e3ccSAndroid Build Coastguard Worker }
2718*77c1e3ccSAndroid Build Coastguard Worker 
highbd_dr_prediction_z2_HxW_avx2(int H,int W,uint16_t * dst,ptrdiff_t stride,const uint16_t * above,const uint16_t * left,int upsample_above,int upsample_left,int dx,int dy)2719*77c1e3ccSAndroid Build Coastguard Worker static void highbd_dr_prediction_z2_HxW_avx2(
2720*77c1e3ccSAndroid Build Coastguard Worker     int H, int W, uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
2721*77c1e3ccSAndroid Build Coastguard Worker     const uint16_t *left, int upsample_above, int upsample_left, int dx,
2722*77c1e3ccSAndroid Build Coastguard Worker     int dy) {
2723*77c1e3ccSAndroid Build Coastguard Worker   // here upsample_above and upsample_left are 0 by design of
2724*77c1e3ccSAndroid Build Coastguard Worker   // av1_use_intra_edge_upsample
2725*77c1e3ccSAndroid Build Coastguard Worker   const int min_base_x = -1;
2726*77c1e3ccSAndroid Build Coastguard Worker   const int min_base_y = -1;
2727*77c1e3ccSAndroid Build Coastguard Worker   (void)upsample_above;
2728*77c1e3ccSAndroid Build Coastguard Worker   (void)upsample_left;
2729*77c1e3ccSAndroid Build Coastguard Worker   const int frac_bits_x = 6;
2730*77c1e3ccSAndroid Build Coastguard Worker   const int frac_bits_y = 6;
2731*77c1e3ccSAndroid Build Coastguard Worker 
2732*77c1e3ccSAndroid Build Coastguard Worker   // pre-filter above pixels
2733*77c1e3ccSAndroid Build Coastguard Worker   // store in temp buffers:
2734*77c1e3ccSAndroid Build Coastguard Worker   //   above[x] * 32 + 16
2735*77c1e3ccSAndroid Build Coastguard Worker   //   above[x+1] - above[x]
2736*77c1e3ccSAndroid Build Coastguard Worker   // final pixels will be calculated as:
2737*77c1e3ccSAndroid Build Coastguard Worker   //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
2738*77c1e3ccSAndroid Build Coastguard Worker   __m256i a0_x, a1_x, a32, a16, c3f, c1;
2739*77c1e3ccSAndroid Build Coastguard Worker   __m256i diff, min_base_y256, dy256, c1234, c0123;
2740*77c1e3ccSAndroid Build Coastguard Worker   DECLARE_ALIGNED(32, int16_t, base_y_c[16]);
2741*77c1e3ccSAndroid Build Coastguard Worker 
2742*77c1e3ccSAndroid Build Coastguard Worker   a16 = _mm256_set1_epi16(16);
2743*77c1e3ccSAndroid Build Coastguard Worker   c1 = _mm256_srli_epi16(a16, 4);
2744*77c1e3ccSAndroid Build Coastguard Worker   min_base_y256 = _mm256_set1_epi16(min_base_y);
2745*77c1e3ccSAndroid Build Coastguard Worker   c3f = _mm256_set1_epi16(0x3f);
2746*77c1e3ccSAndroid Build Coastguard Worker   dy256 = _mm256_set1_epi16(dy);
2747*77c1e3ccSAndroid Build Coastguard Worker   c0123 =
2748*77c1e3ccSAndroid Build Coastguard Worker       _mm256_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
2749*77c1e3ccSAndroid Build Coastguard Worker   c1234 = _mm256_add_epi16(c0123, c1);
2750*77c1e3ccSAndroid Build Coastguard Worker 
2751*77c1e3ccSAndroid Build Coastguard Worker   for (int r = 0; r < H; r++) {
2752*77c1e3ccSAndroid Build Coastguard Worker     __m256i b, res, shift;
2753*77c1e3ccSAndroid Build Coastguard Worker     __m256i resx, resy, ydx;
2754*77c1e3ccSAndroid Build Coastguard Worker     __m256i resxy, j256, r6;
2755*77c1e3ccSAndroid Build Coastguard Worker     __m128i a0_x128, a1_x128, a0_1_x128, a1_1_x128;
2756*77c1e3ccSAndroid Build Coastguard Worker     int y = r + 1;
2757*77c1e3ccSAndroid Build Coastguard Worker     ydx = _mm256_set1_epi16((short)(y * dx));
2758*77c1e3ccSAndroid Build Coastguard Worker 
2759*77c1e3ccSAndroid Build Coastguard Worker     for (int j = 0; j < W; j += 16) {
2760*77c1e3ccSAndroid Build Coastguard Worker       j256 = _mm256_set1_epi16(j);
2761*77c1e3ccSAndroid Build Coastguard Worker       int base_x = ((j << 6) - y * dx) >> frac_bits_x;
2762*77c1e3ccSAndroid Build Coastguard Worker       int base_shift = 0;
2763*77c1e3ccSAndroid Build Coastguard Worker       if ((base_x) < (min_base_x - 1)) {
2764*77c1e3ccSAndroid Build Coastguard Worker         base_shift = (min_base_x - (base_x)-1);
2765*77c1e3ccSAndroid Build Coastguard Worker       }
2766*77c1e3ccSAndroid Build Coastguard Worker       int base_min_diff = (min_base_x - base_x);
2767*77c1e3ccSAndroid Build Coastguard Worker       if (base_min_diff > 16) {
2768*77c1e3ccSAndroid Build Coastguard Worker         base_min_diff = 16;
2769*77c1e3ccSAndroid Build Coastguard Worker       } else {
2770*77c1e3ccSAndroid Build Coastguard Worker         if (base_min_diff < 0) base_min_diff = 0;
2771*77c1e3ccSAndroid Build Coastguard Worker       }
2772*77c1e3ccSAndroid Build Coastguard Worker 
2773*77c1e3ccSAndroid Build Coastguard Worker       if (base_shift < 8) {
2774*77c1e3ccSAndroid Build Coastguard Worker         a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
2775*77c1e3ccSAndroid Build Coastguard Worker         a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 1));
2776*77c1e3ccSAndroid Build Coastguard Worker         a0_x128 =
2777*77c1e3ccSAndroid Build Coastguard Worker             _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
2778*77c1e3ccSAndroid Build Coastguard Worker         a1_x128 =
2779*77c1e3ccSAndroid Build Coastguard Worker             _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
2780*77c1e3ccSAndroid Build Coastguard Worker 
2781*77c1e3ccSAndroid Build Coastguard Worker         a0_x = _mm256_castsi128_si256(a0_x128);
2782*77c1e3ccSAndroid Build Coastguard Worker         a1_x = _mm256_castsi128_si256(a1_x128);
2783*77c1e3ccSAndroid Build Coastguard Worker       } else {
2784*77c1e3ccSAndroid Build Coastguard Worker         a0_x = _mm256_setzero_si256();
2785*77c1e3ccSAndroid Build Coastguard Worker         a1_x = _mm256_setzero_si256();
2786*77c1e3ccSAndroid Build Coastguard Worker       }
2787*77c1e3ccSAndroid Build Coastguard Worker 
2788*77c1e3ccSAndroid Build Coastguard Worker       int base_shift1 = 0;
2789*77c1e3ccSAndroid Build Coastguard Worker       if (base_shift > 8) {
2790*77c1e3ccSAndroid Build Coastguard Worker         base_shift1 = base_shift - 8;
2791*77c1e3ccSAndroid Build Coastguard Worker       }
2792*77c1e3ccSAndroid Build Coastguard Worker       if (base_shift1 < 8) {
2793*77c1e3ccSAndroid Build Coastguard Worker         a0_1_x128 =
2794*77c1e3ccSAndroid Build Coastguard Worker             _mm_loadu_si128((__m128i *)(above + base_x + base_shift1 + 8));
2795*77c1e3ccSAndroid Build Coastguard Worker         a1_1_x128 =
2796*77c1e3ccSAndroid Build Coastguard Worker             _mm_loadu_si128((__m128i *)(above + base_x + base_shift1 + 9));
2797*77c1e3ccSAndroid Build Coastguard Worker         a0_1_x128 = _mm_shuffle_epi8(a0_1_x128,
2798*77c1e3ccSAndroid Build Coastguard Worker                                      *(__m128i *)HighbdLoadMaskx[base_shift1]);
2799*77c1e3ccSAndroid Build Coastguard Worker         a1_1_x128 = _mm_shuffle_epi8(a1_1_x128,
2800*77c1e3ccSAndroid Build Coastguard Worker                                      *(__m128i *)HighbdLoadMaskx[base_shift1]);
2801*77c1e3ccSAndroid Build Coastguard Worker 
2802*77c1e3ccSAndroid Build Coastguard Worker         a0_x = _mm256_inserti128_si256(a0_x, a0_1_x128, 1);
2803*77c1e3ccSAndroid Build Coastguard Worker         a1_x = _mm256_inserti128_si256(a1_x, a1_1_x128, 1);
2804*77c1e3ccSAndroid Build Coastguard Worker       }
2805*77c1e3ccSAndroid Build Coastguard Worker       r6 = _mm256_slli_epi16(_mm256_add_epi16(c0123, j256), 6);
2806*77c1e3ccSAndroid Build Coastguard Worker       shift = _mm256_srli_epi16(
2807*77c1e3ccSAndroid Build Coastguard Worker           _mm256_and_si256(_mm256_sub_epi16(r6, ydx), c3f), 1);
2808*77c1e3ccSAndroid Build Coastguard Worker 
2809*77c1e3ccSAndroid Build Coastguard Worker       diff = _mm256_sub_epi16(a1_x, a0_x);  // a[x+1] - a[x]
2810*77c1e3ccSAndroid Build Coastguard Worker       a32 = _mm256_slli_epi16(a0_x, 5);     // a[x] * 32
2811*77c1e3ccSAndroid Build Coastguard Worker       a32 = _mm256_add_epi16(a32, a16);     // a[x] * 32 + 16
2812*77c1e3ccSAndroid Build Coastguard Worker 
2813*77c1e3ccSAndroid Build Coastguard Worker       b = _mm256_mullo_epi16(diff, shift);
2814*77c1e3ccSAndroid Build Coastguard Worker       res = _mm256_add_epi16(a32, b);
2815*77c1e3ccSAndroid Build Coastguard Worker       resx = _mm256_srli_epi16(res, 5);  // 16 16-bit values
2816*77c1e3ccSAndroid Build Coastguard Worker 
2817*77c1e3ccSAndroid Build Coastguard Worker       // y calc
2818*77c1e3ccSAndroid Build Coastguard Worker       resy = _mm256_setzero_si256();
2819*77c1e3ccSAndroid Build Coastguard Worker       __m256i a0_y, a1_y, shifty;
2820*77c1e3ccSAndroid Build Coastguard Worker       if ((base_x < min_base_x)) {
2821*77c1e3ccSAndroid Build Coastguard Worker         __m256i c256, y_c256, base_y_c256, mask256, mul16;
2822*77c1e3ccSAndroid Build Coastguard Worker         r6 = _mm256_set1_epi16(r << 6);
2823*77c1e3ccSAndroid Build Coastguard Worker         c256 = _mm256_add_epi16(j256, c1234);
2824*77c1e3ccSAndroid Build Coastguard Worker         mul16 = _mm256_min_epu16(_mm256_mullo_epi16(c256, dy256),
2825*77c1e3ccSAndroid Build Coastguard Worker                                  _mm256_srli_epi16(min_base_y256, 1));
2826*77c1e3ccSAndroid Build Coastguard Worker         y_c256 = _mm256_sub_epi16(r6, mul16);
2827*77c1e3ccSAndroid Build Coastguard Worker         base_y_c256 = _mm256_srai_epi16(y_c256, frac_bits_y);
2828*77c1e3ccSAndroid Build Coastguard Worker         mask256 = _mm256_cmpgt_epi16(min_base_y256, base_y_c256);
2829*77c1e3ccSAndroid Build Coastguard Worker         base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256);
2830*77c1e3ccSAndroid Build Coastguard Worker         _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
2831*77c1e3ccSAndroid Build Coastguard Worker 
2832*77c1e3ccSAndroid Build Coastguard Worker         a0_y = _mm256_setr_epi16(
2833*77c1e3ccSAndroid Build Coastguard Worker             left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
2834*77c1e3ccSAndroid Build Coastguard Worker             left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
2835*77c1e3ccSAndroid Build Coastguard Worker             left[base_y_c[6]], left[base_y_c[7]], left[base_y_c[8]],
2836*77c1e3ccSAndroid Build Coastguard Worker             left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]],
2837*77c1e3ccSAndroid Build Coastguard Worker             left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]],
2838*77c1e3ccSAndroid Build Coastguard Worker             left[base_y_c[15]]);
2839*77c1e3ccSAndroid Build Coastguard Worker         base_y_c256 = _mm256_add_epi16(base_y_c256, c1);
2840*77c1e3ccSAndroid Build Coastguard Worker         _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
2841*77c1e3ccSAndroid Build Coastguard Worker 
2842*77c1e3ccSAndroid Build Coastguard Worker         a1_y = _mm256_setr_epi16(
2843*77c1e3ccSAndroid Build Coastguard Worker             left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
2844*77c1e3ccSAndroid Build Coastguard Worker             left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
2845*77c1e3ccSAndroid Build Coastguard Worker             left[base_y_c[6]], left[base_y_c[7]], left[base_y_c[8]],
2846*77c1e3ccSAndroid Build Coastguard Worker             left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]],
2847*77c1e3ccSAndroid Build Coastguard Worker             left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]],
2848*77c1e3ccSAndroid Build Coastguard Worker             left[base_y_c[15]]);
2849*77c1e3ccSAndroid Build Coastguard Worker 
2850*77c1e3ccSAndroid Build Coastguard Worker         shifty = _mm256_srli_epi16(_mm256_and_si256(y_c256, c3f), 1);
2851*77c1e3ccSAndroid Build Coastguard Worker 
2852*77c1e3ccSAndroid Build Coastguard Worker         diff = _mm256_sub_epi16(a1_y, a0_y);  // a[x+1] - a[x]
2853*77c1e3ccSAndroid Build Coastguard Worker         a32 = _mm256_slli_epi16(a0_y, 5);     // a[x] * 32
2854*77c1e3ccSAndroid Build Coastguard Worker         a32 = _mm256_add_epi16(a32, a16);     // a[x] * 32 + 16
2855*77c1e3ccSAndroid Build Coastguard Worker 
2856*77c1e3ccSAndroid Build Coastguard Worker         b = _mm256_mullo_epi16(diff, shifty);
2857*77c1e3ccSAndroid Build Coastguard Worker         res = _mm256_add_epi16(a32, b);
2858*77c1e3ccSAndroid Build Coastguard Worker         resy = _mm256_srli_epi16(res, 5);
2859*77c1e3ccSAndroid Build Coastguard Worker       }
2860*77c1e3ccSAndroid Build Coastguard Worker 
2861*77c1e3ccSAndroid Build Coastguard Worker       resxy = _mm256_blendv_epi8(resx, resy,
2862*77c1e3ccSAndroid Build Coastguard Worker                                  *(__m256i *)HighbdBaseMask[base_min_diff]);
2863*77c1e3ccSAndroid Build Coastguard Worker       _mm256_storeu_si256((__m256i *)(dst + j), resxy);
2864*77c1e3ccSAndroid Build Coastguard Worker     }  // for j
2865*77c1e3ccSAndroid Build Coastguard Worker     dst += stride;
2866*77c1e3ccSAndroid Build Coastguard Worker   }
2867*77c1e3ccSAndroid Build Coastguard Worker }
2868*77c1e3ccSAndroid Build Coastguard Worker 
// Directional prediction, zone 2: 90 < angle < 180
//
// Dispatches on block width (4, 8, or anything else) and on bit depth:
// bd < 12 selects the plain kernel, otherwise the matching `32bit` variant is
// called. All remaining arguments are forwarded unchanged.
void av1_highbd_dr_prediction_z2_avx2(uint16_t *dst, ptrdiff_t stride, int bw,
                                      int bh, const uint16_t *above,
                                      const uint16_t *left, int upsample_above,
                                      int upsample_left, int dx, int dy,
                                      int bd) {
  assert(dx > 0);
  assert(dy > 0);

  const int use_32bit = !(bd < 12);

  if (bw == 4) {
    if (use_32bit) {
      highbd_dr_prediction_32bit_z2_Nx4_avx2(bh, dst, stride, above, left,
                                             upsample_above, upsample_left, dx,
                                             dy);
    } else {
      highbd_dr_prediction_z2_Nx4_avx2(bh, dst, stride, above, left,
                                       upsample_above, upsample_left, dx, dy);
    }
  } else if (bw == 8) {
    if (use_32bit) {
      highbd_dr_prediction_32bit_z2_Nx8_avx2(bh, dst, stride, above, left,
                                             upsample_above, upsample_left, dx,
                                             dy);
    } else {
      highbd_dr_prediction_z2_Nx8_avx2(bh, dst, stride, above, left,
                                       upsample_above, upsample_left, dx, dy);
    }
  } else {
    // Every other width goes through the generic H x W kernels.
    if (use_32bit) {
      highbd_dr_prediction_32bit_z2_HxW_avx2(bh, bw, dst, stride, above, left,
                                             upsample_above, upsample_left, dx,
                                             dy);
    } else {
      highbd_dr_prediction_z2_HxW_avx2(bh, bw, dst, stride, above, left,
                                       upsample_above, upsample_left, dx, dy);
    }
  }
}
2911*77c1e3ccSAndroid Build Coastguard Worker 
2912*77c1e3ccSAndroid Build Coastguard Worker //  Directional prediction, zone 3 functions
// 4x4 zone-3 block. The 4xN zone-1 kernel (N = 4) is run on `left`, its four
// output vectors are transposed, and the low 64 bits of each transposed
// vector are stored as one dst row. bd < 12 selects the plain kernel,
// otherwise the `32bit` variant is used.
static void highbd_dr_prediction_z3_4x4_avx2(uint16_t *dst, ptrdiff_t stride,
                                             const uint16_t *left,
                                             int upsample_left, int dy,
                                             int bd) {
  __m128i cols[4], rows[4];

  if (bd >= 12) {
    highbd_dr_prediction_32bit_z1_4xN_internal_avx2(4, cols, left,
                                                    upsample_left, dy);
  } else {
    highbd_dr_prediction_z1_4xN_internal_avx2(4, cols, left, upsample_left,
                                              dy);
  }

  highbd_transpose4x8_8x4_low_sse2(&cols[0], &cols[1], &cols[2], &cols[3],
                                   &rows[0], &rows[1], &rows[2], &rows[3]);

  for (int i = 0; i < 4; i++) {
    _mm_storel_epi64((__m128i *)(dst + i * stride), rows[i]);
  }
}
2933*77c1e3ccSAndroid Build Coastguard Worker 
// 8x8 zone-3 block. The 8xN zone-1 kernel (N = 8) is run on `left`, its eight
// output vectors are transposed, and each transposed vector is stored as one
// full 128-bit dst row. bd < 12 selects the plain kernel, otherwise the
// `32bit` variant is used.
static void highbd_dr_prediction_z3_8x8_avx2(uint16_t *dst, ptrdiff_t stride,
                                             const uint16_t *left,
                                             int upsample_left, int dy,
                                             int bd) {
  __m128i cols[8], rows[8];

  if (bd >= 12) {
    highbd_dr_prediction_32bit_z1_8xN_internal_avx2(8, cols, left,
                                                    upsample_left, dy);
  } else {
    highbd_dr_prediction_z1_8xN_internal_avx2(8, cols, left, upsample_left,
                                              dy);
  }

  highbd_transpose8x8_sse2(&cols[0], &cols[1], &cols[2], &cols[3], &cols[4],
                           &cols[5], &cols[6], &cols[7], &rows[0], &rows[1],
                           &rows[2], &rows[3], &rows[4], &rows[5], &rows[6],
                           &rows[7]);

  uint16_t *out = dst;
  for (int i = 0; i < 8; i++, out += stride) {
    _mm_storeu_si128((__m128i *)out, rows[i]);
  }
}
2954*77c1e3ccSAndroid Build Coastguard Worker 
// 4x8 zone-3 block (eight dst rows of four pixels). The 8xN zone-1 kernel is
// run on `left` with N = 4, the four output vectors are transposed into eight,
// and the low 64 bits of each are stored as one dst row. bd < 12 selects the
// plain kernel, otherwise the `32bit` variant is used.
static void highbd_dr_prediction_z3_4x8_avx2(uint16_t *dst, ptrdiff_t stride,
                                             const uint16_t *left,
                                             int upsample_left, int dy,
                                             int bd) {
  __m128i cols[4], rows[8];

  if (bd >= 12) {
    highbd_dr_prediction_32bit_z1_8xN_internal_avx2(4, cols, left,
                                                    upsample_left, dy);
  } else {
    highbd_dr_prediction_z1_8xN_internal_avx2(4, cols, left, upsample_left,
                                              dy);
  }

  highbd_transpose4x8_8x4_sse2(&cols[0], &cols[1], &cols[2], &cols[3],
                               &rows[0], &rows[1], &rows[2], &rows[3],
                               &rows[4], &rows[5], &rows[6], &rows[7]);

  uint16_t *out = dst;
  for (int i = 0; i < 8; i++, out += stride) {
    _mm_storel_epi64((__m128i *)out, rows[i]);
  }
}
2975*77c1e3ccSAndroid Build Coastguard Worker 
// Zone-3 high-bitdepth directional prediction, 8 samples wide by 4 rows.
// The z1 helper is run over `left` to fill eight vectors; the transpose
// produces four 128-bit row vectors (eight uint16_t samples each), which
// are written to consecutive rows of `dst`.
//   dst/stride    - output block and its row pitch, in uint16_t units.
//   left          - edge samples forwarded to the z1 helper.
//   upsample_left - forwarded unchanged to the z1 helper.
//   dy            - forwarded unchanged to the z1 helper.
//   bd            - bit depth; only selects which z1 helper variant runs.
static void highbd_dr_prediction_z3_8x4_avx2(uint16_t *dst, ptrdiff_t stride,
                                             const uint16_t *left,
                                             int upsample_left, int dy,
                                             int bd) {
  __m128i cols[8];
  __m128i rows[4];

  // bd >= 12 uses the "_32bit_" helper variant, everything else the default.
  if (bd >= 12) {
    highbd_dr_prediction_32bit_z1_4xN_internal_avx2(8, cols, left,
                                                    upsample_left, dy);
  } else {
    highbd_dr_prediction_z1_4xN_internal_avx2(8, cols, left, upsample_left,
                                              dy);
  }

  highbd_transpose8x8_low_sse2(&cols[0], &cols[1], &cols[2], &cols[3],
                               &cols[4], &cols[5], &cols[6], &cols[7],
                               &rows[0], &rows[1], &rows[2], &rows[3]);

  for (int r = 0; r < 4; ++r) {
    _mm_storeu_si128((__m128i *)(dst + r * stride), rows[r]);
  }
}
2997*77c1e3ccSAndroid Build Coastguard Worker 
// Zone-3 high-bitdepth directional prediction, 8 samples wide by 16 rows.
// The z1 helper is run over `left` to fill eight 256-bit vectors. After the
// transpose, each 256-bit vector carries two output rows: its low 128-bit
// lane is row i and its high lane is row i + 8.
//   dst/stride    - output block and its row pitch, in uint16_t units.
//   left          - edge samples forwarded to the z1 helper.
//   upsample_left - forwarded unchanged to the z1 helper.
//   dy            - forwarded unchanged to the z1 helper.
//   bd            - bit depth; only selects which z1 helper variant runs.
static void highbd_dr_prediction_z3_8x16_avx2(uint16_t *dst, ptrdiff_t stride,
                                              const uint16_t *left,
                                              int upsample_left, int dy,
                                              int bd) {
  __m256i cols[8];
  __m256i packed[8];

  // bd >= 12 uses the "_32bit_" helper variant, everything else the default.
  if (bd >= 12) {
    highbd_dr_prediction_32bit_z1_16xN_internal_avx2(8, cols, left,
                                                     upsample_left, dy);
  } else {
    highbd_dr_prediction_z1_16xN_internal_avx2(8, cols, left, upsample_left,
                                               dy);
  }

  highbd_transpose8x16_16x8_avx2(cols, packed);

  // Rows 0..7: low lanes.
  for (int r = 0; r < 8; ++r) {
    const __m128i lo = _mm256_castsi256_si128(packed[r]);
    _mm_storeu_si128((__m128i *)(dst + r * stride), lo);
  }
  // Rows 8..15: high lanes of the same vectors.
  for (int r = 0; r < 8; ++r) {
    const __m128i hi = _mm256_extracti128_si256(packed[r], 1);
    _mm_storeu_si128((__m128i *)(dst + (r + 8) * stride), hi);
  }
}
3020*77c1e3ccSAndroid Build Coastguard Worker 
// Zone-3 high-bitdepth directional prediction, 16 samples wide by 8 rows.
// The z1 helper is run over `left` to fill sixteen 128-bit vectors. They are
// transposed as two independent 8x8 tiles; tile 0 supplies samples 0..7 of
// every output row and tile 1 supplies samples 8..15.
//   dst/stride    - output block and its row pitch, in uint16_t units.
//   left          - edge samples forwarded to the z1 helper.
//   upsample_left - forwarded unchanged to the z1 helper.
//   dy            - forwarded unchanged to the z1 helper.
//   bd            - bit depth; only selects which z1 helper variant runs.
static void highbd_dr_prediction_z3_16x8_avx2(uint16_t *dst, ptrdiff_t stride,
                                              const uint16_t *left,
                                              int upsample_left, int dy,
                                              int bd) {
  __m128i cols[16];
  __m128i tiles[16];

  // bd >= 12 uses the "_32bit_" helper variant, everything else the default.
  if (bd >= 12) {
    highbd_dr_prediction_32bit_z1_8xN_internal_avx2(16, cols, left,
                                                    upsample_left, dy);
  } else {
    highbd_dr_prediction_z1_8xN_internal_avx2(16, cols, left, upsample_left,
                                              dy);
  }

  for (int t = 0; t < 2; ++t) {
    __m128i *const in = cols + 8 * t;
    __m128i *const out = tiles + 8 * t;
    highbd_transpose8x8_sse2(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5],
                             &in[6], &in[7], &out[0], &out[1], &out[2],
                             &out[3], &out[4], &out[5], &out[6], &out[7]);
  }

  for (int r = 0; r < 8; ++r) {
    uint16_t *const row = dst + r * stride;
    _mm_storeu_si128((__m128i *)row, tiles[r]);
    _mm_storeu_si128((__m128i *)(row + 8), tiles[r + 8]);
  }
}
3045*77c1e3ccSAndroid Build Coastguard Worker 
3046*77c1e3ccSAndroid Build Coastguard Worker #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
// Zone-3 high-bitdepth directional prediction, 4 samples wide by 16 rows.
// The z1 helper is run over `left` to fill four 256-bit vectors. After the
// transpose, each 256-bit vector holds four output rows of four samples:
// one in each 64-bit quarter. Quarter q of vector i goes to row i + 4 * q.
//   dst/stride    - output block and its row pitch, in uint16_t units.
//   left          - edge samples forwarded to the z1 helper.
//   upsample_left - forwarded unchanged to the z1 helper.
//   dy            - forwarded unchanged to the z1 helper.
//   bd            - bit depth; only selects which z1 helper variant runs.
static void highbd_dr_prediction_z3_4x16_avx2(uint16_t *dst, ptrdiff_t stride,
                                              const uint16_t *left,
                                              int upsample_left, int dy,
                                              int bd) {
  __m256i cols[4];
  __m256i packed[4];

  // bd >= 12 uses the "_32bit_" helper variant, everything else the default.
  if (bd >= 12) {
    highbd_dr_prediction_32bit_z1_16xN_internal_avx2(4, cols, left,
                                                     upsample_left, dy);
  } else {
    highbd_dr_prediction_z1_16xN_internal_avx2(4, cols, left, upsample_left,
                                               dy);
  }

  highbd_transpose4x16_avx2(cols, packed);

  for (int r = 0; r < 4; ++r) {
    const __m256i v = packed[r];
    // Per-lane byte shift: moves the upper 64 bits of each 128-bit lane down
    // so a low-64-bit store can reach them.
    const __m256i v_hi64 = _mm256_bsrli_epi128(v, 8);

    _mm_storel_epi64((__m128i *)(dst + r * stride),
                     _mm256_castsi256_si128(v));
    _mm_storel_epi64((__m128i *)(dst + (r + 4) * stride),
                     _mm256_castsi256_si128(v_hi64));
    _mm_storel_epi64((__m128i *)(dst + (r + 8) * stride),
                     _mm256_extracti128_si256(v, 1));
    _mm_storel_epi64((__m128i *)(dst + (r + 12) * stride),
                     _mm256_extracti128_si256(v_hi64, 1));
  }
}
3072*77c1e3ccSAndroid Build Coastguard Worker 
// Zone-3 high-bitdepth directional prediction, 16 samples wide by 4 rows.
// The z1 helper is run over `left` to fill sixteen 128-bit vectors; the
// transpose yields eight 128-bit vectors laid out as consecutive row halves:
// vector 2r is samples 0..7 of row r and vector 2r + 1 is samples 8..15.
//   dst/stride    - output block and its row pitch, in uint16_t units.
//   left          - edge samples forwarded to the z1 helper.
//   upsample_left - forwarded unchanged to the z1 helper.
//   dy            - forwarded unchanged to the z1 helper.
//   bd            - bit depth; only selects which z1 helper variant runs.
static void highbd_dr_prediction_z3_16x4_avx2(uint16_t *dst, ptrdiff_t stride,
                                              const uint16_t *left,
                                              int upsample_left, int dy,
                                              int bd) {
  __m128i cols[16];
  __m128i halves[8];

  // bd >= 12 uses the "_32bit_" helper variant, everything else the default.
  if (bd >= 12) {
    highbd_dr_prediction_32bit_z1_4xN_internal_avx2(16, cols, left,
                                                    upsample_left, dy);
  } else {
    highbd_dr_prediction_z1_4xN_internal_avx2(16, cols, left, upsample_left,
                                              dy);
  }

  highbd_transpose16x4_8x8_sse2(cols, halves);

  for (int r = 0; r < 4; ++r) {
    _mm_storeu_si128((__m128i *)(dst + r * stride), halves[2 * r]);
    _mm_storeu_si128((__m128i *)(dst + r * stride + 8), halves[2 * r + 1]);
  }
}
3096*77c1e3ccSAndroid Build Coastguard Worker 
// Zone-3 high-bitdepth directional prediction, 8 samples wide by 32 rows.
// The z1 helper is run over `left` to fill sixteen 256-bit vectors, which are
// transposed as two groups of eight. Within a group, the low 128-bit lane of
// vector i is one output row and the high lane is the row eight further down;
// group 0 covers rows 0..15 and group 1 covers rows 16..31.
//   dst/stride    - output block and its row pitch, in uint16_t units.
//   left          - edge samples forwarded to the z1 helper.
//   upsample_left - forwarded unchanged to the z1 helper.
//   dy            - forwarded unchanged to the z1 helper.
//   bd            - bit depth; only selects which z1 helper variant runs.
static void highbd_dr_prediction_z3_8x32_avx2(uint16_t *dst, ptrdiff_t stride,
                                              const uint16_t *left,
                                              int upsample_left, int dy,
                                              int bd) {
  __m256i cols[16];
  __m256i packed[16];

  // bd >= 12 uses the "_32bit_" helper variant, everything else the default.
  if (bd >= 12) {
    highbd_dr_prediction_32bit_z1_32xN_internal_avx2(8, cols, left,
                                                     upsample_left, dy);
  } else {
    highbd_dr_prediction_z1_32xN_internal_avx2(8, cols, left, upsample_left,
                                               dy);
  }

  highbd_transpose8x16_16x8_avx2(cols, packed);
  highbd_transpose8x16_16x8_avx2(cols + 8, packed + 8);

  // Rows 0..7: low lanes of group 0.
  for (int r = 0; r < 8; ++r) {
    _mm_storeu_si128((__m128i *)(dst + r * stride),
                     _mm256_castsi256_si128(packed[r]));
  }
  // Rows 8..15: high lanes of group 0.
  for (int r = 0; r < 8; ++r) {
    _mm_storeu_si128((__m128i *)(dst + (r + 8) * stride),
                     _mm256_extracti128_si256(packed[r], 1));
  }
  // Rows 16..23: low lanes of group 1.
  for (int r = 0; r < 8; ++r) {
    _mm_storeu_si128((__m128i *)(dst + (r + 16) * stride),
                     _mm256_castsi256_si128(packed[r + 8]));
  }
  // Rows 24..31: high lanes of group 1.
  for (int r = 0; r < 8; ++r) {
    _mm_storeu_si128((__m128i *)(dst + (r + 24) * stride),
                     _mm256_extracti128_si256(packed[r + 8], 1));
  }
}
3131*77c1e3ccSAndroid Build Coastguard Worker 
// Zone-3 high-bitdepth directional prediction, 32 samples wide by 8 rows.
// The z1 helper is run over `left` to fill thirty-two 128-bit vectors, which
// are transposed as four independent 8x8 tiles. Tile t supplies samples
// 8t..8t+7 of every output row.
//   dst/stride    - output block and its row pitch, in uint16_t units.
//   left          - edge samples forwarded to the z1 helper.
//   upsample_left - forwarded unchanged to the z1 helper.
//   dy            - forwarded unchanged to the z1 helper.
//   bd            - bit depth; only selects which z1 helper variant runs.
static void highbd_dr_prediction_z3_32x8_avx2(uint16_t *dst, ptrdiff_t stride,
                                              const uint16_t *left,
                                              int upsample_left, int dy,
                                              int bd) {
  __m128i cols[32];
  __m128i tiles[32];

  // bd >= 12 uses the "_32bit_" helper variant, everything else the default.
  if (bd >= 12) {
    highbd_dr_prediction_32bit_z1_8xN_internal_avx2(32, cols, left,
                                                    upsample_left, dy);
  } else {
    highbd_dr_prediction_z1_8xN_internal_avx2(32, cols, left, upsample_left,
                                              dy);
  }

  for (int t = 0; t < 4; ++t) {
    __m128i *const in = cols + 8 * t;
    __m128i *const out = tiles + 8 * t;
    highbd_transpose8x8_sse2(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5],
                             &in[6], &in[7], &out[0], &out[1], &out[2],
                             &out[3], &out[4], &out[5], &out[6], &out[7]);
  }

  for (int r = 0; r < 8; ++r) {
    for (int t = 0; t < 4; ++t) {
      _mm_storeu_si128((__m128i *)(dst + r * stride + 8 * t),
                       tiles[r + 8 * t]);
    }
  }
}
3159*77c1e3ccSAndroid Build Coastguard Worker #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
3160*77c1e3ccSAndroid Build Coastguard Worker 
// Zone-3 high-bitdepth directional prediction for a 16x16 block.
// The z1 helper is run over `left` to fill sixteen 256-bit vectors; a 16x16
// transpose turns them into sixteen output rows of sixteen uint16_t samples,
// each written with one unaligned 256-bit store.
//   dst/stride    - output block and its row pitch, in uint16_t units.
//   left          - edge samples forwarded to the z1 helper.
//   upsample_left - forwarded unchanged to the z1 helper.
//   dy            - forwarded unchanged to the z1 helper.
//   bd            - bit depth; only selects which z1 helper variant runs.
static void highbd_dr_prediction_z3_16x16_avx2(uint16_t *dst, ptrdiff_t stride,
                                               const uint16_t *left,
                                               int upsample_left, int dy,
                                               int bd) {
  __m256i cols[16];
  __m256i rows[16];

  // bd >= 12 uses the "_32bit_" helper variant, everything else the default.
  if (bd >= 12) {
    highbd_dr_prediction_32bit_z1_16xN_internal_avx2(16, cols, left,
                                                     upsample_left, dy);
  } else {
    highbd_dr_prediction_z1_16xN_internal_avx2(16, cols, left, upsample_left,
                                               dy);
  }

  highbd_transpose16x16_avx2(cols, rows);

  for (int r = 0; r < 16; ++r) {
    _mm256_storeu_si256((__m256i *)(dst + r * stride), rows[r]);
  }
}
3180*77c1e3ccSAndroid Build Coastguard Worker 
// Zone-3 high-bitdepth directional prediction for a 32x32 block.
// The z1 helper is run over `left` to fill sixty-four 256-bit vectors, which
// are consumed as four groups of sixteen. Each group is transposed as a
// 16x16 tile and written to one quadrant of `dst`, in this order:
//   group 0 -> rows 0..15,  columns 0..15
//   group 1 -> rows 0..15,  columns 16..31
//   group 2 -> rows 16..31, columns 0..15
//   group 3 -> rows 16..31, columns 16..31
// Parameters:
//   dst/stride    - output block and its row pitch, in uint16_t units.
//   left          - edge samples forwarded to the z1 helper.
//   upsample_left - forwarded unchanged to the z1 helper.
//   dy            - forwarded unchanged to the z1 helper.
//   bd            - bit depth; only selects which z1 helper variant runs.
static void highbd_dr_prediction_z3_32x32_avx2(uint16_t *dst, ptrdiff_t stride,
                                               const uint16_t *left,
                                               int upsample_left, int dy,
                                               int bd) {
  __m256i cols[64];
  __m256i tile[16];  // Reused for every quadrant.

  // bd >= 12 uses the "_32bit_" helper variant, everything else the default.
  if (bd >= 12) {
    highbd_dr_prediction_32bit_z1_32xN_internal_avx2(32, cols, left,
                                                     upsample_left, dy);
  } else {
    highbd_dr_prediction_z1_32xN_internal_avx2(32, cols, left, upsample_left,
                                               dy);
  }

  for (int q = 0; q < 4; ++q) {
    const int row_off = (q >> 1) * 16;
    const int col_off = (q & 1) * 16;

    highbd_transpose16x16_avx2(cols + 16 * q, tile);
    for (int r = 0; r < 16; ++r) {
      _mm256_storeu_si256(
          (__m256i *)(dst + (r + row_off) * stride + col_off), tile[r]);
    }
  }
}
3210*77c1e3ccSAndroid Build Coastguard Worker 
highbd_dr_prediction_z3_64x64_avx2(uint16_t * dst,ptrdiff_t stride,const uint16_t * left,int upsample_left,int dy,int bd)3211*77c1e3ccSAndroid Build Coastguard Worker static void highbd_dr_prediction_z3_64x64_avx2(uint16_t *dst, ptrdiff_t stride,
3212*77c1e3ccSAndroid Build Coastguard Worker                                                const uint16_t *left,
3213*77c1e3ccSAndroid Build Coastguard Worker                                                int upsample_left, int dy,
3214*77c1e3ccSAndroid Build Coastguard Worker                                                int bd) {
3215*77c1e3ccSAndroid Build Coastguard Worker   DECLARE_ALIGNED(16, uint16_t, dstT[64 * 64]);
3216*77c1e3ccSAndroid Build Coastguard Worker   if (bd < 12) {
3217*77c1e3ccSAndroid Build Coastguard Worker     highbd_dr_prediction_z1_64xN_avx2(64, dstT, 64, left, upsample_left, dy);
3218*77c1e3ccSAndroid Build Coastguard Worker   } else {
3219*77c1e3ccSAndroid Build Coastguard Worker     highbd_dr_prediction_32bit_z1_64xN_avx2(64, dstT, 64, left, upsample_left,
3220*77c1e3ccSAndroid Build Coastguard Worker                                             dy);
3221*77c1e3ccSAndroid Build Coastguard Worker   }
3222*77c1e3ccSAndroid Build Coastguard Worker   highbd_transpose(dstT, 64, dst, stride, 64, 64);
3223*77c1e3ccSAndroid Build Coastguard Worker }
3224*77c1e3ccSAndroid Build Coastguard Worker 
// Zone-3 high-bitdepth directional prediction, 16 samples wide by 32 rows.
// The z1 helper is run over `left` to fill thirty-two 256-bit vectors, which
// are transposed in four groups of eight. Output is produced in two passes
// of sixteen rows (base = 0, then base = 16). Within a pass:
//   rows base..base+7     take the low lanes of vectors base+r (samples
//                         0..7) and base+r+8 (samples 8..15);
//   rows base+8..base+15  take the high lanes of the same two vectors,
//                         merged into one 256-bit store.
// Parameters:
//   dst/stride    - output block and its row pitch, in uint16_t units.
//   left          - edge samples forwarded to the z1 helper.
//   upsample_left - forwarded unchanged to the z1 helper.
//   dy            - forwarded unchanged to the z1 helper.
//   bd            - bit depth; only selects which z1 helper variant runs.
static void highbd_dr_prediction_z3_16x32_avx2(uint16_t *dst, ptrdiff_t stride,
                                               const uint16_t *left,
                                               int upsample_left, int dy,
                                               int bd) {
  __m256i cols[32];
  __m256i packed[32];

  // bd >= 12 uses the "_32bit_" helper variant, everything else the default.
  if (bd >= 12) {
    highbd_dr_prediction_32bit_z1_32xN_internal_avx2(16, cols, left,
                                                     upsample_left, dy);
  } else {
    highbd_dr_prediction_z1_32xN_internal_avx2(16, cols, left, upsample_left,
                                               dy);
  }

  for (int g = 0; g < 4; ++g) {
    highbd_transpose8x16_16x8_avx2(cols + 8 * g, packed + 8 * g);
  }

  for (int base = 0; base < 32; base += 16) {
    // Left half (samples 0..7) of the first eight rows of this pass.
    for (int r = 0; r < 8; ++r) {
      _mm_storeu_si128((__m128i *)(dst + (base + r) * stride),
                       _mm256_castsi256_si128(packed[base + r]));
    }
    // Right half (samples 8..15) of the same eight rows.
    for (int r = 0; r < 8; ++r) {
      _mm_storeu_si128((__m128i *)(dst + (base + r) * stride + 8),
                       _mm256_castsi256_si128(packed[base + r + 8]));
    }
    // Last eight rows of this pass: keep the high lane of packed[row] in
    // place and drop the high lane of packed[row - 8] into the low lane.
    for (int r = 0; r < 8; ++r) {
      const int row = base + r + 8;
      const __m128i lo_lane = _mm256_extracti128_si256(packed[row - 8], 1);
      const __m256i merged = _mm256_inserti128_si256(packed[row], lo_lane, 0);
      _mm256_storeu_si256((__m256i *)(dst + row * stride), merged);
    }
  }
}
3258*77c1e3ccSAndroid Build Coastguard Worker 
// Zone-3 high-bitdepth directional prediction, 32 samples wide by 16 rows.
// The z1 helper is run over `left` to fill thirty-two 256-bit vectors. They
// are handled as two groups of sixteen: each group is transposed as a 16x16
// tile and written to rows 0..15 of `dst`, group 0 at column 0 and group 1
// at column 16.
//   dst/stride    - output block and its row pitch, in uint16_t units.
//   left          - edge samples forwarded to the z1 helper.
//   upsample_left - forwarded unchanged to the z1 helper.
//   dy            - forwarded unchanged to the z1 helper.
//   bd            - bit depth; only selects which z1 helper variant runs.
static void highbd_dr_prediction_z3_32x16_avx2(uint16_t *dst, ptrdiff_t stride,
                                               const uint16_t *left,
                                               int upsample_left, int dy,
                                               int bd) {
  __m256i cols[32];
  __m256i tile[16];  // Reused for both halves.

  // bd >= 12 uses the "_32bit_" helper variant, everything else the default.
  if (bd >= 12) {
    highbd_dr_prediction_32bit_z1_16xN_internal_avx2(32, cols, left,
                                                     upsample_left, dy);
  } else {
    highbd_dr_prediction_z1_16xN_internal_avx2(32, cols, left, upsample_left,
                                               dy);
  }

  for (int col_off = 0; col_off < 32; col_off += 16) {
    highbd_transpose16x16_avx2(cols + col_off, tile);
    for (int r = 0; r < 16; ++r) {
      _mm256_storeu_si256((__m256i *)(dst + r * stride + col_off), tile[r]);
    }
  }
}
3278*77c1e3ccSAndroid Build Coastguard Worker 
highbd_dr_prediction_z3_32x64_avx2(uint16_t * dst,ptrdiff_t stride,const uint16_t * left,int upsample_left,int dy,int bd)3279*77c1e3ccSAndroid Build Coastguard Worker static void highbd_dr_prediction_z3_32x64_avx2(uint16_t *dst, ptrdiff_t stride,
3280*77c1e3ccSAndroid Build Coastguard Worker                                                const uint16_t *left,
3281*77c1e3ccSAndroid Build Coastguard Worker                                                int upsample_left, int dy,
3282*77c1e3ccSAndroid Build Coastguard Worker                                                int bd) {
3283*77c1e3ccSAndroid Build Coastguard Worker   uint16_t dstT[64 * 32];
3284*77c1e3ccSAndroid Build Coastguard Worker   if (bd < 12) {
3285*77c1e3ccSAndroid Build Coastguard Worker     highbd_dr_prediction_z1_64xN_avx2(32, dstT, 64, left, upsample_left, dy);
3286*77c1e3ccSAndroid Build Coastguard Worker   } else {
3287*77c1e3ccSAndroid Build Coastguard Worker     highbd_dr_prediction_32bit_z1_64xN_avx2(32, dstT, 64, left, upsample_left,
3288*77c1e3ccSAndroid Build Coastguard Worker                                             dy);
3289*77c1e3ccSAndroid Build Coastguard Worker   }
3290*77c1e3ccSAndroid Build Coastguard Worker   highbd_transpose(dstT, 64, dst, stride, 32, 64);
3291*77c1e3ccSAndroid Build Coastguard Worker }
3292*77c1e3ccSAndroid Build Coastguard Worker 
highbd_dr_prediction_z3_64x32_avx2(uint16_t * dst,ptrdiff_t stride,const uint16_t * left,int upsample_left,int dy,int bd)3293*77c1e3ccSAndroid Build Coastguard Worker static void highbd_dr_prediction_z3_64x32_avx2(uint16_t *dst, ptrdiff_t stride,
3294*77c1e3ccSAndroid Build Coastguard Worker                                                const uint16_t *left,
3295*77c1e3ccSAndroid Build Coastguard Worker                                                int upsample_left, int dy,
3296*77c1e3ccSAndroid Build Coastguard Worker                                                int bd) {
3297*77c1e3ccSAndroid Build Coastguard Worker   DECLARE_ALIGNED(16, uint16_t, dstT[32 * 64]);
3298*77c1e3ccSAndroid Build Coastguard Worker   highbd_dr_prediction_z1_32xN_avx2(64, dstT, 32, left, upsample_left, dy, bd);
3299*77c1e3ccSAndroid Build Coastguard Worker   highbd_transpose(dstT, 32, dst, stride, 64, 32);
3300*77c1e3ccSAndroid Build Coastguard Worker   return;
3301*77c1e3ccSAndroid Build Coastguard Worker }
3302*77c1e3ccSAndroid Build Coastguard Worker 
3303*77c1e3ccSAndroid Build Coastguard Worker #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
highbd_dr_prediction_z3_16x64_avx2(uint16_t * dst,ptrdiff_t stride,const uint16_t * left,int upsample_left,int dy,int bd)3304*77c1e3ccSAndroid Build Coastguard Worker static void highbd_dr_prediction_z3_16x64_avx2(uint16_t *dst, ptrdiff_t stride,
3305*77c1e3ccSAndroid Build Coastguard Worker                                                const uint16_t *left,
3306*77c1e3ccSAndroid Build Coastguard Worker                                                int upsample_left, int dy,
3307*77c1e3ccSAndroid Build Coastguard Worker                                                int bd) {
3308*77c1e3ccSAndroid Build Coastguard Worker   DECLARE_ALIGNED(16, uint16_t, dstT[64 * 16]);
3309*77c1e3ccSAndroid Build Coastguard Worker   if (bd < 12) {
3310*77c1e3ccSAndroid Build Coastguard Worker     highbd_dr_prediction_z1_64xN_avx2(16, dstT, 64, left, upsample_left, dy);
3311*77c1e3ccSAndroid Build Coastguard Worker   } else {
3312*77c1e3ccSAndroid Build Coastguard Worker     highbd_dr_prediction_32bit_z1_64xN_avx2(16, dstT, 64, left, upsample_left,
3313*77c1e3ccSAndroid Build Coastguard Worker                                             dy);
3314*77c1e3ccSAndroid Build Coastguard Worker   }
3315*77c1e3ccSAndroid Build Coastguard Worker   highbd_transpose(dstT, 64, dst, stride, 16, 64);
3316*77c1e3ccSAndroid Build Coastguard Worker }
3317*77c1e3ccSAndroid Build Coastguard Worker 
// Zone-3 directional prediction (high bit depth) for a 64-wide, 16-tall block.
//
// The 16-wide zone-1 kernel fills 64 vectors from the `left` edge (the
// `_32bit_` variant when bd >= 12). Those vectors are then transposed in four
// 16x16 tiles; tile t is stored to destination columns [16 * t, 16 * t + 16).
static void highbd_dr_prediction_z3_64x16_avx2(uint16_t *dst, ptrdiff_t stride,
                                               const uint16_t *left,
                                               int upsample_left, int dy,
                                               int bd) {
  __m256i pred[64];
  __m256i tile[16];

  if (bd >= 12) {
    highbd_dr_prediction_32bit_z1_16xN_internal_avx2(64, pred, left,
                                                     upsample_left, dy);
  } else {
    highbd_dr_prediction_z1_16xN_internal_avx2(64, pred, left, upsample_left,
                                               dy);
  }

  for (int t = 0; t < 4; ++t) {
    const int col = 16 * t;
    highbd_transpose16x16_avx2(pred + col, tile);
    for (int row = 0; row < 16; ++row) {
      _mm256_storeu_si256((__m256i *)(dst + row * stride + col), tile[row]);
    }
  }
}
3337*77c1e3ccSAndroid Build Coastguard Worker #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
3338*77c1e3ccSAndroid Build Coastguard Worker 
// Entry point for high bit-depth zone-3 directional prediction.
//
// Picks the size-specialised helper from the block shape (bw x bh) and
// forwards dst, stride, left, upsample_left, dy and bd to it. `above` and
// `dx` are not used for prediction; dx is only checked by an assert.
//
// Shapes are grouped as: square, tall (bw < bh) and wide (bw > bh). Within the
// tall and wide groups the 1:2 / 2:1 shapes are always available; the
// remaining shapes are compiled only when
// !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER. A shape with no matching case
// returns without writing to dst.
void av1_highbd_dr_prediction_z3_avx2(uint16_t *dst, ptrdiff_t stride, int bw,
                                      int bh, const uint16_t *above,
                                      const uint16_t *left, int upsample_left,
                                      int dx, int dy, int bd) {
  (void)above;
  (void)dx;

  assert(dx == 1);
  assert(dy > 0);

  // Square blocks.
  if (bw == bh) {
    switch (bw) {
      case 4:
        highbd_dr_prediction_z3_4x4_avx2(dst, stride, left, upsample_left, dy,
                                         bd);
        break;
      case 8:
        highbd_dr_prediction_z3_8x8_avx2(dst, stride, left, upsample_left, dy,
                                         bd);
        break;
      case 16:
        highbd_dr_prediction_z3_16x16_avx2(dst, stride, left, upsample_left, dy,
                                           bd);
        break;
      case 32:
        highbd_dr_prediction_z3_32x32_avx2(dst, stride, left, upsample_left, dy,
                                           bd);
        break;
      case 64:
        highbd_dr_prediction_z3_64x64_avx2(dst, stride, left, upsample_left, dy,
                                           bd);
        break;
      default: break;
    }
    return;
  }

  // Tall blocks.
  if (bw < bh) {
    if (bw + bw == bh) {
      // Height is exactly twice the width.
      switch (bw) {
        case 4:
          highbd_dr_prediction_z3_4x8_avx2(dst, stride, left, upsample_left,
                                           dy, bd);
          break;
        case 8:
          highbd_dr_prediction_z3_8x16_avx2(dst, stride, left, upsample_left,
                                            dy, bd);
          break;
        case 16:
          highbd_dr_prediction_z3_16x32_avx2(dst, stride, left, upsample_left,
                                             dy, bd);
          break;
        case 32:
          highbd_dr_prediction_z3_32x64_avx2(dst, stride, left, upsample_left,
                                             dy, bd);
          break;
        default: break;
      }
    } else {
      // Any other tall shape is keyed on the width alone.
      switch (bw) {
#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
        case 4:
          highbd_dr_prediction_z3_4x16_avx2(dst, stride, left, upsample_left,
                                            dy, bd);
          break;
        case 8:
          highbd_dr_prediction_z3_8x32_avx2(dst, stride, left, upsample_left,
                                            dy, bd);
          break;
        case 16:
          highbd_dr_prediction_z3_16x64_avx2(dst, stride, left, upsample_left,
                                             dy, bd);
          break;
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
        default: break;
      }
    }
    return;
  }

  // Wide blocks.
  if (bh + bh == bw) {
    // Width is exactly twice the height.
    switch (bh) {
      case 4:
        highbd_dr_prediction_z3_8x4_avx2(dst, stride, left, upsample_left, dy,
                                         bd);
        break;
      case 8:
        highbd_dr_prediction_z3_16x8_avx2(dst, stride, left, upsample_left, dy,
                                          bd);
        break;
      case 16:
        highbd_dr_prediction_z3_32x16_avx2(dst, stride, left, upsample_left, dy,
                                           bd);
        break;
      case 32:
        highbd_dr_prediction_z3_64x32_avx2(dst, stride, left, upsample_left, dy,
                                           bd);
        break;
      default: break;
    }
  } else {
    // Any other wide shape is keyed on the height alone.
    switch (bh) {
#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
      case 4:
        highbd_dr_prediction_z3_16x4_avx2(dst, stride, left, upsample_left, dy,
                                          bd);
        break;
      case 8:
        highbd_dr_prediction_z3_32x8_avx2(dst, stride, left, upsample_left, dy,
                                          bd);
        break;
      case 16:
        highbd_dr_prediction_z3_64x16_avx2(dst, stride, left, upsample_left, dy,
                                           bd);
        break;
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
      default: break;
    }
  }
}
3452*77c1e3ccSAndroid Build Coastguard Worker #endif  // CONFIG_AV1_HIGHBITDEPTH
3453*77c1e3ccSAndroid Build Coastguard Worker 
3454*77c1e3ccSAndroid Build Coastguard Worker // Low bit depth functions
3455*77c1e3ccSAndroid Build Coastguard Worker static DECLARE_ALIGNED(32, uint8_t, BaseMask[33][32]) = {
3456*77c1e3ccSAndroid Build Coastguard Worker   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
3457*77c1e3ccSAndroid Build Coastguard Worker     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
3458*77c1e3ccSAndroid Build Coastguard Worker   { 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
3459*77c1e3ccSAndroid Build Coastguard Worker     0,    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
3460*77c1e3ccSAndroid Build Coastguard Worker   { 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
3461*77c1e3ccSAndroid Build Coastguard Worker     0,    0,    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
3462*77c1e3ccSAndroid Build Coastguard Worker   { 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
3463*77c1e3ccSAndroid Build Coastguard Worker     0,    0,    0,    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
3464*77c1e3ccSAndroid Build Coastguard Worker   { 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
3465*77c1e3ccSAndroid Build Coastguard Worker     0,    0,    0,    0,    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
3466*77c1e3ccSAndroid Build Coastguard Worker   { 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
3467*77c1e3ccSAndroid Build Coastguard Worker     0,    0,    0,    0,    0,    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
3468*77c1e3ccSAndroid Build Coastguard Worker   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
3469*77c1e3ccSAndroid Build Coastguard Worker     0,    0,    0,    0,    0,    0,    0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
3470*77c1e3ccSAndroid Build Coastguard Worker   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0,
3471*77c1e3ccSAndroid Build Coastguard Worker     0,    0,    0,    0,    0,    0,    0,    0, 0, 0, 0, 0, 0, 0, 0, 0 },
3472*77c1e3ccSAndroid Build Coastguard Worker   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0,
3473*77c1e3ccSAndroid Build Coastguard Worker     0,    0,    0,    0,    0,    0,    0,    0,    0, 0, 0, 0, 0, 0, 0, 0 },
3474*77c1e3ccSAndroid Build Coastguard Worker   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0,
3475*77c1e3ccSAndroid Build Coastguard Worker     0,    0,    0,    0,    0,    0,    0,    0,    0,    0, 0, 0, 0, 0, 0, 0 },
3476*77c1e3ccSAndroid Build Coastguard Worker   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,
3477*77c1e3ccSAndroid Build Coastguard Worker     0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
3478*77c1e3ccSAndroid Build Coastguard Worker     0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
3479*77c1e3ccSAndroid Build Coastguard Worker   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3480*77c1e3ccSAndroid Build Coastguard Worker     0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
3481*77c1e3ccSAndroid Build Coastguard Worker     0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
3482*77c1e3ccSAndroid Build Coastguard Worker   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3483*77c1e3ccSAndroid Build Coastguard Worker     0xff, 0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
3484*77c1e3ccSAndroid Build Coastguard Worker     0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
3485*77c1e3ccSAndroid Build Coastguard Worker   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3486*77c1e3ccSAndroid Build Coastguard Worker     0xff, 0xff, 0,    0,    0,    0,    0,    0,    0,    0,    0,
3487*77c1e3ccSAndroid Build Coastguard Worker     0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
3488*77c1e3ccSAndroid Build Coastguard Worker   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3489*77c1e3ccSAndroid Build Coastguard Worker     0xff, 0xff, 0xff, 0,    0,    0,    0,    0,    0,    0,    0,
3490*77c1e3ccSAndroid Build Coastguard Worker     0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
3491*77c1e3ccSAndroid Build Coastguard Worker   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3492*77c1e3ccSAndroid Build Coastguard Worker     0xff, 0xff, 0xff, 0xff, 0,    0,    0,    0,    0,    0,    0,
3493*77c1e3ccSAndroid Build Coastguard Worker     0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
3494*77c1e3ccSAndroid Build Coastguard Worker   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3495*77c1e3ccSAndroid Build Coastguard Worker     0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,    0,    0,    0,    0,
3496*77c1e3ccSAndroid Build Coastguard Worker     0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
3497*77c1e3ccSAndroid Build Coastguard Worker   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3498*77c1e3ccSAndroid Build Coastguard Worker     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,    0,    0,    0,
3499*77c1e3ccSAndroid Build Coastguard Worker     0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
3500*77c1e3ccSAndroid Build Coastguard Worker   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3501*77c1e3ccSAndroid Build Coastguard Worker     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,    0,    0,
3502*77c1e3ccSAndroid Build Coastguard Worker     0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
3503*77c1e3ccSAndroid Build Coastguard Worker   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3504*77c1e3ccSAndroid Build Coastguard Worker     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,    0,
3505*77c1e3ccSAndroid Build Coastguard Worker     0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
3506*77c1e3ccSAndroid Build Coastguard Worker   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3507*77c1e3ccSAndroid Build Coastguard Worker     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,
3508*77c1e3ccSAndroid Build Coastguard Worker     0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
3509*77c1e3ccSAndroid Build Coastguard Worker   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3510*77c1e3ccSAndroid Build Coastguard Worker     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,
3511*77c1e3ccSAndroid Build Coastguard Worker     0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
3512*77c1e3ccSAndroid Build Coastguard Worker   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3513*77c1e3ccSAndroid Build Coastguard Worker     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3514*77c1e3ccSAndroid Build Coastguard Worker     0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
3515*77c1e3ccSAndroid Build Coastguard Worker   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3516*77c1e3ccSAndroid Build Coastguard Worker     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3517*77c1e3ccSAndroid Build Coastguard Worker     0xff, 0,    0,    0,    0,    0,    0,    0,    0,    0 },
3518*77c1e3ccSAndroid Build Coastguard Worker   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3519*77c1e3ccSAndroid Build Coastguard Worker     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3520*77c1e3ccSAndroid Build Coastguard Worker     0xff, 0xff, 0,    0,    0,    0,    0,    0,    0,    0 },
3521*77c1e3ccSAndroid Build Coastguard Worker   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3522*77c1e3ccSAndroid Build Coastguard Worker     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3523*77c1e3ccSAndroid Build Coastguard Worker     0xff, 0xff, 0xff, 0,    0,    0,    0,    0,    0,    0 },
3524*77c1e3ccSAndroid Build Coastguard Worker   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3525*77c1e3ccSAndroid Build Coastguard Worker     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3526*77c1e3ccSAndroid Build Coastguard Worker     0xff, 0xff, 0xff, 0xff, 0,    0,    0,    0,    0,    0 },
3527*77c1e3ccSAndroid Build Coastguard Worker   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3528*77c1e3ccSAndroid Build Coastguard Worker     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3529*77c1e3ccSAndroid Build Coastguard Worker     0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,    0,    0,    0 },
3530*77c1e3ccSAndroid Build Coastguard Worker   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3531*77c1e3ccSAndroid Build Coastguard Worker     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3532*77c1e3ccSAndroid Build Coastguard Worker     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,    0,    0 },
3533*77c1e3ccSAndroid Build Coastguard Worker   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3534*77c1e3ccSAndroid Build Coastguard Worker     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3535*77c1e3ccSAndroid Build Coastguard Worker     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,    0 },
3536*77c1e3ccSAndroid Build Coastguard Worker   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3537*77c1e3ccSAndroid Build Coastguard Worker     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3538*77c1e3ccSAndroid Build Coastguard Worker     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,    0 },
3539*77c1e3ccSAndroid Build Coastguard Worker   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3540*77c1e3ccSAndroid Build Coastguard Worker     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3541*77c1e3ccSAndroid Build Coastguard Worker     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0 },
3542*77c1e3ccSAndroid Build Coastguard Worker   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3543*77c1e3ccSAndroid Build Coastguard Worker     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3544*77c1e3ccSAndroid Build Coastguard Worker     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
3545*77c1e3ccSAndroid Build Coastguard Worker };
3546*77c1e3ccSAndroid Build Coastguard Worker 
3547*77c1e3ccSAndroid Build Coastguard Worker /* clang-format on */
dr_prediction_z1_HxW_internal_avx2(int H,int W,__m128i * dst,const uint8_t * above,int upsample_above,int dx)3548*77c1e3ccSAndroid Build Coastguard Worker static AOM_FORCE_INLINE void dr_prediction_z1_HxW_internal_avx2(
3549*77c1e3ccSAndroid Build Coastguard Worker     int H, int W, __m128i *dst, const uint8_t *above, int upsample_above,
3550*77c1e3ccSAndroid Build Coastguard Worker     int dx) {
3551*77c1e3ccSAndroid Build Coastguard Worker   const int frac_bits = 6 - upsample_above;
3552*77c1e3ccSAndroid Build Coastguard Worker   const int max_base_x = ((W + H) - 1) << upsample_above;
3553*77c1e3ccSAndroid Build Coastguard Worker 
3554*77c1e3ccSAndroid Build Coastguard Worker   assert(dx > 0);
3555*77c1e3ccSAndroid Build Coastguard Worker   // pre-filter above pixels
3556*77c1e3ccSAndroid Build Coastguard Worker   // store in temp buffers:
3557*77c1e3ccSAndroid Build Coastguard Worker   //   above[x] * 32 + 16
3558*77c1e3ccSAndroid Build Coastguard Worker   //   above[x+1] - above[x]
3559*77c1e3ccSAndroid Build Coastguard Worker   // final pixels will be calculated as:
3560*77c1e3ccSAndroid Build Coastguard Worker   //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
3561*77c1e3ccSAndroid Build Coastguard Worker   __m256i a0, a1, a32, a16;
3562*77c1e3ccSAndroid Build Coastguard Worker   __m256i diff, c3f;
3563*77c1e3ccSAndroid Build Coastguard Worker   __m128i a_mbase_x;
3564*77c1e3ccSAndroid Build Coastguard Worker 
3565*77c1e3ccSAndroid Build Coastguard Worker   a16 = _mm256_set1_epi16(16);
3566*77c1e3ccSAndroid Build Coastguard Worker   a_mbase_x = _mm_set1_epi8((int8_t)above[max_base_x]);
3567*77c1e3ccSAndroid Build Coastguard Worker   c3f = _mm256_set1_epi16(0x3f);
3568*77c1e3ccSAndroid Build Coastguard Worker 
3569*77c1e3ccSAndroid Build Coastguard Worker   int x = dx;
3570*77c1e3ccSAndroid Build Coastguard Worker   for (int r = 0; r < W; r++) {
3571*77c1e3ccSAndroid Build Coastguard Worker     __m256i b, res, shift;
3572*77c1e3ccSAndroid Build Coastguard Worker     __m128i res1, a0_128, a1_128;
3573*77c1e3ccSAndroid Build Coastguard Worker 
3574*77c1e3ccSAndroid Build Coastguard Worker     int base = x >> frac_bits;
3575*77c1e3ccSAndroid Build Coastguard Worker     int base_max_diff = (max_base_x - base) >> upsample_above;
3576*77c1e3ccSAndroid Build Coastguard Worker     if (base_max_diff <= 0) {
3577*77c1e3ccSAndroid Build Coastguard Worker       for (int i = r; i < W; ++i) {
3578*77c1e3ccSAndroid Build Coastguard Worker         dst[i] = a_mbase_x;  // save 4 values
3579*77c1e3ccSAndroid Build Coastguard Worker       }
3580*77c1e3ccSAndroid Build Coastguard Worker       return;
3581*77c1e3ccSAndroid Build Coastguard Worker     }
3582*77c1e3ccSAndroid Build Coastguard Worker     if (base_max_diff > H) base_max_diff = H;
3583*77c1e3ccSAndroid Build Coastguard Worker     a0_128 = _mm_loadu_si128((__m128i *)(above + base));
3584*77c1e3ccSAndroid Build Coastguard Worker     a1_128 = _mm_loadu_si128((__m128i *)(above + base + 1));
3585*77c1e3ccSAndroid Build Coastguard Worker 
3586*77c1e3ccSAndroid Build Coastguard Worker     if (upsample_above) {
3587*77c1e3ccSAndroid Build Coastguard Worker       a0_128 = _mm_shuffle_epi8(a0_128, *(__m128i *)EvenOddMaskx[0]);
3588*77c1e3ccSAndroid Build Coastguard Worker       a1_128 = _mm_srli_si128(a0_128, 8);
3589*77c1e3ccSAndroid Build Coastguard Worker 
3590*77c1e3ccSAndroid Build Coastguard Worker       shift = _mm256_srli_epi16(
3591*77c1e3ccSAndroid Build Coastguard Worker           _mm256_and_si256(
3592*77c1e3ccSAndroid Build Coastguard Worker               _mm256_slli_epi16(_mm256_set1_epi16(x), upsample_above), c3f),
3593*77c1e3ccSAndroid Build Coastguard Worker           1);
3594*77c1e3ccSAndroid Build Coastguard Worker     } else {
3595*77c1e3ccSAndroid Build Coastguard Worker       shift = _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
3596*77c1e3ccSAndroid Build Coastguard Worker     }
3597*77c1e3ccSAndroid Build Coastguard Worker     a0 = _mm256_cvtepu8_epi16(a0_128);
3598*77c1e3ccSAndroid Build Coastguard Worker     a1 = _mm256_cvtepu8_epi16(a1_128);
3599*77c1e3ccSAndroid Build Coastguard Worker 
3600*77c1e3ccSAndroid Build Coastguard Worker     diff = _mm256_sub_epi16(a1, a0);   // a[x+1] - a[x]
3601*77c1e3ccSAndroid Build Coastguard Worker     a32 = _mm256_slli_epi16(a0, 5);    // a[x] * 32
3602*77c1e3ccSAndroid Build Coastguard Worker     a32 = _mm256_add_epi16(a32, a16);  // a[x] * 32 + 16
3603*77c1e3ccSAndroid Build Coastguard Worker 
3604*77c1e3ccSAndroid Build Coastguard Worker     b = _mm256_mullo_epi16(diff, shift);
3605*77c1e3ccSAndroid Build Coastguard Worker     res = _mm256_add_epi16(a32, b);
3606*77c1e3ccSAndroid Build Coastguard Worker     res = _mm256_srli_epi16(res, 5);
3607*77c1e3ccSAndroid Build Coastguard Worker 
3608*77c1e3ccSAndroid Build Coastguard Worker     res = _mm256_packus_epi16(
3609*77c1e3ccSAndroid Build Coastguard Worker         res, _mm256_castsi128_si256(
3610*77c1e3ccSAndroid Build Coastguard Worker                  _mm256_extracti128_si256(res, 1)));  // goto 8 bit
3611*77c1e3ccSAndroid Build Coastguard Worker     res1 = _mm256_castsi256_si128(res);               // 16 8bit values
3612*77c1e3ccSAndroid Build Coastguard Worker 
3613*77c1e3ccSAndroid Build Coastguard Worker     dst[r] =
3614*77c1e3ccSAndroid Build Coastguard Worker         _mm_blendv_epi8(a_mbase_x, res1, *(__m128i *)BaseMask[base_max_diff]);
3615*77c1e3ccSAndroid Build Coastguard Worker     x += dx;
3616*77c1e3ccSAndroid Build Coastguard Worker   }
3617*77c1e3ccSAndroid Build Coastguard Worker }
3618*77c1e3ccSAndroid Build Coastguard Worker 
// Zone-1 directional prediction for a 4-wide block with N rows.
//
// Computes N row vectors with dr_prediction_z1_HxW_internal_avx2 and writes
// the low 4 bytes of vector i to row i of dst.
//
//   N               number of rows; must not exceed 16 (size of the local
//                   vector array)
//   dst, stride     destination and its pitch in bytes
//   above, upsample_above, dx
//                   forwarded unchanged to the internal predictor
static void dr_prediction_z1_4xN_avx2(int N, uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above, int upsample_above,
                                      int dx) {
  __m128i dstvec[16];

  dr_prediction_z1_HxW_internal_avx2(4, N, dstvec, above, upsample_above, dx);
  for (int i = 0; i < N; i++) {
    // Store byte by byte instead of through an `int *` cast: dst rows are not
    // guaranteed to be 4-byte aligned, and writing uint8_t storage through an
    // int lvalue violates the effective-type rules. Byte 0 of the vector is
    // the least significant byte of the extracted 32-bit value, so the shifts
    // below reproduce the little-endian x86 store order.
    const uint32_t px = (uint32_t)_mm_cvtsi128_si32(dstvec[i]);
    uint8_t *const row = dst + stride * i;
    row[0] = (uint8_t)px;
    row[1] = (uint8_t)(px >> 8);
    row[2] = (uint8_t)(px >> 16);
    row[3] = (uint8_t)(px >> 24);
  }
}
3629*77c1e3ccSAndroid Build Coastguard Worker 
// Zone-1 directional prediction for an 8-wide block with N rows.
//
// Computes N row vectors with dr_prediction_z1_HxW_internal_avx2 and writes
// the low 8 bytes of each vector to the matching row of dst. N must not
// exceed 32, the size of the local vector array.
static void dr_prediction_z1_8xN_avx2(int N, uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above, int upsample_above,
                                      int dx) {
  __m128i rows[32];

  dr_prediction_z1_HxW_internal_avx2(8, N, rows, above, upsample_above, dx);

  for (int r = 0; r < N; ++r) {
    uint8_t *const out = dst + stride * r;
    _mm_storel_epi64((__m128i *)out, rows[r]);
  }
}
3640*77c1e3ccSAndroid Build Coastguard Worker 
// Zone-1 directional prediction for a 16-wide block with N rows.
//
// Computes N row vectors with dr_prediction_z1_HxW_internal_avx2 and writes
// all 16 bytes of each vector to the matching row of dst with an unaligned
// store. N must not exceed 64, the size of the local vector array.
static void dr_prediction_z1_16xN_avx2(int N, uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above, int upsample_above,
                                       int dx) {
  __m128i rows[64];

  dr_prediction_z1_HxW_internal_avx2(16, N, rows, above, upsample_above, dx);

  for (int r = 0; r < N; ++r) {
    uint8_t *const out = dst + stride * r;
    _mm_storeu_si128((__m128i *)out, rows[r]);
  }
}
3651*77c1e3ccSAndroid Build Coastguard Worker 
dr_prediction_z1_32xN_internal_avx2(int N,__m256i * dstvec,const uint8_t * above,int upsample_above,int dx)3652*77c1e3ccSAndroid Build Coastguard Worker static AOM_FORCE_INLINE void dr_prediction_z1_32xN_internal_avx2(
3653*77c1e3ccSAndroid Build Coastguard Worker     int N, __m256i *dstvec, const uint8_t *above, int upsample_above, int dx) {
3654*77c1e3ccSAndroid Build Coastguard Worker   // here upsample_above is 0 by design of av1_use_intra_edge_upsample
3655*77c1e3ccSAndroid Build Coastguard Worker   (void)upsample_above;
3656*77c1e3ccSAndroid Build Coastguard Worker   const int frac_bits = 6;
3657*77c1e3ccSAndroid Build Coastguard Worker   const int max_base_x = ((32 + N) - 1);
3658*77c1e3ccSAndroid Build Coastguard Worker 
3659*77c1e3ccSAndroid Build Coastguard Worker   // pre-filter above pixels
3660*77c1e3ccSAndroid Build Coastguard Worker   // store in temp buffers:
3661*77c1e3ccSAndroid Build Coastguard Worker   //   above[x] * 32 + 16
3662*77c1e3ccSAndroid Build Coastguard Worker   //   above[x+1] - above[x]
3663*77c1e3ccSAndroid Build Coastguard Worker   // final pixels will be calculated as:
3664*77c1e3ccSAndroid Build Coastguard Worker   //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
3665*77c1e3ccSAndroid Build Coastguard Worker   __m256i a0, a1, a32, a16;
3666*77c1e3ccSAndroid Build Coastguard Worker   __m256i a_mbase_x, diff, c3f;
3667*77c1e3ccSAndroid Build Coastguard Worker 
3668*77c1e3ccSAndroid Build Coastguard Worker   a16 = _mm256_set1_epi16(16);
3669*77c1e3ccSAndroid Build Coastguard Worker   a_mbase_x = _mm256_set1_epi8((int8_t)above[max_base_x]);
3670*77c1e3ccSAndroid Build Coastguard Worker   c3f = _mm256_set1_epi16(0x3f);
3671*77c1e3ccSAndroid Build Coastguard Worker 
3672*77c1e3ccSAndroid Build Coastguard Worker   int x = dx;
3673*77c1e3ccSAndroid Build Coastguard Worker   for (int r = 0; r < N; r++) {
3674*77c1e3ccSAndroid Build Coastguard Worker     __m256i b, res, res16[2];
3675*77c1e3ccSAndroid Build Coastguard Worker     __m128i a0_128, a1_128;
3676*77c1e3ccSAndroid Build Coastguard Worker 
3677*77c1e3ccSAndroid Build Coastguard Worker     int base = x >> frac_bits;
3678*77c1e3ccSAndroid Build Coastguard Worker     int base_max_diff = (max_base_x - base);
3679*77c1e3ccSAndroid Build Coastguard Worker     if (base_max_diff <= 0) {
3680*77c1e3ccSAndroid Build Coastguard Worker       for (int i = r; i < N; ++i) {
3681*77c1e3ccSAndroid Build Coastguard Worker         dstvec[i] = a_mbase_x;  // save 32 values
3682*77c1e3ccSAndroid Build Coastguard Worker       }
3683*77c1e3ccSAndroid Build Coastguard Worker       return;
3684*77c1e3ccSAndroid Build Coastguard Worker     }
3685*77c1e3ccSAndroid Build Coastguard Worker     if (base_max_diff > 32) base_max_diff = 32;
3686*77c1e3ccSAndroid Build Coastguard Worker     __m256i shift =
3687*77c1e3ccSAndroid Build Coastguard Worker         _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
3688*77c1e3ccSAndroid Build Coastguard Worker 
3689*77c1e3ccSAndroid Build Coastguard Worker     for (int j = 0, jj = 0; j < 32; j += 16, jj++) {
3690*77c1e3ccSAndroid Build Coastguard Worker       int mdiff = base_max_diff - j;
3691*77c1e3ccSAndroid Build Coastguard Worker       if (mdiff <= 0) {
3692*77c1e3ccSAndroid Build Coastguard Worker         res16[jj] = a_mbase_x;
3693*77c1e3ccSAndroid Build Coastguard Worker       } else {
3694*77c1e3ccSAndroid Build Coastguard Worker         a0_128 = _mm_loadu_si128((__m128i *)(above + base + j));
3695*77c1e3ccSAndroid Build Coastguard Worker         a1_128 = _mm_loadu_si128((__m128i *)(above + base + j + 1));
3696*77c1e3ccSAndroid Build Coastguard Worker         a0 = _mm256_cvtepu8_epi16(a0_128);
3697*77c1e3ccSAndroid Build Coastguard Worker         a1 = _mm256_cvtepu8_epi16(a1_128);
3698*77c1e3ccSAndroid Build Coastguard Worker 
3699*77c1e3ccSAndroid Build Coastguard Worker         diff = _mm256_sub_epi16(a1, a0);   // a[x+1] - a[x]
3700*77c1e3ccSAndroid Build Coastguard Worker         a32 = _mm256_slli_epi16(a0, 5);    // a[x] * 32
3701*77c1e3ccSAndroid Build Coastguard Worker         a32 = _mm256_add_epi16(a32, a16);  // a[x] * 32 + 16
3702*77c1e3ccSAndroid Build Coastguard Worker         b = _mm256_mullo_epi16(diff, shift);
3703*77c1e3ccSAndroid Build Coastguard Worker 
3704*77c1e3ccSAndroid Build Coastguard Worker         res = _mm256_add_epi16(a32, b);
3705*77c1e3ccSAndroid Build Coastguard Worker         res = _mm256_srli_epi16(res, 5);
3706*77c1e3ccSAndroid Build Coastguard Worker         res16[jj] = _mm256_packus_epi16(
3707*77c1e3ccSAndroid Build Coastguard Worker             res, _mm256_castsi128_si256(
3708*77c1e3ccSAndroid Build Coastguard Worker                      _mm256_extracti128_si256(res, 1)));  // 16 8bit values
3709*77c1e3ccSAndroid Build Coastguard Worker       }
3710*77c1e3ccSAndroid Build Coastguard Worker     }
3711*77c1e3ccSAndroid Build Coastguard Worker     res16[1] =
3712*77c1e3ccSAndroid Build Coastguard Worker         _mm256_inserti128_si256(res16[0], _mm256_castsi256_si128(res16[1]),
3713*77c1e3ccSAndroid Build Coastguard Worker                                 1);  // 32 8bit values
3714*77c1e3ccSAndroid Build Coastguard Worker 
3715*77c1e3ccSAndroid Build Coastguard Worker     dstvec[r] = _mm256_blendv_epi8(
3716*77c1e3ccSAndroid Build Coastguard Worker         a_mbase_x, res16[1],
3717*77c1e3ccSAndroid Build Coastguard Worker         *(__m256i *)BaseMask[base_max_diff]);  // 32 8bit values
3718*77c1e3ccSAndroid Build Coastguard Worker     x += dx;
3719*77c1e3ccSAndroid Build Coastguard Worker   }
3720*77c1e3ccSAndroid Build Coastguard Worker }
3721*77c1e3ccSAndroid Build Coastguard Worker 
// Zone-1 directional prediction for a block 32 pixels wide and N rows tall.
// Rows are computed by dr_prediction_z1_32xN_internal_avx2() into a scratch
// array of 64 vectors (so N must not exceed 64) and then written, 32 bytes
// each, to successive rows of dst, `stride` bytes apart.
static void dr_prediction_z1_32xN_avx2(int N, uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above, int upsample_above,
                                       int dx) {
  __m256i rows[64];
  dr_prediction_z1_32xN_internal_avx2(N, rows, above, upsample_above, dx);

  uint8_t *out = dst;
  for (int r = 0; r < N; ++r, out += stride) {
    _mm256_storeu_si256((__m256i *)out, rows[r]);  // 32 pixels per row
  }
}
3731*77c1e3ccSAndroid Build Coastguard Worker 
// Zone-1 directional prediction for a block 64 pixels wide and N rows tall,
// written directly into dst (rows `stride` bytes apart).
//
// x starts at dx and grows by dx per row. Its integer part (x >> 6) is the
// index of the first `above` sample used for the row, and its 6-bit fraction,
// halved, is the interpolation weight `shift`:
//   (above[i] * 32 + 16 + (above[i + 1] - above[i]) * shift) >> 5
// Pixels whose source index is at or beyond max_base_x = 64 + N - 1 take the
// value above[max_base_x]; once a row starts at or beyond max_base_x, that
// row and all remaining rows are filled with it.
//
// Each row is produced in four 16-pixel groups. A group that is at least
// partly in range is fetched with two unaligned 16-byte loads (offsets
// base + j and base + j + 1), so `above` must be readable that far.
static void dr_prediction_z1_64xN_avx2(int N, uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above, int upsample_above,
                                       int dx) {
  // here upsample_above is 0 by design of av1_use_intra_edge_upsample
  (void)upsample_above;
  const int frac_bits = 6;
  const int max_base_x = 64 + N - 1;

  const __m256i round16 = _mm256_set1_epi16(16);
  const __m256i edge_fill = _mm256_set1_epi8((int8_t)above[max_base_x]);
  const __m128i edge_fill128 = _mm256_castsi256_si128(edge_fill);
  const __m128i limit128 = _mm_set1_epi8(max_base_x);
  const __m256i frac_mask = _mm256_set1_epi16(0x3f);

  uint8_t *row = dst;
  int x = dx;
  for (int r = 0; r < N; ++r, row += stride, x += dx) {
    const int base = x >> frac_bits;
    if (base >= max_base_x) {
      // Nothing left to interpolate: replicate the edge pixel downwards.
      for (int i = r; i < N; ++i, row += stride) {
        _mm256_storeu_si256((__m256i *)row, edge_fill);  // save 32 values
        _mm256_storeu_si256((__m256i *)(row + 32), edge_fill);
      }
      return;
    }

    // (x & 0x3f) >> 1, broadcast to every 16-bit lane.
    const __m256i weight = _mm256_srli_epi16(
        _mm256_and_si256(_mm256_set1_epi16(x), frac_mask), 1);

    for (int j = 0; j < 64; j += 16) {
      const int first = base + j;  // source index of this group's pixel 0
      uint8_t *const out = row + j;

      if (max_base_x - first <= 0) {
        // Whole group lies past the edge.
        _mm_storeu_si128((__m128i *)out, edge_fill128);
        continue;
      }

      const __m128i cur8 = _mm_loadu_si128((__m128i *)(above + first));
      const __m128i next8 = _mm_loadu_si128((__m128i *)(above + first + 1));
      const __m256i cur = _mm256_cvtepu8_epi16(cur8);
      const __m256i next = _mm256_cvtepu8_epi16(next8);

      const __m256i delta = _mm256_sub_epi16(next, cur);  // a[x+1] - a[x]
      __m256i acc = _mm256_slli_epi16(cur, 5);            // a[x] * 32
      acc = _mm256_add_epi16(acc, round16);               // a[x] * 32 + 16
      acc = _mm256_add_epi16(acc, _mm256_mullo_epi16(delta, weight));
      acc = _mm256_srli_epi16(acc, 5);

      // Pack the sixteen 16-bit results into the low 16 bytes.
      acc = _mm256_packus_epi16(
          acc, _mm256_castsi128_si256(
                   _mm256_extracti128_si256(acc, 1)));  // 16 8bit values

      // Per-byte source index of each pixel in the group.
      const __m128i pos = _mm_setr_epi8(
          (int8_t)(first), (int8_t)(first + 1), (int8_t)(first + 2),
          (int8_t)(first + 3), (int8_t)(first + 4), (int8_t)(first + 5),
          (int8_t)(first + 6), (int8_t)(first + 7), (int8_t)(first + 8),
          (int8_t)(first + 9), (int8_t)(first + 10), (int8_t)(first + 11),
          (int8_t)(first + 12), (int8_t)(first + 13), (int8_t)(first + 14),
          (int8_t)(first + 15));

      // Byte is all-ones where max_base_x - pos > 0 (unsigned saturating
      // subtract), i.e. where the pixel is still before the edge.
      const __m128i in_range = _mm_cmpgt_epi8(
          _mm_subs_epu8(limit128, pos), _mm_setzero_si128());

      const __m128i px = _mm_blendv_epi8(
          edge_fill128, _mm256_castsi256_si128(acc), in_range);
      _mm_storeu_si128((__m128i *)out, px);
    }
  }
}
3814*77c1e3ccSAndroid Build Coastguard Worker 
// Directional prediction, zone 1: 0 < angle < 90
//
// Dispatches on the block width `bw` to the matching width-specialised
// kernel, passing `bh` as the row count. Only `above` is forwarded; `left`
// and `dy` are unused here. Widths other than 4, 8, 16, 32 and 64 are
// ignored and leave dst untouched.
void av1_dr_prediction_z1_avx2(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
                               const uint8_t *above, const uint8_t *left,
                               int upsample_above, int dx, int dy) {
  (void)left;
  (void)dy;

  if (bw == 4) {
    dr_prediction_z1_4xN_avx2(bh, dst, stride, above, upsample_above, dx);
  } else if (bw == 8) {
    dr_prediction_z1_8xN_avx2(bh, dst, stride, above, upsample_above, dx);
  } else if (bw == 16) {
    dr_prediction_z1_16xN_avx2(bh, dst, stride, above, upsample_above, dx);
  } else if (bw == 32) {
    dr_prediction_z1_32xN_avx2(bh, dst, stride, above, upsample_above, dx);
  } else if (bw == 64) {
    dr_prediction_z1_64xN_avx2(bh, dst, stride, above, upsample_above, dx);
  }
}
3841*77c1e3ccSAndroid Build Coastguard Worker 
dr_prediction_z2_Nx4_avx2(int N,uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left,int upsample_above,int upsample_left,int dx,int dy)3842*77c1e3ccSAndroid Build Coastguard Worker static void dr_prediction_z2_Nx4_avx2(int N, uint8_t *dst, ptrdiff_t stride,
3843*77c1e3ccSAndroid Build Coastguard Worker                                       const uint8_t *above, const uint8_t *left,
3844*77c1e3ccSAndroid Build Coastguard Worker                                       int upsample_above, int upsample_left,
3845*77c1e3ccSAndroid Build Coastguard Worker                                       int dx, int dy) {
3846*77c1e3ccSAndroid Build Coastguard Worker   const int min_base_x = -(1 << upsample_above);
3847*77c1e3ccSAndroid Build Coastguard Worker   const int min_base_y = -(1 << upsample_left);
3848*77c1e3ccSAndroid Build Coastguard Worker   const int frac_bits_x = 6 - upsample_above;
3849*77c1e3ccSAndroid Build Coastguard Worker   const int frac_bits_y = 6 - upsample_left;
3850*77c1e3ccSAndroid Build Coastguard Worker 
3851*77c1e3ccSAndroid Build Coastguard Worker   assert(dx > 0);
3852*77c1e3ccSAndroid Build Coastguard Worker   // pre-filter above pixels
3853*77c1e3ccSAndroid Build Coastguard Worker   // store in temp buffers:
3854*77c1e3ccSAndroid Build Coastguard Worker   //   above[x] * 32 + 16
3855*77c1e3ccSAndroid Build Coastguard Worker   //   above[x+1] - above[x]
3856*77c1e3ccSAndroid Build Coastguard Worker   // final pixels will be calculated as:
3857*77c1e3ccSAndroid Build Coastguard Worker   //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
3858*77c1e3ccSAndroid Build Coastguard Worker   __m128i a0_x, a1_x, a32, a16, diff;
3859*77c1e3ccSAndroid Build Coastguard Worker   __m128i c3f, min_base_y128, c1234, dy128;
3860*77c1e3ccSAndroid Build Coastguard Worker 
3861*77c1e3ccSAndroid Build Coastguard Worker   a16 = _mm_set1_epi16(16);
3862*77c1e3ccSAndroid Build Coastguard Worker   c3f = _mm_set1_epi16(0x3f);
3863*77c1e3ccSAndroid Build Coastguard Worker   min_base_y128 = _mm_set1_epi16(min_base_y);
3864*77c1e3ccSAndroid Build Coastguard Worker   c1234 = _mm_setr_epi16(0, 1, 2, 3, 4, 0, 0, 0);
3865*77c1e3ccSAndroid Build Coastguard Worker   dy128 = _mm_set1_epi16(dy);
3866*77c1e3ccSAndroid Build Coastguard Worker 
3867*77c1e3ccSAndroid Build Coastguard Worker   for (int r = 0; r < N; r++) {
3868*77c1e3ccSAndroid Build Coastguard Worker     __m128i b, res, shift, r6, ydx;
3869*77c1e3ccSAndroid Build Coastguard Worker     __m128i resx, resy, resxy;
3870*77c1e3ccSAndroid Build Coastguard Worker     __m128i a0_x128, a1_x128;
3871*77c1e3ccSAndroid Build Coastguard Worker     int y = r + 1;
3872*77c1e3ccSAndroid Build Coastguard Worker     int base_x = (-y * dx) >> frac_bits_x;
3873*77c1e3ccSAndroid Build Coastguard Worker     int base_shift = 0;
3874*77c1e3ccSAndroid Build Coastguard Worker     if (base_x < (min_base_x - 1)) {
3875*77c1e3ccSAndroid Build Coastguard Worker       base_shift = (min_base_x - base_x - 1) >> upsample_above;
3876*77c1e3ccSAndroid Build Coastguard Worker     }
3877*77c1e3ccSAndroid Build Coastguard Worker     int base_min_diff =
3878*77c1e3ccSAndroid Build Coastguard Worker         (min_base_x - base_x + upsample_above) >> upsample_above;
3879*77c1e3ccSAndroid Build Coastguard Worker     if (base_min_diff > 4) {
3880*77c1e3ccSAndroid Build Coastguard Worker       base_min_diff = 4;
3881*77c1e3ccSAndroid Build Coastguard Worker     } else {
3882*77c1e3ccSAndroid Build Coastguard Worker       if (base_min_diff < 0) base_min_diff = 0;
3883*77c1e3ccSAndroid Build Coastguard Worker     }
3884*77c1e3ccSAndroid Build Coastguard Worker 
3885*77c1e3ccSAndroid Build Coastguard Worker     if (base_shift > 3) {
3886*77c1e3ccSAndroid Build Coastguard Worker       a0_x = _mm_setzero_si128();
3887*77c1e3ccSAndroid Build Coastguard Worker       a1_x = _mm_setzero_si128();
3888*77c1e3ccSAndroid Build Coastguard Worker       shift = _mm_setzero_si128();
3889*77c1e3ccSAndroid Build Coastguard Worker     } else {
3890*77c1e3ccSAndroid Build Coastguard Worker       a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
3891*77c1e3ccSAndroid Build Coastguard Worker       ydx = _mm_set1_epi16(y * dx);
3892*77c1e3ccSAndroid Build Coastguard Worker       r6 = _mm_slli_epi16(c1234, 6);
3893*77c1e3ccSAndroid Build Coastguard Worker 
3894*77c1e3ccSAndroid Build Coastguard Worker       if (upsample_above) {
3895*77c1e3ccSAndroid Build Coastguard Worker         a0_x128 =
3896*77c1e3ccSAndroid Build Coastguard Worker             _mm_shuffle_epi8(a0_x128, *(__m128i *)EvenOddMaskx[base_shift]);
3897*77c1e3ccSAndroid Build Coastguard Worker         a1_x128 = _mm_srli_si128(a0_x128, 8);
3898*77c1e3ccSAndroid Build Coastguard Worker 
3899*77c1e3ccSAndroid Build Coastguard Worker         shift = _mm_srli_epi16(
3900*77c1e3ccSAndroid Build Coastguard Worker             _mm_and_si128(
3901*77c1e3ccSAndroid Build Coastguard Worker                 _mm_slli_epi16(_mm_sub_epi16(r6, ydx), upsample_above), c3f),
3902*77c1e3ccSAndroid Build Coastguard Worker             1);
3903*77c1e3ccSAndroid Build Coastguard Worker       } else {
3904*77c1e3ccSAndroid Build Coastguard Worker         a0_x128 = _mm_shuffle_epi8(a0_x128, *(__m128i *)LoadMaskx[base_shift]);
3905*77c1e3ccSAndroid Build Coastguard Worker         a1_x128 = _mm_srli_si128(a0_x128, 1);
3906*77c1e3ccSAndroid Build Coastguard Worker 
3907*77c1e3ccSAndroid Build Coastguard Worker         shift = _mm_srli_epi16(_mm_and_si128(_mm_sub_epi16(r6, ydx), c3f), 1);
3908*77c1e3ccSAndroid Build Coastguard Worker       }
3909*77c1e3ccSAndroid Build Coastguard Worker       a0_x = _mm_cvtepu8_epi16(a0_x128);
3910*77c1e3ccSAndroid Build Coastguard Worker       a1_x = _mm_cvtepu8_epi16(a1_x128);
3911*77c1e3ccSAndroid Build Coastguard Worker     }
3912*77c1e3ccSAndroid Build Coastguard Worker     // y calc
3913*77c1e3ccSAndroid Build Coastguard Worker     __m128i a0_y, a1_y, shifty;
3914*77c1e3ccSAndroid Build Coastguard Worker     if (base_x < min_base_x) {
3915*77c1e3ccSAndroid Build Coastguard Worker       DECLARE_ALIGNED(32, int16_t, base_y_c[8]);
3916*77c1e3ccSAndroid Build Coastguard Worker       __m128i y_c128, base_y_c128, mask128, c1234_;
3917*77c1e3ccSAndroid Build Coastguard Worker       c1234_ = _mm_srli_si128(c1234, 2);
3918*77c1e3ccSAndroid Build Coastguard Worker       r6 = _mm_set1_epi16(r << 6);
3919*77c1e3ccSAndroid Build Coastguard Worker       y_c128 = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234_, dy128));
3920*77c1e3ccSAndroid Build Coastguard Worker       base_y_c128 = _mm_srai_epi16(y_c128, frac_bits_y);
3921*77c1e3ccSAndroid Build Coastguard Worker       mask128 = _mm_cmpgt_epi16(min_base_y128, base_y_c128);
3922*77c1e3ccSAndroid Build Coastguard Worker       base_y_c128 = _mm_andnot_si128(mask128, base_y_c128);
3923*77c1e3ccSAndroid Build Coastguard Worker       _mm_store_si128((__m128i *)base_y_c, base_y_c128);
3924*77c1e3ccSAndroid Build Coastguard Worker 
3925*77c1e3ccSAndroid Build Coastguard Worker       a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
3926*77c1e3ccSAndroid Build Coastguard Worker                             left[base_y_c[2]], left[base_y_c[3]], 0, 0, 0, 0);
3927*77c1e3ccSAndroid Build Coastguard Worker       base_y_c128 = _mm_add_epi16(base_y_c128, _mm_srli_epi16(a16, 4));
3928*77c1e3ccSAndroid Build Coastguard Worker       _mm_store_si128((__m128i *)base_y_c, base_y_c128);
3929*77c1e3ccSAndroid Build Coastguard Worker       a1_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
3930*77c1e3ccSAndroid Build Coastguard Worker                             left[base_y_c[2]], left[base_y_c[3]], 0, 0, 0, 0);
3931*77c1e3ccSAndroid Build Coastguard Worker 
3932*77c1e3ccSAndroid Build Coastguard Worker       if (upsample_left) {
3933*77c1e3ccSAndroid Build Coastguard Worker         shifty = _mm_srli_epi16(
3934*77c1e3ccSAndroid Build Coastguard Worker             _mm_and_si128(_mm_slli_epi16(y_c128, upsample_left), c3f), 1);
3935*77c1e3ccSAndroid Build Coastguard Worker       } else {
3936*77c1e3ccSAndroid Build Coastguard Worker         shifty = _mm_srli_epi16(_mm_and_si128(y_c128, c3f), 1);
3937*77c1e3ccSAndroid Build Coastguard Worker       }
3938*77c1e3ccSAndroid Build Coastguard Worker       a0_x = _mm_unpacklo_epi64(a0_x, a0_y);
3939*77c1e3ccSAndroid Build Coastguard Worker       a1_x = _mm_unpacklo_epi64(a1_x, a1_y);
3940*77c1e3ccSAndroid Build Coastguard Worker       shift = _mm_unpacklo_epi64(shift, shifty);
3941*77c1e3ccSAndroid Build Coastguard Worker     }
3942*77c1e3ccSAndroid Build Coastguard Worker 
3943*77c1e3ccSAndroid Build Coastguard Worker     diff = _mm_sub_epi16(a1_x, a0_x);  // a[x+1] - a[x]
3944*77c1e3ccSAndroid Build Coastguard Worker     a32 = _mm_slli_epi16(a0_x, 5);     // a[x] * 32
3945*77c1e3ccSAndroid Build Coastguard Worker     a32 = _mm_add_epi16(a32, a16);     // a[x] * 32 + 16
3946*77c1e3ccSAndroid Build Coastguard Worker 
3947*77c1e3ccSAndroid Build Coastguard Worker     b = _mm_mullo_epi16(diff, shift);
3948*77c1e3ccSAndroid Build Coastguard Worker     res = _mm_add_epi16(a32, b);
3949*77c1e3ccSAndroid Build Coastguard Worker     res = _mm_srli_epi16(res, 5);
3950*77c1e3ccSAndroid Build Coastguard Worker 
3951*77c1e3ccSAndroid Build Coastguard Worker     resx = _mm_packus_epi16(res, res);
3952*77c1e3ccSAndroid Build Coastguard Worker     resy = _mm_srli_si128(resx, 4);
3953*77c1e3ccSAndroid Build Coastguard Worker 
3954*77c1e3ccSAndroid Build Coastguard Worker     resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)BaseMask[base_min_diff]);
3955*77c1e3ccSAndroid Build Coastguard Worker     *(int *)(dst) = _mm_cvtsi128_si32(resxy);
3956*77c1e3ccSAndroid Build Coastguard Worker     dst += stride;
3957*77c1e3ccSAndroid Build Coastguard Worker   }
3958*77c1e3ccSAndroid Build Coastguard Worker }
3959*77c1e3ccSAndroid Build Coastguard Worker 
dr_prediction_z2_Nx8_avx2(int N,uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left,int upsample_above,int upsample_left,int dx,int dy)3960*77c1e3ccSAndroid Build Coastguard Worker static void dr_prediction_z2_Nx8_avx2(int N, uint8_t *dst, ptrdiff_t stride,
3961*77c1e3ccSAndroid Build Coastguard Worker                                       const uint8_t *above, const uint8_t *left,
3962*77c1e3ccSAndroid Build Coastguard Worker                                       int upsample_above, int upsample_left,
3963*77c1e3ccSAndroid Build Coastguard Worker                                       int dx, int dy) {
3964*77c1e3ccSAndroid Build Coastguard Worker   const int min_base_x = -(1 << upsample_above);
3965*77c1e3ccSAndroid Build Coastguard Worker   const int min_base_y = -(1 << upsample_left);
3966*77c1e3ccSAndroid Build Coastguard Worker   const int frac_bits_x = 6 - upsample_above;
3967*77c1e3ccSAndroid Build Coastguard Worker   const int frac_bits_y = 6 - upsample_left;
3968*77c1e3ccSAndroid Build Coastguard Worker 
3969*77c1e3ccSAndroid Build Coastguard Worker   // pre-filter above pixels
3970*77c1e3ccSAndroid Build Coastguard Worker   // store in temp buffers:
3971*77c1e3ccSAndroid Build Coastguard Worker   //   above[x] * 32 + 16
3972*77c1e3ccSAndroid Build Coastguard Worker   //   above[x+1] - above[x]
3973*77c1e3ccSAndroid Build Coastguard Worker   // final pixels will be calculated as:
3974*77c1e3ccSAndroid Build Coastguard Worker   //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
3975*77c1e3ccSAndroid Build Coastguard Worker   __m256i diff, a32, a16;
3976*77c1e3ccSAndroid Build Coastguard Worker   __m256i a0_x, a1_x;
3977*77c1e3ccSAndroid Build Coastguard Worker   __m128i a0_x128, a1_x128, min_base_y128, c3f;
3978*77c1e3ccSAndroid Build Coastguard Worker   __m128i c1234, dy128;
3979*77c1e3ccSAndroid Build Coastguard Worker 
3980*77c1e3ccSAndroid Build Coastguard Worker   a16 = _mm256_set1_epi16(16);
3981*77c1e3ccSAndroid Build Coastguard Worker   c3f = _mm_set1_epi16(0x3f);
3982*77c1e3ccSAndroid Build Coastguard Worker   min_base_y128 = _mm_set1_epi16(min_base_y);
3983*77c1e3ccSAndroid Build Coastguard Worker   dy128 = _mm_set1_epi16(dy);
3984*77c1e3ccSAndroid Build Coastguard Worker   c1234 = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
3985*77c1e3ccSAndroid Build Coastguard Worker 
3986*77c1e3ccSAndroid Build Coastguard Worker   for (int r = 0; r < N; r++) {
3987*77c1e3ccSAndroid Build Coastguard Worker     __m256i b, res, shift;
3988*77c1e3ccSAndroid Build Coastguard Worker     __m128i resx, resy, resxy, r6, ydx;
3989*77c1e3ccSAndroid Build Coastguard Worker 
3990*77c1e3ccSAndroid Build Coastguard Worker     int y = r + 1;
3991*77c1e3ccSAndroid Build Coastguard Worker     int base_x = (-y * dx) >> frac_bits_x;
3992*77c1e3ccSAndroid Build Coastguard Worker     int base_shift = 0;
3993*77c1e3ccSAndroid Build Coastguard Worker     if (base_x < (min_base_x - 1)) {
3994*77c1e3ccSAndroid Build Coastguard Worker       base_shift = (min_base_x - base_x - 1) >> upsample_above;
3995*77c1e3ccSAndroid Build Coastguard Worker     }
3996*77c1e3ccSAndroid Build Coastguard Worker     int base_min_diff =
3997*77c1e3ccSAndroid Build Coastguard Worker         (min_base_x - base_x + upsample_above) >> upsample_above;
3998*77c1e3ccSAndroid Build Coastguard Worker     if (base_min_diff > 8) {
3999*77c1e3ccSAndroid Build Coastguard Worker       base_min_diff = 8;
4000*77c1e3ccSAndroid Build Coastguard Worker     } else {
4001*77c1e3ccSAndroid Build Coastguard Worker       if (base_min_diff < 0) base_min_diff = 0;
4002*77c1e3ccSAndroid Build Coastguard Worker     }
4003*77c1e3ccSAndroid Build Coastguard Worker 
4004*77c1e3ccSAndroid Build Coastguard Worker     if (base_shift > 7) {
4005*77c1e3ccSAndroid Build Coastguard Worker       a0_x = _mm256_setzero_si256();
4006*77c1e3ccSAndroid Build Coastguard Worker       a1_x = _mm256_setzero_si256();
4007*77c1e3ccSAndroid Build Coastguard Worker       shift = _mm256_setzero_si256();
4008*77c1e3ccSAndroid Build Coastguard Worker     } else {
4009*77c1e3ccSAndroid Build Coastguard Worker       a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
4010*77c1e3ccSAndroid Build Coastguard Worker       ydx = _mm_set1_epi16(y * dx);
4011*77c1e3ccSAndroid Build Coastguard Worker       r6 = _mm_slli_epi16(_mm_srli_si128(c1234, 2), 6);
4012*77c1e3ccSAndroid Build Coastguard Worker       if (upsample_above) {
4013*77c1e3ccSAndroid Build Coastguard Worker         a0_x128 =
4014*77c1e3ccSAndroid Build Coastguard Worker             _mm_shuffle_epi8(a0_x128, *(__m128i *)EvenOddMaskx[base_shift]);
4015*77c1e3ccSAndroid Build Coastguard Worker         a1_x128 = _mm_srli_si128(a0_x128, 8);
4016*77c1e3ccSAndroid Build Coastguard Worker 
4017*77c1e3ccSAndroid Build Coastguard Worker         shift = _mm256_castsi128_si256(_mm_srli_epi16(
4018*77c1e3ccSAndroid Build Coastguard Worker             _mm_and_si128(
4019*77c1e3ccSAndroid Build Coastguard Worker                 _mm_slli_epi16(_mm_sub_epi16(r6, ydx), upsample_above), c3f),
4020*77c1e3ccSAndroid Build Coastguard Worker             1));
4021*77c1e3ccSAndroid Build Coastguard Worker       } else {
4022*77c1e3ccSAndroid Build Coastguard Worker         a1_x128 = _mm_srli_si128(a0_x128, 1);
4023*77c1e3ccSAndroid Build Coastguard Worker         a0_x128 = _mm_shuffle_epi8(a0_x128, *(__m128i *)LoadMaskx[base_shift]);
4024*77c1e3ccSAndroid Build Coastguard Worker         a1_x128 = _mm_shuffle_epi8(a1_x128, *(__m128i *)LoadMaskx[base_shift]);
4025*77c1e3ccSAndroid Build Coastguard Worker 
4026*77c1e3ccSAndroid Build Coastguard Worker         shift = _mm256_castsi128_si256(
4027*77c1e3ccSAndroid Build Coastguard Worker             _mm_srli_epi16(_mm_and_si128(_mm_sub_epi16(r6, ydx), c3f), 1));
4028*77c1e3ccSAndroid Build Coastguard Worker       }
4029*77c1e3ccSAndroid Build Coastguard Worker       a0_x = _mm256_castsi128_si256(_mm_cvtepu8_epi16(a0_x128));
4030*77c1e3ccSAndroid Build Coastguard Worker       a1_x = _mm256_castsi128_si256(_mm_cvtepu8_epi16(a1_x128));
4031*77c1e3ccSAndroid Build Coastguard Worker     }
4032*77c1e3ccSAndroid Build Coastguard Worker 
4033*77c1e3ccSAndroid Build Coastguard Worker     // y calc
4034*77c1e3ccSAndroid Build Coastguard Worker     __m128i a0_y, a1_y, shifty;
4035*77c1e3ccSAndroid Build Coastguard Worker     if (base_x < min_base_x) {
4036*77c1e3ccSAndroid Build Coastguard Worker       DECLARE_ALIGNED(32, int16_t, base_y_c[16]);
4037*77c1e3ccSAndroid Build Coastguard Worker       __m128i y_c128, base_y_c128, mask128;
4038*77c1e3ccSAndroid Build Coastguard Worker       r6 = _mm_set1_epi16(r << 6);
4039*77c1e3ccSAndroid Build Coastguard Worker       y_c128 = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234, dy128));
4040*77c1e3ccSAndroid Build Coastguard Worker       base_y_c128 = _mm_srai_epi16(y_c128, frac_bits_y);
4041*77c1e3ccSAndroid Build Coastguard Worker       mask128 = _mm_cmpgt_epi16(min_base_y128, base_y_c128);
4042*77c1e3ccSAndroid Build Coastguard Worker       base_y_c128 = _mm_andnot_si128(mask128, base_y_c128);
4043*77c1e3ccSAndroid Build Coastguard Worker       _mm_store_si128((__m128i *)base_y_c, base_y_c128);
4044*77c1e3ccSAndroid Build Coastguard Worker 
4045*77c1e3ccSAndroid Build Coastguard Worker       a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
4046*77c1e3ccSAndroid Build Coastguard Worker                             left[base_y_c[2]], left[base_y_c[3]],
4047*77c1e3ccSAndroid Build Coastguard Worker                             left[base_y_c[4]], left[base_y_c[5]],
4048*77c1e3ccSAndroid Build Coastguard Worker                             left[base_y_c[6]], left[base_y_c[7]]);
4049*77c1e3ccSAndroid Build Coastguard Worker       base_y_c128 = _mm_add_epi16(
4050*77c1e3ccSAndroid Build Coastguard Worker           base_y_c128, _mm_srli_epi16(_mm256_castsi256_si128(a16), 4));
4051*77c1e3ccSAndroid Build Coastguard Worker       _mm_store_si128((__m128i *)base_y_c, base_y_c128);
4052*77c1e3ccSAndroid Build Coastguard Worker 
4053*77c1e3ccSAndroid Build Coastguard Worker       a1_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
4054*77c1e3ccSAndroid Build Coastguard Worker                             left[base_y_c[2]], left[base_y_c[3]],
4055*77c1e3ccSAndroid Build Coastguard Worker                             left[base_y_c[4]], left[base_y_c[5]],
4056*77c1e3ccSAndroid Build Coastguard Worker                             left[base_y_c[6]], left[base_y_c[7]]);
4057*77c1e3ccSAndroid Build Coastguard Worker 
4058*77c1e3ccSAndroid Build Coastguard Worker       if (upsample_left) {
4059*77c1e3ccSAndroid Build Coastguard Worker         shifty = _mm_srli_epi16(
4060*77c1e3ccSAndroid Build Coastguard Worker             _mm_and_si128(_mm_slli_epi16(y_c128, upsample_left), c3f), 1);
4061*77c1e3ccSAndroid Build Coastguard Worker       } else {
4062*77c1e3ccSAndroid Build Coastguard Worker         shifty = _mm_srli_epi16(_mm_and_si128(y_c128, c3f), 1);
4063*77c1e3ccSAndroid Build Coastguard Worker       }
4064*77c1e3ccSAndroid Build Coastguard Worker 
4065*77c1e3ccSAndroid Build Coastguard Worker       a0_x = _mm256_inserti128_si256(a0_x, a0_y, 1);
4066*77c1e3ccSAndroid Build Coastguard Worker       a1_x = _mm256_inserti128_si256(a1_x, a1_y, 1);
4067*77c1e3ccSAndroid Build Coastguard Worker       shift = _mm256_inserti128_si256(shift, shifty, 1);
4068*77c1e3ccSAndroid Build Coastguard Worker     }
4069*77c1e3ccSAndroid Build Coastguard Worker 
4070*77c1e3ccSAndroid Build Coastguard Worker     diff = _mm256_sub_epi16(a1_x, a0_x);  // a[x+1] - a[x]
4071*77c1e3ccSAndroid Build Coastguard Worker     a32 = _mm256_slli_epi16(a0_x, 5);     // a[x] * 32
4072*77c1e3ccSAndroid Build Coastguard Worker     a32 = _mm256_add_epi16(a32, a16);     // a[x] * 32 + 16
4073*77c1e3ccSAndroid Build Coastguard Worker 
4074*77c1e3ccSAndroid Build Coastguard Worker     b = _mm256_mullo_epi16(diff, shift);
4075*77c1e3ccSAndroid Build Coastguard Worker     res = _mm256_add_epi16(a32, b);
4076*77c1e3ccSAndroid Build Coastguard Worker     res = _mm256_srli_epi16(res, 5);
4077*77c1e3ccSAndroid Build Coastguard Worker 
4078*77c1e3ccSAndroid Build Coastguard Worker     resx = _mm_packus_epi16(_mm256_castsi256_si128(res),
4079*77c1e3ccSAndroid Build Coastguard Worker                             _mm256_castsi256_si128(res));
4080*77c1e3ccSAndroid Build Coastguard Worker     resy = _mm256_extracti128_si256(res, 1);
4081*77c1e3ccSAndroid Build Coastguard Worker     resy = _mm_packus_epi16(resy, resy);
4082*77c1e3ccSAndroid Build Coastguard Worker 
4083*77c1e3ccSAndroid Build Coastguard Worker     resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)BaseMask[base_min_diff]);
4084*77c1e3ccSAndroid Build Coastguard Worker     _mm_storel_epi64((__m128i *)(dst), resxy);
4085*77c1e3ccSAndroid Build Coastguard Worker     dst += stride;
4086*77c1e3ccSAndroid Build Coastguard Worker   }
4087*77c1e3ccSAndroid Build Coastguard Worker }
4088*77c1e3ccSAndroid Build Coastguard Worker 
dr_prediction_z2_HxW_avx2(int H,int W,uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left,int upsample_above,int upsample_left,int dx,int dy)4089*77c1e3ccSAndroid Build Coastguard Worker static void dr_prediction_z2_HxW_avx2(int H, int W, uint8_t *dst,
4090*77c1e3ccSAndroid Build Coastguard Worker                                       ptrdiff_t stride, const uint8_t *above,
4091*77c1e3ccSAndroid Build Coastguard Worker                                       const uint8_t *left, int upsample_above,
4092*77c1e3ccSAndroid Build Coastguard Worker                                       int upsample_left, int dx, int dy) {
4093*77c1e3ccSAndroid Build Coastguard Worker   // here upsample_above and upsample_left are 0 by design of
4094*77c1e3ccSAndroid Build Coastguard Worker   // av1_use_intra_edge_upsample
4095*77c1e3ccSAndroid Build Coastguard Worker   const int min_base_x = -1;
4096*77c1e3ccSAndroid Build Coastguard Worker   const int min_base_y = -1;
4097*77c1e3ccSAndroid Build Coastguard Worker   (void)upsample_above;
4098*77c1e3ccSAndroid Build Coastguard Worker   (void)upsample_left;
4099*77c1e3ccSAndroid Build Coastguard Worker   const int frac_bits_x = 6;
4100*77c1e3ccSAndroid Build Coastguard Worker   const int frac_bits_y = 6;
4101*77c1e3ccSAndroid Build Coastguard Worker 
4102*77c1e3ccSAndroid Build Coastguard Worker   __m256i a0_x, a1_x, a0_y, a1_y, a32, a16, c1234, c0123;
4103*77c1e3ccSAndroid Build Coastguard Worker   __m256i diff, min_base_y256, c3f, shifty, dy256, c1;
4104*77c1e3ccSAndroid Build Coastguard Worker   __m128i a0_x128, a1_x128;
4105*77c1e3ccSAndroid Build Coastguard Worker 
4106*77c1e3ccSAndroid Build Coastguard Worker   DECLARE_ALIGNED(32, int16_t, base_y_c[16]);
4107*77c1e3ccSAndroid Build Coastguard Worker   a16 = _mm256_set1_epi16(16);
4108*77c1e3ccSAndroid Build Coastguard Worker   c1 = _mm256_srli_epi16(a16, 4);
4109*77c1e3ccSAndroid Build Coastguard Worker   min_base_y256 = _mm256_set1_epi16(min_base_y);
4110*77c1e3ccSAndroid Build Coastguard Worker   c3f = _mm256_set1_epi16(0x3f);
4111*77c1e3ccSAndroid Build Coastguard Worker   dy256 = _mm256_set1_epi16(dy);
4112*77c1e3ccSAndroid Build Coastguard Worker   c0123 =
4113*77c1e3ccSAndroid Build Coastguard Worker       _mm256_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
4114*77c1e3ccSAndroid Build Coastguard Worker   c1234 = _mm256_add_epi16(c0123, c1);
4115*77c1e3ccSAndroid Build Coastguard Worker 
4116*77c1e3ccSAndroid Build Coastguard Worker   for (int r = 0; r < H; r++) {
4117*77c1e3ccSAndroid Build Coastguard Worker     __m256i b, res, shift, j256, r6, ydx;
4118*77c1e3ccSAndroid Build Coastguard Worker     __m128i resx, resy;
4119*77c1e3ccSAndroid Build Coastguard Worker     __m128i resxy;
4120*77c1e3ccSAndroid Build Coastguard Worker     int y = r + 1;
4121*77c1e3ccSAndroid Build Coastguard Worker     ydx = _mm256_set1_epi16((int16_t)(y * dx));
4122*77c1e3ccSAndroid Build Coastguard Worker 
4123*77c1e3ccSAndroid Build Coastguard Worker     int base_x = (-y * dx) >> frac_bits_x;
4124*77c1e3ccSAndroid Build Coastguard Worker     for (int j = 0; j < W; j += 16) {
4125*77c1e3ccSAndroid Build Coastguard Worker       j256 = _mm256_set1_epi16(j);
4126*77c1e3ccSAndroid Build Coastguard Worker       int base_shift = 0;
4127*77c1e3ccSAndroid Build Coastguard Worker       if ((base_x + j) < (min_base_x - 1)) {
4128*77c1e3ccSAndroid Build Coastguard Worker         base_shift = (min_base_x - (base_x + j) - 1);
4129*77c1e3ccSAndroid Build Coastguard Worker       }
4130*77c1e3ccSAndroid Build Coastguard Worker       int base_min_diff = (min_base_x - base_x - j);
4131*77c1e3ccSAndroid Build Coastguard Worker       if (base_min_diff > 16) {
4132*77c1e3ccSAndroid Build Coastguard Worker         base_min_diff = 16;
4133*77c1e3ccSAndroid Build Coastguard Worker       } else {
4134*77c1e3ccSAndroid Build Coastguard Worker         if (base_min_diff < 0) base_min_diff = 0;
4135*77c1e3ccSAndroid Build Coastguard Worker       }
4136*77c1e3ccSAndroid Build Coastguard Worker 
4137*77c1e3ccSAndroid Build Coastguard Worker       if (base_shift < 16) {
4138*77c1e3ccSAndroid Build Coastguard Worker         a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift + j));
4139*77c1e3ccSAndroid Build Coastguard Worker         a1_x128 =
4140*77c1e3ccSAndroid Build Coastguard Worker             _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 1 + j));
4141*77c1e3ccSAndroid Build Coastguard Worker         a0_x128 = _mm_shuffle_epi8(a0_x128, *(__m128i *)LoadMaskx[base_shift]);
4142*77c1e3ccSAndroid Build Coastguard Worker         a1_x128 = _mm_shuffle_epi8(a1_x128, *(__m128i *)LoadMaskx[base_shift]);
4143*77c1e3ccSAndroid Build Coastguard Worker 
4144*77c1e3ccSAndroid Build Coastguard Worker         a0_x = _mm256_cvtepu8_epi16(a0_x128);
4145*77c1e3ccSAndroid Build Coastguard Worker         a1_x = _mm256_cvtepu8_epi16(a1_x128);
4146*77c1e3ccSAndroid Build Coastguard Worker 
4147*77c1e3ccSAndroid Build Coastguard Worker         r6 = _mm256_slli_epi16(_mm256_add_epi16(c0123, j256), 6);
4148*77c1e3ccSAndroid Build Coastguard Worker         shift = _mm256_srli_epi16(
4149*77c1e3ccSAndroid Build Coastguard Worker             _mm256_and_si256(_mm256_sub_epi16(r6, ydx), c3f), 1);
4150*77c1e3ccSAndroid Build Coastguard Worker 
4151*77c1e3ccSAndroid Build Coastguard Worker         diff = _mm256_sub_epi16(a1_x, a0_x);  // a[x+1] - a[x]
4152*77c1e3ccSAndroid Build Coastguard Worker         a32 = _mm256_slli_epi16(a0_x, 5);     // a[x] * 32
4153*77c1e3ccSAndroid Build Coastguard Worker         a32 = _mm256_add_epi16(a32, a16);     // a[x] * 32 + 16
4154*77c1e3ccSAndroid Build Coastguard Worker 
4155*77c1e3ccSAndroid Build Coastguard Worker         b = _mm256_mullo_epi16(diff, shift);
4156*77c1e3ccSAndroid Build Coastguard Worker         res = _mm256_add_epi16(a32, b);
4157*77c1e3ccSAndroid Build Coastguard Worker         res = _mm256_srli_epi16(res, 5);  // 16 16-bit values
4158*77c1e3ccSAndroid Build Coastguard Worker         resx = _mm256_castsi256_si128(_mm256_packus_epi16(
4159*77c1e3ccSAndroid Build Coastguard Worker             res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))));
4160*77c1e3ccSAndroid Build Coastguard Worker       } else {
4161*77c1e3ccSAndroid Build Coastguard Worker         resx = _mm_setzero_si128();
4162*77c1e3ccSAndroid Build Coastguard Worker       }
4163*77c1e3ccSAndroid Build Coastguard Worker 
4164*77c1e3ccSAndroid Build Coastguard Worker       // y calc
4165*77c1e3ccSAndroid Build Coastguard Worker       if (base_x < min_base_x) {
4166*77c1e3ccSAndroid Build Coastguard Worker         __m256i c256, y_c256, base_y_c256, mask256, mul16;
4167*77c1e3ccSAndroid Build Coastguard Worker         r6 = _mm256_set1_epi16(r << 6);
4168*77c1e3ccSAndroid Build Coastguard Worker         c256 = _mm256_add_epi16(j256, c1234);
4169*77c1e3ccSAndroid Build Coastguard Worker         mul16 = _mm256_min_epu16(_mm256_mullo_epi16(c256, dy256),
4170*77c1e3ccSAndroid Build Coastguard Worker                                  _mm256_srli_epi16(min_base_y256, 1));
4171*77c1e3ccSAndroid Build Coastguard Worker         y_c256 = _mm256_sub_epi16(r6, mul16);
4172*77c1e3ccSAndroid Build Coastguard Worker 
4173*77c1e3ccSAndroid Build Coastguard Worker         base_y_c256 = _mm256_srai_epi16(y_c256, frac_bits_y);
4174*77c1e3ccSAndroid Build Coastguard Worker         mask256 = _mm256_cmpgt_epi16(min_base_y256, base_y_c256);
4175*77c1e3ccSAndroid Build Coastguard Worker 
4176*77c1e3ccSAndroid Build Coastguard Worker         base_y_c256 = _mm256_blendv_epi8(base_y_c256, min_base_y256, mask256);
4177*77c1e3ccSAndroid Build Coastguard Worker         int16_t min_y = (int16_t)_mm_extract_epi16(
4178*77c1e3ccSAndroid Build Coastguard Worker             _mm256_extracti128_si256(base_y_c256, 1), 7);
4179*77c1e3ccSAndroid Build Coastguard Worker         int16_t max_y =
4180*77c1e3ccSAndroid Build Coastguard Worker             (int16_t)_mm_extract_epi16(_mm256_castsi256_si128(base_y_c256), 0);
4181*77c1e3ccSAndroid Build Coastguard Worker         int16_t offset_diff = max_y - min_y;
4182*77c1e3ccSAndroid Build Coastguard Worker 
4183*77c1e3ccSAndroid Build Coastguard Worker         if (offset_diff < 16) {
4184*77c1e3ccSAndroid Build Coastguard Worker           __m256i min_y256 = _mm256_set1_epi16(min_y);
4185*77c1e3ccSAndroid Build Coastguard Worker 
4186*77c1e3ccSAndroid Build Coastguard Worker           __m256i base_y_offset = _mm256_sub_epi16(base_y_c256, min_y256);
4187*77c1e3ccSAndroid Build Coastguard Worker           __m128i base_y_offset128 =
4188*77c1e3ccSAndroid Build Coastguard Worker               _mm_packs_epi16(_mm256_extracti128_si256(base_y_offset, 0),
4189*77c1e3ccSAndroid Build Coastguard Worker                               _mm256_extracti128_si256(base_y_offset, 1));
4190*77c1e3ccSAndroid Build Coastguard Worker 
4191*77c1e3ccSAndroid Build Coastguard Worker           __m128i a0_y128 = _mm_maskload_epi32(
4192*77c1e3ccSAndroid Build Coastguard Worker               (int *)(left + min_y), *(__m128i *)LoadMaskz2[offset_diff / 4]);
4193*77c1e3ccSAndroid Build Coastguard Worker           __m128i a1_y128 =
4194*77c1e3ccSAndroid Build Coastguard Worker               _mm_maskload_epi32((int *)(left + min_y + 1),
4195*77c1e3ccSAndroid Build Coastguard Worker                                  *(__m128i *)LoadMaskz2[offset_diff / 4]);
4196*77c1e3ccSAndroid Build Coastguard Worker           a0_y128 = _mm_shuffle_epi8(a0_y128, base_y_offset128);
4197*77c1e3ccSAndroid Build Coastguard Worker           a1_y128 = _mm_shuffle_epi8(a1_y128, base_y_offset128);
4198*77c1e3ccSAndroid Build Coastguard Worker           a0_y = _mm256_cvtepu8_epi16(a0_y128);
4199*77c1e3ccSAndroid Build Coastguard Worker           a1_y = _mm256_cvtepu8_epi16(a1_y128);
4200*77c1e3ccSAndroid Build Coastguard Worker         } else {
4201*77c1e3ccSAndroid Build Coastguard Worker           base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256);
4202*77c1e3ccSAndroid Build Coastguard Worker           _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
4203*77c1e3ccSAndroid Build Coastguard Worker 
4204*77c1e3ccSAndroid Build Coastguard Worker           a0_y = _mm256_setr_epi16(
4205*77c1e3ccSAndroid Build Coastguard Worker               left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
4206*77c1e3ccSAndroid Build Coastguard Worker               left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
4207*77c1e3ccSAndroid Build Coastguard Worker               left[base_y_c[6]], left[base_y_c[7]], left[base_y_c[8]],
4208*77c1e3ccSAndroid Build Coastguard Worker               left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]],
4209*77c1e3ccSAndroid Build Coastguard Worker               left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]],
4210*77c1e3ccSAndroid Build Coastguard Worker               left[base_y_c[15]]);
4211*77c1e3ccSAndroid Build Coastguard Worker           base_y_c256 = _mm256_add_epi16(base_y_c256, c1);
4212*77c1e3ccSAndroid Build Coastguard Worker           _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
4213*77c1e3ccSAndroid Build Coastguard Worker 
4214*77c1e3ccSAndroid Build Coastguard Worker           a1_y = _mm256_setr_epi16(
4215*77c1e3ccSAndroid Build Coastguard Worker               left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
4216*77c1e3ccSAndroid Build Coastguard Worker               left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
4217*77c1e3ccSAndroid Build Coastguard Worker               left[base_y_c[6]], left[base_y_c[7]], left[base_y_c[8]],
4218*77c1e3ccSAndroid Build Coastguard Worker               left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]],
4219*77c1e3ccSAndroid Build Coastguard Worker               left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]],
4220*77c1e3ccSAndroid Build Coastguard Worker               left[base_y_c[15]]);
4221*77c1e3ccSAndroid Build Coastguard Worker         }
4222*77c1e3ccSAndroid Build Coastguard Worker         shifty = _mm256_srli_epi16(_mm256_and_si256(y_c256, c3f), 1);
4223*77c1e3ccSAndroid Build Coastguard Worker 
4224*77c1e3ccSAndroid Build Coastguard Worker         diff = _mm256_sub_epi16(a1_y, a0_y);  // a[x+1] - a[x]
4225*77c1e3ccSAndroid Build Coastguard Worker         a32 = _mm256_slli_epi16(a0_y, 5);     // a[x] * 32
4226*77c1e3ccSAndroid Build Coastguard Worker         a32 = _mm256_add_epi16(a32, a16);     // a[x] * 32 + 16
4227*77c1e3ccSAndroid Build Coastguard Worker 
4228*77c1e3ccSAndroid Build Coastguard Worker         b = _mm256_mullo_epi16(diff, shifty);
4229*77c1e3ccSAndroid Build Coastguard Worker         res = _mm256_add_epi16(a32, b);
4230*77c1e3ccSAndroid Build Coastguard Worker         res = _mm256_srli_epi16(res, 5);  // 16 16-bit values
4231*77c1e3ccSAndroid Build Coastguard Worker         resy = _mm256_castsi256_si128(_mm256_packus_epi16(
4232*77c1e3ccSAndroid Build Coastguard Worker             res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))));
4233*77c1e3ccSAndroid Build Coastguard Worker       } else {
4234*77c1e3ccSAndroid Build Coastguard Worker         resy = _mm_setzero_si128();
4235*77c1e3ccSAndroid Build Coastguard Worker       }
4236*77c1e3ccSAndroid Build Coastguard Worker       resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)BaseMask[base_min_diff]);
4237*77c1e3ccSAndroid Build Coastguard Worker       _mm_storeu_si128((__m128i *)(dst + j), resxy);
4238*77c1e3ccSAndroid Build Coastguard Worker     }  // for j
4239*77c1e3ccSAndroid Build Coastguard Worker     dst += stride;
4240*77c1e3ccSAndroid Build Coastguard Worker   }
4241*77c1e3ccSAndroid Build Coastguard Worker }
4242*77c1e3ccSAndroid Build Coastguard Worker 
4243*77c1e3ccSAndroid Build Coastguard Worker // Directional prediction, zone 2: 90 < angle < 180
// Public entry point: picks the width-specialised kernel. Widths 4 and 8 have
// dedicated Nx4 / Nx8 implementations; every other width goes through the
// generic HxW kernel. Both dx and dy must be strictly positive.
void av1_dr_prediction_z2_avx2(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
                               const uint8_t *above, const uint8_t *left,
                               int upsample_above, int upsample_left, int dx,
                               int dy) {
  assert(dx > 0);
  assert(dy > 0);

  if (bw == 4) {
    dr_prediction_z2_Nx4_avx2(bh, dst, stride, above, left, upsample_above,
                              upsample_left, dx, dy);
  } else if (bw == 8) {
    dr_prediction_z2_Nx8_avx2(bh, dst, stride, above, left, upsample_above,
                              upsample_left, dx, dy);
  } else {
    dr_prediction_z2_HxW_avx2(bh, bw, dst, stride, above, left,
                              upsample_above, upsample_left, dx, dy);
  }
}
4266*77c1e3ccSAndroid Build Coastguard Worker 
4267*77c1e3ccSAndroid Build Coastguard Worker // z3 functions
transpose16x32_avx2(__m256i * x,__m256i * d)4268*77c1e3ccSAndroid Build Coastguard Worker static inline void transpose16x32_avx2(__m256i *x, __m256i *d) {
4269*77c1e3ccSAndroid Build Coastguard Worker   __m256i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9;
4270*77c1e3ccSAndroid Build Coastguard Worker   __m256i w10, w11, w12, w13, w14, w15;
4271*77c1e3ccSAndroid Build Coastguard Worker 
4272*77c1e3ccSAndroid Build Coastguard Worker   w0 = _mm256_unpacklo_epi8(x[0], x[1]);
4273*77c1e3ccSAndroid Build Coastguard Worker   w1 = _mm256_unpacklo_epi8(x[2], x[3]);
4274*77c1e3ccSAndroid Build Coastguard Worker   w2 = _mm256_unpacklo_epi8(x[4], x[5]);
4275*77c1e3ccSAndroid Build Coastguard Worker   w3 = _mm256_unpacklo_epi8(x[6], x[7]);
4276*77c1e3ccSAndroid Build Coastguard Worker 
4277*77c1e3ccSAndroid Build Coastguard Worker   w8 = _mm256_unpacklo_epi8(x[8], x[9]);
4278*77c1e3ccSAndroid Build Coastguard Worker   w9 = _mm256_unpacklo_epi8(x[10], x[11]);
4279*77c1e3ccSAndroid Build Coastguard Worker   w10 = _mm256_unpacklo_epi8(x[12], x[13]);
4280*77c1e3ccSAndroid Build Coastguard Worker   w11 = _mm256_unpacklo_epi8(x[14], x[15]);
4281*77c1e3ccSAndroid Build Coastguard Worker 
4282*77c1e3ccSAndroid Build Coastguard Worker   w4 = _mm256_unpacklo_epi16(w0, w1);
4283*77c1e3ccSAndroid Build Coastguard Worker   w5 = _mm256_unpacklo_epi16(w2, w3);
4284*77c1e3ccSAndroid Build Coastguard Worker   w12 = _mm256_unpacklo_epi16(w8, w9);
4285*77c1e3ccSAndroid Build Coastguard Worker   w13 = _mm256_unpacklo_epi16(w10, w11);
4286*77c1e3ccSAndroid Build Coastguard Worker 
4287*77c1e3ccSAndroid Build Coastguard Worker   w6 = _mm256_unpacklo_epi32(w4, w5);
4288*77c1e3ccSAndroid Build Coastguard Worker   w7 = _mm256_unpackhi_epi32(w4, w5);
4289*77c1e3ccSAndroid Build Coastguard Worker   w14 = _mm256_unpacklo_epi32(w12, w13);
4290*77c1e3ccSAndroid Build Coastguard Worker   w15 = _mm256_unpackhi_epi32(w12, w13);
4291*77c1e3ccSAndroid Build Coastguard Worker 
4292*77c1e3ccSAndroid Build Coastguard Worker   // Store first 4-line result
4293*77c1e3ccSAndroid Build Coastguard Worker   d[0] = _mm256_unpacklo_epi64(w6, w14);
4294*77c1e3ccSAndroid Build Coastguard Worker   d[1] = _mm256_unpackhi_epi64(w6, w14);
4295*77c1e3ccSAndroid Build Coastguard Worker   d[2] = _mm256_unpacklo_epi64(w7, w15);
4296*77c1e3ccSAndroid Build Coastguard Worker   d[3] = _mm256_unpackhi_epi64(w7, w15);
4297*77c1e3ccSAndroid Build Coastguard Worker 
4298*77c1e3ccSAndroid Build Coastguard Worker   w4 = _mm256_unpackhi_epi16(w0, w1);
4299*77c1e3ccSAndroid Build Coastguard Worker   w5 = _mm256_unpackhi_epi16(w2, w3);
4300*77c1e3ccSAndroid Build Coastguard Worker   w12 = _mm256_unpackhi_epi16(w8, w9);
4301*77c1e3ccSAndroid Build Coastguard Worker   w13 = _mm256_unpackhi_epi16(w10, w11);
4302*77c1e3ccSAndroid Build Coastguard Worker 
4303*77c1e3ccSAndroid Build Coastguard Worker   w6 = _mm256_unpacklo_epi32(w4, w5);
4304*77c1e3ccSAndroid Build Coastguard Worker   w7 = _mm256_unpackhi_epi32(w4, w5);
4305*77c1e3ccSAndroid Build Coastguard Worker   w14 = _mm256_unpacklo_epi32(w12, w13);
4306*77c1e3ccSAndroid Build Coastguard Worker   w15 = _mm256_unpackhi_epi32(w12, w13);
4307*77c1e3ccSAndroid Build Coastguard Worker 
4308*77c1e3ccSAndroid Build Coastguard Worker   // Store second 4-line result
4309*77c1e3ccSAndroid Build Coastguard Worker   d[4] = _mm256_unpacklo_epi64(w6, w14);
4310*77c1e3ccSAndroid Build Coastguard Worker   d[5] = _mm256_unpackhi_epi64(w6, w14);
4311*77c1e3ccSAndroid Build Coastguard Worker   d[6] = _mm256_unpacklo_epi64(w7, w15);
4312*77c1e3ccSAndroid Build Coastguard Worker   d[7] = _mm256_unpackhi_epi64(w7, w15);
4313*77c1e3ccSAndroid Build Coastguard Worker 
4314*77c1e3ccSAndroid Build Coastguard Worker   // upper half
4315*77c1e3ccSAndroid Build Coastguard Worker   w0 = _mm256_unpackhi_epi8(x[0], x[1]);
4316*77c1e3ccSAndroid Build Coastguard Worker   w1 = _mm256_unpackhi_epi8(x[2], x[3]);
4317*77c1e3ccSAndroid Build Coastguard Worker   w2 = _mm256_unpackhi_epi8(x[4], x[5]);
4318*77c1e3ccSAndroid Build Coastguard Worker   w3 = _mm256_unpackhi_epi8(x[6], x[7]);
4319*77c1e3ccSAndroid Build Coastguard Worker 
4320*77c1e3ccSAndroid Build Coastguard Worker   w8 = _mm256_unpackhi_epi8(x[8], x[9]);
4321*77c1e3ccSAndroid Build Coastguard Worker   w9 = _mm256_unpackhi_epi8(x[10], x[11]);
4322*77c1e3ccSAndroid Build Coastguard Worker   w10 = _mm256_unpackhi_epi8(x[12], x[13]);
4323*77c1e3ccSAndroid Build Coastguard Worker   w11 = _mm256_unpackhi_epi8(x[14], x[15]);
4324*77c1e3ccSAndroid Build Coastguard Worker 
4325*77c1e3ccSAndroid Build Coastguard Worker   w4 = _mm256_unpacklo_epi16(w0, w1);
4326*77c1e3ccSAndroid Build Coastguard Worker   w5 = _mm256_unpacklo_epi16(w2, w3);
4327*77c1e3ccSAndroid Build Coastguard Worker   w12 = _mm256_unpacklo_epi16(w8, w9);
4328*77c1e3ccSAndroid Build Coastguard Worker   w13 = _mm256_unpacklo_epi16(w10, w11);
4329*77c1e3ccSAndroid Build Coastguard Worker 
4330*77c1e3ccSAndroid Build Coastguard Worker   w6 = _mm256_unpacklo_epi32(w4, w5);
4331*77c1e3ccSAndroid Build Coastguard Worker   w7 = _mm256_unpackhi_epi32(w4, w5);
4332*77c1e3ccSAndroid Build Coastguard Worker   w14 = _mm256_unpacklo_epi32(w12, w13);
4333*77c1e3ccSAndroid Build Coastguard Worker   w15 = _mm256_unpackhi_epi32(w12, w13);
4334*77c1e3ccSAndroid Build Coastguard Worker 
4335*77c1e3ccSAndroid Build Coastguard Worker   // Store first 4-line result
4336*77c1e3ccSAndroid Build Coastguard Worker   d[8] = _mm256_unpacklo_epi64(w6, w14);
4337*77c1e3ccSAndroid Build Coastguard Worker   d[9] = _mm256_unpackhi_epi64(w6, w14);
4338*77c1e3ccSAndroid Build Coastguard Worker   d[10] = _mm256_unpacklo_epi64(w7, w15);
4339*77c1e3ccSAndroid Build Coastguard Worker   d[11] = _mm256_unpackhi_epi64(w7, w15);
4340*77c1e3ccSAndroid Build Coastguard Worker 
4341*77c1e3ccSAndroid Build Coastguard Worker   w4 = _mm256_unpackhi_epi16(w0, w1);
4342*77c1e3ccSAndroid Build Coastguard Worker   w5 = _mm256_unpackhi_epi16(w2, w3);
4343*77c1e3ccSAndroid Build Coastguard Worker   w12 = _mm256_unpackhi_epi16(w8, w9);
4344*77c1e3ccSAndroid Build Coastguard Worker   w13 = _mm256_unpackhi_epi16(w10, w11);
4345*77c1e3ccSAndroid Build Coastguard Worker 
4346*77c1e3ccSAndroid Build Coastguard Worker   w6 = _mm256_unpacklo_epi32(w4, w5);
4347*77c1e3ccSAndroid Build Coastguard Worker   w7 = _mm256_unpackhi_epi32(w4, w5);
4348*77c1e3ccSAndroid Build Coastguard Worker   w14 = _mm256_unpacklo_epi32(w12, w13);
4349*77c1e3ccSAndroid Build Coastguard Worker   w15 = _mm256_unpackhi_epi32(w12, w13);
4350*77c1e3ccSAndroid Build Coastguard Worker 
4351*77c1e3ccSAndroid Build Coastguard Worker   // Store second 4-line result
4352*77c1e3ccSAndroid Build Coastguard Worker   d[12] = _mm256_unpacklo_epi64(w6, w14);
4353*77c1e3ccSAndroid Build Coastguard Worker   d[13] = _mm256_unpackhi_epi64(w6, w14);
4354*77c1e3ccSAndroid Build Coastguard Worker   d[14] = _mm256_unpacklo_epi64(w7, w15);
4355*77c1e3ccSAndroid Build Coastguard Worker   d[15] = _mm256_unpackhi_epi64(w7, w15);
4356*77c1e3ccSAndroid Build Coastguard Worker }
4357*77c1e3ccSAndroid Build Coastguard Worker 
// Zone-3 predictor for a 4x4 block: runs the z1 kernel on the `left` edge,
// transposes the four result vectors and writes the low 4 bytes of each
// transposed vector to one destination row.
// NOTE(review): the exact layout produced by
// dr_prediction_z1_HxW_internal_avx2 / transpose4x8_8x4_low_sse2 is defined
// elsewhere in the project -- confirm against those helpers.
static void dr_prediction_z3_4x4_avx2(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *left, int upsample_left,
                                      int dy) {
  __m128i dstvec[4], d[4];

  dr_prediction_z1_HxW_internal_avx2(4, 4, dstvec, left, upsample_left, dy);
  transpose4x8_8x4_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3],
                            &d[0], &d[1], &d[2], &d[3]);

  for (int i = 0; i < 4; i++) {
    // dst is a byte pointer with no alignment guarantee, so copy the 32-bit
    // row with memcpy instead of storing through a cast int pointer.
    const int row = _mm_cvtsi128_si32(d[i]);
    memcpy(dst + stride * i, &row, sizeof(row));
  }
}
4373*77c1e3ccSAndroid Build Coastguard Worker 
// Zone-3 predictor for an 8x8 block: runs the z1 kernel on the `left` edge,
// transposes the eight result vectors into four registers, then writes each
// register as two consecutive 8-byte rows (low half first, high half second).
static void dr_prediction_z3_8x8_avx2(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *left, int upsample_left,
                                      int dy) {
  __m128i dstvec[8], d[8];

  dr_prediction_z1_HxW_internal_avx2(8, 8, dstvec, left, upsample_left, dy);
  transpose8x8_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4],
                    &dstvec[5], &dstvec[6], &dstvec[7], &d[0], &d[1], &d[2],
                    &d[3]);

  uint8_t *row = dst;
  for (int pair = 0; pair < 4; ++pair) {
    _mm_storel_epi64((__m128i *)row, d[pair]);
    row += stride;
    _mm_storel_epi64((__m128i *)row, _mm_srli_si128(d[pair], 8));
    row += stride;
  }
}
4393*77c1e3ccSAndroid Build Coastguard Worker 
// Zone-3 predictor for a block 4 pixels wide and 8 rows tall: runs the z1
// kernel on the `left` edge, transposes the four result vectors into eight
// and writes the low 4 bytes of each transposed vector to one destination row.
// NOTE(review): the exact layout produced by
// dr_prediction_z1_HxW_internal_avx2 / transpose4x8_8x4_sse2 is defined
// elsewhere in the project -- confirm against those helpers.
static void dr_prediction_z3_4x8_avx2(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *left, int upsample_left,
                                      int dy) {
  __m128i dstvec[4], d[8];

  dr_prediction_z1_HxW_internal_avx2(8, 4, dstvec, left, upsample_left, dy);
  transpose4x8_8x4_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &d[0],
                        &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], &d[7]);
  for (int i = 0; i < 8; i++) {
    // dst is a byte pointer with no alignment guarantee, so copy the 32-bit
    // row with memcpy instead of storing through a cast int pointer.
    const int row = _mm_cvtsi128_si32(d[i]);
    memcpy(dst + stride * i, &row, sizeof(row));
  }
}
4406*77c1e3ccSAndroid Build Coastguard Worker 
// z3 directional prediction for the 8x4 case (bw == 8, bh == 4).
// Runs the shared z1 HxW kernel on `left`, transposes the eight resulting
// vectors into four, and stores the low 8 bytes of each one as a row of `dst`.
static void dr_prediction_z3_8x4_avx2(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *left, int upsample_left,
                                      int dy) {
  __m128i cols[8], rows[4];

  dr_prediction_z1_HxW_internal_avx2(4, 8, cols, left, upsample_left, dy);
  transpose8x8_low_sse2(cols, cols + 1, cols + 2, cols + 3, cols + 4,
                        cols + 5, cols + 6, cols + 7, rows, rows + 1,
                        rows + 2, rows + 3);

  for (int r = 0; r < 4; ++r) {
    _mm_storel_epi64((__m128i *)(dst + r * stride), rows[r]);
  }
}
4421*77c1e3ccSAndroid Build Coastguard Worker 
// z3 directional prediction for the 8x16 case (bw == 8, bh == 16).
// Runs the shared z1 HxW kernel on `left` and transposes the eight resulting
// vectors. Transposed vector r supplies two output rows: its low 8 bytes go
// to row r and its high 8 bytes go to row r + 8.
static void dr_prediction_z3_8x16_avx2(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *left, int upsample_left,
                                       int dy) {
  __m128i cols[8], rows[8];

  dr_prediction_z1_HxW_internal_avx2(16, 8, cols, left, upsample_left, dy);
  transpose8x16_16x8_sse2(&cols[0], &cols[1], &cols[2], &cols[3], &cols[4],
                          &cols[5], &cols[6], &cols[7], &rows[0], &rows[1],
                          &rows[2], &rows[3], &rows[4], &rows[5], &rows[6],
                          &rows[7]);

  for (int r = 0; r < 8; ++r) {
    _mm_storel_epi64((__m128i *)(dst + r * stride), rows[r]);
    // Shift the upper 8 bytes down so they can be stored with storel.
    const __m128i upper = _mm_srli_si128(rows[r], 8);
    _mm_storel_epi64((__m128i *)(dst + (r + 8) * stride), upper);
  }
}
4437*77c1e3ccSAndroid Build Coastguard Worker 
// z3 directional prediction for the 16x8 case (bw == 16, bh == 8).
// Runs the shared z1 HxW kernel on `left`, transposes the sixteen resulting
// vectors into eight, and stores each of those as one 16-byte row of `dst`.
static void dr_prediction_z3_16x8_avx2(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *left, int upsample_left,
                                       int dy) {
  __m128i cols[16], rows[16];

  dr_prediction_z1_HxW_internal_avx2(8, 16, cols, left, upsample_left, dy);
  transpose16x8_8x16_sse2(cols, cols + 1, cols + 2, cols + 3, cols + 4,
                          cols + 5, cols + 6, cols + 7, cols + 8, cols + 9,
                          cols + 10, cols + 11, cols + 12, cols + 13,
                          cols + 14, cols + 15, rows, rows + 1, rows + 2,
                          rows + 3, rows + 4, rows + 5, rows + 6, rows + 7);

  // Only the first eight entries of `rows` are produced and written out.
  for (int r = 0; r < 8; ++r) {
    uint8_t *const row = dst + r * stride;
    _mm_storeu_si128((__m128i *)row, rows[r]);
  }
}
4454*77c1e3ccSAndroid Build Coastguard Worker 
4455*77c1e3ccSAndroid Build Coastguard Worker #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
// z3 directional prediction for the 4x16 case (bw == 4, bh == 16).
// Runs the shared z1 HxW kernel on `left`, transposes the four resulting
// vectors into sixteen, and writes the low 32 bits of each one to consecutive
// rows of `dst`.
static void dr_prediction_z3_4x16_avx2(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *left, int upsample_left,
                                       int dy) {
  __m128i cols[4], rows[16];

  dr_prediction_z1_HxW_internal_avx2(16, 4, cols, left, upsample_left, dy);
  transpose4x16_sse2(cols, rows);

  // NOTE(review): the store goes through an int lvalue, so it relies on the
  // destination row tolerating a 4-byte integer write.
  for (int r = 0; r < 16; ++r) {
    uint8_t *const row = dst + stride * r;
    *(int *)row = _mm_cvtsi128_si32(rows[r]);
  }
}
4467*77c1e3ccSAndroid Build Coastguard Worker 
// z3 directional prediction for the 16x4 case (bw == 16, bh == 4).
// Runs the shared z1 HxW kernel on `left`, passes the sixteen resulting
// vectors through the 16x8 -> 8x16 transpose, and stores the first four
// transposed vectors as 16-byte rows of `dst`.
static void dr_prediction_z3_16x4_avx2(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *left, int upsample_left,
                                       int dy) {
  __m128i cols[16], rows[8];

  dr_prediction_z1_HxW_internal_avx2(4, 16, cols, left, upsample_left, dy);

  // rows[4..7] are cleared ahead of the transpose; they are never stored.
  const __m128i zero = _mm_setzero_si128();
  rows[4] = zero;
  rows[5] = zero;
  rows[6] = zero;
  rows[7] = zero;

  transpose16x8_8x16_sse2(cols, cols + 1, cols + 2, cols + 3, cols + 4,
                          cols + 5, cols + 6, cols + 7, cols + 8, cols + 9,
                          cols + 10, cols + 11, cols + 12, cols + 13,
                          cols + 14, cols + 15, rows, rows + 1, rows + 2,
                          rows + 3, rows + 4, rows + 5, rows + 6, rows + 7);

  for (int r = 0; r < 4; ++r) {
    uint8_t *const row = dst + r * stride;
    _mm_storeu_si128((__m128i *)row, rows[r]);
  }
}
4487*77c1e3ccSAndroid Build Coastguard Worker 
// z3 directional prediction for the 8x32 case (bw == 8, bh == 32).
// The 32xN z1 kernel is asked for 8 vectors; slots 8..15 are zero-filled so
// the 16-vector transpose can be used. The low 8 bytes of each 128-bit lane
// of the transposed vectors are then stored: lower lanes to rows 0..15, upper
// lanes to rows 16..31.
static void dr_prediction_z3_8x32_avx2(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *left, int upsample_left,
                                       int dy) {
  __m256i cols[16], rows[16];

  dr_prediction_z1_32xN_internal_avx2(8, cols, left, upsample_left, dy);
  const __m256i zero = _mm256_setzero_si256();
  for (int k = 8; k < 16; ++k) {
    cols[k] = zero;
  }
  transpose16x32_avx2(cols, rows);

  // Rows 0..15 come from the lower 128-bit lanes.
  for (int r = 0; r < 16; ++r) {
    const __m128i lane = _mm256_castsi256_si128(rows[r]);
    _mm_storel_epi64((__m128i *)(dst + r * stride), lane);
  }
  // Rows 16..31 come from the upper 128-bit lanes.
  for (int r = 0; r < 16; ++r) {
    const __m128i lane = _mm256_extracti128_si256(rows[r], 1);
    _mm_storel_epi64((__m128i *)(dst + (r + 16) * stride), lane);
  }
}
4508*77c1e3ccSAndroid Build Coastguard Worker 
// z3 directional prediction for the 32x8 case (bw == 32, bh == 8).
// Runs the shared z1 HxW kernel on `left` to get 32 vectors, transposes them
// in two groups of sixteen, and writes each output row as two 16-byte stores
// (first group at byte offset 0, second group at byte offset 16).
static void dr_prediction_z3_32x8_avx2(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *left, int upsample_left,
                                       int dy) {
  __m128i cols[32], rows[16];

  dr_prediction_z1_HxW_internal_avx2(8, 32, cols, left, upsample_left, dy);

  // Group g consumes cols[16g .. 16g + 15] and fills rows[8g .. 8g + 7].
  for (int g = 0; g < 2; ++g) {
    __m128i *const in = cols + 16 * g;
    __m128i *const out = rows + 8 * g;
    transpose16x8_8x16_sse2(in, in + 1, in + 2, in + 3, in + 4, in + 5, in + 6,
                            in + 7, in + 8, in + 9, in + 10, in + 11, in + 12,
                            in + 13, in + 14, in + 15, out, out + 1, out + 2,
                            out + 3, out + 4, out + 5, out + 6, out + 7);
  }

  for (int r = 0; r < 8; ++r) {
    uint8_t *const row = dst + r * stride;
    _mm_storeu_si128((__m128i *)row, rows[r]);
    _mm_storeu_si128((__m128i *)(row + 16), rows[r + 8]);
  }
}
4534*77c1e3ccSAndroid Build Coastguard Worker #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
4535*77c1e3ccSAndroid Build Coastguard Worker 
// z3 directional prediction for the 16x16 case.
// Runs the shared z1 HxW kernel on `left`, transposes the sixteen resulting
// vectors, and stores each transposed vector as one 16-byte row of `dst`.
static void dr_prediction_z3_16x16_avx2(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *left, int upsample_left,
                                        int dy) {
  __m128i cols[16], rows[16];

  dr_prediction_z1_HxW_internal_avx2(16, 16, cols, left, upsample_left, dy);
  transpose16x16_sse2(cols, rows);

  for (int r = 0; r < 16; ++r) {
    uint8_t *const row = dst + r * stride;
    _mm_storeu_si128((__m128i *)row, rows[r]);
  }
}
4548*77c1e3ccSAndroid Build Coastguard Worker 
// z3 directional prediction for the 32x32 case.
// Runs the 32xN z1 kernel on `left` for 32 vectors and transposes them in two
// groups of sixteen. For each output row, the first group supplies bytes
// 0..15 and the second group bytes 16..31; lower 128-bit lanes feed rows
// 0..15 and upper lanes feed rows 16..31.
static void dr_prediction_z3_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *left, int upsample_left,
                                        int dy) {
  __m256i cols[32], rows[32];

  dr_prediction_z1_32xN_internal_avx2(32, cols, left, upsample_left, dy);
  transpose16x32_avx2(cols, rows);
  transpose16x32_avx2(cols + 16, rows + 16);

  // Top half of the block: lower lanes.
  for (int r = 0; r < 16; ++r) {
    uint8_t *const row = dst + r * stride;
    _mm_storeu_si128((__m128i *)row, _mm256_castsi256_si128(rows[r]));
    _mm_storeu_si128((__m128i *)(row + 16),
                     _mm256_castsi256_si128(rows[r + 16]));
  }
  // Bottom half of the block: upper lanes.
  for (int r = 0; r < 16; ++r) {
    uint8_t *const row = dst + (r + 16) * stride;
    _mm_storeu_si128((__m128i *)row, _mm256_extracti128_si256(rows[r], 1));
    _mm_storeu_si128((__m128i *)(row + 16),
                     _mm256_extracti128_si256(rows[r + 16], 1));
  }
}
4570*77c1e3ccSAndroid Build Coastguard Worker 
dr_prediction_z3_64x64_avx2(uint8_t * dst,ptrdiff_t stride,const uint8_t * left,int upsample_left,int dy)4571*77c1e3ccSAndroid Build Coastguard Worker static void dr_prediction_z3_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
4572*77c1e3ccSAndroid Build Coastguard Worker                                         const uint8_t *left, int upsample_left,
4573*77c1e3ccSAndroid Build Coastguard Worker                                         int dy) {
4574*77c1e3ccSAndroid Build Coastguard Worker   DECLARE_ALIGNED(16, uint8_t, dstT[64 * 64]);
4575*77c1e3ccSAndroid Build Coastguard Worker   dr_prediction_z1_64xN_avx2(64, dstT, 64, left, upsample_left, dy);
4576*77c1e3ccSAndroid Build Coastguard Worker   transpose(dstT, 64, dst, stride, 64, 64);
4577*77c1e3ccSAndroid Build Coastguard Worker }
4578*77c1e3ccSAndroid Build Coastguard Worker 
// z3 directional prediction for the 16x32 case (bw == 16, bh == 32).
// Runs the 32xN z1 kernel on `left` for 16 vectors and transposes them.
// Transposed vector r supplies two 16-byte output rows: its lower 128-bit
// lane goes to row r and its upper lane to row r + 16.
static void dr_prediction_z3_16x32_avx2(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *left, int upsample_left,
                                        int dy) {
  __m256i cols[16], rows[16];

  dr_prediction_z1_32xN_internal_avx2(16, cols, left, upsample_left, dy);
  transpose16x32_avx2(cols, rows);

  for (int r = 0; r < 16; ++r) {
    const __m128i lower = _mm256_castsi256_si128(rows[r]);
    _mm_storeu_si128((__m128i *)(dst + r * stride), lower);
    const __m128i upper = _mm256_extracti128_si256(rows[r], 1);
    _mm_storeu_si128((__m128i *)(dst + (r + 16) * stride), upper);
  }
}
4594*77c1e3ccSAndroid Build Coastguard Worker 
// z3 directional prediction for the 32x16 case (bw == 32, bh == 16).
// Runs the shared z1 HxW kernel on `left` to get 32 vectors, then handles
// them as two groups of sixteen: each group is transposed and written as a
// 16-byte-wide strip of `dst`, the second strip starting at byte offset 16.
static void dr_prediction_z3_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *left, int upsample_left,
                                        int dy) {
  __m128i cols[32], rows[16];

  dr_prediction_z1_HxW_internal_avx2(16, 32, cols, left, upsample_left, dy);

  for (int g = 0; g < 2; ++g) {
    const int x = 16 * g;  // First vector of the group and strip byte offset.
    transpose16x16_sse2(cols + x, rows);
    for (int r = 0; r < 16; ++r) {
      _mm_storeu_si128((__m128i *)(dst + r * stride + x), rows[r]);
    }
  }
}
4608*77c1e3ccSAndroid Build Coastguard Worker 
// z3 directional prediction for the 32x64 case (bw == 32, bh == 64).
// The z1 64xN predictor writes into a scratch buffer with a pitch of 64, and
// the generic transpose() helper then copies that buffer into `dst`.
static void dr_prediction_z3_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *left, int upsample_left,
                                        int dy) {
  uint8_t scratch[64 * 32];

  dr_prediction_z1_64xN_avx2(32, scratch, 64, left, upsample_left, dy);
  transpose(scratch, 64, dst, stride, 32, 64);
}
4616*77c1e3ccSAndroid Build Coastguard Worker 
// z3 directional prediction for the 64x32 case (bw == 64, bh == 32).
// The z1 32xN predictor writes into a scratch buffer with a pitch of 32, and
// the generic transpose() helper then copies that buffer into `dst`.
static void dr_prediction_z3_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *left, int upsample_left,
                                        int dy) {
  uint8_t scratch[32 * 64];

  dr_prediction_z1_32xN_avx2(64, scratch, 32, left, upsample_left, dy);
  transpose(scratch, 32, dst, stride, 64, 32);
}
4625*77c1e3ccSAndroid Build Coastguard Worker 
4626*77c1e3ccSAndroid Build Coastguard Worker #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
// z3 directional prediction for the 16x64 case (bw == 16, bh == 64).
// The z1 64xN predictor writes into a scratch buffer with a pitch of 64, and
// the generic transpose() helper then copies that buffer into `dst`.
static void dr_prediction_z3_16x64_avx2(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *left, int upsample_left,
                                        int dy) {
  uint8_t scratch[64 * 16];

  dr_prediction_z1_64xN_avx2(16, scratch, 64, left, upsample_left, dy);
  transpose(scratch, 64, dst, stride, 16, 64);
}
4634*77c1e3ccSAndroid Build Coastguard Worker 
// z3 directional prediction for the 64x16 case (bw == 64, bh == 16).
// Runs the shared z1 HxW kernel on `left` to get 64 vectors, then handles
// them as four groups of sixteen: each group is transposed and written as a
// 16-byte-wide strip of `dst`, strips being 16 bytes apart.
static void dr_prediction_z3_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *left, int upsample_left,
                                        int dy) {
  __m128i cols[64], rows[16];

  dr_prediction_z1_HxW_internal_avx2(16, 64, cols, left, upsample_left, dy);

  for (int g = 0; g < 4; ++g) {
    const int x = 16 * g;  // First vector of the group and strip byte offset.
    transpose16x16_sse2(cols + x, rows);
    for (int r = 0; r < 16; ++r) {
      _mm_storeu_si128((__m128i *)(dst + r * stride + x), rows[r]);
    }
  }
}
4648*77c1e3ccSAndroid Build Coastguard Worker #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
4649*77c1e3ccSAndroid Build Coastguard Worker 
// AVX2 entry point for z3 directional intra prediction.
//
// Picks the size-specialised kernel from the block dimensions:
//   - bw == bh:            square kernels, selected by bw;
//   - bw < bh, bh == 2*bw: 1:2 kernels, selected by bw;
//   - bw < bh, otherwise:  1:4 kernels, selected by bw;
//   - bw > bh, bw == 2*bh: 2:1 kernels, selected by bh;
//   - bw > bh, otherwise:  4:1 kernels, selected by bh.
// The 1:4 and 4:1 kernels exist only when
// !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER. A dimension that matches no
// kernel leaves `dst` untouched.
//
// `above` and `dx` are not used by the prediction itself; `dx` is only
// checked against 1 and `dy` against > 0 by assert().
void av1_dr_prediction_z3_avx2(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
                               const uint8_t *above, const uint8_t *left,
                               int upsample_left, int dx, int dy) {
  (void)above;
  (void)dx;
  assert(dx == 1);
  assert(dy > 0);

  // Square blocks.
  if (bw == bh) {
    if (bw == 4) {
      dr_prediction_z3_4x4_avx2(dst, stride, left, upsample_left, dy);
    } else if (bw == 8) {
      dr_prediction_z3_8x8_avx2(dst, stride, left, upsample_left, dy);
    } else if (bw == 16) {
      dr_prediction_z3_16x16_avx2(dst, stride, left, upsample_left, dy);
    } else if (bw == 32) {
      dr_prediction_z3_32x32_avx2(dst, stride, left, upsample_left, dy);
    } else if (bw == 64) {
      dr_prediction_z3_64x64_avx2(dst, stride, left, upsample_left, dy);
    }
    return;
  }

  // Tall blocks: the kernel is chosen by width.
  if (bw < bh) {
    if (bh == bw + bw) {
      if (bw == 4) {
        dr_prediction_z3_4x8_avx2(dst, stride, left, upsample_left, dy);
      } else if (bw == 8) {
        dr_prediction_z3_8x16_avx2(dst, stride, left, upsample_left, dy);
      } else if (bw == 16) {
        dr_prediction_z3_16x32_avx2(dst, stride, left, upsample_left, dy);
      } else if (bw == 32) {
        dr_prediction_z3_32x64_avx2(dst, stride, left, upsample_left, dy);
      }
    } else {
#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
      if (bw == 4) {
        dr_prediction_z3_4x16_avx2(dst, stride, left, upsample_left, dy);
      } else if (bw == 8) {
        dr_prediction_z3_8x32_avx2(dst, stride, left, upsample_left, dy);
      } else if (bw == 16) {
        dr_prediction_z3_16x64_avx2(dst, stride, left, upsample_left, dy);
      }
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
    }
    return;
  }

  // Wide blocks: the kernel is chosen by height.
  if (bw == bh + bh) {
    if (bh == 4) {
      dr_prediction_z3_8x4_avx2(dst, stride, left, upsample_left, dy);
    } else if (bh == 8) {
      dr_prediction_z3_16x8_avx2(dst, stride, left, upsample_left, dy);
    } else if (bh == 16) {
      dr_prediction_z3_32x16_avx2(dst, stride, left, upsample_left, dy);
    } else if (bh == 32) {
      dr_prediction_z3_64x32_avx2(dst, stride, left, upsample_left, dy);
    }
  } else {
#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
    if (bh == 4) {
      dr_prediction_z3_16x4_avx2(dst, stride, left, upsample_left, dy);
    } else if (bh == 8) {
      dr_prediction_z3_32x8_avx2(dst, stride, left, upsample_left, dy);
    } else if (bh == 16) {
      dr_prediction_z3_64x16_avx2(dst, stride, left, upsample_left, dy);
    }
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
  }
}
4742