1*77c1e3ccSAndroid Build Coastguard Worker /*
2*77c1e3ccSAndroid Build Coastguard Worker * Copyright (c) 2021, Alliance for Open Media. All rights reserved.
3*77c1e3ccSAndroid Build Coastguard Worker *
4*77c1e3ccSAndroid Build Coastguard Worker * This source code is subject to the terms of the BSD 2 Clause License and
5*77c1e3ccSAndroid Build Coastguard Worker * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6*77c1e3ccSAndroid Build Coastguard Worker * was not distributed with this source code in the LICENSE file, you can
7*77c1e3ccSAndroid Build Coastguard Worker * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8*77c1e3ccSAndroid Build Coastguard Worker * Media Patent License 1.0 was not distributed with this source code in the
9*77c1e3ccSAndroid Build Coastguard Worker * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10*77c1e3ccSAndroid Build Coastguard Worker */
11*77c1e3ccSAndroid Build Coastguard Worker
#include <emmintrin.h>  // SSE2
#include <smmintrin.h> /* SSE4.1 */
#include <string.h>
14*77c1e3ccSAndroid Build Coastguard Worker
15*77c1e3ccSAndroid Build Coastguard Worker #include "config/av1_rtcd.h"
16*77c1e3ccSAndroid Build Coastguard Worker #include "aom_dsp/x86/intrapred_x86.h"
17*77c1e3ccSAndroid Build Coastguard Worker #include "aom_dsp/x86/intrapred_utils.h"
18*77c1e3ccSAndroid Build Coastguard Worker #include "aom_dsp/x86/lpf_common_sse2.h"
19*77c1e3ccSAndroid Build Coastguard Worker
20*77c1e3ccSAndroid Build Coastguard Worker // Low bit depth functions
21*77c1e3ccSAndroid Build Coastguard Worker static DECLARE_ALIGNED(16, uint8_t, Mask[2][33][16]) = {
22*77c1e3ccSAndroid Build Coastguard Worker { { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
23*77c1e3ccSAndroid Build Coastguard Worker { 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
24*77c1e3ccSAndroid Build Coastguard Worker { 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
25*77c1e3ccSAndroid Build Coastguard Worker { 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
26*77c1e3ccSAndroid Build Coastguard Worker { 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
27*77c1e3ccSAndroid Build Coastguard Worker { 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
28*77c1e3ccSAndroid Build Coastguard Worker { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
29*77c1e3ccSAndroid Build Coastguard Worker { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
30*77c1e3ccSAndroid Build Coastguard Worker { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0 },
31*77c1e3ccSAndroid Build Coastguard Worker { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0,
32*77c1e3ccSAndroid Build Coastguard Worker 0 },
33*77c1e3ccSAndroid Build Coastguard Worker { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0,
34*77c1e3ccSAndroid Build Coastguard Worker 0 },
35*77c1e3ccSAndroid Build Coastguard Worker { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0,
36*77c1e3ccSAndroid Build Coastguard Worker 0, 0 },
37*77c1e3ccSAndroid Build Coastguard Worker { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,
38*77c1e3ccSAndroid Build Coastguard Worker 0, 0, 0 },
39*77c1e3ccSAndroid Build Coastguard Worker { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
40*77c1e3ccSAndroid Build Coastguard Worker 0xff, 0, 0, 0 },
41*77c1e3ccSAndroid Build Coastguard Worker { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
42*77c1e3ccSAndroid Build Coastguard Worker 0xff, 0xff, 0, 0 },
43*77c1e3ccSAndroid Build Coastguard Worker { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
44*77c1e3ccSAndroid Build Coastguard Worker 0xff, 0xff, 0xff, 0 },
45*77c1e3ccSAndroid Build Coastguard Worker { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
46*77c1e3ccSAndroid Build Coastguard Worker 0xff, 0xff, 0xff, 0xff },
47*77c1e3ccSAndroid Build Coastguard Worker { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
48*77c1e3ccSAndroid Build Coastguard Worker 0xff, 0xff, 0xff, 0xff },
49*77c1e3ccSAndroid Build Coastguard Worker { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
50*77c1e3ccSAndroid Build Coastguard Worker 0xff, 0xff, 0xff, 0xff },
51*77c1e3ccSAndroid Build Coastguard Worker { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
52*77c1e3ccSAndroid Build Coastguard Worker 0xff, 0xff, 0xff, 0xff },
53*77c1e3ccSAndroid Build Coastguard Worker { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
54*77c1e3ccSAndroid Build Coastguard Worker 0xff, 0xff, 0xff, 0xff },
55*77c1e3ccSAndroid Build Coastguard Worker { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
56*77c1e3ccSAndroid Build Coastguard Worker 0xff, 0xff, 0xff, 0xff },
57*77c1e3ccSAndroid Build Coastguard Worker { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
58*77c1e3ccSAndroid Build Coastguard Worker 0xff, 0xff, 0xff, 0xff },
59*77c1e3ccSAndroid Build Coastguard Worker { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
60*77c1e3ccSAndroid Build Coastguard Worker 0xff, 0xff, 0xff, 0xff },
61*77c1e3ccSAndroid Build Coastguard Worker { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
62*77c1e3ccSAndroid Build Coastguard Worker 0xff, 0xff, 0xff, 0xff },
63*77c1e3ccSAndroid Build Coastguard Worker { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
64*77c1e3ccSAndroid Build Coastguard Worker 0xff, 0xff, 0xff, 0xff },
65*77c1e3ccSAndroid Build Coastguard Worker { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
66*77c1e3ccSAndroid Build Coastguard Worker 0xff, 0xff, 0xff, 0xff },
67*77c1e3ccSAndroid Build Coastguard Worker { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
68*77c1e3ccSAndroid Build Coastguard Worker 0xff, 0xff, 0xff, 0xff },
69*77c1e3ccSAndroid Build Coastguard Worker { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
70*77c1e3ccSAndroid Build Coastguard Worker 0xff, 0xff, 0xff, 0xff },
71*77c1e3ccSAndroid Build Coastguard Worker { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
72*77c1e3ccSAndroid Build Coastguard Worker 0xff, 0xff, 0xff, 0xff },
73*77c1e3ccSAndroid Build Coastguard Worker { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
74*77c1e3ccSAndroid Build Coastguard Worker 0xff, 0xff, 0xff, 0xff },
75*77c1e3ccSAndroid Build Coastguard Worker { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
76*77c1e3ccSAndroid Build Coastguard Worker 0xff, 0xff, 0xff, 0xff },
77*77c1e3ccSAndroid Build Coastguard Worker { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
78*77c1e3ccSAndroid Build Coastguard Worker 0xff, 0xff, 0xff, 0xff } },
79*77c1e3ccSAndroid Build Coastguard Worker {
80*77c1e3ccSAndroid Build Coastguard Worker { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
81*77c1e3ccSAndroid Build Coastguard Worker { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
82*77c1e3ccSAndroid Build Coastguard Worker { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
83*77c1e3ccSAndroid Build Coastguard Worker { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
84*77c1e3ccSAndroid Build Coastguard Worker { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
85*77c1e3ccSAndroid Build Coastguard Worker { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
86*77c1e3ccSAndroid Build Coastguard Worker { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
87*77c1e3ccSAndroid Build Coastguard Worker { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
88*77c1e3ccSAndroid Build Coastguard Worker { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
89*77c1e3ccSAndroid Build Coastguard Worker { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
90*77c1e3ccSAndroid Build Coastguard Worker { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
91*77c1e3ccSAndroid Build Coastguard Worker { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
92*77c1e3ccSAndroid Build Coastguard Worker { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
93*77c1e3ccSAndroid Build Coastguard Worker { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
94*77c1e3ccSAndroid Build Coastguard Worker { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
95*77c1e3ccSAndroid Build Coastguard Worker { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
96*77c1e3ccSAndroid Build Coastguard Worker { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
97*77c1e3ccSAndroid Build Coastguard Worker { 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
98*77c1e3ccSAndroid Build Coastguard Worker { 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
99*77c1e3ccSAndroid Build Coastguard Worker { 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
100*77c1e3ccSAndroid Build Coastguard Worker { 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
101*77c1e3ccSAndroid Build Coastguard Worker { 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
102*77c1e3ccSAndroid Build Coastguard Worker { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
103*77c1e3ccSAndroid Build Coastguard Worker { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
104*77c1e3ccSAndroid Build Coastguard Worker { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0,
105*77c1e3ccSAndroid Build Coastguard Worker 0 },
106*77c1e3ccSAndroid Build Coastguard Worker { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0,
107*77c1e3ccSAndroid Build Coastguard Worker 0 },
108*77c1e3ccSAndroid Build Coastguard Worker { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0,
109*77c1e3ccSAndroid Build Coastguard Worker 0, 0 },
110*77c1e3ccSAndroid Build Coastguard Worker { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0,
111*77c1e3ccSAndroid Build Coastguard Worker 0, 0, 0 },
112*77c1e3ccSAndroid Build Coastguard Worker { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
113*77c1e3ccSAndroid Build Coastguard Worker 0, 0, 0, 0 },
114*77c1e3ccSAndroid Build Coastguard Worker { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
115*77c1e3ccSAndroid Build Coastguard Worker 0xff, 0, 0, 0 },
116*77c1e3ccSAndroid Build Coastguard Worker { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
117*77c1e3ccSAndroid Build Coastguard Worker 0xff, 0xff, 0, 0 },
118*77c1e3ccSAndroid Build Coastguard Worker { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
119*77c1e3ccSAndroid Build Coastguard Worker 0xff, 0xff, 0xff, 0 },
120*77c1e3ccSAndroid Build Coastguard Worker { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
121*77c1e3ccSAndroid Build Coastguard Worker 0xff, 0xff, 0xff, 0xff },
122*77c1e3ccSAndroid Build Coastguard Worker },
123*77c1e3ccSAndroid Build Coastguard Worker };
124*77c1e3ccSAndroid Build Coastguard Worker
125*77c1e3ccSAndroid Build Coastguard Worker /* clang-format on */
dr_prediction_z1_HxW_internal_sse4_1(int H,int W,__m128i * dst,const uint8_t * above,int upsample_above,int dx)126*77c1e3ccSAndroid Build Coastguard Worker static AOM_FORCE_INLINE void dr_prediction_z1_HxW_internal_sse4_1(
127*77c1e3ccSAndroid Build Coastguard Worker int H, int W, __m128i *dst, const uint8_t *above, int upsample_above,
128*77c1e3ccSAndroid Build Coastguard Worker int dx) {
129*77c1e3ccSAndroid Build Coastguard Worker const int frac_bits = 6 - upsample_above;
130*77c1e3ccSAndroid Build Coastguard Worker const int max_base_x = ((W + H) - 1) << upsample_above;
131*77c1e3ccSAndroid Build Coastguard Worker
132*77c1e3ccSAndroid Build Coastguard Worker assert(dx > 0);
133*77c1e3ccSAndroid Build Coastguard Worker // pre-filter above pixels
134*77c1e3ccSAndroid Build Coastguard Worker // store in temp buffers:
135*77c1e3ccSAndroid Build Coastguard Worker // above[x] * 32 + 16
136*77c1e3ccSAndroid Build Coastguard Worker // above[x+1] - above[x]
137*77c1e3ccSAndroid Build Coastguard Worker // final pixels will be calculated as:
138*77c1e3ccSAndroid Build Coastguard Worker // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
139*77c1e3ccSAndroid Build Coastguard Worker __m128i a0, a1, a32, a16;
140*77c1e3ccSAndroid Build Coastguard Worker __m128i diff, c3f;
141*77c1e3ccSAndroid Build Coastguard Worker __m128i a_mbase_x;
142*77c1e3ccSAndroid Build Coastguard Worker
143*77c1e3ccSAndroid Build Coastguard Worker a16 = _mm_set1_epi16(16);
144*77c1e3ccSAndroid Build Coastguard Worker a_mbase_x = _mm_set1_epi8((char)above[max_base_x]);
145*77c1e3ccSAndroid Build Coastguard Worker c3f = _mm_set1_epi16(0x3f);
146*77c1e3ccSAndroid Build Coastguard Worker
147*77c1e3ccSAndroid Build Coastguard Worker int x = dx;
148*77c1e3ccSAndroid Build Coastguard Worker for (int r = 0; r < W; r++) {
149*77c1e3ccSAndroid Build Coastguard Worker __m128i b, res, res1, shift;
150*77c1e3ccSAndroid Build Coastguard Worker __m128i a0_above, a1_above;
151*77c1e3ccSAndroid Build Coastguard Worker
152*77c1e3ccSAndroid Build Coastguard Worker int base = x >> frac_bits;
153*77c1e3ccSAndroid Build Coastguard Worker int base_max_diff = (max_base_x - base) >> upsample_above;
154*77c1e3ccSAndroid Build Coastguard Worker if (base_max_diff <= 0) {
155*77c1e3ccSAndroid Build Coastguard Worker for (int i = r; i < W; ++i) {
156*77c1e3ccSAndroid Build Coastguard Worker dst[i] = a_mbase_x; // save 4 values
157*77c1e3ccSAndroid Build Coastguard Worker }
158*77c1e3ccSAndroid Build Coastguard Worker return;
159*77c1e3ccSAndroid Build Coastguard Worker }
160*77c1e3ccSAndroid Build Coastguard Worker if (base_max_diff > H) base_max_diff = H;
161*77c1e3ccSAndroid Build Coastguard Worker a0_above = _mm_loadu_si128((__m128i *)(above + base));
162*77c1e3ccSAndroid Build Coastguard Worker a1_above = _mm_loadu_si128((__m128i *)(above + base + 1));
163*77c1e3ccSAndroid Build Coastguard Worker
164*77c1e3ccSAndroid Build Coastguard Worker if (upsample_above) {
165*77c1e3ccSAndroid Build Coastguard Worker a0_above = _mm_shuffle_epi8(a0_above, *(__m128i *)EvenOddMaskx[0]);
166*77c1e3ccSAndroid Build Coastguard Worker a1_above = _mm_srli_si128(a0_above, 8);
167*77c1e3ccSAndroid Build Coastguard Worker
168*77c1e3ccSAndroid Build Coastguard Worker shift = _mm_srli_epi16(
169*77c1e3ccSAndroid Build Coastguard Worker _mm_and_si128(_mm_slli_epi16(_mm_set1_epi16(x), upsample_above), c3f),
170*77c1e3ccSAndroid Build Coastguard Worker 1);
171*77c1e3ccSAndroid Build Coastguard Worker } else {
172*77c1e3ccSAndroid Build Coastguard Worker shift = _mm_srli_epi16(_mm_and_si128(_mm_set1_epi16(x), c3f), 1);
173*77c1e3ccSAndroid Build Coastguard Worker }
174*77c1e3ccSAndroid Build Coastguard Worker // lower half
175*77c1e3ccSAndroid Build Coastguard Worker a0 = _mm_cvtepu8_epi16(a0_above);
176*77c1e3ccSAndroid Build Coastguard Worker a1 = _mm_cvtepu8_epi16(a1_above);
177*77c1e3ccSAndroid Build Coastguard Worker
178*77c1e3ccSAndroid Build Coastguard Worker diff = _mm_sub_epi16(a1, a0); // a[x+1] - a[x]
179*77c1e3ccSAndroid Build Coastguard Worker a32 = _mm_slli_epi16(a0, 5); // a[x] * 32
180*77c1e3ccSAndroid Build Coastguard Worker a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16
181*77c1e3ccSAndroid Build Coastguard Worker
182*77c1e3ccSAndroid Build Coastguard Worker b = _mm_mullo_epi16(diff, shift);
183*77c1e3ccSAndroid Build Coastguard Worker res = _mm_add_epi16(a32, b);
184*77c1e3ccSAndroid Build Coastguard Worker res = _mm_srli_epi16(res, 5);
185*77c1e3ccSAndroid Build Coastguard Worker
186*77c1e3ccSAndroid Build Coastguard Worker // uppar half
187*77c1e3ccSAndroid Build Coastguard Worker a0 = _mm_cvtepu8_epi16(_mm_srli_si128(a0_above, 8));
188*77c1e3ccSAndroid Build Coastguard Worker a1 = _mm_cvtepu8_epi16(_mm_srli_si128(a1_above, 8));
189*77c1e3ccSAndroid Build Coastguard Worker
190*77c1e3ccSAndroid Build Coastguard Worker diff = _mm_sub_epi16(a1, a0); // a[x+1] - a[x]
191*77c1e3ccSAndroid Build Coastguard Worker a32 = _mm_slli_epi16(a0, 5); // a[x] * 32
192*77c1e3ccSAndroid Build Coastguard Worker a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16
193*77c1e3ccSAndroid Build Coastguard Worker
194*77c1e3ccSAndroid Build Coastguard Worker b = _mm_mullo_epi16(diff, shift);
195*77c1e3ccSAndroid Build Coastguard Worker res1 = _mm_add_epi16(a32, b);
196*77c1e3ccSAndroid Build Coastguard Worker res1 = _mm_srli_epi16(res1, 5);
197*77c1e3ccSAndroid Build Coastguard Worker
198*77c1e3ccSAndroid Build Coastguard Worker res = _mm_packus_epi16(res, res1);
199*77c1e3ccSAndroid Build Coastguard Worker
200*77c1e3ccSAndroid Build Coastguard Worker dst[r] =
201*77c1e3ccSAndroid Build Coastguard Worker _mm_blendv_epi8(a_mbase_x, res, *(__m128i *)Mask[0][base_max_diff]);
202*77c1e3ccSAndroid Build Coastguard Worker x += dx;
203*77c1e3ccSAndroid Build Coastguard Worker }
204*77c1e3ccSAndroid Build Coastguard Worker }
205*77c1e3ccSAndroid Build Coastguard Worker
// Zone-1 directional prediction for a block 4 pixels wide and N rows tall.
//
//   N               number of rows; must not exceed 16, the capacity of the
//                   local result buffer
//   dst, stride     destination block and its row pitch in bytes
//   above           reference samples, forwarded to the shared HxW kernel
//   upsample_above  forwarded to the shared HxW kernel
//   dx              fixed-point step per row, forwarded to the shared kernel
//
// The low 4 bytes of each result vector are written to the matching row.
static void dr_prediction_z1_4xN_sse4_1(int N, uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        int upsample_above, int dx) {
  __m128i dstvec[16];

  dr_prediction_z1_HxW_internal_sse4_1(4, N, dstvec, above, upsample_above, dx);
  for (int i = 0; i < N; i++) {
    // Copy the 4 pixels with memcpy instead of storing through an int
    // pointer: dst is a byte pointer with no guaranteed 4-byte alignment, so
    // a direct int store would be undefined behaviour on a misaligned row.
    const int row_pixels = _mm_cvtsi128_si32(dstvec[i]);
    memcpy(dst + stride * i, &row_pixels, sizeof(row_pixels));
  }
}
216*77c1e3ccSAndroid Build Coastguard Worker
// Zone-1 directional prediction for a block 8 pixels wide and N rows tall.
// Runs the shared HxW kernel with a row length of 8, then writes the low
// 8 bytes of each result vector to the corresponding destination row.
// N must not exceed 32, the capacity of the local result buffer.
static void dr_prediction_z1_8xN_sse4_1(int N, uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        int upsample_above, int dx) {
  __m128i rows[32];

  dr_prediction_z1_HxW_internal_sse4_1(8, N, rows, above, upsample_above, dx);
  for (int row = 0; row < N; ++row) {
    uint8_t *const out = dst + stride * row;
    _mm_storel_epi64((__m128i *)out, rows[row]);
  }
}
227*77c1e3ccSAndroid Build Coastguard Worker
// Zone-1 directional prediction for a block 16 pixels wide and N rows tall.
// Runs the shared HxW kernel with a row length of 16, then stores each full
// 16-byte result vector (unaligned store) to the corresponding destination
// row. N must not exceed 64, the capacity of the local result buffer.
static void dr_prediction_z1_16xN_sse4_1(int N, uint8_t *dst, ptrdiff_t stride,
                                         const uint8_t *above,
                                         int upsample_above, int dx) {
  __m128i rows[64];

  dr_prediction_z1_HxW_internal_sse4_1(16, N, rows, above, upsample_above, dx);
  for (int row = 0; row < N; ++row) {
    uint8_t *const out = dst + stride * row;
    _mm_storeu_si128((__m128i *)out, rows[row]);
  }
}
239*77c1e3ccSAndroid Build Coastguard Worker
dr_prediction_z1_32xN_internal_sse4_1(int N,__m128i * dstvec,__m128i * dstvec_h,const uint8_t * above,int upsample_above,int dx)240*77c1e3ccSAndroid Build Coastguard Worker static AOM_FORCE_INLINE void dr_prediction_z1_32xN_internal_sse4_1(
241*77c1e3ccSAndroid Build Coastguard Worker int N, __m128i *dstvec, __m128i *dstvec_h, const uint8_t *above,
242*77c1e3ccSAndroid Build Coastguard Worker int upsample_above, int dx) {
243*77c1e3ccSAndroid Build Coastguard Worker // here upsample_above is 0 by design of av1_use_intra_edge_upsample
244*77c1e3ccSAndroid Build Coastguard Worker (void)upsample_above;
245*77c1e3ccSAndroid Build Coastguard Worker const int frac_bits = 6;
246*77c1e3ccSAndroid Build Coastguard Worker const int max_base_x = ((32 + N) - 1);
247*77c1e3ccSAndroid Build Coastguard Worker
248*77c1e3ccSAndroid Build Coastguard Worker // pre-filter above pixels
249*77c1e3ccSAndroid Build Coastguard Worker // store in temp buffers:
250*77c1e3ccSAndroid Build Coastguard Worker // above[x] * 32 + 16
251*77c1e3ccSAndroid Build Coastguard Worker // above[x+1] - above[x]
252*77c1e3ccSAndroid Build Coastguard Worker // final pixels will be calculated as:
253*77c1e3ccSAndroid Build Coastguard Worker // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
254*77c1e3ccSAndroid Build Coastguard Worker __m128i a0, a1, a32, a16;
255*77c1e3ccSAndroid Build Coastguard Worker __m128i a_mbase_x, diff, c3f;
256*77c1e3ccSAndroid Build Coastguard Worker
257*77c1e3ccSAndroid Build Coastguard Worker a16 = _mm_set1_epi16(16);
258*77c1e3ccSAndroid Build Coastguard Worker a_mbase_x = _mm_set1_epi8((char)above[max_base_x]);
259*77c1e3ccSAndroid Build Coastguard Worker c3f = _mm_set1_epi16(0x3f);
260*77c1e3ccSAndroid Build Coastguard Worker
261*77c1e3ccSAndroid Build Coastguard Worker int x = dx;
262*77c1e3ccSAndroid Build Coastguard Worker for (int r = 0; r < N; r++) {
263*77c1e3ccSAndroid Build Coastguard Worker __m128i b, res, res1, res16[2];
264*77c1e3ccSAndroid Build Coastguard Worker __m128i a0_above, a1_above;
265*77c1e3ccSAndroid Build Coastguard Worker
266*77c1e3ccSAndroid Build Coastguard Worker int base = x >> frac_bits;
267*77c1e3ccSAndroid Build Coastguard Worker int base_max_diff = (max_base_x - base);
268*77c1e3ccSAndroid Build Coastguard Worker if (base_max_diff <= 0) {
269*77c1e3ccSAndroid Build Coastguard Worker for (int i = r; i < N; ++i) {
270*77c1e3ccSAndroid Build Coastguard Worker dstvec[i] = a_mbase_x; // save 32 values
271*77c1e3ccSAndroid Build Coastguard Worker dstvec_h[i] = a_mbase_x;
272*77c1e3ccSAndroid Build Coastguard Worker }
273*77c1e3ccSAndroid Build Coastguard Worker return;
274*77c1e3ccSAndroid Build Coastguard Worker }
275*77c1e3ccSAndroid Build Coastguard Worker if (base_max_diff > 32) base_max_diff = 32;
276*77c1e3ccSAndroid Build Coastguard Worker __m128i shift = _mm_srli_epi16(_mm_and_si128(_mm_set1_epi16(x), c3f), 1);
277*77c1e3ccSAndroid Build Coastguard Worker
278*77c1e3ccSAndroid Build Coastguard Worker for (int j = 0, jj = 0; j < 32; j += 16, jj++) {
279*77c1e3ccSAndroid Build Coastguard Worker int mdiff = base_max_diff - j;
280*77c1e3ccSAndroid Build Coastguard Worker if (mdiff <= 0) {
281*77c1e3ccSAndroid Build Coastguard Worker res16[jj] = a_mbase_x;
282*77c1e3ccSAndroid Build Coastguard Worker } else {
283*77c1e3ccSAndroid Build Coastguard Worker a0_above = _mm_loadu_si128((__m128i *)(above + base + j));
284*77c1e3ccSAndroid Build Coastguard Worker a1_above = _mm_loadu_si128((__m128i *)(above + base + j + 1));
285*77c1e3ccSAndroid Build Coastguard Worker
286*77c1e3ccSAndroid Build Coastguard Worker // lower half
287*77c1e3ccSAndroid Build Coastguard Worker a0 = _mm_cvtepu8_epi16(a0_above);
288*77c1e3ccSAndroid Build Coastguard Worker a1 = _mm_cvtepu8_epi16(a1_above);
289*77c1e3ccSAndroid Build Coastguard Worker
290*77c1e3ccSAndroid Build Coastguard Worker diff = _mm_sub_epi16(a1, a0); // a[x+1] - a[x]
291*77c1e3ccSAndroid Build Coastguard Worker a32 = _mm_slli_epi16(a0, 5); // a[x] * 32
292*77c1e3ccSAndroid Build Coastguard Worker a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16
293*77c1e3ccSAndroid Build Coastguard Worker b = _mm_mullo_epi16(diff, shift);
294*77c1e3ccSAndroid Build Coastguard Worker
295*77c1e3ccSAndroid Build Coastguard Worker res = _mm_add_epi16(a32, b);
296*77c1e3ccSAndroid Build Coastguard Worker res = _mm_srli_epi16(res, 5);
297*77c1e3ccSAndroid Build Coastguard Worker
298*77c1e3ccSAndroid Build Coastguard Worker // uppar half
299*77c1e3ccSAndroid Build Coastguard Worker a0 = _mm_cvtepu8_epi16(_mm_srli_si128(a0_above, 8));
300*77c1e3ccSAndroid Build Coastguard Worker a1 = _mm_cvtepu8_epi16(_mm_srli_si128(a1_above, 8));
301*77c1e3ccSAndroid Build Coastguard Worker
302*77c1e3ccSAndroid Build Coastguard Worker diff = _mm_sub_epi16(a1, a0); // a[x+1] - a[x]
303*77c1e3ccSAndroid Build Coastguard Worker a32 = _mm_slli_epi16(a0, 5); // a[x] * 32
304*77c1e3ccSAndroid Build Coastguard Worker a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16
305*77c1e3ccSAndroid Build Coastguard Worker
306*77c1e3ccSAndroid Build Coastguard Worker b = _mm_mullo_epi16(diff, shift);
307*77c1e3ccSAndroid Build Coastguard Worker res1 = _mm_add_epi16(a32, b);
308*77c1e3ccSAndroid Build Coastguard Worker res1 = _mm_srli_epi16(res1, 5);
309*77c1e3ccSAndroid Build Coastguard Worker
310*77c1e3ccSAndroid Build Coastguard Worker res16[jj] = _mm_packus_epi16(res, res1); // 16 8bit values
311*77c1e3ccSAndroid Build Coastguard Worker }
312*77c1e3ccSAndroid Build Coastguard Worker }
313*77c1e3ccSAndroid Build Coastguard Worker
314*77c1e3ccSAndroid Build Coastguard Worker dstvec[r] =
315*77c1e3ccSAndroid Build Coastguard Worker _mm_blendv_epi8(a_mbase_x, res16[0],
316*77c1e3ccSAndroid Build Coastguard Worker *(__m128i *)Mask[0][base_max_diff]); // 16 8bit values
317*77c1e3ccSAndroid Build Coastguard Worker
318*77c1e3ccSAndroid Build Coastguard Worker dstvec_h[r] =
319*77c1e3ccSAndroid Build Coastguard Worker _mm_blendv_epi8(a_mbase_x, res16[1],
320*77c1e3ccSAndroid Build Coastguard Worker *(__m128i *)Mask[1][base_max_diff]); // 16 8bit values
321*77c1e3ccSAndroid Build Coastguard Worker x += dx;
322*77c1e3ccSAndroid Build Coastguard Worker }
323*77c1e3ccSAndroid Build Coastguard Worker }
324*77c1e3ccSAndroid Build Coastguard Worker
// Zone-1 directional prediction for a block 32 pixels wide and N rows tall.
// Runs the 32-wide kernel, then stores each row as two unaligned 16-byte
// vectors: left half at column 0, right half at column 16.
// N must not exceed 64, the capacity of the local result buffers.
static void dr_prediction_z1_32xN_sse4_1(int N, uint8_t *dst, ptrdiff_t stride,
                                         const uint8_t *above,
                                         int upsample_above, int dx) {
  __m128i left[64];
  __m128i right[64];

  dr_prediction_z1_32xN_internal_sse4_1(N, left, right, above, upsample_above,
                                        dx);
  for (int row = 0; row < N; ++row) {
    uint8_t *const out = dst + stride * row;
    _mm_storeu_si128((__m128i *)out, left[row]);
    _mm_storeu_si128((__m128i *)(out + 16), right[row]);
  }
}
336*77c1e3ccSAndroid Build Coastguard Worker
dr_prediction_z1_64xN_sse4_1(int N,uint8_t * dst,ptrdiff_t stride,const uint8_t * above,int upsample_above,int dx)337*77c1e3ccSAndroid Build Coastguard Worker static void dr_prediction_z1_64xN_sse4_1(int N, uint8_t *dst, ptrdiff_t stride,
338*77c1e3ccSAndroid Build Coastguard Worker const uint8_t *above,
339*77c1e3ccSAndroid Build Coastguard Worker int upsample_above, int dx) {
340*77c1e3ccSAndroid Build Coastguard Worker // here upsample_above is 0 by design of av1_use_intra_edge_upsample
341*77c1e3ccSAndroid Build Coastguard Worker (void)upsample_above;
342*77c1e3ccSAndroid Build Coastguard Worker const int frac_bits = 6;
343*77c1e3ccSAndroid Build Coastguard Worker const int max_base_x = ((64 + N) - 1);
344*77c1e3ccSAndroid Build Coastguard Worker
345*77c1e3ccSAndroid Build Coastguard Worker // pre-filter above pixels
346*77c1e3ccSAndroid Build Coastguard Worker // store in temp buffers:
347*77c1e3ccSAndroid Build Coastguard Worker // above[x] * 32 + 16
348*77c1e3ccSAndroid Build Coastguard Worker // above[x+1] - above[x]
349*77c1e3ccSAndroid Build Coastguard Worker // final pixels will be calculated as:
350*77c1e3ccSAndroid Build Coastguard Worker // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
351*77c1e3ccSAndroid Build Coastguard Worker __m128i a0, a1, a32, a16;
352*77c1e3ccSAndroid Build Coastguard Worker __m128i a_mbase_x, diff, c3f;
353*77c1e3ccSAndroid Build Coastguard Worker __m128i max_base, base_inc, mask;
354*77c1e3ccSAndroid Build Coastguard Worker
355*77c1e3ccSAndroid Build Coastguard Worker a16 = _mm_set1_epi16(16);
356*77c1e3ccSAndroid Build Coastguard Worker a_mbase_x = _mm_set1_epi8((char)above[max_base_x]);
357*77c1e3ccSAndroid Build Coastguard Worker max_base = _mm_set1_epi8(max_base_x);
358*77c1e3ccSAndroid Build Coastguard Worker c3f = _mm_set1_epi16(0x3f);
359*77c1e3ccSAndroid Build Coastguard Worker
360*77c1e3ccSAndroid Build Coastguard Worker int x = dx;
361*77c1e3ccSAndroid Build Coastguard Worker for (int r = 0; r < N; r++, dst += stride) {
362*77c1e3ccSAndroid Build Coastguard Worker __m128i b, res, res1;
363*77c1e3ccSAndroid Build Coastguard Worker int base = x >> frac_bits;
364*77c1e3ccSAndroid Build Coastguard Worker if (base >= max_base_x) {
365*77c1e3ccSAndroid Build Coastguard Worker for (int i = r; i < N; ++i) {
366*77c1e3ccSAndroid Build Coastguard Worker _mm_storeu_si128((__m128i *)dst, a_mbase_x); // save 32 values
367*77c1e3ccSAndroid Build Coastguard Worker _mm_storeu_si128((__m128i *)(dst + 16), a_mbase_x);
368*77c1e3ccSAndroid Build Coastguard Worker _mm_storeu_si128((__m128i *)(dst + 32), a_mbase_x);
369*77c1e3ccSAndroid Build Coastguard Worker _mm_storeu_si128((__m128i *)(dst + 48), a_mbase_x);
370*77c1e3ccSAndroid Build Coastguard Worker dst += stride;
371*77c1e3ccSAndroid Build Coastguard Worker }
372*77c1e3ccSAndroid Build Coastguard Worker return;
373*77c1e3ccSAndroid Build Coastguard Worker }
374*77c1e3ccSAndroid Build Coastguard Worker
375*77c1e3ccSAndroid Build Coastguard Worker __m128i shift =
376*77c1e3ccSAndroid Build Coastguard Worker _mm_srli_epi16(_mm_and_si128(_mm_set1_epi16(x), c3f), 1); // 8 element
377*77c1e3ccSAndroid Build Coastguard Worker
378*77c1e3ccSAndroid Build Coastguard Worker __m128i a0_above, a1_above, res_val;
379*77c1e3ccSAndroid Build Coastguard Worker for (int j = 0; j < 64; j += 16) {
380*77c1e3ccSAndroid Build Coastguard Worker int mdif = max_base_x - (base + j);
381*77c1e3ccSAndroid Build Coastguard Worker if (mdif <= 0) {
382*77c1e3ccSAndroid Build Coastguard Worker _mm_storeu_si128((__m128i *)(dst + j), a_mbase_x);
383*77c1e3ccSAndroid Build Coastguard Worker } else {
384*77c1e3ccSAndroid Build Coastguard Worker a0_above =
385*77c1e3ccSAndroid Build Coastguard Worker _mm_loadu_si128((__m128i *)(above + base + j)); // load 16 element
386*77c1e3ccSAndroid Build Coastguard Worker a1_above = _mm_loadu_si128((__m128i *)(above + base + 1 + j));
387*77c1e3ccSAndroid Build Coastguard Worker
388*77c1e3ccSAndroid Build Coastguard Worker // lower half
389*77c1e3ccSAndroid Build Coastguard Worker a0 = _mm_cvtepu8_epi16(a0_above);
390*77c1e3ccSAndroid Build Coastguard Worker a1 = _mm_cvtepu8_epi16(a1_above);
391*77c1e3ccSAndroid Build Coastguard Worker
392*77c1e3ccSAndroid Build Coastguard Worker diff = _mm_sub_epi16(a1, a0); // a[x+1] - a[x]
393*77c1e3ccSAndroid Build Coastguard Worker a32 = _mm_slli_epi16(a0, 5); // a[x] * 32
394*77c1e3ccSAndroid Build Coastguard Worker a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16
395*77c1e3ccSAndroid Build Coastguard Worker b = _mm_mullo_epi16(diff, shift);
396*77c1e3ccSAndroid Build Coastguard Worker
397*77c1e3ccSAndroid Build Coastguard Worker res = _mm_add_epi16(a32, b);
398*77c1e3ccSAndroid Build Coastguard Worker res = _mm_srli_epi16(res, 5);
399*77c1e3ccSAndroid Build Coastguard Worker
400*77c1e3ccSAndroid Build Coastguard Worker // upper half
401*77c1e3ccSAndroid Build Coastguard Worker a0 = _mm_cvtepu8_epi16(_mm_srli_si128(a0_above, 8));
402*77c1e3ccSAndroid Build Coastguard Worker a1 = _mm_cvtepu8_epi16(_mm_srli_si128(a1_above, 8));
403*77c1e3ccSAndroid Build Coastguard Worker
404*77c1e3ccSAndroid Build Coastguard Worker diff = _mm_sub_epi16(a1, a0); // a[x+1] - a[x]
405*77c1e3ccSAndroid Build Coastguard Worker a32 = _mm_slli_epi16(a0, 5); // a[x] * 32
406*77c1e3ccSAndroid Build Coastguard Worker a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16
407*77c1e3ccSAndroid Build Coastguard Worker
408*77c1e3ccSAndroid Build Coastguard Worker b = _mm_mullo_epi16(diff, shift);
409*77c1e3ccSAndroid Build Coastguard Worker res1 = _mm_add_epi16(a32, b);
410*77c1e3ccSAndroid Build Coastguard Worker res1 = _mm_srli_epi16(res1, 5);
411*77c1e3ccSAndroid Build Coastguard Worker
412*77c1e3ccSAndroid Build Coastguard Worker res = _mm_packus_epi16(res, res1); // 16 8bit values
413*77c1e3ccSAndroid Build Coastguard Worker
414*77c1e3ccSAndroid Build Coastguard Worker base_inc =
415*77c1e3ccSAndroid Build Coastguard Worker _mm_setr_epi8((int8_t)(base + j), (int8_t)(base + j + 1),
416*77c1e3ccSAndroid Build Coastguard Worker (int8_t)(base + j + 2), (int8_t)(base + j + 3),
417*77c1e3ccSAndroid Build Coastguard Worker (int8_t)(base + j + 4), (int8_t)(base + j + 5),
418*77c1e3ccSAndroid Build Coastguard Worker (int8_t)(base + j + 6), (int8_t)(base + j + 7),
419*77c1e3ccSAndroid Build Coastguard Worker (int8_t)(base + j + 8), (int8_t)(base + j + 9),
420*77c1e3ccSAndroid Build Coastguard Worker (int8_t)(base + j + 10), (int8_t)(base + j + 11),
421*77c1e3ccSAndroid Build Coastguard Worker (int8_t)(base + j + 12), (int8_t)(base + j + 13),
422*77c1e3ccSAndroid Build Coastguard Worker (int8_t)(base + j + 14), (int8_t)(base + j + 15));
423*77c1e3ccSAndroid Build Coastguard Worker
424*77c1e3ccSAndroid Build Coastguard Worker mask = _mm_cmpgt_epi8(_mm_subs_epu8(max_base, base_inc),
425*77c1e3ccSAndroid Build Coastguard Worker _mm_setzero_si128());
426*77c1e3ccSAndroid Build Coastguard Worker res_val = _mm_blendv_epi8(a_mbase_x, res, mask);
427*77c1e3ccSAndroid Build Coastguard Worker _mm_storeu_si128((__m128i *)(dst + j), res_val);
428*77c1e3ccSAndroid Build Coastguard Worker }
429*77c1e3ccSAndroid Build Coastguard Worker }
430*77c1e3ccSAndroid Build Coastguard Worker x += dx;
431*77c1e3ccSAndroid Build Coastguard Worker }
432*77c1e3ccSAndroid Build Coastguard Worker }
433*77c1e3ccSAndroid Build Coastguard Worker
// Directional prediction, zone 1: 0 < angle < 90
//
// Entry point that dispatches on the block width `bw` to the matching
// width-specialised kernel (4, 8, 16, 32 or 64 columns). Each kernel receives
// the block height `bh`, the destination pointer and stride, the `above` edge,
// the `upsample_above` flag and the horizontal step `dx`.
//
// `left` and `dy` are accepted only to keep the common predictor signature;
// they are not used. Any other width trips the assert in debug builds and
// writes nothing.
void av1_dr_prediction_z1_sse4_1(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
                                 const uint8_t *above, const uint8_t *left,
                                 int upsample_above, int dx, int dy) {
  (void)left;
  (void)dy;
  switch (bw) {
    case 4:
      dr_prediction_z1_4xN_sse4_1(bh, dst, stride, above, upsample_above, dx);
      break;
    case 8:
      dr_prediction_z1_8xN_sse4_1(bh, dst, stride, above, upsample_above, dx);
      break;
    case 16:
      dr_prediction_z1_16xN_sse4_1(bh, dst, stride, above, upsample_above, dx);
      break;
    case 32:
      dr_prediction_z1_32xN_sse4_1(bh, dst, stride, above, upsample_above, dx);
      break;
    case 64:
      dr_prediction_z1_64xN_sse4_1(bh, dst, stride, above, upsample_above, dx);
      break;
    default: assert(0 && "Invalid block size"); break;
  }
}
460*77c1e3ccSAndroid Build Coastguard Worker
dr_prediction_z2_Nx4_sse4_1(int N,uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left,int upsample_above,int upsample_left,int dx,int dy)461*77c1e3ccSAndroid Build Coastguard Worker static void dr_prediction_z2_Nx4_sse4_1(int N, uint8_t *dst, ptrdiff_t stride,
462*77c1e3ccSAndroid Build Coastguard Worker const uint8_t *above,
463*77c1e3ccSAndroid Build Coastguard Worker const uint8_t *left, int upsample_above,
464*77c1e3ccSAndroid Build Coastguard Worker int upsample_left, int dx, int dy) {
465*77c1e3ccSAndroid Build Coastguard Worker const int min_base_x = -(1 << upsample_above);
466*77c1e3ccSAndroid Build Coastguard Worker const int min_base_y = -(1 << upsample_left);
467*77c1e3ccSAndroid Build Coastguard Worker const int frac_bits_x = 6 - upsample_above;
468*77c1e3ccSAndroid Build Coastguard Worker const int frac_bits_y = 6 - upsample_left;
469*77c1e3ccSAndroid Build Coastguard Worker
470*77c1e3ccSAndroid Build Coastguard Worker assert(dx > 0);
471*77c1e3ccSAndroid Build Coastguard Worker // pre-filter above pixels
472*77c1e3ccSAndroid Build Coastguard Worker // store in temp buffers:
473*77c1e3ccSAndroid Build Coastguard Worker // above[x] * 32 + 16
474*77c1e3ccSAndroid Build Coastguard Worker // above[x+1] - above[x]
475*77c1e3ccSAndroid Build Coastguard Worker // final pixels will be calculated as:
476*77c1e3ccSAndroid Build Coastguard Worker // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
477*77c1e3ccSAndroid Build Coastguard Worker __m128i a0_x, a1_x, a32, diff;
478*77c1e3ccSAndroid Build Coastguard Worker
479*77c1e3ccSAndroid Build Coastguard Worker const __m128i c3f = _mm_set1_epi16(0x3f);
480*77c1e3ccSAndroid Build Coastguard Worker const __m128i min_y_base = _mm_set1_epi16(min_base_y);
481*77c1e3ccSAndroid Build Coastguard Worker const __m128i c1234 = _mm_setr_epi16(0, 1, 2, 3, 4, 0, 0, 0);
482*77c1e3ccSAndroid Build Coastguard Worker const __m128i dy_reg = _mm_set1_epi16(dy);
483*77c1e3ccSAndroid Build Coastguard Worker const __m128i a16 = _mm_set1_epi16(16);
484*77c1e3ccSAndroid Build Coastguard Worker
485*77c1e3ccSAndroid Build Coastguard Worker for (int r = 0; r < N; r++) {
486*77c1e3ccSAndroid Build Coastguard Worker __m128i b, res, shift, r6, ydx;
487*77c1e3ccSAndroid Build Coastguard Worker __m128i resx, resy, resxy;
488*77c1e3ccSAndroid Build Coastguard Worker __m128i a0_above, a1_above;
489*77c1e3ccSAndroid Build Coastguard Worker int y = r + 1;
490*77c1e3ccSAndroid Build Coastguard Worker int base_x = (-y * dx) >> frac_bits_x;
491*77c1e3ccSAndroid Build Coastguard Worker int base_shift = 0;
492*77c1e3ccSAndroid Build Coastguard Worker if (base_x < (min_base_x - 1)) {
493*77c1e3ccSAndroid Build Coastguard Worker base_shift = (min_base_x - base_x - 1) >> upsample_above;
494*77c1e3ccSAndroid Build Coastguard Worker }
495*77c1e3ccSAndroid Build Coastguard Worker int base_min_diff =
496*77c1e3ccSAndroid Build Coastguard Worker (min_base_x - base_x + upsample_above) >> upsample_above;
497*77c1e3ccSAndroid Build Coastguard Worker if (base_min_diff > 4) {
498*77c1e3ccSAndroid Build Coastguard Worker base_min_diff = 4;
499*77c1e3ccSAndroid Build Coastguard Worker } else {
500*77c1e3ccSAndroid Build Coastguard Worker if (base_min_diff < 0) base_min_diff = 0;
501*77c1e3ccSAndroid Build Coastguard Worker }
502*77c1e3ccSAndroid Build Coastguard Worker
503*77c1e3ccSAndroid Build Coastguard Worker if (base_shift > 3) {
504*77c1e3ccSAndroid Build Coastguard Worker a0_x = _mm_setzero_si128();
505*77c1e3ccSAndroid Build Coastguard Worker a1_x = _mm_setzero_si128();
506*77c1e3ccSAndroid Build Coastguard Worker shift = _mm_setzero_si128();
507*77c1e3ccSAndroid Build Coastguard Worker } else {
508*77c1e3ccSAndroid Build Coastguard Worker a0_above = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
509*77c1e3ccSAndroid Build Coastguard Worker ydx = _mm_set1_epi16(y * dx);
510*77c1e3ccSAndroid Build Coastguard Worker r6 = _mm_slli_epi16(c1234, 6);
511*77c1e3ccSAndroid Build Coastguard Worker
512*77c1e3ccSAndroid Build Coastguard Worker if (upsample_above) {
513*77c1e3ccSAndroid Build Coastguard Worker a0_above =
514*77c1e3ccSAndroid Build Coastguard Worker _mm_shuffle_epi8(a0_above, *(__m128i *)EvenOddMaskx[base_shift]);
515*77c1e3ccSAndroid Build Coastguard Worker a1_above = _mm_srli_si128(a0_above, 8);
516*77c1e3ccSAndroid Build Coastguard Worker
517*77c1e3ccSAndroid Build Coastguard Worker shift = _mm_srli_epi16(
518*77c1e3ccSAndroid Build Coastguard Worker _mm_and_si128(
519*77c1e3ccSAndroid Build Coastguard Worker _mm_slli_epi16(_mm_sub_epi16(r6, ydx), upsample_above), c3f),
520*77c1e3ccSAndroid Build Coastguard Worker 1);
521*77c1e3ccSAndroid Build Coastguard Worker } else {
522*77c1e3ccSAndroid Build Coastguard Worker a0_above =
523*77c1e3ccSAndroid Build Coastguard Worker _mm_shuffle_epi8(a0_above, *(__m128i *)LoadMaskx[base_shift]);
524*77c1e3ccSAndroid Build Coastguard Worker a1_above = _mm_srli_si128(a0_above, 1);
525*77c1e3ccSAndroid Build Coastguard Worker
526*77c1e3ccSAndroid Build Coastguard Worker shift = _mm_srli_epi16(_mm_and_si128(_mm_sub_epi16(r6, ydx), c3f), 1);
527*77c1e3ccSAndroid Build Coastguard Worker }
528*77c1e3ccSAndroid Build Coastguard Worker a0_x = _mm_cvtepu8_epi16(a0_above);
529*77c1e3ccSAndroid Build Coastguard Worker a1_x = _mm_cvtepu8_epi16(a1_above);
530*77c1e3ccSAndroid Build Coastguard Worker }
531*77c1e3ccSAndroid Build Coastguard Worker // y calc
532*77c1e3ccSAndroid Build Coastguard Worker __m128i a0_y, a1_y, shifty;
533*77c1e3ccSAndroid Build Coastguard Worker if (base_x < min_base_x) {
534*77c1e3ccSAndroid Build Coastguard Worker DECLARE_ALIGNED(32, int16_t, base_y_c[8]);
535*77c1e3ccSAndroid Build Coastguard Worker __m128i y_c, base_y_c_reg, mask, c1234_;
536*77c1e3ccSAndroid Build Coastguard Worker c1234_ = _mm_srli_si128(c1234, 2);
537*77c1e3ccSAndroid Build Coastguard Worker r6 = _mm_set1_epi16(r << 6);
538*77c1e3ccSAndroid Build Coastguard Worker y_c = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234_, dy_reg));
539*77c1e3ccSAndroid Build Coastguard Worker base_y_c_reg = _mm_srai_epi16(y_c, frac_bits_y);
540*77c1e3ccSAndroid Build Coastguard Worker mask = _mm_cmpgt_epi16(min_y_base, base_y_c_reg);
541*77c1e3ccSAndroid Build Coastguard Worker base_y_c_reg = _mm_andnot_si128(mask, base_y_c_reg);
542*77c1e3ccSAndroid Build Coastguard Worker _mm_store_si128((__m128i *)base_y_c, base_y_c_reg);
543*77c1e3ccSAndroid Build Coastguard Worker
544*77c1e3ccSAndroid Build Coastguard Worker a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
545*77c1e3ccSAndroid Build Coastguard Worker left[base_y_c[2]], left[base_y_c[3]], 0, 0, 0, 0);
546*77c1e3ccSAndroid Build Coastguard Worker base_y_c_reg = _mm_add_epi16(base_y_c_reg, _mm_srli_epi16(a16, 4));
547*77c1e3ccSAndroid Build Coastguard Worker _mm_store_si128((__m128i *)base_y_c, base_y_c_reg);
548*77c1e3ccSAndroid Build Coastguard Worker a1_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
549*77c1e3ccSAndroid Build Coastguard Worker left[base_y_c[2]], left[base_y_c[3]], 0, 0, 0, 0);
550*77c1e3ccSAndroid Build Coastguard Worker
551*77c1e3ccSAndroid Build Coastguard Worker if (upsample_left) {
552*77c1e3ccSAndroid Build Coastguard Worker shifty = _mm_srli_epi16(
553*77c1e3ccSAndroid Build Coastguard Worker _mm_and_si128(_mm_slli_epi16(y_c, upsample_left), c3f), 1);
554*77c1e3ccSAndroid Build Coastguard Worker } else {
555*77c1e3ccSAndroid Build Coastguard Worker shifty = _mm_srli_epi16(_mm_and_si128(y_c, c3f), 1);
556*77c1e3ccSAndroid Build Coastguard Worker }
557*77c1e3ccSAndroid Build Coastguard Worker a0_x = _mm_unpacklo_epi64(a0_x, a0_y);
558*77c1e3ccSAndroid Build Coastguard Worker a1_x = _mm_unpacklo_epi64(a1_x, a1_y);
559*77c1e3ccSAndroid Build Coastguard Worker shift = _mm_unpacklo_epi64(shift, shifty);
560*77c1e3ccSAndroid Build Coastguard Worker }
561*77c1e3ccSAndroid Build Coastguard Worker
562*77c1e3ccSAndroid Build Coastguard Worker diff = _mm_sub_epi16(a1_x, a0_x); // a[x+1] - a[x]
563*77c1e3ccSAndroid Build Coastguard Worker a32 = _mm_slli_epi16(a0_x, 5); // a[x] * 32
564*77c1e3ccSAndroid Build Coastguard Worker a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16
565*77c1e3ccSAndroid Build Coastguard Worker
566*77c1e3ccSAndroid Build Coastguard Worker b = _mm_mullo_epi16(diff, shift);
567*77c1e3ccSAndroid Build Coastguard Worker res = _mm_add_epi16(a32, b);
568*77c1e3ccSAndroid Build Coastguard Worker res = _mm_srli_epi16(res, 5);
569*77c1e3ccSAndroid Build Coastguard Worker
570*77c1e3ccSAndroid Build Coastguard Worker resx = _mm_packus_epi16(res, res);
571*77c1e3ccSAndroid Build Coastguard Worker resy = _mm_srli_si128(resx, 4);
572*77c1e3ccSAndroid Build Coastguard Worker
573*77c1e3ccSAndroid Build Coastguard Worker resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)Mask[0][base_min_diff]);
574*77c1e3ccSAndroid Build Coastguard Worker *(int *)(dst) = _mm_cvtsi128_si32(resxy);
575*77c1e3ccSAndroid Build Coastguard Worker dst += stride;
576*77c1e3ccSAndroid Build Coastguard Worker }
577*77c1e3ccSAndroid Build Coastguard Worker }
578*77c1e3ccSAndroid Build Coastguard Worker
dr_prediction_z2_Nx8_sse4_1(int N,uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left,int upsample_above,int upsample_left,int dx,int dy)579*77c1e3ccSAndroid Build Coastguard Worker static void dr_prediction_z2_Nx8_sse4_1(int N, uint8_t *dst, ptrdiff_t stride,
580*77c1e3ccSAndroid Build Coastguard Worker const uint8_t *above,
581*77c1e3ccSAndroid Build Coastguard Worker const uint8_t *left, int upsample_above,
582*77c1e3ccSAndroid Build Coastguard Worker int upsample_left, int dx, int dy) {
583*77c1e3ccSAndroid Build Coastguard Worker const int min_base_x = -(1 << upsample_above);
584*77c1e3ccSAndroid Build Coastguard Worker const int min_base_y = -(1 << upsample_left);
585*77c1e3ccSAndroid Build Coastguard Worker const int frac_bits_x = 6 - upsample_above;
586*77c1e3ccSAndroid Build Coastguard Worker const int frac_bits_y = 6 - upsample_left;
587*77c1e3ccSAndroid Build Coastguard Worker
588*77c1e3ccSAndroid Build Coastguard Worker // pre-filter above pixels
589*77c1e3ccSAndroid Build Coastguard Worker // store in temp buffers:
590*77c1e3ccSAndroid Build Coastguard Worker // above[x] * 32 + 16
591*77c1e3ccSAndroid Build Coastguard Worker // above[x+1] - above[x]
592*77c1e3ccSAndroid Build Coastguard Worker // final pixels will be calculated as:
593*77c1e3ccSAndroid Build Coastguard Worker // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
594*77c1e3ccSAndroid Build Coastguard Worker __m128i diff, a32;
595*77c1e3ccSAndroid Build Coastguard Worker __m128i a0_x, a1_x, a0_y, a1_y;
596*77c1e3ccSAndroid Build Coastguard Worker __m128i a0_above, a1_above;
597*77c1e3ccSAndroid Build Coastguard Worker
598*77c1e3ccSAndroid Build Coastguard Worker const __m128i a16 = _mm_set1_epi16(16);
599*77c1e3ccSAndroid Build Coastguard Worker const __m128i c3f = _mm_set1_epi16(0x3f);
600*77c1e3ccSAndroid Build Coastguard Worker const __m128i min_y_base = _mm_set1_epi16(min_base_y);
601*77c1e3ccSAndroid Build Coastguard Worker const __m128i dy_reg = _mm_set1_epi16(dy);
602*77c1e3ccSAndroid Build Coastguard Worker const __m128i c1234 = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
603*77c1e3ccSAndroid Build Coastguard Worker
604*77c1e3ccSAndroid Build Coastguard Worker for (int r = 0; r < N; r++) {
605*77c1e3ccSAndroid Build Coastguard Worker __m128i b, res, res1, shift;
606*77c1e3ccSAndroid Build Coastguard Worker __m128i resx, resy, resxy, r6, ydx;
607*77c1e3ccSAndroid Build Coastguard Worker
608*77c1e3ccSAndroid Build Coastguard Worker int y = r + 1;
609*77c1e3ccSAndroid Build Coastguard Worker int base_x = (-y * dx) >> frac_bits_x;
610*77c1e3ccSAndroid Build Coastguard Worker int base_shift = 0;
611*77c1e3ccSAndroid Build Coastguard Worker if (base_x < (min_base_x - 1)) {
612*77c1e3ccSAndroid Build Coastguard Worker base_shift = (min_base_x - base_x - 1) >> upsample_above;
613*77c1e3ccSAndroid Build Coastguard Worker }
614*77c1e3ccSAndroid Build Coastguard Worker int base_min_diff =
615*77c1e3ccSAndroid Build Coastguard Worker (min_base_x - base_x + upsample_above) >> upsample_above;
616*77c1e3ccSAndroid Build Coastguard Worker if (base_min_diff > 8) {
617*77c1e3ccSAndroid Build Coastguard Worker base_min_diff = 8;
618*77c1e3ccSAndroid Build Coastguard Worker } else {
619*77c1e3ccSAndroid Build Coastguard Worker if (base_min_diff < 0) base_min_diff = 0;
620*77c1e3ccSAndroid Build Coastguard Worker }
621*77c1e3ccSAndroid Build Coastguard Worker
622*77c1e3ccSAndroid Build Coastguard Worker if (base_shift > 7) {
623*77c1e3ccSAndroid Build Coastguard Worker resx = _mm_setzero_si128();
624*77c1e3ccSAndroid Build Coastguard Worker } else {
625*77c1e3ccSAndroid Build Coastguard Worker a0_above = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
626*77c1e3ccSAndroid Build Coastguard Worker ydx = _mm_set1_epi16(y * dx);
627*77c1e3ccSAndroid Build Coastguard Worker r6 = _mm_slli_epi16(_mm_srli_si128(c1234, 2), 6);
628*77c1e3ccSAndroid Build Coastguard Worker if (upsample_above) {
629*77c1e3ccSAndroid Build Coastguard Worker a0_above =
630*77c1e3ccSAndroid Build Coastguard Worker _mm_shuffle_epi8(a0_above, *(__m128i *)EvenOddMaskx[base_shift]);
631*77c1e3ccSAndroid Build Coastguard Worker a1_above = _mm_srli_si128(a0_above, 8);
632*77c1e3ccSAndroid Build Coastguard Worker
633*77c1e3ccSAndroid Build Coastguard Worker shift = _mm_srli_epi16(
634*77c1e3ccSAndroid Build Coastguard Worker _mm_and_si128(
635*77c1e3ccSAndroid Build Coastguard Worker _mm_slli_epi16(_mm_sub_epi16(r6, ydx), upsample_above), c3f),
636*77c1e3ccSAndroid Build Coastguard Worker 1);
637*77c1e3ccSAndroid Build Coastguard Worker } else {
638*77c1e3ccSAndroid Build Coastguard Worker a1_above = _mm_srli_si128(a0_above, 1);
639*77c1e3ccSAndroid Build Coastguard Worker a0_above =
640*77c1e3ccSAndroid Build Coastguard Worker _mm_shuffle_epi8(a0_above, *(__m128i *)LoadMaskx[base_shift]);
641*77c1e3ccSAndroid Build Coastguard Worker a1_above =
642*77c1e3ccSAndroid Build Coastguard Worker _mm_shuffle_epi8(a1_above, *(__m128i *)LoadMaskx[base_shift]);
643*77c1e3ccSAndroid Build Coastguard Worker
644*77c1e3ccSAndroid Build Coastguard Worker shift = _mm_srli_epi16(_mm_and_si128(_mm_sub_epi16(r6, ydx), c3f), 1);
645*77c1e3ccSAndroid Build Coastguard Worker }
646*77c1e3ccSAndroid Build Coastguard Worker a0_x = _mm_cvtepu8_epi16(a0_above);
647*77c1e3ccSAndroid Build Coastguard Worker a1_x = _mm_cvtepu8_epi16(a1_above);
648*77c1e3ccSAndroid Build Coastguard Worker
649*77c1e3ccSAndroid Build Coastguard Worker diff = _mm_sub_epi16(a1_x, a0_x); // a[x+1] - a[x]
650*77c1e3ccSAndroid Build Coastguard Worker a32 = _mm_slli_epi16(a0_x, 5); // a[x] * 32
651*77c1e3ccSAndroid Build Coastguard Worker a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16
652*77c1e3ccSAndroid Build Coastguard Worker
653*77c1e3ccSAndroid Build Coastguard Worker b = _mm_mullo_epi16(diff, shift);
654*77c1e3ccSAndroid Build Coastguard Worker res = _mm_add_epi16(a32, b);
655*77c1e3ccSAndroid Build Coastguard Worker res = _mm_srli_epi16(res, 5);
656*77c1e3ccSAndroid Build Coastguard Worker resx = _mm_packus_epi16(res, res);
657*77c1e3ccSAndroid Build Coastguard Worker }
658*77c1e3ccSAndroid Build Coastguard Worker
659*77c1e3ccSAndroid Build Coastguard Worker // y calc
660*77c1e3ccSAndroid Build Coastguard Worker if (base_x < min_base_x) {
661*77c1e3ccSAndroid Build Coastguard Worker DECLARE_ALIGNED(32, int16_t, base_y_c[16]);
662*77c1e3ccSAndroid Build Coastguard Worker __m128i y_c, base_y_c_reg, mask;
663*77c1e3ccSAndroid Build Coastguard Worker r6 = _mm_set1_epi16(r << 6);
664*77c1e3ccSAndroid Build Coastguard Worker y_c = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234, dy_reg));
665*77c1e3ccSAndroid Build Coastguard Worker base_y_c_reg = _mm_srai_epi16(y_c, frac_bits_y);
666*77c1e3ccSAndroid Build Coastguard Worker mask = _mm_cmpgt_epi16(min_y_base, base_y_c_reg);
667*77c1e3ccSAndroid Build Coastguard Worker base_y_c_reg = _mm_andnot_si128(mask, base_y_c_reg);
668*77c1e3ccSAndroid Build Coastguard Worker _mm_store_si128((__m128i *)base_y_c, base_y_c_reg);
669*77c1e3ccSAndroid Build Coastguard Worker
670*77c1e3ccSAndroid Build Coastguard Worker a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
671*77c1e3ccSAndroid Build Coastguard Worker left[base_y_c[2]], left[base_y_c[3]],
672*77c1e3ccSAndroid Build Coastguard Worker left[base_y_c[4]], left[base_y_c[5]],
673*77c1e3ccSAndroid Build Coastguard Worker left[base_y_c[6]], left[base_y_c[7]]);
674*77c1e3ccSAndroid Build Coastguard Worker base_y_c_reg = _mm_add_epi16(base_y_c_reg, _mm_srli_epi16(a16, 4));
675*77c1e3ccSAndroid Build Coastguard Worker _mm_store_si128((__m128i *)base_y_c, base_y_c_reg);
676*77c1e3ccSAndroid Build Coastguard Worker
677*77c1e3ccSAndroid Build Coastguard Worker a1_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
678*77c1e3ccSAndroid Build Coastguard Worker left[base_y_c[2]], left[base_y_c[3]],
679*77c1e3ccSAndroid Build Coastguard Worker left[base_y_c[4]], left[base_y_c[5]],
680*77c1e3ccSAndroid Build Coastguard Worker left[base_y_c[6]], left[base_y_c[7]]);
681*77c1e3ccSAndroid Build Coastguard Worker
682*77c1e3ccSAndroid Build Coastguard Worker if (upsample_left) {
683*77c1e3ccSAndroid Build Coastguard Worker shift = _mm_srli_epi16(
684*77c1e3ccSAndroid Build Coastguard Worker _mm_and_si128(_mm_slli_epi16(y_c, upsample_left), c3f), 1);
685*77c1e3ccSAndroid Build Coastguard Worker } else {
686*77c1e3ccSAndroid Build Coastguard Worker shift = _mm_srli_epi16(_mm_and_si128(y_c, c3f), 1);
687*77c1e3ccSAndroid Build Coastguard Worker }
688*77c1e3ccSAndroid Build Coastguard Worker
689*77c1e3ccSAndroid Build Coastguard Worker diff = _mm_sub_epi16(a1_y, a0_y); // a[x+1] - a[x]
690*77c1e3ccSAndroid Build Coastguard Worker a32 = _mm_slli_epi16(a0_y, 5); // a[x] * 32
691*77c1e3ccSAndroid Build Coastguard Worker a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16
692*77c1e3ccSAndroid Build Coastguard Worker
693*77c1e3ccSAndroid Build Coastguard Worker b = _mm_mullo_epi16(diff, shift);
694*77c1e3ccSAndroid Build Coastguard Worker res1 = _mm_add_epi16(a32, b);
695*77c1e3ccSAndroid Build Coastguard Worker res1 = _mm_srli_epi16(res1, 5);
696*77c1e3ccSAndroid Build Coastguard Worker
697*77c1e3ccSAndroid Build Coastguard Worker resy = _mm_packus_epi16(res1, res1);
698*77c1e3ccSAndroid Build Coastguard Worker resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)Mask[0][base_min_diff]);
699*77c1e3ccSAndroid Build Coastguard Worker _mm_storel_epi64((__m128i *)dst, resxy);
700*77c1e3ccSAndroid Build Coastguard Worker } else {
701*77c1e3ccSAndroid Build Coastguard Worker _mm_storel_epi64((__m128i *)dst, resx);
702*77c1e3ccSAndroid Build Coastguard Worker }
703*77c1e3ccSAndroid Build Coastguard Worker
704*77c1e3ccSAndroid Build Coastguard Worker dst += stride;
705*77c1e3ccSAndroid Build Coastguard Worker }
706*77c1e3ccSAndroid Build Coastguard Worker }
707*77c1e3ccSAndroid Build Coastguard Worker
dr_prediction_z2_HxW_sse4_1(int H,int W,uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left,int upsample_above,int upsample_left,int dx,int dy)708*77c1e3ccSAndroid Build Coastguard Worker static void dr_prediction_z2_HxW_sse4_1(int H, int W, uint8_t *dst,
709*77c1e3ccSAndroid Build Coastguard Worker ptrdiff_t stride, const uint8_t *above,
710*77c1e3ccSAndroid Build Coastguard Worker const uint8_t *left, int upsample_above,
711*77c1e3ccSAndroid Build Coastguard Worker int upsample_left, int dx, int dy) {
712*77c1e3ccSAndroid Build Coastguard Worker // here upsample_above and upsample_left are 0 by design of
713*77c1e3ccSAndroid Build Coastguard Worker // av1_use_intra_edge_upsample
714*77c1e3ccSAndroid Build Coastguard Worker const int min_base_x = -1;
715*77c1e3ccSAndroid Build Coastguard Worker const int min_base_y = -1;
716*77c1e3ccSAndroid Build Coastguard Worker (void)upsample_above;
717*77c1e3ccSAndroid Build Coastguard Worker (void)upsample_left;
718*77c1e3ccSAndroid Build Coastguard Worker const int frac_bits_x = 6;
719*77c1e3ccSAndroid Build Coastguard Worker const int frac_bits_y = 6;
720*77c1e3ccSAndroid Build Coastguard Worker
721*77c1e3ccSAndroid Build Coastguard Worker __m128i a0_x, a1_x, a0_y, a1_y, a0_y_h, a1_y_h, a32;
722*77c1e3ccSAndroid Build Coastguard Worker __m128i diff, shifty, shifty_h;
723*77c1e3ccSAndroid Build Coastguard Worker __m128i a0_above, a1_above;
724*77c1e3ccSAndroid Build Coastguard Worker
725*77c1e3ccSAndroid Build Coastguard Worker DECLARE_ALIGNED(32, int16_t, base_y_c[16]);
726*77c1e3ccSAndroid Build Coastguard Worker const __m128i a16 = _mm_set1_epi16(16);
727*77c1e3ccSAndroid Build Coastguard Worker const __m128i c1 = _mm_srli_epi16(a16, 4);
728*77c1e3ccSAndroid Build Coastguard Worker const __m128i min_y_base = _mm_set1_epi16(min_base_y);
729*77c1e3ccSAndroid Build Coastguard Worker const __m128i c3f = _mm_set1_epi16(0x3f);
730*77c1e3ccSAndroid Build Coastguard Worker const __m128i dy256 = _mm_set1_epi16(dy);
731*77c1e3ccSAndroid Build Coastguard Worker const __m128i c0123 = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
732*77c1e3ccSAndroid Build Coastguard Worker const __m128i c0123_h = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
733*77c1e3ccSAndroid Build Coastguard Worker const __m128i c1234 = _mm_add_epi16(c0123, c1);
734*77c1e3ccSAndroid Build Coastguard Worker const __m128i c1234_h = _mm_add_epi16(c0123_h, c1);
735*77c1e3ccSAndroid Build Coastguard Worker
736*77c1e3ccSAndroid Build Coastguard Worker for (int r = 0; r < H; r++) {
737*77c1e3ccSAndroid Build Coastguard Worker __m128i b, res, res1, shift, reg_j, r6, ydx;
738*77c1e3ccSAndroid Build Coastguard Worker __m128i resx, resy;
739*77c1e3ccSAndroid Build Coastguard Worker __m128i resxy;
740*77c1e3ccSAndroid Build Coastguard Worker int y = r + 1;
741*77c1e3ccSAndroid Build Coastguard Worker ydx = _mm_set1_epi16((int16_t)(y * dx));
742*77c1e3ccSAndroid Build Coastguard Worker
743*77c1e3ccSAndroid Build Coastguard Worker int base_x = (-y * dx) >> frac_bits_x;
744*77c1e3ccSAndroid Build Coastguard Worker for (int j = 0; j < W; j += 16) {
745*77c1e3ccSAndroid Build Coastguard Worker reg_j = _mm_set1_epi16(j);
746*77c1e3ccSAndroid Build Coastguard Worker int base_shift = 0;
747*77c1e3ccSAndroid Build Coastguard Worker if ((base_x + j) < (min_base_x - 1)) {
748*77c1e3ccSAndroid Build Coastguard Worker base_shift = (min_base_x - (base_x + j) - 1);
749*77c1e3ccSAndroid Build Coastguard Worker }
750*77c1e3ccSAndroid Build Coastguard Worker int base_min_diff = (min_base_x - base_x - j);
751*77c1e3ccSAndroid Build Coastguard Worker if (base_min_diff > 16) {
752*77c1e3ccSAndroid Build Coastguard Worker base_min_diff = 16;
753*77c1e3ccSAndroid Build Coastguard Worker } else {
754*77c1e3ccSAndroid Build Coastguard Worker if (base_min_diff < 0) base_min_diff = 0;
755*77c1e3ccSAndroid Build Coastguard Worker }
756*77c1e3ccSAndroid Build Coastguard Worker
757*77c1e3ccSAndroid Build Coastguard Worker if (base_shift < 16) {
758*77c1e3ccSAndroid Build Coastguard Worker a0_above =
759*77c1e3ccSAndroid Build Coastguard Worker _mm_loadu_si128((__m128i *)(above + base_x + base_shift + j));
760*77c1e3ccSAndroid Build Coastguard Worker a1_above =
761*77c1e3ccSAndroid Build Coastguard Worker _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 1 + j));
762*77c1e3ccSAndroid Build Coastguard Worker a0_above =
763*77c1e3ccSAndroid Build Coastguard Worker _mm_shuffle_epi8(a0_above, *(__m128i *)LoadMaskx[base_shift]);
764*77c1e3ccSAndroid Build Coastguard Worker a1_above =
765*77c1e3ccSAndroid Build Coastguard Worker _mm_shuffle_epi8(a1_above, *(__m128i *)LoadMaskx[base_shift]);
766*77c1e3ccSAndroid Build Coastguard Worker
767*77c1e3ccSAndroid Build Coastguard Worker a0_x = _mm_cvtepu8_epi16(a0_above);
768*77c1e3ccSAndroid Build Coastguard Worker a1_x = _mm_cvtepu8_epi16(a1_above);
769*77c1e3ccSAndroid Build Coastguard Worker
770*77c1e3ccSAndroid Build Coastguard Worker r6 = _mm_slli_epi16(_mm_add_epi16(c0123, reg_j), 6);
771*77c1e3ccSAndroid Build Coastguard Worker shift = _mm_srli_epi16(_mm_and_si128(_mm_sub_epi16(r6, ydx), c3f), 1);
772*77c1e3ccSAndroid Build Coastguard Worker
773*77c1e3ccSAndroid Build Coastguard Worker diff = _mm_sub_epi16(a1_x, a0_x); // a[x+1] - a[x]
774*77c1e3ccSAndroid Build Coastguard Worker a32 = _mm_slli_epi16(a0_x, 5); // a[x] * 32
775*77c1e3ccSAndroid Build Coastguard Worker a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16
776*77c1e3ccSAndroid Build Coastguard Worker
777*77c1e3ccSAndroid Build Coastguard Worker b = _mm_mullo_epi16(diff, shift);
778*77c1e3ccSAndroid Build Coastguard Worker res = _mm_add_epi16(a32, b);
779*77c1e3ccSAndroid Build Coastguard Worker res = _mm_srli_epi16(res, 5); // 16 16-bit values
780*77c1e3ccSAndroid Build Coastguard Worker
781*77c1e3ccSAndroid Build Coastguard Worker a0_x = _mm_cvtepu8_epi16(_mm_srli_si128(a0_above, 8));
782*77c1e3ccSAndroid Build Coastguard Worker a1_x = _mm_cvtepu8_epi16(_mm_srli_si128(a1_above, 8));
783*77c1e3ccSAndroid Build Coastguard Worker
784*77c1e3ccSAndroid Build Coastguard Worker r6 = _mm_slli_epi16(_mm_add_epi16(c0123_h, reg_j), 6);
785*77c1e3ccSAndroid Build Coastguard Worker shift = _mm_srli_epi16(_mm_and_si128(_mm_sub_epi16(r6, ydx), c3f), 1);
786*77c1e3ccSAndroid Build Coastguard Worker
787*77c1e3ccSAndroid Build Coastguard Worker diff = _mm_sub_epi16(a1_x, a0_x); // a[x+1] - a[x]
788*77c1e3ccSAndroid Build Coastguard Worker a32 = _mm_slli_epi16(a0_x, 5); // a[x] * 32
789*77c1e3ccSAndroid Build Coastguard Worker a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16
790*77c1e3ccSAndroid Build Coastguard Worker
791*77c1e3ccSAndroid Build Coastguard Worker b = _mm_mullo_epi16(diff, shift);
792*77c1e3ccSAndroid Build Coastguard Worker res1 = _mm_add_epi16(a32, b);
793*77c1e3ccSAndroid Build Coastguard Worker res1 = _mm_srli_epi16(res1, 5); // 16 16-bit values
794*77c1e3ccSAndroid Build Coastguard Worker
795*77c1e3ccSAndroid Build Coastguard Worker resx = _mm_packus_epi16(res, res1);
796*77c1e3ccSAndroid Build Coastguard Worker } else {
797*77c1e3ccSAndroid Build Coastguard Worker resx = _mm_setzero_si128();
798*77c1e3ccSAndroid Build Coastguard Worker }
799*77c1e3ccSAndroid Build Coastguard Worker
800*77c1e3ccSAndroid Build Coastguard Worker // y calc
801*77c1e3ccSAndroid Build Coastguard Worker if (base_x < min_base_x) {
802*77c1e3ccSAndroid Build Coastguard Worker __m128i c_reg, c_reg_h, y_reg, y_reg_h, base_y, base_y_h;
803*77c1e3ccSAndroid Build Coastguard Worker __m128i mask, mask_h, mul16, mul16_h;
804*77c1e3ccSAndroid Build Coastguard Worker r6 = _mm_set1_epi16(r << 6);
805*77c1e3ccSAndroid Build Coastguard Worker c_reg = _mm_add_epi16(reg_j, c1234);
806*77c1e3ccSAndroid Build Coastguard Worker c_reg_h = _mm_add_epi16(reg_j, c1234_h);
807*77c1e3ccSAndroid Build Coastguard Worker mul16 = _mm_min_epu16(_mm_mullo_epi16(c_reg, dy256),
808*77c1e3ccSAndroid Build Coastguard Worker _mm_srli_epi16(min_y_base, 1));
809*77c1e3ccSAndroid Build Coastguard Worker mul16_h = _mm_min_epu16(_mm_mullo_epi16(c_reg_h, dy256),
810*77c1e3ccSAndroid Build Coastguard Worker _mm_srli_epi16(min_y_base, 1));
811*77c1e3ccSAndroid Build Coastguard Worker y_reg = _mm_sub_epi16(r6, mul16);
812*77c1e3ccSAndroid Build Coastguard Worker y_reg_h = _mm_sub_epi16(r6, mul16_h);
813*77c1e3ccSAndroid Build Coastguard Worker
814*77c1e3ccSAndroid Build Coastguard Worker base_y = _mm_srai_epi16(y_reg, frac_bits_y);
815*77c1e3ccSAndroid Build Coastguard Worker base_y_h = _mm_srai_epi16(y_reg_h, frac_bits_y);
816*77c1e3ccSAndroid Build Coastguard Worker mask = _mm_cmpgt_epi16(min_y_base, base_y);
817*77c1e3ccSAndroid Build Coastguard Worker mask_h = _mm_cmpgt_epi16(min_y_base, base_y_h);
818*77c1e3ccSAndroid Build Coastguard Worker
819*77c1e3ccSAndroid Build Coastguard Worker base_y = _mm_blendv_epi8(base_y, min_y_base, mask);
820*77c1e3ccSAndroid Build Coastguard Worker base_y_h = _mm_blendv_epi8(base_y_h, min_y_base, mask_h);
821*77c1e3ccSAndroid Build Coastguard Worker int16_t min_y = (int16_t)_mm_extract_epi16(base_y_h, 7);
822*77c1e3ccSAndroid Build Coastguard Worker int16_t max_y = (int16_t)_mm_extract_epi16(base_y, 0);
823*77c1e3ccSAndroid Build Coastguard Worker int16_t offset_diff = max_y - min_y;
824*77c1e3ccSAndroid Build Coastguard Worker
825*77c1e3ccSAndroid Build Coastguard Worker if (offset_diff < 16) {
826*77c1e3ccSAndroid Build Coastguard Worker __m128i min_y_reg = _mm_set1_epi16(min_y);
827*77c1e3ccSAndroid Build Coastguard Worker
828*77c1e3ccSAndroid Build Coastguard Worker __m128i base_y_offset = _mm_sub_epi16(base_y, min_y_reg);
829*77c1e3ccSAndroid Build Coastguard Worker __m128i base_y_offset_h = _mm_sub_epi16(base_y_h, min_y_reg);
830*77c1e3ccSAndroid Build Coastguard Worker __m128i y_offset = _mm_packs_epi16(base_y_offset, base_y_offset_h);
831*77c1e3ccSAndroid Build Coastguard Worker
832*77c1e3ccSAndroid Build Coastguard Worker __m128i a0_mask = _mm_loadu_si128((__m128i *)(left + min_y));
833*77c1e3ccSAndroid Build Coastguard Worker __m128i a1_mask = _mm_loadu_si128((__m128i *)(left + min_y + 1));
834*77c1e3ccSAndroid Build Coastguard Worker __m128i LoadMask =
835*77c1e3ccSAndroid Build Coastguard Worker _mm_loadu_si128((__m128i *)(LoadMaskz2[offset_diff / 4]));
836*77c1e3ccSAndroid Build Coastguard Worker
837*77c1e3ccSAndroid Build Coastguard Worker a0_mask = _mm_and_si128(a0_mask, LoadMask);
838*77c1e3ccSAndroid Build Coastguard Worker a1_mask = _mm_and_si128(a1_mask, LoadMask);
839*77c1e3ccSAndroid Build Coastguard Worker
840*77c1e3ccSAndroid Build Coastguard Worker a0_mask = _mm_shuffle_epi8(a0_mask, y_offset);
841*77c1e3ccSAndroid Build Coastguard Worker a1_mask = _mm_shuffle_epi8(a1_mask, y_offset);
842*77c1e3ccSAndroid Build Coastguard Worker a0_y = _mm_cvtepu8_epi16(a0_mask);
843*77c1e3ccSAndroid Build Coastguard Worker a1_y = _mm_cvtepu8_epi16(a1_mask);
844*77c1e3ccSAndroid Build Coastguard Worker a0_y_h = _mm_cvtepu8_epi16(_mm_srli_si128(a0_mask, 8));
845*77c1e3ccSAndroid Build Coastguard Worker a1_y_h = _mm_cvtepu8_epi16(_mm_srli_si128(a1_mask, 8));
846*77c1e3ccSAndroid Build Coastguard Worker } else {
847*77c1e3ccSAndroid Build Coastguard Worker base_y = _mm_andnot_si128(mask, base_y);
848*77c1e3ccSAndroid Build Coastguard Worker base_y_h = _mm_andnot_si128(mask_h, base_y_h);
849*77c1e3ccSAndroid Build Coastguard Worker _mm_store_si128((__m128i *)base_y_c, base_y);
850*77c1e3ccSAndroid Build Coastguard Worker _mm_store_si128((__m128i *)&base_y_c[8], base_y_h);
851*77c1e3ccSAndroid Build Coastguard Worker
852*77c1e3ccSAndroid Build Coastguard Worker a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
853*77c1e3ccSAndroid Build Coastguard Worker left[base_y_c[2]], left[base_y_c[3]],
854*77c1e3ccSAndroid Build Coastguard Worker left[base_y_c[4]], left[base_y_c[5]],
855*77c1e3ccSAndroid Build Coastguard Worker left[base_y_c[6]], left[base_y_c[7]]);
856*77c1e3ccSAndroid Build Coastguard Worker a0_y_h = _mm_setr_epi16(left[base_y_c[8]], left[base_y_c[9]],
857*77c1e3ccSAndroid Build Coastguard Worker left[base_y_c[10]], left[base_y_c[11]],
858*77c1e3ccSAndroid Build Coastguard Worker left[base_y_c[12]], left[base_y_c[13]],
859*77c1e3ccSAndroid Build Coastguard Worker left[base_y_c[14]], left[base_y_c[15]]);
860*77c1e3ccSAndroid Build Coastguard Worker base_y = _mm_add_epi16(base_y, c1);
861*77c1e3ccSAndroid Build Coastguard Worker base_y_h = _mm_add_epi16(base_y_h, c1);
862*77c1e3ccSAndroid Build Coastguard Worker _mm_store_si128((__m128i *)base_y_c, base_y);
863*77c1e3ccSAndroid Build Coastguard Worker _mm_store_si128((__m128i *)&base_y_c[8], base_y_h);
864*77c1e3ccSAndroid Build Coastguard Worker
865*77c1e3ccSAndroid Build Coastguard Worker a1_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
866*77c1e3ccSAndroid Build Coastguard Worker left[base_y_c[2]], left[base_y_c[3]],
867*77c1e3ccSAndroid Build Coastguard Worker left[base_y_c[4]], left[base_y_c[5]],
868*77c1e3ccSAndroid Build Coastguard Worker left[base_y_c[6]], left[base_y_c[7]]);
869*77c1e3ccSAndroid Build Coastguard Worker a1_y_h = _mm_setr_epi16(left[base_y_c[8]], left[base_y_c[9]],
870*77c1e3ccSAndroid Build Coastguard Worker left[base_y_c[10]], left[base_y_c[11]],
871*77c1e3ccSAndroid Build Coastguard Worker left[base_y_c[12]], left[base_y_c[13]],
872*77c1e3ccSAndroid Build Coastguard Worker left[base_y_c[14]], left[base_y_c[15]]);
873*77c1e3ccSAndroid Build Coastguard Worker }
874*77c1e3ccSAndroid Build Coastguard Worker shifty = _mm_srli_epi16(_mm_and_si128(y_reg, c3f), 1);
875*77c1e3ccSAndroid Build Coastguard Worker shifty_h = _mm_srli_epi16(_mm_and_si128(y_reg_h, c3f), 1);
876*77c1e3ccSAndroid Build Coastguard Worker
877*77c1e3ccSAndroid Build Coastguard Worker diff = _mm_sub_epi16(a1_y, a0_y); // a[x+1] - a[x]
878*77c1e3ccSAndroid Build Coastguard Worker a32 = _mm_slli_epi16(a0_y, 5); // a[x] * 32
879*77c1e3ccSAndroid Build Coastguard Worker a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16
880*77c1e3ccSAndroid Build Coastguard Worker
881*77c1e3ccSAndroid Build Coastguard Worker b = _mm_mullo_epi16(diff, shifty);
882*77c1e3ccSAndroid Build Coastguard Worker res = _mm_add_epi16(a32, b);
883*77c1e3ccSAndroid Build Coastguard Worker res = _mm_srli_epi16(res, 5); // 16 16-bit values
884*77c1e3ccSAndroid Build Coastguard Worker
885*77c1e3ccSAndroid Build Coastguard Worker diff = _mm_sub_epi16(a1_y_h, a0_y_h); // a[x+1] - a[x]
886*77c1e3ccSAndroid Build Coastguard Worker a32 = _mm_slli_epi16(a0_y_h, 5); // a[x] * 32
887*77c1e3ccSAndroid Build Coastguard Worker a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16
888*77c1e3ccSAndroid Build Coastguard Worker
889*77c1e3ccSAndroid Build Coastguard Worker b = _mm_mullo_epi16(diff, shifty_h);
890*77c1e3ccSAndroid Build Coastguard Worker res1 = _mm_add_epi16(a32, b);
891*77c1e3ccSAndroid Build Coastguard Worker res1 = _mm_srli_epi16(res1, 5); // 16 16-bit values
892*77c1e3ccSAndroid Build Coastguard Worker resy = _mm_packus_epi16(res, res1);
893*77c1e3ccSAndroid Build Coastguard Worker } else {
894*77c1e3ccSAndroid Build Coastguard Worker resy = _mm_setzero_si128();
895*77c1e3ccSAndroid Build Coastguard Worker }
896*77c1e3ccSAndroid Build Coastguard Worker resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)Mask[0][base_min_diff]);
897*77c1e3ccSAndroid Build Coastguard Worker _mm_storeu_si128((__m128i *)(dst + j), resxy);
898*77c1e3ccSAndroid Build Coastguard Worker } // for j
899*77c1e3ccSAndroid Build Coastguard Worker dst += stride;
900*77c1e3ccSAndroid Build Coastguard Worker }
901*77c1e3ccSAndroid Build Coastguard Worker }
902*77c1e3ccSAndroid Build Coastguard Worker
// Directional prediction, zone 2: 90 < angle < 180
//
// Entry point that selects a width-specific implementation from `bw`:
// widths 4 and 8 have dedicated routines, every other width goes through the
// generic HxW routine. Both dx and dy are asserted to be strictly positive.
void av1_dr_prediction_z2_sse4_1(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
                                 const uint8_t *above, const uint8_t *left,
                                 int upsample_above, int upsample_left, int dx,
                                 int dy) {
  assert(dx > 0);
  assert(dy > 0);

  if (bw == 4) {
    dr_prediction_z2_Nx4_sse4_1(bh, dst, stride, above, left, upsample_above,
                                upsample_left, dx, dy);
  } else if (bw == 8) {
    dr_prediction_z2_Nx8_sse4_1(bh, dst, stride, above, left, upsample_above,
                                upsample_left, dx, dy);
  } else {
    // Generic path; unlike the two cases above it also receives the width.
    dr_prediction_z2_HxW_sse4_1(bh, bw, dst, stride, above, left,
                                upsample_above, upsample_left, dx, dy);
  }
}
925*77c1e3ccSAndroid Build Coastguard Worker
926*77c1e3ccSAndroid Build Coastguard Worker // z3 functions
// 4x4 variant: runs the z1 internal helper on `left` with step `dy`, passes
// the four result vectors through transpose4x8_8x4_low_sse2, and writes the
// low 4 bytes of each output vector to rows 0..3 of dst.
//
// The four bytes of a row are stored one at a time instead of through a
// `*(int *)` cast: dst is a uint8_t buffer with no alignment guarantee visible
// here, so an int-typed store would be an unaligned, type-punned access.
static void dr_prediction_z3_4x4_sse4_1(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *left, int upsample_left,
                                        int dy) {
  __m128i dstvec[4], d[4];

  dr_prediction_z1_HxW_internal_sse4_1(4, 4, dstvec, left, upsample_left, dy);
  transpose4x8_8x4_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3],
                            &d[0], &d[1], &d[2], &d[3]);

  for (int i = 0; i < 4; i++) {
    // Low 32 bits of the vector; byte 0 is the least significant one, which
    // is the byte order a 32-bit store produces on x86 (little-endian).
    const uint32_t px = (uint32_t)_mm_cvtsi128_si32(d[i]);
    uint8_t *const row = dst + stride * i;
    row[0] = (uint8_t)(px & 0xff);
    row[1] = (uint8_t)((px >> 8) & 0xff);
    row[2] = (uint8_t)((px >> 16) & 0xff);
    row[3] = (uint8_t)(px >> 24);
  }
}
942*77c1e3ccSAndroid Build Coastguard Worker
// 8x8 variant: runs the z1 internal helper on `left` with step `dy`, passes
// the eight result vectors through transpose8x8_sse2 (four output vectors),
// and writes eight 8-byte rows to dst.
static void dr_prediction_z3_8x8_sse4_1(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *left, int upsample_left,
                                        int dy) {
  __m128i pred[8];
  __m128i packed[8];

  dr_prediction_z1_HxW_internal_sse4_1(8, 8, pred, left, upsample_left, dy);
  transpose8x8_sse2(pred + 0, pred + 1, pred + 2, pred + 3, pred + 4, pred + 5,
                    pred + 6, pred + 7, packed + 0, packed + 1, packed + 2,
                    packed + 3);

  // Each output vector supplies two rows: its low 8 bytes go to the even row,
  // its high 8 bytes to the following odd row.
  for (int pair = 0; pair < 4; ++pair) {
    uint8_t *const even_row = dst + (2 * pair) * stride;
    uint8_t *const odd_row = dst + (2 * pair + 1) * stride;
    _mm_storel_epi64((__m128i *)even_row, packed[pair]);
    _mm_storel_epi64((__m128i *)odd_row, _mm_srli_si128(packed[pair], 8));
  }
}
962*77c1e3ccSAndroid Build Coastguard Worker
// 4x8 variant: runs the z1 internal helper on `left` with step `dy`, passes
// the four result vectors through transpose4x8_8x4_sse2 (eight output
// vectors), and writes the low 4 bytes of each output to rows 0..7 of dst.
//
// The four bytes of a row are stored one at a time instead of through a
// `*(int *)` cast: dst is a uint8_t buffer with no alignment guarantee visible
// here, so an int-typed store would be an unaligned, type-punned access.
static void dr_prediction_z3_4x8_sse4_1(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *left, int upsample_left,
                                        int dy) {
  __m128i dstvec[4], d[8];

  dr_prediction_z1_HxW_internal_sse4_1(8, 4, dstvec, left, upsample_left, dy);
  transpose4x8_8x4_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &d[0],
                        &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], &d[7]);
  for (int i = 0; i < 8; i++) {
    // Low 32 bits of the vector; byte 0 is the least significant one, which
    // is the byte order a 32-bit store produces on x86 (little-endian).
    const uint32_t px = (uint32_t)_mm_cvtsi128_si32(d[i]);
    uint8_t *const row = dst + stride * i;
    row[0] = (uint8_t)(px & 0xff);
    row[1] = (uint8_t)((px >> 8) & 0xff);
    row[2] = (uint8_t)((px >> 16) & 0xff);
    row[3] = (uint8_t)(px >> 24);
  }
}
975*77c1e3ccSAndroid Build Coastguard Worker
// 8x4 variant: runs the z1 internal helper on `left` with step `dy`, passes
// the eight result vectors through transpose8x8_low_sse2 (four output
// vectors), and writes the low 8 bytes of each output to rows 0..3 of dst.
static void dr_prediction_z3_8x4_sse4_1(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *left, int upsample_left,
                                        int dy) {
  __m128i pred[8];
  __m128i rows[4];

  dr_prediction_z1_HxW_internal_sse4_1(4, 8, pred, left, upsample_left, dy);
  transpose8x8_low_sse2(pred + 0, pred + 1, pred + 2, pred + 3, pred + 4,
                        pred + 5, pred + 6, pred + 7, rows + 0, rows + 1,
                        rows + 2, rows + 3);

  for (int r = 0; r < 4; ++r) {
    _mm_storel_epi64((__m128i *)(dst + r * stride), rows[r]);
  }
}
990*77c1e3ccSAndroid Build Coastguard Worker
// 8x16 variant: runs the z1 internal helper on `left` with step `dy`, passes
// the eight result vectors through transpose8x16_16x8_sse2 (eight output
// vectors), and writes sixteen 8-byte rows to dst.
static void dr_prediction_z3_8x16_sse4_1(uint8_t *dst, ptrdiff_t stride,
                                         const uint8_t *left, int upsample_left,
                                         int dy) {
  __m128i pred[8];
  __m128i packed[8];

  dr_prediction_z1_HxW_internal_sse4_1(16, 8, pred, left, upsample_left, dy);
  transpose8x16_16x8_sse2(&pred[0], &pred[1], &pred[2], &pred[3], &pred[4],
                          &pred[5], &pred[6], &pred[7], &packed[0], &packed[1],
                          &packed[2], &packed[3], &packed[4], &packed[5],
                          &packed[6], &packed[7]);

  // Output vector k holds row k in its low 8 bytes and row k + 8 in its high
  // 8 bytes.
  for (int k = 0; k < 8; ++k) {
    const __m128i lo_hi = packed[k];
    uint8_t *const upper_row = dst + k * stride;
    uint8_t *const lower_row = dst + (k + 8) * stride;
    _mm_storel_epi64((__m128i *)upper_row, lo_hi);
    _mm_storel_epi64((__m128i *)lower_row, _mm_srli_si128(lo_hi, 8));
  }
}
1006*77c1e3ccSAndroid Build Coastguard Worker
// 16x8 variant: runs the z1 internal helper on `left` with step `dy`, passes
// the sixteen result vectors through transpose16x8_8x16_sse2 (eight output
// vectors), and writes each output as a full 16-byte row of dst.
static void dr_prediction_z3_16x8_sse4_1(uint8_t *dst, ptrdiff_t stride,
                                         const uint8_t *left, int upsample_left,
                                         int dy) {
  __m128i pred[16];
  __m128i rows[16];  // only rows[0..7] are produced and stored

  dr_prediction_z1_HxW_internal_sse4_1(8, 16, pred, left, upsample_left, dy);
  transpose16x8_8x16_sse2(pred + 0, pred + 1, pred + 2, pred + 3, pred + 4,
                          pred + 5, pred + 6, pred + 7, pred + 8, pred + 9,
                          pred + 10, pred + 11, pred + 12, pred + 13,
                          pred + 14, pred + 15, rows + 0, rows + 1, rows + 2,
                          rows + 3, rows + 4, rows + 5, rows + 6, rows + 7);

  for (int r = 0; r < 8; ++r) {
    _mm_storeu_si128((__m128i *)(dst + r * stride), rows[r]);
  }
}
1023*77c1e3ccSAndroid Build Coastguard Worker
1024*77c1e3ccSAndroid Build Coastguard Worker #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
// 4x16 variant: runs the z1 internal helper on `left` with step `dy`, passes
// the four result vectors through transpose4x16_sse2 (sixteen output
// vectors), and writes the low 4 bytes of each output to rows 0..15 of dst.
//
// The four bytes of a row are stored one at a time instead of through a
// `*(int *)` cast: dst is a uint8_t buffer with no alignment guarantee visible
// here, so an int-typed store would be an unaligned, type-punned access.
static void dr_prediction_z3_4x16_sse4_1(uint8_t *dst, ptrdiff_t stride,
                                         const uint8_t *left, int upsample_left,
                                         int dy) {
  __m128i dstvec[4], d[16];

  dr_prediction_z1_HxW_internal_sse4_1(16, 4, dstvec, left, upsample_left, dy);
  transpose4x16_sse2(dstvec, d);
  for (int i = 0; i < 16; i++) {
    // Low 32 bits of the vector; byte 0 is the least significant one, which
    // is the byte order a 32-bit store produces on x86 (little-endian).
    const uint32_t px = (uint32_t)_mm_cvtsi128_si32(d[i]);
    uint8_t *const row = dst + stride * i;
    row[0] = (uint8_t)(px & 0xff);
    row[1] = (uint8_t)((px >> 8) & 0xff);
    row[2] = (uint8_t)((px >> 16) & 0xff);
    row[3] = (uint8_t)(px >> 24);
  }
}
1036*77c1e3ccSAndroid Build Coastguard Worker
// 16x4 variant: runs the z1 internal helper on `left` with step `dy`, passes
// the sixteen result vectors through transpose16x8_8x16_sse2, and writes the
// first four output vectors as 16-byte rows 0..3 of dst.
static void dr_prediction_z3_16x4_sse4_1(uint8_t *dst, ptrdiff_t stride,
                                         const uint8_t *left, int upsample_left,
                                         int dy) {
  __m128i pred[16];
  __m128i rows[8];

  dr_prediction_z1_HxW_internal_sse4_1(4, 16, pred, left, upsample_left, dy);

  // The upper four outputs are cleared before the transpose call; only
  // rows[0..3] are stored below.
  const __m128i zero = _mm_setzero_si128();
  rows[4] = zero;
  rows[5] = zero;
  rows[6] = zero;
  rows[7] = zero;

  transpose16x8_8x16_sse2(pred + 0, pred + 1, pred + 2, pred + 3, pred + 4,
                          pred + 5, pred + 6, pred + 7, pred + 8, pred + 9,
                          pred + 10, pred + 11, pred + 12, pred + 13,
                          pred + 14, pred + 15, rows + 0, rows + 1, rows + 2,
                          rows + 3, rows + 4, rows + 5, rows + 6, rows + 7);

  for (int r = 0; r < 4; ++r) {
    _mm_storeu_si128((__m128i *)(dst + r * stride), rows[r]);
  }
}
1056*77c1e3ccSAndroid Build Coastguard Worker
// 8x32 variant: runs the 32xN z1 internal helper (N = 8) on `left` with step
// `dy`, which fills two vector arrays. Entries 8..15 of both arrays are
// zeroed so each can go through a full 16x16 transpose; the low 8 bytes of
// the transposed vectors become rows 0..15 and rows 16..31 of dst.
static void dr_prediction_z3_8x32_sse4_1(uint8_t *dst, ptrdiff_t stride,
                                         const uint8_t *left, int upsample_left,
                                         int dy) {
  __m128i pred_lo[16], pred_hi[16];
  __m128i rows_top[16], rows_bottom[16];

  dr_prediction_z1_32xN_internal_sse4_1(8, pred_lo, pred_hi, left,
                                        upsample_left, dy);

  // Pad the unused second half of each input array with zeros.
  const __m128i zero = _mm_setzero_si128();
  for (int k = 8; k < 16; ++k) {
    pred_lo[k] = zero;
    pred_hi[k] = zero;
  }

  transpose16x16_sse2(pred_lo, rows_top);
  transpose16x16_sse2(pred_hi, rows_bottom);

  for (int r = 0; r < 16; ++r) {
    _mm_storel_epi64((__m128i *)(dst + r * stride), rows_top[r]);
  }
  for (int r = 0; r < 16; ++r) {
    _mm_storel_epi64((__m128i *)(dst + (r + 16) * stride), rows_bottom[r]);
  }
}
1078*77c1e3ccSAndroid Build Coastguard Worker
// 32x8 variant: runs the z1 internal helper on `left` with step `dy` to get
// 32 vectors, transposes them in two groups of sixteen with
// transpose16x8_8x16_sse2, and writes each of the 8 rows of dst as two
// 16-byte halves (first group at offset 0, second group at offset 16).
static void dr_prediction_z3_32x8_sse4_1(uint8_t *dst, ptrdiff_t stride,
                                         const uint8_t *left, int upsample_left,
                                         int dy) {
  __m128i pred[32];
  __m128i rows[16];

  dr_prediction_z1_HxW_internal_sse4_1(8, 32, pred, left, upsample_left, dy);

  // Group 0: pred[0..15] -> rows[0..7]; group 1: pred[16..31] -> rows[8..15].
  for (int half = 0; half < 2; ++half) {
    __m128i *const in = pred + 16 * half;
    __m128i *const out = rows + 8 * half;
    transpose16x8_8x16_sse2(in + 0, in + 1, in + 2, in + 3, in + 4, in + 5,
                            in + 6, in + 7, in + 8, in + 9, in + 10, in + 11,
                            in + 12, in + 13, in + 14, in + 15, out + 0,
                            out + 1, out + 2, out + 3, out + 4, out + 5,
                            out + 6, out + 7);
  }

  for (int r = 0; r < 8; ++r) {
    uint8_t *const row = dst + r * stride;
    _mm_storeu_si128((__m128i *)row, rows[r]);
    _mm_storeu_si128((__m128i *)(row + 16), rows[r + 8]);
  }
}
1104*77c1e3ccSAndroid Build Coastguard Worker #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
1105*77c1e3ccSAndroid Build Coastguard Worker
// 16x16 variant: runs the z1 internal helper on `left` with step `dy`, passes
// the sixteen result vectors through transpose16x16_sse2, and writes each
// output vector as a 16-byte row of dst.
static void dr_prediction_z3_16x16_sse4_1(uint8_t *dst, ptrdiff_t stride,
                                          const uint8_t *left,
                                          int upsample_left, int dy) {
  __m128i pred[16];
  __m128i rows[16];

  dr_prediction_z1_HxW_internal_sse4_1(16, 16, pred, left, upsample_left, dy);
  transpose16x16_sse2(pred, rows);

  for (int r = 0; r < 16; ++r) {
    uint8_t *const row = dst + r * stride;
    _mm_storeu_si128((__m128i *)row, rows[r]);
  }
}
1118*77c1e3ccSAndroid Build Coastguard Worker
// 32x32 variant: runs the 32xN z1 internal helper (N = 32) on `left` with
// step `dy`, which fills two 32-entry vector arrays. Each array is transposed
// as two independent 16x16 tiles, and the four tiles are written to the four
// 16x16 quadrants of dst.
static void dr_prediction_z3_32x32_sse4_1(uint8_t *dst, ptrdiff_t stride,
                                          const uint8_t *left,
                                          int upsample_left, int dy) {
  __m128i pred_lo[32], pred_hi[32];
  __m128i top[32], bottom[32];

  dr_prediction_z1_32xN_internal_sse4_1(32, pred_lo, pred_hi, left,
                                        upsample_left, dy);

  transpose16x16_sse2(pred_lo, top);
  transpose16x16_sse2(pred_hi, bottom);
  transpose16x16_sse2(pred_lo + 16, top + 16);
  transpose16x16_sse2(pred_hi + 16, bottom + 16);

  // Rows 0..15: left half from top[0..15], right half from top[16..31].
  for (int r = 0; r < 16; ++r) {
    uint8_t *const row = dst + r * stride;
    _mm_storeu_si128((__m128i *)row, top[r]);
    _mm_storeu_si128((__m128i *)(row + 16), top[r + 16]);
  }
  // Rows 16..31: same layout, taken from the second array.
  for (int r = 0; r < 16; ++r) {
    uint8_t *const row = dst + (r + 16) * stride;
    _mm_storeu_si128((__m128i *)row, bottom[r]);
    _mm_storeu_si128((__m128i *)(row + 16), bottom[r + 16]);
  }
}
1139*77c1e3ccSAndroid Build Coastguard Worker
// 64x64 variant: runs the 64xN z1 routine (N = 64) on `left` with step `dy`
// into a local scratch buffer with a 64-byte stride, then calls transpose()
// to move the result from the scratch buffer into dst.
static void dr_prediction_z3_64x64_sse4_1(uint8_t *dst, ptrdiff_t stride,
                                          const uint8_t *left,
                                          int upsample_left, int dy) {
  uint8_t scratch[64 * 64];

  dr_prediction_z1_64xN_sse4_1(64, scratch, 64, left, upsample_left, dy);
  transpose(scratch, 64, dst, stride, 64, 64);
}
1147*77c1e3ccSAndroid Build Coastguard Worker
// 16x32 variant: runs the 32xN z1 internal helper (N = 16) on `left` with
// step `dy`, which fills two 16-entry vector arrays. Each array goes through
// transpose16x16_sse2; the first result supplies rows 0..15 of dst and the
// second supplies rows 16..31.
static void dr_prediction_z3_16x32_sse4_1(uint8_t *dst, ptrdiff_t stride,
                                          const uint8_t *left,
                                          int upsample_left, int dy) {
  __m128i pred_lo[16], pred_hi[16];
  __m128i rows_top[16], rows_bottom[16];

  dr_prediction_z1_32xN_internal_sse4_1(16, pred_lo, pred_hi, left,
                                        upsample_left, dy);
  transpose16x16_sse2(pred_lo, rows_top);
  transpose16x16_sse2(pred_hi, rows_bottom);

  // Write row r and row r + 16 together.
  for (int r = 0; r < 16; ++r) {
    uint8_t *const upper = dst + r * stride;
    uint8_t *const lower = dst + (r + 16) * stride;
    _mm_storeu_si128((__m128i *)upper, rows_top[r]);
    _mm_storeu_si128((__m128i *)lower, rows_bottom[r]);
  }
}
1163*77c1e3ccSAndroid Build Coastguard Worker
// 32x16 variant: runs the z1 internal helper on `left` with step `dy` to get
// 32 vectors, then handles them as two groups of sixteen. Each group goes
// through transpose16x16_sse2 and is written as a 16-byte-wide column band of
// dst, at byte offset 0 and 16 respectively.
static void dr_prediction_z3_32x16_sse4_1(uint8_t *dst, ptrdiff_t stride,
                                          const uint8_t *left,
                                          int upsample_left, int dy) {
  __m128i pred[32];
  __m128i tile[16];  // reused for both groups

  dr_prediction_z1_HxW_internal_sse4_1(16, 32, pred, left, upsample_left, dy);

  for (int band = 0; band < 2; ++band) {
    const int col = 16 * band;
    transpose16x16_sse2(pred + col, tile);
    for (int r = 0; r < 16; ++r) {
      _mm_storeu_si128((__m128i *)(dst + r * stride + col), tile[r]);
    }
  }
}
1177*77c1e3ccSAndroid Build Coastguard Worker
// 32x64 variant: runs the 64xN z1 routine (N = 32) on `left` with step `dy`
// into a local scratch buffer with a 64-byte stride, then calls transpose()
// to move the result from the scratch buffer into dst.
static void dr_prediction_z3_32x64_sse4_1(uint8_t *dst, ptrdiff_t stride,
                                          const uint8_t *left,
                                          int upsample_left, int dy) {
  uint8_t scratch[64 * 32];

  dr_prediction_z1_64xN_sse4_1(32, scratch, 64, left, upsample_left, dy);
  transpose(scratch, 64, dst, stride, 32, 64);
}
1185*77c1e3ccSAndroid Build Coastguard Worker
// 64x32 variant: runs the 32xN z1 routine (N = 64) on `left` with step `dy`
// into a local scratch buffer with a 32-byte stride, then calls transpose()
// to move the result from the scratch buffer into dst.
static void dr_prediction_z3_64x32_sse4_1(uint8_t *dst, ptrdiff_t stride,
                                          const uint8_t *left,
                                          int upsample_left, int dy) {
  uint8_t scratch[32 * 64];

  dr_prediction_z1_32xN_sse4_1(64, scratch, 32, left, upsample_left, dy);
  transpose(scratch, 32, dst, stride, 64, 32);
}
1194*77c1e3ccSAndroid Build Coastguard Worker
1195*77c1e3ccSAndroid Build Coastguard Worker #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
// z3 prediction for a 16-wide, 64-tall block.
//
// Runs the 64xN z1 predictor (N = 16) on the `left` edge into a temporary
// buffer with a stride of 64, then hands that buffer to transpose() to produce
// the final block in `dst`.
static void dr_prediction_z3_16x64_sse4_1(uint8_t *dst, ptrdiff_t stride,
                                          const uint8_t *left,
                                          int upsample_left, int dy) {
  enum { kBlockW = 16, kBlockH = 64 };
  uint8_t transposed[kBlockH * kBlockW];

  dr_prediction_z1_64xN_sse4_1(kBlockW, transposed, kBlockH, left,
                               upsample_left, dy);
  transpose(transposed, kBlockH, dst, stride, kBlockW, kBlockH);
}
1203*77c1e3ccSAndroid Build Coastguard Worker
// z3 prediction for a 64-wide, 16-tall block.
//
// The internal z1 helper fills 64 vectors of 16 bytes from the `left` edge.
// They are consumed 16 at a time: each group goes through a 16x16 transpose
// and is stored as 16 rows of 16 bytes, starting at the column offset of that
// group within `dst`.
static void dr_prediction_z3_64x16_sse4_1(uint8_t *dst, ptrdiff_t stride,
                                          const uint8_t *left,
                                          int upsample_left, int dy) {
  __m128i pred[64];
  __m128i tile[16];

  dr_prediction_z1_HxW_internal_sse4_1(16, 64, pred, left, upsample_left, dy);

  for (int col = 0; col < 64; col += 16) {
    // Top-left corner of the 16x16 tile being written in this pass.
    uint8_t *const tile_dst = dst + col;

    transpose16x16_sse2(pred + col, tile);
    for (int row = 0; row < 16; ++row) {
      // Unaligned store: no alignment is assumed for dst or stride.
      _mm_storeu_si128((__m128i *)(tile_dst + row * stride), tile[row]);
    }
  }
}
1217*77c1e3ccSAndroid Build Coastguard Worker #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
1218*77c1e3ccSAndroid Build Coastguard Worker
// Entry point for SSE4.1 z3 directional prediction: picks the kernel that
// matches the bw x bh block and forwards dst, stride, left, upsample_left and
// dy to it.
//
// `above` and `dx` are not used; debug builds assert dx == 1 and dy > 0.
//
// Selection rules:
//   - bw == bh          : square kernels for 4, 8, 16, 32, 64.
//   - bh == 2 * bw      : 4x8, 8x16, 16x32, 32x64.
//   - other bw < bh     : chosen by bw alone -> 4x16, 8x32, 16x64.
//   - bw == 2 * bh      : 8x4, 16x8, 32x16, 64x32.
//   - other bw > bh     : chosen by bh alone -> 16x4, 32x8, 64x16.
// The two "other" groups are only built when
// !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER; otherwise nothing is written
// for those shapes. An unmatched size trips an assert in debug builds and is a
// no-op otherwise.
void av1_dr_prediction_z3_sse4_1(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
                                 const uint8_t *above, const uint8_t *left,
                                 int upsample_left, int dx, int dy) {
  (void)above;
  (void)dx;
  assert(dx == 1);
  assert(dy > 0);

  // Square blocks.
  if (bw == bh) {
    switch (bw) {
      case 4:
        dr_prediction_z3_4x4_sse4_1(dst, stride, left, upsample_left, dy);
        break;
      case 8:
        dr_prediction_z3_8x8_sse4_1(dst, stride, left, upsample_left, dy);
        break;
      case 16:
        dr_prediction_z3_16x16_sse4_1(dst, stride, left, upsample_left, dy);
        break;
      case 32:
        dr_prediction_z3_32x32_sse4_1(dst, stride, left, upsample_left, dy);
        break;
      case 64:
        dr_prediction_z3_64x64_sse4_1(dst, stride, left, upsample_left, dy);
        break;
      default: assert(0 && "Invalid block size");
    }
    return;
  }

  // Tall blocks (bw < bh).
  if (bw < bh) {
    if (bh == 2 * bw) {
      switch (bw) {
        case 4:
          dr_prediction_z3_4x8_sse4_1(dst, stride, left, upsample_left, dy);
          break;
        case 8:
          dr_prediction_z3_8x16_sse4_1(dst, stride, left, upsample_left, dy);
          break;
        case 16:
          dr_prediction_z3_16x32_sse4_1(dst, stride, left, upsample_left, dy);
          break;
        case 32:
          dr_prediction_z3_32x64_sse4_1(dst, stride, left, upsample_left, dy);
          break;
        default: assert(0 && "Invalid block size");
      }
      return;
    }

    // Every remaining tall shape is dispatched on the width only.
#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
    switch (bw) {
      case 4:
        dr_prediction_z3_4x16_sse4_1(dst, stride, left, upsample_left, dy);
        break;
      case 8:
        dr_prediction_z3_8x32_sse4_1(dst, stride, left, upsample_left, dy);
        break;
      case 16:
        dr_prediction_z3_16x64_sse4_1(dst, stride, left, upsample_left, dy);
        break;
      default: assert(0 && "Invalid block size");
    }
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
    return;
  }

  // Wide blocks (bw > bh).
  if (bw == 2 * bh) {
    switch (bh) {
      case 4:
        dr_prediction_z3_8x4_sse4_1(dst, stride, left, upsample_left, dy);
        break;
      case 8:
        dr_prediction_z3_16x8_sse4_1(dst, stride, left, upsample_left, dy);
        break;
      case 16:
        dr_prediction_z3_32x16_sse4_1(dst, stride, left, upsample_left, dy);
        break;
      case 32:
        dr_prediction_z3_64x32_sse4_1(dst, stride, left, upsample_left, dy);
        break;
      default: assert(0 && "Invalid block size");
    }
    return;
  }

  // Every remaining wide shape is dispatched on the height only.
#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
  switch (bh) {
    case 4:
      dr_prediction_z3_16x4_sse4_1(dst, stride, left, upsample_left, dy);
      break;
    case 8:
      dr_prediction_z3_32x8_sse4_1(dst, stride, left, upsample_left, dy);
      break;
    case 16:
      dr_prediction_z3_64x16_sse4_1(dst, stride, left, upsample_left, dy);
      break;
    default: assert(0 && "Invalid block size");
  }
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
}
1316