1 // Copyright 2019 The libgav1 Authors
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "src/dsp/intrapred.h"
16 #include "src/utils/cpu.h"
17
18 #if LIBGAV1_ENABLE_NEON
19
20 #include <arm_neon.h>
21
22 #include <cassert>
23 #include <cstddef>
24 #include <cstdint>
25
26 #include "src/dsp/arm/common_neon.h"
27 #include "src/dsp/constants.h"
28 #include "src/dsp/dsp.h"
29 #include "src/utils/common.h"
30 #include "src/utils/constants.h"
31
32 namespace libgav1 {
33 namespace dsp {
34 namespace {
35
36 //------------------------------------------------------------------------------
37 // DcPredFuncs_NEON
38
39 using DcSumFunc = uint32x2_t (*)(const void* ref_0, const int ref_0_size_log2,
40 const bool use_ref_1, const void* ref_1,
41 const int ref_1_size_log2);
42 using DcStoreFunc = void (*)(void* dest, ptrdiff_t stride, const uint32x2_t dc);
43
44 // DC intra-predictors for square and rectangular blocks.
45 template <int block_width_log2, int block_height_log2, DcSumFunc sumfn,
46 DcStoreFunc storefn>
47 struct DcPredFuncs_NEON {
48 DcPredFuncs_NEON() = delete;
49
50 static void DcTop(void* dest, ptrdiff_t stride, const void* top_row,
51 const void* left_column);
52 static void DcLeft(void* dest, ptrdiff_t stride, const void* top_row,
53 const void* left_column);
54 static void Dc(void* dest, ptrdiff_t stride, const void* top_row,
55 const void* left_column);
56 };
57
58 template <int block_width_log2, int block_height_log2, DcSumFunc sumfn,
59 DcStoreFunc storefn>
60 void DcPredFuncs_NEON<block_width_log2, block_height_log2, sumfn, storefn>::
61     DcTop(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
62 const void* LIBGAV1_RESTRICT const top_row,
63 const void* /*left_column*/) {
64 const uint32x2_t sum = sumfn(top_row, block_width_log2, false, nullptr, 0);
65 const uint32x2_t dc = vrshr_n_u32(sum, block_width_log2);
66 storefn(dest, stride, dc);
67 }
68
69 template <int block_width_log2, int block_height_log2, DcSumFunc sumfn,
70 DcStoreFunc storefn>
71 void DcPredFuncs_NEON<block_width_log2, block_height_log2, sumfn, storefn>::
72     DcLeft(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
73 const void* /*top_row*/,
74 const void* LIBGAV1_RESTRICT const left_column) {
75 const uint32x2_t sum =
76 sumfn(left_column, block_height_log2, false, nullptr, 0);
77 const uint32x2_t dc = vrshr_n_u32(sum, block_height_log2);
78 storefn(dest, stride, dc);
79 }
80
81 template <int block_width_log2, int block_height_log2, DcSumFunc sumfn,
82 DcStoreFunc storefn>
83 void DcPredFuncs_NEON<block_width_log2, block_height_log2, sumfn, storefn>::Dc(
84 void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
85 const void* LIBGAV1_RESTRICT const top_row,
86 const void* LIBGAV1_RESTRICT const left_column) {
87 const uint32x2_t sum =
88 sumfn(top_row, block_width_log2, true, left_column, block_height_log2);
89 if (block_width_log2 == block_height_log2) {
90 const uint32x2_t dc = vrshr_n_u32(sum, block_width_log2 + 1);
91 storefn(dest, stride, dc);
92 } else {
93 // TODO(johannkoenig): Compare this to mul/shift in vectors.
94 const int divisor = (1 << block_width_log2) + (1 << block_height_log2);
95 uint32_t dc = vget_lane_u32(sum, 0);
96 dc += divisor >> 1;
97 dc /= divisor;
98 storefn(dest, stride, vdup_n_u32(dc));
99 }
100 }
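// A worked example of the rectangular-block rounding above (illustrative
// only): the combined sum is divided by width + height with round-half-up,
// i.e. dc = (sum + (divisor >> 1)) / divisor. For a 4x8 block the divisor is
// 4 + 8 = 12, so a sum of 1500 yields (1500 + 6) / 12 = 125.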
101
102 // Sum all the elements in the vector into the low 32 bits.
103 inline uint32x2_t Sum(const uint16x4_t val) {
104 const uint32x2_t sum = vpaddl_u16(val);
105 return vpadd_u32(sum, sum);
106 }
107
108 // Sum all the elements in the vector into the low 32 bits.
109 inline uint32x2_t Sum(const uint16x8_t val) {
110 const uint32x4_t sum_0 = vpaddlq_u16(val);
111 const uint64x2_t sum_1 = vpaddlq_u32(sum_0);
112 return vadd_u32(vget_low_u32(vreinterpretq_u32_u64(sum_1)),
113 vget_high_u32(vreinterpretq_u32_u64(sum_1)));
114 }
115
116 } // namespace
117
118 //------------------------------------------------------------------------------
119 namespace low_bitdepth {
120 namespace {
121
122 // Add and expand the elements in the |val_[01]| to uint16_t but do not sum the
123 // entire vector.
124 inline uint16x8_t Add(const uint8x16_t val_0, const uint8x16_t val_1) {
125 const uint16x8_t sum_0 = vpaddlq_u8(val_0);
126 const uint16x8_t sum_1 = vpaddlq_u8(val_1);
127 return vaddq_u16(sum_0, sum_1);
128 }
129
130 // Add and expand the elements in the |val_[0123]| to uint16_t but do not sum
131 // the entire vector.
132 inline uint16x8_t Add(const uint8x16_t val_0, const uint8x16_t val_1,
133 const uint8x16_t val_2, const uint8x16_t val_3) {
134 const uint16x8_t sum_0 = Add(val_0, val_1);
135 const uint16x8_t sum_1 = Add(val_2, val_3);
136 return vaddq_u16(sum_0, sum_1);
137 }
138
139 // Load and combine 32 uint8_t values.
140 inline uint16x8_t LoadAndAdd32(const uint8_t* buf) {
141 const uint8x16_t val_0 = vld1q_u8(buf);
142 const uint8x16_t val_1 = vld1q_u8(buf + 16);
143 return Add(val_0, val_1);
144 }
145
146 // Load and combine 64 uint8_t values.
147 inline uint16x8_t LoadAndAdd64(const uint8_t* buf) {
148 const uint8x16_t val_0 = vld1q_u8(buf);
149 const uint8x16_t val_1 = vld1q_u8(buf + 16);
150 const uint8x16_t val_2 = vld1q_u8(buf + 32);
151 const uint8x16_t val_3 = vld1q_u8(buf + 48);
152 return Add(val_0, val_1, val_2, val_3);
153 }
154
155 // |ref_[01]| each point to 1 << |ref[01]_size_log2| packed uint8_t values.
156 // If |use_ref_1| is false then only sum |ref_0|.
157 // For |ref[01]_size_log2| == 2 this relies on |ref_[01]| being aligned to
158 // uint32_t, since those references are loaded with a single 32-bit Load4().
159 inline uint32x2_t DcSum_NEON(const void* LIBGAV1_RESTRICT ref_0,
160 const int ref_0_size_log2, const bool use_ref_1,
161 const void* LIBGAV1_RESTRICT ref_1,
162 const int ref_1_size_log2) {
163 const auto* const ref_0_u8 = static_cast<const uint8_t*>(ref_0);
164 const auto* const ref_1_u8 = static_cast<const uint8_t*>(ref_1);
165 if (ref_0_size_log2 == 2) {
166 uint8x8_t val = Load4(ref_0_u8);
167 if (use_ref_1) {
168 switch (ref_1_size_log2) {
169 case 2: { // 4x4
170 val = Load4<1>(ref_1_u8, val);
171 return Sum(vpaddl_u8(val));
172 }
173 case 3: { // 4x8
174 const uint8x8_t val_1 = vld1_u8(ref_1_u8);
175 const uint16x4_t sum_0 = vpaddl_u8(val);
176 const uint16x4_t sum_1 = vpaddl_u8(val_1);
177 return Sum(vadd_u16(sum_0, sum_1));
178 }
179 case 4: { // 4x16
180 const uint8x16_t val_1 = vld1q_u8(ref_1_u8);
181 return Sum(vaddw_u8(vpaddlq_u8(val_1), val));
182 }
183 }
184 }
185 // 4x1
186 const uint16x4_t sum = vpaddl_u8(val);
187 return vpaddl_u16(sum);
188 }
189 if (ref_0_size_log2 == 3) {
190 const uint8x8_t val_0 = vld1_u8(ref_0_u8);
191 if (use_ref_1) {
192 switch (ref_1_size_log2) {
193 case 2: { // 8x4
194 const uint8x8_t val_1 = Load4(ref_1_u8);
195 const uint16x4_t sum_0 = vpaddl_u8(val_0);
196 const uint16x4_t sum_1 = vpaddl_u8(val_1);
197 return Sum(vadd_u16(sum_0, sum_1));
198 }
199 case 3: { // 8x8
200 const uint8x8_t val_1 = vld1_u8(ref_1_u8);
201 const uint16x4_t sum_0 = vpaddl_u8(val_0);
202 const uint16x4_t sum_1 = vpaddl_u8(val_1);
203 return Sum(vadd_u16(sum_0, sum_1));
204 }
205 case 4: { // 8x16
206 const uint8x16_t val_1 = vld1q_u8(ref_1_u8);
207 return Sum(vaddw_u8(vpaddlq_u8(val_1), val_0));
208 }
209 case 5: { // 8x32
210 return Sum(vaddw_u8(LoadAndAdd32(ref_1_u8), val_0));
211 }
212 }
213 }
214 // 8x1
215 return Sum(vpaddl_u8(val_0));
216 }
217 if (ref_0_size_log2 == 4) {
218 const uint8x16_t val_0 = vld1q_u8(ref_0_u8);
219 if (use_ref_1) {
220 switch (ref_1_size_log2) {
221 case 2: { // 16x4
222 const uint8x8_t val_1 = Load4(ref_1_u8);
223 return Sum(vaddw_u8(vpaddlq_u8(val_0), val_1));
224 }
225 case 3: { // 16x8
226 const uint8x8_t val_1 = vld1_u8(ref_1_u8);
227 return Sum(vaddw_u8(vpaddlq_u8(val_0), val_1));
228 }
229 case 4: { // 16x16
230 const uint8x16_t val_1 = vld1q_u8(ref_1_u8);
231 return Sum(Add(val_0, val_1));
232 }
233 case 5: { // 16x32
234 const uint16x8_t sum_0 = vpaddlq_u8(val_0);
235 const uint16x8_t sum_1 = LoadAndAdd32(ref_1_u8);
236 return Sum(vaddq_u16(sum_0, sum_1));
237 }
238 case 6: { // 16x64
239 const uint16x8_t sum_0 = vpaddlq_u8(val_0);
240 const uint16x8_t sum_1 = LoadAndAdd64(ref_1_u8);
241 return Sum(vaddq_u16(sum_0, sum_1));
242 }
243 }
244 }
245 // 16x1
246 return Sum(vpaddlq_u8(val_0));
247 }
248 if (ref_0_size_log2 == 5) {
249 const uint16x8_t sum_0 = LoadAndAdd32(ref_0_u8);
250 if (use_ref_1) {
251 switch (ref_1_size_log2) {
252 case 3: { // 32x8
253 const uint8x8_t val_1 = vld1_u8(ref_1_u8);
254 return Sum(vaddw_u8(sum_0, val_1));
255 }
256 case 4: { // 32x16
257 const uint8x16_t val_1 = vld1q_u8(ref_1_u8);
258 const uint16x8_t sum_1 = vpaddlq_u8(val_1);
259 return Sum(vaddq_u16(sum_0, sum_1));
260 }
261 case 5: { // 32x32
262 const uint16x8_t sum_1 = LoadAndAdd32(ref_1_u8);
263 return Sum(vaddq_u16(sum_0, sum_1));
264 }
265 case 6: { // 32x64
266 const uint16x8_t sum_1 = LoadAndAdd64(ref_1_u8);
267 return Sum(vaddq_u16(sum_0, sum_1));
268 }
269 }
270 }
271 // 32x1
272 return Sum(sum_0);
273 }
274
275 assert(ref_0_size_log2 == 6);
276 const uint16x8_t sum_0 = LoadAndAdd64(ref_0_u8);
277 if (use_ref_1) {
278 switch (ref_1_size_log2) {
279 case 4: { // 64x16
280 const uint8x16_t val_1 = vld1q_u8(ref_1_u8);
281 const uint16x8_t sum_1 = vpaddlq_u8(val_1);
282 return Sum(vaddq_u16(sum_0, sum_1));
283 }
284 case 5: { // 64x32
285 const uint16x8_t sum_1 = LoadAndAdd32(ref_1_u8);
286 return Sum(vaddq_u16(sum_0, sum_1));
287 }
288 case 6: { // 64x64
289 const uint16x8_t sum_1 = LoadAndAdd64(ref_1_u8);
290 return Sum(vaddq_u16(sum_0, sum_1));
291 }
292 }
293 }
294 // 64x1
295 return Sum(sum_0);
296 }
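// For reference, a scalar sketch of what DcSum_NEON computes (illustrative
// only; not part of the build):
//
//   uint32_t sum = 0;
//   for (int i = 0; i < (1 << ref_0_size_log2); ++i) sum += ref_0_u8[i];
//   if (use_ref_1) {
//     for (int i = 0; i < (1 << ref_1_size_log2); ++i) sum += ref_1_u8[i];
//   }
//   // The returned vector carries |sum| in its low 32-bit lane.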
297
298 template <int width, int height>
299 inline void DcStore_NEON(void* const dest, ptrdiff_t stride,
300 const uint32x2_t dc) {
301 const uint8x16_t dc_dup = vdupq_lane_u8(vreinterpret_u8_u32(dc), 0);
302 auto* dst = static_cast<uint8_t*>(dest);
303 if (width == 4) {
304 int i = height - 1;
305 do {
306 StoreLo4(dst, vget_low_u8(dc_dup));
307 dst += stride;
308 } while (--i != 0);
309 StoreLo4(dst, vget_low_u8(dc_dup));
310 } else if (width == 8) {
311 int i = height - 1;
312 do {
313 vst1_u8(dst, vget_low_u8(dc_dup));
314 dst += stride;
315 } while (--i != 0);
316 vst1_u8(dst, vget_low_u8(dc_dup));
317 } else if (width == 16) {
318 int i = height - 1;
319 do {
320 vst1q_u8(dst, dc_dup);
321 dst += stride;
322 } while (--i != 0);
323 vst1q_u8(dst, dc_dup);
324 } else if (width == 32) {
325 int i = height - 1;
326 do {
327 vst1q_u8(dst, dc_dup);
328 vst1q_u8(dst + 16, dc_dup);
329 dst += stride;
330 } while (--i != 0);
331 vst1q_u8(dst, dc_dup);
332 vst1q_u8(dst + 16, dc_dup);
333 } else {
334 assert(width == 64);
335 int i = height - 1;
336 do {
337 vst1q_u8(dst, dc_dup);
338 vst1q_u8(dst + 16, dc_dup);
339 vst1q_u8(dst + 32, dc_dup);
340 vst1q_u8(dst + 48, dc_dup);
341 dst += stride;
342 } while (--i != 0);
343 vst1q_u8(dst, dc_dup);
344 vst1q_u8(dst + 16, dc_dup);
345 vst1q_u8(dst + 32, dc_dup);
346 vst1q_u8(dst + 48, dc_dup);
347 }
348 }
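// Each width branch above stores height - 1 rows inside the loop and the last
// row after it, so |dst| is never advanced past the final row of the block.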
349
350 template <int width, int height>
351 inline void Paeth4Or8xN_NEON(void* LIBGAV1_RESTRICT const dest,
352 ptrdiff_t stride,
353 const void* LIBGAV1_RESTRICT const top_row,
354 const void* LIBGAV1_RESTRICT const left_column) {
355 auto* dest_u8 = static_cast<uint8_t*>(dest);
356 const auto* const top_row_u8 = static_cast<const uint8_t*>(top_row);
357 const auto* const left_col_u8 = static_cast<const uint8_t*>(left_column);
358
359 const uint8x8_t top_left = vdup_n_u8(top_row_u8[-1]);
360 const uint16x8_t top_left_x2 = vdupq_n_u16(top_row_u8[-1] + top_row_u8[-1]);
361 uint8x8_t top;
362 if (width == 4) {
363 top = Load4(top_row_u8);
364 } else { // width == 8
365 top = vld1_u8(top_row_u8);
366 }
367
368 for (int y = 0; y < height; ++y) {
369 const uint8x8_t left = vdup_n_u8(left_col_u8[y]);
370
371 const uint8x8_t left_dist = vabd_u8(top, top_left);
372 const uint8x8_t top_dist = vabd_u8(left, top_left);
373 const uint16x8_t top_left_dist =
374 vabdq_u16(vaddl_u8(top, left), top_left_x2);
375
376 const uint8x8_t left_le_top = vcle_u8(left_dist, top_dist);
377 const uint8x8_t left_le_top_left =
378 vmovn_u16(vcleq_u16(vmovl_u8(left_dist), top_left_dist));
379 const uint8x8_t top_le_top_left =
380 vmovn_u16(vcleq_u16(vmovl_u8(top_dist), top_left_dist));
381
382 // if (left_dist <= top_dist && left_dist <= top_left_dist)
383 const uint8x8_t left_mask = vand_u8(left_le_top, left_le_top_left);
384 // dest[x] = left_column[y];
385 // Fill all the unused spaces with 'top'. They will be overwritten when
386 // the positions for top_left are known.
387 uint8x8_t result = vbsl_u8(left_mask, left, top);
388 // else if (top_dist <= top_left_dist)
389 // dest[x] = top_row[x];
390 // Add these values to the mask. They were already set.
391 const uint8x8_t left_or_top_mask = vorr_u8(left_mask, top_le_top_left);
392 // else
393 // dest[x] = top_left;
394 result = vbsl_u8(left_or_top_mask, result, top_left);
395
396 if (width == 4) {
397 StoreLo4(dest_u8, result);
398 } else { // width == 8
399 vst1_u8(dest_u8, result);
400 }
401 dest_u8 += stride;
402 }
403 }
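// A scalar sketch of the per-pixel Paeth selection vectorized above
// (illustrative only; not part of the build). With base = top + left -
// top_left, the distances used here are |base - left| == |top - top_left|,
// |base - top| == |left - top_left| and |base - top_left|:
//
//   const int base = top + left - top_left;
//   const int left_dist = std::abs(base - left);
//   const int top_dist = std::abs(base - top);
//   const int top_left_dist = std::abs(base - top_left);
//   if (left_dist <= top_dist && left_dist <= top_left_dist) {
//     dest[x] = left;
//   } else if (top_dist <= top_left_dist) {
//     dest[x] = top;
//   } else {
//     dest[x] = top_left;
//   }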
404
405 // Calculate X distance <= TopLeft distance and return the resulting mask as a
406 // uint8x16_t.
407 inline uint8x16_t XLeTopLeft(const uint8x16_t x_dist,
408 const uint16x8_t top_left_dist_low,
409 const uint16x8_t top_left_dist_high) {
410 const uint8x16_t top_left_dist = vcombine_u8(vqmovn_u16(top_left_dist_low),
411 vqmovn_u16(top_left_dist_high));
412 return vcleq_u8(x_dist, top_left_dist);
413 }
414
415 // Select the closest values and collect them.
416 inline uint8x16_t SelectPaeth(const uint8x16_t top, const uint8x16_t left,
417 const uint8x16_t top_left,
418 const uint8x16_t left_le_top,
419 const uint8x16_t left_le_top_left,
420 const uint8x16_t top_le_top_left) {
421 // if (left_dist <= top_dist && left_dist <= top_left_dist)
422 const uint8x16_t left_mask = vandq_u8(left_le_top, left_le_top_left);
423 // dest[x] = left_column[y];
424 // Fill all the unused spaces with 'top'. They will be overwritten when
425 // the positions for top_left are known.
426 uint8x16_t result = vbslq_u8(left_mask, left, top);
427 // else if (top_dist <= top_left_dist)
428 // dest[x] = top_row[x];
429 // Add these values to the mask. They were already set.
430 const uint8x16_t left_or_top_mask = vorrq_u8(left_mask, top_le_top_left);
431 // else
432 // dest[x] = top_left;
433 return vbslq_u8(left_or_top_mask, result, top_left);
434 }
435
436 // Generate numbered and high/low versions of top_left_dist.
437 #define TOP_LEFT_DIST(num) \
438 const uint16x8_t top_left_##num##_dist_low = vabdq_u16( \
439 vaddl_u8(vget_low_u8(top[num]), vget_low_u8(left)), top_left_x2); \
440 const uint16x8_t top_left_##num##_dist_high = vabdq_u16( \
441 vaddl_u8(vget_high_u8(top[num]), vget_low_u8(left)), top_left_x2)
442
443 // Generate numbered versions of XLeTopLeft with x = left.
444 #define LEFT_LE_TOP_LEFT(num) \
445 const uint8x16_t left_le_top_left_##num = \
446 XLeTopLeft(left_##num##_dist, top_left_##num##_dist_low, \
447 top_left_##num##_dist_high)
448
449 // Generate numbered versions of XLeTopLeft with x = top.
450 #define TOP_LE_TOP_LEFT(num) \
451 const uint8x16_t top_le_top_left_##num = XLeTopLeft( \
452 top_dist, top_left_##num##_dist_low, top_left_##num##_dist_high)
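// As a sketch of how the macros compose below: TOP_LEFT_DIST(1) defines
// top_left_1_dist_{low,high} = |top[1] + left - 2 * top_left| for the low and
// high halves of top[1], and LEFT_LE_TOP_LEFT(1) / TOP_LE_TOP_LEFT(1) compare
// left_1_dist and top_dist against those 16-bit distances.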
453
454 template <int width, int height>
455 inline void Paeth16PlusxN_NEON(void* LIBGAV1_RESTRICT const dest,
456 ptrdiff_t stride,
457 const void* LIBGAV1_RESTRICT const top_row,
458 const void* LIBGAV1_RESTRICT const left_column) {
459 auto* dest_u8 = static_cast<uint8_t*>(dest);
460 const auto* const top_row_u8 = static_cast<const uint8_t*>(top_row);
461 const auto* const left_col_u8 = static_cast<const uint8_t*>(left_column);
462
463 const uint8x16_t top_left = vdupq_n_u8(top_row_u8[-1]);
464 const uint16x8_t top_left_x2 = vdupq_n_u16(top_row_u8[-1] + top_row_u8[-1]);
465 uint8x16_t top[4];
466 top[0] = vld1q_u8(top_row_u8);
467 if (width > 16) {
468 top[1] = vld1q_u8(top_row_u8 + 16);
469 if (width == 64) {
470 top[2] = vld1q_u8(top_row_u8 + 32);
471 top[3] = vld1q_u8(top_row_u8 + 48);
472 }
473 }
474
475 for (int y = 0; y < height; ++y) {
476 const uint8x16_t left = vdupq_n_u8(left_col_u8[y]);
477
478 const uint8x16_t top_dist = vabdq_u8(left, top_left);
479
480 const uint8x16_t left_0_dist = vabdq_u8(top[0], top_left);
481 TOP_LEFT_DIST(0);
482 const uint8x16_t left_0_le_top = vcleq_u8(left_0_dist, top_dist);
483 LEFT_LE_TOP_LEFT(0);
484 TOP_LE_TOP_LEFT(0);
485
486 const uint8x16_t result_0 =
487 SelectPaeth(top[0], left, top_left, left_0_le_top, left_le_top_left_0,
488 top_le_top_left_0);
489 vst1q_u8(dest_u8, result_0);
490
491 if (width > 16) {
492 const uint8x16_t left_1_dist = vabdq_u8(top[1], top_left);
493 TOP_LEFT_DIST(1);
494 const uint8x16_t left_1_le_top = vcleq_u8(left_1_dist, top_dist);
495 LEFT_LE_TOP_LEFT(1);
496 TOP_LE_TOP_LEFT(1);
497
498 const uint8x16_t result_1 =
499 SelectPaeth(top[1], left, top_left, left_1_le_top, left_le_top_left_1,
500 top_le_top_left_1);
501 vst1q_u8(dest_u8 + 16, result_1);
502
503 if (width == 64) {
504 const uint8x16_t left_2_dist = vabdq_u8(top[2], top_left);
505 TOP_LEFT_DIST(2);
506 const uint8x16_t left_2_le_top = vcleq_u8(left_2_dist, top_dist);
507 LEFT_LE_TOP_LEFT(2);
508 TOP_LE_TOP_LEFT(2);
509
510 const uint8x16_t result_2 =
511 SelectPaeth(top[2], left, top_left, left_2_le_top,
512 left_le_top_left_2, top_le_top_left_2);
513 vst1q_u8(dest_u8 + 32, result_2);
514
515 const uint8x16_t left_3_dist = vabdq_u8(top[3], top_left);
516 TOP_LEFT_DIST(3);
517 const uint8x16_t left_3_le_top = vcleq_u8(left_3_dist, top_dist);
518 LEFT_LE_TOP_LEFT(3);
519 TOP_LE_TOP_LEFT(3);
520
521 const uint8x16_t result_3 =
522 SelectPaeth(top[3], left, top_left, left_3_le_top,
523 left_le_top_left_3, top_le_top_left_3);
524 vst1q_u8(dest_u8 + 48, result_3);
525 }
526 }
527
528 dest_u8 += stride;
529 }
530 }
531
532 struct DcDefs {
533 DcDefs() = delete;
534
535 using _4x4 = DcPredFuncs_NEON<2, 2, DcSum_NEON, DcStore_NEON<4, 4>>;
536 using _4x8 = DcPredFuncs_NEON<2, 3, DcSum_NEON, DcStore_NEON<4, 8>>;
537 using _4x16 = DcPredFuncs_NEON<2, 4, DcSum_NEON, DcStore_NEON<4, 16>>;
538 using _8x4 = DcPredFuncs_NEON<3, 2, DcSum_NEON, DcStore_NEON<8, 4>>;
539 using _8x8 = DcPredFuncs_NEON<3, 3, DcSum_NEON, DcStore_NEON<8, 8>>;
540 using _8x16 = DcPredFuncs_NEON<3, 4, DcSum_NEON, DcStore_NEON<8, 16>>;
541 using _8x32 = DcPredFuncs_NEON<3, 5, DcSum_NEON, DcStore_NEON<8, 32>>;
542 using _16x4 = DcPredFuncs_NEON<4, 2, DcSum_NEON, DcStore_NEON<16, 4>>;
543 using _16x8 = DcPredFuncs_NEON<4, 3, DcSum_NEON, DcStore_NEON<16, 8>>;
544 using _16x16 = DcPredFuncs_NEON<4, 4, DcSum_NEON, DcStore_NEON<16, 16>>;
545 using _16x32 = DcPredFuncs_NEON<4, 5, DcSum_NEON, DcStore_NEON<16, 32>>;
546 using _16x64 = DcPredFuncs_NEON<4, 6, DcSum_NEON, DcStore_NEON<16, 64>>;
547 using _32x8 = DcPredFuncs_NEON<5, 3, DcSum_NEON, DcStore_NEON<32, 8>>;
548 using _32x16 = DcPredFuncs_NEON<5, 4, DcSum_NEON, DcStore_NEON<32, 16>>;
549 using _32x32 = DcPredFuncs_NEON<5, 5, DcSum_NEON, DcStore_NEON<32, 32>>;
550 using _32x64 = DcPredFuncs_NEON<5, 6, DcSum_NEON, DcStore_NEON<32, 64>>;
551 using _64x16 = DcPredFuncs_NEON<6, 4, DcSum_NEON, DcStore_NEON<64, 16>>;
552 using _64x32 = DcPredFuncs_NEON<6, 5, DcSum_NEON, DcStore_NEON<64, 32>>;
553 using _64x64 = DcPredFuncs_NEON<6, 6, DcSum_NEON, DcStore_NEON<64, 64>>;
554 };
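// In the aliases above the first two template arguments are block_width_log2
// and block_height_log2, e.g. _16x4 instantiates DcPredFuncs_NEON<4, 2, ...>
// for a 16-wide, 4-high block.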
555
556 void Init8bpp() {
557 Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
558 assert(dsp != nullptr);
559 // 4x4
560 dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcTop] =
561 DcDefs::_4x4::DcTop;
562 dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcLeft] =
563 DcDefs::_4x4::DcLeft;
564 dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDc] =
565 DcDefs::_4x4::Dc;
566 dsp->intra_predictors[kTransformSize4x4][kIntraPredictorPaeth] =
567 Paeth4Or8xN_NEON<4, 4>;
568
569 // 4x8
570 dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcTop] =
571 DcDefs::_4x8::DcTop;
572 dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcLeft] =
573 DcDefs::_4x8::DcLeft;
574 dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDc] =
575 DcDefs::_4x8::Dc;
576 dsp->intra_predictors[kTransformSize4x8][kIntraPredictorPaeth] =
577 Paeth4Or8xN_NEON<4, 8>;
578
579 // 4x16
580 dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcTop] =
581 DcDefs::_4x16::DcTop;
582 dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcLeft] =
583 DcDefs::_4x16::DcLeft;
584 dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDc] =
585 DcDefs::_4x16::Dc;
586 dsp->intra_predictors[kTransformSize4x16][kIntraPredictorPaeth] =
587 Paeth4Or8xN_NEON<4, 16>;
588
589 // 8x4
590 dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcTop] =
591 DcDefs::_8x4::DcTop;
592 dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcLeft] =
593 DcDefs::_8x4::DcLeft;
594 dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDc] =
595 DcDefs::_8x4::Dc;
596 dsp->intra_predictors[kTransformSize8x4][kIntraPredictorPaeth] =
597 Paeth4Or8xN_NEON<8, 4>;
598
599 // 8x8
600 dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcTop] =
601 DcDefs::_8x8::DcTop;
602 dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcLeft] =
603 DcDefs::_8x8::DcLeft;
604 dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDc] =
605 DcDefs::_8x8::Dc;
606 dsp->intra_predictors[kTransformSize8x8][kIntraPredictorPaeth] =
607 Paeth4Or8xN_NEON<8, 8>;
608
609 // 8x16
610 dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcTop] =
611 DcDefs::_8x16::DcTop;
612 dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcLeft] =
613 DcDefs::_8x16::DcLeft;
614 dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDc] =
615 DcDefs::_8x16::Dc;
616 dsp->intra_predictors[kTransformSize8x16][kIntraPredictorPaeth] =
617 Paeth4Or8xN_NEON<8, 16>;
618
619 // 8x32
620 dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcTop] =
621 DcDefs::_8x32::DcTop;
622 dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcLeft] =
623 DcDefs::_8x32::DcLeft;
624 dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDc] =
625 DcDefs::_8x32::Dc;
626 dsp->intra_predictors[kTransformSize8x32][kIntraPredictorPaeth] =
627 Paeth4Or8xN_NEON<8, 32>;
628
629 // 16x4
630 dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcTop] =
631 DcDefs::_16x4::DcTop;
632 dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcLeft] =
633 DcDefs::_16x4::DcLeft;
634 dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDc] =
635 DcDefs::_16x4::Dc;
636 dsp->intra_predictors[kTransformSize16x4][kIntraPredictorPaeth] =
637 Paeth16PlusxN_NEON<16, 4>;
638
639 // 16x8
640 dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcTop] =
641 DcDefs::_16x8::DcTop;
642 dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcLeft] =
643 DcDefs::_16x8::DcLeft;
644 dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDc] =
645 DcDefs::_16x8::Dc;
646 dsp->intra_predictors[kTransformSize16x8][kIntraPredictorPaeth] =
647 Paeth16PlusxN_NEON<16, 8>;
648
649 // 16x16
650 dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcTop] =
651 DcDefs::_16x16::DcTop;
652 dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcLeft] =
653 DcDefs::_16x16::DcLeft;
654 dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDc] =
655 DcDefs::_16x16::Dc;
656 dsp->intra_predictors[kTransformSize16x16][kIntraPredictorPaeth] =
657 Paeth16PlusxN_NEON<16, 16>;
658
659 // 16x32
660 dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcTop] =
661 DcDefs::_16x32::DcTop;
662 dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcLeft] =
663 DcDefs::_16x32::DcLeft;
664 dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDc] =
665 DcDefs::_16x32::Dc;
666 dsp->intra_predictors[kTransformSize16x32][kIntraPredictorPaeth] =
667 Paeth16PlusxN_NEON<16, 32>;
668
669 // 16x64
670 dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcTop] =
671 DcDefs::_16x64::DcTop;
672 dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcLeft] =
673 DcDefs::_16x64::DcLeft;
674 dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDc] =
675 DcDefs::_16x64::Dc;
676 dsp->intra_predictors[kTransformSize16x64][kIntraPredictorPaeth] =
677 Paeth16PlusxN_NEON<16, 64>;
678
679 // 32x8
680 dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcTop] =
681 DcDefs::_32x8::DcTop;
682 dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcLeft] =
683 DcDefs::_32x8::DcLeft;
684 dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDc] =
685 DcDefs::_32x8::Dc;
686 dsp->intra_predictors[kTransformSize32x8][kIntraPredictorPaeth] =
687 Paeth16PlusxN_NEON<32, 8>;
688
689 // 32x16
690 dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcTop] =
691 DcDefs::_32x16::DcTop;
692 dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcLeft] =
693 DcDefs::_32x16::DcLeft;
694 dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDc] =
695 DcDefs::_32x16::Dc;
696 dsp->intra_predictors[kTransformSize32x16][kIntraPredictorPaeth] =
697 Paeth16PlusxN_NEON<32, 16>;
698
699 // 32x32
700 dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcTop] =
701 DcDefs::_32x32::DcTop;
702 dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcLeft] =
703 DcDefs::_32x32::DcLeft;
704 dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDc] =
705 DcDefs::_32x32::Dc;
706 dsp->intra_predictors[kTransformSize32x32][kIntraPredictorPaeth] =
707 Paeth16PlusxN_NEON<32, 32>;
708
709 // 32x64
710 dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcTop] =
711 DcDefs::_32x64::DcTop;
712 dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcLeft] =
713 DcDefs::_32x64::DcLeft;
714 dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDc] =
715 DcDefs::_32x64::Dc;
716 dsp->intra_predictors[kTransformSize32x64][kIntraPredictorPaeth] =
717 Paeth16PlusxN_NEON<32, 64>;
718
719 // 64x16
720 dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcTop] =
721 DcDefs::_64x16::DcTop;
722 dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcLeft] =
723 DcDefs::_64x16::DcLeft;
724 dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDc] =
725 DcDefs::_64x16::Dc;
726 dsp->intra_predictors[kTransformSize64x16][kIntraPredictorPaeth] =
727 Paeth16PlusxN_NEON<64, 16>;
728
729 // 64x32
730 dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcTop] =
731 DcDefs::_64x32::DcTop;
732 dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcLeft] =
733 DcDefs::_64x32::DcLeft;
734 dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDc] =
735 DcDefs::_64x32::Dc;
736 dsp->intra_predictors[kTransformSize64x32][kIntraPredictorPaeth] =
737 Paeth16PlusxN_NEON<64, 32>;
738
739 // 64x64
740 dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcTop] =
741 DcDefs::_64x64::DcTop;
742 dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcLeft] =
743 DcDefs::_64x64::DcLeft;
744 dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDc] =
745 DcDefs::_64x64::Dc;
746 dsp->intra_predictors[kTransformSize64x64][kIntraPredictorPaeth] =
747 Paeth16PlusxN_NEON<64, 64>;
748 }
749
750 } // namespace
751 } // namespace low_bitdepth
752
753 //------------------------------------------------------------------------------
754 #if LIBGAV1_MAX_BITDEPTH >= 10
755 namespace high_bitdepth {
756 namespace {
757
758 // Add the elements in the given vectors together but do not sum the entire
759 // vector.
760 inline uint16x8_t Add(const uint16x8_t val_0, const uint16x8_t val_1,
761 const uint16x8_t val_2, const uint16x8_t val_3) {
762 const uint16x8_t sum_0 = vaddq_u16(val_0, val_1);
763 const uint16x8_t sum_1 = vaddq_u16(val_2, val_3);
764 return vaddq_u16(sum_0, sum_1);
765 }
766
767 // Load and combine 16 uint16_t values.
768 inline uint16x8_t LoadAndAdd16(const uint16_t* buf) {
769 const uint16x8_t val_0 = vld1q_u16(buf);
770 const uint16x8_t val_1 = vld1q_u16(buf + 8);
771 return vaddq_u16(val_0, val_1);
772 }
773
774 // Load and combine 32 uint16_t values.
775 inline uint16x8_t LoadAndAdd32(const uint16_t* buf) {
776 const uint16x8_t val_0 = vld1q_u16(buf);
777 const uint16x8_t val_1 = vld1q_u16(buf + 8);
778 const uint16x8_t val_2 = vld1q_u16(buf + 16);
779 const uint16x8_t val_3 = vld1q_u16(buf + 24);
780 return Add(val_0, val_1, val_2, val_3);
781 }
782
783 // Load and combine 64 uint16_t values.
784 inline uint16x8_t LoadAndAdd64(const uint16_t* buf) {
785 const uint16x8_t val_0 = vld1q_u16(buf);
786 const uint16x8_t val_1 = vld1q_u16(buf + 8);
787 const uint16x8_t val_2 = vld1q_u16(buf + 16);
788 const uint16x8_t val_3 = vld1q_u16(buf + 24);
789 const uint16x8_t val_4 = vld1q_u16(buf + 32);
790 const uint16x8_t val_5 = vld1q_u16(buf + 40);
791 const uint16x8_t val_6 = vld1q_u16(buf + 48);
792 const uint16x8_t val_7 = vld1q_u16(buf + 56);
793 const uint16x8_t sum_0 = Add(val_0, val_1, val_2, val_3);
794 const uint16x8_t sum_1 = Add(val_4, val_5, val_6, val_7);
795 return vaddq_u16(sum_0, sum_1);
796 }
797
798 // |ref_[01]| each point to 1 << |ref[01]_size_log2| packed uint16_t values.
799 // If |use_ref_1| is false then only sum |ref_0|.
800 inline uint32x2_t DcSum_NEON(const void* LIBGAV1_RESTRICT ref_0,
801 const int ref_0_size_log2, const bool use_ref_1,
802 const void* LIBGAV1_RESTRICT ref_1,
803 const int ref_1_size_log2) {
804 const auto* ref_0_u16 = static_cast<const uint16_t*>(ref_0);
805 const auto* ref_1_u16 = static_cast<const uint16_t*>(ref_1);
806 if (ref_0_size_log2 == 2) {
807 const uint16x4_t val_0 = vld1_u16(ref_0_u16);
808 if (use_ref_1) {
809 switch (ref_1_size_log2) {
810 case 2: { // 4x4
811 const uint16x4_t val_1 = vld1_u16(ref_1_u16);
812 return Sum(vadd_u16(val_0, val_1));
813 }
814 case 3: { // 4x8
815 const uint16x8_t val_1 = vld1q_u16(ref_1_u16);
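// vcombine_u16(vdup_n_u16(0), val_0) zero-extends the four |ref_0| values
// into an eight-lane vector (low half zero); Sum() adds every lane, so their
// position within the vector does not matter. The same trick is used for the
// other mixed 4/8-lane cases below.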
816 const uint16x8_t sum_0 = vcombine_u16(vdup_n_u16(0), val_0);
817 return Sum(vaddq_u16(sum_0, val_1));
818 }
819 case 4: { // 4x16
820 const uint16x8_t sum_0 = vcombine_u16(vdup_n_u16(0), val_0);
821 const uint16x8_t sum_1 = LoadAndAdd16(ref_1_u16);
822 return Sum(vaddq_u16(sum_0, sum_1));
823 }
824 }
825 }
826 // 4x1
827 return Sum(val_0);
828 }
829 if (ref_0_size_log2 == 3) {
830 const uint16x8_t val_0 = vld1q_u16(ref_0_u16);
831 if (use_ref_1) {
832 switch (ref_1_size_log2) {
833 case 2: { // 8x4
834 const uint16x4_t val_1 = vld1_u16(ref_1_u16);
835 const uint16x8_t sum_1 = vcombine_u16(vdup_n_u16(0), val_1);
836 return Sum(vaddq_u16(val_0, sum_1));
837 }
838 case 3: { // 8x8
839 const uint16x8_t val_1 = vld1q_u16(ref_1_u16);
840 return Sum(vaddq_u16(val_0, val_1));
841 }
842 case 4: { // 8x16
843 const uint16x8_t sum_1 = LoadAndAdd16(ref_1_u16);
844 return Sum(vaddq_u16(val_0, sum_1));
845 }
846 case 5: { // 8x32
847 const uint16x8_t sum_1 = LoadAndAdd32(ref_1_u16);
848 return Sum(vaddq_u16(val_0, sum_1));
849 }
850 }
851 }
852 // 8x1
853 return Sum(val_0);
854 }
855 if (ref_0_size_log2 == 4) {
856 const uint16x8_t sum_0 = LoadAndAdd16(ref_0_u16);
857 if (use_ref_1) {
858 switch (ref_1_size_log2) {
859 case 2: { // 16x4
860 const uint16x4_t val_1 = vld1_u16(ref_1_u16);
861 const uint16x8_t sum_1 = vcombine_u16(vdup_n_u16(0), val_1);
862 return Sum(vaddq_u16(sum_0, sum_1));
863 }
864 case 3: { // 16x8
865 const uint16x8_t val_1 = vld1q_u16(ref_1_u16);
866 return Sum(vaddq_u16(sum_0, val_1));
867 }
868 case 4: { // 16x16
869 const uint16x8_t sum_1 = LoadAndAdd16(ref_1_u16);
870 return Sum(vaddq_u16(sum_0, sum_1));
871 }
872 case 5: { // 16x32
873 const uint16x8_t sum_1 = LoadAndAdd32(ref_1_u16);
874 return Sum(vaddq_u16(sum_0, sum_1));
875 }
876 case 6: { // 16x64
877 const uint16x8_t sum_1 = LoadAndAdd64(ref_1_u16);
878 return Sum(vaddq_u16(sum_0, sum_1));
879 }
880 }
881 }
882 // 16x1
883 return Sum(sum_0);
884 }
885 if (ref_0_size_log2 == 5) {
886 const uint16x8_t sum_0 = LoadAndAdd32(ref_0_u16);
887 if (use_ref_1) {
888 switch (ref_1_size_log2) {
889 case 3: { // 32x8
890 const uint16x8_t val_1 = vld1q_u16(ref_1_u16);
891 return Sum(vaddq_u16(sum_0, val_1));
892 }
893 case 4: { // 32x16
894 const uint16x8_t sum_1 = LoadAndAdd16(ref_1_u16);
895 return Sum(vaddq_u16(sum_0, sum_1));
896 }
897 case 5: { // 32x32
898 const uint16x8_t sum_1 = LoadAndAdd32(ref_1_u16);
899 return Sum(vaddq_u16(sum_0, sum_1));
900 }
901 case 6: { // 32x64
902 const uint16x8_t sum_1 = LoadAndAdd64(ref_1_u16);
903 return Sum(vaddq_u16(sum_0, sum_1));
904 }
905 }
906 }
907 // 32x1
908 return Sum(sum_0);
909 }
910
911 assert(ref_0_size_log2 == 6);
912 const uint16x8_t sum_0 = LoadAndAdd64(ref_0_u16);
913 if (use_ref_1) {
914 switch (ref_1_size_log2) {
915 case 4: { // 64x16
916 const uint16x8_t sum_1 = LoadAndAdd16(ref_1_u16);
917 return Sum(vaddq_u16(sum_0, sum_1));
918 }
919 case 5: { // 64x32
920 const uint16x8_t sum_1 = LoadAndAdd32(ref_1_u16);
921 return Sum(vaddq_u16(sum_0, sum_1));
922 }
923 case 6: { // 64x64
924 const uint16x8_t sum_1 = LoadAndAdd64(ref_1_u16);
925 return Sum(vaddq_u16(sum_0, sum_1));
926 }
927 }
928 }
929 // 64x1
930 return Sum(sum_0);
931 }
932
933 template <int width, int height>
934 inline void DcStore_NEON(void* const dest, ptrdiff_t stride,
935 const uint32x2_t dc) {
936 auto* dest_u16 = static_cast<uint16_t*>(dest);
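// |stride| is given in bytes; shifting by 1 converts it to uint16_t units.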
937 ptrdiff_t stride_u16 = stride >> 1;
938 const uint16x8_t dc_dup = vdupq_lane_u16(vreinterpret_u16_u32(dc), 0);
939 if (width == 4) {
940 int i = height - 1;
941 do {
942 vst1_u16(dest_u16, vget_low_u16(dc_dup));
943 dest_u16 += stride_u16;
944 } while (--i != 0);
945 vst1_u16(dest_u16, vget_low_u16(dc_dup));
946 } else if (width == 8) {
947 int i = height - 1;
948 do {
949 vst1q_u16(dest_u16, dc_dup);
950 dest_u16 += stride_u16;
951 } while (--i != 0);
952 vst1q_u16(dest_u16, dc_dup);
953 } else if (width == 16) {
954 int i = height - 1;
955 do {
956 vst1q_u16(dest_u16, dc_dup);
957 vst1q_u16(dest_u16 + 8, dc_dup);
958 dest_u16 += stride_u16;
959 } while (--i != 0);
960 vst1q_u16(dest_u16, dc_dup);
961 vst1q_u16(dest_u16 + 8, dc_dup);
962 } else if (width == 32) {
963 int i = height - 1;
964 do {
965 vst1q_u16(dest_u16, dc_dup);
966 vst1q_u16(dest_u16 + 8, dc_dup);
967 vst1q_u16(dest_u16 + 16, dc_dup);
968 vst1q_u16(dest_u16 + 24, dc_dup);
969 dest_u16 += stride_u16;
970 } while (--i != 0);
971 vst1q_u16(dest_u16, dc_dup);
972 vst1q_u16(dest_u16 + 8, dc_dup);
973 vst1q_u16(dest_u16 + 16, dc_dup);
974 vst1q_u16(dest_u16 + 24, dc_dup);
975 } else {
976 assert(width == 64);
977 int i = height - 1;
978 do {
979 vst1q_u16(dest_u16, dc_dup);
980 vst1q_u16(dest_u16 + 8, dc_dup);
981 vst1q_u16(dest_u16 + 16, dc_dup);
982 vst1q_u16(dest_u16 + 24, dc_dup);
983 vst1q_u16(dest_u16 + 32, dc_dup);
984 vst1q_u16(dest_u16 + 40, dc_dup);
985 vst1q_u16(dest_u16 + 48, dc_dup);
986 vst1q_u16(dest_u16 + 56, dc_dup);
987 dest_u16 += stride_u16;
988 } while (--i != 0);
989 vst1q_u16(dest_u16, dc_dup);
990 vst1q_u16(dest_u16 + 8, dc_dup);
991 vst1q_u16(dest_u16 + 16, dc_dup);
992 vst1q_u16(dest_u16 + 24, dc_dup);
993 vst1q_u16(dest_u16 + 32, dc_dup);
994 vst1q_u16(dest_u16 + 40, dc_dup);
995 vst1q_u16(dest_u16 + 48, dc_dup);
996 vst1q_u16(dest_u16 + 56, dc_dup);
997 }
998 }
999
1000 struct DcDefs {
1001 DcDefs() = delete;
1002
1003 using _4x4 = DcPredFuncs_NEON<2, 2, DcSum_NEON, DcStore_NEON<4, 4>>;
1004 using _4x8 = DcPredFuncs_NEON<2, 3, DcSum_NEON, DcStore_NEON<4, 8>>;
1005 using _4x16 = DcPredFuncs_NEON<2, 4, DcSum_NEON, DcStore_NEON<4, 16>>;
1006 using _8x4 = DcPredFuncs_NEON<3, 2, DcSum_NEON, DcStore_NEON<8, 4>>;
1007 using _8x8 = DcPredFuncs_NEON<3, 3, DcSum_NEON, DcStore_NEON<8, 8>>;
1008 using _8x16 = DcPredFuncs_NEON<3, 4, DcSum_NEON, DcStore_NEON<8, 16>>;
1009 using _8x32 = DcPredFuncs_NEON<3, 5, DcSum_NEON, DcStore_NEON<8, 32>>;
1010 using _16x4 = DcPredFuncs_NEON<4, 2, DcSum_NEON, DcStore_NEON<16, 4>>;
1011 using _16x8 = DcPredFuncs_NEON<4, 3, DcSum_NEON, DcStore_NEON<16, 8>>;
1012 using _16x16 = DcPredFuncs_NEON<4, 4, DcSum_NEON, DcStore_NEON<16, 16>>;
1013 using _16x32 = DcPredFuncs_NEON<4, 5, DcSum_NEON, DcStore_NEON<16, 32>>;
1014 using _16x64 = DcPredFuncs_NEON<4, 6, DcSum_NEON, DcStore_NEON<16, 64>>;
1015 using _32x8 = DcPredFuncs_NEON<5, 3, DcSum_NEON, DcStore_NEON<32, 8>>;
1016 using _32x16 = DcPredFuncs_NEON<5, 4, DcSum_NEON, DcStore_NEON<32, 16>>;
1017 using _32x32 = DcPredFuncs_NEON<5, 5, DcSum_NEON, DcStore_NEON<32, 32>>;
1018 using _32x64 = DcPredFuncs_NEON<5, 6, DcSum_NEON, DcStore_NEON<32, 64>>;
1019 using _64x16 = DcPredFuncs_NEON<6, 4, DcSum_NEON, DcStore_NEON<64, 16>>;
1020 using _64x32 = DcPredFuncs_NEON<6, 5, DcSum_NEON, DcStore_NEON<64, 32>>;
1021 using _64x64 = DcPredFuncs_NEON<6, 6, DcSum_NEON, DcStore_NEON<64, 64>>;
1022 };
1023
1024 // IntraPredFuncs_NEON::Horizontal -- duplicate left column across all rows
1025
1026 template <int block_height>
1027 void Horizontal4xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
1028 const void* /*top_row*/,
1029 const void* LIBGAV1_RESTRICT const left_column) {
1030 const auto* const left = static_cast<const uint16_t*>(left_column);
1031 auto* dst = static_cast<uint8_t*>(dest);
1032 int y = 0;
1033 do {
1034 auto* dst16 = reinterpret_cast<uint16_t*>(dst);
1035 const uint16x4_t row = vld1_dup_u16(left + y);
1036 vst1_u16(dst16, row);
1037 dst += stride;
1038 } while (++y < block_height);
1039 }
1040
1041 template <int block_height>
1042 void Horizontal8xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
1043 const void* /*top_row*/,
1044 const void* LIBGAV1_RESTRICT const left_column) {
1045 const auto* const left = static_cast<const uint16_t*>(left_column);
1046 auto* dst = static_cast<uint8_t*>(dest);
1047 int y = 0;
1048 do {
1049 auto* dst16 = reinterpret_cast<uint16_t*>(dst);
1050 const uint16x8_t row = vld1q_dup_u16(left + y);
1051 vst1q_u16(dst16, row);
1052 dst += stride;
1053 } while (++y < block_height);
1054 }
1055
1056 template <int block_height>
1057 void Horizontal16xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
1058 const void* /*top_row*/,
1059 const void* LIBGAV1_RESTRICT const left_column) {
1060 const auto* const left = static_cast<const uint16_t*>(left_column);
1061 auto* dst = static_cast<uint8_t*>(dest);
1062 int y = 0;
1063 do {
1064 const uint16x8_t row0 = vld1q_dup_u16(left + y);
1065 const uint16x8_t row1 = vld1q_dup_u16(left + y + 1);
1066 auto* dst16 = reinterpret_cast<uint16_t*>(dst);
1067 vst1q_u16(dst16, row0);
1068 vst1q_u16(dst16 + 8, row0);
1069 dst += stride;
1070 dst16 = reinterpret_cast<uint16_t*>(dst);
1071 vst1q_u16(dst16, row1);
1072 vst1q_u16(dst16 + 8, row1);
1073 dst += stride;
1074 y += 2;
1075 } while (y < block_height);
1076 }
1077
1078 template <int block_height>
1079 void Horizontal32xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
1080 const void* /*top_row*/,
1081 const void* LIBGAV1_RESTRICT const left_column) {
1082 const auto* const left = static_cast<const uint16_t*>(left_column);
1083 auto* dst = static_cast<uint8_t*>(dest);
1084 int y = 0;
1085 do {
1086 const uint16x8_t row0 = vld1q_dup_u16(left + y);
1087 const uint16x8_t row1 = vld1q_dup_u16(left + y + 1);
1088 auto* dst16 = reinterpret_cast<uint16_t*>(dst);
1089 vst1q_u16(dst16, row0);
1090 vst1q_u16(dst16 + 8, row0);
1091 vst1q_u16(dst16 + 16, row0);
1092 vst1q_u16(dst16 + 24, row0);
1093 dst += stride;
1094 dst16 = reinterpret_cast<uint16_t*>(dst);
1095 vst1q_u16(dst16, row1);
1096 vst1q_u16(dst16 + 8, row1);
1097 vst1q_u16(dst16 + 16, row1);
1098 vst1q_u16(dst16 + 24, row1);
1099 dst += stride;
1100 y += 2;
1101 } while (y < block_height);
1102 }
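// The 16- and 32-pixel-wide horizontal predictors above write two rows per
// loop iteration, which requires an even |block_height|; every AV1 transform
// height used with them is a power of two >= 4, so this holds.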
1103
1104 // IntraPredFuncs_NEON::Vertical -- copy top row to all rows
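// Note: these vertical predictors copy raw bytes and are therefore agnostic
// to bit depth; a 4-pixel-wide 10bpp row is moved as 8 bytes, an 8-pixel row
// as 16 bytes, and so on.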
1105
1106 template <int block_height>
1107 void Vertical4xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
1108 const void* LIBGAV1_RESTRICT const top_row,
1109 const void* const /*left_column*/) {
1110 const auto* const top = static_cast<const uint8_t*>(top_row);
1111 auto* dst = static_cast<uint8_t*>(dest);
1112 const uint8x8_t row = vld1_u8(top);
1113 int y = block_height;
1114 do {
1115 vst1_u8(dst, row);
1116 dst += stride;
1117 } while (--y != 0);
1118 }
1119
1120 template <int block_height>
1121 void Vertical8xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
1122 const void* LIBGAV1_RESTRICT const top_row,
1123 const void* const /*left_column*/) {
1124 const auto* const top = static_cast<const uint8_t*>(top_row);
1125 auto* dst = static_cast<uint8_t*>(dest);
1126 const uint8x16_t row = vld1q_u8(top);
1127 int y = block_height;
1128 do {
1129 vst1q_u8(dst, row);
1130 dst += stride;
1131 } while (--y != 0);
1132 }
1133
1134 template <int block_height>
1135 void Vertical16xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
1136 const void* LIBGAV1_RESTRICT const top_row,
1137 const void* const /*left_column*/) {
1138 const auto* const top = static_cast<const uint8_t*>(top_row);
1139 auto* dst = static_cast<uint8_t*>(dest);
1140 const uint8x16_t row0 = vld1q_u8(top);
1141 const uint8x16_t row1 = vld1q_u8(top + 16);
1142 int y = block_height;
1143 do {
1144 vst1q_u8(dst, row0);
1145 vst1q_u8(dst + 16, row1);
1146 dst += stride;
1147 vst1q_u8(dst, row0);
1148 vst1q_u8(dst + 16, row1);
1149 dst += stride;
1150 y -= 2;
1151 } while (y != 0);
1152 }
1153
1154 template <int block_height>
1155 void Vertical32xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
1156 const void* LIBGAV1_RESTRICT const top_row,
1157 const void* const /*left_column*/) {
1158 const auto* const top = static_cast<const uint8_t*>(top_row);
1159 auto* dst = static_cast<uint8_t*>(dest);
1160 const uint8x16_t row0 = vld1q_u8(top);
1161 const uint8x16_t row1 = vld1q_u8(top + 16);
1162 const uint8x16_t row2 = vld1q_u8(top + 32);
1163 const uint8x16_t row3 = vld1q_u8(top + 48);
1164 int y = block_height;
1165 do {
1166 vst1q_u8(dst, row0);
1167 vst1q_u8(dst + 16, row1);
1168 vst1q_u8(dst + 32, row2);
1169 vst1q_u8(dst + 48, row3);
1170 dst += stride;
1171 vst1q_u8(dst, row0);
1172 vst1q_u8(dst + 16, row1);
1173 vst1q_u8(dst + 32, row2);
1174 vst1q_u8(dst + 48, row3);
1175 dst += stride;
1176 y -= 2;
1177 } while (y != 0);
1178 }
1179
1180 template <int block_height>
1181 void Vertical64xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
1182 const void* LIBGAV1_RESTRICT const top_row,
1183 const void* const /*left_column*/) {
1184 const auto* const top = static_cast<const uint8_t*>(top_row);
1185 auto* dst = static_cast<uint8_t*>(dest);
1186 const uint8x16_t row0 = vld1q_u8(top);
1187 const uint8x16_t row1 = vld1q_u8(top + 16);
1188 const uint8x16_t row2 = vld1q_u8(top + 32);
1189 const uint8x16_t row3 = vld1q_u8(top + 48);
1190 const uint8x16_t row4 = vld1q_u8(top + 64);
1191 const uint8x16_t row5 = vld1q_u8(top + 80);
1192 const uint8x16_t row6 = vld1q_u8(top + 96);
1193 const uint8x16_t row7 = vld1q_u8(top + 112);
1194 int y = block_height;
1195 do {
1196 vst1q_u8(dst, row0);
1197 vst1q_u8(dst + 16, row1);
1198 vst1q_u8(dst + 32, row2);
1199 vst1q_u8(dst + 48, row3);
1200 vst1q_u8(dst + 64, row4);
1201 vst1q_u8(dst + 80, row5);
1202 vst1q_u8(dst + 96, row6);
1203 vst1q_u8(dst + 112, row7);
1204 dst += stride;
1205 vst1q_u8(dst, row0);
1206 vst1q_u8(dst + 16, row1);
1207 vst1q_u8(dst + 32, row2);
1208 vst1q_u8(dst + 48, row3);
1209 vst1q_u8(dst + 64, row4);
1210 vst1q_u8(dst + 80, row5);
1211 vst1q_u8(dst + 96, row6);
1212 vst1q_u8(dst + 112, row7);
1213 dst += stride;
1214 y -= 2;
1215 } while (y != 0);
1216 }
1217
1218 template <int height>
1219 inline void Paeth4xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
1220 const void* LIBGAV1_RESTRICT const top_ptr,
1221 const void* LIBGAV1_RESTRICT const left_ptr) {
1222 auto* dst = static_cast<uint8_t*>(dest);
1223 const auto* const top_row = static_cast<const uint16_t*>(top_ptr);
1224 const auto* const left_col = static_cast<const uint16_t*>(left_ptr);
1225
1226 const uint16x4_t top_left = vdup_n_u16(top_row[-1]);
1227 const uint16x4_t top_left_x2 = vshl_n_u16(top_left, 1);
1228 const uint16x4_t top = vld1_u16(top_row);
1229
1230 for (int y = 0; y < height; ++y) {
1231 auto* dst16 = reinterpret_cast<uint16_t*>(dst);
1232 const uint16x4_t left = vdup_n_u16(left_col[y]);
1233
1234 const uint16x4_t left_dist = vabd_u16(top, top_left);
1235 const uint16x4_t top_dist = vabd_u16(left, top_left);
1236 const uint16x4_t top_left_dist = vabd_u16(vadd_u16(top, left), top_left_x2);
1237
1238 const uint16x4_t left_le_top = vcle_u16(left_dist, top_dist);
1239 const uint16x4_t left_le_top_left = vcle_u16(left_dist, top_left_dist);
1240 const uint16x4_t top_le_top_left = vcle_u16(top_dist, top_left_dist);
1241
1242 // if (left_dist <= top_dist && left_dist <= top_left_dist)
1243 const uint16x4_t left_mask = vand_u16(left_le_top, left_le_top_left);
1244 // dest[x] = left_column[y];
1245 // Fill all the unused spaces with 'top'. They will be overwritten when
1246 // the positions for top_left are known.
1247 uint16x4_t result = vbsl_u16(left_mask, left, top);
1248 // else if (top_dist <= top_left_dist)
1249 // dest[x] = top_row[x];
1250 // Add these values to the mask. They were already set.
1251 const uint16x4_t left_or_top_mask = vorr_u16(left_mask, top_le_top_left);
1252 // else
1253 // dest[x] = top_left;
1254 result = vbsl_u16(left_or_top_mask, result, top_left);
1255
1256 vst1_u16(dst16, result);
1257 dst += stride;
1258 }
1259 }
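// Unlike the 8-bit Paeth code, these 10-bit helpers keep all distances in
// 16 bits: pixel values here are at most 1023, so top + left and 2 * top_left
// fit in uint16_t and no widening is needed.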
1260
1261 template <int height>
1262 inline void Paeth8xH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
1263 const void* LIBGAV1_RESTRICT const top_ptr,
1264 const void* LIBGAV1_RESTRICT const left_ptr) {
1265 auto* dst = static_cast<uint8_t*>(dest);
1266 const auto* const top_row = static_cast<const uint16_t*>(top_ptr);
1267 const auto* const left_col = static_cast<const uint16_t*>(left_ptr);
1268
1269 const uint16x8_t top_left = vdupq_n_u16(top_row[-1]);
1270 const uint16x8_t top_left_x2 = vshlq_n_u16(top_left, 1);
1271 const uint16x8_t top = vld1q_u16(top_row);
1272
1273 for (int y = 0; y < height; ++y) {
1274 auto* dst16 = reinterpret_cast<uint16_t*>(dst);
1275 const uint16x8_t left = vdupq_n_u16(left_col[y]);
1276
1277 const uint16x8_t left_dist = vabdq_u16(top, top_left);
1278 const uint16x8_t top_dist = vabdq_u16(left, top_left);
1279 const uint16x8_t top_left_dist =
1280 vabdq_u16(vaddq_u16(top, left), top_left_x2);
1281
1282 const uint16x8_t left_le_top = vcleq_u16(left_dist, top_dist);
1283 const uint16x8_t left_le_top_left = vcleq_u16(left_dist, top_left_dist);
1284 const uint16x8_t top_le_top_left = vcleq_u16(top_dist, top_left_dist);
1285
1286 // if (left_dist <= top_dist && left_dist <= top_left_dist)
1287 const uint16x8_t left_mask = vandq_u16(left_le_top, left_le_top_left);
1288 // dest[x] = left_column[y];
1289 // Fill all the unused spaces with 'top'. They will be overwritten when
1290 // the positions for top_left are known.
1291 uint16x8_t result = vbslq_u16(left_mask, left, top);
1292 // else if (top_dist <= top_left_dist)
1293 // dest[x] = top_row[x];
1294 // Add these values to the mask. They were already set.
1295 const uint16x8_t left_or_top_mask = vorrq_u16(left_mask, top_le_top_left);
1296 // else
1297 // dest[x] = top_left;
1298 result = vbslq_u16(left_or_top_mask, result, top_left);
1299
1300 vst1q_u16(dst16, result);
1301 dst += stride;
1302 }
1303 }
1304
1305 // For 16xH and above.
1306 template <int width, int height>
1307 inline void PaethWxH_NEON(void* LIBGAV1_RESTRICT const dest, ptrdiff_t stride,
1308 const void* LIBGAV1_RESTRICT const top_ptr,
1309 const void* LIBGAV1_RESTRICT const left_ptr) {
1310 auto* dst = static_cast<uint8_t*>(dest);
1311 const auto* const top_row = static_cast<const uint16_t*>(top_ptr);
1312 const auto* const left_col = static_cast<const uint16_t*>(left_ptr);
1313
1314 const uint16x8_t top_left = vdupq_n_u16(top_row[-1]);
1315 const uint16x8_t top_left_x2 = vshlq_n_u16(top_left, 1);
1316
1317 uint16x8_t top[width >> 3];
1318 for (int i = 0; i < width >> 3; ++i) {
1319 top[i] = vld1q_u16(top_row + (i << 3));
1320 }
1321
1322 for (int y = 0; y < height; ++y) {
1323 auto* dst_x = reinterpret_cast<uint16_t*>(dst);
1324 const uint16x8_t left = vdupq_n_u16(left_col[y]);
1325 const uint16x8_t top_dist = vabdq_u16(left, top_left);
1326
1327 for (int i = 0; i < (width >> 3); ++i) {
1328 const uint16x8_t left_dist = vabdq_u16(top[i], top_left);
1329 const uint16x8_t top_left_dist =
1330 vabdq_u16(vaddq_u16(top[i], left), top_left_x2);
1331
1332 const uint16x8_t left_le_top = vcleq_u16(left_dist, top_dist);
1333 const uint16x8_t left_le_top_left = vcleq_u16(left_dist, top_left_dist);
1334 const uint16x8_t top_le_top_left = vcleq_u16(top_dist, top_left_dist);
1335
1336 // if (left_dist <= top_dist && left_dist <= top_left_dist)
1337 const uint16x8_t left_mask = vandq_u16(left_le_top, left_le_top_left);
1338 // dest[x] = left_column[y];
1339 // Fill all the unused spaces with 'top'. They will be overwritten when
1340 // the positions for top_left are known.
1341 uint16x8_t result = vbslq_u16(left_mask, left, top[i]);
1342 // else if (top_dist <= top_left_dist)
1343 // dest[x] = top_row[x];
1344 // Add these values to the mask. They were already set.
1345 const uint16x8_t left_or_top_mask = vorrq_u16(left_mask, top_le_top_left);
1346 // else
1347 // dest[x] = top_left;
1348 result = vbslq_u16(left_or_top_mask, result, top_left);
1349
1350 vst1q_u16(dst_x, result);
1351 dst_x += 8;
1352 }
1353 dst += stride;
1354 }
1355 }
1356
1357 void Init10bpp() {
1358 Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
1359 assert(dsp != nullptr);
  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcTop] =
      DcDefs::_4x4::DcTop;
  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcLeft] =
      DcDefs::_4x4::DcLeft;
  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDc] =
      DcDefs::_4x4::Dc;
  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorVertical] =
      Vertical4xH_NEON<4>;
  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorPaeth] =
      Paeth4xH_NEON<4>;

  // 4x8
  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcTop] =
      DcDefs::_4x8::DcTop;
  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcLeft] =
      DcDefs::_4x8::DcLeft;
  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDc] =
      DcDefs::_4x8::Dc;
  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorHorizontal] =
      Horizontal4xH_NEON<8>;
  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorVertical] =
      Vertical4xH_NEON<8>;
  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorPaeth] =
      Paeth4xH_NEON<8>;

  // 4x16
  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcTop] =
      DcDefs::_4x16::DcTop;
  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcLeft] =
      DcDefs::_4x16::DcLeft;
  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDc] =
      DcDefs::_4x16::Dc;
  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorHorizontal] =
      Horizontal4xH_NEON<16>;
  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorVertical] =
      Vertical4xH_NEON<16>;
  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorPaeth] =
      Paeth4xH_NEON<16>;

  // 8x4
  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcTop] =
      DcDefs::_8x4::DcTop;
  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcLeft] =
      DcDefs::_8x4::DcLeft;
  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDc] =
      DcDefs::_8x4::Dc;
  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorVertical] =
      Vertical8xH_NEON<4>;
  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorPaeth] =
      Paeth8xH_NEON<4>;

  // 8x8
  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcTop] =
      DcDefs::_8x8::DcTop;
  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcLeft] =
      DcDefs::_8x8::DcLeft;
  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDc] =
      DcDefs::_8x8::Dc;
  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorHorizontal] =
      Horizontal8xH_NEON<8>;
  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorVertical] =
      Vertical8xH_NEON<8>;
  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorPaeth] =
      Paeth8xH_NEON<8>;

  // 8x16
  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcTop] =
      DcDefs::_8x16::DcTop;
  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcLeft] =
      DcDefs::_8x16::DcLeft;
  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDc] =
      DcDefs::_8x16::Dc;
  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorVertical] =
      Vertical8xH_NEON<16>;
  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorPaeth] =
      Paeth8xH_NEON<16>;

  // 8x32
  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcTop] =
      DcDefs::_8x32::DcTop;
  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcLeft] =
      DcDefs::_8x32::DcLeft;
  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDc] =
      DcDefs::_8x32::Dc;
  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorHorizontal] =
      Horizontal8xH_NEON<32>;
  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorVertical] =
      Vertical8xH_NEON<32>;
  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorPaeth] =
      Paeth8xH_NEON<32>;

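  // From the 16-wide block sizes onward, the Paeth entries below use the
  // width/height-templated PaethWxH_NEON<W, H> helper rather than the
  // fixed-width Paeth4xH_NEON / Paeth8xH_NEON templates used above.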
  // 16x4
  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcTop] =
      DcDefs::_16x4::DcTop;
  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcLeft] =
      DcDefs::_16x4::DcLeft;
  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDc] =
      DcDefs::_16x4::Dc;
  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorVertical] =
      Vertical16xH_NEON<4>;
  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorPaeth] =
      PaethWxH_NEON<16, 4>;

  // 16x8
  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcTop] =
      DcDefs::_16x8::DcTop;
  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcLeft] =
      DcDefs::_16x8::DcLeft;
  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDc] =
      DcDefs::_16x8::Dc;
  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorHorizontal] =
      Horizontal16xH_NEON<8>;
  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorVertical] =
      Vertical16xH_NEON<8>;
  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorPaeth] =
      PaethWxH_NEON<16, 8>;

  // 16x16
  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcTop] =
      DcDefs::_16x16::DcTop;
  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcLeft] =
      DcDefs::_16x16::DcLeft;
  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDc] =
      DcDefs::_16x16::Dc;
  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorVertical] =
      Vertical16xH_NEON<16>;
  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorPaeth] =
      PaethWxH_NEON<16, 16>;

  // 16x32
  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcTop] =
      DcDefs::_16x32::DcTop;
  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcLeft] =
      DcDefs::_16x32::DcLeft;
  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDc] =
      DcDefs::_16x32::Dc;
  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorVertical] =
      Vertical16xH_NEON<32>;
  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorPaeth] =
      PaethWxH_NEON<16, 32>;

  // 16x64
  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcTop] =
      DcDefs::_16x64::DcTop;
  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcLeft] =
      DcDefs::_16x64::DcLeft;
  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDc] =
      DcDefs::_16x64::Dc;
  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorVertical] =
      Vertical16xH_NEON<64>;
  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorPaeth] =
      PaethWxH_NEON<16, 64>;

  // 32x8
  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcTop] =
      DcDefs::_32x8::DcTop;
  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcLeft] =
      DcDefs::_32x8::DcLeft;
  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDc] =
      DcDefs::_32x8::Dc;
  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorVertical] =
      Vertical32xH_NEON<8>;
  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorPaeth] =
      PaethWxH_NEON<32, 8>;

  // 32x16
  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcTop] =
      DcDefs::_32x16::DcTop;
  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcLeft] =
      DcDefs::_32x16::DcLeft;
  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDc] =
      DcDefs::_32x16::Dc;
  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorVertical] =
      Vertical32xH_NEON<16>;
  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorPaeth] =
      PaethWxH_NEON<32, 16>;

  // 32x32
  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcTop] =
      DcDefs::_32x32::DcTop;
  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcLeft] =
      DcDefs::_32x32::DcLeft;
  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDc] =
      DcDefs::_32x32::Dc;
  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorVertical] =
      Vertical32xH_NEON<32>;
  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorPaeth] =
      PaethWxH_NEON<32, 32>;

  // 32x64
  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcTop] =
      DcDefs::_32x64::DcTop;
  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcLeft] =
      DcDefs::_32x64::DcLeft;
  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDc] =
      DcDefs::_32x64::Dc;
  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorHorizontal] =
      Horizontal32xH_NEON<64>;
  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorVertical] =
      Vertical32xH_NEON<64>;
  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorPaeth] =
      PaethWxH_NEON<32, 64>;

  // 64x16
  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcTop] =
      DcDefs::_64x16::DcTop;
  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcLeft] =
      DcDefs::_64x16::DcLeft;
  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDc] =
      DcDefs::_64x16::Dc;
  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorVertical] =
      Vertical64xH_NEON<16>;
  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorPaeth] =
      PaethWxH_NEON<64, 16>;

  // 64x32
  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcTop] =
      DcDefs::_64x32::DcTop;
  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcLeft] =
      DcDefs::_64x32::DcLeft;
  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDc] =
      DcDefs::_64x32::Dc;
  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorVertical] =
      Vertical64xH_NEON<32>;
  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorPaeth] =
      PaethWxH_NEON<64, 32>;

  // 64x64
  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcTop] =
      DcDefs::_64x64::DcTop;
  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcLeft] =
      DcDefs::_64x64::DcLeft;
  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDc] =
      DcDefs::_64x64::Dc;
  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorVertical] =
      Vertical64xH_NEON<64>;
  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorPaeth] =
      PaethWxH_NEON<64, 64>;
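  // Slots that are not assigned above (for example kIntraPredictorHorizontal,
  // which is only registered for a subset of the block sizes) presumably keep
  // the portable implementations installed before this NEON initialization
  // runs; only entries with NEON versions are overridden here.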
}

}  // namespace
}  // namespace high_bitdepth
#endif  // LIBGAV1_MAX_BITDEPTH >= 10

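// Registers the NEON intra predictors: always for the 8-bit path, and for the
// 10-bit path when LIBGAV1_MAX_BITDEPTH >= 10. A rough sketch of the assumed
// call site (hypothetical names, not part of this file):
//
//   void SomeDspInit() {          // hypothetical top-level dsp init
//     InstallPortableDefaults();  // hypothetical: C implementations first
//     IntraPredInit_NEON();       // then override entries with NEON versions
//   }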
void IntraPredInit_NEON() {
  low_bitdepth::Init8bpp();
#if LIBGAV1_MAX_BITDEPTH >= 10
  high_bitdepth::Init10bpp();
#endif
}

}  // namespace dsp
}  // namespace libgav1

#else  // !LIBGAV1_ENABLE_NEON
namespace libgav1 {
namespace dsp {

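// NEON is disabled at build time: keep an empty definition so call sites can
// invoke IntraPredInit_NEON() unconditionally.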
void IntraPredInit_NEON() {}

}  // namespace dsp
}  // namespace libgav1
#endif  // LIBGAV1_ENABLE_NEON