/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#ifndef AOM_AOM_DSP_SIMD_V64_INTRINSICS_C_H_
#define AOM_AOM_DSP_SIMD_V64_INTRINSICS_C_H_

/* Note: This implements the intrinsics in plain, unoptimised C.
   Intended for reference, porting or debugging. */

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#include "config/aom_config.h"

typedef union {
  uint8_t u8[8];
  uint16_t u16[4];
  uint32_t u32[2];
  uint64_t u64;
  int8_t s8[8];
  int16_t s16[4];
  int32_t s32[2];
  int64_t s64;
} c_v64;
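
/* Editor's note (illustrative, not part of the original header): all lanes
   alias the single 64-bit value, so which array element holds the least
   significant bits depends on the host byte order. The accessors and
   constructors below index the union through CONFIG_BIG_ENDIAN so that
   "low"/"high" and lane numbering refer to bit significance rather than to
   memory order. For example, on a little-endian host:

     c_v64 t = c_v64_from_64(0x0102030405060708ULL);

   gives t.u8[0] == 0x08 and t.u8[7] == 0x01. */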

SIMD_INLINE uint32_t c_v64_low_u32(c_v64 a) {
  return a.u32[!!CONFIG_BIG_ENDIAN];
}

SIMD_INLINE uint32_t c_v64_high_u32(c_v64 a) {
  return a.u32[!CONFIG_BIG_ENDIAN];
}

SIMD_INLINE int32_t c_v64_low_s32(c_v64 a) {
  return a.s32[!!CONFIG_BIG_ENDIAN];
}

SIMD_INLINE int32_t c_v64_high_s32(c_v64 a) {
  return a.s32[!CONFIG_BIG_ENDIAN];
}

SIMD_INLINE c_v64 c_v64_from_32(uint32_t x, uint32_t y) {
  c_v64 t;
  t.u32[!CONFIG_BIG_ENDIAN] = x;
  t.u32[!!CONFIG_BIG_ENDIAN] = y;
  return t;
}

SIMD_INLINE c_v64 c_v64_from_64(uint64_t x) {
  c_v64 t;
  t.u64 = x;
  return t;
}

SIMD_INLINE uint64_t c_v64_u64(c_v64 x) { return x.u64; }

SIMD_INLINE c_v64 c_v64_from_16(uint16_t a, uint16_t b, uint16_t c,
                                uint16_t d) {
  c_v64 t;
  if (CONFIG_BIG_ENDIAN) {
    t.u16[0] = a;
    t.u16[1] = b;
    t.u16[2] = c;
    t.u16[3] = d;
  } else {
    t.u16[3] = a;
    t.u16[2] = b;
    t.u16[1] = c;
    t.u16[0] = d;
  }
  return t;
}
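
/* Editor's note (illustrative, not part of the original header): whatever
   the host byte order, the constructors above assemble the value from the
   most significant element downwards, e.g.

     c_v64_u64(c_v64_from_32(0x01020304, 0x05060708)) == 0x0102030405060708
     c_v64_u64(c_v64_from_16(0x0102, 0x0304, 0x0506, 0x0708))
         == 0x0102030405060708
*/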

SIMD_INLINE uint32_t c_u32_load_unaligned(const void *p) {
  uint32_t t;
  uint8_t *pp = (uint8_t *)p;
  uint8_t *q = (uint8_t *)&t;
  int c;
  for (c = 0; c < 4; c++) q[c] = pp[c];
  return t;
}

SIMD_INLINE void c_u32_store_unaligned(void *p, uint32_t a) {
  uint8_t *pp = (uint8_t *)p;
  uint8_t *q = (uint8_t *)&a;
  int c;
  for (c = 0; c < 4; c++) pp[c] = q[c];
}

SIMD_INLINE uint32_t c_u32_load_aligned(const void *p) {
  if (SIMD_CHECK && (uintptr_t)p & 3) {
    fprintf(stderr, "Error: Unaligned u32 load at %p\n", p);
    abort();
  }
  return c_u32_load_unaligned(p);
}

SIMD_INLINE void c_u32_store_aligned(void *p, uint32_t a) {
  if (SIMD_CHECK && (uintptr_t)p & 3) {
    fprintf(stderr, "Error: Unaligned u32 store at %p\n", p);
    abort();
  }
  c_u32_store_unaligned(p, a);
}

SIMD_INLINE c_v64 c_v64_load_unaligned(const void *p) {
  c_v64 t;
  uint8_t *pp = (uint8_t *)p;
  uint8_t *q = (uint8_t *)&t;
  int c;
  for (c = 0; c < 8; c++) q[c] = pp[c];
  return t;
}

SIMD_INLINE c_v64 c_v64_load_aligned(const void *p) {
  if (SIMD_CHECK && (uintptr_t)p & 7) {
    fprintf(stderr, "Error: Unaligned c_v64 load at %p\n", p);
    abort();
  }
  return c_v64_load_unaligned(p);
}

SIMD_INLINE void c_v64_store_unaligned(void *p, c_v64 a) {
  uint8_t *q = (uint8_t *)p;
  uint8_t *r = (uint8_t *)&a;
  int c;
  for (c = 0; c < 8; c++) q[c] = r[c];
}

SIMD_INLINE void c_v64_store_aligned(void *p, c_v64 a) {
  if (SIMD_CHECK && (uintptr_t)p & 7) {
    fprintf(stderr, "Error: Unaligned c_v64 store at %p\n", p);
    abort();
  }
  c_v64_store_unaligned(p, a);
}

SIMD_INLINE c_v64 c_v64_zero(void) {
  c_v64 t;
  t.u64 = 0;
  return t;
}

SIMD_INLINE c_v64 c_v64_dup_8(uint8_t x) {
  c_v64 t;
  t.u8[0] = t.u8[1] = t.u8[2] = t.u8[3] = t.u8[4] = t.u8[5] = t.u8[6] =
      t.u8[7] = x;
  return t;
}

SIMD_INLINE c_v64 c_v64_dup_16(uint16_t x) {
  c_v64 t;
  t.u16[0] = t.u16[1] = t.u16[2] = t.u16[3] = x;
  return t;
}

SIMD_INLINE c_v64 c_v64_dup_32(uint32_t x) {
  c_v64 t;
  t.u32[0] = t.u32[1] = x;
  return t;
}

SIMD_INLINE c_v64 c_v64_add_8(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 8; c++) t.u8[c] = (uint8_t)(a.u8[c] + b.u8[c]);
  return t;
}

SIMD_INLINE c_v64 c_v64_add_16(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 4; c++) t.u16[c] = (uint16_t)(a.u16[c] + b.u16[c]);
  return t;
}

SIMD_INLINE c_v64 c_v64_sadd_u8(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 8; c++)
    t.u8[c] = SIMD_CLAMP((int16_t)a.u8[c] + (int16_t)b.u8[c], 0, 255);
  return t;
}

SIMD_INLINE c_v64 c_v64_sadd_s8(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 8; c++)
    t.s8[c] = SIMD_CLAMP((int16_t)a.s8[c] + (int16_t)b.s8[c], -128, 127);
  return t;
}

SIMD_INLINE c_v64 c_v64_sadd_s16(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 4; c++)
    t.s16[c] = SIMD_CLAMP((int32_t)a.s16[c] + (int32_t)b.s16[c], -32768, 32767);
  return t;
}

SIMD_INLINE c_v64 c_v64_add_32(c_v64 a, c_v64 b) {
  c_v64 t;
  t.u32[0] = (uint32_t)((uint64_t)a.u32[0] + b.u32[0]);
  t.u32[1] = (uint32_t)((uint64_t)a.u32[1] + b.u32[1]);
  return t;
}

SIMD_INLINE c_v64 c_v64_sub_8(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 8; c++) t.u8[c] = (uint8_t)(a.u8[c] - b.u8[c]);
  return t;
}

SIMD_INLINE c_v64 c_v64_ssub_u8(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 8; c++) t.u8[c] = a.u8[c] < b.u8[c] ? 0 : a.u8[c] - b.u8[c];
  return t;
}

SIMD_INLINE c_v64 c_v64_ssub_s8(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 8; c++) {
    int16_t d = (int16_t)a.s8[c] - (int16_t)b.s8[c];
    t.s8[c] = SIMD_CLAMP(d, -128, 127);
  }
  return t;
}

SIMD_INLINE c_v64 c_v64_sub_16(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 4; c++) t.u16[c] = (uint16_t)(a.u16[c] - b.u16[c]);
  return t;
}

SIMD_INLINE c_v64 c_v64_ssub_s16(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 4; c++)
    t.s16[c] = SIMD_CLAMP((int32_t)a.s16[c] - (int32_t)b.s16[c], -32768, 32767);
  return t;
}

SIMD_INLINE c_v64 c_v64_ssub_u16(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 4; c++)
    t.u16[c] =
        (int32_t)a.u16[c] - (int32_t)b.u16[c] < 0 ? 0 : a.u16[c] - b.u16[c];
  return t;
}

SIMD_INLINE c_v64 c_v64_sub_32(c_v64 a, c_v64 b) {
  c_v64 t;
  t.u32[0] = (uint32_t)((int64_t)a.u32[0] - b.u32[0]);
  t.u32[1] = (uint32_t)((int64_t)a.u32[1] - b.u32[1]);
  return t;
}

SIMD_INLINE c_v64 c_v64_abs_s16(c_v64 a) {
  c_v64 t;
  int c;
  for (c = 0; c < 4; c++)
    t.u16[c] = (uint16_t)((int16_t)a.u16[c] > 0 ? a.u16[c] : -a.u16[c]);
  return t;
}

SIMD_INLINE c_v64 c_v64_abs_s8(c_v64 a) {
  c_v64 t;
  int c;
  for (c = 0; c < 8; c++)
    t.u8[c] = (uint8_t)((int8_t)a.u8[c] > 0 ? a.u8[c] : -a.u8[c]);
  return t;
}

SIMD_INLINE c_v64 _c_v64_zip_8(c_v64 a, c_v64 b, int mode) {
  c_v64 t;
  if (mode) {
    t.u8[7] = a.u8[7];
    t.u8[6] = b.u8[7];
    t.u8[5] = a.u8[6];
    t.u8[4] = b.u8[6];
    t.u8[3] = a.u8[5];
    t.u8[2] = b.u8[5];
    t.u8[1] = a.u8[4];
    t.u8[0] = b.u8[4];
  } else {
    t.u8[7] = a.u8[3];
    t.u8[6] = b.u8[3];
    t.u8[5] = a.u8[2];
    t.u8[4] = b.u8[2];
    t.u8[3] = a.u8[1];
    t.u8[2] = b.u8[1];
    t.u8[1] = a.u8[0];
    t.u8[0] = b.u8[0];
  }
  return t;
}

SIMD_INLINE c_v64 c_v64_ziplo_8(c_v64 a, c_v64 b) {
  return CONFIG_BIG_ENDIAN ? _c_v64_zip_8(b, a, 1) : _c_v64_zip_8(a, b, 0);
}

SIMD_INLINE c_v64 c_v64_ziphi_8(c_v64 a, c_v64 b) {
  return CONFIG_BIG_ENDIAN ? _c_v64_zip_8(b, a, 0) : _c_v64_zip_8(a, b, 1);
}

SIMD_INLINE c_v64 _c_v64_zip_16(c_v64 a, c_v64 b, int mode) {
  c_v64 t;
  if (mode) {
    t.u16[3] = a.u16[3];
    t.u16[2] = b.u16[3];
    t.u16[1] = a.u16[2];
    t.u16[0] = b.u16[2];
  } else {
    t.u16[3] = a.u16[1];
    t.u16[2] = b.u16[1];
    t.u16[1] = a.u16[0];
    t.u16[0] = b.u16[0];
  }
  return t;
}

SIMD_INLINE c_v64 c_v64_ziplo_16(c_v64 a, c_v64 b) {
  return CONFIG_BIG_ENDIAN ? _c_v64_zip_16(b, a, 1) : _c_v64_zip_16(a, b, 0);
}

SIMD_INLINE c_v64 c_v64_ziphi_16(c_v64 a, c_v64 b) {
  return CONFIG_BIG_ENDIAN ? _c_v64_zip_16(b, a, 0) : _c_v64_zip_16(a, b, 1);
}

SIMD_INLINE c_v64 _c_v64_zip_32(c_v64 a, c_v64 b, int mode) {
  c_v64 t;
  if (mode) {
    t.u32[1] = a.u32[1];
    t.u32[0] = b.u32[1];
  } else {
    t.u32[1] = a.u32[0];
    t.u32[0] = b.u32[0];
  }
  return t;
}

SIMD_INLINE c_v64 c_v64_ziplo_32(c_v64 a, c_v64 b) {
  return CONFIG_BIG_ENDIAN ? _c_v64_zip_32(b, a, 1) : _c_v64_zip_32(a, b, 0);
}

SIMD_INLINE c_v64 c_v64_ziphi_32(c_v64 a, c_v64 b) {
  return CONFIG_BIG_ENDIAN ? _c_v64_zip_32(b, a, 0) : _c_v64_zip_32(a, b, 1);
}

SIMD_INLINE c_v64 _c_v64_unzip_8(c_v64 a, c_v64 b, int mode) {
  c_v64 t;
  if (mode) {
    t.u8[7] = b.u8[7];
    t.u8[6] = b.u8[5];
    t.u8[5] = b.u8[3];
    t.u8[4] = b.u8[1];
    t.u8[3] = a.u8[7];
    t.u8[2] = a.u8[5];
    t.u8[1] = a.u8[3];
    t.u8[0] = a.u8[1];
  } else {
    t.u8[7] = a.u8[6];
    t.u8[6] = a.u8[4];
    t.u8[5] = a.u8[2];
    t.u8[4] = a.u8[0];
    t.u8[3] = b.u8[6];
    t.u8[2] = b.u8[4];
    t.u8[1] = b.u8[2];
    t.u8[0] = b.u8[0];
  }
  return t;
}

SIMD_INLINE c_v64 c_v64_unziplo_8(c_v64 a, c_v64 b) {
  return CONFIG_BIG_ENDIAN ? _c_v64_unzip_8(a, b, 1) : _c_v64_unzip_8(a, b, 0);
}

SIMD_INLINE c_v64 c_v64_unziphi_8(c_v64 a, c_v64 b) {
  return CONFIG_BIG_ENDIAN ? _c_v64_unzip_8(b, a, 0) : _c_v64_unzip_8(b, a, 1);
}

SIMD_INLINE c_v64 _c_v64_unzip_16(c_v64 a, c_v64 b, int mode) {
  c_v64 t;
  if (mode) {
    t.u16[3] = b.u16[3];
    t.u16[2] = b.u16[1];
    t.u16[1] = a.u16[3];
    t.u16[0] = a.u16[1];
  } else {
    t.u16[3] = a.u16[2];
    t.u16[2] = a.u16[0];
    t.u16[1] = b.u16[2];
    t.u16[0] = b.u16[0];
  }
  return t;
}

SIMD_INLINE c_v64 c_v64_unziplo_16(c_v64 a, c_v64 b) {
  return CONFIG_BIG_ENDIAN ? _c_v64_unzip_16(a, b, 1)
                           : _c_v64_unzip_16(a, b, 0);
}

SIMD_INLINE c_v64 c_v64_unziphi_16(c_v64 a, c_v64 b) {
  return CONFIG_BIG_ENDIAN ? _c_v64_unzip_16(b, a, 0)
                           : _c_v64_unzip_16(b, a, 1);
}

SIMD_INLINE c_v64 c_v64_unpacklo_u8_s16(c_v64 a) {
  c_v64 t;
  int endian = !!CONFIG_BIG_ENDIAN * 4;
  t.s16[3] = (int16_t)a.u8[3 + endian];
  t.s16[2] = (int16_t)a.u8[2 + endian];
  t.s16[1] = (int16_t)a.u8[1 + endian];
  t.s16[0] = (int16_t)a.u8[0 + endian];
  return t;
}

SIMD_INLINE c_v64 c_v64_unpackhi_u8_s16(c_v64 a) {
  c_v64 t;
  int endian = !!CONFIG_BIG_ENDIAN * 4;
  t.s16[3] = (int16_t)a.u8[7 - endian];
  t.s16[2] = (int16_t)a.u8[6 - endian];
  t.s16[1] = (int16_t)a.u8[5 - endian];
  t.s16[0] = (int16_t)a.u8[4 - endian];
  return t;
}

SIMD_INLINE c_v64 c_v64_unpacklo_s8_s16(c_v64 a) {
  c_v64 t;
  int endian = !!CONFIG_BIG_ENDIAN * 4;
  t.s16[3] = (int16_t)a.s8[3 + endian];
  t.s16[2] = (int16_t)a.s8[2 + endian];
  t.s16[1] = (int16_t)a.s8[1 + endian];
  t.s16[0] = (int16_t)a.s8[0 + endian];
  return t;
}

SIMD_INLINE c_v64 c_v64_unpackhi_s8_s16(c_v64 a) {
  c_v64 t;
  int endian = !!CONFIG_BIG_ENDIAN * 4;
  t.s16[3] = (int16_t)a.s8[7 - endian];
  t.s16[2] = (int16_t)a.s8[6 - endian];
  t.s16[1] = (int16_t)a.s8[5 - endian];
  t.s16[0] = (int16_t)a.s8[4 - endian];
  return t;
}

SIMD_INLINE c_v64 c_v64_pack_s32_s16(c_v64 a, c_v64 b) {
  c_v64 t;
  if (CONFIG_BIG_ENDIAN) {
    c_v64 u = a;
    a = b;
    b = u;
  }
  t.s16[3] = SIMD_CLAMP(a.s32[1], -32768, 32767);
  t.s16[2] = SIMD_CLAMP(a.s32[0], -32768, 32767);
  t.s16[1] = SIMD_CLAMP(b.s32[1], -32768, 32767);
  t.s16[0] = SIMD_CLAMP(b.s32[0], -32768, 32767);
  return t;
}

SIMD_INLINE c_v64 c_v64_pack_s32_u16(c_v64 a, c_v64 b) {
  c_v64 t;
  if (CONFIG_BIG_ENDIAN) {
    c_v64 u = a;
    a = b;
    b = u;
  }
  t.u16[3] = SIMD_CLAMP(a.s32[1], 0, 65535);
  t.u16[2] = SIMD_CLAMP(a.s32[0], 0, 65535);
  t.u16[1] = SIMD_CLAMP(b.s32[1], 0, 65535);
  t.u16[0] = SIMD_CLAMP(b.s32[0], 0, 65535);
  return t;
}

SIMD_INLINE c_v64 c_v64_pack_s16_u8(c_v64 a, c_v64 b) {
  c_v64 t;
  if (CONFIG_BIG_ENDIAN) {
    c_v64 u = a;
    a = b;
    b = u;
  }
  t.u8[7] = SIMD_CLAMP(a.s16[3], 0, 255);
  t.u8[6] = SIMD_CLAMP(a.s16[2], 0, 255);
  t.u8[5] = SIMD_CLAMP(a.s16[1], 0, 255);
  t.u8[4] = SIMD_CLAMP(a.s16[0], 0, 255);
  t.u8[3] = SIMD_CLAMP(b.s16[3], 0, 255);
  t.u8[2] = SIMD_CLAMP(b.s16[2], 0, 255);
  t.u8[1] = SIMD_CLAMP(b.s16[1], 0, 255);
  t.u8[0] = SIMD_CLAMP(b.s16[0], 0, 255);
  return t;
}

SIMD_INLINE c_v64 c_v64_pack_s16_s8(c_v64 a, c_v64 b) {
  c_v64 t;
  if (CONFIG_BIG_ENDIAN) {
    c_v64 u = a;
    a = b;
    b = u;
  }
  t.s8[7] = SIMD_CLAMP(a.s16[3], -128, 127);
  t.s8[6] = SIMD_CLAMP(a.s16[2], -128, 127);
  t.s8[5] = SIMD_CLAMP(a.s16[1], -128, 127);
  t.s8[4] = SIMD_CLAMP(a.s16[0], -128, 127);
  t.s8[3] = SIMD_CLAMP(b.s16[3], -128, 127);
  t.s8[2] = SIMD_CLAMP(b.s16[2], -128, 127);
  t.s8[1] = SIMD_CLAMP(b.s16[1], -128, 127);
  t.s8[0] = SIMD_CLAMP(b.s16[0], -128, 127);
  return t;
}

SIMD_INLINE c_v64 c_v64_unpacklo_u16_s32(c_v64 a) {
  c_v64 t;
  t.s32[1] = a.u16[1 + !!CONFIG_BIG_ENDIAN * 2];
  t.s32[0] = a.u16[0 + !!CONFIG_BIG_ENDIAN * 2];
  return t;
}

SIMD_INLINE c_v64 c_v64_unpacklo_s16_s32(c_v64 a) {
  c_v64 t;
  t.s32[1] = a.s16[1 + !!CONFIG_BIG_ENDIAN * 2];
  t.s32[0] = a.s16[0 + !!CONFIG_BIG_ENDIAN * 2];
  return t;
}

SIMD_INLINE c_v64 c_v64_unpackhi_u16_s32(c_v64 a) {
  c_v64 t;
  t.s32[1] = a.u16[3 - !!CONFIG_BIG_ENDIAN * 2];
  t.s32[0] = a.u16[2 - !!CONFIG_BIG_ENDIAN * 2];
  return t;
}

SIMD_INLINE c_v64 c_v64_unpackhi_s16_s32(c_v64 a) {
  c_v64 t;
  t.s32[1] = a.s16[3 - !!CONFIG_BIG_ENDIAN * 2];
  t.s32[0] = a.s16[2 - !!CONFIG_BIG_ENDIAN * 2];
  return t;
}

SIMD_INLINE c_v64 c_v64_shuffle_8(c_v64 a, c_v64 pattern) {
  c_v64 t;
  int c;
  for (c = 0; c < 8; c++) {
    if (SIMD_CHECK && (pattern.u8[c] & ~7)) {
      fprintf(stderr, "Error: Undefined v64_shuffle_8 index %d/%d\n",
              pattern.u8[c], c);
      abort();
    }
    t.u8[c] =
        a.u8[CONFIG_BIG_ENDIAN ? 7 - (pattern.u8[c] & 7) : pattern.u8[c] & 7];
  }
  return t;
}
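
/* Editor's note (illustrative, not part of the original header): lane n of
   the result, counting from the least significant byte, is lane pattern[n]
   of a; indices outside 0..7 abort when SIMD_CHECK is enabled. For example,
   with some c_v64 value a, a byte reversal can be written as

     c_v64 rev = c_v64_shuffle_8(a, c_v64_from_64(0x0001020304050607ULL));
*/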

SIMD_INLINE int64_t c_v64_dotp_su8(c_v64 a, c_v64 b) {
  return a.s8[7] * b.u8[7] + a.s8[6] * b.u8[6] + a.s8[5] * b.u8[5] +
         a.s8[4] * b.u8[4] + a.s8[3] * b.u8[3] + a.s8[2] * b.u8[2] +
         a.s8[1] * b.u8[1] + a.s8[0] * b.u8[0];
}

SIMD_INLINE int64_t c_v64_dotp_s16(c_v64 a, c_v64 b) {
  return (int64_t)(a.s16[3] * b.s16[3] + a.s16[2] * b.s16[2]) +
         (int64_t)(a.s16[1] * b.s16[1] + a.s16[0] * b.s16[0]);
}

SIMD_INLINE uint64_t c_v64_hadd_u8(c_v64 a) {
  return a.u8[7] + a.u8[6] + a.u8[5] + a.u8[4] + a.u8[3] + a.u8[2] + a.u8[1] +
         a.u8[0];
}

SIMD_INLINE int64_t c_v64_hadd_s16(c_v64 a) {
  return a.s16[3] + a.s16[2] + a.s16[1] + a.s16[0];
}

typedef struct {
  uint32_t val;
  int count;
} c_sad64_internal;

SIMD_INLINE c_sad64_internal c_v64_sad_u8_init(void) {
  c_sad64_internal t;
  t.val = t.count = 0;
  return t;
}

/* Implementation dependent return value. Result must be finalised with
   v64_sad_u8_sum(). The result for more than 32 v64_sad_u8() calls is
   undefined. */
SIMD_INLINE c_sad64_internal c_v64_sad_u8(c_sad64_internal s, c_v64 a,
                                          c_v64 b) {
  int c;
  for (c = 0; c < 8; c++)
    s.val += a.u8[c] > b.u8[c] ? a.u8[c] - b.u8[c] : b.u8[c] - a.u8[c];
  s.count++;
  if (SIMD_CHECK && s.count > 32) {
    fprintf(stderr,
            "Error: sad called more than 32 times; result is undefined\n");
    abort();
  }
  return s;
}

SIMD_INLINE uint32_t c_v64_sad_u8_sum(c_sad64_internal s) { return s.val; }
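
/* Usage sketch (editor's illustration, not part of the original header):
   a SAD over an 8-pixel-wide block of up to 32 rows, assuming hypothetical
   row pointers src/ref and strides src_stride/ref_stride:

     c_sad64_internal acc = c_v64_sad_u8_init();
     int i;
     for (i = 0; i < h; i++)
       acc = c_v64_sad_u8(acc, c_v64_load_unaligned(src + i * src_stride),
                          c_v64_load_unaligned(ref + i * ref_stride));
     uint32_t sad = c_v64_sad_u8_sum(acc);
*/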

typedef uint32_t c_ssd64_internal;

/* Implementation dependent return value. Result must be finalised with
 * v64_ssd_u8_sum(). */
SIMD_INLINE c_ssd64_internal c_v64_ssd_u8_init(void) { return 0; }

SIMD_INLINE c_ssd64_internal c_v64_ssd_u8(c_ssd64_internal s, c_v64 a,
                                          c_v64 b) {
  int c;
  for (c = 0; c < 8; c++) s += (a.u8[c] - b.u8[c]) * (a.u8[c] - b.u8[c]);
  return s;
}

SIMD_INLINE uint32_t c_v64_ssd_u8_sum(c_ssd64_internal s) { return s; }
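
/* Editor's note: the SSD helpers above follow the same
   init/accumulate/finalise pattern as the SAD helpers, e.g. (sketch, with a
   and b standing in for arbitrary 8-byte rows):

     c_ssd64_internal s = c_v64_ssd_u8_init();
     s = c_v64_ssd_u8(s, a, b);
     uint32_t ssd = c_v64_ssd_u8_sum(s);
*/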

SIMD_INLINE c_v64 c_v64_or(c_v64 a, c_v64 b) {
  c_v64 t;
  t.u64 = a.u64 | b.u64;
  return t;
}

SIMD_INLINE c_v64 c_v64_xor(c_v64 a, c_v64 b) {
  c_v64 t;
  t.u64 = a.u64 ^ b.u64;
  return t;
}

SIMD_INLINE c_v64 c_v64_and(c_v64 a, c_v64 b) {
  c_v64 t;
  t.u64 = a.u64 & b.u64;
  return t;
}

SIMD_INLINE c_v64 c_v64_andn(c_v64 a, c_v64 b) {
  c_v64 t;
  t.u64 = a.u64 & ~b.u64;
  return t;
}

SIMD_INLINE c_v64 c_v64_mullo_s16(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 4; c++) t.s16[c] = (int16_t)(a.s16[c] * b.s16[c]);
  return t;
}

SIMD_INLINE c_v64 c_v64_mulhi_s16(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 4; c++) t.s16[c] = (a.s16[c] * b.s16[c]) >> 16;
  return t;
}

SIMD_INLINE c_v64 c_v64_mullo_s32(c_v64 a, c_v64 b) {
  c_v64 t;
  t.s32[0] = (int32_t)((int64_t)a.s32[0] * b.s32[0]);
  t.s32[1] = (int32_t)((int64_t)a.s32[1] * b.s32[1]);
  return t;
}

SIMD_INLINE c_v64 c_v64_madd_s16(c_v64 a, c_v64 b) {
  c_v64 t;
  t.s32[0] = a.s16[0] * b.s16[0] + a.s16[1] * b.s16[1];
  t.s32[1] = a.s16[2] * b.s16[2] + a.s16[3] * b.s16[3];
  return t;
}

SIMD_INLINE c_v64 c_v64_madd_us8(c_v64 a, c_v64 b) {
  c_v64 t;
  int32_t u;
  u = a.u8[0] * b.s8[0] + a.u8[1] * b.s8[1];
  t.s16[0] = SIMD_CLAMP(u, -32768, 32767);
  u = a.u8[2] * b.s8[2] + a.u8[3] * b.s8[3];
  t.s16[1] = SIMD_CLAMP(u, -32768, 32767);
  u = a.u8[4] * b.s8[4] + a.u8[5] * b.s8[5];
  t.s16[2] = SIMD_CLAMP(u, -32768, 32767);
  u = a.u8[6] * b.s8[6] + a.u8[7] * b.s8[7];
  t.s16[3] = SIMD_CLAMP(u, -32768, 32767);
  return t;
}

SIMD_INLINE c_v64 c_v64_avg_u8(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 8; c++) t.u8[c] = (a.u8[c] + b.u8[c] + 1) >> 1;
  return t;
}

SIMD_INLINE c_v64 c_v64_rdavg_u8(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 8; c++) t.u8[c] = (a.u8[c] + b.u8[c]) >> 1;
  return t;
}

SIMD_INLINE c_v64 c_v64_rdavg_u16(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 4; c++) t.u16[c] = (a.u16[c] + b.u16[c]) >> 1;
  return t;
}

SIMD_INLINE c_v64 c_v64_avg_u16(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 4; c++) t.u16[c] = (a.u16[c] + b.u16[c] + 1) >> 1;
  return t;
}

SIMD_INLINE c_v64 c_v64_min_u8(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 8; c++) t.u8[c] = a.u8[c] > b.u8[c] ? b.u8[c] : a.u8[c];
  return t;
}

SIMD_INLINE c_v64 c_v64_max_u8(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 8; c++) t.u8[c] = a.u8[c] > b.u8[c] ? a.u8[c] : b.u8[c];
  return t;
}

SIMD_INLINE c_v64 c_v64_min_s8(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 8; c++) t.s8[c] = a.s8[c] > b.s8[c] ? b.s8[c] : a.s8[c];
  return t;
}

SIMD_INLINE c_v64 c_v64_max_s8(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 8; c++) t.s8[c] = a.s8[c] > b.s8[c] ? a.s8[c] : b.s8[c];
  return t;
}

SIMD_INLINE c_v64 c_v64_min_s16(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 4; c++) t.s16[c] = a.s16[c] > b.s16[c] ? b.s16[c] : a.s16[c];
  return t;
}

SIMD_INLINE c_v64 c_v64_max_s16(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 4; c++) t.s16[c] = a.s16[c] > b.s16[c] ? a.s16[c] : b.s16[c];
  return t;
}

SIMD_INLINE c_v64 c_v64_cmpgt_s8(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 8; c++) t.s8[c] = -(a.s8[c] > b.s8[c]);
  return t;
}

SIMD_INLINE c_v64 c_v64_cmplt_s8(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 8; c++) t.s8[c] = -(a.s8[c] < b.s8[c]);
  return t;
}

SIMD_INLINE c_v64 c_v64_cmpeq_8(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 8; c++) t.s8[c] = -(a.u8[c] == b.u8[c]);
  return t;
}

SIMD_INLINE c_v64 c_v64_cmpgt_s16(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 4; c++) t.s16[c] = -(a.s16[c] > b.s16[c]);
  return t;
}

SIMD_INLINE c_v64 c_v64_cmplt_s16(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 4; c++) t.s16[c] = -(a.s16[c] < b.s16[c]);
  return t;
}

SIMD_INLINE c_v64 c_v64_cmpeq_16(c_v64 a, c_v64 b) {
  c_v64 t;
  int c;
  for (c = 0; c < 4; c++) t.s16[c] = -(a.u16[c] == b.u16[c]);
  return t;
}

SIMD_INLINE c_v64 c_v64_shl_8(c_v64 a, unsigned int n) {
  c_v64 t;
  int c;
  if (SIMD_CHECK && n > 7) {
    fprintf(stderr, "Error: Undefined u8 shift left %d\n", n);
    abort();
  }
  for (c = 0; c < 8; c++) t.s8[c] = (int8_t)(a.u8[c] << n);
  return t;
}

SIMD_INLINE c_v64 c_v64_shr_u8(c_v64 a, unsigned int n) {
  c_v64 t;
  int c;
  if (SIMD_CHECK && n > 7) {
    fprintf(stderr, "Error: Undefined u8 shift right %d\n", n);
    abort();
  }
  for (c = 0; c < 8; c++) t.u8[c] = a.u8[c] >> n;
  return t;
}

SIMD_INLINE c_v64 c_v64_shr_s8(c_v64 a, unsigned int n) {
  c_v64 t;
  int c;
  if (SIMD_CHECK && n > 7) {
    fprintf(stderr, "Error: Undefined s8 shift right %d\n", n);
    abort();
  }
  for (c = 0; c < 8; c++) t.s8[c] = a.s8[c] >> n;
  return t;
}

SIMD_INLINE c_v64 c_v64_shl_16(c_v64 a, unsigned int n) {
  c_v64 t;
  int c;
  if (SIMD_CHECK && n > 15) {
    fprintf(stderr, "Error: Undefined u16 shift left %d\n", n);
    abort();
  }
  for (c = 0; c < 4; c++) t.u16[c] = (uint16_t)(a.u16[c] << n);
  return t;
}

SIMD_INLINE c_v64 c_v64_shr_u16(c_v64 a, unsigned int n) {
  c_v64 t;
  int c;
  if (SIMD_CHECK && n > 15) {
    fprintf(stderr, "Error: Undefined u16 shift right %d\n", n);
    abort();
  }
  for (c = 0; c < 4; c++) t.u16[c] = a.u16[c] >> n;
  return t;
}

SIMD_INLINE c_v64 c_v64_shr_s16(c_v64 a, unsigned int n) {
  c_v64 t;
  int c;
  if (SIMD_CHECK && n > 15) {
    fprintf(stderr, "Error: undefined s16 shift right %d\n", n);
    abort();
  }
  for (c = 0; c < 4; c++) t.s16[c] = a.s16[c] >> n;
  return t;
}

SIMD_INLINE c_v64 c_v64_shl_32(c_v64 a, unsigned int n) {
  c_v64 t;
  if (SIMD_CHECK && n > 31) {
    fprintf(stderr, "Error: undefined u32 shift left %d\n", n);
    abort();
  }
  t.u32[1] = a.u32[1] << n;
  t.u32[0] = a.u32[0] << n;
  return t;
}

SIMD_INLINE c_v64 c_v64_shr_u32(c_v64 a, unsigned int n) {
  c_v64 t;
  if (SIMD_CHECK && n > 31) {
    fprintf(stderr, "Error: undefined u32 shift right %d\n", n);
    abort();
  }
  t.u32[1] = a.u32[1] >> n;
  t.u32[0] = a.u32[0] >> n;
  return t;
}

SIMD_INLINE c_v64 c_v64_shr_s32(c_v64 a, unsigned int n) {
  c_v64 t;
  if (SIMD_CHECK && n > 31) {
    fprintf(stderr, "Error: undefined s32 shift right %d\n", n);
    abort();
  }
  t.s32[1] = a.s32[1] >> n;
  t.s32[0] = a.s32[0] >> n;
  return t;
}

SIMD_INLINE c_v64 c_v64_shr_n_byte(c_v64 x, unsigned int i) {
  c_v64 t;
  t.u64 = x.u64 >> i * 8;
  return t;
}

SIMD_INLINE c_v64 c_v64_shl_n_byte(c_v64 x, unsigned int i) {
  c_v64 t;
  t.u64 = x.u64 << i * 8;
  return t;
}

SIMD_INLINE c_v64 c_v64_align(c_v64 a, c_v64 b, unsigned int c) {
  if (SIMD_CHECK && c > 7) {
    fprintf(stderr, "Error: undefined alignment %d\n", c);
    abort();
  }
  return c ? c_v64_or(c_v64_shr_n_byte(b, c), c_v64_shl_n_byte(a, 8 - c)) : b;
}
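
/* Editor's note (illustrative, not part of the original header): value-wise,
   c_v64_align(a, b, c) extracts 8 contiguous bytes from the pair a:b,
   starting c bytes above the least significant byte of b, similar in spirit
   to SSSE3 palignr or NEON vext. For example:

     c_v64 lo = c_v64_from_64(0x0807060504030201ULL);
     c_v64 hi = c_v64_from_64(0x100f0e0d0c0b0a09ULL);

   gives c_v64_u64(c_v64_align(hi, lo, 2)) == 0x0a09080706050403. */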

SIMD_INLINE c_v64 c_v64_shl_n_8(c_v64 a, unsigned int c) {
  return c_v64_shl_8(a, c);
}

SIMD_INLINE c_v64 c_v64_shr_n_u8(c_v64 a, unsigned int c) {
  return c_v64_shr_u8(a, c);
}

SIMD_INLINE c_v64 c_v64_shr_n_s8(c_v64 a, unsigned int c) {
  return c_v64_shr_s8(a, c);
}

SIMD_INLINE c_v64 c_v64_shl_n_16(c_v64 a, unsigned int c) {
  return c_v64_shl_16(a, c);
}

SIMD_INLINE c_v64 c_v64_shr_n_u16(c_v64 a, unsigned int c) {
  return c_v64_shr_u16(a, c);
}

SIMD_INLINE c_v64 c_v64_shr_n_s16(c_v64 a, unsigned int c) {
  return c_v64_shr_s16(a, c);
}

SIMD_INLINE c_v64 c_v64_shl_n_32(c_v64 a, unsigned int c) {
  return c_v64_shl_32(a, c);
}

SIMD_INLINE c_v64 c_v64_shr_n_u32(c_v64 a, unsigned int c) {
  return c_v64_shr_u32(a, c);
}

SIMD_INLINE c_v64 c_v64_shr_n_s32(c_v64 a, unsigned int c) {
  return c_v64_shr_s32(a, c);
}

#endif  // AOM_AOM_DSP_SIMD_V64_INTRINSICS_C_H_