/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#ifndef AOM_AOM_DSP_SIMD_V128_INTRINSICS_C_H_
#define AOM_AOM_DSP_SIMD_V128_INTRINSICS_C_H_

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "config/aom_config.h"

#include "aom_dsp/simd/v64_intrinsics_c.h"

typedef union {
  uint8_t u8[16];
  uint16_t u16[8];
  uint32_t u32[4];
  uint64_t u64[2];
  int8_t s8[16];
  int16_t s16[8];
  int32_t s32[4];
  int64_t s64[2];
  c_v64 v64[2];
} c_v128;
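
/* Note on lane order, summarising the accessors below (not part of the
 * original API comments): index 0 is always the least significant lane, so
 * v64[0]/u64[0] hold the low 64 bits and v64[1]/u64[1] the high 64 bits,
 * matching c_v128_low_v64(), c_v128_high_v64() and c_v128_from_64(hi, lo). */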

SIMD_INLINE uint32_t c_v128_low_u32(c_v128 a) { return a.u32[0]; }

SIMD_INLINE c_v64 c_v128_low_v64(c_v128 a) { return a.v64[0]; }

SIMD_INLINE c_v64 c_v128_high_v64(c_v128 a) { return a.v64[1]; }

SIMD_INLINE c_v128 c_v128_from_64(uint64_t hi, uint64_t lo) {
  c_v128 t;
  t.u64[1] = hi;
  t.u64[0] = lo;
  return t;
}

SIMD_INLINE c_v128 c_v128_from_v64(c_v64 hi, c_v64 lo) {
  c_v128 t;
  t.v64[1] = hi;
  t.v64[0] = lo;
  return t;
}

SIMD_INLINE c_v128 c_v128_from_32(uint32_t a, uint32_t b, uint32_t c,
                                  uint32_t d) {
  c_v128 t;
  t.u32[3] = a;
  t.u32[2] = b;
  t.u32[1] = c;
  t.u32[0] = d;
  return t;
}

SIMD_INLINE c_v128 c_v128_load_unaligned(const void *p) {
  c_v128 t;
  memcpy(&t, p, 16);
  return t;
}

SIMD_INLINE c_v128 c_v128_load_aligned(const void *p) {
  if (SIMD_CHECK && (uintptr_t)p & 15) {
    fprintf(stderr, "Error: unaligned v128 load at %p\n", p);
    abort();
  }
  return c_v128_load_unaligned(p);
}

SIMD_INLINE void c_v128_store_unaligned(void *p, c_v128 a) {
  memcpy(p, &a, 16);
}

SIMD_INLINE void c_v128_store_aligned(void *p, c_v128 a) {
  if (SIMD_CHECK && (uintptr_t)p & 15) {
    fprintf(stderr, "Error: unaligned v128 store at %p\n", p);
    abort();
  }
  c_v128_store_unaligned(p, a);
}

SIMD_INLINE c_v128 c_v128_zero(void) {
  c_v128 t;
  t.u64[1] = t.u64[0] = 0;
  return t;
}

SIMD_INLINE c_v128 c_v128_dup_8(uint8_t x) {
  c_v128 t;
  t.v64[1] = t.v64[0] = c_v64_dup_8(x);
  return t;
}

SIMD_INLINE c_v128 c_v128_dup_16(uint16_t x) {
  c_v128 t;
  t.v64[1] = t.v64[0] = c_v64_dup_16(x);
  return t;
}

SIMD_INLINE c_v128 c_v128_dup_32(uint32_t x) {
  c_v128 t;
  t.v64[1] = t.v64[0] = c_v64_dup_32(x);
  return t;
}

SIMD_INLINE c_v128 c_v128_dup_64(uint64_t x) {
  c_v128 t;
  t.u64[1] = t.u64[0] = x;
  return t;
}

SIMD_INLINE int64_t c_v128_dotp_su8(c_v128 a, c_v128 b) {
  return c_v64_dotp_su8(a.v64[1], b.v64[1]) +
         c_v64_dotp_su8(a.v64[0], b.v64[0]);
}

SIMD_INLINE int64_t c_v128_dotp_s16(c_v128 a, c_v128 b) {
  return c_v64_dotp_s16(a.v64[1], b.v64[1]) +
         c_v64_dotp_s16(a.v64[0], b.v64[0]);
}

SIMD_INLINE int64_t c_v128_dotp_s32(c_v128 a, c_v128 b) {
  // 32 bit products, 64 bit sum
  return (int64_t)(int32_t)((int64_t)a.s32[3] * b.s32[3]) +
         (int64_t)(int32_t)((int64_t)a.s32[2] * b.s32[2]) +
         (int64_t)(int32_t)((int64_t)a.s32[1] * b.s32[1]) +
         (int64_t)(int32_t)((int64_t)a.s32[0] * b.s32[0]);
}

SIMD_INLINE uint64_t c_v128_hadd_u8(c_v128 a) {
  return c_v64_hadd_u8(a.v64[1]) + c_v64_hadd_u8(a.v64[0]);
}

typedef struct {
  uint32_t val;
  int count;
} c_sad128_internal;

SIMD_INLINE c_sad128_internal c_v128_sad_u8_init(void) {
  c_sad128_internal t;
  t.val = t.count = 0;
  return t;
}

/* Implementation dependent return value. Result must be finalised with
 * v128_sad_u8_sum(). The result for more than 32 v128_sad_u8() calls is
 * undefined. */
SIMD_INLINE c_sad128_internal c_v128_sad_u8(c_sad128_internal s, c_v128 a,
                                            c_v128 b) {
  int c;
  for (c = 0; c < 16; c++)
    s.val += a.u8[c] > b.u8[c] ? a.u8[c] - b.u8[c] : b.u8[c] - a.u8[c];
  s.count++;
  if (SIMD_CHECK && s.count > 32) {
    fprintf(stderr,
            "Error: sad called more than 32 times, result is undefined\n");
    abort();
  }
  return s;
}

SIMD_INLINE uint32_t c_v128_sad_u8_sum(c_sad128_internal s) { return s.val; }
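
/* A minimal usage sketch of the SAD accumulator protocol above (illustrative
 * only; `rows`, `src` and `ref` are hypothetical caller-side names, not part
 * of this API):
 *
 *   c_sad128_internal acc = c_v128_sad_u8_init();
 *   for (int i = 0; i < rows; i++)  // at most 32 accumulations
 *     acc = c_v128_sad_u8(acc, c_v128_load_unaligned(src + 16 * i),
 *                         c_v128_load_unaligned(ref + 16 * i));
 *   uint32_t sad = c_v128_sad_u8_sum(acc);
 */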

typedef uint32_t c_ssd128_internal;

SIMD_INLINE c_ssd128_internal c_v128_ssd_u8_init(void) { return 0; }

/* Implementation dependent return value. Result must be finalised with
 * v128_ssd_u8_sum(). */
SIMD_INLINE c_ssd128_internal c_v128_ssd_u8(c_ssd128_internal s, c_v128 a,
                                            c_v128 b) {
  int c;
  for (c = 0; c < 16; c++) s += (a.u8[c] - b.u8[c]) * (a.u8[c] - b.u8[c]);
  return s;
}

SIMD_INLINE uint32_t c_v128_ssd_u8_sum(c_ssd128_internal s) { return s; }

SIMD_INLINE c_v128 c_v128_or(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_or(a.v64[1], b.v64[1]),
                         c_v64_or(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_xor(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_xor(a.v64[1], b.v64[1]),
                         c_v64_xor(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_and(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_and(a.v64[1], b.v64[1]),
                         c_v64_and(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_andn(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_andn(a.v64[1], b.v64[1]),
                         c_v64_andn(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_add_8(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_add_8(a.v64[1], b.v64[1]),
                         c_v64_add_8(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_add_16(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_add_16(a.v64[1], b.v64[1]),
                         c_v64_add_16(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_sadd_u8(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_sadd_u8(a.v64[1], b.v64[1]),
                         c_v64_sadd_u8(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_sadd_s8(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_sadd_s8(a.v64[1], b.v64[1]),
                         c_v64_sadd_s8(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_sadd_s16(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_sadd_s16(a.v64[1], b.v64[1]),
                         c_v64_sadd_s16(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_add_32(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_add_32(a.v64[1], b.v64[1]),
                         c_v64_add_32(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_add_64(c_v128 a, c_v128 b) {
  // Two's complement overflow (silences sanitizers)
  return c_v128_from_64(
      a.v64[1].u64 > ~b.v64[1].u64 ? a.v64[1].u64 - ~b.v64[1].u64 - 1
                                   : a.v64[1].u64 + b.v64[1].u64,
      a.v64[0].u64 > ~b.v64[0].u64 ? a.v64[0].u64 - ~b.v64[0].u64 - 1
                                   : a.v64[0].u64 + b.v64[0].u64);
}

SIMD_INLINE c_v128 c_v128_padd_s16(c_v128 a) {
  c_v128 t;
  t.s32[0] = (int32_t)a.s16[0] + (int32_t)a.s16[1];
  t.s32[1] = (int32_t)a.s16[2] + (int32_t)a.s16[3];
  t.s32[2] = (int32_t)a.s16[4] + (int32_t)a.s16[5];
  t.s32[3] = (int32_t)a.s16[6] + (int32_t)a.s16[7];
  return t;
}

SIMD_INLINE c_v128 c_v128_padd_u8(c_v128 a) {
  c_v128 t;
  t.u16[0] = (uint16_t)a.u8[0] + (uint16_t)a.u8[1];
  t.u16[1] = (uint16_t)a.u8[2] + (uint16_t)a.u8[3];
  t.u16[2] = (uint16_t)a.u8[4] + (uint16_t)a.u8[5];
  t.u16[3] = (uint16_t)a.u8[6] + (uint16_t)a.u8[7];
  t.u16[4] = (uint16_t)a.u8[8] + (uint16_t)a.u8[9];
  t.u16[5] = (uint16_t)a.u8[10] + (uint16_t)a.u8[11];
  t.u16[6] = (uint16_t)a.u8[12] + (uint16_t)a.u8[13];
  t.u16[7] = (uint16_t)a.u8[14] + (uint16_t)a.u8[15];
  return t;
}

SIMD_INLINE c_v128 c_v128_sub_8(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_sub_8(a.v64[1], b.v64[1]),
                         c_v64_sub_8(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_ssub_u8(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_ssub_u8(a.v64[1], b.v64[1]),
                         c_v64_ssub_u8(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_ssub_s8(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_ssub_s8(a.v64[1], b.v64[1]),
                         c_v64_ssub_s8(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_sub_16(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_sub_16(a.v64[1], b.v64[1]),
                         c_v64_sub_16(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_ssub_s16(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_ssub_s16(a.v64[1], b.v64[1]),
                         c_v64_ssub_s16(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_ssub_u16(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_ssub_u16(a.v64[1], b.v64[1]),
                         c_v64_ssub_u16(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_sub_32(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_sub_32(a.v64[1], b.v64[1]),
                         c_v64_sub_32(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_sub_64(c_v128 a, c_v128 b) {
  // Two's complement underflow (silences sanitizers)
  return c_v128_from_64(
      a.v64[1].u64 < b.v64[1].u64 ? a.v64[1].u64 + ~b.v64[1].u64 + 1
                                  : a.v64[1].u64 - b.v64[1].u64,
      a.v64[0].u64 < b.v64[0].u64 ? a.v64[0].u64 + ~b.v64[0].u64 + 1
                                  : a.v64[0].u64 - b.v64[0].u64);
}

SIMD_INLINE c_v128 c_v128_abs_s16(c_v128 a) {
  return c_v128_from_v64(c_v64_abs_s16(a.v64[1]), c_v64_abs_s16(a.v64[0]));
}

SIMD_INLINE c_v128 c_v128_abs_s8(c_v128 a) {
  return c_v128_from_v64(c_v64_abs_s8(a.v64[1]), c_v64_abs_s8(a.v64[0]));
}

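/* Widening multiply (a descriptive note added here, not original
 * documentation): each signed 16-bit lane of a is multiplied by the
 * corresponding lane of b, and the full 32-bit products are assembled by
 * interleaving the low and high halves of the products computed below. */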
SIMD_INLINE c_v128 c_v128_mul_s16(c_v64 a, c_v64 b) {
  c_v64 lo_bits = c_v64_mullo_s16(a, b);
  c_v64 hi_bits = c_v64_mulhi_s16(a, b);
  return c_v128_from_v64(c_v64_ziphi_16(hi_bits, lo_bits),
                         c_v64_ziplo_16(hi_bits, lo_bits));
}

SIMD_INLINE c_v128 c_v128_mullo_s16(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_mullo_s16(a.v64[1], b.v64[1]),
                         c_v64_mullo_s16(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_mulhi_s16(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_mulhi_s16(a.v64[1], b.v64[1]),
                         c_v64_mulhi_s16(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_mullo_s32(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_mullo_s32(a.v64[1], b.v64[1]),
                         c_v64_mullo_s32(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_madd_s16(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_madd_s16(a.v64[1], b.v64[1]),
                         c_v64_madd_s16(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_madd_us8(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_madd_us8(a.v64[1], b.v64[1]),
                         c_v64_madd_us8(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_avg_u8(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_avg_u8(a.v64[1], b.v64[1]),
                         c_v64_avg_u8(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_rdavg_u8(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_rdavg_u8(a.v64[1], b.v64[1]),
                         c_v64_rdavg_u8(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_rdavg_u16(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_rdavg_u16(a.v64[1], b.v64[1]),
                         c_v64_rdavg_u16(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_avg_u16(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_avg_u16(a.v64[1], b.v64[1]),
                         c_v64_avg_u16(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_min_u8(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_min_u8(a.v64[1], b.v64[1]),
                         c_v64_min_u8(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_max_u8(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_max_u8(a.v64[1], b.v64[1]),
                         c_v64_max_u8(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_min_s8(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_min_s8(a.v64[1], b.v64[1]),
                         c_v64_min_s8(a.v64[0], b.v64[0]));
}

SIMD_INLINE uint32_t c_v128_movemask_8(c_v128 a) {
  return ((a.s8[15] < 0) << 15) | ((a.s8[14] < 0) << 14) |
         ((a.s8[13] < 0) << 13) | ((a.s8[12] < 0) << 12) |
         ((a.s8[11] < 0) << 11) | ((a.s8[10] < 0) << 10) |
         ((a.s8[9] < 0) << 9) | ((a.s8[8] < 0) << 8) | ((a.s8[7] < 0) << 7) |
         ((a.s8[6] < 0) << 6) | ((a.s8[5] < 0) << 5) | ((a.s8[4] < 0) << 4) |
         ((a.s8[3] < 0) << 3) | ((a.s8[2] < 0) << 2) | ((a.s8[1] < 0) << 1) |
         ((a.s8[0] < 0) << 0);
}

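/* Byte-wise select (a descriptive note added here, not original
 * documentation): each result byte is taken from b where the corresponding
 * control byte in c has its sign bit set, and from a otherwise. */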
SIMD_INLINE c_v128 c_v128_blend_8(c_v128 a, c_v128 b, c_v128 c) {
  c_v128 t;
  for (int i = 0; i < 16; i++) t.u8[i] = c.s8[i] < 0 ? b.u8[i] : a.u8[i];
  return t;
}

SIMD_INLINE c_v128 c_v128_max_s8(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_max_s8(a.v64[1], b.v64[1]),
                         c_v64_max_s8(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_min_s16(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_min_s16(a.v64[1], b.v64[1]),
                         c_v64_min_s16(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_max_s16(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_max_s16(a.v64[1], b.v64[1]),
                         c_v64_max_s16(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_max_s32(c_v128 a, c_v128 b) {
  c_v128 t;
  int c;
  for (c = 0; c < 4; c++) t.s32[c] = a.s32[c] > b.s32[c] ? a.s32[c] : b.s32[c];
  return t;
}

SIMD_INLINE c_v128 c_v128_min_s32(c_v128 a, c_v128 b) {
  c_v128 t;
  int c;
  for (c = 0; c < 4; c++) t.s32[c] = a.s32[c] > b.s32[c] ? b.s32[c] : a.s32[c];
  return t;
}

SIMD_INLINE c_v128 c_v128_ziplo_8(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_ziphi_8(a.v64[0], b.v64[0]),
                         c_v64_ziplo_8(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_ziphi_8(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_ziphi_8(a.v64[1], b.v64[1]),
                         c_v64_ziplo_8(a.v64[1], b.v64[1]));
}

SIMD_INLINE c_v128 c_v128_ziplo_16(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_ziphi_16(a.v64[0], b.v64[0]),
                         c_v64_ziplo_16(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_ziphi_16(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_ziphi_16(a.v64[1], b.v64[1]),
                         c_v64_ziplo_16(a.v64[1], b.v64[1]));
}

SIMD_INLINE c_v128 c_v128_ziplo_32(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_ziphi_32(a.v64[0], b.v64[0]),
                         c_v64_ziplo_32(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_ziphi_32(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_ziphi_32(a.v64[1], b.v64[1]),
                         c_v64_ziplo_32(a.v64[1], b.v64[1]));
}

SIMD_INLINE c_v128 c_v128_ziplo_64(c_v128 a, c_v128 b) {
  return c_v128_from_v64(a.v64[0], b.v64[0]);
}

SIMD_INLINE c_v128 c_v128_ziphi_64(c_v128 a, c_v128 b) {
  return c_v128_from_v64(a.v64[1], b.v64[1]);
}

SIMD_INLINE c_v128 c_v128_zip_8(c_v64 a, c_v64 b) {
  return c_v128_from_v64(c_v64_ziphi_8(a, b), c_v64_ziplo_8(a, b));
}

SIMD_INLINE c_v128 c_v128_zip_16(c_v64 a, c_v64 b) {
  return c_v128_from_v64(c_v64_ziphi_16(a, b), c_v64_ziplo_16(a, b));
}

SIMD_INLINE c_v128 c_v128_zip_32(c_v64 a, c_v64 b) {
  return c_v128_from_v64(c_v64_ziphi_32(a, b), c_v64_ziplo_32(a, b));
}

SIMD_INLINE c_v128 _c_v128_unzip_8(c_v128 a, c_v128 b, int mode) {
  c_v128 t;
  if (mode) {
    t.u8[15] = b.u8[15];
    t.u8[14] = b.u8[13];
    t.u8[13] = b.u8[11];
    t.u8[12] = b.u8[9];
    t.u8[11] = b.u8[7];
    t.u8[10] = b.u8[5];
    t.u8[9] = b.u8[3];
    t.u8[8] = b.u8[1];
    t.u8[7] = a.u8[15];
    t.u8[6] = a.u8[13];
    t.u8[5] = a.u8[11];
    t.u8[4] = a.u8[9];
    t.u8[3] = a.u8[7];
    t.u8[2] = a.u8[5];
    t.u8[1] = a.u8[3];
    t.u8[0] = a.u8[1];
  } else {
    t.u8[15] = a.u8[14];
    t.u8[14] = a.u8[12];
    t.u8[13] = a.u8[10];
    t.u8[12] = a.u8[8];
    t.u8[11] = a.u8[6];
    t.u8[10] = a.u8[4];
    t.u8[9] = a.u8[2];
    t.u8[8] = a.u8[0];
    t.u8[7] = b.u8[14];
    t.u8[6] = b.u8[12];
    t.u8[5] = b.u8[10];
    t.u8[4] = b.u8[8];
    t.u8[3] = b.u8[6];
    t.u8[2] = b.u8[4];
    t.u8[1] = b.u8[2];
    t.u8[0] = b.u8[0];
  }
  return t;
}

SIMD_INLINE c_v128 c_v128_unziplo_8(c_v128 a, c_v128 b) {
  return CONFIG_BIG_ENDIAN ? _c_v128_unzip_8(a, b, 1)
                           : _c_v128_unzip_8(a, b, 0);
}

SIMD_INLINE c_v128 c_v128_unziphi_8(c_v128 a, c_v128 b) {
  return CONFIG_BIG_ENDIAN ? _c_v128_unzip_8(b, a, 0)
                           : _c_v128_unzip_8(b, a, 1);
}

SIMD_INLINE c_v128 _c_v128_unzip_16(c_v128 a, c_v128 b, int mode) {
  c_v128 t;
  if (mode) {
    t.u16[7] = b.u16[7];
    t.u16[6] = b.u16[5];
    t.u16[5] = b.u16[3];
    t.u16[4] = b.u16[1];
    t.u16[3] = a.u16[7];
    t.u16[2] = a.u16[5];
    t.u16[1] = a.u16[3];
    t.u16[0] = a.u16[1];
  } else {
    t.u16[7] = a.u16[6];
    t.u16[6] = a.u16[4];
    t.u16[5] = a.u16[2];
    t.u16[4] = a.u16[0];
    t.u16[3] = b.u16[6];
    t.u16[2] = b.u16[4];
    t.u16[1] = b.u16[2];
    t.u16[0] = b.u16[0];
  }
  return t;
}

SIMD_INLINE c_v128 c_v128_unziplo_16(c_v128 a, c_v128 b) {
  return CONFIG_BIG_ENDIAN ? _c_v128_unzip_16(a, b, 1)
                           : _c_v128_unzip_16(a, b, 0);
}

SIMD_INLINE c_v128 c_v128_unziphi_16(c_v128 a, c_v128 b) {
  return CONFIG_BIG_ENDIAN ? _c_v128_unzip_16(b, a, 0)
                           : _c_v128_unzip_16(b, a, 1);
}

SIMD_INLINE c_v128 _c_v128_unzip_32(c_v128 a, c_v128 b, int mode) {
  c_v128 t;
  if (mode) {
    t.u32[3] = b.u32[3];
    t.u32[2] = b.u32[1];
    t.u32[1] = a.u32[3];
    t.u32[0] = a.u32[1];
  } else {
    t.u32[3] = a.u32[2];
    t.u32[2] = a.u32[0];
    t.u32[1] = b.u32[2];
    t.u32[0] = b.u32[0];
  }
  return t;
}

SIMD_INLINE c_v128 c_v128_unziplo_32(c_v128 a, c_v128 b) {
  return CONFIG_BIG_ENDIAN ? _c_v128_unzip_32(a, b, 1)
                           : _c_v128_unzip_32(a, b, 0);
}

SIMD_INLINE c_v128 c_v128_unziphi_32(c_v128 a, c_v128 b) {
  return CONFIG_BIG_ENDIAN ? _c_v128_unzip_32(b, a, 0)
                           : _c_v128_unzip_32(b, a, 1);
}

SIMD_INLINE c_v128 c_v128_unpack_u8_s16(c_v64 a) {
  return c_v128_from_v64(c_v64_unpackhi_u8_s16(a), c_v64_unpacklo_u8_s16(a));
}

SIMD_INLINE c_v128 c_v128_unpacklo_u8_s16(c_v128 a) {
  return c_v128_from_v64(c_v64_unpackhi_u8_s16(a.v64[0]),
                         c_v64_unpacklo_u8_s16(a.v64[0]));
}

SIMD_INLINE c_v128 c_v128_unpackhi_u8_s16(c_v128 a) {
  return c_v128_from_v64(c_v64_unpackhi_u8_s16(a.v64[1]),
                         c_v64_unpacklo_u8_s16(a.v64[1]));
}

SIMD_INLINE c_v128 c_v128_unpack_s8_s16(c_v64 a) {
  return c_v128_from_v64(c_v64_unpackhi_s8_s16(a), c_v64_unpacklo_s8_s16(a));
}

SIMD_INLINE c_v128 c_v128_unpacklo_s8_s16(c_v128 a) {
  return c_v128_from_v64(c_v64_unpackhi_s8_s16(a.v64[0]),
                         c_v64_unpacklo_s8_s16(a.v64[0]));
}

SIMD_INLINE c_v128 c_v128_unpackhi_s8_s16(c_v128 a) {
  return c_v128_from_v64(c_v64_unpackhi_s8_s16(a.v64[1]),
                         c_v64_unpacklo_s8_s16(a.v64[1]));
}

SIMD_INLINE c_v128 c_v128_pack_s32_s16(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_pack_s32_s16(a.v64[1], a.v64[0]),
                         c_v64_pack_s32_s16(b.v64[1], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_pack_s32_u16(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_pack_s32_u16(a.v64[1], a.v64[0]),
                         c_v64_pack_s32_u16(b.v64[1], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_pack_s16_u8(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_pack_s16_u8(a.v64[1], a.v64[0]),
                         c_v64_pack_s16_u8(b.v64[1], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_pack_s16_s8(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_pack_s16_s8(a.v64[1], a.v64[0]),
                         c_v64_pack_s16_s8(b.v64[1], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_unpack_u16_s32(c_v64 a) {
  return c_v128_from_v64(c_v64_unpackhi_u16_s32(a), c_v64_unpacklo_u16_s32(a));
}

SIMD_INLINE c_v128 c_v128_unpack_s16_s32(c_v64 a) {
  return c_v128_from_v64(c_v64_unpackhi_s16_s32(a), c_v64_unpacklo_s16_s32(a));
}

SIMD_INLINE c_v128 c_v128_unpacklo_u16_s32(c_v128 a) {
  return c_v128_from_v64(c_v64_unpackhi_u16_s32(a.v64[0]),
                         c_v64_unpacklo_u16_s32(a.v64[0]));
}

SIMD_INLINE c_v128 c_v128_unpacklo_s16_s32(c_v128 a) {
  return c_v128_from_v64(c_v64_unpackhi_s16_s32(a.v64[0]),
                         c_v64_unpacklo_s16_s32(a.v64[0]));
}

SIMD_INLINE c_v128 c_v128_unpackhi_u16_s32(c_v128 a) {
  return c_v128_from_v64(c_v64_unpackhi_u16_s32(a.v64[1]),
                         c_v64_unpacklo_u16_s32(a.v64[1]));
}

SIMD_INLINE c_v128 c_v128_unpackhi_s16_s32(c_v128 a) {
  return c_v128_from_v64(c_v64_unpackhi_s16_s32(a.v64[1]),
                         c_v64_unpacklo_s16_s32(a.v64[1]));
}

SIMD_INLINE c_v128 c_v128_shuffle_8(c_v128 a, c_v128 pattern) {
  c_v128 t;
  int c;
  for (c = 0; c < 16; c++)
    t.u8[c] = a.u8[CONFIG_BIG_ENDIAN ? 15 - (pattern.u8[c] & 15)
                                     : pattern.u8[c] & 15];

  return t;
}

SIMD_INLINE c_v128 c_v128_cmpgt_s8(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_cmpgt_s8(a.v64[1], b.v64[1]),
                         c_v64_cmpgt_s8(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_cmplt_s8(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_cmplt_s8(a.v64[1], b.v64[1]),
                         c_v64_cmplt_s8(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_cmpeq_8(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_cmpeq_8(a.v64[1], b.v64[1]),
                         c_v64_cmpeq_8(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_cmpgt_s16(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_cmpgt_s16(a.v64[1], b.v64[1]),
                         c_v64_cmpgt_s16(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_cmplt_s16(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_cmplt_s16(a.v64[1], b.v64[1]),
                         c_v64_cmplt_s16(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_cmpeq_16(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_cmpeq_16(a.v64[1], b.v64[1]),
                         c_v64_cmpeq_16(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_cmpgt_s32(c_v128 a, c_v128 b) {
  c_v128 t;
  int c;
  for (c = 0; c < 4; c++) t.s32[c] = -(a.s32[c] > b.s32[c]);
  return t;
}

SIMD_INLINE c_v128 c_v128_cmplt_s32(c_v128 a, c_v128 b) {
  c_v128 t;
  int c;
  for (c = 0; c < 4; c++) t.s32[c] = -(a.s32[c] < b.s32[c]);
  return t;
}

SIMD_INLINE c_v128 c_v128_cmpeq_32(c_v128 a, c_v128 b) {
  c_v128 t;
  int c;
  for (c = 0; c < 4; c++) t.s32[c] = -(a.s32[c] == b.s32[c]);
  return t;
}

SIMD_INLINE c_v128 c_v128_shl_n_byte(c_v128 a, const unsigned int n) {
  if (n == 0) return a;
  if (n < 8)
    return c_v128_from_v64(c_v64_or(c_v64_shl_n_byte(a.v64[1], n),
                                    c_v64_shr_n_byte(a.v64[0], 8 - n)),
                           c_v64_shl_n_byte(a.v64[0], n));
  else
    return c_v128_from_v64(c_v64_shl_n_byte(a.v64[0], n - 8), c_v64_zero());
}

SIMD_INLINE c_v128 c_v128_shr_n_byte(c_v128 a, const unsigned int n) {
  if (n == 0) return a;
  if (n < 8)
    return c_v128_from_v64(c_v64_shr_n_byte(a.v64[1], n),
                           c_v64_or(c_v64_shr_n_byte(a.v64[0], n),
                                    c_v64_shl_n_byte(a.v64[1], 8 - n)));
  else
    return c_v128_from_v64(c_v64_zero(), c_v64_shr_n_byte(a.v64[1], n - 8));
}

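/* A descriptive note on c_v128_align (added here, not original
 * documentation): conceptually, a and b are concatenated into a 32-byte
 * value with b as the low half, and bytes c..c+15 of that value are
 * returned, much like SSSE3 PALIGNR or NEON VEXT. */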
SIMD_INLINE c_v128 c_v128_align(c_v128 a, c_v128 b, const unsigned int c) {
  if (SIMD_CHECK && c > 15) {
    fprintf(stderr, "Error: undefined alignment %u\n", c);
    abort();
  }
  return c ? c_v128_or(c_v128_shr_n_byte(b, c), c_v128_shl_n_byte(a, 16 - c))
           : b;
}

SIMD_INLINE c_v128 c_v128_shl_8(c_v128 a, const unsigned int c) {
  return c_v128_from_v64(c_v64_shl_8(a.v64[1], c), c_v64_shl_8(a.v64[0], c));
}

SIMD_INLINE c_v128 c_v128_shr_u8(c_v128 a, const unsigned int c) {
  return c_v128_from_v64(c_v64_shr_u8(a.v64[1], c), c_v64_shr_u8(a.v64[0], c));
}

SIMD_INLINE c_v128 c_v128_shr_s8(c_v128 a, const unsigned int c) {
  return c_v128_from_v64(c_v64_shr_s8(a.v64[1], c), c_v64_shr_s8(a.v64[0], c));
}

SIMD_INLINE c_v128 c_v128_shl_16(c_v128 a, const unsigned int c) {
  return c_v128_from_v64(c_v64_shl_16(a.v64[1], c), c_v64_shl_16(a.v64[0], c));
}

SIMD_INLINE c_v128 c_v128_shr_u16(c_v128 a, const unsigned int c) {
  return c_v128_from_v64(c_v64_shr_u16(a.v64[1], c),
                         c_v64_shr_u16(a.v64[0], c));
}

SIMD_INLINE c_v128 c_v128_shr_s16(c_v128 a, const unsigned int c) {
  return c_v128_from_v64(c_v64_shr_s16(a.v64[1], c),
                         c_v64_shr_s16(a.v64[0], c));
}

SIMD_INLINE c_v128 c_v128_shl_32(c_v128 a, const unsigned int c) {
  return c_v128_from_v64(c_v64_shl_32(a.v64[1], c), c_v64_shl_32(a.v64[0], c));
}

SIMD_INLINE c_v128 c_v128_shr_u32(c_v128 a, const unsigned int c) {
  return c_v128_from_v64(c_v64_shr_u32(a.v64[1], c),
                         c_v64_shr_u32(a.v64[0], c));
}

SIMD_INLINE c_v128 c_v128_shr_s32(c_v128 a, const unsigned int c) {
  return c_v128_from_v64(c_v64_shr_s32(a.v64[1], c),
                         c_v64_shr_s32(a.v64[0], c));
}

SIMD_INLINE c_v128 c_v128_shl_64(c_v128 a, const unsigned int c) {
  a.v64[1].u64 <<= c;
  a.v64[0].u64 <<= c;
  return c_v128_from_v64(a.v64[1], a.v64[0]);
}

SIMD_INLINE c_v128 c_v128_shr_u64(c_v128 a, const unsigned int c) {
  a.v64[1].u64 >>= c;
  a.v64[0].u64 >>= c;
  return c_v128_from_v64(a.v64[1], a.v64[0]);
}

SIMD_INLINE c_v128 c_v128_shr_s64(c_v128 a, const unsigned int c) {
  a.v64[1].s64 >>= c;
  a.v64[0].s64 >>= c;
  return c_v128_from_v64(a.v64[1], a.v64[0]);
}

SIMD_INLINE c_v128 c_v128_shl_n_8(c_v128 a, const unsigned int n) {
  return c_v128_shl_8(a, n);
}

SIMD_INLINE c_v128 c_v128_shl_n_16(c_v128 a, const unsigned int n) {
  return c_v128_shl_16(a, n);
}

SIMD_INLINE c_v128 c_v128_shl_n_32(c_v128 a, const unsigned int n) {
  return c_v128_shl_32(a, n);
}

SIMD_INLINE c_v128 c_v128_shl_n_64(c_v128 a, const unsigned int n) {
  return c_v128_shl_64(a, n);
}

SIMD_INLINE c_v128 c_v128_shr_n_u8(c_v128 a, const unsigned int n) {
  return c_v128_shr_u8(a, n);
}

SIMD_INLINE c_v128 c_v128_shr_n_u16(c_v128 a, const unsigned int n) {
  return c_v128_shr_u16(a, n);
}

SIMD_INLINE c_v128 c_v128_shr_n_u32(c_v128 a, const unsigned int n) {
  return c_v128_shr_u32(a, n);
}

SIMD_INLINE c_v128 c_v128_shr_n_u64(c_v128 a, const unsigned int n) {
  return c_v128_shr_u64(a, n);
}

SIMD_INLINE c_v128 c_v128_shr_n_s8(c_v128 a, const unsigned int n) {
  return c_v128_shr_s8(a, n);
}

SIMD_INLINE c_v128 c_v128_shr_n_s16(c_v128 a, const unsigned int n) {
  return c_v128_shr_s16(a, n);
}

SIMD_INLINE c_v128 c_v128_shr_n_s32(c_v128 a, const unsigned int n) {
  return c_v128_shr_s32(a, n);
}

SIMD_INLINE c_v128 c_v128_shr_n_s64(c_v128 a, const unsigned int n) {
  return c_v128_shr_s64(a, n);
}

typedef uint32_t c_sad128_internal_u16;

SIMD_INLINE c_sad128_internal_u16 c_v128_sad_u16_init(void) { return 0; }

/* Implementation dependent return value. Result must be finalised with
 * v128_sad_u16_sum(). */
SIMD_INLINE c_sad128_internal_u16 c_v128_sad_u16(c_sad128_internal_u16 s,
                                                 c_v128 a, c_v128 b) {
  int c;
  for (c = 0; c < 8; c++)
    s += a.u16[c] > b.u16[c] ? a.u16[c] - b.u16[c] : b.u16[c] - a.u16[c];
  return s;
}

SIMD_INLINE uint32_t c_v128_sad_u16_sum(c_sad128_internal_u16 s) { return s; }

typedef uint64_t c_ssd128_internal_s16;

SIMD_INLINE c_ssd128_internal_s16 c_v128_ssd_s16_init(void) { return 0; }

/* Implementation dependent return value. Result must be finalised with
 * v128_ssd_s16_sum(). */
SIMD_INLINE c_ssd128_internal_s16 c_v128_ssd_s16(c_ssd128_internal_s16 s,
                                                 c_v128 a, c_v128 b) {
  int c;
  for (c = 0; c < 8; c++)
    s += (int32_t)(int16_t)(a.s16[c] - b.s16[c]) *
         (int32_t)(int16_t)(a.s16[c] - b.s16[c]);
  return s;
}

SIMD_INLINE uint64_t c_v128_ssd_s16_sum(c_ssd128_internal_s16 s) { return s; }
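
/* The u16/s16 accumulators above follow the same init/accumulate/sum
 * protocol illustrated for c_v128_sad_u8(), e.g. (illustrative only):
 *
 *   c_ssd128_internal_s16 acc = c_v128_ssd_s16_init();
 *   acc = c_v128_ssd_s16(acc, a, b);
 *   uint64_t ssd = c_v128_ssd_s16_sum(acc);
 */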

#endif  // AOM_AOM_DSP_SIMD_V128_INTRINSICS_C_H_