/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <assert.h>
#include <string.h>

#include <string>

#include "config/aom_dsp_rtcd.h"

#include "test/acm_random.h"
// Inlining is not forced on the compiler because some tests call SIMD_INLINE
// functions through function pointers.
#undef SIMD_INLINE
#define SIMD_INLINE static inline
#include "aom_dsp/aom_simd.h"
#include "aom_dsp/simd/v256_intrinsics_c.h"

// Machine-tuned code goes into this file. This file is included from
// simd_cmp_sse2.cc, simd_cmp_ssse3.cc, etc., which define the macros
// ARCH (= neon, sse2, ssse3, etc.), SIMD_NAMESPACE and ARCH_POSTFIX().
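//
// As an illustration only (a hypothetical sketch; the real wrapper files may
// differ in detail), such a per-architecture translation unit is assumed to
// look roughly like:
//   #define ARCH SSE2
//   #define ARCH_POSTFIX(name) name##_sse2
//   #define SIMD_NAMESPACE simd_test_sse2
//   #include "test/simd_cmp_impl.inc"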

#ifdef _MSC_VER
// Disable "value of intrinsic immediate argument 'value' is out of range
// 'lowerbound - upperbound'" warning. Visual Studio emits this warning even
// though the parameters are conditionally checked in, e.g., v256_shr_n_byte.
// Adding a mask doesn't always appear to be sufficient.
#pragma warning(disable : 4556)
#endif

using libaom_test::ACMRandom;

namespace SIMD_NAMESPACE {

// Wrap templates around intrinsics using immediate values
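// (these intrinsics take a compile-time constant, so each constant of interest
// gets its own addressable instantiation that can be stored in the mapping
// table below).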
template <int shift>
v64 imm_v64_shl_n_byte(v64 a) {
  return v64_shl_n_byte(a, shift);
}
template <int shift>
v64 imm_v64_shr_n_byte(v64 a) {
  return v64_shr_n_byte(a, shift);
}
template <int shift>
v64 imm_v64_shl_n_8(v64 a) {
  return v64_shl_n_8(a, shift);
}
template <int shift>
v64 imm_v64_shr_n_u8(v64 a) {
  return v64_shr_n_u8(a, shift);
}
template <int shift>
v64 imm_v64_shr_n_s8(v64 a) {
  return v64_shr_n_s8(a, shift);
}
template <int shift>
v64 imm_v64_shl_n_16(v64 a) {
  return v64_shl_n_16(a, shift);
}
template <int shift>
v64 imm_v64_shr_n_u16(v64 a) {
  return v64_shr_n_u16(a, shift);
}
template <int shift>
v64 imm_v64_shr_n_s16(v64 a) {
  return v64_shr_n_s16(a, shift);
}
template <int shift>
v64 imm_v64_shl_n_32(v64 a) {
  return v64_shl_n_32(a, shift);
}
template <int shift>
v64 imm_v64_shr_n_u32(v64 a) {
  return v64_shr_n_u32(a, shift);
}
template <int shift>
v64 imm_v64_shr_n_s32(v64 a) {
  return v64_shr_n_s32(a, shift);
}
template <int shift>
v64 imm_v64_align(v64 a, v64 b) {
  return v64_align(a, b, shift);
}

// Wrap templates around corresponding C implementations of the above
template <int shift>
c_v64 c_imm_v64_shl_n_byte(c_v64 a) {
  return c_v64_shl_n_byte(a, shift);
}
template <int shift>
c_v64 c_imm_v64_shr_n_byte(c_v64 a) {
  return c_v64_shr_n_byte(a, shift);
}
template <int shift>
c_v64 c_imm_v64_shl_n_8(c_v64 a) {
  return c_v64_shl_n_8(a, shift);
}
template <int shift>
c_v64 c_imm_v64_shr_n_u8(c_v64 a) {
  return c_v64_shr_n_u8(a, shift);
}
template <int shift>
c_v64 c_imm_v64_shr_n_s8(c_v64 a) {
  return c_v64_shr_n_s8(a, shift);
}
template <int shift>
c_v64 c_imm_v64_shl_n_16(c_v64 a) {
  return c_v64_shl_n_16(a, shift);
}
template <int shift>
c_v64 c_imm_v64_shr_n_u16(c_v64 a) {
  return c_v64_shr_n_u16(a, shift);
}
template <int shift>
c_v64 c_imm_v64_shr_n_s16(c_v64 a) {
  return c_v64_shr_n_s16(a, shift);
}
template <int shift>
c_v64 c_imm_v64_shl_n_32(c_v64 a) {
  return c_v64_shl_n_32(a, shift);
}
template <int shift>
c_v64 c_imm_v64_shr_n_u32(c_v64 a) {
  return c_v64_shr_n_u32(a, shift);
}
template <int shift>
c_v64 c_imm_v64_shr_n_s32(c_v64 a) {
  return c_v64_shr_n_s32(a, shift);
}
template <int shift>
c_v64 c_imm_v64_align(c_v64 a, c_v64 b) {
  return c_v64_align(a, b, shift);
}

template <int shift>
v128 imm_v128_shl_n_byte(v128 a) {
  return v128_shl_n_byte(a, shift);
}
template <int shift>
v128 imm_v128_shr_n_byte(v128 a) {
  return v128_shr_n_byte(a, shift);
}
template <int shift>
v128 imm_v128_shl_n_8(v128 a) {
  return v128_shl_n_8(a, shift);
}
template <int shift>
v128 imm_v128_shr_n_u8(v128 a) {
  return v128_shr_n_u8(a, shift);
}
template <int shift>
v128 imm_v128_shr_n_s8(v128 a) {
  return v128_shr_n_s8(a, shift);
}
template <int shift>
v128 imm_v128_shl_n_16(v128 a) {
  return v128_shl_n_16(a, shift);
}
template <int shift>
v128 imm_v128_shr_n_u16(v128 a) {
  return v128_shr_n_u16(a, shift);
}
template <int shift>
v128 imm_v128_shr_n_s16(v128 a) {
  return v128_shr_n_s16(a, shift);
}
template <int shift>
v128 imm_v128_shl_n_32(v128 a) {
  return v128_shl_n_32(a, shift);
}
template <int shift>
v128 imm_v128_shr_n_u32(v128 a) {
  return v128_shr_n_u32(a, shift);
}
template <int shift>
v128 imm_v128_shr_n_s32(v128 a) {
  return v128_shr_n_s32(a, shift);
}
template <int shift>
v128 imm_v128_shl_n_64(v128 a) {
  return v128_shl_n_64(a, shift);
}
template <int shift>
v128 imm_v128_shr_n_u64(v128 a) {
  return v128_shr_n_u64(a, shift);
}
template <int shift>
v128 imm_v128_shr_n_s64(v128 a) {
  return v128_shr_n_s64(a, shift);
}
template <int shift>
v128 imm_v128_align(v128 a, v128 b) {
  return v128_align(a, b, shift);
}

template <int shift>
c_v128 c_imm_v128_shl_n_byte(c_v128 a) {
  return c_v128_shl_n_byte(a, shift);
}
template <int shift>
c_v128 c_imm_v128_shr_n_byte(c_v128 a) {
  return c_v128_shr_n_byte(a, shift);
}
template <int shift>
c_v128 c_imm_v128_shl_n_8(c_v128 a) {
  return c_v128_shl_n_8(a, shift);
}
template <int shift>
c_v128 c_imm_v128_shr_n_u8(c_v128 a) {
  return c_v128_shr_n_u8(a, shift);
}
template <int shift>
c_v128 c_imm_v128_shr_n_s8(c_v128 a) {
  return c_v128_shr_n_s8(a, shift);
}
template <int shift>
c_v128 c_imm_v128_shl_n_16(c_v128 a) {
  return c_v128_shl_n_16(a, shift);
}
template <int shift>
c_v128 c_imm_v128_shr_n_u16(c_v128 a) {
  return c_v128_shr_n_u16(a, shift);
}
template <int shift>
c_v128 c_imm_v128_shr_n_s16(c_v128 a) {
  return c_v128_shr_n_s16(a, shift);
}
template <int shift>
c_v128 c_imm_v128_shl_n_32(c_v128 a) {
  return c_v128_shl_n_32(a, shift);
}
template <int shift>
c_v128 c_imm_v128_shr_n_u32(c_v128 a) {
  return c_v128_shr_n_u32(a, shift);
}
template <int shift>
c_v128 c_imm_v128_shr_n_s32(c_v128 a) {
  return c_v128_shr_n_s32(a, shift);
}
template <int shift>
c_v128 c_imm_v128_shl_n_64(c_v128 a) {
  return c_v128_shl_n_64(a, shift);
}
template <int shift>
c_v128 c_imm_v128_shr_n_u64(c_v128 a) {
  return c_v128_shr_n_u64(a, shift);
}
template <int shift>
c_v128 c_imm_v128_shr_n_s64(c_v128 a) {
  return c_v128_shr_n_s64(a, shift);
}
template <int shift>
c_v128 c_imm_v128_align(c_v128 a, c_v128 b) {
  return c_v128_align(a, b, shift);
}

template <int shift>
v256 imm_v256_shl_n_word(v256 a) {
  return v256_shl_n_word(a, shift);
}
template <int shift>
v256 imm_v256_shr_n_word(v256 a) {
  return v256_shr_n_word(a, shift);
}
template <int shift>
v256 imm_v256_shl_n_byte(v256 a) {
  return v256_shl_n_byte(a, shift);
}
template <int shift>
v256 imm_v256_shr_n_byte(v256 a) {
  return v256_shr_n_byte(a, shift);
}
template <int shift>
v256 imm_v256_shl_n_8(v256 a) {
  return v256_shl_n_8(a, shift);
}
template <int shift>
v256 imm_v256_shr_n_u8(v256 a) {
  return v256_shr_n_u8(a, shift);
}
template <int shift>
v256 imm_v256_shr_n_s8(v256 a) {
  return v256_shr_n_s8(a, shift);
}
template <int shift>
v256 imm_v256_shl_n_16(v256 a) {
  return v256_shl_n_16(a, shift);
}
template <int shift>
v256 imm_v256_shr_n_u16(v256 a) {
  return v256_shr_n_u16(a, shift);
}
template <int shift>
v256 imm_v256_shr_n_s16(v256 a) {
  return v256_shr_n_s16(a, shift);
}
template <int shift>
v256 imm_v256_shl_n_32(v256 a) {
  return v256_shl_n_32(a, shift);
}
template <int shift>
v256 imm_v256_shr_n_u32(v256 a) {
  return v256_shr_n_u32(a, shift);
}
template <int shift>
v256 imm_v256_shr_n_s32(v256 a) {
  return v256_shr_n_s32(a, shift);
}
template <int shift>
v256 imm_v256_shl_n_64(v256 a) {
  return v256_shl_n_64(a, shift);
}
template <int shift>
v256 imm_v256_shr_n_u64(v256 a) {
  return v256_shr_n_u64(a, shift);
}
template <int shift>
v256 imm_v256_shr_n_s64(v256 a) {
  return v256_shr_n_s64(a, shift);
}
template <int shift>
v256 imm_v256_align(v256 a, v256 b) {
  return v256_align(a, b, shift);
}

template <int shift>
c_v256 c_imm_v256_shl_n_word(c_v256 a) {
  return c_v256_shl_n_word(a, shift);
}
template <int shift>
c_v256 c_imm_v256_shr_n_word(c_v256 a) {
  return c_v256_shr_n_word(a, shift);
}
template <int shift>
c_v256 c_imm_v256_shl_n_byte(c_v256 a) {
  return c_v256_shl_n_byte(a, shift);
}
template <int shift>
c_v256 c_imm_v256_shr_n_byte(c_v256 a) {
  return c_v256_shr_n_byte(a, shift);
}
template <int shift>
c_v256 c_imm_v256_shl_n_8(c_v256 a) {
  return c_v256_shl_n_8(a, shift);
}
template <int shift>
c_v256 c_imm_v256_shr_n_u8(c_v256 a) {
  return c_v256_shr_n_u8(a, shift);
}
template <int shift>
c_v256 c_imm_v256_shr_n_s8(c_v256 a) {
  return c_v256_shr_n_s8(a, shift);
}
template <int shift>
c_v256 c_imm_v256_shl_n_16(c_v256 a) {
  return c_v256_shl_n_16(a, shift);
}
template <int shift>
c_v256 c_imm_v256_shr_n_u16(c_v256 a) {
  return c_v256_shr_n_u16(a, shift);
}
template <int shift>
c_v256 c_imm_v256_shr_n_s16(c_v256 a) {
  return c_v256_shr_n_s16(a, shift);
}
template <int shift>
c_v256 c_imm_v256_shl_n_32(c_v256 a) {
  return c_v256_shl_n_32(a, shift);
}
template <int shift>
c_v256 c_imm_v256_shr_n_u32(c_v256 a) {
  return c_v256_shr_n_u32(a, shift);
}
template <int shift>
c_v256 c_imm_v256_shr_n_s32(c_v256 a) {
  return c_v256_shr_n_s32(a, shift);
}
template <int shift>
c_v256 c_imm_v256_shl_n_64(c_v256 a) {
  return c_v256_shl_n_64(a, shift);
}
template <int shift>
c_v256 c_imm_v256_shr_n_u64(c_v256 a) {
  return c_v256_shr_n_u64(a, shift);
}
template <int shift>
c_v256 c_imm_v256_shr_n_s64(c_v256 a) {
  return c_v256_shr_n_s64(a, shift);
}
template <int shift>
c_v256 c_imm_v256_align(c_v256 a, c_v256 b) {
  return c_v256_align(a, b, shift);
}

namespace {

// Wrappers around the SAD and SSD functions
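// For example, v64_sad_u8(a, b) below returns the sum of absolute differences
// over the eight unsigned bytes of a and b, and v64_ssd_u8(a, b) the
// corresponding sum of squared differences; the wrappers fold the
// init/accumulate/sum steps into a single call.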
uint32_t v64_sad_u8(v64 a, v64 b) {
  return v64_sad_u8_sum(::v64_sad_u8(v64_sad_u8_init(), a, b));
}
uint32_t v64_ssd_u8(v64 a, v64 b) {
  return v64_ssd_u8_sum(::v64_ssd_u8(v64_ssd_u8_init(), a, b));
}

uint32_t c_v64_sad_u8(c_v64 a, c_v64 b) {
  return c_v64_sad_u8_sum(::c_v64_sad_u8(c_v64_sad_u8_init(), a, b));
}
uint32_t c_v64_ssd_u8(c_v64 a, c_v64 b) {
  return c_v64_ssd_u8_sum(::c_v64_ssd_u8(c_v64_ssd_u8_init(), a, b));
}
uint32_t v128_sad_u8(v128 a, v128 b) {
  return v128_sad_u8_sum(::v128_sad_u8(v128_sad_u8_init(), a, b));
}
uint32_t v128_ssd_u8(v128 a, v128 b) {
  return v128_ssd_u8_sum(::v128_ssd_u8(v128_ssd_u8_init(), a, b));
}
uint32_t c_v128_sad_u8(c_v128 a, c_v128 b) {
  return c_v128_sad_u8_sum(::c_v128_sad_u8(c_v128_sad_u8_init(), a, b));
}
uint32_t c_v128_ssd_u8(c_v128 a, c_v128 b) {
  return c_v128_ssd_u8_sum(::c_v128_ssd_u8(c_v128_ssd_u8_init(), a, b));
}
uint32_t v128_sad_u16(v128 a, v128 b) {
  return v128_sad_u16_sum(::v128_sad_u16(v128_sad_u16_init(), a, b));
}
uint64_t v128_ssd_s16(v128 a, v128 b) {
  return v128_ssd_s16_sum(::v128_ssd_s16(v128_ssd_s16_init(), a, b));
}
uint32_t c_v128_sad_u16(c_v128 a, c_v128 b) {
  return c_v128_sad_u16_sum(::c_v128_sad_u16(c_v128_sad_u16_init(), a, b));
}
uint64_t c_v128_ssd_s16(c_v128 a, c_v128 b) {
  return c_v128_ssd_s16_sum(::c_v128_ssd_s16(c_v128_ssd_s16_init(), a, b));
}
uint32_t v256_sad_u8(v256 a, v256 b) {
  return v256_sad_u8_sum(::v256_sad_u8(v256_sad_u8_init(), a, b));
}
uint32_t v256_ssd_u8(v256 a, v256 b) {
  return v256_ssd_u8_sum(::v256_ssd_u8(v256_ssd_u8_init(), a, b));
}
uint32_t c_v256_sad_u8(c_v256 a, c_v256 b) {
  return c_v256_sad_u8_sum(::c_v256_sad_u8(c_v256_sad_u8_init(), a, b));
}
uint32_t c_v256_ssd_u8(c_v256 a, c_v256 b) {
  return c_v256_ssd_u8_sum(::c_v256_ssd_u8(c_v256_ssd_u8_init(), a, b));
}
uint32_t v256_sad_u16(v256 a, v256 b) {
  return v256_sad_u16_sum(::v256_sad_u16(v256_sad_u16_init(), a, b));
}
uint64_t v256_ssd_s16(v256 a, v256 b) {
  return v256_ssd_s16_sum(::v256_ssd_s16(v256_ssd_s16_init(), a, b));
}
uint32_t c_v256_sad_u16(c_v256 a, c_v256 b) {
  return c_v256_sad_u16_sum(::c_v256_sad_u16(c_v256_sad_u16_init(), a, b));
}
uint64_t c_v256_ssd_s16(c_v256 a, c_v256 b) {
  return c_v256_ssd_s16_sum(::c_v256_ssd_s16(c_v256_ssd_s16_init(), a, b));
}

typedef void (*fptr)();

typedef struct {
  const char *name;
  fptr ref;
  fptr simd;
} mapping;

#define MAP(name) \
  { #name, reinterpret_cast<fptr>(c_##name), reinterpret_cast<fptr>(name) }
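// For example, MAP(v64_add_8) expands to
//   { "v64_add_8", reinterpret_cast<fptr>(c_v64_add_8),
//     reinterpret_cast<fptr>(v64_add_8) }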

const mapping m[] = { MAP(v64_sad_u8),
                      MAP(v64_ssd_u8),
                      MAP(v64_add_8),
                      MAP(v64_add_16),
                      MAP(v64_sadd_s8),
                      MAP(v64_sadd_u8),
                      MAP(v64_sadd_s16),
                      MAP(v64_add_32),
                      MAP(v64_sub_8),
                      MAP(v64_ssub_u8),
                      MAP(v64_ssub_s8),
                      MAP(v64_sub_16),
                      MAP(v64_ssub_s16),
                      MAP(v64_ssub_u16),
                      MAP(v64_sub_32),
                      MAP(v64_ziplo_8),
                      MAP(v64_ziphi_8),
                      MAP(v64_ziplo_16),
                      MAP(v64_ziphi_16),
                      MAP(v64_ziplo_32),
                      MAP(v64_ziphi_32),
                      MAP(v64_pack_s32_u16),
                      MAP(v64_pack_s32_s16),
                      MAP(v64_pack_s16_u8),
                      MAP(v64_pack_s16_s8),
                      MAP(v64_unziphi_8),
                      MAP(v64_unziplo_8),
                      MAP(v64_unziphi_16),
                      MAP(v64_unziplo_16),
                      MAP(v64_or),
                      MAP(v64_xor),
                      MAP(v64_and),
                      MAP(v64_andn),
                      MAP(v64_mullo_s16),
                      MAP(v64_mulhi_s16),
                      MAP(v64_mullo_s32),
                      MAP(v64_madd_s16),
                      MAP(v64_madd_us8),
                      MAP(v64_avg_u8),
                      MAP(v64_rdavg_u8),
                      MAP(v64_rdavg_u16),
                      MAP(v64_avg_u16),
                      MAP(v64_min_u8),
                      MAP(v64_max_u8),
                      MAP(v64_min_s8),
                      MAP(v64_max_s8),
                      MAP(v64_min_s16),
                      MAP(v64_max_s16),
                      MAP(v64_cmpgt_s8),
                      MAP(v64_cmplt_s8),
                      MAP(v64_cmpeq_8),
                      MAP(v64_cmpgt_s16),
                      MAP(v64_cmplt_s16),
                      MAP(v64_cmpeq_16),
                      MAP(v64_shuffle_8),
                      MAP(imm_v64_align<1>),
                      MAP(imm_v64_align<2>),
                      MAP(imm_v64_align<3>),
                      MAP(imm_v64_align<4>),
                      MAP(imm_v64_align<5>),
                      MAP(imm_v64_align<6>),
                      MAP(imm_v64_align<7>),
                      MAP(v64_abs_s8),
                      MAP(v64_abs_s16),
                      MAP(v64_unpacklo_u8_s16),
                      MAP(v64_unpackhi_u8_s16),
                      MAP(v64_unpacklo_s8_s16),
                      MAP(v64_unpackhi_s8_s16),
                      MAP(v64_unpacklo_u16_s32),
                      MAP(v64_unpacklo_s16_s32),
                      MAP(v64_unpackhi_u16_s32),
                      MAP(v64_unpackhi_s16_s32),
                      MAP(imm_v64_shr_n_byte<1>),
                      MAP(imm_v64_shr_n_byte<2>),
                      MAP(imm_v64_shr_n_byte<3>),
                      MAP(imm_v64_shr_n_byte<4>),
                      MAP(imm_v64_shr_n_byte<5>),
                      MAP(imm_v64_shr_n_byte<6>),
                      MAP(imm_v64_shr_n_byte<7>),
                      MAP(imm_v64_shl_n_byte<1>),
                      MAP(imm_v64_shl_n_byte<2>),
                      MAP(imm_v64_shl_n_byte<3>),
                      MAP(imm_v64_shl_n_byte<4>),
                      MAP(imm_v64_shl_n_byte<5>),
                      MAP(imm_v64_shl_n_byte<6>),
                      MAP(imm_v64_shl_n_byte<7>),
                      MAP(imm_v64_shl_n_8<1>),
                      MAP(imm_v64_shl_n_8<2>),
                      MAP(imm_v64_shl_n_8<3>),
                      MAP(imm_v64_shl_n_8<4>),
                      MAP(imm_v64_shl_n_8<5>),
                      MAP(imm_v64_shl_n_8<6>),
                      MAP(imm_v64_shl_n_8<7>),
                      MAP(imm_v64_shr_n_u8<1>),
                      MAP(imm_v64_shr_n_u8<2>),
                      MAP(imm_v64_shr_n_u8<3>),
                      MAP(imm_v64_shr_n_u8<4>),
                      MAP(imm_v64_shr_n_u8<5>),
                      MAP(imm_v64_shr_n_u8<6>),
                      MAP(imm_v64_shr_n_u8<7>),
                      MAP(imm_v64_shr_n_s8<1>),
                      MAP(imm_v64_shr_n_s8<2>),
                      MAP(imm_v64_shr_n_s8<3>),
                      MAP(imm_v64_shr_n_s8<4>),
                      MAP(imm_v64_shr_n_s8<5>),
                      MAP(imm_v64_shr_n_s8<6>),
                      MAP(imm_v64_shr_n_s8<7>),
                      MAP(imm_v64_shl_n_16<1>),
                      MAP(imm_v64_shl_n_16<2>),
                      MAP(imm_v64_shl_n_16<4>),
                      MAP(imm_v64_shl_n_16<6>),
                      MAP(imm_v64_shl_n_16<8>),
                      MAP(imm_v64_shl_n_16<10>),
                      MAP(imm_v64_shl_n_16<12>),
                      MAP(imm_v64_shl_n_16<14>),
                      MAP(imm_v64_shr_n_u16<1>),
                      MAP(imm_v64_shr_n_u16<2>),
                      MAP(imm_v64_shr_n_u16<4>),
                      MAP(imm_v64_shr_n_u16<6>),
                      MAP(imm_v64_shr_n_u16<8>),
                      MAP(imm_v64_shr_n_u16<10>),
                      MAP(imm_v64_shr_n_u16<12>),
                      MAP(imm_v64_shr_n_u16<14>),
                      MAP(imm_v64_shr_n_s16<1>),
                      MAP(imm_v64_shr_n_s16<2>),
                      MAP(imm_v64_shr_n_s16<4>),
                      MAP(imm_v64_shr_n_s16<6>),
                      MAP(imm_v64_shr_n_s16<8>),
                      MAP(imm_v64_shr_n_s16<10>),
                      MAP(imm_v64_shr_n_s16<12>),
                      MAP(imm_v64_shr_n_s16<14>),
                      MAP(imm_v64_shl_n_32<1>),
                      MAP(imm_v64_shl_n_32<4>),
                      MAP(imm_v64_shl_n_32<8>),
                      MAP(imm_v64_shl_n_32<12>),
                      MAP(imm_v64_shl_n_32<16>),
                      MAP(imm_v64_shl_n_32<20>),
                      MAP(imm_v64_shl_n_32<24>),
                      MAP(imm_v64_shl_n_32<28>),
                      MAP(imm_v64_shr_n_u32<1>),
                      MAP(imm_v64_shr_n_u32<4>),
                      MAP(imm_v64_shr_n_u32<8>),
                      MAP(imm_v64_shr_n_u32<12>),
                      MAP(imm_v64_shr_n_u32<16>),
                      MAP(imm_v64_shr_n_u32<20>),
                      MAP(imm_v64_shr_n_u32<24>),
                      MAP(imm_v64_shr_n_u32<28>),
                      MAP(imm_v64_shr_n_s32<1>),
                      MAP(imm_v64_shr_n_s32<4>),
                      MAP(imm_v64_shr_n_s32<8>),
                      MAP(imm_v64_shr_n_s32<12>),
                      MAP(imm_v64_shr_n_s32<16>),
                      MAP(imm_v64_shr_n_s32<20>),
                      MAP(imm_v64_shr_n_s32<24>),
                      MAP(imm_v64_shr_n_s32<28>),
                      MAP(v64_shl_8),
                      MAP(v64_shr_u8),
                      MAP(v64_shr_s8),
                      MAP(v64_shl_16),
                      MAP(v64_shr_u16),
                      MAP(v64_shr_s16),
                      MAP(v64_shl_32),
                      MAP(v64_shr_u32),
                      MAP(v64_shr_s32),
                      MAP(v64_hadd_u8),
                      MAP(v64_hadd_s16),
                      MAP(v64_dotp_s16),
                      MAP(v64_dotp_su8),
                      MAP(v64_u64),
                      MAP(v64_low_u32),
                      MAP(v64_high_u32),
                      MAP(v64_low_s32),
                      MAP(v64_high_s32),
                      MAP(v64_dup_8),
                      MAP(v64_dup_16),
                      MAP(v64_dup_32),
                      MAP(v64_from_32),
                      MAP(v64_zero),
                      MAP(v64_from_16),
                      MAP(v128_sad_u8),
                      MAP(v128_ssd_u8),
                      MAP(v128_sad_u16),
                      MAP(v128_ssd_s16),
                      MAP(v128_add_8),
                      MAP(v128_add_16),
                      MAP(v128_sadd_s8),
                      MAP(v128_sadd_u8),
                      MAP(v128_sadd_s16),
                      MAP(v128_add_32),
                      MAP(v128_add_64),
                      MAP(v128_sub_8),
                      MAP(v128_ssub_u8),
                      MAP(v128_ssub_s8),
                      MAP(v128_sub_16),
                      MAP(v128_ssub_s16),
                      MAP(v128_ssub_u16),
                      MAP(v128_sub_32),
                      MAP(v128_sub_64),
                      MAP(v128_ziplo_8),
                      MAP(v128_ziphi_8),
                      MAP(v128_ziplo_16),
                      MAP(v128_ziphi_16),
                      MAP(v128_ziplo_32),
                      MAP(v128_ziphi_32),
                      MAP(v128_ziplo_64),
                      MAP(v128_ziphi_64),
                      MAP(v128_unziphi_8),
                      MAP(v128_unziplo_8),
                      MAP(v128_unziphi_16),
                      MAP(v128_unziplo_16),
                      MAP(v128_unziphi_32),
                      MAP(v128_unziplo_32),
                      MAP(v128_pack_s32_u16),
                      MAP(v128_pack_s32_s16),
                      MAP(v128_pack_s16_u8),
                      MAP(v128_pack_s16_s8),
                      MAP(v128_or),
                      MAP(v128_xor),
                      MAP(v128_and),
                      MAP(v128_andn),
                      MAP(v128_mullo_s16),
                      MAP(v128_mulhi_s16),
                      MAP(v128_mullo_s32),
                      MAP(v128_madd_s16),
                      MAP(v128_madd_us8),
                      MAP(v128_avg_u8),
                      MAP(v128_rdavg_u8),
                      MAP(v128_rdavg_u16),
                      MAP(v128_avg_u16),
                      MAP(v128_min_u8),
                      MAP(v128_max_u8),
                      MAP(v128_min_s8),
                      MAP(v128_max_s8),
                      MAP(v128_min_s16),
                      MAP(v128_max_s16),
                      MAP(v128_min_s32),
                      MAP(v128_max_s32),
                      MAP(v128_cmpgt_s8),
                      MAP(v128_cmplt_s8),
                      MAP(v128_cmpeq_8),
                      MAP(v128_cmpgt_s16),
                      MAP(v128_cmpeq_16),
                      MAP(v128_cmplt_s16),
                      MAP(v128_cmpgt_s32),
                      MAP(v128_cmpeq_32),
                      MAP(v128_cmplt_s32),
                      MAP(v128_shuffle_8),
                      MAP(imm_v128_align<1>),
                      MAP(imm_v128_align<2>),
                      MAP(imm_v128_align<3>),
                      MAP(imm_v128_align<4>),
                      MAP(imm_v128_align<5>),
                      MAP(imm_v128_align<6>),
                      MAP(imm_v128_align<7>),
                      MAP(imm_v128_align<8>),
                      MAP(imm_v128_align<9>),
                      MAP(imm_v128_align<10>),
                      MAP(imm_v128_align<11>),
                      MAP(imm_v128_align<12>),
                      MAP(imm_v128_align<13>),
                      MAP(imm_v128_align<14>),
                      MAP(imm_v128_align<15>),
                      MAP(v128_abs_s8),
                      MAP(v128_abs_s16),
                      MAP(v128_padd_u8),
                      MAP(v128_padd_s16),
                      MAP(v128_unpacklo_u16_s32),
                      MAP(v128_unpacklo_s16_s32),
                      MAP(v128_unpackhi_u16_s32),
                      MAP(v128_unpackhi_s16_s32),
                      MAP(imm_v128_shr_n_byte<1>),
                      MAP(imm_v128_shr_n_byte<2>),
                      MAP(imm_v128_shr_n_byte<3>),
                      MAP(imm_v128_shr_n_byte<4>),
                      MAP(imm_v128_shr_n_byte<5>),
                      MAP(imm_v128_shr_n_byte<6>),
                      MAP(imm_v128_shr_n_byte<7>),
                      MAP(imm_v128_shr_n_byte<8>),
                      MAP(imm_v128_shr_n_byte<9>),
                      MAP(imm_v128_shr_n_byte<10>),
                      MAP(imm_v128_shr_n_byte<11>),
                      MAP(imm_v128_shr_n_byte<12>),
                      MAP(imm_v128_shr_n_byte<13>),
                      MAP(imm_v128_shr_n_byte<14>),
                      MAP(imm_v128_shr_n_byte<15>),
                      MAP(imm_v128_shl_n_byte<1>),
                      MAP(imm_v128_shl_n_byte<2>),
                      MAP(imm_v128_shl_n_byte<3>),
                      MAP(imm_v128_shl_n_byte<4>),
                      MAP(imm_v128_shl_n_byte<5>),
                      MAP(imm_v128_shl_n_byte<6>),
                      MAP(imm_v128_shl_n_byte<7>),
                      MAP(imm_v128_shl_n_byte<8>),
                      MAP(imm_v128_shl_n_byte<9>),
                      MAP(imm_v128_shl_n_byte<10>),
                      MAP(imm_v128_shl_n_byte<11>),
                      MAP(imm_v128_shl_n_byte<12>),
                      MAP(imm_v128_shl_n_byte<13>),
                      MAP(imm_v128_shl_n_byte<14>),
                      MAP(imm_v128_shl_n_byte<15>),
                      MAP(imm_v128_shl_n_8<1>),
                      MAP(imm_v128_shl_n_8<2>),
                      MAP(imm_v128_shl_n_8<3>),
                      MAP(imm_v128_shl_n_8<4>),
                      MAP(imm_v128_shl_n_8<5>),
                      MAP(imm_v128_shl_n_8<6>),
                      MAP(imm_v128_shl_n_8<7>),
                      MAP(imm_v128_shr_n_u8<1>),
                      MAP(imm_v128_shr_n_u8<2>),
                      MAP(imm_v128_shr_n_u8<3>),
                      MAP(imm_v128_shr_n_u8<4>),
                      MAP(imm_v128_shr_n_u8<5>),
                      MAP(imm_v128_shr_n_u8<6>),
                      MAP(imm_v128_shr_n_u8<7>),
                      MAP(imm_v128_shr_n_s8<1>),
                      MAP(imm_v128_shr_n_s8<2>),
                      MAP(imm_v128_shr_n_s8<3>),
                      MAP(imm_v128_shr_n_s8<4>),
                      MAP(imm_v128_shr_n_s8<5>),
                      MAP(imm_v128_shr_n_s8<6>),
                      MAP(imm_v128_shr_n_s8<7>),
                      MAP(imm_v128_shl_n_16<1>),
                      MAP(imm_v128_shl_n_16<2>),
                      MAP(imm_v128_shl_n_16<4>),
                      MAP(imm_v128_shl_n_16<6>),
                      MAP(imm_v128_shl_n_16<8>),
                      MAP(imm_v128_shl_n_16<10>),
                      MAP(imm_v128_shl_n_16<12>),
                      MAP(imm_v128_shl_n_16<14>),
                      MAP(imm_v128_shr_n_u16<1>),
                      MAP(imm_v128_shr_n_u16<2>),
                      MAP(imm_v128_shr_n_u16<4>),
                      MAP(imm_v128_shr_n_u16<6>),
                      MAP(imm_v128_shr_n_u16<8>),
                      MAP(imm_v128_shr_n_u16<10>),
                      MAP(imm_v128_shr_n_u16<12>),
                      MAP(imm_v128_shr_n_u16<14>),
                      MAP(imm_v128_shr_n_s16<1>),
                      MAP(imm_v128_shr_n_s16<2>),
                      MAP(imm_v128_shr_n_s16<4>),
                      MAP(imm_v128_shr_n_s16<6>),
                      MAP(imm_v128_shr_n_s16<8>),
                      MAP(imm_v128_shr_n_s16<10>),
                      MAP(imm_v128_shr_n_s16<12>),
                      MAP(imm_v128_shr_n_s16<14>),
                      MAP(imm_v128_shl_n_32<1>),
                      MAP(imm_v128_shl_n_32<4>),
                      MAP(imm_v128_shl_n_32<8>),
                      MAP(imm_v128_shl_n_32<12>),
                      MAP(imm_v128_shl_n_32<16>),
                      MAP(imm_v128_shl_n_32<20>),
                      MAP(imm_v128_shl_n_32<24>),
                      MAP(imm_v128_shl_n_32<28>),
                      MAP(imm_v128_shr_n_u32<1>),
                      MAP(imm_v128_shr_n_u32<4>),
                      MAP(imm_v128_shr_n_u32<8>),
                      MAP(imm_v128_shr_n_u32<12>),
                      MAP(imm_v128_shr_n_u32<16>),
                      MAP(imm_v128_shr_n_u32<20>),
                      MAP(imm_v128_shr_n_u32<24>),
                      MAP(imm_v128_shr_n_u32<28>),
                      MAP(imm_v128_shr_n_s32<1>),
                      MAP(imm_v128_shr_n_s32<4>),
                      MAP(imm_v128_shr_n_s32<8>),
                      MAP(imm_v128_shr_n_s32<12>),
                      MAP(imm_v128_shr_n_s32<16>),
                      MAP(imm_v128_shr_n_s32<20>),
                      MAP(imm_v128_shr_n_s32<24>),
                      MAP(imm_v128_shr_n_s32<28>),
                      MAP(imm_v128_shl_n_64<1>),
                      MAP(imm_v128_shl_n_64<4>),
                      MAP(imm_v128_shl_n_64<8>),
                      MAP(imm_v128_shl_n_64<12>),
                      MAP(imm_v128_shl_n_64<16>),
                      MAP(imm_v128_shl_n_64<20>),
                      MAP(imm_v128_shl_n_64<24>),
                      MAP(imm_v128_shl_n_64<28>),
                      MAP(imm_v128_shl_n_64<32>),
                      MAP(imm_v128_shl_n_64<36>),
                      MAP(imm_v128_shl_n_64<40>),
                      MAP(imm_v128_shl_n_64<44>),
                      MAP(imm_v128_shl_n_64<48>),
                      MAP(imm_v128_shl_n_64<52>),
                      MAP(imm_v128_shl_n_64<56>),
                      MAP(imm_v128_shl_n_64<60>),
                      MAP(imm_v128_shr_n_u64<1>),
                      MAP(imm_v128_shr_n_u64<4>),
                      MAP(imm_v128_shr_n_u64<8>),
                      MAP(imm_v128_shr_n_u64<12>),
                      MAP(imm_v128_shr_n_u64<16>),
                      MAP(imm_v128_shr_n_u64<20>),
                      MAP(imm_v128_shr_n_u64<24>),
                      MAP(imm_v128_shr_n_u64<28>),
                      MAP(imm_v128_shr_n_u64<32>),
                      MAP(imm_v128_shr_n_u64<36>),
                      MAP(imm_v128_shr_n_u64<40>),
                      MAP(imm_v128_shr_n_u64<44>),
                      MAP(imm_v128_shr_n_u64<48>),
                      MAP(imm_v128_shr_n_u64<52>),
                      MAP(imm_v128_shr_n_u64<56>),
                      MAP(imm_v128_shr_n_u64<60>),
                      MAP(imm_v128_shr_n_s64<1>),
                      MAP(imm_v128_shr_n_s64<4>),
                      MAP(imm_v128_shr_n_s64<8>),
                      MAP(imm_v128_shr_n_s64<12>),
                      MAP(imm_v128_shr_n_s64<16>),
                      MAP(imm_v128_shr_n_s64<20>),
                      MAP(imm_v128_shr_n_s64<24>),
                      MAP(imm_v128_shr_n_s64<28>),
                      MAP(imm_v128_shr_n_s64<32>),
                      MAP(imm_v128_shr_n_s64<36>),
                      MAP(imm_v128_shr_n_s64<40>),
                      MAP(imm_v128_shr_n_s64<44>),
                      MAP(imm_v128_shr_n_s64<48>),
                      MAP(imm_v128_shr_n_s64<52>),
                      MAP(imm_v128_shr_n_s64<56>),
                      MAP(imm_v128_shr_n_s64<60>),
                      MAP(v128_from_v64),
                      MAP(v128_zip_8),
                      MAP(v128_zip_16),
                      MAP(v128_zip_32),
                      MAP(v128_mul_s16),
                      MAP(v128_unpack_u8_s16),
                      MAP(v128_unpack_s8_s16),
                      MAP(v128_unpack_u16_s32),
                      MAP(v128_unpack_s16_s32),
                      MAP(v128_shl_8),
                      MAP(v128_shr_u8),
                      MAP(v128_shr_s8),
                      MAP(v128_shl_16),
                      MAP(v128_shr_u16),
                      MAP(v128_shr_s16),
                      MAP(v128_shl_32),
                      MAP(v128_shr_u32),
                      MAP(v128_shr_s32),
                      MAP(v128_shl_64),
                      MAP(v128_shr_u64),
                      MAP(v128_shr_s64),
                      MAP(v128_hadd_u8),
                      MAP(v128_dotp_su8),
                      MAP(v128_dotp_s16),
                      MAP(v128_dotp_s32),
                      MAP(v128_low_u32),
                      MAP(v128_low_v64),
                      MAP(v128_high_v64),
                      MAP(v128_from_64),
                      MAP(v128_from_32),
                      MAP(v128_movemask_8),
                      MAP(v128_zero),
                      MAP(v128_dup_8),
                      MAP(v128_dup_16),
                      MAP(v128_dup_32),
                      MAP(v128_dup_64),
                      MAP(v128_unpacklo_u8_s16),
                      MAP(v128_unpackhi_u8_s16),
                      MAP(v128_unpacklo_s8_s16),
                      MAP(v128_unpackhi_s8_s16),
                      MAP(v128_blend_8),
                      MAP(u32_load_unaligned),
                      MAP(u32_store_unaligned),
                      MAP(v64_load_unaligned),
                      MAP(v64_store_unaligned),
                      MAP(v128_load_unaligned),
                      MAP(v128_store_unaligned),
                      MAP(v256_sad_u8),
                      MAP(v256_ssd_u8),
                      MAP(v256_sad_u16),
                      MAP(v256_ssd_s16),
                      MAP(v256_hadd_u8),
                      MAP(v256_low_u64),
                      MAP(v256_dotp_su8),
                      MAP(v256_dotp_s16),
                      MAP(v256_dotp_s32),
                      MAP(v256_add_8),
                      MAP(v256_add_16),
                      MAP(v256_sadd_s8),
                      MAP(v256_sadd_u8),
                      MAP(v256_sadd_s16),
                      MAP(v256_add_32),
                      MAP(v256_add_64),
                      MAP(v256_sub_8),
                      MAP(v256_ssub_u8),
                      MAP(v256_ssub_s8),
                      MAP(v256_sub_16),
                      MAP(v256_ssub_u16),
                      MAP(v256_ssub_s16),
                      MAP(v256_sub_32),
                      MAP(v256_sub_64),
                      MAP(v256_ziplo_8),
                      MAP(v256_ziphi_8),
                      MAP(v256_ziplo_16),
                      MAP(v256_ziphi_16),
                      MAP(v256_ziplo_32),
                      MAP(v256_ziphi_32),
                      MAP(v256_ziplo_64),
                      MAP(v256_ziphi_64),
                      MAP(v256_unziphi_8),
                      MAP(v256_unziplo_8),
                      MAP(v256_unziphi_16),
                      MAP(v256_unziplo_16),
                      MAP(v256_unziphi_32),
                      MAP(v256_unziplo_32),
                      MAP(v256_unziphi_64),
                      MAP(v256_unziplo_64),
                      MAP(v256_pack_s32_u16),
                      MAP(v256_pack_s32_s16),
                      MAP(v256_pack_s16_u8),
                      MAP(v256_pack_s16_s8),
                      MAP(v256_or),
                      MAP(v256_xor),
                      MAP(v256_and),
                      MAP(v256_andn),
                      MAP(v256_mullo_s16),
                      MAP(v256_mulhi_s16),
                      MAP(v256_mullo_s32),
                      MAP(v256_madd_s16),
                      MAP(v256_madd_us8),
                      MAP(v256_avg_u8),
                      MAP(v256_rdavg_u8),
                      MAP(v256_rdavg_u16),
                      MAP(v256_avg_u16),
                      MAP(v256_min_u8),
                      MAP(v256_max_u8),
                      MAP(v256_min_s8),
                      MAP(v256_max_s8),
                      MAP(v256_min_s16),
                      MAP(v256_max_s16),
                      MAP(v256_min_s32),
                      MAP(v256_max_s32),
                      MAP(v256_cmpgt_s8),
                      MAP(v256_cmplt_s8),
                      MAP(v256_cmpeq_8),
                      MAP(v256_cmpgt_s16),
                      MAP(v256_cmplt_s16),
                      MAP(v256_cmpeq_16),
                      MAP(v256_cmpgt_s32),
                      MAP(v256_cmplt_s32),
                      MAP(v256_cmpeq_32),
                      MAP(v256_shuffle_8),
                      MAP(v256_pshuffle_8),
                      MAP(v256_wideshuffle_8),
                      MAP(imm_v256_align<1>),
                      MAP(imm_v256_align<2>),
                      MAP(imm_v256_align<3>),
                      MAP(imm_v256_align<4>),
                      MAP(imm_v256_align<5>),
                      MAP(imm_v256_align<6>),
                      MAP(imm_v256_align<7>),
                      MAP(imm_v256_align<8>),
                      MAP(imm_v256_align<9>),
                      MAP(imm_v256_align<10>),
                      MAP(imm_v256_align<11>),
                      MAP(imm_v256_align<12>),
                      MAP(imm_v256_align<13>),
                      MAP(imm_v256_align<14>),
                      MAP(imm_v256_align<15>),
                      MAP(imm_v256_align<16>),
                      MAP(imm_v256_align<17>),
                      MAP(imm_v256_align<18>),
                      MAP(imm_v256_align<19>),
                      MAP(imm_v256_align<20>),
                      MAP(imm_v256_align<21>),
                      MAP(imm_v256_align<22>),
                      MAP(imm_v256_align<23>),
                      MAP(imm_v256_align<24>),
                      MAP(imm_v256_align<25>),
                      MAP(imm_v256_align<26>),
                      MAP(imm_v256_align<27>),
                      MAP(imm_v256_align<28>),
                      MAP(imm_v256_align<29>),
                      MAP(imm_v256_align<30>),
                      MAP(imm_v256_align<31>),
                      MAP(v256_from_v128),
                      MAP(v256_zip_8),
                      MAP(v256_zip_16),
                      MAP(v256_zip_32),
                      MAP(v256_mul_s16),
                      MAP(v256_unpack_u8_s16),
                      MAP(v256_unpack_s8_s16),
                      MAP(v256_unpack_u16_s32),
                      MAP(v256_unpack_s16_s32),
                      MAP(v256_shl_8),
                      MAP(v256_shr_u8),
                      MAP(v256_shr_s8),
                      MAP(v256_shl_16),
                      MAP(v256_shr_u16),
                      MAP(v256_shr_s16),
                      MAP(v256_shl_32),
                      MAP(v256_shr_u32),
                      MAP(v256_shr_s32),
                      MAP(v256_shl_64),
                      MAP(v256_shr_u64),
                      MAP(v256_shr_s64),
                      MAP(v256_abs_s8),
                      MAP(v256_abs_s16),
                      MAP(v256_padd_u8),
                      MAP(v256_padd_s16),
                      MAP(v256_unpacklo_u16_s32),
                      MAP(v256_unpacklo_s16_s32),
                      MAP(v256_unpackhi_u16_s32),
                      MAP(v256_unpackhi_s16_s32),
                      MAP(imm_v256_shr_n_word<1>),
                      MAP(imm_v256_shr_n_word<2>),
                      MAP(imm_v256_shr_n_word<3>),
                      MAP(imm_v256_shr_n_word<4>),
                      MAP(imm_v256_shr_n_word<5>),
                      MAP(imm_v256_shr_n_word<6>),
                      MAP(imm_v256_shr_n_word<7>),
                      MAP(imm_v256_shr_n_word<8>),
                      MAP(imm_v256_shr_n_word<9>),
                      MAP(imm_v256_shr_n_word<10>),
                      MAP(imm_v256_shr_n_word<11>),
                      MAP(imm_v256_shr_n_word<12>),
                      MAP(imm_v256_shr_n_word<13>),
                      MAP(imm_v256_shr_n_word<14>),
                      MAP(imm_v256_shr_n_word<15>),
                      MAP(imm_v256_shl_n_word<1>),
                      MAP(imm_v256_shl_n_word<2>),
                      MAP(imm_v256_shl_n_word<3>),
                      MAP(imm_v256_shl_n_word<4>),
                      MAP(imm_v256_shl_n_word<5>),
                      MAP(imm_v256_shl_n_word<6>),
                      MAP(imm_v256_shl_n_word<7>),
                      MAP(imm_v256_shl_n_word<8>),
                      MAP(imm_v256_shl_n_word<9>),
                      MAP(imm_v256_shl_n_word<10>),
                      MAP(imm_v256_shl_n_word<11>),
                      MAP(imm_v256_shl_n_word<12>),
                      MAP(imm_v256_shl_n_word<13>),
                      MAP(imm_v256_shl_n_word<14>),
                      MAP(imm_v256_shl_n_word<15>),
                      MAP(imm_v256_shr_n_byte<1>),
                      MAP(imm_v256_shr_n_byte<2>),
                      MAP(imm_v256_shr_n_byte<3>),
                      MAP(imm_v256_shr_n_byte<4>),
                      MAP(imm_v256_shr_n_byte<5>),
                      MAP(imm_v256_shr_n_byte<6>),
                      MAP(imm_v256_shr_n_byte<7>),
                      MAP(imm_v256_shr_n_byte<8>),
                      MAP(imm_v256_shr_n_byte<9>),
                      MAP(imm_v256_shr_n_byte<10>),
                      MAP(imm_v256_shr_n_byte<11>),
                      MAP(imm_v256_shr_n_byte<12>),
                      MAP(imm_v256_shr_n_byte<13>),
                      MAP(imm_v256_shr_n_byte<14>),
                      MAP(imm_v256_shr_n_byte<15>),
                      MAP(imm_v256_shr_n_byte<16>),
                      MAP(imm_v256_shr_n_byte<17>),
                      MAP(imm_v256_shr_n_byte<18>),
                      MAP(imm_v256_shr_n_byte<19>),
                      MAP(imm_v256_shr_n_byte<20>),
                      MAP(imm_v256_shr_n_byte<21>),
                      MAP(imm_v256_shr_n_byte<22>),
                      MAP(imm_v256_shr_n_byte<23>),
                      MAP(imm_v256_shr_n_byte<24>),
                      MAP(imm_v256_shr_n_byte<25>),
                      MAP(imm_v256_shr_n_byte<26>),
                      MAP(imm_v256_shr_n_byte<27>),
                      MAP(imm_v256_shr_n_byte<28>),
                      MAP(imm_v256_shr_n_byte<29>),
                      MAP(imm_v256_shr_n_byte<30>),
                      MAP(imm_v256_shr_n_byte<31>),
                      MAP(imm_v256_shl_n_byte<1>),
                      MAP(imm_v256_shl_n_byte<2>),
                      MAP(imm_v256_shl_n_byte<3>),
                      MAP(imm_v256_shl_n_byte<4>),
                      MAP(imm_v256_shl_n_byte<5>),
                      MAP(imm_v256_shl_n_byte<6>),
                      MAP(imm_v256_shl_n_byte<7>),
                      MAP(imm_v256_shl_n_byte<8>),
                      MAP(imm_v256_shl_n_byte<9>),
                      MAP(imm_v256_shl_n_byte<10>),
                      MAP(imm_v256_shl_n_byte<11>),
                      MAP(imm_v256_shl_n_byte<12>),
                      MAP(imm_v256_shl_n_byte<13>),
                      MAP(imm_v256_shl_n_byte<14>),
                      MAP(imm_v256_shl_n_byte<15>),
                      MAP(imm_v256_shl_n_byte<16>),
                      MAP(imm_v256_shl_n_byte<17>),
                      MAP(imm_v256_shl_n_byte<18>),
                      MAP(imm_v256_shl_n_byte<19>),
                      MAP(imm_v256_shl_n_byte<20>),
                      MAP(imm_v256_shl_n_byte<21>),
                      MAP(imm_v256_shl_n_byte<22>),
                      MAP(imm_v256_shl_n_byte<23>),
                      MAP(imm_v256_shl_n_byte<24>),
                      MAP(imm_v256_shl_n_byte<25>),
                      MAP(imm_v256_shl_n_byte<26>),
                      MAP(imm_v256_shl_n_byte<27>),
                      MAP(imm_v256_shl_n_byte<28>),
                      MAP(imm_v256_shl_n_byte<29>),
                      MAP(imm_v256_shl_n_byte<30>),
                      MAP(imm_v256_shl_n_byte<31>),
                      MAP(imm_v256_shl_n_8<1>),
                      MAP(imm_v256_shl_n_8<2>),
                      MAP(imm_v256_shl_n_8<3>),
                      MAP(imm_v256_shl_n_8<4>),
                      MAP(imm_v256_shl_n_8<5>),
                      MAP(imm_v256_shl_n_8<6>),
                      MAP(imm_v256_shl_n_8<7>),
                      MAP(imm_v256_shr_n_u8<1>),
                      MAP(imm_v256_shr_n_u8<2>),
                      MAP(imm_v256_shr_n_u8<3>),
                      MAP(imm_v256_shr_n_u8<4>),
                      MAP(imm_v256_shr_n_u8<5>),
                      MAP(imm_v256_shr_n_u8<6>),
                      MAP(imm_v256_shr_n_u8<7>),
                      MAP(imm_v256_shr_n_s8<1>),
                      MAP(imm_v256_shr_n_s8<2>),
                      MAP(imm_v256_shr_n_s8<3>),
                      MAP(imm_v256_shr_n_s8<4>),
                      MAP(imm_v256_shr_n_s8<5>),
                      MAP(imm_v256_shr_n_s8<6>),
                      MAP(imm_v256_shr_n_s8<7>),
                      MAP(imm_v256_shl_n_16<1>),
                      MAP(imm_v256_shl_n_16<2>),
                      MAP(imm_v256_shl_n_16<4>),
                      MAP(imm_v256_shl_n_16<6>),
                      MAP(imm_v256_shl_n_16<8>),
                      MAP(imm_v256_shl_n_16<10>),
                      MAP(imm_v256_shl_n_16<12>),
                      MAP(imm_v256_shl_n_16<14>),
                      MAP(imm_v256_shr_n_u16<1>),
                      MAP(imm_v256_shr_n_u16<2>),
                      MAP(imm_v256_shr_n_u16<4>),
                      MAP(imm_v256_shr_n_u16<6>),
                      MAP(imm_v256_shr_n_u16<8>),
                      MAP(imm_v256_shr_n_u16<10>),
                      MAP(imm_v256_shr_n_u16<12>),
                      MAP(imm_v256_shr_n_u16<14>),
                      MAP(imm_v256_shr_n_s16<1>),
                      MAP(imm_v256_shr_n_s16<2>),
                      MAP(imm_v256_shr_n_s16<4>),
                      MAP(imm_v256_shr_n_s16<6>),
                      MAP(imm_v256_shr_n_s16<8>),
                      MAP(imm_v256_shr_n_s16<10>),
                      MAP(imm_v256_shr_n_s16<12>),
                      MAP(imm_v256_shr_n_s16<14>),
                      MAP(imm_v256_shl_n_32<1>),
                      MAP(imm_v256_shl_n_32<4>),
                      MAP(imm_v256_shl_n_32<8>),
                      MAP(imm_v256_shl_n_32<12>),
                      MAP(imm_v256_shl_n_32<16>),
                      MAP(imm_v256_shl_n_32<20>),
                      MAP(imm_v256_shl_n_32<24>),
                      MAP(imm_v256_shl_n_32<28>),
                      MAP(imm_v256_shr_n_u32<1>),
                      MAP(imm_v256_shr_n_u32<4>),
                      MAP(imm_v256_shr_n_u32<8>),
                      MAP(imm_v256_shr_n_u32<12>),
                      MAP(imm_v256_shr_n_u32<16>),
                      MAP(imm_v256_shr_n_u32<20>),
                      MAP(imm_v256_shr_n_u32<24>),
                      MAP(imm_v256_shr_n_u32<28>),
                      MAP(imm_v256_shr_n_s32<1>),
                      MAP(imm_v256_shr_n_s32<4>),
                      MAP(imm_v256_shr_n_s32<8>),
                      MAP(imm_v256_shr_n_s32<12>),
                      MAP(imm_v256_shr_n_s32<16>),
                      MAP(imm_v256_shr_n_s32<20>),
                      MAP(imm_v256_shr_n_s32<24>),
                      MAP(imm_v256_shr_n_s32<28>),
                      MAP(imm_v256_shl_n_64<1>),
                      MAP(imm_v256_shl_n_64<4>),
                      MAP(imm_v256_shl_n_64<8>),
                      MAP(imm_v256_shl_n_64<12>),
                      MAP(imm_v256_shl_n_64<16>),
                      MAP(imm_v256_shl_n_64<20>),
                      MAP(imm_v256_shl_n_64<24>),
                      MAP(imm_v256_shl_n_64<28>),
                      MAP(imm_v256_shl_n_64<32>),
                      MAP(imm_v256_shl_n_64<36>),
                      MAP(imm_v256_shl_n_64<40>),
                      MAP(imm_v256_shl_n_64<44>),
                      MAP(imm_v256_shl_n_64<48>),
                      MAP(imm_v256_shl_n_64<52>),
                      MAP(imm_v256_shl_n_64<56>),
                      MAP(imm_v256_shl_n_64<60>),
                      MAP(imm_v256_shr_n_u64<1>),
                      MAP(imm_v256_shr_n_u64<4>),
                      MAP(imm_v256_shr_n_u64<8>),
                      MAP(imm_v256_shr_n_u64<12>),
                      MAP(imm_v256_shr_n_u64<16>),
                      MAP(imm_v256_shr_n_u64<20>),
                      MAP(imm_v256_shr_n_u64<24>),
                      MAP(imm_v256_shr_n_u64<28>),
                      MAP(imm_v256_shr_n_u64<32>),
                      MAP(imm_v256_shr_n_u64<36>),
                      MAP(imm_v256_shr_n_u64<40>),
                      MAP(imm_v256_shr_n_u64<44>),
                      MAP(imm_v256_shr_n_u64<48>),
                      MAP(imm_v256_shr_n_u64<52>),
                      MAP(imm_v256_shr_n_u64<56>),
                      MAP(imm_v256_shr_n_u64<60>),
                      MAP(imm_v256_shr_n_s64<1>),
                      MAP(imm_v256_shr_n_s64<4>),
                      MAP(imm_v256_shr_n_s64<8>),
                      MAP(imm_v256_shr_n_s64<12>),
                      MAP(imm_v256_shr_n_s64<16>),
                      MAP(imm_v256_shr_n_s64<20>),
                      MAP(imm_v256_shr_n_s64<24>),
                      MAP(imm_v256_shr_n_s64<28>),
                      MAP(imm_v256_shr_n_s64<32>),
                      MAP(imm_v256_shr_n_s64<36>),
                      MAP(imm_v256_shr_n_s64<40>),
                      MAP(imm_v256_shr_n_s64<44>),
                      MAP(imm_v256_shr_n_s64<48>),
                      MAP(imm_v256_shr_n_s64<52>),
                      MAP(imm_v256_shr_n_s64<56>),
                      MAP(imm_v256_shr_n_s64<60>),
                      MAP(v256_movemask_8),
                      MAP(v256_zero),
                      MAP(v256_dup_8),
                      MAP(v256_dup_16),
                      MAP(v256_dup_32),
                      MAP(v256_dup_64),
                      MAP(v256_low_u32),
                      MAP(v256_low_v64),
                      MAP(v256_from_64),
                      MAP(v256_from_v64),
                      MAP(v256_ziplo_128),
                      MAP(v256_ziphi_128),
                      MAP(v256_unpacklo_u8_s16),
                      MAP(v256_unpackhi_u8_s16),
                      MAP(v256_unpacklo_s8_s16),
                      MAP(v256_unpackhi_s8_s16),
                      MAP(v256_blend_8),
                      { nullptr, nullptr, nullptr } };
#undef MAP

// Map reference functions to machine tuned functions. Since the
// functions depend on machine tuned types, the non-machine tuned
// instantiations of the test can't refer to these functions directly,
// so we refer to them by name and do the mapping here.
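//
// A hypothetical usage sketch (the actual callers live in the test driver):
//   fptr ref, simd;
//   Map("v64_add_8", &ref, &simd);
//   // ref now points at c_v64_add_8 and simd at v64_add_8.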
void Map(const char *name, fptr *ref, fptr *simd) {
  unsigned int i;
  for (i = 0; m[i].name && strcmp(name, m[i].name); i++) {
  }

  *ref = m[i].ref;
  *simd = m[i].simd;
}

// Used for printing errors in TestSimd1Arg, TestSimd2Args and TestSimd3Args
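// For example, on a little-endian target the four bytes { 0x0d, 0x0c, 0x0b,
// 0x0a } are printed as "0x0a0b0c0d" (most significant byte first).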
std::string Print(const uint8_t *a, int size) {
  std::string text = "0x";
  for (int i = 0; i < size; i++) {
    const uint8_t c = a[!CONFIG_BIG_ENDIAN ? size - 1 - i : i];
    // Same as snprintf(..., ..., "%02x", c)
    text += (c >> 4) + '0' + ((c >> 4) > 9) * ('a' - '0' - 10);
    text += (c & 15) + '0' + ((c & 15) > 9) * ('a' - '0' - 10);
  }

  return text;
}

// Used in TestSimd1Arg, TestSimd2Args and TestSimd3Args to restrict argument
// ranges
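// For example (a hypothetical call), SetMask(s, 32, 0x1f, 8) clamps every byte
// of a 32-byte buffer to the range [0, 31], e.g. to keep shift amounts valid.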
1335void SetMask(uint8_t *s, int size, uint32_t mask, uint32_t maskwidth) {
1336  switch (maskwidth) {
1337    case 0: {
1338      break;
1339    }
1340    case 8: {
1341      for (int i = 0; i < size; i++) s[i] &= mask;
1342      break;
1343    }
1344    case 16: {
1345      uint16_t *t = reinterpret_cast<uint16_t *>(s);
1346      assert(!(reinterpret_cast<uintptr_t>(s) & 1));
1347      for (int i = 0; i < size / 2; i++) t[i] &= mask;
1348      break;
1349    }
1350    case 32: {
1351      uint32_t *t = reinterpret_cast<uint32_t *>(s);
1352      assert(!(reinterpret_cast<uintptr_t>(s) & 3));
1353      for (int i = 0; i < size / 4; i++) t[i] &= mask;
1354      break;
1355    }
1356    case 64: {
1357      uint64_t *t = reinterpret_cast<uint64_t *>(s);
1358      assert(!(reinterpret_cast<uintptr_t>(s) & 7));
1359      for (int i = 0; i < size / 8; i++) t[i] &= mask;
1360      break;
1361    }
1362    default: {
1363      FAIL() << "Unsupported mask width";
1364      break;
1365    }
1366  }
1367}
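// For example, a call like SetMask(s, sizeof(c_v128), 0x1f, 32) (hypothetical
// values) masks each 32 bit lane of a 128 bit argument down to [0, 31],
// e.g. to keep a shift amount in range.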
1368
1369// Extra load/store wrappers so scalar types fit the generic fptr interface
1370void u64_store_aligned(void *p, uint64_t a) {
1371  v64_store_aligned(p, v64_from_64(a));
1372}
1373void s32_store_aligned(void *p, int32_t a) {
1374  u32_store_aligned(p, static_cast<uint32_t>(a));
1375}
1376void s64_store_aligned(void *p, int64_t a) {
1377  v64_store_aligned(p, v64_from_64(static_cast<uint64_t>(a)));
1378}
1379
1380void c_u64_store_aligned(void *p, uint64_t a) {
1381  c_v64_store_aligned(p, c_v64_from_64(a));
1382}
1383
1384void c_s32_store_aligned(void *p, int32_t a) {
1385  c_u32_store_aligned(p, static_cast<uint32_t>(a));
1386}
1387
1388void c_s64_store_aligned(void *p, int64_t a) {
1389  c_v64_store_aligned(p, c_v64_from_64(static_cast<uint64_t>(a)));
1390}
1391
1392uint64_t u64_load_aligned(const void *p) {
1393  return v64_u64(v64_load_aligned(p));
1394}
1395uint16_t u16_load_aligned(const void *p) {
1396  return *(reinterpret_cast<const uint16_t *>(p));
1397}
1398uint8_t u8_load_aligned(const void *p) {
1399  return *(reinterpret_cast<const uint8_t *>(p));
1400}
1401
1402uint64_t c_u64_load_aligned(const void *p) {
1403  return c_v64_u64(c_v64_load_aligned(p));
1404}
1405uint16_t c_u16_load_aligned(const void *p) {
1406  return *(reinterpret_cast<const uint16_t *>(p));
1407}
1408uint8_t c_u8_load_aligned(const void *p) {
1409  return *(reinterpret_cast<const uint8_t *>(p));
1410}
1411
1412// CompareSimd1Arg, CompareSimd2Args and CompareSimd3Args compare
1413// intrinsics taking 1, 2 or 3 arguments respectively with their
1414// corresponding C reference.  Ideally, the loads and stores would have
1415// gone into the template parameter list, but v64 and v128 may be
1416// typedef'ed to the same type (as is the case on x86), in which case
1417// both cannot be instantiated.  Instead, the function return and
1418// argument types, together with the always distinct types of the C
1419// equivalent, are used.  The function arguments must be void pointers
1420// that are cast back, to avoid matching errors in the branches
1421// eliminated by the typeid tests in the calling function.
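// For instance, TestSimd1Arg below dispatches a V64_V64 intrinsic as
//   CompareSimd1Arg<v64, v64, c_v64, c_v64>(
//       reinterpret_cast<fptr>(v64_store_aligned),
//       reinterpret_cast<fptr>(v64_load_aligned), simd, d,
//       reinterpret_cast<fptr>(c_v64_store_aligned),
//       reinterpret_cast<fptr>(c_v64_load_aligned), ref_simd, ref_d, s);
// and the casts below restore the real signatures before the intrinsic and
// its C reference are run and their stored results compared with memcmp.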
1422template <typename Ret, typename Arg, typename CRet, typename CArg>
1423int CompareSimd1Arg(fptr store, fptr load, fptr simd, void *d, fptr c_store,
1424                    fptr c_load, fptr c_simd, void *ref_d, const void *a) {
1425  void (*const my_store)(void *, Ret) = (void (*const)(void *, Ret))store;
1426  Arg (*const my_load)(const void *) = (Arg(*const)(const void *))load;
1427  Ret (*const my_simd)(Arg) = (Ret(*const)(Arg))simd;
1428  void (*const my_c_store)(void *, CRet) = (void (*const)(void *, CRet))c_store;
1429  CArg (*const my_c_load)(const void *) = (CArg(*const)(const void *))c_load;
1430  CRet (*const my_c_simd)(CArg) = (CRet(*const)(CArg))c_simd;
1431
1432  // Call reference and intrinsic
1433  my_c_store(ref_d, my_c_simd(my_c_load(a)));
1434  my_store(d, my_simd(my_load(a)));
1435
1436  // Compare results
1437  return memcmp(ref_d, d, sizeof(CRet));
1438}
1439
1440template <typename Ret, typename Arg1, typename Arg2, typename CRet,
1441          typename CArg1, typename CArg2>
1442int CompareSimd2Args(fptr store, fptr load1, fptr load2, fptr simd, void *d,
1443                     fptr c_store, fptr c_load1, fptr c_load2, fptr c_simd,
1444                     void *ref_d, const void *a, const void *b) {
1445  void (*const my_store)(void *, Ret) = (void (*const)(void *, Ret))store;
1446  Arg1 (*const my_load1)(const void *) = (Arg1(*const)(const void *))load1;
1447  Arg2 (*const my_load2)(const void *) = (Arg2(*const)(const void *))load2;
1448  Ret (*const my_simd)(Arg1, Arg2) = (Ret(*const)(Arg1, Arg2))simd;
1449  void (*const my_c_store)(void *, CRet) = (void (*const)(void *, CRet))c_store;
1450  CArg1 (*const my_c_load1)(const void *) =
1451      (CArg1(*const)(const void *))c_load1;
1452  CArg2 (*const my_c_load2)(const void *) =
1453      (CArg2(*const)(const void *))c_load2;
1454  CRet (*const my_c_simd)(CArg1, CArg2) = (CRet(*const)(CArg1, CArg2))c_simd;
1455
1456  // Call reference and intrinsic
1457  my_c_store(ref_d, my_c_simd(my_c_load1(a), my_c_load2(b)));
1458  my_store(d, my_simd(my_load1(a), my_load2(b)));
1459
1460  // Compare results
1461  return memcmp(ref_d, d, sizeof(CRet));
1462}
1463
1464template <typename Ret, typename Arg1, typename Arg2, typename Arg3,
1465          typename CRet, typename CArg1, typename CArg2, typename CArg3>
1466int CompareSimd3Args(fptr store, fptr load1, fptr load2, fptr load3, fptr simd,
1467                     void *d, fptr c_store, fptr c_load1, fptr c_load2,
1468                     fptr c_load3, fptr c_simd, void *ref_d, const void *a,
1469                     const void *b, const void *c) {
1470  void (*const my_store)(void *, Ret) = (void (*const)(void *, Ret))store;
1471  Arg1 (*const my_load1)(const void *) = (Arg1(*const)(const void *))load1;
1472  Arg2 (*const my_load2)(const void *) = (Arg2(*const)(const void *))load2;
1473  Arg3 (*const my_load3)(const void *) = (Arg3(*const)(const void *))load3;
1474  Ret (*const my_simd)(Arg1, Arg2, Arg3) = (Ret(*const)(Arg1, Arg2, Arg3))simd;
1475  void (*const my_c_store)(void *, CRet) = (void (*const)(void *, CRet))c_store;
1476  CArg1 (*const my_c_load1)(const void *) =
1477      (CArg1(*const)(const void *))c_load1;
1478  CArg2 (*const my_c_load2)(const void *) =
1479      (CArg2(*const)(const void *))c_load2;
1480  CArg3 (*const my_c_load3)(const void *) =
1481      (CArg3(*const)(const void *))c_load3;
1482  CRet (*const my_c_simd)(CArg1, CArg2, CArg3) =
1483      (CRet(*const)(CArg1, CArg2, CArg3))c_simd;
1484
1485  // Call reference and intrinsic
1486  my_c_store(ref_d, my_c_simd(my_c_load1(a), my_c_load2(b), my_c_load3(c)));
1487  my_store(d, my_simd(my_load1(a), my_load2(b), my_load3(c)));
1488
1489  // Compare results
1490  return memcmp(ref_d, d, sizeof(CRet));
1491}
1492
1493}  // namespace
1494
1495template <typename CRet, typename CArg>
1496void TestSimd1Arg(uint32_t iterations, uint32_t mask, uint32_t maskwidth,
1497                  const char *name) {
1498  ACMRandom rnd(ACMRandom::DeterministicSeed());
1499  fptr ref_simd;
1500  fptr simd;
1501  int error = 0;
1502  DECLARE_ALIGNED(32, uint8_t, s[32]);
1503  DECLARE_ALIGNED(32, uint8_t, d[32]);
1504  DECLARE_ALIGNED(32, uint8_t, ref_d[32]);
1505  assert(sizeof(CArg) <= 32 && sizeof(CRet) <= 32);
1506  memset(ref_d, 0, sizeof(ref_d));
1507  memset(d, 0, sizeof(d));
1508
1509  Map(name, &ref_simd, &simd);
1510  if (simd == nullptr || ref_simd == nullptr) {
1511    FAIL() << "Internal error: Unknown intrinsic function " << name;
1512  }
1513  for (unsigned int count = 0;
1514       count < iterations && !error && !testing::Test::HasFailure(); count++) {
1515    for (unsigned int c = 0; c < sizeof(CArg); c++) s[c] = rnd.Rand8();
1516
1517    if (maskwidth) {
1518      SetMask(s, sizeof(CArg), mask, maskwidth);
1519    }
1520
1521    if (typeid(CRet) == typeid(c_v64) && typeid(CArg) == typeid(c_v64)) {
1522      // V64_V64
1523      error = CompareSimd1Arg<v64, v64, c_v64, c_v64>(
1524          reinterpret_cast<fptr>(v64_store_aligned),
1525          reinterpret_cast<fptr>(v64_load_aligned), simd, d,
1526          reinterpret_cast<fptr>(c_v64_store_aligned),
1527          reinterpret_cast<fptr>(c_v64_load_aligned), ref_simd, ref_d, s);
1528    } else if (typeid(CRet) == typeid(c_v64) &&
1529               typeid(CArg) == typeid(uint8_t)) {
1530      // V64_U8
1531      error = CompareSimd1Arg<v64, uint8_t, c_v64, uint8_t>(
1532          reinterpret_cast<fptr>(v64_store_aligned),
1533          reinterpret_cast<fptr>(u8_load_aligned), simd, d,
1534          reinterpret_cast<fptr>(c_v64_store_aligned),
1535          reinterpret_cast<fptr>(c_u8_load_aligned), ref_simd, ref_d, s);
1536    } else if (typeid(CRet) == typeid(c_v64) &&
1537               typeid(CArg) == typeid(uint16_t)) {
1538      // V64_U16
1539      error = CompareSimd1Arg<v64, uint16_t, c_v64, uint16_t>(
1540          reinterpret_cast<fptr>(v64_store_aligned),
1541          reinterpret_cast<fptr>(u16_load_aligned), simd, d,
1542          reinterpret_cast<fptr>(c_v64_store_aligned),
1543          reinterpret_cast<fptr>(c_u16_load_aligned), ref_simd, ref_d, s);
1544    } else if (typeid(CRet) == typeid(c_v64) &&
1545               typeid(CArg) == typeid(uint32_t)) {
1546      // V64_U32
1547      error = CompareSimd1Arg<v64, uint32_t, c_v64, uint32_t>(
1548          reinterpret_cast<fptr>(v64_store_aligned),
1549          reinterpret_cast<fptr>(u32_load_aligned), simd, d,
1550          reinterpret_cast<fptr>(c_v64_store_aligned),
1551          reinterpret_cast<fptr>(c_u32_load_aligned), ref_simd, ref_d, s);
1552    } else if (typeid(CRet) == typeid(uint64_t) &&
1553               typeid(CArg) == typeid(c_v64)) {
1554      // U64_V64
1555      error = CompareSimd1Arg<uint64_t, v64, uint64_t, c_v64>(
1556          reinterpret_cast<fptr>(u64_store_aligned),
1557          reinterpret_cast<fptr>(v64_load_aligned), simd, d,
1558          reinterpret_cast<fptr>(c_u64_store_aligned),
1559          reinterpret_cast<fptr>(c_v64_load_aligned), ref_simd, ref_d, s);
1560    } else if (typeid(CRet) == typeid(int64_t) &&
1561               typeid(CArg) == typeid(c_v64)) {
1562      // S64_V64
1563      error = CompareSimd1Arg<int64_t, v64, int64_t, c_v64>(
1564          reinterpret_cast<fptr>(s64_store_aligned),
1565          reinterpret_cast<fptr>(v64_load_aligned), simd, d,
1566          reinterpret_cast<fptr>(c_s64_store_aligned),
1567          reinterpret_cast<fptr>(c_v64_load_aligned), ref_simd, ref_d, s);
1568    } else if (typeid(CRet) == typeid(uint32_t) &&
1569               typeid(CArg) == typeid(c_v64)) {
1570      // U32_V64
1571      error = CompareSimd1Arg<uint32_t, v64, uint32_t, c_v64>(
1572          reinterpret_cast<fptr>(u32_store_aligned),
1573          reinterpret_cast<fptr>(v64_load_aligned), simd, d,
1574          reinterpret_cast<fptr>(c_u32_store_aligned),
1575          reinterpret_cast<fptr>(c_v64_load_aligned), ref_simd, ref_d, s);
1576    } else if (typeid(CRet) == typeid(int32_t) &&
1577               typeid(CArg) == typeid(c_v64)) {
1578      // S32_V64
1579      error = CompareSimd1Arg<int32_t, v64, int32_t, c_v64>(
1580          reinterpret_cast<fptr>(s32_store_aligned),
1581          reinterpret_cast<fptr>(v64_load_aligned), simd, d,
1582          reinterpret_cast<fptr>(c_s32_store_aligned),
1583          reinterpret_cast<fptr>(c_v64_load_aligned), ref_simd, ref_d, s);
1584    } else if (typeid(CRet) == typeid(uint32_t) &&
1585               typeid(CArg) == typeid(c_v128)) {
1586      // U32_V128
1587      error = CompareSimd1Arg<uint32_t, v128, uint32_t, c_v128>(
1588          reinterpret_cast<fptr>(u32_store_aligned),
1589          reinterpret_cast<fptr>(v128_load_aligned), simd, d,
1590          reinterpret_cast<fptr>(c_u32_store_aligned),
1591          reinterpret_cast<fptr>(c_v128_load_aligned), ref_simd, ref_d, s);
1592    } else if (typeid(CRet) == typeid(uint64_t) &&
1593               typeid(CArg) == typeid(c_v128)) {
1594      // U64_V128
1595      error = CompareSimd1Arg<uint64_t, v128, uint64_t, c_v128>(
1596          reinterpret_cast<fptr>(u64_store_aligned),
1597          reinterpret_cast<fptr>(v128_load_aligned), simd, d,
1598          reinterpret_cast<fptr>(c_u64_store_aligned),
1599          reinterpret_cast<fptr>(c_v128_load_aligned), ref_simd, ref_d, s);
1600    } else if (typeid(CRet) == typeid(uint64_t) &&
1601               typeid(CArg) == typeid(c_v256)) {
1602      // U64_V256
1603      error = CompareSimd1Arg<uint64_t, v256, uint64_t, c_v256>(
1604          reinterpret_cast<fptr>(u64_store_aligned),
1605          reinterpret_cast<fptr>(v256_load_aligned), simd, d,
1606          reinterpret_cast<fptr>(c_u64_store_aligned),
1607          reinterpret_cast<fptr>(c_v256_load_aligned), ref_simd, ref_d, s);
1608    } else if (typeid(CRet) == typeid(c_v64) &&
1609               typeid(CArg) == typeid(c_v128)) {
1610      // V64_V128
1611      error = CompareSimd1Arg<v64, v128, c_v64, c_v128>(
1612          reinterpret_cast<fptr>(v64_store_aligned),
1613          reinterpret_cast<fptr>(v128_load_aligned), simd, d,
1614          reinterpret_cast<fptr>(c_v64_store_aligned),
1615          reinterpret_cast<fptr>(c_v128_load_aligned), ref_simd, ref_d, s);
1616    } else if (typeid(CRet) == typeid(c_v128) &&
1617               typeid(CArg) == typeid(c_v128)) {
1618      // V128_V128
1619      error = CompareSimd1Arg<v128, v128, c_v128, c_v128>(
1620          reinterpret_cast<fptr>(v128_store_aligned),
1621          reinterpret_cast<fptr>(v128_load_aligned), simd, d,
1622          reinterpret_cast<fptr>(c_v128_store_aligned),
1623          reinterpret_cast<fptr>(c_v128_load_aligned), ref_simd, ref_d, s);
1624    } else if (typeid(CRet) == typeid(c_v128) &&
1625               typeid(CArg) == typeid(c_v64)) {
1626      // V128_V64
1627      error = CompareSimd1Arg<v128, v64, c_v128, c_v64>(
1628          reinterpret_cast<fptr>(v128_store_aligned),
1629          reinterpret_cast<fptr>(v64_load_aligned), simd, d,
1630          reinterpret_cast<fptr>(c_v128_store_aligned),
1631          reinterpret_cast<fptr>(c_v64_load_aligned), ref_simd, ref_d, s);
1632    } else if (typeid(CRet) == typeid(c_v128) &&
1633               typeid(CArg) == typeid(uint8_t)) {
1634      // V128_U8
1635      error = CompareSimd1Arg<v128, uint8_t, c_v128, uint8_t>(
1636          reinterpret_cast<fptr>(v128_store_aligned),
1637          reinterpret_cast<fptr>(u8_load_aligned), simd, d,
1638          reinterpret_cast<fptr>(c_v128_store_aligned),
1639          reinterpret_cast<fptr>(c_u8_load_aligned), ref_simd, ref_d, s);
1640    } else if (typeid(CRet) == typeid(c_v128) &&
1641               typeid(CArg) == typeid(uint16_t)) {
1642      // V128_U16
1643      error = CompareSimd1Arg<v128, uint16_t, c_v128, uint16_t>(
1644          reinterpret_cast<fptr>(v128_store_aligned),
1645          reinterpret_cast<fptr>(u16_load_aligned), simd, d,
1646          reinterpret_cast<fptr>(c_v128_store_aligned),
1647          reinterpret_cast<fptr>(c_u16_load_aligned), ref_simd, ref_d, s);
1648    } else if (typeid(CRet) == typeid(c_v128) &&
1649               typeid(CArg) == typeid(uint32_t)) {
1650      // V128_U32
1651      error = CompareSimd1Arg<v128, uint32_t, c_v128, uint32_t>(
1652          reinterpret_cast<fptr>(v128_store_aligned),
1653          reinterpret_cast<fptr>(u32_load_aligned), simd, d,
1654          reinterpret_cast<fptr>(c_v128_store_aligned),
1655          reinterpret_cast<fptr>(c_u32_load_aligned), ref_simd, ref_d, s);
1656    } else if (typeid(CRet) == typeid(c_v128) &&
1657               typeid(CArg) == typeid(uint64_t)) {
1658      // V128_U64
1659      error = CompareSimd1Arg<v128, uint64_t, c_v128, uint64_t>(
1660          reinterpret_cast<fptr>(v128_store_aligned),
1661          reinterpret_cast<fptr>(u64_load_aligned), simd, d,
1662          reinterpret_cast<fptr>(c_v128_store_aligned),
1663          reinterpret_cast<fptr>(c_u64_load_aligned), ref_simd, ref_d, s);
1664    } else if (typeid(CRet) == typeid(c_v256) &&
1665               typeid(CArg) == typeid(c_v256)) {
1666      // V256_V256
1667      error = CompareSimd1Arg<v256, v256, c_v256, c_v256>(
1668          reinterpret_cast<fptr>(v256_store_aligned),
1669          reinterpret_cast<fptr>(v256_load_aligned), simd, d,
1670          reinterpret_cast<fptr>(c_v256_store_aligned),
1671          reinterpret_cast<fptr>(c_v256_load_aligned), ref_simd, ref_d, s);
1672    } else if (typeid(CRet) == typeid(c_v256) &&
1673               typeid(CArg) == typeid(c_v128)) {
1674      // V256_V128
1675      error = CompareSimd1Arg<v256, v128, c_v256, c_v128>(
1676          reinterpret_cast<fptr>(v256_store_aligned),
1677          reinterpret_cast<fptr>(v128_load_aligned), simd, d,
1678          reinterpret_cast<fptr>(c_v256_store_aligned),
1679          reinterpret_cast<fptr>(c_v128_load_aligned), ref_simd, ref_d, s);
1680    } else if (typeid(CRet) == typeid(c_v256) &&
1681               typeid(CArg) == typeid(uint8_t)) {
1682      // V256_U8
1683      error = CompareSimd1Arg<v256, uint8_t, c_v256, uint8_t>(
1684          reinterpret_cast<fptr>(v256_store_aligned),
1685          reinterpret_cast<fptr>(u8_load_aligned), simd, d,
1686          reinterpret_cast<fptr>(c_v256_store_aligned),
1687          reinterpret_cast<fptr>(c_u8_load_aligned), ref_simd, ref_d, s);
1688    } else if (typeid(CRet) == typeid(c_v256) &&
1689               typeid(CArg) == typeid(uint16_t)) {
1690      // V256_U16
1691      error = CompareSimd1Arg<v256, uint16_t, c_v256, uint16_t>(
1692          reinterpret_cast<fptr>(v256_store_aligned),
1693          reinterpret_cast<fptr>(u16_load_aligned), simd, d,
1694          reinterpret_cast<fptr>(c_v256_store_aligned),
1695          reinterpret_cast<fptr>(c_u16_load_aligned), ref_simd, ref_d, s);
1696    } else if (typeid(CRet) == typeid(c_v256) &&
1697               typeid(CArg) == typeid(uint32_t)) {
1698      // V256_U32
1699      error = CompareSimd1Arg<v256, uint32_t, c_v256, uint32_t>(
1700          reinterpret_cast<fptr>(v256_store_aligned),
1701          reinterpret_cast<fptr>(u32_load_aligned), simd, d,
1702          reinterpret_cast<fptr>(c_v256_store_aligned),
1703          reinterpret_cast<fptr>(c_u32_load_aligned), ref_simd, ref_d, s);
1704    } else if (typeid(CRet) == typeid(c_v256) &&
1705               typeid(CArg) == typeid(uint64_t)) {
1706      // V256_U64
1707      error = CompareSimd1Arg<v256, uint64_t, c_v256, uint64_t>(
1708          reinterpret_cast<fptr>(v256_store_aligned),
1709          reinterpret_cast<fptr>(u64_load_aligned), simd, d,
1710          reinterpret_cast<fptr>(c_v256_store_aligned),
1711          reinterpret_cast<fptr>(c_u64_load_aligned), ref_simd, ref_d, s);
1712    } else if (typeid(CRet) == typeid(uint32_t) &&
1713               typeid(CArg) == typeid(c_v256)) {
1714      // U32_V256
1715      error = CompareSimd1Arg<uint32_t, v256, uint32_t, c_v256>(
1716          reinterpret_cast<fptr>(u32_store_aligned),
1717          reinterpret_cast<fptr>(v256_load_aligned), simd, d,
1718          reinterpret_cast<fptr>(c_u32_store_aligned),
1719          reinterpret_cast<fptr>(c_v256_load_aligned), ref_simd, ref_d, s);
1720    } else if (typeid(CRet) == typeid(c_v64) &&
1721               typeid(CArg) == typeid(c_v256)) {
1722      // V64_V256
1723      error = CompareSimd1Arg<v64, v256, c_v64, c_v256>(
1724          reinterpret_cast<fptr>(v64_store_aligned),
1725          reinterpret_cast<fptr>(v256_load_aligned), simd, d,
1726          reinterpret_cast<fptr>(c_v64_store_aligned),
1727          reinterpret_cast<fptr>(c_v256_load_aligned), ref_simd, ref_d, s);
1728    } else {
1729      FAIL() << "Internal error: Unknown intrinsic function "
1730             << typeid(CRet).name() << " " << name << "(" << typeid(CArg).name()
1731             << ")";
1732    }
1733  }
1734
1735  EXPECT_EQ(0, error) << "Error: mismatch for " << name << "("
1736                      << Print(s, sizeof(CArg)) << ") -> "
1737                      << Print(d, sizeof(CRet)) << " (simd), "
1738                      << Print(ref_d, sizeof(CRet)) << " (ref)";
1739}
1740
1741template <typename CRet, typename CArg1, typename CArg2>
1742void TestSimd2Args(uint32_t iterations, uint32_t mask, uint32_t maskwidth,
1743                   const char *name) {
1744  ACMRandom rnd(ACMRandom::DeterministicSeed());
1745  fptr ref_simd;
1746  fptr simd;
1747  int error = 0;
1748  DECLARE_ALIGNED(32, uint8_t, s1[32]);
1749  DECLARE_ALIGNED(32, uint8_t, s2[32]);
1750  DECLARE_ALIGNED(32, uint8_t, d[32]);
1751  DECLARE_ALIGNED(32, uint8_t, ref_d[32]);
1752  assert(sizeof(CArg1) <= 32 && sizeof(CArg2) <= 32 && sizeof(CRet) <= 32);
1753  memset(ref_d, 0, sizeof(ref_d));
1754  memset(d, 0, sizeof(d));
1755
1756  Map(name, &ref_simd, &simd);
1757  if (simd == nullptr || ref_simd == nullptr) {
1758    FAIL() << "Internal error: Unknown intrinsic function " << name;
1759  }
1760
1761  for (unsigned int count = 0;
1762       count < iterations && !error && !testing::Test::HasFailure(); count++) {
1763    for (unsigned int c = 0; c < sizeof(CArg1); c++) s1[c] = rnd.Rand8();
1764
1765    for (unsigned int c = 0; c < sizeof(CArg2); c++) s2[c] = rnd.Rand8();
1766
1767    if (maskwidth) SetMask(s2, sizeof(CArg2), mask, maskwidth);
1768
1769    if (typeid(CRet) == typeid(c_v64) && typeid(CArg1) == typeid(c_v64) &&
1770        typeid(CArg2) == typeid(c_v64)) {
1771      // V64_V64V64
1772      error = CompareSimd2Args<v64, v64, v64, c_v64, c_v64, c_v64>(
1773          reinterpret_cast<fptr>(v64_store_aligned),
1774          reinterpret_cast<fptr>(v64_load_aligned),
1775          reinterpret_cast<fptr>(v64_load_aligned), simd, d,
1776          reinterpret_cast<fptr>(c_v64_store_aligned),
1777          reinterpret_cast<fptr>(c_v64_load_aligned),
1778          reinterpret_cast<fptr>(c_v64_load_aligned),
1779          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
1780    } else if (typeid(CRet) == typeid(c_v64) &&
1781               typeid(CArg1) == typeid(uint32_t) &&
1782               typeid(CArg2) == typeid(uint32_t)) {
1783      // V64_U32U32
1784      error =
1785          CompareSimd2Args<v64, uint32_t, uint32_t, c_v64, uint32_t, uint32_t>(
1786              reinterpret_cast<fptr>(v64_store_aligned),
1787              reinterpret_cast<fptr>(u32_load_aligned),
1788              reinterpret_cast<fptr>(u32_load_aligned), simd, d,
1789              reinterpret_cast<fptr>(c_v64_store_aligned),
1790              reinterpret_cast<fptr>(c_u32_load_aligned),
1791              reinterpret_cast<fptr>(c_u32_load_aligned),
1792              reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
1793    } else if (typeid(CRet) == typeid(uint32_t) &&
1794               typeid(CArg1) == typeid(c_v64) &&
1795               typeid(CArg2) == typeid(c_v64)) {
1796      // U32_V64V64
1797      error = CompareSimd2Args<uint32_t, v64, v64, uint32_t, c_v64, c_v64>(
1798          reinterpret_cast<fptr>(u32_store_aligned),
1799          reinterpret_cast<fptr>(v64_load_aligned),
1800          reinterpret_cast<fptr>(v64_load_aligned), simd, d,
1801          reinterpret_cast<fptr>(c_u32_store_aligned),
1802          reinterpret_cast<fptr>(c_v64_load_aligned),
1803          reinterpret_cast<fptr>(c_v64_load_aligned),
1804          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
1805    } else if (typeid(CRet) == typeid(int64_t) &&
1806               typeid(CArg1) == typeid(c_v64) &&
1807               typeid(CArg2) == typeid(c_v64)) {
1808      // S64_V64V64
1809      error = CompareSimd2Args<int64_t, v64, v64, int64_t, c_v64, c_v64>(
1810          reinterpret_cast<fptr>(s64_store_aligned),
1811          reinterpret_cast<fptr>(v64_load_aligned),
1812          reinterpret_cast<fptr>(v64_load_aligned), simd, d,
1813          reinterpret_cast<fptr>(c_s64_store_aligned),
1814          reinterpret_cast<fptr>(c_v64_load_aligned),
1815          reinterpret_cast<fptr>(c_v64_load_aligned),
1816          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
1817    } else if (typeid(CRet) == typeid(c_v64) &&
1818               typeid(CArg1) == typeid(c_v64) &&
1819               typeid(CArg2) == typeid(uint32_t)) {
1820      // V64_V64U32
1821      error = CompareSimd2Args<v64, v64, uint32_t, c_v64, c_v64, uint32_t>(
1822          reinterpret_cast<fptr>(v64_store_aligned),
1823          reinterpret_cast<fptr>(v64_load_aligned),
1824          reinterpret_cast<fptr>(u32_load_aligned), simd, d,
1825          reinterpret_cast<fptr>(c_v64_store_aligned),
1826          reinterpret_cast<fptr>(c_v64_load_aligned),
1827          reinterpret_cast<fptr>(c_u32_load_aligned),
1828          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
1829    } else if (typeid(CRet) == typeid(c_v128) &&
1830               typeid(CArg1) == typeid(c_v128) &&
1831               typeid(CArg2) == typeid(c_v128)) {
1832      // V128_V128V128
1833      error = CompareSimd2Args<v128, v128, v128, c_v128, c_v128, c_v128>(
1834          reinterpret_cast<fptr>(v128_store_aligned),
1835          reinterpret_cast<fptr>(v128_load_aligned),
1836          reinterpret_cast<fptr>(v128_load_aligned), simd, d,
1837          reinterpret_cast<fptr>(c_v128_store_aligned),
1838          reinterpret_cast<fptr>(c_v128_load_aligned),
1839          reinterpret_cast<fptr>(c_v128_load_aligned),
1840          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
1841    } else if (typeid(CRet) == typeid(uint32_t) &&
1842               typeid(CArg1) == typeid(c_v128) &&
1843               typeid(CArg2) == typeid(c_v128)) {
1844      // U32_V128V128
1845      error = CompareSimd2Args<uint32_t, v128, v128, uint32_t, c_v128, c_v128>(
1846          reinterpret_cast<fptr>(u32_store_aligned),
1847          reinterpret_cast<fptr>(v128_load_aligned),
1848          reinterpret_cast<fptr>(v128_load_aligned), simd, d,
1849          reinterpret_cast<fptr>(c_u32_store_aligned),
1850          reinterpret_cast<fptr>(c_v128_load_aligned),
1851          reinterpret_cast<fptr>(c_v128_load_aligned),
1852          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
1853    } else if (typeid(CRet) == typeid(uint64_t) &&
1854               typeid(CArg1) == typeid(c_v128) &&
1855               typeid(CArg2) == typeid(c_v128)) {
1856      // U64_V128V128
1857      error = CompareSimd2Args<uint64_t, v128, v128, uint64_t, c_v128, c_v128>(
1858          reinterpret_cast<fptr>(u64_store_aligned),
1859          reinterpret_cast<fptr>(v128_load_aligned),
1860          reinterpret_cast<fptr>(v128_load_aligned), simd, d,
1861          reinterpret_cast<fptr>(c_u64_store_aligned),
1862          reinterpret_cast<fptr>(c_v128_load_aligned),
1863          reinterpret_cast<fptr>(c_v128_load_aligned),
1864          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
1865    } else if (typeid(CRet) == typeid(int64_t) &&
1866               typeid(CArg1) == typeid(c_v128) &&
1867               typeid(CArg2) == typeid(c_v128)) {
1868      // S64_V128V128
1869      error = CompareSimd2Args<int64_t, v128, v128, int64_t, c_v128, c_v128>(
1870          reinterpret_cast<fptr>(s64_store_aligned),
1871          reinterpret_cast<fptr>(v128_load_aligned),
1872          reinterpret_cast<fptr>(v128_load_aligned), simd, d,
1873          reinterpret_cast<fptr>(c_s64_store_aligned),
1874          reinterpret_cast<fptr>(c_v128_load_aligned),
1875          reinterpret_cast<fptr>(c_v128_load_aligned),
1876          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
1877    } else if (typeid(CRet) == typeid(c_v128) &&
1878               typeid(CArg1) == typeid(uint64_t) &&
1879               typeid(CArg2) == typeid(uint64_t)) {
1880      // V128_U64U64
1881      error = CompareSimd2Args<v128, uint64_t, uint64_t, c_v128, uint64_t,
1882                               uint64_t>(
1883          reinterpret_cast<fptr>(v128_store_aligned),
1884          reinterpret_cast<fptr>(u64_load_aligned),
1885          reinterpret_cast<fptr>(u64_load_aligned), simd, d,
1886          reinterpret_cast<fptr>(c_v128_store_aligned),
1887          reinterpret_cast<fptr>(c_u64_load_aligned),
1888          reinterpret_cast<fptr>(c_u64_load_aligned),
1889          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
1890    } else if (typeid(CRet) == typeid(c_v128) &&
1891               typeid(CArg1) == typeid(c_v64) &&
1892               typeid(CArg2) == typeid(c_v64)) {
1893      // V128_V64V64
1894      error = CompareSimd2Args<v128, v64, v64, c_v128, c_v64, c_v64>(
1895          reinterpret_cast<fptr>(v128_store_aligned),
1896          reinterpret_cast<fptr>(v64_load_aligned),
1897          reinterpret_cast<fptr>(v64_load_aligned), simd, d,
1898          reinterpret_cast<fptr>(c_v128_store_aligned),
1899          reinterpret_cast<fptr>(c_v64_load_aligned),
1900          reinterpret_cast<fptr>(c_v64_load_aligned),
1901          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
1902    } else if (typeid(CRet) == typeid(c_v128) &&
1903               typeid(CArg1) == typeid(c_v128) &&
1904               typeid(CArg2) == typeid(uint32_t)) {
1905      // V128_V128U32
1906      error = CompareSimd2Args<v128, v128, uint32_t, c_v128, c_v128, uint32_t>(
1907          reinterpret_cast<fptr>(v128_store_aligned),
1908          reinterpret_cast<fptr>(v128_load_aligned),
1909          reinterpret_cast<fptr>(u32_load_aligned), simd, d,
1910          reinterpret_cast<fptr>(c_v128_store_aligned),
1911          reinterpret_cast<fptr>(c_v128_load_aligned),
1912          reinterpret_cast<fptr>(c_u32_load_aligned),
1913          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
1914    } else if (typeid(CRet) == typeid(c_v256) &&
1915               typeid(CArg1) == typeid(c_v256) &&
1916               typeid(CArg2) == typeid(c_v256)) {
1917      // V256_V256V256
1918      error = CompareSimd2Args<v256, v256, v256, c_v256, c_v256, c_v256>(
1919          reinterpret_cast<fptr>(v256_store_aligned),
1920          reinterpret_cast<fptr>(v256_load_aligned),
1921          reinterpret_cast<fptr>(v256_load_aligned), simd, d,
1922          reinterpret_cast<fptr>(c_v256_store_aligned),
1923          reinterpret_cast<fptr>(c_v256_load_aligned),
1924          reinterpret_cast<fptr>(c_v256_load_aligned),
1925          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
1926    } else if (typeid(CRet) == typeid(uint64_t) &&
1927               typeid(CArg1) == typeid(c_v256) &&
1928               typeid(CArg2) == typeid(c_v256)) {
1929      // U64_V256V256
1930      error = CompareSimd2Args<uint64_t, v256, v256, uint64_t, c_v256, c_v256>(
1931          reinterpret_cast<fptr>(u64_store_aligned),
1932          reinterpret_cast<fptr>(v256_load_aligned),
1933          reinterpret_cast<fptr>(v256_load_aligned), simd, d,
1934          reinterpret_cast<fptr>(c_u64_store_aligned),
1935          reinterpret_cast<fptr>(c_v256_load_aligned),
1936          reinterpret_cast<fptr>(c_v256_load_aligned),
1937          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
1938    } else if (typeid(CRet) == typeid(int64_t) &&
1939               typeid(CArg1) == typeid(c_v256) &&
1940               typeid(CArg2) == typeid(c_v256)) {
1941      // S64_V256V256
1942      error = CompareSimd2Args<int64_t, v256, v256, int64_t, c_v256, c_v256>(
1943          reinterpret_cast<fptr>(s64_store_aligned),
1944          reinterpret_cast<fptr>(v256_load_aligned),
1945          reinterpret_cast<fptr>(v256_load_aligned), simd, d,
1946          reinterpret_cast<fptr>(c_s64_store_aligned),
1947          reinterpret_cast<fptr>(c_v256_load_aligned),
1948          reinterpret_cast<fptr>(c_v256_load_aligned),
1949          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
1950    } else if (typeid(CRet) == typeid(uint32_t) &&
1951               typeid(CArg1) == typeid(c_v256) &&
1952               typeid(CArg2) == typeid(c_v256)) {
1953      // U32_V256V256
1954      error = CompareSimd2Args<uint32_t, v256, v256, uint32_t, c_v256, c_v256>(
1955          reinterpret_cast<fptr>(u32_store_aligned),
1956          reinterpret_cast<fptr>(v256_load_aligned),
1957          reinterpret_cast<fptr>(v256_load_aligned), simd, d,
1958          reinterpret_cast<fptr>(c_u32_store_aligned),
1959          reinterpret_cast<fptr>(c_v256_load_aligned),
1960          reinterpret_cast<fptr>(c_v256_load_aligned),
1961          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
1962    } else if (typeid(CRet) == typeid(c_v256) &&
1963               typeid(CArg1) == typeid(c_v128) &&
1964               typeid(CArg2) == typeid(c_v128)) {
1965      // V256_V128V128
1966      error = CompareSimd2Args<v256, v128, v128, c_v256, c_v128, c_v128>(
1967          reinterpret_cast<fptr>(v256_store_aligned),
1968          reinterpret_cast<fptr>(v128_load_aligned),
1969          reinterpret_cast<fptr>(v128_load_aligned), simd, d,
1970          reinterpret_cast<fptr>(c_v256_store_aligned),
1971          reinterpret_cast<fptr>(c_v128_load_aligned),
1972          reinterpret_cast<fptr>(c_v128_load_aligned),
1973          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
1974    } else if (typeid(CRet) == typeid(c_v256) &&
1975               typeid(CArg1) == typeid(c_v256) &&
1976               typeid(CArg2) == typeid(uint32_t)) {
1977      // V256_V256U32
1978      error = CompareSimd2Args<v256, v256, uint32_t, c_v256, c_v256, uint32_t>(
1979          reinterpret_cast<fptr>(v256_store_aligned),
1980          reinterpret_cast<fptr>(v256_load_aligned),
1981          reinterpret_cast<fptr>(u32_load_aligned), simd, d,
1982          reinterpret_cast<fptr>(c_v256_store_aligned),
1983          reinterpret_cast<fptr>(c_v256_load_aligned),
1984          reinterpret_cast<fptr>(c_u32_load_aligned),
1985          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
1986
1987    } else {
1988      FAIL() << "Internal error: Unknown intrinsic function "
1989             << typeid(CRet).name() << " " << name << "("
1990             << typeid(CArg1).name() << ", " << typeid(CArg2).name() << ")";
1991    }
1992  }
1993
1994  EXPECT_EQ(0, error) << "Error: mismatch for " << name << "("
1995                      << Print(s1, sizeof(CArg1)) << ", "
1996                      << Print(s2, sizeof(CArg2)) << ") -> "
1997                      << Print(d, sizeof(CRet)) << " (simd), "
1998                      << Print(ref_d, sizeof(CRet)) << " (ref)";
1999}
2000
2001template <typename CRet, typename CArg1, typename CArg2, typename CArg3>
2002void TestSimd3Args(uint32_t iterations, uint32_t mask, uint32_t maskwidth,
2003                   const char *name) {
2004  ACMRandom rnd(ACMRandom::DeterministicSeed());
2005  fptr ref_simd;
2006  fptr simd;
2007  int error = 0;
2008  DECLARE_ALIGNED(32, uint8_t, s1[32]);
2009  DECLARE_ALIGNED(32, uint8_t, s2[32]);
2010  DECLARE_ALIGNED(32, uint8_t, s3[32]);
2011  DECLARE_ALIGNED(32, uint8_t, d[32]);
2012  DECLARE_ALIGNED(32, uint8_t, ref_d[32]);
2013  assert(sizeof(CArg1) <= 32 && sizeof(CArg2) <= 32 && sizeof(CArg3) <= 32 &&
2014         sizeof(CRet) <= 32);
2015  memset(ref_d, 0, sizeof(ref_d));
2016  memset(d, 0, sizeof(d));
2017
2018  Map(name, &ref_simd, &simd);
2019  if (simd == nullptr || ref_simd == nullptr) {
2020    FAIL() << "Internal error: Unknown intrinsic function " << name;
2021  }
2022
2023  for (unsigned int count = 0;
2024       count < iterations && !error && !testing::Test::HasFailure(); count++) {
2025    for (unsigned int c = 0; c < sizeof(CArg1); c++) s1[c] = rnd.Rand8();
2026
2027    for (unsigned int c = 0; c < sizeof(CArg2); c++) s2[c] = rnd.Rand8();
2028
2029    for (unsigned int c = 0; c < sizeof(CArg3); c++) s3[c] = rnd.Rand8();
2030
2031    if (maskwidth) SetMask(s3, sizeof(CArg3), mask, maskwidth);
2032
2033    if (typeid(CRet) == typeid(c_v128) && typeid(CArg1) == typeid(c_v128) &&
2034        typeid(CArg2) == typeid(c_v128) && typeid(CArg3) == typeid(c_v128)) {
2035      // V128_V128V128V128
2036      error = CompareSimd3Args<v128, v128, v128, v128, c_v128, c_v128, c_v128,
2037                               c_v128>(
2038          reinterpret_cast<fptr>(v128_store_aligned),
2039          reinterpret_cast<fptr>(v128_load_aligned),
2040          reinterpret_cast<fptr>(v128_load_aligned),
2041          reinterpret_cast<fptr>(v128_load_aligned), simd, d,
2042          reinterpret_cast<fptr>(c_v128_store_aligned),
2043          reinterpret_cast<fptr>(c_v128_load_aligned),
2044          reinterpret_cast<fptr>(c_v128_load_aligned),
2045          reinterpret_cast<fptr>(c_v128_load_aligned),
2046          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2, s3);
2047    } else if (typeid(CRet) == typeid(c_v256) &&
2048               typeid(CArg1) == typeid(c_v256) &&
2049               typeid(CArg2) == typeid(c_v256) &&
2050               typeid(CArg3) == typeid(c_v256)) {
2051      // V256_V256V256V256
2052      error = CompareSimd3Args<v256, v256, v256, v256, c_v256, c_v256, c_v256,
2053                               c_v256>(
2054          reinterpret_cast<fptr>(v256_store_aligned),
2055          reinterpret_cast<fptr>(v256_load_aligned),
2056          reinterpret_cast<fptr>(v256_load_aligned),
2057          reinterpret_cast<fptr>(v256_load_aligned), simd, d,
2058          reinterpret_cast<fptr>(c_v256_store_aligned),
2059          reinterpret_cast<fptr>(c_v256_load_aligned),
2060          reinterpret_cast<fptr>(c_v256_load_aligned),
2061          reinterpret_cast<fptr>(c_v256_load_aligned),
2062          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2, s3);
2063    } else {
2064      FAIL() << "Internal error: Unknown intrinsic function "
2065             << typeid(CRet).name() << " " << name << "("
2066             << typeid(CArg1).name() << ", " << typeid(CArg2).name() << ", "
2067             << typeid(CArg3).name() << ")";
2068    }
2069  }
2070
2071  EXPECT_EQ(0, error) << "Error: mismatch for " << name << "("
2072                      << Print(s1, sizeof(CArg1)) << ", "
2073                      << Print(s2, sizeof(CArg2)) << ", "
2074                      << Print(s3, sizeof(CArg3)) << ") -> "
2075                      << Print(d, sizeof(CRet)) << " (simd), "
2076                      << Print(ref_d, sizeof(CRet)) << " (ref)";
2077}
2078
2079// Instantiations to make the functions callable from other files
2080template void TestSimd1Arg<c_v64, uint8_t>(uint32_t, uint32_t, uint32_t,
2081                                           const char *);
2082template void TestSimd1Arg<c_v64, uint16_t>(uint32_t, uint32_t, uint32_t,
2083                                            const char *);
2084template void TestSimd1Arg<c_v64, uint32_t>(uint32_t, uint32_t, uint32_t,
2085                                            const char *);
2086template void TestSimd1Arg<c_v64, c_v64>(uint32_t, uint32_t, uint32_t,
2087                                         const char *);
2088template void TestSimd1Arg<uint32_t, c_v64>(uint32_t, uint32_t, uint32_t,
2089                                            const char *);
2090template void TestSimd1Arg<int32_t, c_v64>(uint32_t, uint32_t, uint32_t,
2091                                           const char *);
2092template void TestSimd1Arg<uint64_t, c_v64>(uint32_t, uint32_t, uint32_t,
2093                                            const char *);
2094template void TestSimd1Arg<int64_t, c_v64>(uint32_t, uint32_t, uint32_t,
2095                                           const char *);
2096template void TestSimd2Args<c_v64, uint32_t, uint32_t>(uint32_t, uint32_t,
2097                                                       uint32_t, const char *);
2098template void TestSimd2Args<c_v64, c_v64, c_v64>(uint32_t, uint32_t, uint32_t,
2099                                                 const char *);
2100template void TestSimd2Args<c_v64, c_v64, uint32_t>(uint32_t, uint32_t,
2101                                                    uint32_t, const char *);
2102template void TestSimd2Args<int64_t, c_v64, c_v64>(uint32_t, uint32_t, uint32_t,
2103                                                   const char *);
2104template void TestSimd2Args<uint32_t, c_v64, c_v64>(uint32_t, uint32_t,
2105                                                    uint32_t, const char *);
2106template void TestSimd1Arg<c_v128, c_v128>(uint32_t, uint32_t, uint32_t,
2107                                           const char *);
2108template void TestSimd1Arg<c_v128, uint8_t>(uint32_t, uint32_t, uint32_t,
2109                                            const char *);
2110template void TestSimd1Arg<c_v128, uint16_t>(uint32_t, uint32_t, uint32_t,
2111                                             const char *);
2112template void TestSimd1Arg<c_v128, uint32_t>(uint32_t, uint32_t, uint32_t,
2113                                             const char *);
2114template void TestSimd1Arg<c_v128, uint64_t>(uint32_t, uint32_t, uint32_t,
2115                                             const char *);
2116template void TestSimd1Arg<c_v128, c_v64>(uint32_t, uint32_t, uint32_t,
2117                                          const char *);
2118template void TestSimd1Arg<uint32_t, c_v128>(uint32_t, uint32_t, uint32_t,
2119                                             const char *);
2120template void TestSimd1Arg<uint64_t, c_v128>(uint32_t, uint32_t, uint32_t,
2121                                             const char *);
2122template void TestSimd1Arg<c_v64, c_v128>(uint32_t, uint32_t, uint32_t,
2123                                          const char *);
2124template void TestSimd2Args<c_v128, c_v128, c_v128>(uint32_t, uint32_t,
2125                                                    uint32_t, const char *);
2126template void TestSimd2Args<c_v128, c_v128, uint32_t>(uint32_t, uint32_t,
2127                                                      uint32_t, const char *);
2128template void TestSimd2Args<c_v128, uint64_t, uint64_t>(uint32_t, uint32_t,
2129                                                        uint32_t, const char *);
2130template void TestSimd2Args<c_v128, c_v64, c_v64>(uint32_t, uint32_t, uint32_t,
2131                                                  const char *);
2132template void TestSimd2Args<uint64_t, c_v128, c_v128>(uint32_t, uint32_t,
2133                                                      uint32_t, const char *);
2134template void TestSimd2Args<int64_t, c_v128, c_v128>(uint32_t, uint32_t,
2135                                                     uint32_t, const char *);
2136template void TestSimd2Args<uint32_t, c_v128, c_v128>(uint32_t, uint32_t,
2137                                                      uint32_t, const char *);
2138template void TestSimd3Args<c_v128, c_v128, c_v128, c_v128>(uint32_t, uint32_t,
2139                                                            uint32_t,
2140                                                            const char *);
2141template void TestSimd1Arg<c_v256, c_v128>(uint32_t, uint32_t, uint32_t,
2142                                           const char *);
2143template void TestSimd1Arg<c_v256, c_v256>(uint32_t, uint32_t, uint32_t,
2144                                           const char *);
2145template void TestSimd1Arg<uint64_t, c_v256>(uint32_t, uint32_t, uint32_t,
2146                                             const char *);
2147template void TestSimd1Arg<c_v256, uint8_t>(uint32_t, uint32_t, uint32_t,
2148                                            const char *);
2149template void TestSimd1Arg<c_v256, uint16_t>(uint32_t, uint32_t, uint32_t,
2150                                             const char *);
2151template void TestSimd1Arg<c_v256, uint32_t>(uint32_t, uint32_t, uint32_t,
2152                                             const char *);
2153template void TestSimd1Arg<c_v256, uint64_t>(uint32_t, uint32_t, uint32_t,
2154                                             const char *);
2155template void TestSimd1Arg<uint32_t, c_v256>(uint32_t, uint32_t, uint32_t,
2156                                             const char *);
2157template void TestSimd1Arg<c_v64, c_v256>(uint32_t, uint32_t, uint32_t,
2158                                          const char *);
2159template void TestSimd2Args<c_v256, c_v128, c_v128>(uint32_t, uint32_t,
2160                                                    uint32_t, const char *);
2161template void TestSimd2Args<c_v256, c_v256, c_v256>(uint32_t, uint32_t,
2162                                                    uint32_t, const char *);
2163template void TestSimd2Args<c_v256, c_v256, uint32_t>(uint32_t, uint32_t,
2164                                                      uint32_t, const char *);
2165template void TestSimd2Args<uint64_t, c_v256, c_v256>(uint32_t, uint32_t,
2166                                                      uint32_t, const char *);
2167template void TestSimd2Args<int64_t, c_v256, c_v256>(uint32_t, uint32_t,
2168                                                     uint32_t, const char *);
2169template void TestSimd2Args<uint32_t, c_v256, c_v256>(uint32_t, uint32_t,
2170                                                      uint32_t, const char *);
2171template void TestSimd3Args<c_v256, c_v256, c_v256, c_v256>(uint32_t, uint32_t,
2172                                                            uint32_t,
2173                                                            const char *);
2174
2175}  // namespace SIMD_NAMESPACE
2176