/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <assert.h>
#include <string.h>  // memcmp(), memset(), strcmp()

#include <string>
#include <typeinfo>  // typeid(), used to dispatch on the C reference types

#include "config/aom_dsp_rtcd.h"

#include "test/acm_random.h"
// Inlining is not forced for the compiler because some tests call
// SIMD_INLINE functions via function pointers.
#undef SIMD_INLINE
#define SIMD_INLINE static inline
#include "aom_dsp/aom_simd.h"
#include "aom_dsp/simd/v256_intrinsics_c.h"

// Machine-tuned code goes into this file. This file is included from
// simd_cmp_sse2.cc, simd_cmp_ssse3.cc etc., which define the macros
// ARCH (=neon, sse2, ssse3, etc.), SIMD_NAMESPACE and ARCH_POSTFIX().
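
// For orientation, an architecture-specific including file is expected to
// look roughly like this (illustrative sketch, not a verbatim copy of the
// real simd_cmp_*.cc files):
//
//   #define ARCH SSE2
//   #define ARCH_POSTFIX(name) name##_sse2
//   #define SIMD_NAMESPACE simd_test_sse2
//   #include "simd_cmp_impl.h"  // i.e., this file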

#ifdef _MSC_VER
// Disable "value of intrinsic immediate argument 'value' is out of range
// 'lowerbound - upperbound'" warning. Visual Studio emits this warning though
// the parameters are conditionally checked in e.g., v256_shr_n_byte. Adding a
// mask doesn't always appear to be sufficient.
#pragma warning(disable : 4556)
#endif

using libaom_test::ACMRandom;

namespace SIMD_NAMESPACE {

// Wrap templates around intrinsics using immediate values
template <int shift>
v64 imm_v64_shl_n_byte(v64 a) {
  return v64_shl_n_byte(a, shift);
}
template <int shift>
v64 imm_v64_shr_n_byte(v64 a) {
  return v64_shr_n_byte(a, shift);
}
template <int shift>
v64 imm_v64_shl_n_8(v64 a) {
  return v64_shl_n_8(a, shift);
}
template <int shift>
v64 imm_v64_shr_n_u8(v64 a) {
  return v64_shr_n_u8(a, shift);
}
template <int shift>
v64 imm_v64_shr_n_s8(v64 a) {
  return v64_shr_n_s8(a, shift);
}
template <int shift>
v64 imm_v64_shl_n_16(v64 a) {
  return v64_shl_n_16(a, shift);
}
template <int shift>
v64 imm_v64_shr_n_u16(v64 a) {
  return v64_shr_n_u16(a, shift);
}
template <int shift>
v64 imm_v64_shr_n_s16(v64 a) {
  return v64_shr_n_s16(a, shift);
}
template <int shift>
v64 imm_v64_shl_n_32(v64 a) {
  return v64_shl_n_32(a, shift);
}
template <int shift>
v64 imm_v64_shr_n_u32(v64 a) {
  return v64_shr_n_u32(a, shift);
}
template <int shift>
v64 imm_v64_shr_n_s32(v64 a) {
  return v64_shr_n_s32(a, shift);
}
template <int shift>
v64 imm_v64_align(v64 a, v64 b) {
  return v64_align(a, b, shift);
}

// Wrap templates around corresponding C implementations of the above
template <int shift>
c_v64 c_imm_v64_shl_n_byte(c_v64 a) {
  return c_v64_shl_n_byte(a, shift);
}
template <int shift>
c_v64 c_imm_v64_shr_n_byte(c_v64 a) {
  return c_v64_shr_n_byte(a, shift);
}
template <int shift>
c_v64 c_imm_v64_shl_n_8(c_v64 a) {
  return c_v64_shl_n_8(a, shift);
}
template <int shift>
c_v64 c_imm_v64_shr_n_u8(c_v64 a) {
  return c_v64_shr_n_u8(a, shift);
}
template <int shift>
c_v64 c_imm_v64_shr_n_s8(c_v64 a) {
  return c_v64_shr_n_s8(a, shift);
}
template <int shift>
c_v64 c_imm_v64_shl_n_16(c_v64 a) {
  return c_v64_shl_n_16(a, shift);
}
template <int shift>
c_v64 c_imm_v64_shr_n_u16(c_v64 a) {
  return c_v64_shr_n_u16(a, shift);
}
template <int shift>
c_v64 c_imm_v64_shr_n_s16(c_v64 a) {
  return c_v64_shr_n_s16(a, shift);
}
template <int shift>
c_v64 c_imm_v64_shl_n_32(c_v64 a) {
  return c_v64_shl_n_32(a, shift);
}
template <int shift>
c_v64 c_imm_v64_shr_n_u32(c_v64 a) {
  return c_v64_shr_n_u32(a, shift);
}
template <int shift>
c_v64 c_imm_v64_shr_n_s32(c_v64 a) {
  return c_v64_shr_n_s32(a, shift);
}
template <int shift>
c_v64 c_imm_v64_align(c_v64 a, c_v64 b) {
  return c_v64_align(a, b, shift);
}

template <int shift>
v128 imm_v128_shl_n_byte(v128 a) {
  return v128_shl_n_byte(a, shift);
}
template <int shift>
v128 imm_v128_shr_n_byte(v128 a) {
  return v128_shr_n_byte(a, shift);
}
template <int shift>
v128 imm_v128_shl_n_8(v128 a) {
  return v128_shl_n_8(a, shift);
}
template <int shift>
v128 imm_v128_shr_n_u8(v128 a) {
  return v128_shr_n_u8(a, shift);
}
template <int shift>
v128 imm_v128_shr_n_s8(v128 a) {
  return v128_shr_n_s8(a, shift);
}
template <int shift>
v128 imm_v128_shl_n_16(v128 a) {
  return v128_shl_n_16(a, shift);
}
template <int shift>
v128 imm_v128_shr_n_u16(v128 a) {
  return v128_shr_n_u16(a, shift);
}
template <int shift>
v128 imm_v128_shr_n_s16(v128 a) {
  return v128_shr_n_s16(a, shift);
}
template <int shift>
v128 imm_v128_shl_n_32(v128 a) {
  return v128_shl_n_32(a, shift);
}
template <int shift>
v128 imm_v128_shr_n_u32(v128 a) {
  return v128_shr_n_u32(a, shift);
}
template <int shift>
v128 imm_v128_shr_n_s32(v128 a) {
  return v128_shr_n_s32(a, shift);
}
template <int shift>
v128 imm_v128_shl_n_64(v128 a) {
  return v128_shl_n_64(a, shift);
}
template <int shift>
v128 imm_v128_shr_n_u64(v128 a) {
  return v128_shr_n_u64(a, shift);
}
template <int shift>
v128 imm_v128_shr_n_s64(v128 a) {
  return v128_shr_n_s64(a, shift);
}
template <int shift>
v128 imm_v128_align(v128 a, v128 b) {
  return v128_align(a, b, shift);
}

template <int shift>
c_v128 c_imm_v128_shl_n_byte(c_v128 a) {
  return c_v128_shl_n_byte(a, shift);
}
template <int shift>
c_v128 c_imm_v128_shr_n_byte(c_v128 a) {
  return c_v128_shr_n_byte(a, shift);
}
template <int shift>
c_v128 c_imm_v128_shl_n_8(c_v128 a) {
  return c_v128_shl_n_8(a, shift);
}
template <int shift>
c_v128 c_imm_v128_shr_n_u8(c_v128 a) {
  return c_v128_shr_n_u8(a, shift);
}
template <int shift>
c_v128 c_imm_v128_shr_n_s8(c_v128 a) {
  return c_v128_shr_n_s8(a, shift);
}
template <int shift>
c_v128 c_imm_v128_shl_n_16(c_v128 a) {
  return c_v128_shl_n_16(a, shift);
}
template <int shift>
c_v128 c_imm_v128_shr_n_u16(c_v128 a) {
  return c_v128_shr_n_u16(a, shift);
}
template <int shift>
c_v128 c_imm_v128_shr_n_s16(c_v128 a) {
  return c_v128_shr_n_s16(a, shift);
}
template <int shift>
c_v128 c_imm_v128_shl_n_32(c_v128 a) {
  return c_v128_shl_n_32(a, shift);
}
template <int shift>
c_v128 c_imm_v128_shr_n_u32(c_v128 a) {
  return c_v128_shr_n_u32(a, shift);
}
template <int shift>
c_v128 c_imm_v128_shr_n_s32(c_v128 a) {
  return c_v128_shr_n_s32(a, shift);
}
template <int shift>
c_v128 c_imm_v128_shl_n_64(c_v128 a) {
  return c_v128_shl_n_64(a, shift);
}
template <int shift>
c_v128 c_imm_v128_shr_n_u64(c_v128 a) {
  return c_v128_shr_n_u64(a, shift);
}
template <int shift>
c_v128 c_imm_v128_shr_n_s64(c_v128 a) {
  return c_v128_shr_n_s64(a, shift);
}
template <int shift>
c_v128 c_imm_v128_align(c_v128 a, c_v128 b) {
  return c_v128_align(a, b, shift);
}

template <int shift>
v256 imm_v256_shl_n_word(v256 a) {
  return v256_shl_n_word(a, shift);
}
template <int shift>
v256 imm_v256_shr_n_word(v256 a) {
  return v256_shr_n_word(a, shift);
}
template <int shift>
v256 imm_v256_shl_n_byte(v256 a) {
  return v256_shl_n_byte(a, shift);
}
template <int shift>
v256 imm_v256_shr_n_byte(v256 a) {
  return v256_shr_n_byte(a, shift);
}
template <int shift>
v256 imm_v256_shl_n_8(v256 a) {
  return v256_shl_n_8(a, shift);
}
template <int shift>
v256 imm_v256_shr_n_u8(v256 a) {
  return v256_shr_n_u8(a, shift);
}
template <int shift>
v256 imm_v256_shr_n_s8(v256 a) {
  return v256_shr_n_s8(a, shift);
}
template <int shift>
v256 imm_v256_shl_n_16(v256 a) {
  return v256_shl_n_16(a, shift);
}
template <int shift>
v256 imm_v256_shr_n_u16(v256 a) {
  return v256_shr_n_u16(a, shift);
}
template <int shift>
v256 imm_v256_shr_n_s16(v256 a) {
  return v256_shr_n_s16(a, shift);
}
template <int shift>
v256 imm_v256_shl_n_32(v256 a) {
  return v256_shl_n_32(a, shift);
}
template <int shift>
v256 imm_v256_shr_n_u32(v256 a) {
  return v256_shr_n_u32(a, shift);
}
template <int shift>
v256 imm_v256_shr_n_s32(v256 a) {
  return v256_shr_n_s32(a, shift);
}
template <int shift>
v256 imm_v256_shl_n_64(v256 a) {
  return v256_shl_n_64(a, shift);
}
template <int shift>
v256 imm_v256_shr_n_u64(v256 a) {
  return v256_shr_n_u64(a, shift);
}
template <int shift>
v256 imm_v256_shr_n_s64(v256 a) {
  return v256_shr_n_s64(a, shift);
}
template <int shift>
v256 imm_v256_align(v256 a, v256 b) {
  return v256_align(a, b, shift);
}

template <int shift>
c_v256 c_imm_v256_shl_n_word(c_v256 a) {
  return c_v256_shl_n_word(a, shift);
}
template <int shift>
c_v256 c_imm_v256_shr_n_word(c_v256 a) {
  return c_v256_shr_n_word(a, shift);
}
template <int shift>
c_v256 c_imm_v256_shl_n_byte(c_v256 a) {
  return c_v256_shl_n_byte(a, shift);
}
template <int shift>
c_v256 c_imm_v256_shr_n_byte(c_v256 a) {
  return c_v256_shr_n_byte(a, shift);
}
template <int shift>
c_v256 c_imm_v256_shl_n_8(c_v256 a) {
  return c_v256_shl_n_8(a, shift);
}
template <int shift>
c_v256 c_imm_v256_shr_n_u8(c_v256 a) {
  return c_v256_shr_n_u8(a, shift);
}
template <int shift>
c_v256 c_imm_v256_shr_n_s8(c_v256 a) {
  return c_v256_shr_n_s8(a, shift);
}
template <int shift>
c_v256 c_imm_v256_shl_n_16(c_v256 a) {
  return c_v256_shl_n_16(a, shift);
}
template <int shift>
c_v256 c_imm_v256_shr_n_u16(c_v256 a) {
  return c_v256_shr_n_u16(a, shift);
}
template <int shift>
c_v256 c_imm_v256_shr_n_s16(c_v256 a) {
  return c_v256_shr_n_s16(a, shift);
}
template <int shift>
c_v256 c_imm_v256_shl_n_32(c_v256 a) {
  return c_v256_shl_n_32(a, shift);
}
template <int shift>
c_v256 c_imm_v256_shr_n_u32(c_v256 a) {
  return c_v256_shr_n_u32(a, shift);
}
template <int shift>
c_v256 c_imm_v256_shr_n_s32(c_v256 a) {
  return c_v256_shr_n_s32(a, shift);
}
template <int shift>
c_v256 c_imm_v256_shl_n_64(c_v256 a) {
  return c_v256_shl_n_64(a, shift);
}
template <int shift>
c_v256 c_imm_v256_shr_n_u64(c_v256 a) {
  return c_v256_shr_n_u64(a, shift);
}
template <int shift>
c_v256 c_imm_v256_shr_n_s64(c_v256 a) {
  return c_v256_shr_n_s64(a, shift);
}
template <int shift>
c_v256 c_imm_v256_align(c_v256 a, c_v256 b) {
  return c_v256_align(a, b, shift);
}

namespace {

// Wrappers around the SAD and SSD functions
uint32_t v64_sad_u8(v64 a, v64 b) {
  return v64_sad_u8_sum(::v64_sad_u8(v64_sad_u8_init(), a, b));
}
uint32_t v64_ssd_u8(v64 a, v64 b) {
  return v64_ssd_u8_sum(::v64_ssd_u8(v64_ssd_u8_init(), a, b));
}

uint32_t c_v64_sad_u8(c_v64 a, c_v64 b) {
  return c_v64_sad_u8_sum(::c_v64_sad_u8(c_v64_sad_u8_init(), a, b));
}
uint32_t c_v64_ssd_u8(c_v64 a, c_v64 b) {
  return c_v64_ssd_u8_sum(::c_v64_ssd_u8(c_v64_ssd_u8_init(), a, b));
}
uint32_t v128_sad_u8(v128 a, v128 b) {
  return v128_sad_u8_sum(::v128_sad_u8(v128_sad_u8_init(), a, b));
}
uint32_t v128_ssd_u8(v128 a, v128 b) {
  return v128_ssd_u8_sum(::v128_ssd_u8(v128_ssd_u8_init(), a, b));
}
uint32_t c_v128_sad_u8(c_v128 a, c_v128 b) {
  return c_v128_sad_u8_sum(::c_v128_sad_u8(c_v128_sad_u8_init(), a, b));
}
uint32_t c_v128_ssd_u8(c_v128 a, c_v128 b) {
  return c_v128_ssd_u8_sum(::c_v128_ssd_u8(c_v128_ssd_u8_init(), a, b));
}
uint32_t v128_sad_u16(v128 a, v128 b) {
  return v128_sad_u16_sum(::v128_sad_u16(v128_sad_u16_init(), a, b));
}
uint64_t v128_ssd_s16(v128 a, v128 b) {
  return v128_ssd_s16_sum(::v128_ssd_s16(v128_ssd_s16_init(), a, b));
}
uint32_t c_v128_sad_u16(c_v128 a, c_v128 b) {
  return c_v128_sad_u16_sum(::c_v128_sad_u16(c_v128_sad_u16_init(), a, b));
}
uint64_t c_v128_ssd_s16(c_v128 a, c_v128 b) {
  return c_v128_ssd_s16_sum(::c_v128_ssd_s16(c_v128_ssd_s16_init(), a, b));
}
uint32_t v256_sad_u8(v256 a, v256 b) {
  return v256_sad_u8_sum(::v256_sad_u8(v256_sad_u8_init(), a, b));
}
uint32_t v256_ssd_u8(v256 a, v256 b) {
  return v256_ssd_u8_sum(::v256_ssd_u8(v256_ssd_u8_init(), a, b));
}
uint32_t c_v256_sad_u8(c_v256 a, c_v256 b) {
  return c_v256_sad_u8_sum(::c_v256_sad_u8(c_v256_sad_u8_init(), a, b));
}
uint32_t c_v256_ssd_u8(c_v256 a, c_v256 b) {
  return c_v256_ssd_u8_sum(::c_v256_ssd_u8(c_v256_ssd_u8_init(), a, b));
}
uint32_t v256_sad_u16(v256 a, v256 b) {
  return v256_sad_u16_sum(::v256_sad_u16(v256_sad_u16_init(), a, b));
}
uint64_t v256_ssd_s16(v256 a, v256 b) {
  return v256_ssd_s16_sum(::v256_ssd_s16(v256_ssd_s16_init(), a, b));
}
uint32_t c_v256_sad_u16(c_v256 a, c_v256 b) {
  return c_v256_sad_u16_sum(::c_v256_sad_u16(c_v256_sad_u16_init(), a, b));
}
uint64_t c_v256_ssd_s16(c_v256 a, c_v256 b) {
  return c_v256_ssd_s16_sum(::c_v256_ssd_s16(c_v256_ssd_s16_init(), a, b));
}

typedef void (*fptr)();

typedef struct {
  const char *name;
  fptr ref;
  fptr simd;
} mapping;

#define MAP(name) \
  { #name, reinterpret_cast<fptr>(c_##name), reinterpret_cast<fptr>(name) }
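
// For example, MAP(v64_add_8) expands to the table entry
//   { "v64_add_8", reinterpret_cast<fptr>(c_v64_add_8),
//     reinterpret_cast<fptr>(v64_add_8) },
// pairing the C reference implementation with the machine-tuned intrinsic
// under a common, type-erased function-pointer type.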
const mapping m[] = { MAP(v64_sad_u8),
                      MAP(v64_ssd_u8),
                      MAP(v64_add_8),
                      MAP(v64_add_16),
                      MAP(v64_sadd_s8),
                      MAP(v64_sadd_u8),
                      MAP(v64_sadd_s16),
                      MAP(v64_add_32),
                      MAP(v64_sub_8),
                      MAP(v64_ssub_u8),
                      MAP(v64_ssub_s8),
                      MAP(v64_sub_16),
                      MAP(v64_ssub_s16),
                      MAP(v64_ssub_u16),
                      MAP(v64_sub_32),
                      MAP(v64_ziplo_8),
                      MAP(v64_ziphi_8),
                      MAP(v64_ziplo_16),
                      MAP(v64_ziphi_16),
                      MAP(v64_ziplo_32),
                      MAP(v64_ziphi_32),
                      MAP(v64_pack_s32_u16),
                      MAP(v64_pack_s32_s16),
                      MAP(v64_pack_s16_u8),
                      MAP(v64_pack_s16_s8),
                      MAP(v64_unziphi_8),
                      MAP(v64_unziplo_8),
                      MAP(v64_unziphi_16),
                      MAP(v64_unziplo_16),
                      MAP(v64_or),
                      MAP(v64_xor),
                      MAP(v64_and),
                      MAP(v64_andn),
                      MAP(v64_mullo_s16),
                      MAP(v64_mulhi_s16),
                      MAP(v64_mullo_s32),
                      MAP(v64_madd_s16),
                      MAP(v64_madd_us8),
                      MAP(v64_avg_u8),
                      MAP(v64_rdavg_u8),
                      MAP(v64_rdavg_u16),
                      MAP(v64_avg_u16),
                      MAP(v64_min_u8),
                      MAP(v64_max_u8),
                      MAP(v64_min_s8),
                      MAP(v64_max_s8),
                      MAP(v64_min_s16),
                      MAP(v64_max_s16),
                      MAP(v64_cmpgt_s8),
                      MAP(v64_cmplt_s8),
                      MAP(v64_cmpeq_8),
                      MAP(v64_cmpgt_s16),
                      MAP(v64_cmplt_s16),
                      MAP(v64_cmpeq_16),
                      MAP(v64_shuffle_8),
                      MAP(imm_v64_align<1>),
                      MAP(imm_v64_align<2>),
                      MAP(imm_v64_align<3>),
                      MAP(imm_v64_align<4>),
                      MAP(imm_v64_align<5>),
                      MAP(imm_v64_align<6>),
                      MAP(imm_v64_align<7>),
                      MAP(v64_abs_s8),
                      MAP(v64_abs_s16),
                      MAP(v64_unpacklo_u8_s16),
                      MAP(v64_unpackhi_u8_s16),
                      MAP(v64_unpacklo_s8_s16),
                      MAP(v64_unpackhi_s8_s16),
                      MAP(v64_unpacklo_u16_s32),
                      MAP(v64_unpacklo_s16_s32),
                      MAP(v64_unpackhi_u16_s32),
                      MAP(v64_unpackhi_s16_s32),
                      MAP(imm_v64_shr_n_byte<1>),
                      MAP(imm_v64_shr_n_byte<2>),
                      MAP(imm_v64_shr_n_byte<3>),
                      MAP(imm_v64_shr_n_byte<4>),
                      MAP(imm_v64_shr_n_byte<5>),
                      MAP(imm_v64_shr_n_byte<6>),
                      MAP(imm_v64_shr_n_byte<7>),
                      MAP(imm_v64_shl_n_byte<1>),
                      MAP(imm_v64_shl_n_byte<2>),
                      MAP(imm_v64_shl_n_byte<3>),
                      MAP(imm_v64_shl_n_byte<4>),
                      MAP(imm_v64_shl_n_byte<5>),
                      MAP(imm_v64_shl_n_byte<6>),
                      MAP(imm_v64_shl_n_byte<7>),
                      MAP(imm_v64_shl_n_8<1>),
                      MAP(imm_v64_shl_n_8<2>),
                      MAP(imm_v64_shl_n_8<3>),
                      MAP(imm_v64_shl_n_8<4>),
                      MAP(imm_v64_shl_n_8<5>),
                      MAP(imm_v64_shl_n_8<6>),
                      MAP(imm_v64_shl_n_8<7>),
                      MAP(imm_v64_shr_n_u8<1>),
                      MAP(imm_v64_shr_n_u8<2>),
                      MAP(imm_v64_shr_n_u8<3>),
                      MAP(imm_v64_shr_n_u8<4>),
                      MAP(imm_v64_shr_n_u8<5>),
                      MAP(imm_v64_shr_n_u8<6>),
                      MAP(imm_v64_shr_n_u8<7>),
                      MAP(imm_v64_shr_n_s8<1>),
                      MAP(imm_v64_shr_n_s8<2>),
                      MAP(imm_v64_shr_n_s8<3>),
                      MAP(imm_v64_shr_n_s8<4>),
                      MAP(imm_v64_shr_n_s8<5>),
                      MAP(imm_v64_shr_n_s8<6>),
                      MAP(imm_v64_shr_n_s8<7>),
                      MAP(imm_v64_shl_n_16<1>),
                      MAP(imm_v64_shl_n_16<2>),
                      MAP(imm_v64_shl_n_16<4>),
                      MAP(imm_v64_shl_n_16<6>),
                      MAP(imm_v64_shl_n_16<8>),
                      MAP(imm_v64_shl_n_16<10>),
                      MAP(imm_v64_shl_n_16<12>),
                      MAP(imm_v64_shl_n_16<14>),
                      MAP(imm_v64_shr_n_u16<1>),
                      MAP(imm_v64_shr_n_u16<2>),
                      MAP(imm_v64_shr_n_u16<4>),
                      MAP(imm_v64_shr_n_u16<6>),
                      MAP(imm_v64_shr_n_u16<8>),
                      MAP(imm_v64_shr_n_u16<10>),
                      MAP(imm_v64_shr_n_u16<12>),
                      MAP(imm_v64_shr_n_u16<14>),
                      MAP(imm_v64_shr_n_s16<1>),
                      MAP(imm_v64_shr_n_s16<2>),
                      MAP(imm_v64_shr_n_s16<4>),
                      MAP(imm_v64_shr_n_s16<6>),
                      MAP(imm_v64_shr_n_s16<8>),
                      MAP(imm_v64_shr_n_s16<10>),
                      MAP(imm_v64_shr_n_s16<12>),
                      MAP(imm_v64_shr_n_s16<14>),
                      MAP(imm_v64_shl_n_32<1>),
                      MAP(imm_v64_shl_n_32<4>),
                      MAP(imm_v64_shl_n_32<8>),
                      MAP(imm_v64_shl_n_32<12>),
                      MAP(imm_v64_shl_n_32<16>),
                      MAP(imm_v64_shl_n_32<20>),
                      MAP(imm_v64_shl_n_32<24>),
                      MAP(imm_v64_shl_n_32<28>),
                      MAP(imm_v64_shr_n_u32<1>),
                      MAP(imm_v64_shr_n_u32<4>),
                      MAP(imm_v64_shr_n_u32<8>),
                      MAP(imm_v64_shr_n_u32<12>),
                      MAP(imm_v64_shr_n_u32<16>),
                      MAP(imm_v64_shr_n_u32<20>),
                      MAP(imm_v64_shr_n_u32<24>),
                      MAP(imm_v64_shr_n_u32<28>),
                      MAP(imm_v64_shr_n_s32<1>),
                      MAP(imm_v64_shr_n_s32<4>),
                      MAP(imm_v64_shr_n_s32<8>),
                      MAP(imm_v64_shr_n_s32<12>),
                      MAP(imm_v64_shr_n_s32<16>),
                      MAP(imm_v64_shr_n_s32<20>),
                      MAP(imm_v64_shr_n_s32<24>),
                      MAP(imm_v64_shr_n_s32<28>),
                      MAP(v64_shl_8),
                      MAP(v64_shr_u8),
                      MAP(v64_shr_s8),
                      MAP(v64_shl_16),
                      MAP(v64_shr_u16),
                      MAP(v64_shr_s16),
                      MAP(v64_shl_32),
                      MAP(v64_shr_u32),
                      MAP(v64_shr_s32),
                      MAP(v64_hadd_u8),
                      MAP(v64_hadd_s16),
                      MAP(v64_dotp_s16),
                      MAP(v64_dotp_su8),
                      MAP(v64_u64),
                      MAP(v64_low_u32),
                      MAP(v64_high_u32),
                      MAP(v64_low_s32),
                      MAP(v64_high_s32),
                      MAP(v64_dup_8),
                      MAP(v64_dup_16),
                      MAP(v64_dup_32),
                      MAP(v64_from_32),
                      MAP(v64_zero),
                      MAP(v64_from_16),
                      MAP(v128_sad_u8),
                      MAP(v128_ssd_u8),
                      MAP(v128_sad_u16),
                      MAP(v128_ssd_s16),
                      MAP(v128_add_8),
                      MAP(v128_add_16),
                      MAP(v128_sadd_s8),
                      MAP(v128_sadd_u8),
                      MAP(v128_sadd_s16),
                      MAP(v128_add_32),
                      MAP(v128_add_64),
                      MAP(v128_sub_8),
                      MAP(v128_ssub_u8),
                      MAP(v128_ssub_s8),
                      MAP(v128_sub_16),
                      MAP(v128_ssub_s16),
                      MAP(v128_ssub_u16),
                      MAP(v128_sub_32),
                      MAP(v128_sub_64),
                      MAP(v128_ziplo_8),
                      MAP(v128_ziphi_8),
                      MAP(v128_ziplo_16),
                      MAP(v128_ziphi_16),
                      MAP(v128_ziplo_32),
                      MAP(v128_ziphi_32),
                      MAP(v128_ziplo_64),
                      MAP(v128_ziphi_64),
                      MAP(v128_unziphi_8),
                      MAP(v128_unziplo_8),
                      MAP(v128_unziphi_16),
                      MAP(v128_unziplo_16),
                      MAP(v128_unziphi_32),
                      MAP(v128_unziplo_32),
                      MAP(v128_pack_s32_u16),
                      MAP(v128_pack_s32_s16),
                      MAP(v128_pack_s16_u8),
                      MAP(v128_pack_s16_s8),
                      MAP(v128_or),
                      MAP(v128_xor),
                      MAP(v128_and),
                      MAP(v128_andn),
                      MAP(v128_mullo_s16),
                      MAP(v128_mulhi_s16),
                      MAP(v128_mullo_s32),
                      MAP(v128_madd_s16),
                      MAP(v128_madd_us8),
                      MAP(v128_avg_u8),
                      MAP(v128_rdavg_u8),
                      MAP(v128_rdavg_u16),
                      MAP(v128_avg_u16),
                      MAP(v128_min_u8),
                      MAP(v128_max_u8),
                      MAP(v128_min_s8),
                      MAP(v128_max_s8),
                      MAP(v128_min_s16),
                      MAP(v128_max_s16),
                      MAP(v128_min_s32),
                      MAP(v128_max_s32),
                      MAP(v128_cmpgt_s8),
                      MAP(v128_cmplt_s8),
                      MAP(v128_cmpeq_8),
                      MAP(v128_cmpgt_s16),
                      MAP(v128_cmpeq_16),
                      MAP(v128_cmplt_s16),
                      MAP(v128_cmpgt_s32),
                      MAP(v128_cmpeq_32),
                      MAP(v128_cmplt_s32),
                      MAP(v128_shuffle_8),
                      MAP(imm_v128_align<1>),
                      MAP(imm_v128_align<2>),
                      MAP(imm_v128_align<3>),
                      MAP(imm_v128_align<4>),
                      MAP(imm_v128_align<5>),
                      MAP(imm_v128_align<6>),
                      MAP(imm_v128_align<7>),
                      MAP(imm_v128_align<8>),
                      MAP(imm_v128_align<9>),
                      MAP(imm_v128_align<10>),
                      MAP(imm_v128_align<11>),
                      MAP(imm_v128_align<12>),
                      MAP(imm_v128_align<13>),
                      MAP(imm_v128_align<14>),
                      MAP(imm_v128_align<15>),
                      MAP(v128_abs_s8),
                      MAP(v128_abs_s16),
                      MAP(v128_padd_u8),
                      MAP(v128_padd_s16),
                      MAP(v128_unpacklo_u16_s32),
                      MAP(v128_unpacklo_s16_s32),
                      MAP(v128_unpackhi_u16_s32),
                      MAP(v128_unpackhi_s16_s32),
                      MAP(imm_v128_shr_n_byte<1>),
                      MAP(imm_v128_shr_n_byte<2>),
                      MAP(imm_v128_shr_n_byte<3>),
                      MAP(imm_v128_shr_n_byte<4>),
                      MAP(imm_v128_shr_n_byte<5>),
                      MAP(imm_v128_shr_n_byte<6>),
                      MAP(imm_v128_shr_n_byte<7>),
                      MAP(imm_v128_shr_n_byte<8>),
                      MAP(imm_v128_shr_n_byte<9>),
                      MAP(imm_v128_shr_n_byte<10>),
                      MAP(imm_v128_shr_n_byte<11>),
                      MAP(imm_v128_shr_n_byte<12>),
                      MAP(imm_v128_shr_n_byte<13>),
                      MAP(imm_v128_shr_n_byte<14>),
                      MAP(imm_v128_shr_n_byte<15>),
                      MAP(imm_v128_shl_n_byte<1>),
                      MAP(imm_v128_shl_n_byte<2>),
                      MAP(imm_v128_shl_n_byte<3>),
                      MAP(imm_v128_shl_n_byte<4>),
                      MAP(imm_v128_shl_n_byte<5>),
                      MAP(imm_v128_shl_n_byte<6>),
                      MAP(imm_v128_shl_n_byte<7>),
                      MAP(imm_v128_shl_n_byte<8>),
                      MAP(imm_v128_shl_n_byte<9>),
                      MAP(imm_v128_shl_n_byte<10>),
                      MAP(imm_v128_shl_n_byte<11>),
                      MAP(imm_v128_shl_n_byte<12>),
                      MAP(imm_v128_shl_n_byte<13>),
                      MAP(imm_v128_shl_n_byte<14>),
                      MAP(imm_v128_shl_n_byte<15>),
                      MAP(imm_v128_shl_n_8<1>),
                      MAP(imm_v128_shl_n_8<2>),
                      MAP(imm_v128_shl_n_8<3>),
                      MAP(imm_v128_shl_n_8<4>),
                      MAP(imm_v128_shl_n_8<5>),
                      MAP(imm_v128_shl_n_8<6>),
                      MAP(imm_v128_shl_n_8<7>),
                      MAP(imm_v128_shr_n_u8<1>),
                      MAP(imm_v128_shr_n_u8<2>),
                      MAP(imm_v128_shr_n_u8<3>),
                      MAP(imm_v128_shr_n_u8<4>),
                      MAP(imm_v128_shr_n_u8<5>),
                      MAP(imm_v128_shr_n_u8<6>),
                      MAP(imm_v128_shr_n_u8<7>),
                      MAP(imm_v128_shr_n_s8<1>),
                      MAP(imm_v128_shr_n_s8<2>),
                      MAP(imm_v128_shr_n_s8<3>),
                      MAP(imm_v128_shr_n_s8<4>),
                      MAP(imm_v128_shr_n_s8<5>),
                      MAP(imm_v128_shr_n_s8<6>),
                      MAP(imm_v128_shr_n_s8<7>),
                      MAP(imm_v128_shl_n_16<1>),
                      MAP(imm_v128_shl_n_16<2>),
                      MAP(imm_v128_shl_n_16<4>),
                      MAP(imm_v128_shl_n_16<6>),
                      MAP(imm_v128_shl_n_16<8>),
                      MAP(imm_v128_shl_n_16<10>),
                      MAP(imm_v128_shl_n_16<12>),
                      MAP(imm_v128_shl_n_16<14>),
                      MAP(imm_v128_shr_n_u16<1>),
                      MAP(imm_v128_shr_n_u16<2>),
                      MAP(imm_v128_shr_n_u16<4>),
                      MAP(imm_v128_shr_n_u16<6>),
                      MAP(imm_v128_shr_n_u16<8>),
                      MAP(imm_v128_shr_n_u16<10>),
                      MAP(imm_v128_shr_n_u16<12>),
                      MAP(imm_v128_shr_n_u16<14>),
                      MAP(imm_v128_shr_n_s16<1>),
                      MAP(imm_v128_shr_n_s16<2>),
                      MAP(imm_v128_shr_n_s16<4>),
                      MAP(imm_v128_shr_n_s16<6>),
                      MAP(imm_v128_shr_n_s16<8>),
                      MAP(imm_v128_shr_n_s16<10>),
                      MAP(imm_v128_shr_n_s16<12>),
                      MAP(imm_v128_shr_n_s16<14>),
                      MAP(imm_v128_shl_n_32<1>),
                      MAP(imm_v128_shl_n_32<4>),
                      MAP(imm_v128_shl_n_32<8>),
                      MAP(imm_v128_shl_n_32<12>),
                      MAP(imm_v128_shl_n_32<16>),
                      MAP(imm_v128_shl_n_32<20>),
                      MAP(imm_v128_shl_n_32<24>),
                      MAP(imm_v128_shl_n_32<28>),
                      MAP(imm_v128_shr_n_u32<1>),
                      MAP(imm_v128_shr_n_u32<4>),
                      MAP(imm_v128_shr_n_u32<8>),
                      MAP(imm_v128_shr_n_u32<12>),
                      MAP(imm_v128_shr_n_u32<16>),
                      MAP(imm_v128_shr_n_u32<20>),
                      MAP(imm_v128_shr_n_u32<24>),
                      MAP(imm_v128_shr_n_u32<28>),
                      MAP(imm_v128_shr_n_s32<1>),
                      MAP(imm_v128_shr_n_s32<4>),
                      MAP(imm_v128_shr_n_s32<8>),
                      MAP(imm_v128_shr_n_s32<12>),
                      MAP(imm_v128_shr_n_s32<16>),
                      MAP(imm_v128_shr_n_s32<20>),
                      MAP(imm_v128_shr_n_s32<24>),
                      MAP(imm_v128_shr_n_s32<28>),
                      MAP(imm_v128_shl_n_64<1>),
                      MAP(imm_v128_shl_n_64<4>),
                      MAP(imm_v128_shl_n_64<8>),
                      MAP(imm_v128_shl_n_64<12>),
                      MAP(imm_v128_shl_n_64<16>),
                      MAP(imm_v128_shl_n_64<20>),
                      MAP(imm_v128_shl_n_64<24>),
                      MAP(imm_v128_shl_n_64<28>),
                      MAP(imm_v128_shl_n_64<32>),
                      MAP(imm_v128_shl_n_64<36>),
                      MAP(imm_v128_shl_n_64<40>),
                      MAP(imm_v128_shl_n_64<44>),
                      MAP(imm_v128_shl_n_64<48>),
                      MAP(imm_v128_shl_n_64<52>),
                      MAP(imm_v128_shl_n_64<56>),
                      MAP(imm_v128_shl_n_64<60>),
                      MAP(imm_v128_shr_n_u64<1>),
                      MAP(imm_v128_shr_n_u64<4>),
                      MAP(imm_v128_shr_n_u64<8>),
                      MAP(imm_v128_shr_n_u64<12>),
                      MAP(imm_v128_shr_n_u64<16>),
                      MAP(imm_v128_shr_n_u64<20>),
                      MAP(imm_v128_shr_n_u64<24>),
                      MAP(imm_v128_shr_n_u64<28>),
                      MAP(imm_v128_shr_n_u64<32>),
                      MAP(imm_v128_shr_n_u64<36>),
                      MAP(imm_v128_shr_n_u64<40>),
                      MAP(imm_v128_shr_n_u64<44>),
                      MAP(imm_v128_shr_n_u64<48>),
                      MAP(imm_v128_shr_n_u64<52>),
                      MAP(imm_v128_shr_n_u64<56>),
                      MAP(imm_v128_shr_n_u64<60>),
                      MAP(imm_v128_shr_n_s64<1>),
                      MAP(imm_v128_shr_n_s64<4>),
                      MAP(imm_v128_shr_n_s64<8>),
                      MAP(imm_v128_shr_n_s64<12>),
                      MAP(imm_v128_shr_n_s64<16>),
                      MAP(imm_v128_shr_n_s64<20>),
                      MAP(imm_v128_shr_n_s64<24>),
                      MAP(imm_v128_shr_n_s64<28>),
                      MAP(imm_v128_shr_n_s64<32>),
                      MAP(imm_v128_shr_n_s64<36>),
                      MAP(imm_v128_shr_n_s64<40>),
                      MAP(imm_v128_shr_n_s64<44>),
                      MAP(imm_v128_shr_n_s64<48>),
                      MAP(imm_v128_shr_n_s64<52>),
                      MAP(imm_v128_shr_n_s64<56>),
                      MAP(imm_v128_shr_n_s64<60>),
                      MAP(v128_from_v64),
                      MAP(v128_zip_8),
                      MAP(v128_zip_16),
                      MAP(v128_zip_32),
                      MAP(v128_mul_s16),
                      MAP(v128_unpack_u8_s16),
                      MAP(v128_unpack_s8_s16),
                      MAP(v128_unpack_u16_s32),
                      MAP(v128_unpack_s16_s32),
                      MAP(v128_shl_8),
                      MAP(v128_shr_u8),
                      MAP(v128_shr_s8),
                      MAP(v128_shl_16),
                      MAP(v128_shr_u16),
                      MAP(v128_shr_s16),
                      MAP(v128_shl_32),
                      MAP(v128_shr_u32),
                      MAP(v128_shr_s32),
                      MAP(v128_shl_64),
                      MAP(v128_shr_u64),
                      MAP(v128_shr_s64),
                      MAP(v128_hadd_u8),
                      MAP(v128_dotp_su8),
                      MAP(v128_dotp_s16),
                      MAP(v128_dotp_s32),
                      MAP(v128_low_u32),
                      MAP(v128_low_v64),
                      MAP(v128_high_v64),
                      MAP(v128_from_64),
                      MAP(v128_from_32),
                      MAP(v128_movemask_8),
                      MAP(v128_zero),
                      MAP(v128_dup_8),
                      MAP(v128_dup_16),
                      MAP(v128_dup_32),
                      MAP(v128_dup_64),
                      MAP(v128_unpacklo_u8_s16),
                      MAP(v128_unpackhi_u8_s16),
                      MAP(v128_unpacklo_s8_s16),
                      MAP(v128_unpackhi_s8_s16),
                      MAP(v128_blend_8),
                      MAP(u32_load_unaligned),
                      MAP(u32_store_unaligned),
                      MAP(v64_load_unaligned),
                      MAP(v64_store_unaligned),
                      MAP(v128_load_unaligned),
                      MAP(v128_store_unaligned),
                      MAP(v256_sad_u8),
                      MAP(v256_ssd_u8),
                      MAP(v256_sad_u16),
                      MAP(v256_ssd_s16),
                      MAP(v256_hadd_u8),
                      MAP(v256_low_u64),
                      MAP(v256_dotp_su8),
                      MAP(v256_dotp_s16),
                      MAP(v256_dotp_s32),
                      MAP(v256_add_8),
                      MAP(v256_add_16),
                      MAP(v256_sadd_s8),
                      MAP(v256_sadd_u8),
                      MAP(v256_sadd_s16),
                      MAP(v256_add_32),
                      MAP(v256_add_64),
                      MAP(v256_sub_8),
                      MAP(v256_ssub_u8),
                      MAP(v256_ssub_s8),
                      MAP(v256_sub_16),
                      MAP(v256_ssub_u16),
                      MAP(v256_ssub_s16),
                      MAP(v256_sub_32),
                      MAP(v256_sub_64),
                      MAP(v256_ziplo_8),
                      MAP(v256_ziphi_8),
                      MAP(v256_ziplo_16),
                      MAP(v256_ziphi_16),
                      MAP(v256_ziplo_32),
                      MAP(v256_ziphi_32),
                      MAP(v256_ziplo_64),
                      MAP(v256_ziphi_64),
                      MAP(v256_unziphi_8),
                      MAP(v256_unziplo_8),
                      MAP(v256_unziphi_16),
                      MAP(v256_unziplo_16),
                      MAP(v256_unziphi_32),
                      MAP(v256_unziplo_32),
                      MAP(v256_unziphi_64),
                      MAP(v256_unziplo_64),
                      MAP(v256_pack_s32_u16),
                      MAP(v256_pack_s32_s16),
                      MAP(v256_pack_s16_u8),
                      MAP(v256_pack_s16_s8),
                      MAP(v256_or),
                      MAP(v256_xor),
                      MAP(v256_and),
                      MAP(v256_andn),
                      MAP(v256_mullo_s16),
                      MAP(v256_mulhi_s16),
                      MAP(v256_mullo_s32),
                      MAP(v256_madd_s16),
                      MAP(v256_madd_us8),
                      MAP(v256_avg_u8),
                      MAP(v256_rdavg_u8),
                      MAP(v256_rdavg_u16),
                      MAP(v256_avg_u16),
                      MAP(v256_min_u8),
                      MAP(v256_max_u8),
                      MAP(v256_min_s8),
                      MAP(v256_max_s8),
                      MAP(v256_min_s16),
                      MAP(v256_max_s16),
                      MAP(v256_min_s32),
                      MAP(v256_max_s32),
                      MAP(v256_cmpgt_s8),
                      MAP(v256_cmplt_s8),
                      MAP(v256_cmpeq_8),
                      MAP(v256_cmpgt_s16),
                      MAP(v256_cmplt_s16),
                      MAP(v256_cmpeq_16),
                      MAP(v256_cmpgt_s32),
                      MAP(v256_cmplt_s32),
                      MAP(v256_cmpeq_32),
                      MAP(v256_shuffle_8),
                      MAP(v256_pshuffle_8),
                      MAP(v256_wideshuffle_8),
                      MAP(imm_v256_align<1>),
                      MAP(imm_v256_align<2>),
                      MAP(imm_v256_align<3>),
                      MAP(imm_v256_align<4>),
                      MAP(imm_v256_align<5>),
                      MAP(imm_v256_align<6>),
                      MAP(imm_v256_align<7>),
                      MAP(imm_v256_align<8>),
                      MAP(imm_v256_align<9>),
                      MAP(imm_v256_align<10>),
                      MAP(imm_v256_align<11>),
                      MAP(imm_v256_align<12>),
                      MAP(imm_v256_align<13>),
                      MAP(imm_v256_align<14>),
                      MAP(imm_v256_align<15>),
                      MAP(imm_v256_align<16>),
                      MAP(imm_v256_align<17>),
                      MAP(imm_v256_align<18>),
                      MAP(imm_v256_align<19>),
                      MAP(imm_v256_align<20>),
                      MAP(imm_v256_align<21>),
                      MAP(imm_v256_align<22>),
                      MAP(imm_v256_align<23>),
                      MAP(imm_v256_align<24>),
                      MAP(imm_v256_align<25>),
                      MAP(imm_v256_align<26>),
                      MAP(imm_v256_align<27>),
                      MAP(imm_v256_align<28>),
                      MAP(imm_v256_align<29>),
                      MAP(imm_v256_align<30>),
                      MAP(imm_v256_align<31>),
                      MAP(v256_from_v128),
                      MAP(v256_zip_8),
                      MAP(v256_zip_16),
                      MAP(v256_zip_32),
                      MAP(v256_mul_s16),
                      MAP(v256_unpack_u8_s16),
                      MAP(v256_unpack_s8_s16),
                      MAP(v256_unpack_u16_s32),
                      MAP(v256_unpack_s16_s32),
                      MAP(v256_shl_8),
                      MAP(v256_shr_u8),
                      MAP(v256_shr_s8),
                      MAP(v256_shl_16),
                      MAP(v256_shr_u16),
                      MAP(v256_shr_s16),
                      MAP(v256_shl_32),
                      MAP(v256_shr_u32),
                      MAP(v256_shr_s32),
                      MAP(v256_shl_64),
                      MAP(v256_shr_u64),
                      MAP(v256_shr_s64),
                      MAP(v256_abs_s8),
                      MAP(v256_abs_s16),
                      MAP(v256_padd_u8),
                      MAP(v256_padd_s16),
                      MAP(v256_unpacklo_u16_s32),
                      MAP(v256_unpacklo_s16_s32),
                      MAP(v256_unpackhi_u16_s32),
                      MAP(v256_unpackhi_s16_s32),
                      MAP(imm_v256_shr_n_word<1>),
                      MAP(imm_v256_shr_n_word<2>),
                      MAP(imm_v256_shr_n_word<3>),
                      MAP(imm_v256_shr_n_word<4>),
                      MAP(imm_v256_shr_n_word<5>),
                      MAP(imm_v256_shr_n_word<6>),
                      MAP(imm_v256_shr_n_word<7>),
                      MAP(imm_v256_shr_n_word<8>),
                      MAP(imm_v256_shr_n_word<9>),
                      MAP(imm_v256_shr_n_word<10>),
                      MAP(imm_v256_shr_n_word<11>),
                      MAP(imm_v256_shr_n_word<12>),
                      MAP(imm_v256_shr_n_word<13>),
                      MAP(imm_v256_shr_n_word<14>),
                      MAP(imm_v256_shr_n_word<15>),
                      MAP(imm_v256_shl_n_word<1>),
                      MAP(imm_v256_shl_n_word<2>),
                      MAP(imm_v256_shl_n_word<3>),
                      MAP(imm_v256_shl_n_word<4>),
                      MAP(imm_v256_shl_n_word<5>),
                      MAP(imm_v256_shl_n_word<6>),
                      MAP(imm_v256_shl_n_word<7>),
                      MAP(imm_v256_shl_n_word<8>),
                      MAP(imm_v256_shl_n_word<9>),
                      MAP(imm_v256_shl_n_word<10>),
                      MAP(imm_v256_shl_n_word<11>),
                      MAP(imm_v256_shl_n_word<12>),
                      MAP(imm_v256_shl_n_word<13>),
                      MAP(imm_v256_shl_n_word<14>),
                      MAP(imm_v256_shl_n_word<15>),
                      MAP(imm_v256_shr_n_byte<1>),
                      MAP(imm_v256_shr_n_byte<2>),
                      MAP(imm_v256_shr_n_byte<3>),
                      MAP(imm_v256_shr_n_byte<4>),
                      MAP(imm_v256_shr_n_byte<5>),
                      MAP(imm_v256_shr_n_byte<6>),
                      MAP(imm_v256_shr_n_byte<7>),
                      MAP(imm_v256_shr_n_byte<8>),
                      MAP(imm_v256_shr_n_byte<9>),
                      MAP(imm_v256_shr_n_byte<10>),
                      MAP(imm_v256_shr_n_byte<11>),
                      MAP(imm_v256_shr_n_byte<12>),
                      MAP(imm_v256_shr_n_byte<13>),
                      MAP(imm_v256_shr_n_byte<14>),
                      MAP(imm_v256_shr_n_byte<15>),
                      MAP(imm_v256_shr_n_byte<16>),
                      MAP(imm_v256_shr_n_byte<17>),
                      MAP(imm_v256_shr_n_byte<18>),
                      MAP(imm_v256_shr_n_byte<19>),
                      MAP(imm_v256_shr_n_byte<20>),
                      MAP(imm_v256_shr_n_byte<21>),
                      MAP(imm_v256_shr_n_byte<22>),
                      MAP(imm_v256_shr_n_byte<23>),
                      MAP(imm_v256_shr_n_byte<24>),
                      MAP(imm_v256_shr_n_byte<25>),
                      MAP(imm_v256_shr_n_byte<26>),
                      MAP(imm_v256_shr_n_byte<27>),
                      MAP(imm_v256_shr_n_byte<28>),
                      MAP(imm_v256_shr_n_byte<29>),
                      MAP(imm_v256_shr_n_byte<30>),
                      MAP(imm_v256_shr_n_byte<31>),
                      MAP(imm_v256_shl_n_byte<1>),
                      MAP(imm_v256_shl_n_byte<2>),
                      MAP(imm_v256_shl_n_byte<3>),
                      MAP(imm_v256_shl_n_byte<4>),
                      MAP(imm_v256_shl_n_byte<5>),
                      MAP(imm_v256_shl_n_byte<6>),
                      MAP(imm_v256_shl_n_byte<7>),
                      MAP(imm_v256_shl_n_byte<8>),
                      MAP(imm_v256_shl_n_byte<9>),
                      MAP(imm_v256_shl_n_byte<10>),
                      MAP(imm_v256_shl_n_byte<11>),
                      MAP(imm_v256_shl_n_byte<12>),
                      MAP(imm_v256_shl_n_byte<13>),
                      MAP(imm_v256_shl_n_byte<14>),
                      MAP(imm_v256_shl_n_byte<15>),
                      MAP(imm_v256_shl_n_byte<16>),
                      MAP(imm_v256_shl_n_byte<17>),
                      MAP(imm_v256_shl_n_byte<18>),
                      MAP(imm_v256_shl_n_byte<19>),
                      MAP(imm_v256_shl_n_byte<20>),
                      MAP(imm_v256_shl_n_byte<21>),
                      MAP(imm_v256_shl_n_byte<22>),
                      MAP(imm_v256_shl_n_byte<23>),
                      MAP(imm_v256_shl_n_byte<24>),
                      MAP(imm_v256_shl_n_byte<25>),
                      MAP(imm_v256_shl_n_byte<26>),
                      MAP(imm_v256_shl_n_byte<27>),
                      MAP(imm_v256_shl_n_byte<28>),
                      MAP(imm_v256_shl_n_byte<29>),
                      MAP(imm_v256_shl_n_byte<30>),
                      MAP(imm_v256_shl_n_byte<31>),
                      MAP(imm_v256_shl_n_8<1>),
                      MAP(imm_v256_shl_n_8<2>),
                      MAP(imm_v256_shl_n_8<3>),
                      MAP(imm_v256_shl_n_8<4>),
                      MAP(imm_v256_shl_n_8<5>),
                      MAP(imm_v256_shl_n_8<6>),
                      MAP(imm_v256_shl_n_8<7>),
                      MAP(imm_v256_shr_n_u8<1>),
                      MAP(imm_v256_shr_n_u8<2>),
                      MAP(imm_v256_shr_n_u8<3>),
                      MAP(imm_v256_shr_n_u8<4>),
                      MAP(imm_v256_shr_n_u8<5>),
                      MAP(imm_v256_shr_n_u8<6>),
                      MAP(imm_v256_shr_n_u8<7>),
                      MAP(imm_v256_shr_n_s8<1>),
                      MAP(imm_v256_shr_n_s8<2>),
                      MAP(imm_v256_shr_n_s8<3>),
                      MAP(imm_v256_shr_n_s8<4>),
                      MAP(imm_v256_shr_n_s8<5>),
                      MAP(imm_v256_shr_n_s8<6>),
                      MAP(imm_v256_shr_n_s8<7>),
                      MAP(imm_v256_shl_n_16<1>),
                      MAP(imm_v256_shl_n_16<2>),
                      MAP(imm_v256_shl_n_16<4>),
                      MAP(imm_v256_shl_n_16<6>),
                      MAP(imm_v256_shl_n_16<8>),
                      MAP(imm_v256_shl_n_16<10>),
                      MAP(imm_v256_shl_n_16<12>),
                      MAP(imm_v256_shl_n_16<14>),
                      MAP(imm_v256_shr_n_u16<1>),
                      MAP(imm_v256_shr_n_u16<2>),
                      MAP(imm_v256_shr_n_u16<4>),
                      MAP(imm_v256_shr_n_u16<6>),
                      MAP(imm_v256_shr_n_u16<8>),
                      MAP(imm_v256_shr_n_u16<10>),
                      MAP(imm_v256_shr_n_u16<12>),
                      MAP(imm_v256_shr_n_u16<14>),
                      MAP(imm_v256_shr_n_s16<1>),
                      MAP(imm_v256_shr_n_s16<2>),
                      MAP(imm_v256_shr_n_s16<4>),
                      MAP(imm_v256_shr_n_s16<6>),
                      MAP(imm_v256_shr_n_s16<8>),
                      MAP(imm_v256_shr_n_s16<10>),
                      MAP(imm_v256_shr_n_s16<12>),
                      MAP(imm_v256_shr_n_s16<14>),
                      MAP(imm_v256_shl_n_32<1>),
                      MAP(imm_v256_shl_n_32<4>),
                      MAP(imm_v256_shl_n_32<8>),
                      MAP(imm_v256_shl_n_32<12>),
                      MAP(imm_v256_shl_n_32<16>),
                      MAP(imm_v256_shl_n_32<20>),
                      MAP(imm_v256_shl_n_32<24>),
                      MAP(imm_v256_shl_n_32<28>),
                      MAP(imm_v256_shr_n_u32<1>),
                      MAP(imm_v256_shr_n_u32<4>),
                      MAP(imm_v256_shr_n_u32<8>),
                      MAP(imm_v256_shr_n_u32<12>),
                      MAP(imm_v256_shr_n_u32<16>),
                      MAP(imm_v256_shr_n_u32<20>),
                      MAP(imm_v256_shr_n_u32<24>),
                      MAP(imm_v256_shr_n_u32<28>),
                      MAP(imm_v256_shr_n_s32<1>),
                      MAP(imm_v256_shr_n_s32<4>),
                      MAP(imm_v256_shr_n_s32<8>),
                      MAP(imm_v256_shr_n_s32<12>),
                      MAP(imm_v256_shr_n_s32<16>),
                      MAP(imm_v256_shr_n_s32<20>),
                      MAP(imm_v256_shr_n_s32<24>),
                      MAP(imm_v256_shr_n_s32<28>),
                      MAP(imm_v256_shl_n_64<1>),
                      MAP(imm_v256_shl_n_64<4>),
                      MAP(imm_v256_shl_n_64<8>),
                      MAP(imm_v256_shl_n_64<12>),
                      MAP(imm_v256_shl_n_64<16>),
                      MAP(imm_v256_shl_n_64<20>),
                      MAP(imm_v256_shl_n_64<24>),
                      MAP(imm_v256_shl_n_64<28>),
                      MAP(imm_v256_shl_n_64<32>),
                      MAP(imm_v256_shl_n_64<36>),
                      MAP(imm_v256_shl_n_64<40>),
                      MAP(imm_v256_shl_n_64<44>),
                      MAP(imm_v256_shl_n_64<48>),
                      MAP(imm_v256_shl_n_64<52>),
                      MAP(imm_v256_shl_n_64<56>),
                      MAP(imm_v256_shl_n_64<60>),
                      MAP(imm_v256_shr_n_u64<1>),
                      MAP(imm_v256_shr_n_u64<4>),
                      MAP(imm_v256_shr_n_u64<8>),
                      MAP(imm_v256_shr_n_u64<12>),
                      MAP(imm_v256_shr_n_u64<16>),
                      MAP(imm_v256_shr_n_u64<20>),
                      MAP(imm_v256_shr_n_u64<24>),
                      MAP(imm_v256_shr_n_u64<28>),
                      MAP(imm_v256_shr_n_u64<32>),
                      MAP(imm_v256_shr_n_u64<36>),
                      MAP(imm_v256_shr_n_u64<40>),
                      MAP(imm_v256_shr_n_u64<44>),
                      MAP(imm_v256_shr_n_u64<48>),
                      MAP(imm_v256_shr_n_u64<52>),
                      MAP(imm_v256_shr_n_u64<56>),
                      MAP(imm_v256_shr_n_u64<60>),
                      MAP(imm_v256_shr_n_s64<1>),
                      MAP(imm_v256_shr_n_s64<4>),
                      MAP(imm_v256_shr_n_s64<8>),
                      MAP(imm_v256_shr_n_s64<12>),
                      MAP(imm_v256_shr_n_s64<16>),
                      MAP(imm_v256_shr_n_s64<20>),
                      MAP(imm_v256_shr_n_s64<24>),
                      MAP(imm_v256_shr_n_s64<28>),
                      MAP(imm_v256_shr_n_s64<32>),
                      MAP(imm_v256_shr_n_s64<36>),
                      MAP(imm_v256_shr_n_s64<40>),
                      MAP(imm_v256_shr_n_s64<44>),
                      MAP(imm_v256_shr_n_s64<48>),
                      MAP(imm_v256_shr_n_s64<52>),
                      MAP(imm_v256_shr_n_s64<56>),
                      MAP(imm_v256_shr_n_s64<60>),
                      MAP(v256_movemask_8),
                      MAP(v256_zero),
                      MAP(v256_dup_8),
                      MAP(v256_dup_16),
                      MAP(v256_dup_32),
                      MAP(v256_dup_64),
                      MAP(v256_low_u32),
                      MAP(v256_low_v64),
                      MAP(v256_from_64),
                      MAP(v256_from_v64),
                      MAP(v256_ziplo_128),
                      MAP(v256_ziphi_128),
                      MAP(v256_unpacklo_u8_s16),
                      MAP(v256_unpackhi_u8_s16),
                      MAP(v256_unpacklo_s8_s16),
                      MAP(v256_unpackhi_s8_s16),
                      MAP(v256_blend_8),
                      { nullptr, nullptr, nullptr } };
#undef MAP

// Map reference functions to machine tuned functions. Since the
// functions depend on machine tuned types, the non-machine tuned
// instantiations of the test can't refer to these functions directly,
// so we refer to them by name and do the mapping here.
void Map(const char *name, fptr *ref, fptr *simd) {
  unsigned int i;
  for (i = 0; m[i].name && strcmp(name, m[i].name); i++) {
  }

  *ref = m[i].ref;
  *simd = m[i].simd;
}
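
// Illustrative use: Map("v64_abs_s8", &ref, &simd) leaves ref pointing at
// c_v64_abs_s8 and simd at the machine-tuned v64_abs_s8. An unknown name
// stops the search at the { nullptr, nullptr, nullptr } sentinel, which the
// TestSimd* functions below report as an internal error.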

// Used for printing errors in TestSimd1Arg, TestSimd2Args and TestSimd3Args
std::string Print(const uint8_t *a, int size) {
  std::string text = "0x";
  for (int i = 0; i < size; i++) {
    const uint8_t c = a[!CONFIG_BIG_ENDIAN ? size - 1 - i : i];
    // Same as snprintf(..., ..., "%02x", c)
    text += (c >> 4) + '0' + ((c >> 4) > 9) * ('a' - '0' - 10);
    text += (c & 15) + '0' + ((c & 15) > 9) * ('a' - '0' - 10);
  }

  return text;
}

// Used in TestSimd1Arg, TestSimd2Args and TestSimd3Args to restrict argument
// ranges
void SetMask(uint8_t *s, int size, uint32_t mask, uint32_t maskwidth) {
  switch (maskwidth) {
    case 0: {
      break;
    }
    case 8: {
      for (int i = 0; i < size; i++) s[i] &= mask;
      break;
    }
    case 16: {
      uint16_t *t = reinterpret_cast<uint16_t *>(s);
      assert(!(reinterpret_cast<uintptr_t>(s) & 1));
      for (int i = 0; i < size / 2; i++) t[i] &= mask;
      break;
    }
    case 32: {
      uint32_t *t = reinterpret_cast<uint32_t *>(s);
      assert(!(reinterpret_cast<uintptr_t>(s) & 3));
      for (int i = 0; i < size / 4; i++) t[i] &= mask;
      break;
    }
    case 64: {
      uint64_t *t = reinterpret_cast<uint64_t *>(s);
      assert(!(reinterpret_cast<uintptr_t>(s) & 7));
      for (int i = 0; i < size / 8; i++) t[i] &= mask;
      break;
    }
    default: {
      FAIL() << "Unsupported mask width";
      break;
    }
  }
}
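
// For example, a uint32_t shift-amount operand for a 16-bit shift can be
// restricted to the legal range [0, 15] with SetMask(s2, sizeof(CArg2),
// 15, 32): each 32-bit word of the random buffer is ANDed with the mask 15.
// A maskwidth of 0 leaves the buffer unrestricted.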

// We need some extra load/store functions
void u64_store_aligned(void *p, uint64_t a) {
  v64_store_aligned(p, v64_from_64(a));
}
void s32_store_aligned(void *p, int32_t a) {
  u32_store_aligned(p, static_cast<uint32_t>(a));
}
void s64_store_aligned(void *p, int64_t a) {
  v64_store_aligned(p, v64_from_64(static_cast<uint64_t>(a)));
}

void c_u64_store_aligned(void *p, uint64_t a) {
  c_v64_store_aligned(p, c_v64_from_64(a));
}

void c_s32_store_aligned(void *p, int32_t a) {
  c_u32_store_aligned(p, static_cast<uint32_t>(a));
}

void c_s64_store_aligned(void *p, int64_t a) {
  c_v64_store_aligned(p, c_v64_from_64(static_cast<uint64_t>(a)));
}

uint64_t u64_load_aligned(const void *p) {
  return v64_u64(v64_load_aligned(p));
}
uint16_t u16_load_aligned(const void *p) {
  return *(reinterpret_cast<const uint16_t *>(p));
}
uint8_t u8_load_aligned(const void *p) {
  return *(reinterpret_cast<const uint8_t *>(p));
}

uint64_t c_u64_load_aligned(const void *p) {
  return c_v64_u64(c_v64_load_aligned(p));
}
uint16_t c_u16_load_aligned(const void *p) {
  return *(reinterpret_cast<const uint16_t *>(p));
}
uint8_t c_u8_load_aligned(const void *p) {
  return *(reinterpret_cast<const uint8_t *>(p));
}

// CompareSimd1Arg, CompareSimd2Args and CompareSimd3Args compare
// intrinsics taking 1, 2 or 3 arguments respectively with their
// corresponding C reference. Ideally, the loads and stores should
// have gone into the template parameter list, but v64 and v128 could
// be typedef'ed to the same type (which is the case on x86) and then
// we can't instantiate both v64 and v128, so the function return and
// argument types, including the always differing types in the C
// equivalent, are used instead. The function arguments must be void
// pointers and then go through a cast to avoid matching errors in the
// branches eliminated by the typeid tests in the calling function.
template <typename Ret, typename Arg, typename CRet, typename CArg>
int CompareSimd1Arg(fptr store, fptr load, fptr simd, void *d, fptr c_store,
                    fptr c_load, fptr c_simd, void *ref_d, const void *a) {
  void (*const my_store)(void *, Ret) = (void (*const)(void *, Ret))store;
  Arg (*const my_load)(const void *) = (Arg(*const)(const void *))load;
  Ret (*const my_simd)(Arg) = (Ret(*const)(Arg))simd;
  void (*const my_c_store)(void *, CRet) = (void (*const)(void *, CRet))c_store;
  CArg (*const my_c_load)(const void *) = (CArg(*const)(const void *))c_load;
  CRet (*const my_c_simd)(CArg) = (CRet(*const)(CArg))c_simd;

  // Call reference and intrinsic
  my_c_store(ref_d, my_c_simd(my_c_load(a)));
  my_store(d, my_simd(my_load(a)));

  // Compare results
  return memcmp(ref_d, d, sizeof(CRet));
}

template <typename Ret, typename Arg1, typename Arg2, typename CRet,
          typename CArg1, typename CArg2>
int CompareSimd2Args(fptr store, fptr load1, fptr load2, fptr simd, void *d,
                     fptr c_store, fptr c_load1, fptr c_load2, fptr c_simd,
                     void *ref_d, const void *a, const void *b) {
  void (*const my_store)(void *, Ret) = (void (*const)(void *, Ret))store;
  Arg1 (*const my_load1)(const void *) = (Arg1(*const)(const void *))load1;
  Arg2 (*const my_load2)(const void *) = (Arg2(*const)(const void *))load2;
  Ret (*const my_simd)(Arg1, Arg2) = (Ret(*const)(Arg1, Arg2))simd;
  void (*const my_c_store)(void *, CRet) = (void (*const)(void *, CRet))c_store;
  CArg1 (*const my_c_load1)(const void *) =
      (CArg1(*const)(const void *))c_load1;
  CArg2 (*const my_c_load2)(const void *) =
      (CArg2(*const)(const void *))c_load2;
  CRet (*const my_c_simd)(CArg1, CArg2) = (CRet(*const)(CArg1, CArg2))c_simd;

  // Call reference and intrinsic
  my_c_store(ref_d, my_c_simd(my_c_load1(a), my_c_load2(b)));
  my_store(d, my_simd(my_load1(a), my_load2(b)));

  // Compare results
  return memcmp(ref_d, d, sizeof(CRet));
}

template <typename Ret, typename Arg1, typename Arg2, typename Arg3,
          typename CRet, typename CArg1, typename CArg2, typename CArg3>
int CompareSimd3Args(fptr store, fptr load1, fptr load2, fptr load3, fptr simd,
                     void *d, fptr c_store, fptr c_load1, fptr c_load2,
                     fptr c_load3, fptr c_simd, void *ref_d, const void *a,
                     const void *b, const void *c) {
  void (*const my_store)(void *, Ret) = (void (*const)(void *, Ret))store;
  Arg1 (*const my_load1)(const void *) = (Arg1(*const)(const void *))load1;
  Arg2 (*const my_load2)(const void *) = (Arg2(*const)(const void *))load2;
  Arg3 (*const my_load3)(const void *) = (Arg3(*const)(const void *))load3;
  Ret (*const my_simd)(Arg1, Arg2, Arg3) = (Ret(*const)(Arg1, Arg2, Arg3))simd;
  void (*const my_c_store)(void *, CRet) = (void (*const)(void *, CRet))c_store;
  CArg1 (*const my_c_load1)(const void *) =
      (CArg1(*const)(const void *))c_load1;
  CArg2 (*const my_c_load2)(const void *) =
      (CArg2(*const)(const void *))c_load2;
  CArg3 (*const my_c_load3)(const void *) =
      (CArg3(*const)(const void *))c_load3;
  CRet (*const my_c_simd)(CArg1, CArg2, CArg3) =
      (CRet(*const)(CArg1, CArg2, CArg3))c_simd;

  // Call reference and intrinsic
  my_c_store(ref_d, my_c_simd(my_c_load1(a), my_c_load2(b), my_c_load3(c)));
  my_store(d, my_simd(my_load1(a), my_load2(b), my_load3(c)));

  // Compare results
  return memcmp(ref_d, d, sizeof(CRet));
}

}  // namespace
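
// The TestSimd* drivers below are invoked from the shared test harness with
// the C-reference types as template arguments and the intrinsic's name as a
// string, roughly like this (illustrative call, not taken from this file):
//   TestSimd1Arg<c_v64, c_v64>(/*iterations=*/65536, /*mask=*/0U,
//                              /*maskwidth=*/0U, "v64_abs_s8");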

template <typename CRet, typename CArg>
void TestSimd1Arg(uint32_t iterations, uint32_t mask, uint32_t maskwidth,
                  const char *name) {
  ACMRandom rnd(ACMRandom::DeterministicSeed());
  fptr ref_simd;
  fptr simd;
  int error = 0;
  DECLARE_ALIGNED(32, uint8_t, s[32]);
  DECLARE_ALIGNED(32, uint8_t, d[32]);
  DECLARE_ALIGNED(32, uint8_t, ref_d[32]);
  assert(sizeof(CArg) <= 32 && sizeof(CRet) <= 32);
  memset(ref_d, 0, sizeof(ref_d));
  memset(d, 0, sizeof(d));

  Map(name, &ref_simd, &simd);
  if (simd == nullptr || ref_simd == nullptr) {
    FAIL() << "Internal error: Unknown intrinsic function " << name;
  }
  for (unsigned int count = 0;
       count < iterations && !error && !testing::Test::HasFailure(); count++) {
    for (unsigned int c = 0; c < sizeof(CArg); c++) s[c] = rnd.Rand8();

    if (maskwidth) {
      SetMask(s, sizeof(CArg), mask, maskwidth);
    }

    if (typeid(CRet) == typeid(c_v64) && typeid(CArg) == typeid(c_v64)) {
      // V64_V64
      error = CompareSimd1Arg<v64, v64, c_v64, c_v64>(
          reinterpret_cast<fptr>(v64_store_aligned),
          reinterpret_cast<fptr>(v64_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_v64_store_aligned),
          reinterpret_cast<fptr>(c_v64_load_aligned), ref_simd, ref_d, s);
    } else if (typeid(CRet) == typeid(c_v64) &&
               typeid(CArg) == typeid(uint8_t)) {
      // V64_U8
      error = CompareSimd1Arg<v64, uint8_t, c_v64, uint8_t>(
          reinterpret_cast<fptr>(v64_store_aligned),
          reinterpret_cast<fptr>(u8_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_v64_store_aligned),
          reinterpret_cast<fptr>(c_u8_load_aligned), ref_simd, ref_d, s);
    } else if (typeid(CRet) == typeid(c_v64) &&
               typeid(CArg) == typeid(uint16_t)) {
      // V64_U16
      error = CompareSimd1Arg<v64, uint16_t, c_v64, uint16_t>(
          reinterpret_cast<fptr>(v64_store_aligned),
          reinterpret_cast<fptr>(u16_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_v64_store_aligned),
          reinterpret_cast<fptr>(c_u16_load_aligned), ref_simd, ref_d, s);
    } else if (typeid(CRet) == typeid(c_v64) &&
               typeid(CArg) == typeid(uint32_t)) {
      // V64_U32
      error = CompareSimd1Arg<v64, uint32_t, c_v64, uint32_t>(
          reinterpret_cast<fptr>(v64_store_aligned),
          reinterpret_cast<fptr>(u32_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_v64_store_aligned),
          reinterpret_cast<fptr>(c_u32_load_aligned), ref_simd, ref_d, s);
    } else if (typeid(CRet) == typeid(uint64_t) &&
               typeid(CArg) == typeid(c_v64)) {
      // U64_V64
      error = CompareSimd1Arg<uint64_t, v64, uint64_t, c_v64>(
          reinterpret_cast<fptr>(u64_store_aligned),
          reinterpret_cast<fptr>(v64_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_u64_store_aligned),
          reinterpret_cast<fptr>(c_v64_load_aligned), ref_simd, ref_d, s);
    } else if (typeid(CRet) == typeid(int64_t) &&
               typeid(CArg) == typeid(c_v64)) {
      // S64_V64
      error = CompareSimd1Arg<int64_t, v64, int64_t, c_v64>(
          reinterpret_cast<fptr>(s64_store_aligned),
          reinterpret_cast<fptr>(v64_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_s64_store_aligned),
          reinterpret_cast<fptr>(c_v64_load_aligned), ref_simd, ref_d, s);
    } else if (typeid(CRet) == typeid(uint32_t) &&
               typeid(CArg) == typeid(c_v64)) {
      // U32_V64
      error = CompareSimd1Arg<uint32_t, v64, uint32_t, c_v64>(
          reinterpret_cast<fptr>(u32_store_aligned),
          reinterpret_cast<fptr>(v64_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_u32_store_aligned),
          reinterpret_cast<fptr>(c_v64_load_aligned), ref_simd, ref_d, s);
    } else if (typeid(CRet) == typeid(int32_t) &&
               typeid(CArg) == typeid(c_v64)) {
      // S32_V64
      error = CompareSimd1Arg<int32_t, v64, int32_t, c_v64>(
          reinterpret_cast<fptr>(s32_store_aligned),
          reinterpret_cast<fptr>(v64_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_s32_store_aligned),
          reinterpret_cast<fptr>(c_v64_load_aligned), ref_simd, ref_d, s);
    } else if (typeid(CRet) == typeid(uint32_t) &&
               typeid(CArg) == typeid(c_v128)) {
      // U32_V128
      error = CompareSimd1Arg<uint32_t, v128, uint32_t, c_v128>(
          reinterpret_cast<fptr>(u32_store_aligned),
          reinterpret_cast<fptr>(v128_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_u32_store_aligned),
          reinterpret_cast<fptr>(c_v128_load_aligned), ref_simd, ref_d, s);
    } else if (typeid(CRet) == typeid(uint64_t) &&
               typeid(CArg) == typeid(c_v128)) {
      // U64_V128
      error = CompareSimd1Arg<uint64_t, v128, uint64_t, c_v128>(
          reinterpret_cast<fptr>(u64_store_aligned),
          reinterpret_cast<fptr>(v128_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_u64_store_aligned),
          reinterpret_cast<fptr>(c_v128_load_aligned), ref_simd, ref_d, s);
    } else if (typeid(CRet) == typeid(uint64_t) &&
               typeid(CArg) == typeid(c_v256)) {
      // U64_V256
      error = CompareSimd1Arg<uint64_t, v256, uint64_t, c_v256>(
          reinterpret_cast<fptr>(u64_store_aligned),
          reinterpret_cast<fptr>(v256_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_u64_store_aligned),
          reinterpret_cast<fptr>(c_v256_load_aligned), ref_simd, ref_d, s);
    } else if (typeid(CRet) == typeid(c_v64) &&
               typeid(CArg) == typeid(c_v128)) {
      // V64_V128
      error = CompareSimd1Arg<v64, v128, c_v64, c_v128>(
          reinterpret_cast<fptr>(v64_store_aligned),
          reinterpret_cast<fptr>(v128_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_v64_store_aligned),
          reinterpret_cast<fptr>(c_v128_load_aligned), ref_simd, ref_d, s);
    } else if (typeid(CRet) == typeid(c_v128) &&
               typeid(CArg) == typeid(c_v128)) {
      // V128_V128
      error = CompareSimd1Arg<v128, v128, c_v128, c_v128>(
          reinterpret_cast<fptr>(v128_store_aligned),
          reinterpret_cast<fptr>(v128_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_v128_store_aligned),
          reinterpret_cast<fptr>(c_v128_load_aligned), ref_simd, ref_d, s);
    } else if (typeid(CRet) == typeid(c_v128) &&
               typeid(CArg) == typeid(c_v64)) {
      // V128_V64
      error = CompareSimd1Arg<v128, v64, c_v128, c_v64>(
          reinterpret_cast<fptr>(v128_store_aligned),
          reinterpret_cast<fptr>(v64_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_v128_store_aligned),
          reinterpret_cast<fptr>(c_v64_load_aligned), ref_simd, ref_d, s);
    } else if (typeid(CRet) == typeid(c_v128) &&
               typeid(CArg) == typeid(uint8_t)) {
      // V128_U8
      error = CompareSimd1Arg<v128, uint8_t, c_v128, uint8_t>(
          reinterpret_cast<fptr>(v128_store_aligned),
          reinterpret_cast<fptr>(u8_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_v128_store_aligned),
          reinterpret_cast<fptr>(c_u8_load_aligned), ref_simd, ref_d, s);
    } else if (typeid(CRet) == typeid(c_v128) &&
               typeid(CArg) == typeid(uint16_t)) {
      // V128_U16
      error = CompareSimd1Arg<v128, uint16_t, c_v128, uint16_t>(
          reinterpret_cast<fptr>(v128_store_aligned),
          reinterpret_cast<fptr>(u16_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_v128_store_aligned),
          reinterpret_cast<fptr>(c_u16_load_aligned), ref_simd, ref_d, s);
    } else if (typeid(CRet) == typeid(c_v128) &&
               typeid(CArg) == typeid(uint32_t)) {
      // V128_U32
      error = CompareSimd1Arg<v128, uint32_t, c_v128, uint32_t>(
          reinterpret_cast<fptr>(v128_store_aligned),
          reinterpret_cast<fptr>(u32_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_v128_store_aligned),
          reinterpret_cast<fptr>(c_u32_load_aligned), ref_simd, ref_d, s);
    } else if (typeid(CRet) == typeid(c_v128) &&
               typeid(CArg) == typeid(uint64_t)) {
      // V128_U64
      error = CompareSimd1Arg<v128, uint64_t, c_v128, uint64_t>(
          reinterpret_cast<fptr>(v128_store_aligned),
          reinterpret_cast<fptr>(u64_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_v128_store_aligned),
          reinterpret_cast<fptr>(c_u64_load_aligned), ref_simd, ref_d, s);
    } else if (typeid(CRet) == typeid(c_v256) &&
               typeid(CArg) == typeid(c_v256)) {
      // V256_V256
      error = CompareSimd1Arg<v256, v256, c_v256, c_v256>(
          reinterpret_cast<fptr>(v256_store_aligned),
          reinterpret_cast<fptr>(v256_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_v256_store_aligned),
          reinterpret_cast<fptr>(c_v256_load_aligned), ref_simd, ref_d, s);
    } else if (typeid(CRet) == typeid(c_v256) &&
               typeid(CArg) == typeid(c_v128)) {
      // V256_V128
      error = CompareSimd1Arg<v256, v128, c_v256, c_v128>(
          reinterpret_cast<fptr>(v256_store_aligned),
          reinterpret_cast<fptr>(v128_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_v256_store_aligned),
          reinterpret_cast<fptr>(c_v128_load_aligned), ref_simd, ref_d, s);
    } else if (typeid(CRet) == typeid(c_v256) &&
               typeid(CArg) == typeid(uint8_t)) {
      // V256_U8
      error = CompareSimd1Arg<v256, uint8_t, c_v256, uint8_t>(
          reinterpret_cast<fptr>(v256_store_aligned),
          reinterpret_cast<fptr>(u8_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_v256_store_aligned),
          reinterpret_cast<fptr>(c_u8_load_aligned), ref_simd, ref_d, s);
    } else if (typeid(CRet) == typeid(c_v256) &&
               typeid(CArg) == typeid(uint16_t)) {
      // V256_U16
      error = CompareSimd1Arg<v256, uint16_t, c_v256, uint16_t>(
          reinterpret_cast<fptr>(v256_store_aligned),
          reinterpret_cast<fptr>(u16_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_v256_store_aligned),
          reinterpret_cast<fptr>(c_u16_load_aligned), ref_simd, ref_d, s);
    } else if (typeid(CRet) == typeid(c_v256) &&
               typeid(CArg) == typeid(uint32_t)) {
      // V256_U32
      error = CompareSimd1Arg<v256, uint32_t, c_v256, uint32_t>(
          reinterpret_cast<fptr>(v256_store_aligned),
          reinterpret_cast<fptr>(u32_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_v256_store_aligned),
          reinterpret_cast<fptr>(c_u32_load_aligned), ref_simd, ref_d, s);
    } else if (typeid(CRet) == typeid(c_v256) &&
               typeid(CArg) == typeid(uint64_t)) {
      // V256_U64
      error = CompareSimd1Arg<v256, uint64_t, c_v256, uint64_t>(
          reinterpret_cast<fptr>(v256_store_aligned),
          reinterpret_cast<fptr>(u64_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_v256_store_aligned),
          reinterpret_cast<fptr>(c_u64_load_aligned), ref_simd, ref_d, s);
    } else if (typeid(CRet) == typeid(uint32_t) &&
               typeid(CArg) == typeid(c_v256)) {
      // U32_V256
      error = CompareSimd1Arg<uint32_t, v256, uint32_t, c_v256>(
          reinterpret_cast<fptr>(u32_store_aligned),
          reinterpret_cast<fptr>(v256_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_u32_store_aligned),
          reinterpret_cast<fptr>(c_v256_load_aligned), ref_simd, ref_d, s);
    } else if (typeid(CRet) == typeid(c_v64) &&
               typeid(CArg) == typeid(c_v256)) {
      // V64_V256
      error = CompareSimd1Arg<v64, v256, c_v64, c_v256>(
          reinterpret_cast<fptr>(v64_store_aligned),
          reinterpret_cast<fptr>(v256_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_v64_store_aligned),
          reinterpret_cast<fptr>(c_v256_load_aligned), ref_simd, ref_d, s);
    } else {
      FAIL() << "Internal error: Unknown intrinsic function "
             << typeid(CRet).name() << " " << name << "(" << typeid(CArg).name()
             << ")";
    }
  }

  EXPECT_EQ(0, error) << "Error: mismatch for " << name << "("
                      << Print(s, sizeof(CArg)) << ") -> "
                      << Print(d, sizeof(CRet)) << " (simd), "
                      << Print(ref_d, sizeof(CRet)) << " (ref)";
}

template <typename CRet, typename CArg1, typename CArg2>
void TestSimd2Args(uint32_t iterations, uint32_t mask, uint32_t maskwidth,
                   const char *name) {
  ACMRandom rnd(ACMRandom::DeterministicSeed());
  fptr ref_simd;
  fptr simd;
  int error = 0;
  DECLARE_ALIGNED(32, uint8_t, s1[32]);
  DECLARE_ALIGNED(32, uint8_t, s2[32]);
  DECLARE_ALIGNED(32, uint8_t, d[32]);
  DECLARE_ALIGNED(32, uint8_t, ref_d[32]);
  assert(sizeof(CArg1) <= 32 && sizeof(CArg2) <= 32 && sizeof(CRet) <= 32);
  memset(ref_d, 0, sizeof(ref_d));
  memset(d, 0, sizeof(d));

  Map(name, &ref_simd, &simd);
  if (simd == nullptr || ref_simd == nullptr) {
    FAIL() << "Internal error: Unknown intrinsic function " << name;
  }

  for (unsigned int count = 0;
       count < iterations && !error && !testing::Test::HasFailure(); count++) {
    for (unsigned int c = 0; c < sizeof(CArg1); c++) s1[c] = rnd.Rand8();

    for (unsigned int c = 0; c < sizeof(CArg2); c++) s2[c] = rnd.Rand8();

    if (maskwidth) SetMask(s2, sizeof(CArg2), mask, maskwidth);

    if (typeid(CRet) == typeid(c_v64) && typeid(CArg1) == typeid(c_v64) &&
        typeid(CArg2) == typeid(c_v64)) {
      // V64_V64V64
      error = CompareSimd2Args<v64, v64, v64, c_v64, c_v64, c_v64>(
          reinterpret_cast<fptr>(v64_store_aligned),
          reinterpret_cast<fptr>(v64_load_aligned),
          reinterpret_cast<fptr>(v64_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_v64_store_aligned),
          reinterpret_cast<fptr>(c_v64_load_aligned),
          reinterpret_cast<fptr>(c_v64_load_aligned),
          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
    } else if (typeid(CRet) == typeid(c_v64) &&
               typeid(CArg1) == typeid(uint32_t) &&
               typeid(CArg2) == typeid(uint32_t)) {
      // V64_U32U32
      error =
          CompareSimd2Args<v64, uint32_t, uint32_t, c_v64, uint32_t, uint32_t>(
              reinterpret_cast<fptr>(v64_store_aligned),
              reinterpret_cast<fptr>(u32_load_aligned),
              reinterpret_cast<fptr>(u32_load_aligned), simd, d,
              reinterpret_cast<fptr>(c_v64_store_aligned),
              reinterpret_cast<fptr>(c_u32_load_aligned),
              reinterpret_cast<fptr>(c_u32_load_aligned),
              reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
    } else if (typeid(CRet) == typeid(uint32_t) &&
               typeid(CArg1) == typeid(c_v64) &&
               typeid(CArg2) == typeid(c_v64)) {
      // U32_V64V64
      error = CompareSimd2Args<uint32_t, v64, v64, uint32_t, c_v64, c_v64>(
          reinterpret_cast<fptr>(u32_store_aligned),
    } else if (typeid(CRet) == typeid(uint32_t) &&
               typeid(CArg1) == typeid(c_v64) &&
               typeid(CArg2) == typeid(c_v64)) {
      // U32_V64V64
      error = CompareSimd2Args<uint32_t, v64, v64, uint32_t, c_v64, c_v64>(
          reinterpret_cast<fptr>(u32_store_aligned),
          reinterpret_cast<fptr>(v64_load_aligned),
          reinterpret_cast<fptr>(v64_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_u32_store_aligned),
          reinterpret_cast<fptr>(c_v64_load_aligned),
          reinterpret_cast<fptr>(c_v64_load_aligned),
          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
    } else if (typeid(CRet) == typeid(int64_t) &&
               typeid(CArg1) == typeid(c_v64) &&
               typeid(CArg2) == typeid(c_v64)) {
      // S64_V64V64
      error = CompareSimd2Args<int64_t, v64, v64, int64_t, c_v64, c_v64>(
          reinterpret_cast<fptr>(s64_store_aligned),
          reinterpret_cast<fptr>(v64_load_aligned),
          reinterpret_cast<fptr>(v64_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_s64_store_aligned),
          reinterpret_cast<fptr>(c_v64_load_aligned),
          reinterpret_cast<fptr>(c_v64_load_aligned),
          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
    } else if (typeid(CRet) == typeid(c_v64) &&
               typeid(CArg1) == typeid(c_v64) &&
               typeid(CArg2) == typeid(uint32_t)) {
      // V64_V64U32
      error = CompareSimd2Args<v64, v64, uint32_t, c_v64, c_v64, uint32_t>(
          reinterpret_cast<fptr>(v64_store_aligned),
          reinterpret_cast<fptr>(v64_load_aligned),
          reinterpret_cast<fptr>(u32_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_v64_store_aligned),
          reinterpret_cast<fptr>(c_v64_load_aligned),
          reinterpret_cast<fptr>(c_u32_load_aligned),
          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
    } else if (typeid(CRet) == typeid(c_v128) &&
               typeid(CArg1) == typeid(c_v128) &&
               typeid(CArg2) == typeid(c_v128)) {
      // V128_V128V128
      error = CompareSimd2Args<v128, v128, v128, c_v128, c_v128, c_v128>(
          reinterpret_cast<fptr>(v128_store_aligned),
          reinterpret_cast<fptr>(v128_load_aligned),
          reinterpret_cast<fptr>(v128_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_v128_store_aligned),
          reinterpret_cast<fptr>(c_v128_load_aligned),
          reinterpret_cast<fptr>(c_v128_load_aligned),
          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
    } else if (typeid(CRet) == typeid(uint32_t) &&
               typeid(CArg1) == typeid(c_v128) &&
               typeid(CArg2) == typeid(c_v128)) {
      // U32_V128V128
      error = CompareSimd2Args<uint32_t, v128, v128, uint32_t, c_v128, c_v128>(
          reinterpret_cast<fptr>(u32_store_aligned),
          reinterpret_cast<fptr>(v128_load_aligned),
          reinterpret_cast<fptr>(v128_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_u32_store_aligned),
          reinterpret_cast<fptr>(c_v128_load_aligned),
          reinterpret_cast<fptr>(c_v128_load_aligned),
          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
    } else if (typeid(CRet) == typeid(uint64_t) &&
               typeid(CArg1) == typeid(c_v128) &&
               typeid(CArg2) == typeid(c_v128)) {
      // U64_V128V128
      error = CompareSimd2Args<uint64_t, v128, v128, uint64_t, c_v128, c_v128>(
          reinterpret_cast<fptr>(u64_store_aligned),
          reinterpret_cast<fptr>(v128_load_aligned),
          reinterpret_cast<fptr>(v128_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_u64_store_aligned),
          reinterpret_cast<fptr>(c_v128_load_aligned),
          reinterpret_cast<fptr>(c_v128_load_aligned),
          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
    } else if (typeid(CRet) == typeid(int64_t) &&
               typeid(CArg1) == typeid(c_v128) &&
               typeid(CArg2) == typeid(c_v128)) {
      // S64_V128V128
      error = CompareSimd2Args<int64_t, v128, v128, int64_t, c_v128, c_v128>(
          reinterpret_cast<fptr>(s64_store_aligned),
          reinterpret_cast<fptr>(v128_load_aligned),
          reinterpret_cast<fptr>(v128_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_s64_store_aligned),
          reinterpret_cast<fptr>(c_v128_load_aligned),
          reinterpret_cast<fptr>(c_v128_load_aligned),
          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
    } else if (typeid(CRet) == typeid(c_v128) &&
               typeid(CArg1) == typeid(uint64_t) &&
               typeid(CArg2) == typeid(uint64_t)) {
      // V128_U64U64
      error = CompareSimd2Args<v128, uint64_t, uint64_t, c_v128, uint64_t,
                               uint64_t>(
          reinterpret_cast<fptr>(v128_store_aligned),
          reinterpret_cast<fptr>(u64_load_aligned),
          reinterpret_cast<fptr>(u64_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_v128_store_aligned),
          reinterpret_cast<fptr>(c_u64_load_aligned),
          reinterpret_cast<fptr>(c_u64_load_aligned),
          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
    } else if (typeid(CRet) == typeid(c_v128) &&
               typeid(CArg1) == typeid(c_v64) &&
               typeid(CArg2) == typeid(c_v64)) {
      // V128_V64V64
      error = CompareSimd2Args<v128, v64, v64, c_v128, c_v64, c_v64>(
          reinterpret_cast<fptr>(v128_store_aligned),
          reinterpret_cast<fptr>(v64_load_aligned),
          reinterpret_cast<fptr>(v64_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_v128_store_aligned),
          reinterpret_cast<fptr>(c_v64_load_aligned),
          reinterpret_cast<fptr>(c_v64_load_aligned),
          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
    } else if (typeid(CRet) == typeid(c_v128) &&
               typeid(CArg1) == typeid(c_v128) &&
               typeid(CArg2) == typeid(uint32_t)) {
      // V128_V128U32
      error = CompareSimd2Args<v128, v128, uint32_t, c_v128, c_v128, uint32_t>(
          reinterpret_cast<fptr>(v128_store_aligned),
          reinterpret_cast<fptr>(v128_load_aligned),
          reinterpret_cast<fptr>(u32_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_v128_store_aligned),
          reinterpret_cast<fptr>(c_v128_load_aligned),
          reinterpret_cast<fptr>(c_u32_load_aligned),
          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
    } else if (typeid(CRet) == typeid(c_v256) &&
               typeid(CArg1) == typeid(c_v256) &&
               typeid(CArg2) == typeid(c_v256)) {
      // V256_V256V256
      error = CompareSimd2Args<v256, v256, v256, c_v256, c_v256, c_v256>(
          reinterpret_cast<fptr>(v256_store_aligned),
          reinterpret_cast<fptr>(v256_load_aligned),
          reinterpret_cast<fptr>(v256_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_v256_store_aligned),
          reinterpret_cast<fptr>(c_v256_load_aligned),
          reinterpret_cast<fptr>(c_v256_load_aligned),
          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
    } else if (typeid(CRet) == typeid(uint64_t) &&
               typeid(CArg1) == typeid(c_v256) &&
               typeid(CArg2) == typeid(c_v256)) {
      // U64_V256V256
      error = CompareSimd2Args<uint64_t, v256, v256, uint64_t, c_v256, c_v256>(
          reinterpret_cast<fptr>(u64_store_aligned),
          reinterpret_cast<fptr>(v256_load_aligned),
          reinterpret_cast<fptr>(v256_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_u64_store_aligned),
          reinterpret_cast<fptr>(c_v256_load_aligned),
          reinterpret_cast<fptr>(c_v256_load_aligned),
          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
    } else if (typeid(CRet) == typeid(int64_t) &&
               typeid(CArg1) == typeid(c_v256) &&
               typeid(CArg2) == typeid(c_v256)) {
      // S64_V256V256
      error = CompareSimd2Args<int64_t, v256, v256, int64_t, c_v256, c_v256>(
          reinterpret_cast<fptr>(s64_store_aligned),
          reinterpret_cast<fptr>(v256_load_aligned),
          reinterpret_cast<fptr>(v256_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_s64_store_aligned),
          reinterpret_cast<fptr>(c_v256_load_aligned),
          reinterpret_cast<fptr>(c_v256_load_aligned),
          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
    } else if (typeid(CRet) == typeid(uint32_t) &&
               typeid(CArg1) == typeid(c_v256) &&
               typeid(CArg2) == typeid(c_v256)) {
      // U32_V256V256
      error = CompareSimd2Args<uint32_t, v256, v256, uint32_t, c_v256, c_v256>(
          reinterpret_cast<fptr>(u32_store_aligned),
          reinterpret_cast<fptr>(v256_load_aligned),
          reinterpret_cast<fptr>(v256_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_u32_store_aligned),
          reinterpret_cast<fptr>(c_v256_load_aligned),
          reinterpret_cast<fptr>(c_v256_load_aligned),
          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
    } else if (typeid(CRet) == typeid(c_v256) &&
               typeid(CArg1) == typeid(c_v128) &&
               typeid(CArg2) == typeid(c_v128)) {
      // V256_V128V128
      error = CompareSimd2Args<v256, v128, v128, c_v256, c_v128, c_v128>(
          reinterpret_cast<fptr>(v256_store_aligned),
          reinterpret_cast<fptr>(v128_load_aligned),
          reinterpret_cast<fptr>(v128_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_v256_store_aligned),
          reinterpret_cast<fptr>(c_v128_load_aligned),
          reinterpret_cast<fptr>(c_v128_load_aligned),
          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
    } else if (typeid(CRet) == typeid(c_v256) &&
               typeid(CArg1) == typeid(c_v256) &&
               typeid(CArg2) == typeid(uint32_t)) {
      // V256_V256U32
      error = CompareSimd2Args<v256, v256, uint32_t, c_v256, c_v256, uint32_t>(
          reinterpret_cast<fptr>(v256_store_aligned),
          reinterpret_cast<fptr>(v256_load_aligned),
          reinterpret_cast<fptr>(u32_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_v256_store_aligned),
          reinterpret_cast<fptr>(c_v256_load_aligned),
          reinterpret_cast<fptr>(c_u32_load_aligned),
          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
    } else {
      FAIL() << "Internal error: Unknown intrinsic function "
             << typeid(CRet).name() << " " << name << "("
             << typeid(CArg1).name() << ", " << typeid(CArg2).name() << ")";
    }
  }

  EXPECT_EQ(0, error) << "Error: mismatch for " << name << "("
                      << Print(s1, sizeof(CArg1)) << ", "
                      << Print(s2, sizeof(CArg2)) << ") -> "
                      << Print(d, sizeof(CRet)) << " (simd), "
                      << Print(ref_d, sizeof(CRet)) << " (ref)";
}
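
// TestSimd3Args: three-argument variant of the above. Only the all-vector
// V128 and V256 signatures are dispatched here; when maskwidth is non-zero
// the mask is applied to the third argument.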
template <typename CRet, typename CArg1, typename CArg2, typename CArg3>
void TestSimd3Args(uint32_t iterations, uint32_t mask, uint32_t maskwidth,
                   const char *name) {
  ACMRandom rnd(ACMRandom::DeterministicSeed());
  fptr ref_simd;
  fptr simd;
  int error = 0;
  DECLARE_ALIGNED(32, uint8_t, s1[32]);
  DECLARE_ALIGNED(32, uint8_t, s2[32]);
  DECLARE_ALIGNED(32, uint8_t, s3[32]);
  DECLARE_ALIGNED(32, uint8_t, d[32]);
  DECLARE_ALIGNED(32, uint8_t, ref_d[32]);
  assert(sizeof(CArg1) <= 32 && sizeof(CArg2) <= 32 && sizeof(CArg3) <= 32 &&
         sizeof(CRet) <= 32);
  memset(ref_d, 0, sizeof(ref_d));
  memset(d, 0, sizeof(d));

  Map(name, &ref_simd, &simd);
  if (simd == nullptr || ref_simd == nullptr) {
    FAIL() << "Internal error: Unknown intrinsic function " << name;
  }

  for (unsigned int count = 0;
       count < iterations && !error && !testing::Test::HasFailure(); count++) {
    for (unsigned int c = 0; c < sizeof(CArg1); c++) s1[c] = rnd.Rand8();

    for (unsigned int c = 0; c < sizeof(CArg2); c++) s2[c] = rnd.Rand8();

    for (unsigned int c = 0; c < sizeof(CArg3); c++) s3[c] = rnd.Rand8();

    if (maskwidth) SetMask(s3, sizeof(CArg3), mask, maskwidth);

    if (typeid(CRet) == typeid(c_v128) && typeid(CArg1) == typeid(c_v128) &&
        typeid(CArg2) == typeid(c_v128) && typeid(CArg3) == typeid(c_v128)) {
      // V128_V128V128V128
      error = CompareSimd3Args<v128, v128, v128, v128, c_v128, c_v128, c_v128,
                               c_v128>(
          reinterpret_cast<fptr>(v128_store_aligned),
          reinterpret_cast<fptr>(v128_load_aligned),
          reinterpret_cast<fptr>(v128_load_aligned),
          reinterpret_cast<fptr>(v128_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_v128_store_aligned),
          reinterpret_cast<fptr>(c_v128_load_aligned),
          reinterpret_cast<fptr>(c_v128_load_aligned),
          reinterpret_cast<fptr>(c_v128_load_aligned),
          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2, s3);
    } else if (typeid(CRet) == typeid(c_v256) &&
               typeid(CArg1) == typeid(c_v256) &&
               typeid(CArg2) == typeid(c_v256) &&
               typeid(CArg3) == typeid(c_v256)) {
      // V256_V256V256V256
      error = CompareSimd3Args<v256, v256, v256, v256, c_v256, c_v256, c_v256,
                               c_v256>(
          reinterpret_cast<fptr>(v256_store_aligned),
          reinterpret_cast<fptr>(v256_load_aligned),
          reinterpret_cast<fptr>(v256_load_aligned),
          reinterpret_cast<fptr>(v256_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_v256_store_aligned),
          reinterpret_cast<fptr>(c_v256_load_aligned),
          reinterpret_cast<fptr>(c_v256_load_aligned),
          reinterpret_cast<fptr>(c_v256_load_aligned),
          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2, s3);
    } else {
      FAIL() << "Internal error: Unknown intrinsic function "
             << typeid(CRet).name() << " " << name << "("
             << typeid(CArg1).name() << ", " << typeid(CArg2).name() << ", "
             << typeid(CArg3).name() << ")";
    }
  }

  EXPECT_EQ(0, error) << "Error: mismatch for " << name << "("
                      << Print(s1, sizeof(CArg1)) << ", "
                      << Print(s2, sizeof(CArg2)) << ", "
                      << Print(s3, sizeof(CArg3)) << ") -> "
                      << Print(d, sizeof(CRet)) << " (simd), "
                      << Print(ref_d, sizeof(CRet)) << " (ref)";
}
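
// Illustrative call (hypothetical; the actual tests are registered in the
// simd_cmp_*.cc files that include this file): compare the SIMD and C
// versions of the two-argument intrinsic v64_add_8 over 1000 random inputs,
// with no masking of the second argument:
//
//   TestSimd2Args<c_v64, c_v64, c_v64>(/*iterations=*/1000, /*mask=*/0,
//                                      /*maskwidth=*/0, "v64_add_8");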

// Instantiations to make the functions callable from other files
template void TestSimd1Arg<c_v64, uint8_t>(uint32_t, uint32_t, uint32_t,
                                           const char *);
template void TestSimd1Arg<c_v64, uint16_t>(uint32_t, uint32_t, uint32_t,
                                            const char *);
template void TestSimd1Arg<c_v64, uint32_t>(uint32_t, uint32_t, uint32_t,
                                            const char *);
template void TestSimd1Arg<c_v64, c_v64>(uint32_t, uint32_t, uint32_t,
                                         const char *);
template void TestSimd1Arg<uint32_t, c_v64>(uint32_t, uint32_t, uint32_t,
                                            const char *);
template void TestSimd1Arg<int32_t, c_v64>(uint32_t, uint32_t, uint32_t,
                                           const char *);
template void TestSimd1Arg<uint64_t, c_v64>(uint32_t, uint32_t, uint32_t,
                                            const char *);
template void TestSimd1Arg<int64_t, c_v64>(uint32_t, uint32_t, uint32_t,
                                           const char *);
template void TestSimd2Args<c_v64, uint32_t, uint32_t>(uint32_t, uint32_t,
                                                       uint32_t, const char *);
template void TestSimd2Args<c_v64, c_v64, c_v64>(uint32_t, uint32_t, uint32_t,
                                                 const char *);
template void TestSimd2Args<c_v64, c_v64, uint32_t>(uint32_t, uint32_t,
                                                    uint32_t, const char *);
template void TestSimd2Args<int64_t, c_v64, c_v64>(uint32_t, uint32_t,
                                                   uint32_t, const char *);
template void TestSimd2Args<uint32_t, c_v64, c_v64>(uint32_t, uint32_t,
                                                    uint32_t, const char *);
template void TestSimd1Arg<c_v128, c_v128>(uint32_t, uint32_t, uint32_t,
                                           const char *);
template void TestSimd1Arg<c_v128, uint8_t>(uint32_t, uint32_t, uint32_t,
                                            const char *);
template void TestSimd1Arg<c_v128, uint16_t>(uint32_t, uint32_t, uint32_t,
                                             const char *);
template void TestSimd1Arg<c_v128, uint32_t>(uint32_t, uint32_t, uint32_t,
                                             const char *);
template void TestSimd1Arg<c_v128, uint64_t>(uint32_t, uint32_t, uint32_t,
                                             const char *);
template void TestSimd1Arg<c_v128, c_v64>(uint32_t, uint32_t, uint32_t,
                                          const char *);
template void TestSimd1Arg<uint32_t, c_v128>(uint32_t, uint32_t, uint32_t,
                                             const char *);
template void TestSimd1Arg<uint64_t, c_v128>(uint32_t, uint32_t, uint32_t,
                                             const char *);
template void TestSimd1Arg<c_v64, c_v128>(uint32_t, uint32_t, uint32_t,
                                          const char *);
template void TestSimd2Args<c_v128, c_v128, c_v128>(uint32_t, uint32_t,
                                                    uint32_t, const char *);
template void TestSimd2Args<c_v128, c_v128, uint32_t>(uint32_t, uint32_t,
                                                      uint32_t, const char *);
template void TestSimd2Args<c_v128, uint64_t, uint64_t>(uint32_t, uint32_t,
                                                        uint32_t,
                                                        const char *);
template void TestSimd2Args<c_v128, c_v64, c_v64>(uint32_t, uint32_t, uint32_t,
                                                  const char *);
template void TestSimd2Args<uint64_t, c_v128, c_v128>(uint32_t, uint32_t,
                                                      uint32_t, const char *);
template void TestSimd2Args<int64_t, c_v128, c_v128>(uint32_t, uint32_t,
                                                     uint32_t, const char *);
template void TestSimd2Args<uint32_t, c_v128, c_v128>(uint32_t, uint32_t,
                                                      uint32_t, const char *);
template void TestSimd3Args<c_v128, c_v128, c_v128, c_v128>(uint32_t, uint32_t,
                                                            uint32_t,
                                                            const char *);
template void TestSimd1Arg<c_v256, c_v128>(uint32_t, uint32_t, uint32_t,
                                           const char *);
template void TestSimd1Arg<c_v256, c_v256>(uint32_t, uint32_t, uint32_t,
                                           const char *);
template void TestSimd1Arg<uint64_t, c_v256>(uint32_t, uint32_t, uint32_t,
                                             const char *);
template void TestSimd1Arg<c_v256, uint8_t>(uint32_t, uint32_t, uint32_t,
                                            const char *);
template void TestSimd1Arg<c_v256, uint16_t>(uint32_t, uint32_t, uint32_t,
                                             const char *);
template void TestSimd1Arg<c_v256, uint32_t>(uint32_t, uint32_t, uint32_t,
                                             const char *);
template void TestSimd1Arg<c_v256, uint64_t>(uint32_t, uint32_t, uint32_t,
                                             const char *);
template void TestSimd1Arg<uint32_t, c_v256>(uint32_t, uint32_t, uint32_t,
                                             const char *);
template void TestSimd1Arg<c_v64, c_v256>(uint32_t, uint32_t, uint32_t,
                                          const char *);
template void TestSimd2Args<c_v256, c_v128, c_v128>(uint32_t, uint32_t,
                                                    uint32_t, const char *);
template void TestSimd2Args<c_v256, c_v256, c_v256>(uint32_t, uint32_t,
                                                    uint32_t, const char *);
template void TestSimd2Args<c_v256, c_v256, uint32_t>(uint32_t, uint32_t,
                                                      uint32_t, const char *);
template void TestSimd2Args<uint64_t, c_v256, c_v256>(uint32_t, uint32_t,
                                                      uint32_t, const char *);
template void TestSimd2Args<int64_t, c_v256, c_v256>(uint32_t, uint32_t,
                                                     uint32_t, const char *);
template void TestSimd2Args<uint32_t, c_v256, c_v256>(uint32_t, uint32_t,
                                                      uint32_t, const char *);
template void TestSimd3Args<c_v256, c_v256, c_v256, c_v256>(uint32_t, uint32_t,
                                                            uint32_t,
                                                            const char *);

}  // namespace SIMD_NAMESPACE