1 // Copyright (c) Facebook, Inc. and its affiliates. 2 // All rights reserved. 3 // 4 // Copyright 2019 Google LLC 5 // 6 // This source code is licensed under the BSD-style license found in the 7 // LICENSE file in the root directory of this source tree. 8 // 9 // Auto-generated file. Do not edit! 10 // Specification: test/f16-gemm-minmax.yaml 11 // Generator: tools/generate-gemm-test.py 12 13 14 #include <gtest/gtest.h> 15 16 #include <xnnpack/allocator.h> 17 #include <xnnpack/common.h> 18 #include <xnnpack/isa-checks.h> 19 #include <xnnpack/microparams-init.h> 20 21 #include <xnnpack/gemm.h> 22 #include <xnnpack/igemm.h> 23 #include <xnnpack/ppmm.h> 24 #include "gemm-microkernel-tester.h" 25 26 27 #if XNN_ENABLE_ARM_FP16 && XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY TEST(F16_GEMM_MINMAX_1X8__AARCH64_NEONFP16ARITH_LD64,k_eq_4)28 TEST(F16_GEMM_MINMAX_1X8__AARCH64_NEONFP16ARITH_LD64, k_eq_4) { 29 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 30 GemmMicrokernelTester() 31 .mr(1) 32 .nr(8) 33 .kr(1) 34 .sr(1) 35 .m(1) 36 .n(8) 37 .k(4) 38 .Test(xnn_f16_gemm_minmax_ukernel_1x8__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 39 } 40 TEST(F16_GEMM_MINMAX_1X8__AARCH64_NEONFP16ARITH_LD64,strided_cn)41 TEST(F16_GEMM_MINMAX_1X8__AARCH64_NEONFP16ARITH_LD64, strided_cn) { 42 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 43 GemmMicrokernelTester() 44 .mr(1) 45 .nr(8) 46 .kr(1) 47 .sr(1) 48 .m(1) 49 .n(8) 50 .k(4) 51 .cn_stride(11) 52 .Test(xnn_f16_gemm_minmax_ukernel_1x8__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 53 } 54 TEST(F16_GEMM_MINMAX_1X8__AARCH64_NEONFP16ARITH_LD64,k_eq_4_strided_a)55 TEST(F16_GEMM_MINMAX_1X8__AARCH64_NEONFP16ARITH_LD64, k_eq_4_strided_a) { 56 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 57 GemmMicrokernelTester() 58 .mr(1) 59 .nr(8) 60 .kr(1) 61 .sr(1) 62 .m(1) 63 .n(8) 64 .k(4) 65 .a_stride(7) 66 .Test(xnn_f16_gemm_minmax_ukernel_1x8__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 67 } 68 
// Parameter sweeps for the 1x8 aarch64 neonfp16arith ld64 kernel. Every case
// configures the tester with mr=1, nr=8, kr=1, sr=1; the loops only vary the
// m/n/k values handed to the tester.
// NOTE(review): TEST_REQUIRES_ARM_NEON_FP16_ARITH presumably skips the test on
// hardware without the feature -- confirm against xnnpack/isa-checks.h.

// k=4, n in [1, 8], m in [1, 1], one iteration per configuration.
TEST(F16_GEMM_MINMAX_1X8__AARCH64_NEONFP16ARITH_LD64, k_eq_4_subtile) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
    for (uint32_t m_size = 1; m_size <= 1; ++m_size) {
      GemmMicrokernelTester tester;
      tester.mr(1).nr(8).kr(1).sr(1);
      tester.m(m_size).n(n_size).k(4).iterations(1);
      tester.Test(xnn_f16_gemm_minmax_ukernel_1x8__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
    }
  }
}

// k=4, n=8, m in [1, 1], one iteration per configuration.
TEST(F16_GEMM_MINMAX_1X8__AARCH64_NEONFP16ARITH_LD64, k_eq_4_subtile_m) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t m_size = 1; m_size <= 1; ++m_size) {
    GemmMicrokernelTester tester;
    tester.mr(1).nr(8).kr(1).sr(1);
    tester.m(m_size).n(8).k(4).iterations(1);
    tester.Test(xnn_f16_gemm_minmax_ukernel_1x8__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
  }
}

// k=4, m=1, n in [1, 8], one iteration per configuration.
TEST(F16_GEMM_MINMAX_1X8__AARCH64_NEONFP16ARITH_LD64, k_eq_4_subtile_n) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
    GemmMicrokernelTester tester;
    tester.mr(1).nr(8).kr(1).sr(1);
    tester.m(1).n(n_size).k(4).iterations(1);
    tester.Test(xnn_f16_gemm_minmax_ukernel_1x8__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
  }
}

// m=1, n=8, k in [1, 3].
TEST(F16_GEMM_MINMAX_1X8__AARCH64_NEONFP16ARITH_LD64, k_lt_4) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k_size = 1; k_size < 4; ++k_size) {
    GemmMicrokernelTester tester;
    tester.mr(1).nr(8).kr(1).sr(1);
    tester.m(1).n(8).k(k_size);
    tester.Test(xnn_f16_gemm_minmax_ukernel_1x8__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
  }
}

// m=1, n=8, k in [1, 3] with a_stride(7).
TEST(F16_GEMM_MINMAX_1X8__AARCH64_NEONFP16ARITH_LD64, k_lt_4_strided_a) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k_size = 1; k_size < 4; ++k_size) {
    GemmMicrokernelTester tester;
    tester.mr(1).nr(8).kr(1).sr(1);
    tester.m(1).n(8).k(k_size).a_stride(7);
    tester.Test(xnn_f16_gemm_minmax_ukernel_1x8__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
  }
}

// k in [1, 3], n in [1, 8], m in [1, 1], one iteration per configuration.
TEST(F16_GEMM_MINMAX_1X8__AARCH64_NEONFP16ARITH_LD64, k_lt_4_subtile) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k_size = 1; k_size < 4; ++k_size) {
    for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 1; ++m_size) {
        GemmMicrokernelTester tester;
        tester.mr(1).nr(8).kr(1).sr(1);
        tester.m(m_size).n(n_size).k(k_size).iterations(1);
        tester.Test(xnn_f16_gemm_minmax_ukernel_1x8__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
      }
    }
  }
}

// m=1, n=8, k in [5, 7].
TEST(F16_GEMM_MINMAX_1X8__AARCH64_NEONFP16ARITH_LD64, k_gt_4) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k_size = 5; k_size < 8; ++k_size) {
    GemmMicrokernelTester tester;
    tester.mr(1).nr(8).kr(1).sr(1);
    tester.m(1).n(8).k(k_size);
    tester.Test(xnn_f16_gemm_minmax_ukernel_1x8__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
  }
}

// m=1, n=8, k in [5, 7] with a_stride(11).
TEST(F16_GEMM_MINMAX_1X8__AARCH64_NEONFP16ARITH_LD64, k_gt_4_strided_a) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k_size = 5; k_size < 8; ++k_size) {
    GemmMicrokernelTester tester;
    tester.mr(1).nr(8).kr(1).sr(1);
    tester.m(1).n(8).k(k_size).a_stride(11);
    tester.Test(xnn_f16_gemm_minmax_ukernel_1x8__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
  }
}

// k in [5, 7], n in [1, 8], m in [1, 1], one iteration per configuration.
TEST(F16_GEMM_MINMAX_1X8__AARCH64_NEONFP16ARITH_LD64, k_gt_4_subtile) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k_size = 5; k_size < 8; ++k_size) {
    for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 1; ++m_size) {
        GemmMicrokernelTester tester;
        tester.mr(1).nr(8).kr(1).sr(1);
        tester.m(m_size).n(n_size).k(k_size).iterations(1);
        tester.Test(xnn_f16_gemm_minmax_ukernel_1x8__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
      }
    }
  }
}

// m=1, n=8, k = 8, 12, ..., 40.
TEST(F16_GEMM_MINMAX_1X8__AARCH64_NEONFP16ARITH_LD64, k_div_4) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k_size = 8; k_size <= 40; k_size += 4) {
    GemmMicrokernelTester tester;
    tester.mr(1).nr(8).kr(1).sr(1);
    tester.m(1).n(8).k(k_size);
    tester.Test(xnn_f16_gemm_minmax_ukernel_1x8__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
  }
}

// m=1, n=8, k = 8, 12, ..., 40 with a_stride(43).
TEST(F16_GEMM_MINMAX_1X8__AARCH64_NEONFP16ARITH_LD64, k_div_4_strided_a) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k_size = 8; k_size <= 40; k_size += 4) {
    GemmMicrokernelTester tester;
    tester.mr(1).nr(8).kr(1).sr(1);
    tester.m(1).n(8).k(k_size).a_stride(43);
    tester.Test(xnn_f16_gemm_minmax_ukernel_1x8__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
  }
}

// k = 8, 12, ..., 40, n in [1, 8], m in [1, 1], one iteration per configuration.
TEST(F16_GEMM_MINMAX_1X8__AARCH64_NEONFP16ARITH_LD64, k_div_4_subtile) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k_size = 8; k_size <= 40; k_size += 4) {
    for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 1; ++m_size) {
        GemmMicrokernelTester tester;
        tester.mr(1).nr(8).kr(1).sr(1);
        tester.m(m_size).n(n_size).k(k_size).iterations(1);
        tester.Test(xnn_f16_gemm_minmax_ukernel_1x8__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
      }
    }
  }
}

// m=1, n in [9, 15], k = 1, 6, 11, 16.
TEST(F16_GEMM_MINMAX_1X8__AARCH64_NEONFP16ARITH_LD64, n_gt_8) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n_size = 9; n_size < 16; ++n_size) {
    for (size_t k_size = 1; k_size <= 20; k_size += 5) {
      GemmMicrokernelTester tester;
      tester.mr(1).nr(8).kr(1).sr(1);
      tester.m(1).n(n_size).k(k_size);
      tester.Test(xnn_f16_gemm_minmax_ukernel_1x8__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
    }
  }
}

// m=1, n in [9, 15], k = 1, 6, 11, 16 with cn_stride(11).
TEST(F16_GEMM_MINMAX_1X8__AARCH64_NEONFP16ARITH_LD64, n_gt_8_strided_cn) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n_size = 9; n_size < 16; ++n_size) {
    for (size_t k_size = 1; k_size <= 20; k_size += 5) {
      GemmMicrokernelTester tester;
      tester.mr(1).nr(8).kr(1).sr(1);
      tester.m(1).n(n_size).k(k_size).cn_stride(11);
      tester.Test(xnn_f16_gemm_minmax_ukernel_1x8__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
    }
  }
}

// m=1, n in [9, 15], k = 1, 6, 11, 16 with a_stride(23).
TEST(F16_GEMM_MINMAX_1X8__AARCH64_NEONFP16ARITH_LD64, n_gt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n_size = 9; n_size < 16; ++n_size) {
    for (size_t k_size = 1; k_size <= 20; k_size += 5) {
      GemmMicrokernelTester tester;
      tester.mr(1).nr(8).kr(1).sr(1);
      tester.m(1).n(n_size).k(k_size).a_stride(23);
      tester.Test(xnn_f16_gemm_minmax_ukernel_1x8__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
    }
  }
}

// n in [9, 15], k = 1, 6, 11, 16, m in [1, 1], one iteration per configuration.
TEST(F16_GEMM_MINMAX_1X8__AARCH64_NEONFP16ARITH_LD64, n_gt_8_subtile) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n_size = 9; n_size < 16; ++n_size) {
    for (size_t k_size = 1; k_size <= 20; k_size += 5) {
      for (uint32_t m_size = 1; m_size <= 1; ++m_size) {
        GemmMicrokernelTester tester;
        tester.mr(1).nr(8).kr(1).sr(1);
        tester.m(m_size).n(n_size).k(k_size).iterations(1);
        tester.Test(xnn_f16_gemm_minmax_ukernel_1x8__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
      }
    }
  }
}

// m=1, n = 16, 24, k = 1, 6, 11, 16.
TEST(F16_GEMM_MINMAX_1X8__AARCH64_NEONFP16ARITH_LD64, n_div_8) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n_size = 16; n_size <= 24; n_size += 8) {
    for (size_t k_size = 1; k_size <= 20; k_size += 5) {
      GemmMicrokernelTester tester;
      tester.mr(1).nr(8).kr(1).sr(1);
      tester.m(1).n(n_size).k(k_size);
      tester.Test(xnn_f16_gemm_minmax_ukernel_1x8__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
    }
  }
}

// m=1, n = 16, 24, k = 1, 6, 11, 16 with cn_stride(11).
TEST(F16_GEMM_MINMAX_1X8__AARCH64_NEONFP16ARITH_LD64, n_div_8_strided_cn) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n_size = 16; n_size <= 24; n_size += 8) {
    for (size_t k_size = 1; k_size <= 20; k_size += 5) {
      GemmMicrokernelTester tester;
      tester.mr(1).nr(8).kr(1).sr(1);
      tester.m(1).n(n_size).k(k_size).cn_stride(11);
      tester.Test(xnn_f16_gemm_minmax_ukernel_1x8__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
    }
  }
}

// m=1, n = 16, 24, k = 1, 6, 11, 16 with a_stride(23).
TEST(F16_GEMM_MINMAX_1X8__AARCH64_NEONFP16ARITH_LD64, n_div_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n_size = 16; n_size <= 24; n_size += 8) {
    for (size_t k_size = 1; k_size <= 20; k_size += 5) {
      GemmMicrokernelTester tester;
      tester.mr(1).nr(8).kr(1).sr(1);
      tester.m(1).n(n_size).k(k_size).a_stride(23);
      tester.Test(xnn_f16_gemm_minmax_ukernel_1x8__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
    }
  }
}

// n = 16, 24, k = 1, 6, 11, 16, m in [1, 1], one iteration per configuration.
TEST(F16_GEMM_MINMAX_1X8__AARCH64_NEONFP16ARITH_LD64, n_div_8_subtile) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n_size = 16; n_size <= 24; n_size += 8) {
    for (size_t k_size = 1; k_size <= 20; k_size += 5) {
      for (uint32_t m_size = 1; m_size <= 1; ++m_size) {
        GemmMicrokernelTester tester;
        tester.mr(1).nr(8).kr(1).sr(1);
        tester.m(m_size).n(n_size).k(k_size).iterations(1);
        tester.Test(xnn_f16_gemm_minmax_ukernel_1x8__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
      }
    }
  }
}

TEST(F16_GEMM_MINMAX_1X8__AARCH64_NEONFP16ARITH_LD64,strided_cm_subtile)418 TEST(F16_GEMM_MINMAX_1X8__AARCH64_NEONFP16ARITH_LD64, strided_cm_subtile) { 419 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 420 for (size_t k = 1; k <= 20; k += 5) { 421 for (uint32_t n = 1; n <= 8; n++) { 422 for (uint32_t m = 1; m <= 1; m++) { 423 GemmMicrokernelTester() 424 .mr(1) 425 .nr(8) 426 .kr(1) 427 .sr(1) 428 .m(m) 429 .n(n) 430 .k(k) 431 .cm_stride(11) 432 .iterations(1) 433 .Test(xnn_f16_gemm_minmax_ukernel_1x8__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 434 } 435 } 436 } 437 } 438 TEST(F16_GEMM_MINMAX_1X8__AARCH64_NEONFP16ARITH_LD64,qmin)439 TEST(F16_GEMM_MINMAX_1X8__AARCH64_NEONFP16ARITH_LD64, qmin) { 440 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 441 GemmMicrokernelTester() 442 .mr(1) 443 .nr(8) 444 .kr(1) 445 .sr(1) 446 .m(1) 447 .n(8) 448 .k(4) 449 .qmin(128) 450 .Test(xnn_f16_gemm_minmax_ukernel_1x8__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 451 } 452 TEST(F16_GEMM_MINMAX_1X8__AARCH64_NEONFP16ARITH_LD64,qmax)453 TEST(F16_GEMM_MINMAX_1X8__AARCH64_NEONFP16ARITH_LD64, qmax) { 454 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 455 GemmMicrokernelTester() 456 .mr(1) 457 .nr(8) 458 .kr(1) 459 .sr(1) 460 .m(1) 461 .n(8) 462 .k(4) 463 .qmax(128) 464 .Test(xnn_f16_gemm_minmax_ukernel_1x8__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 465 } 466 TEST(F16_GEMM_MINMAX_1X8__AARCH64_NEONFP16ARITH_LD64,strided_cm)467 TEST(F16_GEMM_MINMAX_1X8__AARCH64_NEONFP16ARITH_LD64, strided_cm) { 468 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 469 GemmMicrokernelTester() 470 .mr(1) 471 .nr(8) 472 .kr(1) 473 .sr(1) 474 .m(1) 475 .n(8) 476 .k(4) 477 .cm_stride(11) 478 .Test(xnn_f16_gemm_minmax_ukernel_1x8__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 479 } 480 #endif // XNN_ENABLE_ARM_FP16 && XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY 481 482 483 #if XNN_ENABLE_ARM_FP16 && XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32,k_eq_2)484 
// Tests for the 1x16 aarch64 neonfp16arith ld32 kernel. Every case configures
// the tester with mr=1, nr=16, kr=1, sr=1; the loops only vary the m/n/k
// values handed to the tester.
// NOTE(review): TEST_REQUIRES_ARM_NEON_FP16_ARITH presumably skips the test on
// hardware without the feature -- confirm against xnnpack/isa-checks.h.

// m=1, n=16, k=2.
TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32, k_eq_2) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  GemmMicrokernelTester tester;
  tester.mr(1).nr(16).kr(1).sr(1);
  tester.m(1).n(16).k(2);
  tester.Test(xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params);
}

// m=1, n=16, k=2 with cn_stride(19).
TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32, strided_cn) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  GemmMicrokernelTester tester;
  tester.mr(1).nr(16).kr(1).sr(1);
  tester.m(1).n(16).k(2).cn_stride(19);
  tester.Test(xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params);
}

// m=1, n=16, k=2 with a_stride(5).
TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32, k_eq_2_strided_a) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  GemmMicrokernelTester tester;
  tester.mr(1).nr(16).kr(1).sr(1);
  tester.m(1).n(16).k(2).a_stride(5);
  tester.Test(xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params);
}

// k=2, n in [1, 16], m in [1, 1], one iteration per configuration.
TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32, k_eq_2_subtile) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n_size = 1; n_size <= 16; ++n_size) {
    for (uint32_t m_size = 1; m_size <= 1; ++m_size) {
      GemmMicrokernelTester tester;
      tester.mr(1).nr(16).kr(1).sr(1);
      tester.m(m_size).n(n_size).k(2).iterations(1);
      tester.Test(xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params);
    }
  }
}

// k=2, n=16, m in [1, 1], one iteration per configuration.
TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32, k_eq_2_subtile_m) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t m_size = 1; m_size <= 1; ++m_size) {
    GemmMicrokernelTester tester;
    tester.mr(1).nr(16).kr(1).sr(1);
    tester.m(m_size).n(16).k(2).iterations(1);
    tester.Test(xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params);
  }
}

// k=2, m=1, n in [1, 16], one iteration per configuration.
TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32, k_eq_2_subtile_n) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n_size = 1; n_size <= 16; ++n_size) {
    GemmMicrokernelTester tester;
    tester.mr(1).nr(16).kr(1).sr(1);
    tester.m(1).n(n_size).k(2).iterations(1);
    tester.Test(xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params);
  }
}

// m=1, n=16, k in [1, 1].
TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32, k_lt_2) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k_size = 1; k_size < 2; ++k_size) {
    GemmMicrokernelTester tester;
    tester.mr(1).nr(16).kr(1).sr(1);
    tester.m(1).n(16).k(k_size);
    tester.Test(xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params);
  }
}

// m=1, n=16, k in [1, 1] with a_stride(5).
TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32, k_lt_2_strided_a) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k_size = 1; k_size < 2; ++k_size) {
    GemmMicrokernelTester tester;
    tester.mr(1).nr(16).kr(1).sr(1);
    tester.m(1).n(16).k(k_size).a_stride(5);
    tester.Test(xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params);
  }
}

// k in [1, 1], n in [1, 16], m in [1, 1], one iteration per configuration.
TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32, k_lt_2_subtile) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k_size = 1; k_size < 2; ++k_size) {
    for (uint32_t n_size = 1; n_size <= 16; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 1; ++m_size) {
        GemmMicrokernelTester tester;
        tester.mr(1).nr(16).kr(1).sr(1);
        tester.m(m_size).n(n_size).k(k_size).iterations(1);
        tester.Test(xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params);
      }
    }
  }
}

// m=1, n=16, k in [3, 3].
TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32, k_gt_2) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k_size = 3; k_size < 4; ++k_size) {
    GemmMicrokernelTester tester;
    tester.mr(1).nr(16).kr(1).sr(1);
    tester.m(1).n(16).k(k_size);
    tester.Test(xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params);
  }
}

// m=1, n=16, k in [3, 3] with a_stride(7).
TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32, k_gt_2_strided_a) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k_size = 3; k_size < 4; ++k_size) {
    GemmMicrokernelTester tester;
    tester.mr(1).nr(16).kr(1).sr(1);
    tester.m(1).n(16).k(k_size).a_stride(7);
    tester.Test(xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params);
  }
}

// k in [3, 3], n in [1, 16], m in [1, 1], one iteration per configuration.
TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32, k_gt_2_subtile) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k_size = 3; k_size < 4; ++k_size) {
    for (uint32_t n_size = 1; n_size <= 16; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 1; ++m_size) {
        GemmMicrokernelTester tester;
        tester.mr(1).nr(16).kr(1).sr(1);
        tester.m(m_size).n(n_size).k(k_size).iterations(1);
        tester.Test(xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params);
      }
    }
  }
}

// m=1, n=16, k = 4, 6, ..., 20.
TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32, k_div_2) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k_size = 4; k_size <= 20; k_size += 2) {
    GemmMicrokernelTester tester;
    tester.mr(1).nr(16).kr(1).sr(1);
    tester.m(1).n(16).k(k_size);
    tester.Test(xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params);
  }
}

// m=1, n=16, k = 4, 6, ..., 20 with a_stride(23).
TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32, k_div_2_strided_a) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k_size = 4; k_size <= 20; k_size += 2) {
    GemmMicrokernelTester tester;
    tester.mr(1).nr(16).kr(1).sr(1);
    tester.m(1).n(16).k(k_size).a_stride(23);
    tester.Test(xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params);
  }
}

// k = 4, 6, ..., 20, n in [1, 16], m in [1, 1], one iteration per configuration.
TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32, k_div_2_subtile) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k_size = 4; k_size <= 20; k_size += 2) {
    for (uint32_t n_size = 1; n_size <= 16; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 1; ++m_size) {
        GemmMicrokernelTester tester;
        tester.mr(1).nr(16).kr(1).sr(1);
        tester.m(m_size).n(n_size).k(k_size).iterations(1);
        tester.Test(xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params);
      }
    }
  }
}

// m=1, n in [17, 31], k = 1, 4, 7, 10.
TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32, n_gt_16) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n_size = 17; n_size < 32; ++n_size) {
    for (size_t k_size = 1; k_size <= 10; k_size += 3) {
      GemmMicrokernelTester tester;
      tester.mr(1).nr(16).kr(1).sr(1);
      tester.m(1).n(n_size).k(k_size);
      tester.Test(xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params);
    }
  }
}

// m=1, n in [17, 31], k = 1, 4, 7, 10 with cn_stride(19).
TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32, n_gt_16_strided_cn) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n_size = 17; n_size < 32; ++n_size) {
    for (size_t k_size = 1; k_size <= 10; k_size += 3) {
      GemmMicrokernelTester tester;
      tester.mr(1).nr(16).kr(1).sr(1);
      tester.m(1).n(n_size).k(k_size).cn_stride(19);
      tester.Test(xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params);
    }
  }
}

// m=1, n in [17, 31], k = 1, 4, 7, 10 with a_stride(13).
TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32, n_gt_16_strided_a) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n_size = 17; n_size < 32; ++n_size) {
    for (size_t k_size = 1; k_size <= 10; k_size += 3) {
      GemmMicrokernelTester tester;
      tester.mr(1).nr(16).kr(1).sr(1);
      tester.m(1).n(n_size).k(k_size).a_stride(13);
      tester.Test(xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params);
    }
  }
}

// n in [17, 31], k = 1, 4, 7, 10, m in [1, 1], one iteration per configuration.
TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32, n_gt_16_subtile) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n_size = 17; n_size < 32; ++n_size) {
    for (size_t k_size = 1; k_size <= 10; k_size += 3) {
      for (uint32_t m_size = 1; m_size <= 1; ++m_size) {
        GemmMicrokernelTester tester;
        tester.mr(1).nr(16).kr(1).sr(1);
        tester.m(m_size).n(n_size).k(k_size).iterations(1);
        tester.Test(xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params);
      }
    }
  }
}

// m=1, n = 32, 48, k = 1, 4, 7, 10.
TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32, n_div_16) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n_size = 32; n_size <= 48; n_size += 16) {
    for (size_t k_size = 1; k_size <= 10; k_size += 3) {
      GemmMicrokernelTester tester;
      tester.mr(1).nr(16).kr(1).sr(1);
      tester.m(1).n(n_size).k(k_size);
      tester.Test(xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params);
    }
  }
}

// m=1, n = 32, 48, k = 1, 4, 7, 10 with cn_stride(19).
TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32, n_div_16_strided_cn) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n_size = 32; n_size <= 48; n_size += 16) {
    for (size_t k_size = 1; k_size <= 10; k_size += 3) {
      GemmMicrokernelTester tester;
      tester.mr(1).nr(16).kr(1).sr(1);
      tester.m(1).n(n_size).k(k_size).cn_stride(19);
      tester.Test(xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params);
    }
  }
}

// m=1, n = 32, 48, k = 1, 4, 7, 10 with a_stride(13).
TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32, n_div_16_strided_a) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n_size = 32; n_size <= 48; n_size += 16) {
    for (size_t k_size = 1; k_size <= 10; k_size += 3) {
      GemmMicrokernelTester tester;
      tester.mr(1).nr(16).kr(1).sr(1);
      tester.m(1).n(n_size).k(k_size).a_stride(13);
      tester.Test(xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params);
    }
  }
}

// n = 32, 48, k = 1, 4, 7, 10, m in [1, 1], one iteration per configuration.
TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32, n_div_16_subtile) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n_size = 32; n_size <= 48; n_size += 16) {
    for (size_t k_size = 1; k_size <= 10; k_size += 3) {
      for (uint32_t m_size = 1; m_size <= 1; ++m_size) {
        GemmMicrokernelTester tester;
        tester.mr(1).nr(16).kr(1).sr(1);
        tester.m(m_size).n(n_size).k(k_size).iterations(1);
        tester.Test(xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params);
      }
    }
  }
}

// k = 1, 4, 7, 10, n in [1, 16], m in [1, 1] with cm_stride(19), one iteration each.
TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32, strided_cm_subtile) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k_size = 1; k_size <= 10; k_size += 3) {
    for (uint32_t n_size = 1; n_size <= 16; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 1; ++m_size) {
        GemmMicrokernelTester tester;
        tester.mr(1).nr(16).kr(1).sr(1);
        tester.m(m_size).n(n_size).k(k_size).cm_stride(19).iterations(1);
        tester.Test(xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params);
      }
    }
  }
}

TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32,qmin)895 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32, qmin) { 896 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 897 GemmMicrokernelTester() 898 .mr(1) 899 .nr(16) 900 .kr(1) 901 .sr(1) 902 .m(1) 903 .n(16) 904 .k(2) 905 .qmin(128) 906 .Test(xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 907 } 908 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32,qmax)909 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32, qmax) { 910 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 911 GemmMicrokernelTester() 912 .mr(1) 913 .nr(16) 914 .kr(1) 915 .sr(1) 916 .m(1) 917 .n(16) 918 .k(2) 919 .qmax(128) 920 .Test(xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 921 } 922 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32,strided_cm)923 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32, strided_cm) { 924 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 925 GemmMicrokernelTester() 926 .mr(1) 927 .nr(16) 928 .kr(1) 929 .sr(1) 930 .m(1) 931 .n(16) 932 .k(2) 933 .cm_stride(19) 934 .Test(xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 935 } 936 #endif // XNN_ENABLE_ARM_FP16 && XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY 937 938 939 #if XNN_ENABLE_ARM_FP16 && XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD64,k_eq_4)940 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD64, k_eq_4) { 941 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 942 GemmMicrokernelTester() 943 .mr(1) 944 .nr(16) 945 .kr(1) 946 .sr(1) 947 .m(1) 948 .n(16) 949 .k(4) 950 .Test(xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 951 } 952 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD64,strided_cn)953 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD64, strided_cn) { 954 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 955 GemmMicrokernelTester() 956 .mr(1) 957 .nr(16) 958 .kr(1) 959 
.sr(1) 960 .m(1) 961 .n(16) 962 .k(4) 963 .cn_stride(19) 964 .Test(xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 965 } 966 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD64,k_eq_4_strided_a)967 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD64, k_eq_4_strided_a) { 968 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 969 GemmMicrokernelTester() 970 .mr(1) 971 .nr(16) 972 .kr(1) 973 .sr(1) 974 .m(1) 975 .n(16) 976 .k(4) 977 .a_stride(7) 978 .Test(xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 979 } 980 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD64,k_eq_4_subtile)981 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD64, k_eq_4_subtile) { 982 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 983 for (uint32_t n = 1; n <= 16; n++) { 984 for (uint32_t m = 1; m <= 1; m++) { 985 GemmMicrokernelTester() 986 .mr(1) 987 .nr(16) 988 .kr(1) 989 .sr(1) 990 .m(m) 991 .n(n) 992 .k(4) 993 .iterations(1) 994 .Test(xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 995 } 996 } 997 } 998 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD64,k_eq_4_subtile_m)999 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD64, k_eq_4_subtile_m) { 1000 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1001 for (uint32_t m = 1; m <= 1; m++) { 1002 GemmMicrokernelTester() 1003 .mr(1) 1004 .nr(16) 1005 .kr(1) 1006 .sr(1) 1007 .m(m) 1008 .n(16) 1009 .k(4) 1010 .iterations(1) 1011 .Test(xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 1012 } 1013 } 1014 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD64,k_eq_4_subtile_n)1015 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD64, k_eq_4_subtile_n) { 1016 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1017 for (uint32_t n = 1; n <= 16; n++) { 1018 GemmMicrokernelTester() 1019 .mr(1) 1020 .nr(16) 1021 .kr(1) 1022 .sr(1) 1023 .m(1) 1024 .n(n) 1025 .k(4) 1026 .iterations(1) 1027 
.Test(xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 1028 } 1029 } 1030 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD64,k_lt_4)1031 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD64, k_lt_4) { 1032 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1033 for (size_t k = 1; k < 4; k++) { 1034 GemmMicrokernelTester() 1035 .mr(1) 1036 .nr(16) 1037 .kr(1) 1038 .sr(1) 1039 .m(1) 1040 .n(16) 1041 .k(k) 1042 .Test(xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 1043 } 1044 } 1045 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD64,k_lt_4_strided_a)1046 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD64, k_lt_4_strided_a) { 1047 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1048 for (size_t k = 1; k < 4; k++) { 1049 GemmMicrokernelTester() 1050 .mr(1) 1051 .nr(16) 1052 .kr(1) 1053 .sr(1) 1054 .m(1) 1055 .n(16) 1056 .k(k) 1057 .a_stride(7) 1058 .Test(xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 1059 } 1060 } 1061 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD64,k_lt_4_subtile)1062 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD64, k_lt_4_subtile) { 1063 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1064 for (size_t k = 1; k < 4; k++) { 1065 for (uint32_t n = 1; n <= 16; n++) { 1066 for (uint32_t m = 1; m <= 1; m++) { 1067 GemmMicrokernelTester() 1068 .mr(1) 1069 .nr(16) 1070 .kr(1) 1071 .sr(1) 1072 .m(m) 1073 .n(n) 1074 .k(k) 1075 .iterations(1) 1076 .Test(xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 1077 } 1078 } 1079 } 1080 } 1081 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD64,k_gt_4)1082 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD64, k_gt_4) { 1083 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1084 for (size_t k = 5; k < 8; k++) { 1085 GemmMicrokernelTester() 1086 .mr(1) 1087 .nr(16) 1088 .kr(1) 1089 .sr(1) 1090 .m(1) 1091 .n(16) 1092 .k(k) 1093 
.Test(xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 1094 } 1095 } 1096 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD64,k_gt_4_strided_a)1097 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD64, k_gt_4_strided_a) { 1098 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1099 for (size_t k = 5; k < 8; k++) { 1100 GemmMicrokernelTester() 1101 .mr(1) 1102 .nr(16) 1103 .kr(1) 1104 .sr(1) 1105 .m(1) 1106 .n(16) 1107 .k(k) 1108 .a_stride(11) 1109 .Test(xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 1110 } 1111 } 1112 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD64,k_gt_4_subtile)1113 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD64, k_gt_4_subtile) { 1114 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1115 for (size_t k = 5; k < 8; k++) { 1116 for (uint32_t n = 1; n <= 16; n++) { 1117 for (uint32_t m = 1; m <= 1; m++) { 1118 GemmMicrokernelTester() 1119 .mr(1) 1120 .nr(16) 1121 .kr(1) 1122 .sr(1) 1123 .m(m) 1124 .n(n) 1125 .k(k) 1126 .iterations(1) 1127 .Test(xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 1128 } 1129 } 1130 } 1131 } 1132 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD64,k_div_4)1133 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD64, k_div_4) { 1134 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1135 for (size_t k = 8; k <= 40; k += 4) { 1136 GemmMicrokernelTester() 1137 .mr(1) 1138 .nr(16) 1139 .kr(1) 1140 .sr(1) 1141 .m(1) 1142 .n(16) 1143 .k(k) 1144 .Test(xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 1145 } 1146 } 1147 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD64,k_div_4_strided_a)1148 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD64, k_div_4_strided_a) { 1149 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1150 for (size_t k = 8; k <= 40; k += 4) { 1151 GemmMicrokernelTester() 1152 .mr(1) 1153 .nr(16) 1154 .kr(1) 1155 .sr(1) 1156 .m(1) 1157 .n(16) 1158 .k(k) 1159 
.a_stride(43) 1160 .Test(xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 1161 } 1162 } 1163 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD64,k_div_4_subtile)1164 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD64, k_div_4_subtile) { 1165 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1166 for (size_t k = 8; k <= 40; k += 4) { 1167 for (uint32_t n = 1; n <= 16; n++) { 1168 for (uint32_t m = 1; m <= 1; m++) { 1169 GemmMicrokernelTester() 1170 .mr(1) 1171 .nr(16) 1172 .kr(1) 1173 .sr(1) 1174 .m(m) 1175 .n(n) 1176 .k(k) 1177 .iterations(1) 1178 .Test(xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 1179 } 1180 } 1181 } 1182 } 1183 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD64,n_gt_16)1184 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD64, n_gt_16) { 1185 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1186 for (uint32_t n = 17; n < 32; n++) { 1187 for (size_t k = 1; k <= 20; k += 5) { 1188 GemmMicrokernelTester() 1189 .mr(1) 1190 .nr(16) 1191 .kr(1) 1192 .sr(1) 1193 .m(1) 1194 .n(n) 1195 .k(k) 1196 .Test(xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 1197 } 1198 } 1199 } 1200 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD64,n_gt_16_strided_cn)1201 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD64, n_gt_16_strided_cn) { 1202 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1203 for (uint32_t n = 17; n < 32; n++) { 1204 for (size_t k = 1; k <= 20; k += 5) { 1205 GemmMicrokernelTester() 1206 .mr(1) 1207 .nr(16) 1208 .kr(1) 1209 .sr(1) 1210 .m(1) 1211 .n(n) 1212 .k(k) 1213 .cn_stride(19) 1214 .Test(xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 1215 } 1216 } 1217 } 1218 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD64,n_gt_16_strided_a)1219 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD64, n_gt_16_strided_a) { 1220 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1221 for (uint32_t n = 17; n < 32; 
n++) { 1222 for (size_t k = 1; k <= 20; k += 5) { 1223 GemmMicrokernelTester() 1224 .mr(1) 1225 .nr(16) 1226 .kr(1) 1227 .sr(1) 1228 .m(1) 1229 .n(n) 1230 .k(k) 1231 .a_stride(23) 1232 .Test(xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 1233 } 1234 } 1235 } 1236 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD64,n_gt_16_subtile)1237 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD64, n_gt_16_subtile) { 1238 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1239 for (uint32_t n = 17; n < 32; n++) { 1240 for (size_t k = 1; k <= 20; k += 5) { 1241 for (uint32_t m = 1; m <= 1; m++) { 1242 GemmMicrokernelTester() 1243 .mr(1) 1244 .nr(16) 1245 .kr(1) 1246 .sr(1) 1247 .m(m) 1248 .n(n) 1249 .k(k) 1250 .iterations(1) 1251 .Test(xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 1252 } 1253 } 1254 } 1255 } 1256 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD64,n_div_16)1257 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD64, n_div_16) { 1258 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1259 for (uint32_t n = 32; n <= 48; n += 16) { 1260 for (size_t k = 1; k <= 20; k += 5) { 1261 GemmMicrokernelTester() 1262 .mr(1) 1263 .nr(16) 1264 .kr(1) 1265 .sr(1) 1266 .m(1) 1267 .n(n) 1268 .k(k) 1269 .Test(xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 1270 } 1271 } 1272 } 1273 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD64,n_div_16_strided_cn)1274 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD64, n_div_16_strided_cn) { 1275 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1276 for (uint32_t n = 32; n <= 48; n += 16) { 1277 for (size_t k = 1; k <= 20; k += 5) { 1278 GemmMicrokernelTester() 1279 .mr(1) 1280 .nr(16) 1281 .kr(1) 1282 .sr(1) 1283 .m(1) 1284 .n(n) 1285 .k(k) 1286 .cn_stride(19) 1287 .Test(xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 1288 } 1289 } 1290 } 1291 
TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD64,n_div_16_strided_a)1292 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD64, n_div_16_strided_a) { 1293 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1294 for (uint32_t n = 32; n <= 48; n += 16) { 1295 for (size_t k = 1; k <= 20; k += 5) { 1296 GemmMicrokernelTester() 1297 .mr(1) 1298 .nr(16) 1299 .kr(1) 1300 .sr(1) 1301 .m(1) 1302 .n(n) 1303 .k(k) 1304 .a_stride(23) 1305 .Test(xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 1306 } 1307 } 1308 } 1309 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD64,n_div_16_subtile)1310 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD64, n_div_16_subtile) { 1311 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1312 for (uint32_t n = 32; n <= 48; n += 16) { 1313 for (size_t k = 1; k <= 20; k += 5) { 1314 for (uint32_t m = 1; m <= 1; m++) { 1315 GemmMicrokernelTester() 1316 .mr(1) 1317 .nr(16) 1318 .kr(1) 1319 .sr(1) 1320 .m(m) 1321 .n(n) 1322 .k(k) 1323 .iterations(1) 1324 .Test(xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 1325 } 1326 } 1327 } 1328 } 1329 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD64,strided_cm_subtile)1330 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD64, strided_cm_subtile) { 1331 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1332 for (size_t k = 1; k <= 20; k += 5) { 1333 for (uint32_t n = 1; n <= 16; n++) { 1334 for (uint32_t m = 1; m <= 1; m++) { 1335 GemmMicrokernelTester() 1336 .mr(1) 1337 .nr(16) 1338 .kr(1) 1339 .sr(1) 1340 .m(m) 1341 .n(n) 1342 .k(k) 1343 .cm_stride(19) 1344 .iterations(1) 1345 .Test(xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 1346 } 1347 } 1348 } 1349 } 1350 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD64,qmin)1351 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD64, qmin) { 1352 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1353 GemmMicrokernelTester() 1354 .mr(1) 1355 .nr(16) 1356 .kr(1) 1357 .sr(1) 
1358 .m(1) 1359 .n(16) 1360 .k(4) 1361 .qmin(128) 1362 .Test(xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 1363 } 1364 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD64,qmax)1365 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD64, qmax) { 1366 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1367 GemmMicrokernelTester() 1368 .mr(1) 1369 .nr(16) 1370 .kr(1) 1371 .sr(1) 1372 .m(1) 1373 .n(16) 1374 .k(4) 1375 .qmax(128) 1376 .Test(xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 1377 } 1378 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD64,strided_cm)1379 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD64, strided_cm) { 1380 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1381 GemmMicrokernelTester() 1382 .mr(1) 1383 .nr(16) 1384 .kr(1) 1385 .sr(1) 1386 .m(1) 1387 .n(16) 1388 .k(4) 1389 .cm_stride(19) 1390 .Test(xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 1391 } 1392 #endif // XNN_ENABLE_ARM_FP16 && XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY 1393 1394 1395 #if XNN_ENABLE_ARM_FP16 && XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY TEST(F16_GEMM_MINMAX_4X8__AARCH64_NEONFP16ARITH_LD64,k_eq_4)1396 TEST(F16_GEMM_MINMAX_4X8__AARCH64_NEONFP16ARITH_LD64, k_eq_4) { 1397 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1398 GemmMicrokernelTester() 1399 .mr(4) 1400 .nr(8) 1401 .kr(1) 1402 .sr(1) 1403 .m(4) 1404 .n(8) 1405 .k(4) 1406 .Test(xnn_f16_gemm_minmax_ukernel_4x8__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 1407 } 1408 TEST(F16_GEMM_MINMAX_4X8__AARCH64_NEONFP16ARITH_LD64,strided_cn)1409 TEST(F16_GEMM_MINMAX_4X8__AARCH64_NEONFP16ARITH_LD64, strided_cn) { 1410 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1411 GemmMicrokernelTester() 1412 .mr(4) 1413 .nr(8) 1414 .kr(1) 1415 .sr(1) 1416 .m(4) 1417 .n(8) 1418 .k(4) 1419 .cn_stride(11) 1420 .Test(xnn_f16_gemm_minmax_ukernel_4x8__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 1421 } 1422 
// 4x8 LD64: m = 4, n = 8, k = 4 with a_stride(7).
TEST(F16_GEMM_MINMAX_4X8__AARCH64_NEONFP16ARITH_LD64, k_eq_4_strided_a) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  GemmMicrokernelTester()
    .mr(4)
    .nr(8)
    .kr(1)
    .sr(1)
    .m(4)
    .n(8)
    .k(4)
    .a_stride(7)
    .Test(xnn_f16_gemm_minmax_ukernel_4x8__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
}

// 4x8 LD64: k = 4 for every (m, n) with m in [1, 4] and n in [1, 8]; one iteration per case.
TEST(F16_GEMM_MINMAX_4X8__AARCH64_NEONFP16ARITH_LD64, k_eq_4_subtile) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n = 1; n <= 8; n++) {
    for (uint32_t m = 1; m <= 4; m++) {
      GemmMicrokernelTester()
        .mr(4)
        .nr(8)
        .kr(1)
        .sr(1)
        .m(m)
        .n(n)
        .k(4)
        .iterations(1)
        .Test(xnn_f16_gemm_minmax_ukernel_4x8__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
    }
  }
}

// 4x8 LD64: k = 4, n = 8, m swept over [1, 4]; one iteration per case.
TEST(F16_GEMM_MINMAX_4X8__AARCH64_NEONFP16ARITH_LD64, k_eq_4_subtile_m) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t m = 1; m <= 4; m++) {
    GemmMicrokernelTester()
      .mr(4)
      .nr(8)
      .kr(1)
      .sr(1)
      .m(m)
      .n(8)
      .k(4)
      .iterations(1)
      .Test(xnn_f16_gemm_minmax_ukernel_4x8__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
  }
}

// 4x8 LD64: k = 4, m = 4, n swept over [1, 8]; one iteration per case.
TEST(F16_GEMM_MINMAX_4X8__AARCH64_NEONFP16ARITH_LD64, k_eq_4_subtile_n) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n = 1; n <= 8; n++) {
    GemmMicrokernelTester()
      .mr(4)
      .nr(8)
      .kr(1)
      .sr(1)
      .m(4)
      .n(n)
      .k(4)
      .iterations(1)
      .Test(xnn_f16_gemm_minmax_ukernel_4x8__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
  }
}
// 4x8 LD64: m = 4, n = 8, k swept over [1, 3].
TEST(F16_GEMM_MINMAX_4X8__AARCH64_NEONFP16ARITH_LD64, k_lt_4) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k = 1; k < 4; k++) {
    GemmMicrokernelTester()
      .mr(4)
      .nr(8)
      .kr(1)
      .sr(1)
      .m(4)
      .n(8)
      .k(k)
      .Test(xnn_f16_gemm_minmax_ukernel_4x8__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
  }
}

// 4x8 LD64: m = 4, n = 8, k swept over [1, 3], with a_stride(7).
TEST(F16_GEMM_MINMAX_4X8__AARCH64_NEONFP16ARITH_LD64, k_lt_4_strided_a) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k = 1; k < 4; k++) {
    GemmMicrokernelTester()
      .mr(4)
      .nr(8)
      .kr(1)
      .sr(1)
      .m(4)
      .n(8)
      .k(k)
      .a_stride(7)
      .Test(xnn_f16_gemm_minmax_ukernel_4x8__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
  }
}

// 4x8 LD64: k in [1, 3] for every (m, n) with m in [1, 4] and n in [1, 8]; one iteration per case.
TEST(F16_GEMM_MINMAX_4X8__AARCH64_NEONFP16ARITH_LD64, k_lt_4_subtile) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k = 1; k < 4; k++) {
    for (uint32_t n = 1; n <= 8; n++) {
      for (uint32_t m = 1; m <= 4; m++) {
        GemmMicrokernelTester()
          .mr(4)
          .nr(8)
          .kr(1)
          .sr(1)
          .m(m)
          .n(n)
          .k(k)
          .iterations(1)
          .Test(xnn_f16_gemm_minmax_ukernel_4x8__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
      }
    }
  }
}

// 4x8 LD64: m = 4, n = 8, k swept over [5, 7].
TEST(F16_GEMM_MINMAX_4X8__AARCH64_NEONFP16ARITH_LD64, k_gt_4) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k = 5; k < 8; k++) {
    GemmMicrokernelTester()
      .mr(4)
      .nr(8)
      .kr(1)
      .sr(1)
      .m(4)
      .n(8)
      .k(k)
      .Test(xnn_f16_gemm_minmax_ukernel_4x8__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
  }
}
// 4x8 LD64: m = 4, n = 8, k swept over [5, 7], with a_stride(11).
TEST(F16_GEMM_MINMAX_4X8__AARCH64_NEONFP16ARITH_LD64, k_gt_4_strided_a) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k = 5; k < 8; k++) {
    GemmMicrokernelTester()
      .mr(4)
      .nr(8)
      .kr(1)
      .sr(1)
      .m(4)
      .n(8)
      .k(k)
      .a_stride(11)
      .Test(xnn_f16_gemm_minmax_ukernel_4x8__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
  }
}

// 4x8 LD64: k in [5, 7] for every (m, n) with m in [1, 4] and n in [1, 8]; one iteration per case.
TEST(F16_GEMM_MINMAX_4X8__AARCH64_NEONFP16ARITH_LD64, k_gt_4_subtile) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k = 5; k < 8; k++) {
    for (uint32_t n = 1; n <= 8; n++) {
      for (uint32_t m = 1; m <= 4; m++) {
        GemmMicrokernelTester()
          .mr(4)
          .nr(8)
          .kr(1)
          .sr(1)
          .m(m)
          .n(n)
          .k(k)
          .iterations(1)
          .Test(xnn_f16_gemm_minmax_ukernel_4x8__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
      }
    }
  }
}

// 4x8 LD64: m = 4, n = 8, k in {8, 12, ..., 40}.
TEST(F16_GEMM_MINMAX_4X8__AARCH64_NEONFP16ARITH_LD64, k_div_4) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k = 8; k <= 40; k += 4) {
    GemmMicrokernelTester()
      .mr(4)
      .nr(8)
      .kr(1)
      .sr(1)
      .m(4)
      .n(8)
      .k(k)
      .Test(xnn_f16_gemm_minmax_ukernel_4x8__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
  }
}

// 4x8 LD64: m = 4, n = 8, k in {8, 12, ..., 40}, with a_stride(43).
TEST(F16_GEMM_MINMAX_4X8__AARCH64_NEONFP16ARITH_LD64, k_div_4_strided_a) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k = 8; k <= 40; k += 4) {
    GemmMicrokernelTester()
      .mr(4)
      .nr(8)
      .kr(1)
      .sr(1)
      .m(4)
      .n(8)
      .k(k)
      .a_stride(43)
      .Test(xnn_f16_gemm_minmax_ukernel_4x8__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
  }
}
// 4x8 LD64: k in {8, 12, ..., 40} for every (m, n) with m in [1, 4] and n in [1, 8]; one iteration per case.
TEST(F16_GEMM_MINMAX_4X8__AARCH64_NEONFP16ARITH_LD64, k_div_4_subtile) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k = 8; k <= 40; k += 4) {
    for (uint32_t n = 1; n <= 8; n++) {
      for (uint32_t m = 1; m <= 4; m++) {
        GemmMicrokernelTester()
          .mr(4)
          .nr(8)
          .kr(1)
          .sr(1)
          .m(m)
          .n(n)
          .k(k)
          .iterations(1)
          .Test(xnn_f16_gemm_minmax_ukernel_4x8__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
      }
    }
  }
}

// 4x8 LD64: m = 4, n in [9, 15], k in {1, 6, 11, 16}.
TEST(F16_GEMM_MINMAX_4X8__AARCH64_NEONFP16ARITH_LD64, n_gt_8) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n = 9; n < 16; n++) {
    for (size_t k = 1; k <= 20; k += 5) {
      GemmMicrokernelTester()
        .mr(4)
        .nr(8)
        .kr(1)
        .sr(1)
        .m(4)
        .n(n)
        .k(k)
        .Test(xnn_f16_gemm_minmax_ukernel_4x8__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
    }
  }
}

// 4x8 LD64: m = 4, n in [9, 15], k in {1, 6, 11, 16}, with cn_stride(11).
TEST(F16_GEMM_MINMAX_4X8__AARCH64_NEONFP16ARITH_LD64, n_gt_8_strided_cn) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n = 9; n < 16; n++) {
    for (size_t k = 1; k <= 20; k += 5) {
      GemmMicrokernelTester()
        .mr(4)
        .nr(8)
        .kr(1)
        .sr(1)
        .m(4)
        .n(n)
        .k(k)
        .cn_stride(11)
        .Test(xnn_f16_gemm_minmax_ukernel_4x8__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
    }
  }
}

// 4x8 LD64: m = 4, n in [9, 15], k in {1, 6, 11, 16}, with a_stride(23).
TEST(F16_GEMM_MINMAX_4X8__AARCH64_NEONFP16ARITH_LD64, n_gt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n = 9; n < 16; n++) {
    for (size_t k = 1; k <= 20; k += 5) {
      GemmMicrokernelTester()
        .mr(4)
        .nr(8)
        .kr(1)
        .sr(1)
        .m(4)
        .n(n)
        .k(k)
        .a_stride(23)
        .Test(xnn_f16_gemm_minmax_ukernel_4x8__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
    }
  }
}

// 4x8 LD64: n in [9, 15], k in {1, 6, 11, 16}, m in [1, 4]; one iteration per case.
TEST(F16_GEMM_MINMAX_4X8__AARCH64_NEONFP16ARITH_LD64, n_gt_8_subtile) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n = 9; n < 16; n++) {
    for (size_t k = 1; k <= 20; k += 5) {
      for (uint32_t m = 1; m <= 4; m++) {
        GemmMicrokernelTester()
          .mr(4)
          .nr(8)
          .kr(1)
          .sr(1)
          .m(m)
          .n(n)
          .k(k)
          .iterations(1)
          .Test(xnn_f16_gemm_minmax_ukernel_4x8__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
      }
    }
  }
}

// 4x8 LD64: m = 4, n in {16, 24}, k in {1, 6, 11, 16}.
TEST(F16_GEMM_MINMAX_4X8__AARCH64_NEONFP16ARITH_LD64, n_div_8) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n = 16; n <= 24; n += 8) {
    for (size_t k = 1; k <= 20; k += 5) {
      GemmMicrokernelTester()
        .mr(4)
        .nr(8)
        .kr(1)
        .sr(1)
        .m(4)
        .n(n)
        .k(k)
        .Test(xnn_f16_gemm_minmax_ukernel_4x8__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
    }
  }
}

// 4x8 LD64: m = 4, n in {16, 24}, k in {1, 6, 11, 16}, with cn_stride(11).
TEST(F16_GEMM_MINMAX_4X8__AARCH64_NEONFP16ARITH_LD64, n_div_8_strided_cn) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n = 16; n <= 24; n += 8) {
    for (size_t k = 1; k <= 20; k += 5) {
      GemmMicrokernelTester()
        .mr(4)
        .nr(8)
        .kr(1)
        .sr(1)
        .m(4)
        .n(n)
        .k(k)
        .cn_stride(11)
        .Test(xnn_f16_gemm_minmax_ukernel_4x8__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
    }
  }
}

// 4x8 LD64: m = 4, n in {16, 24}, k in {1, 6, 11, 16}, with a_stride(23).
TEST(F16_GEMM_MINMAX_4X8__AARCH64_NEONFP16ARITH_LD64, n_div_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n = 16; n <= 24; n += 8) {
    for (size_t k = 1; k <= 20; k += 5) {
      GemmMicrokernelTester()
        .mr(4)
        .nr(8)
        .kr(1)
        .sr(1)
        .m(4)
        .n(n)
        .k(k)
        .a_stride(23)
        .Test(xnn_f16_gemm_minmax_ukernel_4x8__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
    }
  }
}

// 4x8 LD64: n in {16, 24}, k in {1, 6, 11, 16}, m in [1, 4]; one iteration per case.
TEST(F16_GEMM_MINMAX_4X8__AARCH64_NEONFP16ARITH_LD64, n_div_8_subtile) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n = 16; n <= 24; n += 8) {
    for (size_t k = 1; k <= 20; k += 5) {
      for (uint32_t m = 1; m <= 4; m++) {
        GemmMicrokernelTester()
          .mr(4)
          .nr(8)
          .kr(1)
          .sr(1)
          .m(m)
          .n(n)
          .k(k)
          .iterations(1)
          .Test(xnn_f16_gemm_minmax_ukernel_4x8__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
      }
    }
  }
}

// 4x8 LD64: k in {1, 6, 11, 16}, n in [1, 8], m in [1, 4], with cm_stride(11); one iteration per case.
TEST(F16_GEMM_MINMAX_4X8__AARCH64_NEONFP16ARITH_LD64, strided_cm_subtile) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k = 1; k <= 20; k += 5) {
    for (uint32_t n = 1; n <= 8; n++) {
      for (uint32_t m = 1; m <= 4; m++) {
        GemmMicrokernelTester()
          .mr(4)
          .nr(8)
          .kr(1)
          .sr(1)
          .m(m)
          .n(n)
          .k(k)
          .cm_stride(11)
          .iterations(1)
          .Test(xnn_f16_gemm_minmax_ukernel_4x8__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
      }
    }
  }
}

// 4x8 LD64: m = 4, n = 8, k = 4 with qmin(128).
TEST(F16_GEMM_MINMAX_4X8__AARCH64_NEONFP16ARITH_LD64, qmin) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  GemmMicrokernelTester()
    .mr(4)
    .nr(8)
    .kr(1)
    .sr(1)
    .m(4)
    .n(8)
    .k(4)
    .qmin(128)
    .Test(xnn_f16_gemm_minmax_ukernel_4x8__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
}
TEST(F16_GEMM_MINMAX_4X8__AARCH64_NEONFP16ARITH_LD64, qmax) { 1822 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1823 GemmMicrokernelTester() 1824 .mr(4) 1825 .nr(8) 1826 .kr(1) 1827 .sr(1) 1828 .m(4) 1829 .n(8) 1830 .k(4) 1831 .qmax(128) 1832 .Test(xnn_f16_gemm_minmax_ukernel_4x8__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 1833 } 1834 TEST(F16_GEMM_MINMAX_4X8__AARCH64_NEONFP16ARITH_LD64,strided_cm)1835 TEST(F16_GEMM_MINMAX_4X8__AARCH64_NEONFP16ARITH_LD64, strided_cm) { 1836 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1837 GemmMicrokernelTester() 1838 .mr(4) 1839 .nr(8) 1840 .kr(1) 1841 .sr(1) 1842 .m(4) 1843 .n(8) 1844 .k(4) 1845 .cm_stride(11) 1846 .Test(xnn_f16_gemm_minmax_ukernel_4x8__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 1847 } 1848 #endif // XNN_ENABLE_ARM_FP16 && XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY 1849 1850 1851 #if XNN_ENABLE_ARM_FP16 && XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32,k_eq_2)1852 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32, k_eq_2) { 1853 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1854 GemmMicrokernelTester() 1855 .mr(4) 1856 .nr(16) 1857 .kr(1) 1858 .sr(1) 1859 .m(4) 1860 .n(16) 1861 .k(2) 1862 .Test(xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 1863 } 1864 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32,strided_cn)1865 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32, strided_cn) { 1866 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1867 GemmMicrokernelTester() 1868 .mr(4) 1869 .nr(16) 1870 .kr(1) 1871 .sr(1) 1872 .m(4) 1873 .n(16) 1874 .k(2) 1875 .cn_stride(19) 1876 .Test(xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 1877 } 1878 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32,k_eq_2_strided_a)1879 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32, k_eq_2_strided_a) { 1880 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1881 GemmMicrokernelTester() 1882 .mr(4) 1883 .nr(16) 
1884 .kr(1) 1885 .sr(1) 1886 .m(4) 1887 .n(16) 1888 .k(2) 1889 .a_stride(5) 1890 .Test(xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 1891 } 1892 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32,k_eq_2_subtile)1893 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32, k_eq_2_subtile) { 1894 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1895 for (uint32_t n = 1; n <= 16; n++) { 1896 for (uint32_t m = 1; m <= 4; m++) { 1897 GemmMicrokernelTester() 1898 .mr(4) 1899 .nr(16) 1900 .kr(1) 1901 .sr(1) 1902 .m(m) 1903 .n(n) 1904 .k(2) 1905 .iterations(1) 1906 .Test(xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 1907 } 1908 } 1909 } 1910 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32,k_eq_2_subtile_m)1911 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32, k_eq_2_subtile_m) { 1912 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1913 for (uint32_t m = 1; m <= 4; m++) { 1914 GemmMicrokernelTester() 1915 .mr(4) 1916 .nr(16) 1917 .kr(1) 1918 .sr(1) 1919 .m(m) 1920 .n(16) 1921 .k(2) 1922 .iterations(1) 1923 .Test(xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 1924 } 1925 } 1926 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32,k_eq_2_subtile_n)1927 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32, k_eq_2_subtile_n) { 1928 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1929 for (uint32_t n = 1; n <= 16; n++) { 1930 GemmMicrokernelTester() 1931 .mr(4) 1932 .nr(16) 1933 .kr(1) 1934 .sr(1) 1935 .m(4) 1936 .n(n) 1937 .k(2) 1938 .iterations(1) 1939 .Test(xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 1940 } 1941 } 1942 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32,k_lt_2)1943 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32, k_lt_2) { 1944 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1945 for (size_t k = 1; k < 2; k++) { 1946 GemmMicrokernelTester() 1947 .mr(4) 1948 .nr(16) 1949 .kr(1) 1950 .sr(1) 
1951 .m(4) 1952 .n(16) 1953 .k(k) 1954 .Test(xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 1955 } 1956 } 1957 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32,k_lt_2_strided_a)1958 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32, k_lt_2_strided_a) { 1959 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1960 for (size_t k = 1; k < 2; k++) { 1961 GemmMicrokernelTester() 1962 .mr(4) 1963 .nr(16) 1964 .kr(1) 1965 .sr(1) 1966 .m(4) 1967 .n(16) 1968 .k(k) 1969 .a_stride(5) 1970 .Test(xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 1971 } 1972 } 1973 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32,k_lt_2_subtile)1974 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32, k_lt_2_subtile) { 1975 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1976 for (size_t k = 1; k < 2; k++) { 1977 for (uint32_t n = 1; n <= 16; n++) { 1978 for (uint32_t m = 1; m <= 4; m++) { 1979 GemmMicrokernelTester() 1980 .mr(4) 1981 .nr(16) 1982 .kr(1) 1983 .sr(1) 1984 .m(m) 1985 .n(n) 1986 .k(k) 1987 .iterations(1) 1988 .Test(xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 1989 } 1990 } 1991 } 1992 } 1993 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32,k_gt_2)1994 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32, k_gt_2) { 1995 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1996 for (size_t k = 3; k < 4; k++) { 1997 GemmMicrokernelTester() 1998 .mr(4) 1999 .nr(16) 2000 .kr(1) 2001 .sr(1) 2002 .m(4) 2003 .n(16) 2004 .k(k) 2005 .Test(xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 2006 } 2007 } 2008 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32,k_gt_2_strided_a)2009 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32, k_gt_2_strided_a) { 2010 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2011 for (size_t k = 3; k < 4; k++) { 2012 GemmMicrokernelTester() 2013 .mr(4) 2014 .nr(16) 2015 .kr(1) 2016 .sr(1) 2017 .m(4) 2018 
.n(16) 2019 .k(k) 2020 .a_stride(7) 2021 .Test(xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 2022 } 2023 } 2024 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32,k_gt_2_subtile)2025 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32, k_gt_2_subtile) { 2026 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2027 for (size_t k = 3; k < 4; k++) { 2028 for (uint32_t n = 1; n <= 16; n++) { 2029 for (uint32_t m = 1; m <= 4; m++) { 2030 GemmMicrokernelTester() 2031 .mr(4) 2032 .nr(16) 2033 .kr(1) 2034 .sr(1) 2035 .m(m) 2036 .n(n) 2037 .k(k) 2038 .iterations(1) 2039 .Test(xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 2040 } 2041 } 2042 } 2043 } 2044 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32,k_div_2)2045 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32, k_div_2) { 2046 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2047 for (size_t k = 4; k <= 20; k += 2) { 2048 GemmMicrokernelTester() 2049 .mr(4) 2050 .nr(16) 2051 .kr(1) 2052 .sr(1) 2053 .m(4) 2054 .n(16) 2055 .k(k) 2056 .Test(xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 2057 } 2058 } 2059 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32,k_div_2_strided_a)2060 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32, k_div_2_strided_a) { 2061 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2062 for (size_t k = 4; k <= 20; k += 2) { 2063 GemmMicrokernelTester() 2064 .mr(4) 2065 .nr(16) 2066 .kr(1) 2067 .sr(1) 2068 .m(4) 2069 .n(16) 2070 .k(k) 2071 .a_stride(23) 2072 .Test(xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 2073 } 2074 } 2075 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32,k_div_2_subtile)2076 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32, k_div_2_subtile) { 2077 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2078 for (size_t k = 4; k <= 20; k += 2) { 2079 for (uint32_t n = 1; n <= 16; n++) { 2080 for (uint32_t m = 1; m <= 4; 
m++) { 2081 GemmMicrokernelTester() 2082 .mr(4) 2083 .nr(16) 2084 .kr(1) 2085 .sr(1) 2086 .m(m) 2087 .n(n) 2088 .k(k) 2089 .iterations(1) 2090 .Test(xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 2091 } 2092 } 2093 } 2094 } 2095 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32,n_gt_16)2096 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32, n_gt_16) { 2097 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2098 for (uint32_t n = 17; n < 32; n++) { 2099 for (size_t k = 1; k <= 10; k += 3) { 2100 GemmMicrokernelTester() 2101 .mr(4) 2102 .nr(16) 2103 .kr(1) 2104 .sr(1) 2105 .m(4) 2106 .n(n) 2107 .k(k) 2108 .Test(xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 2109 } 2110 } 2111 } 2112 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32,n_gt_16_strided_cn)2113 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32, n_gt_16_strided_cn) { 2114 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2115 for (uint32_t n = 17; n < 32; n++) { 2116 for (size_t k = 1; k <= 10; k += 3) { 2117 GemmMicrokernelTester() 2118 .mr(4) 2119 .nr(16) 2120 .kr(1) 2121 .sr(1) 2122 .m(4) 2123 .n(n) 2124 .k(k) 2125 .cn_stride(19) 2126 .Test(xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 2127 } 2128 } 2129 } 2130 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32,n_gt_16_strided_a)2131 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32, n_gt_16_strided_a) { 2132 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2133 for (uint32_t n = 17; n < 32; n++) { 2134 for (size_t k = 1; k <= 10; k += 3) { 2135 GemmMicrokernelTester() 2136 .mr(4) 2137 .nr(16) 2138 .kr(1) 2139 .sr(1) 2140 .m(4) 2141 .n(n) 2142 .k(k) 2143 .a_stride(13) 2144 .Test(xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 2145 } 2146 } 2147 } 2148 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32,n_gt_16_subtile)2149 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32, 
n_gt_16_subtile) { 2150 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2151 for (uint32_t n = 17; n < 32; n++) { 2152 for (size_t k = 1; k <= 10; k += 3) { 2153 for (uint32_t m = 1; m <= 4; m++) { 2154 GemmMicrokernelTester() 2155 .mr(4) 2156 .nr(16) 2157 .kr(1) 2158 .sr(1) 2159 .m(m) 2160 .n(n) 2161 .k(k) 2162 .iterations(1) 2163 .Test(xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 2164 } 2165 } 2166 } 2167 } 2168 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32,n_div_16)2169 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32, n_div_16) { 2170 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2171 for (uint32_t n = 32; n <= 48; n += 16) { 2172 for (size_t k = 1; k <= 10; k += 3) { 2173 GemmMicrokernelTester() 2174 .mr(4) 2175 .nr(16) 2176 .kr(1) 2177 .sr(1) 2178 .m(4) 2179 .n(n) 2180 .k(k) 2181 .Test(xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 2182 } 2183 } 2184 } 2185 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32,n_div_16_strided_cn)2186 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32, n_div_16_strided_cn) { 2187 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2188 for (uint32_t n = 32; n <= 48; n += 16) { 2189 for (size_t k = 1; k <= 10; k += 3) { 2190 GemmMicrokernelTester() 2191 .mr(4) 2192 .nr(16) 2193 .kr(1) 2194 .sr(1) 2195 .m(4) 2196 .n(n) 2197 .k(k) 2198 .cn_stride(19) 2199 .Test(xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 2200 } 2201 } 2202 } 2203 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32,n_div_16_strided_a)2204 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32, n_div_16_strided_a) { 2205 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2206 for (uint32_t n = 32; n <= 48; n += 16) { 2207 for (size_t k = 1; k <= 10; k += 3) { 2208 GemmMicrokernelTester() 2209 .mr(4) 2210 .nr(16) 2211 .kr(1) 2212 .sr(1) 2213 .m(4) 2214 .n(n) 2215 .k(k) 2216 .a_stride(13) 2217 
.Test(xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 2218 } 2219 } 2220 } 2221 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32,n_div_16_subtile)2222 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32, n_div_16_subtile) { 2223 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2224 for (uint32_t n = 32; n <= 48; n += 16) { 2225 for (size_t k = 1; k <= 10; k += 3) { 2226 for (uint32_t m = 1; m <= 4; m++) { 2227 GemmMicrokernelTester() 2228 .mr(4) 2229 .nr(16) 2230 .kr(1) 2231 .sr(1) 2232 .m(m) 2233 .n(n) 2234 .k(k) 2235 .iterations(1) 2236 .Test(xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 2237 } 2238 } 2239 } 2240 } 2241 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32,strided_cm_subtile)2242 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32, strided_cm_subtile) { 2243 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2244 for (size_t k = 1; k <= 10; k += 3) { 2245 for (uint32_t n = 1; n <= 16; n++) { 2246 for (uint32_t m = 1; m <= 4; m++) { 2247 GemmMicrokernelTester() 2248 .mr(4) 2249 .nr(16) 2250 .kr(1) 2251 .sr(1) 2252 .m(m) 2253 .n(n) 2254 .k(k) 2255 .cm_stride(19) 2256 .iterations(1) 2257 .Test(xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 2258 } 2259 } 2260 } 2261 } 2262 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32,qmin)2263 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32, qmin) { 2264 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2265 GemmMicrokernelTester() 2266 .mr(4) 2267 .nr(16) 2268 .kr(1) 2269 .sr(1) 2270 .m(4) 2271 .n(16) 2272 .k(2) 2273 .qmin(128) 2274 .Test(xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 2275 } 2276 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32,qmax)2277 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32, qmax) { 2278 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2279 GemmMicrokernelTester() 2280 .mr(4) 2281 .nr(16) 2282 .kr(1) 2283 .sr(1) 2284 
.m(4) 2285 .n(16) 2286 .k(2) 2287 .qmax(128) 2288 .Test(xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 2289 } 2290 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32,strided_cm)2291 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32, strided_cm) { 2292 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2293 GemmMicrokernelTester() 2294 .mr(4) 2295 .nr(16) 2296 .kr(1) 2297 .sr(1) 2298 .m(4) 2299 .n(16) 2300 .k(2) 2301 .cm_stride(19) 2302 .Test(xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 2303 } 2304 #endif // XNN_ENABLE_ARM_FP16 && XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY 2305 2306 2307 #if XNN_ENABLE_ARM_FP16 && XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD64,k_eq_4)2308 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD64, k_eq_4) { 2309 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2310 GemmMicrokernelTester() 2311 .mr(4) 2312 .nr(16) 2313 .kr(1) 2314 .sr(1) 2315 .m(4) 2316 .n(16) 2317 .k(4) 2318 .Test(xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 2319 } 2320 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD64,strided_cn)2321 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD64, strided_cn) { 2322 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2323 GemmMicrokernelTester() 2324 .mr(4) 2325 .nr(16) 2326 .kr(1) 2327 .sr(1) 2328 .m(4) 2329 .n(16) 2330 .k(4) 2331 .cn_stride(19) 2332 .Test(xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 2333 } 2334 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD64,k_eq_4_strided_a)2335 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD64, k_eq_4_strided_a) { 2336 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2337 GemmMicrokernelTester() 2338 .mr(4) 2339 .nr(16) 2340 .kr(1) 2341 .sr(1) 2342 .m(4) 2343 .n(16) 2344 .k(4) 2345 .a_stride(7) 2346 .Test(xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld64, 
xnn_init_f16_minmax_neon_params); 2347 } 2348 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD64,k_eq_4_subtile)2349 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD64, k_eq_4_subtile) { 2350 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2351 for (uint32_t n = 1; n <= 16; n++) { 2352 for (uint32_t m = 1; m <= 4; m++) { 2353 GemmMicrokernelTester() 2354 .mr(4) 2355 .nr(16) 2356 .kr(1) 2357 .sr(1) 2358 .m(m) 2359 .n(n) 2360 .k(4) 2361 .iterations(1) 2362 .Test(xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 2363 } 2364 } 2365 } 2366 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD64,k_eq_4_subtile_m)2367 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD64, k_eq_4_subtile_m) { 2368 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2369 for (uint32_t m = 1; m <= 4; m++) { 2370 GemmMicrokernelTester() 2371 .mr(4) 2372 .nr(16) 2373 .kr(1) 2374 .sr(1) 2375 .m(m) 2376 .n(16) 2377 .k(4) 2378 .iterations(1) 2379 .Test(xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 2380 } 2381 } 2382 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD64,k_eq_4_subtile_n)2383 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD64, k_eq_4_subtile_n) { 2384 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2385 for (uint32_t n = 1; n <= 16; n++) { 2386 GemmMicrokernelTester() 2387 .mr(4) 2388 .nr(16) 2389 .kr(1) 2390 .sr(1) 2391 .m(4) 2392 .n(n) 2393 .k(4) 2394 .iterations(1) 2395 .Test(xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 2396 } 2397 } 2398 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD64,k_lt_4)2399 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD64, k_lt_4) { 2400 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2401 for (size_t k = 1; k < 4; k++) { 2402 GemmMicrokernelTester() 2403 .mr(4) 2404 .nr(16) 2405 .kr(1) 2406 .sr(1) 2407 .m(4) 2408 .n(16) 2409 .k(k) 2410 .Test(xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 2411 } 2412 
} 2413 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD64,k_lt_4_strided_a)2414 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD64, k_lt_4_strided_a) { 2415 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2416 for (size_t k = 1; k < 4; k++) { 2417 GemmMicrokernelTester() 2418 .mr(4) 2419 .nr(16) 2420 .kr(1) 2421 .sr(1) 2422 .m(4) 2423 .n(16) 2424 .k(k) 2425 .a_stride(7) 2426 .Test(xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 2427 } 2428 } 2429 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD64,k_lt_4_subtile)2430 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD64, k_lt_4_subtile) { 2431 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2432 for (size_t k = 1; k < 4; k++) { 2433 for (uint32_t n = 1; n <= 16; n++) { 2434 for (uint32_t m = 1; m <= 4; m++) { 2435 GemmMicrokernelTester() 2436 .mr(4) 2437 .nr(16) 2438 .kr(1) 2439 .sr(1) 2440 .m(m) 2441 .n(n) 2442 .k(k) 2443 .iterations(1) 2444 .Test(xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 2445 } 2446 } 2447 } 2448 } 2449 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD64,k_gt_4)2450 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD64, k_gt_4) { 2451 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2452 for (size_t k = 5; k < 8; k++) { 2453 GemmMicrokernelTester() 2454 .mr(4) 2455 .nr(16) 2456 .kr(1) 2457 .sr(1) 2458 .m(4) 2459 .n(16) 2460 .k(k) 2461 .Test(xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 2462 } 2463 } 2464 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD64,k_gt_4_strided_a)2465 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD64, k_gt_4_strided_a) { 2466 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2467 for (size_t k = 5; k < 8; k++) { 2468 GemmMicrokernelTester() 2469 .mr(4) 2470 .nr(16) 2471 .kr(1) 2472 .sr(1) 2473 .m(4) 2474 .n(16) 2475 .k(k) 2476 .a_stride(11) 2477 .Test(xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 2478 } 2479 } 
2480 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD64,k_gt_4_subtile)2481 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD64, k_gt_4_subtile) { 2482 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2483 for (size_t k = 5; k < 8; k++) { 2484 for (uint32_t n = 1; n <= 16; n++) { 2485 for (uint32_t m = 1; m <= 4; m++) { 2486 GemmMicrokernelTester() 2487 .mr(4) 2488 .nr(16) 2489 .kr(1) 2490 .sr(1) 2491 .m(m) 2492 .n(n) 2493 .k(k) 2494 .iterations(1) 2495 .Test(xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 2496 } 2497 } 2498 } 2499 } 2500 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD64,k_div_4)2501 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD64, k_div_4) { 2502 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2503 for (size_t k = 8; k <= 40; k += 4) { 2504 GemmMicrokernelTester() 2505 .mr(4) 2506 .nr(16) 2507 .kr(1) 2508 .sr(1) 2509 .m(4) 2510 .n(16) 2511 .k(k) 2512 .Test(xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 2513 } 2514 } 2515 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD64,k_div_4_strided_a)2516 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD64, k_div_4_strided_a) { 2517 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2518 for (size_t k = 8; k <= 40; k += 4) { 2519 GemmMicrokernelTester() 2520 .mr(4) 2521 .nr(16) 2522 .kr(1) 2523 .sr(1) 2524 .m(4) 2525 .n(16) 2526 .k(k) 2527 .a_stride(43) 2528 .Test(xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 2529 } 2530 } 2531 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD64,k_div_4_subtile)2532 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD64, k_div_4_subtile) { 2533 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2534 for (size_t k = 8; k <= 40; k += 4) { 2535 for (uint32_t n = 1; n <= 16; n++) { 2536 for (uint32_t m = 1; m <= 4; m++) { 2537 GemmMicrokernelTester() 2538 .mr(4) 2539 .nr(16) 2540 .kr(1) 2541 .sr(1) 2542 .m(m) 2543 .n(n) 2544 .k(k) 2545 .iterations(1) 2546 
.Test(xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 2547 } 2548 } 2549 } 2550 } 2551 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD64,n_gt_16)2552 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD64, n_gt_16) { 2553 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2554 for (uint32_t n = 17; n < 32; n++) { 2555 for (size_t k = 1; k <= 20; k += 5) { 2556 GemmMicrokernelTester() 2557 .mr(4) 2558 .nr(16) 2559 .kr(1) 2560 .sr(1) 2561 .m(4) 2562 .n(n) 2563 .k(k) 2564 .Test(xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 2565 } 2566 } 2567 } 2568 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD64,n_gt_16_strided_cn)2569 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD64, n_gt_16_strided_cn) { 2570 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2571 for (uint32_t n = 17; n < 32; n++) { 2572 for (size_t k = 1; k <= 20; k += 5) { 2573 GemmMicrokernelTester() 2574 .mr(4) 2575 .nr(16) 2576 .kr(1) 2577 .sr(1) 2578 .m(4) 2579 .n(n) 2580 .k(k) 2581 .cn_stride(19) 2582 .Test(xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 2583 } 2584 } 2585 } 2586 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD64,n_gt_16_strided_a)2587 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD64, n_gt_16_strided_a) { 2588 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2589 for (uint32_t n = 17; n < 32; n++) { 2590 for (size_t k = 1; k <= 20; k += 5) { 2591 GemmMicrokernelTester() 2592 .mr(4) 2593 .nr(16) 2594 .kr(1) 2595 .sr(1) 2596 .m(4) 2597 .n(n) 2598 .k(k) 2599 .a_stride(23) 2600 .Test(xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 2601 } 2602 } 2603 } 2604 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD64,n_gt_16_subtile)2605 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD64, n_gt_16_subtile) { 2606 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2607 for (uint32_t n = 17; n < 32; n++) { 2608 for (size_t k = 1; k <= 20; k += 5) { 
2609 for (uint32_t m = 1; m <= 4; m++) { 2610 GemmMicrokernelTester() 2611 .mr(4) 2612 .nr(16) 2613 .kr(1) 2614 .sr(1) 2615 .m(m) 2616 .n(n) 2617 .k(k) 2618 .iterations(1) 2619 .Test(xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 2620 } 2621 } 2622 } 2623 } 2624 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD64,n_div_16)2625 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD64, n_div_16) { 2626 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2627 for (uint32_t n = 32; n <= 48; n += 16) { 2628 for (size_t k = 1; k <= 20; k += 5) { 2629 GemmMicrokernelTester() 2630 .mr(4) 2631 .nr(16) 2632 .kr(1) 2633 .sr(1) 2634 .m(4) 2635 .n(n) 2636 .k(k) 2637 .Test(xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 2638 } 2639 } 2640 } 2641 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD64,n_div_16_strided_cn)2642 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD64, n_div_16_strided_cn) { 2643 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2644 for (uint32_t n = 32; n <= 48; n += 16) { 2645 for (size_t k = 1; k <= 20; k += 5) { 2646 GemmMicrokernelTester() 2647 .mr(4) 2648 .nr(16) 2649 .kr(1) 2650 .sr(1) 2651 .m(4) 2652 .n(n) 2653 .k(k) 2654 .cn_stride(19) 2655 .Test(xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 2656 } 2657 } 2658 } 2659 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD64,n_div_16_strided_a)2660 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD64, n_div_16_strided_a) { 2661 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2662 for (uint32_t n = 32; n <= 48; n += 16) { 2663 for (size_t k = 1; k <= 20; k += 5) { 2664 GemmMicrokernelTester() 2665 .mr(4) 2666 .nr(16) 2667 .kr(1) 2668 .sr(1) 2669 .m(4) 2670 .n(n) 2671 .k(k) 2672 .a_stride(23) 2673 .Test(xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 2674 } 2675 } 2676 } 2677 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD64,n_div_16_subtile)2678 
// n in {32, 48}; k = 1, 6, 11, 16; every m in [1, 4]; iterations(1).
// Kernel under test: xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld64
// (tester configured with mr=4, nr=16, kr=1, sr=1).
TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD64, n_div_16_subtile) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n = 32; n <= 48; n += 16) {
    for (size_t k = 1; k <= 20; k += 5) {
      for (uint32_t m = 1; m <= 4; m++) {
        GemmMicrokernelTester()
          .mr(4).nr(16).kr(1).sr(1)
          .m(m).n(n).k(k)
          .iterations(1)
          .Test(xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
      }
    }
  }
}

// k = 1, 6, 11, 16; every n in [1, 16] and m in [1, 4]; cm_stride(19), iterations(1).
TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD64, strided_cm_subtile) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k = 1; k <= 20; k += 5) {
    for (uint32_t n = 1; n <= 16; n++) {
      for (uint32_t m = 1; m <= 4; m++) {
        GemmMicrokernelTester()
          .mr(4).nr(16).kr(1).sr(1)
          .m(m).n(n).k(k)
          .cm_stride(19)
          .iterations(1)
          .Test(xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
      }
    }
  }
}

// Single 4x16 case with k=4 and qmin(128).
TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD64, qmin) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  GemmMicrokernelTester()
    .mr(4).nr(16).kr(1).sr(1)
    .m(4).n(16).k(4)
    .qmin(128)
    .Test(xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
}

// Single 4x16 case with k=4 and qmax(128).
TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD64, qmax) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  GemmMicrokernelTester()
    .mr(4).nr(16).kr(1).sr(1)
    .m(4).n(16).k(4)
    .qmax(128)
    .Test(xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
}

TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD64,strided_cm)2747 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD64, strided_cm) { 2748 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2749 GemmMicrokernelTester() 2750 .mr(4) 2751 .nr(16) 2752 .kr(1) 2753 .sr(1) 2754 .m(4) 2755 .n(16) 2756 .k(4) 2757 .cm_stride(19) 2758 .Test(xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 2759 } 2760 #endif // XNN_ENABLE_ARM_FP16 && XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY 2761 2762 2763 #if XNN_ENABLE_ARM_FP16 && XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64,k_eq_4)2764 TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64, k_eq_4) { 2765 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2766 GemmMicrokernelTester() 2767 .mr(6) 2768 .nr(8) 2769 .kr(1) 2770 .sr(1) 2771 .m(6) 2772 .n(8) 2773 .k(4) 2774 .Test(xnn_f16_gemm_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 2775 } 2776 TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64,strided_cn)2777 TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64, strided_cn) { 2778 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2779 GemmMicrokernelTester() 2780 .mr(6) 2781 .nr(8) 2782 .kr(1) 2783 .sr(1) 2784 .m(6) 2785 .n(8) 2786 .k(4) 2787 .cn_stride(11) 2788 .Test(xnn_f16_gemm_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 2789 } 2790 TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64,k_eq_4_strided_a)2791 TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64, k_eq_4_strided_a) { 2792 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2793 GemmMicrokernelTester() 2794 .mr(6) 2795 .nr(8) 2796 .kr(1) 2797 .sr(1) 2798 .m(6) 2799 .n(8) 2800 .k(4) 2801 .a_stride(7) 2802 .Test(xnn_f16_gemm_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 2803 } 2804 TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64,k_eq_4_subtile)2805 TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64, k_eq_4_subtile) { 2806 
TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2807 for (uint32_t n = 1; n <= 8; n++) { 2808 for (uint32_t m = 1; m <= 6; m++) { 2809 GemmMicrokernelTester() 2810 .mr(6) 2811 .nr(8) 2812 .kr(1) 2813 .sr(1) 2814 .m(m) 2815 .n(n) 2816 .k(4) 2817 .iterations(1) 2818 .Test(xnn_f16_gemm_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 2819 } 2820 } 2821 } 2822 TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64,k_eq_4_subtile_m)2823 TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64, k_eq_4_subtile_m) { 2824 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2825 for (uint32_t m = 1; m <= 6; m++) { 2826 GemmMicrokernelTester() 2827 .mr(6) 2828 .nr(8) 2829 .kr(1) 2830 .sr(1) 2831 .m(m) 2832 .n(8) 2833 .k(4) 2834 .iterations(1) 2835 .Test(xnn_f16_gemm_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 2836 } 2837 } 2838 TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64,k_eq_4_subtile_n)2839 TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64, k_eq_4_subtile_n) { 2840 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2841 for (uint32_t n = 1; n <= 8; n++) { 2842 GemmMicrokernelTester() 2843 .mr(6) 2844 .nr(8) 2845 .kr(1) 2846 .sr(1) 2847 .m(6) 2848 .n(n) 2849 .k(4) 2850 .iterations(1) 2851 .Test(xnn_f16_gemm_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 2852 } 2853 } 2854 TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64,k_lt_4)2855 TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64, k_lt_4) { 2856 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2857 for (size_t k = 1; k < 4; k++) { 2858 GemmMicrokernelTester() 2859 .mr(6) 2860 .nr(8) 2861 .kr(1) 2862 .sr(1) 2863 .m(6) 2864 .n(8) 2865 .k(k) 2866 .Test(xnn_f16_gemm_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 2867 } 2868 } 2869 TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64,k_lt_4_strided_a)2870 TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64, k_lt_4_strided_a) { 2871 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2872 for (size_t k 
= 1; k < 4; k++) { 2873 GemmMicrokernelTester() 2874 .mr(6) 2875 .nr(8) 2876 .kr(1) 2877 .sr(1) 2878 .m(6) 2879 .n(8) 2880 .k(k) 2881 .a_stride(7) 2882 .Test(xnn_f16_gemm_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 2883 } 2884 } 2885 TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64,k_lt_4_subtile)2886 TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64, k_lt_4_subtile) { 2887 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2888 for (size_t k = 1; k < 4; k++) { 2889 for (uint32_t n = 1; n <= 8; n++) { 2890 for (uint32_t m = 1; m <= 6; m++) { 2891 GemmMicrokernelTester() 2892 .mr(6) 2893 .nr(8) 2894 .kr(1) 2895 .sr(1) 2896 .m(m) 2897 .n(n) 2898 .k(k) 2899 .iterations(1) 2900 .Test(xnn_f16_gemm_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 2901 } 2902 } 2903 } 2904 } 2905 TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64,k_gt_4)2906 TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64, k_gt_4) { 2907 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2908 for (size_t k = 5; k < 8; k++) { 2909 GemmMicrokernelTester() 2910 .mr(6) 2911 .nr(8) 2912 .kr(1) 2913 .sr(1) 2914 .m(6) 2915 .n(8) 2916 .k(k) 2917 .Test(xnn_f16_gemm_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 2918 } 2919 } 2920 TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64,k_gt_4_strided_a)2921 TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64, k_gt_4_strided_a) { 2922 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2923 for (size_t k = 5; k < 8; k++) { 2924 GemmMicrokernelTester() 2925 .mr(6) 2926 .nr(8) 2927 .kr(1) 2928 .sr(1) 2929 .m(6) 2930 .n(8) 2931 .k(k) 2932 .a_stride(11) 2933 .Test(xnn_f16_gemm_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 2934 } 2935 } 2936 TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64,k_gt_4_subtile)2937 TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64, k_gt_4_subtile) { 2938 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2939 for (size_t k = 5; k < 8; k++) { 2940 
for (uint32_t n = 1; n <= 8; n++) { 2941 for (uint32_t m = 1; m <= 6; m++) { 2942 GemmMicrokernelTester() 2943 .mr(6) 2944 .nr(8) 2945 .kr(1) 2946 .sr(1) 2947 .m(m) 2948 .n(n) 2949 .k(k) 2950 .iterations(1) 2951 .Test(xnn_f16_gemm_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 2952 } 2953 } 2954 } 2955 } 2956 TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64,k_div_4)2957 TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64, k_div_4) { 2958 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2959 for (size_t k = 8; k <= 40; k += 4) { 2960 GemmMicrokernelTester() 2961 .mr(6) 2962 .nr(8) 2963 .kr(1) 2964 .sr(1) 2965 .m(6) 2966 .n(8) 2967 .k(k) 2968 .Test(xnn_f16_gemm_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 2969 } 2970 } 2971 TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64,k_div_4_strided_a)2972 TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64, k_div_4_strided_a) { 2973 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2974 for (size_t k = 8; k <= 40; k += 4) { 2975 GemmMicrokernelTester() 2976 .mr(6) 2977 .nr(8) 2978 .kr(1) 2979 .sr(1) 2980 .m(6) 2981 .n(8) 2982 .k(k) 2983 .a_stride(43) 2984 .Test(xnn_f16_gemm_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 2985 } 2986 } 2987 TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64,k_div_4_subtile)2988 TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64, k_div_4_subtile) { 2989 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2990 for (size_t k = 8; k <= 40; k += 4) { 2991 for (uint32_t n = 1; n <= 8; n++) { 2992 for (uint32_t m = 1; m <= 6; m++) { 2993 GemmMicrokernelTester() 2994 .mr(6) 2995 .nr(8) 2996 .kr(1) 2997 .sr(1) 2998 .m(m) 2999 .n(n) 3000 .k(k) 3001 .iterations(1) 3002 .Test(xnn_f16_gemm_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 3003 } 3004 } 3005 } 3006 } 3007 TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64,n_gt_8)3008 TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64, n_gt_8) { 3009 
TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3010 for (uint32_t n = 9; n < 16; n++) { 3011 for (size_t k = 1; k <= 20; k += 5) { 3012 GemmMicrokernelTester() 3013 .mr(6) 3014 .nr(8) 3015 .kr(1) 3016 .sr(1) 3017 .m(6) 3018 .n(n) 3019 .k(k) 3020 .Test(xnn_f16_gemm_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 3021 } 3022 } 3023 } 3024 TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64,n_gt_8_strided_cn)3025 TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64, n_gt_8_strided_cn) { 3026 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3027 for (uint32_t n = 9; n < 16; n++) { 3028 for (size_t k = 1; k <= 20; k += 5) { 3029 GemmMicrokernelTester() 3030 .mr(6) 3031 .nr(8) 3032 .kr(1) 3033 .sr(1) 3034 .m(6) 3035 .n(n) 3036 .k(k) 3037 .cn_stride(11) 3038 .Test(xnn_f16_gemm_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 3039 } 3040 } 3041 } 3042 TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64,n_gt_8_strided_a)3043 TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64, n_gt_8_strided_a) { 3044 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3045 for (uint32_t n = 9; n < 16; n++) { 3046 for (size_t k = 1; k <= 20; k += 5) { 3047 GemmMicrokernelTester() 3048 .mr(6) 3049 .nr(8) 3050 .kr(1) 3051 .sr(1) 3052 .m(6) 3053 .n(n) 3054 .k(k) 3055 .a_stride(23) 3056 .Test(xnn_f16_gemm_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 3057 } 3058 } 3059 } 3060 TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64,n_gt_8_subtile)3061 TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64, n_gt_8_subtile) { 3062 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3063 for (uint32_t n = 9; n < 16; n++) { 3064 for (size_t k = 1; k <= 20; k += 5) { 3065 for (uint32_t m = 1; m <= 6; m++) { 3066 GemmMicrokernelTester() 3067 .mr(6) 3068 .nr(8) 3069 .kr(1) 3070 .sr(1) 3071 .m(m) 3072 .n(n) 3073 .k(k) 3074 .iterations(1) 3075 .Test(xnn_f16_gemm_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 3076 } 3077 } 3078 } 
3079 } 3080 TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64,n_div_8)3081 TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64, n_div_8) { 3082 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3083 for (uint32_t n = 16; n <= 24; n += 8) { 3084 for (size_t k = 1; k <= 20; k += 5) { 3085 GemmMicrokernelTester() 3086 .mr(6) 3087 .nr(8) 3088 .kr(1) 3089 .sr(1) 3090 .m(6) 3091 .n(n) 3092 .k(k) 3093 .Test(xnn_f16_gemm_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 3094 } 3095 } 3096 } 3097 TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64,n_div_8_strided_cn)3098 TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64, n_div_8_strided_cn) { 3099 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3100 for (uint32_t n = 16; n <= 24; n += 8) { 3101 for (size_t k = 1; k <= 20; k += 5) { 3102 GemmMicrokernelTester() 3103 .mr(6) 3104 .nr(8) 3105 .kr(1) 3106 .sr(1) 3107 .m(6) 3108 .n(n) 3109 .k(k) 3110 .cn_stride(11) 3111 .Test(xnn_f16_gemm_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 3112 } 3113 } 3114 } 3115 TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64,n_div_8_strided_a)3116 TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64, n_div_8_strided_a) { 3117 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3118 for (uint32_t n = 16; n <= 24; n += 8) { 3119 for (size_t k = 1; k <= 20; k += 5) { 3120 GemmMicrokernelTester() 3121 .mr(6) 3122 .nr(8) 3123 .kr(1) 3124 .sr(1) 3125 .m(6) 3126 .n(n) 3127 .k(k) 3128 .a_stride(23) 3129 .Test(xnn_f16_gemm_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 3130 } 3131 } 3132 } 3133 TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64,n_div_8_subtile)3134 TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64, n_div_8_subtile) { 3135 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3136 for (uint32_t n = 16; n <= 24; n += 8) { 3137 for (size_t k = 1; k <= 20; k += 5) { 3138 for (uint32_t m = 1; m <= 6; m++) { 3139 GemmMicrokernelTester() 3140 .mr(6) 3141 .nr(8) 3142 .kr(1) 3143 .sr(1) 3144 
.m(m) 3145 .n(n) 3146 .k(k) 3147 .iterations(1) 3148 .Test(xnn_f16_gemm_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 3149 } 3150 } 3151 } 3152 } 3153 TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64,strided_cm_subtile)3154 TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64, strided_cm_subtile) { 3155 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3156 for (size_t k = 1; k <= 20; k += 5) { 3157 for (uint32_t n = 1; n <= 8; n++) { 3158 for (uint32_t m = 1; m <= 6; m++) { 3159 GemmMicrokernelTester() 3160 .mr(6) 3161 .nr(8) 3162 .kr(1) 3163 .sr(1) 3164 .m(m) 3165 .n(n) 3166 .k(k) 3167 .cm_stride(11) 3168 .iterations(1) 3169 .Test(xnn_f16_gemm_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 3170 } 3171 } 3172 } 3173 } 3174 TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64,qmin)3175 TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64, qmin) { 3176 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3177 GemmMicrokernelTester() 3178 .mr(6) 3179 .nr(8) 3180 .kr(1) 3181 .sr(1) 3182 .m(6) 3183 .n(8) 3184 .k(4) 3185 .qmin(128) 3186 .Test(xnn_f16_gemm_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 3187 } 3188 TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64,qmax)3189 TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64, qmax) { 3190 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3191 GemmMicrokernelTester() 3192 .mr(6) 3193 .nr(8) 3194 .kr(1) 3195 .sr(1) 3196 .m(6) 3197 .n(8) 3198 .k(4) 3199 .qmax(128) 3200 .Test(xnn_f16_gemm_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 3201 } 3202 TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64,strided_cm)3203 TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64, strided_cm) { 3204 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3205 GemmMicrokernelTester() 3206 .mr(6) 3207 .nr(8) 3208 .kr(1) 3209 .sr(1) 3210 .m(6) 3211 .n(8) 3212 .k(4) 3213 .cm_stride(11) 3214 .Test(xnn_f16_gemm_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64, 
xnn_init_f16_minmax_neon_params); 3215 } 3216 #endif // XNN_ENABLE_ARM_FP16 && XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY 3217 3218 3219 #if XNN_ENABLE_ARM_FP16 && XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55,k_eq_4)3220 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55, k_eq_4) { 3221 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3222 GemmMicrokernelTester() 3223 .mr(6) 3224 .nr(16) 3225 .kr(1) 3226 .sr(1) 3227 .m(6) 3228 .n(16) 3229 .k(4) 3230 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55, xnn_init_f16_minmax_neon_params); 3231 } 3232 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55,strided_cn)3233 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55, strided_cn) { 3234 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3235 GemmMicrokernelTester() 3236 .mr(6) 3237 .nr(16) 3238 .kr(1) 3239 .sr(1) 3240 .m(6) 3241 .n(16) 3242 .k(4) 3243 .cn_stride(19) 3244 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55, xnn_init_f16_minmax_neon_params); 3245 } 3246 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55,k_eq_4_strided_a)3247 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55, k_eq_4_strided_a) { 3248 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3249 GemmMicrokernelTester() 3250 .mr(6) 3251 .nr(16) 3252 .kr(1) 3253 .sr(1) 3254 .m(6) 3255 .n(16) 3256 .k(4) 3257 .a_stride(7) 3258 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55, xnn_init_f16_minmax_neon_params); 3259 } 3260 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55,k_eq_4_subtile)3261 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55, k_eq_4_subtile) { 3262 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3263 for (uint32_t n = 1; n <= 16; n++) { 3264 for (uint32_t m = 1; m <= 6; m++) { 3265 GemmMicrokernelTester() 3266 .mr(6) 3267 .nr(16) 3268 .kr(1) 3269 .sr(1) 3270 .m(m) 3271 .n(n) 3272 .k(4) 3273 .iterations(1) 3274 
.Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55, xnn_init_f16_minmax_neon_params); 3275 } 3276 } 3277 } 3278 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55,k_eq_4_subtile_m)3279 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55, k_eq_4_subtile_m) { 3280 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3281 for (uint32_t m = 1; m <= 6; m++) { 3282 GemmMicrokernelTester() 3283 .mr(6) 3284 .nr(16) 3285 .kr(1) 3286 .sr(1) 3287 .m(m) 3288 .n(16) 3289 .k(4) 3290 .iterations(1) 3291 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55, xnn_init_f16_minmax_neon_params); 3292 } 3293 } 3294 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55,k_eq_4_subtile_n)3295 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55, k_eq_4_subtile_n) { 3296 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3297 for (uint32_t n = 1; n <= 16; n++) { 3298 GemmMicrokernelTester() 3299 .mr(6) 3300 .nr(16) 3301 .kr(1) 3302 .sr(1) 3303 .m(6) 3304 .n(n) 3305 .k(4) 3306 .iterations(1) 3307 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55, xnn_init_f16_minmax_neon_params); 3308 } 3309 } 3310 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55,k_lt_4)3311 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55, k_lt_4) { 3312 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3313 for (size_t k = 1; k < 4; k++) { 3314 GemmMicrokernelTester() 3315 .mr(6) 3316 .nr(16) 3317 .kr(1) 3318 .sr(1) 3319 .m(6) 3320 .n(16) 3321 .k(k) 3322 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55, xnn_init_f16_minmax_neon_params); 3323 } 3324 } 3325 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55,k_lt_4_strided_a)3326 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55, k_lt_4_strided_a) { 3327 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3328 for (size_t k = 1; k < 4; k++) { 3329 GemmMicrokernelTester() 3330 .mr(6) 3331 .nr(16) 3332 .kr(1) 3333 .sr(1) 3334 .m(6) 3335 .n(16) 3336 .k(k) 3337 .a_stride(7) 3338 
.Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55, xnn_init_f16_minmax_neon_params); 3339 } 3340 } 3341 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55,k_lt_4_subtile)3342 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55, k_lt_4_subtile) { 3343 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3344 for (size_t k = 1; k < 4; k++) { 3345 for (uint32_t n = 1; n <= 16; n++) { 3346 for (uint32_t m = 1; m <= 6; m++) { 3347 GemmMicrokernelTester() 3348 .mr(6) 3349 .nr(16) 3350 .kr(1) 3351 .sr(1) 3352 .m(m) 3353 .n(n) 3354 .k(k) 3355 .iterations(1) 3356 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55, xnn_init_f16_minmax_neon_params); 3357 } 3358 } 3359 } 3360 } 3361 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55,k_gt_4)3362 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55, k_gt_4) { 3363 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3364 for (size_t k = 5; k < 8; k++) { 3365 GemmMicrokernelTester() 3366 .mr(6) 3367 .nr(16) 3368 .kr(1) 3369 .sr(1) 3370 .m(6) 3371 .n(16) 3372 .k(k) 3373 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55, xnn_init_f16_minmax_neon_params); 3374 } 3375 } 3376 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55,k_gt_4_strided_a)3377 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55, k_gt_4_strided_a) { 3378 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3379 for (size_t k = 5; k < 8; k++) { 3380 GemmMicrokernelTester() 3381 .mr(6) 3382 .nr(16) 3383 .kr(1) 3384 .sr(1) 3385 .m(6) 3386 .n(16) 3387 .k(k) 3388 .a_stride(11) 3389 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55, xnn_init_f16_minmax_neon_params); 3390 } 3391 } 3392 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55,k_gt_4_subtile)3393 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55, k_gt_4_subtile) { 3394 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3395 for (size_t k = 5; k < 8; k++) { 3396 for (uint32_t n = 1; n <= 16; n++) { 3397 for (uint32_t m = 1; m 
<= 6; m++) { 3398 GemmMicrokernelTester() 3399 .mr(6) 3400 .nr(16) 3401 .kr(1) 3402 .sr(1) 3403 .m(m) 3404 .n(n) 3405 .k(k) 3406 .iterations(1) 3407 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55, xnn_init_f16_minmax_neon_params); 3408 } 3409 } 3410 } 3411 } 3412 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55,k_div_4)3413 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55, k_div_4) { 3414 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3415 for (size_t k = 8; k <= 40; k += 4) { 3416 GemmMicrokernelTester() 3417 .mr(6) 3418 .nr(16) 3419 .kr(1) 3420 .sr(1) 3421 .m(6) 3422 .n(16) 3423 .k(k) 3424 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55, xnn_init_f16_minmax_neon_params); 3425 } 3426 } 3427 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55,k_div_4_strided_a)3428 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55, k_div_4_strided_a) { 3429 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3430 for (size_t k = 8; k <= 40; k += 4) { 3431 GemmMicrokernelTester() 3432 .mr(6) 3433 .nr(16) 3434 .kr(1) 3435 .sr(1) 3436 .m(6) 3437 .n(16) 3438 .k(k) 3439 .a_stride(43) 3440 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55, xnn_init_f16_minmax_neon_params); 3441 } 3442 } 3443 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55,k_div_4_subtile)3444 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55, k_div_4_subtile) { 3445 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3446 for (size_t k = 8; k <= 40; k += 4) { 3447 for (uint32_t n = 1; n <= 16; n++) { 3448 for (uint32_t m = 1; m <= 6; m++) { 3449 GemmMicrokernelTester() 3450 .mr(6) 3451 .nr(16) 3452 .kr(1) 3453 .sr(1) 3454 .m(m) 3455 .n(n) 3456 .k(k) 3457 .iterations(1) 3458 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55, xnn_init_f16_minmax_neon_params); 3459 } 3460 } 3461 } 3462 } 3463 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55,n_gt_16)3464 
// n = 17..31, k in {1, 6, 11, 16}, m fixed at 6.
TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55, n_gt_16) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n_val = 17; n_val < 32; ++n_val) {
    for (size_t k_val = 1; k_val <= 20; k_val += 5) {
      GemmMicrokernelTester()
        .mr(6).nr(16).kr(1).sr(1)
        .m(6).n(n_val).k(k_val)
        .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55, xnn_init_f16_minmax_neon_params);
    }
  }
}

// Same sweep as n_gt_16, with cn_stride set to 19.
TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55, n_gt_16_strided_cn) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n_val = 17; n_val < 32; ++n_val) {
    for (size_t k_val = 1; k_val <= 20; k_val += 5) {
      GemmMicrokernelTester()
        .mr(6).nr(16).kr(1).sr(1)
        .m(6).n(n_val).k(k_val)
        .cn_stride(19)
        .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55, xnn_init_f16_minmax_neon_params);
    }
  }
}

// Same sweep as n_gt_16, with a_stride set to 23.
TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55, n_gt_16_strided_a) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n_val = 17; n_val < 32; ++n_val) {
    for (size_t k_val = 1; k_val <= 20; k_val += 5) {
      GemmMicrokernelTester()
        .mr(6).nr(16).kr(1).sr(1)
        .m(6).n(n_val).k(k_val)
        .a_stride(23)
        .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55, xnn_init_f16_minmax_neon_params);
    }
  }
}

// n = 17..31, k in {1, 6, 11, 16}, m = 1..6; one iteration per configuration.
TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55, n_gt_16_subtile) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n_val = 17; n_val < 32; ++n_val) {
    for (size_t k_val = 1; k_val <= 20; k_val += 5) {
      for (uint32_t m_val = 1; m_val <= 6; ++m_val) {
        GemmMicrokernelTester()
          .mr(6).nr(16).kr(1).sr(1)
          .m(m_val).n(n_val).k(k_val)
          .iterations(1)
          .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55, xnn_init_f16_minmax_neon_params);
      }
    }
  }
}

// n in {32, 48}, k in {1, 6, 11, 16}, m fixed at 6.
TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55, n_div_16) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n_val = 32; n_val <= 48; n_val += 16) {
    for (size_t k_val = 1; k_val <= 20; k_val += 5) {
      GemmMicrokernelTester()
        .mr(6).nr(16).kr(1).sr(1)
        .m(6).n(n_val).k(k_val)
        .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55, xnn_init_f16_minmax_neon_params);
    }
  }
}

// Same sweep as n_div_16, with cn_stride set to 19.
TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55, n_div_16_strided_cn) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n_val = 32; n_val <= 48; n_val += 16) {
    for (size_t k_val = 1; k_val <= 20; k_val += 5) {
      GemmMicrokernelTester()
        .mr(6).nr(16).kr(1).sr(1)
        .m(6).n(n_val).k(k_val)
        .cn_stride(19)
        .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55, xnn_init_f16_minmax_neon_params);
    }
  }
}

// Same sweep as n_div_16, with a_stride set to 23.
TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55, n_div_16_strided_a) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n_val = 32; n_val <= 48; n_val += 16) {
    for (size_t k_val = 1; k_val <= 20; k_val += 5) {
      GemmMicrokernelTester()
        .mr(6).nr(16).kr(1).sr(1)
        .m(6).n(n_val).k(k_val)
        .a_stride(23)
        .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55, xnn_init_f16_minmax_neon_params);
    }
  }
}

// n in {32, 48}, k in {1, 6, 11, 16}, m = 1..6; one iteration per configuration.
TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55, n_div_16_subtile) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n_val = 32; n_val <= 48; n_val += 16) {
    for (size_t k_val = 1; k_val <= 20; k_val += 5) {
      for (uint32_t m_val = 1; m_val <= 6; ++m_val) {
        GemmMicrokernelTester()
          .mr(6).nr(16).kr(1).sr(1)
          .m(m_val).n(n_val).k(k_val)
          .iterations(1)
          .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55, xnn_init_f16_minmax_neon_params);
      }
    }
  }
}

// k in {1, 6, 11, 16}, n = 1..16, m = 1..6, with cm_stride set to 19; one iteration each.
TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55, strided_cm_subtile) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k_val = 1; k_val <= 20; k_val += 5) {
    for (uint32_t n_val = 1; n_val <= 16; ++n_val) {
      for (uint32_t m_val = 1; m_val <= 6; ++m_val) {
        GemmMicrokernelTester()
          .mr(6).nr(16).kr(1).sr(1)
          .m(m_val).n(n_val).k(k_val)
          .cm_stride(19)
          .iterations(1)
          .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55, xnn_init_f16_minmax_neon_params);
      }
    }
  }
}

// Single m=6, n=16, k=4 run with qmin set to 128.
TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55, qmin) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  GemmMicrokernelTester()
    .mr(6).nr(16).kr(1).sr(1)
    .m(6).n(16).k(4)
    .qmin(128)
    .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55, xnn_init_f16_minmax_neon_params);
}

// Single m=6, n=16, k=4 run with qmax set to 128.
TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55, qmax) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  GemmMicrokernelTester()
    .mr(6).nr(16).kr(1).sr(1)
    .m(6).n(16).k(4)
    .qmax(128)
    .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55, xnn_init_f16_minmax_neon_params);
}
TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55,strided_cm)3659 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55, strided_cm) { 3660 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3661 GemmMicrokernelTester() 3662 .mr(6) 3663 .nr(16) 3664 .kr(1) 3665 .sr(1) 3666 .m(6) 3667 .n(16) 3668 .k(4) 3669 .cm_stride(19) 3670 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55, xnn_init_f16_minmax_neon_params); 3671 } 3672 #endif // XNN_ENABLE_ARM_FP16 && XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY 3673 3674 3675 #if XNN_ENABLE_ARM_FP16 && XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55R0,k_eq_4)3676 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55R0, k_eq_4) { 3677 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3678 GemmMicrokernelTester() 3679 .mr(6) 3680 .nr(16) 3681 .kr(1) 3682 .sr(1) 3683 .m(6) 3684 .n(16) 3685 .k(4) 3686 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55r0, xnn_init_f16_minmax_neon_params); 3687 } 3688 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55R0,strided_cn)3689 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55R0, strided_cn) { 3690 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3691 GemmMicrokernelTester() 3692 .mr(6) 3693 .nr(16) 3694 .kr(1) 3695 .sr(1) 3696 .m(6) 3697 .n(16) 3698 .k(4) 3699 .cn_stride(19) 3700 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55r0, xnn_init_f16_minmax_neon_params); 3701 } 3702 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55R0,k_eq_4_strided_a)3703 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55R0, k_eq_4_strided_a) { 3704 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3705 GemmMicrokernelTester() 3706 .mr(6) 3707 .nr(16) 3708 .kr(1) 3709 .sr(1) 3710 .m(6) 3711 .n(16) 3712 .k(4) 3713 .a_stride(7) 3714 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55r0, xnn_init_f16_minmax_neon_params); 3715 } 3716 
// k = 4, n = 1..16, m = 1..6; one iteration per configuration.
TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55R0, k_eq_4_subtile) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n_val = 1; n_val <= 16; ++n_val) {
    for (uint32_t m_val = 1; m_val <= 6; ++m_val) {
      GemmMicrokernelTester()
        .mr(6).nr(16).kr(1).sr(1)
        .m(m_val).n(n_val).k(4)
        .iterations(1)
        .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55r0, xnn_init_f16_minmax_neon_params);
    }
  }
}

// k = 4, n = 16, m = 1..6; one iteration per configuration.
TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55R0, k_eq_4_subtile_m) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t m_val = 1; m_val <= 6; ++m_val) {
    GemmMicrokernelTester()
      .mr(6).nr(16).kr(1).sr(1)
      .m(m_val).n(16).k(4)
      .iterations(1)
      .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55r0, xnn_init_f16_minmax_neon_params);
  }
}

// k = 4, m = 6, n = 1..16; one iteration per configuration.
TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55R0, k_eq_4_subtile_n) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n_val = 1; n_val <= 16; ++n_val) {
    GemmMicrokernelTester()
      .mr(6).nr(16).kr(1).sr(1)
      .m(6).n(n_val).k(4)
      .iterations(1)
      .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55r0, xnn_init_f16_minmax_neon_params);
  }
}

// k = 1..3 at m=6, n=16.
TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55R0, k_lt_4) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k_val = 1; k_val < 4; ++k_val) {
    GemmMicrokernelTester()
      .mr(6).nr(16).kr(1).sr(1)
      .m(6).n(16).k(k_val)
      .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55r0, xnn_init_f16_minmax_neon_params);
  }
}

// k = 1..3 at m=6, n=16, with a_stride set to 7.
TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55R0, k_lt_4_strided_a) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k_val = 1; k_val < 4; ++k_val) {
    GemmMicrokernelTester()
      .mr(6).nr(16).kr(1).sr(1)
      .m(6).n(16).k(k_val)
      .a_stride(7)
      .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55r0, xnn_init_f16_minmax_neon_params);
  }
}

// k = 1..3, n = 1..16, m = 1..6; one iteration per configuration.
TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55R0, k_lt_4_subtile) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k_val = 1; k_val < 4; ++k_val) {
    for (uint32_t n_val = 1; n_val <= 16; ++n_val) {
      for (uint32_t m_val = 1; m_val <= 6; ++m_val) {
        GemmMicrokernelTester()
          .mr(6).nr(16).kr(1).sr(1)
          .m(m_val).n(n_val).k(k_val)
          .iterations(1)
          .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55r0, xnn_init_f16_minmax_neon_params);
      }
    }
  }
}

// k = 5..7 at m=6, n=16.
TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55R0, k_gt_4) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k_val = 5; k_val < 8; ++k_val) {
    GemmMicrokernelTester()
      .mr(6).nr(16).kr(1).sr(1)
      .m(6).n(16).k(k_val)
      .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55r0, xnn_init_f16_minmax_neon_params);
  }
}

// k = 5..7 at m=6, n=16, with a_stride set to 11.
TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55R0, k_gt_4_strided_a) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k_val = 5; k_val < 8; ++k_val) {
    GemmMicrokernelTester()
      .mr(6).nr(16).kr(1).sr(1)
      .m(6).n(16).k(k_val)
      .a_stride(11)
      .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55r0, xnn_init_f16_minmax_neon_params);
  }
}

// k = 5..7, n = 1..16, m = 1..6; one iteration per configuration.
TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55R0, k_gt_4_subtile) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k_val = 5; k_val < 8; ++k_val) {
    for (uint32_t n_val = 1; n_val <= 16; ++n_val) {
      for (uint32_t m_val = 1; m_val <= 6; ++m_val) {
        GemmMicrokernelTester()
          .mr(6).nr(16).kr(1).sr(1)
          .m(m_val).n(n_val).k(k_val)
          .iterations(1)
          .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55r0, xnn_init_f16_minmax_neon_params);
      }
    }
  }
}

// k = 8, 12, ..., 40 at m=6, n=16.
TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55R0, k_div_4) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k_val = 8; k_val <= 40; k_val += 4) {
    GemmMicrokernelTester()
      .mr(6).nr(16).kr(1).sr(1)
      .m(6).n(16).k(k_val)
      .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55r0, xnn_init_f16_minmax_neon_params);
  }
}

// k = 8, 12, ..., 40 at m=6, n=16, with a_stride set to 43.
TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55R0, k_div_4_strided_a) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k_val = 8; k_val <= 40; k_val += 4) {
    GemmMicrokernelTester()
      .mr(6).nr(16).kr(1).sr(1)
      .m(6).n(16).k(k_val)
      .a_stride(43)
      .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55r0, xnn_init_f16_minmax_neon_params);
  }
}

// k = 8, 12, ..., 40, n = 1..16, m = 1..6; one iteration per configuration.
TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55R0, k_div_4_subtile) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k_val = 8; k_val <= 40; k_val += 4) {
    for (uint32_t n_val = 1; n_val <= 16; ++n_val) {
      for (uint32_t m_val = 1; m_val <= 6; ++m_val) {
        GemmMicrokernelTester()
          .mr(6).nr(16).kr(1).sr(1)
          .m(m_val).n(n_val).k(k_val)
          .iterations(1)
          .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55r0, xnn_init_f16_minmax_neon_params);
      }
    }
  }
}

// n = 17..31, k in {1, 6, 11, 16}, m fixed at 6.
TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55R0, n_gt_16) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n_val = 17; n_val < 32; ++n_val) {
    for (size_t k_val = 1; k_val <= 20; k_val += 5) {
      GemmMicrokernelTester()
        .mr(6).nr(16).kr(1).sr(1)
        .m(6).n(n_val).k(k_val)
        .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55r0, xnn_init_f16_minmax_neon_params);
    }
  }
}

// Same sweep as n_gt_16, with cn_stride set to 19.
TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55R0, n_gt_16_strided_cn) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n_val = 17; n_val < 32; ++n_val) {
    for (size_t k_val = 1; k_val <= 20; k_val += 5) {
      GemmMicrokernelTester()
        .mr(6).nr(16).kr(1).sr(1)
        .m(6).n(n_val).k(k_val)
        .cn_stride(19)
        .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55r0, xnn_init_f16_minmax_neon_params);
    }
  }
}

// Same sweep as n_gt_16, with a_stride set to 23.
TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55R0, n_gt_16_strided_a) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n_val = 17; n_val < 32; ++n_val) {
    for (size_t k_val = 1; k_val <= 20; k_val += 5) {
      GemmMicrokernelTester()
        .mr(6).nr(16).kr(1).sr(1)
        .m(6).n(n_val).k(k_val)
        .a_stride(23)
        .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55r0, xnn_init_f16_minmax_neon_params);
    }
  }
}

// n = 17..31, k in {1, 6, 11, 16}, m = 1..6; one iteration per configuration.
TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55R0, n_gt_16_subtile) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n_val = 17; n_val < 32; ++n_val) {
    for (size_t k_val = 1; k_val <= 20; k_val += 5) {
      for (uint32_t m_val = 1; m_val <= 6; ++m_val) {
        GemmMicrokernelTester()
          .mr(6).nr(16).kr(1).sr(1)
          .m(m_val).n(n_val).k(k_val)
          .iterations(1)
          .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55r0, xnn_init_f16_minmax_neon_params);
      }
    }
  }
}

// n in {32, 48}, k in {1, 6, 11, 16}, m fixed at 6.
TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55R0, n_div_16) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n_val = 32; n_val <= 48; n_val += 16) {
    for (size_t k_val = 1; k_val <= 20; k_val += 5) {
      GemmMicrokernelTester()
        .mr(6).nr(16).kr(1).sr(1)
        .m(6).n(n_val).k(k_val)
        .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55r0, xnn_init_f16_minmax_neon_params);
    }
  }
}

// Same sweep as n_div_16, with cn_stride set to 19.
TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55R0, n_div_16_strided_cn) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n_val = 32; n_val <= 48; n_val += 16) {
    for (size_t k_val = 1; k_val <= 20; k_val += 5) {
      GemmMicrokernelTester()
        .mr(6).nr(16).kr(1).sr(1)
        .m(6).n(n_val).k(k_val)
        .cn_stride(19)
        .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55r0, xnn_init_f16_minmax_neon_params);
    }
  }
}
TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55R0, n_div_16_strided_a) { 4029 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4030 for (uint32_t n = 32; n <= 48; n += 16) { 4031 for (size_t k = 1; k <= 20; k += 5) { 4032 GemmMicrokernelTester() 4033 .mr(6) 4034 .nr(16) 4035 .kr(1) 4036 .sr(1) 4037 .m(6) 4038 .n(n) 4039 .k(k) 4040 .a_stride(23) 4041 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55r0, xnn_init_f16_minmax_neon_params); 4042 } 4043 } 4044 } 4045 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55R0,n_div_16_subtile)4046 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55R0, n_div_16_subtile) { 4047 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4048 for (uint32_t n = 32; n <= 48; n += 16) { 4049 for (size_t k = 1; k <= 20; k += 5) { 4050 for (uint32_t m = 1; m <= 6; m++) { 4051 GemmMicrokernelTester() 4052 .mr(6) 4053 .nr(16) 4054 .kr(1) 4055 .sr(1) 4056 .m(m) 4057 .n(n) 4058 .k(k) 4059 .iterations(1) 4060 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55r0, xnn_init_f16_minmax_neon_params); 4061 } 4062 } 4063 } 4064 } 4065 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55R0,strided_cm_subtile)4066 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55R0, strided_cm_subtile) { 4067 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4068 for (size_t k = 1; k <= 20; k += 5) { 4069 for (uint32_t n = 1; n <= 16; n++) { 4070 for (uint32_t m = 1; m <= 6; m++) { 4071 GemmMicrokernelTester() 4072 .mr(6) 4073 .nr(16) 4074 .kr(1) 4075 .sr(1) 4076 .m(m) 4077 .n(n) 4078 .k(k) 4079 .cm_stride(19) 4080 .iterations(1) 4081 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55r0, xnn_init_f16_minmax_neon_params); 4082 } 4083 } 4084 } 4085 } 4086 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55R0,qmin)4087 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55R0, qmin) { 4088 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4089 GemmMicrokernelTester() 4090 .mr(6) 4091 .nr(16) 4092 .kr(1) 4093 .sr(1) 
4094 .m(6) 4095 .n(16) 4096 .k(4) 4097 .qmin(128) 4098 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55r0, xnn_init_f16_minmax_neon_params); 4099 } 4100 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55R0,qmax)4101 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55R0, qmax) { 4102 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4103 GemmMicrokernelTester() 4104 .mr(6) 4105 .nr(16) 4106 .kr(1) 4107 .sr(1) 4108 .m(6) 4109 .n(16) 4110 .k(4) 4111 .qmax(128) 4112 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55r0, xnn_init_f16_minmax_neon_params); 4113 } 4114 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55R0,strided_cm)4115 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55R0, strided_cm) { 4116 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4117 GemmMicrokernelTester() 4118 .mr(6) 4119 .nr(16) 4120 .kr(1) 4121 .sr(1) 4122 .m(6) 4123 .n(16) 4124 .k(4) 4125 .cm_stride(19) 4126 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55r0, xnn_init_f16_minmax_neon_params); 4127 } 4128 #endif // XNN_ENABLE_ARM_FP16 && XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY 4129 4130 4131 #if XNN_ENABLE_ARM_FP16 && XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A75,k_eq_4)4132 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A75, k_eq_4) { 4133 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4134 GemmMicrokernelTester() 4135 .mr(6) 4136 .nr(16) 4137 .kr(1) 4138 .sr(1) 4139 .m(6) 4140 .n(16) 4141 .k(4) 4142 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a75, xnn_init_f16_minmax_neon_params); 4143 } 4144 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A75,strided_cn)4145 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A75, strided_cn) { 4146 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4147 GemmMicrokernelTester() 4148 .mr(6) 4149 .nr(16) 4150 .kr(1) 4151 .sr(1) 4152 .m(6) 4153 .n(16) 4154 .k(4) 4155 .cn_stride(19) 4156 
.Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a75, xnn_init_f16_minmax_neon_params); 4157 } 4158 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A75,k_eq_4_strided_a)4159 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A75, k_eq_4_strided_a) { 4160 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4161 GemmMicrokernelTester() 4162 .mr(6) 4163 .nr(16) 4164 .kr(1) 4165 .sr(1) 4166 .m(6) 4167 .n(16) 4168 .k(4) 4169 .a_stride(7) 4170 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a75, xnn_init_f16_minmax_neon_params); 4171 } 4172 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A75,k_eq_4_subtile)4173 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A75, k_eq_4_subtile) { 4174 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4175 for (uint32_t n = 1; n <= 16; n++) { 4176 for (uint32_t m = 1; m <= 6; m++) { 4177 GemmMicrokernelTester() 4178 .mr(6) 4179 .nr(16) 4180 .kr(1) 4181 .sr(1) 4182 .m(m) 4183 .n(n) 4184 .k(4) 4185 .iterations(1) 4186 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a75, xnn_init_f16_minmax_neon_params); 4187 } 4188 } 4189 } 4190 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A75,k_eq_4_subtile_m)4191 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A75, k_eq_4_subtile_m) { 4192 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4193 for (uint32_t m = 1; m <= 6; m++) { 4194 GemmMicrokernelTester() 4195 .mr(6) 4196 .nr(16) 4197 .kr(1) 4198 .sr(1) 4199 .m(m) 4200 .n(16) 4201 .k(4) 4202 .iterations(1) 4203 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a75, xnn_init_f16_minmax_neon_params); 4204 } 4205 } 4206 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A75,k_eq_4_subtile_n)4207 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A75, k_eq_4_subtile_n) { 4208 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4209 for (uint32_t n = 1; n <= 16; n++) { 4210 GemmMicrokernelTester() 4211 .mr(6) 4212 .nr(16) 4213 .kr(1) 4214 .sr(1) 4215 .m(6) 4216 .n(n) 4217 .k(4) 4218 
.iterations(1) 4219 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a75, xnn_init_f16_minmax_neon_params); 4220 } 4221 } 4222 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A75,k_lt_4)4223 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A75, k_lt_4) { 4224 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4225 for (size_t k = 1; k < 4; k++) { 4226 GemmMicrokernelTester() 4227 .mr(6) 4228 .nr(16) 4229 .kr(1) 4230 .sr(1) 4231 .m(6) 4232 .n(16) 4233 .k(k) 4234 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a75, xnn_init_f16_minmax_neon_params); 4235 } 4236 } 4237 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A75,k_lt_4_strided_a)4238 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A75, k_lt_4_strided_a) { 4239 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4240 for (size_t k = 1; k < 4; k++) { 4241 GemmMicrokernelTester() 4242 .mr(6) 4243 .nr(16) 4244 .kr(1) 4245 .sr(1) 4246 .m(6) 4247 .n(16) 4248 .k(k) 4249 .a_stride(7) 4250 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a75, xnn_init_f16_minmax_neon_params); 4251 } 4252 } 4253 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A75,k_lt_4_subtile)4254 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A75, k_lt_4_subtile) { 4255 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4256 for (size_t k = 1; k < 4; k++) { 4257 for (uint32_t n = 1; n <= 16; n++) { 4258 for (uint32_t m = 1; m <= 6; m++) { 4259 GemmMicrokernelTester() 4260 .mr(6) 4261 .nr(16) 4262 .kr(1) 4263 .sr(1) 4264 .m(m) 4265 .n(n) 4266 .k(k) 4267 .iterations(1) 4268 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a75, xnn_init_f16_minmax_neon_params); 4269 } 4270 } 4271 } 4272 } 4273 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A75,k_gt_4)4274 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A75, k_gt_4) { 4275 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4276 for (size_t k = 5; k < 8; k++) { 4277 GemmMicrokernelTester() 4278 .mr(6) 4279 .nr(16) 4280 .kr(1) 
4281 .sr(1) 4282 .m(6) 4283 .n(16) 4284 .k(k) 4285 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a75, xnn_init_f16_minmax_neon_params); 4286 } 4287 } 4288 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A75,k_gt_4_strided_a)4289 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A75, k_gt_4_strided_a) { 4290 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4291 for (size_t k = 5; k < 8; k++) { 4292 GemmMicrokernelTester() 4293 .mr(6) 4294 .nr(16) 4295 .kr(1) 4296 .sr(1) 4297 .m(6) 4298 .n(16) 4299 .k(k) 4300 .a_stride(11) 4301 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a75, xnn_init_f16_minmax_neon_params); 4302 } 4303 } 4304 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A75,k_gt_4_subtile)4305 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A75, k_gt_4_subtile) { 4306 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4307 for (size_t k = 5; k < 8; k++) { 4308 for (uint32_t n = 1; n <= 16; n++) { 4309 for (uint32_t m = 1; m <= 6; m++) { 4310 GemmMicrokernelTester() 4311 .mr(6) 4312 .nr(16) 4313 .kr(1) 4314 .sr(1) 4315 .m(m) 4316 .n(n) 4317 .k(k) 4318 .iterations(1) 4319 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a75, xnn_init_f16_minmax_neon_params); 4320 } 4321 } 4322 } 4323 } 4324 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A75,k_div_4)4325 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A75, k_div_4) { 4326 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4327 for (size_t k = 8; k <= 40; k += 4) { 4328 GemmMicrokernelTester() 4329 .mr(6) 4330 .nr(16) 4331 .kr(1) 4332 .sr(1) 4333 .m(6) 4334 .n(16) 4335 .k(k) 4336 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a75, xnn_init_f16_minmax_neon_params); 4337 } 4338 } 4339 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A75,k_div_4_strided_a)4340 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A75, k_div_4_strided_a) { 4341 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4342 for (size_t k = 8; k <= 40; k += 4) { 
4343 GemmMicrokernelTester() 4344 .mr(6) 4345 .nr(16) 4346 .kr(1) 4347 .sr(1) 4348 .m(6) 4349 .n(16) 4350 .k(k) 4351 .a_stride(43) 4352 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a75, xnn_init_f16_minmax_neon_params); 4353 } 4354 } 4355 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A75,k_div_4_subtile)4356 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A75, k_div_4_subtile) { 4357 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4358 for (size_t k = 8; k <= 40; k += 4) { 4359 for (uint32_t n = 1; n <= 16; n++) { 4360 for (uint32_t m = 1; m <= 6; m++) { 4361 GemmMicrokernelTester() 4362 .mr(6) 4363 .nr(16) 4364 .kr(1) 4365 .sr(1) 4366 .m(m) 4367 .n(n) 4368 .k(k) 4369 .iterations(1) 4370 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a75, xnn_init_f16_minmax_neon_params); 4371 } 4372 } 4373 } 4374 } 4375 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A75,n_gt_16)4376 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A75, n_gt_16) { 4377 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4378 for (uint32_t n = 17; n < 32; n++) { 4379 for (size_t k = 1; k <= 20; k += 5) { 4380 GemmMicrokernelTester() 4381 .mr(6) 4382 .nr(16) 4383 .kr(1) 4384 .sr(1) 4385 .m(6) 4386 .n(n) 4387 .k(k) 4388 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a75, xnn_init_f16_minmax_neon_params); 4389 } 4390 } 4391 } 4392 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A75,n_gt_16_strided_cn)4393 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A75, n_gt_16_strided_cn) { 4394 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4395 for (uint32_t n = 17; n < 32; n++) { 4396 for (size_t k = 1; k <= 20; k += 5) { 4397 GemmMicrokernelTester() 4398 .mr(6) 4399 .nr(16) 4400 .kr(1) 4401 .sr(1) 4402 .m(6) 4403 .n(n) 4404 .k(k) 4405 .cn_stride(19) 4406 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a75, xnn_init_f16_minmax_neon_params); 4407 } 4408 } 4409 } 4410 
TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A75,n_gt_16_strided_a)4411 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A75, n_gt_16_strided_a) { 4412 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4413 for (uint32_t n = 17; n < 32; n++) { 4414 for (size_t k = 1; k <= 20; k += 5) { 4415 GemmMicrokernelTester() 4416 .mr(6) 4417 .nr(16) 4418 .kr(1) 4419 .sr(1) 4420 .m(6) 4421 .n(n) 4422 .k(k) 4423 .a_stride(23) 4424 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a75, xnn_init_f16_minmax_neon_params); 4425 } 4426 } 4427 } 4428 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A75,n_gt_16_subtile)4429 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A75, n_gt_16_subtile) { 4430 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4431 for (uint32_t n = 17; n < 32; n++) { 4432 for (size_t k = 1; k <= 20; k += 5) { 4433 for (uint32_t m = 1; m <= 6; m++) { 4434 GemmMicrokernelTester() 4435 .mr(6) 4436 .nr(16) 4437 .kr(1) 4438 .sr(1) 4439 .m(m) 4440 .n(n) 4441 .k(k) 4442 .iterations(1) 4443 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a75, xnn_init_f16_minmax_neon_params); 4444 } 4445 } 4446 } 4447 } 4448 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A75,n_div_16)4449 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A75, n_div_16) { 4450 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4451 for (uint32_t n = 32; n <= 48; n += 16) { 4452 for (size_t k = 1; k <= 20; k += 5) { 4453 GemmMicrokernelTester() 4454 .mr(6) 4455 .nr(16) 4456 .kr(1) 4457 .sr(1) 4458 .m(6) 4459 .n(n) 4460 .k(k) 4461 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a75, xnn_init_f16_minmax_neon_params); 4462 } 4463 } 4464 } 4465 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A75,n_div_16_strided_cn)4466 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A75, n_div_16_strided_cn) { 4467 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4468 for (uint32_t n = 32; n <= 48; n += 16) { 4469 for (size_t k = 1; k <= 20; k += 5) { 4470 
GemmMicrokernelTester() 4471 .mr(6) 4472 .nr(16) 4473 .kr(1) 4474 .sr(1) 4475 .m(6) 4476 .n(n) 4477 .k(k) 4478 .cn_stride(19) 4479 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a75, xnn_init_f16_minmax_neon_params); 4480 } 4481 } 4482 } 4483 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A75,n_div_16_strided_a)4484 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A75, n_div_16_strided_a) { 4485 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4486 for (uint32_t n = 32; n <= 48; n += 16) { 4487 for (size_t k = 1; k <= 20; k += 5) { 4488 GemmMicrokernelTester() 4489 .mr(6) 4490 .nr(16) 4491 .kr(1) 4492 .sr(1) 4493 .m(6) 4494 .n(n) 4495 .k(k) 4496 .a_stride(23) 4497 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a75, xnn_init_f16_minmax_neon_params); 4498 } 4499 } 4500 } 4501 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A75,n_div_16_subtile)4502 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A75, n_div_16_subtile) { 4503 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4504 for (uint32_t n = 32; n <= 48; n += 16) { 4505 for (size_t k = 1; k <= 20; k += 5) { 4506 for (uint32_t m = 1; m <= 6; m++) { 4507 GemmMicrokernelTester() 4508 .mr(6) 4509 .nr(16) 4510 .kr(1) 4511 .sr(1) 4512 .m(m) 4513 .n(n) 4514 .k(k) 4515 .iterations(1) 4516 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a75, xnn_init_f16_minmax_neon_params); 4517 } 4518 } 4519 } 4520 } 4521 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A75,strided_cm_subtile)4522 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A75, strided_cm_subtile) { 4523 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4524 for (size_t k = 1; k <= 20; k += 5) { 4525 for (uint32_t n = 1; n <= 16; n++) { 4526 for (uint32_t m = 1; m <= 6; m++) { 4527 GemmMicrokernelTester() 4528 .mr(6) 4529 .nr(16) 4530 .kr(1) 4531 .sr(1) 4532 .m(m) 4533 .n(n) 4534 .k(k) 4535 .cm_stride(19) 4536 .iterations(1) 4537 
.Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a75, xnn_init_f16_minmax_neon_params); 4538 } 4539 } 4540 } 4541 } 4542 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A75,qmin)4543 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A75, qmin) { 4544 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4545 GemmMicrokernelTester() 4546 .mr(6) 4547 .nr(16) 4548 .kr(1) 4549 .sr(1) 4550 .m(6) 4551 .n(16) 4552 .k(4) 4553 .qmin(128) 4554 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a75, xnn_init_f16_minmax_neon_params); 4555 } 4556 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A75,qmax)4557 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A75, qmax) { 4558 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4559 GemmMicrokernelTester() 4560 .mr(6) 4561 .nr(16) 4562 .kr(1) 4563 .sr(1) 4564 .m(6) 4565 .n(16) 4566 .k(4) 4567 .qmax(128) 4568 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a75, xnn_init_f16_minmax_neon_params); 4569 } 4570 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A75,strided_cm)4571 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A75, strided_cm) { 4572 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4573 GemmMicrokernelTester() 4574 .mr(6) 4575 .nr(16) 4576 .kr(1) 4577 .sr(1) 4578 .m(6) 4579 .n(16) 4580 .k(4) 4581 .cm_stride(19) 4582 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a75, xnn_init_f16_minmax_neon_params); 4583 } 4584 #endif // XNN_ENABLE_ARM_FP16 && XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY 4585 4586 4587 #if XNN_ENABLE_ARM_FP16 && XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32,k_eq_2)4588 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32, k_eq_2) { 4589 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4590 GemmMicrokernelTester() 4591 .mr(6) 4592 .nr(16) 4593 .kr(1) 4594 .sr(1) 4595 .m(6) 4596 .n(16) 4597 .k(2) 4598 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 4599 
} 4600 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32,strided_cn)4601 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32, strided_cn) { 4602 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4603 GemmMicrokernelTester() 4604 .mr(6) 4605 .nr(16) 4606 .kr(1) 4607 .sr(1) 4608 .m(6) 4609 .n(16) 4610 .k(2) 4611 .cn_stride(19) 4612 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 4613 } 4614 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32,k_eq_2_strided_a)4615 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32, k_eq_2_strided_a) { 4616 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4617 GemmMicrokernelTester() 4618 .mr(6) 4619 .nr(16) 4620 .kr(1) 4621 .sr(1) 4622 .m(6) 4623 .n(16) 4624 .k(2) 4625 .a_stride(5) 4626 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 4627 } 4628 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32,k_eq_2_subtile)4629 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32, k_eq_2_subtile) { 4630 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4631 for (uint32_t n = 1; n <= 16; n++) { 4632 for (uint32_t m = 1; m <= 6; m++) { 4633 GemmMicrokernelTester() 4634 .mr(6) 4635 .nr(16) 4636 .kr(1) 4637 .sr(1) 4638 .m(m) 4639 .n(n) 4640 .k(2) 4641 .iterations(1) 4642 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 4643 } 4644 } 4645 } 4646 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32,k_eq_2_subtile_m)4647 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32, k_eq_2_subtile_m) { 4648 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4649 for (uint32_t m = 1; m <= 6; m++) { 4650 GemmMicrokernelTester() 4651 .mr(6) 4652 .nr(16) 4653 .kr(1) 4654 .sr(1) 4655 .m(m) 4656 .n(16) 4657 .k(2) 4658 .iterations(1) 4659 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 4660 } 4661 } 4662 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32,k_eq_2_subtile_n)4663 
// k = 2, m = 6, every n in 1..16; one iteration per case.
TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32, k_eq_2_subtile_n) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n = 1; n <= 16; n++) {
    GemmMicrokernelTester()
      .mr(6)
      .nr(16)
      .kr(1)
      .sr(1)
      .m(6)
      .n(n)
      .k(2)
      .iterations(1)
      .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params);
  }
}

// Full 6x16 tile for k in [1, 2), i.e. only k = 1.
TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32, k_lt_2) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k = 1; k < 2; k++) {
    GemmMicrokernelTester()
      .mr(6)
      .nr(16)
      .kr(1)
      .sr(1)
      .m(6)
      .n(16)
      .k(k)
      .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params);
  }
}

// Full 6x16 tile for k = 1, with a_stride(5).
TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32, k_lt_2_strided_a) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k = 1; k < 2; k++) {
    GemmMicrokernelTester()
      .mr(6)
      .nr(16)
      .kr(1)
      .sr(1)
      .m(6)
      .n(16)
      .k(k)
      .a_stride(5)
      .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params);
  }
}

// k = 1, every n in 1..16 and m in 1..6; one iteration per case.
TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32, k_lt_2_subtile) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k = 1; k < 2; k++) {
    for (uint32_t n = 1; n <= 16; n++) {
      for (uint32_t m = 1; m <= 6; m++) {
        GemmMicrokernelTester()
          .mr(6)
          .nr(16)
          .kr(1)
          .sr(1)
          .m(m)
          .n(n)
          .k(k)
          .iterations(1)
          .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params);
      }
    }
  }
}

// Full 6x16 tile for k in [3, 4), i.e. only k = 3.
TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32, k_gt_2) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k = 3; k < 4; k++) {
    GemmMicrokernelTester()
      .mr(6)
      .nr(16)
      .kr(1)
      .sr(1)
      .m(6)
      .n(16)
      .k(k)
      .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params);
  }
}

// Full 6x16 tile for k = 3, with a_stride(7).
TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32, k_gt_2_strided_a) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k = 3; k < 4; k++) {
    GemmMicrokernelTester()
      .mr(6)
      .nr(16)
      .kr(1)
      .sr(1)
      .m(6)
      .n(16)
      .k(k)
      .a_stride(7)
      .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params);
  }
}

// k = 3, every n in 1..16 and m in 1..6; one iteration per case.
TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32, k_gt_2_subtile) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k = 3; k < 4; k++) {
    for (uint32_t n = 1; n <= 16; n++) {
      for (uint32_t m = 1; m <= 6; m++) {
        GemmMicrokernelTester()
          .mr(6)
          .nr(16)
          .kr(1)
          .sr(1)
          .m(m)
          .n(n)
          .k(k)
          .iterations(1)
          .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params);
      }
    }
  }
}

// Full 6x16 tile for even k from 4 to 20.
TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32, k_div_2) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k = 4; k <= 20; k += 2) {
    GemmMicrokernelTester()
      .mr(6)
      .nr(16)
      .kr(1)
      .sr(1)
      .m(6)
      .n(16)
      .k(k)
      .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params);
  }
}

TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32, k_div_2_strided_a) { 4797 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4798 for (size_t k = 4; k <= 20; k += 2) { 4799 GemmMicrokernelTester() 4800 .mr(6) 4801 .nr(16) 4802 .kr(1) 4803 .sr(1) 4804 .m(6) 4805 .n(16) 4806 .k(k) 4807 .a_stride(23) 4808 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 4809 } 4810 } 4811 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32,k_div_2_subtile)4812 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32, k_div_2_subtile) { 4813 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4814 for (size_t k = 4; k <= 20; k += 2) { 4815 for (uint32_t n = 1; n <= 16; n++) { 4816 for (uint32_t m = 1; m <= 6; m++) { 4817 GemmMicrokernelTester() 4818 .mr(6) 4819 .nr(16) 4820 .kr(1) 4821 .sr(1) 4822 .m(m) 4823 .n(n) 4824 .k(k) 4825 .iterations(1) 4826 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 4827 } 4828 } 4829 } 4830 } 4831 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32,n_gt_16)4832 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32, n_gt_16) { 4833 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4834 for (uint32_t n = 17; n < 32; n++) { 4835 for (size_t k = 1; k <= 10; k += 3) { 4836 GemmMicrokernelTester() 4837 .mr(6) 4838 .nr(16) 4839 .kr(1) 4840 .sr(1) 4841 .m(6) 4842 .n(n) 4843 .k(k) 4844 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 4845 } 4846 } 4847 } 4848 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32,n_gt_16_strided_cn)4849 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32, n_gt_16_strided_cn) { 4850 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4851 for (uint32_t n = 17; n < 32; n++) { 4852 for (size_t k = 1; k <= 10; k += 3) { 4853 GemmMicrokernelTester() 4854 .mr(6) 4855 .nr(16) 4856 .kr(1) 4857 .sr(1) 4858 .m(6) 4859 .n(n) 4860 .k(k) 4861 .cn_stride(19) 4862 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32, 
xnn_init_f16_minmax_neon_params); 4863 } 4864 } 4865 } 4866 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32,n_gt_16_strided_a)4867 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32, n_gt_16_strided_a) { 4868 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4869 for (uint32_t n = 17; n < 32; n++) { 4870 for (size_t k = 1; k <= 10; k += 3) { 4871 GemmMicrokernelTester() 4872 .mr(6) 4873 .nr(16) 4874 .kr(1) 4875 .sr(1) 4876 .m(6) 4877 .n(n) 4878 .k(k) 4879 .a_stride(13) 4880 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 4881 } 4882 } 4883 } 4884 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32,n_gt_16_subtile)4885 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32, n_gt_16_subtile) { 4886 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4887 for (uint32_t n = 17; n < 32; n++) { 4888 for (size_t k = 1; k <= 10; k += 3) { 4889 for (uint32_t m = 1; m <= 6; m++) { 4890 GemmMicrokernelTester() 4891 .mr(6) 4892 .nr(16) 4893 .kr(1) 4894 .sr(1) 4895 .m(m) 4896 .n(n) 4897 .k(k) 4898 .iterations(1) 4899 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 4900 } 4901 } 4902 } 4903 } 4904 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32,n_div_16)4905 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32, n_div_16) { 4906 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4907 for (uint32_t n = 32; n <= 48; n += 16) { 4908 for (size_t k = 1; k <= 10; k += 3) { 4909 GemmMicrokernelTester() 4910 .mr(6) 4911 .nr(16) 4912 .kr(1) 4913 .sr(1) 4914 .m(6) 4915 .n(n) 4916 .k(k) 4917 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 4918 } 4919 } 4920 } 4921 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32,n_div_16_strided_cn)4922 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32, n_div_16_strided_cn) { 4923 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4924 for (uint32_t n = 32; n <= 48; n += 16) { 4925 for (size_t k = 1; k <= 10; k += 3) { 4926 
GemmMicrokernelTester() 4927 .mr(6) 4928 .nr(16) 4929 .kr(1) 4930 .sr(1) 4931 .m(6) 4932 .n(n) 4933 .k(k) 4934 .cn_stride(19) 4935 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 4936 } 4937 } 4938 } 4939 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32,n_div_16_strided_a)4940 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32, n_div_16_strided_a) { 4941 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4942 for (uint32_t n = 32; n <= 48; n += 16) { 4943 for (size_t k = 1; k <= 10; k += 3) { 4944 GemmMicrokernelTester() 4945 .mr(6) 4946 .nr(16) 4947 .kr(1) 4948 .sr(1) 4949 .m(6) 4950 .n(n) 4951 .k(k) 4952 .a_stride(13) 4953 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 4954 } 4955 } 4956 } 4957 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32,n_div_16_subtile)4958 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32, n_div_16_subtile) { 4959 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4960 for (uint32_t n = 32; n <= 48; n += 16) { 4961 for (size_t k = 1; k <= 10; k += 3) { 4962 for (uint32_t m = 1; m <= 6; m++) { 4963 GemmMicrokernelTester() 4964 .mr(6) 4965 .nr(16) 4966 .kr(1) 4967 .sr(1) 4968 .m(m) 4969 .n(n) 4970 .k(k) 4971 .iterations(1) 4972 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 4973 } 4974 } 4975 } 4976 } 4977 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32,strided_cm_subtile)4978 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32, strided_cm_subtile) { 4979 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4980 for (size_t k = 1; k <= 10; k += 3) { 4981 for (uint32_t n = 1; n <= 16; n++) { 4982 for (uint32_t m = 1; m <= 6; m++) { 4983 GemmMicrokernelTester() 4984 .mr(6) 4985 .nr(16) 4986 .kr(1) 4987 .sr(1) 4988 .m(m) 4989 .n(n) 4990 .k(k) 4991 .cm_stride(19) 4992 .iterations(1) 4993 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 4994 } 4995 } 4996 } 
4997 } 4998 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32,qmin)4999 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32, qmin) { 5000 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5001 GemmMicrokernelTester() 5002 .mr(6) 5003 .nr(16) 5004 .kr(1) 5005 .sr(1) 5006 .m(6) 5007 .n(16) 5008 .k(2) 5009 .qmin(128) 5010 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 5011 } 5012 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32,qmax)5013 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32, qmax) { 5014 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5015 GemmMicrokernelTester() 5016 .mr(6) 5017 .nr(16) 5018 .kr(1) 5019 .sr(1) 5020 .m(6) 5021 .n(16) 5022 .k(2) 5023 .qmax(128) 5024 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 5025 } 5026 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32,strided_cm)5027 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32, strided_cm) { 5028 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5029 GemmMicrokernelTester() 5030 .mr(6) 5031 .nr(16) 5032 .kr(1) 5033 .sr(1) 5034 .m(6) 5035 .n(16) 5036 .k(2) 5037 .cm_stride(19) 5038 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 5039 } 5040 #endif // XNN_ENABLE_ARM_FP16 && XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY 5041 5042 5043 #if XNN_ENABLE_ARM_FP16 && XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD64,k_eq_4)5044 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD64, k_eq_4) { 5045 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5046 GemmMicrokernelTester() 5047 .mr(6) 5048 .nr(16) 5049 .kr(1) 5050 .sr(1) 5051 .m(6) 5052 .n(16) 5053 .k(4) 5054 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 5055 } 5056 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD64,strided_cn)5057 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD64, strided_cn) { 5058 
TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5059 GemmMicrokernelTester() 5060 .mr(6) 5061 .nr(16) 5062 .kr(1) 5063 .sr(1) 5064 .m(6) 5065 .n(16) 5066 .k(4) 5067 .cn_stride(19) 5068 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 5069 } 5070 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD64,k_eq_4_strided_a)5071 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD64, k_eq_4_strided_a) { 5072 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5073 GemmMicrokernelTester() 5074 .mr(6) 5075 .nr(16) 5076 .kr(1) 5077 .sr(1) 5078 .m(6) 5079 .n(16) 5080 .k(4) 5081 .a_stride(7) 5082 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 5083 } 5084 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD64,k_eq_4_subtile)5085 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD64, k_eq_4_subtile) { 5086 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5087 for (uint32_t n = 1; n <= 16; n++) { 5088 for (uint32_t m = 1; m <= 6; m++) { 5089 GemmMicrokernelTester() 5090 .mr(6) 5091 .nr(16) 5092 .kr(1) 5093 .sr(1) 5094 .m(m) 5095 .n(n) 5096 .k(4) 5097 .iterations(1) 5098 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 5099 } 5100 } 5101 } 5102 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD64,k_eq_4_subtile_m)5103 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD64, k_eq_4_subtile_m) { 5104 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5105 for (uint32_t m = 1; m <= 6; m++) { 5106 GemmMicrokernelTester() 5107 .mr(6) 5108 .nr(16) 5109 .kr(1) 5110 .sr(1) 5111 .m(m) 5112 .n(16) 5113 .k(4) 5114 .iterations(1) 5115 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 5116 } 5117 } 5118 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD64,k_eq_4_subtile_n)5119 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD64, k_eq_4_subtile_n) { 5120 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5121 for (uint32_t n = 1; n <= 16; n++) { 5122 
GemmMicrokernelTester() 5123 .mr(6) 5124 .nr(16) 5125 .kr(1) 5126 .sr(1) 5127 .m(6) 5128 .n(n) 5129 .k(4) 5130 .iterations(1) 5131 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 5132 } 5133 } 5134 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD64,k_lt_4)5135 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD64, k_lt_4) { 5136 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5137 for (size_t k = 1; k < 4; k++) { 5138 GemmMicrokernelTester() 5139 .mr(6) 5140 .nr(16) 5141 .kr(1) 5142 .sr(1) 5143 .m(6) 5144 .n(16) 5145 .k(k) 5146 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 5147 } 5148 } 5149 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD64,k_lt_4_strided_a)5150 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD64, k_lt_4_strided_a) { 5151 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5152 for (size_t k = 1; k < 4; k++) { 5153 GemmMicrokernelTester() 5154 .mr(6) 5155 .nr(16) 5156 .kr(1) 5157 .sr(1) 5158 .m(6) 5159 .n(16) 5160 .k(k) 5161 .a_stride(7) 5162 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 5163 } 5164 } 5165 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD64,k_lt_4_subtile)5166 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD64, k_lt_4_subtile) { 5167 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5168 for (size_t k = 1; k < 4; k++) { 5169 for (uint32_t n = 1; n <= 16; n++) { 5170 for (uint32_t m = 1; m <= 6; m++) { 5171 GemmMicrokernelTester() 5172 .mr(6) 5173 .nr(16) 5174 .kr(1) 5175 .sr(1) 5176 .m(m) 5177 .n(n) 5178 .k(k) 5179 .iterations(1) 5180 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 5181 } 5182 } 5183 } 5184 } 5185 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD64,k_gt_4)5186 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD64, k_gt_4) { 5187 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5188 for (size_t k = 5; k < 8; k++) { 5189 
GemmMicrokernelTester() 5190 .mr(6) 5191 .nr(16) 5192 .kr(1) 5193 .sr(1) 5194 .m(6) 5195 .n(16) 5196 .k(k) 5197 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 5198 } 5199 } 5200 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD64,k_gt_4_strided_a)5201 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD64, k_gt_4_strided_a) { 5202 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5203 for (size_t k = 5; k < 8; k++) { 5204 GemmMicrokernelTester() 5205 .mr(6) 5206 .nr(16) 5207 .kr(1) 5208 .sr(1) 5209 .m(6) 5210 .n(16) 5211 .k(k) 5212 .a_stride(11) 5213 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 5214 } 5215 } 5216 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD64,k_gt_4_subtile)5217 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD64, k_gt_4_subtile) { 5218 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5219 for (size_t k = 5; k < 8; k++) { 5220 for (uint32_t n = 1; n <= 16; n++) { 5221 for (uint32_t m = 1; m <= 6; m++) { 5222 GemmMicrokernelTester() 5223 .mr(6) 5224 .nr(16) 5225 .kr(1) 5226 .sr(1) 5227 .m(m) 5228 .n(n) 5229 .k(k) 5230 .iterations(1) 5231 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 5232 } 5233 } 5234 } 5235 } 5236 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD64,k_div_4)5237 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD64, k_div_4) { 5238 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5239 for (size_t k = 8; k <= 40; k += 4) { 5240 GemmMicrokernelTester() 5241 .mr(6) 5242 .nr(16) 5243 .kr(1) 5244 .sr(1) 5245 .m(6) 5246 .n(16) 5247 .k(k) 5248 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 5249 } 5250 } 5251 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD64,k_div_4_strided_a)5252 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD64, k_div_4_strided_a) { 5253 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5254 for (size_t k = 8; k <= 40; k += 4) { 5255 
GemmMicrokernelTester() 5256 .mr(6) 5257 .nr(16) 5258 .kr(1) 5259 .sr(1) 5260 .m(6) 5261 .n(16) 5262 .k(k) 5263 .a_stride(43) 5264 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 5265 } 5266 } 5267 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD64,k_div_4_subtile)5268 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD64, k_div_4_subtile) { 5269 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5270 for (size_t k = 8; k <= 40; k += 4) { 5271 for (uint32_t n = 1; n <= 16; n++) { 5272 for (uint32_t m = 1; m <= 6; m++) { 5273 GemmMicrokernelTester() 5274 .mr(6) 5275 .nr(16) 5276 .kr(1) 5277 .sr(1) 5278 .m(m) 5279 .n(n) 5280 .k(k) 5281 .iterations(1) 5282 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 5283 } 5284 } 5285 } 5286 } 5287 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD64,n_gt_16)5288 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD64, n_gt_16) { 5289 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5290 for (uint32_t n = 17; n < 32; n++) { 5291 for (size_t k = 1; k <= 20; k += 5) { 5292 GemmMicrokernelTester() 5293 .mr(6) 5294 .nr(16) 5295 .kr(1) 5296 .sr(1) 5297 .m(6) 5298 .n(n) 5299 .k(k) 5300 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 5301 } 5302 } 5303 } 5304 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD64,n_gt_16_strided_cn)5305 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD64, n_gt_16_strided_cn) { 5306 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5307 for (uint32_t n = 17; n < 32; n++) { 5308 for (size_t k = 1; k <= 20; k += 5) { 5309 GemmMicrokernelTester() 5310 .mr(6) 5311 .nr(16) 5312 .kr(1) 5313 .sr(1) 5314 .m(6) 5315 .n(n) 5316 .k(k) 5317 .cn_stride(19) 5318 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 5319 } 5320 } 5321 } 5322 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD64,n_gt_16_strided_a)5323 
TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD64, n_gt_16_strided_a) { 5324 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5325 for (uint32_t n = 17; n < 32; n++) { 5326 for (size_t k = 1; k <= 20; k += 5) { 5327 GemmMicrokernelTester() 5328 .mr(6) 5329 .nr(16) 5330 .kr(1) 5331 .sr(1) 5332 .m(6) 5333 .n(n) 5334 .k(k) 5335 .a_stride(23) 5336 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 5337 } 5338 } 5339 } 5340 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD64,n_gt_16_subtile)5341 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD64, n_gt_16_subtile) { 5342 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5343 for (uint32_t n = 17; n < 32; n++) { 5344 for (size_t k = 1; k <= 20; k += 5) { 5345 for (uint32_t m = 1; m <= 6; m++) { 5346 GemmMicrokernelTester() 5347 .mr(6) 5348 .nr(16) 5349 .kr(1) 5350 .sr(1) 5351 .m(m) 5352 .n(n) 5353 .k(k) 5354 .iterations(1) 5355 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 5356 } 5357 } 5358 } 5359 } 5360 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD64,n_div_16)5361 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD64, n_div_16) { 5362 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5363 for (uint32_t n = 32; n <= 48; n += 16) { 5364 for (size_t k = 1; k <= 20; k += 5) { 5365 GemmMicrokernelTester() 5366 .mr(6) 5367 .nr(16) 5368 .kr(1) 5369 .sr(1) 5370 .m(6) 5371 .n(n) 5372 .k(k) 5373 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 5374 } 5375 } 5376 } 5377 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD64,n_div_16_strided_cn)5378 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD64, n_div_16_strided_cn) { 5379 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5380 for (uint32_t n = 32; n <= 48; n += 16) { 5381 for (size_t k = 1; k <= 20; k += 5) { 5382 GemmMicrokernelTester() 5383 .mr(6) 5384 .nr(16) 5385 .kr(1) 5386 .sr(1) 5387 .m(6) 5388 .n(n) 5389 .k(k) 5390 .cn_stride(19) 5391 
.Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 5392 } 5393 } 5394 } 5395 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD64,n_div_16_strided_a)5396 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD64, n_div_16_strided_a) { 5397 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5398 for (uint32_t n = 32; n <= 48; n += 16) { 5399 for (size_t k = 1; k <= 20; k += 5) { 5400 GemmMicrokernelTester() 5401 .mr(6) 5402 .nr(16) 5403 .kr(1) 5404 .sr(1) 5405 .m(6) 5406 .n(n) 5407 .k(k) 5408 .a_stride(23) 5409 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 5410 } 5411 } 5412 } 5413 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD64,n_div_16_subtile)5414 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD64, n_div_16_subtile) { 5415 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5416 for (uint32_t n = 32; n <= 48; n += 16) { 5417 for (size_t k = 1; k <= 20; k += 5) { 5418 for (uint32_t m = 1; m <= 6; m++) { 5419 GemmMicrokernelTester() 5420 .mr(6) 5421 .nr(16) 5422 .kr(1) 5423 .sr(1) 5424 .m(m) 5425 .n(n) 5426 .k(k) 5427 .iterations(1) 5428 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 5429 } 5430 } 5431 } 5432 } 5433 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD64,strided_cm_subtile)5434 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD64, strided_cm_subtile) { 5435 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5436 for (size_t k = 1; k <= 20; k += 5) { 5437 for (uint32_t n = 1; n <= 16; n++) { 5438 for (uint32_t m = 1; m <= 6; m++) { 5439 GemmMicrokernelTester() 5440 .mr(6) 5441 .nr(16) 5442 .kr(1) 5443 .sr(1) 5444 .m(m) 5445 .n(n) 5446 .k(k) 5447 .cm_stride(19) 5448 .iterations(1) 5449 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 5450 } 5451 } 5452 } 5453 } 5454 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD64,qmin)5455 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD64, 
qmin) { 5456 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5457 GemmMicrokernelTester() 5458 .mr(6) 5459 .nr(16) 5460 .kr(1) 5461 .sr(1) 5462 .m(6) 5463 .n(16) 5464 .k(4) 5465 .qmin(128) 5466 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 5467 } 5468 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD64,qmax)5469 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD64, qmax) { 5470 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5471 GemmMicrokernelTester() 5472 .mr(6) 5473 .nr(16) 5474 .kr(1) 5475 .sr(1) 5476 .m(6) 5477 .n(16) 5478 .k(4) 5479 .qmax(128) 5480 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 5481 } 5482 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD64,strided_cm)5483 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD64, strided_cm) { 5484 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5485 GemmMicrokernelTester() 5486 .mr(6) 5487 .nr(16) 5488 .kr(1) 5489 .sr(1) 5490 .m(6) 5491 .n(16) 5492 .k(4) 5493 .cm_stride(19) 5494 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 5495 } 5496 #endif // XNN_ENABLE_ARM_FP16 && XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY 5497 5498 5499 #if XNN_ENABLE_ARM_FP16 && XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64,k_eq_4)5500 TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64, k_eq_4) { 5501 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5502 GemmMicrokernelTester() 5503 .mr(8) 5504 .nr(8) 5505 .kr(1) 5506 .sr(1) 5507 .m(8) 5508 .n(8) 5509 .k(4) 5510 .Test(xnn_f16_gemm_minmax_ukernel_8x8__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 5511 } 5512 TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64,strided_cn)5513 TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64, strided_cn) { 5514 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5515 GemmMicrokernelTester() 5516 .mr(8) 5517 .nr(8) 5518 .kr(1) 5519 .sr(1) 5520 .m(8) 5521 .n(8) 5522 .k(4) 5523 .cn_stride(11) 5524 
.Test(xnn_f16_gemm_minmax_ukernel_8x8__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 5525 } 5526 TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64,k_eq_4_strided_a)5527 TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64, k_eq_4_strided_a) { 5528 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5529 GemmMicrokernelTester() 5530 .mr(8) 5531 .nr(8) 5532 .kr(1) 5533 .sr(1) 5534 .m(8) 5535 .n(8) 5536 .k(4) 5537 .a_stride(7) 5538 .Test(xnn_f16_gemm_minmax_ukernel_8x8__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 5539 } 5540 TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64,k_eq_4_subtile)5541 TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64, k_eq_4_subtile) { 5542 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5543 for (uint32_t n = 1; n <= 8; n++) { 5544 for (uint32_t m = 1; m <= 8; m++) { 5545 GemmMicrokernelTester() 5546 .mr(8) 5547 .nr(8) 5548 .kr(1) 5549 .sr(1) 5550 .m(m) 5551 .n(n) 5552 .k(4) 5553 .iterations(1) 5554 .Test(xnn_f16_gemm_minmax_ukernel_8x8__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 5555 } 5556 } 5557 } 5558 TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64,k_eq_4_subtile_m)5559 TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64, k_eq_4_subtile_m) { 5560 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5561 for (uint32_t m = 1; m <= 8; m++) { 5562 GemmMicrokernelTester() 5563 .mr(8) 5564 .nr(8) 5565 .kr(1) 5566 .sr(1) 5567 .m(m) 5568 .n(8) 5569 .k(4) 5570 .iterations(1) 5571 .Test(xnn_f16_gemm_minmax_ukernel_8x8__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 5572 } 5573 } 5574 TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64,k_eq_4_subtile_n)5575 TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64, k_eq_4_subtile_n) { 5576 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5577 for (uint32_t n = 1; n <= 8; n++) { 5578 GemmMicrokernelTester() 5579 .mr(8) 5580 .nr(8) 5581 .kr(1) 5582 .sr(1) 5583 .m(8) 5584 .n(n) 5585 .k(4) 5586 .iterations(1) 5587 .Test(xnn_f16_gemm_minmax_ukernel_8x8__aarch64_neonfp16arith_ld64, 
xnn_init_f16_minmax_neon_params); 5588 } 5589 } 5590 TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64,k_lt_4)5591 TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64, k_lt_4) { 5592 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5593 for (size_t k = 1; k < 4; k++) { 5594 GemmMicrokernelTester() 5595 .mr(8) 5596 .nr(8) 5597 .kr(1) 5598 .sr(1) 5599 .m(8) 5600 .n(8) 5601 .k(k) 5602 .Test(xnn_f16_gemm_minmax_ukernel_8x8__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 5603 } 5604 } 5605 TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64,k_lt_4_strided_a)5606 TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64, k_lt_4_strided_a) { 5607 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5608 for (size_t k = 1; k < 4; k++) { 5609 GemmMicrokernelTester() 5610 .mr(8) 5611 .nr(8) 5612 .kr(1) 5613 .sr(1) 5614 .m(8) 5615 .n(8) 5616 .k(k) 5617 .a_stride(7) 5618 .Test(xnn_f16_gemm_minmax_ukernel_8x8__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 5619 } 5620 } 5621 TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64,k_lt_4_subtile)5622 TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64, k_lt_4_subtile) { 5623 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5624 for (size_t k = 1; k < 4; k++) { 5625 for (uint32_t n = 1; n <= 8; n++) { 5626 for (uint32_t m = 1; m <= 8; m++) { 5627 GemmMicrokernelTester() 5628 .mr(8) 5629 .nr(8) 5630 .kr(1) 5631 .sr(1) 5632 .m(m) 5633 .n(n) 5634 .k(k) 5635 .iterations(1) 5636 .Test(xnn_f16_gemm_minmax_ukernel_8x8__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 5637 } 5638 } 5639 } 5640 } 5641 TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64,k_gt_4)5642 TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64, k_gt_4) { 5643 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5644 for (size_t k = 5; k < 8; k++) { 5645 GemmMicrokernelTester() 5646 .mr(8) 5647 .nr(8) 5648 .kr(1) 5649 .sr(1) 5650 .m(8) 5651 .n(8) 5652 .k(k) 5653 .Test(xnn_f16_gemm_minmax_ukernel_8x8__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 5654 } 5655 } 5656 
// k = 5..7 with m = 8, n = 8 and a_stride(11).
TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64, k_gt_4_strided_a) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k = 5; k < 8; k++) {
    GemmMicrokernelTester()
      .mr(8).nr(8).kr(1).sr(1)
      .m(8).n(8).k(k)
      .a_stride(11)
      .Test(xnn_f16_gemm_minmax_ukernel_8x8__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
  }
}

// k = 5..7 crossed with every n in 1..8 and m in 1..8, one iteration each.
TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64, k_gt_4_subtile) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k = 5; k < 8; k++) {
    for (uint32_t n = 1; n <= 8; n++) {
      for (uint32_t m = 1; m <= 8; m++) {
        GemmMicrokernelTester()
          .mr(8).nr(8).kr(1).sr(1)
          .m(m).n(n).k(k)
          .iterations(1)
          .Test(xnn_f16_gemm_minmax_ukernel_8x8__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
      }
    }
  }
}

// k = 8, 12, ..., 40 with m = 8, n = 8.
TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64, k_div_4) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k = 8; k <= 40; k += 4) {
    GemmMicrokernelTester()
      .mr(8).nr(8).kr(1).sr(1)
      .m(8).n(8).k(k)
      .Test(xnn_f16_gemm_minmax_ukernel_8x8__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
  }
}

// k = 8, 12, ..., 40 with m = 8, n = 8 and a_stride(43).
TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64, k_div_4_strided_a) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k = 8; k <= 40; k += 4) {
    GemmMicrokernelTester()
      .mr(8).nr(8).kr(1).sr(1)
      .m(8).n(8).k(k)
      .a_stride(43)
      .Test(xnn_f16_gemm_minmax_ukernel_8x8__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
  }
}

// k = 8, 12, ..., 40 crossed with every n in 1..8 and m in 1..8, one iteration each.
TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64, k_div_4_subtile) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k = 8; k <= 40; k += 4) {
    for (uint32_t n = 1; n <= 8; n++) {
      for (uint32_t m = 1; m <= 8; m++) {
        GemmMicrokernelTester()
          .mr(8).nr(8).kr(1).sr(1)
          .m(m).n(n).k(k)
          .iterations(1)
          .Test(xnn_f16_gemm_minmax_ukernel_8x8__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
      }
    }
  }
}

// n = 9..15 crossed with k = 1, 6, 11, 16; m = 8.
TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64, n_gt_8) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n = 9; n < 16; n++) {
    for (size_t k = 1; k <= 20; k += 5) {
      GemmMicrokernelTester()
        .mr(8).nr(8).kr(1).sr(1)
        .m(8).n(n).k(k)
        .Test(xnn_f16_gemm_minmax_ukernel_8x8__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
    }
  }
}

// n = 9..15 crossed with k = 1, 6, 11, 16; m = 8 and cn_stride(11).
TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64, n_gt_8_strided_cn) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n = 9; n < 16; n++) {
    for (size_t k = 1; k <= 20; k += 5) {
      GemmMicrokernelTester()
        .mr(8).nr(8).kr(1).sr(1)
        .m(8).n(n).k(k)
        .cn_stride(11)
        .Test(xnn_f16_gemm_minmax_ukernel_8x8__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
    }
  }
}

// n = 9..15 crossed with k = 1, 6, 11, 16; m = 8 and a_stride(23).
TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64, n_gt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n = 9; n < 16; n++) {
    for (size_t k = 1; k <= 20; k += 5) {
      GemmMicrokernelTester()
        .mr(8).nr(8).kr(1).sr(1)
        .m(8).n(n).k(k)
        .a_stride(23)
        .Test(xnn_f16_gemm_minmax_ukernel_8x8__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
    }
  }
}

// n = 9..15 crossed with k = 1, 6, 11, 16 and every m in 1..8, one iteration each.
TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64, n_gt_8_subtile) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n = 9; n < 16; n++) {
    for (size_t k = 1; k <= 20; k += 5) {
      for (uint32_t m = 1; m <= 8; m++) {
        GemmMicrokernelTester()
          .mr(8).nr(8).kr(1).sr(1)
          .m(m).n(n).k(k)
          .iterations(1)
          .Test(xnn_f16_gemm_minmax_ukernel_8x8__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
      }
    }
  }
}

// n = 16, 24 crossed with k = 1, 6, 11, 16; m = 8.
TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64, n_div_8) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n = 16; n <= 24; n += 8) {
    for (size_t k = 1; k <= 20; k += 5) {
      GemmMicrokernelTester()
        .mr(8).nr(8).kr(1).sr(1)
        .m(8).n(n).k(k)
        .Test(xnn_f16_gemm_minmax_ukernel_8x8__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
    }
  }
}

// n = 16, 24 crossed with k = 1, 6, 11, 16; m = 8 and cn_stride(11).
TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64, n_div_8_strided_cn) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n = 16; n <= 24; n += 8) {
    for (size_t k = 1; k <= 20; k += 5) {
      GemmMicrokernelTester()
        .mr(8).nr(8).kr(1).sr(1)
        .m(8).n(n).k(k)
        .cn_stride(11)
        .Test(xnn_f16_gemm_minmax_ukernel_8x8__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
    }
  }
}

// n = 16, 24 crossed with k = 1, 6, 11, 16; m = 8 and a_stride(23).
TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64, n_div_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n = 16; n <= 24; n += 8) {
    for (size_t k = 1; k <= 20; k += 5) {
      GemmMicrokernelTester()
        .mr(8).nr(8).kr(1).sr(1)
        .m(8).n(n).k(k)
        .a_stride(23)
        .Test(xnn_f16_gemm_minmax_ukernel_8x8__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
    }
  }
}

// n = 16, 24 crossed with k = 1, 6, 11, 16 and every m in 1..8, one iteration each.
TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64, n_div_8_subtile) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n = 16; n <= 24; n += 8) {
    for (size_t k = 1; k <= 20; k += 5) {
      for (uint32_t m = 1; m <= 8; m++) {
        GemmMicrokernelTester()
          .mr(8).nr(8).kr(1).sr(1)
          .m(m).n(n).k(k)
          .iterations(1)
          .Test(xnn_f16_gemm_minmax_ukernel_8x8__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
      }
    }
  }
}

// k = 1, 6, 11, 16 crossed with every n in 1..8 and m in 1..8, with cm_stride(11).
TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64, strided_cm_subtile) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k = 1; k <= 20; k += 5) {
    for (uint32_t n = 1; n <= 8; n++) {
      for (uint32_t m = 1; m <= 8; m++) {
        GemmMicrokernelTester()
          .mr(8).nr(8).kr(1).sr(1)
          .m(m).n(n).k(k)
          .cm_stride(11)
          .iterations(1)
          .Test(xnn_f16_gemm_minmax_ukernel_8x8__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
      }
    }
  }
}

// m = 8, n = 8, k = 4 with qmin(128).
TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64, qmin) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  GemmMicrokernelTester()
    .mr(8).nr(8).kr(1).sr(1)
    .m(8).n(8).k(4)
    .qmin(128)
    .Test(xnn_f16_gemm_minmax_ukernel_8x8__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
}

TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64, qmax) { 5926 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5927 GemmMicrokernelTester() 5928 .mr(8) 5929 .nr(8) 5930 .kr(1) 5931 .sr(1) 5932 .m(8) 5933 .n(8) 5934 .k(4) 5935 .qmax(128) 5936 .Test(xnn_f16_gemm_minmax_ukernel_8x8__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 5937 } 5938 TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64,strided_cm)5939 TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64, strided_cm) { 5940 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5941 GemmMicrokernelTester() 5942 .mr(8) 5943 .nr(8) 5944 .kr(1) 5945 .sr(1) 5946 .m(8) 5947 .n(8) 5948 .k(4) 5949 .cm_stride(11) 5950 .Test(xnn_f16_gemm_minmax_ukernel_8x8__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 5951 } 5952 #endif // XNN_ENABLE_ARM_FP16 && XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY 5953 5954 5955 #if XNN_ENABLE_ARM_FP16 && (XNN_ARCH_ARM || XNN_ARCH_ARM64) TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64,k_eq_4)5956 TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64, k_eq_4) { 5957 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5958 GemmMicrokernelTester() 5959 .mr(1) 5960 .nr(8) 5961 .kr(1) 5962 .sr(1) 5963 .m(1) 5964 .n(8) 5965 .k(4) 5966 .Test(xnn_f16_gemm_minmax_ukernel_1x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 5967 } 5968 TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64,strided_cn)5969 TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64, strided_cn) { 5970 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5971 GemmMicrokernelTester() 5972 .mr(1) 5973 .nr(8) 5974 .kr(1) 5975 .sr(1) 5976 .m(1) 5977 .n(8) 5978 .k(4) 5979 .cn_stride(11) 5980 .Test(xnn_f16_gemm_minmax_ukernel_1x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 5981 } 5982 TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64,k_eq_4_strided_a)5983 TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64, k_eq_4_strided_a) { 5984 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5985 GemmMicrokernelTester() 5986 .mr(1) 5987 .nr(8) 5988 .kr(1) 5989 .sr(1) 5990 .m(1) 5991 .n(8) 5992 .k(4) 5993 .a_stride(7) 5994 
.Test(xnn_f16_gemm_minmax_ukernel_1x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 5995 } 5996 TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64,k_eq_4_subtile)5997 TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64, k_eq_4_subtile) { 5998 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5999 for (uint32_t n = 1; n <= 8; n++) { 6000 for (uint32_t m = 1; m <= 1; m++) { 6001 GemmMicrokernelTester() 6002 .mr(1) 6003 .nr(8) 6004 .kr(1) 6005 .sr(1) 6006 .m(m) 6007 .n(n) 6008 .k(4) 6009 .iterations(1) 6010 .Test(xnn_f16_gemm_minmax_ukernel_1x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 6011 } 6012 } 6013 } 6014 TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64,k_eq_4_subtile_m)6015 TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64, k_eq_4_subtile_m) { 6016 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 6017 for (uint32_t m = 1; m <= 1; m++) { 6018 GemmMicrokernelTester() 6019 .mr(1) 6020 .nr(8) 6021 .kr(1) 6022 .sr(1) 6023 .m(m) 6024 .n(8) 6025 .k(4) 6026 .iterations(1) 6027 .Test(xnn_f16_gemm_minmax_ukernel_1x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 6028 } 6029 } 6030 TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64,k_eq_4_subtile_n)6031 TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64, k_eq_4_subtile_n) { 6032 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 6033 for (uint32_t n = 1; n <= 8; n++) { 6034 GemmMicrokernelTester() 6035 .mr(1) 6036 .nr(8) 6037 .kr(1) 6038 .sr(1) 6039 .m(1) 6040 .n(n) 6041 .k(4) 6042 .iterations(1) 6043 .Test(xnn_f16_gemm_minmax_ukernel_1x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 6044 } 6045 } 6046 TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64,k_lt_4)6047 TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64, k_lt_4) { 6048 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 6049 for (size_t k = 1; k < 4; k++) { 6050 GemmMicrokernelTester() 6051 .mr(1) 6052 .nr(8) 6053 .kr(1) 6054 .sr(1) 6055 .m(1) 6056 .n(8) 6057 .k(k) 6058 .Test(xnn_f16_gemm_minmax_ukernel_1x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 6059 } 6060 } 6061 
// k = 1..3 with m = 1, n = 8 and a_stride(7).
TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64, k_lt_4_strided_a) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k = 1; k < 4; k++) {
    GemmMicrokernelTester()
      .mr(1).nr(8).kr(1).sr(1)
      .m(1).n(8).k(k)
      .a_stride(7)
      .Test(xnn_f16_gemm_minmax_ukernel_1x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
  }
}

// k = 1..3 crossed with every n in 1..8 and m in 1..1, one iteration each.
TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64, k_lt_4_subtile) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k = 1; k < 4; k++) {
    for (uint32_t n = 1; n <= 8; n++) {
      for (uint32_t m = 1; m <= 1; m++) {
        GemmMicrokernelTester()
          .mr(1).nr(8).kr(1).sr(1)
          .m(m).n(n).k(k)
          .iterations(1)
          .Test(xnn_f16_gemm_minmax_ukernel_1x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
      }
    }
  }
}

// k = 5..7 with m = 1, n = 8.
TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64, k_gt_4) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k = 5; k < 8; k++) {
    GemmMicrokernelTester()
      .mr(1).nr(8).kr(1).sr(1)
      .m(1).n(8).k(k)
      .Test(xnn_f16_gemm_minmax_ukernel_1x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
  }
}

// k = 5..7 with m = 1, n = 8 and a_stride(11).
TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64, k_gt_4_strided_a) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k = 5; k < 8; k++) {
    GemmMicrokernelTester()
      .mr(1).nr(8).kr(1).sr(1)
      .m(1).n(8).k(k)
      .a_stride(11)
      .Test(xnn_f16_gemm_minmax_ukernel_1x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
  }
}

// k = 5..7 crossed with every n in 1..8 and m in 1..1, one iteration each.
TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64, k_gt_4_subtile) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k = 5; k < 8; k++) {
    for (uint32_t n = 1; n <= 8; n++) {
      for (uint32_t m = 1; m <= 1; m++) {
        GemmMicrokernelTester()
          .mr(1).nr(8).kr(1).sr(1)
          .m(m).n(n).k(k)
          .iterations(1)
          .Test(xnn_f16_gemm_minmax_ukernel_1x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
      }
    }
  }
}

// k = 8, 12, ..., 40 with m = 1, n = 8.
TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64, k_div_4) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k = 8; k <= 40; k += 4) {
    GemmMicrokernelTester()
      .mr(1).nr(8).kr(1).sr(1)
      .m(1).n(8).k(k)
      .Test(xnn_f16_gemm_minmax_ukernel_1x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
  }
}

// k = 8, 12, ..., 40 with m = 1, n = 8 and a_stride(43).
TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64, k_div_4_strided_a) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k = 8; k <= 40; k += 4) {
    GemmMicrokernelTester()
      .mr(1).nr(8).kr(1).sr(1)
      .m(1).n(8).k(k)
      .a_stride(43)
      .Test(xnn_f16_gemm_minmax_ukernel_1x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
  }
}

// k = 8, 12, ..., 40 crossed with every n in 1..8 and m in 1..1, one iteration each.
TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64, k_div_4_subtile) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k = 8; k <= 40; k += 4) {
    for (uint32_t n = 1; n <= 8; n++) {
      for (uint32_t m = 1; m <= 1; m++) {
        GemmMicrokernelTester()
          .mr(1).nr(8).kr(1).sr(1)
          .m(m).n(n).k(k)
          .iterations(1)
          .Test(xnn_f16_gemm_minmax_ukernel_1x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
      }
    }
  }
}

// n = 9..15 crossed with k = 1, 6, 11, 16; m = 1.
TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64, n_gt_8) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n = 9; n < 16; n++) {
    for (size_t k = 1; k <= 20; k += 5) {
      GemmMicrokernelTester()
        .mr(1).nr(8).kr(1).sr(1)
        .m(1).n(n).k(k)
        .Test(xnn_f16_gemm_minmax_ukernel_1x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
    }
  }
}

// n = 9..15 crossed with k = 1, 6, 11, 16; m = 1 and cn_stride(11).
TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64, n_gt_8_strided_cn) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n = 9; n < 16; n++) {
    for (size_t k = 1; k <= 20; k += 5) {
      GemmMicrokernelTester()
        .mr(1).nr(8).kr(1).sr(1)
        .m(1).n(n).k(k)
        .cn_stride(11)
        .Test(xnn_f16_gemm_minmax_ukernel_1x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
    }
  }
}

// n = 9..15 crossed with k = 1, 6, 11, 16; m = 1 and a_stride(23).
TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64, n_gt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n = 9; n < 16; n++) {
    for (size_t k = 1; k <= 20; k += 5) {
      GemmMicrokernelTester()
        .mr(1).nr(8).kr(1).sr(1)
        .m(1).n(n).k(k)
        .a_stride(23)
        .Test(xnn_f16_gemm_minmax_ukernel_1x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
    }
  }
}

// n = 9..15 crossed with k = 1, 6, 11, 16 and m in 1..1, one iteration each.
TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64, n_gt_8_subtile) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n = 9; n < 16; n++) {
    for (size_t k = 1; k <= 20; k += 5) {
      for (uint32_t m = 1; m <= 1; m++) {
        GemmMicrokernelTester()
          .mr(1).nr(8).kr(1).sr(1)
          .m(m).n(n).k(k)
          .iterations(1)
          .Test(xnn_f16_gemm_minmax_ukernel_1x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
      }
    }
  }
}

TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64, n_div_8) { 6274 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 6275 for (uint32_t n = 16; n <= 24; n += 8) { 6276 for (size_t k = 1; k <= 20; k += 5) { 6277 GemmMicrokernelTester() 6278 .mr(1) 6279 .nr(8) 6280 .kr(1) 6281 .sr(1) 6282 .m(1) 6283 .n(n) 6284 .k(k) 6285 .Test(xnn_f16_gemm_minmax_ukernel_1x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 6286 } 6287 } 6288 } 6289 TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64,n_div_8_strided_cn)6290 TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64, n_div_8_strided_cn) { 6291 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 6292 for (uint32_t n = 16; n <= 24; n += 8) { 6293 for (size_t k = 1; k <= 20; k += 5) { 6294 GemmMicrokernelTester() 6295 .mr(1) 6296 .nr(8) 6297 .kr(1) 6298 .sr(1) 6299 .m(1) 6300 .n(n) 6301 .k(k) 6302 .cn_stride(11) 6303 .Test(xnn_f16_gemm_minmax_ukernel_1x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 6304 } 6305 } 6306 } 6307 TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64,n_div_8_strided_a)6308 TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64, n_div_8_strided_a) { 6309 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 6310 for (uint32_t n = 16; n <= 24; n += 8) { 6311 for (size_t k = 1; k <= 20; k += 5) { 6312 GemmMicrokernelTester() 6313 .mr(1) 6314 .nr(8) 6315 .kr(1) 6316 .sr(1) 6317 .m(1) 6318 .n(n) 6319 .k(k) 6320 .a_stride(23) 6321 .Test(xnn_f16_gemm_minmax_ukernel_1x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 6322 } 6323 } 6324 } 6325 TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64,n_div_8_subtile)6326 TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64, n_div_8_subtile) { 6327 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 6328 for (uint32_t n = 16; n <= 24; n += 8) { 6329 for (size_t k = 1; k <= 20; k += 5) { 6330 for (uint32_t m = 1; m <= 1; m++) { 6331 GemmMicrokernelTester() 6332 .mr(1) 6333 .nr(8) 6334 .kr(1) 6335 .sr(1) 6336 .m(m) 6337 .n(n) 6338 .k(k) 6339 .iterations(1) 6340 .Test(xnn_f16_gemm_minmax_ukernel_1x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 6341 } 6342 } 
6343 } 6344 } 6345 TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64,strided_cm_subtile)6346 TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64, strided_cm_subtile) { 6347 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 6348 for (size_t k = 1; k <= 20; k += 5) { 6349 for (uint32_t n = 1; n <= 8; n++) { 6350 for (uint32_t m = 1; m <= 1; m++) { 6351 GemmMicrokernelTester() 6352 .mr(1) 6353 .nr(8) 6354 .kr(1) 6355 .sr(1) 6356 .m(m) 6357 .n(n) 6358 .k(k) 6359 .cm_stride(11) 6360 .iterations(1) 6361 .Test(xnn_f16_gemm_minmax_ukernel_1x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 6362 } 6363 } 6364 } 6365 } 6366 TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64,qmin)6367 TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64, qmin) { 6368 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 6369 GemmMicrokernelTester() 6370 .mr(1) 6371 .nr(8) 6372 .kr(1) 6373 .sr(1) 6374 .m(1) 6375 .n(8) 6376 .k(4) 6377 .qmin(128) 6378 .Test(xnn_f16_gemm_minmax_ukernel_1x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 6379 } 6380 TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64,qmax)6381 TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64, qmax) { 6382 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 6383 GemmMicrokernelTester() 6384 .mr(1) 6385 .nr(8) 6386 .kr(1) 6387 .sr(1) 6388 .m(1) 6389 .n(8) 6390 .k(4) 6391 .qmax(128) 6392 .Test(xnn_f16_gemm_minmax_ukernel_1x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 6393 } 6394 TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64,strided_cm)6395 TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64, strided_cm) { 6396 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 6397 GemmMicrokernelTester() 6398 .mr(1) 6399 .nr(8) 6400 .kr(1) 6401 .sr(1) 6402 .m(1) 6403 .n(8) 6404 .k(4) 6405 .cm_stride(11) 6406 .Test(xnn_f16_gemm_minmax_ukernel_1x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 6407 } 6408 #endif // XNN_ENABLE_ARM_FP16 && (XNN_ARCH_ARM || XNN_ARCH_ARM64) 6409 6410 6411 #if XNN_ENABLE_ARM_FP16 && (XNN_ARCH_ARM || XNN_ARCH_ARM64) TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64,k_eq_4)6412 
// m = 1, n = 16, k = 4.
TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64, k_eq_4) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  GemmMicrokernelTester()
    .mr(1).nr(16).kr(1).sr(1)
    .m(1).n(16).k(4)
    .Test(xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
}

// m = 1, n = 16, k = 4 with cn_stride(19).
TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64, strided_cn) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  GemmMicrokernelTester()
    .mr(1).nr(16).kr(1).sr(1)
    .m(1).n(16).k(4)
    .cn_stride(19)
    .Test(xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
}

// m = 1, n = 16, k = 4 with a_stride(7).
TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64, k_eq_4_strided_a) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  GemmMicrokernelTester()
    .mr(1).nr(16).kr(1).sr(1)
    .m(1).n(16).k(4)
    .a_stride(7)
    .Test(xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
}

// k = 4 crossed with every n in 1..16 and m in 1..1, one iteration each.
TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64, k_eq_4_subtile) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n = 1; n <= 16; n++) {
    for (uint32_t m = 1; m <= 1; m++) {
      GemmMicrokernelTester()
        .mr(1).nr(16).kr(1).sr(1)
        .m(m).n(n).k(4)
        .iterations(1)
        .Test(xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
    }
  }
}

// k = 4, n = 16, m swept over 1..1, one iteration each.
TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64, k_eq_4_subtile_m) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t m = 1; m <= 1; m++) {
    GemmMicrokernelTester()
      .mr(1).nr(16).kr(1).sr(1)
      .m(m).n(16).k(4)
      .iterations(1)
      .Test(xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
  }
}

// k = 4, m = 1, n swept over 1..16, one iteration each.
TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64, k_eq_4_subtile_n) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n = 1; n <= 16; n++) {
    GemmMicrokernelTester()
      .mr(1).nr(16).kr(1).sr(1)
      .m(1).n(n).k(4)
      .iterations(1)
      .Test(xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
  }
}

// k = 1..3 with m = 1, n = 16.
TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64, k_lt_4) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k = 1; k < 4; k++) {
    GemmMicrokernelTester()
      .mr(1).nr(16).kr(1).sr(1)
      .m(1).n(16).k(k)
      .Test(xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
  }
}

// k = 1..3 with m = 1, n = 16 and a_stride(7).
TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64, k_lt_4_strided_a) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k = 1; k < 4; k++) {
    GemmMicrokernelTester()
      .mr(1).nr(16).kr(1).sr(1)
      .m(1).n(16).k(k)
      .a_stride(7)
      .Test(xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
  }
}

// k = 1..3 crossed with every n in 1..16 and m in 1..1, one iteration each.
TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64, k_lt_4_subtile) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k = 1; k < 4; k++) {
    for (uint32_t n = 1; n <= 16; n++) {
      for (uint32_t m = 1; m <= 1; m++) {
        GemmMicrokernelTester()
          .mr(1).nr(16).kr(1).sr(1)
          .m(m).n(n).k(k)
          .iterations(1)
          .Test(xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
      }
    }
  }
}

// k = 5..7 with m = 1, n = 16.
TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64, k_gt_4) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k = 5; k < 8; k++) {
    GemmMicrokernelTester()
      .mr(1).nr(16).kr(1).sr(1)
      .m(1).n(16).k(k)
      .Test(xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
  }
}

// k = 5..7 with m = 1, n = 16 and a_stride(11).
TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64, k_gt_4_strided_a) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k = 5; k < 8; k++) {
    GemmMicrokernelTester()
      .mr(1).nr(16).kr(1).sr(1)
      .m(1).n(16).k(k)
      .a_stride(11)
      .Test(xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
  }
}

// k = 5..7 crossed with every n in 1..16 and m in 1..1, one iteration each.
TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64, k_gt_4_subtile) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k = 5; k < 8; k++) {
    for (uint32_t n = 1; n <= 16; n++) {
      for (uint32_t m = 1; m <= 1; m++) {
        GemmMicrokernelTester()
          .mr(1).nr(16).kr(1).sr(1)
          .m(m).n(n).k(k)
          .iterations(1)
          .Test(xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
      }
    }
  }
}

// k = 8, 12, ..., 40 with m = 1, n = 16.
TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64, k_div_4) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k = 8; k <= 40; k += 4) {
    GemmMicrokernelTester()
      .mr(1).nr(16).kr(1).sr(1)
      .m(1).n(16).k(k)
      .Test(xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
  }
}

// k = 8, 12, ..., 40 with m = 1, n = 16 and a_stride(43).
TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64, k_div_4_strided_a) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k = 8; k <= 40; k += 4) {
    GemmMicrokernelTester()
      .mr(1).nr(16).kr(1).sr(1)
      .m(1).n(16).k(k)
      .a_stride(43)
      .Test(xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
  }
}

// k = 8, 12, ..., 40 crossed with every n in 1..16 and m in 1..1, one iteration each.
TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64, k_div_4_subtile) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k = 8; k <= 40; k += 4) {
    for (uint32_t n = 1; n <= 16; n++) {
      for (uint32_t m = 1; m <= 1; m++) {
        GemmMicrokernelTester()
          .mr(1).nr(16).kr(1).sr(1)
          .m(m).n(n).k(k)
          .iterations(1)
          .Test(xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
      }
    }
  }
}

// n = 17..31 crossed with k = 1, 6, 11, 16; m = 1.
TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64, n_gt_16) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n = 17; n < 32; n++) {
    for (size_t k = 1; k <= 20; k += 5) {
      GemmMicrokernelTester()
        .mr(1).nr(16).kr(1).sr(1)
        .m(1).n(n).k(k)
        .Test(xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
    }
  }
}

// n = 17..31 crossed with k = 1, 6, 11, 16; m = 1 and cn_stride(19).
TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64, n_gt_16_strided_cn) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n = 17; n < 32; n++) {
    for (size_t k = 1; k <= 20; k += 5) {
      GemmMicrokernelTester()
        .mr(1).nr(16).kr(1).sr(1)
        .m(1).n(n).k(k)
        .cn_stride(19)
        .Test(xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
    }
  }
}

TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64,n_gt_16_strided_a)6691 TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64, n_gt_16_strided_a) { 6692 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 6693 for (uint32_t n = 17; n < 32; n++) { 6694 for (size_t k = 1; k <= 20; k += 5) { 6695 GemmMicrokernelTester() 6696 .mr(1) 6697 .nr(16) 6698 .kr(1) 6699 .sr(1) 6700 .m(1) 6701 .n(n) 6702 .k(k) 6703 .a_stride(23) 6704 .Test(xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 6705 } 6706 } 6707 } 6708 TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64,n_gt_16_subtile)6709 TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64, n_gt_16_subtile) { 6710 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 6711 for (uint32_t n = 17; n < 32; n++) { 6712 for (size_t k = 1; k <= 20; k += 5) { 6713 for (uint32_t m = 1; m <= 1; m++) { 6714 GemmMicrokernelTester() 6715 .mr(1) 6716 .nr(16) 6717 .kr(1) 6718 .sr(1) 6719 .m(m) 6720 .n(n) 6721 .k(k) 6722 .iterations(1) 6723 .Test(xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 6724 } 6725 } 6726 } 6727 } 6728 TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64,n_div_16)6729 TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64, n_div_16) { 6730 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 6731 for (uint32_t n = 32; n <= 48; n += 16) { 6732 for (size_t k = 1; k <= 20; k += 5) { 6733 GemmMicrokernelTester() 6734 .mr(1) 6735 .nr(16) 6736 .kr(1) 6737 .sr(1) 6738 .m(1) 6739 .n(n) 6740 .k(k) 6741 .Test(xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 6742 } 6743 } 6744 } 6745 TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64,n_div_16_strided_cn)6746 TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64, n_div_16_strided_cn) { 6747 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 6748 for (uint32_t n = 32; n <= 48; n += 16) { 6749 for (size_t k = 1; k <= 20; k += 5) { 6750 GemmMicrokernelTester() 6751 .mr(1) 6752 .nr(16) 6753 .kr(1) 6754 .sr(1) 6755 .m(1) 6756 .n(n) 6757 .k(k) 6758 .cn_stride(19) 6759 
.Test(xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 6760 } 6761 } 6762 } 6763 TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64,n_div_16_strided_a)6764 TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64, n_div_16_strided_a) { 6765 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 6766 for (uint32_t n = 32; n <= 48; n += 16) { 6767 for (size_t k = 1; k <= 20; k += 5) { 6768 GemmMicrokernelTester() 6769 .mr(1) 6770 .nr(16) 6771 .kr(1) 6772 .sr(1) 6773 .m(1) 6774 .n(n) 6775 .k(k) 6776 .a_stride(23) 6777 .Test(xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 6778 } 6779 } 6780 } 6781 TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64,n_div_16_subtile)6782 TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64, n_div_16_subtile) { 6783 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 6784 for (uint32_t n = 32; n <= 48; n += 16) { 6785 for (size_t k = 1; k <= 20; k += 5) { 6786 for (uint32_t m = 1; m <= 1; m++) { 6787 GemmMicrokernelTester() 6788 .mr(1) 6789 .nr(16) 6790 .kr(1) 6791 .sr(1) 6792 .m(m) 6793 .n(n) 6794 .k(k) 6795 .iterations(1) 6796 .Test(xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 6797 } 6798 } 6799 } 6800 } 6801 TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64,strided_cm_subtile)6802 TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64, strided_cm_subtile) { 6803 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 6804 for (size_t k = 1; k <= 20; k += 5) { 6805 for (uint32_t n = 1; n <= 16; n++) { 6806 for (uint32_t m = 1; m <= 1; m++) { 6807 GemmMicrokernelTester() 6808 .mr(1) 6809 .nr(16) 6810 .kr(1) 6811 .sr(1) 6812 .m(m) 6813 .n(n) 6814 .k(k) 6815 .cm_stride(19) 6816 .iterations(1) 6817 .Test(xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 6818 } 6819 } 6820 } 6821 } 6822 TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64,qmin)6823 TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64, qmin) { 6824 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 6825 GemmMicrokernelTester() 6826 .mr(1) 6827 
.nr(16) 6828 .kr(1) 6829 .sr(1) 6830 .m(1) 6831 .n(16) 6832 .k(4) 6833 .qmin(128) 6834 .Test(xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 6835 } 6836 TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64,qmax)6837 TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64, qmax) { 6838 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 6839 GemmMicrokernelTester() 6840 .mr(1) 6841 .nr(16) 6842 .kr(1) 6843 .sr(1) 6844 .m(1) 6845 .n(16) 6846 .k(4) 6847 .qmax(128) 6848 .Test(xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 6849 } 6850 TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64,strided_cm)6851 TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64, strided_cm) { 6852 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 6853 GemmMicrokernelTester() 6854 .mr(1) 6855 .nr(16) 6856 .kr(1) 6857 .sr(1) 6858 .m(1) 6859 .n(16) 6860 .k(4) 6861 .cm_stride(19) 6862 .Test(xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 6863 } 6864 #endif // XNN_ENABLE_ARM_FP16 && (XNN_ARCH_ARM || XNN_ARCH_ARM64) 6865 6866 6867 #if XNN_ENABLE_ARM_FP16 && (XNN_ARCH_ARM || XNN_ARCH_ARM64) TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64,k_eq_4)6868 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64, k_eq_4) { 6869 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 6870 GemmMicrokernelTester() 6871 .mr(4) 6872 .nr(8) 6873 .kr(1) 6874 .sr(1) 6875 .m(4) 6876 .n(8) 6877 .k(4) 6878 .Test(xnn_f16_gemm_minmax_ukernel_4x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 6879 } 6880 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64,strided_cn)6881 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64, strided_cn) { 6882 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 6883 GemmMicrokernelTester() 6884 .mr(4) 6885 .nr(8) 6886 .kr(1) 6887 .sr(1) 6888 .m(4) 6889 .n(8) 6890 .k(4) 6891 .cn_stride(11) 6892 .Test(xnn_f16_gemm_minmax_ukernel_4x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 6893 } 6894 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64,k_eq_4_strided_a)6895 
TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64, k_eq_4_strided_a) { 6896 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 6897 GemmMicrokernelTester() 6898 .mr(4) 6899 .nr(8) 6900 .kr(1) 6901 .sr(1) 6902 .m(4) 6903 .n(8) 6904 .k(4) 6905 .a_stride(7) 6906 .Test(xnn_f16_gemm_minmax_ukernel_4x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 6907 } 6908 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64,k_eq_4_subtile)6909 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64, k_eq_4_subtile) { 6910 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 6911 for (uint32_t n = 1; n <= 8; n++) { 6912 for (uint32_t m = 1; m <= 4; m++) { 6913 GemmMicrokernelTester() 6914 .mr(4) 6915 .nr(8) 6916 .kr(1) 6917 .sr(1) 6918 .m(m) 6919 .n(n) 6920 .k(4) 6921 .iterations(1) 6922 .Test(xnn_f16_gemm_minmax_ukernel_4x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 6923 } 6924 } 6925 } 6926 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64,k_eq_4_subtile_m)6927 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64, k_eq_4_subtile_m) { 6928 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 6929 for (uint32_t m = 1; m <= 4; m++) { 6930 GemmMicrokernelTester() 6931 .mr(4) 6932 .nr(8) 6933 .kr(1) 6934 .sr(1) 6935 .m(m) 6936 .n(8) 6937 .k(4) 6938 .iterations(1) 6939 .Test(xnn_f16_gemm_minmax_ukernel_4x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 6940 } 6941 } 6942 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64,k_eq_4_subtile_n)6943 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64, k_eq_4_subtile_n) { 6944 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 6945 for (uint32_t n = 1; n <= 8; n++) { 6946 GemmMicrokernelTester() 6947 .mr(4) 6948 .nr(8) 6949 .kr(1) 6950 .sr(1) 6951 .m(4) 6952 .n(n) 6953 .k(4) 6954 .iterations(1) 6955 .Test(xnn_f16_gemm_minmax_ukernel_4x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 6956 } 6957 } 6958 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64,k_lt_4)6959 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64, k_lt_4) { 6960 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 6961 for (size_t k = 1; k < 4; k++) { 6962 GemmMicrokernelTester() 6963 
.mr(4) 6964 .nr(8) 6965 .kr(1) 6966 .sr(1) 6967 .m(4) 6968 .n(8) 6969 .k(k) 6970 .Test(xnn_f16_gemm_minmax_ukernel_4x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 6971 } 6972 } 6973 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64,k_lt_4_strided_a)6974 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64, k_lt_4_strided_a) { 6975 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 6976 for (size_t k = 1; k < 4; k++) { 6977 GemmMicrokernelTester() 6978 .mr(4) 6979 .nr(8) 6980 .kr(1) 6981 .sr(1) 6982 .m(4) 6983 .n(8) 6984 .k(k) 6985 .a_stride(7) 6986 .Test(xnn_f16_gemm_minmax_ukernel_4x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 6987 } 6988 } 6989 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64,k_lt_4_subtile)6990 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64, k_lt_4_subtile) { 6991 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 6992 for (size_t k = 1; k < 4; k++) { 6993 for (uint32_t n = 1; n <= 8; n++) { 6994 for (uint32_t m = 1; m <= 4; m++) { 6995 GemmMicrokernelTester() 6996 .mr(4) 6997 .nr(8) 6998 .kr(1) 6999 .sr(1) 7000 .m(m) 7001 .n(n) 7002 .k(k) 7003 .iterations(1) 7004 .Test(xnn_f16_gemm_minmax_ukernel_4x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 7005 } 7006 } 7007 } 7008 } 7009 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64,k_gt_4)7010 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64, k_gt_4) { 7011 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 7012 for (size_t k = 5; k < 8; k++) { 7013 GemmMicrokernelTester() 7014 .mr(4) 7015 .nr(8) 7016 .kr(1) 7017 .sr(1) 7018 .m(4) 7019 .n(8) 7020 .k(k) 7021 .Test(xnn_f16_gemm_minmax_ukernel_4x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 7022 } 7023 } 7024 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64,k_gt_4_strided_a)7025 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64, k_gt_4_strided_a) { 7026 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 7027 for (size_t k = 5; k < 8; k++) { 7028 GemmMicrokernelTester() 7029 .mr(4) 7030 .nr(8) 7031 .kr(1) 7032 .sr(1) 7033 .m(4) 7034 .n(8) 7035 .k(k) 7036 .a_stride(11) 7037 
.Test(xnn_f16_gemm_minmax_ukernel_4x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 7038 } 7039 } 7040 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64,k_gt_4_subtile)7041 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64, k_gt_4_subtile) { 7042 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 7043 for (size_t k = 5; k < 8; k++) { 7044 for (uint32_t n = 1; n <= 8; n++) { 7045 for (uint32_t m = 1; m <= 4; m++) { 7046 GemmMicrokernelTester() 7047 .mr(4) 7048 .nr(8) 7049 .kr(1) 7050 .sr(1) 7051 .m(m) 7052 .n(n) 7053 .k(k) 7054 .iterations(1) 7055 .Test(xnn_f16_gemm_minmax_ukernel_4x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 7056 } 7057 } 7058 } 7059 } 7060 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64,k_div_4)7061 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64, k_div_4) { 7062 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 7063 for (size_t k = 8; k <= 40; k += 4) { 7064 GemmMicrokernelTester() 7065 .mr(4) 7066 .nr(8) 7067 .kr(1) 7068 .sr(1) 7069 .m(4) 7070 .n(8) 7071 .k(k) 7072 .Test(xnn_f16_gemm_minmax_ukernel_4x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 7073 } 7074 } 7075 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64,k_div_4_strided_a)7076 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64, k_div_4_strided_a) { 7077 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 7078 for (size_t k = 8; k <= 40; k += 4) { 7079 GemmMicrokernelTester() 7080 .mr(4) 7081 .nr(8) 7082 .kr(1) 7083 .sr(1) 7084 .m(4) 7085 .n(8) 7086 .k(k) 7087 .a_stride(43) 7088 .Test(xnn_f16_gemm_minmax_ukernel_4x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 7089 } 7090 } 7091 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64,k_div_4_subtile)7092 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64, k_div_4_subtile) { 7093 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 7094 for (size_t k = 8; k <= 40; k += 4) { 7095 for (uint32_t n = 1; n <= 8; n++) { 7096 for (uint32_t m = 1; m <= 4; m++) { 7097 GemmMicrokernelTester() 7098 .mr(4) 7099 .nr(8) 7100 .kr(1) 7101 .sr(1) 7102 .m(m) 7103 .n(n) 7104 .k(k) 7105 .iterations(1) 7106 
.Test(xnn_f16_gemm_minmax_ukernel_4x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 7107 } 7108 } 7109 } 7110 } 7111 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64,n_gt_8)7112 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64, n_gt_8) { 7113 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 7114 for (uint32_t n = 9; n < 16; n++) { 7115 for (size_t k = 1; k <= 20; k += 5) { 7116 GemmMicrokernelTester() 7117 .mr(4) 7118 .nr(8) 7119 .kr(1) 7120 .sr(1) 7121 .m(4) 7122 .n(n) 7123 .k(k) 7124 .Test(xnn_f16_gemm_minmax_ukernel_4x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 7125 } 7126 } 7127 } 7128 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64,n_gt_8_strided_cn)7129 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64, n_gt_8_strided_cn) { 7130 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 7131 for (uint32_t n = 9; n < 16; n++) { 7132 for (size_t k = 1; k <= 20; k += 5) { 7133 GemmMicrokernelTester() 7134 .mr(4) 7135 .nr(8) 7136 .kr(1) 7137 .sr(1) 7138 .m(4) 7139 .n(n) 7140 .k(k) 7141 .cn_stride(11) 7142 .Test(xnn_f16_gemm_minmax_ukernel_4x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 7143 } 7144 } 7145 } 7146 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64,n_gt_8_strided_a)7147 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64, n_gt_8_strided_a) { 7148 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 7149 for (uint32_t n = 9; n < 16; n++) { 7150 for (size_t k = 1; k <= 20; k += 5) { 7151 GemmMicrokernelTester() 7152 .mr(4) 7153 .nr(8) 7154 .kr(1) 7155 .sr(1) 7156 .m(4) 7157 .n(n) 7158 .k(k) 7159 .a_stride(23) 7160 .Test(xnn_f16_gemm_minmax_ukernel_4x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 7161 } 7162 } 7163 } 7164 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64,n_gt_8_subtile)7165 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64, n_gt_8_subtile) { 7166 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 7167 for (uint32_t n = 9; n < 16; n++) { 7168 for (size_t k = 1; k <= 20; k += 5) { 7169 for (uint32_t m = 1; m <= 4; m++) { 7170 GemmMicrokernelTester() 7171 .mr(4) 7172 .nr(8) 7173 .kr(1) 7174 .sr(1) 7175 
.m(m) 7176 .n(n) 7177 .k(k) 7178 .iterations(1) 7179 .Test(xnn_f16_gemm_minmax_ukernel_4x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 7180 } 7181 } 7182 } 7183 } 7184 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64,n_div_8)7185 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64, n_div_8) { 7186 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 7187 for (uint32_t n = 16; n <= 24; n += 8) { 7188 for (size_t k = 1; k <= 20; k += 5) { 7189 GemmMicrokernelTester() 7190 .mr(4) 7191 .nr(8) 7192 .kr(1) 7193 .sr(1) 7194 .m(4) 7195 .n(n) 7196 .k(k) 7197 .Test(xnn_f16_gemm_minmax_ukernel_4x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 7198 } 7199 } 7200 } 7201 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64,n_div_8_strided_cn)7202 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64, n_div_8_strided_cn) { 7203 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 7204 for (uint32_t n = 16; n <= 24; n += 8) { 7205 for (size_t k = 1; k <= 20; k += 5) { 7206 GemmMicrokernelTester() 7207 .mr(4) 7208 .nr(8) 7209 .kr(1) 7210 .sr(1) 7211 .m(4) 7212 .n(n) 7213 .k(k) 7214 .cn_stride(11) 7215 .Test(xnn_f16_gemm_minmax_ukernel_4x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 7216 } 7217 } 7218 } 7219 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64,n_div_8_strided_a)7220 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64, n_div_8_strided_a) { 7221 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 7222 for (uint32_t n = 16; n <= 24; n += 8) { 7223 for (size_t k = 1; k <= 20; k += 5) { 7224 GemmMicrokernelTester() 7225 .mr(4) 7226 .nr(8) 7227 .kr(1) 7228 .sr(1) 7229 .m(4) 7230 .n(n) 7231 .k(k) 7232 .a_stride(23) 7233 .Test(xnn_f16_gemm_minmax_ukernel_4x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 7234 } 7235 } 7236 } 7237 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64,n_div_8_subtile)7238 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64, n_div_8_subtile) { 7239 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 7240 for (uint32_t n = 16; n <= 24; n += 8) { 7241 for (size_t k = 1; k <= 20; k += 5) { 7242 for (uint32_t m = 1; m <= 4; m++) { 7243 
GemmMicrokernelTester() 7244 .mr(4) 7245 .nr(8) 7246 .kr(1) 7247 .sr(1) 7248 .m(m) 7249 .n(n) 7250 .k(k) 7251 .iterations(1) 7252 .Test(xnn_f16_gemm_minmax_ukernel_4x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 7253 } 7254 } 7255 } 7256 } 7257 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64,strided_cm_subtile)7258 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64, strided_cm_subtile) { 7259 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 7260 for (size_t k = 1; k <= 20; k += 5) { 7261 for (uint32_t n = 1; n <= 8; n++) { 7262 for (uint32_t m = 1; m <= 4; m++) { 7263 GemmMicrokernelTester() 7264 .mr(4) 7265 .nr(8) 7266 .kr(1) 7267 .sr(1) 7268 .m(m) 7269 .n(n) 7270 .k(k) 7271 .cm_stride(11) 7272 .iterations(1) 7273 .Test(xnn_f16_gemm_minmax_ukernel_4x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 7274 } 7275 } 7276 } 7277 } 7278 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64,qmin)7279 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64, qmin) { 7280 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 7281 GemmMicrokernelTester() 7282 .mr(4) 7283 .nr(8) 7284 .kr(1) 7285 .sr(1) 7286 .m(4) 7287 .n(8) 7288 .k(4) 7289 .qmin(128) 7290 .Test(xnn_f16_gemm_minmax_ukernel_4x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 7291 } 7292 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64,qmax)7293 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64, qmax) { 7294 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 7295 GemmMicrokernelTester() 7296 .mr(4) 7297 .nr(8) 7298 .kr(1) 7299 .sr(1) 7300 .m(4) 7301 .n(8) 7302 .k(4) 7303 .qmax(128) 7304 .Test(xnn_f16_gemm_minmax_ukernel_4x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 7305 } 7306 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64,strided_cm)7307 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64, strided_cm) { 7308 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 7309 GemmMicrokernelTester() 7310 .mr(4) 7311 .nr(8) 7312 .kr(1) 7313 .sr(1) 7314 .m(4) 7315 .n(8) 7316 .k(4) 7317 .cm_stride(11) 7318 .Test(xnn_f16_gemm_minmax_ukernel_4x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 
7319 } 7320 #endif // XNN_ENABLE_ARM_FP16 && (XNN_ARCH_ARM || XNN_ARCH_ARM64) 7321 7322 7323 #if XNN_ENABLE_ARM_FP16 && (XNN_ARCH_ARM || XNN_ARCH_ARM64) TEST(F16_GEMM_MINMAX_4X16__NEONFP16ARITH_LD64,k_eq_4)7324 TEST(F16_GEMM_MINMAX_4X16__NEONFP16ARITH_LD64, k_eq_4) { 7325 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 7326 GemmMicrokernelTester() 7327 .mr(4) 7328 .nr(16) 7329 .kr(1) 7330 .sr(1) 7331 .m(4) 7332 .n(16) 7333 .k(4) 7334 .Test(xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 7335 } 7336 TEST(F16_GEMM_MINMAX_4X16__NEONFP16ARITH_LD64,strided_cn)7337 TEST(F16_GEMM_MINMAX_4X16__NEONFP16ARITH_LD64, strided_cn) { 7338 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 7339 GemmMicrokernelTester() 7340 .mr(4) 7341 .nr(16) 7342 .kr(1) 7343 .sr(1) 7344 .m(4) 7345 .n(16) 7346 .k(4) 7347 .cn_stride(19) 7348 .Test(xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 7349 } 7350 TEST(F16_GEMM_MINMAX_4X16__NEONFP16ARITH_LD64,k_eq_4_strided_a)7351 TEST(F16_GEMM_MINMAX_4X16__NEONFP16ARITH_LD64, k_eq_4_strided_a) { 7352 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 7353 GemmMicrokernelTester() 7354 .mr(4) 7355 .nr(16) 7356 .kr(1) 7357 .sr(1) 7358 .m(4) 7359 .n(16) 7360 .k(4) 7361 .a_stride(7) 7362 .Test(xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 7363 } 7364 TEST(F16_GEMM_MINMAX_4X16__NEONFP16ARITH_LD64,k_eq_4_subtile)7365 TEST(F16_GEMM_MINMAX_4X16__NEONFP16ARITH_LD64, k_eq_4_subtile) { 7366 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 7367 for (uint32_t n = 1; n <= 16; n++) { 7368 for (uint32_t m = 1; m <= 4; m++) { 7369 GemmMicrokernelTester() 7370 .mr(4) 7371 .nr(16) 7372 .kr(1) 7373 .sr(1) 7374 .m(m) 7375 .n(n) 7376 .k(4) 7377 .iterations(1) 7378 .Test(xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 7379 } 7380 } 7381 } 7382 TEST(F16_GEMM_MINMAX_4X16__NEONFP16ARITH_LD64,k_eq_4_subtile_m)7383 TEST(F16_GEMM_MINMAX_4X16__NEONFP16ARITH_LD64, k_eq_4_subtile_m) { 
7384 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 7385 for (uint32_t m = 1; m <= 4; m++) { 7386 GemmMicrokernelTester() 7387 .mr(4) 7388 .nr(16) 7389 .kr(1) 7390 .sr(1) 7391 .m(m) 7392 .n(16) 7393 .k(4) 7394 .iterations(1) 7395 .Test(xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 7396 } 7397 } 7398 TEST(F16_GEMM_MINMAX_4X16__NEONFP16ARITH_LD64,k_eq_4_subtile_n)7399 TEST(F16_GEMM_MINMAX_4X16__NEONFP16ARITH_LD64, k_eq_4_subtile_n) { 7400 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 7401 for (uint32_t n = 1; n <= 16; n++) { 7402 GemmMicrokernelTester() 7403 .mr(4) 7404 .nr(16) 7405 .kr(1) 7406 .sr(1) 7407 .m(4) 7408 .n(n) 7409 .k(4) 7410 .iterations(1) 7411 .Test(xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 7412 } 7413 } 7414 TEST(F16_GEMM_MINMAX_4X16__NEONFP16ARITH_LD64,k_lt_4)7415 TEST(F16_GEMM_MINMAX_4X16__NEONFP16ARITH_LD64, k_lt_4) { 7416 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 7417 for (size_t k = 1; k < 4; k++) { 7418 GemmMicrokernelTester() 7419 .mr(4) 7420 .nr(16) 7421 .kr(1) 7422 .sr(1) 7423 .m(4) 7424 .n(16) 7425 .k(k) 7426 .Test(xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 7427 } 7428 } 7429 TEST(F16_GEMM_MINMAX_4X16__NEONFP16ARITH_LD64,k_lt_4_strided_a)7430 TEST(F16_GEMM_MINMAX_4X16__NEONFP16ARITH_LD64, k_lt_4_strided_a) { 7431 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 7432 for (size_t k = 1; k < 4; k++) { 7433 GemmMicrokernelTester() 7434 .mr(4) 7435 .nr(16) 7436 .kr(1) 7437 .sr(1) 7438 .m(4) 7439 .n(16) 7440 .k(k) 7441 .a_stride(7) 7442 .Test(xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 7443 } 7444 } 7445 TEST(F16_GEMM_MINMAX_4X16__NEONFP16ARITH_LD64,k_lt_4_subtile)7446 TEST(F16_GEMM_MINMAX_4X16__NEONFP16ARITH_LD64, k_lt_4_subtile) { 7447 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 7448 for (size_t k = 1; k < 4; k++) { 7449 for (uint32_t n = 1; n <= 16; n++) { 7450 for (uint32_t m = 1; m <= 4; m++) { 7451 
GemmMicrokernelTester() 7452 .mr(4) 7453 .nr(16) 7454 .kr(1) 7455 .sr(1) 7456 .m(m) 7457 .n(n) 7458 .k(k) 7459 .iterations(1) 7460 .Test(xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 7461 } 7462 } 7463 } 7464 } 7465 TEST(F16_GEMM_MINMAX_4X16__NEONFP16ARITH_LD64,k_gt_4)7466 TEST(F16_GEMM_MINMAX_4X16__NEONFP16ARITH_LD64, k_gt_4) { 7467 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 7468 for (size_t k = 5; k < 8; k++) { 7469 GemmMicrokernelTester() 7470 .mr(4) 7471 .nr(16) 7472 .kr(1) 7473 .sr(1) 7474 .m(4) 7475 .n(16) 7476 .k(k) 7477 .Test(xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 7478 } 7479 } 7480 TEST(F16_GEMM_MINMAX_4X16__NEONFP16ARITH_LD64,k_gt_4_strided_a)7481 TEST(F16_GEMM_MINMAX_4X16__NEONFP16ARITH_LD64, k_gt_4_strided_a) { 7482 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 7483 for (size_t k = 5; k < 8; k++) { 7484 GemmMicrokernelTester() 7485 .mr(4) 7486 .nr(16) 7487 .kr(1) 7488 .sr(1) 7489 .m(4) 7490 .n(16) 7491 .k(k) 7492 .a_stride(11) 7493 .Test(xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 7494 } 7495 } 7496 TEST(F16_GEMM_MINMAX_4X16__NEONFP16ARITH_LD64,k_gt_4_subtile)7497 TEST(F16_GEMM_MINMAX_4X16__NEONFP16ARITH_LD64, k_gt_4_subtile) { 7498 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 7499 for (size_t k = 5; k < 8; k++) { 7500 for (uint32_t n = 1; n <= 16; n++) { 7501 for (uint32_t m = 1; m <= 4; m++) { 7502 GemmMicrokernelTester() 7503 .mr(4) 7504 .nr(16) 7505 .kr(1) 7506 .sr(1) 7507 .m(m) 7508 .n(n) 7509 .k(k) 7510 .iterations(1) 7511 .Test(xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 7512 } 7513 } 7514 } 7515 } 7516 TEST(F16_GEMM_MINMAX_4X16__NEONFP16ARITH_LD64,k_div_4)7517 TEST(F16_GEMM_MINMAX_4X16__NEONFP16ARITH_LD64, k_div_4) { 7518 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 7519 for (size_t k = 8; k <= 40; k += 4) { 7520 GemmMicrokernelTester() 7521 .mr(4) 7522 .nr(16) 7523 .kr(1) 7524 .sr(1) 7525 .m(4) 7526 .n(16) 
7527 .k(k) 7528 .Test(xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 7529 } 7530 } 7531 TEST(F16_GEMM_MINMAX_4X16__NEONFP16ARITH_LD64,k_div_4_strided_a)7532 TEST(F16_GEMM_MINMAX_4X16__NEONFP16ARITH_LD64, k_div_4_strided_a) { 7533 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 7534 for (size_t k = 8; k <= 40; k += 4) { 7535 GemmMicrokernelTester() 7536 .mr(4) 7537 .nr(16) 7538 .kr(1) 7539 .sr(1) 7540 .m(4) 7541 .n(16) 7542 .k(k) 7543 .a_stride(43) 7544 .Test(xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 7545 } 7546 } 7547 TEST(F16_GEMM_MINMAX_4X16__NEONFP16ARITH_LD64,k_div_4_subtile)7548 TEST(F16_GEMM_MINMAX_4X16__NEONFP16ARITH_LD64, k_div_4_subtile) { 7549 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 7550 for (size_t k = 8; k <= 40; k += 4) { 7551 for (uint32_t n = 1; n <= 16; n++) { 7552 for (uint32_t m = 1; m <= 4; m++) { 7553 GemmMicrokernelTester() 7554 .mr(4) 7555 .nr(16) 7556 .kr(1) 7557 .sr(1) 7558 .m(m) 7559 .n(n) 7560 .k(k) 7561 .iterations(1) 7562 .Test(xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 7563 } 7564 } 7565 } 7566 } 7567 TEST(F16_GEMM_MINMAX_4X16__NEONFP16ARITH_LD64,n_gt_16)7568 TEST(F16_GEMM_MINMAX_4X16__NEONFP16ARITH_LD64, n_gt_16) { 7569 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 7570 for (uint32_t n = 17; n < 32; n++) { 7571 for (size_t k = 1; k <= 20; k += 5) { 7572 GemmMicrokernelTester() 7573 .mr(4) 7574 .nr(16) 7575 .kr(1) 7576 .sr(1) 7577 .m(4) 7578 .n(n) 7579 .k(k) 7580 .Test(xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 7581 } 7582 } 7583 } 7584 TEST(F16_GEMM_MINMAX_4X16__NEONFP16ARITH_LD64,n_gt_16_strided_cn)7585 TEST(F16_GEMM_MINMAX_4X16__NEONFP16ARITH_LD64, n_gt_16_strided_cn) { 7586 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 7587 for (uint32_t n = 17; n < 32; n++) { 7588 for (size_t k = 1; k <= 20; k += 5) { 7589 GemmMicrokernelTester() 7590 .mr(4) 7591 .nr(16) 7592 .kr(1) 7593 .sr(1) 7594 .m(4) 7595 
.n(n) 7596 .k(k) 7597 .cn_stride(19) 7598 .Test(xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 7599 } 7600 } 7601 } 7602 TEST(F16_GEMM_MINMAX_4X16__NEONFP16ARITH_LD64,n_gt_16_strided_a)7603 TEST(F16_GEMM_MINMAX_4X16__NEONFP16ARITH_LD64, n_gt_16_strided_a) { 7604 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 7605 for (uint32_t n = 17; n < 32; n++) { 7606 for (size_t k = 1; k <= 20; k += 5) { 7607 GemmMicrokernelTester() 7608 .mr(4) 7609 .nr(16) 7610 .kr(1) 7611 .sr(1) 7612 .m(4) 7613 .n(n) 7614 .k(k) 7615 .a_stride(23) 7616 .Test(xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 7617 } 7618 } 7619 } 7620 TEST(F16_GEMM_MINMAX_4X16__NEONFP16ARITH_LD64,n_gt_16_subtile)7621 TEST(F16_GEMM_MINMAX_4X16__NEONFP16ARITH_LD64, n_gt_16_subtile) { 7622 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 7623 for (uint32_t n = 17; n < 32; n++) { 7624 for (size_t k = 1; k <= 20; k += 5) { 7625 for (uint32_t m = 1; m <= 4; m++) { 7626 GemmMicrokernelTester() 7627 .mr(4) 7628 .nr(16) 7629 .kr(1) 7630 .sr(1) 7631 .m(m) 7632 .n(n) 7633 .k(k) 7634 .iterations(1) 7635 .Test(xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 7636 } 7637 } 7638 } 7639 } 7640 TEST(F16_GEMM_MINMAX_4X16__NEONFP16ARITH_LD64,n_div_16)7641 TEST(F16_GEMM_MINMAX_4X16__NEONFP16ARITH_LD64, n_div_16) { 7642 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 7643 for (uint32_t n = 32; n <= 48; n += 16) { 7644 for (size_t k = 1; k <= 20; k += 5) { 7645 GemmMicrokernelTester() 7646 .mr(4) 7647 .nr(16) 7648 .kr(1) 7649 .sr(1) 7650 .m(4) 7651 .n(n) 7652 .k(k) 7653 .Test(xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 7654 } 7655 } 7656 } 7657 TEST(F16_GEMM_MINMAX_4X16__NEONFP16ARITH_LD64,n_div_16_strided_cn)7658 TEST(F16_GEMM_MINMAX_4X16__NEONFP16ARITH_LD64, n_div_16_strided_cn) { 7659 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 7660 for (uint32_t n = 32; n <= 48; n += 16) { 7661 for (size_t k = 1; k <= 20; k += 5) { 
7662 GemmMicrokernelTester() 7663 .mr(4) 7664 .nr(16) 7665 .kr(1) 7666 .sr(1) 7667 .m(4) 7668 .n(n) 7669 .k(k) 7670 .cn_stride(19) 7671 .Test(xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 7672 } 7673 } 7674 } 7675 TEST(F16_GEMM_MINMAX_4X16__NEONFP16ARITH_LD64,n_div_16_strided_a)7676 TEST(F16_GEMM_MINMAX_4X16__NEONFP16ARITH_LD64, n_div_16_strided_a) { 7677 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 7678 for (uint32_t n = 32; n <= 48; n += 16) { 7679 for (size_t k = 1; k <= 20; k += 5) { 7680 GemmMicrokernelTester() 7681 .mr(4) 7682 .nr(16) 7683 .kr(1) 7684 .sr(1) 7685 .m(4) 7686 .n(n) 7687 .k(k) 7688 .a_stride(23) 7689 .Test(xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 7690 } 7691 } 7692 } 7693 TEST(F16_GEMM_MINMAX_4X16__NEONFP16ARITH_LD64,n_div_16_subtile)7694 TEST(F16_GEMM_MINMAX_4X16__NEONFP16ARITH_LD64, n_div_16_subtile) { 7695 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 7696 for (uint32_t n = 32; n <= 48; n += 16) { 7697 for (size_t k = 1; k <= 20; k += 5) { 7698 for (uint32_t m = 1; m <= 4; m++) { 7699 GemmMicrokernelTester() 7700 .mr(4) 7701 .nr(16) 7702 .kr(1) 7703 .sr(1) 7704 .m(m) 7705 .n(n) 7706 .k(k) 7707 .iterations(1) 7708 .Test(xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 7709 } 7710 } 7711 } 7712 } 7713 TEST(F16_GEMM_MINMAX_4X16__NEONFP16ARITH_LD64,strided_cm_subtile)7714 TEST(F16_GEMM_MINMAX_4X16__NEONFP16ARITH_LD64, strided_cm_subtile) { 7715 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 7716 for (size_t k = 1; k <= 20; k += 5) { 7717 for (uint32_t n = 1; n <= 16; n++) { 7718 for (uint32_t m = 1; m <= 4; m++) { 7719 GemmMicrokernelTester() 7720 .mr(4) 7721 .nr(16) 7722 .kr(1) 7723 .sr(1) 7724 .m(m) 7725 .n(n) 7726 .k(k) 7727 .cm_stride(19) 7728 .iterations(1) 7729 .Test(xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 7730 } 7731 } 7732 } 7733 } 7734 TEST(F16_GEMM_MINMAX_4X16__NEONFP16ARITH_LD64,qmin)7735 
TEST(F16_GEMM_MINMAX_4X16__NEONFP16ARITH_LD64, qmin) { 7736 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 7737 GemmMicrokernelTester() 7738 .mr(4) 7739 .nr(16) 7740 .kr(1) 7741 .sr(1) 7742 .m(4) 7743 .n(16) 7744 .k(4) 7745 .qmin(128) 7746 .Test(xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 7747 } 7748 TEST(F16_GEMM_MINMAX_4X16__NEONFP16ARITH_LD64,qmax)7749 TEST(F16_GEMM_MINMAX_4X16__NEONFP16ARITH_LD64, qmax) { 7750 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 7751 GemmMicrokernelTester() 7752 .mr(4) 7753 .nr(16) 7754 .kr(1) 7755 .sr(1) 7756 .m(4) 7757 .n(16) 7758 .k(4) 7759 .qmax(128) 7760 .Test(xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 7761 } 7762 TEST(F16_GEMM_MINMAX_4X16__NEONFP16ARITH_LD64,strided_cm)7763 TEST(F16_GEMM_MINMAX_4X16__NEONFP16ARITH_LD64, strided_cm) { 7764 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 7765 GemmMicrokernelTester() 7766 .mr(4) 7767 .nr(16) 7768 .kr(1) 7769 .sr(1) 7770 .m(4) 7771 .n(16) 7772 .k(4) 7773 .cm_stride(19) 7774 .Test(xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 7775 } 7776 #endif // XNN_ENABLE_ARM_FP16 && (XNN_ARCH_ARM || XNN_ARCH_ARM64) 7777 7778 7779 #if XNN_ENABLE_ARM_FP16 && (XNN_ARCH_ARM || XNN_ARCH_ARM64) TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64,k_eq_4)7780 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64, k_eq_4) { 7781 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 7782 GemmMicrokernelTester() 7783 .mr(6) 7784 .nr(8) 7785 .kr(1) 7786 .sr(1) 7787 .m(6) 7788 .n(8) 7789 .k(4) 7790 .Test(xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 7791 } 7792 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64,strided_cn)7793 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64, strided_cn) { 7794 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 7795 GemmMicrokernelTester() 7796 .mr(6) 7797 .nr(8) 7798 .kr(1) 7799 .sr(1) 7800 .m(6) 7801 .n(8) 7802 .k(4) 7803 .cn_stride(11) 7804 .Test(xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64, 
xnn_init_f16_minmax_neon_params); 7805 } 7806 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64,k_eq_4_strided_a)7807 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64, k_eq_4_strided_a) { 7808 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 7809 GemmMicrokernelTester() 7810 .mr(6) 7811 .nr(8) 7812 .kr(1) 7813 .sr(1) 7814 .m(6) 7815 .n(8) 7816 .k(4) 7817 .a_stride(7) 7818 .Test(xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 7819 } 7820 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64,k_eq_4_subtile)7821 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64, k_eq_4_subtile) { 7822 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 7823 for (uint32_t n = 1; n <= 8; n++) { 7824 for (uint32_t m = 1; m <= 6; m++) { 7825 GemmMicrokernelTester() 7826 .mr(6) 7827 .nr(8) 7828 .kr(1) 7829 .sr(1) 7830 .m(m) 7831 .n(n) 7832 .k(4) 7833 .iterations(1) 7834 .Test(xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 7835 } 7836 } 7837 } 7838 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64,k_eq_4_subtile_m)7839 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64, k_eq_4_subtile_m) { 7840 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 7841 for (uint32_t m = 1; m <= 6; m++) { 7842 GemmMicrokernelTester() 7843 .mr(6) 7844 .nr(8) 7845 .kr(1) 7846 .sr(1) 7847 .m(m) 7848 .n(8) 7849 .k(4) 7850 .iterations(1) 7851 .Test(xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 7852 } 7853 } 7854 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64,k_eq_4_subtile_n)7855 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64, k_eq_4_subtile_n) { 7856 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 7857 for (uint32_t n = 1; n <= 8; n++) { 7858 GemmMicrokernelTester() 7859 .mr(6) 7860 .nr(8) 7861 .kr(1) 7862 .sr(1) 7863 .m(6) 7864 .n(n) 7865 .k(4) 7866 .iterations(1) 7867 .Test(xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 7868 } 7869 } 7870 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64,k_lt_4)7871 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64, k_lt_4) { 
7872 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 7873 for (size_t k = 1; k < 4; k++) { 7874 GemmMicrokernelTester() 7875 .mr(6) 7876 .nr(8) 7877 .kr(1) 7878 .sr(1) 7879 .m(6) 7880 .n(8) 7881 .k(k) 7882 .Test(xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 7883 } 7884 } 7885 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64,k_lt_4_strided_a)7886 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64, k_lt_4_strided_a) { 7887 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 7888 for (size_t k = 1; k < 4; k++) { 7889 GemmMicrokernelTester() 7890 .mr(6) 7891 .nr(8) 7892 .kr(1) 7893 .sr(1) 7894 .m(6) 7895 .n(8) 7896 .k(k) 7897 .a_stride(7) 7898 .Test(xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 7899 } 7900 } 7901 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64,k_lt_4_subtile)7902 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64, k_lt_4_subtile) { 7903 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 7904 for (size_t k = 1; k < 4; k++) { 7905 for (uint32_t n = 1; n <= 8; n++) { 7906 for (uint32_t m = 1; m <= 6; m++) { 7907 GemmMicrokernelTester() 7908 .mr(6) 7909 .nr(8) 7910 .kr(1) 7911 .sr(1) 7912 .m(m) 7913 .n(n) 7914 .k(k) 7915 .iterations(1) 7916 .Test(xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 7917 } 7918 } 7919 } 7920 } 7921 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64,k_gt_4)7922 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64, k_gt_4) { 7923 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 7924 for (size_t k = 5; k < 8; k++) { 7925 GemmMicrokernelTester() 7926 .mr(6) 7927 .nr(8) 7928 .kr(1) 7929 .sr(1) 7930 .m(6) 7931 .n(8) 7932 .k(k) 7933 .Test(xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 7934 } 7935 } 7936 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64,k_gt_4_strided_a)7937 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64, k_gt_4_strided_a) { 7938 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 7939 for (size_t k = 5; k < 8; k++) { 7940 GemmMicrokernelTester() 7941 .mr(6) 7942 .nr(8) 7943 
.kr(1) 7944 .sr(1) 7945 .m(6) 7946 .n(8) 7947 .k(k) 7948 .a_stride(11) 7949 .Test(xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 7950 } 7951 } 7952 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64,k_gt_4_subtile)7953 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64, k_gt_4_subtile) { 7954 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 7955 for (size_t k = 5; k < 8; k++) { 7956 for (uint32_t n = 1; n <= 8; n++) { 7957 for (uint32_t m = 1; m <= 6; m++) { 7958 GemmMicrokernelTester() 7959 .mr(6) 7960 .nr(8) 7961 .kr(1) 7962 .sr(1) 7963 .m(m) 7964 .n(n) 7965 .k(k) 7966 .iterations(1) 7967 .Test(xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 7968 } 7969 } 7970 } 7971 } 7972 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64,k_div_4)7973 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64, k_div_4) { 7974 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 7975 for (size_t k = 8; k <= 40; k += 4) { 7976 GemmMicrokernelTester() 7977 .mr(6) 7978 .nr(8) 7979 .kr(1) 7980 .sr(1) 7981 .m(6) 7982 .n(8) 7983 .k(k) 7984 .Test(xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 7985 } 7986 } 7987 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64,k_div_4_strided_a)7988 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64, k_div_4_strided_a) { 7989 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 7990 for (size_t k = 8; k <= 40; k += 4) { 7991 GemmMicrokernelTester() 7992 .mr(6) 7993 .nr(8) 7994 .kr(1) 7995 .sr(1) 7996 .m(6) 7997 .n(8) 7998 .k(k) 7999 .a_stride(43) 8000 .Test(xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 8001 } 8002 } 8003 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64,k_div_4_subtile)8004 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64, k_div_4_subtile) { 8005 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 8006 for (size_t k = 8; k <= 40; k += 4) { 8007 for (uint32_t n = 1; n <= 8; n++) { 8008 for (uint32_t m = 1; m <= 6; m++) { 8009 GemmMicrokernelTester() 8010 .mr(6) 8011 .nr(8) 8012 .kr(1) 8013 .sr(1) 
8014 .m(m) 8015 .n(n) 8016 .k(k) 8017 .iterations(1) 8018 .Test(xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 8019 } 8020 } 8021 } 8022 } 8023 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64,n_gt_8)8024 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64, n_gt_8) { 8025 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 8026 for (uint32_t n = 9; n < 16; n++) { 8027 for (size_t k = 1; k <= 20; k += 5) { 8028 GemmMicrokernelTester() 8029 .mr(6) 8030 .nr(8) 8031 .kr(1) 8032 .sr(1) 8033 .m(6) 8034 .n(n) 8035 .k(k) 8036 .Test(xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 8037 } 8038 } 8039 } 8040 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64,n_gt_8_strided_cn)8041 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64, n_gt_8_strided_cn) { 8042 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 8043 for (uint32_t n = 9; n < 16; n++) { 8044 for (size_t k = 1; k <= 20; k += 5) { 8045 GemmMicrokernelTester() 8046 .mr(6) 8047 .nr(8) 8048 .kr(1) 8049 .sr(1) 8050 .m(6) 8051 .n(n) 8052 .k(k) 8053 .cn_stride(11) 8054 .Test(xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 8055 } 8056 } 8057 } 8058 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64,n_gt_8_strided_a)8059 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64, n_gt_8_strided_a) { 8060 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 8061 for (uint32_t n = 9; n < 16; n++) { 8062 for (size_t k = 1; k <= 20; k += 5) { 8063 GemmMicrokernelTester() 8064 .mr(6) 8065 .nr(8) 8066 .kr(1) 8067 .sr(1) 8068 .m(6) 8069 .n(n) 8070 .k(k) 8071 .a_stride(23) 8072 .Test(xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 8073 } 8074 } 8075 } 8076 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64,n_gt_8_subtile)8077 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64, n_gt_8_subtile) { 8078 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 8079 for (uint32_t n = 9; n < 16; n++) { 8080 for (size_t k = 1; k <= 20; k += 5) { 8081 for (uint32_t m = 1; m <= 6; m++) { 8082 
GemmMicrokernelTester() 8083 .mr(6) 8084 .nr(8) 8085 .kr(1) 8086 .sr(1) 8087 .m(m) 8088 .n(n) 8089 .k(k) 8090 .iterations(1) 8091 .Test(xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 8092 } 8093 } 8094 } 8095 } 8096 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64,n_div_8)8097 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64, n_div_8) { 8098 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 8099 for (uint32_t n = 16; n <= 24; n += 8) { 8100 for (size_t k = 1; k <= 20; k += 5) { 8101 GemmMicrokernelTester() 8102 .mr(6) 8103 .nr(8) 8104 .kr(1) 8105 .sr(1) 8106 .m(6) 8107 .n(n) 8108 .k(k) 8109 .Test(xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 8110 } 8111 } 8112 } 8113 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64,n_div_8_strided_cn)8114 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64, n_div_8_strided_cn) { 8115 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 8116 for (uint32_t n = 16; n <= 24; n += 8) { 8117 for (size_t k = 1; k <= 20; k += 5) { 8118 GemmMicrokernelTester() 8119 .mr(6) 8120 .nr(8) 8121 .kr(1) 8122 .sr(1) 8123 .m(6) 8124 .n(n) 8125 .k(k) 8126 .cn_stride(11) 8127 .Test(xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 8128 } 8129 } 8130 } 8131 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64,n_div_8_strided_a)8132 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64, n_div_8_strided_a) { 8133 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 8134 for (uint32_t n = 16; n <= 24; n += 8) { 8135 for (size_t k = 1; k <= 20; k += 5) { 8136 GemmMicrokernelTester() 8137 .mr(6) 8138 .nr(8) 8139 .kr(1) 8140 .sr(1) 8141 .m(6) 8142 .n(n) 8143 .k(k) 8144 .a_stride(23) 8145 .Test(xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 8146 } 8147 } 8148 } 8149 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64,n_div_8_subtile)8150 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64, n_div_8_subtile) { 8151 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 8152 for (uint32_t n = 16; n <= 24; n += 8) { 8153 for 
(size_t k = 1; k <= 20; k += 5) { 8154 for (uint32_t m = 1; m <= 6; m++) { 8155 GemmMicrokernelTester() 8156 .mr(6) 8157 .nr(8) 8158 .kr(1) 8159 .sr(1) 8160 .m(m) 8161 .n(n) 8162 .k(k) 8163 .iterations(1) 8164 .Test(xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 8165 } 8166 } 8167 } 8168 } 8169 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64,strided_cm_subtile)8170 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64, strided_cm_subtile) { 8171 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 8172 for (size_t k = 1; k <= 20; k += 5) { 8173 for (uint32_t n = 1; n <= 8; n++) { 8174 for (uint32_t m = 1; m <= 6; m++) { 8175 GemmMicrokernelTester() 8176 .mr(6) 8177 .nr(8) 8178 .kr(1) 8179 .sr(1) 8180 .m(m) 8181 .n(n) 8182 .k(k) 8183 .cm_stride(11) 8184 .iterations(1) 8185 .Test(xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 8186 } 8187 } 8188 } 8189 } 8190 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64,qmin)8191 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64, qmin) { 8192 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 8193 GemmMicrokernelTester() 8194 .mr(6) 8195 .nr(8) 8196 .kr(1) 8197 .sr(1) 8198 .m(6) 8199 .n(8) 8200 .k(4) 8201 .qmin(128) 8202 .Test(xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 8203 } 8204 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64,qmax)8205 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64, qmax) { 8206 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 8207 GemmMicrokernelTester() 8208 .mr(6) 8209 .nr(8) 8210 .kr(1) 8211 .sr(1) 8212 .m(6) 8213 .n(8) 8214 .k(4) 8215 .qmax(128) 8216 .Test(xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 8217 } 8218 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64,strided_cm)8219 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64, strided_cm) { 8220 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 8221 GemmMicrokernelTester() 8222 .mr(6) 8223 .nr(8) 8224 .kr(1) 8225 .sr(1) 8226 .m(6) 8227 .n(8) 8228 .k(4) 8229 .cm_stride(11) 8230 
.Test(xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 8231 } 8232 #endif // XNN_ENABLE_ARM_FP16 && (XNN_ARCH_ARM || XNN_ARCH_ARM64) 8233 8234 8235 #if XNN_ENABLE_ARM_FP16 && (XNN_ARCH_ARM || XNN_ARCH_ARM64) TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64,k_eq_4)8236 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64, k_eq_4) { 8237 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 8238 GemmMicrokernelTester() 8239 .mr(6) 8240 .nr(16) 8241 .kr(1) 8242 .sr(1) 8243 .m(6) 8244 .n(16) 8245 .k(4) 8246 .Test(xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 8247 } 8248 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64,strided_cn)8249 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64, strided_cn) { 8250 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 8251 GemmMicrokernelTester() 8252 .mr(6) 8253 .nr(16) 8254 .kr(1) 8255 .sr(1) 8256 .m(6) 8257 .n(16) 8258 .k(4) 8259 .cn_stride(19) 8260 .Test(xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 8261 } 8262 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64,k_eq_4_strided_a)8263 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64, k_eq_4_strided_a) { 8264 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 8265 GemmMicrokernelTester() 8266 .mr(6) 8267 .nr(16) 8268 .kr(1) 8269 .sr(1) 8270 .m(6) 8271 .n(16) 8272 .k(4) 8273 .a_stride(7) 8274 .Test(xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 8275 } 8276 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64,k_eq_4_subtile)8277 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64, k_eq_4_subtile) { 8278 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 8279 for (uint32_t n = 1; n <= 16; n++) { 8280 for (uint32_t m = 1; m <= 6; m++) { 8281 GemmMicrokernelTester() 8282 .mr(6) 8283 .nr(16) 8284 .kr(1) 8285 .sr(1) 8286 .m(m) 8287 .n(n) 8288 .k(4) 8289 .iterations(1) 8290 .Test(xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 8291 } 8292 } 8293 } 8294 
TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64,k_eq_4_subtile_m)8295 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64, k_eq_4_subtile_m) { 8296 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 8297 for (uint32_t m = 1; m <= 6; m++) { 8298 GemmMicrokernelTester() 8299 .mr(6) 8300 .nr(16) 8301 .kr(1) 8302 .sr(1) 8303 .m(m) 8304 .n(16) 8305 .k(4) 8306 .iterations(1) 8307 .Test(xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 8308 } 8309 } 8310 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64,k_eq_4_subtile_n)8311 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64, k_eq_4_subtile_n) { 8312 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 8313 for (uint32_t n = 1; n <= 16; n++) { 8314 GemmMicrokernelTester() 8315 .mr(6) 8316 .nr(16) 8317 .kr(1) 8318 .sr(1) 8319 .m(6) 8320 .n(n) 8321 .k(4) 8322 .iterations(1) 8323 .Test(xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 8324 } 8325 } 8326 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64,k_lt_4)8327 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64, k_lt_4) { 8328 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 8329 for (size_t k = 1; k < 4; k++) { 8330 GemmMicrokernelTester() 8331 .mr(6) 8332 .nr(16) 8333 .kr(1) 8334 .sr(1) 8335 .m(6) 8336 .n(16) 8337 .k(k) 8338 .Test(xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 8339 } 8340 } 8341 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64,k_lt_4_strided_a)8342 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64, k_lt_4_strided_a) { 8343 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 8344 for (size_t k = 1; k < 4; k++) { 8345 GemmMicrokernelTester() 8346 .mr(6) 8347 .nr(16) 8348 .kr(1) 8349 .sr(1) 8350 .m(6) 8351 .n(16) 8352 .k(k) 8353 .a_stride(7) 8354 .Test(xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 8355 } 8356 } 8357 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64,k_lt_4_subtile)8358 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64, k_lt_4_subtile) { 8359 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 8360 for 
(size_t k = 1; k < 4; k++) { 8361 for (uint32_t n = 1; n <= 16; n++) { 8362 for (uint32_t m = 1; m <= 6; m++) { 8363 GemmMicrokernelTester() 8364 .mr(6) 8365 .nr(16) 8366 .kr(1) 8367 .sr(1) 8368 .m(m) 8369 .n(n) 8370 .k(k) 8371 .iterations(1) 8372 .Test(xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 8373 } 8374 } 8375 } 8376 } 8377 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64,k_gt_4)8378 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64, k_gt_4) { 8379 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 8380 for (size_t k = 5; k < 8; k++) { 8381 GemmMicrokernelTester() 8382 .mr(6) 8383 .nr(16) 8384 .kr(1) 8385 .sr(1) 8386 .m(6) 8387 .n(16) 8388 .k(k) 8389 .Test(xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 8390 } 8391 } 8392 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64,k_gt_4_strided_a)8393 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64, k_gt_4_strided_a) { 8394 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 8395 for (size_t k = 5; k < 8; k++) { 8396 GemmMicrokernelTester() 8397 .mr(6) 8398 .nr(16) 8399 .kr(1) 8400 .sr(1) 8401 .m(6) 8402 .n(16) 8403 .k(k) 8404 .a_stride(11) 8405 .Test(xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 8406 } 8407 } 8408 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64,k_gt_4_subtile)8409 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64, k_gt_4_subtile) { 8410 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 8411 for (size_t k = 5; k < 8; k++) { 8412 for (uint32_t n = 1; n <= 16; n++) { 8413 for (uint32_t m = 1; m <= 6; m++) { 8414 GemmMicrokernelTester() 8415 .mr(6) 8416 .nr(16) 8417 .kr(1) 8418 .sr(1) 8419 .m(m) 8420 .n(n) 8421 .k(k) 8422 .iterations(1) 8423 .Test(xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 8424 } 8425 } 8426 } 8427 } 8428 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64,k_div_4)8429 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64, k_div_4) { 8430 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 8431 for (size_t k = 8; k 
<= 40; k += 4) { 8432 GemmMicrokernelTester() 8433 .mr(6) 8434 .nr(16) 8435 .kr(1) 8436 .sr(1) 8437 .m(6) 8438 .n(16) 8439 .k(k) 8440 .Test(xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 8441 } 8442 } 8443 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64,k_div_4_strided_a)8444 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64, k_div_4_strided_a) { 8445 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 8446 for (size_t k = 8; k <= 40; k += 4) { 8447 GemmMicrokernelTester() 8448 .mr(6) 8449 .nr(16) 8450 .kr(1) 8451 .sr(1) 8452 .m(6) 8453 .n(16) 8454 .k(k) 8455 .a_stride(43) 8456 .Test(xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 8457 } 8458 } 8459 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64,k_div_4_subtile)8460 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64, k_div_4_subtile) { 8461 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 8462 for (size_t k = 8; k <= 40; k += 4) { 8463 for (uint32_t n = 1; n <= 16; n++) { 8464 for (uint32_t m = 1; m <= 6; m++) { 8465 GemmMicrokernelTester() 8466 .mr(6) 8467 .nr(16) 8468 .kr(1) 8469 .sr(1) 8470 .m(m) 8471 .n(n) 8472 .k(k) 8473 .iterations(1) 8474 .Test(xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 8475 } 8476 } 8477 } 8478 } 8479 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64,n_gt_16)8480 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64, n_gt_16) { 8481 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 8482 for (uint32_t n = 17; n < 32; n++) { 8483 for (size_t k = 1; k <= 20; k += 5) { 8484 GemmMicrokernelTester() 8485 .mr(6) 8486 .nr(16) 8487 .kr(1) 8488 .sr(1) 8489 .m(6) 8490 .n(n) 8491 .k(k) 8492 .Test(xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 8493 } 8494 } 8495 } 8496 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64,n_gt_16_strided_cn)8497 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64, n_gt_16_strided_cn) { 8498 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 8499 for (uint32_t n = 17; n < 32; n++) { 8500 for (size_t k = 
1; k <= 20; k += 5) { 8501 GemmMicrokernelTester() 8502 .mr(6) 8503 .nr(16) 8504 .kr(1) 8505 .sr(1) 8506 .m(6) 8507 .n(n) 8508 .k(k) 8509 .cn_stride(19) 8510 .Test(xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 8511 } 8512 } 8513 } 8514 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64,n_gt_16_strided_a)8515 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64, n_gt_16_strided_a) { 8516 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 8517 for (uint32_t n = 17; n < 32; n++) { 8518 for (size_t k = 1; k <= 20; k += 5) { 8519 GemmMicrokernelTester() 8520 .mr(6) 8521 .nr(16) 8522 .kr(1) 8523 .sr(1) 8524 .m(6) 8525 .n(n) 8526 .k(k) 8527 .a_stride(23) 8528 .Test(xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 8529 } 8530 } 8531 } 8532 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64,n_gt_16_subtile)8533 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64, n_gt_16_subtile) { 8534 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 8535 for (uint32_t n = 17; n < 32; n++) { 8536 for (size_t k = 1; k <= 20; k += 5) { 8537 for (uint32_t m = 1; m <= 6; m++) { 8538 GemmMicrokernelTester() 8539 .mr(6) 8540 .nr(16) 8541 .kr(1) 8542 .sr(1) 8543 .m(m) 8544 .n(n) 8545 .k(k) 8546 .iterations(1) 8547 .Test(xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 8548 } 8549 } 8550 } 8551 } 8552 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64,n_div_16)8553 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64, n_div_16) { 8554 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 8555 for (uint32_t n = 32; n <= 48; n += 16) { 8556 for (size_t k = 1; k <= 20; k += 5) { 8557 GemmMicrokernelTester() 8558 .mr(6) 8559 .nr(16) 8560 .kr(1) 8561 .sr(1) 8562 .m(6) 8563 .n(n) 8564 .k(k) 8565 .Test(xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 8566 } 8567 } 8568 } 8569 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64,n_div_16_strided_cn)8570 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64, n_div_16_strided_cn) { 8571 
TEST_REQUIRES_ARM_NEON_FP16_ARITH; 8572 for (uint32_t n = 32; n <= 48; n += 16) { 8573 for (size_t k = 1; k <= 20; k += 5) { 8574 GemmMicrokernelTester() 8575 .mr(6) 8576 .nr(16) 8577 .kr(1) 8578 .sr(1) 8579 .m(6) 8580 .n(n) 8581 .k(k) 8582 .cn_stride(19) 8583 .Test(xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 8584 } 8585 } 8586 } 8587 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64,n_div_16_strided_a)8588 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64, n_div_16_strided_a) { 8589 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 8590 for (uint32_t n = 32; n <= 48; n += 16) { 8591 for (size_t k = 1; k <= 20; k += 5) { 8592 GemmMicrokernelTester() 8593 .mr(6) 8594 .nr(16) 8595 .kr(1) 8596 .sr(1) 8597 .m(6) 8598 .n(n) 8599 .k(k) 8600 .a_stride(23) 8601 .Test(xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 8602 } 8603 } 8604 } 8605 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64,n_div_16_subtile)8606 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64, n_div_16_subtile) { 8607 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 8608 for (uint32_t n = 32; n <= 48; n += 16) { 8609 for (size_t k = 1; k <= 20; k += 5) { 8610 for (uint32_t m = 1; m <= 6; m++) { 8611 GemmMicrokernelTester() 8612 .mr(6) 8613 .nr(16) 8614 .kr(1) 8615 .sr(1) 8616 .m(m) 8617 .n(n) 8618 .k(k) 8619 .iterations(1) 8620 .Test(xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 8621 } 8622 } 8623 } 8624 } 8625 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64,strided_cm_subtile)8626 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64, strided_cm_subtile) { 8627 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 8628 for (size_t k = 1; k <= 20; k += 5) { 8629 for (uint32_t n = 1; n <= 16; n++) { 8630 for (uint32_t m = 1; m <= 6; m++) { 8631 GemmMicrokernelTester() 8632 .mr(6) 8633 .nr(16) 8634 .kr(1) 8635 .sr(1) 8636 .m(m) 8637 .n(n) 8638 .k(k) 8639 .cm_stride(19) 8640 .iterations(1) 8641 .Test(xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64, 
xnn_init_f16_minmax_neon_params); 8642 } 8643 } 8644 } 8645 } 8646 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64,qmin)8647 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64, qmin) { 8648 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 8649 GemmMicrokernelTester() 8650 .mr(6) 8651 .nr(16) 8652 .kr(1) 8653 .sr(1) 8654 .m(6) 8655 .n(16) 8656 .k(4) 8657 .qmin(128) 8658 .Test(xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 8659 } 8660 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64,qmax)8661 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64, qmax) { 8662 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 8663 GemmMicrokernelTester() 8664 .mr(6) 8665 .nr(16) 8666 .kr(1) 8667 .sr(1) 8668 .m(6) 8669 .n(16) 8670 .k(4) 8671 .qmax(128) 8672 .Test(xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 8673 } 8674 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64,strided_cm)8675 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64, strided_cm) { 8676 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 8677 GemmMicrokernelTester() 8678 .mr(6) 8679 .nr(16) 8680 .kr(1) 8681 .sr(1) 8682 .m(6) 8683 .n(16) 8684 .k(4) 8685 .cm_stride(19) 8686 .Test(xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 8687 } 8688 #endif // XNN_ENABLE_ARM_FP16 && (XNN_ARCH_ARM || XNN_ARCH_ARM64) 8689 8690 8691 #if XNN_ENABLE_ARM_FP16 && (XNN_ARCH_ARM || XNN_ARCH_ARM64) TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64,k_eq_4)8692 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64, k_eq_4) { 8693 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 8694 GemmMicrokernelTester() 8695 .mr(8) 8696 .nr(8) 8697 .kr(1) 8698 .sr(1) 8699 .m(8) 8700 .n(8) 8701 .k(4) 8702 .Test(xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 8703 } 8704 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64,strided_cn)8705 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64, strided_cn) { 8706 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 8707 GemmMicrokernelTester() 8708 .mr(8) 8709 .nr(8) 8710 .kr(1) 8711 
.sr(1) 8712 .m(8) 8713 .n(8) 8714 .k(4) 8715 .cn_stride(11) 8716 .Test(xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 8717 } 8718 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64,k_eq_4_strided_a)8719 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64, k_eq_4_strided_a) { 8720 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 8721 GemmMicrokernelTester() 8722 .mr(8) 8723 .nr(8) 8724 .kr(1) 8725 .sr(1) 8726 .m(8) 8727 .n(8) 8728 .k(4) 8729 .a_stride(7) 8730 .Test(xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 8731 } 8732 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64,k_eq_4_subtile)8733 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64, k_eq_4_subtile) { 8734 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 8735 for (uint32_t n = 1; n <= 8; n++) { 8736 for (uint32_t m = 1; m <= 8; m++) { 8737 GemmMicrokernelTester() 8738 .mr(8) 8739 .nr(8) 8740 .kr(1) 8741 .sr(1) 8742 .m(m) 8743 .n(n) 8744 .k(4) 8745 .iterations(1) 8746 .Test(xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 8747 } 8748 } 8749 } 8750 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64,k_eq_4_subtile_m)8751 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64, k_eq_4_subtile_m) { 8752 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 8753 for (uint32_t m = 1; m <= 8; m++) { 8754 GemmMicrokernelTester() 8755 .mr(8) 8756 .nr(8) 8757 .kr(1) 8758 .sr(1) 8759 .m(m) 8760 .n(8) 8761 .k(4) 8762 .iterations(1) 8763 .Test(xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 8764 } 8765 } 8766 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64,k_eq_4_subtile_n)8767 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64, k_eq_4_subtile_n) { 8768 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 8769 for (uint32_t n = 1; n <= 8; n++) { 8770 GemmMicrokernelTester() 8771 .mr(8) 8772 .nr(8) 8773 .kr(1) 8774 .sr(1) 8775 .m(8) 8776 .n(n) 8777 .k(4) 8778 .iterations(1) 8779 .Test(xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 8780 } 
8781 } 8782 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64,k_lt_4)8783 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64, k_lt_4) { 8784 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 8785 for (size_t k = 1; k < 4; k++) { 8786 GemmMicrokernelTester() 8787 .mr(8) 8788 .nr(8) 8789 .kr(1) 8790 .sr(1) 8791 .m(8) 8792 .n(8) 8793 .k(k) 8794 .Test(xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 8795 } 8796 } 8797 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64,k_lt_4_strided_a)8798 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64, k_lt_4_strided_a) { 8799 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 8800 for (size_t k = 1; k < 4; k++) { 8801 GemmMicrokernelTester() 8802 .mr(8) 8803 .nr(8) 8804 .kr(1) 8805 .sr(1) 8806 .m(8) 8807 .n(8) 8808 .k(k) 8809 .a_stride(7) 8810 .Test(xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 8811 } 8812 } 8813 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64,k_lt_4_subtile)8814 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64, k_lt_4_subtile) { 8815 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 8816 for (size_t k = 1; k < 4; k++) { 8817 for (uint32_t n = 1; n <= 8; n++) { 8818 for (uint32_t m = 1; m <= 8; m++) { 8819 GemmMicrokernelTester() 8820 .mr(8) 8821 .nr(8) 8822 .kr(1) 8823 .sr(1) 8824 .m(m) 8825 .n(n) 8826 .k(k) 8827 .iterations(1) 8828 .Test(xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 8829 } 8830 } 8831 } 8832 } 8833 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64,k_gt_4)8834 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64, k_gt_4) { 8835 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 8836 for (size_t k = 5; k < 8; k++) { 8837 GemmMicrokernelTester() 8838 .mr(8) 8839 .nr(8) 8840 .kr(1) 8841 .sr(1) 8842 .m(8) 8843 .n(8) 8844 .k(k) 8845 .Test(xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 8846 } 8847 } 8848 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64,k_gt_4_strided_a)8849 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64, k_gt_4_strided_a) { 8850 
TEST_REQUIRES_ARM_NEON_FP16_ARITH; 8851 for (size_t k = 5; k < 8; k++) { 8852 GemmMicrokernelTester() 8853 .mr(8) 8854 .nr(8) 8855 .kr(1) 8856 .sr(1) 8857 .m(8) 8858 .n(8) 8859 .k(k) 8860 .a_stride(11) 8861 .Test(xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 8862 } 8863 } 8864 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64,k_gt_4_subtile)8865 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64, k_gt_4_subtile) { 8866 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 8867 for (size_t k = 5; k < 8; k++) { 8868 for (uint32_t n = 1; n <= 8; n++) { 8869 for (uint32_t m = 1; m <= 8; m++) { 8870 GemmMicrokernelTester() 8871 .mr(8) 8872 .nr(8) 8873 .kr(1) 8874 .sr(1) 8875 .m(m) 8876 .n(n) 8877 .k(k) 8878 .iterations(1) 8879 .Test(xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 8880 } 8881 } 8882 } 8883 } 8884 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64,k_div_4)8885 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64, k_div_4) { 8886 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 8887 for (size_t k = 8; k <= 40; k += 4) { 8888 GemmMicrokernelTester() 8889 .mr(8) 8890 .nr(8) 8891 .kr(1) 8892 .sr(1) 8893 .m(8) 8894 .n(8) 8895 .k(k) 8896 .Test(xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 8897 } 8898 } 8899 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64,k_div_4_strided_a)8900 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64, k_div_4_strided_a) { 8901 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 8902 for (size_t k = 8; k <= 40; k += 4) { 8903 GemmMicrokernelTester() 8904 .mr(8) 8905 .nr(8) 8906 .kr(1) 8907 .sr(1) 8908 .m(8) 8909 .n(8) 8910 .k(k) 8911 .a_stride(43) 8912 .Test(xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 8913 } 8914 } 8915 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64,k_div_4_subtile)8916 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64, k_div_4_subtile) { 8917 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 8918 for (size_t k = 8; k <= 40; k += 4) { 8919 for (uint32_t n = 1; n 
<= 8; n++) { 8920 for (uint32_t m = 1; m <= 8; m++) { 8921 GemmMicrokernelTester() 8922 .mr(8) 8923 .nr(8) 8924 .kr(1) 8925 .sr(1) 8926 .m(m) 8927 .n(n) 8928 .k(k) 8929 .iterations(1) 8930 .Test(xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 8931 } 8932 } 8933 } 8934 } 8935 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64,n_gt_8)8936 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64, n_gt_8) { 8937 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 8938 for (uint32_t n = 9; n < 16; n++) { 8939 for (size_t k = 1; k <= 20; k += 5) { 8940 GemmMicrokernelTester() 8941 .mr(8) 8942 .nr(8) 8943 .kr(1) 8944 .sr(1) 8945 .m(8) 8946 .n(n) 8947 .k(k) 8948 .Test(xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 8949 } 8950 } 8951 } 8952 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64,n_gt_8_strided_cn)8953 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64, n_gt_8_strided_cn) { 8954 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 8955 for (uint32_t n = 9; n < 16; n++) { 8956 for (size_t k = 1; k <= 20; k += 5) { 8957 GemmMicrokernelTester() 8958 .mr(8) 8959 .nr(8) 8960 .kr(1) 8961 .sr(1) 8962 .m(8) 8963 .n(n) 8964 .k(k) 8965 .cn_stride(11) 8966 .Test(xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 8967 } 8968 } 8969 } 8970 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64,n_gt_8_strided_a)8971 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64, n_gt_8_strided_a) { 8972 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 8973 for (uint32_t n = 9; n < 16; n++) { 8974 for (size_t k = 1; k <= 20; k += 5) { 8975 GemmMicrokernelTester() 8976 .mr(8) 8977 .nr(8) 8978 .kr(1) 8979 .sr(1) 8980 .m(8) 8981 .n(n) 8982 .k(k) 8983 .a_stride(23) 8984 .Test(xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 8985 } 8986 } 8987 } 8988 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64,n_gt_8_subtile)8989 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64, n_gt_8_subtile) { 8990 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 8991 for (uint32_t n 
= 9; n < 16; n++) { 8992 for (size_t k = 1; k <= 20; k += 5) { 8993 for (uint32_t m = 1; m <= 8; m++) { 8994 GemmMicrokernelTester() 8995 .mr(8) 8996 .nr(8) 8997 .kr(1) 8998 .sr(1) 8999 .m(m) 9000 .n(n) 9001 .k(k) 9002 .iterations(1) 9003 .Test(xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 9004 } 9005 } 9006 } 9007 } 9008 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64,n_div_8)9009 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64, n_div_8) { 9010 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 9011 for (uint32_t n = 16; n <= 24; n += 8) { 9012 for (size_t k = 1; k <= 20; k += 5) { 9013 GemmMicrokernelTester() 9014 .mr(8) 9015 .nr(8) 9016 .kr(1) 9017 .sr(1) 9018 .m(8) 9019 .n(n) 9020 .k(k) 9021 .Test(xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 9022 } 9023 } 9024 } 9025 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64,n_div_8_strided_cn)9026 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64, n_div_8_strided_cn) { 9027 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 9028 for (uint32_t n = 16; n <= 24; n += 8) { 9029 for (size_t k = 1; k <= 20; k += 5) { 9030 GemmMicrokernelTester() 9031 .mr(8) 9032 .nr(8) 9033 .kr(1) 9034 .sr(1) 9035 .m(8) 9036 .n(n) 9037 .k(k) 9038 .cn_stride(11) 9039 .Test(xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 9040 } 9041 } 9042 } 9043 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64,n_div_8_strided_a)9044 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64, n_div_8_strided_a) { 9045 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 9046 for (uint32_t n = 16; n <= 24; n += 8) { 9047 for (size_t k = 1; k <= 20; k += 5) { 9048 GemmMicrokernelTester() 9049 .mr(8) 9050 .nr(8) 9051 .kr(1) 9052 .sr(1) 9053 .m(8) 9054 .n(n) 9055 .k(k) 9056 .a_stride(23) 9057 .Test(xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 9058 } 9059 } 9060 } 9061 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64,n_div_8_subtile)9062 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64, 
n_div_8_subtile) { 9063 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 9064 for (uint32_t n = 16; n <= 24; n += 8) { 9065 for (size_t k = 1; k <= 20; k += 5) { 9066 for (uint32_t m = 1; m <= 8; m++) { 9067 GemmMicrokernelTester() 9068 .mr(8) 9069 .nr(8) 9070 .kr(1) 9071 .sr(1) 9072 .m(m) 9073 .n(n) 9074 .k(k) 9075 .iterations(1) 9076 .Test(xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 9077 } 9078 } 9079 } 9080 } 9081 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64,strided_cm_subtile)9082 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64, strided_cm_subtile) { 9083 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 9084 for (size_t k = 1; k <= 20; k += 5) { 9085 for (uint32_t n = 1; n <= 8; n++) { 9086 for (uint32_t m = 1; m <= 8; m++) { 9087 GemmMicrokernelTester() 9088 .mr(8) 9089 .nr(8) 9090 .kr(1) 9091 .sr(1) 9092 .m(m) 9093 .n(n) 9094 .k(k) 9095 .cm_stride(11) 9096 .iterations(1) 9097 .Test(xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 9098 } 9099 } 9100 } 9101 } 9102 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64,qmin)9103 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64, qmin) { 9104 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 9105 GemmMicrokernelTester() 9106 .mr(8) 9107 .nr(8) 9108 .kr(1) 9109 .sr(1) 9110 .m(8) 9111 .n(8) 9112 .k(4) 9113 .qmin(128) 9114 .Test(xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 9115 } 9116 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64,qmax)9117 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64, qmax) { 9118 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 9119 GemmMicrokernelTester() 9120 .mr(8) 9121 .nr(8) 9122 .kr(1) 9123 .sr(1) 9124 .m(8) 9125 .n(8) 9126 .k(4) 9127 .qmax(128) 9128 .Test(xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 9129 } 9130 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64,strided_cm)9131 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64, strided_cm) { 9132 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 9133 GemmMicrokernelTester() 9134 
.mr(8) 9135 .nr(8) 9136 .kr(1) 9137 .sr(1) 9138 .m(8) 9139 .n(8) 9140 .k(4) 9141 .cm_stride(11) 9142 .Test(xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 9143 } 9144 #endif // XNN_ENABLE_ARM_FP16 && (XNN_ARCH_ARM || XNN_ARCH_ARM64) 9145 9146 9147 #if XNN_ENABLE_ARM_FP16 && (XNN_ARCH_ARM || XNN_ARCH_ARM64) TEST(F16_GEMM_MINMAX_8X16__NEONFP16ARITH_LD64,k_eq_4)9148 TEST(F16_GEMM_MINMAX_8X16__NEONFP16ARITH_LD64, k_eq_4) { 9149 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 9150 GemmMicrokernelTester() 9151 .mr(8) 9152 .nr(16) 9153 .kr(1) 9154 .sr(1) 9155 .m(8) 9156 .n(16) 9157 .k(4) 9158 .Test(xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 9159 } 9160 TEST(F16_GEMM_MINMAX_8X16__NEONFP16ARITH_LD64,strided_cn)9161 TEST(F16_GEMM_MINMAX_8X16__NEONFP16ARITH_LD64, strided_cn) { 9162 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 9163 GemmMicrokernelTester() 9164 .mr(8) 9165 .nr(16) 9166 .kr(1) 9167 .sr(1) 9168 .m(8) 9169 .n(16) 9170 .k(4) 9171 .cn_stride(19) 9172 .Test(xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 9173 } 9174 TEST(F16_GEMM_MINMAX_8X16__NEONFP16ARITH_LD64,k_eq_4_strided_a)9175 TEST(F16_GEMM_MINMAX_8X16__NEONFP16ARITH_LD64, k_eq_4_strided_a) { 9176 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 9177 GemmMicrokernelTester() 9178 .mr(8) 9179 .nr(16) 9180 .kr(1) 9181 .sr(1) 9182 .m(8) 9183 .n(16) 9184 .k(4) 9185 .a_stride(7) 9186 .Test(xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 9187 } 9188 TEST(F16_GEMM_MINMAX_8X16__NEONFP16ARITH_LD64,k_eq_4_subtile)9189 TEST(F16_GEMM_MINMAX_8X16__NEONFP16ARITH_LD64, k_eq_4_subtile) { 9190 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 9191 for (uint32_t n = 1; n <= 16; n++) { 9192 for (uint32_t m = 1; m <= 8; m++) { 9193 GemmMicrokernelTester() 9194 .mr(8) 9195 .nr(16) 9196 .kr(1) 9197 .sr(1) 9198 .m(m) 9199 .n(n) 9200 .k(4) 9201 .iterations(1) 9202 .Test(xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64, 
xnn_init_f16_minmax_neon_params); 9203 } 9204 } 9205 } 9206 TEST(F16_GEMM_MINMAX_8X16__NEONFP16ARITH_LD64,k_eq_4_subtile_m)9207 TEST(F16_GEMM_MINMAX_8X16__NEONFP16ARITH_LD64, k_eq_4_subtile_m) { 9208 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 9209 for (uint32_t m = 1; m <= 8; m++) { 9210 GemmMicrokernelTester() 9211 .mr(8) 9212 .nr(16) 9213 .kr(1) 9214 .sr(1) 9215 .m(m) 9216 .n(16) 9217 .k(4) 9218 .iterations(1) 9219 .Test(xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 9220 } 9221 } 9222 TEST(F16_GEMM_MINMAX_8X16__NEONFP16ARITH_LD64,k_eq_4_subtile_n)9223 TEST(F16_GEMM_MINMAX_8X16__NEONFP16ARITH_LD64, k_eq_4_subtile_n) { 9224 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 9225 for (uint32_t n = 1; n <= 16; n++) { 9226 GemmMicrokernelTester() 9227 .mr(8) 9228 .nr(16) 9229 .kr(1) 9230 .sr(1) 9231 .m(8) 9232 .n(n) 9233 .k(4) 9234 .iterations(1) 9235 .Test(xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 9236 } 9237 } 9238 TEST(F16_GEMM_MINMAX_8X16__NEONFP16ARITH_LD64,k_lt_4)9239 TEST(F16_GEMM_MINMAX_8X16__NEONFP16ARITH_LD64, k_lt_4) { 9240 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 9241 for (size_t k = 1; k < 4; k++) { 9242 GemmMicrokernelTester() 9243 .mr(8) 9244 .nr(16) 9245 .kr(1) 9246 .sr(1) 9247 .m(8) 9248 .n(16) 9249 .k(k) 9250 .Test(xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 9251 } 9252 } 9253 TEST(F16_GEMM_MINMAX_8X16__NEONFP16ARITH_LD64,k_lt_4_strided_a)9254 TEST(F16_GEMM_MINMAX_8X16__NEONFP16ARITH_LD64, k_lt_4_strided_a) { 9255 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 9256 for (size_t k = 1; k < 4; k++) { 9257 GemmMicrokernelTester() 9258 .mr(8) 9259 .nr(16) 9260 .kr(1) 9261 .sr(1) 9262 .m(8) 9263 .n(16) 9264 .k(k) 9265 .a_stride(7) 9266 .Test(xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 9267 } 9268 } 9269 TEST(F16_GEMM_MINMAX_8X16__NEONFP16ARITH_LD64,k_lt_4_subtile)9270 TEST(F16_GEMM_MINMAX_8X16__NEONFP16ARITH_LD64, 
k_lt_4_subtile) { 9271 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 9272 for (size_t k = 1; k < 4; k++) { 9273 for (uint32_t n = 1; n <= 16; n++) { 9274 for (uint32_t m = 1; m <= 8; m++) { 9275 GemmMicrokernelTester() 9276 .mr(8) 9277 .nr(16) 9278 .kr(1) 9279 .sr(1) 9280 .m(m) 9281 .n(n) 9282 .k(k) 9283 .iterations(1) 9284 .Test(xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 9285 } 9286 } 9287 } 9288 } 9289 TEST(F16_GEMM_MINMAX_8X16__NEONFP16ARITH_LD64,k_gt_4)9290 TEST(F16_GEMM_MINMAX_8X16__NEONFP16ARITH_LD64, k_gt_4) { 9291 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 9292 for (size_t k = 5; k < 8; k++) { 9293 GemmMicrokernelTester() 9294 .mr(8) 9295 .nr(16) 9296 .kr(1) 9297 .sr(1) 9298 .m(8) 9299 .n(16) 9300 .k(k) 9301 .Test(xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 9302 } 9303 } 9304 TEST(F16_GEMM_MINMAX_8X16__NEONFP16ARITH_LD64,k_gt_4_strided_a)9305 TEST(F16_GEMM_MINMAX_8X16__NEONFP16ARITH_LD64, k_gt_4_strided_a) { 9306 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 9307 for (size_t k = 5; k < 8; k++) { 9308 GemmMicrokernelTester() 9309 .mr(8) 9310 .nr(16) 9311 .kr(1) 9312 .sr(1) 9313 .m(8) 9314 .n(16) 9315 .k(k) 9316 .a_stride(11) 9317 .Test(xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 9318 } 9319 } 9320 TEST(F16_GEMM_MINMAX_8X16__NEONFP16ARITH_LD64,k_gt_4_subtile)9321 TEST(F16_GEMM_MINMAX_8X16__NEONFP16ARITH_LD64, k_gt_4_subtile) { 9322 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 9323 for (size_t k = 5; k < 8; k++) { 9324 for (uint32_t n = 1; n <= 16; n++) { 9325 for (uint32_t m = 1; m <= 8; m++) { 9326 GemmMicrokernelTester() 9327 .mr(8) 9328 .nr(16) 9329 .kr(1) 9330 .sr(1) 9331 .m(m) 9332 .n(n) 9333 .k(k) 9334 .iterations(1) 9335 .Test(xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 9336 } 9337 } 9338 } 9339 } 9340 TEST(F16_GEMM_MINMAX_8X16__NEONFP16ARITH_LD64,k_div_4)9341 TEST(F16_GEMM_MINMAX_8X16__NEONFP16ARITH_LD64, k_div_4) { 
9342 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 9343 for (size_t k = 8; k <= 40; k += 4) { 9344 GemmMicrokernelTester() 9345 .mr(8) 9346 .nr(16) 9347 .kr(1) 9348 .sr(1) 9349 .m(8) 9350 .n(16) 9351 .k(k) 9352 .Test(xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 9353 } 9354 } 9355 TEST(F16_GEMM_MINMAX_8X16__NEONFP16ARITH_LD64,k_div_4_strided_a)9356 TEST(F16_GEMM_MINMAX_8X16__NEONFP16ARITH_LD64, k_div_4_strided_a) { 9357 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 9358 for (size_t k = 8; k <= 40; k += 4) { 9359 GemmMicrokernelTester() 9360 .mr(8) 9361 .nr(16) 9362 .kr(1) 9363 .sr(1) 9364 .m(8) 9365 .n(16) 9366 .k(k) 9367 .a_stride(43) 9368 .Test(xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 9369 } 9370 } 9371 TEST(F16_GEMM_MINMAX_8X16__NEONFP16ARITH_LD64,k_div_4_subtile)9372 TEST(F16_GEMM_MINMAX_8X16__NEONFP16ARITH_LD64, k_div_4_subtile) { 9373 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 9374 for (size_t k = 8; k <= 40; k += 4) { 9375 for (uint32_t n = 1; n <= 16; n++) { 9376 for (uint32_t m = 1; m <= 8; m++) { 9377 GemmMicrokernelTester() 9378 .mr(8) 9379 .nr(16) 9380 .kr(1) 9381 .sr(1) 9382 .m(m) 9383 .n(n) 9384 .k(k) 9385 .iterations(1) 9386 .Test(xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 9387 } 9388 } 9389 } 9390 } 9391 TEST(F16_GEMM_MINMAX_8X16__NEONFP16ARITH_LD64,n_gt_16)9392 TEST(F16_GEMM_MINMAX_8X16__NEONFP16ARITH_LD64, n_gt_16) { 9393 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 9394 for (uint32_t n = 17; n < 32; n++) { 9395 for (size_t k = 1; k <= 20; k += 5) { 9396 GemmMicrokernelTester() 9397 .mr(8) 9398 .nr(16) 9399 .kr(1) 9400 .sr(1) 9401 .m(8) 9402 .n(n) 9403 .k(k) 9404 .Test(xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 9405 } 9406 } 9407 } 9408 TEST(F16_GEMM_MINMAX_8X16__NEONFP16ARITH_LD64,n_gt_16_strided_cn)9409 TEST(F16_GEMM_MINMAX_8X16__NEONFP16ARITH_LD64, n_gt_16_strided_cn) { 9410 
TEST_REQUIRES_ARM_NEON_FP16_ARITH; 9411 for (uint32_t n = 17; n < 32; n++) { 9412 for (size_t k = 1; k <= 20; k += 5) { 9413 GemmMicrokernelTester() 9414 .mr(8) 9415 .nr(16) 9416 .kr(1) 9417 .sr(1) 9418 .m(8) 9419 .n(n) 9420 .k(k) 9421 .cn_stride(19) 9422 .Test(xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 9423 } 9424 } 9425 } 9426 TEST(F16_GEMM_MINMAX_8X16__NEONFP16ARITH_LD64,n_gt_16_strided_a)9427 TEST(F16_GEMM_MINMAX_8X16__NEONFP16ARITH_LD64, n_gt_16_strided_a) { 9428 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 9429 for (uint32_t n = 17; n < 32; n++) { 9430 for (size_t k = 1; k <= 20; k += 5) { 9431 GemmMicrokernelTester() 9432 .mr(8) 9433 .nr(16) 9434 .kr(1) 9435 .sr(1) 9436 .m(8) 9437 .n(n) 9438 .k(k) 9439 .a_stride(23) 9440 .Test(xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 9441 } 9442 } 9443 } 9444 TEST(F16_GEMM_MINMAX_8X16__NEONFP16ARITH_LD64,n_gt_16_subtile)9445 TEST(F16_GEMM_MINMAX_8X16__NEONFP16ARITH_LD64, n_gt_16_subtile) { 9446 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 9447 for (uint32_t n = 17; n < 32; n++) { 9448 for (size_t k = 1; k <= 20; k += 5) { 9449 for (uint32_t m = 1; m <= 8; m++) { 9450 GemmMicrokernelTester() 9451 .mr(8) 9452 .nr(16) 9453 .kr(1) 9454 .sr(1) 9455 .m(m) 9456 .n(n) 9457 .k(k) 9458 .iterations(1) 9459 .Test(xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 9460 } 9461 } 9462 } 9463 } 9464 TEST(F16_GEMM_MINMAX_8X16__NEONFP16ARITH_LD64,n_div_16)9465 TEST(F16_GEMM_MINMAX_8X16__NEONFP16ARITH_LD64, n_div_16) { 9466 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 9467 for (uint32_t n = 32; n <= 48; n += 16) { 9468 for (size_t k = 1; k <= 20; k += 5) { 9469 GemmMicrokernelTester() 9470 .mr(8) 9471 .nr(16) 9472 .kr(1) 9473 .sr(1) 9474 .m(8) 9475 .n(n) 9476 .k(k) 9477 .Test(xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 9478 } 9479 } 9480 } 9481 
// n = 32 and 48 crossed with k = 1, 6, 11, 16; m fixed at 8, cn_stride set to 19.
  TEST(F16_GEMM_MINMAX_8X16__NEONFP16ARITH_LD64, n_div_16_strided_cn) {
    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
    for (uint32_t n = 32; n <= 48; n += 16) {
      for (size_t k = 1; k <= 20; k += 5) {
        GemmMicrokernelTester()
          .mr(8)
          .nr(16)
          .kr(1)
          .sr(1)
          .m(8)
          .n(n)
          .k(k)
          .cn_stride(19)
          .Test(xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
      }
    }
  }

  // n = 32 and 48 crossed with k = 1, 6, 11, 16; m fixed at 8, a_stride set to 23.
  TEST(F16_GEMM_MINMAX_8X16__NEONFP16ARITH_LD64, n_div_16_strided_a) {
    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
    for (uint32_t n = 32; n <= 48; n += 16) {
      for (size_t k = 1; k <= 20; k += 5) {
        GemmMicrokernelTester()
          .mr(8)
          .nr(16)
          .kr(1)
          .sr(1)
          .m(8)
          .n(n)
          .k(k)
          .a_stride(23)
          .Test(xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
      }
    }
  }

  // n = 32 and 48, k = 1, 6, 11, 16, and every m in 1..8; one iteration per case.
  TEST(F16_GEMM_MINMAX_8X16__NEONFP16ARITH_LD64, n_div_16_subtile) {
    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
    for (uint32_t n = 32; n <= 48; n += 16) {
      for (size_t k = 1; k <= 20; k += 5) {
        for (uint32_t m = 1; m <= 8; m++) {
          GemmMicrokernelTester()
            .mr(8)
            .nr(16)
            .kr(1)
            .sr(1)
            .m(m)
            .n(n)
            .k(k)
            .iterations(1)
            .Test(xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
        }
      }
    }
  }

  // Every m in 1..8 and n in 1..16 for k = 1, 6, 11, 16, with cm_stride set to 19.
  TEST(F16_GEMM_MINMAX_8X16__NEONFP16ARITH_LD64, strided_cm_subtile) {
    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
    for (size_t k = 1; k <= 20; k += 5) {
      for (uint32_t n = 1; n <= 16; n++) {
        for (uint32_t m = 1; m <= 8; m++) {
          GemmMicrokernelTester()
            .mr(8)
            .nr(16)
            .kr(1)
.sr(1) 9548 .m(m) 9549 .n(n) 9550 .k(k) 9551 .cm_stride(19) 9552 .iterations(1) 9553 .Test(xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 9554 } 9555 } 9556 } 9557 } 9558 TEST(F16_GEMM_MINMAX_8X16__NEONFP16ARITH_LD64,qmin)9559 TEST(F16_GEMM_MINMAX_8X16__NEONFP16ARITH_LD64, qmin) { 9560 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 9561 GemmMicrokernelTester() 9562 .mr(8) 9563 .nr(16) 9564 .kr(1) 9565 .sr(1) 9566 .m(8) 9567 .n(16) 9568 .k(4) 9569 .qmin(128) 9570 .Test(xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 9571 } 9572 TEST(F16_GEMM_MINMAX_8X16__NEONFP16ARITH_LD64,qmax)9573 TEST(F16_GEMM_MINMAX_8X16__NEONFP16ARITH_LD64, qmax) { 9574 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 9575 GemmMicrokernelTester() 9576 .mr(8) 9577 .nr(16) 9578 .kr(1) 9579 .sr(1) 9580 .m(8) 9581 .n(16) 9582 .k(4) 9583 .qmax(128) 9584 .Test(xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 9585 } 9586 TEST(F16_GEMM_MINMAX_8X16__NEONFP16ARITH_LD64,strided_cm)9587 TEST(F16_GEMM_MINMAX_8X16__NEONFP16ARITH_LD64, strided_cm) { 9588 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 9589 GemmMicrokernelTester() 9590 .mr(8) 9591 .nr(16) 9592 .kr(1) 9593 .sr(1) 9594 .m(8) 9595 .n(16) 9596 .k(4) 9597 .cm_stride(19) 9598 .Test(xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 9599 } 9600 #endif // XNN_ENABLE_ARM_FP16 && (XNN_ARCH_ARM || XNN_ARCH_ARM64) 9601 9602 9603 #if XNN_ARCH_X86 || XNN_ARCH_X86_64 TEST(F16_GEMM_MINMAX_1X8__AVX2_BROADCAST,k_eq_1)9604 TEST(F16_GEMM_MINMAX_1X8__AVX2_BROADCAST, k_eq_1) { 9605 TEST_REQUIRES_X86_AVX2; 9606 GemmMicrokernelTester() 9607 .mr(1) 9608 .nr(8) 9609 .kr(1) 9610 .sr(1) 9611 .m(1) 9612 .n(8) 9613 .k(1) 9614 .Test(xnn_f16_gemm_minmax_ukernel_1x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 9615 } 9616 TEST(F16_GEMM_MINMAX_1X8__AVX2_BROADCAST,strided_cn)9617 TEST(F16_GEMM_MINMAX_1X8__AVX2_BROADCAST, strided_cn) { 9618 
TEST_REQUIRES_X86_AVX2; 9619 GemmMicrokernelTester() 9620 .mr(1) 9621 .nr(8) 9622 .kr(1) 9623 .sr(1) 9624 .m(1) 9625 .n(8) 9626 .k(1) 9627 .cn_stride(11) 9628 .Test(xnn_f16_gemm_minmax_ukernel_1x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 9629 } 9630 TEST(F16_GEMM_MINMAX_1X8__AVX2_BROADCAST,k_eq_1_strided_a)9631 TEST(F16_GEMM_MINMAX_1X8__AVX2_BROADCAST, k_eq_1_strided_a) { 9632 TEST_REQUIRES_X86_AVX2; 9633 GemmMicrokernelTester() 9634 .mr(1) 9635 .nr(8) 9636 .kr(1) 9637 .sr(1) 9638 .m(1) 9639 .n(8) 9640 .k(1) 9641 .a_stride(3) 9642 .Test(xnn_f16_gemm_minmax_ukernel_1x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 9643 } 9644 TEST(F16_GEMM_MINMAX_1X8__AVX2_BROADCAST,k_eq_1_subtile)9645 TEST(F16_GEMM_MINMAX_1X8__AVX2_BROADCAST, k_eq_1_subtile) { 9646 TEST_REQUIRES_X86_AVX2; 9647 for (uint32_t n = 1; n <= 8; n++) { 9648 for (uint32_t m = 1; m <= 1; m++) { 9649 GemmMicrokernelTester() 9650 .mr(1) 9651 .nr(8) 9652 .kr(1) 9653 .sr(1) 9654 .m(m) 9655 .n(n) 9656 .k(1) 9657 .iterations(1) 9658 .Test(xnn_f16_gemm_minmax_ukernel_1x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 9659 } 9660 } 9661 } 9662 TEST(F16_GEMM_MINMAX_1X8__AVX2_BROADCAST,k_eq_1_subtile_m)9663 TEST(F16_GEMM_MINMAX_1X8__AVX2_BROADCAST, k_eq_1_subtile_m) { 9664 TEST_REQUIRES_X86_AVX2; 9665 for (uint32_t m = 1; m <= 1; m++) { 9666 GemmMicrokernelTester() 9667 .mr(1) 9668 .nr(8) 9669 .kr(1) 9670 .sr(1) 9671 .m(m) 9672 .n(8) 9673 .k(1) 9674 .iterations(1) 9675 .Test(xnn_f16_gemm_minmax_ukernel_1x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 9676 } 9677 } 9678 TEST(F16_GEMM_MINMAX_1X8__AVX2_BROADCAST,k_eq_1_subtile_n)9679 TEST(F16_GEMM_MINMAX_1X8__AVX2_BROADCAST, k_eq_1_subtile_n) { 9680 TEST_REQUIRES_X86_AVX2; 9681 for (uint32_t n = 1; n <= 8; n++) { 9682 GemmMicrokernelTester() 9683 .mr(1) 9684 .nr(8) 9685 .kr(1) 9686 .sr(1) 9687 .m(1) 9688 .n(n) 9689 .k(1) 9690 .iterations(1) 9691 .Test(xnn_f16_gemm_minmax_ukernel_1x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 9692 } 9693 } 
9694 TEST(F16_GEMM_MINMAX_1X8__AVX2_BROADCAST,k_gt_1)9695 TEST(F16_GEMM_MINMAX_1X8__AVX2_BROADCAST, k_gt_1) { 9696 TEST_REQUIRES_X86_AVX2; 9697 for (size_t k = 2; k < 10; k++) { 9698 GemmMicrokernelTester() 9699 .mr(1) 9700 .nr(8) 9701 .kr(1) 9702 .sr(1) 9703 .m(1) 9704 .n(8) 9705 .k(k) 9706 .Test(xnn_f16_gemm_minmax_ukernel_1x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 9707 } 9708 } 9709 TEST(F16_GEMM_MINMAX_1X8__AVX2_BROADCAST,k_gt_1_strided_a)9710 TEST(F16_GEMM_MINMAX_1X8__AVX2_BROADCAST, k_gt_1_strided_a) { 9711 TEST_REQUIRES_X86_AVX2; 9712 for (size_t k = 2; k < 10; k++) { 9713 GemmMicrokernelTester() 9714 .mr(1) 9715 .nr(8) 9716 .kr(1) 9717 .sr(1) 9718 .m(1) 9719 .n(8) 9720 .k(k) 9721 .a_stride(11) 9722 .Test(xnn_f16_gemm_minmax_ukernel_1x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 9723 } 9724 } 9725 TEST(F16_GEMM_MINMAX_1X8__AVX2_BROADCAST,k_gt_1_subtile)9726 TEST(F16_GEMM_MINMAX_1X8__AVX2_BROADCAST, k_gt_1_subtile) { 9727 TEST_REQUIRES_X86_AVX2; 9728 for (size_t k = 2; k < 10; k++) { 9729 for (uint32_t n = 1; n <= 8; n++) { 9730 for (uint32_t m = 1; m <= 1; m++) { 9731 GemmMicrokernelTester() 9732 .mr(1) 9733 .nr(8) 9734 .kr(1) 9735 .sr(1) 9736 .m(m) 9737 .n(n) 9738 .k(k) 9739 .iterations(1) 9740 .Test(xnn_f16_gemm_minmax_ukernel_1x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 9741 } 9742 } 9743 } 9744 } 9745 TEST(F16_GEMM_MINMAX_1X8__AVX2_BROADCAST,n_gt_8)9746 TEST(F16_GEMM_MINMAX_1X8__AVX2_BROADCAST, n_gt_8) { 9747 TEST_REQUIRES_X86_AVX2; 9748 for (uint32_t n = 9; n < 16; n++) { 9749 for (size_t k = 1; k <= 5; k += 2) { 9750 GemmMicrokernelTester() 9751 .mr(1) 9752 .nr(8) 9753 .kr(1) 9754 .sr(1) 9755 .m(1) 9756 .n(n) 9757 .k(k) 9758 .Test(xnn_f16_gemm_minmax_ukernel_1x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 9759 } 9760 } 9761 } 9762 TEST(F16_GEMM_MINMAX_1X8__AVX2_BROADCAST,n_gt_8_strided_cn)9763 TEST(F16_GEMM_MINMAX_1X8__AVX2_BROADCAST, n_gt_8_strided_cn) { 9764 TEST_REQUIRES_X86_AVX2; 9765 for (uint32_t n = 9; n < 16; 
n++) { 9766 for (size_t k = 1; k <= 5; k += 2) { 9767 GemmMicrokernelTester() 9768 .mr(1) 9769 .nr(8) 9770 .kr(1) 9771 .sr(1) 9772 .m(1) 9773 .n(n) 9774 .k(k) 9775 .cn_stride(11) 9776 .Test(xnn_f16_gemm_minmax_ukernel_1x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 9777 } 9778 } 9779 } 9780 TEST(F16_GEMM_MINMAX_1X8__AVX2_BROADCAST,n_gt_8_strided_a)9781 TEST(F16_GEMM_MINMAX_1X8__AVX2_BROADCAST, n_gt_8_strided_a) { 9782 TEST_REQUIRES_X86_AVX2; 9783 for (uint32_t n = 9; n < 16; n++) { 9784 for (size_t k = 1; k <= 5; k += 2) { 9785 GemmMicrokernelTester() 9786 .mr(1) 9787 .nr(8) 9788 .kr(1) 9789 .sr(1) 9790 .m(1) 9791 .n(n) 9792 .k(k) 9793 .a_stride(7) 9794 .Test(xnn_f16_gemm_minmax_ukernel_1x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 9795 } 9796 } 9797 } 9798 TEST(F16_GEMM_MINMAX_1X8__AVX2_BROADCAST,n_gt_8_subtile)9799 TEST(F16_GEMM_MINMAX_1X8__AVX2_BROADCAST, n_gt_8_subtile) { 9800 TEST_REQUIRES_X86_AVX2; 9801 for (uint32_t n = 9; n < 16; n++) { 9802 for (size_t k = 1; k <= 5; k += 2) { 9803 for (uint32_t m = 1; m <= 1; m++) { 9804 GemmMicrokernelTester() 9805 .mr(1) 9806 .nr(8) 9807 .kr(1) 9808 .sr(1) 9809 .m(m) 9810 .n(n) 9811 .k(k) 9812 .iterations(1) 9813 .Test(xnn_f16_gemm_minmax_ukernel_1x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 9814 } 9815 } 9816 } 9817 } 9818 TEST(F16_GEMM_MINMAX_1X8__AVX2_BROADCAST,n_div_8)9819 TEST(F16_GEMM_MINMAX_1X8__AVX2_BROADCAST, n_div_8) { 9820 TEST_REQUIRES_X86_AVX2; 9821 for (uint32_t n = 16; n <= 24; n += 8) { 9822 for (size_t k = 1; k <= 5; k += 2) { 9823 GemmMicrokernelTester() 9824 .mr(1) 9825 .nr(8) 9826 .kr(1) 9827 .sr(1) 9828 .m(1) 9829 .n(n) 9830 .k(k) 9831 .Test(xnn_f16_gemm_minmax_ukernel_1x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 9832 } 9833 } 9834 } 9835 TEST(F16_GEMM_MINMAX_1X8__AVX2_BROADCAST,n_div_8_strided_cn)9836 TEST(F16_GEMM_MINMAX_1X8__AVX2_BROADCAST, n_div_8_strided_cn) { 9837 TEST_REQUIRES_X86_AVX2; 9838 for (uint32_t n = 16; n <= 24; n += 8) { 9839 for (size_t k = 1; k <= 
5; k += 2) { 9840 GemmMicrokernelTester() 9841 .mr(1) 9842 .nr(8) 9843 .kr(1) 9844 .sr(1) 9845 .m(1) 9846 .n(n) 9847 .k(k) 9848 .cn_stride(11) 9849 .Test(xnn_f16_gemm_minmax_ukernel_1x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 9850 } 9851 } 9852 } 9853 TEST(F16_GEMM_MINMAX_1X8__AVX2_BROADCAST,n_div_8_strided_a)9854 TEST(F16_GEMM_MINMAX_1X8__AVX2_BROADCAST, n_div_8_strided_a) { 9855 TEST_REQUIRES_X86_AVX2; 9856 for (uint32_t n = 16; n <= 24; n += 8) { 9857 for (size_t k = 1; k <= 5; k += 2) { 9858 GemmMicrokernelTester() 9859 .mr(1) 9860 .nr(8) 9861 .kr(1) 9862 .sr(1) 9863 .m(1) 9864 .n(n) 9865 .k(k) 9866 .a_stride(7) 9867 .Test(xnn_f16_gemm_minmax_ukernel_1x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 9868 } 9869 } 9870 } 9871 TEST(F16_GEMM_MINMAX_1X8__AVX2_BROADCAST,n_div_8_subtile)9872 TEST(F16_GEMM_MINMAX_1X8__AVX2_BROADCAST, n_div_8_subtile) { 9873 TEST_REQUIRES_X86_AVX2; 9874 for (uint32_t n = 16; n <= 24; n += 8) { 9875 for (size_t k = 1; k <= 5; k += 2) { 9876 for (uint32_t m = 1; m <= 1; m++) { 9877 GemmMicrokernelTester() 9878 .mr(1) 9879 .nr(8) 9880 .kr(1) 9881 .sr(1) 9882 .m(m) 9883 .n(n) 9884 .k(k) 9885 .iterations(1) 9886 .Test(xnn_f16_gemm_minmax_ukernel_1x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 9887 } 9888 } 9889 } 9890 } 9891 TEST(F16_GEMM_MINMAX_1X8__AVX2_BROADCAST,strided_cm_subtile)9892 TEST(F16_GEMM_MINMAX_1X8__AVX2_BROADCAST, strided_cm_subtile) { 9893 TEST_REQUIRES_X86_AVX2; 9894 for (size_t k = 1; k <= 5; k += 2) { 9895 for (uint32_t n = 1; n <= 8; n++) { 9896 for (uint32_t m = 1; m <= 1; m++) { 9897 GemmMicrokernelTester() 9898 .mr(1) 9899 .nr(8) 9900 .kr(1) 9901 .sr(1) 9902 .m(m) 9903 .n(n) 9904 .k(k) 9905 .cm_stride(11) 9906 .iterations(1) 9907 .Test(xnn_f16_gemm_minmax_ukernel_1x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 9908 } 9909 } 9910 } 9911 } 9912 TEST(F16_GEMM_MINMAX_1X8__AVX2_BROADCAST,qmin)9913 TEST(F16_GEMM_MINMAX_1X8__AVX2_BROADCAST, qmin) { 9914 TEST_REQUIRES_X86_AVX2; 9915 
GemmMicrokernelTester() 9916 .mr(1) 9917 .nr(8) 9918 .kr(1) 9919 .sr(1) 9920 .m(1) 9921 .n(8) 9922 .k(1) 9923 .qmin(128) 9924 .Test(xnn_f16_gemm_minmax_ukernel_1x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 9925 } 9926 TEST(F16_GEMM_MINMAX_1X8__AVX2_BROADCAST,qmax)9927 TEST(F16_GEMM_MINMAX_1X8__AVX2_BROADCAST, qmax) { 9928 TEST_REQUIRES_X86_AVX2; 9929 GemmMicrokernelTester() 9930 .mr(1) 9931 .nr(8) 9932 .kr(1) 9933 .sr(1) 9934 .m(1) 9935 .n(8) 9936 .k(1) 9937 .qmax(128) 9938 .Test(xnn_f16_gemm_minmax_ukernel_1x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 9939 } 9940 TEST(F16_GEMM_MINMAX_1X8__AVX2_BROADCAST,strided_cm)9941 TEST(F16_GEMM_MINMAX_1X8__AVX2_BROADCAST, strided_cm) { 9942 TEST_REQUIRES_X86_AVX2; 9943 GemmMicrokernelTester() 9944 .mr(1) 9945 .nr(8) 9946 .kr(1) 9947 .sr(1) 9948 .m(1) 9949 .n(8) 9950 .k(1) 9951 .cm_stride(11) 9952 .Test(xnn_f16_gemm_minmax_ukernel_1x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 9953 } 9954 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 9955 9956 9957 #if XNN_ARCH_X86 || XNN_ARCH_X86_64 TEST(F16_GEMM_MINMAX_1X16__AVX2_BROADCAST,k_eq_1)9958 TEST(F16_GEMM_MINMAX_1X16__AVX2_BROADCAST, k_eq_1) { 9959 TEST_REQUIRES_X86_AVX2; 9960 GemmMicrokernelTester() 9961 .mr(1) 9962 .nr(16) 9963 .kr(1) 9964 .sr(1) 9965 .m(1) 9966 .n(16) 9967 .k(1) 9968 .Test(xnn_f16_gemm_minmax_ukernel_1x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 9969 } 9970 TEST(F16_GEMM_MINMAX_1X16__AVX2_BROADCAST,strided_cn)9971 TEST(F16_GEMM_MINMAX_1X16__AVX2_BROADCAST, strided_cn) { 9972 TEST_REQUIRES_X86_AVX2; 9973 GemmMicrokernelTester() 9974 .mr(1) 9975 .nr(16) 9976 .kr(1) 9977 .sr(1) 9978 .m(1) 9979 .n(16) 9980 .k(1) 9981 .cn_stride(19) 9982 .Test(xnn_f16_gemm_minmax_ukernel_1x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 9983 } 9984 TEST(F16_GEMM_MINMAX_1X16__AVX2_BROADCAST,k_eq_1_strided_a)9985 TEST(F16_GEMM_MINMAX_1X16__AVX2_BROADCAST, k_eq_1_strided_a) { 9986 TEST_REQUIRES_X86_AVX2; 9987 GemmMicrokernelTester() 9988 .mr(1) 9989 
.nr(16) 9990 .kr(1) 9991 .sr(1) 9992 .m(1) 9993 .n(16) 9994 .k(1) 9995 .a_stride(3) 9996 .Test(xnn_f16_gemm_minmax_ukernel_1x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 9997 } 9998 TEST(F16_GEMM_MINMAX_1X16__AVX2_BROADCAST,k_eq_1_subtile)9999 TEST(F16_GEMM_MINMAX_1X16__AVX2_BROADCAST, k_eq_1_subtile) { 10000 TEST_REQUIRES_X86_AVX2; 10001 for (uint32_t n = 1; n <= 16; n++) { 10002 for (uint32_t m = 1; m <= 1; m++) { 10003 GemmMicrokernelTester() 10004 .mr(1) 10005 .nr(16) 10006 .kr(1) 10007 .sr(1) 10008 .m(m) 10009 .n(n) 10010 .k(1) 10011 .iterations(1) 10012 .Test(xnn_f16_gemm_minmax_ukernel_1x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 10013 } 10014 } 10015 } 10016 TEST(F16_GEMM_MINMAX_1X16__AVX2_BROADCAST,k_eq_1_subtile_m)10017 TEST(F16_GEMM_MINMAX_1X16__AVX2_BROADCAST, k_eq_1_subtile_m) { 10018 TEST_REQUIRES_X86_AVX2; 10019 for (uint32_t m = 1; m <= 1; m++) { 10020 GemmMicrokernelTester() 10021 .mr(1) 10022 .nr(16) 10023 .kr(1) 10024 .sr(1) 10025 .m(m) 10026 .n(16) 10027 .k(1) 10028 .iterations(1) 10029 .Test(xnn_f16_gemm_minmax_ukernel_1x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 10030 } 10031 } 10032 TEST(F16_GEMM_MINMAX_1X16__AVX2_BROADCAST,k_eq_1_subtile_n)10033 TEST(F16_GEMM_MINMAX_1X16__AVX2_BROADCAST, k_eq_1_subtile_n) { 10034 TEST_REQUIRES_X86_AVX2; 10035 for (uint32_t n = 1; n <= 16; n++) { 10036 GemmMicrokernelTester() 10037 .mr(1) 10038 .nr(16) 10039 .kr(1) 10040 .sr(1) 10041 .m(1) 10042 .n(n) 10043 .k(1) 10044 .iterations(1) 10045 .Test(xnn_f16_gemm_minmax_ukernel_1x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 10046 } 10047 } 10048 TEST(F16_GEMM_MINMAX_1X16__AVX2_BROADCAST,k_gt_1)10049 TEST(F16_GEMM_MINMAX_1X16__AVX2_BROADCAST, k_gt_1) { 10050 TEST_REQUIRES_X86_AVX2; 10051 for (size_t k = 2; k < 10; k++) { 10052 GemmMicrokernelTester() 10053 .mr(1) 10054 .nr(16) 10055 .kr(1) 10056 .sr(1) 10057 .m(1) 10058 .n(16) 10059 .k(k) 10060 .Test(xnn_f16_gemm_minmax_ukernel_1x16__avx2_broadcast, 
xnn_init_f16_minmax_avx_params); 10061 } 10062 } 10063 TEST(F16_GEMM_MINMAX_1X16__AVX2_BROADCAST,k_gt_1_strided_a)10064 TEST(F16_GEMM_MINMAX_1X16__AVX2_BROADCAST, k_gt_1_strided_a) { 10065 TEST_REQUIRES_X86_AVX2; 10066 for (size_t k = 2; k < 10; k++) { 10067 GemmMicrokernelTester() 10068 .mr(1) 10069 .nr(16) 10070 .kr(1) 10071 .sr(1) 10072 .m(1) 10073 .n(16) 10074 .k(k) 10075 .a_stride(11) 10076 .Test(xnn_f16_gemm_minmax_ukernel_1x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 10077 } 10078 } 10079 TEST(F16_GEMM_MINMAX_1X16__AVX2_BROADCAST,k_gt_1_subtile)10080 TEST(F16_GEMM_MINMAX_1X16__AVX2_BROADCAST, k_gt_1_subtile) { 10081 TEST_REQUIRES_X86_AVX2; 10082 for (size_t k = 2; k < 10; k++) { 10083 for (uint32_t n = 1; n <= 16; n++) { 10084 for (uint32_t m = 1; m <= 1; m++) { 10085 GemmMicrokernelTester() 10086 .mr(1) 10087 .nr(16) 10088 .kr(1) 10089 .sr(1) 10090 .m(m) 10091 .n(n) 10092 .k(k) 10093 .iterations(1) 10094 .Test(xnn_f16_gemm_minmax_ukernel_1x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 10095 } 10096 } 10097 } 10098 } 10099 TEST(F16_GEMM_MINMAX_1X16__AVX2_BROADCAST,n_gt_16)10100 TEST(F16_GEMM_MINMAX_1X16__AVX2_BROADCAST, n_gt_16) { 10101 TEST_REQUIRES_X86_AVX2; 10102 for (uint32_t n = 17; n < 32; n++) { 10103 for (size_t k = 1; k <= 5; k += 2) { 10104 GemmMicrokernelTester() 10105 .mr(1) 10106 .nr(16) 10107 .kr(1) 10108 .sr(1) 10109 .m(1) 10110 .n(n) 10111 .k(k) 10112 .Test(xnn_f16_gemm_minmax_ukernel_1x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 10113 } 10114 } 10115 } 10116 TEST(F16_GEMM_MINMAX_1X16__AVX2_BROADCAST,n_gt_16_strided_cn)10117 TEST(F16_GEMM_MINMAX_1X16__AVX2_BROADCAST, n_gt_16_strided_cn) { 10118 TEST_REQUIRES_X86_AVX2; 10119 for (uint32_t n = 17; n < 32; n++) { 10120 for (size_t k = 1; k <= 5; k += 2) { 10121 GemmMicrokernelTester() 10122 .mr(1) 10123 .nr(16) 10124 .kr(1) 10125 .sr(1) 10126 .m(1) 10127 .n(n) 10128 .k(k) 10129 .cn_stride(19) 10130 .Test(xnn_f16_gemm_minmax_ukernel_1x16__avx2_broadcast, 
xnn_init_f16_minmax_avx_params); 10131 } 10132 } 10133 } 10134 TEST(F16_GEMM_MINMAX_1X16__AVX2_BROADCAST,n_gt_16_strided_a)10135 TEST(F16_GEMM_MINMAX_1X16__AVX2_BROADCAST, n_gt_16_strided_a) { 10136 TEST_REQUIRES_X86_AVX2; 10137 for (uint32_t n = 17; n < 32; n++) { 10138 for (size_t k = 1; k <= 5; k += 2) { 10139 GemmMicrokernelTester() 10140 .mr(1) 10141 .nr(16) 10142 .kr(1) 10143 .sr(1) 10144 .m(1) 10145 .n(n) 10146 .k(k) 10147 .a_stride(7) 10148 .Test(xnn_f16_gemm_minmax_ukernel_1x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 10149 } 10150 } 10151 } 10152 TEST(F16_GEMM_MINMAX_1X16__AVX2_BROADCAST,n_gt_16_subtile)10153 TEST(F16_GEMM_MINMAX_1X16__AVX2_BROADCAST, n_gt_16_subtile) { 10154 TEST_REQUIRES_X86_AVX2; 10155 for (uint32_t n = 17; n < 32; n++) { 10156 for (size_t k = 1; k <= 5; k += 2) { 10157 for (uint32_t m = 1; m <= 1; m++) { 10158 GemmMicrokernelTester() 10159 .mr(1) 10160 .nr(16) 10161 .kr(1) 10162 .sr(1) 10163 .m(m) 10164 .n(n) 10165 .k(k) 10166 .iterations(1) 10167 .Test(xnn_f16_gemm_minmax_ukernel_1x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 10168 } 10169 } 10170 } 10171 } 10172 TEST(F16_GEMM_MINMAX_1X16__AVX2_BROADCAST,n_div_16)10173 TEST(F16_GEMM_MINMAX_1X16__AVX2_BROADCAST, n_div_16) { 10174 TEST_REQUIRES_X86_AVX2; 10175 for (uint32_t n = 32; n <= 48; n += 16) { 10176 for (size_t k = 1; k <= 5; k += 2) { 10177 GemmMicrokernelTester() 10178 .mr(1) 10179 .nr(16) 10180 .kr(1) 10181 .sr(1) 10182 .m(1) 10183 .n(n) 10184 .k(k) 10185 .Test(xnn_f16_gemm_minmax_ukernel_1x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 10186 } 10187 } 10188 } 10189 TEST(F16_GEMM_MINMAX_1X16__AVX2_BROADCAST,n_div_16_strided_cn)10190 TEST(F16_GEMM_MINMAX_1X16__AVX2_BROADCAST, n_div_16_strided_cn) { 10191 TEST_REQUIRES_X86_AVX2; 10192 for (uint32_t n = 32; n <= 48; n += 16) { 10193 for (size_t k = 1; k <= 5; k += 2) { 10194 GemmMicrokernelTester() 10195 .mr(1) 10196 .nr(16) 10197 .kr(1) 10198 .sr(1) 10199 .m(1) 10200 .n(n) 10201 .k(k) 10202 
.cn_stride(19) 10203 .Test(xnn_f16_gemm_minmax_ukernel_1x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 10204 } 10205 } 10206 } 10207 TEST(F16_GEMM_MINMAX_1X16__AVX2_BROADCAST,n_div_16_strided_a)10208 TEST(F16_GEMM_MINMAX_1X16__AVX2_BROADCAST, n_div_16_strided_a) { 10209 TEST_REQUIRES_X86_AVX2; 10210 for (uint32_t n = 32; n <= 48; n += 16) { 10211 for (size_t k = 1; k <= 5; k += 2) { 10212 GemmMicrokernelTester() 10213 .mr(1) 10214 .nr(16) 10215 .kr(1) 10216 .sr(1) 10217 .m(1) 10218 .n(n) 10219 .k(k) 10220 .a_stride(7) 10221 .Test(xnn_f16_gemm_minmax_ukernel_1x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 10222 } 10223 } 10224 } 10225 TEST(F16_GEMM_MINMAX_1X16__AVX2_BROADCAST,n_div_16_subtile)10226 TEST(F16_GEMM_MINMAX_1X16__AVX2_BROADCAST, n_div_16_subtile) { 10227 TEST_REQUIRES_X86_AVX2; 10228 for (uint32_t n = 32; n <= 48; n += 16) { 10229 for (size_t k = 1; k <= 5; k += 2) { 10230 for (uint32_t m = 1; m <= 1; m++) { 10231 GemmMicrokernelTester() 10232 .mr(1) 10233 .nr(16) 10234 .kr(1) 10235 .sr(1) 10236 .m(m) 10237 .n(n) 10238 .k(k) 10239 .iterations(1) 10240 .Test(xnn_f16_gemm_minmax_ukernel_1x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 10241 } 10242 } 10243 } 10244 } 10245 TEST(F16_GEMM_MINMAX_1X16__AVX2_BROADCAST,strided_cm_subtile)10246 TEST(F16_GEMM_MINMAX_1X16__AVX2_BROADCAST, strided_cm_subtile) { 10247 TEST_REQUIRES_X86_AVX2; 10248 for (size_t k = 1; k <= 5; k += 2) { 10249 for (uint32_t n = 1; n <= 16; n++) { 10250 for (uint32_t m = 1; m <= 1; m++) { 10251 GemmMicrokernelTester() 10252 .mr(1) 10253 .nr(16) 10254 .kr(1) 10255 .sr(1) 10256 .m(m) 10257 .n(n) 10258 .k(k) 10259 .cm_stride(19) 10260 .iterations(1) 10261 .Test(xnn_f16_gemm_minmax_ukernel_1x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 10262 } 10263 } 10264 } 10265 } 10266 TEST(F16_GEMM_MINMAX_1X16__AVX2_BROADCAST,qmin)10267 TEST(F16_GEMM_MINMAX_1X16__AVX2_BROADCAST, qmin) { 10268 TEST_REQUIRES_X86_AVX2; 10269 GemmMicrokernelTester() 10270 .mr(1) 10271 .nr(16) 
10272 .kr(1) 10273 .sr(1) 10274 .m(1) 10275 .n(16) 10276 .k(1) 10277 .qmin(128) 10278 .Test(xnn_f16_gemm_minmax_ukernel_1x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 10279 } 10280 TEST(F16_GEMM_MINMAX_1X16__AVX2_BROADCAST,qmax)10281 TEST(F16_GEMM_MINMAX_1X16__AVX2_BROADCAST, qmax) { 10282 TEST_REQUIRES_X86_AVX2; 10283 GemmMicrokernelTester() 10284 .mr(1) 10285 .nr(16) 10286 .kr(1) 10287 .sr(1) 10288 .m(1) 10289 .n(16) 10290 .k(1) 10291 .qmax(128) 10292 .Test(xnn_f16_gemm_minmax_ukernel_1x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 10293 } 10294 TEST(F16_GEMM_MINMAX_1X16__AVX2_BROADCAST,strided_cm)10295 TEST(F16_GEMM_MINMAX_1X16__AVX2_BROADCAST, strided_cm) { 10296 TEST_REQUIRES_X86_AVX2; 10297 GemmMicrokernelTester() 10298 .mr(1) 10299 .nr(16) 10300 .kr(1) 10301 .sr(1) 10302 .m(1) 10303 .n(16) 10304 .k(1) 10305 .cm_stride(19) 10306 .Test(xnn_f16_gemm_minmax_ukernel_1x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 10307 } 10308 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 10309 10310 10311 #if XNN_ARCH_X86 || XNN_ARCH_X86_64 TEST(F16_GEMM_MINMAX_3X16__AVX2_BROADCAST,k_eq_1)10312 TEST(F16_GEMM_MINMAX_3X16__AVX2_BROADCAST, k_eq_1) { 10313 TEST_REQUIRES_X86_AVX2; 10314 GemmMicrokernelTester() 10315 .mr(3) 10316 .nr(16) 10317 .kr(1) 10318 .sr(1) 10319 .m(3) 10320 .n(16) 10321 .k(1) 10322 .Test(xnn_f16_gemm_minmax_ukernel_3x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 10323 } 10324 TEST(F16_GEMM_MINMAX_3X16__AVX2_BROADCAST,strided_cn)10325 TEST(F16_GEMM_MINMAX_3X16__AVX2_BROADCAST, strided_cn) { 10326 TEST_REQUIRES_X86_AVX2; 10327 GemmMicrokernelTester() 10328 .mr(3) 10329 .nr(16) 10330 .kr(1) 10331 .sr(1) 10332 .m(3) 10333 .n(16) 10334 .k(1) 10335 .cn_stride(19) 10336 .Test(xnn_f16_gemm_minmax_ukernel_3x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 10337 } 10338 TEST(F16_GEMM_MINMAX_3X16__AVX2_BROADCAST,k_eq_1_strided_a)10339 TEST(F16_GEMM_MINMAX_3X16__AVX2_BROADCAST, k_eq_1_strided_a) { 10340 TEST_REQUIRES_X86_AVX2; 10341 
GemmMicrokernelTester() 10342 .mr(3) 10343 .nr(16) 10344 .kr(1) 10345 .sr(1) 10346 .m(3) 10347 .n(16) 10348 .k(1) 10349 .a_stride(3) 10350 .Test(xnn_f16_gemm_minmax_ukernel_3x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 10351 } 10352 TEST(F16_GEMM_MINMAX_3X16__AVX2_BROADCAST,k_eq_1_subtile)10353 TEST(F16_GEMM_MINMAX_3X16__AVX2_BROADCAST, k_eq_1_subtile) { 10354 TEST_REQUIRES_X86_AVX2; 10355 for (uint32_t n = 1; n <= 16; n++) { 10356 for (uint32_t m = 1; m <= 3; m++) { 10357 GemmMicrokernelTester() 10358 .mr(3) 10359 .nr(16) 10360 .kr(1) 10361 .sr(1) 10362 .m(m) 10363 .n(n) 10364 .k(1) 10365 .iterations(1) 10366 .Test(xnn_f16_gemm_minmax_ukernel_3x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 10367 } 10368 } 10369 } 10370 TEST(F16_GEMM_MINMAX_3X16__AVX2_BROADCAST,k_eq_1_subtile_m)10371 TEST(F16_GEMM_MINMAX_3X16__AVX2_BROADCAST, k_eq_1_subtile_m) { 10372 TEST_REQUIRES_X86_AVX2; 10373 for (uint32_t m = 1; m <= 3; m++) { 10374 GemmMicrokernelTester() 10375 .mr(3) 10376 .nr(16) 10377 .kr(1) 10378 .sr(1) 10379 .m(m) 10380 .n(16) 10381 .k(1) 10382 .iterations(1) 10383 .Test(xnn_f16_gemm_minmax_ukernel_3x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 10384 } 10385 } 10386 TEST(F16_GEMM_MINMAX_3X16__AVX2_BROADCAST,k_eq_1_subtile_n)10387 TEST(F16_GEMM_MINMAX_3X16__AVX2_BROADCAST, k_eq_1_subtile_n) { 10388 TEST_REQUIRES_X86_AVX2; 10389 for (uint32_t n = 1; n <= 16; n++) { 10390 GemmMicrokernelTester() 10391 .mr(3) 10392 .nr(16) 10393 .kr(1) 10394 .sr(1) 10395 .m(3) 10396 .n(n) 10397 .k(1) 10398 .iterations(1) 10399 .Test(xnn_f16_gemm_minmax_ukernel_3x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 10400 } 10401 } 10402 TEST(F16_GEMM_MINMAX_3X16__AVX2_BROADCAST,k_gt_1)10403 TEST(F16_GEMM_MINMAX_3X16__AVX2_BROADCAST, k_gt_1) { 10404 TEST_REQUIRES_X86_AVX2; 10405 for (size_t k = 2; k < 10; k++) { 10406 GemmMicrokernelTester() 10407 .mr(3) 10408 .nr(16) 10409 .kr(1) 10410 .sr(1) 10411 .m(3) 10412 .n(16) 10413 .k(k) 10414 
.Test(xnn_f16_gemm_minmax_ukernel_3x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 10415 } 10416 } 10417 TEST(F16_GEMM_MINMAX_3X16__AVX2_BROADCAST,k_gt_1_strided_a)10418 TEST(F16_GEMM_MINMAX_3X16__AVX2_BROADCAST, k_gt_1_strided_a) { 10419 TEST_REQUIRES_X86_AVX2; 10420 for (size_t k = 2; k < 10; k++) { 10421 GemmMicrokernelTester() 10422 .mr(3) 10423 .nr(16) 10424 .kr(1) 10425 .sr(1) 10426 .m(3) 10427 .n(16) 10428 .k(k) 10429 .a_stride(11) 10430 .Test(xnn_f16_gemm_minmax_ukernel_3x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 10431 } 10432 } 10433 TEST(F16_GEMM_MINMAX_3X16__AVX2_BROADCAST,k_gt_1_subtile)10434 TEST(F16_GEMM_MINMAX_3X16__AVX2_BROADCAST, k_gt_1_subtile) { 10435 TEST_REQUIRES_X86_AVX2; 10436 for (size_t k = 2; k < 10; k++) { 10437 for (uint32_t n = 1; n <= 16; n++) { 10438 for (uint32_t m = 1; m <= 3; m++) { 10439 GemmMicrokernelTester() 10440 .mr(3) 10441 .nr(16) 10442 .kr(1) 10443 .sr(1) 10444 .m(m) 10445 .n(n) 10446 .k(k) 10447 .iterations(1) 10448 .Test(xnn_f16_gemm_minmax_ukernel_3x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 10449 } 10450 } 10451 } 10452 } 10453 TEST(F16_GEMM_MINMAX_3X16__AVX2_BROADCAST,n_gt_16)10454 TEST(F16_GEMM_MINMAX_3X16__AVX2_BROADCAST, n_gt_16) { 10455 TEST_REQUIRES_X86_AVX2; 10456 for (uint32_t n = 17; n < 32; n++) { 10457 for (size_t k = 1; k <= 5; k += 2) { 10458 GemmMicrokernelTester() 10459 .mr(3) 10460 .nr(16) 10461 .kr(1) 10462 .sr(1) 10463 .m(3) 10464 .n(n) 10465 .k(k) 10466 .Test(xnn_f16_gemm_minmax_ukernel_3x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 10467 } 10468 } 10469 } 10470 TEST(F16_GEMM_MINMAX_3X16__AVX2_BROADCAST,n_gt_16_strided_cn)10471 TEST(F16_GEMM_MINMAX_3X16__AVX2_BROADCAST, n_gt_16_strided_cn) { 10472 TEST_REQUIRES_X86_AVX2; 10473 for (uint32_t n = 17; n < 32; n++) { 10474 for (size_t k = 1; k <= 5; k += 2) { 10475 GemmMicrokernelTester() 10476 .mr(3) 10477 .nr(16) 10478 .kr(1) 10479 .sr(1) 10480 .m(3) 10481 .n(n) 10482 .k(k) 10483 .cn_stride(19) 10484 
.Test(xnn_f16_gemm_minmax_ukernel_3x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 10485 } 10486 } 10487 } 10488 TEST(F16_GEMM_MINMAX_3X16__AVX2_BROADCAST,n_gt_16_strided_a)10489 TEST(F16_GEMM_MINMAX_3X16__AVX2_BROADCAST, n_gt_16_strided_a) { 10490 TEST_REQUIRES_X86_AVX2; 10491 for (uint32_t n = 17; n < 32; n++) { 10492 for (size_t k = 1; k <= 5; k += 2) { 10493 GemmMicrokernelTester() 10494 .mr(3) 10495 .nr(16) 10496 .kr(1) 10497 .sr(1) 10498 .m(3) 10499 .n(n) 10500 .k(k) 10501 .a_stride(7) 10502 .Test(xnn_f16_gemm_minmax_ukernel_3x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 10503 } 10504 } 10505 } 10506 TEST(F16_GEMM_MINMAX_3X16__AVX2_BROADCAST,n_gt_16_subtile)10507 TEST(F16_GEMM_MINMAX_3X16__AVX2_BROADCAST, n_gt_16_subtile) { 10508 TEST_REQUIRES_X86_AVX2; 10509 for (uint32_t n = 17; n < 32; n++) { 10510 for (size_t k = 1; k <= 5; k += 2) { 10511 for (uint32_t m = 1; m <= 3; m++) { 10512 GemmMicrokernelTester() 10513 .mr(3) 10514 .nr(16) 10515 .kr(1) 10516 .sr(1) 10517 .m(m) 10518 .n(n) 10519 .k(k) 10520 .iterations(1) 10521 .Test(xnn_f16_gemm_minmax_ukernel_3x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 10522 } 10523 } 10524 } 10525 } 10526 TEST(F16_GEMM_MINMAX_3X16__AVX2_BROADCAST,n_div_16)10527 TEST(F16_GEMM_MINMAX_3X16__AVX2_BROADCAST, n_div_16) { 10528 TEST_REQUIRES_X86_AVX2; 10529 for (uint32_t n = 32; n <= 48; n += 16) { 10530 for (size_t k = 1; k <= 5; k += 2) { 10531 GemmMicrokernelTester() 10532 .mr(3) 10533 .nr(16) 10534 .kr(1) 10535 .sr(1) 10536 .m(3) 10537 .n(n) 10538 .k(k) 10539 .Test(xnn_f16_gemm_minmax_ukernel_3x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 10540 } 10541 } 10542 } 10543 TEST(F16_GEMM_MINMAX_3X16__AVX2_BROADCAST,n_div_16_strided_cn)10544 TEST(F16_GEMM_MINMAX_3X16__AVX2_BROADCAST, n_div_16_strided_cn) { 10545 TEST_REQUIRES_X86_AVX2; 10546 for (uint32_t n = 32; n <= 48; n += 16) { 10547 for (size_t k = 1; k <= 5; k += 2) { 10548 GemmMicrokernelTester() 10549 .mr(3) 10550 .nr(16) 10551 .kr(1) 10552 .sr(1) 
10553 .m(3) 10554 .n(n) 10555 .k(k) 10556 .cn_stride(19) 10557 .Test(xnn_f16_gemm_minmax_ukernel_3x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 10558 } 10559 } 10560 } 10561 TEST(F16_GEMM_MINMAX_3X16__AVX2_BROADCAST,n_div_16_strided_a)10562 TEST(F16_GEMM_MINMAX_3X16__AVX2_BROADCAST, n_div_16_strided_a) { 10563 TEST_REQUIRES_X86_AVX2; 10564 for (uint32_t n = 32; n <= 48; n += 16) { 10565 for (size_t k = 1; k <= 5; k += 2) { 10566 GemmMicrokernelTester() 10567 .mr(3) 10568 .nr(16) 10569 .kr(1) 10570 .sr(1) 10571 .m(3) 10572 .n(n) 10573 .k(k) 10574 .a_stride(7) 10575 .Test(xnn_f16_gemm_minmax_ukernel_3x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 10576 } 10577 } 10578 } 10579 TEST(F16_GEMM_MINMAX_3X16__AVX2_BROADCAST,n_div_16_subtile)10580 TEST(F16_GEMM_MINMAX_3X16__AVX2_BROADCAST, n_div_16_subtile) { 10581 TEST_REQUIRES_X86_AVX2; 10582 for (uint32_t n = 32; n <= 48; n += 16) { 10583 for (size_t k = 1; k <= 5; k += 2) { 10584 for (uint32_t m = 1; m <= 3; m++) { 10585 GemmMicrokernelTester() 10586 .mr(3) 10587 .nr(16) 10588 .kr(1) 10589 .sr(1) 10590 .m(m) 10591 .n(n) 10592 .k(k) 10593 .iterations(1) 10594 .Test(xnn_f16_gemm_minmax_ukernel_3x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 10595 } 10596 } 10597 } 10598 } 10599 TEST(F16_GEMM_MINMAX_3X16__AVX2_BROADCAST,strided_cm_subtile)10600 TEST(F16_GEMM_MINMAX_3X16__AVX2_BROADCAST, strided_cm_subtile) { 10601 TEST_REQUIRES_X86_AVX2; 10602 for (size_t k = 1; k <= 5; k += 2) { 10603 for (uint32_t n = 1; n <= 16; n++) { 10604 for (uint32_t m = 1; m <= 3; m++) { 10605 GemmMicrokernelTester() 10606 .mr(3) 10607 .nr(16) 10608 .kr(1) 10609 .sr(1) 10610 .m(m) 10611 .n(n) 10612 .k(k) 10613 .cm_stride(19) 10614 .iterations(1) 10615 .Test(xnn_f16_gemm_minmax_ukernel_3x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 10616 } 10617 } 10618 } 10619 } 10620 TEST(F16_GEMM_MINMAX_3X16__AVX2_BROADCAST,qmin)10621 TEST(F16_GEMM_MINMAX_3X16__AVX2_BROADCAST, qmin) { 10622 TEST_REQUIRES_X86_AVX2; 10623 
GemmMicrokernelTester() 10624 .mr(3) 10625 .nr(16) 10626 .kr(1) 10627 .sr(1) 10628 .m(3) 10629 .n(16) 10630 .k(1) 10631 .qmin(128) 10632 .Test(xnn_f16_gemm_minmax_ukernel_3x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 10633 } 10634 TEST(F16_GEMM_MINMAX_3X16__AVX2_BROADCAST,qmax)10635 TEST(F16_GEMM_MINMAX_3X16__AVX2_BROADCAST, qmax) { 10636 TEST_REQUIRES_X86_AVX2; 10637 GemmMicrokernelTester() 10638 .mr(3) 10639 .nr(16) 10640 .kr(1) 10641 .sr(1) 10642 .m(3) 10643 .n(16) 10644 .k(1) 10645 .qmax(128) 10646 .Test(xnn_f16_gemm_minmax_ukernel_3x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 10647 } 10648 TEST(F16_GEMM_MINMAX_3X16__AVX2_BROADCAST,strided_cm)10649 TEST(F16_GEMM_MINMAX_3X16__AVX2_BROADCAST, strided_cm) { 10650 TEST_REQUIRES_X86_AVX2; 10651 GemmMicrokernelTester() 10652 .mr(3) 10653 .nr(16) 10654 .kr(1) 10655 .sr(1) 10656 .m(3) 10657 .n(16) 10658 .k(1) 10659 .cm_stride(19) 10660 .Test(xnn_f16_gemm_minmax_ukernel_3x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 10661 } 10662 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 10663 10664 10665 #if XNN_ARCH_X86 || XNN_ARCH_X86_64 TEST(F16_GEMM_MINMAX_4X8__AVX2_BROADCAST,k_eq_1)10666 TEST(F16_GEMM_MINMAX_4X8__AVX2_BROADCAST, k_eq_1) { 10667 TEST_REQUIRES_X86_AVX2; 10668 GemmMicrokernelTester() 10669 .mr(4) 10670 .nr(8) 10671 .kr(1) 10672 .sr(1) 10673 .m(4) 10674 .n(8) 10675 .k(1) 10676 .Test(xnn_f16_gemm_minmax_ukernel_4x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 10677 } 10678 TEST(F16_GEMM_MINMAX_4X8__AVX2_BROADCAST,strided_cn)10679 TEST(F16_GEMM_MINMAX_4X8__AVX2_BROADCAST, strided_cn) { 10680 TEST_REQUIRES_X86_AVX2; 10681 GemmMicrokernelTester() 10682 .mr(4) 10683 .nr(8) 10684 .kr(1) 10685 .sr(1) 10686 .m(4) 10687 .n(8) 10688 .k(1) 10689 .cn_stride(11) 10690 .Test(xnn_f16_gemm_minmax_ukernel_4x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 10691 } 10692 TEST(F16_GEMM_MINMAX_4X8__AVX2_BROADCAST,k_eq_1_strided_a)10693 TEST(F16_GEMM_MINMAX_4X8__AVX2_BROADCAST, k_eq_1_strided_a) { 10694 
TEST_REQUIRES_X86_AVX2; 10695 GemmMicrokernelTester() 10696 .mr(4) 10697 .nr(8) 10698 .kr(1) 10699 .sr(1) 10700 .m(4) 10701 .n(8) 10702 .k(1) 10703 .a_stride(3) 10704 .Test(xnn_f16_gemm_minmax_ukernel_4x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 10705 } 10706 TEST(F16_GEMM_MINMAX_4X8__AVX2_BROADCAST,k_eq_1_subtile)10707 TEST(F16_GEMM_MINMAX_4X8__AVX2_BROADCAST, k_eq_1_subtile) { 10708 TEST_REQUIRES_X86_AVX2; 10709 for (uint32_t n = 1; n <= 8; n++) { 10710 for (uint32_t m = 1; m <= 4; m++) { 10711 GemmMicrokernelTester() 10712 .mr(4) 10713 .nr(8) 10714 .kr(1) 10715 .sr(1) 10716 .m(m) 10717 .n(n) 10718 .k(1) 10719 .iterations(1) 10720 .Test(xnn_f16_gemm_minmax_ukernel_4x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 10721 } 10722 } 10723 } 10724 TEST(F16_GEMM_MINMAX_4X8__AVX2_BROADCAST,k_eq_1_subtile_m)10725 TEST(F16_GEMM_MINMAX_4X8__AVX2_BROADCAST, k_eq_1_subtile_m) { 10726 TEST_REQUIRES_X86_AVX2; 10727 for (uint32_t m = 1; m <= 4; m++) { 10728 GemmMicrokernelTester() 10729 .mr(4) 10730 .nr(8) 10731 .kr(1) 10732 .sr(1) 10733 .m(m) 10734 .n(8) 10735 .k(1) 10736 .iterations(1) 10737 .Test(xnn_f16_gemm_minmax_ukernel_4x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 10738 } 10739 } 10740 TEST(F16_GEMM_MINMAX_4X8__AVX2_BROADCAST,k_eq_1_subtile_n)10741 TEST(F16_GEMM_MINMAX_4X8__AVX2_BROADCAST, k_eq_1_subtile_n) { 10742 TEST_REQUIRES_X86_AVX2; 10743 for (uint32_t n = 1; n <= 8; n++) { 10744 GemmMicrokernelTester() 10745 .mr(4) 10746 .nr(8) 10747 .kr(1) 10748 .sr(1) 10749 .m(4) 10750 .n(n) 10751 .k(1) 10752 .iterations(1) 10753 .Test(xnn_f16_gemm_minmax_ukernel_4x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 10754 } 10755 } 10756 TEST(F16_GEMM_MINMAX_4X8__AVX2_BROADCAST,k_gt_1)10757 TEST(F16_GEMM_MINMAX_4X8__AVX2_BROADCAST, k_gt_1) { 10758 TEST_REQUIRES_X86_AVX2; 10759 for (size_t k = 2; k < 10; k++) { 10760 GemmMicrokernelTester() 10761 .mr(4) 10762 .nr(8) 10763 .kr(1) 10764 .sr(1) 10765 .m(4) 10766 .n(8) 10767 .k(k) 10768 
.Test(xnn_f16_gemm_minmax_ukernel_4x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 10769 } 10770 } 10771 TEST(F16_GEMM_MINMAX_4X8__AVX2_BROADCAST,k_gt_1_strided_a)10772 TEST(F16_GEMM_MINMAX_4X8__AVX2_BROADCAST, k_gt_1_strided_a) { 10773 TEST_REQUIRES_X86_AVX2; 10774 for (size_t k = 2; k < 10; k++) { 10775 GemmMicrokernelTester() 10776 .mr(4) 10777 .nr(8) 10778 .kr(1) 10779 .sr(1) 10780 .m(4) 10781 .n(8) 10782 .k(k) 10783 .a_stride(11) 10784 .Test(xnn_f16_gemm_minmax_ukernel_4x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 10785 } 10786 } 10787 TEST(F16_GEMM_MINMAX_4X8__AVX2_BROADCAST,k_gt_1_subtile)10788 TEST(F16_GEMM_MINMAX_4X8__AVX2_BROADCAST, k_gt_1_subtile) { 10789 TEST_REQUIRES_X86_AVX2; 10790 for (size_t k = 2; k < 10; k++) { 10791 for (uint32_t n = 1; n <= 8; n++) { 10792 for (uint32_t m = 1; m <= 4; m++) { 10793 GemmMicrokernelTester() 10794 .mr(4) 10795 .nr(8) 10796 .kr(1) 10797 .sr(1) 10798 .m(m) 10799 .n(n) 10800 .k(k) 10801 .iterations(1) 10802 .Test(xnn_f16_gemm_minmax_ukernel_4x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 10803 } 10804 } 10805 } 10806 } 10807 TEST(F16_GEMM_MINMAX_4X8__AVX2_BROADCAST,n_gt_8)10808 TEST(F16_GEMM_MINMAX_4X8__AVX2_BROADCAST, n_gt_8) { 10809 TEST_REQUIRES_X86_AVX2; 10810 for (uint32_t n = 9; n < 16; n++) { 10811 for (size_t k = 1; k <= 5; k += 2) { 10812 GemmMicrokernelTester() 10813 .mr(4) 10814 .nr(8) 10815 .kr(1) 10816 .sr(1) 10817 .m(4) 10818 .n(n) 10819 .k(k) 10820 .Test(xnn_f16_gemm_minmax_ukernel_4x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 10821 } 10822 } 10823 } 10824 TEST(F16_GEMM_MINMAX_4X8__AVX2_BROADCAST,n_gt_8_strided_cn)10825 TEST(F16_GEMM_MINMAX_4X8__AVX2_BROADCAST, n_gt_8_strided_cn) { 10826 TEST_REQUIRES_X86_AVX2; 10827 for (uint32_t n = 9; n < 16; n++) { 10828 for (size_t k = 1; k <= 5; k += 2) { 10829 GemmMicrokernelTester() 10830 .mr(4) 10831 .nr(8) 10832 .kr(1) 10833 .sr(1) 10834 .m(4) 10835 .n(n) 10836 .k(k) 10837 .cn_stride(11) 10838 
.Test(xnn_f16_gemm_minmax_ukernel_4x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 10839 } 10840 } 10841 } 10842 TEST(F16_GEMM_MINMAX_4X8__AVX2_BROADCAST,n_gt_8_strided_a)10843 TEST(F16_GEMM_MINMAX_4X8__AVX2_BROADCAST, n_gt_8_strided_a) { 10844 TEST_REQUIRES_X86_AVX2; 10845 for (uint32_t n = 9; n < 16; n++) { 10846 for (size_t k = 1; k <= 5; k += 2) { 10847 GemmMicrokernelTester() 10848 .mr(4) 10849 .nr(8) 10850 .kr(1) 10851 .sr(1) 10852 .m(4) 10853 .n(n) 10854 .k(k) 10855 .a_stride(7) 10856 .Test(xnn_f16_gemm_minmax_ukernel_4x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 10857 } 10858 } 10859 } 10860 TEST(F16_GEMM_MINMAX_4X8__AVX2_BROADCAST,n_gt_8_subtile)10861 TEST(F16_GEMM_MINMAX_4X8__AVX2_BROADCAST, n_gt_8_subtile) { 10862 TEST_REQUIRES_X86_AVX2; 10863 for (uint32_t n = 9; n < 16; n++) { 10864 for (size_t k = 1; k <= 5; k += 2) { 10865 for (uint32_t m = 1; m <= 4; m++) { 10866 GemmMicrokernelTester() 10867 .mr(4) 10868 .nr(8) 10869 .kr(1) 10870 .sr(1) 10871 .m(m) 10872 .n(n) 10873 .k(k) 10874 .iterations(1) 10875 .Test(xnn_f16_gemm_minmax_ukernel_4x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 10876 } 10877 } 10878 } 10879 } 10880 TEST(F16_GEMM_MINMAX_4X8__AVX2_BROADCAST,n_div_8)10881 TEST(F16_GEMM_MINMAX_4X8__AVX2_BROADCAST, n_div_8) { 10882 TEST_REQUIRES_X86_AVX2; 10883 for (uint32_t n = 16; n <= 24; n += 8) { 10884 for (size_t k = 1; k <= 5; k += 2) { 10885 GemmMicrokernelTester() 10886 .mr(4) 10887 .nr(8) 10888 .kr(1) 10889 .sr(1) 10890 .m(4) 10891 .n(n) 10892 .k(k) 10893 .Test(xnn_f16_gemm_minmax_ukernel_4x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 10894 } 10895 } 10896 } 10897 TEST(F16_GEMM_MINMAX_4X8__AVX2_BROADCAST,n_div_8_strided_cn)10898 TEST(F16_GEMM_MINMAX_4X8__AVX2_BROADCAST, n_div_8_strided_cn) { 10899 TEST_REQUIRES_X86_AVX2; 10900 for (uint32_t n = 16; n <= 24; n += 8) { 10901 for (size_t k = 1; k <= 5; k += 2) { 10902 GemmMicrokernelTester() 10903 .mr(4) 10904 .nr(8) 10905 .kr(1) 10906 .sr(1) 10907 .m(4) 10908 .n(n) 
10909 .k(k) 10910 .cn_stride(11) 10911 .Test(xnn_f16_gemm_minmax_ukernel_4x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 10912 } 10913 } 10914 } 10915 TEST(F16_GEMM_MINMAX_4X8__AVX2_BROADCAST,n_div_8_strided_a)10916 TEST(F16_GEMM_MINMAX_4X8__AVX2_BROADCAST, n_div_8_strided_a) { 10917 TEST_REQUIRES_X86_AVX2; 10918 for (uint32_t n = 16; n <= 24; n += 8) { 10919 for (size_t k = 1; k <= 5; k += 2) { 10920 GemmMicrokernelTester() 10921 .mr(4) 10922 .nr(8) 10923 .kr(1) 10924 .sr(1) 10925 .m(4) 10926 .n(n) 10927 .k(k) 10928 .a_stride(7) 10929 .Test(xnn_f16_gemm_minmax_ukernel_4x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 10930 } 10931 } 10932 } 10933 TEST(F16_GEMM_MINMAX_4X8__AVX2_BROADCAST,n_div_8_subtile)10934 TEST(F16_GEMM_MINMAX_4X8__AVX2_BROADCAST, n_div_8_subtile) { 10935 TEST_REQUIRES_X86_AVX2; 10936 for (uint32_t n = 16; n <= 24; n += 8) { 10937 for (size_t k = 1; k <= 5; k += 2) { 10938 for (uint32_t m = 1; m <= 4; m++) { 10939 GemmMicrokernelTester() 10940 .mr(4) 10941 .nr(8) 10942 .kr(1) 10943 .sr(1) 10944 .m(m) 10945 .n(n) 10946 .k(k) 10947 .iterations(1) 10948 .Test(xnn_f16_gemm_minmax_ukernel_4x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 10949 } 10950 } 10951 } 10952 } 10953 TEST(F16_GEMM_MINMAX_4X8__AVX2_BROADCAST,strided_cm_subtile)10954 TEST(F16_GEMM_MINMAX_4X8__AVX2_BROADCAST, strided_cm_subtile) { 10955 TEST_REQUIRES_X86_AVX2; 10956 for (size_t k = 1; k <= 5; k += 2) { 10957 for (uint32_t n = 1; n <= 8; n++) { 10958 for (uint32_t m = 1; m <= 4; m++) { 10959 GemmMicrokernelTester() 10960 .mr(4) 10961 .nr(8) 10962 .kr(1) 10963 .sr(1) 10964 .m(m) 10965 .n(n) 10966 .k(k) 10967 .cm_stride(11) 10968 .iterations(1) 10969 .Test(xnn_f16_gemm_minmax_ukernel_4x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 10970 } 10971 } 10972 } 10973 } 10974 TEST(F16_GEMM_MINMAX_4X8__AVX2_BROADCAST,qmin)10975 TEST(F16_GEMM_MINMAX_4X8__AVX2_BROADCAST, qmin) { 10976 TEST_REQUIRES_X86_AVX2; 10977 GemmMicrokernelTester() 10978 .mr(4) 10979 .nr(8) 10980 
.kr(1) 10981 .sr(1) 10982 .m(4) 10983 .n(8) 10984 .k(1) 10985 .qmin(128) 10986 .Test(xnn_f16_gemm_minmax_ukernel_4x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 10987 } 10988 TEST(F16_GEMM_MINMAX_4X8__AVX2_BROADCAST,qmax)10989 TEST(F16_GEMM_MINMAX_4X8__AVX2_BROADCAST, qmax) { 10990 TEST_REQUIRES_X86_AVX2; 10991 GemmMicrokernelTester() 10992 .mr(4) 10993 .nr(8) 10994 .kr(1) 10995 .sr(1) 10996 .m(4) 10997 .n(8) 10998 .k(1) 10999 .qmax(128) 11000 .Test(xnn_f16_gemm_minmax_ukernel_4x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 11001 } 11002 TEST(F16_GEMM_MINMAX_4X8__AVX2_BROADCAST,strided_cm)11003 TEST(F16_GEMM_MINMAX_4X8__AVX2_BROADCAST, strided_cm) { 11004 TEST_REQUIRES_X86_AVX2; 11005 GemmMicrokernelTester() 11006 .mr(4) 11007 .nr(8) 11008 .kr(1) 11009 .sr(1) 11010 .m(4) 11011 .n(8) 11012 .k(1) 11013 .cm_stride(11) 11014 .Test(xnn_f16_gemm_minmax_ukernel_4x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 11015 } 11016 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 11017 11018 11019 #if XNN_ARCH_X86 || XNN_ARCH_X86_64 TEST(F16_GEMM_MINMAX_4X16__AVX2_BROADCAST,k_eq_1)11020 TEST(F16_GEMM_MINMAX_4X16__AVX2_BROADCAST, k_eq_1) { 11021 TEST_REQUIRES_X86_AVX2; 11022 GemmMicrokernelTester() 11023 .mr(4) 11024 .nr(16) 11025 .kr(1) 11026 .sr(1) 11027 .m(4) 11028 .n(16) 11029 .k(1) 11030 .Test(xnn_f16_gemm_minmax_ukernel_4x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 11031 } 11032 TEST(F16_GEMM_MINMAX_4X16__AVX2_BROADCAST,strided_cn)11033 TEST(F16_GEMM_MINMAX_4X16__AVX2_BROADCAST, strided_cn) { 11034 TEST_REQUIRES_X86_AVX2; 11035 GemmMicrokernelTester() 11036 .mr(4) 11037 .nr(16) 11038 .kr(1) 11039 .sr(1) 11040 .m(4) 11041 .n(16) 11042 .k(1) 11043 .cn_stride(19) 11044 .Test(xnn_f16_gemm_minmax_ukernel_4x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 11045 } 11046 TEST(F16_GEMM_MINMAX_4X16__AVX2_BROADCAST,k_eq_1_strided_a)11047 TEST(F16_GEMM_MINMAX_4X16__AVX2_BROADCAST, k_eq_1_strided_a) { 11048 TEST_REQUIRES_X86_AVX2; 11049 GemmMicrokernelTester() 11050 
.mr(4) 11051 .nr(16) 11052 .kr(1) 11053 .sr(1) 11054 .m(4) 11055 .n(16) 11056 .k(1) 11057 .a_stride(3) 11058 .Test(xnn_f16_gemm_minmax_ukernel_4x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 11059 } 11060 TEST(F16_GEMM_MINMAX_4X16__AVX2_BROADCAST,k_eq_1_subtile)11061 TEST(F16_GEMM_MINMAX_4X16__AVX2_BROADCAST, k_eq_1_subtile) { 11062 TEST_REQUIRES_X86_AVX2; 11063 for (uint32_t n = 1; n <= 16; n++) { 11064 for (uint32_t m = 1; m <= 4; m++) { 11065 GemmMicrokernelTester() 11066 .mr(4) 11067 .nr(16) 11068 .kr(1) 11069 .sr(1) 11070 .m(m) 11071 .n(n) 11072 .k(1) 11073 .iterations(1) 11074 .Test(xnn_f16_gemm_minmax_ukernel_4x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 11075 } 11076 } 11077 } 11078 TEST(F16_GEMM_MINMAX_4X16__AVX2_BROADCAST,k_eq_1_subtile_m)11079 TEST(F16_GEMM_MINMAX_4X16__AVX2_BROADCAST, k_eq_1_subtile_m) { 11080 TEST_REQUIRES_X86_AVX2; 11081 for (uint32_t m = 1; m <= 4; m++) { 11082 GemmMicrokernelTester() 11083 .mr(4) 11084 .nr(16) 11085 .kr(1) 11086 .sr(1) 11087 .m(m) 11088 .n(16) 11089 .k(1) 11090 .iterations(1) 11091 .Test(xnn_f16_gemm_minmax_ukernel_4x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 11092 } 11093 } 11094 TEST(F16_GEMM_MINMAX_4X16__AVX2_BROADCAST,k_eq_1_subtile_n)11095 TEST(F16_GEMM_MINMAX_4X16__AVX2_BROADCAST, k_eq_1_subtile_n) { 11096 TEST_REQUIRES_X86_AVX2; 11097 for (uint32_t n = 1; n <= 16; n++) { 11098 GemmMicrokernelTester() 11099 .mr(4) 11100 .nr(16) 11101 .kr(1) 11102 .sr(1) 11103 .m(4) 11104 .n(n) 11105 .k(1) 11106 .iterations(1) 11107 .Test(xnn_f16_gemm_minmax_ukernel_4x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 11108 } 11109 } 11110 TEST(F16_GEMM_MINMAX_4X16__AVX2_BROADCAST,k_gt_1)11111 TEST(F16_GEMM_MINMAX_4X16__AVX2_BROADCAST, k_gt_1) { 11112 TEST_REQUIRES_X86_AVX2; 11113 for (size_t k = 2; k < 10; k++) { 11114 GemmMicrokernelTester() 11115 .mr(4) 11116 .nr(16) 11117 .kr(1) 11118 .sr(1) 11119 .m(4) 11120 .n(16) 11121 .k(k) 11122 .Test(xnn_f16_gemm_minmax_ukernel_4x16__avx2_broadcast, 
xnn_init_f16_minmax_avx_params); 11123 } 11124 } 11125 TEST(F16_GEMM_MINMAX_4X16__AVX2_BROADCAST,k_gt_1_strided_a)11126 TEST(F16_GEMM_MINMAX_4X16__AVX2_BROADCAST, k_gt_1_strided_a) { 11127 TEST_REQUIRES_X86_AVX2; 11128 for (size_t k = 2; k < 10; k++) { 11129 GemmMicrokernelTester() 11130 .mr(4) 11131 .nr(16) 11132 .kr(1) 11133 .sr(1) 11134 .m(4) 11135 .n(16) 11136 .k(k) 11137 .a_stride(11) 11138 .Test(xnn_f16_gemm_minmax_ukernel_4x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 11139 } 11140 } 11141 TEST(F16_GEMM_MINMAX_4X16__AVX2_BROADCAST,k_gt_1_subtile)11142 TEST(F16_GEMM_MINMAX_4X16__AVX2_BROADCAST, k_gt_1_subtile) { 11143 TEST_REQUIRES_X86_AVX2; 11144 for (size_t k = 2; k < 10; k++) { 11145 for (uint32_t n = 1; n <= 16; n++) { 11146 for (uint32_t m = 1; m <= 4; m++) { 11147 GemmMicrokernelTester() 11148 .mr(4) 11149 .nr(16) 11150 .kr(1) 11151 .sr(1) 11152 .m(m) 11153 .n(n) 11154 .k(k) 11155 .iterations(1) 11156 .Test(xnn_f16_gemm_minmax_ukernel_4x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 11157 } 11158 } 11159 } 11160 } 11161 TEST(F16_GEMM_MINMAX_4X16__AVX2_BROADCAST,n_gt_16)11162 TEST(F16_GEMM_MINMAX_4X16__AVX2_BROADCAST, n_gt_16) { 11163 TEST_REQUIRES_X86_AVX2; 11164 for (uint32_t n = 17; n < 32; n++) { 11165 for (size_t k = 1; k <= 5; k += 2) { 11166 GemmMicrokernelTester() 11167 .mr(4) 11168 .nr(16) 11169 .kr(1) 11170 .sr(1) 11171 .m(4) 11172 .n(n) 11173 .k(k) 11174 .Test(xnn_f16_gemm_minmax_ukernel_4x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 11175 } 11176 } 11177 } 11178 TEST(F16_GEMM_MINMAX_4X16__AVX2_BROADCAST,n_gt_16_strided_cn)11179 TEST(F16_GEMM_MINMAX_4X16__AVX2_BROADCAST, n_gt_16_strided_cn) { 11180 TEST_REQUIRES_X86_AVX2; 11181 for (uint32_t n = 17; n < 32; n++) { 11182 for (size_t k = 1; k <= 5; k += 2) { 11183 GemmMicrokernelTester() 11184 .mr(4) 11185 .nr(16) 11186 .kr(1) 11187 .sr(1) 11188 .m(4) 11189 .n(n) 11190 .k(k) 11191 .cn_stride(19) 11192 .Test(xnn_f16_gemm_minmax_ukernel_4x16__avx2_broadcast, 
xnn_init_f16_minmax_avx_params); 11193 } 11194 } 11195 } 11196 TEST(F16_GEMM_MINMAX_4X16__AVX2_BROADCAST,n_gt_16_strided_a)11197 TEST(F16_GEMM_MINMAX_4X16__AVX2_BROADCAST, n_gt_16_strided_a) { 11198 TEST_REQUIRES_X86_AVX2; 11199 for (uint32_t n = 17; n < 32; n++) { 11200 for (size_t k = 1; k <= 5; k += 2) { 11201 GemmMicrokernelTester() 11202 .mr(4) 11203 .nr(16) 11204 .kr(1) 11205 .sr(1) 11206 .m(4) 11207 .n(n) 11208 .k(k) 11209 .a_stride(7) 11210 .Test(xnn_f16_gemm_minmax_ukernel_4x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 11211 } 11212 } 11213 } 11214 TEST(F16_GEMM_MINMAX_4X16__AVX2_BROADCAST,n_gt_16_subtile)11215 TEST(F16_GEMM_MINMAX_4X16__AVX2_BROADCAST, n_gt_16_subtile) { 11216 TEST_REQUIRES_X86_AVX2; 11217 for (uint32_t n = 17; n < 32; n++) { 11218 for (size_t k = 1; k <= 5; k += 2) { 11219 for (uint32_t m = 1; m <= 4; m++) { 11220 GemmMicrokernelTester() 11221 .mr(4) 11222 .nr(16) 11223 .kr(1) 11224 .sr(1) 11225 .m(m) 11226 .n(n) 11227 .k(k) 11228 .iterations(1) 11229 .Test(xnn_f16_gemm_minmax_ukernel_4x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 11230 } 11231 } 11232 } 11233 } 11234 TEST(F16_GEMM_MINMAX_4X16__AVX2_BROADCAST,n_div_16)11235 TEST(F16_GEMM_MINMAX_4X16__AVX2_BROADCAST, n_div_16) { 11236 TEST_REQUIRES_X86_AVX2; 11237 for (uint32_t n = 32; n <= 48; n += 16) { 11238 for (size_t k = 1; k <= 5; k += 2) { 11239 GemmMicrokernelTester() 11240 .mr(4) 11241 .nr(16) 11242 .kr(1) 11243 .sr(1) 11244 .m(4) 11245 .n(n) 11246 .k(k) 11247 .Test(xnn_f16_gemm_minmax_ukernel_4x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 11248 } 11249 } 11250 } 11251 TEST(F16_GEMM_MINMAX_4X16__AVX2_BROADCAST,n_div_16_strided_cn)11252 TEST(F16_GEMM_MINMAX_4X16__AVX2_BROADCAST, n_div_16_strided_cn) { 11253 TEST_REQUIRES_X86_AVX2; 11254 for (uint32_t n = 32; n <= 48; n += 16) { 11255 for (size_t k = 1; k <= 5; k += 2) { 11256 GemmMicrokernelTester() 11257 .mr(4) 11258 .nr(16) 11259 .kr(1) 11260 .sr(1) 11261 .m(4) 11262 .n(n) 11263 .k(k) 11264 
.cn_stride(19) 11265 .Test(xnn_f16_gemm_minmax_ukernel_4x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 11266 } 11267 } 11268 } 11269 TEST(F16_GEMM_MINMAX_4X16__AVX2_BROADCAST,n_div_16_strided_a)11270 TEST(F16_GEMM_MINMAX_4X16__AVX2_BROADCAST, n_div_16_strided_a) { 11271 TEST_REQUIRES_X86_AVX2; 11272 for (uint32_t n = 32; n <= 48; n += 16) { 11273 for (size_t k = 1; k <= 5; k += 2) { 11274 GemmMicrokernelTester() 11275 .mr(4) 11276 .nr(16) 11277 .kr(1) 11278 .sr(1) 11279 .m(4) 11280 .n(n) 11281 .k(k) 11282 .a_stride(7) 11283 .Test(xnn_f16_gemm_minmax_ukernel_4x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 11284 } 11285 } 11286 } 11287 TEST(F16_GEMM_MINMAX_4X16__AVX2_BROADCAST,n_div_16_subtile)11288 TEST(F16_GEMM_MINMAX_4X16__AVX2_BROADCAST, n_div_16_subtile) { 11289 TEST_REQUIRES_X86_AVX2; 11290 for (uint32_t n = 32; n <= 48; n += 16) { 11291 for (size_t k = 1; k <= 5; k += 2) { 11292 for (uint32_t m = 1; m <= 4; m++) { 11293 GemmMicrokernelTester() 11294 .mr(4) 11295 .nr(16) 11296 .kr(1) 11297 .sr(1) 11298 .m(m) 11299 .n(n) 11300 .k(k) 11301 .iterations(1) 11302 .Test(xnn_f16_gemm_minmax_ukernel_4x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 11303 } 11304 } 11305 } 11306 } 11307 TEST(F16_GEMM_MINMAX_4X16__AVX2_BROADCAST,strided_cm_subtile)11308 TEST(F16_GEMM_MINMAX_4X16__AVX2_BROADCAST, strided_cm_subtile) { 11309 TEST_REQUIRES_X86_AVX2; 11310 for (size_t k = 1; k <= 5; k += 2) { 11311 for (uint32_t n = 1; n <= 16; n++) { 11312 for (uint32_t m = 1; m <= 4; m++) { 11313 GemmMicrokernelTester() 11314 .mr(4) 11315 .nr(16) 11316 .kr(1) 11317 .sr(1) 11318 .m(m) 11319 .n(n) 11320 .k(k) 11321 .cm_stride(19) 11322 .iterations(1) 11323 .Test(xnn_f16_gemm_minmax_ukernel_4x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 11324 } 11325 } 11326 } 11327 } 11328 TEST(F16_GEMM_MINMAX_4X16__AVX2_BROADCAST,qmin)11329 TEST(F16_GEMM_MINMAX_4X16__AVX2_BROADCAST, qmin) { 11330 TEST_REQUIRES_X86_AVX2; 11331 GemmMicrokernelTester() 11332 .mr(4) 11333 .nr(16) 
11334 .kr(1) 11335 .sr(1) 11336 .m(4) 11337 .n(16) 11338 .k(1) 11339 .qmin(128) 11340 .Test(xnn_f16_gemm_minmax_ukernel_4x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 11341 } 11342 TEST(F16_GEMM_MINMAX_4X16__AVX2_BROADCAST,qmax)11343 TEST(F16_GEMM_MINMAX_4X16__AVX2_BROADCAST, qmax) { 11344 TEST_REQUIRES_X86_AVX2; 11345 GemmMicrokernelTester() 11346 .mr(4) 11347 .nr(16) 11348 .kr(1) 11349 .sr(1) 11350 .m(4) 11351 .n(16) 11352 .k(1) 11353 .qmax(128) 11354 .Test(xnn_f16_gemm_minmax_ukernel_4x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 11355 } 11356 TEST(F16_GEMM_MINMAX_4X16__AVX2_BROADCAST,strided_cm)11357 TEST(F16_GEMM_MINMAX_4X16__AVX2_BROADCAST, strided_cm) { 11358 TEST_REQUIRES_X86_AVX2; 11359 GemmMicrokernelTester() 11360 .mr(4) 11361 .nr(16) 11362 .kr(1) 11363 .sr(1) 11364 .m(4) 11365 .n(16) 11366 .k(1) 11367 .cm_stride(19) 11368 .Test(xnn_f16_gemm_minmax_ukernel_4x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 11369 } 11370 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 11371 11372 11373 #if XNN_ARCH_X86 || XNN_ARCH_X86_64 TEST(F16_GEMM_MINMAX_5X8__AVX2_BROADCAST,k_eq_1)11374 TEST(F16_GEMM_MINMAX_5X8__AVX2_BROADCAST, k_eq_1) { 11375 TEST_REQUIRES_X86_AVX2; 11376 GemmMicrokernelTester() 11377 .mr(5) 11378 .nr(8) 11379 .kr(1) 11380 .sr(1) 11381 .m(5) 11382 .n(8) 11383 .k(1) 11384 .Test(xnn_f16_gemm_minmax_ukernel_5x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 11385 } 11386 TEST(F16_GEMM_MINMAX_5X8__AVX2_BROADCAST,strided_cn)11387 TEST(F16_GEMM_MINMAX_5X8__AVX2_BROADCAST, strided_cn) { 11388 TEST_REQUIRES_X86_AVX2; 11389 GemmMicrokernelTester() 11390 .mr(5) 11391 .nr(8) 11392 .kr(1) 11393 .sr(1) 11394 .m(5) 11395 .n(8) 11396 .k(1) 11397 .cn_stride(11) 11398 .Test(xnn_f16_gemm_minmax_ukernel_5x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 11399 } 11400 TEST(F16_GEMM_MINMAX_5X8__AVX2_BROADCAST,k_eq_1_strided_a)11401 TEST(F16_GEMM_MINMAX_5X8__AVX2_BROADCAST, k_eq_1_strided_a) { 11402 TEST_REQUIRES_X86_AVX2; 11403 GemmMicrokernelTester() 
11404 .mr(5) 11405 .nr(8) 11406 .kr(1) 11407 .sr(1) 11408 .m(5) 11409 .n(8) 11410 .k(1) 11411 .a_stride(3) 11412 .Test(xnn_f16_gemm_minmax_ukernel_5x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 11413 } 11414 TEST(F16_GEMM_MINMAX_5X8__AVX2_BROADCAST,k_eq_1_subtile)11415 TEST(F16_GEMM_MINMAX_5X8__AVX2_BROADCAST, k_eq_1_subtile) { 11416 TEST_REQUIRES_X86_AVX2; 11417 for (uint32_t n = 1; n <= 8; n++) { 11418 for (uint32_t m = 1; m <= 5; m++) { 11419 GemmMicrokernelTester() 11420 .mr(5) 11421 .nr(8) 11422 .kr(1) 11423 .sr(1) 11424 .m(m) 11425 .n(n) 11426 .k(1) 11427 .iterations(1) 11428 .Test(xnn_f16_gemm_minmax_ukernel_5x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 11429 } 11430 } 11431 } 11432 TEST(F16_GEMM_MINMAX_5X8__AVX2_BROADCAST,k_eq_1_subtile_m)11433 TEST(F16_GEMM_MINMAX_5X8__AVX2_BROADCAST, k_eq_1_subtile_m) { 11434 TEST_REQUIRES_X86_AVX2; 11435 for (uint32_t m = 1; m <= 5; m++) { 11436 GemmMicrokernelTester() 11437 .mr(5) 11438 .nr(8) 11439 .kr(1) 11440 .sr(1) 11441 .m(m) 11442 .n(8) 11443 .k(1) 11444 .iterations(1) 11445 .Test(xnn_f16_gemm_minmax_ukernel_5x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 11446 } 11447 } 11448 TEST(F16_GEMM_MINMAX_5X8__AVX2_BROADCAST,k_eq_1_subtile_n)11449 TEST(F16_GEMM_MINMAX_5X8__AVX2_BROADCAST, k_eq_1_subtile_n) { 11450 TEST_REQUIRES_X86_AVX2; 11451 for (uint32_t n = 1; n <= 8; n++) { 11452 GemmMicrokernelTester() 11453 .mr(5) 11454 .nr(8) 11455 .kr(1) 11456 .sr(1) 11457 .m(5) 11458 .n(n) 11459 .k(1) 11460 .iterations(1) 11461 .Test(xnn_f16_gemm_minmax_ukernel_5x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 11462 } 11463 } 11464 TEST(F16_GEMM_MINMAX_5X8__AVX2_BROADCAST,k_gt_1)11465 TEST(F16_GEMM_MINMAX_5X8__AVX2_BROADCAST, k_gt_1) { 11466 TEST_REQUIRES_X86_AVX2; 11467 for (size_t k = 2; k < 10; k++) { 11468 GemmMicrokernelTester() 11469 .mr(5) 11470 .nr(8) 11471 .kr(1) 11472 .sr(1) 11473 .m(5) 11474 .n(8) 11475 .k(k) 11476 .Test(xnn_f16_gemm_minmax_ukernel_5x8__avx2_broadcast, 
xnn_init_f16_minmax_avx_params); 11477 } 11478 } 11479 TEST(F16_GEMM_MINMAX_5X8__AVX2_BROADCAST,k_gt_1_strided_a)11480 TEST(F16_GEMM_MINMAX_5X8__AVX2_BROADCAST, k_gt_1_strided_a) { 11481 TEST_REQUIRES_X86_AVX2; 11482 for (size_t k = 2; k < 10; k++) { 11483 GemmMicrokernelTester() 11484 .mr(5) 11485 .nr(8) 11486 .kr(1) 11487 .sr(1) 11488 .m(5) 11489 .n(8) 11490 .k(k) 11491 .a_stride(11) 11492 .Test(xnn_f16_gemm_minmax_ukernel_5x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 11493 } 11494 } 11495 TEST(F16_GEMM_MINMAX_5X8__AVX2_BROADCAST,k_gt_1_subtile)11496 TEST(F16_GEMM_MINMAX_5X8__AVX2_BROADCAST, k_gt_1_subtile) { 11497 TEST_REQUIRES_X86_AVX2; 11498 for (size_t k = 2; k < 10; k++) { 11499 for (uint32_t n = 1; n <= 8; n++) { 11500 for (uint32_t m = 1; m <= 5; m++) { 11501 GemmMicrokernelTester() 11502 .mr(5) 11503 .nr(8) 11504 .kr(1) 11505 .sr(1) 11506 .m(m) 11507 .n(n) 11508 .k(k) 11509 .iterations(1) 11510 .Test(xnn_f16_gemm_minmax_ukernel_5x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 11511 } 11512 } 11513 } 11514 } 11515 TEST(F16_GEMM_MINMAX_5X8__AVX2_BROADCAST,n_gt_8)11516 TEST(F16_GEMM_MINMAX_5X8__AVX2_BROADCAST, n_gt_8) { 11517 TEST_REQUIRES_X86_AVX2; 11518 for (uint32_t n = 9; n < 16; n++) { 11519 for (size_t k = 1; k <= 5; k += 2) { 11520 GemmMicrokernelTester() 11521 .mr(5) 11522 .nr(8) 11523 .kr(1) 11524 .sr(1) 11525 .m(5) 11526 .n(n) 11527 .k(k) 11528 .Test(xnn_f16_gemm_minmax_ukernel_5x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 11529 } 11530 } 11531 } 11532 TEST(F16_GEMM_MINMAX_5X8__AVX2_BROADCAST,n_gt_8_strided_cn)11533 TEST(F16_GEMM_MINMAX_5X8__AVX2_BROADCAST, n_gt_8_strided_cn) { 11534 TEST_REQUIRES_X86_AVX2; 11535 for (uint32_t n = 9; n < 16; n++) { 11536 for (size_t k = 1; k <= 5; k += 2) { 11537 GemmMicrokernelTester() 11538 .mr(5) 11539 .nr(8) 11540 .kr(1) 11541 .sr(1) 11542 .m(5) 11543 .n(n) 11544 .k(k) 11545 .cn_stride(11) 11546 .Test(xnn_f16_gemm_minmax_ukernel_5x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 11547 } 
11548 } 11549 } 11550 TEST(F16_GEMM_MINMAX_5X8__AVX2_BROADCAST,n_gt_8_strided_a)11551 TEST(F16_GEMM_MINMAX_5X8__AVX2_BROADCAST, n_gt_8_strided_a) { 11552 TEST_REQUIRES_X86_AVX2; 11553 for (uint32_t n = 9; n < 16; n++) { 11554 for (size_t k = 1; k <= 5; k += 2) { 11555 GemmMicrokernelTester() 11556 .mr(5) 11557 .nr(8) 11558 .kr(1) 11559 .sr(1) 11560 .m(5) 11561 .n(n) 11562 .k(k) 11563 .a_stride(7) 11564 .Test(xnn_f16_gemm_minmax_ukernel_5x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 11565 } 11566 } 11567 } 11568 TEST(F16_GEMM_MINMAX_5X8__AVX2_BROADCAST,n_gt_8_subtile)11569 TEST(F16_GEMM_MINMAX_5X8__AVX2_BROADCAST, n_gt_8_subtile) { 11570 TEST_REQUIRES_X86_AVX2; 11571 for (uint32_t n = 9; n < 16; n++) { 11572 for (size_t k = 1; k <= 5; k += 2) { 11573 for (uint32_t m = 1; m <= 5; m++) { 11574 GemmMicrokernelTester() 11575 .mr(5) 11576 .nr(8) 11577 .kr(1) 11578 .sr(1) 11579 .m(m) 11580 .n(n) 11581 .k(k) 11582 .iterations(1) 11583 .Test(xnn_f16_gemm_minmax_ukernel_5x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 11584 } 11585 } 11586 } 11587 } 11588 TEST(F16_GEMM_MINMAX_5X8__AVX2_BROADCAST,n_div_8)11589 TEST(F16_GEMM_MINMAX_5X8__AVX2_BROADCAST, n_div_8) { 11590 TEST_REQUIRES_X86_AVX2; 11591 for (uint32_t n = 16; n <= 24; n += 8) { 11592 for (size_t k = 1; k <= 5; k += 2) { 11593 GemmMicrokernelTester() 11594 .mr(5) 11595 .nr(8) 11596 .kr(1) 11597 .sr(1) 11598 .m(5) 11599 .n(n) 11600 .k(k) 11601 .Test(xnn_f16_gemm_minmax_ukernel_5x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 11602 } 11603 } 11604 } 11605 TEST(F16_GEMM_MINMAX_5X8__AVX2_BROADCAST,n_div_8_strided_cn)11606 TEST(F16_GEMM_MINMAX_5X8__AVX2_BROADCAST, n_div_8_strided_cn) { 11607 TEST_REQUIRES_X86_AVX2; 11608 for (uint32_t n = 16; n <= 24; n += 8) { 11609 for (size_t k = 1; k <= 5; k += 2) { 11610 GemmMicrokernelTester() 11611 .mr(5) 11612 .nr(8) 11613 .kr(1) 11614 .sr(1) 11615 .m(5) 11616 .n(n) 11617 .k(k) 11618 .cn_stride(11) 11619 .Test(xnn_f16_gemm_minmax_ukernel_5x8__avx2_broadcast, 
xnn_init_f16_minmax_avx_params); 11620 } 11621 } 11622 } 11623 TEST(F16_GEMM_MINMAX_5X8__AVX2_BROADCAST,n_div_8_strided_a)11624 TEST(F16_GEMM_MINMAX_5X8__AVX2_BROADCAST, n_div_8_strided_a) { 11625 TEST_REQUIRES_X86_AVX2; 11626 for (uint32_t n = 16; n <= 24; n += 8) { 11627 for (size_t k = 1; k <= 5; k += 2) { 11628 GemmMicrokernelTester() 11629 .mr(5) 11630 .nr(8) 11631 .kr(1) 11632 .sr(1) 11633 .m(5) 11634 .n(n) 11635 .k(k) 11636 .a_stride(7) 11637 .Test(xnn_f16_gemm_minmax_ukernel_5x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 11638 } 11639 } 11640 } 11641 TEST(F16_GEMM_MINMAX_5X8__AVX2_BROADCAST,n_div_8_subtile)11642 TEST(F16_GEMM_MINMAX_5X8__AVX2_BROADCAST, n_div_8_subtile) { 11643 TEST_REQUIRES_X86_AVX2; 11644 for (uint32_t n = 16; n <= 24; n += 8) { 11645 for (size_t k = 1; k <= 5; k += 2) { 11646 for (uint32_t m = 1; m <= 5; m++) { 11647 GemmMicrokernelTester() 11648 .mr(5) 11649 .nr(8) 11650 .kr(1) 11651 .sr(1) 11652 .m(m) 11653 .n(n) 11654 .k(k) 11655 .iterations(1) 11656 .Test(xnn_f16_gemm_minmax_ukernel_5x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 11657 } 11658 } 11659 } 11660 } 11661 TEST(F16_GEMM_MINMAX_5X8__AVX2_BROADCAST,strided_cm_subtile)11662 TEST(F16_GEMM_MINMAX_5X8__AVX2_BROADCAST, strided_cm_subtile) { 11663 TEST_REQUIRES_X86_AVX2; 11664 for (size_t k = 1; k <= 5; k += 2) { 11665 for (uint32_t n = 1; n <= 8; n++) { 11666 for (uint32_t m = 1; m <= 5; m++) { 11667 GemmMicrokernelTester() 11668 .mr(5) 11669 .nr(8) 11670 .kr(1) 11671 .sr(1) 11672 .m(m) 11673 .n(n) 11674 .k(k) 11675 .cm_stride(11) 11676 .iterations(1) 11677 .Test(xnn_f16_gemm_minmax_ukernel_5x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 11678 } 11679 } 11680 } 11681 } 11682 TEST(F16_GEMM_MINMAX_5X8__AVX2_BROADCAST,qmin)11683 TEST(F16_GEMM_MINMAX_5X8__AVX2_BROADCAST, qmin) { 11684 TEST_REQUIRES_X86_AVX2; 11685 GemmMicrokernelTester() 11686 .mr(5) 11687 .nr(8) 11688 .kr(1) 11689 .sr(1) 11690 .m(5) 11691 .n(8) 11692 .k(1) 11693 .qmin(128) 11694 
.Test(xnn_f16_gemm_minmax_ukernel_5x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 11695 } 11696 TEST(F16_GEMM_MINMAX_5X8__AVX2_BROADCAST,qmax)11697 TEST(F16_GEMM_MINMAX_5X8__AVX2_BROADCAST, qmax) { 11698 TEST_REQUIRES_X86_AVX2; 11699 GemmMicrokernelTester() 11700 .mr(5) 11701 .nr(8) 11702 .kr(1) 11703 .sr(1) 11704 .m(5) 11705 .n(8) 11706 .k(1) 11707 .qmax(128) 11708 .Test(xnn_f16_gemm_minmax_ukernel_5x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 11709 } 11710 TEST(F16_GEMM_MINMAX_5X8__AVX2_BROADCAST,strided_cm)11711 TEST(F16_GEMM_MINMAX_5X8__AVX2_BROADCAST, strided_cm) { 11712 TEST_REQUIRES_X86_AVX2; 11713 GemmMicrokernelTester() 11714 .mr(5) 11715 .nr(8) 11716 .kr(1) 11717 .sr(1) 11718 .m(5) 11719 .n(8) 11720 .k(1) 11721 .cm_stride(11) 11722 .Test(xnn_f16_gemm_minmax_ukernel_5x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 11723 } 11724 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 11725 11726 11727 #if XNN_ARCH_X86 || XNN_ARCH_X86_64 TEST(F16_GEMM_MINMAX_5X16__AVX2_BROADCAST,k_eq_1)11728 TEST(F16_GEMM_MINMAX_5X16__AVX2_BROADCAST, k_eq_1) { 11729 TEST_REQUIRES_X86_AVX2; 11730 GemmMicrokernelTester() 11731 .mr(5) 11732 .nr(16) 11733 .kr(1) 11734 .sr(1) 11735 .m(5) 11736 .n(16) 11737 .k(1) 11738 .Test(xnn_f16_gemm_minmax_ukernel_5x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 11739 } 11740 TEST(F16_GEMM_MINMAX_5X16__AVX2_BROADCAST,strided_cn)11741 TEST(F16_GEMM_MINMAX_5X16__AVX2_BROADCAST, strided_cn) { 11742 TEST_REQUIRES_X86_AVX2; 11743 GemmMicrokernelTester() 11744 .mr(5) 11745 .nr(16) 11746 .kr(1) 11747 .sr(1) 11748 .m(5) 11749 .n(16) 11750 .k(1) 11751 .cn_stride(19) 11752 .Test(xnn_f16_gemm_minmax_ukernel_5x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 11753 } 11754 TEST(F16_GEMM_MINMAX_5X16__AVX2_BROADCAST,k_eq_1_strided_a)11755 TEST(F16_GEMM_MINMAX_5X16__AVX2_BROADCAST, k_eq_1_strided_a) { 11756 TEST_REQUIRES_X86_AVX2; 11757 GemmMicrokernelTester() 11758 .mr(5) 11759 .nr(16) 11760 .kr(1) 11761 .sr(1) 11762 .m(5) 11763 .n(16) 11764 
.k(1) 11765 .a_stride(3) 11766 .Test(xnn_f16_gemm_minmax_ukernel_5x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 11767 } 11768 TEST(F16_GEMM_MINMAX_5X16__AVX2_BROADCAST,k_eq_1_subtile)11769 TEST(F16_GEMM_MINMAX_5X16__AVX2_BROADCAST, k_eq_1_subtile) { 11770 TEST_REQUIRES_X86_AVX2; 11771 for (uint32_t n = 1; n <= 16; n++) { 11772 for (uint32_t m = 1; m <= 5; m++) { 11773 GemmMicrokernelTester() 11774 .mr(5) 11775 .nr(16) 11776 .kr(1) 11777 .sr(1) 11778 .m(m) 11779 .n(n) 11780 .k(1) 11781 .iterations(1) 11782 .Test(xnn_f16_gemm_minmax_ukernel_5x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 11783 } 11784 } 11785 } 11786 TEST(F16_GEMM_MINMAX_5X16__AVX2_BROADCAST,k_eq_1_subtile_m)11787 TEST(F16_GEMM_MINMAX_5X16__AVX2_BROADCAST, k_eq_1_subtile_m) { 11788 TEST_REQUIRES_X86_AVX2; 11789 for (uint32_t m = 1; m <= 5; m++) { 11790 GemmMicrokernelTester() 11791 .mr(5) 11792 .nr(16) 11793 .kr(1) 11794 .sr(1) 11795 .m(m) 11796 .n(16) 11797 .k(1) 11798 .iterations(1) 11799 .Test(xnn_f16_gemm_minmax_ukernel_5x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 11800 } 11801 } 11802 TEST(F16_GEMM_MINMAX_5X16__AVX2_BROADCAST,k_eq_1_subtile_n)11803 TEST(F16_GEMM_MINMAX_5X16__AVX2_BROADCAST, k_eq_1_subtile_n) { 11804 TEST_REQUIRES_X86_AVX2; 11805 for (uint32_t n = 1; n <= 16; n++) { 11806 GemmMicrokernelTester() 11807 .mr(5) 11808 .nr(16) 11809 .kr(1) 11810 .sr(1) 11811 .m(5) 11812 .n(n) 11813 .k(1) 11814 .iterations(1) 11815 .Test(xnn_f16_gemm_minmax_ukernel_5x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 11816 } 11817 } 11818 TEST(F16_GEMM_MINMAX_5X16__AVX2_BROADCAST,k_gt_1)11819 TEST(F16_GEMM_MINMAX_5X16__AVX2_BROADCAST, k_gt_1) { 11820 TEST_REQUIRES_X86_AVX2; 11821 for (size_t k = 2; k < 10; k++) { 11822 GemmMicrokernelTester() 11823 .mr(5) 11824 .nr(16) 11825 .kr(1) 11826 .sr(1) 11827 .m(5) 11828 .n(16) 11829 .k(k) 11830 .Test(xnn_f16_gemm_minmax_ukernel_5x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 11831 } 11832 } 11833 
// F16 GEMM min/max, 5x16 AVX2 broadcast microkernel: k > 1 and n > 16 cases.
// Every test first checks TEST_REQUIRES_X86_AVX2.

// k = 2..9 with m == mr and n == nr, and a_stride set to 11.
TEST(F16_GEMM_MINMAX_5X16__AVX2_BROADCAST, k_gt_1_strided_a) {
  TEST_REQUIRES_X86_AVX2;
  for (size_t k = 2; k < 10; k++) {
    GemmMicrokernelTester()
      .mr(5)
      .nr(16)
      .kr(1)
      .sr(1)
      .m(5)
      .n(16)
      .k(k)
      .a_stride(11)
      .Test(xnn_f16_gemm_minmax_ukernel_5x16__avx2_broadcast, xnn_init_f16_minmax_avx_params);
  }
}

// k = 2..9 over every (m, n) with 1 <= m <= 5 and 1 <= n <= 16, one iteration each.
TEST(F16_GEMM_MINMAX_5X16__AVX2_BROADCAST, k_gt_1_subtile) {
  TEST_REQUIRES_X86_AVX2;
  for (size_t k = 2; k < 10; k++) {
    for (uint32_t n = 1; n <= 16; n++) {
      for (uint32_t m = 1; m <= 5; m++) {
        GemmMicrokernelTester()
          .mr(5)
          .nr(16)
          .kr(1)
          .sr(1)
          .m(m)
          .n(n)
          .k(k)
          .iterations(1)
          .Test(xnn_f16_gemm_minmax_ukernel_5x16__avx2_broadcast, xnn_init_f16_minmax_avx_params);
      }
    }
  }
}

// n = 17..31 (above nr, below 2 * nr) with k in {1, 3, 5}.
TEST(F16_GEMM_MINMAX_5X16__AVX2_BROADCAST, n_gt_16) {
  TEST_REQUIRES_X86_AVX2;
  for (uint32_t n = 17; n < 32; n++) {
    for (size_t k = 1; k <= 5; k += 2) {
      GemmMicrokernelTester()
        .mr(5)
        .nr(16)
        .kr(1)
        .sr(1)
        .m(5)
        .n(n)
        .k(k)
        .Test(xnn_f16_gemm_minmax_ukernel_5x16__avx2_broadcast, xnn_init_f16_minmax_avx_params);
    }
  }
}

// Same n and k sweep as n_gt_16, with cn_stride set to 19.
TEST(F16_GEMM_MINMAX_5X16__AVX2_BROADCAST, n_gt_16_strided_cn) {
  TEST_REQUIRES_X86_AVX2;
  for (uint32_t n = 17; n < 32; n++) {
    for (size_t k = 1; k <= 5; k += 2) {
      GemmMicrokernelTester()
        .mr(5)
        .nr(16)
        .kr(1)
        .sr(1)
        .m(5)
        .n(n)
        .k(k)
        .cn_stride(19)
        .Test(xnn_f16_gemm_minmax_ukernel_5x16__avx2_broadcast, xnn_init_f16_minmax_avx_params);
    }
  }
}
TEST(F16_GEMM_MINMAX_5X16__AVX2_BROADCAST,n_gt_16_strided_a)11905 TEST(F16_GEMM_MINMAX_5X16__AVX2_BROADCAST, n_gt_16_strided_a) { 11906 TEST_REQUIRES_X86_AVX2; 11907 for (uint32_t n = 17; n < 32; n++) { 11908 for (size_t k = 1; k <= 5; k += 2) { 11909 GemmMicrokernelTester() 11910 .mr(5) 11911 .nr(16) 11912 .kr(1) 11913 .sr(1) 11914 .m(5) 11915 .n(n) 11916 .k(k) 11917 .a_stride(7) 11918 .Test(xnn_f16_gemm_minmax_ukernel_5x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 11919 } 11920 } 11921 } 11922 TEST(F16_GEMM_MINMAX_5X16__AVX2_BROADCAST,n_gt_16_subtile)11923 TEST(F16_GEMM_MINMAX_5X16__AVX2_BROADCAST, n_gt_16_subtile) { 11924 TEST_REQUIRES_X86_AVX2; 11925 for (uint32_t n = 17; n < 32; n++) { 11926 for (size_t k = 1; k <= 5; k += 2) { 11927 for (uint32_t m = 1; m <= 5; m++) { 11928 GemmMicrokernelTester() 11929 .mr(5) 11930 .nr(16) 11931 .kr(1) 11932 .sr(1) 11933 .m(m) 11934 .n(n) 11935 .k(k) 11936 .iterations(1) 11937 .Test(xnn_f16_gemm_minmax_ukernel_5x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 11938 } 11939 } 11940 } 11941 } 11942 TEST(F16_GEMM_MINMAX_5X16__AVX2_BROADCAST,n_div_16)11943 TEST(F16_GEMM_MINMAX_5X16__AVX2_BROADCAST, n_div_16) { 11944 TEST_REQUIRES_X86_AVX2; 11945 for (uint32_t n = 32; n <= 48; n += 16) { 11946 for (size_t k = 1; k <= 5; k += 2) { 11947 GemmMicrokernelTester() 11948 .mr(5) 11949 .nr(16) 11950 .kr(1) 11951 .sr(1) 11952 .m(5) 11953 .n(n) 11954 .k(k) 11955 .Test(xnn_f16_gemm_minmax_ukernel_5x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 11956 } 11957 } 11958 } 11959 TEST(F16_GEMM_MINMAX_5X16__AVX2_BROADCAST,n_div_16_strided_cn)11960 TEST(F16_GEMM_MINMAX_5X16__AVX2_BROADCAST, n_div_16_strided_cn) { 11961 TEST_REQUIRES_X86_AVX2; 11962 for (uint32_t n = 32; n <= 48; n += 16) { 11963 for (size_t k = 1; k <= 5; k += 2) { 11964 GemmMicrokernelTester() 11965 .mr(5) 11966 .nr(16) 11967 .kr(1) 11968 .sr(1) 11969 .m(5) 11970 .n(n) 11971 .k(k) 11972 .cn_stride(19) 11973 .Test(xnn_f16_gemm_minmax_ukernel_5x16__avx2_broadcast, 
xnn_init_f16_minmax_avx_params); 11974 } 11975 } 11976 } 11977 TEST(F16_GEMM_MINMAX_5X16__AVX2_BROADCAST,n_div_16_strided_a)11978 TEST(F16_GEMM_MINMAX_5X16__AVX2_BROADCAST, n_div_16_strided_a) { 11979 TEST_REQUIRES_X86_AVX2; 11980 for (uint32_t n = 32; n <= 48; n += 16) { 11981 for (size_t k = 1; k <= 5; k += 2) { 11982 GemmMicrokernelTester() 11983 .mr(5) 11984 .nr(16) 11985 .kr(1) 11986 .sr(1) 11987 .m(5) 11988 .n(n) 11989 .k(k) 11990 .a_stride(7) 11991 .Test(xnn_f16_gemm_minmax_ukernel_5x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 11992 } 11993 } 11994 } 11995 TEST(F16_GEMM_MINMAX_5X16__AVX2_BROADCAST,n_div_16_subtile)11996 TEST(F16_GEMM_MINMAX_5X16__AVX2_BROADCAST, n_div_16_subtile) { 11997 TEST_REQUIRES_X86_AVX2; 11998 for (uint32_t n = 32; n <= 48; n += 16) { 11999 for (size_t k = 1; k <= 5; k += 2) { 12000 for (uint32_t m = 1; m <= 5; m++) { 12001 GemmMicrokernelTester() 12002 .mr(5) 12003 .nr(16) 12004 .kr(1) 12005 .sr(1) 12006 .m(m) 12007 .n(n) 12008 .k(k) 12009 .iterations(1) 12010 .Test(xnn_f16_gemm_minmax_ukernel_5x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 12011 } 12012 } 12013 } 12014 } 12015 TEST(F16_GEMM_MINMAX_5X16__AVX2_BROADCAST,strided_cm_subtile)12016 TEST(F16_GEMM_MINMAX_5X16__AVX2_BROADCAST, strided_cm_subtile) { 12017 TEST_REQUIRES_X86_AVX2; 12018 for (size_t k = 1; k <= 5; k += 2) { 12019 for (uint32_t n = 1; n <= 16; n++) { 12020 for (uint32_t m = 1; m <= 5; m++) { 12021 GemmMicrokernelTester() 12022 .mr(5) 12023 .nr(16) 12024 .kr(1) 12025 .sr(1) 12026 .m(m) 12027 .n(n) 12028 .k(k) 12029 .cm_stride(19) 12030 .iterations(1) 12031 .Test(xnn_f16_gemm_minmax_ukernel_5x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 12032 } 12033 } 12034 } 12035 } 12036 TEST(F16_GEMM_MINMAX_5X16__AVX2_BROADCAST,qmin)12037 TEST(F16_GEMM_MINMAX_5X16__AVX2_BROADCAST, qmin) { 12038 TEST_REQUIRES_X86_AVX2; 12039 GemmMicrokernelTester() 12040 .mr(5) 12041 .nr(16) 12042 .kr(1) 12043 .sr(1) 12044 .m(5) 12045 .n(16) 12046 .k(1) 12047 .qmin(128) 
12048 .Test(xnn_f16_gemm_minmax_ukernel_5x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 12049 } 12050 TEST(F16_GEMM_MINMAX_5X16__AVX2_BROADCAST,qmax)12051 TEST(F16_GEMM_MINMAX_5X16__AVX2_BROADCAST, qmax) { 12052 TEST_REQUIRES_X86_AVX2; 12053 GemmMicrokernelTester() 12054 .mr(5) 12055 .nr(16) 12056 .kr(1) 12057 .sr(1) 12058 .m(5) 12059 .n(16) 12060 .k(1) 12061 .qmax(128) 12062 .Test(xnn_f16_gemm_minmax_ukernel_5x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 12063 } 12064 TEST(F16_GEMM_MINMAX_5X16__AVX2_BROADCAST,strided_cm)12065 TEST(F16_GEMM_MINMAX_5X16__AVX2_BROADCAST, strided_cm) { 12066 TEST_REQUIRES_X86_AVX2; 12067 GemmMicrokernelTester() 12068 .mr(5) 12069 .nr(16) 12070 .kr(1) 12071 .sr(1) 12072 .m(5) 12073 .n(16) 12074 .k(1) 12075 .cm_stride(19) 12076 .Test(xnn_f16_gemm_minmax_ukernel_5x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 12077 } 12078 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 12079 12080 12081 #if XNN_ARCH_X86 || XNN_ARCH_X86_64 TEST(F16_GEMM_MINMAX_6X8__AVX2_BROADCAST,k_eq_1)12082 TEST(F16_GEMM_MINMAX_6X8__AVX2_BROADCAST, k_eq_1) { 12083 TEST_REQUIRES_X86_AVX2; 12084 GemmMicrokernelTester() 12085 .mr(6) 12086 .nr(8) 12087 .kr(1) 12088 .sr(1) 12089 .m(6) 12090 .n(8) 12091 .k(1) 12092 .Test(xnn_f16_gemm_minmax_ukernel_6x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 12093 } 12094 TEST(F16_GEMM_MINMAX_6X8__AVX2_BROADCAST,strided_cn)12095 TEST(F16_GEMM_MINMAX_6X8__AVX2_BROADCAST, strided_cn) { 12096 TEST_REQUIRES_X86_AVX2; 12097 GemmMicrokernelTester() 12098 .mr(6) 12099 .nr(8) 12100 .kr(1) 12101 .sr(1) 12102 .m(6) 12103 .n(8) 12104 .k(1) 12105 .cn_stride(11) 12106 .Test(xnn_f16_gemm_minmax_ukernel_6x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 12107 } 12108 TEST(F16_GEMM_MINMAX_6X8__AVX2_BROADCAST,k_eq_1_strided_a)12109 TEST(F16_GEMM_MINMAX_6X8__AVX2_BROADCAST, k_eq_1_strided_a) { 12110 TEST_REQUIRES_X86_AVX2; 12111 GemmMicrokernelTester() 12112 .mr(6) 12113 .nr(8) 12114 .kr(1) 12115 .sr(1) 12116 .m(6) 12117 .n(8) 
12118 .k(1) 12119 .a_stride(3) 12120 .Test(xnn_f16_gemm_minmax_ukernel_6x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 12121 } 12122 TEST(F16_GEMM_MINMAX_6X8__AVX2_BROADCAST,k_eq_1_subtile)12123 TEST(F16_GEMM_MINMAX_6X8__AVX2_BROADCAST, k_eq_1_subtile) { 12124 TEST_REQUIRES_X86_AVX2; 12125 for (uint32_t n = 1; n <= 8; n++) { 12126 for (uint32_t m = 1; m <= 6; m++) { 12127 GemmMicrokernelTester() 12128 .mr(6) 12129 .nr(8) 12130 .kr(1) 12131 .sr(1) 12132 .m(m) 12133 .n(n) 12134 .k(1) 12135 .iterations(1) 12136 .Test(xnn_f16_gemm_minmax_ukernel_6x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 12137 } 12138 } 12139 } 12140 TEST(F16_GEMM_MINMAX_6X8__AVX2_BROADCAST,k_eq_1_subtile_m)12141 TEST(F16_GEMM_MINMAX_6X8__AVX2_BROADCAST, k_eq_1_subtile_m) { 12142 TEST_REQUIRES_X86_AVX2; 12143 for (uint32_t m = 1; m <= 6; m++) { 12144 GemmMicrokernelTester() 12145 .mr(6) 12146 .nr(8) 12147 .kr(1) 12148 .sr(1) 12149 .m(m) 12150 .n(8) 12151 .k(1) 12152 .iterations(1) 12153 .Test(xnn_f16_gemm_minmax_ukernel_6x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 12154 } 12155 } 12156 TEST(F16_GEMM_MINMAX_6X8__AVX2_BROADCAST,k_eq_1_subtile_n)12157 TEST(F16_GEMM_MINMAX_6X8__AVX2_BROADCAST, k_eq_1_subtile_n) { 12158 TEST_REQUIRES_X86_AVX2; 12159 for (uint32_t n = 1; n <= 8; n++) { 12160 GemmMicrokernelTester() 12161 .mr(6) 12162 .nr(8) 12163 .kr(1) 12164 .sr(1) 12165 .m(6) 12166 .n(n) 12167 .k(1) 12168 .iterations(1) 12169 .Test(xnn_f16_gemm_minmax_ukernel_6x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 12170 } 12171 } 12172 TEST(F16_GEMM_MINMAX_6X8__AVX2_BROADCAST,k_gt_1)12173 TEST(F16_GEMM_MINMAX_6X8__AVX2_BROADCAST, k_gt_1) { 12174 TEST_REQUIRES_X86_AVX2; 12175 for (size_t k = 2; k < 10; k++) { 12176 GemmMicrokernelTester() 12177 .mr(6) 12178 .nr(8) 12179 .kr(1) 12180 .sr(1) 12181 .m(6) 12182 .n(8) 12183 .k(k) 12184 .Test(xnn_f16_gemm_minmax_ukernel_6x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 12185 } 12186 } 12187 
// F16 GEMM min/max, 6x8 AVX2 broadcast microkernel: k > 1 and n > 8 cases.
// Every test first checks TEST_REQUIRES_X86_AVX2.

// k = 2..9 with m == mr and n == nr, and a_stride set to 11.
TEST(F16_GEMM_MINMAX_6X8__AVX2_BROADCAST, k_gt_1_strided_a) {
  TEST_REQUIRES_X86_AVX2;
  for (size_t k = 2; k < 10; k++) {
    GemmMicrokernelTester()
      .mr(6)
      .nr(8)
      .kr(1)
      .sr(1)
      .m(6)
      .n(8)
      .k(k)
      .a_stride(11)
      .Test(xnn_f16_gemm_minmax_ukernel_6x8__avx2_broadcast, xnn_init_f16_minmax_avx_params);
  }
}

// k = 2..9 over every (m, n) with 1 <= m <= 6 and 1 <= n <= 8, one iteration each.
TEST(F16_GEMM_MINMAX_6X8__AVX2_BROADCAST, k_gt_1_subtile) {
  TEST_REQUIRES_X86_AVX2;
  for (size_t k = 2; k < 10; k++) {
    for (uint32_t n = 1; n <= 8; n++) {
      for (uint32_t m = 1; m <= 6; m++) {
        GemmMicrokernelTester()
          .mr(6)
          .nr(8)
          .kr(1)
          .sr(1)
          .m(m)
          .n(n)
          .k(k)
          .iterations(1)
          .Test(xnn_f16_gemm_minmax_ukernel_6x8__avx2_broadcast, xnn_init_f16_minmax_avx_params);
      }
    }
  }
}

// n = 9..15 (above nr, below 2 * nr) with k in {1, 3, 5}.
TEST(F16_GEMM_MINMAX_6X8__AVX2_BROADCAST, n_gt_8) {
  TEST_REQUIRES_X86_AVX2;
  for (uint32_t n = 9; n < 16; n++) {
    for (size_t k = 1; k <= 5; k += 2) {
      GemmMicrokernelTester()
        .mr(6)
        .nr(8)
        .kr(1)
        .sr(1)
        .m(6)
        .n(n)
        .k(k)
        .Test(xnn_f16_gemm_minmax_ukernel_6x8__avx2_broadcast, xnn_init_f16_minmax_avx_params);
    }
  }
}

// Same n and k sweep as n_gt_8, with cn_stride set to 11.
TEST(F16_GEMM_MINMAX_6X8__AVX2_BROADCAST, n_gt_8_strided_cn) {
  TEST_REQUIRES_X86_AVX2;
  for (uint32_t n = 9; n < 16; n++) {
    for (size_t k = 1; k <= 5; k += 2) {
      GemmMicrokernelTester()
        .mr(6)
        .nr(8)
        .kr(1)
        .sr(1)
        .m(6)
        .n(n)
        .k(k)
        .cn_stride(11)
        .Test(xnn_f16_gemm_minmax_ukernel_6x8__avx2_broadcast, xnn_init_f16_minmax_avx_params);
    }
  }
}
TEST(F16_GEMM_MINMAX_6X8__AVX2_BROADCAST,n_gt_8_strided_a)12259 TEST(F16_GEMM_MINMAX_6X8__AVX2_BROADCAST, n_gt_8_strided_a) { 12260 TEST_REQUIRES_X86_AVX2; 12261 for (uint32_t n = 9; n < 16; n++) { 12262 for (size_t k = 1; k <= 5; k += 2) { 12263 GemmMicrokernelTester() 12264 .mr(6) 12265 .nr(8) 12266 .kr(1) 12267 .sr(1) 12268 .m(6) 12269 .n(n) 12270 .k(k) 12271 .a_stride(7) 12272 .Test(xnn_f16_gemm_minmax_ukernel_6x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 12273 } 12274 } 12275 } 12276 TEST(F16_GEMM_MINMAX_6X8__AVX2_BROADCAST,n_gt_8_subtile)12277 TEST(F16_GEMM_MINMAX_6X8__AVX2_BROADCAST, n_gt_8_subtile) { 12278 TEST_REQUIRES_X86_AVX2; 12279 for (uint32_t n = 9; n < 16; n++) { 12280 for (size_t k = 1; k <= 5; k += 2) { 12281 for (uint32_t m = 1; m <= 6; m++) { 12282 GemmMicrokernelTester() 12283 .mr(6) 12284 .nr(8) 12285 .kr(1) 12286 .sr(1) 12287 .m(m) 12288 .n(n) 12289 .k(k) 12290 .iterations(1) 12291 .Test(xnn_f16_gemm_minmax_ukernel_6x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 12292 } 12293 } 12294 } 12295 } 12296 TEST(F16_GEMM_MINMAX_6X8__AVX2_BROADCAST,n_div_8)12297 TEST(F16_GEMM_MINMAX_6X8__AVX2_BROADCAST, n_div_8) { 12298 TEST_REQUIRES_X86_AVX2; 12299 for (uint32_t n = 16; n <= 24; n += 8) { 12300 for (size_t k = 1; k <= 5; k += 2) { 12301 GemmMicrokernelTester() 12302 .mr(6) 12303 .nr(8) 12304 .kr(1) 12305 .sr(1) 12306 .m(6) 12307 .n(n) 12308 .k(k) 12309 .Test(xnn_f16_gemm_minmax_ukernel_6x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 12310 } 12311 } 12312 } 12313 TEST(F16_GEMM_MINMAX_6X8__AVX2_BROADCAST,n_div_8_strided_cn)12314 TEST(F16_GEMM_MINMAX_6X8__AVX2_BROADCAST, n_div_8_strided_cn) { 12315 TEST_REQUIRES_X86_AVX2; 12316 for (uint32_t n = 16; n <= 24; n += 8) { 12317 for (size_t k = 1; k <= 5; k += 2) { 12318 GemmMicrokernelTester() 12319 .mr(6) 12320 .nr(8) 12321 .kr(1) 12322 .sr(1) 12323 .m(6) 12324 .n(n) 12325 .k(k) 12326 .cn_stride(11) 12327 .Test(xnn_f16_gemm_minmax_ukernel_6x8__avx2_broadcast, 
xnn_init_f16_minmax_avx_params); 12328 } 12329 } 12330 } 12331 TEST(F16_GEMM_MINMAX_6X8__AVX2_BROADCAST,n_div_8_strided_a)12332 TEST(F16_GEMM_MINMAX_6X8__AVX2_BROADCAST, n_div_8_strided_a) { 12333 TEST_REQUIRES_X86_AVX2; 12334 for (uint32_t n = 16; n <= 24; n += 8) { 12335 for (size_t k = 1; k <= 5; k += 2) { 12336 GemmMicrokernelTester() 12337 .mr(6) 12338 .nr(8) 12339 .kr(1) 12340 .sr(1) 12341 .m(6) 12342 .n(n) 12343 .k(k) 12344 .a_stride(7) 12345 .Test(xnn_f16_gemm_minmax_ukernel_6x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 12346 } 12347 } 12348 } 12349 TEST(F16_GEMM_MINMAX_6X8__AVX2_BROADCAST,n_div_8_subtile)12350 TEST(F16_GEMM_MINMAX_6X8__AVX2_BROADCAST, n_div_8_subtile) { 12351 TEST_REQUIRES_X86_AVX2; 12352 for (uint32_t n = 16; n <= 24; n += 8) { 12353 for (size_t k = 1; k <= 5; k += 2) { 12354 for (uint32_t m = 1; m <= 6; m++) { 12355 GemmMicrokernelTester() 12356 .mr(6) 12357 .nr(8) 12358 .kr(1) 12359 .sr(1) 12360 .m(m) 12361 .n(n) 12362 .k(k) 12363 .iterations(1) 12364 .Test(xnn_f16_gemm_minmax_ukernel_6x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 12365 } 12366 } 12367 } 12368 } 12369 TEST(F16_GEMM_MINMAX_6X8__AVX2_BROADCAST,strided_cm_subtile)12370 TEST(F16_GEMM_MINMAX_6X8__AVX2_BROADCAST, strided_cm_subtile) { 12371 TEST_REQUIRES_X86_AVX2; 12372 for (size_t k = 1; k <= 5; k += 2) { 12373 for (uint32_t n = 1; n <= 8; n++) { 12374 for (uint32_t m = 1; m <= 6; m++) { 12375 GemmMicrokernelTester() 12376 .mr(6) 12377 .nr(8) 12378 .kr(1) 12379 .sr(1) 12380 .m(m) 12381 .n(n) 12382 .k(k) 12383 .cm_stride(11) 12384 .iterations(1) 12385 .Test(xnn_f16_gemm_minmax_ukernel_6x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 12386 } 12387 } 12388 } 12389 } 12390 TEST(F16_GEMM_MINMAX_6X8__AVX2_BROADCAST,qmin)12391 TEST(F16_GEMM_MINMAX_6X8__AVX2_BROADCAST, qmin) { 12392 TEST_REQUIRES_X86_AVX2; 12393 GemmMicrokernelTester() 12394 .mr(6) 12395 .nr(8) 12396 .kr(1) 12397 .sr(1) 12398 .m(6) 12399 .n(8) 12400 .k(1) 12401 .qmin(128) 12402 
.Test(xnn_f16_gemm_minmax_ukernel_6x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 12403 } 12404 TEST(F16_GEMM_MINMAX_6X8__AVX2_BROADCAST,qmax)12405 TEST(F16_GEMM_MINMAX_6X8__AVX2_BROADCAST, qmax) { 12406 TEST_REQUIRES_X86_AVX2; 12407 GemmMicrokernelTester() 12408 .mr(6) 12409 .nr(8) 12410 .kr(1) 12411 .sr(1) 12412 .m(6) 12413 .n(8) 12414 .k(1) 12415 .qmax(128) 12416 .Test(xnn_f16_gemm_minmax_ukernel_6x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 12417 } 12418 TEST(F16_GEMM_MINMAX_6X8__AVX2_BROADCAST,strided_cm)12419 TEST(F16_GEMM_MINMAX_6X8__AVX2_BROADCAST, strided_cm) { 12420 TEST_REQUIRES_X86_AVX2; 12421 GemmMicrokernelTester() 12422 .mr(6) 12423 .nr(8) 12424 .kr(1) 12425 .sr(1) 12426 .m(6) 12427 .n(8) 12428 .k(1) 12429 .cm_stride(11) 12430 .Test(xnn_f16_gemm_minmax_ukernel_6x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 12431 } 12432 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 12433 12434 12435 #if XNN_ARCH_X86 || XNN_ARCH_X86_64 TEST(F16_GEMM_MINMAX_7X8__AVX2_BROADCAST,k_eq_1)12436 TEST(F16_GEMM_MINMAX_7X8__AVX2_BROADCAST, k_eq_1) { 12437 TEST_REQUIRES_X86_AVX2; 12438 GemmMicrokernelTester() 12439 .mr(7) 12440 .nr(8) 12441 .kr(1) 12442 .sr(1) 12443 .m(7) 12444 .n(8) 12445 .k(1) 12446 .Test(xnn_f16_gemm_minmax_ukernel_7x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 12447 } 12448 TEST(F16_GEMM_MINMAX_7X8__AVX2_BROADCAST,strided_cn)12449 TEST(F16_GEMM_MINMAX_7X8__AVX2_BROADCAST, strided_cn) { 12450 TEST_REQUIRES_X86_AVX2; 12451 GemmMicrokernelTester() 12452 .mr(7) 12453 .nr(8) 12454 .kr(1) 12455 .sr(1) 12456 .m(7) 12457 .n(8) 12458 .k(1) 12459 .cn_stride(11) 12460 .Test(xnn_f16_gemm_minmax_ukernel_7x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 12461 } 12462 TEST(F16_GEMM_MINMAX_7X8__AVX2_BROADCAST,k_eq_1_strided_a)12463 TEST(F16_GEMM_MINMAX_7X8__AVX2_BROADCAST, k_eq_1_strided_a) { 12464 TEST_REQUIRES_X86_AVX2; 12465 GemmMicrokernelTester() 12466 .mr(7) 12467 .nr(8) 12468 .kr(1) 12469 .sr(1) 12470 .m(7) 12471 .n(8) 12472 .k(1) 12473 
.a_stride(3) 12474 .Test(xnn_f16_gemm_minmax_ukernel_7x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 12475 } 12476 TEST(F16_GEMM_MINMAX_7X8__AVX2_BROADCAST,k_eq_1_subtile)12477 TEST(F16_GEMM_MINMAX_7X8__AVX2_BROADCAST, k_eq_1_subtile) { 12478 TEST_REQUIRES_X86_AVX2; 12479 for (uint32_t n = 1; n <= 8; n++) { 12480 for (uint32_t m = 1; m <= 7; m++) { 12481 GemmMicrokernelTester() 12482 .mr(7) 12483 .nr(8) 12484 .kr(1) 12485 .sr(1) 12486 .m(m) 12487 .n(n) 12488 .k(1) 12489 .iterations(1) 12490 .Test(xnn_f16_gemm_minmax_ukernel_7x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 12491 } 12492 } 12493 } 12494 TEST(F16_GEMM_MINMAX_7X8__AVX2_BROADCAST,k_eq_1_subtile_m)12495 TEST(F16_GEMM_MINMAX_7X8__AVX2_BROADCAST, k_eq_1_subtile_m) { 12496 TEST_REQUIRES_X86_AVX2; 12497 for (uint32_t m = 1; m <= 7; m++) { 12498 GemmMicrokernelTester() 12499 .mr(7) 12500 .nr(8) 12501 .kr(1) 12502 .sr(1) 12503 .m(m) 12504 .n(8) 12505 .k(1) 12506 .iterations(1) 12507 .Test(xnn_f16_gemm_minmax_ukernel_7x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 12508 } 12509 } 12510 TEST(F16_GEMM_MINMAX_7X8__AVX2_BROADCAST,k_eq_1_subtile_n)12511 TEST(F16_GEMM_MINMAX_7X8__AVX2_BROADCAST, k_eq_1_subtile_n) { 12512 TEST_REQUIRES_X86_AVX2; 12513 for (uint32_t n = 1; n <= 8; n++) { 12514 GemmMicrokernelTester() 12515 .mr(7) 12516 .nr(8) 12517 .kr(1) 12518 .sr(1) 12519 .m(7) 12520 .n(n) 12521 .k(1) 12522 .iterations(1) 12523 .Test(xnn_f16_gemm_minmax_ukernel_7x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 12524 } 12525 } 12526 TEST(F16_GEMM_MINMAX_7X8__AVX2_BROADCAST,k_gt_1)12527 TEST(F16_GEMM_MINMAX_7X8__AVX2_BROADCAST, k_gt_1) { 12528 TEST_REQUIRES_X86_AVX2; 12529 for (size_t k = 2; k < 10; k++) { 12530 GemmMicrokernelTester() 12531 .mr(7) 12532 .nr(8) 12533 .kr(1) 12534 .sr(1) 12535 .m(7) 12536 .n(8) 12537 .k(k) 12538 .Test(xnn_f16_gemm_minmax_ukernel_7x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 12539 } 12540 } 12541 TEST(F16_GEMM_MINMAX_7X8__AVX2_BROADCAST,k_gt_1_strided_a)12542 
// F16 GEMM min/max, 7x8 AVX2 broadcast microkernel: k > 1 and n > 8 cases.
// Every test first checks TEST_REQUIRES_X86_AVX2.

// k = 2..9 with m == mr and n == nr, and a_stride set to 11.
TEST(F16_GEMM_MINMAX_7X8__AVX2_BROADCAST, k_gt_1_strided_a) {
  TEST_REQUIRES_X86_AVX2;
  for (size_t k = 2; k < 10; k++) {
    GemmMicrokernelTester()
      .mr(7)
      .nr(8)
      .kr(1)
      .sr(1)
      .m(7)
      .n(8)
      .k(k)
      .a_stride(11)
      .Test(xnn_f16_gemm_minmax_ukernel_7x8__avx2_broadcast, xnn_init_f16_minmax_avx_params);
  }
}

// k = 2..9 over every (m, n) with 1 <= m <= 7 and 1 <= n <= 8, one iteration each.
TEST(F16_GEMM_MINMAX_7X8__AVX2_BROADCAST, k_gt_1_subtile) {
  TEST_REQUIRES_X86_AVX2;
  for (size_t k = 2; k < 10; k++) {
    for (uint32_t n = 1; n <= 8; n++) {
      for (uint32_t m = 1; m <= 7; m++) {
        GemmMicrokernelTester()
          .mr(7)
          .nr(8)
          .kr(1)
          .sr(1)
          .m(m)
          .n(n)
          .k(k)
          .iterations(1)
          .Test(xnn_f16_gemm_minmax_ukernel_7x8__avx2_broadcast, xnn_init_f16_minmax_avx_params);
      }
    }
  }
}

// n = 9..15 (above nr, below 2 * nr) with k in {1, 3, 5}.
TEST(F16_GEMM_MINMAX_7X8__AVX2_BROADCAST, n_gt_8) {
  TEST_REQUIRES_X86_AVX2;
  for (uint32_t n = 9; n < 16; n++) {
    for (size_t k = 1; k <= 5; k += 2) {
      GemmMicrokernelTester()
        .mr(7)
        .nr(8)
        .kr(1)
        .sr(1)
        .m(7)
        .n(n)
        .k(k)
        .Test(xnn_f16_gemm_minmax_ukernel_7x8__avx2_broadcast, xnn_init_f16_minmax_avx_params);
    }
  }
}

// Same n and k sweep as n_gt_8, with cn_stride set to 11.
TEST(F16_GEMM_MINMAX_7X8__AVX2_BROADCAST, n_gt_8_strided_cn) {
  TEST_REQUIRES_X86_AVX2;
  for (uint32_t n = 9; n < 16; n++) {
    for (size_t k = 1; k <= 5; k += 2) {
      GemmMicrokernelTester()
        .mr(7)
        .nr(8)
        .kr(1)
        .sr(1)
        .m(7)
        .n(n)
        .k(k)
        .cn_stride(11)
        .Test(xnn_f16_gemm_minmax_ukernel_7x8__avx2_broadcast, xnn_init_f16_minmax_avx_params);
    }
  }
}
// F16 GEMM min/max, 7x8 AVX2 broadcast microkernel: n > 8 (a_stride, subtile)
// and n divisible by 8 cases. Every test first checks TEST_REQUIRES_X86_AVX2.

// n = 9..15 with k in {1, 3, 5}, and a_stride set to 7.
TEST(F16_GEMM_MINMAX_7X8__AVX2_BROADCAST, n_gt_8_strided_a) {
  TEST_REQUIRES_X86_AVX2;
  for (uint32_t n = 9; n < 16; n++) {
    for (size_t k = 1; k <= 5; k += 2) {
      GemmMicrokernelTester()
        .mr(7)
        .nr(8)
        .kr(1)
        .sr(1)
        .m(7)
        .n(n)
        .k(k)
        .a_stride(7)
        .Test(xnn_f16_gemm_minmax_ukernel_7x8__avx2_broadcast, xnn_init_f16_minmax_avx_params);
    }
  }
}

// n = 9..15, k in {1, 3, 5}, every m in 1..7, one iteration each.
TEST(F16_GEMM_MINMAX_7X8__AVX2_BROADCAST, n_gt_8_subtile) {
  TEST_REQUIRES_X86_AVX2;
  for (uint32_t n = 9; n < 16; n++) {
    for (size_t k = 1; k <= 5; k += 2) {
      for (uint32_t m = 1; m <= 7; m++) {
        GemmMicrokernelTester()
          .mr(7)
          .nr(8)
          .kr(1)
          .sr(1)
          .m(m)
          .n(n)
          .k(k)
          .iterations(1)
          .Test(xnn_f16_gemm_minmax_ukernel_7x8__avx2_broadcast, xnn_init_f16_minmax_avx_params);
      }
    }
  }
}

// n in {16, 24} (multiples of nr) with k in {1, 3, 5}.
TEST(F16_GEMM_MINMAX_7X8__AVX2_BROADCAST, n_div_8) {
  TEST_REQUIRES_X86_AVX2;
  for (uint32_t n = 16; n <= 24; n += 8) {
    for (size_t k = 1; k <= 5; k += 2) {
      GemmMicrokernelTester()
        .mr(7)
        .nr(8)
        .kr(1)
        .sr(1)
        .m(7)
        .n(n)
        .k(k)
        .Test(xnn_f16_gemm_minmax_ukernel_7x8__avx2_broadcast, xnn_init_f16_minmax_avx_params);
    }
  }
}

// Same sweep as n_div_8, with cn_stride set to 11.
TEST(F16_GEMM_MINMAX_7X8__AVX2_BROADCAST, n_div_8_strided_cn) {
  TEST_REQUIRES_X86_AVX2;
  for (uint32_t n = 16; n <= 24; n += 8) {
    for (size_t k = 1; k <= 5; k += 2) {
      GemmMicrokernelTester()
        .mr(7)
        .nr(8)
        .kr(1)
        .sr(1)
        .m(7)
        .n(n)
        .k(k)
        .cn_stride(11)
        .Test(xnn_f16_gemm_minmax_ukernel_7x8__avx2_broadcast, xnn_init_f16_minmax_avx_params);
    }
  }
}
TEST(F16_GEMM_MINMAX_7X8__AVX2_BROADCAST,n_div_8_strided_a)12686 TEST(F16_GEMM_MINMAX_7X8__AVX2_BROADCAST, n_div_8_strided_a) { 12687 TEST_REQUIRES_X86_AVX2; 12688 for (uint32_t n = 16; n <= 24; n += 8) { 12689 for (size_t k = 1; k <= 5; k += 2) { 12690 GemmMicrokernelTester() 12691 .mr(7) 12692 .nr(8) 12693 .kr(1) 12694 .sr(1) 12695 .m(7) 12696 .n(n) 12697 .k(k) 12698 .a_stride(7) 12699 .Test(xnn_f16_gemm_minmax_ukernel_7x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 12700 } 12701 } 12702 } 12703 TEST(F16_GEMM_MINMAX_7X8__AVX2_BROADCAST,n_div_8_subtile)12704 TEST(F16_GEMM_MINMAX_7X8__AVX2_BROADCAST, n_div_8_subtile) { 12705 TEST_REQUIRES_X86_AVX2; 12706 for (uint32_t n = 16; n <= 24; n += 8) { 12707 for (size_t k = 1; k <= 5; k += 2) { 12708 for (uint32_t m = 1; m <= 7; m++) { 12709 GemmMicrokernelTester() 12710 .mr(7) 12711 .nr(8) 12712 .kr(1) 12713 .sr(1) 12714 .m(m) 12715 .n(n) 12716 .k(k) 12717 .iterations(1) 12718 .Test(xnn_f16_gemm_minmax_ukernel_7x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 12719 } 12720 } 12721 } 12722 } 12723 TEST(F16_GEMM_MINMAX_7X8__AVX2_BROADCAST,strided_cm_subtile)12724 TEST(F16_GEMM_MINMAX_7X8__AVX2_BROADCAST, strided_cm_subtile) { 12725 TEST_REQUIRES_X86_AVX2; 12726 for (size_t k = 1; k <= 5; k += 2) { 12727 for (uint32_t n = 1; n <= 8; n++) { 12728 for (uint32_t m = 1; m <= 7; m++) { 12729 GemmMicrokernelTester() 12730 .mr(7) 12731 .nr(8) 12732 .kr(1) 12733 .sr(1) 12734 .m(m) 12735 .n(n) 12736 .k(k) 12737 .cm_stride(11) 12738 .iterations(1) 12739 .Test(xnn_f16_gemm_minmax_ukernel_7x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 12740 } 12741 } 12742 } 12743 } 12744 TEST(F16_GEMM_MINMAX_7X8__AVX2_BROADCAST,qmin)12745 TEST(F16_GEMM_MINMAX_7X8__AVX2_BROADCAST, qmin) { 12746 TEST_REQUIRES_X86_AVX2; 12747 GemmMicrokernelTester() 12748 .mr(7) 12749 .nr(8) 12750 .kr(1) 12751 .sr(1) 12752 .m(7) 12753 .n(8) 12754 .k(1) 12755 .qmin(128) 12756 .Test(xnn_f16_gemm_minmax_ukernel_7x8__avx2_broadcast, 
xnn_init_f16_minmax_avx_params); 12757 } 12758 TEST(F16_GEMM_MINMAX_7X8__AVX2_BROADCAST,qmax)12759 TEST(F16_GEMM_MINMAX_7X8__AVX2_BROADCAST, qmax) { 12760 TEST_REQUIRES_X86_AVX2; 12761 GemmMicrokernelTester() 12762 .mr(7) 12763 .nr(8) 12764 .kr(1) 12765 .sr(1) 12766 .m(7) 12767 .n(8) 12768 .k(1) 12769 .qmax(128) 12770 .Test(xnn_f16_gemm_minmax_ukernel_7x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 12771 } 12772 TEST(F16_GEMM_MINMAX_7X8__AVX2_BROADCAST,strided_cm)12773 TEST(F16_GEMM_MINMAX_7X8__AVX2_BROADCAST, strided_cm) { 12774 TEST_REQUIRES_X86_AVX2; 12775 GemmMicrokernelTester() 12776 .mr(7) 12777 .nr(8) 12778 .kr(1) 12779 .sr(1) 12780 .m(7) 12781 .n(8) 12782 .k(1) 12783 .cm_stride(11) 12784 .Test(xnn_f16_gemm_minmax_ukernel_7x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 12785 } 12786 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 12787