1 // Copyright (c) Facebook, Inc. and its affiliates.
2 // All rights reserved.
3 //
4 // Copyright 2019 Google LLC
5 //
6 // This source code is licensed under the BSD-style license found in the
7 // LICENSE file in the root directory of this source tree.
8 //
9 // Auto-generated file. Do not edit!
10 // Specification: test/qu8-gemm-minmax-fp32.yaml
11 // Generator: tools/generate-gemm-test.py
12
13
14 #include <gtest/gtest.h>
15
16 #include <xnnpack/allocator.h>
17 #include <xnnpack/common.h>
18 #include <xnnpack/isa-checks.h>
19 #include <xnnpack/microparams-init.h>
20
21 #include <xnnpack/gemm.h>
22 #include <xnnpack/igemm.h>
23 #include <xnnpack/ppmm.h>
24 #include "gemm-microkernel-tester.h"
25
26
27 #if XNN_ARCH_ARM
// One full tile (m=1, n=2) with k=4.
TEST(QU8_GEMM_MINMAX_FP32_1X2C4__ARMSIMD32, k_eq_4) {
  TEST_REQUIRES_ARM_SIMD32;
  GemmMicrokernelTester()
    .mr(1)
    .nr(2)
    .kr(4)
    .sr(1)
    .m(1)
    .n(2)
    .k(4)
    .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x2c4__armsimd32, xnn_init_qu8_conv_minmax_fp32_armsimd32_params, xnn_qu8_requantize_fp32);
}
40
// One full tile (m=1, n=2, k=4) with cn_stride set to 5.
TEST(QU8_GEMM_MINMAX_FP32_1X2C4__ARMSIMD32, strided_cn) {
  TEST_REQUIRES_ARM_SIMD32;
  GemmMicrokernelTester()
    .mr(1)
    .nr(2)
    .kr(4)
    .sr(1)
    .m(1)
    .n(2)
    .k(4)
    .cn_stride(5)
    .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x2c4__armsimd32, xnn_init_qu8_conv_minmax_fp32_armsimd32_params, xnn_qu8_requantize_fp32);
}
54
// One full tile (m=1, n=2, k=4) with a_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_1X2C4__ARMSIMD32, k_eq_4_strided_a) {
  TEST_REQUIRES_ARM_SIMD32;
  GemmMicrokernelTester()
    .mr(1)
    .nr(2)
    .kr(4)
    .sr(1)
    .m(1)
    .n(2)
    .k(4)
    .a_stride(7)
    .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x2c4__armsimd32, xnn_init_qu8_conv_minmax_fp32_armsimd32_params, xnn_qu8_requantize_fp32);
}
68
// k=4 for every (m, n) with m in [1, 1] and n in [1, 2]; one iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_1X2C4__ARMSIMD32, k_eq_4_subtile) {
  TEST_REQUIRES_ARM_SIMD32;
  for (uint32_t n = 1; n <= 2; n++) {
    for (uint32_t m = 1; m <= 1; m++) {
      GemmMicrokernelTester()
        .mr(1)
        .nr(2)
        .kr(4)
        .sr(1)
        .m(m)
        .n(n)
        .k(4)
        .iterations(1)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x2c4__armsimd32, xnn_init_qu8_conv_minmax_fp32_armsimd32_params, xnn_qu8_requantize_fp32);
    }
  }
}
86
// k=4 and n=2 while m sweeps [1, 1]; one iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_1X2C4__ARMSIMD32, k_eq_4_subtile_m) {
  TEST_REQUIRES_ARM_SIMD32;
  for (uint32_t m = 1; m <= 1; m++) {
    GemmMicrokernelTester()
      .mr(1)
      .nr(2)
      .kr(4)
      .sr(1)
      .m(m)
      .n(2)
      .k(4)
      .iterations(1)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x2c4__armsimd32, xnn_init_qu8_conv_minmax_fp32_armsimd32_params, xnn_qu8_requantize_fp32);
  }
}
102
// k=4 and m=1 while n sweeps [1, 2]; one iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_1X2C4__ARMSIMD32, k_eq_4_subtile_n) {
  TEST_REQUIRES_ARM_SIMD32;
  for (uint32_t n = 1; n <= 2; n++) {
    GemmMicrokernelTester()
      .mr(1)
      .nr(2)
      .kr(4)
      .sr(1)
      .m(1)
      .n(n)
      .k(4)
      .iterations(1)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x2c4__armsimd32, xnn_init_qu8_conv_minmax_fp32_armsimd32_params, xnn_qu8_requantize_fp32);
  }
}
118
// Full tile (m=1, n=2) for every k in [1, 4).
TEST(QU8_GEMM_MINMAX_FP32_1X2C4__ARMSIMD32, k_lt_4) {
  TEST_REQUIRES_ARM_SIMD32;
  for (size_t k = 1; k < 4; k++) {
    GemmMicrokernelTester()
      .mr(1)
      .nr(2)
      .kr(4)
      .sr(1)
      .m(1)
      .n(2)
      .k(k)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x2c4__armsimd32, xnn_init_qu8_conv_minmax_fp32_armsimd32_params, xnn_qu8_requantize_fp32);
  }
}
133
// Full tile (m=1, n=2) for every k in [1, 4) with a_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_1X2C4__ARMSIMD32, k_lt_4_strided_a) {
  TEST_REQUIRES_ARM_SIMD32;
  for (size_t k = 1; k < 4; k++) {
    GemmMicrokernelTester()
      .mr(1)
      .nr(2)
      .kr(4)
      .sr(1)
      .m(1)
      .n(2)
      .k(k)
      .a_stride(7)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x2c4__armsimd32, xnn_init_qu8_conv_minmax_fp32_armsimd32_params, xnn_qu8_requantize_fp32);
  }
}
149
// Every k in [1, 4) crossed with m in [1, 1] and n in [1, 2]; one iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_1X2C4__ARMSIMD32, k_lt_4_subtile) {
  TEST_REQUIRES_ARM_SIMD32;
  for (size_t k = 1; k < 4; k++) {
    for (uint32_t n = 1; n <= 2; n++) {
      for (uint32_t m = 1; m <= 1; m++) {
        GemmMicrokernelTester()
          .mr(1)
          .nr(2)
          .kr(4)
          .sr(1)
          .m(m)
          .n(n)
          .k(k)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x2c4__armsimd32, xnn_init_qu8_conv_minmax_fp32_armsimd32_params, xnn_qu8_requantize_fp32);
      }
    }
  }
}
169
// Full tile (m=1, n=2) for every k in [5, 8).
TEST(QU8_GEMM_MINMAX_FP32_1X2C4__ARMSIMD32, k_gt_4) {
  TEST_REQUIRES_ARM_SIMD32;
  for (size_t k = 5; k < 8; k++) {
    GemmMicrokernelTester()
      .mr(1)
      .nr(2)
      .kr(4)
      .sr(1)
      .m(1)
      .n(2)
      .k(k)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x2c4__armsimd32, xnn_init_qu8_conv_minmax_fp32_armsimd32_params, xnn_qu8_requantize_fp32);
  }
}
184
// Full tile (m=1, n=2) for every k in [5, 8) with a_stride set to 11.
TEST(QU8_GEMM_MINMAX_FP32_1X2C4__ARMSIMD32, k_gt_4_strided_a) {
  TEST_REQUIRES_ARM_SIMD32;
  for (size_t k = 5; k < 8; k++) {
    GemmMicrokernelTester()
      .mr(1)
      .nr(2)
      .kr(4)
      .sr(1)
      .m(1)
      .n(2)
      .k(k)
      .a_stride(11)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x2c4__armsimd32, xnn_init_qu8_conv_minmax_fp32_armsimd32_params, xnn_qu8_requantize_fp32);
  }
}
200
// Every k in [5, 8) crossed with m in [1, 1] and n in [1, 2]; one iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_1X2C4__ARMSIMD32, k_gt_4_subtile) {
  TEST_REQUIRES_ARM_SIMD32;
  for (size_t k = 5; k < 8; k++) {
    for (uint32_t n = 1; n <= 2; n++) {
      for (uint32_t m = 1; m <= 1; m++) {
        GemmMicrokernelTester()
          .mr(1)
          .nr(2)
          .kr(4)
          .sr(1)
          .m(m)
          .n(n)
          .k(k)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x2c4__armsimd32, xnn_init_qu8_conv_minmax_fp32_armsimd32_params, xnn_qu8_requantize_fp32);
      }
    }
  }
}
220
// Full tile (m=1, n=2) for k = 8, 12, ..., 40 (multiples of 4).
TEST(QU8_GEMM_MINMAX_FP32_1X2C4__ARMSIMD32, k_div_4) {
  TEST_REQUIRES_ARM_SIMD32;
  for (size_t k = 8; k <= 40; k += 4) {
    GemmMicrokernelTester()
      .mr(1)
      .nr(2)
      .kr(4)
      .sr(1)
      .m(1)
      .n(2)
      .k(k)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x2c4__armsimd32, xnn_init_qu8_conv_minmax_fp32_armsimd32_params, xnn_qu8_requantize_fp32);
  }
}
235
// Full tile (m=1, n=2) for k = 8, 12, ..., 40 with a_stride set to 43.
TEST(QU8_GEMM_MINMAX_FP32_1X2C4__ARMSIMD32, k_div_4_strided_a) {
  TEST_REQUIRES_ARM_SIMD32;
  for (size_t k = 8; k <= 40; k += 4) {
    GemmMicrokernelTester()
      .mr(1)
      .nr(2)
      .kr(4)
      .sr(1)
      .m(1)
      .n(2)
      .k(k)
      .a_stride(43)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x2c4__armsimd32, xnn_init_qu8_conv_minmax_fp32_armsimd32_params, xnn_qu8_requantize_fp32);
  }
}
251
// k = 8, 12, ..., 40 crossed with m in [1, 1] and n in [1, 2]; one iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_1X2C4__ARMSIMD32, k_div_4_subtile) {
  TEST_REQUIRES_ARM_SIMD32;
  for (size_t k = 8; k <= 40; k += 4) {
    for (uint32_t n = 1; n <= 2; n++) {
      for (uint32_t m = 1; m <= 1; m++) {
        GemmMicrokernelTester()
          .mr(1)
          .nr(2)
          .kr(4)
          .sr(1)
          .m(m)
          .n(n)
          .k(k)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x2c4__armsimd32, xnn_init_qu8_conv_minmax_fp32_armsimd32_params, xnn_qu8_requantize_fp32);
      }
    }
  }
}
271
// n in [3, 4) (greater than nr=2) with m=1, for k = 1, 6, 11, 16.
TEST(QU8_GEMM_MINMAX_FP32_1X2C4__ARMSIMD32, n_gt_2) {
  TEST_REQUIRES_ARM_SIMD32;
  for (uint32_t n = 3; n < 4; n++) {
    for (size_t k = 1; k <= 20; k += 5) {
      GemmMicrokernelTester()
        .mr(1)
        .nr(2)
        .kr(4)
        .sr(1)
        .m(1)
        .n(n)
        .k(k)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x2c4__armsimd32, xnn_init_qu8_conv_minmax_fp32_armsimd32_params, xnn_qu8_requantize_fp32);
    }
  }
}
288
// n in [3, 4) with m=1, for k = 1, 6, 11, 16, with cn_stride set to 5.
TEST(QU8_GEMM_MINMAX_FP32_1X2C4__ARMSIMD32, n_gt_2_strided_cn) {
  TEST_REQUIRES_ARM_SIMD32;
  for (uint32_t n = 3; n < 4; n++) {
    for (size_t k = 1; k <= 20; k += 5) {
      GemmMicrokernelTester()
        .mr(1)
        .nr(2)
        .kr(4)
        .sr(1)
        .m(1)
        .n(n)
        .k(k)
        .cn_stride(5)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x2c4__armsimd32, xnn_init_qu8_conv_minmax_fp32_armsimd32_params, xnn_qu8_requantize_fp32);
    }
  }
}
306
// n in [3, 4) with m=1, for k = 1, 6, 11, 16, with a_stride set to 23.
TEST(QU8_GEMM_MINMAX_FP32_1X2C4__ARMSIMD32, n_gt_2_strided_a) {
  TEST_REQUIRES_ARM_SIMD32;
  for (uint32_t n = 3; n < 4; n++) {
    for (size_t k = 1; k <= 20; k += 5) {
      GemmMicrokernelTester()
        .mr(1)
        .nr(2)
        .kr(4)
        .sr(1)
        .m(1)
        .n(n)
        .k(k)
        .a_stride(23)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x2c4__armsimd32, xnn_init_qu8_conv_minmax_fp32_armsimd32_params, xnn_qu8_requantize_fp32);
    }
  }
}
324
// n in [3, 4) crossed with k = 1, 6, 11, 16 and m in [1, 1]; one iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_1X2C4__ARMSIMD32, n_gt_2_subtile) {
  TEST_REQUIRES_ARM_SIMD32;
  for (uint32_t n = 3; n < 4; n++) {
    for (size_t k = 1; k <= 20; k += 5) {
      for (uint32_t m = 1; m <= 1; m++) {
        GemmMicrokernelTester()
          .mr(1)
          .nr(2)
          .kr(4)
          .sr(1)
          .m(m)
          .n(n)
          .k(k)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x2c4__armsimd32, xnn_init_qu8_conv_minmax_fp32_armsimd32_params, xnn_qu8_requantize_fp32);
      }
    }
  }
}
344
// n = 4, 6 (multiples of nr=2) with m=1, for k = 1, 6, 11, 16.
TEST(QU8_GEMM_MINMAX_FP32_1X2C4__ARMSIMD32, n_div_2) {
  TEST_REQUIRES_ARM_SIMD32;
  for (uint32_t n = 4; n <= 6; n += 2) {
    for (size_t k = 1; k <= 20; k += 5) {
      GemmMicrokernelTester()
        .mr(1)
        .nr(2)
        .kr(4)
        .sr(1)
        .m(1)
        .n(n)
        .k(k)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x2c4__armsimd32, xnn_init_qu8_conv_minmax_fp32_armsimd32_params, xnn_qu8_requantize_fp32);
    }
  }
}
361
// n = 4, 6 with m=1, for k = 1, 6, 11, 16, with cn_stride set to 5.
TEST(QU8_GEMM_MINMAX_FP32_1X2C4__ARMSIMD32, n_div_2_strided_cn) {
  TEST_REQUIRES_ARM_SIMD32;
  for (uint32_t n = 4; n <= 6; n += 2) {
    for (size_t k = 1; k <= 20; k += 5) {
      GemmMicrokernelTester()
        .mr(1)
        .nr(2)
        .kr(4)
        .sr(1)
        .m(1)
        .n(n)
        .k(k)
        .cn_stride(5)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x2c4__armsimd32, xnn_init_qu8_conv_minmax_fp32_armsimd32_params, xnn_qu8_requantize_fp32);
    }
  }
}
379
// n = 4, 6 with m=1, for k = 1, 6, 11, 16, with a_stride set to 23.
TEST(QU8_GEMM_MINMAX_FP32_1X2C4__ARMSIMD32, n_div_2_strided_a) {
  TEST_REQUIRES_ARM_SIMD32;
  for (uint32_t n = 4; n <= 6; n += 2) {
    for (size_t k = 1; k <= 20; k += 5) {
      GemmMicrokernelTester()
        .mr(1)
        .nr(2)
        .kr(4)
        .sr(1)
        .m(1)
        .n(n)
        .k(k)
        .a_stride(23)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x2c4__armsimd32, xnn_init_qu8_conv_minmax_fp32_armsimd32_params, xnn_qu8_requantize_fp32);
    }
  }
}
397
// n = 4, 6 crossed with k = 1, 6, 11, 16 and m in [1, 1]; one iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_1X2C4__ARMSIMD32, n_div_2_subtile) {
  TEST_REQUIRES_ARM_SIMD32;
  for (uint32_t n = 4; n <= 6; n += 2) {
    for (size_t k = 1; k <= 20; k += 5) {
      for (uint32_t m = 1; m <= 1; m++) {
        GemmMicrokernelTester()
          .mr(1)
          .nr(2)
          .kr(4)
          .sr(1)
          .m(m)
          .n(n)
          .k(k)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x2c4__armsimd32, xnn_init_qu8_conv_minmax_fp32_armsimd32_params, xnn_qu8_requantize_fp32);
      }
    }
  }
}
417
// k = 1, 6, 11, 16 crossed with n in [1, 2] and m in [1, 1], with cm_stride set to 5;
// one iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_1X2C4__ARMSIMD32, strided_cm_subtile) {
  TEST_REQUIRES_ARM_SIMD32;
  for (size_t k = 1; k <= 20; k += 5) {
    for (uint32_t n = 1; n <= 2; n++) {
      for (uint32_t m = 1; m <= 1; m++) {
        GemmMicrokernelTester()
          .mr(1)
          .nr(2)
          .kr(4)
          .sr(1)
          .m(m)
          .n(n)
          .k(k)
          .cm_stride(5)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x2c4__armsimd32, xnn_init_qu8_conv_minmax_fp32_armsimd32_params, xnn_qu8_requantize_fp32);
      }
    }
  }
}
438
// One full tile (m=1, n=2, k=4) with qmin set to 128.
TEST(QU8_GEMM_MINMAX_FP32_1X2C4__ARMSIMD32, qmin) {
  TEST_REQUIRES_ARM_SIMD32;
  GemmMicrokernelTester()
    .mr(1)
    .nr(2)
    .kr(4)
    .sr(1)
    .m(1)
    .n(2)
    .k(4)
    .qmin(128)
    .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x2c4__armsimd32, xnn_init_qu8_conv_minmax_fp32_armsimd32_params, xnn_qu8_requantize_fp32);
}
452
// One full tile (m=1, n=2, k=4) with qmax set to 128.
TEST(QU8_GEMM_MINMAX_FP32_1X2C4__ARMSIMD32, qmax) {
  TEST_REQUIRES_ARM_SIMD32;
  GemmMicrokernelTester()
    .mr(1)
    .nr(2)
    .kr(4)
    .sr(1)
    .m(1)
    .n(2)
    .k(4)
    .qmax(128)
    .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x2c4__armsimd32, xnn_init_qu8_conv_minmax_fp32_armsimd32_params, xnn_qu8_requantize_fp32);
}
466
// One full tile (m=1, n=2, k=4) with cm_stride set to 5.
TEST(QU8_GEMM_MINMAX_FP32_1X2C4__ARMSIMD32, strided_cm) {
  TEST_REQUIRES_ARM_SIMD32;
  GemmMicrokernelTester()
    .mr(1)
    .nr(2)
    .kr(4)
    .sr(1)
    .m(1)
    .n(2)
    .k(4)
    .cm_stride(5)
    .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x2c4__armsimd32, xnn_init_qu8_conv_minmax_fp32_armsimd32_params, xnn_qu8_requantize_fp32);
}
480
// Full tile (m=1, n=2) for k = 1, 6, 11, 16 with a_zero_point set to 0.
TEST(QU8_GEMM_MINMAX_FP32_1X2C4__ARMSIMD32, no_a_zero_point) {
  TEST_REQUIRES_ARM_SIMD32;
  for (size_t k = 1; k <= 20; k += 5) {
    GemmMicrokernelTester()
      .mr(1)
      .nr(2)
      .kr(4)
      .sr(1)
      .m(1)
      .n(2)
      .k(k)
      .a_zero_point(0)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x2c4__armsimd32, xnn_init_qu8_conv_minmax_fp32_armsimd32_params, xnn_qu8_requantize_fp32);
  }
}
496
// Full tile (m=1, n=2) for k = 1, 6, 11, 16 with b_zero_point set to 0.
TEST(QU8_GEMM_MINMAX_FP32_1X2C4__ARMSIMD32, no_b_zero_point) {
  TEST_REQUIRES_ARM_SIMD32;
  for (size_t k = 1; k <= 20; k += 5) {
    GemmMicrokernelTester()
      .mr(1)
      .nr(2)
      .kr(4)
      .sr(1)
      .m(1)
      .n(2)
      .k(k)
      .b_zero_point(0)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x2c4__armsimd32, xnn_init_qu8_conv_minmax_fp32_armsimd32_params, xnn_qu8_requantize_fp32);
  }
}
512
// Full tile (m=1, n=2) for k = 1, 6, 11, 16 with both a_zero_point and b_zero_point set to 0.
TEST(QU8_GEMM_MINMAX_FP32_1X2C4__ARMSIMD32, no_zero_point) {
  TEST_REQUIRES_ARM_SIMD32;
  for (size_t k = 1; k <= 20; k += 5) {
    GemmMicrokernelTester()
      .mr(1)
      .nr(2)
      .kr(4)
      .sr(1)
      .m(1)
      .n(2)
      .k(k)
      .a_zero_point(0)
      .b_zero_point(0)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x2c4__armsimd32, xnn_init_qu8_conv_minmax_fp32_armsimd32_params, xnn_qu8_requantize_fp32);
  }
}
529 #endif // XNN_ARCH_ARM
530
531
532 #if XNN_ARCH_ARM
// One full tile (m=2, n=2) with k=4.
TEST(QU8_GEMM_MINMAX_FP32_2X2C4__ARMSIMD32, k_eq_4) {
  TEST_REQUIRES_ARM_SIMD32;
  GemmMicrokernelTester()
    .mr(2)
    .nr(2)
    .kr(4)
    .sr(1)
    .m(2)
    .n(2)
    .k(4)
    .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x2c4__armsimd32, xnn_init_qu8_conv_minmax_fp32_armsimd32_params, xnn_qu8_requantize_fp32);
}
545
// One full tile (m=2, n=2, k=4) with cn_stride set to 5.
TEST(QU8_GEMM_MINMAX_FP32_2X2C4__ARMSIMD32, strided_cn) {
  TEST_REQUIRES_ARM_SIMD32;
  GemmMicrokernelTester()
    .mr(2)
    .nr(2)
    .kr(4)
    .sr(1)
    .m(2)
    .n(2)
    .k(4)
    .cn_stride(5)
    .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x2c4__armsimd32, xnn_init_qu8_conv_minmax_fp32_armsimd32_params, xnn_qu8_requantize_fp32);
}
559
// One full tile (m=2, n=2, k=4) with a_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_2X2C4__ARMSIMD32, k_eq_4_strided_a) {
  TEST_REQUIRES_ARM_SIMD32;
  GemmMicrokernelTester()
    .mr(2)
    .nr(2)
    .kr(4)
    .sr(1)
    .m(2)
    .n(2)
    .k(4)
    .a_stride(7)
    .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x2c4__armsimd32, xnn_init_qu8_conv_minmax_fp32_armsimd32_params, xnn_qu8_requantize_fp32);
}
573
// k=4 for every (m, n) with m in [1, 2] and n in [1, 2]; one iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_2X2C4__ARMSIMD32, k_eq_4_subtile) {
  TEST_REQUIRES_ARM_SIMD32;
  for (uint32_t n = 1; n <= 2; n++) {
    for (uint32_t m = 1; m <= 2; m++) {
      GemmMicrokernelTester()
        .mr(2)
        .nr(2)
        .kr(4)
        .sr(1)
        .m(m)
        .n(n)
        .k(4)
        .iterations(1)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x2c4__armsimd32, xnn_init_qu8_conv_minmax_fp32_armsimd32_params, xnn_qu8_requantize_fp32);
    }
  }
}
591
// k=4 and n=2 while m sweeps [1, 2]; one iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_2X2C4__ARMSIMD32, k_eq_4_subtile_m) {
  TEST_REQUIRES_ARM_SIMD32;
  for (uint32_t m = 1; m <= 2; m++) {
    GemmMicrokernelTester()
      .mr(2)
      .nr(2)
      .kr(4)
      .sr(1)
      .m(m)
      .n(2)
      .k(4)
      .iterations(1)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x2c4__armsimd32, xnn_init_qu8_conv_minmax_fp32_armsimd32_params, xnn_qu8_requantize_fp32);
  }
}
607
// k=4 and m=2 while n sweeps [1, 2]; one iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_2X2C4__ARMSIMD32, k_eq_4_subtile_n) {
  TEST_REQUIRES_ARM_SIMD32;
  for (uint32_t n = 1; n <= 2; n++) {
    GemmMicrokernelTester()
      .mr(2)
      .nr(2)
      .kr(4)
      .sr(1)
      .m(2)
      .n(n)
      .k(4)
      .iterations(1)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x2c4__armsimd32, xnn_init_qu8_conv_minmax_fp32_armsimd32_params, xnn_qu8_requantize_fp32);
  }
}
623
// Full tile (m=2, n=2) for every k in [1, 4).
TEST(QU8_GEMM_MINMAX_FP32_2X2C4__ARMSIMD32, k_lt_4) {
  TEST_REQUIRES_ARM_SIMD32;
  for (size_t k = 1; k < 4; k++) {
    GemmMicrokernelTester()
      .mr(2)
      .nr(2)
      .kr(4)
      .sr(1)
      .m(2)
      .n(2)
      .k(k)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x2c4__armsimd32, xnn_init_qu8_conv_minmax_fp32_armsimd32_params, xnn_qu8_requantize_fp32);
  }
}
638
// Full tile (m=2, n=2) for every k in [1, 4) with a_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_2X2C4__ARMSIMD32, k_lt_4_strided_a) {
  TEST_REQUIRES_ARM_SIMD32;
  for (size_t k = 1; k < 4; k++) {
    GemmMicrokernelTester()
      .mr(2)
      .nr(2)
      .kr(4)
      .sr(1)
      .m(2)
      .n(2)
      .k(k)
      .a_stride(7)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x2c4__armsimd32, xnn_init_qu8_conv_minmax_fp32_armsimd32_params, xnn_qu8_requantize_fp32);
  }
}
654
// Every k in [1, 4) crossed with m in [1, 2] and n in [1, 2]; one iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_2X2C4__ARMSIMD32, k_lt_4_subtile) {
  TEST_REQUIRES_ARM_SIMD32;
  for (size_t k = 1; k < 4; k++) {
    for (uint32_t n = 1; n <= 2; n++) {
      for (uint32_t m = 1; m <= 2; m++) {
        GemmMicrokernelTester()
          .mr(2)
          .nr(2)
          .kr(4)
          .sr(1)
          .m(m)
          .n(n)
          .k(k)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x2c4__armsimd32, xnn_init_qu8_conv_minmax_fp32_armsimd32_params, xnn_qu8_requantize_fp32);
      }
    }
  }
}
674
// Full tile (m=2, n=2) for every k in [5, 8).
TEST(QU8_GEMM_MINMAX_FP32_2X2C4__ARMSIMD32, k_gt_4) {
  TEST_REQUIRES_ARM_SIMD32;
  for (size_t k = 5; k < 8; k++) {
    GemmMicrokernelTester()
      .mr(2)
      .nr(2)
      .kr(4)
      .sr(1)
      .m(2)
      .n(2)
      .k(k)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x2c4__armsimd32, xnn_init_qu8_conv_minmax_fp32_armsimd32_params, xnn_qu8_requantize_fp32);
  }
}
689
// Full tile (m=2, n=2) for every k in [5, 8) with a_stride set to 11.
TEST(QU8_GEMM_MINMAX_FP32_2X2C4__ARMSIMD32, k_gt_4_strided_a) {
  TEST_REQUIRES_ARM_SIMD32;
  for (size_t k = 5; k < 8; k++) {
    GemmMicrokernelTester()
      .mr(2)
      .nr(2)
      .kr(4)
      .sr(1)
      .m(2)
      .n(2)
      .k(k)
      .a_stride(11)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x2c4__armsimd32, xnn_init_qu8_conv_minmax_fp32_armsimd32_params, xnn_qu8_requantize_fp32);
  }
}
705
// Every k in [5, 8) crossed with m in [1, 2] and n in [1, 2]; one iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_2X2C4__ARMSIMD32, k_gt_4_subtile) {
  TEST_REQUIRES_ARM_SIMD32;
  for (size_t k = 5; k < 8; k++) {
    for (uint32_t n = 1; n <= 2; n++) {
      for (uint32_t m = 1; m <= 2; m++) {
        GemmMicrokernelTester()
          .mr(2)
          .nr(2)
          .kr(4)
          .sr(1)
          .m(m)
          .n(n)
          .k(k)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x2c4__armsimd32, xnn_init_qu8_conv_minmax_fp32_armsimd32_params, xnn_qu8_requantize_fp32);
      }
    }
  }
}
725
// Full tile (m=2, n=2) for k = 8, 12, ..., 40 (multiples of 4).
TEST(QU8_GEMM_MINMAX_FP32_2X2C4__ARMSIMD32, k_div_4) {
  TEST_REQUIRES_ARM_SIMD32;
  for (size_t k = 8; k <= 40; k += 4) {
    GemmMicrokernelTester()
      .mr(2)
      .nr(2)
      .kr(4)
      .sr(1)
      .m(2)
      .n(2)
      .k(k)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x2c4__armsimd32, xnn_init_qu8_conv_minmax_fp32_armsimd32_params, xnn_qu8_requantize_fp32);
  }
}
740
// Full tile (m=2, n=2) for k = 8, 12, ..., 40 with a_stride set to 43.
TEST(QU8_GEMM_MINMAX_FP32_2X2C4__ARMSIMD32, k_div_4_strided_a) {
  TEST_REQUIRES_ARM_SIMD32;
  for (size_t k = 8; k <= 40; k += 4) {
    GemmMicrokernelTester()
      .mr(2)
      .nr(2)
      .kr(4)
      .sr(1)
      .m(2)
      .n(2)
      .k(k)
      .a_stride(43)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x2c4__armsimd32, xnn_init_qu8_conv_minmax_fp32_armsimd32_params, xnn_qu8_requantize_fp32);
  }
}
756
// k = 8, 12, ..., 40 crossed with m in [1, 2] and n in [1, 2]; one iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_2X2C4__ARMSIMD32, k_div_4_subtile) {
  TEST_REQUIRES_ARM_SIMD32;
  for (size_t k = 8; k <= 40; k += 4) {
    for (uint32_t n = 1; n <= 2; n++) {
      for (uint32_t m = 1; m <= 2; m++) {
        GemmMicrokernelTester()
          .mr(2)
          .nr(2)
          .kr(4)
          .sr(1)
          .m(m)
          .n(n)
          .k(k)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x2c4__armsimd32, xnn_init_qu8_conv_minmax_fp32_armsimd32_params, xnn_qu8_requantize_fp32);
      }
    }
  }
}
776
// n in [3, 4) (greater than nr=2) with m=2, for k = 1, 6, 11, 16.
TEST(QU8_GEMM_MINMAX_FP32_2X2C4__ARMSIMD32, n_gt_2) {
  TEST_REQUIRES_ARM_SIMD32;
  for (uint32_t n = 3; n < 4; n++) {
    for (size_t k = 1; k <= 20; k += 5) {
      GemmMicrokernelTester()
        .mr(2)
        .nr(2)
        .kr(4)
        .sr(1)
        .m(2)
        .n(n)
        .k(k)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x2c4__armsimd32, xnn_init_qu8_conv_minmax_fp32_armsimd32_params, xnn_qu8_requantize_fp32);
    }
  }
}
793
// n in [3, 4) with m=2, for k = 1, 6, 11, 16, with cn_stride set to 5.
TEST(QU8_GEMM_MINMAX_FP32_2X2C4__ARMSIMD32, n_gt_2_strided_cn) {
  TEST_REQUIRES_ARM_SIMD32;
  for (uint32_t n = 3; n < 4; n++) {
    for (size_t k = 1; k <= 20; k += 5) {
      GemmMicrokernelTester()
        .mr(2)
        .nr(2)
        .kr(4)
        .sr(1)
        .m(2)
        .n(n)
        .k(k)
        .cn_stride(5)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x2c4__armsimd32, xnn_init_qu8_conv_minmax_fp32_armsimd32_params, xnn_qu8_requantize_fp32);
    }
  }
}
811
// n in [3, 4) with m=2, for k = 1, 6, 11, 16, with a_stride set to 23.
TEST(QU8_GEMM_MINMAX_FP32_2X2C4__ARMSIMD32, n_gt_2_strided_a) {
  TEST_REQUIRES_ARM_SIMD32;
  for (uint32_t n = 3; n < 4; n++) {
    for (size_t k = 1; k <= 20; k += 5) {
      GemmMicrokernelTester()
        .mr(2)
        .nr(2)
        .kr(4)
        .sr(1)
        .m(2)
        .n(n)
        .k(k)
        .a_stride(23)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x2c4__armsimd32, xnn_init_qu8_conv_minmax_fp32_armsimd32_params, xnn_qu8_requantize_fp32);
    }
  }
}
829
// n in [3, 4) crossed with k = 1, 6, 11, 16 and m in [1, 2]; one iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_2X2C4__ARMSIMD32, n_gt_2_subtile) {
  TEST_REQUIRES_ARM_SIMD32;
  for (uint32_t n = 3; n < 4; n++) {
    for (size_t k = 1; k <= 20; k += 5) {
      for (uint32_t m = 1; m <= 2; m++) {
        GemmMicrokernelTester()
          .mr(2)
          .nr(2)
          .kr(4)
          .sr(1)
          .m(m)
          .n(n)
          .k(k)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x2c4__armsimd32, xnn_init_qu8_conv_minmax_fp32_armsimd32_params, xnn_qu8_requantize_fp32);
      }
    }
  }
}
849
// n = 4, 6 (multiples of nr=2) with m=2, for k = 1, 6, 11, 16.
TEST(QU8_GEMM_MINMAX_FP32_2X2C4__ARMSIMD32, n_div_2) {
  TEST_REQUIRES_ARM_SIMD32;
  for (uint32_t n = 4; n <= 6; n += 2) {
    for (size_t k = 1; k <= 20; k += 5) {
      GemmMicrokernelTester()
        .mr(2)
        .nr(2)
        .kr(4)
        .sr(1)
        .m(2)
        .n(n)
        .k(k)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x2c4__armsimd32, xnn_init_qu8_conv_minmax_fp32_armsimd32_params, xnn_qu8_requantize_fp32);
    }
  }
}
866
// n = 4, 6 with m=2, for k = 1, 6, 11, 16, with cn_stride set to 5.
TEST(QU8_GEMM_MINMAX_FP32_2X2C4__ARMSIMD32, n_div_2_strided_cn) {
  TEST_REQUIRES_ARM_SIMD32;
  for (uint32_t n = 4; n <= 6; n += 2) {
    for (size_t k = 1; k <= 20; k += 5) {
      GemmMicrokernelTester()
        .mr(2)
        .nr(2)
        .kr(4)
        .sr(1)
        .m(2)
        .n(n)
        .k(k)
        .cn_stride(5)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x2c4__armsimd32, xnn_init_qu8_conv_minmax_fp32_armsimd32_params, xnn_qu8_requantize_fp32);
    }
  }
}
884
// n = 4, 6 with m=2, for k = 1, 6, 11, 16, with a_stride set to 23.
TEST(QU8_GEMM_MINMAX_FP32_2X2C4__ARMSIMD32, n_div_2_strided_a) {
  TEST_REQUIRES_ARM_SIMD32;
  for (uint32_t n = 4; n <= 6; n += 2) {
    for (size_t k = 1; k <= 20; k += 5) {
      GemmMicrokernelTester()
        .mr(2)
        .nr(2)
        .kr(4)
        .sr(1)
        .m(2)
        .n(n)
        .k(k)
        .a_stride(23)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x2c4__armsimd32, xnn_init_qu8_conv_minmax_fp32_armsimd32_params, xnn_qu8_requantize_fp32);
    }
  }
}
902
// n = 4, 6 crossed with k = 1, 6, 11, 16 and m in [1, 2]; one iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_2X2C4__ARMSIMD32, n_div_2_subtile) {
  TEST_REQUIRES_ARM_SIMD32;
  for (uint32_t n = 4; n <= 6; n += 2) {
    for (size_t k = 1; k <= 20; k += 5) {
      for (uint32_t m = 1; m <= 2; m++) {
        GemmMicrokernelTester()
          .mr(2)
          .nr(2)
          .kr(4)
          .sr(1)
          .m(m)
          .n(n)
          .k(k)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x2c4__armsimd32, xnn_init_qu8_conv_minmax_fp32_armsimd32_params, xnn_qu8_requantize_fp32);
      }
    }
  }
}
922
// k = 1, 6, 11, 16 crossed with n in [1, 2] and m in [1, 2], with cm_stride set to 5;
// one iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_2X2C4__ARMSIMD32, strided_cm_subtile) {
  TEST_REQUIRES_ARM_SIMD32;
  for (size_t k = 1; k <= 20; k += 5) {
    for (uint32_t n = 1; n <= 2; n++) {
      for (uint32_t m = 1; m <= 2; m++) {
        GemmMicrokernelTester()
          .mr(2)
          .nr(2)
          .kr(4)
          .sr(1)
          .m(m)
          .n(n)
          .k(k)
          .cm_stride(5)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x2c4__armsimd32, xnn_init_qu8_conv_minmax_fp32_armsimd32_params, xnn_qu8_requantize_fp32);
      }
    }
  }
}
943
// One full tile (m=2, n=2, k=4) with qmin set to 128.
TEST(QU8_GEMM_MINMAX_FP32_2X2C4__ARMSIMD32, qmin) {
  TEST_REQUIRES_ARM_SIMD32;
  GemmMicrokernelTester()
    .mr(2)
    .nr(2)
    .kr(4)
    .sr(1)
    .m(2)
    .n(2)
    .k(4)
    .qmin(128)
    .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x2c4__armsimd32, xnn_init_qu8_conv_minmax_fp32_armsimd32_params, xnn_qu8_requantize_fp32);
}
957
// One full tile (m=2, n=2, k=4) with qmax set to 128.
TEST(QU8_GEMM_MINMAX_FP32_2X2C4__ARMSIMD32, qmax) {
  TEST_REQUIRES_ARM_SIMD32;
  GemmMicrokernelTester()
    .mr(2)
    .nr(2)
    .kr(4)
    .sr(1)
    .m(2)
    .n(2)
    .k(4)
    .qmax(128)
    .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x2c4__armsimd32, xnn_init_qu8_conv_minmax_fp32_armsimd32_params, xnn_qu8_requantize_fp32);
}
971
// One full tile (m=2, n=2, k=4) with cm_stride set to 5.
TEST(QU8_GEMM_MINMAX_FP32_2X2C4__ARMSIMD32, strided_cm) {
  TEST_REQUIRES_ARM_SIMD32;
  GemmMicrokernelTester()
    .mr(2)
    .nr(2)
    .kr(4)
    .sr(1)
    .m(2)
    .n(2)
    .k(4)
    .cm_stride(5)
    .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x2c4__armsimd32, xnn_init_qu8_conv_minmax_fp32_armsimd32_params, xnn_qu8_requantize_fp32);
}
985
// Full tile (m=2, n=2) for k = 1, 6, 11, 16 with a_zero_point set to 0.
TEST(QU8_GEMM_MINMAX_FP32_2X2C4__ARMSIMD32, no_a_zero_point) {
  TEST_REQUIRES_ARM_SIMD32;
  for (size_t k = 1; k <= 20; k += 5) {
    GemmMicrokernelTester()
      .mr(2)
      .nr(2)
      .kr(4)
      .sr(1)
      .m(2)
      .n(2)
      .k(k)
      .a_zero_point(0)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x2c4__armsimd32, xnn_init_qu8_conv_minmax_fp32_armsimd32_params, xnn_qu8_requantize_fp32);
  }
}
1001
// Full tile (m=2, n=2) for k = 1, 6, 11, 16 with b_zero_point set to 0.
TEST(QU8_GEMM_MINMAX_FP32_2X2C4__ARMSIMD32, no_b_zero_point) {
  TEST_REQUIRES_ARM_SIMD32;
  for (size_t k = 1; k <= 20; k += 5) {
    GemmMicrokernelTester()
      .mr(2)
      .nr(2)
      .kr(4)
      .sr(1)
      .m(2)
      .n(2)
      .k(k)
      .b_zero_point(0)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x2c4__armsimd32, xnn_init_qu8_conv_minmax_fp32_armsimd32_params, xnn_qu8_requantize_fp32);
  }
}
1017
// Full tile (m=2, n=2) for k = 1, 6, 11, 16 with both a_zero_point and b_zero_point set to 0.
TEST(QU8_GEMM_MINMAX_FP32_2X2C4__ARMSIMD32, no_zero_point) {
  TEST_REQUIRES_ARM_SIMD32;
  for (size_t k = 1; k <= 20; k += 5) {
    GemmMicrokernelTester()
      .mr(2)
      .nr(2)
      .kr(4)
      .sr(1)
      .m(2)
      .n(2)
      .k(k)
      .a_zero_point(0)
      .b_zero_point(0)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x2c4__armsimd32, xnn_init_qu8_conv_minmax_fp32_armsimd32_params, xnn_qu8_requantize_fp32);
  }
}
1034 #endif // XNN_ARCH_ARM
1035
1036
1037 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
// One full tile (m=1, n=8) with k=8.
TEST(QU8_GEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, k_eq_8) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester()
    .mr(1)
    .nr(8)
    .kr(1)
    .sr(1)
    .m(1)
    .n(8)
    .k(8)
    .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_qu8_requantize_fp32);
}
1050
// One full tile (m=1, n=8, k=8) with cn_stride set to 11.
TEST(QU8_GEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, strided_cn) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester()
    .mr(1)
    .nr(8)
    .kr(1)
    .sr(1)
    .m(1)
    .n(8)
    .k(8)
    .cn_stride(11)
    .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_qu8_requantize_fp32);
}
1064
// One full tile (m=1, n=8, k=8) with a_stride set to 11.
TEST(QU8_GEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, k_eq_8_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester()
    .mr(1)
    .nr(8)
    .kr(1)
    .sr(1)
    .m(1)
    .n(8)
    .k(8)
    .a_stride(11)
    .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_qu8_requantize_fp32);
}
1078
// k=8 for every (m, n) with m in [1, 1] and n in [1, 8]; one iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, k_eq_8_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n = 1; n <= 8; n++) {
    for (uint32_t m = 1; m <= 1; m++) {
      GemmMicrokernelTester()
        .mr(1)
        .nr(8)
        .kr(1)
        .sr(1)
        .m(m)
        .n(n)
        .k(8)
        .iterations(1)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_qu8_requantize_fp32);
    }
  }
}
1096
// k=8 and n=8 while m sweeps [1, 1]; one iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, k_eq_8_subtile_m) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t m = 1; m <= 1; m++) {
    GemmMicrokernelTester()
      .mr(1)
      .nr(8)
      .kr(1)
      .sr(1)
      .m(m)
      .n(8)
      .k(8)
      .iterations(1)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_qu8_requantize_fp32);
  }
}
1112
// k=8 and m=1 while n sweeps [1, 8]; one iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, k_eq_8_subtile_n) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n = 1; n <= 8; n++) {
    GemmMicrokernelTester()
      .mr(1)
      .nr(8)
      .kr(1)
      .sr(1)
      .m(1)
      .n(n)
      .k(8)
      .iterations(1)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_qu8_requantize_fp32);
  }
}
1128
// 1x8 NEON MLAL-lane ukernel: full tile (m=1, n=8) for each k in [1, 7].
TEST(QU8_GEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, k_lt_8) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t kc = 1; kc < 8; ++kc) {
    GemmMicrokernelTester()
      .mr(1).nr(8).kr(1).sr(1)
      .m(1).n(8).k(kc)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_1x8__neon_mlal_lane,
        xnn_init_qu8_conv_minmax_fp32_neon_params,
        xnn_qu8_requantize_fp32);
  }
}
1143
// 1x8 NEON MLAL-lane ukernel: full tile (m=1, n=8) for each k in [1, 7] with a_stride set to 11.
TEST(QU8_GEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, k_lt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t kc = 1; kc < 8; ++kc) {
    GemmMicrokernelTester()
      .mr(1).nr(8).kr(1).sr(1)
      .m(1).n(8).k(kc)
      .a_stride(11)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_1x8__neon_mlal_lane,
        xnn_init_qu8_conv_minmax_fp32_neon_params,
        xnn_qu8_requantize_fp32);
  }
}
1159
// 1x8 NEON MLAL-lane ukernel: for each k in [1, 7], every (m, n) with n in [1, 8]
// and m in [1, 1], one tester iteration per combination.
TEST(QU8_GEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, k_lt_8_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t kc = 1; kc < 8; ++kc) {
    for (uint32_t nc = 1; nc <= 8; ++nc) {
      for (uint32_t mc = 1; mc <= 1; ++mc) {
        GemmMicrokernelTester()
          .mr(1).nr(8).kr(1).sr(1)
          .m(mc).n(nc).k(kc)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x8__neon_mlal_lane,
            xnn_init_qu8_conv_minmax_fp32_neon_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
1179
// 1x8 NEON MLAL-lane ukernel: full tile (m=1, n=8) for each k in [9, 15].
TEST(QU8_GEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, k_gt_8) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t kc = 9; kc < 16; ++kc) {
    GemmMicrokernelTester()
      .mr(1).nr(8).kr(1).sr(1)
      .m(1).n(8).k(kc)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_1x8__neon_mlal_lane,
        xnn_init_qu8_conv_minmax_fp32_neon_params,
        xnn_qu8_requantize_fp32);
  }
}
1194
// 1x8 NEON MLAL-lane ukernel: full tile (m=1, n=8) for each k in [9, 15] with a_stride set to 19.
TEST(QU8_GEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, k_gt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t kc = 9; kc < 16; ++kc) {
    GemmMicrokernelTester()
      .mr(1).nr(8).kr(1).sr(1)
      .m(1).n(8).k(kc)
      .a_stride(19)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_1x8__neon_mlal_lane,
        xnn_init_qu8_conv_minmax_fp32_neon_params,
        xnn_qu8_requantize_fp32);
  }
}
1210
// 1x8 NEON MLAL-lane ukernel: for each k in [9, 15], every (m, n) with n in [1, 8]
// and m in [1, 1], one tester iteration per combination.
TEST(QU8_GEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, k_gt_8_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t kc = 9; kc < 16; ++kc) {
    for (uint32_t nc = 1; nc <= 8; ++nc) {
      for (uint32_t mc = 1; mc <= 1; ++mc) {
        GemmMicrokernelTester()
          .mr(1).nr(8).kr(1).sr(1)
          .m(mc).n(nc).k(kc)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x8__neon_mlal_lane,
            xnn_init_qu8_conv_minmax_fp32_neon_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
1230
// 1x8 NEON MLAL-lane ukernel: full tile (m=1, n=8) for k = 16, 24, ..., 80.
TEST(QU8_GEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, k_div_8) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t kc = 16; kc <= 80; kc += 8) {
    GemmMicrokernelTester()
      .mr(1).nr(8).kr(1).sr(1)
      .m(1).n(8).k(kc)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_1x8__neon_mlal_lane,
        xnn_init_qu8_conv_minmax_fp32_neon_params,
        xnn_qu8_requantize_fp32);
  }
}
1245
// 1x8 NEON MLAL-lane ukernel: full tile (m=1, n=8) for k = 16, 24, ..., 80 with a_stride set to 83.
TEST(QU8_GEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, k_div_8_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t kc = 16; kc <= 80; kc += 8) {
    GemmMicrokernelTester()
      .mr(1).nr(8).kr(1).sr(1)
      .m(1).n(8).k(kc)
      .a_stride(83)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_1x8__neon_mlal_lane,
        xnn_init_qu8_conv_minmax_fp32_neon_params,
        xnn_qu8_requantize_fp32);
  }
}
1261
// 1x8 NEON MLAL-lane ukernel: for k = 16, 24, ..., 80, every (m, n) with n in [1, 8]
// and m in [1, 1], one tester iteration per combination.
TEST(QU8_GEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, k_div_8_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t kc = 16; kc <= 80; kc += 8) {
    for (uint32_t nc = 1; nc <= 8; ++nc) {
      for (uint32_t mc = 1; mc <= 1; ++mc) {
        GemmMicrokernelTester()
          .mr(1).nr(8).kr(1).sr(1)
          .m(mc).n(nc).k(kc)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x8__neon_mlal_lane,
            xnn_init_qu8_conv_minmax_fp32_neon_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
1281
// 1x8 NEON MLAL-lane ukernel: m=1, each n in [9, 15], k = 1, 10, ..., 37.
TEST(QU8_GEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, n_gt_8) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t nc = 9; nc < 16; ++nc) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      GemmMicrokernelTester()
        .mr(1).nr(8).kr(1).sr(1)
        .m(1).n(nc).k(kc)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_1x8__neon_mlal_lane,
          xnn_init_qu8_conv_minmax_fp32_neon_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
1298
// 1x8 NEON MLAL-lane ukernel: m=1, each n in [9, 15], k = 1, 10, ..., 37, cn_stride set to 11.
TEST(QU8_GEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, n_gt_8_strided_cn) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t nc = 9; nc < 16; ++nc) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      GemmMicrokernelTester()
        .mr(1).nr(8).kr(1).sr(1)
        .m(1).n(nc).k(kc)
        .cn_stride(11)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_1x8__neon_mlal_lane,
          xnn_init_qu8_conv_minmax_fp32_neon_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
1316
// 1x8 NEON MLAL-lane ukernel: m=1, each n in [9, 15], k = 1, 10, ..., 37, a_stride set to 43.
TEST(QU8_GEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, n_gt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t nc = 9; nc < 16; ++nc) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      GemmMicrokernelTester()
        .mr(1).nr(8).kr(1).sr(1)
        .m(1).n(nc).k(kc)
        .a_stride(43)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_1x8__neon_mlal_lane,
          xnn_init_qu8_conv_minmax_fp32_neon_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
1334
// 1x8 NEON MLAL-lane ukernel: each n in [9, 15], k = 1, 10, ..., 37, m in [1, 1],
// one tester iteration per combination.
TEST(QU8_GEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, n_gt_8_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t nc = 9; nc < 16; ++nc) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      for (uint32_t mc = 1; mc <= 1; ++mc) {
        GemmMicrokernelTester()
          .mr(1).nr(8).kr(1).sr(1)
          .m(mc).n(nc).k(kc)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x8__neon_mlal_lane,
            xnn_init_qu8_conv_minmax_fp32_neon_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
1354
// 1x8 NEON MLAL-lane ukernel: m=1, n = 16 and 24, k = 1, 10, ..., 37.
TEST(QU8_GEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, n_div_8) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t nc = 16; nc <= 24; nc += 8) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      GemmMicrokernelTester()
        .mr(1).nr(8).kr(1).sr(1)
        .m(1).n(nc).k(kc)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_1x8__neon_mlal_lane,
          xnn_init_qu8_conv_minmax_fp32_neon_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
1371
// 1x8 NEON MLAL-lane ukernel: m=1, n = 16 and 24, k = 1, 10, ..., 37, cn_stride set to 11.
TEST(QU8_GEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, n_div_8_strided_cn) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t nc = 16; nc <= 24; nc += 8) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      GemmMicrokernelTester()
        .mr(1).nr(8).kr(1).sr(1)
        .m(1).n(nc).k(kc)
        .cn_stride(11)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_1x8__neon_mlal_lane,
          xnn_init_qu8_conv_minmax_fp32_neon_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
1389
// 1x8 NEON MLAL-lane ukernel: m=1, n = 16 and 24, k = 1, 10, ..., 37, a_stride set to 43.
TEST(QU8_GEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, n_div_8_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t nc = 16; nc <= 24; nc += 8) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      GemmMicrokernelTester()
        .mr(1).nr(8).kr(1).sr(1)
        .m(1).n(nc).k(kc)
        .a_stride(43)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_1x8__neon_mlal_lane,
          xnn_init_qu8_conv_minmax_fp32_neon_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
1407
// 1x8 NEON MLAL-lane ukernel: n = 16 and 24, k = 1, 10, ..., 37, m in [1, 1],
// one tester iteration per combination.
TEST(QU8_GEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, n_div_8_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t nc = 16; nc <= 24; nc += 8) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      for (uint32_t mc = 1; mc <= 1; ++mc) {
        GemmMicrokernelTester()
          .mr(1).nr(8).kr(1).sr(1)
          .m(mc).n(nc).k(kc)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x8__neon_mlal_lane,
            xnn_init_qu8_conv_minmax_fp32_neon_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
1427
// 1x8 NEON MLAL-lane ukernel: k = 1, 10, ..., 37, n in [1, 8], m in [1, 1],
// cm_stride set to 11, one tester iteration per combination.
TEST(QU8_GEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, strided_cm_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t kc = 1; kc <= 40; kc += 9) {
    for (uint32_t nc = 1; nc <= 8; ++nc) {
      for (uint32_t mc = 1; mc <= 1; ++mc) {
        GemmMicrokernelTester()
          .mr(1).nr(8).kr(1).sr(1)
          .m(mc).n(nc).k(kc)
          .cm_stride(11)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x8__neon_mlal_lane,
            xnn_init_qu8_conv_minmax_fp32_neon_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
1448
// 1x8 NEON MLAL-lane ukernel: full tile (m=1, n=8) at k=8 with qmin set to 128.
TEST(QU8_GEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, qmin) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester()
    .mr(1).nr(8).kr(1).sr(1)
    .m(1).n(8).k(8)
    .qmin(128)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_1x8__neon_mlal_lane,
      xnn_init_qu8_conv_minmax_fp32_neon_params,
      xnn_qu8_requantize_fp32);
}
1462
// 1x8 NEON MLAL-lane ukernel: full tile (m=1, n=8) at k=8 with qmax set to 128.
TEST(QU8_GEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, qmax) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester()
    .mr(1).nr(8).kr(1).sr(1)
    .m(1).n(8).k(8)
    .qmax(128)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_1x8__neon_mlal_lane,
      xnn_init_qu8_conv_minmax_fp32_neon_params,
      xnn_qu8_requantize_fp32);
}
1476
// 1x8 NEON MLAL-lane ukernel: full tile (m=1, n=8) at k=8 with cm_stride set to 11.
TEST(QU8_GEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, strided_cm) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester()
    .mr(1).nr(8).kr(1).sr(1)
    .m(1).n(8).k(8)
    .cm_stride(11)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_1x8__neon_mlal_lane,
      xnn_init_qu8_conv_minmax_fp32_neon_params,
      xnn_qu8_requantize_fp32);
}
1490
// 1x8 NEON MLAL-lane ukernel: full tile (m=1, n=8), k = 1, 10, ..., 37, a_zero_point set to 0.
TEST(QU8_GEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, no_a_zero_point) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t kc = 1; kc <= 40; kc += 9) {
    GemmMicrokernelTester()
      .mr(1).nr(8).kr(1).sr(1)
      .m(1).n(8).k(kc)
      .a_zero_point(0)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_1x8__neon_mlal_lane,
        xnn_init_qu8_conv_minmax_fp32_neon_params,
        xnn_qu8_requantize_fp32);
  }
}
1506
// 1x8 NEON MLAL-lane ukernel: full tile (m=1, n=8), k = 1, 10, ..., 37, b_zero_point set to 0.
TEST(QU8_GEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, no_b_zero_point) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t kc = 1; kc <= 40; kc += 9) {
    GemmMicrokernelTester()
      .mr(1).nr(8).kr(1).sr(1)
      .m(1).n(8).k(kc)
      .b_zero_point(0)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_1x8__neon_mlal_lane,
        xnn_init_qu8_conv_minmax_fp32_neon_params,
        xnn_qu8_requantize_fp32);
  }
}
1522
// 1x8 NEON MLAL-lane ukernel: full tile (m=1, n=8), k = 1, 10, ..., 37,
// with both a_zero_point and b_zero_point set to 0.
TEST(QU8_GEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, no_zero_point) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t kc = 1; kc <= 40; kc += 9) {
    GemmMicrokernelTester()
      .mr(1).nr(8).kr(1).sr(1)
      .m(1).n(8).k(kc)
      .a_zero_point(0)
      .b_zero_point(0)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_1x8__neon_mlal_lane,
        xnn_init_qu8_conv_minmax_fp32_neon_params,
        xnn_qu8_requantize_fp32);
  }
}
1539 #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
1540
1541
1542 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
// 1x16 NEON MLAL-lane ukernel: full tile (m=1, n=16) at k=8.
TEST(QU8_GEMM_MINMAX_FP32_1X16__NEON_MLAL_LANE, k_eq_8) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester()
    .mr(1).nr(16).kr(1).sr(1)
    .m(1).n(16).k(8)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_1x16__neon_mlal_lane,
      xnn_init_qu8_conv_minmax_fp32_neon_params,
      xnn_qu8_requantize_fp32);
}
1555
// 1x16 NEON MLAL-lane ukernel: full tile (m=1, n=16) at k=8 with cn_stride set to 19.
TEST(QU8_GEMM_MINMAX_FP32_1X16__NEON_MLAL_LANE, strided_cn) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester()
    .mr(1).nr(16).kr(1).sr(1)
    .m(1).n(16).k(8)
    .cn_stride(19)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_1x16__neon_mlal_lane,
      xnn_init_qu8_conv_minmax_fp32_neon_params,
      xnn_qu8_requantize_fp32);
}
1569
// 1x16 NEON MLAL-lane ukernel: full tile (m=1, n=16) at k=8 with a_stride set to 11.
TEST(QU8_GEMM_MINMAX_FP32_1X16__NEON_MLAL_LANE, k_eq_8_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester()
    .mr(1).nr(16).kr(1).sr(1)
    .m(1).n(16).k(8)
    .a_stride(11)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_1x16__neon_mlal_lane,
      xnn_init_qu8_conv_minmax_fp32_neon_params,
      xnn_qu8_requantize_fp32);
}
1583
// 1x16 NEON MLAL-lane ukernel: every (m, n) with n in [1, 16] and m in [1, 1] at k=8,
// one tester iteration per combination.
TEST(QU8_GEMM_MINMAX_FP32_1X16__NEON_MLAL_LANE, k_eq_8_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t nc = 1; nc <= 16; ++nc) {
    for (uint32_t mc = 1; mc <= 1; ++mc) {
      GemmMicrokernelTester()
        .mr(1).nr(16).kr(1).sr(1)
        .m(mc).n(nc).k(8)
        .iterations(1)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_1x16__neon_mlal_lane,
          xnn_init_qu8_conv_minmax_fp32_neon_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
1601
// 1x16 NEON MLAL-lane ukernel: sweeps m over [1, 1] with n=16 and k=8,
// one tester iteration per value.
TEST(QU8_GEMM_MINMAX_FP32_1X16__NEON_MLAL_LANE, k_eq_8_subtile_m) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t mc = 1; mc <= 1; ++mc) {
    GemmMicrokernelTester()
      .mr(1).nr(16).kr(1).sr(1)
      .m(mc).n(16).k(8)
      .iterations(1)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_1x16__neon_mlal_lane,
        xnn_init_qu8_conv_minmax_fp32_neon_params,
        xnn_qu8_requantize_fp32);
  }
}
1617
// 1x16 NEON MLAL-lane ukernel: sweeps n over [1, 16] with m=1 and k=8,
// one tester iteration per value.
TEST(QU8_GEMM_MINMAX_FP32_1X16__NEON_MLAL_LANE, k_eq_8_subtile_n) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t nc = 1; nc <= 16; ++nc) {
    GemmMicrokernelTester()
      .mr(1).nr(16).kr(1).sr(1)
      .m(1).n(nc).k(8)
      .iterations(1)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_1x16__neon_mlal_lane,
        xnn_init_qu8_conv_minmax_fp32_neon_params,
        xnn_qu8_requantize_fp32);
  }
}
1633
// 1x16 NEON MLAL-lane ukernel: full tile (m=1, n=16) for each k in [1, 7].
TEST(QU8_GEMM_MINMAX_FP32_1X16__NEON_MLAL_LANE, k_lt_8) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t kc = 1; kc < 8; ++kc) {
    GemmMicrokernelTester()
      .mr(1).nr(16).kr(1).sr(1)
      .m(1).n(16).k(kc)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_1x16__neon_mlal_lane,
        xnn_init_qu8_conv_minmax_fp32_neon_params,
        xnn_qu8_requantize_fp32);
  }
}
1648
// 1x16 NEON MLAL-lane ukernel: full tile (m=1, n=16) for each k in [1, 7] with a_stride set to 11.
TEST(QU8_GEMM_MINMAX_FP32_1X16__NEON_MLAL_LANE, k_lt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t kc = 1; kc < 8; ++kc) {
    GemmMicrokernelTester()
      .mr(1).nr(16).kr(1).sr(1)
      .m(1).n(16).k(kc)
      .a_stride(11)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_1x16__neon_mlal_lane,
        xnn_init_qu8_conv_minmax_fp32_neon_params,
        xnn_qu8_requantize_fp32);
  }
}
1664
// 1x16 NEON MLAL-lane ukernel: for each k in [1, 7], every (m, n) with n in [1, 16]
// and m in [1, 1], one tester iteration per combination.
TEST(QU8_GEMM_MINMAX_FP32_1X16__NEON_MLAL_LANE, k_lt_8_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t kc = 1; kc < 8; ++kc) {
    for (uint32_t nc = 1; nc <= 16; ++nc) {
      for (uint32_t mc = 1; mc <= 1; ++mc) {
        GemmMicrokernelTester()
          .mr(1).nr(16).kr(1).sr(1)
          .m(mc).n(nc).k(kc)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x16__neon_mlal_lane,
            xnn_init_qu8_conv_minmax_fp32_neon_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
1684
// 1x16 NEON MLAL-lane ukernel: full tile (m=1, n=16) for each k in [9, 15].
TEST(QU8_GEMM_MINMAX_FP32_1X16__NEON_MLAL_LANE, k_gt_8) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t kc = 9; kc < 16; ++kc) {
    GemmMicrokernelTester()
      .mr(1).nr(16).kr(1).sr(1)
      .m(1).n(16).k(kc)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_1x16__neon_mlal_lane,
        xnn_init_qu8_conv_minmax_fp32_neon_params,
        xnn_qu8_requantize_fp32);
  }
}
1699
// 1x16 NEON MLAL-lane ukernel: full tile (m=1, n=16) for each k in [9, 15] with a_stride set to 19.
TEST(QU8_GEMM_MINMAX_FP32_1X16__NEON_MLAL_LANE, k_gt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t kc = 9; kc < 16; ++kc) {
    GemmMicrokernelTester()
      .mr(1).nr(16).kr(1).sr(1)
      .m(1).n(16).k(kc)
      .a_stride(19)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_1x16__neon_mlal_lane,
        xnn_init_qu8_conv_minmax_fp32_neon_params,
        xnn_qu8_requantize_fp32);
  }
}
1715
// 1x16 NEON MLAL-lane ukernel: for each k in [9, 15], every (m, n) with n in [1, 16]
// and m in [1, 1], one tester iteration per combination.
TEST(QU8_GEMM_MINMAX_FP32_1X16__NEON_MLAL_LANE, k_gt_8_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t kc = 9; kc < 16; ++kc) {
    for (uint32_t nc = 1; nc <= 16; ++nc) {
      for (uint32_t mc = 1; mc <= 1; ++mc) {
        GemmMicrokernelTester()
          .mr(1).nr(16).kr(1).sr(1)
          .m(mc).n(nc).k(kc)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x16__neon_mlal_lane,
            xnn_init_qu8_conv_minmax_fp32_neon_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
1735
// 1x16 NEON MLAL-lane ukernel: full tile (m=1, n=16) for k = 16, 24, ..., 80.
TEST(QU8_GEMM_MINMAX_FP32_1X16__NEON_MLAL_LANE, k_div_8) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t kc = 16; kc <= 80; kc += 8) {
    GemmMicrokernelTester()
      .mr(1).nr(16).kr(1).sr(1)
      .m(1).n(16).k(kc)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_1x16__neon_mlal_lane,
        xnn_init_qu8_conv_minmax_fp32_neon_params,
        xnn_qu8_requantize_fp32);
  }
}
1750
// 1x16 NEON MLAL-lane ukernel: full tile (m=1, n=16) for k = 16, 24, ..., 80 with a_stride set to 83.
TEST(QU8_GEMM_MINMAX_FP32_1X16__NEON_MLAL_LANE, k_div_8_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t kc = 16; kc <= 80; kc += 8) {
    GemmMicrokernelTester()
      .mr(1).nr(16).kr(1).sr(1)
      .m(1).n(16).k(kc)
      .a_stride(83)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_1x16__neon_mlal_lane,
        xnn_init_qu8_conv_minmax_fp32_neon_params,
        xnn_qu8_requantize_fp32);
  }
}
1766
// 1x16 NEON MLAL-lane ukernel: for k = 16, 24, ..., 80, every (m, n) with n in [1, 16]
// and m in [1, 1], one tester iteration per combination.
TEST(QU8_GEMM_MINMAX_FP32_1X16__NEON_MLAL_LANE, k_div_8_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t kc = 16; kc <= 80; kc += 8) {
    for (uint32_t nc = 1; nc <= 16; ++nc) {
      for (uint32_t mc = 1; mc <= 1; ++mc) {
        GemmMicrokernelTester()
          .mr(1).nr(16).kr(1).sr(1)
          .m(mc).n(nc).k(kc)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x16__neon_mlal_lane,
            xnn_init_qu8_conv_minmax_fp32_neon_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
1786
// 1x16 NEON MLAL-lane ukernel: m=1, each n in [17, 31], k = 1, 10, ..., 37.
TEST(QU8_GEMM_MINMAX_FP32_1X16__NEON_MLAL_LANE, n_gt_16) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t nc = 17; nc < 32; ++nc) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      GemmMicrokernelTester()
        .mr(1).nr(16).kr(1).sr(1)
        .m(1).n(nc).k(kc)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_1x16__neon_mlal_lane,
          xnn_init_qu8_conv_minmax_fp32_neon_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
1803
// 1x16 NEON MLAL-lane ukernel: m=1, each n in [17, 31], k = 1, 10, ..., 37, cn_stride set to 19.
TEST(QU8_GEMM_MINMAX_FP32_1X16__NEON_MLAL_LANE, n_gt_16_strided_cn) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t nc = 17; nc < 32; ++nc) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      GemmMicrokernelTester()
        .mr(1).nr(16).kr(1).sr(1)
        .m(1).n(nc).k(kc)
        .cn_stride(19)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_1x16__neon_mlal_lane,
          xnn_init_qu8_conv_minmax_fp32_neon_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
1821
// 1x16 NEON MLAL-lane ukernel: m=1, each n in [17, 31], k = 1, 10, ..., 37, a_stride set to 43.
TEST(QU8_GEMM_MINMAX_FP32_1X16__NEON_MLAL_LANE, n_gt_16_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t nc = 17; nc < 32; ++nc) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      GemmMicrokernelTester()
        .mr(1).nr(16).kr(1).sr(1)
        .m(1).n(nc).k(kc)
        .a_stride(43)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_1x16__neon_mlal_lane,
          xnn_init_qu8_conv_minmax_fp32_neon_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
1839
// 1x16 NEON MLAL-lane ukernel: each n in [17, 31], k = 1, 10, ..., 37, m in [1, 1],
// one tester iteration per combination.
TEST(QU8_GEMM_MINMAX_FP32_1X16__NEON_MLAL_LANE, n_gt_16_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t nc = 17; nc < 32; ++nc) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      for (uint32_t mc = 1; mc <= 1; ++mc) {
        GemmMicrokernelTester()
          .mr(1).nr(16).kr(1).sr(1)
          .m(mc).n(nc).k(kc)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x16__neon_mlal_lane,
            xnn_init_qu8_conv_minmax_fp32_neon_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
1859
// 1x16 NEON MLAL-lane ukernel: m=1, n = 32 and 48, k = 1, 10, ..., 37.
TEST(QU8_GEMM_MINMAX_FP32_1X16__NEON_MLAL_LANE, n_div_16) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t nc = 32; nc <= 48; nc += 16) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      GemmMicrokernelTester()
        .mr(1).nr(16).kr(1).sr(1)
        .m(1).n(nc).k(kc)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_1x16__neon_mlal_lane,
          xnn_init_qu8_conv_minmax_fp32_neon_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
1876
// 1x16 NEON MLAL-lane ukernel: m=1, n = 32 and 48, k = 1, 10, ..., 37, cn_stride set to 19.
TEST(QU8_GEMM_MINMAX_FP32_1X16__NEON_MLAL_LANE, n_div_16_strided_cn) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t nc = 32; nc <= 48; nc += 16) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      GemmMicrokernelTester()
        .mr(1).nr(16).kr(1).sr(1)
        .m(1).n(nc).k(kc)
        .cn_stride(19)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_1x16__neon_mlal_lane,
          xnn_init_qu8_conv_minmax_fp32_neon_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
1894
// 1x16 NEON MLAL-lane ukernel: m=1, n = 32 and 48, k = 1, 10, ..., 37, a_stride set to 43.
TEST(QU8_GEMM_MINMAX_FP32_1X16__NEON_MLAL_LANE, n_div_16_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t nc = 32; nc <= 48; nc += 16) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      GemmMicrokernelTester()
        .mr(1).nr(16).kr(1).sr(1)
        .m(1).n(nc).k(kc)
        .a_stride(43)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_1x16__neon_mlal_lane,
          xnn_init_qu8_conv_minmax_fp32_neon_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
1912
// 1x16 NEON MLAL-lane ukernel: n = 32 and 48, k = 1, 10, ..., 37, m in [1, 1],
// one tester iteration per combination.
TEST(QU8_GEMM_MINMAX_FP32_1X16__NEON_MLAL_LANE, n_div_16_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t nc = 32; nc <= 48; nc += 16) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      for (uint32_t mc = 1; mc <= 1; ++mc) {
        GemmMicrokernelTester()
          .mr(1).nr(16).kr(1).sr(1)
          .m(mc).n(nc).k(kc)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x16__neon_mlal_lane,
            xnn_init_qu8_conv_minmax_fp32_neon_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
1932
// 1x16 NEON MLAL-lane ukernel: k = 1, 10, ..., 37, n in [1, 16], m in [1, 1],
// cm_stride set to 19, one tester iteration per combination.
TEST(QU8_GEMM_MINMAX_FP32_1X16__NEON_MLAL_LANE, strided_cm_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t kc = 1; kc <= 40; kc += 9) {
    for (uint32_t nc = 1; nc <= 16; ++nc) {
      for (uint32_t mc = 1; mc <= 1; ++mc) {
        GemmMicrokernelTester()
          .mr(1).nr(16).kr(1).sr(1)
          .m(mc).n(nc).k(kc)
          .cm_stride(19)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x16__neon_mlal_lane,
            xnn_init_qu8_conv_minmax_fp32_neon_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
1953
// 1x16 NEON MLAL-lane ukernel: full tile (m=1, n=16) at k=8 with qmin set to 128.
TEST(QU8_GEMM_MINMAX_FP32_1X16__NEON_MLAL_LANE, qmin) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester()
    .mr(1).nr(16).kr(1).sr(1)
    .m(1).n(16).k(8)
    .qmin(128)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_1x16__neon_mlal_lane,
      xnn_init_qu8_conv_minmax_fp32_neon_params,
      xnn_qu8_requantize_fp32);
}
1967
// 1x16 NEON MLAL-lane ukernel: full tile (m=1, n=16) at k=8 with qmax set to 128.
TEST(QU8_GEMM_MINMAX_FP32_1X16__NEON_MLAL_LANE, qmax) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester()
    .mr(1).nr(16).kr(1).sr(1)
    .m(1).n(16).k(8)
    .qmax(128)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_1x16__neon_mlal_lane,
      xnn_init_qu8_conv_minmax_fp32_neon_params,
      xnn_qu8_requantize_fp32);
}
1981
// 1x16 NEON MLAL-lane ukernel: full tile (m=1, n=16) at k=8 with cm_stride set to 19.
TEST(QU8_GEMM_MINMAX_FP32_1X16__NEON_MLAL_LANE, strided_cm) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester()
    .mr(1).nr(16).kr(1).sr(1)
    .m(1).n(16).k(8)
    .cm_stride(19)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_1x16__neon_mlal_lane,
      xnn_init_qu8_conv_minmax_fp32_neon_params,
      xnn_qu8_requantize_fp32);
}
1995
// 1x16 NEON MLAL-lane ukernel: full tile (m=1, n=16), k = 1, 10, ..., 37, a_zero_point set to 0.
TEST(QU8_GEMM_MINMAX_FP32_1X16__NEON_MLAL_LANE, no_a_zero_point) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t kc = 1; kc <= 40; kc += 9) {
    GemmMicrokernelTester()
      .mr(1).nr(16).kr(1).sr(1)
      .m(1).n(16).k(kc)
      .a_zero_point(0)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_1x16__neon_mlal_lane,
        xnn_init_qu8_conv_minmax_fp32_neon_params,
        xnn_qu8_requantize_fp32);
  }
}
2011
// 1x16 NEON MLAL-lane ukernel: full tile (m=1, n=16), k = 1, 10, ..., 37, b_zero_point set to 0.
TEST(QU8_GEMM_MINMAX_FP32_1X16__NEON_MLAL_LANE, no_b_zero_point) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t kc = 1; kc <= 40; kc += 9) {
    GemmMicrokernelTester()
      .mr(1).nr(16).kr(1).sr(1)
      .m(1).n(16).k(kc)
      .b_zero_point(0)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_1x16__neon_mlal_lane,
        xnn_init_qu8_conv_minmax_fp32_neon_params,
        xnn_qu8_requantize_fp32);
  }
}
2027
// 1x16 NEON MLAL-lane ukernel: full tile (m=1, n=16), k = 1, 10, ..., 37,
// with both a_zero_point and b_zero_point set to 0.
TEST(QU8_GEMM_MINMAX_FP32_1X16__NEON_MLAL_LANE, no_zero_point) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t kc = 1; kc <= 40; kc += 9) {
    GemmMicrokernelTester()
      .mr(1).nr(16).kr(1).sr(1)
      .m(1).n(16).k(kc)
      .a_zero_point(0)
      .b_zero_point(0)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_1x16__neon_mlal_lane,
        xnn_init_qu8_conv_minmax_fp32_neon_params,
        xnn_qu8_requantize_fp32);
  }
}
2044 #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
2045
2046
2047 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
// 1x16 NEONv8 MLAL-lane ukernel: full tile (m=1, n=16) at k=8.
TEST(QU8_GEMM_MINMAX_FP32_1X16__NEONV8_MLAL_LANE, k_eq_8) {
  TEST_REQUIRES_ARM_NEON_V8;
  GemmMicrokernelTester()
    .mr(1).nr(16).kr(1).sr(1)
    .m(1).n(16).k(8)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane,
      xnn_init_qu8_conv_minmax_fp32_neonv8_params,
      xnn_qu8_requantize_fp32);
}
2060
// 1x16 NEONv8 MLAL-lane ukernel: full tile (m=1, n=16) at k=8 with cn_stride set to 19.
TEST(QU8_GEMM_MINMAX_FP32_1X16__NEONV8_MLAL_LANE, strided_cn) {
  TEST_REQUIRES_ARM_NEON_V8;
  GemmMicrokernelTester()
    .mr(1).nr(16).kr(1).sr(1)
    .m(1).n(16).k(8)
    .cn_stride(19)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane,
      xnn_init_qu8_conv_minmax_fp32_neonv8_params,
      xnn_qu8_requantize_fp32);
}
2074
// 1x16 NEONv8 MLAL-lane ukernel: full tile (m=1, n=16) at k=8 with a_stride set to 11.
TEST(QU8_GEMM_MINMAX_FP32_1X16__NEONV8_MLAL_LANE, k_eq_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_V8;
  GemmMicrokernelTester()
    .mr(1).nr(16).kr(1).sr(1)
    .m(1).n(16).k(8)
    .a_stride(11)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane,
      xnn_init_qu8_conv_minmax_fp32_neonv8_params,
      xnn_qu8_requantize_fp32);
}
2088
// 1x16 NEONv8 MLAL-lane ukernel: every (m, n) with n in [1, 16] and m in [1, 1] at k=8,
// one tester iteration per combination.
TEST(QU8_GEMM_MINMAX_FP32_1X16__NEONV8_MLAL_LANE, k_eq_8_subtile) {
  TEST_REQUIRES_ARM_NEON_V8;
  for (uint32_t nc = 1; nc <= 16; ++nc) {
    for (uint32_t mc = 1; mc <= 1; ++mc) {
      GemmMicrokernelTester()
        .mr(1).nr(16).kr(1).sr(1)
        .m(mc).n(nc).k(8)
        .iterations(1)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane,
          xnn_init_qu8_conv_minmax_fp32_neonv8_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
2106
// 1x16 NEONv8 MLAL-lane ukernel: sweeps m over [1, 1] with n=16 and k=8,
// one tester iteration per value.
TEST(QU8_GEMM_MINMAX_FP32_1X16__NEONV8_MLAL_LANE, k_eq_8_subtile_m) {
  TEST_REQUIRES_ARM_NEON_V8;
  for (uint32_t mc = 1; mc <= 1; ++mc) {
    GemmMicrokernelTester()
      .mr(1).nr(16).kr(1).sr(1)
      .m(mc).n(16).k(8)
      .iterations(1)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane,
        xnn_init_qu8_conv_minmax_fp32_neonv8_params,
        xnn_qu8_requantize_fp32);
  }
}
2122
// 1x16 NEONv8 MLAL-lane ukernel: sweeps n over [1, 16] with m=1 and k=8,
// one tester iteration per value.
TEST(QU8_GEMM_MINMAX_FP32_1X16__NEONV8_MLAL_LANE, k_eq_8_subtile_n) {
  TEST_REQUIRES_ARM_NEON_V8;
  for (uint32_t nc = 1; nc <= 16; ++nc) {
    GemmMicrokernelTester()
      .mr(1).nr(16).kr(1).sr(1)
      .m(1).n(nc).k(8)
      .iterations(1)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane,
        xnn_init_qu8_conv_minmax_fp32_neonv8_params,
        xnn_qu8_requantize_fp32);
  }
}
2138
// 1x16 NEONv8 MLAL-lane ukernel: full tile (m=1, n=16) for each k in [1, 7].
TEST(QU8_GEMM_MINMAX_FP32_1X16__NEONV8_MLAL_LANE, k_lt_8) {
  TEST_REQUIRES_ARM_NEON_V8;
  for (size_t kc = 1; kc < 8; ++kc) {
    GemmMicrokernelTester()
      .mr(1).nr(16).kr(1).sr(1)
      .m(1).n(16).k(kc)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane,
        xnn_init_qu8_conv_minmax_fp32_neonv8_params,
        xnn_qu8_requantize_fp32);
  }
}
2153
// 1x16 NEONv8 MLAL-lane ukernel: full tile (m=1, n=16) for each k in [1, 7] with a_stride set to 11.
TEST(QU8_GEMM_MINMAX_FP32_1X16__NEONV8_MLAL_LANE, k_lt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_V8;
  for (size_t kc = 1; kc < 8; ++kc) {
    GemmMicrokernelTester()
      .mr(1).nr(16).kr(1).sr(1)
      .m(1).n(16).k(kc)
      .a_stride(11)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane,
        xnn_init_qu8_conv_minmax_fp32_neonv8_params,
        xnn_qu8_requantize_fp32);
  }
}
2169
// 1x16 NEONv8 MLAL-lane ukernel: for each k in [1, 7], every (m, n) with n in [1, 16]
// and m in [1, 1], one tester iteration per combination.
TEST(QU8_GEMM_MINMAX_FP32_1X16__NEONV8_MLAL_LANE, k_lt_8_subtile) {
  TEST_REQUIRES_ARM_NEON_V8;
  for (size_t kc = 1; kc < 8; ++kc) {
    for (uint32_t nc = 1; nc <= 16; ++nc) {
      for (uint32_t mc = 1; mc <= 1; ++mc) {
        GemmMicrokernelTester()
          .mr(1).nr(16).kr(1).sr(1)
          .m(mc).n(nc).k(kc)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane,
            xnn_init_qu8_conv_minmax_fp32_neonv8_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
2189
// Sweeps k over [9, 15] with the full 1x16 tile (m = 1, n = 16) on the
// neonv8_mlal_lane microkernel.
TEST(QU8_GEMM_MINMAX_FP32_1X16__NEONV8_MLAL_LANE, k_gt_8) {
  TEST_REQUIRES_ARM_NEON_V8;
  for (size_t k_size = 9; k_size < 16; ++k_size) {
    GemmMicrokernelTester()
        .mr(1).nr(16).kr(1).sr(1)
        .m(1).n(16).k(k_size)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane,
            xnn_init_qu8_conv_minmax_fp32_neonv8_params,
            xnn_qu8_requantize_fp32);
  }
}
2204
// Sweeps k over [9, 15] with m = 1, n = 16 and a_stride set to 19 on the
// 1x16 neonv8_mlal_lane microkernel.
TEST(QU8_GEMM_MINMAX_FP32_1X16__NEONV8_MLAL_LANE, k_gt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_V8;
  for (size_t k_size = 9; k_size < 16; ++k_size) {
    GemmMicrokernelTester()
        .mr(1).nr(16).kr(1).sr(1)
        .m(1).n(16).k(k_size)
        .a_stride(19)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane,
            xnn_init_qu8_conv_minmax_fp32_neonv8_params,
            xnn_qu8_requantize_fp32);
  }
}
2220
// Sweeps k over [9, 15], n over [1, 16] and m over [1, 1] on the 1x16
// neonv8_mlal_lane microkernel, one iteration per (k, n, m) combination.
TEST(QU8_GEMM_MINMAX_FP32_1X16__NEONV8_MLAL_LANE, k_gt_8_subtile) {
  TEST_REQUIRES_ARM_NEON_V8;
  for (size_t k_size = 9; k_size < 16; ++k_size) {
    for (uint32_t n_size = 1; n_size <= 16; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 1; ++m_size) {
        GemmMicrokernelTester()
            .mr(1).nr(16).kr(1).sr(1)
            .m(m_size).n(n_size).k(k_size)
            .iterations(1)
            .Test(
                xnn_qu8_gemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane,
                xnn_init_qu8_conv_minmax_fp32_neonv8_params,
                xnn_qu8_requantize_fp32);
      }
    }
  }
}
2240
// Sweeps k over 16, 24, ..., 80 (step 8) with m = 1 and n = 16 on the 1x16
// neonv8_mlal_lane microkernel.
TEST(QU8_GEMM_MINMAX_FP32_1X16__NEONV8_MLAL_LANE, k_div_8) {
  TEST_REQUIRES_ARM_NEON_V8;
  for (size_t k_size = 16; k_size <= 80; k_size += 8) {
    GemmMicrokernelTester()
        .mr(1).nr(16).kr(1).sr(1)
        .m(1).n(16).k(k_size)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane,
            xnn_init_qu8_conv_minmax_fp32_neonv8_params,
            xnn_qu8_requantize_fp32);
  }
}
2255
// Sweeps k over 16, 24, ..., 80 (step 8) with m = 1, n = 16 and a_stride set
// to 83 on the 1x16 neonv8_mlal_lane microkernel.
TEST(QU8_GEMM_MINMAX_FP32_1X16__NEONV8_MLAL_LANE, k_div_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_V8;
  for (size_t k_size = 16; k_size <= 80; k_size += 8) {
    GemmMicrokernelTester()
        .mr(1).nr(16).kr(1).sr(1)
        .m(1).n(16).k(k_size)
        .a_stride(83)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane,
            xnn_init_qu8_conv_minmax_fp32_neonv8_params,
            xnn_qu8_requantize_fp32);
  }
}
2271
// Sweeps k over 16, 24, ..., 80 (step 8), n over [1, 16] and m over [1, 1] on
// the 1x16 neonv8_mlal_lane microkernel, one iteration per combination.
TEST(QU8_GEMM_MINMAX_FP32_1X16__NEONV8_MLAL_LANE, k_div_8_subtile) {
  TEST_REQUIRES_ARM_NEON_V8;
  for (size_t k_size = 16; k_size <= 80; k_size += 8) {
    for (uint32_t n_size = 1; n_size <= 16; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 1; ++m_size) {
        GemmMicrokernelTester()
            .mr(1).nr(16).kr(1).sr(1)
            .m(m_size).n(n_size).k(k_size)
            .iterations(1)
            .Test(
                xnn_qu8_gemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane,
                xnn_init_qu8_conv_minmax_fp32_neonv8_params,
                xnn_qu8_requantize_fp32);
      }
    }
  }
}
2291
// Sweeps n over [17, 31] and k over 1, 10, ..., 37 (step 9) with m = 1 on the
// 1x16 neonv8_mlal_lane microkernel.
TEST(QU8_GEMM_MINMAX_FP32_1X16__NEONV8_MLAL_LANE, n_gt_16) {
  TEST_REQUIRES_ARM_NEON_V8;
  for (uint32_t n_size = 17; n_size < 32; ++n_size) {
    for (size_t k_size = 1; k_size <= 40; k_size += 9) {
      GemmMicrokernelTester()
          .mr(1).nr(16).kr(1).sr(1)
          .m(1).n(n_size).k(k_size)
          .Test(
              xnn_qu8_gemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane,
              xnn_init_qu8_conv_minmax_fp32_neonv8_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
2308
// Sweeps n over [17, 31] and k over 1, 10, ..., 37 (step 9) with m = 1 and
// cn_stride set to 19 on the 1x16 neonv8_mlal_lane microkernel.
TEST(QU8_GEMM_MINMAX_FP32_1X16__NEONV8_MLAL_LANE, n_gt_16_strided_cn) {
  TEST_REQUIRES_ARM_NEON_V8;
  for (uint32_t n_size = 17; n_size < 32; ++n_size) {
    for (size_t k_size = 1; k_size <= 40; k_size += 9) {
      GemmMicrokernelTester()
          .mr(1).nr(16).kr(1).sr(1)
          .m(1).n(n_size).k(k_size)
          .cn_stride(19)
          .Test(
              xnn_qu8_gemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane,
              xnn_init_qu8_conv_minmax_fp32_neonv8_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
2326
// Sweeps n over [17, 31] and k over 1, 10, ..., 37 (step 9) with m = 1 and
// a_stride set to 43 on the 1x16 neonv8_mlal_lane microkernel.
TEST(QU8_GEMM_MINMAX_FP32_1X16__NEONV8_MLAL_LANE, n_gt_16_strided_a) {
  TEST_REQUIRES_ARM_NEON_V8;
  for (uint32_t n_size = 17; n_size < 32; ++n_size) {
    for (size_t k_size = 1; k_size <= 40; k_size += 9) {
      GemmMicrokernelTester()
          .mr(1).nr(16).kr(1).sr(1)
          .m(1).n(n_size).k(k_size)
          .a_stride(43)
          .Test(
              xnn_qu8_gemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane,
              xnn_init_qu8_conv_minmax_fp32_neonv8_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
2344
// Sweeps n over [17, 31], k over 1, 10, ..., 37 (step 9) and m over [1, 1] on
// the 1x16 neonv8_mlal_lane microkernel, one iteration per combination.
TEST(QU8_GEMM_MINMAX_FP32_1X16__NEONV8_MLAL_LANE, n_gt_16_subtile) {
  TEST_REQUIRES_ARM_NEON_V8;
  for (uint32_t n_size = 17; n_size < 32; ++n_size) {
    for (size_t k_size = 1; k_size <= 40; k_size += 9) {
      for (uint32_t m_size = 1; m_size <= 1; ++m_size) {
        GemmMicrokernelTester()
            .mr(1).nr(16).kr(1).sr(1)
            .m(m_size).n(n_size).k(k_size)
            .iterations(1)
            .Test(
                xnn_qu8_gemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane,
                xnn_init_qu8_conv_minmax_fp32_neonv8_params,
                xnn_qu8_requantize_fp32);
      }
    }
  }
}
2364
// Runs n = 32 and n = 48 (step 16) against k over 1, 10, ..., 37 (step 9)
// with m = 1 on the 1x16 neonv8_mlal_lane microkernel.
TEST(QU8_GEMM_MINMAX_FP32_1X16__NEONV8_MLAL_LANE, n_div_16) {
  TEST_REQUIRES_ARM_NEON_V8;
  for (uint32_t n_size = 32; n_size <= 48; n_size += 16) {
    for (size_t k_size = 1; k_size <= 40; k_size += 9) {
      GemmMicrokernelTester()
          .mr(1).nr(16).kr(1).sr(1)
          .m(1).n(n_size).k(k_size)
          .Test(
              xnn_qu8_gemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane,
              xnn_init_qu8_conv_minmax_fp32_neonv8_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
2381
// Runs n = 32 and n = 48 (step 16) against k over 1, 10, ..., 37 (step 9)
// with m = 1 and cn_stride set to 19 on the 1x16 neonv8_mlal_lane microkernel.
TEST(QU8_GEMM_MINMAX_FP32_1X16__NEONV8_MLAL_LANE, n_div_16_strided_cn) {
  TEST_REQUIRES_ARM_NEON_V8;
  for (uint32_t n_size = 32; n_size <= 48; n_size += 16) {
    for (size_t k_size = 1; k_size <= 40; k_size += 9) {
      GemmMicrokernelTester()
          .mr(1).nr(16).kr(1).sr(1)
          .m(1).n(n_size).k(k_size)
          .cn_stride(19)
          .Test(
              xnn_qu8_gemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane,
              xnn_init_qu8_conv_minmax_fp32_neonv8_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
2399
// Runs n = 32 and n = 48 (step 16) against k over 1, 10, ..., 37 (step 9)
// with m = 1 and a_stride set to 43 on the 1x16 neonv8_mlal_lane microkernel.
TEST(QU8_GEMM_MINMAX_FP32_1X16__NEONV8_MLAL_LANE, n_div_16_strided_a) {
  TEST_REQUIRES_ARM_NEON_V8;
  for (uint32_t n_size = 32; n_size <= 48; n_size += 16) {
    for (size_t k_size = 1; k_size <= 40; k_size += 9) {
      GemmMicrokernelTester()
          .mr(1).nr(16).kr(1).sr(1)
          .m(1).n(n_size).k(k_size)
          .a_stride(43)
          .Test(
              xnn_qu8_gemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane,
              xnn_init_qu8_conv_minmax_fp32_neonv8_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
2417
// Runs n = 32 and n = 48 (step 16), k over 1, 10, ..., 37 (step 9) and m over
// [1, 1] on the 1x16 neonv8_mlal_lane microkernel, one iteration each.
TEST(QU8_GEMM_MINMAX_FP32_1X16__NEONV8_MLAL_LANE, n_div_16_subtile) {
  TEST_REQUIRES_ARM_NEON_V8;
  for (uint32_t n_size = 32; n_size <= 48; n_size += 16) {
    for (size_t k_size = 1; k_size <= 40; k_size += 9) {
      for (uint32_t m_size = 1; m_size <= 1; ++m_size) {
        GemmMicrokernelTester()
            .mr(1).nr(16).kr(1).sr(1)
            .m(m_size).n(n_size).k(k_size)
            .iterations(1)
            .Test(
                xnn_qu8_gemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane,
                xnn_init_qu8_conv_minmax_fp32_neonv8_params,
                xnn_qu8_requantize_fp32);
      }
    }
  }
}
2437
// Sweeps k over 1, 10, ..., 37 (step 9), n over [1, 16] and m over [1, 1]
// with cm_stride set to 19 on the 1x16 neonv8_mlal_lane microkernel, one
// iteration per combination.
TEST(QU8_GEMM_MINMAX_FP32_1X16__NEONV8_MLAL_LANE, strided_cm_subtile) {
  TEST_REQUIRES_ARM_NEON_V8;
  for (size_t k_size = 1; k_size <= 40; k_size += 9) {
    for (uint32_t n_size = 1; n_size <= 16; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 1; ++m_size) {
        GemmMicrokernelTester()
            .mr(1).nr(16).kr(1).sr(1)
            .m(m_size).n(n_size).k(k_size)
            .cm_stride(19)
            .iterations(1)
            .Test(
                xnn_qu8_gemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane,
                xnn_init_qu8_conv_minmax_fp32_neonv8_params,
                xnn_qu8_requantize_fp32);
      }
    }
  }
}
2458
// Runs the full 1x16 tile (m = 1, n = 16, k = 8) with qmin set to 128 on the
// neonv8_mlal_lane microkernel.
TEST(QU8_GEMM_MINMAX_FP32_1X16__NEONV8_MLAL_LANE, qmin) {
  TEST_REQUIRES_ARM_NEON_V8;
  GemmMicrokernelTester()
      .mr(1).nr(16).kr(1).sr(1)
      .m(1).n(16).k(8)
      .qmin(128)
      .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane,
          xnn_init_qu8_conv_minmax_fp32_neonv8_params,
          xnn_qu8_requantize_fp32);
}
2472
// Runs the full 1x16 tile (m = 1, n = 16, k = 8) with qmax set to 128 on the
// neonv8_mlal_lane microkernel.
TEST(QU8_GEMM_MINMAX_FP32_1X16__NEONV8_MLAL_LANE, qmax) {
  TEST_REQUIRES_ARM_NEON_V8;
  GemmMicrokernelTester()
      .mr(1).nr(16).kr(1).sr(1)
      .m(1).n(16).k(8)
      .qmax(128)
      .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane,
          xnn_init_qu8_conv_minmax_fp32_neonv8_params,
          xnn_qu8_requantize_fp32);
}
2486
// Runs the full 1x16 tile (m = 1, n = 16, k = 8) with cm_stride set to 19 on
// the neonv8_mlal_lane microkernel.
TEST(QU8_GEMM_MINMAX_FP32_1X16__NEONV8_MLAL_LANE, strided_cm) {
  TEST_REQUIRES_ARM_NEON_V8;
  GemmMicrokernelTester()
      .mr(1).nr(16).kr(1).sr(1)
      .m(1).n(16).k(8)
      .cm_stride(19)
      .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane,
          xnn_init_qu8_conv_minmax_fp32_neonv8_params,
          xnn_qu8_requantize_fp32);
}
2500
// Sweeps k over 1, 10, ..., 37 (step 9) with m = 1, n = 16 and a_zero_point
// set to 0 on the 1x16 neonv8_mlal_lane microkernel.
TEST(QU8_GEMM_MINMAX_FP32_1X16__NEONV8_MLAL_LANE, no_a_zero_point) {
  TEST_REQUIRES_ARM_NEON_V8;
  for (size_t k_size = 1; k_size <= 40; k_size += 9) {
    GemmMicrokernelTester()
        .mr(1).nr(16).kr(1).sr(1)
        .m(1).n(16).k(k_size)
        .a_zero_point(0)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane,
            xnn_init_qu8_conv_minmax_fp32_neonv8_params,
            xnn_qu8_requantize_fp32);
  }
}
2516
// Sweeps k over 1, 10, ..., 37 (step 9) with m = 1, n = 16 and b_zero_point
// set to 0 on the 1x16 neonv8_mlal_lane microkernel.
TEST(QU8_GEMM_MINMAX_FP32_1X16__NEONV8_MLAL_LANE, no_b_zero_point) {
  TEST_REQUIRES_ARM_NEON_V8;
  for (size_t k_size = 1; k_size <= 40; k_size += 9) {
    GemmMicrokernelTester()
        .mr(1).nr(16).kr(1).sr(1)
        .m(1).n(16).k(k_size)
        .b_zero_point(0)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane,
            xnn_init_qu8_conv_minmax_fp32_neonv8_params,
            xnn_qu8_requantize_fp32);
  }
}
2532
// Sweeps k over 1, 10, ..., 37 (step 9) with m = 1, n = 16 and both
// a_zero_point and b_zero_point set to 0 on the 1x16 neonv8_mlal_lane
// microkernel.
TEST(QU8_GEMM_MINMAX_FP32_1X16__NEONV8_MLAL_LANE, no_zero_point) {
  TEST_REQUIRES_ARM_NEON_V8;
  for (size_t k_size = 1; k_size <= 40; k_size += 9) {
    GemmMicrokernelTester()
        .mr(1).nr(16).kr(1).sr(1)
        .m(1).n(16).k(k_size)
        .a_zero_point(0)
        .b_zero_point(0)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane,
            xnn_init_qu8_conv_minmax_fp32_neonv8_params,
            xnn_qu8_requantize_fp32);
  }
}
2549 #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
2550
2551
2552 #if XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM && !XNN_PLATFORM_IOS || XNN_ARCH_ARM64)
// Runs the full 2x16 tile (m = 2, n = 16) with k = 8 on the 2x16c4 neondot
// microkernel.
TEST(QU8_GEMM_MINMAX_FP32_2X16C4__NEONDOT, k_eq_8) {
  TEST_REQUIRES_ARM_NEON_DOT;
  GemmMicrokernelTester()
      .mr(2).nr(16).kr(4).sr(1)
      .m(2).n(16).k(8)
      .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_2x16c4__neondot,
          xnn_init_qu8_conv_minmax_fp32_neonv8_params,
          xnn_qu8_requantize_fp32);
}
2565
// Runs the full 2x16 tile (m = 2, n = 16, k = 8) with cn_stride set to 19 on
// the 2x16c4 neondot microkernel.
TEST(QU8_GEMM_MINMAX_FP32_2X16C4__NEONDOT, strided_cn) {
  TEST_REQUIRES_ARM_NEON_DOT;
  GemmMicrokernelTester()
      .mr(2).nr(16).kr(4).sr(1)
      .m(2).n(16).k(8)
      .cn_stride(19)
      .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_2x16c4__neondot,
          xnn_init_qu8_conv_minmax_fp32_neonv8_params,
          xnn_qu8_requantize_fp32);
}
2579
// Runs the full 2x16 tile (m = 2, n = 16, k = 8) with a_stride set to 11 on
// the 2x16c4 neondot microkernel.
TEST(QU8_GEMM_MINMAX_FP32_2X16C4__NEONDOT, k_eq_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_DOT;
  GemmMicrokernelTester()
      .mr(2).nr(16).kr(4).sr(1)
      .m(2).n(16).k(8)
      .a_stride(11)
      .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_2x16c4__neondot,
          xnn_init_qu8_conv_minmax_fp32_neonv8_params,
          xnn_qu8_requantize_fp32);
}
2593
// Sweeps n over [1, 16] and m over [1, 2] with k = 8 on the 2x16c4 neondot
// microkernel, one iteration per (n, m) combination.
TEST(QU8_GEMM_MINMAX_FP32_2X16C4__NEONDOT, k_eq_8_subtile) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (uint32_t n_size = 1; n_size <= 16; ++n_size) {
    for (uint32_t m_size = 1; m_size <= 2; ++m_size) {
      GemmMicrokernelTester()
          .mr(2).nr(16).kr(4).sr(1)
          .m(m_size).n(n_size).k(8)
          .iterations(1)
          .Test(
              xnn_qu8_gemm_minmax_fp32_ukernel_2x16c4__neondot,
              xnn_init_qu8_conv_minmax_fp32_neonv8_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
2611
// Sweeps m over [1, 2] with n = 16 and k = 8 on the 2x16c4 neondot
// microkernel, one iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_2X16C4__NEONDOT, k_eq_8_subtile_m) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (uint32_t m_size = 1; m_size <= 2; ++m_size) {
    GemmMicrokernelTester()
        .mr(2).nr(16).kr(4).sr(1)
        .m(m_size).n(16).k(8)
        .iterations(1)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_2x16c4__neondot,
            xnn_init_qu8_conv_minmax_fp32_neonv8_params,
            xnn_qu8_requantize_fp32);
  }
}
2627
// Sweeps n over [1, 16] with m = 2 and k = 8 on the 2x16c4 neondot
// microkernel, one iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_2X16C4__NEONDOT, k_eq_8_subtile_n) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (uint32_t n_size = 1; n_size <= 16; ++n_size) {
    GemmMicrokernelTester()
        .mr(2).nr(16).kr(4).sr(1)
        .m(2).n(n_size).k(8)
        .iterations(1)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_2x16c4__neondot,
            xnn_init_qu8_conv_minmax_fp32_neonv8_params,
            xnn_qu8_requantize_fp32);
  }
}
2643
// Sweeps k over [1, 7] with the full 2x16 tile (m = 2, n = 16) on the 2x16c4
// neondot microkernel.
TEST(QU8_GEMM_MINMAX_FP32_2X16C4__NEONDOT, k_lt_8) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (size_t k_size = 1; k_size < 8; ++k_size) {
    GemmMicrokernelTester()
        .mr(2).nr(16).kr(4).sr(1)
        .m(2).n(16).k(k_size)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_2x16c4__neondot,
            xnn_init_qu8_conv_minmax_fp32_neonv8_params,
            xnn_qu8_requantize_fp32);
  }
}
2658
// Sweeps k over [1, 7] with m = 2, n = 16 and a_stride set to 11 on the
// 2x16c4 neondot microkernel.
TEST(QU8_GEMM_MINMAX_FP32_2X16C4__NEONDOT, k_lt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (size_t k_size = 1; k_size < 8; ++k_size) {
    GemmMicrokernelTester()
        .mr(2).nr(16).kr(4).sr(1)
        .m(2).n(16).k(k_size)
        .a_stride(11)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_2x16c4__neondot,
            xnn_init_qu8_conv_minmax_fp32_neonv8_params,
            xnn_qu8_requantize_fp32);
  }
}
2674
// Sweeps k over [1, 7], n over [1, 16] and m over [1, 2] on the 2x16c4
// neondot microkernel, one iteration per (k, n, m) combination.
TEST(QU8_GEMM_MINMAX_FP32_2X16C4__NEONDOT, k_lt_8_subtile) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (size_t k_size = 1; k_size < 8; ++k_size) {
    for (uint32_t n_size = 1; n_size <= 16; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 2; ++m_size) {
        GemmMicrokernelTester()
            .mr(2).nr(16).kr(4).sr(1)
            .m(m_size).n(n_size).k(k_size)
            .iterations(1)
            .Test(
                xnn_qu8_gemm_minmax_fp32_ukernel_2x16c4__neondot,
                xnn_init_qu8_conv_minmax_fp32_neonv8_params,
                xnn_qu8_requantize_fp32);
      }
    }
  }
}
2694
// Sweeps k over [9, 15] with the full 2x16 tile (m = 2, n = 16) on the 2x16c4
// neondot microkernel.
TEST(QU8_GEMM_MINMAX_FP32_2X16C4__NEONDOT, k_gt_8) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (size_t k_size = 9; k_size < 16; ++k_size) {
    GemmMicrokernelTester()
        .mr(2).nr(16).kr(4).sr(1)
        .m(2).n(16).k(k_size)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_2x16c4__neondot,
            xnn_init_qu8_conv_minmax_fp32_neonv8_params,
            xnn_qu8_requantize_fp32);
  }
}
2709
// Sweeps k over [9, 15] with m = 2, n = 16 and a_stride set to 19 on the
// 2x16c4 neondot microkernel.
TEST(QU8_GEMM_MINMAX_FP32_2X16C4__NEONDOT, k_gt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (size_t k_size = 9; k_size < 16; ++k_size) {
    GemmMicrokernelTester()
        .mr(2).nr(16).kr(4).sr(1)
        .m(2).n(16).k(k_size)
        .a_stride(19)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_2x16c4__neondot,
            xnn_init_qu8_conv_minmax_fp32_neonv8_params,
            xnn_qu8_requantize_fp32);
  }
}
2725
// Sweeps k over [9, 15], n over [1, 16] and m over [1, 2] on the 2x16c4
// neondot microkernel, one iteration per (k, n, m) combination.
TEST(QU8_GEMM_MINMAX_FP32_2X16C4__NEONDOT, k_gt_8_subtile) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (size_t k_size = 9; k_size < 16; ++k_size) {
    for (uint32_t n_size = 1; n_size <= 16; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 2; ++m_size) {
        GemmMicrokernelTester()
            .mr(2).nr(16).kr(4).sr(1)
            .m(m_size).n(n_size).k(k_size)
            .iterations(1)
            .Test(
                xnn_qu8_gemm_minmax_fp32_ukernel_2x16c4__neondot,
                xnn_init_qu8_conv_minmax_fp32_neonv8_params,
                xnn_qu8_requantize_fp32);
      }
    }
  }
}
2745
// Sweeps k over 16, 24, ..., 80 (step 8) with m = 2 and n = 16 on the 2x16c4
// neondot microkernel.
TEST(QU8_GEMM_MINMAX_FP32_2X16C4__NEONDOT, k_div_8) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (size_t k_size = 16; k_size <= 80; k_size += 8) {
    GemmMicrokernelTester()
        .mr(2).nr(16).kr(4).sr(1)
        .m(2).n(16).k(k_size)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_2x16c4__neondot,
            xnn_init_qu8_conv_minmax_fp32_neonv8_params,
            xnn_qu8_requantize_fp32);
  }
}
2760
// Sweeps k over 16, 24, ..., 80 (step 8) with m = 2, n = 16 and a_stride set
// to 83 on the 2x16c4 neondot microkernel.
TEST(QU8_GEMM_MINMAX_FP32_2X16C4__NEONDOT, k_div_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (size_t k_size = 16; k_size <= 80; k_size += 8) {
    GemmMicrokernelTester()
        .mr(2).nr(16).kr(4).sr(1)
        .m(2).n(16).k(k_size)
        .a_stride(83)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_2x16c4__neondot,
            xnn_init_qu8_conv_minmax_fp32_neonv8_params,
            xnn_qu8_requantize_fp32);
  }
}
2776
// Sweeps k over 16, 24, ..., 80 (step 8), n over [1, 16] and m over [1, 2] on
// the 2x16c4 neondot microkernel, one iteration per combination.
TEST(QU8_GEMM_MINMAX_FP32_2X16C4__NEONDOT, k_div_8_subtile) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (size_t k_size = 16; k_size <= 80; k_size += 8) {
    for (uint32_t n_size = 1; n_size <= 16; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 2; ++m_size) {
        GemmMicrokernelTester()
            .mr(2).nr(16).kr(4).sr(1)
            .m(m_size).n(n_size).k(k_size)
            .iterations(1)
            .Test(
                xnn_qu8_gemm_minmax_fp32_ukernel_2x16c4__neondot,
                xnn_init_qu8_conv_minmax_fp32_neonv8_params,
                xnn_qu8_requantize_fp32);
      }
    }
  }
}
2796
// Sweeps n over [17, 31] and k over 1, 10, ..., 37 (step 9) with m = 2 on the
// 2x16c4 neondot microkernel.
TEST(QU8_GEMM_MINMAX_FP32_2X16C4__NEONDOT, n_gt_16) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (uint32_t n_size = 17; n_size < 32; ++n_size) {
    for (size_t k_size = 1; k_size <= 40; k_size += 9) {
      GemmMicrokernelTester()
          .mr(2).nr(16).kr(4).sr(1)
          .m(2).n(n_size).k(k_size)
          .Test(
              xnn_qu8_gemm_minmax_fp32_ukernel_2x16c4__neondot,
              xnn_init_qu8_conv_minmax_fp32_neonv8_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
2813
// Sweeps n over [17, 31] and k over 1, 10, ..., 37 (step 9) with m = 2 and
// cn_stride set to 19 on the 2x16c4 neondot microkernel.
TEST(QU8_GEMM_MINMAX_FP32_2X16C4__NEONDOT, n_gt_16_strided_cn) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (uint32_t n_size = 17; n_size < 32; ++n_size) {
    for (size_t k_size = 1; k_size <= 40; k_size += 9) {
      GemmMicrokernelTester()
          .mr(2).nr(16).kr(4).sr(1)
          .m(2).n(n_size).k(k_size)
          .cn_stride(19)
          .Test(
              xnn_qu8_gemm_minmax_fp32_ukernel_2x16c4__neondot,
              xnn_init_qu8_conv_minmax_fp32_neonv8_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
2831
// Sweeps n over [17, 31] and k over 1, 10, ..., 37 (step 9) with m = 2 and
// a_stride set to 43 on the 2x16c4 neondot microkernel.
TEST(QU8_GEMM_MINMAX_FP32_2X16C4__NEONDOT, n_gt_16_strided_a) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (uint32_t n_size = 17; n_size < 32; ++n_size) {
    for (size_t k_size = 1; k_size <= 40; k_size += 9) {
      GemmMicrokernelTester()
          .mr(2).nr(16).kr(4).sr(1)
          .m(2).n(n_size).k(k_size)
          .a_stride(43)
          .Test(
              xnn_qu8_gemm_minmax_fp32_ukernel_2x16c4__neondot,
              xnn_init_qu8_conv_minmax_fp32_neonv8_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
2849
// Sweeps n over [17, 31], k over 1, 10, ..., 37 (step 9) and m over [1, 2] on
// the 2x16c4 neondot microkernel, one iteration per combination.
TEST(QU8_GEMM_MINMAX_FP32_2X16C4__NEONDOT, n_gt_16_subtile) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (uint32_t n_size = 17; n_size < 32; ++n_size) {
    for (size_t k_size = 1; k_size <= 40; k_size += 9) {
      for (uint32_t m_size = 1; m_size <= 2; ++m_size) {
        GemmMicrokernelTester()
            .mr(2).nr(16).kr(4).sr(1)
            .m(m_size).n(n_size).k(k_size)
            .iterations(1)
            .Test(
                xnn_qu8_gemm_minmax_fp32_ukernel_2x16c4__neondot,
                xnn_init_qu8_conv_minmax_fp32_neonv8_params,
                xnn_qu8_requantize_fp32);
      }
    }
  }
}
2869
// Runs n = 32 and n = 48 (step 16) against k over 1, 10, ..., 37 (step 9)
// with m = 2 on the 2x16c4 neondot microkernel.
TEST(QU8_GEMM_MINMAX_FP32_2X16C4__NEONDOT, n_div_16) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (uint32_t n_size = 32; n_size <= 48; n_size += 16) {
    for (size_t k_size = 1; k_size <= 40; k_size += 9) {
      GemmMicrokernelTester()
          .mr(2).nr(16).kr(4).sr(1)
          .m(2).n(n_size).k(k_size)
          .Test(
              xnn_qu8_gemm_minmax_fp32_ukernel_2x16c4__neondot,
              xnn_init_qu8_conv_minmax_fp32_neonv8_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
2886
// Runs n = 32 and n = 48 (step 16) against k over 1, 10, ..., 37 (step 9)
// with m = 2 and cn_stride set to 19 on the 2x16c4 neondot microkernel.
TEST(QU8_GEMM_MINMAX_FP32_2X16C4__NEONDOT, n_div_16_strided_cn) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (uint32_t n_size = 32; n_size <= 48; n_size += 16) {
    for (size_t k_size = 1; k_size <= 40; k_size += 9) {
      GemmMicrokernelTester()
          .mr(2).nr(16).kr(4).sr(1)
          .m(2).n(n_size).k(k_size)
          .cn_stride(19)
          .Test(
              xnn_qu8_gemm_minmax_fp32_ukernel_2x16c4__neondot,
              xnn_init_qu8_conv_minmax_fp32_neonv8_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
2904
// Runs n = 32 and n = 48 (step 16) against k over 1, 10, ..., 37 (step 9)
// with m = 2 and a_stride set to 43 on the 2x16c4 neondot microkernel.
TEST(QU8_GEMM_MINMAX_FP32_2X16C4__NEONDOT, n_div_16_strided_a) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (uint32_t n_size = 32; n_size <= 48; n_size += 16) {
    for (size_t k_size = 1; k_size <= 40; k_size += 9) {
      GemmMicrokernelTester()
          .mr(2).nr(16).kr(4).sr(1)
          .m(2).n(n_size).k(k_size)
          .a_stride(43)
          .Test(
              xnn_qu8_gemm_minmax_fp32_ukernel_2x16c4__neondot,
              xnn_init_qu8_conv_minmax_fp32_neonv8_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
2922
// Runs n = 32 and n = 48 (step 16), k over 1, 10, ..., 37 (step 9) and m over
// [1, 2] on the 2x16c4 neondot microkernel, one iteration each.
TEST(QU8_GEMM_MINMAX_FP32_2X16C4__NEONDOT, n_div_16_subtile) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (uint32_t n_size = 32; n_size <= 48; n_size += 16) {
    for (size_t k_size = 1; k_size <= 40; k_size += 9) {
      for (uint32_t m_size = 1; m_size <= 2; ++m_size) {
        GemmMicrokernelTester()
            .mr(2).nr(16).kr(4).sr(1)
            .m(m_size).n(n_size).k(k_size)
            .iterations(1)
            .Test(
                xnn_qu8_gemm_minmax_fp32_ukernel_2x16c4__neondot,
                xnn_init_qu8_conv_minmax_fp32_neonv8_params,
                xnn_qu8_requantize_fp32);
      }
    }
  }
}
2942
// Sweeps k over 1, 10, ..., 37 (step 9), n over [1, 16] and m over [1, 2]
// with cm_stride set to 19 on the 2x16c4 neondot microkernel, one iteration
// per combination.
TEST(QU8_GEMM_MINMAX_FP32_2X16C4__NEONDOT, strided_cm_subtile) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (size_t k_size = 1; k_size <= 40; k_size += 9) {
    for (uint32_t n_size = 1; n_size <= 16; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 2; ++m_size) {
        GemmMicrokernelTester()
            .mr(2).nr(16).kr(4).sr(1)
            .m(m_size).n(n_size).k(k_size)
            .cm_stride(19)
            .iterations(1)
            .Test(
                xnn_qu8_gemm_minmax_fp32_ukernel_2x16c4__neondot,
                xnn_init_qu8_conv_minmax_fp32_neonv8_params,
                xnn_qu8_requantize_fp32);
      }
    }
  }
}
2963
// Runs the full 2x16 tile (m = 2, n = 16, k = 8) with qmin set to 128 on the
// 2x16c4 neondot microkernel.
TEST(QU8_GEMM_MINMAX_FP32_2X16C4__NEONDOT, qmin) {
  TEST_REQUIRES_ARM_NEON_DOT;
  GemmMicrokernelTester()
      .mr(2).nr(16).kr(4).sr(1)
      .m(2).n(16).k(8)
      .qmin(128)
      .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_2x16c4__neondot,
          xnn_init_qu8_conv_minmax_fp32_neonv8_params,
          xnn_qu8_requantize_fp32);
}
2977
// Runs the full 2x16 tile (m = 2, n = 16, k = 8) with qmax set to 128 on the
// 2x16c4 neondot microkernel.
TEST(QU8_GEMM_MINMAX_FP32_2X16C4__NEONDOT, qmax) {
  TEST_REQUIRES_ARM_NEON_DOT;
  GemmMicrokernelTester()
      .mr(2).nr(16).kr(4).sr(1)
      .m(2).n(16).k(8)
      .qmax(128)
      .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_2x16c4__neondot,
          xnn_init_qu8_conv_minmax_fp32_neonv8_params,
          xnn_qu8_requantize_fp32);
}
2991
// Runs the full 2x16 tile (m = 2, n = 16, k = 8) with cm_stride set to 19 on
// the 2x16c4 neondot microkernel.
TEST(QU8_GEMM_MINMAX_FP32_2X16C4__NEONDOT, strided_cm) {
  TEST_REQUIRES_ARM_NEON_DOT;
  GemmMicrokernelTester()
      .mr(2).nr(16).kr(4).sr(1)
      .m(2).n(16).k(8)
      .cm_stride(19)
      .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_2x16c4__neondot,
          xnn_init_qu8_conv_minmax_fp32_neonv8_params,
          xnn_qu8_requantize_fp32);
}
3005
// Sweeps k over 1, 10, ..., 37 (step 9) with m = 2, n = 16 and a_zero_point
// set to 0 on the 2x16c4 neondot microkernel.
TEST(QU8_GEMM_MINMAX_FP32_2X16C4__NEONDOT, no_a_zero_point) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (size_t k_size = 1; k_size <= 40; k_size += 9) {
    GemmMicrokernelTester()
        .mr(2).nr(16).kr(4).sr(1)
        .m(2).n(16).k(k_size)
        .a_zero_point(0)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_2x16c4__neondot,
            xnn_init_qu8_conv_minmax_fp32_neonv8_params,
            xnn_qu8_requantize_fp32);
  }
}
3021
// Sweeps k over 1, 10, ..., 37 (step 9) with m = 2, n = 16 and b_zero_point
// set to 0 on the 2x16c4 neondot microkernel.
TEST(QU8_GEMM_MINMAX_FP32_2X16C4__NEONDOT, no_b_zero_point) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (size_t k_size = 1; k_size <= 40; k_size += 9) {
    GemmMicrokernelTester()
        .mr(2).nr(16).kr(4).sr(1)
        .m(2).n(16).k(k_size)
        .b_zero_point(0)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_2x16c4__neondot,
            xnn_init_qu8_conv_minmax_fp32_neonv8_params,
            xnn_qu8_requantize_fp32);
  }
}
3037
// Sweeps k over 1, 10, ..., 37 (step 9) with m = 2, n = 16 and both
// a_zero_point and b_zero_point set to 0 on the 2x16c4 neondot microkernel.
TEST(QU8_GEMM_MINMAX_FP32_2X16C4__NEONDOT, no_zero_point) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (size_t k_size = 1; k_size <= 40; k_size += 9) {
    GemmMicrokernelTester()
        .mr(2).nr(16).kr(4).sr(1)
        .m(2).n(16).k(k_size)
        .a_zero_point(0)
        .b_zero_point(0)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_2x16c4__neondot,
            xnn_init_qu8_conv_minmax_fp32_neonv8_params,
            xnn_qu8_requantize_fp32);
  }
}
3054 #endif // XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM && !XNN_PLATFORM_IOS || XNN_ARCH_ARM64)
3055
3056
3057 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
// Runs the full 4x8 tile (m = 4, n = 8) with k = 8 on the 4x8 neon_mlal_lane
// microkernel.
TEST(QU8_GEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, k_eq_8) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(8)
      .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_4x8__neon_mlal_lane,
          xnn_init_qu8_conv_minmax_fp32_neon_params,
          xnn_qu8_requantize_fp32);
}
3070
// Runs the full 4x8 tile (m = 4, n = 8, k = 8) with cn_stride set to 11 on
// the 4x8 neon_mlal_lane microkernel.
TEST(QU8_GEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, strided_cn) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(8)
      .cn_stride(11)
      .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_4x8__neon_mlal_lane,
          xnn_init_qu8_conv_minmax_fp32_neon_params,
          xnn_qu8_requantize_fp32);
}
3084
// Runs the full 4x8 tile (m = 4, n = 8, k = 8) with a_stride set to 11 on the
// 4x8 neon_mlal_lane microkernel.
TEST(QU8_GEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, k_eq_8_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(8)
      .a_stride(11)
      .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_4x8__neon_mlal_lane,
          xnn_init_qu8_conv_minmax_fp32_neon_params,
          xnn_qu8_requantize_fp32);
}
3098
// Sweeps n over [1, 8] and m over [1, 4] with k = 8 on the 4x8 neon_mlal_lane
// microkernel, one iteration per (n, m) combination.
TEST(QU8_GEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, k_eq_8_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
    for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
      GemmMicrokernelTester()
          .mr(4).nr(8).kr(1).sr(1)
          .m(m_size).n(n_size).k(8)
          .iterations(1)
          .Test(
              xnn_qu8_gemm_minmax_fp32_ukernel_4x8__neon_mlal_lane,
              xnn_init_qu8_conv_minmax_fp32_neon_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
3116
// Sweeps m over [1, 4] with n = 8 and k = 8 on the 4x8 neon_mlal_lane
// microkernel, one iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, k_eq_8_subtile_m) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
    GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(1)
        .m(m_size).n(8).k(8)
        .iterations(1)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_4x8__neon_mlal_lane,
            xnn_init_qu8_conv_minmax_fp32_neon_params,
            xnn_qu8_requantize_fp32);
  }
}
3132
// Sweeps n over [1, 8] with m = 4 and k = 8 on the 4x8 neon_mlal_lane
// microkernel, one iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, k_eq_8_subtile_n) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
    GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(1)
        .m(4).n(n_size).k(8)
        .iterations(1)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_4x8__neon_mlal_lane,
            xnn_init_qu8_conv_minmax_fp32_neon_params,
            xnn_qu8_requantize_fp32);
  }
}
3148
// Sweeps k over [1, 7] with the full 4x8 tile (m = 4, n = 8) on the 4x8
// neon_mlal_lane microkernel.
TEST(QU8_GEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, k_lt_8) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k_size = 1; k_size < 8; ++k_size) {
    GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(1)
        .m(4).n(8).k(k_size)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_4x8__neon_mlal_lane,
            xnn_init_qu8_conv_minmax_fp32_neon_params,
            xnn_qu8_requantize_fp32);
  }
}
3163
// Sweeps k over [1, 7] with m = 4, n = 8 and a_stride set to 11 on the 4x8
// neon_mlal_lane microkernel.
TEST(QU8_GEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, k_lt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k_size = 1; k_size < 8; ++k_size) {
    GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(1)
        .m(4).n(8).k(k_size)
        .a_stride(11)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_4x8__neon_mlal_lane,
            xnn_init_qu8_conv_minmax_fp32_neon_params,
            xnn_qu8_requantize_fp32);
  }
}
3179
// Sweeps k over [1, 7], n over [1, 8] and m over [1, 4] on the 4x8
// neon_mlal_lane microkernel, one iteration per (k, n, m) combination.
TEST(QU8_GEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, k_lt_8_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k_size = 1; k_size < 8; ++k_size) {
    for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
        GemmMicrokernelTester()
            .mr(4).nr(8).kr(1).sr(1)
            .m(m_size).n(n_size).k(k_size)
            .iterations(1)
            .Test(
                xnn_qu8_gemm_minmax_fp32_ukernel_4x8__neon_mlal_lane,
                xnn_init_qu8_conv_minmax_fp32_neon_params,
                xnn_qu8_requantize_fp32);
      }
    }
  }
}
3199
// Sweeps k over [9, 15] with the full 4x8 tile (m = 4, n = 8) on the 4x8
// neon_mlal_lane microkernel.
TEST(QU8_GEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, k_gt_8) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k_size = 9; k_size < 16; ++k_size) {
    GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(1)
        .m(4).n(8).k(k_size)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_4x8__neon_mlal_lane,
            xnn_init_qu8_conv_minmax_fp32_neon_params,
            xnn_qu8_requantize_fp32);
  }
}
3214
// Sweeps k over [9, 15] with m = 4, n = 8 and a_stride set to 19 on the 4x8
// neon_mlal_lane microkernel.
TEST(QU8_GEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, k_gt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k_size = 9; k_size < 16; ++k_size) {
    GemmMicrokernelTester()
        .mr(4).nr(8).kr(1).sr(1)
        .m(4).n(8).k(k_size)
        .a_stride(19)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_4x8__neon_mlal_lane,
            xnn_init_qu8_conv_minmax_fp32_neon_params,
            xnn_qu8_requantize_fp32);
  }
}
3230
// Sweeps k over [9, 15], n over [1, 8] and m over [1, 4] on the 4x8
// neon_mlal_lane microkernel, one iteration per (k, n, m) combination.
TEST(QU8_GEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, k_gt_8_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k_size = 9; k_size < 16; ++k_size) {
    for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
        GemmMicrokernelTester()
            .mr(4).nr(8).kr(1).sr(1)
            .m(m_size).n(n_size).k(k_size)
            .iterations(1)
            .Test(
                xnn_qu8_gemm_minmax_fp32_ukernel_4x8__neon_mlal_lane,
                xnn_init_qu8_conv_minmax_fp32_neon_params,
                xnn_qu8_requantize_fp32);
      }
    }
  }
}
3250
// Exercises the 4x8__neon_mlal_lane microkernel on the full 4x8 tile for
// k = 16, 24, ..., 80 (multiples of 8).
TEST(QU8_GEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, k_div_8) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 16; k <= 80; k += 8) {
    auto tester = GemmMicrokernelTester();
    tester.mr(4).nr(8).kr(1).sr(1);
    tester.m(4).n(8).k(k);
    tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_qu8_requantize_fp32);
  }
}
3265
// Exercises the 4x8__neon_mlal_lane microkernel on the full 4x8 tile for
// k = 16, 24, ..., 80 with a_stride set to 83.
TEST(QU8_GEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, k_div_8_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 16; k <= 80; k += 8) {
    auto tester = GemmMicrokernelTester();
    tester.mr(4).nr(8).kr(1).sr(1);
    tester.m(4).n(8).k(k);
    tester.a_stride(83);
    tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_qu8_requantize_fp32);
  }
}
3281
// Exercises the 4x8__neon_mlal_lane microkernel for k = 16, 24, ..., 80 over
// every (n, m) sub-tile with n = 1..8 and m = 1..4, using iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, k_div_8_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 16; k <= 80; k += 8) {
    for (uint32_t n = 1; n <= 8; ++n) {
      for (uint32_t m = 1; m <= 4; ++m) {
        auto tester = GemmMicrokernelTester();
        tester.mr(4).nr(8).kr(1).sr(1);
        tester.m(m).n(n).k(k);
        tester.iterations(1);
        tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_qu8_requantize_fp32);
      }
    }
  }
}
3301
// Exercises the 4x8__neon_mlal_lane microkernel with m = 4 for n = 9..15,
// sweeping k = 1, 10, 19, 28, 37 for each n.
TEST(QU8_GEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, n_gt_8) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n = 9; n <= 15; ++n) {
    for (size_t k = 1; k <= 40; k += 9) {
      auto tester = GemmMicrokernelTester();
      tester.mr(4).nr(8).kr(1).sr(1);
      tester.m(4).n(n).k(k);
      tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_qu8_requantize_fp32);
    }
  }
}
3318
// Exercises the 4x8__neon_mlal_lane microkernel with m = 4 for n = 9..15 and
// k = 1, 10, 19, 28, 37, with cn_stride set to 11.
TEST(QU8_GEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, n_gt_8_strided_cn) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n = 9; n <= 15; ++n) {
    for (size_t k = 1; k <= 40; k += 9) {
      auto tester = GemmMicrokernelTester();
      tester.mr(4).nr(8).kr(1).sr(1);
      tester.m(4).n(n).k(k);
      tester.cn_stride(11);
      tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_qu8_requantize_fp32);
    }
  }
}
3336
// Exercises the 4x8__neon_mlal_lane microkernel with m = 4 for n = 9..15 and
// k = 1, 10, 19, 28, 37, with a_stride set to 43.
TEST(QU8_GEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, n_gt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n = 9; n <= 15; ++n) {
    for (size_t k = 1; k <= 40; k += 9) {
      auto tester = GemmMicrokernelTester();
      tester.mr(4).nr(8).kr(1).sr(1);
      tester.m(4).n(n).k(k);
      tester.a_stride(43);
      tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_qu8_requantize_fp32);
    }
  }
}
3354
// Exercises the 4x8__neon_mlal_lane microkernel for n = 9..15,
// k = 1, 10, 19, 28, 37 and m = 1..4, using iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, n_gt_8_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n = 9; n <= 15; ++n) {
    for (size_t k = 1; k <= 40; k += 9) {
      for (uint32_t m = 1; m <= 4; ++m) {
        auto tester = GemmMicrokernelTester();
        tester.mr(4).nr(8).kr(1).sr(1);
        tester.m(m).n(n).k(k);
        tester.iterations(1);
        tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_qu8_requantize_fp32);
      }
    }
  }
}
3374
// Exercises the 4x8__neon_mlal_lane microkernel with m = 4 for n = 16 and 24,
// sweeping k = 1, 10, 19, 28, 37 for each n.
TEST(QU8_GEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, n_div_8) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n = 16; n <= 24; n += 8) {
    for (size_t k = 1; k <= 40; k += 9) {
      auto tester = GemmMicrokernelTester();
      tester.mr(4).nr(8).kr(1).sr(1);
      tester.m(4).n(n).k(k);
      tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_qu8_requantize_fp32);
    }
  }
}
3391
// Exercises the 4x8__neon_mlal_lane microkernel with m = 4 for n = 16 and 24
// and k = 1, 10, 19, 28, 37, with cn_stride set to 11.
TEST(QU8_GEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, n_div_8_strided_cn) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n = 16; n <= 24; n += 8) {
    for (size_t k = 1; k <= 40; k += 9) {
      auto tester = GemmMicrokernelTester();
      tester.mr(4).nr(8).kr(1).sr(1);
      tester.m(4).n(n).k(k);
      tester.cn_stride(11);
      tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_qu8_requantize_fp32);
    }
  }
}
3409
// Exercises the 4x8__neon_mlal_lane microkernel with m = 4 for n = 16 and 24
// and k = 1, 10, 19, 28, 37, with a_stride set to 43.
TEST(QU8_GEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, n_div_8_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n = 16; n <= 24; n += 8) {
    for (size_t k = 1; k <= 40; k += 9) {
      auto tester = GemmMicrokernelTester();
      tester.mr(4).nr(8).kr(1).sr(1);
      tester.m(4).n(n).k(k);
      tester.a_stride(43);
      tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_qu8_requantize_fp32);
    }
  }
}
3427
// Exercises the 4x8__neon_mlal_lane microkernel for n = 16 and 24,
// k = 1, 10, 19, 28, 37 and m = 1..4, using iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, n_div_8_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n = 16; n <= 24; n += 8) {
    for (size_t k = 1; k <= 40; k += 9) {
      for (uint32_t m = 1; m <= 4; ++m) {
        auto tester = GemmMicrokernelTester();
        tester.mr(4).nr(8).kr(1).sr(1);
        tester.m(m).n(n).k(k);
        tester.iterations(1);
        tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_qu8_requantize_fp32);
      }
    }
  }
}
3447
// Exercises the 4x8__neon_mlal_lane microkernel for k = 1, 10, 19, 28, 37 over
// every (n, m) sub-tile with n = 1..8 and m = 1..4, with cm_stride set to 11
// and iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, strided_cm_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 1; k <= 40; k += 9) {
    for (uint32_t n = 1; n <= 8; ++n) {
      for (uint32_t m = 1; m <= 4; ++m) {
        auto tester = GemmMicrokernelTester();
        tester.mr(4).nr(8).kr(1).sr(1);
        tester.m(m).n(n).k(k);
        tester.cm_stride(11);
        tester.iterations(1);
        tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_qu8_requantize_fp32);
      }
    }
  }
}
3468
// Exercises the 4x8__neon_mlal_lane microkernel on the full 4x8 tile with
// k = 8 and qmin set to 128.
TEST(QU8_GEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, qmin) {
  TEST_REQUIRES_ARM_NEON;
  auto tester = GemmMicrokernelTester();
  tester.mr(4).nr(8).kr(1).sr(1);
  tester.m(4).n(8).k(8);
  tester.qmin(128);
  tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_qu8_requantize_fp32);
}
3482
// Exercises the 4x8__neon_mlal_lane microkernel on the full 4x8 tile with
// k = 8 and qmax set to 128.
TEST(QU8_GEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, qmax) {
  TEST_REQUIRES_ARM_NEON;
  auto tester = GemmMicrokernelTester();
  tester.mr(4).nr(8).kr(1).sr(1);
  tester.m(4).n(8).k(8);
  tester.qmax(128);
  tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_qu8_requantize_fp32);
}
3496
// Exercises the 4x8__neon_mlal_lane microkernel on the full 4x8 tile with
// k = 8 and cm_stride set to 11.
TEST(QU8_GEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, strided_cm) {
  TEST_REQUIRES_ARM_NEON;
  auto tester = GemmMicrokernelTester();
  tester.mr(4).nr(8).kr(1).sr(1);
  tester.m(4).n(8).k(8);
  tester.cm_stride(11);
  tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_qu8_requantize_fp32);
}
3510
// Exercises the 4x8__neon_mlal_lane microkernel on the full 4x8 tile for
// k = 1, 10, 19, 28, 37 with a_zero_point set to 0.
TEST(QU8_GEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, no_a_zero_point) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 1; k <= 40; k += 9) {
    auto tester = GemmMicrokernelTester();
    tester.mr(4).nr(8).kr(1).sr(1);
    tester.m(4).n(8).k(k);
    tester.a_zero_point(0);
    tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_qu8_requantize_fp32);
  }
}
3526
// Exercises the 4x8__neon_mlal_lane microkernel on the full 4x8 tile for
// k = 1, 10, 19, 28, 37 with b_zero_point set to 0.
TEST(QU8_GEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, no_b_zero_point) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 1; k <= 40; k += 9) {
    auto tester = GemmMicrokernelTester();
    tester.mr(4).nr(8).kr(1).sr(1);
    tester.m(4).n(8).k(k);
    tester.b_zero_point(0);
    tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_qu8_requantize_fp32);
  }
}
3542
// Exercises the 4x8__neon_mlal_lane microkernel on the full 4x8 tile for
// k = 1, 10, 19, 28, 37 with both a_zero_point and b_zero_point set to 0.
TEST(QU8_GEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, no_zero_point) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 1; k <= 40; k += 9) {
    auto tester = GemmMicrokernelTester();
    tester.mr(4).nr(8).kr(1).sr(1);
    tester.m(4).n(8).k(k);
    tester.a_zero_point(0);
    tester.b_zero_point(0);
    tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_qu8_requantize_fp32);
  }
}
3559 #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
3560
3561
3562 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
// Exercises the 4x16__neonv8_mlal_lane microkernel on the full 4x16 tile with
// k = 8.
TEST(QU8_GEMM_MINMAX_FP32_4X16__NEONV8_MLAL_LANE, k_eq_8) {
  TEST_REQUIRES_ARM_NEON_V8;
  auto tester = GemmMicrokernelTester();
  tester.mr(4).nr(16).kr(1).sr(1);
  tester.m(4).n(16).k(8);
  tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
}
3575
// Exercises the 4x16__neonv8_mlal_lane microkernel on the full 4x16 tile with
// k = 8 and cn_stride set to 19.
TEST(QU8_GEMM_MINMAX_FP32_4X16__NEONV8_MLAL_LANE, strided_cn) {
  TEST_REQUIRES_ARM_NEON_V8;
  auto tester = GemmMicrokernelTester();
  tester.mr(4).nr(16).kr(1).sr(1);
  tester.m(4).n(16).k(8);
  tester.cn_stride(19);
  tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
}
3589
// Exercises the 4x16__neonv8_mlal_lane microkernel on the full 4x16 tile with
// k = 8 and a_stride set to 11.
TEST(QU8_GEMM_MINMAX_FP32_4X16__NEONV8_MLAL_LANE, k_eq_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_V8;
  auto tester = GemmMicrokernelTester();
  tester.mr(4).nr(16).kr(1).sr(1);
  tester.m(4).n(16).k(8);
  tester.a_stride(11);
  tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
}
3603
// Exercises the 4x16__neonv8_mlal_lane microkernel with k = 8 over every
// (n, m) sub-tile with n = 1..16 and m = 1..4, using iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_4X16__NEONV8_MLAL_LANE, k_eq_8_subtile) {
  TEST_REQUIRES_ARM_NEON_V8;
  for (uint32_t n = 1; n <= 16; ++n) {
    for (uint32_t m = 1; m <= 4; ++m) {
      auto tester = GemmMicrokernelTester();
      tester.mr(4).nr(16).kr(1).sr(1);
      tester.m(m).n(n).k(8);
      tester.iterations(1);
      tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
    }
  }
}
3621
// Exercises the 4x16__neonv8_mlal_lane microkernel with k = 8 and n = 16 for
// m = 1..4, using iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_4X16__NEONV8_MLAL_LANE, k_eq_8_subtile_m) {
  TEST_REQUIRES_ARM_NEON_V8;
  for (uint32_t m = 1; m <= 4; ++m) {
    auto tester = GemmMicrokernelTester();
    tester.mr(4).nr(16).kr(1).sr(1);
    tester.m(m).n(16).k(8);
    tester.iterations(1);
    tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
  }
}
3637
// Exercises the 4x16__neonv8_mlal_lane microkernel with k = 8 and m = 4 for
// n = 1..16, using iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_4X16__NEONV8_MLAL_LANE, k_eq_8_subtile_n) {
  TEST_REQUIRES_ARM_NEON_V8;
  for (uint32_t n = 1; n <= 16; ++n) {
    auto tester = GemmMicrokernelTester();
    tester.mr(4).nr(16).kr(1).sr(1);
    tester.m(4).n(n).k(8);
    tester.iterations(1);
    tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
  }
}
3653
// Exercises the 4x16__neonv8_mlal_lane microkernel on the full 4x16 tile for
// k = 1..7.
TEST(QU8_GEMM_MINMAX_FP32_4X16__NEONV8_MLAL_LANE, k_lt_8) {
  TEST_REQUIRES_ARM_NEON_V8;
  for (size_t k = 1; k <= 7; ++k) {
    auto tester = GemmMicrokernelTester();
    tester.mr(4).nr(16).kr(1).sr(1);
    tester.m(4).n(16).k(k);
    tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
  }
}
3668
// Exercises the 4x16__neonv8_mlal_lane microkernel on the full 4x16 tile for
// k = 1..7 with a_stride set to 11.
TEST(QU8_GEMM_MINMAX_FP32_4X16__NEONV8_MLAL_LANE, k_lt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_V8;
  for (size_t k = 1; k <= 7; ++k) {
    auto tester = GemmMicrokernelTester();
    tester.mr(4).nr(16).kr(1).sr(1);
    tester.m(4).n(16).k(k);
    tester.a_stride(11);
    tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
  }
}
3684
// Exercises the 4x16__neonv8_mlal_lane microkernel for k = 1..7 over every
// (n, m) sub-tile with n = 1..16 and m = 1..4, using iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_4X16__NEONV8_MLAL_LANE, k_lt_8_subtile) {
  TEST_REQUIRES_ARM_NEON_V8;
  for (size_t k = 1; k <= 7; ++k) {
    for (uint32_t n = 1; n <= 16; ++n) {
      for (uint32_t m = 1; m <= 4; ++m) {
        auto tester = GemmMicrokernelTester();
        tester.mr(4).nr(16).kr(1).sr(1);
        tester.m(m).n(n).k(k);
        tester.iterations(1);
        tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
      }
    }
  }
}
3704
// Exercises the 4x16__neonv8_mlal_lane microkernel on the full 4x16 tile for
// k = 9..15.
TEST(QU8_GEMM_MINMAX_FP32_4X16__NEONV8_MLAL_LANE, k_gt_8) {
  TEST_REQUIRES_ARM_NEON_V8;
  for (size_t k = 9; k <= 15; ++k) {
    auto tester = GemmMicrokernelTester();
    tester.mr(4).nr(16).kr(1).sr(1);
    tester.m(4).n(16).k(k);
    tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
  }
}
3719
// Exercises the 4x16__neonv8_mlal_lane microkernel on the full 4x16 tile for
// k = 9..15 with a_stride set to 19.
TEST(QU8_GEMM_MINMAX_FP32_4X16__NEONV8_MLAL_LANE, k_gt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_V8;
  for (size_t k = 9; k <= 15; ++k) {
    auto tester = GemmMicrokernelTester();
    tester.mr(4).nr(16).kr(1).sr(1);
    tester.m(4).n(16).k(k);
    tester.a_stride(19);
    tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
  }
}
3735
// Exercises the 4x16__neonv8_mlal_lane microkernel for k = 9..15 over every
// (n, m) sub-tile with n = 1..16 and m = 1..4, using iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_4X16__NEONV8_MLAL_LANE, k_gt_8_subtile) {
  TEST_REQUIRES_ARM_NEON_V8;
  for (size_t k = 9; k <= 15; ++k) {
    for (uint32_t n = 1; n <= 16; ++n) {
      for (uint32_t m = 1; m <= 4; ++m) {
        auto tester = GemmMicrokernelTester();
        tester.mr(4).nr(16).kr(1).sr(1);
        tester.m(m).n(n).k(k);
        tester.iterations(1);
        tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
      }
    }
  }
}
3755
// Exercises the 4x16__neonv8_mlal_lane microkernel on the full 4x16 tile for
// k = 16, 24, ..., 80 (multiples of 8).
TEST(QU8_GEMM_MINMAX_FP32_4X16__NEONV8_MLAL_LANE, k_div_8) {
  TEST_REQUIRES_ARM_NEON_V8;
  for (size_t k = 16; k <= 80; k += 8) {
    auto tester = GemmMicrokernelTester();
    tester.mr(4).nr(16).kr(1).sr(1);
    tester.m(4).n(16).k(k);
    tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
  }
}
3770
// Exercises the 4x16__neonv8_mlal_lane microkernel on the full 4x16 tile for
// k = 16, 24, ..., 80 with a_stride set to 83.
TEST(QU8_GEMM_MINMAX_FP32_4X16__NEONV8_MLAL_LANE, k_div_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_V8;
  for (size_t k = 16; k <= 80; k += 8) {
    auto tester = GemmMicrokernelTester();
    tester.mr(4).nr(16).kr(1).sr(1);
    tester.m(4).n(16).k(k);
    tester.a_stride(83);
    tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
  }
}
3786
// Exercises the 4x16__neonv8_mlal_lane microkernel for k = 16, 24, ..., 80
// over every (n, m) sub-tile with n = 1..16 and m = 1..4, using iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_4X16__NEONV8_MLAL_LANE, k_div_8_subtile) {
  TEST_REQUIRES_ARM_NEON_V8;
  for (size_t k = 16; k <= 80; k += 8) {
    for (uint32_t n = 1; n <= 16; ++n) {
      for (uint32_t m = 1; m <= 4; ++m) {
        auto tester = GemmMicrokernelTester();
        tester.mr(4).nr(16).kr(1).sr(1);
        tester.m(m).n(n).k(k);
        tester.iterations(1);
        tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
      }
    }
  }
}
3806
// Exercises the 4x16__neonv8_mlal_lane microkernel with m = 4 for n = 17..31,
// sweeping k = 1, 10, 19, 28, 37 for each n.
TEST(QU8_GEMM_MINMAX_FP32_4X16__NEONV8_MLAL_LANE, n_gt_16) {
  TEST_REQUIRES_ARM_NEON_V8;
  for (uint32_t n = 17; n <= 31; ++n) {
    for (size_t k = 1; k <= 40; k += 9) {
      auto tester = GemmMicrokernelTester();
      tester.mr(4).nr(16).kr(1).sr(1);
      tester.m(4).n(n).k(k);
      tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
    }
  }
}
3823
// Exercises the 4x16__neonv8_mlal_lane microkernel with m = 4 for n = 17..31
// and k = 1, 10, 19, 28, 37, with cn_stride set to 19.
TEST(QU8_GEMM_MINMAX_FP32_4X16__NEONV8_MLAL_LANE, n_gt_16_strided_cn) {
  TEST_REQUIRES_ARM_NEON_V8;
  for (uint32_t n = 17; n <= 31; ++n) {
    for (size_t k = 1; k <= 40; k += 9) {
      auto tester = GemmMicrokernelTester();
      tester.mr(4).nr(16).kr(1).sr(1);
      tester.m(4).n(n).k(k);
      tester.cn_stride(19);
      tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
    }
  }
}
3841
// Exercises the 4x16__neonv8_mlal_lane microkernel with m = 4 for n = 17..31
// and k = 1, 10, 19, 28, 37, with a_stride set to 43.
TEST(QU8_GEMM_MINMAX_FP32_4X16__NEONV8_MLAL_LANE, n_gt_16_strided_a) {
  TEST_REQUIRES_ARM_NEON_V8;
  for (uint32_t n = 17; n <= 31; ++n) {
    for (size_t k = 1; k <= 40; k += 9) {
      auto tester = GemmMicrokernelTester();
      tester.mr(4).nr(16).kr(1).sr(1);
      tester.m(4).n(n).k(k);
      tester.a_stride(43);
      tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
    }
  }
}
3859
// Exercises the 4x16__neonv8_mlal_lane microkernel for n = 17..31,
// k = 1, 10, 19, 28, 37 and m = 1..4, using iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_4X16__NEONV8_MLAL_LANE, n_gt_16_subtile) {
  TEST_REQUIRES_ARM_NEON_V8;
  for (uint32_t n = 17; n <= 31; ++n) {
    for (size_t k = 1; k <= 40; k += 9) {
      for (uint32_t m = 1; m <= 4; ++m) {
        auto tester = GemmMicrokernelTester();
        tester.mr(4).nr(16).kr(1).sr(1);
        tester.m(m).n(n).k(k);
        tester.iterations(1);
        tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
      }
    }
  }
}
3879
// Exercises the 4x16__neonv8_mlal_lane microkernel with m = 4 for n = 32 and
// 48, sweeping k = 1, 10, 19, 28, 37 for each n.
TEST(QU8_GEMM_MINMAX_FP32_4X16__NEONV8_MLAL_LANE, n_div_16) {
  TEST_REQUIRES_ARM_NEON_V8;
  for (uint32_t n = 32; n <= 48; n += 16) {
    for (size_t k = 1; k <= 40; k += 9) {
      auto tester = GemmMicrokernelTester();
      tester.mr(4).nr(16).kr(1).sr(1);
      tester.m(4).n(n).k(k);
      tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
    }
  }
}
3896
// Exercises the 4x16__neonv8_mlal_lane microkernel with m = 4 for n = 32 and
// 48 and k = 1, 10, 19, 28, 37, with cn_stride set to 19.
TEST(QU8_GEMM_MINMAX_FP32_4X16__NEONV8_MLAL_LANE, n_div_16_strided_cn) {
  TEST_REQUIRES_ARM_NEON_V8;
  for (uint32_t n = 32; n <= 48; n += 16) {
    for (size_t k = 1; k <= 40; k += 9) {
      auto tester = GemmMicrokernelTester();
      tester.mr(4).nr(16).kr(1).sr(1);
      tester.m(4).n(n).k(k);
      tester.cn_stride(19);
      tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
    }
  }
}
3914
// Exercises the 4x16__neonv8_mlal_lane microkernel with m = 4 for n = 32 and
// 48 and k = 1, 10, 19, 28, 37, with a_stride set to 43.
TEST(QU8_GEMM_MINMAX_FP32_4X16__NEONV8_MLAL_LANE, n_div_16_strided_a) {
  TEST_REQUIRES_ARM_NEON_V8;
  for (uint32_t n = 32; n <= 48; n += 16) {
    for (size_t k = 1; k <= 40; k += 9) {
      auto tester = GemmMicrokernelTester();
      tester.mr(4).nr(16).kr(1).sr(1);
      tester.m(4).n(n).k(k);
      tester.a_stride(43);
      tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
    }
  }
}
3932
// Exercises the 4x16__neonv8_mlal_lane microkernel for n = 32 and 48,
// k = 1, 10, 19, 28, 37 and m = 1..4, using iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_4X16__NEONV8_MLAL_LANE, n_div_16_subtile) {
  TEST_REQUIRES_ARM_NEON_V8;
  for (uint32_t n = 32; n <= 48; n += 16) {
    for (size_t k = 1; k <= 40; k += 9) {
      for (uint32_t m = 1; m <= 4; ++m) {
        auto tester = GemmMicrokernelTester();
        tester.mr(4).nr(16).kr(1).sr(1);
        tester.m(m).n(n).k(k);
        tester.iterations(1);
        tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
      }
    }
  }
}
3952
// Exercises the 4x16__neonv8_mlal_lane microkernel for k = 1, 10, 19, 28, 37
// over every (n, m) sub-tile with n = 1..16 and m = 1..4, with cm_stride set
// to 19 and iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_4X16__NEONV8_MLAL_LANE, strided_cm_subtile) {
  TEST_REQUIRES_ARM_NEON_V8;
  for (size_t k = 1; k <= 40; k += 9) {
    for (uint32_t n = 1; n <= 16; ++n) {
      for (uint32_t m = 1; m <= 4; ++m) {
        auto tester = GemmMicrokernelTester();
        tester.mr(4).nr(16).kr(1).sr(1);
        tester.m(m).n(n).k(k);
        tester.cm_stride(19);
        tester.iterations(1);
        tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
      }
    }
  }
}
3973
// Exercises the 4x16__neonv8_mlal_lane microkernel on the full 4x16 tile with
// k = 8 and qmin set to 128.
TEST(QU8_GEMM_MINMAX_FP32_4X16__NEONV8_MLAL_LANE, qmin) {
  TEST_REQUIRES_ARM_NEON_V8;
  auto tester = GemmMicrokernelTester();
  tester.mr(4).nr(16).kr(1).sr(1);
  tester.m(4).n(16).k(8);
  tester.qmin(128);
  tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
}
3987
// Exercises the 4x16__neonv8_mlal_lane microkernel on the full 4x16 tile with
// k = 8 and qmax set to 128.
TEST(QU8_GEMM_MINMAX_FP32_4X16__NEONV8_MLAL_LANE, qmax) {
  TEST_REQUIRES_ARM_NEON_V8;
  auto tester = GemmMicrokernelTester();
  tester.mr(4).nr(16).kr(1).sr(1);
  tester.m(4).n(16).k(8);
  tester.qmax(128);
  tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
}
4001
// Exercises the 4x16__neonv8_mlal_lane microkernel on the full 4x16 tile with
// k = 8 and cm_stride set to 19.
TEST(QU8_GEMM_MINMAX_FP32_4X16__NEONV8_MLAL_LANE, strided_cm) {
  TEST_REQUIRES_ARM_NEON_V8;
  auto tester = GemmMicrokernelTester();
  tester.mr(4).nr(16).kr(1).sr(1);
  tester.m(4).n(16).k(8);
  tester.cm_stride(19);
  tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
}
4015
// Exercises the 4x16__neonv8_mlal_lane microkernel on the full 4x16 tile for
// k = 1, 10, 19, 28, 37 with a_zero_point set to 0.
TEST(QU8_GEMM_MINMAX_FP32_4X16__NEONV8_MLAL_LANE, no_a_zero_point) {
  TEST_REQUIRES_ARM_NEON_V8;
  for (size_t k = 1; k <= 40; k += 9) {
    auto tester = GemmMicrokernelTester();
    tester.mr(4).nr(16).kr(1).sr(1);
    tester.m(4).n(16).k(k);
    tester.a_zero_point(0);
    tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
  }
}
4031
// Exercises the 4x16__neonv8_mlal_lane microkernel on the full 4x16 tile for
// k = 1, 10, 19, 28, 37 with b_zero_point set to 0.
TEST(QU8_GEMM_MINMAX_FP32_4X16__NEONV8_MLAL_LANE, no_b_zero_point) {
  TEST_REQUIRES_ARM_NEON_V8;
  for (size_t k = 1; k <= 40; k += 9) {
    auto tester = GemmMicrokernelTester();
    tester.mr(4).nr(16).kr(1).sr(1);
    tester.m(4).n(16).k(k);
    tester.b_zero_point(0);
    tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
  }
}
4047
// Exercises the 4x16__neonv8_mlal_lane microkernel on the full 4x16 tile for
// k = 1, 10, 19, 28, 37 with both a_zero_point and b_zero_point set to 0.
TEST(QU8_GEMM_MINMAX_FP32_4X16__NEONV8_MLAL_LANE, no_zero_point) {
  TEST_REQUIRES_ARM_NEON_V8;
  for (size_t k = 1; k <= 40; k += 9) {
    auto tester = GemmMicrokernelTester();
    tester.mr(4).nr(16).kr(1).sr(1);
    tester.m(4).n(16).k(k);
    tester.a_zero_point(0);
    tester.b_zero_point(0);
    tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
  }
}
4064 #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
4065
4066
4067 #if XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM && !XNN_PLATFORM_IOS || XNN_ARCH_ARM64)
// Exercises the 4x16c4__neondot microkernel (kr = 4) on the full 4x16 tile
// with k = 8.
TEST(QU8_GEMM_MINMAX_FP32_4X16C4__NEONDOT, k_eq_8) {
  TEST_REQUIRES_ARM_NEON_DOT;
  auto tester = GemmMicrokernelTester();
  tester.mr(4).nr(16).kr(4).sr(1);
  tester.m(4).n(16).k(8);
  tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
}
4080
// Exercises the 4x16c4__neondot microkernel (kr = 4) on the full 4x16 tile
// with k = 8 and cn_stride set to 19.
TEST(QU8_GEMM_MINMAX_FP32_4X16C4__NEONDOT, strided_cn) {
  TEST_REQUIRES_ARM_NEON_DOT;
  auto tester = GemmMicrokernelTester();
  tester.mr(4).nr(16).kr(4).sr(1);
  tester.m(4).n(16).k(8);
  tester.cn_stride(19);
  tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
}
4094
// Exercises the 4x16c4__neondot microkernel (kr = 4) on the full 4x16 tile
// with k = 8 and a_stride set to 11.
TEST(QU8_GEMM_MINMAX_FP32_4X16C4__NEONDOT, k_eq_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_DOT;
  auto tester = GemmMicrokernelTester();
  tester.mr(4).nr(16).kr(4).sr(1);
  tester.m(4).n(16).k(8);
  tester.a_stride(11);
  tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
}
4108
// Exercises the 4x16c4__neondot microkernel (kr = 4) with k = 8 over every
// (n, m) sub-tile with n = 1..16 and m = 1..4, using iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_4X16C4__NEONDOT, k_eq_8_subtile) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (uint32_t n = 1; n <= 16; ++n) {
    for (uint32_t m = 1; m <= 4; ++m) {
      auto tester = GemmMicrokernelTester();
      tester.mr(4).nr(16).kr(4).sr(1);
      tester.m(m).n(n).k(8);
      tester.iterations(1);
      tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
    }
  }
}
4126
// Exercises the 4x16c4__neondot microkernel (kr = 4) with k = 8 and n = 16
// for m = 1..4, using iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_4X16C4__NEONDOT, k_eq_8_subtile_m) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (uint32_t m = 1; m <= 4; ++m) {
    auto tester = GemmMicrokernelTester();
    tester.mr(4).nr(16).kr(4).sr(1);
    tester.m(m).n(16).k(8);
    tester.iterations(1);
    tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
  }
}
4142
// Exercises the 4x16c4__neondot microkernel (kr = 4) with k = 8 and m = 4
// for n = 1..16, using iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_4X16C4__NEONDOT, k_eq_8_subtile_n) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (uint32_t n = 1; n <= 16; ++n) {
    auto tester = GemmMicrokernelTester();
    tester.mr(4).nr(16).kr(4).sr(1);
    tester.m(4).n(n).k(8);
    tester.iterations(1);
    tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
  }
}
4158
// Exercises the 4x16c4__neondot microkernel (kr = 4) on the full 4x16 tile
// for k = 1..7.
TEST(QU8_GEMM_MINMAX_FP32_4X16C4__NEONDOT, k_lt_8) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (size_t k = 1; k <= 7; ++k) {
    auto tester = GemmMicrokernelTester();
    tester.mr(4).nr(16).kr(4).sr(1);
    tester.m(4).n(16).k(k);
    tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
  }
}
4173
// Exercises the 4x16c4__neondot microkernel (kr = 4) on the full 4x16 tile
// for k = 1..7 with a_stride set to 11.
TEST(QU8_GEMM_MINMAX_FP32_4X16C4__NEONDOT, k_lt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (size_t k = 1; k <= 7; ++k) {
    auto tester = GemmMicrokernelTester();
    tester.mr(4).nr(16).kr(4).sr(1);
    tester.m(4).n(16).k(k);
    tester.a_stride(11);
    tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
  }
}
4189
// Exercises the 4x16c4__neondot microkernel (kr = 4) for k = 1..7 over every
// (n, m) sub-tile with n = 1..16 and m = 1..4, using iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_4X16C4__NEONDOT, k_lt_8_subtile) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (size_t k = 1; k <= 7; ++k) {
    for (uint32_t n = 1; n <= 16; ++n) {
      for (uint32_t m = 1; m <= 4; ++m) {
        auto tester = GemmMicrokernelTester();
        tester.mr(4).nr(16).kr(4).sr(1);
        tester.m(m).n(n).k(k);
        tester.iterations(1);
        tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
      }
    }
  }
}
4209
// Exercises the 4x16c4__neondot microkernel (kr = 4) on the full 4x16 tile
// for k = 9..15.
TEST(QU8_GEMM_MINMAX_FP32_4X16C4__NEONDOT, k_gt_8) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (size_t k = 9; k <= 15; ++k) {
    auto tester = GemmMicrokernelTester();
    tester.mr(4).nr(16).kr(4).sr(1);
    tester.m(4).n(16).k(k);
    tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
  }
}
4224
// Exercises the 4x16c4__neondot microkernel (kr = 4) on the full 4x16 tile
// for k = 9..15 with a_stride set to 19.
TEST(QU8_GEMM_MINMAX_FP32_4X16C4__NEONDOT, k_gt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (size_t k = 9; k <= 15; ++k) {
    auto tester = GemmMicrokernelTester();
    tester.mr(4).nr(16).kr(4).sr(1);
    tester.m(4).n(16).k(k);
    tester.a_stride(19);
    tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
  }
}
4240
// Exercises the 4x16c4__neondot microkernel (kr = 4) for k = 9..15 over every
// (n, m) sub-tile with n = 1..16 and m = 1..4, using iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_4X16C4__NEONDOT, k_gt_8_subtile) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (size_t k = 9; k <= 15; ++k) {
    for (uint32_t n = 1; n <= 16; ++n) {
      for (uint32_t m = 1; m <= 4; ++m) {
        auto tester = GemmMicrokernelTester();
        tester.mr(4).nr(16).kr(4).sr(1);
        tester.m(m).n(n).k(k);
        tester.iterations(1);
        tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
      }
    }
  }
}
4260
// Exercises the 4x16c4__neondot microkernel (kr = 4) on the full 4x16 tile
// for k = 16, 24, ..., 80 (multiples of 8).
TEST(QU8_GEMM_MINMAX_FP32_4X16C4__NEONDOT, k_div_8) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (size_t k = 16; k <= 80; k += 8) {
    auto tester = GemmMicrokernelTester();
    tester.mr(4).nr(16).kr(4).sr(1);
    tester.m(4).n(16).k(k);
    tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
  }
}
4275
// Exercises the 4x16c4__neondot microkernel (kr = 4) on the full 4x16 tile
// for k = 16, 24, ..., 80 with a_stride set to 83.
TEST(QU8_GEMM_MINMAX_FP32_4X16C4__NEONDOT, k_div_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (size_t k = 16; k <= 80; k += 8) {
    auto tester = GemmMicrokernelTester();
    tester.mr(4).nr(16).kr(4).sr(1);
    tester.m(4).n(16).k(k);
    tester.a_stride(83);
    tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_qu8_requantize_fp32);
  }
}
4291
// For k = 16, 24, ..., 80, runs each (n, m) pair with n in 1..16 and m in
// 1..4, one iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_4X16C4__NEONDOT, k_div_8_subtile) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (size_t k_size = 16; k_size <= 80; k_size += 8) {
    for (uint32_t n_size = 1; n_size < 17; ++n_size) {
      for (uint32_t m_size = 1; m_size < 5; ++m_size) {
        GemmMicrokernelTester()
            .mr(4).nr(16).kr(4).sr(1)
            .m(m_size).n(n_size).k(k_size)
            .iterations(1)
            .Test(
                xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__neondot,
                xnn_init_qu8_conv_minmax_fp32_neonv8_params,
                xnn_qu8_requantize_fp32);
      }
    }
  }
}
4311
// Runs with m=4 for every n in 17..31, each against k = 1, 10, 19, 28, 37.
TEST(QU8_GEMM_MINMAX_FP32_4X16C4__NEONDOT, n_gt_16) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (uint32_t n_size = 17; n_size <= 31; ++n_size) {
    for (size_t k_size = 1; k_size <= 40; k_size += 9) {
      GemmMicrokernelTester()
          .mr(4).nr(16).kr(4).sr(1)
          .m(4).n(n_size).k(k_size)
          .Test(
              xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__neondot,
              xnn_init_qu8_conv_minmax_fp32_neonv8_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
4328
// Runs with m=4 and cn_stride(19) for every n in 17..31, each against
// k = 1, 10, 19, 28, 37.
TEST(QU8_GEMM_MINMAX_FP32_4X16C4__NEONDOT, n_gt_16_strided_cn) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (uint32_t n_size = 17; n_size <= 31; ++n_size) {
    for (size_t k_size = 1; k_size <= 40; k_size += 9) {
      GemmMicrokernelTester()
          .mr(4).nr(16).kr(4).sr(1)
          .m(4).n(n_size).k(k_size)
          .cn_stride(19)
          .Test(
              xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__neondot,
              xnn_init_qu8_conv_minmax_fp32_neonv8_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
4346
// Runs with m=4 and a_stride(43) for every n in 17..31, each against
// k = 1, 10, 19, 28, 37.
TEST(QU8_GEMM_MINMAX_FP32_4X16C4__NEONDOT, n_gt_16_strided_a) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (uint32_t n_size = 17; n_size <= 31; ++n_size) {
    for (size_t k_size = 1; k_size <= 40; k_size += 9) {
      GemmMicrokernelTester()
          .mr(4).nr(16).kr(4).sr(1)
          .m(4).n(n_size).k(k_size)
          .a_stride(43)
          .Test(
              xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__neondot,
              xnn_init_qu8_conv_minmax_fp32_neonv8_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
4364
// For every n in 17..31 and k = 1, 10, 19, 28, 37, runs each m in 1..4 with
// a single iteration.
TEST(QU8_GEMM_MINMAX_FP32_4X16C4__NEONDOT, n_gt_16_subtile) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (uint32_t n_size = 17; n_size <= 31; ++n_size) {
    for (size_t k_size = 1; k_size <= 40; k_size += 9) {
      for (uint32_t m_size = 1; m_size < 5; ++m_size) {
        GemmMicrokernelTester()
            .mr(4).nr(16).kr(4).sr(1)
            .m(m_size).n(n_size).k(k_size)
            .iterations(1)
            .Test(
                xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__neondot,
                xnn_init_qu8_conv_minmax_fp32_neonv8_params,
                xnn_qu8_requantize_fp32);
      }
    }
  }
}
4384
// Runs with m=4 for n = 32 and 48, each against k = 1, 10, 19, 28, 37.
TEST(QU8_GEMM_MINMAX_FP32_4X16C4__NEONDOT, n_div_16) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (uint32_t n_size = 32; n_size <= 48; n_size += 16) {
    for (size_t k_size = 1; k_size <= 40; k_size += 9) {
      GemmMicrokernelTester()
          .mr(4).nr(16).kr(4).sr(1)
          .m(4).n(n_size).k(k_size)
          .Test(
              xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__neondot,
              xnn_init_qu8_conv_minmax_fp32_neonv8_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
4401
// Runs with m=4 and cn_stride(19) for n = 32 and 48, each against
// k = 1, 10, 19, 28, 37.
TEST(QU8_GEMM_MINMAX_FP32_4X16C4__NEONDOT, n_div_16_strided_cn) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (uint32_t n_size = 32; n_size <= 48; n_size += 16) {
    for (size_t k_size = 1; k_size <= 40; k_size += 9) {
      GemmMicrokernelTester()
          .mr(4).nr(16).kr(4).sr(1)
          .m(4).n(n_size).k(k_size)
          .cn_stride(19)
          .Test(
              xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__neondot,
              xnn_init_qu8_conv_minmax_fp32_neonv8_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
4419
// Runs with m=4 and a_stride(43) for n = 32 and 48, each against
// k = 1, 10, 19, 28, 37.
TEST(QU8_GEMM_MINMAX_FP32_4X16C4__NEONDOT, n_div_16_strided_a) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (uint32_t n_size = 32; n_size <= 48; n_size += 16) {
    for (size_t k_size = 1; k_size <= 40; k_size += 9) {
      GemmMicrokernelTester()
          .mr(4).nr(16).kr(4).sr(1)
          .m(4).n(n_size).k(k_size)
          .a_stride(43)
          .Test(
              xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__neondot,
              xnn_init_qu8_conv_minmax_fp32_neonv8_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
4437
// For n = 32 and 48 and k = 1, 10, 19, 28, 37, runs each m in 1..4 with a
// single iteration.
TEST(QU8_GEMM_MINMAX_FP32_4X16C4__NEONDOT, n_div_16_subtile) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (uint32_t n_size = 32; n_size <= 48; n_size += 16) {
    for (size_t k_size = 1; k_size <= 40; k_size += 9) {
      for (uint32_t m_size = 1; m_size < 5; ++m_size) {
        GemmMicrokernelTester()
            .mr(4).nr(16).kr(4).sr(1)
            .m(m_size).n(n_size).k(k_size)
            .iterations(1)
            .Test(
                xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__neondot,
                xnn_init_qu8_conv_minmax_fp32_neonv8_params,
                xnn_qu8_requantize_fp32);
      }
    }
  }
}
4457
// For k = 1, 10, 19, 28, 37, runs each (n, m) pair with n in 1..16 and m in
// 1..4 using cm_stride(19), one iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_4X16C4__NEONDOT, strided_cm_subtile) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (size_t k_size = 1; k_size <= 40; k_size += 9) {
    for (uint32_t n_size = 1; n_size < 17; ++n_size) {
      for (uint32_t m_size = 1; m_size < 5; ++m_size) {
        GemmMicrokernelTester()
            .mr(4).nr(16).kr(4).sr(1)
            .m(m_size).n(n_size).k(k_size)
            .cm_stride(19)
            .iterations(1)
            .Test(
                xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__neondot,
                xnn_init_qu8_conv_minmax_fp32_neonv8_params,
                xnn_qu8_requantize_fp32);
      }
    }
  }
}
4478
// Single run with m=4, n=16, k=8 and qmin(128).
TEST(QU8_GEMM_MINMAX_FP32_4X16C4__NEONDOT, qmin) {
  TEST_REQUIRES_ARM_NEON_DOT;
  GemmMicrokernelTester()
      .mr(4).nr(16).kr(4).sr(1)
      .m(4).n(16).k(8)
      .qmin(128)
      .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__neondot,
          xnn_init_qu8_conv_minmax_fp32_neonv8_params,
          xnn_qu8_requantize_fp32);
}
4492
// Single run with m=4, n=16, k=8 and qmax(128).
TEST(QU8_GEMM_MINMAX_FP32_4X16C4__NEONDOT, qmax) {
  TEST_REQUIRES_ARM_NEON_DOT;
  GemmMicrokernelTester()
      .mr(4).nr(16).kr(4).sr(1)
      .m(4).n(16).k(8)
      .qmax(128)
      .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__neondot,
          xnn_init_qu8_conv_minmax_fp32_neonv8_params,
          xnn_qu8_requantize_fp32);
}
4506
// Single run with m=4, n=16, k=8 and cm_stride(19).
TEST(QU8_GEMM_MINMAX_FP32_4X16C4__NEONDOT, strided_cm) {
  TEST_REQUIRES_ARM_NEON_DOT;
  GemmMicrokernelTester()
      .mr(4).nr(16).kr(4).sr(1)
      .m(4).n(16).k(8)
      .cm_stride(19)
      .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__neondot,
          xnn_init_qu8_conv_minmax_fp32_neonv8_params,
          xnn_qu8_requantize_fp32);
}
4520
// Runs with m=4, n=16 and a_zero_point(0) for k = 1, 10, 19, 28, 37.
TEST(QU8_GEMM_MINMAX_FP32_4X16C4__NEONDOT, no_a_zero_point) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (size_t k_size = 1; k_size <= 40; k_size += 9) {
    GemmMicrokernelTester()
        .mr(4).nr(16).kr(4).sr(1)
        .m(4).n(16).k(k_size)
        .a_zero_point(0)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__neondot,
            xnn_init_qu8_conv_minmax_fp32_neonv8_params,
            xnn_qu8_requantize_fp32);
  }
}
4536
// Runs with m=4, n=16 and b_zero_point(0) for k = 1, 10, 19, 28, 37.
TEST(QU8_GEMM_MINMAX_FP32_4X16C4__NEONDOT, no_b_zero_point) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (size_t k_size = 1; k_size <= 40; k_size += 9) {
    GemmMicrokernelTester()
        .mr(4).nr(16).kr(4).sr(1)
        .m(4).n(16).k(k_size)
        .b_zero_point(0)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__neondot,
            xnn_init_qu8_conv_minmax_fp32_neonv8_params,
            xnn_qu8_requantize_fp32);
  }
}
4552
// Runs with m=4, n=16 and both a_zero_point(0) and b_zero_point(0) for
// k = 1, 10, 19, 28, 37.
TEST(QU8_GEMM_MINMAX_FP32_4X16C4__NEONDOT, no_zero_point) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (size_t k_size = 1; k_size <= 40; k_size += 9) {
    GemmMicrokernelTester()
        .mr(4).nr(16).kr(4).sr(1)
        .m(4).n(16).k(k_size)
        .a_zero_point(0)
        .b_zero_point(0)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__neondot,
            xnn_init_qu8_conv_minmax_fp32_neonv8_params,
            xnn_qu8_requantize_fp32);
  }
}
4569 #endif // XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM && !XNN_PLATFORM_IOS || XNN_ARCH_ARM64)
4570
4571
4572 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Single run of the 1x4c2 SSE2 (ld64) kernel with m=1, n=4, k=8.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, k_eq_8) {
  TEST_REQUIRES_X86_SSE2;
  GemmMicrokernelTester()
      .mr(1).nr(4).kr(2).sr(1)
      .m(1).n(4).k(8)
      .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
}
4585
// Single run with m=1, n=4, k=8 and cn_stride(7).
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, strided_cn) {
  TEST_REQUIRES_X86_SSE2;
  GemmMicrokernelTester()
      .mr(1).nr(4).kr(2).sr(1)
      .m(1).n(4).k(8)
      .cn_stride(7)
      .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
}
4599
// Single run with m=1, n=4, k=8 and a_stride(11).
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, k_eq_8_strided_a) {
  TEST_REQUIRES_X86_SSE2;
  GemmMicrokernelTester()
      .mr(1).nr(4).kr(2).sr(1)
      .m(1).n(4).k(8)
      .a_stride(11)
      .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
}
4613
// With k=8, runs each (n, m) pair with n in 1..4 and m in 1..1, one
// iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, k_eq_8_subtile) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t n_size = 1; n_size < 5; ++n_size) {
    for (uint32_t m_size = 1; m_size < 2; ++m_size) {
      GemmMicrokernelTester()
          .mr(1).nr(4).kr(2).sr(1)
          .m(m_size).n(n_size).k(8)
          .iterations(1)
          .Test(
              xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64,
              xnn_init_qu8_conv_minmax_fp32_sse2_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
4631
// With n=4 and k=8, runs each m in 1..1 with a single iteration.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, k_eq_8_subtile_m) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t m_size = 1; m_size < 2; ++m_size) {
    GemmMicrokernelTester()
        .mr(1).nr(4).kr(2).sr(1)
        .m(m_size).n(4).k(8)
        .iterations(1)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
  }
}
4647
// With m=1 and k=8, runs each n in 1..4 with a single iteration.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, k_eq_8_subtile_n) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t n_size = 1; n_size < 5; ++n_size) {
    GemmMicrokernelTester()
        .mr(1).nr(4).kr(2).sr(1)
        .m(1).n(n_size).k(8)
        .iterations(1)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
  }
}
4663
// Runs with m=1, n=4 for every k in 1..7.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, k_lt_8) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t k_size = 1; k_size <= 7; ++k_size) {
    GemmMicrokernelTester()
        .mr(1).nr(4).kr(2).sr(1)
        .m(1).n(4).k(k_size)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
  }
}
4678
// Runs with m=1, n=4 and a_stride(11) for every k in 1..7.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, k_lt_8_strided_a) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t k_size = 1; k_size <= 7; ++k_size) {
    GemmMicrokernelTester()
        .mr(1).nr(4).kr(2).sr(1)
        .m(1).n(4).k(k_size)
        .a_stride(11)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
  }
}
4694
// For every k in 1..7, runs each (n, m) pair with n in 1..4 and m in 1..1,
// one iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, k_lt_8_subtile) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t k_size = 1; k_size <= 7; ++k_size) {
    for (uint32_t n_size = 1; n_size < 5; ++n_size) {
      for (uint32_t m_size = 1; m_size < 2; ++m_size) {
        GemmMicrokernelTester()
            .mr(1).nr(4).kr(2).sr(1)
            .m(m_size).n(n_size).k(k_size)
            .iterations(1)
            .Test(
                xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64,
                xnn_init_qu8_conv_minmax_fp32_sse2_params,
                xnn_qu8_requantize_fp32);
      }
    }
  }
}
4714
// Runs with m=1, n=4 for every k in 9..15.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, k_gt_8) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t k_size = 9; k_size <= 15; ++k_size) {
    GemmMicrokernelTester()
        .mr(1).nr(4).kr(2).sr(1)
        .m(1).n(4).k(k_size)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
  }
}
4729
// Runs with m=1, n=4 and a_stride(19) for every k in 9..15.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, k_gt_8_strided_a) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t k_size = 9; k_size <= 15; ++k_size) {
    GemmMicrokernelTester()
        .mr(1).nr(4).kr(2).sr(1)
        .m(1).n(4).k(k_size)
        .a_stride(19)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
  }
}
4745
// For every k in 9..15, runs each (n, m) pair with n in 1..4 and m in 1..1,
// one iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, k_gt_8_subtile) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t k_size = 9; k_size <= 15; ++k_size) {
    for (uint32_t n_size = 1; n_size < 5; ++n_size) {
      for (uint32_t m_size = 1; m_size < 2; ++m_size) {
        GemmMicrokernelTester()
            .mr(1).nr(4).kr(2).sr(1)
            .m(m_size).n(n_size).k(k_size)
            .iterations(1)
            .Test(
                xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64,
                xnn_init_qu8_conv_minmax_fp32_sse2_params,
                xnn_qu8_requantize_fp32);
      }
    }
  }
}
4765
// Runs with m=1, n=4 for k = 16, 24, ..., 80 (multiples of 8).
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, k_div_8) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t k_size = 16; k_size <= 80; k_size += 8) {
    GemmMicrokernelTester()
        .mr(1).nr(4).kr(2).sr(1)
        .m(1).n(4).k(k_size)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
  }
}
4780
// Runs with m=1, n=4 and a_stride(83) for k = 16, 24, ..., 80.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, k_div_8_strided_a) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t k_size = 16; k_size <= 80; k_size += 8) {
    GemmMicrokernelTester()
        .mr(1).nr(4).kr(2).sr(1)
        .m(1).n(4).k(k_size)
        .a_stride(83)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
  }
}
4796
// For k = 16, 24, ..., 80, runs each (n, m) pair with n in 1..4 and m in
// 1..1, one iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, k_div_8_subtile) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t k_size = 16; k_size <= 80; k_size += 8) {
    for (uint32_t n_size = 1; n_size < 5; ++n_size) {
      for (uint32_t m_size = 1; m_size < 2; ++m_size) {
        GemmMicrokernelTester()
            .mr(1).nr(4).kr(2).sr(1)
            .m(m_size).n(n_size).k(k_size)
            .iterations(1)
            .Test(
                xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64,
                xnn_init_qu8_conv_minmax_fp32_sse2_params,
                xnn_qu8_requantize_fp32);
      }
    }
  }
}
4816
// Runs with m=1 for every n in 5..7, each against k = 1, 10, 19, 28, 37.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, n_gt_4) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t n_size = 5; n_size <= 7; ++n_size) {
    for (size_t k_size = 1; k_size <= 40; k_size += 9) {
      GemmMicrokernelTester()
          .mr(1).nr(4).kr(2).sr(1)
          .m(1).n(n_size).k(k_size)
          .Test(
              xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64,
              xnn_init_qu8_conv_minmax_fp32_sse2_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
4833
// Runs with m=1 and cn_stride(7) for every n in 5..7, each against
// k = 1, 10, 19, 28, 37.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, n_gt_4_strided_cn) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t n_size = 5; n_size <= 7; ++n_size) {
    for (size_t k_size = 1; k_size <= 40; k_size += 9) {
      GemmMicrokernelTester()
          .mr(1).nr(4).kr(2).sr(1)
          .m(1).n(n_size).k(k_size)
          .cn_stride(7)
          .Test(
              xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64,
              xnn_init_qu8_conv_minmax_fp32_sse2_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
4851
// Runs with m=1 and a_stride(43) for every n in 5..7, each against
// k = 1, 10, 19, 28, 37.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, n_gt_4_strided_a) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t n_size = 5; n_size <= 7; ++n_size) {
    for (size_t k_size = 1; k_size <= 40; k_size += 9) {
      GemmMicrokernelTester()
          .mr(1).nr(4).kr(2).sr(1)
          .m(1).n(n_size).k(k_size)
          .a_stride(43)
          .Test(
              xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64,
              xnn_init_qu8_conv_minmax_fp32_sse2_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
4869
// For every n in 5..7 and k = 1, 10, 19, 28, 37, runs each m in 1..1 with a
// single iteration.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, n_gt_4_subtile) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t n_size = 5; n_size <= 7; ++n_size) {
    for (size_t k_size = 1; k_size <= 40; k_size += 9) {
      for (uint32_t m_size = 1; m_size < 2; ++m_size) {
        GemmMicrokernelTester()
            .mr(1).nr(4).kr(2).sr(1)
            .m(m_size).n(n_size).k(k_size)
            .iterations(1)
            .Test(
                xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64,
                xnn_init_qu8_conv_minmax_fp32_sse2_params,
                xnn_qu8_requantize_fp32);
      }
    }
  }
}
4889
// Runs with m=1 for n = 8 and 12, each against k = 1, 10, 19, 28, 37.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, n_div_4) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t n_size = 8; n_size <= 12; n_size += 4) {
    for (size_t k_size = 1; k_size <= 40; k_size += 9) {
      GemmMicrokernelTester()
          .mr(1).nr(4).kr(2).sr(1)
          .m(1).n(n_size).k(k_size)
          .Test(
              xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64,
              xnn_init_qu8_conv_minmax_fp32_sse2_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
4906
// Runs with m=1 and cn_stride(7) for n = 8 and 12, each against
// k = 1, 10, 19, 28, 37.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, n_div_4_strided_cn) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t n_size = 8; n_size <= 12; n_size += 4) {
    for (size_t k_size = 1; k_size <= 40; k_size += 9) {
      GemmMicrokernelTester()
          .mr(1).nr(4).kr(2).sr(1)
          .m(1).n(n_size).k(k_size)
          .cn_stride(7)
          .Test(
              xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64,
              xnn_init_qu8_conv_minmax_fp32_sse2_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
4924
// Runs with m=1 and a_stride(43) for n = 8 and 12, each against
// k = 1, 10, 19, 28, 37.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, n_div_4_strided_a) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t n_size = 8; n_size <= 12; n_size += 4) {
    for (size_t k_size = 1; k_size <= 40; k_size += 9) {
      GemmMicrokernelTester()
          .mr(1).nr(4).kr(2).sr(1)
          .m(1).n(n_size).k(k_size)
          .a_stride(43)
          .Test(
              xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64,
              xnn_init_qu8_conv_minmax_fp32_sse2_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
4942
// For n = 8 and 12 and k = 1, 10, 19, 28, 37, runs each m in 1..1 with a
// single iteration.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, n_div_4_subtile) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t n_size = 8; n_size <= 12; n_size += 4) {
    for (size_t k_size = 1; k_size <= 40; k_size += 9) {
      for (uint32_t m_size = 1; m_size < 2; ++m_size) {
        GemmMicrokernelTester()
            .mr(1).nr(4).kr(2).sr(1)
            .m(m_size).n(n_size).k(k_size)
            .iterations(1)
            .Test(
                xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64,
                xnn_init_qu8_conv_minmax_fp32_sse2_params,
                xnn_qu8_requantize_fp32);
      }
    }
  }
}
4962
// For k = 1, 10, 19, 28, 37, runs each (n, m) pair with n in 1..4 and m in
// 1..1 using cm_stride(7), one iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, strided_cm_subtile) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t k_size = 1; k_size <= 40; k_size += 9) {
    for (uint32_t n_size = 1; n_size < 5; ++n_size) {
      for (uint32_t m_size = 1; m_size < 2; ++m_size) {
        GemmMicrokernelTester()
            .mr(1).nr(4).kr(2).sr(1)
            .m(m_size).n(n_size).k(k_size)
            .cm_stride(7)
            .iterations(1)
            .Test(
                xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64,
                xnn_init_qu8_conv_minmax_fp32_sse2_params,
                xnn_qu8_requantize_fp32);
      }
    }
  }
}
4983
// Single run with m=1, n=4, k=8 and qmin(128).
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, qmin) {
  TEST_REQUIRES_X86_SSE2;
  GemmMicrokernelTester()
      .mr(1).nr(4).kr(2).sr(1)
      .m(1).n(4).k(8)
      .qmin(128)
      .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
}
4997
// Single run with m=1, n=4, k=8 and qmax(128).
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, qmax) {
  TEST_REQUIRES_X86_SSE2;
  GemmMicrokernelTester()
      .mr(1).nr(4).kr(2).sr(1)
      .m(1).n(4).k(8)
      .qmax(128)
      .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
}
5011
// Single run with m=1, n=4, k=8 and cm_stride(7).
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, strided_cm) {
  TEST_REQUIRES_X86_SSE2;
  GemmMicrokernelTester()
      .mr(1).nr(4).kr(2).sr(1)
      .m(1).n(4).k(8)
      .cm_stride(7)
      .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
}
5025
// Runs with m=1, n=4 and a_zero_point(0) for k = 1, 10, 19, 28, 37.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, no_a_zero_point) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t k_size = 1; k_size <= 40; k_size += 9) {
    GemmMicrokernelTester()
        .mr(1).nr(4).kr(2).sr(1)
        .m(1).n(4).k(k_size)
        .a_zero_point(0)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
  }
}
5041
// Runs with m=1, n=4 and b_zero_point(0) for k = 1, 10, 19, 28, 37.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, no_b_zero_point) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t k_size = 1; k_size <= 40; k_size += 9) {
    GemmMicrokernelTester()
        .mr(1).nr(4).kr(2).sr(1)
        .m(1).n(4).k(k_size)
        .b_zero_point(0)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
  }
}
5057
// Runs with m=1, n=4 and both a_zero_point(0) and b_zero_point(0) for
// k = 1, 10, 19, 28, 37.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, no_zero_point) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t k_size = 1; k_size <= 40; k_size += 9) {
    GemmMicrokernelTester()
        .mr(1).nr(4).kr(2).sr(1)
        .m(1).n(4).k(k_size)
        .a_zero_point(0)
        .b_zero_point(0)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
  }
}
5074 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
5075
5076
5077 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Single run of the 1x4c2 SSE4.1 (ld64) kernel with m=1, n=4, k=8.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD64, k_eq_8) {
  TEST_REQUIRES_X86_SSE41;
  GemmMicrokernelTester()
      .mr(1).nr(4).kr(2).sr(1)
      .m(1).n(4).k(8)
      .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
}
5090
// Single run with m=1, n=4, k=8 and cn_stride(7).
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD64, strided_cn) {
  TEST_REQUIRES_X86_SSE41;
  GemmMicrokernelTester()
      .mr(1).nr(4).kr(2).sr(1)
      .m(1).n(4).k(8)
      .cn_stride(7)
      .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
}
5104
// Single run with m=1, n=4, k=8 and a_stride(11).
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD64, k_eq_8_strided_a) {
  TEST_REQUIRES_X86_SSE41;
  GemmMicrokernelTester()
      .mr(1).nr(4).kr(2).sr(1)
      .m(1).n(4).k(8)
      .a_stride(11)
      .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
}
5118
// With k=8, runs each (n, m) pair with n in 1..4 and m in 1..1, one
// iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD64, k_eq_8_subtile) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t n_size = 1; n_size < 5; ++n_size) {
    for (uint32_t m_size = 1; m_size < 2; ++m_size) {
      GemmMicrokernelTester()
          .mr(1).nr(4).kr(2).sr(1)
          .m(m_size).n(n_size).k(8)
          .iterations(1)
          .Test(
              xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64,
              xnn_init_qu8_conv_minmax_fp32_sse2_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
5136
// With n=4 and k=8, runs each m in 1..1 with a single iteration.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD64, k_eq_8_subtile_m) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t m_size = 1; m_size < 2; ++m_size) {
    GemmMicrokernelTester()
        .mr(1).nr(4).kr(2).sr(1)
        .m(m_size).n(4).k(8)
        .iterations(1)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
  }
}
5152
// With m=1 and k=8, runs each n in 1..4 with a single iteration.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD64, k_eq_8_subtile_n) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t n_size = 1; n_size < 5; ++n_size) {
    GemmMicrokernelTester()
        .mr(1).nr(4).kr(2).sr(1)
        .m(1).n(n_size).k(8)
        .iterations(1)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
  }
}
5168
// Runs with m=1, n=4 for every k in 1..7.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD64, k_lt_8) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k_size = 1; k_size <= 7; ++k_size) {
    GemmMicrokernelTester()
        .mr(1).nr(4).kr(2).sr(1)
        .m(1).n(4).k(k_size)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
  }
}
5183
// Runs with m=1, n=4 and a_stride(11) for every k in 1..7.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD64, k_lt_8_strided_a) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k_size = 1; k_size <= 7; ++k_size) {
    GemmMicrokernelTester()
        .mr(1).nr(4).kr(2).sr(1)
        .m(1).n(4).k(k_size)
        .a_stride(11)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
  }
}
5199
// For every k in 1..7, runs each (n, m) pair with n in 1..4 and m in 1..1,
// one iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD64, k_lt_8_subtile) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k_size = 1; k_size <= 7; ++k_size) {
    for (uint32_t n_size = 1; n_size < 5; ++n_size) {
      for (uint32_t m_size = 1; m_size < 2; ++m_size) {
        GemmMicrokernelTester()
            .mr(1).nr(4).kr(2).sr(1)
            .m(m_size).n(n_size).k(k_size)
            .iterations(1)
            .Test(
                xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64,
                xnn_init_qu8_conv_minmax_fp32_sse2_params,
                xnn_qu8_requantize_fp32);
      }
    }
  }
}
5219
// Runs with m=1, n=4 for every k in 9..15.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD64, k_gt_8) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k_size = 9; k_size <= 15; ++k_size) {
    GemmMicrokernelTester()
        .mr(1).nr(4).kr(2).sr(1)
        .m(1).n(4).k(k_size)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
  }
}
5234
// Runs with m=1, n=4 and a_stride(19) for every k in 9..15.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD64, k_gt_8_strided_a) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k_size = 9; k_size <= 15; ++k_size) {
    GemmMicrokernelTester()
        .mr(1).nr(4).kr(2).sr(1)
        .m(1).n(4).k(k_size)
        .a_stride(19)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
  }
}
5250
// For every k in 9..15, runs each (n, m) pair with n in 1..4 and m in 1..1,
// one iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD64, k_gt_8_subtile) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k_size = 9; k_size <= 15; ++k_size) {
    for (uint32_t n_size = 1; n_size < 5; ++n_size) {
      for (uint32_t m_size = 1; m_size < 2; ++m_size) {
        GemmMicrokernelTester()
            .mr(1).nr(4).kr(2).sr(1)
            .m(m_size).n(n_size).k(k_size)
            .iterations(1)
            .Test(
                xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64,
                xnn_init_qu8_conv_minmax_fp32_sse2_params,
                xnn_qu8_requantize_fp32);
      }
    }
  }
}
5270
// Runs with m=1, n=4 for k = 16, 24, ..., 80 (multiples of 8).
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD64, k_div_8) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k_size = 16; k_size <= 80; k_size += 8) {
    GemmMicrokernelTester()
        .mr(1).nr(4).kr(2).sr(1)
        .m(1).n(4).k(k_size)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
  }
}
5285
// Runs with m=1, n=4 and a_stride(83) for k = 16, 24, ..., 80.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD64, k_div_8_strided_a) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k_size = 16; k_size <= 80; k_size += 8) {
    GemmMicrokernelTester()
        .mr(1).nr(4).kr(2).sr(1)
        .m(1).n(4).k(k_size)
        .a_stride(83)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
  }
}
5301
// For k = 16, 24, ..., 80, runs each (n, m) pair with n in 1..4 and m in
// 1..1, one iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD64, k_div_8_subtile) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k_size = 16; k_size <= 80; k_size += 8) {
    for (uint32_t n_size = 1; n_size < 5; ++n_size) {
      for (uint32_t m_size = 1; m_size < 2; ++m_size) {
        GemmMicrokernelTester()
            .mr(1).nr(4).kr(2).sr(1)
            .m(m_size).n(n_size).k(k_size)
            .iterations(1)
            .Test(
                xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64,
                xnn_init_qu8_conv_minmax_fp32_sse2_params,
                xnn_qu8_requantize_fp32);
      }
    }
  }
}
5321
// Runs with m=1 for every n in 5..7, each against k = 1, 10, 19, 28, 37.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD64, n_gt_4) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t n_size = 5; n_size <= 7; ++n_size) {
    for (size_t k_size = 1; k_size <= 40; k_size += 9) {
      GemmMicrokernelTester()
          .mr(1).nr(4).kr(2).sr(1)
          .m(1).n(n_size).k(k_size)
          .Test(
              xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64,
              xnn_init_qu8_conv_minmax_fp32_sse2_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
5338
// Runs with m=1 and cn_stride(7) for every n in 5..7, each against
// k = 1, 10, 19, 28, 37.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD64, n_gt_4_strided_cn) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t n_size = 5; n_size <= 7; ++n_size) {
    for (size_t k_size = 1; k_size <= 40; k_size += 9) {
      GemmMicrokernelTester()
          .mr(1).nr(4).kr(2).sr(1)
          .m(1).n(n_size).k(k_size)
          .cn_stride(7)
          .Test(
              xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64,
              xnn_init_qu8_conv_minmax_fp32_sse2_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
5356
// Runs with m=1 and a_stride(43) for every n in 5..7, each against
// k = 1, 10, 19, 28, 37.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD64, n_gt_4_strided_a) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t n_size = 5; n_size <= 7; ++n_size) {
    for (size_t k_size = 1; k_size <= 40; k_size += 9) {
      GemmMicrokernelTester()
          .mr(1).nr(4).kr(2).sr(1)
          .m(1).n(n_size).k(k_size)
          .a_stride(43)
          .Test(
              xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64,
              xnn_init_qu8_conv_minmax_fp32_sse2_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
5374
// n = 5..7, k in {1, 10, 19, 28, 37}, and every m up to mr (only m = 1 here);
// each configuration is run with iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD64, n_gt_4_subtile) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t n = 5; n < 8; ++n) {
    for (size_t k = 1; k <= 40; k += 9) {
      for (uint32_t m = 1; m <= 1; ++m) {
        GemmMicrokernelTester tester;
        tester.mr(1).nr(4).kr(2).sr(1);
        tester.m(m).n(n).k(k);
        tester.iterations(1);
        tester.Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
5394
// m = 1 with n in {8, 12} (multiples of 4), for k in {1, 10, 19, 28, 37}.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD64, n_div_4) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t n = 8; n <= 12; n += 4) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester tester;
      tester.mr(1).nr(4).kr(2).sr(1);
      tester.m(1).n(n).k(k);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
5411
// m = 1 with n in {8, 12} and k in {1, 10, 19, 28, 37}, with cn_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD64, n_div_4_strided_cn) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t n = 8; n <= 12; n += 4) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester tester;
      tester.mr(1).nr(4).kr(2).sr(1);
      tester.m(1).n(n).k(k);
      tester.cn_stride(7);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
5429
// m = 1 with n in {8, 12} and k in {1, 10, 19, 28, 37}, with a_stride set to 43.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD64, n_div_4_strided_a) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t n = 8; n <= 12; n += 4) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester tester;
      tester.mr(1).nr(4).kr(2).sr(1);
      tester.m(1).n(n).k(k);
      tester.a_stride(43);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
5447
// n in {8, 12}, k in {1, 10, 19, 28, 37}, and every m up to mr (only m = 1 here);
// each configuration is run with iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD64, n_div_4_subtile) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t n = 8; n <= 12; n += 4) {
    for (size_t k = 1; k <= 40; k += 9) {
      for (uint32_t m = 1; m <= 1; ++m) {
        GemmMicrokernelTester tester;
        tester.mr(1).nr(4).kr(2).sr(1);
        tester.m(m).n(n).k(k);
        tester.iterations(1);
        tester.Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
5467
// k in {1, 10, 19, 28, 37}, n = 1..4, m = 1, with cm_stride set to 7 and
// iterations(1) for every configuration.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD64, strided_cm_subtile) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k = 1; k <= 40; k += 9) {
    for (uint32_t n = 1; n <= 4; ++n) {
      for (uint32_t m = 1; m <= 1; ++m) {
        GemmMicrokernelTester tester;
        tester.mr(1).nr(4).kr(2).sr(1);
        tester.m(m).n(n).k(k);
        tester.cm_stride(7).iterations(1);
        tester.Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
5488
// Single 1x4 tile with k = 8 and qmin set to 128.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD64, qmin) {
  TEST_REQUIRES_X86_SSE41;
  GemmMicrokernelTester tester;
  tester.mr(1).nr(4).kr(2).sr(1);
  tester.m(1).n(4).k(8);
  tester.qmin(128);
  tester.Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
5502
// Single 1x4 tile with k = 8 and qmax set to 128.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD64, qmax) {
  TEST_REQUIRES_X86_SSE41;
  GemmMicrokernelTester tester;
  tester.mr(1).nr(4).kr(2).sr(1);
  tester.m(1).n(4).k(8);
  tester.qmax(128);
  tester.Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
5516
// Single 1x4 tile with k = 8 and cm_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD64, strided_cm) {
  TEST_REQUIRES_X86_SSE41;
  GemmMicrokernelTester tester;
  tester.mr(1).nr(4).kr(2).sr(1);
  tester.m(1).n(4).k(8);
  tester.cm_stride(7);
  tester.Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
5530
// Full 1x4 tile for k in {1, 10, 19, 28, 37} with a_zero_point set to 0.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD64, no_a_zero_point) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k = 1; k <= 40; k += 9) {
    GemmMicrokernelTester tester;
    tester.mr(1).nr(4).kr(2).sr(1);
    tester.m(1).n(4).k(k);
    tester.a_zero_point(0);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
5546
// Full 1x4 tile for k in {1, 10, 19, 28, 37} with b_zero_point set to 0.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD64, no_b_zero_point) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k = 1; k <= 40; k += 9) {
    GemmMicrokernelTester tester;
    tester.mr(1).nr(4).kr(2).sr(1);
    tester.m(1).n(4).k(k);
    tester.b_zero_point(0);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
5562
// Full 1x4 tile for k in {1, 10, 19, 28, 37} with both a_zero_point and
// b_zero_point set to 0.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD64, no_zero_point) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k = 1; k <= 40; k += 9) {
    GemmMicrokernelTester tester;
    tester.mr(1).nr(4).kr(2).sr(1);
    tester.m(1).n(4).k(k);
    tester.a_zero_point(0).b_zero_point(0);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
5579 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
5580
5581
5582 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Single full 2x4 tile with k = 8.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD64, k_eq_8) {
  TEST_REQUIRES_X86_SSE2;
  GemmMicrokernelTester tester;
  tester.mr(2).nr(4).kr(2).sr(1);
  tester.m(2).n(4).k(8);
  tester.Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
5595
// Single full 2x4 tile with k = 8 and cn_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD64, strided_cn) {
  TEST_REQUIRES_X86_SSE2;
  GemmMicrokernelTester tester;
  tester.mr(2).nr(4).kr(2).sr(1);
  tester.m(2).n(4).k(8);
  tester.cn_stride(7);
  tester.Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
5609
// Single full 2x4 tile with k = 8 and a_stride set to 11.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD64, k_eq_8_strided_a) {
  TEST_REQUIRES_X86_SSE2;
  GemmMicrokernelTester tester;
  tester.mr(2).nr(4).kr(2).sr(1);
  tester.m(2).n(4).k(8);
  tester.a_stride(11);
  tester.Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
5623
// k = 8 for every (m, n) with n = 1..4 and m = 1..2; iterations(1) per configuration.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD64, k_eq_8_subtile) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t n = 1; n <= 4; ++n) {
    for (uint32_t m = 1; m <= 2; ++m) {
      GemmMicrokernelTester tester;
      tester.mr(2).nr(4).kr(2).sr(1);
      tester.m(m).n(n).k(8);
      tester.iterations(1);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
5641
// k = 8, n = 4, with m = 1..2; iterations(1) per configuration.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD64, k_eq_8_subtile_m) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t m = 1; m <= 2; ++m) {
    GemmMicrokernelTester tester;
    tester.mr(2).nr(4).kr(2).sr(1);
    tester.m(m).n(4).k(8);
    tester.iterations(1);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
5657
// k = 8, m = 2, with n = 1..4; iterations(1) per configuration.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD64, k_eq_8_subtile_n) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t n = 1; n <= 4; ++n) {
    GemmMicrokernelTester tester;
    tester.mr(2).nr(4).kr(2).sr(1);
    tester.m(2).n(n).k(8);
    tester.iterations(1);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
5673
// Full 2x4 tile for every k = 1..7.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD64, k_lt_8) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t k = 1; k < 8; ++k) {
    GemmMicrokernelTester tester;
    tester.mr(2).nr(4).kr(2).sr(1);
    tester.m(2).n(4).k(k);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
5688
// Full 2x4 tile for every k = 1..7, with a_stride set to 11.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD64, k_lt_8_strided_a) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t k = 1; k < 8; ++k) {
    GemmMicrokernelTester tester;
    tester.mr(2).nr(4).kr(2).sr(1);
    tester.m(2).n(4).k(k);
    tester.a_stride(11);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
5704
// k = 1..7 crossed with n = 1..4 and m = 1..2; iterations(1) per configuration.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD64, k_lt_8_subtile) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t k = 1; k < 8; ++k) {
    for (uint32_t n = 1; n <= 4; ++n) {
      for (uint32_t m = 1; m <= 2; ++m) {
        GemmMicrokernelTester tester;
        tester.mr(2).nr(4).kr(2).sr(1);
        tester.m(m).n(n).k(k);
        tester.iterations(1);
        tester.Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
5724
// Full 2x4 tile for every k = 9..15.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD64, k_gt_8) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t k = 9; k < 16; ++k) {
    GemmMicrokernelTester tester;
    tester.mr(2).nr(4).kr(2).sr(1);
    tester.m(2).n(4).k(k);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
5739
// Full 2x4 tile for every k = 9..15, with a_stride set to 19.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD64, k_gt_8_strided_a) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t k = 9; k < 16; ++k) {
    GemmMicrokernelTester tester;
    tester.mr(2).nr(4).kr(2).sr(1);
    tester.m(2).n(4).k(k);
    tester.a_stride(19);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
5755
// k = 9..15 crossed with n = 1..4 and m = 1..2; iterations(1) per configuration.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD64, k_gt_8_subtile) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t k = 9; k < 16; ++k) {
    for (uint32_t n = 1; n <= 4; ++n) {
      for (uint32_t m = 1; m <= 2; ++m) {
        GemmMicrokernelTester tester;
        tester.mr(2).nr(4).kr(2).sr(1);
        tester.m(m).n(n).k(k);
        tester.iterations(1);
        tester.Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
5775
// Full 2x4 tile for k = 16, 24, ..., 80 (multiples of 8).
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD64, k_div_8) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t k = 16; k <= 80; k += 8) {
    GemmMicrokernelTester tester;
    tester.mr(2).nr(4).kr(2).sr(1);
    tester.m(2).n(4).k(k);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
5790
// Full 2x4 tile for k = 16, 24, ..., 80, with a_stride set to 83.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD64, k_div_8_strided_a) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t k = 16; k <= 80; k += 8) {
    GemmMicrokernelTester tester;
    tester.mr(2).nr(4).kr(2).sr(1);
    tester.m(2).n(4).k(k);
    tester.a_stride(83);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
5806
// k = 16, 24, ..., 80 crossed with n = 1..4 and m = 1..2; iterations(1) per configuration.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD64, k_div_8_subtile) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t k = 16; k <= 80; k += 8) {
    for (uint32_t n = 1; n <= 4; ++n) {
      for (uint32_t m = 1; m <= 2; ++m) {
        GemmMicrokernelTester tester;
        tester.mr(2).nr(4).kr(2).sr(1);
        tester.m(m).n(n).k(k);
        tester.iterations(1);
        tester.Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
5826
// m = 2 with n = 5..7 (above the nr = 4 setting), for k in {1, 10, 19, 28, 37}.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD64, n_gt_4) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t n = 5; n < 8; ++n) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester tester;
      tester.mr(2).nr(4).kr(2).sr(1);
      tester.m(2).n(n).k(k);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
5843
// m = 2 with n = 5..7 and k in {1, 10, 19, 28, 37}, with cn_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD64, n_gt_4_strided_cn) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t n = 5; n < 8; ++n) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester tester;
      tester.mr(2).nr(4).kr(2).sr(1);
      tester.m(2).n(n).k(k);
      tester.cn_stride(7);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
5861
// m = 2 with n = 5..7 and k in {1, 10, 19, 28, 37}, with a_stride set to 43.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD64, n_gt_4_strided_a) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t n = 5; n < 8; ++n) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester tester;
      tester.mr(2).nr(4).kr(2).sr(1);
      tester.m(2).n(n).k(k);
      tester.a_stride(43);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
5879
// n = 5..7 crossed with k in {1, 10, 19, 28, 37} and m = 1..2;
// iterations(1) per configuration.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD64, n_gt_4_subtile) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t n = 5; n < 8; ++n) {
    for (size_t k = 1; k <= 40; k += 9) {
      for (uint32_t m = 1; m <= 2; ++m) {
        GemmMicrokernelTester tester;
        tester.mr(2).nr(4).kr(2).sr(1);
        tester.m(m).n(n).k(k);
        tester.iterations(1);
        tester.Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
5899
// m = 2 with n in {8, 12} (multiples of 4), for k in {1, 10, 19, 28, 37}.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD64, n_div_4) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t n = 8; n <= 12; n += 4) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester tester;
      tester.mr(2).nr(4).kr(2).sr(1);
      tester.m(2).n(n).k(k);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
5916
// m = 2 with n in {8, 12} and k in {1, 10, 19, 28, 37}, with cn_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD64, n_div_4_strided_cn) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t n = 8; n <= 12; n += 4) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester tester;
      tester.mr(2).nr(4).kr(2).sr(1);
      tester.m(2).n(n).k(k);
      tester.cn_stride(7);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
5934
// m = 2 with n in {8, 12} and k in {1, 10, 19, 28, 37}, with a_stride set to 43.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD64, n_div_4_strided_a) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t n = 8; n <= 12; n += 4) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester tester;
      tester.mr(2).nr(4).kr(2).sr(1);
      tester.m(2).n(n).k(k);
      tester.a_stride(43);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
5952
// n in {8, 12} crossed with k in {1, 10, 19, 28, 37} and m = 1..2;
// iterations(1) per configuration.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD64, n_div_4_subtile) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t n = 8; n <= 12; n += 4) {
    for (size_t k = 1; k <= 40; k += 9) {
      for (uint32_t m = 1; m <= 2; ++m) {
        GemmMicrokernelTester tester;
        tester.mr(2).nr(4).kr(2).sr(1);
        tester.m(m).n(n).k(k);
        tester.iterations(1);
        tester.Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
5972
// k in {1, 10, 19, 28, 37} crossed with n = 1..4 and m = 1..2, with cm_stride
// set to 7 and iterations(1) per configuration.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD64, strided_cm_subtile) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t k = 1; k <= 40; k += 9) {
    for (uint32_t n = 1; n <= 4; ++n) {
      for (uint32_t m = 1; m <= 2; ++m) {
        GemmMicrokernelTester tester;
        tester.mr(2).nr(4).kr(2).sr(1);
        tester.m(m).n(n).k(k);
        tester.cm_stride(7).iterations(1);
        tester.Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
5993
// Single full 2x4 tile with k = 8 and qmin set to 128.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD64, qmin) {
  TEST_REQUIRES_X86_SSE2;
  GemmMicrokernelTester tester;
  tester.mr(2).nr(4).kr(2).sr(1);
  tester.m(2).n(4).k(8);
  tester.qmin(128);
  tester.Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
6007
// Single full 2x4 tile with k = 8 and qmax set to 128.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD64, qmax) {
  TEST_REQUIRES_X86_SSE2;
  GemmMicrokernelTester tester;
  tester.mr(2).nr(4).kr(2).sr(1);
  tester.m(2).n(4).k(8);
  tester.qmax(128);
  tester.Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
6021
// Single full 2x4 tile with k = 8 and cm_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD64, strided_cm) {
  TEST_REQUIRES_X86_SSE2;
  GemmMicrokernelTester tester;
  tester.mr(2).nr(4).kr(2).sr(1);
  tester.m(2).n(4).k(8);
  tester.cm_stride(7);
  tester.Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
6035
// Full 2x4 tile for k in {1, 10, 19, 28, 37} with a_zero_point set to 0.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD64, no_a_zero_point) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t k = 1; k <= 40; k += 9) {
    GemmMicrokernelTester tester;
    tester.mr(2).nr(4).kr(2).sr(1);
    tester.m(2).n(4).k(k);
    tester.a_zero_point(0);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
6051
// Full 2x4 tile for k in {1, 10, 19, 28, 37} with b_zero_point set to 0.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD64, no_b_zero_point) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t k = 1; k <= 40; k += 9) {
    GemmMicrokernelTester tester;
    tester.mr(2).nr(4).kr(2).sr(1);
    tester.m(2).n(4).k(k);
    tester.b_zero_point(0);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
6067
// Full 2x4 tile for k in {1, 10, 19, 28, 37} with both a_zero_point and
// b_zero_point set to 0.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__SSE2_LD64, no_zero_point) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t k = 1; k <= 40; k += 9) {
    GemmMicrokernelTester tester;
    tester.mr(2).nr(4).kr(2).sr(1);
    tester.m(2).n(4).k(k);
    tester.a_zero_point(0).b_zero_point(0);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
6084 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
6085
6086
6087 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Single full 2x4 tile with k = 8.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, k_eq_8) {
  TEST_REQUIRES_X86_SSE41;
  GemmMicrokernelTester tester;
  tester.mr(2).nr(4).kr(2).sr(1);
  tester.m(2).n(4).k(8);
  tester.Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
6100
// Single full 2x4 tile with k = 8 and cn_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, strided_cn) {
  TEST_REQUIRES_X86_SSE41;
  GemmMicrokernelTester tester;
  tester.mr(2).nr(4).kr(2).sr(1);
  tester.m(2).n(4).k(8);
  tester.cn_stride(7);
  tester.Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
6114
// Single full 2x4 tile with k = 8 and a_stride set to 11.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, k_eq_8_strided_a) {
  TEST_REQUIRES_X86_SSE41;
  GemmMicrokernelTester tester;
  tester.mr(2).nr(4).kr(2).sr(1);
  tester.m(2).n(4).k(8);
  tester.a_stride(11);
  tester.Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
6128
// k = 8 for every (m, n) with n = 1..4 and m = 1..2; iterations(1) per configuration.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, k_eq_8_subtile) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t n = 1; n <= 4; ++n) {
    for (uint32_t m = 1; m <= 2; ++m) {
      GemmMicrokernelTester tester;
      tester.mr(2).nr(4).kr(2).sr(1);
      tester.m(m).n(n).k(8);
      tester.iterations(1);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
6146
// k = 8, n = 4, with m = 1..2; iterations(1) per configuration.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, k_eq_8_subtile_m) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t m = 1; m <= 2; ++m) {
    GemmMicrokernelTester tester;
    tester.mr(2).nr(4).kr(2).sr(1);
    tester.m(m).n(4).k(8);
    tester.iterations(1);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
6162
// k = 8, m = 2, with n = 1..4; iterations(1) per configuration.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, k_eq_8_subtile_n) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t n = 1; n <= 4; ++n) {
    GemmMicrokernelTester tester;
    tester.mr(2).nr(4).kr(2).sr(1);
    tester.m(2).n(n).k(8);
    tester.iterations(1);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
6178
// Full 2x4 tile for every k = 1..7.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, k_lt_8) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k = 1; k < 8; ++k) {
    GemmMicrokernelTester tester;
    tester.mr(2).nr(4).kr(2).sr(1);
    tester.m(2).n(4).k(k);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
6193
// Full 2x4 tile for every k = 1..7, with a_stride set to 11.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, k_lt_8_strided_a) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k = 1; k < 8; ++k) {
    GemmMicrokernelTester tester;
    tester.mr(2).nr(4).kr(2).sr(1);
    tester.m(2).n(4).k(k);
    tester.a_stride(11);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
6209
// k = 1..7 crossed with n = 1..4 and m = 1..2; iterations(1) per configuration.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, k_lt_8_subtile) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k = 1; k < 8; ++k) {
    for (uint32_t n = 1; n <= 4; ++n) {
      for (uint32_t m = 1; m <= 2; ++m) {
        GemmMicrokernelTester tester;
        tester.mr(2).nr(4).kr(2).sr(1);
        tester.m(m).n(n).k(k);
        tester.iterations(1);
        tester.Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
6229
// Full 2x4 tile for every k = 9..15.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, k_gt_8) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k = 9; k < 16; ++k) {
    GemmMicrokernelTester tester;
    tester.mr(2).nr(4).kr(2).sr(1);
    tester.m(2).n(4).k(k);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
6244
// Full 2x4 tile for every k = 9..15, with a_stride set to 19.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, k_gt_8_strided_a) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k = 9; k < 16; ++k) {
    GemmMicrokernelTester tester;
    tester.mr(2).nr(4).kr(2).sr(1);
    tester.m(2).n(4).k(k);
    tester.a_stride(19);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
6260
// k = 9..15 crossed with n = 1..4 and m = 1..2; iterations(1) per configuration.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, k_gt_8_subtile) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k = 9; k < 16; ++k) {
    for (uint32_t n = 1; n <= 4; ++n) {
      for (uint32_t m = 1; m <= 2; ++m) {
        GemmMicrokernelTester tester;
        tester.mr(2).nr(4).kr(2).sr(1);
        tester.m(m).n(n).k(k);
        tester.iterations(1);
        tester.Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
6280
// Full 2x4 tile for k = 16, 24, ..., 80 (multiples of 8).
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, k_div_8) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k = 16; k <= 80; k += 8) {
    GemmMicrokernelTester tester;
    tester.mr(2).nr(4).kr(2).sr(1);
    tester.m(2).n(4).k(k);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
6295
// Full 2x4 tile for k = 16, 24, ..., 80, with a_stride set to 83.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, k_div_8_strided_a) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k = 16; k <= 80; k += 8) {
    GemmMicrokernelTester tester;
    tester.mr(2).nr(4).kr(2).sr(1);
    tester.m(2).n(4).k(k);
    tester.a_stride(83);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
6311
// k = 16, 24, ..., 80 crossed with n = 1..4 and m = 1..2; iterations(1) per configuration.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, k_div_8_subtile) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k = 16; k <= 80; k += 8) {
    for (uint32_t n = 1; n <= 4; ++n) {
      for (uint32_t m = 1; m <= 2; ++m) {
        GemmMicrokernelTester tester;
        tester.mr(2).nr(4).kr(2).sr(1);
        tester.m(m).n(n).k(k);
        tester.iterations(1);
        tester.Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
6331
// m = 2 with n = 5..7 (above the nr = 4 setting), for k in {1, 10, 19, 28, 37}.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, n_gt_4) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t n = 5; n < 8; ++n) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester tester;
      tester.mr(2).nr(4).kr(2).sr(1);
      tester.m(2).n(n).k(k);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
6348
// m = 2 with n = 5..7 and k in {1, 10, 19, 28, 37}, with cn_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, n_gt_4_strided_cn) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t n = 5; n < 8; ++n) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester tester;
      tester.mr(2).nr(4).kr(2).sr(1);
      tester.m(2).n(n).k(k);
      tester.cn_stride(7);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
6366
// m = 2 with n = 5..7 and k in {1, 10, 19, 28, 37}, with a_stride set to 43.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, n_gt_4_strided_a) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t n = 5; n < 8; ++n) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester tester;
      tester.mr(2).nr(4).kr(2).sr(1);
      tester.m(2).n(n).k(k);
      tester.a_stride(43);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
6384
// n = 5..7 crossed with k in {1, 10, 19, 28, 37} and m = 1..2;
// iterations(1) per configuration.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, n_gt_4_subtile) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t n = 5; n < 8; ++n) {
    for (size_t k = 1; k <= 40; k += 9) {
      for (uint32_t m = 1; m <= 2; ++m) {
        GemmMicrokernelTester tester;
        tester.mr(2).nr(4).kr(2).sr(1);
        tester.m(m).n(n).k(k);
        tester.iterations(1);
        tester.Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
6404
// m = 2 with n in {8, 12} (multiples of 4), for k in {1, 10, 19, 28, 37}.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, n_div_4) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t n = 8; n <= 12; n += 4) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester tester;
      tester.mr(2).nr(4).kr(2).sr(1);
      tester.m(2).n(n).k(k);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
6421
// m = 2 with n in {8, 12} and k in {1, 10, 19, 28, 37}, with cn_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, n_div_4_strided_cn) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t n = 8; n <= 12; n += 4) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester tester;
      tester.mr(2).nr(4).kr(2).sr(1);
      tester.m(2).n(n).k(k);
      tester.cn_stride(7);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
6439
// m = 2 with n in {8, 12} and k in {1, 10, 19, 28, 37}, with a_stride set to 43.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, n_div_4_strided_a) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t n = 8; n <= 12; n += 4) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester tester;
      tester.mr(2).nr(4).kr(2).sr(1);
      tester.m(2).n(n).k(k);
      tester.a_stride(43);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
6457
// n in {8, 12} crossed with k in {1, 10, 19, 28, 37} and m = 1..2;
// iterations(1) per configuration.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, n_div_4_subtile) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t n = 8; n <= 12; n += 4) {
    for (size_t k = 1; k <= 40; k += 9) {
      for (uint32_t m = 1; m <= 2; ++m) {
        GemmMicrokernelTester tester;
        tester.mr(2).nr(4).kr(2).sr(1);
        tester.m(m).n(n).k(k);
        tester.iterations(1);
        tester.Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
6477
// k = 1, 10, ..., 37, every n in [1, 4] and m in [1, 2], with cm_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, strided_cm_subtile) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k = 1; k <= 40; k += 9) {
    for (uint32_t n = 1; n <= 4; ++n) {
      for (uint32_t m = 1; m <= 2; ++m) {
        auto tester = GemmMicrokernelTester().mr(2).nr(4).kr(2).sr(1).m(m).n(n).k(k).cm_stride(7).iterations(1);
        tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
      }
    }
  }
}
6498
// m = 2, n = 4, k = 8 with qmin set to 128.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, qmin) {
  TEST_REQUIRES_X86_SSE41;
  auto tester = GemmMicrokernelTester().mr(2).nr(4).kr(2).sr(1).m(2).n(4).k(8).qmin(128);
  tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
}
6512
// m = 2, n = 4, k = 8 with qmax set to 128.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, qmax) {
  TEST_REQUIRES_X86_SSE41;
  auto tester = GemmMicrokernelTester().mr(2).nr(4).kr(2).sr(1).m(2).n(4).k(8).qmax(128);
  tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
}
6526
// m = 2, n = 4, k = 8 with cm_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, strided_cm) {
  TEST_REQUIRES_X86_SSE41;
  auto tester = GemmMicrokernelTester().mr(2).nr(4).kr(2).sr(1).m(2).n(4).k(8).cm_stride(7);
  tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
}
6540
// k = 1, 10, ..., 37 with m = 2, n = 4 and a_zero_point set to 0.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, no_a_zero_point) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k = 1; k <= 40; k += 9) {
    auto tester = GemmMicrokernelTester().mr(2).nr(4).kr(2).sr(1).m(2).n(4).k(k).a_zero_point(0);
    tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
  }
}
6556
// k = 1, 10, ..., 37 with m = 2, n = 4 and b_zero_point set to 0.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, no_b_zero_point) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k = 1; k <= 40; k += 9) {
    auto tester = GemmMicrokernelTester().mr(2).nr(4).kr(2).sr(1).m(2).n(4).k(k).b_zero_point(0);
    tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
  }
}
6572
// k = 1, 10, ..., 37 with m = 2, n = 4 and both a_zero_point and b_zero_point set to 0.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD64, no_zero_point) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k = 1; k <= 40; k += 9) {
    auto tester = GemmMicrokernelTester().mr(2).nr(4).kr(2).sr(1).m(2).n(4).k(k).a_zero_point(0).b_zero_point(0);
    tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
  }
}
6589 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
6590
6591
6592 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// m = mr = 4, n = nr = 4, k = 8.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, k_eq_8) {
  TEST_REQUIRES_X86_SSE2;
  auto tester = GemmMicrokernelTester().mr(4).nr(4).kr(2).sr(1).m(4).n(4).k(8);
  tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
}
6605
// m = 4, n = 4, k = 8 with cn_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, strided_cn) {
  TEST_REQUIRES_X86_SSE2;
  auto tester = GemmMicrokernelTester().mr(4).nr(4).kr(2).sr(1).m(4).n(4).k(8).cn_stride(7);
  tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
}
6619
// m = 4, n = 4, k = 8 with a_stride set to 11.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, k_eq_8_strided_a) {
  TEST_REQUIRES_X86_SSE2;
  auto tester = GemmMicrokernelTester().mr(4).nr(4).kr(2).sr(1).m(4).n(4).k(8).a_stride(11);
  tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
}
6633
// k = 8 for every n in [1, 4] and m in [1, 4]; one iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, k_eq_8_subtile) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t n = 1; n <= 4; ++n) {
    for (uint32_t m = 1; m <= 4; ++m) {
      auto tester = GemmMicrokernelTester().mr(4).nr(4).kr(2).sr(1).m(m).n(n).k(8).iterations(1);
      tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
    }
  }
}
6651
// k = 8, n = 4 for every m in [1, 4]; one iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, k_eq_8_subtile_m) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t m = 1; m <= 4; ++m) {
    auto tester = GemmMicrokernelTester().mr(4).nr(4).kr(2).sr(1).m(m).n(4).k(8).iterations(1);
    tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
  }
}
6667
// k = 8, m = 4 for every n in [1, 4]; one iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, k_eq_8_subtile_n) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t n = 1; n <= 4; ++n) {
    auto tester = GemmMicrokernelTester().mr(4).nr(4).kr(2).sr(1).m(4).n(n).k(8).iterations(1);
    tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
  }
}
6683
// Every k in [1, 7] with m = 4, n = 4.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, k_lt_8) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t k = 1; k < 8; ++k) {
    auto tester = GemmMicrokernelTester().mr(4).nr(4).kr(2).sr(1).m(4).n(4).k(k);
    tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
  }
}
6698
// Every k in [1, 7] with m = 4, n = 4 and a_stride set to 11.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, k_lt_8_strided_a) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t k = 1; k < 8; ++k) {
    auto tester = GemmMicrokernelTester().mr(4).nr(4).kr(2).sr(1).m(4).n(4).k(k).a_stride(11);
    tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
  }
}
6714
// Every k in [1, 7], n in [1, 4] and m in [1, 4]; one iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, k_lt_8_subtile) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t k = 1; k < 8; ++k) {
    for (uint32_t n = 1; n <= 4; ++n) {
      for (uint32_t m = 1; m <= 4; ++m) {
        auto tester = GemmMicrokernelTester().mr(4).nr(4).kr(2).sr(1).m(m).n(n).k(k).iterations(1);
        tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
      }
    }
  }
}
6734
// Every k in [9, 15] with m = 4, n = 4.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, k_gt_8) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t k = 9; k < 16; ++k) {
    auto tester = GemmMicrokernelTester().mr(4).nr(4).kr(2).sr(1).m(4).n(4).k(k);
    tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
  }
}
6749
// Every k in [9, 15] with m = 4, n = 4 and a_stride set to 19.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, k_gt_8_strided_a) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t k = 9; k < 16; ++k) {
    auto tester = GemmMicrokernelTester().mr(4).nr(4).kr(2).sr(1).m(4).n(4).k(k).a_stride(19);
    tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
  }
}
6765
// Every k in [9, 15], n in [1, 4] and m in [1, 4]; one iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, k_gt_8_subtile) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t k = 9; k < 16; ++k) {
    for (uint32_t n = 1; n <= 4; ++n) {
      for (uint32_t m = 1; m <= 4; ++m) {
        auto tester = GemmMicrokernelTester().mr(4).nr(4).kr(2).sr(1).m(m).n(n).k(k).iterations(1);
        tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
      }
    }
  }
}
6785
// k = 16, 24, ..., 80 (multiples of 8) with m = 4, n = 4.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, k_div_8) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t k = 16; k <= 80; k += 8) {
    auto tester = GemmMicrokernelTester().mr(4).nr(4).kr(2).sr(1).m(4).n(4).k(k);
    tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
  }
}
6800
// k = 16, 24, ..., 80 with m = 4, n = 4 and a_stride set to 83.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, k_div_8_strided_a) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t k = 16; k <= 80; k += 8) {
    auto tester = GemmMicrokernelTester().mr(4).nr(4).kr(2).sr(1).m(4).n(4).k(k).a_stride(83);
    tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
  }
}
6816
// k = 16, 24, ..., 80, every n in [1, 4] and m in [1, 4]; one iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, k_div_8_subtile) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t k = 16; k <= 80; k += 8) {
    for (uint32_t n = 1; n <= 4; ++n) {
      for (uint32_t m = 1; m <= 4; ++m) {
        auto tester = GemmMicrokernelTester().mr(4).nr(4).kr(2).sr(1).m(m).n(n).k(k).iterations(1);
        tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
      }
    }
  }
}
6836
// Every n in [5, 7], k = 1, 10, ..., 37, with m = 4.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, n_gt_4) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t n = 5; n < 8; ++n) {
    for (size_t k = 1; k <= 40; k += 9) {
      auto tester = GemmMicrokernelTester().mr(4).nr(4).kr(2).sr(1).m(4).n(n).k(k);
      tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
    }
  }
}
6853
// Every n in [5, 7], k = 1, 10, ..., 37, m = 4, with cn_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, n_gt_4_strided_cn) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t n = 5; n < 8; ++n) {
    for (size_t k = 1; k <= 40; k += 9) {
      auto tester = GemmMicrokernelTester().mr(4).nr(4).kr(2).sr(1).m(4).n(n).k(k).cn_stride(7);
      tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
    }
  }
}
6871
// Every n in [5, 7], k = 1, 10, ..., 37, m = 4, with a_stride set to 43.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, n_gt_4_strided_a) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t n = 5; n < 8; ++n) {
    for (size_t k = 1; k <= 40; k += 9) {
      auto tester = GemmMicrokernelTester().mr(4).nr(4).kr(2).sr(1).m(4).n(n).k(k).a_stride(43);
      tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
    }
  }
}
6889
// Every n in [5, 7], k = 1, 10, ..., 37 and m in [1, 4]; one iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, n_gt_4_subtile) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t n = 5; n < 8; ++n) {
    for (size_t k = 1; k <= 40; k += 9) {
      for (uint32_t m = 1; m <= 4; ++m) {
        auto tester = GemmMicrokernelTester().mr(4).nr(4).kr(2).sr(1).m(m).n(n).k(k).iterations(1);
        tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
      }
    }
  }
}
6909
// n = 8 and 12 (multiples of nr = 4), k = 1, 10, ..., 37, with m = 4.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, n_div_4) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t n = 8; n <= 12; n += 4) {
    for (size_t k = 1; k <= 40; k += 9) {
      auto tester = GemmMicrokernelTester().mr(4).nr(4).kr(2).sr(1).m(4).n(n).k(k);
      tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
    }
  }
}
6926
// n = 8 and 12, k = 1, 10, ..., 37, m = 4, with cn_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, n_div_4_strided_cn) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t n = 8; n <= 12; n += 4) {
    for (size_t k = 1; k <= 40; k += 9) {
      auto tester = GemmMicrokernelTester().mr(4).nr(4).kr(2).sr(1).m(4).n(n).k(k).cn_stride(7);
      tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
    }
  }
}
6944
// n = 8 and 12, k = 1, 10, ..., 37, m = 4, with a_stride set to 43.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, n_div_4_strided_a) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t n = 8; n <= 12; n += 4) {
    for (size_t k = 1; k <= 40; k += 9) {
      auto tester = GemmMicrokernelTester().mr(4).nr(4).kr(2).sr(1).m(4).n(n).k(k).a_stride(43);
      tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
    }
  }
}
6962
// n = 8 and 12, k = 1, 10, ..., 37, every m in [1, 4]; one iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, n_div_4_subtile) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t n = 8; n <= 12; n += 4) {
    for (size_t k = 1; k <= 40; k += 9) {
      for (uint32_t m = 1; m <= 4; ++m) {
        auto tester = GemmMicrokernelTester().mr(4).nr(4).kr(2).sr(1).m(m).n(n).k(k).iterations(1);
        tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
      }
    }
  }
}
6982
// k = 1, 10, ..., 37, every n in [1, 4] and m in [1, 4], with cm_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, strided_cm_subtile) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t k = 1; k <= 40; k += 9) {
    for (uint32_t n = 1; n <= 4; ++n) {
      for (uint32_t m = 1; m <= 4; ++m) {
        auto tester = GemmMicrokernelTester().mr(4).nr(4).kr(2).sr(1).m(m).n(n).k(k).cm_stride(7).iterations(1);
        tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
      }
    }
  }
}
7003
// m = 4, n = 4, k = 8 with qmin set to 128.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, qmin) {
  TEST_REQUIRES_X86_SSE2;
  auto tester = GemmMicrokernelTester().mr(4).nr(4).kr(2).sr(1).m(4).n(4).k(8).qmin(128);
  tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
}
7017
// m = 4, n = 4, k = 8 with qmax set to 128.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, qmax) {
  TEST_REQUIRES_X86_SSE2;
  auto tester = GemmMicrokernelTester().mr(4).nr(4).kr(2).sr(1).m(4).n(4).k(8).qmax(128);
  tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
}
7031
// m = 4, n = 4, k = 8 with cm_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, strided_cm) {
  TEST_REQUIRES_X86_SSE2;
  auto tester = GemmMicrokernelTester().mr(4).nr(4).kr(2).sr(1).m(4).n(4).k(8).cm_stride(7);
  tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
}
7045
// k = 1, 10, ..., 37 with m = 4, n = 4 and a_zero_point set to 0.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, no_a_zero_point) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t k = 1; k <= 40; k += 9) {
    auto tester = GemmMicrokernelTester().mr(4).nr(4).kr(2).sr(1).m(4).n(4).k(k).a_zero_point(0);
    tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
  }
}
7061
// k = 1, 10, ..., 37 with m = 4, n = 4 and b_zero_point set to 0.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, no_b_zero_point) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t k = 1; k <= 40; k += 9) {
    auto tester = GemmMicrokernelTester().mr(4).nr(4).kr(2).sr(1).m(4).n(4).k(k).b_zero_point(0);
    tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
  }
}
7077
// k = 1, 10, ..., 37 with m = 4, n = 4 and both a_zero_point and b_zero_point set to 0.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2__SSE2_LD64, no_zero_point) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t k = 1; k <= 40; k += 9) {
    auto tester = GemmMicrokernelTester().mr(4).nr(4).kr(2).sr(1).m(4).n(4).k(k).a_zero_point(0).b_zero_point(0);
    tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
  }
}
7094 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
7095
7096
7097 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// m = mr = 4, n = nr = 4, k = 8.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, k_eq_8) {
  TEST_REQUIRES_X86_SSE41;
  auto tester = GemmMicrokernelTester().mr(4).nr(4).kr(2).sr(1).m(4).n(4).k(8);
  tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
}
7110
// m = 4, n = 4, k = 8 with cn_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, strided_cn) {
  TEST_REQUIRES_X86_SSE41;
  auto tester = GemmMicrokernelTester().mr(4).nr(4).kr(2).sr(1).m(4).n(4).k(8).cn_stride(7);
  tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
}
7124
// m = 4, n = 4, k = 8 with a_stride set to 11.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, k_eq_8_strided_a) {
  TEST_REQUIRES_X86_SSE41;
  auto tester = GemmMicrokernelTester().mr(4).nr(4).kr(2).sr(1).m(4).n(4).k(8).a_stride(11);
  tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
}
7138
// k = 8 for every n in [1, 4] and m in [1, 4]; one iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, k_eq_8_subtile) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t n = 1; n <= 4; ++n) {
    for (uint32_t m = 1; m <= 4; ++m) {
      auto tester = GemmMicrokernelTester().mr(4).nr(4).kr(2).sr(1).m(m).n(n).k(8).iterations(1);
      tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
    }
  }
}
7156
// k = 8, n = 4 for every m in [1, 4]; one iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, k_eq_8_subtile_m) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t m = 1; m <= 4; ++m) {
    auto tester = GemmMicrokernelTester().mr(4).nr(4).kr(2).sr(1).m(m).n(4).k(8).iterations(1);
    tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
  }
}
7172
// k = 8, m = 4 for every n in [1, 4]; one iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, k_eq_8_subtile_n) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t n = 1; n <= 4; ++n) {
    auto tester = GemmMicrokernelTester().mr(4).nr(4).kr(2).sr(1).m(4).n(n).k(8).iterations(1);
    tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
  }
}
7188
// Every k in [1, 7] with m = 4, n = 4.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, k_lt_8) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k = 1; k < 8; ++k) {
    auto tester = GemmMicrokernelTester().mr(4).nr(4).kr(2).sr(1).m(4).n(4).k(k);
    tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
  }
}
7203
// Every k in [1, 7] with m = 4, n = 4 and a_stride set to 11.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, k_lt_8_strided_a) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k = 1; k < 8; ++k) {
    auto tester = GemmMicrokernelTester().mr(4).nr(4).kr(2).sr(1).m(4).n(4).k(k).a_stride(11);
    tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
  }
}
7219
// Every k in [1, 7], n in [1, 4] and m in [1, 4]; one iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, k_lt_8_subtile) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k = 1; k < 8; ++k) {
    for (uint32_t n = 1; n <= 4; ++n) {
      for (uint32_t m = 1; m <= 4; ++m) {
        auto tester = GemmMicrokernelTester().mr(4).nr(4).kr(2).sr(1).m(m).n(n).k(k).iterations(1);
        tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
      }
    }
  }
}
7239
// Every k in [9, 15] with m = 4, n = 4.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, k_gt_8) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k = 9; k < 16; ++k) {
    auto tester = GemmMicrokernelTester().mr(4).nr(4).kr(2).sr(1).m(4).n(4).k(k);
    tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
  }
}
7254
// Every k in [9, 15] with m = 4, n = 4 and a_stride set to 19.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, k_gt_8_strided_a) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k = 9; k < 16; ++k) {
    auto tester = GemmMicrokernelTester().mr(4).nr(4).kr(2).sr(1).m(4).n(4).k(k).a_stride(19);
    tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
  }
}
7270
// Every k in [9, 15], n in [1, 4] and m in [1, 4]; one iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, k_gt_8_subtile) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k = 9; k < 16; ++k) {
    for (uint32_t n = 1; n <= 4; ++n) {
      for (uint32_t m = 1; m <= 4; ++m) {
        auto tester = GemmMicrokernelTester().mr(4).nr(4).kr(2).sr(1).m(m).n(n).k(k).iterations(1);
        tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
      }
    }
  }
}
7290
// k = 16, 24, ..., 80 (multiples of 8) with m = 4, n = 4.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, k_div_8) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k = 16; k <= 80; k += 8) {
    auto tester = GemmMicrokernelTester().mr(4).nr(4).kr(2).sr(1).m(4).n(4).k(k);
    tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
  }
}
7305
// k = 16, 24, ..., 80 with m = 4, n = 4 and a_stride set to 83.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, k_div_8_strided_a) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k = 16; k <= 80; k += 8) {
    auto tester = GemmMicrokernelTester().mr(4).nr(4).kr(2).sr(1).m(4).n(4).k(k).a_stride(83);
    tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
  }
}
7321
// k = 16, 24, ..., 80, every n in [1, 4] and m in [1, 4]; one iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, k_div_8_subtile) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k = 16; k <= 80; k += 8) {
    for (uint32_t n = 1; n <= 4; ++n) {
      for (uint32_t m = 1; m <= 4; ++m) {
        auto tester = GemmMicrokernelTester().mr(4).nr(4).kr(2).sr(1).m(m).n(n).k(k).iterations(1);
        tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
      }
    }
  }
}
7341
// Every n in [5, 7], k = 1, 10, ..., 37, with m = 4.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, n_gt_4) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t n = 5; n < 8; ++n) {
    for (size_t k = 1; k <= 40; k += 9) {
      auto tester = GemmMicrokernelTester().mr(4).nr(4).kr(2).sr(1).m(4).n(n).k(k);
      tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
    }
  }
}
7358
// Every n in [5, 7], k = 1, 10, ..., 37, m = 4, with cn_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, n_gt_4_strided_cn) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t n = 5; n < 8; ++n) {
    for (size_t k = 1; k <= 40; k += 9) {
      auto tester = GemmMicrokernelTester().mr(4).nr(4).kr(2).sr(1).m(4).n(n).k(k).cn_stride(7);
      tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
    }
  }
}
7376
// Every n in [5, 7], k = 1, 10, ..., 37, m = 4, with a_stride set to 43.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, n_gt_4_strided_a) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t n = 5; n < 8; ++n) {
    for (size_t k = 1; k <= 40; k += 9) {
      auto tester = GemmMicrokernelTester().mr(4).nr(4).kr(2).sr(1).m(4).n(n).k(k).a_stride(43);
      tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
    }
  }
}
7394
// Every n in [5, 7], k = 1, 10, ..., 37 and m in [1, 4]; one iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, n_gt_4_subtile) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t n = 5; n < 8; ++n) {
    for (size_t k = 1; k <= 40; k += 9) {
      for (uint32_t m = 1; m <= 4; ++m) {
        auto tester = GemmMicrokernelTester().mr(4).nr(4).kr(2).sr(1).m(m).n(n).k(k).iterations(1);
        tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
      }
    }
  }
}
7414
// n = 8 and 12 (multiples of nr = 4), k = 1, 10, ..., 37, with m = 4.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, n_div_4) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t n = 8; n <= 12; n += 4) {
    for (size_t k = 1; k <= 40; k += 9) {
      auto tester = GemmMicrokernelTester().mr(4).nr(4).kr(2).sr(1).m(4).n(n).k(k);
      tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
    }
  }
}
7431
// n = 8 and 12, k = 1, 10, ..., 37, m = 4, with cn_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, n_div_4_strided_cn) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t n = 8; n <= 12; n += 4) {
    for (size_t k = 1; k <= 40; k += 9) {
      auto tester = GemmMicrokernelTester().mr(4).nr(4).kr(2).sr(1).m(4).n(n).k(k).cn_stride(7);
      tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
    }
  }
}
7449
// n = 8 and 12, k = 1, 10, ..., 37, m = 4, with a_stride set to 43.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, n_div_4_strided_a) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t n = 8; n <= 12; n += 4) {
    for (size_t k = 1; k <= 40; k += 9) {
      auto tester = GemmMicrokernelTester().mr(4).nr(4).kr(2).sr(1).m(4).n(n).k(k).a_stride(43);
      tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
    }
  }
}
7467
// n = 8 and 12, k = 1, 10, ..., 37, every m in [1, 4]; one iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, n_div_4_subtile) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t n = 8; n <= 12; n += 4) {
    for (size_t k = 1; k <= 40; k += 9) {
      for (uint32_t m = 1; m <= 4; ++m) {
        auto tester = GemmMicrokernelTester().mr(4).nr(4).kr(2).sr(1).m(m).n(n).k(k).iterations(1);
        tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
      }
    }
  }
}
7487
// k = 1, 10, ..., 37, every n in [1, 4] and m in [1, 4], with cm_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, strided_cm_subtile) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k = 1; k <= 40; k += 9) {
    for (uint32_t n = 1; n <= 4; ++n) {
      for (uint32_t m = 1; m <= 4; ++m) {
        auto tester = GemmMicrokernelTester().mr(4).nr(4).kr(2).sr(1).m(m).n(n).k(k).cm_stride(7).iterations(1);
        tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
      }
    }
  }
}
7508
// m = 4, n = 4, k = 8 with qmin set to 128.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, qmin) {
  TEST_REQUIRES_X86_SSE41;
  auto tester = GemmMicrokernelTester().mr(4).nr(4).kr(2).sr(1).m(4).n(4).k(8).qmin(128);
  tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
}
7522
// m = 4, n = 4, k = 8 with qmax set to 128.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, qmax) {
  TEST_REQUIRES_X86_SSE41;
  auto tester = GemmMicrokernelTester().mr(4).nr(4).kr(2).sr(1).m(4).n(4).k(8).qmax(128);
  tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
}
7536
// m = 4, n = 4, k = 8 with cm_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, strided_cm) {
  TEST_REQUIRES_X86_SSE41;
  auto tester = GemmMicrokernelTester().mr(4).nr(4).kr(2).sr(1).m(4).n(4).k(8).cm_stride(7);
  tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
}
7550
// Full 4x4 tile with a_zero_point(0), for k = 1, 10, 19, 28, 37.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, no_a_zero_point) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t kc = 1; kc <= 40; kc += 9) {
    auto tester = GemmMicrokernelTester();
    tester.mr(4).nr(4).kr(2).sr(1);
    tester.m(4).n(4).k(kc);
    tester.a_zero_point(0);
    tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
  }
}
7566
// Full 4x4 tile with b_zero_point(0), for k = 1, 10, 19, 28, 37.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, no_b_zero_point) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t kc = 1; kc <= 40; kc += 9) {
    auto tester = GemmMicrokernelTester();
    tester.mr(4).nr(4).kr(2).sr(1);
    tester.m(4).n(4).k(kc);
    tester.b_zero_point(0);
    tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
  }
}
7582
// Full 4x4 tile with both a_zero_point(0) and b_zero_point(0), for k = 1, 10, 19, 28, 37.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2__SSE41_LD64, no_zero_point) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t kc = 1; kc <= 40; kc += 9) {
    auto tester = GemmMicrokernelTester();
    tester.mr(4).nr(4).kr(2).sr(1);
    tester.m(4).n(4).k(kc);
    tester.a_zero_point(0).b_zero_point(0);
    tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
  }
}
7599 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
7600
7601
7602 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Full 1x4 tile at k = 8.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, k_eq_8) {
  TEST_REQUIRES_X86_AVX;
  auto tester = GemmMicrokernelTester();
  tester.mr(1).nr(4).kr(2).sr(1);
  tester.m(1).n(4).k(8);
  tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
}
7615
// Full 1x4 tile at k = 8 with cn_stride(7).
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, strided_cn) {
  TEST_REQUIRES_X86_AVX;
  auto tester = GemmMicrokernelTester();
  tester.mr(1).nr(4).kr(2).sr(1);
  tester.m(1).n(4).k(8);
  tester.cn_stride(7);
  tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
}
7629
// Full 1x4 tile at k = 8 with a_stride(11).
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, k_eq_8_strided_a) {
  TEST_REQUIRES_X86_AVX;
  auto tester = GemmMicrokernelTester();
  tester.mr(1).nr(4).kr(2).sr(1);
  tester.m(1).n(4).k(8);
  tester.a_stride(11);
  tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
}
7643
// Every m x n sub-tile (m = 1, n = 1..4) at k = 8, one iteration each.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, k_eq_8_subtile) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t cols = 1; cols <= 4; ++cols) {
    for (uint32_t rows = 1; rows <= 1; ++rows) {
      auto tester = GemmMicrokernelTester();
      tester.mr(1).nr(4).kr(2).sr(1);
      tester.m(rows).n(cols).k(8);
      tester.iterations(1);
      tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
    }
  }
}
7661
// Varies m over 1..1 with n = 4 and k = 8, one iteration each.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, k_eq_8_subtile_m) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t rows = 1; rows <= 1; ++rows) {
    auto tester = GemmMicrokernelTester();
    tester.mr(1).nr(4).kr(2).sr(1);
    tester.m(rows).n(4).k(8);
    tester.iterations(1);
    tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
  }
}
7677
// Varies n over 1..4 with m = 1 and k = 8, one iteration each.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, k_eq_8_subtile_n) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t cols = 1; cols <= 4; ++cols) {
    auto tester = GemmMicrokernelTester();
    tester.mr(1).nr(4).kr(2).sr(1);
    tester.m(1).n(cols).k(8);
    tester.iterations(1);
    tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
  }
}
7693
// Full 1x4 tile for every k in 1..7.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, k_lt_8) {
  TEST_REQUIRES_X86_AVX;
  for (size_t kc = 1; kc <= 7; ++kc) {
    auto tester = GemmMicrokernelTester();
    tester.mr(1).nr(4).kr(2).sr(1);
    tester.m(1).n(4).k(kc);
    tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
  }
}
7708
// Full 1x4 tile for every k in 1..7 with a_stride(11).
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, k_lt_8_strided_a) {
  TEST_REQUIRES_X86_AVX;
  for (size_t kc = 1; kc <= 7; ++kc) {
    auto tester = GemmMicrokernelTester();
    tester.mr(1).nr(4).kr(2).sr(1);
    tester.m(1).n(4).k(kc);
    tester.a_stride(11);
    tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
  }
}
7724
// Every m x n sub-tile (m = 1, n = 1..4) for every k in 1..7, one iteration each.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, k_lt_8_subtile) {
  TEST_REQUIRES_X86_AVX;
  for (size_t kc = 1; kc <= 7; ++kc) {
    for (uint32_t cols = 1; cols <= 4; ++cols) {
      for (uint32_t rows = 1; rows <= 1; ++rows) {
        auto tester = GemmMicrokernelTester();
        tester.mr(1).nr(4).kr(2).sr(1);
        tester.m(rows).n(cols).k(kc);
        tester.iterations(1);
        tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
      }
    }
  }
}
7744
// Full 1x4 tile for every k in 9..15.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, k_gt_8) {
  TEST_REQUIRES_X86_AVX;
  for (size_t kc = 9; kc <= 15; ++kc) {
    auto tester = GemmMicrokernelTester();
    tester.mr(1).nr(4).kr(2).sr(1);
    tester.m(1).n(4).k(kc);
    tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
  }
}
7759
// Full 1x4 tile for every k in 9..15 with a_stride(19).
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, k_gt_8_strided_a) {
  TEST_REQUIRES_X86_AVX;
  for (size_t kc = 9; kc <= 15; ++kc) {
    auto tester = GemmMicrokernelTester();
    tester.mr(1).nr(4).kr(2).sr(1);
    tester.m(1).n(4).k(kc);
    tester.a_stride(19);
    tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
  }
}
7775
// Every m x n sub-tile (m = 1, n = 1..4) for every k in 9..15, one iteration each.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, k_gt_8_subtile) {
  TEST_REQUIRES_X86_AVX;
  for (size_t kc = 9; kc <= 15; ++kc) {
    for (uint32_t cols = 1; cols <= 4; ++cols) {
      for (uint32_t rows = 1; rows <= 1; ++rows) {
        auto tester = GemmMicrokernelTester();
        tester.mr(1).nr(4).kr(2).sr(1);
        tester.m(rows).n(cols).k(kc);
        tester.iterations(1);
        tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
      }
    }
  }
}
7795
// Full 1x4 tile for k = 16, 24, ..., 80 (multiples of 8).
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, k_div_8) {
  TEST_REQUIRES_X86_AVX;
  for (size_t kc = 16; kc <= 80; kc += 8) {
    auto tester = GemmMicrokernelTester();
    tester.mr(1).nr(4).kr(2).sr(1);
    tester.m(1).n(4).k(kc);
    tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
  }
}
7810
// Full 1x4 tile for k = 16, 24, ..., 80 with a_stride(83).
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, k_div_8_strided_a) {
  TEST_REQUIRES_X86_AVX;
  for (size_t kc = 16; kc <= 80; kc += 8) {
    auto tester = GemmMicrokernelTester();
    tester.mr(1).nr(4).kr(2).sr(1);
    tester.m(1).n(4).k(kc);
    tester.a_stride(83);
    tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
  }
}
7826
// Every m x n sub-tile (m = 1, n = 1..4) for k = 16, 24, ..., 80, one iteration each.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, k_div_8_subtile) {
  TEST_REQUIRES_X86_AVX;
  for (size_t kc = 16; kc <= 80; kc += 8) {
    for (uint32_t cols = 1; cols <= 4; ++cols) {
      for (uint32_t rows = 1; rows <= 1; ++rows) {
        auto tester = GemmMicrokernelTester();
        tester.mr(1).nr(4).kr(2).sr(1);
        tester.m(rows).n(cols).k(kc);
        tester.iterations(1);
        tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
      }
    }
  }
}
7846
// n in 5..7 (above nr = 4) with m = 1, for k = 1, 10, 19, 28, 37.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, n_gt_4) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t cols = 5; cols <= 7; ++cols) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      auto tester = GemmMicrokernelTester();
      tester.mr(1).nr(4).kr(2).sr(1);
      tester.m(1).n(cols).k(kc);
      tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
    }
  }
}
7863
// n in 5..7 with m = 1 and cn_stride(7), for k = 1, 10, 19, 28, 37.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, n_gt_4_strided_cn) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t cols = 5; cols <= 7; ++cols) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      auto tester = GemmMicrokernelTester();
      tester.mr(1).nr(4).kr(2).sr(1);
      tester.m(1).n(cols).k(kc);
      tester.cn_stride(7);
      tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
    }
  }
}
7881
// n in 5..7 with m = 1 and a_stride(43), for k = 1, 10, 19, 28, 37.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, n_gt_4_strided_a) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t cols = 5; cols <= 7; ++cols) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      auto tester = GemmMicrokernelTester();
      tester.mr(1).nr(4).kr(2).sr(1);
      tester.m(1).n(cols).k(kc);
      tester.a_stride(43);
      tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
    }
  }
}
7899
// n in 5..7, k = 1, 10, 19, 28, 37, and m over 1..1, one iteration each.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, n_gt_4_subtile) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t cols = 5; cols <= 7; ++cols) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      for (uint32_t rows = 1; rows <= 1; ++rows) {
        auto tester = GemmMicrokernelTester();
        tester.mr(1).nr(4).kr(2).sr(1);
        tester.m(rows).n(cols).k(kc);
        tester.iterations(1);
        tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
      }
    }
  }
}
7919
// n = 8 and 12 (multiples of nr = 4) with m = 1, for k = 1, 10, 19, 28, 37.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, n_div_4) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t cols = 8; cols <= 12; cols += 4) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      auto tester = GemmMicrokernelTester();
      tester.mr(1).nr(4).kr(2).sr(1);
      tester.m(1).n(cols).k(kc);
      tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
    }
  }
}
7936
// n = 8 and 12 with m = 1 and cn_stride(7), for k = 1, 10, 19, 28, 37.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, n_div_4_strided_cn) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t cols = 8; cols <= 12; cols += 4) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      auto tester = GemmMicrokernelTester();
      tester.mr(1).nr(4).kr(2).sr(1);
      tester.m(1).n(cols).k(kc);
      tester.cn_stride(7);
      tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
    }
  }
}
7954
// n = 8 and 12 with m = 1 and a_stride(43), for k = 1, 10, 19, 28, 37.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, n_div_4_strided_a) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t cols = 8; cols <= 12; cols += 4) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      auto tester = GemmMicrokernelTester();
      tester.mr(1).nr(4).kr(2).sr(1);
      tester.m(1).n(cols).k(kc);
      tester.a_stride(43);
      tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
    }
  }
}
7972
// n = 8 and 12, k = 1, 10, 19, 28, 37, and m over 1..1, one iteration each.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, n_div_4_subtile) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t cols = 8; cols <= 12; cols += 4) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      for (uint32_t rows = 1; rows <= 1; ++rows) {
        auto tester = GemmMicrokernelTester();
        tester.mr(1).nr(4).kr(2).sr(1);
        tester.m(rows).n(cols).k(kc);
        tester.iterations(1);
        tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
      }
    }
  }
}
7992
// Every m x n sub-tile (m = 1, n = 1..4) with cm_stride(7), for k = 1, 10, 19, 28, 37.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, strided_cm_subtile) {
  TEST_REQUIRES_X86_AVX;
  for (size_t kc = 1; kc <= 40; kc += 9) {
    for (uint32_t cols = 1; cols <= 4; ++cols) {
      for (uint32_t rows = 1; rows <= 1; ++rows) {
        auto tester = GemmMicrokernelTester();
        tester.mr(1).nr(4).kr(2).sr(1);
        tester.m(rows).n(cols).k(kc);
        tester.cm_stride(7).iterations(1);
        tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
      }
    }
  }
}
8013
// Full 1x4 tile at k = 8 with qmin(128).
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, qmin) {
  TEST_REQUIRES_X86_AVX;
  auto tester = GemmMicrokernelTester();
  tester.mr(1).nr(4).kr(2).sr(1);
  tester.m(1).n(4).k(8);
  tester.qmin(128);
  tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
}
8027
// Full 1x4 tile at k = 8 with qmax(128).
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, qmax) {
  TEST_REQUIRES_X86_AVX;
  auto tester = GemmMicrokernelTester();
  tester.mr(1).nr(4).kr(2).sr(1);
  tester.m(1).n(4).k(8);
  tester.qmax(128);
  tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
}
8041
// Full 1x4 tile at k = 8 with cm_stride(7).
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, strided_cm) {
  TEST_REQUIRES_X86_AVX;
  auto tester = GemmMicrokernelTester();
  tester.mr(1).nr(4).kr(2).sr(1);
  tester.m(1).n(4).k(8);
  tester.cm_stride(7);
  tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
}
8055
// Full 1x4 tile with a_zero_point(0), for k = 1, 10, 19, 28, 37.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, no_a_zero_point) {
  TEST_REQUIRES_X86_AVX;
  for (size_t kc = 1; kc <= 40; kc += 9) {
    auto tester = GemmMicrokernelTester();
    tester.mr(1).nr(4).kr(2).sr(1);
    tester.m(1).n(4).k(kc);
    tester.a_zero_point(0);
    tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
  }
}
8071
// Full 1x4 tile with b_zero_point(0), for k = 1, 10, 19, 28, 37.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, no_b_zero_point) {
  TEST_REQUIRES_X86_AVX;
  for (size_t kc = 1; kc <= 40; kc += 9) {
    auto tester = GemmMicrokernelTester();
    tester.mr(1).nr(4).kr(2).sr(1);
    tester.m(1).n(4).k(kc);
    tester.b_zero_point(0);
    tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
  }
}
8087
// Full 1x4 tile with both a_zero_point(0) and b_zero_point(0), for k = 1, 10, 19, 28, 37.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__AVX_LD64, no_zero_point) {
  TEST_REQUIRES_X86_AVX;
  for (size_t kc = 1; kc <= 40; kc += 9) {
    auto tester = GemmMicrokernelTester();
    tester.mr(1).nr(4).kr(2).sr(1);
    tester.m(1).n(4).k(kc);
    tester.a_zero_point(0).b_zero_point(0);
    tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
  }
}
8104 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
8105
8106
8107 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Full 1x4 tile at k = 8.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__XOP_LD64, k_eq_8) {
  TEST_REQUIRES_X86_XOP;
  auto tester = GemmMicrokernelTester();
  tester.mr(1).nr(4).kr(2).sr(1);
  tester.m(1).n(4).k(8);
  tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
}
8120
// Full 1x4 tile at k = 8 with cn_stride(7).
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__XOP_LD64, strided_cn) {
  TEST_REQUIRES_X86_XOP;
  auto tester = GemmMicrokernelTester();
  tester.mr(1).nr(4).kr(2).sr(1);
  tester.m(1).n(4).k(8);
  tester.cn_stride(7);
  tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
}
8134
// Full 1x4 tile at k = 8 with a_stride(11).
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__XOP_LD64, k_eq_8_strided_a) {
  TEST_REQUIRES_X86_XOP;
  auto tester = GemmMicrokernelTester();
  tester.mr(1).nr(4).kr(2).sr(1);
  tester.m(1).n(4).k(8);
  tester.a_stride(11);
  tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
}
8148
// Every m x n sub-tile (m = 1, n = 1..4) at k = 8, one iteration each.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__XOP_LD64, k_eq_8_subtile) {
  TEST_REQUIRES_X86_XOP;
  for (uint32_t cols = 1; cols <= 4; ++cols) {
    for (uint32_t rows = 1; rows <= 1; ++rows) {
      auto tester = GemmMicrokernelTester();
      tester.mr(1).nr(4).kr(2).sr(1);
      tester.m(rows).n(cols).k(8);
      tester.iterations(1);
      tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
    }
  }
}
8166
// Varies m over 1..1 with n = 4 and k = 8, one iteration each.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__XOP_LD64, k_eq_8_subtile_m) {
  TEST_REQUIRES_X86_XOP;
  for (uint32_t rows = 1; rows <= 1; ++rows) {
    auto tester = GemmMicrokernelTester();
    tester.mr(1).nr(4).kr(2).sr(1);
    tester.m(rows).n(4).k(8);
    tester.iterations(1);
    tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
  }
}
8182
// Varies n over 1..4 with m = 1 and k = 8, one iteration each.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__XOP_LD64, k_eq_8_subtile_n) {
  TEST_REQUIRES_X86_XOP;
  for (uint32_t cols = 1; cols <= 4; ++cols) {
    auto tester = GemmMicrokernelTester();
    tester.mr(1).nr(4).kr(2).sr(1);
    tester.m(1).n(cols).k(8);
    tester.iterations(1);
    tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
  }
}
8198
// Full 1x4 tile for every k in 1..7.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__XOP_LD64, k_lt_8) {
  TEST_REQUIRES_X86_XOP;
  for (size_t kc = 1; kc <= 7; ++kc) {
    auto tester = GemmMicrokernelTester();
    tester.mr(1).nr(4).kr(2).sr(1);
    tester.m(1).n(4).k(kc);
    tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
  }
}
8213
// Full 1x4 tile for every k in 1..7 with a_stride(11).
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__XOP_LD64, k_lt_8_strided_a) {
  TEST_REQUIRES_X86_XOP;
  for (size_t kc = 1; kc <= 7; ++kc) {
    auto tester = GemmMicrokernelTester();
    tester.mr(1).nr(4).kr(2).sr(1);
    tester.m(1).n(4).k(kc);
    tester.a_stride(11);
    tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
  }
}
8229
// Every m x n sub-tile (m = 1, n = 1..4) for every k in 1..7, one iteration each.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__XOP_LD64, k_lt_8_subtile) {
  TEST_REQUIRES_X86_XOP;
  for (size_t kc = 1; kc <= 7; ++kc) {
    for (uint32_t cols = 1; cols <= 4; ++cols) {
      for (uint32_t rows = 1; rows <= 1; ++rows) {
        auto tester = GemmMicrokernelTester();
        tester.mr(1).nr(4).kr(2).sr(1);
        tester.m(rows).n(cols).k(kc);
        tester.iterations(1);
        tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
      }
    }
  }
}
8249
// Full 1x4 tile for every k in 9..15.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__XOP_LD64, k_gt_8) {
  TEST_REQUIRES_X86_XOP;
  for (size_t kc = 9; kc <= 15; ++kc) {
    auto tester = GemmMicrokernelTester();
    tester.mr(1).nr(4).kr(2).sr(1);
    tester.m(1).n(4).k(kc);
    tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
  }
}
8264
// Full 1x4 tile for every k in 9..15 with a_stride(19).
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__XOP_LD64, k_gt_8_strided_a) {
  TEST_REQUIRES_X86_XOP;
  for (size_t kc = 9; kc <= 15; ++kc) {
    auto tester = GemmMicrokernelTester();
    tester.mr(1).nr(4).kr(2).sr(1);
    tester.m(1).n(4).k(kc);
    tester.a_stride(19);
    tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
  }
}
8280
// Every m x n sub-tile (m = 1, n = 1..4) for every k in 9..15, one iteration each.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__XOP_LD64, k_gt_8_subtile) {
  TEST_REQUIRES_X86_XOP;
  for (size_t kc = 9; kc <= 15; ++kc) {
    for (uint32_t cols = 1; cols <= 4; ++cols) {
      for (uint32_t rows = 1; rows <= 1; ++rows) {
        auto tester = GemmMicrokernelTester();
        tester.mr(1).nr(4).kr(2).sr(1);
        tester.m(rows).n(cols).k(kc);
        tester.iterations(1);
        tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
      }
    }
  }
}
8300
// Full 1x4 tile for k = 16, 24, ..., 80 (multiples of 8).
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__XOP_LD64, k_div_8) {
  TEST_REQUIRES_X86_XOP;
  for (size_t kc = 16; kc <= 80; kc += 8) {
    auto tester = GemmMicrokernelTester();
    tester.mr(1).nr(4).kr(2).sr(1);
    tester.m(1).n(4).k(kc);
    tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
  }
}
8315
// Full 1x4 tile for k = 16, 24, ..., 80 with a_stride(83).
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__XOP_LD64, k_div_8_strided_a) {
  TEST_REQUIRES_X86_XOP;
  for (size_t kc = 16; kc <= 80; kc += 8) {
    auto tester = GemmMicrokernelTester();
    tester.mr(1).nr(4).kr(2).sr(1);
    tester.m(1).n(4).k(kc);
    tester.a_stride(83);
    tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
  }
}
8331
// Every m x n sub-tile (m = 1, n = 1..4) for k = 16, 24, ..., 80, one iteration each.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__XOP_LD64, k_div_8_subtile) {
  TEST_REQUIRES_X86_XOP;
  for (size_t kc = 16; kc <= 80; kc += 8) {
    for (uint32_t cols = 1; cols <= 4; ++cols) {
      for (uint32_t rows = 1; rows <= 1; ++rows) {
        auto tester = GemmMicrokernelTester();
        tester.mr(1).nr(4).kr(2).sr(1);
        tester.m(rows).n(cols).k(kc);
        tester.iterations(1);
        tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
      }
    }
  }
}
8351
// n in 5..7 (above nr = 4) with m = 1, for k = 1, 10, 19, 28, 37.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__XOP_LD64, n_gt_4) {
  TEST_REQUIRES_X86_XOP;
  for (uint32_t cols = 5; cols <= 7; ++cols) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      auto tester = GemmMicrokernelTester();
      tester.mr(1).nr(4).kr(2).sr(1);
      tester.m(1).n(cols).k(kc);
      tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
    }
  }
}
8368
// n in 5..7 with m = 1 and cn_stride(7), for k = 1, 10, 19, 28, 37.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__XOP_LD64, n_gt_4_strided_cn) {
  TEST_REQUIRES_X86_XOP;
  for (uint32_t cols = 5; cols <= 7; ++cols) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      auto tester = GemmMicrokernelTester();
      tester.mr(1).nr(4).kr(2).sr(1);
      tester.m(1).n(cols).k(kc);
      tester.cn_stride(7);
      tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
    }
  }
}
8386
// n in 5..7 with m = 1 and a_stride(43), for k = 1, 10, 19, 28, 37.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__XOP_LD64, n_gt_4_strided_a) {
  TEST_REQUIRES_X86_XOP;
  for (uint32_t cols = 5; cols <= 7; ++cols) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      auto tester = GemmMicrokernelTester();
      tester.mr(1).nr(4).kr(2).sr(1);
      tester.m(1).n(cols).k(kc);
      tester.a_stride(43);
      tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
    }
  }
}
8404
// n in 5..7, k = 1, 10, 19, 28, 37, and m over 1..1, one iteration each.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__XOP_LD64, n_gt_4_subtile) {
  TEST_REQUIRES_X86_XOP;
  for (uint32_t cols = 5; cols <= 7; ++cols) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      for (uint32_t rows = 1; rows <= 1; ++rows) {
        auto tester = GemmMicrokernelTester();
        tester.mr(1).nr(4).kr(2).sr(1);
        tester.m(rows).n(cols).k(kc);
        tester.iterations(1);
        tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
      }
    }
  }
}
8424
// n = 8 and 12 (multiples of nr = 4) with m = 1, for k = 1, 10, 19, 28, 37.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__XOP_LD64, n_div_4) {
  TEST_REQUIRES_X86_XOP;
  for (uint32_t cols = 8; cols <= 12; cols += 4) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      auto tester = GemmMicrokernelTester();
      tester.mr(1).nr(4).kr(2).sr(1);
      tester.m(1).n(cols).k(kc);
      tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
    }
  }
}
8441
// n = 8 and 12 with m = 1 and cn_stride(7), for k = 1, 10, 19, 28, 37.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__XOP_LD64, n_div_4_strided_cn) {
  TEST_REQUIRES_X86_XOP;
  for (uint32_t cols = 8; cols <= 12; cols += 4) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      auto tester = GemmMicrokernelTester();
      tester.mr(1).nr(4).kr(2).sr(1);
      tester.m(1).n(cols).k(kc);
      tester.cn_stride(7);
      tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
    }
  }
}
8459
// n = 8 and 12 with m = 1 and a_stride(43), for k = 1, 10, 19, 28, 37.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__XOP_LD64, n_div_4_strided_a) {
  TEST_REQUIRES_X86_XOP;
  for (uint32_t cols = 8; cols <= 12; cols += 4) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      auto tester = GemmMicrokernelTester();
      tester.mr(1).nr(4).kr(2).sr(1);
      tester.m(1).n(cols).k(kc);
      tester.a_stride(43);
      tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
    }
  }
}
8477
// n = 8 and 12, k = 1, 10, 19, 28, 37, and m over 1..1, one iteration each.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__XOP_LD64, n_div_4_subtile) {
  TEST_REQUIRES_X86_XOP;
  for (uint32_t cols = 8; cols <= 12; cols += 4) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      for (uint32_t rows = 1; rows <= 1; ++rows) {
        auto tester = GemmMicrokernelTester();
        tester.mr(1).nr(4).kr(2).sr(1);
        tester.m(rows).n(cols).k(kc);
        tester.iterations(1);
        tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
      }
    }
  }
}
8497
// Every m x n sub-tile (m = 1, n = 1..4) with cm_stride(7), for k = 1, 10, 19, 28, 37.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__XOP_LD64, strided_cm_subtile) {
  TEST_REQUIRES_X86_XOP;
  for (size_t kc = 1; kc <= 40; kc += 9) {
    for (uint32_t cols = 1; cols <= 4; ++cols) {
      for (uint32_t rows = 1; rows <= 1; ++rows) {
        auto tester = GemmMicrokernelTester();
        tester.mr(1).nr(4).kr(2).sr(1);
        tester.m(rows).n(cols).k(kc);
        tester.cm_stride(7).iterations(1);
        tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
      }
    }
  }
}
8518
// Full 1x4 tile at k = 8 with qmin(128).
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__XOP_LD64, qmin) {
  TEST_REQUIRES_X86_XOP;
  auto tester = GemmMicrokernelTester();
  tester.mr(1).nr(4).kr(2).sr(1);
  tester.m(1).n(4).k(8);
  tester.qmin(128);
  tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
}
8532
// Full 1x4 tile at k = 8 with qmax(128).
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__XOP_LD64, qmax) {
  TEST_REQUIRES_X86_XOP;
  auto tester = GemmMicrokernelTester();
  tester.mr(1).nr(4).kr(2).sr(1);
  tester.m(1).n(4).k(8);
  tester.qmax(128);
  tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
}
8546
// Full 1x4 tile at k = 8 with cm_stride(7).
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__XOP_LD64, strided_cm) {
  TEST_REQUIRES_X86_XOP;
  auto tester = GemmMicrokernelTester();
  tester.mr(1).nr(4).kr(2).sr(1);
  tester.m(1).n(4).k(8);
  tester.cm_stride(7);
  tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
}
8560
// Full 1x4 tile with a_zero_point(0), for k = 1, 10, 19, 28, 37.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__XOP_LD64, no_a_zero_point) {
  TEST_REQUIRES_X86_XOP;
  for (size_t kc = 1; kc <= 40; kc += 9) {
    auto tester = GemmMicrokernelTester();
    tester.mr(1).nr(4).kr(2).sr(1);
    tester.m(1).n(4).k(kc);
    tester.a_zero_point(0);
    tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
  }
}
8576
// Full 1x4 tile with b_zero_point(0), for k = 1, 10, 19, 28, 37.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__XOP_LD64, no_b_zero_point) {
  TEST_REQUIRES_X86_XOP;
  for (size_t kc = 1; kc <= 40; kc += 9) {
    auto tester = GemmMicrokernelTester();
    tester.mr(1).nr(4).kr(2).sr(1);
    tester.m(1).n(4).k(kc);
    tester.b_zero_point(0);
    tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
  }
}
8592
// Full 1x4 tile with both a_zero_point(0) and b_zero_point(0), for k = 1, 10, 19, 28, 37.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__XOP_LD64, no_zero_point) {
  TEST_REQUIRES_X86_XOP;
  for (size_t kc = 1; kc <= 40; kc += 9) {
    auto tester = GemmMicrokernelTester();
    tester.mr(1).nr(4).kr(2).sr(1);
    tester.m(1).n(4).k(kc);
    tester.a_zero_point(0).b_zero_point(0);
    tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
  }
}
8609 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
8610
8611
8612 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Full 1x4 tile at k = 8.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD128, k_eq_8) {
  TEST_REQUIRES_X86_SSE41;
  auto tester = GemmMicrokernelTester();
  tester.mr(1).nr(4).kr(2).sr(1);
  tester.m(1).n(4).k(8);
  tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
}
8625
// Full 1x4 tile at k = 8 with cn_stride(7).
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD128, strided_cn) {
  TEST_REQUIRES_X86_SSE41;
  auto tester = GemmMicrokernelTester();
  tester.mr(1).nr(4).kr(2).sr(1);
  tester.m(1).n(4).k(8);
  tester.cn_stride(7);
  tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
}
8639
// Full 1x4 tile at k = 8 with a_stride(11).
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD128, k_eq_8_strided_a) {
  TEST_REQUIRES_X86_SSE41;
  auto tester = GemmMicrokernelTester();
  tester.mr(1).nr(4).kr(2).sr(1);
  tester.m(1).n(4).k(8);
  tester.a_stride(11);
  tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
}
8653
// k=8; sweeps n over 1..4 (outer) and m over 1..1 (inner), iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD128, k_eq_8_subtile) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t n_val = 1; n_val <= 4; ++n_val) {
    for (uint32_t m_val = 1; m_val <= 1; ++m_val) {
      GemmMicrokernelTester()
        .mr(1).nr(4).kr(2).sr(1)
        .m(m_val).n(n_val).k(8)
        .iterations(1)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
8671
// k=8, n=4; sweeps m over 1..1, iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD128, k_eq_8_subtile_m) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t m_val = 1; m_val <= 1; ++m_val) {
    GemmMicrokernelTester()
      .mr(1).nr(4).kr(2).sr(1)
      .m(m_val).n(4).k(8)
      .iterations(1)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
8687
// k=8, m=1; sweeps n over 1..4, iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD128, k_eq_8_subtile_n) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t n_val = 1; n_val <= 4; ++n_val) {
    GemmMicrokernelTester()
      .mr(1).nr(4).kr(2).sr(1)
      .m(1).n(n_val).k(8)
      .iterations(1)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
8703
// Sweeps k over 1..7 with m=1, n=4.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD128, k_lt_8) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k_val = 1; k_val < 8; ++k_val) {
    GemmMicrokernelTester()
      .mr(1).nr(4).kr(2).sr(1)
      .m(1).n(4).k(k_val)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
8718
// Sweeps k over 1..7 with m=1, n=4 and a_stride set to 11.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD128, k_lt_8_strided_a) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k_val = 1; k_val < 8; ++k_val) {
    GemmMicrokernelTester()
      .mr(1).nr(4).kr(2).sr(1)
      .m(1).n(4).k(k_val)
      .a_stride(11)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
8734
// Sweeps k over 1..7, n over 1..4 and m over 1..1 (nested in that order),
// iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD128, k_lt_8_subtile) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k_val = 1; k_val < 8; ++k_val) {
    for (uint32_t n_val = 1; n_val <= 4; ++n_val) {
      for (uint32_t m_val = 1; m_val <= 1; ++m_val) {
        GemmMicrokernelTester()
          .mr(1).nr(4).kr(2).sr(1)
          .m(m_val).n(n_val).k(k_val)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
8754
// Sweeps k over 9..15 with m=1, n=4.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD128, k_gt_8) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k_val = 9; k_val < 16; ++k_val) {
    GemmMicrokernelTester()
      .mr(1).nr(4).kr(2).sr(1)
      .m(1).n(4).k(k_val)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
8769
// Sweeps k over 9..15 with m=1, n=4 and a_stride set to 19.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD128, k_gt_8_strided_a) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k_val = 9; k_val < 16; ++k_val) {
    GemmMicrokernelTester()
      .mr(1).nr(4).kr(2).sr(1)
      .m(1).n(4).k(k_val)
      .a_stride(19)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
8785
// Sweeps k over 9..15, n over 1..4 and m over 1..1 (nested in that order),
// iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD128, k_gt_8_subtile) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k_val = 9; k_val < 16; ++k_val) {
    for (uint32_t n_val = 1; n_val <= 4; ++n_val) {
      for (uint32_t m_val = 1; m_val <= 1; ++m_val) {
        GemmMicrokernelTester()
          .mr(1).nr(4).kr(2).sr(1)
          .m(m_val).n(n_val).k(k_val)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
8805
// Sweeps k over 16..80 in steps of 8 with m=1, n=4.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD128, k_div_8) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k_val = 16; k_val <= 80; k_val += 8) {
    GemmMicrokernelTester()
      .mr(1).nr(4).kr(2).sr(1)
      .m(1).n(4).k(k_val)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
8820
// Sweeps k over 16..80 in steps of 8 with m=1, n=4 and a_stride set to 83.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD128, k_div_8_strided_a) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k_val = 16; k_val <= 80; k_val += 8) {
    GemmMicrokernelTester()
      .mr(1).nr(4).kr(2).sr(1)
      .m(1).n(4).k(k_val)
      .a_stride(83)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
8836
// Sweeps k over 16..80 in steps of 8, n over 1..4 and m over 1..1 (nested in
// that order), iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD128, k_div_8_subtile) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k_val = 16; k_val <= 80; k_val += 8) {
    for (uint32_t n_val = 1; n_val <= 4; ++n_val) {
      for (uint32_t m_val = 1; m_val <= 1; ++m_val) {
        GemmMicrokernelTester()
          .mr(1).nr(4).kr(2).sr(1)
          .m(m_val).n(n_val).k(k_val)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
8856
// Sweeps n over 5..7 (outer) and k over 1..40 in steps of 9 (inner), m=1.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD128, n_gt_4) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t n_val = 5; n_val < 8; ++n_val) {
    for (size_t k_val = 1; k_val <= 40; k_val += 9) {
      GemmMicrokernelTester()
        .mr(1).nr(4).kr(2).sr(1)
        .m(1).n(n_val).k(k_val)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
8873
// Sweeps n over 5..7 (outer) and k over 1..40 in steps of 9 (inner), m=1,
// with cn_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD128, n_gt_4_strided_cn) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t n_val = 5; n_val < 8; ++n_val) {
    for (size_t k_val = 1; k_val <= 40; k_val += 9) {
      GemmMicrokernelTester()
        .mr(1).nr(4).kr(2).sr(1)
        .m(1).n(n_val).k(k_val)
        .cn_stride(7)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
8891
// Sweeps n over 5..7 (outer) and k over 1..40 in steps of 9 (inner), m=1,
// with a_stride set to 43.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD128, n_gt_4_strided_a) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t n_val = 5; n_val < 8; ++n_val) {
    for (size_t k_val = 1; k_val <= 40; k_val += 9) {
      GemmMicrokernelTester()
        .mr(1).nr(4).kr(2).sr(1)
        .m(1).n(n_val).k(k_val)
        .a_stride(43)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
8909
// Sweeps n over 5..7, k over 1..40 in steps of 9 and m over 1..1 (nested in
// that order), iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD128, n_gt_4_subtile) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t n_val = 5; n_val < 8; ++n_val) {
    for (size_t k_val = 1; k_val <= 40; k_val += 9) {
      for (uint32_t m_val = 1; m_val <= 1; ++m_val) {
        GemmMicrokernelTester()
          .mr(1).nr(4).kr(2).sr(1)
          .m(m_val).n(n_val).k(k_val)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
8929
// Sweeps n over 8..12 in steps of 4 (outer) and k over 1..40 in steps of 9
// (inner), m=1.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD128, n_div_4) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t n_val = 8; n_val <= 12; n_val += 4) {
    for (size_t k_val = 1; k_val <= 40; k_val += 9) {
      GemmMicrokernelTester()
        .mr(1).nr(4).kr(2).sr(1)
        .m(1).n(n_val).k(k_val)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
8946
// Sweeps n over 8..12 in steps of 4 (outer) and k over 1..40 in steps of 9
// (inner), m=1, with cn_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD128, n_div_4_strided_cn) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t n_val = 8; n_val <= 12; n_val += 4) {
    for (size_t k_val = 1; k_val <= 40; k_val += 9) {
      GemmMicrokernelTester()
        .mr(1).nr(4).kr(2).sr(1)
        .m(1).n(n_val).k(k_val)
        .cn_stride(7)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
8964
// Sweeps n over 8..12 in steps of 4 (outer) and k over 1..40 in steps of 9
// (inner), m=1, with a_stride set to 43.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD128, n_div_4_strided_a) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t n_val = 8; n_val <= 12; n_val += 4) {
    for (size_t k_val = 1; k_val <= 40; k_val += 9) {
      GemmMicrokernelTester()
        .mr(1).nr(4).kr(2).sr(1)
        .m(1).n(n_val).k(k_val)
        .a_stride(43)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
8982
// Sweeps n over 8..12 in steps of 4, k over 1..40 in steps of 9 and m over
// 1..1 (nested in that order), iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD128, n_div_4_subtile) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t n_val = 8; n_val <= 12; n_val += 4) {
    for (size_t k_val = 1; k_val <= 40; k_val += 9) {
      for (uint32_t m_val = 1; m_val <= 1; ++m_val) {
        GemmMicrokernelTester()
          .mr(1).nr(4).kr(2).sr(1)
          .m(m_val).n(n_val).k(k_val)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
9002
// Sweeps k over 1..40 in steps of 9, n over 1..4 and m over 1..1 (nested in
// that order) with cm_stride set to 7, iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD128, strided_cm_subtile) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k_val = 1; k_val <= 40; k_val += 9) {
    for (uint32_t n_val = 1; n_val <= 4; ++n_val) {
      for (uint32_t m_val = 1; m_val <= 1; ++m_val) {
        GemmMicrokernelTester()
          .mr(1).nr(4).kr(2).sr(1)
          .m(m_val).n(n_val).k(k_val)
          .cm_stride(7)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
9023
// Single configuration: m=1, n=4, k=8 with qmin set to 128.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD128, qmin) {
  TEST_REQUIRES_X86_SSE41;
  GemmMicrokernelTester()
    .mr(1).nr(4).kr(2).sr(1)
    .m(1).n(4).k(8)
    .qmin(128)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
9037
// Single configuration: m=1, n=4, k=8 with qmax set to 128.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD128, qmax) {
  TEST_REQUIRES_X86_SSE41;
  GemmMicrokernelTester()
    .mr(1).nr(4).kr(2).sr(1)
    .m(1).n(4).k(8)
    .qmax(128)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
9051
// Single configuration: m=1, n=4, k=8 with cm_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD128, strided_cm) {
  TEST_REQUIRES_X86_SSE41;
  GemmMicrokernelTester()
    .mr(1).nr(4).kr(2).sr(1)
    .m(1).n(4).k(8)
    .cm_stride(7)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
9065
// Sweeps k over 1..40 in steps of 9 with a_zero_point set to 0 (m=1, n=4).
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD128, no_a_zero_point) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k_val = 1; k_val <= 40; k_val += 9) {
    GemmMicrokernelTester()
      .mr(1).nr(4).kr(2).sr(1)
      .m(1).n(4).k(k_val)
      .a_zero_point(0)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
9081
// Sweeps k over 1..40 in steps of 9 with b_zero_point set to 0 (m=1, n=4).
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD128, no_b_zero_point) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k_val = 1; k_val <= 40; k_val += 9) {
    GemmMicrokernelTester()
      .mr(1).nr(4).kr(2).sr(1)
      .m(1).n(4).k(k_val)
      .b_zero_point(0)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
9097
// Sweeps k over 1..40 in steps of 9 with both a_zero_point and b_zero_point
// set to 0 (m=1, n=4).
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__SSE41_LD128, no_zero_point) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k_val = 1; k_val <= 40; k_val += 9) {
    GemmMicrokernelTester()
      .mr(1).nr(4).kr(2).sr(1)
      .m(1).n(4).k(k_val)
      .a_zero_point(0)
      .b_zero_point(0)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
9114 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
9115
9116
9117 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Single configuration: m=2, n=4, k=8.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD128, k_eq_8) {
  TEST_REQUIRES_X86_SSE41;
  GemmMicrokernelTester()
    .mr(2).nr(4).kr(2).sr(1)
    .m(2).n(4).k(8)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
9130
// Single configuration: m=2, n=4, k=8 with cn_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD128, strided_cn) {
  TEST_REQUIRES_X86_SSE41;
  GemmMicrokernelTester()
    .mr(2).nr(4).kr(2).sr(1)
    .m(2).n(4).k(8)
    .cn_stride(7)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
9144
// Single configuration: m=2, n=4, k=8 with a_stride set to 11.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD128, k_eq_8_strided_a) {
  TEST_REQUIRES_X86_SSE41;
  GemmMicrokernelTester()
    .mr(2).nr(4).kr(2).sr(1)
    .m(2).n(4).k(8)
    .a_stride(11)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
9158
// k=8; sweeps n over 1..4 (outer) and m over 1..2 (inner), iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD128, k_eq_8_subtile) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t n_val = 1; n_val <= 4; ++n_val) {
    for (uint32_t m_val = 1; m_val <= 2; ++m_val) {
      GemmMicrokernelTester()
        .mr(2).nr(4).kr(2).sr(1)
        .m(m_val).n(n_val).k(8)
        .iterations(1)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
9176
// k=8, n=4; sweeps m over 1..2, iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD128, k_eq_8_subtile_m) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t m_val = 1; m_val <= 2; ++m_val) {
    GemmMicrokernelTester()
      .mr(2).nr(4).kr(2).sr(1)
      .m(m_val).n(4).k(8)
      .iterations(1)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
9192
// k=8, m=2; sweeps n over 1..4, iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD128, k_eq_8_subtile_n) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t n_val = 1; n_val <= 4; ++n_val) {
    GemmMicrokernelTester()
      .mr(2).nr(4).kr(2).sr(1)
      .m(2).n(n_val).k(8)
      .iterations(1)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
9208
// Sweeps k over 1..7 with m=2, n=4.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD128, k_lt_8) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k_val = 1; k_val < 8; ++k_val) {
    GemmMicrokernelTester()
      .mr(2).nr(4).kr(2).sr(1)
      .m(2).n(4).k(k_val)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
9223
// Sweeps k over 1..7 with m=2, n=4 and a_stride set to 11.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD128, k_lt_8_strided_a) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k_val = 1; k_val < 8; ++k_val) {
    GemmMicrokernelTester()
      .mr(2).nr(4).kr(2).sr(1)
      .m(2).n(4).k(k_val)
      .a_stride(11)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
9239
// Sweeps k over 1..7, n over 1..4 and m over 1..2 (nested in that order),
// iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD128, k_lt_8_subtile) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k_val = 1; k_val < 8; ++k_val) {
    for (uint32_t n_val = 1; n_val <= 4; ++n_val) {
      for (uint32_t m_val = 1; m_val <= 2; ++m_val) {
        GemmMicrokernelTester()
          .mr(2).nr(4).kr(2).sr(1)
          .m(m_val).n(n_val).k(k_val)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
9259
// Sweeps k over 9..15 with m=2, n=4.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD128, k_gt_8) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k_val = 9; k_val < 16; ++k_val) {
    GemmMicrokernelTester()
      .mr(2).nr(4).kr(2).sr(1)
      .m(2).n(4).k(k_val)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
9274
// Sweeps k over 9..15 with m=2, n=4 and a_stride set to 19.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD128, k_gt_8_strided_a) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k_val = 9; k_val < 16; ++k_val) {
    GemmMicrokernelTester()
      .mr(2).nr(4).kr(2).sr(1)
      .m(2).n(4).k(k_val)
      .a_stride(19)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
9290
// Sweeps k over 9..15, n over 1..4 and m over 1..2 (nested in that order),
// iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD128, k_gt_8_subtile) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k_val = 9; k_val < 16; ++k_val) {
    for (uint32_t n_val = 1; n_val <= 4; ++n_val) {
      for (uint32_t m_val = 1; m_val <= 2; ++m_val) {
        GemmMicrokernelTester()
          .mr(2).nr(4).kr(2).sr(1)
          .m(m_val).n(n_val).k(k_val)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
9310
// Sweeps k over 16..80 in steps of 8 with m=2, n=4.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD128, k_div_8) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k_val = 16; k_val <= 80; k_val += 8) {
    GemmMicrokernelTester()
      .mr(2).nr(4).kr(2).sr(1)
      .m(2).n(4).k(k_val)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
9325
// Sweeps k over 16..80 in steps of 8 with m=2, n=4 and a_stride set to 83.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD128, k_div_8_strided_a) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k_val = 16; k_val <= 80; k_val += 8) {
    GemmMicrokernelTester()
      .mr(2).nr(4).kr(2).sr(1)
      .m(2).n(4).k(k_val)
      .a_stride(83)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
9341
// Sweeps k over 16..80 in steps of 8, n over 1..4 and m over 1..2 (nested in
// that order), iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD128, k_div_8_subtile) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k_val = 16; k_val <= 80; k_val += 8) {
    for (uint32_t n_val = 1; n_val <= 4; ++n_val) {
      for (uint32_t m_val = 1; m_val <= 2; ++m_val) {
        GemmMicrokernelTester()
          .mr(2).nr(4).kr(2).sr(1)
          .m(m_val).n(n_val).k(k_val)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
9361
// Sweeps n over 5..7 (outer) and k over 1..40 in steps of 9 (inner), m=2.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD128, n_gt_4) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t n_val = 5; n_val < 8; ++n_val) {
    for (size_t k_val = 1; k_val <= 40; k_val += 9) {
      GemmMicrokernelTester()
        .mr(2).nr(4).kr(2).sr(1)
        .m(2).n(n_val).k(k_val)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
9378
// Sweeps n over 5..7 (outer) and k over 1..40 in steps of 9 (inner), m=2,
// with cn_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD128, n_gt_4_strided_cn) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t n_val = 5; n_val < 8; ++n_val) {
    for (size_t k_val = 1; k_val <= 40; k_val += 9) {
      GemmMicrokernelTester()
        .mr(2).nr(4).kr(2).sr(1)
        .m(2).n(n_val).k(k_val)
        .cn_stride(7)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
9396
// Sweeps n over 5..7 (outer) and k over 1..40 in steps of 9 (inner), m=2,
// with a_stride set to 43.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD128, n_gt_4_strided_a) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t n_val = 5; n_val < 8; ++n_val) {
    for (size_t k_val = 1; k_val <= 40; k_val += 9) {
      GemmMicrokernelTester()
        .mr(2).nr(4).kr(2).sr(1)
        .m(2).n(n_val).k(k_val)
        .a_stride(43)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
9414
// Sweeps n over 5..7, k over 1..40 in steps of 9 and m over 1..2 (nested in
// that order), iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD128, n_gt_4_subtile) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t n_val = 5; n_val < 8; ++n_val) {
    for (size_t k_val = 1; k_val <= 40; k_val += 9) {
      for (uint32_t m_val = 1; m_val <= 2; ++m_val) {
        GemmMicrokernelTester()
          .mr(2).nr(4).kr(2).sr(1)
          .m(m_val).n(n_val).k(k_val)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
9434
// Sweeps n over 8..12 in steps of 4 (outer) and k over 1..40 in steps of 9
// (inner), m=2.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD128, n_div_4) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t n_val = 8; n_val <= 12; n_val += 4) {
    for (size_t k_val = 1; k_val <= 40; k_val += 9) {
      GemmMicrokernelTester()
        .mr(2).nr(4).kr(2).sr(1)
        .m(2).n(n_val).k(k_val)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
9451
// Sweeps n over 8..12 in steps of 4 (outer) and k over 1..40 in steps of 9
// (inner), m=2, with cn_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD128, n_div_4_strided_cn) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t n_val = 8; n_val <= 12; n_val += 4) {
    for (size_t k_val = 1; k_val <= 40; k_val += 9) {
      GemmMicrokernelTester()
        .mr(2).nr(4).kr(2).sr(1)
        .m(2).n(n_val).k(k_val)
        .cn_stride(7)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
9469
// Sweeps n over 8..12 in steps of 4 (outer) and k over 1..40 in steps of 9
// (inner), m=2, with a_stride set to 43.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD128, n_div_4_strided_a) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t n_val = 8; n_val <= 12; n_val += 4) {
    for (size_t k_val = 1; k_val <= 40; k_val += 9) {
      GemmMicrokernelTester()
        .mr(2).nr(4).kr(2).sr(1)
        .m(2).n(n_val).k(k_val)
        .a_stride(43)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
9487
// Sweeps n over 8..12 in steps of 4, k over 1..40 in steps of 9 and m over
// 1..2 (nested in that order), iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD128, n_div_4_subtile) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t n_val = 8; n_val <= 12; n_val += 4) {
    for (size_t k_val = 1; k_val <= 40; k_val += 9) {
      for (uint32_t m_val = 1; m_val <= 2; ++m_val) {
        GemmMicrokernelTester()
          .mr(2).nr(4).kr(2).sr(1)
          .m(m_val).n(n_val).k(k_val)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
9507
// Sweeps k over 1..40 in steps of 9, n over 1..4 and m over 1..2 (nested in
// that order) with cm_stride set to 7, iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD128, strided_cm_subtile) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k_val = 1; k_val <= 40; k_val += 9) {
    for (uint32_t n_val = 1; n_val <= 4; ++n_val) {
      for (uint32_t m_val = 1; m_val <= 2; ++m_val) {
        GemmMicrokernelTester()
          .mr(2).nr(4).kr(2).sr(1)
          .m(m_val).n(n_val).k(k_val)
          .cm_stride(7)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
9528
// Single configuration: m=2, n=4, k=8 with qmin set to 128.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD128, qmin) {
  TEST_REQUIRES_X86_SSE41;
  GemmMicrokernelTester()
    .mr(2).nr(4).kr(2).sr(1)
    .m(2).n(4).k(8)
    .qmin(128)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
9542
// Single configuration: m=2, n=4, k=8 with qmax set to 128.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD128, qmax) {
  TEST_REQUIRES_X86_SSE41;
  GemmMicrokernelTester()
    .mr(2).nr(4).kr(2).sr(1)
    .m(2).n(4).k(8)
    .qmax(128)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
9556
// Single configuration: m=2, n=4, k=8 with cm_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD128, strided_cm) {
  TEST_REQUIRES_X86_SSE41;
  GemmMicrokernelTester()
    .mr(2).nr(4).kr(2).sr(1)
    .m(2).n(4).k(8)
    .cm_stride(7)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
9570
// Sweeps k over 1..40 in steps of 9 with a_zero_point set to 0 (m=2, n=4).
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD128, no_a_zero_point) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k_val = 1; k_val <= 40; k_val += 9) {
    GemmMicrokernelTester()
      .mr(2).nr(4).kr(2).sr(1)
      .m(2).n(4).k(k_val)
      .a_zero_point(0)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
9586
// Sweeps k over 1..40 in steps of 9 with b_zero_point set to 0 (m=2, n=4).
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD128, no_b_zero_point) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k_val = 1; k_val <= 40; k_val += 9) {
    GemmMicrokernelTester()
      .mr(2).nr(4).kr(2).sr(1)
      .m(2).n(4).k(k_val)
      .b_zero_point(0)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
9602
// Sweeps k over 1..40 in steps of 9 with both a_zero_point and b_zero_point
// set to 0 (m=2, n=4).
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__SSE41_LD128, no_zero_point) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k_val = 1; k_val <= 40; k_val += 9) {
    GemmMicrokernelTester()
      .mr(2).nr(4).kr(2).sr(1)
      .m(2).n(4).k(k_val)
      .a_zero_point(0)
      .b_zero_point(0)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
9619 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
9620
9621
9622 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Single configuration: m=3, n=4, k=8.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD128, k_eq_8) {
  TEST_REQUIRES_X86_SSE2;
  GemmMicrokernelTester()
    .mr(3).nr(4).kr(2).sr(1)
    .m(3).n(4).k(8)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
9635
// Single configuration: m=3, n=4, k=8 with cn_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD128, strided_cn) {
  TEST_REQUIRES_X86_SSE2;
  GemmMicrokernelTester()
    .mr(3).nr(4).kr(2).sr(1)
    .m(3).n(4).k(8)
    .cn_stride(7)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
9649
// Single configuration: m=3, n=4, k=8 with a_stride set to 11.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD128, k_eq_8_strided_a) {
  TEST_REQUIRES_X86_SSE2;
  GemmMicrokernelTester()
    .mr(3).nr(4).kr(2).sr(1)
    .m(3).n(4).k(8)
    .a_stride(11)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
9663
// k=8; sweeps n over 1..4 (outer) and m over 1..3 (inner), iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD128, k_eq_8_subtile) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t n_val = 1; n_val <= 4; ++n_val) {
    for (uint32_t m_val = 1; m_val <= 3; ++m_val) {
      GemmMicrokernelTester()
        .mr(3).nr(4).kr(2).sr(1)
        .m(m_val).n(n_val).k(8)
        .iterations(1)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
9681
// k=8, n=4; sweeps m over 1..3, iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD128, k_eq_8_subtile_m) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t m_val = 1; m_val <= 3; ++m_val) {
    GemmMicrokernelTester()
      .mr(3).nr(4).kr(2).sr(1)
      .m(m_val).n(4).k(8)
      .iterations(1)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
9697
// k=8, m=3; sweeps n over 1..4, iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD128, k_eq_8_subtile_n) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t n_val = 1; n_val <= 4; ++n_val) {
    GemmMicrokernelTester()
      .mr(3).nr(4).kr(2).sr(1)
      .m(3).n(n_val).k(8)
      .iterations(1)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
9713
// Sweeps k over 1..7 with m=3, n=4.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD128, k_lt_8) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t k_val = 1; k_val < 8; ++k_val) {
    GemmMicrokernelTester()
      .mr(3).nr(4).kr(2).sr(1)
      .m(3).n(4).k(k_val)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
9728
// Full 3x4 tile with every k in [1, 7] and a_stride set to 11.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD128, k_lt_8_strided_a) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t depth = 1; depth < 8; ++depth) {
    GemmMicrokernelTester tester{};
    tester.mr(3).nr(4).kr(2).sr(1);
    tester.m(3).n(4).k(depth);
    tester.a_stride(11);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
9744
// Every k in [1, 7] crossed with every sub-tile (n in [1, 4], m in [1, 3]);
// one iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD128, k_lt_8_subtile) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t depth = 1; depth < 8; ++depth) {
    for (uint32_t cols = 1; cols <= 4; ++cols) {
      for (uint32_t rows = 1; rows <= 3; ++rows) {
        GemmMicrokernelTester tester{};
        tester.mr(3).nr(4).kr(2).sr(1);
        tester.m(rows).n(cols).k(depth);
        tester.iterations(1);
        tester.Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
9764
// Full 3x4 tile with every k in [9, 15].
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD128, k_gt_8) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t depth = 9; depth < 16; ++depth) {
    GemmMicrokernelTester tester{};
    tester.mr(3).nr(4).kr(2).sr(1);
    tester.m(3).n(4).k(depth);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
9779
// Full 3x4 tile with every k in [9, 15] and a_stride set to 19.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD128, k_gt_8_strided_a) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t depth = 9; depth < 16; ++depth) {
    GemmMicrokernelTester tester{};
    tester.mr(3).nr(4).kr(2).sr(1);
    tester.m(3).n(4).k(depth);
    tester.a_stride(19);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
9795
// Every k in [9, 15] crossed with every sub-tile (n in [1, 4], m in [1, 3]);
// one iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD128, k_gt_8_subtile) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t depth = 9; depth < 16; ++depth) {
    for (uint32_t cols = 1; cols <= 4; ++cols) {
      for (uint32_t rows = 1; rows <= 3; ++rows) {
        GemmMicrokernelTester tester{};
        tester.mr(3).nr(4).kr(2).sr(1);
        tester.m(rows).n(cols).k(depth);
        tester.iterations(1);
        tester.Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
9815
// Full 3x4 tile with k = 16, 24, ..., 80 (multiples of 8).
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD128, k_div_8) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t depth = 16; depth <= 80; depth += 8) {
    GemmMicrokernelTester tester{};
    tester.mr(3).nr(4).kr(2).sr(1);
    tester.m(3).n(4).k(depth);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
9830
// Full 3x4 tile with k = 16, 24, ..., 80 and a_stride set to 83.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD128, k_div_8_strided_a) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t depth = 16; depth <= 80; depth += 8) {
    GemmMicrokernelTester tester{};
    tester.mr(3).nr(4).kr(2).sr(1);
    tester.m(3).n(4).k(depth);
    tester.a_stride(83);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
9846
// k = 16, 24, ..., 80 crossed with every sub-tile (n in [1, 4], m in [1, 3]);
// one iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD128, k_div_8_subtile) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t depth = 16; depth <= 80; depth += 8) {
    for (uint32_t cols = 1; cols <= 4; ++cols) {
      for (uint32_t rows = 1; rows <= 3; ++rows) {
        GemmMicrokernelTester tester{};
        tester.mr(3).nr(4).kr(2).sr(1);
        tester.m(rows).n(cols).k(depth);
        tester.iterations(1);
        tester.Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
9866
// m = 3 with n in [5, 7] and k = 1, 10, 19, 28, 37.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD128, n_gt_4) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t cols = 5; cols < 8; ++cols) {
    for (size_t depth = 1; depth <= 40; depth += 9) {
      GemmMicrokernelTester tester{};
      tester.mr(3).nr(4).kr(2).sr(1);
      tester.m(3).n(cols).k(depth);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
9883
// m = 3 with n in [5, 7], k = 1, 10, 19, 28, 37, and cn_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD128, n_gt_4_strided_cn) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t cols = 5; cols < 8; ++cols) {
    for (size_t depth = 1; depth <= 40; depth += 9) {
      GemmMicrokernelTester tester{};
      tester.mr(3).nr(4).kr(2).sr(1);
      tester.m(3).n(cols).k(depth);
      tester.cn_stride(7);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
9901
// m = 3 with n in [5, 7], k = 1, 10, 19, 28, 37, and a_stride set to 43.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD128, n_gt_4_strided_a) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t cols = 5; cols < 8; ++cols) {
    for (size_t depth = 1; depth <= 40; depth += 9) {
      GemmMicrokernelTester tester{};
      tester.mr(3).nr(4).kr(2).sr(1);
      tester.m(3).n(cols).k(depth);
      tester.a_stride(43);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
9919
// n in [5, 7] crossed with k = 1, 10, 19, 28, 37 and m in [1, 3];
// one iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD128, n_gt_4_subtile) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t cols = 5; cols < 8; ++cols) {
    for (size_t depth = 1; depth <= 40; depth += 9) {
      for (uint32_t rows = 1; rows <= 3; ++rows) {
        GemmMicrokernelTester tester{};
        tester.mr(3).nr(4).kr(2).sr(1);
        tester.m(rows).n(cols).k(depth);
        tester.iterations(1);
        tester.Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
9939
// m = 3 with n = 8 and 12 (multiples of 4) and k = 1, 10, 19, 28, 37.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD128, n_div_4) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t cols = 8; cols <= 12; cols += 4) {
    for (size_t depth = 1; depth <= 40; depth += 9) {
      GemmMicrokernelTester tester{};
      tester.mr(3).nr(4).kr(2).sr(1);
      tester.m(3).n(cols).k(depth);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
9956
// m = 3 with n = 8 and 12, k = 1, 10, 19, 28, 37, and cn_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD128, n_div_4_strided_cn) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t cols = 8; cols <= 12; cols += 4) {
    for (size_t depth = 1; depth <= 40; depth += 9) {
      GemmMicrokernelTester tester{};
      tester.mr(3).nr(4).kr(2).sr(1);
      tester.m(3).n(cols).k(depth);
      tester.cn_stride(7);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
9974
// m = 3 with n = 8 and 12, k = 1, 10, 19, 28, 37, and a_stride set to 43.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD128, n_div_4_strided_a) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t cols = 8; cols <= 12; cols += 4) {
    for (size_t depth = 1; depth <= 40; depth += 9) {
      GemmMicrokernelTester tester{};
      tester.mr(3).nr(4).kr(2).sr(1);
      tester.m(3).n(cols).k(depth);
      tester.a_stride(43);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
9992
// n = 8 and 12 crossed with k = 1, 10, 19, 28, 37 and m in [1, 3];
// one iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD128, n_div_4_subtile) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t cols = 8; cols <= 12; cols += 4) {
    for (size_t depth = 1; depth <= 40; depth += 9) {
      for (uint32_t rows = 1; rows <= 3; ++rows) {
        GemmMicrokernelTester tester{};
        tester.mr(3).nr(4).kr(2).sr(1);
        tester.m(rows).n(cols).k(depth);
        tester.iterations(1);
        tester.Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
10012
// cm_stride set to 7 for k = 1, 10, 19, 28, 37 crossed with every sub-tile
// (n in [1, 4], m in [1, 3]); one iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD128, strided_cm_subtile) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t depth = 1; depth <= 40; depth += 9) {
    for (uint32_t cols = 1; cols <= 4; ++cols) {
      for (uint32_t rows = 1; rows <= 3; ++rows) {
        GemmMicrokernelTester tester{};
        tester.mr(3).nr(4).kr(2).sr(1);
        tester.m(rows).n(cols).k(depth);
        tester.cm_stride(7);
        tester.iterations(1);
        tester.Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
10033
// Full 3x4 tile at k = 8 with qmin set to 128.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD128, qmin) {
  TEST_REQUIRES_X86_SSE2;
  GemmMicrokernelTester tester{};
  tester.mr(3).nr(4).kr(2).sr(1);
  tester.m(3).n(4).k(8);
  tester.qmin(128);
  tester.Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
10047
// Full 3x4 tile at k = 8 with qmax set to 128.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD128, qmax) {
  TEST_REQUIRES_X86_SSE2;
  GemmMicrokernelTester tester{};
  tester.mr(3).nr(4).kr(2).sr(1);
  tester.m(3).n(4).k(8);
  tester.qmax(128);
  tester.Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
10061
// Full 3x4 tile at k = 8 with cm_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD128, strided_cm) {
  TEST_REQUIRES_X86_SSE2;
  GemmMicrokernelTester tester{};
  tester.mr(3).nr(4).kr(2).sr(1);
  tester.m(3).n(4).k(8);
  tester.cm_stride(7);
  tester.Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
10075
// Full 3x4 tile for k = 1, 10, 19, 28, 37 with a_zero_point set to 0.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD128, no_a_zero_point) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t depth = 1; depth <= 40; depth += 9) {
    GemmMicrokernelTester tester{};
    tester.mr(3).nr(4).kr(2).sr(1);
    tester.m(3).n(4).k(depth);
    tester.a_zero_point(0);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
10091
// Full 3x4 tile for k = 1, 10, 19, 28, 37 with b_zero_point set to 0.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD128, no_b_zero_point) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t depth = 1; depth <= 40; depth += 9) {
    GemmMicrokernelTester tester{};
    tester.mr(3).nr(4).kr(2).sr(1);
    tester.m(3).n(4).k(depth);
    tester.b_zero_point(0);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
10107
// Full 3x4 tile for k = 1, 10, 19, 28, 37 with both a_zero_point and
// b_zero_point set to 0.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__SSE2_LD128, no_zero_point) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t depth = 1; depth <= 40; depth += 9) {
    GemmMicrokernelTester tester{};
    tester.mr(3).nr(4).kr(2).sr(1);
    tester.m(3).n(4).k(depth);
    tester.a_zero_point(0).b_zero_point(0);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
10124 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
10125
10126
10127 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Full 1x4 tile at k = 8 with otherwise default tester settings.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, k_eq_8) {
  TEST_REQUIRES_X86_AVX;
  GemmMicrokernelTester tester{};
  tester.mr(1).nr(4).kr(2).sr(1);
  tester.m(1).n(4).k(8);
  tester.Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
10140
// Full 1x4 tile at k = 8 with cn_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, strided_cn) {
  TEST_REQUIRES_X86_AVX;
  GemmMicrokernelTester tester{};
  tester.mr(1).nr(4).kr(2).sr(1);
  tester.m(1).n(4).k(8);
  tester.cn_stride(7);
  tester.Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
10154
// Full 1x4 tile at k = 8 with a_stride set to 11.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, k_eq_8_strided_a) {
  TEST_REQUIRES_X86_AVX;
  GemmMicrokernelTester tester{};
  tester.mr(1).nr(4).kr(2).sr(1);
  tester.m(1).n(4).k(8);
  tester.a_stride(11);
  tester.Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
10168
// k = 8 with n in [1, 4] (outer) and m in [1, 1] (inner; a single value since
// mr is 1); one iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, k_eq_8_subtile) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t cols = 1; cols <= 4; ++cols) {
    for (uint32_t rows = 1; rows <= 1; ++rows) {
      GemmMicrokernelTester tester{};
      tester.mr(1).nr(4).kr(2).sr(1);
      tester.m(rows).n(cols).k(8);
      tester.iterations(1);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
10186
// k = 8, n fixed at 4, m swept over [1, 1] (a single value); one iteration.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, k_eq_8_subtile_m) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t rows = 1; rows <= 1; ++rows) {
    GemmMicrokernelTester tester{};
    tester.mr(1).nr(4).kr(2).sr(1);
    tester.m(rows).n(4).k(8);
    tester.iterations(1);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
10202
// k = 8, m fixed at 1, n swept over [1, 4]; one iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, k_eq_8_subtile_n) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t cols = 1; cols <= 4; ++cols) {
    GemmMicrokernelTester tester{};
    tester.mr(1).nr(4).kr(2).sr(1);
    tester.m(1).n(cols).k(8);
    tester.iterations(1);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
10218
// Full 1x4 tile with every k in [1, 7].
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, k_lt_8) {
  TEST_REQUIRES_X86_AVX;
  for (size_t depth = 1; depth < 8; ++depth) {
    GemmMicrokernelTester tester{};
    tester.mr(1).nr(4).kr(2).sr(1);
    tester.m(1).n(4).k(depth);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
10233
// Full 1x4 tile with every k in [1, 7] and a_stride set to 11.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, k_lt_8_strided_a) {
  TEST_REQUIRES_X86_AVX;
  for (size_t depth = 1; depth < 8; ++depth) {
    GemmMicrokernelTester tester{};
    tester.mr(1).nr(4).kr(2).sr(1);
    tester.m(1).n(4).k(depth);
    tester.a_stride(11);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
10249
// Every k in [1, 7] crossed with n in [1, 4] and m in [1, 1];
// one iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, k_lt_8_subtile) {
  TEST_REQUIRES_X86_AVX;
  for (size_t depth = 1; depth < 8; ++depth) {
    for (uint32_t cols = 1; cols <= 4; ++cols) {
      for (uint32_t rows = 1; rows <= 1; ++rows) {
        GemmMicrokernelTester tester{};
        tester.mr(1).nr(4).kr(2).sr(1);
        tester.m(rows).n(cols).k(depth);
        tester.iterations(1);
        tester.Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
10269
// Full 1x4 tile with every k in [9, 15].
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, k_gt_8) {
  TEST_REQUIRES_X86_AVX;
  for (size_t depth = 9; depth < 16; ++depth) {
    GemmMicrokernelTester tester{};
    tester.mr(1).nr(4).kr(2).sr(1);
    tester.m(1).n(4).k(depth);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
10284
// Full 1x4 tile with every k in [9, 15] and a_stride set to 19.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, k_gt_8_strided_a) {
  TEST_REQUIRES_X86_AVX;
  for (size_t depth = 9; depth < 16; ++depth) {
    GemmMicrokernelTester tester{};
    tester.mr(1).nr(4).kr(2).sr(1);
    tester.m(1).n(4).k(depth);
    tester.a_stride(19);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
10300
// Every k in [9, 15] crossed with n in [1, 4] and m in [1, 1];
// one iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, k_gt_8_subtile) {
  TEST_REQUIRES_X86_AVX;
  for (size_t depth = 9; depth < 16; ++depth) {
    for (uint32_t cols = 1; cols <= 4; ++cols) {
      for (uint32_t rows = 1; rows <= 1; ++rows) {
        GemmMicrokernelTester tester{};
        tester.mr(1).nr(4).kr(2).sr(1);
        tester.m(rows).n(cols).k(depth);
        tester.iterations(1);
        tester.Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
10320
// Full 1x4 tile with k = 16, 24, ..., 80 (multiples of 8).
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, k_div_8) {
  TEST_REQUIRES_X86_AVX;
  for (size_t depth = 16; depth <= 80; depth += 8) {
    GemmMicrokernelTester tester{};
    tester.mr(1).nr(4).kr(2).sr(1);
    tester.m(1).n(4).k(depth);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
10335
// Full 1x4 tile with k = 16, 24, ..., 80 and a_stride set to 83.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, k_div_8_strided_a) {
  TEST_REQUIRES_X86_AVX;
  for (size_t depth = 16; depth <= 80; depth += 8) {
    GemmMicrokernelTester tester{};
    tester.mr(1).nr(4).kr(2).sr(1);
    tester.m(1).n(4).k(depth);
    tester.a_stride(83);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
10351
// k = 16, 24, ..., 80 crossed with n in [1, 4] and m in [1, 1];
// one iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, k_div_8_subtile) {
  TEST_REQUIRES_X86_AVX;
  for (size_t depth = 16; depth <= 80; depth += 8) {
    for (uint32_t cols = 1; cols <= 4; ++cols) {
      for (uint32_t rows = 1; rows <= 1; ++rows) {
        GemmMicrokernelTester tester{};
        tester.mr(1).nr(4).kr(2).sr(1);
        tester.m(rows).n(cols).k(depth);
        tester.iterations(1);
        tester.Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
10371
// m = 1 with n in [5, 7] and k = 1, 10, 19, 28, 37.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, n_gt_4) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t cols = 5; cols < 8; ++cols) {
    for (size_t depth = 1; depth <= 40; depth += 9) {
      GemmMicrokernelTester tester{};
      tester.mr(1).nr(4).kr(2).sr(1);
      tester.m(1).n(cols).k(depth);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
10388
// m = 1 with n in [5, 7], k = 1, 10, 19, 28, 37, and cn_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, n_gt_4_strided_cn) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t cols = 5; cols < 8; ++cols) {
    for (size_t depth = 1; depth <= 40; depth += 9) {
      GemmMicrokernelTester tester{};
      tester.mr(1).nr(4).kr(2).sr(1);
      tester.m(1).n(cols).k(depth);
      tester.cn_stride(7);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
10406
// m = 1 with n in [5, 7], k = 1, 10, 19, 28, 37, and a_stride set to 43.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, n_gt_4_strided_a) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t cols = 5; cols < 8; ++cols) {
    for (size_t depth = 1; depth <= 40; depth += 9) {
      GemmMicrokernelTester tester{};
      tester.mr(1).nr(4).kr(2).sr(1);
      tester.m(1).n(cols).k(depth);
      tester.a_stride(43);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
10424
// n in [5, 7] crossed with k = 1, 10, 19, 28, 37 and m in [1, 1];
// one iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, n_gt_4_subtile) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t cols = 5; cols < 8; ++cols) {
    for (size_t depth = 1; depth <= 40; depth += 9) {
      for (uint32_t rows = 1; rows <= 1; ++rows) {
        GemmMicrokernelTester tester{};
        tester.mr(1).nr(4).kr(2).sr(1);
        tester.m(rows).n(cols).k(depth);
        tester.iterations(1);
        tester.Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
10444
// m = 1 with n = 8 and 12 (multiples of 4) and k = 1, 10, 19, 28, 37.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, n_div_4) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t cols = 8; cols <= 12; cols += 4) {
    for (size_t depth = 1; depth <= 40; depth += 9) {
      GemmMicrokernelTester tester{};
      tester.mr(1).nr(4).kr(2).sr(1);
      tester.m(1).n(cols).k(depth);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
10461
// m = 1 with n = 8 and 12, k = 1, 10, 19, 28, 37, and cn_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, n_div_4_strided_cn) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t cols = 8; cols <= 12; cols += 4) {
    for (size_t depth = 1; depth <= 40; depth += 9) {
      GemmMicrokernelTester tester{};
      tester.mr(1).nr(4).kr(2).sr(1);
      tester.m(1).n(cols).k(depth);
      tester.cn_stride(7);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
10479
// m = 1 with n = 8 and 12, k = 1, 10, 19, 28, 37, and a_stride set to 43.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, n_div_4_strided_a) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t cols = 8; cols <= 12; cols += 4) {
    for (size_t depth = 1; depth <= 40; depth += 9) {
      GemmMicrokernelTester tester{};
      tester.mr(1).nr(4).kr(2).sr(1);
      tester.m(1).n(cols).k(depth);
      tester.a_stride(43);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
10497
// n = 8 and 12 crossed with k = 1, 10, 19, 28, 37 and m in [1, 1];
// one iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, n_div_4_subtile) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t cols = 8; cols <= 12; cols += 4) {
    for (size_t depth = 1; depth <= 40; depth += 9) {
      for (uint32_t rows = 1; rows <= 1; ++rows) {
        GemmMicrokernelTester tester{};
        tester.mr(1).nr(4).kr(2).sr(1);
        tester.m(rows).n(cols).k(depth);
        tester.iterations(1);
        tester.Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
10517
// cm_stride set to 7 for k = 1, 10, 19, 28, 37 crossed with n in [1, 4] and
// m in [1, 1]; one iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, strided_cm_subtile) {
  TEST_REQUIRES_X86_AVX;
  for (size_t depth = 1; depth <= 40; depth += 9) {
    for (uint32_t cols = 1; cols <= 4; ++cols) {
      for (uint32_t rows = 1; rows <= 1; ++rows) {
        GemmMicrokernelTester tester{};
        tester.mr(1).nr(4).kr(2).sr(1);
        tester.m(rows).n(cols).k(depth);
        tester.cm_stride(7);
        tester.iterations(1);
        tester.Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
10538
// Full 1x4 tile at k = 8 with qmin set to 128.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, qmin) {
  TEST_REQUIRES_X86_AVX;
  GemmMicrokernelTester tester{};
  tester.mr(1).nr(4).kr(2).sr(1);
  tester.m(1).n(4).k(8);
  tester.qmin(128);
  tester.Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
10552
// Full 1x4 tile at k = 8 with qmax set to 128.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, qmax) {
  TEST_REQUIRES_X86_AVX;
  GemmMicrokernelTester tester{};
  tester.mr(1).nr(4).kr(2).sr(1);
  tester.m(1).n(4).k(8);
  tester.qmax(128);
  tester.Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
10566
// Full 1x4 tile at k = 8 with cm_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, strided_cm) {
  TEST_REQUIRES_X86_AVX;
  GemmMicrokernelTester tester{};
  tester.mr(1).nr(4).kr(2).sr(1);
  tester.m(1).n(4).k(8);
  tester.cm_stride(7);
  tester.Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
10580
// Full 1x4 tile for k = 1, 10, 19, 28, 37 with a_zero_point set to 0.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, no_a_zero_point) {
  TEST_REQUIRES_X86_AVX;
  for (size_t depth = 1; depth <= 40; depth += 9) {
    GemmMicrokernelTester tester{};
    tester.mr(1).nr(4).kr(2).sr(1);
    tester.m(1).n(4).k(depth);
    tester.a_zero_point(0);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
10596
// Full 1x4 tile for k = 1, 10, 19, 28, 37 with b_zero_point set to 0.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, no_b_zero_point) {
  TEST_REQUIRES_X86_AVX;
  for (size_t depth = 1; depth <= 40; depth += 9) {
    GemmMicrokernelTester tester{};
    tester.mr(1).nr(4).kr(2).sr(1);
    tester.m(1).n(4).k(depth);
    tester.b_zero_point(0);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
10612
// Full 1x4 tile for k = 1, 10, 19, 28, 37 with both a_zero_point and
// b_zero_point set to 0.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__AVX_LD128, no_zero_point) {
  TEST_REQUIRES_X86_AVX;
  for (size_t depth = 1; depth <= 40; depth += 9) {
    GemmMicrokernelTester tester{};
    tester.mr(1).nr(4).kr(2).sr(1);
    tester.m(1).n(4).k(depth);
    tester.a_zero_point(0).b_zero_point(0);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
10629 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
10630
10631
10632 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Full 2x4 tile at k = 8 with otherwise default tester settings.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, k_eq_8) {
  TEST_REQUIRES_X86_AVX;
  GemmMicrokernelTester tester{};
  tester.mr(2).nr(4).kr(2).sr(1);
  tester.m(2).n(4).k(8);
  tester.Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
10645
// Full 2x4 tile at k = 8 with cn_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, strided_cn) {
  TEST_REQUIRES_X86_AVX;
  GemmMicrokernelTester tester{};
  tester.mr(2).nr(4).kr(2).sr(1);
  tester.m(2).n(4).k(8);
  tester.cn_stride(7);
  tester.Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
10659
// Full 2x4 tile at k = 8 with a_stride set to 11.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, k_eq_8_strided_a) {
  TEST_REQUIRES_X86_AVX;
  GemmMicrokernelTester tester{};
  tester.mr(2).nr(4).kr(2).sr(1);
  tester.m(2).n(4).k(8);
  tester.a_stride(11);
  tester.Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
10673
// k = 8 with every output sub-tile: n in [1, 4] (outer) and m in [1, 2] (inner),
// one iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, k_eq_8_subtile) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t cols = 1; cols <= 4; ++cols) {
    for (uint32_t rows = 1; rows <= 2; ++rows) {
      GemmMicrokernelTester tester{};
      tester.mr(2).nr(4).kr(2).sr(1);
      tester.m(rows).n(cols).k(8);
      tester.iterations(1);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
10691
// k = 8, n fixed at 4, m swept over [1, 2]; one iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, k_eq_8_subtile_m) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t rows = 1; rows <= 2; ++rows) {
    GemmMicrokernelTester tester{};
    tester.mr(2).nr(4).kr(2).sr(1);
    tester.m(rows).n(4).k(8);
    tester.iterations(1);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
10707
// k = 8, m fixed at 2, n swept over [1, 4]; one iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, k_eq_8_subtile_n) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t cols = 1; cols <= 4; ++cols) {
    GemmMicrokernelTester tester{};
    tester.mr(2).nr(4).kr(2).sr(1);
    tester.m(2).n(cols).k(8);
    tester.iterations(1);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
10723
// Full 2x4 tile with every k in [1, 7].
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, k_lt_8) {
  TEST_REQUIRES_X86_AVX;
  for (size_t depth = 1; depth < 8; ++depth) {
    GemmMicrokernelTester tester{};
    tester.mr(2).nr(4).kr(2).sr(1);
    tester.m(2).n(4).k(depth);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
10738
// Full 2x4 tile with every k in [1, 7] and a_stride set to 11.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, k_lt_8_strided_a) {
  TEST_REQUIRES_X86_AVX;
  for (size_t depth = 1; depth < 8; ++depth) {
    GemmMicrokernelTester tester{};
    tester.mr(2).nr(4).kr(2).sr(1);
    tester.m(2).n(4).k(depth);
    tester.a_stride(11);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
10754
// Every k in [1, 7] crossed with every sub-tile (n in [1, 4], m in [1, 2]);
// one iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, k_lt_8_subtile) {
  TEST_REQUIRES_X86_AVX;
  for (size_t depth = 1; depth < 8; ++depth) {
    for (uint32_t cols = 1; cols <= 4; ++cols) {
      for (uint32_t rows = 1; rows <= 2; ++rows) {
        GemmMicrokernelTester tester{};
        tester.mr(2).nr(4).kr(2).sr(1);
        tester.m(rows).n(cols).k(depth);
        tester.iterations(1);
        tester.Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
10774
// Full 2x4 tile with every k in [9, 15].
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, k_gt_8) {
  TEST_REQUIRES_X86_AVX;
  for (size_t depth = 9; depth < 16; ++depth) {
    GemmMicrokernelTester tester{};
    tester.mr(2).nr(4).kr(2).sr(1);
    tester.m(2).n(4).k(depth);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
10789
// Sweeps k in [9, 15] on an m=2, n=4 problem with a_stride set to 19.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, k_gt_8_strided_a) {
  TEST_REQUIRES_X86_AVX;
  for (size_t kc = 9; kc < 16; ++kc) {
    GemmMicrokernelTester()
      .mr(2).nr(4).kr(2).sr(1)
      .m(2).n(4).k(kc)
      .a_stride(19)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
10805
// For each k in [9, 15], covers every (n, m) pair with n in [1, 4] and m in [1, 2],
// using iterations(1) per configuration.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, k_gt_8_subtile) {
  TEST_REQUIRES_X86_AVX;
  for (size_t kc = 9; kc < 16; ++kc) {
    for (uint32_t nc = 1; nc <= 4; ++nc) {
      for (uint32_t mc = 1; mc <= 2; ++mc) {
        GemmMicrokernelTester()
          .mr(2).nr(4).kr(2).sr(1)
          .m(mc).n(nc).k(kc)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
10825
// Runs an m=2, n=4 problem for k = 16, 24, ..., 80 (multiples of 8).
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, k_div_8) {
  TEST_REQUIRES_X86_AVX;
  for (size_t kc = 16; kc <= 80; kc += 8) {
    GemmMicrokernelTester()
      .mr(2).nr(4).kr(2).sr(1)
      .m(2).n(4).k(kc)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
10840
// Runs an m=2, n=4 problem for k = 16, 24, ..., 80 with a_stride set to 83.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, k_div_8_strided_a) {
  TEST_REQUIRES_X86_AVX;
  for (size_t kc = 16; kc <= 80; kc += 8) {
    GemmMicrokernelTester()
      .mr(2).nr(4).kr(2).sr(1)
      .m(2).n(4).k(kc)
      .a_stride(83)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
10856
// For k = 16, 24, ..., 80, covers every (n, m) pair with n in [1, 4] and m in [1, 2],
// using iterations(1) per configuration.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, k_div_8_subtile) {
  TEST_REQUIRES_X86_AVX;
  for (size_t kc = 16; kc <= 80; kc += 8) {
    for (uint32_t nc = 1; nc <= 4; ++nc) {
      for (uint32_t mc = 1; mc <= 2; ++mc) {
        GemmMicrokernelTester()
          .mr(2).nr(4).kr(2).sr(1)
          .m(mc).n(nc).k(kc)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
10876
// For n in [5, 7] and k = 1, 10, 19, 28, 37, runs the ukernel with m=2.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, n_gt_4) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t nc = 5; nc < 8; ++nc) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      GemmMicrokernelTester()
        .mr(2).nr(4).kr(2).sr(1)
        .m(2).n(nc).k(kc)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
10893
// For n in [5, 7] and k = 1, 10, 19, 28, 37, runs the ukernel with m=2 and cn_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, n_gt_4_strided_cn) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t nc = 5; nc < 8; ++nc) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      GemmMicrokernelTester()
        .mr(2).nr(4).kr(2).sr(1)
        .m(2).n(nc).k(kc)
        .cn_stride(7)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
10911
// For n in [5, 7] and k = 1, 10, 19, 28, 37, runs the ukernel with m=2 and a_stride set to 43.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, n_gt_4_strided_a) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t nc = 5; nc < 8; ++nc) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      GemmMicrokernelTester()
        .mr(2).nr(4).kr(2).sr(1)
        .m(2).n(nc).k(kc)
        .a_stride(43)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
10929
// For n in [5, 7] and k = 1, 10, 19, 28, 37, covers each m in [1, 2] with iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, n_gt_4_subtile) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t nc = 5; nc < 8; ++nc) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      for (uint32_t mc = 1; mc <= 2; ++mc) {
        GemmMicrokernelTester()
          .mr(2).nr(4).kr(2).sr(1)
          .m(mc).n(nc).k(kc)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
10949
// For n = 8, 12 and k = 1, 10, 19, 28, 37, runs the ukernel with m=2.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, n_div_4) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t nc = 8; nc <= 12; nc += 4) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      GemmMicrokernelTester()
        .mr(2).nr(4).kr(2).sr(1)
        .m(2).n(nc).k(kc)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
10966
// For n = 8, 12 and k = 1, 10, 19, 28, 37, runs the ukernel with m=2 and cn_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, n_div_4_strided_cn) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t nc = 8; nc <= 12; nc += 4) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      GemmMicrokernelTester()
        .mr(2).nr(4).kr(2).sr(1)
        .m(2).n(nc).k(kc)
        .cn_stride(7)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
10984
// For n = 8, 12 and k = 1, 10, 19, 28, 37, runs the ukernel with m=2 and a_stride set to 43.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, n_div_4_strided_a) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t nc = 8; nc <= 12; nc += 4) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      GemmMicrokernelTester()
        .mr(2).nr(4).kr(2).sr(1)
        .m(2).n(nc).k(kc)
        .a_stride(43)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
11002
// For n = 8, 12 and k = 1, 10, 19, 28, 37, covers each m in [1, 2] with iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, n_div_4_subtile) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t nc = 8; nc <= 12; nc += 4) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      for (uint32_t mc = 1; mc <= 2; ++mc) {
        GemmMicrokernelTester()
          .mr(2).nr(4).kr(2).sr(1)
          .m(mc).n(nc).k(kc)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
11022
// For k = 1, 10, 19, 28, 37, covers every (n, m) pair with n in [1, 4] and m in [1, 2],
// with cm_stride set to 7 and iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, strided_cm_subtile) {
  TEST_REQUIRES_X86_AVX;
  for (size_t kc = 1; kc <= 40; kc += 9) {
    for (uint32_t nc = 1; nc <= 4; ++nc) {
      for (uint32_t mc = 1; mc <= 2; ++mc) {
        GemmMicrokernelTester()
          .mr(2).nr(4).kr(2).sr(1)
          .m(mc).n(nc).k(kc)
          .cm_stride(7)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
11043
// Single m=2, n=4, k=8 run with qmin set to 128.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, qmin) {
  TEST_REQUIRES_X86_AVX;
  GemmMicrokernelTester()
    .mr(2).nr(4).kr(2).sr(1)
    .m(2).n(4).k(8)
    .qmin(128)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
11057
// Single m=2, n=4, k=8 run with qmax set to 128.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, qmax) {
  TEST_REQUIRES_X86_AVX;
  GemmMicrokernelTester()
    .mr(2).nr(4).kr(2).sr(1)
    .m(2).n(4).k(8)
    .qmax(128)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
11071
// Single m=2, n=4, k=8 run with cm_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, strided_cm) {
  TEST_REQUIRES_X86_AVX;
  GemmMicrokernelTester()
    .mr(2).nr(4).kr(2).sr(1)
    .m(2).n(4).k(8)
    .cm_stride(7)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
11085
// For k = 1, 10, 19, 28, 37, runs an m=2, n=4 problem with a_zero_point set to 0.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, no_a_zero_point) {
  TEST_REQUIRES_X86_AVX;
  for (size_t kc = 1; kc <= 40; kc += 9) {
    GemmMicrokernelTester()
      .mr(2).nr(4).kr(2).sr(1)
      .m(2).n(4).k(kc)
      .a_zero_point(0)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
11101
// For k = 1, 10, 19, 28, 37, runs an m=2, n=4 problem with b_zero_point set to 0.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, no_b_zero_point) {
  TEST_REQUIRES_X86_AVX;
  for (size_t kc = 1; kc <= 40; kc += 9) {
    GemmMicrokernelTester()
      .mr(2).nr(4).kr(2).sr(1)
      .m(2).n(4).k(kc)
      .b_zero_point(0)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
11117
// For k = 1, 10, 19, 28, 37, runs an m=2, n=4 problem with both a_zero_point and
// b_zero_point set to 0.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__AVX_LD128, no_zero_point) {
  TEST_REQUIRES_X86_AVX;
  for (size_t kc = 1; kc <= 40; kc += 9) {
    GemmMicrokernelTester()
      .mr(2).nr(4).kr(2).sr(1)
      .m(2).n(4).k(kc)
      .a_zero_point(0)
      .b_zero_point(0)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
11134 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
11135
11136
11137 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Single run of the 3x4c2 XOP LD128 ukernel with m=3, n=4, k=8.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, k_eq_8) {
  TEST_REQUIRES_X86_XOP;
  GemmMicrokernelTester()
    .mr(3).nr(4).kr(2).sr(1)
    .m(3).n(4).k(8)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
11150
// Single m=3, n=4, k=8 run with cn_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, strided_cn) {
  TEST_REQUIRES_X86_XOP;
  GemmMicrokernelTester()
    .mr(3).nr(4).kr(2).sr(1)
    .m(3).n(4).k(8)
    .cn_stride(7)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
11164
// Single m=3, n=4, k=8 run with a_stride set to 11.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, k_eq_8_strided_a) {
  TEST_REQUIRES_X86_XOP;
  GemmMicrokernelTester()
    .mr(3).nr(4).kr(2).sr(1)
    .m(3).n(4).k(8)
    .a_stride(11)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
11178
// With k=8, covers every (n, m) pair with n in [1, 4] and m in [1, 3], using iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, k_eq_8_subtile) {
  TEST_REQUIRES_X86_XOP;
  for (uint32_t nc = 1; nc <= 4; ++nc) {
    for (uint32_t mc = 1; mc <= 3; ++mc) {
      GemmMicrokernelTester()
        .mr(3).nr(4).kr(2).sr(1)
        .m(mc).n(nc).k(8)
        .iterations(1)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
11196
// With n=4 and k=8, covers each m in [1, 3], using iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, k_eq_8_subtile_m) {
  TEST_REQUIRES_X86_XOP;
  for (uint32_t mc = 1; mc <= 3; ++mc) {
    GemmMicrokernelTester()
      .mr(3).nr(4).kr(2).sr(1)
      .m(mc).n(4).k(8)
      .iterations(1)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
11212
// With m=3 and k=8, covers each n in [1, 4], using iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, k_eq_8_subtile_n) {
  TEST_REQUIRES_X86_XOP;
  for (uint32_t nc = 1; nc <= 4; ++nc) {
    GemmMicrokernelTester()
      .mr(3).nr(4).kr(2).sr(1)
      .m(3).n(nc).k(8)
      .iterations(1)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
11228
// Runs the 3x4c2 XOP LD128 ukernel on an m=3, n=4 problem for each k in [1, 7].
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, k_lt_8) {
  TEST_REQUIRES_X86_XOP;
  for (size_t kc = 1; kc < 8; ++kc) {
    GemmMicrokernelTester()
      .mr(3).nr(4).kr(2).sr(1)
      .m(3).n(4).k(kc)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
11243
// Sweeps k in [1, 7] on an m=3, n=4 problem with a_stride set to 11.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, k_lt_8_strided_a) {
  TEST_REQUIRES_X86_XOP;
  for (size_t kc = 1; kc < 8; ++kc) {
    GemmMicrokernelTester()
      .mr(3).nr(4).kr(2).sr(1)
      .m(3).n(4).k(kc)
      .a_stride(11)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
11259
// For each k in [1, 7], covers every (n, m) pair with n in [1, 4] and m in [1, 3],
// using iterations(1) per configuration.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, k_lt_8_subtile) {
  TEST_REQUIRES_X86_XOP;
  for (size_t kc = 1; kc < 8; ++kc) {
    for (uint32_t nc = 1; nc <= 4; ++nc) {
      for (uint32_t mc = 1; mc <= 3; ++mc) {
        GemmMicrokernelTester()
          .mr(3).nr(4).kr(2).sr(1)
          .m(mc).n(nc).k(kc)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
11279
// Runs the 3x4c2 XOP LD128 ukernel on an m=3, n=4 problem for each k in [9, 15].
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, k_gt_8) {
  TEST_REQUIRES_X86_XOP;
  for (size_t kc = 9; kc < 16; ++kc) {
    GemmMicrokernelTester()
      .mr(3).nr(4).kr(2).sr(1)
      .m(3).n(4).k(kc)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
11294
// Sweeps k in [9, 15] on an m=3, n=4 problem with a_stride set to 19.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, k_gt_8_strided_a) {
  TEST_REQUIRES_X86_XOP;
  for (size_t kc = 9; kc < 16; ++kc) {
    GemmMicrokernelTester()
      .mr(3).nr(4).kr(2).sr(1)
      .m(3).n(4).k(kc)
      .a_stride(19)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
11310
// For each k in [9, 15], covers every (n, m) pair with n in [1, 4] and m in [1, 3],
// using iterations(1) per configuration.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, k_gt_8_subtile) {
  TEST_REQUIRES_X86_XOP;
  for (size_t kc = 9; kc < 16; ++kc) {
    for (uint32_t nc = 1; nc <= 4; ++nc) {
      for (uint32_t mc = 1; mc <= 3; ++mc) {
        GemmMicrokernelTester()
          .mr(3).nr(4).kr(2).sr(1)
          .m(mc).n(nc).k(kc)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
11330
// Runs an m=3, n=4 problem for k = 16, 24, ..., 80 (multiples of 8).
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, k_div_8) {
  TEST_REQUIRES_X86_XOP;
  for (size_t kc = 16; kc <= 80; kc += 8) {
    GemmMicrokernelTester()
      .mr(3).nr(4).kr(2).sr(1)
      .m(3).n(4).k(kc)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
11345
// Runs an m=3, n=4 problem for k = 16, 24, ..., 80 with a_stride set to 83.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, k_div_8_strided_a) {
  TEST_REQUIRES_X86_XOP;
  for (size_t kc = 16; kc <= 80; kc += 8) {
    GemmMicrokernelTester()
      .mr(3).nr(4).kr(2).sr(1)
      .m(3).n(4).k(kc)
      .a_stride(83)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
11361
// For k = 16, 24, ..., 80, covers every (n, m) pair with n in [1, 4] and m in [1, 3],
// using iterations(1) per configuration.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, k_div_8_subtile) {
  TEST_REQUIRES_X86_XOP;
  for (size_t kc = 16; kc <= 80; kc += 8) {
    for (uint32_t nc = 1; nc <= 4; ++nc) {
      for (uint32_t mc = 1; mc <= 3; ++mc) {
        GemmMicrokernelTester()
          .mr(3).nr(4).kr(2).sr(1)
          .m(mc).n(nc).k(kc)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
11381
// For n in [5, 7] and k = 1, 10, 19, 28, 37, runs the ukernel with m=3.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, n_gt_4) {
  TEST_REQUIRES_X86_XOP;
  for (uint32_t nc = 5; nc < 8; ++nc) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      GemmMicrokernelTester()
        .mr(3).nr(4).kr(2).sr(1)
        .m(3).n(nc).k(kc)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
11398
// For n in [5, 7] and k = 1, 10, 19, 28, 37, runs the ukernel with m=3 and cn_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, n_gt_4_strided_cn) {
  TEST_REQUIRES_X86_XOP;
  for (uint32_t nc = 5; nc < 8; ++nc) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      GemmMicrokernelTester()
        .mr(3).nr(4).kr(2).sr(1)
        .m(3).n(nc).k(kc)
        .cn_stride(7)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
11416
// For n in [5, 7] and k = 1, 10, 19, 28, 37, runs the ukernel with m=3 and a_stride set to 43.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, n_gt_4_strided_a) {
  TEST_REQUIRES_X86_XOP;
  for (uint32_t nc = 5; nc < 8; ++nc) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      GemmMicrokernelTester()
        .mr(3).nr(4).kr(2).sr(1)
        .m(3).n(nc).k(kc)
        .a_stride(43)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
11434
// For n in [5, 7] and k = 1, 10, 19, 28, 37, covers each m in [1, 3] with iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, n_gt_4_subtile) {
  TEST_REQUIRES_X86_XOP;
  for (uint32_t nc = 5; nc < 8; ++nc) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      for (uint32_t mc = 1; mc <= 3; ++mc) {
        GemmMicrokernelTester()
          .mr(3).nr(4).kr(2).sr(1)
          .m(mc).n(nc).k(kc)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
11454
// For n = 8, 12 and k = 1, 10, 19, 28, 37, runs the ukernel with m=3.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, n_div_4) {
  TEST_REQUIRES_X86_XOP;
  for (uint32_t nc = 8; nc <= 12; nc += 4) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      GemmMicrokernelTester()
        .mr(3).nr(4).kr(2).sr(1)
        .m(3).n(nc).k(kc)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
11471
// For n = 8, 12 and k = 1, 10, 19, 28, 37, runs the ukernel with m=3 and cn_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, n_div_4_strided_cn) {
  TEST_REQUIRES_X86_XOP;
  for (uint32_t nc = 8; nc <= 12; nc += 4) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      GemmMicrokernelTester()
        .mr(3).nr(4).kr(2).sr(1)
        .m(3).n(nc).k(kc)
        .cn_stride(7)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
11489
// For n = 8, 12 and k = 1, 10, 19, 28, 37, runs the ukernel with m=3 and a_stride set to 43.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, n_div_4_strided_a) {
  TEST_REQUIRES_X86_XOP;
  for (uint32_t nc = 8; nc <= 12; nc += 4) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      GemmMicrokernelTester()
        .mr(3).nr(4).kr(2).sr(1)
        .m(3).n(nc).k(kc)
        .a_stride(43)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
11507
// For n = 8, 12 and k = 1, 10, 19, 28, 37, covers each m in [1, 3] with iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, n_div_4_subtile) {
  TEST_REQUIRES_X86_XOP;
  for (uint32_t nc = 8; nc <= 12; nc += 4) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      for (uint32_t mc = 1; mc <= 3; ++mc) {
        GemmMicrokernelTester()
          .mr(3).nr(4).kr(2).sr(1)
          .m(mc).n(nc).k(kc)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
11527
// For k = 1, 10, 19, 28, 37, covers every (n, m) pair with n in [1, 4] and m in [1, 3],
// with cm_stride set to 7 and iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, strided_cm_subtile) {
  TEST_REQUIRES_X86_XOP;
  for (size_t kc = 1; kc <= 40; kc += 9) {
    for (uint32_t nc = 1; nc <= 4; ++nc) {
      for (uint32_t mc = 1; mc <= 3; ++mc) {
        GemmMicrokernelTester()
          .mr(3).nr(4).kr(2).sr(1)
          .m(mc).n(nc).k(kc)
          .cm_stride(7)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
11548
// Single m=3, n=4, k=8 run with qmin set to 128.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, qmin) {
  TEST_REQUIRES_X86_XOP;
  GemmMicrokernelTester()
    .mr(3).nr(4).kr(2).sr(1)
    .m(3).n(4).k(8)
    .qmin(128)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
11562
// Single m=3, n=4, k=8 run with qmax set to 128.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, qmax) {
  TEST_REQUIRES_X86_XOP;
  GemmMicrokernelTester()
    .mr(3).nr(4).kr(2).sr(1)
    .m(3).n(4).k(8)
    .qmax(128)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
11576
// Single m=3, n=4, k=8 run with cm_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, strided_cm) {
  TEST_REQUIRES_X86_XOP;
  GemmMicrokernelTester()
    .mr(3).nr(4).kr(2).sr(1)
    .m(3).n(4).k(8)
    .cm_stride(7)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
11590
// For k = 1, 10, 19, 28, 37, runs an m=3, n=4 problem with a_zero_point set to 0.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, no_a_zero_point) {
  TEST_REQUIRES_X86_XOP;
  for (size_t kc = 1; kc <= 40; kc += 9) {
    GemmMicrokernelTester()
      .mr(3).nr(4).kr(2).sr(1)
      .m(3).n(4).k(kc)
      .a_zero_point(0)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
11606
// For k = 1, 10, 19, 28, 37, runs an m=3, n=4 problem with b_zero_point set to 0.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, no_b_zero_point) {
  TEST_REQUIRES_X86_XOP;
  for (size_t kc = 1; kc <= 40; kc += 9) {
    GemmMicrokernelTester()
      .mr(3).nr(4).kr(2).sr(1)
      .m(3).n(4).k(kc)
      .b_zero_point(0)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
11622
// For k = 1, 10, 19, 28, 37, runs an m=3, n=4 problem with both a_zero_point and
// b_zero_point set to 0.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__XOP_LD128, no_zero_point) {
  TEST_REQUIRES_X86_XOP;
  for (size_t kc = 1; kc <= 40; kc += 9) {
    GemmMicrokernelTester()
      .mr(3).nr(4).kr(2).sr(1)
      .m(3).n(4).k(kc)
      .a_zero_point(0)
      .b_zero_point(0)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
11639 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
11640
11641
11642 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Single run of the 4x4c2 AVX LD128 ukernel with m=4, n=4, k=8.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, k_eq_8) {
  TEST_REQUIRES_X86_AVX;
  GemmMicrokernelTester()
    .mr(4).nr(4).kr(2).sr(1)
    .m(4).n(4).k(8)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
11655
// Single m=4, n=4, k=8 run with cn_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, strided_cn) {
  TEST_REQUIRES_X86_AVX;
  GemmMicrokernelTester()
    .mr(4).nr(4).kr(2).sr(1)
    .m(4).n(4).k(8)
    .cn_stride(7)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
11669
// Single m=4, n=4, k=8 run with a_stride set to 11.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, k_eq_8_strided_a) {
  TEST_REQUIRES_X86_AVX;
  GemmMicrokernelTester()
    .mr(4).nr(4).kr(2).sr(1)
    .m(4).n(4).k(8)
    .a_stride(11)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
11683
// With k=8, covers every (n, m) pair with n in [1, 4] and m in [1, 4], using iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, k_eq_8_subtile) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t nc = 1; nc <= 4; ++nc) {
    for (uint32_t mc = 1; mc <= 4; ++mc) {
      GemmMicrokernelTester()
        .mr(4).nr(4).kr(2).sr(1)
        .m(mc).n(nc).k(8)
        .iterations(1)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
11701
// With n=4 and k=8, covers each m in [1, 4], using iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, k_eq_8_subtile_m) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t mc = 1; mc <= 4; ++mc) {
    GemmMicrokernelTester()
      .mr(4).nr(4).kr(2).sr(1)
      .m(mc).n(4).k(8)
      .iterations(1)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
11717
// With m=4 and k=8, covers each n in [1, 4], using iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, k_eq_8_subtile_n) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t nc = 1; nc <= 4; ++nc) {
    GemmMicrokernelTester()
      .mr(4).nr(4).kr(2).sr(1)
      .m(4).n(nc).k(8)
      .iterations(1)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
11733
// Runs the 4x4c2 AVX LD128 ukernel on an m=4, n=4 problem for each k in [1, 7].
TEST(QU8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, k_lt_8) {
  TEST_REQUIRES_X86_AVX;
  for (size_t kc = 1; kc < 8; ++kc) {
    GemmMicrokernelTester()
      .mr(4).nr(4).kr(2).sr(1)
      .m(4).n(4).k(kc)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
11748
// Sweeps k in [1, 7] on an m=4, n=4 problem with a_stride set to 11.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, k_lt_8_strided_a) {
  TEST_REQUIRES_X86_AVX;
  for (size_t kc = 1; kc < 8; ++kc) {
    GemmMicrokernelTester()
      .mr(4).nr(4).kr(2).sr(1)
      .m(4).n(4).k(kc)
      .a_stride(11)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
11764
// For each k in [1, 7], covers every (n, m) pair with n in [1, 4] and m in [1, 4],
// using iterations(1) per configuration.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, k_lt_8_subtile) {
  TEST_REQUIRES_X86_AVX;
  for (size_t kc = 1; kc < 8; ++kc) {
    for (uint32_t nc = 1; nc <= 4; ++nc) {
      for (uint32_t mc = 1; mc <= 4; ++mc) {
        GemmMicrokernelTester()
          .mr(4).nr(4).kr(2).sr(1)
          .m(mc).n(nc).k(kc)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
11784
// Runs the 4x4c2 AVX LD128 ukernel on an m=4, n=4 problem for each k in [9, 15].
TEST(QU8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, k_gt_8) {
  TEST_REQUIRES_X86_AVX;
  for (size_t kc = 9; kc < 16; ++kc) {
    GemmMicrokernelTester()
      .mr(4).nr(4).kr(2).sr(1)
      .m(4).n(4).k(kc)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
11799
// Sweeps k in [9, 15] on an m=4, n=4 problem with a_stride set to 19.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, k_gt_8_strided_a) {
  TEST_REQUIRES_X86_AVX;
  for (size_t kc = 9; kc < 16; ++kc) {
    GemmMicrokernelTester()
      .mr(4).nr(4).kr(2).sr(1)
      .m(4).n(4).k(kc)
      .a_stride(19)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
11815
// For each k in [9, 15], covers every (n, m) pair with n in [1, 4] and m in [1, 4],
// using iterations(1) per configuration.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, k_gt_8_subtile) {
  TEST_REQUIRES_X86_AVX;
  for (size_t kc = 9; kc < 16; ++kc) {
    for (uint32_t nc = 1; nc <= 4; ++nc) {
      for (uint32_t mc = 1; mc <= 4; ++mc) {
        GemmMicrokernelTester()
          .mr(4).nr(4).kr(2).sr(1)
          .m(mc).n(nc).k(kc)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
11835
// Runs an m=4, n=4 problem for k = 16, 24, ..., 80 (multiples of 8).
TEST(QU8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, k_div_8) {
  TEST_REQUIRES_X86_AVX;
  for (size_t kc = 16; kc <= 80; kc += 8) {
    GemmMicrokernelTester()
      .mr(4).nr(4).kr(2).sr(1)
      .m(4).n(4).k(kc)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
11850
// k_div_8_strided_a: k = 16, 24, ..., 80 with m = n = 4 and a_stride set to 83.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, k_div_8_strided_a) {
  TEST_REQUIRES_X86_AVX;
  for (size_t kc = 16; kc <= 80; kc += 8) {
    GemmMicrokernelTester()
        .mr(4).nr(4).kr(2).sr(1)
        .m(4).n(4).k(kc)
        .a_stride(83)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
  }
}
11866
// k_div_8_subtile: k = 16, 24, ..., 80 crossed with n in 1..4 and m in 1..4,
// one iteration per combination.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, k_div_8_subtile) {
  TEST_REQUIRES_X86_AVX;
  for (size_t kc = 16; kc <= 80; kc += 8) {
    for (uint32_t nc = 1; nc <= 4; ++nc) {
      for (uint32_t mc = 1; mc <= 4; ++mc) {
        GemmMicrokernelTester()
            .mr(4).nr(4).kr(2).sr(1)
            .m(mc).n(nc).k(kc)
            .iterations(1)
            .Test(
                xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128,
                xnn_init_qu8_conv_minmax_fp32_sse2_params,
                xnn_qu8_requantize_fp32);
      }
    }
  }
}
11886
// n_gt_4: n in 5..7 with m = 4, each against k = 1, 10, 19, 28, 37.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, n_gt_4) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t nc = 5; nc < 8; ++nc) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      GemmMicrokernelTester()
          .mr(4).nr(4).kr(2).sr(1)
          .m(4).n(nc).k(kc)
          .Test(
              xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128,
              xnn_init_qu8_conv_minmax_fp32_sse2_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
11903
// n_gt_4_strided_cn: n in 5..7, k = 1, 10, ..., 37, m = 4, cn_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, n_gt_4_strided_cn) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t nc = 5; nc < 8; ++nc) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      GemmMicrokernelTester()
          .mr(4).nr(4).kr(2).sr(1)
          .m(4).n(nc).k(kc)
          .cn_stride(7)
          .Test(
              xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128,
              xnn_init_qu8_conv_minmax_fp32_sse2_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
11921
// n_gt_4_strided_a: n in 5..7, k = 1, 10, ..., 37, m = 4, a_stride set to 43.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, n_gt_4_strided_a) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t nc = 5; nc < 8; ++nc) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      GemmMicrokernelTester()
          .mr(4).nr(4).kr(2).sr(1)
          .m(4).n(nc).k(kc)
          .a_stride(43)
          .Test(
              xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128,
              xnn_init_qu8_conv_minmax_fp32_sse2_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
11939
// n_gt_4_subtile: n in 5..7, k = 1, 10, ..., 37, and every m in 1..4,
// one iteration per combination.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, n_gt_4_subtile) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t nc = 5; nc < 8; ++nc) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      for (uint32_t mc = 1; mc <= 4; ++mc) {
        GemmMicrokernelTester()
            .mr(4).nr(4).kr(2).sr(1)
            .m(mc).n(nc).k(kc)
            .iterations(1)
            .Test(
                xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128,
                xnn_init_qu8_conv_minmax_fp32_sse2_params,
                xnn_qu8_requantize_fp32);
      }
    }
  }
}
11959
// n_div_4: n = 8 and 12 with m = 4, each against k = 1, 10, 19, 28, 37.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, n_div_4) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t nc = 8; nc <= 12; nc += 4) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      GemmMicrokernelTester()
          .mr(4).nr(4).kr(2).sr(1)
          .m(4).n(nc).k(kc)
          .Test(
              xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128,
              xnn_init_qu8_conv_minmax_fp32_sse2_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
11976
// n_div_4_strided_cn: n = 8 and 12, k = 1, 10, ..., 37, m = 4, cn_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, n_div_4_strided_cn) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t nc = 8; nc <= 12; nc += 4) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      GemmMicrokernelTester()
          .mr(4).nr(4).kr(2).sr(1)
          .m(4).n(nc).k(kc)
          .cn_stride(7)
          .Test(
              xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128,
              xnn_init_qu8_conv_minmax_fp32_sse2_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
11994
// n_div_4_strided_a: n = 8 and 12, k = 1, 10, ..., 37, m = 4, a_stride set to 43.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, n_div_4_strided_a) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t nc = 8; nc <= 12; nc += 4) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      GemmMicrokernelTester()
          .mr(4).nr(4).kr(2).sr(1)
          .m(4).n(nc).k(kc)
          .a_stride(43)
          .Test(
              xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128,
              xnn_init_qu8_conv_minmax_fp32_sse2_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
12012
// n_div_4_subtile: n = 8 and 12, k = 1, 10, ..., 37, and every m in 1..4,
// one iteration per combination.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, n_div_4_subtile) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t nc = 8; nc <= 12; nc += 4) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      for (uint32_t mc = 1; mc <= 4; ++mc) {
        GemmMicrokernelTester()
            .mr(4).nr(4).kr(2).sr(1)
            .m(mc).n(nc).k(kc)
            .iterations(1)
            .Test(
                xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128,
                xnn_init_qu8_conv_minmax_fp32_sse2_params,
                xnn_qu8_requantize_fp32);
      }
    }
  }
}
12032
// strided_cm_subtile: k = 1, 10, ..., 37 crossed with n in 1..4 and m in 1..4,
// cm_stride set to 7, one iteration per combination.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, strided_cm_subtile) {
  TEST_REQUIRES_X86_AVX;
  for (size_t kc = 1; kc <= 40; kc += 9) {
    for (uint32_t nc = 1; nc <= 4; ++nc) {
      for (uint32_t mc = 1; mc <= 4; ++mc) {
        GemmMicrokernelTester()
            .mr(4).nr(4).kr(2).sr(1)
            .m(mc).n(nc).k(kc)
            .cm_stride(7)
            .iterations(1)
            .Test(
                xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128,
                xnn_init_qu8_conv_minmax_fp32_sse2_params,
                xnn_qu8_requantize_fp32);
      }
    }
  }
}
12053
// qmin: single m = n = 4, k = 8 run with qmin set to 128.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, qmin) {
  TEST_REQUIRES_X86_AVX;
  GemmMicrokernelTester()
      .mr(4).nr(4).kr(2).sr(1)
      .m(4).n(4).k(8)
      .qmin(128)
      .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
}
12067
// qmax: single m = n = 4, k = 8 run with qmax set to 128.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, qmax) {
  TEST_REQUIRES_X86_AVX;
  GemmMicrokernelTester()
      .mr(4).nr(4).kr(2).sr(1)
      .m(4).n(4).k(8)
      .qmax(128)
      .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
}
12081
// strided_cm: single m = n = 4, k = 8 run with cm_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, strided_cm) {
  TEST_REQUIRES_X86_AVX;
  GemmMicrokernelTester()
      .mr(4).nr(4).kr(2).sr(1)
      .m(4).n(4).k(8)
      .cm_stride(7)
      .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
}
12095
// no_a_zero_point: k = 1, 10, ..., 37 with m = n = 4 and a_zero_point set to 0.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, no_a_zero_point) {
  TEST_REQUIRES_X86_AVX;
  for (size_t kc = 1; kc <= 40; kc += 9) {
    GemmMicrokernelTester()
        .mr(4).nr(4).kr(2).sr(1)
        .m(4).n(4).k(kc)
        .a_zero_point(0)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
  }
}
12111
// no_b_zero_point: k = 1, 10, ..., 37 with m = n = 4 and b_zero_point set to 0.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, no_b_zero_point) {
  TEST_REQUIRES_X86_AVX;
  for (size_t kc = 1; kc <= 40; kc += 9) {
    GemmMicrokernelTester()
        .mr(4).nr(4).kr(2).sr(1)
        .m(4).n(4).k(kc)
        .b_zero_point(0)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
  }
}
12127
// no_zero_point: k = 1, 10, ..., 37 with m = n = 4 and both a_zero_point and
// b_zero_point set to 0.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2__AVX_LD128, no_zero_point) {
  TEST_REQUIRES_X86_AVX;
  for (size_t kc = 1; kc <= 40; kc += 9) {
    GemmMicrokernelTester()
        .mr(4).nr(4).kr(2).sr(1)
        .m(4).n(4).k(kc)
        .a_zero_point(0)
        .b_zero_point(0)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
  }
}
12144 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
12145
12146
12147 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// k_eq_8: single run of the 1x4c2s4 SSE2 LD64 kernel with m = 1, n = 4, k = 8.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__SSE2_LD64, k_eq_8) {
  TEST_REQUIRES_X86_SSE2;
  GemmMicrokernelTester()
      .mr(1).nr(4).kr(2).sr(4)
      .m(1).n(4).k(8)
      .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__sse2_ld64,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
}
12160
// strided_cn: single m = 1, n = 4, k = 8 run with cn_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__SSE2_LD64, strided_cn) {
  TEST_REQUIRES_X86_SSE2;
  GemmMicrokernelTester()
      .mr(1).nr(4).kr(2).sr(4)
      .m(1).n(4).k(8)
      .cn_stride(7)
      .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__sse2_ld64,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
}
12174
// k_eq_8_strided_a: single m = 1, n = 4, k = 8 run with a_stride set to 11.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__SSE2_LD64, k_eq_8_strided_a) {
  TEST_REQUIRES_X86_SSE2;
  GemmMicrokernelTester()
      .mr(1).nr(4).kr(2).sr(4)
      .m(1).n(4).k(8)
      .a_stride(11)
      .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__sse2_ld64,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
}
12188
// k_eq_8_subtile: k = 8 with every n in 1..4; the m loop only visits m = 1.
// One iteration per combination.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__SSE2_LD64, k_eq_8_subtile) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t nc = 1; nc <= 4; ++nc) {
    for (uint32_t mc = 1; mc <= 1; ++mc) {
      GemmMicrokernelTester()
          .mr(1).nr(4).kr(2).sr(4)
          .m(mc).n(nc).k(8)
          .iterations(1)
          .Test(
              xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__sse2_ld64,
              xnn_init_qu8_conv_minmax_fp32_sse2_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
12206
// k_eq_8_subtile_m: n = 4, k = 8; the m loop only visits m = 1. One iteration.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__SSE2_LD64, k_eq_8_subtile_m) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t mc = 1; mc <= 1; ++mc) {
    GemmMicrokernelTester()
        .mr(1).nr(4).kr(2).sr(4)
        .m(mc).n(4).k(8)
        .iterations(1)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__sse2_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
  }
}
12222
// k_eq_8_subtile_n: m = 1, k = 8 with every n in 1..4, one iteration each.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__SSE2_LD64, k_eq_8_subtile_n) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t nc = 1; nc <= 4; ++nc) {
    GemmMicrokernelTester()
        .mr(1).nr(4).kr(2).sr(4)
        .m(1).n(nc).k(8)
        .iterations(1)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__sse2_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
  }
}
12238
// k_lt_8: m = 1, n = 4 for each k in 1..7.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__SSE2_LD64, k_lt_8) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t kc = 1; kc < 8; ++kc) {
    GemmMicrokernelTester()
        .mr(1).nr(4).kr(2).sr(4)
        .m(1).n(4).k(kc)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__sse2_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
  }
}
12253
// k_lt_8_strided_a: k in 1..7 with m = 1, n = 4 and a_stride set to 11.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__SSE2_LD64, k_lt_8_strided_a) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t kc = 1; kc < 8; ++kc) {
    GemmMicrokernelTester()
        .mr(1).nr(4).kr(2).sr(4)
        .m(1).n(4).k(kc)
        .a_stride(11)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__sse2_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
  }
}
12269
// k_lt_8_subtile: k in 1..7 crossed with n in 1..4; the m loop only visits
// m = 1. One iteration per combination.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__SSE2_LD64, k_lt_8_subtile) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t kc = 1; kc < 8; ++kc) {
    for (uint32_t nc = 1; nc <= 4; ++nc) {
      for (uint32_t mc = 1; mc <= 1; ++mc) {
        GemmMicrokernelTester()
            .mr(1).nr(4).kr(2).sr(4)
            .m(mc).n(nc).k(kc)
            .iterations(1)
            .Test(
                xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__sse2_ld64,
                xnn_init_qu8_conv_minmax_fp32_sse2_params,
                xnn_qu8_requantize_fp32);
      }
    }
  }
}
12289
// k_gt_8: m = 1, n = 4 for each k in 9..15.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__SSE2_LD64, k_gt_8) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t kc = 9; kc < 16; ++kc) {
    GemmMicrokernelTester()
        .mr(1).nr(4).kr(2).sr(4)
        .m(1).n(4).k(kc)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__sse2_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
  }
}
12304
// k_gt_8_strided_a: k in 9..15 with m = 1, n = 4 and a_stride set to 19.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__SSE2_LD64, k_gt_8_strided_a) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t kc = 9; kc < 16; ++kc) {
    GemmMicrokernelTester()
        .mr(1).nr(4).kr(2).sr(4)
        .m(1).n(4).k(kc)
        .a_stride(19)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__sse2_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
  }
}
12320
// k_gt_8_subtile: k in 9..15 crossed with n in 1..4; the m loop only visits
// m = 1. One iteration per combination.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__SSE2_LD64, k_gt_8_subtile) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t kc = 9; kc < 16; ++kc) {
    for (uint32_t nc = 1; nc <= 4; ++nc) {
      for (uint32_t mc = 1; mc <= 1; ++mc) {
        GemmMicrokernelTester()
            .mr(1).nr(4).kr(2).sr(4)
            .m(mc).n(nc).k(kc)
            .iterations(1)
            .Test(
                xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__sse2_ld64,
                xnn_init_qu8_conv_minmax_fp32_sse2_params,
                xnn_qu8_requantize_fp32);
      }
    }
  }
}
12340
// k_div_8: m = 1, n = 4 with k taking the multiples of 8 from 16 through 80.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__SSE2_LD64, k_div_8) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t kc = 16; kc <= 80; kc += 8) {
    GemmMicrokernelTester()
        .mr(1).nr(4).kr(2).sr(4)
        .m(1).n(4).k(kc)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__sse2_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
  }
}
12355
// k_div_8_strided_a: k = 16, 24, ..., 80 with m = 1, n = 4 and a_stride set to 83.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__SSE2_LD64, k_div_8_strided_a) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t kc = 16; kc <= 80; kc += 8) {
    GemmMicrokernelTester()
        .mr(1).nr(4).kr(2).sr(4)
        .m(1).n(4).k(kc)
        .a_stride(83)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__sse2_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
  }
}
12371
// k_div_8_subtile: k = 16, 24, ..., 80 crossed with n in 1..4; the m loop only
// visits m = 1. One iteration per combination.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__SSE2_LD64, k_div_8_subtile) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t kc = 16; kc <= 80; kc += 8) {
    for (uint32_t nc = 1; nc <= 4; ++nc) {
      for (uint32_t mc = 1; mc <= 1; ++mc) {
        GemmMicrokernelTester()
            .mr(1).nr(4).kr(2).sr(4)
            .m(mc).n(nc).k(kc)
            .iterations(1)
            .Test(
                xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__sse2_ld64,
                xnn_init_qu8_conv_minmax_fp32_sse2_params,
                xnn_qu8_requantize_fp32);
      }
    }
  }
}
12391
// n_gt_4: n in 5..7 with m = 1, each against k = 1, 10, 19, 28, 37.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__SSE2_LD64, n_gt_4) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t nc = 5; nc < 8; ++nc) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      GemmMicrokernelTester()
          .mr(1).nr(4).kr(2).sr(4)
          .m(1).n(nc).k(kc)
          .Test(
              xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__sse2_ld64,
              xnn_init_qu8_conv_minmax_fp32_sse2_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
12408
// n_gt_4_strided_cn: n in 5..7, k = 1, 10, ..., 37, m = 1, cn_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__SSE2_LD64, n_gt_4_strided_cn) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t nc = 5; nc < 8; ++nc) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      GemmMicrokernelTester()
          .mr(1).nr(4).kr(2).sr(4)
          .m(1).n(nc).k(kc)
          .cn_stride(7)
          .Test(
              xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__sse2_ld64,
              xnn_init_qu8_conv_minmax_fp32_sse2_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
12426
// n_gt_4_strided_a: n in 5..7, k = 1, 10, ..., 37, m = 1, a_stride set to 43.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__SSE2_LD64, n_gt_4_strided_a) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t nc = 5; nc < 8; ++nc) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      GemmMicrokernelTester()
          .mr(1).nr(4).kr(2).sr(4)
          .m(1).n(nc).k(kc)
          .a_stride(43)
          .Test(
              xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__sse2_ld64,
              xnn_init_qu8_conv_minmax_fp32_sse2_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
12444
// n_gt_4_subtile: n in 5..7 and k = 1, 10, ..., 37; the m loop only visits
// m = 1. One iteration per combination.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__SSE2_LD64, n_gt_4_subtile) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t nc = 5; nc < 8; ++nc) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      for (uint32_t mc = 1; mc <= 1; ++mc) {
        GemmMicrokernelTester()
            .mr(1).nr(4).kr(2).sr(4)
            .m(mc).n(nc).k(kc)
            .iterations(1)
            .Test(
                xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__sse2_ld64,
                xnn_init_qu8_conv_minmax_fp32_sse2_params,
                xnn_qu8_requantize_fp32);
      }
    }
  }
}
12464
// n_div_4: n = 8 and 12 with m = 1, each against k = 1, 10, 19, 28, 37.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__SSE2_LD64, n_div_4) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t nc = 8; nc <= 12; nc += 4) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      GemmMicrokernelTester()
          .mr(1).nr(4).kr(2).sr(4)
          .m(1).n(nc).k(kc)
          .Test(
              xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__sse2_ld64,
              xnn_init_qu8_conv_minmax_fp32_sse2_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
12481
// n_div_4_strided_cn: n = 8 and 12, k = 1, 10, ..., 37, m = 1, cn_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__SSE2_LD64, n_div_4_strided_cn) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t nc = 8; nc <= 12; nc += 4) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      GemmMicrokernelTester()
          .mr(1).nr(4).kr(2).sr(4)
          .m(1).n(nc).k(kc)
          .cn_stride(7)
          .Test(
              xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__sse2_ld64,
              xnn_init_qu8_conv_minmax_fp32_sse2_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
12499
// n_div_4_strided_a: n = 8 and 12, k = 1, 10, ..., 37, m = 1, a_stride set to 43.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__SSE2_LD64, n_div_4_strided_a) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t nc = 8; nc <= 12; nc += 4) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      GemmMicrokernelTester()
          .mr(1).nr(4).kr(2).sr(4)
          .m(1).n(nc).k(kc)
          .a_stride(43)
          .Test(
              xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__sse2_ld64,
              xnn_init_qu8_conv_minmax_fp32_sse2_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
12517
// n_div_4_subtile: n = 8 and 12 and k = 1, 10, ..., 37; the m loop only visits
// m = 1. One iteration per combination.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__SSE2_LD64, n_div_4_subtile) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t nc = 8; nc <= 12; nc += 4) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      for (uint32_t mc = 1; mc <= 1; ++mc) {
        GemmMicrokernelTester()
            .mr(1).nr(4).kr(2).sr(4)
            .m(mc).n(nc).k(kc)
            .iterations(1)
            .Test(
                xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__sse2_ld64,
                xnn_init_qu8_conv_minmax_fp32_sse2_params,
                xnn_qu8_requantize_fp32);
      }
    }
  }
}
12537
// strided_cm_subtile: k = 1, 10, ..., 37 crossed with n in 1..4; the m loop
// only visits m = 1. cm_stride set to 7, one iteration per combination.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__SSE2_LD64, strided_cm_subtile) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t kc = 1; kc <= 40; kc += 9) {
    for (uint32_t nc = 1; nc <= 4; ++nc) {
      for (uint32_t mc = 1; mc <= 1; ++mc) {
        GemmMicrokernelTester()
            .mr(1).nr(4).kr(2).sr(4)
            .m(mc).n(nc).k(kc)
            .cm_stride(7)
            .iterations(1)
            .Test(
                xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__sse2_ld64,
                xnn_init_qu8_conv_minmax_fp32_sse2_params,
                xnn_qu8_requantize_fp32);
      }
    }
  }
}
12558
// qmin: single m = 1, n = 4, k = 8 run with qmin set to 128.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__SSE2_LD64, qmin) {
  TEST_REQUIRES_X86_SSE2;
  GemmMicrokernelTester()
      .mr(1).nr(4).kr(2).sr(4)
      .m(1).n(4).k(8)
      .qmin(128)
      .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__sse2_ld64,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
}
12572
// qmax: single m = 1, n = 4, k = 8 run with qmax set to 128.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__SSE2_LD64, qmax) {
  TEST_REQUIRES_X86_SSE2;
  GemmMicrokernelTester()
      .mr(1).nr(4).kr(2).sr(4)
      .m(1).n(4).k(8)
      .qmax(128)
      .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__sse2_ld64,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
}
12586
// strided_cm: single m = 1, n = 4, k = 8 run with cm_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__SSE2_LD64, strided_cm) {
  TEST_REQUIRES_X86_SSE2;
  GemmMicrokernelTester()
      .mr(1).nr(4).kr(2).sr(4)
      .m(1).n(4).k(8)
      .cm_stride(7)
      .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__sse2_ld64,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
}
12600
// no_a_zero_point: k = 1, 10, ..., 37 with m = 1, n = 4 and a_zero_point set to 0.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__SSE2_LD64, no_a_zero_point) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t kc = 1; kc <= 40; kc += 9) {
    GemmMicrokernelTester()
        .mr(1).nr(4).kr(2).sr(4)
        .m(1).n(4).k(kc)
        .a_zero_point(0)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__sse2_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
  }
}
12616
// no_b_zero_point: k = 1, 10, ..., 37 with m = 1, n = 4 and b_zero_point set to 0.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__SSE2_LD64, no_b_zero_point) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t kc = 1; kc <= 40; kc += 9) {
    GemmMicrokernelTester()
        .mr(1).nr(4).kr(2).sr(4)
        .m(1).n(4).k(kc)
        .b_zero_point(0)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__sse2_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
  }
}
12632
// no_zero_point: k = 1, 10, ..., 37 with m = 1, n = 4 and both a_zero_point and
// b_zero_point set to 0.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__SSE2_LD64, no_zero_point) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t kc = 1; kc <= 40; kc += 9) {
    GemmMicrokernelTester()
        .mr(1).nr(4).kr(2).sr(4)
        .m(1).n(4).k(kc)
        .a_zero_point(0)
        .b_zero_point(0)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__sse2_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
  }
}
12649 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
12650
12651
12652 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// k_eq_8: single run of the 1x4c2s4 SSE4.1 LD64 kernel with m = 1, n = 4, k = 8.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__SSE41_LD64, k_eq_8) {
  TEST_REQUIRES_X86_SSE41;
  GemmMicrokernelTester()
      .mr(1).nr(4).kr(2).sr(4)
      .m(1).n(4).k(8)
      .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__sse41_ld64,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
}
12665
// strided_cn: single m = 1, n = 4, k = 8 run with cn_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__SSE41_LD64, strided_cn) {
  TEST_REQUIRES_X86_SSE41;
  GemmMicrokernelTester()
      .mr(1).nr(4).kr(2).sr(4)
      .m(1).n(4).k(8)
      .cn_stride(7)
      .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__sse41_ld64,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
}
12679
// k_eq_8_strided_a: single m = 1, n = 4, k = 8 run with a_stride set to 11.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__SSE41_LD64, k_eq_8_strided_a) {
  TEST_REQUIRES_X86_SSE41;
  GemmMicrokernelTester()
      .mr(1).nr(4).kr(2).sr(4)
      .m(1).n(4).k(8)
      .a_stride(11)
      .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__sse41_ld64,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
}
12693
// k_eq_8_subtile: k = 8 with every n in 1..4; the m loop only visits m = 1.
// One iteration per combination.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__SSE41_LD64, k_eq_8_subtile) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t nc = 1; nc <= 4; ++nc) {
    for (uint32_t mc = 1; mc <= 1; ++mc) {
      GemmMicrokernelTester()
          .mr(1).nr(4).kr(2).sr(4)
          .m(mc).n(nc).k(8)
          .iterations(1)
          .Test(
              xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__sse41_ld64,
              xnn_init_qu8_conv_minmax_fp32_sse2_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
12711
// k_eq_8_subtile_m: n = 4, k = 8; the m loop only visits m = 1. One iteration.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__SSE41_LD64, k_eq_8_subtile_m) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t mc = 1; mc <= 1; ++mc) {
    GemmMicrokernelTester()
        .mr(1).nr(4).kr(2).sr(4)
        .m(mc).n(4).k(8)
        .iterations(1)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__sse41_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
  }
}
12727
// k_eq_8_subtile_n: m = 1, k = 8 with every n in 1..4, one iteration each.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__SSE41_LD64, k_eq_8_subtile_n) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t nc = 1; nc <= 4; ++nc) {
    GemmMicrokernelTester()
        .mr(1).nr(4).kr(2).sr(4)
        .m(1).n(nc).k(8)
        .iterations(1)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__sse41_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
  }
}
12743
// k_lt_8: m = 1, n = 4 for each k in 1..7.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__SSE41_LD64, k_lt_8) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t kc = 1; kc < 8; ++kc) {
    GemmMicrokernelTester()
        .mr(1).nr(4).kr(2).sr(4)
        .m(1).n(4).k(kc)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__sse41_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
  }
}
12758
// k_lt_8_strided_a: k in 1..7 with m = 1, n = 4 and a_stride set to 11.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__SSE41_LD64, k_lt_8_strided_a) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t kc = 1; kc < 8; ++kc) {
    GemmMicrokernelTester()
        .mr(1).nr(4).kr(2).sr(4)
        .m(1).n(4).k(kc)
        .a_stride(11)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__sse41_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
  }
}
12774
// k_lt_8_subtile: k in 1..7 crossed with n in 1..4; the m loop only visits
// m = 1. One iteration per combination.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__SSE41_LD64, k_lt_8_subtile) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t kc = 1; kc < 8; ++kc) {
    for (uint32_t nc = 1; nc <= 4; ++nc) {
      for (uint32_t mc = 1; mc <= 1; ++mc) {
        GemmMicrokernelTester()
            .mr(1).nr(4).kr(2).sr(4)
            .m(mc).n(nc).k(kc)
            .iterations(1)
            .Test(
                xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__sse41_ld64,
                xnn_init_qu8_conv_minmax_fp32_sse2_params,
                xnn_qu8_requantize_fp32);
      }
    }
  }
}
12794
// k_gt_8: m = 1, n = 4 for each k in 9..15.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__SSE41_LD64, k_gt_8) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t kc = 9; kc < 16; ++kc) {
    GemmMicrokernelTester()
        .mr(1).nr(4).kr(2).sr(4)
        .m(1).n(4).k(kc)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__sse41_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
  }
}
12809
// k_gt_8_strided_a: k in 9..15 with m = 1, n = 4 and a_stride set to 19.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__SSE41_LD64, k_gt_8_strided_a) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t kc = 9; kc < 16; ++kc) {
    GemmMicrokernelTester()
        .mr(1).nr(4).kr(2).sr(4)
        .m(1).n(4).k(kc)
        .a_stride(19)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__sse41_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
  }
}
12825
// k_gt_8_subtile: k in 9..15 crossed with n in 1..4; the m loop only visits
// m = 1. One iteration per combination.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__SSE41_LD64, k_gt_8_subtile) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t kc = 9; kc < 16; ++kc) {
    for (uint32_t nc = 1; nc <= 4; ++nc) {
      for (uint32_t mc = 1; mc <= 1; ++mc) {
        GemmMicrokernelTester()
            .mr(1).nr(4).kr(2).sr(4)
            .m(mc).n(nc).k(kc)
            .iterations(1)
            .Test(
                xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__sse41_ld64,
                xnn_init_qu8_conv_minmax_fp32_sse2_params,
                xnn_qu8_requantize_fp32);
      }
    }
  }
}
12845
// k_div_8: m = 1, n = 4 with k taking the multiples of 8 from 16 through 80.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__SSE41_LD64, k_div_8) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t kc = 16; kc <= 80; kc += 8) {
    GemmMicrokernelTester()
        .mr(1).nr(4).kr(2).sr(4)
        .m(1).n(4).k(kc)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__sse41_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
  }
}
12860
// k_div_8_strided_a: k = 16, 24, ..., 80 with m = 1, n = 4 and a_stride set to 83.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__SSE41_LD64, k_div_8_strided_a) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t kc = 16; kc <= 80; kc += 8) {
    GemmMicrokernelTester()
        .mr(1).nr(4).kr(2).sr(4)
        .m(1).n(4).k(kc)
        .a_stride(83)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__sse41_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
  }
}
12876
// k_div_8_subtile: k = 16, 24, ..., 80 crossed with n in 1..4; the m loop only
// visits m = 1. One iteration per combination.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__SSE41_LD64, k_div_8_subtile) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t kc = 16; kc <= 80; kc += 8) {
    for (uint32_t nc = 1; nc <= 4; ++nc) {
      for (uint32_t mc = 1; mc <= 1; ++mc) {
        GemmMicrokernelTester()
            .mr(1).nr(4).kr(2).sr(4)
            .m(mc).n(nc).k(kc)
            .iterations(1)
            .Test(
                xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__sse41_ld64,
                xnn_init_qu8_conv_minmax_fp32_sse2_params,
                xnn_qu8_requantize_fp32);
      }
    }
  }
}
12896
// n_gt_4: runs n = 5, 6, 7 (above the configured nr=4) with m=1 for
// k = 1, 10, 19, 28, 37.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__SSE41_LD64, n_gt_4) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t n = 5; n < 8; n++) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(1)
        .nr(4)
        .kr(2)
        .sr(4)
        .m(1)
        .n(n)
        .k(k)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__sse41_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
    }
  }
}
12913
// n_gt_4_strided_cn: n = 5, 6, 7 with m=1 and k = 1, 10, 19, 28, 37, with
// cn_stride(7) set on the tester.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__SSE41_LD64, n_gt_4_strided_cn) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t n = 5; n < 8; n++) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(1)
        .nr(4)
        .kr(2)
        .sr(4)
        .m(1)
        .n(n)
        .k(k)
        .cn_stride(7)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__sse41_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
    }
  }
}
12931
// n_gt_4_strided_a: n = 5, 6, 7 with m=1 and k = 1, 10, 19, 28, 37, with
// a_stride(43) set on the tester.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__SSE41_LD64, n_gt_4_strided_a) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t n = 5; n < 8; n++) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(1)
        .nr(4)
        .kr(2)
        .sr(4)
        .m(1)
        .n(n)
        .k(k)
        .a_stride(43)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__sse41_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
    }
  }
}
12949
// n_gt_4_subtile: n = 5, 6, 7 and k = 1, 10, 19, 28, 37, iterating m over
// [1, 1], using iterations(1) per configuration.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__SSE41_LD64, n_gt_4_subtile) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t n = 5; n < 8; n++) {
    for (size_t k = 1; k <= 40; k += 9) {
      for (uint32_t m = 1; m <= 1; m++) {
        GemmMicrokernelTester()
          .mr(1)
          .nr(4)
          .kr(2)
          .sr(4)
          .m(m)
          .n(n)
          .k(k)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__sse41_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
      }
    }
  }
}
12969
// n_div_4: runs n = 8 and n = 12 (multiples of the configured nr=4) with m=1
// for k = 1, 10, 19, 28, 37.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__SSE41_LD64, n_div_4) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t n = 8; n <= 12; n += 4) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(1)
        .nr(4)
        .kr(2)
        .sr(4)
        .m(1)
        .n(n)
        .k(k)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__sse41_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
    }
  }
}
12986
// n_div_4_strided_cn: n = 8 and n = 12 with m=1 and k = 1, 10, 19, 28, 37,
// with cn_stride(7) set on the tester.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__SSE41_LD64, n_div_4_strided_cn) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t n = 8; n <= 12; n += 4) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(1)
        .nr(4)
        .kr(2)
        .sr(4)
        .m(1)
        .n(n)
        .k(k)
        .cn_stride(7)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__sse41_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
    }
  }
}
13004
// n_div_4_strided_a: n = 8 and n = 12 with m=1 and k = 1, 10, 19, 28, 37,
// with a_stride(43) set on the tester.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__SSE41_LD64, n_div_4_strided_a) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t n = 8; n <= 12; n += 4) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(1)
        .nr(4)
        .kr(2)
        .sr(4)
        .m(1)
        .n(n)
        .k(k)
        .a_stride(43)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__sse41_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
    }
  }
}
13022
// n_div_4_subtile: n = 8 and n = 12 with k = 1, 10, 19, 28, 37, iterating m
// over [1, 1], using iterations(1) per configuration.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__SSE41_LD64, n_div_4_subtile) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t n = 8; n <= 12; n += 4) {
    for (size_t k = 1; k <= 40; k += 9) {
      for (uint32_t m = 1; m <= 1; m++) {
        GemmMicrokernelTester()
          .mr(1)
          .nr(4)
          .kr(2)
          .sr(4)
          .m(m)
          .n(n)
          .k(k)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__sse41_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
      }
    }
  }
}
13042
// strided_cm_subtile: for k = 1, 10, 19, 28, 37, runs every (m, n) with
// n in [1, 4] and m in [1, 1], with cm_stride(7) and iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__SSE41_LD64, strided_cm_subtile) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k = 1; k <= 40; k += 9) {
    for (uint32_t n = 1; n <= 4; n++) {
      for (uint32_t m = 1; m <= 1; m++) {
        GemmMicrokernelTester()
          .mr(1)
          .nr(4)
          .kr(2)
          .sr(4)
          .m(m)
          .n(n)
          .k(k)
          .cm_stride(7)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__sse41_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
      }
    }
  }
}
13063
// qmin: single run with m=1, n=4, k=8 and qmin(128) set on the tester.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__SSE41_LD64, qmin) {
  TEST_REQUIRES_X86_SSE41;
  GemmMicrokernelTester()
    .mr(1)
    .nr(4)
    .kr(2)
    .sr(4)
    .m(1)
    .n(4)
    .k(8)
    .qmin(128)
    .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__sse41_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
}
13077
// qmax: single run with m=1, n=4, k=8 and qmax(128) set on the tester.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__SSE41_LD64, qmax) {
  TEST_REQUIRES_X86_SSE41;
  GemmMicrokernelTester()
    .mr(1)
    .nr(4)
    .kr(2)
    .sr(4)
    .m(1)
    .n(4)
    .k(8)
    .qmax(128)
    .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__sse41_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
}
13091
// strided_cm: single run with m=1, n=4, k=8 and cm_stride(7) set on the tester.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__SSE41_LD64, strided_cm) {
  TEST_REQUIRES_X86_SSE41;
  GemmMicrokernelTester()
    .mr(1)
    .nr(4)
    .kr(2)
    .sr(4)
    .m(1)
    .n(4)
    .k(8)
    .cm_stride(7)
    .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__sse41_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
}
13105
// no_a_zero_point: m=1, n=4 for k = 1, 10, 19, 28, 37 with a_zero_point(0).
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__SSE41_LD64, no_a_zero_point) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k = 1; k <= 40; k += 9) {
    GemmMicrokernelTester()
      .mr(1)
      .nr(4)
      .kr(2)
      .sr(4)
      .m(1)
      .n(4)
      .k(k)
      .a_zero_point(0)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__sse41_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
  }
}
13121
// no_b_zero_point: m=1, n=4 for k = 1, 10, 19, 28, 37 with b_zero_point(0).
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__SSE41_LD64, no_b_zero_point) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k = 1; k <= 40; k += 9) {
    GemmMicrokernelTester()
      .mr(1)
      .nr(4)
      .kr(2)
      .sr(4)
      .m(1)
      .n(4)
      .k(k)
      .b_zero_point(0)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__sse41_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
  }
}
13137
// no_zero_point: m=1, n=4 for k = 1, 10, 19, 28, 37 with both a_zero_point(0)
// and b_zero_point(0).
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__SSE41_LD64, no_zero_point) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k = 1; k <= 40; k += 9) {
    GemmMicrokernelTester()
      .mr(1)
      .nr(4)
      .kr(2)
      .sr(4)
      .m(1)
      .n(4)
      .k(k)
      .a_zero_point(0)
      .b_zero_point(0)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__sse41_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
  }
}
13154 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
13155
13156
13157 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// k_eq_8: single run of xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__sse2_ld64
// through GemmMicrokernelTester with m=2, n=4, k=8
// (tester configured with mr=2, nr=4, kr=2, sr=4).
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__SSE2_LD64, k_eq_8) {
  TEST_REQUIRES_X86_SSE2;
  GemmMicrokernelTester()
    .mr(2)
    .nr(4)
    .kr(2)
    .sr(4)
    .m(2)
    .n(4)
    .k(8)
    .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__sse2_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
}
13170
// strided_cn: single run with m=2, n=4, k=8 and cn_stride(7) set on the tester.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__SSE2_LD64, strided_cn) {
  TEST_REQUIRES_X86_SSE2;
  GemmMicrokernelTester()
    .mr(2)
    .nr(4)
    .kr(2)
    .sr(4)
    .m(2)
    .n(4)
    .k(8)
    .cn_stride(7)
    .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__sse2_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
}
13184
// k_eq_8_strided_a: single run with m=2, n=4, k=8 and a_stride(11) set on the
// tester.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__SSE2_LD64, k_eq_8_strided_a) {
  TEST_REQUIRES_X86_SSE2;
  GemmMicrokernelTester()
    .mr(2)
    .nr(4)
    .kr(2)
    .sr(4)
    .m(2)
    .n(4)
    .k(8)
    .a_stride(11)
    .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__sse2_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
}
13198
// k_eq_8_subtile: with k=8, runs every (m, n) combination with n in [1, 4] and
// m in [1, 2], using iterations(1) per configuration.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__SSE2_LD64, k_eq_8_subtile) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t n = 1; n <= 4; n++) {
    for (uint32_t m = 1; m <= 2; m++) {
      GemmMicrokernelTester()
        .mr(2)
        .nr(4)
        .kr(2)
        .sr(4)
        .m(m)
        .n(n)
        .k(8)
        .iterations(1)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__sse2_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
    }
  }
}
13216
// k_eq_8_subtile_m: with n=4 and k=8, runs m = 1 and m = 2, using
// iterations(1) per configuration.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__SSE2_LD64, k_eq_8_subtile_m) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t m = 1; m <= 2; m++) {
    GemmMicrokernelTester()
      .mr(2)
      .nr(4)
      .kr(2)
      .sr(4)
      .m(m)
      .n(4)
      .k(8)
      .iterations(1)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__sse2_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
  }
}
13232
// k_eq_8_subtile_n: with m=2 and k=8, runs n = 1..4, using iterations(1) per
// configuration.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__SSE2_LD64, k_eq_8_subtile_n) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t n = 1; n <= 4; n++) {
    GemmMicrokernelTester()
      .mr(2)
      .nr(4)
      .kr(2)
      .sr(4)
      .m(2)
      .n(n)
      .k(8)
      .iterations(1)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__sse2_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
  }
}
13248
// k_lt_8: runs k = 1..7 with m=2, n=4.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__SSE2_LD64, k_lt_8) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t k = 1; k < 8; k++) {
    GemmMicrokernelTester()
      .mr(2)
      .nr(4)
      .kr(2)
      .sr(4)
      .m(2)
      .n(4)
      .k(k)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__sse2_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
  }
}
13263
// k_lt_8_strided_a: runs k = 1..7 with m=2, n=4 and a_stride(11) set on the
// tester.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__SSE2_LD64, k_lt_8_strided_a) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t k = 1; k < 8; k++) {
    GemmMicrokernelTester()
      .mr(2)
      .nr(4)
      .kr(2)
      .sr(4)
      .m(2)
      .n(4)
      .k(k)
      .a_stride(11)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__sse2_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
  }
}
13279
// k_lt_8_subtile: for k = 1..7, runs every (m, n) combination with n in [1, 4]
// and m in [1, 2], using iterations(1) per configuration.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__SSE2_LD64, k_lt_8_subtile) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t k = 1; k < 8; k++) {
    for (uint32_t n = 1; n <= 4; n++) {
      for (uint32_t m = 1; m <= 2; m++) {
        GemmMicrokernelTester()
          .mr(2)
          .nr(4)
          .kr(2)
          .sr(4)
          .m(m)
          .n(n)
          .k(k)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__sse2_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
      }
    }
  }
}
13299
// k_gt_8: runs k = 9..15 with m=2, n=4.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__SSE2_LD64, k_gt_8) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t k = 9; k < 16; k++) {
    GemmMicrokernelTester()
      .mr(2)
      .nr(4)
      .kr(2)
      .sr(4)
      .m(2)
      .n(4)
      .k(k)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__sse2_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
  }
}
13314
// k_gt_8_strided_a: runs k = 9..15 with m=2, n=4 and a_stride(19) set on the
// tester.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__SSE2_LD64, k_gt_8_strided_a) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t k = 9; k < 16; k++) {
    GemmMicrokernelTester()
      .mr(2)
      .nr(4)
      .kr(2)
      .sr(4)
      .m(2)
      .n(4)
      .k(k)
      .a_stride(19)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__sse2_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
  }
}
13330
// k_gt_8_subtile: for k = 9..15, runs every (m, n) combination with
// n in [1, 4] and m in [1, 2], using iterations(1) per configuration.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__SSE2_LD64, k_gt_8_subtile) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t k = 9; k < 16; k++) {
    for (uint32_t n = 1; n <= 4; n++) {
      for (uint32_t m = 1; m <= 2; m++) {
        GemmMicrokernelTester()
          .mr(2)
          .nr(4)
          .kr(2)
          .sr(4)
          .m(m)
          .n(n)
          .k(k)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__sse2_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
      }
    }
  }
}
13350
// k_div_8: runs k = 16, 24, ..., 80 with m=2, n=4.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__SSE2_LD64, k_div_8) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t k = 16; k <= 80; k += 8) {
    GemmMicrokernelTester()
      .mr(2)
      .nr(4)
      .kr(2)
      .sr(4)
      .m(2)
      .n(4)
      .k(k)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__sse2_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
  }
}
13365
// k_div_8_strided_a: runs k = 16, 24, ..., 80 with m=2, n=4 and a_stride(83)
// set on the tester.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__SSE2_LD64, k_div_8_strided_a) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t k = 16; k <= 80; k += 8) {
    GemmMicrokernelTester()
      .mr(2)
      .nr(4)
      .kr(2)
      .sr(4)
      .m(2)
      .n(4)
      .k(k)
      .a_stride(83)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__sse2_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
  }
}
13381
// k_div_8_subtile: for k = 16, 24, ..., 80, runs every (m, n) combination with
// n in [1, 4] and m in [1, 2], using iterations(1) per configuration.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__SSE2_LD64, k_div_8_subtile) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t k = 16; k <= 80; k += 8) {
    for (uint32_t n = 1; n <= 4; n++) {
      for (uint32_t m = 1; m <= 2; m++) {
        GemmMicrokernelTester()
          .mr(2)
          .nr(4)
          .kr(2)
          .sr(4)
          .m(m)
          .n(n)
          .k(k)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__sse2_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
      }
    }
  }
}
13401
// n_gt_4: runs n = 5, 6, 7 (above the configured nr=4) with m=2 for
// k = 1, 10, 19, 28, 37.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__SSE2_LD64, n_gt_4) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t n = 5; n < 8; n++) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(2)
        .nr(4)
        .kr(2)
        .sr(4)
        .m(2)
        .n(n)
        .k(k)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__sse2_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
    }
  }
}
13418
// n_gt_4_strided_cn: n = 5, 6, 7 with m=2 and k = 1, 10, 19, 28, 37, with
// cn_stride(7) set on the tester.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__SSE2_LD64, n_gt_4_strided_cn) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t n = 5; n < 8; n++) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(2)
        .nr(4)
        .kr(2)
        .sr(4)
        .m(2)
        .n(n)
        .k(k)
        .cn_stride(7)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__sse2_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
    }
  }
}
13436
// n_gt_4_strided_a: n = 5, 6, 7 with m=2 and k = 1, 10, 19, 28, 37, with
// a_stride(43) set on the tester.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__SSE2_LD64, n_gt_4_strided_a) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t n = 5; n < 8; n++) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(2)
        .nr(4)
        .kr(2)
        .sr(4)
        .m(2)
        .n(n)
        .k(k)
        .a_stride(43)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__sse2_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
    }
  }
}
13454
// n_gt_4_subtile: n = 5, 6, 7 and k = 1, 10, 19, 28, 37, iterating m over
// [1, 2], using iterations(1) per configuration.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__SSE2_LD64, n_gt_4_subtile) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t n = 5; n < 8; n++) {
    for (size_t k = 1; k <= 40; k += 9) {
      for (uint32_t m = 1; m <= 2; m++) {
        GemmMicrokernelTester()
          .mr(2)
          .nr(4)
          .kr(2)
          .sr(4)
          .m(m)
          .n(n)
          .k(k)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__sse2_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
      }
    }
  }
}
13474
// n_div_4: runs n = 8 and n = 12 (multiples of the configured nr=4) with m=2
// for k = 1, 10, 19, 28, 37.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__SSE2_LD64, n_div_4) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t n = 8; n <= 12; n += 4) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(2)
        .nr(4)
        .kr(2)
        .sr(4)
        .m(2)
        .n(n)
        .k(k)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__sse2_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
    }
  }
}
13491
// n_div_4_strided_cn: n = 8 and n = 12 with m=2 and k = 1, 10, 19, 28, 37,
// with cn_stride(7) set on the tester.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__SSE2_LD64, n_div_4_strided_cn) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t n = 8; n <= 12; n += 4) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(2)
        .nr(4)
        .kr(2)
        .sr(4)
        .m(2)
        .n(n)
        .k(k)
        .cn_stride(7)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__sse2_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
    }
  }
}
13509
// n_div_4_strided_a: n = 8 and n = 12 with m=2 and k = 1, 10, 19, 28, 37,
// with a_stride(43) set on the tester.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__SSE2_LD64, n_div_4_strided_a) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t n = 8; n <= 12; n += 4) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(2)
        .nr(4)
        .kr(2)
        .sr(4)
        .m(2)
        .n(n)
        .k(k)
        .a_stride(43)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__sse2_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
    }
  }
}
13527
// n_div_4_subtile: n = 8 and n = 12 with k = 1, 10, 19, 28, 37, iterating m
// over [1, 2], using iterations(1) per configuration.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__SSE2_LD64, n_div_4_subtile) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t n = 8; n <= 12; n += 4) {
    for (size_t k = 1; k <= 40; k += 9) {
      for (uint32_t m = 1; m <= 2; m++) {
        GemmMicrokernelTester()
          .mr(2)
          .nr(4)
          .kr(2)
          .sr(4)
          .m(m)
          .n(n)
          .k(k)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__sse2_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
      }
    }
  }
}
13547
// strided_cm_subtile: for k = 1, 10, 19, 28, 37, runs every (m, n) with
// n in [1, 4] and m in [1, 2], with cm_stride(7) and iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__SSE2_LD64, strided_cm_subtile) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t k = 1; k <= 40; k += 9) {
    for (uint32_t n = 1; n <= 4; n++) {
      for (uint32_t m = 1; m <= 2; m++) {
        GemmMicrokernelTester()
          .mr(2)
          .nr(4)
          .kr(2)
          .sr(4)
          .m(m)
          .n(n)
          .k(k)
          .cm_stride(7)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__sse2_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
      }
    }
  }
}
13568
// qmin: single run with m=2, n=4, k=8 and qmin(128) set on the tester.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__SSE2_LD64, qmin) {
  TEST_REQUIRES_X86_SSE2;
  GemmMicrokernelTester()
    .mr(2)
    .nr(4)
    .kr(2)
    .sr(4)
    .m(2)
    .n(4)
    .k(8)
    .qmin(128)
    .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__sse2_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
}
13582
// qmax: single run with m=2, n=4, k=8 and qmax(128) set on the tester.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__SSE2_LD64, qmax) {
  TEST_REQUIRES_X86_SSE2;
  GemmMicrokernelTester()
    .mr(2)
    .nr(4)
    .kr(2)
    .sr(4)
    .m(2)
    .n(4)
    .k(8)
    .qmax(128)
    .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__sse2_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
}
13596
// strided_cm: single run with m=2, n=4, k=8 and cm_stride(7) set on the tester.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__SSE2_LD64, strided_cm) {
  TEST_REQUIRES_X86_SSE2;
  GemmMicrokernelTester()
    .mr(2)
    .nr(4)
    .kr(2)
    .sr(4)
    .m(2)
    .n(4)
    .k(8)
    .cm_stride(7)
    .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__sse2_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
}
13610
// no_a_zero_point: m=2, n=4 for k = 1, 10, 19, 28, 37 with a_zero_point(0).
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__SSE2_LD64, no_a_zero_point) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t k = 1; k <= 40; k += 9) {
    GemmMicrokernelTester()
      .mr(2)
      .nr(4)
      .kr(2)
      .sr(4)
      .m(2)
      .n(4)
      .k(k)
      .a_zero_point(0)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__sse2_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
  }
}
13626
// no_b_zero_point: m=2, n=4 for k = 1, 10, 19, 28, 37 with b_zero_point(0).
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__SSE2_LD64, no_b_zero_point) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t k = 1; k <= 40; k += 9) {
    GemmMicrokernelTester()
      .mr(2)
      .nr(4)
      .kr(2)
      .sr(4)
      .m(2)
      .n(4)
      .k(k)
      .b_zero_point(0)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__sse2_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
  }
}
13642
// no_zero_point: m=2, n=4 for k = 1, 10, 19, 28, 37 with both a_zero_point(0)
// and b_zero_point(0).
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__SSE2_LD64, no_zero_point) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t k = 1; k <= 40; k += 9) {
    GemmMicrokernelTester()
      .mr(2)
      .nr(4)
      .kr(2)
      .sr(4)
      .m(2)
      .n(4)
      .k(k)
      .a_zero_point(0)
      .b_zero_point(0)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__sse2_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
  }
}
13659 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
13660
13661
13662 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// k_eq_8: single run of xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__sse41_ld64
// through GemmMicrokernelTester with m=4, n=4, k=8
// (tester configured with mr=4, nr=4, kr=2, sr=4).
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__SSE41_LD64, k_eq_8) {
  TEST_REQUIRES_X86_SSE41;
  GemmMicrokernelTester()
    .mr(4)
    .nr(4)
    .kr(2)
    .sr(4)
    .m(4)
    .n(4)
    .k(8)
    .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__sse41_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
}
13675
// strided_cn: single run with m=4, n=4, k=8 and cn_stride(7) set on the tester.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__SSE41_LD64, strided_cn) {
  TEST_REQUIRES_X86_SSE41;
  GemmMicrokernelTester()
    .mr(4)
    .nr(4)
    .kr(2)
    .sr(4)
    .m(4)
    .n(4)
    .k(8)
    .cn_stride(7)
    .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__sse41_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
}
13689
// k_eq_8_strided_a: single run with m=4, n=4, k=8 and a_stride(11) set on the
// tester.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__SSE41_LD64, k_eq_8_strided_a) {
  TEST_REQUIRES_X86_SSE41;
  GemmMicrokernelTester()
    .mr(4)
    .nr(4)
    .kr(2)
    .sr(4)
    .m(4)
    .n(4)
    .k(8)
    .a_stride(11)
    .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__sse41_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
}
13703
// k_eq_8_subtile: with k=8, runs every (m, n) combination with n in [1, 4] and
// m in [1, 4], using iterations(1) per configuration.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__SSE41_LD64, k_eq_8_subtile) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t n = 1; n <= 4; n++) {
    for (uint32_t m = 1; m <= 4; m++) {
      GemmMicrokernelTester()
        .mr(4)
        .nr(4)
        .kr(2)
        .sr(4)
        .m(m)
        .n(n)
        .k(8)
        .iterations(1)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__sse41_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
    }
  }
}
13721
// k_eq_8_subtile_m: with n=4 and k=8, runs m = 1..4, using iterations(1) per
// configuration.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__SSE41_LD64, k_eq_8_subtile_m) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t m = 1; m <= 4; m++) {
    GemmMicrokernelTester()
      .mr(4)
      .nr(4)
      .kr(2)
      .sr(4)
      .m(m)
      .n(4)
      .k(8)
      .iterations(1)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__sse41_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
  }
}
13737
// k_eq_8_subtile_n: with m=4 and k=8, runs n = 1..4, using iterations(1) per
// configuration.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__SSE41_LD64, k_eq_8_subtile_n) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t n = 1; n <= 4; n++) {
    GemmMicrokernelTester()
      .mr(4)
      .nr(4)
      .kr(2)
      .sr(4)
      .m(4)
      .n(n)
      .k(8)
      .iterations(1)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__sse41_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
  }
}
13753
// k_lt_8: runs k = 1..7 with m=4, n=4.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__SSE41_LD64, k_lt_8) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k = 1; k < 8; k++) {
    GemmMicrokernelTester()
      .mr(4)
      .nr(4)
      .kr(2)
      .sr(4)
      .m(4)
      .n(4)
      .k(k)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__sse41_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
  }
}
13768
// k_lt_8_strided_a: runs k = 1..7 with m=4, n=4 and a_stride(11) set on the
// tester.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__SSE41_LD64, k_lt_8_strided_a) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k = 1; k < 8; k++) {
    GemmMicrokernelTester()
      .mr(4)
      .nr(4)
      .kr(2)
      .sr(4)
      .m(4)
      .n(4)
      .k(k)
      .a_stride(11)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__sse41_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
  }
}
13784
// k_lt_8_subtile: for k = 1..7, runs every (m, n) combination with n in [1, 4]
// and m in [1, 4], using iterations(1) per configuration.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__SSE41_LD64, k_lt_8_subtile) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k = 1; k < 8; k++) {
    for (uint32_t n = 1; n <= 4; n++) {
      for (uint32_t m = 1; m <= 4; m++) {
        GemmMicrokernelTester()
          .mr(4)
          .nr(4)
          .kr(2)
          .sr(4)
          .m(m)
          .n(n)
          .k(k)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__sse41_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
      }
    }
  }
}
13804
// k_gt_8: runs k = 9..15 with m=4, n=4.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__SSE41_LD64, k_gt_8) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k = 9; k < 16; k++) {
    GemmMicrokernelTester()
      .mr(4)
      .nr(4)
      .kr(2)
      .sr(4)
      .m(4)
      .n(4)
      .k(k)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__sse41_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
  }
}
13819
// k_gt_8_strided_a: runs k = 9..15 with m=4, n=4 and a_stride(19) set on the
// tester.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__SSE41_LD64, k_gt_8_strided_a) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k = 9; k < 16; k++) {
    GemmMicrokernelTester()
      .mr(4)
      .nr(4)
      .kr(2)
      .sr(4)
      .m(4)
      .n(4)
      .k(k)
      .a_stride(19)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__sse41_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
  }
}
13835
// k_gt_8_subtile: for k = 9..15, runs every (m, n) combination with
// n in [1, 4] and m in [1, 4], using iterations(1) per configuration.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__SSE41_LD64, k_gt_8_subtile) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k = 9; k < 16; k++) {
    for (uint32_t n = 1; n <= 4; n++) {
      for (uint32_t m = 1; m <= 4; m++) {
        GemmMicrokernelTester()
          .mr(4)
          .nr(4)
          .kr(2)
          .sr(4)
          .m(m)
          .n(n)
          .k(k)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__sse41_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
      }
    }
  }
}
13855
// k_div_8: runs k = 16, 24, ..., 80 with m=4, n=4.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__SSE41_LD64, k_div_8) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k = 16; k <= 80; k += 8) {
    GemmMicrokernelTester()
      .mr(4)
      .nr(4)
      .kr(2)
      .sr(4)
      .m(4)
      .n(4)
      .k(k)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__sse41_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
  }
}
13870
// k_div_8_strided_a: runs k = 16, 24, ..., 80 with m=4, n=4 and a_stride(83)
// set on the tester.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__SSE41_LD64, k_div_8_strided_a) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k = 16; k <= 80; k += 8) {
    GemmMicrokernelTester()
      .mr(4)
      .nr(4)
      .kr(2)
      .sr(4)
      .m(4)
      .n(4)
      .k(k)
      .a_stride(83)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__sse41_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
  }
}
13886
// k_div_8_subtile: for k = 16, 24, ..., 80, runs every (m, n) combination with
// n in [1, 4] and m in [1, 4], using iterations(1) per configuration.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__SSE41_LD64, k_div_8_subtile) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k = 16; k <= 80; k += 8) {
    for (uint32_t n = 1; n <= 4; n++) {
      for (uint32_t m = 1; m <= 4; m++) {
        GemmMicrokernelTester()
          .mr(4)
          .nr(4)
          .kr(2)
          .sr(4)
          .m(m)
          .n(n)
          .k(k)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__sse41_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
      }
    }
  }
}
13906
// n_gt_4: runs n = 5, 6, 7 (above the configured nr=4) with m=4 for
// k = 1, 10, 19, 28, 37.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__SSE41_LD64, n_gt_4) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t n = 5; n < 8; n++) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(4)
        .nr(4)
        .kr(2)
        .sr(4)
        .m(4)
        .n(n)
        .k(k)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__sse41_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
    }
  }
}
13923
// n_gt_4_strided_cn: n = 5, 6, 7 with m=4 and k = 1, 10, 19, 28, 37, with
// cn_stride(7) set on the tester.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__SSE41_LD64, n_gt_4_strided_cn) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t n = 5; n < 8; n++) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(4)
        .nr(4)
        .kr(2)
        .sr(4)
        .m(4)
        .n(n)
        .k(k)
        .cn_stride(7)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__sse41_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
    }
  }
}
13941
// n in 5..7 and k in 1, 10, ..., 37 with m fixed at 4 and a_stride set to 43.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__SSE41_LD64, n_gt_4_strided_a) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t nc = 5; nc < 8; ++nc) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      GemmMicrokernelTester()
          .mr(4).nr(4).kr(2).sr(4)
          .m(4).n(nc).k(kc)
          .a_stride(43)
          .Test(
              xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__sse41_ld64,
              xnn_init_qu8_conv_minmax_fp32_sse2_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
13959
// n in 5..7, k in 1, 10, ..., 37, and m in 1..4; each configuration is run
// for a single iteration.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__SSE41_LD64, n_gt_4_subtile) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t nc = 5; nc < 8; ++nc) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      for (uint32_t mc = 1; mc <= 4; ++mc) {
        GemmMicrokernelTester()
            .mr(4).nr(4).kr(2).sr(4)
            .m(mc).n(nc).k(kc)
            .iterations(1)
            .Test(
                xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__sse41_ld64,
                xnn_init_qu8_conv_minmax_fp32_sse2_params,
                xnn_qu8_requantize_fp32);
      }
    }
  }
}
13979
// n takes the values 8 and 12 (multiples of nr == 4); for each, k steps
// through 1, 10, 19, 28, 37 with m fixed at 4.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__SSE41_LD64, n_div_4) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t nc = 8; nc <= 12; nc += 4) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      GemmMicrokernelTester()
          .mr(4).nr(4).kr(2).sr(4)
          .m(4).n(nc).k(kc)
          .Test(
              xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__sse41_ld64,
              xnn_init_qu8_conv_minmax_fp32_sse2_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
13996
// n in {8, 12} and k in 1, 10, ..., 37 with m fixed at 4 and cn_stride set
// to 7.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__SSE41_LD64, n_div_4_strided_cn) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t nc = 8; nc <= 12; nc += 4) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      GemmMicrokernelTester()
          .mr(4).nr(4).kr(2).sr(4)
          .m(4).n(nc).k(kc)
          .cn_stride(7)
          .Test(
              xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__sse41_ld64,
              xnn_init_qu8_conv_minmax_fp32_sse2_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
14014
// n in {8, 12} and k in 1, 10, ..., 37 with m fixed at 4 and a_stride set
// to 43.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__SSE41_LD64, n_div_4_strided_a) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t nc = 8; nc <= 12; nc += 4) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      GemmMicrokernelTester()
          .mr(4).nr(4).kr(2).sr(4)
          .m(4).n(nc).k(kc)
          .a_stride(43)
          .Test(
              xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__sse41_ld64,
              xnn_init_qu8_conv_minmax_fp32_sse2_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
14032
// n in {8, 12}, k in 1, 10, ..., 37, and m in 1..4; each configuration is run
// for a single iteration.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__SSE41_LD64, n_div_4_subtile) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t nc = 8; nc <= 12; nc += 4) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      for (uint32_t mc = 1; mc <= 4; ++mc) {
        GemmMicrokernelTester()
            .mr(4).nr(4).kr(2).sr(4)
            .m(mc).n(nc).k(kc)
            .iterations(1)
            .Test(
                xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__sse41_ld64,
                xnn_init_qu8_conv_minmax_fp32_sse2_params,
                xnn_qu8_requantize_fp32);
      }
    }
  }
}
14052
// k in 1, 10, ..., 37 crossed with every (n, m) pair in 1..4 x 1..4, using
// cm_stride == 7 and a single iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__SSE41_LD64, strided_cm_subtile) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t kc = 1; kc <= 40; kc += 9) {
    for (uint32_t nc = 1; nc <= 4; ++nc) {
      for (uint32_t mc = 1; mc <= 4; ++mc) {
        GemmMicrokernelTester()
            .mr(4).nr(4).kr(2).sr(4)
            .m(mc).n(nc).k(kc)
            .cm_stride(7)
            .iterations(1)
            .Test(
                xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__sse41_ld64,
                xnn_init_qu8_conv_minmax_fp32_sse2_params,
                xnn_qu8_requantize_fp32);
      }
    }
  }
}
14073
// Single 4x4 configuration at k == 8 with qmin set to 128.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__SSE41_LD64, qmin) {
  TEST_REQUIRES_X86_SSE41;
  GemmMicrokernelTester()
      .mr(4).nr(4).kr(2).sr(4)
      .m(4).n(4).k(8)
      .qmin(128)
      .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__sse41_ld64,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
}
14087
// Single 4x4 configuration at k == 8 with qmax set to 128.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__SSE41_LD64, qmax) {
  TEST_REQUIRES_X86_SSE41;
  GemmMicrokernelTester()
      .mr(4).nr(4).kr(2).sr(4)
      .m(4).n(4).k(8)
      .qmax(128)
      .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__sse41_ld64,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
}
14101
// Single 4x4 configuration at k == 8 with cm_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__SSE41_LD64, strided_cm) {
  TEST_REQUIRES_X86_SSE41;
  GemmMicrokernelTester()
      .mr(4).nr(4).kr(2).sr(4)
      .m(4).n(4).k(8)
      .cm_stride(7)
      .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__sse41_ld64,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
}
14115
// 4x4 configuration with a_zero_point forced to 0; k steps through
// 1, 10, 19, 28, 37.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__SSE41_LD64, no_a_zero_point) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t kc = 1; kc <= 40; kc += 9) {
    GemmMicrokernelTester()
        .mr(4).nr(4).kr(2).sr(4)
        .m(4).n(4).k(kc)
        .a_zero_point(0)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__sse41_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
  }
}
14131
// 4x4 configuration with b_zero_point forced to 0; k steps through
// 1, 10, 19, 28, 37.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__SSE41_LD64, no_b_zero_point) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t kc = 1; kc <= 40; kc += 9) {
    GemmMicrokernelTester()
        .mr(4).nr(4).kr(2).sr(4)
        .m(4).n(4).k(kc)
        .b_zero_point(0)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__sse41_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
  }
}
14147
// 4x4 configuration with both a_zero_point and b_zero_point forced to 0;
// k steps through 1, 10, 19, 28, 37.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__SSE41_LD64, no_zero_point) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t kc = 1; kc <= 40; kc += 9) {
    GemmMicrokernelTester()
        .mr(4).nr(4).kr(2).sr(4)
        .m(4).n(4).k(kc)
        .a_zero_point(0)
        .b_zero_point(0)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__sse41_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
  }
}
14164 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
14165
14166
14167 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Single configuration: m == mr == 1, n == nr == 4, k == 8.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__AVX_LD64, k_eq_8) {
  TEST_REQUIRES_X86_AVX;
  GemmMicrokernelTester()
      .mr(1).nr(4).kr(2).sr(4)
      .m(1).n(4).k(8)
      .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__avx_ld64,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
}
14180
// Single 1x4 configuration at k == 8 with cn_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__AVX_LD64, strided_cn) {
  TEST_REQUIRES_X86_AVX;
  GemmMicrokernelTester()
      .mr(1).nr(4).kr(2).sr(4)
      .m(1).n(4).k(8)
      .cn_stride(7)
      .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__avx_ld64,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
}
14194
// Single 1x4 configuration at k == 8 with a_stride set to 11.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__AVX_LD64, k_eq_8_strided_a) {
  TEST_REQUIRES_X86_AVX;
  GemmMicrokernelTester()
      .mr(1).nr(4).kr(2).sr(4)
      .m(1).n(4).k(8)
      .a_stride(11)
      .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__avx_ld64,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
}
14208
// k == 8 with n in 1..4 and m in 1..1 (the m loop runs once because mr == 1);
// one iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__AVX_LD64, k_eq_8_subtile) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t nc = 1; nc <= 4; ++nc) {
    for (uint32_t mc = 1; mc <= 1; ++mc) {
      GemmMicrokernelTester()
          .mr(1).nr(4).kr(2).sr(4)
          .m(mc).n(nc).k(8)
          .iterations(1)
          .Test(
              xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__avx_ld64,
              xnn_init_qu8_conv_minmax_fp32_sse2_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
14226
// k == 8, n == 4, and m in 1..1 (a single pass because mr == 1); one
// iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__AVX_LD64, k_eq_8_subtile_m) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t mc = 1; mc <= 1; ++mc) {
    GemmMicrokernelTester()
        .mr(1).nr(4).kr(2).sr(4)
        .m(mc).n(4).k(8)
        .iterations(1)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__avx_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
  }
}
14242
// k == 8, m == 1, and n in 1..4; one iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__AVX_LD64, k_eq_8_subtile_n) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t nc = 1; nc <= 4; ++nc) {
    GemmMicrokernelTester()
        .mr(1).nr(4).kr(2).sr(4)
        .m(1).n(nc).k(8)
        .iterations(1)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__avx_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
  }
}
14258
// 1x4 configuration with k taking each value from 1 through 7.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__AVX_LD64, k_lt_8) {
  TEST_REQUIRES_X86_AVX;
  for (size_t kc = 1; kc < 8; ++kc) {
    GemmMicrokernelTester()
        .mr(1).nr(4).kr(2).sr(4)
        .m(1).n(4).k(kc)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__avx_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
  }
}
14273
// 1x4 configuration with k in 1..7 and a_stride set to 11.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__AVX_LD64, k_lt_8_strided_a) {
  TEST_REQUIRES_X86_AVX;
  for (size_t kc = 1; kc < 8; ++kc) {
    GemmMicrokernelTester()
        .mr(1).nr(4).kr(2).sr(4)
        .m(1).n(4).k(kc)
        .a_stride(11)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__avx_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
  }
}
14289
// k in 1..7 crossed with n in 1..4 and m in 1..1; one iteration per
// configuration.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__AVX_LD64, k_lt_8_subtile) {
  TEST_REQUIRES_X86_AVX;
  for (size_t kc = 1; kc < 8; ++kc) {
    for (uint32_t nc = 1; nc <= 4; ++nc) {
      for (uint32_t mc = 1; mc <= 1; ++mc) {
        GemmMicrokernelTester()
            .mr(1).nr(4).kr(2).sr(4)
            .m(mc).n(nc).k(kc)
            .iterations(1)
            .Test(
                xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__avx_ld64,
                xnn_init_qu8_conv_minmax_fp32_sse2_params,
                xnn_qu8_requantize_fp32);
      }
    }
  }
}
14309
// 1x4 configuration with k taking each value from 9 through 15.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__AVX_LD64, k_gt_8) {
  TEST_REQUIRES_X86_AVX;
  for (size_t kc = 9; kc < 16; ++kc) {
    GemmMicrokernelTester()
        .mr(1).nr(4).kr(2).sr(4)
        .m(1).n(4).k(kc)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__avx_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
  }
}
14324
// 1x4 configuration with k in 9..15 and a_stride set to 19.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__AVX_LD64, k_gt_8_strided_a) {
  TEST_REQUIRES_X86_AVX;
  for (size_t kc = 9; kc < 16; ++kc) {
    GemmMicrokernelTester()
        .mr(1).nr(4).kr(2).sr(4)
        .m(1).n(4).k(kc)
        .a_stride(19)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__avx_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
  }
}
14340
// k in 9..15 crossed with n in 1..4 and m in 1..1; one iteration per
// configuration.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__AVX_LD64, k_gt_8_subtile) {
  TEST_REQUIRES_X86_AVX;
  for (size_t kc = 9; kc < 16; ++kc) {
    for (uint32_t nc = 1; nc <= 4; ++nc) {
      for (uint32_t mc = 1; mc <= 1; ++mc) {
        GemmMicrokernelTester()
            .mr(1).nr(4).kr(2).sr(4)
            .m(mc).n(nc).k(kc)
            .iterations(1)
            .Test(
                xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__avx_ld64,
                xnn_init_qu8_conv_minmax_fp32_sse2_params,
                xnn_qu8_requantize_fp32);
      }
    }
  }
}
14360
// 1x4 configuration with k visiting every multiple of 8 from 16 through 80.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__AVX_LD64, k_div_8) {
  TEST_REQUIRES_X86_AVX;
  for (size_t kc = 16; kc <= 80; kc += 8) {
    GemmMicrokernelTester()
        .mr(1).nr(4).kr(2).sr(4)
        .m(1).n(4).k(kc)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__avx_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
  }
}
14375
// 1x4 configuration with k in 16, 24, ..., 80 and a_stride set to 83.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__AVX_LD64, k_div_8_strided_a) {
  TEST_REQUIRES_X86_AVX;
  for (size_t kc = 16; kc <= 80; kc += 8) {
    GemmMicrokernelTester()
        .mr(1).nr(4).kr(2).sr(4)
        .m(1).n(4).k(kc)
        .a_stride(83)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__avx_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
  }
}
14391
// k in 16, 24, ..., 80 crossed with n in 1..4 and m in 1..1; one iteration
// per configuration.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__AVX_LD64, k_div_8_subtile) {
  TEST_REQUIRES_X86_AVX;
  for (size_t kc = 16; kc <= 80; kc += 8) {
    for (uint32_t nc = 1; nc <= 4; ++nc) {
      for (uint32_t mc = 1; mc <= 1; ++mc) {
        GemmMicrokernelTester()
            .mr(1).nr(4).kr(2).sr(4)
            .m(mc).n(nc).k(kc)
            .iterations(1)
            .Test(
                xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__avx_ld64,
                xnn_init_qu8_conv_minmax_fp32_sse2_params,
                xnn_qu8_requantize_fp32);
      }
    }
  }
}
14411
// n takes the values 5, 6, 7 (above nr == 4); for each, k steps through
// 1, 10, 19, 28, 37 with m fixed at 1.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__AVX_LD64, n_gt_4) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t nc = 5; nc < 8; ++nc) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      GemmMicrokernelTester()
          .mr(1).nr(4).kr(2).sr(4)
          .m(1).n(nc).k(kc)
          .Test(
              xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__avx_ld64,
              xnn_init_qu8_conv_minmax_fp32_sse2_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
14428
// n in 5..7 and k in 1, 10, ..., 37 with m fixed at 1 and cn_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__AVX_LD64, n_gt_4_strided_cn) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t nc = 5; nc < 8; ++nc) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      GemmMicrokernelTester()
          .mr(1).nr(4).kr(2).sr(4)
          .m(1).n(nc).k(kc)
          .cn_stride(7)
          .Test(
              xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__avx_ld64,
              xnn_init_qu8_conv_minmax_fp32_sse2_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
14446
// n in 5..7 and k in 1, 10, ..., 37 with m fixed at 1 and a_stride set to 43.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__AVX_LD64, n_gt_4_strided_a) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t nc = 5; nc < 8; ++nc) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      GemmMicrokernelTester()
          .mr(1).nr(4).kr(2).sr(4)
          .m(1).n(nc).k(kc)
          .a_stride(43)
          .Test(
              xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__avx_ld64,
              xnn_init_qu8_conv_minmax_fp32_sse2_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
14464
// n in 5..7, k in 1, 10, ..., 37, and m in 1..1; each configuration is run
// for a single iteration.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__AVX_LD64, n_gt_4_subtile) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t nc = 5; nc < 8; ++nc) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      for (uint32_t mc = 1; mc <= 1; ++mc) {
        GemmMicrokernelTester()
            .mr(1).nr(4).kr(2).sr(4)
            .m(mc).n(nc).k(kc)
            .iterations(1)
            .Test(
                xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__avx_ld64,
                xnn_init_qu8_conv_minmax_fp32_sse2_params,
                xnn_qu8_requantize_fp32);
      }
    }
  }
}
14484
// n takes the values 8 and 12 (multiples of nr == 4); for each, k steps
// through 1, 10, 19, 28, 37 with m fixed at 1.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__AVX_LD64, n_div_4) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t nc = 8; nc <= 12; nc += 4) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      GemmMicrokernelTester()
          .mr(1).nr(4).kr(2).sr(4)
          .m(1).n(nc).k(kc)
          .Test(
              xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__avx_ld64,
              xnn_init_qu8_conv_minmax_fp32_sse2_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
14501
// n in {8, 12} and k in 1, 10, ..., 37 with m fixed at 1 and cn_stride set
// to 7.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__AVX_LD64, n_div_4_strided_cn) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t nc = 8; nc <= 12; nc += 4) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      GemmMicrokernelTester()
          .mr(1).nr(4).kr(2).sr(4)
          .m(1).n(nc).k(kc)
          .cn_stride(7)
          .Test(
              xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__avx_ld64,
              xnn_init_qu8_conv_minmax_fp32_sse2_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
14519
// n in {8, 12} and k in 1, 10, ..., 37 with m fixed at 1 and a_stride set
// to 43.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__AVX_LD64, n_div_4_strided_a) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t nc = 8; nc <= 12; nc += 4) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      GemmMicrokernelTester()
          .mr(1).nr(4).kr(2).sr(4)
          .m(1).n(nc).k(kc)
          .a_stride(43)
          .Test(
              xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__avx_ld64,
              xnn_init_qu8_conv_minmax_fp32_sse2_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
14537
// n in {8, 12}, k in 1, 10, ..., 37, and m in 1..1; each configuration is run
// for a single iteration.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__AVX_LD64, n_div_4_subtile) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t nc = 8; nc <= 12; nc += 4) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      for (uint32_t mc = 1; mc <= 1; ++mc) {
        GemmMicrokernelTester()
            .mr(1).nr(4).kr(2).sr(4)
            .m(mc).n(nc).k(kc)
            .iterations(1)
            .Test(
                xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__avx_ld64,
                xnn_init_qu8_conv_minmax_fp32_sse2_params,
                xnn_qu8_requantize_fp32);
      }
    }
  }
}
14557
// k in 1, 10, ..., 37 crossed with n in 1..4 and m in 1..1, using
// cm_stride == 7 and a single iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__AVX_LD64, strided_cm_subtile) {
  TEST_REQUIRES_X86_AVX;
  for (size_t kc = 1; kc <= 40; kc += 9) {
    for (uint32_t nc = 1; nc <= 4; ++nc) {
      for (uint32_t mc = 1; mc <= 1; ++mc) {
        GemmMicrokernelTester()
            .mr(1).nr(4).kr(2).sr(4)
            .m(mc).n(nc).k(kc)
            .cm_stride(7)
            .iterations(1)
            .Test(
                xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__avx_ld64,
                xnn_init_qu8_conv_minmax_fp32_sse2_params,
                xnn_qu8_requantize_fp32);
      }
    }
  }
}
14578
// Single 1x4 configuration at k == 8 with qmin set to 128.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__AVX_LD64, qmin) {
  TEST_REQUIRES_X86_AVX;
  GemmMicrokernelTester()
      .mr(1).nr(4).kr(2).sr(4)
      .m(1).n(4).k(8)
      .qmin(128)
      .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__avx_ld64,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
}
14592
// Single 1x4 configuration at k == 8 with qmax set to 128.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__AVX_LD64, qmax) {
  TEST_REQUIRES_X86_AVX;
  GemmMicrokernelTester()
      .mr(1).nr(4).kr(2).sr(4)
      .m(1).n(4).k(8)
      .qmax(128)
      .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__avx_ld64,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
}
14606
// Single 1x4 configuration at k == 8 with cm_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__AVX_LD64, strided_cm) {
  TEST_REQUIRES_X86_AVX;
  GemmMicrokernelTester()
      .mr(1).nr(4).kr(2).sr(4)
      .m(1).n(4).k(8)
      .cm_stride(7)
      .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__avx_ld64,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
}
14620
// 1x4 configuration with a_zero_point forced to 0; k steps through
// 1, 10, 19, 28, 37.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__AVX_LD64, no_a_zero_point) {
  TEST_REQUIRES_X86_AVX;
  for (size_t kc = 1; kc <= 40; kc += 9) {
    GemmMicrokernelTester()
        .mr(1).nr(4).kr(2).sr(4)
        .m(1).n(4).k(kc)
        .a_zero_point(0)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__avx_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
  }
}
14636
// 1x4 configuration with b_zero_point forced to 0; k steps through
// 1, 10, 19, 28, 37.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__AVX_LD64, no_b_zero_point) {
  TEST_REQUIRES_X86_AVX;
  for (size_t kc = 1; kc <= 40; kc += 9) {
    GemmMicrokernelTester()
        .mr(1).nr(4).kr(2).sr(4)
        .m(1).n(4).k(kc)
        .b_zero_point(0)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__avx_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
  }
}
14652
// 1x4 configuration with both a_zero_point and b_zero_point forced to 0;
// k steps through 1, 10, 19, 28, 37.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__AVX_LD64, no_zero_point) {
  TEST_REQUIRES_X86_AVX;
  for (size_t kc = 1; kc <= 40; kc += 9) {
    GemmMicrokernelTester()
        .mr(1).nr(4).kr(2).sr(4)
        .m(1).n(4).k(kc)
        .a_zero_point(0)
        .b_zero_point(0)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__avx_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
  }
}
14669 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
14670
14671
14672 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Single configuration: m == mr == 1, n == nr == 4, k == 8.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__XOP_LD64, k_eq_8) {
  TEST_REQUIRES_X86_XOP;
  GemmMicrokernelTester()
      .mr(1).nr(4).kr(2).sr(4)
      .m(1).n(4).k(8)
      .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__xop_ld64,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
}
14685
// Single 1x4 configuration at k == 8 with cn_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__XOP_LD64, strided_cn) {
  TEST_REQUIRES_X86_XOP;
  GemmMicrokernelTester()
      .mr(1).nr(4).kr(2).sr(4)
      .m(1).n(4).k(8)
      .cn_stride(7)
      .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__xop_ld64,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
}
14699
// Single 1x4 configuration at k == 8 with a_stride set to 11.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__XOP_LD64, k_eq_8_strided_a) {
  TEST_REQUIRES_X86_XOP;
  GemmMicrokernelTester()
      .mr(1).nr(4).kr(2).sr(4)
      .m(1).n(4).k(8)
      .a_stride(11)
      .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__xop_ld64,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
}
14713
// k == 8 with n in 1..4 and m in 1..1 (the m loop runs once because mr == 1);
// one iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__XOP_LD64, k_eq_8_subtile) {
  TEST_REQUIRES_X86_XOP;
  for (uint32_t nc = 1; nc <= 4; ++nc) {
    for (uint32_t mc = 1; mc <= 1; ++mc) {
      GemmMicrokernelTester()
          .mr(1).nr(4).kr(2).sr(4)
          .m(mc).n(nc).k(8)
          .iterations(1)
          .Test(
              xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__xop_ld64,
              xnn_init_qu8_conv_minmax_fp32_sse2_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
14731
// k == 8, n == 4, and m in 1..1 (a single pass because mr == 1); one
// iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__XOP_LD64, k_eq_8_subtile_m) {
  TEST_REQUIRES_X86_XOP;
  for (uint32_t mc = 1; mc <= 1; ++mc) {
    GemmMicrokernelTester()
        .mr(1).nr(4).kr(2).sr(4)
        .m(mc).n(4).k(8)
        .iterations(1)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__xop_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
  }
}
14747
// k == 8, m == 1, and n in 1..4; one iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__XOP_LD64, k_eq_8_subtile_n) {
  TEST_REQUIRES_X86_XOP;
  for (uint32_t nc = 1; nc <= 4; ++nc) {
    GemmMicrokernelTester()
        .mr(1).nr(4).kr(2).sr(4)
        .m(1).n(nc).k(8)
        .iterations(1)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__xop_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
  }
}
14763
// 1x4 configuration with k taking each value from 1 through 7.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__XOP_LD64, k_lt_8) {
  TEST_REQUIRES_X86_XOP;
  for (size_t kc = 1; kc < 8; ++kc) {
    GemmMicrokernelTester()
        .mr(1).nr(4).kr(2).sr(4)
        .m(1).n(4).k(kc)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__xop_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
  }
}
14778
// 1x4 configuration with k in 1..7 and a_stride set to 11.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__XOP_LD64, k_lt_8_strided_a) {
  TEST_REQUIRES_X86_XOP;
  for (size_t kc = 1; kc < 8; ++kc) {
    GemmMicrokernelTester()
        .mr(1).nr(4).kr(2).sr(4)
        .m(1).n(4).k(kc)
        .a_stride(11)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__xop_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
  }
}
14794
// k in 1..7 crossed with n in 1..4 and m in 1..1; one iteration per
// configuration.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__XOP_LD64, k_lt_8_subtile) {
  TEST_REQUIRES_X86_XOP;
  for (size_t kc = 1; kc < 8; ++kc) {
    for (uint32_t nc = 1; nc <= 4; ++nc) {
      for (uint32_t mc = 1; mc <= 1; ++mc) {
        GemmMicrokernelTester()
            .mr(1).nr(4).kr(2).sr(4)
            .m(mc).n(nc).k(kc)
            .iterations(1)
            .Test(
                xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__xop_ld64,
                xnn_init_qu8_conv_minmax_fp32_sse2_params,
                xnn_qu8_requantize_fp32);
      }
    }
  }
}
14814
// 1x4 configuration with k taking each value from 9 through 15.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__XOP_LD64, k_gt_8) {
  TEST_REQUIRES_X86_XOP;
  for (size_t kc = 9; kc < 16; ++kc) {
    GemmMicrokernelTester()
        .mr(1).nr(4).kr(2).sr(4)
        .m(1).n(4).k(kc)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__xop_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
  }
}
14829
// 1x4 configuration with k in 9..15 and a_stride set to 19.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__XOP_LD64, k_gt_8_strided_a) {
  TEST_REQUIRES_X86_XOP;
  for (size_t kc = 9; kc < 16; ++kc) {
    GemmMicrokernelTester()
        .mr(1).nr(4).kr(2).sr(4)
        .m(1).n(4).k(kc)
        .a_stride(19)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__xop_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
  }
}
14845
// k in 9..15 crossed with n in 1..4 and m in 1..1; one iteration per
// configuration.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__XOP_LD64, k_gt_8_subtile) {
  TEST_REQUIRES_X86_XOP;
  for (size_t kc = 9; kc < 16; ++kc) {
    for (uint32_t nc = 1; nc <= 4; ++nc) {
      for (uint32_t mc = 1; mc <= 1; ++mc) {
        GemmMicrokernelTester()
            .mr(1).nr(4).kr(2).sr(4)
            .m(mc).n(nc).k(kc)
            .iterations(1)
            .Test(
                xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__xop_ld64,
                xnn_init_qu8_conv_minmax_fp32_sse2_params,
                xnn_qu8_requantize_fp32);
      }
    }
  }
}
14865
// 1x4 configuration with k visiting every multiple of 8 from 16 through 80.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__XOP_LD64, k_div_8) {
  TEST_REQUIRES_X86_XOP;
  for (size_t kc = 16; kc <= 80; kc += 8) {
    GemmMicrokernelTester()
        .mr(1).nr(4).kr(2).sr(4)
        .m(1).n(4).k(kc)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__xop_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
  }
}
14880
// 1x4 configuration with k in 16, 24, ..., 80 and a_stride set to 83.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__XOP_LD64, k_div_8_strided_a) {
  TEST_REQUIRES_X86_XOP;
  for (size_t kc = 16; kc <= 80; kc += 8) {
    GemmMicrokernelTester()
        .mr(1).nr(4).kr(2).sr(4)
        .m(1).n(4).k(kc)
        .a_stride(83)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__xop_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
  }
}
14896
// k in 16, 24, ..., 80 crossed with n in 1..4 and m in 1..1; one iteration
// per configuration.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__XOP_LD64, k_div_8_subtile) {
  TEST_REQUIRES_X86_XOP;
  for (size_t kc = 16; kc <= 80; kc += 8) {
    for (uint32_t nc = 1; nc <= 4; ++nc) {
      for (uint32_t mc = 1; mc <= 1; ++mc) {
        GemmMicrokernelTester()
            .mr(1).nr(4).kr(2).sr(4)
            .m(mc).n(nc).k(kc)
            .iterations(1)
            .Test(
                xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__xop_ld64,
                xnn_init_qu8_conv_minmax_fp32_sse2_params,
                xnn_qu8_requantize_fp32);
      }
    }
  }
}
14916
// n takes the values 5, 6, 7 (above nr == 4); for each, k steps through
// 1, 10, 19, 28, 37 with m fixed at 1.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__XOP_LD64, n_gt_4) {
  TEST_REQUIRES_X86_XOP;
  for (uint32_t nc = 5; nc < 8; ++nc) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      GemmMicrokernelTester()
          .mr(1).nr(4).kr(2).sr(4)
          .m(1).n(nc).k(kc)
          .Test(
              xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__xop_ld64,
              xnn_init_qu8_conv_minmax_fp32_sse2_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
14933
// n in 5..7 and k in 1, 10, ..., 37 with m fixed at 1 and cn_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__XOP_LD64, n_gt_4_strided_cn) {
  TEST_REQUIRES_X86_XOP;
  for (uint32_t nc = 5; nc < 8; ++nc) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      GemmMicrokernelTester()
          .mr(1).nr(4).kr(2).sr(4)
          .m(1).n(nc).k(kc)
          .cn_stride(7)
          .Test(
              xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__xop_ld64,
              xnn_init_qu8_conv_minmax_fp32_sse2_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
14951
// n in 5..7 and k in 1, 10, ..., 37 with m fixed at 1 and a_stride set to 43.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__XOP_LD64, n_gt_4_strided_a) {
  TEST_REQUIRES_X86_XOP;
  for (uint32_t nc = 5; nc < 8; ++nc) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      GemmMicrokernelTester()
          .mr(1).nr(4).kr(2).sr(4)
          .m(1).n(nc).k(kc)
          .a_stride(43)
          .Test(
              xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__xop_ld64,
              xnn_init_qu8_conv_minmax_fp32_sse2_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
14969
// n in 5..7, k in 1, 10, ..., 37, and m in 1..1; each configuration is run
// for a single iteration.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__XOP_LD64, n_gt_4_subtile) {
  TEST_REQUIRES_X86_XOP;
  for (uint32_t nc = 5; nc < 8; ++nc) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      for (uint32_t mc = 1; mc <= 1; ++mc) {
        GemmMicrokernelTester()
            .mr(1).nr(4).kr(2).sr(4)
            .m(mc).n(nc).k(kc)
            .iterations(1)
            .Test(
                xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__xop_ld64,
                xnn_init_qu8_conv_minmax_fp32_sse2_params,
                xnn_qu8_requantize_fp32);
      }
    }
  }
}
14989
// Sweeps n = 8, 12 and k = 1, 10, 19, 28, 37 with m fixed at 1.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__XOP_LD64, n_div_4) {
  TEST_REQUIRES_X86_XOP;
  for (uint32_t n_dim = 8; n_dim <= 12; n_dim += 4) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      auto tester = GemmMicrokernelTester()
          .mr(1).nr(4).kr(2).sr(4)
          .m(1).n(n_dim).k(k_dim);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__xop_ld64,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
15006
// Sweeps n = 8, 12 and k = 1, 10, 19, 28, 37 with m = 1 and cn_stride(7).
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__XOP_LD64, n_div_4_strided_cn) {
  TEST_REQUIRES_X86_XOP;
  for (uint32_t n_dim = 8; n_dim <= 12; n_dim += 4) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      auto tester = GemmMicrokernelTester()
          .mr(1).nr(4).kr(2).sr(4)
          .m(1).n(n_dim).k(k_dim)
          .cn_stride(7);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__xop_ld64,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
15024
// Sweeps n = 8, 12 and k = 1, 10, 19, 28, 37 with m = 1 and a_stride(43).
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__XOP_LD64, n_div_4_strided_a) {
  TEST_REQUIRES_X86_XOP;
  for (uint32_t n_dim = 8; n_dim <= 12; n_dim += 4) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      auto tester = GemmMicrokernelTester()
          .mr(1).nr(4).kr(2).sr(4)
          .m(1).n(n_dim).k(k_dim)
          .a_stride(43);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__xop_ld64,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
15042
// Sweeps n = 8, 12, k = 1, 10, 19, 28, 37 and m = 1, each with iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__XOP_LD64, n_div_4_subtile) {
  TEST_REQUIRES_X86_XOP;
  for (uint32_t n_dim = 8; n_dim <= 12; n_dim += 4) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      for (uint32_t m_dim = 1; m_dim <= 1; ++m_dim) {
        auto tester = GemmMicrokernelTester()
            .mr(1).nr(4).kr(2).sr(4)
            .m(m_dim).n(n_dim).k(k_dim)
            .iterations(1);
        tester.Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__xop_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
15062
// Sweeps k = 1, 10, 19, 28, 37, n = 1..4 and m = 1 with cm_stride(7) and iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__XOP_LD64, strided_cm_subtile) {
  TEST_REQUIRES_X86_XOP;
  for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
    for (uint32_t n_dim = 1; n_dim <= 4; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 1; ++m_dim) {
        auto tester = GemmMicrokernelTester()
            .mr(1).nr(4).kr(2).sr(4)
            .m(m_dim).n(n_dim).k(k_dim)
            .cm_stride(7)
            .iterations(1);
        tester.Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__xop_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
15083
// One configuration: m = 1, n = 4, k = 8 with qmin(128).
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__XOP_LD64, qmin) {
  TEST_REQUIRES_X86_XOP;
  auto tester = GemmMicrokernelTester()
      .mr(1).nr(4).kr(2).sr(4)
      .m(1).n(4).k(8)
      .qmin(128);
  tester.Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__xop_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
15097
// One configuration: m = 1, n = 4, k = 8 with qmax(128).
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__XOP_LD64, qmax) {
  TEST_REQUIRES_X86_XOP;
  auto tester = GemmMicrokernelTester()
      .mr(1).nr(4).kr(2).sr(4)
      .m(1).n(4).k(8)
      .qmax(128);
  tester.Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__xop_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
15111
// One configuration: m = 1, n = 4, k = 8 with cm_stride(7).
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__XOP_LD64, strided_cm) {
  TEST_REQUIRES_X86_XOP;
  auto tester = GemmMicrokernelTester()
      .mr(1).nr(4).kr(2).sr(4)
      .m(1).n(4).k(8)
      .cm_stride(7);
  tester.Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__xop_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
15125
// Sweeps k = 1, 10, 19, 28, 37 at m = 1, n = 4 with a_zero_point(0).
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__XOP_LD64, no_a_zero_point) {
  TEST_REQUIRES_X86_XOP;
  for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
    auto tester = GemmMicrokernelTester()
        .mr(1).nr(4).kr(2).sr(4)
        .m(1).n(4).k(k_dim)
        .a_zero_point(0);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__xop_ld64,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
15141
// Sweeps k = 1, 10, 19, 28, 37 at m = 1, n = 4 with b_zero_point(0).
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__XOP_LD64, no_b_zero_point) {
  TEST_REQUIRES_X86_XOP;
  for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
    auto tester = GemmMicrokernelTester()
        .mr(1).nr(4).kr(2).sr(4)
        .m(1).n(4).k(k_dim)
        .b_zero_point(0);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__xop_ld64,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
15157
// Sweeps k = 1, 10, 19, 28, 37 at m = 1, n = 4 with a_zero_point(0) and b_zero_point(0).
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__XOP_LD64, no_zero_point) {
  TEST_REQUIRES_X86_XOP;
  for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
    auto tester = GemmMicrokernelTester()
        .mr(1).nr(4).kr(2).sr(4)
        .m(1).n(4).k(k_dim)
        .a_zero_point(0)
        .b_zero_point(0);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__xop_ld64,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
15174 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
15175
15176
15177 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// One configuration: m = 2, n = 4, k = 8.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__AVX_LD64, k_eq_8) {
  TEST_REQUIRES_X86_AVX;
  auto tester = GemmMicrokernelTester()
      .mr(2).nr(4).kr(2).sr(4)
      .m(2).n(4).k(8);
  tester.Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__avx_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
15190
// One configuration: m = 2, n = 4, k = 8 with cn_stride(7).
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__AVX_LD64, strided_cn) {
  TEST_REQUIRES_X86_AVX;
  auto tester = GemmMicrokernelTester()
      .mr(2).nr(4).kr(2).sr(4)
      .m(2).n(4).k(8)
      .cn_stride(7);
  tester.Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__avx_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
15204
// One configuration: m = 2, n = 4, k = 8 with a_stride(11).
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__AVX_LD64, k_eq_8_strided_a) {
  TEST_REQUIRES_X86_AVX;
  auto tester = GemmMicrokernelTester()
      .mr(2).nr(4).kr(2).sr(4)
      .m(2).n(4).k(8)
      .a_stride(11);
  tester.Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__avx_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
15218
// Sweeps n = 1..4 and m = 1..2 at k = 8, each with iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__AVX_LD64, k_eq_8_subtile) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t n_dim = 1; n_dim <= 4; ++n_dim) {
    for (uint32_t m_dim = 1; m_dim <= 2; ++m_dim) {
      auto tester = GemmMicrokernelTester()
          .mr(2).nr(4).kr(2).sr(4)
          .m(m_dim).n(n_dim).k(8)
          .iterations(1);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__avx_ld64,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
15236
// Sweeps m = 1..2 at n = 4, k = 8, each with iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__AVX_LD64, k_eq_8_subtile_m) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t m_dim = 1; m_dim <= 2; ++m_dim) {
    auto tester = GemmMicrokernelTester()
        .mr(2).nr(4).kr(2).sr(4)
        .m(m_dim).n(4).k(8)
        .iterations(1);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__avx_ld64,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
15252
// Sweeps n = 1..4 at m = 2, k = 8, each with iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__AVX_LD64, k_eq_8_subtile_n) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t n_dim = 1; n_dim <= 4; ++n_dim) {
    auto tester = GemmMicrokernelTester()
        .mr(2).nr(4).kr(2).sr(4)
        .m(2).n(n_dim).k(8)
        .iterations(1);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__avx_ld64,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
15268
// Sweeps k = 1..7 at m = 2, n = 4.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__AVX_LD64, k_lt_8) {
  TEST_REQUIRES_X86_AVX;
  for (size_t k_dim = 1; k_dim <= 7; ++k_dim) {
    auto tester = GemmMicrokernelTester()
        .mr(2).nr(4).kr(2).sr(4)
        .m(2).n(4).k(k_dim);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__avx_ld64,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
15283
// Sweeps k = 1..7 at m = 2, n = 4 with a_stride(11).
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__AVX_LD64, k_lt_8_strided_a) {
  TEST_REQUIRES_X86_AVX;
  for (size_t k_dim = 1; k_dim <= 7; ++k_dim) {
    auto tester = GemmMicrokernelTester()
        .mr(2).nr(4).kr(2).sr(4)
        .m(2).n(4).k(k_dim)
        .a_stride(11);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__avx_ld64,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
15299
// Sweeps k = 1..7, n = 1..4 and m = 1..2, each with iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__AVX_LD64, k_lt_8_subtile) {
  TEST_REQUIRES_X86_AVX;
  for (size_t k_dim = 1; k_dim <= 7; ++k_dim) {
    for (uint32_t n_dim = 1; n_dim <= 4; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 2; ++m_dim) {
        auto tester = GemmMicrokernelTester()
            .mr(2).nr(4).kr(2).sr(4)
            .m(m_dim).n(n_dim).k(k_dim)
            .iterations(1);
        tester.Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__avx_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
15319
// Sweeps k = 9..15 at m = 2, n = 4.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__AVX_LD64, k_gt_8) {
  TEST_REQUIRES_X86_AVX;
  for (size_t k_dim = 9; k_dim <= 15; ++k_dim) {
    auto tester = GemmMicrokernelTester()
        .mr(2).nr(4).kr(2).sr(4)
        .m(2).n(4).k(k_dim);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__avx_ld64,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
15334
// Sweeps k = 9..15 at m = 2, n = 4 with a_stride(19).
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__AVX_LD64, k_gt_8_strided_a) {
  TEST_REQUIRES_X86_AVX;
  for (size_t k_dim = 9; k_dim <= 15; ++k_dim) {
    auto tester = GemmMicrokernelTester()
        .mr(2).nr(4).kr(2).sr(4)
        .m(2).n(4).k(k_dim)
        .a_stride(19);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__avx_ld64,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
15350
// Sweeps k = 9..15, n = 1..4 and m = 1..2, each with iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__AVX_LD64, k_gt_8_subtile) {
  TEST_REQUIRES_X86_AVX;
  for (size_t k_dim = 9; k_dim <= 15; ++k_dim) {
    for (uint32_t n_dim = 1; n_dim <= 4; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 2; ++m_dim) {
        auto tester = GemmMicrokernelTester()
            .mr(2).nr(4).kr(2).sr(4)
            .m(m_dim).n(n_dim).k(k_dim)
            .iterations(1);
        tester.Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__avx_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
15370
// Sweeps k = 16, 24, ..., 80 at m = 2, n = 4.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__AVX_LD64, k_div_8) {
  TEST_REQUIRES_X86_AVX;
  for (size_t k_dim = 16; k_dim <= 80; k_dim += 8) {
    auto tester = GemmMicrokernelTester()
        .mr(2).nr(4).kr(2).sr(4)
        .m(2).n(4).k(k_dim);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__avx_ld64,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
15385
// Sweeps k = 16, 24, ..., 80 at m = 2, n = 4 with a_stride(83).
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__AVX_LD64, k_div_8_strided_a) {
  TEST_REQUIRES_X86_AVX;
  for (size_t k_dim = 16; k_dim <= 80; k_dim += 8) {
    auto tester = GemmMicrokernelTester()
        .mr(2).nr(4).kr(2).sr(4)
        .m(2).n(4).k(k_dim)
        .a_stride(83);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__avx_ld64,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
15401
// Sweeps k = 16, 24, ..., 80, n = 1..4 and m = 1..2, each with iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__AVX_LD64, k_div_8_subtile) {
  TEST_REQUIRES_X86_AVX;
  for (size_t k_dim = 16; k_dim <= 80; k_dim += 8) {
    for (uint32_t n_dim = 1; n_dim <= 4; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 2; ++m_dim) {
        auto tester = GemmMicrokernelTester()
            .mr(2).nr(4).kr(2).sr(4)
            .m(m_dim).n(n_dim).k(k_dim)
            .iterations(1);
        tester.Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__avx_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
15421
// Sweeps n = 5..7 and k = 1, 10, 19, 28, 37 with m fixed at 2.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__AVX_LD64, n_gt_4) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t n_dim = 5; n_dim <= 7; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      auto tester = GemmMicrokernelTester()
          .mr(2).nr(4).kr(2).sr(4)
          .m(2).n(n_dim).k(k_dim);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__avx_ld64,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
15438
// Sweeps n = 5..7 and k = 1, 10, 19, 28, 37 with m = 2 and cn_stride(7).
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__AVX_LD64, n_gt_4_strided_cn) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t n_dim = 5; n_dim <= 7; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      auto tester = GemmMicrokernelTester()
          .mr(2).nr(4).kr(2).sr(4)
          .m(2).n(n_dim).k(k_dim)
          .cn_stride(7);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__avx_ld64,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
15456
// Sweeps n = 5..7 and k = 1, 10, 19, 28, 37 with m = 2 and a_stride(43).
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__AVX_LD64, n_gt_4_strided_a) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t n_dim = 5; n_dim <= 7; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      auto tester = GemmMicrokernelTester()
          .mr(2).nr(4).kr(2).sr(4)
          .m(2).n(n_dim).k(k_dim)
          .a_stride(43);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__avx_ld64,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
15474
// Sweeps n = 5..7, k = 1, 10, 19, 28, 37 and m = 1..2, each with iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__AVX_LD64, n_gt_4_subtile) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t n_dim = 5; n_dim <= 7; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      for (uint32_t m_dim = 1; m_dim <= 2; ++m_dim) {
        auto tester = GemmMicrokernelTester()
            .mr(2).nr(4).kr(2).sr(4)
            .m(m_dim).n(n_dim).k(k_dim)
            .iterations(1);
        tester.Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__avx_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
15494
// Sweeps n = 8, 12 and k = 1, 10, 19, 28, 37 with m fixed at 2.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__AVX_LD64, n_div_4) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t n_dim = 8; n_dim <= 12; n_dim += 4) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      auto tester = GemmMicrokernelTester()
          .mr(2).nr(4).kr(2).sr(4)
          .m(2).n(n_dim).k(k_dim);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__avx_ld64,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
15511
// Sweeps n = 8, 12 and k = 1, 10, 19, 28, 37 with m = 2 and cn_stride(7).
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__AVX_LD64, n_div_4_strided_cn) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t n_dim = 8; n_dim <= 12; n_dim += 4) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      auto tester = GemmMicrokernelTester()
          .mr(2).nr(4).kr(2).sr(4)
          .m(2).n(n_dim).k(k_dim)
          .cn_stride(7);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__avx_ld64,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
15529
// Sweeps n = 8, 12 and k = 1, 10, 19, 28, 37 with m = 2 and a_stride(43).
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__AVX_LD64, n_div_4_strided_a) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t n_dim = 8; n_dim <= 12; n_dim += 4) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      auto tester = GemmMicrokernelTester()
          .mr(2).nr(4).kr(2).sr(4)
          .m(2).n(n_dim).k(k_dim)
          .a_stride(43);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__avx_ld64,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
15547
// Sweeps n = 8, 12, k = 1, 10, 19, 28, 37 and m = 1..2, each with iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__AVX_LD64, n_div_4_subtile) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t n_dim = 8; n_dim <= 12; n_dim += 4) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      for (uint32_t m_dim = 1; m_dim <= 2; ++m_dim) {
        auto tester = GemmMicrokernelTester()
            .mr(2).nr(4).kr(2).sr(4)
            .m(m_dim).n(n_dim).k(k_dim)
            .iterations(1);
        tester.Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__avx_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
15567
// Sweeps k = 1, 10, 19, 28, 37, n = 1..4 and m = 1..2 with cm_stride(7) and iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__AVX_LD64, strided_cm_subtile) {
  TEST_REQUIRES_X86_AVX;
  for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
    for (uint32_t n_dim = 1; n_dim <= 4; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 2; ++m_dim) {
        auto tester = GemmMicrokernelTester()
            .mr(2).nr(4).kr(2).sr(4)
            .m(m_dim).n(n_dim).k(k_dim)
            .cm_stride(7)
            .iterations(1);
        tester.Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__avx_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
15588
// One configuration: m = 2, n = 4, k = 8 with qmin(128).
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__AVX_LD64, qmin) {
  TEST_REQUIRES_X86_AVX;
  auto tester = GemmMicrokernelTester()
      .mr(2).nr(4).kr(2).sr(4)
      .m(2).n(4).k(8)
      .qmin(128);
  tester.Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__avx_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
15602
// One configuration: m = 2, n = 4, k = 8 with qmax(128).
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__AVX_LD64, qmax) {
  TEST_REQUIRES_X86_AVX;
  auto tester = GemmMicrokernelTester()
      .mr(2).nr(4).kr(2).sr(4)
      .m(2).n(4).k(8)
      .qmax(128);
  tester.Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__avx_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
15616
// One configuration: m = 2, n = 4, k = 8 with cm_stride(7).
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__AVX_LD64, strided_cm) {
  TEST_REQUIRES_X86_AVX;
  auto tester = GemmMicrokernelTester()
      .mr(2).nr(4).kr(2).sr(4)
      .m(2).n(4).k(8)
      .cm_stride(7);
  tester.Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__avx_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
15630
// Sweeps k = 1, 10, 19, 28, 37 at m = 2, n = 4 with a_zero_point(0).
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__AVX_LD64, no_a_zero_point) {
  TEST_REQUIRES_X86_AVX;
  for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
    auto tester = GemmMicrokernelTester()
        .mr(2).nr(4).kr(2).sr(4)
        .m(2).n(4).k(k_dim)
        .a_zero_point(0);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__avx_ld64,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
15646
// Sweeps k = 1, 10, 19, 28, 37 at m = 2, n = 4 with b_zero_point(0).
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__AVX_LD64, no_b_zero_point) {
  TEST_REQUIRES_X86_AVX;
  for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
    auto tester = GemmMicrokernelTester()
        .mr(2).nr(4).kr(2).sr(4)
        .m(2).n(4).k(k_dim)
        .b_zero_point(0);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__avx_ld64,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
15662
// Sweeps k = 1, 10, 19, 28, 37 at m = 2, n = 4 with a_zero_point(0) and b_zero_point(0).
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__AVX_LD64, no_zero_point) {
  TEST_REQUIRES_X86_AVX;
  for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
    auto tester = GemmMicrokernelTester()
        .mr(2).nr(4).kr(2).sr(4)
        .m(2).n(4).k(k_dim)
        .a_zero_point(0)
        .b_zero_point(0);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__avx_ld64,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
15679 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
15680
15681
15682 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// One configuration: m = 2, n = 4, k = 8.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__XOP_LD64, k_eq_8) {
  TEST_REQUIRES_X86_XOP;
  auto tester = GemmMicrokernelTester()
      .mr(2).nr(4).kr(2).sr(4)
      .m(2).n(4).k(8);
  tester.Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__xop_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
15695
// One configuration: m = 2, n = 4, k = 8 with cn_stride(7).
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__XOP_LD64, strided_cn) {
  TEST_REQUIRES_X86_XOP;
  auto tester = GemmMicrokernelTester()
      .mr(2).nr(4).kr(2).sr(4)
      .m(2).n(4).k(8)
      .cn_stride(7);
  tester.Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__xop_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
15709
// One configuration: m = 2, n = 4, k = 8 with a_stride(11).
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__XOP_LD64, k_eq_8_strided_a) {
  TEST_REQUIRES_X86_XOP;
  auto tester = GemmMicrokernelTester()
      .mr(2).nr(4).kr(2).sr(4)
      .m(2).n(4).k(8)
      .a_stride(11);
  tester.Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__xop_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
15723
// Sweeps n = 1..4 and m = 1..2 at k = 8, each with iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__XOP_LD64, k_eq_8_subtile) {
  TEST_REQUIRES_X86_XOP;
  for (uint32_t n_dim = 1; n_dim <= 4; ++n_dim) {
    for (uint32_t m_dim = 1; m_dim <= 2; ++m_dim) {
      auto tester = GemmMicrokernelTester()
          .mr(2).nr(4).kr(2).sr(4)
          .m(m_dim).n(n_dim).k(8)
          .iterations(1);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__xop_ld64,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
15741
// Sweeps m = 1..2 at n = 4, k = 8, each with iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__XOP_LD64, k_eq_8_subtile_m) {
  TEST_REQUIRES_X86_XOP;
  for (uint32_t m_dim = 1; m_dim <= 2; ++m_dim) {
    auto tester = GemmMicrokernelTester()
        .mr(2).nr(4).kr(2).sr(4)
        .m(m_dim).n(4).k(8)
        .iterations(1);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__xop_ld64,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
15757
// Sweeps n = 1..4 at m = 2, k = 8, each with iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__XOP_LD64, k_eq_8_subtile_n) {
  TEST_REQUIRES_X86_XOP;
  for (uint32_t n_dim = 1; n_dim <= 4; ++n_dim) {
    auto tester = GemmMicrokernelTester()
        .mr(2).nr(4).kr(2).sr(4)
        .m(2).n(n_dim).k(8)
        .iterations(1);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__xop_ld64,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
15773
// Sweeps k = 1..7 at m = 2, n = 4.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__XOP_LD64, k_lt_8) {
  TEST_REQUIRES_X86_XOP;
  for (size_t k_dim = 1; k_dim <= 7; ++k_dim) {
    auto tester = GemmMicrokernelTester()
        .mr(2).nr(4).kr(2).sr(4)
        .m(2).n(4).k(k_dim);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__xop_ld64,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
15788
// Sweeps k = 1..7 at m = 2, n = 4 with a_stride(11).
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__XOP_LD64, k_lt_8_strided_a) {
  TEST_REQUIRES_X86_XOP;
  for (size_t k_dim = 1; k_dim <= 7; ++k_dim) {
    auto tester = GemmMicrokernelTester()
        .mr(2).nr(4).kr(2).sr(4)
        .m(2).n(4).k(k_dim)
        .a_stride(11);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__xop_ld64,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
15804
// Sweeps k = 1..7, n = 1..4 and m = 1..2, each with iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__XOP_LD64, k_lt_8_subtile) {
  TEST_REQUIRES_X86_XOP;
  for (size_t k_dim = 1; k_dim <= 7; ++k_dim) {
    for (uint32_t n_dim = 1; n_dim <= 4; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 2; ++m_dim) {
        auto tester = GemmMicrokernelTester()
            .mr(2).nr(4).kr(2).sr(4)
            .m(m_dim).n(n_dim).k(k_dim)
            .iterations(1);
        tester.Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__xop_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
15824
// Sweeps k = 9..15 at m = 2, n = 4.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__XOP_LD64, k_gt_8) {
  TEST_REQUIRES_X86_XOP;
  for (size_t k_dim = 9; k_dim <= 15; ++k_dim) {
    auto tester = GemmMicrokernelTester()
        .mr(2).nr(4).kr(2).sr(4)
        .m(2).n(4).k(k_dim);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__xop_ld64,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
15839
// Sweeps k = 9..15 at m = 2, n = 4 with a_stride(19).
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__XOP_LD64, k_gt_8_strided_a) {
  TEST_REQUIRES_X86_XOP;
  for (size_t k_dim = 9; k_dim <= 15; ++k_dim) {
    auto tester = GemmMicrokernelTester()
        .mr(2).nr(4).kr(2).sr(4)
        .m(2).n(4).k(k_dim)
        .a_stride(19);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__xop_ld64,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
15855
// Sweeps k = 9..15, n = 1..4 and m = 1..2, each with iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__XOP_LD64, k_gt_8_subtile) {
  TEST_REQUIRES_X86_XOP;
  for (size_t k_dim = 9; k_dim <= 15; ++k_dim) {
    for (uint32_t n_dim = 1; n_dim <= 4; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 2; ++m_dim) {
        auto tester = GemmMicrokernelTester()
            .mr(2).nr(4).kr(2).sr(4)
            .m(m_dim).n(n_dim).k(k_dim)
            .iterations(1);
        tester.Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__xop_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
15875
// Sweeps k = 16, 24, ..., 80 at m = 2, n = 4.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__XOP_LD64, k_div_8) {
  TEST_REQUIRES_X86_XOP;
  for (size_t k_dim = 16; k_dim <= 80; k_dim += 8) {
    auto tester = GemmMicrokernelTester()
        .mr(2).nr(4).kr(2).sr(4)
        .m(2).n(4).k(k_dim);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__xop_ld64,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
15890
// Sweeps k = 16, 24, ..., 80 at m = 2, n = 4 with a_stride(83).
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__XOP_LD64, k_div_8_strided_a) {
  TEST_REQUIRES_X86_XOP;
  for (size_t k_dim = 16; k_dim <= 80; k_dim += 8) {
    auto tester = GemmMicrokernelTester()
        .mr(2).nr(4).kr(2).sr(4)
        .m(2).n(4).k(k_dim)
        .a_stride(83);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__xop_ld64,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
15906
// Sweeps k = 16, 24, ..., 80, n = 1..4 and m = 1..2, each with iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__XOP_LD64, k_div_8_subtile) {
  TEST_REQUIRES_X86_XOP;
  for (size_t k_dim = 16; k_dim <= 80; k_dim += 8) {
    for (uint32_t n_dim = 1; n_dim <= 4; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 2; ++m_dim) {
        auto tester = GemmMicrokernelTester()
            .mr(2).nr(4).kr(2).sr(4)
            .m(m_dim).n(n_dim).k(k_dim)
            .iterations(1);
        tester.Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__xop_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
15926
// Sweeps n = 5..7 and k = 1, 10, 19, 28, 37 with m fixed at 2.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__XOP_LD64, n_gt_4) {
  TEST_REQUIRES_X86_XOP;
  for (uint32_t n_dim = 5; n_dim <= 7; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      auto tester = GemmMicrokernelTester()
          .mr(2).nr(4).kr(2).sr(4)
          .m(2).n(n_dim).k(k_dim);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__xop_ld64,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
15943
// Sweeps n = 5..7 and k = 1, 10, 19, 28, 37 with m = 2 and cn_stride(7).
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__XOP_LD64, n_gt_4_strided_cn) {
  TEST_REQUIRES_X86_XOP;
  for (uint32_t n_dim = 5; n_dim <= 7; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      auto tester = GemmMicrokernelTester()
          .mr(2).nr(4).kr(2).sr(4)
          .m(2).n(n_dim).k(k_dim)
          .cn_stride(7);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__xop_ld64,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
15961
// Sweeps n = 5..7 and k = 1, 10, 19, 28, 37 with m = 2 and a_stride(43).
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__XOP_LD64, n_gt_4_strided_a) {
  TEST_REQUIRES_X86_XOP;
  for (uint32_t n_dim = 5; n_dim <= 7; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      auto tester = GemmMicrokernelTester()
          .mr(2).nr(4).kr(2).sr(4)
          .m(2).n(n_dim).k(k_dim)
          .a_stride(43);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__xop_ld64,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
15979
// Sweeps n = 5..7, k = 1, 10, 19, 28, 37 and m = 1..2, each with iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__XOP_LD64, n_gt_4_subtile) {
  TEST_REQUIRES_X86_XOP;
  for (uint32_t n_dim = 5; n_dim <= 7; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      for (uint32_t m_dim = 1; m_dim <= 2; ++m_dim) {
        auto tester = GemmMicrokernelTester()
            .mr(2).nr(4).kr(2).sr(4)
            .m(m_dim).n(n_dim).k(k_dim)
            .iterations(1);
        tester.Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__xop_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
15999
// Sweeps n = 8, 12 and k = 1, 10, 19, 28, 37 with m fixed at 2.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__XOP_LD64, n_div_4) {
  TEST_REQUIRES_X86_XOP;
  for (uint32_t n_dim = 8; n_dim <= 12; n_dim += 4) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      auto tester = GemmMicrokernelTester()
          .mr(2).nr(4).kr(2).sr(4)
          .m(2).n(n_dim).k(k_dim);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__xop_ld64,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
16016
// Sweeps n = 8, 12 and k = 1, 10, 19, 28, 37 with m = 2 and cn_stride(7).
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__XOP_LD64, n_div_4_strided_cn) {
  TEST_REQUIRES_X86_XOP;
  for (uint32_t n_dim = 8; n_dim <= 12; n_dim += 4) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      auto tester = GemmMicrokernelTester()
          .mr(2).nr(4).kr(2).sr(4)
          .m(2).n(n_dim).k(k_dim)
          .cn_stride(7);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__xop_ld64,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
16034
// 2x4c2s4 XOP ld64 kernel: n = 8 and 12, k = 1, 10, 19, 28, 37, m = 2,
// with a_stride overridden to 43 (larger than every k in the sweep).
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__XOP_LD64, n_div_4_strided_a) {
  TEST_REQUIRES_X86_XOP;
  for (uint32_t n_dim = 8; n_dim <= 12; n_dim += 4) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      GemmMicrokernelTester()
          .mr(2).nr(4).kr(2).sr(4)
          .m(2).n(n_dim).k(k_dim)
          .a_stride(43)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__xop_ld64,
                xnn_init_qu8_conv_minmax_fp32_sse2_params,
                xnn_qu8_requantize_fp32);
    }
  }
}
16052
// 2x4c2s4 XOP ld64 kernel: n = 8 and 12, k = 1, 10, 19, 28, 37, and every
// m in 1..2; each configuration is run with iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__XOP_LD64, n_div_4_subtile) {
  TEST_REQUIRES_X86_XOP;
  for (uint32_t n_dim = 8; n_dim <= 12; n_dim += 4) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      for (uint32_t m_dim = 1; m_dim <= 2; ++m_dim) {
        GemmMicrokernelTester()
            .mr(2).nr(4).kr(2).sr(4)
            .m(m_dim).n(n_dim).k(k_dim)
            .iterations(1)
            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__xop_ld64,
                  xnn_init_qu8_conv_minmax_fp32_sse2_params,
                  xnn_qu8_requantize_fp32);
      }
    }
  }
}
16072
// 2x4c2s4 XOP ld64 kernel: k = 1, 10, 19, 28, 37, every n in 1..4 and every
// m in 1..2, with cm_stride overridden to 7 and iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__XOP_LD64, strided_cm_subtile) {
  TEST_REQUIRES_X86_XOP;
  for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
    for (uint32_t n_dim = 1; n_dim <= 4; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 2; ++m_dim) {
        GemmMicrokernelTester()
            .mr(2).nr(4).kr(2).sr(4)
            .m(m_dim).n(n_dim).k(k_dim)
            .cm_stride(7)
            .iterations(1)
            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__xop_ld64,
                  xnn_init_qu8_conv_minmax_fp32_sse2_params,
                  xnn_qu8_requantize_fp32);
      }
    }
  }
}
16093
// 2x4c2s4 XOP ld64 kernel: single m=2, n=4, k=8 run with qmin set to 128.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__XOP_LD64, qmin) {
  TEST_REQUIRES_X86_XOP;
  GemmMicrokernelTester()
      .mr(2).nr(4).kr(2).sr(4)
      .m(2).n(4).k(8)
      .qmin(128)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__xop_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
}
16107
// 2x4c2s4 XOP ld64 kernel: single m=2, n=4, k=8 run with qmax set to 128.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__XOP_LD64, qmax) {
  TEST_REQUIRES_X86_XOP;
  GemmMicrokernelTester()
      .mr(2).nr(4).kr(2).sr(4)
      .m(2).n(4).k(8)
      .qmax(128)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__xop_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
}
16121
// 2x4c2s4 XOP ld64 kernel: single m=2, n=4, k=8 run with cm_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__XOP_LD64, strided_cm) {
  TEST_REQUIRES_X86_XOP;
  GemmMicrokernelTester()
      .mr(2).nr(4).kr(2).sr(4)
      .m(2).n(4).k(8)
      .cm_stride(7)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__xop_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
}
16135
// 2x4c2s4 XOP ld64 kernel: m=2, n=4, k = 1, 10, 19, 28, 37 with
// a_zero_point forced to 0.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__XOP_LD64, no_a_zero_point) {
  TEST_REQUIRES_X86_XOP;
  for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
    GemmMicrokernelTester()
        .mr(2).nr(4).kr(2).sr(4)
        .m(2).n(4).k(k_dim)
        .a_zero_point(0)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__xop_ld64,
              xnn_init_qu8_conv_minmax_fp32_sse2_params,
              xnn_qu8_requantize_fp32);
  }
}
16151
// 2x4c2s4 XOP ld64 kernel: m=2, n=4, k = 1, 10, 19, 28, 37 with
// b_zero_point forced to 0.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__XOP_LD64, no_b_zero_point) {
  TEST_REQUIRES_X86_XOP;
  for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
    GemmMicrokernelTester()
        .mr(2).nr(4).kr(2).sr(4)
        .m(2).n(4).k(k_dim)
        .b_zero_point(0)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__xop_ld64,
              xnn_init_qu8_conv_minmax_fp32_sse2_params,
              xnn_qu8_requantize_fp32);
  }
}
16167
// 2x4c2s4 XOP ld64 kernel: m=2, n=4, k = 1, 10, 19, 28, 37 with both
// a_zero_point and b_zero_point forced to 0.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__XOP_LD64, no_zero_point) {
  TEST_REQUIRES_X86_XOP;
  for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
    GemmMicrokernelTester()
        .mr(2).nr(4).kr(2).sr(4)
        .m(2).n(4).k(k_dim)
        .a_zero_point(0)
        .b_zero_point(0)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__xop_ld64,
              xnn_init_qu8_conv_minmax_fp32_sse2_params,
              xnn_qu8_requantize_fp32);
  }
}
16184 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
16185
16186
16187 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// 4x4c2s4 AVX ld64 kernel: single m=4, n=4, k=8 run with no stride overrides.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__AVX_LD64, k_eq_8) {
  TEST_REQUIRES_X86_AVX;
  GemmMicrokernelTester()
      .mr(4).nr(4).kr(2).sr(4)
      .m(4).n(4).k(8)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__avx_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
}
16200
// 4x4c2s4 AVX ld64 kernel: single m=4, n=4, k=8 run with cn_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__AVX_LD64, strided_cn) {
  TEST_REQUIRES_X86_AVX;
  GemmMicrokernelTester()
      .mr(4).nr(4).kr(2).sr(4)
      .m(4).n(4).k(8)
      .cn_stride(7)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__avx_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
}
16214
// 4x4c2s4 AVX ld64 kernel: single m=4, n=4, k=8 run with a_stride set to 11.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__AVX_LD64, k_eq_8_strided_a) {
  TEST_REQUIRES_X86_AVX;
  GemmMicrokernelTester()
      .mr(4).nr(4).kr(2).sr(4)
      .m(4).n(4).k(8)
      .a_stride(11)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__avx_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
}
16228
// 4x4c2s4 AVX ld64 kernel: k fixed at 8, every n in 1..4 and every m in 1..4;
// each configuration is run with iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__AVX_LD64, k_eq_8_subtile) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t n_dim = 1; n_dim <= 4; ++n_dim) {
    for (uint32_t m_dim = 1; m_dim <= 4; ++m_dim) {
      GemmMicrokernelTester()
          .mr(4).nr(4).kr(2).sr(4)
          .m(m_dim).n(n_dim).k(8)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__avx_ld64,
                xnn_init_qu8_conv_minmax_fp32_sse2_params,
                xnn_qu8_requantize_fp32);
    }
  }
}
16246
// 4x4c2s4 AVX ld64 kernel: n=4, k=8, every m in 1..4, iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__AVX_LD64, k_eq_8_subtile_m) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t m_dim = 1; m_dim <= 4; ++m_dim) {
    GemmMicrokernelTester()
        .mr(4).nr(4).kr(2).sr(4)
        .m(m_dim).n(4).k(8)
        .iterations(1)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__avx_ld64,
              xnn_init_qu8_conv_minmax_fp32_sse2_params,
              xnn_qu8_requantize_fp32);
  }
}
16262
// 4x4c2s4 AVX ld64 kernel: m=4, k=8, every n in 1..4, iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__AVX_LD64, k_eq_8_subtile_n) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t n_dim = 1; n_dim <= 4; ++n_dim) {
    GemmMicrokernelTester()
        .mr(4).nr(4).kr(2).sr(4)
        .m(4).n(n_dim).k(8)
        .iterations(1)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__avx_ld64,
              xnn_init_qu8_conv_minmax_fp32_sse2_params,
              xnn_qu8_requantize_fp32);
  }
}
16278
// 4x4c2s4 AVX ld64 kernel: m=4, n=4, every k in 1..7.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__AVX_LD64, k_lt_8) {
  TEST_REQUIRES_X86_AVX;
  for (size_t k_dim = 1; k_dim < 8; ++k_dim) {
    GemmMicrokernelTester()
        .mr(4).nr(4).kr(2).sr(4)
        .m(4).n(4).k(k_dim)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__avx_ld64,
              xnn_init_qu8_conv_minmax_fp32_sse2_params,
              xnn_qu8_requantize_fp32);
  }
}
16293
// 4x4c2s4 AVX ld64 kernel: m=4, n=4, every k in 1..7, a_stride set to 11.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__AVX_LD64, k_lt_8_strided_a) {
  TEST_REQUIRES_X86_AVX;
  for (size_t k_dim = 1; k_dim < 8; ++k_dim) {
    GemmMicrokernelTester()
        .mr(4).nr(4).kr(2).sr(4)
        .m(4).n(4).k(k_dim)
        .a_stride(11)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__avx_ld64,
              xnn_init_qu8_conv_minmax_fp32_sse2_params,
              xnn_qu8_requantize_fp32);
  }
}
16309
// 4x4c2s4 AVX ld64 kernel: every k in 1..7, every n in 1..4 and every
// m in 1..4; each configuration is run with iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__AVX_LD64, k_lt_8_subtile) {
  TEST_REQUIRES_X86_AVX;
  for (size_t k_dim = 1; k_dim < 8; ++k_dim) {
    for (uint32_t n_dim = 1; n_dim <= 4; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 4; ++m_dim) {
        GemmMicrokernelTester()
            .mr(4).nr(4).kr(2).sr(4)
            .m(m_dim).n(n_dim).k(k_dim)
            .iterations(1)
            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__avx_ld64,
                  xnn_init_qu8_conv_minmax_fp32_sse2_params,
                  xnn_qu8_requantize_fp32);
      }
    }
  }
}
16329
// 4x4c2s4 AVX ld64 kernel: m=4, n=4, every k in 9..15.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__AVX_LD64, k_gt_8) {
  TEST_REQUIRES_X86_AVX;
  for (size_t k_dim = 9; k_dim < 16; ++k_dim) {
    GemmMicrokernelTester()
        .mr(4).nr(4).kr(2).sr(4)
        .m(4).n(4).k(k_dim)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__avx_ld64,
              xnn_init_qu8_conv_minmax_fp32_sse2_params,
              xnn_qu8_requantize_fp32);
  }
}
16344
// 4x4c2s4 AVX ld64 kernel: m=4, n=4, every k in 9..15, a_stride set to 19.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__AVX_LD64, k_gt_8_strided_a) {
  TEST_REQUIRES_X86_AVX;
  for (size_t k_dim = 9; k_dim < 16; ++k_dim) {
    GemmMicrokernelTester()
        .mr(4).nr(4).kr(2).sr(4)
        .m(4).n(4).k(k_dim)
        .a_stride(19)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__avx_ld64,
              xnn_init_qu8_conv_minmax_fp32_sse2_params,
              xnn_qu8_requantize_fp32);
  }
}
16360
// 4x4c2s4 AVX ld64 kernel: every k in 9..15, every n in 1..4 and every
// m in 1..4; each configuration is run with iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__AVX_LD64, k_gt_8_subtile) {
  TEST_REQUIRES_X86_AVX;
  for (size_t k_dim = 9; k_dim < 16; ++k_dim) {
    for (uint32_t n_dim = 1; n_dim <= 4; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 4; ++m_dim) {
        GemmMicrokernelTester()
            .mr(4).nr(4).kr(2).sr(4)
            .m(m_dim).n(n_dim).k(k_dim)
            .iterations(1)
            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__avx_ld64,
                  xnn_init_qu8_conv_minmax_fp32_sse2_params,
                  xnn_qu8_requantize_fp32);
      }
    }
  }
}
16380
// 4x4c2s4 AVX ld64 kernel: m=4, n=4, k = 16, 24, ..., 80 (multiples of 8).
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__AVX_LD64, k_div_8) {
  TEST_REQUIRES_X86_AVX;
  for (size_t k_dim = 16; k_dim <= 80; k_dim += 8) {
    GemmMicrokernelTester()
        .mr(4).nr(4).kr(2).sr(4)
        .m(4).n(4).k(k_dim)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__avx_ld64,
              xnn_init_qu8_conv_minmax_fp32_sse2_params,
              xnn_qu8_requantize_fp32);
  }
}
16395
// 4x4c2s4 AVX ld64 kernel: m=4, n=4, k = 16, 24, ..., 80, a_stride set to 83.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__AVX_LD64, k_div_8_strided_a) {
  TEST_REQUIRES_X86_AVX;
  for (size_t k_dim = 16; k_dim <= 80; k_dim += 8) {
    GemmMicrokernelTester()
        .mr(4).nr(4).kr(2).sr(4)
        .m(4).n(4).k(k_dim)
        .a_stride(83)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__avx_ld64,
              xnn_init_qu8_conv_minmax_fp32_sse2_params,
              xnn_qu8_requantize_fp32);
  }
}
16411
// 4x4c2s4 AVX ld64 kernel: k = 16, 24, ..., 80, every n in 1..4 and every
// m in 1..4; each configuration is run with iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__AVX_LD64, k_div_8_subtile) {
  TEST_REQUIRES_X86_AVX;
  for (size_t k_dim = 16; k_dim <= 80; k_dim += 8) {
    for (uint32_t n_dim = 1; n_dim <= 4; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 4; ++m_dim) {
        GemmMicrokernelTester()
            .mr(4).nr(4).kr(2).sr(4)
            .m(m_dim).n(n_dim).k(k_dim)
            .iterations(1)
            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__avx_ld64,
                  xnn_init_qu8_conv_minmax_fp32_sse2_params,
                  xnn_qu8_requantize_fp32);
      }
    }
  }
}
16431
// 4x4c2s4 AVX ld64 kernel: n = 5..7 (above nr = 4), k = 1, 10, 19, 28, 37,
// m fixed at 4.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__AVX_LD64, n_gt_4) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t n_dim = 5; n_dim < 8; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      GemmMicrokernelTester()
          .mr(4).nr(4).kr(2).sr(4)
          .m(4).n(n_dim).k(k_dim)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__avx_ld64,
                xnn_init_qu8_conv_minmax_fp32_sse2_params,
                xnn_qu8_requantize_fp32);
    }
  }
}
16448
// 4x4c2s4 AVX ld64 kernel: n = 5..7, k = 1, 10, 19, 28, 37, m = 4,
// with cn_stride overridden to 7.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__AVX_LD64, n_gt_4_strided_cn) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t n_dim = 5; n_dim < 8; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      GemmMicrokernelTester()
          .mr(4).nr(4).kr(2).sr(4)
          .m(4).n(n_dim).k(k_dim)
          .cn_stride(7)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__avx_ld64,
                xnn_init_qu8_conv_minmax_fp32_sse2_params,
                xnn_qu8_requantize_fp32);
    }
  }
}
16466
// 4x4c2s4 AVX ld64 kernel: n = 5..7, k = 1, 10, 19, 28, 37, m = 4,
// with a_stride overridden to 43.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__AVX_LD64, n_gt_4_strided_a) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t n_dim = 5; n_dim < 8; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      GemmMicrokernelTester()
          .mr(4).nr(4).kr(2).sr(4)
          .m(4).n(n_dim).k(k_dim)
          .a_stride(43)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__avx_ld64,
                xnn_init_qu8_conv_minmax_fp32_sse2_params,
                xnn_qu8_requantize_fp32);
    }
  }
}
16484
// 4x4c2s4 AVX ld64 kernel: n = 5..7, k = 1, 10, 19, 28, 37, and every
// m in 1..4; each configuration is run with iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__AVX_LD64, n_gt_4_subtile) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t n_dim = 5; n_dim < 8; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      for (uint32_t m_dim = 1; m_dim <= 4; ++m_dim) {
        GemmMicrokernelTester()
            .mr(4).nr(4).kr(2).sr(4)
            .m(m_dim).n(n_dim).k(k_dim)
            .iterations(1)
            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__avx_ld64,
                  xnn_init_qu8_conv_minmax_fp32_sse2_params,
                  xnn_qu8_requantize_fp32);
      }
    }
  }
}
16504
// 4x4c2s4 AVX ld64 kernel: n = 8 and 12 (multiples of nr = 4) with
// k = 1, 10, 19, 28, 37 and m fixed at 4.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__AVX_LD64, n_div_4) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t n_dim = 8; n_dim <= 12; n_dim += 4) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      GemmMicrokernelTester()
          .mr(4).nr(4).kr(2).sr(4)
          .m(4).n(n_dim).k(k_dim)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__avx_ld64,
                xnn_init_qu8_conv_minmax_fp32_sse2_params,
                xnn_qu8_requantize_fp32);
    }
  }
}
16521
// 4x4c2s4 AVX ld64 kernel: n = 8 and 12, k = 1, 10, 19, 28, 37, m = 4,
// with cn_stride overridden to 7.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__AVX_LD64, n_div_4_strided_cn) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t n_dim = 8; n_dim <= 12; n_dim += 4) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      GemmMicrokernelTester()
          .mr(4).nr(4).kr(2).sr(4)
          .m(4).n(n_dim).k(k_dim)
          .cn_stride(7)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__avx_ld64,
                xnn_init_qu8_conv_minmax_fp32_sse2_params,
                xnn_qu8_requantize_fp32);
    }
  }
}
16539
// 4x4c2s4 AVX ld64 kernel: n = 8 and 12, k = 1, 10, 19, 28, 37, m = 4,
// with a_stride overridden to 43.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__AVX_LD64, n_div_4_strided_a) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t n_dim = 8; n_dim <= 12; n_dim += 4) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      GemmMicrokernelTester()
          .mr(4).nr(4).kr(2).sr(4)
          .m(4).n(n_dim).k(k_dim)
          .a_stride(43)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__avx_ld64,
                xnn_init_qu8_conv_minmax_fp32_sse2_params,
                xnn_qu8_requantize_fp32);
    }
  }
}
16557
// 4x4c2s4 AVX ld64 kernel: n = 8 and 12, k = 1, 10, 19, 28, 37, and every
// m in 1..4; each configuration is run with iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__AVX_LD64, n_div_4_subtile) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t n_dim = 8; n_dim <= 12; n_dim += 4) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      for (uint32_t m_dim = 1; m_dim <= 4; ++m_dim) {
        GemmMicrokernelTester()
            .mr(4).nr(4).kr(2).sr(4)
            .m(m_dim).n(n_dim).k(k_dim)
            .iterations(1)
            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__avx_ld64,
                  xnn_init_qu8_conv_minmax_fp32_sse2_params,
                  xnn_qu8_requantize_fp32);
      }
    }
  }
}
16577
// 4x4c2s4 AVX ld64 kernel: k = 1, 10, 19, 28, 37, every n in 1..4 and every
// m in 1..4, with cm_stride overridden to 7 and iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__AVX_LD64, strided_cm_subtile) {
  TEST_REQUIRES_X86_AVX;
  for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
    for (uint32_t n_dim = 1; n_dim <= 4; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 4; ++m_dim) {
        GemmMicrokernelTester()
            .mr(4).nr(4).kr(2).sr(4)
            .m(m_dim).n(n_dim).k(k_dim)
            .cm_stride(7)
            .iterations(1)
            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__avx_ld64,
                  xnn_init_qu8_conv_minmax_fp32_sse2_params,
                  xnn_qu8_requantize_fp32);
      }
    }
  }
}
16598
// 4x4c2s4 AVX ld64 kernel: single m=4, n=4, k=8 run with qmin set to 128.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__AVX_LD64, qmin) {
  TEST_REQUIRES_X86_AVX;
  GemmMicrokernelTester()
      .mr(4).nr(4).kr(2).sr(4)
      .m(4).n(4).k(8)
      .qmin(128)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__avx_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
}
16612
// 4x4c2s4 AVX ld64 kernel: single m=4, n=4, k=8 run with qmax set to 128.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__AVX_LD64, qmax) {
  TEST_REQUIRES_X86_AVX;
  GemmMicrokernelTester()
      .mr(4).nr(4).kr(2).sr(4)
      .m(4).n(4).k(8)
      .qmax(128)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__avx_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
}
16626
// 4x4c2s4 AVX ld64 kernel: single m=4, n=4, k=8 run with cm_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__AVX_LD64, strided_cm) {
  TEST_REQUIRES_X86_AVX;
  GemmMicrokernelTester()
      .mr(4).nr(4).kr(2).sr(4)
      .m(4).n(4).k(8)
      .cm_stride(7)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__avx_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
}
16640
// 4x4c2s4 AVX ld64 kernel: m=4, n=4, k = 1, 10, 19, 28, 37 with
// a_zero_point forced to 0.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__AVX_LD64, no_a_zero_point) {
  TEST_REQUIRES_X86_AVX;
  for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
    GemmMicrokernelTester()
        .mr(4).nr(4).kr(2).sr(4)
        .m(4).n(4).k(k_dim)
        .a_zero_point(0)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__avx_ld64,
              xnn_init_qu8_conv_minmax_fp32_sse2_params,
              xnn_qu8_requantize_fp32);
  }
}
16656
// 4x4c2s4 AVX ld64 kernel: m=4, n=4, k = 1, 10, 19, 28, 37 with
// b_zero_point forced to 0.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__AVX_LD64, no_b_zero_point) {
  TEST_REQUIRES_X86_AVX;
  for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
    GemmMicrokernelTester()
        .mr(4).nr(4).kr(2).sr(4)
        .m(4).n(4).k(k_dim)
        .b_zero_point(0)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__avx_ld64,
              xnn_init_qu8_conv_minmax_fp32_sse2_params,
              xnn_qu8_requantize_fp32);
  }
}
16672
// 4x4c2s4 AVX ld64 kernel: m=4, n=4, k = 1, 10, 19, 28, 37 with both
// a_zero_point and b_zero_point forced to 0.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__AVX_LD64, no_zero_point) {
  TEST_REQUIRES_X86_AVX;
  for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
    GemmMicrokernelTester()
        .mr(4).nr(4).kr(2).sr(4)
        .m(4).n(4).k(k_dim)
        .a_zero_point(0)
        .b_zero_point(0)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__avx_ld64,
              xnn_init_qu8_conv_minmax_fp32_sse2_params,
              xnn_qu8_requantize_fp32);
  }
}
16689 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
16690
16691
16692 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// 4x4c2s4 XOP ld64 kernel: single m=4, n=4, k=8 run with no stride overrides.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__XOP_LD64, k_eq_8) {
  TEST_REQUIRES_X86_XOP;
  GemmMicrokernelTester()
      .mr(4).nr(4).kr(2).sr(4)
      .m(4).n(4).k(8)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__xop_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
}
16705
// 4x4c2s4 XOP ld64 kernel: single m=4, n=4, k=8 run with cn_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__XOP_LD64, strided_cn) {
  TEST_REQUIRES_X86_XOP;
  GemmMicrokernelTester()
      .mr(4).nr(4).kr(2).sr(4)
      .m(4).n(4).k(8)
      .cn_stride(7)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__xop_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
}
16719
// 4x4c2s4 XOP ld64 kernel: single m=4, n=4, k=8 run with a_stride set to 11.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__XOP_LD64, k_eq_8_strided_a) {
  TEST_REQUIRES_X86_XOP;
  GemmMicrokernelTester()
      .mr(4).nr(4).kr(2).sr(4)
      .m(4).n(4).k(8)
      .a_stride(11)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__xop_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
}
16733
// 4x4c2s4 XOP ld64 kernel: k fixed at 8, every n in 1..4 and every m in 1..4;
// each configuration is run with iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__XOP_LD64, k_eq_8_subtile) {
  TEST_REQUIRES_X86_XOP;
  for (uint32_t n_dim = 1; n_dim <= 4; ++n_dim) {
    for (uint32_t m_dim = 1; m_dim <= 4; ++m_dim) {
      GemmMicrokernelTester()
          .mr(4).nr(4).kr(2).sr(4)
          .m(m_dim).n(n_dim).k(8)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__xop_ld64,
                xnn_init_qu8_conv_minmax_fp32_sse2_params,
                xnn_qu8_requantize_fp32);
    }
  }
}
16751
// 4x4c2s4 XOP ld64 kernel: n=4, k=8, every m in 1..4, iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__XOP_LD64, k_eq_8_subtile_m) {
  TEST_REQUIRES_X86_XOP;
  for (uint32_t m_dim = 1; m_dim <= 4; ++m_dim) {
    GemmMicrokernelTester()
        .mr(4).nr(4).kr(2).sr(4)
        .m(m_dim).n(4).k(8)
        .iterations(1)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__xop_ld64,
              xnn_init_qu8_conv_minmax_fp32_sse2_params,
              xnn_qu8_requantize_fp32);
  }
}
16767
// 4x4c2s4 XOP ld64 kernel: m=4, k=8, every n in 1..4, iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__XOP_LD64, k_eq_8_subtile_n) {
  TEST_REQUIRES_X86_XOP;
  for (uint32_t n_dim = 1; n_dim <= 4; ++n_dim) {
    GemmMicrokernelTester()
        .mr(4).nr(4).kr(2).sr(4)
        .m(4).n(n_dim).k(8)
        .iterations(1)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__xop_ld64,
              xnn_init_qu8_conv_minmax_fp32_sse2_params,
              xnn_qu8_requantize_fp32);
  }
}
16783
// 4x4c2s4 XOP ld64 kernel: m=4, n=4, every k in 1..7.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__XOP_LD64, k_lt_8) {
  TEST_REQUIRES_X86_XOP;
  for (size_t k_dim = 1; k_dim < 8; ++k_dim) {
    GemmMicrokernelTester()
        .mr(4).nr(4).kr(2).sr(4)
        .m(4).n(4).k(k_dim)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__xop_ld64,
              xnn_init_qu8_conv_minmax_fp32_sse2_params,
              xnn_qu8_requantize_fp32);
  }
}
16798
// 4x4c2s4 XOP ld64 kernel: m=4, n=4, every k in 1..7, a_stride set to 11.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__XOP_LD64, k_lt_8_strided_a) {
  TEST_REQUIRES_X86_XOP;
  for (size_t k_dim = 1; k_dim < 8; ++k_dim) {
    GemmMicrokernelTester()
        .mr(4).nr(4).kr(2).sr(4)
        .m(4).n(4).k(k_dim)
        .a_stride(11)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__xop_ld64,
              xnn_init_qu8_conv_minmax_fp32_sse2_params,
              xnn_qu8_requantize_fp32);
  }
}
16814
// 4x4c2s4 XOP ld64 kernel: every k in 1..7, every n in 1..4 and every
// m in 1..4; each configuration is run with iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__XOP_LD64, k_lt_8_subtile) {
  TEST_REQUIRES_X86_XOP;
  for (size_t k_dim = 1; k_dim < 8; ++k_dim) {
    for (uint32_t n_dim = 1; n_dim <= 4; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 4; ++m_dim) {
        GemmMicrokernelTester()
            .mr(4).nr(4).kr(2).sr(4)
            .m(m_dim).n(n_dim).k(k_dim)
            .iterations(1)
            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__xop_ld64,
                  xnn_init_qu8_conv_minmax_fp32_sse2_params,
                  xnn_qu8_requantize_fp32);
      }
    }
  }
}
16834
// 4x4c2s4 XOP ld64 kernel: m=4, n=4, every k in 9..15.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__XOP_LD64, k_gt_8) {
  TEST_REQUIRES_X86_XOP;
  for (size_t k_dim = 9; k_dim < 16; ++k_dim) {
    GemmMicrokernelTester()
        .mr(4).nr(4).kr(2).sr(4)
        .m(4).n(4).k(k_dim)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__xop_ld64,
              xnn_init_qu8_conv_minmax_fp32_sse2_params,
              xnn_qu8_requantize_fp32);
  }
}
16849
// 4x4c2s4 XOP ld64 kernel: m=4, n=4, every k in 9..15, a_stride set to 19.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__XOP_LD64, k_gt_8_strided_a) {
  TEST_REQUIRES_X86_XOP;
  for (size_t k_dim = 9; k_dim < 16; ++k_dim) {
    GemmMicrokernelTester()
        .mr(4).nr(4).kr(2).sr(4)
        .m(4).n(4).k(k_dim)
        .a_stride(19)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__xop_ld64,
              xnn_init_qu8_conv_minmax_fp32_sse2_params,
              xnn_qu8_requantize_fp32);
  }
}
16865
// 4x4c2s4 XOP ld64 kernel: every k in 9..15, every n in 1..4 and every
// m in 1..4; each configuration is run with iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__XOP_LD64, k_gt_8_subtile) {
  TEST_REQUIRES_X86_XOP;
  for (size_t k_dim = 9; k_dim < 16; ++k_dim) {
    for (uint32_t n_dim = 1; n_dim <= 4; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 4; ++m_dim) {
        GemmMicrokernelTester()
            .mr(4).nr(4).kr(2).sr(4)
            .m(m_dim).n(n_dim).k(k_dim)
            .iterations(1)
            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__xop_ld64,
                  xnn_init_qu8_conv_minmax_fp32_sse2_params,
                  xnn_qu8_requantize_fp32);
      }
    }
  }
}
16885
// 4x4c2s4 XOP ld64 kernel: m=4, n=4, k = 16, 24, ..., 80 (multiples of 8).
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__XOP_LD64, k_div_8) {
  TEST_REQUIRES_X86_XOP;
  for (size_t k_dim = 16; k_dim <= 80; k_dim += 8) {
    GemmMicrokernelTester()
        .mr(4).nr(4).kr(2).sr(4)
        .m(4).n(4).k(k_dim)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__xop_ld64,
              xnn_init_qu8_conv_minmax_fp32_sse2_params,
              xnn_qu8_requantize_fp32);
  }
}
16900
// 4x4c2s4 XOP ld64 kernel: m=4, n=4, k = 16, 24, ..., 80, a_stride set to 83.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__XOP_LD64, k_div_8_strided_a) {
  TEST_REQUIRES_X86_XOP;
  for (size_t k_dim = 16; k_dim <= 80; k_dim += 8) {
    GemmMicrokernelTester()
        .mr(4).nr(4).kr(2).sr(4)
        .m(4).n(4).k(k_dim)
        .a_stride(83)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__xop_ld64,
              xnn_init_qu8_conv_minmax_fp32_sse2_params,
              xnn_qu8_requantize_fp32);
  }
}
16916
// 4x4c2s4 XOP ld64 kernel: k = 16, 24, ..., 80, every n in 1..4 and every
// m in 1..4; each configuration is run with iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__XOP_LD64, k_div_8_subtile) {
  TEST_REQUIRES_X86_XOP;
  for (size_t k_dim = 16; k_dim <= 80; k_dim += 8) {
    for (uint32_t n_dim = 1; n_dim <= 4; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 4; ++m_dim) {
        GemmMicrokernelTester()
            .mr(4).nr(4).kr(2).sr(4)
            .m(m_dim).n(n_dim).k(k_dim)
            .iterations(1)
            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__xop_ld64,
                  xnn_init_qu8_conv_minmax_fp32_sse2_params,
                  xnn_qu8_requantize_fp32);
      }
    }
  }
}
16936
// 4x4c2s4 XOP ld64 kernel: n = 5..7 (above nr = 4), k = 1, 10, 19, 28, 37,
// m fixed at 4.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__XOP_LD64, n_gt_4) {
  TEST_REQUIRES_X86_XOP;
  for (uint32_t n_dim = 5; n_dim < 8; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      GemmMicrokernelTester()
          .mr(4).nr(4).kr(2).sr(4)
          .m(4).n(n_dim).k(k_dim)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__xop_ld64,
                xnn_init_qu8_conv_minmax_fp32_sse2_params,
                xnn_qu8_requantize_fp32);
    }
  }
}
16953
// 4x4c2s4 XOP ld64 kernel: n = 5..7, k = 1, 10, 19, 28, 37, m = 4,
// with cn_stride overridden to 7.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__XOP_LD64, n_gt_4_strided_cn) {
  TEST_REQUIRES_X86_XOP;
  for (uint32_t n_dim = 5; n_dim < 8; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      GemmMicrokernelTester()
          .mr(4).nr(4).kr(2).sr(4)
          .m(4).n(n_dim).k(k_dim)
          .cn_stride(7)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__xop_ld64,
                xnn_init_qu8_conv_minmax_fp32_sse2_params,
                xnn_qu8_requantize_fp32);
    }
  }
}
16971
// 4x4c2s4 XOP ld64 kernel: n = 5..7, k = 1, 10, 19, 28, 37, m = 4,
// with a_stride overridden to 43.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__XOP_LD64, n_gt_4_strided_a) {
  TEST_REQUIRES_X86_XOP;
  for (uint32_t n_dim = 5; n_dim < 8; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      GemmMicrokernelTester()
          .mr(4).nr(4).kr(2).sr(4)
          .m(4).n(n_dim).k(k_dim)
          .a_stride(43)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__xop_ld64,
                xnn_init_qu8_conv_minmax_fp32_sse2_params,
                xnn_qu8_requantize_fp32);
    }
  }
}
16989
// 4x4c2s4 XOP ld64 kernel: n = 5..7, k = 1, 10, 19, 28, 37, and every
// m in 1..4; each configuration is run with iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__XOP_LD64, n_gt_4_subtile) {
  TEST_REQUIRES_X86_XOP;
  for (uint32_t n_dim = 5; n_dim < 8; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      for (uint32_t m_dim = 1; m_dim <= 4; ++m_dim) {
        GemmMicrokernelTester()
            .mr(4).nr(4).kr(2).sr(4)
            .m(m_dim).n(n_dim).k(k_dim)
            .iterations(1)
            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__xop_ld64,
                  xnn_init_qu8_conv_minmax_fp32_sse2_params,
                  xnn_qu8_requantize_fp32);
      }
    }
  }
}
17009
// 4x4c2s4 XOP ld64 kernel: n = 8 and 12 (multiples of nr = 4) with
// k = 1, 10, 19, 28, 37 and m fixed at 4.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__XOP_LD64, n_div_4) {
  TEST_REQUIRES_X86_XOP;
  for (uint32_t n_dim = 8; n_dim <= 12; n_dim += 4) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      GemmMicrokernelTester()
          .mr(4).nr(4).kr(2).sr(4)
          .m(4).n(n_dim).k(k_dim)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__xop_ld64,
                xnn_init_qu8_conv_minmax_fp32_sse2_params,
                xnn_qu8_requantize_fp32);
    }
  }
}
17026
// 4x4c2s4 XOP ld64 kernel: n = 8 and 12, k = 1, 10, 19, 28, 37, m = 4,
// with cn_stride overridden to 7.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__XOP_LD64, n_div_4_strided_cn) {
  TEST_REQUIRES_X86_XOP;
  for (uint32_t n_dim = 8; n_dim <= 12; n_dim += 4) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      GemmMicrokernelTester()
          .mr(4).nr(4).kr(2).sr(4)
          .m(4).n(n_dim).k(k_dim)
          .cn_stride(7)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__xop_ld64,
                xnn_init_qu8_conv_minmax_fp32_sse2_params,
                xnn_qu8_requantize_fp32);
    }
  }
}
17044
// 4x4c2s4 XOP ld64 kernel: n = 8 and 12, k = 1, 10, 19, 28, 37, m = 4,
// with a_stride overridden to 43.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__XOP_LD64, n_div_4_strided_a) {
  TEST_REQUIRES_X86_XOP;
  for (uint32_t n_dim = 8; n_dim <= 12; n_dim += 4) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      GemmMicrokernelTester()
          .mr(4).nr(4).kr(2).sr(4)
          .m(4).n(n_dim).k(k_dim)
          .a_stride(43)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__xop_ld64,
                xnn_init_qu8_conv_minmax_fp32_sse2_params,
                xnn_qu8_requantize_fp32);
    }
  }
}
17062
// 4x4c2s4 XOP ld64 kernel: n = 8 and 12, k = 1, 10, 19, 28, 37, and every
// m in 1..4; each configuration is run with iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__XOP_LD64, n_div_4_subtile) {
  TEST_REQUIRES_X86_XOP;
  for (uint32_t n_dim = 8; n_dim <= 12; n_dim += 4) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      for (uint32_t m_dim = 1; m_dim <= 4; ++m_dim) {
        GemmMicrokernelTester()
            .mr(4).nr(4).kr(2).sr(4)
            .m(m_dim).n(n_dim).k(k_dim)
            .iterations(1)
            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__xop_ld64,
                  xnn_init_qu8_conv_minmax_fp32_sse2_params,
                  xnn_qu8_requantize_fp32);
      }
    }
  }
}
17082
// 4x4c2s4 XOP ld64 kernel: k = 1, 10, 19, 28, 37, every n in 1..4 and every
// m in 1..4, with cm_stride overridden to 7 and iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__XOP_LD64, strided_cm_subtile) {
  TEST_REQUIRES_X86_XOP;
  for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
    for (uint32_t n_dim = 1; n_dim <= 4; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 4; ++m_dim) {
        GemmMicrokernelTester()
            .mr(4).nr(4).kr(2).sr(4)
            .m(m_dim).n(n_dim).k(k_dim)
            .cm_stride(7)
            .iterations(1)
            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__xop_ld64,
                  xnn_init_qu8_conv_minmax_fp32_sse2_params,
                  xnn_qu8_requantize_fp32);
      }
    }
  }
}
17103
// qmin: 4x4c2s4 XOP ld64 kernel on a 4x4 problem with k = 8 and qmin = 128.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__XOP_LD64, qmin) {
  TEST_REQUIRES_X86_XOP;
  auto tester = GemmMicrokernelTester();
  tester.mr(4).nr(4).kr(2).sr(4);
  tester.m(4).n(4).k(8);
  tester.qmin(128);
  tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__xop_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
}
17117
// qmax: 4x4c2s4 XOP ld64 kernel on a 4x4 problem with k = 8 and qmax = 128.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__XOP_LD64, qmax) {
  TEST_REQUIRES_X86_XOP;
  auto tester = GemmMicrokernelTester();
  tester.mr(4).nr(4).kr(2).sr(4);
  tester.m(4).n(4).k(8);
  tester.qmax(128);
  tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__xop_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
}
17131
// strided_cm: 4x4c2s4 XOP ld64 kernel on a 4x4 problem with k = 8 and
// cm_stride = 7.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__XOP_LD64, strided_cm) {
  TEST_REQUIRES_X86_XOP;
  auto tester = GemmMicrokernelTester();
  tester.mr(4).nr(4).kr(2).sr(4);
  tester.m(4).n(4).k(8);
  tester.cm_stride(7);
  tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__xop_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
}
17145
// no_a_zero_point: 4x4c2s4 XOP ld64 kernel on a 4x4 problem with
// k in {1, 10, 19, 28, 37} and a_zero_point = 0.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__XOP_LD64, no_a_zero_point) {
  TEST_REQUIRES_X86_XOP;
  for (size_t k_size = 1; k_size <= 40; k_size += 9) {
    // A fresh tester is built for every k.
    auto tester = GemmMicrokernelTester();
    tester.mr(4).nr(4).kr(2).sr(4);
    tester.m(4).n(4).k(k_size);
    tester.a_zero_point(0);
    tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__xop_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
  }
}
17161
// no_b_zero_point: 4x4c2s4 XOP ld64 kernel on a 4x4 problem with
// k in {1, 10, 19, 28, 37} and b_zero_point = 0.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__XOP_LD64, no_b_zero_point) {
  TEST_REQUIRES_X86_XOP;
  for (size_t k_size = 1; k_size <= 40; k_size += 9) {
    // A fresh tester is built for every k.
    auto tester = GemmMicrokernelTester();
    tester.mr(4).nr(4).kr(2).sr(4);
    tester.m(4).n(4).k(k_size);
    tester.b_zero_point(0);
    tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__xop_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
  }
}
17177
// no_zero_point: 4x4c2s4 XOP ld64 kernel on a 4x4 problem with
// k in {1, 10, 19, 28, 37} and both a_zero_point and b_zero_point set to 0.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__XOP_LD64, no_zero_point) {
  TEST_REQUIRES_X86_XOP;
  for (size_t k_size = 1; k_size <= 40; k_size += 9) {
    // A fresh tester is built for every k.
    auto tester = GemmMicrokernelTester();
    tester.mr(4).nr(4).kr(2).sr(4);
    tester.m(4).n(4).k(k_size);
    tester.a_zero_point(0);
    tester.b_zero_point(0);
    tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__xop_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
  }
}
17194 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
17195
17196
17197 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// k_eq_8: 1x4c2s4 SSE4.1 ld128 kernel on a 1x4 problem with k = 8.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__SSE41_LD128, k_eq_8) {
  TEST_REQUIRES_X86_SSE41;
  auto tester = GemmMicrokernelTester();
  tester.mr(1).nr(4).kr(2).sr(4);
  tester.m(1).n(4).k(8);
  tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__sse41_ld128, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
}
17210
// strided_cn: 1x4c2s4 SSE4.1 ld128 kernel on a 1x4 problem with k = 8 and
// cn_stride = 7.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__SSE41_LD128, strided_cn) {
  TEST_REQUIRES_X86_SSE41;
  auto tester = GemmMicrokernelTester();
  tester.mr(1).nr(4).kr(2).sr(4);
  tester.m(1).n(4).k(8);
  tester.cn_stride(7);
  tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__sse41_ld128, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
}
17224
// k_eq_8_strided_a: 1x4c2s4 SSE4.1 ld128 kernel on a 1x4 problem with k = 8
// and a_stride = 11.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__SSE41_LD128, k_eq_8_strided_a) {
  TEST_REQUIRES_X86_SSE41;
  auto tester = GemmMicrokernelTester();
  tester.mr(1).nr(4).kr(2).sr(4);
  tester.m(1).n(4).k(8);
  tester.a_stride(11);
  tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__sse41_ld128, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
}
17238
// k_eq_8_subtile: 1x4c2s4 SSE4.1 ld128 kernel with k = 8, n from 1 to 4 and
// m = 1 (the m loop has a single iteration), one iteration each.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__SSE41_LD128, k_eq_8_subtile) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t n_size = 1; n_size <= 4; ++n_size) {
    for (uint32_t m_size = 1; m_size <= 1; ++m_size) {
      // A fresh tester is built for every (n, m) combination.
      auto tester = GemmMicrokernelTester();
      tester.mr(1).nr(4).kr(2).sr(4);
      tester.m(m_size).n(n_size).k(8);
      tester.iterations(1);
      tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__sse41_ld128, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
    }
  }
}
17256
// k_eq_8_subtile_m: 1x4c2s4 SSE4.1 ld128 kernel with k = 8, n = 4 and
// m = 1 (the m loop has a single iteration), one iteration.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__SSE41_LD128, k_eq_8_subtile_m) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t m_size = 1; m_size <= 1; ++m_size) {
    // A fresh tester is built for every m.
    auto tester = GemmMicrokernelTester();
    tester.mr(1).nr(4).kr(2).sr(4);
    tester.m(m_size).n(4).k(8);
    tester.iterations(1);
    tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__sse41_ld128, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
  }
}
17272
// k_eq_8_subtile_n: 1x4c2s4 SSE4.1 ld128 kernel with k = 8, m = 1 and
// n from 1 to 4, one iteration each.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__SSE41_LD128, k_eq_8_subtile_n) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t n_size = 1; n_size <= 4; ++n_size) {
    // A fresh tester is built for every n.
    auto tester = GemmMicrokernelTester();
    tester.mr(1).nr(4).kr(2).sr(4);
    tester.m(1).n(n_size).k(8);
    tester.iterations(1);
    tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__sse41_ld128, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
  }
}
17288
// k_lt_8: 1x4c2s4 SSE4.1 ld128 kernel on a 1x4 problem with k from 1 to 7.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__SSE41_LD128, k_lt_8) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k_size = 1; k_size < 8; ++k_size) {
    // A fresh tester is built for every k.
    auto tester = GemmMicrokernelTester();
    tester.mr(1).nr(4).kr(2).sr(4);
    tester.m(1).n(4).k(k_size);
    tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__sse41_ld128, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
  }
}
17303
// k_lt_8_strided_a: 1x4c2s4 SSE4.1 ld128 kernel on a 1x4 problem with
// k from 1 to 7 and a_stride = 11.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__SSE41_LD128, k_lt_8_strided_a) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k_size = 1; k_size < 8; ++k_size) {
    // A fresh tester is built for every k.
    auto tester = GemmMicrokernelTester();
    tester.mr(1).nr(4).kr(2).sr(4);
    tester.m(1).n(4).k(k_size);
    tester.a_stride(11);
    tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__sse41_ld128, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
  }
}
17319
// k_lt_8_subtile: 1x4c2s4 SSE4.1 ld128 kernel with k from 1 to 7, n from
// 1 to 4 and m = 1 (single-iteration m loop), one iteration each.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__SSE41_LD128, k_lt_8_subtile) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k_size = 1; k_size < 8; ++k_size) {
    for (uint32_t n_size = 1; n_size <= 4; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 1; ++m_size) {
        // A fresh tester is built for every (k, n, m) combination.
        auto tester = GemmMicrokernelTester();
        tester.mr(1).nr(4).kr(2).sr(4);
        tester.m(m_size).n(n_size).k(k_size);
        tester.iterations(1);
        tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__sse41_ld128, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
      }
    }
  }
}
17339
// k_gt_8: 1x4c2s4 SSE4.1 ld128 kernel on a 1x4 problem with k from 9 to 15.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__SSE41_LD128, k_gt_8) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k_size = 9; k_size < 16; ++k_size) {
    // A fresh tester is built for every k.
    auto tester = GemmMicrokernelTester();
    tester.mr(1).nr(4).kr(2).sr(4);
    tester.m(1).n(4).k(k_size);
    tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__sse41_ld128, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
  }
}
17354
// k_gt_8_strided_a: 1x4c2s4 SSE4.1 ld128 kernel on a 1x4 problem with
// k from 9 to 15 and a_stride = 19.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__SSE41_LD128, k_gt_8_strided_a) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k_size = 9; k_size < 16; ++k_size) {
    // A fresh tester is built for every k.
    auto tester = GemmMicrokernelTester();
    tester.mr(1).nr(4).kr(2).sr(4);
    tester.m(1).n(4).k(k_size);
    tester.a_stride(19);
    tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__sse41_ld128, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
  }
}
17370
// k_gt_8_subtile: 1x4c2s4 SSE4.1 ld128 kernel with k from 9 to 15, n from
// 1 to 4 and m = 1 (single-iteration m loop), one iteration each.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__SSE41_LD128, k_gt_8_subtile) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k_size = 9; k_size < 16; ++k_size) {
    for (uint32_t n_size = 1; n_size <= 4; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 1; ++m_size) {
        // A fresh tester is built for every (k, n, m) combination.
        auto tester = GemmMicrokernelTester();
        tester.mr(1).nr(4).kr(2).sr(4);
        tester.m(m_size).n(n_size).k(k_size);
        tester.iterations(1);
        tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__sse41_ld128, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
      }
    }
  }
}
17390
// k_div_8: 1x4c2s4 SSE4.1 ld128 kernel on a 1x4 problem with k taking the
// multiples of 8 from 16 to 80.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__SSE41_LD128, k_div_8) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k_size = 16; k_size <= 80; k_size += 8) {
    // A fresh tester is built for every k.
    auto tester = GemmMicrokernelTester();
    tester.mr(1).nr(4).kr(2).sr(4);
    tester.m(1).n(4).k(k_size);
    tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__sse41_ld128, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
  }
}
17405
// k_div_8_strided_a: 1x4c2s4 SSE4.1 ld128 kernel on a 1x4 problem with k
// taking the multiples of 8 from 16 to 80 and a_stride = 83.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__SSE41_LD128, k_div_8_strided_a) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k_size = 16; k_size <= 80; k_size += 8) {
    // A fresh tester is built for every k.
    auto tester = GemmMicrokernelTester();
    tester.mr(1).nr(4).kr(2).sr(4);
    tester.m(1).n(4).k(k_size);
    tester.a_stride(83);
    tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__sse41_ld128, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
  }
}
17421
// k_div_8_subtile: 1x4c2s4 SSE4.1 ld128 kernel with k taking the multiples
// of 8 from 16 to 80, n from 1 to 4 and m = 1 (single-iteration m loop),
// one iteration each.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__SSE41_LD128, k_div_8_subtile) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k_size = 16; k_size <= 80; k_size += 8) {
    for (uint32_t n_size = 1; n_size <= 4; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 1; ++m_size) {
        // A fresh tester is built for every (k, n, m) combination.
        auto tester = GemmMicrokernelTester();
        tester.mr(1).nr(4).kr(2).sr(4);
        tester.m(m_size).n(n_size).k(k_size);
        tester.iterations(1);
        tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__sse41_ld128, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
      }
    }
  }
}
17441
// n_gt_4: 1x4c2s4 SSE4.1 ld128 kernel with n from 5 to 7,
// k in {1, 10, 19, 28, 37} and m = 1.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__SSE41_LD128, n_gt_4) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t n_size = 5; n_size < 8; ++n_size) {
    for (size_t k_size = 1; k_size <= 40; k_size += 9) {
      // A fresh tester is built for every (n, k) combination.
      auto tester = GemmMicrokernelTester();
      tester.mr(1).nr(4).kr(2).sr(4);
      tester.m(1).n(n_size).k(k_size);
      tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__sse41_ld128, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
    }
  }
}
17458
// n_gt_4_strided_cn: 1x4c2s4 SSE4.1 ld128 kernel with n from 5 to 7,
// k in {1, 10, 19, 28, 37}, m = 1 and cn_stride = 7.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__SSE41_LD128, n_gt_4_strided_cn) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t n_size = 5; n_size < 8; ++n_size) {
    for (size_t k_size = 1; k_size <= 40; k_size += 9) {
      // A fresh tester is built for every (n, k) combination.
      auto tester = GemmMicrokernelTester();
      tester.mr(1).nr(4).kr(2).sr(4);
      tester.m(1).n(n_size).k(k_size);
      tester.cn_stride(7);
      tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__sse41_ld128, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
    }
  }
}
17476
// n_gt_4_strided_a: 1x4c2s4 SSE4.1 ld128 kernel with n from 5 to 7,
// k in {1, 10, 19, 28, 37}, m = 1 and a_stride = 43.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__SSE41_LD128, n_gt_4_strided_a) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t n_size = 5; n_size < 8; ++n_size) {
    for (size_t k_size = 1; k_size <= 40; k_size += 9) {
      // A fresh tester is built for every (n, k) combination.
      auto tester = GemmMicrokernelTester();
      tester.mr(1).nr(4).kr(2).sr(4);
      tester.m(1).n(n_size).k(k_size);
      tester.a_stride(43);
      tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__sse41_ld128, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
    }
  }
}
17494
// n_gt_4_subtile: 1x4c2s4 SSE4.1 ld128 kernel with n from 5 to 7,
// k in {1, 10, 19, 28, 37} and m = 1 (single-iteration m loop), one
// iteration each.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__SSE41_LD128, n_gt_4_subtile) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t n_size = 5; n_size < 8; ++n_size) {
    for (size_t k_size = 1; k_size <= 40; k_size += 9) {
      for (uint32_t m_size = 1; m_size <= 1; ++m_size) {
        // A fresh tester is built for every (n, k, m) combination.
        auto tester = GemmMicrokernelTester();
        tester.mr(1).nr(4).kr(2).sr(4);
        tester.m(m_size).n(n_size).k(k_size);
        tester.iterations(1);
        tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__sse41_ld128, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
      }
    }
  }
}
17514
// n_div_4: 1x4c2s4 SSE4.1 ld128 kernel with n in {8, 12},
// k in {1, 10, 19, 28, 37} and m = 1.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__SSE41_LD128, n_div_4) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t n_size = 8; n_size <= 12; n_size += 4) {
    for (size_t k_size = 1; k_size <= 40; k_size += 9) {
      // A fresh tester is built for every (n, k) combination.
      auto tester = GemmMicrokernelTester();
      tester.mr(1).nr(4).kr(2).sr(4);
      tester.m(1).n(n_size).k(k_size);
      tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__sse41_ld128, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
    }
  }
}
17531
// n_div_4_strided_cn: 1x4c2s4 SSE4.1 ld128 kernel with n in {8, 12},
// k in {1, 10, 19, 28, 37}, m = 1 and cn_stride = 7.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__SSE41_LD128, n_div_4_strided_cn) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t n_size = 8; n_size <= 12; n_size += 4) {
    for (size_t k_size = 1; k_size <= 40; k_size += 9) {
      // A fresh tester is built for every (n, k) combination.
      auto tester = GemmMicrokernelTester();
      tester.mr(1).nr(4).kr(2).sr(4);
      tester.m(1).n(n_size).k(k_size);
      tester.cn_stride(7);
      tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__sse41_ld128, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
    }
  }
}
17549
// n_div_4_strided_a: 1x4c2s4 SSE4.1 ld128 kernel with n in {8, 12},
// k in {1, 10, 19, 28, 37}, m = 1 and a_stride = 43.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__SSE41_LD128, n_div_4_strided_a) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t n_size = 8; n_size <= 12; n_size += 4) {
    for (size_t k_size = 1; k_size <= 40; k_size += 9) {
      // A fresh tester is built for every (n, k) combination.
      auto tester = GemmMicrokernelTester();
      tester.mr(1).nr(4).kr(2).sr(4);
      tester.m(1).n(n_size).k(k_size);
      tester.a_stride(43);
      tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__sse41_ld128, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
    }
  }
}
17567
// n_div_4_subtile: 1x4c2s4 SSE4.1 ld128 kernel with n in {8, 12},
// k in {1, 10, 19, 28, 37} and m = 1 (single-iteration m loop), one
// iteration each.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__SSE41_LD128, n_div_4_subtile) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t n_size = 8; n_size <= 12; n_size += 4) {
    for (size_t k_size = 1; k_size <= 40; k_size += 9) {
      for (uint32_t m_size = 1; m_size <= 1; ++m_size) {
        // A fresh tester is built for every (n, k, m) combination.
        auto tester = GemmMicrokernelTester();
        tester.mr(1).nr(4).kr(2).sr(4);
        tester.m(m_size).n(n_size).k(k_size);
        tester.iterations(1);
        tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__sse41_ld128, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
      }
    }
  }
}
17587
// strided_cm_subtile: 1x4c2s4 SSE4.1 ld128 kernel with
// k in {1, 10, 19, 28, 37}, n from 1 to 4, m = 1 (single-iteration m loop),
// cm_stride = 7 and one iteration each.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__SSE41_LD128, strided_cm_subtile) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k_size = 1; k_size <= 40; k_size += 9) {
    for (uint32_t n_size = 1; n_size <= 4; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 1; ++m_size) {
        // A fresh tester is built for every (k, n, m) combination.
        auto tester = GemmMicrokernelTester();
        tester.mr(1).nr(4).kr(2).sr(4);
        tester.m(m_size).n(n_size).k(k_size);
        tester.cm_stride(7);
        tester.iterations(1);
        tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__sse41_ld128, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
      }
    }
  }
}
17608
// qmin: 1x4c2s4 SSE4.1 ld128 kernel on a 1x4 problem with k = 8 and
// qmin = 128.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__SSE41_LD128, qmin) {
  TEST_REQUIRES_X86_SSE41;
  auto tester = GemmMicrokernelTester();
  tester.mr(1).nr(4).kr(2).sr(4);
  tester.m(1).n(4).k(8);
  tester.qmin(128);
  tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__sse41_ld128, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
}
17622
// qmax: 1x4c2s4 SSE4.1 ld128 kernel on a 1x4 problem with k = 8 and
// qmax = 128.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__SSE41_LD128, qmax) {
  TEST_REQUIRES_X86_SSE41;
  auto tester = GemmMicrokernelTester();
  tester.mr(1).nr(4).kr(2).sr(4);
  tester.m(1).n(4).k(8);
  tester.qmax(128);
  tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__sse41_ld128, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
}
17636
// strided_cm: 1x4c2s4 SSE4.1 ld128 kernel on a 1x4 problem with k = 8 and
// cm_stride = 7.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__SSE41_LD128, strided_cm) {
  TEST_REQUIRES_X86_SSE41;
  auto tester = GemmMicrokernelTester();
  tester.mr(1).nr(4).kr(2).sr(4);
  tester.m(1).n(4).k(8);
  tester.cm_stride(7);
  tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__sse41_ld128, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
}
17650
// no_a_zero_point: 1x4c2s4 SSE4.1 ld128 kernel on a 1x4 problem with
// k in {1, 10, 19, 28, 37} and a_zero_point = 0.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__SSE41_LD128, no_a_zero_point) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k_size = 1; k_size <= 40; k_size += 9) {
    // A fresh tester is built for every k.
    auto tester = GemmMicrokernelTester();
    tester.mr(1).nr(4).kr(2).sr(4);
    tester.m(1).n(4).k(k_size);
    tester.a_zero_point(0);
    tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__sse41_ld128, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
  }
}
17666
// no_b_zero_point: 1x4c2s4 SSE4.1 ld128 kernel on a 1x4 problem with
// k in {1, 10, 19, 28, 37} and b_zero_point = 0.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__SSE41_LD128, no_b_zero_point) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k_size = 1; k_size <= 40; k_size += 9) {
    // A fresh tester is built for every k.
    auto tester = GemmMicrokernelTester();
    tester.mr(1).nr(4).kr(2).sr(4);
    tester.m(1).n(4).k(k_size);
    tester.b_zero_point(0);
    tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__sse41_ld128, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
  }
}
17682
// no_zero_point: 1x4c2s4 SSE4.1 ld128 kernel on a 1x4 problem with
// k in {1, 10, 19, 28, 37} and both a_zero_point and b_zero_point set to 0.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__SSE41_LD128, no_zero_point) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k_size = 1; k_size <= 40; k_size += 9) {
    // A fresh tester is built for every k.
    auto tester = GemmMicrokernelTester();
    tester.mr(1).nr(4).kr(2).sr(4);
    tester.m(1).n(4).k(k_size);
    tester.a_zero_point(0);
    tester.b_zero_point(0);
    tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__sse41_ld128, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
  }
}
17699 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
17700
17701
17702 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// k_eq_8: 2x4c2s4 SSE2 ld128 kernel on a 2x4 problem with k = 8.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__SSE2_LD128, k_eq_8) {
  TEST_REQUIRES_X86_SSE2;
  auto tester = GemmMicrokernelTester();
  tester.mr(2).nr(4).kr(2).sr(4);
  tester.m(2).n(4).k(8);
  tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__sse2_ld128, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
}
17715
// strided_cn: 2x4c2s4 SSE2 ld128 kernel on a 2x4 problem with k = 8 and
// cn_stride = 7.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__SSE2_LD128, strided_cn) {
  TEST_REQUIRES_X86_SSE2;
  auto tester = GemmMicrokernelTester();
  tester.mr(2).nr(4).kr(2).sr(4);
  tester.m(2).n(4).k(8);
  tester.cn_stride(7);
  tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__sse2_ld128, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
}
17729
// k_eq_8_strided_a: 2x4c2s4 SSE2 ld128 kernel on a 2x4 problem with k = 8
// and a_stride = 11.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__SSE2_LD128, k_eq_8_strided_a) {
  TEST_REQUIRES_X86_SSE2;
  auto tester = GemmMicrokernelTester();
  tester.mr(2).nr(4).kr(2).sr(4);
  tester.m(2).n(4).k(8);
  tester.a_stride(11);
  tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__sse2_ld128, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
}
17743
// k_eq_8_subtile: 2x4c2s4 SSE2 ld128 kernel with k = 8, n from 1 to 4 and
// m from 1 to 2, one iteration each.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__SSE2_LD128, k_eq_8_subtile) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t n_size = 1; n_size <= 4; ++n_size) {
    for (uint32_t m_size = 1; m_size <= 2; ++m_size) {
      // A fresh tester is built for every (n, m) combination.
      auto tester = GemmMicrokernelTester();
      tester.mr(2).nr(4).kr(2).sr(4);
      tester.m(m_size).n(n_size).k(8);
      tester.iterations(1);
      tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__sse2_ld128, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
    }
  }
}
17761
// k_eq_8_subtile_m: 2x4c2s4 SSE2 ld128 kernel with k = 8, n = 4 and
// m from 1 to 2, one iteration each.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__SSE2_LD128, k_eq_8_subtile_m) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t m_size = 1; m_size <= 2; ++m_size) {
    // A fresh tester is built for every m.
    auto tester = GemmMicrokernelTester();
    tester.mr(2).nr(4).kr(2).sr(4);
    tester.m(m_size).n(4).k(8);
    tester.iterations(1);
    tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__sse2_ld128, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
  }
}
17777
// k_eq_8_subtile_n: 2x4c2s4 SSE2 ld128 kernel with k = 8, m = 2 and
// n from 1 to 4, one iteration each.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__SSE2_LD128, k_eq_8_subtile_n) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t n_size = 1; n_size <= 4; ++n_size) {
    // A fresh tester is built for every n.
    auto tester = GemmMicrokernelTester();
    tester.mr(2).nr(4).kr(2).sr(4);
    tester.m(2).n(n_size).k(8);
    tester.iterations(1);
    tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__sse2_ld128, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
  }
}
17793
// k_lt_8: 2x4c2s4 SSE2 ld128 kernel on a 2x4 problem with k from 1 to 7.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__SSE2_LD128, k_lt_8) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t k_size = 1; k_size < 8; ++k_size) {
    // A fresh tester is built for every k.
    auto tester = GemmMicrokernelTester();
    tester.mr(2).nr(4).kr(2).sr(4);
    tester.m(2).n(4).k(k_size);
    tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__sse2_ld128, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
  }
}
17808
// k_lt_8_strided_a: 2x4c2s4 SSE2 ld128 kernel on a 2x4 problem with
// k from 1 to 7 and a_stride = 11.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__SSE2_LD128, k_lt_8_strided_a) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t k_size = 1; k_size < 8; ++k_size) {
    // A fresh tester is built for every k.
    auto tester = GemmMicrokernelTester();
    tester.mr(2).nr(4).kr(2).sr(4);
    tester.m(2).n(4).k(k_size);
    tester.a_stride(11);
    tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__sse2_ld128, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
  }
}
17824
// k_lt_8_subtile: 2x4c2s4 SSE2 ld128 kernel with k from 1 to 7, n from
// 1 to 4 and m from 1 to 2, one iteration each.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__SSE2_LD128, k_lt_8_subtile) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t k_size = 1; k_size < 8; ++k_size) {
    for (uint32_t n_size = 1; n_size <= 4; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 2; ++m_size) {
        // A fresh tester is built for every (k, n, m) combination.
        auto tester = GemmMicrokernelTester();
        tester.mr(2).nr(4).kr(2).sr(4);
        tester.m(m_size).n(n_size).k(k_size);
        tester.iterations(1);
        tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__sse2_ld128, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
      }
    }
  }
}
17844
// k_gt_8: 2x4c2s4 SSE2 ld128 kernel on a 2x4 problem with k from 9 to 15.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__SSE2_LD128, k_gt_8) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t k_size = 9; k_size < 16; ++k_size) {
    // A fresh tester is built for every k.
    auto tester = GemmMicrokernelTester();
    tester.mr(2).nr(4).kr(2).sr(4);
    tester.m(2).n(4).k(k_size);
    tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__sse2_ld128, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
  }
}
17859
// k_gt_8_strided_a: 2x4c2s4 SSE2 ld128 kernel on a 2x4 problem with
// k from 9 to 15 and a_stride = 19.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__SSE2_LD128, k_gt_8_strided_a) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t k_size = 9; k_size < 16; ++k_size) {
    // A fresh tester is built for every k.
    auto tester = GemmMicrokernelTester();
    tester.mr(2).nr(4).kr(2).sr(4);
    tester.m(2).n(4).k(k_size);
    tester.a_stride(19);
    tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__sse2_ld128, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
  }
}
17875
// k_gt_8_subtile: 2x4c2s4 SSE2 ld128 kernel with k from 9 to 15, n from
// 1 to 4 and m from 1 to 2, one iteration each.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__SSE2_LD128, k_gt_8_subtile) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t k_size = 9; k_size < 16; ++k_size) {
    for (uint32_t n_size = 1; n_size <= 4; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 2; ++m_size) {
        // A fresh tester is built for every (k, n, m) combination.
        auto tester = GemmMicrokernelTester();
        tester.mr(2).nr(4).kr(2).sr(4);
        tester.m(m_size).n(n_size).k(k_size);
        tester.iterations(1);
        tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__sse2_ld128, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
      }
    }
  }
}
17895
// k_div_8: 2x4c2s4 SSE2 ld128 kernel on a 2x4 problem with k taking the
// multiples of 8 from 16 to 80.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__SSE2_LD128, k_div_8) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t k_size = 16; k_size <= 80; k_size += 8) {
    // A fresh tester is built for every k.
    auto tester = GemmMicrokernelTester();
    tester.mr(2).nr(4).kr(2).sr(4);
    tester.m(2).n(4).k(k_size);
    tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__sse2_ld128, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
  }
}
17910
// k_div_8_strided_a: 2x4c2s4 SSE2 ld128 kernel on a 2x4 problem with k
// taking the multiples of 8 from 16 to 80 and a_stride = 83.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__SSE2_LD128, k_div_8_strided_a) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t k_size = 16; k_size <= 80; k_size += 8) {
    // A fresh tester is built for every k.
    auto tester = GemmMicrokernelTester();
    tester.mr(2).nr(4).kr(2).sr(4);
    tester.m(2).n(4).k(k_size);
    tester.a_stride(83);
    tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__sse2_ld128, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
  }
}
17926
// k_div_8_subtile: 2x4c2s4 SSE2 ld128 kernel with k taking the multiples of
// 8 from 16 to 80, n from 1 to 4 and m from 1 to 2, one iteration each.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__SSE2_LD128, k_div_8_subtile) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t k_size = 16; k_size <= 80; k_size += 8) {
    for (uint32_t n_size = 1; n_size <= 4; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 2; ++m_size) {
        // A fresh tester is built for every (k, n, m) combination.
        auto tester = GemmMicrokernelTester();
        tester.mr(2).nr(4).kr(2).sr(4);
        tester.m(m_size).n(n_size).k(k_size);
        tester.iterations(1);
        tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__sse2_ld128, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
      }
    }
  }
}
17946
// n_gt_4: 2x4c2s4 SSE2 ld128 kernel with n from 5 to 7,
// k in {1, 10, 19, 28, 37} and m = 2.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__SSE2_LD128, n_gt_4) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t n_size = 5; n_size < 8; ++n_size) {
    for (size_t k_size = 1; k_size <= 40; k_size += 9) {
      // A fresh tester is built for every (n, k) combination.
      auto tester = GemmMicrokernelTester();
      tester.mr(2).nr(4).kr(2).sr(4);
      tester.m(2).n(n_size).k(k_size);
      tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__sse2_ld128, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
    }
  }
}
17963
// n_gt_4_strided_cn: 2x4c2s4 SSE2 ld128 kernel with n from 5 to 7,
// k in {1, 10, 19, 28, 37}, m = 2 and cn_stride = 7.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__SSE2_LD128, n_gt_4_strided_cn) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t n_size = 5; n_size < 8; ++n_size) {
    for (size_t k_size = 1; k_size <= 40; k_size += 9) {
      // A fresh tester is built for every (n, k) combination.
      auto tester = GemmMicrokernelTester();
      tester.mr(2).nr(4).kr(2).sr(4);
      tester.m(2).n(n_size).k(k_size);
      tester.cn_stride(7);
      tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__sse2_ld128, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
    }
  }
}
17981
// n_gt_4_strided_a: 2x4c2s4 SSE2 ld128 kernel with n from 5 to 7,
// k in {1, 10, 19, 28, 37}, m = 2 and a_stride = 43.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__SSE2_LD128, n_gt_4_strided_a) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t n_size = 5; n_size < 8; ++n_size) {
    for (size_t k_size = 1; k_size <= 40; k_size += 9) {
      // A fresh tester is built for every (n, k) combination.
      auto tester = GemmMicrokernelTester();
      tester.mr(2).nr(4).kr(2).sr(4);
      tester.m(2).n(n_size).k(k_size);
      tester.a_stride(43);
      tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__sse2_ld128, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
    }
  }
}
17999
// n_gt_4_subtile: 2x4c2s4 SSE2 ld128 kernel with n from 5 to 7,
// k in {1, 10, 19, 28, 37} and m from 1 to 2, one iteration each.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__SSE2_LD128, n_gt_4_subtile) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t n_size = 5; n_size < 8; ++n_size) {
    for (size_t k_size = 1; k_size <= 40; k_size += 9) {
      for (uint32_t m_size = 1; m_size <= 2; ++m_size) {
        // A fresh tester is built for every (n, k, m) combination.
        auto tester = GemmMicrokernelTester();
        tester.mr(2).nr(4).kr(2).sr(4);
        tester.m(m_size).n(n_size).k(k_size);
        tester.iterations(1);
        tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__sse2_ld128, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
      }
    }
  }
}
18019
// n_div_4: 2x4c2s4 SSE2 ld128 kernel with n in {8, 12},
// k in {1, 10, 19, 28, 37} and m = 2.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__SSE2_LD128, n_div_4) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t n_size = 8; n_size <= 12; n_size += 4) {
    for (size_t k_size = 1; k_size <= 40; k_size += 9) {
      // A fresh tester is built for every (n, k) combination.
      auto tester = GemmMicrokernelTester();
      tester.mr(2).nr(4).kr(2).sr(4);
      tester.m(2).n(n_size).k(k_size);
      tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__sse2_ld128, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
    }
  }
}
18036
// n_div_4_strided_cn: 2x4c2s4 SSE2 ld128 kernel with n in {8, 12},
// k in {1, 10, 19, 28, 37}, m = 2 and cn_stride = 7.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__SSE2_LD128, n_div_4_strided_cn) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t n_size = 8; n_size <= 12; n_size += 4) {
    for (size_t k_size = 1; k_size <= 40; k_size += 9) {
      // A fresh tester is built for every (n, k) combination.
      auto tester = GemmMicrokernelTester();
      tester.mr(2).nr(4).kr(2).sr(4);
      tester.m(2).n(n_size).k(k_size);
      tester.cn_stride(7);
      tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__sse2_ld128, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
    }
  }
}
18054
// n_div_4_strided_a: 2x4c2s4 SSE2 ld128 kernel with n in {8, 12},
// k in {1, 10, 19, 28, 37}, m = 2 and a_stride = 43.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__SSE2_LD128, n_div_4_strided_a) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t n_size = 8; n_size <= 12; n_size += 4) {
    for (size_t k_size = 1; k_size <= 40; k_size += 9) {
      // A fresh tester is built for every (n, k) combination.
      auto tester = GemmMicrokernelTester();
      tester.mr(2).nr(4).kr(2).sr(4);
      tester.m(2).n(n_size).k(k_size);
      tester.a_stride(43);
      tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__sse2_ld128, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
    }
  }
}
18072
// n_div_4_subtile: 2x4c2s4 SSE2 ld128 kernel with n in {8, 12},
// k in {1, 10, 19, 28, 37} and m from 1 to 2, one iteration each.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__SSE2_LD128, n_div_4_subtile) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t n_size = 8; n_size <= 12; n_size += 4) {
    for (size_t k_size = 1; k_size <= 40; k_size += 9) {
      for (uint32_t m_size = 1; m_size <= 2; ++m_size) {
        // A fresh tester is built for every (n, k, m) combination.
        auto tester = GemmMicrokernelTester();
        tester.mr(2).nr(4).kr(2).sr(4);
        tester.m(m_size).n(n_size).k(k_size);
        tester.iterations(1);
        tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__sse2_ld128, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
      }
    }
  }
}
18092
// strided_cm_subtile: 2x4c2s4 SSE2 ld128 kernel with
// k in {1, 10, 19, 28, 37}, n from 1 to 4, m from 1 to 2, cm_stride = 7 and
// one iteration each.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__SSE2_LD128, strided_cm_subtile) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t k_size = 1; k_size <= 40; k_size += 9) {
    for (uint32_t n_size = 1; n_size <= 4; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 2; ++m_size) {
        // A fresh tester is built for every (k, n, m) combination.
        auto tester = GemmMicrokernelTester();
        tester.mr(2).nr(4).kr(2).sr(4);
        tester.m(m_size).n(n_size).k(k_size);
        tester.cm_stride(7);
        tester.iterations(1);
        tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__sse2_ld128, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
      }
    }
  }
}
18113
// qmin: 2x4c2s4 SSE2 ld128 kernel on a 2x4 problem with k = 8 and
// qmin = 128.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__SSE2_LD128, qmin) {
  TEST_REQUIRES_X86_SSE2;
  auto tester = GemmMicrokernelTester();
  tester.mr(2).nr(4).kr(2).sr(4);
  tester.m(2).n(4).k(8);
  tester.qmin(128);
  tester.Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__sse2_ld128, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
}
18127
// Single run at m = 2, n = 4, k = 8 with qmax(128).
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__SSE2_LD128, qmax) {
  TEST_REQUIRES_X86_SSE2;
  GemmMicrokernelTester tester;
  tester
    .mr(2).nr(4).kr(2).sr(4)
    .m(2).n(4).k(8)
    .qmax(128)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__sse2_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
18141
// Single run at m = 2, n = 4, k = 8 with cm_stride(7).
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__SSE2_LD128, strided_cm) {
  TEST_REQUIRES_X86_SSE2;
  GemmMicrokernelTester tester;
  tester
    .mr(2).nr(4).kr(2).sr(4)
    .m(2).n(4).k(8)
    .cm_stride(7)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__sse2_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
18155
// Sweeps k over {1, 10, 19, 28, 37} at m = 2, n = 4 with a_zero_point(0).
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__SSE2_LD128, no_a_zero_point) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
    GemmMicrokernelTester tester;
    tester
      .mr(2).nr(4).kr(2).sr(4)
      .m(2).n(4).k(k_dim)
      .a_zero_point(0)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__sse2_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
18171
// Sweeps k over {1, 10, 19, 28, 37} at m = 2, n = 4 with b_zero_point(0).
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__SSE2_LD128, no_b_zero_point) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
    GemmMicrokernelTester tester;
    tester
      .mr(2).nr(4).kr(2).sr(4)
      .m(2).n(4).k(k_dim)
      .b_zero_point(0)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__sse2_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
18187
// Sweeps k over {1, 10, 19, 28, 37} at m = 2, n = 4 with both a_zero_point(0)
// and b_zero_point(0).
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__SSE2_LD128, no_zero_point) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
    GemmMicrokernelTester tester;
    tester
      .mr(2).nr(4).kr(2).sr(4)
      .m(2).n(4).k(k_dim)
      .a_zero_point(0)
      .b_zero_point(0)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__sse2_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
18204 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
18205
18206
18207 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Single run at m = 2, n = 4, k = 8.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__SSE41_LD128, k_eq_8) {
  TEST_REQUIRES_X86_SSE41;
  GemmMicrokernelTester tester;
  tester
    .mr(2).nr(4).kr(2).sr(4)
    .m(2).n(4).k(8)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__sse41_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
18220
// Single run at m = 2, n = 4, k = 8 with cn_stride(7).
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__SSE41_LD128, strided_cn) {
  TEST_REQUIRES_X86_SSE41;
  GemmMicrokernelTester tester;
  tester
    .mr(2).nr(4).kr(2).sr(4)
    .m(2).n(4).k(8)
    .cn_stride(7)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__sse41_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
18234
// Single run at m = 2, n = 4, k = 8 with a_stride(11).
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__SSE41_LD128, k_eq_8_strided_a) {
  TEST_REQUIRES_X86_SSE41;
  GemmMicrokernelTester tester;
  tester
    .mr(2).nr(4).kr(2).sr(4)
    .m(2).n(4).k(8)
    .a_stride(11)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__sse41_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
18248
// Sweeps n over 1..4 and m over 1..2 at k = 8, one iteration each.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__SSE41_LD128, k_eq_8_subtile) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t n_dim = 1; n_dim <= 4; ++n_dim) {
    for (uint32_t m_dim = 1; m_dim <= 2; ++m_dim) {
      GemmMicrokernelTester tester;
      tester
        .mr(2).nr(4).kr(2).sr(4)
        .m(m_dim).n(n_dim).k(8)
        .iterations(1)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__sse41_ld128,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
18266
// Sweeps m over 1..2 at n = 4, k = 8, one iteration each.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__SSE41_LD128, k_eq_8_subtile_m) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t m_dim = 1; m_dim <= 2; ++m_dim) {
    GemmMicrokernelTester tester;
    tester
      .mr(2).nr(4).kr(2).sr(4)
      .m(m_dim).n(4).k(8)
      .iterations(1)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__sse41_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
18282
// Sweeps n over 1..4 at m = 2, k = 8, one iteration each.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__SSE41_LD128, k_eq_8_subtile_n) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t n_dim = 1; n_dim <= 4; ++n_dim) {
    GemmMicrokernelTester tester;
    tester
      .mr(2).nr(4).kr(2).sr(4)
      .m(2).n(n_dim).k(8)
      .iterations(1)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__sse41_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
18298
// Sweeps k over 1..7 at m = 2, n = 4.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__SSE41_LD128, k_lt_8) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k_dim = 1; k_dim < 8; ++k_dim) {
    GemmMicrokernelTester tester;
    tester
      .mr(2).nr(4).kr(2).sr(4)
      .m(2).n(4).k(k_dim)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__sse41_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
18313
// Sweeps k over 1..7 at m = 2, n = 4 with a_stride(11).
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__SSE41_LD128, k_lt_8_strided_a) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k_dim = 1; k_dim < 8; ++k_dim) {
    GemmMicrokernelTester tester;
    tester
      .mr(2).nr(4).kr(2).sr(4)
      .m(2).n(4).k(k_dim)
      .a_stride(11)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__sse41_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
18329
// Sweeps k over 1..7, n over 1..4 and m over 1..2, one iteration each.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__SSE41_LD128, k_lt_8_subtile) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k_dim = 1; k_dim < 8; ++k_dim) {
    for (uint32_t n_dim = 1; n_dim <= 4; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 2; ++m_dim) {
        GemmMicrokernelTester tester;
        tester
          .mr(2).nr(4).kr(2).sr(4)
          .m(m_dim).n(n_dim).k(k_dim)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__sse41_ld128,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
18349
// Sweeps k over 9..15 at m = 2, n = 4.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__SSE41_LD128, k_gt_8) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k_dim = 9; k_dim < 16; ++k_dim) {
    GemmMicrokernelTester tester;
    tester
      .mr(2).nr(4).kr(2).sr(4)
      .m(2).n(4).k(k_dim)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__sse41_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
18364
// Sweeps k over 9..15 at m = 2, n = 4 with a_stride(19).
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__SSE41_LD128, k_gt_8_strided_a) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k_dim = 9; k_dim < 16; ++k_dim) {
    GemmMicrokernelTester tester;
    tester
      .mr(2).nr(4).kr(2).sr(4)
      .m(2).n(4).k(k_dim)
      .a_stride(19)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__sse41_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
18380
// Sweeps k over 9..15, n over 1..4 and m over 1..2, one iteration each.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__SSE41_LD128, k_gt_8_subtile) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k_dim = 9; k_dim < 16; ++k_dim) {
    for (uint32_t n_dim = 1; n_dim <= 4; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 2; ++m_dim) {
        GemmMicrokernelTester tester;
        tester
          .mr(2).nr(4).kr(2).sr(4)
          .m(m_dim).n(n_dim).k(k_dim)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__sse41_ld128,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
18400
// Sweeps k over 16, 24, ..., 80 at m = 2, n = 4.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__SSE41_LD128, k_div_8) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k_dim = 16; k_dim <= 80; k_dim += 8) {
    GemmMicrokernelTester tester;
    tester
      .mr(2).nr(4).kr(2).sr(4)
      .m(2).n(4).k(k_dim)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__sse41_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
18415
// Sweeps k over 16, 24, ..., 80 at m = 2, n = 4 with a_stride(83).
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__SSE41_LD128, k_div_8_strided_a) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k_dim = 16; k_dim <= 80; k_dim += 8) {
    GemmMicrokernelTester tester;
    tester
      .mr(2).nr(4).kr(2).sr(4)
      .m(2).n(4).k(k_dim)
      .a_stride(83)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__sse41_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
18431
// Sweeps k over 16, 24, ..., 80, n over 1..4 and m over 1..2, one iteration each.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__SSE41_LD128, k_div_8_subtile) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k_dim = 16; k_dim <= 80; k_dim += 8) {
    for (uint32_t n_dim = 1; n_dim <= 4; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 2; ++m_dim) {
        GemmMicrokernelTester tester;
        tester
          .mr(2).nr(4).kr(2).sr(4)
          .m(m_dim).n(n_dim).k(k_dim)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__sse41_ld128,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
18451
// Sweeps n over 5..7 and k over {1, 10, 19, 28, 37} at m = 2.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__SSE41_LD128, n_gt_4) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t n_dim = 5; n_dim < 8; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      GemmMicrokernelTester tester;
      tester
        .mr(2).nr(4).kr(2).sr(4)
        .m(2).n(n_dim).k(k_dim)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__sse41_ld128,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
18468
// Sweeps n over 5..7 and k over {1, 10, 19, 28, 37} at m = 2 with cn_stride(7).
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__SSE41_LD128, n_gt_4_strided_cn) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t n_dim = 5; n_dim < 8; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      GemmMicrokernelTester tester;
      tester
        .mr(2).nr(4).kr(2).sr(4)
        .m(2).n(n_dim).k(k_dim)
        .cn_stride(7)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__sse41_ld128,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
18486
// Sweeps n over 5..7 and k over {1, 10, 19, 28, 37} at m = 2 with a_stride(43).
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__SSE41_LD128, n_gt_4_strided_a) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t n_dim = 5; n_dim < 8; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      GemmMicrokernelTester tester;
      tester
        .mr(2).nr(4).kr(2).sr(4)
        .m(2).n(n_dim).k(k_dim)
        .a_stride(43)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__sse41_ld128,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
18504
// Sweeps n over 5..7, k over {1, 10, 19, 28, 37} and m over 1..2, one iteration each.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__SSE41_LD128, n_gt_4_subtile) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t n_dim = 5; n_dim < 8; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      for (uint32_t m_dim = 1; m_dim <= 2; ++m_dim) {
        GemmMicrokernelTester tester;
        tester
          .mr(2).nr(4).kr(2).sr(4)
          .m(m_dim).n(n_dim).k(k_dim)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__sse41_ld128,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
18524
// Sweeps n over {8, 12} and k over {1, 10, 19, 28, 37} at m = 2.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__SSE41_LD128, n_div_4) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t n_dim = 8; n_dim <= 12; n_dim += 4) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      GemmMicrokernelTester tester;
      tester
        .mr(2).nr(4).kr(2).sr(4)
        .m(2).n(n_dim).k(k_dim)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__sse41_ld128,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
18541
// Sweeps n over {8, 12} and k over {1, 10, 19, 28, 37} at m = 2 with cn_stride(7).
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__SSE41_LD128, n_div_4_strided_cn) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t n_dim = 8; n_dim <= 12; n_dim += 4) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      GemmMicrokernelTester tester;
      tester
        .mr(2).nr(4).kr(2).sr(4)
        .m(2).n(n_dim).k(k_dim)
        .cn_stride(7)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__sse41_ld128,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
18559
// Sweeps n over {8, 12} and k over {1, 10, 19, 28, 37} at m = 2 with a_stride(43).
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__SSE41_LD128, n_div_4_strided_a) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t n_dim = 8; n_dim <= 12; n_dim += 4) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      GemmMicrokernelTester tester;
      tester
        .mr(2).nr(4).kr(2).sr(4)
        .m(2).n(n_dim).k(k_dim)
        .a_stride(43)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__sse41_ld128,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
18577
// Sweeps n over {8, 12}, k over {1, 10, 19, 28, 37} and m over 1..2, one iteration each.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__SSE41_LD128, n_div_4_subtile) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t n_dim = 8; n_dim <= 12; n_dim += 4) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      for (uint32_t m_dim = 1; m_dim <= 2; ++m_dim) {
        GemmMicrokernelTester tester;
        tester
          .mr(2).nr(4).kr(2).sr(4)
          .m(m_dim).n(n_dim).k(k_dim)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__sse41_ld128,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
18597
// Sweeps k over {1, 10, 19, 28, 37}, n over 1..4 and m over 1..2 with cm_stride(7),
// one iteration each.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__SSE41_LD128, strided_cm_subtile) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
    for (uint32_t n_dim = 1; n_dim <= 4; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 2; ++m_dim) {
        GemmMicrokernelTester tester;
        tester
          .mr(2).nr(4).kr(2).sr(4)
          .m(m_dim).n(n_dim).k(k_dim)
          .cm_stride(7)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__sse41_ld128,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
18618
// Single run at m = 2, n = 4, k = 8 with qmin(128).
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__SSE41_LD128, qmin) {
  TEST_REQUIRES_X86_SSE41;
  GemmMicrokernelTester tester;
  tester
    .mr(2).nr(4).kr(2).sr(4)
    .m(2).n(4).k(8)
    .qmin(128)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__sse41_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
18632
// Single run at m = 2, n = 4, k = 8 with qmax(128).
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__SSE41_LD128, qmax) {
  TEST_REQUIRES_X86_SSE41;
  GemmMicrokernelTester tester;
  tester
    .mr(2).nr(4).kr(2).sr(4)
    .m(2).n(4).k(8)
    .qmax(128)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__sse41_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
18646
// Single run at m = 2, n = 4, k = 8 with cm_stride(7).
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__SSE41_LD128, strided_cm) {
  TEST_REQUIRES_X86_SSE41;
  GemmMicrokernelTester tester;
  tester
    .mr(2).nr(4).kr(2).sr(4)
    .m(2).n(4).k(8)
    .cm_stride(7)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__sse41_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
18660
// Sweeps k over {1, 10, 19, 28, 37} at m = 2, n = 4 with a_zero_point(0).
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__SSE41_LD128, no_a_zero_point) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
    GemmMicrokernelTester tester;
    tester
      .mr(2).nr(4).kr(2).sr(4)
      .m(2).n(4).k(k_dim)
      .a_zero_point(0)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__sse41_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
18676
// Sweeps k over {1, 10, 19, 28, 37} at m = 2, n = 4 with b_zero_point(0).
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__SSE41_LD128, no_b_zero_point) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
    GemmMicrokernelTester tester;
    tester
      .mr(2).nr(4).kr(2).sr(4)
      .m(2).n(4).k(k_dim)
      .b_zero_point(0)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__sse41_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
18692
// Sweeps k over {1, 10, 19, 28, 37} at m = 2, n = 4 with both a_zero_point(0)
// and b_zero_point(0).
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__SSE41_LD128, no_zero_point) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
    GemmMicrokernelTester tester;
    tester
      .mr(2).nr(4).kr(2).sr(4)
      .m(2).n(4).k(k_dim)
      .a_zero_point(0)
      .b_zero_point(0)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__sse41_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
18709 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
18710
18711
18712 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Single run at m = 3, n = 4, k = 8.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__SSE2_LD128, k_eq_8) {
  TEST_REQUIRES_X86_SSE2;
  GemmMicrokernelTester tester;
  tester
    .mr(3).nr(4).kr(2).sr(4)
    .m(3).n(4).k(8)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__sse2_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
18725
// Single run at m = 3, n = 4, k = 8 with cn_stride(7).
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__SSE2_LD128, strided_cn) {
  TEST_REQUIRES_X86_SSE2;
  GemmMicrokernelTester tester;
  tester
    .mr(3).nr(4).kr(2).sr(4)
    .m(3).n(4).k(8)
    .cn_stride(7)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__sse2_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
18739
// Single run at m = 3, n = 4, k = 8 with a_stride(11).
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__SSE2_LD128, k_eq_8_strided_a) {
  TEST_REQUIRES_X86_SSE2;
  GemmMicrokernelTester tester;
  tester
    .mr(3).nr(4).kr(2).sr(4)
    .m(3).n(4).k(8)
    .a_stride(11)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__sse2_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
18753
// Sweeps n over 1..4 and m over 1..3 at k = 8, one iteration each.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__SSE2_LD128, k_eq_8_subtile) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t n_dim = 1; n_dim <= 4; ++n_dim) {
    for (uint32_t m_dim = 1; m_dim <= 3; ++m_dim) {
      GemmMicrokernelTester tester;
      tester
        .mr(3).nr(4).kr(2).sr(4)
        .m(m_dim).n(n_dim).k(8)
        .iterations(1)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__sse2_ld128,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
18771
// Sweeps m over 1..3 at n = 4, k = 8, one iteration each.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__SSE2_LD128, k_eq_8_subtile_m) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t m_dim = 1; m_dim <= 3; ++m_dim) {
    GemmMicrokernelTester tester;
    tester
      .mr(3).nr(4).kr(2).sr(4)
      .m(m_dim).n(4).k(8)
      .iterations(1)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__sse2_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
18787
// Sweeps n over 1..4 at m = 3, k = 8, one iteration each.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__SSE2_LD128, k_eq_8_subtile_n) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t n_dim = 1; n_dim <= 4; ++n_dim) {
    GemmMicrokernelTester tester;
    tester
      .mr(3).nr(4).kr(2).sr(4)
      .m(3).n(n_dim).k(8)
      .iterations(1)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__sse2_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
18803
// Sweeps k over 1..7 at m = 3, n = 4.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__SSE2_LD128, k_lt_8) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t k_dim = 1; k_dim < 8; ++k_dim) {
    GemmMicrokernelTester tester;
    tester
      .mr(3).nr(4).kr(2).sr(4)
      .m(3).n(4).k(k_dim)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__sse2_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
18818
// Sweeps k over 1..7 at m = 3, n = 4 with a_stride(11).
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__SSE2_LD128, k_lt_8_strided_a) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t k_dim = 1; k_dim < 8; ++k_dim) {
    GemmMicrokernelTester tester;
    tester
      .mr(3).nr(4).kr(2).sr(4)
      .m(3).n(4).k(k_dim)
      .a_stride(11)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__sse2_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
18834
// Sweeps k over 1..7, n over 1..4 and m over 1..3, one iteration each.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__SSE2_LD128, k_lt_8_subtile) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t k_dim = 1; k_dim < 8; ++k_dim) {
    for (uint32_t n_dim = 1; n_dim <= 4; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 3; ++m_dim) {
        GemmMicrokernelTester tester;
        tester
          .mr(3).nr(4).kr(2).sr(4)
          .m(m_dim).n(n_dim).k(k_dim)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__sse2_ld128,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
18854
// Sweeps k over 9..15 at m = 3, n = 4.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__SSE2_LD128, k_gt_8) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t k_dim = 9; k_dim < 16; ++k_dim) {
    GemmMicrokernelTester tester;
    tester
      .mr(3).nr(4).kr(2).sr(4)
      .m(3).n(4).k(k_dim)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__sse2_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
18869
// Sweeps k over 9..15 at m = 3, n = 4 with a_stride(19).
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__SSE2_LD128, k_gt_8_strided_a) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t k_dim = 9; k_dim < 16; ++k_dim) {
    GemmMicrokernelTester tester;
    tester
      .mr(3).nr(4).kr(2).sr(4)
      .m(3).n(4).k(k_dim)
      .a_stride(19)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__sse2_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
18885
// Sweeps k over 9..15, n over 1..4 and m over 1..3, one iteration each.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__SSE2_LD128, k_gt_8_subtile) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t k_dim = 9; k_dim < 16; ++k_dim) {
    for (uint32_t n_dim = 1; n_dim <= 4; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 3; ++m_dim) {
        GemmMicrokernelTester tester;
        tester
          .mr(3).nr(4).kr(2).sr(4)
          .m(m_dim).n(n_dim).k(k_dim)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__sse2_ld128,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
18905
// Sweeps k over 16, 24, ..., 80 at m = 3, n = 4.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__SSE2_LD128, k_div_8) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t k_dim = 16; k_dim <= 80; k_dim += 8) {
    GemmMicrokernelTester tester;
    tester
      .mr(3).nr(4).kr(2).sr(4)
      .m(3).n(4).k(k_dim)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__sse2_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
18920
// Sweeps k over 16, 24, ..., 80 at m = 3, n = 4 with a_stride(83).
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__SSE2_LD128, k_div_8_strided_a) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t k_dim = 16; k_dim <= 80; k_dim += 8) {
    GemmMicrokernelTester tester;
    tester
      .mr(3).nr(4).kr(2).sr(4)
      .m(3).n(4).k(k_dim)
      .a_stride(83)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__sse2_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
18936
// Sweeps k over 16, 24, ..., 80, n over 1..4 and m over 1..3, one iteration each.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__SSE2_LD128, k_div_8_subtile) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t k_dim = 16; k_dim <= 80; k_dim += 8) {
    for (uint32_t n_dim = 1; n_dim <= 4; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 3; ++m_dim) {
        GemmMicrokernelTester tester;
        tester
          .mr(3).nr(4).kr(2).sr(4)
          .m(m_dim).n(n_dim).k(k_dim)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__sse2_ld128,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
18956
// Sweeps n over 5..7 and k over {1, 10, 19, 28, 37} at m = 3.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__SSE2_LD128, n_gt_4) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t n_dim = 5; n_dim < 8; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      GemmMicrokernelTester tester;
      tester
        .mr(3).nr(4).kr(2).sr(4)
        .m(3).n(n_dim).k(k_dim)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__sse2_ld128,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
18973
// Sweeps n over 5..7 and k over {1, 10, 19, 28, 37} at m = 3 with cn_stride(7).
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__SSE2_LD128, n_gt_4_strided_cn) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t n_dim = 5; n_dim < 8; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      GemmMicrokernelTester tester;
      tester
        .mr(3).nr(4).kr(2).sr(4)
        .m(3).n(n_dim).k(k_dim)
        .cn_stride(7)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__sse2_ld128,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
18991
// Sweeps n over 5..7 and k over {1, 10, 19, 28, 37} at m = 3 with a_stride(43).
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__SSE2_LD128, n_gt_4_strided_a) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t n_dim = 5; n_dim < 8; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      GemmMicrokernelTester tester;
      tester
        .mr(3).nr(4).kr(2).sr(4)
        .m(3).n(n_dim).k(k_dim)
        .a_stride(43)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__sse2_ld128,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
19009
// Sweeps n over 5..7, k over {1, 10, 19, 28, 37} and m over 1..3, one iteration each.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__SSE2_LD128, n_gt_4_subtile) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t n_dim = 5; n_dim < 8; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      for (uint32_t m_dim = 1; m_dim <= 3; ++m_dim) {
        GemmMicrokernelTester tester;
        tester
          .mr(3).nr(4).kr(2).sr(4)
          .m(m_dim).n(n_dim).k(k_dim)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__sse2_ld128,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
19029
// Sweeps n over {8, 12} and k over {1, 10, 19, 28, 37} at m = 3.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__SSE2_LD128, n_div_4) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t n_dim = 8; n_dim <= 12; n_dim += 4) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      GemmMicrokernelTester tester;
      tester
        .mr(3).nr(4).kr(2).sr(4)
        .m(3).n(n_dim).k(k_dim)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__sse2_ld128,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
19046
// Sweeps n over {8, 12} and k over {1, 10, 19, 28, 37} at m = 3 with cn_stride(7).
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__SSE2_LD128, n_div_4_strided_cn) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t n_dim = 8; n_dim <= 12; n_dim += 4) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      GemmMicrokernelTester tester;
      tester
        .mr(3).nr(4).kr(2).sr(4)
        .m(3).n(n_dim).k(k_dim)
        .cn_stride(7)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__sse2_ld128,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
19064
// Sweeps n over {8, 12} and k over {1, 10, 19, 28, 37} at m = 3 with a_stride(43).
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__SSE2_LD128, n_div_4_strided_a) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t n_dim = 8; n_dim <= 12; n_dim += 4) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      GemmMicrokernelTester tester;
      tester
        .mr(3).nr(4).kr(2).sr(4)
        .m(3).n(n_dim).k(k_dim)
        .a_stride(43)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__sse2_ld128,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
19082
// Sweeps n over {8, 12}, k over {1, 10, 19, 28, 37} and m over 1..3, one iteration each.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__SSE2_LD128, n_div_4_subtile) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t n_dim = 8; n_dim <= 12; n_dim += 4) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      for (uint32_t m_dim = 1; m_dim <= 3; ++m_dim) {
        GemmMicrokernelTester tester;
        tester
          .mr(3).nr(4).kr(2).sr(4)
          .m(m_dim).n(n_dim).k(k_dim)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__sse2_ld128,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
19102
// Sweeps k over {1, 10, 19, 28, 37}, n over 1..4 and m over 1..3 with cm_stride(7),
// one iteration each.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__SSE2_LD128, strided_cm_subtile) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
    for (uint32_t n_dim = 1; n_dim <= 4; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 3; ++m_dim) {
        GemmMicrokernelTester tester;
        tester
          .mr(3).nr(4).kr(2).sr(4)
          .m(m_dim).n(n_dim).k(k_dim)
          .cm_stride(7)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__sse2_ld128,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
19123
// Single run at m = 3, n = 4, k = 8 with qmin(128).
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__SSE2_LD128, qmin) {
  TEST_REQUIRES_X86_SSE2;
  GemmMicrokernelTester tester;
  tester
    .mr(3).nr(4).kr(2).sr(4)
    .m(3).n(4).k(8)
    .qmin(128)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__sse2_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
19137
// Single run at m = 3, n = 4, k = 8 with qmax(128).
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__SSE2_LD128, qmax) {
  TEST_REQUIRES_X86_SSE2;
  GemmMicrokernelTester tester;
  tester
    .mr(3).nr(4).kr(2).sr(4)
    .m(3).n(4).k(8)
    .qmax(128)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__sse2_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
19151
// Single run at m = 3, n = 4, k = 8 with cm_stride(7).
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__SSE2_LD128, strided_cm) {
  TEST_REQUIRES_X86_SSE2;
  GemmMicrokernelTester tester;
  tester
    .mr(3).nr(4).kr(2).sr(4)
    .m(3).n(4).k(8)
    .cm_stride(7)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__sse2_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
19165
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__SSE2_LD128, no_a_zero_point) {
  // m=3, n=4; k in {1, 10, 19, 28, 37} with a_zero_point set to 0.
  TEST_REQUIRES_X86_SSE2;
  for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
    GemmMicrokernelTester()
      .mr(3).nr(4).kr(2).sr(4)
      .m(3).n(4).k(k_dim)
      .a_zero_point(0)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__sse2_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
19181
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__SSE2_LD128, no_b_zero_point) {
  // m=3, n=4; k in {1, 10, 19, 28, 37} with b_zero_point set to 0.
  TEST_REQUIRES_X86_SSE2;
  for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
    GemmMicrokernelTester()
      .mr(3).nr(4).kr(2).sr(4)
      .m(3).n(4).k(k_dim)
      .b_zero_point(0)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__sse2_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
19197
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__SSE2_LD128, no_zero_point) {
  // m=3, n=4; k in {1, 10, 19, 28, 37} with both a_zero_point and
  // b_zero_point set to 0.
  TEST_REQUIRES_X86_SSE2;
  for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
    GemmMicrokernelTester()
      .mr(3).nr(4).kr(2).sr(4)
      .m(3).n(4).k(k_dim)
      .a_zero_point(0)
      .b_zero_point(0)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__sse2_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
19214 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
19215
19216
19217 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__SSE41_LD128, k_eq_8) {
  // Single run with m=3, n=4, k=8 and no stride overrides.
  TEST_REQUIRES_X86_SSE41;
  GemmMicrokernelTester()
    .mr(3).nr(4).kr(2).sr(4)
    .m(3).n(4).k(8)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__sse41_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
19230
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__SSE41_LD128, strided_cn) {
  // m=3, n=4, k=8 with cn_stride overridden to 7.
  TEST_REQUIRES_X86_SSE41;
  GemmMicrokernelTester()
    .mr(3).nr(4).kr(2).sr(4)
    .m(3).n(4).k(8)
    .cn_stride(7)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__sse41_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
19244
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__SSE41_LD128, k_eq_8_strided_a) {
  // m=3, n=4, k=8 with a_stride overridden to 11.
  TEST_REQUIRES_X86_SSE41;
  GemmMicrokernelTester()
    .mr(3).nr(4).kr(2).sr(4)
    .m(3).n(4).k(8)
    .a_stride(11)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__sse41_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
19258
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__SSE41_LD128, k_eq_8_subtile) {
  // k fixed at 8; every pair of n in 1..4 and m in 1..3, one iteration each.
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t n_dim = 1; n_dim <= 4; ++n_dim) {
    for (uint32_t m_dim = 1; m_dim <= 3; ++m_dim) {
      GemmMicrokernelTester()
        .mr(3).nr(4).kr(2).sr(4)
        .m(m_dim).n(n_dim).k(8)
        .iterations(1)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__sse41_ld128,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
19276
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__SSE41_LD128, k_eq_8_subtile_m) {
  // n=4, k=8; m swept over 1..3, one iteration each.
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t m_dim = 1; m_dim <= 3; ++m_dim) {
    GemmMicrokernelTester()
      .mr(3).nr(4).kr(2).sr(4)
      .m(m_dim).n(4).k(8)
      .iterations(1)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__sse41_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
19292
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__SSE41_LD128, k_eq_8_subtile_n) {
  // m=3, k=8; n swept over 1..4, one iteration each.
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t n_dim = 1; n_dim <= 4; ++n_dim) {
    GemmMicrokernelTester()
      .mr(3).nr(4).kr(2).sr(4)
      .m(3).n(n_dim).k(8)
      .iterations(1)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__sse41_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
19308
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__SSE41_LD128, k_lt_8) {
  // m=3, n=4; k swept over 1..7.
  TEST_REQUIRES_X86_SSE41;
  for (size_t k_dim = 1; k_dim < 8; ++k_dim) {
    GemmMicrokernelTester()
      .mr(3).nr(4).kr(2).sr(4)
      .m(3).n(4).k(k_dim)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__sse41_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
19323
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__SSE41_LD128, k_lt_8_strided_a) {
  // m=3, n=4; k swept over 1..7 with a_stride overridden to 11.
  TEST_REQUIRES_X86_SSE41;
  for (size_t k_dim = 1; k_dim < 8; ++k_dim) {
    GemmMicrokernelTester()
      .mr(3).nr(4).kr(2).sr(4)
      .m(3).n(4).k(k_dim)
      .a_stride(11)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__sse41_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
19339
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__SSE41_LD128, k_lt_8_subtile) {
  // k in 1..7 crossed with n in 1..4 and m in 1..3, one iteration each.
  TEST_REQUIRES_X86_SSE41;
  for (size_t k_dim = 1; k_dim < 8; ++k_dim) {
    for (uint32_t n_dim = 1; n_dim <= 4; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 3; ++m_dim) {
        GemmMicrokernelTester()
          .mr(3).nr(4).kr(2).sr(4)
          .m(m_dim).n(n_dim).k(k_dim)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__sse41_ld128,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
19359
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__SSE41_LD128, k_gt_8) {
  // m=3, n=4; k swept over 9..15.
  TEST_REQUIRES_X86_SSE41;
  for (size_t k_dim = 9; k_dim < 16; ++k_dim) {
    GemmMicrokernelTester()
      .mr(3).nr(4).kr(2).sr(4)
      .m(3).n(4).k(k_dim)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__sse41_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
19374
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__SSE41_LD128, k_gt_8_strided_a) {
  // m=3, n=4; k swept over 9..15 with a_stride overridden to 19.
  TEST_REQUIRES_X86_SSE41;
  for (size_t k_dim = 9; k_dim < 16; ++k_dim) {
    GemmMicrokernelTester()
      .mr(3).nr(4).kr(2).sr(4)
      .m(3).n(4).k(k_dim)
      .a_stride(19)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__sse41_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
19390
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__SSE41_LD128, k_gt_8_subtile) {
  // k in 9..15 crossed with n in 1..4 and m in 1..3, one iteration each.
  TEST_REQUIRES_X86_SSE41;
  for (size_t k_dim = 9; k_dim < 16; ++k_dim) {
    for (uint32_t n_dim = 1; n_dim <= 4; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 3; ++m_dim) {
        GemmMicrokernelTester()
          .mr(3).nr(4).kr(2).sr(4)
          .m(m_dim).n(n_dim).k(k_dim)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__sse41_ld128,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
19410
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__SSE41_LD128, k_div_8) {
  // m=3, n=4; k takes the multiples of 8 from 16 through 80.
  TEST_REQUIRES_X86_SSE41;
  for (size_t k_dim = 16; k_dim <= 80; k_dim += 8) {
    GemmMicrokernelTester()
      .mr(3).nr(4).kr(2).sr(4)
      .m(3).n(4).k(k_dim)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__sse41_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
19425
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__SSE41_LD128, k_div_8_strided_a) {
  // m=3, n=4; k takes the multiples of 8 from 16 through 80, with a_stride
  // overridden to 83.
  TEST_REQUIRES_X86_SSE41;
  for (size_t k_dim = 16; k_dim <= 80; k_dim += 8) {
    GemmMicrokernelTester()
      .mr(3).nr(4).kr(2).sr(4)
      .m(3).n(4).k(k_dim)
      .a_stride(83)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__sse41_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
19441
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__SSE41_LD128, k_div_8_subtile) {
  // k over the multiples of 8 from 16 through 80, crossed with n in 1..4 and
  // m in 1..3, one iteration each.
  TEST_REQUIRES_X86_SSE41;
  for (size_t k_dim = 16; k_dim <= 80; k_dim += 8) {
    for (uint32_t n_dim = 1; n_dim <= 4; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 3; ++m_dim) {
        GemmMicrokernelTester()
          .mr(3).nr(4).kr(2).sr(4)
          .m(m_dim).n(n_dim).k(k_dim)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__sse41_ld128,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
19461
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__SSE41_LD128, n_gt_4) {
  // m=3; n in 5..7 crossed with k in {1, 10, 19, 28, 37}.
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t n_dim = 5; n_dim < 8; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      GemmMicrokernelTester()
        .mr(3).nr(4).kr(2).sr(4)
        .m(3).n(n_dim).k(k_dim)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__sse41_ld128,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
19478
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__SSE41_LD128, n_gt_4_strided_cn) {
  // m=3; n in 5..7 crossed with k in {1, 10, 19, 28, 37}, with cn_stride
  // overridden to 7.
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t n_dim = 5; n_dim < 8; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      GemmMicrokernelTester()
        .mr(3).nr(4).kr(2).sr(4)
        .m(3).n(n_dim).k(k_dim)
        .cn_stride(7)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__sse41_ld128,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
19496
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__SSE41_LD128, n_gt_4_strided_a) {
  // m=3; n in 5..7 crossed with k in {1, 10, 19, 28, 37}, with a_stride
  // overridden to 43.
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t n_dim = 5; n_dim < 8; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      GemmMicrokernelTester()
        .mr(3).nr(4).kr(2).sr(4)
        .m(3).n(n_dim).k(k_dim)
        .a_stride(43)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__sse41_ld128,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
19514
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__SSE41_LD128, n_gt_4_subtile) {
  // n in 5..7 crossed with k in {1, 10, 19, 28, 37} and m in 1..3,
  // one iteration each.
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t n_dim = 5; n_dim < 8; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      for (uint32_t m_dim = 1; m_dim <= 3; ++m_dim) {
        GemmMicrokernelTester()
          .mr(3).nr(4).kr(2).sr(4)
          .m(m_dim).n(n_dim).k(k_dim)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__sse41_ld128,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
19534
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__SSE41_LD128, n_div_4) {
  // m=3; n in {8, 12} crossed with k in {1, 10, 19, 28, 37}.
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t n_dim = 8; n_dim <= 12; n_dim += 4) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      GemmMicrokernelTester()
        .mr(3).nr(4).kr(2).sr(4)
        .m(3).n(n_dim).k(k_dim)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__sse41_ld128,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
19551
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__SSE41_LD128, n_div_4_strided_cn) {
  // m=3; n in {8, 12} crossed with k in {1, 10, 19, 28, 37}, with cn_stride
  // overridden to 7.
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t n_dim = 8; n_dim <= 12; n_dim += 4) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      GemmMicrokernelTester()
        .mr(3).nr(4).kr(2).sr(4)
        .m(3).n(n_dim).k(k_dim)
        .cn_stride(7)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__sse41_ld128,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
19569
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__SSE41_LD128, n_div_4_strided_a) {
  // m=3; n in {8, 12} crossed with k in {1, 10, 19, 28, 37}, with a_stride
  // overridden to 43.
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t n_dim = 8; n_dim <= 12; n_dim += 4) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      GemmMicrokernelTester()
        .mr(3).nr(4).kr(2).sr(4)
        .m(3).n(n_dim).k(k_dim)
        .a_stride(43)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__sse41_ld128,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
19587
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__SSE41_LD128, n_div_4_subtile) {
  // n in {8, 12} crossed with k in {1, 10, 19, 28, 37} and m in 1..3,
  // one iteration each.
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t n_dim = 8; n_dim <= 12; n_dim += 4) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      for (uint32_t m_dim = 1; m_dim <= 3; ++m_dim) {
        GemmMicrokernelTester()
          .mr(3).nr(4).kr(2).sr(4)
          .m(m_dim).n(n_dim).k(k_dim)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__sse41_ld128,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
19607
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__SSE41_LD128, strided_cm_subtile) {
  // k in {1, 10, 19, 28, 37} crossed with n in 1..4 and m in 1..3,
  // cm_stride overridden to 7, one iteration per combination.
  TEST_REQUIRES_X86_SSE41;
  for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
    for (uint32_t n_dim = 1; n_dim <= 4; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 3; ++m_dim) {
        GemmMicrokernelTester()
          .mr(3).nr(4).kr(2).sr(4)
          .m(m_dim).n(n_dim).k(k_dim)
          .cm_stride(7)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__sse41_ld128,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
19628
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__SSE41_LD128, qmin) {
  // m=3, n=4, k=8 with qmin set to 128.
  TEST_REQUIRES_X86_SSE41;
  GemmMicrokernelTester()
    .mr(3).nr(4).kr(2).sr(4)
    .m(3).n(4).k(8)
    .qmin(128)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__sse41_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
19642
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__SSE41_LD128, qmax) {
  // m=3, n=4, k=8 with qmax set to 128.
  TEST_REQUIRES_X86_SSE41;
  GemmMicrokernelTester()
    .mr(3).nr(4).kr(2).sr(4)
    .m(3).n(4).k(8)
    .qmax(128)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__sse41_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
19656
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__SSE41_LD128, strided_cm) {
  // m=3, n=4, k=8 with cm_stride overridden to 7.
  TEST_REQUIRES_X86_SSE41;
  GemmMicrokernelTester()
    .mr(3).nr(4).kr(2).sr(4)
    .m(3).n(4).k(8)
    .cm_stride(7)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__sse41_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
19670
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__SSE41_LD128, no_a_zero_point) {
  // m=3, n=4; k in {1, 10, 19, 28, 37} with a_zero_point set to 0.
  TEST_REQUIRES_X86_SSE41;
  for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
    GemmMicrokernelTester()
      .mr(3).nr(4).kr(2).sr(4)
      .m(3).n(4).k(k_dim)
      .a_zero_point(0)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__sse41_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
19686
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__SSE41_LD128, no_b_zero_point) {
  // m=3, n=4; k in {1, 10, 19, 28, 37} with b_zero_point set to 0.
  TEST_REQUIRES_X86_SSE41;
  for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
    GemmMicrokernelTester()
      .mr(3).nr(4).kr(2).sr(4)
      .m(3).n(4).k(k_dim)
      .b_zero_point(0)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__sse41_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
19702
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__SSE41_LD128, no_zero_point) {
  // m=3, n=4; k in {1, 10, 19, 28, 37} with both a_zero_point and
  // b_zero_point set to 0.
  TEST_REQUIRES_X86_SSE41;
  for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
    GemmMicrokernelTester()
      .mr(3).nr(4).kr(2).sr(4)
      .m(3).n(4).k(k_dim)
      .a_zero_point(0)
      .b_zero_point(0)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__sse41_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
19719 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
19720
19721
19722 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__AVX_LD128, k_eq_8) {
  // Single run with m=1, n=4, k=8 and no stride overrides.
  TEST_REQUIRES_X86_AVX;
  GemmMicrokernelTester()
    .mr(1).nr(4).kr(2).sr(4)
    .m(1).n(4).k(8)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__avx_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
19735
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__AVX_LD128, strided_cn) {
  // m=1, n=4, k=8 with cn_stride overridden to 7.
  TEST_REQUIRES_X86_AVX;
  GemmMicrokernelTester()
    .mr(1).nr(4).kr(2).sr(4)
    .m(1).n(4).k(8)
    .cn_stride(7)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__avx_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
19749
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__AVX_LD128, k_eq_8_strided_a) {
  // m=1, n=4, k=8 with a_stride overridden to 11.
  TEST_REQUIRES_X86_AVX;
  GemmMicrokernelTester()
    .mr(1).nr(4).kr(2).sr(4)
    .m(1).n(4).k(8)
    .a_stride(11)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__avx_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
19763
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__AVX_LD128, k_eq_8_subtile) {
  // k fixed at 8; n in 1..4, one iteration each. The m loop covers only
  // m=1 because its upper bound is 1.
  TEST_REQUIRES_X86_AVX;
  for (uint32_t n_dim = 1; n_dim <= 4; ++n_dim) {
    for (uint32_t m_dim = 1; m_dim <= 1; ++m_dim) {
      GemmMicrokernelTester()
        .mr(1).nr(4).kr(2).sr(4)
        .m(m_dim).n(n_dim).k(8)
        .iterations(1)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__avx_ld128,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
19781
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__AVX_LD128, k_eq_8_subtile_m) {
  // n=4, k=8, one iteration. The m loop covers only m=1 because its upper
  // bound is 1.
  TEST_REQUIRES_X86_AVX;
  for (uint32_t m_dim = 1; m_dim <= 1; ++m_dim) {
    GemmMicrokernelTester()
      .mr(1).nr(4).kr(2).sr(4)
      .m(m_dim).n(4).k(8)
      .iterations(1)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__avx_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
19797
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__AVX_LD128, k_eq_8_subtile_n) {
  // m=1, k=8; n swept over 1..4, one iteration each.
  TEST_REQUIRES_X86_AVX;
  for (uint32_t n_dim = 1; n_dim <= 4; ++n_dim) {
    GemmMicrokernelTester()
      .mr(1).nr(4).kr(2).sr(4)
      .m(1).n(n_dim).k(8)
      .iterations(1)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__avx_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
19813
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__AVX_LD128, k_lt_8) {
  // m=1, n=4; k swept over 1..7.
  TEST_REQUIRES_X86_AVX;
  for (size_t k_dim = 1; k_dim < 8; ++k_dim) {
    GemmMicrokernelTester()
      .mr(1).nr(4).kr(2).sr(4)
      .m(1).n(4).k(k_dim)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__avx_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
19828
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__AVX_LD128, k_lt_8_strided_a) {
  // m=1, n=4; k swept over 1..7 with a_stride overridden to 11.
  TEST_REQUIRES_X86_AVX;
  for (size_t k_dim = 1; k_dim < 8; ++k_dim) {
    GemmMicrokernelTester()
      .mr(1).nr(4).kr(2).sr(4)
      .m(1).n(4).k(k_dim)
      .a_stride(11)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__avx_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
19844
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__AVX_LD128, k_lt_8_subtile) {
  // k in 1..7 crossed with n in 1..4, one iteration each. The m loop covers
  // only m=1 because its upper bound is 1.
  TEST_REQUIRES_X86_AVX;
  for (size_t k_dim = 1; k_dim < 8; ++k_dim) {
    for (uint32_t n_dim = 1; n_dim <= 4; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 1; ++m_dim) {
        GemmMicrokernelTester()
          .mr(1).nr(4).kr(2).sr(4)
          .m(m_dim).n(n_dim).k(k_dim)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__avx_ld128,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
19864
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__AVX_LD128, k_gt_8) {
  // m=1, n=4; k swept over 9..15.
  TEST_REQUIRES_X86_AVX;
  for (size_t k_dim = 9; k_dim < 16; ++k_dim) {
    GemmMicrokernelTester()
      .mr(1).nr(4).kr(2).sr(4)
      .m(1).n(4).k(k_dim)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__avx_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
19879
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__AVX_LD128, k_gt_8_strided_a) {
  // m=1, n=4; k swept over 9..15 with a_stride overridden to 19.
  TEST_REQUIRES_X86_AVX;
  for (size_t k_dim = 9; k_dim < 16; ++k_dim) {
    GemmMicrokernelTester()
      .mr(1).nr(4).kr(2).sr(4)
      .m(1).n(4).k(k_dim)
      .a_stride(19)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__avx_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
19895
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__AVX_LD128, k_gt_8_subtile) {
  // k in 9..15 crossed with n in 1..4, one iteration each. The m loop covers
  // only m=1 because its upper bound is 1.
  TEST_REQUIRES_X86_AVX;
  for (size_t k_dim = 9; k_dim < 16; ++k_dim) {
    for (uint32_t n_dim = 1; n_dim <= 4; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 1; ++m_dim) {
        GemmMicrokernelTester()
          .mr(1).nr(4).kr(2).sr(4)
          .m(m_dim).n(n_dim).k(k_dim)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__avx_ld128,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
19915
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__AVX_LD128, k_div_8) {
  // m=1, n=4; k takes the multiples of 8 from 16 through 80.
  TEST_REQUIRES_X86_AVX;
  for (size_t k_dim = 16; k_dim <= 80; k_dim += 8) {
    GemmMicrokernelTester()
      .mr(1).nr(4).kr(2).sr(4)
      .m(1).n(4).k(k_dim)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__avx_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
19930
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__AVX_LD128, k_div_8_strided_a) {
  // m=1, n=4; k takes the multiples of 8 from 16 through 80, with a_stride
  // overridden to 83.
  TEST_REQUIRES_X86_AVX;
  for (size_t k_dim = 16; k_dim <= 80; k_dim += 8) {
    GemmMicrokernelTester()
      .mr(1).nr(4).kr(2).sr(4)
      .m(1).n(4).k(k_dim)
      .a_stride(83)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__avx_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
19946
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__AVX_LD128, k_div_8_subtile) {
  // k over the multiples of 8 from 16 through 80, crossed with n in 1..4,
  // one iteration each. The m loop covers only m=1 because its upper bound
  // is 1.
  TEST_REQUIRES_X86_AVX;
  for (size_t k_dim = 16; k_dim <= 80; k_dim += 8) {
    for (uint32_t n_dim = 1; n_dim <= 4; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 1; ++m_dim) {
        GemmMicrokernelTester()
          .mr(1).nr(4).kr(2).sr(4)
          .m(m_dim).n(n_dim).k(k_dim)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__avx_ld128,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
19966
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__AVX_LD128, n_gt_4) {
  // m=1; n in 5..7 crossed with k in {1, 10, 19, 28, 37}.
  TEST_REQUIRES_X86_AVX;
  for (uint32_t n_dim = 5; n_dim < 8; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      GemmMicrokernelTester()
        .mr(1).nr(4).kr(2).sr(4)
        .m(1).n(n_dim).k(k_dim)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__avx_ld128,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
19983
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__AVX_LD128, n_gt_4_strided_cn) {
  // m=1; n in 5..7 crossed with k in {1, 10, 19, 28, 37}, with cn_stride
  // overridden to 7.
  TEST_REQUIRES_X86_AVX;
  for (uint32_t n_dim = 5; n_dim < 8; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      GemmMicrokernelTester()
        .mr(1).nr(4).kr(2).sr(4)
        .m(1).n(n_dim).k(k_dim)
        .cn_stride(7)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__avx_ld128,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
20001
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__AVX_LD128, n_gt_4_strided_a) {
  // m=1; n in 5..7 crossed with k in {1, 10, 19, 28, 37}, with a_stride
  // overridden to 43.
  TEST_REQUIRES_X86_AVX;
  for (uint32_t n_dim = 5; n_dim < 8; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      GemmMicrokernelTester()
        .mr(1).nr(4).kr(2).sr(4)
        .m(1).n(n_dim).k(k_dim)
        .a_stride(43)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__avx_ld128,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
20019
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__AVX_LD128, n_gt_4_subtile) {
  // n in 5..7 crossed with k in {1, 10, 19, 28, 37}, one iteration each.
  // The m loop covers only m=1 because its upper bound is 1.
  TEST_REQUIRES_X86_AVX;
  for (uint32_t n_dim = 5; n_dim < 8; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      for (uint32_t m_dim = 1; m_dim <= 1; ++m_dim) {
        GemmMicrokernelTester()
          .mr(1).nr(4).kr(2).sr(4)
          .m(m_dim).n(n_dim).k(k_dim)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__avx_ld128,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
20039
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__AVX_LD128, n_div_4) {
  // m=1; n in {8, 12} crossed with k in {1, 10, 19, 28, 37}.
  TEST_REQUIRES_X86_AVX;
  for (uint32_t n_dim = 8; n_dim <= 12; n_dim += 4) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      GemmMicrokernelTester()
        .mr(1).nr(4).kr(2).sr(4)
        .m(1).n(n_dim).k(k_dim)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__avx_ld128,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
20056
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__AVX_LD128, n_div_4_strided_cn) {
  // m=1; n in {8, 12} crossed with k in {1, 10, 19, 28, 37}, with cn_stride
  // overridden to 7.
  TEST_REQUIRES_X86_AVX;
  for (uint32_t n_dim = 8; n_dim <= 12; n_dim += 4) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      GemmMicrokernelTester()
        .mr(1).nr(4).kr(2).sr(4)
        .m(1).n(n_dim).k(k_dim)
        .cn_stride(7)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__avx_ld128,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
20074
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__AVX_LD128, n_div_4_strided_a) {
  // m=1; n in {8, 12} crossed with k in {1, 10, 19, 28, 37}, with a_stride
  // overridden to 43.
  TEST_REQUIRES_X86_AVX;
  for (uint32_t n_dim = 8; n_dim <= 12; n_dim += 4) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      GemmMicrokernelTester()
        .mr(1).nr(4).kr(2).sr(4)
        .m(1).n(n_dim).k(k_dim)
        .a_stride(43)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__avx_ld128,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
20092
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__AVX_LD128, n_div_4_subtile) {
  // n in {8, 12} crossed with k in {1, 10, 19, 28, 37}, one iteration each.
  // The m loop covers only m=1 because its upper bound is 1.
  TEST_REQUIRES_X86_AVX;
  for (uint32_t n_dim = 8; n_dim <= 12; n_dim += 4) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      for (uint32_t m_dim = 1; m_dim <= 1; ++m_dim) {
        GemmMicrokernelTester()
          .mr(1).nr(4).kr(2).sr(4)
          .m(m_dim).n(n_dim).k(k_dim)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__avx_ld128,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
20112
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__AVX_LD128, strided_cm_subtile) {
  // k in {1, 10, 19, 28, 37} crossed with n in 1..4, cm_stride overridden
  // to 7, one iteration each. The m loop covers only m=1 because its upper
  // bound is 1.
  TEST_REQUIRES_X86_AVX;
  for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
    for (uint32_t n_dim = 1; n_dim <= 4; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 1; ++m_dim) {
        GemmMicrokernelTester()
          .mr(1).nr(4).kr(2).sr(4)
          .m(m_dim).n(n_dim).k(k_dim)
          .cm_stride(7)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__avx_ld128,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
20133
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__AVX_LD128, qmin) {
  // m=1, n=4, k=8 with qmin set to 128.
  TEST_REQUIRES_X86_AVX;
  GemmMicrokernelTester()
    .mr(1).nr(4).kr(2).sr(4)
    .m(1).n(4).k(8)
    .qmin(128)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__avx_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
20147
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__AVX_LD128, qmax) {
  // m=1, n=4, k=8 with qmax set to 128.
  TEST_REQUIRES_X86_AVX;
  GemmMicrokernelTester()
    .mr(1).nr(4).kr(2).sr(4)
    .m(1).n(4).k(8)
    .qmax(128)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__avx_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
20161
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__AVX_LD128, strided_cm) {
  // m=1, n=4, k=8 with cm_stride overridden to 7.
  TEST_REQUIRES_X86_AVX;
  GemmMicrokernelTester()
    .mr(1).nr(4).kr(2).sr(4)
    .m(1).n(4).k(8)
    .cm_stride(7)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__avx_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
20175
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__AVX_LD128, no_a_zero_point) {
  // m=1, n=4; k in {1, 10, 19, 28, 37} with a_zero_point set to 0.
  TEST_REQUIRES_X86_AVX;
  for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
    GemmMicrokernelTester()
      .mr(1).nr(4).kr(2).sr(4)
      .m(1).n(4).k(k_dim)
      .a_zero_point(0)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__avx_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
20191
// Full 1x4 tile with b_zero_point forced to 0; k takes 1, 10, 19, 28, 37.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__AVX_LD128, no_b_zero_point) {
  TEST_REQUIRES_X86_AVX;
  for (size_t kc = 1; kc <= 40; kc += 9) {
    auto tester = GemmMicrokernelTester()
        .mr(1).nr(4).kr(2).sr(4)
        .m(1).n(4).k(kc)
        .b_zero_point(0);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__avx_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
20207
// Full 1x4 tile with both a_zero_point and b_zero_point forced to 0;
// k takes 1, 10, 19, 28, 37.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__AVX_LD128, no_zero_point) {
  TEST_REQUIRES_X86_AVX;
  for (size_t kc = 1; kc <= 40; kc += 9) {
    auto tester = GemmMicrokernelTester()
        .mr(1).nr(4).kr(2).sr(4)
        .m(1).n(4).k(kc)
        .a_zero_point(0)
        .b_zero_point(0);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__avx_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
20224 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
20225
20226
20227 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Full 2x4 tile with k = 8 and no stride, clamp or zero-point overrides.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__AVX_LD128, k_eq_8) {
  TEST_REQUIRES_X86_AVX;
  auto tester = GemmMicrokernelTester()
      .mr(2).nr(4).kr(2).sr(4)
      .m(2).n(4).k(8);
  tester.Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__avx_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
20240
// Full 2x4 tile, k = 8, with cn_stride overridden to 7.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__AVX_LD128, strided_cn) {
  TEST_REQUIRES_X86_AVX;
  auto tester = GemmMicrokernelTester()
      .mr(2).nr(4).kr(2).sr(4)
      .m(2).n(4).k(8)
      .cn_stride(7);
  tester.Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__avx_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
20254
// Full 2x4 tile, k = 8, with a_stride overridden to 11.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__AVX_LD128, k_eq_8_strided_a) {
  TEST_REQUIRES_X86_AVX;
  auto tester = GemmMicrokernelTester()
      .mr(2).nr(4).kr(2).sr(4)
      .m(2).n(4).k(8)
      .a_stride(11);
  tester.Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__avx_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
20268
// k = 8 for every (m, n) with n in 1..4 and m in 1..2; one iteration per case.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__AVX_LD128, k_eq_8_subtile) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t nc = 1; nc <= 4; ++nc) {
    for (uint32_t mc = 1; mc <= 2; ++mc) {
      auto tester = GemmMicrokernelTester()
          .mr(2).nr(4).kr(2).sr(4)
          .m(mc).n(nc).k(8)
          .iterations(1);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__avx_ld128,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
20286
// k = 8, n = 4, with m swept over 1..2; one iteration per case.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__AVX_LD128, k_eq_8_subtile_m) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t mc = 1; mc <= 2; ++mc) {
    auto tester = GemmMicrokernelTester()
        .mr(2).nr(4).kr(2).sr(4)
        .m(mc).n(4).k(8)
        .iterations(1);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__avx_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
20302
// k = 8, m = 2, with n swept over 1..4; one iteration per case.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__AVX_LD128, k_eq_8_subtile_n) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t nc = 1; nc <= 4; ++nc) {
    auto tester = GemmMicrokernelTester()
        .mr(2).nr(4).kr(2).sr(4)
        .m(2).n(nc).k(8)
        .iterations(1);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__avx_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
20318
// Full 2x4 tile with k swept over 1..7.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__AVX_LD128, k_lt_8) {
  TEST_REQUIRES_X86_AVX;
  for (size_t kc = 1; kc < 8; ++kc) {
    auto tester = GemmMicrokernelTester()
        .mr(2).nr(4).kr(2).sr(4)
        .m(2).n(4).k(kc);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__avx_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
20333
// Full 2x4 tile with k swept over 1..7 and a_stride overridden to 11.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__AVX_LD128, k_lt_8_strided_a) {
  TEST_REQUIRES_X86_AVX;
  for (size_t kc = 1; kc < 8; ++kc) {
    auto tester = GemmMicrokernelTester()
        .mr(2).nr(4).kr(2).sr(4)
        .m(2).n(4).k(kc)
        .a_stride(11);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__avx_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
20349
// k in 1..7 crossed with n in 1..4 and m in 1..2; one iteration per case.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__AVX_LD128, k_lt_8_subtile) {
  TEST_REQUIRES_X86_AVX;
  for (size_t kc = 1; kc < 8; ++kc) {
    for (uint32_t nc = 1; nc <= 4; ++nc) {
      for (uint32_t mc = 1; mc <= 2; ++mc) {
        auto tester = GemmMicrokernelTester()
            .mr(2).nr(4).kr(2).sr(4)
            .m(mc).n(nc).k(kc)
            .iterations(1);
        tester.Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__avx_ld128,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
20369
// Full 2x4 tile with k swept over 9..15.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__AVX_LD128, k_gt_8) {
  TEST_REQUIRES_X86_AVX;
  for (size_t kc = 9; kc < 16; ++kc) {
    auto tester = GemmMicrokernelTester()
        .mr(2).nr(4).kr(2).sr(4)
        .m(2).n(4).k(kc);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__avx_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
20384
// Full 2x4 tile with k swept over 9..15 and a_stride overridden to 19.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__AVX_LD128, k_gt_8_strided_a) {
  TEST_REQUIRES_X86_AVX;
  for (size_t kc = 9; kc < 16; ++kc) {
    auto tester = GemmMicrokernelTester()
        .mr(2).nr(4).kr(2).sr(4)
        .m(2).n(4).k(kc)
        .a_stride(19);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__avx_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
20400
// k in 9..15 crossed with n in 1..4 and m in 1..2; one iteration per case.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__AVX_LD128, k_gt_8_subtile) {
  TEST_REQUIRES_X86_AVX;
  for (size_t kc = 9; kc < 16; ++kc) {
    for (uint32_t nc = 1; nc <= 4; ++nc) {
      for (uint32_t mc = 1; mc <= 2; ++mc) {
        auto tester = GemmMicrokernelTester()
            .mr(2).nr(4).kr(2).sr(4)
            .m(mc).n(nc).k(kc)
            .iterations(1);
        tester.Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__avx_ld128,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
20420
// Full 2x4 tile with k stepping by 8 from 16 through 80.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__AVX_LD128, k_div_8) {
  TEST_REQUIRES_X86_AVX;
  for (size_t kc = 16; kc <= 80; kc += 8) {
    auto tester = GemmMicrokernelTester()
        .mr(2).nr(4).kr(2).sr(4)
        .m(2).n(4).k(kc);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__avx_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
20435
// Full 2x4 tile with k stepping by 8 from 16 through 80 and a_stride
// overridden to 83.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__AVX_LD128, k_div_8_strided_a) {
  TEST_REQUIRES_X86_AVX;
  for (size_t kc = 16; kc <= 80; kc += 8) {
    auto tester = GemmMicrokernelTester()
        .mr(2).nr(4).kr(2).sr(4)
        .m(2).n(4).k(kc)
        .a_stride(83);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__avx_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
20451
// k in {16, 24, ..., 80} crossed with n in 1..4 and m in 1..2;
// one iteration per case.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__AVX_LD128, k_div_8_subtile) {
  TEST_REQUIRES_X86_AVX;
  for (size_t kc = 16; kc <= 80; kc += 8) {
    for (uint32_t nc = 1; nc <= 4; ++nc) {
      for (uint32_t mc = 1; mc <= 2; ++mc) {
        auto tester = GemmMicrokernelTester()
            .mr(2).nr(4).kr(2).sr(4)
            .m(mc).n(nc).k(kc)
            .iterations(1);
        tester.Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__avx_ld128,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
20471
// m = 2 with n in 5..7 crossed with k in {1, 10, 19, 28, 37}.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__AVX_LD128, n_gt_4) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t nc = 5; nc < 8; ++nc) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      auto tester = GemmMicrokernelTester()
          .mr(2).nr(4).kr(2).sr(4)
          .m(2).n(nc).k(kc);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__avx_ld128,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
20488
// m = 2 with n in 5..7 crossed with k in {1, 10, 19, 28, 37};
// cn_stride overridden to 7.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__AVX_LD128, n_gt_4_strided_cn) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t nc = 5; nc < 8; ++nc) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      auto tester = GemmMicrokernelTester()
          .mr(2).nr(4).kr(2).sr(4)
          .m(2).n(nc).k(kc)
          .cn_stride(7);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__avx_ld128,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
20506
// m = 2 with n in 5..7 crossed with k in {1, 10, 19, 28, 37};
// a_stride overridden to 43.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__AVX_LD128, n_gt_4_strided_a) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t nc = 5; nc < 8; ++nc) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      auto tester = GemmMicrokernelTester()
          .mr(2).nr(4).kr(2).sr(4)
          .m(2).n(nc).k(kc)
          .a_stride(43);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__avx_ld128,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
20524
// n in 5..7 crossed with k in {1, 10, 19, 28, 37} and m in 1..2;
// one iteration per case.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__AVX_LD128, n_gt_4_subtile) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t nc = 5; nc < 8; ++nc) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      for (uint32_t mc = 1; mc <= 2; ++mc) {
        auto tester = GemmMicrokernelTester()
            .mr(2).nr(4).kr(2).sr(4)
            .m(mc).n(nc).k(kc)
            .iterations(1);
        tester.Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__avx_ld128,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
20544
// m = 2 with n in {8, 12} crossed with k in {1, 10, 19, 28, 37}.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__AVX_LD128, n_div_4) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t nc = 8; nc <= 12; nc += 4) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      auto tester = GemmMicrokernelTester()
          .mr(2).nr(4).kr(2).sr(4)
          .m(2).n(nc).k(kc);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__avx_ld128,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
20561
// m = 2 with n in {8, 12} crossed with k in {1, 10, 19, 28, 37};
// cn_stride overridden to 7.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__AVX_LD128, n_div_4_strided_cn) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t nc = 8; nc <= 12; nc += 4) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      auto tester = GemmMicrokernelTester()
          .mr(2).nr(4).kr(2).sr(4)
          .m(2).n(nc).k(kc)
          .cn_stride(7);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__avx_ld128,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
20579
// m = 2 with n in {8, 12} crossed with k in {1, 10, 19, 28, 37};
// a_stride overridden to 43.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__AVX_LD128, n_div_4_strided_a) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t nc = 8; nc <= 12; nc += 4) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      auto tester = GemmMicrokernelTester()
          .mr(2).nr(4).kr(2).sr(4)
          .m(2).n(nc).k(kc)
          .a_stride(43);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__avx_ld128,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
20597
// n in {8, 12} crossed with k in {1, 10, 19, 28, 37} and m in 1..2;
// one iteration per case.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__AVX_LD128, n_div_4_subtile) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t nc = 8; nc <= 12; nc += 4) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      for (uint32_t mc = 1; mc <= 2; ++mc) {
        auto tester = GemmMicrokernelTester()
            .mr(2).nr(4).kr(2).sr(4)
            .m(mc).n(nc).k(kc)
            .iterations(1);
        tester.Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__avx_ld128,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
20617
// k in {1, 10, 19, 28, 37} crossed with n in 1..4 and m in 1..2, with
// cm_stride overridden to 7; one iteration per case.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__AVX_LD128, strided_cm_subtile) {
  TEST_REQUIRES_X86_AVX;
  for (size_t kc = 1; kc <= 40; kc += 9) {
    for (uint32_t nc = 1; nc <= 4; ++nc) {
      for (uint32_t mc = 1; mc <= 2; ++mc) {
        auto tester = GemmMicrokernelTester()
            .mr(2).nr(4).kr(2).sr(4)
            .m(mc).n(nc).k(kc)
            .cm_stride(7)
            .iterations(1);
        tester.Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__avx_ld128,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
20638
// Full 2x4 tile, k = 8, with the tester's qmin set to 128.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__AVX_LD128, qmin) {
  TEST_REQUIRES_X86_AVX;
  auto tester = GemmMicrokernelTester()
      .mr(2).nr(4).kr(2).sr(4)
      .m(2).n(4).k(8)
      .qmin(128);
  tester.Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__avx_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
20652
// Full 2x4 tile, k = 8, with the tester's qmax set to 128.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__AVX_LD128, qmax) {
  TEST_REQUIRES_X86_AVX;
  auto tester = GemmMicrokernelTester()
      .mr(2).nr(4).kr(2).sr(4)
      .m(2).n(4).k(8)
      .qmax(128);
  tester.Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__avx_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
20666
// Full 2x4 tile, k = 8, with cm_stride overridden to 7.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__AVX_LD128, strided_cm) {
  TEST_REQUIRES_X86_AVX;
  auto tester = GemmMicrokernelTester()
      .mr(2).nr(4).kr(2).sr(4)
      .m(2).n(4).k(8)
      .cm_stride(7);
  tester.Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__avx_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
20680
// Full 2x4 tile with a_zero_point forced to 0; k takes 1, 10, 19, 28, 37.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__AVX_LD128, no_a_zero_point) {
  TEST_REQUIRES_X86_AVX;
  for (size_t kc = 1; kc <= 40; kc += 9) {
    auto tester = GemmMicrokernelTester()
        .mr(2).nr(4).kr(2).sr(4)
        .m(2).n(4).k(kc)
        .a_zero_point(0);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__avx_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
20696
// Full 2x4 tile with b_zero_point forced to 0; k takes 1, 10, 19, 28, 37.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__AVX_LD128, no_b_zero_point) {
  TEST_REQUIRES_X86_AVX;
  for (size_t kc = 1; kc <= 40; kc += 9) {
    auto tester = GemmMicrokernelTester()
        .mr(2).nr(4).kr(2).sr(4)
        .m(2).n(4).k(kc)
        .b_zero_point(0);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__avx_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
20712
// Full 2x4 tile with both a_zero_point and b_zero_point forced to 0;
// k takes 1, 10, 19, 28, 37.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__AVX_LD128, no_zero_point) {
  TEST_REQUIRES_X86_AVX;
  for (size_t kc = 1; kc <= 40; kc += 9) {
    auto tester = GemmMicrokernelTester()
        .mr(2).nr(4).kr(2).sr(4)
        .m(2).n(4).k(kc)
        .a_zero_point(0)
        .b_zero_point(0);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__avx_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
20729 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
20730
20731
20732 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Full 3x4 tile with k = 8 and no stride, clamp or zero-point overrides.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__XOP_LD128, k_eq_8) {
  TEST_REQUIRES_X86_XOP;
  auto tester = GemmMicrokernelTester()
      .mr(3).nr(4).kr(2).sr(4)
      .m(3).n(4).k(8);
  tester.Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__xop_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
20745
// Full 3x4 tile, k = 8, with cn_stride overridden to 7.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__XOP_LD128, strided_cn) {
  TEST_REQUIRES_X86_XOP;
  auto tester = GemmMicrokernelTester()
      .mr(3).nr(4).kr(2).sr(4)
      .m(3).n(4).k(8)
      .cn_stride(7);
  tester.Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__xop_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
20759
// Full 3x4 tile, k = 8, with a_stride overridden to 11.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__XOP_LD128, k_eq_8_strided_a) {
  TEST_REQUIRES_X86_XOP;
  auto tester = GemmMicrokernelTester()
      .mr(3).nr(4).kr(2).sr(4)
      .m(3).n(4).k(8)
      .a_stride(11);
  tester.Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__xop_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
20773
// k = 8 for every (m, n) with n in 1..4 and m in 1..3; one iteration per case.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__XOP_LD128, k_eq_8_subtile) {
  TEST_REQUIRES_X86_XOP;
  for (uint32_t nc = 1; nc <= 4; ++nc) {
    for (uint32_t mc = 1; mc <= 3; ++mc) {
      auto tester = GemmMicrokernelTester()
          .mr(3).nr(4).kr(2).sr(4)
          .m(mc).n(nc).k(8)
          .iterations(1);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__xop_ld128,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
20791
// k = 8, n = 4, with m swept over 1..3; one iteration per case.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__XOP_LD128, k_eq_8_subtile_m) {
  TEST_REQUIRES_X86_XOP;
  for (uint32_t mc = 1; mc <= 3; ++mc) {
    auto tester = GemmMicrokernelTester()
        .mr(3).nr(4).kr(2).sr(4)
        .m(mc).n(4).k(8)
        .iterations(1);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__xop_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
20807
// k = 8, m = 3, with n swept over 1..4; one iteration per case.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__XOP_LD128, k_eq_8_subtile_n) {
  TEST_REQUIRES_X86_XOP;
  for (uint32_t nc = 1; nc <= 4; ++nc) {
    auto tester = GemmMicrokernelTester()
        .mr(3).nr(4).kr(2).sr(4)
        .m(3).n(nc).k(8)
        .iterations(1);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__xop_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
20823
// Full 3x4 tile with k swept over 1..7.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__XOP_LD128, k_lt_8) {
  TEST_REQUIRES_X86_XOP;
  for (size_t kc = 1; kc < 8; ++kc) {
    auto tester = GemmMicrokernelTester()
        .mr(3).nr(4).kr(2).sr(4)
        .m(3).n(4).k(kc);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__xop_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
20838
// Full 3x4 tile with k swept over 1..7 and a_stride overridden to 11.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__XOP_LD128, k_lt_8_strided_a) {
  TEST_REQUIRES_X86_XOP;
  for (size_t kc = 1; kc < 8; ++kc) {
    auto tester = GemmMicrokernelTester()
        .mr(3).nr(4).kr(2).sr(4)
        .m(3).n(4).k(kc)
        .a_stride(11);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__xop_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
20854
// k in 1..7 crossed with n in 1..4 and m in 1..3; one iteration per case.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__XOP_LD128, k_lt_8_subtile) {
  TEST_REQUIRES_X86_XOP;
  for (size_t kc = 1; kc < 8; ++kc) {
    for (uint32_t nc = 1; nc <= 4; ++nc) {
      for (uint32_t mc = 1; mc <= 3; ++mc) {
        auto tester = GemmMicrokernelTester()
            .mr(3).nr(4).kr(2).sr(4)
            .m(mc).n(nc).k(kc)
            .iterations(1);
        tester.Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__xop_ld128,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
20874
// Full 3x4 tile with k swept over 9..15.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__XOP_LD128, k_gt_8) {
  TEST_REQUIRES_X86_XOP;
  for (size_t kc = 9; kc < 16; ++kc) {
    auto tester = GemmMicrokernelTester()
        .mr(3).nr(4).kr(2).sr(4)
        .m(3).n(4).k(kc);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__xop_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
20889
// Full 3x4 tile with k swept over 9..15 and a_stride overridden to 19.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__XOP_LD128, k_gt_8_strided_a) {
  TEST_REQUIRES_X86_XOP;
  for (size_t kc = 9; kc < 16; ++kc) {
    auto tester = GemmMicrokernelTester()
        .mr(3).nr(4).kr(2).sr(4)
        .m(3).n(4).k(kc)
        .a_stride(19);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__xop_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
20905
// k in 9..15 crossed with n in 1..4 and m in 1..3; one iteration per case.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__XOP_LD128, k_gt_8_subtile) {
  TEST_REQUIRES_X86_XOP;
  for (size_t kc = 9; kc < 16; ++kc) {
    for (uint32_t nc = 1; nc <= 4; ++nc) {
      for (uint32_t mc = 1; mc <= 3; ++mc) {
        auto tester = GemmMicrokernelTester()
            .mr(3).nr(4).kr(2).sr(4)
            .m(mc).n(nc).k(kc)
            .iterations(1);
        tester.Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__xop_ld128,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
20925
// Full 3x4 tile with k stepping by 8 from 16 through 80.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__XOP_LD128, k_div_8) {
  TEST_REQUIRES_X86_XOP;
  for (size_t kc = 16; kc <= 80; kc += 8) {
    auto tester = GemmMicrokernelTester()
        .mr(3).nr(4).kr(2).sr(4)
        .m(3).n(4).k(kc);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__xop_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
20940
// Full 3x4 tile with k stepping by 8 from 16 through 80 and a_stride
// overridden to 83.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__XOP_LD128, k_div_8_strided_a) {
  TEST_REQUIRES_X86_XOP;
  for (size_t kc = 16; kc <= 80; kc += 8) {
    auto tester = GemmMicrokernelTester()
        .mr(3).nr(4).kr(2).sr(4)
        .m(3).n(4).k(kc)
        .a_stride(83);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__xop_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
20956
// k in {16, 24, ..., 80} crossed with n in 1..4 and m in 1..3;
// one iteration per case.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__XOP_LD128, k_div_8_subtile) {
  TEST_REQUIRES_X86_XOP;
  for (size_t kc = 16; kc <= 80; kc += 8) {
    for (uint32_t nc = 1; nc <= 4; ++nc) {
      for (uint32_t mc = 1; mc <= 3; ++mc) {
        auto tester = GemmMicrokernelTester()
            .mr(3).nr(4).kr(2).sr(4)
            .m(mc).n(nc).k(kc)
            .iterations(1);
        tester.Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__xop_ld128,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
20976
// m = 3 with n in 5..7 crossed with k in {1, 10, 19, 28, 37}.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__XOP_LD128, n_gt_4) {
  TEST_REQUIRES_X86_XOP;
  for (uint32_t nc = 5; nc < 8; ++nc) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      auto tester = GemmMicrokernelTester()
          .mr(3).nr(4).kr(2).sr(4)
          .m(3).n(nc).k(kc);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__xop_ld128,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
20993
// m = 3 with n in 5..7 crossed with k in {1, 10, 19, 28, 37};
// cn_stride overridden to 7.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__XOP_LD128, n_gt_4_strided_cn) {
  TEST_REQUIRES_X86_XOP;
  for (uint32_t nc = 5; nc < 8; ++nc) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      auto tester = GemmMicrokernelTester()
          .mr(3).nr(4).kr(2).sr(4)
          .m(3).n(nc).k(kc)
          .cn_stride(7);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__xop_ld128,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
21011
// m = 3 with n in 5..7 crossed with k in {1, 10, 19, 28, 37};
// a_stride overridden to 43.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__XOP_LD128, n_gt_4_strided_a) {
  TEST_REQUIRES_X86_XOP;
  for (uint32_t nc = 5; nc < 8; ++nc) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      auto tester = GemmMicrokernelTester()
          .mr(3).nr(4).kr(2).sr(4)
          .m(3).n(nc).k(kc)
          .a_stride(43);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__xop_ld128,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
21029
// n in 5..7 crossed with k in {1, 10, 19, 28, 37} and m in 1..3;
// one iteration per case.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__XOP_LD128, n_gt_4_subtile) {
  TEST_REQUIRES_X86_XOP;
  for (uint32_t nc = 5; nc < 8; ++nc) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      for (uint32_t mc = 1; mc <= 3; ++mc) {
        auto tester = GemmMicrokernelTester()
            .mr(3).nr(4).kr(2).sr(4)
            .m(mc).n(nc).k(kc)
            .iterations(1);
        tester.Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__xop_ld128,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
21049
// m = 3 with n in {8, 12} crossed with k in {1, 10, 19, 28, 37}.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__XOP_LD128, n_div_4) {
  TEST_REQUIRES_X86_XOP;
  for (uint32_t nc = 8; nc <= 12; nc += 4) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      auto tester = GemmMicrokernelTester()
          .mr(3).nr(4).kr(2).sr(4)
          .m(3).n(nc).k(kc);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__xop_ld128,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
21066
// m = 3 with n in {8, 12} crossed with k in {1, 10, 19, 28, 37};
// cn_stride overridden to 7.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__XOP_LD128, n_div_4_strided_cn) {
  TEST_REQUIRES_X86_XOP;
  for (uint32_t nc = 8; nc <= 12; nc += 4) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      auto tester = GemmMicrokernelTester()
          .mr(3).nr(4).kr(2).sr(4)
          .m(3).n(nc).k(kc)
          .cn_stride(7);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__xop_ld128,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
21084
// m = 3 with n in {8, 12} crossed with k in {1, 10, 19, 28, 37};
// a_stride overridden to 43.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__XOP_LD128, n_div_4_strided_a) {
  TEST_REQUIRES_X86_XOP;
  for (uint32_t nc = 8; nc <= 12; nc += 4) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      auto tester = GemmMicrokernelTester()
          .mr(3).nr(4).kr(2).sr(4)
          .m(3).n(nc).k(kc)
          .a_stride(43);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__xop_ld128,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
21102
// n in {8, 12} crossed with k in {1, 10, 19, 28, 37} and m in 1..3;
// one iteration per case.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__XOP_LD128, n_div_4_subtile) {
  TEST_REQUIRES_X86_XOP;
  for (uint32_t nc = 8; nc <= 12; nc += 4) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      for (uint32_t mc = 1; mc <= 3; ++mc) {
        auto tester = GemmMicrokernelTester()
            .mr(3).nr(4).kr(2).sr(4)
            .m(mc).n(nc).k(kc)
            .iterations(1);
        tester.Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__xop_ld128,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
21122
// k in {1, 10, 19, 28, 37} crossed with n in 1..4 and m in 1..3, with
// cm_stride overridden to 7; one iteration per case.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__XOP_LD128, strided_cm_subtile) {
  TEST_REQUIRES_X86_XOP;
  for (size_t kc = 1; kc <= 40; kc += 9) {
    for (uint32_t nc = 1; nc <= 4; ++nc) {
      for (uint32_t mc = 1; mc <= 3; ++mc) {
        auto tester = GemmMicrokernelTester()
            .mr(3).nr(4).kr(2).sr(4)
            .m(mc).n(nc).k(kc)
            .cm_stride(7)
            .iterations(1);
        tester.Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__xop_ld128,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
21143
// Full 3x4 tile, k = 8, with the tester's qmin set to 128.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__XOP_LD128, qmin) {
  TEST_REQUIRES_X86_XOP;
  auto tester = GemmMicrokernelTester()
      .mr(3).nr(4).kr(2).sr(4)
      .m(3).n(4).k(8)
      .qmin(128);
  tester.Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__xop_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
21157
// Full 3x4 tile, k = 8, with the tester's qmax set to 128.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__XOP_LD128, qmax) {
  TEST_REQUIRES_X86_XOP;
  auto tester = GemmMicrokernelTester()
      .mr(3).nr(4).kr(2).sr(4)
      .m(3).n(4).k(8)
      .qmax(128);
  tester.Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__xop_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
21171
// Full 3x4 tile, k = 8, with cm_stride overridden to 7.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__XOP_LD128, strided_cm) {
  TEST_REQUIRES_X86_XOP;
  auto tester = GemmMicrokernelTester()
      .mr(3).nr(4).kr(2).sr(4)
      .m(3).n(4).k(8)
      .cm_stride(7);
  tester.Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__xop_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
21185
// Full 3x4 tile with a_zero_point forced to 0; k takes 1, 10, 19, 28, 37.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__XOP_LD128, no_a_zero_point) {
  TEST_REQUIRES_X86_XOP;
  for (size_t kc = 1; kc <= 40; kc += 9) {
    auto tester = GemmMicrokernelTester()
        .mr(3).nr(4).kr(2).sr(4)
        .m(3).n(4).k(kc)
        .a_zero_point(0);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__xop_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
21201
// Full 3x4 tile with b_zero_point forced to 0; k takes 1, 10, 19, 28, 37.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__XOP_LD128, no_b_zero_point) {
  TEST_REQUIRES_X86_XOP;
  for (size_t kc = 1; kc <= 40; kc += 9) {
    auto tester = GemmMicrokernelTester()
        .mr(3).nr(4).kr(2).sr(4)
        .m(3).n(4).k(kc)
        .b_zero_point(0);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__xop_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
21217
// Full 3x4 tile with both a_zero_point and b_zero_point forced to 0;
// k takes 1, 10, 19, 28, 37.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__XOP_LD128, no_zero_point) {
  TEST_REQUIRES_X86_XOP;
  for (size_t kc = 1; kc <= 40; kc += 9) {
    auto tester = GemmMicrokernelTester()
        .mr(3).nr(4).kr(2).sr(4)
        .m(3).n(4).k(kc)
        .a_zero_point(0)
        .b_zero_point(0);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__xop_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
21234 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
21235
21236
21237 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Full 4x4 tile with k = 8; no stride, zero-point or clamp setter is called.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__XOP_LD128, k_eq_8) {
  TEST_REQUIRES_X86_XOP;
  GemmMicrokernelTester()
    .mr(4)
    .nr(4)
    .kr(2)
    .sr(4)
    .m(4)
    .n(4)
    .k(8)
    .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__xop_ld128, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
}
21250
// Full 4x4 tile with k = 8 and cn_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__XOP_LD128, strided_cn) {
  TEST_REQUIRES_X86_XOP;
  GemmMicrokernelTester()
    .mr(4)
    .nr(4)
    .kr(2)
    .sr(4)
    .m(4)
    .n(4)
    .k(8)
    .cn_stride(7)
    .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__xop_ld128, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
}
21264
// Full 4x4 tile with k = 8 and a_stride set to 11.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__XOP_LD128, k_eq_8_strided_a) {
  TEST_REQUIRES_X86_XOP;
  GemmMicrokernelTester()
    .mr(4)
    .nr(4)
    .kr(2)
    .sr(4)
    .m(4)
    .n(4)
    .k(8)
    .a_stride(11)
    .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__xop_ld128, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
}
21278
// Sweeps every (m, n) with m in [1, 4] and n in [1, 4] at k = 8,
// using iterations(1) for each shape.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__XOP_LD128, k_eq_8_subtile) {
  TEST_REQUIRES_X86_XOP;
  for (uint32_t n = 1; n <= 4; n++) {
    for (uint32_t m = 1; m <= 4; m++) {
      GemmMicrokernelTester()
        .mr(4)
        .nr(4)
        .kr(2)
        .sr(4)
        .m(m)
        .n(n)
        .k(8)
        .iterations(1)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__xop_ld128, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
    }
  }
}
21296
// Sweeps m in [1, 4] with n = 4 and k = 8, using iterations(1) for each shape.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__XOP_LD128, k_eq_8_subtile_m) {
  TEST_REQUIRES_X86_XOP;
  for (uint32_t m = 1; m <= 4; m++) {
    GemmMicrokernelTester()
      .mr(4)
      .nr(4)
      .kr(2)
      .sr(4)
      .m(m)
      .n(4)
      .k(8)
      .iterations(1)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__xop_ld128, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
  }
}
21312
// Sweeps n in [1, 4] with m = 4 and k = 8, using iterations(1) for each shape.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__XOP_LD128, k_eq_8_subtile_n) {
  TEST_REQUIRES_X86_XOP;
  for (uint32_t n = 1; n <= 4; n++) {
    GemmMicrokernelTester()
      .mr(4)
      .nr(4)
      .kr(2)
      .sr(4)
      .m(4)
      .n(n)
      .k(8)
      .iterations(1)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__xop_ld128, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
  }
}
21328
// Full 4x4 tile for every k in [1, 7].
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__XOP_LD128, k_lt_8) {
  TEST_REQUIRES_X86_XOP;
  for (size_t k = 1; k < 8; k++) {
    GemmMicrokernelTester()
      .mr(4)
      .nr(4)
      .kr(2)
      .sr(4)
      .m(4)
      .n(4)
      .k(k)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__xop_ld128, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
  }
}
21343
// Full 4x4 tile for every k in [1, 7] with a_stride set to 11.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__XOP_LD128, k_lt_8_strided_a) {
  TEST_REQUIRES_X86_XOP;
  for (size_t k = 1; k < 8; k++) {
    GemmMicrokernelTester()
      .mr(4)
      .nr(4)
      .kr(2)
      .sr(4)
      .m(4)
      .n(4)
      .k(k)
      .a_stride(11)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__xop_ld128, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
  }
}
21359
// Sweeps k in [1, 7], n in [1, 4] and m in [1, 4], using iterations(1)
// for each combination.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__XOP_LD128, k_lt_8_subtile) {
  TEST_REQUIRES_X86_XOP;
  for (size_t k = 1; k < 8; k++) {
    for (uint32_t n = 1; n <= 4; n++) {
      for (uint32_t m = 1; m <= 4; m++) {
        GemmMicrokernelTester()
          .mr(4)
          .nr(4)
          .kr(2)
          .sr(4)
          .m(m)
          .n(n)
          .k(k)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__xop_ld128, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
      }
    }
  }
}
21379
// Full 4x4 tile for every k in [9, 15].
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__XOP_LD128, k_gt_8) {
  TEST_REQUIRES_X86_XOP;
  for (size_t k = 9; k < 16; k++) {
    GemmMicrokernelTester()
      .mr(4)
      .nr(4)
      .kr(2)
      .sr(4)
      .m(4)
      .n(4)
      .k(k)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__xop_ld128, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
  }
}
21394
// Full 4x4 tile for every k in [9, 15] with a_stride set to 19.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__XOP_LD128, k_gt_8_strided_a) {
  TEST_REQUIRES_X86_XOP;
  for (size_t k = 9; k < 16; k++) {
    GemmMicrokernelTester()
      .mr(4)
      .nr(4)
      .kr(2)
      .sr(4)
      .m(4)
      .n(4)
      .k(k)
      .a_stride(19)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__xop_ld128, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
  }
}
21410
// Sweeps k in [9, 15], n in [1, 4] and m in [1, 4], using iterations(1)
// for each combination.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__XOP_LD128, k_gt_8_subtile) {
  TEST_REQUIRES_X86_XOP;
  for (size_t k = 9; k < 16; k++) {
    for (uint32_t n = 1; n <= 4; n++) {
      for (uint32_t m = 1; m <= 4; m++) {
        GemmMicrokernelTester()
          .mr(4)
          .nr(4)
          .kr(2)
          .sr(4)
          .m(m)
          .n(n)
          .k(k)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__xop_ld128, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
      }
    }
  }
}
21430
// Full 4x4 tile for k = 16, 24, ..., 80 (multiples of 8).
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__XOP_LD128, k_div_8) {
  TEST_REQUIRES_X86_XOP;
  for (size_t k = 16; k <= 80; k += 8) {
    GemmMicrokernelTester()
      .mr(4)
      .nr(4)
      .kr(2)
      .sr(4)
      .m(4)
      .n(4)
      .k(k)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__xop_ld128, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
  }
}
21445
// Full 4x4 tile for k = 16, 24, ..., 80 with a_stride set to 83.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__XOP_LD128, k_div_8_strided_a) {
  TEST_REQUIRES_X86_XOP;
  for (size_t k = 16; k <= 80; k += 8) {
    GemmMicrokernelTester()
      .mr(4)
      .nr(4)
      .kr(2)
      .sr(4)
      .m(4)
      .n(4)
      .k(k)
      .a_stride(83)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__xop_ld128, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
  }
}
21461
// Sweeps k = 16, 24, ..., 80 with n in [1, 4] and m in [1, 4], using
// iterations(1) for each combination.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__XOP_LD128, k_div_8_subtile) {
  TEST_REQUIRES_X86_XOP;
  for (size_t k = 16; k <= 80; k += 8) {
    for (uint32_t n = 1; n <= 4; n++) {
      for (uint32_t m = 1; m <= 4; m++) {
        GemmMicrokernelTester()
          .mr(4)
          .nr(4)
          .kr(2)
          .sr(4)
          .m(m)
          .n(n)
          .k(k)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__xop_ld128, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
      }
    }
  }
}
21481
// m = 4 with n in [5, 7] (above nr = 4), for k = 1, 10, 19, 28, 37.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__XOP_LD128, n_gt_4) {
  TEST_REQUIRES_X86_XOP;
  for (uint32_t n = 5; n < 8; n++) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(4)
        .nr(4)
        .kr(2)
        .sr(4)
        .m(4)
        .n(n)
        .k(k)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__xop_ld128, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
    }
  }
}
21498
// m = 4 with n in [5, 7] and cn_stride set to 7, for k = 1, 10, 19, 28, 37.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__XOP_LD128, n_gt_4_strided_cn) {
  TEST_REQUIRES_X86_XOP;
  for (uint32_t n = 5; n < 8; n++) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(4)
        .nr(4)
        .kr(2)
        .sr(4)
        .m(4)
        .n(n)
        .k(k)
        .cn_stride(7)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__xop_ld128, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
    }
  }
}
21516
// m = 4 with n in [5, 7] and a_stride set to 43, for k = 1, 10, 19, 28, 37.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__XOP_LD128, n_gt_4_strided_a) {
  TEST_REQUIRES_X86_XOP;
  for (uint32_t n = 5; n < 8; n++) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(4)
        .nr(4)
        .kr(2)
        .sr(4)
        .m(4)
        .n(n)
        .k(k)
        .a_stride(43)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__xop_ld128, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
    }
  }
}
21534
// Sweeps n in [5, 7], k = 1, 10, 19, 28, 37 and m in [1, 4], using
// iterations(1) for each combination.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__XOP_LD128, n_gt_4_subtile) {
  TEST_REQUIRES_X86_XOP;
  for (uint32_t n = 5; n < 8; n++) {
    for (size_t k = 1; k <= 40; k += 9) {
      for (uint32_t m = 1; m <= 4; m++) {
        GemmMicrokernelTester()
          .mr(4)
          .nr(4)
          .kr(2)
          .sr(4)
          .m(m)
          .n(n)
          .k(k)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__xop_ld128, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
      }
    }
  }
}
21554
// m = 4 with n = 8 and n = 12 (multiples of nr = 4), for k = 1, 10, 19, 28, 37.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__XOP_LD128, n_div_4) {
  TEST_REQUIRES_X86_XOP;
  for (uint32_t n = 8; n <= 12; n += 4) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(4)
        .nr(4)
        .kr(2)
        .sr(4)
        .m(4)
        .n(n)
        .k(k)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__xop_ld128, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
    }
  }
}
21571
// m = 4 with n = 8 and n = 12 and cn_stride set to 7,
// for k = 1, 10, 19, 28, 37.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__XOP_LD128, n_div_4_strided_cn) {
  TEST_REQUIRES_X86_XOP;
  for (uint32_t n = 8; n <= 12; n += 4) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(4)
        .nr(4)
        .kr(2)
        .sr(4)
        .m(4)
        .n(n)
        .k(k)
        .cn_stride(7)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__xop_ld128, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
    }
  }
}
21589
// m = 4 with n = 8 and n = 12 and a_stride set to 43,
// for k = 1, 10, 19, 28, 37.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__XOP_LD128, n_div_4_strided_a) {
  TEST_REQUIRES_X86_XOP;
  for (uint32_t n = 8; n <= 12; n += 4) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(4)
        .nr(4)
        .kr(2)
        .sr(4)
        .m(4)
        .n(n)
        .k(k)
        .a_stride(43)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__xop_ld128, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
    }
  }
}
21607
// Sweeps n = 8 and n = 12, k = 1, 10, 19, 28, 37 and m in [1, 4], using
// iterations(1) for each combination.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__XOP_LD128, n_div_4_subtile) {
  TEST_REQUIRES_X86_XOP;
  for (uint32_t n = 8; n <= 12; n += 4) {
    for (size_t k = 1; k <= 40; k += 9) {
      for (uint32_t m = 1; m <= 4; m++) {
        GemmMicrokernelTester()
          .mr(4)
          .nr(4)
          .kr(2)
          .sr(4)
          .m(m)
          .n(n)
          .k(k)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__xop_ld128, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
      }
    }
  }
}
21627
// Sweeps k = 1, 10, 19, 28, 37 with n in [1, 4] and m in [1, 4], with
// cm_stride set to 7 and iterations(1) for each combination.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__XOP_LD128, strided_cm_subtile) {
  TEST_REQUIRES_X86_XOP;
  for (size_t k = 1; k <= 40; k += 9) {
    for (uint32_t n = 1; n <= 4; n++) {
      for (uint32_t m = 1; m <= 4; m++) {
        GemmMicrokernelTester()
          .mr(4)
          .nr(4)
          .kr(2)
          .sr(4)
          .m(m)
          .n(n)
          .k(k)
          .cm_stride(7)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__xop_ld128, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
      }
    }
  }
}
21648
// Full 4x4 tile with k = 8 and qmin set to 128.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__XOP_LD128, qmin) {
  TEST_REQUIRES_X86_XOP;
  GemmMicrokernelTester()
    .mr(4)
    .nr(4)
    .kr(2)
    .sr(4)
    .m(4)
    .n(4)
    .k(8)
    .qmin(128)
    .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__xop_ld128, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
}
21662
// Full 4x4 tile with k = 8 and qmax set to 128.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__XOP_LD128, qmax) {
  TEST_REQUIRES_X86_XOP;
  GemmMicrokernelTester()
    .mr(4)
    .nr(4)
    .kr(2)
    .sr(4)
    .m(4)
    .n(4)
    .k(8)
    .qmax(128)
    .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__xop_ld128, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
}
21676
// Full 4x4 tile with k = 8 and cm_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__XOP_LD128, strided_cm) {
  TEST_REQUIRES_X86_XOP;
  GemmMicrokernelTester()
    .mr(4)
    .nr(4)
    .kr(2)
    .sr(4)
    .m(4)
    .n(4)
    .k(8)
    .cm_stride(7)
    .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__xop_ld128, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
}
21690
// Full 4x4 tile with a_zero_point forced to 0, for k = 1, 10, 19, 28, 37.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__XOP_LD128, no_a_zero_point) {
  TEST_REQUIRES_X86_XOP;
  for (size_t k = 1; k <= 40; k += 9) {
    GemmMicrokernelTester()
      .mr(4)
      .nr(4)
      .kr(2)
      .sr(4)
      .m(4)
      .n(4)
      .k(k)
      .a_zero_point(0)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__xop_ld128, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
  }
}
21706
// Full 4x4 tile with b_zero_point forced to 0, for k = 1, 10, 19, 28, 37.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__XOP_LD128, no_b_zero_point) {
  TEST_REQUIRES_X86_XOP;
  for (size_t k = 1; k <= 40; k += 9) {
    GemmMicrokernelTester()
      .mr(4)
      .nr(4)
      .kr(2)
      .sr(4)
      .m(4)
      .n(4)
      .k(k)
      .b_zero_point(0)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__xop_ld128, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
  }
}
21722
// Full 4x4 tile with both a_zero_point and b_zero_point forced to 0,
// for k = 1, 10, 19, 28, 37.
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__XOP_LD128, no_zero_point) {
  TEST_REQUIRES_X86_XOP;
  for (size_t k = 1; k <= 40; k += 9) {
    GemmMicrokernelTester()
      .mr(4)
      .nr(4)
      .kr(2)
      .sr(4)
      .m(4)
      .n(4)
      .k(k)
      .a_zero_point(0)
      .b_zero_point(0)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__xop_ld128, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
  }
}
21739 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
21740
21741
21742 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Full 1x4 tile with k = 8; no stride, zero-point or clamp setter is called.
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, k_eq_8) {
  TEST_REQUIRES_X86_SSE41;
  GemmMicrokernelTester()
    .mr(1)
    .nr(4)
    .kr(8)
    .sr(1)
    .m(1)
    .n(4)
    .k(8)
    .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
}
21755
// Full 1x4 tile with k = 8 and cn_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, strided_cn) {
  TEST_REQUIRES_X86_SSE41;
  GemmMicrokernelTester()
    .mr(1)
    .nr(4)
    .kr(8)
    .sr(1)
    .m(1)
    .n(4)
    .k(8)
    .cn_stride(7)
    .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
}
21769
// Full 1x4 tile with k = 8 and a_stride set to 11.
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, k_eq_8_strided_a) {
  TEST_REQUIRES_X86_SSE41;
  GemmMicrokernelTester()
    .mr(1)
    .nr(4)
    .kr(8)
    .sr(1)
    .m(1)
    .n(4)
    .k(8)
    .a_stride(11)
    .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
}
21783
// Sweeps n in [1, 4] at k = 8, using iterations(1) for each shape; the m loop
// covers only m = 1 because mr is 1.
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, k_eq_8_subtile) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t n = 1; n <= 4; n++) {
    for (uint32_t m = 1; m <= 1; m++) {
      GemmMicrokernelTester()
        .mr(1)
        .nr(4)
        .kr(8)
        .sr(1)
        .m(m)
        .n(n)
        .k(8)
        .iterations(1)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
    }
  }
}
21801
// Loops m over [1, 1] (a single value, since mr is 1) with n = 4 and k = 8,
// using iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, k_eq_8_subtile_m) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t m = 1; m <= 1; m++) {
    GemmMicrokernelTester()
      .mr(1)
      .nr(4)
      .kr(8)
      .sr(1)
      .m(m)
      .n(4)
      .k(8)
      .iterations(1)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
  }
}
21817
// Sweeps n in [1, 4] with m = 1 and k = 8, using iterations(1) for each shape.
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, k_eq_8_subtile_n) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t n = 1; n <= 4; n++) {
    GemmMicrokernelTester()
      .mr(1)
      .nr(4)
      .kr(8)
      .sr(1)
      .m(1)
      .n(n)
      .k(8)
      .iterations(1)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
  }
}
21833
// Full 1x4 tile for every k in [1, 7].
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, k_lt_8) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k = 1; k < 8; k++) {
    GemmMicrokernelTester()
      .mr(1)
      .nr(4)
      .kr(8)
      .sr(1)
      .m(1)
      .n(4)
      .k(k)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
  }
}
21848
// Full 1x4 tile for every k in [1, 7] with a_stride set to 11.
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, k_lt_8_strided_a) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k = 1; k < 8; k++) {
    GemmMicrokernelTester()
      .mr(1)
      .nr(4)
      .kr(8)
      .sr(1)
      .m(1)
      .n(4)
      .k(k)
      .a_stride(11)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
  }
}
21864
// Sweeps k in [1, 7] and n in [1, 4] (the m loop covers only m = 1), using
// iterations(1) for each combination.
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, k_lt_8_subtile) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k = 1; k < 8; k++) {
    for (uint32_t n = 1; n <= 4; n++) {
      for (uint32_t m = 1; m <= 1; m++) {
        GemmMicrokernelTester()
          .mr(1)
          .nr(4)
          .kr(8)
          .sr(1)
          .m(m)
          .n(n)
          .k(k)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
      }
    }
  }
}
21884
// Full 1x4 tile for every k in [9, 15].
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, k_gt_8) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k = 9; k < 16; k++) {
    GemmMicrokernelTester()
      .mr(1)
      .nr(4)
      .kr(8)
      .sr(1)
      .m(1)
      .n(4)
      .k(k)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
  }
}
21899
// Full 1x4 tile for every k in [9, 15] with a_stride set to 19.
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, k_gt_8_strided_a) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k = 9; k < 16; k++) {
    GemmMicrokernelTester()
      .mr(1)
      .nr(4)
      .kr(8)
      .sr(1)
      .m(1)
      .n(4)
      .k(k)
      .a_stride(19)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
  }
}
21915
// Sweeps k in [9, 15] and n in [1, 4] (the m loop covers only m = 1), using
// iterations(1) for each combination.
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, k_gt_8_subtile) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k = 9; k < 16; k++) {
    for (uint32_t n = 1; n <= 4; n++) {
      for (uint32_t m = 1; m <= 1; m++) {
        GemmMicrokernelTester()
          .mr(1)
          .nr(4)
          .kr(8)
          .sr(1)
          .m(m)
          .n(n)
          .k(k)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
      }
    }
  }
}
21935
// Full 1x4 tile for k = 16, 24, ..., 80 (multiples of 8).
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, k_div_8) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k = 16; k <= 80; k += 8) {
    GemmMicrokernelTester()
      .mr(1)
      .nr(4)
      .kr(8)
      .sr(1)
      .m(1)
      .n(4)
      .k(k)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
  }
}
21950
// Full 1x4 tile for k = 16, 24, ..., 80 with a_stride set to 83.
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, k_div_8_strided_a) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k = 16; k <= 80; k += 8) {
    GemmMicrokernelTester()
      .mr(1)
      .nr(4)
      .kr(8)
      .sr(1)
      .m(1)
      .n(4)
      .k(k)
      .a_stride(83)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
  }
}
21966
// Sweeps k = 16, 24, ..., 80 and n in [1, 4] (the m loop covers only m = 1),
// using iterations(1) for each combination.
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, k_div_8_subtile) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k = 16; k <= 80; k += 8) {
    for (uint32_t n = 1; n <= 4; n++) {
      for (uint32_t m = 1; m <= 1; m++) {
        GemmMicrokernelTester()
          .mr(1)
          .nr(4)
          .kr(8)
          .sr(1)
          .m(m)
          .n(n)
          .k(k)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
      }
    }
  }
}
21986
// m = 1 with n in [5, 7] (above nr = 4), for k = 1, 10, 19, 28, 37.
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, n_gt_4) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t n = 5; n < 8; n++) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(1)
        .nr(4)
        .kr(8)
        .sr(1)
        .m(1)
        .n(n)
        .k(k)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
    }
  }
}
22003
// m = 1 with n in [5, 7] and cn_stride set to 7, for k = 1, 10, 19, 28, 37.
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, n_gt_4_strided_cn) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t n = 5; n < 8; n++) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(1)
        .nr(4)
        .kr(8)
        .sr(1)
        .m(1)
        .n(n)
        .k(k)
        .cn_stride(7)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
    }
  }
}
22021
// m = 1 with n in [5, 7] and a_stride set to 43, for k = 1, 10, 19, 28, 37.
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, n_gt_4_strided_a) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t n = 5; n < 8; n++) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(1)
        .nr(4)
        .kr(8)
        .sr(1)
        .m(1)
        .n(n)
        .k(k)
        .a_stride(43)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
    }
  }
}
22039
// Sweeps n in [5, 7] and k = 1, 10, 19, 28, 37 (the m loop covers only m = 1),
// using iterations(1) for each combination.
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, n_gt_4_subtile) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t n = 5; n < 8; n++) {
    for (size_t k = 1; k <= 40; k += 9) {
      for (uint32_t m = 1; m <= 1; m++) {
        GemmMicrokernelTester()
          .mr(1)
          .nr(4)
          .kr(8)
          .sr(1)
          .m(m)
          .n(n)
          .k(k)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
      }
    }
  }
}
22059
// m = 1 with n = 8 and n = 12 (multiples of nr = 4), for k = 1, 10, 19, 28, 37.
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, n_div_4) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t n = 8; n <= 12; n += 4) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(1)
        .nr(4)
        .kr(8)
        .sr(1)
        .m(1)
        .n(n)
        .k(k)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
    }
  }
}
22076
// m = 1 with n = 8 and n = 12 and cn_stride set to 7,
// for k = 1, 10, 19, 28, 37.
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, n_div_4_strided_cn) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t n = 8; n <= 12; n += 4) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(1)
        .nr(4)
        .kr(8)
        .sr(1)
        .m(1)
        .n(n)
        .k(k)
        .cn_stride(7)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
    }
  }
}
22094
// m = 1 with n = 8 and n = 12 and a_stride set to 43,
// for k = 1, 10, 19, 28, 37.
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, n_div_4_strided_a) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t n = 8; n <= 12; n += 4) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(1)
        .nr(4)
        .kr(8)
        .sr(1)
        .m(1)
        .n(n)
        .k(k)
        .a_stride(43)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
    }
  }
}
22112
// Sweeps n = 8 and n = 12 and k = 1, 10, 19, 28, 37 (the m loop covers only
// m = 1), using iterations(1) for each combination.
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, n_div_4_subtile) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t n = 8; n <= 12; n += 4) {
    for (size_t k = 1; k <= 40; k += 9) {
      for (uint32_t m = 1; m <= 1; m++) {
        GemmMicrokernelTester()
          .mr(1)
          .nr(4)
          .kr(8)
          .sr(1)
          .m(m)
          .n(n)
          .k(k)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
      }
    }
  }
}
22132
// Sweeps k = 1, 10, 19, 28, 37 and n in [1, 4] (the m loop covers only m = 1),
// with cm_stride set to 7 and iterations(1) for each combination.
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, strided_cm_subtile) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k = 1; k <= 40; k += 9) {
    for (uint32_t n = 1; n <= 4; n++) {
      for (uint32_t m = 1; m <= 1; m++) {
        GemmMicrokernelTester()
          .mr(1)
          .nr(4)
          .kr(8)
          .sr(1)
          .m(m)
          .n(n)
          .k(k)
          .cm_stride(7)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
      }
    }
  }
}
22153
// Full 1x4 tile with k = 8 and qmin set to 128.
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, qmin) {
  TEST_REQUIRES_X86_SSE41;
  GemmMicrokernelTester()
    .mr(1)
    .nr(4)
    .kr(8)
    .sr(1)
    .m(1)
    .n(4)
    .k(8)
    .qmin(128)
    .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
}
22167
// Full 1x4 tile with k = 8 and qmax set to 128.
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, qmax) {
  TEST_REQUIRES_X86_SSE41;
  GemmMicrokernelTester()
    .mr(1)
    .nr(4)
    .kr(8)
    .sr(1)
    .m(1)
    .n(4)
    .k(8)
    .qmax(128)
    .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
}
22181
// Full 1x4 tile with k = 8 and cm_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, strided_cm) {
  TEST_REQUIRES_X86_SSE41;
  GemmMicrokernelTester()
    .mr(1)
    .nr(4)
    .kr(8)
    .sr(1)
    .m(1)
    .n(4)
    .k(8)
    .cm_stride(7)
    .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
}
22195
// Full 1x4 tile with a_zero_point forced to 0, for k = 1, 10, 19, 28, 37.
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, no_a_zero_point) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k = 1; k <= 40; k += 9) {
    GemmMicrokernelTester()
      .mr(1)
      .nr(4)
      .kr(8)
      .sr(1)
      .m(1)
      .n(4)
      .k(k)
      .a_zero_point(0)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
  }
}
22211
// Full 1x4 tile with b_zero_point forced to 0, for k = 1, 10, 19, 28, 37.
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, no_b_zero_point) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k = 1; k <= 40; k += 9) {
    GemmMicrokernelTester()
      .mr(1)
      .nr(4)
      .kr(8)
      .sr(1)
      .m(1)
      .n(4)
      .k(k)
      .b_zero_point(0)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
  }
}
22227
// Full 1x4 tile with both a_zero_point and b_zero_point forced to 0,
// for k = 1, 10, 19, 28, 37.
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__SSE41_LD64, no_zero_point) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k = 1; k <= 40; k += 9) {
    GemmMicrokernelTester()
      .mr(1)
      .nr(4)
      .kr(8)
      .sr(1)
      .m(1)
      .n(4)
      .k(k)
      .a_zero_point(0)
      .b_zero_point(0)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
  }
}
22244 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
22245
22246
22247 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Full 2x4 tile with k = 8; no stride, zero-point or clamp setter is called.
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD64, k_eq_8) {
  TEST_REQUIRES_X86_SSE41;
  GemmMicrokernelTester()
    .mr(2)
    .nr(4)
    .kr(8)
    .sr(1)
    .m(2)
    .n(4)
    .k(8)
    .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
}
22260
// Full 2x4 tile with k = 8 and cn_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD64, strided_cn) {
  TEST_REQUIRES_X86_SSE41;
  GemmMicrokernelTester()
    .mr(2)
    .nr(4)
    .kr(8)
    .sr(1)
    .m(2)
    .n(4)
    .k(8)
    .cn_stride(7)
    .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
}
22274
// Full 2x4 tile with k = 8 and a_stride set to 11.
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD64, k_eq_8_strided_a) {
  TEST_REQUIRES_X86_SSE41;
  GemmMicrokernelTester()
    .mr(2)
    .nr(4)
    .kr(8)
    .sr(1)
    .m(2)
    .n(4)
    .k(8)
    .a_stride(11)
    .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, xnn_init_qu8_conv_minmax_fp32_sse2_params, xnn_qu8_requantize_fp32);
}
22288
// Sweeps n over [1, 4] and m over [1, 2] at k=8, with iterations(1) for each
// combination.
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD64, k_eq_8_subtile) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t n_dim = 1; n_dim <= 4; ++n_dim) {
    for (uint32_t m_dim = 1; m_dim <= 2; ++m_dim) {
      GemmMicrokernelTester()
          .mr(2).nr(4).kr(8).sr(1)
          .m(m_dim).n(n_dim).k(8)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64,
                xnn_init_qu8_conv_minmax_fp32_sse2_params,
                xnn_qu8_requantize_fp32);
    }
  }
}
22306
// Sweeps m over [1, 2] with n=4 and k=8, with iterations(1) for each value.
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD64, k_eq_8_subtile_m) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t m_dim = 1; m_dim <= 2; ++m_dim) {
    GemmMicrokernelTester()
        .mr(2).nr(4).kr(8).sr(1)
        .m(m_dim).n(4).k(8)
        .iterations(1)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64,
              xnn_init_qu8_conv_minmax_fp32_sse2_params,
              xnn_qu8_requantize_fp32);
  }
}
22322
// Sweeps n over [1, 4] with m=2 and k=8, with iterations(1) for each value.
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD64, k_eq_8_subtile_n) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t n_dim = 1; n_dim <= 4; ++n_dim) {
    GemmMicrokernelTester()
        .mr(2).nr(4).kr(8).sr(1)
        .m(2).n(n_dim).k(8)
        .iterations(1)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64,
              xnn_init_qu8_conv_minmax_fp32_sse2_params,
              xnn_qu8_requantize_fp32);
  }
}
22338
// Runs m=2, n=4 for every k in [1, 7], i.e. k below the kr=8 setting.
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD64, k_lt_8) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k_dim = 1; k_dim < 8; ++k_dim) {
    GemmMicrokernelTester()
        .mr(2).nr(4).kr(8).sr(1)
        .m(2).n(4).k(k_dim)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64,
              xnn_init_qu8_conv_minmax_fp32_sse2_params,
              xnn_qu8_requantize_fp32);
  }
}
22353
// Runs m=2, n=4 for every k in [1, 7] with a_stride(11) set on the tester.
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD64, k_lt_8_strided_a) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k_dim = 1; k_dim < 8; ++k_dim) {
    GemmMicrokernelTester()
        .mr(2).nr(4).kr(8).sr(1)
        .m(2).n(4).k(k_dim)
        .a_stride(11)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64,
              xnn_init_qu8_conv_minmax_fp32_sse2_params,
              xnn_qu8_requantize_fp32);
  }
}
22369
// Sweeps k over [1, 7], n over [1, 4] and m over [1, 2], with iterations(1)
// for each combination.
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD64, k_lt_8_subtile) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k_dim = 1; k_dim < 8; ++k_dim) {
    for (uint32_t n_dim = 1; n_dim <= 4; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 2; ++m_dim) {
        GemmMicrokernelTester()
            .mr(2).nr(4).kr(8).sr(1)
            .m(m_dim).n(n_dim).k(k_dim)
            .iterations(1)
            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64,
                  xnn_init_qu8_conv_minmax_fp32_sse2_params,
                  xnn_qu8_requantize_fp32);
      }
    }
  }
}
22389
// Runs m=2, n=4 for every k in [9, 15], i.e. k strictly between kr=8 and 16.
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD64, k_gt_8) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k_dim = 9; k_dim < 16; ++k_dim) {
    GemmMicrokernelTester()
        .mr(2).nr(4).kr(8).sr(1)
        .m(2).n(4).k(k_dim)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64,
              xnn_init_qu8_conv_minmax_fp32_sse2_params,
              xnn_qu8_requantize_fp32);
  }
}
22404
// Runs m=2, n=4 for every k in [9, 15] with a_stride(19) set on the tester.
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD64, k_gt_8_strided_a) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k_dim = 9; k_dim < 16; ++k_dim) {
    GemmMicrokernelTester()
        .mr(2).nr(4).kr(8).sr(1)
        .m(2).n(4).k(k_dim)
        .a_stride(19)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64,
              xnn_init_qu8_conv_minmax_fp32_sse2_params,
              xnn_qu8_requantize_fp32);
  }
}
22420
// Sweeps k over [9, 15], n over [1, 4] and m over [1, 2], with iterations(1)
// for each combination.
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD64, k_gt_8_subtile) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k_dim = 9; k_dim < 16; ++k_dim) {
    for (uint32_t n_dim = 1; n_dim <= 4; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 2; ++m_dim) {
        GemmMicrokernelTester()
            .mr(2).nr(4).kr(8).sr(1)
            .m(m_dim).n(n_dim).k(k_dim)
            .iterations(1)
            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64,
                  xnn_init_qu8_conv_minmax_fp32_sse2_params,
                  xnn_qu8_requantize_fp32);
      }
    }
  }
}
22440
// Runs m=2, n=4 for k = 16, 24, ..., 80 (multiples of 8).
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD64, k_div_8) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k_dim = 16; k_dim <= 80; k_dim += 8) {
    GemmMicrokernelTester()
        .mr(2).nr(4).kr(8).sr(1)
        .m(2).n(4).k(k_dim)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64,
              xnn_init_qu8_conv_minmax_fp32_sse2_params,
              xnn_qu8_requantize_fp32);
  }
}
22455
// Runs m=2, n=4 for k = 16, 24, ..., 80 with a_stride(83) set on the tester.
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD64, k_div_8_strided_a) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k_dim = 16; k_dim <= 80; k_dim += 8) {
    GemmMicrokernelTester()
        .mr(2).nr(4).kr(8).sr(1)
        .m(2).n(4).k(k_dim)
        .a_stride(83)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64,
              xnn_init_qu8_conv_minmax_fp32_sse2_params,
              xnn_qu8_requantize_fp32);
  }
}
22471
// Sweeps k = 16, 24, ..., 80, n over [1, 4] and m over [1, 2], with
// iterations(1) for each combination.
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD64, k_div_8_subtile) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k_dim = 16; k_dim <= 80; k_dim += 8) {
    for (uint32_t n_dim = 1; n_dim <= 4; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 2; ++m_dim) {
        GemmMicrokernelTester()
            .mr(2).nr(4).kr(8).sr(1)
            .m(m_dim).n(n_dim).k(k_dim)
            .iterations(1)
            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64,
                  xnn_init_qu8_conv_minmax_fp32_sse2_params,
                  xnn_qu8_requantize_fp32);
      }
    }
  }
}
22491
// Runs m=2 for n in [5, 7] (above nr=4) crossed with k = 1, 10, 19, 28, 37.
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD64, n_gt_4) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t n_dim = 5; n_dim < 8; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      GemmMicrokernelTester()
          .mr(2).nr(4).kr(8).sr(1)
          .m(2).n(n_dim).k(k_dim)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64,
                xnn_init_qu8_conv_minmax_fp32_sse2_params,
                xnn_qu8_requantize_fp32);
    }
  }
}
22508
// Runs m=2 for n in [5, 7] crossed with k = 1, 10, 19, 28, 37, with
// cn_stride(7) set on the tester.
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD64, n_gt_4_strided_cn) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t n_dim = 5; n_dim < 8; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      GemmMicrokernelTester()
          .mr(2).nr(4).kr(8).sr(1)
          .m(2).n(n_dim).k(k_dim)
          .cn_stride(7)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64,
                xnn_init_qu8_conv_minmax_fp32_sse2_params,
                xnn_qu8_requantize_fp32);
    }
  }
}
22526
// Runs m=2 for n in [5, 7] crossed with k = 1, 10, 19, 28, 37, with
// a_stride(43) set on the tester.
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD64, n_gt_4_strided_a) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t n_dim = 5; n_dim < 8; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      GemmMicrokernelTester()
          .mr(2).nr(4).kr(8).sr(1)
          .m(2).n(n_dim).k(k_dim)
          .a_stride(43)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64,
                xnn_init_qu8_conv_minmax_fp32_sse2_params,
                xnn_qu8_requantize_fp32);
    }
  }
}
22544
// Sweeps n over [5, 7], k = 1, 10, 19, 28, 37 and m over [1, 2], with
// iterations(1) for each combination.
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD64, n_gt_4_subtile) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t n_dim = 5; n_dim < 8; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      for (uint32_t m_dim = 1; m_dim <= 2; ++m_dim) {
        GemmMicrokernelTester()
            .mr(2).nr(4).kr(8).sr(1)
            .m(m_dim).n(n_dim).k(k_dim)
            .iterations(1)
            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64,
                  xnn_init_qu8_conv_minmax_fp32_sse2_params,
                  xnn_qu8_requantize_fp32);
      }
    }
  }
}
22564
// Runs m=2 for n = 8 and 12 (multiples of nr=4) crossed with
// k = 1, 10, 19, 28, 37.
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD64, n_div_4) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t n_dim = 8; n_dim <= 12; n_dim += 4) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      GemmMicrokernelTester()
          .mr(2).nr(4).kr(8).sr(1)
          .m(2).n(n_dim).k(k_dim)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64,
                xnn_init_qu8_conv_minmax_fp32_sse2_params,
                xnn_qu8_requantize_fp32);
    }
  }
}
22581
// Runs m=2 for n = 8 and 12 crossed with k = 1, 10, 19, 28, 37, with
// cn_stride(7) set on the tester.
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD64, n_div_4_strided_cn) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t n_dim = 8; n_dim <= 12; n_dim += 4) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      GemmMicrokernelTester()
          .mr(2).nr(4).kr(8).sr(1)
          .m(2).n(n_dim).k(k_dim)
          .cn_stride(7)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64,
                xnn_init_qu8_conv_minmax_fp32_sse2_params,
                xnn_qu8_requantize_fp32);
    }
  }
}
22599
// Runs m=2 for n = 8 and 12 crossed with k = 1, 10, 19, 28, 37, with
// a_stride(43) set on the tester.
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD64, n_div_4_strided_a) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t n_dim = 8; n_dim <= 12; n_dim += 4) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      GemmMicrokernelTester()
          .mr(2).nr(4).kr(8).sr(1)
          .m(2).n(n_dim).k(k_dim)
          .a_stride(43)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64,
                xnn_init_qu8_conv_minmax_fp32_sse2_params,
                xnn_qu8_requantize_fp32);
    }
  }
}
22617
// Sweeps n = 8 and 12, k = 1, 10, 19, 28, 37 and m over [1, 2], with
// iterations(1) for each combination.
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD64, n_div_4_subtile) {
  TEST_REQUIRES_X86_SSE41;
  for (uint32_t n_dim = 8; n_dim <= 12; n_dim += 4) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      for (uint32_t m_dim = 1; m_dim <= 2; ++m_dim) {
        GemmMicrokernelTester()
            .mr(2).nr(4).kr(8).sr(1)
            .m(m_dim).n(n_dim).k(k_dim)
            .iterations(1)
            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64,
                  xnn_init_qu8_conv_minmax_fp32_sse2_params,
                  xnn_qu8_requantize_fp32);
      }
    }
  }
}
22637
// Sweeps k = 1, 10, 19, 28, 37, n over [1, 4] and m over [1, 2] with
// cm_stride(7) set on the tester and iterations(1) for each combination.
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD64, strided_cm_subtile) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
    for (uint32_t n_dim = 1; n_dim <= 4; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 2; ++m_dim) {
        GemmMicrokernelTester()
            .mr(2).nr(4).kr(8).sr(1)
            .m(m_dim).n(n_dim).k(k_dim)
            .cm_stride(7)
            .iterations(1)
            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64,
                  xnn_init_qu8_conv_minmax_fp32_sse2_params,
                  xnn_qu8_requantize_fp32);
      }
    }
  }
}
22658
// Baseline m=2, n=4, k=8 problem with qmin(128) set on the tester.
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD64, qmin) {
  TEST_REQUIRES_X86_SSE41;
  GemmMicrokernelTester()
      .mr(2).nr(4).kr(8).sr(1)
      .m(2).n(4).k(8)
      .qmin(128)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
}
22672
// Baseline m=2, n=4, k=8 problem with qmax(128) set on the tester.
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD64, qmax) {
  TEST_REQUIRES_X86_SSE41;
  GemmMicrokernelTester()
      .mr(2).nr(4).kr(8).sr(1)
      .m(2).n(4).k(8)
      .qmax(128)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
}
22686
// Baseline m=2, n=4, k=8 problem with cm_stride(7) set on the tester.
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD64, strided_cm) {
  TEST_REQUIRES_X86_SSE41;
  GemmMicrokernelTester()
      .mr(2).nr(4).kr(8).sr(1)
      .m(2).n(4).k(8)
      .cm_stride(7)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
}
22700
// Runs m=2, n=4 for k = 1, 10, 19, 28, 37 with a_zero_point(0) set on the
// tester.
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD64, no_a_zero_point) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
    GemmMicrokernelTester()
        .mr(2).nr(4).kr(8).sr(1)
        .m(2).n(4).k(k_dim)
        .a_zero_point(0)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64,
              xnn_init_qu8_conv_minmax_fp32_sse2_params,
              xnn_qu8_requantize_fp32);
  }
}
22716
// Runs m=2, n=4 for k = 1, 10, 19, 28, 37 with b_zero_point(0) set on the
// tester.
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD64, no_b_zero_point) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
    GemmMicrokernelTester()
        .mr(2).nr(4).kr(8).sr(1)
        .m(2).n(4).k(k_dim)
        .b_zero_point(0)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64,
              xnn_init_qu8_conv_minmax_fp32_sse2_params,
              xnn_qu8_requantize_fp32);
  }
}
22732
// Runs m=2, n=4 for k = 1, 10, 19, 28, 37 with both a_zero_point(0) and
// b_zero_point(0) set on the tester.
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__SSE41_LD64, no_zero_point) {
  TEST_REQUIRES_X86_SSE41;
  for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
    GemmMicrokernelTester()
        .mr(2).nr(4).kr(8).sr(1)
        .m(2).n(4).k(k_dim)
        .a_zero_point(0)
        .b_zero_point(0)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64,
              xnn_init_qu8_conv_minmax_fp32_sse2_params,
              xnn_qu8_requantize_fp32);
  }
}
22749 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
22750
22751
22752 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Baseline run: m=3, n=4, k=8, equal to the mr/nr/kr values configured on the
// tester.
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, k_eq_8) {
  TEST_REQUIRES_X86_SSE2;
  GemmMicrokernelTester()
      .mr(3).nr(4).kr(8).sr(1)
      .m(3).n(4).k(8)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
}
22765
// Baseline m=3, n=4, k=8 problem with cn_stride(7) set on the tester.
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, strided_cn) {
  TEST_REQUIRES_X86_SSE2;
  GemmMicrokernelTester()
      .mr(3).nr(4).kr(8).sr(1)
      .m(3).n(4).k(8)
      .cn_stride(7)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
}
22779
// Baseline m=3, n=4, k=8 problem with a_stride(11) set on the tester.
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, k_eq_8_strided_a) {
  TEST_REQUIRES_X86_SSE2;
  GemmMicrokernelTester()
      .mr(3).nr(4).kr(8).sr(1)
      .m(3).n(4).k(8)
      .a_stride(11)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
}
22793
// Sweeps n over [1, 4] and m over [1, 3] at k=8, with iterations(1) for each
// combination.
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, k_eq_8_subtile) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t n_dim = 1; n_dim <= 4; ++n_dim) {
    for (uint32_t m_dim = 1; m_dim <= 3; ++m_dim) {
      GemmMicrokernelTester()
          .mr(3).nr(4).kr(8).sr(1)
          .m(m_dim).n(n_dim).k(8)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64,
                xnn_init_qu8_conv_minmax_fp32_sse2_params,
                xnn_qu8_requantize_fp32);
    }
  }
}
22811
// Sweeps m over [1, 3] with n=4 and k=8, with iterations(1) for each value.
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, k_eq_8_subtile_m) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t m_dim = 1; m_dim <= 3; ++m_dim) {
    GemmMicrokernelTester()
        .mr(3).nr(4).kr(8).sr(1)
        .m(m_dim).n(4).k(8)
        .iterations(1)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64,
              xnn_init_qu8_conv_minmax_fp32_sse2_params,
              xnn_qu8_requantize_fp32);
  }
}
22827
// Sweeps n over [1, 4] with m=3 and k=8, with iterations(1) for each value.
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, k_eq_8_subtile_n) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t n_dim = 1; n_dim <= 4; ++n_dim) {
    GemmMicrokernelTester()
        .mr(3).nr(4).kr(8).sr(1)
        .m(3).n(n_dim).k(8)
        .iterations(1)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64,
              xnn_init_qu8_conv_minmax_fp32_sse2_params,
              xnn_qu8_requantize_fp32);
  }
}
22843
// Runs m=3, n=4 for every k in [1, 7], i.e. k below the kr=8 setting.
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, k_lt_8) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t k_dim = 1; k_dim < 8; ++k_dim) {
    GemmMicrokernelTester()
        .mr(3).nr(4).kr(8).sr(1)
        .m(3).n(4).k(k_dim)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64,
              xnn_init_qu8_conv_minmax_fp32_sse2_params,
              xnn_qu8_requantize_fp32);
  }
}
22858
// Runs m=3, n=4 for every k in [1, 7] with a_stride(11) set on the tester.
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, k_lt_8_strided_a) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t k_dim = 1; k_dim < 8; ++k_dim) {
    GemmMicrokernelTester()
        .mr(3).nr(4).kr(8).sr(1)
        .m(3).n(4).k(k_dim)
        .a_stride(11)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64,
              xnn_init_qu8_conv_minmax_fp32_sse2_params,
              xnn_qu8_requantize_fp32);
  }
}
22874
// Sweeps k over [1, 7], n over [1, 4] and m over [1, 3], with iterations(1)
// for each combination.
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, k_lt_8_subtile) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t k_dim = 1; k_dim < 8; ++k_dim) {
    for (uint32_t n_dim = 1; n_dim <= 4; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 3; ++m_dim) {
        GemmMicrokernelTester()
            .mr(3).nr(4).kr(8).sr(1)
            .m(m_dim).n(n_dim).k(k_dim)
            .iterations(1)
            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64,
                  xnn_init_qu8_conv_minmax_fp32_sse2_params,
                  xnn_qu8_requantize_fp32);
      }
    }
  }
}
22894
// Runs m=3, n=4 for every k in [9, 15], i.e. k strictly between kr=8 and 16.
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, k_gt_8) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t k_dim = 9; k_dim < 16; ++k_dim) {
    GemmMicrokernelTester()
        .mr(3).nr(4).kr(8).sr(1)
        .m(3).n(4).k(k_dim)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64,
              xnn_init_qu8_conv_minmax_fp32_sse2_params,
              xnn_qu8_requantize_fp32);
  }
}
22909
// Runs m=3, n=4 for every k in [9, 15] with a_stride(19) set on the tester.
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, k_gt_8_strided_a) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t k_dim = 9; k_dim < 16; ++k_dim) {
    GemmMicrokernelTester()
        .mr(3).nr(4).kr(8).sr(1)
        .m(3).n(4).k(k_dim)
        .a_stride(19)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64,
              xnn_init_qu8_conv_minmax_fp32_sse2_params,
              xnn_qu8_requantize_fp32);
  }
}
22925
// Sweeps k over [9, 15], n over [1, 4] and m over [1, 3], with iterations(1)
// for each combination.
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, k_gt_8_subtile) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t k_dim = 9; k_dim < 16; ++k_dim) {
    for (uint32_t n_dim = 1; n_dim <= 4; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 3; ++m_dim) {
        GemmMicrokernelTester()
            .mr(3).nr(4).kr(8).sr(1)
            .m(m_dim).n(n_dim).k(k_dim)
            .iterations(1)
            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64,
                  xnn_init_qu8_conv_minmax_fp32_sse2_params,
                  xnn_qu8_requantize_fp32);
      }
    }
  }
}
22945
// Runs m=3, n=4 for k = 16, 24, ..., 80 (multiples of 8).
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, k_div_8) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t k_dim = 16; k_dim <= 80; k_dim += 8) {
    GemmMicrokernelTester()
        .mr(3).nr(4).kr(8).sr(1)
        .m(3).n(4).k(k_dim)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64,
              xnn_init_qu8_conv_minmax_fp32_sse2_params,
              xnn_qu8_requantize_fp32);
  }
}
22960
// Runs m=3, n=4 for k = 16, 24, ..., 80 with a_stride(83) set on the tester.
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, k_div_8_strided_a) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t k_dim = 16; k_dim <= 80; k_dim += 8) {
    GemmMicrokernelTester()
        .mr(3).nr(4).kr(8).sr(1)
        .m(3).n(4).k(k_dim)
        .a_stride(83)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64,
              xnn_init_qu8_conv_minmax_fp32_sse2_params,
              xnn_qu8_requantize_fp32);
  }
}
22976
// Sweeps k = 16, 24, ..., 80, n over [1, 4] and m over [1, 3], with
// iterations(1) for each combination.
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, k_div_8_subtile) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t k_dim = 16; k_dim <= 80; k_dim += 8) {
    for (uint32_t n_dim = 1; n_dim <= 4; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 3; ++m_dim) {
        GemmMicrokernelTester()
            .mr(3).nr(4).kr(8).sr(1)
            .m(m_dim).n(n_dim).k(k_dim)
            .iterations(1)
            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64,
                  xnn_init_qu8_conv_minmax_fp32_sse2_params,
                  xnn_qu8_requantize_fp32);
      }
    }
  }
}
22996
// Runs m=3 for n in [5, 7] (above nr=4) crossed with k = 1, 10, 19, 28, 37.
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, n_gt_4) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t n_dim = 5; n_dim < 8; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      GemmMicrokernelTester()
          .mr(3).nr(4).kr(8).sr(1)
          .m(3).n(n_dim).k(k_dim)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64,
                xnn_init_qu8_conv_minmax_fp32_sse2_params,
                xnn_qu8_requantize_fp32);
    }
  }
}
23013
// Runs m=3 for n in [5, 7] crossed with k = 1, 10, 19, 28, 37, with
// cn_stride(7) set on the tester.
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, n_gt_4_strided_cn) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t n_dim = 5; n_dim < 8; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      GemmMicrokernelTester()
          .mr(3).nr(4).kr(8).sr(1)
          .m(3).n(n_dim).k(k_dim)
          .cn_stride(7)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64,
                xnn_init_qu8_conv_minmax_fp32_sse2_params,
                xnn_qu8_requantize_fp32);
    }
  }
}
23031
// Runs m=3 for n in [5, 7] crossed with k = 1, 10, 19, 28, 37, with
// a_stride(43) set on the tester.
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, n_gt_4_strided_a) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t n_dim = 5; n_dim < 8; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      GemmMicrokernelTester()
          .mr(3).nr(4).kr(8).sr(1)
          .m(3).n(n_dim).k(k_dim)
          .a_stride(43)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64,
                xnn_init_qu8_conv_minmax_fp32_sse2_params,
                xnn_qu8_requantize_fp32);
    }
  }
}
23049
// Sweeps n over [5, 7], k = 1, 10, 19, 28, 37 and m over [1, 3], with
// iterations(1) for each combination.
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, n_gt_4_subtile) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t n_dim = 5; n_dim < 8; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      for (uint32_t m_dim = 1; m_dim <= 3; ++m_dim) {
        GemmMicrokernelTester()
            .mr(3).nr(4).kr(8).sr(1)
            .m(m_dim).n(n_dim).k(k_dim)
            .iterations(1)
            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64,
                  xnn_init_qu8_conv_minmax_fp32_sse2_params,
                  xnn_qu8_requantize_fp32);
      }
    }
  }
}
23069
// Runs m=3 for n = 8 and 12 (multiples of nr=4) crossed with
// k = 1, 10, 19, 28, 37.
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, n_div_4) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t n_dim = 8; n_dim <= 12; n_dim += 4) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      GemmMicrokernelTester()
          .mr(3).nr(4).kr(8).sr(1)
          .m(3).n(n_dim).k(k_dim)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64,
                xnn_init_qu8_conv_minmax_fp32_sse2_params,
                xnn_qu8_requantize_fp32);
    }
  }
}
23086
// Runs m=3 for n = 8 and 12 crossed with k = 1, 10, 19, 28, 37, with
// cn_stride(7) set on the tester.
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, n_div_4_strided_cn) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t n_dim = 8; n_dim <= 12; n_dim += 4) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      GemmMicrokernelTester()
          .mr(3).nr(4).kr(8).sr(1)
          .m(3).n(n_dim).k(k_dim)
          .cn_stride(7)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64,
                xnn_init_qu8_conv_minmax_fp32_sse2_params,
                xnn_qu8_requantize_fp32);
    }
  }
}
23104
// Runs m=3 for n = 8 and 12 crossed with k = 1, 10, 19, 28, 37, with
// a_stride(43) set on the tester.
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, n_div_4_strided_a) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t n_dim = 8; n_dim <= 12; n_dim += 4) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      GemmMicrokernelTester()
          .mr(3).nr(4).kr(8).sr(1)
          .m(3).n(n_dim).k(k_dim)
          .a_stride(43)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64,
                xnn_init_qu8_conv_minmax_fp32_sse2_params,
                xnn_qu8_requantize_fp32);
    }
  }
}
23122
// Sweeps n = 8 and 12, k = 1, 10, 19, 28, 37 and m over [1, 3], with
// iterations(1) for each combination.
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, n_div_4_subtile) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t n_dim = 8; n_dim <= 12; n_dim += 4) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      for (uint32_t m_dim = 1; m_dim <= 3; ++m_dim) {
        GemmMicrokernelTester()
            .mr(3).nr(4).kr(8).sr(1)
            .m(m_dim).n(n_dim).k(k_dim)
            .iterations(1)
            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64,
                  xnn_init_qu8_conv_minmax_fp32_sse2_params,
                  xnn_qu8_requantize_fp32);
      }
    }
  }
}
23142
// Sweeps k = 1, 10, 19, 28, 37, n over [1, 4] and m over [1, 3] with
// cm_stride(7) set on the tester and iterations(1) for each combination.
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, strided_cm_subtile) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
    for (uint32_t n_dim = 1; n_dim <= 4; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 3; ++m_dim) {
        GemmMicrokernelTester()
            .mr(3).nr(4).kr(8).sr(1)
            .m(m_dim).n(n_dim).k(k_dim)
            .cm_stride(7)
            .iterations(1)
            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64,
                  xnn_init_qu8_conv_minmax_fp32_sse2_params,
                  xnn_qu8_requantize_fp32);
      }
    }
  }
}
23163
// Baseline m=3, n=4, k=8 problem with qmin(128) set on the tester.
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, qmin) {
  TEST_REQUIRES_X86_SSE2;
  GemmMicrokernelTester()
      .mr(3).nr(4).kr(8).sr(1)
      .m(3).n(4).k(8)
      .qmin(128)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
}
23177
// Baseline m=3, n=4, k=8 problem with qmax(128) set on the tester.
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, qmax) {
  TEST_REQUIRES_X86_SSE2;
  GemmMicrokernelTester()
      .mr(3).nr(4).kr(8).sr(1)
      .m(3).n(4).k(8)
      .qmax(128)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
}
23191
// Baseline m=3, n=4, k=8 problem with cm_stride(7) set on the tester.
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, strided_cm) {
  TEST_REQUIRES_X86_SSE2;
  GemmMicrokernelTester()
      .mr(3).nr(4).kr(8).sr(1)
      .m(3).n(4).k(8)
      .cm_stride(7)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
}
23205
// Runs m=3, n=4 for k = 1, 10, 19, 28, 37 with a_zero_point(0) set on the
// tester.
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, no_a_zero_point) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
    GemmMicrokernelTester()
        .mr(3).nr(4).kr(8).sr(1)
        .m(3).n(4).k(k_dim)
        .a_zero_point(0)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64,
              xnn_init_qu8_conv_minmax_fp32_sse2_params,
              xnn_qu8_requantize_fp32);
  }
}
23221
// Runs m=3, n=4 for k = 1, 10, 19, 28, 37 with b_zero_point(0) set on the
// tester.
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, no_b_zero_point) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
    GemmMicrokernelTester()
        .mr(3).nr(4).kr(8).sr(1)
        .m(3).n(4).k(k_dim)
        .b_zero_point(0)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64,
              xnn_init_qu8_conv_minmax_fp32_sse2_params,
              xnn_qu8_requantize_fp32);
  }
}
23237
// Runs m=3, n=4 for k = 1, 10, 19, 28, 37 with both a_zero_point(0) and
// b_zero_point(0) set on the tester.
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD64, no_zero_point) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
    GemmMicrokernelTester()
        .mr(3).nr(4).kr(8).sr(1)
        .m(3).n(4).k(k_dim)
        .a_zero_point(0)
        .b_zero_point(0)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64,
              xnn_init_qu8_conv_minmax_fp32_sse2_params,
              xnn_qu8_requantize_fp32);
  }
}
23254 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
23255
23256
23257 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Baseline run: m=1, n=4, k=8, equal to the mr/nr/kr values configured on the
// tester.
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, k_eq_8) {
  TEST_REQUIRES_X86_AVX;
  GemmMicrokernelTester()
      .mr(1).nr(4).kr(8).sr(1)
      .m(1).n(4).k(8)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
}
23270
// Baseline m=1, n=4, k=8 problem with cn_stride(7) set on the tester.
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, strided_cn) {
  TEST_REQUIRES_X86_AVX;
  GemmMicrokernelTester()
      .mr(1).nr(4).kr(8).sr(1)
      .m(1).n(4).k(8)
      .cn_stride(7)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
}
23284
// Baseline m=1, n=4, k=8 problem with a_stride(11) set on the tester.
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, k_eq_8_strided_a) {
  TEST_REQUIRES_X86_AVX;
  GemmMicrokernelTester()
      .mr(1).nr(4).kr(8).sr(1)
      .m(1).n(4).k(8)
      .a_stride(11)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
}
23298
// Sweeps n over [1, 4] at k=8 with iterations(1) per run; the inner m loop
// covers the single value m=1 because mr is 1.
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, k_eq_8_subtile) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t n_dim = 1; n_dim <= 4; ++n_dim) {
    for (uint32_t m_dim = 1; m_dim <= 1; ++m_dim) {
      GemmMicrokernelTester()
          .mr(1).nr(4).kr(8).sr(1)
          .m(m_dim).n(n_dim).k(8)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64,
                xnn_init_qu8_conv_minmax_fp32_sse2_params,
                xnn_qu8_requantize_fp32);
    }
  }
}
23316
// Runs n=4, k=8 with iterations(1); the m loop covers the single value m=1
// because mr is 1.
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, k_eq_8_subtile_m) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t m_dim = 1; m_dim <= 1; ++m_dim) {
    GemmMicrokernelTester()
        .mr(1).nr(4).kr(8).sr(1)
        .m(m_dim).n(4).k(8)
        .iterations(1)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64,
              xnn_init_qu8_conv_minmax_fp32_sse2_params,
              xnn_qu8_requantize_fp32);
  }
}
23332
// m = 1, k = 8; sweeps n over 1..4, with iterations(1) per case.
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, k_eq_8_subtile_n) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t cols = 1; cols <= 4; ++cols) {
    GemmMicrokernelTester()
      .mr(1).nr(4).kr(8).sr(1)
      .m(1).n(cols).k(8)
      .iterations(1)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
23348
// m = 1, n = 4; sweeps k over 1..7, i.e. every value below kr = 8.
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, k_lt_8) {
  TEST_REQUIRES_X86_AVX;
  for (size_t depth = 1; depth < 8; ++depth) {
    GemmMicrokernelTester()
      .mr(1).nr(4).kr(8).sr(1)
      .m(1).n(4).k(depth)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
23363
// m = 1, n = 4; sweeps k over 1..7 with a_stride set to 11.
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, k_lt_8_strided_a) {
  TEST_REQUIRES_X86_AVX;
  for (size_t depth = 1; depth < 8; ++depth) {
    GemmMicrokernelTester()
      .mr(1).nr(4).kr(8).sr(1)
      .m(1).n(4).k(depth)
      .a_stride(11)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
23379
// Sweeps k over 1..7, n over 1..4 and m over 1..1 (outer to inner), with iterations(1) per case.
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, k_lt_8_subtile) {
  TEST_REQUIRES_X86_AVX;
  for (size_t depth = 1; depth < 8; ++depth) {
    for (uint32_t cols = 1; cols <= 4; ++cols) {
      for (uint32_t rows = 1; rows <= 1; ++rows) {
        GemmMicrokernelTester()
          .mr(1).nr(4).kr(8).sr(1)
          .m(rows).n(cols).k(depth)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
23399
// m = 1, n = 4; sweeps k over 9..15, i.e. strictly between kr = 8 and 16.
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, k_gt_8) {
  TEST_REQUIRES_X86_AVX;
  for (size_t depth = 9; depth < 16; ++depth) {
    GemmMicrokernelTester()
      .mr(1).nr(4).kr(8).sr(1)
      .m(1).n(4).k(depth)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
23414
// m = 1, n = 4; sweeps k over 9..15 with a_stride set to 19.
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, k_gt_8_strided_a) {
  TEST_REQUIRES_X86_AVX;
  for (size_t depth = 9; depth < 16; ++depth) {
    GemmMicrokernelTester()
      .mr(1).nr(4).kr(8).sr(1)
      .m(1).n(4).k(depth)
      .a_stride(19)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
23430
// Sweeps k over 9..15, n over 1..4 and m over 1..1 (outer to inner), with iterations(1) per case.
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, k_gt_8_subtile) {
  TEST_REQUIRES_X86_AVX;
  for (size_t depth = 9; depth < 16; ++depth) {
    for (uint32_t cols = 1; cols <= 4; ++cols) {
      for (uint32_t rows = 1; rows <= 1; ++rows) {
        GemmMicrokernelTester()
          .mr(1).nr(4).kr(8).sr(1)
          .m(rows).n(cols).k(depth)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
23450
// m = 1, n = 4; sweeps k over the multiples of 8 from 16 through 80.
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, k_div_8) {
  TEST_REQUIRES_X86_AVX;
  for (size_t depth = 16; depth <= 80; depth += 8) {
    GemmMicrokernelTester()
      .mr(1).nr(4).kr(8).sr(1)
      .m(1).n(4).k(depth)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
23465
// m = 1, n = 4; sweeps k over the multiples of 8 from 16 through 80 with a_stride set to 83.
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, k_div_8_strided_a) {
  TEST_REQUIRES_X86_AVX;
  for (size_t depth = 16; depth <= 80; depth += 8) {
    GemmMicrokernelTester()
      .mr(1).nr(4).kr(8).sr(1)
      .m(1).n(4).k(depth)
      .a_stride(83)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
23481
// Sweeps k over multiples of 8 in 16..80, n over 1..4 and m over 1..1 (outer to inner),
// with iterations(1) per case.
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, k_div_8_subtile) {
  TEST_REQUIRES_X86_AVX;
  for (size_t depth = 16; depth <= 80; depth += 8) {
    for (uint32_t cols = 1; cols <= 4; ++cols) {
      for (uint32_t rows = 1; rows <= 1; ++rows) {
        GemmMicrokernelTester()
          .mr(1).nr(4).kr(8).sr(1)
          .m(rows).n(cols).k(depth)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
23501
// m = 1; sweeps n over 5..7 (outer) and k over 1, 10, ..., 37 (inner, step 9).
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, n_gt_4) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t cols = 5; cols < 8; ++cols) {
    for (size_t depth = 1; depth <= 40; depth += 9) {
      GemmMicrokernelTester()
        .mr(1).nr(4).kr(8).sr(1)
        .m(1).n(cols).k(depth)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
23518
// m = 1; sweeps n over 5..7 (outer) and k over 1, 10, ..., 37 (inner), with cn_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, n_gt_4_strided_cn) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t cols = 5; cols < 8; ++cols) {
    for (size_t depth = 1; depth <= 40; depth += 9) {
      GemmMicrokernelTester()
        .mr(1).nr(4).kr(8).sr(1)
        .m(1).n(cols).k(depth)
        .cn_stride(7)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
23536
// m = 1; sweeps n over 5..7 (outer) and k over 1, 10, ..., 37 (inner), with a_stride set to 43.
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, n_gt_4_strided_a) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t cols = 5; cols < 8; ++cols) {
    for (size_t depth = 1; depth <= 40; depth += 9) {
      GemmMicrokernelTester()
        .mr(1).nr(4).kr(8).sr(1)
        .m(1).n(cols).k(depth)
        .a_stride(43)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
23554
// Sweeps n over 5..7, k over 1, 10, ..., 37 and m over 1..1 (outer to inner),
// with iterations(1) per case.
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, n_gt_4_subtile) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t cols = 5; cols < 8; ++cols) {
    for (size_t depth = 1; depth <= 40; depth += 9) {
      for (uint32_t rows = 1; rows <= 1; ++rows) {
        GemmMicrokernelTester()
          .mr(1).nr(4).kr(8).sr(1)
          .m(rows).n(cols).k(depth)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
23574
// m = 1; sweeps n over 8 and 12 (outer) and k over 1, 10, ..., 37 (inner, step 9).
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, n_div_4) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t cols = 8; cols <= 12; cols += 4) {
    for (size_t depth = 1; depth <= 40; depth += 9) {
      GemmMicrokernelTester()
        .mr(1).nr(4).kr(8).sr(1)
        .m(1).n(cols).k(depth)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
23591
// m = 1; sweeps n over 8 and 12 (outer) and k over 1, 10, ..., 37 (inner), with cn_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, n_div_4_strided_cn) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t cols = 8; cols <= 12; cols += 4) {
    for (size_t depth = 1; depth <= 40; depth += 9) {
      GemmMicrokernelTester()
        .mr(1).nr(4).kr(8).sr(1)
        .m(1).n(cols).k(depth)
        .cn_stride(7)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
23609
// m = 1; sweeps n over 8 and 12 (outer) and k over 1, 10, ..., 37 (inner), with a_stride set to 43.
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, n_div_4_strided_a) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t cols = 8; cols <= 12; cols += 4) {
    for (size_t depth = 1; depth <= 40; depth += 9) {
      GemmMicrokernelTester()
        .mr(1).nr(4).kr(8).sr(1)
        .m(1).n(cols).k(depth)
        .a_stride(43)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
23627
// Sweeps n over 8 and 12, k over 1, 10, ..., 37 and m over 1..1 (outer to inner),
// with iterations(1) per case.
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, n_div_4_subtile) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t cols = 8; cols <= 12; cols += 4) {
    for (size_t depth = 1; depth <= 40; depth += 9) {
      for (uint32_t rows = 1; rows <= 1; ++rows) {
        GemmMicrokernelTester()
          .mr(1).nr(4).kr(8).sr(1)
          .m(rows).n(cols).k(depth)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
23647
// Sweeps k over 1, 10, ..., 37, n over 1..4 and m over 1..1 (outer to inner),
// with cm_stride set to 7 and iterations(1) per case.
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, strided_cm_subtile) {
  TEST_REQUIRES_X86_AVX;
  for (size_t depth = 1; depth <= 40; depth += 9) {
    for (uint32_t cols = 1; cols <= 4; ++cols) {
      for (uint32_t rows = 1; rows <= 1; ++rows) {
        GemmMicrokernelTester()
          .mr(1).nr(4).kr(8).sr(1)
          .m(rows).n(cols).k(depth)
          .cm_stride(7)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
23668
// m = 1, n = 4, k = 8 with qmin set to 128.
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, qmin) {
  TEST_REQUIRES_X86_AVX;
  GemmMicrokernelTester()
    .mr(1).nr(4).kr(8).sr(1)
    .m(1).n(4).k(8)
    .qmin(128)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
23682
// m = 1, n = 4, k = 8 with qmax set to 128.
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, qmax) {
  TEST_REQUIRES_X86_AVX;
  GemmMicrokernelTester()
    .mr(1).nr(4).kr(8).sr(1)
    .m(1).n(4).k(8)
    .qmax(128)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
23696
// m = 1, n = 4, k = 8 with cm_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, strided_cm) {
  TEST_REQUIRES_X86_AVX;
  GemmMicrokernelTester()
    .mr(1).nr(4).kr(8).sr(1)
    .m(1).n(4).k(8)
    .cm_stride(7)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
23710
// m = 1, n = 4; sweeps k over 1, 10, ..., 37 with a_zero_point set to 0.
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, no_a_zero_point) {
  TEST_REQUIRES_X86_AVX;
  for (size_t depth = 1; depth <= 40; depth += 9) {
    GemmMicrokernelTester()
      .mr(1).nr(4).kr(8).sr(1)
      .m(1).n(4).k(depth)
      .a_zero_point(0)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
23726
// m = 1, n = 4; sweeps k over 1, 10, ..., 37 with b_zero_point set to 0.
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, no_b_zero_point) {
  TEST_REQUIRES_X86_AVX;
  for (size_t depth = 1; depth <= 40; depth += 9) {
    GemmMicrokernelTester()
      .mr(1).nr(4).kr(8).sr(1)
      .m(1).n(4).k(depth)
      .b_zero_point(0)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
23742
// m = 1, n = 4; sweeps k over 1, 10, ..., 37 with both a_zero_point and b_zero_point set to 0.
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__AVX_LD64, no_zero_point) {
  TEST_REQUIRES_X86_AVX;
  for (size_t depth = 1; depth <= 40; depth += 9) {
    GemmMicrokernelTester()
      .mr(1).nr(4).kr(8).sr(1)
      .m(1).n(4).k(depth)
      .a_zero_point(0)
      .b_zero_point(0)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
23759 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
23760
23761
23762 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// m = 1, n = 4, k = 8: problem size equal to mr, nr and kr.
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, k_eq_8) {
  TEST_REQUIRES_X86_XOP;
  GemmMicrokernelTester()
    .mr(1).nr(4).kr(8).sr(1)
    .m(1).n(4).k(8)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
23775
// m = 1, n = 4, k = 8 (equal to mr, nr and kr) with cn_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, strided_cn) {
  TEST_REQUIRES_X86_XOP;
  GemmMicrokernelTester()
    .mr(1).nr(4).kr(8).sr(1)
    .m(1).n(4).k(8)
    .cn_stride(7)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
23789
// m = 1, n = 4, k = 8 (equal to mr, nr and kr) with a_stride set to 11.
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, k_eq_8_strided_a) {
  TEST_REQUIRES_X86_XOP;
  GemmMicrokernelTester()
    .mr(1).nr(4).kr(8).sr(1)
    .m(1).n(4).k(8)
    .a_stride(11)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
23803
// k = 8; sweeps n over 1..4 (outer) and m over 1..1 (inner), with iterations(1) per case.
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, k_eq_8_subtile) {
  TEST_REQUIRES_X86_XOP;
  for (uint32_t cols = 1; cols <= 4; ++cols) {
    for (uint32_t rows = 1; rows <= 1; ++rows) {
      GemmMicrokernelTester()
        .mr(1).nr(4).kr(8).sr(1)
        .m(rows).n(cols).k(8)
        .iterations(1)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
23821
// n = 4, k = 8; sweeps m over 1..1, with iterations(1) per case.
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, k_eq_8_subtile_m) {
  TEST_REQUIRES_X86_XOP;
  for (uint32_t rows = 1; rows <= 1; ++rows) {
    GemmMicrokernelTester()
      .mr(1).nr(4).kr(8).sr(1)
      .m(rows).n(4).k(8)
      .iterations(1)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
23837
// m = 1, k = 8; sweeps n over 1..4, with iterations(1) per case.
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, k_eq_8_subtile_n) {
  TEST_REQUIRES_X86_XOP;
  for (uint32_t cols = 1; cols <= 4; ++cols) {
    GemmMicrokernelTester()
      .mr(1).nr(4).kr(8).sr(1)
      .m(1).n(cols).k(8)
      .iterations(1)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
23853
// m = 1, n = 4; sweeps k over 1..7, i.e. every value below kr = 8.
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, k_lt_8) {
  TEST_REQUIRES_X86_XOP;
  for (size_t depth = 1; depth < 8; ++depth) {
    GemmMicrokernelTester()
      .mr(1).nr(4).kr(8).sr(1)
      .m(1).n(4).k(depth)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
23868
// m = 1, n = 4; sweeps k over 1..7 with a_stride set to 11.
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, k_lt_8_strided_a) {
  TEST_REQUIRES_X86_XOP;
  for (size_t depth = 1; depth < 8; ++depth) {
    GemmMicrokernelTester()
      .mr(1).nr(4).kr(8).sr(1)
      .m(1).n(4).k(depth)
      .a_stride(11)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
23884
// Sweeps k over 1..7, n over 1..4 and m over 1..1 (outer to inner), with iterations(1) per case.
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, k_lt_8_subtile) {
  TEST_REQUIRES_X86_XOP;
  for (size_t depth = 1; depth < 8; ++depth) {
    for (uint32_t cols = 1; cols <= 4; ++cols) {
      for (uint32_t rows = 1; rows <= 1; ++rows) {
        GemmMicrokernelTester()
          .mr(1).nr(4).kr(8).sr(1)
          .m(rows).n(cols).k(depth)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
23904
// m = 1, n = 4; sweeps k over 9..15, i.e. strictly between kr = 8 and 16.
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, k_gt_8) {
  TEST_REQUIRES_X86_XOP;
  for (size_t depth = 9; depth < 16; ++depth) {
    GemmMicrokernelTester()
      .mr(1).nr(4).kr(8).sr(1)
      .m(1).n(4).k(depth)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
23919
// m = 1, n = 4; sweeps k over 9..15 with a_stride set to 19.
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, k_gt_8_strided_a) {
  TEST_REQUIRES_X86_XOP;
  for (size_t depth = 9; depth < 16; ++depth) {
    GemmMicrokernelTester()
      .mr(1).nr(4).kr(8).sr(1)
      .m(1).n(4).k(depth)
      .a_stride(19)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
23935
// Sweeps k over 9..15, n over 1..4 and m over 1..1 (outer to inner), with iterations(1) per case.
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, k_gt_8_subtile) {
  TEST_REQUIRES_X86_XOP;
  for (size_t depth = 9; depth < 16; ++depth) {
    for (uint32_t cols = 1; cols <= 4; ++cols) {
      for (uint32_t rows = 1; rows <= 1; ++rows) {
        GemmMicrokernelTester()
          .mr(1).nr(4).kr(8).sr(1)
          .m(rows).n(cols).k(depth)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
23955
// m = 1, n = 4; sweeps k over the multiples of 8 from 16 through 80.
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, k_div_8) {
  TEST_REQUIRES_X86_XOP;
  for (size_t depth = 16; depth <= 80; depth += 8) {
    GemmMicrokernelTester()
      .mr(1).nr(4).kr(8).sr(1)
      .m(1).n(4).k(depth)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
23970
// m = 1, n = 4; sweeps k over the multiples of 8 from 16 through 80 with a_stride set to 83.
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, k_div_8_strided_a) {
  TEST_REQUIRES_X86_XOP;
  for (size_t depth = 16; depth <= 80; depth += 8) {
    GemmMicrokernelTester()
      .mr(1).nr(4).kr(8).sr(1)
      .m(1).n(4).k(depth)
      .a_stride(83)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
23986
// Sweeps k over multiples of 8 in 16..80, n over 1..4 and m over 1..1 (outer to inner),
// with iterations(1) per case.
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, k_div_8_subtile) {
  TEST_REQUIRES_X86_XOP;
  for (size_t depth = 16; depth <= 80; depth += 8) {
    for (uint32_t cols = 1; cols <= 4; ++cols) {
      for (uint32_t rows = 1; rows <= 1; ++rows) {
        GemmMicrokernelTester()
          .mr(1).nr(4).kr(8).sr(1)
          .m(rows).n(cols).k(depth)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
24006
// m = 1; sweeps n over 5..7 (outer) and k over 1, 10, ..., 37 (inner, step 9).
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, n_gt_4) {
  TEST_REQUIRES_X86_XOP;
  for (uint32_t cols = 5; cols < 8; ++cols) {
    for (size_t depth = 1; depth <= 40; depth += 9) {
      GemmMicrokernelTester()
        .mr(1).nr(4).kr(8).sr(1)
        .m(1).n(cols).k(depth)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
24023
// m = 1; sweeps n over 5..7 (outer) and k over 1, 10, ..., 37 (inner), with cn_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, n_gt_4_strided_cn) {
  TEST_REQUIRES_X86_XOP;
  for (uint32_t cols = 5; cols < 8; ++cols) {
    for (size_t depth = 1; depth <= 40; depth += 9) {
      GemmMicrokernelTester()
        .mr(1).nr(4).kr(8).sr(1)
        .m(1).n(cols).k(depth)
        .cn_stride(7)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
24041
// m = 1; sweeps n over 5..7 (outer) and k over 1, 10, ..., 37 (inner), with a_stride set to 43.
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, n_gt_4_strided_a) {
  TEST_REQUIRES_X86_XOP;
  for (uint32_t cols = 5; cols < 8; ++cols) {
    for (size_t depth = 1; depth <= 40; depth += 9) {
      GemmMicrokernelTester()
        .mr(1).nr(4).kr(8).sr(1)
        .m(1).n(cols).k(depth)
        .a_stride(43)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
24059
// Sweeps n over 5..7, k over 1, 10, ..., 37 and m over 1..1 (outer to inner),
// with iterations(1) per case.
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, n_gt_4_subtile) {
  TEST_REQUIRES_X86_XOP;
  for (uint32_t cols = 5; cols < 8; ++cols) {
    for (size_t depth = 1; depth <= 40; depth += 9) {
      for (uint32_t rows = 1; rows <= 1; ++rows) {
        GemmMicrokernelTester()
          .mr(1).nr(4).kr(8).sr(1)
          .m(rows).n(cols).k(depth)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
24079
// m = 1; sweeps n over 8 and 12 (outer) and k over 1, 10, ..., 37 (inner, step 9).
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, n_div_4) {
  TEST_REQUIRES_X86_XOP;
  for (uint32_t cols = 8; cols <= 12; cols += 4) {
    for (size_t depth = 1; depth <= 40; depth += 9) {
      GemmMicrokernelTester()
        .mr(1).nr(4).kr(8).sr(1)
        .m(1).n(cols).k(depth)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
24096
// m = 1; sweeps n over 8 and 12 (outer) and k over 1, 10, ..., 37 (inner), with cn_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, n_div_4_strided_cn) {
  TEST_REQUIRES_X86_XOP;
  for (uint32_t cols = 8; cols <= 12; cols += 4) {
    for (size_t depth = 1; depth <= 40; depth += 9) {
      GemmMicrokernelTester()
        .mr(1).nr(4).kr(8).sr(1)
        .m(1).n(cols).k(depth)
        .cn_stride(7)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
24114
// m = 1; sweeps n over 8 and 12 (outer) and k over 1, 10, ..., 37 (inner), with a_stride set to 43.
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, n_div_4_strided_a) {
  TEST_REQUIRES_X86_XOP;
  for (uint32_t cols = 8; cols <= 12; cols += 4) {
    for (size_t depth = 1; depth <= 40; depth += 9) {
      GemmMicrokernelTester()
        .mr(1).nr(4).kr(8).sr(1)
        .m(1).n(cols).k(depth)
        .a_stride(43)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
24132
// Sweeps n over 8 and 12, k over 1, 10, ..., 37 and m over 1..1 (outer to inner),
// with iterations(1) per case.
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, n_div_4_subtile) {
  TEST_REQUIRES_X86_XOP;
  for (uint32_t cols = 8; cols <= 12; cols += 4) {
    for (size_t depth = 1; depth <= 40; depth += 9) {
      for (uint32_t rows = 1; rows <= 1; ++rows) {
        GemmMicrokernelTester()
          .mr(1).nr(4).kr(8).sr(1)
          .m(rows).n(cols).k(depth)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
24152
// Sweeps k over 1, 10, ..., 37, n over 1..4 and m over 1..1 (outer to inner),
// with cm_stride set to 7 and iterations(1) per case.
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, strided_cm_subtile) {
  TEST_REQUIRES_X86_XOP;
  for (size_t depth = 1; depth <= 40; depth += 9) {
    for (uint32_t cols = 1; cols <= 4; ++cols) {
      for (uint32_t rows = 1; rows <= 1; ++rows) {
        GemmMicrokernelTester()
          .mr(1).nr(4).kr(8).sr(1)
          .m(rows).n(cols).k(depth)
          .cm_stride(7)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
24173
// m = 1, n = 4, k = 8 with qmin set to 128.
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, qmin) {
  TEST_REQUIRES_X86_XOP;
  GemmMicrokernelTester()
    .mr(1).nr(4).kr(8).sr(1)
    .m(1).n(4).k(8)
    .qmin(128)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
24187
// m = 1, n = 4, k = 8 with qmax set to 128.
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, qmax) {
  TEST_REQUIRES_X86_XOP;
  GemmMicrokernelTester()
    .mr(1).nr(4).kr(8).sr(1)
    .m(1).n(4).k(8)
    .qmax(128)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
24201
// m = 1, n = 4, k = 8 with cm_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, strided_cm) {
  TEST_REQUIRES_X86_XOP;
  GemmMicrokernelTester()
    .mr(1).nr(4).kr(8).sr(1)
    .m(1).n(4).k(8)
    .cm_stride(7)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
24215
// m = 1, n = 4; sweeps k over 1, 10, ..., 37 with a_zero_point set to 0.
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, no_a_zero_point) {
  TEST_REQUIRES_X86_XOP;
  for (size_t depth = 1; depth <= 40; depth += 9) {
    GemmMicrokernelTester()
      .mr(1).nr(4).kr(8).sr(1)
      .m(1).n(4).k(depth)
      .a_zero_point(0)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
24231
// m = 1, n = 4; sweeps k over 1, 10, ..., 37 with b_zero_point set to 0.
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, no_b_zero_point) {
  TEST_REQUIRES_X86_XOP;
  for (size_t depth = 1; depth <= 40; depth += 9) {
    GemmMicrokernelTester()
      .mr(1).nr(4).kr(8).sr(1)
      .m(1).n(4).k(depth)
      .b_zero_point(0)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
24247
// m = 1, n = 4; sweeps k over 1, 10, ..., 37 with both a_zero_point and b_zero_point set to 0.
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__XOP_LD64, no_zero_point) {
  TEST_REQUIRES_X86_XOP;
  for (size_t depth = 1; depth <= 40; depth += 9) {
    GemmMicrokernelTester()
      .mr(1).nr(4).kr(8).sr(1)
      .m(1).n(4).k(depth)
      .a_zero_point(0)
      .b_zero_point(0)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
24264 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
24265
24266
24267 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// m = 3, n = 4, k = 8: problem size equal to mr, nr and kr.
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, k_eq_8) {
  TEST_REQUIRES_X86_SSE2;
  GemmMicrokernelTester()
    .mr(3).nr(4).kr(8).sr(1)
    .m(3).n(4).k(8)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
24280
// m = 3, n = 4, k = 8 (equal to mr, nr and kr) with cn_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, strided_cn) {
  TEST_REQUIRES_X86_SSE2;
  GemmMicrokernelTester()
    .mr(3).nr(4).kr(8).sr(1)
    .m(3).n(4).k(8)
    .cn_stride(7)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
24294
// m = 3, n = 4, k = 8 (equal to mr, nr and kr) with a_stride set to 11.
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, k_eq_8_strided_a) {
  TEST_REQUIRES_X86_SSE2;
  GemmMicrokernelTester()
    .mr(3).nr(4).kr(8).sr(1)
    .m(3).n(4).k(8)
    .a_stride(11)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
24308
// k = 8; sweeps n over 1..4 (outer) and m over 1..3 (inner), with iterations(1) per case.
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, k_eq_8_subtile) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t cols = 1; cols <= 4; ++cols) {
    for (uint32_t rows = 1; rows <= 3; ++rows) {
      GemmMicrokernelTester()
        .mr(3).nr(4).kr(8).sr(1)
        .m(rows).n(cols).k(8)
        .iterations(1)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
24326
// n = 4, k = 8; sweeps m over 1..3, with iterations(1) per case.
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, k_eq_8_subtile_m) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t rows = 1; rows <= 3; ++rows) {
    GemmMicrokernelTester()
      .mr(3).nr(4).kr(8).sr(1)
      .m(rows).n(4).k(8)
      .iterations(1)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
24342
// m = 3, k = 8; sweeps n over 1..4, with iterations(1) per case.
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, k_eq_8_subtile_n) {
  TEST_REQUIRES_X86_SSE2;
  for (uint32_t cols = 1; cols <= 4; ++cols) {
    GemmMicrokernelTester()
      .mr(3).nr(4).kr(8).sr(1)
      .m(3).n(cols).k(8)
      .iterations(1)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
24358
// m = 3, n = 4; sweeps k over 1..7, i.e. every value below kr = 8.
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, k_lt_8) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t depth = 1; depth < 8; ++depth) {
    GemmMicrokernelTester()
      .mr(3).nr(4).kr(8).sr(1)
      .m(3).n(4).k(depth)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
24373
// m = 3, n = 4; sweeps k over 1..7 with a_stride set to 11.
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, k_lt_8_strided_a) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t depth = 1; depth < 8; ++depth) {
    GemmMicrokernelTester()
      .mr(3).nr(4).kr(8).sr(1)
      .m(3).n(4).k(depth)
      .a_stride(11)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
24389
// Sweeps k over 1..7, n over 1..4 and m over 1..3 (outer to inner), with iterations(1) per case.
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, k_lt_8_subtile) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t depth = 1; depth < 8; ++depth) {
    for (uint32_t cols = 1; cols <= 4; ++cols) {
      for (uint32_t rows = 1; rows <= 3; ++rows) {
        GemmMicrokernelTester()
          .mr(3).nr(4).kr(8).sr(1)
          .m(rows).n(cols).k(depth)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
24409
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, k_gt_8) {
  TEST_REQUIRES_X86_SSE2;
  // Every k in 9..15 (strictly between 8 and 16) with m = 3, n = 4.
  for (size_t kc = 9; kc < 16; ++kc) {
    GemmMicrokernelTester()
        .mr(3).nr(4).kr(8).sr(1)
        .m(3).n(4).k(kc)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128,
              xnn_init_qu8_conv_minmax_fp32_sse2_params,
              xnn_qu8_requantize_fp32);
  }
}
24424
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, k_gt_8_strided_a) {
  TEST_REQUIRES_X86_SSE2;
  // Every k in 9..15 with m = 3, n = 4 and a_stride set to 19.
  for (size_t kc = 9; kc < 16; ++kc) {
    GemmMicrokernelTester()
        .mr(3).nr(4).kr(8).sr(1)
        .m(3).n(4).k(kc)
        .a_stride(19)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128,
              xnn_init_qu8_conv_minmax_fp32_sse2_params,
              xnn_qu8_requantize_fp32);
  }
}
24440
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, k_gt_8_subtile) {
  TEST_REQUIRES_X86_SSE2;
  // All (k, n, m) with k in 9..15, n in 1..4, m in 1..3; one iteration each.
  for (size_t kc = 9; kc < 16; ++kc) {
    for (uint32_t nc = 1; nc <= 4; ++nc) {
      for (uint32_t mc = 1; mc <= 3; ++mc) {
        GemmMicrokernelTester()
            .mr(3).nr(4).kr(8).sr(1)
            .m(mc).n(nc).k(kc)
            .iterations(1)
            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128,
                  xnn_init_qu8_conv_minmax_fp32_sse2_params,
                  xnn_qu8_requantize_fp32);
      }
    }
  }
}
24460
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, k_div_8) {
  TEST_REQUIRES_X86_SSE2;
  // k takes the multiples of 8 from 16 through 80, with m = 3, n = 4.
  for (size_t kc = 16; kc <= 80; kc += 8) {
    GemmMicrokernelTester()
        .mr(3).nr(4).kr(8).sr(1)
        .m(3).n(4).k(kc)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128,
              xnn_init_qu8_conv_minmax_fp32_sse2_params,
              xnn_qu8_requantize_fp32);
  }
}
24475
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, k_div_8_strided_a) {
  TEST_REQUIRES_X86_SSE2;
  // k takes the multiples of 8 from 16 through 80, with a_stride set to 83.
  for (size_t kc = 16; kc <= 80; kc += 8) {
    GemmMicrokernelTester()
        .mr(3).nr(4).kr(8).sr(1)
        .m(3).n(4).k(kc)
        .a_stride(83)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128,
              xnn_init_qu8_conv_minmax_fp32_sse2_params,
              xnn_qu8_requantize_fp32);
  }
}
24491
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, k_div_8_subtile) {
  TEST_REQUIRES_X86_SSE2;
  // k in {16, 24, ..., 80} crossed with n in 1..4 and m in 1..3; one iteration each.
  for (size_t kc = 16; kc <= 80; kc += 8) {
    for (uint32_t nc = 1; nc <= 4; ++nc) {
      for (uint32_t mc = 1; mc <= 3; ++mc) {
        GemmMicrokernelTester()
            .mr(3).nr(4).kr(8).sr(1)
            .m(mc).n(nc).k(kc)
            .iterations(1)
            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128,
                  xnn_init_qu8_conv_minmax_fp32_sse2_params,
                  xnn_qu8_requantize_fp32);
      }
    }
  }
}
24511
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, n_gt_4) {
  TEST_REQUIRES_X86_SSE2;
  // n in 5..7 (above nr = 4) crossed with k in {1, 10, 19, 28, 37}; m = 3.
  for (uint32_t nc = 5; nc < 8; ++nc) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      GemmMicrokernelTester()
          .mr(3).nr(4).kr(8).sr(1)
          .m(3).n(nc).k(kc)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128,
                xnn_init_qu8_conv_minmax_fp32_sse2_params,
                xnn_qu8_requantize_fp32);
    }
  }
}
24528
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, n_gt_4_strided_cn) {
  TEST_REQUIRES_X86_SSE2;
  // n in 5..7 crossed with k in {1, 10, 19, 28, 37}; m = 3, cn_stride set to 7.
  for (uint32_t nc = 5; nc < 8; ++nc) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      GemmMicrokernelTester()
          .mr(3).nr(4).kr(8).sr(1)
          .m(3).n(nc).k(kc)
          .cn_stride(7)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128,
                xnn_init_qu8_conv_minmax_fp32_sse2_params,
                xnn_qu8_requantize_fp32);
    }
  }
}
24546
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, n_gt_4_strided_a) {
  TEST_REQUIRES_X86_SSE2;
  // n in 5..7 crossed with k in {1, 10, 19, 28, 37}; m = 3, a_stride set to 43.
  for (uint32_t nc = 5; nc < 8; ++nc) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      GemmMicrokernelTester()
          .mr(3).nr(4).kr(8).sr(1)
          .m(3).n(nc).k(kc)
          .a_stride(43)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128,
                xnn_init_qu8_conv_minmax_fp32_sse2_params,
                xnn_qu8_requantize_fp32);
    }
  }
}
24564
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, n_gt_4_subtile) {
  TEST_REQUIRES_X86_SSE2;
  // n in 5..7, k in {1, 10, 19, 28, 37}, m in 1..3; one iteration each.
  for (uint32_t nc = 5; nc < 8; ++nc) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      for (uint32_t mc = 1; mc <= 3; ++mc) {
        GemmMicrokernelTester()
            .mr(3).nr(4).kr(8).sr(1)
            .m(mc).n(nc).k(kc)
            .iterations(1)
            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128,
                  xnn_init_qu8_conv_minmax_fp32_sse2_params,
                  xnn_qu8_requantize_fp32);
      }
    }
  }
}
24584
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, n_div_4) {
  TEST_REQUIRES_X86_SSE2;
  // n in {8, 12} (multiples of nr = 4) crossed with k in {1, 10, 19, 28, 37}; m = 3.
  for (uint32_t nc = 8; nc <= 12; nc += 4) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      GemmMicrokernelTester()
          .mr(3).nr(4).kr(8).sr(1)
          .m(3).n(nc).k(kc)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128,
                xnn_init_qu8_conv_minmax_fp32_sse2_params,
                xnn_qu8_requantize_fp32);
    }
  }
}
24601
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, n_div_4_strided_cn) {
  TEST_REQUIRES_X86_SSE2;
  // n in {8, 12} crossed with k in {1, 10, 19, 28, 37}; m = 3, cn_stride set to 7.
  for (uint32_t nc = 8; nc <= 12; nc += 4) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      GemmMicrokernelTester()
          .mr(3).nr(4).kr(8).sr(1)
          .m(3).n(nc).k(kc)
          .cn_stride(7)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128,
                xnn_init_qu8_conv_minmax_fp32_sse2_params,
                xnn_qu8_requantize_fp32);
    }
  }
}
24619
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, n_div_4_strided_a) {
  TEST_REQUIRES_X86_SSE2;
  // n in {8, 12} crossed with k in {1, 10, 19, 28, 37}; m = 3, a_stride set to 43.
  for (uint32_t nc = 8; nc <= 12; nc += 4) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      GemmMicrokernelTester()
          .mr(3).nr(4).kr(8).sr(1)
          .m(3).n(nc).k(kc)
          .a_stride(43)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128,
                xnn_init_qu8_conv_minmax_fp32_sse2_params,
                xnn_qu8_requantize_fp32);
    }
  }
}
24637
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, n_div_4_subtile) {
  TEST_REQUIRES_X86_SSE2;
  // n in {8, 12}, k in {1, 10, 19, 28, 37}, m in 1..3; one iteration each.
  for (uint32_t nc = 8; nc <= 12; nc += 4) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      for (uint32_t mc = 1; mc <= 3; ++mc) {
        GemmMicrokernelTester()
            .mr(3).nr(4).kr(8).sr(1)
            .m(mc).n(nc).k(kc)
            .iterations(1)
            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128,
                  xnn_init_qu8_conv_minmax_fp32_sse2_params,
                  xnn_qu8_requantize_fp32);
      }
    }
  }
}
24657
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, strided_cm_subtile) {
  TEST_REQUIRES_X86_SSE2;
  // k in {1, 10, 19, 28, 37}, n in 1..4, m in 1..3, with cm_stride set to 7;
  // one iteration each.
  for (size_t kc = 1; kc <= 40; kc += 9) {
    for (uint32_t nc = 1; nc <= 4; ++nc) {
      for (uint32_t mc = 1; mc <= 3; ++mc) {
        GemmMicrokernelTester()
            .mr(3).nr(4).kr(8).sr(1)
            .m(mc).n(nc).k(kc)
            .cm_stride(7)
            .iterations(1)
            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128,
                  xnn_init_qu8_conv_minmax_fp32_sse2_params,
                  xnn_qu8_requantize_fp32);
      }
    }
  }
}
24678
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, qmin) {
  TEST_REQUIRES_X86_SSE2;
  // Single 3x4 run at k = 8 with qmin set to 128.
  GemmMicrokernelTester()
      .mr(3).nr(4).kr(8).sr(1)
      .m(3).n(4).k(8)
      .qmin(128)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
}
24692
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, qmax) {
  TEST_REQUIRES_X86_SSE2;
  // Single 3x4 run at k = 8 with qmax set to 128.
  GemmMicrokernelTester()
      .mr(3).nr(4).kr(8).sr(1)
      .m(3).n(4).k(8)
      .qmax(128)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
}
24706
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, strided_cm) {
  TEST_REQUIRES_X86_SSE2;
  // Single 3x4 run at k = 8 with cm_stride set to 7.
  GemmMicrokernelTester()
      .mr(3).nr(4).kr(8).sr(1)
      .m(3).n(4).k(8)
      .cm_stride(7)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
}
24720
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, no_a_zero_point) {
  TEST_REQUIRES_X86_SSE2;
  // k in {1, 10, 19, 28, 37} with m = 3, n = 4 and a_zero_point set to 0.
  for (size_t kc = 1; kc <= 40; kc += 9) {
    GemmMicrokernelTester()
        .mr(3).nr(4).kr(8).sr(1)
        .m(3).n(4).k(kc)
        .a_zero_point(0)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128,
              xnn_init_qu8_conv_minmax_fp32_sse2_params,
              xnn_qu8_requantize_fp32);
  }
}
24736
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, no_b_zero_point) {
  TEST_REQUIRES_X86_SSE2;
  // k in {1, 10, 19, 28, 37} with m = 3, n = 4 and b_zero_point set to 0.
  for (size_t kc = 1; kc <= 40; kc += 9) {
    GemmMicrokernelTester()
        .mr(3).nr(4).kr(8).sr(1)
        .m(3).n(4).k(kc)
        .b_zero_point(0)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128,
              xnn_init_qu8_conv_minmax_fp32_sse2_params,
              xnn_qu8_requantize_fp32);
  }
}
24752
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__SSE2_LD128, no_zero_point) {
  TEST_REQUIRES_X86_SSE2;
  // k in {1, 10, 19, 28, 37} with m = 3, n = 4 and both zero points set to 0.
  for (size_t kc = 1; kc <= 40; kc += 9) {
    GemmMicrokernelTester()
        .mr(3).nr(4).kr(8).sr(1)
        .m(3).n(4).k(kc)
        .a_zero_point(0)
        .b_zero_point(0)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128,
              xnn_init_qu8_conv_minmax_fp32_sse2_params,
              xnn_qu8_requantize_fp32);
  }
}
24769 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
24770
24771
24772 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD128, k_eq_8) {
  TEST_REQUIRES_X86_SSE41;
  // Single full-tile run: m = 3, n = 4, k = 8.
  GemmMicrokernelTester()
      .mr(3).nr(4).kr(8).sr(1)
      .m(3).n(4).k(8)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
}
24785
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD128, strided_cn) {
  TEST_REQUIRES_X86_SSE41;
  // Single full-tile run at k = 8 with cn_stride set to 7.
  GemmMicrokernelTester()
      .mr(3).nr(4).kr(8).sr(1)
      .m(3).n(4).k(8)
      .cn_stride(7)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
}
24799
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD128, k_eq_8_strided_a) {
  TEST_REQUIRES_X86_SSE41;
  // Single full-tile run at k = 8 with a_stride set to 11.
  GemmMicrokernelTester()
      .mr(3).nr(4).kr(8).sr(1)
      .m(3).n(4).k(8)
      .a_stride(11)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
}
24813
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD128, k_eq_8_subtile) {
  TEST_REQUIRES_X86_SSE41;
  // k = 8 with every (n, m) pair for n in 1..4 and m in 1..3; one iteration each.
  for (uint32_t nc = 1; nc <= 4; ++nc) {
    for (uint32_t mc = 1; mc <= 3; ++mc) {
      GemmMicrokernelTester()
          .mr(3).nr(4).kr(8).sr(1)
          .m(mc).n(nc).k(8)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128,
                xnn_init_qu8_conv_minmax_fp32_sse2_params,
                xnn_qu8_requantize_fp32);
    }
  }
}
24831
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD128, k_eq_8_subtile_m) {
  TEST_REQUIRES_X86_SSE41;
  // n = 4 and k = 8 stay fixed while m walks 1..3; one iteration per setting.
  for (uint32_t mc = 1; mc <= 3; ++mc) {
    GemmMicrokernelTester()
        .mr(3).nr(4).kr(8).sr(1)
        .m(mc).n(4).k(8)
        .iterations(1)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128,
              xnn_init_qu8_conv_minmax_fp32_sse2_params,
              xnn_qu8_requantize_fp32);
  }
}
24847
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD128, k_eq_8_subtile_n) {
  TEST_REQUIRES_X86_SSE41;
  // m = 3 and k = 8 stay fixed while n walks 1..4; one iteration per setting.
  for (uint32_t nc = 1; nc <= 4; ++nc) {
    GemmMicrokernelTester()
        .mr(3).nr(4).kr(8).sr(1)
        .m(3).n(nc).k(8)
        .iterations(1)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128,
              xnn_init_qu8_conv_minmax_fp32_sse2_params,
              xnn_qu8_requantize_fp32);
  }
}
24863
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD128, k_lt_8) {
  TEST_REQUIRES_X86_SSE41;
  // Every k in 1..7 (below kr = 8) with m = 3, n = 4.
  for (size_t kc = 1; kc < 8; ++kc) {
    GemmMicrokernelTester()
        .mr(3).nr(4).kr(8).sr(1)
        .m(3).n(4).k(kc)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128,
              xnn_init_qu8_conv_minmax_fp32_sse2_params,
              xnn_qu8_requantize_fp32);
  }
}
24878
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD128, k_lt_8_strided_a) {
  TEST_REQUIRES_X86_SSE41;
  // Every k in 1..7 with m = 3, n = 4 and a_stride set to 11.
  for (size_t kc = 1; kc < 8; ++kc) {
    GemmMicrokernelTester()
        .mr(3).nr(4).kr(8).sr(1)
        .m(3).n(4).k(kc)
        .a_stride(11)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128,
              xnn_init_qu8_conv_minmax_fp32_sse2_params,
              xnn_qu8_requantize_fp32);
  }
}
24894
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD128, k_lt_8_subtile) {
  TEST_REQUIRES_X86_SSE41;
  // All (k, n, m) with k in 1..7, n in 1..4, m in 1..3; one iteration each.
  for (size_t kc = 1; kc < 8; ++kc) {
    for (uint32_t nc = 1; nc <= 4; ++nc) {
      for (uint32_t mc = 1; mc <= 3; ++mc) {
        GemmMicrokernelTester()
            .mr(3).nr(4).kr(8).sr(1)
            .m(mc).n(nc).k(kc)
            .iterations(1)
            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128,
                  xnn_init_qu8_conv_minmax_fp32_sse2_params,
                  xnn_qu8_requantize_fp32);
      }
    }
  }
}
24914
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD128, k_gt_8) {
  TEST_REQUIRES_X86_SSE41;
  // Every k in 9..15 (strictly between 8 and 16) with m = 3, n = 4.
  for (size_t kc = 9; kc < 16; ++kc) {
    GemmMicrokernelTester()
        .mr(3).nr(4).kr(8).sr(1)
        .m(3).n(4).k(kc)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128,
              xnn_init_qu8_conv_minmax_fp32_sse2_params,
              xnn_qu8_requantize_fp32);
  }
}
24929
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD128, k_gt_8_strided_a) {
  TEST_REQUIRES_X86_SSE41;
  // Every k in 9..15 with m = 3, n = 4 and a_stride set to 19.
  for (size_t kc = 9; kc < 16; ++kc) {
    GemmMicrokernelTester()
        .mr(3).nr(4).kr(8).sr(1)
        .m(3).n(4).k(kc)
        .a_stride(19)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128,
              xnn_init_qu8_conv_minmax_fp32_sse2_params,
              xnn_qu8_requantize_fp32);
  }
}
24945
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD128, k_gt_8_subtile) {
  TEST_REQUIRES_X86_SSE41;
  // All (k, n, m) with k in 9..15, n in 1..4, m in 1..3; one iteration each.
  for (size_t kc = 9; kc < 16; ++kc) {
    for (uint32_t nc = 1; nc <= 4; ++nc) {
      for (uint32_t mc = 1; mc <= 3; ++mc) {
        GemmMicrokernelTester()
            .mr(3).nr(4).kr(8).sr(1)
            .m(mc).n(nc).k(kc)
            .iterations(1)
            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128,
                  xnn_init_qu8_conv_minmax_fp32_sse2_params,
                  xnn_qu8_requantize_fp32);
      }
    }
  }
}
24965
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD128, k_div_8) {
  TEST_REQUIRES_X86_SSE41;
  // k takes the multiples of 8 from 16 through 80, with m = 3, n = 4.
  for (size_t kc = 16; kc <= 80; kc += 8) {
    GemmMicrokernelTester()
        .mr(3).nr(4).kr(8).sr(1)
        .m(3).n(4).k(kc)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128,
              xnn_init_qu8_conv_minmax_fp32_sse2_params,
              xnn_qu8_requantize_fp32);
  }
}
24980
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD128, k_div_8_strided_a) {
  TEST_REQUIRES_X86_SSE41;
  // k takes the multiples of 8 from 16 through 80, with a_stride set to 83.
  for (size_t kc = 16; kc <= 80; kc += 8) {
    GemmMicrokernelTester()
        .mr(3).nr(4).kr(8).sr(1)
        .m(3).n(4).k(kc)
        .a_stride(83)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128,
              xnn_init_qu8_conv_minmax_fp32_sse2_params,
              xnn_qu8_requantize_fp32);
  }
}
24996
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD128, k_div_8_subtile) {
  TEST_REQUIRES_X86_SSE41;
  // k in {16, 24, ..., 80} crossed with n in 1..4 and m in 1..3; one iteration each.
  for (size_t kc = 16; kc <= 80; kc += 8) {
    for (uint32_t nc = 1; nc <= 4; ++nc) {
      for (uint32_t mc = 1; mc <= 3; ++mc) {
        GemmMicrokernelTester()
            .mr(3).nr(4).kr(8).sr(1)
            .m(mc).n(nc).k(kc)
            .iterations(1)
            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128,
                  xnn_init_qu8_conv_minmax_fp32_sse2_params,
                  xnn_qu8_requantize_fp32);
      }
    }
  }
}
25016
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD128, n_gt_4) {
  TEST_REQUIRES_X86_SSE41;
  // n in 5..7 (above nr = 4) crossed with k in {1, 10, 19, 28, 37}; m = 3.
  for (uint32_t nc = 5; nc < 8; ++nc) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      GemmMicrokernelTester()
          .mr(3).nr(4).kr(8).sr(1)
          .m(3).n(nc).k(kc)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128,
                xnn_init_qu8_conv_minmax_fp32_sse2_params,
                xnn_qu8_requantize_fp32);
    }
  }
}
25033
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD128, n_gt_4_strided_cn) {
  TEST_REQUIRES_X86_SSE41;
  // n in 5..7 crossed with k in {1, 10, 19, 28, 37}; m = 3, cn_stride set to 7.
  for (uint32_t nc = 5; nc < 8; ++nc) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      GemmMicrokernelTester()
          .mr(3).nr(4).kr(8).sr(1)
          .m(3).n(nc).k(kc)
          .cn_stride(7)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128,
                xnn_init_qu8_conv_minmax_fp32_sse2_params,
                xnn_qu8_requantize_fp32);
    }
  }
}
25051
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD128, n_gt_4_strided_a) {
  TEST_REQUIRES_X86_SSE41;
  // n in 5..7 crossed with k in {1, 10, 19, 28, 37}; m = 3, a_stride set to 43.
  for (uint32_t nc = 5; nc < 8; ++nc) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      GemmMicrokernelTester()
          .mr(3).nr(4).kr(8).sr(1)
          .m(3).n(nc).k(kc)
          .a_stride(43)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128,
                xnn_init_qu8_conv_minmax_fp32_sse2_params,
                xnn_qu8_requantize_fp32);
    }
  }
}
25069
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD128, n_gt_4_subtile) {
  TEST_REQUIRES_X86_SSE41;
  // n in 5..7, k in {1, 10, 19, 28, 37}, m in 1..3; one iteration each.
  for (uint32_t nc = 5; nc < 8; ++nc) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      for (uint32_t mc = 1; mc <= 3; ++mc) {
        GemmMicrokernelTester()
            .mr(3).nr(4).kr(8).sr(1)
            .m(mc).n(nc).k(kc)
            .iterations(1)
            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128,
                  xnn_init_qu8_conv_minmax_fp32_sse2_params,
                  xnn_qu8_requantize_fp32);
      }
    }
  }
}
25089
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD128, n_div_4) {
  TEST_REQUIRES_X86_SSE41;
  // n in {8, 12} (multiples of nr = 4) crossed with k in {1, 10, 19, 28, 37}; m = 3.
  for (uint32_t nc = 8; nc <= 12; nc += 4) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      GemmMicrokernelTester()
          .mr(3).nr(4).kr(8).sr(1)
          .m(3).n(nc).k(kc)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128,
                xnn_init_qu8_conv_minmax_fp32_sse2_params,
                xnn_qu8_requantize_fp32);
    }
  }
}
25106
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD128, n_div_4_strided_cn) {
  TEST_REQUIRES_X86_SSE41;
  // n in {8, 12} crossed with k in {1, 10, 19, 28, 37}; m = 3, cn_stride set to 7.
  for (uint32_t nc = 8; nc <= 12; nc += 4) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      GemmMicrokernelTester()
          .mr(3).nr(4).kr(8).sr(1)
          .m(3).n(nc).k(kc)
          .cn_stride(7)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128,
                xnn_init_qu8_conv_minmax_fp32_sse2_params,
                xnn_qu8_requantize_fp32);
    }
  }
}
25124
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD128, n_div_4_strided_a) {
  TEST_REQUIRES_X86_SSE41;
  // n in {8, 12} crossed with k in {1, 10, 19, 28, 37}; m = 3, a_stride set to 43.
  for (uint32_t nc = 8; nc <= 12; nc += 4) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      GemmMicrokernelTester()
          .mr(3).nr(4).kr(8).sr(1)
          .m(3).n(nc).k(kc)
          .a_stride(43)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128,
                xnn_init_qu8_conv_minmax_fp32_sse2_params,
                xnn_qu8_requantize_fp32);
    }
  }
}
25142
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD128, n_div_4_subtile) {
  TEST_REQUIRES_X86_SSE41;
  // n in {8, 12}, k in {1, 10, 19, 28, 37}, m in 1..3; one iteration each.
  for (uint32_t nc = 8; nc <= 12; nc += 4) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      for (uint32_t mc = 1; mc <= 3; ++mc) {
        GemmMicrokernelTester()
            .mr(3).nr(4).kr(8).sr(1)
            .m(mc).n(nc).k(kc)
            .iterations(1)
            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128,
                  xnn_init_qu8_conv_minmax_fp32_sse2_params,
                  xnn_qu8_requantize_fp32);
      }
    }
  }
}
25162
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD128, strided_cm_subtile) {
  TEST_REQUIRES_X86_SSE41;
  // k in {1, 10, 19, 28, 37}, n in 1..4, m in 1..3, with cm_stride set to 7;
  // one iteration each.
  for (size_t kc = 1; kc <= 40; kc += 9) {
    for (uint32_t nc = 1; nc <= 4; ++nc) {
      for (uint32_t mc = 1; mc <= 3; ++mc) {
        GemmMicrokernelTester()
            .mr(3).nr(4).kr(8).sr(1)
            .m(mc).n(nc).k(kc)
            .cm_stride(7)
            .iterations(1)
            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128,
                  xnn_init_qu8_conv_minmax_fp32_sse2_params,
                  xnn_qu8_requantize_fp32);
      }
    }
  }
}
25183
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD128, qmin) {
  TEST_REQUIRES_X86_SSE41;
  // Single 3x4 run at k = 8 with qmin set to 128.
  GemmMicrokernelTester()
      .mr(3).nr(4).kr(8).sr(1)
      .m(3).n(4).k(8)
      .qmin(128)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
}
25197
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD128, qmax) {
  TEST_REQUIRES_X86_SSE41;
  // Single 3x4 run at k = 8 with qmax set to 128.
  GemmMicrokernelTester()
      .mr(3).nr(4).kr(8).sr(1)
      .m(3).n(4).k(8)
      .qmax(128)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
}
25211
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD128, strided_cm) {
  TEST_REQUIRES_X86_SSE41;
  // Single 3x4 run at k = 8 with cm_stride set to 7.
  GemmMicrokernelTester()
      .mr(3).nr(4).kr(8).sr(1)
      .m(3).n(4).k(8)
      .cm_stride(7)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
}
25225
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD128, no_a_zero_point) {
  TEST_REQUIRES_X86_SSE41;
  // k in {1, 10, 19, 28, 37} with m = 3, n = 4 and a_zero_point set to 0.
  for (size_t kc = 1; kc <= 40; kc += 9) {
    GemmMicrokernelTester()
        .mr(3).nr(4).kr(8).sr(1)
        .m(3).n(4).k(kc)
        .a_zero_point(0)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128,
              xnn_init_qu8_conv_minmax_fp32_sse2_params,
              xnn_qu8_requantize_fp32);
  }
}
25241
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD128, no_b_zero_point) {
  TEST_REQUIRES_X86_SSE41;
  // k in {1, 10, 19, 28, 37} with m = 3, n = 4 and b_zero_point set to 0.
  for (size_t kc = 1; kc <= 40; kc += 9) {
    GemmMicrokernelTester()
        .mr(3).nr(4).kr(8).sr(1)
        .m(3).n(4).k(kc)
        .b_zero_point(0)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128,
              xnn_init_qu8_conv_minmax_fp32_sse2_params,
              xnn_qu8_requantize_fp32);
  }
}
25257
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__SSE41_LD128, no_zero_point) {
  TEST_REQUIRES_X86_SSE41;
  // k in {1, 10, 19, 28, 37} with m = 3, n = 4 and both zero points set to 0.
  for (size_t kc = 1; kc <= 40; kc += 9) {
    GemmMicrokernelTester()
        .mr(3).nr(4).kr(8).sr(1)
        .m(3).n(4).k(kc)
        .a_zero_point(0)
        .b_zero_point(0)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128,
              xnn_init_qu8_conv_minmax_fp32_sse2_params,
              xnn_qu8_requantize_fp32);
  }
}
25274 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
25275
25276
25277 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, k_eq_8) {
  TEST_REQUIRES_X86_XOP;
  // Single full-tile run: m = 1, n = 4, k = 8.
  GemmMicrokernelTester()
      .mr(1).nr(4).kr(8).sr(1)
      .m(1).n(4).k(8)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
}
25290
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, strided_cn) {
  TEST_REQUIRES_X86_XOP;
  // Single full-tile run at k = 8 with cn_stride set to 7.
  GemmMicrokernelTester()
      .mr(1).nr(4).kr(8).sr(1)
      .m(1).n(4).k(8)
      .cn_stride(7)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
}
25304
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, k_eq_8_strided_a) {
  TEST_REQUIRES_X86_XOP;
  // Single full-tile run at k = 8 with a_stride set to 11.
  GemmMicrokernelTester()
      .mr(1).nr(4).kr(8).sr(1)
      .m(1).n(4).k(8)
      .a_stride(11)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
}
25318
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, k_eq_8_subtile) {
  TEST_REQUIRES_X86_XOP;
  // k = 8 with n in 1..4; the m loop covers only m = 1 because mr = 1.
  for (uint32_t nc = 1; nc <= 4; ++nc) {
    for (uint32_t mc = 1; mc <= 1; ++mc) {
      GemmMicrokernelTester()
          .mr(1).nr(4).kr(8).sr(1)
          .m(mc).n(nc).k(8)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128,
                xnn_init_qu8_conv_minmax_fp32_sse2_params,
                xnn_qu8_requantize_fp32);
    }
  }
}
25336
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, k_eq_8_subtile_m) {
  TEST_REQUIRES_X86_XOP;
  // n = 4 and k = 8 fixed; the m loop covers only m = 1. One iteration.
  for (uint32_t mc = 1; mc <= 1; ++mc) {
    GemmMicrokernelTester()
        .mr(1).nr(4).kr(8).sr(1)
        .m(mc).n(4).k(8)
        .iterations(1)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128,
              xnn_init_qu8_conv_minmax_fp32_sse2_params,
              xnn_qu8_requantize_fp32);
  }
}
25352
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, k_eq_8_subtile_n) {
  TEST_REQUIRES_X86_XOP;
  // m = 1 and k = 8 stay fixed while n walks 1..4; one iteration per setting.
  for (uint32_t nc = 1; nc <= 4; ++nc) {
    GemmMicrokernelTester()
        .mr(1).nr(4).kr(8).sr(1)
        .m(1).n(nc).k(8)
        .iterations(1)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128,
              xnn_init_qu8_conv_minmax_fp32_sse2_params,
              xnn_qu8_requantize_fp32);
  }
}
25368
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, k_lt_8) {
  TEST_REQUIRES_X86_XOP;
  // Every k in 1..7 (below kr = 8) with m = 1, n = 4.
  for (size_t kc = 1; kc < 8; ++kc) {
    GemmMicrokernelTester()
        .mr(1).nr(4).kr(8).sr(1)
        .m(1).n(4).k(kc)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128,
              xnn_init_qu8_conv_minmax_fp32_sse2_params,
              xnn_qu8_requantize_fp32);
  }
}
25383
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, k_lt_8_strided_a) {
  TEST_REQUIRES_X86_XOP;
  // Every k in 1..7 with m = 1, n = 4 and a_stride set to 11.
  for (size_t kc = 1; kc < 8; ++kc) {
    GemmMicrokernelTester()
        .mr(1).nr(4).kr(8).sr(1)
        .m(1).n(4).k(kc)
        .a_stride(11)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128,
              xnn_init_qu8_conv_minmax_fp32_sse2_params,
              xnn_qu8_requantize_fp32);
  }
}
25399
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, k_lt_8_subtile) {
  TEST_REQUIRES_X86_XOP;
  // k in 1..7 crossed with n in 1..4; the m loop covers only m = 1.
  // One iteration each.
  for (size_t kc = 1; kc < 8; ++kc) {
    for (uint32_t nc = 1; nc <= 4; ++nc) {
      for (uint32_t mc = 1; mc <= 1; ++mc) {
        GemmMicrokernelTester()
            .mr(1).nr(4).kr(8).sr(1)
            .m(mc).n(nc).k(kc)
            .iterations(1)
            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128,
                  xnn_init_qu8_conv_minmax_fp32_sse2_params,
                  xnn_qu8_requantize_fp32);
      }
    }
  }
}
25419
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, k_gt_8) {
  TEST_REQUIRES_X86_XOP;
  // Every k in 9..15 (strictly between 8 and 16) with m = 1, n = 4.
  for (size_t kc = 9; kc < 16; ++kc) {
    GemmMicrokernelTester()
        .mr(1).nr(4).kr(8).sr(1)
        .m(1).n(4).k(kc)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128,
              xnn_init_qu8_conv_minmax_fp32_sse2_params,
              xnn_qu8_requantize_fp32);
  }
}
25434
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, k_gt_8_strided_a) {
  TEST_REQUIRES_X86_XOP;
  // Every k in 9..15 with m = 1, n = 4 and a_stride set to 19.
  for (size_t kc = 9; kc < 16; ++kc) {
    GemmMicrokernelTester()
        .mr(1).nr(4).kr(8).sr(1)
        .m(1).n(4).k(kc)
        .a_stride(19)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128,
              xnn_init_qu8_conv_minmax_fp32_sse2_params,
              xnn_qu8_requantize_fp32);
  }
}
25450
// For every k from 9 through 15, runs each (m, n) pair with m in [1, 1] and
// n in [1, 4], using a single iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, k_gt_8_subtile) {
  TEST_REQUIRES_X86_XOP;
  for (size_t k = 9; k < 16; ++k) {
    for (uint32_t n = 1; n <= 4; ++n) {
      for (uint32_t m = 1; m <= 1; ++m) {
        GemmMicrokernelTester()
          .mr(1).nr(4).kr(8).sr(1)
          .m(m).n(n).k(k)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
25470
// Runs the 1x4c8 XOP kernel for k = 16, 24, ..., 80 (multiples of kr = 8)
// with m = 1, n = 4.
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, k_div_8) {
  TEST_REQUIRES_X86_XOP;
  for (size_t k = 16; k <= 80; k += 8) {
    GemmMicrokernelTester()
      .mr(1).nr(4).kr(8).sr(1)
      .m(1).n(4).k(k)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
25485
// Runs the 1x4c8 XOP kernel for k = 16, 24, ..., 80 with m = 1, n = 4
// and a_stride set to 83.
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, k_div_8_strided_a) {
  TEST_REQUIRES_X86_XOP;
  for (size_t k = 16; k <= 80; k += 8) {
    GemmMicrokernelTester()
      .mr(1).nr(4).kr(8).sr(1)
      .m(1).n(4).k(k)
      .a_stride(83)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
25501
// For k = 16, 24, ..., 80, runs each (m, n) pair with m in [1, 1] and
// n in [1, 4], using a single iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, k_div_8_subtile) {
  TEST_REQUIRES_X86_XOP;
  for (size_t k = 16; k <= 80; k += 8) {
    for (uint32_t n = 1; n <= 4; ++n) {
      for (uint32_t m = 1; m <= 1; ++m) {
        GemmMicrokernelTester()
          .mr(1).nr(4).kr(8).sr(1)
          .m(m).n(n).k(k)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
25521
// Runs n = 5, 6, 7 (above nr = 4) against k = 1, 10, 19, 28, 37 with m = 1.
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, n_gt_4) {
  TEST_REQUIRES_X86_XOP;
  for (uint32_t n = 5; n < 8; ++n) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(1).nr(4).kr(8).sr(1)
        .m(1).n(n).k(k)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
25538
// Runs n = 5, 6, 7 against k = 1, 10, 19, 28, 37 with m = 1 and
// cn_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, n_gt_4_strided_cn) {
  TEST_REQUIRES_X86_XOP;
  for (uint32_t n = 5; n < 8; ++n) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(1).nr(4).kr(8).sr(1)
        .m(1).n(n).k(k)
        .cn_stride(7)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
25556
// Runs n = 5, 6, 7 against k = 1, 10, 19, 28, 37 with m = 1 and
// a_stride set to 43.
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, n_gt_4_strided_a) {
  TEST_REQUIRES_X86_XOP;
  for (uint32_t n = 5; n < 8; ++n) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(1).nr(4).kr(8).sr(1)
        .m(1).n(n).k(k)
        .a_stride(43)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
25574
// For n = 5, 6, 7 and k = 1, 10, 19, 28, 37, runs every m in [1, 1] with a
// single iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, n_gt_4_subtile) {
  TEST_REQUIRES_X86_XOP;
  for (uint32_t n = 5; n < 8; ++n) {
    for (size_t k = 1; k <= 40; k += 9) {
      for (uint32_t m = 1; m <= 1; ++m) {
        GemmMicrokernelTester()
          .mr(1).nr(4).kr(8).sr(1)
          .m(m).n(n).k(k)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
25594
// Runs n = 8 and n = 12 (multiples of nr = 4) against k = 1, 10, 19, 28, 37
// with m = 1.
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, n_div_4) {
  TEST_REQUIRES_X86_XOP;
  for (uint32_t n = 8; n <= 12; n += 4) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(1).nr(4).kr(8).sr(1)
        .m(1).n(n).k(k)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
25611
// Runs n = 8 and n = 12 against k = 1, 10, 19, 28, 37 with m = 1 and
// cn_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, n_div_4_strided_cn) {
  TEST_REQUIRES_X86_XOP;
  for (uint32_t n = 8; n <= 12; n += 4) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(1).nr(4).kr(8).sr(1)
        .m(1).n(n).k(k)
        .cn_stride(7)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
25629
// Runs n = 8 and n = 12 against k = 1, 10, 19, 28, 37 with m = 1 and
// a_stride set to 43.
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, n_div_4_strided_a) {
  TEST_REQUIRES_X86_XOP;
  for (uint32_t n = 8; n <= 12; n += 4) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(1).nr(4).kr(8).sr(1)
        .m(1).n(n).k(k)
        .a_stride(43)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
25647
// For n = 8, 12 and k = 1, 10, 19, 28, 37, runs every m in [1, 1] with a
// single iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, n_div_4_subtile) {
  TEST_REQUIRES_X86_XOP;
  for (uint32_t n = 8; n <= 12; n += 4) {
    for (size_t k = 1; k <= 40; k += 9) {
      for (uint32_t m = 1; m <= 1; ++m) {
        GemmMicrokernelTester()
          .mr(1).nr(4).kr(8).sr(1)
          .m(m).n(n).k(k)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
25667
// For k = 1, 10, 19, 28, 37, runs each (m, n) pair with m in [1, 1] and
// n in [1, 4], with cm_stride set to 7 and a single iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, strided_cm_subtile) {
  TEST_REQUIRES_X86_XOP;
  for (size_t k = 1; k <= 40; k += 9) {
    for (uint32_t n = 1; n <= 4; ++n) {
      for (uint32_t m = 1; m <= 1; ++m) {
        GemmMicrokernelTester()
          .mr(1).nr(4).kr(8).sr(1)
          .m(m).n(n).k(k)
          .cm_stride(7)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
25688
// Single m = 1, n = 4, k = 8 run with qmin set to 128.
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, qmin) {
  TEST_REQUIRES_X86_XOP;
  GemmMicrokernelTester()
    .mr(1).nr(4).kr(8).sr(1)
    .m(1).n(4).k(8)
    .qmin(128)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
25702
// Single m = 1, n = 4, k = 8 run with qmax set to 128.
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, qmax) {
  TEST_REQUIRES_X86_XOP;
  GemmMicrokernelTester()
    .mr(1).nr(4).kr(8).sr(1)
    .m(1).n(4).k(8)
    .qmax(128)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
25716
// Single m = 1, n = 4, k = 8 run with cm_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, strided_cm) {
  TEST_REQUIRES_X86_XOP;
  GemmMicrokernelTester()
    .mr(1).nr(4).kr(8).sr(1)
    .m(1).n(4).k(8)
    .cm_stride(7)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
25730
// Runs k = 1, 10, 19, 28, 37 with m = 1, n = 4 and a_zero_point set to 0.
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, no_a_zero_point) {
  TEST_REQUIRES_X86_XOP;
  for (size_t k = 1; k <= 40; k += 9) {
    GemmMicrokernelTester()
      .mr(1).nr(4).kr(8).sr(1)
      .m(1).n(4).k(k)
      .a_zero_point(0)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
25746
// Runs k = 1, 10, 19, 28, 37 with m = 1, n = 4 and b_zero_point set to 0.
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, no_b_zero_point) {
  TEST_REQUIRES_X86_XOP;
  for (size_t k = 1; k <= 40; k += 9) {
    GemmMicrokernelTester()
      .mr(1).nr(4).kr(8).sr(1)
      .m(1).n(4).k(k)
      .b_zero_point(0)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
25762
// Runs k = 1, 10, 19, 28, 37 with m = 1, n = 4 and both a_zero_point and
// b_zero_point set to 0.
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__XOP_LD128, no_zero_point) {
  TEST_REQUIRES_X86_XOP;
  for (size_t k = 1; k <= 40; k += 9) {
    GemmMicrokernelTester()
      .mr(1).nr(4).kr(8).sr(1)
      .m(1).n(4).k(k)
      .a_zero_point(0)
      .b_zero_point(0)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
25779 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
25780
25781
25782 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Single run of the 2x4c8 XOP kernel with m = 2, n = 4 and k = kr = 8.
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, k_eq_8) {
  TEST_REQUIRES_X86_XOP;
  GemmMicrokernelTester()
    .mr(2).nr(4).kr(8).sr(1)
    .m(2).n(4).k(8)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
25795
// Single m = 2, n = 4, k = 8 run with cn_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, strided_cn) {
  TEST_REQUIRES_X86_XOP;
  GemmMicrokernelTester()
    .mr(2).nr(4).kr(8).sr(1)
    .m(2).n(4).k(8)
    .cn_stride(7)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
25809
// Single m = 2, n = 4, k = 8 run with a_stride set to 11.
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, k_eq_8_strided_a) {
  TEST_REQUIRES_X86_XOP;
  GemmMicrokernelTester()
    .mr(2).nr(4).kr(8).sr(1)
    .m(2).n(4).k(8)
    .a_stride(11)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
25823
// With k fixed at 8, runs each (m, n) pair with m in [1, 2] and n in [1, 4],
// using a single iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, k_eq_8_subtile) {
  TEST_REQUIRES_X86_XOP;
  for (uint32_t n = 1; n <= 4; ++n) {
    for (uint32_t m = 1; m <= 2; ++m) {
      GemmMicrokernelTester()
        .mr(2).nr(4).kr(8).sr(1)
        .m(m).n(n).k(8)
        .iterations(1)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
25841
// With n = 4 and k = 8 fixed, runs every m in [1, 2] with a single iteration each.
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, k_eq_8_subtile_m) {
  TEST_REQUIRES_X86_XOP;
  for (uint32_t m = 1; m <= 2; ++m) {
    GemmMicrokernelTester()
      .mr(2).nr(4).kr(8).sr(1)
      .m(m).n(4).k(8)
      .iterations(1)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
25857
// With m = 2 and k = 8 fixed, runs every n in [1, 4] with a single iteration each.
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, k_eq_8_subtile_n) {
  TEST_REQUIRES_X86_XOP;
  for (uint32_t n = 1; n <= 4; ++n) {
    GemmMicrokernelTester()
      .mr(2).nr(4).kr(8).sr(1)
      .m(2).n(n).k(8)
      .iterations(1)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
25873
// Runs the 2x4c8 XOP kernel for every k from 1 through 7 (below kr = 8)
// with m = 2, n = 4.
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, k_lt_8) {
  TEST_REQUIRES_X86_XOP;
  for (size_t k = 1; k < 8; ++k) {
    GemmMicrokernelTester()
      .mr(2).nr(4).kr(8).sr(1)
      .m(2).n(4).k(k)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
25888
// Runs the 2x4c8 XOP kernel for every k from 1 through 7 with m = 2, n = 4
// and a_stride set to 11.
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, k_lt_8_strided_a) {
  TEST_REQUIRES_X86_XOP;
  for (size_t k = 1; k < 8; ++k) {
    GemmMicrokernelTester()
      .mr(2).nr(4).kr(8).sr(1)
      .m(2).n(4).k(k)
      .a_stride(11)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
25904
// For every k from 1 through 7, runs each (m, n) pair with m in [1, 2] and
// n in [1, 4], using a single iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, k_lt_8_subtile) {
  TEST_REQUIRES_X86_XOP;
  for (size_t k = 1; k < 8; ++k) {
    for (uint32_t n = 1; n <= 4; ++n) {
      for (uint32_t m = 1; m <= 2; ++m) {
        GemmMicrokernelTester()
          .mr(2).nr(4).kr(8).sr(1)
          .m(m).n(n).k(k)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
25924
// Runs the 2x4c8 XOP kernel for every k from 9 through 15 with m = 2, n = 4.
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, k_gt_8) {
  TEST_REQUIRES_X86_XOP;
  for (size_t k = 9; k < 16; ++k) {
    GemmMicrokernelTester()
      .mr(2).nr(4).kr(8).sr(1)
      .m(2).n(4).k(k)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
25939
// Runs the 2x4c8 XOP kernel for every k from 9 through 15 with m = 2, n = 4
// and a_stride set to 19.
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, k_gt_8_strided_a) {
  TEST_REQUIRES_X86_XOP;
  for (size_t k = 9; k < 16; ++k) {
    GemmMicrokernelTester()
      .mr(2).nr(4).kr(8).sr(1)
      .m(2).n(4).k(k)
      .a_stride(19)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
25955
// For every k from 9 through 15, runs each (m, n) pair with m in [1, 2] and
// n in [1, 4], using a single iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, k_gt_8_subtile) {
  TEST_REQUIRES_X86_XOP;
  for (size_t k = 9; k < 16; ++k) {
    for (uint32_t n = 1; n <= 4; ++n) {
      for (uint32_t m = 1; m <= 2; ++m) {
        GemmMicrokernelTester()
          .mr(2).nr(4).kr(8).sr(1)
          .m(m).n(n).k(k)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
25975
// Runs the 2x4c8 XOP kernel for k = 16, 24, ..., 80 (multiples of kr = 8)
// with m = 2, n = 4.
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, k_div_8) {
  TEST_REQUIRES_X86_XOP;
  for (size_t k = 16; k <= 80; k += 8) {
    GemmMicrokernelTester()
      .mr(2).nr(4).kr(8).sr(1)
      .m(2).n(4).k(k)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
25990
// Runs the 2x4c8 XOP kernel for k = 16, 24, ..., 80 with m = 2, n = 4
// and a_stride set to 83.
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, k_div_8_strided_a) {
  TEST_REQUIRES_X86_XOP;
  for (size_t k = 16; k <= 80; k += 8) {
    GemmMicrokernelTester()
      .mr(2).nr(4).kr(8).sr(1)
      .m(2).n(4).k(k)
      .a_stride(83)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
26006
// For k = 16, 24, ..., 80, runs each (m, n) pair with m in [1, 2] and
// n in [1, 4], using a single iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, k_div_8_subtile) {
  TEST_REQUIRES_X86_XOP;
  for (size_t k = 16; k <= 80; k += 8) {
    for (uint32_t n = 1; n <= 4; ++n) {
      for (uint32_t m = 1; m <= 2; ++m) {
        GemmMicrokernelTester()
          .mr(2).nr(4).kr(8).sr(1)
          .m(m).n(n).k(k)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
26026
// Runs n = 5, 6, 7 (above nr = 4) against k = 1, 10, 19, 28, 37 with m = 2.
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, n_gt_4) {
  TEST_REQUIRES_X86_XOP;
  for (uint32_t n = 5; n < 8; ++n) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(2).nr(4).kr(8).sr(1)
        .m(2).n(n).k(k)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
26043
// Runs n = 5, 6, 7 against k = 1, 10, 19, 28, 37 with m = 2 and
// cn_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, n_gt_4_strided_cn) {
  TEST_REQUIRES_X86_XOP;
  for (uint32_t n = 5; n < 8; ++n) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(2).nr(4).kr(8).sr(1)
        .m(2).n(n).k(k)
        .cn_stride(7)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
26061
// Runs n = 5, 6, 7 against k = 1, 10, 19, 28, 37 with m = 2 and
// a_stride set to 43.
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, n_gt_4_strided_a) {
  TEST_REQUIRES_X86_XOP;
  for (uint32_t n = 5; n < 8; ++n) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(2).nr(4).kr(8).sr(1)
        .m(2).n(n).k(k)
        .a_stride(43)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
26079
// For n = 5, 6, 7 and k = 1, 10, 19, 28, 37, runs every m in [1, 2] with a
// single iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, n_gt_4_subtile) {
  TEST_REQUIRES_X86_XOP;
  for (uint32_t n = 5; n < 8; ++n) {
    for (size_t k = 1; k <= 40; k += 9) {
      for (uint32_t m = 1; m <= 2; ++m) {
        GemmMicrokernelTester()
          .mr(2).nr(4).kr(8).sr(1)
          .m(m).n(n).k(k)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
26099
// Runs n = 8 and n = 12 (multiples of nr = 4) against k = 1, 10, 19, 28, 37
// with m = 2.
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, n_div_4) {
  TEST_REQUIRES_X86_XOP;
  for (uint32_t n = 8; n <= 12; n += 4) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(2).nr(4).kr(8).sr(1)
        .m(2).n(n).k(k)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
26116
// Runs n = 8 and n = 12 against k = 1, 10, 19, 28, 37 with m = 2 and
// cn_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, n_div_4_strided_cn) {
  TEST_REQUIRES_X86_XOP;
  for (uint32_t n = 8; n <= 12; n += 4) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(2).nr(4).kr(8).sr(1)
        .m(2).n(n).k(k)
        .cn_stride(7)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
26134
// Runs n = 8 and n = 12 against k = 1, 10, 19, 28, 37 with m = 2 and
// a_stride set to 43.
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, n_div_4_strided_a) {
  TEST_REQUIRES_X86_XOP;
  for (uint32_t n = 8; n <= 12; n += 4) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(2).nr(4).kr(8).sr(1)
        .m(2).n(n).k(k)
        .a_stride(43)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
26152
// For n = 8, 12 and k = 1, 10, 19, 28, 37, runs every m in [1, 2] with a
// single iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, n_div_4_subtile) {
  TEST_REQUIRES_X86_XOP;
  for (uint32_t n = 8; n <= 12; n += 4) {
    for (size_t k = 1; k <= 40; k += 9) {
      for (uint32_t m = 1; m <= 2; ++m) {
        GemmMicrokernelTester()
          .mr(2).nr(4).kr(8).sr(1)
          .m(m).n(n).k(k)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
26172
// For k = 1, 10, 19, 28, 37, runs each (m, n) pair with m in [1, 2] and
// n in [1, 4], with cm_stride set to 7 and a single iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, strided_cm_subtile) {
  TEST_REQUIRES_X86_XOP;
  for (size_t k = 1; k <= 40; k += 9) {
    for (uint32_t n = 1; n <= 4; ++n) {
      for (uint32_t m = 1; m <= 2; ++m) {
        GemmMicrokernelTester()
          .mr(2).nr(4).kr(8).sr(1)
          .m(m).n(n).k(k)
          .cm_stride(7)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
26193
// Single m = 2, n = 4, k = 8 run with qmin set to 128.
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, qmin) {
  TEST_REQUIRES_X86_XOP;
  GemmMicrokernelTester()
    .mr(2).nr(4).kr(8).sr(1)
    .m(2).n(4).k(8)
    .qmin(128)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
26207
// Single m = 2, n = 4, k = 8 run with qmax set to 128.
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, qmax) {
  TEST_REQUIRES_X86_XOP;
  GemmMicrokernelTester()
    .mr(2).nr(4).kr(8).sr(1)
    .m(2).n(4).k(8)
    .qmax(128)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
26221
// Single m = 2, n = 4, k = 8 run with cm_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, strided_cm) {
  TEST_REQUIRES_X86_XOP;
  GemmMicrokernelTester()
    .mr(2).nr(4).kr(8).sr(1)
    .m(2).n(4).k(8)
    .cm_stride(7)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
26235
// Runs k = 1, 10, 19, 28, 37 with m = 2, n = 4 and a_zero_point set to 0.
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, no_a_zero_point) {
  TEST_REQUIRES_X86_XOP;
  for (size_t k = 1; k <= 40; k += 9) {
    GemmMicrokernelTester()
      .mr(2).nr(4).kr(8).sr(1)
      .m(2).n(4).k(k)
      .a_zero_point(0)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
26251
// Runs k = 1, 10, 19, 28, 37 with m = 2, n = 4 and b_zero_point set to 0.
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, no_b_zero_point) {
  TEST_REQUIRES_X86_XOP;
  for (size_t k = 1; k <= 40; k += 9) {
    GemmMicrokernelTester()
      .mr(2).nr(4).kr(8).sr(1)
      .m(2).n(4).k(k)
      .b_zero_point(0)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
26267
// Runs k = 1, 10, 19, 28, 37 with m = 2, n = 4 and both a_zero_point and
// b_zero_point set to 0.
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__XOP_LD128, no_zero_point) {
  TEST_REQUIRES_X86_XOP;
  for (size_t k = 1; k <= 40; k += 9) {
    GemmMicrokernelTester()
      .mr(2).nr(4).kr(8).sr(1)
      .m(2).n(4).k(k)
      .a_zero_point(0)
      .b_zero_point(0)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
26284 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
26285
26286
26287 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Single run of the 3x4c8 AVX kernel with m = 3, n = 4 and k = kr = 8.
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, k_eq_8) {
  TEST_REQUIRES_X86_AVX;
  GemmMicrokernelTester()
    .mr(3).nr(4).kr(8).sr(1)
    .m(3).n(4).k(8)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
26300
// Single m = 3, n = 4, k = 8 run with cn_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, strided_cn) {
  TEST_REQUIRES_X86_AVX;
  GemmMicrokernelTester()
    .mr(3).nr(4).kr(8).sr(1)
    .m(3).n(4).k(8)
    .cn_stride(7)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
26314
// Single m = 3, n = 4, k = 8 run with a_stride set to 11.
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, k_eq_8_strided_a) {
  TEST_REQUIRES_X86_AVX;
  GemmMicrokernelTester()
    .mr(3).nr(4).kr(8).sr(1)
    .m(3).n(4).k(8)
    .a_stride(11)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      xnn_qu8_requantize_fp32);
}
26328
// With k fixed at 8, runs each (m, n) pair with m in [1, 3] and n in [1, 4],
// using a single iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, k_eq_8_subtile) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t n = 1; n <= 4; ++n) {
    for (uint32_t m = 1; m <= 3; ++m) {
      GemmMicrokernelTester()
        .mr(3).nr(4).kr(8).sr(1)
        .m(m).n(n).k(8)
        .iterations(1)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
26346
// With n = 4 and k = 8 fixed, runs every m in [1, 3] with a single iteration each.
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, k_eq_8_subtile_m) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t m = 1; m <= 3; ++m) {
    GemmMicrokernelTester()
      .mr(3).nr(4).kr(8).sr(1)
      .m(m).n(4).k(8)
      .iterations(1)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
26362
// With m = 3 and k = 8 fixed, runs every n in [1, 4] with a single iteration each.
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, k_eq_8_subtile_n) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t n = 1; n <= 4; ++n) {
    GemmMicrokernelTester()
      .mr(3).nr(4).kr(8).sr(1)
      .m(3).n(n).k(8)
      .iterations(1)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
26378
// Runs the 3x4c8 AVX kernel for every k from 1 through 7 (below kr = 8)
// with m = 3, n = 4.
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, k_lt_8) {
  TEST_REQUIRES_X86_AVX;
  for (size_t k = 1; k < 8; ++k) {
    GemmMicrokernelTester()
      .mr(3).nr(4).kr(8).sr(1)
      .m(3).n(4).k(k)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
26393
// Runs the 3x4c8 AVX kernel for every k from 1 through 7 with m = 3, n = 4
// and a_stride set to 11.
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, k_lt_8_strided_a) {
  TEST_REQUIRES_X86_AVX;
  for (size_t k = 1; k < 8; ++k) {
    GemmMicrokernelTester()
      .mr(3).nr(4).kr(8).sr(1)
      .m(3).n(4).k(k)
      .a_stride(11)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
26409
// For every k from 1 through 7, runs each (m, n) pair with m in [1, 3] and
// n in [1, 4], using a single iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, k_lt_8_subtile) {
  TEST_REQUIRES_X86_AVX;
  for (size_t k = 1; k < 8; ++k) {
    for (uint32_t n = 1; n <= 4; ++n) {
      for (uint32_t m = 1; m <= 3; ++m) {
        GemmMicrokernelTester()
          .mr(3).nr(4).kr(8).sr(1)
          .m(m).n(n).k(k)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
26429
// Runs the 3x4c8 AVX kernel for every k from 9 through 15 with m = 3, n = 4.
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, k_gt_8) {
  TEST_REQUIRES_X86_AVX;
  for (size_t k = 9; k < 16; ++k) {
    GemmMicrokernelTester()
      .mr(3).nr(4).kr(8).sr(1)
      .m(3).n(4).k(k)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
26444
// Runs the 3x4c8 AVX kernel for every k from 9 through 15 with m = 3, n = 4
// and a_stride set to 19.
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, k_gt_8_strided_a) {
  TEST_REQUIRES_X86_AVX;
  for (size_t k = 9; k < 16; ++k) {
    GemmMicrokernelTester()
      .mr(3).nr(4).kr(8).sr(1)
      .m(3).n(4).k(k)
      .a_stride(19)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
26460
// For every k from 9 through 15, runs each (m, n) pair with m in [1, 3] and
// n in [1, 4], using a single iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, k_gt_8_subtile) {
  TEST_REQUIRES_X86_AVX;
  for (size_t k = 9; k < 16; ++k) {
    for (uint32_t n = 1; n <= 4; ++n) {
      for (uint32_t m = 1; m <= 3; ++m) {
        GemmMicrokernelTester()
          .mr(3).nr(4).kr(8).sr(1)
          .m(m).n(n).k(k)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
26480
// Runs the 3x4c8 AVX kernel for k = 16, 24, ..., 80 (multiples of kr = 8)
// with m = 3, n = 4.
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, k_div_8) {
  TEST_REQUIRES_X86_AVX;
  for (size_t k = 16; k <= 80; k += 8) {
    GemmMicrokernelTester()
      .mr(3).nr(4).kr(8).sr(1)
      .m(3).n(4).k(k)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
26495
// Runs the 3x4c8 AVX kernel for k = 16, 24, ..., 80 with m = 3, n = 4
// and a_stride set to 83.
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, k_div_8_strided_a) {
  TEST_REQUIRES_X86_AVX;
  for (size_t k = 16; k <= 80; k += 8) {
    GemmMicrokernelTester()
      .mr(3).nr(4).kr(8).sr(1)
      .m(3).n(4).k(k)
      .a_stride(83)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128,
        xnn_init_qu8_conv_minmax_fp32_sse2_params,
        xnn_qu8_requantize_fp32);
  }
}
26511
// Every (m, n) with m in [1, 3] and n in [1, 4], for k = 16, 24, ..., 80;
// one iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, k_div_8_subtile) {
  TEST_REQUIRES_X86_AVX;
  for (size_t k_dim = 16; k_dim <= 80; k_dim += 8) {
    for (uint32_t n_dim = 1; n_dim <= 4; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 3; ++m_dim) {
        GemmMicrokernelTester()
          .mr(3).nr(4).kr(8).sr(1)
          .m(m_dim).n(n_dim).k(k_dim)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128,
                xnn_init_qu8_conv_minmax_fp32_sse2_params,
                xnn_qu8_requantize_fp32);
      }
    }
  }
}
26531
// m = 3 with n in [5, 7], for k = 1, 10, ..., 37.
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, n_gt_4) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t n_dim = 5; n_dim < 8; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      GemmMicrokernelTester()
        .mr(3).nr(4).kr(8).sr(1)
        .m(3).n(n_dim).k(k_dim)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128,
              xnn_init_qu8_conv_minmax_fp32_sse2_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
26548
// m = 3 with n in [5, 7], for k = 1, 10, ..., 37, with cn_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, n_gt_4_strided_cn) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t n_dim = 5; n_dim < 8; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      GemmMicrokernelTester()
        .mr(3).nr(4).kr(8).sr(1)
        .m(3).n(n_dim).k(k_dim)
        .cn_stride(7)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128,
              xnn_init_qu8_conv_minmax_fp32_sse2_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
26566
// m = 3 with n in [5, 7], for k = 1, 10, ..., 37, with a_stride set to 43.
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, n_gt_4_strided_a) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t n_dim = 5; n_dim < 8; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      GemmMicrokernelTester()
        .mr(3).nr(4).kr(8).sr(1)
        .m(3).n(n_dim).k(k_dim)
        .a_stride(43)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128,
              xnn_init_qu8_conv_minmax_fp32_sse2_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
26584
// n in [5, 7] and k = 1, 10, ..., 37, with every m in [1, 3];
// one iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, n_gt_4_subtile) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t n_dim = 5; n_dim < 8; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      for (uint32_t m_dim = 1; m_dim <= 3; ++m_dim) {
        GemmMicrokernelTester()
          .mr(3).nr(4).kr(8).sr(1)
          .m(m_dim).n(n_dim).k(k_dim)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128,
                xnn_init_qu8_conv_minmax_fp32_sse2_params,
                xnn_qu8_requantize_fp32);
      }
    }
  }
}
26604
// m = 3 with n = 8 and 12, for k = 1, 10, ..., 37.
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, n_div_4) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t n_dim = 8; n_dim <= 12; n_dim += 4) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      GemmMicrokernelTester()
        .mr(3).nr(4).kr(8).sr(1)
        .m(3).n(n_dim).k(k_dim)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128,
              xnn_init_qu8_conv_minmax_fp32_sse2_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
26621
// m = 3 with n = 8 and 12, for k = 1, 10, ..., 37, with cn_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, n_div_4_strided_cn) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t n_dim = 8; n_dim <= 12; n_dim += 4) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      GemmMicrokernelTester()
        .mr(3).nr(4).kr(8).sr(1)
        .m(3).n(n_dim).k(k_dim)
        .cn_stride(7)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128,
              xnn_init_qu8_conv_minmax_fp32_sse2_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
26639
// m = 3 with n = 8 and 12, for k = 1, 10, ..., 37, with a_stride set to 43.
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, n_div_4_strided_a) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t n_dim = 8; n_dim <= 12; n_dim += 4) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      GemmMicrokernelTester()
        .mr(3).nr(4).kr(8).sr(1)
        .m(3).n(n_dim).k(k_dim)
        .a_stride(43)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128,
              xnn_init_qu8_conv_minmax_fp32_sse2_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
26657
// n = 8 and 12, k = 1, 10, ..., 37, with every m in [1, 3];
// one iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, n_div_4_subtile) {
  TEST_REQUIRES_X86_AVX;
  for (uint32_t n_dim = 8; n_dim <= 12; n_dim += 4) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      for (uint32_t m_dim = 1; m_dim <= 3; ++m_dim) {
        GemmMicrokernelTester()
          .mr(3).nr(4).kr(8).sr(1)
          .m(m_dim).n(n_dim).k(k_dim)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128,
                xnn_init_qu8_conv_minmax_fp32_sse2_params,
                xnn_qu8_requantize_fp32);
      }
    }
  }
}
26677
// Every (m, n) with m in [1, 3] and n in [1, 4], for k = 1, 10, ..., 37,
// with cm_stride set to 7; one iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, strided_cm_subtile) {
  TEST_REQUIRES_X86_AVX;
  for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
    for (uint32_t n_dim = 1; n_dim <= 4; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 3; ++m_dim) {
        GemmMicrokernelTester()
          .mr(3).nr(4).kr(8).sr(1)
          .m(m_dim).n(n_dim).k(k_dim)
          .cm_stride(7)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128,
                xnn_init_qu8_conv_minmax_fp32_sse2_params,
                xnn_qu8_requantize_fp32);
      }
    }
  }
}
26698
// Full 3x4 tile at k = 8 with qmin set to 128.
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, qmin) {
  TEST_REQUIRES_X86_AVX;
  GemmMicrokernelTester()
    .mr(3).nr(4).kr(8).sr(1)
    .m(3).n(4).k(8)
    .qmin(128)
    .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
}
26712
// Full 3x4 tile at k = 8 with qmax set to 128.
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, qmax) {
  TEST_REQUIRES_X86_AVX;
  GemmMicrokernelTester()
    .mr(3).nr(4).kr(8).sr(1)
    .m(3).n(4).k(8)
    .qmax(128)
    .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
}
26726
// Full 3x4 tile at k = 8 with cm_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, strided_cm) {
  TEST_REQUIRES_X86_AVX;
  GemmMicrokernelTester()
    .mr(3).nr(4).kr(8).sr(1)
    .m(3).n(4).k(8)
    .cm_stride(7)
    .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128,
          xnn_init_qu8_conv_minmax_fp32_sse2_params,
          xnn_qu8_requantize_fp32);
}
26740
// Full 3x4 tile for k = 1, 10, ..., 37, with a_zero_point set to 0.
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, no_a_zero_point) {
  TEST_REQUIRES_X86_AVX;
  for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
    GemmMicrokernelTester()
      .mr(3).nr(4).kr(8).sr(1)
      .m(3).n(4).k(k_dim)
      .a_zero_point(0)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
  }
}
26756
// Full 3x4 tile for k = 1, 10, ..., 37, with b_zero_point set to 0.
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, no_b_zero_point) {
  TEST_REQUIRES_X86_AVX;
  for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
    GemmMicrokernelTester()
      .mr(3).nr(4).kr(8).sr(1)
      .m(3).n(4).k(k_dim)
      .b_zero_point(0)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
  }
}
26772
// Full 3x4 tile for k = 1, 10, ..., 37, with both a_zero_point and
// b_zero_point set to 0.
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__AVX_LD128, no_zero_point) {
  TEST_REQUIRES_X86_AVX;
  for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
    GemmMicrokernelTester()
      .mr(3).nr(4).kr(8).sr(1)
      .m(3).n(4).k(k_dim)
      .a_zero_point(0)
      .b_zero_point(0)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128,
            xnn_init_qu8_conv_minmax_fp32_sse2_params,
            xnn_qu8_requantize_fp32);
  }
}
26789 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
26790
26791
26792 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Full 1x8 tile at k = 8.
TEST(QU8_GEMM_MINMAX_FP32_1X8C8__AVX2, k_eq_8) {
  TEST_REQUIRES_X86_AVX2;
  GemmMicrokernelTester()
    .mr(1).nr(8).kr(8).sr(1)
    .m(1).n(8).k(8)
    .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x8c8__avx2,
          xnn_init_qu8_conv_minmax_fp32_avx2_params,
          xnn_qu8_requantize_fp32);
}
26805
// Full 1x8 tile at k = 8 with cn_stride set to 11.
TEST(QU8_GEMM_MINMAX_FP32_1X8C8__AVX2, strided_cn) {
  TEST_REQUIRES_X86_AVX2;
  GemmMicrokernelTester()
    .mr(1).nr(8).kr(8).sr(1)
    .m(1).n(8).k(8)
    .cn_stride(11)
    .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x8c8__avx2,
          xnn_init_qu8_conv_minmax_fp32_avx2_params,
          xnn_qu8_requantize_fp32);
}
26819
// Full 1x8 tile at k = 8 with a_stride set to 11.
TEST(QU8_GEMM_MINMAX_FP32_1X8C8__AVX2, k_eq_8_strided_a) {
  TEST_REQUIRES_X86_AVX2;
  GemmMicrokernelTester()
    .mr(1).nr(8).kr(8).sr(1)
    .m(1).n(8).k(8)
    .a_stride(11)
    .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x8c8__avx2,
          xnn_init_qu8_conv_minmax_fp32_avx2_params,
          xnn_qu8_requantize_fp32);
}
26833
// k = 8 with every n in [1, 8] and m in [1, 1]; one iteration per
// configuration.
TEST(QU8_GEMM_MINMAX_FP32_1X8C8__AVX2, k_eq_8_subtile) {
  TEST_REQUIRES_X86_AVX2;
  for (uint32_t n_dim = 1; n_dim <= 8; ++n_dim) {
    for (uint32_t m_dim = 1; m_dim <= 1; ++m_dim) {
      GemmMicrokernelTester()
        .mr(1).nr(8).kr(8).sr(1)
        .m(m_dim).n(n_dim).k(8)
        .iterations(1)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x8c8__avx2,
              xnn_init_qu8_conv_minmax_fp32_avx2_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
26851
// k = 8, n = 8, with m swept over [1, 1]; one iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_1X8C8__AVX2, k_eq_8_subtile_m) {
  TEST_REQUIRES_X86_AVX2;
  for (uint32_t m_dim = 1; m_dim <= 1; ++m_dim) {
    GemmMicrokernelTester()
      .mr(1).nr(8).kr(8).sr(1)
      .m(m_dim).n(8).k(8)
      .iterations(1)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x8c8__avx2,
            xnn_init_qu8_conv_minmax_fp32_avx2_params,
            xnn_qu8_requantize_fp32);
  }
}
26867
// k = 8, m = 1, with n swept over [1, 8]; one iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_1X8C8__AVX2, k_eq_8_subtile_n) {
  TEST_REQUIRES_X86_AVX2;
  for (uint32_t n_dim = 1; n_dim <= 8; ++n_dim) {
    GemmMicrokernelTester()
      .mr(1).nr(8).kr(8).sr(1)
      .m(1).n(n_dim).k(8)
      .iterations(1)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x8c8__avx2,
            xnn_init_qu8_conv_minmax_fp32_avx2_params,
            xnn_qu8_requantize_fp32);
  }
}
26883
// Full 1x8 tile for every k in [1, 7].
TEST(QU8_GEMM_MINMAX_FP32_1X8C8__AVX2, k_lt_8) {
  TEST_REQUIRES_X86_AVX2;
  for (size_t k_dim = 1; k_dim < 8; ++k_dim) {
    GemmMicrokernelTester()
      .mr(1).nr(8).kr(8).sr(1)
      .m(1).n(8).k(k_dim)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x8c8__avx2,
            xnn_init_qu8_conv_minmax_fp32_avx2_params,
            xnn_qu8_requantize_fp32);
  }
}
26898
// Full 1x8 tile for every k in [1, 7], with a_stride set to 11.
TEST(QU8_GEMM_MINMAX_FP32_1X8C8__AVX2, k_lt_8_strided_a) {
  TEST_REQUIRES_X86_AVX2;
  for (size_t k_dim = 1; k_dim < 8; ++k_dim) {
    GemmMicrokernelTester()
      .mr(1).nr(8).kr(8).sr(1)
      .m(1).n(8).k(k_dim)
      .a_stride(11)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x8c8__avx2,
            xnn_init_qu8_conv_minmax_fp32_avx2_params,
            xnn_qu8_requantize_fp32);
  }
}
26914
// Every k in [1, 7] with n in [1, 8] and m in [1, 1]; one iteration per
// configuration.
TEST(QU8_GEMM_MINMAX_FP32_1X8C8__AVX2, k_lt_8_subtile) {
  TEST_REQUIRES_X86_AVX2;
  for (size_t k_dim = 1; k_dim < 8; ++k_dim) {
    for (uint32_t n_dim = 1; n_dim <= 8; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 1; ++m_dim) {
        GemmMicrokernelTester()
          .mr(1).nr(8).kr(8).sr(1)
          .m(m_dim).n(n_dim).k(k_dim)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x8c8__avx2,
                xnn_init_qu8_conv_minmax_fp32_avx2_params,
                xnn_qu8_requantize_fp32);
      }
    }
  }
}
26934
// Full 1x8 tile for every k in [9, 15].
TEST(QU8_GEMM_MINMAX_FP32_1X8C8__AVX2, k_gt_8) {
  TEST_REQUIRES_X86_AVX2;
  for (size_t k_dim = 9; k_dim < 16; ++k_dim) {
    GemmMicrokernelTester()
      .mr(1).nr(8).kr(8).sr(1)
      .m(1).n(8).k(k_dim)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x8c8__avx2,
            xnn_init_qu8_conv_minmax_fp32_avx2_params,
            xnn_qu8_requantize_fp32);
  }
}
26949
// Full 1x8 tile for every k in [9, 15], with a_stride set to 19.
TEST(QU8_GEMM_MINMAX_FP32_1X8C8__AVX2, k_gt_8_strided_a) {
  TEST_REQUIRES_X86_AVX2;
  for (size_t k_dim = 9; k_dim < 16; ++k_dim) {
    GemmMicrokernelTester()
      .mr(1).nr(8).kr(8).sr(1)
      .m(1).n(8).k(k_dim)
      .a_stride(19)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x8c8__avx2,
            xnn_init_qu8_conv_minmax_fp32_avx2_params,
            xnn_qu8_requantize_fp32);
  }
}
26965
// Every k in [9, 15] with n in [1, 8] and m in [1, 1]; one iteration per
// configuration.
TEST(QU8_GEMM_MINMAX_FP32_1X8C8__AVX2, k_gt_8_subtile) {
  TEST_REQUIRES_X86_AVX2;
  for (size_t k_dim = 9; k_dim < 16; ++k_dim) {
    for (uint32_t n_dim = 1; n_dim <= 8; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 1; ++m_dim) {
        GemmMicrokernelTester()
          .mr(1).nr(8).kr(8).sr(1)
          .m(m_dim).n(n_dim).k(k_dim)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x8c8__avx2,
                xnn_init_qu8_conv_minmax_fp32_avx2_params,
                xnn_qu8_requantize_fp32);
      }
    }
  }
}
26985
// Full 1x8 tile for k = 16, 24, ..., 80.
TEST(QU8_GEMM_MINMAX_FP32_1X8C8__AVX2, k_div_8) {
  TEST_REQUIRES_X86_AVX2;
  for (size_t k_dim = 16; k_dim <= 80; k_dim += 8) {
    GemmMicrokernelTester()
      .mr(1).nr(8).kr(8).sr(1)
      .m(1).n(8).k(k_dim)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x8c8__avx2,
            xnn_init_qu8_conv_minmax_fp32_avx2_params,
            xnn_qu8_requantize_fp32);
  }
}
27000
// Full 1x8 tile for k = 16, 24, ..., 80, with a_stride set to 83.
TEST(QU8_GEMM_MINMAX_FP32_1X8C8__AVX2, k_div_8_strided_a) {
  TEST_REQUIRES_X86_AVX2;
  for (size_t k_dim = 16; k_dim <= 80; k_dim += 8) {
    GemmMicrokernelTester()
      .mr(1).nr(8).kr(8).sr(1)
      .m(1).n(8).k(k_dim)
      .a_stride(83)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x8c8__avx2,
            xnn_init_qu8_conv_minmax_fp32_avx2_params,
            xnn_qu8_requantize_fp32);
  }
}
27016
// k = 16, 24, ..., 80 with n in [1, 8] and m in [1, 1]; one iteration per
// configuration.
TEST(QU8_GEMM_MINMAX_FP32_1X8C8__AVX2, k_div_8_subtile) {
  TEST_REQUIRES_X86_AVX2;
  for (size_t k_dim = 16; k_dim <= 80; k_dim += 8) {
    for (uint32_t n_dim = 1; n_dim <= 8; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 1; ++m_dim) {
        GemmMicrokernelTester()
          .mr(1).nr(8).kr(8).sr(1)
          .m(m_dim).n(n_dim).k(k_dim)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x8c8__avx2,
                xnn_init_qu8_conv_minmax_fp32_avx2_params,
                xnn_qu8_requantize_fp32);
      }
    }
  }
}
27036
// m = 1 with n in [9, 15], for k = 1, 10, ..., 37.
TEST(QU8_GEMM_MINMAX_FP32_1X8C8__AVX2, n_gt_8) {
  TEST_REQUIRES_X86_AVX2;
  for (uint32_t n_dim = 9; n_dim < 16; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      GemmMicrokernelTester()
        .mr(1).nr(8).kr(8).sr(1)
        .m(1).n(n_dim).k(k_dim)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x8c8__avx2,
              xnn_init_qu8_conv_minmax_fp32_avx2_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
27053
// m = 1 with n in [9, 15], for k = 1, 10, ..., 37, with cn_stride set to 11.
TEST(QU8_GEMM_MINMAX_FP32_1X8C8__AVX2, n_gt_8_strided_cn) {
  TEST_REQUIRES_X86_AVX2;
  for (uint32_t n_dim = 9; n_dim < 16; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      GemmMicrokernelTester()
        .mr(1).nr(8).kr(8).sr(1)
        .m(1).n(n_dim).k(k_dim)
        .cn_stride(11)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x8c8__avx2,
              xnn_init_qu8_conv_minmax_fp32_avx2_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
27071
// m = 1 with n in [9, 15], for k = 1, 10, ..., 37, with a_stride set to 43.
TEST(QU8_GEMM_MINMAX_FP32_1X8C8__AVX2, n_gt_8_strided_a) {
  TEST_REQUIRES_X86_AVX2;
  for (uint32_t n_dim = 9; n_dim < 16; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      GemmMicrokernelTester()
        .mr(1).nr(8).kr(8).sr(1)
        .m(1).n(n_dim).k(k_dim)
        .a_stride(43)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x8c8__avx2,
              xnn_init_qu8_conv_minmax_fp32_avx2_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
27089
// n in [9, 15] and k = 1, 10, ..., 37, with m swept over [1, 1];
// one iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_1X8C8__AVX2, n_gt_8_subtile) {
  TEST_REQUIRES_X86_AVX2;
  for (uint32_t n_dim = 9; n_dim < 16; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      for (uint32_t m_dim = 1; m_dim <= 1; ++m_dim) {
        GemmMicrokernelTester()
          .mr(1).nr(8).kr(8).sr(1)
          .m(m_dim).n(n_dim).k(k_dim)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x8c8__avx2,
                xnn_init_qu8_conv_minmax_fp32_avx2_params,
                xnn_qu8_requantize_fp32);
      }
    }
  }
}
27109
// m = 1 with n = 16 and 24, for k = 1, 10, ..., 37.
TEST(QU8_GEMM_MINMAX_FP32_1X8C8__AVX2, n_div_8) {
  TEST_REQUIRES_X86_AVX2;
  for (uint32_t n_dim = 16; n_dim <= 24; n_dim += 8) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      GemmMicrokernelTester()
        .mr(1).nr(8).kr(8).sr(1)
        .m(1).n(n_dim).k(k_dim)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x8c8__avx2,
              xnn_init_qu8_conv_minmax_fp32_avx2_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
27126
// m = 1 with n = 16 and 24, for k = 1, 10, ..., 37, with cn_stride set to 11.
TEST(QU8_GEMM_MINMAX_FP32_1X8C8__AVX2, n_div_8_strided_cn) {
  TEST_REQUIRES_X86_AVX2;
  for (uint32_t n_dim = 16; n_dim <= 24; n_dim += 8) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      GemmMicrokernelTester()
        .mr(1).nr(8).kr(8).sr(1)
        .m(1).n(n_dim).k(k_dim)
        .cn_stride(11)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x8c8__avx2,
              xnn_init_qu8_conv_minmax_fp32_avx2_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
27144
// m = 1 with n = 16 and 24, for k = 1, 10, ..., 37, with a_stride set to 43.
TEST(QU8_GEMM_MINMAX_FP32_1X8C8__AVX2, n_div_8_strided_a) {
  TEST_REQUIRES_X86_AVX2;
  for (uint32_t n_dim = 16; n_dim <= 24; n_dim += 8) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      GemmMicrokernelTester()
        .mr(1).nr(8).kr(8).sr(1)
        .m(1).n(n_dim).k(k_dim)
        .a_stride(43)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x8c8__avx2,
              xnn_init_qu8_conv_minmax_fp32_avx2_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
27162
// n = 16 and 24, k = 1, 10, ..., 37, with m swept over [1, 1];
// one iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_1X8C8__AVX2, n_div_8_subtile) {
  TEST_REQUIRES_X86_AVX2;
  for (uint32_t n_dim = 16; n_dim <= 24; n_dim += 8) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      for (uint32_t m_dim = 1; m_dim <= 1; ++m_dim) {
        GemmMicrokernelTester()
          .mr(1).nr(8).kr(8).sr(1)
          .m(m_dim).n(n_dim).k(k_dim)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x8c8__avx2,
                xnn_init_qu8_conv_minmax_fp32_avx2_params,
                xnn_qu8_requantize_fp32);
      }
    }
  }
}
27182
// k = 1, 10, ..., 37 with n in [1, 8] and m in [1, 1], with cm_stride set
// to 11; one iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_1X8C8__AVX2, strided_cm_subtile) {
  TEST_REQUIRES_X86_AVX2;
  for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
    for (uint32_t n_dim = 1; n_dim <= 8; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 1; ++m_dim) {
        GemmMicrokernelTester()
          .mr(1).nr(8).kr(8).sr(1)
          .m(m_dim).n(n_dim).k(k_dim)
          .cm_stride(11)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x8c8__avx2,
                xnn_init_qu8_conv_minmax_fp32_avx2_params,
                xnn_qu8_requantize_fp32);
      }
    }
  }
}
27203
// Full 1x8 tile at k = 8 with qmin set to 128.
TEST(QU8_GEMM_MINMAX_FP32_1X8C8__AVX2, qmin) {
  TEST_REQUIRES_X86_AVX2;
  GemmMicrokernelTester()
    .mr(1).nr(8).kr(8).sr(1)
    .m(1).n(8).k(8)
    .qmin(128)
    .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x8c8__avx2,
          xnn_init_qu8_conv_minmax_fp32_avx2_params,
          xnn_qu8_requantize_fp32);
}
27217
// Full 1x8 tile at k = 8 with qmax set to 128.
TEST(QU8_GEMM_MINMAX_FP32_1X8C8__AVX2, qmax) {
  TEST_REQUIRES_X86_AVX2;
  GemmMicrokernelTester()
    .mr(1).nr(8).kr(8).sr(1)
    .m(1).n(8).k(8)
    .qmax(128)
    .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x8c8__avx2,
          xnn_init_qu8_conv_minmax_fp32_avx2_params,
          xnn_qu8_requantize_fp32);
}
27231
// Full 1x8 tile at k = 8 with cm_stride set to 11.
TEST(QU8_GEMM_MINMAX_FP32_1X8C8__AVX2, strided_cm) {
  TEST_REQUIRES_X86_AVX2;
  GemmMicrokernelTester()
    .mr(1).nr(8).kr(8).sr(1)
    .m(1).n(8).k(8)
    .cm_stride(11)
    .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x8c8__avx2,
          xnn_init_qu8_conv_minmax_fp32_avx2_params,
          xnn_qu8_requantize_fp32);
}
27245
// Full 1x8 tile for k = 1, 10, ..., 37, with a_zero_point set to 0.
TEST(QU8_GEMM_MINMAX_FP32_1X8C8__AVX2, no_a_zero_point) {
  TEST_REQUIRES_X86_AVX2;
  for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
    GemmMicrokernelTester()
      .mr(1).nr(8).kr(8).sr(1)
      .m(1).n(8).k(k_dim)
      .a_zero_point(0)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x8c8__avx2,
            xnn_init_qu8_conv_minmax_fp32_avx2_params,
            xnn_qu8_requantize_fp32);
  }
}
27261
// Full 1x8 tile for k = 1, 10, ..., 37, with b_zero_point set to 0.
TEST(QU8_GEMM_MINMAX_FP32_1X8C8__AVX2, no_b_zero_point) {
  TEST_REQUIRES_X86_AVX2;
  for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
    GemmMicrokernelTester()
      .mr(1).nr(8).kr(8).sr(1)
      .m(1).n(8).k(k_dim)
      .b_zero_point(0)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x8c8__avx2,
            xnn_init_qu8_conv_minmax_fp32_avx2_params,
            xnn_qu8_requantize_fp32);
  }
}
27277
// Full 1x8 tile for k = 1, 10, ..., 37, with both a_zero_point and
// b_zero_point set to 0.
TEST(QU8_GEMM_MINMAX_FP32_1X8C8__AVX2, no_zero_point) {
  TEST_REQUIRES_X86_AVX2;
  for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
    GemmMicrokernelTester()
      .mr(1).nr(8).kr(8).sr(1)
      .m(1).n(8).k(k_dim)
      .a_zero_point(0)
      .b_zero_point(0)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x8c8__avx2,
            xnn_init_qu8_conv_minmax_fp32_avx2_params,
            xnn_qu8_requantize_fp32);
  }
}
27294 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
27295
27296
27297 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Full 1x16 tile at k = 8.
TEST(QU8_GEMM_MINMAX_FP32_1X16C8__AVX512SKX, k_eq_8) {
  TEST_REQUIRES_X86_AVX512SKX;
  GemmMicrokernelTester()
    .mr(1).nr(16).kr(8).sr(1)
    .m(1).n(16).k(8)
    .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx,
          xnn_init_qu8_conv_minmax_fp32_avx512_params,
          xnn_qu8_requantize_fp32);
}
27310
// Full 1x16 tile at k = 8 with cn_stride set to 19.
TEST(QU8_GEMM_MINMAX_FP32_1X16C8__AVX512SKX, strided_cn) {
  TEST_REQUIRES_X86_AVX512SKX;
  GemmMicrokernelTester()
    .mr(1).nr(16).kr(8).sr(1)
    .m(1).n(16).k(8)
    .cn_stride(19)
    .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx,
          xnn_init_qu8_conv_minmax_fp32_avx512_params,
          xnn_qu8_requantize_fp32);
}
27324
// Full 1x16 tile at k = 8 with a_stride set to 11.
TEST(QU8_GEMM_MINMAX_FP32_1X16C8__AVX512SKX, k_eq_8_strided_a) {
  TEST_REQUIRES_X86_AVX512SKX;
  GemmMicrokernelTester()
    .mr(1).nr(16).kr(8).sr(1)
    .m(1).n(16).k(8)
    .a_stride(11)
    .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx,
          xnn_init_qu8_conv_minmax_fp32_avx512_params,
          xnn_qu8_requantize_fp32);
}
27338
// k = 8 with every n in [1, 16] and m in [1, 1]; one iteration per
// configuration.
TEST(QU8_GEMM_MINMAX_FP32_1X16C8__AVX512SKX, k_eq_8_subtile) {
  TEST_REQUIRES_X86_AVX512SKX;
  for (uint32_t n_dim = 1; n_dim <= 16; ++n_dim) {
    for (uint32_t m_dim = 1; m_dim <= 1; ++m_dim) {
      GemmMicrokernelTester()
        .mr(1).nr(16).kr(8).sr(1)
        .m(m_dim).n(n_dim).k(8)
        .iterations(1)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx,
              xnn_init_qu8_conv_minmax_fp32_avx512_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
27356
// k = 8, n = 16, with m swept over [1, 1]; one iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_1X16C8__AVX512SKX, k_eq_8_subtile_m) {
  TEST_REQUIRES_X86_AVX512SKX;
  for (uint32_t m_dim = 1; m_dim <= 1; ++m_dim) {
    GemmMicrokernelTester()
      .mr(1).nr(16).kr(8).sr(1)
      .m(m_dim).n(16).k(8)
      .iterations(1)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx,
            xnn_init_qu8_conv_minmax_fp32_avx512_params,
            xnn_qu8_requantize_fp32);
  }
}
27372
// k = 8, m = 1, with n swept over [1, 16]; one iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_1X16C8__AVX512SKX, k_eq_8_subtile_n) {
  TEST_REQUIRES_X86_AVX512SKX;
  for (uint32_t n_dim = 1; n_dim <= 16; ++n_dim) {
    GemmMicrokernelTester()
      .mr(1).nr(16).kr(8).sr(1)
      .m(1).n(n_dim).k(8)
      .iterations(1)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx,
            xnn_init_qu8_conv_minmax_fp32_avx512_params,
            xnn_qu8_requantize_fp32);
  }
}
27388
// Full 1x16 tile for every k in [1, 7].
TEST(QU8_GEMM_MINMAX_FP32_1X16C8__AVX512SKX, k_lt_8) {
  TEST_REQUIRES_X86_AVX512SKX;
  for (size_t k_dim = 1; k_dim < 8; ++k_dim) {
    GemmMicrokernelTester()
      .mr(1).nr(16).kr(8).sr(1)
      .m(1).n(16).k(k_dim)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx,
            xnn_init_qu8_conv_minmax_fp32_avx512_params,
            xnn_qu8_requantize_fp32);
  }
}
27403
// Full 1x16 tile for every k in [1, 7], with a_stride set to 11.
TEST(QU8_GEMM_MINMAX_FP32_1X16C8__AVX512SKX, k_lt_8_strided_a) {
  TEST_REQUIRES_X86_AVX512SKX;
  for (size_t k_dim = 1; k_dim < 8; ++k_dim) {
    GemmMicrokernelTester()
      .mr(1).nr(16).kr(8).sr(1)
      .m(1).n(16).k(k_dim)
      .a_stride(11)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx,
            xnn_init_qu8_conv_minmax_fp32_avx512_params,
            xnn_qu8_requantize_fp32);
  }
}
27419
// Every k in [1, 7] with n in [1, 16] and m in [1, 1]; one iteration per
// configuration.
TEST(QU8_GEMM_MINMAX_FP32_1X16C8__AVX512SKX, k_lt_8_subtile) {
  TEST_REQUIRES_X86_AVX512SKX;
  for (size_t k_dim = 1; k_dim < 8; ++k_dim) {
    for (uint32_t n_dim = 1; n_dim <= 16; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 1; ++m_dim) {
        GemmMicrokernelTester()
          .mr(1).nr(16).kr(8).sr(1)
          .m(m_dim).n(n_dim).k(k_dim)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx,
                xnn_init_qu8_conv_minmax_fp32_avx512_params,
                xnn_qu8_requantize_fp32);
      }
    }
  }
}
27439
// Full 1x16 tile for every k in [9, 15].
TEST(QU8_GEMM_MINMAX_FP32_1X16C8__AVX512SKX, k_gt_8) {
  TEST_REQUIRES_X86_AVX512SKX;
  for (size_t k_dim = 9; k_dim < 16; ++k_dim) {
    GemmMicrokernelTester()
      .mr(1).nr(16).kr(8).sr(1)
      .m(1).n(16).k(k_dim)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx,
            xnn_init_qu8_conv_minmax_fp32_avx512_params,
            xnn_qu8_requantize_fp32);
  }
}
27454
// Full 1x16 tile for every k in [9, 15], with a_stride set to 19.
TEST(QU8_GEMM_MINMAX_FP32_1X16C8__AVX512SKX, k_gt_8_strided_a) {
  TEST_REQUIRES_X86_AVX512SKX;
  for (size_t k_dim = 9; k_dim < 16; ++k_dim) {
    GemmMicrokernelTester()
      .mr(1).nr(16).kr(8).sr(1)
      .m(1).n(16).k(k_dim)
      .a_stride(19)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx,
            xnn_init_qu8_conv_minmax_fp32_avx512_params,
            xnn_qu8_requantize_fp32);
  }
}
27470
// Every k in [9, 15] with n in [1, 16] and m in [1, 1]; one iteration per
// configuration.
TEST(QU8_GEMM_MINMAX_FP32_1X16C8__AVX512SKX, k_gt_8_subtile) {
  TEST_REQUIRES_X86_AVX512SKX;
  for (size_t k_dim = 9; k_dim < 16; ++k_dim) {
    for (uint32_t n_dim = 1; n_dim <= 16; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 1; ++m_dim) {
        GemmMicrokernelTester()
          .mr(1).nr(16).kr(8).sr(1)
          .m(m_dim).n(n_dim).k(k_dim)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx,
                xnn_init_qu8_conv_minmax_fp32_avx512_params,
                xnn_qu8_requantize_fp32);
      }
    }
  }
}
27490
// Full 1x16 tile for k = 16, 24, ..., 80.
TEST(QU8_GEMM_MINMAX_FP32_1X16C8__AVX512SKX, k_div_8) {
  TEST_REQUIRES_X86_AVX512SKX;
  for (size_t k_dim = 16; k_dim <= 80; k_dim += 8) {
    GemmMicrokernelTester()
      .mr(1).nr(16).kr(8).sr(1)
      .m(1).n(16).k(k_dim)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx,
            xnn_init_qu8_conv_minmax_fp32_avx512_params,
            xnn_qu8_requantize_fp32);
  }
}
27505
// Full 1x16 tile for k = 16, 24, ..., 80, with a_stride set to 83.
TEST(QU8_GEMM_MINMAX_FP32_1X16C8__AVX512SKX, k_div_8_strided_a) {
  TEST_REQUIRES_X86_AVX512SKX;
  for (size_t k_dim = 16; k_dim <= 80; k_dim += 8) {
    GemmMicrokernelTester()
      .mr(1).nr(16).kr(8).sr(1)
      .m(1).n(16).k(k_dim)
      .a_stride(83)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx,
            xnn_init_qu8_conv_minmax_fp32_avx512_params,
            xnn_qu8_requantize_fp32);
  }
}
27521
// k = 16, 24, ..., 80 with n in [1, 16] and m in [1, 1]; one iteration per
// configuration.
TEST(QU8_GEMM_MINMAX_FP32_1X16C8__AVX512SKX, k_div_8_subtile) {
  TEST_REQUIRES_X86_AVX512SKX;
  for (size_t k_dim = 16; k_dim <= 80; k_dim += 8) {
    for (uint32_t n_dim = 1; n_dim <= 16; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 1; ++m_dim) {
        GemmMicrokernelTester()
          .mr(1).nr(16).kr(8).sr(1)
          .m(m_dim).n(n_dim).k(k_dim)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx,
                xnn_init_qu8_conv_minmax_fp32_avx512_params,
                xnn_qu8_requantize_fp32);
      }
    }
  }
}
27541
// m = 1 with n in [17, 31], for k = 1, 10, ..., 37.
TEST(QU8_GEMM_MINMAX_FP32_1X16C8__AVX512SKX, n_gt_16) {
  TEST_REQUIRES_X86_AVX512SKX;
  for (uint32_t n_dim = 17; n_dim < 32; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      GemmMicrokernelTester()
        .mr(1).nr(16).kr(8).sr(1)
        .m(1).n(n_dim).k(k_dim)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx,
              xnn_init_qu8_conv_minmax_fp32_avx512_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
27558
// m = 1 with n in [17, 31], for k = 1, 10, ..., 37, with cn_stride set to 19.
TEST(QU8_GEMM_MINMAX_FP32_1X16C8__AVX512SKX, n_gt_16_strided_cn) {
  TEST_REQUIRES_X86_AVX512SKX;
  for (uint32_t n_dim = 17; n_dim < 32; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      GemmMicrokernelTester()
        .mr(1).nr(16).kr(8).sr(1)
        .m(1).n(n_dim).k(k_dim)
        .cn_stride(19)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx,
              xnn_init_qu8_conv_minmax_fp32_avx512_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
27576
// 1x16c8 AVX512SKX kernel: n in [17, 31], k = 1, 10, 19, 28, 37, m = 1, with a_stride(43).
TEST(QU8_GEMM_MINMAX_FP32_1X16C8__AVX512SKX, n_gt_16_strided_a) {
  TEST_REQUIRES_X86_AVX512SKX;
  for (uint32_t nc = 17; nc < 32; ++nc) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      // A fresh tester is configured for every (n, k) pair.
      auto tester = GemmMicrokernelTester();
      tester.mr(1).nr(16).kr(8).sr(1);
      tester.m(1).n(nc).k(kc);
      tester.a_stride(43);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx,
          xnn_init_qu8_conv_minmax_fp32_avx512_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
27594
// 1x16c8 AVX512SKX kernel: n in [17, 31], k = 1, 10, 19, 28, 37, m in [1, 1], iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_1X16C8__AVX512SKX, n_gt_16_subtile) {
  TEST_REQUIRES_X86_AVX512SKX;
  for (uint32_t nc = 17; nc < 32; ++nc) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      for (uint32_t mc = 1; mc <= 1; ++mc) {
        // A fresh tester is configured for every (n, k, m) combination.
        auto tester = GemmMicrokernelTester();
        tester.mr(1).nr(16).kr(8).sr(1);
        tester.m(mc).n(nc).k(kc);
        tester.iterations(1);
        tester.Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx,
            xnn_init_qu8_conv_minmax_fp32_avx512_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
27614
// 1x16c8 AVX512SKX kernel: n = 32 and 48 combined with k = 1, 10, 19, 28, 37 and m = 1.
TEST(QU8_GEMM_MINMAX_FP32_1X16C8__AVX512SKX, n_div_16) {
  TEST_REQUIRES_X86_AVX512SKX;
  for (uint32_t nc = 32; nc <= 48; nc += 16) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      // A fresh tester is configured for every (n, k) pair.
      auto tester = GemmMicrokernelTester();
      tester.mr(1).nr(16).kr(8).sr(1);
      tester.m(1).n(nc).k(kc);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx,
          xnn_init_qu8_conv_minmax_fp32_avx512_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
27631
// 1x16c8 AVX512SKX kernel: n = 32 and 48, k = 1, 10, 19, 28, 37, m = 1, with cn_stride(19).
TEST(QU8_GEMM_MINMAX_FP32_1X16C8__AVX512SKX, n_div_16_strided_cn) {
  TEST_REQUIRES_X86_AVX512SKX;
  for (uint32_t nc = 32; nc <= 48; nc += 16) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      // A fresh tester is configured for every (n, k) pair.
      auto tester = GemmMicrokernelTester();
      tester.mr(1).nr(16).kr(8).sr(1);
      tester.m(1).n(nc).k(kc);
      tester.cn_stride(19);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx,
          xnn_init_qu8_conv_minmax_fp32_avx512_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
27649
// 1x16c8 AVX512SKX kernel: n = 32 and 48, k = 1, 10, 19, 28, 37, m = 1, with a_stride(43).
TEST(QU8_GEMM_MINMAX_FP32_1X16C8__AVX512SKX, n_div_16_strided_a) {
  TEST_REQUIRES_X86_AVX512SKX;
  for (uint32_t nc = 32; nc <= 48; nc += 16) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      // A fresh tester is configured for every (n, k) pair.
      auto tester = GemmMicrokernelTester();
      tester.mr(1).nr(16).kr(8).sr(1);
      tester.m(1).n(nc).k(kc);
      tester.a_stride(43);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx,
          xnn_init_qu8_conv_minmax_fp32_avx512_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
27667
// 1x16c8 AVX512SKX kernel: n = 32 and 48, k = 1, 10, 19, 28, 37, m in [1, 1], iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_1X16C8__AVX512SKX, n_div_16_subtile) {
  TEST_REQUIRES_X86_AVX512SKX;
  for (uint32_t nc = 32; nc <= 48; nc += 16) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      for (uint32_t mc = 1; mc <= 1; ++mc) {
        // A fresh tester is configured for every (n, k, m) combination.
        auto tester = GemmMicrokernelTester();
        tester.mr(1).nr(16).kr(8).sr(1);
        tester.m(mc).n(nc).k(kc);
        tester.iterations(1);
        tester.Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx,
            xnn_init_qu8_conv_minmax_fp32_avx512_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
27687
// 1x16c8 AVX512SKX kernel: k = 1, 10, 19, 28, 37, n in [1, 16], m in [1, 1], with cm_stride(19) and iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_1X16C8__AVX512SKX, strided_cm_subtile) {
  TEST_REQUIRES_X86_AVX512SKX;
  for (size_t kc = 1; kc <= 40; kc += 9) {
    for (uint32_t nc = 1; nc <= 16; ++nc) {
      for (uint32_t mc = 1; mc <= 1; ++mc) {
        // A fresh tester is configured for every (k, n, m) combination.
        auto tester = GemmMicrokernelTester();
        tester.mr(1).nr(16).kr(8).sr(1);
        tester.m(mc).n(nc).k(kc);
        tester.cm_stride(19).iterations(1);
        tester.Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx,
            xnn_init_qu8_conv_minmax_fp32_avx512_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
27708
// 1x16c8 AVX512SKX kernel: single 1x16 tile with k = 8 and qmin(128).
TEST(QU8_GEMM_MINMAX_FP32_1X16C8__AVX512SKX, qmin) {
  TEST_REQUIRES_X86_AVX512SKX;
  auto tester = GemmMicrokernelTester();
  tester.mr(1).nr(16).kr(8).sr(1);
  tester.m(1).n(16).k(8);
  tester.qmin(128);
  tester.Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx,
      xnn_init_qu8_conv_minmax_fp32_avx512_params,
      xnn_qu8_requantize_fp32);
}
27722
// 1x16c8 AVX512SKX kernel: single 1x16 tile with k = 8 and qmax(128).
TEST(QU8_GEMM_MINMAX_FP32_1X16C8__AVX512SKX, qmax) {
  TEST_REQUIRES_X86_AVX512SKX;
  auto tester = GemmMicrokernelTester();
  tester.mr(1).nr(16).kr(8).sr(1);
  tester.m(1).n(16).k(8);
  tester.qmax(128);
  tester.Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx,
      xnn_init_qu8_conv_minmax_fp32_avx512_params,
      xnn_qu8_requantize_fp32);
}
27736
// 1x16c8 AVX512SKX kernel: single 1x16 tile with k = 8 and cm_stride(19).
TEST(QU8_GEMM_MINMAX_FP32_1X16C8__AVX512SKX, strided_cm) {
  TEST_REQUIRES_X86_AVX512SKX;
  auto tester = GemmMicrokernelTester();
  tester.mr(1).nr(16).kr(8).sr(1);
  tester.m(1).n(16).k(8);
  tester.cm_stride(19);
  tester.Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx,
      xnn_init_qu8_conv_minmax_fp32_avx512_params,
      xnn_qu8_requantize_fp32);
}
27750
// 1x16c8 AVX512SKX kernel: k = 1, 10, 19, 28, 37 on a 1x16 tile with a_zero_point(0).
TEST(QU8_GEMM_MINMAX_FP32_1X16C8__AVX512SKX, no_a_zero_point) {
  TEST_REQUIRES_X86_AVX512SKX;
  for (size_t kc = 1; kc <= 40; kc += 9) {
    // A fresh tester is configured for every k value.
    auto tester = GemmMicrokernelTester();
    tester.mr(1).nr(16).kr(8).sr(1);
    tester.m(1).n(16).k(kc);
    tester.a_zero_point(0);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx,
        xnn_init_qu8_conv_minmax_fp32_avx512_params,
        xnn_qu8_requantize_fp32);
  }
}
27766
// 1x16c8 AVX512SKX kernel: k = 1, 10, 19, 28, 37 on a 1x16 tile with b_zero_point(0).
TEST(QU8_GEMM_MINMAX_FP32_1X16C8__AVX512SKX, no_b_zero_point) {
  TEST_REQUIRES_X86_AVX512SKX;
  for (size_t kc = 1; kc <= 40; kc += 9) {
    // A fresh tester is configured for every k value.
    auto tester = GemmMicrokernelTester();
    tester.mr(1).nr(16).kr(8).sr(1);
    tester.m(1).n(16).k(kc);
    tester.b_zero_point(0);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx,
        xnn_init_qu8_conv_minmax_fp32_avx512_params,
        xnn_qu8_requantize_fp32);
  }
}
27782
// 1x16c8 AVX512SKX kernel: k = 1, 10, 19, 28, 37 on a 1x16 tile with a_zero_point(0) and b_zero_point(0).
TEST(QU8_GEMM_MINMAX_FP32_1X16C8__AVX512SKX, no_zero_point) {
  TEST_REQUIRES_X86_AVX512SKX;
  for (size_t kc = 1; kc <= 40; kc += 9) {
    // A fresh tester is configured for every k value.
    auto tester = GemmMicrokernelTester();
    tester.mr(1).nr(16).kr(8).sr(1);
    tester.m(1).n(16).k(kc);
    tester.a_zero_point(0).b_zero_point(0);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx,
        xnn_init_qu8_conv_minmax_fp32_avx512_params,
        xnn_qu8_requantize_fp32);
  }
}
27799 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
27800
27801
27802 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// 2x16c8 AVX512SKX kernel: single 2x16 tile with k = 8.
TEST(QU8_GEMM_MINMAX_FP32_2X16C8__AVX512SKX, k_eq_8) {
  TEST_REQUIRES_X86_AVX512SKX;
  auto tester = GemmMicrokernelTester();
  tester.mr(2).nr(16).kr(8).sr(1);
  tester.m(2).n(16).k(8);
  tester.Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_2x16c8__avx512skx,
      xnn_init_qu8_conv_minmax_fp32_avx512_params,
      xnn_qu8_requantize_fp32);
}
27815
// 2x16c8 AVX512SKX kernel: single 2x16 tile with k = 8 and cn_stride(19).
TEST(QU8_GEMM_MINMAX_FP32_2X16C8__AVX512SKX, strided_cn) {
  TEST_REQUIRES_X86_AVX512SKX;
  auto tester = GemmMicrokernelTester();
  tester.mr(2).nr(16).kr(8).sr(1);
  tester.m(2).n(16).k(8);
  tester.cn_stride(19);
  tester.Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_2x16c8__avx512skx,
      xnn_init_qu8_conv_minmax_fp32_avx512_params,
      xnn_qu8_requantize_fp32);
}
27829
// 2x16c8 AVX512SKX kernel: single 2x16 tile with k = 8 and a_stride(11).
TEST(QU8_GEMM_MINMAX_FP32_2X16C8__AVX512SKX, k_eq_8_strided_a) {
  TEST_REQUIRES_X86_AVX512SKX;
  auto tester = GemmMicrokernelTester();
  tester.mr(2).nr(16).kr(8).sr(1);
  tester.m(2).n(16).k(8);
  tester.a_stride(11);
  tester.Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_2x16c8__avx512skx,
      xnn_init_qu8_conv_minmax_fp32_avx512_params,
      xnn_qu8_requantize_fp32);
}
27843
// 2x16c8 AVX512SKX kernel: k = 8 for every n in [1, 16] and m in [1, 2], iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_2X16C8__AVX512SKX, k_eq_8_subtile) {
  TEST_REQUIRES_X86_AVX512SKX;
  for (uint32_t nc = 1; nc <= 16; ++nc) {
    for (uint32_t mc = 1; mc <= 2; ++mc) {
      // A fresh tester is configured for every (n, m) pair.
      auto tester = GemmMicrokernelTester();
      tester.mr(2).nr(16).kr(8).sr(1);
      tester.m(mc).n(nc).k(8);
      tester.iterations(1);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_2x16c8__avx512skx,
          xnn_init_qu8_conv_minmax_fp32_avx512_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
27861
// 2x16c8 AVX512SKX kernel: k = 8, n = 16 for every m in [1, 2], iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_2X16C8__AVX512SKX, k_eq_8_subtile_m) {
  TEST_REQUIRES_X86_AVX512SKX;
  for (uint32_t mc = 1; mc <= 2; ++mc) {
    // A fresh tester is configured for every m value.
    auto tester = GemmMicrokernelTester();
    tester.mr(2).nr(16).kr(8).sr(1);
    tester.m(mc).n(16).k(8);
    tester.iterations(1);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x16c8__avx512skx,
        xnn_init_qu8_conv_minmax_fp32_avx512_params,
        xnn_qu8_requantize_fp32);
  }
}
27877
// 2x16c8 AVX512SKX kernel: k = 8, m = 2 for every n in [1, 16], iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_2X16C8__AVX512SKX, k_eq_8_subtile_n) {
  TEST_REQUIRES_X86_AVX512SKX;
  for (uint32_t nc = 1; nc <= 16; ++nc) {
    // A fresh tester is configured for every n value.
    auto tester = GemmMicrokernelTester();
    tester.mr(2).nr(16).kr(8).sr(1);
    tester.m(2).n(nc).k(8);
    tester.iterations(1);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x16c8__avx512skx,
        xnn_init_qu8_conv_minmax_fp32_avx512_params,
        xnn_qu8_requantize_fp32);
  }
}
27893
// 2x16c8 AVX512SKX kernel: k in [1, 7] on a 2x16 tile.
TEST(QU8_GEMM_MINMAX_FP32_2X16C8__AVX512SKX, k_lt_8) {
  TEST_REQUIRES_X86_AVX512SKX;
  for (size_t kc = 1; kc < 8; ++kc) {
    // A fresh tester is configured for every k value.
    auto tester = GemmMicrokernelTester();
    tester.mr(2).nr(16).kr(8).sr(1);
    tester.m(2).n(16).k(kc);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x16c8__avx512skx,
        xnn_init_qu8_conv_minmax_fp32_avx512_params,
        xnn_qu8_requantize_fp32);
  }
}
27908
// 2x16c8 AVX512SKX kernel: k in [1, 7] on a 2x16 tile with a_stride(11).
TEST(QU8_GEMM_MINMAX_FP32_2X16C8__AVX512SKX, k_lt_8_strided_a) {
  TEST_REQUIRES_X86_AVX512SKX;
  for (size_t kc = 1; kc < 8; ++kc) {
    // A fresh tester is configured for every k value.
    auto tester = GemmMicrokernelTester();
    tester.mr(2).nr(16).kr(8).sr(1);
    tester.m(2).n(16).k(kc);
    tester.a_stride(11);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x16c8__avx512skx,
        xnn_init_qu8_conv_minmax_fp32_avx512_params,
        xnn_qu8_requantize_fp32);
  }
}
27924
// 2x16c8 AVX512SKX kernel: k in [1, 7] for every n in [1, 16] and m in [1, 2], iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_2X16C8__AVX512SKX, k_lt_8_subtile) {
  TEST_REQUIRES_X86_AVX512SKX;
  for (size_t kc = 1; kc < 8; ++kc) {
    for (uint32_t nc = 1; nc <= 16; ++nc) {
      for (uint32_t mc = 1; mc <= 2; ++mc) {
        // A fresh tester is configured for every (k, n, m) combination.
        auto tester = GemmMicrokernelTester();
        tester.mr(2).nr(16).kr(8).sr(1);
        tester.m(mc).n(nc).k(kc);
        tester.iterations(1);
        tester.Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_2x16c8__avx512skx,
            xnn_init_qu8_conv_minmax_fp32_avx512_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
27944
// 2x16c8 AVX512SKX kernel: k in [9, 15] on a 2x16 tile.
TEST(QU8_GEMM_MINMAX_FP32_2X16C8__AVX512SKX, k_gt_8) {
  TEST_REQUIRES_X86_AVX512SKX;
  for (size_t kc = 9; kc < 16; ++kc) {
    // A fresh tester is configured for every k value.
    auto tester = GemmMicrokernelTester();
    tester.mr(2).nr(16).kr(8).sr(1);
    tester.m(2).n(16).k(kc);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x16c8__avx512skx,
        xnn_init_qu8_conv_minmax_fp32_avx512_params,
        xnn_qu8_requantize_fp32);
  }
}
27959
// 2x16c8 AVX512SKX kernel: k in [9, 15] on a 2x16 tile with a_stride(19).
TEST(QU8_GEMM_MINMAX_FP32_2X16C8__AVX512SKX, k_gt_8_strided_a) {
  TEST_REQUIRES_X86_AVX512SKX;
  for (size_t kc = 9; kc < 16; ++kc) {
    // A fresh tester is configured for every k value.
    auto tester = GemmMicrokernelTester();
    tester.mr(2).nr(16).kr(8).sr(1);
    tester.m(2).n(16).k(kc);
    tester.a_stride(19);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x16c8__avx512skx,
        xnn_init_qu8_conv_minmax_fp32_avx512_params,
        xnn_qu8_requantize_fp32);
  }
}
27975
// 2x16c8 AVX512SKX kernel: k in [9, 15] for every n in [1, 16] and m in [1, 2], iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_2X16C8__AVX512SKX, k_gt_8_subtile) {
  TEST_REQUIRES_X86_AVX512SKX;
  for (size_t kc = 9; kc < 16; ++kc) {
    for (uint32_t nc = 1; nc <= 16; ++nc) {
      for (uint32_t mc = 1; mc <= 2; ++mc) {
        // A fresh tester is configured for every (k, n, m) combination.
        auto tester = GemmMicrokernelTester();
        tester.mr(2).nr(16).kr(8).sr(1);
        tester.m(mc).n(nc).k(kc);
        tester.iterations(1);
        tester.Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_2x16c8__avx512skx,
            xnn_init_qu8_conv_minmax_fp32_avx512_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
27995
// 2x16c8 AVX512SKX kernel: k = 16, 24, ..., 80 on a 2x16 tile.
TEST(QU8_GEMM_MINMAX_FP32_2X16C8__AVX512SKX, k_div_8) {
  TEST_REQUIRES_X86_AVX512SKX;
  for (size_t kc = 16; kc <= 80; kc += 8) {
    // A fresh tester is configured for every k value.
    auto tester = GemmMicrokernelTester();
    tester.mr(2).nr(16).kr(8).sr(1);
    tester.m(2).n(16).k(kc);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x16c8__avx512skx,
        xnn_init_qu8_conv_minmax_fp32_avx512_params,
        xnn_qu8_requantize_fp32);
  }
}
28010
// 2x16c8 AVX512SKX kernel: k = 16, 24, ..., 80 on a 2x16 tile with a_stride(83).
TEST(QU8_GEMM_MINMAX_FP32_2X16C8__AVX512SKX, k_div_8_strided_a) {
  TEST_REQUIRES_X86_AVX512SKX;
  for (size_t kc = 16; kc <= 80; kc += 8) {
    // A fresh tester is configured for every k value.
    auto tester = GemmMicrokernelTester();
    tester.mr(2).nr(16).kr(8).sr(1);
    tester.m(2).n(16).k(kc);
    tester.a_stride(83);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x16c8__avx512skx,
        xnn_init_qu8_conv_minmax_fp32_avx512_params,
        xnn_qu8_requantize_fp32);
  }
}
28026
// 2x16c8 AVX512SKX kernel: k = 16, 24, ..., 80 for every n in [1, 16] and m in [1, 2], iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_2X16C8__AVX512SKX, k_div_8_subtile) {
  TEST_REQUIRES_X86_AVX512SKX;
  for (size_t kc = 16; kc <= 80; kc += 8) {
    for (uint32_t nc = 1; nc <= 16; ++nc) {
      for (uint32_t mc = 1; mc <= 2; ++mc) {
        // A fresh tester is configured for every (k, n, m) combination.
        auto tester = GemmMicrokernelTester();
        tester.mr(2).nr(16).kr(8).sr(1);
        tester.m(mc).n(nc).k(kc);
        tester.iterations(1);
        tester.Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_2x16c8__avx512skx,
            xnn_init_qu8_conv_minmax_fp32_avx512_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
28046
// 2x16c8 AVX512SKX kernel: n in [17, 31] combined with k = 1, 10, 19, 28, 37 and m = 2.
TEST(QU8_GEMM_MINMAX_FP32_2X16C8__AVX512SKX, n_gt_16) {
  TEST_REQUIRES_X86_AVX512SKX;
  for (uint32_t nc = 17; nc < 32; ++nc) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      // A fresh tester is configured for every (n, k) pair.
      auto tester = GemmMicrokernelTester();
      tester.mr(2).nr(16).kr(8).sr(1);
      tester.m(2).n(nc).k(kc);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_2x16c8__avx512skx,
          xnn_init_qu8_conv_minmax_fp32_avx512_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
28063
// 2x16c8 AVX512SKX kernel: n in [17, 31], k = 1, 10, 19, 28, 37, m = 2, with cn_stride(19).
TEST(QU8_GEMM_MINMAX_FP32_2X16C8__AVX512SKX, n_gt_16_strided_cn) {
  TEST_REQUIRES_X86_AVX512SKX;
  for (uint32_t nc = 17; nc < 32; ++nc) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      // A fresh tester is configured for every (n, k) pair.
      auto tester = GemmMicrokernelTester();
      tester.mr(2).nr(16).kr(8).sr(1);
      tester.m(2).n(nc).k(kc);
      tester.cn_stride(19);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_2x16c8__avx512skx,
          xnn_init_qu8_conv_minmax_fp32_avx512_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
28081
// 2x16c8 AVX512SKX kernel: n in [17, 31], k = 1, 10, 19, 28, 37, m = 2, with a_stride(43).
TEST(QU8_GEMM_MINMAX_FP32_2X16C8__AVX512SKX, n_gt_16_strided_a) {
  TEST_REQUIRES_X86_AVX512SKX;
  for (uint32_t nc = 17; nc < 32; ++nc) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      // A fresh tester is configured for every (n, k) pair.
      auto tester = GemmMicrokernelTester();
      tester.mr(2).nr(16).kr(8).sr(1);
      tester.m(2).n(nc).k(kc);
      tester.a_stride(43);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_2x16c8__avx512skx,
          xnn_init_qu8_conv_minmax_fp32_avx512_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
28099
// 2x16c8 AVX512SKX kernel: n in [17, 31], k = 1, 10, 19, 28, 37, m in [1, 2], iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_2X16C8__AVX512SKX, n_gt_16_subtile) {
  TEST_REQUIRES_X86_AVX512SKX;
  for (uint32_t nc = 17; nc < 32; ++nc) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      for (uint32_t mc = 1; mc <= 2; ++mc) {
        // A fresh tester is configured for every (n, k, m) combination.
        auto tester = GemmMicrokernelTester();
        tester.mr(2).nr(16).kr(8).sr(1);
        tester.m(mc).n(nc).k(kc);
        tester.iterations(1);
        tester.Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_2x16c8__avx512skx,
            xnn_init_qu8_conv_minmax_fp32_avx512_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
28119
// 2x16c8 AVX512SKX kernel: n = 32 and 48 combined with k = 1, 10, 19, 28, 37 and m = 2.
TEST(QU8_GEMM_MINMAX_FP32_2X16C8__AVX512SKX, n_div_16) {
  TEST_REQUIRES_X86_AVX512SKX;
  for (uint32_t nc = 32; nc <= 48; nc += 16) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      // A fresh tester is configured for every (n, k) pair.
      auto tester = GemmMicrokernelTester();
      tester.mr(2).nr(16).kr(8).sr(1);
      tester.m(2).n(nc).k(kc);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_2x16c8__avx512skx,
          xnn_init_qu8_conv_minmax_fp32_avx512_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
28136
// 2x16c8 AVX512SKX kernel: n = 32 and 48, k = 1, 10, 19, 28, 37, m = 2, with cn_stride(19).
TEST(QU8_GEMM_MINMAX_FP32_2X16C8__AVX512SKX, n_div_16_strided_cn) {
  TEST_REQUIRES_X86_AVX512SKX;
  for (uint32_t nc = 32; nc <= 48; nc += 16) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      // A fresh tester is configured for every (n, k) pair.
      auto tester = GemmMicrokernelTester();
      tester.mr(2).nr(16).kr(8).sr(1);
      tester.m(2).n(nc).k(kc);
      tester.cn_stride(19);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_2x16c8__avx512skx,
          xnn_init_qu8_conv_minmax_fp32_avx512_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
28154
// 2x16c8 AVX512SKX kernel: n = 32 and 48, k = 1, 10, 19, 28, 37, m = 2, with a_stride(43).
TEST(QU8_GEMM_MINMAX_FP32_2X16C8__AVX512SKX, n_div_16_strided_a) {
  TEST_REQUIRES_X86_AVX512SKX;
  for (uint32_t nc = 32; nc <= 48; nc += 16) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      // A fresh tester is configured for every (n, k) pair.
      auto tester = GemmMicrokernelTester();
      tester.mr(2).nr(16).kr(8).sr(1);
      tester.m(2).n(nc).k(kc);
      tester.a_stride(43);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_2x16c8__avx512skx,
          xnn_init_qu8_conv_minmax_fp32_avx512_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
28172
// 2x16c8 AVX512SKX kernel: n = 32 and 48, k = 1, 10, 19, 28, 37, m in [1, 2], iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_2X16C8__AVX512SKX, n_div_16_subtile) {
  TEST_REQUIRES_X86_AVX512SKX;
  for (uint32_t nc = 32; nc <= 48; nc += 16) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      for (uint32_t mc = 1; mc <= 2; ++mc) {
        // A fresh tester is configured for every (n, k, m) combination.
        auto tester = GemmMicrokernelTester();
        tester.mr(2).nr(16).kr(8).sr(1);
        tester.m(mc).n(nc).k(kc);
        tester.iterations(1);
        tester.Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_2x16c8__avx512skx,
            xnn_init_qu8_conv_minmax_fp32_avx512_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
28192
// 2x16c8 AVX512SKX kernel: k = 1, 10, 19, 28, 37, n in [1, 16], m in [1, 2], with cm_stride(19) and iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_2X16C8__AVX512SKX, strided_cm_subtile) {
  TEST_REQUIRES_X86_AVX512SKX;
  for (size_t kc = 1; kc <= 40; kc += 9) {
    for (uint32_t nc = 1; nc <= 16; ++nc) {
      for (uint32_t mc = 1; mc <= 2; ++mc) {
        // A fresh tester is configured for every (k, n, m) combination.
        auto tester = GemmMicrokernelTester();
        tester.mr(2).nr(16).kr(8).sr(1);
        tester.m(mc).n(nc).k(kc);
        tester.cm_stride(19).iterations(1);
        tester.Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_2x16c8__avx512skx,
            xnn_init_qu8_conv_minmax_fp32_avx512_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
28213
// 2x16c8 AVX512SKX kernel: single 2x16 tile with k = 8 and qmin(128).
TEST(QU8_GEMM_MINMAX_FP32_2X16C8__AVX512SKX, qmin) {
  TEST_REQUIRES_X86_AVX512SKX;
  auto tester = GemmMicrokernelTester();
  tester.mr(2).nr(16).kr(8).sr(1);
  tester.m(2).n(16).k(8);
  tester.qmin(128);
  tester.Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_2x16c8__avx512skx,
      xnn_init_qu8_conv_minmax_fp32_avx512_params,
      xnn_qu8_requantize_fp32);
}
28227
// 2x16c8 AVX512SKX kernel: single 2x16 tile with k = 8 and qmax(128).
TEST(QU8_GEMM_MINMAX_FP32_2X16C8__AVX512SKX, qmax) {
  TEST_REQUIRES_X86_AVX512SKX;
  auto tester = GemmMicrokernelTester();
  tester.mr(2).nr(16).kr(8).sr(1);
  tester.m(2).n(16).k(8);
  tester.qmax(128);
  tester.Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_2x16c8__avx512skx,
      xnn_init_qu8_conv_minmax_fp32_avx512_params,
      xnn_qu8_requantize_fp32);
}
28241
// 2x16c8 AVX512SKX kernel: single 2x16 tile with k = 8 and cm_stride(19).
TEST(QU8_GEMM_MINMAX_FP32_2X16C8__AVX512SKX, strided_cm) {
  TEST_REQUIRES_X86_AVX512SKX;
  auto tester = GemmMicrokernelTester();
  tester.mr(2).nr(16).kr(8).sr(1);
  tester.m(2).n(16).k(8);
  tester.cm_stride(19);
  tester.Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_2x16c8__avx512skx,
      xnn_init_qu8_conv_minmax_fp32_avx512_params,
      xnn_qu8_requantize_fp32);
}
28255
// 2x16c8 AVX512SKX kernel: k = 1, 10, 19, 28, 37 on a 2x16 tile with a_zero_point(0).
TEST(QU8_GEMM_MINMAX_FP32_2X16C8__AVX512SKX, no_a_zero_point) {
  TEST_REQUIRES_X86_AVX512SKX;
  for (size_t kc = 1; kc <= 40; kc += 9) {
    // A fresh tester is configured for every k value.
    auto tester = GemmMicrokernelTester();
    tester.mr(2).nr(16).kr(8).sr(1);
    tester.m(2).n(16).k(kc);
    tester.a_zero_point(0);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x16c8__avx512skx,
        xnn_init_qu8_conv_minmax_fp32_avx512_params,
        xnn_qu8_requantize_fp32);
  }
}
28271
// 2x16c8 AVX512SKX kernel: k = 1, 10, 19, 28, 37 on a 2x16 tile with b_zero_point(0).
TEST(QU8_GEMM_MINMAX_FP32_2X16C8__AVX512SKX, no_b_zero_point) {
  TEST_REQUIRES_X86_AVX512SKX;
  for (size_t kc = 1; kc <= 40; kc += 9) {
    // A fresh tester is configured for every k value.
    auto tester = GemmMicrokernelTester();
    tester.mr(2).nr(16).kr(8).sr(1);
    tester.m(2).n(16).k(kc);
    tester.b_zero_point(0);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x16c8__avx512skx,
        xnn_init_qu8_conv_minmax_fp32_avx512_params,
        xnn_qu8_requantize_fp32);
  }
}
28287
// 2x16c8 AVX512SKX kernel: k = 1, 10, 19, 28, 37 on a 2x16 tile with a_zero_point(0) and b_zero_point(0).
TEST(QU8_GEMM_MINMAX_FP32_2X16C8__AVX512SKX, no_zero_point) {
  TEST_REQUIRES_X86_AVX512SKX;
  for (size_t kc = 1; kc <= 40; kc += 9) {
    // A fresh tester is configured for every k value.
    auto tester = GemmMicrokernelTester();
    tester.mr(2).nr(16).kr(8).sr(1);
    tester.m(2).n(16).k(kc);
    tester.a_zero_point(0).b_zero_point(0);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x16c8__avx512skx,
        xnn_init_qu8_conv_minmax_fp32_avx512_params,
        xnn_qu8_requantize_fp32);
  }
}
28304 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
28305
28306
28307 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// 4x16c8 AVX512SKX kernel: single 4x16 tile with k = 8.
TEST(QU8_GEMM_MINMAX_FP32_4X16C8__AVX512SKX, k_eq_8) {
  TEST_REQUIRES_X86_AVX512SKX;
  auto tester = GemmMicrokernelTester();
  tester.mr(4).nr(16).kr(8).sr(1);
  tester.m(4).n(16).k(8);
  tester.Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_4x16c8__avx512skx,
      xnn_init_qu8_conv_minmax_fp32_avx512_params,
      xnn_qu8_requantize_fp32);
}
28320
// 4x16c8 AVX512SKX kernel: single 4x16 tile with k = 8 and cn_stride(19).
TEST(QU8_GEMM_MINMAX_FP32_4X16C8__AVX512SKX, strided_cn) {
  TEST_REQUIRES_X86_AVX512SKX;
  auto tester = GemmMicrokernelTester();
  tester.mr(4).nr(16).kr(8).sr(1);
  tester.m(4).n(16).k(8);
  tester.cn_stride(19);
  tester.Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_4x16c8__avx512skx,
      xnn_init_qu8_conv_minmax_fp32_avx512_params,
      xnn_qu8_requantize_fp32);
}
28334
// 4x16c8 AVX512SKX kernel: single 4x16 tile with k = 8 and a_stride(11).
TEST(QU8_GEMM_MINMAX_FP32_4X16C8__AVX512SKX, k_eq_8_strided_a) {
  TEST_REQUIRES_X86_AVX512SKX;
  auto tester = GemmMicrokernelTester();
  tester.mr(4).nr(16).kr(8).sr(1);
  tester.m(4).n(16).k(8);
  tester.a_stride(11);
  tester.Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_4x16c8__avx512skx,
      xnn_init_qu8_conv_minmax_fp32_avx512_params,
      xnn_qu8_requantize_fp32);
}
28348
// 4x16c8 AVX512SKX kernel: k = 8 for every n in [1, 16] and m in [1, 4], iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_4X16C8__AVX512SKX, k_eq_8_subtile) {
  TEST_REQUIRES_X86_AVX512SKX;
  for (uint32_t nc = 1; nc <= 16; ++nc) {
    for (uint32_t mc = 1; mc <= 4; ++mc) {
      // A fresh tester is configured for every (n, m) pair.
      auto tester = GemmMicrokernelTester();
      tester.mr(4).nr(16).kr(8).sr(1);
      tester.m(mc).n(nc).k(8);
      tester.iterations(1);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_4x16c8__avx512skx,
          xnn_init_qu8_conv_minmax_fp32_avx512_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
28366
// 4x16c8 AVX512SKX kernel: k = 8, n = 16 for every m in [1, 4], iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_4X16C8__AVX512SKX, k_eq_8_subtile_m) {
  TEST_REQUIRES_X86_AVX512SKX;
  for (uint32_t mc = 1; mc <= 4; ++mc) {
    // A fresh tester is configured for every m value.
    auto tester = GemmMicrokernelTester();
    tester.mr(4).nr(16).kr(8).sr(1);
    tester.m(mc).n(16).k(8);
    tester.iterations(1);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_4x16c8__avx512skx,
        xnn_init_qu8_conv_minmax_fp32_avx512_params,
        xnn_qu8_requantize_fp32);
  }
}
28382
// 4x16c8 AVX512SKX kernel: k = 8, m = 4 for every n in [1, 16], iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_4X16C8__AVX512SKX, k_eq_8_subtile_n) {
  TEST_REQUIRES_X86_AVX512SKX;
  for (uint32_t nc = 1; nc <= 16; ++nc) {
    // A fresh tester is configured for every n value.
    auto tester = GemmMicrokernelTester();
    tester.mr(4).nr(16).kr(8).sr(1);
    tester.m(4).n(nc).k(8);
    tester.iterations(1);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_4x16c8__avx512skx,
        xnn_init_qu8_conv_minmax_fp32_avx512_params,
        xnn_qu8_requantize_fp32);
  }
}
28398
// 4x16c8 AVX512SKX kernel: k in [1, 7] on a 4x16 tile.
TEST(QU8_GEMM_MINMAX_FP32_4X16C8__AVX512SKX, k_lt_8) {
  TEST_REQUIRES_X86_AVX512SKX;
  for (size_t kc = 1; kc < 8; ++kc) {
    // A fresh tester is configured for every k value.
    auto tester = GemmMicrokernelTester();
    tester.mr(4).nr(16).kr(8).sr(1);
    tester.m(4).n(16).k(kc);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_4x16c8__avx512skx,
        xnn_init_qu8_conv_minmax_fp32_avx512_params,
        xnn_qu8_requantize_fp32);
  }
}
28413
// 4x16c8 AVX512SKX kernel: k in [1, 7] on a 4x16 tile with a_stride(11).
TEST(QU8_GEMM_MINMAX_FP32_4X16C8__AVX512SKX, k_lt_8_strided_a) {
  TEST_REQUIRES_X86_AVX512SKX;
  for (size_t kc = 1; kc < 8; ++kc) {
    // A fresh tester is configured for every k value.
    auto tester = GemmMicrokernelTester();
    tester.mr(4).nr(16).kr(8).sr(1);
    tester.m(4).n(16).k(kc);
    tester.a_stride(11);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_4x16c8__avx512skx,
        xnn_init_qu8_conv_minmax_fp32_avx512_params,
        xnn_qu8_requantize_fp32);
  }
}
28429
// 4x16c8 AVX512SKX kernel: k in [1, 7] for every n in [1, 16] and m in [1, 4], iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_4X16C8__AVX512SKX, k_lt_8_subtile) {
  TEST_REQUIRES_X86_AVX512SKX;
  for (size_t kc = 1; kc < 8; ++kc) {
    for (uint32_t nc = 1; nc <= 16; ++nc) {
      for (uint32_t mc = 1; mc <= 4; ++mc) {
        // A fresh tester is configured for every (k, n, m) combination.
        auto tester = GemmMicrokernelTester();
        tester.mr(4).nr(16).kr(8).sr(1);
        tester.m(mc).n(nc).k(kc);
        tester.iterations(1);
        tester.Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_4x16c8__avx512skx,
            xnn_init_qu8_conv_minmax_fp32_avx512_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
28449
// 4x16c8 AVX512SKX kernel: k in [9, 15] on a 4x16 tile.
TEST(QU8_GEMM_MINMAX_FP32_4X16C8__AVX512SKX, k_gt_8) {
  TEST_REQUIRES_X86_AVX512SKX;
  for (size_t kc = 9; kc < 16; ++kc) {
    // A fresh tester is configured for every k value.
    auto tester = GemmMicrokernelTester();
    tester.mr(4).nr(16).kr(8).sr(1);
    tester.m(4).n(16).k(kc);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_4x16c8__avx512skx,
        xnn_init_qu8_conv_minmax_fp32_avx512_params,
        xnn_qu8_requantize_fp32);
  }
}
28464
// 4x16c8 AVX512SKX kernel: k in [9, 15] on a 4x16 tile with a_stride(19).
TEST(QU8_GEMM_MINMAX_FP32_4X16C8__AVX512SKX, k_gt_8_strided_a) {
  TEST_REQUIRES_X86_AVX512SKX;
  for (size_t kc = 9; kc < 16; ++kc) {
    // A fresh tester is configured for every k value.
    auto tester = GemmMicrokernelTester();
    tester.mr(4).nr(16).kr(8).sr(1);
    tester.m(4).n(16).k(kc);
    tester.a_stride(19);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_4x16c8__avx512skx,
        xnn_init_qu8_conv_minmax_fp32_avx512_params,
        xnn_qu8_requantize_fp32);
  }
}
28480
// 4x16c8 AVX512SKX kernel: k in [9, 15] for every n in [1, 16] and m in [1, 4], iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_4X16C8__AVX512SKX, k_gt_8_subtile) {
  TEST_REQUIRES_X86_AVX512SKX;
  for (size_t kc = 9; kc < 16; ++kc) {
    for (uint32_t nc = 1; nc <= 16; ++nc) {
      for (uint32_t mc = 1; mc <= 4; ++mc) {
        // A fresh tester is configured for every (k, n, m) combination.
        auto tester = GemmMicrokernelTester();
        tester.mr(4).nr(16).kr(8).sr(1);
        tester.m(mc).n(nc).k(kc);
        tester.iterations(1);
        tester.Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_4x16c8__avx512skx,
            xnn_init_qu8_conv_minmax_fp32_avx512_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
28500
// 4x16c8 AVX512SKX kernel: k = 16, 24, ..., 80 on a 4x16 tile.
TEST(QU8_GEMM_MINMAX_FP32_4X16C8__AVX512SKX, k_div_8) {
  TEST_REQUIRES_X86_AVX512SKX;
  for (size_t kc = 16; kc <= 80; kc += 8) {
    // A fresh tester is configured for every k value.
    auto tester = GemmMicrokernelTester();
    tester.mr(4).nr(16).kr(8).sr(1);
    tester.m(4).n(16).k(kc);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_4x16c8__avx512skx,
        xnn_init_qu8_conv_minmax_fp32_avx512_params,
        xnn_qu8_requantize_fp32);
  }
}
28515
// 4x16c8 AVX512SKX kernel: k = 16, 24, ..., 80 on a 4x16 tile with a_stride(83).
TEST(QU8_GEMM_MINMAX_FP32_4X16C8__AVX512SKX, k_div_8_strided_a) {
  TEST_REQUIRES_X86_AVX512SKX;
  for (size_t kc = 16; kc <= 80; kc += 8) {
    // A fresh tester is configured for every k value.
    auto tester = GemmMicrokernelTester();
    tester.mr(4).nr(16).kr(8).sr(1);
    tester.m(4).n(16).k(kc);
    tester.a_stride(83);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_4x16c8__avx512skx,
        xnn_init_qu8_conv_minmax_fp32_avx512_params,
        xnn_qu8_requantize_fp32);
  }
}
28531
// 4x16c8 AVX512SKX kernel: k = 16, 24, ..., 80 for every n in [1, 16] and m in [1, 4], iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_4X16C8__AVX512SKX, k_div_8_subtile) {
  TEST_REQUIRES_X86_AVX512SKX;
  for (size_t kc = 16; kc <= 80; kc += 8) {
    for (uint32_t nc = 1; nc <= 16; ++nc) {
      for (uint32_t mc = 1; mc <= 4; ++mc) {
        // A fresh tester is configured for every (k, n, m) combination.
        auto tester = GemmMicrokernelTester();
        tester.mr(4).nr(16).kr(8).sr(1);
        tester.m(mc).n(nc).k(kc);
        tester.iterations(1);
        tester.Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_4x16c8__avx512skx,
            xnn_init_qu8_conv_minmax_fp32_avx512_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
28551
// 4x16c8 AVX512SKX kernel: n in [17, 31] combined with k = 1, 10, 19, 28, 37 and m = 4.
TEST(QU8_GEMM_MINMAX_FP32_4X16C8__AVX512SKX, n_gt_16) {
  TEST_REQUIRES_X86_AVX512SKX;
  for (uint32_t nc = 17; nc < 32; ++nc) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      // A fresh tester is configured for every (n, k) pair.
      auto tester = GemmMicrokernelTester();
      tester.mr(4).nr(16).kr(8).sr(1);
      tester.m(4).n(nc).k(kc);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_4x16c8__avx512skx,
          xnn_init_qu8_conv_minmax_fp32_avx512_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
28568
// 4x16c8 AVX512SKX kernel: n in [17, 31], k = 1, 10, 19, 28, 37, m = 4, with cn_stride(19).
TEST(QU8_GEMM_MINMAX_FP32_4X16C8__AVX512SKX, n_gt_16_strided_cn) {
  TEST_REQUIRES_X86_AVX512SKX;
  for (uint32_t nc = 17; nc < 32; ++nc) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      // A fresh tester is configured for every (n, k) pair.
      auto tester = GemmMicrokernelTester();
      tester.mr(4).nr(16).kr(8).sr(1);
      tester.m(4).n(nc).k(kc);
      tester.cn_stride(19);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_4x16c8__avx512skx,
          xnn_init_qu8_conv_minmax_fp32_avx512_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
28586
// 4x16c8 AVX512SKX kernel: n in [17, 31], k = 1, 10, 19, 28, 37, m = 4, with a_stride(43).
TEST(QU8_GEMM_MINMAX_FP32_4X16C8__AVX512SKX, n_gt_16_strided_a) {
  TEST_REQUIRES_X86_AVX512SKX;
  for (uint32_t nc = 17; nc < 32; ++nc) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      // A fresh tester is configured for every (n, k) pair.
      auto tester = GemmMicrokernelTester();
      tester.mr(4).nr(16).kr(8).sr(1);
      tester.m(4).n(nc).k(kc);
      tester.a_stride(43);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_4x16c8__avx512skx,
          xnn_init_qu8_conv_minmax_fp32_avx512_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
28604
// n_gt_16_subtile: n in 17..31, k in 1, 10, ..., 37, and every m in [1, 4];
// each configuration runs with iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_4X16C8__AVX512SKX, n_gt_16_subtile) {
  TEST_REQUIRES_X86_AVX512SKX;
  for (uint32_t n_size = 17; n_size < 32; ++n_size) {
    for (size_t k_size = 1; k_size <= 40; k_size += 9) {
      for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
        GemmMicrokernelTester tester;
        tester
          .mr(4).nr(16).kr(8).sr(1)
          .m(m_size).n(n_size).k(k_size)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c8__avx512skx, xnn_init_qu8_conv_minmax_fp32_avx512_params, xnn_qu8_requantize_fp32);
      }
    }
  }
}
28624
// n_div_16: n takes the values 32 and 48; for each, k runs over
// 1, 10, 19, 28, 37 with m fixed at 4.
TEST(QU8_GEMM_MINMAX_FP32_4X16C8__AVX512SKX, n_div_16) {
  TEST_REQUIRES_X86_AVX512SKX;
  for (uint32_t n_size = 32; n_size <= 48; n_size += 16) {
    for (size_t k_size = 1; k_size <= 40; k_size += 9) {
      GemmMicrokernelTester tester;
      tester
        .mr(4).nr(16).kr(8).sr(1)
        .m(4).n(n_size).k(k_size)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c8__avx512skx, xnn_init_qu8_conv_minmax_fp32_avx512_params, xnn_qu8_requantize_fp32);
    }
  }
}
28641
// n_div_16_strided_cn: n in {32, 48}, k in 1, 10, ..., 37, m fixed at 4,
// with cn_stride set to 19.
TEST(QU8_GEMM_MINMAX_FP32_4X16C8__AVX512SKX, n_div_16_strided_cn) {
  TEST_REQUIRES_X86_AVX512SKX;
  for (uint32_t n_size = 32; n_size <= 48; n_size += 16) {
    for (size_t k_size = 1; k_size <= 40; k_size += 9) {
      GemmMicrokernelTester tester;
      tester
        .mr(4).nr(16).kr(8).sr(1)
        .m(4).n(n_size).k(k_size)
        .cn_stride(19)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c8__avx512skx, xnn_init_qu8_conv_minmax_fp32_avx512_params, xnn_qu8_requantize_fp32);
    }
  }
}
28659
// n_div_16_strided_a: n in {32, 48}, k in 1, 10, ..., 37, m fixed at 4,
// with a_stride set to 43.
TEST(QU8_GEMM_MINMAX_FP32_4X16C8__AVX512SKX, n_div_16_strided_a) {
  TEST_REQUIRES_X86_AVX512SKX;
  for (uint32_t n_size = 32; n_size <= 48; n_size += 16) {
    for (size_t k_size = 1; k_size <= 40; k_size += 9) {
      GemmMicrokernelTester tester;
      tester
        .mr(4).nr(16).kr(8).sr(1)
        .m(4).n(n_size).k(k_size)
        .a_stride(43)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c8__avx512skx, xnn_init_qu8_conv_minmax_fp32_avx512_params, xnn_qu8_requantize_fp32);
    }
  }
}
28677
// n_div_16_subtile: n in {32, 48}, k in 1, 10, ..., 37, and every m in [1, 4];
// each configuration runs with iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_4X16C8__AVX512SKX, n_div_16_subtile) {
  TEST_REQUIRES_X86_AVX512SKX;
  for (uint32_t n_size = 32; n_size <= 48; n_size += 16) {
    for (size_t k_size = 1; k_size <= 40; k_size += 9) {
      for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
        GemmMicrokernelTester tester;
        tester
          .mr(4).nr(16).kr(8).sr(1)
          .m(m_size).n(n_size).k(k_size)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c8__avx512skx, xnn_init_qu8_conv_minmax_fp32_avx512_params, xnn_qu8_requantize_fp32);
      }
    }
  }
}
28697
// strided_cm_subtile: k in 1, 10, ..., 37, n in [1, 16], m in [1, 4],
// with cm_stride set to 19 and iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_4X16C8__AVX512SKX, strided_cm_subtile) {
  TEST_REQUIRES_X86_AVX512SKX;
  for (size_t k_size = 1; k_size <= 40; k_size += 9) {
    for (uint32_t n_size = 1; n_size <= 16; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
        GemmMicrokernelTester tester;
        tester
          .mr(4).nr(16).kr(8).sr(1)
          .m(m_size).n(n_size).k(k_size)
          .cm_stride(19)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c8__avx512skx, xnn_init_qu8_conv_minmax_fp32_avx512_params, xnn_qu8_requantize_fp32);
      }
    }
  }
}
28718
// qmin: single 4x16 configuration with k = 8 and qmin set to 128.
TEST(QU8_GEMM_MINMAX_FP32_4X16C8__AVX512SKX, qmin) {
  TEST_REQUIRES_X86_AVX512SKX;
  GemmMicrokernelTester tester;
  tester
    .mr(4).nr(16).kr(8).sr(1)
    .m(4).n(16).k(8)
    .qmin(128)
    .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c8__avx512skx, xnn_init_qu8_conv_minmax_fp32_avx512_params, xnn_qu8_requantize_fp32);
}
28732
// qmax: single 4x16 configuration with k = 8 and qmax set to 128.
TEST(QU8_GEMM_MINMAX_FP32_4X16C8__AVX512SKX, qmax) {
  TEST_REQUIRES_X86_AVX512SKX;
  GemmMicrokernelTester tester;
  tester
    .mr(4).nr(16).kr(8).sr(1)
    .m(4).n(16).k(8)
    .qmax(128)
    .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c8__avx512skx, xnn_init_qu8_conv_minmax_fp32_avx512_params, xnn_qu8_requantize_fp32);
}
28746
// strided_cm: single 4x16 configuration with k = 8 and cm_stride set to 19.
TEST(QU8_GEMM_MINMAX_FP32_4X16C8__AVX512SKX, strided_cm) {
  TEST_REQUIRES_X86_AVX512SKX;
  GemmMicrokernelTester tester;
  tester
    .mr(4).nr(16).kr(8).sr(1)
    .m(4).n(16).k(8)
    .cm_stride(19)
    .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c8__avx512skx, xnn_init_qu8_conv_minmax_fp32_avx512_params, xnn_qu8_requantize_fp32);
}
28760
// no_a_zero_point: m = 4, n = 16, k in 1, 10, ..., 37, with a_zero_point set to 0.
TEST(QU8_GEMM_MINMAX_FP32_4X16C8__AVX512SKX, no_a_zero_point) {
  TEST_REQUIRES_X86_AVX512SKX;
  for (size_t k_size = 1; k_size <= 40; k_size += 9) {
    GemmMicrokernelTester tester;
    tester
      .mr(4).nr(16).kr(8).sr(1)
      .m(4).n(16).k(k_size)
      .a_zero_point(0)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c8__avx512skx, xnn_init_qu8_conv_minmax_fp32_avx512_params, xnn_qu8_requantize_fp32);
  }
}
28776
// no_b_zero_point: m = 4, n = 16, k in 1, 10, ..., 37, with b_zero_point set to 0.
TEST(QU8_GEMM_MINMAX_FP32_4X16C8__AVX512SKX, no_b_zero_point) {
  TEST_REQUIRES_X86_AVX512SKX;
  for (size_t k_size = 1; k_size <= 40; k_size += 9) {
    GemmMicrokernelTester tester;
    tester
      .mr(4).nr(16).kr(8).sr(1)
      .m(4).n(16).k(k_size)
      .b_zero_point(0)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c8__avx512skx, xnn_init_qu8_conv_minmax_fp32_avx512_params, xnn_qu8_requantize_fp32);
  }
}
28792
// no_zero_point: m = 4, n = 16, k in 1, 10, ..., 37, with both a_zero_point
// and b_zero_point set to 0.
TEST(QU8_GEMM_MINMAX_FP32_4X16C8__AVX512SKX, no_zero_point) {
  TEST_REQUIRES_X86_AVX512SKX;
  for (size_t k_size = 1; k_size <= 40; k_size += 9) {
    GemmMicrokernelTester tester;
    tester
      .mr(4).nr(16).kr(8).sr(1)
      .m(4).n(16).k(k_size)
      .a_zero_point(0)
      .b_zero_point(0)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c8__avx512skx, xnn_init_qu8_conv_minmax_fp32_avx512_params, xnn_qu8_requantize_fp32);
  }
}
28809 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
28810
28811
28812 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
// k_eq_8: single configuration with m = 1, n = 4, k = 8.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__WASMSIMD_DOT16X2_LD64, k_eq_8) {
  GemmMicrokernelTester tester;
  tester
    .mr(1).nr(4).kr(2).sr(1)
    .m(1).n(4).k(8)
    .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32);
}
28824
// strided_cn: m = 1, n = 4, k = 8 with cn_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__WASMSIMD_DOT16X2_LD64, strided_cn) {
  GemmMicrokernelTester tester;
  tester
    .mr(1).nr(4).kr(2).sr(1)
    .m(1).n(4).k(8)
    .cn_stride(7)
    .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32);
}
28837
// k_eq_8_strided_a: m = 1, n = 4, k = 8 with a_stride set to 11.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__WASMSIMD_DOT16X2_LD64, k_eq_8_strided_a) {
  GemmMicrokernelTester tester;
  tester
    .mr(1).nr(4).kr(2).sr(1)
    .m(1).n(4).k(8)
    .a_stride(11)
    .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32);
}
28850
// k_eq_8_subtile: k = 8 with n in [1, 4]; the m loop covers only m = 1.
// Each configuration runs with iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__WASMSIMD_DOT16X2_LD64, k_eq_8_subtile) {
  for (uint32_t n_size = 1; n_size <= 4; ++n_size) {
    for (uint32_t m_size = 1; m_size <= 1; ++m_size) {
      GemmMicrokernelTester tester;
      tester
        .mr(1).nr(4).kr(2).sr(1)
        .m(m_size).n(n_size).k(8)
        .iterations(1)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32);
    }
  }
}
28867
// k_eq_8_subtile_m: n = 4, k = 8; the m loop covers only m = 1, with iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__WASMSIMD_DOT16X2_LD64, k_eq_8_subtile_m) {
  for (uint32_t m_size = 1; m_size <= 1; ++m_size) {
    GemmMicrokernelTester tester;
    tester
      .mr(1).nr(4).kr(2).sr(1)
      .m(m_size).n(4).k(8)
      .iterations(1)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32);
  }
}
28882
// k_eq_8_subtile_n: m = 1, k = 8, n in [1, 4], with iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__WASMSIMD_DOT16X2_LD64, k_eq_8_subtile_n) {
  for (uint32_t n_size = 1; n_size <= 4; ++n_size) {
    GemmMicrokernelTester tester;
    tester
      .mr(1).nr(4).kr(2).sr(1)
      .m(1).n(n_size).k(8)
      .iterations(1)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32);
  }
}
28897
// k_lt_8: m = 1, n = 4, k in 1..7.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__WASMSIMD_DOT16X2_LD64, k_lt_8) {
  for (size_t k_size = 1; k_size < 8; ++k_size) {
    GemmMicrokernelTester tester;
    tester
      .mr(1).nr(4).kr(2).sr(1)
      .m(1).n(4).k(k_size)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32);
  }
}
28911
// k_lt_8_strided_a: m = 1, n = 4, k in 1..7, with a_stride set to 11.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__WASMSIMD_DOT16X2_LD64, k_lt_8_strided_a) {
  for (size_t k_size = 1; k_size < 8; ++k_size) {
    GemmMicrokernelTester tester;
    tester
      .mr(1).nr(4).kr(2).sr(1)
      .m(1).n(4).k(k_size)
      .a_stride(11)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32);
  }
}
28926
// k_lt_8_subtile: k in 1..7, n in [1, 4]; the m loop covers only m = 1.
// Each configuration runs with iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__WASMSIMD_DOT16X2_LD64, k_lt_8_subtile) {
  for (size_t k_size = 1; k_size < 8; ++k_size) {
    for (uint32_t n_size = 1; n_size <= 4; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 1; ++m_size) {
        GemmMicrokernelTester tester;
        tester
          .mr(1).nr(4).kr(2).sr(1)
          .m(m_size).n(n_size).k(k_size)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32);
      }
    }
  }
}
28945
// k_gt_8: m = 1, n = 4, k in 9..15.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__WASMSIMD_DOT16X2_LD64, k_gt_8) {
  for (size_t k_size = 9; k_size < 16; ++k_size) {
    GemmMicrokernelTester tester;
    tester
      .mr(1).nr(4).kr(2).sr(1)
      .m(1).n(4).k(k_size)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32);
  }
}
28959
// k_gt_8_strided_a: m = 1, n = 4, k in 9..15, with a_stride set to 19.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__WASMSIMD_DOT16X2_LD64, k_gt_8_strided_a) {
  for (size_t k_size = 9; k_size < 16; ++k_size) {
    GemmMicrokernelTester tester;
    tester
      .mr(1).nr(4).kr(2).sr(1)
      .m(1).n(4).k(k_size)
      .a_stride(19)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32);
  }
}
28974
// k_gt_8_subtile: k in 9..15, n in [1, 4]; the m loop covers only m = 1.
// Each configuration runs with iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__WASMSIMD_DOT16X2_LD64, k_gt_8_subtile) {
  for (size_t k_size = 9; k_size < 16; ++k_size) {
    for (uint32_t n_size = 1; n_size <= 4; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 1; ++m_size) {
        GemmMicrokernelTester tester;
        tester
          .mr(1).nr(4).kr(2).sr(1)
          .m(m_size).n(n_size).k(k_size)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32);
      }
    }
  }
}
28993
// k_div_8: m = 1, n = 4, k in 16, 24, ..., 80.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__WASMSIMD_DOT16X2_LD64, k_div_8) {
  for (size_t k_size = 16; k_size <= 80; k_size += 8) {
    GemmMicrokernelTester tester;
    tester
      .mr(1).nr(4).kr(2).sr(1)
      .m(1).n(4).k(k_size)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32);
  }
}
29007
// k_div_8_strided_a: m = 1, n = 4, k in 16, 24, ..., 80, with a_stride set to 83.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__WASMSIMD_DOT16X2_LD64, k_div_8_strided_a) {
  for (size_t k_size = 16; k_size <= 80; k_size += 8) {
    GemmMicrokernelTester tester;
    tester
      .mr(1).nr(4).kr(2).sr(1)
      .m(1).n(4).k(k_size)
      .a_stride(83)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32);
  }
}
29022
// k_div_8_subtile: k in 16, 24, ..., 80, n in [1, 4]; the m loop covers only
// m = 1. Each configuration runs with iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__WASMSIMD_DOT16X2_LD64, k_div_8_subtile) {
  for (size_t k_size = 16; k_size <= 80; k_size += 8) {
    for (uint32_t n_size = 1; n_size <= 4; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 1; ++m_size) {
        GemmMicrokernelTester tester;
        tester
          .mr(1).nr(4).kr(2).sr(1)
          .m(m_size).n(n_size).k(k_size)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32);
      }
    }
  }
}
29041
// n_gt_4: m = 1, n in 5..7, and for each n, k in 1, 10, 19, 28, 37.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__WASMSIMD_DOT16X2_LD64, n_gt_4) {
  for (uint32_t n_size = 5; n_size < 8; ++n_size) {
    for (size_t k_size = 1; k_size <= 40; k_size += 9) {
      GemmMicrokernelTester tester;
      tester
        .mr(1).nr(4).kr(2).sr(1)
        .m(1).n(n_size).k(k_size)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32);
    }
  }
}
29057
// n_gt_4_strided_cn: m = 1, n in 5..7, k in 1, 10, ..., 37, with cn_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__WASMSIMD_DOT16X2_LD64, n_gt_4_strided_cn) {
  for (uint32_t n_size = 5; n_size < 8; ++n_size) {
    for (size_t k_size = 1; k_size <= 40; k_size += 9) {
      GemmMicrokernelTester tester;
      tester
        .mr(1).nr(4).kr(2).sr(1)
        .m(1).n(n_size).k(k_size)
        .cn_stride(7)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32);
    }
  }
}
29074
// n_gt_4_strided_a: m = 1, n in 5..7, k in 1, 10, ..., 37, with a_stride set to 43.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__WASMSIMD_DOT16X2_LD64, n_gt_4_strided_a) {
  for (uint32_t n_size = 5; n_size < 8; ++n_size) {
    for (size_t k_size = 1; k_size <= 40; k_size += 9) {
      GemmMicrokernelTester tester;
      tester
        .mr(1).nr(4).kr(2).sr(1)
        .m(1).n(n_size).k(k_size)
        .a_stride(43)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32);
    }
  }
}
29091
// n_gt_4_subtile: n in 5..7, k in 1, 10, ..., 37; the m loop covers only m = 1.
// Each configuration runs with iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__WASMSIMD_DOT16X2_LD64, n_gt_4_subtile) {
  for (uint32_t n_size = 5; n_size < 8; ++n_size) {
    for (size_t k_size = 1; k_size <= 40; k_size += 9) {
      for (uint32_t m_size = 1; m_size <= 1; ++m_size) {
        GemmMicrokernelTester tester;
        tester
          .mr(1).nr(4).kr(2).sr(1)
          .m(m_size).n(n_size).k(k_size)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32);
      }
    }
  }
}
29110
// n_div_4: m = 1, n in {8, 12}, and for each n, k in 1, 10, 19, 28, 37.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__WASMSIMD_DOT16X2_LD64, n_div_4) {
  for (uint32_t n_size = 8; n_size <= 12; n_size += 4) {
    for (size_t k_size = 1; k_size <= 40; k_size += 9) {
      GemmMicrokernelTester tester;
      tester
        .mr(1).nr(4).kr(2).sr(1)
        .m(1).n(n_size).k(k_size)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32);
    }
  }
}
29126
// n_div_4_strided_cn: m = 1, n in {8, 12}, k in 1, 10, ..., 37, with cn_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__WASMSIMD_DOT16X2_LD64, n_div_4_strided_cn) {
  for (uint32_t n_size = 8; n_size <= 12; n_size += 4) {
    for (size_t k_size = 1; k_size <= 40; k_size += 9) {
      GemmMicrokernelTester tester;
      tester
        .mr(1).nr(4).kr(2).sr(1)
        .m(1).n(n_size).k(k_size)
        .cn_stride(7)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32);
    }
  }
}
29143
// n_div_4_strided_a: m = 1, n in {8, 12}, k in 1, 10, ..., 37, with a_stride set to 43.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__WASMSIMD_DOT16X2_LD64, n_div_4_strided_a) {
  for (uint32_t n_size = 8; n_size <= 12; n_size += 4) {
    for (size_t k_size = 1; k_size <= 40; k_size += 9) {
      GemmMicrokernelTester tester;
      tester
        .mr(1).nr(4).kr(2).sr(1)
        .m(1).n(n_size).k(k_size)
        .a_stride(43)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32);
    }
  }
}
29160
// n_div_4_subtile: n in {8, 12}, k in 1, 10, ..., 37; the m loop covers only
// m = 1. Each configuration runs with iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__WASMSIMD_DOT16X2_LD64, n_div_4_subtile) {
  for (uint32_t n_size = 8; n_size <= 12; n_size += 4) {
    for (size_t k_size = 1; k_size <= 40; k_size += 9) {
      for (uint32_t m_size = 1; m_size <= 1; ++m_size) {
        GemmMicrokernelTester tester;
        tester
          .mr(1).nr(4).kr(2).sr(1)
          .m(m_size).n(n_size).k(k_size)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32);
      }
    }
  }
}
29179
// strided_cm_subtile: k in 1, 10, ..., 37, n in [1, 4]; the m loop covers only
// m = 1. Uses cm_stride = 7 and iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__WASMSIMD_DOT16X2_LD64, strided_cm_subtile) {
  for (size_t k_size = 1; k_size <= 40; k_size += 9) {
    for (uint32_t n_size = 1; n_size <= 4; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 1; ++m_size) {
        GemmMicrokernelTester tester;
        tester
          .mr(1).nr(4).kr(2).sr(1)
          .m(m_size).n(n_size).k(k_size)
          .cm_stride(7)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32);
      }
    }
  }
}
29199
// qmin: m = 1, n = 4, k = 8 with qmin set to 128.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__WASMSIMD_DOT16X2_LD64, qmin) {
  GemmMicrokernelTester tester;
  tester
    .mr(1).nr(4).kr(2).sr(1)
    .m(1).n(4).k(8)
    .qmin(128)
    .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32);
}
29212
// qmax: m = 1, n = 4, k = 8 with qmax set to 128.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__WASMSIMD_DOT16X2_LD64, qmax) {
  GemmMicrokernelTester tester;
  tester
    .mr(1).nr(4).kr(2).sr(1)
    .m(1).n(4).k(8)
    .qmax(128)
    .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32);
}
29225
// strided_cm: m = 1, n = 4, k = 8 with cm_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__WASMSIMD_DOT16X2_LD64, strided_cm) {
  GemmMicrokernelTester tester;
  tester
    .mr(1).nr(4).kr(2).sr(1)
    .m(1).n(4).k(8)
    .cm_stride(7)
    .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32);
}
29238
// no_a_zero_point: m = 1, n = 4, k in 1, 10, ..., 37, with a_zero_point set to 0.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__WASMSIMD_DOT16X2_LD64, no_a_zero_point) {
  for (size_t k_size = 1; k_size <= 40; k_size += 9) {
    GemmMicrokernelTester tester;
    tester
      .mr(1).nr(4).kr(2).sr(1)
      .m(1).n(4).k(k_size)
      .a_zero_point(0)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32);
  }
}
29253
// no_b_zero_point: m = 1, n = 4, k in 1, 10, ..., 37, with b_zero_point set to 0.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__WASMSIMD_DOT16X2_LD64, no_b_zero_point) {
  for (size_t k_size = 1; k_size <= 40; k_size += 9) {
    GemmMicrokernelTester tester;
    tester
      .mr(1).nr(4).kr(2).sr(1)
      .m(1).n(4).k(k_size)
      .b_zero_point(0)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32);
  }
}
29268
// no_zero_point: m = 1, n = 4, k in 1, 10, ..., 37, with both a_zero_point
// and b_zero_point set to 0.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__WASMSIMD_DOT16X2_LD64, no_zero_point) {
  for (size_t k_size = 1; k_size <= 40; k_size += 9) {
    GemmMicrokernelTester tester;
    tester
      .mr(1).nr(4).kr(2).sr(1)
      .m(1).n(4).k(k_size)
      .a_zero_point(0)
      .b_zero_point(0)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32);
  }
}
29284 #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
29285
29286
29287 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
// k_eq_8: single configuration with m = 1, n = 4, k = 8.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__WASMSIMD_DOT16X2_LD128, k_eq_8) {
  GemmMicrokernelTester tester;
  tester
    .mr(1).nr(4).kr(2).sr(1)
    .m(1).n(4).k(8)
    .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32);
}
29299
// strided_cn: m = 1, n = 4, k = 8 with cn_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__WASMSIMD_DOT16X2_LD128, strided_cn) {
  GemmMicrokernelTester tester;
  tester
    .mr(1).nr(4).kr(2).sr(1)
    .m(1).n(4).k(8)
    .cn_stride(7)
    .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32);
}
29312
// k_eq_8_strided_a: m = 1, n = 4, k = 8 with a_stride set to 11.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__WASMSIMD_DOT16X2_LD128, k_eq_8_strided_a) {
  GemmMicrokernelTester tester;
  tester
    .mr(1).nr(4).kr(2).sr(1)
    .m(1).n(4).k(8)
    .a_stride(11)
    .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32);
}
29325
// k_eq_8_subtile: k = 8 with n in [1, 4]; the m loop covers only m = 1.
// Each configuration runs with iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__WASMSIMD_DOT16X2_LD128, k_eq_8_subtile) {
  for (uint32_t n_size = 1; n_size <= 4; ++n_size) {
    for (uint32_t m_size = 1; m_size <= 1; ++m_size) {
      GemmMicrokernelTester tester;
      tester
        .mr(1).nr(4).kr(2).sr(1)
        .m(m_size).n(n_size).k(8)
        .iterations(1)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32);
    }
  }
}
29342
// k_eq_8_subtile_m: n = 4, k = 8; the m loop covers only m = 1, with iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__WASMSIMD_DOT16X2_LD128, k_eq_8_subtile_m) {
  for (uint32_t m_size = 1; m_size <= 1; ++m_size) {
    GemmMicrokernelTester tester;
    tester
      .mr(1).nr(4).kr(2).sr(1)
      .m(m_size).n(4).k(8)
      .iterations(1)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32);
  }
}
29357
// k_eq_8_subtile_n: m = 1, k = 8, n in [1, 4], with iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__WASMSIMD_DOT16X2_LD128, k_eq_8_subtile_n) {
  for (uint32_t n_size = 1; n_size <= 4; ++n_size) {
    GemmMicrokernelTester tester;
    tester
      .mr(1).nr(4).kr(2).sr(1)
      .m(1).n(n_size).k(8)
      .iterations(1)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32);
  }
}
29372
// k_lt_8: m = 1, n = 4, k in 1..7.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__WASMSIMD_DOT16X2_LD128, k_lt_8) {
  for (size_t k_size = 1; k_size < 8; ++k_size) {
    GemmMicrokernelTester tester;
    tester
      .mr(1).nr(4).kr(2).sr(1)
      .m(1).n(4).k(k_size)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32);
  }
}
29386
// k_lt_8_strided_a: m = 1, n = 4, k in 1..7, with a_stride set to 11.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__WASMSIMD_DOT16X2_LD128, k_lt_8_strided_a) {
  for (size_t k_size = 1; k_size < 8; ++k_size) {
    GemmMicrokernelTester tester;
    tester
      .mr(1).nr(4).kr(2).sr(1)
      .m(1).n(4).k(k_size)
      .a_stride(11)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32);
  }
}
29401
// k_lt_8_subtile: k in 1..7, n in [1, 4]; the m loop covers only m = 1.
// Each configuration runs with iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__WASMSIMD_DOT16X2_LD128, k_lt_8_subtile) {
  for (size_t k_size = 1; k_size < 8; ++k_size) {
    for (uint32_t n_size = 1; n_size <= 4; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 1; ++m_size) {
        GemmMicrokernelTester tester;
        tester
          .mr(1).nr(4).kr(2).sr(1)
          .m(m_size).n(n_size).k(k_size)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32);
      }
    }
  }
}
29420
// k_gt_8: m = 1, n = 4, k in 9..15.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__WASMSIMD_DOT16X2_LD128, k_gt_8) {
  for (size_t k_size = 9; k_size < 16; ++k_size) {
    GemmMicrokernelTester tester;
    tester
      .mr(1).nr(4).kr(2).sr(1)
      .m(1).n(4).k(k_size)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32);
  }
}
29434
// k_gt_8_strided_a: m = 1, n = 4, k in 9..15, with a_stride set to 19.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__WASMSIMD_DOT16X2_LD128, k_gt_8_strided_a) {
  for (size_t k_size = 9; k_size < 16; ++k_size) {
    GemmMicrokernelTester tester;
    tester
      .mr(1).nr(4).kr(2).sr(1)
      .m(1).n(4).k(k_size)
      .a_stride(19)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32);
  }
}
29449
// k_gt_8_subtile: k in 9..15, n in [1, 4]; the m loop covers only m = 1.
// Each configuration runs with iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__WASMSIMD_DOT16X2_LD128, k_gt_8_subtile) {
  for (size_t k_size = 9; k_size < 16; ++k_size) {
    for (uint32_t n_size = 1; n_size <= 4; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 1; ++m_size) {
        GemmMicrokernelTester tester;
        tester
          .mr(1).nr(4).kr(2).sr(1)
          .m(m_size).n(n_size).k(k_size)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32);
      }
    }
  }
}
29468
// k_div_8: m = 1, n = 4, k in 16, 24, ..., 80.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__WASMSIMD_DOT16X2_LD128, k_div_8) {
  for (size_t k_size = 16; k_size <= 80; k_size += 8) {
    GemmMicrokernelTester tester;
    tester
      .mr(1).nr(4).kr(2).sr(1)
      .m(1).n(4).k(k_size)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32);
  }
}
29482
// k_div_8_strided_a: m = 1, n = 4, k in 16, 24, ..., 80, with a_stride set to 83.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__WASMSIMD_DOT16X2_LD128, k_div_8_strided_a) {
  for (size_t k_size = 16; k_size <= 80; k_size += 8) {
    GemmMicrokernelTester tester;
    tester
      .mr(1).nr(4).kr(2).sr(1)
      .m(1).n(4).k(k_size)
      .a_stride(83)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32);
  }
}
29497
// k_div_8_subtile: k in 16, 24, ..., 80, n in [1, 4]; the m loop covers only
// m = 1. Each configuration runs with iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__WASMSIMD_DOT16X2_LD128, k_div_8_subtile) {
  for (size_t k_size = 16; k_size <= 80; k_size += 8) {
    for (uint32_t n_size = 1; n_size <= 4; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 1; ++m_size) {
        GemmMicrokernelTester tester;
        tester
          .mr(1).nr(4).kr(2).sr(1)
          .m(m_size).n(n_size).k(k_size)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32);
      }
    }
  }
}
29516
// n_gt_4: m = 1, n in 5..7, and for each n, k in 1, 10, 19, 28, 37.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__WASMSIMD_DOT16X2_LD128, n_gt_4) {
  for (uint32_t n_size = 5; n_size < 8; ++n_size) {
    for (size_t k_size = 1; k_size <= 40; k_size += 9) {
      GemmMicrokernelTester tester;
      tester
        .mr(1).nr(4).kr(2).sr(1)
        .m(1).n(n_size).k(k_size)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32);
    }
  }
}
29532
// n_gt_4_strided_cn: m = 1, n in 5..7, k in 1, 10, ..., 37, with cn_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__WASMSIMD_DOT16X2_LD128, n_gt_4_strided_cn) {
  for (uint32_t n_size = 5; n_size < 8; ++n_size) {
    for (size_t k_size = 1; k_size <= 40; k_size += 9) {
      GemmMicrokernelTester tester;
      tester
        .mr(1).nr(4).kr(2).sr(1)
        .m(1).n(n_size).k(k_size)
        .cn_stride(7)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32);
    }
  }
}
29549
// n_gt_4_strided_a: m = 1, n in 5..7, k in 1, 10, ..., 37, with a_stride set to 43.
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__WASMSIMD_DOT16X2_LD128, n_gt_4_strided_a) {
  for (uint32_t n_size = 5; n_size < 8; ++n_size) {
    for (size_t k_size = 1; k_size <= 40; k_size += 9) {
      GemmMicrokernelTester tester;
      tester
        .mr(1).nr(4).kr(2).sr(1)
        .m(1).n(n_size).k(k_size)
        .a_stride(43)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32);
    }
  }
}
29566
// n_gt_4_subtile: n in 5..7, k in 1, 10, ..., 37; the m loop covers only m = 1.
// Each configuration runs with iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__WASMSIMD_DOT16X2_LD128, n_gt_4_subtile) {
  for (uint32_t n_size = 5; n_size < 8; ++n_size) {
    for (size_t k_size = 1; k_size <= 40; k_size += 9) {
      for (uint32_t m_size = 1; m_size <= 1; ++m_size) {
        GemmMicrokernelTester tester;
        tester
          .mr(1).nr(4).kr(2).sr(1)
          .m(m_size).n(n_size).k(k_size)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32);
      }
    }
  }
}
29585
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__WASMSIMD_DOT16X2_LD128, n_div_4) {
  // m = 1; n takes the multiples of 4 in [8, 12]; k sweeps 1, 10, 19, 28, 37.
  for (uint32_t n = 8; n <= 12; n += 4) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester tester;
      tester.mr(1).nr(4).kr(2).sr(1);
      tester.m(1).n(n).k(k);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld128,
          xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
29601
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__WASMSIMD_DOT16X2_LD128, n_div_4_strided_cn) {
  // n = 8, 12 and k = 1, 10, ..., 37 with cn_stride set to 7.
  for (uint32_t n = 8; n <= 12; n += 4) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester tester;
      tester.mr(1).nr(4).kr(2).sr(1);
      tester.m(1).n(n).k(k);
      tester.cn_stride(7);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld128,
          xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
29618
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__WASMSIMD_DOT16X2_LD128, n_div_4_strided_a) {
  // n = 8, 12 and k = 1, 10, ..., 37 with a_stride fixed at 43.
  for (uint32_t n = 8; n <= 12; n += 4) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester tester;
      tester.mr(1).nr(4).kr(2).sr(1);
      tester.m(1).n(n).k(k);
      tester.a_stride(43);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld128,
          xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
29635
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__WASMSIMD_DOT16X2_LD128, n_div_4_subtile) {
  // n = 8, 12; k = 1, 10, ..., 37; m ranges over 1..1; one iteration per configuration.
  for (uint32_t n = 8; n <= 12; n += 4) {
    for (size_t k = 1; k <= 40; k += 9) {
      for (uint32_t m = 1; m <= 1; ++m) {
        GemmMicrokernelTester tester;
        tester.mr(1).nr(4).kr(2).sr(1);
        tester.m(m).n(n).k(k);
        tester.iterations(1);
        tester.Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld128,
            xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
29654
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__WASMSIMD_DOT16X2_LD128, strided_cm_subtile) {
  // k = 1, 10, ..., 37 (outer loop), n = 1..4, m = 1..1, with cm_stride set to 7
  // and one iteration per configuration.
  for (size_t k = 1; k <= 40; k += 9) {
    for (uint32_t n = 1; n <= 4; ++n) {
      for (uint32_t m = 1; m <= 1; ++m) {
        GemmMicrokernelTester tester;
        tester.mr(1).nr(4).kr(2).sr(1);
        tester.m(m).n(n).k(k);
        tester.cm_stride(7).iterations(1);
        tester.Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld128,
            xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
29674
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__WASMSIMD_DOT16X2_LD128, qmin) {
  // Single m = 1, n = 4, k = 8 case with qmin set to 128.
  GemmMicrokernelTester tester;
  tester.mr(1).nr(4).kr(2).sr(1);
  tester.m(1).n(4).k(8);
  tester.qmin(128);
  tester.Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld128,
      xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
      xnn_qu8_requantize_fp32);
}
29687
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__WASMSIMD_DOT16X2_LD128, qmax) {
  // Single m = 1, n = 4, k = 8 case with qmax set to 128.
  GemmMicrokernelTester tester;
  tester.mr(1).nr(4).kr(2).sr(1);
  tester.m(1).n(4).k(8);
  tester.qmax(128);
  tester.Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld128,
      xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
      xnn_qu8_requantize_fp32);
}
29700
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__WASMSIMD_DOT16X2_LD128, strided_cm) {
  // Single m = 1, n = 4, k = 8 case with cm_stride set to 7.
  GemmMicrokernelTester tester;
  tester.mr(1).nr(4).kr(2).sr(1);
  tester.m(1).n(4).k(8);
  tester.cm_stride(7);
  tester.Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld128,
      xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
      xnn_qu8_requantize_fp32);
}
29713
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__WASMSIMD_DOT16X2_LD128, no_a_zero_point) {
  // m = 1, n = 4, k = 1, 10, ..., 37 with a_zero_point forced to 0.
  for (size_t k = 1; k <= 40; k += 9) {
    GemmMicrokernelTester tester;
    tester.mr(1).nr(4).kr(2).sr(1);
    tester.m(1).n(4).k(k);
    tester.a_zero_point(0);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld128,
        xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
        xnn_qu8_requantize_fp32);
  }
}
29728
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__WASMSIMD_DOT16X2_LD128, no_b_zero_point) {
  // m = 1, n = 4, k = 1, 10, ..., 37 with b_zero_point forced to 0.
  for (size_t k = 1; k <= 40; k += 9) {
    GemmMicrokernelTester tester;
    tester.mr(1).nr(4).kr(2).sr(1);
    tester.m(1).n(4).k(k);
    tester.b_zero_point(0);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld128,
        xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
        xnn_qu8_requantize_fp32);
  }
}
29743
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__WASMSIMD_DOT16X2_LD128, no_zero_point) {
  // m = 1, n = 4, k = 1, 10, ..., 37 with both a_zero_point and b_zero_point forced to 0.
  for (size_t k = 1; k <= 40; k += 9) {
    GemmMicrokernelTester tester;
    tester.mr(1).nr(4).kr(2).sr(1);
    tester.m(1).n(4).k(k);
    tester.a_zero_point(0).b_zero_point(0);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld128,
        xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
        xnn_qu8_requantize_fp32);
  }
}
29759 #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
29760
29761
29762 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__WASMSIMD_DOT16X2_LD64, k_eq_8) {
  // Baseline case: m = 1, n = 4, k = 8, no extra tester options.
  GemmMicrokernelTester tester;
  tester.mr(1).nr(4).kr(2).sr(4);
  tester.m(1).n(4).k(8);
  tester.Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld64,
      xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
      xnn_qu8_requantize_fp32);
}
29774
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__WASMSIMD_DOT16X2_LD64, strided_cn) {
  // m = 1, n = 4, k = 8 with cn_stride set to 7.
  GemmMicrokernelTester tester;
  tester.mr(1).nr(4).kr(2).sr(4);
  tester.m(1).n(4).k(8);
  tester.cn_stride(7);
  tester.Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld64,
      xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
      xnn_qu8_requantize_fp32);
}
29787
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__WASMSIMD_DOT16X2_LD64, k_eq_8_strided_a) {
  // m = 1, n = 4, k = 8 with a_stride set to 11.
  GemmMicrokernelTester tester;
  tester.mr(1).nr(4).kr(2).sr(4);
  tester.m(1).n(4).k(8);
  tester.a_stride(11);
  tester.Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld64,
      xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
      xnn_qu8_requantize_fp32);
}
29800
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__WASMSIMD_DOT16X2_LD64, k_eq_8_subtile) {
  // k = 8 with every (n, m) pair for n = 1..4 and m = 1..1; one iteration each.
  for (uint32_t n = 1; n <= 4; ++n) {
    for (uint32_t m = 1; m <= 1; ++m) {
      GemmMicrokernelTester tester;
      tester.mr(1).nr(4).kr(2).sr(4);
      tester.m(m).n(n).k(8);
      tester.iterations(1);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld64,
          xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
29817
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__WASMSIMD_DOT16X2_LD64, k_eq_8_subtile_m) {
  // n = 4, k = 8 while m ranges over 1..1; one iteration per configuration.
  for (uint32_t m = 1; m <= 1; ++m) {
    GemmMicrokernelTester tester;
    tester.mr(1).nr(4).kr(2).sr(4);
    tester.m(m).n(4).k(8);
    tester.iterations(1);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld64,
        xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
        xnn_qu8_requantize_fp32);
  }
}
29832
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__WASMSIMD_DOT16X2_LD64, k_eq_8_subtile_n) {
  // m = 1, k = 8 while n ranges over 1..4; one iteration per configuration.
  for (uint32_t n = 1; n <= 4; ++n) {
    GemmMicrokernelTester tester;
    tester.mr(1).nr(4).kr(2).sr(4);
    tester.m(1).n(n).k(8);
    tester.iterations(1);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld64,
        xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
        xnn_qu8_requantize_fp32);
  }
}
29847
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__WASMSIMD_DOT16X2_LD64, k_lt_8) {
  // m = 1, n = 4 for every k in 1..7.
  for (size_t k = 1; k < 8; ++k) {
    GemmMicrokernelTester tester;
    tester.mr(1).nr(4).kr(2).sr(4);
    tester.m(1).n(4).k(k);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld64,
        xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
        xnn_qu8_requantize_fp32);
  }
}
29861
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__WASMSIMD_DOT16X2_LD64, k_lt_8_strided_a) {
  // m = 1, n = 4 for every k in 1..7, with a_stride set to 11.
  for (size_t k = 1; k < 8; ++k) {
    GemmMicrokernelTester tester;
    tester.mr(1).nr(4).kr(2).sr(4);
    tester.m(1).n(4).k(k);
    tester.a_stride(11);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld64,
        xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
        xnn_qu8_requantize_fp32);
  }
}
29876
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__WASMSIMD_DOT16X2_LD64, k_lt_8_subtile) {
  // k = 1..7 crossed with n = 1..4 and m = 1..1; one iteration per configuration.
  for (size_t k = 1; k < 8; ++k) {
    for (uint32_t n = 1; n <= 4; ++n) {
      for (uint32_t m = 1; m <= 1; ++m) {
        GemmMicrokernelTester tester;
        tester.mr(1).nr(4).kr(2).sr(4);
        tester.m(m).n(n).k(k);
        tester.iterations(1);
        tester.Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld64,
            xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
29895
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__WASMSIMD_DOT16X2_LD64, k_gt_8) {
  // m = 1, n = 4 for every k in 9..15.
  for (size_t k = 9; k < 16; ++k) {
    GemmMicrokernelTester tester;
    tester.mr(1).nr(4).kr(2).sr(4);
    tester.m(1).n(4).k(k);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld64,
        xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
        xnn_qu8_requantize_fp32);
  }
}
29909
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__WASMSIMD_DOT16X2_LD64, k_gt_8_strided_a) {
  // m = 1, n = 4 for every k in 9..15, with a_stride set to 19.
  for (size_t k = 9; k < 16; ++k) {
    GemmMicrokernelTester tester;
    tester.mr(1).nr(4).kr(2).sr(4);
    tester.m(1).n(4).k(k);
    tester.a_stride(19);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld64,
        xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
        xnn_qu8_requantize_fp32);
  }
}
29924
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__WASMSIMD_DOT16X2_LD64, k_gt_8_subtile) {
  // k = 9..15 crossed with n = 1..4 and m = 1..1; one iteration per configuration.
  for (size_t k = 9; k < 16; ++k) {
    for (uint32_t n = 1; n <= 4; ++n) {
      for (uint32_t m = 1; m <= 1; ++m) {
        GemmMicrokernelTester tester;
        tester.mr(1).nr(4).kr(2).sr(4);
        tester.m(m).n(n).k(k);
        tester.iterations(1);
        tester.Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld64,
            xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
29943
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__WASMSIMD_DOT16X2_LD64, k_div_8) {
  // m = 1, n = 4 for k = 16, 24, ..., 80 (multiples of 8).
  for (size_t k = 16; k <= 80; k += 8) {
    GemmMicrokernelTester tester;
    tester.mr(1).nr(4).kr(2).sr(4);
    tester.m(1).n(4).k(k);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld64,
        xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
        xnn_qu8_requantize_fp32);
  }
}
29957
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__WASMSIMD_DOT16X2_LD64, k_div_8_strided_a) {
  // m = 1, n = 4 for k = 16, 24, ..., 80, with a_stride set to 83.
  for (size_t k = 16; k <= 80; k += 8) {
    GemmMicrokernelTester tester;
    tester.mr(1).nr(4).kr(2).sr(4);
    tester.m(1).n(4).k(k);
    tester.a_stride(83);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld64,
        xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
        xnn_qu8_requantize_fp32);
  }
}
29972
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__WASMSIMD_DOT16X2_LD64, k_div_8_subtile) {
  // k = 16, 24, ..., 80 crossed with n = 1..4 and m = 1..1; one iteration per configuration.
  for (size_t k = 16; k <= 80; k += 8) {
    for (uint32_t n = 1; n <= 4; ++n) {
      for (uint32_t m = 1; m <= 1; ++m) {
        GemmMicrokernelTester tester;
        tester.mr(1).nr(4).kr(2).sr(4);
        tester.m(m).n(n).k(k);
        tester.iterations(1);
        tester.Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld64,
            xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
29991
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__WASMSIMD_DOT16X2_LD64, n_gt_4) {
  // m = 1; n sweeps 5..7 and k sweeps 1, 10, 19, 28, 37.
  for (uint32_t n = 5; n < 8; ++n) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester tester;
      tester.mr(1).nr(4).kr(2).sr(4);
      tester.m(1).n(n).k(k);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld64,
          xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
30007
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__WASMSIMD_DOT16X2_LD64, n_gt_4_strided_cn) {
  // n = 5..7 and k = 1, 10, ..., 37 with cn_stride set to 7.
  for (uint32_t n = 5; n < 8; ++n) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester tester;
      tester.mr(1).nr(4).kr(2).sr(4);
      tester.m(1).n(n).k(k);
      tester.cn_stride(7);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld64,
          xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
30024
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__WASMSIMD_DOT16X2_LD64, n_gt_4_strided_a) {
  // n = 5..7 and k = 1, 10, ..., 37 with a_stride fixed at 43.
  for (uint32_t n = 5; n < 8; ++n) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester tester;
      tester.mr(1).nr(4).kr(2).sr(4);
      tester.m(1).n(n).k(k);
      tester.a_stride(43);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld64,
          xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
30041
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__WASMSIMD_DOT16X2_LD64, n_gt_4_subtile) {
  // n = 5..7; k = 1, 10, ..., 37; m ranges over 1..1; one iteration per configuration.
  for (uint32_t n = 5; n < 8; ++n) {
    for (size_t k = 1; k <= 40; k += 9) {
      for (uint32_t m = 1; m <= 1; ++m) {
        GemmMicrokernelTester tester;
        tester.mr(1).nr(4).kr(2).sr(4);
        tester.m(m).n(n).k(k);
        tester.iterations(1);
        tester.Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld64,
            xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
30060
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__WASMSIMD_DOT16X2_LD64, n_div_4) {
  // m = 1; n takes the multiples of 4 in [8, 12]; k sweeps 1, 10, 19, 28, 37.
  for (uint32_t n = 8; n <= 12; n += 4) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester tester;
      tester.mr(1).nr(4).kr(2).sr(4);
      tester.m(1).n(n).k(k);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld64,
          xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
30076
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__WASMSIMD_DOT16X2_LD64, n_div_4_strided_cn) {
  // n = 8, 12 and k = 1, 10, ..., 37 with cn_stride set to 7.
  for (uint32_t n = 8; n <= 12; n += 4) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester tester;
      tester.mr(1).nr(4).kr(2).sr(4);
      tester.m(1).n(n).k(k);
      tester.cn_stride(7);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld64,
          xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
30093
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__WASMSIMD_DOT16X2_LD64, n_div_4_strided_a) {
  // n = 8, 12 and k = 1, 10, ..., 37 with a_stride fixed at 43.
  for (uint32_t n = 8; n <= 12; n += 4) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester tester;
      tester.mr(1).nr(4).kr(2).sr(4);
      tester.m(1).n(n).k(k);
      tester.a_stride(43);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld64,
          xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
30110
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__WASMSIMD_DOT16X2_LD64, n_div_4_subtile) {
  // n = 8, 12; k = 1, 10, ..., 37; m ranges over 1..1; one iteration per configuration.
  for (uint32_t n = 8; n <= 12; n += 4) {
    for (size_t k = 1; k <= 40; k += 9) {
      for (uint32_t m = 1; m <= 1; ++m) {
        GemmMicrokernelTester tester;
        tester.mr(1).nr(4).kr(2).sr(4);
        tester.m(m).n(n).k(k);
        tester.iterations(1);
        tester.Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld64,
            xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
30129
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__WASMSIMD_DOT16X2_LD64, strided_cm_subtile) {
  // k = 1, 10, ..., 37 (outer loop), n = 1..4, m = 1..1, with cm_stride set to 7
  // and one iteration per configuration.
  for (size_t k = 1; k <= 40; k += 9) {
    for (uint32_t n = 1; n <= 4; ++n) {
      for (uint32_t m = 1; m <= 1; ++m) {
        GemmMicrokernelTester tester;
        tester.mr(1).nr(4).kr(2).sr(4);
        tester.m(m).n(n).k(k);
        tester.cm_stride(7).iterations(1);
        tester.Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld64,
            xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
30149
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__WASMSIMD_DOT16X2_LD64, qmin) {
  // Single m = 1, n = 4, k = 8 case with qmin set to 128.
  GemmMicrokernelTester tester;
  tester.mr(1).nr(4).kr(2).sr(4);
  tester.m(1).n(4).k(8);
  tester.qmin(128);
  tester.Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld64,
      xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
      xnn_qu8_requantize_fp32);
}
30162
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__WASMSIMD_DOT16X2_LD64, qmax) {
  // Single m = 1, n = 4, k = 8 case with qmax set to 128.
  GemmMicrokernelTester tester;
  tester.mr(1).nr(4).kr(2).sr(4);
  tester.m(1).n(4).k(8);
  tester.qmax(128);
  tester.Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld64,
      xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
      xnn_qu8_requantize_fp32);
}
30175
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__WASMSIMD_DOT16X2_LD64, strided_cm) {
  // Single m = 1, n = 4, k = 8 case with cm_stride set to 7.
  GemmMicrokernelTester tester;
  tester.mr(1).nr(4).kr(2).sr(4);
  tester.m(1).n(4).k(8);
  tester.cm_stride(7);
  tester.Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld64,
      xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
      xnn_qu8_requantize_fp32);
}
30188
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__WASMSIMD_DOT16X2_LD64, no_a_zero_point) {
  // m = 1, n = 4, k = 1, 10, ..., 37 with a_zero_point forced to 0.
  for (size_t k = 1; k <= 40; k += 9) {
    GemmMicrokernelTester tester;
    tester.mr(1).nr(4).kr(2).sr(4);
    tester.m(1).n(4).k(k);
    tester.a_zero_point(0);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld64,
        xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
        xnn_qu8_requantize_fp32);
  }
}
30203
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__WASMSIMD_DOT16X2_LD64, no_b_zero_point) {
  // m = 1, n = 4, k = 1, 10, ..., 37 with b_zero_point forced to 0.
  for (size_t k = 1; k <= 40; k += 9) {
    GemmMicrokernelTester tester;
    tester.mr(1).nr(4).kr(2).sr(4);
    tester.m(1).n(4).k(k);
    tester.b_zero_point(0);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld64,
        xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
        xnn_qu8_requantize_fp32);
  }
}
30218
TEST(QU8_GEMM_MINMAX_FP32_1X4C2S4__WASMSIMD_DOT16X2_LD64, no_zero_point) {
  // m = 1, n = 4, k = 1, 10, ..., 37 with both a_zero_point and b_zero_point forced to 0.
  for (size_t k = 1; k <= 40; k += 9) {
    GemmMicrokernelTester tester;
    tester.mr(1).nr(4).kr(2).sr(4);
    tester.m(1).n(4).k(k);
    tester.a_zero_point(0).b_zero_point(0);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld64,
        xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
        xnn_qu8_requantize_fp32);
  }
}
30234 #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
30235
30236
30237 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_DOT16X2_LD128, k_eq_8) {
  // Baseline case: m = 1, n = 4, k = 8, no extra tester options.
  GemmMicrokernelTester tester;
  tester.mr(1).nr(4).kr(8).sr(1);
  tester.m(1).n(4).k(8);
  tester.Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld128,
      xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
      xnn_qu8_requantize_fp32);
}
30249
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_DOT16X2_LD128, strided_cn) {
  // m = 1, n = 4, k = 8 with cn_stride set to 7.
  GemmMicrokernelTester tester;
  tester.mr(1).nr(4).kr(8).sr(1);
  tester.m(1).n(4).k(8);
  tester.cn_stride(7);
  tester.Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld128,
      xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
      xnn_qu8_requantize_fp32);
}
30262
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_DOT16X2_LD128, k_eq_8_strided_a) {
  // m = 1, n = 4, k = 8 with a_stride set to 11.
  GemmMicrokernelTester tester;
  tester.mr(1).nr(4).kr(8).sr(1);
  tester.m(1).n(4).k(8);
  tester.a_stride(11);
  tester.Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld128,
      xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
      xnn_qu8_requantize_fp32);
}
30275
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_DOT16X2_LD128, k_eq_8_subtile) {
  // k = 8 with every (n, m) pair for n = 1..4 and m = 1..1; one iteration each.
  for (uint32_t n = 1; n <= 4; ++n) {
    for (uint32_t m = 1; m <= 1; ++m) {
      GemmMicrokernelTester tester;
      tester.mr(1).nr(4).kr(8).sr(1);
      tester.m(m).n(n).k(8);
      tester.iterations(1);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld128,
          xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
30292
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_DOT16X2_LD128, k_eq_8_subtile_m) {
  // n = 4, k = 8 while m ranges over 1..1; one iteration per configuration.
  for (uint32_t m = 1; m <= 1; ++m) {
    GemmMicrokernelTester tester;
    tester.mr(1).nr(4).kr(8).sr(1);
    tester.m(m).n(4).k(8);
    tester.iterations(1);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld128,
        xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
        xnn_qu8_requantize_fp32);
  }
}
30307
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_DOT16X2_LD128, k_eq_8_subtile_n) {
  // m = 1, k = 8 while n ranges over 1..4; one iteration per configuration.
  for (uint32_t n = 1; n <= 4; ++n) {
    GemmMicrokernelTester tester;
    tester.mr(1).nr(4).kr(8).sr(1);
    tester.m(1).n(n).k(8);
    tester.iterations(1);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld128,
        xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
        xnn_qu8_requantize_fp32);
  }
}
30322
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_DOT16X2_LD128, k_lt_8) {
  // m = 1, n = 4 for every k in 1..7.
  for (size_t k = 1; k < 8; ++k) {
    GemmMicrokernelTester tester;
    tester.mr(1).nr(4).kr(8).sr(1);
    tester.m(1).n(4).k(k);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld128,
        xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
        xnn_qu8_requantize_fp32);
  }
}
30336
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_DOT16X2_LD128, k_lt_8_strided_a) {
  // m = 1, n = 4 for every k in 1..7, with a_stride set to 11.
  for (size_t k = 1; k < 8; ++k) {
    GemmMicrokernelTester tester;
    tester.mr(1).nr(4).kr(8).sr(1);
    tester.m(1).n(4).k(k);
    tester.a_stride(11);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld128,
        xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
        xnn_qu8_requantize_fp32);
  }
}
30351
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_DOT16X2_LD128, k_lt_8_subtile) {
  // k = 1..7 crossed with n = 1..4 and m = 1..1; one iteration per configuration.
  for (size_t k = 1; k < 8; ++k) {
    for (uint32_t n = 1; n <= 4; ++n) {
      for (uint32_t m = 1; m <= 1; ++m) {
        GemmMicrokernelTester tester;
        tester.mr(1).nr(4).kr(8).sr(1);
        tester.m(m).n(n).k(k);
        tester.iterations(1);
        tester.Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld128,
            xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
30370
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_DOT16X2_LD128, k_gt_8) {
  // m = 1, n = 4 for every k in 9..15.
  for (size_t k = 9; k < 16; ++k) {
    GemmMicrokernelTester tester;
    tester.mr(1).nr(4).kr(8).sr(1);
    tester.m(1).n(4).k(k);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld128,
        xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
        xnn_qu8_requantize_fp32);
  }
}
30384
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_DOT16X2_LD128, k_gt_8_strided_a) {
  // m = 1, n = 4 for every k in 9..15, with a_stride set to 19.
  for (size_t k = 9; k < 16; ++k) {
    GemmMicrokernelTester tester;
    tester.mr(1).nr(4).kr(8).sr(1);
    tester.m(1).n(4).k(k);
    tester.a_stride(19);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld128,
        xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
        xnn_qu8_requantize_fp32);
  }
}
30399
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_DOT16X2_LD128, k_gt_8_subtile) {
  // k = 9..15 crossed with n = 1..4 and m = 1..1; one iteration per configuration.
  for (size_t k = 9; k < 16; ++k) {
    for (uint32_t n = 1; n <= 4; ++n) {
      for (uint32_t m = 1; m <= 1; ++m) {
        GemmMicrokernelTester tester;
        tester.mr(1).nr(4).kr(8).sr(1);
        tester.m(m).n(n).k(k);
        tester.iterations(1);
        tester.Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld128,
            xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
30418
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_DOT16X2_LD128, k_div_8) {
  // m = 1, n = 4 for k = 16, 24, ..., 80 (multiples of 8).
  for (size_t k = 16; k <= 80; k += 8) {
    GemmMicrokernelTester tester;
    tester.mr(1).nr(4).kr(8).sr(1);
    tester.m(1).n(4).k(k);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld128,
        xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
        xnn_qu8_requantize_fp32);
  }
}
30432
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_DOT16X2_LD128, k_div_8_strided_a) {
  // m = 1, n = 4 for k = 16, 24, ..., 80, with a_stride set to 83.
  for (size_t k = 16; k <= 80; k += 8) {
    GemmMicrokernelTester tester;
    tester.mr(1).nr(4).kr(8).sr(1);
    tester.m(1).n(4).k(k);
    tester.a_stride(83);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld128,
        xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
        xnn_qu8_requantize_fp32);
  }
}
30447
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_DOT16X2_LD128, k_div_8_subtile) {
  // k = 16, 24, ..., 80 crossed with n = 1..4 and m = 1..1; one iteration per configuration.
  for (size_t k = 16; k <= 80; k += 8) {
    for (uint32_t n = 1; n <= 4; ++n) {
      for (uint32_t m = 1; m <= 1; ++m) {
        GemmMicrokernelTester tester;
        tester.mr(1).nr(4).kr(8).sr(1);
        tester.m(m).n(n).k(k);
        tester.iterations(1);
        tester.Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld128,
            xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
30466
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_DOT16X2_LD128, n_gt_4) {
  // m = 1; n sweeps 5..7 and k sweeps 1, 10, 19, 28, 37.
  for (uint32_t n = 5; n < 8; ++n) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester tester;
      tester.mr(1).nr(4).kr(8).sr(1);
      tester.m(1).n(n).k(k);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld128,
          xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
30482
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_DOT16X2_LD128, n_gt_4_strided_cn) {
  // n = 5..7 and k = 1, 10, ..., 37 with cn_stride set to 7.
  for (uint32_t n = 5; n < 8; ++n) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester tester;
      tester.mr(1).nr(4).kr(8).sr(1);
      tester.m(1).n(n).k(k);
      tester.cn_stride(7);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld128,
          xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
30499
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_DOT16X2_LD128, n_gt_4_strided_a) {
  // n = 5..7 and k = 1, 10, ..., 37 with a_stride fixed at 43.
  for (uint32_t n = 5; n < 8; ++n) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester tester;
      tester.mr(1).nr(4).kr(8).sr(1);
      tester.m(1).n(n).k(k);
      tester.a_stride(43);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld128,
          xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
30516
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_DOT16X2_LD128, n_gt_4_subtile) {
  // n = 5..7; k = 1, 10, ..., 37; m ranges over 1..1; one iteration per configuration.
  for (uint32_t n = 5; n < 8; ++n) {
    for (size_t k = 1; k <= 40; k += 9) {
      for (uint32_t m = 1; m <= 1; ++m) {
        GemmMicrokernelTester tester;
        tester.mr(1).nr(4).kr(8).sr(1);
        tester.m(m).n(n).k(k);
        tester.iterations(1);
        tester.Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld128,
            xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
30535
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_DOT16X2_LD128, n_div_4) {
  // m = 1; n takes the multiples of 4 in [8, 12]; k sweeps 1, 10, 19, 28, 37.
  for (uint32_t n = 8; n <= 12; n += 4) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester tester;
      tester.mr(1).nr(4).kr(8).sr(1);
      tester.m(1).n(n).k(k);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld128,
          xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
30551
// 1x4c8 kernel: n in {8, 12}, k in {1, 10, 19, 28, 37}, with cn_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_DOT16X2_LD128, n_div_4_strided_cn) {
  for (uint32_t n_size = 8; n_size <= 12; n_size += 4) {
    for (size_t k_size = 1; k_size <= 40; k_size += 9) {
      GemmMicrokernelTester()
          .mr(1).nr(4).kr(8).sr(1)
          .m(1).n(n_size).k(k_size)
          .cn_stride(7)
          .Test(
              xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld128,
              xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
30568
// 1x4c8 kernel: n in {8, 12}, k in {1, 10, 19, 28, 37}, with a_stride set to 43.
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_DOT16X2_LD128, n_div_4_strided_a) {
  for (uint32_t n_size = 8; n_size <= 12; n_size += 4) {
    for (size_t k_size = 1; k_size <= 40; k_size += 9) {
      GemmMicrokernelTester()
          .mr(1).nr(4).kr(8).sr(1)
          .m(1).n(n_size).k(k_size)
          .a_stride(43)
          .Test(
              xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld128,
              xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
30585
// 1x4c8 kernel: n in {8, 12}, k in {1, 10, 19, 28, 37}, m in 1..1; one iteration each.
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_DOT16X2_LD128, n_div_4_subtile) {
  for (uint32_t n_size = 8; n_size <= 12; n_size += 4) {
    for (size_t k_size = 1; k_size <= 40; k_size += 9) {
      for (uint32_t m_size = 1; m_size <= 1; ++m_size) {
        GemmMicrokernelTester()
            .mr(1).nr(4).kr(8).sr(1)
            .m(m_size).n(n_size).k(k_size)
            .iterations(1)
            .Test(
                xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld128,
                xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
                xnn_qu8_requantize_fp32);
      }
    }
  }
}
30604
// 1x4c8 kernel: k in {1, 10, 19, 28, 37}, n in 1..4, m in 1..1, with cm_stride
// set to 7; one iteration each.
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_DOT16X2_LD128, strided_cm_subtile) {
  for (size_t k_size = 1; k_size <= 40; k_size += 9) {
    for (uint32_t n_size = 1; n_size <= 4; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 1; ++m_size) {
        GemmMicrokernelTester()
            .mr(1).nr(4).kr(8).sr(1)
            .m(m_size).n(n_size).k(k_size)
            .cm_stride(7)
            .iterations(1)
            .Test(
                xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld128,
                xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
                xnn_qu8_requantize_fp32);
      }
    }
  }
}
30624
// 1x4c8 kernel: m = 1, n = 4, k = 8, with qmin set to 128.
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_DOT16X2_LD128, qmin) {
  GemmMicrokernelTester()
      .mr(1).nr(4).kr(8).sr(1)
      .m(1).n(4).k(8)
      .qmin(128)
      .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld128,
          xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
          xnn_qu8_requantize_fp32);
}
30637
// 1x4c8 kernel: m = 1, n = 4, k = 8, with qmax set to 128.
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_DOT16X2_LD128, qmax) {
  GemmMicrokernelTester()
      .mr(1).nr(4).kr(8).sr(1)
      .m(1).n(4).k(8)
      .qmax(128)
      .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld128,
          xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
          xnn_qu8_requantize_fp32);
}
30650
// 1x4c8 kernel: m = 1, n = 4, k = 8, with cm_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_DOT16X2_LD128, strided_cm) {
  GemmMicrokernelTester()
      .mr(1).nr(4).kr(8).sr(1)
      .m(1).n(4).k(8)
      .cm_stride(7)
      .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld128,
          xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
          xnn_qu8_requantize_fp32);
}
30663
// 1x4c8 kernel: k in {1, 10, 19, 28, 37}, with a_zero_point set to 0.
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_DOT16X2_LD128, no_a_zero_point) {
  for (size_t k_size = 1; k_size <= 40; k_size += 9) {
    GemmMicrokernelTester()
        .mr(1).nr(4).kr(8).sr(1)
        .m(1).n(4).k(k_size)
        .a_zero_point(0)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld128,
            xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
            xnn_qu8_requantize_fp32);
  }
}
30678
// 1x4c8 kernel: k in {1, 10, 19, 28, 37}, with b_zero_point set to 0.
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_DOT16X2_LD128, no_b_zero_point) {
  for (size_t k_size = 1; k_size <= 40; k_size += 9) {
    GemmMicrokernelTester()
        .mr(1).nr(4).kr(8).sr(1)
        .m(1).n(4).k(k_size)
        .b_zero_point(0)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld128,
            xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
            xnn_qu8_requantize_fp32);
  }
}
30693
// 1x4c8 kernel: k in {1, 10, 19, 28, 37}, with both a_zero_point and
// b_zero_point set to 0.
TEST(QU8_GEMM_MINMAX_FP32_1X4C8__WASMSIMD_DOT16X2_LD128, no_zero_point) {
  for (size_t k_size = 1; k_size <= 40; k_size += 9) {
    GemmMicrokernelTester()
        .mr(1).nr(4).kr(8).sr(1)
        .m(1).n(4).k(k_size)
        .a_zero_point(0)
        .b_zero_point(0)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld128,
            xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
            xnn_qu8_requantize_fp32);
  }
}
30709 #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
30710
30711
30712 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
// 2x4c2 kernel: m = mr = 2, n = nr = 4, k = 8.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__WASMSIMD_DOT16X2_LD128, k_eq_8) {
  GemmMicrokernelTester()
      .mr(2).nr(4).kr(2).sr(1)
      .m(2).n(4).k(8)
      .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__wasmsimd_dot16x2_ld128,
          xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
          xnn_qu8_requantize_fp32);
}
30724
// 2x4c2 kernel: m = 2, n = 4, k = 8, with cn_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__WASMSIMD_DOT16X2_LD128, strided_cn) {
  GemmMicrokernelTester()
      .mr(2).nr(4).kr(2).sr(1)
      .m(2).n(4).k(8)
      .cn_stride(7)
      .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__wasmsimd_dot16x2_ld128,
          xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
          xnn_qu8_requantize_fp32);
}
30737
// 2x4c2 kernel: m = 2, n = 4, k = 8, with a_stride set to 11.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__WASMSIMD_DOT16X2_LD128, k_eq_8_strided_a) {
  GemmMicrokernelTester()
      .mr(2).nr(4).kr(2).sr(1)
      .m(2).n(4).k(8)
      .a_stride(11)
      .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__wasmsimd_dot16x2_ld128,
          xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
          xnn_qu8_requantize_fp32);
}
30750
// 2x4c2 kernel: k = 8, n in 1..4, m in 1..2; one iteration each.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__WASMSIMD_DOT16X2_LD128, k_eq_8_subtile) {
  for (uint32_t n_size = 1; n_size <= 4; ++n_size) {
    for (uint32_t m_size = 1; m_size <= 2; ++m_size) {
      GemmMicrokernelTester()
          .mr(2).nr(4).kr(2).sr(1)
          .m(m_size).n(n_size).k(8)
          .iterations(1)
          .Test(
              xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__wasmsimd_dot16x2_ld128,
              xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
30767
// 2x4c2 kernel: k = 8, n = 4, m in 1..2; one iteration each.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__WASMSIMD_DOT16X2_LD128, k_eq_8_subtile_m) {
  for (uint32_t m_size = 1; m_size <= 2; ++m_size) {
    GemmMicrokernelTester()
        .mr(2).nr(4).kr(2).sr(1)
        .m(m_size).n(4).k(8)
        .iterations(1)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__wasmsimd_dot16x2_ld128,
            xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
            xnn_qu8_requantize_fp32);
  }
}
30782
// 2x4c2 kernel: k = 8, m = 2, n in 1..4; one iteration each.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__WASMSIMD_DOT16X2_LD128, k_eq_8_subtile_n) {
  for (uint32_t n_size = 1; n_size <= 4; ++n_size) {
    GemmMicrokernelTester()
        .mr(2).nr(4).kr(2).sr(1)
        .m(2).n(n_size).k(8)
        .iterations(1)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__wasmsimd_dot16x2_ld128,
            xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
            xnn_qu8_requantize_fp32);
  }
}
30797
// 2x4c2 kernel: m = 2, n = 4, k in 1..7.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__WASMSIMD_DOT16X2_LD128, k_lt_8) {
  for (size_t k_size = 1; k_size < 8; ++k_size) {
    GemmMicrokernelTester()
        .mr(2).nr(4).kr(2).sr(1)
        .m(2).n(4).k(k_size)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__wasmsimd_dot16x2_ld128,
            xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
            xnn_qu8_requantize_fp32);
  }
}
30811
// 2x4c2 kernel: m = 2, n = 4, k in 1..7, with a_stride set to 11.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__WASMSIMD_DOT16X2_LD128, k_lt_8_strided_a) {
  for (size_t k_size = 1; k_size < 8; ++k_size) {
    GemmMicrokernelTester()
        .mr(2).nr(4).kr(2).sr(1)
        .m(2).n(4).k(k_size)
        .a_stride(11)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__wasmsimd_dot16x2_ld128,
            xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
            xnn_qu8_requantize_fp32);
  }
}
30826
// 2x4c2 kernel: k in 1..7, n in 1..4, m in 1..2; one iteration each.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__WASMSIMD_DOT16X2_LD128, k_lt_8_subtile) {
  for (size_t k_size = 1; k_size < 8; ++k_size) {
    for (uint32_t n_size = 1; n_size <= 4; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 2; ++m_size) {
        GemmMicrokernelTester()
            .mr(2).nr(4).kr(2).sr(1)
            .m(m_size).n(n_size).k(k_size)
            .iterations(1)
            .Test(
                xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__wasmsimd_dot16x2_ld128,
                xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
                xnn_qu8_requantize_fp32);
      }
    }
  }
}
30845
// 2x4c2 kernel: m = 2, n = 4, k in 9..15.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__WASMSIMD_DOT16X2_LD128, k_gt_8) {
  for (size_t k_size = 9; k_size < 16; ++k_size) {
    GemmMicrokernelTester()
        .mr(2).nr(4).kr(2).sr(1)
        .m(2).n(4).k(k_size)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__wasmsimd_dot16x2_ld128,
            xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
            xnn_qu8_requantize_fp32);
  }
}
30859
// 2x4c2 kernel: m = 2, n = 4, k in 9..15, with a_stride set to 19.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__WASMSIMD_DOT16X2_LD128, k_gt_8_strided_a) {
  for (size_t k_size = 9; k_size < 16; ++k_size) {
    GemmMicrokernelTester()
        .mr(2).nr(4).kr(2).sr(1)
        .m(2).n(4).k(k_size)
        .a_stride(19)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__wasmsimd_dot16x2_ld128,
            xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
            xnn_qu8_requantize_fp32);
  }
}
30874
// 2x4c2 kernel: k in 9..15, n in 1..4, m in 1..2; one iteration each.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__WASMSIMD_DOT16X2_LD128, k_gt_8_subtile) {
  for (size_t k_size = 9; k_size < 16; ++k_size) {
    for (uint32_t n_size = 1; n_size <= 4; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 2; ++m_size) {
        GemmMicrokernelTester()
            .mr(2).nr(4).kr(2).sr(1)
            .m(m_size).n(n_size).k(k_size)
            .iterations(1)
            .Test(
                xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__wasmsimd_dot16x2_ld128,
                xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
                xnn_qu8_requantize_fp32);
      }
    }
  }
}
30893
// 2x4c2 kernel: m = 2, n = 4, k in 16, 24, ..., 80.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__WASMSIMD_DOT16X2_LD128, k_div_8) {
  for (size_t k_size = 16; k_size <= 80; k_size += 8) {
    GemmMicrokernelTester()
        .mr(2).nr(4).kr(2).sr(1)
        .m(2).n(4).k(k_size)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__wasmsimd_dot16x2_ld128,
            xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
            xnn_qu8_requantize_fp32);
  }
}
30907
// 2x4c2 kernel: m = 2, n = 4, k in 16, 24, ..., 80, with a_stride set to 83.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__WASMSIMD_DOT16X2_LD128, k_div_8_strided_a) {
  for (size_t k_size = 16; k_size <= 80; k_size += 8) {
    GemmMicrokernelTester()
        .mr(2).nr(4).kr(2).sr(1)
        .m(2).n(4).k(k_size)
        .a_stride(83)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__wasmsimd_dot16x2_ld128,
            xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
            xnn_qu8_requantize_fp32);
  }
}
30922
// 2x4c2 kernel: k in 16, 24, ..., 80, n in 1..4, m in 1..2; one iteration each.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__WASMSIMD_DOT16X2_LD128, k_div_8_subtile) {
  for (size_t k_size = 16; k_size <= 80; k_size += 8) {
    for (uint32_t n_size = 1; n_size <= 4; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 2; ++m_size) {
        GemmMicrokernelTester()
            .mr(2).nr(4).kr(2).sr(1)
            .m(m_size).n(n_size).k(k_size)
            .iterations(1)
            .Test(
                xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__wasmsimd_dot16x2_ld128,
                xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
                xnn_qu8_requantize_fp32);
      }
    }
  }
}
30941
// 2x4c2 kernel: m = 2, n in 5..7, k in {1, 10, 19, 28, 37}.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__WASMSIMD_DOT16X2_LD128, n_gt_4) {
  for (uint32_t n_size = 5; n_size < 8; ++n_size) {
    for (size_t k_size = 1; k_size <= 40; k_size += 9) {
      GemmMicrokernelTester()
          .mr(2).nr(4).kr(2).sr(1)
          .m(2).n(n_size).k(k_size)
          .Test(
              xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__wasmsimd_dot16x2_ld128,
              xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
30957
// 2x4c2 kernel: m = 2, n in 5..7, k in {1, 10, 19, 28, 37}, with cn_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__WASMSIMD_DOT16X2_LD128, n_gt_4_strided_cn) {
  for (uint32_t n_size = 5; n_size < 8; ++n_size) {
    for (size_t k_size = 1; k_size <= 40; k_size += 9) {
      GemmMicrokernelTester()
          .mr(2).nr(4).kr(2).sr(1)
          .m(2).n(n_size).k(k_size)
          .cn_stride(7)
          .Test(
              xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__wasmsimd_dot16x2_ld128,
              xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
30974
// 2x4c2 kernel: m = 2, n in 5..7, k in {1, 10, 19, 28, 37}, with a_stride set to 43.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__WASMSIMD_DOT16X2_LD128, n_gt_4_strided_a) {
  for (uint32_t n_size = 5; n_size < 8; ++n_size) {
    for (size_t k_size = 1; k_size <= 40; k_size += 9) {
      GemmMicrokernelTester()
          .mr(2).nr(4).kr(2).sr(1)
          .m(2).n(n_size).k(k_size)
          .a_stride(43)
          .Test(
              xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__wasmsimd_dot16x2_ld128,
              xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
30991
// 2x4c2 kernel: n in 5..7, k in {1, 10, 19, 28, 37}, m in 1..2; one iteration each.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__WASMSIMD_DOT16X2_LD128, n_gt_4_subtile) {
  for (uint32_t n_size = 5; n_size < 8; ++n_size) {
    for (size_t k_size = 1; k_size <= 40; k_size += 9) {
      for (uint32_t m_size = 1; m_size <= 2; ++m_size) {
        GemmMicrokernelTester()
            .mr(2).nr(4).kr(2).sr(1)
            .m(m_size).n(n_size).k(k_size)
            .iterations(1)
            .Test(
                xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__wasmsimd_dot16x2_ld128,
                xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
                xnn_qu8_requantize_fp32);
      }
    }
  }
}
31010
// 2x4c2 kernel: m = 2, n in {8, 12}, k in {1, 10, 19, 28, 37}.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__WASMSIMD_DOT16X2_LD128, n_div_4) {
  for (uint32_t n_size = 8; n_size <= 12; n_size += 4) {
    for (size_t k_size = 1; k_size <= 40; k_size += 9) {
      GemmMicrokernelTester()
          .mr(2).nr(4).kr(2).sr(1)
          .m(2).n(n_size).k(k_size)
          .Test(
              xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__wasmsimd_dot16x2_ld128,
              xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
31026
// 2x4c2 kernel: m = 2, n in {8, 12}, k in {1, 10, 19, 28, 37}, with cn_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__WASMSIMD_DOT16X2_LD128, n_div_4_strided_cn) {
  for (uint32_t n_size = 8; n_size <= 12; n_size += 4) {
    for (size_t k_size = 1; k_size <= 40; k_size += 9) {
      GemmMicrokernelTester()
          .mr(2).nr(4).kr(2).sr(1)
          .m(2).n(n_size).k(k_size)
          .cn_stride(7)
          .Test(
              xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__wasmsimd_dot16x2_ld128,
              xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
31043
// 2x4c2 kernel: m = 2, n in {8, 12}, k in {1, 10, 19, 28, 37}, with a_stride set to 43.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__WASMSIMD_DOT16X2_LD128, n_div_4_strided_a) {
  for (uint32_t n_size = 8; n_size <= 12; n_size += 4) {
    for (size_t k_size = 1; k_size <= 40; k_size += 9) {
      GemmMicrokernelTester()
          .mr(2).nr(4).kr(2).sr(1)
          .m(2).n(n_size).k(k_size)
          .a_stride(43)
          .Test(
              xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__wasmsimd_dot16x2_ld128,
              xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
31060
// 2x4c2 kernel: n in {8, 12}, k in {1, 10, 19, 28, 37}, m in 1..2; one iteration each.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__WASMSIMD_DOT16X2_LD128, n_div_4_subtile) {
  for (uint32_t n_size = 8; n_size <= 12; n_size += 4) {
    for (size_t k_size = 1; k_size <= 40; k_size += 9) {
      for (uint32_t m_size = 1; m_size <= 2; ++m_size) {
        GemmMicrokernelTester()
            .mr(2).nr(4).kr(2).sr(1)
            .m(m_size).n(n_size).k(k_size)
            .iterations(1)
            .Test(
                xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__wasmsimd_dot16x2_ld128,
                xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
                xnn_qu8_requantize_fp32);
      }
    }
  }
}
31079
// 2x4c2 kernel: k in {1, 10, 19, 28, 37}, n in 1..4, m in 1..2, with cm_stride
// set to 7; one iteration each.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__WASMSIMD_DOT16X2_LD128, strided_cm_subtile) {
  for (size_t k_size = 1; k_size <= 40; k_size += 9) {
    for (uint32_t n_size = 1; n_size <= 4; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 2; ++m_size) {
        GemmMicrokernelTester()
            .mr(2).nr(4).kr(2).sr(1)
            .m(m_size).n(n_size).k(k_size)
            .cm_stride(7)
            .iterations(1)
            .Test(
                xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__wasmsimd_dot16x2_ld128,
                xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
                xnn_qu8_requantize_fp32);
      }
    }
  }
}
31099
// 2x4c2 kernel: m = 2, n = 4, k = 8, with qmin set to 128.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__WASMSIMD_DOT16X2_LD128, qmin) {
  GemmMicrokernelTester()
      .mr(2).nr(4).kr(2).sr(1)
      .m(2).n(4).k(8)
      .qmin(128)
      .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__wasmsimd_dot16x2_ld128,
          xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
          xnn_qu8_requantize_fp32);
}
31112
// 2x4c2 kernel: m = 2, n = 4, k = 8, with qmax set to 128.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__WASMSIMD_DOT16X2_LD128, qmax) {
  GemmMicrokernelTester()
      .mr(2).nr(4).kr(2).sr(1)
      .m(2).n(4).k(8)
      .qmax(128)
      .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__wasmsimd_dot16x2_ld128,
          xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
          xnn_qu8_requantize_fp32);
}
31125
// 2x4c2 kernel: m = 2, n = 4, k = 8, with cm_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__WASMSIMD_DOT16X2_LD128, strided_cm) {
  GemmMicrokernelTester()
      .mr(2).nr(4).kr(2).sr(1)
      .m(2).n(4).k(8)
      .cm_stride(7)
      .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__wasmsimd_dot16x2_ld128,
          xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
          xnn_qu8_requantize_fp32);
}
31138
// 2x4c2 kernel: k in {1, 10, 19, 28, 37}, with a_zero_point set to 0.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__WASMSIMD_DOT16X2_LD128, no_a_zero_point) {
  for (size_t k_size = 1; k_size <= 40; k_size += 9) {
    GemmMicrokernelTester()
        .mr(2).nr(4).kr(2).sr(1)
        .m(2).n(4).k(k_size)
        .a_zero_point(0)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__wasmsimd_dot16x2_ld128,
            xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
            xnn_qu8_requantize_fp32);
  }
}
31153
// 2x4c2 kernel: k in {1, 10, 19, 28, 37}, with b_zero_point set to 0.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__WASMSIMD_DOT16X2_LD128, no_b_zero_point) {
  for (size_t k_size = 1; k_size <= 40; k_size += 9) {
    GemmMicrokernelTester()
        .mr(2).nr(4).kr(2).sr(1)
        .m(2).n(4).k(k_size)
        .b_zero_point(0)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__wasmsimd_dot16x2_ld128,
            xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
            xnn_qu8_requantize_fp32);
  }
}
31168
// 2x4c2 kernel: k in {1, 10, 19, 28, 37}, with both a_zero_point and
// b_zero_point set to 0.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2__WASMSIMD_DOT16X2_LD128, no_zero_point) {
  for (size_t k_size = 1; k_size <= 40; k_size += 9) {
    GemmMicrokernelTester()
        .mr(2).nr(4).kr(2).sr(1)
        .m(2).n(4).k(k_size)
        .a_zero_point(0)
        .b_zero_point(0)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__wasmsimd_dot16x2_ld128,
            xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
            xnn_qu8_requantize_fp32);
  }
}
31184 #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
31185
31186
31187 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
// 2x4c2s4 kernel: m = mr = 2, n = nr = 4, k = 8.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__WASMSIMD_DOT16X2_LD64, k_eq_8) {
  GemmMicrokernelTester()
      .mr(2).nr(4).kr(2).sr(4)
      .m(2).n(4).k(8)
      .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld64,
          xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
          xnn_qu8_requantize_fp32);
}
31199
// 2x4c2s4 kernel: m = 2, n = 4, k = 8, with cn_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__WASMSIMD_DOT16X2_LD64, strided_cn) {
  GemmMicrokernelTester()
      .mr(2).nr(4).kr(2).sr(4)
      .m(2).n(4).k(8)
      .cn_stride(7)
      .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld64,
          xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
          xnn_qu8_requantize_fp32);
}
31212
// 2x4c2s4 kernel: m = 2, n = 4, k = 8, with a_stride set to 11.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__WASMSIMD_DOT16X2_LD64, k_eq_8_strided_a) {
  GemmMicrokernelTester()
      .mr(2).nr(4).kr(2).sr(4)
      .m(2).n(4).k(8)
      .a_stride(11)
      .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld64,
          xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
          xnn_qu8_requantize_fp32);
}
31225
// 2x4c2s4 kernel: k = 8, n in 1..4, m in 1..2; one iteration each.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__WASMSIMD_DOT16X2_LD64, k_eq_8_subtile) {
  for (uint32_t n_size = 1; n_size <= 4; ++n_size) {
    for (uint32_t m_size = 1; m_size <= 2; ++m_size) {
      GemmMicrokernelTester()
          .mr(2).nr(4).kr(2).sr(4)
          .m(m_size).n(n_size).k(8)
          .iterations(1)
          .Test(
              xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld64,
              xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
31242
// 2x4c2s4 kernel: k = 8, n = 4, m in 1..2; one iteration each.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__WASMSIMD_DOT16X2_LD64, k_eq_8_subtile_m) {
  for (uint32_t m_size = 1; m_size <= 2; ++m_size) {
    GemmMicrokernelTester()
        .mr(2).nr(4).kr(2).sr(4)
        .m(m_size).n(4).k(8)
        .iterations(1)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld64,
            xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
            xnn_qu8_requantize_fp32);
  }
}
31257
// 2x4c2s4 kernel: k = 8, m = 2, n in 1..4; one iteration each.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__WASMSIMD_DOT16X2_LD64, k_eq_8_subtile_n) {
  for (uint32_t n_size = 1; n_size <= 4; ++n_size) {
    GemmMicrokernelTester()
        .mr(2).nr(4).kr(2).sr(4)
        .m(2).n(n_size).k(8)
        .iterations(1)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld64,
            xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
            xnn_qu8_requantize_fp32);
  }
}
31272
// 2x4c2s4 kernel: m = 2, n = 4, k in 1..7.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__WASMSIMD_DOT16X2_LD64, k_lt_8) {
  for (size_t k_size = 1; k_size < 8; ++k_size) {
    GemmMicrokernelTester()
        .mr(2).nr(4).kr(2).sr(4)
        .m(2).n(4).k(k_size)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld64,
            xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
            xnn_qu8_requantize_fp32);
  }
}
31286
// 2x4c2s4 kernel: m = 2, n = 4, k in 1..7, with a_stride set to 11.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__WASMSIMD_DOT16X2_LD64, k_lt_8_strided_a) {
  for (size_t k_size = 1; k_size < 8; ++k_size) {
    GemmMicrokernelTester()
        .mr(2).nr(4).kr(2).sr(4)
        .m(2).n(4).k(k_size)
        .a_stride(11)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld64,
            xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
            xnn_qu8_requantize_fp32);
  }
}
31301
// 2x4c2s4 kernel: k in 1..7, n in 1..4, m in 1..2; one iteration each.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__WASMSIMD_DOT16X2_LD64, k_lt_8_subtile) {
  for (size_t k_size = 1; k_size < 8; ++k_size) {
    for (uint32_t n_size = 1; n_size <= 4; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 2; ++m_size) {
        GemmMicrokernelTester()
            .mr(2).nr(4).kr(2).sr(4)
            .m(m_size).n(n_size).k(k_size)
            .iterations(1)
            .Test(
                xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld64,
                xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
                xnn_qu8_requantize_fp32);
      }
    }
  }
}
31320
// 2x4c2s4 kernel: m = 2, n = 4, k in 9..15.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__WASMSIMD_DOT16X2_LD64, k_gt_8) {
  for (size_t k_size = 9; k_size < 16; ++k_size) {
    GemmMicrokernelTester()
        .mr(2).nr(4).kr(2).sr(4)
        .m(2).n(4).k(k_size)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld64,
            xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
            xnn_qu8_requantize_fp32);
  }
}
31334
// 2x4c2s4 kernel: m = 2, n = 4, k in 9..15, with a_stride set to 19.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__WASMSIMD_DOT16X2_LD64, k_gt_8_strided_a) {
  for (size_t k_size = 9; k_size < 16; ++k_size) {
    GemmMicrokernelTester()
        .mr(2).nr(4).kr(2).sr(4)
        .m(2).n(4).k(k_size)
        .a_stride(19)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld64,
            xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
            xnn_qu8_requantize_fp32);
  }
}
31349
// 2x4c2s4 kernel: k in 9..15, n in 1..4, m in 1..2; one iteration each.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__WASMSIMD_DOT16X2_LD64, k_gt_8_subtile) {
  for (size_t k_size = 9; k_size < 16; ++k_size) {
    for (uint32_t n_size = 1; n_size <= 4; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 2; ++m_size) {
        GemmMicrokernelTester()
            .mr(2).nr(4).kr(2).sr(4)
            .m(m_size).n(n_size).k(k_size)
            .iterations(1)
            .Test(
                xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld64,
                xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
                xnn_qu8_requantize_fp32);
      }
    }
  }
}
31368
// 2x4c2s4 kernel: m = 2, n = 4, k in 16, 24, ..., 80.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__WASMSIMD_DOT16X2_LD64, k_div_8) {
  for (size_t k_size = 16; k_size <= 80; k_size += 8) {
    GemmMicrokernelTester()
        .mr(2).nr(4).kr(2).sr(4)
        .m(2).n(4).k(k_size)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld64,
            xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
            xnn_qu8_requantize_fp32);
  }
}
31382
// 2x4c2s4 kernel: m = 2, n = 4, k in 16, 24, ..., 80, with a_stride set to 83.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__WASMSIMD_DOT16X2_LD64, k_div_8_strided_a) {
  for (size_t k_size = 16; k_size <= 80; k_size += 8) {
    GemmMicrokernelTester()
        .mr(2).nr(4).kr(2).sr(4)
        .m(2).n(4).k(k_size)
        .a_stride(83)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld64,
            xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
            xnn_qu8_requantize_fp32);
  }
}
31397
// 2x4c2s4 kernel: k in 16, 24, ..., 80, n in 1..4, m in 1..2; one iteration each.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__WASMSIMD_DOT16X2_LD64, k_div_8_subtile) {
  for (size_t k_size = 16; k_size <= 80; k_size += 8) {
    for (uint32_t n_size = 1; n_size <= 4; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 2; ++m_size) {
        GemmMicrokernelTester()
            .mr(2).nr(4).kr(2).sr(4)
            .m(m_size).n(n_size).k(k_size)
            .iterations(1)
            .Test(
                xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld64,
                xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
                xnn_qu8_requantize_fp32);
      }
    }
  }
}
31416
// 2x4c2s4 kernel: m = 2, n in 5..7, k in {1, 10, 19, 28, 37}.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__WASMSIMD_DOT16X2_LD64, n_gt_4) {
  for (uint32_t n_size = 5; n_size < 8; ++n_size) {
    for (size_t k_size = 1; k_size <= 40; k_size += 9) {
      GemmMicrokernelTester()
          .mr(2).nr(4).kr(2).sr(4)
          .m(2).n(n_size).k(k_size)
          .Test(
              xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld64,
              xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
31432
// 2x4c2s4 kernel: m = 2, n in 5..7, k in {1, 10, 19, 28, 37}, with cn_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__WASMSIMD_DOT16X2_LD64, n_gt_4_strided_cn) {
  for (uint32_t n_size = 5; n_size < 8; ++n_size) {
    for (size_t k_size = 1; k_size <= 40; k_size += 9) {
      GemmMicrokernelTester()
          .mr(2).nr(4).kr(2).sr(4)
          .m(2).n(n_size).k(k_size)
          .cn_stride(7)
          .Test(
              xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld64,
              xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
31449
// 2x4c2s4 kernel: m = 2, n in 5..7, k in {1, 10, 19, 28, 37}, with a_stride set to 43.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__WASMSIMD_DOT16X2_LD64, n_gt_4_strided_a) {
  for (uint32_t n_size = 5; n_size < 8; ++n_size) {
    for (size_t k_size = 1; k_size <= 40; k_size += 9) {
      GemmMicrokernelTester()
          .mr(2).nr(4).kr(2).sr(4)
          .m(2).n(n_size).k(k_size)
          .a_stride(43)
          .Test(
              xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld64,
              xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
31466
// 2x4c2s4 kernel: n in 5..7, k in {1, 10, 19, 28, 37}, m in 1..2; one iteration each.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__WASMSIMD_DOT16X2_LD64, n_gt_4_subtile) {
  for (uint32_t n_size = 5; n_size < 8; ++n_size) {
    for (size_t k_size = 1; k_size <= 40; k_size += 9) {
      for (uint32_t m_size = 1; m_size <= 2; ++m_size) {
        GemmMicrokernelTester()
            .mr(2).nr(4).kr(2).sr(4)
            .m(m_size).n(n_size).k(k_size)
            .iterations(1)
            .Test(
                xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld64,
                xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
                xnn_qu8_requantize_fp32);
      }
    }
  }
}
31485
// 2x4c2s4 kernel: m = 2, n in {8, 12}, k in {1, 10, 19, 28, 37}.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__WASMSIMD_DOT16X2_LD64, n_div_4) {
  for (uint32_t n_size = 8; n_size <= 12; n_size += 4) {
    for (size_t k_size = 1; k_size <= 40; k_size += 9) {
      GemmMicrokernelTester()
          .mr(2).nr(4).kr(2).sr(4)
          .m(2).n(n_size).k(k_size)
          .Test(
              xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld64,
              xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
31501
// 2x4c2s4 kernel: m = 2, n in {8, 12}, k in {1, 10, 19, 28, 37}, with cn_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__WASMSIMD_DOT16X2_LD64, n_div_4_strided_cn) {
  for (uint32_t n_size = 8; n_size <= 12; n_size += 4) {
    for (size_t k_size = 1; k_size <= 40; k_size += 9) {
      GemmMicrokernelTester()
          .mr(2).nr(4).kr(2).sr(4)
          .m(2).n(n_size).k(k_size)
          .cn_stride(7)
          .Test(
              xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld64,
              xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
31518
// Sweeps n = 8, 12 and k = 1, 10, ..., 37 at m = 2 with a_stride(43).
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__WASMSIMD_DOT16X2_LD64, n_div_4_strided_a) {
  for (uint32_t nc = 8; nc <= 12; nc += 4) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      auto tester = GemmMicrokernelTester();
      tester.mr(2).nr(4).kr(2).sr(4);
      tester.m(2).n(nc).k(kc).a_stride(43);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld64,
          xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
31535
// Sweeps n = 8, 12, k = 1, 10, ..., 37 and m = 1..2, one iteration per case.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__WASMSIMD_DOT16X2_LD64, n_div_4_subtile) {
  for (uint32_t nc = 8; nc <= 12; nc += 4) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      for (uint32_t mc = 1; mc <= 2; ++mc) {
        auto tester = GemmMicrokernelTester();
        tester.mr(2).nr(4).kr(2).sr(4);
        tester.m(mc).n(nc).k(kc).iterations(1);
        tester.Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld64,
            xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
31554
// Sweeps k = 1, 10, ..., 37, n = 1..4 and m = 1..2 with cm_stride(7), one iteration per case.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__WASMSIMD_DOT16X2_LD64, strided_cm_subtile) {
  for (size_t kc = 1; kc <= 40; kc += 9) {
    for (uint32_t nc = 1; nc <= 4; ++nc) {
      for (uint32_t mc = 1; mc <= 2; ++mc) {
        auto tester = GemmMicrokernelTester();
        tester.mr(2).nr(4).kr(2).sr(4);
        tester.m(mc).n(nc).k(kc);
        tester.cm_stride(7).iterations(1);
        tester.Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld64,
            xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
31574
// Single case m = 2, n = 4, k = 8 with qmin(128).
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__WASMSIMD_DOT16X2_LD64, qmin) {
  auto tester = GemmMicrokernelTester();
  tester.mr(2).nr(4).kr(2).sr(4);
  tester.m(2).n(4).k(8).qmin(128);
  tester.Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld64,
      xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
      xnn_qu8_requantize_fp32);
}
31587
// Single case m = 2, n = 4, k = 8 with qmax(128).
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__WASMSIMD_DOT16X2_LD64, qmax) {
  auto tester = GemmMicrokernelTester();
  tester.mr(2).nr(4).kr(2).sr(4);
  tester.m(2).n(4).k(8).qmax(128);
  tester.Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld64,
      xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
      xnn_qu8_requantize_fp32);
}
31600
// Single case m = 2, n = 4, k = 8 with cm_stride(7).
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__WASMSIMD_DOT16X2_LD64, strided_cm) {
  auto tester = GemmMicrokernelTester();
  tester.mr(2).nr(4).kr(2).sr(4);
  tester.m(2).n(4).k(8).cm_stride(7);
  tester.Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld64,
      xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
      xnn_qu8_requantize_fp32);
}
31613
// Sweeps k = 1, 10, ..., 37 at m = 2, n = 4 with a_zero_point(0).
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__WASMSIMD_DOT16X2_LD64, no_a_zero_point) {
  for (size_t kc = 1; kc <= 40; kc += 9) {
    auto tester = GemmMicrokernelTester();
    tester.mr(2).nr(4).kr(2).sr(4);
    tester.m(2).n(4).k(kc).a_zero_point(0);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld64,
        xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
        xnn_qu8_requantize_fp32);
  }
}
31628
// Sweeps k = 1, 10, ..., 37 at m = 2, n = 4 with b_zero_point(0).
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__WASMSIMD_DOT16X2_LD64, no_b_zero_point) {
  for (size_t kc = 1; kc <= 40; kc += 9) {
    auto tester = GemmMicrokernelTester();
    tester.mr(2).nr(4).kr(2).sr(4);
    tester.m(2).n(4).k(kc).b_zero_point(0);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld64,
        xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
        xnn_qu8_requantize_fp32);
  }
}
31643
// Sweeps k = 1, 10, ..., 37 at m = 2, n = 4 with a_zero_point(0) and b_zero_point(0).
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__WASMSIMD_DOT16X2_LD64, no_zero_point) {
  for (size_t kc = 1; kc <= 40; kc += 9) {
    auto tester = GemmMicrokernelTester();
    tester.mr(2).nr(4).kr(2).sr(4);
    tester.m(2).n(4).k(kc);
    tester.a_zero_point(0).b_zero_point(0);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld64,
        xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
        xnn_qu8_requantize_fp32);
  }
}
31659 #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
31660
31661
31662 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
// Single case m = 2, n = 4, k = 8.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__WASMSIMD_DOT16X2_LD128, k_eq_8) {
  auto tester = GemmMicrokernelTester();
  tester.mr(2).nr(4).kr(2).sr(4);
  tester.m(2).n(4).k(8);
  tester.Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld128,
      xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
      xnn_qu8_requantize_fp32);
}
31674
// Single case m = 2, n = 4, k = 8 with cn_stride(7).
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__WASMSIMD_DOT16X2_LD128, strided_cn) {
  auto tester = GemmMicrokernelTester();
  tester.mr(2).nr(4).kr(2).sr(4);
  tester.m(2).n(4).k(8).cn_stride(7);
  tester.Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld128,
      xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
      xnn_qu8_requantize_fp32);
}
31687
// Single case m = 2, n = 4, k = 8 with a_stride(11).
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__WASMSIMD_DOT16X2_LD128, k_eq_8_strided_a) {
  auto tester = GemmMicrokernelTester();
  tester.mr(2).nr(4).kr(2).sr(4);
  tester.m(2).n(4).k(8).a_stride(11);
  tester.Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld128,
      xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
      xnn_qu8_requantize_fp32);
}
31700
// Sweeps n = 1..4 and m = 1..2 at k = 8, one iteration per case.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__WASMSIMD_DOT16X2_LD128, k_eq_8_subtile) {
  for (uint32_t nc = 1; nc <= 4; ++nc) {
    for (uint32_t mc = 1; mc <= 2; ++mc) {
      auto tester = GemmMicrokernelTester();
      tester.mr(2).nr(4).kr(2).sr(4);
      tester.m(mc).n(nc).k(8).iterations(1);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld128,
          xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
31717
// Sweeps m = 1..2 at n = 4, k = 8, one iteration per case.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__WASMSIMD_DOT16X2_LD128, k_eq_8_subtile_m) {
  for (uint32_t mc = 1; mc <= 2; ++mc) {
    auto tester = GemmMicrokernelTester();
    tester.mr(2).nr(4).kr(2).sr(4);
    tester.m(mc).n(4).k(8).iterations(1);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld128,
        xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
        xnn_qu8_requantize_fp32);
  }
}
31732
// Sweeps n = 1..4 at m = 2, k = 8, one iteration per case.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__WASMSIMD_DOT16X2_LD128, k_eq_8_subtile_n) {
  for (uint32_t nc = 1; nc <= 4; ++nc) {
    auto tester = GemmMicrokernelTester();
    tester.mr(2).nr(4).kr(2).sr(4);
    tester.m(2).n(nc).k(8).iterations(1);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld128,
        xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
        xnn_qu8_requantize_fp32);
  }
}
31747
// Sweeps k = 1..7 at m = 2, n = 4.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__WASMSIMD_DOT16X2_LD128, k_lt_8) {
  for (size_t kc = 1; kc < 8; ++kc) {
    auto tester = GemmMicrokernelTester();
    tester.mr(2).nr(4).kr(2).sr(4);
    tester.m(2).n(4).k(kc);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld128,
        xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
        xnn_qu8_requantize_fp32);
  }
}
31761
// Sweeps k = 1..7 at m = 2, n = 4 with a_stride(11).
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__WASMSIMD_DOT16X2_LD128, k_lt_8_strided_a) {
  for (size_t kc = 1; kc < 8; ++kc) {
    auto tester = GemmMicrokernelTester();
    tester.mr(2).nr(4).kr(2).sr(4);
    tester.m(2).n(4).k(kc).a_stride(11);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld128,
        xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
        xnn_qu8_requantize_fp32);
  }
}
31776
// Sweeps k = 1..7, n = 1..4 and m = 1..2, one iteration per case.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__WASMSIMD_DOT16X2_LD128, k_lt_8_subtile) {
  for (size_t kc = 1; kc < 8; ++kc) {
    for (uint32_t nc = 1; nc <= 4; ++nc) {
      for (uint32_t mc = 1; mc <= 2; ++mc) {
        auto tester = GemmMicrokernelTester();
        tester.mr(2).nr(4).kr(2).sr(4);
        tester.m(mc).n(nc).k(kc).iterations(1);
        tester.Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld128,
            xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
31795
// Sweeps k = 9..15 at m = 2, n = 4.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__WASMSIMD_DOT16X2_LD128, k_gt_8) {
  for (size_t kc = 9; kc < 16; ++kc) {
    auto tester = GemmMicrokernelTester();
    tester.mr(2).nr(4).kr(2).sr(4);
    tester.m(2).n(4).k(kc);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld128,
        xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
        xnn_qu8_requantize_fp32);
  }
}
31809
// Sweeps k = 9..15 at m = 2, n = 4 with a_stride(19).
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__WASMSIMD_DOT16X2_LD128, k_gt_8_strided_a) {
  for (size_t kc = 9; kc < 16; ++kc) {
    auto tester = GemmMicrokernelTester();
    tester.mr(2).nr(4).kr(2).sr(4);
    tester.m(2).n(4).k(kc).a_stride(19);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld128,
        xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
        xnn_qu8_requantize_fp32);
  }
}
31824
// Sweeps k = 9..15, n = 1..4 and m = 1..2, one iteration per case.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__WASMSIMD_DOT16X2_LD128, k_gt_8_subtile) {
  for (size_t kc = 9; kc < 16; ++kc) {
    for (uint32_t nc = 1; nc <= 4; ++nc) {
      for (uint32_t mc = 1; mc <= 2; ++mc) {
        auto tester = GemmMicrokernelTester();
        tester.mr(2).nr(4).kr(2).sr(4);
        tester.m(mc).n(nc).k(kc).iterations(1);
        tester.Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld128,
            xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
31843
// Sweeps k = 16, 24, ..., 80 at m = 2, n = 4.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__WASMSIMD_DOT16X2_LD128, k_div_8) {
  for (size_t kc = 16; kc <= 80; kc += 8) {
    auto tester = GemmMicrokernelTester();
    tester.mr(2).nr(4).kr(2).sr(4);
    tester.m(2).n(4).k(kc);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld128,
        xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
        xnn_qu8_requantize_fp32);
  }
}
31857
// Sweeps k = 16, 24, ..., 80 at m = 2, n = 4 with a_stride(83).
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__WASMSIMD_DOT16X2_LD128, k_div_8_strided_a) {
  for (size_t kc = 16; kc <= 80; kc += 8) {
    auto tester = GemmMicrokernelTester();
    tester.mr(2).nr(4).kr(2).sr(4);
    tester.m(2).n(4).k(kc).a_stride(83);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld128,
        xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
        xnn_qu8_requantize_fp32);
  }
}
31872
// Sweeps k = 16, 24, ..., 80, n = 1..4 and m = 1..2, one iteration per case.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__WASMSIMD_DOT16X2_LD128, k_div_8_subtile) {
  for (size_t kc = 16; kc <= 80; kc += 8) {
    for (uint32_t nc = 1; nc <= 4; ++nc) {
      for (uint32_t mc = 1; mc <= 2; ++mc) {
        auto tester = GemmMicrokernelTester();
        tester.mr(2).nr(4).kr(2).sr(4);
        tester.m(mc).n(nc).k(kc).iterations(1);
        tester.Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld128,
            xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
31891
// Sweeps n = 5..7 and k = 1, 10, ..., 37 at m = 2.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__WASMSIMD_DOT16X2_LD128, n_gt_4) {
  for (uint32_t nc = 5; nc < 8; ++nc) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      auto tester = GemmMicrokernelTester();
      tester.mr(2).nr(4).kr(2).sr(4);
      tester.m(2).n(nc).k(kc);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld128,
          xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
31907
// Sweeps n = 5..7 and k = 1, 10, ..., 37 at m = 2 with cn_stride(7).
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__WASMSIMD_DOT16X2_LD128, n_gt_4_strided_cn) {
  for (uint32_t nc = 5; nc < 8; ++nc) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      auto tester = GemmMicrokernelTester();
      tester.mr(2).nr(4).kr(2).sr(4);
      tester.m(2).n(nc).k(kc).cn_stride(7);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld128,
          xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
31924
// Sweeps n = 5..7 and k = 1, 10, ..., 37 at m = 2 with a_stride(43).
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__WASMSIMD_DOT16X2_LD128, n_gt_4_strided_a) {
  for (uint32_t nc = 5; nc < 8; ++nc) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      auto tester = GemmMicrokernelTester();
      tester.mr(2).nr(4).kr(2).sr(4);
      tester.m(2).n(nc).k(kc).a_stride(43);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld128,
          xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
31941
// Sweeps n = 5..7, k = 1, 10, ..., 37 and m = 1..2, one iteration per case.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__WASMSIMD_DOT16X2_LD128, n_gt_4_subtile) {
  for (uint32_t nc = 5; nc < 8; ++nc) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      for (uint32_t mc = 1; mc <= 2; ++mc) {
        auto tester = GemmMicrokernelTester();
        tester.mr(2).nr(4).kr(2).sr(4);
        tester.m(mc).n(nc).k(kc).iterations(1);
        tester.Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld128,
            xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
31960
// Sweeps n = 8, 12 and k = 1, 10, ..., 37 at m = 2.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__WASMSIMD_DOT16X2_LD128, n_div_4) {
  for (uint32_t nc = 8; nc <= 12; nc += 4) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      auto tester = GemmMicrokernelTester();
      tester.mr(2).nr(4).kr(2).sr(4);
      tester.m(2).n(nc).k(kc);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld128,
          xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
31976
// Sweeps n = 8, 12 and k = 1, 10, ..., 37 at m = 2 with cn_stride(7).
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__WASMSIMD_DOT16X2_LD128, n_div_4_strided_cn) {
  for (uint32_t nc = 8; nc <= 12; nc += 4) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      auto tester = GemmMicrokernelTester();
      tester.mr(2).nr(4).kr(2).sr(4);
      tester.m(2).n(nc).k(kc).cn_stride(7);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld128,
          xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
31993
// Sweeps n = 8, 12 and k = 1, 10, ..., 37 at m = 2 with a_stride(43).
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__WASMSIMD_DOT16X2_LD128, n_div_4_strided_a) {
  for (uint32_t nc = 8; nc <= 12; nc += 4) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      auto tester = GemmMicrokernelTester();
      tester.mr(2).nr(4).kr(2).sr(4);
      tester.m(2).n(nc).k(kc).a_stride(43);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld128,
          xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
32010
// Sweeps n = 8, 12, k = 1, 10, ..., 37 and m = 1..2, one iteration per case.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__WASMSIMD_DOT16X2_LD128, n_div_4_subtile) {
  for (uint32_t nc = 8; nc <= 12; nc += 4) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      for (uint32_t mc = 1; mc <= 2; ++mc) {
        auto tester = GemmMicrokernelTester();
        tester.mr(2).nr(4).kr(2).sr(4);
        tester.m(mc).n(nc).k(kc).iterations(1);
        tester.Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld128,
            xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
32029
// Sweeps k = 1, 10, ..., 37, n = 1..4 and m = 1..2 with cm_stride(7), one iteration per case.
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__WASMSIMD_DOT16X2_LD128, strided_cm_subtile) {
  for (size_t kc = 1; kc <= 40; kc += 9) {
    for (uint32_t nc = 1; nc <= 4; ++nc) {
      for (uint32_t mc = 1; mc <= 2; ++mc) {
        auto tester = GemmMicrokernelTester();
        tester.mr(2).nr(4).kr(2).sr(4);
        tester.m(mc).n(nc).k(kc);
        tester.cm_stride(7).iterations(1);
        tester.Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld128,
            xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
32049
// Single case m = 2, n = 4, k = 8 with qmin(128).
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__WASMSIMD_DOT16X2_LD128, qmin) {
  auto tester = GemmMicrokernelTester();
  tester.mr(2).nr(4).kr(2).sr(4);
  tester.m(2).n(4).k(8).qmin(128);
  tester.Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld128,
      xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
      xnn_qu8_requantize_fp32);
}
32062
// Single case m = 2, n = 4, k = 8 with qmax(128).
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__WASMSIMD_DOT16X2_LD128, qmax) {
  auto tester = GemmMicrokernelTester();
  tester.mr(2).nr(4).kr(2).sr(4);
  tester.m(2).n(4).k(8).qmax(128);
  tester.Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld128,
      xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
      xnn_qu8_requantize_fp32);
}
32075
// Single case m = 2, n = 4, k = 8 with cm_stride(7).
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__WASMSIMD_DOT16X2_LD128, strided_cm) {
  auto tester = GemmMicrokernelTester();
  tester.mr(2).nr(4).kr(2).sr(4);
  tester.m(2).n(4).k(8).cm_stride(7);
  tester.Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld128,
      xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
      xnn_qu8_requantize_fp32);
}
32088
// Sweeps k = 1, 10, ..., 37 at m = 2, n = 4 with a_zero_point(0).
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__WASMSIMD_DOT16X2_LD128, no_a_zero_point) {
  for (size_t kc = 1; kc <= 40; kc += 9) {
    auto tester = GemmMicrokernelTester();
    tester.mr(2).nr(4).kr(2).sr(4);
    tester.m(2).n(4).k(kc).a_zero_point(0);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld128,
        xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
        xnn_qu8_requantize_fp32);
  }
}
32103
// Sweeps k = 1, 10, ..., 37 at m = 2, n = 4 with b_zero_point(0).
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__WASMSIMD_DOT16X2_LD128, no_b_zero_point) {
  for (size_t kc = 1; kc <= 40; kc += 9) {
    auto tester = GemmMicrokernelTester();
    tester.mr(2).nr(4).kr(2).sr(4);
    tester.m(2).n(4).k(kc).b_zero_point(0);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld128,
        xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
        xnn_qu8_requantize_fp32);
  }
}
32118
// Sweeps k = 1, 10, ..., 37 at m = 2, n = 4 with a_zero_point(0) and b_zero_point(0).
TEST(QU8_GEMM_MINMAX_FP32_2X4C2S4__WASMSIMD_DOT16X2_LD128, no_zero_point) {
  for (size_t kc = 1; kc <= 40; kc += 9) {
    auto tester = GemmMicrokernelTester();
    tester.mr(2).nr(4).kr(2).sr(4);
    tester.m(2).n(4).k(kc);
    tester.a_zero_point(0).b_zero_point(0);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld128,
        xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
        xnn_qu8_requantize_fp32);
  }
}
32134 #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
32135
32136
32137 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
// Single case m = 2, n = 4, k = 8.
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_DOT16X2_LD64, k_eq_8) {
  auto tester = GemmMicrokernelTester();
  tester.mr(2).nr(4).kr(8).sr(1);
  tester.m(2).n(4).k(8);
  tester.Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld64,
      xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
      xnn_qu8_requantize_fp32);
}
32149
// Single case m = 2, n = 4, k = 8 with cn_stride(7).
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_DOT16X2_LD64, strided_cn) {
  auto tester = GemmMicrokernelTester();
  tester.mr(2).nr(4).kr(8).sr(1);
  tester.m(2).n(4).k(8).cn_stride(7);
  tester.Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld64,
      xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
      xnn_qu8_requantize_fp32);
}
32162
// Single case m = 2, n = 4, k = 8 with a_stride(11).
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_DOT16X2_LD64, k_eq_8_strided_a) {
  auto tester = GemmMicrokernelTester();
  tester.mr(2).nr(4).kr(8).sr(1);
  tester.m(2).n(4).k(8).a_stride(11);
  tester.Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld64,
      xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
      xnn_qu8_requantize_fp32);
}
32175
// Sweeps n = 1..4 and m = 1..2 at k = 8, one iteration per case.
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_DOT16X2_LD64, k_eq_8_subtile) {
  for (uint32_t nc = 1; nc <= 4; ++nc) {
    for (uint32_t mc = 1; mc <= 2; ++mc) {
      auto tester = GemmMicrokernelTester();
      tester.mr(2).nr(4).kr(8).sr(1);
      tester.m(mc).n(nc).k(8).iterations(1);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld64,
          xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
32192
// Sweeps m = 1..2 at n = 4, k = 8, one iteration per case.
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_DOT16X2_LD64, k_eq_8_subtile_m) {
  for (uint32_t mc = 1; mc <= 2; ++mc) {
    auto tester = GemmMicrokernelTester();
    tester.mr(2).nr(4).kr(8).sr(1);
    tester.m(mc).n(4).k(8).iterations(1);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld64,
        xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
        xnn_qu8_requantize_fp32);
  }
}
32207
// Sweeps n = 1..4 at m = 2, k = 8, one iteration per case.
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_DOT16X2_LD64, k_eq_8_subtile_n) {
  for (uint32_t nc = 1; nc <= 4; ++nc) {
    auto tester = GemmMicrokernelTester();
    tester.mr(2).nr(4).kr(8).sr(1);
    tester.m(2).n(nc).k(8).iterations(1);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld64,
        xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
        xnn_qu8_requantize_fp32);
  }
}
32222
// Sweeps k = 1..7 at m = 2, n = 4.
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_DOT16X2_LD64, k_lt_8) {
  for (size_t kc = 1; kc < 8; ++kc) {
    auto tester = GemmMicrokernelTester();
    tester.mr(2).nr(4).kr(8).sr(1);
    tester.m(2).n(4).k(kc);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld64,
        xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
        xnn_qu8_requantize_fp32);
  }
}
32236
// Sweeps k = 1..7 at m = 2, n = 4 with a_stride(11).
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_DOT16X2_LD64, k_lt_8_strided_a) {
  for (size_t kc = 1; kc < 8; ++kc) {
    auto tester = GemmMicrokernelTester();
    tester.mr(2).nr(4).kr(8).sr(1);
    tester.m(2).n(4).k(kc).a_stride(11);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld64,
        xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
        xnn_qu8_requantize_fp32);
  }
}
32251
// Sweeps k = 1..7, n = 1..4 and m = 1..2, one iteration per case.
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_DOT16X2_LD64, k_lt_8_subtile) {
  for (size_t kc = 1; kc < 8; ++kc) {
    for (uint32_t nc = 1; nc <= 4; ++nc) {
      for (uint32_t mc = 1; mc <= 2; ++mc) {
        auto tester = GemmMicrokernelTester();
        tester.mr(2).nr(4).kr(8).sr(1);
        tester.m(mc).n(nc).k(kc).iterations(1);
        tester.Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld64,
            xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
32270
// Sweeps k = 9..15 at m = 2, n = 4.
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_DOT16X2_LD64, k_gt_8) {
  for (size_t kc = 9; kc < 16; ++kc) {
    auto tester = GemmMicrokernelTester();
    tester.mr(2).nr(4).kr(8).sr(1);
    tester.m(2).n(4).k(kc);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld64,
        xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
        xnn_qu8_requantize_fp32);
  }
}
32284
// Sweeps k = 9..15 at m = 2, n = 4 with a_stride(19).
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_DOT16X2_LD64, k_gt_8_strided_a) {
  for (size_t kc = 9; kc < 16; ++kc) {
    auto tester = GemmMicrokernelTester();
    tester.mr(2).nr(4).kr(8).sr(1);
    tester.m(2).n(4).k(kc).a_stride(19);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld64,
        xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
        xnn_qu8_requantize_fp32);
  }
}
32299
// Sweeps k = 9..15, n = 1..4 and m = 1..2, one iteration per case.
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_DOT16X2_LD64, k_gt_8_subtile) {
  for (size_t kc = 9; kc < 16; ++kc) {
    for (uint32_t nc = 1; nc <= 4; ++nc) {
      for (uint32_t mc = 1; mc <= 2; ++mc) {
        auto tester = GemmMicrokernelTester();
        tester.mr(2).nr(4).kr(8).sr(1);
        tester.m(mc).n(nc).k(kc).iterations(1);
        tester.Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld64,
            xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
32318
// Sweeps k = 16, 24, ..., 80 at m = 2, n = 4.
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_DOT16X2_LD64, k_div_8) {
  for (size_t kc = 16; kc <= 80; kc += 8) {
    auto tester = GemmMicrokernelTester();
    tester.mr(2).nr(4).kr(8).sr(1);
    tester.m(2).n(4).k(kc);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld64,
        xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
        xnn_qu8_requantize_fp32);
  }
}
32332
// Sweeps k = 16, 24, ..., 80 at m = 2, n = 4 with a_stride(83).
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_DOT16X2_LD64, k_div_8_strided_a) {
  for (size_t kc = 16; kc <= 80; kc += 8) {
    auto tester = GemmMicrokernelTester();
    tester.mr(2).nr(4).kr(8).sr(1);
    tester.m(2).n(4).k(kc).a_stride(83);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld64,
        xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
        xnn_qu8_requantize_fp32);
  }
}
32347
// Sweeps k = 16, 24, ..., 80, n = 1..4 and m = 1..2, one iteration per case.
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_DOT16X2_LD64, k_div_8_subtile) {
  for (size_t kc = 16; kc <= 80; kc += 8) {
    for (uint32_t nc = 1; nc <= 4; ++nc) {
      for (uint32_t mc = 1; mc <= 2; ++mc) {
        auto tester = GemmMicrokernelTester();
        tester.mr(2).nr(4).kr(8).sr(1);
        tester.m(mc).n(nc).k(kc).iterations(1);
        tester.Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld64,
            xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
32366
// Sweeps n = 5..7 and k = 1, 10, ..., 37 at m = 2.
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_DOT16X2_LD64, n_gt_4) {
  for (uint32_t nc = 5; nc < 8; ++nc) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      auto tester = GemmMicrokernelTester();
      tester.mr(2).nr(4).kr(8).sr(1);
      tester.m(2).n(nc).k(kc);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld64,
          xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
32382
// Sweeps n = 5..7 and k = 1, 10, ..., 37 at m = 2 with cn_stride(7).
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_DOT16X2_LD64, n_gt_4_strided_cn) {
  for (uint32_t nc = 5; nc < 8; ++nc) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      auto tester = GemmMicrokernelTester();
      tester.mr(2).nr(4).kr(8).sr(1);
      tester.m(2).n(nc).k(kc).cn_stride(7);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld64,
          xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
32399
// Sweeps n = 5..7 and k = 1, 10, ..., 37 at m = 2 with a_stride(43).
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_DOT16X2_LD64, n_gt_4_strided_a) {
  for (uint32_t nc = 5; nc < 8; ++nc) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      auto tester = GemmMicrokernelTester();
      tester.mr(2).nr(4).kr(8).sr(1);
      tester.m(2).n(nc).k(kc).a_stride(43);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld64,
          xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
32416
// Sweeps n = 5..7, k = 1, 10, ..., 37 and m = 1..2, one iteration per case.
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_DOT16X2_LD64, n_gt_4_subtile) {
  for (uint32_t nc = 5; nc < 8; ++nc) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      for (uint32_t mc = 1; mc <= 2; ++mc) {
        auto tester = GemmMicrokernelTester();
        tester.mr(2).nr(4).kr(8).sr(1);
        tester.m(mc).n(nc).k(kc).iterations(1);
        tester.Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld64,
            xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
32435
// Sweeps n = 8, 12 and k = 1, 10, ..., 37 at m = 2.
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_DOT16X2_LD64, n_div_4) {
  for (uint32_t nc = 8; nc <= 12; nc += 4) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      auto tester = GemmMicrokernelTester();
      tester.mr(2).nr(4).kr(8).sr(1);
      tester.m(2).n(nc).k(kc);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld64,
          xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
32451
// Sweeps n = 8, 12 and k = 1, 10, ..., 37 at m = 2 with cn_stride(7).
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_DOT16X2_LD64, n_div_4_strided_cn) {
  for (uint32_t nc = 8; nc <= 12; nc += 4) {
    for (size_t kc = 1; kc <= 40; kc += 9) {
      auto tester = GemmMicrokernelTester();
      tester.mr(2).nr(4).kr(8).sr(1);
      tester.m(2).n(nc).k(kc).cn_stride(7);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld64,
          xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
32468
// Sweeps n over 8 and 12 and k over 1, 10, 19, 28, 37 with m fixed at 2 and
// a_stride set to 43.
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_DOT16X2_LD64, n_div_4_strided_a) {
  for (uint32_t cur_n = 8; cur_n <= 12; cur_n += 4) {
    for (size_t cur_k = 1; cur_k <= 40; cur_k += 9) {
      GemmMicrokernelTester tester;
      tester.mr(2).nr(4).kr(8).sr(1);
      tester.m(2).n(cur_n).k(cur_k);
      tester.a_stride(43);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld64,
          xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
32485
// Sweeps n over 8 and 12 and k over 1, 10, 19, 28, 37, trying every m in 1..2
// with a single iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_DOT16X2_LD64, n_div_4_subtile) {
  for (uint32_t cur_n = 8; cur_n <= 12; cur_n += 4) {
    for (size_t cur_k = 1; cur_k <= 40; cur_k += 9) {
      for (uint32_t cur_m = 1; cur_m <= 2; ++cur_m) {
        GemmMicrokernelTester tester;
        tester.mr(2).nr(4).kr(8).sr(1);
        tester.m(cur_m).n(cur_n).k(cur_k);
        tester.iterations(1);
        tester.Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld64,
            xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
32504
// Sweeps k over 1, 10, 19, 28, 37, n over 1..4 and m over 1..2 with cm_stride
// set to 7 and a single iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_DOT16X2_LD64, strided_cm_subtile) {
  for (size_t cur_k = 1; cur_k <= 40; cur_k += 9) {
    for (uint32_t cur_n = 1; cur_n <= 4; ++cur_n) {
      for (uint32_t cur_m = 1; cur_m <= 2; ++cur_m) {
        GemmMicrokernelTester tester;
        tester.mr(2).nr(4).kr(8).sr(1);
        tester.m(cur_m).n(cur_n).k(cur_k);
        tester.cm_stride(7).iterations(1);
        tester.Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld64,
            xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
32524
// Single run with m = 2, n = 4, k = 8 and qmin set to 128.
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_DOT16X2_LD64, qmin) {
  GemmMicrokernelTester tester;
  tester.mr(2).nr(4).kr(8).sr(1);
  tester.m(2).n(4).k(8);
  tester.qmin(128);
  tester.Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld64,
      xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
      xnn_qu8_requantize_fp32);
}
32537
// Single run with m = 2, n = 4, k = 8 and qmax set to 128.
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_DOT16X2_LD64, qmax) {
  GemmMicrokernelTester tester;
  tester.mr(2).nr(4).kr(8).sr(1);
  tester.m(2).n(4).k(8);
  tester.qmax(128);
  tester.Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld64,
      xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
      xnn_qu8_requantize_fp32);
}
32550
// Single run with m = 2, n = 4, k = 8 and cm_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_DOT16X2_LD64, strided_cm) {
  GemmMicrokernelTester tester;
  tester.mr(2).nr(4).kr(8).sr(1);
  tester.m(2).n(4).k(8);
  tester.cm_stride(7);
  tester.Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld64,
      xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
      xnn_qu8_requantize_fp32);
}
32563
// Sweeps k over 1, 10, 19, 28, 37 with m = 2, n = 4 and a_zero_point set to 0.
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_DOT16X2_LD64, no_a_zero_point) {
  for (size_t cur_k = 1; cur_k <= 40; cur_k += 9) {
    GemmMicrokernelTester tester;
    tester.mr(2).nr(4).kr(8).sr(1);
    tester.m(2).n(4).k(cur_k);
    tester.a_zero_point(0);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld64,
        xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
        xnn_qu8_requantize_fp32);
  }
}
32578
// Sweeps k over 1, 10, 19, 28, 37 with m = 2, n = 4 and b_zero_point set to 0.
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_DOT16X2_LD64, no_b_zero_point) {
  for (size_t cur_k = 1; cur_k <= 40; cur_k += 9) {
    GemmMicrokernelTester tester;
    tester.mr(2).nr(4).kr(8).sr(1);
    tester.m(2).n(4).k(cur_k);
    tester.b_zero_point(0);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld64,
        xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
        xnn_qu8_requantize_fp32);
  }
}
32593
// Sweeps k over 1, 10, 19, 28, 37 with m = 2, n = 4 and both a_zero_point and
// b_zero_point set to 0.
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_DOT16X2_LD64, no_zero_point) {
  for (size_t cur_k = 1; cur_k <= 40; cur_k += 9) {
    GemmMicrokernelTester tester;
    tester.mr(2).nr(4).kr(8).sr(1);
    tester.m(2).n(4).k(cur_k);
    tester.a_zero_point(0).b_zero_point(0);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld64,
        xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
        xnn_qu8_requantize_fp32);
  }
}
32609 #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
32610
32611
32612 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
// Single run with m = 2, n = 4 and k = 8.
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_DOT16X2_LD128, k_eq_8) {
  GemmMicrokernelTester tester;
  tester.mr(2).nr(4).kr(8).sr(1);
  tester.m(2).n(4).k(8);
  tester.Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld128,
      xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
      xnn_qu8_requantize_fp32);
}
32624
// Single run with m = 2, n = 4, k = 8 and cn_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_DOT16X2_LD128, strided_cn) {
  GemmMicrokernelTester tester;
  tester.mr(2).nr(4).kr(8).sr(1);
  tester.m(2).n(4).k(8);
  tester.cn_stride(7);
  tester.Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld128,
      xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
      xnn_qu8_requantize_fp32);
}
32637
// Single run with m = 2, n = 4, k = 8 and a_stride set to 11.
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_DOT16X2_LD128, k_eq_8_strided_a) {
  GemmMicrokernelTester tester;
  tester.mr(2).nr(4).kr(8).sr(1);
  tester.m(2).n(4).k(8);
  tester.a_stride(11);
  tester.Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld128,
      xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
      xnn_qu8_requantize_fp32);
}
32650
// Holds k at 8 while trying every n in 1..4 and every m in 1..2, one
// iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_DOT16X2_LD128, k_eq_8_subtile) {
  for (uint32_t cur_n = 1; cur_n <= 4; ++cur_n) {
    for (uint32_t cur_m = 1; cur_m <= 2; ++cur_m) {
      GemmMicrokernelTester tester;
      tester.mr(2).nr(4).kr(8).sr(1);
      tester.m(cur_m).n(cur_n).k(8);
      tester.iterations(1);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld128,
          xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
32667
// Holds n at 4 and k at 8 while trying every m in 1..2, one iteration each.
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_DOT16X2_LD128, k_eq_8_subtile_m) {
  for (uint32_t cur_m = 1; cur_m <= 2; ++cur_m) {
    GemmMicrokernelTester tester;
    tester.mr(2).nr(4).kr(8).sr(1);
    tester.m(cur_m).n(4).k(8);
    tester.iterations(1);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld128,
        xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
        xnn_qu8_requantize_fp32);
  }
}
32682
// Holds m at 2 and k at 8 while trying every n in 1..4, one iteration each.
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_DOT16X2_LD128, k_eq_8_subtile_n) {
  for (uint32_t cur_n = 1; cur_n <= 4; ++cur_n) {
    GemmMicrokernelTester tester;
    tester.mr(2).nr(4).kr(8).sr(1);
    tester.m(2).n(cur_n).k(8);
    tester.iterations(1);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld128,
        xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
        xnn_qu8_requantize_fp32);
  }
}
32697
// Sweeps k over 1..7 with m = 2 and n = 4.
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_DOT16X2_LD128, k_lt_8) {
  for (size_t cur_k = 1; cur_k < 8; ++cur_k) {
    GemmMicrokernelTester tester;
    tester.mr(2).nr(4).kr(8).sr(1);
    tester.m(2).n(4).k(cur_k);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld128,
        xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
        xnn_qu8_requantize_fp32);
  }
}
32711
// Sweeps k over 1..7 with m = 2, n = 4 and a_stride set to 11.
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_DOT16X2_LD128, k_lt_8_strided_a) {
  for (size_t cur_k = 1; cur_k < 8; ++cur_k) {
    GemmMicrokernelTester tester;
    tester.mr(2).nr(4).kr(8).sr(1);
    tester.m(2).n(4).k(cur_k);
    tester.a_stride(11);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld128,
        xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
        xnn_qu8_requantize_fp32);
  }
}
32726
// Sweeps k over 1..7, n over 1..4 and m over 1..2 with a single iteration per
// configuration.
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_DOT16X2_LD128, k_lt_8_subtile) {
  for (size_t cur_k = 1; cur_k < 8; ++cur_k) {
    for (uint32_t cur_n = 1; cur_n <= 4; ++cur_n) {
      for (uint32_t cur_m = 1; cur_m <= 2; ++cur_m) {
        GemmMicrokernelTester tester;
        tester.mr(2).nr(4).kr(8).sr(1);
        tester.m(cur_m).n(cur_n).k(cur_k);
        tester.iterations(1);
        tester.Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld128,
            xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
32745
// Sweeps k over 9..15 with m = 2 and n = 4.
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_DOT16X2_LD128, k_gt_8) {
  for (size_t cur_k = 9; cur_k < 16; ++cur_k) {
    GemmMicrokernelTester tester;
    tester.mr(2).nr(4).kr(8).sr(1);
    tester.m(2).n(4).k(cur_k);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld128,
        xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
        xnn_qu8_requantize_fp32);
  }
}
32759
// Sweeps k over 9..15 with m = 2, n = 4 and a_stride set to 19.
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_DOT16X2_LD128, k_gt_8_strided_a) {
  for (size_t cur_k = 9; cur_k < 16; ++cur_k) {
    GemmMicrokernelTester tester;
    tester.mr(2).nr(4).kr(8).sr(1);
    tester.m(2).n(4).k(cur_k);
    tester.a_stride(19);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld128,
        xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
        xnn_qu8_requantize_fp32);
  }
}
32774
// Sweeps k over 9..15, n over 1..4 and m over 1..2 with a single iteration
// per configuration.
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_DOT16X2_LD128, k_gt_8_subtile) {
  for (size_t cur_k = 9; cur_k < 16; ++cur_k) {
    for (uint32_t cur_n = 1; cur_n <= 4; ++cur_n) {
      for (uint32_t cur_m = 1; cur_m <= 2; ++cur_m) {
        GemmMicrokernelTester tester;
        tester.mr(2).nr(4).kr(8).sr(1);
        tester.m(cur_m).n(cur_n).k(cur_k);
        tester.iterations(1);
        tester.Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld128,
            xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
32793
// Sweeps k over 16..80 in steps of 8 with m = 2 and n = 4.
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_DOT16X2_LD128, k_div_8) {
  for (size_t cur_k = 16; cur_k <= 80; cur_k += 8) {
    GemmMicrokernelTester tester;
    tester.mr(2).nr(4).kr(8).sr(1);
    tester.m(2).n(4).k(cur_k);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld128,
        xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
        xnn_qu8_requantize_fp32);
  }
}
32807
// Sweeps k over 16..80 in steps of 8 with m = 2, n = 4 and a_stride set to 83.
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_DOT16X2_LD128, k_div_8_strided_a) {
  for (size_t cur_k = 16; cur_k <= 80; cur_k += 8) {
    GemmMicrokernelTester tester;
    tester.mr(2).nr(4).kr(8).sr(1);
    tester.m(2).n(4).k(cur_k);
    tester.a_stride(83);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld128,
        xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
        xnn_qu8_requantize_fp32);
  }
}
32822
// Sweeps k over 16..80 in steps of 8, n over 1..4 and m over 1..2 with a
// single iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_DOT16X2_LD128, k_div_8_subtile) {
  for (size_t cur_k = 16; cur_k <= 80; cur_k += 8) {
    for (uint32_t cur_n = 1; cur_n <= 4; ++cur_n) {
      for (uint32_t cur_m = 1; cur_m <= 2; ++cur_m) {
        GemmMicrokernelTester tester;
        tester.mr(2).nr(4).kr(8).sr(1);
        tester.m(cur_m).n(cur_n).k(cur_k);
        tester.iterations(1);
        tester.Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld128,
            xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
32841
// Sweeps n over 5..7 and k over 1, 10, 19, 28, 37 with m fixed at 2.
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_DOT16X2_LD128, n_gt_4) {
  for (uint32_t cur_n = 5; cur_n < 8; ++cur_n) {
    for (size_t cur_k = 1; cur_k <= 40; cur_k += 9) {
      GemmMicrokernelTester tester;
      tester.mr(2).nr(4).kr(8).sr(1);
      tester.m(2).n(cur_n).k(cur_k);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld128,
          xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
32857
// Sweeps n over 5..7 and k over 1, 10, 19, 28, 37 with m fixed at 2 and
// cn_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_DOT16X2_LD128, n_gt_4_strided_cn) {
  for (uint32_t cur_n = 5; cur_n < 8; ++cur_n) {
    for (size_t cur_k = 1; cur_k <= 40; cur_k += 9) {
      GemmMicrokernelTester tester;
      tester.mr(2).nr(4).kr(8).sr(1);
      tester.m(2).n(cur_n).k(cur_k);
      tester.cn_stride(7);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld128,
          xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
32874
// Sweeps n over 5..7 and k over 1, 10, 19, 28, 37 with m fixed at 2 and
// a_stride set to 43.
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_DOT16X2_LD128, n_gt_4_strided_a) {
  for (uint32_t cur_n = 5; cur_n < 8; ++cur_n) {
    for (size_t cur_k = 1; cur_k <= 40; cur_k += 9) {
      GemmMicrokernelTester tester;
      tester.mr(2).nr(4).kr(8).sr(1);
      tester.m(2).n(cur_n).k(cur_k);
      tester.a_stride(43);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld128,
          xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
32891
// Sweeps n over 5..7 and k over 1, 10, 19, 28, 37, trying every m in 1..2
// with a single iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_DOT16X2_LD128, n_gt_4_subtile) {
  for (uint32_t cur_n = 5; cur_n < 8; ++cur_n) {
    for (size_t cur_k = 1; cur_k <= 40; cur_k += 9) {
      for (uint32_t cur_m = 1; cur_m <= 2; ++cur_m) {
        GemmMicrokernelTester tester;
        tester.mr(2).nr(4).kr(8).sr(1);
        tester.m(cur_m).n(cur_n).k(cur_k);
        tester.iterations(1);
        tester.Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld128,
            xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
32910
// Sweeps n over 8 and 12 and k over 1, 10, 19, 28, 37 with m fixed at 2.
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_DOT16X2_LD128, n_div_4) {
  for (uint32_t cur_n = 8; cur_n <= 12; cur_n += 4) {
    for (size_t cur_k = 1; cur_k <= 40; cur_k += 9) {
      GemmMicrokernelTester tester;
      tester.mr(2).nr(4).kr(8).sr(1);
      tester.m(2).n(cur_n).k(cur_k);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld128,
          xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
32926
// Sweeps n over 8 and 12 and k over 1, 10, 19, 28, 37 with m fixed at 2 and
// cn_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_DOT16X2_LD128, n_div_4_strided_cn) {
  for (uint32_t cur_n = 8; cur_n <= 12; cur_n += 4) {
    for (size_t cur_k = 1; cur_k <= 40; cur_k += 9) {
      GemmMicrokernelTester tester;
      tester.mr(2).nr(4).kr(8).sr(1);
      tester.m(2).n(cur_n).k(cur_k);
      tester.cn_stride(7);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld128,
          xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
32943
// Sweeps n over 8 and 12 and k over 1, 10, 19, 28, 37 with m fixed at 2 and
// a_stride set to 43.
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_DOT16X2_LD128, n_div_4_strided_a) {
  for (uint32_t cur_n = 8; cur_n <= 12; cur_n += 4) {
    for (size_t cur_k = 1; cur_k <= 40; cur_k += 9) {
      GemmMicrokernelTester tester;
      tester.mr(2).nr(4).kr(8).sr(1);
      tester.m(2).n(cur_n).k(cur_k);
      tester.a_stride(43);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld128,
          xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
32960
// Sweeps n over 8 and 12 and k over 1, 10, 19, 28, 37, trying every m in 1..2
// with a single iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_DOT16X2_LD128, n_div_4_subtile) {
  for (uint32_t cur_n = 8; cur_n <= 12; cur_n += 4) {
    for (size_t cur_k = 1; cur_k <= 40; cur_k += 9) {
      for (uint32_t cur_m = 1; cur_m <= 2; ++cur_m) {
        GemmMicrokernelTester tester;
        tester.mr(2).nr(4).kr(8).sr(1);
        tester.m(cur_m).n(cur_n).k(cur_k);
        tester.iterations(1);
        tester.Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld128,
            xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
32979
// Sweeps k over 1, 10, 19, 28, 37, n over 1..4 and m over 1..2 with cm_stride
// set to 7 and a single iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_DOT16X2_LD128, strided_cm_subtile) {
  for (size_t cur_k = 1; cur_k <= 40; cur_k += 9) {
    for (uint32_t cur_n = 1; cur_n <= 4; ++cur_n) {
      for (uint32_t cur_m = 1; cur_m <= 2; ++cur_m) {
        GemmMicrokernelTester tester;
        tester.mr(2).nr(4).kr(8).sr(1);
        tester.m(cur_m).n(cur_n).k(cur_k);
        tester.cm_stride(7).iterations(1);
        tester.Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld128,
            xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
32999
// Single run with m = 2, n = 4, k = 8 and qmin set to 128.
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_DOT16X2_LD128, qmin) {
  GemmMicrokernelTester tester;
  tester.mr(2).nr(4).kr(8).sr(1);
  tester.m(2).n(4).k(8);
  tester.qmin(128);
  tester.Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld128,
      xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
      xnn_qu8_requantize_fp32);
}
33012
// Single run with m = 2, n = 4, k = 8 and qmax set to 128.
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_DOT16X2_LD128, qmax) {
  GemmMicrokernelTester tester;
  tester.mr(2).nr(4).kr(8).sr(1);
  tester.m(2).n(4).k(8);
  tester.qmax(128);
  tester.Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld128,
      xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
      xnn_qu8_requantize_fp32);
}
33025
// Single run with m = 2, n = 4, k = 8 and cm_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_DOT16X2_LD128, strided_cm) {
  GemmMicrokernelTester tester;
  tester.mr(2).nr(4).kr(8).sr(1);
  tester.m(2).n(4).k(8);
  tester.cm_stride(7);
  tester.Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld128,
      xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
      xnn_qu8_requantize_fp32);
}
33038
// Sweeps k over 1, 10, 19, 28, 37 with m = 2, n = 4 and a_zero_point set to 0.
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_DOT16X2_LD128, no_a_zero_point) {
  for (size_t cur_k = 1; cur_k <= 40; cur_k += 9) {
    GemmMicrokernelTester tester;
    tester.mr(2).nr(4).kr(8).sr(1);
    tester.m(2).n(4).k(cur_k);
    tester.a_zero_point(0);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld128,
        xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
        xnn_qu8_requantize_fp32);
  }
}
33053
// Sweeps k over 1, 10, 19, 28, 37 with m = 2, n = 4 and b_zero_point set to 0.
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_DOT16X2_LD128, no_b_zero_point) {
  for (size_t cur_k = 1; cur_k <= 40; cur_k += 9) {
    GemmMicrokernelTester tester;
    tester.mr(2).nr(4).kr(8).sr(1);
    tester.m(2).n(4).k(cur_k);
    tester.b_zero_point(0);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld128,
        xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
        xnn_qu8_requantize_fp32);
  }
}
33068
// Sweeps k over 1, 10, 19, 28, 37 with m = 2, n = 4 and both a_zero_point and
// b_zero_point set to 0.
TEST(QU8_GEMM_MINMAX_FP32_2X4C8__WASMSIMD_DOT16X2_LD128, no_zero_point) {
  for (size_t cur_k = 1; cur_k <= 40; cur_k += 9) {
    GemmMicrokernelTester tester;
    tester.mr(2).nr(4).kr(8).sr(1);
    tester.m(2).n(4).k(cur_k);
    tester.a_zero_point(0).b_zero_point(0);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld128,
        xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
        xnn_qu8_requantize_fp32);
  }
}
33084 #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
33085
33086
33087 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
// Single run with m = 3, n = 4 and k = 8.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__WASMSIMD_DOT16X2_LD64, k_eq_8) {
  GemmMicrokernelTester tester;
  tester.mr(3).nr(4).kr(2).sr(1);
  tester.m(3).n(4).k(8);
  tester.Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld64,
      xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
      xnn_qu8_requantize_fp32);
}
33099
// Single run with m = 3, n = 4, k = 8 and cn_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__WASMSIMD_DOT16X2_LD64, strided_cn) {
  GemmMicrokernelTester tester;
  tester.mr(3).nr(4).kr(2).sr(1);
  tester.m(3).n(4).k(8);
  tester.cn_stride(7);
  tester.Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld64,
      xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
      xnn_qu8_requantize_fp32);
}
33112
// Single run with m = 3, n = 4, k = 8 and a_stride set to 11.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__WASMSIMD_DOT16X2_LD64, k_eq_8_strided_a) {
  GemmMicrokernelTester tester;
  tester.mr(3).nr(4).kr(2).sr(1);
  tester.m(3).n(4).k(8);
  tester.a_stride(11);
  tester.Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld64,
      xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
      xnn_qu8_requantize_fp32);
}
33125
// Holds k at 8 while trying every n in 1..4 and every m in 1..3, one
// iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__WASMSIMD_DOT16X2_LD64, k_eq_8_subtile) {
  for (uint32_t cur_n = 1; cur_n <= 4; ++cur_n) {
    for (uint32_t cur_m = 1; cur_m <= 3; ++cur_m) {
      GemmMicrokernelTester tester;
      tester.mr(3).nr(4).kr(2).sr(1);
      tester.m(cur_m).n(cur_n).k(8);
      tester.iterations(1);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld64,
          xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
33142
// Holds n at 4 and k at 8 while trying every m in 1..3, one iteration each.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__WASMSIMD_DOT16X2_LD64, k_eq_8_subtile_m) {
  for (uint32_t cur_m = 1; cur_m <= 3; ++cur_m) {
    GemmMicrokernelTester tester;
    tester.mr(3).nr(4).kr(2).sr(1);
    tester.m(cur_m).n(4).k(8);
    tester.iterations(1);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld64,
        xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
        xnn_qu8_requantize_fp32);
  }
}
33157
// Holds m at 3 and k at 8 while trying every n in 1..4, one iteration each.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__WASMSIMD_DOT16X2_LD64, k_eq_8_subtile_n) {
  for (uint32_t cur_n = 1; cur_n <= 4; ++cur_n) {
    GemmMicrokernelTester tester;
    tester.mr(3).nr(4).kr(2).sr(1);
    tester.m(3).n(cur_n).k(8);
    tester.iterations(1);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld64,
        xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
        xnn_qu8_requantize_fp32);
  }
}
33172
// Sweeps k over 1..7 with m = 3 and n = 4.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__WASMSIMD_DOT16X2_LD64, k_lt_8) {
  for (size_t cur_k = 1; cur_k < 8; ++cur_k) {
    GemmMicrokernelTester tester;
    tester.mr(3).nr(4).kr(2).sr(1);
    tester.m(3).n(4).k(cur_k);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld64,
        xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
        xnn_qu8_requantize_fp32);
  }
}
33186
// Sweeps k over 1..7 with m = 3, n = 4 and a_stride set to 11.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__WASMSIMD_DOT16X2_LD64, k_lt_8_strided_a) {
  for (size_t cur_k = 1; cur_k < 8; ++cur_k) {
    GemmMicrokernelTester tester;
    tester.mr(3).nr(4).kr(2).sr(1);
    tester.m(3).n(4).k(cur_k);
    tester.a_stride(11);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld64,
        xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
        xnn_qu8_requantize_fp32);
  }
}
33201
// Sweeps k over 1..7, n over 1..4 and m over 1..3 with a single iteration per
// configuration.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__WASMSIMD_DOT16X2_LD64, k_lt_8_subtile) {
  for (size_t cur_k = 1; cur_k < 8; ++cur_k) {
    for (uint32_t cur_n = 1; cur_n <= 4; ++cur_n) {
      for (uint32_t cur_m = 1; cur_m <= 3; ++cur_m) {
        GemmMicrokernelTester tester;
        tester.mr(3).nr(4).kr(2).sr(1);
        tester.m(cur_m).n(cur_n).k(cur_k);
        tester.iterations(1);
        tester.Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld64,
            xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
33220
// Sweeps k over 9..15 with m = 3 and n = 4.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__WASMSIMD_DOT16X2_LD64, k_gt_8) {
  for (size_t cur_k = 9; cur_k < 16; ++cur_k) {
    GemmMicrokernelTester tester;
    tester.mr(3).nr(4).kr(2).sr(1);
    tester.m(3).n(4).k(cur_k);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld64,
        xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
        xnn_qu8_requantize_fp32);
  }
}
33234
// Sweeps k over 9..15 with m = 3, n = 4 and a_stride set to 19.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__WASMSIMD_DOT16X2_LD64, k_gt_8_strided_a) {
  for (size_t cur_k = 9; cur_k < 16; ++cur_k) {
    GemmMicrokernelTester tester;
    tester.mr(3).nr(4).kr(2).sr(1);
    tester.m(3).n(4).k(cur_k);
    tester.a_stride(19);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld64,
        xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
        xnn_qu8_requantize_fp32);
  }
}
33249
// Sweeps k over 9..15, n over 1..4 and m over 1..3 with a single iteration
// per configuration.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__WASMSIMD_DOT16X2_LD64, k_gt_8_subtile) {
  for (size_t cur_k = 9; cur_k < 16; ++cur_k) {
    for (uint32_t cur_n = 1; cur_n <= 4; ++cur_n) {
      for (uint32_t cur_m = 1; cur_m <= 3; ++cur_m) {
        GemmMicrokernelTester tester;
        tester.mr(3).nr(4).kr(2).sr(1);
        tester.m(cur_m).n(cur_n).k(cur_k);
        tester.iterations(1);
        tester.Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld64,
            xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
33268
// Sweeps k over 16..80 in steps of 8 with m = 3 and n = 4.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__WASMSIMD_DOT16X2_LD64, k_div_8) {
  for (size_t cur_k = 16; cur_k <= 80; cur_k += 8) {
    GemmMicrokernelTester tester;
    tester.mr(3).nr(4).kr(2).sr(1);
    tester.m(3).n(4).k(cur_k);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld64,
        xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
        xnn_qu8_requantize_fp32);
  }
}
33282
// Sweeps k over 16..80 in steps of 8 with m = 3, n = 4 and a_stride set to 83.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__WASMSIMD_DOT16X2_LD64, k_div_8_strided_a) {
  for (size_t cur_k = 16; cur_k <= 80; cur_k += 8) {
    GemmMicrokernelTester tester;
    tester.mr(3).nr(4).kr(2).sr(1);
    tester.m(3).n(4).k(cur_k);
    tester.a_stride(83);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld64,
        xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
        xnn_qu8_requantize_fp32);
  }
}
33297
// Sweeps k over 16..80 in steps of 8, n over 1..4 and m over 1..3 with a
// single iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__WASMSIMD_DOT16X2_LD64, k_div_8_subtile) {
  for (size_t cur_k = 16; cur_k <= 80; cur_k += 8) {
    for (uint32_t cur_n = 1; cur_n <= 4; ++cur_n) {
      for (uint32_t cur_m = 1; cur_m <= 3; ++cur_m) {
        GemmMicrokernelTester tester;
        tester.mr(3).nr(4).kr(2).sr(1);
        tester.m(cur_m).n(cur_n).k(cur_k);
        tester.iterations(1);
        tester.Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld64,
            xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
33316
// Sweeps n over 5..7 and k over 1, 10, 19, 28, 37 with m fixed at 3.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__WASMSIMD_DOT16X2_LD64, n_gt_4) {
  for (uint32_t cur_n = 5; cur_n < 8; ++cur_n) {
    for (size_t cur_k = 1; cur_k <= 40; cur_k += 9) {
      GemmMicrokernelTester tester;
      tester.mr(3).nr(4).kr(2).sr(1);
      tester.m(3).n(cur_n).k(cur_k);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld64,
          xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
33332
// Sweeps n over 5..7 and k over 1, 10, 19, 28, 37 with m fixed at 3 and
// cn_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__WASMSIMD_DOT16X2_LD64, n_gt_4_strided_cn) {
  for (uint32_t cur_n = 5; cur_n < 8; ++cur_n) {
    for (size_t cur_k = 1; cur_k <= 40; cur_k += 9) {
      GemmMicrokernelTester tester;
      tester.mr(3).nr(4).kr(2).sr(1);
      tester.m(3).n(cur_n).k(cur_k);
      tester.cn_stride(7);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld64,
          xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
33349
// Sweeps n over 5..7 and k over 1, 10, 19, 28, 37 with m fixed at 3 and
// a_stride set to 43.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__WASMSIMD_DOT16X2_LD64, n_gt_4_strided_a) {
  for (uint32_t cur_n = 5; cur_n < 8; ++cur_n) {
    for (size_t cur_k = 1; cur_k <= 40; cur_k += 9) {
      GemmMicrokernelTester tester;
      tester.mr(3).nr(4).kr(2).sr(1);
      tester.m(3).n(cur_n).k(cur_k);
      tester.a_stride(43);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld64,
          xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
33366
// Sweeps n over 5..7 and k over 1, 10, 19, 28, 37, trying every m in 1..3
// with a single iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__WASMSIMD_DOT16X2_LD64, n_gt_4_subtile) {
  for (uint32_t cur_n = 5; cur_n < 8; ++cur_n) {
    for (size_t cur_k = 1; cur_k <= 40; cur_k += 9) {
      for (uint32_t cur_m = 1; cur_m <= 3; ++cur_m) {
        GemmMicrokernelTester tester;
        tester.mr(3).nr(4).kr(2).sr(1);
        tester.m(cur_m).n(cur_n).k(cur_k);
        tester.iterations(1);
        tester.Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld64,
            xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
33385
// Sweeps n over 8 and 12 and k over 1, 10, 19, 28, 37 with m fixed at 3.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__WASMSIMD_DOT16X2_LD64, n_div_4) {
  for (uint32_t cur_n = 8; cur_n <= 12; cur_n += 4) {
    for (size_t cur_k = 1; cur_k <= 40; cur_k += 9) {
      GemmMicrokernelTester tester;
      tester.mr(3).nr(4).kr(2).sr(1);
      tester.m(3).n(cur_n).k(cur_k);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld64,
          xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
33401
// Sweeps n over 8 and 12 and k over 1, 10, 19, 28, 37 with m fixed at 3 and
// cn_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__WASMSIMD_DOT16X2_LD64, n_div_4_strided_cn) {
  for (uint32_t cur_n = 8; cur_n <= 12; cur_n += 4) {
    for (size_t cur_k = 1; cur_k <= 40; cur_k += 9) {
      GemmMicrokernelTester tester;
      tester.mr(3).nr(4).kr(2).sr(1);
      tester.m(3).n(cur_n).k(cur_k);
      tester.cn_stride(7);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld64,
          xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
33418
// Sweeps n over 8 and 12 and k over 1, 10, 19, 28, 37 with m fixed at 3 and
// a_stride set to 43.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__WASMSIMD_DOT16X2_LD64, n_div_4_strided_a) {
  for (uint32_t cur_n = 8; cur_n <= 12; cur_n += 4) {
    for (size_t cur_k = 1; cur_k <= 40; cur_k += 9) {
      GemmMicrokernelTester tester;
      tester.mr(3).nr(4).kr(2).sr(1);
      tester.m(3).n(cur_n).k(cur_k);
      tester.a_stride(43);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld64,
          xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
33435
// Sweeps n over 8 and 12 and k over 1, 10, 19, 28, 37, trying every m in 1..3
// with a single iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__WASMSIMD_DOT16X2_LD64, n_div_4_subtile) {
  for (uint32_t cur_n = 8; cur_n <= 12; cur_n += 4) {
    for (size_t cur_k = 1; cur_k <= 40; cur_k += 9) {
      for (uint32_t cur_m = 1; cur_m <= 3; ++cur_m) {
        GemmMicrokernelTester tester;
        tester.mr(3).nr(4).kr(2).sr(1);
        tester.m(cur_m).n(cur_n).k(cur_k);
        tester.iterations(1);
        tester.Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld64,
            xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
33454
// Sweeps k over {1, 10, 19, 28, 37}, n over 1..4 and m over 1..3 with
// cm_stride set to 7 and iterations set to 1.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__WASMSIMD_DOT16X2_LD64, strided_cm_subtile) {
  for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
    for (uint32_t n_dim = 1; n_dim <= 4; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 3; ++m_dim) {
        GemmMicrokernelTester()
          .mr(3).nr(4).kr(2).sr(1)
          .m(m_dim).n(n_dim).k(k_dim)
          .cm_stride(7)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld64,
                xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
                xnn_qu8_requantize_fp32);
      }
    }
  }
}
33474
// Single run at m=3, n=4, k=8 with qmin set to 128.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__WASMSIMD_DOT16X2_LD64, qmin) {
  GemmMicrokernelTester()
    .mr(3).nr(4).kr(2).sr(1)
    .m(3).n(4).k(8)
    .qmin(128)
    .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld64,
          xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
          xnn_qu8_requantize_fp32);
}
33487
// Single run at m=3, n=4, k=8 with qmax set to 128.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__WASMSIMD_DOT16X2_LD64, qmax) {
  GemmMicrokernelTester()
    .mr(3).nr(4).kr(2).sr(1)
    .m(3).n(4).k(8)
    .qmax(128)
    .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld64,
          xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
          xnn_qu8_requantize_fp32);
}
33500
// Single run at m=3, n=4, k=8 with cm_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__WASMSIMD_DOT16X2_LD64, strided_cm) {
  GemmMicrokernelTester()
    .mr(3).nr(4).kr(2).sr(1)
    .m(3).n(4).k(8)
    .cm_stride(7)
    .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld64,
          xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
          xnn_qu8_requantize_fp32);
}
33513
// Sweeps k over {1, 10, 19, 28, 37} at m=3, n=4 with a_zero_point set to 0.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__WASMSIMD_DOT16X2_LD64, no_a_zero_point) {
  for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
    GemmMicrokernelTester()
      .mr(3).nr(4).kr(2).sr(1)
      .m(3).n(4).k(k_dim)
      .a_zero_point(0)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld64,
            xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
            xnn_qu8_requantize_fp32);
  }
}
33528
// Sweeps k over {1, 10, 19, 28, 37} at m=3, n=4 with b_zero_point set to 0.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__WASMSIMD_DOT16X2_LD64, no_b_zero_point) {
  for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
    GemmMicrokernelTester()
      .mr(3).nr(4).kr(2).sr(1)
      .m(3).n(4).k(k_dim)
      .b_zero_point(0)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld64,
            xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
            xnn_qu8_requantize_fp32);
  }
}
33543
// Sweeps k over {1, 10, 19, 28, 37} at m=3, n=4 with both a_zero_point and
// b_zero_point set to 0.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__WASMSIMD_DOT16X2_LD64, no_zero_point) {
  for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
    GemmMicrokernelTester()
      .mr(3).nr(4).kr(2).sr(1)
      .m(3).n(4).k(k_dim)
      .a_zero_point(0)
      .b_zero_point(0)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld64,
            xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
            xnn_qu8_requantize_fp32);
  }
}
33559 #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
33560
33561
33562 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
// Single run with m=3, n=4 (matching mr and nr) and k=8.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__WASMSIMD_DOT16X2_LD128, k_eq_8) {
  GemmMicrokernelTester()
    .mr(3).nr(4).kr(2).sr(1)
    .m(3).n(4).k(8)
    .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld128,
          xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
          xnn_qu8_requantize_fp32);
}
33574
// Single run at m=3, n=4, k=8 with cn_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__WASMSIMD_DOT16X2_LD128, strided_cn) {
  GemmMicrokernelTester()
    .mr(3).nr(4).kr(2).sr(1)
    .m(3).n(4).k(8)
    .cn_stride(7)
    .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld128,
          xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
          xnn_qu8_requantize_fp32);
}
33587
// Single run at m=3, n=4, k=8 with a_stride set to 11.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__WASMSIMD_DOT16X2_LD128, k_eq_8_strided_a) {
  GemmMicrokernelTester()
    .mr(3).nr(4).kr(2).sr(1)
    .m(3).n(4).k(8)
    .a_stride(11)
    .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld128,
          xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
          xnn_qu8_requantize_fp32);
}
33600
// Sweeps n over 1..4 and m over 1..3 at k=8, with iterations set to 1.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__WASMSIMD_DOT16X2_LD128, k_eq_8_subtile) {
  for (uint32_t n_dim = 1; n_dim <= 4; ++n_dim) {
    for (uint32_t m_dim = 1; m_dim <= 3; ++m_dim) {
      GemmMicrokernelTester()
        .mr(3).nr(4).kr(2).sr(1)
        .m(m_dim).n(n_dim).k(8)
        .iterations(1)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld128,
              xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
33617
// Sweeps m over 1..3 at n=4, k=8, with iterations set to 1.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__WASMSIMD_DOT16X2_LD128, k_eq_8_subtile_m) {
  for (uint32_t m_dim = 1; m_dim <= 3; ++m_dim) {
    GemmMicrokernelTester()
      .mr(3).nr(4).kr(2).sr(1)
      .m(m_dim).n(4).k(8)
      .iterations(1)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld128,
            xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
            xnn_qu8_requantize_fp32);
  }
}
33632
// Sweeps n over 1..4 at m=3, k=8, with iterations set to 1.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__WASMSIMD_DOT16X2_LD128, k_eq_8_subtile_n) {
  for (uint32_t n_dim = 1; n_dim <= 4; ++n_dim) {
    GemmMicrokernelTester()
      .mr(3).nr(4).kr(2).sr(1)
      .m(3).n(n_dim).k(8)
      .iterations(1)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld128,
            xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
            xnn_qu8_requantize_fp32);
  }
}
33647
// Sweeps k over 1..7 at m=3, n=4.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__WASMSIMD_DOT16X2_LD128, k_lt_8) {
  for (size_t k_dim = 1; k_dim < 8; ++k_dim) {
    GemmMicrokernelTester()
      .mr(3).nr(4).kr(2).sr(1)
      .m(3).n(4).k(k_dim)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld128,
            xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
            xnn_qu8_requantize_fp32);
  }
}
33661
// Sweeps k over 1..7 at m=3, n=4 with a_stride set to 11.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__WASMSIMD_DOT16X2_LD128, k_lt_8_strided_a) {
  for (size_t k_dim = 1; k_dim < 8; ++k_dim) {
    GemmMicrokernelTester()
      .mr(3).nr(4).kr(2).sr(1)
      .m(3).n(4).k(k_dim)
      .a_stride(11)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld128,
            xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
            xnn_qu8_requantize_fp32);
  }
}
33676
// Sweeps k over 1..7, n over 1..4 and m over 1..3, with iterations set to 1.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__WASMSIMD_DOT16X2_LD128, k_lt_8_subtile) {
  for (size_t k_dim = 1; k_dim < 8; ++k_dim) {
    for (uint32_t n_dim = 1; n_dim <= 4; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 3; ++m_dim) {
        GemmMicrokernelTester()
          .mr(3).nr(4).kr(2).sr(1)
          .m(m_dim).n(n_dim).k(k_dim)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld128,
                xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
                xnn_qu8_requantize_fp32);
      }
    }
  }
}
33695
// Sweeps k over 9..15 at m=3, n=4.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__WASMSIMD_DOT16X2_LD128, k_gt_8) {
  for (size_t k_dim = 9; k_dim < 16; ++k_dim) {
    GemmMicrokernelTester()
      .mr(3).nr(4).kr(2).sr(1)
      .m(3).n(4).k(k_dim)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld128,
            xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
            xnn_qu8_requantize_fp32);
  }
}
33709
// Sweeps k over 9..15 at m=3, n=4 with a_stride set to 19.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__WASMSIMD_DOT16X2_LD128, k_gt_8_strided_a) {
  for (size_t k_dim = 9; k_dim < 16; ++k_dim) {
    GemmMicrokernelTester()
      .mr(3).nr(4).kr(2).sr(1)
      .m(3).n(4).k(k_dim)
      .a_stride(19)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld128,
            xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
            xnn_qu8_requantize_fp32);
  }
}
33724
// Sweeps k over 9..15, n over 1..4 and m over 1..3, with iterations set to 1.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__WASMSIMD_DOT16X2_LD128, k_gt_8_subtile) {
  for (size_t k_dim = 9; k_dim < 16; ++k_dim) {
    for (uint32_t n_dim = 1; n_dim <= 4; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 3; ++m_dim) {
        GemmMicrokernelTester()
          .mr(3).nr(4).kr(2).sr(1)
          .m(m_dim).n(n_dim).k(k_dim)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld128,
                xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
                xnn_qu8_requantize_fp32);
      }
    }
  }
}
33743
// Sweeps k over the multiples of 8 from 16 through 80 at m=3, n=4.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__WASMSIMD_DOT16X2_LD128, k_div_8) {
  for (size_t k_dim = 16; k_dim <= 80; k_dim += 8) {
    GemmMicrokernelTester()
      .mr(3).nr(4).kr(2).sr(1)
      .m(3).n(4).k(k_dim)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld128,
            xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
            xnn_qu8_requantize_fp32);
  }
}
33757
// Sweeps k over the multiples of 8 from 16 through 80 at m=3, n=4 with
// a_stride set to 83.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__WASMSIMD_DOT16X2_LD128, k_div_8_strided_a) {
  for (size_t k_dim = 16; k_dim <= 80; k_dim += 8) {
    GemmMicrokernelTester()
      .mr(3).nr(4).kr(2).sr(1)
      .m(3).n(4).k(k_dim)
      .a_stride(83)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld128,
            xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
            xnn_qu8_requantize_fp32);
  }
}
33772
// Sweeps k over the multiples of 8 from 16 through 80, n over 1..4 and
// m over 1..3, with iterations set to 1.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__WASMSIMD_DOT16X2_LD128, k_div_8_subtile) {
  for (size_t k_dim = 16; k_dim <= 80; k_dim += 8) {
    for (uint32_t n_dim = 1; n_dim <= 4; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 3; ++m_dim) {
        GemmMicrokernelTester()
          .mr(3).nr(4).kr(2).sr(1)
          .m(m_dim).n(n_dim).k(k_dim)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld128,
                xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
                xnn_qu8_requantize_fp32);
      }
    }
  }
}
33791
// Sweeps n over 5..7 and k over {1, 10, 19, 28, 37} with m fixed at 3.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__WASMSIMD_DOT16X2_LD128, n_gt_4) {
  for (uint32_t n_dim = 5; n_dim < 8; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      GemmMicrokernelTester()
        .mr(3).nr(4).kr(2).sr(1)
        .m(3).n(n_dim).k(k_dim)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld128,
              xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
33807
// Sweeps n over 5..7 and k over {1, 10, 19, 28, 37} with m fixed at 3
// and cn_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__WASMSIMD_DOT16X2_LD128, n_gt_4_strided_cn) {
  for (uint32_t n_dim = 5; n_dim < 8; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      GemmMicrokernelTester()
        .mr(3).nr(4).kr(2).sr(1)
        .m(3).n(n_dim).k(k_dim)
        .cn_stride(7)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld128,
              xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
33824
// Sweeps n over 5..7 and k over {1, 10, 19, 28, 37} with m fixed at 3
// and a_stride set to 43.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__WASMSIMD_DOT16X2_LD128, n_gt_4_strided_a) {
  for (uint32_t n_dim = 5; n_dim < 8; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      GemmMicrokernelTester()
        .mr(3).nr(4).kr(2).sr(1)
        .m(3).n(n_dim).k(k_dim)
        .a_stride(43)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld128,
              xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
33841
// Sweeps n over 5..7, k over {1, 10, 19, 28, 37} and m over 1..3,
// with iterations set to 1.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__WASMSIMD_DOT16X2_LD128, n_gt_4_subtile) {
  for (uint32_t n_dim = 5; n_dim < 8; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      for (uint32_t m_dim = 1; m_dim <= 3; ++m_dim) {
        GemmMicrokernelTester()
          .mr(3).nr(4).kr(2).sr(1)
          .m(m_dim).n(n_dim).k(k_dim)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld128,
                xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
                xnn_qu8_requantize_fp32);
      }
    }
  }
}
33860
// Sweeps n over {8, 12} and k over {1, 10, 19, 28, 37} with m fixed at 3.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__WASMSIMD_DOT16X2_LD128, n_div_4) {
  for (uint32_t n_dim = 8; n_dim <= 12; n_dim += 4) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      GemmMicrokernelTester()
        .mr(3).nr(4).kr(2).sr(1)
        .m(3).n(n_dim).k(k_dim)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld128,
              xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
33876
// Sweeps n over {8, 12} and k over {1, 10, 19, 28, 37} with m fixed at 3
// and cn_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__WASMSIMD_DOT16X2_LD128, n_div_4_strided_cn) {
  for (uint32_t n_dim = 8; n_dim <= 12; n_dim += 4) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      GemmMicrokernelTester()
        .mr(3).nr(4).kr(2).sr(1)
        .m(3).n(n_dim).k(k_dim)
        .cn_stride(7)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld128,
              xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
33893
// Sweeps n over {8, 12} and k over {1, 10, 19, 28, 37} with m fixed at 3
// and a_stride set to 43.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__WASMSIMD_DOT16X2_LD128, n_div_4_strided_a) {
  for (uint32_t n_dim = 8; n_dim <= 12; n_dim += 4) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      GemmMicrokernelTester()
        .mr(3).nr(4).kr(2).sr(1)
        .m(3).n(n_dim).k(k_dim)
        .a_stride(43)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld128,
              xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
33910
// Sweeps n over {8, 12}, k over {1, 10, 19, 28, 37} and m over 1..3,
// with iterations set to 1.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__WASMSIMD_DOT16X2_LD128, n_div_4_subtile) {
  for (uint32_t n_dim = 8; n_dim <= 12; n_dim += 4) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      for (uint32_t m_dim = 1; m_dim <= 3; ++m_dim) {
        GemmMicrokernelTester()
          .mr(3).nr(4).kr(2).sr(1)
          .m(m_dim).n(n_dim).k(k_dim)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld128,
                xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
                xnn_qu8_requantize_fp32);
      }
    }
  }
}
33929
// Sweeps k over {1, 10, 19, 28, 37}, n over 1..4 and m over 1..3 with
// cm_stride set to 7 and iterations set to 1.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__WASMSIMD_DOT16X2_LD128, strided_cm_subtile) {
  for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
    for (uint32_t n_dim = 1; n_dim <= 4; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 3; ++m_dim) {
        GemmMicrokernelTester()
          .mr(3).nr(4).kr(2).sr(1)
          .m(m_dim).n(n_dim).k(k_dim)
          .cm_stride(7)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld128,
                xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
                xnn_qu8_requantize_fp32);
      }
    }
  }
}
33949
// Single run at m=3, n=4, k=8 with qmin set to 128.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__WASMSIMD_DOT16X2_LD128, qmin) {
  GemmMicrokernelTester()
    .mr(3).nr(4).kr(2).sr(1)
    .m(3).n(4).k(8)
    .qmin(128)
    .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld128,
          xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
          xnn_qu8_requantize_fp32);
}
33962
// Single run at m=3, n=4, k=8 with qmax set to 128.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__WASMSIMD_DOT16X2_LD128, qmax) {
  GemmMicrokernelTester()
    .mr(3).nr(4).kr(2).sr(1)
    .m(3).n(4).k(8)
    .qmax(128)
    .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld128,
          xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
          xnn_qu8_requantize_fp32);
}
33975
// Single run at m=3, n=4, k=8 with cm_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__WASMSIMD_DOT16X2_LD128, strided_cm) {
  GemmMicrokernelTester()
    .mr(3).nr(4).kr(2).sr(1)
    .m(3).n(4).k(8)
    .cm_stride(7)
    .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld128,
          xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
          xnn_qu8_requantize_fp32);
}
33988
// Sweeps k over {1, 10, 19, 28, 37} at m=3, n=4 with a_zero_point set to 0.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__WASMSIMD_DOT16X2_LD128, no_a_zero_point) {
  for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
    GemmMicrokernelTester()
      .mr(3).nr(4).kr(2).sr(1)
      .m(3).n(4).k(k_dim)
      .a_zero_point(0)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld128,
            xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
            xnn_qu8_requantize_fp32);
  }
}
34003
// Sweeps k over {1, 10, 19, 28, 37} at m=3, n=4 with b_zero_point set to 0.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__WASMSIMD_DOT16X2_LD128, no_b_zero_point) {
  for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
    GemmMicrokernelTester()
      .mr(3).nr(4).kr(2).sr(1)
      .m(3).n(4).k(k_dim)
      .b_zero_point(0)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld128,
            xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
            xnn_qu8_requantize_fp32);
  }
}
34018
// Sweeps k over {1, 10, 19, 28, 37} at m=3, n=4 with both a_zero_point and
// b_zero_point set to 0.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2__WASMSIMD_DOT16X2_LD128, no_zero_point) {
  for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
    GemmMicrokernelTester()
      .mr(3).nr(4).kr(2).sr(1)
      .m(3).n(4).k(k_dim)
      .a_zero_point(0)
      .b_zero_point(0)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld128,
            xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
            xnn_qu8_requantize_fp32);
  }
}
34034 #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
34035
34036
34037 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
// Single run with m=3, n=4 (matching mr and nr) and k=8; sr is 4 here.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__WASMSIMD_DOT16X2_LD64, k_eq_8) {
  GemmMicrokernelTester()
    .mr(3).nr(4).kr(2).sr(4)
    .m(3).n(4).k(8)
    .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld64,
          xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
          xnn_qu8_requantize_fp32);
}
34049
// Single run at m=3, n=4, k=8 with cn_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__WASMSIMD_DOT16X2_LD64, strided_cn) {
  GemmMicrokernelTester()
    .mr(3).nr(4).kr(2).sr(4)
    .m(3).n(4).k(8)
    .cn_stride(7)
    .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld64,
          xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
          xnn_qu8_requantize_fp32);
}
34062
// Single run at m=3, n=4, k=8 with a_stride set to 11.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__WASMSIMD_DOT16X2_LD64, k_eq_8_strided_a) {
  GemmMicrokernelTester()
    .mr(3).nr(4).kr(2).sr(4)
    .m(3).n(4).k(8)
    .a_stride(11)
    .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld64,
          xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
          xnn_qu8_requantize_fp32);
}
34075
// Sweeps n over 1..4 and m over 1..3 at k=8, with iterations set to 1.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__WASMSIMD_DOT16X2_LD64, k_eq_8_subtile) {
  for (uint32_t n_dim = 1; n_dim <= 4; ++n_dim) {
    for (uint32_t m_dim = 1; m_dim <= 3; ++m_dim) {
      GemmMicrokernelTester()
        .mr(3).nr(4).kr(2).sr(4)
        .m(m_dim).n(n_dim).k(8)
        .iterations(1)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld64,
              xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
34092
// Sweeps m over 1..3 at n=4, k=8, with iterations set to 1.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__WASMSIMD_DOT16X2_LD64, k_eq_8_subtile_m) {
  for (uint32_t m_dim = 1; m_dim <= 3; ++m_dim) {
    GemmMicrokernelTester()
      .mr(3).nr(4).kr(2).sr(4)
      .m(m_dim).n(4).k(8)
      .iterations(1)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld64,
            xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
            xnn_qu8_requantize_fp32);
  }
}
34107
// Sweeps n over 1..4 at m=3, k=8, with iterations set to 1.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__WASMSIMD_DOT16X2_LD64, k_eq_8_subtile_n) {
  for (uint32_t n_dim = 1; n_dim <= 4; ++n_dim) {
    GemmMicrokernelTester()
      .mr(3).nr(4).kr(2).sr(4)
      .m(3).n(n_dim).k(8)
      .iterations(1)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld64,
            xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
            xnn_qu8_requantize_fp32);
  }
}
34122
// Sweeps k over 1..7 at m=3, n=4.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__WASMSIMD_DOT16X2_LD64, k_lt_8) {
  for (size_t k_dim = 1; k_dim < 8; ++k_dim) {
    GemmMicrokernelTester()
      .mr(3).nr(4).kr(2).sr(4)
      .m(3).n(4).k(k_dim)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld64,
            xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
            xnn_qu8_requantize_fp32);
  }
}
34136
// Sweeps k over 1..7 at m=3, n=4 with a_stride set to 11.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__WASMSIMD_DOT16X2_LD64, k_lt_8_strided_a) {
  for (size_t k_dim = 1; k_dim < 8; ++k_dim) {
    GemmMicrokernelTester()
      .mr(3).nr(4).kr(2).sr(4)
      .m(3).n(4).k(k_dim)
      .a_stride(11)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld64,
            xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
            xnn_qu8_requantize_fp32);
  }
}
34151
// Sweeps k over 1..7, n over 1..4 and m over 1..3, with iterations set to 1.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__WASMSIMD_DOT16X2_LD64, k_lt_8_subtile) {
  for (size_t k_dim = 1; k_dim < 8; ++k_dim) {
    for (uint32_t n_dim = 1; n_dim <= 4; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 3; ++m_dim) {
        GemmMicrokernelTester()
          .mr(3).nr(4).kr(2).sr(4)
          .m(m_dim).n(n_dim).k(k_dim)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld64,
                xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
                xnn_qu8_requantize_fp32);
      }
    }
  }
}
34170
// Sweeps k over 9..15 at m=3, n=4.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__WASMSIMD_DOT16X2_LD64, k_gt_8) {
  for (size_t k_dim = 9; k_dim < 16; ++k_dim) {
    GemmMicrokernelTester()
      .mr(3).nr(4).kr(2).sr(4)
      .m(3).n(4).k(k_dim)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld64,
            xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
            xnn_qu8_requantize_fp32);
  }
}
34184
// Sweeps k over 9..15 at m=3, n=4 with a_stride set to 19.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__WASMSIMD_DOT16X2_LD64, k_gt_8_strided_a) {
  for (size_t k_dim = 9; k_dim < 16; ++k_dim) {
    GemmMicrokernelTester()
      .mr(3).nr(4).kr(2).sr(4)
      .m(3).n(4).k(k_dim)
      .a_stride(19)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld64,
            xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
            xnn_qu8_requantize_fp32);
  }
}
34199
// Sweeps k over 9..15, n over 1..4 and m over 1..3, with iterations set to 1.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__WASMSIMD_DOT16X2_LD64, k_gt_8_subtile) {
  for (size_t k_dim = 9; k_dim < 16; ++k_dim) {
    for (uint32_t n_dim = 1; n_dim <= 4; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 3; ++m_dim) {
        GemmMicrokernelTester()
          .mr(3).nr(4).kr(2).sr(4)
          .m(m_dim).n(n_dim).k(k_dim)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld64,
                xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
                xnn_qu8_requantize_fp32);
      }
    }
  }
}
34218
// Sweeps k over the multiples of 8 from 16 through 80 at m=3, n=4.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__WASMSIMD_DOT16X2_LD64, k_div_8) {
  for (size_t k_dim = 16; k_dim <= 80; k_dim += 8) {
    GemmMicrokernelTester()
      .mr(3).nr(4).kr(2).sr(4)
      .m(3).n(4).k(k_dim)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld64,
            xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
            xnn_qu8_requantize_fp32);
  }
}
34232
// Sweeps k over the multiples of 8 from 16 through 80 at m=3, n=4 with
// a_stride set to 83.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__WASMSIMD_DOT16X2_LD64, k_div_8_strided_a) {
  for (size_t k_dim = 16; k_dim <= 80; k_dim += 8) {
    GemmMicrokernelTester()
      .mr(3).nr(4).kr(2).sr(4)
      .m(3).n(4).k(k_dim)
      .a_stride(83)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld64,
            xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
            xnn_qu8_requantize_fp32);
  }
}
34247
// Sweeps k over the multiples of 8 from 16 through 80, n over 1..4 and
// m over 1..3, with iterations set to 1.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__WASMSIMD_DOT16X2_LD64, k_div_8_subtile) {
  for (size_t k_dim = 16; k_dim <= 80; k_dim += 8) {
    for (uint32_t n_dim = 1; n_dim <= 4; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 3; ++m_dim) {
        GemmMicrokernelTester()
          .mr(3).nr(4).kr(2).sr(4)
          .m(m_dim).n(n_dim).k(k_dim)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld64,
                xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
                xnn_qu8_requantize_fp32);
      }
    }
  }
}
34266
// Sweeps n over 5..7 and k over {1, 10, 19, 28, 37} with m fixed at 3.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__WASMSIMD_DOT16X2_LD64, n_gt_4) {
  for (uint32_t n_dim = 5; n_dim < 8; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      GemmMicrokernelTester()
        .mr(3).nr(4).kr(2).sr(4)
        .m(3).n(n_dim).k(k_dim)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld64,
              xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
34282
// Sweeps n over 5..7 and k over {1, 10, 19, 28, 37} with m fixed at 3
// and cn_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__WASMSIMD_DOT16X2_LD64, n_gt_4_strided_cn) {
  for (uint32_t n_dim = 5; n_dim < 8; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      GemmMicrokernelTester()
        .mr(3).nr(4).kr(2).sr(4)
        .m(3).n(n_dim).k(k_dim)
        .cn_stride(7)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld64,
              xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
34299
// Sweeps n over 5..7 and k over {1, 10, 19, 28, 37} with m fixed at 3
// and a_stride set to 43.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__WASMSIMD_DOT16X2_LD64, n_gt_4_strided_a) {
  for (uint32_t n_dim = 5; n_dim < 8; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      GemmMicrokernelTester()
        .mr(3).nr(4).kr(2).sr(4)
        .m(3).n(n_dim).k(k_dim)
        .a_stride(43)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld64,
              xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
34316
// Sweeps n over 5..7, k over {1, 10, 19, 28, 37} and m over 1..3,
// with iterations set to 1.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__WASMSIMD_DOT16X2_LD64, n_gt_4_subtile) {
  for (uint32_t n_dim = 5; n_dim < 8; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      for (uint32_t m_dim = 1; m_dim <= 3; ++m_dim) {
        GemmMicrokernelTester()
          .mr(3).nr(4).kr(2).sr(4)
          .m(m_dim).n(n_dim).k(k_dim)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld64,
                xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
                xnn_qu8_requantize_fp32);
      }
    }
  }
}
34335
// Sweeps n over {8, 12} and k over {1, 10, 19, 28, 37} with m fixed at 3.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__WASMSIMD_DOT16X2_LD64, n_div_4) {
  for (uint32_t n_dim = 8; n_dim <= 12; n_dim += 4) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      GemmMicrokernelTester()
        .mr(3).nr(4).kr(2).sr(4)
        .m(3).n(n_dim).k(k_dim)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld64,
              xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
34351
// Sweeps n over {8, 12} and k over {1, 10, 19, 28, 37} with m fixed at 3
// and cn_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__WASMSIMD_DOT16X2_LD64, n_div_4_strided_cn) {
  for (uint32_t n_dim = 8; n_dim <= 12; n_dim += 4) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      GemmMicrokernelTester()
        .mr(3).nr(4).kr(2).sr(4)
        .m(3).n(n_dim).k(k_dim)
        .cn_stride(7)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld64,
              xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
34368
// Sweeps n over {8, 12} and k over {1, 10, 19, 28, 37} with m fixed at 3
// and a_stride set to 43.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__WASMSIMD_DOT16X2_LD64, n_div_4_strided_a) {
  for (uint32_t n_dim = 8; n_dim <= 12; n_dim += 4) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      GemmMicrokernelTester()
        .mr(3).nr(4).kr(2).sr(4)
        .m(3).n(n_dim).k(k_dim)
        .a_stride(43)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld64,
              xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
34385
// Sweeps n over {8, 12}, k over {1, 10, 19, 28, 37} and m over 1..3,
// with iterations set to 1.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__WASMSIMD_DOT16X2_LD64, n_div_4_subtile) {
  for (uint32_t n_dim = 8; n_dim <= 12; n_dim += 4) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      for (uint32_t m_dim = 1; m_dim <= 3; ++m_dim) {
        GemmMicrokernelTester()
          .mr(3).nr(4).kr(2).sr(4)
          .m(m_dim).n(n_dim).k(k_dim)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld64,
                xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
                xnn_qu8_requantize_fp32);
      }
    }
  }
}
34404
// Sweeps k over {1, 10, 19, 28, 37}, n over 1..4 and m over 1..3 with
// cm_stride set to 7 and iterations set to 1.
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__WASMSIMD_DOT16X2_LD64, strided_cm_subtile) {
  for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
    for (uint32_t n_dim = 1; n_dim <= 4; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 3; ++m_dim) {
        GemmMicrokernelTester()
          .mr(3).nr(4).kr(2).sr(4)
          .m(m_dim).n(n_dim).k(k_dim)
          .cm_stride(7)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld64,
                xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
                xnn_qu8_requantize_fp32);
      }
    }
  }
}
34424
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__WASMSIMD_DOT16X2_LD64, qmin) {
  // Single 3x4 case with k = 8 and qmin set to 128.
  GemmMicrokernelTester()
    .mr(3).nr(4).kr(2).sr(4)
    .m(3).n(4).k(8)
    .qmin(128)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld64,
      xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
      xnn_qu8_requantize_fp32);
}
34437
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__WASMSIMD_DOT16X2_LD64, qmax) {
  // Single 3x4 case with k = 8 and qmax set to 128.
  GemmMicrokernelTester()
    .mr(3).nr(4).kr(2).sr(4)
    .m(3).n(4).k(8)
    .qmax(128)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld64,
      xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
      xnn_qu8_requantize_fp32);
}
34450
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__WASMSIMD_DOT16X2_LD64, strided_cm) {
  // Single 3x4 case with k = 8 and cm_stride set to 7.
  GemmMicrokernelTester()
    .mr(3).nr(4).kr(2).sr(4)
    .m(3).n(4).k(8)
    .cm_stride(7)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld64,
      xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
      xnn_qu8_requantize_fp32);
}
34463
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__WASMSIMD_DOT16X2_LD64, no_a_zero_point) {
  // Full 3x4 tile for k in {1, 10, 19, 28, 37} with a_zero_point set to 0.
  for (size_t depth = 1; depth <= 40; depth += 9) {
    GemmMicrokernelTester()
      .mr(3).nr(4).kr(2).sr(4)
      .m(3).n(4).k(depth)
      .a_zero_point(0)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld64,
        xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
        xnn_qu8_requantize_fp32);
  }
}
34478
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__WASMSIMD_DOT16X2_LD64, no_b_zero_point) {
  // Full 3x4 tile for k in {1, 10, 19, 28, 37} with b_zero_point set to 0.
  for (size_t depth = 1; depth <= 40; depth += 9) {
    GemmMicrokernelTester()
      .mr(3).nr(4).kr(2).sr(4)
      .m(3).n(4).k(depth)
      .b_zero_point(0)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld64,
        xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
        xnn_qu8_requantize_fp32);
  }
}
34493
TEST(QU8_GEMM_MINMAX_FP32_3X4C2S4__WASMSIMD_DOT16X2_LD64, no_zero_point) {
  // Full 3x4 tile for k in {1, 10, 19, 28, 37} with both a_zero_point and
  // b_zero_point set to 0.
  for (size_t depth = 1; depth <= 40; depth += 9) {
    GemmMicrokernelTester()
      .mr(3).nr(4).kr(2).sr(4)
      .m(3).n(4).k(depth)
      .a_zero_point(0)
      .b_zero_point(0)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld64,
        xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
        xnn_qu8_requantize_fp32);
  }
}
34509 #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
34510
34511
34512 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_DOT16X2_LD128, k_eq_8) {
  // Single full 3x4 tile with k = 8.
  GemmMicrokernelTester()
    .mr(3).nr(4).kr(8).sr(1)
    .m(3).n(4).k(8)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld128,
      xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
      xnn_qu8_requantize_fp32);
}
34524
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_DOT16X2_LD128, strided_cn) {
  // Single full 3x4 tile with k = 8 and cn_stride set to 7.
  GemmMicrokernelTester()
    .mr(3).nr(4).kr(8).sr(1)
    .m(3).n(4).k(8)
    .cn_stride(7)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld128,
      xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
      xnn_qu8_requantize_fp32);
}
34537
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_DOT16X2_LD128, k_eq_8_strided_a) {
  // Single full 3x4 tile with k = 8 and a_stride set to 11.
  GemmMicrokernelTester()
    .mr(3).nr(4).kr(8).sr(1)
    .m(3).n(4).k(8)
    .a_stride(11)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld128,
      xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
      xnn_qu8_requantize_fp32);
}
34550
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_DOT16X2_LD128, k_eq_8_subtile) {
  // With k = 8, tries every (n, m) with n in 1..4 and m in 1..3,
  // with iterations set to 1.
  for (uint32_t cols = 1; cols <= 4; ++cols) {
    for (uint32_t rows = 1; rows <= 3; ++rows) {
      GemmMicrokernelTester()
        .mr(3).nr(4).kr(8).sr(1)
        .m(rows).n(cols).k(8)
        .iterations(1)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld128,
          xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
34567
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_DOT16X2_LD128, k_eq_8_subtile_m) {
  // With n = 4 and k = 8, tries every m from 1 to 3, with iterations set to 1.
  for (uint32_t rows = 1; rows <= 3; ++rows) {
    GemmMicrokernelTester()
      .mr(3).nr(4).kr(8).sr(1)
      .m(rows).n(4).k(8)
      .iterations(1)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld128,
        xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
        xnn_qu8_requantize_fp32);
  }
}
34582
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_DOT16X2_LD128, k_eq_8_subtile_n) {
  // With m = 3 and k = 8, tries every n from 1 to 4, with iterations set to 1.
  for (uint32_t cols = 1; cols <= 4; ++cols) {
    GemmMicrokernelTester()
      .mr(3).nr(4).kr(8).sr(1)
      .m(3).n(cols).k(8)
      .iterations(1)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld128,
        xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
        xnn_qu8_requantize_fp32);
  }
}
34597
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_DOT16X2_LD128, k_lt_8) {
  // Full 3x4 tile for every k from 1 to 7.
  for (size_t depth = 1; depth < 8; ++depth) {
    GemmMicrokernelTester()
      .mr(3).nr(4).kr(8).sr(1)
      .m(3).n(4).k(depth)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld128,
        xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
        xnn_qu8_requantize_fp32);
  }
}
34611
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_DOT16X2_LD128, k_lt_8_strided_a) {
  // Full 3x4 tile for every k from 1 to 7, with a_stride set to 11.
  for (size_t depth = 1; depth < 8; ++depth) {
    GemmMicrokernelTester()
      .mr(3).nr(4).kr(8).sr(1)
      .m(3).n(4).k(depth)
      .a_stride(11)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld128,
        xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
        xnn_qu8_requantize_fp32);
  }
}
34626
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_DOT16X2_LD128, k_lt_8_subtile) {
  // For every k from 1 to 7, tries every (n, m) with n in 1..4 and m in 1..3,
  // with iterations set to 1.
  for (size_t depth = 1; depth < 8; ++depth) {
    for (uint32_t cols = 1; cols <= 4; ++cols) {
      for (uint32_t rows = 1; rows <= 3; ++rows) {
        GemmMicrokernelTester()
          .mr(3).nr(4).kr(8).sr(1)
          .m(rows).n(cols).k(depth)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld128,
            xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
34645
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_DOT16X2_LD128, k_gt_8) {
  // Full 3x4 tile for every k from 9 to 15.
  for (size_t depth = 9; depth < 16; ++depth) {
    GemmMicrokernelTester()
      .mr(3).nr(4).kr(8).sr(1)
      .m(3).n(4).k(depth)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld128,
        xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
        xnn_qu8_requantize_fp32);
  }
}
34659
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_DOT16X2_LD128, k_gt_8_strided_a) {
  // Full 3x4 tile for every k from 9 to 15, with a_stride set to 19.
  for (size_t depth = 9; depth < 16; ++depth) {
    GemmMicrokernelTester()
      .mr(3).nr(4).kr(8).sr(1)
      .m(3).n(4).k(depth)
      .a_stride(19)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld128,
        xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
        xnn_qu8_requantize_fp32);
  }
}
34674
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_DOT16X2_LD128, k_gt_8_subtile) {
  // For every k from 9 to 15, tries every (n, m) with n in 1..4 and m in 1..3,
  // with iterations set to 1.
  for (size_t depth = 9; depth < 16; ++depth) {
    for (uint32_t cols = 1; cols <= 4; ++cols) {
      for (uint32_t rows = 1; rows <= 3; ++rows) {
        GemmMicrokernelTester()
          .mr(3).nr(4).kr(8).sr(1)
          .m(rows).n(cols).k(depth)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld128,
            xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
34693
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_DOT16X2_LD128, k_div_8) {
  // Full 3x4 tile for k = 16, 24, ..., 80 (step 8).
  for (size_t depth = 16; depth <= 80; depth += 8) {
    GemmMicrokernelTester()
      .mr(3).nr(4).kr(8).sr(1)
      .m(3).n(4).k(depth)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld128,
        xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
        xnn_qu8_requantize_fp32);
  }
}
34707
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_DOT16X2_LD128, k_div_8_strided_a) {
  // Full 3x4 tile for k = 16, 24, ..., 80 (step 8), with a_stride set to 83.
  for (size_t depth = 16; depth <= 80; depth += 8) {
    GemmMicrokernelTester()
      .mr(3).nr(4).kr(8).sr(1)
      .m(3).n(4).k(depth)
      .a_stride(83)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld128,
        xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
        xnn_qu8_requantize_fp32);
  }
}
34722
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_DOT16X2_LD128, k_div_8_subtile) {
  // For k = 16, 24, ..., 80 (step 8), tries every (n, m) with n in 1..4 and
  // m in 1..3, with iterations set to 1.
  for (size_t depth = 16; depth <= 80; depth += 8) {
    for (uint32_t cols = 1; cols <= 4; ++cols) {
      for (uint32_t rows = 1; rows <= 3; ++rows) {
        GemmMicrokernelTester()
          .mr(3).nr(4).kr(8).sr(1)
          .m(rows).n(cols).k(depth)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld128,
            xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
34741
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_DOT16X2_LD128, n_gt_4) {
  // Sweeps n over 5..7 and k over {1, 10, 19, 28, 37} with m = 3.
  for (uint32_t cols = 5; cols < 8; ++cols) {
    for (size_t depth = 1; depth <= 40; depth += 9) {
      GemmMicrokernelTester()
        .mr(3).nr(4).kr(8).sr(1)
        .m(3).n(cols).k(depth)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld128,
          xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
34757
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_DOT16X2_LD128, n_gt_4_strided_cn) {
  // Sweeps n over 5..7 and k over {1, 10, 19, 28, 37} with m = 3,
  // with cn_stride set to 7.
  for (uint32_t cols = 5; cols < 8; ++cols) {
    for (size_t depth = 1; depth <= 40; depth += 9) {
      GemmMicrokernelTester()
        .mr(3).nr(4).kr(8).sr(1)
        .m(3).n(cols).k(depth)
        .cn_stride(7)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld128,
          xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
34774
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_DOT16X2_LD128, n_gt_4_strided_a) {
  // Sweeps n over 5..7 and k over {1, 10, 19, 28, 37} with m = 3,
  // with a_stride set to 43.
  for (uint32_t cols = 5; cols < 8; ++cols) {
    for (size_t depth = 1; depth <= 40; depth += 9) {
      GemmMicrokernelTester()
        .mr(3).nr(4).kr(8).sr(1)
        .m(3).n(cols).k(depth)
        .a_stride(43)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld128,
          xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
34791
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_DOT16X2_LD128, n_gt_4_subtile) {
  // For n in 5..7 and k in {1, 10, 19, 28, 37}, tries every m from 1 to 3,
  // with iterations set to 1.
  for (uint32_t cols = 5; cols < 8; ++cols) {
    for (size_t depth = 1; depth <= 40; depth += 9) {
      for (uint32_t rows = 1; rows <= 3; ++rows) {
        GemmMicrokernelTester()
          .mr(3).nr(4).kr(8).sr(1)
          .m(rows).n(cols).k(depth)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld128,
            xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
34810
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_DOT16X2_LD128, n_div_4) {
  // Sweeps n over {8, 12} and k over {1, 10, 19, 28, 37} with m = 3.
  for (uint32_t cols = 8; cols <= 12; cols += 4) {
    for (size_t depth = 1; depth <= 40; depth += 9) {
      GemmMicrokernelTester()
        .mr(3).nr(4).kr(8).sr(1)
        .m(3).n(cols).k(depth)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld128,
          xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
34826
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_DOT16X2_LD128, n_div_4_strided_cn) {
  // Sweeps n over {8, 12} and k over {1, 10, 19, 28, 37} with m = 3,
  // with cn_stride set to 7.
  for (uint32_t cols = 8; cols <= 12; cols += 4) {
    for (size_t depth = 1; depth <= 40; depth += 9) {
      GemmMicrokernelTester()
        .mr(3).nr(4).kr(8).sr(1)
        .m(3).n(cols).k(depth)
        .cn_stride(7)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld128,
          xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
34843
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_DOT16X2_LD128, n_div_4_strided_a) {
  // Sweeps n over {8, 12} and k over {1, 10, 19, 28, 37} with m = 3,
  // with a_stride set to 43.
  for (uint32_t cols = 8; cols <= 12; cols += 4) {
    for (size_t depth = 1; depth <= 40; depth += 9) {
      GemmMicrokernelTester()
        .mr(3).nr(4).kr(8).sr(1)
        .m(3).n(cols).k(depth)
        .a_stride(43)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld128,
          xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
34860
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_DOT16X2_LD128, n_div_4_subtile) {
  // For n in {8, 12} and k in {1, 10, 19, 28, 37}, tries every m from 1 to 3,
  // with iterations set to 1.
  for (uint32_t cols = 8; cols <= 12; cols += 4) {
    for (size_t depth = 1; depth <= 40; depth += 9) {
      for (uint32_t rows = 1; rows <= 3; ++rows) {
        GemmMicrokernelTester()
          .mr(3).nr(4).kr(8).sr(1)
          .m(rows).n(cols).k(depth)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld128,
            xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
34879
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_DOT16X2_LD128, strided_cm_subtile) {
  // For k in {1, 10, 19, 28, 37}, tries every (n, m) with n in 1..4 and
  // m in 1..3, with cm_stride set to 7 and iterations set to 1.
  for (size_t depth = 1; depth <= 40; depth += 9) {
    for (uint32_t cols = 1; cols <= 4; ++cols) {
      for (uint32_t rows = 1; rows <= 3; ++rows) {
        GemmMicrokernelTester()
          .mr(3).nr(4).kr(8).sr(1)
          .m(rows).n(cols).k(depth)
          .cm_stride(7)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld128,
            xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
34899
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_DOT16X2_LD128, qmin) {
  // Single 3x4 case with k = 8 and qmin set to 128.
  GemmMicrokernelTester()
    .mr(3).nr(4).kr(8).sr(1)
    .m(3).n(4).k(8)
    .qmin(128)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld128,
      xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
      xnn_qu8_requantize_fp32);
}
34912
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_DOT16X2_LD128, qmax) {
  // Single 3x4 case with k = 8 and qmax set to 128.
  GemmMicrokernelTester()
    .mr(3).nr(4).kr(8).sr(1)
    .m(3).n(4).k(8)
    .qmax(128)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld128,
      xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
      xnn_qu8_requantize_fp32);
}
34925
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_DOT16X2_LD128, strided_cm) {
  // Single 3x4 case with k = 8 and cm_stride set to 7.
  GemmMicrokernelTester()
    .mr(3).nr(4).kr(8).sr(1)
    .m(3).n(4).k(8)
    .cm_stride(7)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld128,
      xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
      xnn_qu8_requantize_fp32);
}
34938
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_DOT16X2_LD128, no_a_zero_point) {
  // Full 3x4 tile for k in {1, 10, 19, 28, 37} with a_zero_point set to 0.
  for (size_t depth = 1; depth <= 40; depth += 9) {
    GemmMicrokernelTester()
      .mr(3).nr(4).kr(8).sr(1)
      .m(3).n(4).k(depth)
      .a_zero_point(0)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld128,
        xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
        xnn_qu8_requantize_fp32);
  }
}
34953
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_DOT16X2_LD128, no_b_zero_point) {
  // Full 3x4 tile for k in {1, 10, 19, 28, 37} with b_zero_point set to 0.
  for (size_t depth = 1; depth <= 40; depth += 9) {
    GemmMicrokernelTester()
      .mr(3).nr(4).kr(8).sr(1)
      .m(3).n(4).k(depth)
      .b_zero_point(0)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld128,
        xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
        xnn_qu8_requantize_fp32);
  }
}
34968
TEST(QU8_GEMM_MINMAX_FP32_3X4C8__WASMSIMD_DOT16X2_LD128, no_zero_point) {
  // Full 3x4 tile for k in {1, 10, 19, 28, 37} with both a_zero_point and
  // b_zero_point set to 0.
  for (size_t depth = 1; depth <= 40; depth += 9) {
    GemmMicrokernelTester()
      .mr(3).nr(4).kr(8).sr(1)
      .m(3).n(4).k(depth)
      .a_zero_point(0)
      .b_zero_point(0)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld128,
        xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
        xnn_qu8_requantize_fp32);
  }
}
34984 #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
34985
34986
34987 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__WASMSIMD_DOT16X2_LD64, k_eq_8) {
  // Single full 4x4 tile with k = 8.
  GemmMicrokernelTester()
    .mr(4).nr(4).kr(2).sr(4)
    .m(4).n(4).k(8)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld64,
      xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
      xnn_qu8_requantize_fp32);
}
34999
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__WASMSIMD_DOT16X2_LD64, strided_cn) {
  // Single full 4x4 tile with k = 8 and cn_stride set to 7.
  GemmMicrokernelTester()
    .mr(4).nr(4).kr(2).sr(4)
    .m(4).n(4).k(8)
    .cn_stride(7)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld64,
      xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
      xnn_qu8_requantize_fp32);
}
35012
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__WASMSIMD_DOT16X2_LD64, k_eq_8_strided_a) {
  // Single full 4x4 tile with k = 8 and a_stride set to 11.
  GemmMicrokernelTester()
    .mr(4).nr(4).kr(2).sr(4)
    .m(4).n(4).k(8)
    .a_stride(11)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld64,
      xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
      xnn_qu8_requantize_fp32);
}
35025
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__WASMSIMD_DOT16X2_LD64, k_eq_8_subtile) {
  // With k = 8, tries every (n, m) with n in 1..4 and m in 1..4,
  // with iterations set to 1.
  for (uint32_t cols = 1; cols <= 4; ++cols) {
    for (uint32_t rows = 1; rows <= 4; ++rows) {
      GemmMicrokernelTester()
        .mr(4).nr(4).kr(2).sr(4)
        .m(rows).n(cols).k(8)
        .iterations(1)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld64,
          xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
35042
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__WASMSIMD_DOT16X2_LD64, k_eq_8_subtile_m) {
  // With n = 4 and k = 8, tries every m from 1 to 4, with iterations set to 1.
  for (uint32_t rows = 1; rows <= 4; ++rows) {
    GemmMicrokernelTester()
      .mr(4).nr(4).kr(2).sr(4)
      .m(rows).n(4).k(8)
      .iterations(1)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld64,
        xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
        xnn_qu8_requantize_fp32);
  }
}
35057
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__WASMSIMD_DOT16X2_LD64, k_eq_8_subtile_n) {
  // With m = 4 and k = 8, tries every n from 1 to 4, with iterations set to 1.
  for (uint32_t cols = 1; cols <= 4; ++cols) {
    GemmMicrokernelTester()
      .mr(4).nr(4).kr(2).sr(4)
      .m(4).n(cols).k(8)
      .iterations(1)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld64,
        xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
        xnn_qu8_requantize_fp32);
  }
}
35072
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__WASMSIMD_DOT16X2_LD64, k_lt_8) {
  // Full 4x4 tile for every k from 1 to 7.
  for (size_t depth = 1; depth < 8; ++depth) {
    GemmMicrokernelTester()
      .mr(4).nr(4).kr(2).sr(4)
      .m(4).n(4).k(depth)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld64,
        xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
        xnn_qu8_requantize_fp32);
  }
}
35086
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__WASMSIMD_DOT16X2_LD64, k_lt_8_strided_a) {
  // Full 4x4 tile for every k from 1 to 7, with a_stride set to 11.
  for (size_t depth = 1; depth < 8; ++depth) {
    GemmMicrokernelTester()
      .mr(4).nr(4).kr(2).sr(4)
      .m(4).n(4).k(depth)
      .a_stride(11)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld64,
        xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
        xnn_qu8_requantize_fp32);
  }
}
35101
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__WASMSIMD_DOT16X2_LD64, k_lt_8_subtile) {
  // For every k from 1 to 7, tries every (n, m) with n in 1..4 and m in 1..4,
  // with iterations set to 1.
  for (size_t depth = 1; depth < 8; ++depth) {
    for (uint32_t cols = 1; cols <= 4; ++cols) {
      for (uint32_t rows = 1; rows <= 4; ++rows) {
        GemmMicrokernelTester()
          .mr(4).nr(4).kr(2).sr(4)
          .m(rows).n(cols).k(depth)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld64,
            xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
35120
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__WASMSIMD_DOT16X2_LD64, k_gt_8) {
  // Full 4x4 tile for every k from 9 to 15.
  for (size_t depth = 9; depth < 16; ++depth) {
    GemmMicrokernelTester()
      .mr(4).nr(4).kr(2).sr(4)
      .m(4).n(4).k(depth)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld64,
        xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
        xnn_qu8_requantize_fp32);
  }
}
35134
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__WASMSIMD_DOT16X2_LD64, k_gt_8_strided_a) {
  // Full 4x4 tile for every k from 9 to 15, with a_stride set to 19.
  for (size_t depth = 9; depth < 16; ++depth) {
    GemmMicrokernelTester()
      .mr(4).nr(4).kr(2).sr(4)
      .m(4).n(4).k(depth)
      .a_stride(19)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld64,
        xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
        xnn_qu8_requantize_fp32);
  }
}
35149
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__WASMSIMD_DOT16X2_LD64, k_gt_8_subtile) {
  // For every k from 9 to 15, tries every (n, m) with n in 1..4 and m in 1..4,
  // with iterations set to 1.
  for (size_t depth = 9; depth < 16; ++depth) {
    for (uint32_t cols = 1; cols <= 4; ++cols) {
      for (uint32_t rows = 1; rows <= 4; ++rows) {
        GemmMicrokernelTester()
          .mr(4).nr(4).kr(2).sr(4)
          .m(rows).n(cols).k(depth)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld64,
            xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
35168
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__WASMSIMD_DOT16X2_LD64, k_div_8) {
  // Full 4x4 tile for k = 16, 24, ..., 80 (step 8).
  for (size_t depth = 16; depth <= 80; depth += 8) {
    GemmMicrokernelTester()
      .mr(4).nr(4).kr(2).sr(4)
      .m(4).n(4).k(depth)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld64,
        xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
        xnn_qu8_requantize_fp32);
  }
}
35182
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__WASMSIMD_DOT16X2_LD64, k_div_8_strided_a) {
  // Full 4x4 tile for k = 16, 24, ..., 80 (step 8), with a_stride set to 83.
  for (size_t depth = 16; depth <= 80; depth += 8) {
    GemmMicrokernelTester()
      .mr(4).nr(4).kr(2).sr(4)
      .m(4).n(4).k(depth)
      .a_stride(83)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld64,
        xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
        xnn_qu8_requantize_fp32);
  }
}
35197
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__WASMSIMD_DOT16X2_LD64, k_div_8_subtile) {
  // For k = 16, 24, ..., 80 (step 8), tries every (n, m) with n in 1..4 and
  // m in 1..4, with iterations set to 1.
  for (size_t depth = 16; depth <= 80; depth += 8) {
    for (uint32_t cols = 1; cols <= 4; ++cols) {
      for (uint32_t rows = 1; rows <= 4; ++rows) {
        GemmMicrokernelTester()
          .mr(4).nr(4).kr(2).sr(4)
          .m(rows).n(cols).k(depth)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld64,
            xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
35216
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__WASMSIMD_DOT16X2_LD64, n_gt_4) {
  // Sweeps n over 5..7 and k over {1, 10, 19, 28, 37} with m = 4.
  for (uint32_t cols = 5; cols < 8; ++cols) {
    for (size_t depth = 1; depth <= 40; depth += 9) {
      GemmMicrokernelTester()
        .mr(4).nr(4).kr(2).sr(4)
        .m(4).n(cols).k(depth)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld64,
          xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
35232
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__WASMSIMD_DOT16X2_LD64, n_gt_4_strided_cn) {
  // Sweeps n over 5..7 and k over {1, 10, 19, 28, 37} with m = 4,
  // with cn_stride set to 7.
  for (uint32_t cols = 5; cols < 8; ++cols) {
    for (size_t depth = 1; depth <= 40; depth += 9) {
      GemmMicrokernelTester()
        .mr(4).nr(4).kr(2).sr(4)
        .m(4).n(cols).k(depth)
        .cn_stride(7)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld64,
          xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
35249
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__WASMSIMD_DOT16X2_LD64, n_gt_4_strided_a) {
  // Sweeps n over 5..7 and k over {1, 10, 19, 28, 37} with m = 4,
  // with a_stride set to 43.
  for (uint32_t cols = 5; cols < 8; ++cols) {
    for (size_t depth = 1; depth <= 40; depth += 9) {
      GemmMicrokernelTester()
        .mr(4).nr(4).kr(2).sr(4)
        .m(4).n(cols).k(depth)
        .a_stride(43)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld64,
          xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
35266
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__WASMSIMD_DOT16X2_LD64, n_gt_4_subtile) {
  // For n in 5..7 and k in {1, 10, 19, 28, 37}, tries every m from 1 to 4,
  // with iterations set to 1.
  for (uint32_t cols = 5; cols < 8; ++cols) {
    for (size_t depth = 1; depth <= 40; depth += 9) {
      for (uint32_t rows = 1; rows <= 4; ++rows) {
        GemmMicrokernelTester()
          .mr(4).nr(4).kr(2).sr(4)
          .m(rows).n(cols).k(depth)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld64,
            xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
35285
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__WASMSIMD_DOT16X2_LD64, n_div_4) {
  // Sweeps n over {8, 12} and k over {1, 10, 19, 28, 37} with m = 4.
  for (uint32_t cols = 8; cols <= 12; cols += 4) {
    for (size_t depth = 1; depth <= 40; depth += 9) {
      GemmMicrokernelTester()
        .mr(4).nr(4).kr(2).sr(4)
        .m(4).n(cols).k(depth)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld64,
          xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
35301
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__WASMSIMD_DOT16X2_LD64, n_div_4_strided_cn) {
  // Sweeps n over {8, 12} and k over {1, 10, 19, 28, 37} with m = 4,
  // with cn_stride set to 7.
  for (uint32_t cols = 8; cols <= 12; cols += 4) {
    for (size_t depth = 1; depth <= 40; depth += 9) {
      GemmMicrokernelTester()
        .mr(4).nr(4).kr(2).sr(4)
        .m(4).n(cols).k(depth)
        .cn_stride(7)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld64,
          xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
35318
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__WASMSIMD_DOT16X2_LD64, n_div_4_strided_a) {
  // Sweeps n over {8, 12} and k over {1, 10, 19, 28, 37} with m = 4,
  // with a_stride set to 43.
  for (uint32_t cols = 8; cols <= 12; cols += 4) {
    for (size_t depth = 1; depth <= 40; depth += 9) {
      GemmMicrokernelTester()
        .mr(4).nr(4).kr(2).sr(4)
        .m(4).n(cols).k(depth)
        .a_stride(43)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld64,
          xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
35335
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__WASMSIMD_DOT16X2_LD64, n_div_4_subtile) {
  // For n in {8, 12} and k in {1, 10, 19, 28, 37}, tries every m from 1 to 4,
  // with iterations set to 1.
  for (uint32_t cols = 8; cols <= 12; cols += 4) {
    for (size_t depth = 1; depth <= 40; depth += 9) {
      for (uint32_t rows = 1; rows <= 4; ++rows) {
        GemmMicrokernelTester()
          .mr(4).nr(4).kr(2).sr(4)
          .m(rows).n(cols).k(depth)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld64,
            xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
35354
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__WASMSIMD_DOT16X2_LD64, strided_cm_subtile) {
  // For k in {1, 10, 19, 28, 37}, tries every (n, m) with n in 1..4 and
  // m in 1..4, with cm_stride set to 7 and iterations set to 1.
  for (size_t depth = 1; depth <= 40; depth += 9) {
    for (uint32_t cols = 1; cols <= 4; ++cols) {
      for (uint32_t rows = 1; rows <= 4; ++rows) {
        GemmMicrokernelTester()
          .mr(4).nr(4).kr(2).sr(4)
          .m(rows).n(cols).k(depth)
          .cm_stride(7)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld64,
            xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
35374
// Full 4x4 tile with k = 8 and the tester's qmin set to 128
// (4x4c2s4 wasmsimd dot16x2 ld64 kernel).
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__WASMSIMD_DOT16X2_LD64, qmin) {
  GemmMicrokernelTester()
    .mr(4)
    .nr(4)
    .kr(2)
    .sr(4)
    .m(4)
    .n(4)
    .k(8)
    .qmin(128)
    .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32);
}
35387
// Full 4x4 tile with k = 8 and the tester's qmax set to 128
// (4x4c2s4 wasmsimd dot16x2 ld64 kernel).
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__WASMSIMD_DOT16X2_LD64, qmax) {
  GemmMicrokernelTester()
    .mr(4)
    .nr(4)
    .kr(2)
    .sr(4)
    .m(4)
    .n(4)
    .k(8)
    .qmax(128)
    .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32);
}
35400
// Full 4x4 tile with k = 8 and the tester's cm_stride set to 7
// (4x4c2s4 wasmsimd dot16x2 ld64 kernel).
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__WASMSIMD_DOT16X2_LD64, strided_cm) {
  GemmMicrokernelTester()
    .mr(4)
    .nr(4)
    .kr(2)
    .sr(4)
    .m(4)
    .n(4)
    .k(8)
    .cm_stride(7)
    .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32);
}
35413
// Full 4x4 tile for k in {1, 10, 19, 28, 37} with the tester's a_zero_point
// set to 0 (4x4c2s4 wasmsimd dot16x2 ld64 kernel).
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__WASMSIMD_DOT16X2_LD64, no_a_zero_point) {
  for (size_t k = 1; k <= 40; k += 9) {
    GemmMicrokernelTester()
      .mr(4)
      .nr(4)
      .kr(2)
      .sr(4)
      .m(4)
      .n(4)
      .k(k)
      .a_zero_point(0)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32);
  }
}
35428
// Full 4x4 tile for k in {1, 10, 19, 28, 37} with the tester's b_zero_point
// set to 0 (4x4c2s4 wasmsimd dot16x2 ld64 kernel).
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__WASMSIMD_DOT16X2_LD64, no_b_zero_point) {
  for (size_t k = 1; k <= 40; k += 9) {
    GemmMicrokernelTester()
      .mr(4)
      .nr(4)
      .kr(2)
      .sr(4)
      .m(4)
      .n(4)
      .k(k)
      .b_zero_point(0)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32);
  }
}
35443
// Full 4x4 tile for k in {1, 10, 19, 28, 37} with both a_zero_point and
// b_zero_point set to 0 (4x4c2s4 wasmsimd dot16x2 ld64 kernel).
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__WASMSIMD_DOT16X2_LD64, no_zero_point) {
  for (size_t k = 1; k <= 40; k += 9) {
    GemmMicrokernelTester()
      .mr(4)
      .nr(4)
      .kr(2)
      .sr(4)
      .m(4)
      .n(4)
      .k(k)
      .a_zero_point(0)
      .b_zero_point(0)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32);
  }
}
35459 #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
35460
35461
35462 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
// Baseline case: full 4x4 tile (m = 4, n = 4) with k = 8
// (4x4c2s4 wasmsimd dot16x2 ld128 kernel).
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__WASMSIMD_DOT16X2_LD128, k_eq_8) {
  GemmMicrokernelTester()
    .mr(4)
    .nr(4)
    .kr(2)
    .sr(4)
    .m(4)
    .n(4)
    .k(8)
    .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32);
}
35474
// Full 4x4 tile with k = 8 and the tester's cn_stride set to 7
// (4x4c2s4 wasmsimd dot16x2 ld128 kernel).
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__WASMSIMD_DOT16X2_LD128, strided_cn) {
  GemmMicrokernelTester()
    .mr(4)
    .nr(4)
    .kr(2)
    .sr(4)
    .m(4)
    .n(4)
    .k(8)
    .cn_stride(7)
    .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32);
}
35487
// Full 4x4 tile with k = 8 and the tester's a_stride set to 11
// (4x4c2s4 wasmsimd dot16x2 ld128 kernel).
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__WASMSIMD_DOT16X2_LD128, k_eq_8_strided_a) {
  GemmMicrokernelTester()
    .mr(4)
    .nr(4)
    .kr(2)
    .sr(4)
    .m(4)
    .n(4)
    .k(8)
    .a_stride(11)
    .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32);
}
35500
// Every (m, n) in 1..4 x 1..4 at k = 8, with the tester's iterations set to 1
// (4x4c2s4 wasmsimd dot16x2 ld128 kernel).
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__WASMSIMD_DOT16X2_LD128, k_eq_8_subtile) {
  for (uint32_t n = 1; n <= 4; n++) {
    for (uint32_t m = 1; m <= 4; m++) {
      GemmMicrokernelTester()
        .mr(4)
        .nr(4)
        .kr(2)
        .sr(4)
        .m(m)
        .n(n)
        .k(8)
        .iterations(1)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32);
    }
  }
}
35517
// Every m in 1..4 with n = 4 and k = 8, with the tester's iterations set to 1
// (4x4c2s4 wasmsimd dot16x2 ld128 kernel).
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__WASMSIMD_DOT16X2_LD128, k_eq_8_subtile_m) {
  for (uint32_t m = 1; m <= 4; m++) {
    GemmMicrokernelTester()
      .mr(4)
      .nr(4)
      .kr(2)
      .sr(4)
      .m(m)
      .n(4)
      .k(8)
      .iterations(1)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32);
  }
}
35532
// Every n in 1..4 with m = 4 and k = 8, with the tester's iterations set to 1
// (4x4c2s4 wasmsimd dot16x2 ld128 kernel).
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__WASMSIMD_DOT16X2_LD128, k_eq_8_subtile_n) {
  for (uint32_t n = 1; n <= 4; n++) {
    GemmMicrokernelTester()
      .mr(4)
      .nr(4)
      .kr(2)
      .sr(4)
      .m(4)
      .n(n)
      .k(8)
      .iterations(1)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32);
  }
}
35547
// Full 4x4 tile for every k in 1..7
// (4x4c2s4 wasmsimd dot16x2 ld128 kernel).
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__WASMSIMD_DOT16X2_LD128, k_lt_8) {
  for (size_t k = 1; k < 8; k++) {
    GemmMicrokernelTester()
      .mr(4)
      .nr(4)
      .kr(2)
      .sr(4)
      .m(4)
      .n(4)
      .k(k)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32);
  }
}
35561
// Full 4x4 tile for every k in 1..7 with the tester's a_stride set to 11
// (4x4c2s4 wasmsimd dot16x2 ld128 kernel).
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__WASMSIMD_DOT16X2_LD128, k_lt_8_strided_a) {
  for (size_t k = 1; k < 8; k++) {
    GemmMicrokernelTester()
      .mr(4)
      .nr(4)
      .kr(2)
      .sr(4)
      .m(4)
      .n(4)
      .k(k)
      .a_stride(11)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32);
  }
}
35576
// Every (m, n) in 1..4 x 1..4 for every k in 1..7, with the tester's
// iterations set to 1 (4x4c2s4 wasmsimd dot16x2 ld128 kernel).
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__WASMSIMD_DOT16X2_LD128, k_lt_8_subtile) {
  for (size_t k = 1; k < 8; k++) {
    for (uint32_t n = 1; n <= 4; n++) {
      for (uint32_t m = 1; m <= 4; m++) {
        GemmMicrokernelTester()
          .mr(4)
          .nr(4)
          .kr(2)
          .sr(4)
          .m(m)
          .n(n)
          .k(k)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32);
      }
    }
  }
}
35595
// Full 4x4 tile for every k in 9..15
// (4x4c2s4 wasmsimd dot16x2 ld128 kernel).
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__WASMSIMD_DOT16X2_LD128, k_gt_8) {
  for (size_t k = 9; k < 16; k++) {
    GemmMicrokernelTester()
      .mr(4)
      .nr(4)
      .kr(2)
      .sr(4)
      .m(4)
      .n(4)
      .k(k)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32);
  }
}
35609
// Full 4x4 tile for every k in 9..15 with the tester's a_stride set to 19
// (4x4c2s4 wasmsimd dot16x2 ld128 kernel).
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__WASMSIMD_DOT16X2_LD128, k_gt_8_strided_a) {
  for (size_t k = 9; k < 16; k++) {
    GemmMicrokernelTester()
      .mr(4)
      .nr(4)
      .kr(2)
      .sr(4)
      .m(4)
      .n(4)
      .k(k)
      .a_stride(19)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32);
  }
}
35624
// Every (m, n) in 1..4 x 1..4 for every k in 9..15, with the tester's
// iterations set to 1 (4x4c2s4 wasmsimd dot16x2 ld128 kernel).
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__WASMSIMD_DOT16X2_LD128, k_gt_8_subtile) {
  for (size_t k = 9; k < 16; k++) {
    for (uint32_t n = 1; n <= 4; n++) {
      for (uint32_t m = 1; m <= 4; m++) {
        GemmMicrokernelTester()
          .mr(4)
          .nr(4)
          .kr(2)
          .sr(4)
          .m(m)
          .n(n)
          .k(k)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32);
      }
    }
  }
}
35643
// Full 4x4 tile for k = 16, 24, ..., 80 (multiples of 8)
// (4x4c2s4 wasmsimd dot16x2 ld128 kernel).
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__WASMSIMD_DOT16X2_LD128, k_div_8) {
  for (size_t k = 16; k <= 80; k += 8) {
    GemmMicrokernelTester()
      .mr(4)
      .nr(4)
      .kr(2)
      .sr(4)
      .m(4)
      .n(4)
      .k(k)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32);
  }
}
35657
// Full 4x4 tile for k = 16, 24, ..., 80 with the tester's a_stride set to 83
// (4x4c2s4 wasmsimd dot16x2 ld128 kernel).
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__WASMSIMD_DOT16X2_LD128, k_div_8_strided_a) {
  for (size_t k = 16; k <= 80; k += 8) {
    GemmMicrokernelTester()
      .mr(4)
      .nr(4)
      .kr(2)
      .sr(4)
      .m(4)
      .n(4)
      .k(k)
      .a_stride(83)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32);
  }
}
35672
// Every (m, n) in 1..4 x 1..4 for k = 16, 24, ..., 80, with the tester's
// iterations set to 1 (4x4c2s4 wasmsimd dot16x2 ld128 kernel).
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__WASMSIMD_DOT16X2_LD128, k_div_8_subtile) {
  for (size_t k = 16; k <= 80; k += 8) {
    for (uint32_t n = 1; n <= 4; n++) {
      for (uint32_t m = 1; m <= 4; m++) {
        GemmMicrokernelTester()
          .mr(4)
          .nr(4)
          .kr(2)
          .sr(4)
          .m(m)
          .n(n)
          .k(k)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32);
      }
    }
  }
}
35691
// Sweeps n over 5..7 and k over {1, 10, 19, 28, 37} with m = 4
// (4x4c2s4 wasmsimd dot16x2 ld128 kernel).
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__WASMSIMD_DOT16X2_LD128, n_gt_4) {
  for (uint32_t n = 5; n < 8; n++) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(4)
        .nr(4)
        .kr(2)
        .sr(4)
        .m(4)
        .n(n)
        .k(k)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32);
    }
  }
}
35707
// Sweeps n over 5..7 and k over {1, 10, 19, 28, 37} with m = 4 and the
// tester's cn_stride set to 7 (4x4c2s4 wasmsimd dot16x2 ld128 kernel).
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__WASMSIMD_DOT16X2_LD128, n_gt_4_strided_cn) {
  for (uint32_t n = 5; n < 8; n++) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(4)
        .nr(4)
        .kr(2)
        .sr(4)
        .m(4)
        .n(n)
        .k(k)
        .cn_stride(7)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32);
    }
  }
}
35724
// Sweeps n over 5..7 and k over {1, 10, 19, 28, 37} with m = 4 and the
// tester's a_stride set to 43 (4x4c2s4 wasmsimd dot16x2 ld128 kernel).
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__WASMSIMD_DOT16X2_LD128, n_gt_4_strided_a) {
  for (uint32_t n = 5; n < 8; n++) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(4)
        .nr(4)
        .kr(2)
        .sr(4)
        .m(4)
        .n(n)
        .k(k)
        .a_stride(43)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32);
    }
  }
}
35741
// Sweeps n over 5..7, k over {1, 10, 19, 28, 37} and m over 1..4, with the
// tester's iterations set to 1 (4x4c2s4 wasmsimd dot16x2 ld128 kernel).
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__WASMSIMD_DOT16X2_LD128, n_gt_4_subtile) {
  for (uint32_t n = 5; n < 8; n++) {
    for (size_t k = 1; k <= 40; k += 9) {
      for (uint32_t m = 1; m <= 4; m++) {
        GemmMicrokernelTester()
          .mr(4)
          .nr(4)
          .kr(2)
          .sr(4)
          .m(m)
          .n(n)
          .k(k)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32);
      }
    }
  }
}
35760
// Sweeps n over {8, 12} and k over {1, 10, 19, 28, 37} with m = 4
// (4x4c2s4 wasmsimd dot16x2 ld128 kernel).
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__WASMSIMD_DOT16X2_LD128, n_div_4) {
  for (uint32_t n = 8; n <= 12; n += 4) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(4)
        .nr(4)
        .kr(2)
        .sr(4)
        .m(4)
        .n(n)
        .k(k)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32);
    }
  }
}
35776
// Sweeps n over {8, 12} and k over {1, 10, 19, 28, 37} with m = 4 and the
// tester's cn_stride set to 7 (4x4c2s4 wasmsimd dot16x2 ld128 kernel).
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__WASMSIMD_DOT16X2_LD128, n_div_4_strided_cn) {
  for (uint32_t n = 8; n <= 12; n += 4) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(4)
        .nr(4)
        .kr(2)
        .sr(4)
        .m(4)
        .n(n)
        .k(k)
        .cn_stride(7)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32);
    }
  }
}
35793
// Sweeps n over {8, 12} and k over {1, 10, 19, 28, 37} with m = 4 and the
// tester's a_stride set to 43 (4x4c2s4 wasmsimd dot16x2 ld128 kernel).
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__WASMSIMD_DOT16X2_LD128, n_div_4_strided_a) {
  for (uint32_t n = 8; n <= 12; n += 4) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(4)
        .nr(4)
        .kr(2)
        .sr(4)
        .m(4)
        .n(n)
        .k(k)
        .a_stride(43)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32);
    }
  }
}
35810
// Sweeps n over {8, 12}, k over {1, 10, 19, 28, 37} and m over 1..4, with the
// tester's iterations set to 1 (4x4c2s4 wasmsimd dot16x2 ld128 kernel).
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__WASMSIMD_DOT16X2_LD128, n_div_4_subtile) {
  for (uint32_t n = 8; n <= 12; n += 4) {
    for (size_t k = 1; k <= 40; k += 9) {
      for (uint32_t m = 1; m <= 4; m++) {
        GemmMicrokernelTester()
          .mr(4)
          .nr(4)
          .kr(2)
          .sr(4)
          .m(m)
          .n(n)
          .k(k)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32);
      }
    }
  }
}
35829
// Sweeps k over {1, 10, 19, 28, 37} and every (m, n) in 1..4 x 1..4 with the
// tester's cm_stride set to 7 and iterations set to 1
// (4x4c2s4 wasmsimd dot16x2 ld128 kernel).
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__WASMSIMD_DOT16X2_LD128, strided_cm_subtile) {
  for (size_t k = 1; k <= 40; k += 9) {
    for (uint32_t n = 1; n <= 4; n++) {
      for (uint32_t m = 1; m <= 4; m++) {
        GemmMicrokernelTester()
          .mr(4)
          .nr(4)
          .kr(2)
          .sr(4)
          .m(m)
          .n(n)
          .k(k)
          .cm_stride(7)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32);
      }
    }
  }
}
35849
// Full 4x4 tile with k = 8 and the tester's qmin set to 128
// (4x4c2s4 wasmsimd dot16x2 ld128 kernel).
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__WASMSIMD_DOT16X2_LD128, qmin) {
  GemmMicrokernelTester()
    .mr(4)
    .nr(4)
    .kr(2)
    .sr(4)
    .m(4)
    .n(4)
    .k(8)
    .qmin(128)
    .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32);
}
35862
// Full 4x4 tile with k = 8 and the tester's qmax set to 128
// (4x4c2s4 wasmsimd dot16x2 ld128 kernel).
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__WASMSIMD_DOT16X2_LD128, qmax) {
  GemmMicrokernelTester()
    .mr(4)
    .nr(4)
    .kr(2)
    .sr(4)
    .m(4)
    .n(4)
    .k(8)
    .qmax(128)
    .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32);
}
35875
// Full 4x4 tile with k = 8 and the tester's cm_stride set to 7
// (4x4c2s4 wasmsimd dot16x2 ld128 kernel).
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__WASMSIMD_DOT16X2_LD128, strided_cm) {
  GemmMicrokernelTester()
    .mr(4)
    .nr(4)
    .kr(2)
    .sr(4)
    .m(4)
    .n(4)
    .k(8)
    .cm_stride(7)
    .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32);
}
35888
// Full 4x4 tile for k in {1, 10, 19, 28, 37} with the tester's a_zero_point
// set to 0 (4x4c2s4 wasmsimd dot16x2 ld128 kernel).
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__WASMSIMD_DOT16X2_LD128, no_a_zero_point) {
  for (size_t k = 1; k <= 40; k += 9) {
    GemmMicrokernelTester()
      .mr(4)
      .nr(4)
      .kr(2)
      .sr(4)
      .m(4)
      .n(4)
      .k(k)
      .a_zero_point(0)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32);
  }
}
35903
// Full 4x4 tile for k in {1, 10, 19, 28, 37} with the tester's b_zero_point
// set to 0 (4x4c2s4 wasmsimd dot16x2 ld128 kernel).
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__WASMSIMD_DOT16X2_LD128, no_b_zero_point) {
  for (size_t k = 1; k <= 40; k += 9) {
    GemmMicrokernelTester()
      .mr(4)
      .nr(4)
      .kr(2)
      .sr(4)
      .m(4)
      .n(4)
      .k(k)
      .b_zero_point(0)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32);
  }
}
35918
// Full 4x4 tile for k in {1, 10, 19, 28, 37} with both a_zero_point and
// b_zero_point set to 0 (4x4c2s4 wasmsimd dot16x2 ld128 kernel).
TEST(QU8_GEMM_MINMAX_FP32_4X4C2S4__WASMSIMD_DOT16X2_LD128, no_zero_point) {
  for (size_t k = 1; k <= 40; k += 9) {
    GemmMicrokernelTester()
      .mr(4)
      .nr(4)
      .kr(2)
      .sr(4)
      .m(4)
      .n(4)
      .k(k)
      .a_zero_point(0)
      .b_zero_point(0)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32);
  }
}
35934 #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
35935
35936
35937 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
// Baseline case: full 4x4 tile (m = 4, n = 4) with k = 8
// (4x4c8 wasmsimd dot16x2 ld64 kernel).
TEST(QU8_GEMM_MINMAX_FP32_4X4C8__WASMSIMD_DOT16X2_LD64, k_eq_8) {
  GemmMicrokernelTester()
    .mr(4)
    .nr(4)
    .kr(8)
    .sr(1)
    .m(4)
    .n(4)
    .k(8)
    .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32);
}
35949
// Full 4x4 tile with k = 8 and the tester's cn_stride set to 7
// (4x4c8 wasmsimd dot16x2 ld64 kernel).
TEST(QU8_GEMM_MINMAX_FP32_4X4C8__WASMSIMD_DOT16X2_LD64, strided_cn) {
  GemmMicrokernelTester()
    .mr(4)
    .nr(4)
    .kr(8)
    .sr(1)
    .m(4)
    .n(4)
    .k(8)
    .cn_stride(7)
    .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32);
}
35962
// Full 4x4 tile with k = 8 and the tester's a_stride set to 11
// (4x4c8 wasmsimd dot16x2 ld64 kernel).
TEST(QU8_GEMM_MINMAX_FP32_4X4C8__WASMSIMD_DOT16X2_LD64, k_eq_8_strided_a) {
  GemmMicrokernelTester()
    .mr(4)
    .nr(4)
    .kr(8)
    .sr(1)
    .m(4)
    .n(4)
    .k(8)
    .a_stride(11)
    .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32);
}
35975
// Every (m, n) in 1..4 x 1..4 at k = 8, with the tester's iterations set to 1
// (4x4c8 wasmsimd dot16x2 ld64 kernel).
TEST(QU8_GEMM_MINMAX_FP32_4X4C8__WASMSIMD_DOT16X2_LD64, k_eq_8_subtile) {
  for (uint32_t n = 1; n <= 4; n++) {
    for (uint32_t m = 1; m <= 4; m++) {
      GemmMicrokernelTester()
        .mr(4)
        .nr(4)
        .kr(8)
        .sr(1)
        .m(m)
        .n(n)
        .k(8)
        .iterations(1)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32);
    }
  }
}
35992
// Every m in 1..4 with n = 4 and k = 8, with the tester's iterations set to 1
// (4x4c8 wasmsimd dot16x2 ld64 kernel).
TEST(QU8_GEMM_MINMAX_FP32_4X4C8__WASMSIMD_DOT16X2_LD64, k_eq_8_subtile_m) {
  for (uint32_t m = 1; m <= 4; m++) {
    GemmMicrokernelTester()
      .mr(4)
      .nr(4)
      .kr(8)
      .sr(1)
      .m(m)
      .n(4)
      .k(8)
      .iterations(1)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32);
  }
}
36007
// Every n in 1..4 with m = 4 and k = 8, with the tester's iterations set to 1
// (4x4c8 wasmsimd dot16x2 ld64 kernel).
TEST(QU8_GEMM_MINMAX_FP32_4X4C8__WASMSIMD_DOT16X2_LD64, k_eq_8_subtile_n) {
  for (uint32_t n = 1; n <= 4; n++) {
    GemmMicrokernelTester()
      .mr(4)
      .nr(4)
      .kr(8)
      .sr(1)
      .m(4)
      .n(n)
      .k(8)
      .iterations(1)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32);
  }
}
36022
// Full 4x4 tile for every k in 1..7
// (4x4c8 wasmsimd dot16x2 ld64 kernel).
TEST(QU8_GEMM_MINMAX_FP32_4X4C8__WASMSIMD_DOT16X2_LD64, k_lt_8) {
  for (size_t k = 1; k < 8; k++) {
    GemmMicrokernelTester()
      .mr(4)
      .nr(4)
      .kr(8)
      .sr(1)
      .m(4)
      .n(4)
      .k(k)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32);
  }
}
36036
// Full 4x4 tile for every k in 1..7 with the tester's a_stride set to 11
// (4x4c8 wasmsimd dot16x2 ld64 kernel).
TEST(QU8_GEMM_MINMAX_FP32_4X4C8__WASMSIMD_DOT16X2_LD64, k_lt_8_strided_a) {
  for (size_t k = 1; k < 8; k++) {
    GemmMicrokernelTester()
      .mr(4)
      .nr(4)
      .kr(8)
      .sr(1)
      .m(4)
      .n(4)
      .k(k)
      .a_stride(11)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32);
  }
}
36051
// Every (m, n) in 1..4 x 1..4 for every k in 1..7, with the tester's
// iterations set to 1 (4x4c8 wasmsimd dot16x2 ld64 kernel).
TEST(QU8_GEMM_MINMAX_FP32_4X4C8__WASMSIMD_DOT16X2_LD64, k_lt_8_subtile) {
  for (size_t k = 1; k < 8; k++) {
    for (uint32_t n = 1; n <= 4; n++) {
      for (uint32_t m = 1; m <= 4; m++) {
        GemmMicrokernelTester()
          .mr(4)
          .nr(4)
          .kr(8)
          .sr(1)
          .m(m)
          .n(n)
          .k(k)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32);
      }
    }
  }
}
36070
// Full 4x4 tile for every k in 9..15
// (4x4c8 wasmsimd dot16x2 ld64 kernel).
TEST(QU8_GEMM_MINMAX_FP32_4X4C8__WASMSIMD_DOT16X2_LD64, k_gt_8) {
  for (size_t k = 9; k < 16; k++) {
    GemmMicrokernelTester()
      .mr(4)
      .nr(4)
      .kr(8)
      .sr(1)
      .m(4)
      .n(4)
      .k(k)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32);
  }
}
36084
// Full 4x4 tile for every k in 9..15 with the tester's a_stride set to 19
// (4x4c8 wasmsimd dot16x2 ld64 kernel).
TEST(QU8_GEMM_MINMAX_FP32_4X4C8__WASMSIMD_DOT16X2_LD64, k_gt_8_strided_a) {
  for (size_t k = 9; k < 16; k++) {
    GemmMicrokernelTester()
      .mr(4)
      .nr(4)
      .kr(8)
      .sr(1)
      .m(4)
      .n(4)
      .k(k)
      .a_stride(19)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32);
  }
}
36099
// Every (m, n) in 1..4 x 1..4 for every k in 9..15, with the tester's
// iterations set to 1 (4x4c8 wasmsimd dot16x2 ld64 kernel).
TEST(QU8_GEMM_MINMAX_FP32_4X4C8__WASMSIMD_DOT16X2_LD64, k_gt_8_subtile) {
  for (size_t k = 9; k < 16; k++) {
    for (uint32_t n = 1; n <= 4; n++) {
      for (uint32_t m = 1; m <= 4; m++) {
        GemmMicrokernelTester()
          .mr(4)
          .nr(4)
          .kr(8)
          .sr(1)
          .m(m)
          .n(n)
          .k(k)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32);
      }
    }
  }
}
36118
// Full 4x4 tile for k = 16, 24, ..., 80 (multiples of 8)
// (4x4c8 wasmsimd dot16x2 ld64 kernel).
TEST(QU8_GEMM_MINMAX_FP32_4X4C8__WASMSIMD_DOT16X2_LD64, k_div_8) {
  for (size_t k = 16; k <= 80; k += 8) {
    GemmMicrokernelTester()
      .mr(4)
      .nr(4)
      .kr(8)
      .sr(1)
      .m(4)
      .n(4)
      .k(k)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32);
  }
}
36132
// Full 4x4 tile for k = 16, 24, ..., 80 with the tester's a_stride set to 83
// (4x4c8 wasmsimd dot16x2 ld64 kernel).
TEST(QU8_GEMM_MINMAX_FP32_4X4C8__WASMSIMD_DOT16X2_LD64, k_div_8_strided_a) {
  for (size_t k = 16; k <= 80; k += 8) {
    GemmMicrokernelTester()
      .mr(4)
      .nr(4)
      .kr(8)
      .sr(1)
      .m(4)
      .n(4)
      .k(k)
      .a_stride(83)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32);
  }
}
36147
// Every (m, n) in 1..4 x 1..4 for k = 16, 24, ..., 80, with the tester's
// iterations set to 1 (4x4c8 wasmsimd dot16x2 ld64 kernel).
TEST(QU8_GEMM_MINMAX_FP32_4X4C8__WASMSIMD_DOT16X2_LD64, k_div_8_subtile) {
  for (size_t k = 16; k <= 80; k += 8) {
    for (uint32_t n = 1; n <= 4; n++) {
      for (uint32_t m = 1; m <= 4; m++) {
        GemmMicrokernelTester()
          .mr(4)
          .nr(4)
          .kr(8)
          .sr(1)
          .m(m)
          .n(n)
          .k(k)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32);
      }
    }
  }
}
36166
// Sweeps n over 5..7 and k over {1, 10, 19, 28, 37} with m = 4
// (4x4c8 wasmsimd dot16x2 ld64 kernel).
TEST(QU8_GEMM_MINMAX_FP32_4X4C8__WASMSIMD_DOT16X2_LD64, n_gt_4) {
  for (uint32_t n = 5; n < 8; n++) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(4)
        .nr(4)
        .kr(8)
        .sr(1)
        .m(4)
        .n(n)
        .k(k)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32);
    }
  }
}
36182
// Sweeps n over 5..7 and k over {1, 10, 19, 28, 37} with m = 4 and the
// tester's cn_stride set to 7 (4x4c8 wasmsimd dot16x2 ld64 kernel).
TEST(QU8_GEMM_MINMAX_FP32_4X4C8__WASMSIMD_DOT16X2_LD64, n_gt_4_strided_cn) {
  for (uint32_t n = 5; n < 8; n++) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(4)
        .nr(4)
        .kr(8)
        .sr(1)
        .m(4)
        .n(n)
        .k(k)
        .cn_stride(7)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32);
    }
  }
}
36199
// Sweeps n over 5..7 and k over {1, 10, 19, 28, 37} with m = 4 and the
// tester's a_stride set to 43 (4x4c8 wasmsimd dot16x2 ld64 kernel).
TEST(QU8_GEMM_MINMAX_FP32_4X4C8__WASMSIMD_DOT16X2_LD64, n_gt_4_strided_a) {
  for (uint32_t n = 5; n < 8; n++) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(4)
        .nr(4)
        .kr(8)
        .sr(1)
        .m(4)
        .n(n)
        .k(k)
        .a_stride(43)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32);
    }
  }
}
36216
// Sweeps n over 5..7, k over {1, 10, 19, 28, 37} and m over 1..4, with the
// tester's iterations set to 1 (4x4c8 wasmsimd dot16x2 ld64 kernel).
TEST(QU8_GEMM_MINMAX_FP32_4X4C8__WASMSIMD_DOT16X2_LD64, n_gt_4_subtile) {
  for (uint32_t n = 5; n < 8; n++) {
    for (size_t k = 1; k <= 40; k += 9) {
      for (uint32_t m = 1; m <= 4; m++) {
        GemmMicrokernelTester()
          .mr(4)
          .nr(4)
          .kr(8)
          .sr(1)
          .m(m)
          .n(n)
          .k(k)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32);
      }
    }
  }
}
36235
// Sweeps n over {8, 12} and k over {1, 10, 19, 28, 37} with m = 4
// (4x4c8 wasmsimd dot16x2 ld64 kernel).
TEST(QU8_GEMM_MINMAX_FP32_4X4C8__WASMSIMD_DOT16X2_LD64, n_div_4) {
  for (uint32_t n = 8; n <= 12; n += 4) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(4)
        .nr(4)
        .kr(8)
        .sr(1)
        .m(4)
        .n(n)
        .k(k)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32);
    }
  }
}
36251
// Sweeps n over {8, 12} and k over {1, 10, 19, 28, 37} with m = 4 and the
// tester's cn_stride set to 7 (4x4c8 wasmsimd dot16x2 ld64 kernel).
TEST(QU8_GEMM_MINMAX_FP32_4X4C8__WASMSIMD_DOT16X2_LD64, n_div_4_strided_cn) {
  for (uint32_t n = 8; n <= 12; n += 4) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(4)
        .nr(4)
        .kr(8)
        .sr(1)
        .m(4)
        .n(n)
        .k(k)
        .cn_stride(7)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32);
    }
  }
}
36268
// Sweeps n over {8, 12} and k over {1, 10, 19, 28, 37} with m = 4 and the
// tester's a_stride set to 43 (4x4c8 wasmsimd dot16x2 ld64 kernel).
TEST(QU8_GEMM_MINMAX_FP32_4X4C8__WASMSIMD_DOT16X2_LD64, n_div_4_strided_a) {
  for (uint32_t n = 8; n <= 12; n += 4) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(4)
        .nr(4)
        .kr(8)
        .sr(1)
        .m(4)
        .n(n)
        .k(k)
        .a_stride(43)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32);
    }
  }
}
36285
// Sweeps n over {8, 12}, k over {1, 10, 19, 28, 37} and m over 1..4, with the
// tester's iterations set to 1 (4x4c8 wasmsimd dot16x2 ld64 kernel).
TEST(QU8_GEMM_MINMAX_FP32_4X4C8__WASMSIMD_DOT16X2_LD64, n_div_4_subtile) {
  for (uint32_t n = 8; n <= 12; n += 4) {
    for (size_t k = 1; k <= 40; k += 9) {
      for (uint32_t m = 1; m <= 4; m++) {
        GemmMicrokernelTester()
          .mr(4)
          .nr(4)
          .kr(8)
          .sr(1)
          .m(m)
          .n(n)
          .k(k)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32);
      }
    }
  }
}
36304
// Sweeps k over {1, 10, 19, 28, 37} and every (m, n) in 1..4 x 1..4 with the
// tester's cm_stride set to 7 and iterations set to 1
// (4x4c8 wasmsimd dot16x2 ld64 kernel).
TEST(QU8_GEMM_MINMAX_FP32_4X4C8__WASMSIMD_DOT16X2_LD64, strided_cm_subtile) {
  for (size_t k = 1; k <= 40; k += 9) {
    for (uint32_t n = 1; n <= 4; n++) {
      for (uint32_t m = 1; m <= 4; m++) {
        GemmMicrokernelTester()
          .mr(4)
          .nr(4)
          .kr(8)
          .sr(1)
          .m(m)
          .n(n)
          .k(k)
          .cm_stride(7)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32);
      }
    }
  }
}
36324
// Full 4x4 tile with k = 8 and the tester's qmin set to 128
// (4x4c8 wasmsimd dot16x2 ld64 kernel).
TEST(QU8_GEMM_MINMAX_FP32_4X4C8__WASMSIMD_DOT16X2_LD64, qmin) {
  GemmMicrokernelTester()
    .mr(4)
    .nr(4)
    .kr(8)
    .sr(1)
    .m(4)
    .n(4)
    .k(8)
    .qmin(128)
    .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld64, xnn_init_qu8_conv_minmax_fp32_wasmsimd_params, xnn_qu8_requantize_fp32);
}
36337
// m = mr = 4, n = nr = 4, k = kr = 8, with qmax(128).
TEST(QU8_GEMM_MINMAX_FP32_4X4C8__WASMSIMD_DOT16X2_LD64, qmax) {
  GemmMicrokernelTester()
      .mr(4).nr(4).kr(8).sr(1)
      .m(4).n(4).k(8)
      .qmax(128)
      .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld64,
          xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
          xnn_qu8_requantize_fp32);
}
36350
// m = mr = 4, n = nr = 4, k = kr = 8, with cm_stride(7).
TEST(QU8_GEMM_MINMAX_FP32_4X4C8__WASMSIMD_DOT16X2_LD64, strided_cm) {
  GemmMicrokernelTester()
      .mr(4).nr(4).kr(8).sr(1)
      .m(4).n(4).k(8)
      .cm_stride(7)
      .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld64,
          xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
          xnn_qu8_requantize_fp32);
}
36363
// For k in {1, 10, 19, 28, 37}: m = 4, n = 4, with a_zero_point(0).
TEST(QU8_GEMM_MINMAX_FP32_4X4C8__WASMSIMD_DOT16X2_LD64, no_a_zero_point) {
  for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
    GemmMicrokernelTester()
        .mr(4).nr(4).kr(8).sr(1)
        .m(4).n(4).k(k_dim)
        .a_zero_point(0)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld64,
            xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
            xnn_qu8_requantize_fp32);
  }
}
36378
// For k in {1, 10, 19, 28, 37}: m = 4, n = 4, with b_zero_point(0).
TEST(QU8_GEMM_MINMAX_FP32_4X4C8__WASMSIMD_DOT16X2_LD64, no_b_zero_point) {
  for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
    GemmMicrokernelTester()
        .mr(4).nr(4).kr(8).sr(1)
        .m(4).n(4).k(k_dim)
        .b_zero_point(0)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld64,
            xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
            xnn_qu8_requantize_fp32);
  }
}
36393
// For k in {1, 10, 19, 28, 37}: m = 4, n = 4, with both a_zero_point(0) and
// b_zero_point(0).
TEST(QU8_GEMM_MINMAX_FP32_4X4C8__WASMSIMD_DOT16X2_LD64, no_zero_point) {
  for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
    GemmMicrokernelTester()
        .mr(4).nr(4).kr(8).sr(1)
        .m(4).n(4).k(k_dim)
        .a_zero_point(0)
        .b_zero_point(0)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld64,
            xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
            xnn_qu8_requantize_fp32);
  }
}
36409 #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
36410
36411
36412 #if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
// 3x2 wasm fmagic kernel: m = mr = 3, n = nr = 2, k = 1.
TEST(QU8_GEMM_MINMAX_FP32_3X2__WASM_FMAGIC, k_eq_1) {
  GemmMicrokernelTester()
      .mr(3).nr(2).kr(1).sr(1)
      .m(3).n(2).k(1)
      .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_3x2__wasm_fmagic,
          xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
          xnn_qu8_requantize_fp32);
}
36424
// m = mr = 3, n = nr = 2, k = 1, with cn_stride(5).
TEST(QU8_GEMM_MINMAX_FP32_3X2__WASM_FMAGIC, strided_cn) {
  GemmMicrokernelTester()
      .mr(3).nr(2).kr(1).sr(1)
      .m(3).n(2).k(1)
      .cn_stride(5)
      .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_3x2__wasm_fmagic,
          xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
          xnn_qu8_requantize_fp32);
}
36437
// m = mr = 3, n = nr = 2, k = 1, with a_stride(3).
TEST(QU8_GEMM_MINMAX_FP32_3X2__WASM_FMAGIC, k_eq_1_strided_a) {
  GemmMicrokernelTester()
      .mr(3).nr(2).kr(1).sr(1)
      .m(3).n(2).k(1)
      .a_stride(3)
      .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_3x2__wasm_fmagic,
          xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
          xnn_qu8_requantize_fp32);
}
36450
// k = 1 for every n in 1..2 and every m in 1..3, with iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_3X2__WASM_FMAGIC, k_eq_1_subtile) {
  for (uint32_t n_dim = 1; n_dim <= 2; ++n_dim) {
    for (uint32_t m_dim = 1; m_dim <= 3; ++m_dim) {
      GemmMicrokernelTester()
          .mr(3).nr(2).kr(1).sr(1)
          .m(m_dim).n(n_dim).k(1)
          .iterations(1)
          .Test(
              xnn_qu8_gemm_minmax_fp32_ukernel_3x2__wasm_fmagic,
              xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
36467
// k = 1, n = nr = 2, for every m in 1..3, with iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_3X2__WASM_FMAGIC, k_eq_1_subtile_m) {
  for (uint32_t m_dim = 1; m_dim <= 3; ++m_dim) {
    GemmMicrokernelTester()
        .mr(3).nr(2).kr(1).sr(1)
        .m(m_dim).n(2).k(1)
        .iterations(1)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_3x2__wasm_fmagic,
            xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
            xnn_qu8_requantize_fp32);
  }
}
36482
// k = 1, m = mr = 3, for every n in 1..2, with iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_3X2__WASM_FMAGIC, k_eq_1_subtile_n) {
  for (uint32_t n_dim = 1; n_dim <= 2; ++n_dim) {
    GemmMicrokernelTester()
        .mr(3).nr(2).kr(1).sr(1)
        .m(3).n(n_dim).k(1)
        .iterations(1)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_3x2__wasm_fmagic,
            xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
            xnn_qu8_requantize_fp32);
  }
}
36497
// m = mr = 3, n = nr = 2, for every k in 2..9.
TEST(QU8_GEMM_MINMAX_FP32_3X2__WASM_FMAGIC, k_gt_1) {
  for (size_t k_dim = 2; k_dim <= 9; ++k_dim) {
    GemmMicrokernelTester()
        .mr(3).nr(2).kr(1).sr(1)
        .m(3).n(2).k(k_dim)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_3x2__wasm_fmagic,
            xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
            xnn_qu8_requantize_fp32);
  }
}
36511
// m = mr = 3, n = nr = 2, for every k in 2..9, with a_stride(11).
TEST(QU8_GEMM_MINMAX_FP32_3X2__WASM_FMAGIC, k_gt_1_strided_a) {
  for (size_t k_dim = 2; k_dim <= 9; ++k_dim) {
    GemmMicrokernelTester()
        .mr(3).nr(2).kr(1).sr(1)
        .m(3).n(2).k(k_dim)
        .a_stride(11)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_3x2__wasm_fmagic,
            xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
            xnn_qu8_requantize_fp32);
  }
}
36526
// For every k in 2..9, n in 1..2 and m in 1..3, with iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_3X2__WASM_FMAGIC, k_gt_1_subtile) {
  for (size_t k_dim = 2; k_dim <= 9; ++k_dim) {
    for (uint32_t n_dim = 1; n_dim <= 2; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 3; ++m_dim) {
        GemmMicrokernelTester()
            .mr(3).nr(2).kr(1).sr(1)
            .m(m_dim).n(n_dim).k(k_dim)
            .iterations(1)
            .Test(
                xnn_qu8_gemm_minmax_fp32_ukernel_3x2__wasm_fmagic,
                xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
                xnn_qu8_requantize_fp32);
      }
    }
  }
}
36545
// m = mr = 3, n = 3 (the only value in the loop range), k in {1, 3, 5}.
TEST(QU8_GEMM_MINMAX_FP32_3X2__WASM_FMAGIC, n_gt_2) {
  for (uint32_t n_dim = 3; n_dim <= 3; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
      GemmMicrokernelTester()
          .mr(3).nr(2).kr(1).sr(1)
          .m(3).n(n_dim).k(k_dim)
          .Test(
              xnn_qu8_gemm_minmax_fp32_ukernel_3x2__wasm_fmagic,
              xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
36561
// m = mr = 3, n = 3, k in {1, 3, 5}, with cn_stride(5).
TEST(QU8_GEMM_MINMAX_FP32_3X2__WASM_FMAGIC, n_gt_2_strided_cn) {
  for (uint32_t n_dim = 3; n_dim <= 3; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
      GemmMicrokernelTester()
          .mr(3).nr(2).kr(1).sr(1)
          .m(3).n(n_dim).k(k_dim)
          .cn_stride(5)
          .Test(
              xnn_qu8_gemm_minmax_fp32_ukernel_3x2__wasm_fmagic,
              xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
36578
// m = mr = 3, n = 3, k in {1, 3, 5}, with a_stride(7).
TEST(QU8_GEMM_MINMAX_FP32_3X2__WASM_FMAGIC, n_gt_2_strided_a) {
  for (uint32_t n_dim = 3; n_dim <= 3; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
      GemmMicrokernelTester()
          .mr(3).nr(2).kr(1).sr(1)
          .m(3).n(n_dim).k(k_dim)
          .a_stride(7)
          .Test(
              xnn_qu8_gemm_minmax_fp32_ukernel_3x2__wasm_fmagic,
              xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
36595
// n = 3, k in {1, 3, 5}, every m in 1..3, with iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_3X2__WASM_FMAGIC, n_gt_2_subtile) {
  for (uint32_t n_dim = 3; n_dim <= 3; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
      for (uint32_t m_dim = 1; m_dim <= 3; ++m_dim) {
        GemmMicrokernelTester()
            .mr(3).nr(2).kr(1).sr(1)
            .m(m_dim).n(n_dim).k(k_dim)
            .iterations(1)
            .Test(
                xnn_qu8_gemm_minmax_fp32_ukernel_3x2__wasm_fmagic,
                xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
                xnn_qu8_requantize_fp32);
      }
    }
  }
}
36614
// m = mr = 3, n in {4, 6}, k in {1, 3, 5}.
TEST(QU8_GEMM_MINMAX_FP32_3X2__WASM_FMAGIC, n_div_2) {
  for (uint32_t n_dim = 4; n_dim <= 6; n_dim += 2) {
    for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
      GemmMicrokernelTester()
          .mr(3).nr(2).kr(1).sr(1)
          .m(3).n(n_dim).k(k_dim)
          .Test(
              xnn_qu8_gemm_minmax_fp32_ukernel_3x2__wasm_fmagic,
              xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
36630
// m = mr = 3, n in {4, 6}, k in {1, 3, 5}, with cn_stride(5).
TEST(QU8_GEMM_MINMAX_FP32_3X2__WASM_FMAGIC, n_div_2_strided_cn) {
  for (uint32_t n_dim = 4; n_dim <= 6; n_dim += 2) {
    for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
      GemmMicrokernelTester()
          .mr(3).nr(2).kr(1).sr(1)
          .m(3).n(n_dim).k(k_dim)
          .cn_stride(5)
          .Test(
              xnn_qu8_gemm_minmax_fp32_ukernel_3x2__wasm_fmagic,
              xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
36647
// m = mr = 3, n in {4, 6}, k in {1, 3, 5}, with a_stride(7).
TEST(QU8_GEMM_MINMAX_FP32_3X2__WASM_FMAGIC, n_div_2_strided_a) {
  for (uint32_t n_dim = 4; n_dim <= 6; n_dim += 2) {
    for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
      GemmMicrokernelTester()
          .mr(3).nr(2).kr(1).sr(1)
          .m(3).n(n_dim).k(k_dim)
          .a_stride(7)
          .Test(
              xnn_qu8_gemm_minmax_fp32_ukernel_3x2__wasm_fmagic,
              xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
36664
// n in {4, 6}, k in {1, 3, 5}, every m in 1..3, with iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_3X2__WASM_FMAGIC, n_div_2_subtile) {
  for (uint32_t n_dim = 4; n_dim <= 6; n_dim += 2) {
    for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
      for (uint32_t m_dim = 1; m_dim <= 3; ++m_dim) {
        GemmMicrokernelTester()
            .mr(3).nr(2).kr(1).sr(1)
            .m(m_dim).n(n_dim).k(k_dim)
            .iterations(1)
            .Test(
                xnn_qu8_gemm_minmax_fp32_ukernel_3x2__wasm_fmagic,
                xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
                xnn_qu8_requantize_fp32);
      }
    }
  }
}
36683
// k in {1, 3, 5}, every n in 1..2 and m in 1..3, with cm_stride(5) and
// iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_3X2__WASM_FMAGIC, strided_cm_subtile) {
  for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
    for (uint32_t n_dim = 1; n_dim <= 2; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 3; ++m_dim) {
        GemmMicrokernelTester()
            .mr(3).nr(2).kr(1).sr(1)
            .m(m_dim).n(n_dim).k(k_dim)
            .cm_stride(5)
            .iterations(1)
            .Test(
                xnn_qu8_gemm_minmax_fp32_ukernel_3x2__wasm_fmagic,
                xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
                xnn_qu8_requantize_fp32);
      }
    }
  }
}
36703
// m = mr = 3, n = nr = 2, k = 1, with qmin(128).
TEST(QU8_GEMM_MINMAX_FP32_3X2__WASM_FMAGIC, qmin) {
  GemmMicrokernelTester()
      .mr(3).nr(2).kr(1).sr(1)
      .m(3).n(2).k(1)
      .qmin(128)
      .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_3x2__wasm_fmagic,
          xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
          xnn_qu8_requantize_fp32);
}
36716
// m = mr = 3, n = nr = 2, k = 1, with qmax(128).
TEST(QU8_GEMM_MINMAX_FP32_3X2__WASM_FMAGIC, qmax) {
  GemmMicrokernelTester()
      .mr(3).nr(2).kr(1).sr(1)
      .m(3).n(2).k(1)
      .qmax(128)
      .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_3x2__wasm_fmagic,
          xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
          xnn_qu8_requantize_fp32);
}
36729
// m = mr = 3, n = nr = 2, k = 1, with cm_stride(5).
TEST(QU8_GEMM_MINMAX_FP32_3X2__WASM_FMAGIC, strided_cm) {
  GemmMicrokernelTester()
      .mr(3).nr(2).kr(1).sr(1)
      .m(3).n(2).k(1)
      .cm_stride(5)
      .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_3x2__wasm_fmagic,
          xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
          xnn_qu8_requantize_fp32);
}
36742
// m = mr = 3, n = nr = 2, k in {1, 3, 5}, with a_zero_point(0).
TEST(QU8_GEMM_MINMAX_FP32_3X2__WASM_FMAGIC, no_a_zero_point) {
  for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
    GemmMicrokernelTester()
        .mr(3).nr(2).kr(1).sr(1)
        .m(3).n(2).k(k_dim)
        .a_zero_point(0)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_3x2__wasm_fmagic,
            xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
            xnn_qu8_requantize_fp32);
  }
}
36757
// m = mr = 3, n = nr = 2, k in {1, 3, 5}, with b_zero_point(0).
TEST(QU8_GEMM_MINMAX_FP32_3X2__WASM_FMAGIC, no_b_zero_point) {
  for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
    GemmMicrokernelTester()
        .mr(3).nr(2).kr(1).sr(1)
        .m(3).n(2).k(k_dim)
        .b_zero_point(0)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_3x2__wasm_fmagic,
            xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
            xnn_qu8_requantize_fp32);
  }
}
36772
// m = mr = 3, n = nr = 2, k in {1, 3, 5}, with both a_zero_point(0) and
// b_zero_point(0).
TEST(QU8_GEMM_MINMAX_FP32_3X2__WASM_FMAGIC, no_zero_point) {
  for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
    GemmMicrokernelTester()
        .mr(3).nr(2).kr(1).sr(1)
        .m(3).n(2).k(k_dim)
        .a_zero_point(0)
        .b_zero_point(0)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_3x2__wasm_fmagic,
            xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
            xnn_qu8_requantize_fp32);
  }
}
36788 #endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
36789
36790
36791 #if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
// 3x4 wasm fmagic kernel: m = mr = 3, n = nr = 4, k = 1.
TEST(QU8_GEMM_MINMAX_FP32_3X4__WASM_FMAGIC, k_eq_1) {
  GemmMicrokernelTester()
      .mr(3).nr(4).kr(1).sr(1)
      .m(3).n(4).k(1)
      .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_3x4__wasm_fmagic,
          xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
          xnn_qu8_requantize_fp32);
}
36803
// m = mr = 3, n = nr = 4, k = 1, with cn_stride(7).
TEST(QU8_GEMM_MINMAX_FP32_3X4__WASM_FMAGIC, strided_cn) {
  GemmMicrokernelTester()
      .mr(3).nr(4).kr(1).sr(1)
      .m(3).n(4).k(1)
      .cn_stride(7)
      .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_3x4__wasm_fmagic,
          xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
          xnn_qu8_requantize_fp32);
}
36816
// m = mr = 3, n = nr = 4, k = 1, with a_stride(3).
TEST(QU8_GEMM_MINMAX_FP32_3X4__WASM_FMAGIC, k_eq_1_strided_a) {
  GemmMicrokernelTester()
      .mr(3).nr(4).kr(1).sr(1)
      .m(3).n(4).k(1)
      .a_stride(3)
      .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_3x4__wasm_fmagic,
          xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
          xnn_qu8_requantize_fp32);
}
36829
// k = 1 for every n in 1..4 and every m in 1..3, with iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_3X4__WASM_FMAGIC, k_eq_1_subtile) {
  for (uint32_t n_dim = 1; n_dim <= 4; ++n_dim) {
    for (uint32_t m_dim = 1; m_dim <= 3; ++m_dim) {
      GemmMicrokernelTester()
          .mr(3).nr(4).kr(1).sr(1)
          .m(m_dim).n(n_dim).k(1)
          .iterations(1)
          .Test(
              xnn_qu8_gemm_minmax_fp32_ukernel_3x4__wasm_fmagic,
              xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
36846
// k = 1, n = nr = 4, for every m in 1..3, with iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_3X4__WASM_FMAGIC, k_eq_1_subtile_m) {
  for (uint32_t m_dim = 1; m_dim <= 3; ++m_dim) {
    GemmMicrokernelTester()
        .mr(3).nr(4).kr(1).sr(1)
        .m(m_dim).n(4).k(1)
        .iterations(1)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_3x4__wasm_fmagic,
            xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
            xnn_qu8_requantize_fp32);
  }
}
36861
// k = 1, m = mr = 3, for every n in 1..4, with iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_3X4__WASM_FMAGIC, k_eq_1_subtile_n) {
  for (uint32_t n_dim = 1; n_dim <= 4; ++n_dim) {
    GemmMicrokernelTester()
        .mr(3).nr(4).kr(1).sr(1)
        .m(3).n(n_dim).k(1)
        .iterations(1)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_3x4__wasm_fmagic,
            xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
            xnn_qu8_requantize_fp32);
  }
}
36876
// m = mr = 3, n = nr = 4, for every k in 2..9.
TEST(QU8_GEMM_MINMAX_FP32_3X4__WASM_FMAGIC, k_gt_1) {
  for (size_t k_dim = 2; k_dim <= 9; ++k_dim) {
    GemmMicrokernelTester()
        .mr(3).nr(4).kr(1).sr(1)
        .m(3).n(4).k(k_dim)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_3x4__wasm_fmagic,
            xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
            xnn_qu8_requantize_fp32);
  }
}
36890
// m = mr = 3, n = nr = 4, for every k in 2..9, with a_stride(11).
TEST(QU8_GEMM_MINMAX_FP32_3X4__WASM_FMAGIC, k_gt_1_strided_a) {
  for (size_t k_dim = 2; k_dim <= 9; ++k_dim) {
    GemmMicrokernelTester()
        .mr(3).nr(4).kr(1).sr(1)
        .m(3).n(4).k(k_dim)
        .a_stride(11)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_3x4__wasm_fmagic,
            xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
            xnn_qu8_requantize_fp32);
  }
}
36905
// For every k in 2..9, n in 1..4 and m in 1..3, with iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_3X4__WASM_FMAGIC, k_gt_1_subtile) {
  for (size_t k_dim = 2; k_dim <= 9; ++k_dim) {
    for (uint32_t n_dim = 1; n_dim <= 4; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 3; ++m_dim) {
        GemmMicrokernelTester()
            .mr(3).nr(4).kr(1).sr(1)
            .m(m_dim).n(n_dim).k(k_dim)
            .iterations(1)
            .Test(
                xnn_qu8_gemm_minmax_fp32_ukernel_3x4__wasm_fmagic,
                xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
                xnn_qu8_requantize_fp32);
      }
    }
  }
}
36924
// m = mr = 3, n in 5..7, k in {1, 3, 5}.
TEST(QU8_GEMM_MINMAX_FP32_3X4__WASM_FMAGIC, n_gt_4) {
  for (uint32_t n_dim = 5; n_dim <= 7; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
      GemmMicrokernelTester()
          .mr(3).nr(4).kr(1).sr(1)
          .m(3).n(n_dim).k(k_dim)
          .Test(
              xnn_qu8_gemm_minmax_fp32_ukernel_3x4__wasm_fmagic,
              xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
36940
// m = mr = 3, n in 5..7, k in {1, 3, 5}, with cn_stride(7).
TEST(QU8_GEMM_MINMAX_FP32_3X4__WASM_FMAGIC, n_gt_4_strided_cn) {
  for (uint32_t n_dim = 5; n_dim <= 7; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
      GemmMicrokernelTester()
          .mr(3).nr(4).kr(1).sr(1)
          .m(3).n(n_dim).k(k_dim)
          .cn_stride(7)
          .Test(
              xnn_qu8_gemm_minmax_fp32_ukernel_3x4__wasm_fmagic,
              xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
36957
// m = mr = 3, n in 5..7, k in {1, 3, 5}, with a_stride(7).
TEST(QU8_GEMM_MINMAX_FP32_3X4__WASM_FMAGIC, n_gt_4_strided_a) {
  for (uint32_t n_dim = 5; n_dim <= 7; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
      GemmMicrokernelTester()
          .mr(3).nr(4).kr(1).sr(1)
          .m(3).n(n_dim).k(k_dim)
          .a_stride(7)
          .Test(
              xnn_qu8_gemm_minmax_fp32_ukernel_3x4__wasm_fmagic,
              xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
36974
// n in 5..7, k in {1, 3, 5}, every m in 1..3, with iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_3X4__WASM_FMAGIC, n_gt_4_subtile) {
  for (uint32_t n_dim = 5; n_dim <= 7; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
      for (uint32_t m_dim = 1; m_dim <= 3; ++m_dim) {
        GemmMicrokernelTester()
            .mr(3).nr(4).kr(1).sr(1)
            .m(m_dim).n(n_dim).k(k_dim)
            .iterations(1)
            .Test(
                xnn_qu8_gemm_minmax_fp32_ukernel_3x4__wasm_fmagic,
                xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
                xnn_qu8_requantize_fp32);
      }
    }
  }
}
36993
// m = mr = 3, n in {8, 12}, k in {1, 3, 5}.
TEST(QU8_GEMM_MINMAX_FP32_3X4__WASM_FMAGIC, n_div_4) {
  for (uint32_t n_dim = 8; n_dim <= 12; n_dim += 4) {
    for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
      GemmMicrokernelTester()
          .mr(3).nr(4).kr(1).sr(1)
          .m(3).n(n_dim).k(k_dim)
          .Test(
              xnn_qu8_gemm_minmax_fp32_ukernel_3x4__wasm_fmagic,
              xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
37009
// m = mr = 3, n in {8, 12}, k in {1, 3, 5}, with cn_stride(7).
TEST(QU8_GEMM_MINMAX_FP32_3X4__WASM_FMAGIC, n_div_4_strided_cn) {
  for (uint32_t n_dim = 8; n_dim <= 12; n_dim += 4) {
    for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
      GemmMicrokernelTester()
          .mr(3).nr(4).kr(1).sr(1)
          .m(3).n(n_dim).k(k_dim)
          .cn_stride(7)
          .Test(
              xnn_qu8_gemm_minmax_fp32_ukernel_3x4__wasm_fmagic,
              xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
37026
// m = mr = 3, n in {8, 12}, k in {1, 3, 5}, with a_stride(7).
TEST(QU8_GEMM_MINMAX_FP32_3X4__WASM_FMAGIC, n_div_4_strided_a) {
  for (uint32_t n_dim = 8; n_dim <= 12; n_dim += 4) {
    for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
      GemmMicrokernelTester()
          .mr(3).nr(4).kr(1).sr(1)
          .m(3).n(n_dim).k(k_dim)
          .a_stride(7)
          .Test(
              xnn_qu8_gemm_minmax_fp32_ukernel_3x4__wasm_fmagic,
              xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
37043
// n in {8, 12}, k in {1, 3, 5}, every m in 1..3, with iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_3X4__WASM_FMAGIC, n_div_4_subtile) {
  for (uint32_t n_dim = 8; n_dim <= 12; n_dim += 4) {
    for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
      for (uint32_t m_dim = 1; m_dim <= 3; ++m_dim) {
        GemmMicrokernelTester()
            .mr(3).nr(4).kr(1).sr(1)
            .m(m_dim).n(n_dim).k(k_dim)
            .iterations(1)
            .Test(
                xnn_qu8_gemm_minmax_fp32_ukernel_3x4__wasm_fmagic,
                xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
                xnn_qu8_requantize_fp32);
      }
    }
  }
}
37062
// k in {1, 3, 5}, every n in 1..4 and m in 1..3, with cm_stride(7) and
// iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_3X4__WASM_FMAGIC, strided_cm_subtile) {
  for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
    for (uint32_t n_dim = 1; n_dim <= 4; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 3; ++m_dim) {
        GemmMicrokernelTester()
            .mr(3).nr(4).kr(1).sr(1)
            .m(m_dim).n(n_dim).k(k_dim)
            .cm_stride(7)
            .iterations(1)
            .Test(
                xnn_qu8_gemm_minmax_fp32_ukernel_3x4__wasm_fmagic,
                xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
                xnn_qu8_requantize_fp32);
      }
    }
  }
}
37082
// m = mr = 3, n = nr = 4, k = 1, with qmin(128).
TEST(QU8_GEMM_MINMAX_FP32_3X4__WASM_FMAGIC, qmin) {
  GemmMicrokernelTester()
      .mr(3).nr(4).kr(1).sr(1)
      .m(3).n(4).k(1)
      .qmin(128)
      .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_3x4__wasm_fmagic,
          xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
          xnn_qu8_requantize_fp32);
}
37095
// m = mr = 3, n = nr = 4, k = 1, with qmax(128).
TEST(QU8_GEMM_MINMAX_FP32_3X4__WASM_FMAGIC, qmax) {
  GemmMicrokernelTester()
      .mr(3).nr(4).kr(1).sr(1)
      .m(3).n(4).k(1)
      .qmax(128)
      .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_3x4__wasm_fmagic,
          xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
          xnn_qu8_requantize_fp32);
}
37108
// m = mr = 3, n = nr = 4, k = 1, with cm_stride(7).
TEST(QU8_GEMM_MINMAX_FP32_3X4__WASM_FMAGIC, strided_cm) {
  GemmMicrokernelTester()
      .mr(3).nr(4).kr(1).sr(1)
      .m(3).n(4).k(1)
      .cm_stride(7)
      .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_3x4__wasm_fmagic,
          xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
          xnn_qu8_requantize_fp32);
}
37121
// m = mr = 3, n = nr = 4, k in {1, 3, 5}, with a_zero_point(0).
TEST(QU8_GEMM_MINMAX_FP32_3X4__WASM_FMAGIC, no_a_zero_point) {
  for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
    GemmMicrokernelTester()
        .mr(3).nr(4).kr(1).sr(1)
        .m(3).n(4).k(k_dim)
        .a_zero_point(0)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_3x4__wasm_fmagic,
            xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
            xnn_qu8_requantize_fp32);
  }
}
37136
// m = mr = 3, n = nr = 4, k in {1, 3, 5}, with b_zero_point(0).
TEST(QU8_GEMM_MINMAX_FP32_3X4__WASM_FMAGIC, no_b_zero_point) {
  for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
    GemmMicrokernelTester()
        .mr(3).nr(4).kr(1).sr(1)
        .m(3).n(4).k(k_dim)
        .b_zero_point(0)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_3x4__wasm_fmagic,
            xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
            xnn_qu8_requantize_fp32);
  }
}
37151
// m = mr = 3, n = nr = 4, k in {1, 3, 5}, with both a_zero_point(0) and
// b_zero_point(0).
TEST(QU8_GEMM_MINMAX_FP32_3X4__WASM_FMAGIC, no_zero_point) {
  for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
    GemmMicrokernelTester()
        .mr(3).nr(4).kr(1).sr(1)
        .m(3).n(4).k(k_dim)
        .a_zero_point(0)
        .b_zero_point(0)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_3x4__wasm_fmagic,
            xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
            xnn_qu8_requantize_fp32);
  }
}
37167 #endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
37168
37169
// 1x2 scalar imagic kernel: m = mr = 1, n = nr = 2, k = 1.
TEST(QU8_GEMM_MINMAX_FP32_1X2__SCALAR_IMAGIC, k_eq_1) {
  GemmMicrokernelTester()
      .mr(1).nr(2).kr(1).sr(1)
      .m(1).n(2).k(1)
      .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_1x2__scalar_imagic,
          xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
          xnn_qu8_requantize_fp32);
}
37181
// m = mr = 1, n = nr = 2, k = 1, with cn_stride(5).
TEST(QU8_GEMM_MINMAX_FP32_1X2__SCALAR_IMAGIC, strided_cn) {
  GemmMicrokernelTester()
      .mr(1).nr(2).kr(1).sr(1)
      .m(1).n(2).k(1)
      .cn_stride(5)
      .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_1x2__scalar_imagic,
          xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
          xnn_qu8_requantize_fp32);
}
37194
// m = mr = 1, n = nr = 2, k = 1, with a_stride(3).
TEST(QU8_GEMM_MINMAX_FP32_1X2__SCALAR_IMAGIC, k_eq_1_strided_a) {
  GemmMicrokernelTester()
      .mr(1).nr(2).kr(1).sr(1)
      .m(1).n(2).k(1)
      .a_stride(3)
      .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_1x2__scalar_imagic,
          xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
          xnn_qu8_requantize_fp32);
}
37207
// k = 1 for every n in 1..2; the m loop covers only m = 1. Uses iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_1X2__SCALAR_IMAGIC, k_eq_1_subtile) {
  for (uint32_t n_dim = 1; n_dim <= 2; ++n_dim) {
    for (uint32_t m_dim = 1; m_dim <= 1; ++m_dim) {
      GemmMicrokernelTester()
          .mr(1).nr(2).kr(1).sr(1)
          .m(m_dim).n(n_dim).k(1)
          .iterations(1)
          .Test(
              xnn_qu8_gemm_minmax_fp32_ukernel_1x2__scalar_imagic,
              xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
37224
// k = 1, n = nr = 2; the m loop covers only m = 1. Uses iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_1X2__SCALAR_IMAGIC, k_eq_1_subtile_m) {
  for (uint32_t m_dim = 1; m_dim <= 1; ++m_dim) {
    GemmMicrokernelTester()
        .mr(1).nr(2).kr(1).sr(1)
        .m(m_dim).n(2).k(1)
        .iterations(1)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x2__scalar_imagic,
            xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
            xnn_qu8_requantize_fp32);
  }
}
37239
// k = 1, m = mr = 1, for every n in 1..2, with iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_1X2__SCALAR_IMAGIC, k_eq_1_subtile_n) {
  for (uint32_t n_dim = 1; n_dim <= 2; ++n_dim) {
    GemmMicrokernelTester()
        .mr(1).nr(2).kr(1).sr(1)
        .m(1).n(n_dim).k(1)
        .iterations(1)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x2__scalar_imagic,
            xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
            xnn_qu8_requantize_fp32);
  }
}
37254
// m = mr = 1, n = nr = 2, for every k in 2..9.
TEST(QU8_GEMM_MINMAX_FP32_1X2__SCALAR_IMAGIC, k_gt_1) {
  for (size_t k_dim = 2; k_dim <= 9; ++k_dim) {
    GemmMicrokernelTester()
        .mr(1).nr(2).kr(1).sr(1)
        .m(1).n(2).k(k_dim)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x2__scalar_imagic,
            xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
            xnn_qu8_requantize_fp32);
  }
}
37268
// m = mr = 1, n = nr = 2, for every k in 2..9, with a_stride(11).
TEST(QU8_GEMM_MINMAX_FP32_1X2__SCALAR_IMAGIC, k_gt_1_strided_a) {
  for (size_t k_dim = 2; k_dim <= 9; ++k_dim) {
    GemmMicrokernelTester()
        .mr(1).nr(2).kr(1).sr(1)
        .m(1).n(2).k(k_dim)
        .a_stride(11)
        .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_1x2__scalar_imagic,
            xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
            xnn_qu8_requantize_fp32);
  }
}
37283
// For every k in 2..9 and n in 1..2; the m loop covers only m = 1. Uses
// iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_1X2__SCALAR_IMAGIC, k_gt_1_subtile) {
  for (size_t k_dim = 2; k_dim <= 9; ++k_dim) {
    for (uint32_t n_dim = 1; n_dim <= 2; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 1; ++m_dim) {
        GemmMicrokernelTester()
            .mr(1).nr(2).kr(1).sr(1)
            .m(m_dim).n(n_dim).k(k_dim)
            .iterations(1)
            .Test(
                xnn_qu8_gemm_minmax_fp32_ukernel_1x2__scalar_imagic,
                xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
                xnn_qu8_requantize_fp32);
      }
    }
  }
}
37302
// m = mr = 1, n = 3 (the only value in the loop range), k in {1, 3, 5}.
TEST(QU8_GEMM_MINMAX_FP32_1X2__SCALAR_IMAGIC, n_gt_2) {
  for (uint32_t n_dim = 3; n_dim <= 3; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
      GemmMicrokernelTester()
          .mr(1).nr(2).kr(1).sr(1)
          .m(1).n(n_dim).k(k_dim)
          .Test(
              xnn_qu8_gemm_minmax_fp32_ukernel_1x2__scalar_imagic,
              xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
37318
// m = mr = 1, n = 3, k in {1, 3, 5}, with cn_stride(5).
TEST(QU8_GEMM_MINMAX_FP32_1X2__SCALAR_IMAGIC, n_gt_2_strided_cn) {
  for (uint32_t n_dim = 3; n_dim <= 3; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
      GemmMicrokernelTester()
          .mr(1).nr(2).kr(1).sr(1)
          .m(1).n(n_dim).k(k_dim)
          .cn_stride(5)
          .Test(
              xnn_qu8_gemm_minmax_fp32_ukernel_1x2__scalar_imagic,
              xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
37335
// m = mr = 1, n = 3, k in {1, 3, 5}, with a_stride(7).
TEST(QU8_GEMM_MINMAX_FP32_1X2__SCALAR_IMAGIC, n_gt_2_strided_a) {
  for (uint32_t n_dim = 3; n_dim <= 3; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
      GemmMicrokernelTester()
          .mr(1).nr(2).kr(1).sr(1)
          .m(1).n(n_dim).k(k_dim)
          .a_stride(7)
          .Test(
              xnn_qu8_gemm_minmax_fp32_ukernel_1x2__scalar_imagic,
              xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
              xnn_qu8_requantize_fp32);
    }
  }
}
37352
// n = 3, k in {1, 3, 5}; the m loop covers only m = 1. Uses iterations(1).
TEST(QU8_GEMM_MINMAX_FP32_1X2__SCALAR_IMAGIC, n_gt_2_subtile) {
  for (uint32_t n_dim = 3; n_dim <= 3; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
      for (uint32_t m_dim = 1; m_dim <= 1; ++m_dim) {
        GemmMicrokernelTester()
            .mr(1).nr(2).kr(1).sr(1)
            .m(m_dim).n(n_dim).k(k_dim)
            .iterations(1)
            .Test(
                xnn_qu8_gemm_minmax_fp32_ukernel_1x2__scalar_imagic,
                xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
                xnn_qu8_requantize_fp32);
      }
    }
  }
}
37371
// n = 4 and 6 (multiples of nr = 2) with m = 1 and k = 1, 3, 5.
TEST(QU8_GEMM_MINMAX_FP32_1X2__SCALAR_IMAGIC, n_div_2) {
  for (uint32_t n_dim = 4; n_dim < 7; n_dim += 2) {
    for (size_t k_dim = 1; k_dim < 6; k_dim += 2) {
      GemmMicrokernelTester()
          .mr(1).nr(2).kr(1).sr(1)
          .m(1).n(n_dim).k(k_dim)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x2__scalar_imagic,
                xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
                xnn_qu8_requantize_fp32);
    }
  }
}
37387
// n = 4 and 6 (multiples of nr = 2) with k = 1, 3, 5 and cn_stride set to 5.
TEST(QU8_GEMM_MINMAX_FP32_1X2__SCALAR_IMAGIC, n_div_2_strided_cn) {
  for (uint32_t n_dim = 4; n_dim < 7; n_dim += 2) {
    for (size_t k_dim = 1; k_dim < 6; k_dim += 2) {
      GemmMicrokernelTester()
          .mr(1).nr(2).kr(1).sr(1)
          .m(1).n(n_dim).k(k_dim)
          .cn_stride(5)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x2__scalar_imagic,
                xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
                xnn_qu8_requantize_fp32);
    }
  }
}
37404
// n = 4 and 6 (multiples of nr = 2) with k = 1, 3, 5 and a_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_1X2__SCALAR_IMAGIC, n_div_2_strided_a) {
  for (uint32_t n_dim = 4; n_dim < 7; n_dim += 2) {
    for (size_t k_dim = 1; k_dim < 6; k_dim += 2) {
      GemmMicrokernelTester()
          .mr(1).nr(2).kr(1).sr(1)
          .m(1).n(n_dim).k(k_dim)
          .a_stride(7)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x2__scalar_imagic,
                xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
                xnn_qu8_requantize_fp32);
    }
  }
}
37421
// n = 4 and 6, k = 1, 3, 5, every m in [1, mr]; one iteration per case.
TEST(QU8_GEMM_MINMAX_FP32_1X2__SCALAR_IMAGIC, n_div_2_subtile) {
  for (uint32_t n_dim = 4; n_dim < 7; n_dim += 2) {
    for (size_t k_dim = 1; k_dim < 6; k_dim += 2) {
      for (uint32_t m_dim = 1; m_dim < 2; ++m_dim) {
        GemmMicrokernelTester()
            .mr(1).nr(2).kr(1).sr(1)
            .m(m_dim).n(n_dim).k(k_dim)
            .iterations(1)
            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x2__scalar_imagic,
                  xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
                  xnn_qu8_requantize_fp32);
      }
    }
  }
}
37440
// cm_stride = 5 over k = 1, 3, 5, n in [1, nr] and m in [1, mr]; one iteration per case.
TEST(QU8_GEMM_MINMAX_FP32_1X2__SCALAR_IMAGIC, strided_cm_subtile) {
  for (size_t k_dim = 1; k_dim < 6; k_dim += 2) {
    for (uint32_t n_dim = 1; n_dim < 3; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim < 2; ++m_dim) {
        GemmMicrokernelTester()
            .mr(1).nr(2).kr(1).sr(1)
            .m(m_dim).n(n_dim).k(k_dim)
            .cm_stride(5)
            .iterations(1)
            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x2__scalar_imagic,
                  xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
                  xnn_qu8_requantize_fp32);
      }
    }
  }
}
37460
// m = mr, n = nr, k = 1 with qmin set to 128.
TEST(QU8_GEMM_MINMAX_FP32_1X2__SCALAR_IMAGIC, qmin) {
  GemmMicrokernelTester()
      .mr(1).nr(2).kr(1).sr(1)
      .m(1).n(2).k(1)
      .qmin(128)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x2__scalar_imagic,
            xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
            xnn_qu8_requantize_fp32);
}
37473
// m = mr, n = nr, k = 1 with qmax set to 128.
TEST(QU8_GEMM_MINMAX_FP32_1X2__SCALAR_IMAGIC, qmax) {
  GemmMicrokernelTester()
      .mr(1).nr(2).kr(1).sr(1)
      .m(1).n(2).k(1)
      .qmax(128)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x2__scalar_imagic,
            xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
            xnn_qu8_requantize_fp32);
}
37486
// m = mr, n = nr, k = 1 with cm_stride set to 5.
TEST(QU8_GEMM_MINMAX_FP32_1X2__SCALAR_IMAGIC, strided_cm) {
  GemmMicrokernelTester()
      .mr(1).nr(2).kr(1).sr(1)
      .m(1).n(2).k(1)
      .cm_stride(5)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x2__scalar_imagic,
            xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
            xnn_qu8_requantize_fp32);
}
37499
// m = mr, n = nr, k = 1, 3, 5 with a_zero_point set to 0.
TEST(QU8_GEMM_MINMAX_FP32_1X2__SCALAR_IMAGIC, no_a_zero_point) {
  for (size_t k_dim = 1; k_dim < 6; k_dim += 2) {
    GemmMicrokernelTester()
        .mr(1).nr(2).kr(1).sr(1)
        .m(1).n(2).k(k_dim)
        .a_zero_point(0)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x2__scalar_imagic,
              xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
              xnn_qu8_requantize_fp32);
  }
}
37514
// m = mr, n = nr, k = 1, 3, 5 with b_zero_point set to 0.
TEST(QU8_GEMM_MINMAX_FP32_1X2__SCALAR_IMAGIC, no_b_zero_point) {
  for (size_t k_dim = 1; k_dim < 6; k_dim += 2) {
    GemmMicrokernelTester()
        .mr(1).nr(2).kr(1).sr(1)
        .m(1).n(2).k(k_dim)
        .b_zero_point(0)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x2__scalar_imagic,
              xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
              xnn_qu8_requantize_fp32);
  }
}
37529
// m = mr, n = nr, k = 1, 3, 5 with both a_zero_point and b_zero_point set to 0.
TEST(QU8_GEMM_MINMAX_FP32_1X2__SCALAR_IMAGIC, no_zero_point) {
  for (size_t k_dim = 1; k_dim < 6; k_dim += 2) {
    GemmMicrokernelTester()
        .mr(1).nr(2).kr(1).sr(1)
        .m(1).n(2).k(k_dim)
        .a_zero_point(0)
        .b_zero_point(0)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x2__scalar_imagic,
              xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
              xnn_qu8_requantize_fp32);
  }
}
37545
37546
// Baseline case: m = mr, n = nr, k = 1.
TEST(QU8_GEMM_MINMAX_FP32_1X4__SCALAR_IMAGIC, k_eq_1) {
  GemmMicrokernelTester()
      .mr(1).nr(4).kr(1).sr(1)
      .m(1).n(4).k(1)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4__scalar_imagic,
            xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
            xnn_qu8_requantize_fp32);
}
37558
// m = mr, n = nr, k = 1 with cn_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_1X4__SCALAR_IMAGIC, strided_cn) {
  GemmMicrokernelTester()
      .mr(1).nr(4).kr(1).sr(1)
      .m(1).n(4).k(1)
      .cn_stride(7)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4__scalar_imagic,
            xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
            xnn_qu8_requantize_fp32);
}
37571
// m = mr, n = nr, k = 1 with a_stride set to 3.
TEST(QU8_GEMM_MINMAX_FP32_1X4__SCALAR_IMAGIC, k_eq_1_strided_a) {
  GemmMicrokernelTester()
      .mr(1).nr(4).kr(1).sr(1)
      .m(1).n(4).k(1)
      .a_stride(3)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4__scalar_imagic,
            xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
            xnn_qu8_requantize_fp32);
}
37584
// k = 1 over every n in [1, nr] and m in [1, mr]; one iteration per case.
TEST(QU8_GEMM_MINMAX_FP32_1X4__SCALAR_IMAGIC, k_eq_1_subtile) {
  for (uint32_t n_dim = 1; n_dim < 5; ++n_dim) {
    for (uint32_t m_dim = 1; m_dim < 2; ++m_dim) {
      GemmMicrokernelTester()
          .mr(1).nr(4).kr(1).sr(1)
          .m(m_dim).n(n_dim).k(1)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4__scalar_imagic,
                xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
                xnn_qu8_requantize_fp32);
    }
  }
}
37601
// k = 1, n = nr, every m in [1, mr]; one iteration per case.
TEST(QU8_GEMM_MINMAX_FP32_1X4__SCALAR_IMAGIC, k_eq_1_subtile_m) {
  for (uint32_t m_dim = 1; m_dim < 2; ++m_dim) {
    GemmMicrokernelTester()
        .mr(1).nr(4).kr(1).sr(1)
        .m(m_dim).n(4).k(1)
        .iterations(1)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4__scalar_imagic,
              xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
              xnn_qu8_requantize_fp32);
  }
}
37616
// k = 1, m = mr, every n in [1, nr]; one iteration per case.
TEST(QU8_GEMM_MINMAX_FP32_1X4__SCALAR_IMAGIC, k_eq_1_subtile_n) {
  for (uint32_t n_dim = 1; n_dim < 5; ++n_dim) {
    GemmMicrokernelTester()
        .mr(1).nr(4).kr(1).sr(1)
        .m(1).n(n_dim).k(1)
        .iterations(1)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4__scalar_imagic,
              xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
              xnn_qu8_requantize_fp32);
  }
}
37631
// m = mr, n = nr, every k from 2 through 9.
TEST(QU8_GEMM_MINMAX_FP32_1X4__SCALAR_IMAGIC, k_gt_1) {
  for (size_t k_dim = 2; k_dim <= 9; ++k_dim) {
    GemmMicrokernelTester()
        .mr(1).nr(4).kr(1).sr(1)
        .m(1).n(4).k(k_dim)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4__scalar_imagic,
              xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
              xnn_qu8_requantize_fp32);
  }
}
37645
// m = mr, n = nr, every k from 2 through 9 with a_stride set to 11.
TEST(QU8_GEMM_MINMAX_FP32_1X4__SCALAR_IMAGIC, k_gt_1_strided_a) {
  for (size_t k_dim = 2; k_dim <= 9; ++k_dim) {
    GemmMicrokernelTester()
        .mr(1).nr(4).kr(1).sr(1)
        .m(1).n(4).k(k_dim)
        .a_stride(11)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4__scalar_imagic,
              xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
              xnn_qu8_requantize_fp32);
  }
}
37660
// Every k from 2 through 9, n in [1, nr] and m in [1, mr]; one iteration per case.
TEST(QU8_GEMM_MINMAX_FP32_1X4__SCALAR_IMAGIC, k_gt_1_subtile) {
  for (size_t k_dim = 2; k_dim <= 9; ++k_dim) {
    for (uint32_t n_dim = 1; n_dim < 5; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim < 2; ++m_dim) {
        GemmMicrokernelTester()
            .mr(1).nr(4).kr(1).sr(1)
            .m(m_dim).n(n_dim).k(k_dim)
            .iterations(1)
            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4__scalar_imagic,
                  xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
                  xnn_qu8_requantize_fp32);
      }
    }
  }
}
37679
// n = 5, 6, 7 (above nr = 4) with m = 1 and k = 1, 3, 5.
TEST(QU8_GEMM_MINMAX_FP32_1X4__SCALAR_IMAGIC, n_gt_4) {
  for (uint32_t n_dim = 5; n_dim <= 7; ++n_dim) {
    for (size_t k_dim = 1; k_dim < 6; k_dim += 2) {
      GemmMicrokernelTester()
          .mr(1).nr(4).kr(1).sr(1)
          .m(1).n(n_dim).k(k_dim)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4__scalar_imagic,
                xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
                xnn_qu8_requantize_fp32);
    }
  }
}
37695
// n = 5, 6, 7 (above nr = 4) with k = 1, 3, 5 and cn_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_1X4__SCALAR_IMAGIC, n_gt_4_strided_cn) {
  for (uint32_t n_dim = 5; n_dim <= 7; ++n_dim) {
    for (size_t k_dim = 1; k_dim < 6; k_dim += 2) {
      GemmMicrokernelTester()
          .mr(1).nr(4).kr(1).sr(1)
          .m(1).n(n_dim).k(k_dim)
          .cn_stride(7)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4__scalar_imagic,
                xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
                xnn_qu8_requantize_fp32);
    }
  }
}
37712
// n = 5, 6, 7 (above nr = 4) with k = 1, 3, 5 and a_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_1X4__SCALAR_IMAGIC, n_gt_4_strided_a) {
  for (uint32_t n_dim = 5; n_dim <= 7; ++n_dim) {
    for (size_t k_dim = 1; k_dim < 6; k_dim += 2) {
      GemmMicrokernelTester()
          .mr(1).nr(4).kr(1).sr(1)
          .m(1).n(n_dim).k(k_dim)
          .a_stride(7)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4__scalar_imagic,
                xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
                xnn_qu8_requantize_fp32);
    }
  }
}
37729
// n = 5, 6, 7, k = 1, 3, 5, every m in [1, mr]; one iteration per case.
TEST(QU8_GEMM_MINMAX_FP32_1X4__SCALAR_IMAGIC, n_gt_4_subtile) {
  for (uint32_t n_dim = 5; n_dim <= 7; ++n_dim) {
    for (size_t k_dim = 1; k_dim < 6; k_dim += 2) {
      for (uint32_t m_dim = 1; m_dim < 2; ++m_dim) {
        GemmMicrokernelTester()
            .mr(1).nr(4).kr(1).sr(1)
            .m(m_dim).n(n_dim).k(k_dim)
            .iterations(1)
            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4__scalar_imagic,
                  xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
                  xnn_qu8_requantize_fp32);
      }
    }
  }
}
37748
// n = 8 and 12 (multiples of nr = 4) with m = 1 and k = 1, 3, 5.
TEST(QU8_GEMM_MINMAX_FP32_1X4__SCALAR_IMAGIC, n_div_4) {
  for (uint32_t n_dim = 8; n_dim < 13; n_dim += 4) {
    for (size_t k_dim = 1; k_dim < 6; k_dim += 2) {
      GemmMicrokernelTester()
          .mr(1).nr(4).kr(1).sr(1)
          .m(1).n(n_dim).k(k_dim)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4__scalar_imagic,
                xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
                xnn_qu8_requantize_fp32);
    }
  }
}
37764
// n = 8 and 12 (multiples of nr = 4) with k = 1, 3, 5 and cn_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_1X4__SCALAR_IMAGIC, n_div_4_strided_cn) {
  for (uint32_t n_dim = 8; n_dim < 13; n_dim += 4) {
    for (size_t k_dim = 1; k_dim < 6; k_dim += 2) {
      GemmMicrokernelTester()
          .mr(1).nr(4).kr(1).sr(1)
          .m(1).n(n_dim).k(k_dim)
          .cn_stride(7)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4__scalar_imagic,
                xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
                xnn_qu8_requantize_fp32);
    }
  }
}
37781
// n = 8 and 12 (multiples of nr = 4) with k = 1, 3, 5 and a_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_1X4__SCALAR_IMAGIC, n_div_4_strided_a) {
  for (uint32_t n_dim = 8; n_dim < 13; n_dim += 4) {
    for (size_t k_dim = 1; k_dim < 6; k_dim += 2) {
      GemmMicrokernelTester()
          .mr(1).nr(4).kr(1).sr(1)
          .m(1).n(n_dim).k(k_dim)
          .a_stride(7)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4__scalar_imagic,
                xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
                xnn_qu8_requantize_fp32);
    }
  }
}
37798
// n = 8 and 12, k = 1, 3, 5, every m in [1, mr]; one iteration per case.
TEST(QU8_GEMM_MINMAX_FP32_1X4__SCALAR_IMAGIC, n_div_4_subtile) {
  for (uint32_t n_dim = 8; n_dim < 13; n_dim += 4) {
    for (size_t k_dim = 1; k_dim < 6; k_dim += 2) {
      for (uint32_t m_dim = 1; m_dim < 2; ++m_dim) {
        GemmMicrokernelTester()
            .mr(1).nr(4).kr(1).sr(1)
            .m(m_dim).n(n_dim).k(k_dim)
            .iterations(1)
            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4__scalar_imagic,
                  xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
                  xnn_qu8_requantize_fp32);
      }
    }
  }
}
37817
// cm_stride = 7 over k = 1, 3, 5, n in [1, nr] and m in [1, mr]; one iteration per case.
TEST(QU8_GEMM_MINMAX_FP32_1X4__SCALAR_IMAGIC, strided_cm_subtile) {
  for (size_t k_dim = 1; k_dim < 6; k_dim += 2) {
    for (uint32_t n_dim = 1; n_dim < 5; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim < 2; ++m_dim) {
        GemmMicrokernelTester()
            .mr(1).nr(4).kr(1).sr(1)
            .m(m_dim).n(n_dim).k(k_dim)
            .cm_stride(7)
            .iterations(1)
            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4__scalar_imagic,
                  xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
                  xnn_qu8_requantize_fp32);
      }
    }
  }
}
37837
// m = mr, n = nr, k = 1 with qmin set to 128.
TEST(QU8_GEMM_MINMAX_FP32_1X4__SCALAR_IMAGIC, qmin) {
  GemmMicrokernelTester()
      .mr(1).nr(4).kr(1).sr(1)
      .m(1).n(4).k(1)
      .qmin(128)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4__scalar_imagic,
            xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
            xnn_qu8_requantize_fp32);
}
37850
// m = mr, n = nr, k = 1 with qmax set to 128.
TEST(QU8_GEMM_MINMAX_FP32_1X4__SCALAR_IMAGIC, qmax) {
  GemmMicrokernelTester()
      .mr(1).nr(4).kr(1).sr(1)
      .m(1).n(4).k(1)
      .qmax(128)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4__scalar_imagic,
            xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
            xnn_qu8_requantize_fp32);
}
37863
// m = mr, n = nr, k = 1 with cm_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_1X4__SCALAR_IMAGIC, strided_cm) {
  GemmMicrokernelTester()
      .mr(1).nr(4).kr(1).sr(1)
      .m(1).n(4).k(1)
      .cm_stride(7)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4__scalar_imagic,
            xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
            xnn_qu8_requantize_fp32);
}
37876
// m = mr, n = nr, k = 1, 3, 5 with a_zero_point set to 0.
TEST(QU8_GEMM_MINMAX_FP32_1X4__SCALAR_IMAGIC, no_a_zero_point) {
  for (size_t k_dim = 1; k_dim < 6; k_dim += 2) {
    GemmMicrokernelTester()
        .mr(1).nr(4).kr(1).sr(1)
        .m(1).n(4).k(k_dim)
        .a_zero_point(0)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4__scalar_imagic,
              xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
              xnn_qu8_requantize_fp32);
  }
}
37891
// m = mr, n = nr, k = 1, 3, 5 with b_zero_point set to 0.
TEST(QU8_GEMM_MINMAX_FP32_1X4__SCALAR_IMAGIC, no_b_zero_point) {
  for (size_t k_dim = 1; k_dim < 6; k_dim += 2) {
    GemmMicrokernelTester()
        .mr(1).nr(4).kr(1).sr(1)
        .m(1).n(4).k(k_dim)
        .b_zero_point(0)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4__scalar_imagic,
              xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
              xnn_qu8_requantize_fp32);
  }
}
37906
// m = mr, n = nr, k = 1, 3, 5 with both a_zero_point and b_zero_point set to 0.
TEST(QU8_GEMM_MINMAX_FP32_1X4__SCALAR_IMAGIC, no_zero_point) {
  for (size_t k_dim = 1; k_dim < 6; k_dim += 2) {
    GemmMicrokernelTester()
        .mr(1).nr(4).kr(1).sr(1)
        .m(1).n(4).k(k_dim)
        .a_zero_point(0)
        .b_zero_point(0)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x4__scalar_imagic,
              xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
              xnn_qu8_requantize_fp32);
  }
}
37922
37923
// Baseline case: m = mr, n = nr, k = 1.
TEST(QU8_GEMM_MINMAX_FP32_2X2__SCALAR_IMAGIC, k_eq_1) {
  GemmMicrokernelTester()
      .mr(2).nr(2).kr(1).sr(1)
      .m(2).n(2).k(1)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x2__scalar_imagic,
            xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
            xnn_qu8_requantize_fp32);
}
37935
// m = mr, n = nr, k = 1 with cn_stride set to 5.
TEST(QU8_GEMM_MINMAX_FP32_2X2__SCALAR_IMAGIC, strided_cn) {
  GemmMicrokernelTester()
      .mr(2).nr(2).kr(1).sr(1)
      .m(2).n(2).k(1)
      .cn_stride(5)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x2__scalar_imagic,
            xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
            xnn_qu8_requantize_fp32);
}
37948
// m = mr, n = nr, k = 1 with a_stride set to 3.
TEST(QU8_GEMM_MINMAX_FP32_2X2__SCALAR_IMAGIC, k_eq_1_strided_a) {
  GemmMicrokernelTester()
      .mr(2).nr(2).kr(1).sr(1)
      .m(2).n(2).k(1)
      .a_stride(3)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x2__scalar_imagic,
            xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
            xnn_qu8_requantize_fp32);
}
37961
// k = 1 over every n in [1, nr] and m in [1, mr]; one iteration per case.
TEST(QU8_GEMM_MINMAX_FP32_2X2__SCALAR_IMAGIC, k_eq_1_subtile) {
  for (uint32_t n_dim = 1; n_dim < 3; ++n_dim) {
    for (uint32_t m_dim = 1; m_dim < 3; ++m_dim) {
      GemmMicrokernelTester()
          .mr(2).nr(2).kr(1).sr(1)
          .m(m_dim).n(n_dim).k(1)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x2__scalar_imagic,
                xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
                xnn_qu8_requantize_fp32);
    }
  }
}
37978
// k = 1, n = nr, every m in [1, mr]; one iteration per case.
TEST(QU8_GEMM_MINMAX_FP32_2X2__SCALAR_IMAGIC, k_eq_1_subtile_m) {
  for (uint32_t m_dim = 1; m_dim < 3; ++m_dim) {
    GemmMicrokernelTester()
        .mr(2).nr(2).kr(1).sr(1)
        .m(m_dim).n(2).k(1)
        .iterations(1)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x2__scalar_imagic,
              xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
              xnn_qu8_requantize_fp32);
  }
}
37993
// k = 1, m = mr, every n in [1, nr]; one iteration per case.
TEST(QU8_GEMM_MINMAX_FP32_2X2__SCALAR_IMAGIC, k_eq_1_subtile_n) {
  for (uint32_t n_dim = 1; n_dim < 3; ++n_dim) {
    GemmMicrokernelTester()
        .mr(2).nr(2).kr(1).sr(1)
        .m(2).n(n_dim).k(1)
        .iterations(1)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x2__scalar_imagic,
              xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
              xnn_qu8_requantize_fp32);
  }
}
38008
// m = mr, n = nr, every k from 2 through 9.
TEST(QU8_GEMM_MINMAX_FP32_2X2__SCALAR_IMAGIC, k_gt_1) {
  for (size_t k_dim = 2; k_dim <= 9; ++k_dim) {
    GemmMicrokernelTester()
        .mr(2).nr(2).kr(1).sr(1)
        .m(2).n(2).k(k_dim)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x2__scalar_imagic,
              xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
              xnn_qu8_requantize_fp32);
  }
}
38022
// m = mr, n = nr, every k from 2 through 9 with a_stride set to 11.
TEST(QU8_GEMM_MINMAX_FP32_2X2__SCALAR_IMAGIC, k_gt_1_strided_a) {
  for (size_t k_dim = 2; k_dim <= 9; ++k_dim) {
    GemmMicrokernelTester()
        .mr(2).nr(2).kr(1).sr(1)
        .m(2).n(2).k(k_dim)
        .a_stride(11)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x2__scalar_imagic,
              xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
              xnn_qu8_requantize_fp32);
  }
}
38037
// Every k from 2 through 9, n in [1, nr] and m in [1, mr]; one iteration per case.
TEST(QU8_GEMM_MINMAX_FP32_2X2__SCALAR_IMAGIC, k_gt_1_subtile) {
  for (size_t k_dim = 2; k_dim <= 9; ++k_dim) {
    for (uint32_t n_dim = 1; n_dim < 3; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim < 3; ++m_dim) {
        GemmMicrokernelTester()
            .mr(2).nr(2).kr(1).sr(1)
            .m(m_dim).n(n_dim).k(k_dim)
            .iterations(1)
            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x2__scalar_imagic,
                  xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
                  xnn_qu8_requantize_fp32);
      }
    }
  }
}
38056
// n = 3 (above nr = 2) with m = 2 and k = 1, 3, 5.
TEST(QU8_GEMM_MINMAX_FP32_2X2__SCALAR_IMAGIC, n_gt_2) {
  for (uint32_t n_dim = 3; n_dim <= 3; ++n_dim) {
    for (size_t k_dim = 1; k_dim < 6; k_dim += 2) {
      GemmMicrokernelTester()
          .mr(2).nr(2).kr(1).sr(1)
          .m(2).n(n_dim).k(k_dim)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x2__scalar_imagic,
                xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
                xnn_qu8_requantize_fp32);
    }
  }
}
38072
// n = 3 (above nr = 2) with k = 1, 3, 5 and cn_stride set to 5.
TEST(QU8_GEMM_MINMAX_FP32_2X2__SCALAR_IMAGIC, n_gt_2_strided_cn) {
  for (uint32_t n_dim = 3; n_dim <= 3; ++n_dim) {
    for (size_t k_dim = 1; k_dim < 6; k_dim += 2) {
      GemmMicrokernelTester()
          .mr(2).nr(2).kr(1).sr(1)
          .m(2).n(n_dim).k(k_dim)
          .cn_stride(5)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x2__scalar_imagic,
                xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
                xnn_qu8_requantize_fp32);
    }
  }
}
38089
// n = 3 (above nr = 2) with k = 1, 3, 5 and a_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_2X2__SCALAR_IMAGIC, n_gt_2_strided_a) {
  for (uint32_t n_dim = 3; n_dim <= 3; ++n_dim) {
    for (size_t k_dim = 1; k_dim < 6; k_dim += 2) {
      GemmMicrokernelTester()
          .mr(2).nr(2).kr(1).sr(1)
          .m(2).n(n_dim).k(k_dim)
          .a_stride(7)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x2__scalar_imagic,
                xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
                xnn_qu8_requantize_fp32);
    }
  }
}
38106
// n = 3 (above nr = 2), k = 1, 3, 5, every m in [1, mr]; one iteration per case.
TEST(QU8_GEMM_MINMAX_FP32_2X2__SCALAR_IMAGIC, n_gt_2_subtile) {
  for (uint32_t n_dim = 3; n_dim <= 3; ++n_dim) {
    for (size_t k_dim = 1; k_dim < 6; k_dim += 2) {
      for (uint32_t m_dim = 1; m_dim < 3; ++m_dim) {
        GemmMicrokernelTester()
            .mr(2).nr(2).kr(1).sr(1)
            .m(m_dim).n(n_dim).k(k_dim)
            .iterations(1)
            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x2__scalar_imagic,
                  xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
                  xnn_qu8_requantize_fp32);
      }
    }
  }
}
38125
// n = 4 and 6 (multiples of nr = 2) with m = 2 and k = 1, 3, 5.
TEST(QU8_GEMM_MINMAX_FP32_2X2__SCALAR_IMAGIC, n_div_2) {
  for (uint32_t n_dim = 4; n_dim < 7; n_dim += 2) {
    for (size_t k_dim = 1; k_dim < 6; k_dim += 2) {
      GemmMicrokernelTester()
          .mr(2).nr(2).kr(1).sr(1)
          .m(2).n(n_dim).k(k_dim)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x2__scalar_imagic,
                xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
                xnn_qu8_requantize_fp32);
    }
  }
}
38141
// n = 4 and 6 (multiples of nr = 2) with k = 1, 3, 5 and cn_stride set to 5.
TEST(QU8_GEMM_MINMAX_FP32_2X2__SCALAR_IMAGIC, n_div_2_strided_cn) {
  for (uint32_t n_dim = 4; n_dim < 7; n_dim += 2) {
    for (size_t k_dim = 1; k_dim < 6; k_dim += 2) {
      GemmMicrokernelTester()
          .mr(2).nr(2).kr(1).sr(1)
          .m(2).n(n_dim).k(k_dim)
          .cn_stride(5)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x2__scalar_imagic,
                xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
                xnn_qu8_requantize_fp32);
    }
  }
}
38158
// n = 4 and 6 (multiples of nr = 2) with k = 1, 3, 5 and a_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_2X2__SCALAR_IMAGIC, n_div_2_strided_a) {
  for (uint32_t n_dim = 4; n_dim < 7; n_dim += 2) {
    for (size_t k_dim = 1; k_dim < 6; k_dim += 2) {
      GemmMicrokernelTester()
          .mr(2).nr(2).kr(1).sr(1)
          .m(2).n(n_dim).k(k_dim)
          .a_stride(7)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x2__scalar_imagic,
                xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
                xnn_qu8_requantize_fp32);
    }
  }
}
38175
// n = 4 and 6, k = 1, 3, 5, every m in [1, mr]; one iteration per case.
TEST(QU8_GEMM_MINMAX_FP32_2X2__SCALAR_IMAGIC, n_div_2_subtile) {
  for (uint32_t n_dim = 4; n_dim < 7; n_dim += 2) {
    for (size_t k_dim = 1; k_dim < 6; k_dim += 2) {
      for (uint32_t m_dim = 1; m_dim < 3; ++m_dim) {
        GemmMicrokernelTester()
            .mr(2).nr(2).kr(1).sr(1)
            .m(m_dim).n(n_dim).k(k_dim)
            .iterations(1)
            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x2__scalar_imagic,
                  xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
                  xnn_qu8_requantize_fp32);
      }
    }
  }
}
38194
// cm_stride = 5 over k = 1, 3, 5, n in [1, nr] and m in [1, mr]; one iteration per case.
TEST(QU8_GEMM_MINMAX_FP32_2X2__SCALAR_IMAGIC, strided_cm_subtile) {
  for (size_t k_dim = 1; k_dim < 6; k_dim += 2) {
    for (uint32_t n_dim = 1; n_dim < 3; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim < 3; ++m_dim) {
        GemmMicrokernelTester()
            .mr(2).nr(2).kr(1).sr(1)
            .m(m_dim).n(n_dim).k(k_dim)
            .cm_stride(5)
            .iterations(1)
            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x2__scalar_imagic,
                  xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
                  xnn_qu8_requantize_fp32);
      }
    }
  }
}
38214
// m = mr, n = nr, k = 1 with qmin set to 128.
TEST(QU8_GEMM_MINMAX_FP32_2X2__SCALAR_IMAGIC, qmin) {
  GemmMicrokernelTester()
      .mr(2).nr(2).kr(1).sr(1)
      .m(2).n(2).k(1)
      .qmin(128)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x2__scalar_imagic,
            xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
            xnn_qu8_requantize_fp32);
}
38227
// m = mr, n = nr, k = 1 with qmax set to 128.
TEST(QU8_GEMM_MINMAX_FP32_2X2__SCALAR_IMAGIC, qmax) {
  GemmMicrokernelTester()
      .mr(2).nr(2).kr(1).sr(1)
      .m(2).n(2).k(1)
      .qmax(128)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x2__scalar_imagic,
            xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
            xnn_qu8_requantize_fp32);
}
38240
// m = mr, n = nr, k = 1 with cm_stride set to 5.
TEST(QU8_GEMM_MINMAX_FP32_2X2__SCALAR_IMAGIC, strided_cm) {
  GemmMicrokernelTester()
      .mr(2).nr(2).kr(1).sr(1)
      .m(2).n(2).k(1)
      .cm_stride(5)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x2__scalar_imagic,
            xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
            xnn_qu8_requantize_fp32);
}
38253
// m = mr, n = nr, k = 1, 3, 5 with a_zero_point set to 0.
TEST(QU8_GEMM_MINMAX_FP32_2X2__SCALAR_IMAGIC, no_a_zero_point) {
  for (size_t k_dim = 1; k_dim < 6; k_dim += 2) {
    GemmMicrokernelTester()
        .mr(2).nr(2).kr(1).sr(1)
        .m(2).n(2).k(k_dim)
        .a_zero_point(0)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x2__scalar_imagic,
              xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
              xnn_qu8_requantize_fp32);
  }
}
38268
// m = mr, n = nr, k = 1, 3, 5 with b_zero_point set to 0.
TEST(QU8_GEMM_MINMAX_FP32_2X2__SCALAR_IMAGIC, no_b_zero_point) {
  for (size_t k_dim = 1; k_dim < 6; k_dim += 2) {
    GemmMicrokernelTester()
        .mr(2).nr(2).kr(1).sr(1)
        .m(2).n(2).k(k_dim)
        .b_zero_point(0)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x2__scalar_imagic,
              xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
              xnn_qu8_requantize_fp32);
  }
}
38283
// m = mr, n = nr, k = 1, 3, 5 with both a_zero_point and b_zero_point set to 0.
TEST(QU8_GEMM_MINMAX_FP32_2X2__SCALAR_IMAGIC, no_zero_point) {
  for (size_t k_dim = 1; k_dim < 6; k_dim += 2) {
    GemmMicrokernelTester()
        .mr(2).nr(2).kr(1).sr(1)
        .m(2).n(2).k(k_dim)
        .a_zero_point(0)
        .b_zero_point(0)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x2__scalar_imagic,
              xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
              xnn_qu8_requantize_fp32);
  }
}
38299
38300
// Baseline case: m = mr, n = nr, k = 1.
TEST(QU8_GEMM_MINMAX_FP32_2X4__SCALAR_IMAGIC, k_eq_1) {
  GemmMicrokernelTester()
      .mr(2).nr(4).kr(1).sr(1)
      .m(2).n(4).k(1)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4__scalar_imagic,
            xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
            xnn_qu8_requantize_fp32);
}
38312
// m = mr, n = nr, k = 1 with cn_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_2X4__SCALAR_IMAGIC, strided_cn) {
  GemmMicrokernelTester()
      .mr(2).nr(4).kr(1).sr(1)
      .m(2).n(4).k(1)
      .cn_stride(7)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4__scalar_imagic,
            xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
            xnn_qu8_requantize_fp32);
}
38325
// m = mr, n = nr, k = 1 with a_stride set to 3.
TEST(QU8_GEMM_MINMAX_FP32_2X4__SCALAR_IMAGIC, k_eq_1_strided_a) {
  GemmMicrokernelTester()
      .mr(2).nr(4).kr(1).sr(1)
      .m(2).n(4).k(1)
      .a_stride(3)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4__scalar_imagic,
            xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
            xnn_qu8_requantize_fp32);
}
38338
// k = 1 over every n in [1, nr] and m in [1, mr]; one iteration per case.
TEST(QU8_GEMM_MINMAX_FP32_2X4__SCALAR_IMAGIC, k_eq_1_subtile) {
  for (uint32_t n_dim = 1; n_dim < 5; ++n_dim) {
    for (uint32_t m_dim = 1; m_dim < 3; ++m_dim) {
      GemmMicrokernelTester()
          .mr(2).nr(4).kr(1).sr(1)
          .m(m_dim).n(n_dim).k(1)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4__scalar_imagic,
                xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
                xnn_qu8_requantize_fp32);
    }
  }
}
38355
// k = 1, n = nr, every m in [1, mr]; one iteration per case.
TEST(QU8_GEMM_MINMAX_FP32_2X4__SCALAR_IMAGIC, k_eq_1_subtile_m) {
  for (uint32_t m_dim = 1; m_dim < 3; ++m_dim) {
    GemmMicrokernelTester()
        .mr(2).nr(4).kr(1).sr(1)
        .m(m_dim).n(4).k(1)
        .iterations(1)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4__scalar_imagic,
              xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
              xnn_qu8_requantize_fp32);
  }
}
38370
// k = 1, m = mr, every n in [1, nr]; one iteration per case.
TEST(QU8_GEMM_MINMAX_FP32_2X4__SCALAR_IMAGIC, k_eq_1_subtile_n) {
  for (uint32_t n_dim = 1; n_dim < 5; ++n_dim) {
    GemmMicrokernelTester()
        .mr(2).nr(4).kr(1).sr(1)
        .m(2).n(n_dim).k(1)
        .iterations(1)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x4__scalar_imagic,
              xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
              xnn_qu8_requantize_fp32);
  }
}
38385
// Full 2x4 tile; sweeps k over [2, 9].
TEST(QU8_GEMM_MINMAX_FP32_2X4__SCALAR_IMAGIC, k_gt_1) {
  for (size_t k_dim = 2; k_dim <= 9; ++k_dim) {
    GemmMicrokernelTester()
      .mr(2).nr(4).kr(1).sr(1)
      .m(2).n(4).k(k_dim)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4__scalar_imagic,
        xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
        xnn_qu8_requantize_fp32);
  }
}
38399
// Full 2x4 tile with a_stride(11); sweeps k over [2, 9].
TEST(QU8_GEMM_MINMAX_FP32_2X4__SCALAR_IMAGIC, k_gt_1_strided_a) {
  for (size_t k_dim = 2; k_dim <= 9; ++k_dim) {
    GemmMicrokernelTester()
      .mr(2).nr(4).kr(1).sr(1)
      .m(2).n(4).k(k_dim)
      .a_stride(11)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4__scalar_imagic,
        xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
        xnn_qu8_requantize_fp32);
  }
}
38414
// Sweeps k over [2, 9], n over [1, 4] and m over [1, 2], one iteration each.
TEST(QU8_GEMM_MINMAX_FP32_2X4__SCALAR_IMAGIC, k_gt_1_subtile) {
  for (size_t k_dim = 2; k_dim <= 9; ++k_dim) {
    for (uint32_t n_dim = 1; n_dim <= 4; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 2; ++m_dim) {
        GemmMicrokernelTester()
          .mr(2).nr(4).kr(1).sr(1)
          .m(m_dim).n(n_dim).k(k_dim)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_2x4__scalar_imagic,
            xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
38433
// m=2; sweeps n over [5, 7] and k over {1, 3, 5}.
TEST(QU8_GEMM_MINMAX_FP32_2X4__SCALAR_IMAGIC, n_gt_4) {
  for (uint32_t n_dim = 5; n_dim <= 7; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
      GemmMicrokernelTester()
        .mr(2).nr(4).kr(1).sr(1)
        .m(2).n(n_dim).k(k_dim)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_2x4__scalar_imagic,
          xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
38449
// m=2 with cn_stride(7); sweeps n over [5, 7] and k over {1, 3, 5}.
TEST(QU8_GEMM_MINMAX_FP32_2X4__SCALAR_IMAGIC, n_gt_4_strided_cn) {
  for (uint32_t n_dim = 5; n_dim <= 7; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
      GemmMicrokernelTester()
        .mr(2).nr(4).kr(1).sr(1)
        .m(2).n(n_dim).k(k_dim)
        .cn_stride(7)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_2x4__scalar_imagic,
          xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
38466
// m=2 with a_stride(7); sweeps n over [5, 7] and k over {1, 3, 5}.
TEST(QU8_GEMM_MINMAX_FP32_2X4__SCALAR_IMAGIC, n_gt_4_strided_a) {
  for (uint32_t n_dim = 5; n_dim <= 7; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
      GemmMicrokernelTester()
        .mr(2).nr(4).kr(1).sr(1)
        .m(2).n(n_dim).k(k_dim)
        .a_stride(7)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_2x4__scalar_imagic,
          xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
38483
// Sweeps n over [5, 7], k over {1, 3, 5} and m over [1, 2], one iteration each.
TEST(QU8_GEMM_MINMAX_FP32_2X4__SCALAR_IMAGIC, n_gt_4_subtile) {
  for (uint32_t n_dim = 5; n_dim <= 7; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
      for (uint32_t m_dim = 1; m_dim <= 2; ++m_dim) {
        GemmMicrokernelTester()
          .mr(2).nr(4).kr(1).sr(1)
          .m(m_dim).n(n_dim).k(k_dim)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_2x4__scalar_imagic,
            xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
38502
// m=2; sweeps n over {8, 12} and k over {1, 3, 5}.
TEST(QU8_GEMM_MINMAX_FP32_2X4__SCALAR_IMAGIC, n_div_4) {
  for (uint32_t n_dim = 8; n_dim <= 12; n_dim += 4) {
    for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
      GemmMicrokernelTester()
        .mr(2).nr(4).kr(1).sr(1)
        .m(2).n(n_dim).k(k_dim)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_2x4__scalar_imagic,
          xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
38518
// m=2 with cn_stride(7); sweeps n over {8, 12} and k over {1, 3, 5}.
TEST(QU8_GEMM_MINMAX_FP32_2X4__SCALAR_IMAGIC, n_div_4_strided_cn) {
  for (uint32_t n_dim = 8; n_dim <= 12; n_dim += 4) {
    for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
      GemmMicrokernelTester()
        .mr(2).nr(4).kr(1).sr(1)
        .m(2).n(n_dim).k(k_dim)
        .cn_stride(7)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_2x4__scalar_imagic,
          xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
38535
// m=2 with a_stride(7); sweeps n over {8, 12} and k over {1, 3, 5}.
TEST(QU8_GEMM_MINMAX_FP32_2X4__SCALAR_IMAGIC, n_div_4_strided_a) {
  for (uint32_t n_dim = 8; n_dim <= 12; n_dim += 4) {
    for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
      GemmMicrokernelTester()
        .mr(2).nr(4).kr(1).sr(1)
        .m(2).n(n_dim).k(k_dim)
        .a_stride(7)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_2x4__scalar_imagic,
          xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
38552
// Sweeps n over {8, 12}, k over {1, 3, 5} and m over [1, 2], one iteration each.
TEST(QU8_GEMM_MINMAX_FP32_2X4__SCALAR_IMAGIC, n_div_4_subtile) {
  for (uint32_t n_dim = 8; n_dim <= 12; n_dim += 4) {
    for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
      for (uint32_t m_dim = 1; m_dim <= 2; ++m_dim) {
        GemmMicrokernelTester()
          .mr(2).nr(4).kr(1).sr(1)
          .m(m_dim).n(n_dim).k(k_dim)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_2x4__scalar_imagic,
            xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
38571
// cm_stride(7); sweeps k over {1, 3, 5}, n over [1, 4] and m over [1, 2],
// one iteration each.
TEST(QU8_GEMM_MINMAX_FP32_2X4__SCALAR_IMAGIC, strided_cm_subtile) {
  for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
    for (uint32_t n_dim = 1; n_dim <= 4; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 2; ++m_dim) {
        GemmMicrokernelTester()
          .mr(2).nr(4).kr(1).sr(1)
          .m(m_dim).n(n_dim).k(k_dim)
          .cm_stride(7)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_2x4__scalar_imagic,
            xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
38591
// Full 2x4 tile with k=1 and qmin(128).
TEST(QU8_GEMM_MINMAX_FP32_2X4__SCALAR_IMAGIC, qmin) {
  GemmMicrokernelTester()
    .mr(2).nr(4).kr(1).sr(1)
    .m(2).n(4).k(1)
    .qmin(128)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4__scalar_imagic,
      xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
      xnn_qu8_requantize_fp32);
}
38604
// Full 2x4 tile with k=1 and qmax(128).
TEST(QU8_GEMM_MINMAX_FP32_2X4__SCALAR_IMAGIC, qmax) {
  GemmMicrokernelTester()
    .mr(2).nr(4).kr(1).sr(1)
    .m(2).n(4).k(1)
    .qmax(128)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4__scalar_imagic,
      xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
      xnn_qu8_requantize_fp32);
}
38617
// Full 2x4 tile with k=1 and cm_stride(7).
TEST(QU8_GEMM_MINMAX_FP32_2X4__SCALAR_IMAGIC, strided_cm) {
  GemmMicrokernelTester()
    .mr(2).nr(4).kr(1).sr(1)
    .m(2).n(4).k(1)
    .cm_stride(7)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4__scalar_imagic,
      xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
      xnn_qu8_requantize_fp32);
}
38630
// Full 2x4 tile with a_zero_point(0); sweeps k over {1, 3, 5}.
TEST(QU8_GEMM_MINMAX_FP32_2X4__SCALAR_IMAGIC, no_a_zero_point) {
  for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
    GemmMicrokernelTester()
      .mr(2).nr(4).kr(1).sr(1)
      .m(2).n(4).k(k_dim)
      .a_zero_point(0)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4__scalar_imagic,
        xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
        xnn_qu8_requantize_fp32);
  }
}
38645
// Full 2x4 tile with b_zero_point(0); sweeps k over {1, 3, 5}.
TEST(QU8_GEMM_MINMAX_FP32_2X4__SCALAR_IMAGIC, no_b_zero_point) {
  for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
    GemmMicrokernelTester()
      .mr(2).nr(4).kr(1).sr(1)
      .m(2).n(4).k(k_dim)
      .b_zero_point(0)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4__scalar_imagic,
        xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
        xnn_qu8_requantize_fp32);
  }
}
38660
// Full 2x4 tile with both a_zero_point(0) and b_zero_point(0); sweeps k over
// {1, 3, 5}.
TEST(QU8_GEMM_MINMAX_FP32_2X4__SCALAR_IMAGIC, no_zero_point) {
  for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
    GemmMicrokernelTester()
      .mr(2).nr(4).kr(1).sr(1)
      .m(2).n(4).k(k_dim)
      .a_zero_point(0)
      .b_zero_point(0)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_2x4__scalar_imagic,
        xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
        xnn_qu8_requantize_fp32);
  }
}
38676
38677
// Full 3x2 tile (m=3, n=2) with k=1.
TEST(QU8_GEMM_MINMAX_FP32_3X2__SCALAR_FMAGIC, k_eq_1) {
  GemmMicrokernelTester()
    .mr(3).nr(2).kr(1).sr(1)
    .m(3).n(2).k(1)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_3x2__scalar_fmagic,
      xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
      xnn_qu8_requantize_fp32);
}
38689
// Full 3x2 tile with k=1 and cn_stride(5).
TEST(QU8_GEMM_MINMAX_FP32_3X2__SCALAR_FMAGIC, strided_cn) {
  GemmMicrokernelTester()
    .mr(3).nr(2).kr(1).sr(1)
    .m(3).n(2).k(1)
    .cn_stride(5)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_3x2__scalar_fmagic,
      xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
      xnn_qu8_requantize_fp32);
}
38702
// Full 3x2 tile with k=1 and a_stride(3).
TEST(QU8_GEMM_MINMAX_FP32_3X2__SCALAR_FMAGIC, k_eq_1_strided_a) {
  GemmMicrokernelTester()
    .mr(3).nr(2).kr(1).sr(1)
    .m(3).n(2).k(1)
    .a_stride(3)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_3x2__scalar_fmagic,
      xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
      xnn_qu8_requantize_fp32);
}
38715
// k=1; for every n in [1, 2] sweeps every m in [1, 3], one iteration each.
TEST(QU8_GEMM_MINMAX_FP32_3X2__SCALAR_FMAGIC, k_eq_1_subtile) {
  for (uint32_t n_dim = 1; n_dim <= 2; ++n_dim) {
    for (uint32_t m_dim = 1; m_dim <= 3; ++m_dim) {
      GemmMicrokernelTester()
        .mr(3).nr(2).kr(1).sr(1)
        .m(m_dim).n(n_dim).k(1)
        .iterations(1)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_3x2__scalar_fmagic,
          xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
38732
// k=1, n=2; sweeps m over [1, 3], one iteration each.
TEST(QU8_GEMM_MINMAX_FP32_3X2__SCALAR_FMAGIC, k_eq_1_subtile_m) {
  for (uint32_t m_dim = 1; m_dim <= 3; ++m_dim) {
    GemmMicrokernelTester()
      .mr(3).nr(2).kr(1).sr(1)
      .m(m_dim).n(2).k(1)
      .iterations(1)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_3x2__scalar_fmagic,
        xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
        xnn_qu8_requantize_fp32);
  }
}
38747
// k=1, m=3; sweeps n over [1, 2], one iteration each.
TEST(QU8_GEMM_MINMAX_FP32_3X2__SCALAR_FMAGIC, k_eq_1_subtile_n) {
  for (uint32_t n_dim = 1; n_dim <= 2; ++n_dim) {
    GemmMicrokernelTester()
      .mr(3).nr(2).kr(1).sr(1)
      .m(3).n(n_dim).k(1)
      .iterations(1)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_3x2__scalar_fmagic,
        xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
        xnn_qu8_requantize_fp32);
  }
}
38762
// Full 3x2 tile; sweeps k over [2, 9].
TEST(QU8_GEMM_MINMAX_FP32_3X2__SCALAR_FMAGIC, k_gt_1) {
  for (size_t k_dim = 2; k_dim <= 9; ++k_dim) {
    GemmMicrokernelTester()
      .mr(3).nr(2).kr(1).sr(1)
      .m(3).n(2).k(k_dim)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_3x2__scalar_fmagic,
        xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
        xnn_qu8_requantize_fp32);
  }
}
38776
// Full 3x2 tile with a_stride(11); sweeps k over [2, 9].
TEST(QU8_GEMM_MINMAX_FP32_3X2__SCALAR_FMAGIC, k_gt_1_strided_a) {
  for (size_t k_dim = 2; k_dim <= 9; ++k_dim) {
    GemmMicrokernelTester()
      .mr(3).nr(2).kr(1).sr(1)
      .m(3).n(2).k(k_dim)
      .a_stride(11)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_3x2__scalar_fmagic,
        xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
        xnn_qu8_requantize_fp32);
  }
}
38791
// Sweeps k over [2, 9], n over [1, 2] and m over [1, 3], one iteration each.
TEST(QU8_GEMM_MINMAX_FP32_3X2__SCALAR_FMAGIC, k_gt_1_subtile) {
  for (size_t k_dim = 2; k_dim <= 9; ++k_dim) {
    for (uint32_t n_dim = 1; n_dim <= 2; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 3; ++m_dim) {
        GemmMicrokernelTester()
          .mr(3).nr(2).kr(1).sr(1)
          .m(m_dim).n(n_dim).k(k_dim)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_3x2__scalar_fmagic,
            xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
38810
// m=3; the n loop covers only n=3, with k over {1, 3, 5}.
TEST(QU8_GEMM_MINMAX_FP32_3X2__SCALAR_FMAGIC, n_gt_2) {
  for (uint32_t n_dim = 3; n_dim <= 3; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
      GemmMicrokernelTester()
        .mr(3).nr(2).kr(1).sr(1)
        .m(3).n(n_dim).k(k_dim)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_3x2__scalar_fmagic,
          xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
38826
// m=3 with cn_stride(5); the n loop covers only n=3, with k over {1, 3, 5}.
TEST(QU8_GEMM_MINMAX_FP32_3X2__SCALAR_FMAGIC, n_gt_2_strided_cn) {
  for (uint32_t n_dim = 3; n_dim <= 3; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
      GemmMicrokernelTester()
        .mr(3).nr(2).kr(1).sr(1)
        .m(3).n(n_dim).k(k_dim)
        .cn_stride(5)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_3x2__scalar_fmagic,
          xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
38843
// m=3 with a_stride(7); the n loop covers only n=3, with k over {1, 3, 5}.
TEST(QU8_GEMM_MINMAX_FP32_3X2__SCALAR_FMAGIC, n_gt_2_strided_a) {
  for (uint32_t n_dim = 3; n_dim <= 3; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
      GemmMicrokernelTester()
        .mr(3).nr(2).kr(1).sr(1)
        .m(3).n(n_dim).k(k_dim)
        .a_stride(7)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_3x2__scalar_fmagic,
          xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
38860
// The n loop covers only n=3; sweeps k over {1, 3, 5} and m over [1, 3],
// one iteration each.
TEST(QU8_GEMM_MINMAX_FP32_3X2__SCALAR_FMAGIC, n_gt_2_subtile) {
  for (uint32_t n_dim = 3; n_dim <= 3; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
      for (uint32_t m_dim = 1; m_dim <= 3; ++m_dim) {
        GemmMicrokernelTester()
          .mr(3).nr(2).kr(1).sr(1)
          .m(m_dim).n(n_dim).k(k_dim)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_3x2__scalar_fmagic,
            xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
38879
// m=3; sweeps n over {4, 6} and k over {1, 3, 5}.
TEST(QU8_GEMM_MINMAX_FP32_3X2__SCALAR_FMAGIC, n_div_2) {
  for (uint32_t n_dim = 4; n_dim <= 6; n_dim += 2) {
    for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
      GemmMicrokernelTester()
        .mr(3).nr(2).kr(1).sr(1)
        .m(3).n(n_dim).k(k_dim)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_3x2__scalar_fmagic,
          xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
38895
// m=3 with cn_stride(5); sweeps n over {4, 6} and k over {1, 3, 5}.
TEST(QU8_GEMM_MINMAX_FP32_3X2__SCALAR_FMAGIC, n_div_2_strided_cn) {
  for (uint32_t n_dim = 4; n_dim <= 6; n_dim += 2) {
    for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
      GemmMicrokernelTester()
        .mr(3).nr(2).kr(1).sr(1)
        .m(3).n(n_dim).k(k_dim)
        .cn_stride(5)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_3x2__scalar_fmagic,
          xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
38912
// m=3 with a_stride(7); sweeps n over {4, 6} and k over {1, 3, 5}.
TEST(QU8_GEMM_MINMAX_FP32_3X2__SCALAR_FMAGIC, n_div_2_strided_a) {
  for (uint32_t n_dim = 4; n_dim <= 6; n_dim += 2) {
    for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
      GemmMicrokernelTester()
        .mr(3).nr(2).kr(1).sr(1)
        .m(3).n(n_dim).k(k_dim)
        .a_stride(7)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_3x2__scalar_fmagic,
          xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
38929
// Sweeps n over {4, 6}, k over {1, 3, 5} and m over [1, 3], one iteration each.
TEST(QU8_GEMM_MINMAX_FP32_3X2__SCALAR_FMAGIC, n_div_2_subtile) {
  for (uint32_t n_dim = 4; n_dim <= 6; n_dim += 2) {
    for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
      for (uint32_t m_dim = 1; m_dim <= 3; ++m_dim) {
        GemmMicrokernelTester()
          .mr(3).nr(2).kr(1).sr(1)
          .m(m_dim).n(n_dim).k(k_dim)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_3x2__scalar_fmagic,
            xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
38948
// cm_stride(5); sweeps k over {1, 3, 5}, n over [1, 2] and m over [1, 3],
// one iteration each.
TEST(QU8_GEMM_MINMAX_FP32_3X2__SCALAR_FMAGIC, strided_cm_subtile) {
  for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
    for (uint32_t n_dim = 1; n_dim <= 2; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 3; ++m_dim) {
        GemmMicrokernelTester()
          .mr(3).nr(2).kr(1).sr(1)
          .m(m_dim).n(n_dim).k(k_dim)
          .cm_stride(5)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_3x2__scalar_fmagic,
            xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
38968
// Full 3x2 tile with k=1 and qmin(128).
TEST(QU8_GEMM_MINMAX_FP32_3X2__SCALAR_FMAGIC, qmin) {
  GemmMicrokernelTester()
    .mr(3).nr(2).kr(1).sr(1)
    .m(3).n(2).k(1)
    .qmin(128)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_3x2__scalar_fmagic,
      xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
      xnn_qu8_requantize_fp32);
}
38981
// Full 3x2 tile with k=1 and qmax(128).
TEST(QU8_GEMM_MINMAX_FP32_3X2__SCALAR_FMAGIC, qmax) {
  GemmMicrokernelTester()
    .mr(3).nr(2).kr(1).sr(1)
    .m(3).n(2).k(1)
    .qmax(128)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_3x2__scalar_fmagic,
      xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
      xnn_qu8_requantize_fp32);
}
38994
// Full 3x2 tile with k=1 and cm_stride(5).
TEST(QU8_GEMM_MINMAX_FP32_3X2__SCALAR_FMAGIC, strided_cm) {
  GemmMicrokernelTester()
    .mr(3).nr(2).kr(1).sr(1)
    .m(3).n(2).k(1)
    .cm_stride(5)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_3x2__scalar_fmagic,
      xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
      xnn_qu8_requantize_fp32);
}
39007
// Full 3x2 tile with a_zero_point(0); sweeps k over {1, 3, 5}.
TEST(QU8_GEMM_MINMAX_FP32_3X2__SCALAR_FMAGIC, no_a_zero_point) {
  for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
    GemmMicrokernelTester()
      .mr(3).nr(2).kr(1).sr(1)
      .m(3).n(2).k(k_dim)
      .a_zero_point(0)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_3x2__scalar_fmagic,
        xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
        xnn_qu8_requantize_fp32);
  }
}
39022
// Full 3x2 tile with b_zero_point(0); sweeps k over {1, 3, 5}.
TEST(QU8_GEMM_MINMAX_FP32_3X2__SCALAR_FMAGIC, no_b_zero_point) {
  for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
    GemmMicrokernelTester()
      .mr(3).nr(2).kr(1).sr(1)
      .m(3).n(2).k(k_dim)
      .b_zero_point(0)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_3x2__scalar_fmagic,
        xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
        xnn_qu8_requantize_fp32);
  }
}
39037
// Full 3x2 tile with both a_zero_point(0) and b_zero_point(0); sweeps k over
// {1, 3, 5}.
TEST(QU8_GEMM_MINMAX_FP32_3X2__SCALAR_FMAGIC, no_zero_point) {
  for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
    GemmMicrokernelTester()
      .mr(3).nr(2).kr(1).sr(1)
      .m(3).n(2).k(k_dim)
      .a_zero_point(0)
      .b_zero_point(0)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_3x2__scalar_fmagic,
        xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
        xnn_qu8_requantize_fp32);
  }
}
39053
39054
// Full 3x2 tile (m=3, n=2) with k=1.
TEST(QU8_GEMM_MINMAX_FP32_3X2__SCALAR_LRINTF, k_eq_1) {
  GemmMicrokernelTester()
    .mr(3).nr(2).kr(1).sr(1)
    .m(3).n(2).k(1)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_3x2__scalar_lrintf,
      xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
      xnn_qu8_requantize_fp32);
}
39066
// Full 3x2 tile with k=1 and cn_stride(5).
TEST(QU8_GEMM_MINMAX_FP32_3X2__SCALAR_LRINTF, strided_cn) {
  GemmMicrokernelTester()
    .mr(3).nr(2).kr(1).sr(1)
    .m(3).n(2).k(1)
    .cn_stride(5)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_3x2__scalar_lrintf,
      xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
      xnn_qu8_requantize_fp32);
}
39079
// Full 3x2 tile with k=1 and a_stride(3).
TEST(QU8_GEMM_MINMAX_FP32_3X2__SCALAR_LRINTF, k_eq_1_strided_a) {
  GemmMicrokernelTester()
    .mr(3).nr(2).kr(1).sr(1)
    .m(3).n(2).k(1)
    .a_stride(3)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_3x2__scalar_lrintf,
      xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
      xnn_qu8_requantize_fp32);
}
39092
// k=1; for every n in [1, 2] sweeps every m in [1, 3], one iteration each.
TEST(QU8_GEMM_MINMAX_FP32_3X2__SCALAR_LRINTF, k_eq_1_subtile) {
  for (uint32_t n_dim = 1; n_dim <= 2; ++n_dim) {
    for (uint32_t m_dim = 1; m_dim <= 3; ++m_dim) {
      GemmMicrokernelTester()
        .mr(3).nr(2).kr(1).sr(1)
        .m(m_dim).n(n_dim).k(1)
        .iterations(1)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_3x2__scalar_lrintf,
          xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
39109
// k=1, n=2; sweeps m over [1, 3], one iteration each.
TEST(QU8_GEMM_MINMAX_FP32_3X2__SCALAR_LRINTF, k_eq_1_subtile_m) {
  for (uint32_t m_dim = 1; m_dim <= 3; ++m_dim) {
    GemmMicrokernelTester()
      .mr(3).nr(2).kr(1).sr(1)
      .m(m_dim).n(2).k(1)
      .iterations(1)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_3x2__scalar_lrintf,
        xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
        xnn_qu8_requantize_fp32);
  }
}
39124
// k=1, m=3; sweeps n over [1, 2], one iteration each.
TEST(QU8_GEMM_MINMAX_FP32_3X2__SCALAR_LRINTF, k_eq_1_subtile_n) {
  for (uint32_t n_dim = 1; n_dim <= 2; ++n_dim) {
    GemmMicrokernelTester()
      .mr(3).nr(2).kr(1).sr(1)
      .m(3).n(n_dim).k(1)
      .iterations(1)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_3x2__scalar_lrintf,
        xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
        xnn_qu8_requantize_fp32);
  }
}
39139
// Full 3x2 tile; sweeps k over [2, 9].
TEST(QU8_GEMM_MINMAX_FP32_3X2__SCALAR_LRINTF, k_gt_1) {
  for (size_t k_dim = 2; k_dim <= 9; ++k_dim) {
    GemmMicrokernelTester()
      .mr(3).nr(2).kr(1).sr(1)
      .m(3).n(2).k(k_dim)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_3x2__scalar_lrintf,
        xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
        xnn_qu8_requantize_fp32);
  }
}
39153
// Full 3x2 tile with a_stride(11); sweeps k over [2, 9].
TEST(QU8_GEMM_MINMAX_FP32_3X2__SCALAR_LRINTF, k_gt_1_strided_a) {
  for (size_t k_dim = 2; k_dim <= 9; ++k_dim) {
    GemmMicrokernelTester()
      .mr(3).nr(2).kr(1).sr(1)
      .m(3).n(2).k(k_dim)
      .a_stride(11)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_3x2__scalar_lrintf,
        xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
        xnn_qu8_requantize_fp32);
  }
}
39168
// Sweeps k over [2, 9], n over [1, 2] and m over [1, 3], one iteration each.
TEST(QU8_GEMM_MINMAX_FP32_3X2__SCALAR_LRINTF, k_gt_1_subtile) {
  for (size_t k_dim = 2; k_dim <= 9; ++k_dim) {
    for (uint32_t n_dim = 1; n_dim <= 2; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 3; ++m_dim) {
        GemmMicrokernelTester()
          .mr(3).nr(2).kr(1).sr(1)
          .m(m_dim).n(n_dim).k(k_dim)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_3x2__scalar_lrintf,
            xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
39187
// m=3; the n loop covers only n=3, with k over {1, 3, 5}.
TEST(QU8_GEMM_MINMAX_FP32_3X2__SCALAR_LRINTF, n_gt_2) {
  for (uint32_t n_dim = 3; n_dim <= 3; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
      GemmMicrokernelTester()
        .mr(3).nr(2).kr(1).sr(1)
        .m(3).n(n_dim).k(k_dim)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_3x2__scalar_lrintf,
          xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
39203
// m=3 with cn_stride(5); the n loop covers only n=3, with k over {1, 3, 5}.
TEST(QU8_GEMM_MINMAX_FP32_3X2__SCALAR_LRINTF, n_gt_2_strided_cn) {
  for (uint32_t n_dim = 3; n_dim <= 3; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
      GemmMicrokernelTester()
        .mr(3).nr(2).kr(1).sr(1)
        .m(3).n(n_dim).k(k_dim)
        .cn_stride(5)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_3x2__scalar_lrintf,
          xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
39220
// m=3 with a_stride(7); the n loop covers only n=3, with k over {1, 3, 5}.
TEST(QU8_GEMM_MINMAX_FP32_3X2__SCALAR_LRINTF, n_gt_2_strided_a) {
  for (uint32_t n_dim = 3; n_dim <= 3; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
      GemmMicrokernelTester()
        .mr(3).nr(2).kr(1).sr(1)
        .m(3).n(n_dim).k(k_dim)
        .a_stride(7)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_3x2__scalar_lrintf,
          xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
39237
// The n loop covers only n=3; sweeps k over {1, 3, 5} and m over [1, 3],
// one iteration each.
TEST(QU8_GEMM_MINMAX_FP32_3X2__SCALAR_LRINTF, n_gt_2_subtile) {
  for (uint32_t n_dim = 3; n_dim <= 3; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
      for (uint32_t m_dim = 1; m_dim <= 3; ++m_dim) {
        GemmMicrokernelTester()
          .mr(3).nr(2).kr(1).sr(1)
          .m(m_dim).n(n_dim).k(k_dim)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_3x2__scalar_lrintf,
            xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
39256
// m=3; sweeps n over {4, 6} and k over {1, 3, 5}.
TEST(QU8_GEMM_MINMAX_FP32_3X2__SCALAR_LRINTF, n_div_2) {
  for (uint32_t n_dim = 4; n_dim <= 6; n_dim += 2) {
    for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
      GemmMicrokernelTester()
        .mr(3).nr(2).kr(1).sr(1)
        .m(3).n(n_dim).k(k_dim)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_3x2__scalar_lrintf,
          xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
39272
// m=3 with cn_stride(5); sweeps n over {4, 6} and k over {1, 3, 5}.
TEST(QU8_GEMM_MINMAX_FP32_3X2__SCALAR_LRINTF, n_div_2_strided_cn) {
  for (uint32_t n_dim = 4; n_dim <= 6; n_dim += 2) {
    for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
      GemmMicrokernelTester()
        .mr(3).nr(2).kr(1).sr(1)
        .m(3).n(n_dim).k(k_dim)
        .cn_stride(5)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_3x2__scalar_lrintf,
          xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
39289
// m=3 with a_stride(7); sweeps n over {4, 6} and k over {1, 3, 5}.
TEST(QU8_GEMM_MINMAX_FP32_3X2__SCALAR_LRINTF, n_div_2_strided_a) {
  for (uint32_t n_dim = 4; n_dim <= 6; n_dim += 2) {
    for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
      GemmMicrokernelTester()
        .mr(3).nr(2).kr(1).sr(1)
        .m(3).n(n_dim).k(k_dim)
        .a_stride(7)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_3x2__scalar_lrintf,
          xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
39306
// Sweeps n over {4, 6}, k over {1, 3, 5} and m over [1, 3], one iteration each.
TEST(QU8_GEMM_MINMAX_FP32_3X2__SCALAR_LRINTF, n_div_2_subtile) {
  for (uint32_t n_dim = 4; n_dim <= 6; n_dim += 2) {
    for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
      for (uint32_t m_dim = 1; m_dim <= 3; ++m_dim) {
        GemmMicrokernelTester()
          .mr(3).nr(2).kr(1).sr(1)
          .m(m_dim).n(n_dim).k(k_dim)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_3x2__scalar_lrintf,
            xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
39325
// cm_stride(5); sweeps k over {1, 3, 5}, n over [1, 2] and m over [1, 3],
// one iteration each.
TEST(QU8_GEMM_MINMAX_FP32_3X2__SCALAR_LRINTF, strided_cm_subtile) {
  for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
    for (uint32_t n_dim = 1; n_dim <= 2; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 3; ++m_dim) {
        GemmMicrokernelTester()
          .mr(3).nr(2).kr(1).sr(1)
          .m(m_dim).n(n_dim).k(k_dim)
          .cm_stride(5)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_3x2__scalar_lrintf,
            xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
39345
// Full 3x2 tile with k=1 and qmin(128).
TEST(QU8_GEMM_MINMAX_FP32_3X2__SCALAR_LRINTF, qmin) {
  GemmMicrokernelTester()
    .mr(3).nr(2).kr(1).sr(1)
    .m(3).n(2).k(1)
    .qmin(128)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_3x2__scalar_lrintf,
      xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
      xnn_qu8_requantize_fp32);
}
39358
// Full 3x2 tile with k=1 and qmax(128).
TEST(QU8_GEMM_MINMAX_FP32_3X2__SCALAR_LRINTF, qmax) {
  GemmMicrokernelTester()
    .mr(3).nr(2).kr(1).sr(1)
    .m(3).n(2).k(1)
    .qmax(128)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_3x2__scalar_lrintf,
      xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
      xnn_qu8_requantize_fp32);
}
39371
// Full 3x2 tile with k=1 and cm_stride(5).
TEST(QU8_GEMM_MINMAX_FP32_3X2__SCALAR_LRINTF, strided_cm) {
  GemmMicrokernelTester()
    .mr(3).nr(2).kr(1).sr(1)
    .m(3).n(2).k(1)
    .cm_stride(5)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_3x2__scalar_lrintf,
      xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
      xnn_qu8_requantize_fp32);
}
39384
// Full 3x2 tile with a_zero_point(0); sweeps k over {1, 3, 5}.
TEST(QU8_GEMM_MINMAX_FP32_3X2__SCALAR_LRINTF, no_a_zero_point) {
  for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
    GemmMicrokernelTester()
      .mr(3).nr(2).kr(1).sr(1)
      .m(3).n(2).k(k_dim)
      .a_zero_point(0)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_3x2__scalar_lrintf,
        xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
        xnn_qu8_requantize_fp32);
  }
}
39399
// Full 3x2 tile with b_zero_point(0); sweeps k over {1, 3, 5}.
TEST(QU8_GEMM_MINMAX_FP32_3X2__SCALAR_LRINTF, no_b_zero_point) {
  for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
    GemmMicrokernelTester()
      .mr(3).nr(2).kr(1).sr(1)
      .m(3).n(2).k(k_dim)
      .b_zero_point(0)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_3x2__scalar_lrintf,
        xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
        xnn_qu8_requantize_fp32);
  }
}
39414
TEST(QU8_GEMM_MINMAX_FP32_3X2__SCALAR_LRINTF, no_zero_point) {
  // Full 3x2 tile for k in {1, 3, 5} with both a_zero_point and b_zero_point set to 0.
  for (size_t k = 1; k <= 5; k += 2) {
    GemmMicrokernelTester tester{};
    tester.mr(3).nr(2).kr(1).sr(1)
        .m(3).n(2).k(k)
        .a_zero_point(0)
        .b_zero_point(0)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x2__scalar_lrintf,
              xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
              xnn_qu8_requantize_fp32);
  }
}
39430
39431
TEST(QU8_GEMM_MINMAX_FP32_3X4__SCALAR_FMAGIC, k_eq_1) {
  // Full 3x4 tile (m = 3, n = 4) at k = 1.
  GemmMicrokernelTester tester{};
  tester.mr(3).nr(4).kr(1).sr(1)
      .m(3).n(4).k(1)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4__scalar_fmagic,
            xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
            xnn_qu8_requantize_fp32);
}
39443
TEST(QU8_GEMM_MINMAX_FP32_3X4__SCALAR_FMAGIC, strided_cn) {
  // Full 3x4 tile (m = 3, n = 4) at k = 1 with cn_stride set to 7.
  GemmMicrokernelTester tester{};
  tester.mr(3).nr(4).kr(1).sr(1)
      .m(3).n(4).k(1)
      .cn_stride(7)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4__scalar_fmagic,
            xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
            xnn_qu8_requantize_fp32);
}
39456
TEST(QU8_GEMM_MINMAX_FP32_3X4__SCALAR_FMAGIC, k_eq_1_strided_a) {
  // Full 3x4 tile (m = 3, n = 4) at k = 1 with a_stride set to 3.
  GemmMicrokernelTester tester{};
  tester.mr(3).nr(4).kr(1).sr(1)
      .m(3).n(4).k(1)
      .a_stride(3)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4__scalar_fmagic,
            xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
            xnn_qu8_requantize_fp32);
}
39469
TEST(QU8_GEMM_MINMAX_FP32_3X4__SCALAR_FMAGIC, k_eq_1_subtile) {
  // Every (m, n) with m in [1, 3] and n in [1, 4] at k = 1; iterations(1) per case.
  for (uint32_t n = 1; n <= 4; ++n) {
    for (uint32_t m = 1; m <= 3; ++m) {
      GemmMicrokernelTester tester{};
      tester.mr(3).nr(4).kr(1).sr(1)
          .m(m).n(n).k(1)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4__scalar_fmagic,
                xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
                xnn_qu8_requantize_fp32);
    }
  }
}
39486
TEST(QU8_GEMM_MINMAX_FP32_3X4__SCALAR_FMAGIC, k_eq_1_subtile_m) {
  // m swept over [1, 3] with n fixed at 4 and k = 1; iterations(1) per case.
  for (uint32_t m = 1; m <= 3; ++m) {
    GemmMicrokernelTester tester{};
    tester.mr(3).nr(4).kr(1).sr(1)
        .m(m).n(4).k(1)
        .iterations(1)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4__scalar_fmagic,
              xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
              xnn_qu8_requantize_fp32);
  }
}
39501
TEST(QU8_GEMM_MINMAX_FP32_3X4__SCALAR_FMAGIC, k_eq_1_subtile_n) {
  // n swept over [1, 4] with m fixed at 3 and k = 1; iterations(1) per case.
  for (uint32_t n = 1; n <= 4; ++n) {
    GemmMicrokernelTester tester{};
    tester.mr(3).nr(4).kr(1).sr(1)
        .m(3).n(n).k(1)
        .iterations(1)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4__scalar_fmagic,
              xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
              xnn_qu8_requantize_fp32);
  }
}
39516
TEST(QU8_GEMM_MINMAX_FP32_3X4__SCALAR_FMAGIC, k_gt_1) {
  // Full 3x4 tile for every k in [2, 9].
  for (size_t k = 2; k < 10; ++k) {
    GemmMicrokernelTester tester{};
    tester.mr(3).nr(4).kr(1).sr(1)
        .m(3).n(4).k(k)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4__scalar_fmagic,
              xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
              xnn_qu8_requantize_fp32);
  }
}
39530
TEST(QU8_GEMM_MINMAX_FP32_3X4__SCALAR_FMAGIC, k_gt_1_strided_a) {
  // Full 3x4 tile for every k in [2, 9] with a_stride set to 11.
  for (size_t k = 2; k < 10; ++k) {
    GemmMicrokernelTester tester{};
    tester.mr(3).nr(4).kr(1).sr(1)
        .m(3).n(4).k(k)
        .a_stride(11)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4__scalar_fmagic,
              xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
              xnn_qu8_requantize_fp32);
  }
}
39545
TEST(QU8_GEMM_MINMAX_FP32_3X4__SCALAR_FMAGIC, k_gt_1_subtile) {
  // Every (m, n) with m in [1, 3], n in [1, 4] for each k in [2, 9]; iterations(1) per case.
  for (size_t k = 2; k < 10; ++k) {
    for (uint32_t n = 1; n <= 4; ++n) {
      for (uint32_t m = 1; m <= 3; ++m) {
        GemmMicrokernelTester tester{};
        tester.mr(3).nr(4).kr(1).sr(1)
            .m(m).n(n).k(k)
            .iterations(1)
            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4__scalar_fmagic,
                  xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
                  xnn_qu8_requantize_fp32);
      }
    }
  }
}
39564
TEST(QU8_GEMM_MINMAX_FP32_3X4__SCALAR_FMAGIC, n_gt_4) {
  // n in [5, 7] (above nr = 4) with m = 3, for k in {1, 3, 5}.
  for (uint32_t n = 5; n < 8; ++n) {
    for (size_t k = 1; k <= 5; k += 2) {
      GemmMicrokernelTester tester{};
      tester.mr(3).nr(4).kr(1).sr(1)
          .m(3).n(n).k(k)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4__scalar_fmagic,
                xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
                xnn_qu8_requantize_fp32);
    }
  }
}
39580
TEST(QU8_GEMM_MINMAX_FP32_3X4__SCALAR_FMAGIC, n_gt_4_strided_cn) {
  // n in [5, 7] with m = 3, for k in {1, 3, 5}, with cn_stride set to 7.
  for (uint32_t n = 5; n < 8; ++n) {
    for (size_t k = 1; k <= 5; k += 2) {
      GemmMicrokernelTester tester{};
      tester.mr(3).nr(4).kr(1).sr(1)
          .m(3).n(n).k(k)
          .cn_stride(7)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4__scalar_fmagic,
                xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
                xnn_qu8_requantize_fp32);
    }
  }
}
39597
TEST(QU8_GEMM_MINMAX_FP32_3X4__SCALAR_FMAGIC, n_gt_4_strided_a) {
  // n in [5, 7] with m = 3, for k in {1, 3, 5}, with a_stride set to 7.
  for (uint32_t n = 5; n < 8; ++n) {
    for (size_t k = 1; k <= 5; k += 2) {
      GemmMicrokernelTester tester{};
      tester.mr(3).nr(4).kr(1).sr(1)
          .m(3).n(n).k(k)
          .a_stride(7)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4__scalar_fmagic,
                xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
                xnn_qu8_requantize_fp32);
    }
  }
}
39614
TEST(QU8_GEMM_MINMAX_FP32_3X4__SCALAR_FMAGIC, n_gt_4_subtile) {
  // n in [5, 7], k in {1, 3, 5}, m in [1, 3]; iterations(1) per case.
  for (uint32_t n = 5; n < 8; ++n) {
    for (size_t k = 1; k <= 5; k += 2) {
      for (uint32_t m = 1; m <= 3; ++m) {
        GemmMicrokernelTester tester{};
        tester.mr(3).nr(4).kr(1).sr(1)
            .m(m).n(n).k(k)
            .iterations(1)
            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4__scalar_fmagic,
                  xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
                  xnn_qu8_requantize_fp32);
      }
    }
  }
}
39633
TEST(QU8_GEMM_MINMAX_FP32_3X4__SCALAR_FMAGIC, n_div_4) {
  // n in {8, 12} (multiples of nr = 4) with m = 3, for k in {1, 3, 5}.
  for (uint32_t n = 8; n <= 12; n += 4) {
    for (size_t k = 1; k <= 5; k += 2) {
      GemmMicrokernelTester tester{};
      tester.mr(3).nr(4).kr(1).sr(1)
          .m(3).n(n).k(k)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4__scalar_fmagic,
                xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
                xnn_qu8_requantize_fp32);
    }
  }
}
39649
TEST(QU8_GEMM_MINMAX_FP32_3X4__SCALAR_FMAGIC, n_div_4_strided_cn) {
  // n in {8, 12} with m = 3, for k in {1, 3, 5}, with cn_stride set to 7.
  for (uint32_t n = 8; n <= 12; n += 4) {
    for (size_t k = 1; k <= 5; k += 2) {
      GemmMicrokernelTester tester{};
      tester.mr(3).nr(4).kr(1).sr(1)
          .m(3).n(n).k(k)
          .cn_stride(7)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4__scalar_fmagic,
                xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
                xnn_qu8_requantize_fp32);
    }
  }
}
39666
TEST(QU8_GEMM_MINMAX_FP32_3X4__SCALAR_FMAGIC, n_div_4_strided_a) {
  // n in {8, 12} with m = 3, for k in {1, 3, 5}, with a_stride set to 7.
  for (uint32_t n = 8; n <= 12; n += 4) {
    for (size_t k = 1; k <= 5; k += 2) {
      GemmMicrokernelTester tester{};
      tester.mr(3).nr(4).kr(1).sr(1)
          .m(3).n(n).k(k)
          .a_stride(7)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4__scalar_fmagic,
                xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
                xnn_qu8_requantize_fp32);
    }
  }
}
39683
TEST(QU8_GEMM_MINMAX_FP32_3X4__SCALAR_FMAGIC, n_div_4_subtile) {
  // n in {8, 12}, k in {1, 3, 5}, m in [1, 3]; iterations(1) per case.
  for (uint32_t n = 8; n <= 12; n += 4) {
    for (size_t k = 1; k <= 5; k += 2) {
      for (uint32_t m = 1; m <= 3; ++m) {
        GemmMicrokernelTester tester{};
        tester.mr(3).nr(4).kr(1).sr(1)
            .m(m).n(n).k(k)
            .iterations(1)
            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4__scalar_fmagic,
                  xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
                  xnn_qu8_requantize_fp32);
      }
    }
  }
}
39702
TEST(QU8_GEMM_MINMAX_FP32_3X4__SCALAR_FMAGIC, strided_cm_subtile) {
  // k in {1, 3, 5}, n in [1, 4], m in [1, 3] with cm_stride set to 7; iterations(1) per case.
  for (size_t k = 1; k <= 5; k += 2) {
    for (uint32_t n = 1; n <= 4; ++n) {
      for (uint32_t m = 1; m <= 3; ++m) {
        GemmMicrokernelTester tester{};
        tester.mr(3).nr(4).kr(1).sr(1)
            .m(m).n(n).k(k)
            .cm_stride(7)
            .iterations(1)
            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4__scalar_fmagic,
                  xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
                  xnn_qu8_requantize_fp32);
      }
    }
  }
}
39722
TEST(QU8_GEMM_MINMAX_FP32_3X4__SCALAR_FMAGIC, qmin) {
  // Full 3x4 tile (m = 3, n = 4) at k = 1 with qmin set to 128.
  GemmMicrokernelTester tester{};
  tester.mr(3).nr(4).kr(1).sr(1)
      .m(3).n(4).k(1)
      .qmin(128)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4__scalar_fmagic,
            xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
            xnn_qu8_requantize_fp32);
}
39735
TEST(QU8_GEMM_MINMAX_FP32_3X4__SCALAR_FMAGIC, qmax) {
  // Full 3x4 tile (m = 3, n = 4) at k = 1 with qmax set to 128.
  GemmMicrokernelTester tester{};
  tester.mr(3).nr(4).kr(1).sr(1)
      .m(3).n(4).k(1)
      .qmax(128)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4__scalar_fmagic,
            xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
            xnn_qu8_requantize_fp32);
}
39748
TEST(QU8_GEMM_MINMAX_FP32_3X4__SCALAR_FMAGIC, strided_cm) {
  // Full 3x4 tile (m = 3, n = 4) at k = 1 with cm_stride set to 7.
  GemmMicrokernelTester tester{};
  tester.mr(3).nr(4).kr(1).sr(1)
      .m(3).n(4).k(1)
      .cm_stride(7)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4__scalar_fmagic,
            xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
            xnn_qu8_requantize_fp32);
}
39761
TEST(QU8_GEMM_MINMAX_FP32_3X4__SCALAR_FMAGIC, no_a_zero_point) {
  // Full 3x4 tile for k in {1, 3, 5} with a_zero_point set to 0.
  for (size_t k = 1; k <= 5; k += 2) {
    GemmMicrokernelTester tester{};
    tester.mr(3).nr(4).kr(1).sr(1)
        .m(3).n(4).k(k)
        .a_zero_point(0)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4__scalar_fmagic,
              xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
              xnn_qu8_requantize_fp32);
  }
}
39776
TEST(QU8_GEMM_MINMAX_FP32_3X4__SCALAR_FMAGIC, no_b_zero_point) {
  // Full 3x4 tile for k in {1, 3, 5} with b_zero_point set to 0.
  for (size_t k = 1; k <= 5; k += 2) {
    GemmMicrokernelTester tester{};
    tester.mr(3).nr(4).kr(1).sr(1)
        .m(3).n(4).k(k)
        .b_zero_point(0)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4__scalar_fmagic,
              xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
              xnn_qu8_requantize_fp32);
  }
}
39791
TEST(QU8_GEMM_MINMAX_FP32_3X4__SCALAR_FMAGIC, no_zero_point) {
  // Full 3x4 tile for k in {1, 3, 5} with both a_zero_point and b_zero_point set to 0.
  for (size_t k = 1; k <= 5; k += 2) {
    GemmMicrokernelTester tester{};
    tester.mr(3).nr(4).kr(1).sr(1)
        .m(3).n(4).k(k)
        .a_zero_point(0)
        .b_zero_point(0)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4__scalar_fmagic,
              xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
              xnn_qu8_requantize_fp32);
  }
}
39807
39808
TEST(QU8_GEMM_MINMAX_FP32_3X4__SCALAR_LRINTF, k_eq_1) {
  // Full 3x4 tile (m = 3, n = 4) at k = 1.
  GemmMicrokernelTester tester{};
  tester.mr(3).nr(4).kr(1).sr(1)
      .m(3).n(4).k(1)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4__scalar_lrintf,
            xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
            xnn_qu8_requantize_fp32);
}
39820
TEST(QU8_GEMM_MINMAX_FP32_3X4__SCALAR_LRINTF, strided_cn) {
  // Full 3x4 tile (m = 3, n = 4) at k = 1 with cn_stride set to 7.
  GemmMicrokernelTester tester{};
  tester.mr(3).nr(4).kr(1).sr(1)
      .m(3).n(4).k(1)
      .cn_stride(7)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4__scalar_lrintf,
            xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
            xnn_qu8_requantize_fp32);
}
39833
TEST(QU8_GEMM_MINMAX_FP32_3X4__SCALAR_LRINTF, k_eq_1_strided_a) {
  // Full 3x4 tile (m = 3, n = 4) at k = 1 with a_stride set to 3.
  GemmMicrokernelTester tester{};
  tester.mr(3).nr(4).kr(1).sr(1)
      .m(3).n(4).k(1)
      .a_stride(3)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4__scalar_lrintf,
            xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
            xnn_qu8_requantize_fp32);
}
39846
TEST(QU8_GEMM_MINMAX_FP32_3X4__SCALAR_LRINTF, k_eq_1_subtile) {
  // Every (m, n) with m in [1, 3] and n in [1, 4] at k = 1; iterations(1) per case.
  for (uint32_t n = 1; n <= 4; ++n) {
    for (uint32_t m = 1; m <= 3; ++m) {
      GemmMicrokernelTester tester{};
      tester.mr(3).nr(4).kr(1).sr(1)
          .m(m).n(n).k(1)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4__scalar_lrintf,
                xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
                xnn_qu8_requantize_fp32);
    }
  }
}
39863
TEST(QU8_GEMM_MINMAX_FP32_3X4__SCALAR_LRINTF, k_eq_1_subtile_m) {
  // m swept over [1, 3] with n fixed at 4 and k = 1; iterations(1) per case.
  for (uint32_t m = 1; m <= 3; ++m) {
    GemmMicrokernelTester tester{};
    tester.mr(3).nr(4).kr(1).sr(1)
        .m(m).n(4).k(1)
        .iterations(1)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4__scalar_lrintf,
              xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
              xnn_qu8_requantize_fp32);
  }
}
39878
TEST(QU8_GEMM_MINMAX_FP32_3X4__SCALAR_LRINTF, k_eq_1_subtile_n) {
  // n swept over [1, 4] with m fixed at 3 and k = 1; iterations(1) per case.
  for (uint32_t n = 1; n <= 4; ++n) {
    GemmMicrokernelTester tester{};
    tester.mr(3).nr(4).kr(1).sr(1)
        .m(3).n(n).k(1)
        .iterations(1)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4__scalar_lrintf,
              xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
              xnn_qu8_requantize_fp32);
  }
}
39893
TEST(QU8_GEMM_MINMAX_FP32_3X4__SCALAR_LRINTF, k_gt_1) {
  // Full 3x4 tile for every k in [2, 9].
  for (size_t k = 2; k < 10; ++k) {
    GemmMicrokernelTester tester{};
    tester.mr(3).nr(4).kr(1).sr(1)
        .m(3).n(4).k(k)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4__scalar_lrintf,
              xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
              xnn_qu8_requantize_fp32);
  }
}
39907
TEST(QU8_GEMM_MINMAX_FP32_3X4__SCALAR_LRINTF, k_gt_1_strided_a) {
  // Full 3x4 tile for every k in [2, 9] with a_stride set to 11.
  for (size_t k = 2; k < 10; ++k) {
    GemmMicrokernelTester tester{};
    tester.mr(3).nr(4).kr(1).sr(1)
        .m(3).n(4).k(k)
        .a_stride(11)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4__scalar_lrintf,
              xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
              xnn_qu8_requantize_fp32);
  }
}
39922
TEST(QU8_GEMM_MINMAX_FP32_3X4__SCALAR_LRINTF, k_gt_1_subtile) {
  // Every (m, n) with m in [1, 3], n in [1, 4] for each k in [2, 9]; iterations(1) per case.
  for (size_t k = 2; k < 10; ++k) {
    for (uint32_t n = 1; n <= 4; ++n) {
      for (uint32_t m = 1; m <= 3; ++m) {
        GemmMicrokernelTester tester{};
        tester.mr(3).nr(4).kr(1).sr(1)
            .m(m).n(n).k(k)
            .iterations(1)
            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4__scalar_lrintf,
                  xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
                  xnn_qu8_requantize_fp32);
      }
    }
  }
}
39941
TEST(QU8_GEMM_MINMAX_FP32_3X4__SCALAR_LRINTF, n_gt_4) {
  // n in [5, 7] (above nr = 4) with m = 3, for k in {1, 3, 5}.
  for (uint32_t n = 5; n < 8; ++n) {
    for (size_t k = 1; k <= 5; k += 2) {
      GemmMicrokernelTester tester{};
      tester.mr(3).nr(4).kr(1).sr(1)
          .m(3).n(n).k(k)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4__scalar_lrintf,
                xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
                xnn_qu8_requantize_fp32);
    }
  }
}
39957
TEST(QU8_GEMM_MINMAX_FP32_3X4__SCALAR_LRINTF, n_gt_4_strided_cn) {
  // n in [5, 7] with m = 3, for k in {1, 3, 5}, with cn_stride set to 7.
  for (uint32_t n = 5; n < 8; ++n) {
    for (size_t k = 1; k <= 5; k += 2) {
      GemmMicrokernelTester tester{};
      tester.mr(3).nr(4).kr(1).sr(1)
          .m(3).n(n).k(k)
          .cn_stride(7)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4__scalar_lrintf,
                xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
                xnn_qu8_requantize_fp32);
    }
  }
}
39974
TEST(QU8_GEMM_MINMAX_FP32_3X4__SCALAR_LRINTF, n_gt_4_strided_a) {
  // n in [5, 7] with m = 3, for k in {1, 3, 5}, with a_stride set to 7.
  for (uint32_t n = 5; n < 8; ++n) {
    for (size_t k = 1; k <= 5; k += 2) {
      GemmMicrokernelTester tester{};
      tester.mr(3).nr(4).kr(1).sr(1)
          .m(3).n(n).k(k)
          .a_stride(7)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4__scalar_lrintf,
                xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
                xnn_qu8_requantize_fp32);
    }
  }
}
39991
TEST(QU8_GEMM_MINMAX_FP32_3X4__SCALAR_LRINTF, n_gt_4_subtile) {
  // n in [5, 7], k in {1, 3, 5}, m in [1, 3]; iterations(1) per case.
  for (uint32_t n = 5; n < 8; ++n) {
    for (size_t k = 1; k <= 5; k += 2) {
      for (uint32_t m = 1; m <= 3; ++m) {
        GemmMicrokernelTester tester{};
        tester.mr(3).nr(4).kr(1).sr(1)
            .m(m).n(n).k(k)
            .iterations(1)
            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4__scalar_lrintf,
                  xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
                  xnn_qu8_requantize_fp32);
      }
    }
  }
}
40010
TEST(QU8_GEMM_MINMAX_FP32_3X4__SCALAR_LRINTF, n_div_4) {
  // n in {8, 12} (multiples of nr = 4) with m = 3, for k in {1, 3, 5}.
  for (uint32_t n = 8; n <= 12; n += 4) {
    for (size_t k = 1; k <= 5; k += 2) {
      GemmMicrokernelTester tester{};
      tester.mr(3).nr(4).kr(1).sr(1)
          .m(3).n(n).k(k)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4__scalar_lrintf,
                xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
                xnn_qu8_requantize_fp32);
    }
  }
}
40026
TEST(QU8_GEMM_MINMAX_FP32_3X4__SCALAR_LRINTF, n_div_4_strided_cn) {
  // n in {8, 12} with m = 3, for k in {1, 3, 5}, with cn_stride set to 7.
  for (uint32_t n = 8; n <= 12; n += 4) {
    for (size_t k = 1; k <= 5; k += 2) {
      GemmMicrokernelTester tester{};
      tester.mr(3).nr(4).kr(1).sr(1)
          .m(3).n(n).k(k)
          .cn_stride(7)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4__scalar_lrintf,
                xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
                xnn_qu8_requantize_fp32);
    }
  }
}
40043
TEST(QU8_GEMM_MINMAX_FP32_3X4__SCALAR_LRINTF, n_div_4_strided_a) {
  // n in {8, 12} with m = 3, for k in {1, 3, 5}, with a_stride set to 7.
  for (uint32_t n = 8; n <= 12; n += 4) {
    for (size_t k = 1; k <= 5; k += 2) {
      GemmMicrokernelTester tester{};
      tester.mr(3).nr(4).kr(1).sr(1)
          .m(3).n(n).k(k)
          .a_stride(7)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4__scalar_lrintf,
                xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
                xnn_qu8_requantize_fp32);
    }
  }
}
40060
TEST(QU8_GEMM_MINMAX_FP32_3X4__SCALAR_LRINTF, n_div_4_subtile) {
  // n in {8, 12}, k in {1, 3, 5}, m in [1, 3]; iterations(1) per case.
  for (uint32_t n = 8; n <= 12; n += 4) {
    for (size_t k = 1; k <= 5; k += 2) {
      for (uint32_t m = 1; m <= 3; ++m) {
        GemmMicrokernelTester tester{};
        tester.mr(3).nr(4).kr(1).sr(1)
            .m(m).n(n).k(k)
            .iterations(1)
            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4__scalar_lrintf,
                  xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
                  xnn_qu8_requantize_fp32);
      }
    }
  }
}
40079
TEST(QU8_GEMM_MINMAX_FP32_3X4__SCALAR_LRINTF, strided_cm_subtile) {
  // k in {1, 3, 5}, n in [1, 4], m in [1, 3] with cm_stride set to 7; iterations(1) per case.
  for (size_t k = 1; k <= 5; k += 2) {
    for (uint32_t n = 1; n <= 4; ++n) {
      for (uint32_t m = 1; m <= 3; ++m) {
        GemmMicrokernelTester tester{};
        tester.mr(3).nr(4).kr(1).sr(1)
            .m(m).n(n).k(k)
            .cm_stride(7)
            .iterations(1)
            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4__scalar_lrintf,
                  xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
                  xnn_qu8_requantize_fp32);
      }
    }
  }
}
40099
TEST(QU8_GEMM_MINMAX_FP32_3X4__SCALAR_LRINTF, qmin) {
  // Full 3x4 tile (m = 3, n = 4) at k = 1 with qmin set to 128.
  GemmMicrokernelTester tester{};
  tester.mr(3).nr(4).kr(1).sr(1)
      .m(3).n(4).k(1)
      .qmin(128)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4__scalar_lrintf,
            xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
            xnn_qu8_requantize_fp32);
}
40112
TEST(QU8_GEMM_MINMAX_FP32_3X4__SCALAR_LRINTF, qmax) {
  // Full 3x4 tile (m = 3, n = 4) at k = 1 with qmax set to 128.
  GemmMicrokernelTester tester{};
  tester.mr(3).nr(4).kr(1).sr(1)
      .m(3).n(4).k(1)
      .qmax(128)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4__scalar_lrintf,
            xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
            xnn_qu8_requantize_fp32);
}
40125
TEST(QU8_GEMM_MINMAX_FP32_3X4__SCALAR_LRINTF, strided_cm) {
  // Full 3x4 tile (m = 3, n = 4) at k = 1 with cm_stride set to 7.
  GemmMicrokernelTester tester{};
  tester.mr(3).nr(4).kr(1).sr(1)
      .m(3).n(4).k(1)
      .cm_stride(7)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4__scalar_lrintf,
            xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
            xnn_qu8_requantize_fp32);
}
40138
TEST(QU8_GEMM_MINMAX_FP32_3X4__SCALAR_LRINTF, no_a_zero_point) {
  // Full 3x4 tile for k in {1, 3, 5} with a_zero_point set to 0.
  for (size_t k = 1; k <= 5; k += 2) {
    GemmMicrokernelTester tester{};
    tester.mr(3).nr(4).kr(1).sr(1)
        .m(3).n(4).k(k)
        .a_zero_point(0)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4__scalar_lrintf,
              xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
              xnn_qu8_requantize_fp32);
  }
}
40153
TEST(QU8_GEMM_MINMAX_FP32_3X4__SCALAR_LRINTF, no_b_zero_point) {
  // Full 3x4 tile for k in {1, 3, 5} with b_zero_point set to 0.
  for (size_t k = 1; k <= 5; k += 2) {
    GemmMicrokernelTester tester{};
    tester.mr(3).nr(4).kr(1).sr(1)
        .m(3).n(4).k(k)
        .b_zero_point(0)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4__scalar_lrintf,
              xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
              xnn_qu8_requantize_fp32);
  }
}
40168
TEST(QU8_GEMM_MINMAX_FP32_3X4__SCALAR_LRINTF, no_zero_point) {
  // Full 3x4 tile for k in {1, 3, 5} with both a_zero_point and b_zero_point set to 0.
  for (size_t k = 1; k <= 5; k += 2) {
    GemmMicrokernelTester tester{};
    tester.mr(3).nr(4).kr(1).sr(1)
        .m(3).n(4).k(k)
        .a_zero_point(0)
        .b_zero_point(0)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_3x4__scalar_lrintf,
              xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
              xnn_qu8_requantize_fp32);
  }
}
40184
40185
TEST(QU8_GEMM_MINMAX_FP32_4X2__SCALAR_FMAGIC, k_eq_1) {
  // Full 4x2 tile (m = 4, n = 2) at k = 1.
  GemmMicrokernelTester tester{};
  tester.mr(4).nr(2).kr(1).sr(1)
      .m(4).n(2).k(1)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x2__scalar_fmagic,
            xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
            xnn_qu8_requantize_fp32);
}
40197
TEST(QU8_GEMM_MINMAX_FP32_4X2__SCALAR_FMAGIC, strided_cn) {
  // Full 4x2 tile (m = 4, n = 2) at k = 1 with cn_stride set to 5.
  GemmMicrokernelTester tester{};
  tester.mr(4).nr(2).kr(1).sr(1)
      .m(4).n(2).k(1)
      .cn_stride(5)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x2__scalar_fmagic,
            xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
            xnn_qu8_requantize_fp32);
}
40210
TEST(QU8_GEMM_MINMAX_FP32_4X2__SCALAR_FMAGIC, k_eq_1_strided_a) {
  // Full 4x2 tile (m = 4, n = 2) at k = 1 with a_stride set to 3.
  GemmMicrokernelTester tester{};
  tester.mr(4).nr(2).kr(1).sr(1)
      .m(4).n(2).k(1)
      .a_stride(3)
      .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x2__scalar_fmagic,
            xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
            xnn_qu8_requantize_fp32);
}
40223
TEST(QU8_GEMM_MINMAX_FP32_4X2__SCALAR_FMAGIC, k_eq_1_subtile) {
  // Every (m, n) with m in [1, 4] and n in [1, 2] at k = 1; iterations(1) per case.
  for (uint32_t n = 1; n <= 2; ++n) {
    for (uint32_t m = 1; m <= 4; ++m) {
      GemmMicrokernelTester tester{};
      tester.mr(4).nr(2).kr(1).sr(1)
          .m(m).n(n).k(1)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x2__scalar_fmagic,
                xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
                xnn_qu8_requantize_fp32);
    }
  }
}
40240
TEST(QU8_GEMM_MINMAX_FP32_4X2__SCALAR_FMAGIC, k_eq_1_subtile_m) {
  // m swept over [1, 4] with n fixed at 2 and k = 1; iterations(1) per case.
  for (uint32_t m = 1; m <= 4; ++m) {
    GemmMicrokernelTester tester{};
    tester.mr(4).nr(2).kr(1).sr(1)
        .m(m).n(2).k(1)
        .iterations(1)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x2__scalar_fmagic,
              xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
              xnn_qu8_requantize_fp32);
  }
}
40255
TEST(QU8_GEMM_MINMAX_FP32_4X2__SCALAR_FMAGIC, k_eq_1_subtile_n) {
  // n swept over [1, 2] with m fixed at 4 and k = 1; iterations(1) per case.
  for (uint32_t n = 1; n <= 2; ++n) {
    GemmMicrokernelTester tester{};
    tester.mr(4).nr(2).kr(1).sr(1)
        .m(4).n(n).k(1)
        .iterations(1)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x2__scalar_fmagic,
              xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
              xnn_qu8_requantize_fp32);
  }
}
40270
TEST(QU8_GEMM_MINMAX_FP32_4X2__SCALAR_FMAGIC, k_gt_1) {
  // Full 4x2 tile for every k in [2, 9].
  for (size_t k = 2; k < 10; ++k) {
    GemmMicrokernelTester tester{};
    tester.mr(4).nr(2).kr(1).sr(1)
        .m(4).n(2).k(k)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x2__scalar_fmagic,
              xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
              xnn_qu8_requantize_fp32);
  }
}
40284
TEST(QU8_GEMM_MINMAX_FP32_4X2__SCALAR_FMAGIC, k_gt_1_strided_a) {
  // Full 4x2 tile for every k in [2, 9] with a_stride set to 11.
  for (size_t k = 2; k < 10; ++k) {
    GemmMicrokernelTester tester{};
    tester.mr(4).nr(2).kr(1).sr(1)
        .m(4).n(2).k(k)
        .a_stride(11)
        .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x2__scalar_fmagic,
              xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
              xnn_qu8_requantize_fp32);
  }
}
40299
TEST(QU8_GEMM_MINMAX_FP32_4X2__SCALAR_FMAGIC, k_gt_1_subtile) {
  // Every (m, n) with m in [1, 4], n in [1, 2] for each k in [2, 9]; iterations(1) per case.
  for (size_t k = 2; k < 10; ++k) {
    for (uint32_t n = 1; n <= 2; ++n) {
      for (uint32_t m = 1; m <= 4; ++m) {
        GemmMicrokernelTester tester{};
        tester.mr(4).nr(2).kr(1).sr(1)
            .m(m).n(n).k(k)
            .iterations(1)
            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x2__scalar_fmagic,
                  xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
                  xnn_qu8_requantize_fp32);
      }
    }
  }
}
40318
TEST(QU8_GEMM_MINMAX_FP32_4X2__SCALAR_FMAGIC, n_gt_2) {
  // The n loop covers only n = 3 (above nr = 2); m = 4, k in {1, 3, 5}.
  for (uint32_t n = 3; n < 4; ++n) {
    for (size_t k = 1; k <= 5; k += 2) {
      GemmMicrokernelTester tester{};
      tester.mr(4).nr(2).kr(1).sr(1)
          .m(4).n(n).k(k)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x2__scalar_fmagic,
                xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
                xnn_qu8_requantize_fp32);
    }
  }
}
40334
TEST(QU8_GEMM_MINMAX_FP32_4X2__SCALAR_FMAGIC, n_gt_2_strided_cn) {
  // The n loop covers only n = 3; m = 4, k in {1, 3, 5}, with cn_stride set to 5.
  for (uint32_t n = 3; n < 4; ++n) {
    for (size_t k = 1; k <= 5; k += 2) {
      GemmMicrokernelTester tester{};
      tester.mr(4).nr(2).kr(1).sr(1)
          .m(4).n(n).k(k)
          .cn_stride(5)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x2__scalar_fmagic,
                xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
                xnn_qu8_requantize_fp32);
    }
  }
}
40351
TEST(QU8_GEMM_MINMAX_FP32_4X2__SCALAR_FMAGIC, n_gt_2_strided_a) {
  // The n loop covers only n = 3; m = 4, k in {1, 3, 5}, with a_stride set to 7.
  for (uint32_t n = 3; n < 4; ++n) {
    for (size_t k = 1; k <= 5; k += 2) {
      GemmMicrokernelTester tester{};
      tester.mr(4).nr(2).kr(1).sr(1)
          .m(4).n(n).k(k)
          .a_stride(7)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x2__scalar_fmagic,
                xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
                xnn_qu8_requantize_fp32);
    }
  }
}
40368
TEST(QU8_GEMM_MINMAX_FP32_4X2__SCALAR_FMAGIC, n_gt_2_subtile) {
  // The n loop covers only n = 3; k in {1, 3, 5}, m in [1, 4]; iterations(1) per case.
  for (uint32_t n = 3; n < 4; ++n) {
    for (size_t k = 1; k <= 5; k += 2) {
      for (uint32_t m = 1; m <= 4; ++m) {
        GemmMicrokernelTester tester{};
        tester.mr(4).nr(2).kr(1).sr(1)
            .m(m).n(n).k(k)
            .iterations(1)
            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x2__scalar_fmagic,
                  xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
                  xnn_qu8_requantize_fp32);
      }
    }
  }
}
40387
TEST(QU8_GEMM_MINMAX_FP32_4X2__SCALAR_FMAGIC, n_div_2) {
  // n in {4, 6} (multiples of nr = 2) with m = 4, for k in {1, 3, 5}.
  for (uint32_t n = 4; n <= 6; n += 2) {
    for (size_t k = 1; k <= 5; k += 2) {
      GemmMicrokernelTester tester{};
      tester.mr(4).nr(2).kr(1).sr(1)
          .m(4).n(n).k(k)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x2__scalar_fmagic,
                xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
                xnn_qu8_requantize_fp32);
    }
  }
}
40403
TEST(QU8_GEMM_MINMAX_FP32_4X2__SCALAR_FMAGIC, n_div_2_strided_cn) {
  // n in {4, 6} with m = 4, for k in {1, 3, 5}, with cn_stride set to 5.
  for (uint32_t n = 4; n <= 6; n += 2) {
    for (size_t k = 1; k <= 5; k += 2) {
      GemmMicrokernelTester tester{};
      tester.mr(4).nr(2).kr(1).sr(1)
          .m(4).n(n).k(k)
          .cn_stride(5)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x2__scalar_fmagic,
                xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
                xnn_qu8_requantize_fp32);
    }
  }
}
40420
TEST(QU8_GEMM_MINMAX_FP32_4X2__SCALAR_FMAGIC, n_div_2_strided_a) {
  // n in {4, 6} with m = 4, for k in {1, 3, 5}, with a_stride set to 7.
  for (uint32_t n = 4; n <= 6; n += 2) {
    for (size_t k = 1; k <= 5; k += 2) {
      GemmMicrokernelTester tester{};
      tester.mr(4).nr(2).kr(1).sr(1)
          .m(4).n(n).k(k)
          .a_stride(7)
          .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x2__scalar_fmagic,
                xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
                xnn_qu8_requantize_fp32);
    }
  }
}
40437
TEST(QU8_GEMM_MINMAX_FP32_4X2__SCALAR_FMAGIC, n_div_2_subtile) {
  // n in {4, 6}, k in {1, 3, 5}, m in [1, 4]; iterations(1) per case.
  for (uint32_t n = 4; n <= 6; n += 2) {
    for (size_t k = 1; k <= 5; k += 2) {
      for (uint32_t m = 1; m <= 4; ++m) {
        GemmMicrokernelTester tester{};
        tester.mr(4).nr(2).kr(1).sr(1)
            .m(m).n(n).k(k)
            .iterations(1)
            .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x2__scalar_fmagic,
                  xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
                  xnn_qu8_requantize_fp32);
      }
    }
  }
}
40456
// Sweeps k over {1, 3, 5}, n over 1..2 and m over 1..4 with cm_stride set
// to 5, running one iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_4X2__SCALAR_FMAGIC, strided_cm_subtile) {
  for (size_t k_size = 1; k_size <= 5; k_size += 2) {
    for (uint32_t n_size = 1; n_size <= 2; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
        GemmMicrokernelTester tester{};
        tester.mr(4).nr(2).kr(1).sr(1);
        tester.m(m_size).n(n_size).k(k_size);
        tester.cm_stride(5).iterations(1);
        tester.Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_4x2__scalar_fmagic,
            xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
40476
// Single run with m=4, n=2, k=1 and qmin set to 128.
TEST(QU8_GEMM_MINMAX_FP32_4X2__SCALAR_FMAGIC, qmin) {
  GemmMicrokernelTester tester{};
  tester.mr(4).nr(2).kr(1).sr(1);
  tester.m(4).n(2).k(1);
  tester.qmin(128);
  tester.Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_4x2__scalar_fmagic,
      xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
      xnn_qu8_requantize_fp32);
}
40489
// Single run with m=4, n=2, k=1 and qmax set to 128.
TEST(QU8_GEMM_MINMAX_FP32_4X2__SCALAR_FMAGIC, qmax) {
  GemmMicrokernelTester tester{};
  tester.mr(4).nr(2).kr(1).sr(1);
  tester.m(4).n(2).k(1);
  tester.qmax(128);
  tester.Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_4x2__scalar_fmagic,
      xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
      xnn_qu8_requantize_fp32);
}
40502
// Single run with m=4, n=2, k=1 and cm_stride set to 5.
TEST(QU8_GEMM_MINMAX_FP32_4X2__SCALAR_FMAGIC, strided_cm) {
  GemmMicrokernelTester tester{};
  tester.mr(4).nr(2).kr(1).sr(1);
  tester.m(4).n(2).k(1);
  tester.cm_stride(5);
  tester.Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_4x2__scalar_fmagic,
      xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
      xnn_qu8_requantize_fp32);
}
40515
// Sweeps k over {1, 3, 5} at m=4, n=2 with a_zero_point set to 0.
TEST(QU8_GEMM_MINMAX_FP32_4X2__SCALAR_FMAGIC, no_a_zero_point) {
  for (size_t k_size = 1; k_size <= 5; k_size += 2) {
    GemmMicrokernelTester tester{};
    tester.mr(4).nr(2).kr(1).sr(1);
    tester.m(4).n(2).k(k_size);
    tester.a_zero_point(0);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_4x2__scalar_fmagic,
        xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
        xnn_qu8_requantize_fp32);
  }
}
40530
// Sweeps k over {1, 3, 5} at m=4, n=2 with b_zero_point set to 0.
TEST(QU8_GEMM_MINMAX_FP32_4X2__SCALAR_FMAGIC, no_b_zero_point) {
  for (size_t k_size = 1; k_size <= 5; k_size += 2) {
    GemmMicrokernelTester tester{};
    tester.mr(4).nr(2).kr(1).sr(1);
    tester.m(4).n(2).k(k_size);
    tester.b_zero_point(0);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_4x2__scalar_fmagic,
        xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
        xnn_qu8_requantize_fp32);
  }
}
40545
// Sweeps k over {1, 3, 5} at m=4, n=2 with both a_zero_point and
// b_zero_point set to 0.
TEST(QU8_GEMM_MINMAX_FP32_4X2__SCALAR_FMAGIC, no_zero_point) {
  for (size_t k_size = 1; k_size <= 5; k_size += 2) {
    GemmMicrokernelTester tester{};
    tester.mr(4).nr(2).kr(1).sr(1);
    tester.m(4).n(2).k(k_size);
    tester.a_zero_point(0).b_zero_point(0);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_4x2__scalar_fmagic,
        xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
        xnn_qu8_requantize_fp32);
  }
}
40561
40562
// Single run with m=4, n=2, k=1 (tester configured with mr=4, nr=2, kr=1, sr=1).
TEST(QU8_GEMM_MINMAX_FP32_4X2__SCALAR_LRINTF, k_eq_1) {
  GemmMicrokernelTester tester{};
  tester.mr(4).nr(2).kr(1).sr(1);
  tester.m(4).n(2).k(1);
  tester.Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_4x2__scalar_lrintf,
      xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
      xnn_qu8_requantize_fp32);
}
40574
// Single run with m=4, n=2, k=1 and cn_stride set to 5.
TEST(QU8_GEMM_MINMAX_FP32_4X2__SCALAR_LRINTF, strided_cn) {
  GemmMicrokernelTester tester{};
  tester.mr(4).nr(2).kr(1).sr(1);
  tester.m(4).n(2).k(1);
  tester.cn_stride(5);
  tester.Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_4x2__scalar_lrintf,
      xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
      xnn_qu8_requantize_fp32);
}
40587
// Single run with m=4, n=2, k=1 and a_stride set to 3.
TEST(QU8_GEMM_MINMAX_FP32_4X2__SCALAR_LRINTF, k_eq_1_strided_a) {
  GemmMicrokernelTester tester{};
  tester.mr(4).nr(2).kr(1).sr(1);
  tester.m(4).n(2).k(1);
  tester.a_stride(3);
  tester.Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_4x2__scalar_lrintf,
      xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
      xnn_qu8_requantize_fp32);
}
40600
// Sweeps n over 1..2 and m over 1..4 at k=1, one iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_4X2__SCALAR_LRINTF, k_eq_1_subtile) {
  for (uint32_t n_size = 1; n_size <= 2; ++n_size) {
    for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
      GemmMicrokernelTester tester{};
      tester.mr(4).nr(2).kr(1).sr(1);
      tester.m(m_size).n(n_size).k(1);
      tester.iterations(1);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_4x2__scalar_lrintf,
          xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
40617
// Sweeps m over 1..4 at n=2, k=1, one iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_4X2__SCALAR_LRINTF, k_eq_1_subtile_m) {
  for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
    GemmMicrokernelTester tester{};
    tester.mr(4).nr(2).kr(1).sr(1);
    tester.m(m_size).n(2).k(1);
    tester.iterations(1);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_4x2__scalar_lrintf,
        xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
        xnn_qu8_requantize_fp32);
  }
}
40632
// Sweeps n over 1..2 at m=4, k=1, one iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_4X2__SCALAR_LRINTF, k_eq_1_subtile_n) {
  for (uint32_t n_size = 1; n_size <= 2; ++n_size) {
    GemmMicrokernelTester tester{};
    tester.mr(4).nr(2).kr(1).sr(1);
    tester.m(4).n(n_size).k(1);
    tester.iterations(1);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_4x2__scalar_lrintf,
        xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
        xnn_qu8_requantize_fp32);
  }
}
40647
// Sweeps k over 2..9 at m=4, n=2.
TEST(QU8_GEMM_MINMAX_FP32_4X2__SCALAR_LRINTF, k_gt_1) {
  for (size_t k_size = 2; k_size < 10; ++k_size) {
    GemmMicrokernelTester tester{};
    tester.mr(4).nr(2).kr(1).sr(1);
    tester.m(4).n(2).k(k_size);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_4x2__scalar_lrintf,
        xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
        xnn_qu8_requantize_fp32);
  }
}
40661
// Sweeps k over 2..9 at m=4, n=2 with a_stride set to 11.
TEST(QU8_GEMM_MINMAX_FP32_4X2__SCALAR_LRINTF, k_gt_1_strided_a) {
  for (size_t k_size = 2; k_size < 10; ++k_size) {
    GemmMicrokernelTester tester{};
    tester.mr(4).nr(2).kr(1).sr(1);
    tester.m(4).n(2).k(k_size);
    tester.a_stride(11);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_4x2__scalar_lrintf,
        xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
        xnn_qu8_requantize_fp32);
  }
}
40676
// Sweeps k over 2..9, n over 1..2 and m over 1..4, one iteration per
// configuration.
TEST(QU8_GEMM_MINMAX_FP32_4X2__SCALAR_LRINTF, k_gt_1_subtile) {
  for (size_t k_size = 2; k_size < 10; ++k_size) {
    for (uint32_t n_size = 1; n_size <= 2; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
        GemmMicrokernelTester tester{};
        tester.mr(4).nr(2).kr(1).sr(1);
        tester.m(m_size).n(n_size).k(k_size);
        tester.iterations(1);
        tester.Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_4x2__scalar_lrintf,
            xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
40695
// Runs n=3 (the only value in [3, 4)) against k in {1, 3, 5} with m = 4.
TEST(QU8_GEMM_MINMAX_FP32_4X2__SCALAR_LRINTF, n_gt_2) {
  for (uint32_t n_size = 3; n_size < 4; ++n_size) {
    for (size_t k_size = 1; k_size <= 5; k_size += 2) {
      GemmMicrokernelTester tester{};
      tester.mr(4).nr(2).kr(1).sr(1);
      tester.m(4).n(n_size).k(k_size);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_4x2__scalar_lrintf,
          xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
40711
// Runs n=3 against k in {1, 3, 5} with m = 4 and cn_stride set to 5.
TEST(QU8_GEMM_MINMAX_FP32_4X2__SCALAR_LRINTF, n_gt_2_strided_cn) {
  for (uint32_t n_size = 3; n_size < 4; ++n_size) {
    for (size_t k_size = 1; k_size <= 5; k_size += 2) {
      GemmMicrokernelTester tester{};
      tester.mr(4).nr(2).kr(1).sr(1);
      tester.m(4).n(n_size).k(k_size);
      tester.cn_stride(5);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_4x2__scalar_lrintf,
          xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
40728
// Runs n=3 against k in {1, 3, 5} with m = 4 and a_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_4X2__SCALAR_LRINTF, n_gt_2_strided_a) {
  for (uint32_t n_size = 3; n_size < 4; ++n_size) {
    for (size_t k_size = 1; k_size <= 5; k_size += 2) {
      GemmMicrokernelTester tester{};
      tester.mr(4).nr(2).kr(1).sr(1);
      tester.m(4).n(n_size).k(k_size);
      tester.a_stride(7);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_4x2__scalar_lrintf,
          xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
40745
// Runs n=3 against k in {1, 3, 5} and m in 1..4, one iteration per
// configuration.
TEST(QU8_GEMM_MINMAX_FP32_4X2__SCALAR_LRINTF, n_gt_2_subtile) {
  for (uint32_t n_size = 3; n_size < 4; ++n_size) {
    for (size_t k_size = 1; k_size <= 5; k_size += 2) {
      for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
        GemmMicrokernelTester tester{};
        tester.mr(4).nr(2).kr(1).sr(1);
        tester.m(m_size).n(n_size).k(k_size);
        tester.iterations(1);
        tester.Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_4x2__scalar_lrintf,
            xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
40764
// Sweeps n over {4, 6} and k over {1, 3, 5} with m fixed at 4.
TEST(QU8_GEMM_MINMAX_FP32_4X2__SCALAR_LRINTF, n_div_2) {
  for (uint32_t n_size = 4; n_size <= 6; n_size += 2) {
    for (size_t k_size = 1; k_size <= 5; k_size += 2) {
      GemmMicrokernelTester tester{};
      tester.mr(4).nr(2).kr(1).sr(1);
      tester.m(4).n(n_size).k(k_size);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_4x2__scalar_lrintf,
          xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
40780
// Sweeps n over {4, 6} and k over {1, 3, 5} with m = 4 and cn_stride set to 5.
TEST(QU8_GEMM_MINMAX_FP32_4X2__SCALAR_LRINTF, n_div_2_strided_cn) {
  for (uint32_t n_size = 4; n_size <= 6; n_size += 2) {
    for (size_t k_size = 1; k_size <= 5; k_size += 2) {
      GemmMicrokernelTester tester{};
      tester.mr(4).nr(2).kr(1).sr(1);
      tester.m(4).n(n_size).k(k_size);
      tester.cn_stride(5);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_4x2__scalar_lrintf,
          xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
40797
// Sweeps n over {4, 6} and k over {1, 3, 5} with m = 4 and a_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_4X2__SCALAR_LRINTF, n_div_2_strided_a) {
  for (uint32_t n_size = 4; n_size <= 6; n_size += 2) {
    for (size_t k_size = 1; k_size <= 5; k_size += 2) {
      GemmMicrokernelTester tester{};
      tester.mr(4).nr(2).kr(1).sr(1);
      tester.m(4).n(n_size).k(k_size);
      tester.a_stride(7);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_4x2__scalar_lrintf,
          xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
40814
// Sweeps n over {4, 6}, k over {1, 3, 5} and m over 1..4, one iteration per
// configuration.
TEST(QU8_GEMM_MINMAX_FP32_4X2__SCALAR_LRINTF, n_div_2_subtile) {
  for (uint32_t n_size = 4; n_size <= 6; n_size += 2) {
    for (size_t k_size = 1; k_size <= 5; k_size += 2) {
      for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
        GemmMicrokernelTester tester{};
        tester.mr(4).nr(2).kr(1).sr(1);
        tester.m(m_size).n(n_size).k(k_size);
        tester.iterations(1);
        tester.Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_4x2__scalar_lrintf,
            xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
40833
// Sweeps k over {1, 3, 5}, n over 1..2 and m over 1..4 with cm_stride set
// to 5, one iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_4X2__SCALAR_LRINTF, strided_cm_subtile) {
  for (size_t k_size = 1; k_size <= 5; k_size += 2) {
    for (uint32_t n_size = 1; n_size <= 2; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
        GemmMicrokernelTester tester{};
        tester.mr(4).nr(2).kr(1).sr(1);
        tester.m(m_size).n(n_size).k(k_size);
        tester.cm_stride(5).iterations(1);
        tester.Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_4x2__scalar_lrintf,
            xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
40853
// Single run with m=4, n=2, k=1 and qmin set to 128.
TEST(QU8_GEMM_MINMAX_FP32_4X2__SCALAR_LRINTF, qmin) {
  GemmMicrokernelTester tester{};
  tester.mr(4).nr(2).kr(1).sr(1);
  tester.m(4).n(2).k(1);
  tester.qmin(128);
  tester.Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_4x2__scalar_lrintf,
      xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
      xnn_qu8_requantize_fp32);
}
40866
// Single run with m=4, n=2, k=1 and qmax set to 128.
TEST(QU8_GEMM_MINMAX_FP32_4X2__SCALAR_LRINTF, qmax) {
  GemmMicrokernelTester tester{};
  tester.mr(4).nr(2).kr(1).sr(1);
  tester.m(4).n(2).k(1);
  tester.qmax(128);
  tester.Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_4x2__scalar_lrintf,
      xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
      xnn_qu8_requantize_fp32);
}
40879
// Single run with m=4, n=2, k=1 and cm_stride set to 5.
TEST(QU8_GEMM_MINMAX_FP32_4X2__SCALAR_LRINTF, strided_cm) {
  GemmMicrokernelTester tester{};
  tester.mr(4).nr(2).kr(1).sr(1);
  tester.m(4).n(2).k(1);
  tester.cm_stride(5);
  tester.Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_4x2__scalar_lrintf,
      xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
      xnn_qu8_requantize_fp32);
}
40892
// Sweeps k over {1, 3, 5} at m=4, n=2 with a_zero_point set to 0.
TEST(QU8_GEMM_MINMAX_FP32_4X2__SCALAR_LRINTF, no_a_zero_point) {
  for (size_t k_size = 1; k_size <= 5; k_size += 2) {
    GemmMicrokernelTester tester{};
    tester.mr(4).nr(2).kr(1).sr(1);
    tester.m(4).n(2).k(k_size);
    tester.a_zero_point(0);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_4x2__scalar_lrintf,
        xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
        xnn_qu8_requantize_fp32);
  }
}
40907
// Sweeps k over {1, 3, 5} at m=4, n=2 with b_zero_point set to 0.
TEST(QU8_GEMM_MINMAX_FP32_4X2__SCALAR_LRINTF, no_b_zero_point) {
  for (size_t k_size = 1; k_size <= 5; k_size += 2) {
    GemmMicrokernelTester tester{};
    tester.mr(4).nr(2).kr(1).sr(1);
    tester.m(4).n(2).k(k_size);
    tester.b_zero_point(0);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_4x2__scalar_lrintf,
        xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
        xnn_qu8_requantize_fp32);
  }
}
40922
// Sweeps k over {1, 3, 5} at m=4, n=2 with both a_zero_point and
// b_zero_point set to 0.
TEST(QU8_GEMM_MINMAX_FP32_4X2__SCALAR_LRINTF, no_zero_point) {
  for (size_t k_size = 1; k_size <= 5; k_size += 2) {
    GemmMicrokernelTester tester{};
    tester.mr(4).nr(2).kr(1).sr(1);
    tester.m(4).n(2).k(k_size);
    tester.a_zero_point(0).b_zero_point(0);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_4x2__scalar_lrintf,
        xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
        xnn_qu8_requantize_fp32);
  }
}
40938
40939
// Single run with m=4, n=4, k=1 (tester configured with mr=4, nr=4, kr=1, sr=1).
TEST(QU8_GEMM_MINMAX_FP32_4X4__SCALAR_FMAGIC, k_eq_1) {
  GemmMicrokernelTester tester{};
  tester.mr(4).nr(4).kr(1).sr(1);
  tester.m(4).n(4).k(1);
  tester.Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_4x4__scalar_fmagic,
      xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
      xnn_qu8_requantize_fp32);
}
40951
// Single run with m=4, n=4, k=1 and cn_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_4X4__SCALAR_FMAGIC, strided_cn) {
  GemmMicrokernelTester tester{};
  tester.mr(4).nr(4).kr(1).sr(1);
  tester.m(4).n(4).k(1);
  tester.cn_stride(7);
  tester.Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_4x4__scalar_fmagic,
      xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
      xnn_qu8_requantize_fp32);
}
40964
// Single run with m=4, n=4, k=1 and a_stride set to 3.
TEST(QU8_GEMM_MINMAX_FP32_4X4__SCALAR_FMAGIC, k_eq_1_strided_a) {
  GemmMicrokernelTester tester{};
  tester.mr(4).nr(4).kr(1).sr(1);
  tester.m(4).n(4).k(1);
  tester.a_stride(3);
  tester.Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_4x4__scalar_fmagic,
      xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
      xnn_qu8_requantize_fp32);
}
40977
// Sweeps n over 1..4 and m over 1..4 at k=1, one iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_4X4__SCALAR_FMAGIC, k_eq_1_subtile) {
  for (uint32_t n_size = 1; n_size <= 4; ++n_size) {
    for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
      GemmMicrokernelTester tester{};
      tester.mr(4).nr(4).kr(1).sr(1);
      tester.m(m_size).n(n_size).k(1);
      tester.iterations(1);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_4x4__scalar_fmagic,
          xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
40994
// Sweeps m over 1..4 at n=4, k=1, one iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_4X4__SCALAR_FMAGIC, k_eq_1_subtile_m) {
  for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
    GemmMicrokernelTester tester{};
    tester.mr(4).nr(4).kr(1).sr(1);
    tester.m(m_size).n(4).k(1);
    tester.iterations(1);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_4x4__scalar_fmagic,
        xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
        xnn_qu8_requantize_fp32);
  }
}
41009
// Sweeps n over 1..4 at m=4, k=1, one iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_4X4__SCALAR_FMAGIC, k_eq_1_subtile_n) {
  for (uint32_t n_size = 1; n_size <= 4; ++n_size) {
    GemmMicrokernelTester tester{};
    tester.mr(4).nr(4).kr(1).sr(1);
    tester.m(4).n(n_size).k(1);
    tester.iterations(1);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_4x4__scalar_fmagic,
        xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
        xnn_qu8_requantize_fp32);
  }
}
41024
// Sweeps k over 2..9 at m=4, n=4.
TEST(QU8_GEMM_MINMAX_FP32_4X4__SCALAR_FMAGIC, k_gt_1) {
  for (size_t k_size = 2; k_size < 10; ++k_size) {
    GemmMicrokernelTester tester{};
    tester.mr(4).nr(4).kr(1).sr(1);
    tester.m(4).n(4).k(k_size);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_4x4__scalar_fmagic,
        xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
        xnn_qu8_requantize_fp32);
  }
}
41038
// Sweeps k over 2..9 at m=4, n=4 with a_stride set to 11.
TEST(QU8_GEMM_MINMAX_FP32_4X4__SCALAR_FMAGIC, k_gt_1_strided_a) {
  for (size_t k_size = 2; k_size < 10; ++k_size) {
    GemmMicrokernelTester tester{};
    tester.mr(4).nr(4).kr(1).sr(1);
    tester.m(4).n(4).k(k_size);
    tester.a_stride(11);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_4x4__scalar_fmagic,
        xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
        xnn_qu8_requantize_fp32);
  }
}
41053
// Sweeps k over 2..9, n over 1..4 and m over 1..4, one iteration per
// configuration.
TEST(QU8_GEMM_MINMAX_FP32_4X4__SCALAR_FMAGIC, k_gt_1_subtile) {
  for (size_t k_size = 2; k_size < 10; ++k_size) {
    for (uint32_t n_size = 1; n_size <= 4; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
        GemmMicrokernelTester tester{};
        tester.mr(4).nr(4).kr(1).sr(1);
        tester.m(m_size).n(n_size).k(k_size);
        tester.iterations(1);
        tester.Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_4x4__scalar_fmagic,
            xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
41072
// Sweeps n over 5..7 and k over {1, 3, 5} with m fixed at 4.
TEST(QU8_GEMM_MINMAX_FP32_4X4__SCALAR_FMAGIC, n_gt_4) {
  for (uint32_t n_size = 5; n_size < 8; ++n_size) {
    for (size_t k_size = 1; k_size <= 5; k_size += 2) {
      GemmMicrokernelTester tester{};
      tester.mr(4).nr(4).kr(1).sr(1);
      tester.m(4).n(n_size).k(k_size);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_4x4__scalar_fmagic,
          xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
41088
// Sweeps n over 5..7 and k over {1, 3, 5} with m = 4 and cn_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_4X4__SCALAR_FMAGIC, n_gt_4_strided_cn) {
  for (uint32_t n_size = 5; n_size < 8; ++n_size) {
    for (size_t k_size = 1; k_size <= 5; k_size += 2) {
      GemmMicrokernelTester tester{};
      tester.mr(4).nr(4).kr(1).sr(1);
      tester.m(4).n(n_size).k(k_size);
      tester.cn_stride(7);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_4x4__scalar_fmagic,
          xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
41105
// Sweeps n over 5..7 and k over {1, 3, 5} with m = 4 and a_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_4X4__SCALAR_FMAGIC, n_gt_4_strided_a) {
  for (uint32_t n_size = 5; n_size < 8; ++n_size) {
    for (size_t k_size = 1; k_size <= 5; k_size += 2) {
      GemmMicrokernelTester tester{};
      tester.mr(4).nr(4).kr(1).sr(1);
      tester.m(4).n(n_size).k(k_size);
      tester.a_stride(7);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_4x4__scalar_fmagic,
          xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
41122
// Sweeps n over 5..7, k over {1, 3, 5} and m over 1..4, one iteration per
// configuration.
TEST(QU8_GEMM_MINMAX_FP32_4X4__SCALAR_FMAGIC, n_gt_4_subtile) {
  for (uint32_t n_size = 5; n_size < 8; ++n_size) {
    for (size_t k_size = 1; k_size <= 5; k_size += 2) {
      for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
        GemmMicrokernelTester tester{};
        tester.mr(4).nr(4).kr(1).sr(1);
        tester.m(m_size).n(n_size).k(k_size);
        tester.iterations(1);
        tester.Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_4x4__scalar_fmagic,
            xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
41141
// Sweeps n over {8, 12} and k over {1, 3, 5} with m fixed at 4.
TEST(QU8_GEMM_MINMAX_FP32_4X4__SCALAR_FMAGIC, n_div_4) {
  for (uint32_t n_size = 8; n_size <= 12; n_size += 4) {
    for (size_t k_size = 1; k_size <= 5; k_size += 2) {
      GemmMicrokernelTester tester{};
      tester.mr(4).nr(4).kr(1).sr(1);
      tester.m(4).n(n_size).k(k_size);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_4x4__scalar_fmagic,
          xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
41157
// Sweeps n over {8, 12} and k over {1, 3, 5} with m = 4 and cn_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_4X4__SCALAR_FMAGIC, n_div_4_strided_cn) {
  for (uint32_t n_size = 8; n_size <= 12; n_size += 4) {
    for (size_t k_size = 1; k_size <= 5; k_size += 2) {
      GemmMicrokernelTester tester{};
      tester.mr(4).nr(4).kr(1).sr(1);
      tester.m(4).n(n_size).k(k_size);
      tester.cn_stride(7);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_4x4__scalar_fmagic,
          xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
41174
// Sweeps n over {8, 12} and k over {1, 3, 5} with m = 4 and a_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_4X4__SCALAR_FMAGIC, n_div_4_strided_a) {
  for (uint32_t n_size = 8; n_size <= 12; n_size += 4) {
    for (size_t k_size = 1; k_size <= 5; k_size += 2) {
      GemmMicrokernelTester tester{};
      tester.mr(4).nr(4).kr(1).sr(1);
      tester.m(4).n(n_size).k(k_size);
      tester.a_stride(7);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_4x4__scalar_fmagic,
          xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
41191
// Sweeps n over {8, 12}, k over {1, 3, 5} and m over 1..4, one iteration per
// configuration.
TEST(QU8_GEMM_MINMAX_FP32_4X4__SCALAR_FMAGIC, n_div_4_subtile) {
  for (uint32_t n_size = 8; n_size <= 12; n_size += 4) {
    for (size_t k_size = 1; k_size <= 5; k_size += 2) {
      for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
        GemmMicrokernelTester tester{};
        tester.mr(4).nr(4).kr(1).sr(1);
        tester.m(m_size).n(n_size).k(k_size);
        tester.iterations(1);
        tester.Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_4x4__scalar_fmagic,
            xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
41210
// Sweeps k over {1, 3, 5}, n over 1..4 and m over 1..4 with cm_stride set
// to 7, one iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_4X4__SCALAR_FMAGIC, strided_cm_subtile) {
  for (size_t k_size = 1; k_size <= 5; k_size += 2) {
    for (uint32_t n_size = 1; n_size <= 4; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
        GemmMicrokernelTester tester{};
        tester.mr(4).nr(4).kr(1).sr(1);
        tester.m(m_size).n(n_size).k(k_size);
        tester.cm_stride(7).iterations(1);
        tester.Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_4x4__scalar_fmagic,
            xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
41230
// Single run with m=4, n=4, k=1 and qmin set to 128.
TEST(QU8_GEMM_MINMAX_FP32_4X4__SCALAR_FMAGIC, qmin) {
  GemmMicrokernelTester tester{};
  tester.mr(4).nr(4).kr(1).sr(1);
  tester.m(4).n(4).k(1);
  tester.qmin(128);
  tester.Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_4x4__scalar_fmagic,
      xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
      xnn_qu8_requantize_fp32);
}
41243
// Single run with m=4, n=4, k=1 and qmax set to 128.
TEST(QU8_GEMM_MINMAX_FP32_4X4__SCALAR_FMAGIC, qmax) {
  GemmMicrokernelTester tester{};
  tester.mr(4).nr(4).kr(1).sr(1);
  tester.m(4).n(4).k(1);
  tester.qmax(128);
  tester.Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_4x4__scalar_fmagic,
      xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
      xnn_qu8_requantize_fp32);
}
41256
// Single run with m=4, n=4, k=1 and cm_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_4X4__SCALAR_FMAGIC, strided_cm) {
  GemmMicrokernelTester tester{};
  tester.mr(4).nr(4).kr(1).sr(1);
  tester.m(4).n(4).k(1);
  tester.cm_stride(7);
  tester.Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_4x4__scalar_fmagic,
      xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
      xnn_qu8_requantize_fp32);
}
41269
// Sweeps k over {1, 3, 5} at m=4, n=4 with a_zero_point set to 0.
TEST(QU8_GEMM_MINMAX_FP32_4X4__SCALAR_FMAGIC, no_a_zero_point) {
  for (size_t k_size = 1; k_size <= 5; k_size += 2) {
    GemmMicrokernelTester tester{};
    tester.mr(4).nr(4).kr(1).sr(1);
    tester.m(4).n(4).k(k_size);
    tester.a_zero_point(0);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_4x4__scalar_fmagic,
        xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
        xnn_qu8_requantize_fp32);
  }
}
41284
// Sweeps k over {1, 3, 5} at m=4, n=4 with b_zero_point set to 0.
TEST(QU8_GEMM_MINMAX_FP32_4X4__SCALAR_FMAGIC, no_b_zero_point) {
  for (size_t k_size = 1; k_size <= 5; k_size += 2) {
    GemmMicrokernelTester tester{};
    tester.mr(4).nr(4).kr(1).sr(1);
    tester.m(4).n(4).k(k_size);
    tester.b_zero_point(0);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_4x4__scalar_fmagic,
        xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
        xnn_qu8_requantize_fp32);
  }
}
41299
// Sweeps k over {1, 3, 5} at m=4, n=4 with both a_zero_point and
// b_zero_point set to 0.
TEST(QU8_GEMM_MINMAX_FP32_4X4__SCALAR_FMAGIC, no_zero_point) {
  for (size_t k_size = 1; k_size <= 5; k_size += 2) {
    GemmMicrokernelTester tester{};
    tester.mr(4).nr(4).kr(1).sr(1);
    tester.m(4).n(4).k(k_size);
    tester.a_zero_point(0).b_zero_point(0);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_4x4__scalar_fmagic,
        xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
        xnn_qu8_requantize_fp32);
  }
}
41315
41316
// Single run with m=4, n=4, k=1 (tester configured with mr=4, nr=4, kr=1, sr=1).
TEST(QU8_GEMM_MINMAX_FP32_4X4__SCALAR_LRINTF, k_eq_1) {
  GemmMicrokernelTester tester{};
  tester.mr(4).nr(4).kr(1).sr(1);
  tester.m(4).n(4).k(1);
  tester.Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_4x4__scalar_lrintf,
      xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
      xnn_qu8_requantize_fp32);
}
41328
// Single run with m=4, n=4, k=1 and cn_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_4X4__SCALAR_LRINTF, strided_cn) {
  GemmMicrokernelTester tester{};
  tester.mr(4).nr(4).kr(1).sr(1);
  tester.m(4).n(4).k(1);
  tester.cn_stride(7);
  tester.Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_4x4__scalar_lrintf,
      xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
      xnn_qu8_requantize_fp32);
}
41341
// Single run with m=4, n=4, k=1 and a_stride set to 3.
TEST(QU8_GEMM_MINMAX_FP32_4X4__SCALAR_LRINTF, k_eq_1_strided_a) {
  GemmMicrokernelTester tester{};
  tester.mr(4).nr(4).kr(1).sr(1);
  tester.m(4).n(4).k(1);
  tester.a_stride(3);
  tester.Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_4x4__scalar_lrintf,
      xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
      xnn_qu8_requantize_fp32);
}
41354
// Sweeps n over 1..4 and m over 1..4 at k=1, one iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_4X4__SCALAR_LRINTF, k_eq_1_subtile) {
  for (uint32_t n_size = 1; n_size <= 4; ++n_size) {
    for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
      GemmMicrokernelTester tester{};
      tester.mr(4).nr(4).kr(1).sr(1);
      tester.m(m_size).n(n_size).k(1);
      tester.iterations(1);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_4x4__scalar_lrintf,
          xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
41371
// Sweeps m over 1..4 at n=4, k=1, one iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_4X4__SCALAR_LRINTF, k_eq_1_subtile_m) {
  for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
    GemmMicrokernelTester tester{};
    tester.mr(4).nr(4).kr(1).sr(1);
    tester.m(m_size).n(4).k(1);
    tester.iterations(1);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_4x4__scalar_lrintf,
        xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
        xnn_qu8_requantize_fp32);
  }
}
41386
// Sweeps n over 1..4 at m=4, k=1, one iteration per configuration.
TEST(QU8_GEMM_MINMAX_FP32_4X4__SCALAR_LRINTF, k_eq_1_subtile_n) {
  for (uint32_t n_size = 1; n_size <= 4; ++n_size) {
    GemmMicrokernelTester tester{};
    tester.mr(4).nr(4).kr(1).sr(1);
    tester.m(4).n(n_size).k(1);
    tester.iterations(1);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_4x4__scalar_lrintf,
        xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
        xnn_qu8_requantize_fp32);
  }
}
41401
// Sweeps k over 2..9 at m=4, n=4.
TEST(QU8_GEMM_MINMAX_FP32_4X4__SCALAR_LRINTF, k_gt_1) {
  for (size_t k_size = 2; k_size < 10; ++k_size) {
    GemmMicrokernelTester tester{};
    tester.mr(4).nr(4).kr(1).sr(1);
    tester.m(4).n(4).k(k_size);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_4x4__scalar_lrintf,
        xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
        xnn_qu8_requantize_fp32);
  }
}
41415
// Sweeps k over 2..9 at m=4, n=4 with a_stride set to 11.
TEST(QU8_GEMM_MINMAX_FP32_4X4__SCALAR_LRINTF, k_gt_1_strided_a) {
  for (size_t k_size = 2; k_size < 10; ++k_size) {
    GemmMicrokernelTester tester{};
    tester.mr(4).nr(4).kr(1).sr(1);
    tester.m(4).n(4).k(k_size);
    tester.a_stride(11);
    tester.Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_4x4__scalar_lrintf,
        xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
        xnn_qu8_requantize_fp32);
  }
}
41430
// Sweeps k over 2..9, n over 1..4 and m over 1..4, one iteration per
// configuration.
TEST(QU8_GEMM_MINMAX_FP32_4X4__SCALAR_LRINTF, k_gt_1_subtile) {
  for (size_t k_size = 2; k_size < 10; ++k_size) {
    for (uint32_t n_size = 1; n_size <= 4; ++n_size) {
      for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
        GemmMicrokernelTester tester{};
        tester.mr(4).nr(4).kr(1).sr(1);
        tester.m(m_size).n(n_size).k(k_size);
        tester.iterations(1);
        tester.Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_4x4__scalar_lrintf,
            xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
41449
// Sweeps n over 5..7 and k over {1, 3, 5} with m fixed at 4.
TEST(QU8_GEMM_MINMAX_FP32_4X4__SCALAR_LRINTF, n_gt_4) {
  for (uint32_t n_size = 5; n_size < 8; ++n_size) {
    for (size_t k_size = 1; k_size <= 5; k_size += 2) {
      GemmMicrokernelTester tester{};
      tester.mr(4).nr(4).kr(1).sr(1);
      tester.m(4).n(n_size).k(k_size);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_4x4__scalar_lrintf,
          xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
41465
// Sweeps n over 5..7 and k over {1, 3, 5} with m = 4 and cn_stride set to 7.
TEST(QU8_GEMM_MINMAX_FP32_4X4__SCALAR_LRINTF, n_gt_4_strided_cn) {
  for (uint32_t n_size = 5; n_size < 8; ++n_size) {
    for (size_t k_size = 1; k_size <= 5; k_size += 2) {
      GemmMicrokernelTester tester{};
      tester.mr(4).nr(4).kr(1).sr(1);
      tester.m(4).n(n_size).k(k_size);
      tester.cn_stride(7);
      tester.Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_4x4__scalar_lrintf,
          xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
41482
// Sweeps n over 5..7 and k over {1, 3, 5} with m = 4 and a_stride(7),
// which is larger than every k value in the sweep.
TEST(QU8_GEMM_MINMAX_FP32_4X4__SCALAR_LRINTF, n_gt_4_strided_a) {
  for (uint32_t n_dim = 5; n_dim <= 7; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
      GemmMicrokernelTester()
        .mr(4).nr(4).kr(1).sr(1)
        .m(4).n(n_dim).k(k_dim)
        .a_stride(7)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_4x4__scalar_lrintf,
          xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
41499
// Sweeps n over 5..7, k over {1, 3, 5} and m over 1..4, with iterations(1)
// per configuration.
TEST(QU8_GEMM_MINMAX_FP32_4X4__SCALAR_LRINTF, n_gt_4_subtile) {
  for (uint32_t n_dim = 5; n_dim <= 7; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
      for (uint32_t m_dim = 1; m_dim <= 4; ++m_dim) {
        GemmMicrokernelTester()
          .mr(4).nr(4).kr(1).sr(1)
          .m(m_dim).n(n_dim).k(k_dim)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_4x4__scalar_lrintf,
            xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
41518
// Runs n = 8 and n = 12 (multiples of nr = 4) against k in {1, 3, 5}
// with m = 4.
TEST(QU8_GEMM_MINMAX_FP32_4X4__SCALAR_LRINTF, n_div_4) {
  for (uint32_t n_dim = 8; n_dim <= 12; n_dim += 4) {
    for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
      GemmMicrokernelTester()
        .mr(4).nr(4).kr(1).sr(1)
        .m(4).n(n_dim).k(k_dim)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_4x4__scalar_lrintf,
          xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
41534
// Runs n = 8 and n = 12 against k in {1, 3, 5} with m = 4 and cn_stride(7).
TEST(QU8_GEMM_MINMAX_FP32_4X4__SCALAR_LRINTF, n_div_4_strided_cn) {
  for (uint32_t n_dim = 8; n_dim <= 12; n_dim += 4) {
    for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
      GemmMicrokernelTester()
        .mr(4).nr(4).kr(1).sr(1)
        .m(4).n(n_dim).k(k_dim)
        .cn_stride(7)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_4x4__scalar_lrintf,
          xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
41551
// Runs n = 8 and n = 12 against k in {1, 3, 5} with m = 4 and a_stride(7),
// which is larger than every k value in the sweep.
TEST(QU8_GEMM_MINMAX_FP32_4X4__SCALAR_LRINTF, n_div_4_strided_a) {
  for (uint32_t n_dim = 8; n_dim <= 12; n_dim += 4) {
    for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
      GemmMicrokernelTester()
        .mr(4).nr(4).kr(1).sr(1)
        .m(4).n(n_dim).k(k_dim)
        .a_stride(7)
        .Test(
          xnn_qu8_gemm_minmax_fp32_ukernel_4x4__scalar_lrintf,
          xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
          xnn_qu8_requantize_fp32);
    }
  }
}
41568
// Runs n = 8 and n = 12 against k in {1, 3, 5} and m in 1..4, with
// iterations(1) per configuration.
TEST(QU8_GEMM_MINMAX_FP32_4X4__SCALAR_LRINTF, n_div_4_subtile) {
  for (uint32_t n_dim = 8; n_dim <= 12; n_dim += 4) {
    for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
      for (uint32_t m_dim = 1; m_dim <= 4; ++m_dim) {
        GemmMicrokernelTester()
          .mr(4).nr(4).kr(1).sr(1)
          .m(m_dim).n(n_dim).k(k_dim)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_4x4__scalar_lrintf,
            xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
41587
// Sweeps k over {1, 3, 5} and every (n, m) pair in 1..4 x 1..4 with
// cm_stride(7) and iterations(1) per configuration.
TEST(QU8_GEMM_MINMAX_FP32_4X4__SCALAR_LRINTF, strided_cm_subtile) {
  for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
    for (uint32_t n_dim = 1; n_dim <= 4; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 4; ++m_dim) {
        GemmMicrokernelTester()
          .mr(4).nr(4).kr(1).sr(1)
          .m(m_dim).n(n_dim).k(k_dim)
          .cm_stride(7)
          .iterations(1)
          .Test(
            xnn_qu8_gemm_minmax_fp32_ukernel_4x4__scalar_lrintf,
            xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
            xnn_qu8_requantize_fp32);
      }
    }
  }
}
41607
// Single 4x4 run at k = 1 with qmin set to 128.
TEST(QU8_GEMM_MINMAX_FP32_4X4__SCALAR_LRINTF, qmin) {
  GemmMicrokernelTester()
    .mr(4).nr(4).kr(1).sr(1)
    .m(4).n(4).k(1)
    .qmin(128)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_4x4__scalar_lrintf,
      xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
      xnn_qu8_requantize_fp32);
}
41620
// Single 4x4 run at k = 1 with qmax set to 128.
TEST(QU8_GEMM_MINMAX_FP32_4X4__SCALAR_LRINTF, qmax) {
  GemmMicrokernelTester()
    .mr(4).nr(4).kr(1).sr(1)
    .m(4).n(4).k(1)
    .qmax(128)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_4x4__scalar_lrintf,
      xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
      xnn_qu8_requantize_fp32);
}
41633
// Single 4x4 run at k = 1 with cm_stride(7).
TEST(QU8_GEMM_MINMAX_FP32_4X4__SCALAR_LRINTF, strided_cm) {
  GemmMicrokernelTester()
    .mr(4).nr(4).kr(1).sr(1)
    .m(4).n(4).k(1)
    .cm_stride(7)
    .Test(
      xnn_qu8_gemm_minmax_fp32_ukernel_4x4__scalar_lrintf,
      xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
      xnn_qu8_requantize_fp32);
}
41646
// Sweeps k over {1, 3, 5} on a 4x4 problem with a_zero_point(0).
TEST(QU8_GEMM_MINMAX_FP32_4X4__SCALAR_LRINTF, no_a_zero_point) {
  for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
    GemmMicrokernelTester()
      .mr(4).nr(4).kr(1).sr(1)
      .m(4).n(4).k(k_dim)
      .a_zero_point(0)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_4x4__scalar_lrintf,
        xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
        xnn_qu8_requantize_fp32);
  }
}
41661
// Sweeps k over {1, 3, 5} on a 4x4 problem with b_zero_point(0).
TEST(QU8_GEMM_MINMAX_FP32_4X4__SCALAR_LRINTF, no_b_zero_point) {
  for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
    GemmMicrokernelTester()
      .mr(4).nr(4).kr(1).sr(1)
      .m(4).n(4).k(k_dim)
      .b_zero_point(0)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_4x4__scalar_lrintf,
        xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
        xnn_qu8_requantize_fp32);
  }
}
41676
// Sweeps k over {1, 3, 5} on a 4x4 problem with both a_zero_point(0) and
// b_zero_point(0).
TEST(QU8_GEMM_MINMAX_FP32_4X4__SCALAR_LRINTF, no_zero_point) {
  for (size_t k_dim = 1; k_dim <= 5; k_dim += 2) {
    GemmMicrokernelTester()
      .mr(4).nr(4).kr(1).sr(1)
      .m(4).n(4).k(k_dim)
      .a_zero_point(0)
      .b_zero_point(0)
      .Test(
        xnn_qu8_gemm_minmax_fp32_ukernel_4x4__scalar_lrintf,
        xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
        xnn_qu8_requantize_fp32);
  }
}
41692