xref: /aosp_15_r20/external/XNNPACK/test/qs8-requantization.cc (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1 // Copyright (c) Facebook, Inc. and its affiliates.
2 // All rights reserved.
3 //
4 // Copyright 2020 Google LLC
5 //
6 // This source code is licensed under the BSD-style license found in the
7 // LICENSE file in the root directory of this source tree.
8 
9 #include <cmath>
10 #include <cstddef>
11 #include <cstdlib>
12 
13 #include <gtest/gtest.h>
14 
15 #include <xnnpack/common.h>
16 #include <xnnpack/isa-checks.h>
17 #include <xnnpack/requantization-stubs.h>
18 #include "requantization-tester.h"
19 
20 
21 /*
22  * Round-to-nearest, ties away from zero, scalar implementation using unsigned 32-bit arithmetics.
23  */
24 
// Runs TestExactDivideByPO2 on xnn_qs8_requantize_rndna__scalar_unsigned32 for
// every s in [1, 31], with qmin/qmax set to the int8_t limits.
TEST(QS8_RNDNA__SCALAR_UNSIGNED32, exact_divide_by_po2) {
  using Int8Limits = std::numeric_limits<int8_t>;
  for (uint32_t shift = 1; shift <= 31; ++shift) {
    RequantizationTester()
      .qmin(Int8Limits::min())
      .qmax(Int8Limits::max())
      .s(shift)
      .TestExactDivideByPO2(xnn_qs8_requantize_rndna__scalar_unsigned32);
  }
}
34 
// Runs TestExactDivideByPO2 on xnn_qs8_requantize_rndna__scalar_unsigned32 for
// every zero point in the int8_t range combined with every s in [1, 31].
TEST(QS8_RNDNA__SCALAR_UNSIGNED32, exact_divide_by_po2_with_zero_point) {
  using Int8Limits = std::numeric_limits<int8_t>;
  for (int32_t zp = Int8Limits::min(); zp <= Int8Limits::max(); ++zp) {
    for (uint32_t shift = 1; shift <= 31; ++shift) {
      RequantizationTester()
        .zero_point(zp)
        .qmin(Int8Limits::min())
        .qmax(Int8Limits::max())
        .s(shift)
        .TestExactDivideByPO2(xnn_qs8_requantize_rndna__scalar_unsigned32);
    }
  }
}
50 
// Runs TestDivideByPO2WithRoundingUp on xnn_qs8_requantize_rndna__scalar_unsigned32
// for every zero point in the int8_t range combined with every s in [1, 31].
TEST(QS8_RNDNA__SCALAR_UNSIGNED32, divide_by_po2_with_rounding_up) {
  using Int8Limits = std::numeric_limits<int8_t>;
  for (int32_t zp = Int8Limits::min(); zp <= Int8Limits::max(); ++zp) {
    for (uint32_t shift = 1; shift <= 31; ++shift) {
      RequantizationTester()
        .zero_point(zp)
        .qmin(Int8Limits::min())
        .qmax(Int8Limits::max())
        .s(shift)
        .TestDivideByPO2WithRoundingUp(xnn_qs8_requantize_rndna__scalar_unsigned32);
    }
  }
}
66 
// Runs TestDivideByPO2WithRoundingDown on xnn_qs8_requantize_rndna__scalar_unsigned32
// for every zero point in the int8_t range combined with every s in [1, 31].
TEST(QS8_RNDNA__SCALAR_UNSIGNED32, divide_by_po2_with_rounding_down) {
  using Int8Limits = std::numeric_limits<int8_t>;
  for (int32_t zp = Int8Limits::min(); zp <= Int8Limits::max(); ++zp) {
    for (uint32_t shift = 1; shift <= 31; ++shift) {
      RequantizationTester()
        .zero_point(zp)
        .qmin(Int8Limits::min())
        .qmax(Int8Limits::max())
        .s(shift)
        .TestDivideByPO2WithRoundingDown(xnn_qs8_requantize_rndna__scalar_unsigned32);
    }
  }
}
82 
// Runs TestDivideByPO2WithRoundingTiesAway on xnn_qs8_requantize_rndna__scalar_unsigned32
// for every zero point in the int8_t range combined with every s in [1, 31].
TEST(QS8_RNDNA__SCALAR_UNSIGNED32, divide_by_po2_with_rounding_away) {
  using Int8Limits = std::numeric_limits<int8_t>;
  for (int32_t zp = Int8Limits::min(); zp <= Int8Limits::max(); ++zp) {
    for (uint32_t shift = 1; shift <= 31; ++shift) {
      RequantizationTester()
        .zero_point(zp)
        .qmin(Int8Limits::min())
        .qmax(Int8Limits::max())
        .s(shift)
        .TestDivideByPO2WithRoundingTiesAway(xnn_qs8_requantize_rndna__scalar_unsigned32);
    }
  }
}
98 
// Runs TestSpecialCases on xnn_qs8_requantize_rndna__scalar_unsigned32 with
// qmin/qmax set to the int8_t limits.
TEST(QS8_RNDNA__SCALAR_UNSIGNED32, special_cases) {
  using Int8Limits = std::numeric_limits<int8_t>;
  RequantizationTester()
    .qmin(Int8Limits::min())
    .qmax(Int8Limits::max())
    .TestSpecialCases(xnn_qs8_requantize_rndna__scalar_unsigned32);
}
105 
// Runs TestRandomCasesRoundToNearestTiesAway on
// xnn_qs8_requantize_rndna__scalar_unsigned32 with iterations(100) and
// qmin/qmax set to the int8_t limits.
TEST(QS8_RNDNA__SCALAR_UNSIGNED32, random_cases) {
  using Int8Limits = std::numeric_limits<int8_t>;
  RequantizationTester()
    .qmin(Int8Limits::min())
    .qmax(Int8Limits::max())
    .iterations(100)
    .TestRandomCasesRoundToNearestTiesAway(xnn_qs8_requantize_rndna__scalar_unsigned32);
}
113 
114 
115 /*
116  * Round-to-nearest, ties away from zero, scalar implementation using unsigned 64-bit arithmetics.
117  */
118 
// Runs TestExactDivideByPO2 on xnn_qs8_requantize_rndna__scalar_unsigned64 for
// every s in [1, 31], with qmin/qmax set to the int8_t limits.
TEST(QS8_RNDNA__SCALAR_UNSIGNED64, exact_divide_by_po2) {
  const int8_t output_min = std::numeric_limits<int8_t>::min();
  const int8_t output_max = std::numeric_limits<int8_t>::max();
  uint32_t shift = 1;
  while (shift < 32) {
    RequantizationTester()
      .qmin(output_min)
      .qmax(output_max)
      .s(shift)
      .TestExactDivideByPO2(xnn_qs8_requantize_rndna__scalar_unsigned64);
    shift++;
  }
}
128 
// Runs TestExactDivideByPO2 on xnn_qs8_requantize_rndna__scalar_unsigned64 for
// every zero point in the int8_t range combined with every s in [1, 31].
TEST(QS8_RNDNA__SCALAR_UNSIGNED64, exact_divide_by_po2_with_zero_point) {
  const int8_t output_min = std::numeric_limits<int8_t>::min();
  const int8_t output_max = std::numeric_limits<int8_t>::max();
  int32_t zp = output_min;
  while (zp <= output_max) {
    uint32_t shift = 1;
    while (shift < 32) {
      RequantizationTester()
        .zero_point(zp)
        .qmin(output_min)
        .qmax(output_max)
        .s(shift)
        .TestExactDivideByPO2(xnn_qs8_requantize_rndna__scalar_unsigned64);
      shift++;
    }
    zp++;
  }
}
144 
// Runs TestDivideByPO2WithRoundingUp on xnn_qs8_requantize_rndna__scalar_unsigned64
// for every zero point in the int8_t range combined with every s in [1, 31].
TEST(QS8_RNDNA__SCALAR_UNSIGNED64, divide_by_po2_with_rounding_up) {
  const int8_t output_min = std::numeric_limits<int8_t>::min();
  const int8_t output_max = std::numeric_limits<int8_t>::max();
  int32_t zp = output_min;
  while (zp <= output_max) {
    uint32_t shift = 1;
    while (shift < 32) {
      RequantizationTester()
        .zero_point(zp)
        .qmin(output_min)
        .qmax(output_max)
        .s(shift)
        .TestDivideByPO2WithRoundingUp(xnn_qs8_requantize_rndna__scalar_unsigned64);
      shift++;
    }
    zp++;
  }
}
160 
// Runs TestDivideByPO2WithRoundingDown on xnn_qs8_requantize_rndna__scalar_unsigned64
// for every zero point in the int8_t range combined with every s in [1, 31].
TEST(QS8_RNDNA__SCALAR_UNSIGNED64, divide_by_po2_with_rounding_down) {
  const int8_t output_min = std::numeric_limits<int8_t>::min();
  const int8_t output_max = std::numeric_limits<int8_t>::max();
  int32_t zp = output_min;
  while (zp <= output_max) {
    uint32_t shift = 1;
    while (shift < 32) {
      RequantizationTester()
        .zero_point(zp)
        .qmin(output_min)
        .qmax(output_max)
        .s(shift)
        .TestDivideByPO2WithRoundingDown(xnn_qs8_requantize_rndna__scalar_unsigned64);
      shift++;
    }
    zp++;
  }
}
176 
// Runs TestDivideByPO2WithRoundingTiesAway on xnn_qs8_requantize_rndna__scalar_unsigned64
// for every zero point in the int8_t range combined with every s in [1, 31].
TEST(QS8_RNDNA__SCALAR_UNSIGNED64, divide_by_po2_with_rounding_away) {
  const int8_t output_min = std::numeric_limits<int8_t>::min();
  const int8_t output_max = std::numeric_limits<int8_t>::max();
  int32_t zp = output_min;
  while (zp <= output_max) {
    uint32_t shift = 1;
    while (shift < 32) {
      RequantizationTester()
        .zero_point(zp)
        .qmin(output_min)
        .qmax(output_max)
        .s(shift)
        .TestDivideByPO2WithRoundingTiesAway(xnn_qs8_requantize_rndna__scalar_unsigned64);
      shift++;
    }
    zp++;
  }
}
192 
// Runs TestSpecialCases on xnn_qs8_requantize_rndna__scalar_unsigned64 with
// qmin/qmax set to the int8_t limits.
TEST(QS8_RNDNA__SCALAR_UNSIGNED64, special_cases) {
  const int8_t output_min = std::numeric_limits<int8_t>::min();
  const int8_t output_max = std::numeric_limits<int8_t>::max();
  RequantizationTester()
    .qmin(output_min)
    .qmax(output_max)
    .TestSpecialCases(xnn_qs8_requantize_rndna__scalar_unsigned64);
}
199 
// Runs TestRandomCasesRoundToNearestTiesAway on
// xnn_qs8_requantize_rndna__scalar_unsigned64 with iterations(100) and
// qmin/qmax set to the int8_t limits.
TEST(QS8_RNDNA__SCALAR_UNSIGNED64, random_cases) {
  const int8_t output_min = std::numeric_limits<int8_t>::min();
  const int8_t output_max = std::numeric_limits<int8_t>::max();
  RequantizationTester()
    .qmin(output_min)
    .qmax(output_max)
    .iterations(100)
    .TestRandomCasesRoundToNearestTiesAway(xnn_qs8_requantize_rndna__scalar_unsigned64);
}
207 
208 
209 /*
210  * Round-to-nearest, ties away from zero, scalar implementation using signed 64-bit arithmetics.
211  */
212 
// Runs TestExactDivideByPO2 on xnn_qs8_requantize_rndna__scalar_signed64 for
// every s in [1, 31], with qmin/qmax set to the int8_t limits.
TEST(QS8_RNDNA__SCALAR_SIGNED64, exact_divide_by_po2) {
  const uint32_t s_end = 32;  // exclusive upper bound for s
  for (uint32_t shift = 1; shift != s_end; shift += 1) {
    RequantizationTester()
      .qmin(std::numeric_limits<int8_t>::min())
      .qmax(std::numeric_limits<int8_t>::max())
      .s(shift)
      .TestExactDivideByPO2(xnn_qs8_requantize_rndna__scalar_signed64);
  }
}
222 
// Runs TestExactDivideByPO2 on xnn_qs8_requantize_rndna__scalar_signed64 for
// every zero point in the int8_t range combined with every s in [1, 31].
TEST(QS8_RNDNA__SCALAR_SIGNED64, exact_divide_by_po2_with_zero_point) {
  const int32_t first_zero_point = std::numeric_limits<int8_t>::min();
  const int32_t last_zero_point = std::numeric_limits<int8_t>::max();
  const uint32_t s_end = 32;  // exclusive upper bound for s
  for (int32_t zp = first_zero_point; zp <= last_zero_point; zp += 1) {
    for (uint32_t shift = 1; shift != s_end; shift += 1) {
      RequantizationTester()
        .zero_point(zp)
        .qmin(std::numeric_limits<int8_t>::min())
        .qmax(std::numeric_limits<int8_t>::max())
        .s(shift)
        .TestExactDivideByPO2(xnn_qs8_requantize_rndna__scalar_signed64);
    }
  }
}
238 
// Runs TestDivideByPO2WithRoundingUp on xnn_qs8_requantize_rndna__scalar_signed64
// for every zero point in the int8_t range combined with every s in [1, 31].
TEST(QS8_RNDNA__SCALAR_SIGNED64, divide_by_po2_with_rounding_up) {
  const int32_t first_zero_point = std::numeric_limits<int8_t>::min();
  const int32_t last_zero_point = std::numeric_limits<int8_t>::max();
  const uint32_t s_end = 32;  // exclusive upper bound for s
  for (int32_t zp = first_zero_point; zp <= last_zero_point; zp += 1) {
    for (uint32_t shift = 1; shift != s_end; shift += 1) {
      RequantizationTester()
        .zero_point(zp)
        .qmin(std::numeric_limits<int8_t>::min())
        .qmax(std::numeric_limits<int8_t>::max())
        .s(shift)
        .TestDivideByPO2WithRoundingUp(xnn_qs8_requantize_rndna__scalar_signed64);
    }
  }
}
254 
// Runs TestDivideByPO2WithRoundingDown on xnn_qs8_requantize_rndna__scalar_signed64
// for every zero point in the int8_t range combined with every s in [1, 31].
TEST(QS8_RNDNA__SCALAR_SIGNED64, divide_by_po2_with_rounding_down) {
  const int32_t first_zero_point = std::numeric_limits<int8_t>::min();
  const int32_t last_zero_point = std::numeric_limits<int8_t>::max();
  const uint32_t s_end = 32;  // exclusive upper bound for s
  for (int32_t zp = first_zero_point; zp <= last_zero_point; zp += 1) {
    for (uint32_t shift = 1; shift != s_end; shift += 1) {
      RequantizationTester()
        .zero_point(zp)
        .qmin(std::numeric_limits<int8_t>::min())
        .qmax(std::numeric_limits<int8_t>::max())
        .s(shift)
        .TestDivideByPO2WithRoundingDown(xnn_qs8_requantize_rndna__scalar_signed64);
    }
  }
}
270 
// Runs TestDivideByPO2WithRoundingTiesAway on xnn_qs8_requantize_rndna__scalar_signed64
// for every zero point in the int8_t range combined with every s in [1, 31].
TEST(QS8_RNDNA__SCALAR_SIGNED64, divide_by_po2_with_rounding_away) {
  const int32_t first_zero_point = std::numeric_limits<int8_t>::min();
  const int32_t last_zero_point = std::numeric_limits<int8_t>::max();
  const uint32_t s_end = 32;  // exclusive upper bound for s
  for (int32_t zp = first_zero_point; zp <= last_zero_point; zp += 1) {
    for (uint32_t shift = 1; shift != s_end; shift += 1) {
      RequantizationTester()
        .zero_point(zp)
        .qmin(std::numeric_limits<int8_t>::min())
        .qmax(std::numeric_limits<int8_t>::max())
        .s(shift)
        .TestDivideByPO2WithRoundingTiesAway(xnn_qs8_requantize_rndna__scalar_signed64);
    }
  }
}
286 
// Runs TestSpecialCases on xnn_qs8_requantize_rndna__scalar_signed64 with
// qmin/qmax set to the int8_t limits.
TEST(QS8_RNDNA__SCALAR_SIGNED64, special_cases) {
  const auto lowest_output = std::numeric_limits<int8_t>::min();
  const auto highest_output = std::numeric_limits<int8_t>::max();
  RequantizationTester()
    .qmin(lowest_output)
    .qmax(highest_output)
    .TestSpecialCases(xnn_qs8_requantize_rndna__scalar_signed64);
}
293 
// Runs TestRandomCasesRoundToNearestTiesAway on
// xnn_qs8_requantize_rndna__scalar_signed64 with iterations(100) and
// qmin/qmax set to the int8_t limits.
TEST(QS8_RNDNA__SCALAR_SIGNED64, random_cases) {
  const auto lowest_output = std::numeric_limits<int8_t>::min();
  const auto highest_output = std::numeric_limits<int8_t>::max();
  RequantizationTester()
    .qmin(lowest_output)
    .qmax(highest_output)
    .iterations(100)
    .TestRandomCasesRoundToNearestTiesAway(xnn_qs8_requantize_rndna__scalar_signed64);
}
301 
302 
303 /*
304  * Round-to-nearest, ties up, scalar implementation using signed 64-bit arithmetics.
305  */
306 
// Runs TestExactDivideByPO2 on xnn_qs8_requantize_rndnu__scalar for every s in
// [1, 31], with qmin/qmax set to the int8_t limits.
TEST(QS8_RNDNU__SCALAR, exact_divide_by_po2) {
  using Int8Limits = std::numeric_limits<int8_t>;
  for (uint32_t shift = 1; shift <= 31; ++shift) {
    RequantizationTester()
      .qmin(Int8Limits::min())
      .qmax(Int8Limits::max())
      .s(shift)
      .TestExactDivideByPO2(xnn_qs8_requantize_rndnu__scalar);
  }
}
316 
// Runs TestExactDivideByPO2 on xnn_qs8_requantize_rndnu__scalar for every zero
// point in the int8_t range combined with every s in [1, 31].
TEST(QS8_RNDNU__SCALAR, exact_divide_by_po2_with_zero_point) {
  using Int8Limits = std::numeric_limits<int8_t>;
  for (int32_t zp = Int8Limits::min(); zp <= Int8Limits::max(); ++zp) {
    for (uint32_t shift = 1; shift <= 31; ++shift) {
      RequantizationTester()
        .zero_point(zp)
        .qmin(Int8Limits::min())
        .qmax(Int8Limits::max())
        .s(shift)
        .TestExactDivideByPO2(xnn_qs8_requantize_rndnu__scalar);
    }
  }
}
332 
// Runs TestDivideByPO2WithRoundingUp on xnn_qs8_requantize_rndnu__scalar for
// every zero point in the int8_t range combined with every s in [1, 31].
TEST(QS8_RNDNU__SCALAR, divide_by_po2_with_rounding_up) {
  using Int8Limits = std::numeric_limits<int8_t>;
  for (int32_t zp = Int8Limits::min(); zp <= Int8Limits::max(); ++zp) {
    for (uint32_t shift = 1; shift <= 31; ++shift) {
      RequantizationTester()
        .zero_point(zp)
        .qmin(Int8Limits::min())
        .qmax(Int8Limits::max())
        .s(shift)
        .TestDivideByPO2WithRoundingUp(xnn_qs8_requantize_rndnu__scalar);
    }
  }
}
348 
// Runs TestDivideByPO2WithRoundingDown on xnn_qs8_requantize_rndnu__scalar for
// every zero point in the int8_t range combined with every s in [1, 31].
TEST(QS8_RNDNU__SCALAR, divide_by_po2_with_rounding_down) {
  using Int8Limits = std::numeric_limits<int8_t>;
  for (int32_t zp = Int8Limits::min(); zp <= Int8Limits::max(); ++zp) {
    for (uint32_t shift = 1; shift <= 31; ++shift) {
      RequantizationTester()
        .zero_point(zp)
        .qmin(Int8Limits::min())
        .qmax(Int8Limits::max())
        .s(shift)
        .TestDivideByPO2WithRoundingDown(xnn_qs8_requantize_rndnu__scalar);
    }
  }
}
364 
// Runs TestDivideByPO2WithRoundingTiesUp on xnn_qs8_requantize_rndnu__scalar for
// every zero point in the int8_t range combined with every s in [1, 31].
// NOTE: the test is named `rounding_away`, but the check it invokes is the
// ties-up one (TestDivideByPO2WithRoundingTiesUp).
TEST(QS8_RNDNU__SCALAR, divide_by_po2_with_rounding_away) {
  using Int8Limits = std::numeric_limits<int8_t>;
  for (int32_t zp = Int8Limits::min(); zp <= Int8Limits::max(); ++zp) {
    for (uint32_t shift = 1; shift <= 31; ++shift) {
      RequantizationTester()
        .zero_point(zp)
        .qmin(Int8Limits::min())
        .qmax(Int8Limits::max())
        .s(shift)
        .TestDivideByPO2WithRoundingTiesUp(xnn_qs8_requantize_rndnu__scalar);
    }
  }
}
380 
// Runs TestRandomCasesRoundToNearestTiesUp on xnn_qs8_requantize_rndnu__scalar
// with iterations(100) and qmin/qmax set to the int8_t limits.
TEST(QS8_RNDNU__SCALAR, random_cases) {
  using Int8Limits = std::numeric_limits<int8_t>;
  RequantizationTester()
    .qmin(Int8Limits::min())
    .qmax(Int8Limits::max())
    .iterations(100)
    .TestRandomCasesRoundToNearestTiesUp(xnn_qs8_requantize_rndnu__scalar);
}
388 
389 
390 /*
391  * FP32-based scalar implementation using lrintf function.
392  */
393 
// Runs TestRandomCasesApproximate on xnn_qs8_requantize_fp32__scalar_lrintf with
// iterations(1000) and qmin/qmax set to the int8_t limits.
TEST(QS8_FP32__SCALAR_LRINTF, random_cases) {
  const int8_t output_min = std::numeric_limits<int8_t>::min();
  const int8_t output_max = std::numeric_limits<int8_t>::max();
  RequantizationTester()
    .qmin(output_min)
    .qmax(output_max)
    .iterations(1000)
    .TestRandomCasesApproximate(xnn_qs8_requantize_fp32__scalar_lrintf);
}
401 
402 
403 /*
404  * FP32-based scalar implementation using magic trick for FP32->INT32 conversion.
405  */
406 
// Runs TestRandomCasesApproximate on xnn_qs8_requantize_fp32__scalar_fmagic with
// iterations(1000) and qmin/qmax set to the int8_t limits.
TEST(QS8_FP32__SCALAR_FMAGIC, random_cases) {
  const int8_t output_min = std::numeric_limits<int8_t>::min();
  const int8_t output_max = std::numeric_limits<int8_t>::max();
  RequantizationTester()
    .qmin(output_min)
    .qmax(output_max)
    .iterations(1000)
    .TestRandomCasesApproximate(xnn_qs8_requantize_fp32__scalar_fmagic);
}
414 
415 
416 /*
417  * GEMMLOWP-equivalent scalar implementation.
418  */
419 
// Runs TestExactDivideByPO2 on xnn_qs8_requantize_gemmlowp__scalar for every s
// in [1, 31], with qmin/qmax set to the int8_t limits.
TEST(QS8_GEMMLOWP__SCALAR, exact_divide_by_po2) {
  const uint32_t s_end = 32;  // exclusive upper bound for s
  for (uint32_t shift = 1; shift != s_end; shift += 1) {
    RequantizationTester()
      .qmin(std::numeric_limits<int8_t>::min())
      .qmax(std::numeric_limits<int8_t>::max())
      .s(shift)
      .TestExactDivideByPO2(xnn_qs8_requantize_gemmlowp__scalar);
  }
}
429 
// Runs TestExactDivideByPO2 on xnn_qs8_requantize_gemmlowp__scalar for every
// zero point in the int8_t range combined with every s in [1, 31].
TEST(QS8_GEMMLOWP__SCALAR, exact_divide_by_po2_with_zero_point) {
  const int32_t first_zero_point = std::numeric_limits<int8_t>::min();
  const int32_t last_zero_point = std::numeric_limits<int8_t>::max();
  const uint32_t s_end = 32;  // exclusive upper bound for s
  for (int32_t zp = first_zero_point; zp <= last_zero_point; zp += 1) {
    for (uint32_t shift = 1; shift != s_end; shift += 1) {
      RequantizationTester()
        .zero_point(zp)
        .qmin(std::numeric_limits<int8_t>::min())
        .qmax(std::numeric_limits<int8_t>::max())
        .s(shift)
        .TestExactDivideByPO2(xnn_qs8_requantize_gemmlowp__scalar);
    }
  }
}
445 
// Runs TestDivideByPO2WithRoundingUp on xnn_qs8_requantize_gemmlowp__scalar for
// every zero point in the int8_t range combined with every s in [1, 31].
TEST(QS8_GEMMLOWP__SCALAR, divide_by_po2_with_rounding_up) {
  const int32_t first_zero_point = std::numeric_limits<int8_t>::min();
  const int32_t last_zero_point = std::numeric_limits<int8_t>::max();
  const uint32_t s_end = 32;  // exclusive upper bound for s
  for (int32_t zp = first_zero_point; zp <= last_zero_point; zp += 1) {
    for (uint32_t shift = 1; shift != s_end; shift += 1) {
      RequantizationTester()
        .zero_point(zp)
        .qmin(std::numeric_limits<int8_t>::min())
        .qmax(std::numeric_limits<int8_t>::max())
        .s(shift)
        .TestDivideByPO2WithRoundingUp(xnn_qs8_requantize_gemmlowp__scalar);
    }
  }
}
461 
462 /* No rounding down test - it fails because of upward bias in multiplication */
463 /* No rounding away test - it fails because of upward bias in multiplication */
464 
// Runs TestSpecialCases on xnn_qs8_requantize_gemmlowp__scalar with qmin/qmax
// set to the int8_t limits.
TEST(QS8_GEMMLOWP__SCALAR, special_cases) {
  const auto lowest_output = std::numeric_limits<int8_t>::min();
  const auto highest_output = std::numeric_limits<int8_t>::max();
  RequantizationTester()
    .qmin(lowest_output)
    .qmax(highest_output)
    .TestSpecialCases(xnn_qs8_requantize_gemmlowp__scalar);
}
471 
// Runs TestRandomCasesApproximate on xnn_qs8_requantize_gemmlowp__scalar with
// iterations(100) and qmin/qmax set to the int8_t limits.
TEST(QS8_GEMMLOWP__SCALAR, random_cases) {
  const auto lowest_output = std::numeric_limits<int8_t>::min();
  const auto highest_output = std::numeric_limits<int8_t>::max();
  RequantizationTester()
    .qmin(lowest_output)
    .qmax(highest_output)
    .iterations(100)
    .TestRandomCasesApproximate(xnn_qs8_requantize_gemmlowp__scalar);
}
479 
480 
481 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
482   /*
483    * Round-to-nearest, ties away from zero, SSE2 implementation using floating-point shuffle.
484    */
485 
// Runs TestExactDivideByPO2 on xnn_qs8_requantize_rndna__sse2 for every s in
// [1, 31], with qmin/qmax set to the int8_t limits.
TEST(QS8_RNDNA__SSE2, exact_divide_by_po2) {
  const int8_t output_min = std::numeric_limits<int8_t>::min();
  const int8_t output_max = std::numeric_limits<int8_t>::max();
  uint32_t shift = 1;
  while (shift < 32) {
    RequantizationTester()
      .qmin(output_min)
      .qmax(output_max)
      .s(shift)
      .TestExactDivideByPO2(xnn_qs8_requantize_rndna__sse2);
    shift++;
  }
}
495 
// Runs TestExactDivideByPO2 on xnn_qs8_requantize_rndna__sse2 for every zero
// point in the int8_t range combined with every s in [1, 31].
TEST(QS8_RNDNA__SSE2, exact_divide_by_po2_with_zero_point) {
  const int8_t output_min = std::numeric_limits<int8_t>::min();
  const int8_t output_max = std::numeric_limits<int8_t>::max();
  int32_t zp = output_min;
  while (zp <= output_max) {
    uint32_t shift = 1;
    while (shift < 32) {
      RequantizationTester()
        .zero_point(zp)
        .qmin(output_min)
        .qmax(output_max)
        .s(shift)
        .TestExactDivideByPO2(xnn_qs8_requantize_rndna__sse2);
      shift++;
    }
    zp++;
  }
}
511 
// Runs TestDivideByPO2WithRoundingUp on xnn_qs8_requantize_rndna__sse2 for every
// zero point in the int8_t range combined with every s in [1, 31].
TEST(QS8_RNDNA__SSE2, divide_by_po2_with_rounding_up) {
  const int8_t output_min = std::numeric_limits<int8_t>::min();
  const int8_t output_max = std::numeric_limits<int8_t>::max();
  int32_t zp = output_min;
  while (zp <= output_max) {
    uint32_t shift = 1;
    while (shift < 32) {
      RequantizationTester()
        .zero_point(zp)
        .qmin(output_min)
        .qmax(output_max)
        .s(shift)
        .TestDivideByPO2WithRoundingUp(xnn_qs8_requantize_rndna__sse2);
      shift++;
    }
    zp++;
  }
}
527 
// Runs TestDivideByPO2WithRoundingDown on xnn_qs8_requantize_rndna__sse2 for
// every zero point in the int8_t range combined with every s in [1, 31].
TEST(QS8_RNDNA__SSE2, divide_by_po2_with_rounding_down) {
  const int8_t output_min = std::numeric_limits<int8_t>::min();
  const int8_t output_max = std::numeric_limits<int8_t>::max();
  int32_t zp = output_min;
  while (zp <= output_max) {
    uint32_t shift = 1;
    while (shift < 32) {
      RequantizationTester()
        .zero_point(zp)
        .qmin(output_min)
        .qmax(output_max)
        .s(shift)
        .TestDivideByPO2WithRoundingDown(xnn_qs8_requantize_rndna__sse2);
      shift++;
    }
    zp++;
  }
}
543 
// Runs TestDivideByPO2WithRoundingTiesAway on xnn_qs8_requantize_rndna__sse2 for
// every zero point in the int8_t range combined with every s in [1, 31].
TEST(QS8_RNDNA__SSE2, divide_by_po2_with_rounding_away) {
  const int8_t output_min = std::numeric_limits<int8_t>::min();
  const int8_t output_max = std::numeric_limits<int8_t>::max();
  int32_t zp = output_min;
  while (zp <= output_max) {
    uint32_t shift = 1;
    while (shift < 32) {
      RequantizationTester()
        .zero_point(zp)
        .qmin(output_min)
        .qmax(output_max)
        .s(shift)
        .TestDivideByPO2WithRoundingTiesAway(xnn_qs8_requantize_rndna__sse2);
      shift++;
    }
    zp++;
  }
}
559 
// Runs TestSpecialCases on xnn_qs8_requantize_rndna__sse2 with qmin/qmax set to
// the int8_t limits.
TEST(QS8_RNDNA__SSE2, special_cases) {
  const int8_t output_min = std::numeric_limits<int8_t>::min();
  const int8_t output_max = std::numeric_limits<int8_t>::max();
  RequantizationTester()
    .qmin(output_min)
    .qmax(output_max)
    .TestSpecialCases(xnn_qs8_requantize_rndna__sse2);
}
566 
// Runs TestRandomCasesRoundToNearestTiesAway on xnn_qs8_requantize_rndna__sse2
// with iterations(100) and qmin/qmax set to the int8_t limits.
TEST(QS8_RNDNA__SSE2, random_cases) {
  const int8_t output_min = std::numeric_limits<int8_t>::min();
  const int8_t output_max = std::numeric_limits<int8_t>::max();
  RequantizationTester()
    .qmin(output_min)
    .qmax(output_max)
    .iterations(100)
    .TestRandomCasesRoundToNearestTiesAway(xnn_qs8_requantize_rndna__sse2);
}
574 
575 
576   /*
577    * Round-to-nearest, ties away from zero, SSSE3 implementation using floating-point shuffle.
578    */
579 
// Runs TestExactDivideByPO2 on xnn_qs8_requantize_rndna__ssse3 for every s in
// [1, 31], with qmin/qmax set to the int8_t limits. TEST_REQUIRES_X86_SSSE3 is
// evaluated first, before any tester is built.
TEST(QS8_RNDNA__SSSE3, exact_divide_by_po2) {
  TEST_REQUIRES_X86_SSSE3;
  using Int8Limits = std::numeric_limits<int8_t>;
  for (uint32_t shift = 1; shift <= 31; ++shift) {
    RequantizationTester()
      .qmin(Int8Limits::min())
      .qmax(Int8Limits::max())
      .s(shift)
      .TestExactDivideByPO2(xnn_qs8_requantize_rndna__ssse3);
  }
}
590 
// Runs TestExactDivideByPO2 on xnn_qs8_requantize_rndna__ssse3 for every zero
// point in the int8_t range combined with every s in [1, 31].
// TEST_REQUIRES_X86_SSSE3 is evaluated first, before any tester is built.
TEST(QS8_RNDNA__SSSE3, exact_divide_by_po2_with_zero_point) {
  TEST_REQUIRES_X86_SSSE3;
  using Int8Limits = std::numeric_limits<int8_t>;
  for (int32_t zp = Int8Limits::min(); zp <= Int8Limits::max(); ++zp) {
    for (uint32_t shift = 1; shift <= 31; ++shift) {
      RequantizationTester()
        .zero_point(zp)
        .qmin(Int8Limits::min())
        .qmax(Int8Limits::max())
        .s(shift)
        .TestExactDivideByPO2(xnn_qs8_requantize_rndna__ssse3);
    }
  }
}
607 
// Runs TestDivideByPO2WithRoundingUp on xnn_qs8_requantize_rndna__ssse3 for every
// zero point in the int8_t range combined with every s in [1, 31].
// TEST_REQUIRES_X86_SSSE3 is evaluated first, before any tester is built.
TEST(QS8_RNDNA__SSSE3, divide_by_po2_with_rounding_up) {
  TEST_REQUIRES_X86_SSSE3;
  using Int8Limits = std::numeric_limits<int8_t>;
  for (int32_t zp = Int8Limits::min(); zp <= Int8Limits::max(); ++zp) {
    for (uint32_t shift = 1; shift <= 31; ++shift) {
      RequantizationTester()
        .zero_point(zp)
        .qmin(Int8Limits::min())
        .qmax(Int8Limits::max())
        .s(shift)
        .TestDivideByPO2WithRoundingUp(xnn_qs8_requantize_rndna__ssse3);
    }
  }
}
624 
// Runs TestDivideByPO2WithRoundingDown on xnn_qs8_requantize_rndna__ssse3 for
// every zero point in the int8_t range combined with every s in [1, 31].
// TEST_REQUIRES_X86_SSSE3 is evaluated first, before any tester is built.
TEST(QS8_RNDNA__SSSE3, divide_by_po2_with_rounding_down) {
  TEST_REQUIRES_X86_SSSE3;
  using Int8Limits = std::numeric_limits<int8_t>;
  for (int32_t zp = Int8Limits::min(); zp <= Int8Limits::max(); ++zp) {
    for (uint32_t shift = 1; shift <= 31; ++shift) {
      RequantizationTester()
        .zero_point(zp)
        .qmin(Int8Limits::min())
        .qmax(Int8Limits::max())
        .s(shift)
        .TestDivideByPO2WithRoundingDown(xnn_qs8_requantize_rndna__ssse3);
    }
  }
}
641 
// Runs TestDivideByPO2WithRoundingTiesAway on xnn_qs8_requantize_rndna__ssse3 for
// every zero point in the int8_t range combined with every s in [1, 31].
// TEST_REQUIRES_X86_SSSE3 is evaluated first, before any tester is built.
TEST(QS8_RNDNA__SSSE3, divide_by_po2_with_rounding_away) {
  TEST_REQUIRES_X86_SSSE3;
  using Int8Limits = std::numeric_limits<int8_t>;
  for (int32_t zp = Int8Limits::min(); zp <= Int8Limits::max(); ++zp) {
    for (uint32_t shift = 1; shift <= 31; ++shift) {
      RequantizationTester()
        .zero_point(zp)
        .qmin(Int8Limits::min())
        .qmax(Int8Limits::max())
        .s(shift)
        .TestDivideByPO2WithRoundingTiesAway(xnn_qs8_requantize_rndna__ssse3);
    }
  }
}
658 
// Runs TestSpecialCases on xnn_qs8_requantize_rndna__ssse3 with qmin/qmax set to
// the int8_t limits. TEST_REQUIRES_X86_SSSE3 is evaluated first.
TEST(QS8_RNDNA__SSSE3, special_cases) {
  TEST_REQUIRES_X86_SSSE3;
  using Int8Limits = std::numeric_limits<int8_t>;
  RequantizationTester()
    .qmin(Int8Limits::min())
    .qmax(Int8Limits::max())
    .TestSpecialCases(xnn_qs8_requantize_rndna__ssse3);
}
666 
// Runs TestRandomCasesRoundToNearestTiesAway on xnn_qs8_requantize_rndna__ssse3
// with iterations(100) and qmin/qmax set to the int8_t limits.
// TEST_REQUIRES_X86_SSSE3 is evaluated first.
TEST(QS8_RNDNA__SSSE3, random_cases) {
  TEST_REQUIRES_X86_SSSE3;
  using Int8Limits = std::numeric_limits<int8_t>;
  RequantizationTester()
    .qmin(Int8Limits::min())
    .qmax(Int8Limits::max())
    .iterations(100)
    .TestRandomCasesRoundToNearestTiesAway(xnn_qs8_requantize_rndna__ssse3);
}
675 
676 
677   /*
678    * Round-to-nearest, ties away from zero, SSE4.1 implementation using static blend instruction.
679    */
680 
// Runs TestExactDivideByPO2 on xnn_qs8_requantize_rndna__sse4 for every s in
// [1, 31], with qmin/qmax set to the int8_t limits. TEST_REQUIRES_X86_SSE41 is
// evaluated first, before any tester is built.
TEST(QS8_RNDNA__SSE4, exact_divide_by_po2) {
  TEST_REQUIRES_X86_SSE41;
  const uint32_t s_end = 32;  // exclusive upper bound for s
  for (uint32_t shift = 1; shift != s_end; shift += 1) {
    RequantizationTester()
      .qmin(std::numeric_limits<int8_t>::min())
      .qmax(std::numeric_limits<int8_t>::max())
      .s(shift)
      .TestExactDivideByPO2(xnn_qs8_requantize_rndna__sse4);
  }
}
691 
// Runs TestExactDivideByPO2 on xnn_qs8_requantize_rndna__sse4 for every zero
// point in the int8_t range combined with every s in [1, 31].
// TEST_REQUIRES_X86_SSE41 is evaluated first, before any tester is built.
TEST(QS8_RNDNA__SSE4, exact_divide_by_po2_with_zero_point) {
  TEST_REQUIRES_X86_SSE41;
  const int32_t first_zero_point = std::numeric_limits<int8_t>::min();
  const int32_t last_zero_point = std::numeric_limits<int8_t>::max();
  const uint32_t s_end = 32;  // exclusive upper bound for s
  for (int32_t zp = first_zero_point; zp <= last_zero_point; zp += 1) {
    for (uint32_t shift = 1; shift != s_end; shift += 1) {
      RequantizationTester()
        .zero_point(zp)
        .qmin(std::numeric_limits<int8_t>::min())
        .qmax(std::numeric_limits<int8_t>::max())
        .s(shift)
        .TestExactDivideByPO2(xnn_qs8_requantize_rndna__sse4);
    }
  }
}
708 
// Runs TestDivideByPO2WithRoundingUp on xnn_qs8_requantize_rndna__sse4 for every
// zero point in the int8_t range combined with every s in [1, 31].
// TEST_REQUIRES_X86_SSE41 is evaluated first, before any tester is built.
TEST(QS8_RNDNA__SSE4, divide_by_po2_with_rounding_up) {
  TEST_REQUIRES_X86_SSE41;
  const int32_t first_zero_point = std::numeric_limits<int8_t>::min();
  const int32_t last_zero_point = std::numeric_limits<int8_t>::max();
  const uint32_t s_end = 32;  // exclusive upper bound for s
  for (int32_t zp = first_zero_point; zp <= last_zero_point; zp += 1) {
    for (uint32_t shift = 1; shift != s_end; shift += 1) {
      RequantizationTester()
        .zero_point(zp)
        .qmin(std::numeric_limits<int8_t>::min())
        .qmax(std::numeric_limits<int8_t>::max())
        .s(shift)
        .TestDivideByPO2WithRoundingUp(xnn_qs8_requantize_rndna__sse4);
    }
  }
}
725 
// Runs TestDivideByPO2WithRoundingDown on xnn_qs8_requantize_rndna__sse4 for
// every zero point in the int8_t range combined with every s in [1, 31].
// TEST_REQUIRES_X86_SSE41 is evaluated first, before any tester is built.
TEST(QS8_RNDNA__SSE4, divide_by_po2_with_rounding_down) {
  TEST_REQUIRES_X86_SSE41;
  const int32_t first_zero_point = std::numeric_limits<int8_t>::min();
  const int32_t last_zero_point = std::numeric_limits<int8_t>::max();
  const uint32_t s_end = 32;  // exclusive upper bound for s
  for (int32_t zp = first_zero_point; zp <= last_zero_point; zp += 1) {
    for (uint32_t shift = 1; shift != s_end; shift += 1) {
      RequantizationTester()
        .zero_point(zp)
        .qmin(std::numeric_limits<int8_t>::min())
        .qmax(std::numeric_limits<int8_t>::max())
        .s(shift)
        .TestDivideByPO2WithRoundingDown(xnn_qs8_requantize_rndna__sse4);
    }
  }
}
742 
// Runs TestDivideByPO2WithRoundingTiesAway on xnn_qs8_requantize_rndna__sse4 for
// every zero point in the int8_t range combined with every s in [1, 31].
// TEST_REQUIRES_X86_SSE41 is evaluated first, before any tester is built.
TEST(QS8_RNDNA__SSE4, divide_by_po2_with_rounding_away) {
  TEST_REQUIRES_X86_SSE41;
  const int32_t first_zero_point = std::numeric_limits<int8_t>::min();
  const int32_t last_zero_point = std::numeric_limits<int8_t>::max();
  const uint32_t s_end = 32;  // exclusive upper bound for s
  for (int32_t zp = first_zero_point; zp <= last_zero_point; zp += 1) {
    for (uint32_t shift = 1; shift != s_end; shift += 1) {
      RequantizationTester()
        .zero_point(zp)
        .qmin(std::numeric_limits<int8_t>::min())
        .qmax(std::numeric_limits<int8_t>::max())
        .s(shift)
        .TestDivideByPO2WithRoundingTiesAway(xnn_qs8_requantize_rndna__sse4);
    }
  }
}
759 
// Runs TestSpecialCases on xnn_qs8_requantize_rndna__sse4 with qmin/qmax set to
// the int8_t limits. TEST_REQUIRES_X86_SSE41 is evaluated first.
TEST(QS8_RNDNA__SSE4, special_cases) {
  TEST_REQUIRES_X86_SSE41;
  const auto lowest_output = std::numeric_limits<int8_t>::min();
  const auto highest_output = std::numeric_limits<int8_t>::max();
  RequantizationTester()
    .qmin(lowest_output)
    .qmax(highest_output)
    .TestSpecialCases(xnn_qs8_requantize_rndna__sse4);
}
767 
// Runs TestRandomCasesRoundToNearestTiesAway on xnn_qs8_requantize_rndna__sse4
// with iterations(100) and qmin/qmax set to the int8_t limits.
// TEST_REQUIRES_X86_SSE41 is evaluated first.
TEST(QS8_RNDNA__SSE4, random_cases) {
  TEST_REQUIRES_X86_SSE41;
  const auto lowest_output = std::numeric_limits<int8_t>::min();
  const auto highest_output = std::numeric_limits<int8_t>::max();
  RequantizationTester()
    .qmin(lowest_output)
    .qmax(highest_output)
    .iterations(100)
    .TestRandomCasesRoundToNearestTiesAway(xnn_qs8_requantize_rndna__sse4);
}
776 
777 
778   /*
779    * Round-to-nearest, ties up, SSE4.1 implementation using arithmetic shift right.
780    */
781 
// Runs TestExactDivideByPO2 on xnn_qs8_requantize_rndnu__sse4_sra for every s in
// [1, 31], with qmin/qmax set to the int8_t limits. TEST_REQUIRES_X86_SSE41 is
// evaluated first, before any tester is built.
TEST(QS8_RNDNU__SSE4_SRA, exact_divide_by_po2) {
  TEST_REQUIRES_X86_SSE41;
  const int8_t output_min = std::numeric_limits<int8_t>::min();
  const int8_t output_max = std::numeric_limits<int8_t>::max();
  uint32_t shift = 1;
  while (shift < 32) {
    RequantizationTester()
      .qmin(output_min)
      .qmax(output_max)
      .s(shift)
      .TestExactDivideByPO2(xnn_qs8_requantize_rndnu__sse4_sra);
    shift++;
  }
}
792 
// Runs TestExactDivideByPO2 on xnn_qs8_requantize_rndnu__sse4_sra for every zero
// point in the int8_t range combined with every s in [1, 31].
// TEST_REQUIRES_X86_SSE41 is evaluated first, before any tester is built.
TEST(QS8_RNDNU__SSE4_SRA, exact_divide_by_po2_with_zero_point) {
  TEST_REQUIRES_X86_SSE41;
  const int8_t output_min = std::numeric_limits<int8_t>::min();
  const int8_t output_max = std::numeric_limits<int8_t>::max();
  int32_t zp = output_min;
  while (zp <= output_max) {
    uint32_t shift = 1;
    while (shift < 32) {
      RequantizationTester()
        .zero_point(zp)
        .qmin(output_min)
        .qmax(output_max)
        .s(shift)
        .TestExactDivideByPO2(xnn_qs8_requantize_rndnu__sse4_sra);
      shift++;
    }
    zp++;
  }
}
809 
// Runs TestDivideByPO2WithRoundingUp on xnn_qs8_requantize_rndnu__sse4_sra for
// every zero point in the int8_t range combined with every s in [1, 31].
// TEST_REQUIRES_X86_SSE41 is evaluated first, before any tester is built.
TEST(QS8_RNDNU__SSE4_SRA, divide_by_po2_with_rounding_up) {
  TEST_REQUIRES_X86_SSE41;
  const int8_t output_min = std::numeric_limits<int8_t>::min();
  const int8_t output_max = std::numeric_limits<int8_t>::max();
  int32_t zp = output_min;
  while (zp <= output_max) {
    uint32_t shift = 1;
    while (shift < 32) {
      RequantizationTester()
        .zero_point(zp)
        .qmin(output_min)
        .qmax(output_max)
        .s(shift)
        .TestDivideByPO2WithRoundingUp(xnn_qs8_requantize_rndnu__sse4_sra);
      shift++;
    }
    zp++;
  }
}
826 
// Runs TestDivideByPO2WithRoundingDown on xnn_qs8_requantize_rndnu__sse4_sra
// for every (zero_point, s) pair: zero_point over the int8_t range, s over
// [1, 31]. Gated by TEST_REQUIRES_X86_SSE41.
TEST(QS8_RNDNU__SSE4_SRA, divide_by_po2_with_rounding_down) {
  TEST_REQUIRES_X86_SSE41;
  const int8_t kInt8Min = std::numeric_limits<int8_t>::min();
  const int8_t kInt8Max = std::numeric_limits<int8_t>::max();
  for (int32_t zp = kInt8Min; zp <= kInt8Max; ++zp) {
    for (uint32_t s_value = 1; s_value < 32; ++s_value) {
      RequantizationTester()
          .zero_point(zp)
          .qmin(kInt8Min)
          .qmax(kInt8Max)
          .s(s_value)
          .TestDivideByPO2WithRoundingDown(xnn_qs8_requantize_rndnu__sse4_sra);
    }
  }
}
843 
// Runs TestDivideByPO2WithRoundingTiesUp on xnn_qs8_requantize_rndnu__sse4_sra
// for every (zero_point, s) pair: zero_point over the int8_t range, s over
// [1, 31]. Gated by TEST_REQUIRES_X86_SSE41.
// NOTE(review): the test is named "*_rounding_away" but invokes the
// "...TiesUp" check; the name is kept so existing test filters still match.
TEST(QS8_RNDNU__SSE4_SRA, divide_by_po2_with_rounding_away) {
  TEST_REQUIRES_X86_SSE41;
  const int8_t kInt8Min = std::numeric_limits<int8_t>::min();
  const int8_t kInt8Max = std::numeric_limits<int8_t>::max();
  for (int32_t zp = kInt8Min; zp <= kInt8Max; ++zp) {
    for (uint32_t s_value = 1; s_value < 32; ++s_value) {
      RequantizationTester()
          .zero_point(zp)
          .qmin(kInt8Min)
          .qmax(kInt8Max)
          .s(s_value)
          .TestDivideByPO2WithRoundingTiesUp(xnn_qs8_requantize_rndnu__sse4_sra);
    }
  }
}
860 
// Runs TestRandomCasesRoundToNearestTiesUp on
// xnn_qs8_requantize_rndnu__sse4_sra with iterations(100) and qmin/qmax set to
// the int8_t limits. Gated by TEST_REQUIRES_X86_SSE41.
TEST(QS8_RNDNU__SSE4_SRA, random_cases) {
  TEST_REQUIRES_X86_SSE41;
  const int8_t kInt8Min = std::numeric_limits<int8_t>::min();
  const int8_t kInt8Max = std::numeric_limits<int8_t>::max();
  RequantizationTester()
      .qmin(kInt8Min)
      .qmax(kInt8Max)
      .iterations(100)
      .TestRandomCasesRoundToNearestTiesUp(xnn_qs8_requantize_rndnu__sse4_sra);
}
869 
870 
871   /*
872    * Round-to-nearest, ties up, SSE4.1 implementation using logical shift right.
873    */
874 
// Runs TestExactDivideByPO2 on xnn_qs8_requantize_rndnu__sse4_srl for every
// s in [1, 31], with qmin/qmax set to the int8_t limits.
// Gated by TEST_REQUIRES_X86_SSE41.
TEST(QS8_RNDNU__SSE4_SRL, exact_divide_by_po2) {
  TEST_REQUIRES_X86_SSE41;
  const int8_t kInt8Min = std::numeric_limits<int8_t>::min();
  const int8_t kInt8Max = std::numeric_limits<int8_t>::max();
  for (uint32_t s_value = 1; s_value < 32; ++s_value) {
    RequantizationTester()
        .qmin(kInt8Min)
        .qmax(kInt8Max)
        .s(s_value)
        .TestExactDivideByPO2(xnn_qs8_requantize_rndnu__sse4_srl);
  }
}
885 
// Runs TestExactDivideByPO2 on xnn_qs8_requantize_rndnu__sse4_srl for every
// (zero_point, s) pair: zero_point over the int8_t range, s over [1, 31].
// Gated by TEST_REQUIRES_X86_SSE41.
TEST(QS8_RNDNU__SSE4_SRL, exact_divide_by_po2_with_zero_point) {
  TEST_REQUIRES_X86_SSE41;
  const int8_t kInt8Min = std::numeric_limits<int8_t>::min();
  const int8_t kInt8Max = std::numeric_limits<int8_t>::max();
  for (int32_t zp = kInt8Min; zp <= kInt8Max; ++zp) {
    for (uint32_t s_value = 1; s_value < 32; ++s_value) {
      RequantizationTester()
          .zero_point(zp)
          .qmin(kInt8Min)
          .qmax(kInt8Max)
          .s(s_value)
          .TestExactDivideByPO2(xnn_qs8_requantize_rndnu__sse4_srl);
    }
  }
}
902 
// Runs TestDivideByPO2WithRoundingUp on xnn_qs8_requantize_rndnu__sse4_srl for
// every (zero_point, s) pair: zero_point over the int8_t range, s over [1, 31].
// Gated by TEST_REQUIRES_X86_SSE41.
TEST(QS8_RNDNU__SSE4_SRL, divide_by_po2_with_rounding_up) {
  TEST_REQUIRES_X86_SSE41;
  const int8_t kInt8Min = std::numeric_limits<int8_t>::min();
  const int8_t kInt8Max = std::numeric_limits<int8_t>::max();
  for (int32_t zp = kInt8Min; zp <= kInt8Max; ++zp) {
    for (uint32_t s_value = 1; s_value < 32; ++s_value) {
      RequantizationTester()
          .zero_point(zp)
          .qmin(kInt8Min)
          .qmax(kInt8Max)
          .s(s_value)
          .TestDivideByPO2WithRoundingUp(xnn_qs8_requantize_rndnu__sse4_srl);
    }
  }
}
919 
// Runs TestDivideByPO2WithRoundingDown on xnn_qs8_requantize_rndnu__sse4_srl
// for every (zero_point, s) pair: zero_point over the int8_t range, s over
// [1, 31]. Gated by TEST_REQUIRES_X86_SSE41.
TEST(QS8_RNDNU__SSE4_SRL, divide_by_po2_with_rounding_down) {
  TEST_REQUIRES_X86_SSE41;
  const int8_t kInt8Min = std::numeric_limits<int8_t>::min();
  const int8_t kInt8Max = std::numeric_limits<int8_t>::max();
  for (int32_t zp = kInt8Min; zp <= kInt8Max; ++zp) {
    for (uint32_t s_value = 1; s_value < 32; ++s_value) {
      RequantizationTester()
          .zero_point(zp)
          .qmin(kInt8Min)
          .qmax(kInt8Max)
          .s(s_value)
          .TestDivideByPO2WithRoundingDown(xnn_qs8_requantize_rndnu__sse4_srl);
    }
  }
}
936 
// Runs TestDivideByPO2WithRoundingTiesUp on xnn_qs8_requantize_rndnu__sse4_srl
// for every (zero_point, s) pair: zero_point over the int8_t range, s over
// [1, 31]. Gated by TEST_REQUIRES_X86_SSE41.
// NOTE(review): the test is named "*_rounding_away" but invokes the
// "...TiesUp" check; the name is kept so existing test filters still match.
TEST(QS8_RNDNU__SSE4_SRL, divide_by_po2_with_rounding_away) {
  TEST_REQUIRES_X86_SSE41;
  const int8_t kInt8Min = std::numeric_limits<int8_t>::min();
  const int8_t kInt8Max = std::numeric_limits<int8_t>::max();
  for (int32_t zp = kInt8Min; zp <= kInt8Max; ++zp) {
    for (uint32_t s_value = 1; s_value < 32; ++s_value) {
      RequantizationTester()
          .zero_point(zp)
          .qmin(kInt8Min)
          .qmax(kInt8Max)
          .s(s_value)
          .TestDivideByPO2WithRoundingTiesUp(xnn_qs8_requantize_rndnu__sse4_srl);
    }
  }
}
953 
// Runs TestRandomCasesRoundToNearestTiesUp on
// xnn_qs8_requantize_rndnu__sse4_srl with iterations(100) and qmin/qmax set to
// the int8_t limits. Gated by TEST_REQUIRES_X86_SSE41.
TEST(QS8_RNDNU__SSE4_SRL, random_cases) {
  TEST_REQUIRES_X86_SSE41;
  const int8_t kInt8Min = std::numeric_limits<int8_t>::min();
  const int8_t kInt8Max = std::numeric_limits<int8_t>::max();
  RequantizationTester()
      .qmin(kInt8Min)
      .qmax(kInt8Max)
      .iterations(100)
      .TestRandomCasesRoundToNearestTiesUp(xnn_qs8_requantize_rndnu__sse4_srl);
}
962 
963 
964   /*
965    * FP32-based x86 SSE2 implementation.
966    */
967 
// Runs TestRandomCasesApproximate on xnn_qs8_requantize_fp32__sse2 with
// iterations(1000) and qmin/qmax set to the int8_t limits.
TEST(QS8_FP32__SSE2, random_cases) {
  const int8_t kInt8Min = std::numeric_limits<int8_t>::min();
  const int8_t kInt8Max = std::numeric_limits<int8_t>::max();
  RequantizationTester()
      .qmin(kInt8Min)
      .qmax(kInt8Max)
      .iterations(1000)
      .TestRandomCasesApproximate(xnn_qs8_requantize_fp32__sse2);
}
975 
976 
977   /*
978    * FP32-based x86 SSE4 implementation.
979    */
980 
// Runs TestRandomCasesApproximate on xnn_qs8_requantize_fp32__sse4 with
// iterations(1000) and qmin/qmax set to the int8_t limits.
TEST(QS8_FP32__SSE4, random_cases) {
  // This is an SSE4 kernel (per its name and section header), so gate it on
  // SSE4.1 like every other __sse4 test in this file rather than running it
  // unconditionally on hosts that may lack the instruction set.
  TEST_REQUIRES_X86_SSE41;
  RequantizationTester()
    .qmin(std::numeric_limits<int8_t>::min())
    .qmax(std::numeric_limits<int8_t>::max())
    .iterations(1000)
    .TestRandomCasesApproximate(xnn_qs8_requantize_fp32__sse4);
}
988 
989 
990   /*
991    * GEMMLOWP-equivalent x86 SSE2 implementation.
992    */
993 
// Runs TestExactDivideByPO2 on xnn_qs8_requantize_gemmlowp__sse2 for every
// s in [1, 31], with qmin/qmax set to the int8_t limits.
TEST(QS8_GEMMLOWP__SSE2, exact_divide_by_po2) {
  const int8_t kInt8Min = std::numeric_limits<int8_t>::min();
  const int8_t kInt8Max = std::numeric_limits<int8_t>::max();
  for (uint32_t s_value = 1; s_value < 32; ++s_value) {
    RequantizationTester()
        .qmin(kInt8Min)
        .qmax(kInt8Max)
        .s(s_value)
        .TestExactDivideByPO2(xnn_qs8_requantize_gemmlowp__sse2);
  }
}
1003 
// Runs TestExactDivideByPO2 on xnn_qs8_requantize_gemmlowp__sse2 for every
// (zero_point, s) pair: zero_point over the int8_t range, s over [1, 31].
TEST(QS8_GEMMLOWP__SSE2, exact_divide_by_po2_with_zero_point) {
  const int8_t kInt8Min = std::numeric_limits<int8_t>::min();
  const int8_t kInt8Max = std::numeric_limits<int8_t>::max();
  for (int32_t zp = kInt8Min; zp <= kInt8Max; ++zp) {
    for (uint32_t s_value = 1; s_value < 32; ++s_value) {
      RequantizationTester()
          .zero_point(zp)
          .qmin(kInt8Min)
          .qmax(kInt8Max)
          .s(s_value)
          .TestExactDivideByPO2(xnn_qs8_requantize_gemmlowp__sse2);
    }
  }
}
1019 
// Runs TestDivideByPO2WithRoundingUp on xnn_qs8_requantize_gemmlowp__sse2 for
// every (zero_point, s) pair: zero_point over the int8_t range, s over [1, 31].
TEST(QS8_GEMMLOWP__SSE2, divide_by_po2_with_rounding_up) {
  const int8_t kInt8Min = std::numeric_limits<int8_t>::min();
  const int8_t kInt8Max = std::numeric_limits<int8_t>::max();
  for (int32_t zp = kInt8Min; zp <= kInt8Max; ++zp) {
    for (uint32_t s_value = 1; s_value < 32; ++s_value) {
      RequantizationTester()
          .zero_point(zp)
          .qmin(kInt8Min)
          .qmax(kInt8Max)
          .s(s_value)
          .TestDivideByPO2WithRoundingUp(xnn_qs8_requantize_gemmlowp__sse2);
    }
  }
}
1035 
1036   /* No rounding down test - it fails because of upward bias in multiplication */
1037   /* No rounding away test - it fails because of upward bias in multiplication */
1038 
// Runs TestSpecialCases on xnn_qs8_requantize_gemmlowp__sse2 with qmin/qmax
// set to the int8_t limits.
TEST(QS8_GEMMLOWP__SSE2, special_cases) {
  const int8_t kInt8Min = std::numeric_limits<int8_t>::min();
  const int8_t kInt8Max = std::numeric_limits<int8_t>::max();
  RequantizationTester()
      .qmin(kInt8Min)
      .qmax(kInt8Max)
      .TestSpecialCases(xnn_qs8_requantize_gemmlowp__sse2);
}
1045 
// Runs TestRandomCasesApproximate on xnn_qs8_requantize_gemmlowp__sse2 with
// iterations(100) and qmin/qmax set to the int8_t limits.
TEST(QS8_GEMMLOWP__SSE2, random_cases) {
  const int8_t kInt8Min = std::numeric_limits<int8_t>::min();
  const int8_t kInt8Max = std::numeric_limits<int8_t>::max();
  RequantizationTester()
      .qmin(kInt8Min)
      .qmax(kInt8Max)
      .iterations(100)
      .TestRandomCasesApproximate(xnn_qs8_requantize_gemmlowp__sse2);
}
1053 
1054 
1055   /*
1056    * GEMMLOWP-equivalent x86 SSSE3 implementation.
1057    */
1058 
// Runs TestExactDivideByPO2 on xnn_qs8_requantize_gemmlowp__ssse3 for every
// s in [1, 31], with qmin/qmax set to the int8_t limits.
// Gated by TEST_REQUIRES_X86_SSSE3.
TEST(QS8_GEMMLOWP__SSSE3, exact_divide_by_po2) {
  TEST_REQUIRES_X86_SSSE3;
  const int8_t kInt8Min = std::numeric_limits<int8_t>::min();
  const int8_t kInt8Max = std::numeric_limits<int8_t>::max();
  for (uint32_t s_value = 1; s_value < 32; ++s_value) {
    RequantizationTester()
        .qmin(kInt8Min)
        .qmax(kInt8Max)
        .s(s_value)
        .TestExactDivideByPO2(xnn_qs8_requantize_gemmlowp__ssse3);
  }
}
1069 
// Runs TestExactDivideByPO2 on xnn_qs8_requantize_gemmlowp__ssse3 for every
// (zero_point, s) pair: zero_point over the int8_t range, s over [1, 31].
// Gated by TEST_REQUIRES_X86_SSSE3.
TEST(QS8_GEMMLOWP__SSSE3, exact_divide_by_po2_with_zero_point) {
  TEST_REQUIRES_X86_SSSE3;
  const int8_t kInt8Min = std::numeric_limits<int8_t>::min();
  const int8_t kInt8Max = std::numeric_limits<int8_t>::max();
  for (int32_t zp = kInt8Min; zp <= kInt8Max; ++zp) {
    for (uint32_t s_value = 1; s_value < 32; ++s_value) {
      RequantizationTester()
          .zero_point(zp)
          .qmin(kInt8Min)
          .qmax(kInt8Max)
          .s(s_value)
          .TestExactDivideByPO2(xnn_qs8_requantize_gemmlowp__ssse3);
    }
  }
}
1086 
// Runs TestDivideByPO2WithRoundingUp on xnn_qs8_requantize_gemmlowp__ssse3 for
// every (zero_point, s) pair: zero_point over the int8_t range, s over [1, 31].
// Gated by TEST_REQUIRES_X86_SSSE3.
TEST(QS8_GEMMLOWP__SSSE3, divide_by_po2_with_rounding_up) {
  TEST_REQUIRES_X86_SSSE3;
  const int8_t kInt8Min = std::numeric_limits<int8_t>::min();
  const int8_t kInt8Max = std::numeric_limits<int8_t>::max();
  for (int32_t zp = kInt8Min; zp <= kInt8Max; ++zp) {
    for (uint32_t s_value = 1; s_value < 32; ++s_value) {
      RequantizationTester()
          .zero_point(zp)
          .qmin(kInt8Min)
          .qmax(kInt8Max)
          .s(s_value)
          .TestDivideByPO2WithRoundingUp(xnn_qs8_requantize_gemmlowp__ssse3);
    }
  }
}
1103 
1104   /* No rounding down test - it fails because of upward bias in multiplication */
1105   /* No rounding away test - it fails because of upward bias in multiplication */
1106 
// Runs TestSpecialCases on xnn_qs8_requantize_gemmlowp__ssse3 with qmin/qmax
// set to the int8_t limits. Gated by TEST_REQUIRES_X86_SSSE3.
TEST(QS8_GEMMLOWP__SSSE3, special_cases) {
  TEST_REQUIRES_X86_SSSE3;
  const int8_t kInt8Min = std::numeric_limits<int8_t>::min();
  const int8_t kInt8Max = std::numeric_limits<int8_t>::max();
  RequantizationTester()
      .qmin(kInt8Min)
      .qmax(kInt8Max)
      .TestSpecialCases(xnn_qs8_requantize_gemmlowp__ssse3);
}
1114 
// Runs TestRandomCasesApproximate on xnn_qs8_requantize_gemmlowp__ssse3 with
// iterations(100) and qmin/qmax set to the int8_t limits.
// Gated by TEST_REQUIRES_X86_SSSE3.
TEST(QS8_GEMMLOWP__SSSE3, random_cases) {
  TEST_REQUIRES_X86_SSSE3;
  const int8_t kInt8Min = std::numeric_limits<int8_t>::min();
  const int8_t kInt8Max = std::numeric_limits<int8_t>::max();
  RequantizationTester()
      .qmin(kInt8Min)
      .qmax(kInt8Max)
      .iterations(100)
      .TestRandomCasesApproximate(xnn_qs8_requantize_gemmlowp__ssse3);
}
1123 
1124 
1125   /*
1126    * GEMMLOWP-equivalent x86 SSE4 implementation.
1127    */
1128 
// Runs TestExactDivideByPO2 on xnn_qs8_requantize_gemmlowp__sse4 for every
// s in [1, 31], with qmin/qmax set to the int8_t limits.
// Gated by TEST_REQUIRES_X86_SSE41.
TEST(QS8_GEMMLOWP__SSE4, exact_divide_by_po2) {
  TEST_REQUIRES_X86_SSE41;
  const int8_t kInt8Min = std::numeric_limits<int8_t>::min();
  const int8_t kInt8Max = std::numeric_limits<int8_t>::max();
  for (uint32_t s_value = 1; s_value < 32; ++s_value) {
    RequantizationTester()
        .qmin(kInt8Min)
        .qmax(kInt8Max)
        .s(s_value)
        .TestExactDivideByPO2(xnn_qs8_requantize_gemmlowp__sse4);
  }
}
1139 
// Runs TestExactDivideByPO2 on xnn_qs8_requantize_gemmlowp__sse4 for every
// (zero_point, s) pair: zero_point over the int8_t range, s over [1, 31].
// Gated by TEST_REQUIRES_X86_SSE41.
TEST(QS8_GEMMLOWP__SSE4, exact_divide_by_po2_with_zero_point) {
  TEST_REQUIRES_X86_SSE41;
  const int8_t kInt8Min = std::numeric_limits<int8_t>::min();
  const int8_t kInt8Max = std::numeric_limits<int8_t>::max();
  for (int32_t zp = kInt8Min; zp <= kInt8Max; ++zp) {
    for (uint32_t s_value = 1; s_value < 32; ++s_value) {
      RequantizationTester()
          .zero_point(zp)
          .qmin(kInt8Min)
          .qmax(kInt8Max)
          .s(s_value)
          .TestExactDivideByPO2(xnn_qs8_requantize_gemmlowp__sse4);
    }
  }
}
1156 
// Runs TestDivideByPO2WithRoundingUp on xnn_qs8_requantize_gemmlowp__sse4 for
// every (zero_point, s) pair: zero_point over the int8_t range, s over [1, 31].
// Gated by TEST_REQUIRES_X86_SSE41.
TEST(QS8_GEMMLOWP__SSE4, divide_by_po2_with_rounding_up) {
  TEST_REQUIRES_X86_SSE41;
  const int8_t kInt8Min = std::numeric_limits<int8_t>::min();
  const int8_t kInt8Max = std::numeric_limits<int8_t>::max();
  for (int32_t zp = kInt8Min; zp <= kInt8Max; ++zp) {
    for (uint32_t s_value = 1; s_value < 32; ++s_value) {
      RequantizationTester()
          .zero_point(zp)
          .qmin(kInt8Min)
          .qmax(kInt8Max)
          .s(s_value)
          .TestDivideByPO2WithRoundingUp(xnn_qs8_requantize_gemmlowp__sse4);
    }
  }
}
1173 
1174   /* No rounding down test - it fails because of upward bias in multiplication */
1175   /* No rounding away test - it fails because of upward bias in multiplication */
1176 
// Runs TestSpecialCases on xnn_qs8_requantize_gemmlowp__sse4 with qmin/qmax
// set to the int8_t limits. Gated by TEST_REQUIRES_X86_SSE41.
TEST(QS8_GEMMLOWP__SSE4, special_cases) {
  TEST_REQUIRES_X86_SSE41;
  const int8_t kInt8Min = std::numeric_limits<int8_t>::min();
  const int8_t kInt8Max = std::numeric_limits<int8_t>::max();
  RequantizationTester()
      .qmin(kInt8Min)
      .qmax(kInt8Max)
      .TestSpecialCases(xnn_qs8_requantize_gemmlowp__sse4);
}
1184 
// Runs TestRandomCasesApproximate on xnn_qs8_requantize_gemmlowp__sse4 with
// iterations(100) and qmin/qmax set to the int8_t limits.
// Gated by TEST_REQUIRES_X86_SSE41.
TEST(QS8_GEMMLOWP__SSE4, random_cases) {
  TEST_REQUIRES_X86_SSE41;
  const int8_t kInt8Min = std::numeric_limits<int8_t>::min();
  const int8_t kInt8Max = std::numeric_limits<int8_t>::max();
  RequantizationTester()
      .qmin(kInt8Min)
      .qmax(kInt8Max)
      .iterations(100)
      .TestRandomCasesApproximate(xnn_qs8_requantize_gemmlowp__sse4);
}
1193 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
1194 
1195 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
1196   /*
1197    * Round-to-nearest, ties away from zero, ARM NEON implementation.
1198    */
1199 
// Runs TestExactDivideByPO2 on xnn_qs8_requantize_rndna__neon for every
// s in [1, 31], with qmin/qmax set to the int8_t limits.
// Gated by TEST_REQUIRES_ARM_NEON.
TEST(QS8_RNDNA__NEON, exact_divide_by_po2) {
  TEST_REQUIRES_ARM_NEON;
  const int8_t kInt8Min = std::numeric_limits<int8_t>::min();
  const int8_t kInt8Max = std::numeric_limits<int8_t>::max();
  for (uint32_t s_value = 1; s_value < 32; ++s_value) {
    RequantizationTester()
        .s(s_value)
        .qmin(kInt8Min)
        .qmax(kInt8Max)
        .TestExactDivideByPO2(xnn_qs8_requantize_rndna__neon);
  }
}
1210 
// Runs TestExactDivideByPO2 on xnn_qs8_requantize_rndna__neon for every
// (zero_point, s) pair: zero_point over the int8_t range, s over [1, 31].
// Gated by TEST_REQUIRES_ARM_NEON.
TEST(QS8_RNDNA__NEON, exact_divide_by_po2_with_zero_point) {
  TEST_REQUIRES_ARM_NEON;
  const int8_t kInt8Min = std::numeric_limits<int8_t>::min();
  const int8_t kInt8Max = std::numeric_limits<int8_t>::max();
  for (int32_t zp = kInt8Min; zp <= kInt8Max; ++zp) {
    for (uint32_t s_value = 1; s_value < 32; ++s_value) {
      RequantizationTester()
          .zero_point(zp)
          .qmin(kInt8Min)
          .qmax(kInt8Max)
          .s(s_value)
          .TestExactDivideByPO2(xnn_qs8_requantize_rndna__neon);
    }
  }
}
1227 
// Runs TestDivideByPO2WithRoundingUp on xnn_qs8_requantize_rndna__neon for
// every (zero_point, s) pair: zero_point over the int8_t range, s over [1, 31].
// Gated by TEST_REQUIRES_ARM_NEON.
TEST(QS8_RNDNA__NEON, divide_by_po2_with_rounding_up) {
  TEST_REQUIRES_ARM_NEON;
  const int8_t kInt8Min = std::numeric_limits<int8_t>::min();
  const int8_t kInt8Max = std::numeric_limits<int8_t>::max();
  for (int32_t zp = kInt8Min; zp <= kInt8Max; ++zp) {
    for (uint32_t s_value = 1; s_value < 32; ++s_value) {
      RequantizationTester()
          .zero_point(zp)
          .qmin(kInt8Min)
          .qmax(kInt8Max)
          .s(s_value)
          .TestDivideByPO2WithRoundingUp(xnn_qs8_requantize_rndna__neon);
    }
  }
}
1244 
// Runs TestDivideByPO2WithRoundingDown on xnn_qs8_requantize_rndna__neon for
// every (zero_point, s) pair: zero_point over the int8_t range, s over [1, 31].
// Gated by TEST_REQUIRES_ARM_NEON.
TEST(QS8_RNDNA__NEON, divide_by_po2_with_rounding_down) {
  TEST_REQUIRES_ARM_NEON;
  const int8_t kInt8Min = std::numeric_limits<int8_t>::min();
  const int8_t kInt8Max = std::numeric_limits<int8_t>::max();
  for (int32_t zp = kInt8Min; zp <= kInt8Max; ++zp) {
    for (uint32_t s_value = 1; s_value < 32; ++s_value) {
      RequantizationTester()
          .zero_point(zp)
          .qmin(kInt8Min)
          .qmax(kInt8Max)
          .s(s_value)
          .TestDivideByPO2WithRoundingDown(xnn_qs8_requantize_rndna__neon);
    }
  }
}
1261 
// Runs TestDivideByPO2WithRoundingTiesAway on xnn_qs8_requantize_rndna__neon
// for every (zero_point, s) pair: zero_point over the int8_t range, s over
// [1, 31]. Gated by TEST_REQUIRES_ARM_NEON.
TEST(QS8_RNDNA__NEON, divide_by_po2_with_rounding_away) {
  TEST_REQUIRES_ARM_NEON;
  const int8_t kInt8Min = std::numeric_limits<int8_t>::min();
  const int8_t kInt8Max = std::numeric_limits<int8_t>::max();
  for (int32_t zp = kInt8Min; zp <= kInt8Max; ++zp) {
    for (uint32_t s_value = 1; s_value < 32; ++s_value) {
      RequantizationTester()
          .zero_point(zp)
          .qmin(kInt8Min)
          .qmax(kInt8Max)
          .s(s_value)
          .TestDivideByPO2WithRoundingTiesAway(xnn_qs8_requantize_rndna__neon);
    }
  }
}
1278 
// Runs TestSpecialCases on xnn_qs8_requantize_rndna__neon with qmin/qmax set
// to the int8_t limits. Gated by TEST_REQUIRES_ARM_NEON.
TEST(QS8_RNDNA__NEON, special_cases) {
  TEST_REQUIRES_ARM_NEON;
  const int8_t kInt8Min = std::numeric_limits<int8_t>::min();
  const int8_t kInt8Max = std::numeric_limits<int8_t>::max();
  RequantizationTester()
      .qmin(kInt8Min)
      .qmax(kInt8Max)
      .TestSpecialCases(xnn_qs8_requantize_rndna__neon);
}
1286 
// Runs TestRandomCasesRoundToNearestTiesAway on xnn_qs8_requantize_rndna__neon
// with iterations(100) and qmin/qmax set to the int8_t limits.
// Gated by TEST_REQUIRES_ARM_NEON.
TEST(QS8_RNDNA__NEON, random_cases) {
  TEST_REQUIRES_ARM_NEON;
  const int8_t kInt8Min = std::numeric_limits<int8_t>::min();
  const int8_t kInt8Max = std::numeric_limits<int8_t>::max();
  RequantizationTester()
      .qmin(kInt8Min)
      .qmax(kInt8Max)
      .iterations(100)
      .TestRandomCasesRoundToNearestTiesAway(xnn_qs8_requantize_rndna__neon);
}
1295 
1296 
1297   /*
1298    * Round-to-nearest, ties up, ARM NEON implementation using extended multiplication.
1299    */
1300 
// Runs TestExactDivideByPO2 on xnn_qs8_requantize_rndnu__neon_mull for every
// s in [1, 31], with qmin/qmax set to the int8_t limits.
// Gated by TEST_REQUIRES_ARM_NEON.
TEST(QS8_RNDNU__NEON_MULL, exact_divide_by_po2) {
  TEST_REQUIRES_ARM_NEON;
  const int8_t kInt8Min = std::numeric_limits<int8_t>::min();
  const int8_t kInt8Max = std::numeric_limits<int8_t>::max();
  for (uint32_t s_value = 1; s_value < 32; ++s_value) {
    RequantizationTester()
        .qmin(kInt8Min)
        .qmax(kInt8Max)
        .s(s_value)
        .TestExactDivideByPO2(xnn_qs8_requantize_rndnu__neon_mull);
  }
}
1311 
// Runs TestExactDivideByPO2 on xnn_qs8_requantize_rndnu__neon_mull for every
// (zero_point, s) pair: zero_point over the int8_t range, s over [1, 31].
// Gated by TEST_REQUIRES_ARM_NEON.
TEST(QS8_RNDNU__NEON_MULL, exact_divide_by_po2_with_zero_point) {
  TEST_REQUIRES_ARM_NEON;
  const int8_t kInt8Min = std::numeric_limits<int8_t>::min();
  const int8_t kInt8Max = std::numeric_limits<int8_t>::max();
  for (int32_t zp = kInt8Min; zp <= kInt8Max; ++zp) {
    for (uint32_t s_value = 1; s_value < 32; ++s_value) {
      RequantizationTester()
          .zero_point(zp)
          .qmin(kInt8Min)
          .qmax(kInt8Max)
          .s(s_value)
          .TestExactDivideByPO2(xnn_qs8_requantize_rndnu__neon_mull);
    }
  }
}
1328 
// Runs TestDivideByPO2WithRoundingUp on xnn_qs8_requantize_rndnu__neon_mull
// for every (zero_point, s) pair: zero_point over the int8_t range, s over
// [1, 31]. Gated by TEST_REQUIRES_ARM_NEON.
TEST(QS8_RNDNU__NEON_MULL, divide_by_po2_with_rounding_up) {
  TEST_REQUIRES_ARM_NEON;
  const int8_t kInt8Min = std::numeric_limits<int8_t>::min();
  const int8_t kInt8Max = std::numeric_limits<int8_t>::max();
  for (int32_t zp = kInt8Min; zp <= kInt8Max; ++zp) {
    for (uint32_t s_value = 1; s_value < 32; ++s_value) {
      RequantizationTester()
          .zero_point(zp)
          .qmin(kInt8Min)
          .qmax(kInt8Max)
          .s(s_value)
          .TestDivideByPO2WithRoundingUp(xnn_qs8_requantize_rndnu__neon_mull);
    }
  }
}
1345 
// Runs TestDivideByPO2WithRoundingDown on xnn_qs8_requantize_rndnu__neon_mull
// for every (zero_point, s) pair: zero_point over the int8_t range, s over
// [1, 31]. Gated by TEST_REQUIRES_ARM_NEON.
TEST(QS8_RNDNU__NEON_MULL, divide_by_po2_with_rounding_down) {
  TEST_REQUIRES_ARM_NEON;
  const int8_t kInt8Min = std::numeric_limits<int8_t>::min();
  const int8_t kInt8Max = std::numeric_limits<int8_t>::max();
  for (int32_t zp = kInt8Min; zp <= kInt8Max; ++zp) {
    for (uint32_t s_value = 1; s_value < 32; ++s_value) {
      RequantizationTester()
          .zero_point(zp)
          .qmin(kInt8Min)
          .qmax(kInt8Max)
          .s(s_value)
          .TestDivideByPO2WithRoundingDown(xnn_qs8_requantize_rndnu__neon_mull);
    }
  }
}
1362 
// Runs TestDivideByPO2WithRoundingTiesUp on xnn_qs8_requantize_rndnu__neon_mull
// for every (zero_point, s) pair: zero_point over the int8_t range, s over
// [1, 31]. Gated by TEST_REQUIRES_ARM_NEON.
// NOTE(review): the test is named "*_rounding_away" but invokes the
// "...TiesUp" check; the name is kept so existing test filters still match.
TEST(QS8_RNDNU__NEON_MULL, divide_by_po2_with_rounding_away) {
  TEST_REQUIRES_ARM_NEON;
  const int8_t kInt8Min = std::numeric_limits<int8_t>::min();
  const int8_t kInt8Max = std::numeric_limits<int8_t>::max();
  for (int32_t zp = kInt8Min; zp <= kInt8Max; ++zp) {
    for (uint32_t s_value = 1; s_value < 32; ++s_value) {
      RequantizationTester()
          .zero_point(zp)
          .qmin(kInt8Min)
          .qmax(kInt8Max)
          .s(s_value)
          .TestDivideByPO2WithRoundingTiesUp(xnn_qs8_requantize_rndnu__neon_mull);
    }
  }
}
1379 
// Runs TestRandomCasesRoundToNearestTiesUp on
// xnn_qs8_requantize_rndnu__neon_mull with iterations(100) and qmin/qmax set
// to the int8_t limits. Gated by TEST_REQUIRES_ARM_NEON.
TEST(QS8_RNDNU__NEON_MULL, random_cases) {
  TEST_REQUIRES_ARM_NEON;
  const int8_t kInt8Min = std::numeric_limits<int8_t>::min();
  const int8_t kInt8Max = std::numeric_limits<int8_t>::max();
  RequantizationTester()
      .qmin(kInt8Min)
      .qmax(kInt8Max)
      .iterations(100)
      .TestRandomCasesRoundToNearestTiesUp(xnn_qs8_requantize_rndnu__neon_mull);
}
1388 
1389 
1390   /*
1391    * Round-to-nearest, ties up, ARM NEON implementation using Q31 multiplication.
1392    */
1393 
// Runs TestExactDivideByPO2 on xnn_qs8_requantize_rndnu__neon_qdmulh for every
// s in [1, 31], with qmin/qmax set to the int8_t limits.
// Gated by TEST_REQUIRES_ARM_NEON.
TEST(QS8_RNDNU__NEON_QDMULH, exact_divide_by_po2) {
  TEST_REQUIRES_ARM_NEON;
  const int8_t kInt8Min = std::numeric_limits<int8_t>::min();
  const int8_t kInt8Max = std::numeric_limits<int8_t>::max();
  for (uint32_t s_value = 1; s_value < 32; ++s_value) {
    RequantizationTester()
        .qmin(kInt8Min)
        .qmax(kInt8Max)
        .s(s_value)
        .TestExactDivideByPO2(xnn_qs8_requantize_rndnu__neon_qdmulh);
  }
}
1404 
// Runs TestExactDivideByPO2 on xnn_qs8_requantize_rndnu__neon_qdmulh for every
// (zero_point, s) pair: zero_point over the int8_t range, s over [1, 31].
// Gated by TEST_REQUIRES_ARM_NEON.
TEST(QS8_RNDNU__NEON_QDMULH, exact_divide_by_po2_with_zero_point) {
  TEST_REQUIRES_ARM_NEON;
  const int8_t kInt8Min = std::numeric_limits<int8_t>::min();
  const int8_t kInt8Max = std::numeric_limits<int8_t>::max();
  for (int32_t zp = kInt8Min; zp <= kInt8Max; ++zp) {
    for (uint32_t s_value = 1; s_value < 32; ++s_value) {
      RequantizationTester()
          .zero_point(zp)
          .qmin(kInt8Min)
          .qmax(kInt8Max)
          .s(s_value)
          .TestExactDivideByPO2(xnn_qs8_requantize_rndnu__neon_qdmulh);
    }
  }
}
1421 
// Runs TestDivideByPO2WithRoundingUp on xnn_qs8_requantize_rndnu__neon_qdmulh
// for every (zero_point, s) pair: zero_point over the int8_t range, s over
// [1, 31]. Gated by TEST_REQUIRES_ARM_NEON.
TEST(QS8_RNDNU__NEON_QDMULH, divide_by_po2_with_rounding_up) {
  TEST_REQUIRES_ARM_NEON;
  const int8_t kInt8Min = std::numeric_limits<int8_t>::min();
  const int8_t kInt8Max = std::numeric_limits<int8_t>::max();
  for (int32_t zp = kInt8Min; zp <= kInt8Max; ++zp) {
    for (uint32_t s_value = 1; s_value < 32; ++s_value) {
      RequantizationTester()
          .zero_point(zp)
          .qmin(kInt8Min)
          .qmax(kInt8Max)
          .s(s_value)
          .TestDivideByPO2WithRoundingUp(xnn_qs8_requantize_rndnu__neon_qdmulh);
    }
  }
}
1438 
// Runs TestDivideByPO2WithRoundingDown on xnn_qs8_requantize_rndnu__neon_qdmulh
// for every (zero_point, s) pair: zero_point over the int8_t range, s over
// [1, 31]. Gated by TEST_REQUIRES_ARM_NEON.
TEST(QS8_RNDNU__NEON_QDMULH, divide_by_po2_with_rounding_down) {
  TEST_REQUIRES_ARM_NEON;
  const int8_t kInt8Min = std::numeric_limits<int8_t>::min();
  const int8_t kInt8Max = std::numeric_limits<int8_t>::max();
  for (int32_t zp = kInt8Min; zp <= kInt8Max; ++zp) {
    for (uint32_t s_value = 1; s_value < 32; ++s_value) {
      RequantizationTester()
          .zero_point(zp)
          .qmin(kInt8Min)
          .qmax(kInt8Max)
          .s(s_value)
          .TestDivideByPO2WithRoundingDown(xnn_qs8_requantize_rndnu__neon_qdmulh);
    }
  }
}
1455 
// Runs TestDivideByPO2WithRoundingTiesUp on
// xnn_qs8_requantize_rndnu__neon_qdmulh for every (zero_point, s) pair:
// zero_point over the int8_t range, s over [1, 31].
// Gated by TEST_REQUIRES_ARM_NEON.
// NOTE(review): the test is named "*_rounding_away" but invokes the
// "...TiesUp" check; the name is kept so existing test filters still match.
TEST(QS8_RNDNU__NEON_QDMULH, divide_by_po2_with_rounding_away) {
  TEST_REQUIRES_ARM_NEON;
  const int8_t kInt8Min = std::numeric_limits<int8_t>::min();
  const int8_t kInt8Max = std::numeric_limits<int8_t>::max();
  for (int32_t zp = kInt8Min; zp <= kInt8Max; ++zp) {
    for (uint32_t s_value = 1; s_value < 32; ++s_value) {
      RequantizationTester()
          .zero_point(zp)
          .qmin(kInt8Min)
          .qmax(kInt8Max)
          .s(s_value)
          .TestDivideByPO2WithRoundingTiesUp(xnn_qs8_requantize_rndnu__neon_qdmulh);
    }
  }
}
1472 
// Runs TestRandomCasesRoundToNearestTiesUp on
// xnn_qs8_requantize_rndnu__neon_qdmulh with iterations(100) and qmin/qmax set
// to the int8_t limits. Gated by TEST_REQUIRES_ARM_NEON.
TEST(QS8_RNDNU__NEON_QDMULH, random_cases) {
  TEST_REQUIRES_ARM_NEON;
  const int8_t kInt8Min = std::numeric_limits<int8_t>::min();
  const int8_t kInt8Max = std::numeric_limits<int8_t>::max();
  RequantizationTester()
      .qmin(kInt8Min)
      .qmax(kInt8Max)
      .iterations(100)
      .TestRandomCasesRoundToNearestTiesUp(xnn_qs8_requantize_rndnu__neon_qdmulh);
}
1481 
1482 
1483   /*
1484    * FP32-based ARM NEON implementation.
1485    */
1486 
// Runs TestRandomCasesApproximate on xnn_qs8_requantize_fp32__neon with
// iterations(1000) and qmin/qmax set to the int8_t limits.
// Gated by TEST_REQUIRES_ARM_NEON.
TEST(QS8_FP32__NEON, random_cases) {
  TEST_REQUIRES_ARM_NEON;
  const int8_t kInt8Min = std::numeric_limits<int8_t>::min();
  const int8_t kInt8Max = std::numeric_limits<int8_t>::max();
  RequantizationTester()
      .qmin(kInt8Min)
      .qmax(kInt8Max)
      .iterations(1000)
      .TestRandomCasesApproximate(xnn_qs8_requantize_fp32__neon);
}
1495 
1496 
1497   /*
1498    * GEMMLOWP-equivalent ARM NEON implementation.
1499    */
1500 
// Runs TestExactDivideByPO2 on xnn_qs8_requantize_gemmlowp__neon for every
// s in [1, 31], with qmin/qmax set to the int8_t limits.
// Gated by TEST_REQUIRES_ARM_NEON.
TEST(QS8_GEMMLOWP__NEON, exact_divide_by_po2) {
  TEST_REQUIRES_ARM_NEON;
  const int8_t kInt8Min = std::numeric_limits<int8_t>::min();
  const int8_t kInt8Max = std::numeric_limits<int8_t>::max();
  for (uint32_t s_value = 1; s_value < 32; ++s_value) {
    RequantizationTester()
        .qmin(kInt8Min)
        .qmax(kInt8Max)
        .s(s_value)
        .TestExactDivideByPO2(xnn_qs8_requantize_gemmlowp__neon);
  }
}
1511 
// Runs TestExactDivideByPO2 on xnn_qs8_requantize_gemmlowp__neon for every
// (zero_point, s) pair: zero_point over the int8_t range, s over [1, 31].
// Gated by TEST_REQUIRES_ARM_NEON.
TEST(QS8_GEMMLOWP__NEON, exact_divide_by_po2_with_zero_point) {
  TEST_REQUIRES_ARM_NEON;
  const int8_t kInt8Min = std::numeric_limits<int8_t>::min();
  const int8_t kInt8Max = std::numeric_limits<int8_t>::max();
  for (int32_t zp = kInt8Min; zp <= kInt8Max; ++zp) {
    for (uint32_t s_value = 1; s_value < 32; ++s_value) {
      RequantizationTester()
          .zero_point(zp)
          .qmin(kInt8Min)
          .qmax(kInt8Max)
          .s(s_value)
          .TestExactDivideByPO2(xnn_qs8_requantize_gemmlowp__neon);
    }
  }
}
1528 
// Runs TestDivideByPO2WithRoundingUp on xnn_qs8_requantize_gemmlowp__neon for
// every (zero_point, s) pair: zero_point over the int8_t range, s over [1, 31].
// Gated by TEST_REQUIRES_ARM_NEON.
TEST(QS8_GEMMLOWP__NEON, divide_by_po2_with_rounding_up) {
  TEST_REQUIRES_ARM_NEON;
  const int8_t kInt8Min = std::numeric_limits<int8_t>::min();
  const int8_t kInt8Max = std::numeric_limits<int8_t>::max();
  for (int32_t zp = kInt8Min; zp <= kInt8Max; ++zp) {
    for (uint32_t s_value = 1; s_value < 32; ++s_value) {
      RequantizationTester()
          .zero_point(zp)
          .qmin(kInt8Min)
          .qmax(kInt8Max)
          .s(s_value)
          .TestDivideByPO2WithRoundingUp(xnn_qs8_requantize_gemmlowp__neon);
    }
  }
}
1545 
1546   /* No rounding down test - it fails because of upward bias in multiplication */
1547   /* No rounding away test - it fails because of upward bias in multiplication */
1548 
// Runs TestSpecialCases on xnn_qs8_requantize_gemmlowp__neon with qmin/qmax
// set to the int8_t limits. Gated by TEST_REQUIRES_ARM_NEON.
TEST(QS8_GEMMLOWP__NEON, special_cases) {
  TEST_REQUIRES_ARM_NEON;
  const int8_t kInt8Min = std::numeric_limits<int8_t>::min();
  const int8_t kInt8Max = std::numeric_limits<int8_t>::max();
  RequantizationTester()
      .qmin(kInt8Min)
      .qmax(kInt8Max)
      .TestSpecialCases(xnn_qs8_requantize_gemmlowp__neon);
}
1556 
// Runs TestRandomCasesApproximate on xnn_qs8_requantize_gemmlowp__neon with
// iterations(100) and qmin/qmax set to the int8_t limits.
// Gated by TEST_REQUIRES_ARM_NEON.
TEST(QS8_GEMMLOWP__NEON, random_cases) {
  TEST_REQUIRES_ARM_NEON;
  const int8_t kInt8Min = std::numeric_limits<int8_t>::min();
  const int8_t kInt8Max = std::numeric_limits<int8_t>::max();
  RequantizationTester()
      .qmin(kInt8Min)
      .qmax(kInt8Max)
      .iterations(100)
      .TestRandomCasesApproximate(xnn_qs8_requantize_gemmlowp__neon);
}
1565 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
1566 
1567 #if XNN_ARCH_WASMSIMD
1568   /*
1569    * FP32-based WAsm SIMD implementation.
1570    */
1571 
// Runs TestRandomCasesApproximate on xnn_qs8_requantize_fp32__wasmsimd with
// iterations(1000) and qmin/qmax set to the int8_t limits.
TEST(QS8_FP32__WASMSIMD, random_cases) {
  const int8_t kInt8Min = std::numeric_limits<int8_t>::min();
  const int8_t kInt8Max = std::numeric_limits<int8_t>::max();
  RequantizationTester()
      .qmin(kInt8Min)
      .qmax(kInt8Max)
      .iterations(1000)
      .TestRandomCasesApproximate(xnn_qs8_requantize_fp32__wasmsimd);
}
1579 
1580   /*
1581    * GEMMLOWP-equivalent WAsm SIMD implementation.
1582    */
1583 
// Runs TestExactDivideByPO2 on xnn_qs8_requantize_gemmlowp__wasmsimd for every
// s in [1, 31], with qmin/qmax set to the int8_t limits.
TEST(QS8_GEMMLOWP__WASMSIMD, exact_divide_by_po2) {
  const int8_t kInt8Min = std::numeric_limits<int8_t>::min();
  const int8_t kInt8Max = std::numeric_limits<int8_t>::max();
  for (uint32_t s_value = 1; s_value < 32; ++s_value) {
    RequantizationTester()
        .qmin(kInt8Min)
        .qmax(kInt8Max)
        .s(s_value)
        .TestExactDivideByPO2(xnn_qs8_requantize_gemmlowp__wasmsimd);
  }
}
1593 
// Runs TestExactDivideByPO2 on xnn_qs8_requantize_gemmlowp__wasmsimd for every
// (zero_point, s) pair: zero_point over the int8_t range, s over [1, 31].
TEST(QS8_GEMMLOWP__WASMSIMD, exact_divide_by_po2_with_zero_point) {
  const int8_t kInt8Min = std::numeric_limits<int8_t>::min();
  const int8_t kInt8Max = std::numeric_limits<int8_t>::max();
  for (int32_t zp = kInt8Min; zp <= kInt8Max; ++zp) {
    for (uint32_t s_value = 1; s_value < 32; ++s_value) {
      RequantizationTester()
          .zero_point(zp)
          .qmin(kInt8Min)
          .qmax(kInt8Max)
          .s(s_value)
          .TestExactDivideByPO2(xnn_qs8_requantize_gemmlowp__wasmsimd);
    }
  }
}
1609 
// Runs TestDivideByPO2WithRoundingUp on xnn_qs8_requantize_gemmlowp__wasmsimd
// for every (zero_point, s) pair: zero_point over the int8_t range, s over
// [1, 31].
TEST(QS8_GEMMLOWP__WASMSIMD, divide_by_po2_with_rounding_up) {
  const int8_t kInt8Min = std::numeric_limits<int8_t>::min();
  const int8_t kInt8Max = std::numeric_limits<int8_t>::max();
  for (int32_t zp = kInt8Min; zp <= kInt8Max; ++zp) {
    for (uint32_t s_value = 1; s_value < 32; ++s_value) {
      RequantizationTester()
          .zero_point(zp)
          .qmin(kInt8Min)
          .qmax(kInt8Max)
          .s(s_value)
          .TestDivideByPO2WithRoundingUp(xnn_qs8_requantize_gemmlowp__wasmsimd);
    }
  }
}
1625 
1626   /* No rounding down test - it fails because of upward bias in multiplication */
1627   /* No rounding away test - it fails because of upward bias in multiplication */
1628 
// Runs TestSpecialCases on xnn_qs8_requantize_gemmlowp__wasmsimd with
// qmin/qmax set to the int8_t limits.
TEST(QS8_GEMMLOWP__WASMSIMD, special_cases) {
  const int8_t kInt8Min = std::numeric_limits<int8_t>::min();
  const int8_t kInt8Max = std::numeric_limits<int8_t>::max();
  RequantizationTester()
      .qmin(kInt8Min)
      .qmax(kInt8Max)
      .TestSpecialCases(xnn_qs8_requantize_gemmlowp__wasmsimd);
}
1635 
// Runs TestRandomCasesApproximate on xnn_qs8_requantize_gemmlowp__wasmsimd
// with iterations(100) and qmin/qmax set to the int8_t limits.
TEST(QS8_GEMMLOWP__WASMSIMD, random_cases) {
  const int8_t kInt8Min = std::numeric_limits<int8_t>::min();
  const int8_t kInt8Max = std::numeric_limits<int8_t>::max();
  RequantizationTester()
      .qmin(kInt8Min)
      .qmax(kInt8Max)
      .iterations(100)
      .TestRandomCasesApproximate(xnn_qs8_requantize_gemmlowp__wasmsimd);
}
1643 #endif  // XNN_ARCH_WASMSIMD
1644