xref: /aosp_15_r20/external/XNNPACK/eval/f32-f16-cvt.cc (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1 // Copyright 2021 Google LLC
2 //
3 // This source code is licensed under the BSD-style license found in the
4 // LICENSE file in the root directory of this source tree.
5 
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <iomanip>
#include <ios>
#include <limits>
#include <vector>

#include <gtest/gtest.h>

#include <fp16.h>

#include <xnnpack/aligned-allocator.h>
#include <xnnpack/common.h>
#include <xnnpack/isa-checks.h>
#include <xnnpack/math.h>
#include <xnnpack/math-stubs.h>
24 
25 
// Number of elements converted per call to the kernel under test; also the
// stride used when sweeping ranges of FP32 bit patterns.
constexpr int kBlockSize = 1024;
27 
28 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
TEST(CVT__SSE2,positive_normal)29   TEST(CVT__SSE2, positive_normal) {
30     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
31     std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> outputs(kBlockSize);
32     for (uint32_t n = UINT32_C(0x387FE000); n < UINT32_C(0x477FF000); n += kBlockSize) {
33       for (uint32_t i = 0; i < kBlockSize; i++) {
34         inputs[i] = uint32_as_float(n + i);
35       }
36       xnn_math_f32_f16_cvt__sse2(kBlockSize * sizeof(uint16_t), inputs.data(), outputs.data());
37       for (uint32_t i = 0; i < kBlockSize; i++) {
38         const uint16_t reference_output = fp16_ieee_from_fp32_value(inputs[i]);
39         ASSERT_EQ(reference_output, outputs[i])
40           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
41           << ", reference = 0x" << std::hex << std::setw(4) << std::setfill('0') << reference_output
42           << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[i];
43       }
44     }
45   }
46 
TEST(CVT__SSE2,negative_normal)47   TEST(CVT__SSE2, negative_normal) {
48     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
49     std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> outputs(kBlockSize);
50     for (uint32_t n = UINT32_C(0xB87FE000); n < UINT32_C(0xC77FF000); n += kBlockSize) {
51       for (uint32_t i = 0; i < kBlockSize; i++) {
52         inputs[i] = uint32_as_float(n + i);
53       }
54       xnn_math_f32_f16_cvt__sse2(kBlockSize * sizeof(uint16_t), inputs.data(), outputs.data());
55       for (uint32_t i = 0; i < kBlockSize; i++) {
56         const uint16_t reference_output = fp16_ieee_from_fp32_value(inputs[i]);
57         ASSERT_EQ(reference_output, outputs[i])
58           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
59           << ", reference = 0x" << std::hex << std::setw(4) << std::setfill('0') << reference_output
60           << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[i];
61       }
62     }
63   }
64 
TEST(CVT__SSE2,positive_subnormal)65   TEST(CVT__SSE2, positive_subnormal) {
66     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
67     std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> outputs(kBlockSize);
68     for (uint32_t n = UINT32_C(0x33000001); n < UINT32_C(0x387FE000); n += kBlockSize) {
69       for (uint32_t i = 0; i < kBlockSize; i++) {
70         inputs[i] = uint32_as_float(std::min<uint32_t>(n + i, UINT32_C(0x387FDFFF)));
71       }
72       xnn_math_f32_f16_cvt__sse2(kBlockSize * sizeof(uint16_t), inputs.data(), outputs.data());
73       for (uint32_t i = 0; i < kBlockSize; i++) {
74         const uint16_t reference_output = fp16_ieee_from_fp32_value(inputs[i]);
75         ASSERT_EQ(reference_output, outputs[i])
76           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
77           << ", reference = 0x" << std::hex << std::setw(4) << std::setfill('0') << reference_output
78           << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[i];
79       }
80     }
81   }
82 
TEST(CVT__SSE2,negative_subnormal)83   TEST(CVT__SSE2, negative_subnormal) {
84     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
85     std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> outputs(kBlockSize);
86     for (uint32_t n = UINT32_C(0xB3000001); n < UINT32_C(0xB87FE000); n += kBlockSize) {
87       for (uint32_t i = 0; i < kBlockSize; i++) {
88         inputs[i] = uint32_as_float(std::min<uint32_t>(n + i, UINT32_C(0xB87FDFFF)));
89       }
90       xnn_math_f32_f16_cvt__sse2(kBlockSize * sizeof(uint16_t), inputs.data(), outputs.data());
91       for (uint32_t i = 0; i < kBlockSize; i++) {
92         const uint16_t reference_output = fp16_ieee_from_fp32_value(inputs[i]);
93         ASSERT_EQ(reference_output, outputs[i])
94           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
95           << ", reference = 0x" << std::hex << std::setw(4) << std::setfill('0') << reference_output
96           << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[i];
97       }
98     }
99   }
100 
TEST(CVT__SSE2,positive_underflow)101   TEST(CVT__SSE2, positive_underflow) {
102     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
103     std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> outputs(kBlockSize);
104     for (uint32_t n = UINT32_C(0x00000001); n < UINT32_C(0x33000001); n += kBlockSize) {
105       for (uint32_t i = 0; i < kBlockSize; i++) {
106         inputs[i] = uint32_as_float(n + i);
107       }
108       xnn_math_f32_f16_cvt__sse2(kBlockSize * sizeof(uint16_t), inputs.data(), outputs.data());
109       for (uint32_t i = 0; i < kBlockSize; i++) {
110         const uint16_t reference_output = UINT16_C(0x0000);
111         ASSERT_EQ(reference_output, outputs[i])
112           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
113           << ", reference = 0x" << std::hex << std::setw(4) << std::setfill('0') << reference_output
114           << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[i];
115       }
116     }
117   }
118 
TEST(CVT__SSE2,negative_underflow)119   TEST(CVT__SSE2, negative_underflow) {
120     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
121     std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> outputs(kBlockSize);
122     for (uint32_t n = UINT32_C(0x80000001); n < UINT32_C(0xB3000001); n += kBlockSize) {
123       for (uint32_t i = 0; i < kBlockSize; i++) {
124         inputs[i] = uint32_as_float(n + i);
125       }
126       xnn_math_f32_f16_cvt__sse2(kBlockSize * sizeof(uint16_t), inputs.data(), outputs.data());
127       for (uint32_t i = 0; i < kBlockSize; i++) {
128         const uint16_t reference_output = UINT16_C(0x8000);
129         ASSERT_EQ(reference_output, outputs[i])
130           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
131           << ", reference = 0x" << std::hex << std::setw(4) << std::setfill('0') << reference_output
132           << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[i];
133       }
134     }
135   }
136 
TEST(CVT__SSE2,positive_zero)137   TEST(CVT__SSE2, positive_zero) {
138     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
139     std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> outputs(kBlockSize);
140     std::fill(inputs.begin(), inputs.end(), +0.0f);
141     xnn_math_f32_f16_cvt__sse2(kBlockSize * sizeof(uint16_t), inputs.data(), outputs.data());
142     const uint16_t reference_output = UINT16_C(0x0000);
143     ASSERT_EQ(reference_output, outputs[0])
144       << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[0])
145       << ", reference = 0x" << std::hex << std::setw(4) << std::setfill('0') << reference_output
146       << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[0];
147   }
148 
TEST(CVT__SSE2,negative_zero)149   TEST(CVT__SSE2, negative_zero) {
150     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
151     std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> outputs(kBlockSize);
152     std::fill(inputs.begin(), inputs.end(), -0.0f);
153     xnn_math_f32_f16_cvt__sse2(kBlockSize * sizeof(uint16_t), inputs.data(), outputs.data());
154     const uint16_t reference_output = UINT16_C(0x8000);
155     ASSERT_EQ(reference_output, outputs[0])
156       << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[0])
157       << ", reference = 0x" << std::hex << std::setw(4) << std::setfill('0') << reference_output
158       << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[0];
159   }
160 
TEST(CVT__SSE2,positive_overflow)161   TEST(CVT__SSE2, positive_overflow) {
162     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
163     std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> outputs(kBlockSize);
164     for (uint32_t n = UINT32_C(0x477FF000); n < UINT32_C(0x7F800000); n += kBlockSize) {
165       for (uint32_t i = 0; i < kBlockSize; i++) {
166         inputs[i] = uint32_as_float(n + i);
167       }
168       xnn_math_f32_f16_cvt__sse2(kBlockSize * sizeof(uint16_t), inputs.data(), outputs.data());
169       for (uint32_t i = 0; i < kBlockSize; i++) {
170         const uint16_t reference_output = UINT16_C(0x7C00);
171         ASSERT_EQ(reference_output, outputs[i])
172           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
173           << ", reference = 0x" << std::hex << std::setw(4) << std::setfill('0') << reference_output
174           << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[i];
175       }
176     }
177   }
178 
TEST(CVT__SSE2,negative_overflow)179   TEST(CVT__SSE2, negative_overflow) {
180     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
181     std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> outputs(kBlockSize);
182     for (uint32_t n = UINT32_C(0xC77FF000); n < UINT32_C(0xFF800000); n += kBlockSize) {
183       for (uint32_t i = 0; i < kBlockSize; i++) {
184         inputs[i] = uint32_as_float(n + i);
185       }
186       xnn_math_f32_f16_cvt__sse2(kBlockSize * sizeof(uint16_t), inputs.data(), outputs.data());
187       for (uint32_t i = 0; i < kBlockSize; i++) {
188         const uint16_t reference_output = UINT16_C(0xFC00);
189         ASSERT_EQ(reference_output, outputs[i])
190           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
191           << ", reference = 0x" << std::hex << std::setw(4) << std::setfill('0') << reference_output
192           << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[i];
193       }
194     }
195   }
196 
TEST(CVT__SSE2,positive_infinity)197   TEST(CVT__SSE2, positive_infinity) {
198     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
199     std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> outputs(kBlockSize);
200     std::fill(inputs.begin(), inputs.end(), +std::numeric_limits<float>::infinity());
201     xnn_math_f32_f16_cvt__sse2(kBlockSize * sizeof(uint16_t), inputs.data(), outputs.data());
202     const uint16_t reference_output = UINT16_C(0x7C00);
203     ASSERT_EQ(reference_output, outputs[0])
204       << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[0])
205       << ", reference = 0x" << std::hex << std::setw(4) << std::setfill('0') << reference_output
206       << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[0];
207   }
208 
TEST(CVT__SSE2,negative_infinity)209   TEST(CVT__SSE2, negative_infinity) {
210     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
211     std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> outputs(kBlockSize);
212     std::fill(inputs.begin(), inputs.end(), -std::numeric_limits<float>::infinity());
213     xnn_math_f32_f16_cvt__sse2(kBlockSize * sizeof(uint16_t), inputs.data(), outputs.data());
214     const uint16_t reference_output = UINT16_C(0xFC00);
215     ASSERT_EQ(reference_output, outputs[0])
216       << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[0])
217       << ", reference = 0x" << std::hex << std::setw(4) << std::setfill('0') << reference_output
218       << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[0];
219   }
220 
TEST(CVT__SSE2,positive_nan)221   TEST(CVT__SSE2, positive_nan) {
222     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
223     std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> outputs(kBlockSize);
224     for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) {
225       for (uint32_t i = 0; i < kBlockSize; i++) {
226         inputs[i] = uint32_as_float(std::min<uint32_t>(n + i, UINT32_C(0x7FFFFFFF)));
227       }
228       xnn_math_f32_f16_cvt__sse2(kBlockSize * sizeof(uint16_t), inputs.data(), outputs.data());
229       for (uint32_t i = 0; i < kBlockSize; i++) {
230         ASSERT_GT(outputs[i], UINT16_C(0x7C00))
231           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
232           << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[i];
233         ASSERT_LT(outputs[i], UINT16_C(0x8000))
234           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
235           << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[i];
236       }
237     }
238   }
239 
TEST(CVT__SSE2,negative_nan)240   TEST(CVT__SSE2, negative_nan) {
241     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
242     std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> outputs(kBlockSize);
243     for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) {
244       for (uint32_t i = 0; i < kBlockSize; i++) {
245         inputs[i] = uint32_as_float(UINT32_C(0x80000000) | std::min<uint32_t>(n + i, UINT32_C(0x7FFFFFFF)));
246       }
247       xnn_math_f32_f16_cvt__sse2(kBlockSize * sizeof(uint16_t), inputs.data(), outputs.data());
248       for (uint32_t i = 0; i < kBlockSize; i++) {
249         ASSERT_GT(outputs[i], UINT16_C(0xFC00))
250           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
251           << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[i];
252       }
253     }
254   }
255 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
256 
257 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
TEST(CVT__SSE41,positive_normal)258   TEST(CVT__SSE41, positive_normal) {
259     TEST_REQUIRES_X86_SSE41;
260 
261     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
262     std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> outputs(kBlockSize);
263     for (uint32_t n = UINT32_C(0x387FE000); n < UINT32_C(0x477FF000); n += kBlockSize) {
264       for (uint32_t i = 0; i < kBlockSize; i++) {
265         inputs[i] = uint32_as_float(n + i);
266       }
267       xnn_math_f32_f16_cvt__sse41(kBlockSize * sizeof(uint16_t), inputs.data(), outputs.data());
268       for (uint32_t i = 0; i < kBlockSize; i++) {
269         const uint16_t reference_output = fp16_ieee_from_fp32_value(inputs[i]);
270         ASSERT_EQ(reference_output, outputs[i])
271           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
272           << ", reference = 0x" << std::hex << std::setw(4) << std::setfill('0') << reference_output
273           << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[i];
274       }
275     }
276   }
277 
TEST(CVT__SSE41,negative_normal)278   TEST(CVT__SSE41, negative_normal) {
279     TEST_REQUIRES_X86_SSE41;
280 
281     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
282     std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> outputs(kBlockSize);
283     for (uint32_t n = UINT32_C(0xB87FE000); n < UINT32_C(0xC77FF000); n += kBlockSize) {
284       for (uint32_t i = 0; i < kBlockSize; i++) {
285         inputs[i] = uint32_as_float(n + i);
286       }
287       xnn_math_f32_f16_cvt__sse41(kBlockSize * sizeof(uint16_t), inputs.data(), outputs.data());
288       for (uint32_t i = 0; i < kBlockSize; i++) {
289         const uint16_t reference_output = fp16_ieee_from_fp32_value(inputs[i]);
290         ASSERT_EQ(reference_output, outputs[i])
291           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
292           << ", reference = 0x" << std::hex << std::setw(4) << std::setfill('0') << reference_output
293           << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[i];
294       }
295     }
296   }
297 
TEST(CVT__SSE41,positive_subnormal)298   TEST(CVT__SSE41, positive_subnormal) {
299     TEST_REQUIRES_X86_SSE41;
300 
301     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
302     std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> outputs(kBlockSize);
303     for (uint32_t n = UINT32_C(0x33000001); n < UINT32_C(0x387FE000); n += kBlockSize) {
304       for (uint32_t i = 0; i < kBlockSize; i++) {
305         inputs[i] = uint32_as_float(std::min<uint32_t>(n + i, UINT32_C(0x387FDFFF)));
306       }
307       xnn_math_f32_f16_cvt__sse41(kBlockSize * sizeof(uint16_t), inputs.data(), outputs.data());
308       for (uint32_t i = 0; i < kBlockSize; i++) {
309         const uint16_t reference_output = fp16_ieee_from_fp32_value(inputs[i]);
310         ASSERT_EQ(reference_output, outputs[i])
311           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
312           << ", reference = 0x" << std::hex << std::setw(4) << std::setfill('0') << reference_output
313           << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[i];
314       }
315     }
316   }
317 
TEST(CVT__SSE41,negative_subnormal)318   TEST(CVT__SSE41, negative_subnormal) {
319     TEST_REQUIRES_X86_SSE41;
320 
321     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
322     std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> outputs(kBlockSize);
323     for (uint32_t n = UINT32_C(0xB3000001); n < UINT32_C(0xB87FE000); n += kBlockSize) {
324       for (uint32_t i = 0; i < kBlockSize; i++) {
325         inputs[i] = uint32_as_float(std::min<uint32_t>(n + i, UINT32_C(0xB87FDFFF)));
326       }
327       xnn_math_f32_f16_cvt__sse41(kBlockSize * sizeof(uint16_t), inputs.data(), outputs.data());
328       for (uint32_t i = 0; i < kBlockSize; i++) {
329         const uint16_t reference_output = fp16_ieee_from_fp32_value(inputs[i]);
330         ASSERT_EQ(reference_output, outputs[i])
331           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
332           << ", reference = 0x" << std::hex << std::setw(4) << std::setfill('0') << reference_output
333           << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[i];
334       }
335     }
336   }
337 
TEST(CVT__SSE41,positive_underflow)338   TEST(CVT__SSE41, positive_underflow) {
339     TEST_REQUIRES_X86_SSE41;
340 
341     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
342     std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> outputs(kBlockSize);
343     for (uint32_t n = UINT32_C(0x00000001); n < UINT32_C(0x33000001); n += kBlockSize) {
344       for (uint32_t i = 0; i < kBlockSize; i++) {
345         inputs[i] = uint32_as_float(n + i);
346       }
347       xnn_math_f32_f16_cvt__sse41(kBlockSize * sizeof(uint16_t), inputs.data(), outputs.data());
348       for (uint32_t i = 0; i < kBlockSize; i++) {
349         const uint16_t reference_output = UINT16_C(0x0000);
350         ASSERT_EQ(reference_output, outputs[i])
351           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
352           << ", reference = 0x" << std::hex << std::setw(4) << std::setfill('0') << reference_output
353           << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[i];
354       }
355     }
356   }
357 
TEST(CVT__SSE41,negative_underflow)358   TEST(CVT__SSE41, negative_underflow) {
359     TEST_REQUIRES_X86_SSE41;
360 
361     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
362     std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> outputs(kBlockSize);
363     for (uint32_t n = UINT32_C(0x80000001); n < UINT32_C(0xB3000001); n += kBlockSize) {
364       for (uint32_t i = 0; i < kBlockSize; i++) {
365         inputs[i] = uint32_as_float(n + i);
366       }
367       xnn_math_f32_f16_cvt__sse41(kBlockSize * sizeof(uint16_t), inputs.data(), outputs.data());
368       for (uint32_t i = 0; i < kBlockSize; i++) {
369         const uint16_t reference_output = UINT16_C(0x8000);
370         ASSERT_EQ(reference_output, outputs[i])
371           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
372           << ", reference = 0x" << std::hex << std::setw(4) << std::setfill('0') << reference_output
373           << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[i];
374       }
375     }
376   }
377 
TEST(CVT__SSE41,positive_zero)378   TEST(CVT__SSE41, positive_zero) {
379     TEST_REQUIRES_X86_SSE41;
380 
381     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
382     std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> outputs(kBlockSize);
383     std::fill(inputs.begin(), inputs.end(), +0.0f);
384     xnn_math_f32_f16_cvt__sse41(kBlockSize * sizeof(uint16_t), inputs.data(), outputs.data());
385     const uint16_t reference_output = UINT16_C(0x0000);
386     ASSERT_EQ(reference_output, outputs[0])
387       << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[0])
388       << ", reference = 0x" << std::hex << std::setw(4) << std::setfill('0') << reference_output
389       << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[0];
390   }
391 
TEST(CVT__SSE41,negative_zero)392   TEST(CVT__SSE41, negative_zero) {
393     TEST_REQUIRES_X86_SSE41;
394 
395     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
396     std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> outputs(kBlockSize);
397     std::fill(inputs.begin(), inputs.end(), -0.0f);
398     xnn_math_f32_f16_cvt__sse41(kBlockSize * sizeof(uint16_t), inputs.data(), outputs.data());
399     const uint16_t reference_output = UINT16_C(0x8000);
400     ASSERT_EQ(reference_output, outputs[0])
401       << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[0])
402       << ", reference = 0x" << std::hex << std::setw(4) << std::setfill('0') << reference_output
403       << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[0];
404   }
405 
TEST(CVT__SSE41,positive_overflow)406   TEST(CVT__SSE41, positive_overflow) {
407     TEST_REQUIRES_X86_SSE41;
408 
409     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
410     std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> outputs(kBlockSize);
411     for (uint32_t n = UINT32_C(0x477FF000); n < UINT32_C(0x7F800000); n += kBlockSize) {
412       for (uint32_t i = 0; i < kBlockSize; i++) {
413         inputs[i] = uint32_as_float(n + i);
414       }
415       xnn_math_f32_f16_cvt__sse41(kBlockSize * sizeof(uint16_t), inputs.data(), outputs.data());
416       for (uint32_t i = 0; i < kBlockSize; i++) {
417         const uint16_t reference_output = UINT16_C(0x7C00);
418         ASSERT_EQ(reference_output, outputs[i])
419           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
420           << ", reference = 0x" << std::hex << std::setw(4) << std::setfill('0') << reference_output
421           << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[i];
422       }
423     }
424   }
425 
TEST(CVT__SSE41,negative_overflow)426   TEST(CVT__SSE41, negative_overflow) {
427     TEST_REQUIRES_X86_SSE41;
428 
429     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
430     std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> outputs(kBlockSize);
431     for (uint32_t n = UINT32_C(0xC77FF000); n < UINT32_C(0xFF800000); n += kBlockSize) {
432       for (uint32_t i = 0; i < kBlockSize; i++) {
433         inputs[i] = uint32_as_float(n + i);
434       }
435       xnn_math_f32_f16_cvt__sse41(kBlockSize * sizeof(uint16_t), inputs.data(), outputs.data());
436       for (uint32_t i = 0; i < kBlockSize; i++) {
437         const uint16_t reference_output = UINT16_C(0xFC00);
438         ASSERT_EQ(reference_output, outputs[i])
439           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
440           << ", reference = 0x" << std::hex << std::setw(4) << std::setfill('0') << reference_output
441           << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[i];
442       }
443     }
444   }
445 
TEST(CVT__SSE41,positive_infinity)446   TEST(CVT__SSE41, positive_infinity) {
447     TEST_REQUIRES_X86_SSE41;
448 
449     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
450     std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> outputs(kBlockSize);
451     std::fill(inputs.begin(), inputs.end(), +std::numeric_limits<float>::infinity());
452     xnn_math_f32_f16_cvt__sse41(kBlockSize * sizeof(uint16_t), inputs.data(), outputs.data());
453     const uint16_t reference_output = UINT16_C(0x7C00);
454     ASSERT_EQ(reference_output, outputs[0])
455       << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[0])
456       << ", reference = 0x" << std::hex << std::setw(4) << std::setfill('0') << reference_output
457       << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[0];
458   }
459 
TEST(CVT__SSE41,negative_infinity)460   TEST(CVT__SSE41, negative_infinity) {
461     TEST_REQUIRES_X86_SSE41;
462 
463     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
464     std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> outputs(kBlockSize);
465     std::fill(inputs.begin(), inputs.end(), -std::numeric_limits<float>::infinity());
466     xnn_math_f32_f16_cvt__sse41(kBlockSize * sizeof(uint16_t), inputs.data(), outputs.data());
467     const uint16_t reference_output = UINT16_C(0xFC00);
468     ASSERT_EQ(reference_output, outputs[0])
469       << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[0])
470       << ", reference = 0x" << std::hex << std::setw(4) << std::setfill('0') << reference_output
471       << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[0];
472   }
473 
TEST(CVT__SSE41,positive_nan)474   TEST(CVT__SSE41, positive_nan) {
475     TEST_REQUIRES_X86_SSE41;
476 
477     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
478     std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> outputs(kBlockSize);
479     for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) {
480       for (uint32_t i = 0; i < kBlockSize; i++) {
481         inputs[i] = uint32_as_float(std::min<uint32_t>(n + i, UINT32_C(0x7FFFFFFF)));
482       }
483       xnn_math_f32_f16_cvt__sse41(kBlockSize * sizeof(uint16_t), inputs.data(), outputs.data());
484       for (uint32_t i = 0; i < kBlockSize; i++) {
485         ASSERT_GT(outputs[i], UINT16_C(0x7C00))
486           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
487           << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[i];
488         ASSERT_LT(outputs[i], UINT16_C(0x8000))
489           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
490           << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[i];
491       }
492     }
493   }
494 
TEST(CVT__SSE41,negative_nan)495   TEST(CVT__SSE41, negative_nan) {
496     TEST_REQUIRES_X86_SSE41;
497 
498     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
499     std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> outputs(kBlockSize);
500     for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) {
501       for (uint32_t i = 0; i < kBlockSize; i++) {
502         inputs[i] = uint32_as_float(UINT32_C(0x80000000) | std::min<uint32_t>(n + i, UINT32_C(0x7FFFFFFF)));
503       }
504       xnn_math_f32_f16_cvt__sse41(kBlockSize * sizeof(uint16_t), inputs.data(), outputs.data());
505       for (uint32_t i = 0; i < kBlockSize; i++) {
506         ASSERT_GT(outputs[i], UINT16_C(0xFC00))
507           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
508           << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[i];
509       }
510     }
511   }
512 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
513 
514 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
TEST(CVT__F16C,positive_normal)515   TEST(CVT__F16C, positive_normal) {
516     TEST_REQUIRES_X86_F16C;
517 
518     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
519     std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> outputs(kBlockSize);
520     for (uint32_t n = UINT32_C(0x387FE000); n < UINT32_C(0x477FF000); n += kBlockSize) {
521       for (uint32_t i = 0; i < kBlockSize; i++) {
522         inputs[i] = uint32_as_float(n + i);
523       }
524       xnn_math_f32_f16_cvt__f16c(kBlockSize * sizeof(uint16_t), inputs.data(), outputs.data());
525       for (uint32_t i = 0; i < kBlockSize; i++) {
526         const uint16_t reference_output = fp16_ieee_from_fp32_value(inputs[i]);
527         ASSERT_EQ(reference_output, outputs[i])
528           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
529           << ", reference = 0x" << std::hex << std::setw(4) << std::setfill('0') << reference_output
530           << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[i];
531       }
532     }
533   }
534 
TEST(CVT__F16C,negative_normal)535   TEST(CVT__F16C, negative_normal) {
536     TEST_REQUIRES_X86_F16C;
537 
538     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
539     std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> outputs(kBlockSize);
540     for (uint32_t n = UINT32_C(0xB87FE000); n < UINT32_C(0xC77FF000); n += kBlockSize) {
541       for (uint32_t i = 0; i < kBlockSize; i++) {
542         inputs[i] = uint32_as_float(n + i);
543       }
544       xnn_math_f32_f16_cvt__f16c(kBlockSize * sizeof(uint16_t), inputs.data(), outputs.data());
545       for (uint32_t i = 0; i < kBlockSize; i++) {
546         const uint16_t reference_output = fp16_ieee_from_fp32_value(inputs[i]);
547         ASSERT_EQ(reference_output, outputs[i])
548           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
549           << ", reference = 0x" << std::hex << std::setw(4) << std::setfill('0') << reference_output
550           << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[i];
551       }
552     }
553   }
554 
TEST(CVT__F16C,positive_subnormal)555   TEST(CVT__F16C, positive_subnormal) {
556     TEST_REQUIRES_X86_F16C;
557 
558     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
559     std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> outputs(kBlockSize);
560     for (uint32_t n = UINT32_C(0x33000001); n < UINT32_C(0x387FE000); n += kBlockSize) {
561       for (uint32_t i = 0; i < kBlockSize; i++) {
562         inputs[i] = uint32_as_float(std::min<uint32_t>(n + i, UINT32_C(0x387FDFFF)));
563       }
564       xnn_math_f32_f16_cvt__f16c(kBlockSize * sizeof(uint16_t), inputs.data(), outputs.data());
565       for (uint32_t i = 0; i < kBlockSize; i++) {
566         const uint16_t reference_output = fp16_ieee_from_fp32_value(inputs[i]);
567         ASSERT_EQ(reference_output, outputs[i])
568           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
569           << ", reference = 0x" << std::hex << std::setw(4) << std::setfill('0') << reference_output
570           << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[i];
571       }
572     }
573   }
574 
TEST(CVT__F16C,negative_subnormal)575   TEST(CVT__F16C, negative_subnormal) {
576     TEST_REQUIRES_X86_F16C;
577 
578     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
579     std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> outputs(kBlockSize);
580     for (uint32_t n = UINT32_C(0xB3000001); n < UINT32_C(0xB87FE000); n += kBlockSize) {
581       for (uint32_t i = 0; i < kBlockSize; i++) {
582         inputs[i] = uint32_as_float(std::min<uint32_t>(n + i, UINT32_C(0xB87FDFFF)));
583       }
584       xnn_math_f32_f16_cvt__f16c(kBlockSize * sizeof(uint16_t), inputs.data(), outputs.data());
585       for (uint32_t i = 0; i < kBlockSize; i++) {
586         const uint16_t reference_output = fp16_ieee_from_fp32_value(inputs[i]);
587         ASSERT_EQ(reference_output, outputs[i])
588           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
589           << ", reference = 0x" << std::hex << std::setw(4) << std::setfill('0') << reference_output
590           << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[i];
591       }
592     }
593   }
594 
TEST(CVT__F16C,positive_underflow)595   TEST(CVT__F16C, positive_underflow) {
596     TEST_REQUIRES_X86_F16C;
597 
598     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
599     std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> outputs(kBlockSize);
600     for (uint32_t n = UINT32_C(0x00000001); n < UINT32_C(0x33000001); n += kBlockSize) {
601       for (uint32_t i = 0; i < kBlockSize; i++) {
602         inputs[i] = uint32_as_float(n + i);
603       }
604       xnn_math_f32_f16_cvt__f16c(kBlockSize * sizeof(uint16_t), inputs.data(), outputs.data());
605       for (uint32_t i = 0; i < kBlockSize; i++) {
606         const uint16_t reference_output = UINT16_C(0x0000);
607         ASSERT_EQ(reference_output, outputs[i])
608           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
609           << ", reference = 0x" << std::hex << std::setw(4) << std::setfill('0') << reference_output
610           << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[i];
611       }
612     }
613   }
614 
TEST(CVT__F16C,negative_underflow)615   TEST(CVT__F16C, negative_underflow) {
616     TEST_REQUIRES_X86_F16C;
617 
618     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
619     std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> outputs(kBlockSize);
620     for (uint32_t n = UINT32_C(0x80000001); n < UINT32_C(0xB3000001); n += kBlockSize) {
621       for (uint32_t i = 0; i < kBlockSize; i++) {
622         inputs[i] = uint32_as_float(n + i);
623       }
624       xnn_math_f32_f16_cvt__f16c(kBlockSize * sizeof(uint16_t), inputs.data(), outputs.data());
625       for (uint32_t i = 0; i < kBlockSize; i++) {
626         const uint16_t reference_output = UINT16_C(0x8000);
627         ASSERT_EQ(reference_output, outputs[i])
628           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
629           << ", reference = 0x" << std::hex << std::setw(4) << std::setfill('0') << reference_output
630           << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[i];
631       }
632     }
633   }
634 
TEST(CVT__F16C,positive_zero)635   TEST(CVT__F16C, positive_zero) {
636     TEST_REQUIRES_X86_F16C;
637 
638     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
639     std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> outputs(kBlockSize);
640     std::fill(inputs.begin(), inputs.end(), +0.0f);
641     xnn_math_f32_f16_cvt__f16c(kBlockSize * sizeof(uint16_t), inputs.data(), outputs.data());
642     const uint16_t reference_output = UINT16_C(0x0000);
643     ASSERT_EQ(reference_output, outputs[0])
644       << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[0])
645       << ", reference = 0x" << std::hex << std::setw(4) << std::setfill('0') << reference_output
646       << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[0];
647   }
648 
TEST(CVT__F16C,negative_zero)649   TEST(CVT__F16C, negative_zero) {
650     TEST_REQUIRES_X86_F16C;
651 
652     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
653     std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> outputs(kBlockSize);
654     std::fill(inputs.begin(), inputs.end(), -0.0f);
655     xnn_math_f32_f16_cvt__f16c(kBlockSize * sizeof(uint16_t), inputs.data(), outputs.data());
656     const uint16_t reference_output = UINT16_C(0x8000);
657     ASSERT_EQ(reference_output, outputs[0])
658       << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[0])
659       << ", reference = 0x" << std::hex << std::setw(4) << std::setfill('0') << reference_output
660       << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[0];
661   }
662 
TEST(CVT__F16C,positive_overflow)663   TEST(CVT__F16C, positive_overflow) {
664     TEST_REQUIRES_X86_F16C;
665 
666     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
667     std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> outputs(kBlockSize);
668     for (uint32_t n = UINT32_C(0x477FF000); n < UINT32_C(0x7F800000); n += kBlockSize) {
669       for (uint32_t i = 0; i < kBlockSize; i++) {
670         inputs[i] = uint32_as_float(n + i);
671       }
672       xnn_math_f32_f16_cvt__f16c(kBlockSize * sizeof(uint16_t), inputs.data(), outputs.data());
673       for (uint32_t i = 0; i < kBlockSize; i++) {
674         const uint16_t reference_output = UINT16_C(0x7C00);
675         ASSERT_EQ(reference_output, outputs[i])
676           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
677           << ", reference = 0x" << std::hex << std::setw(4) << std::setfill('0') << reference_output
678           << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[i];
679       }
680     }
681   }
682 
TEST(CVT__F16C,negative_overflow)683   TEST(CVT__F16C, negative_overflow) {
684     TEST_REQUIRES_X86_F16C;
685 
686     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
687     std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> outputs(kBlockSize);
688     for (uint32_t n = UINT32_C(0xC77FF000); n < UINT32_C(0xFF800000); n += kBlockSize) {
689       for (uint32_t i = 0; i < kBlockSize; i++) {
690         inputs[i] = uint32_as_float(n + i);
691       }
692       xnn_math_f32_f16_cvt__f16c(kBlockSize * sizeof(uint16_t), inputs.data(), outputs.data());
693       for (uint32_t i = 0; i < kBlockSize; i++) {
694         const uint16_t reference_output = UINT16_C(0xFC00);
695         ASSERT_EQ(reference_output, outputs[i])
696           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
697           << ", reference = 0x" << std::hex << std::setw(4) << std::setfill('0') << reference_output
698           << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[i];
699       }
700     }
701   }
702 
TEST(CVT__F16C,positive_infinity)703   TEST(CVT__F16C, positive_infinity) {
704     TEST_REQUIRES_X86_F16C;
705 
706     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
707     std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> outputs(kBlockSize);
708     std::fill(inputs.begin(), inputs.end(), +std::numeric_limits<float>::infinity());
709     xnn_math_f32_f16_cvt__f16c(kBlockSize * sizeof(uint16_t), inputs.data(), outputs.data());
710     const uint16_t reference_output = UINT16_C(0x7C00);
711     ASSERT_EQ(reference_output, outputs[0])
712       << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[0])
713       << ", reference = 0x" << std::hex << std::setw(4) << std::setfill('0') << reference_output
714       << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[0];
715   }
716 
TEST(CVT__F16C,negative_infinity)717   TEST(CVT__F16C, negative_infinity) {
718     TEST_REQUIRES_X86_F16C;
719 
720     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
721     std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> outputs(kBlockSize);
722     std::fill(inputs.begin(), inputs.end(), -std::numeric_limits<float>::infinity());
723     xnn_math_f32_f16_cvt__f16c(kBlockSize * sizeof(uint16_t), inputs.data(), outputs.data());
724     const uint16_t reference_output = UINT16_C(0xFC00);
725     ASSERT_EQ(reference_output, outputs[0])
726       << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[0])
727       << ", reference = 0x" << std::hex << std::setw(4) << std::setfill('0') << reference_output
728       << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[0];
729   }
730 
TEST(CVT__F16C,positive_nan)731   TEST(CVT__F16C, positive_nan) {
732     TEST_REQUIRES_X86_F16C;
733 
734     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
735     std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> outputs(kBlockSize);
736     for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) {
737       for (uint32_t i = 0; i < kBlockSize; i++) {
738         inputs[i] = uint32_as_float(std::min<uint32_t>(n + i, UINT32_C(0x7FFFFFFF)));
739       }
740       xnn_math_f32_f16_cvt__f16c(kBlockSize * sizeof(uint16_t), inputs.data(), outputs.data());
741       for (uint32_t i = 0; i < kBlockSize; i++) {
742         ASSERT_GT(outputs[i], UINT16_C(0x7C00))
743           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
744           << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[i];
745         ASSERT_LT(outputs[i], UINT16_C(0x8000))
746           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
747           << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[i];
748       }
749     }
750   }
751 
TEST(CVT__F16C,negative_nan)752   TEST(CVT__F16C, negative_nan) {
753     TEST_REQUIRES_X86_F16C;
754 
755     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
756     std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> outputs(kBlockSize);
757     for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) {
758       for (uint32_t i = 0; i < kBlockSize; i++) {
759         inputs[i] = uint32_as_float(UINT32_C(0x80000000) | std::min<uint32_t>(n + i, UINT32_C(0x7FFFFFFF)));
760       }
761       xnn_math_f32_f16_cvt__f16c(kBlockSize * sizeof(uint16_t), inputs.data(), outputs.data());
762       for (uint32_t i = 0; i < kBlockSize; i++) {
763         ASSERT_GT(outputs[i], UINT16_C(0xFC00))
764           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
765           << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[i];
766       }
767     }
768   }
769 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
770 
771 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
TEST(CVT__NEON,positive_normal)772   TEST(CVT__NEON, positive_normal) {
773     TEST_REQUIRES_ARM_NEON;
774 
775     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
776     std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> outputs(kBlockSize);
777     for (uint32_t n = UINT32_C(0x387FE000); n < UINT32_C(0x477FF000); n += kBlockSize) {
778       for (uint32_t i = 0; i < kBlockSize; i++) {
779         inputs[i] = uint32_as_float(n + i);
780       }
781       xnn_math_f32_f16_cvt__neon(kBlockSize * sizeof(uint16_t), inputs.data(), outputs.data());
782       for (uint32_t i = 0; i < kBlockSize; i++) {
783         const uint16_t reference_output = fp16_ieee_from_fp32_value(inputs[i]);
784         ASSERT_EQ(reference_output, outputs[i])
785           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
786           << ", reference = 0x" << std::hex << std::setw(4) << std::setfill('0') << reference_output
787           << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[i];
788       }
789     }
790   }
791 
TEST(CVT__NEON,negative_normal)792   TEST(CVT__NEON, negative_normal) {
793     TEST_REQUIRES_ARM_NEON;
794 
795     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
796     std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> outputs(kBlockSize);
797     for (uint32_t n = UINT32_C(0xB87FE000); n < UINT32_C(0xC77FF000); n += kBlockSize) {
798       for (uint32_t i = 0; i < kBlockSize; i++) {
799         inputs[i] = uint32_as_float(n + i);
800       }
801       xnn_math_f32_f16_cvt__neon(kBlockSize * sizeof(uint16_t), inputs.data(), outputs.data());
802       for (uint32_t i = 0; i < kBlockSize; i++) {
803         const uint16_t reference_output = fp16_ieee_from_fp32_value(inputs[i]);
804         ASSERT_EQ(reference_output, outputs[i])
805           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
806           << ", reference = 0x" << std::hex << std::setw(4) << std::setfill('0') << reference_output
807           << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[i];
808       }
809     }
810   }
811 
TEST(CVT__NEON,positive_subnormal)812   TEST(CVT__NEON, positive_subnormal) {
813     TEST_REQUIRES_ARM_NEON;
814 
815     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
816     std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> outputs(kBlockSize);
817     for (uint32_t n = UINT32_C(0x33000001); n < UINT32_C(0x387FE000); n += kBlockSize) {
818       for (uint32_t i = 0; i < kBlockSize; i++) {
819         inputs[i] = uint32_as_float(std::min<uint32_t>(n + i, UINT32_C(0x387FDFFF)));
820       }
821       xnn_math_f32_f16_cvt__neon(kBlockSize * sizeof(uint16_t), inputs.data(), outputs.data());
822       for (uint32_t i = 0; i < kBlockSize; i++) {
823         const uint16_t reference_output = fp16_ieee_from_fp32_value(inputs[i]);
824         ASSERT_EQ(reference_output, outputs[i])
825           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
826           << ", reference = 0x" << std::hex << std::setw(4) << std::setfill('0') << reference_output
827           << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[i];
828       }
829     }
830   }
831 
TEST(CVT__NEON,negative_subnormal)832   TEST(CVT__NEON, negative_subnormal) {
833     TEST_REQUIRES_ARM_NEON;
834 
835     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
836     std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> outputs(kBlockSize);
837     for (uint32_t n = UINT32_C(0xB3000001); n < UINT32_C(0xB87FE000); n += kBlockSize) {
838       for (uint32_t i = 0; i < kBlockSize; i++) {
839         inputs[i] = uint32_as_float(std::min<uint32_t>(n + i, UINT32_C(0xB87FDFFF)));
840       }
841       xnn_math_f32_f16_cvt__neon(kBlockSize * sizeof(uint16_t), inputs.data(), outputs.data());
842       for (uint32_t i = 0; i < kBlockSize; i++) {
843         const uint16_t reference_output = fp16_ieee_from_fp32_value(inputs[i]);
844         ASSERT_EQ(reference_output, outputs[i])
845           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
846           << ", reference = 0x" << std::hex << std::setw(4) << std::setfill('0') << reference_output
847           << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[i];
848       }
849     }
850   }
851 
TEST(CVT__NEON,positive_underflow)852   TEST(CVT__NEON, positive_underflow) {
853     TEST_REQUIRES_ARM_NEON;
854 
855     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
856     std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> outputs(kBlockSize);
857     for (uint32_t n = UINT32_C(0x00000001); n < UINT32_C(0x33000001); n += kBlockSize) {
858       for (uint32_t i = 0; i < kBlockSize; i++) {
859         inputs[i] = uint32_as_float(n + i);
860       }
861       xnn_math_f32_f16_cvt__neon(kBlockSize * sizeof(uint16_t), inputs.data(), outputs.data());
862       for (uint32_t i = 0; i < kBlockSize; i++) {
863         const uint16_t reference_output = UINT16_C(0x0000);
864         ASSERT_EQ(reference_output, outputs[i])
865           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
866           << ", reference = 0x" << std::hex << std::setw(4) << std::setfill('0') << reference_output
867           << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[i];
868       }
869     }
870   }
871 
TEST(CVT__NEON,negative_underflow)872   TEST(CVT__NEON, negative_underflow) {
873     TEST_REQUIRES_ARM_NEON;
874 
875     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
876     std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> outputs(kBlockSize);
877     for (uint32_t n = UINT32_C(0x80000001); n < UINT32_C(0xB3000001); n += kBlockSize) {
878       for (uint32_t i = 0; i < kBlockSize; i++) {
879         inputs[i] = uint32_as_float(n + i);
880       }
881       xnn_math_f32_f16_cvt__neon(kBlockSize * sizeof(uint16_t), inputs.data(), outputs.data());
882       for (uint32_t i = 0; i < kBlockSize; i++) {
883         const uint16_t reference_output = UINT16_C(0x8000);
884         ASSERT_EQ(reference_output, outputs[i])
885           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
886           << ", reference = 0x" << std::hex << std::setw(4) << std::setfill('0') << reference_output
887           << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[i];
888       }
889     }
890   }
891 
TEST(CVT__NEON,positive_zero)892   TEST(CVT__NEON, positive_zero) {
893     TEST_REQUIRES_ARM_NEON;
894 
895     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
896     std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> outputs(kBlockSize);
897     std::fill(inputs.begin(), inputs.end(), +0.0f);
898     xnn_math_f32_f16_cvt__neon(kBlockSize * sizeof(uint16_t), inputs.data(), outputs.data());
899     const uint16_t reference_output = UINT16_C(0x0000);
900     ASSERT_EQ(reference_output, outputs[0])
901       << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[0])
902       << ", reference = 0x" << std::hex << std::setw(4) << std::setfill('0') << reference_output
903       << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[0];
904   }
905 
TEST(CVT__NEON,negative_zero)906   TEST(CVT__NEON, negative_zero) {
907     TEST_REQUIRES_ARM_NEON;
908 
909     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
910     std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> outputs(kBlockSize);
911     std::fill(inputs.begin(), inputs.end(), -0.0f);
912     xnn_math_f32_f16_cvt__neon(kBlockSize * sizeof(uint16_t), inputs.data(), outputs.data());
913     const uint16_t reference_output = UINT16_C(0x8000);
914     ASSERT_EQ(reference_output, outputs[0])
915       << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[0])
916       << ", reference = 0x" << std::hex << std::setw(4) << std::setfill('0') << reference_output
917       << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[0];
918   }
919 
TEST(CVT__NEON,positive_overflow)920   TEST(CVT__NEON, positive_overflow) {
921     TEST_REQUIRES_ARM_NEON;
922 
923     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
924     std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> outputs(kBlockSize);
925     for (uint32_t n = UINT32_C(0x477FF000); n < UINT32_C(0x7F800000); n += kBlockSize) {
926       for (uint32_t i = 0; i < kBlockSize; i++) {
927         inputs[i] = uint32_as_float(n + i);
928       }
929       xnn_math_f32_f16_cvt__neon(kBlockSize * sizeof(uint16_t), inputs.data(), outputs.data());
930       for (uint32_t i = 0; i < kBlockSize; i++) {
931         const uint16_t reference_output = UINT16_C(0x7C00);
932         ASSERT_EQ(reference_output, outputs[i])
933           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
934           << ", reference = 0x" << std::hex << std::setw(4) << std::setfill('0') << reference_output
935           << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[i];
936       }
937     }
938   }
939 
TEST(CVT__NEON,negative_overflow)940   TEST(CVT__NEON, negative_overflow) {
941     TEST_REQUIRES_ARM_NEON;
942 
943     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
944     std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> outputs(kBlockSize);
945     for (uint32_t n = UINT32_C(0xC77FF000); n < UINT32_C(0xFF800000); n += kBlockSize) {
946       for (uint32_t i = 0; i < kBlockSize; i++) {
947         inputs[i] = uint32_as_float(n + i);
948       }
949       xnn_math_f32_f16_cvt__neon(kBlockSize * sizeof(uint16_t), inputs.data(), outputs.data());
950       for (uint32_t i = 0; i < kBlockSize; i++) {
951         const uint16_t reference_output = UINT16_C(0xFC00);
952         ASSERT_EQ(reference_output, outputs[i])
953           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
954           << ", reference = 0x" << std::hex << std::setw(4) << std::setfill('0') << reference_output
955           << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[i];
956       }
957     }
958   }
959 
TEST(CVT__NEON,positive_infinity)960   TEST(CVT__NEON, positive_infinity) {
961     TEST_REQUIRES_ARM_NEON;
962 
963     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
964     std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> outputs(kBlockSize);
965     std::fill(inputs.begin(), inputs.end(), +std::numeric_limits<float>::infinity());
966     xnn_math_f32_f16_cvt__neon(kBlockSize * sizeof(uint16_t), inputs.data(), outputs.data());
967     const uint16_t reference_output = UINT16_C(0x7C00);
968     ASSERT_EQ(reference_output, outputs[0])
969       << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[0])
970       << ", reference = 0x" << std::hex << std::setw(4) << std::setfill('0') << reference_output
971       << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[0];
972   }
973 
TEST(CVT__NEON,negative_infinity)974   TEST(CVT__NEON, negative_infinity) {
975     TEST_REQUIRES_ARM_NEON;
976 
977     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
978     std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> outputs(kBlockSize);
979     std::fill(inputs.begin(), inputs.end(), -std::numeric_limits<float>::infinity());
980     xnn_math_f32_f16_cvt__neon(kBlockSize * sizeof(uint16_t), inputs.data(), outputs.data());
981     const uint16_t reference_output = UINT16_C(0xFC00);
982     ASSERT_EQ(reference_output, outputs[0])
983       << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[0])
984       << ", reference = 0x" << std::hex << std::setw(4) << std::setfill('0') << reference_output
985       << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[0];
986   }
987 
TEST(CVT__NEON,positive_nan)988   TEST(CVT__NEON, positive_nan) {
989     TEST_REQUIRES_ARM_NEON;
990 
991     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
992     std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> outputs(kBlockSize);
993     for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) {
994       for (uint32_t i = 0; i < kBlockSize; i++) {
995         inputs[i] = uint32_as_float(std::min<uint32_t>(n + i, UINT32_C(0x7FFFFFFF)));
996       }
997       xnn_math_f32_f16_cvt__neon(kBlockSize * sizeof(uint16_t), inputs.data(), outputs.data());
998       for (uint32_t i = 0; i < kBlockSize; i++) {
999         ASSERT_GT(outputs[i], UINT16_C(0x7C00))
1000           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
1001           << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[i];
1002         ASSERT_LT(outputs[i], UINT16_C(0x8000))
1003           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
1004           << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[i];
1005       }
1006     }
1007   }
1008 
TEST(CVT__NEON,negative_nan)1009   TEST(CVT__NEON, negative_nan) {
1010     TEST_REQUIRES_ARM_NEON;
1011 
1012     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1013     std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> outputs(kBlockSize);
1014     for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) {
1015       for (uint32_t i = 0; i < kBlockSize; i++) {
1016         inputs[i] = uint32_as_float(UINT32_C(0x80000000) | std::min<uint32_t>(n + i, UINT32_C(0x7FFFFFFF)));
1017       }
1018       xnn_math_f32_f16_cvt__neon(kBlockSize * sizeof(uint16_t), inputs.data(), outputs.data());
1019       for (uint32_t i = 0; i < kBlockSize; i++) {
1020         ASSERT_GT(outputs[i], UINT16_C(0xFC00))
1021           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
1022           << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[i];
1023       }
1024     }
1025   }
1026 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
1027 
1028 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
TEST(CVT__NEONFP16,positive_normal)1029   TEST(CVT__NEONFP16, positive_normal) {
1030     TEST_REQUIRES_ARM_NEON_FP16;
1031 
1032     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1033     std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> outputs(kBlockSize);
1034     for (uint32_t n = UINT32_C(0x387FE000); n < UINT32_C(0x477FF000); n += kBlockSize) {
1035       for (uint32_t i = 0; i < kBlockSize; i++) {
1036         inputs[i] = uint32_as_float(n + i);
1037       }
1038       xnn_math_f32_f16_cvt__neonfp16(kBlockSize * sizeof(uint16_t), inputs.data(), outputs.data());
1039       for (uint32_t i = 0; i < kBlockSize; i++) {
1040         const uint16_t reference_output = fp16_ieee_from_fp32_value(inputs[i]);
1041         ASSERT_EQ(reference_output, outputs[i])
1042           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
1043           << ", reference = 0x" << std::hex << std::setw(4) << std::setfill('0') << reference_output
1044           << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[i];
1045       }
1046     }
1047   }
1048 
TEST(CVT__NEONFP16,negative_normal)1049   TEST(CVT__NEONFP16, negative_normal) {
1050     TEST_REQUIRES_ARM_NEON_FP16;
1051 
1052     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1053     std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> outputs(kBlockSize);
1054     for (uint32_t n = UINT32_C(0xB87FE000); n < UINT32_C(0xC77FF000); n += kBlockSize) {
1055       for (uint32_t i = 0; i < kBlockSize; i++) {
1056         inputs[i] = uint32_as_float(n + i);
1057       }
1058       xnn_math_f32_f16_cvt__neonfp16(kBlockSize * sizeof(uint16_t), inputs.data(), outputs.data());
1059       for (uint32_t i = 0; i < kBlockSize; i++) {
1060         const uint16_t reference_output = fp16_ieee_from_fp32_value(inputs[i]);
1061         ASSERT_EQ(reference_output, outputs[i])
1062           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
1063           << ", reference = 0x" << std::hex << std::setw(4) << std::setfill('0') << reference_output
1064           << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[i];
1065       }
1066     }
1067   }
1068 
TEST(CVT__NEONFP16,positive_subnormal)1069   TEST(CVT__NEONFP16, positive_subnormal) {
1070     TEST_REQUIRES_ARM_NEON_FP16;
1071 
1072     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1073     std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> outputs(kBlockSize);
1074     for (uint32_t n = UINT32_C(0x33000001); n < UINT32_C(0x387FE000); n += kBlockSize) {
1075       for (uint32_t i = 0; i < kBlockSize; i++) {
1076         inputs[i] = uint32_as_float(std::min<uint32_t>(n + i, UINT32_C(0x387FDFFF)));
1077       }
1078       xnn_math_f32_f16_cvt__neonfp16(kBlockSize * sizeof(uint16_t), inputs.data(), outputs.data());
1079       for (uint32_t i = 0; i < kBlockSize; i++) {
1080         const uint16_t reference_output = fp16_ieee_from_fp32_value(inputs[i]);
1081         ASSERT_EQ(reference_output, outputs[i])
1082           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
1083           << ", reference = 0x" << std::hex << std::setw(4) << std::setfill('0') << reference_output
1084           << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[i];
1085       }
1086     }
1087   }
1088 
TEST(CVT__NEONFP16,negative_subnormal)1089   TEST(CVT__NEONFP16, negative_subnormal) {
1090     TEST_REQUIRES_ARM_NEON_FP16;
1091 
1092     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1093     std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> outputs(kBlockSize);
1094     for (uint32_t n = UINT32_C(0xB3000001); n < UINT32_C(0xB87FE000); n += kBlockSize) {
1095       for (uint32_t i = 0; i < kBlockSize; i++) {
1096         inputs[i] = uint32_as_float(std::min<uint32_t>(n + i, UINT32_C(0xB87FDFFF)));
1097       }
1098       xnn_math_f32_f16_cvt__neonfp16(kBlockSize * sizeof(uint16_t), inputs.data(), outputs.data());
1099       for (uint32_t i = 0; i < kBlockSize; i++) {
1100         const uint16_t reference_output = fp16_ieee_from_fp32_value(inputs[i]);
1101         ASSERT_EQ(reference_output, outputs[i])
1102           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
1103           << ", reference = 0x" << std::hex << std::setw(4) << std::setfill('0') << reference_output
1104           << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[i];
1105       }
1106     }
1107   }
1108 
TEST(CVT__NEONFP16,positive_underflow)1109   TEST(CVT__NEONFP16, positive_underflow) {
1110     TEST_REQUIRES_ARM_NEON_FP16;
1111 
1112     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1113     std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> outputs(kBlockSize);
1114     for (uint32_t n = UINT32_C(0x00000001); n < UINT32_C(0x33000001); n += kBlockSize) {
1115       for (uint32_t i = 0; i < kBlockSize; i++) {
1116         inputs[i] = uint32_as_float(n + i);
1117       }
1118       xnn_math_f32_f16_cvt__neonfp16(kBlockSize * sizeof(uint16_t), inputs.data(), outputs.data());
1119       for (uint32_t i = 0; i < kBlockSize; i++) {
1120         const uint16_t reference_output = UINT16_C(0x0000);
1121         ASSERT_EQ(reference_output, outputs[i])
1122           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
1123           << ", reference = 0x" << std::hex << std::setw(4) << std::setfill('0') << reference_output
1124           << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[i];
1125       }
1126     }
1127   }
1128 
TEST(CVT__NEONFP16,negative_underflow)1129   TEST(CVT__NEONFP16, negative_underflow) {
1130     TEST_REQUIRES_ARM_NEON_FP16;
1131 
1132     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1133     std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> outputs(kBlockSize);
1134     for (uint32_t n = UINT32_C(0x80000001); n < UINT32_C(0xB3000001); n += kBlockSize) {
1135       for (uint32_t i = 0; i < kBlockSize; i++) {
1136         inputs[i] = uint32_as_float(n + i);
1137       }
1138       xnn_math_f32_f16_cvt__neonfp16(kBlockSize * sizeof(uint16_t), inputs.data(), outputs.data());
1139       for (uint32_t i = 0; i < kBlockSize; i++) {
1140         const uint16_t reference_output = UINT16_C(0x8000);
1141         ASSERT_EQ(reference_output, outputs[i])
1142           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
1143           << ", reference = 0x" << std::hex << std::setw(4) << std::setfill('0') << reference_output
1144           << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[i];
1145       }
1146     }
1147   }
1148 
TEST(CVT__NEONFP16,positive_zero)1149   TEST(CVT__NEONFP16, positive_zero) {
1150     TEST_REQUIRES_ARM_NEON_FP16;
1151 
1152     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1153     std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> outputs(kBlockSize);
1154     std::fill(inputs.begin(), inputs.end(), +0.0f);
1155     xnn_math_f32_f16_cvt__neonfp16(kBlockSize * sizeof(uint16_t), inputs.data(), outputs.data());
1156     const uint16_t reference_output = UINT16_C(0x0000);
1157     ASSERT_EQ(reference_output, outputs[0])
1158       << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[0])
1159       << ", reference = 0x" << std::hex << std::setw(4) << std::setfill('0') << reference_output
1160       << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[0];
1161   }
1162 
TEST(CVT__NEONFP16,negative_zero)1163   TEST(CVT__NEONFP16, negative_zero) {
1164     TEST_REQUIRES_ARM_NEON_FP16;
1165 
1166     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1167     std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> outputs(kBlockSize);
1168     std::fill(inputs.begin(), inputs.end(), -0.0f);
1169     xnn_math_f32_f16_cvt__neonfp16(kBlockSize * sizeof(uint16_t), inputs.data(), outputs.data());
1170     const uint16_t reference_output = UINT16_C(0x8000);
1171     ASSERT_EQ(reference_output, outputs[0])
1172       << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[0])
1173       << ", reference = 0x" << std::hex << std::setw(4) << std::setfill('0') << reference_output
1174       << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[0];
1175   }
1176 
TEST(CVT__NEONFP16,positive_overflow)1177   TEST(CVT__NEONFP16, positive_overflow) {
1178     TEST_REQUIRES_ARM_NEON_FP16;
1179 
1180     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1181     std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> outputs(kBlockSize);
1182     for (uint32_t n = UINT32_C(0x477FF000); n < UINT32_C(0x7F800000); n += kBlockSize) {
1183       for (uint32_t i = 0; i < kBlockSize; i++) {
1184         inputs[i] = uint32_as_float(n + i);
1185       }
1186       xnn_math_f32_f16_cvt__neonfp16(kBlockSize * sizeof(uint16_t), inputs.data(), outputs.data());
1187       for (uint32_t i = 0; i < kBlockSize; i++) {
1188         const uint16_t reference_output = UINT16_C(0x7C00);
1189         ASSERT_EQ(reference_output, outputs[i])
1190           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
1191           << ", reference = 0x" << std::hex << std::setw(4) << std::setfill('0') << reference_output
1192           << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[i];
1193       }
1194     }
1195   }
1196 
TEST(CVT__NEONFP16,negative_overflow)1197   TEST(CVT__NEONFP16, negative_overflow) {
1198     TEST_REQUIRES_ARM_NEON_FP16;
1199 
1200     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1201     std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> outputs(kBlockSize);
1202     for (uint32_t n = UINT32_C(0xC77FF000); n < UINT32_C(0xFF800000); n += kBlockSize) {
1203       for (uint32_t i = 0; i < kBlockSize; i++) {
1204         inputs[i] = uint32_as_float(n + i);
1205       }
1206       xnn_math_f32_f16_cvt__neonfp16(kBlockSize * sizeof(uint16_t), inputs.data(), outputs.data());
1207       for (uint32_t i = 0; i < kBlockSize; i++) {
1208         const uint16_t reference_output = UINT16_C(0xFC00);
1209         ASSERT_EQ(reference_output, outputs[i])
1210           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
1211           << ", reference = 0x" << std::hex << std::setw(4) << std::setfill('0') << reference_output
1212           << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[i];
1213       }
1214     }
1215   }
1216 
TEST(CVT__NEONFP16,positive_infinity)1217   TEST(CVT__NEONFP16, positive_infinity) {
1218     TEST_REQUIRES_ARM_NEON_FP16;
1219 
1220     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1221     std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> outputs(kBlockSize);
1222     std::fill(inputs.begin(), inputs.end(), +std::numeric_limits<float>::infinity());
1223     xnn_math_f32_f16_cvt__neonfp16(kBlockSize * sizeof(uint16_t), inputs.data(), outputs.data());
1224     const uint16_t reference_output = UINT16_C(0x7C00);
1225     ASSERT_EQ(reference_output, outputs[0])
1226       << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[0])
1227       << ", reference = 0x" << std::hex << std::setw(4) << std::setfill('0') << reference_output
1228       << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[0];
1229   }
1230 
TEST(CVT__NEONFP16,negative_infinity)1231   TEST(CVT__NEONFP16, negative_infinity) {
1232     TEST_REQUIRES_ARM_NEON_FP16;
1233 
1234     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1235     std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> outputs(kBlockSize);
1236     std::fill(inputs.begin(), inputs.end(), -std::numeric_limits<float>::infinity());
1237     xnn_math_f32_f16_cvt__neonfp16(kBlockSize * sizeof(uint16_t), inputs.data(), outputs.data());
1238     const uint16_t reference_output = UINT16_C(0xFC00);
1239     ASSERT_EQ(reference_output, outputs[0])
1240       << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[0])
1241       << ", reference = 0x" << std::hex << std::setw(4) << std::setfill('0') << reference_output
1242       << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[0];
1243   }
1244 
TEST(CVT__NEONFP16,positive_nan)1245   TEST(CVT__NEONFP16, positive_nan) {
1246     TEST_REQUIRES_ARM_NEON_FP16;
1247 
1248     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1249     std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> outputs(kBlockSize);
1250     for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) {
1251       for (uint32_t i = 0; i < kBlockSize; i++) {
1252         inputs[i] = uint32_as_float(std::min<uint32_t>(n + i, UINT32_C(0x7FFFFFFF)));
1253       }
1254       xnn_math_f32_f16_cvt__neonfp16(kBlockSize * sizeof(uint16_t), inputs.data(), outputs.data());
1255       for (uint32_t i = 0; i < kBlockSize; i++) {
1256         ASSERT_GT(outputs[i], UINT16_C(0x7C00))
1257           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
1258           << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[i];
1259         ASSERT_LT(outputs[i], UINT16_C(0x8000))
1260           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
1261           << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[i];
1262       }
1263     }
1264   }
1265 
TEST(CVT__NEONFP16,negative_nan)1266   TEST(CVT__NEONFP16, negative_nan) {
1267     TEST_REQUIRES_ARM_NEON_FP16;
1268 
1269     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1270     std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> outputs(kBlockSize);
1271     for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) {
1272       for (uint32_t i = 0; i < kBlockSize; i++) {
1273         inputs[i] = uint32_as_float(UINT32_C(0x80000000) | std::min<uint32_t>(n + i, UINT32_C(0x7FFFFFFF)));
1274       }
1275       xnn_math_f32_f16_cvt__neonfp16(kBlockSize * sizeof(uint16_t), inputs.data(), outputs.data());
1276       for (uint32_t i = 0; i < kBlockSize; i++) {
1277         ASSERT_GT(outputs[i], UINT16_C(0xFC00))
1278           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
1279           << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[i];
1280       }
1281     }
1282   }
1283 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
1284 
1285 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
TEST(CVT__WASMSIMD,positive_normal)1286   TEST(CVT__WASMSIMD, positive_normal) {
1287     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1288     std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> outputs(kBlockSize);
1289     for (uint32_t n = UINT32_C(0x387FE000); n < UINT32_C(0x477FF000); n += kBlockSize) {
1290       for (uint32_t i = 0; i < kBlockSize; i++) {
1291         inputs[i] = uint32_as_float(n + i);
1292       }
1293       xnn_math_f32_f16_cvt__wasmsimd(kBlockSize * sizeof(uint16_t), inputs.data(), outputs.data());
1294       for (uint32_t i = 0; i < kBlockSize; i++) {
1295         const uint16_t reference_output = fp16_ieee_from_fp32_value(inputs[i]);
1296         ASSERT_EQ(reference_output, outputs[i])
1297           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
1298           << ", reference = 0x" << std::hex << std::setw(4) << std::setfill('0') << reference_output
1299           << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[i];
1300       }
1301     }
1302   }
1303 
TEST(CVT__WASMSIMD,negative_normal)1304   TEST(CVT__WASMSIMD, negative_normal) {
1305     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1306     std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> outputs(kBlockSize);
1307     for (uint32_t n = UINT32_C(0xB87FE000); n < UINT32_C(0xC77FF000); n += kBlockSize) {
1308       for (uint32_t i = 0; i < kBlockSize; i++) {
1309         inputs[i] = uint32_as_float(n + i);
1310       }
1311       xnn_math_f32_f16_cvt__wasmsimd(kBlockSize * sizeof(uint16_t), inputs.data(), outputs.data());
1312       for (uint32_t i = 0; i < kBlockSize; i++) {
1313         const uint16_t reference_output = fp16_ieee_from_fp32_value(inputs[i]);
1314         ASSERT_EQ(reference_output, outputs[i])
1315           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
1316           << ", reference = 0x" << std::hex << std::setw(4) << std::setfill('0') << reference_output
1317           << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[i];
1318       }
1319     }
1320   }
1321 
TEST(CVT__WASMSIMD,positive_subnormal)1322   TEST(CVT__WASMSIMD, positive_subnormal) {
1323     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1324     std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> outputs(kBlockSize);
1325     for (uint32_t n = UINT32_C(0x33000001); n < UINT32_C(0x387FE000); n += kBlockSize) {
1326       for (uint32_t i = 0; i < kBlockSize; i++) {
1327         inputs[i] = uint32_as_float(std::min<uint32_t>(n + i, UINT32_C(0x387FDFFF)));
1328       }
1329       xnn_math_f32_f16_cvt__wasmsimd(kBlockSize * sizeof(uint16_t), inputs.data(), outputs.data());
1330       for (uint32_t i = 0; i < kBlockSize; i++) {
1331         const uint16_t reference_output = fp16_ieee_from_fp32_value(inputs[i]);
1332         ASSERT_EQ(reference_output, outputs[i])
1333           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
1334           << ", reference = 0x" << std::hex << std::setw(4) << std::setfill('0') << reference_output
1335           << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[i];
1336       }
1337     }
1338   }
1339 
TEST(CVT__WASMSIMD,negative_subnormal)1340   TEST(CVT__WASMSIMD, negative_subnormal) {
1341     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1342     std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> outputs(kBlockSize);
1343     for (uint32_t n = UINT32_C(0xB3000001); n < UINT32_C(0xB87FE000); n += kBlockSize) {
1344       for (uint32_t i = 0; i < kBlockSize; i++) {
1345         inputs[i] = uint32_as_float(std::min<uint32_t>(n + i, UINT32_C(0xB87FDFFF)));
1346       }
1347       xnn_math_f32_f16_cvt__wasmsimd(kBlockSize * sizeof(uint16_t), inputs.data(), outputs.data());
1348       for (uint32_t i = 0; i < kBlockSize; i++) {
1349         const uint16_t reference_output = fp16_ieee_from_fp32_value(inputs[i]);
1350         ASSERT_EQ(reference_output, outputs[i])
1351           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
1352           << ", reference = 0x" << std::hex << std::setw(4) << std::setfill('0') << reference_output
1353           << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[i];
1354       }
1355     }
1356   }
1357 
TEST(CVT__WASMSIMD,positive_underflow)1358   TEST(CVT__WASMSIMD, positive_underflow) {
1359     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1360     std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> outputs(kBlockSize);
1361     for (uint32_t n = UINT32_C(0x00000001); n < UINT32_C(0x33000001); n += kBlockSize) {
1362       for (uint32_t i = 0; i < kBlockSize; i++) {
1363         inputs[i] = uint32_as_float(n + i);
1364       }
1365       xnn_math_f32_f16_cvt__wasmsimd(kBlockSize * sizeof(uint16_t), inputs.data(), outputs.data());
1366       for (uint32_t i = 0; i < kBlockSize; i++) {
1367         const uint16_t reference_output = UINT16_C(0x0000);
1368         ASSERT_EQ(reference_output, outputs[i])
1369           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
1370           << ", reference = 0x" << std::hex << std::setw(4) << std::setfill('0') << reference_output
1371           << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[i];
1372       }
1373     }
1374   }
1375 
TEST(CVT__WASMSIMD,negative_underflow)1376   TEST(CVT__WASMSIMD, negative_underflow) {
1377     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1378     std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> outputs(kBlockSize);
1379     for (uint32_t n = UINT32_C(0x80000001); n < UINT32_C(0xB3000001); n += kBlockSize) {
1380       for (uint32_t i = 0; i < kBlockSize; i++) {
1381         inputs[i] = uint32_as_float(n + i);
1382       }
1383       xnn_math_f32_f16_cvt__wasmsimd(kBlockSize * sizeof(uint16_t), inputs.data(), outputs.data());
1384       for (uint32_t i = 0; i < kBlockSize; i++) {
1385         const uint16_t reference_output = UINT16_C(0x8000);
1386         ASSERT_EQ(reference_output, outputs[i])
1387           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
1388           << ", reference = 0x" << std::hex << std::setw(4) << std::setfill('0') << reference_output
1389           << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[i];
1390       }
1391     }
1392   }
1393 
TEST(CVT__WASMSIMD,positive_zero)1394   TEST(CVT__WASMSIMD, positive_zero) {
1395     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1396     std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> outputs(kBlockSize);
1397     std::fill(inputs.begin(), inputs.end(), +0.0f);
1398     xnn_math_f32_f16_cvt__wasmsimd(kBlockSize * sizeof(uint16_t), inputs.data(), outputs.data());
1399     const uint16_t reference_output = UINT16_C(0x0000);
1400     ASSERT_EQ(reference_output, outputs[0])
1401       << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[0])
1402       << ", reference = 0x" << std::hex << std::setw(4) << std::setfill('0') << reference_output
1403       << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[0];
1404   }
1405 
TEST(CVT__WASMSIMD,negative_zero)1406   TEST(CVT__WASMSIMD, negative_zero) {
1407     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1408     std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> outputs(kBlockSize);
1409     std::fill(inputs.begin(), inputs.end(), -0.0f);
1410     xnn_math_f32_f16_cvt__wasmsimd(kBlockSize * sizeof(uint16_t), inputs.data(), outputs.data());
1411     const uint16_t reference_output = UINT16_C(0x8000);
1412     ASSERT_EQ(reference_output, outputs[0])
1413       << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[0])
1414       << ", reference = 0x" << std::hex << std::setw(4) << std::setfill('0') << reference_output
1415       << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[0];
1416   }
1417 
TEST(CVT__WASMSIMD,positive_overflow)1418   TEST(CVT__WASMSIMD, positive_overflow) {
1419     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1420     std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> outputs(kBlockSize);
1421     for (uint32_t n = UINT32_C(0x477FF000); n < UINT32_C(0x7F800000); n += kBlockSize) {
1422       for (uint32_t i = 0; i < kBlockSize; i++) {
1423         inputs[i] = uint32_as_float(n + i);
1424       }
1425       xnn_math_f32_f16_cvt__wasmsimd(kBlockSize * sizeof(uint16_t), inputs.data(), outputs.data());
1426       for (uint32_t i = 0; i < kBlockSize; i++) {
1427         const uint16_t reference_output = UINT16_C(0x7C00);
1428         ASSERT_EQ(reference_output, outputs[i])
1429           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
1430           << ", reference = 0x" << std::hex << std::setw(4) << std::setfill('0') << reference_output
1431           << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[i];
1432       }
1433     }
1434   }
1435 
TEST(CVT__WASMSIMD,negative_overflow)1436   TEST(CVT__WASMSIMD, negative_overflow) {
1437     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1438     std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> outputs(kBlockSize);
1439     for (uint32_t n = UINT32_C(0xC77FF000); n < UINT32_C(0xFF800000); n += kBlockSize) {
1440       for (uint32_t i = 0; i < kBlockSize; i++) {
1441         inputs[i] = uint32_as_float(n + i);
1442       }
1443       xnn_math_f32_f16_cvt__wasmsimd(kBlockSize * sizeof(uint16_t), inputs.data(), outputs.data());
1444       for (uint32_t i = 0; i < kBlockSize; i++) {
1445         const uint16_t reference_output = UINT16_C(0xFC00);
1446         ASSERT_EQ(reference_output, outputs[i])
1447           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
1448           << ", reference = 0x" << std::hex << std::setw(4) << std::setfill('0') << reference_output
1449           << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[i];
1450       }
1451     }
1452   }
1453 
TEST(CVT__WASMSIMD,positive_infinity)1454   TEST(CVT__WASMSIMD, positive_infinity) {
1455     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1456     std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> outputs(kBlockSize);
1457     std::fill(inputs.begin(), inputs.end(), +std::numeric_limits<float>::infinity());
1458     xnn_math_f32_f16_cvt__wasmsimd(kBlockSize * sizeof(uint16_t), inputs.data(), outputs.data());
1459     const uint16_t reference_output = UINT16_C(0x7C00);
1460     ASSERT_EQ(reference_output, outputs[0])
1461       << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[0])
1462       << ", reference = 0x" << std::hex << std::setw(4) << std::setfill('0') << reference_output
1463       << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[0];
1464   }
1465 
TEST(CVT__WASMSIMD,negative_infinity)1466   TEST(CVT__WASMSIMD, negative_infinity) {
1467     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1468     std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> outputs(kBlockSize);
1469     std::fill(inputs.begin(), inputs.end(), -std::numeric_limits<float>::infinity());
1470     xnn_math_f32_f16_cvt__wasmsimd(kBlockSize * sizeof(uint16_t), inputs.data(), outputs.data());
1471     const uint16_t reference_output = UINT16_C(0xFC00);
1472     ASSERT_EQ(reference_output, outputs[0])
1473       << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[0])
1474       << ", reference = 0x" << std::hex << std::setw(4) << std::setfill('0') << reference_output
1475       << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[0];
1476   }
1477 
TEST(CVT__WASMSIMD,positive_nan)1478   TEST(CVT__WASMSIMD, positive_nan) {
1479     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1480     std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> outputs(kBlockSize);
1481     for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) {
1482       for (uint32_t i = 0; i < kBlockSize; i++) {
1483         inputs[i] = uint32_as_float(std::min<uint32_t>(n + i, UINT32_C(0x7FFFFFFF)));
1484       }
1485       xnn_math_f32_f16_cvt__wasmsimd(kBlockSize * sizeof(uint16_t), inputs.data(), outputs.data());
1486       for (uint32_t i = 0; i < kBlockSize; i++) {
1487         ASSERT_GT(outputs[i], UINT16_C(0x7C00))
1488           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
1489           << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[i];
1490         ASSERT_LT(outputs[i], UINT16_C(0x8000))
1491           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
1492           << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[i];
1493       }
1494     }
1495   }
1496 
TEST(CVT__WASMSIMD,negative_nan)1497   TEST(CVT__WASMSIMD, negative_nan) {
1498     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1499     std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> outputs(kBlockSize);
1500     for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) {
1501       for (uint32_t i = 0; i < kBlockSize; i++) {
1502         inputs[i] = uint32_as_float(UINT32_C(0x80000000) | std::min<uint32_t>(n + i, UINT32_C(0x7FFFFFFF)));
1503       }
1504       xnn_math_f32_f16_cvt__wasmsimd(kBlockSize * sizeof(uint16_t), inputs.data(), outputs.data());
1505       for (uint32_t i = 0; i < kBlockSize; i++) {
1506         ASSERT_GT(outputs[i], UINT16_C(0xFC00))
1507           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
1508           << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[i];
1509       }
1510     }
1511   }
1512 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
1513 
TEST(CVT__SCALAR_BITCAST, positive_normal) {
  std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
  std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> outputs(kBlockSize);
  // Sweep every float32 bit pattern that converts to a positive normal f16.
  for (uint32_t block_start = UINT32_C(0x387FE000); block_start < UINT32_C(0x477FF000); block_start += kBlockSize) {
    for (uint32_t idx = 0; idx < kBlockSize; idx++) {
      inputs[idx] = uint32_as_float(block_start + idx);
    }
    xnn_math_f32_f16_cvt__scalar_bitcast(kBlockSize * sizeof(uint16_t), inputs.data(), outputs.data());
    for (uint32_t idx = 0; idx < kBlockSize; idx++) {
      // Scalar reference conversion from the fp16 library.
      const uint16_t reference_output = fp16_ieee_from_fp32_value(inputs[idx]);
      ASSERT_EQ(reference_output, outputs[idx])
        << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[idx])
        << ", reference = 0x" << std::hex << std::setw(4) << std::setfill('0') << reference_output
        << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[idx];
    }
  }
}
1531 
TEST(CVT__SCALAR_BITCAST, negative_normal) {
  std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
  std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> outputs(kBlockSize);
  // Sweep every float32 bit pattern that converts to a negative normal f16.
  for (uint32_t block_start = UINT32_C(0xB87FE000); block_start < UINT32_C(0xC77FF000); block_start += kBlockSize) {
    for (uint32_t idx = 0; idx < kBlockSize; idx++) {
      inputs[idx] = uint32_as_float(block_start + idx);
    }
    xnn_math_f32_f16_cvt__scalar_bitcast(kBlockSize * sizeof(uint16_t), inputs.data(), outputs.data());
    for (uint32_t idx = 0; idx < kBlockSize; idx++) {
      // Scalar reference conversion from the fp16 library.
      const uint16_t reference_output = fp16_ieee_from_fp32_value(inputs[idx]);
      ASSERT_EQ(reference_output, outputs[idx])
        << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[idx])
        << ", reference = 0x" << std::hex << std::setw(4) << std::setfill('0') << reference_output
        << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[idx];
    }
  }
}
1549 
TEST(CVT__SCALAR_BITCAST, positive_subnormal) {
  std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
  std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> outputs(kBlockSize);
  // Sweep every float32 bit pattern that rounds to a positive f16 subnormal.
  for (uint32_t block_start = UINT32_C(0x33000001); block_start < UINT32_C(0x387FE000); block_start += kBlockSize) {
    for (uint32_t idx = 0; idx < kBlockSize; idx++) {
      // Clamp so the tail of the final block stays inside the subnormal range.
      inputs[idx] = uint32_as_float(std::min<uint32_t>(block_start + idx, UINT32_C(0x387FDFFF)));
    }
    xnn_math_f32_f16_cvt__scalar_bitcast(kBlockSize * sizeof(uint16_t), inputs.data(), outputs.data());
    for (uint32_t idx = 0; idx < kBlockSize; idx++) {
      const uint16_t reference_output = fp16_ieee_from_fp32_value(inputs[idx]);
      ASSERT_EQ(reference_output, outputs[idx])
        << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[idx])
        << ", reference = 0x" << std::hex << std::setw(4) << std::setfill('0') << reference_output
        << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[idx];
    }
  }
}
1567 
TEST(CVT__SCALAR_BITCAST, negative_subnormal) {
  std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
  std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> outputs(kBlockSize);
  // Sweep every float32 bit pattern that rounds to a negative f16 subnormal.
  for (uint32_t block_start = UINT32_C(0xB3000001); block_start < UINT32_C(0xB87FE000); block_start += kBlockSize) {
    for (uint32_t idx = 0; idx < kBlockSize; idx++) {
      // Clamp so the tail of the final block stays inside the subnormal range.
      inputs[idx] = uint32_as_float(std::min<uint32_t>(block_start + idx, UINT32_C(0xB87FDFFF)));
    }
    xnn_math_f32_f16_cvt__scalar_bitcast(kBlockSize * sizeof(uint16_t), inputs.data(), outputs.data());
    for (uint32_t idx = 0; idx < kBlockSize; idx++) {
      const uint16_t reference_output = fp16_ieee_from_fp32_value(inputs[idx]);
      ASSERT_EQ(reference_output, outputs[idx])
        << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[idx])
        << ", reference = 0x" << std::hex << std::setw(4) << std::setfill('0') << reference_output
        << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[idx];
    }
  }
}
1585 
TEST(CVT__SCALAR_BITCAST, positive_underflow) {
  std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
  std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> outputs(kBlockSize);
  // All positive values below the smallest f16 subnormal must flush to +0.
  const uint16_t reference_output = UINT16_C(0x0000);
  for (uint32_t block_start = UINT32_C(0x00000001); block_start < UINT32_C(0x33000001); block_start += kBlockSize) {
    for (uint32_t idx = 0; idx < kBlockSize; idx++) {
      inputs[idx] = uint32_as_float(block_start + idx);
    }
    xnn_math_f32_f16_cvt__scalar_bitcast(kBlockSize * sizeof(uint16_t), inputs.data(), outputs.data());
    for (uint32_t idx = 0; idx < kBlockSize; idx++) {
      ASSERT_EQ(reference_output, outputs[idx])
        << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[idx])
        << ", reference = 0x" << std::hex << std::setw(4) << std::setfill('0') << reference_output
        << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[idx];
    }
  }
}
1603 
TEST(CVT__SCALAR_BITCAST, negative_underflow) {
  std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
  std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> outputs(kBlockSize);
  // All negative values of magnitude below the smallest f16 subnormal flush to -0.
  const uint16_t reference_output = UINT16_C(0x8000);
  for (uint32_t block_start = UINT32_C(0x80000001); block_start < UINT32_C(0xB3000001); block_start += kBlockSize) {
    for (uint32_t idx = 0; idx < kBlockSize; idx++) {
      inputs[idx] = uint32_as_float(block_start + idx);
    }
    xnn_math_f32_f16_cvt__scalar_bitcast(kBlockSize * sizeof(uint16_t), inputs.data(), outputs.data());
    for (uint32_t idx = 0; idx < kBlockSize; idx++) {
      ASSERT_EQ(reference_output, outputs[idx])
        << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[idx])
        << ", reference = 0x" << std::hex << std::setw(4) << std::setfill('0') << reference_output
        << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[idx];
    }
  }
}
1621 
TEST(CVT__SCALAR_BITCAST, positive_zero) {
  std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
  std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> outputs(kBlockSize);
  // +0.0f must convert to the f16 +0 bit pattern, preserving the sign bit.
  std::fill_n(inputs.begin(), kBlockSize, +0.0f);
  xnn_math_f32_f16_cvt__scalar_bitcast(kBlockSize * sizeof(uint16_t), inputs.data(), outputs.data());
  const uint16_t reference_output = UINT16_C(0x0000);
  ASSERT_EQ(reference_output, outputs[0])
    << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[0])
    << ", reference = 0x" << std::hex << std::setw(4) << std::setfill('0') << reference_output
    << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[0];
}
1633 
TEST(CVT__SCALAR_BITCAST, negative_zero) {
  // -0.0f must convert to the FP16 bit pattern 0x8000 (sign bit preserved).
  // Fix: the original only checked outputs[0] although the kernel converted
  // kBlockSize elements; verify every element.
  std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
  std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> outputs(kBlockSize);
  std::fill(inputs.begin(), inputs.end(), -0.0f);
  xnn_math_f32_f16_cvt__scalar_bitcast(kBlockSize * sizeof(uint16_t), inputs.data(), outputs.data());
  const uint16_t reference_output = UINT16_C(0x8000);
  for (uint32_t i = 0; i < kBlockSize; i++) {
    ASSERT_EQ(reference_output, outputs[i])
      << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
      << ", reference = 0x" << std::hex << std::setw(4) << std::setfill('0') << reference_output
      << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[i];
  }
}
1645 
TEST(CVT__SCALAR_BITCAST, positive_overflow) {
  // Positive finite inputs too large to be represented in FP16 must saturate
  // to positive infinity (FP16 bit pattern 0x7C00).
  std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
  std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> outputs(kBlockSize);
  // Every input in this range maps to the same expected result.
  const uint16_t reference_output = UINT16_C(0x7C00);
  for (uint32_t block_start = UINT32_C(0x477FF000); block_start < UINT32_C(0x7F800000); block_start += kBlockSize) {
    // Fill the block with consecutive FP32 bit patterns from the range.
    for (uint32_t idx = 0; idx < kBlockSize; idx++) {
      inputs[idx] = uint32_as_float(block_start + idx);
    }
    xnn_math_f32_f16_cvt__scalar_bitcast(kBlockSize * sizeof(uint16_t), inputs.data(), outputs.data());
    for (uint32_t idx = 0; idx < kBlockSize; idx++) {
      ASSERT_EQ(reference_output, outputs[idx])
        << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[idx])
        << ", reference = 0x" << std::hex << std::setw(4) << std::setfill('0') << reference_output
        << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[idx];
    }
  }
}
1663 
TEST(CVT__SCALAR_BITCAST, negative_overflow) {
  // Negative finite inputs too large in magnitude to be represented in FP16
  // must saturate to negative infinity (FP16 bit pattern 0xFC00).
  std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
  std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> outputs(kBlockSize);
  // Every input in this range maps to the same expected result.
  const uint16_t reference_output = UINT16_C(0xFC00);
  for (uint32_t block_start = UINT32_C(0xC77FF000); block_start < UINT32_C(0xFF800000); block_start += kBlockSize) {
    // Fill the block with consecutive FP32 bit patterns from the range.
    for (uint32_t idx = 0; idx < kBlockSize; idx++) {
      inputs[idx] = uint32_as_float(block_start + idx);
    }
    xnn_math_f32_f16_cvt__scalar_bitcast(kBlockSize * sizeof(uint16_t), inputs.data(), outputs.data());
    for (uint32_t idx = 0; idx < kBlockSize; idx++) {
      ASSERT_EQ(reference_output, outputs[idx])
        << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[idx])
        << ", reference = 0x" << std::hex << std::setw(4) << std::setfill('0') << reference_output
        << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[idx];
    }
  }
}
1681 
TEST(CVT__SCALAR_BITCAST, positive_infinity) {
  // +infinity must convert to the FP16 bit pattern 0x7C00.
  // Fix: the original only checked outputs[0] although the kernel converted
  // kBlockSize elements; verify every element.
  std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
  std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> outputs(kBlockSize);
  std::fill(inputs.begin(), inputs.end(), +std::numeric_limits<float>::infinity());
  xnn_math_f32_f16_cvt__scalar_bitcast(kBlockSize * sizeof(uint16_t), inputs.data(), outputs.data());
  const uint16_t reference_output = UINT16_C(0x7C00);
  for (uint32_t i = 0; i < kBlockSize; i++) {
    ASSERT_EQ(reference_output, outputs[i])
      << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
      << ", reference = 0x" << std::hex << std::setw(4) << std::setfill('0') << reference_output
      << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[i];
  }
}
1693 
TEST(CVT__SCALAR_BITCAST, negative_infinity) {
  // -infinity must convert to the FP16 bit pattern 0xFC00.
  // Fix: the original only checked outputs[0] although the kernel converted
  // kBlockSize elements; verify every element.
  std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
  std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> outputs(kBlockSize);
  std::fill(inputs.begin(), inputs.end(), -std::numeric_limits<float>::infinity());
  xnn_math_f32_f16_cvt__scalar_bitcast(kBlockSize * sizeof(uint16_t), inputs.data(), outputs.data());
  const uint16_t reference_output = UINT16_C(0xFC00);
  for (uint32_t i = 0; i < kBlockSize; i++) {
    ASSERT_EQ(reference_output, outputs[i])
      << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
      << ", reference = 0x" << std::hex << std::setw(4) << std::setfill('0') << reference_output
      << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[i];
  }
}
1705 
TEST(CVT__SCALAR_BITCAST, positive_nan) {
  // Every positive single-precision NaN bit pattern must convert to a
  // positive half-precision NaN, i.e. a value strictly between 0x7C00
  // (+infinity) and 0x8000 (the sign bit).
  std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
  std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> outputs(kBlockSize);
  for (uint32_t block_start = UINT32_C(0x7F800001); block_start < UINT32_C(0x80000000); block_start += kBlockSize) {
    for (uint32_t idx = 0; idx < kBlockSize; idx++) {
      // Clamp so the final (partial) block does not wrap past the last
      // positive NaN pattern.
      inputs[idx] = uint32_as_float(std::min<uint32_t>(block_start + idx, UINT32_C(0x7FFFFFFF)));
    }
    xnn_math_f32_f16_cvt__scalar_bitcast(kBlockSize * sizeof(uint16_t), inputs.data(), outputs.data());
    for (uint32_t idx = 0; idx < kBlockSize; idx++) {
      ASSERT_GT(outputs[idx], UINT16_C(0x7C00))
        << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[idx])
        << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[idx];
      ASSERT_LT(outputs[idx], UINT16_C(0x8000))
        << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[idx])
        << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[idx];
    }
  }
}
1724 
TEST(CVT__SCALAR_BITCAST, negative_nan) {
  // Every negative single-precision NaN bit pattern must convert to a
  // negative half-precision NaN, i.e. a value strictly above 0xFC00
  // (-infinity).
  std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
  std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> outputs(kBlockSize);
  for (uint32_t block_start = UINT32_C(0x7F800001); block_start < UINT32_C(0x80000000); block_start += kBlockSize) {
    for (uint32_t idx = 0; idx < kBlockSize; idx++) {
      // Set the sign bit on a clamped positive-NaN pattern to form the
      // negative NaN input.
      inputs[idx] = uint32_as_float(UINT32_C(0x80000000) | std::min<uint32_t>(block_start + idx, UINT32_C(0x7FFFFFFF)));
    }
    xnn_math_f32_f16_cvt__scalar_bitcast(kBlockSize * sizeof(uint16_t), inputs.data(), outputs.data());
    for (uint32_t idx = 0; idx < kBlockSize; idx++) {
      ASSERT_GT(outputs[idx], UINT16_C(0xFC00))
        << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[idx])
        << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[idx];
    }
  }
}
1740 
TEST(CVT__SCALAR_FABSF, positive_normal) {
  // Positive inputs in the FP16 normal range must match the reference
  // conversion produced by fp16_ieee_from_fp32_value bit-for-bit.
  std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
  std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> outputs(kBlockSize);
  for (uint32_t block_start = UINT32_C(0x387FE000); block_start < UINT32_C(0x477FF000); block_start += kBlockSize) {
    // Fill the block with consecutive FP32 bit patterns from the range.
    for (uint32_t idx = 0; idx < kBlockSize; idx++) {
      inputs[idx] = uint32_as_float(block_start + idx);
    }
    xnn_math_f32_f16_cvt__scalar_fabsf(kBlockSize * sizeof(uint16_t), inputs.data(), outputs.data());
    for (uint32_t idx = 0; idx < kBlockSize; idx++) {
      const uint16_t reference_output = fp16_ieee_from_fp32_value(inputs[idx]);
      ASSERT_EQ(reference_output, outputs[idx])
        << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[idx])
        << ", reference = 0x" << std::hex << std::setw(4) << std::setfill('0') << reference_output
        << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[idx];
    }
  }
}
1758 
TEST(CVT__SCALAR_FABSF, negative_normal) {
  // Negative inputs in the FP16 normal range must match the reference
  // conversion produced by fp16_ieee_from_fp32_value bit-for-bit.
  std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
  std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> outputs(kBlockSize);
  for (uint32_t block_start = UINT32_C(0xB87FE000); block_start < UINT32_C(0xC77FF000); block_start += kBlockSize) {
    // Fill the block with consecutive FP32 bit patterns from the range.
    for (uint32_t idx = 0; idx < kBlockSize; idx++) {
      inputs[idx] = uint32_as_float(block_start + idx);
    }
    xnn_math_f32_f16_cvt__scalar_fabsf(kBlockSize * sizeof(uint16_t), inputs.data(), outputs.data());
    for (uint32_t idx = 0; idx < kBlockSize; idx++) {
      const uint16_t reference_output = fp16_ieee_from_fp32_value(inputs[idx]);
      ASSERT_EQ(reference_output, outputs[idx])
        << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[idx])
        << ", reference = 0x" << std::hex << std::setw(4) << std::setfill('0') << reference_output
        << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[idx];
    }
  }
}
1776 
TEST(CVT__SCALAR_FABSF, positive_subnormal) {
  // Positive inputs that map to FP16 subnormals must match the reference
  // conversion produced by fp16_ieee_from_fp32_value bit-for-bit.
  std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
  std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> outputs(kBlockSize);
  for (uint32_t block_start = UINT32_C(0x33000001); block_start < UINT32_C(0x387FE000); block_start += kBlockSize) {
    for (uint32_t idx = 0; idx < kBlockSize; idx++) {
      // Clamp so the final (partial) block stays inside the subnormal range.
      inputs[idx] = uint32_as_float(std::min<uint32_t>(block_start + idx, UINT32_C(0x387FDFFF)));
    }
    xnn_math_f32_f16_cvt__scalar_fabsf(kBlockSize * sizeof(uint16_t), inputs.data(), outputs.data());
    for (uint32_t idx = 0; idx < kBlockSize; idx++) {
      const uint16_t reference_output = fp16_ieee_from_fp32_value(inputs[idx]);
      ASSERT_EQ(reference_output, outputs[idx])
        << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[idx])
        << ", reference = 0x" << std::hex << std::setw(4) << std::setfill('0') << reference_output
        << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[idx];
    }
  }
}
1794 
TEST(CVT__SCALAR_FABSF, negative_subnormal) {
  // Negative inputs that map to FP16 subnormals must match the reference
  // conversion produced by fp16_ieee_from_fp32_value bit-for-bit.
  std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
  std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> outputs(kBlockSize);
  for (uint32_t block_start = UINT32_C(0xB3000001); block_start < UINT32_C(0xB87FE000); block_start += kBlockSize) {
    for (uint32_t idx = 0; idx < kBlockSize; idx++) {
      // Clamp so the final (partial) block stays inside the subnormal range.
      inputs[idx] = uint32_as_float(std::min<uint32_t>(block_start + idx, UINT32_C(0xB87FDFFF)));
    }
    xnn_math_f32_f16_cvt__scalar_fabsf(kBlockSize * sizeof(uint16_t), inputs.data(), outputs.data());
    for (uint32_t idx = 0; idx < kBlockSize; idx++) {
      const uint16_t reference_output = fp16_ieee_from_fp32_value(inputs[idx]);
      ASSERT_EQ(reference_output, outputs[idx])
        << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[idx])
        << ", reference = 0x" << std::hex << std::setw(4) << std::setfill('0') << reference_output
        << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[idx];
    }
  }
}
1812 
TEST(CVT__SCALAR_FABSF, positive_underflow) {
  // Positive inputs whose magnitude is too small to round to any non-zero
  // FP16 value must convert to positive zero (FP16 bit pattern 0x0000).
  std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
  std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> outputs(kBlockSize);
  // The expected result is the same for every input in this range.
  const uint16_t reference_output = UINT16_C(0x0000);
  for (uint32_t block_start = UINT32_C(0x00000001); block_start < UINT32_C(0x33000001); block_start += kBlockSize) {
    // Fill the block with consecutive FP32 bit patterns from the range.
    for (uint32_t idx = 0; idx < kBlockSize; idx++) {
      inputs[idx] = uint32_as_float(block_start + idx);
    }
    xnn_math_f32_f16_cvt__scalar_fabsf(kBlockSize * sizeof(uint16_t), inputs.data(), outputs.data());
    for (uint32_t idx = 0; idx < kBlockSize; idx++) {
      ASSERT_EQ(reference_output, outputs[idx])
        << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[idx])
        << ", reference = 0x" << std::hex << std::setw(4) << std::setfill('0') << reference_output
        << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[idx];
    }
  }
}
1830 
TEST(CVT__SCALAR_FABSF, negative_underflow) {
  // Negative inputs whose magnitude is too small to round to any non-zero
  // FP16 value must convert to negative zero (FP16 bit pattern 0x8000).
  std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
  std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> outputs(kBlockSize);
  // The expected result is the same for every input in this range.
  const uint16_t reference_output = UINT16_C(0x8000);
  for (uint32_t block_start = UINT32_C(0x80000001); block_start < UINT32_C(0xB3000001); block_start += kBlockSize) {
    // Fill the block with consecutive FP32 bit patterns from the range.
    for (uint32_t idx = 0; idx < kBlockSize; idx++) {
      inputs[idx] = uint32_as_float(block_start + idx);
    }
    xnn_math_f32_f16_cvt__scalar_fabsf(kBlockSize * sizeof(uint16_t), inputs.data(), outputs.data());
    for (uint32_t idx = 0; idx < kBlockSize; idx++) {
      ASSERT_EQ(reference_output, outputs[idx])
        << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[idx])
        << ", reference = 0x" << std::hex << std::setw(4) << std::setfill('0') << reference_output
        << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[idx];
    }
  }
}
1848 
TEST(CVT__SCALAR_FABSF, positive_zero) {
  // +0.0f must convert to the FP16 bit pattern 0x0000.
  // Fix: the original only checked outputs[0] although the kernel converted
  // kBlockSize elements; verify every element.
  std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
  std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> outputs(kBlockSize);
  std::fill(inputs.begin(), inputs.end(), +0.0f);
  xnn_math_f32_f16_cvt__scalar_fabsf(kBlockSize * sizeof(uint16_t), inputs.data(), outputs.data());
  const uint16_t reference_output = UINT16_C(0x0000);
  for (uint32_t i = 0; i < kBlockSize; i++) {
    ASSERT_EQ(reference_output, outputs[i])
      << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
      << ", reference = 0x" << std::hex << std::setw(4) << std::setfill('0') << reference_output
      << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[i];
  }
}
1860 
TEST(CVT__SCALAR_FABSF, negative_zero) {
  // -0.0f must convert to the FP16 bit pattern 0x8000 (sign bit preserved).
  // Fix: the original only checked outputs[0] although the kernel converted
  // kBlockSize elements; verify every element.
  std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
  std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> outputs(kBlockSize);
  std::fill(inputs.begin(), inputs.end(), -0.0f);
  xnn_math_f32_f16_cvt__scalar_fabsf(kBlockSize * sizeof(uint16_t), inputs.data(), outputs.data());
  const uint16_t reference_output = UINT16_C(0x8000);
  for (uint32_t i = 0; i < kBlockSize; i++) {
    ASSERT_EQ(reference_output, outputs[i])
      << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
      << ", reference = 0x" << std::hex << std::setw(4) << std::setfill('0') << reference_output
      << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[i];
  }
}
1872 
TEST(CVT__SCALAR_FABSF, positive_overflow) {
  // Positive finite inputs too large to be represented in FP16 must saturate
  // to positive infinity (FP16 bit pattern 0x7C00).
  std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
  std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> outputs(kBlockSize);
  // Every input in this range maps to the same expected result.
  const uint16_t reference_output = UINT16_C(0x7C00);
  for (uint32_t block_start = UINT32_C(0x477FF000); block_start < UINT32_C(0x7F800000); block_start += kBlockSize) {
    // Fill the block with consecutive FP32 bit patterns from the range.
    for (uint32_t idx = 0; idx < kBlockSize; idx++) {
      inputs[idx] = uint32_as_float(block_start + idx);
    }
    xnn_math_f32_f16_cvt__scalar_fabsf(kBlockSize * sizeof(uint16_t), inputs.data(), outputs.data());
    for (uint32_t idx = 0; idx < kBlockSize; idx++) {
      ASSERT_EQ(reference_output, outputs[idx])
        << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[idx])
        << ", reference = 0x" << std::hex << std::setw(4) << std::setfill('0') << reference_output
        << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[idx];
    }
  }
}
1890 
TEST(CVT__SCALAR_FABSF, negative_overflow) {
  // Negative finite inputs too large in magnitude to be represented in FP16
  // must saturate to negative infinity (FP16 bit pattern 0xFC00).
  std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
  std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> outputs(kBlockSize);
  // Every input in this range maps to the same expected result.
  const uint16_t reference_output = UINT16_C(0xFC00);
  for (uint32_t block_start = UINT32_C(0xC77FF000); block_start < UINT32_C(0xFF800000); block_start += kBlockSize) {
    // Fill the block with consecutive FP32 bit patterns from the range.
    for (uint32_t idx = 0; idx < kBlockSize; idx++) {
      inputs[idx] = uint32_as_float(block_start + idx);
    }
    xnn_math_f32_f16_cvt__scalar_fabsf(kBlockSize * sizeof(uint16_t), inputs.data(), outputs.data());
    for (uint32_t idx = 0; idx < kBlockSize; idx++) {
      ASSERT_EQ(reference_output, outputs[idx])
        << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[idx])
        << ", reference = 0x" << std::hex << std::setw(4) << std::setfill('0') << reference_output
        << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[idx];
    }
  }
}
1908 
TEST(CVT__SCALAR_FABSF, positive_infinity) {
  // +infinity must convert to the FP16 bit pattern 0x7C00.
  // Fix: the original only checked outputs[0] although the kernel converted
  // kBlockSize elements; verify every element.
  std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
  std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> outputs(kBlockSize);
  std::fill(inputs.begin(), inputs.end(), +std::numeric_limits<float>::infinity());
  xnn_math_f32_f16_cvt__scalar_fabsf(kBlockSize * sizeof(uint16_t), inputs.data(), outputs.data());
  const uint16_t reference_output = UINT16_C(0x7C00);
  for (uint32_t i = 0; i < kBlockSize; i++) {
    ASSERT_EQ(reference_output, outputs[i])
      << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
      << ", reference = 0x" << std::hex << std::setw(4) << std::setfill('0') << reference_output
      << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[i];
  }
}
1920 
TEST(CVT__SCALAR_FABSF, negative_infinity) {
  // -infinity must convert to the FP16 bit pattern 0xFC00.
  // Fix: the original only checked outputs[0] although the kernel converted
  // kBlockSize elements; verify every element.
  std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
  std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> outputs(kBlockSize);
  std::fill(inputs.begin(), inputs.end(), -std::numeric_limits<float>::infinity());
  xnn_math_f32_f16_cvt__scalar_fabsf(kBlockSize * sizeof(uint16_t), inputs.data(), outputs.data());
  const uint16_t reference_output = UINT16_C(0xFC00);
  for (uint32_t i = 0; i < kBlockSize; i++) {
    ASSERT_EQ(reference_output, outputs[i])
      << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
      << ", reference = 0x" << std::hex << std::setw(4) << std::setfill('0') << reference_output
      << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[i];
  }
}
1932 
TEST(CVT__SCALAR_FABSF, positive_nan) {
  // Every positive single-precision NaN bit pattern must convert to a
  // positive half-precision NaN, i.e. a value strictly between 0x7C00
  // (+infinity) and 0x8000 (the sign bit).
  std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
  std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> outputs(kBlockSize);
  for (uint32_t block_start = UINT32_C(0x7F800001); block_start < UINT32_C(0x80000000); block_start += kBlockSize) {
    for (uint32_t idx = 0; idx < kBlockSize; idx++) {
      // Clamp so the final (partial) block does not wrap past the last
      // positive NaN pattern.
      inputs[idx] = uint32_as_float(std::min<uint32_t>(block_start + idx, UINT32_C(0x7FFFFFFF)));
    }
    xnn_math_f32_f16_cvt__scalar_fabsf(kBlockSize * sizeof(uint16_t), inputs.data(), outputs.data());
    for (uint32_t idx = 0; idx < kBlockSize; idx++) {
      ASSERT_GT(outputs[idx], UINT16_C(0x7C00))
        << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[idx])
        << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[idx];
      ASSERT_LT(outputs[idx], UINT16_C(0x8000))
        << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[idx])
        << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[idx];
    }
  }
}
1951 
TEST(CVT__SCALAR_FABSF, negative_nan) {
  // Every negative single-precision NaN bit pattern must convert to a
  // negative half-precision NaN, i.e. a value strictly above 0xFC00
  // (-infinity).
  std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
  std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> outputs(kBlockSize);
  for (uint32_t block_start = UINT32_C(0x7F800001); block_start < UINT32_C(0x80000000); block_start += kBlockSize) {
    for (uint32_t idx = 0; idx < kBlockSize; idx++) {
      // Set the sign bit on a clamped positive-NaN pattern to form the
      // negative NaN input.
      inputs[idx] = uint32_as_float(UINT32_C(0x80000000) | std::min<uint32_t>(block_start + idx, UINT32_C(0x7FFFFFFF)));
    }
    xnn_math_f32_f16_cvt__scalar_fabsf(kBlockSize * sizeof(uint16_t), inputs.data(), outputs.data());
    for (uint32_t idx = 0; idx < kBlockSize; idx++) {
      ASSERT_GT(outputs[idx], UINT16_C(0xFC00))
        << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[idx])
        << ", optimized = 0x" << std::hex << std::setw(4) << std::setfill('0') << outputs[idx];
    }
  }
}
1967