// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <iomanip>
#include <ios>
#include <vector>

#include <gtest/gtest.h>

#include <fp16.h>

#include <xnnpack/aligned-allocator.h>
#include <xnnpack/common.h>
#include <xnnpack/isa-checks.h>
#include <xnnpack/math.h>
#include <xnnpack/math-stubs.h>


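// Number of float elements processed per kernel invocation. The kernels take
// their size argument in bytes, hence kBlockSize * sizeof(float) at each call.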
constexpr int kBlockSize = 1024;


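// Every kernel variant below is exercised with the same six special-case
// checks: exp(-0.0f) and exp(+0.0f) must equal 1.0f exactly, large negative
// inputs must saturate to +0.0f, large positive inputs must overflow to
// +infinity (0x7F800000), and both positive and negative NaN inputs must
// propagate as NaN outputs. Only the kernel under test and the required ISA
// differ from one block to the next.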
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
  TEST(EXP__NEONFMA_RR2_LUT64_P2, negative_zero) {
    TEST_REQUIRES_ARM_NEON_FMA;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    std::fill(inputs.begin(), inputs.end(), -0.0f);
    xnn_math_f32_exp__neonfma_rr2_lut64_p2(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    const float reference_output = 1.0f;
    ASSERT_EQ(reference_output, outputs[0])
      << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[0])
      << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(reference_output)
      << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(outputs[0]);
  }

  TEST(EXP__NEONFMA_RR2_LUT64_P2, positive_zero) {
    TEST_REQUIRES_ARM_NEON_FMA;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    std::fill(inputs.begin(), inputs.end(), +0.0f);
    xnn_math_f32_exp__neonfma_rr2_lut64_p2(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    const float reference_output = 1.0f;
    ASSERT_EQ(reference_output, outputs[0])
      << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[0])
      << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(reference_output)
      << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(outputs[0]);
  }

  TEST(EXP__NEONFMA_RR2_LUT64_P2, negative_saturation) {
    TEST_REQUIRES_ARM_NEON_FMA;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
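    // Sweep bit patterns from 0xC2CFF1B5 (approximately -103.97f, in the
    // range where expf underflows to +0.0f) through 0xFF800000 (-infinity),
    // clamping the tail of the final block to -infinity.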
    for (uint32_t n = UINT32_C(0xC2CFF1B5); n <= UINT32_C(0xFF800000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = uint32_as_float(std::min<uint32_t>(n + i, UINT32_C(0xFF800000)));
      }
      xnn_math_f32_exp__neonfma_rr2_lut64_p2(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = UINT32_C(0x00000000);
        ASSERT_EQ(reference_output, float_as_uint32(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(outputs[i]);
      }
    }
  }

  TEST(EXP__NEONFMA_RR2_LUT64_P2, positive_overflow) {
    TEST_REQUIRES_ARM_NEON_FMA;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
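    // Sweep bit patterns from 0x42B17218 (approximately 88.723f, about
    // ln(FLT_MAX), beyond which expf overflows) through 0x7F800000
    // (+infinity), clamping the tail of the final block to +infinity.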
    for (uint32_t n = UINT32_C(0x42B17218); n <= UINT32_C(0x7F800000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = uint32_as_float(std::min<uint32_t>(n + i, UINT32_C(0x7F800000)));
      }
      xnn_math_f32_exp__neonfma_rr2_lut64_p2(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = UINT32_C(0x7F800000);
        ASSERT_EQ(reference_output, float_as_uint32(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(outputs[i]);
      }
    }
  }

  TEST(EXP__NEONFMA_RR2_LUT64_P2, positive_nan) {
    TEST_REQUIRES_ARM_NEON_FMA;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
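    // Sweep every positive NaN encoding, 0x7F800001 through 0x7FFFFFFF.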
    for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = uint32_as_float(std::min<uint32_t>(UINT32_C(0x7FFFFFFF), n + i));
      }
      xnn_math_f32_exp__neonfma_rr2_lut64_p2(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        ASSERT_TRUE(std::isnan(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(outputs[i]);
      }
    }
  }

  TEST(EXP__NEONFMA_RR2_LUT64_P2, negative_nan) {
    TEST_REQUIRES_ARM_NEON_FMA;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
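    // Same sweep with the sign bit set: clamp the NaN payload to 0x7FFFFFFF
    // first, then OR in the sign bit to form negative NaNs (0xFF800001
    // through 0xFFFFFFFF).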
    for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = uint32_as_float(std::min<uint32_t>(UINT32_C(0x7FFFFFFF), n + i) | UINT32_C(0x80000000));
      }
      xnn_math_f32_exp__neonfma_rr2_lut64_p2(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        ASSERT_TRUE(std::isnan(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(outputs[i]);
      }
    }
  }
#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64


#if XNN_ARCH_ARM || XNN_ARCH_ARM64
  TEST(EXP__NEONFMA_RR2_P5, negative_zero) {
    TEST_REQUIRES_ARM_NEON_FMA;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    std::fill(inputs.begin(), inputs.end(), -0.0f);
    xnn_math_f32_exp__neonfma_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    const float reference_output = 1.0f;
    ASSERT_EQ(reference_output, outputs[0])
      << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[0])
      << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(reference_output)
      << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(outputs[0]);
  }

  TEST(EXP__NEONFMA_RR2_P5, positive_zero) {
    TEST_REQUIRES_ARM_NEON_FMA;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    std::fill(inputs.begin(), inputs.end(), +0.0f);
    xnn_math_f32_exp__neonfma_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    const float reference_output = 1.0f;
    ASSERT_EQ(reference_output, outputs[0])
      << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[0])
      << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(reference_output)
      << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(outputs[0]);
  }

  TEST(EXP__NEONFMA_RR2_P5, negative_saturation) {
    TEST_REQUIRES_ARM_NEON_FMA;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0xC2CFF1B5); n <= UINT32_C(0xFF800000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = uint32_as_float(std::min<uint32_t>(n + i, UINT32_C(0xFF800000)));
      }
      xnn_math_f32_exp__neonfma_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = UINT32_C(0x00000000);
        ASSERT_EQ(reference_output, float_as_uint32(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(outputs[i]);
      }
    }
  }

  TEST(EXP__NEONFMA_RR2_P5, positive_overflow) {
    TEST_REQUIRES_ARM_NEON_FMA;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x42B17218); n <= UINT32_C(0x7F800000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = uint32_as_float(std::min<uint32_t>(n + i, UINT32_C(0x7F800000)));
      }
      xnn_math_f32_exp__neonfma_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = UINT32_C(0x7F800000);
        ASSERT_EQ(reference_output, float_as_uint32(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(outputs[i]);
      }
    }
  }

  TEST(EXP__NEONFMA_RR2_P5, positive_nan) {
    TEST_REQUIRES_ARM_NEON_FMA;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = uint32_as_float(std::min<uint32_t>(UINT32_C(0x7FFFFFFF), n + i));
      }
      xnn_math_f32_exp__neonfma_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        ASSERT_TRUE(std::isnan(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(outputs[i]);
      }
    }
  }

  TEST(EXP__NEONFMA_RR2_P5, negative_nan) {
    TEST_REQUIRES_ARM_NEON_FMA;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = uint32_as_float(std::min<uint32_t>(UINT32_C(0x7FFFFFFF), n + i) | UINT32_C(0x80000000));
      }
      xnn_math_f32_exp__neonfma_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        ASSERT_TRUE(std::isnan(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(outputs[i]);
      }
    }
  }
#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64


#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  TEST(EXP__AVX512F_RR2_LUT16_P3_PERM, negative_zero) {
    TEST_REQUIRES_X86_AVX512F;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    std::fill(inputs.begin(), inputs.end(), -0.0f);
    xnn_math_f32_exp__avx512f_rr2_lut16_p3_perm(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    const float reference_output = 1.0f;
    ASSERT_EQ(reference_output, outputs[0])
      << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[0])
      << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(reference_output)
      << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(outputs[0]);
  }

  TEST(EXP__AVX512F_RR2_LUT16_P3_PERM, positive_zero) {
    TEST_REQUIRES_X86_AVX512F;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    std::fill(inputs.begin(), inputs.end(), +0.0f);
    xnn_math_f32_exp__avx512f_rr2_lut16_p3_perm(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    const float reference_output = 1.0f;
    ASSERT_EQ(reference_output, outputs[0])
      << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[0])
      << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(reference_output)
      << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(outputs[0]);
  }

  TEST(EXP__AVX512F_RR2_LUT16_P3_PERM, negative_saturation) {
    TEST_REQUIRES_X86_AVX512F;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0xC2CFF1B5); n <= UINT32_C(0xFF800000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = uint32_as_float(std::min<uint32_t>(n + i, UINT32_C(0xFF800000)));
      }
      xnn_math_f32_exp__avx512f_rr2_lut16_p3_perm(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = UINT32_C(0x00000000);
        ASSERT_EQ(reference_output, float_as_uint32(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(outputs[i]);
      }
    }
  }

  TEST(EXP__AVX512F_RR2_LUT16_P3_PERM, positive_overflow) {
    TEST_REQUIRES_X86_AVX512F;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x42B17218); n <= UINT32_C(0x7F800000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = uint32_as_float(std::min<uint32_t>(n + i, UINT32_C(0x7F800000)));
      }
      xnn_math_f32_exp__avx512f_rr2_lut16_p3_perm(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = UINT32_C(0x7F800000);
        ASSERT_EQ(reference_output, float_as_uint32(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(outputs[i]);
      }
    }
  }

  TEST(EXP__AVX512F_RR2_LUT16_P3_PERM, positive_nan) {
    TEST_REQUIRES_X86_AVX512F;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = uint32_as_float(std::min<uint32_t>(UINT32_C(0x7FFFFFFF), n + i));
      }
      xnn_math_f32_exp__avx512f_rr2_lut16_p3_perm(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        ASSERT_TRUE(std::isnan(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(outputs[i]);
      }
    }
  }

  TEST(EXP__AVX512F_RR2_LUT16_P3_PERM, negative_nan) {
    TEST_REQUIRES_X86_AVX512F;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = uint32_as_float(std::min<uint32_t>(UINT32_C(0x7FFFFFFF), n + i) | UINT32_C(0x80000000));
      }
      xnn_math_f32_exp__avx512f_rr2_lut16_p3_perm(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        ASSERT_TRUE(std::isnan(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(outputs[i]);
      }
    }
  }
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64


#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  TEST(EXP__AVX512F_RR2_LUT16_P3_PERM_SCALEF, negative_zero) {
    TEST_REQUIRES_X86_AVX512F;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    std::fill(inputs.begin(), inputs.end(), -0.0f);
    xnn_math_f32_exp__avx512f_rr2_lut16_p3_perm_scalef(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    const float reference_output = 1.0f;
    ASSERT_EQ(reference_output, outputs[0])
      << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[0])
      << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(reference_output)
      << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(outputs[0]);
  }

  TEST(EXP__AVX512F_RR2_LUT16_P3_PERM_SCALEF, positive_zero) {
    TEST_REQUIRES_X86_AVX512F;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    std::fill(inputs.begin(), inputs.end(), +0.0f);
    xnn_math_f32_exp__avx512f_rr2_lut16_p3_perm_scalef(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    const float reference_output = 1.0f;
    ASSERT_EQ(reference_output, outputs[0])
      << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[0])
      << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(reference_output)
      << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(outputs[0]);
  }

  TEST(EXP__AVX512F_RR2_LUT16_P3_PERM_SCALEF, negative_saturation) {
    TEST_REQUIRES_X86_AVX512F;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0xC2CFF1B5); n <= UINT32_C(0xFF800000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = uint32_as_float(std::min<uint32_t>(n + i, UINT32_C(0xFF800000)));
      }
      xnn_math_f32_exp__avx512f_rr2_lut16_p3_perm_scalef(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = UINT32_C(0x00000000);
        ASSERT_EQ(reference_output, float_as_uint32(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(outputs[i]);
      }
    }
  }

  TEST(EXP__AVX512F_RR2_LUT16_P3_PERM_SCALEF, positive_overflow) {
    TEST_REQUIRES_X86_AVX512F;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x42B17218); n <= UINT32_C(0x7F800000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = uint32_as_float(std::min<uint32_t>(n + i, UINT32_C(0x7F800000)));
      }
      xnn_math_f32_exp__avx512f_rr2_lut16_p3_perm_scalef(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = UINT32_C(0x7F800000);
        ASSERT_EQ(reference_output, float_as_uint32(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(outputs[i]);
      }
    }
  }

  TEST(EXP__AVX512F_RR2_LUT16_P3_PERM_SCALEF, positive_nan) {
    TEST_REQUIRES_X86_AVX512F;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = uint32_as_float(std::min<uint32_t>(UINT32_C(0x7FFFFFFF), n + i));
      }
      xnn_math_f32_exp__avx512f_rr2_lut16_p3_perm_scalef(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        ASSERT_TRUE(std::isnan(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(outputs[i]);
      }
    }
  }

  TEST(EXP__AVX512F_RR2_LUT16_P3_PERM_SCALEF, negative_nan) {
    TEST_REQUIRES_X86_AVX512F;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = uint32_as_float(std::min<uint32_t>(UINT32_C(0x7FFFFFFF), n + i) | UINT32_C(0x80000000));
      }
      xnn_math_f32_exp__avx512f_rr2_lut16_p3_perm_scalef(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        ASSERT_TRUE(std::isnan(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(outputs[i]);
      }
    }
  }
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64


#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  TEST(EXP__AVX512F_RR2_LUT32_P2_PERM2, negative_zero) {
    TEST_REQUIRES_X86_AVX512F;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    std::fill(inputs.begin(), inputs.end(), -0.0f);
    xnn_math_f32_exp__avx512f_rr2_lut32_p2_perm2(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    const float reference_output = 1.0f;
    ASSERT_EQ(reference_output, outputs[0])
      << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[0])
      << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(reference_output)
      << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(outputs[0]);
  }

  TEST(EXP__AVX512F_RR2_LUT32_P2_PERM2, positive_zero) {
    TEST_REQUIRES_X86_AVX512F;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    std::fill(inputs.begin(), inputs.end(), +0.0f);
    xnn_math_f32_exp__avx512f_rr2_lut32_p2_perm2(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    const float reference_output = 1.0f;
    ASSERT_EQ(reference_output, outputs[0])
      << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[0])
      << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(reference_output)
      << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(outputs[0]);
  }

  TEST(EXP__AVX512F_RR2_LUT32_P2_PERM2, negative_saturation) {
    TEST_REQUIRES_X86_AVX512F;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0xC2CFF1B5); n <= UINT32_C(0xFF800000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = uint32_as_float(std::min<uint32_t>(n + i, UINT32_C(0xFF800000)));
      }
      xnn_math_f32_exp__avx512f_rr2_lut32_p2_perm2(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = UINT32_C(0x00000000);
        ASSERT_EQ(reference_output, float_as_uint32(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(outputs[i]);
      }
    }
  }

  TEST(EXP__AVX512F_RR2_LUT32_P2_PERM2, positive_overflow) {
    TEST_REQUIRES_X86_AVX512F;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x42B17218); n <= UINT32_C(0x7F800000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = uint32_as_float(std::min<uint32_t>(n + i, UINT32_C(0x7F800000)));
      }
      xnn_math_f32_exp__avx512f_rr2_lut32_p2_perm2(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = UINT32_C(0x7F800000);
        ASSERT_EQ(reference_output, float_as_uint32(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(outputs[i]);
      }
    }
  }

  TEST(EXP__AVX512F_RR2_LUT32_P2_PERM2, positive_nan) {
    TEST_REQUIRES_X86_AVX512F;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = uint32_as_float(std::min<uint32_t>(UINT32_C(0x7FFFFFFF), n + i));
      }
      xnn_math_f32_exp__avx512f_rr2_lut32_p2_perm2(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        ASSERT_TRUE(std::isnan(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(outputs[i]);
      }
    }
  }

  TEST(EXP__AVX512F_RR2_LUT32_P2_PERM2, negative_nan) {
    TEST_REQUIRES_X86_AVX512F;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = uint32_as_float(std::min<uint32_t>(UINT32_C(0x7FFFFFFF), n + i) | UINT32_C(0x80000000));
      }
      xnn_math_f32_exp__avx512f_rr2_lut32_p2_perm2(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        ASSERT_TRUE(std::isnan(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(outputs[i]);
      }
    }
  }
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64


#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  TEST(EXP__AVX512F_RR2_LUT32_P2_PERM2_SCALEF, negative_zero) {
    TEST_REQUIRES_X86_AVX512F;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    std::fill(inputs.begin(), inputs.end(), -0.0f);
    xnn_math_f32_exp__avx512f_rr2_lut32_p2_perm2_scalef(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    const float reference_output = 1.0f;
    ASSERT_EQ(reference_output, outputs[0])
      << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[0])
      << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(reference_output)
      << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(outputs[0]);
  }

  TEST(EXP__AVX512F_RR2_LUT32_P2_PERM2_SCALEF, positive_zero) {
    TEST_REQUIRES_X86_AVX512F;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    std::fill(inputs.begin(), inputs.end(), +0.0f);
    xnn_math_f32_exp__avx512f_rr2_lut32_p2_perm2_scalef(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    const float reference_output = 1.0f;
    ASSERT_EQ(reference_output, outputs[0])
      << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[0])
      << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(reference_output)
      << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(outputs[0]);
  }

  TEST(EXP__AVX512F_RR2_LUT32_P2_PERM2_SCALEF, negative_saturation) {
    TEST_REQUIRES_X86_AVX512F;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0xC2CFF1B5); n <= UINT32_C(0xFF800000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = uint32_as_float(std::min<uint32_t>(n + i, UINT32_C(0xFF800000)));
      }
      xnn_math_f32_exp__avx512f_rr2_lut32_p2_perm2_scalef(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = UINT32_C(0x00000000);
        ASSERT_EQ(reference_output, float_as_uint32(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(outputs[i]);
      }
    }
  }

  TEST(EXP__AVX512F_RR2_LUT32_P2_PERM2_SCALEF, positive_overflow) {
    TEST_REQUIRES_X86_AVX512F;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x42B17218); n <= UINT32_C(0x7F800000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = uint32_as_float(std::min<uint32_t>(n + i, UINT32_C(0x7F800000)));
      }
      xnn_math_f32_exp__avx512f_rr2_lut32_p2_perm2_scalef(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = UINT32_C(0x7F800000);
        ASSERT_EQ(reference_output, float_as_uint32(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(outputs[i]);
      }
    }
  }

  TEST(EXP__AVX512F_RR2_LUT32_P2_PERM2_SCALEF, positive_nan) {
    TEST_REQUIRES_X86_AVX512F;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = uint32_as_float(std::min<uint32_t>(UINT32_C(0x7FFFFFFF), n + i));
      }
      xnn_math_f32_exp__avx512f_rr2_lut32_p2_perm2_scalef(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        ASSERT_TRUE(std::isnan(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(outputs[i]);
      }
    }
  }

  TEST(EXP__AVX512F_RR2_LUT32_P2_PERM2_SCALEF, negative_nan) {
    TEST_REQUIRES_X86_AVX512F;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = uint32_as_float(std::min<uint32_t>(UINT32_C(0x7FFFFFFF), n + i) | UINT32_C(0x80000000));
      }
      xnn_math_f32_exp__avx512f_rr2_lut32_p2_perm2_scalef(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        ASSERT_TRUE(std::isnan(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(outputs[i]);
      }
    }
  }
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64


#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  TEST(EXP__AVX512F_RR2_P5, negative_zero) {
    TEST_REQUIRES_X86_AVX512F;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    std::fill(inputs.begin(), inputs.end(), -0.0f);
    xnn_math_f32_exp__avx512f_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    const float reference_output = 1.0f;
    ASSERT_EQ(reference_output, outputs[0])
      << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[0])
      << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(reference_output)
      << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(outputs[0]);
  }

  TEST(EXP__AVX512F_RR2_P5, positive_zero) {
    TEST_REQUIRES_X86_AVX512F;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    std::fill(inputs.begin(), inputs.end(), +0.0f);
    xnn_math_f32_exp__avx512f_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    const float reference_output = 1.0f;
    ASSERT_EQ(reference_output, outputs[0])
      << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[0])
      << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(reference_output)
      << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(outputs[0]);
  }

  TEST(EXP__AVX512F_RR2_P5, negative_saturation) {
    TEST_REQUIRES_X86_AVX512F;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0xC2CFF1B5); n <= UINT32_C(0xFF800000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = uint32_as_float(std::min<uint32_t>(n + i, UINT32_C(0xFF800000)));
      }
      xnn_math_f32_exp__avx512f_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = UINT32_C(0x00000000);
        ASSERT_EQ(reference_output, float_as_uint32(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(outputs[i]);
      }
    }
  }

  TEST(EXP__AVX512F_RR2_P5, positive_overflow) {
    TEST_REQUIRES_X86_AVX512F;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x42B17218); n <= UINT32_C(0x7F800000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = uint32_as_float(std::min<uint32_t>(n + i, UINT32_C(0x7F800000)));
      }
      xnn_math_f32_exp__avx512f_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = UINT32_C(0x7F800000);
        ASSERT_EQ(reference_output, float_as_uint32(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(outputs[i]);
      }
    }
  }

  TEST(EXP__AVX512F_RR2_P5, positive_nan) {
    TEST_REQUIRES_X86_AVX512F;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = uint32_as_float(std::min<uint32_t>(UINT32_C(0x7FFFFFFF), n + i));
      }
      xnn_math_f32_exp__avx512f_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        ASSERT_TRUE(std::isnan(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(outputs[i]);
      }
    }
  }

  TEST(EXP__AVX512F_RR2_P5, negative_nan) {
    TEST_REQUIRES_X86_AVX512F;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = uint32_as_float(std::min<uint32_t>(UINT32_C(0x7FFFFFFF), n + i) | UINT32_C(0x80000000));
      }
      xnn_math_f32_exp__avx512f_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        ASSERT_TRUE(std::isnan(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(outputs[i]);
      }
    }
  }
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64


#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  TEST(EXP__AVX512F_RR2_P5_SCALEF, negative_zero) {
    TEST_REQUIRES_X86_AVX512F;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    std::fill(inputs.begin(), inputs.end(), -0.0f);
    xnn_math_f32_exp__avx512f_rr2_p5_scalef(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    const float reference_output = 1.0f;
    ASSERT_EQ(reference_output, outputs[0])
      << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[0])
      << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(reference_output)
      << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(outputs[0]);
  }

  TEST(EXP__AVX512F_RR2_P5_SCALEF, positive_zero) {
    TEST_REQUIRES_X86_AVX512F;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    std::fill(inputs.begin(), inputs.end(), +0.0f);
    xnn_math_f32_exp__avx512f_rr2_p5_scalef(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    const float reference_output = 1.0f;
    ASSERT_EQ(reference_output, outputs[0])
      << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[0])
      << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(reference_output)
      << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(outputs[0]);
  }

  TEST(EXP__AVX512F_RR2_P5_SCALEF, negative_saturation) {
    TEST_REQUIRES_X86_AVX512F;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0xC2CFF1B5); n <= UINT32_C(0xFF800000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = uint32_as_float(std::min<uint32_t>(n + i, UINT32_C(0xFF800000)));
      }
      xnn_math_f32_exp__avx512f_rr2_p5_scalef(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = UINT32_C(0x00000000);
        ASSERT_EQ(reference_output, float_as_uint32(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(outputs[i]);
      }
    }
  }

  TEST(EXP__AVX512F_RR2_P5_SCALEF, positive_overflow) {
    TEST_REQUIRES_X86_AVX512F;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x42B17218); n <= UINT32_C(0x7F800000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = uint32_as_float(std::min<uint32_t>(n + i, UINT32_C(0x7F800000)));
      }
      xnn_math_f32_exp__avx512f_rr2_p5_scalef(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = UINT32_C(0x7F800000);
        ASSERT_EQ(reference_output, float_as_uint32(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(outputs[i]);
      }
    }
  }

  TEST(EXP__AVX512F_RR2_P5_SCALEF, positive_nan) {
    TEST_REQUIRES_X86_AVX512F;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = uint32_as_float(std::min<uint32_t>(UINT32_C(0x7FFFFFFF), n + i));
      }
      xnn_math_f32_exp__avx512f_rr2_p5_scalef(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        ASSERT_TRUE(std::isnan(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(outputs[i]);
      }
    }
  }

  TEST(EXP__AVX512F_RR2_P5_SCALEF, negative_nan) {
    TEST_REQUIRES_X86_AVX512F;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = uint32_as_float(std::min<uint32_t>(UINT32_C(0x7FFFFFFF), n + i) | UINT32_C(0x80000000));
      }
      xnn_math_f32_exp__avx512f_rr2_p5_scalef(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        ASSERT_TRUE(std::isnan(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(outputs[i]);
      }
    }
  }
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64


885 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
TEST(EXP__AVX2_RR2_LUT8_P3_PERM,negative_zero)886   TEST(EXP__AVX2_RR2_LUT8_P3_PERM, negative_zero) {
887     TEST_REQUIRES_X86_AVX2;
888 
889     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
890     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
891     std::fill(inputs.begin(), inputs.end(), -0.0f);
892     xnn_math_f32_exp__avx2_rr2_lut8_p3_perm(kBlockSize * sizeof(float), inputs.data(), outputs.data());
893     const float reference_output = 1.0f;
894     ASSERT_EQ(reference_output, outputs[0])
895       << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[0])
896       << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(reference_output)
897       << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(outputs[0]);
898   }
899 
TEST(EXP__AVX2_RR2_LUT8_P3_PERM,positive_zero)900   TEST(EXP__AVX2_RR2_LUT8_P3_PERM, positive_zero) {
901     TEST_REQUIRES_X86_AVX2;
902 
903     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
904     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
905     std::fill(inputs.begin(), inputs.end(), +0.0f);
906     xnn_math_f32_exp__avx2_rr2_lut8_p3_perm(kBlockSize * sizeof(float), inputs.data(), outputs.data());
907     const float reference_output = 1.0f;
908     ASSERT_EQ(reference_output, outputs[0])
909       << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[0])
910       << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(reference_output)
911       << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(outputs[0]);
912   }
913 
TEST(EXP__AVX2_RR2_LUT8_P3_PERM,negative_saturation)914   TEST(EXP__AVX2_RR2_LUT8_P3_PERM, negative_saturation) {
915     TEST_REQUIRES_X86_AVX2;
916 
917     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
918     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
919     for (uint32_t n = UINT32_C(0xC2CFF1B5); n <= UINT32_C(0xFF800000); n += kBlockSize) {
920       for (uint32_t i = 0; i < kBlockSize; i++) {
921         inputs[i] = uint32_as_float(std::min<uint32_t>(n + i, UINT32_C(0xFF800000)));
922       }
923       xnn_math_f32_exp__avx2_rr2_lut8_p3_perm(kBlockSize * sizeof(float), inputs.data(), outputs.data());
924       for (uint32_t i = 0; i < kBlockSize; i++) {
925         const uint32_t reference_output = UINT32_C(0x00000000);
926         ASSERT_EQ(reference_output, float_as_uint32(outputs[i]))
927           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
928           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
929           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(outputs[i]);
930       }
931     }
932   }
933 
TEST(EXP__AVX2_RR2_LUT8_P3_PERM,positive_overflow)934   TEST(EXP__AVX2_RR2_LUT8_P3_PERM, positive_overflow) {
935     TEST_REQUIRES_X86_AVX2;
936 
937     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
938     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
939     for (uint32_t n = UINT32_C(0x42B17218); n <= UINT32_C(0x7F800000); n += kBlockSize) {
940       for (uint32_t i = 0; i < kBlockSize; i++) {
941         inputs[i] = uint32_as_float(std::min<uint32_t>(n + i, UINT32_C(0x7F800000)));
942       }
943       xnn_math_f32_exp__avx2_rr2_lut8_p3_perm(kBlockSize * sizeof(float), inputs.data(), outputs.data());
944       for (uint32_t i = 0; i < kBlockSize; i++) {
945         const uint32_t reference_output = UINT32_C(0x7F800000);
946         ASSERT_EQ(reference_output, float_as_uint32(outputs[i]))
947           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
948           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
949           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(outputs[i]);
950       }
951     }
952   }
953 
TEST(EXP__AVX2_RR2_LUT8_P3_PERM,positive_nan)954   TEST(EXP__AVX2_RR2_LUT8_P3_PERM, positive_nan) {
955     TEST_REQUIRES_X86_AVX2;
956 
957     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
958     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = uint32_as_float(std::min<uint32_t>(UINT32_C(0x7FFFFFFF), n + i));
      }
      xnn_math_f32_exp__avx2_rr2_lut8_p3_perm(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        ASSERT_TRUE(std::isnan(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(outputs[i]);
      }
    }
  }

  TEST(EXP__AVX2_RR2_LUT8_P3_PERM, negative_nan) {
    TEST_REQUIRES_X86_AVX2;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
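    // Same scan as positive_nan, but the clamped positive NaN pattern has its
    // sign bit set afterwards, covering every negative NaN encoding
    // (0xFF800001 through 0xFFFFFFFF). The clamp must come before the sign bit
    // is set, or it would always win and leave only positive NaNs.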
    for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = uint32_as_float(std::min<uint32_t>(UINT32_C(0x7FFFFFFF), n + i) | UINT32_C(0x80000000));
      }
      xnn_math_f32_exp__avx2_rr2_lut8_p3_perm(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        ASSERT_TRUE(std::isnan(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(outputs[i]);
      }
    }
  }
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64


#if XNN_ARCH_X86 || XNN_ARCH_X86_64
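  // Kernel naming follows the XNNPACK convention: avx2 = target ISA; rr2 =
  // range reduction with a two-term (hi + lo) ln(2) split; lut8 = an 8-entry
  // lookup table held in a vector register and indexed with a permute (hence
  // the "perm" suffix); p4 = a degree-4 polynomial for the remainder.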
  TEST(EXP__AVX2_RR2_LUT8_P4_PERM, negative_zero) {
    TEST_REQUIRES_X86_AVX2;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
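    // exp(-0.0f) must be exactly 1.0f; all lanes receive the same input, and
    // the first output is checked against the reference.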
    std::fill(inputs.begin(), inputs.end(), -0.0f);
    xnn_math_f32_exp__avx2_rr2_lut8_p4_perm(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    const float reference_output = 1.0f;
    ASSERT_EQ(reference_output, outputs[0])
      << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[0])
      << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(reference_output)
      << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(outputs[0]);
  }

  TEST(EXP__AVX2_RR2_LUT8_P4_PERM, positive_zero) {
    TEST_REQUIRES_X86_AVX2;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    std::fill(inputs.begin(), inputs.end(), +0.0f);
    xnn_math_f32_exp__avx2_rr2_lut8_p4_perm(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    const float reference_output = 1.0f;
    ASSERT_EQ(reference_output, outputs[0])
      << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[0])
      << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(reference_output)
      << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(outputs[0]);
  }

  TEST(EXP__AVX2_RR2_LUT8_P4_PERM, negative_saturation) {
    TEST_REQUIRES_X86_AVX2;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0xC2CFF1B5); n <= UINT32_C(0xFF800000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = uint32_as_float(std::min<uint32_t>(n + i, UINT32_C(0xFF800000)));
      }
      xnn_math_f32_exp__avx2_rr2_lut8_p4_perm(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = UINT32_C(0x00000000);
        ASSERT_EQ(reference_output, float_as_uint32(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(outputs[i]);
      }
    }
  }

  TEST(EXP__AVX2_RR2_LUT8_P4_PERM, positive_overflow) {
    TEST_REQUIRES_X86_AVX2;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x42B17218); n <= UINT32_C(0x7F800000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = uint32_as_float(std::min<uint32_t>(n + i, UINT32_C(0x7F800000)));
      }
      xnn_math_f32_exp__avx2_rr2_lut8_p4_perm(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = UINT32_C(0x7F800000);
        ASSERT_EQ(reference_output, float_as_uint32(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(outputs[i]);
      }
    }
  }

  TEST(EXP__AVX2_RR2_LUT8_P4_PERM, positive_nan) {
    TEST_REQUIRES_X86_AVX2;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = uint32_as_float(std::min<uint32_t>(UINT32_C(0x7FFFFFFF), n + i));
      }
      xnn_math_f32_exp__avx2_rr2_lut8_p4_perm(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        ASSERT_TRUE(std::isnan(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(outputs[i]);
      }
    }
  }

  TEST(EXP__AVX2_RR2_LUT8_P4_PERM, negative_nan) {
    TEST_REQUIRES_X86_AVX2;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = uint32_as_float(std::min<uint32_t>(UINT32_C(0x7FFFFFFF), n + i) | UINT32_C(0x80000000));
      }
      xnn_math_f32_exp__avx2_rr2_lut8_p4_perm(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        ASSERT_TRUE(std::isnan(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(outputs[i]);
      }
    }
  }
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64


#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  TEST(EXP__AVX2_RR2_P5, negative_zero) {
    TEST_REQUIRES_X86_AVX2;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    std::fill(inputs.begin(), inputs.end(), -0.0f);
    xnn_math_f32_exp__avx2_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    const float reference_output = 1.0f;
    ASSERT_EQ(reference_output, outputs[0])
      << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[0])
      << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(reference_output)
      << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(outputs[0]);
  }

  TEST(EXP__AVX2_RR2_P5, positive_zero) {
    TEST_REQUIRES_X86_AVX2;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    std::fill(inputs.begin(), inputs.end(), +0.0f);
    xnn_math_f32_exp__avx2_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    const float reference_output = 1.0f;
    ASSERT_EQ(reference_output, outputs[0])
      << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[0])
      << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(reference_output)
      << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(outputs[0]);
  }

  TEST(EXP__AVX2_RR2_P5, negative_saturation) {
    TEST_REQUIRES_X86_AVX2;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0xC2CFF1B5); n <= UINT32_C(0xFF800000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = uint32_as_float(std::min<uint32_t>(n + i, UINT32_C(0xFF800000)));
      }
      xnn_math_f32_exp__avx2_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = UINT32_C(0x00000000);
        ASSERT_EQ(reference_output, float_as_uint32(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(outputs[i]);
      }
    }
  }

  TEST(EXP__AVX2_RR2_P5, positive_overflow) {
    TEST_REQUIRES_X86_AVX2;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x42B17218); n <= UINT32_C(0x7F800000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = uint32_as_float(std::min<uint32_t>(n + i, UINT32_C(0x7F800000)));
      }
      xnn_math_f32_exp__avx2_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = UINT32_C(0x7F800000);
        ASSERT_EQ(reference_output, float_as_uint32(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(outputs[i]);
      }
    }
  }

  TEST(EXP__AVX2_RR2_P5, positive_nan) {
    TEST_REQUIRES_X86_AVX2;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = uint32_as_float(std::min<uint32_t>(UINT32_C(0x7FFFFFFF), n + i));
      }
      xnn_math_f32_exp__avx2_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        ASSERT_TRUE(std::isnan(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(outputs[i]);
      }
    }
  }

  TEST(EXP__AVX2_RR2_P5, negative_nan) {
    TEST_REQUIRES_X86_AVX2;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = uint32_as_float(std::min<uint32_t>(UINT32_C(0x7FFFFFFF), n + i) | UINT32_C(0x80000000));
      }
      xnn_math_f32_exp__avx2_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        ASSERT_TRUE(std::isnan(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(outputs[i]);
      }
    }
  }
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64


#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  TEST(EXP__AVX_RR2_P5, negative_zero) {
    TEST_REQUIRES_X86_AVX;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    std::fill(inputs.begin(), inputs.end(), -0.0f);
    xnn_math_f32_exp__avx_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    const float reference_output = 1.0f;
    ASSERT_EQ(reference_output, outputs[0])
      << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[0])
      << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(reference_output)
      << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(outputs[0]);
  }

  TEST(EXP__AVX_RR2_P5, positive_zero) {
    TEST_REQUIRES_X86_AVX;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    std::fill(inputs.begin(), inputs.end(), +0.0f);
    xnn_math_f32_exp__avx_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    const float reference_output = 1.0f;
    ASSERT_EQ(reference_output, outputs[0])
      << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[0])
      << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(reference_output)
      << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(outputs[0]);
  }

  TEST(EXP__AVX_RR2_P5, negative_saturation) {
    TEST_REQUIRES_X86_AVX;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0xC2CFF1B5); n <= UINT32_C(0xFF800000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = uint32_as_float(std::min<uint32_t>(n + i, UINT32_C(0xFF800000)));
      }
      xnn_math_f32_exp__avx_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = UINT32_C(0x00000000);
        ASSERT_EQ(reference_output, float_as_uint32(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(outputs[i]);
      }
    }
  }

  TEST(EXP__AVX_RR2_P5, positive_overflow) {
    TEST_REQUIRES_X86_AVX;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x42B17218); n <= UINT32_C(0x7F800000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = uint32_as_float(std::min<uint32_t>(n + i, UINT32_C(0x7F800000)));
      }
      xnn_math_f32_exp__avx_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = UINT32_C(0x7F800000);
        ASSERT_EQ(reference_output, float_as_uint32(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(outputs[i]);
      }
    }
  }

  TEST(EXP__AVX_RR2_P5, positive_nan) {
    TEST_REQUIRES_X86_AVX;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = uint32_as_float(std::min<uint32_t>(UINT32_C(0x7FFFFFFF), n + i));
      }
      xnn_math_f32_exp__avx_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        ASSERT_TRUE(std::isnan(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(outputs[i]);
      }
    }
  }

  TEST(EXP__AVX_RR2_P5, negative_nan) {
    TEST_REQUIRES_X86_AVX;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = uint32_as_float(std::min<uint32_t>(UINT32_C(0x7FFFFFFF), n + i) | UINT32_C(0x80000000));
      }
      xnn_math_f32_exp__avx_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        ASSERT_TRUE(std::isnan(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(outputs[i]);
      }
    }
  }
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64


#if XNN_ARCH_X86 || XNN_ARCH_X86_64
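  // Unlike the AVX/AVX2 tests above, these carry no TEST_REQUIRES_X86_* guard:
  // SSE2 is treated as the baseline feature level on these architectures.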
  TEST(EXP__SSE2_RR2_LUT64_P2, negative_zero) {
    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    std::fill(inputs.begin(), inputs.end(), -0.0f);
    xnn_math_f32_exp__sse2_rr2_lut64_p2(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    const float reference_output = 1.0f;
    ASSERT_EQ(reference_output, outputs[0])
      << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[0])
      << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(reference_output)
      << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(outputs[0]);
  }

  TEST(EXP__SSE2_RR2_LUT64_P2, positive_zero) {
    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    std::fill(inputs.begin(), inputs.end(), +0.0f);
    xnn_math_f32_exp__sse2_rr2_lut64_p2(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    const float reference_output = 1.0f;
    ASSERT_EQ(reference_output, outputs[0])
      << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[0])
      << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(reference_output)
      << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(outputs[0]);
  }

  TEST(EXP__SSE2_RR2_LUT64_P2, negative_saturation) {
    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0xC2CFF1B5); n <= UINT32_C(0xFF800000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = uint32_as_float(std::min<uint32_t>(n + i, UINT32_C(0xFF800000)));
      }
      xnn_math_f32_exp__sse2_rr2_lut64_p2(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = UINT32_C(0x00000000);
        ASSERT_EQ(reference_output, float_as_uint32(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(outputs[i]);
      }
    }
  }

  TEST(EXP__SSE2_RR2_LUT64_P2, positive_overflow) {
    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x42B17218); n <= UINT32_C(0x7F800000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = uint32_as_float(std::min<uint32_t>(n + i, UINT32_C(0x7F800000)));
      }
      xnn_math_f32_exp__sse2_rr2_lut64_p2(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = UINT32_C(0x7F800000);
        ASSERT_EQ(reference_output, float_as_uint32(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(outputs[i]);
      }
    }
  }

  TEST(EXP__SSE2_RR2_LUT64_P2, positive_nan) {
    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = uint32_as_float(std::min<uint32_t>(UINT32_C(0x7FFFFFFF), n + i));
      }
      xnn_math_f32_exp__sse2_rr2_lut64_p2(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        ASSERT_TRUE(std::isnan(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(outputs[i]);
      }
    }
  }

  TEST(EXP__SSE2_RR2_LUT64_P2, negative_nan) {
    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = uint32_as_float(std::min<uint32_t>(UINT32_C(0x7FFFFFFF), n + i) | UINT32_C(0x80000000));
      }
      xnn_math_f32_exp__sse2_rr2_lut64_p2(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        ASSERT_TRUE(std::isnan(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(outputs[i]);
      }
    }
  }
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64


#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  TEST(EXP__SSE2_RR2_P5, negative_zero) {
    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    std::fill(inputs.begin(), inputs.end(), -0.0f);
    xnn_math_f32_exp__sse2_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    const float reference_output = 1.0f;
    ASSERT_EQ(reference_output, outputs[0])
      << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[0])
      << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(reference_output)
      << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(outputs[0]);
  }

  TEST(EXP__SSE2_RR2_P5, positive_zero) {
    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    std::fill(inputs.begin(), inputs.end(), +0.0f);
    xnn_math_f32_exp__sse2_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    const float reference_output = 1.0f;
    ASSERT_EQ(reference_output, outputs[0])
      << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[0])
      << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(reference_output)
      << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(outputs[0]);
  }

  TEST(EXP__SSE2_RR2_P5, negative_saturation) {
    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0xC2CFF1B5); n <= UINT32_C(0xFF800000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = uint32_as_float(std::min<uint32_t>(n + i, UINT32_C(0xFF800000)));
      }
      xnn_math_f32_exp__sse2_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = UINT32_C(0x00000000);
        ASSERT_EQ(reference_output, float_as_uint32(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(outputs[i]);
      }
    }
  }

  TEST(EXP__SSE2_RR2_P5, positive_overflow) {
    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x42B17218); n <= UINT32_C(0x7F800000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = uint32_as_float(std::min<uint32_t>(n + i, UINT32_C(0x7F800000)));
      }
      xnn_math_f32_exp__sse2_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = UINT32_C(0x7F800000);
        ASSERT_EQ(reference_output, float_as_uint32(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(outputs[i]);
      }
    }
  }

  TEST(EXP__SSE2_RR2_P5, positive_nan) {
    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = uint32_as_float(std::min<uint32_t>(UINT32_C(0x7FFFFFFF), n + i));
      }
      xnn_math_f32_exp__sse2_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        ASSERT_TRUE(std::isnan(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(outputs[i]);
      }
    }
  }

  TEST(EXP__SSE2_RR2_P5, negative_nan) {
    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = uint32_as_float(std::min<uint32_t>(UINT32_C(0x7FFFFFFF), n + i) | UINT32_C(0x80000000));
      }
      xnn_math_f32_exp__sse2_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        ASSERT_TRUE(std::isnan(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(inputs[i])
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << float_as_uint32(outputs[i]);
      }
    }
  }
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64