// Copyright 2022 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <vector>

#include <gtest/gtest.h>

#include <xnnpack/aligned-allocator.h>
#include <xnnpack/common.h>
#include <xnnpack/math-stubs.h>


constexpr int kBlockSize = 1024;


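// Every kernel below is held to the same contract: the output is sqrt(input)
// rounded to the nearest integer. Since 65535.5^2 = 4294901760.25, inputs up
// to 4294901760 (0xFFFF0000) round to a value that fits in 16 bits, while
// inputs from 4294901761 upward round to 65536 (0x00010000); the two test
// cases per kernel cover these two ranges.
//
// The helper below is an illustrative sketch of that contract only: it is not
// used by the tests and is not an XNNPACK API, and the name reference_u32_sqrt
// is purely illustrative. It relies on the facts that 32-bit integers convert
// to double exactly and that exact square roots of integers never land on a
// .5 boundary, so a correctly-rounded double sqrt followed by lrint yields
// the nearest integer.
inline uint32_t reference_u32_sqrt(uint32_t x) {
  // std::sqrt on a double is correctly rounded; std::lrint then rounds the
  // result to the nearest integer, matching the criterion asserted below.
  return static_cast<uint32_t>(std::lrint(std::sqrt(static_cast<double>(x))));
}

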
TEST(SQRT__SCALAR_BITMANIP, uint16_output) {
  std::vector<uint32_t, AlignedAllocator<uint32_t, 64>> inputs(kBlockSize);
  std::vector<uint32_t, AlignedAllocator<uint32_t, 64>> outputs(kBlockSize);
  for (uint32_t n = 0; n <= UINT32_C(4294901760); n += kBlockSize) {
    for (uint32_t i = 0; i < kBlockSize; i++) {
      inputs[i] = std::min<uint32_t>(n + i, UINT32_C(4294901760));
    }
    xnn_math_u32_sqrt__scalar_bitmanip(kBlockSize * sizeof(uint32_t), inputs.data(), outputs.data());
    for (uint32_t i = 0; i < kBlockSize; i++) {
      const uint32_t input = inputs[i];
      const uint32_t output = outputs[i];
      const int64_t squared_output = int64_t(uint64_t(output) * uint64_t(output));

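      // Round-to-nearest check: output^2 must be strictly closer to the input
      // than the squares of both neighboring integers, i.e. output is the
      // integer nearest to sqrt(input). Strict inequality is safe because an
      // exact tie is impossible for integer inputs.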
      const uint32_t prev_output = output - 1;
      const int64_t squared_prev_output = int64_t(uint64_t(prev_output) * uint64_t(prev_output));
      ASSERT_LT(std::abs(squared_output - int64_t(input)), std::abs(squared_prev_output - int64_t(input)))
        << "input = " << input << ", output = " << output;

      const uint32_t next_output = output + 1;
      const int64_t squared_next_output = int64_t(uint64_t(next_output) * uint64_t(next_output));
      ASSERT_LT(std::abs(squared_output - int64_t(input)), std::abs(squared_next_output - int64_t(input)))
        << "input = " << input << ", output = " << output;
    }
  }
}

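// Saturation range: for every input in [4294901761, UINT32_MAX] the nearest
// integer to sqrt(input) is 65536 (0x00010000). The outer loop relies on
// uint32_t wraparound to terminate: once n overflows past UINT32_MAX it drops
// below the start value and the loop condition fails. std::max clamps block
// indices that wrap around back into the tested range.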
TEST(SQRT__SCALAR_BITMANIP, 65536_output) {
  std::vector<uint32_t, AlignedAllocator<uint32_t, 64>> inputs(kBlockSize);
  std::vector<uint32_t, AlignedAllocator<uint32_t, 64>> outputs(kBlockSize);
  for (uint32_t n = UINT32_C(4294901761); n >= UINT32_C(4294901761); n += kBlockSize) {
    for (uint32_t i = 0; i < kBlockSize; i++) {
      inputs[i] = std::max<uint32_t>(n + i, UINT32_C(4294901761));
    }
    xnn_math_u32_sqrt__scalar_bitmanip(kBlockSize * sizeof(uint32_t), inputs.data(), outputs.data());
    for (uint32_t i = 0; i < kBlockSize; i++) {
      const uint32_t input = inputs[i];
      const uint32_t output = outputs[i];
      ASSERT_EQ(output, UINT32_C(0x00010000))
        << "input = " << input << ", output = " << output;
    }
  }
}


TEST(SQRT__SCALAR_CLZ_BINSEARCH, uint16_output) {
  std::vector<uint32_t, AlignedAllocator<uint32_t, 64>> inputs(kBlockSize);
  std::vector<uint32_t, AlignedAllocator<uint32_t, 64>> outputs(kBlockSize);
  for (uint32_t n = 0; n <= UINT32_C(4294901760); n += kBlockSize) {
    for (uint32_t i = 0; i < kBlockSize; i++) {
      inputs[i] = std::min<uint32_t>(n + i, UINT32_C(4294901760));
    }
    xnn_math_u32_sqrt__scalar_clz_binsearch(kBlockSize * sizeof(uint32_t), inputs.data(), outputs.data());
    for (uint32_t i = 0; i < kBlockSize; i++) {
      const uint32_t input = inputs[i];
      const uint32_t output = outputs[i];
      const int64_t squared_output = int64_t(uint64_t(output) * uint64_t(output));

      const uint32_t prev_output = output - 1;
      const int64_t squared_prev_output = int64_t(uint64_t(prev_output) * uint64_t(prev_output));
      ASSERT_LT(std::abs(squared_output - int64_t(input)), std::abs(squared_prev_output - int64_t(input)))
        << "input = " << input << ", output = " << output;

      const uint32_t next_output = output + 1;
      const int64_t squared_next_output = int64_t(uint64_t(next_output) * uint64_t(next_output));
      ASSERT_LT(std::abs(squared_output - int64_t(input)), std::abs(squared_next_output - int64_t(input)))
        << "input = " << input << ", output = " << output;
    }
  }
}

TEST(SQRT__SCALAR_CLZ_BINSEARCH, 65536_output) {
  std::vector<uint32_t, AlignedAllocator<uint32_t, 64>> inputs(kBlockSize);
  std::vector<uint32_t, AlignedAllocator<uint32_t, 64>> outputs(kBlockSize);
  for (uint32_t n = UINT32_C(4294901761); n >= UINT32_C(4294901761); n += kBlockSize) {
    for (uint32_t i = 0; i < kBlockSize; i++) {
      inputs[i] = std::max<uint32_t>(n + i, UINT32_C(4294901761));
    }
    xnn_math_u32_sqrt__scalar_clz_binsearch(kBlockSize * sizeof(uint32_t), inputs.data(), outputs.data());
    for (uint32_t i = 0; i < kBlockSize; i++) {
      const uint32_t input = inputs[i];
      const uint32_t output = outputs[i];
      ASSERT_EQ(output, UINT32_C(0x00010000))
        << "input = " << input << ", output = " << output;
    }
  }
}


TEST(SQRT__SCALAR_CLZ_NEWTON, uint16_output) {
  std::vector<uint32_t, AlignedAllocator<uint32_t, 64>> inputs(kBlockSize);
  std::vector<uint32_t, AlignedAllocator<uint32_t, 64>> outputs(kBlockSize);
  for (uint32_t n = 0; n <= UINT32_C(4294901760); n += kBlockSize) {
    for (uint32_t i = 0; i < kBlockSize; i++) {
      inputs[i] = std::min<uint32_t>(n + i, UINT32_C(4294901760));
    }
    xnn_math_u32_sqrt__scalar_clz_newton(kBlockSize * sizeof(uint32_t), inputs.data(), outputs.data());
    for (uint32_t i = 0; i < kBlockSize; i++) {
      const uint32_t input = inputs[i];
      const uint32_t output = outputs[i];
      const int64_t squared_output = int64_t(uint64_t(output) * uint64_t(output));

      const uint32_t prev_output = output - 1;
      const int64_t squared_prev_output = int64_t(uint64_t(prev_output) * uint64_t(prev_output));
      ASSERT_LT(std::abs(squared_output - int64_t(input)), std::abs(squared_prev_output - int64_t(input)))
        << "input = " << input << ", output = " << output;

      const uint32_t next_output = output + 1;
      const int64_t squared_next_output = int64_t(uint64_t(next_output) * uint64_t(next_output));
      ASSERT_LT(std::abs(squared_output - int64_t(input)), std::abs(squared_next_output - int64_t(input)))
        << "input = " << input << ", output = " << output;
    }
  }
}

TEST(SQRT__SCALAR_CLZ_NEWTON, 65536_output) {
  std::vector<uint32_t, AlignedAllocator<uint32_t, 64>> inputs(kBlockSize);
  std::vector<uint32_t, AlignedAllocator<uint32_t, 64>> outputs(kBlockSize);
  for (uint32_t n = UINT32_C(4294901761); n >= UINT32_C(4294901761); n += kBlockSize) {
    for (uint32_t i = 0; i < kBlockSize; i++) {
      inputs[i] = std::max<uint32_t>(n + i, UINT32_C(4294901761));
    }
    xnn_math_u32_sqrt__scalar_clz_newton(kBlockSize * sizeof(uint32_t), inputs.data(), outputs.data());
    for (uint32_t i = 0; i < kBlockSize; i++) {
      const uint32_t input = inputs[i];
      const uint32_t output = outputs[i];
      ASSERT_EQ(output, UINT32_C(0x00010000))
        << "input = " << input << ", output = " << output;
    }
  }
}


TEST(SQRT__SCALAR_CVTI32_SQRT_LRINT, uint16_output) {
  std::vector<uint32_t, AlignedAllocator<uint32_t, 64>> inputs(kBlockSize);
  std::vector<uint32_t, AlignedAllocator<uint32_t, 64>> outputs(kBlockSize);
  for (uint32_t n = 0; n <= UINT32_C(4294901760); n += kBlockSize) {
    for (uint32_t i = 0; i < kBlockSize; i++) {
      inputs[i] = std::min<uint32_t>(n + i, UINT32_C(4294901760));
    }
    xnn_math_u32_sqrt__scalar_cvti32_sqrt_lrint(kBlockSize * sizeof(uint32_t), inputs.data(), outputs.data());
    for (uint32_t i = 0; i < kBlockSize; i++) {
      const uint32_t input = inputs[i];
      const uint32_t output = outputs[i];
      const int64_t squared_output = int64_t(uint64_t(output) * uint64_t(output));

      const uint32_t prev_output = output - 1;
      const int64_t squared_prev_output = int64_t(uint64_t(prev_output) * uint64_t(prev_output));
      ASSERT_LT(std::abs(squared_output - int64_t(input)), std::abs(squared_prev_output - int64_t(input)))
        << "input = " << input << ", output = " << output;

      const uint32_t next_output = output + 1;
      const int64_t squared_next_output = int64_t(uint64_t(next_output) * uint64_t(next_output));
      ASSERT_LT(std::abs(squared_output - int64_t(input)), std::abs(squared_next_output - int64_t(input)))
        << "input = " << input << ", output = " << output;
    }
  }
}

TEST(SQRT__SCALAR_CVTI32_SQRT_LRINT, 65536_output) {
  std::vector<uint32_t, AlignedAllocator<uint32_t, 64>> inputs(kBlockSize);
  std::vector<uint32_t, AlignedAllocator<uint32_t, 64>> outputs(kBlockSize);
  for (uint32_t n = UINT32_C(4294901761); n >= UINT32_C(4294901761); n += kBlockSize) {
    for (uint32_t i = 0; i < kBlockSize; i++) {
      inputs[i] = std::max<uint32_t>(n + i, UINT32_C(4294901761));
    }
    xnn_math_u32_sqrt__scalar_cvti32_sqrt_lrint(kBlockSize * sizeof(uint32_t), inputs.data(), outputs.data());
    for (uint32_t i = 0; i < kBlockSize; i++) {
      const uint32_t input = inputs[i];
      const uint32_t output = outputs[i];
      ASSERT_EQ(output, UINT32_C(0x00010000))
        << "input = " << input << ", output = " << output;
    }
  }
}


TEST(SQRT__SCALAR_CVTI64_SQRT_LRINT, uint16_output) {
  std::vector<uint32_t, AlignedAllocator<uint32_t, 64>> inputs(kBlockSize);
  std::vector<uint32_t, AlignedAllocator<uint32_t, 64>> outputs(kBlockSize);
  for (uint32_t n = 0; n <= UINT32_C(4294901760); n += kBlockSize) {
    for (uint32_t i = 0; i < kBlockSize; i++) {
      inputs[i] = std::min<uint32_t>(n + i, UINT32_C(4294901760));
    }
    xnn_math_u32_sqrt__scalar_cvti64_sqrt_lrint(kBlockSize * sizeof(uint32_t), inputs.data(), outputs.data());
    for (uint32_t i = 0; i < kBlockSize; i++) {
      const uint32_t input = inputs[i];
      const uint32_t output = outputs[i];
      const int64_t squared_output = int64_t(uint64_t(output) * uint64_t(output));

      const uint32_t prev_output = output - 1;
      const int64_t squared_prev_output = int64_t(uint64_t(prev_output) * uint64_t(prev_output));
      ASSERT_LT(std::abs(squared_output - int64_t(input)), std::abs(squared_prev_output - int64_t(input)))
        << "input = " << input << ", output = " << output;

      const uint32_t next_output = output + 1;
      const int64_t squared_next_output = int64_t(uint64_t(next_output) * uint64_t(next_output));
      ASSERT_LT(std::abs(squared_output - int64_t(input)), std::abs(squared_next_output - int64_t(input)))
        << "input = " << input << ", output = " << output;
    }
  }
}

TEST(SQRT__SCALAR_CVTI64_SQRT_LRINT, 65536_output) {
  std::vector<uint32_t, AlignedAllocator<uint32_t, 64>> inputs(kBlockSize);
  std::vector<uint32_t, AlignedAllocator<uint32_t, 64>> outputs(kBlockSize);
  for (uint32_t n = UINT32_C(4294901761); n >= UINT32_C(4294901761); n += kBlockSize) {
    for (uint32_t i = 0; i < kBlockSize; i++) {
      inputs[i] = std::max<uint32_t>(n + i, UINT32_C(4294901761));
    }
    xnn_math_u32_sqrt__scalar_cvti64_sqrt_lrint(kBlockSize * sizeof(uint32_t), inputs.data(), outputs.data());
    for (uint32_t i = 0; i < kBlockSize; i++) {
      const uint32_t input = inputs[i];
      const uint32_t output = outputs[i];
      ASSERT_EQ(output, UINT32_C(0x00010000))
        << "input = " << input << ", output = " << output;
    }
  }
}


TEST(SQRT__SCALAR_CVTU32_SQRT_LRINT, uint16_output) {
  std::vector<uint32_t, AlignedAllocator<uint32_t, 64>> inputs(kBlockSize);
  std::vector<uint32_t, AlignedAllocator<uint32_t, 64>> outputs(kBlockSize);
  for (uint32_t n = 0; n <= UINT32_C(4294901760); n += kBlockSize) {
    for (uint32_t i = 0; i < kBlockSize; i++) {
      inputs[i] = std::min<uint32_t>(n + i, UINT32_C(4294901760));
    }
    xnn_math_u32_sqrt__scalar_cvtu32_sqrt_lrint(kBlockSize * sizeof(uint32_t), inputs.data(), outputs.data());
    for (uint32_t i = 0; i < kBlockSize; i++) {
      const uint32_t input = inputs[i];
      const uint32_t output = outputs[i];
      const int64_t squared_output = int64_t(uint64_t(output) * uint64_t(output));

      const uint32_t prev_output = output - 1;
      const int64_t squared_prev_output = int64_t(uint64_t(prev_output) * uint64_t(prev_output));
      ASSERT_LT(std::abs(squared_output - int64_t(input)), std::abs(squared_prev_output - int64_t(input)))
        << "input = " << input << ", output = " << output;

      const uint32_t next_output = output + 1;
      const int64_t squared_next_output = int64_t(uint64_t(next_output) * uint64_t(next_output));
      ASSERT_LT(std::abs(squared_output - int64_t(input)), std::abs(squared_next_output - int64_t(input)))
        << "input = " << input << ", output = " << output;
    }
  }
}

TEST(SQRT__SCALAR_CVTU32_SQRT_LRINT, 65536_output) {
  std::vector<uint32_t, AlignedAllocator<uint32_t, 64>> inputs(kBlockSize);
  std::vector<uint32_t, AlignedAllocator<uint32_t, 64>> outputs(kBlockSize);
  for (uint32_t n = UINT32_C(4294901761); n >= UINT32_C(4294901761); n += kBlockSize) {
    for (uint32_t i = 0; i < kBlockSize; i++) {
      inputs[i] = std::max<uint32_t>(n + i, UINT32_C(4294901761));
    }
    xnn_math_u32_sqrt__scalar_cvtu32_sqrt_lrint(kBlockSize * sizeof(uint32_t), inputs.data(), outputs.data());
    for (uint32_t i = 0; i < kBlockSize; i++) {
      const uint32_t input = inputs[i];
      const uint32_t output = outputs[i];
      ASSERT_EQ(output, UINT32_C(0x00010000))
        << "input = " << input << ", output = " << output;
    }
  }
}


TEST(SQRT__SCALAR_CVTI64_SQRTF_LRINTF, uint16_output) {
  std::vector<uint32_t, AlignedAllocator<uint32_t, 64>> inputs(kBlockSize);
  std::vector<uint32_t, AlignedAllocator<uint32_t, 64>> outputs(kBlockSize);
  for (uint32_t n = 0; n <= UINT32_C(4294901760); n += kBlockSize) {
    for (uint32_t i = 0; i < kBlockSize; i++) {
      inputs[i] = std::min<uint32_t>(n + i, UINT32_C(4294901760));
    }
    xnn_math_u32_sqrt__scalar_cvti64_sqrtf_lrintf(kBlockSize * sizeof(uint32_t), inputs.data(), outputs.data());
    for (uint32_t i = 0; i < kBlockSize; i++) {
      const uint32_t input = inputs[i];
      const uint32_t output = outputs[i];
      const int64_t squared_output = int64_t(uint64_t(output) * uint64_t(output));

      const uint32_t prev_output = output - 1;
      const int64_t squared_prev_output = int64_t(uint64_t(prev_output) * uint64_t(prev_output));
      ASSERT_LT(std::abs(squared_output - int64_t(input)), std::abs(squared_prev_output - int64_t(input)))
        << "input = " << input << ", output = " << output;

      const uint32_t next_output = output + 1;
      const int64_t squared_next_output = int64_t(uint64_t(next_output) * uint64_t(next_output));
      ASSERT_LT(std::abs(squared_output - int64_t(input)), std::abs(squared_next_output - int64_t(input)))
        << "input = " << input << ", output = " << output;
    }
  }
}

TEST(SQRT__SCALAR_CVTI64_SQRTF_LRINTF, 65536_output) {
  std::vector<uint32_t, AlignedAllocator<uint32_t, 64>> inputs(kBlockSize);
  std::vector<uint32_t, AlignedAllocator<uint32_t, 64>> outputs(kBlockSize);
  for (uint32_t n = UINT32_C(4294901761); n >= UINT32_C(4294901761); n += kBlockSize) {
    for (uint32_t i = 0; i < kBlockSize; i++) {
      inputs[i] = std::max<uint32_t>(n + i, UINT32_C(4294901761));
    }
    xnn_math_u32_sqrt__scalar_cvti64_sqrtf_lrintf(kBlockSize * sizeof(uint32_t), inputs.data(), outputs.data());
    for (uint32_t i = 0; i < kBlockSize; i++) {
      const uint32_t input = inputs[i];
      const uint32_t output = outputs[i];
      ASSERT_EQ(output, UINT32_C(0x00010000))
        << "input = " << input << ", output = " << output;
    }
  }
}


TEST(SQRT__SCALAR_CVTU32_SQRTF_LRINTF, uint16_output) {
  std::vector<uint32_t, AlignedAllocator<uint32_t, 64>> inputs(kBlockSize);
  std::vector<uint32_t, AlignedAllocator<uint32_t, 64>> outputs(kBlockSize);
  for (uint32_t n = 0; n <= UINT32_C(4294901760); n += kBlockSize) {
    for (uint32_t i = 0; i < kBlockSize; i++) {
      inputs[i] = std::min<uint32_t>(n + i, UINT32_C(4294901760));
    }
    xnn_math_u32_sqrt__scalar_cvtu32_sqrtf_lrintf(kBlockSize * sizeof(uint32_t), inputs.data(), outputs.data());
    for (uint32_t i = 0; i < kBlockSize; i++) {
      const uint32_t input = inputs[i];
      const uint32_t output = outputs[i];
      const int64_t squared_output = int64_t(uint64_t(output) * uint64_t(output));

      const uint32_t prev_output = output - 1;
      const int64_t squared_prev_output = int64_t(uint64_t(prev_output) * uint64_t(prev_output));
      ASSERT_LT(std::abs(squared_output - int64_t(input)), std::abs(squared_prev_output - int64_t(input)))
        << "input = " << input << ", output = " << output;

      const uint32_t next_output = output + 1;
      const int64_t squared_next_output = int64_t(uint64_t(next_output) * uint64_t(next_output));
      ASSERT_LT(std::abs(squared_output - int64_t(input)), std::abs(squared_next_output - int64_t(input)))
        << "input = " << input << ", output = " << output;
    }
  }
}

TEST(SQRT__SCALAR_CVTU32_SQRTF_LRINTF, 65536_output) {
  std::vector<uint32_t, AlignedAllocator<uint32_t, 64>> inputs(kBlockSize);
  std::vector<uint32_t, AlignedAllocator<uint32_t, 64>> outputs(kBlockSize);
  for (uint32_t n = UINT32_C(4294901761); n >= UINT32_C(4294901761); n += kBlockSize) {
    for (uint32_t i = 0; i < kBlockSize; i++) {
      inputs[i] = std::max<uint32_t>(n + i, UINT32_C(4294901761));
    }
    xnn_math_u32_sqrt__scalar_cvtu32_sqrtf_lrintf(kBlockSize * sizeof(uint32_t), inputs.data(), outputs.data());
    for (uint32_t i = 0; i < kBlockSize; i++) {
      const uint32_t input = inputs[i];
      const uint32_t output = outputs[i];
      ASSERT_EQ(output, UINT32_C(0x00010000))
        << "input = " << input << ", output = " << output;
    }
  }
}


TEST(SQRT__SCALAR_HASHEMIAN, uint16_output) {
  std::vector<uint32_t, AlignedAllocator<uint32_t, 64>> inputs(kBlockSize);
  std::vector<uint32_t, AlignedAllocator<uint32_t, 64>> outputs(kBlockSize);
  for (uint32_t n = 0; n <= UINT32_C(4294901760); n += kBlockSize) {
    for (uint32_t i = 0; i < kBlockSize; i++) {
      inputs[i] = std::min<uint32_t>(n + i, UINT32_C(4294901760));
    }
    xnn_math_u32_sqrt__scalar_hashemian(kBlockSize * sizeof(uint32_t), inputs.data(), outputs.data());
    for (uint32_t i = 0; i < kBlockSize; i++) {
      const uint32_t input = inputs[i];
      const uint32_t output = outputs[i];
      const int64_t squared_output = int64_t(uint64_t(output) * uint64_t(output));

      const uint32_t prev_output = output - 1;
      const int64_t squared_prev_output = int64_t(uint64_t(prev_output) * uint64_t(prev_output));
      ASSERT_LT(std::abs(squared_output - int64_t(input)), std::abs(squared_prev_output - int64_t(input)))
        << "input = " << input << ", output = " << output;

      const uint32_t next_output = output + 1;
      const int64_t squared_next_output = int64_t(uint64_t(next_output) * uint64_t(next_output));
      ASSERT_LT(std::abs(squared_output - int64_t(input)), std::abs(squared_next_output - int64_t(input)))
        << "input = " << input << ", output = " << output;
    }
  }
}

TEST(SQRT__SCALAR_HASHEMIAN, 65536_output) {
  std::vector<uint32_t, AlignedAllocator<uint32_t, 64>> inputs(kBlockSize);
  std::vector<uint32_t, AlignedAllocator<uint32_t, 64>> outputs(kBlockSize);
  for (uint32_t n = UINT32_C(4294901761); n >= UINT32_C(4294901761); n += kBlockSize) {
    for (uint32_t i = 0; i < kBlockSize; i++) {
      inputs[i] = std::max<uint32_t>(n + i, UINT32_C(4294901761));
    }
    xnn_math_u32_sqrt__scalar_hashemian(kBlockSize * sizeof(uint32_t), inputs.data(), outputs.data());
    for (uint32_t i = 0; i < kBlockSize; i++) {
      const uint32_t input = inputs[i];
      const uint32_t output = outputs[i];
      ASSERT_EQ(output, UINT32_C(0x00010000))
        << "input = " << input << ", output = " << output;
    }
  }
}


TEST(SQRT__SCALAR_TFLM, uint16_output) {
  std::vector<uint32_t, AlignedAllocator<uint32_t, 64>> inputs(kBlockSize);
  std::vector<uint32_t, AlignedAllocator<uint32_t, 64>> outputs(kBlockSize);
  for (uint32_t n = 0; n <= UINT32_C(4294901760); n += kBlockSize) {
    for (uint32_t i = 0; i < kBlockSize; i++) {
      inputs[i] = std::min<uint32_t>(n + i, UINT32_C(4294901760));
    }
    xnn_math_u32_sqrt__scalar_tflm(kBlockSize * sizeof(uint32_t), inputs.data(), outputs.data());
    for (uint32_t i = 0; i < kBlockSize; i++) {
      const uint32_t input = inputs[i];
      const uint32_t output = outputs[i];
      const int64_t squared_output = int64_t(uint64_t(output) * uint64_t(output));

      const uint32_t prev_output = output - 1;
      const int64_t squared_prev_output = int64_t(uint64_t(prev_output) * uint64_t(prev_output));
      ASSERT_LT(std::abs(squared_output - int64_t(input)), std::abs(squared_prev_output - int64_t(input)))
        << "input = " << input << ", output = " << output;

      const uint32_t next_output = output + 1;
      const int64_t squared_next_output = int64_t(uint64_t(next_output) * uint64_t(next_output));
      ASSERT_LT(std::abs(squared_output - int64_t(input)), std::abs(squared_next_output - int64_t(input)))
        << "input = " << input << ", output = " << output;
    }
  }
}

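// Note: this test carries the DISABLED_ prefix, so gtest skips it by default;
// the saturating 0x00010000 case is therefore not exercised for the TFLM variant.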
TEST(SQRT__SCALAR_TFLM, DISABLED_65536_output) {
  std::vector<uint32_t, AlignedAllocator<uint32_t, 64>> inputs(kBlockSize);
  std::vector<uint32_t, AlignedAllocator<uint32_t, 64>> outputs(kBlockSize);
  for (uint32_t n = UINT32_C(4294901761); n >= UINT32_C(4294901761); n += kBlockSize) {
    for (uint32_t i = 0; i < kBlockSize; i++) {
      inputs[i] = std::max<uint32_t>(n + i, UINT32_C(4294901761));
    }
    xnn_math_u32_sqrt__scalar_tflm(kBlockSize * sizeof(uint32_t), inputs.data(), outputs.data());
    for (uint32_t i = 0; i < kBlockSize; i++) {
      const uint32_t input = inputs[i];
      const uint32_t output = outputs[i];
      ASSERT_EQ(output, UINT32_C(0x00010000))
        << "input = " << input << ", output = " << output;
    }
  }
}