// xref: /aosp_15_r20/external/XNNPACK/test/avgpool-microkernel-tester.h (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
// Copyright (c) Facebook, Inc. and its affiliates.
// All rights reserved.
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
9 #pragma once
10 
11 #include <gtest/gtest.h>
12 
13 #include <algorithm>
14 #include <cassert>
15 #include <cmath>
16 #include <cstddef>
17 #include <cstdlib>
18 #include <limits>
19 #include <random>
20 #include <vector>
21 
22 #include <fp16.h>
23 
24 #include <xnnpack.h>
25 #include <xnnpack/aligned-allocator.h>
26 #include <xnnpack/microfnptr.h>
27 #include <xnnpack/microparams-init.h>
28 #include <xnnpack/requantization.h>
29 
30 
31 class AvgPoolMicrokernelTester {
32  public:
output_pixels(size_t output_pixels)33   inline AvgPoolMicrokernelTester& output_pixels(size_t output_pixels) {
34     assert(output_pixels != 0);
35     this->output_pixels_ = output_pixels;
36     return *this;
37   }
38 
output_pixels()39   inline size_t output_pixels() const {
40     return this->output_pixels_;
41   }
42 
step(size_t step)43   inline AvgPoolMicrokernelTester& step(size_t step) {
44     assert(step != 0);
45     this->step_ = step;
46     return *this;
47   }
48 
step()49   inline size_t step() const {
50     return this->step_;
51   }
52 
input_offset(size_t input_offset)53   inline AvgPoolMicrokernelTester& input_offset(size_t input_offset) {
54     assert(input_offset != 0);
55     this->input_offset_ = input_offset;
56     return *this;
57   }
58 
input_offset()59   inline size_t input_offset() const {
60     return this->input_offset_;
61   }
62 
zero_index(size_t zero_index)63   inline AvgPoolMicrokernelTester& zero_index(size_t zero_index) {
64     this->zero_index_ = zero_index;
65     return *this;
66   }
67 
zero_index()68   inline size_t zero_index() const {
69     return this->zero_index_;
70   }
71 
pooling_elements(size_t pooling_elements)72   inline AvgPoolMicrokernelTester& pooling_elements(size_t pooling_elements) {
73     assert(pooling_elements != 0);
74     this->pooling_elements_ = pooling_elements;
75     return *this;
76   }
77 
pooling_elements()78   inline size_t pooling_elements() const {
79     return this->pooling_elements_;
80   }
81 
packed_pooling_elements()82   inline size_t packed_pooling_elements() const {
83     if (pooling_elements() <= primary_pooling_tile()) {
84       return primary_pooling_tile();
85     } else {
86       return (pooling_elements() - primary_pooling_tile()) % incremental_pooling_tile() == 0 ? pooling_elements() : ((pooling_elements() - primary_pooling_tile()) / incremental_pooling_tile() + 1) * incremental_pooling_tile() + primary_pooling_tile();
87     }
88   }
89 
90   inline AvgPoolMicrokernelTester& pooling_tile(size_t primary_tile, size_t incremental_tile = 0) {
91     assert(primary_tile != 0);
92     this->primary_pooling_tile_ = primary_tile;
93     this->incremental_pooling_tile_ = incremental_tile;
94     return *this;
95   }
96 
primary_pooling_tile(size_t primary_pooling_tile)97   inline AvgPoolMicrokernelTester& primary_pooling_tile(size_t primary_pooling_tile) {
98     assert(primary_pooling_tile != 0);
99     this->primary_pooling_tile_ = primary_pooling_tile;
100     return *this;
101   }
102 
primary_pooling_tile()103   inline size_t primary_pooling_tile() const {
104     return this->primary_pooling_tile_;
105   }
106 
incremental_pooling_tile(size_t incremental_pooling_tile)107   inline AvgPoolMicrokernelTester& incremental_pooling_tile(size_t incremental_pooling_tile) {
108     assert(incremental_pooling_tile != 0);
109     this->incremental_pooling_tile_ = incremental_pooling_tile;
110     return *this;
111   }
112 
incremental_pooling_tile()113   inline size_t incremental_pooling_tile() const {
114     return this->incremental_pooling_tile_;
115   }
116 
channels(size_t channels)117   inline AvgPoolMicrokernelTester& channels(size_t channels) {
118     assert(channels != 0);
119     this->channels_ = channels;
120     return *this;
121   }
122 
channels()123   inline size_t channels() const {
124     return this->channels_;
125   }
126 
output_stride(size_t output_stride)127   inline AvgPoolMicrokernelTester& output_stride(size_t output_stride) {
128     assert(output_stride != 0);
129     this->output_stride_ = output_stride;
130     return *this;
131   }
132 
output_stride()133   inline size_t output_stride() const {
134     if (this->output_stride_ == 0) {
135       return channels();
136     } else {
137       assert(this->output_stride_ >= channels());
138       return this->output_stride_;
139     }
140   }
141 
input_scale(float input_scale)142   inline AvgPoolMicrokernelTester& input_scale(float input_scale) {
143     assert(input_scale > 0.0f);
144     assert(std::isnormal(input_scale));
145     this->input_scale_ = input_scale;
146     return *this;
147   }
148 
input_scale()149   inline float input_scale() const {
150     return this->input_scale_;
151   }
152 
input_zero_point(uint8_t input_zero_point)153   inline AvgPoolMicrokernelTester& input_zero_point(uint8_t input_zero_point) {
154     this->input_zero_point_ = input_zero_point;
155     return *this;
156   }
157 
input_zero_point()158   inline uint8_t input_zero_point() const {
159     return this->input_zero_point_;
160   }
161 
output_scale(float output_scale)162   inline AvgPoolMicrokernelTester& output_scale(float output_scale) {
163     assert(output_scale > 0.0f);
164     assert(std::isnormal(output_scale));
165     this->output_scale_ = output_scale;
166     return *this;
167   }
168 
output_scale()169   inline float output_scale() const {
170     return this->output_scale_;
171   }
172 
output_zero_point(uint8_t output_zero_point)173   inline AvgPoolMicrokernelTester& output_zero_point(uint8_t output_zero_point) {
174     this->output_zero_point_ = output_zero_point;
175     return *this;
176   }
177 
output_zero_point()178   inline uint8_t output_zero_point() const {
179     return this->output_zero_point_;
180   }
181 
qmin(uint8_t qmin)182   inline AvgPoolMicrokernelTester& qmin(uint8_t qmin) {
183     this->qmin_ = qmin;
184     return *this;
185   }
186 
qmin()187   inline uint8_t qmin() const {
188     return this->qmin_;
189   }
190 
qmax(uint8_t qmax)191   inline AvgPoolMicrokernelTester& qmax(uint8_t qmax) {
192     this->qmax_ = qmax;
193     return *this;
194   }
195 
qmax()196   inline uint8_t qmax() const {
197     return this->qmax_;
198   }
199 
iterations(size_t iterations)200   inline AvgPoolMicrokernelTester& iterations(size_t iterations) {
201     this->iterations_ = iterations;
202     return *this;
203   }
204 
iterations()205   inline size_t iterations() const {
206     return this->iterations_;
207   }
208 
Test(xnn_f16_avgpool_minmax_unipass_ukernel_function avgpool_minmax,xnn_init_f16_scaleminmax_params_fn init_params)209   void Test(xnn_f16_avgpool_minmax_unipass_ukernel_function avgpool_minmax, xnn_init_f16_scaleminmax_params_fn init_params) const {
210     std::random_device random_device;
211     auto rng = std::mt19937(random_device());
212     std::uniform_real_distribution<float> f32dist;
213 
214     std::vector<const uint16_t*> indirect_input((output_pixels() - 1) * step() + packed_pooling_elements());
215     std::vector<uint16_t> input(XNN_EXTRA_BYTES / sizeof(uint16_t) +
216       input_offset() + indirect_input.size() * channels());
217     std::vector<uint16_t> zero(channels() + XNN_EXTRA_BYTES / sizeof(uint16_t));
218     std::vector<uint16_t> output((output_pixels() - 1) * output_stride() + channels());
219     std::vector<float> output_ref(output_pixels() * channels());
220     for (size_t iteration = 0; iteration < iterations(); iteration++) {
221       std::generate(input.begin(), input.end(), [&]() { return fp16_ieee_from_fp32_value(f32dist(rng)); });
222       std::fill(input.begin(), input.begin() + input_offset(), UINT16_C(0x7E00) /* NaN */);
223       std::fill(input.end() - XNN_EXTRA_BYTES / sizeof(uint16_t), input.end(), UINT16_C(0x7E00) /* NaN */);
224       std::fill(output.begin(), output.end(), UINT16_C(0x7E00) /* NaN */);
225 
226       for (size_t i = 0; i < (output_pixels() - 1) * step() + pooling_elements(); i++) {
227         indirect_input[i] = input.data() + i * channels();
228       }
229       std::shuffle(indirect_input.begin(),
230         indirect_input.begin() + (output_pixels() - 1) * step() + pooling_elements(), rng);
231       if (zero_index() != SIZE_MAX) {
232         indirect_input[zero_index()] = zero.data();
233       }
234 
235       // Compute reference results, without clamping.
236       for (size_t x = 0; x < output_pixels(); x++) {
237         for (size_t c = 0; c < channels(); c++) {
238           float acc = 0.0f;
239           for (size_t p = 0; p < pooling_elements(); p++) {
240             const uint16_t* row = indirect_input[x * step() + p];
241             if (row != zero.data()) {
242               acc += fp16_ieee_to_fp32_value(row[c + input_offset()]);
243             }
244           }
245           output_ref[x * channels() + c] = acc / float(pooling_elements());
246         }
247       }
248 
249       // Compute clamping parameters.
250       const float accumulated_min = *std::min_element(output_ref.cbegin(), output_ref.cend());
251       const float accumulated_max = *std::max_element(output_ref.cbegin(), output_ref.cend());
252       const float accumulated_range = accumulated_max - accumulated_min;
253       float output_min_as_float = accumulated_min + float(qmin()) / 255.0f * accumulated_range;
254       float output_max_as_float = accumulated_max - float(255 - qmax()) / 255.0f * accumulated_range;
255       const uint16_t output_min_as_half = fp16_ieee_from_fp32_value(output_min_as_float);
256       const uint16_t output_max_as_half = fp16_ieee_from_fp32_value(output_max_as_float);
257       output_min_as_float = fp16_ieee_to_fp32_value(output_min_as_half);
258       output_max_as_float = fp16_ieee_to_fp32_value(output_max_as_half);
259 
260       // Clamp reference results.
261       for (float& output_value : output_ref) {
262         output_value = std::max(std::min(output_value, output_max_as_float), output_min_as_float);
263       }
264 
265       // Prepare parameters.
266       xnn_f16_scaleminmax_params params;
267       init_params(&params, fp16_ieee_from_fp32_value(1.0f / float(pooling_elements())), output_min_as_half, output_max_as_half);
268 
269       // Call optimized micro-kernel.
270       avgpool_minmax(output_pixels(), pooling_elements(), channels(),
271         reinterpret_cast<const void**>(indirect_input.data()), input_offset() * sizeof(uint16_t), zero.data(),
272         output.data(),
273         step() * sizeof(void*),
274         (output_stride() - channels()) * sizeof(uint16_t),
275         &params);
276 
277       // Verify results.
278       for (size_t x = 0; x < output_pixels(); x++) {
279         for (size_t c = 0; c < channels(); c++) {
280           ASSERT_GE(fp16_ieee_to_fp32_value(output[x * output_stride() + c]), output_min_as_float)
281             << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
282             << ", pooling elements = " << pooling_elements() << ", step = " << step()
283             << ", input offset = " << input_offset();
284           ASSERT_LE(fp16_ieee_to_fp32_value(output[x * output_stride() + c]), output_max_as_float)
285             << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
286             << ", pooling elements = " << pooling_elements() << ", step = " << step()
287             << ", input offset = " << input_offset();
288           ASSERT_NEAR(
289               fp16_ieee_to_fp32_value(output[x * output_stride() + c]),
290               output_ref[x * channels() + c],
291               std::max(1.0e-4f, std::abs(output_ref[x * channels() + c]) * 3.0e-3f))
292             << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
293             << ", pooling elements = " << pooling_elements() << ", step = " << step()
294             << ", input offset = " << input_offset();
295         }
296       }
297     }
298   }
299 
Test(xnn_f16_avgpool_minmax_multipass_ukernel_function avgpool_minmax,xnn_init_f16_scaleminmax_params_fn init_params)300   void Test(xnn_f16_avgpool_minmax_multipass_ukernel_function avgpool_minmax, xnn_init_f16_scaleminmax_params_fn init_params) const {
301     std::random_device random_device;
302     auto rng = std::mt19937(random_device());
303     std::uniform_real_distribution<float> f32dist;
304 
305     std::vector<const uint16_t*> indirect_input((output_pixels() - 1) * step() + packed_pooling_elements());
306     std::vector<uint16_t> input(XNN_EXTRA_BYTES / sizeof(uint16_t) +
307       input_offset() + indirect_input.size() * channels());
308     std::vector<uint16_t> zero(channels() + XNN_EXTRA_BYTES / sizeof(uint16_t));
309     std::vector<uint16_t> output((output_pixels() - 1) * output_stride() + channels());
310     std::vector<float> output_ref(output_pixels() * channels());
311     std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> buffer(XNN_EXTRA_BYTES / sizeof(uint16_t) + channels());
312     for (size_t iteration = 0; iteration < iterations(); iteration++) {
313       std::generate(input.begin(), input.end(), [&]() { return fp16_ieee_from_fp32_value(f32dist(rng)); });
314       std::fill(input.begin(), input.begin() + input_offset(), UINT16_C(0x7E00) /* NaN */);
315       std::fill(input.end() - XNN_EXTRA_BYTES / sizeof(uint16_t), input.end(), UINT16_C(0x7E00) /* NaN */);
316       std::fill(output.begin(), output.end(), UINT16_C(0x7E00) /* NaN */);
317 
318       for (size_t i = 0; i < (output_pixels() - 1) * step() + pooling_elements(); i++) {
319         indirect_input[i] = input.data() + i * channels();
320       }
321       std::shuffle(indirect_input.begin(),
322         indirect_input.begin() + (output_pixels() - 1) * step() + pooling_elements(), rng);
323       if (zero_index() != SIZE_MAX) {
324         indirect_input[zero_index()] = zero.data();
325       }
326 
327       // Compute reference results, without clamping.
328       for (size_t x = 0; x < output_pixels(); x++) {
329         for (size_t c = 0; c < channels(); c++) {
330           float acc = 0.0f;
331           for (size_t p = 0; p < pooling_elements(); p++) {
332             const uint16_t* row = indirect_input[x * step() + p];
333             if (row != zero.data()) {
334               acc += fp16_ieee_to_fp32_value(row[c + input_offset()]);
335             }
336           }
337           output_ref[x * channels() + c] = acc / float(pooling_elements());
338         }
339       }
340 
341       // Compute clamping parameters.
342       const float accumulated_min = *std::min_element(output_ref.cbegin(), output_ref.cend());
343       const float accumulated_max = *std::max_element(output_ref.cbegin(), output_ref.cend());
344       const float accumulated_range = accumulated_max - accumulated_min;
345       float output_min_as_float = accumulated_min + float(qmin()) / 255.0f * accumulated_range;
346       float output_max_as_float = accumulated_max - float(255 - qmax()) / 255.0f * accumulated_range;
347       const uint16_t output_min_as_half = fp16_ieee_from_fp32_value(output_min_as_float);
348       const uint16_t output_max_as_half = fp16_ieee_from_fp32_value(output_max_as_float);
349       output_min_as_float = fp16_ieee_to_fp32_value(output_min_as_half);
350       output_max_as_float = fp16_ieee_to_fp32_value(output_max_as_half);
351 
352       // Clamp reference results.
353       for (float& output_value : output_ref) {
354         output_value = std::max(std::min(output_value, output_max_as_float), output_min_as_float);
355       }
356 
357       // Prepare parameters.
358       xnn_f16_scaleminmax_params params;
359       init_params(&params, fp16_ieee_from_fp32_value(1.0f / float(pooling_elements())), output_min_as_half, output_max_as_half);
360 
361       // Call optimized micro-kernel.
362       avgpool_minmax(output_pixels(), pooling_elements(), channels(),
363         reinterpret_cast<const void**>(indirect_input.data()), input_offset() * sizeof(uint16_t), zero.data(),
364         buffer.data(), output.data(),
365         (step() - (packed_pooling_elements() - incremental_pooling_tile())) * sizeof(void*),
366         (output_stride() - channels()) * sizeof(uint16_t),
367         &params);
368 
369       // Verify results.
370       for (size_t x = 0; x < output_pixels(); x++) {
371         for (size_t c = 0; c < channels(); c++) {
372           ASSERT_GE(fp16_ieee_to_fp32_value(output[x * output_stride() + c]), output_min_as_float)
373             << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
374             << ", pooling elements = " << pooling_elements() << ", step = " << step()
375             << ", input offset = " << input_offset();
376           ASSERT_LE(fp16_ieee_to_fp32_value(output[x * output_stride() + c]), output_max_as_float)
377             << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
378             << ", pooling elements = " << pooling_elements() << ", step = " << step()
379             << ", input offset = " << input_offset();
380           ASSERT_NEAR(
381               fp16_ieee_to_fp32_value(output[x * output_stride() + c]),
382               output_ref[x * channels() + c],
383               std::max(1.0e-4f, std::abs(output_ref[x * channels() + c]) * 3.0e-3f))
384             << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
385             << ", pooling elements = " << pooling_elements() << ", step = " << step()
386             << ", input offset = " << input_offset();
387         }
388       }
389     }
390   }
391 
Test(xnn_f32_avgpool_minmax_unipass_ukernel_function avgpool_minmax,xnn_init_f32_scaleminmax_params_fn init_params)392   void Test(xnn_f32_avgpool_minmax_unipass_ukernel_function avgpool_minmax, xnn_init_f32_scaleminmax_params_fn init_params) const {
393     std::random_device random_device;
394     auto rng = std::mt19937(random_device());
395     std::uniform_real_distribution<float> f32dist;
396 
397     std::vector<const float*> indirect_input((output_pixels() - 1) * step() + packed_pooling_elements());
398     std::vector<float> input(XNN_EXTRA_BYTES / sizeof(float) +
399       input_offset() + indirect_input.size() * channels());
400     std::vector<float> zero(channels() + XNN_EXTRA_BYTES / sizeof(float));
401     std::vector<float> output((output_pixels() - 1) * output_stride() + channels());
402     std::vector<float> output_ref(output_pixels() * channels());
403     for (size_t iteration = 0; iteration < iterations(); iteration++) {
404       std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); });
405       std::fill(input.begin(), input.begin() + input_offset(), std::nanf(""));
406       std::fill(input.end() - XNN_EXTRA_BYTES / sizeof(float), input.end(), std::nanf(""));
407       std::fill(output.begin(), output.end(), std::nanf(""));
408 
409       for (size_t i = 0; i < (output_pixels() - 1) * step() + pooling_elements(); i++) {
410         indirect_input[i] = input.data() + i * channels();
411       }
412       std::shuffle(indirect_input.begin(),
413         indirect_input.begin() + (output_pixels() - 1) * step() + pooling_elements(), rng);
414       if (zero_index() != SIZE_MAX) {
415         indirect_input[zero_index()] = zero.data();
416       }
417 
418       // Compute reference results, without clamping.
419       for (size_t x = 0; x < output_pixels(); x++) {
420         for (size_t c = 0; c < channels(); c++) {
421           float acc = 0.0f;
422           for (size_t p = 0; p < pooling_elements(); p++) {
423             const float* row = indirect_input[x * step() + p];
424             if (row != zero.data()) {
425               acc += row[c + input_offset()];
426             }
427           }
428           output_ref[x * channels() + c] = acc / float(pooling_elements());
429         }
430       }
431 
432       // Compute clamping parameters.
433       const float accumulated_min = *std::min_element(output_ref.cbegin(), output_ref.cend());
434       const float accumulated_max = *std::max_element(output_ref.cbegin(), output_ref.cend());
435       const float accumulated_range = accumulated_max - accumulated_min;
436       const float output_min = accumulated_min + float(qmin()) / 255.0f * accumulated_range;
437       const float output_max = accumulated_max - float(255 - qmax()) / 255.0f * accumulated_range;
438 
439       // Clamp reference results.
440       for (float& output_value : output_ref) {
441         output_value = std::max(std::min(output_value, output_max), output_min);
442       }
443 
444       // Prepare parameters.
445       xnn_f32_scaleminmax_params params;
446       init_params(&params, 1.0f / float(pooling_elements()), output_min, output_max);
447 
448       // Call optimized micro-kernel.
449       avgpool_minmax(output_pixels(), pooling_elements(), channels(),
450         indirect_input.data(), input_offset() * sizeof(float), zero.data(),
451         output.data(),
452         step() * sizeof(void*),
453         (output_stride() - channels()) * sizeof(float),
454         &params);
455 
456       // Verify results.
457       for (size_t x = 0; x < output_pixels(); x++) {
458         for (size_t c = 0; c < channels(); c++) {
459           ASSERT_GE(output[x * output_stride() + c], output_min)
460             << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
461             << ", pooling elements = " << pooling_elements() << ", step = " << step()
462             << ", input offset = " << input_offset();
463           ASSERT_LE(output[x * output_stride() + c], output_max)
464             << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
465             << ", pooling elements = " << pooling_elements() << ", step = " << step()
466             << ", input offset = " << input_offset();
467           ASSERT_NEAR(
468               output[x * output_stride() + c],
469               output_ref[x * channels() + c],
470               std::abs(output_ref[x * channels() + c]) * 1.0e-6f)
471             << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
472             << ", pooling elements = " << pooling_elements() << ", step = " << step()
473             << ", input offset = " << input_offset();
474         }
475       }
476     }
477   }
478 
Test(xnn_f32_avgpool_minmax_multipass_ukernel_function avgpool_minmax,xnn_init_f32_scaleminmax_params_fn init_params)479   void Test(xnn_f32_avgpool_minmax_multipass_ukernel_function avgpool_minmax, xnn_init_f32_scaleminmax_params_fn init_params) const {
480     std::random_device random_device;
481     auto rng = std::mt19937(random_device());
482     std::uniform_real_distribution<float> f32dist;
483 
484     std::vector<const float*> indirect_input((output_pixels() - 1) * step() + packed_pooling_elements());
485     std::vector<float> input(XNN_EXTRA_BYTES / sizeof(float) +
486       input_offset() + indirect_input.size() * channels());
487     std::vector<float> zero(channels() + XNN_EXTRA_BYTES / sizeof(float));
488     std::vector<float> output((output_pixels() - 1) * output_stride() + channels());
489     std::vector<float> output_ref(output_pixels() * channels());
490     std::vector<float, AlignedAllocator<float, 64>> buffer(XNN_EXTRA_BYTES / sizeof(float) + channels());
491     for (size_t iteration = 0; iteration < iterations(); iteration++) {
492       std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); });
493       std::fill(input.begin(), input.begin() + input_offset(), std::nanf(""));
494       std::fill(input.end() - XNN_EXTRA_BYTES / sizeof(float), input.end(), std::nanf(""));
495       std::fill(output.begin(), output.end(), std::nanf(""));
496 
497       for (size_t i = 0; i < (output_pixels() - 1) * step() + pooling_elements(); i++) {
498         indirect_input[i] = input.data() + i * channels();
499       }
500       std::shuffle(indirect_input.begin(),
501         indirect_input.begin() + (output_pixels() - 1) * step() + pooling_elements(), rng);
502       if (zero_index() != SIZE_MAX) {
503         indirect_input[zero_index()] = zero.data();
504       }
505 
506       // Compute reference results, without clamping.
507       for (size_t x = 0; x < output_pixels(); x++) {
508         for (size_t c = 0; c < channels(); c++) {
509           float acc = 0.0f;
510           for (size_t p = 0; p < pooling_elements(); p++) {
511             const float* row = indirect_input[x * step() + p];
512             if (row != zero.data()) {
513               acc += row[c + input_offset()];
514             }
515           }
516           output_ref[x * channels() + c] = acc / float(pooling_elements());
517         }
518       }
519 
520       // Compute clamping parameters.
521       const float accumulated_min = *std::min_element(output_ref.cbegin(), output_ref.cend());
522       const float accumulated_max = *std::max_element(output_ref.cbegin(), output_ref.cend());
523       const float accumulated_range = accumulated_max - accumulated_min;
524       const float output_min = accumulated_min + float(qmin()) / 255.0f * accumulated_range;
525       const float output_max = accumulated_max - float(255 - qmax()) / 255.0f * accumulated_range;
526 
527       // Clamp reference results.
528       for (float& output_value : output_ref) {
529         output_value = std::max(std::min(output_value, output_max), output_min);
530       }
531 
532       // Prepare parameters.
533       xnn_f32_scaleminmax_params params;
534       init_params(&params, 1.0f / float(pooling_elements()), output_min, output_max);
535 
536       // Call optimized micro-kernel.
537       avgpool_minmax(output_pixels(), pooling_elements(), channels(),
538         indirect_input.data(), input_offset() * sizeof(float), zero.data(),
539         buffer.data(), output.data(),
540         (step() - (packed_pooling_elements() - incremental_pooling_tile())) * sizeof(void*),
541         (output_stride() - channels()) * sizeof(float),
542         &params);
543 
544       // Verify results.
545       for (size_t x = 0; x < output_pixels(); x++) {
546         for (size_t c = 0; c < channels(); c++) {
547           ASSERT_GE(output[x * output_stride() + c], output_min)
548             << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
549             << ", pooling elements = " << pooling_elements() << ", step = " << step()
550             << ", input offset = " << input_offset();
551           ASSERT_LE(output[x * output_stride() + c], output_max)
552             << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
553             << ", pooling elements = " << pooling_elements() << ", step = " << step()
554             << ", input offset = " << input_offset();
555           ASSERT_NEAR(
556               output[x * output_stride() + c],
557               output_ref[x * channels() + c],
558               std::abs(output_ref[x * channels() + c]) * 1.0e-6f)
559             << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
560             << ", pooling elements = " << pooling_elements() << ", step = " << step()
561             << ", input offset = " << input_offset();
562         }
563       }
564     }
565   }
566 
Test(xnn_qu8_avgpool_minmax_unipass_ukernel_function avgpool_minmax,xnn_init_qu8_avgpool_minmax_params_fn init_params)567   void Test(xnn_qu8_avgpool_minmax_unipass_ukernel_function avgpool_minmax, xnn_init_qu8_avgpool_minmax_params_fn init_params) const {
568     std::random_device random_device;
569     auto rng = std::mt19937(random_device());
570     std::uniform_int_distribution<int32_t> u8dist(
571       std::numeric_limits<uint8_t>::min(), std::numeric_limits<uint8_t>::max());
572 
573     std::vector<const uint8_t*> indirect_input((output_pixels() - 1) * step() + packed_pooling_elements());
574     std::vector<uint8_t> input(XNN_EXTRA_BYTES / sizeof(uint8_t) +
575       input_offset() + indirect_input.size() * channels());
576     std::vector<uint8_t> zero(channels() + XNN_EXTRA_BYTES / sizeof(uint8_t));
577     std::vector<uint8_t> output((output_pixels() - 1) * output_stride() + channels());
578     std::vector<uint8_t> output_ref(output_pixels() * channels());
579     std::vector<float> output_real(output_pixels() * channels());
580     std::vector<int32_t> accumulator(output_pixels() * channels());
581     for (size_t iteration = 0; iteration < iterations(); iteration++) {
582       do {
583         std::generate(input.begin(), input.end(), [&]() { return u8dist(rng); });
584       } while (input.size() > 1 && *std::max_element(input.cbegin(), input.cend()) == *std::min_element(input.cbegin(), input.cend()));
585       std::fill(input.begin(), input.begin() + input_offset(), UINT8_C(0xA5));
586       std::fill(input.end() - XNN_EXTRA_BYTES / sizeof(uint8_t), input.end(), UINT8_C(0xA5));
587       std::fill(output.begin(), output.end(), UINT8_C(0xA5));
588 
589       for (size_t i = 0; i < (output_pixels() - 1) * step() + pooling_elements(); i++) {
590         indirect_input[i] = input.data() + i * channels();
591       }
592       std::shuffle(indirect_input.begin(),
593         indirect_input.begin() + (output_pixels() - 1) * step() + pooling_elements(), rng);
594       if (zero_index() != SIZE_MAX) {
595         indirect_input[zero_index()] = zero.data();
596       }
597 
598       // Prepare parameters.
599       xnn_qu8_avgpool_minmax_params params;
600       init_params(
601         &params,
602         -int32_t(input_zero_point()) * int32_t(pooling_elements()),
603         input_scale() / (output_scale() * float(pooling_elements())),
604         output_zero_point(), qmin(), qmax());
605 
606       // Compute reference results.
607       for (size_t x = 0; x < output_pixels(); x++) {
608         for (size_t c = 0; c < channels(); c++) {
609           int32_t acc = 0;
610           for (size_t p = 0; p < pooling_elements(); p++) {
611             const uint8_t* row = indirect_input[x * step() + p];
612             if (row != zero.data()) {
613               acc += int32_t(row[c + input_offset()]);
614             }
615             acc -= int32_t(input_zero_point());
616           }
617           accumulator[x * channels() + c] = acc;
618           output_ref[x * channels() + c] = xnn_qu8_requantize_rndna(
619             acc, input_scale() / (output_scale() * float(pooling_elements())), output_zero_point(), qmin(), qmax());
620           const float scaled_acc =
621             float(acc) * input_scale() / (output_scale() * float(pooling_elements())) + float(output_zero_point());
622           output_real[x * channels() + c] = std::min(std::max(scaled_acc, float(qmin())), float(qmax()));
623         }
624       }
625 
626       // Call optimized micro-kernel.
627       avgpool_minmax(output_pixels(), pooling_elements(), channels(),
628         indirect_input.data(), input_offset() * sizeof(uint8_t), zero.data(),
629         output.data(),
630         step() * sizeof(void*),
631         (output_stride() - channels()) * sizeof(uint8_t),
632         &params);
633 
634       // Verify results.
635       for (size_t x = 0; x < output_pixels(); x++) {
636         for (size_t c = 0; c < channels(); c++) {
637           ASSERT_GE(uint32_t(output[x * output_stride() + c]), uint32_t(qmin()))
638             << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
639             << ", pooling elements = " << pooling_elements() << ", step = " << step()
640             << ", input offset = " << input_offset();
641           ASSERT_LE(uint32_t(output[x * output_stride() + c]), uint32_t(qmax()))
642             << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
643             << ", pooling elements = " << pooling_elements() << ", step = " << step()
644             << ", input offset = " << input_offset();
645           ASSERT_NEAR(float(int32_t(output[x * output_stride() + c])), output_real[x * channels() + c], 0.5f)
646             << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
647             << ", pooling elements = " << pooling_elements() << ", step = " << step()
648             << ", input offset = " << input_offset() << ", accumulator = " << accumulator[x * channels() + c];
649           ASSERT_EQ(uint32_t(output_ref[x * channels() + c]), uint32_t(output[x * output_stride() + c]))
650             << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
651             << ", pooling elements = " << pooling_elements() << ", step = " << step()
652             << ", input offset = " << input_offset() << ", accumulator = " << accumulator[x * channels() + c];
653         }
654       }
655     }
656   }
657 
  // Tests a QU8 average-pooling microkernel that processes the pooling window
  // in multiple passes, spilling partial 32-bit sums to a scratch buffer.
  // Verifies clamping bounds, closeness to the real-valued average, and exact
  // agreement with the rndna requantized reference.
  void Test(xnn_qu8_avgpool_minmax_multipass_ukernel_function avgpool_minmax, xnn_init_qu8_avgpool_minmax_params_fn init_params) const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    std::uniform_int_distribution<int32_t> u8dist(
      std::numeric_limits<uint8_t>::min(), std::numeric_limits<uint8_t>::max());

    // Indirection buffer: one input-row pointer per (output pixel, pooling element).
    std::vector<const uint8_t*> indirect_input((output_pixels() - 1) * step() + packed_pooling_elements());
    std::vector<uint8_t> input(XNN_EXTRA_BYTES / sizeof(uint8_t) +
      input_offset() + indirect_input.size() * channels());
    std::vector<uint8_t> zero(channels() + XNN_EXTRA_BYTES / sizeof(uint8_t));
    std::vector<uint8_t> output((output_pixels() - 1) * output_stride() + channels());
    std::vector<uint8_t> output_ref(output_pixels() * channels());
    std::vector<float> output_real(output_pixels() * channels());
    std::vector<int32_t> accumulator(output_pixels() * channels());
    // 64-byte-aligned scratch space for the kernel's intermediate accumulators.
    std::vector<int32_t, AlignedAllocator<int32_t, 64>> buffer(XNN_EXTRA_BYTES / sizeof(uint8_t) + channels());
    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      // Regenerate until the input is not constant, so clamping is meaningful.
      do {
        std::generate(input.begin(), input.end(), [&]() { return u8dist(rng); });
      } while (input.size() > 1 && *std::max_element(input.cbegin(), input.cend()) == *std::min_element(input.cbegin(), input.cend()));
      // Poison the offset prefix, padding suffix, and output with a canary byte
      // so out-of-bounds reads/writes show up as mismatches.
      std::fill(input.begin(), input.begin() + input_offset(), UINT8_C(0xA5));
      std::fill(input.end() - XNN_EXTRA_BYTES / sizeof(uint8_t), input.end(), UINT8_C(0xA5));
      std::fill(output.begin(), output.end(), UINT8_C(0xA5));

      for (size_t i = 0; i < (output_pixels() - 1) * step() + pooling_elements(); i++) {
        indirect_input[i] = input.data() + i * channels();
      }
      std::shuffle(indirect_input.begin(),
        indirect_input.begin() + (output_pixels() - 1) * step() + pooling_elements(), rng);
      // Optionally plant the zero row at one slot to exercise zero-page handling.
      if (zero_index() != SIZE_MAX) {
        indirect_input[zero_index()] = zero.data();
      }

      // Prepare parameters.
      xnn_qu8_avgpool_minmax_params params;
      init_params(
        &params,
        -int32_t(input_zero_point()) * int32_t(pooling_elements()),
        input_scale() / (output_scale() * float(pooling_elements())),
        output_zero_point(), qmin(), qmax());

      // Compute reference results.
      for (size_t x = 0; x < output_pixels(); x++) {
        for (size_t c = 0; c < channels(); c++) {
          int32_t acc = 0;
          for (size_t p = 0; p < pooling_elements(); p++) {
            const uint8_t* row = indirect_input[x * step() + p];
            if (row != zero.data()) {
              acc += int32_t(row[c + input_offset()]);
            }
            // The input zero point is subtracted once per pooling element —
            // including zero rows — matching the kernel's -izp*pool init bias.
            acc -= int32_t(input_zero_point());
          }
          accumulator[x * channels() + c] = acc;
          output_ref[x * channels() + c] = xnn_qu8_requantize_rndna(
            acc, input_scale() / (output_scale() * float(pooling_elements())), output_zero_point(), qmin(), qmax());
          const float scaled_acc =
            float(acc) * input_scale() / (output_scale() * float(pooling_elements())) + float(output_zero_point());
          output_real[x * channels() + c] = std::min(std::max(scaled_acc, float(qmin())), float(qmax()));
        }
      }

      // Call optimized micro-kernel.
      // The pointer increment deducts the rows already consumed by prior passes.
      avgpool_minmax(output_pixels(), pooling_elements(), channels(),
        indirect_input.data(), input_offset() * sizeof(uint8_t), zero.data(),
        buffer.data(), output.data(),
        (step() - (packed_pooling_elements() - incremental_pooling_tile())) * sizeof(void*),
        (output_stride() - channels()) * sizeof(uint8_t),
        &params);

      // Verify results.
      for (size_t x = 0; x < output_pixels(); x++) {
        for (size_t c = 0; c < channels(); c++) {
          ASSERT_GE(uint32_t(output[x * output_stride() + c]), uint32_t(qmin()))
            << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
            << ", pooling elements = " << pooling_elements() << ", step = " << step()
            << ", input offset = " << input_offset();
          ASSERT_LE(uint32_t(output[x * output_stride() + c]), uint32_t(qmax()))
            << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
            << ", pooling elements = " << pooling_elements() << ", step = " << step()
            << ", input offset = " << input_offset();
          // Quantized output must land within 0.5 of the real-valued average...
          ASSERT_NEAR(float(int32_t(output[x * output_stride() + c])), output_real[x * channels() + c], 0.5f)
            << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
            << ", pooling elements = " << pooling_elements() << ", step = " << step()
            << ", input offset = " << input_offset() << ", accumulator = " << accumulator[x * channels() + c];
          // ...and match the rndna-requantized reference bit-exactly.
          ASSERT_EQ(uint32_t(output_ref[x * channels() + c]), uint32_t(output[x * output_stride() + c]))
            << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
            << ", pooling elements = " << pooling_elements() << ", step = " << step()
            << ", input offset = " << input_offset() << ", accumulator = " << accumulator[x * channels() + c];
        }
      }
    }
  }
749 
  // Tests an F16 pixelwise average-pooling (pavgpool) microkernel whose pooling
  // window fits in a single pass. Inputs, per-pixel multipliers, and outputs
  // are IEEE half-precision values stored as uint16_t bit patterns.
  void Test(xnn_f16_pavgpool_minmax_unipass_ukernel_function pavgpool_minmax, xnn_init_f16_minmax_params_fn init_params) const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    std::uniform_real_distribution<float> f32dist;
    // Per-pixel multipliers in [0.1, 0.5) — presumably standing in for
    // 1/window-size scaling; exact value is irrelevant to the check.
    std::uniform_real_distribution<float> m32dist(0.1f, 0.5f);

    // One input-row pointer per (output pixel, pooling element).
    std::vector<const uint16_t*> indirect_input((output_pixels() - 1) * step() + packed_pooling_elements());
    std::vector<uint16_t> input(XNN_EXTRA_BYTES / sizeof(uint16_t) +
      input_offset() + indirect_input.size() * channels());
    std::vector<uint16_t> zero(channels() + XNN_EXTRA_BYTES / sizeof(uint16_t));
    std::vector<uint16_t> multiplier(output_pixels());
    std::vector<uint16_t> output((output_pixels() - 1) * output_stride() + channels());
    std::vector<float> output_ref(output_pixels() * channels());
    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      std::generate(input.begin(), input.end(), [&]() { return fp16_ieee_from_fp32_value(f32dist(rng)); });
      // Poison the offset prefix, padding suffix, and output with half NaN so
      // any stray read/write is detected by the assertions below.
      std::fill(input.begin(), input.begin() + input_offset(), UINT16_C(0x7E00) /* NaN */);
      std::fill(input.end() - XNN_EXTRA_BYTES / sizeof(uint16_t), input.end(), UINT16_C(0x7E00) /* NaN */);
      std::generate(multiplier.begin(), multiplier.end(), [&]() { return fp16_ieee_from_fp32_value(m32dist(rng)); });
      std::fill(output.begin(), output.end(), UINT16_C(0x7E00) /* NaN */);

      for (size_t i = 0; i < (output_pixels() - 1) * step() + pooling_elements(); i++) {
        indirect_input[i] = input.data() + i * channels();
      }
      std::shuffle(indirect_input.begin(),
        indirect_input.begin() + (output_pixels() - 1) * step() + pooling_elements(), rng);
      // Optionally plant the zero row to exercise the kernel's zero-page handling.
      if (zero_index() != SIZE_MAX) {
        indirect_input[zero_index()] = zero.data();
      }

      // Compute reference results, without clamping.
      for (size_t x = 0; x < output_pixels(); x++) {
        for (size_t c = 0; c < channels(); c++) {
          float acc = 0.0f;
          for (size_t p = 0; p < pooling_elements(); p++) {
            const uint16_t* row = indirect_input[x * step() + p];
            if (row != zero.data()) {
              acc += fp16_ieee_to_fp32_value(row[c + input_offset()]);
            }
          }
          output_ref[x * channels() + c] = acc * fp16_ieee_to_fp32_value(multiplier[x]);
        }
      }

      // Compute clamping parameters.
      // Bounds are placed inside the observed output range so qmin/qmax
      // actually clip some values when they are not 0/255.
      const float accumulated_min = *std::min_element(output_ref.cbegin(), output_ref.cend());
      const float accumulated_max = *std::max_element(output_ref.cbegin(), output_ref.cend());
      const float accumulated_range = accumulated_max - accumulated_min;
      float output_min_as_float = accumulated_min + float(qmin()) / 255.0f * accumulated_range;
      float output_max_as_float = accumulated_max - float(255 - qmax()) / 255.0f * accumulated_range;
      // Round-trip the bounds through half precision so the reference clamps
      // to exactly the values the kernel receives in its params.
      const uint16_t output_min_as_half = fp16_ieee_from_fp32_value(output_min_as_float);
      const uint16_t output_max_as_half = fp16_ieee_from_fp32_value(output_max_as_float);
      output_min_as_float = fp16_ieee_to_fp32_value(output_min_as_half);
      output_max_as_float = fp16_ieee_to_fp32_value(output_max_as_half);

      // Clamp reference results.
      for (float& output_value : output_ref) {
        output_value = std::max(std::min(output_value, output_max_as_float), output_min_as_float);
      }

      // Prepare parameters.
      xnn_f16_minmax_params params;
      init_params(&params, output_min_as_half, output_max_as_half);

      // Call optimized micro-kernel.
      pavgpool_minmax(output_pixels(), pooling_elements(), channels(),
        reinterpret_cast<const void**>(indirect_input.data()), input_offset() * sizeof(uint16_t), zero.data(),
        multiplier.data(), output.data(),
        step() * sizeof(void*),
        (output_stride() - channels()) * sizeof(uint16_t),
        &params);

      // Verify results.
      // Tolerance is relative (3e-3) with a small absolute floor, allowing for
      // half-precision rounding inside the kernel.
      for (size_t x = 0; x < output_pixels(); x++) {
        for (size_t c = 0; c < channels(); c++) {
          ASSERT_GE(fp16_ieee_to_fp32_value(output[x * output_stride() + c]), output_min_as_float)
            << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
            << ", pooling elements = " << pooling_elements() << ", step = " << step()
            << ", input offset = " << input_offset();
          ASSERT_LE(fp16_ieee_to_fp32_value(output[x * output_stride() + c]), output_max_as_float)
            << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
            << ", pooling elements = " << pooling_elements() << ", step = " << step()
            << ", input offset = " << input_offset();
          ASSERT_NEAR(
              fp16_ieee_to_fp32_value(output[x * output_stride() + c]),
              output_ref[x * channels() + c],
              std::max(1.0e-4f, std::abs(output_ref[x * channels() + c]) * 3.0e-3f))
            << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
            << ", pooling elements = " << pooling_elements() << ", step = " << step()
            << ", input offset = " << input_offset();
        }
      }
    }
  }
843 
  // Tests an F16 pixelwise average-pooling microkernel that processes the
  // pooling window in multiple passes, spilling partial sums to an aligned
  // scratch buffer between passes.
  void Test(xnn_f16_pavgpool_minmax_multipass_ukernel_function pavgpool_minmax, xnn_init_f16_minmax_params_fn init_params) const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    std::uniform_real_distribution<float> f32dist;
    // Per-pixel multipliers in [0.1, 0.5).
    std::uniform_real_distribution<float> m32dist(0.1f, 0.5f);

    // One input-row pointer per (output pixel, pooling element).
    std::vector<const uint16_t*> indirect_input((output_pixels() - 1) * step() + packed_pooling_elements());
    std::vector<uint16_t> input(XNN_EXTRA_BYTES / sizeof(uint16_t) +
      input_offset() + indirect_input.size() * channels());
    std::vector<uint16_t> zero(channels() + XNN_EXTRA_BYTES / sizeof(uint16_t));
    std::vector<uint16_t> multiplier(output_pixels());
    std::vector<uint16_t> output((output_pixels() - 1) * output_stride() + channels());
    std::vector<float> output_ref(output_pixels() * channels());
    // 64-byte-aligned scratch space for the kernel's partial sums.
    std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> buffer(XNN_EXTRA_BYTES / sizeof(uint16_t) + channels());
    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      std::generate(input.begin(), input.end(), [&]() { return fp16_ieee_from_fp32_value(f32dist(rng)); });
      // Poison padding and output with half NaN to surface out-of-bounds accesses.
      std::fill(input.begin(), input.begin() + input_offset(), UINT16_C(0x7E00) /* NaN */);
      std::fill(input.end() - XNN_EXTRA_BYTES / sizeof(uint16_t), input.end(), UINT16_C(0x7E00) /* NaN */);
      std::generate(multiplier.begin(), multiplier.end(), [&]() { return fp16_ieee_from_fp32_value(m32dist(rng)); });
      std::fill(output.begin(), output.end(), UINT16_C(0x7E00) /* NaN */);

      for (size_t i = 0; i < (output_pixels() - 1) * step() + pooling_elements(); i++) {
        indirect_input[i] = input.data() + i * channels();
      }
      std::shuffle(indirect_input.begin(),
        indirect_input.begin() + (output_pixels() - 1) * step() + pooling_elements(), rng);
      // Optionally plant the zero row to exercise zero-page handling.
      if (zero_index() != SIZE_MAX) {
        indirect_input[zero_index()] = zero.data();
      }

      // Compute reference results, without clamping.
      for (size_t x = 0; x < output_pixels(); x++) {
        for (size_t c = 0; c < channels(); c++) {
          float acc = 0.0f;
          for (size_t p = 0; p < pooling_elements(); p++) {
            const uint16_t* row = indirect_input[x * step() + p];
            if (row != zero.data()) {
              acc += fp16_ieee_to_fp32_value(row[c + input_offset()]);
            }
          }
          output_ref[x * channels() + c] = acc * fp16_ieee_to_fp32_value(multiplier[x]);
        }
      }

      // Compute clamping parameters.
      // Bounds sit inside the observed output range so clamping is exercised.
      const float accumulated_min = *std::min_element(output_ref.cbegin(), output_ref.cend());
      const float accumulated_max = *std::max_element(output_ref.cbegin(), output_ref.cend());
      const float accumulated_range = accumulated_max - accumulated_min;
      float output_min_as_float = accumulated_min + float(qmin()) / 255.0f * accumulated_range;
      float output_max_as_float = accumulated_max - float(255 - qmax()) / 255.0f * accumulated_range;
      // Round-trip through half precision so the reference clamps to the exact
      // values the kernel receives.
      const uint16_t output_min_as_half = fp16_ieee_from_fp32_value(output_min_as_float);
      const uint16_t output_max_as_half = fp16_ieee_from_fp32_value(output_max_as_float);
      output_min_as_float = fp16_ieee_to_fp32_value(output_min_as_half);
      output_max_as_float = fp16_ieee_to_fp32_value(output_max_as_half);

      // Clamp reference results.
      for (float& output_value : output_ref) {
        output_value = std::max(std::min(output_value, output_max_as_float), output_min_as_float);
      }

      // Prepare parameters.
      xnn_f16_minmax_params params;
      init_params(&params, output_min_as_half, output_max_as_half);

      // Call optimized micro-kernel.
      // The pointer increment deducts the rows already consumed by prior passes.
      pavgpool_minmax(output_pixels(), pooling_elements(), channels(),
        reinterpret_cast<const void**>(indirect_input.data()), input_offset() * sizeof(uint16_t), zero.data(),
        multiplier.data(), buffer.data(), output.data(),
        (step() - (packed_pooling_elements() - incremental_pooling_tile())) * sizeof(void*),
        (output_stride() - channels()) * sizeof(uint16_t),
        &params);

      // Verify results.
      // Relative tolerance 3e-3 with a small absolute floor, allowing for
      // half-precision rounding in the kernel's accumulation.
      for (size_t x = 0; x < output_pixels(); x++) {
        for (size_t c = 0; c < channels(); c++) {
          ASSERT_GE(fp16_ieee_to_fp32_value(output[x * output_stride() + c]), output_min_as_float)
            << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
            << ", pooling elements = " << pooling_elements() << ", step = " << step()
            << ", input offset = " << input_offset();
          ASSERT_LE(fp16_ieee_to_fp32_value(output[x * output_stride() + c]), output_max_as_float)
            << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
            << ", pooling elements = " << pooling_elements() << ", step = " << step()
            << ", input offset = " << input_offset();
          ASSERT_NEAR(
              fp16_ieee_to_fp32_value(output[x * output_stride() + c]),
              output_ref[x * channels() + c],
              std::max(1.0e-4f, std::abs(output_ref[x * channels() + c]) * 3.0e-3f))
            << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
            << ", pooling elements = " << pooling_elements() << ", step = " << step()
            << ", input offset = " << input_offset();
        }
      }
    }
  }
938 
Test(xnn_f32_pavgpool_minmax_unipass_ukernel_function pavgpool_minmax,xnn_init_f32_minmax_params_fn init_params)939   void Test(xnn_f32_pavgpool_minmax_unipass_ukernel_function pavgpool_minmax, xnn_init_f32_minmax_params_fn init_params) const {
940     std::random_device random_device;
941     auto rng = std::mt19937(random_device());
942     std::uniform_real_distribution<float> f32dist;
943     std::uniform_real_distribution<float> m32dist(0.1f, 0.5f);
944 
945     std::vector<const float*> indirect_input((output_pixels() - 1) * step() + packed_pooling_elements());
946     std::vector<float> input(XNN_EXTRA_BYTES / sizeof(float) +
947       input_offset() + indirect_input.size() * channels());
948     std::vector<float> zero(channels() + XNN_EXTRA_BYTES / sizeof(float));
949     std::vector<float> multiplier(output_pixels());
950     std::vector<float> output((output_pixels() - 1) * output_stride() + channels());
951     std::vector<float> output_ref(output_pixels() * channels());
952     for (size_t iteration = 0; iteration < iterations(); iteration++) {
953       std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); });
954       std::fill(input.begin(), input.begin() + input_offset(), std::nanf(""));
955       std::fill(input.end() - XNN_EXTRA_BYTES / sizeof(float), input.end(), std::nanf(""));
956       std::generate(multiplier.begin(), multiplier.end(), [&]() { return m32dist(rng); });
957       std::fill(output.begin(), output.end(), std::nanf(""));
958 
959       for (size_t i = 0; i < (output_pixels() - 1) * step() + pooling_elements(); i++) {
960         indirect_input[i] = input.data() + i * channels();
961       }
962       std::shuffle(indirect_input.begin(),
963         indirect_input.begin() + (output_pixels() - 1) * step() + pooling_elements(), rng);
964       if (zero_index() != SIZE_MAX) {
965         indirect_input[zero_index()] = zero.data();
966       }
967 
968       // Compute reference results, without clamping.
969       for (size_t x = 0; x < output_pixels(); x++) {
970         for (size_t c = 0; c < channels(); c++) {
971           float acc = 0.0f;
972           for (size_t p = 0; p < pooling_elements(); p++) {
973             const float* row = indirect_input[x * step() + p];
974             if (row != zero.data()) {
975               acc += row[c + input_offset()];
976             }
977           }
978           output_ref[x * channels() + c] = acc * multiplier[x];
979         }
980       }
981 
982       // Compute clamping parameters.
983       const float accumulated_min = *std::min_element(output_ref.cbegin(), output_ref.cend());
984       const float accumulated_max = *std::max_element(output_ref.cbegin(), output_ref.cend());
985       const float accumulated_range = accumulated_max - accumulated_min;
986       const float output_min = accumulated_min + float(qmin()) / 255.0f * accumulated_range;
987       const float output_max = accumulated_max - float(255 - qmax()) / 255.0f * accumulated_range;
988 
989       // Clamp reference results.
990       for (float& output_value : output_ref) {
991         output_value = std::max(std::min(output_value, output_max), output_min);
992       }
993 
994       // Prepare parameters.
995       xnn_f32_minmax_params params;
996       init_params(&params, output_min, output_max);
997 
998       // Call optimized micro-kernel.
999       pavgpool_minmax(output_pixels(), pooling_elements(), channels(),
1000         indirect_input.data(), input_offset() * sizeof(float), zero.data(),
1001         multiplier.data(), output.data(),
1002         step() * sizeof(void*),
1003         (output_stride() - channels()) * sizeof(float),
1004         &params);
1005 
1006       // Verify results.
1007       for (size_t x = 0; x < output_pixels(); x++) {
1008         for (size_t c = 0; c < channels(); c++) {
1009           ASSERT_GE(output[x * output_stride() + c], output_min)
1010             << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
1011             << ", pooling elements = " << pooling_elements() << ", step = " << step()
1012             << ", input offset = " << input_offset();
1013           ASSERT_LE(output[x * output_stride() + c], output_max)
1014             << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
1015             << ", pooling elements = " << pooling_elements() << ", step = " << step()
1016             << ", input offset = " << input_offset();
1017           ASSERT_NEAR(
1018               output[x * output_stride() + c],
1019               output_ref[x * channels() + c],
1020               std::abs(output_ref[x * channels() + c]) * 1.0e-6f)
1021             << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
1022             << ", pooling elements = " << pooling_elements() << ", step = " << step()
1023             << ", input offset = " << input_offset();
1024         }
1025       }
1026     }
1027   }
1028 
Test(xnn_f32_pavgpool_minmax_multipass_ukernel_function pavgpool_minmax,xnn_init_f32_minmax_params_fn init_params)1029   void Test(xnn_f32_pavgpool_minmax_multipass_ukernel_function pavgpool_minmax, xnn_init_f32_minmax_params_fn init_params) const {
1030     std::random_device random_device;
1031     auto rng = std::mt19937(random_device());
1032     std::uniform_real_distribution<float> f32dist;
1033     std::uniform_real_distribution<float> m32dist(0.1f, 0.5f);
1034 
1035     std::vector<const float*> indirect_input((output_pixels() - 1) * step() + packed_pooling_elements());
1036     std::vector<float> input(XNN_EXTRA_BYTES / sizeof(float) +
1037       input_offset() + indirect_input.size() * channels());
1038     std::vector<float> zero(channels() + XNN_EXTRA_BYTES / sizeof(float));
1039     std::vector<float> multiplier(output_pixels());
1040     std::vector<float> output((output_pixels() - 1) * output_stride() + channels());
1041     std::vector<float> output_ref(output_pixels() * channels());
1042     std::vector<float, AlignedAllocator<float, 64>> buffer(XNN_EXTRA_BYTES / sizeof(float) + channels());
1043     for (size_t iteration = 0; iteration < iterations(); iteration++) {
1044       std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); });
1045       std::fill(input.begin(), input.begin() + input_offset(), std::nanf(""));
1046       std::fill(input.end() - XNN_EXTRA_BYTES / sizeof(float), input.end(), std::nanf(""));
1047       std::generate(multiplier.begin(), multiplier.end(), [&]() { return m32dist(rng); });
1048       std::fill(output.begin(), output.end(), std::nanf(""));
1049 
1050       for (size_t i = 0; i < (output_pixels() - 1) * step() + pooling_elements(); i++) {
1051         indirect_input[i] = input.data() + i * channels();
1052       }
1053       std::shuffle(indirect_input.begin(),
1054         indirect_input.begin() + (output_pixels() - 1) * step() + pooling_elements(), rng);
1055       if (zero_index() != SIZE_MAX) {
1056         indirect_input[zero_index()] = zero.data();
1057       }
1058 
1059       // Compute reference results, without clamping.
1060       for (size_t x = 0; x < output_pixels(); x++) {
1061         for (size_t c = 0; c < channels(); c++) {
1062           float acc = 0.0f;
1063           for (size_t p = 0; p < pooling_elements(); p++) {
1064             const float* row = indirect_input[x * step() + p];
1065             if (row != zero.data()) {
1066               acc += row[c + input_offset()];
1067             }
1068           }
1069           output_ref[x * channels() + c] = acc * multiplier[x];
1070         }
1071       }
1072 
1073       // Compute clamping parameters.
1074       const float accumulated_min = *std::min_element(output_ref.cbegin(), output_ref.cend());
1075       const float accumulated_max = *std::max_element(output_ref.cbegin(), output_ref.cend());
1076       const float accumulated_range = accumulated_max - accumulated_min;
1077       const float output_min = accumulated_min + float(qmin()) / 255.0f * accumulated_range;
1078       const float output_max = accumulated_max - float(255 - qmax()) / 255.0f * accumulated_range;
1079 
1080       // Clamp reference results.
1081       for (float& output_value : output_ref) {
1082         output_value = std::max(std::min(output_value, output_max), output_min);
1083       }
1084 
1085       // Prepare parameters.
1086       xnn_f32_minmax_params params;
1087       init_params(&params, output_min, output_max);
1088 
1089       // Call optimized micro-kernel.
1090       pavgpool_minmax(output_pixels(), pooling_elements(), channels(),
1091         indirect_input.data(), input_offset() * sizeof(float), zero.data(),
1092         multiplier.data(), buffer.data(), output.data(),
1093         (step() - (packed_pooling_elements() - incremental_pooling_tile())) * sizeof(void*),
1094         (output_stride() - channels()) * sizeof(float),
1095         &params);
1096 
1097       // Verify results.
1098       for (size_t x = 0; x < output_pixels(); x++) {
1099         for (size_t c = 0; c < channels(); c++) {
1100           ASSERT_GE(output[x * output_stride() + c], output_min)
1101             << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
1102             << ", pooling elements = " << pooling_elements() << ", step = " << step()
1103             << ", input offset = " << input_offset();
1104           ASSERT_LE(output[x * output_stride() + c], output_max)
1105             << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
1106             << ", pooling elements = " << pooling_elements() << ", step = " << step()
1107             << ", input offset = " << input_offset();
1108           ASSERT_NEAR(
1109               output[x * output_stride() + c],
1110               output_ref[x * channels() + c],
1111               std::abs(output_ref[x * channels() + c]) * 1.0e-6f)
1112             << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
1113             << ", pooling elements = " << pooling_elements() << ", step = " << step()
1114             << ", input offset = " << input_offset();
1115         }
1116       }
1117     }
1118   }
1119 
 private:
  // Number of output pixels processed per microkernel invocation.
  size_t output_pixels_{1};
  // Pooling window size: input rows combined per output pixel.
  size_t pooling_elements_{1};
  // Channels per pixel (innermost dimension of input/output rows).
  size_t channels_{1};
  // Element offset added to every indirection-buffer row pointer when reading.
  size_t input_offset_{0};
  // Indirection-buffer slot replaced by the zero row; SIZE_MAX disables it.
  size_t zero_index_{SIZE_MAX};
  // Indirection-buffer pointers advanced per output pixel (rows may overlap).
  size_t step_{1};
  // Tile sizes for multipass kernels: presumably the first pass handles
  // primary_pooling_tile_ rows and each later pass incremental_pooling_tile_
  // rows — accessors (e.g. packed_pooling_elements()) are outside this chunk.
  size_t primary_pooling_tile_{1};
  size_t incremental_pooling_tile_{1};
  // Output row stride in elements; 0 presumably means "default to channels()"
  // — the output_stride() accessor is not visible here, so confirm before relying on it.
  size_t output_stride_{0};
  // Quantization parameters for the QU8 tests (unused by F16/F32 paths).
  float input_scale_{1.25f};
  float output_scale_{0.75f};
  uint8_t input_zero_point_{121};
  uint8_t output_zero_point_{133};
  // Clamping range; for float kernels these select bounds within the observed
  // output range rather than literal quantized limits.
  uint8_t qmin_{0};
  uint8_t qmax_{255};
  // Number of randomized repetitions per Test() call.
  size_t iterations_{3};
1137 };
1138