xref: /aosp_15_r20/external/XNNPACK/test/average-pooling-operator-tester.h (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1 // Copyright (c) Facebook, Inc. and its affiliates.
2 // All rights reserved.
3 //
4 // Copyright 2019 Google LLC
5 //
6 // This source code is licensed under the BSD-style license found in the
7 // LICENSE file in the root directory of this source tree.
8 
9 #pragma once
10 
11 #include <gtest/gtest.h>
12 
13 #include <fp16.h>
14 
15 #include <algorithm>
16 #include <cmath>
17 #include <cassert>
18 #include <cstddef>
19 #include <cstdlib>
20 #include <limits>
21 #include <random>
22 #include <vector>
23 
24 #include <xnnpack.h>
25 
26 
27 class AveragePoolingOperatorTester {
28  public:
padding_tf_same(bool padding_same)29   inline AveragePoolingOperatorTester& padding_tf_same(bool padding_same) {
30     if (padding_same) {
31       assert(padding_top() == 0);
32       assert(padding_left() == 0);
33       assert(padding_bottom() == 0);
34       assert(padding_right() == 0);
35     }
36     this->padding_tf_same_ = padding_same;
37     return *this;
38   }
39 
padding_tf_same()40   inline bool padding_tf_same() const {
41     return this->padding_tf_same_;
42   }
43 
padding(uint32_t padding)44   inline AveragePoolingOperatorTester& padding(uint32_t padding) {
45     assert(!padding_tf_same());
46     this->padding_top_ = padding;
47     this->padding_right_ = padding;
48     this->padding_bottom_ = padding;
49     this->padding_left_ = padding;
50     return *this;
51   }
52 
padding(uint32_t padding_height,uint32_t padding_width)53   inline AveragePoolingOperatorTester& padding(uint32_t padding_height, uint32_t padding_width) {
54     assert(!padding_tf_same());
55     this->padding_top_ = padding_height;
56     this->padding_right_ = padding_width;
57     this->padding_bottom_ = padding_height;
58     this->padding_left_ = padding_width;
59     return *this;
60   }
61 
padding_height(uint32_t padding_height)62   inline AveragePoolingOperatorTester& padding_height(uint32_t padding_height) {
63     assert(!padding_tf_same());
64     this->padding_top_ = padding_height;
65     this->padding_bottom_ = padding_height;
66     return *this;
67   }
68 
padding_width(uint32_t padding_width)69   inline AveragePoolingOperatorTester& padding_width(uint32_t padding_width) {
70     assert(!padding_tf_same());
71     this->padding_right_ = padding_width;
72     this->padding_left_ = padding_width;
73     return *this;
74   }
75 
padding_top(uint32_t padding_top)76   inline AveragePoolingOperatorTester& padding_top(uint32_t padding_top) {
77     assert(!padding_tf_same());
78     this->padding_top_ = padding_top;
79     return *this;
80   }
81 
padding_top()82   inline uint32_t padding_top() const {
83     if (padding_tf_same()) {
84       const uint32_t total_padding_height =
85         (output_height() - 1) * stride_height() + pooling_height() - input_height();
86       return total_padding_height / 2;
87     } else {
88       return this->padding_top_;
89     }
90   }
91 
padding_left(uint32_t padding_left)92   inline AveragePoolingOperatorTester& padding_left(uint32_t padding_left) {
93     assert(!padding_tf_same());
94     this->padding_left_ = padding_left;
95     return *this;
96   }
97 
padding_left()98   inline uint32_t padding_left() const {
99     if (padding_tf_same()) {
100       const uint32_t total_padding_width =
101         (output_width() - 1) * stride_width() + pooling_width() - input_width();
102       return total_padding_width / 2;
103     } else {
104       return this->padding_left_;
105     }
106   }
107 
padding_bottom(uint32_t padding_bottom)108   inline AveragePoolingOperatorTester& padding_bottom(uint32_t padding_bottom) {
109     assert(!padding_tf_same());
110     this->padding_bottom_ = padding_bottom;
111     return *this;
112   }
113 
padding_bottom()114   inline uint32_t padding_bottom() const {
115     if (padding_tf_same()) {
116       const uint32_t total_padding_height =
117         (output_height() - 1) * stride_height() + pooling_height() - input_height();
118       return total_padding_height - total_padding_height / 2;
119     } else {
120       return this->padding_bottom_;
121     }
122   }
123 
padding_right(uint32_t padding_right)124   inline AveragePoolingOperatorTester& padding_right(uint32_t padding_right) {
125     assert(!padding_tf_same());
126     this->padding_right_ = padding_right;
127     return *this;
128   }
129 
padding_right()130   inline uint32_t padding_right() const {
131     if (padding_tf_same()) {
132       const uint32_t total_padding_width =
133         (output_width() - 1) * stride_width() + pooling_width() - input_width();
134       return total_padding_width - total_padding_width / 2;
135     } else {
136       return this->padding_right_;
137     }
138   }
139 
input_size(size_t input_height,size_t input_width)140   inline AveragePoolingOperatorTester& input_size(size_t input_height, size_t input_width) {
141     assert(input_height >= 1);
142     assert(input_width >= 1);
143     this->input_height_ = input_height;
144     this->input_width_ = input_width;
145     return *this;
146   }
147 
input_height(size_t input_height)148   inline AveragePoolingOperatorTester& input_height(size_t input_height) {
149     assert(input_height >= 1);
150     this->input_height_ = input_height;
151     return *this;
152   }
153 
input_height()154   inline size_t input_height() const {
155     return this->input_height_;
156   }
157 
input_width(size_t input_width)158   inline AveragePoolingOperatorTester& input_width(size_t input_width) {
159     assert(input_width >= 1);
160     this->input_width_ = input_width;
161     return *this;
162   }
163 
input_width()164   inline size_t input_width() const {
165     return this->input_width_;
166   }
167 
channels(size_t channels)168   inline AveragePoolingOperatorTester& channels(size_t channels) {
169     assert(channels != 0);
170     this->channels_ = channels;
171     return *this;
172   }
173 
channels()174   inline size_t channels() const {
175     return this->channels_;
176   }
177 
batch_size(size_t batch_size)178   inline AveragePoolingOperatorTester& batch_size(size_t batch_size) {
179     assert(batch_size != 0);
180     this->batch_size_ = batch_size;
181     return *this;
182   }
183 
batch_size()184   inline size_t batch_size() const {
185     return this->batch_size_;
186   }
187 
pooling_size(uint32_t pooling_size)188   inline AveragePoolingOperatorTester& pooling_size(uint32_t pooling_size) {
189     assert(pooling_size >= 1);
190     this->pooling_height_ = pooling_size;
191     this->pooling_width_ = pooling_size;
192     return *this;
193   }
194 
pooling_size(uint32_t pooling_height,uint32_t pooling_width)195   inline AveragePoolingOperatorTester& pooling_size(uint32_t pooling_height, uint32_t pooling_width) {
196     assert(pooling_height >= 1);
197     assert(pooling_width >= 1);
198     this->pooling_height_ = pooling_height;
199     this->pooling_width_ = pooling_width;
200     return *this;
201   }
202 
pooling_height(uint32_t pooling_height)203   inline AveragePoolingOperatorTester& pooling_height(uint32_t pooling_height) {
204     assert(pooling_height >= 1);
205     this->pooling_height_ = pooling_height;
206     return *this;
207   }
208 
pooling_height()209   inline uint32_t pooling_height() const {
210     return this->pooling_height_;
211   }
212 
pooling_width(uint32_t pooling_width)213   inline AveragePoolingOperatorTester& pooling_width(uint32_t pooling_width) {
214     assert(pooling_width >= 1);
215     this->pooling_width_ = pooling_width;
216     return *this;
217   }
218 
pooling_width()219   inline uint32_t pooling_width() const {
220     return this->pooling_width_;
221   }
222 
stride(uint32_t stride)223   inline AveragePoolingOperatorTester& stride(uint32_t stride) {
224     assert(stride >= 1);
225     this->stride_height_ = stride;
226     this->stride_width_ = stride;
227     return *this;
228   }
229 
stride(uint32_t stride_height,uint32_t stride_width)230   inline AveragePoolingOperatorTester& stride(uint32_t stride_height, uint32_t stride_width) {
231     assert(stride_height >= 1);
232     assert(stride_width >= 1);
233     this->stride_height_ = stride_height;
234     this->stride_width_ = stride_width;
235     return *this;
236   }
237 
stride_height(uint32_t stride_height)238   inline AveragePoolingOperatorTester& stride_height(uint32_t stride_height) {
239     assert(stride_height >= 1);
240     this->stride_height_ = stride_height;
241     return *this;
242   }
243 
stride_height()244   inline uint32_t stride_height() const {
245     return this->stride_height_;
246   }
247 
stride_width(uint32_t stride_width)248   inline AveragePoolingOperatorTester& stride_width(uint32_t stride_width) {
249     assert(stride_width >= 1);
250     this->stride_width_ = stride_width;
251     return *this;
252   }
253 
stride_width()254   inline uint32_t stride_width() const {
255     return this->stride_width_;
256   }
257 
output_height()258   inline size_t output_height() const {
259     if (padding_tf_same()) {
260       return (input_height() + stride_height() - 1) / stride_height();
261     } else {
262       const size_t padded_input_height = padding_top() + input_height() + padding_bottom();
263       if (padded_input_height <= pooling_height()) {
264         return 1;
265       } else {
266         return (padded_input_height - pooling_height()) / stride_height() + 1;
267       }
268     }
269   }
270 
output_width()271   inline size_t output_width() const {
272     if (padding_tf_same()) {
273       return (input_width() + stride_width() - 1) / stride_width();
274     } else {
275       const size_t padded_input_width = padding_left() + input_width() + padding_right();
276       if (padded_input_width <= pooling_width()) {
277         return 1;
278       } else {
279         return (padded_input_width - pooling_width()) / stride_width() + 1;
280       }
281     }
282   }
283 
input_pixel_stride(size_t input_pixel_stride)284   inline AveragePoolingOperatorTester& input_pixel_stride(size_t input_pixel_stride) {
285     assert(input_pixel_stride != 0);
286     this->input_pixel_stride_ = input_pixel_stride;
287     return *this;
288   }
289 
input_pixel_stride()290   inline size_t input_pixel_stride() const {
291     if (this->input_pixel_stride_ == 0) {
292       return channels();
293     } else {
294       assert(this->input_pixel_stride_ >= channels());
295       return this->input_pixel_stride_;
296     }
297   }
298 
output_pixel_stride(size_t output_pixel_stride)299   inline AveragePoolingOperatorTester& output_pixel_stride(size_t output_pixel_stride) {
300     assert(output_pixel_stride != 0);
301     this->output_pixel_stride_ = output_pixel_stride;
302     return *this;
303   }
304 
output_pixel_stride()305   inline size_t output_pixel_stride() const {
306     if (this->output_pixel_stride_ == 0) {
307       return channels();
308     } else {
309       assert(this->output_pixel_stride_ >= channels());
310       return this->output_pixel_stride_;
311     }
312   }
313 
next_input_size(uint32_t next_input_height,uint32_t next_input_width)314   inline AveragePoolingOperatorTester& next_input_size(uint32_t next_input_height, uint32_t next_input_width) {
315     assert(next_input_height >= 1);
316     assert(next_input_width >= 1);
317     this->next_input_height_ = next_input_height;
318     this->next_input_width_ = next_input_width;
319     return *this;
320   }
321 
next_input_height(uint32_t next_input_height)322   inline AveragePoolingOperatorTester& next_input_height(uint32_t next_input_height) {
323     assert(next_input_height >= 1);
324     this->next_input_height_ = next_input_height;
325     return *this;
326   }
327 
next_input_height()328   inline uint32_t next_input_height() const {
329     if (this->next_input_height_ == 0) {
330       return input_height();
331     } else {
332       return this->next_input_height_;
333     }
334   }
335 
next_input_width(uint32_t next_input_width)336   inline AveragePoolingOperatorTester& next_input_width(uint32_t next_input_width) {
337     assert(next_input_width >= 1);
338     this->next_input_width_ = next_input_width;
339     return *this;
340   }
341 
next_input_width()342   inline uint32_t next_input_width() const {
343     if (this->next_input_width_ == 0) {
344       return input_width();
345     } else {
346       return this->next_input_width_;
347     }
348   }
349 
next_output_height()350   inline size_t next_output_height() const {
351     const size_t padded_next_input_height = padding_top() + next_input_height() + padding_bottom();
352     if (padded_next_input_height <= pooling_height()) {
353       return 1;
354     } else {
355       return (padded_next_input_height - pooling_height()) / stride_height() + 1;
356     }
357   }
358 
next_output_width()359   inline size_t next_output_width() const {
360     const size_t padded_next_input_width = padding_left() + next_input_width() + padding_right();
361     if (padded_next_input_width <= pooling_width()) {
362       return 1;
363     } else {
364       return (padded_next_input_width - pooling_width()) / stride_width() + 1;
365     }
366   }
367 
next_batch_size(size_t next_batch_size)368   inline AveragePoolingOperatorTester& next_batch_size(size_t next_batch_size) {
369     assert(next_batch_size >= 1);
370     this->next_batch_size_ = next_batch_size;
371     return *this;
372   }
373 
next_batch_size()374   inline size_t next_batch_size() const {
375     if (this->next_batch_size_ == 0) {
376       return batch_size();
377     } else {
378       return this->next_batch_size_;
379     }
380   }
381 
input_scale(float input_scale)382   inline AveragePoolingOperatorTester& input_scale(float input_scale) {
383     assert(input_scale > 0.0f);
384     assert(std::isnormal(input_scale));
385     this->input_scale_ = input_scale;
386     return *this;
387   }
388 
input_scale()389   inline float input_scale() const {
390     return this->input_scale_;
391   }
392 
input_zero_point(uint8_t input_zero_point)393   inline AveragePoolingOperatorTester& input_zero_point(uint8_t input_zero_point) {
394     this->input_zero_point_ = input_zero_point;
395     return *this;
396   }
397 
input_zero_point()398   inline uint8_t input_zero_point() const {
399     return this->input_zero_point_;
400   }
401 
output_scale(float output_scale)402   inline AveragePoolingOperatorTester& output_scale(float output_scale) {
403     assert(output_scale > 0.0f);
404     assert(std::isnormal(output_scale));
405     this->output_scale_ = output_scale;
406     return *this;
407   }
408 
output_scale()409   inline float output_scale() const {
410     return this->output_scale_;
411   }
412 
output_zero_point(uint8_t output_zero_point)413   inline AveragePoolingOperatorTester& output_zero_point(uint8_t output_zero_point) {
414     this->output_zero_point_ = output_zero_point;
415     return *this;
416   }
417 
output_zero_point()418   inline uint8_t output_zero_point() const {
419     return this->output_zero_point_;
420   }
421 
qmin(uint8_t qmin)422   inline AveragePoolingOperatorTester& qmin(uint8_t qmin) {
423     this->qmin_ = qmin;
424     return *this;
425   }
426 
qmin()427   inline uint8_t qmin() const {
428     return this->qmin_;
429   }
430 
qmax(uint8_t qmax)431   inline AveragePoolingOperatorTester& qmax(uint8_t qmax) {
432     this->qmax_ = qmax;
433     return *this;
434   }
435 
qmax()436   inline uint8_t qmax() const {
437     return this->qmax_;
438   }
439 
iterations(size_t iterations)440   inline AveragePoolingOperatorTester& iterations(size_t iterations) {
441     this->iterations_ = iterations;
442     return *this;
443   }
444 
iterations()445   inline size_t iterations() const {
446     return this->iterations_;
447   }
448 
TestF16()449   void TestF16() const {
450     std::random_device random_device;
451     auto rng = std::mt19937(random_device());
452     std::uniform_real_distribution<float> f32dist;
453 
454     std::vector<uint16_t> input((batch_size() * input_height() * input_width() - 1) * input_pixel_stride() + channels() + XNN_EXTRA_BYTES / sizeof(uint16_t));
455     std::vector<uint16_t> output((batch_size() * output_height() * output_width() - 1) * output_pixel_stride() + channels());
456     std::vector<float> output_ref(batch_size() * output_height() * output_width() * channels());
457     for (size_t iteration = 0; iteration < iterations(); iteration++) {
458       std::generate(input.begin(), input.end(), [&]() { return fp16_ieee_from_fp32_value(f32dist(rng)); });
459       std::fill(output.begin(), output.end(), UINT16_C(0x7E00) /* NaN */);
460 
461       // Compute reference results, without clamping.
462       for (size_t i = 0; i < batch_size(); i++) {
463         for (size_t oy = 0; oy < output_height(); oy++) {
464           for (size_t ox = 0; ox < output_width(); ox++) {
465             for (size_t c = 0; c < channels(); c++) {
466               float acc = 0.0f;
467               int32_t n = 0;
468               for (size_t py = 0; py < pooling_height(); py++) {
469                 const size_t iy = oy * stride_height() + py - padding_top();
470                 for (size_t px = 0; px < pooling_width(); px++) {
471                   const size_t ix = ox * stride_width() + px - padding_left();
472                   if (ix < input_width() && iy < input_height()) {
473                     acc += fp16_ieee_to_fp32_value(input[((i * input_height() + iy) * input_width() + ix) * input_pixel_stride() + c]);
474                     n += 1;
475                   }
476                 }
477               }
478               output_ref[((i * output_height() + oy) * output_width() + ox) * channels() + c] = acc / float(n);
479             }
480           }
481         }
482       }
483 
484       // Compute clamping parameters.
485       const float accumulated_min = *std::min_element(output_ref.cbegin(), output_ref.cend());
486       const float accumulated_max = *std::max_element(output_ref.cbegin(), output_ref.cend());
487       const float accumulated_range = accumulated_max - accumulated_min;
488       float output_min = accumulated_min + accumulated_range / 255.0f * float(qmin());
489       float output_max = accumulated_max - accumulated_range / 255.0f * float(255 - qmax());
490       output_min = fp16_ieee_to_fp32_value(fp16_ieee_from_fp32_value(output_min));
491       output_max = fp16_ieee_to_fp32_value(fp16_ieee_from_fp32_value(output_max));
492       if (accumulated_range == 0.0f) {
493         output_min = -std::numeric_limits<float>::infinity();
494         output_max = +std::numeric_limits<float>::infinity();
495       }
496       if (qmin() == std::numeric_limits<uint8_t>::min()) {
497         output_min = -std::numeric_limits<float>::infinity();
498       }
499       if (qmax() == std::numeric_limits<uint8_t>::max()) {
500         output_max = +std::numeric_limits<float>::infinity();
501       }
502 
503       // Clamp reference results.
504       for (float& value : output_ref) {
505         value = std::max(std::min(value, output_max), output_min);
506       }
507 
508       // Create, setup, run, and destroy Average Pooling operator.
509       ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
510       xnn_operator_t average_pooling_op = nullptr;
511 
512       const xnn_status status = xnn_create_average_pooling2d_nhwc_f16(
513           padding_top(), padding_right(), padding_bottom(), padding_left(),
514           pooling_height(), pooling_width(),
515           stride_height(), stride_width(),
516           channels(), input_pixel_stride(), output_pixel_stride(),
517           output_min, output_max,
518           0, &average_pooling_op);
519       if (status == xnn_status_unsupported_hardware) {
520         GTEST_SKIP();
521       }
522       ASSERT_EQ(xnn_status_success, status);
523       ASSERT_NE(nullptr, average_pooling_op);
524 
525       // Smart pointer to automatically delete average_pooling_op.
526       std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_average_pooling_op(average_pooling_op, xnn_delete_operator);
527 
528       ASSERT_EQ(xnn_status_success,
529         xnn_setup_average_pooling2d_nhwc_f16(
530           average_pooling_op,
531           batch_size(), input_height(), input_width(),
532           input.data(), output.data(),
533           nullptr /* thread pool */));
534 
535       ASSERT_EQ(xnn_status_success,
536         xnn_run_operator(average_pooling_op, nullptr /* thread pool */));
537 
538       // Verify results.
539       for (size_t i = 0; i < batch_size(); i++) {
540         for (size_t y = 0; y < output_height(); y++) {
541           for (size_t x = 0; x < output_width(); x++) {
542             for (size_t c = 0; c < channels(); c++) {
543               ASSERT_LE(fp16_ieee_to_fp32_value(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + c]), output_max);
544               ASSERT_GE(fp16_ieee_to_fp32_value(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + c]), output_min);
545               ASSERT_NEAR(
546                   fp16_ieee_to_fp32_value(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + c]),
547                   output_ref[((i * output_height() + y) * output_width() + x) * channels() + c],
548                   std::max(1.0e-3f, std::abs(output_ref[((i * output_height() + y) * output_width() + x) * channels() + c]) * 1.0e-2f)) <<
549                 "in batch index " << i << ", pixel (" << y << ", " << x << "), channel " << c;
550             }
551           }
552         }
553       }
554     }
555   }
556 
TestF32()557   void TestF32() const {
558     std::random_device random_device;
559     auto rng = std::mt19937(random_device());
560     std::uniform_real_distribution<float> f32dist;
561 
562     std::vector<float> input((batch_size() * input_height() * input_width() - 1) * input_pixel_stride() + channels() + XNN_EXTRA_BYTES / sizeof(float));
563     std::vector<float> output((batch_size() * output_height() * output_width() - 1) * output_pixel_stride() + channels());
564     std::vector<float> output_ref(batch_size() * output_height() * output_width() * channels());
565     for (size_t iteration = 0; iteration < iterations(); iteration++) {
566       std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); });
567       std::fill(output.begin(), output.end(), std::nanf(""));
568 
569       // Compute reference results, without clamping.
570       for (size_t i = 0; i < batch_size(); i++) {
571         for (size_t oy = 0; oy < output_height(); oy++) {
572           for (size_t ox = 0; ox < output_width(); ox++) {
573             for (size_t c = 0; c < channels(); c++) {
574               float acc = 0.0f;
575               int32_t n = 0;
576               for (size_t py = 0; py < pooling_height(); py++) {
577                 const size_t iy = oy * stride_height() + py - padding_top();
578                 for (size_t px = 0; px < pooling_width(); px++) {
579                   const size_t ix = ox * stride_width() + px - padding_left();
580                   if (ix < input_width() && iy < input_height()) {
581                     acc += input[((i * input_height() + iy) * input_width() + ix) * input_pixel_stride() + c];
582                     n += 1;
583                   }
584                 }
585               }
586               output_ref[((i * output_height() + oy) * output_width() + ox) * channels() + c] = acc / float(n);
587             }
588           }
589         }
590       }
591 
592       // Compute clamping parameters.
593       const float accumulated_min = *std::min_element(output_ref.cbegin(), output_ref.cend());
594       const float accumulated_max = *std::max_element(output_ref.cbegin(), output_ref.cend());
595       const float accumulated_range = accumulated_max - accumulated_min;
596       const float output_min = accumulated_range == 0.0f ?
597         -std::numeric_limits<float>::infinity() :
598         accumulated_min + accumulated_range / 255.0f * float(qmin());
599       const float output_max = accumulated_range == 0.0f ?
600         +std::numeric_limits<float>::infinity() :
601         accumulated_max - accumulated_range / 255.0f * float(255 - qmax());
602 
603       // Clamp reference results.
604       for (float& value : output_ref) {
605         value = std::max(std::min(value, output_max), output_min);
606       }
607 
608       // Create, setup, run, and destroy Average Pooling operator.
609       ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
610       xnn_operator_t average_pooling_op = nullptr;
611 
612       ASSERT_EQ(xnn_status_success,
613         xnn_create_average_pooling2d_nhwc_f32(
614           padding_top(), padding_right(), padding_bottom(), padding_left(),
615           pooling_height(), pooling_width(),
616           stride_height(), stride_width(),
617           channels(), input_pixel_stride(), output_pixel_stride(),
618           output_min, output_max,
619           0, &average_pooling_op));
620       ASSERT_NE(nullptr, average_pooling_op);
621 
622       // Smart pointer to automatically delete average_pooling_op.
623       std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_average_pooling_op(average_pooling_op, xnn_delete_operator);
624 
625       ASSERT_EQ(xnn_status_success,
626         xnn_setup_average_pooling2d_nhwc_f32(
627           average_pooling_op,
628           batch_size(), input_height(), input_width(),
629           input.data(), output.data(),
630           nullptr /* thread pool */));
631 
632       ASSERT_EQ(xnn_status_success,
633         xnn_run_operator(average_pooling_op, nullptr /* thread pool */));
634 
635       // Verify results.
636       for (size_t i = 0; i < batch_size(); i++) {
637         for (size_t y = 0; y < output_height(); y++) {
638           for (size_t x = 0; x < output_width(); x++) {
639             for (size_t c = 0; c < channels(); c++) {
640               ASSERT_LE(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + c], output_max);
641               ASSERT_GE(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + c], output_min);
642               ASSERT_NEAR(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + c],
643                   output_ref[((i * output_height() + y) * output_width() + x) * channels() + c],
644                   std::abs(output_ref[((i * output_height() + y) * output_width() + x) * channels() + c]) * 1.0e-6f) <<
645                 "in batch index " << i << ", pixel (" << y << ", " << x << "), channel " << c;
646             }
647           }
648         }
649       }
650     }
651   }
652 
TestQU8()653   void TestQU8() const {
654     std::random_device random_device;
655     auto rng = std::mt19937(random_device());
656     std::uniform_int_distribution<int32_t> u8dist(
657       std::numeric_limits<uint8_t>::min(), std::numeric_limits<uint8_t>::max());
658 
659     std::vector<uint8_t> input((batch_size() * input_height() * input_width() - 1) * input_pixel_stride() + channels() + XNN_EXTRA_BYTES / sizeof(uint8_t));
660     std::vector<uint8_t> output((batch_size() * output_height() * output_width() - 1) * output_pixel_stride() + channels());
661     std::vector<float> output_ref(batch_size() * output_height() * output_width() * channels());
662     for (size_t iteration = 0; iteration < iterations(); iteration++) {
663       std::generate(input.begin(), input.end(), [&]() { return u8dist(rng); });
664       std::fill(output.begin(), output.end(), UINT8_C(0xA5));
665 
666       // Compute reference results.
667       const double scale = double(input_scale()) / (double(output_scale()) * double(pooling_height() * pooling_width()));
668       for (size_t i = 0; i < batch_size(); i++) {
669         for (size_t oy = 0; oy < output_height(); oy++) {
670           for (size_t ox = 0; ox < output_width(); ox++) {
671             for (size_t c = 0; c < channels(); c++) {
672               double acc = 0.0f;
673               for (size_t py = 0; py < pooling_height(); py++) {
674                 const size_t iy = oy * stride_height() + py - padding_top();
675                 for (size_t px = 0; px < pooling_width(); px++) {
676                   const size_t ix = ox * stride_width() + px - padding_left();
677                   if (ix < input_width() && iy < input_height()) {
678                     acc += double(int32_t(input[((i * input_height() + iy) * input_width() + ix) * input_pixel_stride() + c]) - int32_t(input_zero_point()));
679                   }
680                 }
681               }
682               output_ref[((i * output_height() + oy) * output_width() + ox) * channels() + c] = float(acc * scale + double(output_zero_point()));
683               output_ref[((i * output_height() + oy) * output_width() + ox) * channels() + c] =
684                 std::min<float>(output_ref[((i * output_height() + oy) * output_width() + ox) * channels() + c], float(qmax()));
685               output_ref[((i * output_height() + oy) * output_width() + ox) * channels() + c] =
686                 std::max<float>(output_ref[((i * output_height() + oy) * output_width() + ox) * channels() + c], float(qmin()));
687             }
688           }
689         }
690       }
691 
692       // Create, setup, run, and destroy Average Pooling operator.
693       ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
694       xnn_operator_t average_pooling_op = nullptr;
695 
696       ASSERT_EQ(xnn_status_success,
697         xnn_create_average_pooling2d_nhwc_qu8(
698           padding_top(), padding_right(), padding_bottom(), padding_left(),
699           pooling_height(), pooling_width(),
700           stride_height(), stride_width(),
701           channels(), input_pixel_stride(), output_pixel_stride(),
702           input_zero_point(), input_scale(),
703           output_zero_point(), output_scale(),
704           qmin(), qmax(),
705           0, &average_pooling_op));
706       ASSERT_NE(nullptr, average_pooling_op);
707 
708       // Smart pointer to automatically delete average_pooling_op.
709       std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_average_pooling_op(average_pooling_op, xnn_delete_operator);
710 
711       ASSERT_EQ(xnn_status_success,
712         xnn_setup_average_pooling2d_nhwc_qu8(
713           average_pooling_op,
714           batch_size(), input_height(), input_width(),
715           input.data(), output.data(),
716           nullptr /* thread pool */));
717 
718       ASSERT_EQ(xnn_status_success,
719         xnn_run_operator(average_pooling_op, nullptr /* thread pool */));
720 
721       // Verify results.
722       for (size_t i = 0; i < batch_size(); i++) {
723         for (size_t y = 0; y < output_height(); y++) {
724           for (size_t x = 0; x < output_width(); x++) {
725             for (size_t c = 0; c < channels(); c++) {
726               ASSERT_LE(uint32_t(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + c]), uint32_t(qmax()));
727               ASSERT_GE(uint32_t(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + c]), uint32_t(qmin()));
728               ASSERT_NEAR(float(int32_t(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + c])),
729                 output_ref[((i * output_height() + y) * output_width() + x) * channels() + c], 0.80f) <<
730                 "in batch index " << i << ", pixel (" << y << ", " << x << "), channel " << c;
731             }
732           }
733         }
734       }
735     }
736   }
737 
  // Exercises the setup path of the f16 (half-precision) average-pooling
  // operator: the operator is created once, then set up and run twice with
  // different batch/input shapes, verifying that a second xnn_setup_* call on
  // a live operator produces correct results. Skips the test (GTEST_SKIP)
  // when f16 kernels are unsupported on this hardware.
  void TestSetupF16() const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    std::uniform_real_distribution<float> f32dist;

    // Buffers are sized for the larger of the first-run and second-run
    // ("next") shapes so the same allocation can back both setups.
    std::vector<uint16_t> input(XNN_EXTRA_BYTES / sizeof(uint16_t) + std::max<size_t>(
      (batch_size() * input_height() * input_width() - 1) * input_pixel_stride() + channels(),
      (next_batch_size() * next_input_height() * next_input_width() - 1) * input_pixel_stride() + channels()));
    std::vector<uint16_t> output(std::max<size_t>(
      (batch_size() * output_height() * output_width() - 1) * output_pixel_stride() + channels(),
      (next_batch_size() * next_output_height() * next_output_width() - 1) * output_pixel_stride() + channels()));
    std::vector<float> output_ref(batch_size() * output_height() * output_width() * channels());
    std::vector<float> next_output_ref(next_batch_size() * next_output_height() * next_output_width() * channels());
    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      std::generate(input.begin(), input.end(), [&]() { return fp16_ieee_from_fp32_value(f32dist(rng)); });
      // Pre-fill the output with NaN so stale data cannot masquerade as results.
      std::fill(output.begin(), output.end(), UINT16_C(0x7E00) /* NaN */);

      // Compute reference results, without clamping.
      for (size_t i = 0; i < batch_size(); i++) {
        for (size_t oy = 0; oy < output_height(); oy++) {
          for (size_t ox = 0; ox < output_width(); ox++) {
            for (size_t c = 0; c < channels(); c++) {
              float acc = 0.0f;
              size_t n = 0;
              for (size_t py = 0; py < pooling_height(); py++) {
                // size_t arithmetic wraps when padding pushes the index
                // "negative", so the single `iy < input_height()` check below
                // rejects both under- and overflowing indices.
                const size_t iy = oy * stride_height() + py - padding_top();
                for (size_t px = 0; px < pooling_width(); px++) {
                  const size_t ix = ox * stride_width() + px - padding_left();
                  if (ix < input_width() && iy < input_height()) {
                    acc += fp16_ieee_to_fp32_value(input[((i * input_height() + iy) * input_width() + ix) * input_pixel_stride() + c]);
                    n += 1;
                  }
                }
              }
              // Average over the valid (non-padding) elements only.
              output_ref[((i * output_height() + oy) * output_width() + ox) * channels() + c] = acc / float(n);
            }
          }
        }
      }

      // Compute clamping parameters: map qmin()/qmax() (0..255) onto the
      // observed range of unclamped reference outputs.
      const float accumulated_min = *std::min_element(output_ref.cbegin(), output_ref.cend());
      const float accumulated_max = *std::max_element(output_ref.cbegin(), output_ref.cend());
      const float accumulated_range = accumulated_max - accumulated_min;
      float output_min = accumulated_min + accumulated_range / 255.0f * float(qmin());
      float output_max = accumulated_max - accumulated_range / 255.0f * float(255 - qmax());
      // Round-trip the clamping bounds through fp16 so they match the values
      // the half-precision operator actually applies.
      output_min = fp16_ieee_to_fp32_value(fp16_ieee_from_fp32_value(output_min));
      output_max = fp16_ieee_to_fp32_value(fp16_ieee_from_fp32_value(output_max));
      if (accumulated_range == 0.0f) {
        // Degenerate case: all outputs identical; disable clamping entirely.
        output_min = -std::numeric_limits<float>::infinity();
        output_max = +std::numeric_limits<float>::infinity();
      }
      if (qmin() == std::numeric_limits<uint8_t>::min()) {
        output_min = -std::numeric_limits<float>::infinity();
      }
      if (qmax() == std::numeric_limits<uint8_t>::max()) {
        output_max = +std::numeric_limits<float>::infinity();
      }

      // Clamp reference results.
      for (float& value : output_ref) {
        value = std::max(std::min(value, output_max), output_min);
      }

      // Create, setup, and run Average Pooling operator once.
      ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
      xnn_operator_t average_pooling_op = nullptr;

      const xnn_status status = xnn_create_average_pooling2d_nhwc_f16(
          padding_top(), padding_right(), padding_bottom(), padding_left(),
          pooling_height(), pooling_width(),
          stride_height(), stride_width(),
          channels(), input_pixel_stride(), output_pixel_stride(),
          output_min, output_max,
          0, &average_pooling_op);
      if (status == xnn_status_unsupported_hardware) {
        // f16 kernels are not available on every target; skip rather than fail.
        GTEST_SKIP();
      }
      ASSERT_EQ(xnn_status_success, status);
      ASSERT_NE(nullptr, average_pooling_op);

      ASSERT_EQ(xnn_status_success,
        xnn_setup_average_pooling2d_nhwc_f16(
          average_pooling_op,
          batch_size(), input_height(), input_width(),
          input.data(), output.data(),
          nullptr /* thread pool */));

      ASSERT_EQ(xnn_status_success,
        xnn_run_operator(average_pooling_op, nullptr /* thread pool */));

      // Verify results of the first run.
      for (size_t i = 0; i < batch_size(); i++) {
        for (size_t y = 0; y < output_height(); y++) {
          for (size_t x = 0; x < output_width(); x++) {
            for (size_t c = 0; c < channels(); c++) {
              ASSERT_LE(fp16_ieee_to_fp32_value(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + c]), output_max);
              ASSERT_GE(fp16_ieee_to_fp32_value(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + c]), output_min);
              // Mixed absolute (1e-3) / relative (1%) tolerance for fp16 rounding.
              ASSERT_NEAR(
                  fp16_ieee_to_fp32_value(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + c]),
                  output_ref[((i * output_height() + y) * output_width() + x) * channels() + c],
                  std::max(1.0e-3f, std::abs(output_ref[((i * output_height() + y) * output_width() + x) * channels() + c]) * 1.0e-2f)) <<
                "in batch index " << i << ", pixel (" << y << ", " << x << "), channel " << c;
            }
          }
        }
      }

      // Re-generate data for the second run.
      std::generate(input.begin(), input.end(), [&]() { return fp16_ieee_from_fp32_value(f32dist(rng)); });
      std::fill(output.begin(), output.end(), UINT16_C(0x7E00) /* NaN */);

      // Compute reference results for the second run, clamped with the bounds
      // baked into the operator at creation time.
      for (size_t i = 0; i < next_batch_size(); i++) {
        for (size_t oy = 0; oy < next_output_height(); oy++) {
          for (size_t ox = 0; ox < next_output_width(); ox++) {
            for (size_t c = 0; c < channels(); c++) {
              float acc = 0.0f;
              int32_t n = 0;
              for (size_t py = 0; py < pooling_height(); py++) {
                const size_t iy = oy * stride_height() + py - padding_top();
                for (size_t px = 0; px < pooling_width(); px++) {
                  const size_t ix = ox * stride_width() + px - padding_left();
                  if (ix < next_input_width() && iy < next_input_height()) {
                    acc += fp16_ieee_to_fp32_value(input[((i * next_input_height() + iy) * next_input_width() + ix) * input_pixel_stride() + c]);
                    n += 1;
                  }
                }
              }
              next_output_ref[((i * next_output_height() + oy) * next_output_width() + ox) * channels() + c] =
                std::max(std::min(acc / float(n), output_max), output_min);
            }
          }
        }
      }

      // Setup and run Average Pooling operator the second time, and destroy the operator.
      ASSERT_EQ(xnn_status_success,
        xnn_setup_average_pooling2d_nhwc_f16(
          average_pooling_op,
          next_batch_size(), next_input_height(), next_input_width(),
          input.data(), output.data(),
          nullptr /* thread pool */));

      ASSERT_EQ(xnn_status_success,
        xnn_run_operator(average_pooling_op, nullptr /* thread pool */));

      ASSERT_EQ(xnn_status_success,
        xnn_delete_operator(average_pooling_op));
      average_pooling_op = nullptr;

      // Verify results of the second run.
      for (size_t i = 0; i < next_batch_size(); i++) {
        for (size_t y = 0; y < next_output_height(); y++) {
          for (size_t x = 0; x < next_output_width(); x++) {
            for (size_t c = 0; c < channels(); c++) {
              ASSERT_LE(fp16_ieee_to_fp32_value(output[((i * next_output_height() + y) * next_output_width() + x) * output_pixel_stride() + c]), output_max);
              ASSERT_GE(fp16_ieee_to_fp32_value(output[((i * next_output_height() + y) * next_output_width() + x) * output_pixel_stride() + c]), output_min);
              ASSERT_NEAR(
                  fp16_ieee_to_fp32_value(output[((i * next_output_height() + y) * next_output_width() + x) * output_pixel_stride() + c]),
                  next_output_ref[((i * next_output_height() + y) * next_output_width() + x) * channels() + c],
                  std::max(1.0e-3f, std::abs(next_output_ref[((i * next_output_height() + y) * next_output_width() + x) * channels() + c]) * 1.0e-2f)) <<
                "in batch index " << i << ", pixel (" << y << ", " << x << "), channel " << c;
            }
          }
        }
      }
    }
  }
907 
  // Exercises the setup path of the f32 average-pooling operator: the operator
  // is created once, then set up and run twice with different batch/input
  // shapes, verifying that a second xnn_setup_* call on a live operator
  // produces correct results.
  void TestSetupF32() const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    std::uniform_real_distribution<float> f32dist;

    // Buffers are sized for the larger of the first-run and second-run
    // ("next") shapes so the same allocation can back both setups.
    std::vector<float> input(XNN_EXTRA_BYTES / sizeof(float) + std::max<size_t>(
      (batch_size() * input_height() * input_width() - 1) * input_pixel_stride() + channels(),
      (next_batch_size() * next_input_height() * next_input_width() - 1) * input_pixel_stride() + channels()));
    std::vector<float> output(std::max<size_t>(
      (batch_size() * output_height() * output_width() - 1) * output_pixel_stride() + channels(),
      (next_batch_size() * next_output_height() * next_output_width() - 1) * output_pixel_stride() + channels()));
    std::vector<float> output_ref(batch_size() * output_height() * output_width() * channels());
    std::vector<float> next_output_ref(next_batch_size() * next_output_height() * next_output_width() * channels());
    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); });
      // Pre-fill the output with NaN so stale data cannot masquerade as results.
      std::fill(output.begin(), output.end(), std::nanf(""));

      // Compute reference results, without clamping.
      for (size_t i = 0; i < batch_size(); i++) {
        for (size_t oy = 0; oy < output_height(); oy++) {
          for (size_t ox = 0; ox < output_width(); ox++) {
            for (size_t c = 0; c < channels(); c++) {
              float acc = 0.0f;
              size_t n = 0;
              for (size_t py = 0; py < pooling_height(); py++) {
                // size_t arithmetic wraps when padding pushes the index
                // "negative", so the single `iy < input_height()` check below
                // rejects both under- and overflowing indices.
                const size_t iy = oy * stride_height() + py - padding_top();
                for (size_t px = 0; px < pooling_width(); px++) {
                  const size_t ix = ox * stride_width() + px - padding_left();
                  if (ix < input_width() && iy < input_height()) {
                    acc += input[((i * input_height() + iy) * input_width() + ix) * input_pixel_stride() + c];
                    n += 1;
                  }
                }
              }
              // Average over the valid (non-padding) elements only.
              output_ref[((i * output_height() + oy) * output_width() + ox) * channels() + c] = acc / float(n);
            }
          }
        }
      }

      // Compute clamping parameters: map qmin()/qmax() (0..255) onto the
      // observed range of unclamped reference outputs; a zero range (all
      // outputs identical) disables clamping.
      const float accumulated_min = *std::min_element(output_ref.cbegin(), output_ref.cend());
      const float accumulated_max = *std::max_element(output_ref.cbegin(), output_ref.cend());
      const float accumulated_range = accumulated_max - accumulated_min;
      const float output_min = accumulated_range == 0.0f ?
        -std::numeric_limits<float>::infinity() :
        accumulated_min + accumulated_range / 255.0f * float(qmin());
      const float output_max = accumulated_range == 0.0f ?
        +std::numeric_limits<float>::infinity() :
        accumulated_max - accumulated_range / 255.0f * float(255 - qmax());

      // Clamp reference results.
      for (float& value : output_ref) {
        value = std::max(std::min(value, output_max), output_min);
      }

      // Create, setup, and run Average Pooling operator once.
      ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
      xnn_operator_t average_pooling_op = nullptr;

      ASSERT_EQ(xnn_status_success,
        xnn_create_average_pooling2d_nhwc_f32(
          padding_top(), padding_right(), padding_bottom(), padding_left(),
          pooling_height(), pooling_width(),
          stride_height(), stride_width(),
          channels(), input_pixel_stride(), output_pixel_stride(),
          output_min, output_max,
          0, &average_pooling_op));
      ASSERT_NE(nullptr, average_pooling_op);

      ASSERT_EQ(xnn_status_success,
        xnn_setup_average_pooling2d_nhwc_f32(
          average_pooling_op,
          batch_size(), input_height(), input_width(),
          input.data(), output.data(),
          nullptr /* thread pool */));

      ASSERT_EQ(xnn_status_success,
        xnn_run_operator(average_pooling_op, nullptr /* thread pool */));

      // Verify results of the first run (relative tolerance of 1e-6).
      for (size_t i = 0; i < batch_size(); i++) {
        for (size_t y = 0; y < output_height(); y++) {
          for (size_t x = 0; x < output_width(); x++) {
            for (size_t c = 0; c < channels(); c++) {
              ASSERT_LE(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + c], output_max);
              ASSERT_GE(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + c], output_min);
              ASSERT_NEAR(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + c],
                  output_ref[((i * output_height() + y) * output_width() + x) * channels() + c],
                  std::abs(output_ref[((i * output_height() + y) * output_width() + x) * channels() + c]) * 1.0e-6f) <<
                "in batch index " << i << ", pixel (" << y << ", " << x << "), channel " << c;
            }
          }
        }
      }

      // Re-generate data for the second run.
      std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); });
      std::fill(output.begin(), output.end(), std::nanf(""));

      // Compute reference results for the second run, clamped with the bounds
      // baked into the operator at creation time.
      for (size_t i = 0; i < next_batch_size(); i++) {
        for (size_t oy = 0; oy < next_output_height(); oy++) {
          for (size_t ox = 0; ox < next_output_width(); ox++) {
            for (size_t c = 0; c < channels(); c++) {
              float acc = 0.0f;
              int32_t n = 0;
              for (size_t py = 0; py < pooling_height(); py++) {
                const size_t iy = oy * stride_height() + py - padding_top();
                for (size_t px = 0; px < pooling_width(); px++) {
                  const size_t ix = ox * stride_width() + px - padding_left();
                  if (ix < next_input_width() && iy < next_input_height()) {
                    acc += input[((i * next_input_height() + iy) * next_input_width() + ix) * input_pixel_stride() + c];
                    n += 1;
                  }
                }
              }
              next_output_ref[((i * next_output_height() + oy) * next_output_width() + ox) * channels() + c] =
                std::max(std::min(acc / float(n), output_max), output_min);
            }
          }
        }
      }

      // Setup and run Average Pooling operator the second time, and destroy the operator.
      ASSERT_EQ(xnn_status_success,
        xnn_setup_average_pooling2d_nhwc_f32(
          average_pooling_op,
          next_batch_size(), next_input_height(), next_input_width(),
          input.data(), output.data(),
          nullptr /* thread pool */));

      ASSERT_EQ(xnn_status_success,
        xnn_run_operator(average_pooling_op, nullptr /* thread pool */));

      ASSERT_EQ(xnn_status_success,
        xnn_delete_operator(average_pooling_op));
      average_pooling_op = nullptr;

      // Verify results of the second run.
      for (size_t i = 0; i < next_batch_size(); i++) {
        for (size_t y = 0; y < next_output_height(); y++) {
          for (size_t x = 0; x < next_output_width(); x++) {
            for (size_t c = 0; c < channels(); c++) {
              ASSERT_LE(output[((i * next_output_height() + y) * next_output_width() + x) * output_pixel_stride() + c], output_max);
              ASSERT_GE(output[((i * next_output_height() + y) * next_output_width() + x) * output_pixel_stride() + c], output_min);
              ASSERT_NEAR(output[((i * next_output_height() + y) * next_output_width() + x) * output_pixel_stride() + c],
                  next_output_ref[((i * next_output_height() + y) * next_output_width() + x) * channels() + c],
                  std::abs(next_output_ref[((i * next_output_height() + y) * next_output_width() + x) * channels() + c]) * 1.0e-6f) <<
                "in batch index " << i << ", pixel (" << y << ", " << x << "), channel " << c;
            }
          }
        }
      }
    }
  }
1064 
TestSetupQU8()1065   void TestSetupQU8() const {
1066     std::random_device random_device;
1067     auto rng = std::mt19937(random_device());
1068     std::uniform_int_distribution<int32_t> u8dist(
1069       std::numeric_limits<uint8_t>::min(), std::numeric_limits<uint8_t>::max());
1070 
1071     std::vector<uint8_t> input(XNN_EXTRA_BYTES / sizeof(uint8_t) + std::max<size_t>(
1072       (batch_size() * input_height() * input_width() - 1) * input_pixel_stride() + channels(),
1073       (next_batch_size() * next_input_height() * next_input_width() - 1) * input_pixel_stride() + channels()));
1074     std::vector<uint8_t> output(std::max<size_t>(
1075       (batch_size() * output_height() * output_width() - 1) * output_pixel_stride() + channels(),
1076       (next_batch_size() * next_output_height() * next_output_width() - 1) * output_pixel_stride() + channels()));
1077     std::vector<float> output_ref(batch_size() * output_height() * output_width() * channels());
1078     std::vector<float> next_output_ref(next_batch_size() * next_output_height() * next_output_width() * channels());
1079     for (size_t iteration = 0; iteration < iterations(); iteration++) {
1080       std::generate(input.begin(), input.end(), [&]() { return u8dist(rng); });
1081       std::fill(output.begin(), output.end(), INT8_C(0xA5));
1082 
1083       // Compute reference results.
1084       const double scale = double(input_scale()) / (double(output_scale()) * double(pooling_height() * pooling_width()));
1085       for (size_t i = 0; i < batch_size(); i++) {
1086         for (size_t oy = 0; oy < output_height(); oy++) {
1087           for (size_t ox = 0; ox < output_width(); ox++) {
1088             for (size_t c = 0; c < channels(); c++) {
1089               double acc = 0.0f;
1090               for (size_t py = 0; py < pooling_height(); py++) {
1091                 const size_t iy = oy * stride_height() + py - padding_top();
1092                 for (size_t px = 0; px < pooling_width(); px++) {
1093                   const size_t ix = ox * stride_width() + px - padding_left();
1094                   if (ix < input_width() && iy < input_height()) {
1095                     acc += double(int32_t(input[((i * input_height() + iy) * input_width() + ix) * input_pixel_stride() + c]) - int32_t(input_zero_point()));
1096                   }
1097                 }
1098               }
1099               output_ref[((i * output_height() + oy) * output_width() + ox) * channels() + c] = float(acc * scale + double(output_zero_point()));
1100               output_ref[((i * output_height() + oy) * output_width() + ox) * channels() + c] =
1101                 std::min<float>(output_ref[((i * output_height() + oy) * output_width() + ox) * channels() + c], float(qmax()));
1102               output_ref[((i * output_height() + oy) * output_width() + ox) * channels() + c] =
1103                 std::max<float>(output_ref[((i * output_height() + oy) * output_width() + ox) * channels() + c], float(qmin()));
1104             }
1105           }
1106         }
1107       }
1108 
1109       // Create, setup, and run Average Pooling operator once.
1110       ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
1111       xnn_operator_t average_pooling_op = nullptr;
1112 
1113       ASSERT_EQ(xnn_status_success,
1114         xnn_create_average_pooling2d_nhwc_qu8(
1115           padding_top(), padding_right(), padding_bottom(), padding_left(),
1116           pooling_height(), pooling_width(),
1117           stride_height(), stride_width(),
1118           channels(), input_pixel_stride(), output_pixel_stride(),
1119           input_zero_point(), input_scale(),
1120           output_zero_point(), output_scale(),
1121           qmin(), qmax(),
1122           0, &average_pooling_op));
1123       ASSERT_NE(nullptr, average_pooling_op);
1124 
1125       ASSERT_EQ(xnn_status_success,
1126         xnn_setup_average_pooling2d_nhwc_qu8(
1127           average_pooling_op,
1128           batch_size(), input_height(), input_width(),
1129           input.data(), output.data(),
1130           nullptr /* thread pool */));
1131 
1132       ASSERT_EQ(xnn_status_success,
1133         xnn_run_operator(average_pooling_op, nullptr /* thread pool */));
1134 
1135       // Verify results of the first run.
1136       for (size_t i = 0; i < batch_size(); i++) {
1137         for (size_t y = 0; y < output_height(); y++) {
1138           for (size_t x = 0; x < output_width(); x++) {
1139             for (size_t c = 0; c < channels(); c++) {
1140               ASSERT_LE(uint32_t(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + c]), uint32_t(qmax()));
1141               ASSERT_GE(uint32_t(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + c]), uint32_t(qmin()));
1142               ASSERT_NEAR(float(int32_t(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + c])),
1143                 output_ref[((i * output_height() + y) * output_width() + x) * channels() + c], 0.80f) <<
1144                 "in batch index " << i << ", pixel (" << y << ", " << x << "), channel " << c;
1145             }
1146           }
1147         }
1148       }
1149 
1150       // Re-generate data for the second run.
1151       std::generate(input.begin(), input.end(), [&]() { return u8dist(rng); });
1152       std::fill(output.begin(), output.end(), UINT8_C(0xA5));
1153 
1154       // Compute reference results for the second run.
1155       for (size_t i = 0; i < next_batch_size(); i++) {
1156         for (size_t oy = 0; oy < next_output_height(); oy++) {
1157           for (size_t ox = 0; ox < next_output_width(); ox++) {
1158             for (size_t c = 0; c < channels(); c++) {
1159               double acc = 0.0f;
1160               for (size_t py = 0; py < pooling_height(); py++) {
1161                 const size_t iy = oy * stride_height() + py - padding_top();
1162                 for (size_t px = 0; px < pooling_width(); px++) {
1163                   const size_t ix = ox * stride_width() + px - padding_left();
1164                   if (ix < next_input_width() && iy < next_input_height()) {
1165                     acc += double(int32_t(input[((i * next_input_height() + iy) * next_input_width() + ix) * input_pixel_stride() + c]) - int32_t(input_zero_point()));
1166                   }
1167                 }
1168               }
1169               next_output_ref[((i * next_output_height() + oy) * next_output_width() + ox) * channels() + c] = float(acc * scale + double(output_zero_point()));
1170               next_output_ref[((i * next_output_height() + oy) * next_output_width() + ox) * channels() + c] =
1171                 std::min<float>(next_output_ref[((i * next_output_height() + oy) * next_output_width() + ox) * channels() + c], float(qmax()));
1172               next_output_ref[((i * next_output_height() + oy) * next_output_width() + ox) * channels() + c] =
1173                 std::max<float>(next_output_ref[((i * next_output_height() + oy) * next_output_width() + ox) * channels() + c], float(qmin()));
1174             }
1175           }
1176         }
1177       }
1178 
1179       // Setup and run Average Pooling operator the second time, and destroy the operator.
1180       ASSERT_EQ(xnn_status_success,
1181         xnn_setup_average_pooling2d_nhwc_qu8(
1182           average_pooling_op,
1183           next_batch_size(), next_input_height(), next_input_width(),
1184           input.data(), output.data(),
1185           nullptr /* thread pool */));
1186 
1187       ASSERT_EQ(xnn_status_success,
1188         xnn_run_operator(average_pooling_op, nullptr /* thread pool */));
1189 
1190       ASSERT_EQ(xnn_status_success,
1191         xnn_delete_operator(average_pooling_op));
1192       average_pooling_op = nullptr;
1193 
1194       // Verify results of the second run.
1195       for (size_t i = 0; i < next_batch_size(); i++) {
1196         for (size_t y = 0; y < next_output_height(); y++) {
1197           for (size_t x = 0; x < next_output_width(); x++) {
1198             for (size_t c = 0; c < channels(); c++) {
1199               ASSERT_LE(uint32_t(output[((i * next_output_height() + y) * next_output_width() + x) * output_pixel_stride() + c]), uint32_t(qmax()));
1200               ASSERT_GE(uint32_t(output[((i * next_output_height() + y) * next_output_width() + x) * output_pixel_stride() + c]), uint32_t(qmin()));
1201               ASSERT_NEAR(float(int32_t(output[((i * next_output_height() + y) * next_output_width() + x) * output_pixel_stride() + c])),
1202                 next_output_ref[((i * next_output_height() + y) * next_output_width() + x) * channels() + c], 0.80f) <<
1203                 "in batch index " << i << ", pixel (" << y << ", " << x << "), channel " << c;
1204             }
1205           }
1206         }
1207       }
1208     }
1209   }
1210 
 private:
  // Explicit padding on each edge; must all stay 0 when padding_tf_same_ is set
  // (enforced by asserts in padding_tf_same()).
  uint32_t padding_top_{0};
  uint32_t padding_right_{0};
  uint32_t padding_bottom_{0};
  uint32_t padding_left_{0};
  // When true, TensorFlow SAME padding is used instead of explicit padding.
  bool padding_tf_same_{false};
  // First-run input geometry (NHWC).
  size_t input_height_{1};
  size_t input_width_{1};
  size_t channels_{1};
  size_t batch_size_{1};
  // Element strides between consecutive pixels; 0 presumably means "default to
  // channels()" — confirm against the input_pixel_stride()/output_pixel_stride()
  // accessors (not visible in this chunk).
  size_t input_pixel_stride_{0};
  size_t output_pixel_stride_{0};
  // Pooling window size and stride.
  uint32_t pooling_height_{1};
  uint32_t pooling_width_{1};
  uint32_t stride_height_{1};
  uint32_t stride_width_{1};
  // Shape for the second run of the TestSetup* cases; 0 presumably means "same
  // as the first run" — confirm against the next_*() accessors.
  size_t next_input_height_{0};
  size_t next_input_width_{0};
  size_t next_batch_size_{0};
  // Quantization parameters for the qu8 tests.
  float input_scale_{1.0f};
  float output_scale_{1.0f};
  uint8_t input_zero_point_{121};
  uint8_t output_zero_point_{133};
  // Output clamping range, expressed in the quantized [0, 255] domain (the
  // float tests map it onto the observed output range).
  uint8_t qmin_{0};
  uint8_t qmax_{255};
  // Number of times each Test* case repeats with fresh random data.
  size_t iterations_{1};
};
1238