// Copyright (c) Facebook, Inc. and its affiliates.
// All rights reserved.
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#pragma once

#include <gtest/gtest.h>

#include <fp16.h>

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdlib>
#include <limits>
#include <random>
#include <vector>

#include <xnnpack.h>

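// Gtest-based helper that builds XNNPACK 2D max-pooling operators (NHWC
// layout; S8, U8, F16, and F32 datatypes) and verifies them against a scalar
// reference implementation. A minimal usage sketch follows; the parameter
// values are illustrative only, not taken from any particular test:
//
//   MaxPoolingOperatorTester()
//     .input_size(7, 7)
//     .pooling_size(3, 3)
//     .stride(2)
//     .channels(19)
//     .TestF32();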
class MaxPoolingOperatorTester {
 public:
  inline MaxPoolingOperatorTester& padding_tf_same(bool padding_same) {
    if (padding_same) {
      assert(padding_top() == 0);
      assert(padding_left() == 0);
      assert(padding_bottom() == 0);
      assert(padding_right() == 0);
    }
    this->padding_tf_same_ = padding_same;
    return *this;
  }

  inline bool padding_tf_same() const {
    return this->padding_tf_same_;
  }

  inline MaxPoolingOperatorTester& padding(uint32_t padding) {
    assert(!padding_tf_same());
    this->padding_top_ = padding;
    this->padding_right_ = padding;
    this->padding_bottom_ = padding;
    this->padding_left_ = padding;
    return *this;
  }

  inline MaxPoolingOperatorTester& padding(uint32_t padding_height, uint32_t padding_width) {
    assert(!padding_tf_same());
    this->padding_top_ = padding_height;
    this->padding_right_ = padding_width;
    this->padding_bottom_ = padding_height;
    this->padding_left_ = padding_width;
    return *this;
  }

  inline MaxPoolingOperatorTester& padding_height(uint32_t padding_height) {
    assert(!padding_tf_same());
    this->padding_top_ = padding_height;
    this->padding_bottom_ = padding_height;
    return *this;
  }

  inline MaxPoolingOperatorTester& padding_width(uint32_t padding_width) {
    assert(!padding_tf_same());
    this->padding_right_ = padding_width;
    this->padding_left_ = padding_width;
    return *this;
  }

  inline MaxPoolingOperatorTester& padding_top(uint32_t padding_top) {
    assert(!padding_tf_same());
    this->padding_top_ = padding_top;
    return *this;
  }

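  // With TF SAME padding, the total padding along each dimension is derived
  // from the output size below; the top/left edge receives the smaller half
  // and the bottom/right edge the remainder, matching TensorFlow's convention.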
  inline uint32_t padding_top() const {
    if (padding_tf_same()) {
      const uint32_t total_padding_height =
        (output_height() - 1) * stride_height() + dilated_pooling_height() - input_height();
      return total_padding_height / 2;
    } else {
      return this->padding_top_;
    }
  }

  inline MaxPoolingOperatorTester& padding_left(uint32_t padding_left) {
    assert(!padding_tf_same());
    this->padding_left_ = padding_left;
    return *this;
  }

  inline uint32_t padding_left() const {
    if (padding_tf_same()) {
      const uint32_t total_padding_width =
        (output_width() - 1) * stride_width() + dilated_pooling_width() - input_width();
      return total_padding_width / 2;
    } else {
      return this->padding_left_;
    }
  }

  inline MaxPoolingOperatorTester& padding_bottom(uint32_t padding_bottom) {
    assert(!padding_tf_same());
    this->padding_bottom_ = padding_bottom;
    return *this;
  }

  inline uint32_t padding_bottom() const {
    if (padding_tf_same()) {
      const uint32_t total_padding_height =
        (output_height() - 1) * stride_height() + dilated_pooling_height() - input_height();
      return total_padding_height - total_padding_height / 2;
    } else {
      return this->padding_bottom_;
    }
  }

  inline MaxPoolingOperatorTester& padding_right(uint32_t padding_right) {
    assert(!padding_tf_same());
    this->padding_right_ = padding_right;
    return *this;
  }

  inline uint32_t padding_right() const {
    if (padding_tf_same()) {
      const uint32_t total_padding_width =
        (output_width() - 1) * stride_width() + dilated_pooling_width() - input_width();
      return total_padding_width - total_padding_width / 2;
    } else {
      return this->padding_right_;
    }
  }

  inline MaxPoolingOperatorTester& input_size(size_t input_height, size_t input_width) {
    assert(input_height >= 1);
    assert(input_width >= 1);
    this->input_height_ = input_height;
    this->input_width_ = input_width;
    return *this;
  }

  inline MaxPoolingOperatorTester& input_height(size_t input_height) {
    assert(input_height >= 1);
    this->input_height_ = input_height;
    return *this;
  }

  inline size_t input_height() const {
    return this->input_height_;
  }

  inline MaxPoolingOperatorTester& input_width(size_t input_width) {
    assert(input_width >= 1);
    this->input_width_ = input_width;
    return *this;
  }

  inline size_t input_width() const {
    return this->input_width_;
  }

  inline MaxPoolingOperatorTester& channels(size_t channels) {
    assert(channels != 0);
    this->channels_ = channels;
    return *this;
  }

  inline size_t channels() const {
    return this->channels_;
  }

  inline MaxPoolingOperatorTester& batch_size(size_t batch_size) {
    assert(batch_size != 0);
    this->batch_size_ = batch_size;
    return *this;
  }

  inline size_t batch_size() const {
    return this->batch_size_;
  }

  inline MaxPoolingOperatorTester& pooling_size(uint32_t pooling_size) {
    assert(pooling_size >= 1);
    this->pooling_height_ = pooling_size;
    this->pooling_width_ = pooling_size;
    return *this;
  }

  inline MaxPoolingOperatorTester& pooling_size(uint32_t pooling_height, uint32_t pooling_width) {
    assert(pooling_height >= 1);
    assert(pooling_width >= 1);
    this->pooling_height_ = pooling_height;
    this->pooling_width_ = pooling_width;
    return *this;
  }

  inline MaxPoolingOperatorTester& pooling_height(uint32_t pooling_height) {
    assert(pooling_height >= 1);
    this->pooling_height_ = pooling_height;
    return *this;
  }

  inline uint32_t pooling_height() const {
    return this->pooling_height_;
  }

  inline MaxPoolingOperatorTester& pooling_width(uint32_t pooling_width) {
    assert(pooling_width >= 1);
    this->pooling_width_ = pooling_width;
    return *this;
  }

  inline uint32_t pooling_width() const {
    return this->pooling_width_;
  }

  inline MaxPoolingOperatorTester& stride(uint32_t stride) {
    assert(stride >= 1);
    this->stride_height_ = stride;
    this->stride_width_ = stride;
    return *this;
  }

  inline MaxPoolingOperatorTester& stride(uint32_t stride_height, uint32_t stride_width) {
    assert(stride_height >= 1);
    assert(stride_width >= 1);
    this->stride_height_ = stride_height;
    this->stride_width_ = stride_width;
    return *this;
  }

  inline MaxPoolingOperatorTester& stride_height(uint32_t stride_height) {
    assert(stride_height >= 1);
    this->stride_height_ = stride_height;
    return *this;
  }

  inline uint32_t stride_height() const {
    return this->stride_height_;
  }

  inline MaxPoolingOperatorTester& stride_width(uint32_t stride_width) {
    assert(stride_width >= 1);
    this->stride_width_ = stride_width;
    return *this;
  }

  inline uint32_t stride_width() const {
    return this->stride_width_;
  }

  inline MaxPoolingOperatorTester& dilation(uint32_t dilation) {
    assert(dilation >= 1);
    this->dilation_height_ = dilation;
    this->dilation_width_ = dilation;
    return *this;
  }

  inline MaxPoolingOperatorTester& dilation(uint32_t dilation_height, uint32_t dilation_width) {
    assert(dilation_height >= 1);
    assert(dilation_width >= 1);
    this->dilation_height_ = dilation_height;
    this->dilation_width_ = dilation_width;
    return *this;
  }

  inline MaxPoolingOperatorTester& dilation_height(uint32_t dilation_height) {
    assert(dilation_height >= 1);
    this->dilation_height_ = dilation_height;
    return *this;
  }

  inline uint32_t dilation_height() const {
    return this->dilation_height_;
  }

  inline MaxPoolingOperatorTester& dilation_width(uint32_t dilation_width) {
    assert(dilation_width >= 1);
    this->dilation_width_ = dilation_width;
    return *this;
  }

  inline uint32_t dilation_width() const {
    return this->dilation_width_;
  }

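  // Effective kernel extent once dilation is applied:
  // dilated size = (pooling size - 1) * dilation + 1.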
  inline uint32_t dilated_pooling_height() const {
    return (pooling_height() - 1) * dilation_height() + 1;
  }

  inline uint32_t dilated_pooling_width() const {
    return (pooling_width() - 1) * dilation_width() + 1;
  }

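  // Expected output size: with TF SAME padding it is ceil(input / stride);
  // otherwise it is floor((padded input - dilated kernel) / stride) + 1,
  // clamped below at 1.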
  inline size_t output_height() const {
    if (padding_tf_same()) {
      return (input_height() + stride_height() - 1) / stride_height();
    } else {
      const size_t padded_input_height = padding_top() + input_height() + padding_bottom();
      if (padded_input_height <= dilated_pooling_height()) {
        return 1;
      } else {
        return (padded_input_height - dilated_pooling_height()) / stride_height() + 1;
      }
    }
  }

  inline size_t output_width() const {
    if (padding_tf_same()) {
      return (input_width() + stride_width() - 1) / stride_width();
    } else {
      const size_t padded_input_width = padding_left() + input_width() + padding_right();
      if (padded_input_width <= dilated_pooling_width()) {
        return 1;
      } else {
        return (padded_input_width - dilated_pooling_width()) / stride_width() + 1;
      }
    }
  }

  inline MaxPoolingOperatorTester& input_pixel_stride(size_t input_pixel_stride) {
    assert(input_pixel_stride != 0);
    this->input_pixel_stride_ = input_pixel_stride;
    return *this;
  }

  inline size_t input_pixel_stride() const {
    if (this->input_pixel_stride_ == 0) {
      return channels();
    } else {
      assert(this->input_pixel_stride_ >= channels());
      return this->input_pixel_stride_;
    }
  }

  inline MaxPoolingOperatorTester& output_pixel_stride(size_t output_pixel_stride) {
    assert(output_pixel_stride != 0);
    this->output_pixel_stride_ = output_pixel_stride;
    return *this;
  }

  inline size_t output_pixel_stride() const {
    if (this->output_pixel_stride_ == 0) {
      return channels();
    } else {
      assert(this->output_pixel_stride_ >= channels());
      return this->output_pixel_stride_;
    }
  }

  inline MaxPoolingOperatorTester& next_input_size(uint32_t next_input_height, uint32_t next_input_width) {
    assert(next_input_height >= 1);
    assert(next_input_width >= 1);
    this->next_input_height_ = next_input_height;
    this->next_input_width_ = next_input_width;
    return *this;
  }

  inline MaxPoolingOperatorTester& next_input_height(uint32_t next_input_height) {
    assert(next_input_height >= 1);
    this->next_input_height_ = next_input_height;
    return *this;
  }

  inline uint32_t next_input_height() const {
    if (this->next_input_height_ == 0) {
      return input_height();
    } else {
      return this->next_input_height_;
    }
  }

  inline MaxPoolingOperatorTester& next_input_width(uint32_t next_input_width) {
    assert(next_input_width >= 1);
    this->next_input_width_ = next_input_width;
    return *this;
  }

  inline uint32_t next_input_width() const {
    if (this->next_input_width_ == 0) {
      return input_width();
    } else {
      return this->next_input_width_;
    }
  }

  inline size_t next_output_height() const {
    const size_t padded_next_input_height = padding_top() + next_input_height() + padding_bottom();
    if (padded_next_input_height <= dilated_pooling_height()) {
      return 1;
    } else {
      return (padded_next_input_height - dilated_pooling_height()) / stride_height() + 1;
    }
  }

  inline size_t next_output_width() const {
    const size_t padded_next_input_width = padding_left() + next_input_width() + padding_right();
    if (padded_next_input_width <= dilated_pooling_width()) {
      return 1;
    } else {
      return (padded_next_input_width - dilated_pooling_width()) / stride_width() + 1;
    }
  }

  inline MaxPoolingOperatorTester& next_batch_size(size_t next_batch_size) {
    assert(next_batch_size >= 1);
    this->next_batch_size_ = next_batch_size;
    return *this;
  }

  inline size_t next_batch_size() const {
    if (this->next_batch_size_ == 0) {
      return batch_size();
    } else {
      return this->next_batch_size_;
    }
  }

  inline MaxPoolingOperatorTester& qmin(uint8_t qmin) {
    this->qmin_ = qmin;
    return *this;
  }

  inline uint8_t qmin() const {
    return this->qmin_;
  }

  inline MaxPoolingOperatorTester& qmax(uint8_t qmax) {
    this->qmax_ = qmax;
    return *this;
  }

  inline uint8_t qmax() const {
    return this->qmax_;
  }

  inline MaxPoolingOperatorTester& iterations(size_t iterations) {
    this->iterations_ = iterations;
    return *this;
  }

  inline size_t iterations() const {
    return this->iterations_;
  }

  void TestS8() const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    std::uniform_int_distribution<int32_t> i8dist(
      std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max());

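    // Buffers are over-allocated by XNN_EXTRA_BYTES because XNNPACK
    // micro-kernels may read (but never write) slightly past the last element.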
    std::vector<int8_t> input((batch_size() * input_height() * input_width() - 1) * input_pixel_stride() + channels() + XNN_EXTRA_BYTES / sizeof(int8_t));
    std::vector<int8_t> output((batch_size() * output_height() * output_width() - 1) * output_pixel_stride() + channels() + XNN_EXTRA_BYTES / sizeof(int8_t));
    std::vector<int8_t> output_ref(batch_size() * output_height() * output_width() * channels());
    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      std::generate(input.begin(), input.end(), [&]() { return i8dist(rng); });
      std::fill(output.begin(), output.end(), 0xA5);

      // Compute reference results.
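      // Note: iy/ix are size_t, so taps that fall into the padding region wrap
      // around to huge values and are rejected by the bounds checks below.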
      for (size_t i = 0; i < batch_size(); i++) {
        for (size_t oy = 0; oy < output_height(); oy++) {
          for (size_t ox = 0; ox < output_width(); ox++) {
            for (size_t c = 0; c < channels(); c++) {
              int8_t max_value = std::numeric_limits<int8_t>::min();
              for (size_t py = 0; py < pooling_height(); py++) {
                const size_t iy = oy * stride_height() + py * dilation_height() - padding_top();
                for (size_t px = 0; px < pooling_width(); px++) {
                  const size_t ix = ox * stride_width() + px * dilation_width() - padding_left();
                  if (ix < input_width() && iy < input_height()) {
                    max_value = std::max(max_value,
                      input[((i * input_height() + iy) * input_width() + ix) * input_pixel_stride() + c]);
                  }
                }
              }
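              // qmin()/qmax() are expressed in the uint8 domain; subtracting
              // 0x80 maps them onto the int8 range used by the S8 operator.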
              max_value = std::min(max_value, int8_t(qmax() - 0x80));
              max_value = std::max(max_value, int8_t(qmin() - 0x80));
              output_ref[((i * output_height() + oy) * output_width() + ox) * channels() + c] = max_value;
            }
          }
        }
      }

      // Create, setup, run, and destroy Max Pooling operator.
      ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
      xnn_operator_t max_pooling_op = nullptr;

      ASSERT_EQ(xnn_status_success,
        xnn_create_max_pooling2d_nhwc_s8(
          padding_tf_same() ? 0 : padding_top(), padding_tf_same() ? 0 : padding_right(),
          padding_tf_same() ? 0 : padding_bottom(), padding_tf_same() ? 0 : padding_left(),
          pooling_height(), pooling_width(),
          stride_height(), stride_width(),
          dilation_height(), dilation_width(),
          channels(), input_pixel_stride(), output_pixel_stride(),
          int8_t(qmin() - 0x80), int8_t(qmax() - 0x80),
          padding_tf_same() ? XNN_FLAG_TENSORFLOW_SAME_PADDING : 0,
          &max_pooling_op));
      ASSERT_NE(nullptr, max_pooling_op);

      // Smart pointer to automatically delete max_pooling_op.
      std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_max_pooling_op(max_pooling_op, xnn_delete_operator);

      ASSERT_EQ(xnn_status_success,
        xnn_setup_max_pooling2d_nhwc_s8(
          max_pooling_op,
          batch_size(), input_height(), input_width(),
          input.data(), output.data(),
          nullptr /* thread pool */));

      ASSERT_EQ(xnn_status_success,
        xnn_run_operator(max_pooling_op, nullptr /* thread pool */));

      // Verify results.
      for (size_t i = 0; i < batch_size(); i++) {
        for (size_t y = 0; y < output_height(); y++) {
          for (size_t x = 0; x < output_width(); x++) {
            for (size_t c = 0; c < channels(); c++) {
              ASSERT_LE(int32_t(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + c]), int32_t(qmax() - 0x80));
              ASSERT_GE(int32_t(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + c]), int32_t(qmin() - 0x80));
              ASSERT_EQ(int32_t(output_ref[((i * output_height() + y) * output_width() + x) * channels() + c]),
                int32_t(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + c])) <<
                "in batch index " << i << ", pixel (" << y << ", " << x << "), channel " << c;
            }
          }
        }
      }
    }
  }

  void TestU8() const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    std::uniform_int_distribution<int32_t> u8dist(
      std::numeric_limits<uint8_t>::min(), std::numeric_limits<uint8_t>::max());

    std::vector<uint8_t> input((batch_size() * input_height() * input_width() - 1) * input_pixel_stride() + channels() + XNN_EXTRA_BYTES / sizeof(uint8_t));
    std::vector<uint8_t> output((batch_size() * output_height() * output_width() - 1) * output_pixel_stride() + channels() + XNN_EXTRA_BYTES / sizeof(uint8_t));
    std::vector<uint8_t> output_ref(batch_size() * output_height() * output_width() * channels());
    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      std::generate(input.begin(), input.end(), [&]() { return u8dist(rng); });
      std::fill(output.begin(), output.end(), 0xA5);

      // Compute reference results.
      for (size_t i = 0; i < batch_size(); i++) {
        for (size_t oy = 0; oy < output_height(); oy++) {
          for (size_t ox = 0; ox < output_width(); ox++) {
            for (size_t c = 0; c < channels(); c++) {
              uint8_t max_value = 0;
              for (size_t py = 0; py < pooling_height(); py++) {
                const size_t iy = oy * stride_height() + py * dilation_height() - padding_top();
                for (size_t px = 0; px < pooling_width(); px++) {
                  const size_t ix = ox * stride_width() + px * dilation_width() - padding_left();
                  if (ix < input_width() && iy < input_height()) {
                    max_value = std::max(max_value,
                      input[((i * input_height() + iy) * input_width() + ix) * input_pixel_stride() + c]);
                  }
                }
              }
              max_value = std::min(max_value, qmax());
              max_value = std::max(max_value, qmin());
              output_ref[((i * output_height() + oy) * output_width() + ox) * channels() + c] = max_value;
            }
          }
        }
      }

      // Create, setup, run, and destroy Max Pooling operator.
      ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
      xnn_operator_t max_pooling_op = nullptr;

      ASSERT_EQ(xnn_status_success,
        xnn_create_max_pooling2d_nhwc_u8(
          padding_tf_same() ? 0 : padding_top(), padding_tf_same() ? 0 : padding_right(),
          padding_tf_same() ? 0 : padding_bottom(), padding_tf_same() ? 0 : padding_left(),
          pooling_height(), pooling_width(),
          stride_height(), stride_width(),
          dilation_height(), dilation_width(),
          channels(), input_pixel_stride(), output_pixel_stride(),
          qmin(), qmax(),
          padding_tf_same() ? XNN_FLAG_TENSORFLOW_SAME_PADDING : 0,
          &max_pooling_op));
      ASSERT_NE(nullptr, max_pooling_op);

      // Smart pointer to automatically delete max_pooling_op.
      std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_max_pooling_op(max_pooling_op, xnn_delete_operator);

      ASSERT_EQ(xnn_status_success,
        xnn_setup_max_pooling2d_nhwc_u8(
          max_pooling_op,
          batch_size(), input_height(), input_width(),
          input.data(), output.data(),
          nullptr /* thread pool */));

      ASSERT_EQ(xnn_status_success,
        xnn_run_operator(max_pooling_op, nullptr /* thread pool */));

      // Verify results.
      for (size_t i = 0; i < batch_size(); i++) {
        for (size_t y = 0; y < output_height(); y++) {
          for (size_t x = 0; x < output_width(); x++) {
            for (size_t c = 0; c < channels(); c++) {
              ASSERT_LE(uint32_t(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + c]), uint32_t(qmax()));
              ASSERT_GE(uint32_t(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + c]), uint32_t(qmin()));
              ASSERT_EQ(uint32_t(output_ref[((i * output_height() + y) * output_width() + x) * channels() + c]),
                uint32_t(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + c])) <<
                "in batch index " << i << ", pixel (" << y << ", " << x << "), channel " << c;
            }
          }
        }
      }
    }
  }

  void TestF16() const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    // Note: we need to avoid FP16 denormals in the generated tensor because they might be processed differently in
    // native vs emulated arithmetic, and we use exact comparison to verify the results against the reference.
    std::uniform_real_distribution<float> f32dist(0.001f, 1.0f);

    std::vector<uint16_t> input((batch_size() * input_height() * input_width() - 1) * input_pixel_stride() + channels() + XNN_EXTRA_BYTES / sizeof(uint16_t));
    std::vector<uint16_t> output((batch_size() * output_height() * output_width() - 1) * output_pixel_stride() + channels() + XNN_EXTRA_BYTES / sizeof(uint16_t));
    std::vector<float> output_ref(batch_size() * output_height() * output_width() * channels());
    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      std::generate(input.begin(), input.end(), [&]() { return fp16_ieee_from_fp32_value(f32dist(rng)); });
      std::fill(output.begin(), output.end(), UINT16_C(0x7E00) /* NaN */);

      // Compute reference results, without clamping.
      for (size_t i = 0; i < batch_size(); i++) {
        for (size_t oy = 0; oy < output_height(); oy++) {
          for (size_t ox = 0; ox < output_width(); ox++) {
            for (size_t c = 0; c < channels(); c++) {
              float max_value = -std::numeric_limits<float>::infinity();
              for (size_t py = 0; py < pooling_height(); py++) {
                const size_t iy = oy * stride_height() + py * dilation_height() - padding_top();
                for (size_t px = 0; px < pooling_width(); px++) {
                  const size_t ix = ox * stride_width() + px * dilation_width() - padding_left();
                  if (ix < input_width() && iy < input_height()) {
                    max_value = std::max(max_value,
                      fp16_ieee_to_fp32_value(input[((i * input_height() + iy) * input_width() + ix) * input_pixel_stride() + c]));
                  }
                }
              }
              output_ref[((i * output_height() + oy) * output_width() + ox) * channels() + c] = max_value;
            }
          }
        }
      }

      // Compute clamping parameters.
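      // The thresholds are carved out of the observed output range so that
      // clamping actually takes effect for interior qmin()/qmax() values, and
      // are rounded to FP16 so the operator and the reference agree exactly.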
      const float accumulated_min = *std::min_element(output_ref.cbegin(), output_ref.cend());
      const float accumulated_max = *std::max_element(output_ref.cbegin(), output_ref.cend());
      const float accumulated_range = accumulated_max - accumulated_min;
      float output_min = accumulated_min + accumulated_range / 255.0f * float(qmin());
      float output_max = accumulated_max - accumulated_range / 255.0f * float(255 - qmax());
      output_min = fp16_ieee_to_fp32_value(fp16_ieee_from_fp32_value(output_min));
      output_max = fp16_ieee_to_fp32_value(fp16_ieee_from_fp32_value(output_max));
      if (accumulated_range == 0.0f) {
        output_min = -std::numeric_limits<float>::infinity();
        output_max = +std::numeric_limits<float>::infinity();
      }
      if (qmin() == std::numeric_limits<uint8_t>::min()) {
        output_min = -std::numeric_limits<float>::infinity();
      }
      if (qmax() == std::numeric_limits<uint8_t>::max()) {
        output_max = +std::numeric_limits<float>::infinity();
      }

      // Clamp reference results.
      for (float& value : output_ref) {
        value = std::max(std::min(value, output_max), output_min);
      }

      // Create, setup, run, and destroy Max Pooling operator.
      ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
      xnn_operator_t max_pooling_op = nullptr;

      const xnn_status status = xnn_create_max_pooling2d_nhwc_f16(
          padding_tf_same() ? 0 : padding_top(), padding_tf_same() ? 0 : padding_right(),
          padding_tf_same() ? 0 : padding_bottom(), padding_tf_same() ? 0 : padding_left(),
          pooling_height(), pooling_width(),
          stride_height(), stride_width(),
          dilation_height(), dilation_width(),
          channels(), input_pixel_stride(), output_pixel_stride(),
          output_min, output_max,
          padding_tf_same() ? XNN_FLAG_TENSORFLOW_SAME_PADDING : 0,
          &max_pooling_op);
      if (status == xnn_status_unsupported_hardware) {
        GTEST_SKIP();
      }
      ASSERT_EQ(xnn_status_success, status);
      ASSERT_NE(nullptr, max_pooling_op);

      // Smart pointer to automatically delete max_pooling_op.
      std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_max_pooling_op(max_pooling_op, xnn_delete_operator);

      ASSERT_EQ(xnn_status_success,
        xnn_setup_max_pooling2d_nhwc_f16(
          max_pooling_op,
          batch_size(), input_height(), input_width(),
          input.data(), output.data(),
          nullptr /* thread pool */));

      ASSERT_EQ(xnn_status_success,
        xnn_run_operator(max_pooling_op, nullptr /* thread pool */));

      // Verify results.
      for (size_t i = 0; i < batch_size(); i++) {
        for (size_t y = 0; y < output_height(); y++) {
          for (size_t x = 0; x < output_width(); x++) {
            for (size_t c = 0; c < channels(); c++) {
              ASSERT_LE(fp16_ieee_to_fp32_value(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + c]), output_max);
              ASSERT_GE(fp16_ieee_to_fp32_value(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + c]), output_min);
              ASSERT_EQ(
                  fp16_ieee_to_fp32_value(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + c]),
                  output_ref[((i * output_height() + y) * output_width() + x) * channels() + c]) <<
                "in batch index " << i << ", pixel (" << y << ", " << x << "), channel " << c
                << ", min = " << output_min << ", max = " << output_max;
            }
          }
        }
      }
    }
  }

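  // The F32 path derives its clamping thresholds from the observed output
  // range in the same way as F16, but needs no rounding of the thresholds
  // since outputs are computed directly in FP32.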
  void TestF32() const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    std::uniform_real_distribution<float> f32dist;

    std::vector<float> input((batch_size() * input_height() * input_width() - 1) * input_pixel_stride() + channels() + XNN_EXTRA_BYTES / sizeof(float));
    std::vector<float> output((batch_size() * output_height() * output_width() - 1) * output_pixel_stride() + channels() + XNN_EXTRA_BYTES / sizeof(float));
    std::vector<float> output_ref(batch_size() * output_height() * output_width() * channels());
    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); });
      std::fill(output.begin(), output.end(), nanf(""));

      // Compute reference results, without clamping.
      for (size_t i = 0; i < batch_size(); i++) {
        for (size_t oy = 0; oy < output_height(); oy++) {
          for (size_t ox = 0; ox < output_width(); ox++) {
            for (size_t c = 0; c < channels(); c++) {
              float max_value = -std::numeric_limits<float>::infinity();
              for (size_t py = 0; py < pooling_height(); py++) {
                const size_t iy = oy * stride_height() + py * dilation_height() - padding_top();
                for (size_t px = 0; px < pooling_width(); px++) {
                  const size_t ix = ox * stride_width() + px * dilation_width() - padding_left();
                  if (ix < input_width() && iy < input_height()) {
                    max_value = std::max(max_value,
                      input[((i * input_height() + iy) * input_width() + ix) * input_pixel_stride() + c]);
                  }
                }
              }
              output_ref[((i * output_height() + oy) * output_width() + ox) * channels() + c] = max_value;
            }
          }
        }
      }

      // Compute clamping parameters.
      const float accumulated_min = *std::min_element(output_ref.cbegin(), output_ref.cend());
      const float accumulated_max = *std::max_element(output_ref.cbegin(), output_ref.cend());
      const float accumulated_range = accumulated_max - accumulated_min;
      const float output_min = accumulated_range == 0.0f ?
        -std::numeric_limits<float>::infinity() :
        accumulated_min + accumulated_range / 255.0f * float(qmin());
      const float output_max = accumulated_range == 0.0f ?
        +std::numeric_limits<float>::infinity() :
        accumulated_max - accumulated_range / 255.0f * float(255 - qmax());

      // Clamp reference results.
      for (float& value : output_ref) {
        value = std::max(std::min(value, output_max), output_min);
      }

      // Create, setup, run, and destroy Max Pooling operator.
      ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
      xnn_operator_t max_pooling_op = nullptr;

      ASSERT_EQ(xnn_status_success,
        xnn_create_max_pooling2d_nhwc_f32(
          padding_tf_same() ? 0 : padding_top(), padding_tf_same() ? 0 : padding_right(),
          padding_tf_same() ? 0 : padding_bottom(), padding_tf_same() ? 0 : padding_left(),
          pooling_height(), pooling_width(),
          stride_height(), stride_width(),
          dilation_height(), dilation_width(),
          channels(), input_pixel_stride(), output_pixel_stride(),
          output_min, output_max,
          padding_tf_same() ? XNN_FLAG_TENSORFLOW_SAME_PADDING : 0,
          &max_pooling_op));
      ASSERT_NE(nullptr, max_pooling_op);

      // Smart pointer to automatically delete max_pooling_op.
      std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_max_pooling_op(max_pooling_op, xnn_delete_operator);

      ASSERT_EQ(xnn_status_success,
        xnn_setup_max_pooling2d_nhwc_f32(
          max_pooling_op,
          batch_size(), input_height(), input_width(),
          input.data(), output.data(),
          nullptr /* thread pool */));

      ASSERT_EQ(xnn_status_success,
        xnn_run_operator(max_pooling_op, nullptr /* thread pool */));

      // Verify results.
      for (size_t i = 0; i < batch_size(); i++) {
        for (size_t y = 0; y < output_height(); y++) {
          for (size_t x = 0; x < output_width(); x++) {
            for (size_t c = 0; c < channels(); c++) {
              ASSERT_LE(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + c], output_max);
              ASSERT_GE(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + c], output_min);
              ASSERT_EQ(output_ref[((i * output_height() + y) * output_width() + x) * channels() + c],
                output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + c]) <<
                "in batch index " << i << ", pixel (" << y << ", " << x << "), channel " << c
                << ", min = " << output_min << ", max = " << output_max;
            }
          }
        }
      }
    }
  }

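  // The TestSetup* variants additionally re-setup the same operator with
  // different batch size and input dimensions and run it a second time, to
  // verify that an operator can be resized after creation.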
  void TestSetupS8() const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    std::uniform_int_distribution<int32_t> i8dist(
      std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max());

    std::vector<int8_t> input(XNN_EXTRA_BYTES / sizeof(int8_t) + std::max<size_t>(
      (batch_size() * input_height() * input_width() - 1) * input_pixel_stride() + channels(),
      (next_batch_size() * next_input_height() * next_input_width() - 1) * input_pixel_stride() + channels()));
    std::vector<int8_t> output(XNN_EXTRA_BYTES / sizeof(int8_t) + std::max<size_t>(
      (batch_size() * output_height() * output_width() - 1) * output_pixel_stride() + channels(),
      (next_batch_size() * next_output_height() * next_output_width() - 1) * output_pixel_stride() + channels()));
    std::vector<float> output_ref(batch_size() * output_height() * output_width() * channels());
    std::vector<float> next_output_ref(next_batch_size() * next_output_height() * next_output_width() * channels());
    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      std::generate(input.begin(), input.end(), [&]() { return i8dist(rng); });
      std::fill(output.begin(), output.end(), INT8_C(0xA5));

      // Compute reference results.
      for (size_t i = 0; i < batch_size(); i++) {
        for (size_t oy = 0; oy < output_height(); oy++) {
          for (size_t ox = 0; ox < output_width(); ox++) {
            for (size_t c = 0; c < channels(); c++) {
              int8_t max_value = std::numeric_limits<int8_t>::min();
              for (size_t py = 0; py < pooling_height(); py++) {
                const size_t iy = oy * stride_height() + py * dilation_height() - padding_top();
                for (size_t px = 0; px < pooling_width(); px++) {
                  const size_t ix = ox * stride_width() + px * dilation_width() - padding_left();
                  if (ix < input_width() && iy < input_height()) {
                    max_value = std::max(max_value,
                      input[((i * input_height() + iy) * input_width() + ix) * input_pixel_stride() + c]);
                  }
                }
              }
              max_value = std::min(max_value, int8_t(qmax() - 0x80));
              max_value = std::max(max_value, int8_t(qmin() - 0x80));
              output_ref[((i * output_height() + oy) * output_width() + ox) * channels() + c] = max_value;
            }
          }
        }
      }

      // Create, setup, and run Max Pooling operator once.
      ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
      xnn_operator_t max_pooling_op = nullptr;

      ASSERT_EQ(xnn_status_success,
        xnn_create_max_pooling2d_nhwc_s8(
          padding_top(), padding_right(), padding_bottom(), padding_left(),
          pooling_height(), pooling_width(),
          stride_height(), stride_width(),
          dilation_height(), dilation_width(),
          channels(), input_pixel_stride(), output_pixel_stride(),
          int8_t(qmin() - 0x80), int8_t(qmax() - 0x80),
          0, &max_pooling_op));
      ASSERT_NE(nullptr, max_pooling_op);

      // Smart pointer to automatically delete max_pooling_op.
      std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_max_pooling_op(max_pooling_op, xnn_delete_operator);

      ASSERT_EQ(xnn_status_success,
        xnn_setup_max_pooling2d_nhwc_s8(
          max_pooling_op,
          batch_size(), input_height(), input_width(),
          input.data(), output.data(),
          nullptr /* thread pool */));

      ASSERT_EQ(xnn_status_success,
        xnn_run_operator(max_pooling_op, nullptr /* thread pool */));

      // Verify results of the first run.
      for (size_t i = 0; i < batch_size(); i++) {
        for (size_t y = 0; y < output_height(); y++) {
          for (size_t x = 0; x < output_width(); x++) {
            for (size_t c = 0; c < channels(); c++) {
              ASSERT_LE(int32_t(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + c]), int32_t(qmax() - 0x80));
              ASSERT_GE(int32_t(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + c]), int32_t(qmin() - 0x80));
              ASSERT_EQ(int32_t(output_ref[((i * output_height() + y) * output_width() + x) * channels() + c]),
                int32_t(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + c])) <<
                "in batch index " << i << ", pixel (" << y << ", " << x << "), channel " << c;
            }
          }
        }
      }

      // Re-generate data for the second run.
      std::generate(input.begin(), input.end(), [&]() { return i8dist(rng); });
      std::fill(output.begin(), output.end(), 0xA5);

      // Compute reference results for the second run.
      for (size_t i = 0; i < next_batch_size(); i++) {
        for (size_t oy = 0; oy < next_output_height(); oy++) {
          for (size_t ox = 0; ox < next_output_width(); ox++) {
            for (size_t c = 0; c < channels(); c++) {
              int8_t max_value = std::numeric_limits<int8_t>::min();
              for (size_t py = 0; py < pooling_height(); py++) {
                const size_t iy = oy * stride_height() + py * dilation_height() - padding_top();
                for (size_t px = 0; px < pooling_width(); px++) {
                  const size_t ix = ox * stride_width() + px * dilation_width() - padding_left();
                  if (ix < next_input_width() && iy < next_input_height()) {
                    max_value = std::max(max_value,
                      input[((i * next_input_height() + iy) * next_input_width() + ix) * input_pixel_stride() + c]);
                  }
                }
              }
              max_value = std::min(max_value, int8_t(qmax() - 0x80));
              max_value = std::max(max_value, int8_t(qmin() - 0x80));
              next_output_ref[((i * next_output_height() + oy) * next_output_width() + ox) * channels() + c] = max_value;
            }
          }
        }
      }

      // Setup and run Max Pooling operator the second time, and destroy the operator.
      ASSERT_EQ(xnn_status_success,
        xnn_setup_max_pooling2d_nhwc_s8(
          max_pooling_op,
          next_batch_size(), next_input_height(), next_input_width(),
          input.data(), output.data(),
          nullptr /* thread pool */));

      ASSERT_EQ(xnn_status_success,
        xnn_run_operator(max_pooling_op, nullptr /* thread pool */));

      // Verify results of the second run.
      for (size_t i = 0; i < next_batch_size(); i++) {
        for (size_t y = 0; y < next_output_height(); y++) {
          for (size_t x = 0; x < next_output_width(); x++) {
            for (size_t c = 0; c < channels(); c++) {
              ASSERT_LE(int32_t(output[((i * next_output_height() + y) * next_output_width() + x) * output_pixel_stride() + c]), int32_t(qmax() - 0x80));
              ASSERT_GE(int32_t(output[((i * next_output_height() + y) * next_output_width() + x) * output_pixel_stride() + c]), int32_t(qmin() - 0x80));
              ASSERT_EQ(int32_t(next_output_ref[((i * next_output_height() + y) * next_output_width() + x) * channels() + c]),
                int32_t(output[((i * next_output_height() + y) * next_output_width() + x) * output_pixel_stride() + c])) <<
                "in batch index " << i << ", pixel (" << y << ", " << x << "), channel " << c;
            }
          }
        }
      }
    }
  }

  void TestSetupU8() const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    std::uniform_int_distribution<int32_t> u8dist(
      std::numeric_limits<uint8_t>::min(), std::numeric_limits<uint8_t>::max());

    std::vector<uint8_t> input(XNN_EXTRA_BYTES / sizeof(uint8_t) + std::max<size_t>(
      (batch_size() * input_height() * input_width() - 1) * input_pixel_stride() + channels(),
      (next_batch_size() * next_input_height() * next_input_width() - 1) * input_pixel_stride() + channels()));
    std::vector<uint8_t> output(XNN_EXTRA_BYTES / sizeof(uint8_t) + std::max<size_t>(
      (batch_size() * output_height() * output_width() - 1) * output_pixel_stride() + channels(),
      (next_batch_size() * next_output_height() * next_output_width() - 1) * output_pixel_stride() + channels()));
    std::vector<float> output_ref(batch_size() * output_height() * output_width() * channels());
    std::vector<float> next_output_ref(next_batch_size() * next_output_height() * next_output_width() * channels());
    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      std::generate(input.begin(), input.end(), [&]() { return u8dist(rng); });
      std::fill(output.begin(), output.end(), 0xA5);

      // Compute reference results.
      for (size_t i = 0; i < batch_size(); i++) {
        for (size_t oy = 0; oy < output_height(); oy++) {
          for (size_t ox = 0; ox < output_width(); ox++) {
            for (size_t c = 0; c < channels(); c++) {
              uint8_t max_value = 0;
              for (size_t py = 0; py < pooling_height(); py++) {
                const size_t iy = oy * stride_height() + py * dilation_height() - padding_top();
                for (size_t px = 0; px < pooling_width(); px++) {
                  const size_t ix = ox * stride_width() + px * dilation_width() - padding_left();
                  if (ix < input_width() && iy < input_height()) {
                    max_value = std::max(max_value,
                      input[((i * input_height() + iy) * input_width() + ix) * input_pixel_stride() + c]);
                  }
                }
              }
              max_value = std::min(max_value, qmax());
              max_value = std::max(max_value, qmin());
              output_ref[((i * output_height() + oy) * output_width() + ox) * channels() + c] = max_value;
            }
          }
        }
      }

      // Create, setup, and run Max Pooling operator once.
      ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
      xnn_operator_t max_pooling_op = nullptr;

      ASSERT_EQ(xnn_status_success,
        xnn_create_max_pooling2d_nhwc_u8(
          padding_top(), padding_right(), padding_bottom(), padding_left(),
          pooling_height(), pooling_width(),
          stride_height(), stride_width(),
          dilation_height(), dilation_width(),
          channels(), input_pixel_stride(), output_pixel_stride(),
          qmin(), qmax(),
          0, &max_pooling_op));
      ASSERT_NE(nullptr, max_pooling_op);

      // Smart pointer to automatically delete max_pooling_op.
      std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_max_pooling_op(max_pooling_op, xnn_delete_operator);

      ASSERT_EQ(xnn_status_success,
        xnn_setup_max_pooling2d_nhwc_u8(
          max_pooling_op,
          batch_size(), input_height(), input_width(),
          input.data(), output.data(),
          nullptr /* thread pool */));

      ASSERT_EQ(xnn_status_success,
        xnn_run_operator(max_pooling_op, nullptr /* thread pool */));

      // Verify results of the first run.
      for (size_t i = 0; i < batch_size(); i++) {
        for (size_t y = 0; y < output_height(); y++) {
          for (size_t x = 0; x < output_width(); x++) {
            for (size_t c = 0; c < channels(); c++) {
              ASSERT_LE(uint32_t(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + c]), uint32_t(qmax()));
              ASSERT_GE(uint32_t(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + c]), uint32_t(qmin()));
              ASSERT_EQ(uint32_t(output_ref[((i * output_height() + y) * output_width() + x) * channels() + c]),
                uint32_t(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + c])) <<
                "in batch index " << i << ", pixel (" << y << ", " << x << "), channel " << c;
            }
          }
        }
      }

      // Re-generate data for the second run.
      std::generate(input.begin(), input.end(), [&]() { return u8dist(rng); });
      std::fill(output.begin(), output.end(), 0xA5);

      // Compute reference results for the second run.
      for (size_t i = 0; i < next_batch_size(); i++) {
        for (size_t oy = 0; oy < next_output_height(); oy++) {
          for (size_t ox = 0; ox < next_output_width(); ox++) {
            for (size_t c = 0; c < channels(); c++) {
              uint8_t max_value = 0;
              for (size_t py = 0; py < pooling_height(); py++) {
                const size_t iy = oy * stride_height() + py * dilation_height() - padding_top();
                for (size_t px = 0; px < pooling_width(); px++) {
                  const size_t ix = ox * stride_width() + px * dilation_width() - padding_left();
                  if (ix < next_input_width() && iy < next_input_height()) {
                    max_value = std::max(max_value,
                      input[((i * next_input_height() + iy) * next_input_width() + ix) * input_pixel_stride() + c]);
                  }
                }
              }
              max_value = std::min(max_value, qmax());
              max_value = std::max(max_value, qmin());
              next_output_ref[((i * next_output_height() + oy) * next_output_width() + ox) * channels() + c] = max_value;
            }
          }
        }
      }

      // Setup and run Max Pooling operator the second time, and destroy the operator.
      ASSERT_EQ(xnn_status_success,
        xnn_setup_max_pooling2d_nhwc_u8(
          max_pooling_op,
          next_batch_size(), next_input_height(), next_input_width(),
          input.data(), output.data(),
          nullptr /* thread pool */));

      ASSERT_EQ(xnn_status_success,
        xnn_run_operator(max_pooling_op, nullptr /* thread pool */));

      // Verify results of the second run.
      for (size_t i = 0; i < next_batch_size(); i++) {
        for (size_t y = 0; y < next_output_height(); y++) {
          for (size_t x = 0; x < next_output_width(); x++) {
            for (size_t c = 0; c < channels(); c++) {
              ASSERT_LE(uint32_t(output[((i * next_output_height() + y) * next_output_width() + x) * output_pixel_stride() + c]), uint32_t(qmax()));
              ASSERT_GE(uint32_t(output[((i * next_output_height() + y) * next_output_width() + x) * output_pixel_stride() + c]), uint32_t(qmin()));
              ASSERT_EQ(uint32_t(next_output_ref[((i * next_output_height() + y) * next_output_width() + x) * channels() + c]),
                uint32_t(output[((i * next_output_height() + y) * next_output_width() + x) * output_pixel_stride() + c])) <<
                "in batch index " << i << ", pixel (" << y << ", " << x << "), channel " << c;
            }
          }
        }
      }
    }
  }

  void TestSetupF16() const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    // Note: we need to avoid FP16 denormals in the generated tensor because they might be processed differently in
    // native vs emulated arithmetic, and we use exact comparison to verify the results against the reference.
1116     std::uniform_real_distribution<float> f32dist(0.001f, 1.0f);
1117 
1118     std::vector<uint16_t> input(XNN_EXTRA_BYTES / sizeof(uint16_t) + std::max<size_t>(
1119       (batch_size() * input_height() * input_width() - 1) * input_pixel_stride() + channels(),
1120       (next_batch_size() * next_input_height() * next_input_width() - 1) * input_pixel_stride() + channels()));
1121     std::vector<uint16_t> output(XNN_EXTRA_BYTES / sizeof(uint16_t) + std::max<size_t>(
1122       (batch_size() * output_height() * output_width() - 1) * output_pixel_stride() + channels(),
1123       (next_batch_size() * next_output_height() * next_output_width() - 1) * output_pixel_stride() + channels()));
1124     std::vector<float> output_ref(batch_size() * output_height() * output_width() * channels());
1125     std::vector<float> next_output_ref(next_batch_size() * next_output_height() * next_output_width() * channels());
1126     for (size_t iteration = 0; iteration < iterations(); iteration++) {
1127       std::generate(input.begin(), input.end(), [&]() { return fp16_ieee_from_fp32_value(f32dist(rng)); });
1128       std::fill(output.begin(), output.end(), UINT16_C(0x7E00) /* NaN */);
1129 
1130       // Compute reference results, without clamping.
1131       for (size_t i = 0; i < batch_size(); i++) {
1132         for (size_t oy = 0; oy < output_height(); oy++) {
1133           for (size_t ox = 0; ox < output_width(); ox++) {
1134             for (size_t c = 0; c < channels(); c++) {
1135               float max_value = -std::numeric_limits<float>::infinity();
1136               for (size_t py = 0; py < pooling_height(); py++) {
1137                 const size_t iy = oy * stride_height() + py * dilation_height() - padding_top();
1138                 for (size_t px = 0; px < pooling_width(); px++) {
1139                   const size_t ix = ox * stride_width() + px * dilation_width() - padding_left();
1140                   if (ix < input_width() && iy < input_height()) {
1141                     max_value = std::max(max_value,
1142                       fp16_ieee_to_fp32_value(input[((i * input_height() + iy) * input_width() + ix) * input_pixel_stride() + c]));
1143                   }
1144                 }
1145               }
1146               output_ref[((i * output_height() + oy) * output_width() + ox) * channels() + c] = max_value;
1147             }
1148           }
1149         }
1150       }
1151 
1152       // Compute clamping parameters.
1153       const float accumulated_min = *std::min_element(output_ref.cbegin(), output_ref.cend());
1154       const float accumulated_max = *std::max_element(output_ref.cbegin(), output_ref.cend());
1155       const float accumulated_range = accumulated_max - accumulated_min;
1156       float output_min = accumulated_min + accumulated_range / 255.0f * float(qmin());
1157       float output_max = accumulated_max - accumulated_range / 255.0f * float(255 - qmax());
      output_min = fp16_ieee_to_fp32_value(fp16_ieee_from_fp32_value(output_min));
      output_max = fp16_ieee_to_fp32_value(fp16_ieee_from_fp32_value(output_max));
      if (accumulated_range == 0.0f) {
        output_min = -std::numeric_limits<float>::infinity();
        output_max = +std::numeric_limits<float>::infinity();
      }
      if (qmin() == std::numeric_limits<uint8_t>::min()) {
        output_min = -std::numeric_limits<float>::infinity();
      }
      if (qmax() == std::numeric_limits<uint8_t>::max()) {
        output_max = +std::numeric_limits<float>::infinity();
      }

      // Clamp reference results.
      for (float& value : output_ref) {
        value = std::max(std::min(value, output_max), output_min);
      }

      // Create, setup, and run Max Pooling operator once.
      ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
      xnn_operator_t max_pooling_op = nullptr;

      const xnn_status status = xnn_create_max_pooling2d_nhwc_f16(
          padding_top(), padding_right(), padding_bottom(), padding_left(),
          pooling_height(), pooling_width(),
          stride_height(), stride_width(),
          dilation_height(), dilation_width(),
          channels(), input_pixel_stride(), output_pixel_stride(),
          output_min, output_max,
          0, &max_pooling_op);
      if (status == xnn_status_unsupported_hardware) {
        GTEST_SKIP();
      }
      ASSERT_EQ(xnn_status_success, status);
      ASSERT_NE(nullptr, max_pooling_op);

      // Smart pointer to automatically delete max_pooling_op.
      std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_max_pooling_op(max_pooling_op, xnn_delete_operator);

      ASSERT_EQ(xnn_status_success,
        xnn_setup_max_pooling2d_nhwc_f16(
          max_pooling_op,
          batch_size(), input_height(), input_width(),
          input.data(), output.data(),
          nullptr /* thread pool */));

      ASSERT_EQ(xnn_status_success,
        xnn_run_operator(max_pooling_op, nullptr /* thread pool */));

      // Verify results of the first run.
      for (size_t i = 0; i < batch_size(); i++) {
        for (size_t y = 0; y < output_height(); y++) {
          for (size_t x = 0; x < output_width(); x++) {
            for (size_t c = 0; c < channels(); c++) {
              ASSERT_LE(fp16_ieee_to_fp32_value(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + c]), output_max);
              ASSERT_GE(fp16_ieee_to_fp32_value(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + c]), output_min);
              ASSERT_EQ(
                  fp16_ieee_to_fp32_value(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + c]),
                  output_ref[((i * output_height() + y) * output_width() + x) * channels() + c]) <<
                "in batch index " << i << ", pixel (" << y << ", " << x << "), channel " << c
                << ", min = " << output_min << ", max = " << output_max;
            }
          }
        }
      }

      // Re-generate data for the second run.
      std::generate(input.begin(), input.end(), [&]() { return fp16_ieee_from_fp32_value(f32dist(rng)); });
      std::fill(output.begin(), output.end(), UINT16_C(0x7E00) /* NaN */);

      // Compute reference results for the second run, including clamping.
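      // The clamping thresholds were fixed when the operator was created, so the second run reuses the
      // output_min/output_max derived from the first run's data.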
      for (size_t i = 0; i < next_batch_size(); i++) {
        for (size_t oy = 0; oy < next_output_height(); oy++) {
          for (size_t ox = 0; ox < next_output_width(); ox++) {
            for (size_t c = 0; c < channels(); c++) {
              float max_value = -std::numeric_limits<float>::infinity();
              for (size_t py = 0; py < pooling_height(); py++) {
                const size_t iy = oy * stride_height() + py * dilation_height() - padding_top();
                for (size_t px = 0; px < pooling_width(); px++) {
                  const size_t ix = ox * stride_width() + px * dilation_width() - padding_left();
                  if (ix < next_input_width() && iy < next_input_height()) {
                    max_value = std::max(max_value,
                      fp16_ieee_to_fp32_value(input[((i * next_input_height() + iy) * next_input_width() + ix) * input_pixel_stride() + c]));
                  }
                }
              }
              max_value = std::min(max_value, output_max);
              max_value = std::max(max_value, output_min);
              next_output_ref[((i * next_output_height() + oy) * next_output_width() + ox) * channels() + c] = max_value;
            }
          }
        }
      }

      // Setup and run Max Pooling operator the second time, and destroy the operator.
      ASSERT_EQ(xnn_status_success,
        xnn_setup_max_pooling2d_nhwc_f16(
          max_pooling_op,
          next_batch_size(), next_input_height(), next_input_width(),
          input.data(), output.data(),
          nullptr /* thread pool */));

      ASSERT_EQ(xnn_status_success,
        xnn_run_operator(max_pooling_op, nullptr /* thread pool */));

      // Verify results of the second run.
      for (size_t i = 0; i < next_batch_size(); i++) {
        for (size_t y = 0; y < next_output_height(); y++) {
          for (size_t x = 0; x < next_output_width(); x++) {
            for (size_t c = 0; c < channels(); c++) {
              ASSERT_LE(fp16_ieee_to_fp32_value(output[((i * next_output_height() + y) * next_output_width() + x) * output_pixel_stride() + c]), output_max);
              ASSERT_GE(fp16_ieee_to_fp32_value(output[((i * next_output_height() + y) * next_output_width() + x) * output_pixel_stride() + c]), output_min);
              ASSERT_EQ(
                  fp16_ieee_to_fp32_value(output[((i * next_output_height() + y) * next_output_width() + x) * output_pixel_stride() + c]),
                  next_output_ref[((i * next_output_height() + y) * next_output_width() + x) * channels() + c]) <<
                "in batch index " << i << ", pixel (" << y << ", " << x << "), channel " << c
                << ", min = " << output_min << ", max = " << output_max;
            }
          }
        }
      }
    }
  }

  void TestSetupF32() const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    std::uniform_real_distribution<float> f32dist;
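    // A default-constructed distribution draws from [0.0f, 1.0f). Unlike the FP16 variant, FP32 outputs are
    // compared in the same precision they were generated in, so denormals need no special handling here.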

    std::vector<float> input(XNN_EXTRA_BYTES / sizeof(float) + std::max<size_t>(
      (batch_size() * input_height() * input_width() - 1) * input_pixel_stride() + channels(),
      (next_batch_size() * next_input_height() * next_input_width() - 1) * input_pixel_stride() + channels()));
    std::vector<float> output(XNN_EXTRA_BYTES / sizeof(float) + std::max<size_t>(
      (batch_size() * output_height() * output_width() - 1) * output_pixel_stride() + channels(),
      (next_batch_size() * next_output_height() * next_output_width() - 1) * output_pixel_stride() + channels()));
    std::vector<float> output_ref(batch_size() * output_height() * output_width() * channels());
    std::vector<float> next_output_ref(next_batch_size() * next_output_height() * next_output_width() * channels());
    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); });
      std::fill(output.begin(), output.end(), std::nanf(""));

      // Compute reference results, without clamping.
      for (size_t i = 0; i < batch_size(); i++) {
        for (size_t oy = 0; oy < output_height(); oy++) {
          for (size_t ox = 0; ox < output_width(); ox++) {
            for (size_t c = 0; c < channels(); c++) {
              float max_value = -std::numeric_limits<float>::infinity();
              for (size_t py = 0; py < pooling_height(); py++) {
                const size_t iy = oy * stride_height() + py * dilation_height() - padding_top();
                for (size_t px = 0; px < pooling_width(); px++) {
                  const size_t ix = ox * stride_width() + px * dilation_width() - padding_left();
                  if (ix < input_width() && iy < input_height()) {
                    max_value = std::max(max_value,
                      input[((i * input_height() + iy) * input_width() + ix) * input_pixel_stride() + c]);
                  }
                }
              }
              output_ref[((i * output_height() + oy) * output_width() + ox) * channels() + c] = max_value;
            }
          }
        }
      }

      // Compute clamping parameters.
      const float accumulated_min = *std::min_element(output_ref.cbegin(), output_ref.cend());
      const float accumulated_max = *std::max_element(output_ref.cbegin(), output_ref.cend());
      const float accumulated_range = accumulated_max - accumulated_min;
      const float output_min = accumulated_range == 0.0f ?
        -std::numeric_limits<float>::infinity() :
        accumulated_min + accumulated_range / 255.0f * float(qmin());
      const float output_max = accumulated_range == 0.0f ?
        +std::numeric_limits<float>::infinity() :
        accumulated_max - accumulated_range / 255.0f * float(255 - qmax());
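      // When every reference output is identical the range collapses to zero, so clamping is disabled rather
      // than producing degenerate (equal) thresholds.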

      // Clamp reference results.
      for (float& value : output_ref) {
        value = std::max(std::min(value, output_max), output_min);
      }

      // Create, setup, and run Max Pooling operator once.
      ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
      xnn_operator_t max_pooling_op = nullptr;

      ASSERT_EQ(xnn_status_success,
        xnn_create_max_pooling2d_nhwc_f32(
          padding_top(), padding_right(), padding_bottom(), padding_left(),
          pooling_height(), pooling_width(),
          stride_height(), stride_width(),
          dilation_height(), dilation_width(),
          channels(), input_pixel_stride(), output_pixel_stride(),
          output_min, output_max,
          0, &max_pooling_op));
      ASSERT_NE(nullptr, max_pooling_op);

      // Smart pointer to automatically delete max_pooling_op.
      std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_max_pooling_op(max_pooling_op, xnn_delete_operator);

      ASSERT_EQ(xnn_status_success,
        xnn_setup_max_pooling2d_nhwc_f32(
          max_pooling_op,
          batch_size(), input_height(), input_width(),
          input.data(), output.data(),
          nullptr /* thread pool */));

      ASSERT_EQ(xnn_status_success,
        xnn_run_operator(max_pooling_op, nullptr /* thread pool */));

      // Verify results of the first run.
      for (size_t i = 0; i < batch_size(); i++) {
        for (size_t y = 0; y < output_height(); y++) {
          for (size_t x = 0; x < output_width(); x++) {
            for (size_t c = 0; c < channels(); c++) {
              ASSERT_LE(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + c], output_max);
              ASSERT_GE(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + c], output_min);
              ASSERT_EQ(output_ref[((i * output_height() + y) * output_width() + x) * channels() + c],
                output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + c]) <<
                "in batch index " << i << ", pixel (" << y << ", " << x << "), channel " << c;
            }
          }
        }
      }

      // Re-generate data for the second run.
      std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); });
      std::fill(output.begin(), output.end(), std::nanf(""));

      // Compute reference results for the second run, including clamping.
      for (size_t i = 0; i < next_batch_size(); i++) {
        for (size_t oy = 0; oy < next_output_height(); oy++) {
          for (size_t ox = 0; ox < next_output_width(); ox++) {
            for (size_t c = 0; c < channels(); c++) {
              float max_value = -std::numeric_limits<float>::infinity();
              for (size_t py = 0; py < pooling_height(); py++) {
                const size_t iy = oy * stride_height() + py * dilation_height() - padding_top();
                for (size_t px = 0; px < pooling_width(); px++) {
                  const size_t ix = ox * stride_width() + px * dilation_width() - padding_left();
                  if (ix < next_input_width() && iy < next_input_height()) {
                    max_value = std::max(max_value,
                      input[((i * next_input_height() + iy) * next_input_width() + ix) * input_pixel_stride() + c]);
                  }
                }
              }
              max_value = std::min(max_value, output_max);
              max_value = std::max(max_value, output_min);
              next_output_ref[((i * next_output_height() + oy) * next_output_width() + ox) * channels() + c] = max_value;
            }
          }
        }
      }

      // Setup and run Max Pooling operator the second time, and destroy the operator.
      ASSERT_EQ(xnn_status_success,
        xnn_setup_max_pooling2d_nhwc_f32(
          max_pooling_op,
          next_batch_size(), next_input_height(), next_input_width(),
          input.data(), output.data(),
          nullptr /* thread pool */));

      ASSERT_EQ(xnn_status_success,
        xnn_run_operator(max_pooling_op, nullptr /* thread pool */));

      // Verify results of the second run.
      for (size_t i = 0; i < next_batch_size(); i++) {
        for (size_t y = 0; y < next_output_height(); y++) {
          for (size_t x = 0; x < next_output_width(); x++) {
            for (size_t c = 0; c < channels(); c++) {
              ASSERT_LE(output[((i * next_output_height() + y) * next_output_width() + x) * output_pixel_stride() + c], output_max);
              ASSERT_GE(output[((i * next_output_height() + y) * next_output_width() + x) * output_pixel_stride() + c], output_min);
              ASSERT_EQ(next_output_ref[((i * next_output_height() + y) * next_output_width() + x) * channels() + c],
                output[((i * next_output_height() + y) * next_output_width() + x) * output_pixel_stride() + c]) <<
                "in batch index " << i << ", pixel (" << y << ", " << x << "), channel " << c;
            }
          }
        }
      }
    }
  }

 private:
  uint32_t padding_top_{0};
  uint32_t padding_right_{0};
  uint32_t padding_bottom_{0};
  uint32_t padding_left_{0};
  bool padding_tf_same_{false};
  size_t input_height_{1};
  size_t input_width_{1};
  size_t channels_{1};
  size_t batch_size_{1};
  size_t input_pixel_stride_{0};
  size_t output_pixel_stride_{0};
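  // Note: zero-valued pixel strides denote the dense layout; the corresponding accessors (defined earlier in
  // this file) are expected to fall back to channels() in that case.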
  uint32_t pooling_height_{1};
  uint32_t pooling_width_{1};
  uint32_t stride_height_{1};
  uint32_t stride_width_{1};
  uint32_t dilation_height_{1};
  uint32_t dilation_width_{1};
  size_t next_input_height_{0};
  size_t next_input_width_{0};
  size_t next_batch_size_{0};
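  // Note: zero-valued next_* fields similarly mean "same as the first run"; the next_*() accessors are
  // expected to fall back to the first-run dimensions when these are left unset.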
  uint8_t qmin_{0};
  uint8_t qmax_{255};
  size_t iterations_{1};
};
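
// A minimal usage sketch (hypothetical, not part of this header): a gtest translation unit that includes this
// file could exercise the setup-with-resize path roughly like this, assuming the obvious fluent setters for the
// private fields above (batch_size, next_batch_size, input_size, pooling_size, channels) exist earlier in this
// file:
//
//   TEST(MAX_POOLING_NHWC_F32, setup_changing_batch) {
//     MaxPoolingOperatorTester()
//       .batch_size(2)
//       .next_batch_size(5)
//       .input_size(8, 8)
//       .pooling_size(2, 2)
//       .channels(3)
//       .TestSetupF32();
//   }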