1 // Copyright (c) Facebook, Inc. and its affiliates.
2 // All rights reserved.
3 //
4 // Copyright 2019 Google LLC
5 //
6 // This source code is licensed under the BSD-style license found in the
7 // LICENSE file in the root directory of this source tree.
8 
9 #pragma once
10 
11 #include <gtest/gtest.h>
12 
13 #include <algorithm>
14 #include <cassert>
15 #include <cmath>
16 #include <cstddef>
17 #include <cstdlib>
18 #include <limits>
19 #include <random>
20 #include <vector>
21 
22 #include "convolution-test-helpers.h"
23 #include <fp16.h>
24 
25 #include <xnnpack.h>
26 #include <xnnpack/cache.h>
27 #include <xnnpack/allocator.h>
28 
29 
30 class ConvolutionOperatorTester {
31  public:
32   enum class WeightsType {
33     Default,
34     FP32,
35   };
36 
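  // The setters below return *this so a test can chain its configuration and
  // finish with one of the Test* methods. A minimal usage sketch (the values
  // are illustrative only, not taken from any particular XNNPACK test):
  //
  //   ConvolutionOperatorTester()
  //     .input_size(10, 9)
  //     .kernel_size(3, 3)
  //     .padding(1)
  //     .group_input_channels(8)
  //     .group_output_channels(16)
  //     .iterations(3)
  //     .TestNHWCxF32();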
37   inline ConvolutionOperatorTester& padding_tf_same(bool padding_same) {
38     if (padding_same) {
39       assert(padding_top() == 0);
40       assert(padding_left() == 0);
41       assert(padding_bottom() == 0);
42       assert(padding_right() == 0);
43     }
44     this->padding_tf_same_ = padding_same;
45     return *this;
46   }
47 
48   inline bool padding_tf_same() const {
49     return this->padding_tf_same_;
50   }
51 
52   inline ConvolutionOperatorTester& padding(uint32_t padding) {
53     assert(!padding_tf_same());
54     this->padding_top_ = padding;
55     this->padding_right_ = padding;
56     this->padding_bottom_ = padding;
57     this->padding_left_ = padding;
58     return *this;
59   }
60 
61   inline ConvolutionOperatorTester& padding(uint32_t padding_height, uint32_t padding_width) {
62     assert(!padding_tf_same());
63     this->padding_top_ = padding_height;
64     this->padding_right_ = padding_width;
65     this->padding_bottom_ = padding_height;
66     this->padding_left_ = padding_width;
67     return *this;
68   }
69 
70   inline ConvolutionOperatorTester& padding_height(uint32_t padding_height) {
71     assert(!padding_tf_same());
72     this->padding_top_ = padding_height;
73     this->padding_bottom_ = padding_height;
74     return *this;
75   }
76 
77   inline ConvolutionOperatorTester& padding_width(uint32_t padding_width) {
78     assert(!padding_tf_same());
79     this->padding_right_ = padding_width;
80     this->padding_left_ = padding_width;
81     return *this;
82   }
83 
84   inline ConvolutionOperatorTester& padding_top(uint32_t padding_top) {
85     assert(!padding_tf_same());
86     this->padding_top_ = padding_top;
87     return *this;
88   }
89 
90   inline uint32_t padding_top() const {
91     if (padding_tf_same()) {
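      // TensorFlow SAME padding: the total padding is whatever is needed so that
      // output_height() == ceil(input_height() / subsampling_height()); the top
      // edge gets the smaller half and the bottom edge gets the remainder.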
92       const uint32_t total_padding_height =
93         (output_height() - 1) * subsampling_height() + dilated_kernel_height() - input_height();
94       return total_padding_height / 2;
95     } else {
96       return this->padding_top_;
97     }
98   }
99 
100   inline ConvolutionOperatorTester& padding_left(uint32_t padding_left) {
101     assert(!padding_tf_same());
102     this->padding_left_ = padding_left;
103     return *this;
104   }
105 
106   inline uint32_t padding_left() const {
107     if (padding_tf_same()) {
108       const uint32_t total_padding_width =
109         (output_width() - 1) * subsampling_width() + dilated_kernel_width() - input_width();
110       return total_padding_width / 2;
111     } else {
112       return this->padding_left_;
113     }
114   }
115 
116   inline ConvolutionOperatorTester& padding_bottom(uint32_t padding_bottom) {
117     assert(!padding_tf_same());
118     this->padding_bottom_ = padding_bottom;
119     return *this;
120   }
121 
122   inline uint32_t padding_bottom() const {
123     if (padding_tf_same()) {
124       const uint32_t total_padding_height =
125         (output_height() - 1) * subsampling_height() + dilated_kernel_height() - input_height();
126       return total_padding_height - total_padding_height / 2;
127     } else {
128       return this->padding_bottom_;
129     }
130   }
131 
132   inline ConvolutionOperatorTester& padding_right(uint32_t padding_right) {
133     assert(!padding_tf_same());
134     this->padding_right_ = padding_right;
135     return *this;
136   }
137 
138   inline uint32_t padding_right() const {
139     if (padding_tf_same()) {
140       const uint32_t total_padding_width =
141         (output_width() - 1) * subsampling_width() + dilated_kernel_width() - input_width();
142       return total_padding_width - total_padding_width / 2;
143     } else {
144       return this->padding_right_;
145     }
146   }
147 
148   inline ConvolutionOperatorTester& input_size(uint32_t input_height, uint32_t input_width) {
149     assert(input_height >= 1);
150     assert(input_width >= 1);
151     this->input_height_ = input_height;
152     this->input_width_ = input_width;
153     return *this;
154   }
155 
156   inline ConvolutionOperatorTester& input_height(uint32_t input_height) {
157     assert(input_height >= 1);
158     this->input_height_ = input_height;
159     return *this;
160   }
161 
162   inline uint32_t input_height() const {
163     return this->input_height_;
164   }
165 
166   inline ConvolutionOperatorTester& input_width(uint32_t input_width) {
167     assert(input_width >= 1);
168     this->input_width_ = input_width;
169     return *this;
170   }
171 
172   inline uint32_t input_width() const {
173     return this->input_width_;
174   }
175 
176   inline ConvolutionOperatorTester& groups(uint32_t groups) {
177     assert(groups >= 1);
178     this->groups_ = groups;
179     return *this;
180   }
181 
182   inline uint32_t groups() const {
183     return this->groups_;
184   }
185 
186   inline ConvolutionOperatorTester& group_input_channels(size_t group_input_channels) {
187     assert(group_input_channels >= 1);
188     this->group_input_channels_ = group_input_channels;
189     return *this;
190   }
191 
192   inline size_t group_input_channels() const {
193     return this->group_input_channels_;
194   }
195 
196   inline ConvolutionOperatorTester& group_output_channels(size_t group_output_channels) {
197     assert(group_output_channels >= 1);
198     this->group_output_channels_ = group_output_channels;
199     return *this;
200   }
201 
202   inline size_t group_output_channels() const {
203     return this->group_output_channels_;
204   }
205 
206   inline ConvolutionOperatorTester& batch_size(size_t batch_size) {
207     assert(batch_size >= 1);
208     this->batch_size_ = batch_size;
209     return *this;
210   }
211 
212   inline size_t batch_size() const {
213     return this->batch_size_;
214   }
215 
216   inline ConvolutionOperatorTester& kernel_size(uint32_t kernel_size) {
217     assert(kernel_size >= 1);
218     this->kernel_height_ = kernel_size;
219     this->kernel_width_ = kernel_size;
220     return *this;
221   }
222 
223   inline ConvolutionOperatorTester& kernel_size(uint32_t kernel_height, uint32_t kernel_width) {
224     assert(kernel_height >= 1);
225     assert(kernel_width >= 1);
226     this->kernel_height_ = kernel_height;
227     this->kernel_width_ = kernel_width;
228     return *this;
229   }
230 
231   inline ConvolutionOperatorTester& kernel_height(uint32_t kernel_height) {
232     assert(kernel_height >= 1);
233     this->kernel_height_ = kernel_height;
234     return *this;
235   }
236 
237   inline uint32_t kernel_height() const {
238     return this->kernel_height_;
239   }
240 
241   inline ConvolutionOperatorTester& kernel_width(uint32_t kernel_width) {
242     assert(kernel_width >= 1);
243     this->kernel_width_ = kernel_width;
244     return *this;
245   }
246 
247   inline uint32_t kernel_width() const {
248     return this->kernel_width_;
249   }
250 
251   inline ConvolutionOperatorTester& dilation(uint32_t dilation) {
252     assert(dilation >= 1);
253     this->dilation_height_ = dilation;
254     this->dilation_width_ = dilation;
255     return *this;
256   }
257 
258   inline ConvolutionOperatorTester& dilation(uint32_t dilation_height, uint32_t dilation_width) {
259     assert(dilation_height >= 1);
260     assert(dilation_width >= 1);
261     this->dilation_height_ = dilation_height;
262     this->dilation_width_ = dilation_width;
263     return *this;
264   }
265 
266   inline ConvolutionOperatorTester& dilation_height(uint32_t dilation_height) {
267     assert(dilation_height >= 1);
268     this->dilation_height_ = dilation_height;
269     return *this;
270   }
271 
272   inline uint32_t dilation_height() const {
273     return this->dilation_height_;
274   }
275 
276   inline ConvolutionOperatorTester& dilation_width(uint32_t dilation_width) {
277     assert(dilation_width >= 1);
278     this->dilation_width_ = dilation_width;
279     return *this;
280   }
281 
282   inline uint32_t dilation_width() const {
283     return this->dilation_width_;
284   }
285 
286   inline ConvolutionOperatorTester& subsampling(uint32_t subsampling) {
287     assert(subsampling >= 1);
288     this->subsampling_height_ = subsampling;
289     this->subsampling_width_ = subsampling;
290     return *this;
291   }
292 
293   inline ConvolutionOperatorTester& subsampling(uint32_t subsampling_height, uint32_t subsampling_width) {
294     assert(subsampling_height >= 1);
295     assert(subsampling_width >= 1);
296     this->subsampling_height_ = subsampling_height;
297     this->subsampling_width_ = subsampling_width;
298     return *this;
299   }
300 
301   inline ConvolutionOperatorTester& subsampling_height(uint32_t subsampling_height) {
302     assert(subsampling_height >= 1);
303     this->subsampling_height_ = subsampling_height;
304     return *this;
305   }
306 
307   inline uint32_t subsampling_height() const {
308     return this->subsampling_height_;
309   }
310 
311   inline ConvolutionOperatorTester& subsampling_width(uint32_t subsampling_width) {
312     assert(subsampling_width >= 1);
313     this->subsampling_width_ = subsampling_width;
314     return *this;
315   }
316 
317   inline uint32_t subsampling_width() const {
318     return this->subsampling_width_;
319   }
320 
321   inline ConvolutionOperatorTester& input_channel_stride(size_t input_channel_stride) {
322     assert(input_channel_stride >= 1);
323     this->input_channel_stride_ = input_channel_stride;
324     return *this;
325   }
326 
327   inline size_t input_channel_stride() const {
328     if (this->input_channel_stride_ == 0) {
329       return group_input_channels() * groups();
330     } else {
331       assert(this->input_channel_stride_ >= group_input_channels() * groups());
332       return this->input_channel_stride_;
333     }
334   }
335 
336   inline ConvolutionOperatorTester& output_channel_stride(size_t output_channel_stride) {
337     assert(output_channel_stride >= 1);
338     this->output_channel_stride_ = output_channel_stride;
339     return *this;
340   }
341 
342   inline size_t output_channel_stride() const {
343     if (this->output_channel_stride_ == 0) {
344       return group_output_channels() * groups();
345     } else {
346       assert(this->output_channel_stride_ >= group_output_channels() * groups());
347       return this->output_channel_stride_;
348     }
349   }
350 
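  // Effective ("dilated") kernel extent: a kernel of size k with dilation d
  // covers (k - 1) * d + 1 input pixels along the corresponding axis.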
351   inline uint32_t dilated_kernel_height() const {
352     return (kernel_height() - 1) * dilation_height() + 1;
353   }
354 
355   inline uint32_t dilated_kernel_width() const {
356     return (kernel_width() - 1) * dilation_width() + 1;
357   }
358 
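  // Expected output size: with explicit padding this is the usual convolution
  // formula output = (padded_input - dilated_kernel) / stride + 1, clamped to
  // at least 1; with TF SAME padding it is output = ceil(input / stride).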
359   inline size_t output_height() const {
360     if (padding_tf_same()) {
361       return (input_height() + subsampling_height() - 1) / subsampling_height();
362     } else {
363       const size_t padded_input_height = padding_top() + input_height() + padding_bottom();
364       if (padded_input_height <= dilated_kernel_height()) {
365         return 1;
366       } else {
367         return (padded_input_height - dilated_kernel_height()) / subsampling_height() + 1;
368       }
369     }
370   }
371 
372   inline size_t output_width() const {
373     if (padding_tf_same()) {
374       return (input_width() + subsampling_width() - 1) / subsampling_width();
375     } else {
376       const size_t padded_input_width = padding_left() + input_width() + padding_right();
377       if (padded_input_width <= dilated_kernel_width()) {
378         return 1;
379       } else {
380         return (padded_input_width - dilated_kernel_width()) / subsampling_width() + 1;
381       }
382     }
383   }
384 
385   inline ConvolutionOperatorTester& next_input_size(uint32_t next_input_height, uint32_t next_input_width) {
386     assert(next_input_height >= 1);
387     assert(next_input_width >= 1);
388     this->next_input_height_ = next_input_height;
389     this->next_input_width_ = next_input_width;
390     return *this;
391   }
392 
393   inline ConvolutionOperatorTester& next_input_height(uint32_t next_input_height) {
394     assert(next_input_height >= 1);
395     this->next_input_height_ = next_input_height;
396     return *this;
397   }
398 
399   inline uint32_t next_input_height() const {
400     if (this->next_input_height_ == 0) {
401       return input_height();
402     } else {
403       return this->next_input_height_;
404     }
405   }
406 
407   inline ConvolutionOperatorTester& next_input_width(uint32_t next_input_width) {
408     assert(next_input_width >= 1);
409     this->next_input_width_ = next_input_width;
410     return *this;
411   }
412 
413   inline uint32_t next_input_width() const {
414     if (this->next_input_width_ == 0) {
415       return input_width();
416     } else {
417       return this->next_input_width_;
418     }
419   }
420 
421   inline size_t next_output_height() const {
422     const size_t padded_input_height = padding_top() + next_input_height() + padding_bottom();
423     if (padded_input_height <= dilated_kernel_height()) {
424       return 1;
425     } else {
426       return (padded_input_height - dilated_kernel_height()) / subsampling_height() + 1;
427     }
428   }
429 
430   inline size_t next_output_width() const {
431     const size_t padded_input_width = padding_left() + next_input_width() + padding_right();
432     if (padded_input_width <= dilated_kernel_width()) {
433       return 1;
434     } else {
435       return (padded_input_width - dilated_kernel_width()) / subsampling_width() + 1;
436     }
437   }
438 
439   inline ConvolutionOperatorTester& next_batch_size(size_t next_batch_size) {
440     assert(next_batch_size >= 1);
441     this->next_batch_size_ = next_batch_size;
442     return *this;
443   }
444 
445   inline size_t next_batch_size() const {
446     if (this->next_batch_size_ == 0) {
447       return batch_size();
448     } else {
449       return this->next_batch_size_;
450     }
451   }
452 
453   inline ConvolutionOperatorTester& sparsity(float sparsity) {
454     this->sparsity_ = sparsity;
455     return *this;
456   }
457 
458   inline float sparsity() const {
459     return this->sparsity_;
460   }
461 
462   inline ConvolutionOperatorTester& qmin(uint8_t qmin) {
463     this->qmin_ = qmin;
464     return *this;
465   }
466 
467   inline uint8_t qmin() const {
468     return this->qmin_;
469   }
470 
471   inline ConvolutionOperatorTester& qmax(uint8_t qmax) {
472     this->qmax_ = qmax;
473     return *this;
474   }
475 
476   inline uint8_t qmax() const {
477     return this->qmax_;
478   }
479 
480   inline ConvolutionOperatorTester& force_nhwc_input(bool force_nhwc_input) {
481     this->force_nhwc_input_ = force_nhwc_input;
482     return *this;
483   }
484 
485   inline bool force_nhwc_input() const {
486     return this->force_nhwc_input_;
487   }
488 
489   inline ConvolutionOperatorTester& depthwise_layout(bool depthwise_layout) {
490     this->depthwise_layout_ = depthwise_layout;
491     return *this;
492   }
493 
494   inline bool depthwise_layout() const {
495     return this->depthwise_layout_;
496   }
497 
498   inline ConvolutionOperatorTester& has_bias(bool has_bias) {
499     this->has_bias_ = has_bias;
500     return *this;
501   }
502 
503   inline bool has_bias() const {
504     return this->has_bias_;
505   }
506 
507   inline ConvolutionOperatorTester& weights_type(WeightsType weights_type) {
508     this->weights_type_ = weights_type;
509     return *this;
510   }
511 
512   inline WeightsType weights_type() const {
513     return this->weights_type_;
514   }
515 
516   inline ConvolutionOperatorTester& iterations(size_t iterations) {
517     this->iterations_ = iterations;
518     return *this;
519   }
520 
521   inline size_t iterations() const {
522     return this->iterations_;
523   }
524 
525 #if XNN_PLATFORM_JIT
526   inline ConvolutionOperatorTester& use_jit(bool use_jit) {
527     this->use_jit_ = use_jit;
528     return *this;
529   }
530 
531   inline bool use_jit() const {
532     return this->use_jit_;
533   }
534 #endif
535 
536   inline ConvolutionOperatorTester& use_weights_cache(bool use_weights_cache) {
537     this->use_weights_cache_ = use_weights_cache;
538     return *this;
539   }
540 
541   inline bool use_weights_cache() const {
542     return this->use_weights_cache_;
543   }
544 
545   void TestNHWCxQC8() const {
546     ASSERT_EQ(weights_type(), WeightsType::Default);
547 
548     std::random_device random_device;
549     auto rng = std::mt19937(random_device());
550     std::uniform_int_distribution<int32_t> i32dist(-10000, 10000);
551     std::uniform_int_distribution<int32_t> i8dist(
552       std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max());
553     std::uniform_int_distribution<int32_t> w8dist(
554       -std::numeric_limits<int8_t>::max(), std::numeric_limits<int8_t>::max());
555 
556     std::vector<int8_t> input(XNN_EXTRA_BYTES / sizeof(int8_t) +
557       batch_size() * ((input_height() * input_width() - 1) * input_channel_stride() + groups() * group_input_channels()));
558     std::vector<int8_t> kernel(groups() * group_output_channels() * kernel_height() * kernel_width() * group_input_channels());
559     std::vector<int32_t> bias(groups() * group_output_channels());
560     std::vector<int8_t> output(batch_size() * ((output_height() * output_width() - 1) * output_channel_stride() + groups() * group_output_channels()));
561     std::vector<int32_t> accumulators(batch_size() * output_height() * output_width() * groups() * group_output_channels());
562     std::vector<double> output_ref(batch_size() * output_height() * output_width() * groups() * group_output_channels());
563     std::vector<float> requantization_scales(groups() * group_output_channels());
564 
565     const int8_t input_zero_point = -1;
566     const int8_t output_zero_point = -1;
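    // The QC8 path quantizes with a per-output-channel scale: the input and
    // output scales are fixed at 1.0 in this test, and requantization_scales
    // (recomputed each iteration below) carries the accumulator-to-int8 mapping.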
567 
568     for (size_t iteration = 0; iteration < iterations(); iteration++) {
569       std::generate(input.begin(), input.end(), [&]() { return i8dist(rng); });
570       std::generate(kernel.begin(), kernel.end(), [&]() { return w8dist(rng); });
571       std::generate(bias.begin(), bias.end(), [&]() { return i32dist(rng); });
572       std::fill(output.begin(), output.end(), INT8_C(0xA5));
573 
574       // Compute reference results, without renormalization.
575       if (depthwise_layout()) {
576         ASSERT_EQ(group_input_channels(), 1);
577         xnnpack::compute_depthwise_convolution_qs8_reference_results(
578           batch_size(),
579           output_height(),
580           output_width(),
581           input_height(),
582           input_width(),
583           padding_top(),
584           padding_right(),
585           padding_bottom(),
586           padding_left(),
587           kernel_height(),
588           kernel_width(),
589           subsampling_height(),
590           subsampling_width(),
591           dilation_height(),
592           dilation_width(),
593           groups(),
594           group_output_channels(),
595           input_channel_stride(),
596           input_zero_point,
597           input,
598           kernel,
599           accumulators,
600           has_bias(),
601           bias);
602       } else {
603         xnnpack::compute_convolution_qs8_reference_results(
604           batch_size(),
605           output_height(),
606           output_width(),
607           input_height(),
608           input_width(),
609           padding_top(),
610           padding_right(),
611           padding_bottom(),
612           padding_left(),
613           kernel_height(),
614           kernel_width(),
615           subsampling_height(),
616           subsampling_width(),
617           dilation_height(),
618           dilation_width(),
619           groups(),
620           group_input_channels(),
621           group_output_channels(),
622           input_channel_stride(),
623           input_zero_point,
624           input,
625           kernel,
626           accumulators,
627           has_bias(),
628           bias);
629       }
630 
631       // Compute renormalization parameters.
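      // Per output channel, stretch the observed accumulator range onto the
      // int8 output range (relative to the output zero point); the scale is
      // kept within [2^-32, 1.0), and values pushed out of range are handled
      // by the qmin/qmax clamping applied to the reference results below.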
632       for (size_t c = 0; c < groups() * group_output_channels(); c++) {
633         int32_t accumulated_min = accumulators[c];
634         int32_t accumulated_max = accumulators[c];
635         for (size_t px = 0; px < batch_size() * output_height() * output_width(); px++) {
636           accumulated_min = std::min(accumulated_min, accumulators[px * groups() * group_output_channels() + c]);
637           accumulated_max = std::max(accumulated_max, accumulators[px * groups() * group_output_channels() + c]);
638         }
639 
640         float requantization_scale = 0x1.0p-32f;
641         if (accumulated_max != 0) {
642           requantization_scale = std::max(requantization_scale,
643             float(int32_t(std::numeric_limits<int8_t>::max()) - int32_t(output_zero_point)) / float(accumulated_max));
644         }
645         if (accumulated_min != 0) {
646           requantization_scale = std::max(requantization_scale,
647             float(int32_t(std::numeric_limits<int8_t>::min()) - int32_t(output_zero_point)) / float(accumulated_min));
648         }
649         requantization_scale = std::min(requantization_scale, 0x1.FFFFFEp-1f);
650 
651         requantization_scales[c] = requantization_scale;
652       }
653 
654       // Renormalize reference results.
655       for (size_t c = 0; c < groups() * group_output_channels(); c++) {
656         for (size_t px = 0; px < batch_size() * output_height() * output_width(); px++) {
657           output_ref[px * groups() * group_output_channels() + c] = double(int32_t(output_zero_point)) +
658             double(accumulators[px * groups() * group_output_channels() + c]) * double(requantization_scales[c]);
659         }
660       }
661       std::transform(output_ref.cbegin(), output_ref.cend(), output_ref.begin(),
662         [this](double x) -> double {
663           return std::max<double>(std::min<double>(x, double(qmax() - 0x80)), double(qmin() - 0x80));
664         });
665 
666       // Create, setup, run, and destroy Convolution operator.
667       ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
668       xnn_operator_t convolution_op = nullptr;
669       xnn_caches caches = {
670         .code_cache = NULL,
671         .weights_cache = NULL,
672       };
673       xnn_weights_cache weights_cache;
674       if (use_weights_cache()) {
675         xnn_init_weights_cache(&weights_cache);
676         caches.weights_cache = &weights_cache;
677       }
678 
679       xnn_status status = xnn_create_convolution2d_nhwc_qc8(
680           padding_tf_same() ? 0 : padding_top(), padding_tf_same() ? 0 : padding_right(),
681           padding_tf_same() ? 0 : padding_bottom(), padding_tf_same() ? 0 : padding_left(),
682           kernel_height(), kernel_width(),
683           subsampling_height(), subsampling_width(),
684           dilation_height(), dilation_width(),
685           groups(), group_input_channels(), group_output_channels(),
686           input_channel_stride(), output_channel_stride(),
687           input_zero_point, 1.0f /* input scale */, requantization_scales.data(),
688           kernel.data(), has_bias() ? bias.data() : nullptr,
689           output_zero_point, 1.0f /* output scale */, int8_t(qmin() - 0x80), int8_t(qmax() - 0x80),
690           (depthwise_layout() ? XNN_FLAG_DEPTHWISE_CONVOLUTION : 0) | (padding_tf_same() ? XNN_FLAG_TENSORFLOW_SAME_PADDING : 0),
691           &caches,
692           &convolution_op);
693       if (status == xnn_status_unsupported_hardware) {
694         GTEST_SKIP();
695       }
696       ASSERT_EQ(xnn_status_success, status);
697       ASSERT_NE(nullptr, convolution_op);
698       if (use_weights_cache()) {
699         ASSERT_EQ(xnn_status_success,
700                   xnn_finalize_weights_cache(&weights_cache, xnn_weights_cache_finalization_kind_soft));
701       }
702 
703       // Smart pointer to automatically delete convolution_op.
704       std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_convolution_op(convolution_op, xnn_delete_operator);
705 
706       ASSERT_EQ(xnn_status_success,
707         xnn_setup_convolution2d_nhwc_qc8(
708           convolution_op,
709           batch_size(), input_height(), input_width(),
710           input.data(), output.data(),
711           nullptr /* thread pool */));
712 
713       ASSERT_EQ(xnn_status_success,
714         xnn_run_operator(convolution_op, nullptr /* thread pool */));
715 
716       // Verify results.
717       VerifyNHWCxQC8(output, output_ref);
718 
719       if (use_weights_cache()) {
720         xnn_operator_t convolution_op2 = nullptr;
721         size_t old_weights_cache_size = weights_cache.cache.weights.size;
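        // Build a second, identically-configured operator against the finalized
        // weights cache. The expectation (checked by VerifyWeightsCache against
        // old_weights_cache_size) is that the packed weights are reused, so the
        // cache does not grow.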
722 
723         xnn_status status = xnn_create_convolution2d_nhwc_qc8(
724             padding_tf_same() ? 0 : padding_top(), padding_tf_same() ? 0 : padding_right(),
725             padding_tf_same() ? 0 : padding_bottom(), padding_tf_same() ? 0 : padding_left(),
726             kernel_height(), kernel_width(),
727             subsampling_height(), subsampling_width(),
728             dilation_height(), dilation_width(),
729             groups(), group_input_channels(), group_output_channels(),
730             input_channel_stride(), output_channel_stride(),
731             input_zero_point, 1.0f /* input scale */, requantization_scales.data(),
732             kernel.data(), has_bias() ? bias.data() : nullptr,
733             output_zero_point, 1.0f /* output scale */, int8_t(qmin() - 0x80), int8_t(qmax() - 0x80),
734             (depthwise_layout() ? XNN_FLAG_DEPTHWISE_CONVOLUTION : 0) | (padding_tf_same() ? XNN_FLAG_TENSORFLOW_SAME_PADDING : 0),
735             &caches,
736             &convolution_op2);
737         ASSERT_EQ(xnn_status_success, status);
738         ASSERT_NE(nullptr, convolution_op2);
739 
740         // Smart pointer to automatically delete convolution_op2.
741         std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_convolution_op(convolution_op2, xnn_delete_operator);
742         std::vector<int8_t> output2(output.size(), INT8_C(0xA5));
743         ASSERT_EQ(xnn_status_success,
744                   xnn_setup_convolution2d_nhwc_qc8(
745                       convolution_op2,
746                       batch_size(), input_height(), input_width(),
747                       input.data(), output2.data(),
748                       nullptr /* thread pool */));
749 
750         ASSERT_EQ(xnn_status_success,
751                   xnn_run_operator(convolution_op2, nullptr /* thread pool */));
752 
753         VerifyNHWCxQC8(output2, output_ref);
754         VerifyWeightsCache(weights_cache, old_weights_cache_size);
755         xnn_release_weights_cache(&weights_cache);
756       }
757     }
758   }
759 
760   void VerifyNHWCxQC8(const std::vector<int8_t> &output,
761                       const std::vector<double> &output_ref) const {
762     for (size_t i = 0; i < batch_size(); i++) {
763       for (size_t y = 0; y < output_height(); y++) {
764         for (size_t x = 0; x < output_width(); x++) {
765           for (size_t g = 0; g < groups(); g++) {
766             for (size_t c = 0; c < group_output_channels(); c++) {
767               ASSERT_LE(int32_t(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), int32_t(qmax() - 0x80))
768                 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
769               ASSERT_GE(int32_t(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), int32_t(qmin() - 0x80))
770                 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
771               ASSERT_NEAR(
772                   output_ref[(((i * output_height() + y) * output_width() + x) * groups() + g) * group_output_channels() + c],
773                   double(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c]),
774                   0.9)
775                 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
776             }
777           }
778         }
779       }
780     }
781   }
782 
783   void TestNHWCxQS8() const {
784     ASSERT_EQ(weights_type(), WeightsType::Default);
785 
786     std::random_device random_device;
787     auto rng = std::mt19937(random_device());
788     std::uniform_int_distribution<int32_t> i32dist(-10000, 10000);
789     std::uniform_int_distribution<int32_t> i8dist(
790       std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max());
791     std::uniform_int_distribution<int32_t> w8dist(
792       -std::numeric_limits<int8_t>::max(), std::numeric_limits<int8_t>::max());
793 
794     std::vector<int8_t> input(XNN_EXTRA_BYTES / sizeof(int8_t) +
795       batch_size() * ((input_height() * input_width() - 1) * input_channel_stride() + groups() * group_input_channels()));
796     std::vector<int8_t> kernel(groups() * group_output_channels() * kernel_height() * kernel_width() * group_input_channels());
797     std::vector<int32_t> bias(groups() * group_output_channels());
798     std::vector<int8_t> output(batch_size() * ((output_height() * output_width() - 1) * output_channel_stride() + groups() * group_output_channels()));
799     std::vector<int32_t> accumulators(batch_size() * output_height() * output_width() * groups() * group_output_channels());
800     std::vector<double> output_ref(batch_size() * output_height() * output_width() * groups() * group_output_channels());
801 
802     const int8_t input_zero_point = -1;
803 
804     for (size_t iteration = 0; iteration < iterations(); iteration++) {
805       std::generate(input.begin(), input.end(), [&]() { return i8dist(rng); });
806       std::generate(kernel.begin(), kernel.end(), [&]() { return w8dist(rng); });
807       std::generate(bias.begin(), bias.end(), [&]() { return i32dist(rng); });
808       std::fill(output.begin(), output.end(), INT8_C(0xA5));
809 
810       // Compute reference results, without renormalization.
811       if (depthwise_layout()) {
812         ASSERT_EQ(group_input_channels(), 1);
813         xnnpack::compute_depthwise_convolution_qs8_reference_results(
814           batch_size(),
815           output_height(),
816           output_width(),
817           input_height(),
818           input_width(),
819           padding_top(),
820           padding_right(),
821           padding_bottom(),
822           padding_left(),
823           kernel_height(),
824           kernel_width(),
825           subsampling_height(),
826           subsampling_width(),
827           dilation_height(),
828           dilation_width(),
829           groups(),
830           group_output_channels(),
831           input_channel_stride(),
832           input_zero_point,
833           input,
834           kernel,
835           accumulators,
836           has_bias(),
837           bias);
838       } else {
839         xnnpack::compute_convolution_qs8_reference_results(
840           batch_size(),
841           output_height(),
842           output_width(),
843           input_height(),
844           input_width(),
845           padding_top(),
846           padding_right(),
847           padding_bottom(),
848           padding_left(),
849           kernel_height(),
850           kernel_width(),
851           subsampling_height(),
852           subsampling_width(),
853           dilation_height(),
854           dilation_width(),
855           groups(),
856           group_input_channels(),
857           group_output_channels(),
858           input_channel_stride(),
859           input_zero_point,
860           input,
861           kernel,
862           accumulators,
863           has_bias(),
864           bias);
865       }
866 
867       // Compute renormalization parameters.
868       const int32_t accumulated_min = *std::min_element(accumulators.cbegin(), accumulators.cend());
869       const int32_t accumulated_max = *std::max_element(accumulators.cbegin(), accumulators.cend());
870 
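      // Spread the accumulator range over the 255 representable quantization
      // steps and choose a zero point that centers that range within int8.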
871       const double output_scale = double(uint32_t(accumulated_max - accumulated_min)) / 255.0;
872       const int8_t output_zero_point = int8_t(std::max(std::min(
873         lrint(-0.5 - 0.5 * double(accumulated_min + accumulated_max) / output_scale),
874         long(std::numeric_limits<int8_t>::max())), long(std::numeric_limits<int8_t>::min())));
875 
876       // Renormalize reference results.
877       std::transform(accumulators.cbegin(), accumulators.cend(), output_ref.begin(),
878         [this, output_scale, output_zero_point](int32_t x) -> double {
879           return std::max<double>(std::min<double>(double(x) / output_scale, double(qmax() - 0x80) - output_zero_point), double(qmin() - 0x80) - output_zero_point);
880         });
881 
882       // Create, setup, run, and destroy Convolution operator.
883       ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
884       xnn_operator_t convolution_op = nullptr;
885       xnn_caches caches = {
886         .code_cache = NULL,
887         .weights_cache = NULL,
888       };
889       xnn_weights_cache weights_cache;
890       if (use_weights_cache()) {
891         xnn_init_weights_cache(&weights_cache);
892         caches.weights_cache = &weights_cache;
893       }
894 
895       xnn_status status = xnn_create_convolution2d_nhwc_qs8(
896           padding_tf_same() ? 0 : padding_top(), padding_tf_same() ? 0 : padding_right(),
897           padding_tf_same() ? 0 : padding_bottom(), padding_tf_same() ? 0 : padding_left(),
898           kernel_height(), kernel_width(),
899           subsampling_height(), subsampling_width(),
900           dilation_height(), dilation_width(),
901           groups(), group_input_channels(), group_output_channels(),
902           input_channel_stride(), output_channel_stride(),
903           input_zero_point, 1.0f /* input scale */, 1.0f /* kernel scale */,
904           kernel.data(), has_bias() ? bias.data() : nullptr,
905           output_zero_point, output_scale, int8_t(qmin() - 0x80), int8_t(qmax() - 0x80),
906           (depthwise_layout() ? XNN_FLAG_DEPTHWISE_CONVOLUTION : 0) | (padding_tf_same() ? XNN_FLAG_TENSORFLOW_SAME_PADDING : 0),
907           &caches,
908           &convolution_op);
909       if (status == xnn_status_unsupported_hardware) {
910         GTEST_SKIP();
911       }
912       ASSERT_EQ(xnn_status_success, status);
913       ASSERT_NE(nullptr, convolution_op);
914       if (use_weights_cache()) {
915         ASSERT_EQ(xnn_status_success,
916                   xnn_finalize_weights_cache(&weights_cache, xnn_weights_cache_finalization_kind_soft));
917       }
918 
919       // Smart pointer to automatically delete convolution_op.
920       std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_convolution_op(convolution_op, xnn_delete_operator);
921 
922       ASSERT_EQ(xnn_status_success,
923         xnn_setup_convolution2d_nhwc_qs8(
924           convolution_op,
925           batch_size(), input_height(), input_width(),
926           input.data(), output.data(),
927           nullptr /* thread pool */));
928 
929       ASSERT_EQ(xnn_status_success,
930         xnn_run_operator(convolution_op, nullptr /* thread pool */));
931 
932       VerifyNHWCxQS8(output, output_ref, output_zero_point);
933 
934       if (use_weights_cache()) {
935         xnn_operator_t convolution_op2 = nullptr;
936         size_t old_weights_cache_size = weights_cache.cache.weights.size;
937 
938         ASSERT_EQ(
939             xnn_status_success,
940             xnn_create_convolution2d_nhwc_qs8(
941                 padding_tf_same() ? 0 : padding_top(),
942                 padding_tf_same() ? 0 : padding_right(),
943                 padding_tf_same() ? 0 : padding_bottom(),
944                 padding_tf_same() ? 0 : padding_left(), kernel_height(),
945                 kernel_width(), subsampling_height(), subsampling_width(),
946                 dilation_height(), dilation_width(), groups(),
947                 group_input_channels(), group_output_channels(),
948                 input_channel_stride(), output_channel_stride(),
949                 input_zero_point, 1.0f /* input scale */,
950                 1.0f /* kernel scale */, kernel.data(),
951                 has_bias() ? bias.data() : nullptr, output_zero_point,
952                 output_scale, int8_t(qmin() - 0x80), int8_t(qmax() - 0x80),
953                 (depthwise_layout() ? XNN_FLAG_DEPTHWISE_CONVOLUTION : 0) |
954                     (padding_tf_same() ? XNN_FLAG_TENSORFLOW_SAME_PADDING : 0),
955                 &caches, &convolution_op2));
956         ASSERT_NE(nullptr, convolution_op2);
957 
958         // Smart pointer to automatically delete convolution_op2.
959         std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)>
960             auto_convolution_op(convolution_op2, xnn_delete_operator);
961 
962         std::vector<int8_t> output2(output.size(), INT8_C(0xA5));
963         ASSERT_EQ(xnn_status_success,
964                   xnn_setup_convolution2d_nhwc_qs8(
965                       convolution_op2, batch_size(), input_height(),
966                       input_width(), input.data(), output2.data(),
967                       nullptr /* thread pool */));
968 
969         ASSERT_EQ(xnn_status_success,
970                   xnn_run_operator(convolution_op2, nullptr /* thread pool */));
971 
972         VerifyNHWCxQS8(output2, output_ref, output_zero_point);
973         VerifyWeightsCache(weights_cache, old_weights_cache_size);
974         xnn_release_weights_cache(&weights_cache);
975       }
976     }
977   }
978 
979   void VerifyNHWCxQS8(const std::vector<int8_t> &output,
980                       const std::vector<double> &output_ref,
981                       const int8_t output_zero_point) const {
982     for (size_t i = 0; i < batch_size(); i++) {
983       for (size_t y = 0; y < output_height(); y++) {
984         for (size_t x = 0; x < output_width(); x++) {
985           for (size_t g = 0; g < groups(); g++) {
986             for (size_t c = 0; c < group_output_channels(); c++) {
987               ASSERT_LE(int32_t(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), int32_t(qmax() - 0x80))
988                 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
989               ASSERT_GE(int32_t(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), int32_t(qmin() - 0x80))
990                 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
991               ASSERT_NEAR(
992                   output_ref[(((i * output_height() + y) * output_width() + x) * groups() + g) * group_output_channels() + c],
993                   double(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c]) - double(output_zero_point),
994                   0.9)
995                 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
996             }
997           }
998         }
999       }
1000     }
1001   }
1002 
1003   void TestNHWCxQU8() const {
1004     ASSERT_EQ(weights_type(), WeightsType::Default);
1005 
1006     std::random_device random_device;
1007     auto rng = std::mt19937(random_device());
1008     std::uniform_int_distribution<int32_t> i32dist(-10000, 10000);
1009     std::uniform_int_distribution<int32_t> u8dist(
1010       std::numeric_limits<uint8_t>::min(), std::numeric_limits<uint8_t>::max());
1011 
1012     std::vector<uint8_t> input(XNN_EXTRA_BYTES / sizeof(uint8_t) +
1013       batch_size() * ((input_height() * input_width() - 1) * input_channel_stride() + groups() * group_input_channels()));
1014     std::vector<uint8_t> kernel(groups() * group_output_channels() * kernel_height() * kernel_width() * group_input_channels());
1015     std::vector<int32_t> bias(groups() * group_output_channels());
1016     std::vector<uint8_t> output(batch_size() * ((output_height() * output_width() - 1) * output_channel_stride() + groups() * group_output_channels()));
1017     std::vector<int32_t> accumulators(batch_size() * output_height() * output_width() * groups() * group_output_channels());
1018     std::vector<double> output_ref(batch_size() * output_height() * output_width() * groups() * group_output_channels());
1019 
1020     const uint8_t input_zero_point = 127;
1021     const uint8_t kernel_zero_point = 127;
1022 
1023     for (size_t iteration = 0; iteration < iterations(); iteration++) {
1024       std::generate(input.begin(), input.end(), [&]() { return u8dist(rng); });
1025       std::generate(kernel.begin(), kernel.end(), [&]() { return u8dist(rng); });
1026       std::generate(bias.begin(), bias.end(), [&]() { return i32dist(rng); });
1027       std::fill(output.begin(), output.end(), UINT8_C(0xA5));
1028 
1029       // Compute reference results, without renormalization.
1030       if (has_bias()) {
1031         for (size_t i = 0; i < batch_size(); i++) {
1032           for (size_t oy = 0; oy < output_height(); oy++) {
1033             for (size_t ox = 0; ox < output_width(); ox++) {
1034               for (size_t g = 0; g < groups(); g++) {
1035                 for (size_t oc = 0; oc < group_output_channels(); oc++) {
1036                   accumulators[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] =
1037                     bias[g * group_output_channels() + oc];
1038                 }
1039               }
1040             }
1041           }
1042         }
1043       } else {
1044         std::fill(accumulators.begin(), accumulators.end(), 0);
1045       }
1046       if (depthwise_layout()) {
1047         ASSERT_EQ(group_input_channels(), 1);
1048         xnnpack::compute_depthwise_convolution_qu8_reference_results(
1049             batch_size(),
1050             output_height(),
1051             output_width(),
1052             input_height(),
1053             input_width(),
1054             padding_top(),
1055             padding_right(),
1056             padding_bottom(),
1057             padding_left(),
1058             kernel_height(),
1059             kernel_width(),
1060             subsampling_height(),
1061             subsampling_width(),
1062             dilation_height(),
1063             dilation_width(),
1064             groups(),
1065             group_output_channels(),
1066             input_channel_stride(),
1067             input_zero_point,
1068             kernel_zero_point,
1069             input,
1070             kernel,
1071             accumulators,
1072             has_bias(),
1073             bias);
1074       } else {
1075         xnnpack::compute_convolution_qu8_reference_results(
1076             batch_size(),
1077             output_height(),
1078             output_width(),
1079             input_height(),
1080             input_width(),
1081             padding_top(),
1082             padding_right(),
1083             padding_bottom(),
1084             padding_left(),
1085             kernel_height(),
1086             kernel_width(),
1087             subsampling_height(),
1088             subsampling_width(),
1089             dilation_height(),
1090             dilation_width(),
1091             groups(),
1092             group_input_channels(),
1093             group_output_channels(),
1094             input_channel_stride(),
1095             input_zero_point,
1096             kernel_zero_point,
1097             input,
1098             kernel,
1099             accumulators,
1100             has_bias(),
1101             bias);
1102       }
1103 
1104       // Compute renormalization parameters.
1105       const int32_t accumulated_min = *std::min_element(accumulators.cbegin(), accumulators.cend());
1106       const int32_t accumulated_max = *std::max_element(accumulators.cbegin(), accumulators.cend());
1107 
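      // Same scheme as the QS8 path, except the zero point is centered around
      // 127.5 because the output type is unsigned 8-bit.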
1108       const double output_scale = double(uint32_t(accumulated_max - accumulated_min)) / 255.0;
1109       const uint8_t output_zero_point = uint8_t(std::max(std::min(
1110         lrint(127.5 - 0.5 * double(accumulated_min + accumulated_max) / output_scale),
1111         long(std::numeric_limits<uint8_t>::max())), long(std::numeric_limits<uint8_t>::min())));
1112 
1113       // Renormalize reference results.
1114       std::transform(accumulators.cbegin(), accumulators.cend(), output_ref.begin(),
1115         [this, output_scale, output_zero_point](int32_t x) -> double {
1116           return std::max<double>(std::min<double>(double(x) / output_scale, double(qmax()) - output_zero_point), double(qmin()) - output_zero_point);
1117         });
1118 
1119       // Create, setup, run, and destroy Convolution operator.
1120       ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
1121       xnn_operator_t convolution_op = nullptr;
1122 
1123       xnn_caches caches = {
1124         .code_cache = NULL,
1125         .weights_cache = NULL,
1126       };
1127       xnn_weights_cache weights_cache;
1128       if (use_weights_cache()) {
1129         xnn_init_weights_cache(&weights_cache);
1130         caches.weights_cache = &weights_cache;
1131       }
1132 
1133       xnn_status status = xnn_create_convolution2d_nhwc_qu8(
1134           padding_tf_same() ? 0 : padding_top(), padding_tf_same() ? 0 : padding_right(),
1135           padding_tf_same() ? 0 : padding_bottom(), padding_tf_same() ? 0 : padding_left(),
1136           kernel_height(), kernel_width(),
1137           subsampling_height(), subsampling_width(),
1138           dilation_height(), dilation_width(),
1139           groups(), group_input_channels(), group_output_channels(),
1140           input_channel_stride(), output_channel_stride(),
1141           input_zero_point, 1.0f /* input scale */,
1142           kernel_zero_point, 1.0f /* kernel scale */,
1143           kernel.data(), has_bias() ? bias.data() : nullptr,
1144           output_zero_point, output_scale, qmin(), qmax(),
1145           (depthwise_layout() ? XNN_FLAG_DEPTHWISE_CONVOLUTION : 0) | (padding_tf_same() ? XNN_FLAG_TENSORFLOW_SAME_PADDING : 0),
1146           &caches,
1147           &convolution_op);
1148       if (status == xnn_status_unsupported_hardware) {
1149         GTEST_SKIP();
1150       }
1151       ASSERT_EQ(xnn_status_success, status);
1152       ASSERT_NE(nullptr, convolution_op);
1153       if (use_weights_cache()) {
1154         ASSERT_EQ(xnn_status_success,
1155                   xnn_finalize_weights_cache(&weights_cache, xnn_weights_cache_finalization_kind_soft));
1156       }
1157 
1158       // Smart pointer to automatically delete convolution_op.
1159       std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_convolution_op(convolution_op, xnn_delete_operator);
1160 
1161       ASSERT_EQ(xnn_status_success,
1162         xnn_setup_convolution2d_nhwc_qu8(
1163           convolution_op,
1164           batch_size(), input_height(), input_width(),
1165           input.data(), output.data(),
1166           nullptr /* thread pool */));
1167 
1168       ASSERT_EQ(xnn_status_success,
1169         xnn_run_operator(convolution_op, nullptr /* thread pool */));
1170 
1171       // Verify results.
1172       VerifyNHWCxQU8(output, output_ref, output_zero_point);
1173 
1174       if (use_weights_cache()) {
1175         xnn_operator_t convolution_op2 = nullptr;
1176         size_t old_weights_cache_size = weights_cache.cache.weights.size;
1177 
1178         ASSERT_EQ(
1179             xnn_status_success,
1180             xnn_create_convolution2d_nhwc_qu8(
1181                 padding_tf_same() ? 0 : padding_top(),
1182                 padding_tf_same() ? 0 : padding_right(),
1183                 padding_tf_same() ? 0 : padding_bottom(),
1184                 padding_tf_same() ? 0 : padding_left(), kernel_height(),
1185                 kernel_width(), subsampling_height(), subsampling_width(),
1186                 dilation_height(), dilation_width(), groups(),
1187                 group_input_channels(), group_output_channels(),
1188                 input_channel_stride(), output_channel_stride(),
1189                 input_zero_point, 1.0f /* input scale */, kernel_zero_point,
1190                 1.0f /* kernel scale */, kernel.data(),
1191                 has_bias() ? bias.data() : nullptr, output_zero_point,
1192                 output_scale, qmin(), qmax(),
1193                 (depthwise_layout() ? XNN_FLAG_DEPTHWISE_CONVOLUTION : 0) |
1194                     (padding_tf_same() ? XNN_FLAG_TENSORFLOW_SAME_PADDING : 0),
1195                 &caches, &convolution_op2));
1196         ASSERT_NE(nullptr, convolution_op2);
1197 
1198         // Smart pointer to automatically delete convolution_op2.
1199         std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)>
1200             auto_convolution_op2(convolution_op2, xnn_delete_operator);
1201         std::vector<uint8_t> output2(output.size(), UINT8_C(0xA5));
1202 
1203         ASSERT_EQ(xnn_status_success,
1204                   xnn_setup_convolution2d_nhwc_qu8(
1205                       convolution_op2, batch_size(), input_height(),
1206                       input_width(), input.data(), output2.data(),
1207                       nullptr /* thread pool */));
1208 
1209         ASSERT_EQ(xnn_status_success,
1210                   xnn_run_operator(convolution_op2, nullptr /* thread pool */));
1211 
1212         // Verify results.
1213         VerifyNHWCxQU8(output2, output_ref, output_zero_point);
1214         VerifyWeightsCache(weights_cache, old_weights_cache_size);
1215         xnn_release_weights_cache(&weights_cache);
1216       }
1217     }
1218   }
1219 
1220   void VerifyNHWCxQU8(const std::vector<uint8_t> &output,
1221                       const std::vector<double> &output_ref,
1222                       const uint8_t output_zero_point) const {
1223     for (size_t i = 0; i < batch_size(); i++) {
1224       for (size_t y = 0; y < output_height(); y++) {
1225         for (size_t x = 0; x < output_width(); x++) {
1226           for (size_t g = 0; g < groups(); g++) {
1227             for (size_t c = 0; c < group_output_channels(); c++) {
1228               ASSERT_LE(int32_t(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), int32_t(qmax()))
1229                 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
1230               ASSERT_GE(int32_t(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), int32_t(qmin()))
1231                 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
1232               ASSERT_NEAR(
1233                   output_ref[(((i * output_height() + y) * output_width() + x) * groups() + g) * group_output_channels() + c],
1234                   double(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c]) - double(output_zero_point),
1235                   0.9)
1236                 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
1237             }
1238           }
1239         }
1240       }
1241     }
1242   }
1243 
1244   void TestNHWCxF32() const {
1245     ASSERT_EQ(weights_type(), WeightsType::Default);
1246 
1247     std::random_device random_device;
1248     auto rng = std::mt19937(random_device());
1249     std::uniform_real_distribution<float> f32dist(0.1f, 1.0f);
1250 
1251     std::vector<float> input(XNN_EXTRA_BYTES / sizeof(float) +
1252       batch_size() * ((input_height() * input_width() - 1) * input_channel_stride() + groups() * group_input_channels()));
1253     std::vector<float> kernel(groups() * group_output_channels() * kernel_height() * kernel_width() * group_input_channels());
1254     std::vector<float> bias(groups() * group_output_channels());
1255     std::vector<float> output(batch_size() * ((output_height() * output_width() - 1) * output_channel_stride() + groups() * group_output_channels()));
1256     std::vector<float> output_ref(batch_size() * output_height() * output_width() * groups() * group_output_channels());
1257 
1258     for (size_t iteration = 0; iteration < iterations(); iteration++) {
1259       std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); });
1260       std::generate(kernel.begin(), kernel.end(), [&]() { return f32dist(rng); });
1261       std::generate(bias.begin(), bias.end(), [&]() { return f32dist(rng); });
1262       std::fill(output.begin(), output.end(), nanf(""));
1263 
1264       // Compute reference results, without clamping.
1265       if (has_bias()) {
1266         for (size_t i = 0; i < batch_size(); i++) {
1267           for (size_t oy = 0; oy < output_height(); oy++) {
1268             for (size_t ox = 0; ox < output_width(); ox++) {
1269               for (size_t g = 0; g < groups(); g++) {
1270                 for (size_t oc = 0; oc < group_output_channels(); oc++) {
1271                   output_ref[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] =
1272                     bias[g * group_output_channels() + oc];
1273                 }
1274               }
1275             }
1276           }
1277         }
1278       } else {
1279         std::fill(output_ref.begin(), output_ref.end(), 0.0f);
1280       }
1281       if (depthwise_layout()) {
1282         ASSERT_EQ(group_input_channels(), 1);
1283 
1284         for (size_t i = 0; i < batch_size(); i++) {
1285           for (size_t oy = 0; oy < output_height(); oy++) {
1286             for (size_t ox = 0; ox < output_width(); ox++) {
1287               for (size_t ky = 0; ky < kernel_height(); ky++) {
1288                 const size_t iy = oy * subsampling_height() + ky * dilation_height() - padding_top();
1289                 if (iy < input_height()) {
1290                   for (size_t kx = 0; kx < kernel_width(); kx++) {
1291                     const size_t ix = ox * subsampling_width() + kx * dilation_width() - padding_left();
1292                     if (ix < input_width()) {
1293                       for (size_t g = 0; g < groups(); g++) {
1294                         for (size_t oc = 0; oc < group_output_channels(); oc++) {
1295                           output_ref[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] +=
1296                             input[((i * input_height() + iy) * input_width() + ix) * input_channel_stride() + g] *
1297                             kernel[((ky * kernel_width() + kx) * groups() + g) * group_output_channels() + oc];
1298                         }
1299                       }
1300                     }
1301                   }
1302                 }
1303               }
1304             }
1305           }
1306         }
1307       } else {
1308         for (size_t i = 0; i < batch_size(); i++) {
1309           for (size_t oy = 0; oy < output_height(); oy++) {
1310             for (size_t ox = 0; ox < output_width(); ox++) {
1311               for (size_t ky = 0; ky < kernel_height(); ky++) {
1312                 const size_t iy = oy * subsampling_height() + ky * dilation_height() - padding_top();
1313                 if (iy < input_height()) {
1314                   for (size_t kx = 0; kx < kernel_width(); kx++) {
1315                     const size_t ix = ox * subsampling_width() + kx * dilation_width() - padding_left();
1316                     if (ix < input_width()) {
1317                       for (size_t g = 0; g < groups(); g++) {
1318                         for (size_t oc = 0; oc < group_output_channels(); oc++) {
1319                           for (size_t ic = 0; ic < group_input_channels(); ic++) {
1320                             output_ref[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] +=
1321                               input[((i * input_height() + iy) * input_width() + ix) * input_channel_stride() + g * group_input_channels() + ic] *
1322                               kernel[(((g * group_output_channels() + oc) * kernel_height() + ky) * kernel_width() + kx) * group_input_channels() + ic];
1323                           }
1324                         }
1325                       }
1326                     }
1327                   }
1328                 }
1329               }
1330             }
1331           }
1332         }
1333       }
1334 
1335       // Compute clamping parameters.
1336       const float accumulated_min = *std::min_element(output_ref.cbegin(), output_ref.cend());
1337       const float accumulated_max = *std::max_element(output_ref.cbegin(), output_ref.cend());
1338 
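           // qmin()/qmax() act as positions on a 0-255 grid over the accumulated range and are mapped to float clamping bounds for the operator.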
1339       const float output_min = accumulated_min + (accumulated_max - accumulated_min) / 255.0f * float(qmin());
1340       const float output_max = accumulated_max - (accumulated_max - accumulated_min) / 255.0f * float(255 - qmax());
1341 
1342       // Clamp reference results.
1343       for (float& value : output_ref) {
1344         value = std::max(std::min(value, output_max), output_min);
1345       }
1346 
1347       // Create, set up, run, and destroy the Convolution operator.
1348       ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
1349       xnn_operator_t convolution_op = nullptr;
1350 
1351       xnn_caches caches = {
1352         .code_cache = NULL,
1353         .weights_cache = NULL,
1354       };
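           // Optionally attach a JIT code cache and/or a weights cache to the operator being created.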
1355       #if XNN_PLATFORM_JIT
1356         xnn_code_cache code_cache;
1357         if (use_jit()) {
1358           xnn_init_code_cache(&code_cache);
1359           caches.code_cache = &code_cache;
1360         }
1361       #endif
1362       xnn_weights_cache weights_cache;
1363       if (use_weights_cache()) {
1364         xnn_init_weights_cache(&weights_cache);
1365         caches.weights_cache = &weights_cache;
1366       }
1367 
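           // With XNN_FLAG_TENSORFLOW_SAME_PADDING, the operator computes the padding internally, so explicit padding amounts must be passed as zero.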
1368       xnn_status status = xnn_create_convolution2d_nhwc_f32(
1369           padding_tf_same() ? 0 : padding_top(), padding_tf_same() ? 0 : padding_right(),
1370           padding_tf_same() ? 0 : padding_bottom(), padding_tf_same() ? 0 : padding_left(),
1371           kernel_height(), kernel_width(),
1372           subsampling_height(), subsampling_width(),
1373           dilation_height(), dilation_width(),
1374           groups(), group_input_channels(), group_output_channels(),
1375           input_channel_stride(), output_channel_stride(),
1376           kernel.data(), has_bias() ? bias.data() : nullptr,
1377           output_min, output_max,
1378           (depthwise_layout() ? XNN_FLAG_DEPTHWISE_CONVOLUTION : 0) | (padding_tf_same() ? XNN_FLAG_TENSORFLOW_SAME_PADDING : 0),
1379           &caches,
1380           &convolution_op);
1381       if (status == xnn_status_unsupported_hardware) {
1382         GTEST_SKIP();
1383       }
1384       ASSERT_EQ(xnn_status_success, status);
1385       ASSERT_NE(nullptr, convolution_op);
1386       if (use_weights_cache()) {
1387         ASSERT_EQ(xnn_status_success,
1388                   xnn_finalize_weights_cache(&weights_cache, xnn_weights_cache_finalization_kind_soft));
1389       }
1390 
1391       // Smart pointer to automatically delete convolution_op.
1392       std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_convolution_op(convolution_op, xnn_delete_operator);
1393 
1394       #if XNN_PLATFORM_JIT
1395         if (use_jit()) {
1396           // Check that we actually generated code.
1397           ASSERT_GT(code_cache.cache.code.size, 0);
1398           xnn_finalize_code_memory(&code_cache.cache.code);
1399         }
1400       #endif
1401 
1402       ASSERT_EQ(xnn_status_success,
1403         xnn_setup_convolution2d_nhwc_f32(
1404           convolution_op,
1405           batch_size(), input_height(), input_width(),
1406           input.data(), output.data(),
1407           nullptr /* thread pool */));
1408 
1409       ASSERT_EQ(xnn_status_success,
1410         xnn_run_operator(convolution_op, nullptr /* thread pool */));
1411 
1412       VerifyNHWCxF32(output, output_ref, output_min, output_max);
1413 
1414       if (use_weights_cache()) {
1415         // We already finalized the code cache, so create a new code cache if we are testing JIT.
1416         #if XNN_PLATFORM_JIT
1417           xnn_code_cache inner_code_cache;
1418           if (use_jit()) {
1419             xnn_init_code_cache(&inner_code_cache);
1420             caches.code_cache = &inner_code_cache;
1421           }
1422         #endif
1423         // To test the weights cache, create a second operator with the same parameters and set it up with a different output buffer.
1424         xnn_operator_t convolution_op2 = nullptr;
1425         size_t old_weights_cache_size = weights_cache.cache.weights.size;
1426 
1427         ASSERT_EQ(xnn_status_success, xnn_create_convolution2d_nhwc_f32(
1428             padding_tf_same() ? 0 : padding_top(), padding_tf_same() ? 0 : padding_right(),
1429             padding_tf_same() ? 0 : padding_bottom(), padding_tf_same() ? 0 : padding_left(),
1430             kernel_height(), kernel_width(),
1431             subsampling_height(), subsampling_width(),
1432             dilation_height(), dilation_width(),
1433             groups(), group_input_channels(), group_output_channels(),
1434             input_channel_stride(), output_channel_stride(),
1435             kernel.data(), has_bias() ? bias.data() : nullptr,
1436             output_min, output_max,
1437             (depthwise_layout() ? XNN_FLAG_DEPTHWISE_CONVOLUTION : 0) | (padding_tf_same() ? XNN_FLAG_TENSORFLOW_SAME_PADDING : 0),
1438             &caches,
1439             &convolution_op2));
1440 
1441         ASSERT_NE(nullptr, convolution_op2);
1442 
1443         #if XNN_PLATFORM_JIT
1444           if (use_jit()) {
1445             // Check that we actually generated code.
1446             ASSERT_GT(inner_code_cache.cache.code.size, 0);
1447             xnn_finalize_code_memory(&inner_code_cache.cache.code);
1448           }
1449         #endif
1450 
1451         std::vector<float> output2(output.size(), nanf(""));
1452         ASSERT_EQ(xnn_status_success,
1453                   xnn_setup_convolution2d_nhwc_f32(
1454                       convolution_op2,
1455                       batch_size(), input_height(), input_width(),
1456                       input.data(), output2.data(),
1457                       nullptr /* thread pool */));
1458         ASSERT_EQ(xnn_status_success,
1459                   xnn_run_operator(convolution_op2, nullptr /* thread pool */));
1460 
1461         std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_convolution_op2(convolution_op2, xnn_delete_operator);
1462         ASSERT_EQ(weights_cache.cache.hits, 1);
1463         // Since the lookup was a cache hit, no additional weights should have been written to the cache.
1464         ASSERT_EQ(old_weights_cache_size, weights_cache.cache.weights.size);
1465 
1466         VerifyNHWCxF32(output2, output_ref, output_min, output_max);
1467         #if XNN_PLATFORM_JIT
1468           if (use_jit()) {
1469             xnn_release_code_cache(&inner_code_cache);
1470           }
1471         #endif
1472       }
1473 
1474       #if XNN_PLATFORM_JIT
1475         if (use_jit()) {
1476           xnn_release_code_cache(&code_cache);
1477         }
1478       #endif
1479       if (use_weights_cache()) {
1480         xnn_release_weights_cache(&weights_cache);
1481       }
1482     }
1483   }
1484 
1485   void VerifyNHWCxF32(const std::vector<float>& output, const std::vector<float>& output_ref, const float output_min, const float output_max) const {
1486     for (size_t i = 0; i < batch_size(); i++) {
1487       for (size_t y = 0; y < output_height(); y++) {
1488         for (size_t x = 0; x < output_width(); x++) {
1489           for (size_t g = 0; g < groups(); g++) {
1490             for (size_t c = 0; c < group_output_channels(); c++) {
1491               ASSERT_GE(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c], output_min)
1492                   << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
1493               ASSERT_LE(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c], output_max)
1494                   << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
1495               ASSERT_NEAR(
1496                   output_ref[(((i * output_height() + y) * output_width() + x) * groups() + g) * group_output_channels() + c],
1497                   output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c],
1498                   1.0e-4 * std::abs(output_ref[(((i * output_height() + y) * output_width() + x) * groups() + g) * group_output_channels() + c]))
1499                   << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
1500             }
1501           }
1502         }
1503       }
1504     }
1505   }
1506 
1507   void TestNHWCxF16() const {
1508     switch (weights_type()) {
1509       case WeightsType::Default:
1510         break;
1511       case WeightsType::FP32:
1512         break;
1513       default:
1514         GTEST_FAIL() << "unexpected weights type";
1515     }
1516 
1517     std::random_device random_device;
1518     auto rng = std::mt19937(random_device());
1519     std::uniform_real_distribution<float> f32dist(0.1f, 1.0f);
1520 
1521     std::vector<uint16_t> input(XNN_EXTRA_BYTES / sizeof(uint16_t) +
1522       batch_size() * ((input_height() * input_width() - 1) * input_channel_stride() + groups() * group_input_channels()));
1523     std::vector<uint16_t> kernel(groups() * group_output_channels() * kernel_height() * kernel_width() * group_input_channels());
1524     std::vector<float> kernel_as_float(kernel.size());
1525     std::vector<uint16_t> bias(groups() * group_output_channels());
1526     std::vector<float> bias_as_float(bias.size());
1527     std::vector<uint16_t> output(batch_size() * ((output_height() * output_width() - 1) * output_channel_stride() + groups() * group_output_channels()));
1528     std::vector<float> output_ref(batch_size() * output_height() * output_width() * groups() * group_output_channels());
1529 
1530     for (size_t iteration = 0; iteration < iterations(); iteration++) {
1531       std::generate(input.begin(), input.end(), [&]() { return fp16_ieee_from_fp32_value(f32dist(rng)); });
1532       std::generate(kernel.begin(), kernel.end(), [&]() { return fp16_ieee_from_fp32_value(f32dist(rng)); });
1533       std::transform(kernel.cbegin(), kernel.cend(), kernel_as_float.begin(), fp16_ieee_to_fp32_value);
1534       std::generate(bias.begin(), bias.end(), [&]() { return fp16_ieee_from_fp32_value(f32dist(rng)); });
1535       std::transform(bias.cbegin(), bias.cend(), bias_as_float.begin(), fp16_ieee_to_fp32_value);
1536       std::fill(output.begin(), output.end(), UINT16_C(0x7E00) /* NaN */);
1537 
1538 
1539       // Compute reference results, without clamping.
1540       if (has_bias()) {
1541         for (size_t i = 0; i < batch_size(); i++) {
1542           for (size_t oy = 0; oy < output_height(); oy++) {
1543             for (size_t ox = 0; ox < output_width(); ox++) {
1544               for (size_t g = 0; g < groups(); g++) {
1545                 for (size_t oc = 0; oc < group_output_channels(); oc++) {
1546                   output_ref[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] =
1547                     fp16_ieee_to_fp32_value(bias[g * group_output_channels() + oc]);
1548                 }
1549               }
1550             }
1551           }
1552         }
1553       } else {
1554         std::fill(output_ref.begin(), output_ref.end(), 0.0f);
1555       }
1556       if (depthwise_layout()) {
1557         ASSERT_EQ(group_input_channels(), 1);
1558 
1559         for (size_t i = 0; i < batch_size(); i++) {
1560           for (size_t oy = 0; oy < output_height(); oy++) {
1561             for (size_t ox = 0; ox < output_width(); ox++) {
1562               for (size_t ky = 0; ky < kernel_height(); ky++) {
1563                 const size_t iy = oy * subsampling_height() + ky * dilation_height() - padding_top();
1564                 if (iy < input_height()) {
1565                   for (size_t kx = 0; kx < kernel_width(); kx++) {
1566                     const size_t ix = ox * subsampling_width() + kx * dilation_width() - padding_left();
1567                     if (ix < input_width()) {
1568                       for (size_t g = 0; g < groups(); g++) {
1569                         for (size_t oc = 0; oc < group_output_channels(); oc++) {
1570                           output_ref[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] +=
1571                             fp16_ieee_to_fp32_value(input[((i * input_height() + iy) * input_width() + ix) * input_channel_stride() + g]) *
1572                             fp16_ieee_to_fp32_value(kernel[((ky * kernel_width() + kx) * groups() + g) * group_output_channels() + oc]);
1573                         }
1574                       }
1575                     }
1576                   }
1577                 }
1578               }
1579             }
1580           }
1581         }
1582       } else {
1583         for (size_t i = 0; i < batch_size(); i++) {
1584           for (size_t oy = 0; oy < output_height(); oy++) {
1585             for (size_t ox = 0; ox < output_width(); ox++) {
1586               for (size_t ky = 0; ky < kernel_height(); ky++) {
1587                 const size_t iy = oy * subsampling_height() + ky * dilation_height() - padding_top();
1588                 if (iy < input_height()) {
1589                   for (size_t kx = 0; kx < kernel_width(); kx++) {
1590                     const size_t ix = ox * subsampling_width() + kx * dilation_width() - padding_left();
1591                     if (ix < input_width()) {
1592                       for (size_t g = 0; g < groups(); g++) {
1593                         for (size_t oc = 0; oc < group_output_channels(); oc++) {
1594                           for (size_t ic = 0; ic < group_input_channels(); ic++) {
1595                             output_ref[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] +=
1596                               fp16_ieee_to_fp32_value(input[((i * input_height() + iy) * input_width() + ix) * input_channel_stride() + g * group_input_channels() + ic]) *
1597                               fp16_ieee_to_fp32_value(kernel[(((g * group_output_channels() + oc) * kernel_height() + ky) * kernel_width() + kx) * group_input_channels() + ic]);
1598                           }
1599                         }
1600                       }
1601                     }
1602                   }
1603                 }
1604               }
1605             }
1606           }
1607         }
1608       }
1609 
1610       // Compute clamping parameters.
1611       const float accumulated_min = *std::min_element(output_ref.cbegin(), output_ref.cend());
1612       const float accumulated_max = *std::max_element(output_ref.cbegin(), output_ref.cend());
1613       const float accumulated_range = accumulated_max - accumulated_min;
1614       const float scaled_min = fp16_ieee_to_fp32_value(fp16_ieee_from_fp32_value(accumulated_min + accumulated_range / 255.0f * float(qmin())));
1615       const float scaled_max = fp16_ieee_to_fp32_value(fp16_ieee_from_fp32_value(accumulated_max - accumulated_range / 255.0f * float(255 - qmax())));
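           // If rounding to fp16 collapses both bounds to the same value, disable clamping entirely by using infinities.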
1616       const float output_min = scaled_min == scaled_max ? -std::numeric_limits<float>::infinity() : scaled_min;
1617       const float output_max = scaled_min == scaled_max ? +std::numeric_limits<float>::infinity() : scaled_max;
1618 
1619       // Clamp reference results.
1620       for (float& value : output_ref) {
1621         value = std::max(std::min(value, output_max), output_min);
1622       }
1623 
1624       // Create, set up, run, and destroy the Convolution operator.
1625       ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
1626       xnn_operator_t convolution_op = nullptr;
1627       xnn_caches caches = {
1628         .code_cache = NULL,
1629         .weights_cache = NULL,
1630       };
1631       xnn_weights_cache weights_cache;
1632       if (use_weights_cache()) {
1633         xnn_init_weights_cache(&weights_cache);
1634         caches.weights_cache = &weights_cache;
1635       }
1636 
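           // For WeightsType::FP32, pass single-precision kernel and bias and let the operator convert them to FP16 internally (via XNN_FLAG_FP32_STATIC_WEIGHTS below).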
1637       const void* kernel_data = kernel.data();
1638       const void* bias_data = bias.data();
1639       if (weights_type() == WeightsType::FP32) {
1640         kernel_data = kernel_as_float.data();
1641         bias_data = bias_as_float.data();
1642       }
1643       uint32_t flags = 0;
1644       if (depthwise_layout()) {
1645         flags |= XNN_FLAG_DEPTHWISE_CONVOLUTION;
1646       }
1647       if (padding_tf_same()) {
1648         flags |= XNN_FLAG_TENSORFLOW_SAME_PADDING;
1649       }
1650       if (weights_type() == WeightsType::FP32) {
1651         flags |= XNN_FLAG_FP32_STATIC_WEIGHTS;
1652       }
1653       xnn_status status = xnn_create_convolution2d_nhwc_f16(
1654           padding_tf_same() ? 0 : padding_top(), padding_tf_same() ? 0 : padding_right(),
1655           padding_tf_same() ? 0 : padding_bottom(), padding_tf_same() ? 0 : padding_left(),
1656           kernel_height(), kernel_width(),
1657           subsampling_height(), subsampling_width(),
1658           dilation_height(), dilation_width(),
1659           groups(), group_input_channels(), group_output_channels(),
1660           input_channel_stride(), output_channel_stride(),
1661           kernel_data, has_bias() ? bias_data : nullptr,
1662           output_min, output_max,
1663           flags,
1664           &caches,
1665           &convolution_op);
1666       if (status == xnn_status_unsupported_hardware) {
1667         GTEST_SKIP();
1668       }
1669       ASSERT_EQ(xnn_status_success, status);
1670       ASSERT_NE(nullptr, convolution_op);
1671       if (use_weights_cache()) {
1672         ASSERT_EQ(xnn_status_success,
1673                   xnn_finalize_weights_cache(&weights_cache, xnn_weights_cache_finalization_kind_soft));
1674       }
1675 
1676       // Smart pointer to automatically delete convolution_op.
1677       std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_convolution_op(convolution_op, xnn_delete_operator);
1678 
1679       ASSERT_EQ(xnn_status_success,
1680         xnn_setup_convolution2d_nhwc_f16(
1681           convolution_op,
1682           batch_size(), input_height(), input_width(),
1683           input.data(), output.data(),
1684           nullptr /* thread pool */));
1685 
1686       ASSERT_EQ(xnn_status_success,
1687         xnn_run_operator(convolution_op, nullptr /* thread pool */));
1688 
1689       VerifyNHWCxF16(output, output_ref, output_min, output_max);
1690 
1691       if (use_weights_cache()) {
1692         xnn_operator_t convolution_op2 = nullptr;
1693         size_t old_weights_cache_size = weights_cache.cache.weights.size;
1694         ASSERT_EQ(xnn_status_success, xnn_create_convolution2d_nhwc_f16(
1695             padding_tf_same() ? 0 : padding_top(), padding_tf_same() ? 0 : padding_right(),
1696             padding_tf_same() ? 0 : padding_bottom(), padding_tf_same() ? 0 : padding_left(),
1697             kernel_height(), kernel_width(),
1698             subsampling_height(), subsampling_width(),
1699             dilation_height(), dilation_width(),
1700             groups(), group_input_channels(), group_output_channels(),
1701             input_channel_stride(), output_channel_stride(),
1702             kernel_data, has_bias() ? bias_data : nullptr,
1703             output_min, output_max,
1704             flags,
1705             &caches,
1706             &convolution_op2));
1707         ASSERT_NE(nullptr, convolution_op2);
1708 
1709         // Smart pointer to automatically delete convolution_op2.
1710         std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_convolution_op2(convolution_op2, xnn_delete_operator);
1711 
1712         std::vector<uint16_t> output2(output.size(), UINT16_C(0x7E00) /* NaN */);
1713         ASSERT_EQ(xnn_status_success,
1714                   xnn_setup_convolution2d_nhwc_f16(
1715                       convolution_op2,
1716                       batch_size(), input_height(), input_width(),
1717                       input.data(), output2.data(),
1718                       nullptr /* thread pool */));
1719 
1720         ASSERT_EQ(xnn_status_success,
1721                   xnn_run_operator(convolution_op2, nullptr /* thread pool */));
1722 
1723         VerifyNHWCxF16(output2, output_ref, output_min, output_max);
1724         VerifyWeightsCache(weights_cache, old_weights_cache_size);
1725         xnn_release_weights_cache(&weights_cache);
1726       }
1727     }
1728   }
1729 
1730   void VerifyNHWCxF16(const std::vector<uint16_t> &output,
1731                       const std::vector<float> &output_ref,
1732                       const float output_min, const float output_max) const {
1733     for (size_t i = 0; i < batch_size(); i++) {
1734       for (size_t y = 0; y < output_height(); y++) {
1735         for (size_t x = 0; x < output_width(); x++) {
1736           for (size_t g = 0; g < groups(); g++) {
1737             for (size_t c = 0; c < group_output_channels(); c++) {
1738              ASSERT_GE(fp16_ieee_to_fp32_value(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), output_min)
1739                << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
1740              ASSERT_LE(fp16_ieee_to_fp32_value(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), output_max)
1741                << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
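                   // Half precision accumulates more rounding error, so allow 1% relative (or 1e-4 absolute) tolerance.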
1742               ASSERT_NEAR(output_ref[(((i * output_height() + y) * output_width() + x) * groups() + g) * group_output_channels() + c], fp16_ieee_to_fp32_value(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), std::max(1.0e-4f, std::abs(output_ref[(((i * output_height() + y) * output_width() + x) * groups() + g) * group_output_channels() + c]) * 1.0e-2f))
1743                 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
1744             }
1745           }
1746         }
1747       }
1748     }
1749   }
1750 
1751   void TestNCHWxF32() {
1752     ASSERT_EQ(weights_type(), WeightsType::Default);
1753 
1754     std::random_device random_device;
1755     auto rng = std::mt19937(random_device());
1756     std::uniform_real_distribution<float> f32dist(0.1f, 1.0f);
1757     std::uniform_real_distribution<float> pdist;
1758 
1759     std::vector<float> input(2 * XNN_EXTRA_BYTES / sizeof(float) +
1760       ((batch_size() - 1) * input_channel_stride() + groups() * group_input_channels()) * input_height() * input_width());
1761     std::vector<float> kernel(
1762       groups() * group_output_channels() * kernel_height() * kernel_width() * group_input_channels());
1763     std::vector<float> bias(groups() * group_output_channels());
1764     std::vector<float> output(
1765       ((batch_size() - 1) * output_channel_stride() + groups() * group_output_channels()) * output_height() * output_width());
1766     std::vector<float> output_ref(batch_size() * groups() * group_output_channels() * output_height() * output_width());
1767 
1768     for (size_t iteration = 0; iteration < iterations(); iteration++) {
1769       std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); });
1770       std::generate(kernel.begin(), kernel.end(), [&]() { return f32dist(rng); });
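           // Zero out a fraction of the kernel entries (controlled by sparsity()) to exercise the sparse NCHW code paths.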
1771       for (float& k : kernel) {
1772         if (pdist(rng) <= sparsity()) {
1773           k = 0.0f;
1774         }
1775       }
1776       std::generate(bias.begin(), bias.end(), [&]() { return f32dist(rng); });
1777       std::fill(output.begin(), output.end(), nanf(""));
1778 
1779       // Compute reference results, without clamping.
1780       if (has_bias()) {
1781         for (size_t i = 0; i < batch_size(); i++) {
1782           for (size_t oy = 0; oy < output_height(); oy++) {
1783             for (size_t ox = 0; ox < output_width(); ox++) {
1784               for (size_t g = 0; g < groups(); g++) {
1785                 for (size_t oc = 0; oc < group_output_channels(); oc++) {
1786                   output_ref[(((i * groups() + g) * group_output_channels() + oc) * output_height() + oy) * output_width() + ox] =
1787                     bias[g * group_output_channels() + oc];
1788                 }
1789               }
1790             }
1791           }
1792         }
1793       } else {
1794         std::fill(output_ref.begin(), output_ref.end(), 0.0f);
1795       }
1796       if (force_nhwc_input()) {
1797         for (size_t i = 0; i < batch_size(); i++) {
1798           for (size_t oy = 0; oy < output_height(); oy++) {
1799             for (size_t ox = 0; ox < output_width(); ox++) {
1800               for (size_t ky = 0; ky < kernel_height(); ky++) {
1801                 const size_t iy = oy * subsampling_height() + ky * dilation_height() - padding_top();
1802                 if (iy < input_height()) {
1803                   for (size_t kx = 0; kx < kernel_width(); kx++) {
1804                     const size_t ix = ox * subsampling_width() + kx * dilation_width() - padding_left();
1805                     if (ix < input_width()) {
1806                       for (size_t g = 0; g < groups(); g++) {
1807                         for (size_t oc = 0; oc < group_output_channels(); oc++) {
1808                           for (size_t ic = 0; ic < group_input_channels(); ic++) {
1809                             output_ref[(((i * groups() + g) * group_output_channels() + oc) * output_height() + oy) * output_width() + ox] +=
1810                               input[((((i * input_height() + iy) * input_width() + ix) * groups() + g) * group_input_channels() + ic)] *
1811                               kernel[(((g * group_output_channels() + oc) * kernel_height() + ky) * kernel_width() + kx) * group_input_channels() + ic];
1812                           }
1813                         }
1814                       }
1815                     }
1816                   }
1817                 }
1818               }
1819             }
1820           }
1821         }
1822       } else if (depthwise_layout()) {
1823         ASSERT_EQ(group_input_channels(), 1);
1824 
1825         for (size_t i = 0; i < batch_size(); i++) {
1826           for (size_t oy = 0; oy < output_height(); oy++) {
1827             for (size_t ox = 0; ox < output_width(); ox++) {
1828               for (size_t ky = 0; ky < kernel_height(); ky++) {
1829                 const size_t iy = oy * subsampling_height() + ky * dilation_height() - padding_top();
1830                 if (iy < input_height()) {
1831                   for (size_t kx = 0; kx < kernel_width(); kx++) {
1832                     const size_t ix = ox * subsampling_width() + kx * dilation_width() - padding_left();
1833                     if (ix < input_width()) {
1834                       for (size_t g = 0; g < groups(); g++) {
1835                         for (size_t oc = 0; oc < group_output_channels(); oc++) {
1836                           output_ref[(((i * groups() + g) * group_output_channels() + oc) * output_height() + oy) * output_width() + ox] +=
1837                             input[((i * input_channel_stride() + g) * input_height() + iy) * input_width() + ix] *
1838                             kernel[((ky * kernel_width() + kx) * groups() + g) * group_output_channels() + oc];
1839                         }
1840                       }
1841                     }
1842                   }
1843                 }
1844               }
1845             }
1846           }
1847         }
1848       } else {
1849         for (size_t i = 0; i < batch_size(); i++) {
1850           for (size_t oy = 0; oy < output_height(); oy++) {
1851             for (size_t ox = 0; ox < output_width(); ox++) {
1852               for (size_t ky = 0; ky < kernel_height(); ky++) {
1853                 const size_t iy = oy * subsampling_height() + ky * dilation_height() - padding_top();
1854                 if (iy < input_height()) {
1855                   for (size_t kx = 0; kx < kernel_width(); kx++) {
1856                     const size_t ix = ox * subsampling_width() + kx * dilation_width() - padding_left();
1857                     if (ix < input_width()) {
1858                       for (size_t g = 0; g < groups(); g++) {
1859                         for (size_t oc = 0; oc < group_output_channels(); oc++) {
1860                           for (size_t ic = 0; ic < group_input_channels(); ic++) {
1861                             output_ref[(((i * groups() + g) * group_output_channels() + oc) * output_height() + oy) * output_width() + ox] +=
1862                               input[((i * input_channel_stride() + g * group_input_channels() + ic) * input_height() + iy) * input_width() + ix] *
1863                               kernel[(((g * group_output_channels() + oc) * kernel_height() + ky) * kernel_width() + kx) * group_input_channels() + ic];
1864                           }
1865                         }
1866                       }
1867                     }
1868                   }
1869                 }
1870               }
1871             }
1872           }
1873         }
1874       }
1875 
1876       // Compute clamping parameters.
1877       const float accumulated_min = *std::min_element(output_ref.cbegin(), output_ref.cend());
1878       const float accumulated_max = *std::max_element(output_ref.cbegin(), output_ref.cend());
1879 
1880       const float output_min = qmin() == 0 ? -std::numeric_limits<float>::infinity() :
1881         accumulated_min + (accumulated_max - accumulated_min) / 255.0f * float(qmin());
1882       const float output_max = qmax() == 255 ? std::numeric_limits<float>::infinity() :
1883         accumulated_max - (accumulated_max - accumulated_min) / 255.0f * float(255 - qmax());
1884 
1885       // Clamp reference results.
1886       for (float& value : output_ref) {
1887         value = std::max(std::min(value, output_max), output_min);
1888       }
1889 
1890       // Create, set up, run, and destroy the Convolution operator.
1891       ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
1892       xnn_operator_t convolution_op = nullptr;
1893       xnn_caches caches = {
1894         .code_cache = NULL,
1895         .weights_cache = NULL,
1896       };
1897       xnn_weights_cache weights_cache;
1898       if (use_weights_cache()) {
1899         xnn_init_weights_cache(&weights_cache);
1900         caches.weights_cache = &weights_cache;
1901       }
1902 
1903       xnn_status status = xnn_create_convolution2d_nchw_f32(
1904           padding_top(), padding_right(), padding_bottom(), padding_left(),
1905           kernel_height(), kernel_width(),
1906           subsampling_height(), subsampling_width(),
1907           dilation_height(), dilation_width(),
1908           groups(), group_input_channels(), group_output_channels(),
1909           input_channel_stride(), output_channel_stride(),
1910           kernel.data(), has_bias() ? bias.data() : nullptr,
1911           output_min, output_max,
1912           (depthwise_layout() ? XNN_FLAG_DEPTHWISE_CONVOLUTION : 0) | (force_nhwc_input() ? XNN_FLAG_INPUT_NHWC : 0),
1913           &caches,
1914           &convolution_op);
1915       if (status == xnn_status_unsupported_parameter) {
1916         GTEST_SKIP();
1917       }
1918       ASSERT_EQ(xnn_status_success, status);
1919       ASSERT_NE(nullptr, convolution_op);
1920       if (use_weights_cache()) {
1921         ASSERT_EQ(xnn_status_success,
1922                   xnn_finalize_weights_cache(&weights_cache, xnn_weights_cache_finalization_kind_soft));
1923       }
1924 
1925       // Smart pointer to automatically delete convolution_op.
1926       std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_convolution_op(convolution_op, xnn_delete_operator);
1927 
1928       ASSERT_EQ(xnn_status_success,
1929         xnn_setup_convolution2d_nchw_f32(
1930           convolution_op,
1931           batch_size(), input_height(), input_width(),
1932           input.data(), output.data(),
1933           nullptr /* thread pool */));
1934 
1935       ASSERT_EQ(xnn_status_success,
1936         xnn_run_operator(convolution_op, nullptr /* thread pool */));
1937 
1938       VerifyNCHWxF32(output, output_ref, output_min, output_max);
1939 
1940       if (use_weights_cache()) {
1941         xnn_operator_t convolution_op2 = nullptr;
1942         size_t old_weights_cache_size = weights_cache.cache.weights.size;
1943         ASSERT_EQ(
1944             xnn_status_success,
1945             xnn_create_convolution2d_nchw_f32(
1946                 padding_top(), padding_right(), padding_bottom(),
1947                 padding_left(), kernel_height(), kernel_width(),
1948                 subsampling_height(), subsampling_width(), dilation_height(),
1949                 dilation_width(), groups(), group_input_channels(),
1950                 group_output_channels(), input_channel_stride(),
1951                 output_channel_stride(), kernel.data(),
1952                 has_bias() ? bias.data() : nullptr, output_min, output_max,
1953                 (depthwise_layout() ? XNN_FLAG_DEPTHWISE_CONVOLUTION : 0) |
1954                     (force_nhwc_input() ? XNN_FLAG_INPUT_NHWC : 0),
1955                 &caches, &convolution_op2));
1956         ASSERT_NE(nullptr, convolution_op2);
1957 
1958         // Smart pointer to automatically delete convolution_op2.
1959         std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_convolution_op2(convolution_op2, xnn_delete_operator);
1960         std::vector<float> output2(output.size(), nanf(""));
1961 
1962         ASSERT_EQ(xnn_status_success,
1963                   xnn_setup_convolution2d_nchw_f32(
1964                       convolution_op2,
1965                       batch_size(), input_height(), input_width(),
1966                       input.data(), output2.data(),
1967                       nullptr /* thread pool */));
1968 
1969         ASSERT_EQ(xnn_status_success,
1970                   xnn_run_operator(convolution_op2, nullptr /* thread pool */));
1971 
1972         VerifyNCHWxF32(output2, output_ref, output_min, output_max);
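             // The sparse (SpMM) path does not pack weights through the weights cache, so the test expects the cache to remain unused in that case.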
1973         if (IsSpmm()) {
1974           VerifyWeightsCacheUnused(weights_cache);
1975         } else {
1976           VerifyWeightsCache(weights_cache, old_weights_cache_size);
1977         }
1978         xnn_release_weights_cache(&weights_cache);
1979       }
1980     }
1981   }
1982 
1983   void VerifyNCHWxF32(const std::vector<float> &output,
1984                       const std::vector<float> &output_ref,
1985                       const float output_min, const float output_max) const {
1986     for (size_t i = 0; i < batch_size(); i++) {
1987       for (size_t y = 0; y < output_height(); y++) {
1988         for (size_t x = 0; x < output_width(); x++) {
1989           for (size_t g = 0; g < groups(); g++) {
1990             for (size_t c = 0; c < group_output_channels(); c++) {
1991               ASSERT_GE(output[((i * output_channel_stride() + g * group_output_channels() + c) * output_height() + y) * output_width() + x], output_min)
1992                 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c << ", image = " << i;
1993               ASSERT_LE(output[((i * output_channel_stride() + g * group_output_channels() + c) * output_height() + y) * output_width() + x], output_max)
1994                 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c << ", image = " << i;
1995               ASSERT_NEAR(
1996                   output_ref[(((i * groups() + g) * group_output_channels() + c) * output_height() + y) * output_width() + x],
1997                   output[((i * output_channel_stride() + g * group_output_channels() + c) * output_height() + y) * output_width() + x],
1998                   1.0e-4 * std::abs(output_ref[(((i * groups() + g) * group_output_channels() + c) * output_height() + y) * output_width() + x]))
1999                 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c << ", image = " << i;
2000             }
2001           }
2002         }
2003       }
2004     }
2005   }
2006 
2007   void TestSetupNHWCxQC8() const {
2008     ASSERT_EQ(weights_type(), WeightsType::Default);
2009 
2010     ASSERT_FALSE(depthwise_layout());
2011 
2012     std::random_device random_device;
2013     auto rng = std::mt19937(random_device());
2014     std::uniform_int_distribution<int32_t> i32dist(-10000, 10000);
2015     std::uniform_int_distribution<int32_t> i8dist(
2016       std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max());
2017     std::uniform_int_distribution<int32_t> w8dist(
2018       -std::numeric_limits<int8_t>::max(), std::numeric_limits<int8_t>::max());
2019 
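         // Buffers are sized for the larger of the two setups, since the same operator is later re-set up with the next_* batch and spatial dimensions.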
2020     std::vector<int8_t> input(XNN_EXTRA_BYTES / sizeof(int8_t) + std::max(
2021       batch_size() * ((input_height() * input_width() - 1) * input_channel_stride() + groups() * group_input_channels()),
2022       next_batch_size() * ((next_input_height() * next_input_width() - 1) * input_channel_stride() + groups() * group_input_channels())));
2023     std::vector<int8_t> kernel(groups() * group_output_channels() * kernel_height() * kernel_width() * group_input_channels());
2024     std::vector<int32_t> bias(groups() * group_output_channels());
2025     std::vector<int8_t> output(std::max(
2026       batch_size() * ((output_height() * output_width() - 1) * output_channel_stride() + groups() * group_output_channels()),
2027       next_batch_size() * ((next_output_height() * next_output_width() - 1) * output_channel_stride() + groups() * group_output_channels())));
2028     std::vector<int32_t> accumulators(batch_size() * output_height() * output_width() * groups() * group_output_channels());
2029     std::vector<double> output_ref(batch_size() * output_height() * output_width() * groups() * group_output_channels());
2030     std::vector<float> requantization_scales(groups() * group_output_channels());
2031     std::vector<int32_t> next_accumulators(next_batch_size() * next_output_height() * next_output_width() * groups() * group_output_channels());
2032     std::vector<double> next_output_ref(next_batch_size() * next_output_height() * next_output_width() * groups() * group_output_channels());
2033     std::vector<float> next_requantization_scales(groups() * group_output_channels());
2034 
2035     const int8_t input_zero_point = -1;
2036     const int8_t output_zero_point = -1;
2037 
2038     for (size_t iteration = 0; iteration < iterations(); iteration++) {
2039       std::generate(input.begin(), input.end(), [&]() { return i8dist(rng); });
2040       std::generate(kernel.begin(), kernel.end(), [&]() { return w8dist(rng); });
2041       std::generate(bias.begin(), bias.end(), [&]() { return i32dist(rng); });
2042       std::fill(output.begin(), output.end(), INT8_C(0xA5));
2043 
2044       // Compute reference results, without renormalization.
2045       if (has_bias()) {
2046         for (size_t i = 0; i < batch_size(); i++) {
2047           for (size_t oy = 0; oy < output_height(); oy++) {
2048             for (size_t ox = 0; ox < output_width(); ox++) {
2049               for (size_t g = 0; g < groups(); g++) {
2050                 for (size_t oc = 0; oc < group_output_channels(); oc++) {
2051                   accumulators[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] =
2052                     bias[g * group_output_channels() + oc];
2053                 }
2054               }
2055             }
2056           }
2057         }
2058       } else {
2059         std::fill(accumulators.begin(), accumulators.end(), 0);
2060       }
2061       for (size_t i = 0; i < batch_size(); i++) {
2062         for (size_t oy = 0; oy < output_height(); oy++) {
2063           for (size_t ox = 0; ox < output_width(); ox++) {
2064             for (size_t ky = 0; ky < kernel_height(); ky++) {
2065               const size_t iy = oy * subsampling_height() + ky * dilation_height() - padding_top();
2066               if (iy < input_height()) {
2067                 for (size_t kx = 0; kx < kernel_width(); kx++) {
2068                   const size_t ix = ox * subsampling_width() + kx * dilation_width() - padding_left();
2069                   if (ix < input_width()) {
2070                     for (size_t g = 0; g < groups(); g++) {
2071                       for (size_t oc = 0; oc < group_output_channels(); oc++) {
2072                         for (size_t ic = 0; ic < group_input_channels(); ic++) {
2073                           accumulators[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] +=
2074                             (int32_t(input[((i * input_height() + iy) * input_width() + ix) * input_channel_stride() + g * group_input_channels() + ic]) - int32_t(input_zero_point)) *
2075                             int32_t(kernel[(((g * group_output_channels() + oc) * kernel_height() + ky) * kernel_width() + kx) * group_input_channels() + ic]);
2076                         }
2077                       }
2078                     }
2079                   }
2080                 }
2081               }
2082             }
2083           }
2084         }
2085       }
2086 
2087       // Compute renormalization parameters.
2088       for (size_t c = 0; c < groups() * group_output_channels(); c++) {
2089         int32_t accumulated_min = accumulators[c];
2090         int32_t accumulated_max = accumulators[c];
2091         for (size_t px = 0; px < batch_size() * output_height() * output_width(); px++) {
2092           accumulated_min = std::min(accumulated_min, accumulators[px * groups() * group_output_channels() + c]);
2093           accumulated_max = std::max(accumulated_max, accumulators[px * groups() * group_output_channels() + c]);
2094         }
2095 
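             // Choose a per-channel scale large enough that the extreme accumulators reach the int8 output limits (relative to the output zero point), bounded below by 2^-32 and capped just below 1.0.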
2096         float requantization_scale = 0x1.0p-32f;
2097         if (accumulated_max != 0) {
2098           requantization_scale = std::max(requantization_scale,
2099             float(int32_t(std::numeric_limits<int8_t>::max()) - int32_t(output_zero_point)) / float(accumulated_max));
2100         }
2101         if (accumulated_min != 0) {
2102           requantization_scale = std::max(requantization_scale,
2103             float(int32_t(std::numeric_limits<int8_t>::min()) - int32_t(output_zero_point)) / float(accumulated_min));
2104         }
2105         requantization_scale = std::min(requantization_scale, 0x1.FFFFFEp-1f);
2106 
2107         requantization_scales[c] = requantization_scale;
2108       }
2109 
2110       // Renormalize reference results.
2111       for (size_t c = 0; c < groups() * group_output_channels(); c++) {
2112         for (size_t px = 0; px < batch_size() * output_height() * output_width(); px++) {
2113           output_ref[px * groups() * group_output_channels() + c] = double(int32_t(output_zero_point)) +
2114             double(accumulators[px * groups() * group_output_channels() + c]) * double(requantization_scales[c]);
2115         }
2116       }
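           // Clamp reference results to the signed int8 range implied by qmin()/qmax().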
2117       std::transform(output_ref.cbegin(), output_ref.cend(), output_ref.begin(),
2118         [this](double x) -> double {
2119           return std::max<double>(std::min<double>(x, double(qmax() - 0x80)), double(qmin() - 0x80));
2120         });
2121 
2122       // Create, set up, and run the Convolution operator once.
2123       ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
2124       xnn_operator_t convolution_op = nullptr;
2125 
2126       xnn_status status = xnn_create_convolution2d_nhwc_qc8(
2127           padding_top(), padding_right(), padding_bottom(), padding_left(),
2128           kernel_height(), kernel_width(),
2129           subsampling_height(), subsampling_width(),
2130           dilation_height(), dilation_width(),
2131           groups(), group_input_channels(), group_output_channels(),
2132           input_channel_stride(), output_channel_stride(),
2133           input_zero_point, 1.0f /* input scale */, requantization_scales.data(),
2134           kernel.data(), has_bias() ? bias.data() : nullptr,
2135           output_zero_point, 1.0f /* output scale */, int8_t(qmin() - 0x80), int8_t(qmax() - 0x80),
2136           0, NULL, &convolution_op);
2137       if (status == xnn_status_unsupported_hardware) {
2138         GTEST_SKIP();
2139       }
2140       ASSERT_EQ(xnn_status_success, status);
2141       ASSERT_NE(nullptr, convolution_op);
2142 
2143       // Smart pointer to automatically delete convolution_op.
2144       std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_convolution_op(convolution_op, xnn_delete_operator);
2145 
2146       ASSERT_EQ(xnn_status_success,
2147         xnn_setup_convolution2d_nhwc_qc8(
2148           convolution_op,
2149           batch_size(), input_height(), input_width(),
2150           input.data(), output.data(),
2151           nullptr /* thread pool */));
2152 
2153       ASSERT_EQ(xnn_status_success,
2154         xnn_run_operator(convolution_op, nullptr /* thread pool */));
2155 
2156       // Verify results of the first run.
2157       for (size_t i = 0; i < batch_size(); i++) {
2158         for (size_t y = 0; y < output_height(); y++) {
2159           for (size_t x = 0; x < output_width(); x++) {
2160             for (size_t g = 0; g < groups(); g++) {
2161               for (size_t c = 0; c < group_output_channels(); c++) {
2162                 ASSERT_LE(int32_t(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), int32_t(qmax() - 0x80))
2163                   << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
2164                 ASSERT_GE(int32_t(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), int32_t(qmin() - 0x80))
2165                   << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
2166                 ASSERT_NEAR(
2167                     output_ref[(((i * output_height() + y) * output_width() + x) * groups() + g) * group_output_channels() + c],
2168                     double(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c]),
2169                     0.9)
2170                   << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
2171               }
2172             }
2173           }
2174         }
2175       }
2176 
2177       // Re-generate input data for the second run (kernel and bias stay packed inside the operator).
2178       std::generate(input.begin(), input.end(), [&]() { return i8dist(rng); });
2179       std::fill(output.begin(), output.end(), INT8_C(0xA5));
2180 
2181       // Compute reference results for the second run, including renormalization.
2182       if (has_bias()) {
2183         for (size_t i = 0; i < next_batch_size(); i++) {
2184           for (size_t oy = 0; oy < next_output_height(); oy++) {
2185             for (size_t ox = 0; ox < next_output_width(); ox++) {
2186               for (size_t g = 0; g < groups(); g++) {
2187                 for (size_t oc = 0; oc < group_output_channels(); oc++) {
2188                   next_accumulators[(((i * next_output_height() + oy) * next_output_width() + ox) * groups() + g) * group_output_channels() + oc] =
2189                     bias[g * group_output_channels() + oc];
2190                 }
2191               }
2192             }
2193           }
2194         }
2195       } else {
2196         std::fill(next_accumulators.begin(), next_accumulators.end(), 0);
2197       }
2198       for (size_t i = 0; i < next_batch_size(); i++) {
2199         for (size_t oy = 0; oy < next_output_height(); oy++) {
2200           for (size_t ox = 0; ox < next_output_width(); ox++) {
2201             for (size_t ky = 0; ky < kernel_height(); ky++) {
2202               const size_t iy = oy * subsampling_height() + ky * dilation_height() - padding_top();
2203               if (iy < next_input_height()) {
2204                 for (size_t kx = 0; kx < kernel_width(); kx++) {
2205                   const size_t ix = ox * subsampling_width() + kx * dilation_width() - padding_left();
2206                   if (ix < next_input_width()) {
2207                     for (size_t g = 0; g < groups(); g++) {
2208                       for (size_t oc = 0; oc < group_output_channels(); oc++) {
2209                         for (size_t ic = 0; ic < group_input_channels(); ic++) {
2210                           next_accumulators[(((i * next_output_height() + oy) * next_output_width() + ox) * groups() + g) * group_output_channels() + oc] +=
2211                             (int32_t(input[((i * next_input_height() + iy) * next_input_width() + ix) * input_channel_stride() + g * group_input_channels() + ic]) - int32_t(input_zero_point)) *
2212                             int32_t(kernel[(((g * group_output_channels() + oc) * kernel_height() + ky) * kernel_width() + kx) * group_input_channels() + ic]);
2213                         }
2214                       }
2215                     }
2216                   }
2217                 }
2218               }
2219             }
2220           }
2221         }
2222       }
2223       for (size_t c = 0; c < groups() * group_output_channels(); c++) {
2224         for (size_t px = 0; px < next_batch_size() * next_output_height() * next_output_width(); px++) {
2225           next_output_ref[px * groups() * group_output_channels() + c] = double(int32_t(output_zero_point)) +
2226             double(next_accumulators[px * groups() * group_output_channels() + c]) * double(requantization_scales[c]);
2227         }
2228       }
2229       std::transform(next_output_ref.cbegin(), next_output_ref.cend(), next_output_ref.begin(),
2230         [this](double x) -> double {
2231           return std::max<double>(std::min<double>(x, double(qmax() - 0x80)), double(qmin() - 0x80));
2232         });
2233 
2234       // Set up and run the Convolution operator a second time, then destroy it.
2235       ASSERT_EQ(xnn_status_success,
2236         xnn_setup_convolution2d_nhwc_qc8(
2237           convolution_op,
2238           next_batch_size(), next_input_height(), next_input_width(),
2239           input.data(), output.data(),
2240           nullptr /* thread pool */));
2241 
2242       ASSERT_EQ(xnn_status_success,
2243         xnn_run_operator(convolution_op, nullptr /* thread pool */));
2244 
2245       // Verify results of the second run.
2246       for (size_t i = 0; i < next_batch_size(); i++) {
2247         for (size_t y = 0; y < next_output_height(); y++) {
2248           for (size_t x = 0; x < next_output_width(); x++) {
2249             for (size_t g = 0; g < groups(); g++) {
2250               for (size_t c = 0; c < group_output_channels(); c++) {
2251                 ASSERT_LE(int32_t(output[((i * next_output_height() + y) * next_output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), int32_t(qmax() - 0x80))
2252                   << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
2253                 ASSERT_GE(int32_t(output[((i * next_output_height() + y) * next_output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), int32_t(qmin() - 0x80))
2254                   << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
2255                 ASSERT_NEAR(
2256                     next_output_ref[(((i * next_output_height() + y) * next_output_width() + x) * groups() + g) * group_output_channels() + c],
2257                     double(output[((i * next_output_height() + y) * next_output_width() + x) * output_channel_stride() + g * group_output_channels() + c]),
2258                     0.9)
2259                   << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
2260               }
2261             }
2262           }
2263         }
2264       }
2265     }
2266   }
2267 
2268   void TestSetupNHWCxQS8() const {
2269     ASSERT_EQ(weights_type(), WeightsType::Default);
2270 
2271     ASSERT_FALSE(depthwise_layout());
2272 
2273     std::random_device random_device;
2274     auto rng = std::mt19937(random_device());
2275     std::uniform_int_distribution<int32_t> i32dist(-10000, 10000);
2276     std::uniform_int_distribution<int32_t> i8dist(
2277       std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max());
2278     std::uniform_int_distribution<int32_t> w8dist(
2279       -std::numeric_limits<int8_t>::max(), std::numeric_limits<int8_t>::max());
2280 
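    // Buffers are sized for the larger of the first-run and second-run shapes so the
    // same allocations can be reused after the operator is reconfigured; the input is
    // over-allocated by XNN_EXTRA_BYTES so micro-kernels that read slightly past the
    // last element stay within the allocation.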
2281     std::vector<int8_t> input(XNN_EXTRA_BYTES / sizeof(int8_t) + std::max(
2282       batch_size() * ((input_height() * input_width() - 1) * input_channel_stride() + groups() * group_input_channels()),
2283       next_batch_size() * ((next_input_height() * next_input_width() - 1) * input_channel_stride() + groups() * group_input_channels())));
2284     std::vector<int8_t> kernel(groups() * group_output_channels() * kernel_height() * kernel_width() * group_input_channels());
2285     std::vector<int32_t> bias(groups() * group_output_channels());
2286     std::vector<int8_t> output(std::max(
2287       batch_size() * ((output_height() * output_width() - 1) * output_channel_stride() + groups() * group_output_channels()),
2288       next_batch_size() * ((next_output_height() * next_output_width() - 1) * output_channel_stride() + groups() * group_output_channels())));
2289     std::vector<int32_t> accumulators(batch_size() * output_height() * output_width() * groups() * group_output_channels());
2290     std::vector<double> output_ref(batch_size() * output_height() * output_width() * groups() * group_output_channels());
2291     std::vector<int32_t> next_accumulators(next_batch_size() * next_output_height() * next_output_width() * groups() * group_output_channels());
2292     std::vector<double> next_output_ref(next_batch_size() * next_output_height() * next_output_width() * groups() * group_output_channels());
2293 
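    // A fixed, non-zero input zero point; the reference computation below subtracts it
    // from every input value before multiplying by the weights.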
2294     const int8_t input_zero_point = -1;
2295 
2296     for (size_t iteration = 0; iteration < iterations(); iteration++) {
2297       std::generate(input.begin(), input.end(), [&]() { return i8dist(rng); });
2298       std::generate(kernel.begin(), kernel.end(), [&]() { return w8dist(rng); });
2299       std::generate(bias.begin(), bias.end(), [&]() { return i32dist(rng); });
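      // Pre-fill the output with a recognizable canary value so any element the
      // operator fails to write stands out during verification.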
2300       std::fill(output.begin(), output.end(), INT8_C(0xA5));
2301 
2302       // Compute reference results, without renormalization.
2303       if (has_bias()) {
2304         for (size_t i = 0; i < batch_size(); i++) {
2305           for (size_t oy = 0; oy < output_height(); oy++) {
2306             for (size_t ox = 0; ox < output_width(); ox++) {
2307               for (size_t g = 0; g < groups(); g++) {
2308                 for (size_t oc = 0; oc < group_output_channels(); oc++) {
2309                   accumulators[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] =
2310                     bias[g * group_output_channels() + oc];
2311                 }
2312               }
2313             }
2314           }
2315         }
2316       } else {
2317         std::fill(accumulators.begin(), accumulators.end(), 0);
2318       }
2319       for (size_t i = 0; i < batch_size(); i++) {
2320         for (size_t oy = 0; oy < output_height(); oy++) {
2321           for (size_t ox = 0; ox < output_width(); ox++) {
2322             for (size_t ky = 0; ky < kernel_height(); ky++) {
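              // iy/ix are unsigned, so coordinates that land in the padding region wrap
              // around to very large values, and the single upper-bound check below
              // rejects them without a separate >= 0 test.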
2323               const size_t iy = oy * subsampling_height() + ky * dilation_height() - padding_top();
2324               if (iy < input_height()) {
2325                 for (size_t kx = 0; kx < kernel_width(); kx++) {
2326                   const size_t ix = ox * subsampling_width() + kx * dilation_width() - padding_left();
2327                   if (ix < input_width()) {
2328                     for (size_t g = 0; g < groups(); g++) {
2329                       for (size_t oc = 0; oc < group_output_channels(); oc++) {
2330                         for (size_t ic = 0; ic < group_input_channels(); ic++) {
2331                           accumulators[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] +=
2332                             (int32_t(input[((i * input_height() + iy) * input_width() + ix) * input_channel_stride() + g * group_input_channels() + ic]) - int32_t(input_zero_point)) *
2333                             int32_t(kernel[(((g * group_output_channels() + oc) * kernel_height() + ky) * kernel_width() + kx) * group_input_channels() + ic]);
2334                         }
2335                       }
2336                     }
2337                   }
2338                 }
2339               }
2340             }
2341           }
2342         }
2343       }
2344 
2345       // Compute renormalization parameters.
2346       const int32_t accumulated_min = *std::min_element(accumulators.cbegin(), accumulators.cend());
2347       const int32_t accumulated_max = *std::max_element(accumulators.cbegin(), accumulators.cend());
2348 
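      // output_scale spreads the observed accumulator range over the 255 representable
      // steps of int8; output_zero_point is chosen so that accumulated_min maps to
      // roughly -128 and accumulated_max to roughly +127 after requantization.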
2349       const double output_scale = double(uint32_t(accumulated_max - accumulated_min)) / 255.0;
2350       const int8_t output_zero_point = int8_t(std::max(std::min(
2351         lrint(-0.5 - 0.5 * double(accumulated_min + accumulated_max) / output_scale),
2352         long(std::numeric_limits<int8_t>::max())), long(std::numeric_limits<int8_t>::min())));
2353 
2354       // Renormalize reference results.
2355       std::transform(accumulators.cbegin(), accumulators.cend(), output_ref.begin(),
2356         [this, output_scale, output_zero_point](int32_t x) -> double {
2357           return std::max<double>(std::min<double>(double(x) / output_scale, double(qmax() - 0x80) - output_zero_point), double(qmin() - 0x80) - output_zero_point);
2358         });
2359 
2360       // Create, set up, and run the Convolution operator once.
2361       ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
2362       xnn_operator_t convolution_op = nullptr;
2363 
2364       xnn_status status = xnn_create_convolution2d_nhwc_qs8(
2365           padding_top(), padding_right(), padding_bottom(), padding_left(),
2366           kernel_height(), kernel_width(),
2367           subsampling_height(), subsampling_width(),
2368           dilation_height(), dilation_width(),
2369           groups(), group_input_channels(), group_output_channels(),
2370           input_channel_stride(), output_channel_stride(),
2371           input_zero_point, 1.0f /* input scale */, 1.0f /* kernel scale */,
2372           kernel.data(), has_bias() ? bias.data() : nullptr,
2373           output_zero_point, output_scale, int8_t(qmin() - 0x80), int8_t(qmax() - 0x80),
2374           0, NULL, &convolution_op);
2375       if (status == xnn_status_unsupported_hardware) {
2376         GTEST_SKIP();
2377       }
2378       ASSERT_EQ(xnn_status_success, status);
2379       ASSERT_NE(nullptr, convolution_op);
2380 
2381       // Smart pointer to automatically delete convolution_op.
2382       std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_convolution_op(convolution_op, xnn_delete_operator);
2383 
2384       ASSERT_EQ(xnn_status_success,
2385         xnn_setup_convolution2d_nhwc_qs8(
2386           convolution_op,
2387           batch_size(), input_height(), input_width(),
2388           input.data(), output.data(),
2389           nullptr /* thread pool */));
2390 
2391       ASSERT_EQ(xnn_status_success,
2392         xnn_run_operator(convolution_op, nullptr /* thread pool */));
2393 
2394       // Verify results of the first run.
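      // The reference values were renormalized in double precision, so each quantized
      // output is expected to land within 0.9 of a quantization step of its reference.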
2395       for (size_t i = 0; i < batch_size(); i++) {
2396         for (size_t y = 0; y < output_height(); y++) {
2397           for (size_t x = 0; x < output_width(); x++) {
2398             for (size_t g = 0; g < groups(); g++) {
2399               for (size_t c = 0; c < group_output_channels(); c++) {
2400                 ASSERT_LE(int32_t(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), int32_t(qmax() - 0x80))
2401                   << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
2402                 ASSERT_GE(int32_t(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), int32_t(qmin() - 0x80))
2403                   << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
2404                 ASSERT_NEAR(
2405                     output_ref[(((i * output_height() + y) * output_width() + x) * groups() + g) * group_output_channels() + c],
2406                     double(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c]) - double(output_zero_point),
2407                     0.9)
2408                   << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
2409               }
2410             }
2411           }
2412         }
2413       }
2414 
2415       // Re-generate data for the second run.
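      // Only the input changes for the second run; the kernel and bias are already
      // packed inside the operator and are reused as-is.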
2416       std::generate(input.begin(), input.end(), [&]() { return i8dist(rng); });
2417       std::fill(output.begin(), output.end(), INT8_C(0xA5));
2418 
2419       // Compute reference results for the second run, including renormalization.
2420       if (has_bias()) {
2421         for (size_t i = 0; i < next_batch_size(); i++) {
2422           for (size_t oy = 0; oy < next_output_height(); oy++) {
2423             for (size_t ox = 0; ox < next_output_width(); ox++) {
2424               for (size_t g = 0; g < groups(); g++) {
2425                 for (size_t oc = 0; oc < group_output_channels(); oc++) {
2426                   next_accumulators[(((i * next_output_height() + oy) * next_output_width() + ox) * groups() + g) * group_output_channels() + oc] =
2427                     bias[g * group_output_channels() + oc];
2428                 }
2429               }
2430             }
2431           }
2432         }
2433       } else {
2434         std::fill(next_accumulators.begin(), next_accumulators.end(), 0);
2435       }
2436       for (size_t i = 0; i < next_batch_size(); i++) {
2437         for (size_t oy = 0; oy < next_output_height(); oy++) {
2438           for (size_t ox = 0; ox < next_output_width(); ox++) {
2439             for (size_t ky = 0; ky < kernel_height(); ky++) {
2440               const size_t iy = oy * subsampling_height() + ky * dilation_height() - padding_top();
2441               if (iy < next_input_height()) {
2442                 for (size_t kx = 0; kx < kernel_width(); kx++) {
2443                   const size_t ix = ox * subsampling_width() + kx * dilation_width() - padding_left();
2444                   if (ix < next_input_width()) {
2445                     for (size_t g = 0; g < groups(); g++) {
2446                       for (size_t oc = 0; oc < group_output_channels(); oc++) {
2447                         for (size_t ic = 0; ic < group_input_channels(); ic++) {
2448                           next_accumulators[(((i * next_output_height() + oy) * next_output_width() + ox) * groups() + g) * group_output_channels() + oc] +=
2449                             (int32_t(input[((i * next_input_height() + iy) * next_input_width() + ix) * input_channel_stride() + g * group_input_channels() + ic]) - int32_t(input_zero_point)) *
2450                             int32_t(kernel[(((g * group_output_channels() + oc) * kernel_height() + ky) * kernel_width() + kx) * group_input_channels() + ic]);
2451                         }
2452                       }
2453                     }
2454                   }
2455                 }
2456               }
2457             }
2458           }
2459         }
2460       }
2461       std::transform(next_accumulators.cbegin(), next_accumulators.cend(), next_output_ref.begin(),
2462         [this, output_scale, output_zero_point](int32_t x) -> double {
2463           return std::max<double>(std::min<double>(double(x) / output_scale, double(qmax() - 0x80) - output_zero_point), double(qmin() - 0x80) - output_zero_point);
2464         });
2465 
2466       // Set up and run the Convolution operator a second time, and destroy the operator.
2467       ASSERT_EQ(xnn_status_success,
2468         xnn_setup_convolution2d_nhwc_qs8(
2469           convolution_op,
2470           next_batch_size(), next_input_height(), next_input_width(),
2471           input.data(), output.data(),
2472           nullptr /* thread pool */));
2473 
2474       ASSERT_EQ(xnn_status_success,
2475         xnn_run_operator(convolution_op, nullptr /* thread pool */));
2476 
2477       // Verify results of the second run.
2478       for (size_t i = 0; i < next_batch_size(); i++) {
2479         for (size_t y = 0; y < next_output_height(); y++) {
2480           for (size_t x = 0; x < next_output_width(); x++) {
2481             for (size_t g = 0; g < groups(); g++) {
2482               for (size_t c = 0; c < group_output_channels(); c++) {
2483                 ASSERT_LE(int32_t(output[((i * next_output_height() + y) * next_output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), int32_t(qmax() - 0x80))
2484                   << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
2485                 ASSERT_GE(int32_t(output[((i * next_output_height() + y) * next_output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), int32_t(qmin() - 0x80))
2486                   << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
2487                 ASSERT_NEAR(
2488                     next_output_ref[(((i * next_output_height() + y) * next_output_width() + x) * groups() + g) * group_output_channels() + c],
2489                     double(output[((i * next_output_height() + y) * next_output_width() + x) * output_channel_stride() + g * group_output_channels() + c]) - double(output_zero_point),
2490                     0.9)
2491                   << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
2492               }
2493             }
2494           }
2495         }
2496       }
2497     }
2498   }
2499 
2500   void TestSetupNHWCxQU8() const {
2501     ASSERT_EQ(weights_type(), WeightsType::Default);
2502 
2503     ASSERT_FALSE(depthwise_layout());
2504 
2505     std::random_device random_device;
2506     auto rng = std::mt19937(random_device());
2507     std::uniform_int_distribution<int32_t> i32dist(-10000, 10000);
2508     std::uniform_int_distribution<int32_t> u8dist(
2509       std::numeric_limits<uint8_t>::min(), std::numeric_limits<uint8_t>::max());
2510 
2511     std::vector<uint8_t> input(XNN_EXTRA_BYTES / sizeof(uint8_t) + std::max(
2512       batch_size() * ((input_height() * input_width() - 1) * input_channel_stride() + groups() * group_input_channels()),
2513       next_batch_size() * ((next_input_height() * next_input_width() - 1) * input_channel_stride() + groups() * group_input_channels())));
2514     std::vector<uint8_t> kernel(groups() * group_output_channels() * kernel_height() * kernel_width() * group_input_channels());
2515     std::vector<int32_t> bias(groups() * group_output_channels());
2516     std::vector<uint8_t> output(std::max(
2517       batch_size() * ((output_height() * output_width() - 1) * output_channel_stride() + groups() * group_output_channels()),
2518       next_batch_size() * ((next_output_height() * next_output_width() - 1) * output_channel_stride() + groups() * group_output_channels())));
2519     std::vector<int32_t> accumulators(batch_size() * output_height() * output_width() * groups() * group_output_channels());
2520     std::vector<double> output_ref(batch_size() * output_height() * output_width() * groups() * group_output_channels());
2521     std::vector<int32_t> next_accumulators(next_batch_size() * next_output_height() * next_output_width() * groups() * group_output_channels());
2522     std::vector<double> next_output_ref(next_batch_size() * next_output_height() * next_output_width() * groups() * group_output_channels());
2523 
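    // QU8 quantization is asymmetric for both activations and weights; both zero points
    // are fixed at 127 here and subtracted in the reference computation below.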
2524     const uint8_t input_zero_point = 127;
2525     const uint8_t kernel_zero_point = 127;
2526 
2527     for (size_t iteration = 0; iteration < iterations(); iteration++) {
2528       std::generate(input.begin(), input.end(), [&]() { return u8dist(rng); });
2529       std::generate(kernel.begin(), kernel.end(), [&]() { return u8dist(rng); });
2530       std::generate(bias.begin(), bias.end(), [&]() { return i32dist(rng); });
2531       std::fill(output.begin(), output.end(), UINT8_C(0xA5));
2532 
2533       // Compute reference results, without renormalization.
2534       if (has_bias()) {
2535         for (size_t i = 0; i < batch_size(); i++) {
2536           for (size_t oy = 0; oy < output_height(); oy++) {
2537             for (size_t ox = 0; ox < output_width(); ox++) {
2538               for (size_t g = 0; g < groups(); g++) {
2539                 for (size_t oc = 0; oc < group_output_channels(); oc++) {
2540                   accumulators[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] =
2541                     bias[g * group_output_channels() + oc];
2542                 }
2543               }
2544             }
2545           }
2546         }
2547       } else {
2548         std::fill(accumulators.begin(), accumulators.end(), 0);
2549       }
2550       for (size_t i = 0; i < batch_size(); i++) {
2551         for (size_t oy = 0; oy < output_height(); oy++) {
2552           for (size_t ox = 0; ox < output_width(); ox++) {
2553             for (size_t ky = 0; ky < kernel_height(); ky++) {
2554               const size_t iy = oy * subsampling_height() + ky * dilation_height() - padding_top();
2555               if (iy < input_height()) {
2556                 for (size_t kx = 0; kx < kernel_width(); kx++) {
2557                   const size_t ix = ox * subsampling_width() + kx * dilation_width() - padding_left();
2558                   if (ix < input_width()) {
2559                     for (size_t g = 0; g < groups(); g++) {
2560                       for (size_t oc = 0; oc < group_output_channels(); oc++) {
2561                         for (size_t ic = 0; ic < group_input_channels(); ic++) {
2562                           accumulators[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] +=
2563                             (int32_t(input[((i * input_height() + iy) * input_width() + ix) * input_channel_stride() + g * group_input_channels() + ic]) - int32_t(input_zero_point)) *
2564                             (int32_t(kernel[(((g * group_output_channels() + oc) * kernel_height() + ky) * kernel_width() + kx) * group_input_channels() + ic]) - int32_t(kernel_zero_point));
2565                         }
2566                       }
2567                     }
2568                   }
2569                 }
2570               }
2571             }
2572           }
2573         }
2574       }
2575 
2576       // Compute renormalization parameters.
2577       const int32_t accumulated_min = *std::min_element(accumulators.cbegin(), accumulators.cend());
2578       const int32_t accumulated_max = *std::max_element(accumulators.cbegin(), accumulators.cend());
2579 
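      // Same idea as the QS8 path, except the zero point is biased by 127.5 so that
      // accumulated_min maps to roughly 0 and accumulated_max to roughly 255 in the
      // unsigned output range.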
2580       const double output_scale = double(uint32_t(accumulated_max - accumulated_min)) / 255.0;
2581       const uint8_t output_zero_point = uint8_t(std::max(std::min(
2582         lrint(127.5 - 0.5 * double(accumulated_min + accumulated_max) / output_scale),
2583         long(std::numeric_limits<uint8_t>::max())), long(std::numeric_limits<uint8_t>::min())));
2584 
2585       // Renormalize reference results.
2586       std::transform(accumulators.cbegin(), accumulators.cend(), output_ref.begin(),
2587         [this, output_scale, output_zero_point](int32_t x) -> double {
2588           return std::max<double>(std::min<double>(double(x) / output_scale, double(qmax()) - output_zero_point), double(qmin()) - output_zero_point);
2589         });
2590 
2591       // Create, set up, and run the Convolution operator once.
2592       ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
2593       xnn_operator_t convolution_op = nullptr;
2594 
2595       xnn_status status = xnn_create_convolution2d_nhwc_qu8(
2596           padding_top(), padding_right(), padding_bottom(), padding_left(),
2597           kernel_height(), kernel_width(),
2598           subsampling_height(), subsampling_width(),
2599           dilation_height(), dilation_width(),
2600           groups(), group_input_channels(), group_output_channels(),
2601           input_channel_stride(), output_channel_stride(),
2602           input_zero_point, 1.0f /* input scale */,
2603           kernel_zero_point, 1.0f /* kernel scale */,
2604           kernel.data(), has_bias() ? bias.data() : nullptr,
2605           output_zero_point, output_scale, qmin(), qmax(),
2606           0, NULL, &convolution_op);
2607       if (status == xnn_status_unsupported_hardware) {
2608         GTEST_SKIP();
2609       }
2610       ASSERT_EQ(xnn_status_success, status);
2611       ASSERT_NE(nullptr, convolution_op);
2612 
2613       // Smart pointer to automatically delete convolution_op.
2614       std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_convolution_op(convolution_op, xnn_delete_operator);
2615 
2616       ASSERT_EQ(xnn_status_success,
2617         xnn_setup_convolution2d_nhwc_qu8(
2618           convolution_op,
2619           batch_size(), input_height(), input_width(),
2620           input.data(), output.data(),
2621           nullptr /* thread pool */));
2622 
2623       ASSERT_EQ(xnn_status_success,
2624         xnn_run_operator(convolution_op, nullptr /* thread pool */));
2625 
2626       // Verify results of the first run.
2627       for (size_t i = 0; i < batch_size(); i++) {
2628         for (size_t y = 0; y < output_height(); y++) {
2629           for (size_t x = 0; x < output_width(); x++) {
2630             for (size_t g = 0; g < groups(); g++) {
2631               for (size_t c = 0; c < group_output_channels(); c++) {
2632                 ASSERT_LE(int32_t(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), int32_t(qmax()))
2633                   << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
2634                 ASSERT_GE(int32_t(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), int32_t(qmin()))
2635                   << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
2636                 ASSERT_NEAR(
2637                     output_ref[(((i * output_height() + y) * output_width() + x) * groups() + g) * group_output_channels() + c],
2638                     double(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c]) - double(output_zero_point),
2639                     0.9)
2640                   << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
2641               }
2642             }
2643           }
2644         }
2645       }
2646 
2647       // Re-generate data for the second run.
2648       std::generate(input.begin(), input.end(), [&]() { return u8dist(rng); });
2649       std::fill(output.begin(), output.end(), UINT8_C(0xA5));
2650 
2651       // Compute reference results for the second run, including renormalization.
2652       if (has_bias()) {
2653         for (size_t i = 0; i < next_batch_size(); i++) {
2654           for (size_t oy = 0; oy < next_output_height(); oy++) {
2655             for (size_t ox = 0; ox < next_output_width(); ox++) {
2656               for (size_t g = 0; g < groups(); g++) {
2657                 for (size_t oc = 0; oc < group_output_channels(); oc++) {
2658                   next_accumulators[(((i * next_output_height() + oy) * next_output_width() + ox) * groups() + g) * group_output_channels() + oc] =
2659                     bias[g * group_output_channels() + oc];
2660                 }
2661               }
2662             }
2663           }
2664         }
2665       } else {
2666         std::fill(next_accumulators.begin(), next_accumulators.end(), 0);
2667       }
2668       for (size_t i = 0; i < next_batch_size(); i++) {
2669         for (size_t oy = 0; oy < next_output_height(); oy++) {
2670           for (size_t ox = 0; ox < next_output_width(); ox++) {
2671             for (size_t ky = 0; ky < kernel_height(); ky++) {
2672               const size_t iy = oy * subsampling_height() + ky * dilation_height() - padding_top();
2673               if (iy < next_input_height()) {
2674                 for (size_t kx = 0; kx < kernel_width(); kx++) {
2675                   const size_t ix = ox * subsampling_width() + kx * dilation_width() - padding_left();
2676                   if (ix < next_input_width()) {
2677                     for (size_t g = 0; g < groups(); g++) {
2678                       for (size_t oc = 0; oc < group_output_channels(); oc++) {
2679                         for (size_t ic = 0; ic < group_input_channels(); ic++) {
2680                           next_accumulators[(((i * next_output_height() + oy) * next_output_width() + ox) * groups() + g) * group_output_channels() + oc] +=
2681                             (int32_t(input[((i * next_input_height() + iy) * next_input_width() + ix) * input_channel_stride() + g * group_input_channels() + ic]) - int32_t(input_zero_point)) *
2682                             (int32_t(kernel[(((g * group_output_channels() + oc) * kernel_height() + ky) * kernel_width() + kx) * group_input_channels() + ic]) - int32_t(kernel_zero_point));
2683                         }
2684                       }
2685                     }
2686                   }
2687                 }
2688               }
2689             }
2690           }
2691         }
2692       }
2693       std::transform(next_accumulators.cbegin(), next_accumulators.cend(), next_output_ref.begin(),
2694         [this, output_scale, output_zero_point](int32_t x) -> double {
2695           return std::max<double>(std::min<double>(double(x) / output_scale, double(qmax()) - output_zero_point), double(qmin()) - output_zero_point);
2696         });
2697 
2698       // Set up and run the Convolution operator a second time, and destroy the operator.
2699       ASSERT_EQ(xnn_status_success,
2700         xnn_setup_convolution2d_nhwc_qu8(
2701           convolution_op,
2702           next_batch_size(), next_input_height(), next_input_width(),
2703           input.data(), output.data(),
2704           nullptr /* thread pool */));
2705 
2706       ASSERT_EQ(xnn_status_success,
2707         xnn_run_operator(convolution_op, nullptr /* thread pool */));
2708 
2709       // Verify results of the second run.
2710       for (size_t i = 0; i < next_batch_size(); i++) {
2711         for (size_t y = 0; y < next_output_height(); y++) {
2712           for (size_t x = 0; x < next_output_width(); x++) {
2713             for (size_t g = 0; g < groups(); g++) {
2714               for (size_t c = 0; c < group_output_channels(); c++) {
2715                 ASSERT_LE(int32_t(output[((i * next_output_height() + y) * next_output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), int32_t(qmax()))
2716                   << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
2717                 ASSERT_GE(int32_t(output[((i * next_output_height() + y) * next_output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), int32_t(qmin()))
2718                   << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
2719                 ASSERT_NEAR(
2720                     next_output_ref[(((i * next_output_height() + y) * next_output_width() + x) * groups() + g) * group_output_channels() + c],
2721                     double(output[((i * next_output_height() + y) * next_output_width() + x) * output_channel_stride() + g * group_output_channels() + c]) - double(output_zero_point),
2722                     0.9)
2723                   << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
2724               }
2725             }
2726           }
2727         }
2728       }
2729     }
2730   }
2731 
2732   void TestSetupNHWCxF16() const {
2733     ASSERT_EQ(weights_type(), WeightsType::Default);
2734 
2735     ASSERT_FALSE(depthwise_layout());
2736 
2737     std::random_device random_device;
2738     auto rng = std::mt19937(random_device());
2739     std::uniform_real_distribution<float> f32dist(0.1f, 1.0f);
2740 
2741     std::vector<uint16_t> input(XNN_EXTRA_BYTES / sizeof(uint16_t) + std::max(
2742       batch_size() * ((input_height() * input_width() - 1) * input_channel_stride() + groups() * group_input_channels()),
2743       next_batch_size() * ((next_input_height() * next_input_width() - 1) * input_channel_stride() + groups() * group_input_channels())));
2744     std::vector<uint16_t> kernel(groups() * group_output_channels() * kernel_height() * kernel_width() * group_input_channels());
2745     std::vector<uint16_t> bias(groups() * group_output_channels());
2746     std::vector<uint16_t> output(std::max(
2747       batch_size() * ((output_height() * output_width() - 1) * output_channel_stride() + groups() * group_output_channels()),
2748       next_batch_size() * ((next_output_height() * next_output_width() - 1) * output_channel_stride() + groups() * group_output_channels())));
2749     std::vector<float> output_ref(batch_size() * output_height() * output_width() * groups() * group_output_channels());
2750     std::vector<float> next_output_ref(next_batch_size() * next_output_height() * next_output_width() * groups() * group_output_channels());
2751 
2752     for (size_t iteration = 0; iteration < iterations(); iteration++) {
2753       std::generate(input.begin(), input.end(), [&]() { return fp16_ieee_from_fp32_value(f32dist(rng)); });
2754       std::generate(kernel.begin(), kernel.end(), [&]() { return fp16_ieee_from_fp32_value(f32dist(rng)); });
2755       std::generate(bias.begin(), bias.end(), [&]() { return fp16_ieee_from_fp32_value(f32dist(rng)); });
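      // Pre-fill the output with NaN (0x7E00 in IEEE half precision) so any element the
      // operator leaves untouched makes ASSERT_NEAR fail.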
2756       std::fill(output.begin(), output.end(), UINT16_C(0x7E00) /* NaN */);
2757 
2758       // Compute reference results, without clamping.
2759       if (has_bias()) {
2760         for (size_t i = 0; i < batch_size(); i++) {
2761           for (size_t oy = 0; oy < output_height(); oy++) {
2762             for (size_t ox = 0; ox < output_width(); ox++) {
2763               for (size_t g = 0; g < groups(); g++) {
2764                 for (size_t oc = 0; oc < group_output_channels(); oc++) {
2765                   output_ref[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] =
2766                     fp16_ieee_to_fp32_value(bias[g * group_output_channels() + oc]);
2767                 }
2768               }
2769             }
2770           }
2771         }
2772       } else {
2773         std::fill(output_ref.begin(), output_ref.end(), 0.0f);
2774       }
2775       for (size_t i = 0; i < batch_size(); i++) {
2776         for (size_t oy = 0; oy < output_height(); oy++) {
2777           for (size_t ox = 0; ox < output_width(); ox++) {
2778             for (size_t ky = 0; ky < kernel_height(); ky++) {
2779               const size_t iy = oy * subsampling_height() + ky * dilation_height() - padding_top();
2780               if (iy < input_height()) {
2781                 for (size_t kx = 0; kx < kernel_width(); kx++) {
2782                   const size_t ix = ox * subsampling_width() + kx * dilation_width() - padding_left();
2783                   if (ix < input_width()) {
2784                     for (size_t g = 0; g < groups(); g++) {
2785                       for (size_t oc = 0; oc < group_output_channels(); oc++) {
2786                         for (size_t ic = 0; ic < group_input_channels(); ic++) {
2787                           output_ref[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] +=
2788                             fp16_ieee_to_fp32_value(input[((i * input_height() + iy) * input_width() + ix) * input_channel_stride() + g * group_input_channels() + ic]) *
2789                             fp16_ieee_to_fp32_value(kernel[(((g * group_output_channels() + oc) * kernel_height() + ky) * kernel_width() + kx) * group_input_channels() + ic]);
2790                         }
2791                       }
2792                     }
2793                   }
2794                 }
2795               }
2796             }
2797           }
2798         }
2799       }
2800 
2801       // Compute clamping parameters.
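      // The clamping thresholds are the qmin()/qmax() fractions of the observed output
      // range, rounded to fp16; if rounding collapses them to the same value, clamping
      // is effectively disabled by substituting +/-infinity.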
2802       const float accumulated_min = *std::min_element(output_ref.cbegin(), output_ref.cend());
2803       const float accumulated_max = *std::max_element(output_ref.cbegin(), output_ref.cend());
2804       const float accumulated_range = accumulated_max - accumulated_min;
2805       const float scaled_min = fp16_ieee_to_fp32_value(fp16_ieee_from_fp32_value(accumulated_min + accumulated_range / 255.0f * float(qmin())));
2806       const float scaled_max = fp16_ieee_to_fp32_value(fp16_ieee_from_fp32_value(accumulated_max - accumulated_range / 255.0f * float(255 - qmax())));
2807       const float output_min = scaled_min == scaled_max ? -std::numeric_limits<float>::infinity() : scaled_min;
2808       const float output_max = scaled_min == scaled_max ? +std::numeric_limits<float>::infinity() : scaled_max;
2809 
2810       for (float& output_value : output_ref) {
2811         output_value = std::min(std::max(output_value, output_min), output_max);
2812       }
2813 
2814       // Create, set up, and run the Convolution operator once.
2815       ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
2816       xnn_operator_t convolution_op = nullptr;
2817 
2818       xnn_status status = xnn_create_convolution2d_nhwc_f16(
2819           padding_top(), padding_right(), padding_bottom(), padding_left(),
2820           kernel_height(), kernel_width(),
2821           subsampling_height(), subsampling_width(),
2822           dilation_height(), dilation_width(),
2823           groups(), group_input_channels(), group_output_channels(),
2824           input_channel_stride(), output_channel_stride(),
2825           kernel.data(), has_bias() ? bias.data() : nullptr,
2826           output_min, output_max,
2827           0, NULL, &convolution_op);
2828       if (status == xnn_status_unsupported_hardware) {
2829         GTEST_SKIP();
2830       }
2831       ASSERT_EQ(xnn_status_success, status);
2832       ASSERT_NE(nullptr, convolution_op);
2833 
2834       // Smart pointer to automatically delete convolution_op.
2835       std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_convolution_op(convolution_op, xnn_delete_operator);
2836 
2837       ASSERT_EQ(xnn_status_success,
2838         xnn_setup_convolution2d_nhwc_f16(
2839           convolution_op,
2840           batch_size(), input_height(), input_width(),
2841           input.data(), output.data(),
2842           nullptr /* thread pool */));
2843 
2844       ASSERT_EQ(xnn_status_success,
2845         xnn_run_operator(convolution_op, nullptr /* thread pool */));
2846 
2847       // Verify results of the first run.
2848       for (size_t i = 0; i < batch_size(); i++) {
2849         for (size_t y = 0; y < output_height(); y++) {
2850           for (size_t x = 0; x < output_width(); x++) {
2851             for (size_t g = 0; g < groups(); g++) {
2852               for (size_t c = 0; c < group_output_channels(); c++) {
2853                 ASSERT_GE(fp16_ieee_to_fp32_value(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), output_min)
2854                   << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
2855                 ASSERT_LE(fp16_ieee_to_fp32_value(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), output_max)
2856                   << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
2857                 ASSERT_NEAR(output_ref[(((i * output_height() + y) * output_width() + x) * groups() + g) * group_output_channels() + c], fp16_ieee_to_fp32_value(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), std::max(1.0e-4f, std::abs(output_ref[(((i * output_height() + y) * output_width() + x) * groups() + g) * group_output_channels() + c]) * 1.0e-2f))
2858                   << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
2859               }
2860             }
2861           }
2862         }
2863       }
2864 
2865       // Re-generate data for the second run.
2866       std::generate(input.begin(), input.end(), [&]() { return fp16_ieee_from_fp32_value(f32dist(rng)); });
2867       std::fill(output.begin(), output.end(), UINT16_C(0x7E00) /* NaN */);
2868 
2869       // Compute reference results for the second run, including clamping.
2870       if (has_bias()) {
2871         for (size_t i = 0; i < next_batch_size(); i++) {
2872           for (size_t oy = 0; oy < next_output_height(); oy++) {
2873             for (size_t ox = 0; ox < next_output_width(); ox++) {
2874               for (size_t g = 0; g < groups(); g++) {
2875                 for (size_t oc = 0; oc < group_output_channels(); oc++) {
2876                   next_output_ref[(((i * next_output_height() + oy) * next_output_width() + ox) * groups() + g) * group_output_channels() + oc] =
2877                     fp16_ieee_to_fp32_value(bias[g * group_output_channels() + oc]);
2878                 }
2879               }
2880             }
2881           }
2882         }
2883       } else {
2884         std::fill(next_output_ref.begin(), next_output_ref.end(), 0.0f);
2885       }
2886       for (size_t i = 0; i < next_batch_size(); i++) {
2887         for (size_t oy = 0; oy < next_output_height(); oy++) {
2888           for (size_t ox = 0; ox < next_output_width(); ox++) {
2889             for (size_t ky = 0; ky < kernel_height(); ky++) {
2890               const size_t iy = oy * subsampling_height() + ky * dilation_height() - padding_top();
2891               if (iy < next_input_height()) {
2892                 for (size_t kx = 0; kx < kernel_width(); kx++) {
2893                   const size_t ix = ox * subsampling_width() + kx * dilation_width() - padding_left();
2894                   if (ix < next_input_width()) {
2895                     for (size_t g = 0; g < groups(); g++) {
2896                       for (size_t oc = 0; oc < group_output_channels(); oc++) {
2897                         for (size_t ic = 0; ic < group_input_channels(); ic++) {
2898                           next_output_ref[(((i * next_output_height() + oy) * next_output_width() + ox) * groups() + g) * group_output_channels() + oc] +=
2899                             fp16_ieee_to_fp32_value(input[((i * next_input_height() + iy) * next_input_width() + ix) * input_channel_stride() + g * group_input_channels() + ic]) *
2900                             fp16_ieee_to_fp32_value(kernel[(((g * group_output_channels() + oc) * kernel_height() + ky) * kernel_width() + kx) * group_input_channels() + ic]);
2901                         }
2902                       }
2903                     }
2904                   }
2905                 }
2906               }
2907             }
2908           }
2909         }
2910       }
2911       for (float& value : next_output_ref) {
2912         value = std::max(std::min(value, output_max), output_min);
2913       }
2914 
2915       // Set up and run the Convolution operator a second time, and destroy the operator.
2916       ASSERT_EQ(xnn_status_success,
2917         xnn_setup_convolution2d_nhwc_f16(
2918           convolution_op,
2919           next_batch_size(), next_input_height(), next_input_width(),
2920           input.data(), output.data(),
2921           nullptr /* thread pool */));
2922 
2923       ASSERT_EQ(xnn_status_success,
2924         xnn_run_operator(convolution_op, nullptr /* thread pool */));
2925 
2926       // Verify results of the second run.
2927       for (size_t i = 0; i < next_batch_size(); i++) {
2928         for (size_t y = 0; y < next_output_height(); y++) {
2929           for (size_t x = 0; x < next_output_width(); x++) {
2930             for (size_t g = 0; g < groups(); g++) {
2931               for (size_t c = 0; c < group_output_channels(); c++) {
2932                 ASSERT_GE(fp16_ieee_to_fp32_value(output[((i * next_output_height() + y) * next_output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), output_min)
2933                   << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
2934                 ASSERT_LE(fp16_ieee_to_fp32_value(output[((i * next_output_height() + y) * next_output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), output_max)
2935                   << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
2936                 ASSERT_NEAR(next_output_ref[(((i * next_output_height() + y) * next_output_width() + x) * groups() + g) * group_output_channels() + c], fp16_ieee_to_fp32_value(output[((i * next_output_height() + y) * next_output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), std::max(1.0e-4f, std::abs(next_output_ref[(((i * next_output_height() + y) * next_output_width() + x) * groups() + g) * group_output_channels() + c]) * 1.0e-2f))
2937                   << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
2938               }
2939             }
2940           }
2941         }
2942       }
2943     }
2944   }
2945 
2946   void TestSetupNHWCxF32() const {
2947     ASSERT_EQ(weights_type(), WeightsType::Default);
2948 
2949     ASSERT_FALSE(depthwise_layout());
2950 
2951     std::random_device random_device;
2952     auto rng = std::mt19937(random_device());
2953     std::uniform_real_distribution<float> f32dist(0.1f, 1.0f);
2954 
2955     std::vector<float> input(XNN_EXTRA_BYTES / sizeof(float) + std::max(
2956       batch_size() * ((input_height() * input_width() - 1) * input_channel_stride() + groups() * group_input_channels()),
2957       next_batch_size() * ((next_input_height() * next_input_width() - 1) * input_channel_stride() + groups() * group_input_channels())));
2958     std::vector<float> kernel(groups() * group_output_channels() * kernel_height() * kernel_width() * group_input_channels());
2959     std::vector<float> bias(groups() * group_output_channels());
2960     std::vector<float> output(std::max(
2961       batch_size() * ((output_height() * output_width() - 1) * output_channel_stride() + groups() * group_output_channels()),
2962       next_batch_size() * ((next_output_height() * next_output_width() - 1) * output_channel_stride() + groups() * group_output_channels())));
2963     std::vector<float> output_ref(batch_size() * output_height() * output_width() * groups() * group_output_channels());
2964     std::vector<float> next_output_ref(next_batch_size() * next_output_height() * next_output_width() * groups() * group_output_channels());
2965 
2966     for (size_t iteration = 0; iteration < iterations(); iteration++) {
2967       std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); });
2968       std::generate(kernel.begin(), kernel.end(), [&]() { return f32dist(rng); });
2969       std::generate(bias.begin(), bias.end(), [&]() { return f32dist(rng); });
2970       std::fill(output.begin(), output.end(), nanf(""));
2971 
2972       // Compute reference results, without clamping.
2973       if (has_bias()) {
2974         for (size_t i = 0; i < batch_size(); i++) {
2975           for (size_t oy = 0; oy < output_height(); oy++) {
2976             for (size_t ox = 0; ox < output_width(); ox++) {
2977               for (size_t g = 0; g < groups(); g++) {
2978                 for (size_t oc = 0; oc < group_output_channels(); oc++) {
2979                   output_ref[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] =
2980                     bias[g * group_output_channels() + oc];
2981                 }
2982               }
2983             }
2984           }
2985         }
2986       } else {
2987         std::fill(output_ref.begin(), output_ref.end(), 0.0f);
2988       }
2989       for (size_t i = 0; i < batch_size(); i++) {
2990         for (size_t oy = 0; oy < output_height(); oy++) {
2991           for (size_t ox = 0; ox < output_width(); ox++) {
2992             for (size_t ky = 0; ky < kernel_height(); ky++) {
2993               const size_t iy = oy * subsampling_height() + ky * dilation_height() - padding_top();
2994               if (iy < input_height()) {
2995                 for (size_t kx = 0; kx < kernel_width(); kx++) {
2996                   const size_t ix = ox * subsampling_width() + kx * dilation_width() - padding_left();
2997                   if (ix < input_width()) {
2998                     for (size_t g = 0; g < groups(); g++) {
2999                       for (size_t oc = 0; oc < group_output_channels(); oc++) {
3000                         for (size_t ic = 0; ic < group_input_channels(); ic++) {
3001                           output_ref[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] +=
3002                             input[((i * input_height() + iy) * input_width() + ix) * input_channel_stride() + g * group_input_channels() + ic] *
3003                             kernel[(((g * group_output_channels() + oc) * kernel_height() + ky) * kernel_width() + kx) * group_input_channels() + ic];
3004                         }
3005                       }
3006                     }
3007                   }
3008                 }
3009               }
3010             }
3011           }
3012         }
3013       }
3014 
3015       // Compute clamping parameters.
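      // output_min trims the bottom qmin()/255 of the observed range and output_max
      // trims the top (255 - qmax())/255, so non-default qmin()/qmax() settings make
      // the clamping take effect on real outputs.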
3016       const float accumulated_min = *std::min_element(output_ref.cbegin(), output_ref.cend());
3017       const float accumulated_max = *std::max_element(output_ref.cbegin(), output_ref.cend());
3018 
3019       const float output_min = accumulated_min + (accumulated_max - accumulated_min) / 255.0f * float(qmin());
3020       const float output_max = accumulated_max - (accumulated_max - accumulated_min) / 255.0f * float(255 - qmax());
3021 
3022       // Clamp reference results.
3023       for (float& value : output_ref) {
3024         value = std::max(std::min(value, output_max), output_min);
3025       }
3026 
3027       // Create, set up, and run the Convolution operator once.
3028       ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
3029       xnn_operator_t convolution_op = nullptr;
3030 
3031       xnn_status status = xnn_create_convolution2d_nhwc_f32(
3032           padding_top(), padding_right(), padding_bottom(), padding_left(),
3033           kernel_height(), kernel_width(),
3034           subsampling_height(), subsampling_width(),
3035           dilation_height(), dilation_width(),
3036           groups(), group_input_channels(), group_output_channels(),
3037           input_channel_stride(), output_channel_stride(),
3038           kernel.data(), has_bias() ? bias.data() : nullptr,
3039           output_min, output_max,
3040           0, NULL, &convolution_op);
3041       if (status == xnn_status_unsupported_hardware) {
3042         GTEST_SKIP();
3043       }
3044       ASSERT_EQ(xnn_status_success, status);
3045       ASSERT_NE(nullptr, convolution_op);
3046 
3047       // Smart pointer to automatically delete convolution_op.
3048       std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_convolution_op(convolution_op, xnn_delete_operator);
3049 
3050       ASSERT_EQ(xnn_status_success,
3051         xnn_setup_convolution2d_nhwc_f32(
3052           convolution_op,
3053           batch_size(), input_height(), input_width(),
3054           input.data(), output.data(),
3055           nullptr /* thread pool */));
3056 
3057       ASSERT_EQ(xnn_status_success,
3058         xnn_run_operator(convolution_op, nullptr /* thread pool */));
3059 
3060       // Verify results of the first run.
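      // The tolerance is relative (1.0e-4 of the reference magnitude), reflecting the
      // rounding error of single-precision accumulation.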
3061       for (size_t i = 0; i < batch_size(); i++) {
3062         for (size_t y = 0; y < output_height(); y++) {
3063           for (size_t x = 0; x < output_width(); x++) {
3064             for (size_t g = 0; g < groups(); g++) {
3065               for (size_t c = 0; c < group_output_channels(); c++) {
3066                 ASSERT_GE(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c], output_min)
3067                   << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
3068                 ASSERT_LE(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c], output_max)
3069                   << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
3070                 ASSERT_NEAR(
3071                     output_ref[(((i * output_height() + y) * output_width() + x) * groups() + g) * group_output_channels() + c],
3072                     output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c],
3073                     1.0e-4 * std::abs(output_ref[(((i * output_height() + y) * output_width() + x) * groups() + g) * group_output_channels() + c]))
3074                   << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
3075               }
3076             }
3077           }
3078         }
3079       }
3080 
3081       // Re-generate data for the second run.
3082       std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); });
3083       std::fill(output.begin(), output.end(), nanf(""));
3084 
3085       // Compute reference results for the second run, including clamping.
3086       if (has_bias()) {
3087         for (size_t i = 0; i < next_batch_size(); i++) {
3088           for (size_t oy = 0; oy < next_output_height(); oy++) {
3089             for (size_t ox = 0; ox < next_output_width(); ox++) {
3090               for (size_t g = 0; g < groups(); g++) {
3091                 for (size_t oc = 0; oc < group_output_channels(); oc++) {
3092                   next_output_ref[(((i * next_output_height() + oy) * next_output_width() + ox) * groups() + g) * group_output_channels() + oc] =
3093                     bias[g * group_output_channels() + oc];
3094                 }
3095               }
3096             }
3097           }
3098         }
3099       } else {
3100         std::fill(next_output_ref.begin(), next_output_ref.end(), 0.0f);
3101       }
3102       for (size_t i = 0; i < next_batch_size(); i++) {
3103         for (size_t oy = 0; oy < next_output_height(); oy++) {
3104           for (size_t ox = 0; ox < next_output_width(); ox++) {
3105             for (size_t ky = 0; ky < kernel_height(); ky++) {
3106               const size_t iy = oy * subsampling_height() + ky * dilation_height() - padding_top();
3107               if (iy < next_input_height()) {
3108                 for (size_t kx = 0; kx < kernel_width(); kx++) {
3109                   const size_t ix = ox * subsampling_width() + kx * dilation_width() - padding_left();
3110                   if (ix < next_input_width()) {
3111                     for (size_t g = 0; g < groups(); g++) {
3112                       for (size_t oc = 0; oc < group_output_channels(); oc++) {
3113                         for (size_t ic = 0; ic < group_input_channels(); ic++) {
3114                           next_output_ref[(((i * next_output_height() + oy) * next_output_width() + ox) * groups() + g) * group_output_channels() + oc] +=
3115                             input[((i * next_input_height() + iy) * next_input_width() + ix) * input_channel_stride() + g * group_input_channels() + ic] *
3116                             kernel[(((g * group_output_channels() + oc) * kernel_height() + ky) * kernel_width() + kx) * group_input_channels() + ic];
3117                         }
3118                       }
3119                     }
3120                   }
3121                 }
3122               }
3123             }
3124           }
3125         }
3126       }
3127       for (float& value : next_output_ref) {
3128         value = std::max(std::min(value, output_max), output_min);
3129       }
3130 
3131       // Set up and run the Convolution operator a second time, and destroy the operator.
3132       ASSERT_EQ(xnn_status_success,
3133         xnn_setup_convolution2d_nhwc_f32(
3134           convolution_op,
3135           next_batch_size(), next_input_height(), next_input_width(),
3136           input.data(), output.data(),
3137           nullptr /* thread pool */));
3138 
3139       ASSERT_EQ(xnn_status_success,
3140         xnn_run_operator(convolution_op, nullptr /* thread pool */));
3141 
3142       // Verify results of the second run.
3143       for (size_t i = 0; i < next_batch_size(); i++) {
3144         for (size_t y = 0; y < next_output_height(); y++) {
3145           for (size_t x = 0; x < next_output_width(); x++) {
3146             for (size_t g = 0; g < groups(); g++) {
3147               for (size_t c = 0; c < group_output_channels(); c++) {
3148                 ASSERT_GE(output[((i * next_output_height() + y) * next_output_width() + x) * output_channel_stride() + g * group_output_channels() + c], output_min)
3149                   << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
3150                 ASSERT_LE(output[((i * next_output_height() + y) * next_output_width() + x) * output_channel_stride() + g * group_output_channels() + c], output_max)
3151                   << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
3152                 ASSERT_NEAR(
3153                     next_output_ref[(((i * next_output_height() + y) * next_output_width() + x) * groups() + g) * group_output_channels() + c],
3154                     output[((i * next_output_height() + y) * next_output_width() + x) * output_channel_stride() + g * group_output_channels() + c],
3155                     1.0e-4 * std::abs(next_output_ref[(((i * next_output_height() + y) * next_output_width() + x) * groups() + g) * group_output_channels() + c]))
3156                   << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
3157               }
3158             }
3159           }
3160         }
3161       }
3162     }
3163   }
3164 
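  // Weights-cache checks: exactly one cache hit with an unchanged weights size
  // indicates that packed weights were reused rather than packed again; zero hits and
  // an empty cache indicate the cache was never consulted.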
3165   void VerifyWeightsCache(const xnn_weights_cache &weights_cache, size_t old_size) const {
3166     ASSERT_EQ(weights_cache.cache.hits, 1);
3167     // Since this was a cache hit, ensure that no additional weights were written
3168     // to the cache.
3169     ASSERT_EQ(old_size, weights_cache.cache.weights.size);
3170   }
3171 
3172   void VerifyWeightsCacheUnused(const xnn_weights_cache &weights_cache) const {
3173     ASSERT_EQ(weights_cache.cache.hits, 0);
3174     ASSERT_EQ(0, weights_cache.cache.weights.size);
3175   }
3176 
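  // True for configurations that reduce to a plain matrix multiplication: a 1x1 kernel
  // with unit stride, no padding, a single group, and no forced NHWC input. These are
  // presumably the cases eligible for the sparse (SpMM) code path.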
3177   bool IsSpmm() const {
3178     const bool is_1x1 = kernel_width() == 1 && kernel_height() == 1 &&
3179         subsampling_height() == 1 && subsampling_width() == 1;
3180     const bool any_padding = (padding_left() | padding_top() | padding_right() | padding_bottom()) != 0;
3181     return is_1x1 && !any_padding && !force_nhwc_input() && groups() == 1;
3182   }
3183 
3184  private:
3185   uint32_t padding_top_{0};
3186   uint32_t padding_right_{0};
3187   uint32_t padding_bottom_{0};
3188   uint32_t padding_left_{0};
3189   bool padding_tf_same_{false};
3190   size_t input_height_{1};
3191   size_t input_width_{1};
3192   uint32_t groups_{1};
3193   size_t group_input_channels_{1};
3194   size_t input_channel_stride_{0};
3195   size_t group_output_channels_{1};
3196   size_t output_channel_stride_{0};
3197   size_t batch_size_{1};
3198   uint32_t kernel_height_{1};
3199   uint32_t kernel_width_{1};
3200   uint32_t dilation_height_{1};
3201   uint32_t dilation_width_{1};
3202   uint32_t subsampling_height_{1};
3203   uint32_t subsampling_width_{1};
3204   size_t next_input_height_{0};
3205   size_t next_input_width_{0};
3206   size_t next_batch_size_{0};
3207   float sparsity_{0.0f};
3208   uint8_t qmin_{0};
3209   uint8_t qmax_{255};
3210   bool depthwise_layout_{false};
3211   bool force_nhwc_input_{false};
3212   bool has_bias_{true};
3213   WeightsType weights_type_{WeightsType::Default};
3214   size_t iterations_{1};
3215 #if XNN_PLATFORM_JIT
3216   bool use_jit_{false};
3217 #endif
3218   bool use_weights_cache_{false};
3219 };
3220