xref: /aosp_15_r20/external/XNNPACK/test/conv-hwc2chw-microkernel-tester.h (revision 4bdc94577ba0e567308109d787f7fec7b531ce36)
1 // Copyright 2019 Google LLC
2 //
3 // This source code is licensed under the BSD-style license found in the
4 // LICENSE file in the root directory of this source tree.
5 
6 #pragma once
7 
8 #include <gtest/gtest.h>
9 
10 #include <algorithm>
11 #include <cassert>
12 #include <cmath>
13 #include <cstddef>
14 #include <cstdlib>
15 #include <limits>
16 #include <random>
17 #include <vector>
18 
19 #include <fp16.h>
20 
21 #include <xnnpack.h>
22 #include <xnnpack/aligned-allocator.h>
23 #include <xnnpack/pack.h>
24 #include <xnnpack/microfnptr.h>
25 #include <xnnpack/microparams-init.h>
26 
27 
28 class ConvHWC2CHWMicrokernelTester {
29 public:
30   enum class Variant {
31     Native,
32     Scalar,
33   };
34 
output_channels_tile(uint32_t output_channels_tile)35   inline ConvHWC2CHWMicrokernelTester& output_channels_tile(uint32_t output_channels_tile) {
36     this->output_channels_tile_ = output_channels_tile;
37     return *this;
38   }
39 
output_channels_tile()40   inline uint32_t output_channels_tile() const {
41     return this->output_channels_tile_;
42   }
43 
padding(uint32_t padding)44   inline ConvHWC2CHWMicrokernelTester& padding(uint32_t padding) {
45     this->padding_top_ = padding;
46     this->padding_right_ = padding;
47     this->padding_bottom_ = padding;
48     this->padding_left_ = padding;
49     return *this;
50   }
51 
padding_height(uint32_t padding_height)52   inline ConvHWC2CHWMicrokernelTester& padding_height(uint32_t padding_height) {
53     this->padding_top_ = padding_height;
54     this->padding_bottom_ = padding_height;
55     return *this;
56   }
57 
padding_width(uint32_t padding_width)58   inline ConvHWC2CHWMicrokernelTester& padding_width(uint32_t padding_width) {
59     this->padding_right_ = padding_width;
60     this->padding_left_ = padding_width;
61     return *this;
62   }
63 
padding_top(uint32_t padding_top)64   inline ConvHWC2CHWMicrokernelTester& padding_top(uint32_t padding_top) {
65     this->padding_top_ = padding_top;
66     return *this;
67   }
68 
padding_top()69   inline uint32_t padding_top() const {
70     return this->padding_top_;
71   }
72 
padding_right(uint32_t padding_right)73   inline ConvHWC2CHWMicrokernelTester& padding_right(uint32_t padding_right) {
74     this->padding_right_ = padding_right;
75     return *this;
76   }
77 
padding_right()78   inline uint32_t padding_right() const {
79     return this->padding_right_;
80   }
81 
padding_bottom(uint32_t padding_bottom)82   inline ConvHWC2CHWMicrokernelTester& padding_bottom(uint32_t padding_bottom) {
83     this->padding_bottom_ = padding_bottom;
84     return *this;
85   }
86 
padding_bottom()87   inline uint32_t padding_bottom() const {
88     return this->padding_bottom_;
89   }
90 
padding_left(uint32_t padding_left)91   inline ConvHWC2CHWMicrokernelTester& padding_left(uint32_t padding_left) {
92     this->padding_left_ = padding_left;
93     return *this;
94   }
95 
padding_left()96   inline uint32_t padding_left() const {
97     return this->padding_left_;
98   }
99 
input_size(uint32_t input_height,uint32_t input_width)100   inline ConvHWC2CHWMicrokernelTester& input_size(uint32_t input_height, uint32_t input_width) {
101     assert(input_height >= 1);
102     assert(input_width >= 1);
103     this->input_height_ = input_height;
104     this->input_width_ = input_width;
105     return *this;
106   }
107 
input_height(uint32_t input_height)108   inline ConvHWC2CHWMicrokernelTester& input_height(uint32_t input_height) {
109     assert(input_height >= 1);
110     this->input_height_ = input_height;
111     return *this;
112   }
113 
input_height()114   inline uint32_t input_height() const {
115     return this->input_height_;
116   }
117 
input_width(uint32_t input_width)118   inline ConvHWC2CHWMicrokernelTester& input_width(uint32_t input_width) {
119     assert(input_width >= 1);
120     this->input_width_ = input_width;
121     return *this;
122   }
123 
input_width()124   inline uint32_t input_width() const {
125     return this->input_width_;
126   }
127 
input_channels(size_t input_channels)128   inline ConvHWC2CHWMicrokernelTester& input_channels(size_t input_channels) {
129     assert(input_channels >= 1);
130     this->input_channels_ = input_channels;
131     return *this;
132   }
133 
input_channels()134   inline size_t input_channels() const {
135     return this->input_channels_;
136   }
137 
output_channels(size_t output_channels)138   inline ConvHWC2CHWMicrokernelTester& output_channels(size_t output_channels) {
139     assert(output_channels >= 1);
140     this->output_channels_ = output_channels;
141     return *this;
142   }
143 
output_channels()144   inline size_t output_channels() const {
145     return this->output_channels_;
146   }
147 
packed_output_channels()148   inline size_t packed_output_channels() const {
149     return output_channels() % output_channels_tile() == 0 ? output_channels() : output_channels() / output_channels_tile() * output_channels_tile() + output_channels_tile();
150   }
151 
batch_size(size_t batch_size)152   inline ConvHWC2CHWMicrokernelTester& batch_size(size_t batch_size) {
153     assert(batch_size >= 1);
154     this->batch_size_ = batch_size;
155     return *this;
156   }
157 
batch_size()158   inline size_t batch_size() const {
159     return this->batch_size_;
160   }
161 
kernel_size(uint32_t kernel_size)162   inline ConvHWC2CHWMicrokernelTester& kernel_size(uint32_t kernel_size) {
163     assert(kernel_size >= 1);
164     this->kernel_height_ = kernel_size;
165     this->kernel_width_ = kernel_size;
166     return *this;
167   }
168 
kernel_height(uint32_t kernel_height)169   inline ConvHWC2CHWMicrokernelTester& kernel_height(uint32_t kernel_height) {
170     assert(kernel_height >= 1);
171     this->kernel_height_ = kernel_height;
172     return *this;
173   }
174 
kernel_height()175   inline uint32_t kernel_height() const {
176     return this->kernel_height_;
177   }
178 
kernel_width(uint32_t kernel_width)179   inline ConvHWC2CHWMicrokernelTester& kernel_width(uint32_t kernel_width) {
180     assert(kernel_width >= 1);
181     this->kernel_width_ = kernel_width;
182     return *this;
183   }
184 
kernel_width()185   inline uint32_t kernel_width() const {
186     return this->kernel_width_;
187   }
188 
subsampling(uint32_t subsampling)189   inline ConvHWC2CHWMicrokernelTester& subsampling(uint32_t subsampling) {
190     assert(subsampling >= 1);
191     this->subsampling_height_ = subsampling;
192     this->subsampling_width_ = subsampling;
193     return *this;
194   }
195 
subsampling_height(uint32_t subsampling_height)196   inline ConvHWC2CHWMicrokernelTester& subsampling_height(uint32_t subsampling_height) {
197     assert(subsampling_height >= 1);
198     this->subsampling_height_ = subsampling_height;
199     return *this;
200   }
201 
subsampling_height()202   inline uint32_t subsampling_height() const {
203     return this->subsampling_height_;
204   }
205 
subsampling_width(uint32_t subsampling_width)206   inline ConvHWC2CHWMicrokernelTester& subsampling_width(uint32_t subsampling_width) {
207     assert(subsampling_width >= 1);
208     this->subsampling_width_ = subsampling_width;
209     return *this;
210   }
211 
subsampling_width()212   inline uint32_t subsampling_width() const {
213     return this->subsampling_width_;
214   }
215 
output_y_start(uint32_t output_y_start)216   inline ConvHWC2CHWMicrokernelTester& output_y_start(uint32_t output_y_start) {
217     this->output_y_start_ = output_y_start;
218     return *this;
219   }
220 
output_y_start()221   inline uint32_t output_y_start() const {
222     return this->output_y_start_;
223   }
224 
output_y_end(uint32_t output_y_end)225   inline ConvHWC2CHWMicrokernelTester& output_y_end(uint32_t output_y_end) {
226     this->output_y_end_ = output_y_end;
227     return *this;
228   }
229 
output_y_end()230   inline uint32_t output_y_end() const {
231     if (this->output_y_end_ == std::numeric_limits<uint32_t>::max()) {
232       return output_height();
233     } else {
234       return this->output_y_end_;
235     }
236   }
237 
input_pixel_stride()238   inline size_t input_pixel_stride() const {
239     return input_channels();
240   }
241 
output_pixel_stride()242   inline size_t output_pixel_stride() const {
243     return output_channels();
244   }
245 
output_height()246   inline size_t output_height() const {
247     const size_t padded_input_height = padding_top() + input_height() + padding_bottom();
248     if (padded_input_height < kernel_height()) {
249       return 0;
250     } else {
251       return (padded_input_height - kernel_height()) / subsampling_height() + 1;
252     }
253   }
254 
output_width()255   inline size_t output_width() const {
256     const size_t padded_input_width = padding_left() + input_width() + padding_right();
257     if (padded_input_width < kernel_width()) {
258       return 0;
259     } else {
260       return (padded_input_width - kernel_width()) / subsampling_width() + 1;
261     }
262   }
263 
qmin(uint8_t qmin)264   inline ConvHWC2CHWMicrokernelTester& qmin(uint8_t qmin) {
265     this->qmin_ = qmin;
266     return *this;
267   }
268 
qmin()269   inline uint8_t qmin() const {
270     return this->qmin_;
271   }
272 
qmax(uint8_t qmax)273   inline ConvHWC2CHWMicrokernelTester& qmax(uint8_t qmax) {
274     this->qmax_ = qmax;
275     return *this;
276   }
277 
qmax()278   inline uint8_t qmax() const {
279     return this->qmax_;
280   }
281 
iterations(size_t iterations)282   inline ConvHWC2CHWMicrokernelTester& iterations(size_t iterations) {
283     this->iterations_ = iterations;
284     return *this;
285   }
286 
iterations()287   inline size_t iterations() const {
288     return this->iterations_;
289   }
290 
291   void Test(xnn_f32_conv_hwc2chw_ukernel_function conv, Variant variant = Variant::Native) const {
292     ASSERT_LT(output_y_start(), output_height());
293     ASSERT_LE(output_y_end(), output_height());
294     ASSERT_GT(output_y_end(), output_y_start());
295     ASSERT_GE(output_width(), 1);
296     ASSERT_GE(output_height(), 1);
297 
298     std::random_device random_device;
299     auto rng = std::mt19937(random_device());
300     std::uniform_real_distribution<float> f32dist(0.1f, 1.0f);
301 
302     std::vector<float> input(XNN_EXTRA_BYTES / sizeof(float) +
303       batch_size() * ((input_height() * input_width() - 1) * input_pixel_stride() + input_channels()));
304     std::vector<float> zero(XNN_EXTRA_BYTES / sizeof(float) + input_width() * input_channels());
305     std::vector<float> kernel(output_channels() * kernel_height() * kernel_width() * input_channels());
306     std::vector<float> bias(output_channels());
307     std::vector<float> output(batch_size() * output_channels() * output_height() * output_width());
308     std::vector<float> output_ref(batch_size() * output_channels() * output_height() * output_width());
309     std::vector<float, AlignedAllocator<float, 64>> packed_weights((input_channels() * kernel_height() * kernel_width() + 1) * packed_output_channels());
310 
311     for (size_t iteration = 0; iteration < iterations(); iteration++) {
312       std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); });
313       std::generate(kernel.begin(), kernel.end(), [&]() { return f32dist(rng); });
314       std::generate(bias.begin(), bias.end(), [&]() { return f32dist(rng); });
315       std::fill(output.begin(), output.end(), nanf(""));
316       std::fill(packed_weights.begin(), packed_weights.end(), 0.0f);
317 
318       xnn_pack_f32_dconv_oki_w(
319         output_channels(),
320         input_channels(),
321         output_channels_tile(),
322         kernel_height(), kernel_width(),
323         kernel.data(), bias.data(), packed_weights.data(), nullptr);
324 
325       // Compute reference results, without clamping.
326       for (size_t i = 0; i < batch_size(); i++) {
327         for (size_t oy = 0; oy < output_height(); oy++) {
328           for (size_t ox = 0; ox < output_width(); ox++) {
329             for (size_t oc = 0; oc < output_channels(); oc++) {
330               float acc = bias[oc];
331               for (size_t ky = 0; ky < kernel_height(); ky++) {
332                 const size_t iy = oy * subsampling_height() + ky - padding_top();
333                 if (iy < input_height()) {
334                   for (size_t kx = 0; kx < kernel_width(); kx++) {
335                     const size_t ix = ox * subsampling_width() + kx - padding_left();
336                     if (ix < input_width()) {
337                       for (size_t ic = 0; ic < input_channels(); ic++) {
338                         acc +=
339                           input[((i * input_height() + iy) * input_width() + ix) * input_pixel_stride() + ic] *
340                           kernel[((oc * kernel_height() + ky) * kernel_width() + kx) * input_channels() + ic];
341                       }
342                     }
343                   }
344                 }
345               }
346               output_ref[((i * output_channels() + oc) * output_height() + oy) * output_width() + ox] = acc;
347             }
348           }
349         }
350       }
351 
352       // Compute clamping parameters.
353       const float accumulated_min = *std::min_element(output_ref.cbegin(), output_ref.cend());
354       const float accumulated_max = *std::max_element(output_ref.cbegin(), output_ref.cend());
355 
356       const float output_min = accumulated_min + (accumulated_max - accumulated_min) / 255.0f * float(qmin());
357       const float output_max = accumulated_max - (accumulated_max - accumulated_min) / 255.0f * float(255 - qmax());
358 
359       // Clamp reference results.
360       for (float& value : output_ref) {
361         value = std::max(std::min(value, output_max), output_min);
362       }
363 
364       // Prepare parameters.
365       xnn_f32_minmax_params params;
366       switch (variant) {
367         case Variant::Native:
368           xnn_init_f32_minmax_params(&params, output_min, output_max);
369           break;
370         case Variant::Scalar:
371           xnn_init_f32_minmax_scalar_params(&params, output_min, output_max);
372           break;
373       }
374 
375       // Call optimized micro-kernel.
376       conv(
377         input_height(), input_width(),
378         output_y_start(), output_y_end(),
379         input.data(), zero.data(), packed_weights.data(), output.data(),
380         padding_top(), output_channels(),
381         output_width() * sizeof(float),
382         output_height() * output_width() * sizeof(float),
383         &params);
384 
385       // Verify results.
386       for (size_t i = 0; i < batch_size(); i++) {
387         for (size_t y = output_y_start(); y < output_y_end(); y++) {
388           for (size_t x = 0; x < output_width(); x++) {
389             for (size_t c = 0; c < output_channels(); c++) {
390               ASSERT_GE(output[((i * output_channels() + c) * output_height() + y) * output_width() + x], output_min)
391                 << "(x, y) = (" << x << ", " << y << "), channel = " << c;
392               ASSERT_LE(output[((i * output_channels() + c) * output_height() + y) * output_width() + x], output_max)
393                 << "(x, y) = (" << x << ", " << y << "), channel = " << c;
394               ASSERT_NEAR(
395                   output_ref[((i * output_channels() + c) * output_height() + y) * output_width() + x],
396                   output[((i * output_channels() + c) * output_height() + y) * output_width() + x],
397                   1.0e-4 * std::abs(output_ref[((i * output_channels() + c) * output_height() + y) * output_width() + x]))
398                 << "(x, y) = (" << x << ", " << y << "), channel = " << c;
399             }
400           }
401         }
402       }
403     }
404   }
405 
Test(xnn_f16_conv_hwc2chw_ukernel_function conv,xnn_init_f16_minmax_params_fn init_params)406   void Test(xnn_f16_conv_hwc2chw_ukernel_function conv, xnn_init_f16_minmax_params_fn init_params) const {
407     ASSERT_LT(output_y_start(), output_height());
408     ASSERT_LE(output_y_end(), output_height());
409     ASSERT_GT(output_y_end(), output_y_start());
410     ASSERT_GE(output_width(), 1);
411     ASSERT_GE(output_height(), 1);
412 
413     std::random_device random_device;
414     auto rng = std::mt19937(random_device());
415     std::uniform_real_distribution<float> f32dist(0.1f, 1.0f);
416 
417     std::vector<uint16_t> input(XNN_EXTRA_BYTES / sizeof(uint16_t) +
418       batch_size() * ((input_height() * input_width() - 1) * input_pixel_stride() + input_channels()));
419     std::vector<uint16_t> zero(XNN_EXTRA_BYTES / sizeof(uint16_t) + input_width() * input_channels());
420     std::vector<uint16_t> kernel(output_channels() * kernel_height() * kernel_width() * input_channels());
421     std::vector<uint16_t> bias(output_channels());
422     std::vector<uint16_t> output(batch_size() * output_channels() * output_height() * output_width());
423     std::vector<float> output_ref(batch_size() * output_channels() * output_height() * output_width());
424     std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> packed_weights((input_channels() * kernel_height() * kernel_width() + 1) * packed_output_channels());
425 
426     for (size_t iteration = 0; iteration < iterations(); iteration++) {
427       std::generate(input.begin(), input.end(), [&]() { return fp16_ieee_from_fp32_value(f32dist(rng)); });
428       std::generate(kernel.begin(), kernel.end(), [&]() { return fp16_ieee_from_fp32_value(f32dist(rng)); });
429       std::generate(bias.begin(), bias.end(), [&]() { return fp16_ieee_from_fp32_value(f32dist(rng)); });
430       std::fill(output.begin(), output.end(), UINT16_C(0x7E00) /* NaN */);
431       std::fill(packed_weights.begin(), packed_weights.end(), 0);
432 
433       xnn_pack_f16_dconv_oki_w(
434         output_channels(),
435         input_channels(),
436         output_channels_tile(),
437         kernel_height(), kernel_width(),
438         kernel.data(), bias.data(), packed_weights.data(), nullptr);
439 
440       // Compute reference results, without clamping.
441       for (size_t i = 0; i < batch_size(); i++) {
442         for (size_t oy = 0; oy < output_height(); oy++) {
443           for (size_t ox = 0; ox < output_width(); ox++) {
444             for (size_t oc = 0; oc < output_channels(); oc++) {
445               float acc = fp16_ieee_to_fp32_value(bias[oc]);
446               for (size_t ky = 0; ky < kernel_height(); ky++) {
447                 const size_t iy = oy * subsampling_height() + ky - padding_top();
448                 if (iy < input_height()) {
449                   for (size_t kx = 0; kx < kernel_width(); kx++) {
450                     const size_t ix = ox * subsampling_width() + kx - padding_left();
451                     if (ix < input_width()) {
452                       for (size_t ic = 0; ic < input_channels(); ic++) {
453                         acc +=
454                           fp16_ieee_to_fp32_value(input[((i * input_height() + iy) * input_width() + ix) * input_pixel_stride() + ic]) *
455                           fp16_ieee_to_fp32_value(kernel[((oc * kernel_height() + ky) * kernel_width() + kx) * input_channels() + ic]);
456                       }
457                     }
458                   }
459                 }
460               }
461               output_ref[((i * output_channels() + oc) * output_height() + oy) * output_width() + ox] = acc;
462             }
463           }
464         }
465       }
466 
467       // Compute clamping parameters.
468       const float accumulated_min = *std::min_element(output_ref.cbegin(), output_ref.cend());
469       const float accumulated_max = *std::max_element(output_ref.cbegin(), output_ref.cend());
470       const float accumulated_range = accumulated_max - accumulated_min;
471       const float output_min = fp16_ieee_to_fp32_value(fp16_ieee_from_fp32_value(accumulated_min + accumulated_range / 255.0f * float(qmin())));
472       const float output_max = fp16_ieee_to_fp32_value(fp16_ieee_from_fp32_value(accumulated_max - accumulated_range / 255.0f * float(255 - qmax())));
473 
474       // Clamp reference results.
475       for (float& value : output_ref) {
476         value = std::max(std::min(value, output_max), output_min);
477       }
478 
479       // Prepare parameters.
480       xnn_f16_minmax_params params;
481       init_params(&params, fp16_ieee_from_fp32_value(output_min), fp16_ieee_from_fp32_value(output_max));
482 
483       // Call optimized micro-kernel.
484       conv(
485         input_height(), input_width(),
486         output_y_start(), output_y_end(),
487         input.data(), zero.data(), packed_weights.data(), output.data(),
488         padding_top(), output_channels(),
489         output_width() * sizeof(uint16_t),
490         output_height() * output_width() * sizeof(uint16_t),
491         &params);
492 
493       // Verify results.
494       for (size_t i = 0; i < batch_size(); i++) {
495         for (size_t y = output_y_start(); y < output_y_end(); y++) {
496           for (size_t x = 0; x < output_width(); x++) {
497             for (size_t c = 0; c < output_channels(); c++) {
498               ASSERT_GE(fp16_ieee_to_fp32_value(output[((i * output_channels() + c) * output_height() + y) * output_width() + x]), output_min)
499                 << "(x, y) = (" << x << ", " << y << "), channel = " << c;
500               ASSERT_LE(fp16_ieee_to_fp32_value(output[((i * output_channels() + c) * output_height() + y) * output_width() + x]), output_max)
501                 << "(x, y) = (" << x << ", " << y << "), channel = " << c;
502               ASSERT_NEAR(
503                   output_ref[((i * output_channels() + c) * output_height() + y) * output_width() + x],
504                   fp16_ieee_to_fp32_value(output[((i * output_channels() + c) * output_height() + y) * output_width() + x]),
505                   std::max(1.0e-4f, 1.0e-2f * std::abs(output_ref[((i * output_channels() + c) * output_height() + y) * output_width() + x])))
506                 << "(x, y) = (" << x << ", " << y << "), channel = " << c;
507             }
508           }
509         }
510       }
511     }
512   }
513 
514  private:
515   uint32_t padding_top_{0};
516   uint32_t padding_right_{0};
517   uint32_t padding_bottom_{0};
518   uint32_t padding_left_{0};
519   size_t input_height_{1};
520   size_t input_width_{1};
521   size_t input_channels_{1};
522   size_t output_channels_{1};
523   uint32_t output_channels_tile_{1};
524   size_t batch_size_{1};
525   uint32_t kernel_height_{1};
526   uint32_t kernel_width_{1};
527   uint32_t subsampling_height_{1};
528   uint32_t subsampling_width_{1};
529   uint32_t output_y_start_{0};
530   uint32_t output_y_end_{std::numeric_limits<uint32_t>::max()};
531   uint8_t qmin_{0};
532   uint8_t qmax_{255};
533   size_t iterations_{1};
534 };
535