1 // Copyright 2019 Google LLC 2 // 3 // This source code is licensed under the BSD-style license found in the 4 // LICENSE file in the root directory of this source tree. 5 6 #pragma once 7 8 #include <gtest/gtest.h> 9 10 #include <algorithm> 11 #include <cassert> 12 #include <cmath> 13 #include <cstddef> 14 #include <cstdlib> 15 #include <limits> 16 #include <random> 17 #include <vector> 18 19 #include <xnnpack.h> 20 #include <xnnpack/aligned-allocator.h> 21 #include <xnnpack/pack.h> 22 #include <xnnpack/microfnptr.h> 23 #include <xnnpack/microparams-init.h> 24 25 26 class ConvHWCMicrokernelTester { 27 public: 28 enum class Variant { 29 Native, 30 Scalar, 31 }; 32 output_channels_tile(uint32_t output_channels_tile)33 inline ConvHWCMicrokernelTester& output_channels_tile(uint32_t output_channels_tile) { 34 this->output_channels_tile_ = output_channels_tile; 35 return *this; 36 } 37 output_channels_tile()38 inline uint32_t output_channels_tile() const { 39 return this->output_channels_tile_; 40 } 41 padding(uint32_t padding)42 inline ConvHWCMicrokernelTester& padding(uint32_t padding) { 43 this->padding_top_ = padding; 44 this->padding_right_ = padding; 45 this->padding_bottom_ = padding; 46 this->padding_left_ = padding; 47 return *this; 48 } 49 padding_height(uint32_t padding_height)50 inline ConvHWCMicrokernelTester& padding_height(uint32_t padding_height) { 51 this->padding_top_ = padding_height; 52 this->padding_bottom_ = padding_height; 53 return *this; 54 } 55 padding_width(uint32_t padding_width)56 inline ConvHWCMicrokernelTester& padding_width(uint32_t padding_width) { 57 this->padding_right_ = padding_width; 58 this->padding_left_ = padding_width; 59 return *this; 60 } 61 padding_top(uint32_t padding_top)62 inline ConvHWCMicrokernelTester& padding_top(uint32_t padding_top) { 63 this->padding_top_ = padding_top; 64 return *this; 65 } 66 padding_top()67 inline uint32_t padding_top() const { 68 return this->padding_top_; 69 } 70 padding_right(uint32_t padding_right)71 inline ConvHWCMicrokernelTester& padding_right(uint32_t padding_right) { 72 this->padding_right_ = padding_right; 73 return *this; 74 } 75 padding_right()76 inline uint32_t padding_right() const { 77 return this->padding_right_; 78 } 79 padding_bottom(uint32_t padding_bottom)80 inline ConvHWCMicrokernelTester& padding_bottom(uint32_t padding_bottom) { 81 this->padding_bottom_ = padding_bottom; 82 return *this; 83 } 84 padding_bottom()85 inline uint32_t padding_bottom() const { 86 return this->padding_bottom_; 87 } 88 padding_left(uint32_t padding_left)89 inline ConvHWCMicrokernelTester& padding_left(uint32_t padding_left) { 90 this->padding_left_ = padding_left; 91 return *this; 92 } 93 padding_left()94 inline uint32_t padding_left() const { 95 return this->padding_left_; 96 } 97 input_size(uint32_t input_height,uint32_t input_width)98 inline ConvHWCMicrokernelTester& input_size(uint32_t input_height, uint32_t input_width) { 99 assert(input_height >= 1); 100 assert(input_width >= 1); 101 this->input_height_ = input_height; 102 this->input_width_ = input_width; 103 return *this; 104 } 105 input_height(uint32_t input_height)106 inline ConvHWCMicrokernelTester& input_height(uint32_t input_height) { 107 assert(input_height >= 1); 108 this->input_height_ = input_height; 109 return *this; 110 } 111 input_height()112 inline uint32_t input_height() const { 113 return this->input_height_; 114 } 115 input_width(uint32_t input_width)116 inline ConvHWCMicrokernelTester& input_width(uint32_t input_width) { 117 assert(input_width >= 1); 118 this->input_width_ = input_width; 119 return *this; 120 } 121 input_width()122 inline uint32_t input_width() const { 123 return this->input_width_; 124 } 125 input_channels(size_t input_channels)126 inline ConvHWCMicrokernelTester& input_channels(size_t input_channels) { 127 assert(input_channels >= 1); 128 this->input_channels_ = input_channels; 129 return *this; 130 } 131 input_channels()132 inline size_t input_channels() const { 133 return this->input_channels_; 134 } 135 output_channels(size_t output_channels)136 inline ConvHWCMicrokernelTester& output_channels(size_t output_channels) { 137 assert(output_channels >= 1); 138 this->output_channels_ = output_channels; 139 return *this; 140 } 141 output_channels()142 inline size_t output_channels() const { 143 return this->output_channels_; 144 } 145 packed_output_channels()146 inline size_t packed_output_channels() const { 147 return output_channels() % output_channels_tile() == 0 ? output_channels() : output_channels() / output_channels_tile() * output_channels_tile() + output_channels_tile(); 148 } 149 batch_size(size_t batch_size)150 inline ConvHWCMicrokernelTester& batch_size(size_t batch_size) { 151 assert(batch_size >= 1); 152 this->batch_size_ = batch_size; 153 return *this; 154 } 155 batch_size()156 inline size_t batch_size() const { 157 return this->batch_size_; 158 } 159 kernel_size(uint32_t kernel_size)160 inline ConvHWCMicrokernelTester& kernel_size(uint32_t kernel_size) { 161 assert(kernel_size >= 1); 162 this->kernel_height_ = kernel_size; 163 this->kernel_width_ = kernel_size; 164 return *this; 165 } 166 kernel_height(uint32_t kernel_height)167 inline ConvHWCMicrokernelTester& kernel_height(uint32_t kernel_height) { 168 assert(kernel_height >= 1); 169 this->kernel_height_ = kernel_height; 170 return *this; 171 } 172 kernel_height()173 inline uint32_t kernel_height() const { 174 return this->kernel_height_; 175 } 176 kernel_width(uint32_t kernel_width)177 inline ConvHWCMicrokernelTester& kernel_width(uint32_t kernel_width) { 178 assert(kernel_width >= 1); 179 this->kernel_width_ = kernel_width; 180 return *this; 181 } 182 kernel_width()183 inline uint32_t kernel_width() const { 184 return this->kernel_width_; 185 } 186 subsampling(uint32_t subsampling)187 inline ConvHWCMicrokernelTester& subsampling(uint32_t subsampling) { 188 assert(subsampling >= 1); 189 this->subsampling_height_ = subsampling; 190 this->subsampling_width_ = subsampling; 191 return *this; 192 } 193 subsampling_height(uint32_t subsampling_height)194 inline ConvHWCMicrokernelTester& subsampling_height(uint32_t subsampling_height) { 195 assert(subsampling_height >= 1); 196 this->subsampling_height_ = subsampling_height; 197 return *this; 198 } 199 subsampling_height()200 inline uint32_t subsampling_height() const { 201 return this->subsampling_height_; 202 } 203 subsampling_width(uint32_t subsampling_width)204 inline ConvHWCMicrokernelTester& subsampling_width(uint32_t subsampling_width) { 205 assert(subsampling_width >= 1); 206 this->subsampling_width_ = subsampling_width; 207 return *this; 208 } 209 subsampling_width()210 inline uint32_t subsampling_width() const { 211 return this->subsampling_width_; 212 } 213 output_y_start(uint32_t output_y_start)214 inline ConvHWCMicrokernelTester& output_y_start(uint32_t output_y_start) { 215 this->output_y_start_ = output_y_start; 216 return *this; 217 } 218 output_y_start()219 inline uint32_t output_y_start() const { 220 return this->output_y_start_; 221 } 222 output_y_end(uint32_t output_y_end)223 inline ConvHWCMicrokernelTester& output_y_end(uint32_t output_y_end) { 224 this->output_y_end_ = output_y_end; 225 return *this; 226 } 227 output_y_end()228 inline uint32_t output_y_end() const { 229 if (this->output_y_end_ == std::numeric_limits<uint32_t>::max()) { 230 return output_height(); 231 } else { 232 return this->output_y_end_; 233 } 234 } 235 input_pixel_stride()236 inline size_t input_pixel_stride() const { 237 return input_channels(); 238 } 239 output_pixel_stride()240 inline size_t output_pixel_stride() const { 241 return output_channels(); 242 } 243 output_height()244 inline size_t output_height() const { 245 const size_t padded_input_height = padding_top() + input_height() + padding_bottom(); 246 return (std::max<size_t>(padded_input_height + subsampling_height(), kernel_height()) - kernel_height()) 247 / subsampling_height(); 248 } 249 output_width()250 inline size_t output_width() const { 251 const size_t padded_input_width = padding_left() + input_width() + padding_right(); 252 return (std::max<size_t>(padded_input_width + subsampling_width(), kernel_width()) - kernel_width()) 253 / subsampling_width(); 254 } 255 qmin(uint8_t qmin)256 inline ConvHWCMicrokernelTester& qmin(uint8_t qmin) { 257 this->qmin_ = qmin; 258 return *this; 259 } 260 qmin()261 inline uint8_t qmin() const { 262 return this->qmin_; 263 } 264 qmax(uint8_t qmax)265 inline ConvHWCMicrokernelTester& qmax(uint8_t qmax) { 266 this->qmax_ = qmax; 267 return *this; 268 } 269 qmax()270 inline uint8_t qmax() const { 271 return this->qmax_; 272 } 273 iterations(size_t iterations)274 inline ConvHWCMicrokernelTester& iterations(size_t iterations) { 275 this->iterations_ = iterations; 276 return *this; 277 } 278 iterations()279 inline size_t iterations() const { 280 return this->iterations_; 281 } 282 283 void Test(xnn_f32_conv_hwc_ukernel_function conv, Variant variant = Variant::Native) const { 284 ASSERT_LT(output_y_start(), output_height()); 285 ASSERT_LE(output_y_end(), output_height()); 286 ASSERT_GT(output_y_end(), output_y_start()); 287 ASSERT_GE(output_width(), 1); 288 ASSERT_GE(output_height(), 1); 289 290 std::random_device random_device; 291 auto rng = std::mt19937(random_device()); 292 std::uniform_real_distribution<float> f32dist(0.1f, 1.0f); 293 294 std::vector<float> input(XNN_EXTRA_BYTES / sizeof(float) + 295 batch_size() * ((input_height() * input_width() - 1) * input_pixel_stride() + input_channels())); 296 std::vector<float> zero(XNN_EXTRA_BYTES / sizeof(float) + input_width() * input_channels()); 297 std::vector<float> kernel(output_channels() * kernel_height() * kernel_width() * input_channels()); 298 std::vector<float> bias(output_channels()); 299 std::vector<float> output(batch_size() * ((output_height() * output_width() - 1) * output_pixel_stride() + output_channels())); 300 std::vector<float> output_ref(batch_size() * output_height() * output_width() * output_channels()); 301 std::vector<float, AlignedAllocator<float, 64>> packed_weights((input_channels() * kernel_height() * kernel_width() + 1) * packed_output_channels()); 302 303 for (size_t iteration = 0; iteration < iterations(); iteration++) { 304 std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); }); 305 std::generate(kernel.begin(), kernel.end(), [&]() { return f32dist(rng); }); 306 std::generate(bias.begin(), bias.end(), [&]() { return f32dist(rng); }); 307 std::fill(output.begin(), output.end(), nanf("")); 308 std::fill(packed_weights.begin(), packed_weights.end(), 0.0f); 309 310 xnn_pack_f32_dconv_oki_w( 311 output_channels(), 312 input_channels(), 313 output_channels_tile(), 314 kernel_height(), kernel_width(), 315 kernel.data(), bias.data(), packed_weights.data(), nullptr); 316 317 // Compute reference results, without clamping. 318 for (size_t i = 0; i < batch_size(); i++) { 319 for (size_t oy = 0; oy < output_height(); oy++) { 320 for (size_t ox = 0; ox < output_width(); ox++) { 321 for (size_t oc = 0; oc < output_channels(); oc++) { 322 float acc = bias[oc]; 323 for (size_t ky = 0; ky < kernel_height(); ky++) { 324 const size_t iy = oy * subsampling_height() + ky - padding_top(); 325 if (iy < input_height()) { 326 for (size_t kx = 0; kx < kernel_width(); kx++) { 327 const size_t ix = ox * subsampling_width() + kx - padding_left(); 328 if (ix < input_width()) { 329 for (size_t ic = 0; ic < input_channels(); ic++) { 330 acc += 331 input[((i * input_height() + iy) * input_width() + ix) * input_pixel_stride() + ic] * 332 kernel[((oc * kernel_height() + ky) * kernel_width() + kx) * input_channels() + ic]; 333 } 334 } 335 } 336 } 337 } 338 output_ref[((i * output_height() + oy) * output_width() + ox) * output_channels() + oc] = acc; 339 } 340 } 341 } 342 } 343 344 // Compute clamping parameters. 345 const float accumulated_min = *std::min_element(output_ref.cbegin(), output_ref.cend()); 346 const float accumulated_max = *std::max_element(output_ref.cbegin(), output_ref.cend()); 347 348 const float output_min = accumulated_min + (accumulated_max - accumulated_min) / 255.0f * float(qmin()); 349 const float output_max = accumulated_max - (accumulated_max - accumulated_min) / 255.0f * float(255 - qmax()); 350 351 // Clamp reference results. 352 for (float& value : output_ref) { 353 value = std::max(std::min(value, output_max), output_min); 354 } 355 356 // Prepare parameters. 357 xnn_f32_minmax_params params; 358 switch (variant) { 359 case Variant::Native: 360 xnn_init_f32_minmax_params(¶ms, output_min, output_max); 361 break; 362 case Variant::Scalar: 363 xnn_init_f32_minmax_scalar_params(¶ms, output_min, output_max); 364 break; 365 } 366 367 // Call optimized micro-kernel. 368 conv( 369 input_height(), input_width(), 370 output_y_start(), output_y_end(), 371 input.data(), zero.data(), packed_weights.data(), output.data(), 372 padding_top(), output_channels(), 373 output_pixel_stride() * output_width() * sizeof(float), 374 output_pixel_stride() * sizeof(float), 375 ¶ms); 376 377 // Verify results. 378 for (size_t i = 0; i < batch_size(); i++) { 379 for (size_t y = output_y_start(); y < output_y_end(); y++) { 380 for (size_t x = 0; x < output_width(); x++) { 381 for (size_t c = 0; c < output_channels(); c++) { 382 ASSERT_GE(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + c], output_min) 383 << "(x, y) = (" << x << ", " << y << "), channel = " << c; 384 ASSERT_LE(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + c], output_max) 385 << "(x, y) = (" << x << ", " << y << "), channel = " << c; 386 ASSERT_NEAR( 387 output_ref[((i * output_height() + y) * output_width() + x) * output_channels() + c], 388 output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + c], 389 1.0e-4 * std::abs(output_ref[((i * output_height() + y) * output_width() + x) * output_channels() + c])) 390 << "(x, y) = (" << x << ", " << y << "), channel = " << c; 391 } 392 } 393 } 394 } 395 } 396 } 397 398 private: 399 uint32_t padding_top_{0}; 400 uint32_t padding_right_{0}; 401 uint32_t padding_bottom_{0}; 402 uint32_t padding_left_{0}; 403 size_t input_height_{1}; 404 size_t input_width_{1}; 405 size_t input_channels_{1}; 406 size_t output_channels_{1}; 407 uint32_t output_channels_tile_{1}; 408 size_t batch_size_{1}; 409 uint32_t kernel_height_{1}; 410 uint32_t kernel_width_{1}; 411 uint32_t subsampling_height_{1}; 412 uint32_t subsampling_width_{1}; 413 uint32_t output_y_start_{0}; 414 uint32_t output_y_end_{std::numeric_limits<uint32_t>::max()}; 415 uint8_t qmin_{0}; 416 uint8_t qmax_{255}; 417 size_t iterations_{1}; 418 }; 419