1 // Copyright 2019 Google LLC 2 // 3 // This source code is licensed under the BSD-style license found in the 4 // LICENSE file in the root directory of this source tree. 5 6 #pragma once 7 8 #include <gtest/gtest.h> 9 10 #include <algorithm> 11 #include <cassert> 12 #include <cmath> 13 #include <cstddef> 14 #include <cstdlib> 15 #include <limits> 16 #include <random> 17 #include <vector> 18 19 #include <fp16.h> 20 21 #include <xnnpack.h> 22 #include <xnnpack/aligned-allocator.h> 23 #include <xnnpack/pack.h> 24 #include <xnnpack/microfnptr.h> 25 #include <xnnpack/microparams-init.h> 26 27 28 class ConvHWC2CHWMicrokernelTester { 29 public: 30 enum class Variant { 31 Native, 32 Scalar, 33 }; 34 output_channels_tile(uint32_t output_channels_tile)35 inline ConvHWC2CHWMicrokernelTester& output_channels_tile(uint32_t output_channels_tile) { 36 this->output_channels_tile_ = output_channels_tile; 37 return *this; 38 } 39 output_channels_tile()40 inline uint32_t output_channels_tile() const { 41 return this->output_channels_tile_; 42 } 43 padding(uint32_t padding)44 inline ConvHWC2CHWMicrokernelTester& padding(uint32_t padding) { 45 this->padding_top_ = padding; 46 this->padding_right_ = padding; 47 this->padding_bottom_ = padding; 48 this->padding_left_ = padding; 49 return *this; 50 } 51 padding_height(uint32_t padding_height)52 inline ConvHWC2CHWMicrokernelTester& padding_height(uint32_t padding_height) { 53 this->padding_top_ = padding_height; 54 this->padding_bottom_ = padding_height; 55 return *this; 56 } 57 padding_width(uint32_t padding_width)58 inline ConvHWC2CHWMicrokernelTester& padding_width(uint32_t padding_width) { 59 this->padding_right_ = padding_width; 60 this->padding_left_ = padding_width; 61 return *this; 62 } 63 padding_top(uint32_t padding_top)64 inline ConvHWC2CHWMicrokernelTester& padding_top(uint32_t padding_top) { 65 this->padding_top_ = padding_top; 66 return *this; 67 } 68 padding_top()69 inline uint32_t padding_top() const { 70 return this->padding_top_; 71 } 72 padding_right(uint32_t padding_right)73 inline ConvHWC2CHWMicrokernelTester& padding_right(uint32_t padding_right) { 74 this->padding_right_ = padding_right; 75 return *this; 76 } 77 padding_right()78 inline uint32_t padding_right() const { 79 return this->padding_right_; 80 } 81 padding_bottom(uint32_t padding_bottom)82 inline ConvHWC2CHWMicrokernelTester& padding_bottom(uint32_t padding_bottom) { 83 this->padding_bottom_ = padding_bottom; 84 return *this; 85 } 86 padding_bottom()87 inline uint32_t padding_bottom() const { 88 return this->padding_bottom_; 89 } 90 padding_left(uint32_t padding_left)91 inline ConvHWC2CHWMicrokernelTester& padding_left(uint32_t padding_left) { 92 this->padding_left_ = padding_left; 93 return *this; 94 } 95 padding_left()96 inline uint32_t padding_left() const { 97 return this->padding_left_; 98 } 99 input_size(uint32_t input_height,uint32_t input_width)100 inline ConvHWC2CHWMicrokernelTester& input_size(uint32_t input_height, uint32_t input_width) { 101 assert(input_height >= 1); 102 assert(input_width >= 1); 103 this->input_height_ = input_height; 104 this->input_width_ = input_width; 105 return *this; 106 } 107 input_height(uint32_t input_height)108 inline ConvHWC2CHWMicrokernelTester& input_height(uint32_t input_height) { 109 assert(input_height >= 1); 110 this->input_height_ = input_height; 111 return *this; 112 } 113 input_height()114 inline uint32_t input_height() const { 115 return this->input_height_; 116 } 117 input_width(uint32_t input_width)118 inline ConvHWC2CHWMicrokernelTester& input_width(uint32_t input_width) { 119 assert(input_width >= 1); 120 this->input_width_ = input_width; 121 return *this; 122 } 123 input_width()124 inline uint32_t input_width() const { 125 return this->input_width_; 126 } 127 input_channels(size_t input_channels)128 inline ConvHWC2CHWMicrokernelTester& input_channels(size_t input_channels) { 129 assert(input_channels >= 1); 130 this->input_channels_ = input_channels; 131 return *this; 132 } 133 input_channels()134 inline size_t input_channels() const { 135 return this->input_channels_; 136 } 137 output_channels(size_t output_channels)138 inline ConvHWC2CHWMicrokernelTester& output_channels(size_t output_channels) { 139 assert(output_channels >= 1); 140 this->output_channels_ = output_channels; 141 return *this; 142 } 143 output_channels()144 inline size_t output_channels() const { 145 return this->output_channels_; 146 } 147 packed_output_channels()148 inline size_t packed_output_channels() const { 149 return output_channels() % output_channels_tile() == 0 ? output_channels() : output_channels() / output_channels_tile() * output_channels_tile() + output_channels_tile(); 150 } 151 batch_size(size_t batch_size)152 inline ConvHWC2CHWMicrokernelTester& batch_size(size_t batch_size) { 153 assert(batch_size >= 1); 154 this->batch_size_ = batch_size; 155 return *this; 156 } 157 batch_size()158 inline size_t batch_size() const { 159 return this->batch_size_; 160 } 161 kernel_size(uint32_t kernel_size)162 inline ConvHWC2CHWMicrokernelTester& kernel_size(uint32_t kernel_size) { 163 assert(kernel_size >= 1); 164 this->kernel_height_ = kernel_size; 165 this->kernel_width_ = kernel_size; 166 return *this; 167 } 168 kernel_height(uint32_t kernel_height)169 inline ConvHWC2CHWMicrokernelTester& kernel_height(uint32_t kernel_height) { 170 assert(kernel_height >= 1); 171 this->kernel_height_ = kernel_height; 172 return *this; 173 } 174 kernel_height()175 inline uint32_t kernel_height() const { 176 return this->kernel_height_; 177 } 178 kernel_width(uint32_t kernel_width)179 inline ConvHWC2CHWMicrokernelTester& kernel_width(uint32_t kernel_width) { 180 assert(kernel_width >= 1); 181 this->kernel_width_ = kernel_width; 182 return *this; 183 } 184 kernel_width()185 inline uint32_t kernel_width() const { 186 return this->kernel_width_; 187 } 188 subsampling(uint32_t subsampling)189 inline ConvHWC2CHWMicrokernelTester& subsampling(uint32_t subsampling) { 190 assert(subsampling >= 1); 191 this->subsampling_height_ = subsampling; 192 this->subsampling_width_ = subsampling; 193 return *this; 194 } 195 subsampling_height(uint32_t subsampling_height)196 inline ConvHWC2CHWMicrokernelTester& subsampling_height(uint32_t subsampling_height) { 197 assert(subsampling_height >= 1); 198 this->subsampling_height_ = subsampling_height; 199 return *this; 200 } 201 subsampling_height()202 inline uint32_t subsampling_height() const { 203 return this->subsampling_height_; 204 } 205 subsampling_width(uint32_t subsampling_width)206 inline ConvHWC2CHWMicrokernelTester& subsampling_width(uint32_t subsampling_width) { 207 assert(subsampling_width >= 1); 208 this->subsampling_width_ = subsampling_width; 209 return *this; 210 } 211 subsampling_width()212 inline uint32_t subsampling_width() const { 213 return this->subsampling_width_; 214 } 215 output_y_start(uint32_t output_y_start)216 inline ConvHWC2CHWMicrokernelTester& output_y_start(uint32_t output_y_start) { 217 this->output_y_start_ = output_y_start; 218 return *this; 219 } 220 output_y_start()221 inline uint32_t output_y_start() const { 222 return this->output_y_start_; 223 } 224 output_y_end(uint32_t output_y_end)225 inline ConvHWC2CHWMicrokernelTester& output_y_end(uint32_t output_y_end) { 226 this->output_y_end_ = output_y_end; 227 return *this; 228 } 229 output_y_end()230 inline uint32_t output_y_end() const { 231 if (this->output_y_end_ == std::numeric_limits<uint32_t>::max()) { 232 return output_height(); 233 } else { 234 return this->output_y_end_; 235 } 236 } 237 input_pixel_stride()238 inline size_t input_pixel_stride() const { 239 return input_channels(); 240 } 241 output_pixel_stride()242 inline size_t output_pixel_stride() const { 243 return output_channels(); 244 } 245 output_height()246 inline size_t output_height() const { 247 const size_t padded_input_height = padding_top() + input_height() + padding_bottom(); 248 if (padded_input_height < kernel_height()) { 249 return 0; 250 } else { 251 return (padded_input_height - kernel_height()) / subsampling_height() + 1; 252 } 253 } 254 output_width()255 inline size_t output_width() const { 256 const size_t padded_input_width = padding_left() + input_width() + padding_right(); 257 if (padded_input_width < kernel_width()) { 258 return 0; 259 } else { 260 return (padded_input_width - kernel_width()) / subsampling_width() + 1; 261 } 262 } 263 qmin(uint8_t qmin)264 inline ConvHWC2CHWMicrokernelTester& qmin(uint8_t qmin) { 265 this->qmin_ = qmin; 266 return *this; 267 } 268 qmin()269 inline uint8_t qmin() const { 270 return this->qmin_; 271 } 272 qmax(uint8_t qmax)273 inline ConvHWC2CHWMicrokernelTester& qmax(uint8_t qmax) { 274 this->qmax_ = qmax; 275 return *this; 276 } 277 qmax()278 inline uint8_t qmax() const { 279 return this->qmax_; 280 } 281 iterations(size_t iterations)282 inline ConvHWC2CHWMicrokernelTester& iterations(size_t iterations) { 283 this->iterations_ = iterations; 284 return *this; 285 } 286 iterations()287 inline size_t iterations() const { 288 return this->iterations_; 289 } 290 291 void Test(xnn_f32_conv_hwc2chw_ukernel_function conv, Variant variant = Variant::Native) const { 292 ASSERT_LT(output_y_start(), output_height()); 293 ASSERT_LE(output_y_end(), output_height()); 294 ASSERT_GT(output_y_end(), output_y_start()); 295 ASSERT_GE(output_width(), 1); 296 ASSERT_GE(output_height(), 1); 297 298 std::random_device random_device; 299 auto rng = std::mt19937(random_device()); 300 std::uniform_real_distribution<float> f32dist(0.1f, 1.0f); 301 302 std::vector<float> input(XNN_EXTRA_BYTES / sizeof(float) + 303 batch_size() * ((input_height() * input_width() - 1) * input_pixel_stride() + input_channels())); 304 std::vector<float> zero(XNN_EXTRA_BYTES / sizeof(float) + input_width() * input_channels()); 305 std::vector<float> kernel(output_channels() * kernel_height() * kernel_width() * input_channels()); 306 std::vector<float> bias(output_channels()); 307 std::vector<float> output(batch_size() * output_channels() * output_height() * output_width()); 308 std::vector<float> output_ref(batch_size() * output_channels() * output_height() * output_width()); 309 std::vector<float, AlignedAllocator<float, 64>> packed_weights((input_channels() * kernel_height() * kernel_width() + 1) * packed_output_channels()); 310 311 for (size_t iteration = 0; iteration < iterations(); iteration++) { 312 std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); }); 313 std::generate(kernel.begin(), kernel.end(), [&]() { return f32dist(rng); }); 314 std::generate(bias.begin(), bias.end(), [&]() { return f32dist(rng); }); 315 std::fill(output.begin(), output.end(), nanf("")); 316 std::fill(packed_weights.begin(), packed_weights.end(), 0.0f); 317 318 xnn_pack_f32_dconv_oki_w( 319 output_channels(), 320 input_channels(), 321 output_channels_tile(), 322 kernel_height(), kernel_width(), 323 kernel.data(), bias.data(), packed_weights.data(), nullptr); 324 325 // Compute reference results, without clamping. 326 for (size_t i = 0; i < batch_size(); i++) { 327 for (size_t oy = 0; oy < output_height(); oy++) { 328 for (size_t ox = 0; ox < output_width(); ox++) { 329 for (size_t oc = 0; oc < output_channels(); oc++) { 330 float acc = bias[oc]; 331 for (size_t ky = 0; ky < kernel_height(); ky++) { 332 const size_t iy = oy * subsampling_height() + ky - padding_top(); 333 if (iy < input_height()) { 334 for (size_t kx = 0; kx < kernel_width(); kx++) { 335 const size_t ix = ox * subsampling_width() + kx - padding_left(); 336 if (ix < input_width()) { 337 for (size_t ic = 0; ic < input_channels(); ic++) { 338 acc += 339 input[((i * input_height() + iy) * input_width() + ix) * input_pixel_stride() + ic] * 340 kernel[((oc * kernel_height() + ky) * kernel_width() + kx) * input_channels() + ic]; 341 } 342 } 343 } 344 } 345 } 346 output_ref[((i * output_channels() + oc) * output_height() + oy) * output_width() + ox] = acc; 347 } 348 } 349 } 350 } 351 352 // Compute clamping parameters. 353 const float accumulated_min = *std::min_element(output_ref.cbegin(), output_ref.cend()); 354 const float accumulated_max = *std::max_element(output_ref.cbegin(), output_ref.cend()); 355 356 const float output_min = accumulated_min + (accumulated_max - accumulated_min) / 255.0f * float(qmin()); 357 const float output_max = accumulated_max - (accumulated_max - accumulated_min) / 255.0f * float(255 - qmax()); 358 359 // Clamp reference results. 360 for (float& value : output_ref) { 361 value = std::max(std::min(value, output_max), output_min); 362 } 363 364 // Prepare parameters. 365 xnn_f32_minmax_params params; 366 switch (variant) { 367 case Variant::Native: 368 xnn_init_f32_minmax_params(¶ms, output_min, output_max); 369 break; 370 case Variant::Scalar: 371 xnn_init_f32_minmax_scalar_params(¶ms, output_min, output_max); 372 break; 373 } 374 375 // Call optimized micro-kernel. 376 conv( 377 input_height(), input_width(), 378 output_y_start(), output_y_end(), 379 input.data(), zero.data(), packed_weights.data(), output.data(), 380 padding_top(), output_channels(), 381 output_width() * sizeof(float), 382 output_height() * output_width() * sizeof(float), 383 ¶ms); 384 385 // Verify results. 386 for (size_t i = 0; i < batch_size(); i++) { 387 for (size_t y = output_y_start(); y < output_y_end(); y++) { 388 for (size_t x = 0; x < output_width(); x++) { 389 for (size_t c = 0; c < output_channels(); c++) { 390 ASSERT_GE(output[((i * output_channels() + c) * output_height() + y) * output_width() + x], output_min) 391 << "(x, y) = (" << x << ", " << y << "), channel = " << c; 392 ASSERT_LE(output[((i * output_channels() + c) * output_height() + y) * output_width() + x], output_max) 393 << "(x, y) = (" << x << ", " << y << "), channel = " << c; 394 ASSERT_NEAR( 395 output_ref[((i * output_channels() + c) * output_height() + y) * output_width() + x], 396 output[((i * output_channels() + c) * output_height() + y) * output_width() + x], 397 1.0e-4 * std::abs(output_ref[((i * output_channels() + c) * output_height() + y) * output_width() + x])) 398 << "(x, y) = (" << x << ", " << y << "), channel = " << c; 399 } 400 } 401 } 402 } 403 } 404 } 405 Test(xnn_f16_conv_hwc2chw_ukernel_function conv,xnn_init_f16_minmax_params_fn init_params)406 void Test(xnn_f16_conv_hwc2chw_ukernel_function conv, xnn_init_f16_minmax_params_fn init_params) const { 407 ASSERT_LT(output_y_start(), output_height()); 408 ASSERT_LE(output_y_end(), output_height()); 409 ASSERT_GT(output_y_end(), output_y_start()); 410 ASSERT_GE(output_width(), 1); 411 ASSERT_GE(output_height(), 1); 412 413 std::random_device random_device; 414 auto rng = std::mt19937(random_device()); 415 std::uniform_real_distribution<float> f32dist(0.1f, 1.0f); 416 417 std::vector<uint16_t> input(XNN_EXTRA_BYTES / sizeof(uint16_t) + 418 batch_size() * ((input_height() * input_width() - 1) * input_pixel_stride() + input_channels())); 419 std::vector<uint16_t> zero(XNN_EXTRA_BYTES / sizeof(uint16_t) + input_width() * input_channels()); 420 std::vector<uint16_t> kernel(output_channels() * kernel_height() * kernel_width() * input_channels()); 421 std::vector<uint16_t> bias(output_channels()); 422 std::vector<uint16_t> output(batch_size() * output_channels() * output_height() * output_width()); 423 std::vector<float> output_ref(batch_size() * output_channels() * output_height() * output_width()); 424 std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> packed_weights((input_channels() * kernel_height() * kernel_width() + 1) * packed_output_channels()); 425 426 for (size_t iteration = 0; iteration < iterations(); iteration++) { 427 std::generate(input.begin(), input.end(), [&]() { return fp16_ieee_from_fp32_value(f32dist(rng)); }); 428 std::generate(kernel.begin(), kernel.end(), [&]() { return fp16_ieee_from_fp32_value(f32dist(rng)); }); 429 std::generate(bias.begin(), bias.end(), [&]() { return fp16_ieee_from_fp32_value(f32dist(rng)); }); 430 std::fill(output.begin(), output.end(), UINT16_C(0x7E00) /* NaN */); 431 std::fill(packed_weights.begin(), packed_weights.end(), 0); 432 433 xnn_pack_f16_dconv_oki_w( 434 output_channels(), 435 input_channels(), 436 output_channels_tile(), 437 kernel_height(), kernel_width(), 438 kernel.data(), bias.data(), packed_weights.data(), nullptr); 439 440 // Compute reference results, without clamping. 441 for (size_t i = 0; i < batch_size(); i++) { 442 for (size_t oy = 0; oy < output_height(); oy++) { 443 for (size_t ox = 0; ox < output_width(); ox++) { 444 for (size_t oc = 0; oc < output_channels(); oc++) { 445 float acc = fp16_ieee_to_fp32_value(bias[oc]); 446 for (size_t ky = 0; ky < kernel_height(); ky++) { 447 const size_t iy = oy * subsampling_height() + ky - padding_top(); 448 if (iy < input_height()) { 449 for (size_t kx = 0; kx < kernel_width(); kx++) { 450 const size_t ix = ox * subsampling_width() + kx - padding_left(); 451 if (ix < input_width()) { 452 for (size_t ic = 0; ic < input_channels(); ic++) { 453 acc += 454 fp16_ieee_to_fp32_value(input[((i * input_height() + iy) * input_width() + ix) * input_pixel_stride() + ic]) * 455 fp16_ieee_to_fp32_value(kernel[((oc * kernel_height() + ky) * kernel_width() + kx) * input_channels() + ic]); 456 } 457 } 458 } 459 } 460 } 461 output_ref[((i * output_channels() + oc) * output_height() + oy) * output_width() + ox] = acc; 462 } 463 } 464 } 465 } 466 467 // Compute clamping parameters. 468 const float accumulated_min = *std::min_element(output_ref.cbegin(), output_ref.cend()); 469 const float accumulated_max = *std::max_element(output_ref.cbegin(), output_ref.cend()); 470 const float accumulated_range = accumulated_max - accumulated_min; 471 const float output_min = fp16_ieee_to_fp32_value(fp16_ieee_from_fp32_value(accumulated_min + accumulated_range / 255.0f * float(qmin()))); 472 const float output_max = fp16_ieee_to_fp32_value(fp16_ieee_from_fp32_value(accumulated_max - accumulated_range / 255.0f * float(255 - qmax()))); 473 474 // Clamp reference results. 475 for (float& value : output_ref) { 476 value = std::max(std::min(value, output_max), output_min); 477 } 478 479 // Prepare parameters. 480 xnn_f16_minmax_params params; 481 init_params(¶ms, fp16_ieee_from_fp32_value(output_min), fp16_ieee_from_fp32_value(output_max)); 482 483 // Call optimized micro-kernel. 484 conv( 485 input_height(), input_width(), 486 output_y_start(), output_y_end(), 487 input.data(), zero.data(), packed_weights.data(), output.data(), 488 padding_top(), output_channels(), 489 output_width() * sizeof(uint16_t), 490 output_height() * output_width() * sizeof(uint16_t), 491 ¶ms); 492 493 // Verify results. 494 for (size_t i = 0; i < batch_size(); i++) { 495 for (size_t y = output_y_start(); y < output_y_end(); y++) { 496 for (size_t x = 0; x < output_width(); x++) { 497 for (size_t c = 0; c < output_channels(); c++) { 498 ASSERT_GE(fp16_ieee_to_fp32_value(output[((i * output_channels() + c) * output_height() + y) * output_width() + x]), output_min) 499 << "(x, y) = (" << x << ", " << y << "), channel = " << c; 500 ASSERT_LE(fp16_ieee_to_fp32_value(output[((i * output_channels() + c) * output_height() + y) * output_width() + x]), output_max) 501 << "(x, y) = (" << x << ", " << y << "), channel = " << c; 502 ASSERT_NEAR( 503 output_ref[((i * output_channels() + c) * output_height() + y) * output_width() + x], 504 fp16_ieee_to_fp32_value(output[((i * output_channels() + c) * output_height() + y) * output_width() + x]), 505 std::max(1.0e-4f, 1.0e-2f * std::abs(output_ref[((i * output_channels() + c) * output_height() + y) * output_width() + x]))) 506 << "(x, y) = (" << x << ", " << y << "), channel = " << c; 507 } 508 } 509 } 510 } 511 } 512 } 513 514 private: 515 uint32_t padding_top_{0}; 516 uint32_t padding_right_{0}; 517 uint32_t padding_bottom_{0}; 518 uint32_t padding_left_{0}; 519 size_t input_height_{1}; 520 size_t input_width_{1}; 521 size_t input_channels_{1}; 522 size_t output_channels_{1}; 523 uint32_t output_channels_tile_{1}; 524 size_t batch_size_{1}; 525 uint32_t kernel_height_{1}; 526 uint32_t kernel_width_{1}; 527 uint32_t subsampling_height_{1}; 528 uint32_t subsampling_width_{1}; 529 uint32_t output_y_start_{0}; 530 uint32_t output_y_end_{std::numeric_limits<uint32_t>::max()}; 531 uint8_t qmin_{0}; 532 uint8_t qmax_{255}; 533 size_t iterations_{1}; 534 }; 535