// Copyright (c) Facebook, Inc. and its affiliates.
// All rights reserved.
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#pragma once

#include <gtest/gtest.h>

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstddef>
#include <cstdlib>
#include <limits>
#include <random>
#include <vector>

#include "convolution-test-helpers.h"
#include <fp16.h>

#include <xnnpack.h>
#include <xnnpack/cache.h>
#include <xnnpack/allocator.h>


class ConvolutionOperatorTester {
 public:
  enum class WeightsType {
    Default,
    FP32,
  };

  inline ConvolutionOperatorTester& padding_tf_same(bool padding_same) {
    if (padding_same) {
      assert(padding_top() == 0);
      assert(padding_left() == 0);
      assert(padding_bottom() == 0);
      assert(padding_right() == 0);
    }
    this->padding_tf_same_ = padding_same;
    return *this;
  }

  inline bool padding_tf_same() const {
    return this->padding_tf_same_;
  }

  inline ConvolutionOperatorTester& padding(uint32_t padding) {
    assert(!padding_tf_same());
    this->padding_top_ = padding;
    this->padding_right_ = padding;
    this->padding_bottom_ = padding;
    this->padding_left_ = padding;
    return *this;
  }

  inline ConvolutionOperatorTester& padding(uint32_t padding_height, uint32_t padding_width) {
    assert(!padding_tf_same());
    this->padding_top_ = padding_height;
    this->padding_right_ = padding_width;
    this->padding_bottom_ = padding_height;
    this->padding_left_ = padding_width;
    return *this;
  }

  inline ConvolutionOperatorTester& padding_height(uint32_t padding_height) {
    assert(!padding_tf_same());
    this->padding_top_ = padding_height;
    this->padding_bottom_ = padding_height;
    return *this;
  }

  inline ConvolutionOperatorTester& padding_width(uint32_t padding_width) {
    assert(!padding_tf_same());
    this->padding_right_ = padding_width;
    this->padding_left_ = padding_width;
    return *this;
  }

  inline ConvolutionOperatorTester& padding_top(uint32_t padding_top) {
    assert(!padding_tf_same());
    this->padding_top_ = padding_top;
    return *this;
  }

  inline uint32_t padding_top() const {
    if (padding_tf_same()) {
      const uint32_t total_padding_height =
        (output_height() - 1) * subsampling_height() + dilated_kernel_height() - input_height();
      return total_padding_height / 2;
    } else {
      return this->padding_top_;
    }
  }

  inline ConvolutionOperatorTester& padding_left(uint32_t padding_left) {
    assert(!padding_tf_same());
    this->padding_left_ = padding_left;
    return *this;
  }

  inline uint32_t padding_left() const {
    if (padding_tf_same()) {
      const uint32_t total_padding_width =
        (output_width() - 1) * subsampling_width() + dilated_kernel_width() - input_width();
      return total_padding_width / 2;
    } else {
      return this->padding_left_;
    }
  }

  inline ConvolutionOperatorTester& padding_bottom(uint32_t padding_bottom) {
    assert(!padding_tf_same());
    this->padding_bottom_ = padding_bottom;
    return *this;
  }

  inline uint32_t padding_bottom() const {
    if (padding_tf_same()) {
      const uint32_t total_padding_height =
        (output_height() - 1) * subsampling_height() + dilated_kernel_height() - input_height();
      return total_padding_height - total_padding_height / 2;
    } else {
      return this->padding_bottom_;
    }
  }

  inline ConvolutionOperatorTester& padding_right(uint32_t padding_right) {
    assert(!padding_tf_same());
    this->padding_right_ = padding_right;
    return *this;
  }

  inline uint32_t padding_right() const {
    if (padding_tf_same()) {
      const uint32_t total_padding_width =
        (output_width() - 1) * subsampling_width() + dilated_kernel_width() - input_width();
      return total_padding_width - total_padding_width / 2;
    } else {
      return this->padding_right_;
    }
  }

  inline ConvolutionOperatorTester& input_size(uint32_t input_height, uint32_t input_width) {
    assert(input_height >= 1);
    assert(input_width >= 1);
    this->input_height_ = input_height;
    this->input_width_ = input_width;
    return *this;
  }

  inline ConvolutionOperatorTester& input_height(uint32_t input_height) {
    assert(input_height >= 1);
    this->input_height_ = input_height;
    return *this;
  }

  inline uint32_t input_height() const {
    return this->input_height_;
  }

  inline ConvolutionOperatorTester& input_width(uint32_t input_width) {
    assert(input_width >= 1);
    this->input_width_ = input_width;
    return *this;
  }

  inline uint32_t input_width() const {
    return this->input_width_;
  }

  inline ConvolutionOperatorTester& groups(uint32_t groups) {
    assert(groups >= 1);
    this->groups_ = groups;
    return *this;
  }

  inline uint32_t groups() const {
    return this->groups_;
  }

  inline ConvolutionOperatorTester& group_input_channels(size_t group_input_channels) {
    assert(group_input_channels >= 1);
    this->group_input_channels_ = group_input_channels;
    return *this;
  }

  inline size_t group_input_channels() const {
    return this->group_input_channels_;
  }

  inline ConvolutionOperatorTester& group_output_channels(size_t group_output_channels) {
    assert(group_output_channels >= 1);
    this->group_output_channels_ = group_output_channels;
    return *this;
  }

  inline size_t group_output_channels() const {
    return this->group_output_channels_;
  }

  inline ConvolutionOperatorTester& batch_size(size_t batch_size) {
    assert(batch_size >= 1);
    this->batch_size_ = batch_size;
    return *this;
  }

  inline size_t batch_size() const {
    return this->batch_size_;
  }

  inline ConvolutionOperatorTester& kernel_size(uint32_t kernel_size) {
    assert(kernel_size >= 1);
    this->kernel_height_ = kernel_size;
    this->kernel_width_ = kernel_size;
    return *this;
  }

  inline ConvolutionOperatorTester& kernel_size(uint32_t kernel_height, uint32_t kernel_width) {
    assert(kernel_height >= 1);
    assert(kernel_width >= 1);
    this->kernel_height_ = kernel_height;
    this->kernel_width_ = kernel_width;
    return *this;
  }

  inline ConvolutionOperatorTester& kernel_height(uint32_t kernel_height) {
    assert(kernel_height >= 1);
    this->kernel_height_ = kernel_height;
    return *this;
  }

  inline uint32_t kernel_height() const {
    return this->kernel_height_;
  }

  inline ConvolutionOperatorTester& kernel_width(uint32_t kernel_width) {
    assert(kernel_width >= 1);
    this->kernel_width_ = kernel_width;
    return *this;
  }

  inline uint32_t kernel_width() const {
    return this->kernel_width_;
  }

  inline ConvolutionOperatorTester& dilation(uint32_t dilation) {
    assert(dilation >= 1);
    this->dilation_height_ = dilation;
    this->dilation_width_ = dilation;
    return *this;
  }

  inline ConvolutionOperatorTester& dilation(uint32_t dilation_height, uint32_t dilation_width) {
    assert(dilation_height >= 1);
    assert(dilation_width >= 1);
    this->dilation_height_ = dilation_height;
    this->dilation_width_ = dilation_width;
    return *this;
  }

  inline ConvolutionOperatorTester& dilation_height(uint32_t dilation_height) {
    assert(dilation_height >= 1);
    this->dilation_height_ = dilation_height;
    return *this;
  }

  inline uint32_t dilation_height() const {
    return this->dilation_height_;
  }

  inline ConvolutionOperatorTester& dilation_width(uint32_t dilation_width) {
    assert(dilation_width >= 1);
    this->dilation_width_ = dilation_width;
    return *this;
  }

  inline uint32_t dilation_width() const {
    return this->dilation_width_;
  }

  inline ConvolutionOperatorTester& subsampling(uint32_t subsampling) {
    assert(subsampling >= 1);
    this->subsampling_height_ = subsampling;
    this->subsampling_width_ = subsampling;
    return *this;
  }

  inline ConvolutionOperatorTester& subsampling(uint32_t subsampling_height, uint32_t subsampling_width) {
    assert(subsampling_height >= 1);
    assert(subsampling_width >= 1);
    this->subsampling_height_ = subsampling_height;
    this->subsampling_width_ = subsampling_width;
    return *this;
  }

  inline ConvolutionOperatorTester& subsampling_height(uint32_t subsampling_height) {
    assert(subsampling_height >= 1);
    this->subsampling_height_ = subsampling_height;
    return *this;
  }

  inline uint32_t subsampling_height() const {
    return this->subsampling_height_;
  }

  inline ConvolutionOperatorTester& subsampling_width(uint32_t subsampling_width) {
    assert(subsampling_width >= 1);
    this->subsampling_width_ = subsampling_width;
    return *this;
  }

  inline uint32_t subsampling_width() const {
    return this->subsampling_width_;
  }

  inline ConvolutionOperatorTester& input_channel_stride(size_t input_channel_stride) {
    assert(input_channel_stride >= 1);
    this->input_channel_stride_ = input_channel_stride;
    return *this;
  }

  inline size_t input_channel_stride() const {
    if (this->input_channel_stride_ == 0) {
      return group_input_channels() * groups();
    } else {
      assert(this->input_channel_stride_ >= group_input_channels() * groups());
      return this->input_channel_stride_;
    }
  }

  inline ConvolutionOperatorTester& output_channel_stride(size_t output_channel_stride) {
    assert(output_channel_stride >= 1);
    this->output_channel_stride_ = output_channel_stride;
    return *this;
  }

  inline size_t output_channel_stride() const {
    if (this->output_channel_stride_ == 0) {
      return group_output_channels() * groups();
    } else {
      assert(this->output_channel_stride_ >= group_output_channels() * groups());
      return this->output_channel_stride_;
    }
  }

  inline uint32_t dilated_kernel_height() const {
    return (kernel_height() - 1) * dilation_height() + 1;
  }

  inline uint32_t dilated_kernel_width() const {
    return (kernel_width() - 1) * dilation_width() + 1;
  }

  inline size_t output_height() const {
    if (padding_tf_same()) {
      return (input_height() + subsampling_height() - 1) / subsampling_height();
    } else {
      const size_t padded_input_height = padding_top() + input_height() + padding_bottom();
      if (padded_input_height <= dilated_kernel_height()) {
        return 1;
      } else {
        return (padded_input_height - dilated_kernel_height()) / subsampling_height() + 1;
      }
    }
  }

  inline size_t output_width() const {
    if (padding_tf_same()) {
      return (input_width() + subsampling_width() - 1) / subsampling_width();
    } else {
      const size_t padded_input_width = padding_left() + input_width() + padding_right();
      if (padded_input_width <= dilated_kernel_width()) {
        return 1;
      } else {
        return (padded_input_width - dilated_kernel_width()) / subsampling_width() + 1;
      }
    }
  }
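
  // Worked example of the output-size arithmetic above (illustrative commentary, not part
  // of the original tester): with input_height() = 7, padding_top() = padding_bottom() = 1,
  // kernel_height() = 3, dilation_height() = 2, and subsampling_height() = 2, the dilated
  // kernel height is (3 - 1) * 2 + 1 = 5, the padded input height is 1 + 7 + 1 = 9, and
  // output_height() = (9 - 5) / 2 + 1 = 3. The TF-SAME branch instead returns
  // ceil(input / stride), here (7 + 2 - 1) / 2 = 4.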

  inline ConvolutionOperatorTester& next_input_size(uint32_t next_input_height, uint32_t next_input_width) {
    assert(next_input_height >= 1);
    assert(next_input_width >= 1);
    this->next_input_height_ = next_input_height;
    this->next_input_width_ = next_input_width;
    return *this;
  }

  inline ConvolutionOperatorTester& next_input_height(uint32_t next_input_height) {
    assert(next_input_height >= 1);
    this->next_input_height_ = next_input_height;
    return *this;
  }

  inline uint32_t next_input_height() const {
    if (this->next_input_height_ == 0) {
      return input_height();
    } else {
      return this->next_input_height_;
    }
  }

  inline ConvolutionOperatorTester& next_input_width(uint32_t next_input_width) {
    assert(next_input_width >= 1);
    this->next_input_width_ = next_input_width;
    return *this;
  }

  inline uint32_t next_input_width() const {
    if (this->next_input_width_ == 0) {
      return input_width();
    } else {
      return this->next_input_width_;
    }
  }

  inline size_t next_output_height() const {
    const size_t padded_input_height = padding_top() + next_input_height() + padding_bottom();
    if (padded_input_height <= dilated_kernel_height()) {
      return 1;
    } else {
      return (padded_input_height - dilated_kernel_height()) / subsampling_height() + 1;
    }
  }

  inline size_t next_output_width() const {
    const size_t padded_input_width = padding_left() + next_input_width() + padding_right();
    if (padded_input_width <= dilated_kernel_width()) {
      return 1;
    } else {
      return (padded_input_width - dilated_kernel_width()) / subsampling_width() + 1;
    }
  }

  inline ConvolutionOperatorTester& next_batch_size(size_t next_batch_size) {
    assert(next_batch_size >= 1);
    this->next_batch_size_ = next_batch_size;
    return *this;
  }

  inline size_t next_batch_size() const {
    if (this->next_batch_size_ == 0) {
      return batch_size();
    } else {
      return this->next_batch_size_;
    }
  }

  inline ConvolutionOperatorTester& sparsity(float sparsity) {
    this->sparsity_ = sparsity;
    return *this;
  }

  inline float sparsity() const {
    return this->sparsity_;
  }

  inline ConvolutionOperatorTester& qmin(uint8_t qmin) {
    this->qmin_ = qmin;
    return *this;
  }

  inline uint8_t qmin() const {
    return this->qmin_;
  }

  inline ConvolutionOperatorTester& qmax(uint8_t qmax) {
    this->qmax_ = qmax;
    return *this;
  }

  inline uint8_t qmax() const {
    return this->qmax_;
  }

  inline ConvolutionOperatorTester& force_nhwc_input(bool force_nhwc_input) {
    this->force_nhwc_input_ = force_nhwc_input;
    return *this;
  }

  inline bool force_nhwc_input() const {
    return this->force_nhwc_input_;
  }

  inline ConvolutionOperatorTester& depthwise_layout(bool depthwise_layout) {
    this->depthwise_layout_ = depthwise_layout;
    return *this;
  }

  inline bool depthwise_layout() const {
    return this->depthwise_layout_;
  }

  inline ConvolutionOperatorTester& has_bias(bool has_bias) {
    this->has_bias_ = has_bias;
    return *this;
  }

  inline bool has_bias() const {
    return this->has_bias_;
  }

  inline ConvolutionOperatorTester& weights_type(WeightsType weights_type) {
    this->weights_type_ = weights_type;
    return *this;
  }

  inline WeightsType weights_type() const {
    return this->weights_type_;
  }

  inline ConvolutionOperatorTester& iterations(size_t iterations) {
    this->iterations_ = iterations;
    return *this;
  }

  inline size_t iterations() const {
    return this->iterations_;
  }

#if XNN_PLATFORM_JIT
  inline ConvolutionOperatorTester& use_jit(bool use_jit) {
    this->use_jit_ = use_jit;
    return *this;
  }

  inline bool use_jit() const {
    return this->use_jit_;
  }
#endif

  inline ConvolutionOperatorTester& use_weights_cache(bool use_weights_cache) {
    this->use_weights_cache_ = use_weights_cache;
    return *this;
  }

  inline bool use_weights_cache() const {
    return this->use_weights_cache_;
  }
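
  // Typical usage (a minimal sketch, not taken from this file): the tester is configured
  // through the chained setters above and then driven by one of the Test* methods from a
  // gtest test case. The test name and parameter values below are hypothetical.
  //
  //   TEST(CONVOLUTION_NHWC_F32, example_3x3s2_grouped) {  // hypothetical test
  //     ConvolutionOperatorTester()
  //       .batch_size(2)
  //       .input_size(13, 14)
  //       .padding(1)
  //       .kernel_size(3, 3)
  //       .subsampling(2)
  //       .groups(2)
  //       .group_input_channels(7)
  //       .group_output_channels(5)
  //       .iterations(3)
  //       .TestNHWCxF32();
  //   }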

  void TestNHWCxQC8() const {
    ASSERT_EQ(weights_type(), WeightsType::Default);

    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    std::uniform_int_distribution<int32_t> i32dist(-10000, 10000);
    std::uniform_int_distribution<int32_t> i8dist(
      std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max());
    std::uniform_int_distribution<int32_t> w8dist(
      -std::numeric_limits<int8_t>::max(), std::numeric_limits<int8_t>::max());

    std::vector<int8_t> input(XNN_EXTRA_BYTES / sizeof(int8_t) +
      batch_size() * ((input_height() * input_width() - 1) * input_channel_stride() + groups() * group_input_channels()));
    std::vector<int8_t> kernel(groups() * group_output_channels() * kernel_height() * kernel_width() * group_input_channels());
    std::vector<int32_t> bias(groups() * group_output_channels());
    std::vector<int8_t> output(batch_size() * ((output_height() * output_width() - 1) * output_channel_stride() + groups() * group_output_channels()));
    std::vector<int32_t> accumulators(batch_size() * output_height() * output_width() * groups() * group_output_channels());
    std::vector<double> output_ref(batch_size() * output_height() * output_width() * groups() * group_output_channels());
    std::vector<float> requantization_scales(groups() * group_output_channels());

    const int8_t input_zero_point = -1;
    const int8_t output_zero_point = -1;

    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      std::generate(input.begin(), input.end(), [&]() { return i8dist(rng); });
      std::generate(kernel.begin(), kernel.end(), [&]() { return w8dist(rng); });
      std::generate(bias.begin(), bias.end(), [&]() { return i32dist(rng); });
      std::fill(output.begin(), output.end(), INT8_C(0xA5));

      // Compute reference results, without renormalization.
      if (depthwise_layout()) {
        ASSERT_EQ(group_input_channels(), 1);
        xnnpack::compute_depthwise_convolution_qs8_reference_results(
          batch_size(),
          output_height(),
          output_width(),
          input_height(),
          input_width(),
          padding_top(),
          padding_right(),
          padding_bottom(),
          padding_left(),
          kernel_height(),
          kernel_width(),
          subsampling_height(),
          subsampling_width(),
          dilation_height(),
          dilation_width(),
          groups(),
          group_output_channels(),
          input_channel_stride(),
          input_zero_point,
          input,
          kernel,
          accumulators,
          has_bias(),
          bias);
      } else {
        xnnpack::compute_convolution_qs8_reference_results(
          batch_size(),
          output_height(),
          output_width(),
          input_height(),
          input_width(),
          padding_top(),
          padding_right(),
          padding_bottom(),
          padding_left(),
          kernel_height(),
          kernel_width(),
          subsampling_height(),
          subsampling_width(),
          dilation_height(),
          dilation_width(),
          groups(),
          group_input_channels(),
          group_output_channels(),
          input_channel_stride(),
          input_zero_point,
          input,
          kernel,
          accumulators,
          has_bias(),
          bias);
      }

      // Compute renormalization parameters.
      for (size_t c = 0; c < groups() * group_output_channels(); c++) {
        int32_t accumulated_min = accumulators[c];
        int32_t accumulated_max = accumulators[c];
        for (size_t px = 0; px < batch_size() * output_height() * output_width(); px++) {
          accumulated_min = std::min(accumulated_min, accumulators[px * groups() * group_output_channels() + c]);
          accumulated_max = std::max(accumulated_max, accumulators[px * groups() * group_output_channels() + c]);
        }

        float requantization_scale = 0x1.0p-32f;
        if (accumulated_max != 0) {
          requantization_scale = std::max(requantization_scale,
            float(int32_t(std::numeric_limits<int8_t>::max()) - int32_t(output_zero_point)) / float(accumulated_max));
        }
        if (accumulated_min != 0) {
          requantization_scale = std::max(requantization_scale,
            float(int32_t(std::numeric_limits<int8_t>::min()) - int32_t(output_zero_point)) / float(accumulated_min));
        }
        requantization_scale = std::min(requantization_scale, 0x1.FFFFFEp-1f);

        requantization_scales[c] = requantization_scale;
      }

      // Renormalize reference results.
      for (size_t c = 0; c < groups() * group_output_channels(); c++) {
        for (size_t px = 0; px < batch_size() * output_height() * output_width(); px++) {
          output_ref[px * groups() * group_output_channels() + c] = double(int32_t(output_zero_point)) +
            double(accumulators[px * groups() * group_output_channels() + c]) * double(requantization_scales[c]);
        }
      }
      std::transform(output_ref.cbegin(), output_ref.cend(), output_ref.begin(),
        [this](double x) -> double {
          return std::max<double>(std::min<double>(x, double(qmax() - 0x80)), double(qmin() - 0x80));
        });

      // Create, setup, run, and destroy Convolution operator.
      ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
      xnn_operator_t convolution_op = nullptr;
      xnn_caches caches = {
        .code_cache = NULL,
        .weights_cache = NULL,
      };
      xnn_weights_cache weights_cache;
      if (use_weights_cache()) {
        xnn_init_weights_cache(&weights_cache);
        caches.weights_cache = &weights_cache;
      }

      xnn_status status = xnn_create_convolution2d_nhwc_qc8(
        padding_tf_same() ? 0 : padding_top(), padding_tf_same() ? 0 : padding_right(),
        padding_tf_same() ? 0 : padding_bottom(), padding_tf_same() ? 0 : padding_left(),
        kernel_height(), kernel_width(),
        subsampling_height(), subsampling_width(),
        dilation_height(), dilation_width(),
        groups(), group_input_channels(), group_output_channels(),
        input_channel_stride(), output_channel_stride(),
        input_zero_point, 1.0f /* input scale */, requantization_scales.data(),
        kernel.data(), has_bias() ? bias.data() : nullptr,
        output_zero_point, 1.0f /* output scale */, int8_t(qmin() - 0x80), int8_t(qmax() - 0x80),
        (depthwise_layout() ? XNN_FLAG_DEPTHWISE_CONVOLUTION : 0) | (padding_tf_same() ? XNN_FLAG_TENSORFLOW_SAME_PADDING : 0),
        &caches,
        &convolution_op);
      if (status == xnn_status_unsupported_hardware) {
        GTEST_SKIP();
      }
      ASSERT_EQ(xnn_status_success, status);
      ASSERT_NE(nullptr, convolution_op);
      if (use_weights_cache()) {
        ASSERT_EQ(xnn_status_success,
                  xnn_finalize_weights_cache(&weights_cache, xnn_weights_cache_finalization_kind_soft));
      }

      // Smart pointer to automatically delete convolution_op.
      std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_convolution_op(convolution_op, xnn_delete_operator);

      ASSERT_EQ(xnn_status_success,
        xnn_setup_convolution2d_nhwc_qc8(
          convolution_op,
          batch_size(), input_height(), input_width(),
          input.data(), output.data(),
          nullptr /* thread pool */));

      ASSERT_EQ(xnn_status_success,
        xnn_run_operator(convolution_op, nullptr /* thread pool */));

      // Verify results.
      VerifyNHWCxQC8(output, output_ref);

      if (use_weights_cache()) {
        xnn_operator_t convolution_op2 = nullptr;
        size_t old_weights_cache_size = weights_cache.cache.weights.size;

        xnn_status status = xnn_create_convolution2d_nhwc_qc8(
          padding_tf_same() ? 0 : padding_top(), padding_tf_same() ? 0 : padding_right(),
          padding_tf_same() ? 0 : padding_bottom(), padding_tf_same() ? 0 : padding_left(),
          kernel_height(), kernel_width(),
          subsampling_height(), subsampling_width(),
          dilation_height(), dilation_width(),
          groups(), group_input_channels(), group_output_channels(),
          input_channel_stride(), output_channel_stride(),
          input_zero_point, 1.0f /* input scale */, requantization_scales.data(),
          kernel.data(), has_bias() ? bias.data() : nullptr,
          output_zero_point, 1.0f /* output scale */, int8_t(qmin() - 0x80), int8_t(qmax() - 0x80),
          (depthwise_layout() ? XNN_FLAG_DEPTHWISE_CONVOLUTION : 0) | (padding_tf_same() ? XNN_FLAG_TENSORFLOW_SAME_PADDING : 0),
          &caches,
          &convolution_op2);
        (void) status;
        ASSERT_NE(nullptr, convolution_op2);

        // Smart pointer to automatically delete convolution_op2.
        std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_convolution_op(convolution_op2, xnn_delete_operator);
        std::vector<int8_t> output2(output.size(), INT8_C(0xA5));
        ASSERT_EQ(xnn_status_success,
          xnn_setup_convolution2d_nhwc_qc8(
            convolution_op2,
            batch_size(), input_height(), input_width(),
            input.data(), output2.data(),
            nullptr /* thread pool */));

        ASSERT_EQ(xnn_status_success,
          xnn_run_operator(convolution_op2, nullptr /* thread pool */));

        VerifyNHWCxQC8(output2, output_ref);
        VerifyWeightsCache(weights_cache, old_weights_cache_size);
        xnn_release_weights_cache(&weights_cache);
      }
    }
  }

  void VerifyNHWCxQC8(const std::vector<int8_t> &output,
                      const std::vector<double> &output_ref) const {
    for (size_t i = 0; i < batch_size(); i++) {
      for (size_t y = 0; y < output_height(); y++) {
        for (size_t x = 0; x < output_width(); x++) {
          for (size_t g = 0; g < groups(); g++) {
            for (size_t c = 0; c < group_output_channels(); c++) {
              ASSERT_LE(int32_t(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), int32_t(qmax() - 0x80))
                << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
              ASSERT_GE(int32_t(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), int32_t(qmin() - 0x80))
                << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
              ASSERT_NEAR(
                  output_ref[(((i * output_height() + y) * output_width() + x) * groups() + g) * group_output_channels() + c],
                  double(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c]),
                  0.9)
                << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
            }
          }
        }
      }
    }
  }

  void TestNHWCxQS8() const {
    ASSERT_EQ(weights_type(), WeightsType::Default);

    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    std::uniform_int_distribution<int32_t> i32dist(-10000, 10000);
    std::uniform_int_distribution<int32_t> i8dist(
      std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max());
    std::uniform_int_distribution<int32_t> w8dist(
      -std::numeric_limits<int8_t>::max(), std::numeric_limits<int8_t>::max());

    std::vector<int8_t> input(XNN_EXTRA_BYTES / sizeof(int8_t) +
      batch_size() * ((input_height() * input_width() - 1) * input_channel_stride() + groups() * group_input_channels()));
    std::vector<int8_t> kernel(groups() * group_output_channels() * kernel_height() * kernel_width() * group_input_channels());
    std::vector<int32_t> bias(groups() * group_output_channels());
    std::vector<int8_t> output(batch_size() * ((output_height() * output_width() - 1) * output_channel_stride() + groups() * group_output_channels()));
    std::vector<int32_t> accumulators(batch_size() * output_height() * output_width() * groups() * group_output_channels());
    std::vector<double> output_ref(batch_size() * output_height() * output_width() * groups() * group_output_channels());

    const int8_t input_zero_point = -1;

    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      std::generate(input.begin(), input.end(), [&]() { return i8dist(rng); });
      std::generate(kernel.begin(), kernel.end(), [&]() { return w8dist(rng); });
      std::generate(bias.begin(), bias.end(), [&]() { return i32dist(rng); });
      std::fill(output.begin(), output.end(), INT8_C(0xA5));

      // Compute reference results, without renormalization.
      if (depthwise_layout()) {
        ASSERT_EQ(group_input_channels(), 1);
        xnnpack::compute_depthwise_convolution_qs8_reference_results(
          batch_size(),
          output_height(),
          output_width(),
          input_height(),
          input_width(),
          padding_top(),
          padding_right(),
          padding_bottom(),
          padding_left(),
          kernel_height(),
          kernel_width(),
          subsampling_height(),
          subsampling_width(),
          dilation_height(),
          dilation_width(),
          groups(),
          group_output_channels(),
          input_channel_stride(),
          input_zero_point,
          input,
          kernel,
          accumulators,
          has_bias(),
          bias);
      } else {
        xnnpack::compute_convolution_qs8_reference_results(
          batch_size(),
          output_height(),
          output_width(),
          input_height(),
          input_width(),
          padding_top(),
          padding_right(),
          padding_bottom(),
          padding_left(),
          kernel_height(),
          kernel_width(),
          subsampling_height(),
          subsampling_width(),
          dilation_height(),
          dilation_width(),
          groups(),
          group_input_channels(),
          group_output_channels(),
          input_channel_stride(),
          input_zero_point,
          input,
          kernel,
          accumulators,
          has_bias(),
          bias);
      }

      // Compute renormalization parameters.
      const int32_t accumulated_min = *std::min_element(accumulators.cbegin(), accumulators.cend());
      const int32_t accumulated_max = *std::max_element(accumulators.cbegin(), accumulators.cend());

      const double output_scale = double(uint32_t(accumulated_max - accumulated_min)) / 255.0;
      const int8_t output_zero_point = int8_t(std::max(std::min(
        lrint(-0.5 - 0.5 * double(accumulated_min + accumulated_max) / output_scale),
        long(std::numeric_limits<int8_t>::max())), long(std::numeric_limits<int8_t>::min())));

      // Renormalize reference results.
      std::transform(accumulators.cbegin(), accumulators.cend(), output_ref.begin(),
        [this, output_scale, output_zero_point](int32_t x) -> double {
          return std::max<double>(std::min<double>(double(x) / output_scale, double(qmax() - 0x80) - output_zero_point), double(qmin() - 0x80) - output_zero_point);
        });

      // Create, setup, run, and destroy Convolution operator.
      ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
      xnn_operator_t convolution_op = nullptr;
      xnn_caches caches = {
        .code_cache = NULL,
        .weights_cache = NULL,
      };
      xnn_weights_cache weights_cache;
      if (use_weights_cache()) {
        xnn_init_weights_cache(&weights_cache);
        caches.weights_cache = &weights_cache;
      }

      xnn_status status = xnn_create_convolution2d_nhwc_qs8(
        padding_tf_same() ? 0 : padding_top(), padding_tf_same() ? 0 : padding_right(),
        padding_tf_same() ? 0 : padding_bottom(), padding_tf_same() ? 0 : padding_left(),
        kernel_height(), kernel_width(),
        subsampling_height(), subsampling_width(),
        dilation_height(), dilation_width(),
        groups(), group_input_channels(), group_output_channels(),
        input_channel_stride(), output_channel_stride(),
        input_zero_point, 1.0f /* input scale */, 1.0f /* kernel scale */,
        kernel.data(), has_bias() ? bias.data() : nullptr,
        output_zero_point, output_scale, int8_t(qmin() - 0x80), int8_t(qmax() - 0x80),
        (depthwise_layout() ? XNN_FLAG_DEPTHWISE_CONVOLUTION : 0) | (padding_tf_same() ? XNN_FLAG_TENSORFLOW_SAME_PADDING : 0),
        &caches,
        &convolution_op);
      if (status == xnn_status_unsupported_hardware) {
        GTEST_SKIP();
      }
      ASSERT_EQ(xnn_status_success, status);
      ASSERT_NE(nullptr, convolution_op);
      if (use_weights_cache()) {
        ASSERT_EQ(xnn_status_success,
                  xnn_finalize_weights_cache(&weights_cache, xnn_weights_cache_finalization_kind_soft));
      }

      // Smart pointer to automatically delete convolution_op.
      std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_convolution_op(convolution_op, xnn_delete_operator);

      ASSERT_EQ(xnn_status_success,
        xnn_setup_convolution2d_nhwc_qs8(
          convolution_op,
          batch_size(), input_height(), input_width(),
          input.data(), output.data(),
          nullptr /* thread pool */));

      ASSERT_EQ(xnn_status_success,
        xnn_run_operator(convolution_op, nullptr /* thread pool */));

      VerifyNHWCxQS8(output, output_ref, output_zero_point);

      if (use_weights_cache()) {
        xnn_operator_t convolution_op2 = nullptr;
        size_t old_weights_cache_size = weights_cache.cache.weights.size;

        ASSERT_EQ(
            xnn_status_success,
            xnn_create_convolution2d_nhwc_qs8(
                padding_tf_same() ? 0 : padding_top(),
                padding_tf_same() ? 0 : padding_right(),
                padding_tf_same() ? 0 : padding_bottom(),
                padding_tf_same() ? 0 : padding_left(), kernel_height(),
                kernel_width(), subsampling_height(), subsampling_width(),
                dilation_height(), dilation_width(), groups(),
                group_input_channels(), group_output_channels(),
                input_channel_stride(), output_channel_stride(),
                input_zero_point, 1.0f /* input scale */,
                1.0f /* kernel scale */, kernel.data(),
                has_bias() ? bias.data() : nullptr, output_zero_point,
                output_scale, int8_t(qmin() - 0x80), int8_t(qmax() - 0x80),
                (depthwise_layout() ? XNN_FLAG_DEPTHWISE_CONVOLUTION : 0) |
                    (padding_tf_same() ? XNN_FLAG_TENSORFLOW_SAME_PADDING : 0),
                &caches, &convolution_op2));
        ASSERT_NE(nullptr, convolution_op2);

        // Smart pointer to automatically delete convolution_op.
        std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)>
            auto_convolution_op(convolution_op2, xnn_delete_operator);

        std::vector<int8_t> output2(output.size(), INT8_C(0xA5));
        ASSERT_EQ(xnn_status_success,
                  xnn_setup_convolution2d_nhwc_qs8(
                      convolution_op2, batch_size(), input_height(),
                      input_width(), input.data(), output2.data(),
                      nullptr /* thread pool */));

        ASSERT_EQ(xnn_status_success,
                  xnn_run_operator(convolution_op2, nullptr /* thread pool */));

        VerifyNHWCxQS8(output2, output_ref, output_zero_point);
        VerifyWeightsCache(weights_cache, old_weights_cache_size);
        xnn_release_weights_cache(&weights_cache);
      }
    }
  }

  void VerifyNHWCxQS8(const std::vector<int8_t> &output,
                      const std::vector<double> &output_ref,
                      const int8_t output_zero_point) const {
    for (size_t i = 0; i < batch_size(); i++) {
      for (size_t y = 0; y < output_height(); y++) {
        for (size_t x = 0; x < output_width(); x++) {
          for (size_t g = 0; g < groups(); g++) {
            for (size_t c = 0; c < group_output_channels(); c++) {
              ASSERT_LE(int32_t(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), int32_t(qmax() - 0x80))
                << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
              ASSERT_GE(int32_t(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), int32_t(qmin() - 0x80))
                << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
              ASSERT_NEAR(
                  output_ref[(((i * output_height() + y) * output_width() + x) * groups() + g) * group_output_channels() + c],
                  double(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c]) - double(output_zero_point),
                  0.9)
                << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
            }
          }
        }
      }
    }
  }

  void TestNHWCxQU8() const {
    ASSERT_EQ(weights_type(), WeightsType::Default);

    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    std::uniform_int_distribution<int32_t> i32dist(-10000, 10000);
    std::uniform_int_distribution<int32_t> u8dist(
      std::numeric_limits<uint8_t>::min(), std::numeric_limits<uint8_t>::max());

    std::vector<uint8_t> input(XNN_EXTRA_BYTES / sizeof(uint8_t) +
      batch_size() * ((input_height() * input_width() - 1) * input_channel_stride() + groups() * group_input_channels()));
    std::vector<uint8_t> kernel(groups() * group_output_channels() * kernel_height() * kernel_width() * group_input_channels());
    std::vector<int32_t> bias(groups() * group_output_channels());
    std::vector<uint8_t> output(batch_size() * ((output_height() * output_width() - 1) * output_channel_stride() + groups() * group_output_channels()));
    std::vector<int32_t> accumulators(batch_size() * output_height() * output_width() * groups() * group_output_channels());
    std::vector<double> output_ref(batch_size() * output_height() * output_width() * groups() * group_output_channels());

    const uint8_t input_zero_point = 127;
    const uint8_t kernel_zero_point = 127;

    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      std::generate(input.begin(), input.end(), [&]() { return u8dist(rng); });
      std::generate(kernel.begin(), kernel.end(), [&]() { return u8dist(rng); });
      std::generate(bias.begin(), bias.end(), [&]() { return i32dist(rng); });
      std::fill(output.begin(), output.end(), UINT8_C(0xA5));

      // Compute reference results, without renormalization.
      if (has_bias()) {
        for (size_t i = 0; i < batch_size(); i++) {
          for (size_t oy = 0; oy < output_height(); oy++) {
            for (size_t ox = 0; ox < output_width(); ox++) {
              for (size_t g = 0; g < groups(); g++) {
                for (size_t oc = 0; oc < group_output_channels(); oc++) {
                  accumulators[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] =
                    bias[g * group_output_channels() + oc];
                }
              }
            }
          }
        }
      } else {
        std::fill(accumulators.begin(), accumulators.end(), 0);
      }
      if (depthwise_layout()) {
        ASSERT_EQ(group_input_channels(), 1);
        xnnpack::compute_depthwise_convolution_qu8_reference_results(
          batch_size(),
          output_height(),
          output_width(),
          input_height(),
          input_width(),
          padding_top(),
          padding_right(),
          padding_bottom(),
          padding_left(),
          kernel_height(),
          kernel_width(),
          subsampling_height(),
          subsampling_width(),
          dilation_height(),
          dilation_width(),
          groups(),
          group_output_channels(),
          input_channel_stride(),
          input_zero_point,
          kernel_zero_point,
          input,
          kernel,
          accumulators,
          has_bias(),
          bias);
      } else {
        xnnpack::compute_convolution_qu8_reference_results(
          batch_size(),
          output_height(),
          output_width(),
          input_height(),
          input_width(),
          padding_top(),
          padding_right(),
          padding_bottom(),
          padding_left(),
          kernel_height(),
          kernel_width(),
          subsampling_height(),
          subsampling_width(),
          dilation_height(),
          dilation_width(),
          groups(),
          group_input_channels(),
          group_output_channels(),
          input_channel_stride(),
          input_zero_point,
          kernel_zero_point,
          input,
          kernel,
          accumulators,
          has_bias(),
          bias);
      }

      // Compute renormalization parameters.
      const int32_t accumulated_min = *std::min_element(accumulators.cbegin(), accumulators.cend());
      const int32_t accumulated_max = *std::max_element(accumulators.cbegin(), accumulators.cend());

      const double output_scale = double(uint32_t(accumulated_max - accumulated_min)) / 255.0;
      const uint8_t output_zero_point = uint8_t(std::max(std::min(
        lrint(127.5 - 0.5 * double(accumulated_min + accumulated_max) / output_scale),
        long(std::numeric_limits<uint8_t>::max())), long(std::numeric_limits<uint8_t>::min())));

      // Renormalize reference results.
      std::transform(accumulators.cbegin(), accumulators.cend(), output_ref.begin(),
        [this, output_scale, output_zero_point](int32_t x) -> double {
          return std::max<double>(std::min<double>(double(x) / output_scale, double(qmax()) - output_zero_point), double(qmin()) - output_zero_point);
        });

      // Create, setup, run, and destroy Convolution operator.
      ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
      xnn_operator_t convolution_op = nullptr;

      xnn_caches caches = {
        .code_cache = NULL,
        .weights_cache = NULL,
      };
      xnn_weights_cache weights_cache;
      if (use_weights_cache()) {
        xnn_init_weights_cache(&weights_cache);
        caches.weights_cache = &weights_cache;
      }

      xnn_status status = xnn_create_convolution2d_nhwc_qu8(
        padding_tf_same() ? 0 : padding_top(), padding_tf_same() ? 0 : padding_right(),
        padding_tf_same() ? 0 : padding_bottom(), padding_tf_same() ? 0 : padding_left(),
        kernel_height(), kernel_width(),
        subsampling_height(), subsampling_width(),
        dilation_height(), dilation_width(),
        groups(), group_input_channels(), group_output_channels(),
        input_channel_stride(), output_channel_stride(),
        input_zero_point, 1.0f /* input scale */,
        kernel_zero_point, 1.0f /* kernel scale */,
        kernel.data(), has_bias() ? bias.data() : nullptr,
        output_zero_point, output_scale, qmin(), qmax(),
        (depthwise_layout() ? XNN_FLAG_DEPTHWISE_CONVOLUTION : 0) | (padding_tf_same() ? XNN_FLAG_TENSORFLOW_SAME_PADDING : 0),
        &caches,
        &convolution_op);
      if (status == xnn_status_unsupported_hardware) {
        GTEST_SKIP();
      }
      ASSERT_EQ(xnn_status_success, status);
      ASSERT_NE(nullptr, convolution_op);
      if (use_weights_cache()) {
        ASSERT_EQ(xnn_status_success,
                  xnn_finalize_weights_cache(&weights_cache, xnn_weights_cache_finalization_kind_soft));
      }

      // Smart pointer to automatically delete convolution_op.
      std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_convolution_op(convolution_op, xnn_delete_operator);

      ASSERT_EQ(xnn_status_success,
        xnn_setup_convolution2d_nhwc_qu8(
          convolution_op,
          batch_size(), input_height(), input_width(),
          input.data(), output.data(),
          nullptr /* thread pool */));

      ASSERT_EQ(xnn_status_success,
        xnn_run_operator(convolution_op, nullptr /* thread pool */));

      // Verify results.
      VerifyNHWCxQU8(output, output_ref, output_zero_point);

      if (use_weights_cache()) {
        xnn_operator_t convolution_op2 = nullptr;
        size_t old_weights_cache_size = weights_cache.cache.weights.size;

        ASSERT_EQ(
            xnn_status_success,
            xnn_create_convolution2d_nhwc_qu8(
                padding_tf_same() ? 0 : padding_top(),
                padding_tf_same() ? 0 : padding_right(),
                padding_tf_same() ? 0 : padding_bottom(),
                padding_tf_same() ? 0 : padding_left(), kernel_height(),
                kernel_width(), subsampling_height(), subsampling_width(),
                dilation_height(), dilation_width(), groups(),
                group_input_channels(), group_output_channels(),
                input_channel_stride(), output_channel_stride(),
                input_zero_point, 1.0f /* input scale */, kernel_zero_point,
                1.0f /* kernel scale */, kernel.data(),
                has_bias() ? bias.data() : nullptr, output_zero_point,
                output_scale, qmin(), qmax(),
                (depthwise_layout() ? XNN_FLAG_DEPTHWISE_CONVOLUTION : 0) |
                    (padding_tf_same() ? XNN_FLAG_TENSORFLOW_SAME_PADDING : 0),
                &caches, &convolution_op2));
        ASSERT_NE(nullptr, convolution_op2);

        // Smart pointer to automatically delete convolution_op2.
        std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)>
            auto_convolution_op2(convolution_op2, xnn_delete_operator);
        std::vector<uint8_t> output2(output.size(), UINT8_C(0xA5));

        ASSERT_EQ(xnn_status_success,
                  xnn_setup_convolution2d_nhwc_qu8(
                      convolution_op2, batch_size(), input_height(),
                      input_width(), input.data(), output2.data(),
                      nullptr /* thread pool */));

        ASSERT_EQ(xnn_status_success,
                  xnn_run_operator(convolution_op2, nullptr /* thread pool */));

        // Verify results.
        VerifyNHWCxQU8(output2, output_ref, output_zero_point);
        VerifyWeightsCache(weights_cache, old_weights_cache_size);
        xnn_release_weights_cache(&weights_cache);
      }
    }
  }

  void VerifyNHWCxQU8(const std::vector<uint8_t> &output,
                      const std::vector<double> &output_ref,
                      const uint8_t output_zero_point) const {
    for (size_t i = 0; i < batch_size(); i++) {
      for (size_t y = 0; y < output_height(); y++) {
        for (size_t x = 0; x < output_width(); x++) {
          for (size_t g = 0; g < groups(); g++) {
            for (size_t c = 0; c < group_output_channels(); c++) {
              ASSERT_LE(int32_t(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), int32_t(qmax()))
                << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
              ASSERT_GE(int32_t(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), int32_t(qmin()))
                << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
              ASSERT_NEAR(
                  output_ref[(((i * output_height() + y) * output_width() + x) * groups() + g) * group_output_channels() + c],
                  double(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c]) - double(output_zero_point),
                  0.9)
                << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
            }
          }
        }
      }
    }
  }
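
  // Note on the float paths below (added commentary, not from the original source): qmin()
  // and qmax() are reused as fractions of the accumulated range to derive output_min and
  // output_max. For example, with accumulated_min = -10.0f, accumulated_max = 10.0f,
  // qmin() = 64 and qmax() = 192, the clamping window becomes
  // output_min = -10 + 20/255 * 64 ≈ -4.98 and output_max = 10 - 20/255 * (255 - 192) ≈ 5.06,
  // so roughly the middle half of the observed range survives clamping.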
std::generate(bias.begin(), bias.end(), [&]() { return f32dist(rng); }); 1262 std::fill(output.begin(), output.end(), nanf("")); 1263 1264 // Compute reference results, without clamping. 1265 if (has_bias()) { 1266 for (size_t i = 0; i < batch_size(); i++) { 1267 for (size_t oy = 0; oy < output_height(); oy++) { 1268 for (size_t ox = 0; ox < output_width(); ox++) { 1269 for (size_t g = 0; g < groups(); g++) { 1270 for (size_t oc = 0; oc < group_output_channels(); oc++) { 1271 output_ref[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] = 1272 bias[g * group_output_channels() + oc]; 1273 } 1274 } 1275 } 1276 } 1277 } 1278 } else { 1279 std::fill(output_ref.begin(), output_ref.end(), 0.0f); 1280 } 1281 if (depthwise_layout()) { 1282 ASSERT_EQ(group_input_channels(), 1); 1283 1284 for (size_t i = 0; i < batch_size(); i++) { 1285 for (size_t oy = 0; oy < output_height(); oy++) { 1286 for (size_t ox = 0; ox < output_width(); ox++) { 1287 for (size_t ky = 0; ky < kernel_height(); ky++) { 1288 const size_t iy = oy * subsampling_height() + ky * dilation_height() - padding_top(); 1289 if (iy < input_height()) { 1290 for (size_t kx = 0; kx < kernel_width(); kx++) { 1291 const size_t ix = ox * subsampling_width() + kx * dilation_width() - padding_left(); 1292 if (ix < input_width()) { 1293 for (size_t g = 0; g < groups(); g++) { 1294 for (size_t oc = 0; oc < group_output_channels(); oc++) { 1295 output_ref[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] += 1296 input[((i * input_height() + iy) * input_width() + ix) * input_channel_stride() + g] * 1297 kernel[((ky * kernel_width() + kx) * groups() + g) * group_output_channels() + oc]; 1298 } 1299 } 1300 } 1301 } 1302 } 1303 } 1304 } 1305 } 1306 } 1307 } else { 1308 for (size_t i = 0; i < batch_size(); i++) { 1309 for (size_t oy = 0; oy < output_height(); oy++) { 1310 for (size_t ox = 0; ox < output_width(); ox++) { 1311 for (size_t ky = 0; ky < kernel_height(); ky++) { 1312 const size_t iy = oy * subsampling_height() + ky * dilation_height() - padding_top(); 1313 if (iy < input_height()) { 1314 for (size_t kx = 0; kx < kernel_width(); kx++) { 1315 const size_t ix = ox * subsampling_width() + kx * dilation_width() - padding_left(); 1316 if (ix < input_width()) { 1317 for (size_t g = 0; g < groups(); g++) { 1318 for (size_t oc = 0; oc < group_output_channels(); oc++) { 1319 for (size_t ic = 0; ic < group_input_channels(); ic++) { 1320 output_ref[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] += 1321 input[((i * input_height() + iy) * input_width() + ix) * input_channel_stride() + g * group_input_channels() + ic] * 1322 kernel[(((g * group_output_channels() + oc) * kernel_height() + ky) * kernel_width() + kx) * group_input_channels() + ic]; 1323 } 1324 } 1325 } 1326 } 1327 } 1328 } 1329 } 1330 } 1331 } 1332 } 1333 } 1334 1335 // Compute clamping parameters. 1336 const float accumulated_min = *std::min_element(output_ref.cbegin(), output_ref.cend()); 1337 const float accumulated_max = *std::max_element(output_ref.cbegin(), output_ref.cend()); 1338 1339 const float output_min = accumulated_min + (accumulated_max - accumulated_min) / 255.0f * float(qmin()); 1340 const float output_max = accumulated_max - (accumulated_max - accumulated_min) / 255.0f * float(255 - qmax()); 1341 1342 // Clamp reference results. 
1343 for (float& value : output_ref) { 1344 value = std::max(std::min(value, output_max), output_min); 1345 } 1346 1347 // Create, setup, run, and destroy Convolution operator. 1348 ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); 1349 xnn_operator_t convolution_op = nullptr; 1350 1351 xnn_caches caches = { 1352 .code_cache = NULL, 1353 .weights_cache = NULL, 1354 }; 1355 #if XNN_PLATFORM_JIT 1356 xnn_code_cache code_cache; 1357 if (use_jit()) { 1358 xnn_init_code_cache(&code_cache); 1359 caches.code_cache = &code_cache; 1360 } 1361 #endif 1362 xnn_weights_cache weights_cache; 1363 if (use_weights_cache()) { 1364 xnn_init_weights_cache(&weights_cache); 1365 caches.weights_cache = &weights_cache; 1366 } 1367 1368 xnn_status status = xnn_create_convolution2d_nhwc_f32( 1369 padding_tf_same() ? 0 : padding_top(), padding_tf_same() ? 0 : padding_right(), 1370 padding_tf_same() ? 0 : padding_bottom(), padding_tf_same() ? 0 : padding_left(), 1371 kernel_height(), kernel_width(), 1372 subsampling_height(), subsampling_width(), 1373 dilation_height(), dilation_width(), 1374 groups(), group_input_channels(), group_output_channels(), 1375 input_channel_stride(), output_channel_stride(), 1376 kernel.data(), has_bias() ? bias.data() : nullptr, 1377 output_min, output_max, 1378 (depthwise_layout() ? XNN_FLAG_DEPTHWISE_CONVOLUTION : 0) | (padding_tf_same() ? XNN_FLAG_TENSORFLOW_SAME_PADDING : 0), 1379 &caches, 1380 &convolution_op); 1381 if (status == xnn_status_unsupported_hardware) { 1382 GTEST_SKIP(); 1383 } 1384 ASSERT_EQ(xnn_status_success, status); 1385 ASSERT_NE(nullptr, convolution_op); 1386 if (use_weights_cache()) { 1387 ASSERT_EQ(xnn_status_success, 1388 xnn_finalize_weights_cache(&weights_cache, xnn_weights_cache_finalization_kind_soft)); 1389 } 1390 1391 // Smart pointer to automatically delete convolution_op. 1392 std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_convolution_op(convolution_op, xnn_delete_operator); 1393 1394 #if XNN_PLATFORM_JIT 1395 if (use_jit()) { 1396 // Check that we actually generated code. 1397 ASSERT_GT(code_cache.cache.code.size, 0); 1398 xnn_finalize_code_memory(&code_cache.cache.code); 1399 } 1400 #endif 1401 1402 ASSERT_EQ(xnn_status_success, 1403 xnn_setup_convolution2d_nhwc_f32( 1404 convolution_op, 1405 batch_size(), input_height(), input_width(), 1406 input.data(), output.data(), 1407 nullptr /* thread pool */)); 1408 1409 ASSERT_EQ(xnn_status_success, 1410 xnn_run_operator(convolution_op, nullptr /* thread pool */)); 1411 1412 VerifyNHWCxF32(output, output_ref, output_min, output_max); 1413 1414 if (use_weights_cache()) { 1415 // We already finalized the code cache, so create a new code cache if we are testing JIT. 1416 #if XNN_PLATFORM_JIT 1417 xnn_code_cache inner_code_cache; 1418 if (use_jit()) { 1419 xnn_init_code_cache(&inner_code_cache); 1420 caches.code_cache = &inner_code_cache; 1421 } 1422 #endif 1423 // To test weights cache, we create the operator with the same parameters, and setup with a different output. 1424 xnn_operator_t convolution_op2 = nullptr; 1425 size_t old_weights_cache_size = weights_cache.cache.weights.size; 1426 1427 ASSERT_EQ(xnn_status_success, xnn_create_convolution2d_nhwc_f32( 1428 padding_tf_same() ? 0 : padding_top(), padding_tf_same() ? 0 : padding_right(), 1429 padding_tf_same() ? 0 : padding_bottom(), padding_tf_same() ? 
0 : padding_left(), 1430 kernel_height(), kernel_width(), 1431 subsampling_height(), subsampling_width(), 1432 dilation_height(), dilation_width(), 1433 groups(), group_input_channels(), group_output_channels(), 1434 input_channel_stride(), output_channel_stride(), 1435 kernel.data(), has_bias() ? bias.data() : nullptr, 1436 output_min, output_max, 1437 (depthwise_layout() ? XNN_FLAG_DEPTHWISE_CONVOLUTION : 0) | (padding_tf_same() ? XNN_FLAG_TENSORFLOW_SAME_PADDING : 0), 1438 &caches, 1439 &convolution_op2)); 1440 1441 ASSERT_NE(nullptr, convolution_op2); 1442 1443 #if XNN_PLATFORM_JIT 1444 if (use_jit()) { 1445 // Check that we actually generated code. 1446 ASSERT_GT(inner_code_cache.cache.code.size, 0); 1447 xnn_finalize_code_memory(&inner_code_cache.cache.code); 1448 } 1449 #endif 1450 1451 std::vector<float> output2(output.size(), nanf("")); 1452 ASSERT_EQ(xnn_status_success, 1453 xnn_setup_convolution2d_nhwc_f32( 1454 convolution_op2, 1455 batch_size(), input_height(), input_width(), 1456 input.data(), output2.data(), 1457 nullptr /* thread pool */)); 1458 ASSERT_EQ(xnn_status_success, 1459 xnn_run_operator(convolution_op2, nullptr /* thread pool */)); 1460 1461 std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_convolution_op2(convolution_op2, xnn_delete_operator); 1462 ASSERT_EQ(weights_cache.cache.hits, 1); 1463 // Ensure that we did not write more weights to the cache because it was a cache hit. 1464 ASSERT_EQ(old_weights_cache_size, weights_cache.cache.weights.size); 1465 1466 VerifyNHWCxF32(output2, output_ref, output_min, output_max); 1467 #if XNN_PLATFORM_JIT 1468 if (use_jit()) { 1469 xnn_release_code_cache(&inner_code_cache); 1470 } 1471 #endif 1472 } 1473 1474 #if XNN_PLATFORM_JIT 1475 if (use_jit()) { 1476 xnn_release_code_cache(&code_cache); 1477 } 1478 #endif 1479 if (use_weights_cache()) { 1480 xnn_release_weights_cache(&weights_cache); 1481 } 1482 } 1483 } 1484 VerifyNHWCxF32(const std::vector<float> & output,const std::vector<float> & output_ref,const float output_min,const float output_max)1485 void VerifyNHWCxF32(const std::vector<float>& output, const std::vector<float>& output_ref, const float output_min, const float output_max) const { 1486 for (size_t i = 0; i < batch_size(); i++) { 1487 for (size_t y = 0; y < output_height(); y++) { 1488 for (size_t x = 0; x < output_width(); x++) { 1489 for (size_t g = 0; g < groups(); g++) { 1490 for (size_t c = 0; c < group_output_channels(); c++) { 1491 ASSERT_GE(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c], output_min) 1492 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c; 1493 ASSERT_LE(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c], output_max) 1494 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c; 1495 ASSERT_NEAR( 1496 output_ref[(((i * output_height() + y) * output_width() + x) * groups() + g) * group_output_channels() + c], 1497 output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c], 1498 1.0e-4 * std::abs(output_ref[(((i * output_height() + y) * output_width() + x) * groups() + g) * group_output_channels() + c])) 1499 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c; 1500 } 1501 } 1502 } 1503 } 1504 } 1505 } 1506 TestNHWCxF16()1507 void TestNHWCxF16() const { 1508 switch (weights_type()) { 1509 case 
WeightsType::Default: 1510 break; 1511 case WeightsType::FP32: 1512 break; 1513 default: 1514 GTEST_FAIL() << "unexpected weights type"; 1515 } 1516 1517 std::random_device random_device; 1518 auto rng = std::mt19937(random_device()); 1519 std::uniform_real_distribution<float> f32dist(0.1f, 1.0f); 1520 1521 std::vector<uint16_t> input(XNN_EXTRA_BYTES / sizeof(uint16_t) + 1522 batch_size() * ((input_height() * input_width() - 1) * input_channel_stride() + groups() * group_input_channels())); 1523 std::vector<uint16_t> kernel(groups() * group_output_channels() * kernel_height() * kernel_width() * group_input_channels()); 1524 std::vector<float> kernel_as_float(kernel.size()); 1525 std::vector<uint16_t> bias(groups() * group_output_channels()); 1526 std::vector<float> bias_as_float(bias.size()); 1527 std::vector<uint16_t> output(batch_size() * ((output_height() * output_width() - 1) * output_channel_stride() + groups() * group_output_channels())); 1528 std::vector<float> output_ref(batch_size() * output_height() * output_width() * groups() * group_output_channels()); 1529 1530 for (size_t iteration = 0; iteration < iterations(); iteration++) { 1531 std::generate(input.begin(), input.end(), [&]() { return fp16_ieee_from_fp32_value(f32dist(rng)); }); 1532 std::generate(kernel.begin(), kernel.end(), [&]() { return fp16_ieee_from_fp32_value(f32dist(rng)); }); 1533 std::transform(kernel.cbegin(), kernel.cend(), kernel_as_float.begin(), fp16_ieee_to_fp32_value); 1534 std::generate(bias.begin(), bias.end(), [&]() { return fp16_ieee_from_fp32_value(f32dist(rng)); }); 1535 std::transform(bias.cbegin(), bias.cend(), bias_as_float.begin(), fp16_ieee_to_fp32_value); 1536 std::fill(output.begin(), output.end(), UINT16_C(0x7E00) /* NaN */); 1537 1538 1539 // Compute reference results, without clamping. 
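      // The reference result is accumulated in fp32: every fp16 input, kernel, and bias
      // value goes through fp16_ieee_to_fp32_value() before the multiply-add, so the only
      // rounding difference versus the operator under test comes from the operator's own
      // half-precision arithmetic. The direct-convolution recurrence used below is
      //   out[i, oy, ox, g, oc] = bias[g, oc]
      //       + sum over (ky, kx, ic) of in[i, iy, ix, g, ic] * w[g, oc, ky, kx, ic]
      // with iy = oy * subsampling_height() + ky * dilation_height() - padding_top()
      // (and the analogous expression for ix); the depthwise branch is the same formula
      // with group_input_channels() == 1 and the kernel laid out as [ky, kx, g, oc].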
1540 if (has_bias()) { 1541 for (size_t i = 0; i < batch_size(); i++) { 1542 for (size_t oy = 0; oy < output_height(); oy++) { 1543 for (size_t ox = 0; ox < output_width(); ox++) { 1544 for (size_t g = 0; g < groups(); g++) { 1545 for (size_t oc = 0; oc < group_output_channels(); oc++) { 1546 output_ref[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] = 1547 fp16_ieee_to_fp32_value(bias[g * group_output_channels() + oc]); 1548 } 1549 } 1550 } 1551 } 1552 } 1553 } else { 1554 std::fill(output_ref.begin(), output_ref.end(), 0.0f); 1555 } 1556 if (depthwise_layout()) { 1557 ASSERT_EQ(group_input_channels(), 1); 1558 1559 for (size_t i = 0; i < batch_size(); i++) { 1560 for (size_t oy = 0; oy < output_height(); oy++) { 1561 for (size_t ox = 0; ox < output_width(); ox++) { 1562 for (size_t ky = 0; ky < kernel_height(); ky++) { 1563 const size_t iy = oy * subsampling_height() + ky * dilation_height() - padding_top(); 1564 if (iy < input_height()) { 1565 for (size_t kx = 0; kx < kernel_width(); kx++) { 1566 const size_t ix = ox * subsampling_width() + kx * dilation_width() - padding_left(); 1567 if (ix < input_width()) { 1568 for (size_t g = 0; g < groups(); g++) { 1569 for (size_t oc = 0; oc < group_output_channels(); oc++) { 1570 output_ref[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] += 1571 fp16_ieee_to_fp32_value(input[((i * input_height() + iy) * input_width() + ix) * input_channel_stride() + g]) * 1572 fp16_ieee_to_fp32_value(kernel[((ky * kernel_width() + kx) * groups() + g) * group_output_channels() + oc]); 1573 } 1574 } 1575 } 1576 } 1577 } 1578 } 1579 } 1580 } 1581 } 1582 } else { 1583 for (size_t i = 0; i < batch_size(); i++) { 1584 for (size_t oy = 0; oy < output_height(); oy++) { 1585 for (size_t ox = 0; ox < output_width(); ox++) { 1586 for (size_t ky = 0; ky < kernel_height(); ky++) { 1587 const size_t iy = oy * subsampling_height() + ky * dilation_height() - padding_top(); 1588 if (iy < input_height()) { 1589 for (size_t kx = 0; kx < kernel_width(); kx++) { 1590 const size_t ix = ox * subsampling_width() + kx * dilation_width() - padding_left(); 1591 if (ix < input_width()) { 1592 for (size_t g = 0; g < groups(); g++) { 1593 for (size_t oc = 0; oc < group_output_channels(); oc++) { 1594 for (size_t ic = 0; ic < group_input_channels(); ic++) { 1595 output_ref[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] += 1596 fp16_ieee_to_fp32_value(input[((i * input_height() + iy) * input_width() + ix) * input_channel_stride() + g * group_input_channels() + ic]) * 1597 fp16_ieee_to_fp32_value(kernel[(((g * group_output_channels() + oc) * kernel_height() + ky) * kernel_width() + kx) * group_input_channels() + ic]); 1598 } 1599 } 1600 } 1601 } 1602 } 1603 } 1604 } 1605 } 1606 } 1607 } 1608 } 1609 1610 // Compute clamping parameters. 1611 const float accumulated_min = *std::min_element(output_ref.cbegin(), output_ref.cend()); 1612 const float accumulated_max = *std::max_element(output_ref.cbegin(), output_ref.cend()); 1613 const float accumulated_range = accumulated_max - accumulated_min; 1614 const float scaled_min = fp16_ieee_to_fp32_value(fp16_ieee_from_fp32_value(accumulated_min + accumulated_range / 255.0f * float(qmin()))); 1615 const float scaled_max = fp16_ieee_to_fp32_value(fp16_ieee_from_fp32_value(accumulated_max - accumulated_range / 255.0f * float(255 - qmax()))); 1616 const float output_min = scaled_min == scaled_max ? 
-std::numeric_limits<float>::infinity() : scaled_min; 1617 const float output_max = scaled_min == scaled_max ? +std::numeric_limits<float>::infinity() : scaled_max; 1618 1619 // Clamp reference results. 1620 for (float& value : output_ref) { 1621 value = std::max(std::min(value, output_max), output_min); 1622 } 1623 1624 // Create, setup, run, and destroy Convolution operator. 1625 ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); 1626 xnn_operator_t convolution_op = nullptr; 1627 xnn_caches caches = { 1628 .code_cache = NULL, 1629 .weights_cache = NULL, 1630 }; 1631 xnn_weights_cache weights_cache; 1632 if (use_weights_cache()) { 1633 xnn_init_weights_cache(&weights_cache); 1634 caches.weights_cache = &weights_cache; 1635 } 1636 1637 const void* kernel_data = kernel.data(); 1638 const void* bias_data = bias.data(); 1639 if (weights_type() == WeightsType::FP32) { 1640 kernel_data = kernel_as_float.data(); 1641 bias_data = bias_as_float.data(); 1642 } 1643 uint32_t flags = 0; 1644 if (depthwise_layout()) { 1645 flags |= XNN_FLAG_DEPTHWISE_CONVOLUTION; 1646 } 1647 if (padding_tf_same()) { 1648 flags |= XNN_FLAG_TENSORFLOW_SAME_PADDING; 1649 } 1650 if (weights_type() == WeightsType::FP32) { 1651 flags |= XNN_FLAG_FP32_STATIC_WEIGHTS; 1652 } 1653 xnn_status status = xnn_create_convolution2d_nhwc_f16( 1654 padding_tf_same() ? 0 : padding_top(), padding_tf_same() ? 0 : padding_right(), 1655 padding_tf_same() ? 0 : padding_bottom(), padding_tf_same() ? 0 : padding_left(), 1656 kernel_height(), kernel_width(), 1657 subsampling_height(), subsampling_width(), 1658 dilation_height(), dilation_width(), 1659 groups(), group_input_channels(), group_output_channels(), 1660 input_channel_stride(), output_channel_stride(), 1661 kernel_data, has_bias() ? bias_data : nullptr, 1662 output_min, output_max, 1663 flags, 1664 &caches, 1665 &convolution_op); 1666 if (status == xnn_status_unsupported_hardware) { 1667 GTEST_SKIP(); 1668 } 1669 ASSERT_EQ(xnn_status_success, status); 1670 ASSERT_NE(nullptr, convolution_op); 1671 if (use_weights_cache()) { 1672 ASSERT_EQ(xnn_status_success, 1673 xnn_finalize_weights_cache(&weights_cache, xnn_weights_cache_finalization_kind_soft)); 1674 } 1675 1676 // Smart pointer to automatically delete convolution_op. 1677 std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_convolution_op(convolution_op, xnn_delete_operator); 1678 1679 ASSERT_EQ(xnn_status_success, 1680 xnn_setup_convolution2d_nhwc_f16( 1681 convolution_op, 1682 batch_size(), input_height(), input_width(), 1683 input.data(), output.data(), 1684 nullptr /* thread pool */)); 1685 1686 ASSERT_EQ(xnn_status_success, 1687 xnn_run_operator(convolution_op, nullptr /* thread pool */)); 1688 1689 VerifyNHWCxF16(output, output_ref, output_min, output_max); 1690 1691 if (use_weights_cache()) { 1692 xnn_operator_t convolution_op2 = nullptr; 1693 size_t old_weights_cache_size = weights_cache.cache.weights.size; 1694 ASSERT_EQ(xnn_status_success, xnn_create_convolution2d_nhwc_f16( 1695 padding_tf_same() ? 0 : padding_top(), padding_tf_same() ? 0 : padding_right(), 1696 padding_tf_same() ? 0 : padding_bottom(), padding_tf_same() ? 0 : padding_left(), 1697 kernel_height(), kernel_width(), 1698 subsampling_height(), subsampling_width(), 1699 dilation_height(), dilation_width(), 1700 groups(), group_input_channels(), group_output_channels(), 1701 input_channel_stride(), output_channel_stride(), 1702 kernel_data, has_bias() ? 
bias_data : nullptr, 1703 output_min, output_max, 1704 flags, 1705 &caches, 1706 &convolution_op2)); 1707 ASSERT_NE(nullptr, convolution_op2); 1708 1709 // Smart pointer to automatically delete convolution_op. 1710 std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_convolution_op(convolution_op2, xnn_delete_operator); 1711 1712 std::vector<uint16_t> output2(output.size(), UINT16_C(0x7E00) /* NaN */); 1713 ASSERT_EQ(xnn_status_success, 1714 xnn_setup_convolution2d_nhwc_f16( 1715 convolution_op2, 1716 batch_size(), input_height(), input_width(), 1717 input.data(), output2.data(), 1718 nullptr /* thread pool */)); 1719 1720 ASSERT_EQ(xnn_status_success, 1721 xnn_run_operator(convolution_op2, nullptr /* thread pool */)); 1722 1723 VerifyNHWCxF16(output2, output_ref, output_min, output_max); 1724 VerifyWeightsCache(weights_cache, old_weights_cache_size); 1725 xnn_release_weights_cache(&weights_cache); 1726 } 1727 } 1728 } 1729 VerifyNHWCxF16(const std::vector<uint16_t> & output,const std::vector<float> & output_ref,const float output_min,const float output_max)1730 void VerifyNHWCxF16(const std::vector<uint16_t> &output, 1731 const std::vector<float> &output_ref, 1732 const float output_min, const float output_max) const { 1733 for (size_t i = 0; i < batch_size(); i++) { 1734 for (size_t y = 0; y < output_height(); y++) { 1735 for (size_t x = 0; x < output_width(); x++) { 1736 for (size_t g = 0; g < groups(); g++) { 1737 for (size_t c = 0; c < group_output_channels(); c++) { 1738 ASSERT_GE(fp16_ieee_to_fp32_value(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), output_min) 1739 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c; 1740 ASSERT_LE(fp16_ieee_to_fp32_value(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), output_max) 1741 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c; 1742 ASSERT_NEAR(output_ref[(((i * output_height() + y) * output_width() + x) * groups() + g) * group_output_channels() + c], fp16_ieee_to_fp32_value(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), std::max(1.0e-4f, std::abs(output_ref[(((i * output_height() + y) * output_width() + x) * groups() + g) * group_output_channels() + c]) * 1.0e-2f)) 1743 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c; 1744 } 1745 } 1746 } 1747 } 1748 } 1749 } 1750 TestNCHWxF32()1751 void TestNCHWxF32() { 1752 ASSERT_EQ(weights_type(), WeightsType::Default); 1753 1754 std::random_device random_device; 1755 auto rng = std::mt19937(random_device()); 1756 std::uniform_real_distribution<float> f32dist(0.1f, 1.0f); 1757 std::uniform_real_distribution<float> pdist; 1758 1759 std::vector<float> input(2 * XNN_EXTRA_BYTES / sizeof(float) + 1760 ((batch_size() - 1) * input_channel_stride() + groups() * group_input_channels()) * input_height() * input_width()); 1761 std::vector<float> kernel( 1762 groups() * group_output_channels() * kernel_height() * kernel_width() * group_input_channels()); 1763 std::vector<float> bias(groups() * group_output_channels()); 1764 std::vector<float> output( 1765 ((batch_size() - 1) * output_channel_stride() + groups() * group_output_channels()) * output_height() * output_width()); 1766 std::vector<float> output_ref(batch_size() * groups() * group_output_channels() * output_height() * output_width()); 1767 1768 
for (size_t iteration = 0; iteration < iterations(); iteration++) { 1769 std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); }); 1770 std::generate(kernel.begin(), kernel.end(), [&]() { return f32dist(rng); }); 1771 for (float& k : kernel) { 1772 if (pdist(rng) <= sparsity()) { 1773 k = 0.0f; 1774 } 1775 } 1776 std::generate(bias.begin(), bias.end(), [&]() { return f32dist(rng); }); 1777 std::fill(output.begin(), output.end(), nanf("")); 1778 1779 // Compute reference results, without clamping. 1780 if (has_bias()) { 1781 for (size_t i = 0; i < batch_size(); i++) { 1782 for (size_t oy = 0; oy < output_height(); oy++) { 1783 for (size_t ox = 0; ox < output_width(); ox++) { 1784 for (size_t g = 0; g < groups(); g++) { 1785 for (size_t oc = 0; oc < group_output_channels(); oc++) { 1786 output_ref[(((i * groups() + g) * group_output_channels() + oc) * output_height() + oy) * output_width() + ox] = 1787 bias[g * group_output_channels() + oc]; 1788 } 1789 } 1790 } 1791 } 1792 } 1793 } else { 1794 std::fill(output_ref.begin(), output_ref.end(), 0.0f); 1795 } 1796 if (force_nhwc_input()) { 1797 for (size_t i = 0; i < batch_size(); i++) { 1798 for (size_t oy = 0; oy < output_height(); oy++) { 1799 for (size_t ox = 0; ox < output_width(); ox++) { 1800 for (size_t ky = 0; ky < kernel_height(); ky++) { 1801 const size_t iy = oy * subsampling_height() + ky * dilation_height() - padding_top(); 1802 if (iy < input_height()) { 1803 for (size_t kx = 0; kx < kernel_width(); kx++) { 1804 const size_t ix = ox * subsampling_width() + kx * dilation_width() - padding_left(); 1805 if (ix < input_width()) { 1806 for (size_t g = 0; g < groups(); g++) { 1807 for (size_t oc = 0; oc < group_output_channels(); oc++) { 1808 for (size_t ic = 0; ic < group_input_channels(); ic++) { 1809 output_ref[(((i * groups() + g) * group_output_channels() + oc) * output_height() + oy) * output_width() + ox] += 1810 input[((((i * input_height() + iy) * input_width() + ix) * groups() + g) * group_input_channels() + ic)] * 1811 kernel[(((g * group_output_channels() + oc) * kernel_height() + ky) * kernel_width() + kx) * group_input_channels() + ic]; 1812 } 1813 } 1814 } 1815 } 1816 } 1817 } 1818 } 1819 } 1820 } 1821 } 1822 } else if (depthwise_layout()) { 1823 ASSERT_EQ(group_input_channels(), 1); 1824 1825 for (size_t i = 0; i < batch_size(); i++) { 1826 for (size_t oy = 0; oy < output_height(); oy++) { 1827 for (size_t ox = 0; ox < output_width(); ox++) { 1828 for (size_t ky = 0; ky < kernel_height(); ky++) { 1829 const size_t iy = oy * subsampling_height() + ky * dilation_height() - padding_top(); 1830 if (iy < input_height()) { 1831 for (size_t kx = 0; kx < kernel_width(); kx++) { 1832 const size_t ix = ox * subsampling_width() + kx * dilation_width() - padding_left(); 1833 if (ix < input_width()) { 1834 for (size_t g = 0; g < groups(); g++) { 1835 for (size_t oc = 0; oc < group_output_channels(); oc++) { 1836 output_ref[(((i * groups() + g) * group_output_channels() + oc) * output_height() + oy) * output_width() + ox] += 1837 input[((i * input_channel_stride() + g) * input_height() + iy) * input_width() + ix] * 1838 kernel[((ky * kernel_width() + kx) * groups() + g) * group_output_channels() + oc]; 1839 } 1840 } 1841 } 1842 } 1843 } 1844 } 1845 } 1846 } 1847 } 1848 } else { 1849 for (size_t i = 0; i < batch_size(); i++) { 1850 for (size_t oy = 0; oy < output_height(); oy++) { 1851 for (size_t ox = 0; ox < output_width(); ox++) { 1852 for (size_t ky = 0; ky < kernel_height(); ky++) { 1853 const size_t iy = oy * 
subsampling_height() + ky * dilation_height() - padding_top(); 1854 if (iy < input_height()) { 1855 for (size_t kx = 0; kx < kernel_width(); kx++) { 1856 const size_t ix = ox * subsampling_width() + kx * dilation_width() - padding_left(); 1857 if (ix < input_width()) { 1858 for (size_t g = 0; g < groups(); g++) { 1859 for (size_t oc = 0; oc < group_output_channels(); oc++) { 1860 for (size_t ic = 0; ic < group_input_channels(); ic++) { 1861 output_ref[(((i * groups() + g) * group_output_channels() + oc) * output_height() + oy) * output_width() + ox] += 1862 input[((i * input_channel_stride() + g * group_input_channels() + ic) * input_height() + iy) * input_width() + ix] * 1863 kernel[(((g * group_output_channels() + oc) * kernel_height() + ky) * kernel_width() + kx) * group_input_channels() + ic]; 1864 } 1865 } 1866 } 1867 } 1868 } 1869 } 1870 } 1871 } 1872 } 1873 } 1874 } 1875 1876 // Compute clamping parameters. 1877 const float accumulated_min = *std::min_element(output_ref.cbegin(), output_ref.cend()); 1878 const float accumulated_max = *std::max_element(output_ref.cbegin(), output_ref.cend()); 1879 1880 const float output_min = qmin() == 0 ? -std::numeric_limits<float>::infinity() : 1881 accumulated_min + (accumulated_max - accumulated_min) / 255.0f * float(qmin()); 1882 const float output_max = qmax() == 255 ? std::numeric_limits<float>::infinity() : 1883 accumulated_max - (accumulated_max - accumulated_min) / 255.0f * float(255 - qmax()); 1884 1885 // Clamp reference results. 1886 for (float& value : output_ref) { 1887 value = std::max(std::min(value, output_max), output_min); 1888 } 1889 1890 // Create, setup, run, and destroy Convolution operator. 1891 ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); 1892 xnn_operator_t convolution_op = nullptr; 1893 xnn_caches caches = { 1894 .code_cache = NULL, 1895 .weights_cache = NULL, 1896 }; 1897 xnn_weights_cache weights_cache; 1898 if (use_weights_cache()) { 1899 xnn_init_weights_cache(&weights_cache); 1900 caches.weights_cache = &weights_cache; 1901 } 1902 1903 xnn_status status = xnn_create_convolution2d_nchw_f32( 1904 padding_top(), padding_right(), padding_bottom(), padding_left(), 1905 kernel_height(), kernel_width(), 1906 subsampling_height(), subsampling_width(), 1907 dilation_height(), dilation_width(), 1908 groups(), group_input_channels(), group_output_channels(), 1909 input_channel_stride(), output_channel_stride(), 1910 kernel.data(), has_bias() ? bias.data() : nullptr, 1911 output_min, output_max, 1912 (depthwise_layout() ? XNN_FLAG_DEPTHWISE_CONVOLUTION : 0) | (force_nhwc_input() ? XNN_FLAG_INPUT_NHWC : 0), 1913 &caches, 1914 &convolution_op); 1915 if (status == xnn_status_unsupported_parameter) { 1916 GTEST_SKIP(); 1917 } 1918 ASSERT_EQ(xnn_status_success, status); 1919 ASSERT_NE(nullptr, convolution_op); 1920 if (use_weights_cache()) { 1921 ASSERT_EQ(xnn_status_success, 1922 xnn_finalize_weights_cache(&weights_cache, xnn_weights_cache_finalization_kind_soft)); 1923 } 1924 1925 // Smart pointer to automatically delete convolution_op. 
1926 std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_convolution_op(convolution_op, xnn_delete_operator); 1927 1928 ASSERT_EQ(xnn_status_success, 1929 xnn_setup_convolution2d_nchw_f32( 1930 convolution_op, 1931 batch_size(), input_height(), input_width(), 1932 input.data(), output.data(), 1933 nullptr /* thread pool */)); 1934 1935 ASSERT_EQ(xnn_status_success, 1936 xnn_run_operator(convolution_op, nullptr /* thread pool */)); 1937 1938 VerifyNCHWxF32(output, output_ref, output_min, output_max); 1939 1940 if (use_weights_cache()) { 1941 xnn_operator_t convolution_op2 = nullptr; 1942 size_t old_weights_cache_size = weights_cache.cache.weights.size; 1943 ASSERT_EQ( 1944 xnn_status_success, 1945 xnn_create_convolution2d_nchw_f32( 1946 padding_top(), padding_right(), padding_bottom(), 1947 padding_left(), kernel_height(), kernel_width(), 1948 subsampling_height(), subsampling_width(), dilation_height(), 1949 dilation_width(), groups(), group_input_channels(), 1950 group_output_channels(), input_channel_stride(), 1951 output_channel_stride(), kernel.data(), 1952 has_bias() ? bias.data() : nullptr, output_min, output_max, 1953 (depthwise_layout() ? XNN_FLAG_DEPTHWISE_CONVOLUTION : 0) | 1954 (force_nhwc_input() ? XNN_FLAG_INPUT_NHWC : 0), 1955 &caches, &convolution_op2)); 1956 ASSERT_NE(nullptr, convolution_op2); 1957 1958 // Smart pointer to automatically delete convolution_op2. 1959 std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_convolution_op(convolution_op2, xnn_delete_operator); 1960 std::vector<float> output2(output.size(), nanf("")); 1961 1962 ASSERT_EQ(xnn_status_success, 1963 xnn_setup_convolution2d_nchw_f32( 1964 convolution_op2, 1965 batch_size(), input_height(), input_width(), 1966 input.data(), output2.data(), 1967 nullptr /* thread pool */)); 1968 1969 ASSERT_EQ(xnn_status_success, 1970 xnn_run_operator(convolution_op2, nullptr /* thread pool */)); 1971 1972 VerifyNCHWxF32(output2, output_ref, output_min, output_max); 1973 if (IsSpmm()) { 1974 VerifyWeightsCacheUnused(weights_cache); 1975 } else { 1976 VerifyWeightsCache(weights_cache, old_weights_cache_size); 1977 } 1978 xnn_release_weights_cache(&weights_cache); 1979 } 1980 } 1981 } 1982 VerifyNCHWxF32(const std::vector<float> & output,const std::vector<float> & output_ref,const float output_min,const float output_max)1983 void VerifyNCHWxF32(const std::vector<float> &output, 1984 const std::vector<float> &output_ref, 1985 const float output_min, const float output_max) const { 1986 for (size_t i = 0; i < batch_size(); i++) { 1987 for (size_t y = 0; y < output_height(); y++) { 1988 for (size_t x = 0; x < output_width(); x++) { 1989 for (size_t g = 0; g < groups(); g++) { 1990 for (size_t c = 0; c < group_output_channels(); c++) { 1991 ASSERT_GE(output[((i * output_channel_stride() + g * group_output_channels() + c) * output_height() + y) * output_width() + x], output_min) 1992 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c << ", image = " << i; 1993 ASSERT_LE(output[((i * output_channel_stride() + g * group_output_channels() + c) * output_height() + y) * output_width() + x], output_max) 1994 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c << ", image = " << i; 1995 ASSERT_NEAR( 1996 output_ref[(((i * groups() + g) * group_output_channels() + c) * output_height() + y) * output_width() + x], 1997 output[((i * output_channel_stride() + g * group_output_channels() + c) * output_height() + y) * output_width() + x], 1998 1.0e-4 * 
std::abs(output_ref[(((i * groups() + g) * group_output_channels() + c) * output_height() + y) * output_width() + x])) 1999 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c << ", image = " << i; 2000 } 2001 } 2002 } 2003 } 2004 } 2005 } 2006 TestSetupNHWCxQC8()2007 void TestSetupNHWCxQC8() const { 2008 ASSERT_EQ(weights_type(), WeightsType::Default); 2009 2010 ASSERT_FALSE(depthwise_layout()); 2011 2012 std::random_device random_device; 2013 auto rng = std::mt19937(random_device()); 2014 std::uniform_int_distribution<int32_t> i32dist(-10000, 10000); 2015 std::uniform_int_distribution<int32_t> i8dist( 2016 std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max()); 2017 std::uniform_int_distribution<int32_t> w8dist( 2018 -std::numeric_limits<int8_t>::max(), std::numeric_limits<int8_t>::max()); 2019 2020 std::vector<int8_t> input(XNN_EXTRA_BYTES / sizeof(int8_t) + std::max( 2021 batch_size() * ((input_height() * input_width() - 1) * input_channel_stride() + groups() * group_input_channels()), 2022 next_batch_size() * ((next_input_height() * next_input_width() - 1) * input_channel_stride() + groups() * group_input_channels()))); 2023 std::vector<int8_t> kernel(groups() * group_output_channels() * kernel_height() * kernel_width() * group_input_channels()); 2024 std::vector<int32_t> bias(groups() * group_output_channels()); 2025 std::vector<int8_t> output(std::max( 2026 batch_size() * ((output_height() * output_width() - 1) * output_channel_stride() + groups() * group_output_channels()), 2027 next_batch_size() * ((next_output_height() * next_output_width() - 1) * output_channel_stride() + groups() * group_output_channels()))); 2028 std::vector<int32_t> accumulators(batch_size() * output_height() * output_width() * groups() * group_output_channels()); 2029 std::vector<double> output_ref(batch_size() * output_height() * output_width() * groups() * group_output_channels()); 2030 std::vector<float> requantization_scales(groups() * group_output_channels()); 2031 std::vector<int32_t> next_accumulators(next_batch_size() * next_output_height() * next_output_width() * groups() * group_output_channels()); 2032 std::vector<double> next_output_ref(next_batch_size() * next_output_height() * next_output_width() * groups() * group_output_channels()); 2033 std::vector<float> next_requantization_scales(groups() * group_output_channels()); 2034 2035 const int8_t input_zero_point = -1; 2036 const int8_t output_zero_point = -1; 2037 2038 for (size_t iteration = 0; iteration < iterations(); iteration++) { 2039 std::generate(input.begin(), input.end(), [&]() { return i8dist(rng); }); 2040 std::generate(kernel.begin(), kernel.end(), [&]() { return w8dist(rng); }); 2041 std::generate(bias.begin(), bias.end(), [&]() { return i32dist(rng); }); 2042 std::fill(output.begin(), output.end(), INT8_C(0xA5)); 2043 2044 // Compute reference results, without renormalization. 
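      // Reference accumulation is done in int32, mirroring the operator: input_zero_point
      // is subtracted from every input byte, while the per-channel (QC8) weights are
      // symmetric and therefore enter the sum without a zero-point adjustment.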
2045 if (has_bias()) { 2046 for (size_t i = 0; i < batch_size(); i++) { 2047 for (size_t oy = 0; oy < output_height(); oy++) { 2048 for (size_t ox = 0; ox < output_width(); ox++) { 2049 for (size_t g = 0; g < groups(); g++) { 2050 for (size_t oc = 0; oc < group_output_channels(); oc++) { 2051 accumulators[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] = 2052 bias[g * group_output_channels() + oc]; 2053 } 2054 } 2055 } 2056 } 2057 } 2058 } else { 2059 std::fill(accumulators.begin(), accumulators.end(), 0); 2060 } 2061 for (size_t i = 0; i < batch_size(); i++) { 2062 for (size_t oy = 0; oy < output_height(); oy++) { 2063 for (size_t ox = 0; ox < output_width(); ox++) { 2064 for (size_t ky = 0; ky < kernel_height(); ky++) { 2065 const size_t iy = oy * subsampling_height() + ky * dilation_height() - padding_top(); 2066 if (iy < input_height()) { 2067 for (size_t kx = 0; kx < kernel_width(); kx++) { 2068 const size_t ix = ox * subsampling_width() + kx * dilation_width() - padding_left(); 2069 if (ix < input_width()) { 2070 for (size_t g = 0; g < groups(); g++) { 2071 for (size_t oc = 0; oc < group_output_channels(); oc++) { 2072 for (size_t ic = 0; ic < group_input_channels(); ic++) { 2073 accumulators[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] += 2074 (int32_t(input[((i * input_height() + iy) * input_width() + ix) * input_channel_stride() + g * group_input_channels() + ic]) - int32_t(input_zero_point)) * 2075 int32_t(kernel[(((g * group_output_channels() + oc) * kernel_height() + ky) * kernel_width() + kx) * group_input_channels() + ic]); 2076 } 2077 } 2078 } 2079 } 2080 } 2081 } 2082 } 2083 } 2084 } 2085 } 2086 2087 // Compute renormalization parameters. 2088 for (size_t c = 0; c < groups() * group_output_channels(); c++) { 2089 int32_t accumulated_min = accumulators[c]; 2090 int32_t accumulated_max = accumulators[c]; 2091 for (size_t px = 0; px < batch_size() * output_height() * output_width(); px++) { 2092 accumulated_min = std::min(accumulated_min, accumulators[px * groups() * group_output_channels() + c]); 2093 accumulated_max = std::max(accumulated_max, accumulators[px * groups() * group_output_channels() + c]); 2094 } 2095 2096 float requantization_scale = 0x1.0p-32f; 2097 if (accumulated_max != 0) { 2098 requantization_scale = std::max(requantization_scale, 2099 float(int32_t(std::numeric_limits<int8_t>::max()) - int32_t(output_zero_point)) / float(accumulated_max)); 2100 } 2101 if (accumulated_min != 0) { 2102 requantization_scale = std::max(requantization_scale, 2103 float(int32_t(std::numeric_limits<int8_t>::min()) - int32_t(output_zero_point)) / float(accumulated_min)); 2104 } 2105 requantization_scale = std::min(requantization_scale, 0x1.FFFFFEp-1f); 2106 2107 requantization_scales[c] = requantization_scale; 2108 } 2109 2110 // Renormalize reference results. 
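      // Per-channel requantization of the reference results:
      //   output_ref = output_zero_point + accumulator * requantization_scales[c],
      // clamped to [qmin() - 0x80, qmax() - 0x80] just below. The scale computed above is
      // (up to the 0x1.FFFFFEp-1f cap) the smallest one at which the channel's accumulator
      // extremes reach the int8 boundaries, so output saturation is actually exercised.
      // E.g., with an illustrative accumulated_max of 1000 and the fixed output_zero_point
      // of -1, the scale is at least (127 - (-1)) / 1000 = 0.128, which maps the maximum
      // accumulator exactly onto 127.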
2111 for (size_t c = 0; c < groups() * group_output_channels(); c++) { 2112 for (size_t px = 0; px < batch_size() * output_height() * output_width(); px++) { 2113 output_ref[px * groups() * group_output_channels() + c] = double(int32_t(output_zero_point)) + 2114 double(accumulators[px * groups() * group_output_channels() + c]) * double(requantization_scales[c]); 2115 } 2116 } 2117 std::transform(output_ref.cbegin(), output_ref.cend(), output_ref.begin(), 2118 [this](double x) -> double { 2119 return std::max<double>(std::min<double>(x, double(qmax() - 0x80)), double(qmin() - 0x80)); 2120 }); 2121 2122 // Create, setup, and run Convolution operator once. 2123 ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); 2124 xnn_operator_t convolution_op = nullptr; 2125 2126 xnn_status status = xnn_create_convolution2d_nhwc_qc8( 2127 padding_top(), padding_right(), padding_bottom(), padding_left(), 2128 kernel_height(), kernel_width(), 2129 subsampling_height(), subsampling_width(), 2130 dilation_height(), dilation_width(), 2131 groups(), group_input_channels(), group_output_channels(), 2132 input_channel_stride(), output_channel_stride(), 2133 input_zero_point, 1.0f /* input scale */, requantization_scales.data(), 2134 kernel.data(), has_bias() ? bias.data() : nullptr, 2135 output_zero_point, 1.0f /* output scale */, int8_t(qmin() - 0x80), int8_t(qmax() - 0x80), 2136 0, NULL, &convolution_op); 2137 if (status == xnn_status_unsupported_hardware) { 2138 GTEST_SKIP(); 2139 } 2140 ASSERT_EQ(xnn_status_success, status); 2141 ASSERT_NE(nullptr, convolution_op); 2142 2143 // Smart pointer to automatically delete convolution_op. 2144 std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_convolution_op(convolution_op, xnn_delete_operator); 2145 2146 ASSERT_EQ(xnn_status_success, 2147 xnn_setup_convolution2d_nhwc_qc8( 2148 convolution_op, 2149 batch_size(), input_height(), input_width(), 2150 input.data(), output.data(), 2151 nullptr /* thread pool */)); 2152 2153 ASSERT_EQ(xnn_status_success, 2154 xnn_run_operator(convolution_op, nullptr /* thread pool */)); 2155 2156 // Verify results of the first run. 2157 for (size_t i = 0; i < batch_size(); i++) { 2158 for (size_t y = 0; y < output_height(); y++) { 2159 for (size_t x = 0; x < output_width(); x++) { 2160 for (size_t g = 0; g < groups(); g++) { 2161 for (size_t c = 0; c < group_output_channels(); c++) { 2162 ASSERT_LE(int32_t(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), int32_t(qmax() - 0x80)) 2163 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c; 2164 ASSERT_GE(int32_t(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), int32_t(qmin() - 0x80)) 2165 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c; 2166 ASSERT_NEAR( 2167 output_ref[(((i * output_height() + y) * output_width() + x) * groups() + g) * group_output_channels() + c], 2168 double(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), 2169 0.9) 2170 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c; 2171 } 2172 } 2173 } 2174 } 2175 } 2176 2177 // Re-generate data for the second run. 
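      // Only the input is regenerated; kernel and bias (and thus the weights already
      // packed inside convolution_op) are reused. The point of this TestSetup path is
      // that a second xnn_setup_convolution2d_nhwc_qc8() call can re-shape the existing
      // operator to the next_batch_size() x next_input_height() x next_input_width()
      // problem without re-creating it.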
2178 std::generate(input.begin(), input.end(), [&]() { return i8dist(rng); }); 2179 std::fill(output.begin(), output.end(), INT8_C(0xA5)); 2180 2181 // Compute reference results for the second run, including renormalization. 2182 if (has_bias()) { 2183 for (size_t i = 0; i < next_batch_size(); i++) { 2184 for (size_t oy = 0; oy < next_output_height(); oy++) { 2185 for (size_t ox = 0; ox < next_output_width(); ox++) { 2186 for (size_t g = 0; g < groups(); g++) { 2187 for (size_t oc = 0; oc < group_output_channels(); oc++) { 2188 next_accumulators[(((i * next_output_height() + oy) * next_output_width() + ox) * groups() + g) * group_output_channels() + oc] = 2189 bias[g * group_output_channels() + oc]; 2190 } 2191 } 2192 } 2193 } 2194 } 2195 } else { 2196 std::fill(next_accumulators.begin(), next_accumulators.end(), 0); 2197 } 2198 for (size_t i = 0; i < next_batch_size(); i++) { 2199 for (size_t oy = 0; oy < next_output_height(); oy++) { 2200 for (size_t ox = 0; ox < next_output_width(); ox++) { 2201 for (size_t ky = 0; ky < kernel_height(); ky++) { 2202 const size_t iy = oy * subsampling_height() + ky * dilation_height() - padding_top(); 2203 if (iy < next_input_height()) { 2204 for (size_t kx = 0; kx < kernel_width(); kx++) { 2205 const size_t ix = ox * subsampling_width() + kx * dilation_width() - padding_left(); 2206 if (ix < next_input_width()) { 2207 for (size_t g = 0; g < groups(); g++) { 2208 for (size_t oc = 0; oc < group_output_channels(); oc++) { 2209 for (size_t ic = 0; ic < group_input_channels(); ic++) { 2210 next_accumulators[(((i * next_output_height() + oy) * next_output_width() + ox) * groups() + g) * group_output_channels() + oc] += 2211 (int32_t(input[((i * next_input_height() + iy) * next_input_width() + ix) * input_channel_stride() + g * group_input_channels() + ic]) - int32_t(input_zero_point)) * 2212 int32_t(kernel[(((g * group_output_channels() + oc) * kernel_height() + ky) * kernel_width() + kx) * group_input_channels() + ic]); 2213 } 2214 } 2215 } 2216 } 2217 } 2218 } 2219 } 2220 } 2221 } 2222 } 2223 for (size_t c = 0; c < groups() * group_output_channels(); c++) { 2224 for (size_t px = 0; px < next_batch_size() * next_output_height() * next_output_width(); px++) { 2225 next_output_ref[px * groups() * group_output_channels() + c] = double(int32_t(output_zero_point)) + 2226 double(next_accumulators[px * groups() * group_output_channels() + c]) * double(requantization_scales[c]); 2227 } 2228 } 2229 std::transform(next_output_ref.cbegin(), next_output_ref.cend(), next_output_ref.begin(), 2230 [this](double x) -> double { 2231 return std::max<double>(std::min<double>(x, double(qmax() - 0x80)), double(qmin() - 0x80)); 2232 }); 2233 2234 // Setup and run Convolution operator the second time, and destroy the operator. 2235 ASSERT_EQ(xnn_status_success, 2236 xnn_setup_convolution2d_nhwc_qc8( 2237 convolution_op, 2238 next_batch_size(), next_input_height(), next_input_width(), 2239 input.data(), output.data(), 2240 nullptr /* thread pool */)); 2241 2242 ASSERT_EQ(xnn_status_success, 2243 xnn_run_operator(convolution_op, nullptr /* thread pool */)); 2244 2245 // Verify results of the second run. 
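      // The 0.9 tolerance below is expressed in quantized output units, i.e. the
      // requantized reference and the operator's int8 output may differ by less than one
      // quantization step.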
2246 for (size_t i = 0; i < next_batch_size(); i++) { 2247 for (size_t y = 0; y < next_output_height(); y++) { 2248 for (size_t x = 0; x < next_output_width(); x++) { 2249 for (size_t g = 0; g < groups(); g++) { 2250 for (size_t c = 0; c < group_output_channels(); c++) { 2251 ASSERT_LE(int32_t(output[((i * next_output_height() + y) * next_output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), int32_t(qmax() - 0x80)) 2252 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c; 2253 ASSERT_GE(int32_t(output[((i * next_output_height() + y) * next_output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), int32_t(qmin() - 0x80)) 2254 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c; 2255 ASSERT_NEAR( 2256 next_output_ref[(((i * next_output_height() + y) * next_output_width() + x) * groups() + g) * group_output_channels() + c], 2257 double(output[((i * next_output_height() + y) * next_output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), 2258 0.9) 2259 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c; 2260 } 2261 } 2262 } 2263 } 2264 } 2265 } 2266 } 2267 TestSetupNHWCxQS8()2268 void TestSetupNHWCxQS8() const { 2269 ASSERT_EQ(weights_type(), WeightsType::Default); 2270 2271 ASSERT_FALSE(depthwise_layout()); 2272 2273 std::random_device random_device; 2274 auto rng = std::mt19937(random_device()); 2275 std::uniform_int_distribution<int32_t> i32dist(-10000, 10000); 2276 std::uniform_int_distribution<int32_t> i8dist( 2277 std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max()); 2278 std::uniform_int_distribution<int32_t> w8dist( 2279 -std::numeric_limits<int8_t>::max(), std::numeric_limits<int8_t>::max()); 2280 2281 std::vector<int8_t> input(XNN_EXTRA_BYTES / sizeof(int8_t) + std::max( 2282 batch_size() * ((input_height() * input_width() - 1) * input_channel_stride() + groups() * group_input_channels()), 2283 next_batch_size() * ((next_input_height() * next_input_width() - 1) * input_channel_stride() + groups() * group_input_channels()))); 2284 std::vector<int8_t> kernel(groups() * group_output_channels() * kernel_height() * kernel_width() * group_input_channels()); 2285 std::vector<int32_t> bias(groups() * group_output_channels()); 2286 std::vector<int8_t> output(std::max( 2287 batch_size() * ((output_height() * output_width() - 1) * output_channel_stride() + groups() * group_output_channels()), 2288 next_batch_size() * ((next_output_height() * next_output_width() - 1) * output_channel_stride() + groups() * group_output_channels()))); 2289 std::vector<int32_t> accumulators(batch_size() * output_height() * output_width() * groups() * group_output_channels()); 2290 std::vector<double> output_ref(batch_size() * output_height() * output_width() * groups() * group_output_channels()); 2291 std::vector<int32_t> next_accumulators(next_batch_size() * next_output_height() * next_output_width() * groups() * group_output_channels()); 2292 std::vector<double> next_output_ref(next_batch_size() * next_output_height() * next_output_width() * groups() * group_output_channels()); 2293 2294 const int8_t input_zero_point = -1; 2295 2296 for (size_t iteration = 0; iteration < iterations(); iteration++) { 2297 std::generate(input.begin(), input.end(), [&]() { return i8dist(rng); }); 2298 std::generate(kernel.begin(), kernel.end(), [&]() { return w8dist(rng); }); 2299 std::generate(bias.begin(), bias.end(), [&]() { return i32dist(rng); }); 
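      // 0xA5 is a canary pattern: any output element the operator fails to write keeps
      // this value and shows up as a mismatch during verification.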
2300 std::fill(output.begin(), output.end(), INT8_C(0xA5)); 2301 2302 // Compute reference results, without renormalization. 2303 if (has_bias()) { 2304 for (size_t i = 0; i < batch_size(); i++) { 2305 for (size_t oy = 0; oy < output_height(); oy++) { 2306 for (size_t ox = 0; ox < output_width(); ox++) { 2307 for (size_t g = 0; g < groups(); g++) { 2308 for (size_t oc = 0; oc < group_output_channels(); oc++) { 2309 accumulators[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] = 2310 bias[g * group_output_channels() + oc]; 2311 } 2312 } 2313 } 2314 } 2315 } 2316 } else { 2317 std::fill(accumulators.begin(), accumulators.end(), 0); 2318 } 2319 for (size_t i = 0; i < batch_size(); i++) { 2320 for (size_t oy = 0; oy < output_height(); oy++) { 2321 for (size_t ox = 0; ox < output_width(); ox++) { 2322 for (size_t ky = 0; ky < kernel_height(); ky++) { 2323 const size_t iy = oy * subsampling_height() + ky * dilation_height() - padding_top(); 2324 if (iy < input_height()) { 2325 for (size_t kx = 0; kx < kernel_width(); kx++) { 2326 const size_t ix = ox * subsampling_width() + kx * dilation_width() - padding_left(); 2327 if (ix < input_width()) { 2328 for (size_t g = 0; g < groups(); g++) { 2329 for (size_t oc = 0; oc < group_output_channels(); oc++) { 2330 for (size_t ic = 0; ic < group_input_channels(); ic++) { 2331 accumulators[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] += 2332 (int32_t(input[((i * input_height() + iy) * input_width() + ix) * input_channel_stride() + g * group_input_channels() + ic]) - int32_t(input_zero_point)) * 2333 int32_t(kernel[(((g * group_output_channels() + oc) * kernel_height() + ky) * kernel_width() + kx) * group_input_channels() + ic]); 2334 } 2335 } 2336 } 2337 } 2338 } 2339 } 2340 } 2341 } 2342 } 2343 } 2344 2345 // Compute renormalization parameters. 2346 const int32_t accumulated_min = *std::min_element(accumulators.cbegin(), accumulators.cend()); 2347 const int32_t accumulated_max = *std::max_element(accumulators.cbegin(), accumulators.cend()); 2348 2349 const double output_scale = double(uint32_t(accumulated_max - accumulated_min)) / 255.0; 2350 const int8_t output_zero_point = int8_t(std::max(std::min( 2351 lrint(-0.5 - 0.5 * double(accumulated_min + accumulated_max) / output_scale), 2352 long(std::numeric_limits<int8_t>::max())), long(std::numeric_limits<int8_t>::min()))); 2353 2354 // Renormalize reference results. 2355 std::transform(accumulators.cbegin(), accumulators.cend(), output_ref.begin(), 2356 [this, output_scale, output_zero_point](int32_t x) -> double { 2357 return std::max<double>(std::min<double>(double(x) / output_scale, double(qmax() - 0x80) - output_zero_point), double(qmin() - 0x80) - output_zero_point); 2358 }); 2359 2360 // Create, setup, and run Convolution operator once. 2361 ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); 2362 xnn_operator_t convolution_op = nullptr; 2363 2364 xnn_status status = xnn_create_convolution2d_nhwc_qs8( 2365 padding_top(), padding_right(), padding_bottom(), padding_left(), 2366 kernel_height(), kernel_width(), 2367 subsampling_height(), subsampling_width(), 2368 dilation_height(), dilation_width(), 2369 groups(), group_input_channels(), group_output_channels(), 2370 input_channel_stride(), output_channel_stride(), 2371 input_zero_point, 1.0f /* input scale */, 1.0f /* kernel scale */, 2372 kernel.data(), has_bias() ? 
bias.data() : nullptr, 2373 output_zero_point, output_scale, int8_t(qmin() - 0x80), int8_t(qmax() - 0x80), 2374 0, NULL, &convolution_op); 2375 if (status == xnn_status_unsupported_hardware) { 2376 GTEST_SKIP(); 2377 } 2378 ASSERT_EQ(xnn_status_success, status); 2379 ASSERT_NE(nullptr, convolution_op); 2380 2381 // Smart pointer to automatically delete convolution_op. 2382 std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_convolution_op(convolution_op, xnn_delete_operator); 2383 2384 ASSERT_EQ(xnn_status_success, 2385 xnn_setup_convolution2d_nhwc_qs8( 2386 convolution_op, 2387 batch_size(), input_height(), input_width(), 2388 input.data(), output.data(), 2389 nullptr /* thread pool */)); 2390 2391 ASSERT_EQ(xnn_status_success, 2392 xnn_run_operator(convolution_op, nullptr /* thread pool */)); 2393 2394 // Verify results of the first run. 2395 for (size_t i = 0; i < batch_size(); i++) { 2396 for (size_t y = 0; y < output_height(); y++) { 2397 for (size_t x = 0; x < output_width(); x++) { 2398 for (size_t g = 0; g < groups(); g++) { 2399 for (size_t c = 0; c < group_output_channels(); c++) { 2400 ASSERT_LE(int32_t(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), int32_t(qmax() - 0x80)) 2401 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c; 2402 ASSERT_GE(int32_t(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), int32_t(qmin() - 0x80)) 2403 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c; 2404 ASSERT_NEAR( 2405 output_ref[(((i * output_height() + y) * output_width() + x) * groups() + g) * group_output_channels() + c], 2406 double(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c]) - double(output_zero_point), 2407 0.9) 2408 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c; 2409 } 2410 } 2411 } 2412 } 2413 } 2414 2415 // Re-generate data for the second run. 2416 std::generate(input.begin(), input.end(), [&]() { return i8dist(rng); }); 2417 std::fill(output.begin(), output.end(), INT8_C(0xA5)); 2418 2419 // Compute reference results for the second run, including renormalization. 
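      // output_scale and output_zero_point are deliberately carried over from the first
      // run: they were baked into the operator at creation time, so the second-run
      // reference must be renormalized with the same parameters even though the
      // accumulator statistics of the regenerated input differ.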
2420 if (has_bias()) { 2421 for (size_t i = 0; i < next_batch_size(); i++) { 2422 for (size_t oy = 0; oy < next_output_height(); oy++) { 2423 for (size_t ox = 0; ox < next_output_width(); ox++) { 2424 for (size_t g = 0; g < groups(); g++) { 2425 for (size_t oc = 0; oc < group_output_channels(); oc++) { 2426 next_accumulators[(((i * next_output_height() + oy) * next_output_width() + ox) * groups() + g) * group_output_channels() + oc] = 2427 bias[g * group_output_channels() + oc]; 2428 } 2429 } 2430 } 2431 } 2432 } 2433 } else { 2434 std::fill(next_accumulators.begin(), next_accumulators.end(), 0); 2435 } 2436 for (size_t i = 0; i < next_batch_size(); i++) { 2437 for (size_t oy = 0; oy < next_output_height(); oy++) { 2438 for (size_t ox = 0; ox < next_output_width(); ox++) { 2439 for (size_t ky = 0; ky < kernel_height(); ky++) { 2440 const size_t iy = oy * subsampling_height() + ky * dilation_height() - padding_top(); 2441 if (iy < next_input_height()) { 2442 for (size_t kx = 0; kx < kernel_width(); kx++) { 2443 const size_t ix = ox * subsampling_width() + kx * dilation_width() - padding_left(); 2444 if (ix < next_input_width()) { 2445 for (size_t g = 0; g < groups(); g++) { 2446 for (size_t oc = 0; oc < group_output_channels(); oc++) { 2447 for (size_t ic = 0; ic < group_input_channels(); ic++) { 2448 next_accumulators[(((i * next_output_height() + oy) * next_output_width() + ox) * groups() + g) * group_output_channels() + oc] += 2449 (int32_t(input[((i * next_input_height() + iy) * next_input_width() + ix) * input_channel_stride() + g * group_input_channels() + ic]) - int32_t(input_zero_point)) * 2450 int32_t(kernel[(((g * group_output_channels() + oc) * kernel_height() + ky) * kernel_width() + kx) * group_input_channels() + ic]); 2451 } 2452 } 2453 } 2454 } 2455 } 2456 } 2457 } 2458 } 2459 } 2460 } 2461 std::transform(next_accumulators.cbegin(), next_accumulators.cend(), next_output_ref.begin(), 2462 [this, output_scale, output_zero_point](int32_t x) -> double { 2463 return std::max<double>(std::min<double>(double(x) / output_scale, double(qmax() - 0x80) - output_zero_point), double(qmin() - 0x80) - output_zero_point); 2464 }); 2465 2466 // Setup and run Convolution operator the second time, and destroy the operator. 2467 ASSERT_EQ(xnn_status_success, 2468 xnn_setup_convolution2d_nhwc_qs8( 2469 convolution_op, 2470 next_batch_size(), next_input_height(), next_input_width(), 2471 input.data(), output.data(), 2472 nullptr /* thread pool */)); 2473 2474 ASSERT_EQ(xnn_status_success, 2475 xnn_run_operator(convolution_op, nullptr /* thread pool */)); 2476 2477 // Verify results of the second run. 
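      // output_ref holds values relative to the output zero point (accumulator divided by
      // output_scale, then clamped), so output_zero_point is subtracted from the
      // operator's output before the comparison below.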
2478 for (size_t i = 0; i < next_batch_size(); i++) { 2479 for (size_t y = 0; y < next_output_height(); y++) { 2480 for (size_t x = 0; x < next_output_width(); x++) { 2481 for (size_t g = 0; g < groups(); g++) { 2482 for (size_t c = 0; c < group_output_channels(); c++) { 2483 ASSERT_LE(int32_t(output[((i * next_output_height() + y) * next_output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), int32_t(qmax() - 0x80)) 2484 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c; 2485 ASSERT_GE(int32_t(output[((i * next_output_height() + y) * next_output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), int32_t(qmin() - 0x80)) 2486 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c; 2487 ASSERT_NEAR( 2488 next_output_ref[(((i * next_output_height() + y) * next_output_width() + x) * groups() + g) * group_output_channels() + c], 2489 double(output[((i * next_output_height() + y) * next_output_width() + x) * output_channel_stride() + g * group_output_channels() + c]) - double(output_zero_point), 2490 0.9) 2491 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c; 2492 } 2493 } 2494 } 2495 } 2496 } 2497 } 2498 } 2499 TestSetupNHWCxQU8()2500 void TestSetupNHWCxQU8() const { 2501 ASSERT_EQ(weights_type(), WeightsType::Default); 2502 2503 ASSERT_FALSE(depthwise_layout()); 2504 2505 std::random_device random_device; 2506 auto rng = std::mt19937(random_device()); 2507 std::uniform_int_distribution<int32_t> i32dist(-10000, 10000); 2508 std::uniform_int_distribution<int32_t> u8dist( 2509 std::numeric_limits<uint8_t>::min(), std::numeric_limits<uint8_t>::max()); 2510 2511 std::vector<uint8_t> input(XNN_EXTRA_BYTES / sizeof(uint8_t) + std::max( 2512 batch_size() * ((input_height() * input_width() - 1) * input_channel_stride() + groups() * group_input_channels()), 2513 next_batch_size() * ((next_input_height() * next_input_width() - 1) * input_channel_stride() + groups() * group_input_channels()))); 2514 std::vector<uint8_t> kernel(groups() * group_output_channels() * kernel_height() * kernel_width() * group_input_channels()); 2515 std::vector<int32_t> bias(groups() * group_output_channels()); 2516 std::vector<uint8_t> output(std::max( 2517 batch_size() * ((output_height() * output_width() - 1) * output_channel_stride() + groups() * group_output_channels()), 2518 next_batch_size() * ((next_output_height() * next_output_width() - 1) * output_channel_stride() + groups() * group_output_channels()))); 2519 std::vector<int32_t> accumulators(batch_size() * output_height() * output_width() * groups() * group_output_channels()); 2520 std::vector<double> output_ref(batch_size() * output_height() * output_width() * groups() * group_output_channels()); 2521 std::vector<int32_t> next_accumulators(next_batch_size() * next_output_height() * next_output_width() * groups() * group_output_channels()); 2522 std::vector<double> next_output_ref(next_batch_size() * next_output_height() * next_output_width() * groups() * group_output_channels()); 2523 2524 const uint8_t input_zero_point = 127; 2525 const uint8_t kernel_zero_point = 127; 2526 2527 for (size_t iteration = 0; iteration < iterations(); iteration++) { 2528 std::generate(input.begin(), input.end(), [&]() { return u8dist(rng); }); 2529 std::generate(kernel.begin(), kernel.end(), [&]() { return u8dist(rng); }); 2530 std::generate(bias.begin(), bias.end(), [&]() { return i32dist(rng); }); 2531 std::fill(output.begin(), output.end(), 
UINT8_C(0xA5)); 2532 2533 // Compute reference results, without renormalization. 2534 if (has_bias()) { 2535 for (size_t i = 0; i < batch_size(); i++) { 2536 for (size_t oy = 0; oy < output_height(); oy++) { 2537 for (size_t ox = 0; ox < output_width(); ox++) { 2538 for (size_t g = 0; g < groups(); g++) { 2539 for (size_t oc = 0; oc < group_output_channels(); oc++) { 2540 accumulators[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] = 2541 bias[g * group_output_channels() + oc]; 2542 } 2543 } 2544 } 2545 } 2546 } 2547 } else { 2548 std::fill(accumulators.begin(), accumulators.end(), 0); 2549 } 2550 for (size_t i = 0; i < batch_size(); i++) { 2551 for (size_t oy = 0; oy < output_height(); oy++) { 2552 for (size_t ox = 0; ox < output_width(); ox++) { 2553 for (size_t ky = 0; ky < kernel_height(); ky++) { 2554 const size_t iy = oy * subsampling_height() + ky * dilation_height() - padding_top(); 2555 if (iy < input_height()) { 2556 for (size_t kx = 0; kx < kernel_width(); kx++) { 2557 const size_t ix = ox * subsampling_width() + kx * dilation_width() - padding_left(); 2558 if (ix < input_width()) { 2559 for (size_t g = 0; g < groups(); g++) { 2560 for (size_t oc = 0; oc < group_output_channels(); oc++) { 2561 for (size_t ic = 0; ic < group_input_channels(); ic++) { 2562 accumulators[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] += 2563 (int32_t(input[((i * input_height() + iy) * input_width() + ix) * input_channel_stride() + g * group_input_channels() + ic]) - int32_t(input_zero_point)) * 2564 (int32_t(kernel[(((g * group_output_channels() + oc) * kernel_height() + ky) * kernel_width() + kx) * group_input_channels() + ic]) - int32_t(kernel_zero_point)); 2565 } 2566 } 2567 } 2568 } 2569 } 2570 } 2571 } 2572 } 2573 } 2574 } 2575 2576 // Compute renormalization parameters. 2577 const int32_t accumulated_min = *std::min_element(accumulators.cbegin(), accumulators.cend()); 2578 const int32_t accumulated_max = *std::max_element(accumulators.cbegin(), accumulators.cend()); 2579 2580 const double output_scale = double(uint32_t(accumulated_max - accumulated_min)) / 255.0; 2581 const uint8_t output_zero_point = uint8_t(std::max(std::min( 2582 lrint(127.5 - 0.5 * double(accumulated_min + accumulated_max) / output_scale), 2583 long(std::numeric_limits<uint8_t>::max())), long(std::numeric_limits<uint8_t>::min()))); 2584 2585 // Renormalize reference results. 2586 std::transform(accumulators.cbegin(), accumulators.cend(), output_ref.begin(), 2587 [this, output_scale, output_zero_point](int32_t x) -> double { 2588 return std::max<double>(std::min<double>(double(x) / output_scale, double(qmax()) - output_zero_point), double(qmin()) - output_zero_point); 2589 }); 2590 2591 // Create, setup, and run Convolution operator once. 2592 ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); 2593 xnn_operator_t convolution_op = nullptr; 2594 2595 xnn_status status = xnn_create_convolution2d_nhwc_qu8( 2596 padding_top(), padding_right(), padding_bottom(), padding_left(), 2597 kernel_height(), kernel_width(), 2598 subsampling_height(), subsampling_width(), 2599 dilation_height(), dilation_width(), 2600 groups(), group_input_channels(), group_output_channels(), 2601 input_channel_stride(), output_channel_stride(), 2602 input_zero_point, 1.0f /* input scale */, 2603 kernel_zero_point, 1.0f /* kernel scale */, 2604 kernel.data(), has_bias() ? 
bias.data() : nullptr, 2605 output_zero_point, output_scale, qmin(), qmax(), 2606 0, NULL, &convolution_op); 2607 if (status == xnn_status_unsupported_hardware) { 2608 GTEST_SKIP(); 2609 } 2610 ASSERT_EQ(xnn_status_success, status); 2611 ASSERT_NE(nullptr, convolution_op); 2612 2613 // Smart pointer to automatically delete convolution_op. 2614 std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_convolution_op(convolution_op, xnn_delete_operator); 2615 2616 ASSERT_EQ(xnn_status_success, 2617 xnn_setup_convolution2d_nhwc_qu8( 2618 convolution_op, 2619 batch_size(), input_height(), input_width(), 2620 input.data(), output.data(), 2621 nullptr /* thread pool */)); 2622 2623 ASSERT_EQ(xnn_status_success, 2624 xnn_run_operator(convolution_op, nullptr /* thread pool */)); 2625 2626 // Verify results of the first run. 2627 for (size_t i = 0; i < batch_size(); i++) { 2628 for (size_t y = 0; y < output_height(); y++) { 2629 for (size_t x = 0; x < output_width(); x++) { 2630 for (size_t g = 0; g < groups(); g++) { 2631 for (size_t c = 0; c < group_output_channels(); c++) { 2632 ASSERT_LE(int32_t(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), int32_t(qmax())) 2633 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c; 2634 ASSERT_GE(int32_t(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), int32_t(qmin())) 2635 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c; 2636 ASSERT_NEAR( 2637 output_ref[(((i * output_height() + y) * output_width() + x) * groups() + g) * group_output_channels() + c], 2638 double(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c]) - double(output_zero_point), 2639 0.9) 2640 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c; 2641 } 2642 } 2643 } 2644 } 2645 } 2646 2647 // Re-generate data for the second run. 2648 std::generate(input.begin(), input.end(), [&]() { return u8dist(rng); }); 2649 std::fill(output.begin(), output.end(), 0xA5); 2650 2651 // Compute reference results for the second run, including renormalization. 
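      // Unlike the QS8/QC8 paths, QU8 uses asymmetric weights: kernel_zero_point is
      // subtracted from every kernel byte in addition to input_zero_point being
      // subtracted from every input byte, here just as in the first-run accumulation
      // above.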
2652 if (has_bias()) { 2653 for (size_t i = 0; i < next_batch_size(); i++) { 2654 for (size_t oy = 0; oy < next_output_height(); oy++) { 2655 for (size_t ox = 0; ox < next_output_width(); ox++) { 2656 for (size_t g = 0; g < groups(); g++) { 2657 for (size_t oc = 0; oc < group_output_channels(); oc++) { 2658 next_accumulators[(((i * next_output_height() + oy) * next_output_width() + ox) * groups() + g) * group_output_channels() + oc] = 2659 bias[g * group_output_channels() + oc]; 2660 } 2661 } 2662 } 2663 } 2664 } 2665 } else { 2666 std::fill(next_accumulators.begin(), next_accumulators.end(), 0); 2667 } 2668 for (size_t i = 0; i < next_batch_size(); i++) { 2669 for (size_t oy = 0; oy < next_output_height(); oy++) { 2670 for (size_t ox = 0; ox < next_output_width(); ox++) { 2671 for (size_t ky = 0; ky < kernel_height(); ky++) { 2672 const size_t iy = oy * subsampling_height() + ky * dilation_height() - padding_top(); 2673 if (iy < next_input_height()) { 2674 for (size_t kx = 0; kx < kernel_width(); kx++) { 2675 const size_t ix = ox * subsampling_width() + kx * dilation_width() - padding_left(); 2676 if (ix < next_input_width()) { 2677 for (size_t g = 0; g < groups(); g++) { 2678 for (size_t oc = 0; oc < group_output_channels(); oc++) { 2679 for (size_t ic = 0; ic < group_input_channels(); ic++) { 2680 next_accumulators[(((i * next_output_height() + oy) * next_output_width() + ox) * groups() + g) * group_output_channels() + oc] += 2681 (int32_t(input[((i * next_input_height() + iy) * next_input_width() + ix) * input_channel_stride() + g * group_input_channels() + ic]) - int32_t(input_zero_point)) * 2682 (int32_t(kernel[(((g * group_output_channels() + oc) * kernel_height() + ky) * kernel_width() + kx) * group_input_channels() + ic]) - int32_t(kernel_zero_point)); 2683 } 2684 } 2685 } 2686 } 2687 } 2688 } 2689 } 2690 } 2691 } 2692 } 2693 std::transform(next_accumulators.cbegin(), next_accumulators.cend(), next_output_ref.begin(), 2694 [this, output_scale, output_zero_point](int32_t x) -> double { 2695 return std::max<double>(std::min<double>(double(x) / output_scale, double(qmax()) - output_zero_point), double(qmin()) - output_zero_point); 2696 }); 2697 2698 // Setup and run Convolution operator the second time, and destroy the operator. 2699 ASSERT_EQ(xnn_status_success, 2700 xnn_setup_convolution2d_nhwc_qu8( 2701 convolution_op, 2702 next_batch_size(), next_input_height(), next_input_width(), 2703 input.data(), output.data(), 2704 nullptr /* thread pool */)); 2705 2706 ASSERT_EQ(xnn_status_success, 2707 xnn_run_operator(convolution_op, nullptr /* thread pool */)); 2708 2709 // Verify results of the second run. 
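      // For the unsigned path the clamping bounds are qmin() and qmax() directly; there
      // is no -0x80 offset as in the signed QS8/QC8 tests.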
      for (size_t i = 0; i < next_batch_size(); i++) {
        for (size_t y = 0; y < next_output_height(); y++) {
          for (size_t x = 0; x < next_output_width(); x++) {
            for (size_t g = 0; g < groups(); g++) {
              for (size_t c = 0; c < group_output_channels(); c++) {
                ASSERT_LE(int32_t(output[((i * next_output_height() + y) * next_output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), int32_t(qmax()))
                  << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
                ASSERT_GE(int32_t(output[((i * next_output_height() + y) * next_output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), int32_t(qmin()))
                  << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
                ASSERT_NEAR(
                    next_output_ref[(((i * next_output_height() + y) * next_output_width() + x) * groups() + g) * group_output_channels() + c],
                    double(output[((i * next_output_height() + y) * next_output_width() + x) * output_channel_stride() + g * group_output_channels() + c]) - double(output_zero_point),
                    0.9)
                  << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
              }
            }
          }
        }
      }
    }
  }

  void TestSetupNHWCxF16() const {
    ASSERT_EQ(weights_type(), WeightsType::Default);

    ASSERT_FALSE(depthwise_layout());

    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    std::uniform_real_distribution<float> f32dist(0.1f, 1.0f);

    std::vector<uint16_t> input(XNN_EXTRA_BYTES / sizeof(uint16_t) + std::max(
      batch_size() * ((input_height() * input_width() - 1) * input_channel_stride() + groups() * group_input_channels()),
      next_batch_size() * ((next_input_height() * next_input_width() - 1) * input_channel_stride() + groups() * group_input_channels())));
    std::vector<uint16_t> kernel(groups() * group_output_channels() * kernel_height() * kernel_width() * group_input_channels());
    std::vector<uint16_t> bias(groups() * group_output_channels());
    std::vector<uint16_t> output(std::max(
      batch_size() * ((output_height() * output_width() - 1) * output_channel_stride() + groups() * group_output_channels()),
      next_batch_size() * ((next_output_height() * next_output_width() - 1) * output_channel_stride() + groups() * group_output_channels())));
    std::vector<float> output_ref(batch_size() * output_height() * output_width() * groups() * group_output_channels());
    std::vector<float> next_output_ref(next_batch_size() * next_output_height() * next_output_width() * groups() * group_output_channels());

    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      std::generate(input.begin(), input.end(), [&]() { return fp16_ieee_from_fp32_value(f32dist(rng)); });
      std::generate(kernel.begin(), kernel.end(), [&]() { return fp16_ieee_from_fp32_value(f32dist(rng)); });
      std::generate(bias.begin(), bias.end(), [&]() { return fp16_ieee_from_fp32_value(f32dist(rng)); });
      std::fill(output.begin(), output.end(), UINT16_C(0x7E00) /* NaN */);

      // Compute reference results, without clamping.
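      // The reference result is accumulated in fp32 from fp16 inputs and weights, so
      // it can drift from the fp16 arithmetic inside the operator; the verification
      // later uses a mixed absolute/relative tolerance to allow for this.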
      if (has_bias()) {
        for (size_t i = 0; i < batch_size(); i++) {
          for (size_t oy = 0; oy < output_height(); oy++) {
            for (size_t ox = 0; ox < output_width(); ox++) {
              for (size_t g = 0; g < groups(); g++) {
                for (size_t oc = 0; oc < group_output_channels(); oc++) {
                  output_ref[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] =
                    fp16_ieee_to_fp32_value(bias[g * group_output_channels() + oc]);
                }
              }
            }
          }
        }
      } else {
        std::fill(output_ref.begin(), output_ref.end(), 0.0f);
      }
      for (size_t i = 0; i < batch_size(); i++) {
        for (size_t oy = 0; oy < output_height(); oy++) {
          for (size_t ox = 0; ox < output_width(); ox++) {
            for (size_t ky = 0; ky < kernel_height(); ky++) {
              const size_t iy = oy * subsampling_height() + ky * dilation_height() - padding_top();
              if (iy < input_height()) {
                for (size_t kx = 0; kx < kernel_width(); kx++) {
                  const size_t ix = ox * subsampling_width() + kx * dilation_width() - padding_left();
                  if (ix < input_width()) {
                    for (size_t g = 0; g < groups(); g++) {
                      for (size_t oc = 0; oc < group_output_channels(); oc++) {
                        for (size_t ic = 0; ic < group_input_channels(); ic++) {
                          output_ref[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] +=
                            fp16_ieee_to_fp32_value(input[((i * input_height() + iy) * input_width() + ix) * input_channel_stride() + g * group_input_channels() + ic]) *
                            fp16_ieee_to_fp32_value(kernel[(((g * group_output_channels() + oc) * kernel_height() + ky) * kernel_width() + kx) * group_input_channels() + ic]);
                        }
                      }
                    }
                  }
                }
              }
            }
          }
        }
      }

      // Compute clamping parameters.
      const float accumulated_min = *std::min_element(output_ref.cbegin(), output_ref.cend());
      const float accumulated_max = *std::max_element(output_ref.cbegin(), output_ref.cend());
      const float accumulated_range = accumulated_max - accumulated_min;
      const float scaled_min = fp16_ieee_to_fp32_value(fp16_ieee_from_fp32_value(accumulated_min + accumulated_range / 255.0f * float(qmin())));
      const float scaled_max = fp16_ieee_to_fp32_value(fp16_ieee_from_fp32_value(accumulated_max - accumulated_range / 255.0f * float(255 - qmax())));
      const float output_min = scaled_min == scaled_max ? -std::numeric_limits<float>::infinity() : scaled_min;
      const float output_max = scaled_min == scaled_max ? +std::numeric_limits<float>::infinity() : scaled_max;

      for (float& output_value : output_ref) {
        output_value = std::min(std::max(output_value, output_min), output_max);
      }

      // Create, setup, and run Convolution operator once.
      ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
      xnn_operator_t convolution_op = nullptr;

      xnn_status status = xnn_create_convolution2d_nhwc_f16(
        padding_top(), padding_right(), padding_bottom(), padding_left(),
        kernel_height(), kernel_width(),
        subsampling_height(), subsampling_width(),
        dilation_height(), dilation_width(),
        groups(), group_input_channels(), group_output_channels(),
        input_channel_stride(), output_channel_stride(),
        kernel.data(), has_bias() ? bias.data() : nullptr,
        output_min, output_max,
        0, NULL, &convolution_op);
      if (status == xnn_status_unsupported_hardware) {
        GTEST_SKIP();
      }
      ASSERT_EQ(xnn_status_success, status);
      ASSERT_NE(nullptr, convolution_op);

      // Smart pointer to automatically delete convolution_op.
      std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_convolution_op(convolution_op, xnn_delete_operator);

      ASSERT_EQ(xnn_status_success,
        xnn_setup_convolution2d_nhwc_f16(
          convolution_op,
          batch_size(), input_height(), input_width(),
          input.data(), output.data(),
          nullptr /* thread pool */));

      ASSERT_EQ(xnn_status_success,
        xnn_run_operator(convolution_op, nullptr /* thread pool */));

      // Verify results of the first run.
      for (size_t i = 0; i < batch_size(); i++) {
        for (size_t y = 0; y < output_height(); y++) {
          for (size_t x = 0; x < output_width(); x++) {
            for (size_t g = 0; g < groups(); g++) {
              for (size_t c = 0; c < group_output_channels(); c++) {
                ASSERT_GE(fp16_ieee_to_fp32_value(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), output_min)
                  << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
                ASSERT_LE(fp16_ieee_to_fp32_value(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), output_max)
                  << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
                ASSERT_NEAR(
                    output_ref[(((i * output_height() + y) * output_width() + x) * groups() + g) * group_output_channels() + c],
                    fp16_ieee_to_fp32_value(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c]),
                    std::max(1.0e-4f, std::abs(output_ref[(((i * output_height() + y) * output_width() + x) * groups() + g) * group_output_channels() + c]) * 1.0e-2f))
                  << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
              }
            }
          }
        }
      }

      // Re-generate data for the second run.
      std::generate(input.begin(), input.end(), [&]() { return fp16_ieee_from_fp32_value(f32dist(rng)); });
      std::fill(output.begin(), output.end(), UINT16_C(0x7E00) /* NaN */);

      // Compute reference results for the second run, including clamping.
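      // The operator was created with output_min/output_max derived from the first
      // run's data, so the second-run reference must be clamped to the same bounds.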
      if (has_bias()) {
        for (size_t i = 0; i < next_batch_size(); i++) {
          for (size_t oy = 0; oy < next_output_height(); oy++) {
            for (size_t ox = 0; ox < next_output_width(); ox++) {
              for (size_t g = 0; g < groups(); g++) {
                for (size_t oc = 0; oc < group_output_channels(); oc++) {
                  next_output_ref[(((i * next_output_height() + oy) * next_output_width() + ox) * groups() + g) * group_output_channels() + oc] =
                    fp16_ieee_to_fp32_value(bias[g * group_output_channels() + oc]);
                }
              }
            }
          }
        }
      } else {
        std::fill(next_output_ref.begin(), next_output_ref.end(), 0.0f);
      }
      for (size_t i = 0; i < next_batch_size(); i++) {
        for (size_t oy = 0; oy < next_output_height(); oy++) {
          for (size_t ox = 0; ox < next_output_width(); ox++) {
            for (size_t ky = 0; ky < kernel_height(); ky++) {
              const size_t iy = oy * subsampling_height() + ky * dilation_height() - padding_top();
              if (iy < next_input_height()) {
                for (size_t kx = 0; kx < kernel_width(); kx++) {
                  const size_t ix = ox * subsampling_width() + kx * dilation_width() - padding_left();
                  if (ix < next_input_width()) {
                    for (size_t g = 0; g < groups(); g++) {
                      for (size_t oc = 0; oc < group_output_channels(); oc++) {
                        for (size_t ic = 0; ic < group_input_channels(); ic++) {
                          next_output_ref[(((i * next_output_height() + oy) * next_output_width() + ox) * groups() + g) * group_output_channels() + oc] +=
                            fp16_ieee_to_fp32_value(input[((i * next_input_height() + iy) * next_input_width() + ix) * input_channel_stride() + g * group_input_channels() + ic]) *
                            fp16_ieee_to_fp32_value(kernel[(((g * group_output_channels() + oc) * kernel_height() + ky) * kernel_width() + kx) * group_input_channels() + ic]);
                        }
                      }
                    }
                  }
                }
              }
            }
          }
        }
      }
      for (float& value : next_output_ref) {
        value = std::max(std::min(value, output_max), output_min);
      }

      // Setup and run Convolution operator the second time, and destroy the operator.
      ASSERT_EQ(xnn_status_success,
        xnn_setup_convolution2d_nhwc_f16(
          convolution_op,
          next_batch_size(), next_input_height(), next_input_width(),
          input.data(), output.data(),
          nullptr /* thread pool */));

      ASSERT_EQ(xnn_status_success,
        xnn_run_operator(convolution_op, nullptr /* thread pool */));

      // Verify results of the second run.
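      // Tolerance below: at least 1.0e-4 absolute, widened to 1% of the reference
      // magnitude, a loose bound for error accumulated in fp16 arithmetic
      // (10-bit significand).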
      for (size_t i = 0; i < next_batch_size(); i++) {
        for (size_t y = 0; y < next_output_height(); y++) {
          for (size_t x = 0; x < next_output_width(); x++) {
            for (size_t g = 0; g < groups(); g++) {
              for (size_t c = 0; c < group_output_channels(); c++) {
                ASSERT_GE(fp16_ieee_to_fp32_value(output[((i * next_output_height() + y) * next_output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), output_min)
                  << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
                ASSERT_LE(fp16_ieee_to_fp32_value(output[((i * next_output_height() + y) * next_output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), output_max)
                  << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
                ASSERT_NEAR(
                    next_output_ref[(((i * next_output_height() + y) * next_output_width() + x) * groups() + g) * group_output_channels() + c],
                    fp16_ieee_to_fp32_value(output[((i * next_output_height() + y) * next_output_width() + x) * output_channel_stride() + g * group_output_channels() + c]),
                    std::max(1.0e-4f, std::abs(next_output_ref[(((i * next_output_height() + y) * next_output_width() + x) * groups() + g) * group_output_channels() + c]) * 1.0e-2f))
                  << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
              }
            }
          }
        }
      }
    }
  }

  void TestSetupNHWCxF32() const {
    ASSERT_EQ(weights_type(), WeightsType::Default);

    ASSERT_FALSE(depthwise_layout());

    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    std::uniform_real_distribution<float> f32dist(0.1f, 1.0f);

    std::vector<float> input(XNN_EXTRA_BYTES / sizeof(float) + std::max(
      batch_size() * ((input_height() * input_width() - 1) * input_channel_stride() + groups() * group_input_channels()),
      next_batch_size() * ((next_input_height() * next_input_width() - 1) * input_channel_stride() + groups() * group_input_channels())));
    std::vector<float> kernel(groups() * group_output_channels() * kernel_height() * kernel_width() * group_input_channels());
    std::vector<float> bias(groups() * group_output_channels());
    std::vector<float> output(std::max(
      batch_size() * ((output_height() * output_width() - 1) * output_channel_stride() + groups() * group_output_channels()),
      next_batch_size() * ((next_output_height() * next_output_width() - 1) * output_channel_stride() + groups() * group_output_channels())));
    std::vector<float> output_ref(batch_size() * output_height() * output_width() * groups() * group_output_channels());
    std::vector<float> next_output_ref(next_batch_size() * next_output_height() * next_output_width() * groups() * group_output_channels());

    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); });
      std::generate(kernel.begin(), kernel.end(), [&]() { return f32dist(rng); });
      std::generate(bias.begin(), bias.end(), [&]() { return f32dist(rng); });
      std::fill(output.begin(), output.end(), nanf(""));

      // Compute reference results, without clamping.
      if (has_bias()) {
        for (size_t i = 0; i < batch_size(); i++) {
          for (size_t oy = 0; oy < output_height(); oy++) {
            for (size_t ox = 0; ox < output_width(); ox++) {
              for (size_t g = 0; g < groups(); g++) {
                for (size_t oc = 0; oc < group_output_channels(); oc++) {
                  output_ref[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] =
                    bias[g * group_output_channels() + oc];
                }
              }
            }
          }
        }
      } else {
        std::fill(output_ref.begin(), output_ref.end(), 0.0f);
      }
      for (size_t i = 0; i < batch_size(); i++) {
        for (size_t oy = 0; oy < output_height(); oy++) {
          for (size_t ox = 0; ox < output_width(); ox++) {
            for (size_t ky = 0; ky < kernel_height(); ky++) {
              const size_t iy = oy * subsampling_height() + ky * dilation_height() - padding_top();
              if (iy < input_height()) {
                for (size_t kx = 0; kx < kernel_width(); kx++) {
                  const size_t ix = ox * subsampling_width() + kx * dilation_width() - padding_left();
                  if (ix < input_width()) {
                    for (size_t g = 0; g < groups(); g++) {
                      for (size_t oc = 0; oc < group_output_channels(); oc++) {
                        for (size_t ic = 0; ic < group_input_channels(); ic++) {
                          output_ref[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] +=
                            input[((i * input_height() + iy) * input_width() + ix) * input_channel_stride() + g * group_input_channels() + ic] *
                            kernel[(((g * group_output_channels() + oc) * kernel_height() + ky) * kernel_width() + kx) * group_input_channels() + ic];
                        }
                      }
                    }
                  }
                }
              }
            }
          }
        }
      }

      // Compute clamping parameters.
      const float accumulated_min = *std::min_element(output_ref.cbegin(), output_ref.cend());
      const float accumulated_max = *std::max_element(output_ref.cbegin(), output_ref.cend());

      const float output_min = accumulated_min + (accumulated_max - accumulated_min) / 255.0f * float(qmin());
      const float output_max = accumulated_max - (accumulated_max - accumulated_min) / 255.0f * float(255 - qmax());

      // Clamp reference results.
      for (float& value : output_ref) {
        value = std::max(std::min(value, output_max), output_min);
      }

      // Create, setup, and run Convolution operator once.
      ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
      xnn_operator_t convolution_op = nullptr;

      xnn_status status = xnn_create_convolution2d_nhwc_f32(
        padding_top(), padding_right(), padding_bottom(), padding_left(),
        kernel_height(), kernel_width(),
        subsampling_height(), subsampling_width(),
        dilation_height(), dilation_width(),
        groups(), group_input_channels(), group_output_channels(),
        input_channel_stride(), output_channel_stride(),
        kernel.data(), has_bias() ? bias.data() : nullptr,
        output_min, output_max,
        0, NULL, &convolution_op);
      if (status == xnn_status_unsupported_hardware) {
        GTEST_SKIP();
      }
      ASSERT_EQ(xnn_status_success, status);
      ASSERT_NE(nullptr, convolution_op);

      // Smart pointer to automatically delete convolution_op.
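      // The deleter type is the function-pointer type of xnn_delete_operator, so the
      // operator is released even if a later ASSERT_* exits this function early.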
      std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_convolution_op(convolution_op, xnn_delete_operator);

      ASSERT_EQ(xnn_status_success,
        xnn_setup_convolution2d_nhwc_f32(
          convolution_op,
          batch_size(), input_height(), input_width(),
          input.data(), output.data(),
          nullptr /* thread pool */));

      ASSERT_EQ(xnn_status_success,
        xnn_run_operator(convolution_op, nullptr /* thread pool */));

      // Verify results of the first run.
      for (size_t i = 0; i < batch_size(); i++) {
        for (size_t y = 0; y < output_height(); y++) {
          for (size_t x = 0; x < output_width(); x++) {
            for (size_t g = 0; g < groups(); g++) {
              for (size_t c = 0; c < group_output_channels(); c++) {
                ASSERT_GE(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c], output_min)
                  << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
                ASSERT_LE(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c], output_max)
                  << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
                ASSERT_NEAR(
                    output_ref[(((i * output_height() + y) * output_width() + x) * groups() + g) * group_output_channels() + c],
                    output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c],
                    1.0e-4 * std::abs(output_ref[(((i * output_height() + y) * output_width() + x) * groups() + g) * group_output_channels() + c]))
                  << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
              }
            }
          }
        }
      }

      // Re-generate data for the second run.
      std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); });
      std::fill(output.begin(), output.end(), nanf(""));

      // Compute reference results for the second run, including clamping.
      if (has_bias()) {
        for (size_t i = 0; i < next_batch_size(); i++) {
          for (size_t oy = 0; oy < next_output_height(); oy++) {
            for (size_t ox = 0; ox < next_output_width(); ox++) {
              for (size_t g = 0; g < groups(); g++) {
                for (size_t oc = 0; oc < group_output_channels(); oc++) {
                  next_output_ref[(((i * next_output_height() + oy) * next_output_width() + ox) * groups() + g) * group_output_channels() + oc] =
                    bias[g * group_output_channels() + oc];
                }
              }
            }
          }
        }
      } else {
        std::fill(next_output_ref.begin(), next_output_ref.end(), 0.0f);
      }
      for (size_t i = 0; i < next_batch_size(); i++) {
        for (size_t oy = 0; oy < next_output_height(); oy++) {
          for (size_t ox = 0; ox < next_output_width(); ox++) {
            for (size_t ky = 0; ky < kernel_height(); ky++) {
              const size_t iy = oy * subsampling_height() + ky * dilation_height() - padding_top();
              if (iy < next_input_height()) {
                for (size_t kx = 0; kx < kernel_width(); kx++) {
                  const size_t ix = ox * subsampling_width() + kx * dilation_width() - padding_left();
                  if (ix < next_input_width()) {
                    for (size_t g = 0; g < groups(); g++) {
                      for (size_t oc = 0; oc < group_output_channels(); oc++) {
                        for (size_t ic = 0; ic < group_input_channels(); ic++) {
                          next_output_ref[(((i * next_output_height() + oy) * next_output_width() + ox) * groups() + g) * group_output_channels() + oc] +=
                            input[((i * next_input_height() + iy) * next_input_width() + ix) * input_channel_stride() + g * group_input_channels() + ic] *
                            kernel[(((g * group_output_channels() + oc) * kernel_height() + ky) * kernel_width() + kx) * group_input_channels() + ic];
                        }
                      }
                    }
                  }
                }
              }
            }
          }
        }
      }
      for (float& value : next_output_ref) {
        value = std::max(std::min(value, output_max), output_min);
      }

      // Setup and run Convolution operator the second time, and destroy the operator.
      ASSERT_EQ(xnn_status_success,
        xnn_setup_convolution2d_nhwc_f32(
          convolution_op,
          next_batch_size(), next_input_height(), next_input_width(),
          input.data(), output.data(),
          nullptr /* thread pool */));

      ASSERT_EQ(xnn_status_success,
        xnn_run_operator(convolution_op, nullptr /* thread pool */));

      // Verify results of the second run.
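      // The f32 check uses a purely relative 1.0e-4 tolerance (no absolute floor),
      // tighter than the f16 check since both the reference and the operator
      // accumulate in single precision.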
      for (size_t i = 0; i < next_batch_size(); i++) {
        for (size_t y = 0; y < next_output_height(); y++) {
          for (size_t x = 0; x < next_output_width(); x++) {
            for (size_t g = 0; g < groups(); g++) {
              for (size_t c = 0; c < group_output_channels(); c++) {
                ASSERT_GE(output[((i * next_output_height() + y) * next_output_width() + x) * output_channel_stride() + g * group_output_channels() + c], output_min)
                  << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
                ASSERT_LE(output[((i * next_output_height() + y) * next_output_width() + x) * output_channel_stride() + g * group_output_channels() + c], output_max)
                  << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
                ASSERT_NEAR(
                    next_output_ref[(((i * next_output_height() + y) * next_output_width() + x) * groups() + g) * group_output_channels() + c],
                    output[((i * next_output_height() + y) * next_output_width() + x) * output_channel_stride() + g * group_output_channels() + c],
                    1.0e-4 * std::abs(next_output_ref[(((i * next_output_height() + y) * next_output_width() + x) * groups() + g) * group_output_channels() + c]))
                  << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
              }
            }
          }
        }
      }
    }
  }

  void VerifyWeightsCache(const xnn_weights_cache &weights_cache, size_t old_size) const {
    ASSERT_EQ(weights_cache.cache.hits, 1);
    // Ensure that we did not write more weights to the cache because it was a
    // cache hit.
    ASSERT_EQ(old_size, weights_cache.cache.weights.size);
  }

  void VerifyWeightsCacheUnused(const xnn_weights_cache &weights_cache) const {
    ASSERT_EQ(weights_cache.cache.hits, 0);
    ASSERT_EQ(0, weights_cache.cache.weights.size);
  }

  bool IsSpmm() const {
    const bool is_1x1 = kernel_width() == 1 && kernel_height() == 1 &&
                        subsampling_height() == 1 && subsampling_width() == 1;
    const bool any_padding = (padding_left() | padding_top() | padding_right() | padding_bottom()) != 0;
    return is_1x1 && !any_padding && !force_nhwc_input() && groups() == 1;
  }

 private:
  uint32_t padding_top_{0};
  uint32_t padding_right_{0};
  uint32_t padding_bottom_{0};
  uint32_t padding_left_{0};
  bool padding_tf_same_{false};
  size_t input_height_{1};
  size_t input_width_{1};
  uint32_t groups_{1};
  size_t group_input_channels_{1};
  size_t input_channel_stride_{0};
  size_t group_output_channels_{1};
  size_t output_channel_stride_{0};
  size_t batch_size_{1};
  uint32_t kernel_height_{1};
  uint32_t kernel_width_{1};
  uint32_t dilation_height_{1};
  uint32_t dilation_width_{1};
  uint32_t subsampling_height_{1};
  uint32_t subsampling_width_{1};
  size_t next_input_height_{0};
  size_t next_input_width_{0};
  size_t next_batch_size_{0};
  float sparsity_{0.0f};
  uint8_t qmin_{0};
  uint8_t qmax_{255};
  bool depthwise_layout_{false};
  bool force_nhwc_input_{false};
  bool has_bias_{true};
  WeightsType weights_type_{WeightsType::Default};
  size_t iterations_{1};
#if XNN_PLATFORM_JIT
  bool use_jit_{false};
#endif
  bool use_weights_cache_{false};
};

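// Illustrative usage (a sketch, not part of this header's API surface): a test in a
// .cc file that includes this header would typically configure the tester with its
// fluent setters and invoke one of the TestSetupNHWCx* entry points. The exact setter
// set below (in particular next_batch_size()) is an assumption based on the accessors
// used in this file; the test name is hypothetical.
//
//   TEST(CONVOLUTION_NHWC_F32, setup_changing_batch) {
//     ConvolutionOperatorTester()
//       .batch_size(3)
//       .next_batch_size(5)
//       .input_size(8, 8)
//       .kernel_size(3, 3)
//       .group_input_channels(15)
//       .group_output_channels(17)
//       .TestSetupNHWCxF32();
//   }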