// Copyright (c) Facebook, Inc. and its affiliates.
// All rights reserved.
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#pragma once

#include <gtest/gtest.h>

#include <fp16.h>

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <limits>
#include <memory>
#include <random>
#include <vector>

#include <xnnpack.h>


class MaxPoolingOperatorTester {
 public:
  inline MaxPoolingOperatorTester& padding_tf_same(bool padding_same) {
    if (padding_same) {
      assert(padding_top() == 0);
      assert(padding_left() == 0);
      assert(padding_bottom() == 0);
      assert(padding_right() == 0);
    }
    this->padding_tf_same_ = padding_same;
    return *this;
  }

  inline bool padding_tf_same() const {
    return this->padding_tf_same_;
  }

  inline MaxPoolingOperatorTester& padding(uint32_t padding) {
    assert(!padding_tf_same());
    this->padding_top_ = padding;
    this->padding_right_ = padding;
    this->padding_bottom_ = padding;
    this->padding_left_ = padding;
    return *this;
  }

  inline MaxPoolingOperatorTester& padding(uint32_t padding_height, uint32_t padding_width) {
    assert(!padding_tf_same());
    this->padding_top_ = padding_height;
    this->padding_right_ = padding_width;
    this->padding_bottom_ = padding_height;
    this->padding_left_ = padding_width;
    return *this;
  }

  inline MaxPoolingOperatorTester& padding_height(uint32_t padding_height) {
    assert(!padding_tf_same());
    this->padding_top_ = padding_height;
    this->padding_bottom_ = padding_height;
    return *this;
  }

  inline MaxPoolingOperatorTester& padding_width(uint32_t padding_width) {
    assert(!padding_tf_same());
    this->padding_right_ = padding_width;
    this->padding_left_ = padding_width;
    return *this;
  }

  inline MaxPoolingOperatorTester& padding_top(uint32_t padding_top) {
    assert(!padding_tf_same());
    this->padding_top_ = padding_top;
    return *this;
  }

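  // With TF SAME padding, the padding is not set explicitly: the getters below
  // derive it from the input/output sizes so that the total padding is split
  // as evenly as possible, with the extra pixel (for odd totals) going to the
  // bottom/right edge, matching TensorFlow's convention.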
  inline uint32_t padding_top() const {
    if (padding_tf_same()) {
      const uint32_t total_padding_height =
        (output_height() - 1) * stride_height() + dilated_pooling_height() - input_height();
      return total_padding_height / 2;
    } else {
      return this->padding_top_;
    }
  }

  inline MaxPoolingOperatorTester& padding_left(uint32_t padding_left) {
    assert(!padding_tf_same());
    this->padding_left_ = padding_left;
    return *this;
  }

  inline uint32_t padding_left() const {
    if (padding_tf_same()) {
      const uint32_t total_padding_width =
        (output_width() - 1) * stride_width() + dilated_pooling_width() - input_width();
      return total_padding_width / 2;
    } else {
      return this->padding_left_;
    }
  }

  inline MaxPoolingOperatorTester& padding_bottom(uint32_t padding_bottom) {
    assert(!padding_tf_same());
    this->padding_bottom_ = padding_bottom;
    return *this;
  }

  inline uint32_t padding_bottom() const {
    if (padding_tf_same()) {
      const uint32_t total_padding_height =
        (output_height() - 1) * stride_height() + dilated_pooling_height() - input_height();
      return total_padding_height - total_padding_height / 2;
    } else {
      return this->padding_bottom_;
    }
  }

  inline MaxPoolingOperatorTester& padding_right(uint32_t padding_right) {
    assert(!padding_tf_same());
    this->padding_right_ = padding_right;
    return *this;
  }

  inline uint32_t padding_right() const {
    if (padding_tf_same()) {
      const uint32_t total_padding_width =
        (output_width() - 1) * stride_width() + dilated_pooling_width() - input_width();
      return total_padding_width - total_padding_width / 2;
    } else {
      return this->padding_right_;
    }
  }

  inline MaxPoolingOperatorTester& input_size(size_t input_height, size_t input_width) {
    assert(input_height >= 1);
    assert(input_width >= 1);
    this->input_height_ = input_height;
    this->input_width_ = input_width;
    return *this;
  }

  inline MaxPoolingOperatorTester& input_height(size_t input_height) {
    assert(input_height >= 1);
    this->input_height_ = input_height;
    return *this;
  }

  inline size_t input_height() const {
    return this->input_height_;
  }

  inline MaxPoolingOperatorTester& input_width(size_t input_width) {
    assert(input_width >= 1);
    this->input_width_ = input_width;
    return *this;
  }

  inline size_t input_width() const {
    return this->input_width_;
  }

  inline MaxPoolingOperatorTester& channels(size_t channels) {
    assert(channels != 0);
    this->channels_ = channels;
    return *this;
  }

  inline size_t channels() const {
    return this->channels_;
  }

  inline MaxPoolingOperatorTester& batch_size(size_t batch_size) {
    assert(batch_size != 0);
    this->batch_size_ = batch_size;
    return *this;
  }

  inline size_t batch_size() const {
    return this->batch_size_;
  }

  inline MaxPoolingOperatorTester& pooling_size(uint32_t pooling_size) {
    assert(pooling_size >= 1);
    this->pooling_height_ = pooling_size;
    this->pooling_width_ = pooling_size;
    return *this;
  }

  inline MaxPoolingOperatorTester& pooling_size(uint32_t pooling_height, uint32_t pooling_width) {
    assert(pooling_height >= 1);
    assert(pooling_width >= 1);
    this->pooling_height_ = pooling_height;
    this->pooling_width_ = pooling_width;
    return *this;
  }

  inline MaxPoolingOperatorTester& pooling_height(uint32_t pooling_height) {
    assert(pooling_height >= 1);
    this->pooling_height_ = pooling_height;
    return *this;
  }

  inline uint32_t pooling_height() const {
    return this->pooling_height_;
  }

  inline MaxPoolingOperatorTester& pooling_width(uint32_t pooling_width) {
    assert(pooling_width >= 1);
    this->pooling_width_ = pooling_width;
    return *this;
  }

  inline uint32_t pooling_width() const {
    return this->pooling_width_;
  }

  inline MaxPoolingOperatorTester& stride(uint32_t stride) {
    assert(stride >= 1);
    this->stride_height_ = stride;
    this->stride_width_ = stride;
    return *this;
  }

  inline MaxPoolingOperatorTester& stride(uint32_t stride_height, uint32_t stride_width) {
    assert(stride_height >= 1);
    assert(stride_width >= 1);
    this->stride_height_ = stride_height;
    this->stride_width_ = stride_width;
    return *this;
  }

  inline MaxPoolingOperatorTester& stride_height(uint32_t stride_height) {
    assert(stride_height >= 1);
    this->stride_height_ = stride_height;
    return *this;
  }

  inline uint32_t stride_height() const {
    return this->stride_height_;
  }

  inline MaxPoolingOperatorTester& stride_width(uint32_t stride_width) {
    assert(stride_width >= 1);
    this->stride_width_ = stride_width;
    return *this;
  }

  inline uint32_t stride_width() const {
    return this->stride_width_;
  }

  inline MaxPoolingOperatorTester& dilation(uint32_t dilation) {
    assert(dilation >= 1);
    this->dilation_height_ = dilation;
    this->dilation_width_ = dilation;
    return *this;
  }

  inline MaxPoolingOperatorTester& dilation(uint32_t dilation_height, uint32_t dilation_width) {
    assert(dilation_height >= 1);
    assert(dilation_width >= 1);
    this->dilation_height_ = dilation_height;
    this->dilation_width_ = dilation_width;
    return *this;
  }

  inline MaxPoolingOperatorTester& dilation_height(uint32_t dilation_height) {
    assert(dilation_height >= 1);
    this->dilation_height_ = dilation_height;
    return *this;
  }

  inline uint32_t dilation_height() const {
    return this->dilation_height_;
  }

  inline MaxPoolingOperatorTester& dilation_width(uint32_t dilation_width) {
    assert(dilation_width >= 1);
    this->dilation_width_ = dilation_width;
    return *this;
  }

  inline uint32_t dilation_width() const {
    return this->dilation_width_;
  }

  inline uint32_t dilated_pooling_height() const {
    return (pooling_height() - 1) * dilation_height() + 1;
  }

  inline uint32_t dilated_pooling_width() const {
    return (pooling_width() - 1) * dilation_width() + 1;
  }

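  // Output size follows the standard pooling arithmetic: with explicit padding,
  //   output = (padded_input - dilated_pooling) / stride + 1  (and at least 1),
  // where dilated_pooling = (pooling - 1) * dilation + 1; with TF SAME padding
  // it is simply ceil(input / stride).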
  inline size_t output_height() const {
    if (padding_tf_same()) {
      return (input_height() + stride_height() - 1) / stride_height();
    } else {
      const size_t padded_input_height = padding_top() + input_height() + padding_bottom();
      if (padded_input_height <= dilated_pooling_height()) {
        return 1;
      } else {
        return (padded_input_height - dilated_pooling_height()) / stride_height() + 1;
      }
    }
  }

  inline size_t output_width() const {
    if (padding_tf_same()) {
      return (input_width() + stride_width() - 1) / stride_width();
    } else {
      const size_t padded_input_width = padding_left() + input_width() + padding_right();
      if (padded_input_width <= dilated_pooling_width()) {
        return 1;
      } else {
        return (padded_input_width - dilated_pooling_width()) / stride_width() + 1;
      }
    }
  }

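  // Pixel strides are expressed in elements. A value of 0 (the default) means
  // a dense NHWC layout, i.e. the stride equals channels(); otherwise the
  // stride must be at least channels(), leaving a gap between pixels.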
  inline MaxPoolingOperatorTester& input_pixel_stride(size_t input_pixel_stride) {
    assert(input_pixel_stride != 0);
    this->input_pixel_stride_ = input_pixel_stride;
    return *this;
  }

  inline size_t input_pixel_stride() const {
    if (this->input_pixel_stride_ == 0) {
      return channels();
    } else {
      assert(this->input_pixel_stride_ >= channels());
      return this->input_pixel_stride_;
    }
  }

  inline MaxPoolingOperatorTester& output_pixel_stride(size_t output_pixel_stride) {
    assert(output_pixel_stride != 0);
    this->output_pixel_stride_ = output_pixel_stride;
    return *this;
  }

  inline size_t output_pixel_stride() const {
    if (this->output_pixel_stride_ == 0) {
      return channels();
    } else {
      assert(this->output_pixel_stride_ >= channels());
      return this->output_pixel_stride_;
    }
  }

  inline MaxPoolingOperatorTester& next_input_size(uint32_t next_input_height, uint32_t next_input_width) {
    assert(next_input_height >= 1);
    assert(next_input_width >= 1);
    this->next_input_height_ = next_input_height;
    this->next_input_width_ = next_input_width;
    return *this;
  }

  inline MaxPoolingOperatorTester& next_input_height(uint32_t next_input_height) {
    assert(next_input_height >= 1);
    this->next_input_height_ = next_input_height;
    return *this;
  }

  inline uint32_t next_input_height() const {
    if (this->next_input_height_ == 0) {
      return input_height();
    } else {
      return this->next_input_height_;
    }
  }

  inline MaxPoolingOperatorTester& next_input_width(uint32_t next_input_width) {
    assert(next_input_width >= 1);
    this->next_input_width_ = next_input_width;
    return *this;
  }

  inline uint32_t next_input_width() const {
    if (this->next_input_width_ == 0) {
      return input_width();
    } else {
      return this->next_input_width_;
    }
  }

  inline size_t next_output_height() const {
    const size_t padded_next_input_height = padding_top() + next_input_height() + padding_bottom();
    if (padded_next_input_height <= dilated_pooling_height()) {
      return 1;
    } else {
      return (padded_next_input_height - dilated_pooling_height()) / stride_height() + 1;
    }
  }

  inline size_t next_output_width() const {
    const size_t padded_next_input_width = padding_left() + next_input_width() + padding_right();
    if (padded_next_input_width <= dilated_pooling_width()) {
      return 1;
    } else {
      return (padded_next_input_width - dilated_pooling_width()) / stride_width() + 1;
    }
  }

  inline MaxPoolingOperatorTester& next_batch_size(size_t next_batch_size) {
    assert(next_batch_size >= 1);
    this->next_batch_size_ = next_batch_size;
    return *this;
  }

  inline size_t next_batch_size() const {
    if (this->next_batch_size_ == 0) {
      return batch_size();
    } else {
      return this->next_batch_size_;
    }
  }

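  // For the quantized (S8/U8) tests, qmin/qmax are the operator's output clamp
  // bounds; for the floating-point tests they only parameterize how tightly
  // the derived output_min/output_max clamp the observed output range.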
  inline MaxPoolingOperatorTester& qmin(uint8_t qmin) {
    this->qmin_ = qmin;
    return *this;
  }

  inline uint8_t qmin() const {
    return this->qmin_;
  }

  inline MaxPoolingOperatorTester& qmax(uint8_t qmax) {
    this->qmax_ = qmax;
    return *this;
  }

  inline uint8_t qmax() const {
    return this->qmax_;
  }

  inline MaxPoolingOperatorTester& iterations(size_t iterations) {
    this->iterations_ = iterations;
    return *this;
  }

  inline size_t iterations() const {
    return this->iterations_;
  }

  void TestS8() const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    std::uniform_int_distribution<int32_t> i8dist(
      std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max());

    std::vector<int8_t> input((batch_size() * input_height() * input_width() - 1) * input_pixel_stride() + channels() + XNN_EXTRA_BYTES / sizeof(int8_t));
    std::vector<int8_t> output((batch_size() * output_height() * output_width() - 1) * output_pixel_stride() + channels() + XNN_EXTRA_BYTES / sizeof(int8_t));
    std::vector<int8_t> output_ref(batch_size() * output_height() * output_width() * channels());
    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      std::generate(input.begin(), input.end(), [&]() { return i8dist(rng); });
      std::fill(output.begin(), output.end(), INT8_C(0xA5));

      // Compute reference results.
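      // Positions in the padding region are skipped by the bounds checks
      // below: iy/ix are unsigned, so subtracting the padding wraps negative
      // coordinates around to huge values that fail ix < input_width() and
      // iy < input_height(). For max pooling this is equivalent to treating
      // padding as -infinity.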
      for (size_t i = 0; i < batch_size(); i++) {
        for (size_t oy = 0; oy < output_height(); oy++) {
          for (size_t ox = 0; ox < output_width(); ox++) {
            for (size_t c = 0; c < channels(); c++) {
              int8_t max_value = std::numeric_limits<int8_t>::min();
              for (size_t py = 0; py < pooling_height(); py++) {
                const size_t iy = oy * stride_height() + py * dilation_height() - padding_top();
                for (size_t px = 0; px < pooling_width(); px++) {
                  const size_t ix = ox * stride_width() + px * dilation_width() - padding_left();
                  if (ix < input_width() && iy < input_height()) {
                    max_value = std::max(max_value,
                      input[((i * input_height() + iy) * input_width() + ix) * input_pixel_stride() + c]);
                  }
                }
              }
              max_value = std::min(max_value, int8_t(qmax() - 0x80));
              max_value = std::max(max_value, int8_t(qmin() - 0x80));
              output_ref[((i * output_height() + oy) * output_width() + ox) * channels() + c] = max_value;
            }
          }
        }
      }

      // Create, setup, run, and destroy Max Pooling operator.
      ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
      xnn_operator_t max_pooling_op = nullptr;

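      // With TF SAME padding the explicit padding arguments must be zero:
      // the XNN_FLAG_TENSORFLOW_SAME_PADDING flag makes the operator compute
      // the padding itself.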
      ASSERT_EQ(xnn_status_success,
        xnn_create_max_pooling2d_nhwc_s8(
          padding_tf_same() ? 0 : padding_top(), padding_tf_same() ? 0 : padding_right(),
          padding_tf_same() ? 0 : padding_bottom(), padding_tf_same() ? 0 : padding_left(),
          pooling_height(), pooling_width(),
          stride_height(), stride_width(),
          dilation_height(), dilation_width(),
          channels(), input_pixel_stride(), output_pixel_stride(),
          int8_t(qmin() - 0x80), int8_t(qmax() - 0x80),
          padding_tf_same() ? XNN_FLAG_TENSORFLOW_SAME_PADDING : 0,
          &max_pooling_op));
      ASSERT_NE(nullptr, max_pooling_op);

      // Smart pointer to automatically delete max_pooling_op.
      std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_max_pooling_op(max_pooling_op, xnn_delete_operator);

      ASSERT_EQ(xnn_status_success,
        xnn_setup_max_pooling2d_nhwc_s8(
          max_pooling_op,
          batch_size(), input_height(), input_width(),
          input.data(), output.data(),
          nullptr /* thread pool */));

      ASSERT_EQ(xnn_status_success,
        xnn_run_operator(max_pooling_op, nullptr /* thread pool */));

      // Verify results.
      for (size_t i = 0; i < batch_size(); i++) {
        for (size_t y = 0; y < output_height(); y++) {
          for (size_t x = 0; x < output_width(); x++) {
            for (size_t c = 0; c < channels(); c++) {
              ASSERT_LE(int32_t(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + c]), int32_t(qmax() - 0x80));
              ASSERT_GE(int32_t(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + c]), int32_t(qmin() - 0x80));
              ASSERT_EQ(int32_t(output_ref[((i * output_height() + y) * output_width() + x) * channels() + c]),
                  int32_t(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + c])) <<
                "in batch index " << i << ", pixel (" << y << ", " << x << "), channel " << c;
            }
          }
        }
      }
    }
  }

  void TestU8() const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    std::uniform_int_distribution<int32_t> u8dist(
      std::numeric_limits<uint8_t>::min(), std::numeric_limits<uint8_t>::max());

    std::vector<uint8_t> input((batch_size() * input_height() * input_width() - 1) * input_pixel_stride() + channels() + XNN_EXTRA_BYTES / sizeof(uint8_t));
    std::vector<uint8_t> output((batch_size() * output_height() * output_width() - 1) * output_pixel_stride() + channels() + XNN_EXTRA_BYTES / sizeof(uint8_t));
    std::vector<uint8_t> output_ref(batch_size() * output_height() * output_width() * channels());
    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      std::generate(input.begin(), input.end(), [&]() { return u8dist(rng); });
      std::fill(output.begin(), output.end(), 0xA5);

      // Compute reference results.
      for (size_t i = 0; i < batch_size(); i++) {
        for (size_t oy = 0; oy < output_height(); oy++) {
          for (size_t ox = 0; ox < output_width(); ox++) {
            for (size_t c = 0; c < channels(); c++) {
              uint8_t max_value = 0;
              for (size_t py = 0; py < pooling_height(); py++) {
                const size_t iy = oy * stride_height() + py * dilation_height() - padding_top();
                for (size_t px = 0; px < pooling_width(); px++) {
                  const size_t ix = ox * stride_width() + px * dilation_width() - padding_left();
                  if (ix < input_width() && iy < input_height()) {
                    max_value = std::max(max_value,
                      input[((i * input_height() + iy) * input_width() + ix) * input_pixel_stride() + c]);
                  }
                }
              }
              max_value = std::min(max_value, qmax());
              max_value = std::max(max_value, qmin());
              output_ref[((i * output_height() + oy) * output_width() + ox) * channels() + c] = max_value;
            }
          }
        }
      }

      // Create, setup, run, and destroy Max Pooling operator.
      ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
      xnn_operator_t max_pooling_op = nullptr;

      ASSERT_EQ(xnn_status_success,
        xnn_create_max_pooling2d_nhwc_u8(
          padding_tf_same() ? 0 : padding_top(), padding_tf_same() ? 0 : padding_right(),
          padding_tf_same() ? 0 : padding_bottom(), padding_tf_same() ? 0 : padding_left(),
          pooling_height(), pooling_width(),
          stride_height(), stride_width(),
          dilation_height(), dilation_width(),
          channels(), input_pixel_stride(), output_pixel_stride(),
          qmin(), qmax(),
          padding_tf_same() ? XNN_FLAG_TENSORFLOW_SAME_PADDING : 0,
          &max_pooling_op));
      ASSERT_NE(nullptr, max_pooling_op);

      // Smart pointer to automatically delete max_pooling_op.
      std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_max_pooling_op(max_pooling_op, xnn_delete_operator);

      ASSERT_EQ(xnn_status_success,
        xnn_setup_max_pooling2d_nhwc_u8(
          max_pooling_op,
          batch_size(), input_height(), input_width(),
          input.data(), output.data(),
          nullptr /* thread pool */));

      ASSERT_EQ(xnn_status_success,
        xnn_run_operator(max_pooling_op, nullptr /* thread pool */));

      // Verify results.
      for (size_t i = 0; i < batch_size(); i++) {
        for (size_t y = 0; y < output_height(); y++) {
          for (size_t x = 0; x < output_width(); x++) {
            for (size_t c = 0; c < channels(); c++) {
              ASSERT_LE(uint32_t(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + c]), uint32_t(qmax()));
              ASSERT_GE(uint32_t(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + c]), uint32_t(qmin()));
              ASSERT_EQ(uint32_t(output_ref[((i * output_height() + y) * output_width() + x) * channels() + c]),
                  uint32_t(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + c])) <<
                "in batch index " << i << ", pixel (" << y << ", " << x << "), channel " << c;
            }
          }
        }
      }
    }
  }

  void TestF16() const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    // Note: we need to avoid FP16 denormals in the generated tensor because they might be processed differently in
    // native vs emulated arithmetic, and we use exact comparison to verify the results against reference.
    std::uniform_real_distribution<float> f32dist(0.001f, 1.0f);

    std::vector<uint16_t> input((batch_size() * input_height() * input_width() - 1) * input_pixel_stride() + channels() + XNN_EXTRA_BYTES / sizeof(uint16_t));
    std::vector<uint16_t> output((batch_size() * output_height() * output_width() - 1) * output_pixel_stride() + channels() + XNN_EXTRA_BYTES / sizeof(uint16_t));
    std::vector<float> output_ref(batch_size() * output_height() * output_width() * channels());
    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      std::generate(input.begin(), input.end(), [&]() { return fp16_ieee_from_fp32_value(f32dist(rng)); });
      std::fill(output.begin(), output.end(), UINT16_C(0x7E00) /* NaN */);

      // Compute reference results, without clamping.
      for (size_t i = 0; i < batch_size(); i++) {
        for (size_t oy = 0; oy < output_height(); oy++) {
          for (size_t ox = 0; ox < output_width(); ox++) {
            for (size_t c = 0; c < channels(); c++) {
              float max_value = -std::numeric_limits<float>::infinity();
              for (size_t py = 0; py < pooling_height(); py++) {
                const size_t iy = oy * stride_height() + py * dilation_height() - padding_top();
                for (size_t px = 0; px < pooling_width(); px++) {
                  const size_t ix = ox * stride_width() + px * dilation_width() - padding_left();
                  if (ix < input_width() && iy < input_height()) {
                    max_value = std::max(max_value,
                      fp16_ieee_to_fp32_value(input[((i * input_height() + iy) * input_width() + ix) * input_pixel_stride() + c]));
                  }
                }
              }
              output_ref[((i * output_height() + oy) * output_width() + ox) * channels() + c] = max_value;
            }
          }
        }
      }

      // Compute clamping parameters.
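      // The operator's clamp bounds are derived from the observed output
      // range: qmin()/qmax() select a fraction of that range in 1/255 steps.
      // The bounds are rounded through FP16 so the operator can represent them
      // exactly, and pushed to +/-infinity when clamping is disabled on either
      // side or the range is degenerate.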
      const float accumulated_min = *std::min_element(output_ref.cbegin(), output_ref.cend());
      const float accumulated_max = *std::max_element(output_ref.cbegin(), output_ref.cend());
      const float accumulated_range = accumulated_max - accumulated_min;
      float output_min = accumulated_min + accumulated_range / 255.0f * float(qmin());
      float output_max = accumulated_max - accumulated_range / 255.0f * float(255 - qmax());
      output_min = fp16_ieee_to_fp32_value(fp16_ieee_from_fp32_value(output_min));
      output_max = fp16_ieee_to_fp32_value(fp16_ieee_from_fp32_value(output_max));
      if (accumulated_range == 0.0f) {
        output_min = -std::numeric_limits<float>::infinity();
        output_max = +std::numeric_limits<float>::infinity();
      }
      if (qmin() == std::numeric_limits<uint8_t>::min()) {
        output_min = -std::numeric_limits<float>::infinity();
      }
      if (qmax() == std::numeric_limits<uint8_t>::max()) {
        output_max = +std::numeric_limits<float>::infinity();
      }

      // Clamp reference results.
      for (float& value : output_ref) {
        value = std::max(std::min(value, output_max), output_min);
      }

      // Create, setup, run, and destroy Max Pooling operator.
      ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
      xnn_operator_t max_pooling_op = nullptr;

      const xnn_status status = xnn_create_max_pooling2d_nhwc_f16(
        padding_tf_same() ? 0 : padding_top(), padding_tf_same() ? 0 : padding_right(),
        padding_tf_same() ? 0 : padding_bottom(), padding_tf_same() ? 0 : padding_left(),
        pooling_height(), pooling_width(),
        stride_height(), stride_width(),
        dilation_height(), dilation_width(),
        channels(), input_pixel_stride(), output_pixel_stride(),
        output_min, output_max,
        padding_tf_same() ? XNN_FLAG_TENSORFLOW_SAME_PADDING : 0,
        &max_pooling_op);
      if (status == xnn_status_unsupported_hardware) {
        GTEST_SKIP();
      }
      ASSERT_EQ(xnn_status_success, status);
      ASSERT_NE(nullptr, max_pooling_op);

      // Smart pointer to automatically delete max_pooling_op.
      std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_max_pooling_op(max_pooling_op, xnn_delete_operator);

      ASSERT_EQ(xnn_status_success,
        xnn_setup_max_pooling2d_nhwc_f16(
          max_pooling_op,
          batch_size(), input_height(), input_width(),
          input.data(), output.data(),
          nullptr /* thread pool */));

      ASSERT_EQ(xnn_status_success,
        xnn_run_operator(max_pooling_op, nullptr /* thread pool */));

      // Verify results.
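      // Comparison is exact: max pooling only selects existing input values,
      // and the generated inputs avoid FP16 denormals (see the note above), so
      // the operator and the reference should agree bit-for-bit.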
      for (size_t i = 0; i < batch_size(); i++) {
        for (size_t y = 0; y < output_height(); y++) {
          for (size_t x = 0; x < output_width(); x++) {
            for (size_t c = 0; c < channels(); c++) {
              ASSERT_LE(fp16_ieee_to_fp32_value(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + c]), output_max);
              ASSERT_GE(fp16_ieee_to_fp32_value(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + c]), output_min);
              ASSERT_EQ(
                  fp16_ieee_to_fp32_value(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + c]),
                  output_ref[((i * output_height() + y) * output_width() + x) * channels() + c]) <<
                "in batch index " << i << ", pixel (" << y << ", " << x << "), channel " << c
                << ", min = " << output_min << ", max = " << output_max;
            }
          }
        }
      }
    }
  }

  void TestF32() const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    std::uniform_real_distribution<float> f32dist;

    std::vector<float> input((batch_size() * input_height() * input_width() - 1) * input_pixel_stride() + channels() + XNN_EXTRA_BYTES / sizeof(float));
    std::vector<float> output((batch_size() * output_height() * output_width() - 1) * output_pixel_stride() + channels() + XNN_EXTRA_BYTES / sizeof(float));
    std::vector<float> output_ref(batch_size() * output_height() * output_width() * channels());
    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); });
      std::fill(output.begin(), output.end(), std::nanf(""));

      // Compute reference results, without clamping.
      for (size_t i = 0; i < batch_size(); i++) {
        for (size_t oy = 0; oy < output_height(); oy++) {
          for (size_t ox = 0; ox < output_width(); ox++) {
            for (size_t c = 0; c < channels(); c++) {
              float max_value = -std::numeric_limits<float>::infinity();
              for (size_t py = 0; py < pooling_height(); py++) {
                const size_t iy = oy * stride_height() + py * dilation_height() - padding_top();
                for (size_t px = 0; px < pooling_width(); px++) {
                  const size_t ix = ox * stride_width() + px * dilation_width() - padding_left();
                  if (ix < input_width() && iy < input_height()) {
                    max_value = std::max(max_value,
                      input[((i * input_height() + iy) * input_width() + ix) * input_pixel_stride() + c]);
                  }
                }
              }
              output_ref[((i * output_height() + oy) * output_width() + ox) * channels() + c] = max_value;
            }
          }
        }
      }

      // Compute clamping parameters.
      const float accumulated_min = *std::min_element(output_ref.cbegin(), output_ref.cend());
      const float accumulated_max = *std::max_element(output_ref.cbegin(), output_ref.cend());
      const float accumulated_range = accumulated_max - accumulated_min;
      const float output_min = accumulated_range == 0.0f ?
        -std::numeric_limits<float>::infinity() :
        accumulated_min + accumulated_range / 255.0f * float(qmin());
      const float output_max = accumulated_range == 0.0f ?
        +std::numeric_limits<float>::infinity() :
        accumulated_max - accumulated_range / 255.0f * float(255 - qmax());

      // Clamp reference results.
      for (float& value : output_ref) {
        value = std::max(std::min(value, output_max), output_min);
      }

      // Create, setup, run, and destroy Max Pooling operator.
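      // xnn_initialize must precede any other XNNPACK call; calling it once
      // per iteration is assumed harmless here, as re-initialization is
      // expected to be a no-op that returns xnn_status_success.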
      ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
      xnn_operator_t max_pooling_op = nullptr;

      ASSERT_EQ(xnn_status_success,
        xnn_create_max_pooling2d_nhwc_f32(
          padding_tf_same() ? 0 : padding_top(), padding_tf_same() ? 0 : padding_right(),
          padding_tf_same() ? 0 : padding_bottom(), padding_tf_same() ? 0 : padding_left(),
          pooling_height(), pooling_width(),
          stride_height(), stride_width(),
          dilation_height(), dilation_width(),
          channels(), input_pixel_stride(), output_pixel_stride(),
          output_min, output_max,
          padding_tf_same() ? XNN_FLAG_TENSORFLOW_SAME_PADDING : 0,
          &max_pooling_op));
      ASSERT_NE(nullptr, max_pooling_op);

      // Smart pointer to automatically delete max_pooling_op.
      std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_max_pooling_op(max_pooling_op, xnn_delete_operator);

      ASSERT_EQ(xnn_status_success,
        xnn_setup_max_pooling2d_nhwc_f32(
          max_pooling_op,
          batch_size(), input_height(), input_width(),
          input.data(), output.data(),
          nullptr /* thread pool */));

      ASSERT_EQ(xnn_status_success,
        xnn_run_operator(max_pooling_op, nullptr /* thread pool */));

      // Verify results.
      for (size_t i = 0; i < batch_size(); i++) {
        for (size_t y = 0; y < output_height(); y++) {
          for (size_t x = 0; x < output_width(); x++) {
            for (size_t c = 0; c < channels(); c++) {
              ASSERT_LE(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + c], output_max);
              ASSERT_GE(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + c], output_min);
              ASSERT_EQ(output_ref[((i * output_height() + y) * output_width() + x) * channels() + c],
                  output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + c]) <<
                "in batch index " << i << ", pixel (" << y << ", " << x << "), channel " << c
                << ", min = " << output_min << ", max = " << output_max;
            }
          }
        }
      }
    }
  }

  void TestSetupS8() const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    std::uniform_int_distribution<int32_t> i8dist(
      std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max());

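    // Buffers are sized for the larger of the two runs, since the second
    // setup may use different batch and input dimensions.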
    std::vector<int8_t> input(XNN_EXTRA_BYTES / sizeof(int8_t) + std::max<size_t>(
      (batch_size() * input_height() * input_width() - 1) * input_pixel_stride() + channels(),
      (next_batch_size() * next_input_height() * next_input_width() - 1) * input_pixel_stride() + channels()));
    std::vector<int8_t> output(XNN_EXTRA_BYTES / sizeof(int8_t) + std::max<size_t>(
      (batch_size() * output_height() * output_width() - 1) * output_pixel_stride() + channels(),
      (next_batch_size() * next_output_height() * next_output_width() - 1) * output_pixel_stride() + channels()));
    std::vector<float> output_ref(batch_size() * output_height() * output_width() * channels());
    std::vector<float> next_output_ref(next_batch_size() * next_output_height() * next_output_width() * channels());
    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      std::generate(input.begin(), input.end(), [&]() { return i8dist(rng); });
      std::fill(output.begin(), output.end(), INT8_C(0xA5));

      // Compute reference results.
      for (size_t i = 0; i < batch_size(); i++) {
        for (size_t oy = 0; oy < output_height(); oy++) {
          for (size_t ox = 0; ox < output_width(); ox++) {
            for (size_t c = 0; c < channels(); c++) {
              int8_t max_value = std::numeric_limits<int8_t>::min();
              for (size_t py = 0; py < pooling_height(); py++) {
                const size_t iy = oy * stride_height() + py * dilation_height() - padding_top();
                for (size_t px = 0; px < pooling_width(); px++) {
                  const size_t ix = ox * stride_width() + px * dilation_width() - padding_left();
                  if (ix < input_width() && iy < input_height()) {
                    max_value = std::max(max_value,
                      input[((i * input_height() + iy) * input_width() + ix) * input_pixel_stride() + c]);
                  }
                }
              }
              max_value = std::min(max_value, int8_t(qmax() - 0x80));
              max_value = std::max(max_value, int8_t(qmin() - 0x80));
              output_ref[((i * output_height() + oy) * output_width() + ox) * channels() + c] = max_value;
            }
          }
        }
      }

      // Create, setup, and run Max Pooling operator once.
      ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
      xnn_operator_t max_pooling_op = nullptr;

      ASSERT_EQ(xnn_status_success,
        xnn_create_max_pooling2d_nhwc_s8(
          padding_top(), padding_right(), padding_bottom(), padding_left(),
          pooling_height(), pooling_width(),
          stride_height(), stride_width(),
          dilation_height(), dilation_width(),
          channels(), input_pixel_stride(), output_pixel_stride(),
          int8_t(qmin() - 0x80), int8_t(qmax() - 0x80),
          0, &max_pooling_op));
      ASSERT_NE(nullptr, max_pooling_op);

      // Smart pointer to automatically delete max_pooling_op.
      std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_max_pooling_op(max_pooling_op, xnn_delete_operator);

      ASSERT_EQ(xnn_status_success,
        xnn_setup_max_pooling2d_nhwc_s8(
          max_pooling_op,
          batch_size(), input_height(), input_width(),
          input.data(), output.data(),
          nullptr /* thread pool */));

      ASSERT_EQ(xnn_status_success,
        xnn_run_operator(max_pooling_op, nullptr /* thread pool */));

      // Verify results of the first run.
      for (size_t i = 0; i < batch_size(); i++) {
        for (size_t y = 0; y < output_height(); y++) {
          for (size_t x = 0; x < output_width(); x++) {
            for (size_t c = 0; c < channels(); c++) {
              ASSERT_LE(int32_t(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + c]), int32_t(qmax() - 0x80));
              ASSERT_GE(int32_t(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + c]), int32_t(qmin() - 0x80));
              ASSERT_EQ(int32_t(output_ref[((i * output_height() + y) * output_width() + x) * channels() + c]),
                  int32_t(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + c])) <<
                "in batch index " << i << ", pixel (" << y << ", " << x << "), channel " << c;
            }
          }
        }
      }

      // Re-generate data for the second run.
      std::generate(input.begin(), input.end(), [&]() { return i8dist(rng); });
      std::fill(output.begin(), output.end(), INT8_C(0xA5));

      // Compute reference results for the second run.
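      // The second-run reference uses the next_* shapes, checking that setup
      // re-derives the pooling geometry when the operator is resized.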
      for (size_t i = 0; i < next_batch_size(); i++) {
        for (size_t oy = 0; oy < next_output_height(); oy++) {
          for (size_t ox = 0; ox < next_output_width(); ox++) {
            for (size_t c = 0; c < channels(); c++) {
              int8_t max_value = std::numeric_limits<int8_t>::min();
              for (size_t py = 0; py < pooling_height(); py++) {
                const size_t iy = oy * stride_height() + py * dilation_height() - padding_top();
                for (size_t px = 0; px < pooling_width(); px++) {
                  const size_t ix = ox * stride_width() + px * dilation_width() - padding_left();
                  if (ix < next_input_width() && iy < next_input_height()) {
                    max_value = std::max(max_value,
                      input[((i * next_input_height() + iy) * next_input_width() + ix) * input_pixel_stride() + c]);
                  }
                }
              }
              max_value = std::min(max_value, int8_t(qmax() - 0x80));
              max_value = std::max(max_value, int8_t(qmin() - 0x80));
              next_output_ref[((i * next_output_height() + oy) * next_output_width() + ox) * channels() + c] = max_value;
            }
          }
        }
      }

      // Setup and run Max Pooling operator the second time, and destroy the operator.
      ASSERT_EQ(xnn_status_success,
        xnn_setup_max_pooling2d_nhwc_s8(
          max_pooling_op,
          next_batch_size(), next_input_height(), next_input_width(),
          input.data(), output.data(),
          nullptr /* thread pool */));

      ASSERT_EQ(xnn_status_success,
        xnn_run_operator(max_pooling_op, nullptr /* thread pool */));

      // Verify results of the second run.
      for (size_t i = 0; i < next_batch_size(); i++) {
        for (size_t y = 0; y < next_output_height(); y++) {
          for (size_t x = 0; x < next_output_width(); x++) {
            for (size_t c = 0; c < channels(); c++) {
              ASSERT_LE(int32_t(output[((i * next_output_height() + y) * next_output_width() + x) * output_pixel_stride() + c]), int32_t(qmax() - 0x80));
              ASSERT_GE(int32_t(output[((i * next_output_height() + y) * next_output_width() + x) * output_pixel_stride() + c]), int32_t(qmin() - 0x80));
              ASSERT_EQ(int32_t(next_output_ref[((i * next_output_height() + y) * next_output_width() + x) * channels() + c]),
                  int32_t(output[((i * next_output_height() + y) * next_output_width() + x) * output_pixel_stride() + c])) <<
                "in batch index " << i << ", pixel (" << y << ", " << x << "), channel " << c;
            }
          }
        }
      }
    }
  }

  void TestSetupU8() const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    std::uniform_int_distribution<int32_t> u8dist(
      std::numeric_limits<uint8_t>::min(), std::numeric_limits<uint8_t>::max());

    std::vector<uint8_t> input(XNN_EXTRA_BYTES / sizeof(uint8_t) + std::max<size_t>(
      (batch_size() * input_height() * input_width() - 1) * input_pixel_stride() + channels(),
      (next_batch_size() * next_input_height() * next_input_width() - 1) * input_pixel_stride() + channels()));
    std::vector<uint8_t> output(XNN_EXTRA_BYTES / sizeof(uint8_t) + std::max<size_t>(
      (batch_size() * output_height() * output_width() - 1) * output_pixel_stride() + channels(),
      (next_batch_size() * next_output_height() * next_output_width() - 1) * output_pixel_stride() + channels()));
    std::vector<float> output_ref(batch_size() * output_height() * output_width() * channels());
    std::vector<float> next_output_ref(next_batch_size() * next_output_height() * next_output_width() * channels());
    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      std::generate(input.begin(), input.end(), [&]() { return u8dist(rng); });
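      // Outputs are pre-filled with a canary value so that any element the
      // operator fails to write would be caught during verification.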
      std::fill(output.begin(), output.end(), 0xA5);

      // Compute reference results.
      for (size_t i = 0; i < batch_size(); i++) {
        for (size_t oy = 0; oy < output_height(); oy++) {
          for (size_t ox = 0; ox < output_width(); ox++) {
            for (size_t c = 0; c < channels(); c++) {
              uint8_t max_value = 0;
              for (size_t py = 0; py < pooling_height(); py++) {
                const size_t iy = oy * stride_height() + py * dilation_height() - padding_top();
                for (size_t px = 0; px < pooling_width(); px++) {
                  const size_t ix = ox * stride_width() + px * dilation_width() - padding_left();
                  if (ix < input_width() && iy < input_height()) {
                    max_value = std::max(max_value,
                      input[((i * input_height() + iy) * input_width() + ix) * input_pixel_stride() + c]);
                  }
                }
              }
              max_value = std::min(max_value, qmax());
              max_value = std::max(max_value, qmin());
              output_ref[((i * output_height() + oy) * output_width() + ox) * channels() + c] = max_value;
            }
          }
        }
      }

      // Create, setup, and run Max Pooling operator once.
      ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
      xnn_operator_t max_pooling_op = nullptr;

      ASSERT_EQ(xnn_status_success,
        xnn_create_max_pooling2d_nhwc_u8(
          padding_top(), padding_right(), padding_bottom(), padding_left(),
          pooling_height(), pooling_width(),
          stride_height(), stride_width(),
          dilation_height(), dilation_width(),
          channels(), input_pixel_stride(), output_pixel_stride(),
          qmin(), qmax(),
          0, &max_pooling_op));
      ASSERT_NE(nullptr, max_pooling_op);

      // Smart pointer to automatically delete max_pooling_op.
      std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_max_pooling_op(max_pooling_op, xnn_delete_operator);

      ASSERT_EQ(xnn_status_success,
        xnn_setup_max_pooling2d_nhwc_u8(
          max_pooling_op,
          batch_size(), input_height(), input_width(),
          input.data(), output.data(),
          nullptr /* thread pool */));

      ASSERT_EQ(xnn_status_success,
        xnn_run_operator(max_pooling_op, nullptr /* thread pool */));

      // Verify results of the first run.
      for (size_t i = 0; i < batch_size(); i++) {
        for (size_t y = 0; y < output_height(); y++) {
          for (size_t x = 0; x < output_width(); x++) {
            for (size_t c = 0; c < channels(); c++) {
              ASSERT_LE(uint32_t(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + c]), uint32_t(qmax()));
              ASSERT_GE(uint32_t(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + c]), uint32_t(qmin()));
              ASSERT_EQ(uint32_t(output_ref[((i * output_height() + y) * output_width() + x) * channels() + c]),
                  uint32_t(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + c])) <<
                "in batch index " << i << ", pixel (" << y << ", " << x << "), channel " << c;
            }
          }
        }
      }

      // Re-generate data for the second run.
      std::generate(input.begin(), input.end(), [&]() { return u8dist(rng); });
      std::fill(output.begin(), output.end(), 0xA5);

      // Compute reference results for the second run.
      for (size_t i = 0; i < next_batch_size(); i++) {
        for (size_t oy = 0; oy < next_output_height(); oy++) {
          for (size_t ox = 0; ox < next_output_width(); ox++) {
            for (size_t c = 0; c < channels(); c++) {
              uint8_t max_value = 0;
              for (size_t py = 0; py < pooling_height(); py++) {
                const size_t iy = oy * stride_height() + py * dilation_height() - padding_top();
                for (size_t px = 0; px < pooling_width(); px++) {
                  const size_t ix = ox * stride_width() + px * dilation_width() - padding_left();
                  if (ix < next_input_width() && iy < next_input_height()) {
                    max_value = std::max(max_value,
                      input[((i * next_input_height() + iy) * next_input_width() + ix) * input_pixel_stride() + c]);
                  }
                }
              }
              max_value = std::min(max_value, qmax());
              max_value = std::max(max_value, qmin());
              next_output_ref[((i * next_output_height() + oy) * next_output_width() + ox) * channels() + c] = max_value;
            }
          }
        }
      }

      // Setup and run Max Pooling operator the second time, and destroy the operator.
      ASSERT_EQ(xnn_status_success,
        xnn_setup_max_pooling2d_nhwc_u8(
          max_pooling_op,
          next_batch_size(), next_input_height(), next_input_width(),
          input.data(), output.data(),
          nullptr /* thread pool */));

      ASSERT_EQ(xnn_status_success,
        xnn_run_operator(max_pooling_op, nullptr /* thread pool */));

      // Verify results of the second run.
      for (size_t i = 0; i < next_batch_size(); i++) {
        for (size_t y = 0; y < next_output_height(); y++) {
          for (size_t x = 0; x < next_output_width(); x++) {
            for (size_t c = 0; c < channels(); c++) {
              ASSERT_LE(uint32_t(output[((i * next_output_height() + y) * next_output_width() + x) * output_pixel_stride() + c]), uint32_t(qmax()));
              ASSERT_GE(uint32_t(output[((i * next_output_height() + y) * next_output_width() + x) * output_pixel_stride() + c]), uint32_t(qmin()));
              ASSERT_EQ(uint32_t(next_output_ref[((i * next_output_height() + y) * next_output_width() + x) * channels() + c]),
                  uint32_t(output[((i * next_output_height() + y) * next_output_width() + x) * output_pixel_stride() + c])) <<
                "in batch index " << i << ", pixel (" << y << ", " << x << "), channel " << c;
            }
          }
        }
      }
    }
  }

  void TestSetupF16() const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    // Note: we need to avoid FP16 denormals in the generated tensor because they might be processed differently in
    // native vs emulated arithmetic, and we use exact comparison to verify the results against reference.
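    // Sampling from [0.001, 1) keeps every value above the smallest positive
    // normal FP16 number (2^-14, about 6.1e-5), so no denormals are generated.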
    std::uniform_real_distribution<float> f32dist(0.001f, 1.0f);

    std::vector<uint16_t> input(XNN_EXTRA_BYTES / sizeof(uint16_t) + std::max<size_t>(
      (batch_size() * input_height() * input_width() - 1) * input_pixel_stride() + channels(),
      (next_batch_size() * next_input_height() * next_input_width() - 1) * input_pixel_stride() + channels()));
    std::vector<uint16_t> output(XNN_EXTRA_BYTES / sizeof(uint16_t) + std::max<size_t>(
      (batch_size() * output_height() * output_width() - 1) * output_pixel_stride() + channels(),
      (next_batch_size() * next_output_height() * next_output_width() - 1) * output_pixel_stride() + channels()));
    std::vector<float> output_ref(batch_size() * output_height() * output_width() * channels());
    std::vector<float> next_output_ref(next_batch_size() * next_output_height() * next_output_width() * channels());
    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      std::generate(input.begin(), input.end(), [&]() { return fp16_ieee_from_fp32_value(f32dist(rng)); });
      std::fill(output.begin(), output.end(), UINT16_C(0x7E00) /* NaN */);

      // Compute reference results, without clamping.
      for (size_t i = 0; i < batch_size(); i++) {
        for (size_t oy = 0; oy < output_height(); oy++) {
          for (size_t ox = 0; ox < output_width(); ox++) {
            for (size_t c = 0; c < channels(); c++) {
              float max_value = -std::numeric_limits<float>::infinity();
              for (size_t py = 0; py < pooling_height(); py++) {
                const size_t iy = oy * stride_height() + py * dilation_height() - padding_top();
                for (size_t px = 0; px < pooling_width(); px++) {
                  const size_t ix = ox * stride_width() + px * dilation_width() - padding_left();
                  if (ix < input_width() && iy < input_height()) {
                    max_value = std::max(max_value,
                      fp16_ieee_to_fp32_value(input[((i * input_height() + iy) * input_width() + ix) * input_pixel_stride() + c]));
                  }
                }
              }
              output_ref[((i * output_height() + oy) * output_width() + ox) * channels() + c] = max_value;
            }
          }
        }
      }

      // Compute clamping parameters.
      const float accumulated_min = *std::min_element(output_ref.cbegin(), output_ref.cend());
      const float accumulated_max = *std::max_element(output_ref.cbegin(), output_ref.cend());
      const float accumulated_range = accumulated_max - accumulated_min;
      float output_min = accumulated_min + accumulated_range / 255.0f * float(qmin());
      float output_max = accumulated_max - accumulated_range / 255.0f * float(255 - qmax());
      output_min = fp16_ieee_to_fp32_value(fp16_ieee_from_fp32_value(output_min));
      output_max = fp16_ieee_to_fp32_value(fp16_ieee_from_fp32_value(output_max));
      if (accumulated_range == 0.0f) {
        output_min = -std::numeric_limits<float>::infinity();
        output_max = +std::numeric_limits<float>::infinity();
      }
      if (qmin() == std::numeric_limits<uint8_t>::min()) {
        output_min = -std::numeric_limits<float>::infinity();
      }
      if (qmax() == std::numeric_limits<uint8_t>::max()) {
        output_max = +std::numeric_limits<float>::infinity();
      }

      // Clamp reference results.
      for (float& value : output_ref) {
        value = std::max(std::min(value, output_max), output_min);
      }

      // Create, setup, and run Max Pooling operator once.
      ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
      xnn_operator_t max_pooling_op = nullptr;

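      // F16 support is optional: creation reports xnn_status_unsupported_hardware
      // when the build or CPU lacks FP16 support, in which case the test is skipped.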
      const xnn_status status = xnn_create_max_pooling2d_nhwc_f16(
        padding_top(), padding_right(), padding_bottom(), padding_left(),
        pooling_height(), pooling_width(),
        stride_height(), stride_width(),
        dilation_height(), dilation_width(),
        channels(), input_pixel_stride(), output_pixel_stride(),
        output_min, output_max,
        0, &max_pooling_op);
      if (status == xnn_status_unsupported_hardware) {
        GTEST_SKIP();
      }
      ASSERT_EQ(xnn_status_success, status);
      ASSERT_NE(nullptr, max_pooling_op);

      // Smart pointer to automatically delete max_pooling_op.
      std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_max_pooling_op(max_pooling_op, xnn_delete_operator);

      ASSERT_EQ(xnn_status_success,
        xnn_setup_max_pooling2d_nhwc_f16(
          max_pooling_op,
          batch_size(), input_height(), input_width(),
          input.data(), output.data(),
          nullptr /* thread pool */));

      ASSERT_EQ(xnn_status_success,
        xnn_run_operator(max_pooling_op, nullptr /* thread pool */));

      // Verify results of the first run.
      for (size_t i = 0; i < batch_size(); i++) {
        for (size_t y = 0; y < output_height(); y++) {
          for (size_t x = 0; x < output_width(); x++) {
            for (size_t c = 0; c < channels(); c++) {
              ASSERT_LE(fp16_ieee_to_fp32_value(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + c]), output_max);
              ASSERT_GE(fp16_ieee_to_fp32_value(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + c]), output_min);
              ASSERT_EQ(
                  fp16_ieee_to_fp32_value(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + c]),
                  output_ref[((i * output_height() + y) * output_width() + x) * channels() + c]) <<
                "in batch index " << i << ", pixel (" << y << ", " << x << "), channel " << c
                << ", min = " << output_min << ", max = " << output_max;
            }
          }
        }
      }

      // Re-generate data for the second run.
      std::generate(input.begin(), input.end(), [&]() { return fp16_ieee_from_fp32_value(f32dist(rng)); });
      std::fill(output.begin(), output.end(), UINT16_C(0x7E00) /* NaN */);

      // Compute reference results for the second run, including clamping.
      for (size_t i = 0; i < next_batch_size(); i++) {
        for (size_t oy = 0; oy < next_output_height(); oy++) {
          for (size_t ox = 0; ox < next_output_width(); ox++) {
            for (size_t c = 0; c < channels(); c++) {
              float max_value = -std::numeric_limits<float>::infinity();
              for (size_t py = 0; py < pooling_height(); py++) {
                const size_t iy = oy * stride_height() + py * dilation_height() - padding_top();
                for (size_t px = 0; px < pooling_width(); px++) {
                  const size_t ix = ox * stride_width() + px * dilation_width() - padding_left();
                  if (ix < next_input_width() && iy < next_input_height()) {
                    max_value = std::max(max_value,
                      fp16_ieee_to_fp32_value(input[((i * next_input_height() + iy) * next_input_width() + ix) * input_pixel_stride() + c]));
                  }
                }
              }
              max_value = std::min(max_value, output_max);
              max_value = std::max(max_value, output_min);
              next_output_ref[((i * next_output_height() + oy) * next_output_width() + ox) * channels() + c] = max_value;
            }
          }
        }
      }

      // Setup and run Max Pooling operator the second time, and destroy the operator.
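      // Only setup is repeated: kernel, strides, padding, and clamp bounds were
      // fixed at creation, so the second setup merely rebinds shapes and buffers.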
      ASSERT_EQ(xnn_status_success,
        xnn_setup_max_pooling2d_nhwc_f16(
          max_pooling_op,
          next_batch_size(), next_input_height(), next_input_width(),
          input.data(), output.data(),
          nullptr /* thread pool */));

      ASSERT_EQ(xnn_status_success,
        xnn_run_operator(max_pooling_op, nullptr /* thread pool */));

      // Verify results of the second run.
      for (size_t i = 0; i < next_batch_size(); i++) {
        for (size_t y = 0; y < next_output_height(); y++) {
          for (size_t x = 0; x < next_output_width(); x++) {
            for (size_t c = 0; c < channels(); c++) {
              ASSERT_LE(fp16_ieee_to_fp32_value(output[((i * next_output_height() + y) * next_output_width() + x) * output_pixel_stride() + c]), output_max);
              ASSERT_GE(fp16_ieee_to_fp32_value(output[((i * next_output_height() + y) * next_output_width() + x) * output_pixel_stride() + c]), output_min);
              ASSERT_EQ(
                  fp16_ieee_to_fp32_value(output[((i * next_output_height() + y) * next_output_width() + x) * output_pixel_stride() + c]),
                  next_output_ref[((i * next_output_height() + y) * next_output_width() + x) * channels() + c]) <<
                "in batch index " << i << ", pixel (" << y << ", " << x << "), channel " << c
                << ", min = " << output_min << ", max = " << output_max;
            }
          }
        }
      }
    }
  }

  void TestSetupF32() const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    std::uniform_real_distribution<float> f32dist;

    std::vector<float> input(XNN_EXTRA_BYTES / sizeof(float) + std::max<size_t>(
      (batch_size() * input_height() * input_width() - 1) * input_pixel_stride() + channels(),
      (next_batch_size() * next_input_height() * next_input_width() - 1) * input_pixel_stride() + channels()));
    std::vector<float> output(XNN_EXTRA_BYTES / sizeof(float) + std::max<size_t>(
      (batch_size() * output_height() * output_width() - 1) * output_pixel_stride() + channels(),
      (next_batch_size() * next_output_height() * next_output_width() - 1) * output_pixel_stride() + channels()));
    std::vector<float> output_ref(batch_size() * output_height() * output_width() * channels());
    std::vector<float> next_output_ref(next_batch_size() * next_output_height() * next_output_width() * channels());
    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); });
      std::fill(output.begin(), output.end(), std::nanf(""));

      // Compute reference results, without clamping.
      for (size_t i = 0; i < batch_size(); i++) {
        for (size_t oy = 0; oy < output_height(); oy++) {
          for (size_t ox = 0; ox < output_width(); ox++) {
            for (size_t c = 0; c < channels(); c++) {
              float max_value = -std::numeric_limits<float>::infinity();
              for (size_t py = 0; py < pooling_height(); py++) {
                const size_t iy = oy * stride_height() + py * dilation_height() - padding_top();
                for (size_t px = 0; px < pooling_width(); px++) {
                  const size_t ix = ox * stride_width() + px * dilation_width() - padding_left();
                  if (ix < input_width() && iy < input_height()) {
                    max_value = std::max(max_value,
                      input[((i * input_height() + iy) * input_width() + ix) * input_pixel_stride() + c]);
                  }
                }
              }
              output_ref[((i * output_height() + oy) * output_width() + ox) * channels() + c] = max_value;
            }
          }
        }
      }

      // Compute clamping parameters.
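      // Map qmin()/qmax() onto the observed output range in 1/255 steps; a
      // degenerate (constant) range disables clamping on both sides.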
      const float accumulated_min = *std::min_element(output_ref.cbegin(), output_ref.cend());
      const float accumulated_max = *std::max_element(output_ref.cbegin(), output_ref.cend());
      const float accumulated_range = accumulated_max - accumulated_min;
      const float output_min = accumulated_range == 0.0f ?
        -std::numeric_limits<float>::infinity() :
        accumulated_min + accumulated_range / 255.0f * float(qmin());
      const float output_max = accumulated_range == 0.0f ?
        +std::numeric_limits<float>::infinity() :
        accumulated_max - accumulated_range / 255.0f * float(255 - qmax());

      // Clamp reference results.
      for (float& value : output_ref) {
        value = std::max(std::min(value, output_max), output_min);
      }

      // Create, setup, and run Max Pooling operator once.
      ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
      xnn_operator_t max_pooling_op = nullptr;

      ASSERT_EQ(xnn_status_success,
        xnn_create_max_pooling2d_nhwc_f32(
          padding_top(), padding_right(), padding_bottom(), padding_left(),
          pooling_height(), pooling_width(),
          stride_height(), stride_width(),
          dilation_height(), dilation_width(),
          channels(), input_pixel_stride(), output_pixel_stride(),
          output_min, output_max,
          0, &max_pooling_op));
      ASSERT_NE(nullptr, max_pooling_op);

      // Smart pointer to automatically delete max_pooling_op.
      std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_max_pooling_op(max_pooling_op, xnn_delete_operator);

      ASSERT_EQ(xnn_status_success,
        xnn_setup_max_pooling2d_nhwc_f32(
          max_pooling_op,
          batch_size(), input_height(), input_width(),
          input.data(), output.data(),
          nullptr /* thread pool */));

      ASSERT_EQ(xnn_status_success,
        xnn_run_operator(max_pooling_op, nullptr /* thread pool */));

      // Verify results of the first run.
      for (size_t i = 0; i < batch_size(); i++) {
        for (size_t y = 0; y < output_height(); y++) {
          for (size_t x = 0; x < output_width(); x++) {
            for (size_t c = 0; c < channels(); c++) {
              ASSERT_LE(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + c], output_max);
              ASSERT_GE(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + c], output_min);
              ASSERT_EQ(output_ref[((i * output_height() + y) * output_width() + x) * channels() + c],
                  output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + c]) <<
                "in batch index " << i << ", pixel (" << y << ", " << x << "), channel " << c;
            }
          }
        }
      }

      // Re-generate data for the second run.
      std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); });
      std::fill(output.begin(), output.end(), std::nanf(""));

      // Compute reference results for the second run, including clamping.
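      // The clamp bounds were fixed at creation from the first run's output
      // range, so the second run's reference reuses the same bounds.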
      for (size_t i = 0; i < next_batch_size(); i++) {
        for (size_t oy = 0; oy < next_output_height(); oy++) {
          for (size_t ox = 0; ox < next_output_width(); ox++) {
            for (size_t c = 0; c < channels(); c++) {
              float max_value = -std::numeric_limits<float>::infinity();
              for (size_t py = 0; py < pooling_height(); py++) {
                const size_t iy = oy * stride_height() + py * dilation_height() - padding_top();
                for (size_t px = 0; px < pooling_width(); px++) {
                  const size_t ix = ox * stride_width() + px * dilation_width() - padding_left();
                  if (ix < next_input_width() && iy < next_input_height()) {
                    max_value = std::max(max_value,
                      input[((i * next_input_height() + iy) * next_input_width() + ix) * input_pixel_stride() + c]);
                  }
                }
              }
              max_value = std::min(max_value, output_max);
              max_value = std::max(max_value, output_min);
              next_output_ref[((i * next_output_height() + oy) * next_output_width() + ox) * channels() + c] = max_value;
            }
          }
        }
      }

      // Setup and run Max Pooling operator the second time, and destroy the operator.
      ASSERT_EQ(xnn_status_success,
        xnn_setup_max_pooling2d_nhwc_f32(
          max_pooling_op,
          next_batch_size(), next_input_height(), next_input_width(),
          input.data(), output.data(),
          nullptr /* thread pool */));

      ASSERT_EQ(xnn_status_success,
        xnn_run_operator(max_pooling_op, nullptr /* thread pool */));

      // Verify results of the second run.
      for (size_t i = 0; i < next_batch_size(); i++) {
        for (size_t y = 0; y < next_output_height(); y++) {
          for (size_t x = 0; x < next_output_width(); x++) {
            for (size_t c = 0; c < channels(); c++) {
              ASSERT_LE(output[((i * next_output_height() + y) * next_output_width() + x) * output_pixel_stride() + c], output_max);
              ASSERT_GE(output[((i * next_output_height() + y) * next_output_width() + x) * output_pixel_stride() + c], output_min);
              ASSERT_EQ(next_output_ref[((i * next_output_height() + y) * next_output_width() + x) * channels() + c],
                  output[((i * next_output_height() + y) * next_output_width() + x) * output_pixel_stride() + c]) <<
                "in batch index " << i << ", pixel (" << y << ", " << x << "), channel " << c;
            }
          }
        }
      }
    }
  }

 private:
  uint32_t padding_top_{0};
  uint32_t padding_right_{0};
  uint32_t padding_bottom_{0};
  uint32_t padding_left_{0};
  bool padding_tf_same_{false};
  size_t input_height_{1};
  size_t input_width_{1};
  size_t channels_{1};
  size_t batch_size_{1};
  size_t input_pixel_stride_{0};
  size_t output_pixel_stride_{0};
  uint32_t pooling_height_{1};
  uint32_t pooling_width_{1};
  uint32_t stride_height_{1};
  uint32_t stride_width_{1};
  uint32_t dilation_height_{1};
  uint32_t dilation_width_{1};
  size_t next_input_height_{0};
  size_t next_input_width_{0};
  size_t next_batch_size_{0};
  uint8_t qmin_{0};
  uint8_t qmax_{255};
  size_t iterations_{1};
};