// Copyright (c) Facebook, Inc. and its affiliates.
// All rights reserved.
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#pragma once

#include <gtest/gtest.h>

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstddef>
#include <cstdlib>
#include <limits>
#include <random>
#include <vector>

#include <fp16.h>

#include <xnnpack.h>
#include <xnnpack/aligned-allocator.h>
#include <xnnpack/microfnptr.h>
#include <xnnpack/microparams-init.h>
#include <xnnpack/requantization.h>


class AvgPoolMicrokernelTester {
 public:
  inline AvgPoolMicrokernelTester& output_pixels(size_t output_pixels) {
    assert(output_pixels != 0);
    this->output_pixels_ = output_pixels;
    return *this;
  }

  inline size_t output_pixels() const {
    return this->output_pixels_;
  }

  inline AvgPoolMicrokernelTester& step(size_t step) {
    assert(step != 0);
    this->step_ = step;
    return *this;
  }

  inline size_t step() const {
    return this->step_;
  }

  inline AvgPoolMicrokernelTester& input_offset(size_t input_offset) {
    assert(input_offset != 0);
    this->input_offset_ = input_offset;
    return *this;
  }

  inline size_t input_offset() const {
    return this->input_offset_;
  }

  inline AvgPoolMicrokernelTester& zero_index(size_t zero_index) {
    this->zero_index_ = zero_index;
    return *this;
  }

  inline size_t zero_index() const {
    return this->zero_index_;
  }

  inline AvgPoolMicrokernelTester& pooling_elements(size_t pooling_elements) {
    assert(pooling_elements != 0);
    this->pooling_elements_ = pooling_elements;
    return *this;
  }

  inline size_t pooling_elements() const {
    return this->pooling_elements_;
  }

  inline size_t packed_pooling_elements() const {
    if (pooling_elements() <= primary_pooling_tile()) {
      return primary_pooling_tile();
    } else {
      return (pooling_elements() - primary_pooling_tile()) % incremental_pooling_tile() == 0 ?
        pooling_elements() :
        ((pooling_elements() - primary_pooling_tile()) / incremental_pooling_tile() + 1) * incremental_pooling_tile() + primary_pooling_tile();
    }
  }

  inline AvgPoolMicrokernelTester& pooling_tile(size_t primary_tile, size_t incremental_tile = 0) {
    assert(primary_tile != 0);
    this->primary_pooling_tile_ = primary_tile;
    this->incremental_pooling_tile_ = incremental_tile;
    return *this;
  }

  inline AvgPoolMicrokernelTester& primary_pooling_tile(size_t primary_pooling_tile) {
    assert(primary_pooling_tile != 0);
    this->primary_pooling_tile_ = primary_pooling_tile;
    return *this;
  }

  inline size_t primary_pooling_tile() const {
    return this->primary_pooling_tile_;
  }

  inline AvgPoolMicrokernelTester& incremental_pooling_tile(size_t incremental_pooling_tile) {
    assert(incremental_pooling_tile != 0);
    this->incremental_pooling_tile_ = incremental_pooling_tile;
    return *this;
  }

  inline size_t incremental_pooling_tile() const {
    return this->incremental_pooling_tile_;
  }

  inline AvgPoolMicrokernelTester& channels(size_t channels) {
    assert(channels != 0);
    this->channels_ = channels;
    return *this;
  }

  inline size_t channels() const {
    return this->channels_;
  }

  inline AvgPoolMicrokernelTester& output_stride(size_t output_stride) {
    assert(output_stride != 0);
    this->output_stride_ = output_stride;
    return *this;
  }

  inline size_t output_stride() const {
    if (this->output_stride_ == 0) {
      return channels();
    } else {
      assert(this->output_stride_ >= channels());
      return this->output_stride_;
    }
  }

  inline AvgPoolMicrokernelTester& input_scale(float input_scale) {
    assert(input_scale > 0.0f);
    assert(std::isnormal(input_scale));
    this->input_scale_ = input_scale;
    return *this;
  }

  inline float input_scale() const {
    return this->input_scale_;
  }

  inline AvgPoolMicrokernelTester& input_zero_point(uint8_t input_zero_point) {
    this->input_zero_point_ = input_zero_point;
    return *this;
  }

  inline uint8_t input_zero_point() const {
    return this->input_zero_point_;
  }

  inline AvgPoolMicrokernelTester& output_scale(float output_scale) {
    assert(output_scale > 0.0f);
    assert(std::isnormal(output_scale));
    this->output_scale_ = output_scale;
    return *this;
  }

  inline float output_scale() const {
    return this->output_scale_;
  }

  inline AvgPoolMicrokernelTester& output_zero_point(uint8_t output_zero_point) {
    this->output_zero_point_ = output_zero_point;
    return *this;
  }

  inline uint8_t output_zero_point() const {
    return this->output_zero_point_;
  }

  inline AvgPoolMicrokernelTester& qmin(uint8_t qmin) {
    this->qmin_ = qmin;
    return *this;
  }

  inline uint8_t qmin() const {
    return this->qmin_;
  }

  inline AvgPoolMicrokernelTester& qmax(uint8_t qmax) {
    this->qmax_ = qmax;
    return *this;
  }

  inline uint8_t qmax() const {
    return this->qmax_;
  }

  inline AvgPoolMicrokernelTester& iterations(size_t iterations) {
    this->iterations_ = iterations;
    return *this;
  }

  inline size_t iterations() const {
    return this->iterations_;
  }

  void Test(xnn_f16_avgpool_minmax_unipass_ukernel_function avgpool_minmax, xnn_init_f16_scaleminmax_params_fn init_params) const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    std::uniform_real_distribution<float> f32dist;

    std::vector<const uint16_t*> indirect_input((output_pixels() - 1) * step() + packed_pooling_elements());
    std::vector<uint16_t> input(XNN_EXTRA_BYTES / sizeof(uint16_t) +
      input_offset() + indirect_input.size() * channels());
    std::vector<uint16_t> zero(channels() + XNN_EXTRA_BYTES / sizeof(uint16_t));
    std::vector<uint16_t> output((output_pixels() - 1) * output_stride() + channels());
    std::vector<float> output_ref(output_pixels() * channels());
    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      std::generate(input.begin(), input.end(), [&]() { return fp16_ieee_from_fp32_value(f32dist(rng)); });
      std::fill(input.begin(), input.begin() + input_offset(), UINT16_C(0x7E00) /* NaN */);
      std::fill(input.end() - XNN_EXTRA_BYTES / sizeof(uint16_t), input.end(), UINT16_C(0x7E00) /* NaN */);
      std::fill(output.begin(), output.end(), UINT16_C(0x7E00) /* NaN */);

      for (size_t i = 0; i < (output_pixels() - 1) * step() + pooling_elements(); i++) {
        indirect_input[i] = input.data() + i * channels();
      }
      std::shuffle(indirect_input.begin(),
        indirect_input.begin() + (output_pixels() - 1) * step() + pooling_elements(), rng);
      if (zero_index() != SIZE_MAX) {
        indirect_input[zero_index()] = zero.data();
      }

      // Compute reference results, without clamping.
      for (size_t x = 0; x < output_pixels(); x++) {
        for (size_t c = 0; c < channels(); c++) {
          float acc = 0.0f;
          for (size_t p = 0; p < pooling_elements(); p++) {
            const uint16_t* row = indirect_input[x * step() + p];
            if (row != zero.data()) {
              acc += fp16_ieee_to_fp32_value(row[c + input_offset()]);
            }
          }
          output_ref[x * channels() + c] = acc / float(pooling_elements());
        }
      }

      // Compute clamping parameters.
      const float accumulated_min = *std::min_element(output_ref.cbegin(), output_ref.cend());
      const float accumulated_max = *std::max_element(output_ref.cbegin(), output_ref.cend());
      const float accumulated_range = accumulated_max - accumulated_min;
      float output_min_as_float = accumulated_min + float(qmin()) / 255.0f * accumulated_range;
      float output_max_as_float = accumulated_max - float(255 - qmax()) / 255.0f * accumulated_range;
      const uint16_t output_min_as_half = fp16_ieee_from_fp32_value(output_min_as_float);
      const uint16_t output_max_as_half = fp16_ieee_from_fp32_value(output_max_as_float);
      output_min_as_float = fp16_ieee_to_fp32_value(output_min_as_half);
      output_max_as_float = fp16_ieee_to_fp32_value(output_max_as_half);

      // Clamp reference results.
      for (float& output_value : output_ref) {
        output_value = std::max(std::min(output_value, output_max_as_float), output_min_as_float);
      }

      // Prepare parameters.
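      // The scale passed to the kernel is the reciprocal of the pooling window
      // size, rounded to half precision, so the kernel computes the mean.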
      xnn_f16_scaleminmax_params params;
      init_params(&params, fp16_ieee_from_fp32_value(1.0f / float(pooling_elements())), output_min_as_half, output_max_as_half);

      // Call optimized micro-kernel.
      avgpool_minmax(output_pixels(), pooling_elements(), channels(),
        reinterpret_cast<const void**>(indirect_input.data()), input_offset() * sizeof(uint16_t), zero.data(),
        output.data(),
        step() * sizeof(void*),
        (output_stride() - channels()) * sizeof(uint16_t),
        &params);

      // Verify results.
      for (size_t x = 0; x < output_pixels(); x++) {
        for (size_t c = 0; c < channels(); c++) {
          ASSERT_GE(fp16_ieee_to_fp32_value(output[x * output_stride() + c]), output_min_as_float)
            << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
            << ", pooling elements = " << pooling_elements() << ", step = " << step()
            << ", input offset = " << input_offset();
          ASSERT_LE(fp16_ieee_to_fp32_value(output[x * output_stride() + c]), output_max_as_float)
            << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
            << ", pooling elements = " << pooling_elements() << ", step = " << step()
            << ", input offset = " << input_offset();
          ASSERT_NEAR(
              fp16_ieee_to_fp32_value(output[x * output_stride() + c]),
              output_ref[x * channels() + c],
              std::max(1.0e-4f, std::abs(output_ref[x * channels() + c]) * 3.0e-3f))
            << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
            << ", pooling elements = " << pooling_elements() << ", step = " << step()
            << ", input offset = " << input_offset();
        }
      }
    }
  }

  void Test(xnn_f16_avgpool_minmax_multipass_ukernel_function avgpool_minmax, xnn_init_f16_scaleminmax_params_fn init_params) const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    std::uniform_real_distribution<float> f32dist;

    std::vector<const uint16_t*> indirect_input((output_pixels() - 1) * step() + packed_pooling_elements());
    std::vector<uint16_t> input(XNN_EXTRA_BYTES / sizeof(uint16_t) +
      input_offset() + indirect_input.size() * channels());
    std::vector<uint16_t> zero(channels() + XNN_EXTRA_BYTES / sizeof(uint16_t));
    std::vector<uint16_t> output((output_pixels() - 1) * output_stride() + channels());
    std::vector<float> output_ref(output_pixels() * channels());
    std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> buffer(XNN_EXTRA_BYTES / sizeof(uint16_t) + channels());
    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      std::generate(input.begin(), input.end(), [&]() { return fp16_ieee_from_fp32_value(f32dist(rng)); });
      std::fill(input.begin(), input.begin() + input_offset(), UINT16_C(0x7E00) /* NaN */);
      std::fill(input.end() - XNN_EXTRA_BYTES / sizeof(uint16_t), input.end(), UINT16_C(0x7E00) /* NaN */);
      std::fill(output.begin(), output.end(), UINT16_C(0x7E00) /* NaN */);

      for (size_t i = 0; i < (output_pixels() - 1) * step() + pooling_elements(); i++) {
        indirect_input[i] = input.data() + i * channels();
      }
      std::shuffle(indirect_input.begin(),
        indirect_input.begin() + (output_pixels() - 1) * step() + pooling_elements(), rng);
      if (zero_index() != SIZE_MAX) {
        indirect_input[zero_index()] = zero.data();
      }

      // Compute reference results, without clamping.
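      // Indirection entries that alias the zero buffer contribute nothing to
      // the sum, emulating the implicit zero padding the kernel relies on.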
      for (size_t x = 0; x < output_pixels(); x++) {
        for (size_t c = 0; c < channels(); c++) {
          float acc = 0.0f;
          for (size_t p = 0; p < pooling_elements(); p++) {
            const uint16_t* row = indirect_input[x * step() + p];
            if (row != zero.data()) {
              acc += fp16_ieee_to_fp32_value(row[c + input_offset()]);
            }
          }
          output_ref[x * channels() + c] = acc / float(pooling_elements());
        }
      }

      // Compute clamping parameters.
      const float accumulated_min = *std::min_element(output_ref.cbegin(), output_ref.cend());
      const float accumulated_max = *std::max_element(output_ref.cbegin(), output_ref.cend());
      const float accumulated_range = accumulated_max - accumulated_min;
      float output_min_as_float = accumulated_min + float(qmin()) / 255.0f * accumulated_range;
      float output_max_as_float = accumulated_max - float(255 - qmax()) / 255.0f * accumulated_range;
      const uint16_t output_min_as_half = fp16_ieee_from_fp32_value(output_min_as_float);
      const uint16_t output_max_as_half = fp16_ieee_from_fp32_value(output_max_as_float);
      output_min_as_float = fp16_ieee_to_fp32_value(output_min_as_half);
      output_max_as_float = fp16_ieee_to_fp32_value(output_max_as_half);

      // Clamp reference results.
      for (float& output_value : output_ref) {
        output_value = std::max(std::min(output_value, output_max_as_float), output_min_as_float);
      }

      // Prepare parameters.
      xnn_f16_scaleminmax_params params;
      init_params(&params, fp16_ieee_from_fp32_value(1.0f / float(pooling_elements())), output_min_as_half, output_max_as_half);

      // Call optimized micro-kernel.
      avgpool_minmax(output_pixels(), pooling_elements(), channels(),
        reinterpret_cast<const void**>(indirect_input.data()), input_offset() * sizeof(uint16_t), zero.data(),
        buffer.data(), output.data(),
        (step() - (packed_pooling_elements() - incremental_pooling_tile())) * sizeof(void*),
        (output_stride() - channels()) * sizeof(uint16_t),
        &params);

      // Verify results.
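      // A mixed absolute/relative tolerance is used below: FP16 arithmetic in
      // the kernel accumulates more rounding error than the FP32 reference.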
      for (size_t x = 0; x < output_pixels(); x++) {
        for (size_t c = 0; c < channels(); c++) {
          ASSERT_GE(fp16_ieee_to_fp32_value(output[x * output_stride() + c]), output_min_as_float)
            << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
            << ", pooling elements = " << pooling_elements() << ", step = " << step()
            << ", input offset = " << input_offset();
          ASSERT_LE(fp16_ieee_to_fp32_value(output[x * output_stride() + c]), output_max_as_float)
            << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
            << ", pooling elements = " << pooling_elements() << ", step = " << step()
            << ", input offset = " << input_offset();
          ASSERT_NEAR(
              fp16_ieee_to_fp32_value(output[x * output_stride() + c]),
              output_ref[x * channels() + c],
              std::max(1.0e-4f, std::abs(output_ref[x * channels() + c]) * 3.0e-3f))
            << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
            << ", pooling elements = " << pooling_elements() << ", step = " << step()
            << ", input offset = " << input_offset();
        }
      }
    }
  }

  void Test(xnn_f32_avgpool_minmax_unipass_ukernel_function avgpool_minmax, xnn_init_f32_scaleminmax_params_fn init_params) const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    std::uniform_real_distribution<float> f32dist;

    std::vector<const float*> indirect_input((output_pixels() - 1) * step() + packed_pooling_elements());
    std::vector<float> input(XNN_EXTRA_BYTES / sizeof(float) +
      input_offset() + indirect_input.size() * channels());
    std::vector<float> zero(channels() + XNN_EXTRA_BYTES / sizeof(float));
    std::vector<float> output((output_pixels() - 1) * output_stride() + channels());
    std::vector<float> output_ref(output_pixels() * channels());
    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); });
      std::fill(input.begin(), input.begin() + input_offset(), std::nanf(""));
      std::fill(input.end() - XNN_EXTRA_BYTES / sizeof(float), input.end(), std::nanf(""));
      std::fill(output.begin(), output.end(), std::nanf(""));

      for (size_t i = 0; i < (output_pixels() - 1) * step() + pooling_elements(); i++) {
        indirect_input[i] = input.data() + i * channels();
      }
      std::shuffle(indirect_input.begin(),
        indirect_input.begin() + (output_pixels() - 1) * step() + pooling_elements(), rng);
      if (zero_index() != SIZE_MAX) {
        indirect_input[zero_index()] = zero.data();
      }

      // Compute reference results, without clamping.
      for (size_t x = 0; x < output_pixels(); x++) {
        for (size_t c = 0; c < channels(); c++) {
          float acc = 0.0f;
          for (size_t p = 0; p < pooling_elements(); p++) {
            const float* row = indirect_input[x * step() + p];
            if (row != zero.data()) {
              acc += row[c + input_offset()];
            }
          }
          output_ref[x * channels() + c] = acc / float(pooling_elements());
        }
      }

      // Compute clamping parameters.
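      // Map qmin/qmax from the [0, 255] byte range onto the observed output
      // range, so the clamping bounds always cut into the actual distribution.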
      const float accumulated_min = *std::min_element(output_ref.cbegin(), output_ref.cend());
      const float accumulated_max = *std::max_element(output_ref.cbegin(), output_ref.cend());
      const float accumulated_range = accumulated_max - accumulated_min;
      const float output_min = accumulated_min + float(qmin()) / 255.0f * accumulated_range;
      const float output_max = accumulated_max - float(255 - qmax()) / 255.0f * accumulated_range;

      // Clamp reference results.
      for (float& output_value : output_ref) {
        output_value = std::max(std::min(output_value, output_max), output_min);
      }

      // Prepare parameters.
      xnn_f32_scaleminmax_params params;
      init_params(&params, 1.0f / float(pooling_elements()), output_min, output_max);

      // Call optimized micro-kernel.
      avgpool_minmax(output_pixels(), pooling_elements(), channels(),
        indirect_input.data(), input_offset() * sizeof(float), zero.data(),
        output.data(),
        step() * sizeof(void*),
        (output_stride() - channels()) * sizeof(float),
        &params);

      // Verify results.
      for (size_t x = 0; x < output_pixels(); x++) {
        for (size_t c = 0; c < channels(); c++) {
          ASSERT_GE(output[x * output_stride() + c], output_min)
            << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
            << ", pooling elements = " << pooling_elements() << ", step = " << step()
            << ", input offset = " << input_offset();
          ASSERT_LE(output[x * output_stride() + c], output_max)
            << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
            << ", pooling elements = " << pooling_elements() << ", step = " << step()
            << ", input offset = " << input_offset();
          ASSERT_NEAR(
              output[x * output_stride() + c],
              output_ref[x * channels() + c],
              std::abs(output_ref[x * channels() + c]) * 1.0e-6f)
            << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
            << ", pooling elements = " << pooling_elements() << ", step = " << step()
            << ", input offset = " << input_offset();
        }
      }
    }
  }

  void Test(xnn_f32_avgpool_minmax_multipass_ukernel_function avgpool_minmax, xnn_init_f32_scaleminmax_params_fn init_params) const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    std::uniform_real_distribution<float> f32dist;

    std::vector<const float*> indirect_input((output_pixels() - 1) * step() + packed_pooling_elements());
    std::vector<float> input(XNN_EXTRA_BYTES / sizeof(float) +
      input_offset() + indirect_input.size() * channels());
    std::vector<float> zero(channels() + XNN_EXTRA_BYTES / sizeof(float));
    std::vector<float> output((output_pixels() - 1) * output_stride() + channels());
    std::vector<float> output_ref(output_pixels() * channels());
    std::vector<float, AlignedAllocator<float, 64>> buffer(XNN_EXTRA_BYTES / sizeof(float) + channels());
    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); });
      std::fill(input.begin(), input.begin() + input_offset(), std::nanf(""));
      std::fill(input.end() - XNN_EXTRA_BYTES / sizeof(float), input.end(), std::nanf(""));
      std::fill(output.begin(), output.end(), std::nanf(""));

      for (size_t i = 0; i < (output_pixels() - 1) * step() + pooling_elements(); i++) {
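        // Point each indirection entry at a distinct input row.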
        indirect_input[i] = input.data() + i * channels();
      }
      std::shuffle(indirect_input.begin(),
        indirect_input.begin() + (output_pixels() - 1) * step() + pooling_elements(), rng);
      if (zero_index() != SIZE_MAX) {
        indirect_input[zero_index()] = zero.data();
      }

      // Compute reference results, without clamping.
      for (size_t x = 0; x < output_pixels(); x++) {
        for (size_t c = 0; c < channels(); c++) {
          float acc = 0.0f;
          for (size_t p = 0; p < pooling_elements(); p++) {
            const float* row = indirect_input[x * step() + p];
            if (row != zero.data()) {
              acc += row[c + input_offset()];
            }
          }
          output_ref[x * channels() + c] = acc / float(pooling_elements());
        }
      }

      // Compute clamping parameters.
      const float accumulated_min = *std::min_element(output_ref.cbegin(), output_ref.cend());
      const float accumulated_max = *std::max_element(output_ref.cbegin(), output_ref.cend());
      const float accumulated_range = accumulated_max - accumulated_min;
      const float output_min = accumulated_min + float(qmin()) / 255.0f * accumulated_range;
      const float output_max = accumulated_max - float(255 - qmax()) / 255.0f * accumulated_range;

      // Clamp reference results.
      for (float& output_value : output_ref) {
        output_value = std::max(std::min(output_value, output_max), output_min);
      }

      // Prepare parameters.
      xnn_f32_scaleminmax_params params;
      init_params(&params, 1.0f / float(pooling_elements()), output_min, output_max);

      // Call optimized micro-kernel.
      avgpool_minmax(output_pixels(), pooling_elements(), channels(),
        indirect_input.data(), input_offset() * sizeof(float), zero.data(),
        buffer.data(), output.data(),
        (step() - (packed_pooling_elements() - incremental_pooling_tile())) * sizeof(void*),
        (output_stride() - channels()) * sizeof(float),
        &params);

      // Verify results.
      for (size_t x = 0; x < output_pixels(); x++) {
        for (size_t c = 0; c < channels(); c++) {
          ASSERT_GE(output[x * output_stride() + c], output_min)
            << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
            << ", pooling elements = " << pooling_elements() << ", step = " << step()
            << ", input offset = " << input_offset();
          ASSERT_LE(output[x * output_stride() + c], output_max)
            << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
            << ", pooling elements = " << pooling_elements() << ", step = " << step()
            << ", input offset = " << input_offset();
          ASSERT_NEAR(
              output[x * output_stride() + c],
              output_ref[x * channels() + c],
              std::abs(output_ref[x * channels() + c]) * 1.0e-6f)
            << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
            << ", pooling elements = " << pooling_elements() << ", step = " << step()
            << ", input offset = " << input_offset();
        }
      }
    }
  }

  void Test(xnn_qu8_avgpool_minmax_unipass_ukernel_function avgpool_minmax, xnn_init_qu8_avgpool_minmax_params_fn init_params) const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    std::uniform_int_distribution<int32_t> u8dist(
      std::numeric_limits<uint8_t>::min(), std::numeric_limits<uint8_t>::max());

    std::vector<const uint8_t*> indirect_input((output_pixels() - 1) * step() + packed_pooling_elements());
    std::vector<uint8_t> input(XNN_EXTRA_BYTES / sizeof(uint8_t) +
      input_offset() + indirect_input.size() * channels());
    std::vector<uint8_t> zero(channels() + XNN_EXTRA_BYTES / sizeof(uint8_t));
    std::vector<uint8_t> output((output_pixels() - 1) * output_stride() + channels());
    std::vector<uint8_t> output_ref(output_pixels() * channels());
    std::vector<float> output_real(output_pixels() * channels());
    std::vector<int32_t> accumulator(output_pixels() * channels());
    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      do {
        std::generate(input.begin(), input.end(), [&]() { return u8dist(rng); });
      } while (input.size() > 1 && *std::max_element(input.cbegin(), input.cend()) == *std::min_element(input.cbegin(), input.cend()));
      std::fill(input.begin(), input.begin() + input_offset(), UINT8_C(0xA5));
      std::fill(input.end() - XNN_EXTRA_BYTES / sizeof(uint8_t), input.end(), UINT8_C(0xA5));
      std::fill(output.begin(), output.end(), UINT8_C(0xA5));

      for (size_t i = 0; i < (output_pixels() - 1) * step() + pooling_elements(); i++) {
        indirect_input[i] = input.data() + i * channels();
      }
      std::shuffle(indirect_input.begin(),
        indirect_input.begin() + (output_pixels() - 1) * step() + pooling_elements(), rng);
      if (zero_index() != SIZE_MAX) {
        indirect_input[zero_index()] = zero.data();
      }

      // Prepare parameters.
      xnn_qu8_avgpool_minmax_params params;
      init_params(
        &params,
        -int32_t(input_zero_point()) * int32_t(pooling_elements()),
        input_scale() / (output_scale() * float(pooling_elements())),
        output_zero_point(), qmin(), qmax());

      // Compute reference results.
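      // The input zero point is subtracted once per pooling element, even for
      // rows redirected to the zero buffer; this mirrors the kernel's
      // precomputed bias of -input_zero_point * pooling_elements.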
      for (size_t x = 0; x < output_pixels(); x++) {
        for (size_t c = 0; c < channels(); c++) {
          int32_t acc = 0;
          for (size_t p = 0; p < pooling_elements(); p++) {
            const uint8_t* row = indirect_input[x * step() + p];
            if (row != zero.data()) {
              acc += int32_t(row[c + input_offset()]);
            }
            acc -= int32_t(input_zero_point());
          }
          accumulator[x * channels() + c] = acc;
          output_ref[x * channels() + c] = xnn_qu8_requantize_rndna(
            acc, input_scale() / (output_scale() * float(pooling_elements())), output_zero_point(), qmin(), qmax());
          const float scaled_acc =
            float(acc) * input_scale() / (output_scale() * float(pooling_elements())) + float(output_zero_point());
          output_real[x * channels() + c] = std::min(std::max(scaled_acc, float(qmin())), float(qmax()));
        }
      }

      // Call optimized micro-kernel.
      avgpool_minmax(output_pixels(), pooling_elements(), channels(),
        indirect_input.data(), input_offset() * sizeof(uint8_t), zero.data(),
        output.data(),
        step() * sizeof(void*),
        (output_stride() - channels()) * sizeof(uint8_t),
        &params);

      // Verify results.
      for (size_t x = 0; x < output_pixels(); x++) {
        for (size_t c = 0; c < channels(); c++) {
          ASSERT_GE(uint32_t(output[x * output_stride() + c]), uint32_t(qmin()))
            << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
            << ", pooling elements = " << pooling_elements() << ", step = " << step()
            << ", input offset = " << input_offset();
          ASSERT_LE(uint32_t(output[x * output_stride() + c]), uint32_t(qmax()))
            << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
            << ", pooling elements = " << pooling_elements() << ", step = " << step()
            << ", input offset = " << input_offset();
          ASSERT_NEAR(float(int32_t(output[x * output_stride() + c])), output_real[x * channels() + c], 0.5f)
            << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
            << ", pooling elements = " << pooling_elements() << ", step = " << step()
            << ", input offset = " << input_offset() << ", accumulator = " << accumulator[x * channels() + c];
          ASSERT_EQ(uint32_t(output_ref[x * channels() + c]), uint32_t(output[x * output_stride() + c]))
            << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
            << ", pooling elements = " << pooling_elements() << ", step = " << step()
            << ", input offset = " << input_offset() << ", accumulator = " << accumulator[x * channels() + c];
        }
      }
    }
  }

  void Test(xnn_qu8_avgpool_minmax_multipass_ukernel_function avgpool_minmax, xnn_init_qu8_avgpool_minmax_params_fn init_params) const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    std::uniform_int_distribution<int32_t> u8dist(
      std::numeric_limits<uint8_t>::min(), std::numeric_limits<uint8_t>::max());

    std::vector<const uint8_t*> indirect_input((output_pixels() - 1) * step() + packed_pooling_elements());
    std::vector<uint8_t> input(XNN_EXTRA_BYTES / sizeof(uint8_t) +
      input_offset() + indirect_input.size() * channels());
    std::vector<uint8_t> zero(channels() + XNN_EXTRA_BYTES / sizeof(uint8_t));
    std::vector<uint8_t> output((output_pixels() - 1) * output_stride() + channels());
    std::vector<uint8_t> output_ref(output_pixels() * channels());
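    // output_real keeps the clamped real-valued results used for the +/-0.5
    // accuracy check against the quantized output.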
    std::vector<float> output_real(output_pixels() * channels());
    std::vector<int32_t> accumulator(output_pixels() * channels());
    std::vector<int32_t, AlignedAllocator<int32_t, 64>> buffer(XNN_EXTRA_BYTES / sizeof(uint8_t) + channels());
    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      do {
        std::generate(input.begin(), input.end(), [&]() { return u8dist(rng); });
      } while (input.size() > 1 && *std::max_element(input.cbegin(), input.cend()) == *std::min_element(input.cbegin(), input.cend()));
      std::fill(input.begin(), input.begin() + input_offset(), UINT8_C(0xA5));
      std::fill(input.end() - XNN_EXTRA_BYTES / sizeof(uint8_t), input.end(), UINT8_C(0xA5));
      std::fill(output.begin(), output.end(), UINT8_C(0xA5));

      for (size_t i = 0; i < (output_pixels() - 1) * step() + pooling_elements(); i++) {
        indirect_input[i] = input.data() + i * channels();
      }
      std::shuffle(indirect_input.begin(),
        indirect_input.begin() + (output_pixels() - 1) * step() + pooling_elements(), rng);
      if (zero_index() != SIZE_MAX) {
        indirect_input[zero_index()] = zero.data();
      }

      // Prepare parameters.
      xnn_qu8_avgpool_minmax_params params;
      init_params(
        &params,
        -int32_t(input_zero_point()) * int32_t(pooling_elements()),
        input_scale() / (output_scale() * float(pooling_elements())),
        output_zero_point(), qmin(), qmax());

      // Compute reference results.
      for (size_t x = 0; x < output_pixels(); x++) {
        for (size_t c = 0; c < channels(); c++) {
          int32_t acc = 0;
          for (size_t p = 0; p < pooling_elements(); p++) {
            const uint8_t* row = indirect_input[x * step() + p];
            if (row != zero.data()) {
              acc += int32_t(row[c + input_offset()]);
            }
            acc -= int32_t(input_zero_point());
          }
          accumulator[x * channels() + c] = acc;
          output_ref[x * channels() + c] = xnn_qu8_requantize_rndna(
            acc, input_scale() / (output_scale() * float(pooling_elements())), output_zero_point(), qmin(), qmax());
          const float scaled_acc =
            float(acc) * input_scale() / (output_scale() * float(pooling_elements())) + float(output_zero_point());
          output_real[x * channels() + c] = std::min(std::max(scaled_acc, float(qmin())), float(qmax()));
        }
      }

      // Call optimized micro-kernel.
      avgpool_minmax(output_pixels(), pooling_elements(), channels(),
        indirect_input.data(), input_offset() * sizeof(uint8_t), zero.data(),
        buffer.data(), output.data(),
        (step() - (packed_pooling_elements() - incremental_pooling_tile())) * sizeof(void*),
        (output_stride() - channels()) * sizeof(uint8_t),
        &params);

      // Verify results.
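      // Quantized outputs must match the rndna-requantized reference
      // bit-exactly, and stay within 0.5 of the real-valued result.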
      for (size_t x = 0; x < output_pixels(); x++) {
        for (size_t c = 0; c < channels(); c++) {
          ASSERT_GE(uint32_t(output[x * output_stride() + c]), uint32_t(qmin()))
            << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
            << ", pooling elements = " << pooling_elements() << ", step = " << step()
            << ", input offset = " << input_offset();
          ASSERT_LE(uint32_t(output[x * output_stride() + c]), uint32_t(qmax()))
            << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
            << ", pooling elements = " << pooling_elements() << ", step = " << step()
            << ", input offset = " << input_offset();
          ASSERT_NEAR(float(int32_t(output[x * output_stride() + c])), output_real[x * channels() + c], 0.5f)
            << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
            << ", pooling elements = " << pooling_elements() << ", step = " << step()
            << ", input offset = " << input_offset() << ", accumulator = " << accumulator[x * channels() + c];
          ASSERT_EQ(uint32_t(output_ref[x * channels() + c]), uint32_t(output[x * output_stride() + c]))
            << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
            << ", pooling elements = " << pooling_elements() << ", step = " << step()
            << ", input offset = " << input_offset() << ", accumulator = " << accumulator[x * channels() + c];
        }
      }
    }
  }

  void Test(xnn_f16_pavgpool_minmax_unipass_ukernel_function pavgpool_minmax, xnn_init_f16_minmax_params_fn init_params) const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    std::uniform_real_distribution<float> f32dist;
    std::uniform_real_distribution<float> m32dist(0.1f, 0.5f);

    std::vector<const uint16_t*> indirect_input((output_pixels() - 1) * step() + packed_pooling_elements());
    std::vector<uint16_t> input(XNN_EXTRA_BYTES / sizeof(uint16_t) +
      input_offset() + indirect_input.size() * channels());
    std::vector<uint16_t> zero(channels() + XNN_EXTRA_BYTES / sizeof(uint16_t));
    std::vector<uint16_t> multiplier(output_pixels());
    std::vector<uint16_t> output((output_pixels() - 1) * output_stride() + channels());
    std::vector<float> output_ref(output_pixels() * channels());
    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      std::generate(input.begin(), input.end(), [&]() { return fp16_ieee_from_fp32_value(f32dist(rng)); });
      std::fill(input.begin(), input.begin() + input_offset(), UINT16_C(0x7E00) /* NaN */);
      std::fill(input.end() - XNN_EXTRA_BYTES / sizeof(uint16_t), input.end(), UINT16_C(0x7E00) /* NaN */);
      std::generate(multiplier.begin(), multiplier.end(), [&]() { return fp16_ieee_from_fp32_value(m32dist(rng)); });
      std::fill(output.begin(), output.end(), UINT16_C(0x7E00) /* NaN */);

      for (size_t i = 0; i < (output_pixels() - 1) * step() + pooling_elements(); i++) {
        indirect_input[i] = input.data() + i * channels();
      }
      std::shuffle(indirect_input.begin(),
        indirect_input.begin() + (output_pixels() - 1) * step() + pooling_elements(), rng);
      if (zero_index() != SIZE_MAX) {
        indirect_input[zero_index()] = zero.data();
      }

      // Compute reference results, without clamping.
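      // Unlike avgpool, pavgpool scales each output pixel by an externally
      // supplied multiplier instead of a fixed 1/pooling_elements factor.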
      for (size_t x = 0; x < output_pixels(); x++) {
        for (size_t c = 0; c < channels(); c++) {
          float acc = 0.0f;
          for (size_t p = 0; p < pooling_elements(); p++) {
            const uint16_t* row = indirect_input[x * step() + p];
            if (row != zero.data()) {
              acc += fp16_ieee_to_fp32_value(row[c + input_offset()]);
            }
          }
          output_ref[x * channels() + c] = acc * fp16_ieee_to_fp32_value(multiplier[x]);
        }
      }

      // Compute clamping parameters.
      const float accumulated_min = *std::min_element(output_ref.cbegin(), output_ref.cend());
      const float accumulated_max = *std::max_element(output_ref.cbegin(), output_ref.cend());
      const float accumulated_range = accumulated_max - accumulated_min;
      float output_min_as_float = accumulated_min + float(qmin()) / 255.0f * accumulated_range;
      float output_max_as_float = accumulated_max - float(255 - qmax()) / 255.0f * accumulated_range;
      const uint16_t output_min_as_half = fp16_ieee_from_fp32_value(output_min_as_float);
      const uint16_t output_max_as_half = fp16_ieee_from_fp32_value(output_max_as_float);
      output_min_as_float = fp16_ieee_to_fp32_value(output_min_as_half);
      output_max_as_float = fp16_ieee_to_fp32_value(output_max_as_half);

      // Clamp reference results.
      for (float& output_value : output_ref) {
        output_value = std::max(std::min(output_value, output_max_as_float), output_min_as_float);
      }

      // Prepare parameters.
      xnn_f16_minmax_params params;
      init_params(&params, output_min_as_half, output_max_as_half);

      // Call optimized micro-kernel.
      pavgpool_minmax(output_pixels(), pooling_elements(), channels(),
        reinterpret_cast<const void**>(indirect_input.data()), input_offset() * sizeof(uint16_t), zero.data(),
        multiplier.data(), output.data(),
        step() * sizeof(void*),
        (output_stride() - channels()) * sizeof(uint16_t),
        &params);

      // Verify results.
      for (size_t x = 0; x < output_pixels(); x++) {
        for (size_t c = 0; c < channels(); c++) {
          ASSERT_GE(fp16_ieee_to_fp32_value(output[x * output_stride() + c]), output_min_as_float)
            << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
            << ", pooling elements = " << pooling_elements() << ", step = " << step()
            << ", input offset = " << input_offset();
          ASSERT_LE(fp16_ieee_to_fp32_value(output[x * output_stride() + c]), output_max_as_float)
            << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
            << ", pooling elements = " << pooling_elements() << ", step = " << step()
            << ", input offset = " << input_offset();
          ASSERT_NEAR(
              fp16_ieee_to_fp32_value(output[x * output_stride() + c]),
              output_ref[x * channels() + c],
              std::max(1.0e-4f, std::abs(output_ref[x * channels() + c]) * 3.0e-3f))
            << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
            << ", pooling elements = " << pooling_elements() << ", step = " << step()
            << ", input offset = " << input_offset();
        }
      }
    }
  }

  void Test(xnn_f16_pavgpool_minmax_multipass_ukernel_function pavgpool_minmax, xnn_init_f16_minmax_params_fn init_params) const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    std::uniform_real_distribution<float> f32dist;
    std::uniform_real_distribution<float> m32dist(0.1f, 0.5f);

    std::vector<const uint16_t*> indirect_input((output_pixels() - 1) * step() + packed_pooling_elements());
    std::vector<uint16_t> input(XNN_EXTRA_BYTES / sizeof(uint16_t) +
      input_offset() + indirect_input.size() * channels());
    std::vector<uint16_t> zero(channels() + XNN_EXTRA_BYTES / sizeof(uint16_t));
    std::vector<uint16_t> multiplier(output_pixels());
    std::vector<uint16_t> output((output_pixels() - 1) * output_stride() + channels());
    std::vector<float> output_ref(output_pixels() * channels());
    std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> buffer(XNN_EXTRA_BYTES / sizeof(uint16_t) + channels());
    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      std::generate(input.begin(), input.end(), [&]() { return fp16_ieee_from_fp32_value(f32dist(rng)); });
      std::fill(input.begin(), input.begin() + input_offset(), UINT16_C(0x7E00) /* NaN */);
      std::fill(input.end() - XNN_EXTRA_BYTES / sizeof(uint16_t), input.end(), UINT16_C(0x7E00) /* NaN */);
      std::generate(multiplier.begin(), multiplier.end(), [&]() { return fp16_ieee_from_fp32_value(m32dist(rng)); });
      std::fill(output.begin(), output.end(), UINT16_C(0x7E00) /* NaN */);

      for (size_t i = 0; i < (output_pixels() - 1) * step() + pooling_elements(); i++) {
        indirect_input[i] = input.data() + i * channels();
      }
      std::shuffle(indirect_input.begin(),
        indirect_input.begin() + (output_pixels() - 1) * step() + pooling_elements(), rng);
      if (zero_index() != SIZE_MAX) {
        indirect_input[zero_index()] = zero.data();
      }

      // Compute reference results, without clamping.
      for (size_t x = 0; x < output_pixels(); x++) {
        for (size_t c = 0; c < channels(); c++) {
          float acc = 0.0f;
          for (size_t p = 0; p < pooling_elements(); p++) {
            const uint16_t* row = indirect_input[x * step() + p];
            if (row != zero.data()) {
              acc += fp16_ieee_to_fp32_value(row[c + input_offset()]);
            }
          }
          output_ref[x * channels() + c] = acc * fp16_ieee_to_fp32_value(multiplier[x]);
        }
      }

      // Compute clamping parameters.
      const float accumulated_min = *std::min_element(output_ref.cbegin(), output_ref.cend());
      const float accumulated_max = *std::max_element(output_ref.cbegin(), output_ref.cend());
      const float accumulated_range = accumulated_max - accumulated_min;
      float output_min_as_float = accumulated_min + float(qmin()) / 255.0f * accumulated_range;
      float output_max_as_float = accumulated_max - float(255 - qmax()) / 255.0f * accumulated_range;
      const uint16_t output_min_as_half = fp16_ieee_from_fp32_value(output_min_as_float);
      const uint16_t output_max_as_half = fp16_ieee_from_fp32_value(output_max_as_float);
      output_min_as_float = fp16_ieee_to_fp32_value(output_min_as_half);
      output_max_as_float = fp16_ieee_to_fp32_value(output_max_as_half);

      // Clamp reference results.
      for (float& output_value : output_ref) {
        output_value = std::max(std::min(output_value, output_max_as_float), output_min_as_float);
      }

      // Prepare parameters.
      xnn_f16_minmax_params params;
      init_params(&params, output_min_as_half, output_max_as_half);

      // Call optimized micro-kernel.
      pavgpool_minmax(output_pixels(), pooling_elements(), channels(),
        reinterpret_cast<const void**>(indirect_input.data()), input_offset() * sizeof(uint16_t), zero.data(),
        multiplier.data(), buffer.data(), output.data(),
        (step() - (packed_pooling_elements() - incremental_pooling_tile())) * sizeof(void*),
        (output_stride() - channels()) * sizeof(uint16_t),
        &params);

      // Verify results.
      for (size_t x = 0; x < output_pixels(); x++) {
        for (size_t c = 0; c < channels(); c++) {
          ASSERT_GE(fp16_ieee_to_fp32_value(output[x * output_stride() + c]), output_min_as_float)
            << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
            << ", pooling elements = " << pooling_elements() << ", step = " << step()
            << ", input offset = " << input_offset();
          ASSERT_LE(fp16_ieee_to_fp32_value(output[x * output_stride() + c]), output_max_as_float)
            << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
            << ", pooling elements = " << pooling_elements() << ", step = " << step()
            << ", input offset = " << input_offset();
          ASSERT_NEAR(
              fp16_ieee_to_fp32_value(output[x * output_stride() + c]),
              output_ref[x * channels() + c],
              std::max(1.0e-4f, std::abs(output_ref[x * channels() + c]) * 3.0e-3f))
            << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
            << ", pooling elements = " << pooling_elements() << ", step = " << step()
            << ", input offset = " << input_offset();
        }
      }
    }
  }

  void Test(xnn_f32_pavgpool_minmax_unipass_ukernel_function pavgpool_minmax, xnn_init_f32_minmax_params_fn init_params) const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    std::uniform_real_distribution<float> f32dist;
    std::uniform_real_distribution<float> m32dist(0.1f, 0.5f);

    std::vector<const float*> indirect_input((output_pixels() - 1) * step() + packed_pooling_elements());
    std::vector<float> input(XNN_EXTRA_BYTES / sizeof(float) +
      input_offset() + indirect_input.size() * channels());
    std::vector<float> zero(channels() + XNN_EXTRA_BYTES / sizeof(float));
    std::vector<float> multiplier(output_pixels());
    std::vector<float> output((output_pixels() - 1) * output_stride() + channels());
    std::vector<float> output_ref(output_pixels() * channels());
    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); });
      std::fill(input.begin(), input.begin() + input_offset(), std::nanf(""));
      std::fill(input.end() - XNN_EXTRA_BYTES / sizeof(float), input.end(), std::nanf(""));
      std::generate(multiplier.begin(), multiplier.end(), [&]() { return m32dist(rng); });
      std::fill(output.begin(), output.end(), std::nanf(""));

      for (size_t i = 0; i < (output_pixels() - 1) * step() + pooling_elements(); i++) {
        indirect_input[i] = input.data() + i * channels();
      }
      std::shuffle(indirect_input.begin(),
        indirect_input.begin() + (output_pixels() - 1) * step() + pooling_elements(), rng);
      if (zero_index() != SIZE_MAX) {
        indirect_input[zero_index()] = zero.data();
      }

      // Compute reference results, without clamping.
      for (size_t x = 0; x < output_pixels(); x++) {
        for (size_t c = 0; c < channels(); c++) {
          float acc = 0.0f;
          for (size_t p = 0; p < pooling_elements(); p++) {
            const float* row = indirect_input[x * step() + p];
            if (row != zero.data()) {
              acc += row[c + input_offset()];
            }
          }
          output_ref[x * channels() + c] = acc * multiplier[x];
        }
      }

      // Compute clamping parameters.
      const float accumulated_min = *std::min_element(output_ref.cbegin(), output_ref.cend());
      const float accumulated_max = *std::max_element(output_ref.cbegin(), output_ref.cend());
      const float accumulated_range = accumulated_max - accumulated_min;
      const float output_min = accumulated_min + float(qmin()) / 255.0f * accumulated_range;
      const float output_max = accumulated_max - float(255 - qmax()) / 255.0f * accumulated_range;

      // Clamp reference results.
      for (float& output_value : output_ref) {
        output_value = std::max(std::min(output_value, output_max), output_min);
      }

      // Prepare parameters.
      xnn_f32_minmax_params params;
      init_params(&params, output_min, output_max);

      // Call optimized micro-kernel.
      pavgpool_minmax(output_pixels(), pooling_elements(), channels(),
        indirect_input.data(), input_offset() * sizeof(float), zero.data(),
        multiplier.data(), output.data(),
        step() * sizeof(void*),
        (output_stride() - channels()) * sizeof(float),
        &params);

      // Verify results.
      for (size_t x = 0; x < output_pixels(); x++) {
        for (size_t c = 0; c < channels(); c++) {
          ASSERT_GE(output[x * output_stride() + c], output_min)
            << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
            << ", pooling elements = " << pooling_elements() << ", step = " << step()
            << ", input offset = " << input_offset();
          ASSERT_LE(output[x * output_stride() + c], output_max)
            << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
            << ", pooling elements = " << pooling_elements() << ", step = " << step()
            << ", input offset = " << input_offset();
          ASSERT_NEAR(
              output[x * output_stride() + c],
              output_ref[x * channels() + c],
              std::abs(output_ref[x * channels() + c]) * 1.0e-6f)
            << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
            << ", pooling elements = " << pooling_elements() << ", step = " << step()
            << ", input offset = " << input_offset();
        }
      }
    }
  }

  void Test(xnn_f32_pavgpool_minmax_multipass_ukernel_function pavgpool_minmax, xnn_init_f32_minmax_params_fn init_params) const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    std::uniform_real_distribution<float> f32dist;
    std::uniform_real_distribution<float> m32dist(0.1f, 0.5f);

    std::vector<const float*> indirect_input((output_pixels() - 1) * step() + packed_pooling_elements());
    std::vector<float> input(XNN_EXTRA_BYTES / sizeof(float) +
      input_offset() + indirect_input.size() * channels());
    std::vector<float> zero(channels() + XNN_EXTRA_BYTES / sizeof(float));
    std::vector<float> multiplier(output_pixels());
    std::vector<float> output((output_pixels() - 1) * output_stride() + channels());
    std::vector<float> output_ref(output_pixels() * channels());
    std::vector<float, AlignedAllocator<float, 64>> buffer(XNN_EXTRA_BYTES / sizeof(float) + channels());
    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); });
      std::fill(input.begin(), input.begin() + input_offset(), std::nanf(""));
      std::fill(input.end() - XNN_EXTRA_BYTES / sizeof(float), input.end(), std::nanf(""));
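      // Per-pixel multipliers are drawn from [0.1, 0.5).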
      std::generate(multiplier.begin(), multiplier.end(), [&]() { return m32dist(rng); });
      std::fill(output.begin(), output.end(), std::nanf(""));

      for (size_t i = 0; i < (output_pixels() - 1) * step() + pooling_elements(); i++) {
        indirect_input[i] = input.data() + i * channels();
      }
      std::shuffle(indirect_input.begin(),
        indirect_input.begin() + (output_pixels() - 1) * step() + pooling_elements(), rng);
      if (zero_index() != SIZE_MAX) {
        indirect_input[zero_index()] = zero.data();
      }

      // Compute reference results, without clamping.
      for (size_t x = 0; x < output_pixels(); x++) {
        for (size_t c = 0; c < channels(); c++) {
          float acc = 0.0f;
          for (size_t p = 0; p < pooling_elements(); p++) {
            const float* row = indirect_input[x * step() + p];
            if (row != zero.data()) {
              acc += row[c + input_offset()];
            }
          }
          output_ref[x * channels() + c] = acc * multiplier[x];
        }
      }

      // Compute clamping parameters.
      const float accumulated_min = *std::min_element(output_ref.cbegin(), output_ref.cend());
      const float accumulated_max = *std::max_element(output_ref.cbegin(), output_ref.cend());
      const float accumulated_range = accumulated_max - accumulated_min;
      const float output_min = accumulated_min + float(qmin()) / 255.0f * accumulated_range;
      const float output_max = accumulated_max - float(255 - qmax()) / 255.0f * accumulated_range;

      // Clamp reference results.
      for (float& output_value : output_ref) {
        output_value = std::max(std::min(output_value, output_max), output_min);
      }

      // Prepare parameters.
      xnn_f32_minmax_params params;
      init_params(&params, output_min, output_max);

      // Call optimized micro-kernel.
      pavgpool_minmax(output_pixels(), pooling_elements(), channels(),
        indirect_input.data(), input_offset() * sizeof(float), zero.data(),
        multiplier.data(), buffer.data(), output.data(),
        (step() - (packed_pooling_elements() - incremental_pooling_tile())) * sizeof(void*),
        (output_stride() - channels()) * sizeof(float),
        &params);

      // Verify results.
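      // FP32 kernels are expected to match the reference to within a 1.0e-6
      // relative tolerance.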
      for (size_t x = 0; x < output_pixels(); x++) {
        for (size_t c = 0; c < channels(); c++) {
          ASSERT_GE(output[x * output_stride() + c], output_min)
            << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
            << ", pooling elements = " << pooling_elements() << ", step = " << step()
            << ", input offset = " << input_offset();
          ASSERT_LE(output[x * output_stride() + c], output_max)
            << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
            << ", pooling elements = " << pooling_elements() << ", step = " << step()
            << ", input offset = " << input_offset();
          ASSERT_NEAR(
              output[x * output_stride() + c],
              output_ref[x * channels() + c],
              std::abs(output_ref[x * channels() + c]) * 1.0e-6f)
            << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
            << ", pooling elements = " << pooling_elements() << ", step = " << step()
            << ", input offset = " << input_offset();
        }
      }
    }
  }

 private:
  size_t output_pixels_{1};
  size_t pooling_elements_{1};
  size_t channels_{1};
  size_t input_offset_{0};
  size_t zero_index_{SIZE_MAX};
  size_t step_{1};
  size_t primary_pooling_tile_{1};
  size_t incremental_pooling_tile_{1};
  size_t output_stride_{0};
  float input_scale_{1.25f};
  float output_scale_{0.75f};
  uint8_t input_zero_point_{121};
  uint8_t output_zero_point_{133};
  uint8_t qmin_{0};
  uint8_t qmax_{255};
  size_t iterations_{3};
};
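
// A minimal usage sketch (illustrative only: the kernel and parameter-
// initializer names below are examples of the XNNPACK symbols that generated
// tests bind; substitute whichever microkernel is under test):
//
//   TEST(F32_AVGPOOL_MINMAX_9X__SCALAR_C1, channels_eq_1) {
//     AvgPoolMicrokernelTester()
//       .pooling_elements(9)
//       .pooling_tile(9)
//       .channels(1)
//       .Test(xnn_f32_avgpool_minmax_ukernel_9x__scalar_c1,
//             xnn_init_f32_scaleminmax_scalar_params);
//   }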