// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#pragma once

#include <gtest/gtest.h>

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <limits>
#include <random>
#include <vector>

#include <fp16.h>

#include <xnnpack.h>
#include <xnnpack/aligned-allocator.h>
#include <xnnpack/microfnptr.h>
#include <xnnpack/math.h>


class IBilinearMicrokernelTester {
 public:
  inline IBilinearMicrokernelTester& pixels(uint32_t pixels) {
    assert(pixels >= 1);
    this->pixels_ = pixels;
    return *this;
  }

  inline uint32_t pixels() const {
    return this->pixels_;
  }

  inline IBilinearMicrokernelTester& channels(uint32_t channels) {
    assert(channels >= 1);
    this->channels_ = channels;
    return *this;
  }

  inline uint32_t channels() const {
    return this->channels_;
  }

  inline IBilinearMicrokernelTester& input_offset(uint32_t input_offset) {
    this->input_offset_ = input_offset;
    return *this;
  }

  inline uint32_t input_offset() const {
    return this->input_offset_;
  }

  inline IBilinearMicrokernelTester& output_stride(uint32_t output_stride) {
    assert(output_stride != 0);
    this->output_stride_ = output_stride;
    return *this;
  }

  inline uint32_t output_stride() const {
    if (this->output_stride_ == 0) {
      return channels();
    } else {
      assert(this->output_stride_ >= channels());
      return this->output_stride_;
    }
  }

  inline IBilinearMicrokernelTester& iterations(size_t iterations) {
    this->iterations_ = iterations;
    return *this;
  }

  inline size_t iterations() const {
    return this->iterations_;
  }

  inline IBilinearMicrokernelTester& input_stride(uint32_t input_stride) {
    assert(input_stride != 0);
    this->input_stride_ = input_stride;
    return *this;
  }

  inline uint32_t input_stride() const {
    if (this->input_stride_ == 0) {
      return 4 * pixels();
    } else {
      assert(this->input_stride_ >= 4 * pixels());
      return this->input_stride_;
    }
  }

  void Test(xnn_f16_ibilinear_ukernel_function ibilinear) const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    std::uniform_real_distribution<float> f32dist(0.1f, 1.0f);

    std::vector<const uint16_t*> indirection(pixels() * 4);
    std::vector<uint16_t> input(XNN_EXTRA_BYTES / sizeof(uint16_t) + indirection.size() * channels());
    std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> packed_weights(pixels() * 2);
    std::vector<uint16_t> output((pixels() - 1) * output_stride() + channels());
    std::vector<float> output_ref(pixels() * channels());

    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      std::generate(input.begin(), input.end(), [&]() { return fp16_ieee_from_fp32_value(f32dist(rng)); });
      std::generate(packed_weights.begin(), packed_weights.end(), [&]() { return fp16_ieee_from_fp32_value(f32dist(rng)); });
      std::fill(output.begin(), output.end(), UINT16_C(0x7E00) /* NaN */);
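      // The indirection buffer holds 4 pointers per output pixel: top-left,
      // top-right, bottom-left, and bottom-right corners, in that order. Each
      // pointer is pre-biased by -input_offset() elements; the kernel receives
      // `input_offset` in bytes and must add it back before dereferencing, so
      // a kernel that ignores the offset fails verification. Shuffling then
      // scatters the pointers so kernels cannot rely on any regular spacing
      // between them.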
      for (size_t i = 0; i < indirection.size(); i++) {
        indirection[i] = input.data() + i * channels() - input_offset();
      }
      std::shuffle(indirection.begin(), indirection.end(), rng);

      // Compute reference results.
      for (size_t i = 0; i < pixels(); i++) {
        for (size_t c = 0; c < channels(); c++) {
          const float alpha_h = fp16_ieee_to_fp32_value(packed_weights[i * 2 + 0]);
          const float alpha_v = fp16_ieee_to_fp32_value(packed_weights[i * 2 + 1]);
          output_ref[i * channels() + c] =
            fp16_ieee_to_fp32_value(indirection[i * 4 + 0][c + input_offset()]) * (1.0f - alpha_h) * (1.0f - alpha_v) +
            fp16_ieee_to_fp32_value(indirection[i * 4 + 1][c + input_offset()]) * alpha_h * (1.0f - alpha_v) +
            fp16_ieee_to_fp32_value(indirection[i * 4 + 2][c + input_offset()]) * (1.0f - alpha_h) * alpha_v +
            fp16_ieee_to_fp32_value(indirection[i * 4 + 3][c + input_offset()]) * alpha_h * alpha_v;
        }
      }

      // Call optimized micro-kernel.
      ibilinear(
        pixels(), channels() * sizeof(uint16_t),
        reinterpret_cast<const void**>(indirection.data()), input_offset() * sizeof(uint16_t),
        packed_weights.data(), output.data(),
        (output_stride() - channels()) * sizeof(uint16_t));

      // Verify results.
      for (size_t i = 0; i < pixels(); i++) {
        for (size_t c = 0; c < channels(); c++) {
          ASSERT_NEAR(
              fp16_ieee_to_fp32_value(output[i * output_stride() + c]),
              output_ref[i * channels() + c],
              std::abs(output_ref[i * channels() + c]) * 1.0e-2f)
            << "pixel " << i << " / " << pixels() << ", channel " << c << " / " << channels();
        }
      }
    }
  }

  void Test(xnn_f32_ibilinear_ukernel_function ibilinear) const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    std::uniform_real_distribution<float> f32dist;

    std::vector<const float*> indirection(pixels() * 4);
    std::vector<float> input(XNN_EXTRA_BYTES / sizeof(float) + indirection.size() * channels());
    std::vector<float, AlignedAllocator<float, 64>> packed_weights(pixels() * 2);
    std::vector<float> output((pixels() - 1) * output_stride() + channels());
    std::vector<float> output_ref(pixels() * channels());

    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); });
      std::generate(packed_weights.begin(), packed_weights.end(), [&]() { return f32dist(rng); });
      std::fill(output.begin(), output.end(), nanf(""));

      for (size_t i = 0; i < indirection.size(); i++) {
        indirection[i] = input.data() + i * channels() - input_offset();
      }
      std::shuffle(indirection.begin(), indirection.end(), rng);

      // Compute reference results.
      for (size_t i = 0; i < pixels(); i++) {
        for (size_t c = 0; c < channels(); c++) {
          const float alpha_h = packed_weights[i * 2 + 0];
          const float alpha_v = packed_weights[i * 2 + 1];
          output_ref[i * channels() + c] =
            indirection[i * 4 + 0][c + input_offset()] * (1.0f - alpha_h) * (1.0f - alpha_v) +
            indirection[i * 4 + 1][c + input_offset()] * alpha_h * (1.0f - alpha_v) +
            indirection[i * 4 + 2][c + input_offset()] * (1.0f - alpha_h) * alpha_v +
            indirection[i * 4 + 3][c + input_offset()] * alpha_h * alpha_v;
        }
      }

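      // All size arguments in the micro-kernel interface are in bytes: the
      // channel count, the input offset applied to every indirection pointer,
      // and the final argument, which is the increment applied to the output
      // pointer after each pixel's `channels` values are written, hence
      // (output_stride() - channels()) * sizeof(float).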
      // Call optimized micro-kernel.
      ibilinear(
        pixels(), channels() * sizeof(float),
        indirection.data(), input_offset() * sizeof(float),
        packed_weights.data(), output.data(),
        (output_stride() - channels()) * sizeof(float));

      // Verify results.
      for (size_t i = 0; i < pixels(); i++) {
        for (size_t c = 0; c < channels(); c++) {
          ASSERT_NEAR(
              output_ref[i * channels() + c],
              output[i * output_stride() + c],
              std::abs(output_ref[i * channels() + c]) * 1.0e-4)
            << "pixel " << i << " / " << pixels() << ", channel " << c << " / " << channels();
        }
      }
    }
  }

  void Test(xnn_s8_ibilinear_ukernel_function ibilinear) const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    std::uniform_int_distribution<int32_t> i8dist(
      std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max());
    std::uniform_int_distribution<int16_t> w11dist(0, 2047);

    std::vector<const int8_t*> indirection(pixels() * 4);
    std::vector<int8_t> input(XNN_EXTRA_BYTES / sizeof(int8_t) + indirection.size() * channels());
    std::vector<int16_t, AlignedAllocator<int16_t, 64>> packed_weights(pixels() * 2);
    std::vector<int8_t> output((pixels() - 1) * output_stride() + channels());
    std::vector<int8_t> output_ref(pixels() * channels());

    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      std::generate(input.begin(), input.end(), [&]() { return i8dist(rng); });
      std::generate(packed_weights.begin(), packed_weights.end(), [&]() { return w11dist(rng); });
      std::fill(output.begin(), output.end(), INT8_C(0xFA));

      for (size_t i = 0; i < indirection.size(); i++) {
        indirection[i] = input.data() + i * channels() - input_offset();
      }
      std::shuffle(indirection.begin(), indirection.end(), rng);

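      // The s8/u8 kernels interpolate with 11-bit fixed-point weights: a
      // packed weight w in [0, 2047] represents alpha = w / 2048. The four
      // corner weights always sum to 2048 * 2048 = 2^22, so the accumulator
      // is rescaled by adding 2^21 = 2097152 (round-to-nearest) and shifting
      // right by 22. The worst-case magnitude, 128 * 2^22 + 2^21, fits
      // comfortably in int32_t.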
      // Compute reference results.
      for (size_t i = 0; i < pixels(); i++) {
        for (size_t c = 0; c < channels(); c++) {
          const int32_t alpha_h = packed_weights[i * 2 + 0];
          const int32_t alpha_v = packed_weights[i * 2 + 1];
          const int32_t acc = math_asr_s32(
            int32_t(indirection[i * 4 + 0][c + input_offset()]) * (2048 - alpha_h) * (2048 - alpha_v) +
            int32_t(indirection[i * 4 + 1][c + input_offset()]) * alpha_h * (2048 - alpha_v) +
            int32_t(indirection[i * 4 + 2][c + input_offset()]) * (2048 - alpha_h) * alpha_v +
            int32_t(indirection[i * 4 + 3][c + input_offset()]) * alpha_h * alpha_v +
            2097152, 22);
          ASSERT_GE(acc, std::numeric_limits<int8_t>::min());
          ASSERT_LE(acc, std::numeric_limits<int8_t>::max());
          output_ref[i * channels() + c] = (int8_t) acc;
        }
      }

      // Call optimized micro-kernel.
      ibilinear(
        pixels(), channels() * sizeof(int8_t),
        indirection.data(), input_offset() * sizeof(int8_t),
        packed_weights.data(), output.data(),
        (output_stride() - channels()) * sizeof(int8_t));

      // Verify results.
      for (size_t i = 0; i < pixels(); i++) {
        for (size_t c = 0; c < channels(); c++) {
          ASSERT_EQ(int32_t(output_ref[i * channels() + c]), int32_t(output[i * output_stride() + c]))
            << "pixel " << i << " / " << pixels() << ", channel " << c << " / " << channels();
        }
      }
    }
  }

  void Test(xnn_u8_ibilinear_ukernel_function ibilinear) const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    std::uniform_int_distribution<int32_t> u8dist(
      std::numeric_limits<uint8_t>::min(), std::numeric_limits<uint8_t>::max());
    std::uniform_int_distribution<int16_t> w11dist(0, 2047);

    std::vector<const uint8_t*> indirection(pixels() * 4);
    std::vector<uint8_t> input(XNN_EXTRA_BYTES / sizeof(uint8_t) + indirection.size() * channels());
    std::vector<int16_t, AlignedAllocator<int16_t, 64>> packed_weights(pixels() * 2);
    std::vector<uint8_t> output((pixels() - 1) * output_stride() + channels());
    std::vector<uint8_t> output_ref(pixels() * channels());

    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      std::generate(input.begin(), input.end(), [&]() { return u8dist(rng); });
      std::generate(packed_weights.begin(), packed_weights.end(), [&]() { return w11dist(rng); });
      std::fill(output.begin(), output.end(), UINT8_C(0xFA));

      for (size_t i = 0; i < indirection.size(); i++) {
        indirection[i] = input.data() + i * channels() - input_offset();
      }
      std::shuffle(indirection.begin(), indirection.end(), rng);

      // Compute reference results.
      for (size_t i = 0; i < pixels(); i++) {
        for (size_t c = 0; c < channels(); c++) {
          const uint32_t alpha_h = uint32_t(int32_t(packed_weights[i * 2 + 0]));
          const uint32_t alpha_v = uint32_t(int32_t(packed_weights[i * 2 + 1]));
          // The u8 accumulator is non-negative, so a plain logical shift
          // rounds the same way as math_asr_s32 does in the s8 path.
          const uint32_t acc = (2097152 +
            int32_t(indirection[i * 4 + 0][c + input_offset()]) * (2048 - alpha_h) * (2048 - alpha_v) +
            int32_t(indirection[i * 4 + 1][c + input_offset()]) * alpha_h * (2048 - alpha_v) +
            int32_t(indirection[i * 4 + 2][c + input_offset()]) * (2048 - alpha_h) * alpha_v +
            int32_t(indirection[i * 4 + 3][c + input_offset()]) * alpha_h * alpha_v) >> 22;
          ASSERT_LE(acc, std::numeric_limits<uint8_t>::max());
          output_ref[i * channels() + c] = (uint8_t) acc;
        }
      }

      // Call optimized micro-kernel.
      ibilinear(
        pixels(), channels() * sizeof(uint8_t),
        indirection.data(), input_offset() * sizeof(uint8_t),
        packed_weights.data(), output.data(),
        (output_stride() - channels()) * sizeof(uint8_t));

      // Verify results.
      for (size_t i = 0; i < pixels(); i++) {
        for (size_t c = 0; c < channels(); c++) {
          ASSERT_EQ(uint32_t(output_ref[i * channels() + c]), uint32_t(output[i * output_stride() + c]))
            << "pixel " << i << " / " << pixels() << ", channel " << c << " / " << channels();
        }
      }
    }
  }
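  // The CHW ("channels-first") variants below interpolate one channel plane
  // at a time. The indirection buffer holds only 2 pointers per output pixel,
  // to the top-left and bottom-left corners; the kernel contract is that each
  // right-hand corner is stored immediately after its left neighbor, and that
  // corresponding positions in successive channel planes lie input_stride()
  // elements apart.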
  void TestCHW(xnn_f16_ibilinear_chw_ukernel_function ibilinear) const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    std::uniform_real_distribution<float> f32dist(0.1f, 1.0f);

    std::vector<const uint16_t*> indirection(pixels() * 2);
    std::vector<uint16_t> input(XNN_EXTRA_BYTES / sizeof(uint16_t) + (channels() - 1) * input_stride() + 4 * pixels());
    std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> packed_weights(pixels() * 2);
    std::vector<uint16_t> output(pixels() * channels());
    std::vector<float> output_ref(pixels() * channels());

    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      std::generate(input.begin(), input.end(), [&]() { return fp16_ieee_from_fp32_value(f32dist(rng)); });
      std::generate(packed_weights.begin(), packed_weights.end(), [&]() { return fp16_ieee_from_fp32_value(f32dist(rng)); });
      std::fill(output.begin(), output.end(), UINT16_C(0x7E00) /* NaN */);

      // Indirection will point to the even ("left") pixels of the input.
      // The kernels will expect "right" pixels to be placed right next to them.
      for (size_t i = 0; i < indirection.size(); i++) {
        const uint16_t* left_corner = input.data() + 2 * i - input_offset();
        indirection[i] = left_corner;
      }
      std::shuffle(indirection.begin(), indirection.end(), rng);

      // Compute reference results.
      for (size_t i = 0; i < pixels(); i++) {
        for (size_t c = 0; c < channels(); c++) {
          const float alpha_h = fp16_ieee_to_fp32_value(packed_weights[i * 2 + 0]);
          const float alpha_v = fp16_ieee_to_fp32_value(packed_weights[i * 2 + 1]);
          // `c * pixels() + i` because the output is NCHW.
          output_ref[c * pixels() + i] =
            // `c * input_stride()` because the input is NCHW.
            fp16_ieee_to_fp32_value((indirection[i * 2 + 0] + 0)[c * input_stride() + input_offset()]) * (1.0f - alpha_h) * (1.0f - alpha_v) +
            fp16_ieee_to_fp32_value((indirection[i * 2 + 0] + 1)[c * input_stride() + input_offset()]) * alpha_h * (1.0f - alpha_v) +
            fp16_ieee_to_fp32_value((indirection[i * 2 + 1] + 0)[c * input_stride() + input_offset()]) * (1.0f - alpha_h) * alpha_v +
            fp16_ieee_to_fp32_value((indirection[i * 2 + 1] + 1)[c * input_stride() + input_offset()]) * alpha_h * alpha_v;
        }
      }

      // Call optimized micro-kernel.
      ibilinear(
        pixels(), channels(),
        reinterpret_cast<const void**>(indirection.data()), input_offset() * sizeof(uint16_t),
        packed_weights.data(), output.data(), input_stride() * sizeof(uint16_t));

      // Verify results.
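      // Half precision carries ~11 mantissa bits (relative spacing ~2^-11),
      // and each output accumulates four rounded products, so the f16 tests
      // use a 1% relative tolerance; the f32 tests use 1.0e-4.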
      for (size_t c = 0; c < channels(); c++) {
        for (size_t i = 0; i < pixels(); i++) {
          ASSERT_NEAR(
              fp16_ieee_to_fp32_value(output[c * pixels() + i]),
              output_ref[c * pixels() + i],
              std::abs(output_ref[c * pixels() + i]) * 1.0e-2f)
            << "i = " << i << ", channel = " << c;
        }
      }
    }
  }

  void TestCHW(xnn_f32_ibilinear_chw_ukernel_function ibilinear) const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    std::uniform_real_distribution<float> f32dist;

    std::vector<const float*> indirection(pixels() * 2);
    std::vector<float> input(XNN_EXTRA_BYTES / sizeof(float) + (channels() - 1) * input_stride() + 4 * pixels());
    std::vector<float, AlignedAllocator<float, 64>> packed_weights(pixels() * 2);
    std::vector<float> output(pixels() * channels());
    std::vector<float> output_ref(pixels() * channels());

    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); });
      std::generate(packed_weights.begin(), packed_weights.end(), [&]() { return f32dist(rng); });
      std::fill(output.begin(), output.end(), nanf(""));

      // Indirection will point to the even ("left") pixels of the input.
      // The kernels will expect "right" pixels to be placed right next to them.
      for (size_t i = 0; i < indirection.size(); i++) {
        const float* left_corner = input.data() + 2 * i - input_offset();
        indirection[i] = left_corner;
      }
      std::shuffle(indirection.begin(), indirection.end(), rng);

      // Compute reference results.
      for (size_t i = 0; i < pixels(); i++) {
        for (size_t c = 0; c < channels(); c++) {
          const float alpha_h = packed_weights[i * 2 + 0];
          const float alpha_v = packed_weights[i * 2 + 1];
          // `c * pixels() + i` because the output is NCHW.
          output_ref[c * pixels() + i] =
            // `c * input_stride()` because the input is NCHW.
            (indirection[i * 2 + 0] + 0)[c * input_stride() + input_offset()] * (1.0f - alpha_h) * (1.0f - alpha_v) +
            (indirection[i * 2 + 0] + 1)[c * input_stride() + input_offset()] * alpha_h * (1.0f - alpha_v) +
            (indirection[i * 2 + 1] + 0)[c * input_stride() + input_offset()] * (1.0f - alpha_h) * alpha_v +
            (indirection[i * 2 + 1] + 1)[c * input_stride() + input_offset()] * alpha_h * alpha_v;
        }
      }

      // Call optimized micro-kernel.
      ibilinear(
        pixels(), channels(),
        indirection.data(), input_offset() * sizeof(float),
        packed_weights.data(), output.data(), input_stride() * sizeof(float));

      // Verify results.
      for (size_t c = 0; c < channels(); c++) {
        for (size_t i = 0; i < pixels(); i++) {
          ASSERT_NEAR(
              output_ref[c * pixels() + i],
              output[c * pixels() + i],
              std::abs(output_ref[c * pixels() + i]) * 1.0e-4)
            << "i = " << i << ", channel = " << c;
        }
      }
    }
  }

 private:
  uint32_t channels_{1};
  uint32_t pixels_{1};
  uint32_t output_stride_{0};
  uint32_t input_stride_{0};
  uint32_t input_offset_{0};
  size_t iterations_{3};
};
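// A typical test driver configures the tester with the shape under test and
// passes the micro-kernel to the matching Test() overload. A minimal sketch
// (the kernel symbol below is illustrative, not a specific kernel name):
//
//   TEST(F32_IBILINEAR__SCALAR_C2, pixels_eq_1) {
//     IBilinearMicrokernelTester()
//       .pixels(1)
//       .channels(2)
//       .Test(xnn_f32_ibilinear_ukernel__scalar_c2);
//   }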