1 // Copyright (c) Facebook, Inc. and its affiliates. 2 // All rights reserved. 3 // 4 // Copyright 2019 Google LLC 5 // 6 // This source code is licensed under the BSD-style license found in the 7 // LICENSE file in the root directory of this source tree. 8 9 #pragma once 10 11 #include <gtest/gtest.h> 12 13 #include <algorithm> 14 #include <cassert> 15 #include <cmath> 16 #include <cstddef> 17 #include <cstdlib> 18 #include <random> 19 #include <vector> 20 21 #include <fp16.h> 22 23 #include <xnnpack.h> 24 #include <xnnpack/aligned-allocator.h> 25 #include <xnnpack/math.h> 26 #include <xnnpack/pack.h> 27 #include <xnnpack/microfnptr.h> 28 #include <xnnpack/microparams-init.h> 29 30 31 class DWConv2DMicrokernelTester { 32 public: 33 enum class Variant { 34 Native, 35 Scalar, 36 }; 37 padding_left(uint32_t padding_left)38 inline DWConv2DMicrokernelTester& padding_left(uint32_t padding_left) { 39 this->padding_left_ = padding_left; 40 return *this; 41 } 42 padding_left()43 inline uint32_t padding_left() const { 44 return this->padding_left_; 45 } 46 padding_right(uint32_t padding_right)47 inline DWConv2DMicrokernelTester& padding_right(uint32_t padding_right) { 48 this->padding_right_ = padding_right; 49 return *this; 50 } 51 padding_right()52 inline uint32_t padding_right() const { 53 return this->padding_right_; 54 } 55 padding_top(uint32_t padding_top)56 inline DWConv2DMicrokernelTester& padding_top(uint32_t padding_top) { 57 this->padding_top_ = padding_top; 58 return *this; 59 } 60 padding_top()61 inline uint32_t padding_top() const { 62 return this->padding_top_; 63 } 64 65 padding_bottom(uint32_t padding_bottom)66 inline DWConv2DMicrokernelTester& padding_bottom(uint32_t padding_bottom) { 67 this->padding_bottom_ = padding_bottom; 68 return *this; 69 } padding_bottom()70 inline uint32_t padding_bottom() const { 71 return this->padding_bottom_; 72 } 73 input_height(uint32_t input_height)74 inline DWConv2DMicrokernelTester& input_height(uint32_t input_height) { 75 assert(input_height >= 1); 76 this->input_height_ = input_height; 77 return *this; 78 } 79 input_height()80 inline uint32_t input_height() const { 81 return this->input_height_; 82 } 83 input_width(uint32_t input_width)84 inline DWConv2DMicrokernelTester& input_width(uint32_t input_width) { 85 assert(input_width >= 1); 86 this->input_width_ = input_width; 87 return *this; 88 } 89 input_width()90 inline uint32_t input_width() const { 91 return this->input_width_; 92 } 93 subsampling(uint32_t subsampling)94 inline DWConv2DMicrokernelTester& subsampling(uint32_t subsampling) { 95 assert(subsampling >= 1); 96 this->subsampling_ = subsampling; 97 return *this; 98 } 99 subsampling()100 inline uint32_t subsampling() const { 101 return this->subsampling_; 102 } 103 kernel_height(uint32_t kernel_height)104 inline DWConv2DMicrokernelTester& kernel_height(uint32_t kernel_height) { 105 assert(kernel_height != 0); 106 this->kernel_height_ = kernel_height; 107 return *this; 108 } 109 kernel_height()110 inline uint32_t kernel_height() const { 111 return this->kernel_height_; 112 } 113 kernel_width(uint32_t kernel_width)114 inline DWConv2DMicrokernelTester& kernel_width(uint32_t kernel_width) { 115 assert(kernel_width != 0); 116 this->kernel_width_ = kernel_width; 117 return *this; 118 } 119 kernel_width()120 inline uint32_t kernel_width() const { 121 return this->kernel_width_; 122 } 123 kernel_size()124 inline uint32_t kernel_size() const { 125 return kernel_height() * kernel_width(); 126 } 127 output_height()128 inline uint32_t output_height() const { 129 const uint32_t padded_input_height = padding_top() + input_height() + padding_bottom(); 130 if (padded_input_height <= kernel_height()) { 131 return 1; 132 } else { 133 return (padded_input_height - kernel_height()) / subsampling() + 1; 134 } 135 } 136 output_width()137 inline uint32_t output_width() const { 138 const uint32_t padded_input_width = padding_left() + input_width() + padding_right(); 139 if (padded_input_width <= kernel_width()) { 140 return 1; 141 } else { 142 return (padded_input_width - kernel_width()) / subsampling() + 1; 143 } 144 } 145 qmin(uint8_t qmin)146 inline DWConv2DMicrokernelTester& qmin(uint8_t qmin) { 147 this->qmin_ = qmin; 148 return *this; 149 } 150 qmin()151 inline uint8_t qmin() const { 152 return this->qmin_; 153 } 154 qmax(uint8_t qmax)155 inline DWConv2DMicrokernelTester& qmax(uint8_t qmax) { 156 this->qmax_ = qmax; 157 return *this; 158 } 159 qmax()160 inline uint8_t qmax() const { 161 return this->qmax_; 162 } 163 iterations(size_t iterations)164 inline DWConv2DMicrokernelTester& iterations(size_t iterations) { 165 this->iterations_ = iterations; 166 return *this; 167 } 168 iterations()169 inline size_t iterations() const { 170 return this->iterations_; 171 } 172 173 void Test(xnn_f32_dwconv2d_chw_ukernel_function dwconv, Variant variant = Variant::Native) const { 174 std::random_device random_device; 175 auto rng = std::mt19937(random_device()); 176 std::uniform_real_distribution<float> f32dist; 177 178 std::vector<float, AlignedAllocator<float, 64>> input(input_height() * input_width() + 2 * XNN_EXTRA_BYTES); 179 std::vector<float> zero(input_width() + 2 * XNN_EXTRA_BYTES); 180 std::vector<float> packed_weights(kernel_size() + 1); 181 std::vector<float, AlignedAllocator<float, 64>> output(output_height() * output_width()); 182 std::vector<float> output_ref(output_height() * output_width()); 183 184 for (size_t iteration = 0; iteration < iterations(); iteration++) { 185 std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); }); 186 std::generate(packed_weights.begin(), packed_weights.end(), [&]() { return f32dist(rng); }); 187 std::fill(output.begin(), output.end(), nanf("")); 188 189 for (size_t oy = 0; oy < output_height(); oy++) { 190 for (size_t ox = 0; ox < output_width(); ox++) { 191 float acc = packed_weights[0]; 192 for (size_t ky = 0; ky < kernel_height(); ky++) { 193 const size_t iy = oy * subsampling() + ky - padding_top(); 194 for (size_t kx = 0; kx < kernel_width(); kx++) { 195 const size_t ix = ox * subsampling() + kx - padding_left(); 196 if (ix < input_width() && iy < input_height()) { 197 const float input_val = input[iy * input_width() + ix]; 198 const float kernel_val = packed_weights[1 + ky * kernel_width() + kx]; 199 acc += input_val * kernel_val; 200 } 201 } 202 } 203 output_ref[oy * output_width() + ox] = acc; 204 } 205 } 206 207 // Compute clamping parameters. 208 const float accumulated_min = *std::min_element(output_ref.cbegin(), output_ref.cend()); 209 const float accumulated_max = *std::max_element(output_ref.cbegin(), output_ref.cend()); 210 const float accumulated_range = accumulated_max - accumulated_min; 211 const float output_min = accumulated_min + accumulated_range / 255.0f * float(qmin()); 212 const float output_max = accumulated_max - accumulated_range / 255.0f * float(255 - qmax()); 213 214 // Prepare parameters. 215 xnn_f32_chw_params chw_params; 216 switch (variant) { 217 case Variant::Native: 218 xnn_init_f32_chw_params(&chw_params, input_width(), output_min, output_max); 219 break; 220 case Variant::Scalar: 221 xnn_init_scalar_f32_chw_params(&chw_params, input_width(), output_min, output_max); 222 break; 223 } 224 225 // Clamp reference results. 226 for (float& output_val : output_ref) { 227 output_val = std::max(std::min(output_val, output_max), output_min); 228 } 229 230 // Call optimized micro-kernel. 231 dwconv( 232 input_height(), input_width() * sizeof(float), 233 input.data(), packed_weights.data(), zero.data(), output.data(), 234 padding_top(), 235 &chw_params); 236 237 // Verify results. 238 for (size_t y = 0; y < output_height(); y++) { 239 for (size_t x = 0; x < output_width(); x++) { 240 ASSERT_NEAR( 241 output_ref[y * output_width() + x], 242 output[y * output_width() + x], 243 std::abs(output_ref[y * output_width() + x]) * 1.0e-5) 244 << "x = " << x << ", y = " << y; 245 } 246 } 247 } 248 } 249 Test(xnn_f16_dwconv2d_chw_ukernel_function dwconv,xnn_init_f16_chw_params_fn init_params)250 void Test(xnn_f16_dwconv2d_chw_ukernel_function dwconv, xnn_init_f16_chw_params_fn init_params) const { 251 std::random_device random_device; 252 auto rng = std::mt19937(random_device()); 253 std::uniform_real_distribution<float> f32dist; 254 255 std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> input(input_height() * input_width() + 2 * XNN_EXTRA_BYTES); 256 std::vector<uint16_t> zero(input_width() + 2 * XNN_EXTRA_BYTES); 257 std::vector<uint16_t> packed_weights(kernel_size() + 1); 258 std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> output(output_height() * output_width()); 259 std::vector<float> output_ref(output_height() * output_width()); 260 261 for (size_t iteration = 0; iteration < iterations(); iteration++) { 262 std::generate(input.begin(), input.end(), [&]() { return fp16_ieee_from_fp32_value(f32dist(rng)); }); 263 std::generate(packed_weights.begin(), packed_weights.end(), [&]() { return fp16_ieee_from_fp32_value(f32dist(rng)); }); 264 std::fill(output.begin(), output.end(), UINT16_C(0x7E00) /* NaN */); 265 266 for (size_t oy = 0; oy < output_height(); oy++) { 267 for (size_t ox = 0; ox < output_width(); ox++) { 268 float acc = fp16_ieee_to_fp32_value(packed_weights[0]); 269 for (size_t ky = 0; ky < kernel_height(); ky++) { 270 const size_t iy = oy * subsampling() + ky - padding_top(); 271 for (size_t kx = 0; kx < kernel_width(); kx++) { 272 const size_t ix = ox * subsampling() + kx - padding_left(); 273 if (ix < input_width() && iy < input_height()) { 274 const float input_val = fp16_ieee_to_fp32_value(input[iy * input_width() + ix]); 275 const float kernel_val = fp16_ieee_to_fp32_value(packed_weights[1 + ky * kernel_width() + kx]); 276 acc += input_val * kernel_val; 277 } 278 } 279 } 280 output_ref[oy * output_width() + ox] = acc; 281 } 282 } 283 284 // Compute clamping parameters. 285 const float accumulated_min = *std::min_element(output_ref.cbegin(), output_ref.cend()); 286 const float accumulated_max = *std::max_element(output_ref.cbegin(), output_ref.cend()); 287 const float accumulated_range = accumulated_max - accumulated_min; 288 const float output_min = fp16_ieee_to_fp32_value(fp16_ieee_from_fp32_value(accumulated_min + accumulated_range / 255.0f * float(qmin()))); 289 const float output_max = fp16_ieee_to_fp32_value(fp16_ieee_from_fp32_value(accumulated_max - accumulated_range / 255.0f * float(255 - qmax()))); 290 291 // Prepare parameters. 292 xnn_f16_chw_params chw_params; 293 init_params(&chw_params, input_width(), 294 fp16_ieee_from_fp32_value(output_min), 295 fp16_ieee_from_fp32_value(output_max)); 296 297 // Clamp reference results. 298 for (float& output_val : output_ref) { 299 output_val = std::max(std::min(output_val, output_max), output_min); 300 } 301 302 // Call optimized micro-kernel. 303 dwconv( 304 input_height(), input_width() * sizeof(uint16_t), 305 input.data(), packed_weights.data(), zero.data(), output.data(), 306 padding_top(), 307 &chw_params); 308 309 // Verify results. 310 for (size_t y = 0; y < output_height(); y++) { 311 for (size_t x = 0; x < output_width(); x++) { 312 ASSERT_NEAR( 313 output_ref[y * output_width() + x], 314 fp16_ieee_to_fp32_value(output[y * output_width() + x]), 315 std::abs(output_ref[y * output_width() + x]) * 1.0e-2f) 316 << "x = " << x << ", y = " << y; 317 } 318 } 319 } 320 } 321 322 private: 323 uint32_t padding_left_{0}; 324 uint32_t padding_right_{0}; 325 uint32_t padding_top_{0}; 326 uint32_t padding_bottom_{0}; 327 uint32_t input_height_{1}; 328 uint32_t input_width_{1}; 329 uint32_t subsampling_{1}; 330 uint32_t kernel_height_{1}; 331 uint32_t kernel_width_{1}; 332 uint8_t qmin_{0}; 333 uint8_t qmax_{255}; 334 size_t iterations_{1}; 335 }; 336