/*
 * Copyright (c) 2017-2021 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#pragma once

#include <algorithm>
#include <cassert>

#include "arm_gemm.hpp"
#include "bias_adder.hpp"
#include "ndrange.hpp"
#include "performance_parameters.hpp"
#include "transform.hpp"
#include "utils.hpp"

#ifdef CYCLE_PROFILING
#include "profiler.hpp"
#endif

namespace arm_gemm {

// Implementation of the GemmCommon abstract class.
template<typename strategy, typename To, typename Tr>
class GemmHybrid : public GemmCommon<To, Tr> {
    typedef typename strategy::operand_type Toi;
    typedef typename strategy::result_type Tri;

    /* const properties set by constructor */
    const CPUInfo * const _ci;

    const unsigned int _Msize;
    const unsigned int _Nsize;
    const unsigned int _Ksize;

    const unsigned int _nbatches;
    const unsigned int _nmulti;

    const Activation _act;

    /* Blocking info */
    const unsigned int _k_block;
    const unsigned int _n_block;
    const unsigned int _Mround;

    /* Pretransposed buffer. */
    const Toi *_B_transposed=nullptr;

    const NDRange<4> _window_range;

    static unsigned int compute_k_block(const GemmArgs &args) {
        // Some kernels don't support accumulate mode - these can't do K blocking at all.
        if (!strategy::supports_accumulate()) {
            return args._Ksize;
        }

        if (args._cfg && args._cfg->inner_block_size) {
            return roundup(args._cfg->inner_block_size, strategy::k_unroll());
        }

        // Target block size (512 for FP32, scaling for other types).  Don't block until size reaches 1.5X this.
        unsigned int target_block_size = 2048 / sizeof(To);

        if (args._Ksize >= ((3 * target_block_size) / 2)) {
            unsigned int target_blocks = iceildiv(args._Ksize, target_block_size);

            unsigned int block_size = iceildiv(args._Ksize, target_blocks);

            block_size = roundup(block_size, strategy::k_unroll());

            return block_size;
        }

        return args._Ksize;
    }
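    // Worked example of the K blocking above (illustrative only: it assumes
    // the strategy supports accumulation, no explicit inner_block_size was
    // configured, FP32 operands (sizeof(To) == 4) and a hypothetical
    // strategy::k_unroll() of 4).
    //   target_block_size = 2048 / 4 = 512
    //   With _Ksize = 1000 (>= 1.5 * 512 = 768, so blocking applies):
    //     target_blocks = iceildiv(1000, 512) = 2
    //     block_size    = iceildiv(1000, 2)   = 500
    //     _k_block      = roundup(500, 4)     = 500
    //   execute() then makes two K passes of 500, the second accumulating
    //   onto the result of the first via the kernel's accumulate mode.
    //   With _Ksize = 600 (< 768) no blocking is done and _k_block = 600.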
    // New N blocking strategy: if it's narrow, or much taller than it is wide, do the full width.  Otherwise do a
    // single block.
    static unsigned int compute_n_block(const GemmArgs &args) {
        if (args._cfg && args._cfg->outer_block_size) {
            unsigned int n_block = args._cfg->outer_block_size;

            // Needs to be (at least a single) multiple of the kernel output width.
            n_block /= strategy::out_width();
            n_block = std::max(n_block, 1u) * strategy::out_width();

            return n_block;
        }

        if (args._Nsize <= 64) {
            return args._Nsize;
        }

        if ((args._Msize / args._Nsize) > 155) {
            return args._Nsize;
        }

        // Go slightly wider if thread count and depth are small.
        if ((args._Ksize <= 128) && (args._maxthreads <= 16)) {
            return strategy::out_width() * 3;
        }

        return strategy::out_width();
    }
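    // Illustrative example of the N blocking and the resulting work
    // decomposition (hypothetical figures; out_width() and out_height() are
    // strategy-specific).  Assume out_width() == 16, out_height() == 8,
    // _Msize = 100, _Nsize = 200, _Ksize = 64, one batch, one multi and
    // args._maxthreads = 8:
    //   - _Nsize > 64 and _Msize / _Nsize == 0, so the thread/depth test
    //     applies: _Ksize <= 128 and _maxthreads <= 16 give
    //     _n_block = 3 * 16 = 48.
    //   - _window_range becomes
    //     { iceildiv(100, 8), 1, iceildiv(200, 48), 1 } = { 13, 1, 5, 1 },
    //     i.e. 65 units of work, each covering one 8-row strip of M and one
    //     48-column block of N.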
public:
    GemmHybrid(GemmHybrid &) = delete;
    GemmHybrid & operator= (GemmHybrid &) = delete;

    /* Constructor */
    GemmHybrid(const GemmArgs &args)
              : _ci(args._ci), _Msize(args._Msize), _Nsize(args._Nsize), _Ksize(args._Ksize),
                _nbatches(args._nbatches), _nmulti(args._nmulti),
                _act(args._act),
                _k_block(compute_k_block(args)), _n_block(compute_n_block(args)),
                _Mround(roundup(args._Msize, strategy::out_height())),
                _window_range(iceildiv(args._Msize, strategy::out_height()), _nbatches, iceildiv(_Nsize, _n_block), _nmulti) { }

    // Interface implementation - Compulsory functions
    ndrange_t get_window_size() const override {
        return { _window_range.total_size() };
    }

    // This kernel can always be dynamically scheduled.
    bool supports_dynamic_scheduling() const override {
        return true;
    }

    // Execute
    void execute(const ndcoord_t &work_range, const ndcoord_t &, int) override {
#ifdef CYCLE_PROFILING
        profiler prof;
#endif
        strategy strat(_ci);

        /* Make sure we've been set up correctly. */
        assert(_B_transposed);
        static_assert(std::is_same<To, Toi>::value, "gemm_hybrid: Operand types must be the same.");
        static_assert(std::is_same<Tr, Tri>::value, "gemm_hybrid: Result types must be the same.");

        /* For now, each work item implies all the K for a given output
         * pixel (so we don't need to synchronize access to the output
         * array).  So separate the loop over K blocks here.  */
        for (unsigned int k0=0; k0<_Ksize; k0+=_k_block) {
            unsigned int kmax   = std::min(k0 + _k_block, _Ksize);
            unsigned int kern_k = roundup(kmax-k0, strategy::k_unroll());

            const bool first_pass = (k0 == 0);
            const bool last_pass  = (kmax == _Ksize);

            auto p = _window_range.iterator(work_range.get_position(0), work_range.get_position_end(0));

            if (p.done()) {
                return;
            }

            do {
                const unsigned int m_start = p.dim(0) * strategy::out_height();
                const unsigned int m_end   = std::min(p.dim0_max() * strategy::out_height(), _Msize);
                const unsigned int batch   = p.dim(1);
                const unsigned int n0      = p.dim(2) * _n_block;
                const unsigned int nmax    = std::min(n0 + _n_block, _Nsize);
                const unsigned int multi   = p.dim(3);

                const Toi *b_panel = _B_transposed +
                                     (multi * roundup(_Nsize, strategy::out_width()) * roundup(_Ksize, strategy::k_unroll())) +
                                     (k0 * roundup(_Nsize, strategy::out_width())) +
                                     (n0 * kern_k);

#ifdef CYCLE_PROFILING
                auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)(m_end - m_start) * kern_k * roundup(nmax-n0, strategy::out_width()));
#endif

                strat.kernel(this->_Aptr + (multi * this->_A_multi_stride) + (batch * this->_A_batch_stride) + (m_start * this->_lda) + k0, this->_lda,
                             b_panel,
                             this->_Cptr + (multi * this->_C_multi_stride) + (batch * this->_C_batch_stride) + (m_start * this->_ldc) + n0, this->_ldc,
                             (m_end - m_start), (nmax - n0), kmax-k0,
                             (strategy::supports_bias() && first_pass && this->_bias) ? this->_bias + (multi * this->_bias_multi_stride) + n0 : nullptr,
                             last_pass ? _act : Activation(), !first_pass);

                // Add bias externally if needed
                if (!strategy::supports_bias() && this->_bias && first_pass) {
                    bias_adder(this->_Cptr + (multi * this->_C_multi_stride) + (batch * this->_C_batch_stride) + (m_start * this->_ldc) + n0, this->_ldc,
                               this->_bias + (multi * this->_bias_multi_stride) + n0,
                               (m_end - m_start), (nmax - n0));
                }

            } while (p.next_dim1());
        }
    }

    // Interface implementation - pretransposed
    bool B_is_pretransposed() const override {
        return true;
    }

    bool B_pretranspose_required() const override {
        return (_B_transposed==nullptr);
    }

    size_t get_B_pretransposed_array_size() const override {
        return roundup(_Nsize, strategy::out_width()) * roundup(_Ksize, strategy::k_unroll()) * _nmulti * sizeof(Toi);
    }

    void pretranspose_B_array(void *in_buffer, const To *B, const int ldb, const int B_multi_stride) override {
        Toi *buffer = reinterpret_cast<Toi *>(in_buffer);
        _B_transposed = buffer;
        strategy strat(_ci);

        for (unsigned int multi=0; multi<_nmulti; multi++) {
            for (unsigned int k0=0; k0<_Ksize; k0+=_k_block) {
                const unsigned int kmax   = std::min(k0 + _k_block, _Ksize);
                const unsigned int k_size = roundup(kmax-k0, strategy::k_unroll());

                for (unsigned int x0=0; x0<_Nsize; x0+=_n_block) {
                    const unsigned int xmax = std::min(x0+_n_block, _Nsize);

                    const unsigned int size = roundup(xmax-x0, strategy::out_width()) * k_size;

                    strat.transforms.PrepareB(buffer, B + (multi * B_multi_stride), ldb,
                                              x0, xmax, k0, kmax);

                    buffer += size;
                }
            }
        }
    }
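    // Layout sketch for the pretransposed B buffer written above and read back
    // through b_panel in execute(): for each multi, panels are stored K-block
    // major, with one panel per N block inside each K block; every panel is
    // padded to a multiple of out_width() in N and of k_unroll() in K.  The
    // panel for a given (multi, k0, n0) therefore starts at:
    //
    //   multi * roundup(_Nsize, out_width()) * roundup(_Ksize, k_unroll())
    //   + k0  * roundup(_Nsize, out_width())
    //   + n0  * roundup(kmax - k0, k_unroll())
    //
    // which matches the b_panel arithmetic in execute().  This relies on both
    // functions using the same _k_block and _n_block values.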
    void set_pretransposed_B_data(void *in_buffer) override {
        _B_transposed = reinterpret_cast<Toi *>(in_buffer);
    }

    // Estimate cycles for the given problem, using the provided performance parameters.
    static uint64_t estimate_cycles(const GemmArgs &args, const PerformanceParameters &params) {
        // Note: Current hybrid kernels don't actually round up height (they
        // have paths for each possible height).  Might need to make this
        // configurable in future.
        uint64_t total_macs = static_cast<uint64_t>(args._nbatches) * args._nmulti * args._Msize * roundup(args._Nsize, strategy::out_width()) * roundup(args._Ksize, strategy::k_unroll());

        float mac_cycles = static_cast<float>(total_macs) / params.kernel_macs_cycle;

        // TODO: A bit of a kludge here: current hybrid kernels incur extra
        // overhead where the width is not a multiple of kernel width.  It's
        // most noticeable where the overall width is quite low, so add a 15%
        // penalty for such widths.
        if ((args._Nsize < strategy::out_width()) || (args._Nsize > strategy::out_width() && args._Nsize < 2*strategy::out_width())) {
            mac_cycles *= 1.15f;
        }

        uint64_t total_cycles = mac_cycles;

        return total_cycles;
    }

    GemmConfig get_config() override {
        GemmConfig c;

        c.method           = GemmMethod::GEMM_HYBRID;
        c.inner_block_size = _k_block;
        c.outer_block_size = _n_block;
        c.filter           = get_type_name<strategy>();

        return c;
    }
};

} // namespace arm_gemm
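// Typical call sequence (an informal sketch; the authoritative contract is the
// GemmCommon interface this class implements): the caller queries
// get_B_pretransposed_array_size(), allocates a buffer of that size and fills
// it with pretranspose_B_array() (or supplies an already-prepared buffer via
// set_pretransposed_B_data()), sets the A/C/bias pointers through the
// GemmCommon array-setting calls, and then drives execute() over the range
// reported by get_window_size(), optionally split across threads since
// supports_dynamic_scheduling() returns true.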