gemmlowp/internal/block_params.h

*5f39d1b3SJooyung Han// Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
*5f39d1b3SJooyung Han//
*5f39d1b3SJooyung Han// Licensed under the Apache License, Version 2.0 (the "License");
*5f39d1b3SJooyung Han// you may not use this file except in compliance with the License.
*5f39d1b3SJooyung Han// You may obtain a copy of the License at
*5f39d1b3SJooyung Han//
*5f39d1b3SJooyung Han//     http://www.apache.org/licenses/LICENSE-2.0
*5f39d1b3SJooyung Han//
*5f39d1b3SJooyung Han// Unless required by applicable law or agreed to in writing, software
*5f39d1b3SJooyung Han// distributed under the License is distributed on an "AS IS" BASIS,
*5f39d1b3SJooyung Han// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*5f39d1b3SJooyung Han// See the License for the specific language governing permissions and
*5f39d1b3SJooyung Han// limitations under the License.
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han// block_params.h: Logic to choose L1 and L2 block sizes
*5f39d1b3SJooyung Han// to optimize cache-friendliness.
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han#ifndef GEMMLOWP_INTERNAL_BLOCK_PARAMS_H_
*5f39d1b3SJooyung Han#define GEMMLOWP_INTERNAL_BLOCK_PARAMS_H_
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han#include "common.h"
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Hannamespace gemmlowp {
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han// A BlockParams instance contains a full description of all the block size
*5f39d1b3SJooyung Han// parameters to be used by a Gemm.
*5f39d1b3SJooyung Han// There are two nested levels of block subdivisions: first a subdivision
*5f39d1b3SJooyung Han// into large blocks that should fit in last-level cache (what we call L2 here)
*5f39d1b3SJooyung Han// and then another subdivision into smaller blocks that should fit in
*5f39d1b3SJooyung Han// L1 cache. There is then actually a third level of subdivision to fit
*5f39d1b3SJooyung Han// in registers, but we are not concerned with that here.
*5f39d1b3SJooyung Hanstruct BlockParams {
*5f39d1b3SJooyung Han  // L1 block parameters determine the size of small blocks that should
*5f39d1b3SJooyung Han  // fit in L1 cache.
*5f39d1b3SJooyung Han  int l1_rows;
*5f39d1b3SJooyung Han  int l1_cols;
*5f39d1b3SJooyung Han  int l1_depth;
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han  // L2 block parameters determine the size of larger blocks that should
*5f39d1b3SJooyung Han  // fit in L2 cache.
*5f39d1b3SJooyung Han  int l2_rows;
*5f39d1b3SJooyung Han  int l2_cols;
*5f39d1b3SJooyung Han  int l2_depth;
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han  template <typename KernelFormat>
*5f39d1b3SJooyung Han  void Init(int rows, int cols, int depth, int num_threads, int l1_bytes_to_use,
*5f39d1b3SJooyung Han            int l2_bytes_to_use, float l2_rhs_factor) {
*5f39d1b3SJooyung Han    FindL2BlockSizes<KernelFormat>(rows, cols, depth, num_threads,
*5f39d1b3SJooyung Han                                   l2_bytes_to_use, l2_rhs_factor, &l2_rows,
*5f39d1b3SJooyung Han                                   &l2_cols, &l2_depth);
*5f39d1b3SJooyung Han    FindL1BlockSizes<KernelFormat>(l2_rows, l2_cols, l2_depth, l1_bytes_to_use,
*5f39d1b3SJooyung Han                                   &l1_rows, &l1_cols, &l1_depth);
*5f39d1b3SJooyung Han  }
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han  template <typename KernelFormat>
*5f39d1b3SJooyung Han  static void FindL2BlockSizes(int rows, int cols, int depth, int num_threads,
*5f39d1b3SJooyung Han                               int l2_bytes_to_use, float l2_rhs_factor,
*5f39d1b3SJooyung Han                               int* out_l2_rows, int* out_l2_cols,
*5f39d1b3SJooyung Han                               int* out_l2_depth) {
*5f39d1b3SJooyung Han    int l2_rows = 0;
*5f39d1b3SJooyung Han    int l2_cols = 0;
*5f39d1b3SJooyung Han    int l2_depth = 0;
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han    int per_thread_rows =
*5f39d1b3SJooyung Han        std::max(1, RoundUp<KernelFormat::kRows>(rows) / num_threads);
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han    // No L2 blocking in the depth dimension at the moment.
*5f39d1b3SJooyung Han    // Too much loss of accuracy due to storing intermediate results in
*5f39d1b3SJooyung Han    // low precision.
*5f39d1b3SJooyung Han    // However, we still want to round l2_depth up to the next multiple
*5f39d1b3SJooyung Han    // of register size, so as to avoid having to special-case unaligned depths.
*5f39d1b3SJooyung Han    l2_depth = RoundUp<kRegisterSize>(depth);
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han    {
*5f39d1b3SJooyung Han      int max_cache_friendly_l2_cols = std::max(
*5f39d1b3SJooyung Han          1, static_cast<int>(l2_rhs_factor * (l2_bytes_to_use / l2_depth)));
*5f39d1b3SJooyung Han      int min_l2_cols_blocks =
*5f39d1b3SJooyung Han          std::max(1, CeilQuotient(cols, max_cache_friendly_l2_cols));
*5f39d1b3SJooyung Han      l2_cols =
*5f39d1b3SJooyung Han          RoundUp<KernelFormat::kCols>(CeilQuotient(cols, min_l2_cols_blocks));
*5f39d1b3SJooyung Han    }
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han    // No L2 blocking in the row dimension if l2_rhs_factor is 1.0 as the row
*5f39d1b3SJooyung Han    // dimension concerns only the LHS. Blocking only RHS matrix for L2 enhances
*5f39d1b3SJooyung Han    // the performance on x86.
*5f39d1b3SJooyung Han    if (l2_rhs_factor == 1.0f) {
*5f39d1b3SJooyung Han      l2_rows = RoundUp<KernelFormat::kRows>(per_thread_rows);
*5f39d1b3SJooyung Han    } else {
*5f39d1b3SJooyung Han      int max_cache_friendly_l2_rows =
*5f39d1b3SJooyung Han          std::max(1, (l2_bytes_to_use - l2_depth * l2_cols) /
*5f39d1b3SJooyung Han                          (num_threads * (l2_depth + 4 * l2_cols)));
*5f39d1b3SJooyung Han      int min_l2_rows_blocks = std::max(
*5f39d1b3SJooyung Han          1, CeilQuotient(per_thread_rows, max_cache_friendly_l2_rows));
*5f39d1b3SJooyung Han      l2_rows = RoundUp<KernelFormat::kRows>(
*5f39d1b3SJooyung Han          CeilQuotient(per_thread_rows, min_l2_rows_blocks));
*5f39d1b3SJooyung Han    }
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han    *out_l2_rows = l2_rows;
*5f39d1b3SJooyung Han    *out_l2_cols = l2_cols;
*5f39d1b3SJooyung Han    *out_l2_depth = l2_depth;
*5f39d1b3SJooyung Han  }
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han  template <typename KernelFormat>
*5f39d1b3SJooyung Han  static void FindL1BlockSizes(int rows, int cols, int depth,
*5f39d1b3SJooyung Han                               int l1_bytes_to_use, int* out_l1_rows,
*5f39d1b3SJooyung Han                               int* out_l1_cols, int* out_l1_depth) {
*5f39d1b3SJooyung Han    int l1_rows = 0;
*5f39d1b3SJooyung Han    int l1_cols = 0;
*5f39d1b3SJooyung Han    int l1_depth = 0;
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han    // L2 block sizes should already be multiples of kernel block sizes.
*5f39d1b3SJooyung Han    assert(rows % KernelFormat::kRows == 0);
*5f39d1b3SJooyung Han    assert(cols % KernelFormat::kCols == 0);
*5f39d1b3SJooyung Han    assert(depth % KernelFormat::kDepth == 0);
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han    // No L1 blocking in the columns dimension at the moment.
*5f39d1b3SJooyung Han    // Thought not to be needed. Similar to Eigen.
*5f39d1b3SJooyung Han    l1_cols = cols;
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han    {
*5f39d1b3SJooyung Han      int max_cache_friendly_l1_depth = std::max(
*5f39d1b3SJooyung Han          1, (l1_bytes_to_use - 4 * KernelFormat::kRows * KernelFormat::kCols) /
*5f39d1b3SJooyung Han                 (KernelFormat::kRows + KernelFormat::kCols));
*5f39d1b3SJooyung Han      int min_l1_depth_blocks =
*5f39d1b3SJooyung Han          std::max(1, CeilQuotient(depth, max_cache_friendly_l1_depth));
*5f39d1b3SJooyung Han      l1_depth =
*5f39d1b3SJooyung Han          RoundUp<kRegisterSize>(CeilQuotient(depth, min_l1_depth_blocks));
*5f39d1b3SJooyung Han    }
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han    {
*5f39d1b3SJooyung Han      int max_cache_friendly_l1_rows =
*5f39d1b3SJooyung Han          std::max(1, l1_bytes_to_use / (l1_depth + 4 * l1_cols));
*5f39d1b3SJooyung Han      int min_l1_rows_blocks =
*5f39d1b3SJooyung Han          std::max(1, CeilQuotient(rows, max_cache_friendly_l1_rows));
*5f39d1b3SJooyung Han      l1_rows =
*5f39d1b3SJooyung Han          RoundUp<KernelFormat::kRows>(CeilQuotient(rows, min_l1_rows_blocks));
*5f39d1b3SJooyung Han    }
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han    *out_l1_rows = l1_rows;
*5f39d1b3SJooyung Han    *out_l1_cols = l1_cols;
*5f39d1b3SJooyung Han    *out_l1_depth = l1_depth;
*5f39d1b3SJooyung Han  }
*5f39d1b3SJooyung Han};
*5f39d1b3SJooyung Han
// A SideBlockParams instance contains only the block params relevant to
// one side (LHS or RHS), expressed in terms of 'width' instead of
// rows/columns. See the explanation in kernel.h: in the LHS, 'width' means
// the number of rows, while in the RHS, 'width' means the number of columns.
// That allows us to write generic code that applies to either LHS or RHS.
struct SideBlockParams {
  // L1 block parameters determine the size of small blocks that should
  // fit in L1 cache.
  int l1_width;  // Rows for the LHS, columns for the RHS.
  int l1_depth;  // Depth of an L1 block; the same value for both sides.

  // L2 block parameters determine the size of larger blocks that should
  // fit in L2 cache.
  int l2_width;  // Rows for the LHS, columns for the RHS.
  int l2_depth;  // Depth of an L2 block; the same value for both sides.
};
*5f39d1b3SJooyung Han
// Selects which operand of the product a piece of side-generic code is
// working on: the left-hand side (Lhs) or the right-hand side (Rhs).
enum class Side { Lhs, Rhs };
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Haninline void GetSideBlockParams(Side side, SideBlockParams* side_block_params,
*5f39d1b3SJooyung Han                               const BlockParams& block_params) {
*5f39d1b3SJooyung Han  side_block_params->l1_width =
*5f39d1b3SJooyung Han      side == Side::Lhs ? block_params.l1_rows : block_params.l1_cols;
*5f39d1b3SJooyung Han  side_block_params->l2_width =
*5f39d1b3SJooyung Han      side == Side::Lhs ? block_params.l2_rows : block_params.l2_cols;
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han  side_block_params->l1_depth = block_params.l1_depth;
*5f39d1b3SJooyung Han  side_block_params->l2_depth = block_params.l2_depth;
*5f39d1b3SJooyung Han}
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han}  // namespace gemmlowp
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han#endif  // GEMMLOWP_INTERNAL_BLOCK_PARAMS_H_