1 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15
16 #include "tensorflow/lite/delegates/gpu/common/workgroup_selection.h"
17
18 #include <math.h>
19
20 #include <cmath>
21 #include <set>
22 #include <vector>
23
24 #include "tensorflow/lite/delegates/gpu/common/util.h"
25
26 namespace tflite {
27 namespace gpu {
28
29 namespace {
30
31 template <typename T>
AddCornerCases(const T & grid,int max_work_group_total_size,const T & max_work_group_sizes,WorkGroupSizeAlignment x_alignment,WorkGroupSizeAlignment y_alignment,WorkGroupSizeAlignment z_alignment,std::vector<T> * work_groups)32 void AddCornerCases(const T& grid, int max_work_group_total_size,
33 const T& max_work_group_sizes,
34 WorkGroupSizeAlignment x_alignment,
35 WorkGroupSizeAlignment y_alignment,
36 WorkGroupSizeAlignment z_alignment,
37 std::vector<T>* work_groups) {
38 for (int x = 1; x <= 4; ++x) {
39 for (int y = 1; y <= 4; ++y) {
40 for (int z = 1; z <= 4; ++z) {
41 int wg_x = DivideRoundUp(grid.x, x);
42 int wg_y = DivideRoundUp(grid.y, y);
43 int wg_z = DivideRoundUp(grid.z, z);
44 if (wg_x > max_work_group_sizes.x || wg_y > max_work_group_sizes.y ||
45 wg_z > max_work_group_sizes.z ||
46 wg_x * wg_y * wg_z > max_work_group_total_size) {
47 continue;
48 }
49 if (x_alignment == WorkGroupSizeAlignment::PRECISE &&
50 grid.x % wg_x != 0) {
51 continue;
52 }
53 if (y_alignment == WorkGroupSizeAlignment::PRECISE &&
54 grid.y % wg_y != 0) {
55 continue;
56 }
57 if (z_alignment == WorkGroupSizeAlignment::PRECISE &&
58 grid.z % wg_z != 0) {
59 continue;
60 }
61 work_groups->push_back({wg_x, wg_y, wg_z});
62 }
63 }
64 }
65
66 // this will add at least {1, 1, 1} always.
67 for (int x = 1; x <= 4; ++x) {
68 for (int y = 1; y <= 4; ++y) {
69 for (int z = 1; z <= 4; ++z) {
70 if (x > max_work_group_sizes.x || y > max_work_group_sizes.y ||
71 z > max_work_group_sizes.z ||
72 x * y * z > max_work_group_total_size) {
73 continue;
74 }
75 if (x_alignment == WorkGroupSizeAlignment::PRECISE && grid.x % x != 0) {
76 continue;
77 }
78 if (y_alignment == WorkGroupSizeAlignment::PRECISE && grid.y % y != 0) {
79 continue;
80 }
81 if (z_alignment == WorkGroupSizeAlignment::PRECISE && grid.z % z != 0) {
82 continue;
83 }
84 work_groups->push_back({x, y, z});
85 }
86 }
87 }
88 }
89
// Returns all positive divisors of `number`, emitted in complementary pairs
// (a small divisor immediately followed by its cofactor), so the result is
// NOT sorted, e.g. GetDivisors(12) -> {1, 12, 2, 6, 3, 4}.
std::vector<int> GetDivisors(int number) {
  const int sqrt_bound = static_cast<int>(std::sqrt(number));
  std::vector<int> result;
  // The divisor count is unknown up front, so the reserve is a heuristic.
  result.reserve(sqrt_bound / 3 + 1);
  for (int small = 1; small <= sqrt_bound; ++small) {
    if (number % small != 0) continue;
    result.push_back(small);
    const int paired = number / small;
    // Avoid pushing the square root twice for perfect squares.
    if (paired != small) {
      result.push_back(paired);
    }
  }
  return result;
}
106
// Returns, in ascending order, every integer d that divides at least one
// value in [number, number + range]. Any such d pairs with a cofactor, and
// one member of the pair is <= sqrt(number + range), so scanning small
// candidates and inserting both members finds every divisor.
std::vector<int> GetDivisorsForRange(int number, int range) {
  const int last_number = number + range;
  const int sqrt_bound = static_cast<int>(std::sqrt(last_number));
  std::set<int> found;
  for (int d = 1; d <= sqrt_bound; ++d) {
    const int remainder = number % d;
    // Smallest multiple of d that is >= number.
    const int first_multiple = number + (d - remainder) % d;
    if (first_multiple <= last_number) {
      found.insert(d);
    }
    // Each multiple in range contributes its cofactor, which can exceed
    // sqrt_bound.
    for (int multiple = first_multiple; multiple <= last_number;
         multiple += d) {
      const int cofactor = multiple / d;
      if (cofactor != d) {
        found.insert(cofactor);
      }
    }
  }
  return std::vector<int>(found.begin(), found.end());
}
127
128 } // namespace
129
GetPossibleSizes(int number,WorkGroupSizeAlignment z_alignment)130 std::vector<int> GetPossibleSizes(int number,
131 WorkGroupSizeAlignment z_alignment) {
132 if (z_alignment == WorkGroupSizeAlignment::PRECISE) {
133 // we will use for potential sizes, sizes that cover grid precisely
134 // work group size * k (k is integer) == grid_size
135 return GetDivisors(number);
136 } else {
137 // when we chose work group size we can use work group size that
138 // work group size * k (k is integer) != grid_size (slightly bigger)
139 // so in this heuristic we trying to find potential size, that satisfies
140 // to this : work group size * k (k is integer) <= grid_size + 5
141 // and this : work group size * k (k is integer) >= grid_size
142 return GetDivisorsForRange(number, 5);
143 }
144 }
145
146 template <typename T>
GenerateWorkGroupSizes(const T & grid,int min_work_group_total_size,int max_work_group_total_size,const T & max_work_group_sizes,WorkGroupSizeAlignment x_alignment,WorkGroupSizeAlignment y_alignment,WorkGroupSizeAlignment z_alignment)147 std::vector<T> GenerateWorkGroupSizes(
148 const T& grid, int min_work_group_total_size, int max_work_group_total_size,
149 const T& max_work_group_sizes, WorkGroupSizeAlignment x_alignment,
150 WorkGroupSizeAlignment y_alignment, WorkGroupSizeAlignment z_alignment) {
151 std::vector<T> work_groups;
152 work_groups.reserve(64);
153
154 std::vector<int> sizes_x = GetPossibleSizes(grid.x, x_alignment);
155 std::vector<int> sizes_y = GetPossibleSizes(grid.y, y_alignment);
156 std::vector<int> sizes_z = GetPossibleSizes(grid.z, z_alignment);
157
158 for (auto x : sizes_x) {
159 if (x > max_work_group_sizes.x) continue;
160 for (auto y : sizes_y) {
161 if (y > max_work_group_sizes.y) continue;
162 for (auto z : sizes_z) {
163 if (z > max_work_group_sizes.z) continue;
164 const int work_group_size = x * y * z;
165 if (work_group_size < min_work_group_total_size ||
166 work_group_size > max_work_group_total_size)
167 continue;
168 work_groups.push_back({x, y, z});
169 }
170 }
171 }
172
173 return work_groups;
174 }
175
// Explicit instantiations of GenerateWorkGroupSizes for int3 and uint3.

template std::vector<int3> GenerateWorkGroupSizes(
    const int3& grid, int min_work_group_total_size,
    int max_work_group_total_size, const int3& max_work_group_sizes,
    WorkGroupSizeAlignment x_alignment, WorkGroupSizeAlignment y_alignment,
    WorkGroupSizeAlignment z_alignment);

template std::vector<uint3> GenerateWorkGroupSizes(
    const uint3& grid, int min_work_group_total_size,
    int max_work_group_total_size, const uint3& max_work_group_sizes,
    WorkGroupSizeAlignment x_alignment, WorkGroupSizeAlignment y_alignment,
    WorkGroupSizeAlignment z_alignment);
189
190 template <typename T>
GenerateWorkGroupSizesAlignedToGrid(const T & grid,const T & max_work_group_size,const int max_work_group_total_size,std::vector<T> * work_groups)191 void GenerateWorkGroupSizesAlignedToGrid(const T& grid,
192 const T& max_work_group_size,
193 const int max_work_group_total_size,
194 std::vector<T>* work_groups) {
195 auto alignment = WorkGroupSizeAlignment::PRECISE;
196 *work_groups = GenerateWorkGroupSizes<T>(
197 grid, /*min_work_group_total_size = */ 32, max_work_group_total_size,
198 max_work_group_size, alignment, alignment, alignment);
199 // If the grid parameter too small, method below cannot generate workgroups.
200 if (work_groups->empty()) {
201 AddCornerCases(grid, max_work_group_total_size, max_work_group_size,
202 alignment, alignment, alignment, work_groups);
203 }
204 }
205
// Explicit instantiations of GenerateWorkGroupSizesAlignedToGrid for int3
// and uint3.

template void GenerateWorkGroupSizesAlignedToGrid(
    const int3& grid, const int3& max_work_group_size,
    const int max_work_group_total_size, std::vector<int3>* work_groups);

template void GenerateWorkGroupSizesAlignedToGrid(
    const uint3& grid, const uint3& max_work_group_size,
    const int max_work_group_total_size, std::vector<uint3>* work_groups);
215
216 } // namespace gpu
217 } // namespace tflite
218