/* Copyright 2019 Google LLC. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
16 #include "ruy/ctx.h"
17 
18 #include <cstdlib>
19 #include <functional>
20 #include <string>
21 
22 #include "ruy/check_macros.h"
23 #include "ruy/cpuinfo.h"
24 #include "ruy/ctx_impl.h"
25 #include "ruy/have_built_path_for.h"
26 #include "ruy/path.h"
27 #include "ruy/performance_advisory.h"
28 #include "ruy/platform.h"
29 #include "ruy/strategy_controls.h"
30 #include "ruy/prepacked_cache.h"
31 #include "ruy/trace.h"
32 
33 namespace ruy {
34 
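// Ctx's state lives in the derived CtxImpl class (see ctx_impl.h); these two
// helpers downcast *this to access it.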
const CtxImpl& Ctx::impl() const { return static_cast<const CtxImpl&>(*this); }
CtxImpl* Ctx::mutable_impl() { return static_cast<CtxImpl*>(this); }

Path Ctx::last_used_path() const { return impl().last_used_path_; }
Tuning Ctx::explicit_tuning() const { return impl().explicit_tuning_; }
void Ctx::set_explicit_tuning(Tuning value) {
  mutable_impl()->explicit_tuning_ = value;
}
const ThreadPool& Ctx::thread_pool() const { return impl().thread_pool_; }
ThreadPool* Ctx::mutable_thread_pool() { return &mutable_impl()->thread_pool_; }
int Ctx::max_num_threads() const { return impl().max_num_threads_; }
void Ctx::set_max_num_threads(int value) {
  mutable_impl()->max_num_threads_ = value;
}
void Ctx::clear_performance_advisories() {
  mutable_impl()->performance_advisory_ = PerformanceAdvisory::kNone;
}
void Ctx::set_performance_advisory(PerformanceAdvisory advisory) {
  mutable_impl()->performance_advisory_ =
      mutable_impl()->performance_advisory_ | advisory;
}
bool Ctx::performance_advisory(PerformanceAdvisory advisory) const {
  return (impl().performance_advisory_ & advisory) !=
         PerformanceAdvisory::kNone;
}
void Ctx::set_num_threads_strategy(NumThreadsStrategy strategy) {
  mutable_impl()->num_threads_strategy_ = strategy;
}
NumThreadsStrategy Ctx::num_threads_strategy() const {
  return impl().num_threads_strategy_;
}

void Ctx::SetRuntimeEnabledPaths(Path paths) {
  if (paths == Path::kNone) {
    // Revert to the default behavior of using runtime detection.
    mutable_impl()->runtime_enabled_paths_ = Path::kNone;
  } else {
    // Explicitly set the enabled paths. Ensure that non-arch paths are always
    // enabled (they are needed as fallbacks).
    mutable_impl()->runtime_enabled_paths_ = paths | kNonArchPaths;
  }
}

CpuInfo* Ctx::mutable_cpuinfo() { return &mutable_impl()->cpuinfo_; }

namespace {

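// Returns the value of the environment variable `name` parsed as a
// hexadecimal integer, or 0 if the variable is not set. Used below to let
// users override the enabled paths via the RUY_PATHS environment variable.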
int GetHexIntEnvVarOrZero(const char* name) {
  const char* val = getenv(name);
  if (!val) {
    return 0;
  }
  return std::stoi(val, nullptr, 16);
}

// For each Path bit set in `paths_to_detect`, performs runtime detection and
// sets the corresponding bit in the return value if and only if it is
// supported. Path bits that are not set in the input `paths_to_detect` value
// are left unset in the return value, except for the non-arch paths, which
// are always included.
Path DetectRuntimeSupportedPaths(Path paths_to_detect, CpuInfo* cpuinfo) {
  // Paths in kNonArchPathsIncludingInternalVariants are always implicitly
  // supported. Further logic below may add more bits to `result`.
  Path result = kNonArchPathsIncludingInternalVariants;

  // Conditionally sets the `path` bit in `result`, if reported as supported
  // by the `is_supported` predicate.
  auto maybe_add = [&](Path path, std::function<bool(void)> is_supported) {
    if ((paths_to_detect & path) != Path::kNone) {
      if (is_supported()) {
        result = result | path;
      }
    }
  };

#if RUY_PLATFORM_ARM
  // NEON is unconditionally available on ARM64.
  // On ARM32 it's technically possible for it to be unavailable, but we've
  // always chosen to just crash on such devices. We could reevaluate that;
  // however, for non-NEON devices to be actually supported, we would also need
  // to address compiler-generated NEON code. That would mean removing
  // -mfpu=neon from ruy_copts, using this flag only in select NEON translation
  // units, and implementing have_built_path_for_neon, similar to the x86 SIMD
  // paths.
  maybe_add(Path::kNeon, []() { return true; });

  // NEON dotprod requires runtime detection; however, unlike the x86 SIMD
  // paths, it does not require have_built_path_for because we unconditionally
  // build it at the moment. That is largely because we have had to
  // machine-encode dotprod instructions, so we don't actually rely on
  // toolchain support for them.
  maybe_add(Path::kNeonDotprod, [=]() { return cpuinfo->NeonDotprod(); });
#elif RUY_PLATFORM_X86
  // x86 SIMD paths currently require both runtime detection and detection of
  // whether we're building the path at all.
  maybe_add(Path::kAvx,
            [=]() { return HaveBuiltPathForAvx() && cpuinfo->Avx(); });
  maybe_add(Path::kAvx2Fma,
            [=]() { return HaveBuiltPathForAvx2Fma() && cpuinfo->Avx2Fma(); });
  maybe_add(Path::kAvx512,
            [=]() { return HaveBuiltPathForAvx512() && cpuinfo->Avx512(); });
#else
  (void)maybe_add;
  (void)cpuinfo;
#endif

  // Sanity checks
  RUY_DCHECK_EQ(kNonArchPaths & ~result, Path::kNone);
  RUY_DCHECK_EQ(
      result & ~(kNonArchPathsIncludingInternalVariants | paths_to_detect),
      Path::kNone);
  return result;
}

}  // namespace

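// Returns the set of Paths enabled at runtime, computing and caching it on
// first use: an explicitly set value wins, then the RUY_PATHS environment
// variable, then runtime CPU detection.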
Path Ctx::GetRuntimeEnabledPaths() {
  RUY_TRACE_SCOPE;
  // Just a shorthand alias. Using a pointer to make it clear we're mutating
  // this value in-place.
  Path* paths = &mutable_impl()->runtime_enabled_paths_;

  // The value Path::kNone indicates the initial state before detection has
  // been performed.
  if (*paths != Path::kNone) {
    RUY_TRACE_INFO(GET_RUNTIME_ENABLED_PATHS_USING_SET_VALUE);
    return *paths;
  }
  // User may have set path explicitly in env var.
  Path paths_bitfield = static_cast<Path>(GetHexIntEnvVarOrZero("RUY_PATHS"));
  if (paths_bitfield != Path::kNone) {
    *paths = paths_bitfield;
    RUY_TRACE_INFO(GET_RUNTIME_ENABLED_PATHS_USING_ENV_VAR);
    return *paths;
  }
  // Finally, use runtime detection.
  *paths = DetectRuntimeSupportedPaths(kAllPaths, mutable_cpuinfo());
  RUY_TRACE_INFO(GET_RUNTIME_ENABLED_PATHS_USING_DETECTION);
  return *paths;
}

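// Picks the most significant Path that is both compiled in (`compiled_paths`)
// and enabled at runtime, and records it as the last used path.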
Path Ctx::SelectPath(Path compiled_paths) {
  return mutable_impl()->last_used_path_ =
             GetMostSignificantPath(compiled_paths & GetRuntimeEnabledPaths());
}

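// Grows the per-thread resources vector so that at least `thread_count`
// entries (each holding a TuningResolver and an Allocator) are available.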
void Ctx::EnsureThreadSpecificResources(int thread_count) {
  auto& resources = mutable_impl()->thread_specific_resources_;
  while (thread_count > static_cast<int>(resources.size())) {
    resources.emplace_back(new ThreadSpecificResource);
  }
  RUY_DCHECK_LE(thread_count, static_cast<int>(resources.size()));
}

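// Returns the TuningResolver owned by the given thread's resources.
// EnsureThreadSpecificResources must already have been called with a large
// enough thread count.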
TuningResolver* Ctx::GetThreadSpecificTuningResolver(int thread_index) const {
  const auto& resources = impl().thread_specific_resources_;
  RUY_DCHECK_LT(thread_index, static_cast<int>(resources.size()));
  return &resources[thread_index]->tuning_resolver;
}

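// Returns the Allocator owned by the given thread's resources, under the same
// precondition as above.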
Allocator* Ctx::GetThreadSpecificAllocator(int thread_index) const {
  const auto& resources = impl().thread_specific_resources_;
  RUY_DCHECK_LT(thread_index, static_cast<int>(resources.size()));
  return &resources[thread_index]->allocator;
}

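// Returns the main allocator, creating it lazily on first use.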
Allocator* Ctx::GetMainAllocator() {
  if (!impl().main_allocator_) {
    mutable_impl()->main_allocator_.reset(new Allocator);
  }
  return impl().main_allocator_.get();
}

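// Returns the cache of prepacked matrices, creating it lazily on first use.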
PrepackedCache* Ctx::GetPrepackedCache() {
  if (!impl().prepacked_cache_) {
    mutable_impl()->prepacked_cache_.reset(new PrepackedCache);
  }
  return impl().prepacked_cache_.get();
}

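// Resolves the Tuning to be used on the main thread (thread 0), honoring any
// explicit tuning set on this context.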
Tuning Ctx::GetMainThreadTuning() {
  EnsureThreadSpecificResources(1);
  TuningResolver* tuning_resolver = GetThreadSpecificTuningResolver(0);
  tuning_resolver->SetTuning(explicit_tuning());
  return tuning_resolver->Resolve(mutable_cpuinfo());
}

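// Destroys the prepacked cache; GetPrepackedCache will lazily re-create it if
// it is needed again.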
void Ctx::ClearPrepackedCache() { mutable_impl()->prepacked_cache_ = nullptr; }

}  // namespace ruy