/* Copyright 2019 Google LLC. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "ruy/ctx.h"

#include <cstdlib>
#include <functional>
#include <string>

#include "ruy/check_macros.h"
#include "ruy/cpuinfo.h"
#include "ruy/ctx_impl.h"
#include "ruy/have_built_path_for.h"
#include "ruy/path.h"
#include "ruy/performance_advisory.h"
#include "ruy/platform.h"
#include "ruy/prepacked_cache.h"
#include "ruy/strategy_controls.h"
#include "ruy/trace.h"

namespace ruy {

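// CtxImpl (see ctx_impl.h) derives from Ctx and holds the actual data
// members. These two helpers perform the downcast from the Ctx interface to
// that implementation.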
const CtxImpl& Ctx::impl() const { return static_cast<const CtxImpl&>(*this); }
CtxImpl* Ctx::mutable_impl() { return static_cast<CtxImpl*>(this); }

Path Ctx::last_used_path() const { return impl().last_used_path_; }
Tuning Ctx::explicit_tuning() const { return impl().explicit_tuning_; }
void Ctx::set_explicit_tuning(Tuning value) {
  mutable_impl()->explicit_tuning_ = value;
}
const ThreadPool& Ctx::thread_pool() const { return impl().thread_pool_; }
ThreadPool* Ctx::mutable_thread_pool() { return &mutable_impl()->thread_pool_; }
int Ctx::max_num_threads() const { return impl().max_num_threads_; }
void Ctx::set_max_num_threads(int value) {
  mutable_impl()->max_num_threads_ = value;
}
void Ctx::clear_performance_advisories() {
  mutable_impl()->performance_advisory_ = PerformanceAdvisory::kNone;
}
void Ctx::set_performance_advisory(PerformanceAdvisory advisory) {
  mutable_impl()->performance_advisory_ =
      mutable_impl()->performance_advisory_ | advisory;
}
bool Ctx::performance_advisory(PerformanceAdvisory advisory) const {
  return (impl().performance_advisory_ & advisory) !=
         PerformanceAdvisory::kNone;
}
void Ctx::set_num_threads_strategy(NumThreadsStrategy strategy) {
  mutable_impl()->num_threads_strategy_ = strategy;
}
NumThreadsStrategy Ctx::num_threads_strategy() const {
  return impl().num_threads_strategy_;
}

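// Overrides the set of paths considered at runtime. Passing Path::kNone
// reverts to the default behavior of detecting supported paths at runtime.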
void Ctx::SetRuntimeEnabledPaths(Path paths) {
  if (paths == Path::kNone) {
    // Revert to default behavior using runtime detection.
    mutable_impl()->runtime_enabled_paths_ = Path::kNone;
  } else {
    // Explicitly set enabled paths. Ensure that non-arch paths are always
    // enabled (they are needed for fallbacks).
    mutable_impl()->runtime_enabled_paths_ = paths | kNonArchPaths;
  }
}

CpuInfo* Ctx::mutable_cpuinfo() { return &mutable_impl()->cpuinfo_; }

namespace {

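// Reads the environment variable `name` and parses its value as a hexadecimal
// integer. Returns 0 if the variable is unset.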
int GetHexIntEnvVarOrZero(const char* name) {
  const char* val = getenv(name);
  if (!val) {
    return 0;
  }
  return std::stoi(val, nullptr, 16);
}

// For each Path bit set in `paths_to_detect`, performs runtime detection and
// sets the corresponding bit in the return value if and only if it is
// supported. Path bits that are not set in the input `paths_to_detect` value
// are also left not set in the return value.
Path DetectRuntimeSupportedPaths(Path paths_to_detect, CpuInfo* cpuinfo) {
  // Paths in kNonArchPathsIncludingInternalVariants are always implicitly
  // supported. Further logic below may add more bits to `result`.
  Path result = kNonArchPathsIncludingInternalVariants;

  // Conditionally sets the `path` bit in `result` if the `is_supported`
  // predicate reports it as supported.
  auto maybe_add = [&](Path path, std::function<bool(void)> is_supported) {
    if ((paths_to_detect & path) != Path::kNone) {
      if (is_supported()) {
        result = result | path;
      }
    }
  };

#if RUY_PLATFORM_ARM
  // NEON is unconditionally available on ARM64.
  // On ARM32 it's technically possible for it to be unavailable, but we've
  // always chosen to just crash on such devices. We could reevaluate that;
  // however, to actually support non-NEON devices we would also need to deal
  // with compiler-generated NEON code. That would mean removing -mfpu=neon
  // from ruy_copts, using that flag only in select NEON translation units,
  // and implementing have_built_path_for_neon, similar to the x86 SIMD paths.
  maybe_add(Path::kNeon, []() { return true; });

  // NEON dotprod requires runtime detection; however, unlike the x86 SIMD
  // paths, it does not require have_built_path_for because we currently build
  // it unconditionally. That is largely because we have had to machine-encode
  // dotprod instructions, so we don't actually rely on toolchain support for
  // them.
  maybe_add(Path::kNeonDotprod, [=]() { return cpuinfo->NeonDotprod(); });
#elif RUY_PLATFORM_X86
  // x86 SIMD paths currently require both runtime detection and detection of
  // whether we're building the path at all.
  maybe_add(Path::kAvx,
            [=]() { return HaveBuiltPathForAvx() && cpuinfo->Avx(); });
  maybe_add(Path::kAvx2Fma,
            [=]() { return HaveBuiltPathForAvx2Fma() && cpuinfo->Avx2Fma(); });
  maybe_add(Path::kAvx512,
            [=]() { return HaveBuiltPathForAvx512() && cpuinfo->Avx512(); });
#else
  (void)maybe_add;
  (void)cpuinfo;
#endif

  // Sanity checks.
  RUY_DCHECK_EQ(kNonArchPaths & ~result, Path::kNone);
  RUY_DCHECK_EQ(
      result & ~(kNonArchPathsIncludingInternalVariants | paths_to_detect),
      Path::kNone);
  return result;
}

}  // namespace

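// Returns the set of Paths enabled at runtime, computing and caching it on
// the first call: an explicitly set value (see SetRuntimeEnabledPaths) takes
// precedence, then the RUY_PATHS environment variable, then runtime CPU
// detection.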
Path Ctx::GetRuntimeEnabledPaths() {
  RUY_TRACE_SCOPE;
  // Just a shorthand alias. Using a pointer to make it clear we're mutating
  // this value in-place.
  Path* paths = &mutable_impl()->runtime_enabled_paths_;

  // The value Path::kNone indicates the initial state before detection has
  // been performed.
  if (*paths != Path::kNone) {
    RUY_TRACE_INFO(GET_RUNTIME_ENABLED_PATHS_USING_SET_VALUE);
    return *paths;
  }
  // The user may have set paths explicitly in an environment variable.
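  // RUY_PATHS is parsed as a hexadecimal bitmask of Path enum values (see
  // path.h for the individual bit assignments).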
  Path paths_bitfield = static_cast<Path>(GetHexIntEnvVarOrZero("RUY_PATHS"));
  if (paths_bitfield != Path::kNone) {
    *paths = paths_bitfield;
    RUY_TRACE_INFO(GET_RUNTIME_ENABLED_PATHS_USING_ENV_VAR);
    return *paths;
  }
  // Finally, use runtime detection.
  *paths = DetectRuntimeSupportedPaths(kAllPaths, mutable_cpuinfo());
  RUY_TRACE_INFO(GET_RUNTIME_ENABLED_PATHS_USING_DETECTION);
  return *paths;
}

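// Selects the best available path: the most significant Path bit among those
// that were both compiled in (`compiled_paths`) and enabled at runtime. The
// result is also recorded as last_used_path().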
Path Ctx::SelectPath(Path compiled_paths) {
  return mutable_impl()->last_used_path_ =
             GetMostSignificantPath(compiled_paths & GetRuntimeEnabledPaths());
}

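// Grows the vector of per-thread resources so that it holds at least
// `thread_count` entries. Never shrinks it.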
void Ctx::EnsureThreadSpecificResources(int thread_count) {
  auto& resources = mutable_impl()->thread_specific_resources_;
  while (thread_count > static_cast<int>(resources.size())) {
    resources.emplace_back(new ThreadSpecificResource);
  }
  RUY_DCHECK_LE(thread_count, static_cast<int>(resources.size()));
}

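// Accessors for per-thread resources. EnsureThreadSpecificResources must
// already have been called with a thread_count greater than `thread_index`.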
TuningResolver* Ctx::GetThreadSpecificTuningResolver(int thread_index) const {
  const auto& resources = impl().thread_specific_resources_;
  RUY_DCHECK_LT(thread_index, static_cast<int>(resources.size()));
  return &resources[thread_index]->tuning_resolver;
}

Allocator* Ctx::GetThreadSpecificAllocator(int thread_index) const {
  const auto& resources = impl().thread_specific_resources_;
  RUY_DCHECK_LT(thread_index, static_cast<int>(resources.size()));
  return &resources[thread_index]->allocator;
}

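// Lazily creates the main (non-thread-specific) allocator on first use.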
Allocator* Ctx::GetMainAllocator() {
  if (!impl().main_allocator_) {
    mutable_impl()->main_allocator_.reset(new Allocator);
  }
  return impl().main_allocator_.get();
}

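// Lazily creates the cache of prepacked matrices on first use.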
PrepackedCache* Ctx::GetPrepackedCache() {
  if (!impl().prepacked_cache_) {
    mutable_impl()->prepacked_cache_.reset(new PrepackedCache);
  }
  return impl().prepacked_cache_.get();
}

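// Resolves the Tuning to use on the main thread (thread 0), taking into
// account any explicit tuning set by the user.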
Tuning Ctx::GetMainThreadTuning() {
  EnsureThreadSpecificResources(1);
  TuningResolver* tuning_resolver = GetThreadSpecificTuningResolver(0);
  tuning_resolver->SetTuning(explicit_tuning());
  return tuning_resolver->Resolve(mutable_cpuinfo());
}

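// Destroys the prepacked cache, releasing the memory it holds.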
void Ctx::ClearPrepackedCache() { mutable_impl()->prepacked_cache_ = nullptr; }

}  // namespace ruy