// Copyright 2017 The Abseil Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <atomic>
#include <cstdint>
#include <limits>
#include <mutex>  // NOLINT(build/c++11)
#include <vector>

#include "absl/base/config.h"
#include "absl/base/internal/cycleclock.h"
#include "absl/base/internal/raw_logging.h"
#include "absl/base/internal/spinlock.h"
#include "absl/base/internal/thread_identity.h"
#include "absl/base/no_destructor.h"
#include "absl/synchronization/blocking_counter.h"
#include "absl/synchronization/internal/create_thread_identity.h"
#include "absl/synchronization/internal/per_thread_sem.h"
#include "absl/synchronization/internal/thread_pool.h"
#include "absl/synchronization/mutex.h"
#include "benchmark/benchmark.h"

namespace {

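// Measures a bare lock/unlock round trip on a shared absl::Mutex:
// uncontended with Threads(1), contended with ThreadPerCpu.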
void BM_Mutex(benchmark::State& state) {
  static absl::NoDestructor<absl::Mutex> mu;
  for (auto _ : state) {
    absl::MutexLock lock(mu.get());
  }
}
BENCHMARK(BM_Mutex)->UseRealTime()->Threads(1)->ThreadPerCpu();

void BM_ReaderLock(benchmark::State& state) {
  static absl::NoDestructor<absl::Mutex> mu;
  for (auto _ : state) {
    absl::ReaderMutexLock lock(mu.get());
  }
}
BENCHMARK(BM_ReaderLock)->UseRealTime()->Threads(1)->ThreadPerCpu();

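// The mutex here is local to the single benchmark thread, so TryLock always
// succeeds; this measures the uncontended try-lock fast path.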
void BM_TryLock(benchmark::State& state) {
  absl::Mutex mu;
  for (auto _ : state) {
    if (mu.TryLock()) {
      mu.Unlock();
    }
  }
}
BENCHMARK(BM_TryLock);

void BM_ReaderTryLock(benchmark::State& state) {
  static absl::NoDestructor<absl::Mutex> mu;
  for (auto _ : state) {
    if (mu->ReaderTryLock()) {
      mu->ReaderUnlock();
    }
  }
}
BENCHMARK(BM_ReaderTryLock)->UseRealTime()->Threads(1)->ThreadPerCpu();

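// Busy-spins for approximately `ns` nanoseconds, converting the delay into
// CycleClock ticks via CycleClock::Frequency() (ticks per second). The spin
// increments *data so the loop has observable side effects.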
static void DelayNs(int64_t ns, int* data) {
  int64_t end = absl::base_internal::CycleClock::Now() +
                ns * absl::base_internal::CycleClock::Frequency() / 1e9;
  while (absl::base_internal::CycleClock::Now() < end) {
    ++(*data);
    benchmark::DoNotOptimize(*data);
  }
}

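// Minimal RAII lock guard for any mutex type exposing Lock()/Unlock(),
// specialized below for std::mutex (whose methods are lock()/unlock()). This
// lets BM_Contended be templated over absl::Mutex, SpinLock, and std::mutex.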
template <typename MutexType>
class RaiiLocker {
 public:
  explicit RaiiLocker(MutexType* mu) : mu_(mu) { mu_->Lock(); }
  ~RaiiLocker() { mu_->Unlock(); }
 private:
  MutexType* mu_;
};

template <>
class RaiiLocker<std::mutex> {
 public:
  explicit RaiiLocker(std::mutex* mu) : mu_(mu) { mu_->lock(); }
  ~RaiiLocker() { mu_->unlock(); }
 private:
  std::mutex* mu_;
};

// RAII object to change the Mutex priority of the running thread.
class ScopedThreadMutexPriority {
 public:
  explicit ScopedThreadMutexPriority(int priority) {
    absl::base_internal::ThreadIdentity* identity =
        absl::synchronization_internal::GetOrCreateCurrentThreadIdentity();
    identity->per_thread_synch.priority = priority;
    // Bump next_priority_read_cycles to the infinite future so that the
    // implementation doesn't re-read the thread's actual scheduler priority
    // and replace our temporary scoped priority.
    identity->per_thread_synch.next_priority_read_cycles =
        std::numeric_limits<int64_t>::max();
  }
  ~ScopedThreadMutexPriority() {
    // Reset the "next priority read time" back to the infinite past so that
    // the next time the Mutex implementation wants to know this thread's
    // priority, it re-reads it from the OS instead of using our overridden
    // priority.
    absl::synchronization_internal::GetOrCreateCurrentThreadIdentity()
        ->per_thread_synch.next_priority_read_cycles =
        std::numeric_limits<int64_t>::min();
  }
};

void BM_MutexEnqueue(benchmark::State& state) {
  // In the "multiple priorities" variant of the benchmark, one of the
  // threads runs with Mutex priority 0 while the rest run at elevated priority.
  // This benchmarks the performance impact of the presence of a low priority
  // waiter when a higher priority waiter adds itself to the queue
  // (b/175224064).
  //
  // NOTE: The actual scheduler priority is not modified in this benchmark:
  // all of the threads get CPU slices with the same priority. Only the
  // Mutex queueing behavior is modified.
  const bool multiple_priorities = state.range(0);
  ScopedThreadMutexPriority priority_setter(
      (multiple_priorities && state.thread_index() != 0) ? 1 : 0);

  struct Shared {
    absl::Mutex mu;
    std::atomic<int> looping_threads{0};
    std::atomic<int> blocked_threads{0};
    std::atomic<bool> thread_has_mutex{false};
  };
  static absl::NoDestructor<Shared> shared;

  // Set up 'blocked_threads' to count how many threads are currently blocked
  // in Abseil synchronization code.
  //
  // NOTE: Blocking done within the Google Benchmark library itself (e.g.
  // the barrier which synchronizes threads entering and exiting the benchmark
  // loop) does _not_ get registered in this counter. This is because Google
  // Benchmark uses its own synchronization primitives based on std::mutex, not
  // Abseil synchronization primitives. If at some point the benchmark library
  // merges into Abseil, this code may break.
  absl::synchronization_internal::PerThreadSem::SetThreadBlockedCounter(
      &shared->blocked_threads);

  // The benchmark framework may run several iterations in the same process,
  // reusing the same static-initialized 'shared' object. Given the semantics
  // of the members here, we expect everything to be reset to zero by the
  // end of any iteration. Assert that's the case, just to be sure.
  ABSL_RAW_CHECK(
      shared->looping_threads.load(std::memory_order_relaxed) == 0 &&
          shared->blocked_threads.load(std::memory_order_relaxed) == 0 &&
          !shared->thread_has_mutex.load(std::memory_order_relaxed),
      "Shared state isn't zeroed at start of benchmark iteration");

  static constexpr int kBatchSize = 1000;
  while (state.KeepRunningBatch(kBatchSize)) {
    shared->looping_threads.fetch_add(1);
    for (int i = 0; i < kBatchSize; i++) {
      {
        absl::MutexLock l(&shared->mu);
        shared->thread_has_mutex.store(true, std::memory_order_relaxed);
        // Spin until all other threads are either out of the benchmark loop
        // or blocked on the mutex. This ensures that the mutex queue is kept
        // at its maximal length to benchmark the performance of queueing on
        // a highly contended mutex.
        while (shared->looping_threads.load(std::memory_order_relaxed) -
                   shared->blocked_threads.load(std::memory_order_relaxed) !=
               1) {
        }
        shared->thread_has_mutex.store(false);
      }
      // Spin until some other thread has acquired the mutex before we block
      // again. This ensures that we always go through the slow (queueing)
      // acquisition path rather than reacquiring the mutex we just released.
      while (!shared->thread_has_mutex.load(std::memory_order_relaxed) &&
             shared->looping_threads.load(std::memory_order_relaxed) > 1) {
      }
    }
    // The benchmark framework uses a barrier to ensure that all of the threads
    // complete their benchmark loop together before any of the threads exit
    // the loop. So, we need to remove ourselves from the "looping threads"
    // counter here before potentially blocking on that barrier. Otherwise,
    // another thread spinning above might wait forever for this thread to
    // block on the mutex while we in fact are waiting to exit.
    shared->looping_threads.fetch_add(-1);
  }
  absl::synchronization_internal::PerThreadSem::SetThreadBlockedCounter(
      nullptr);
}

BENCHMARK(BM_MutexEnqueue)
    ->Threads(4)
    ->Threads(64)
    ->Threads(128)
    ->Threads(512)
    ->ArgName("multiple_priorities")
    ->Arg(false)
    ->Arg(true);

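// Templated benchmark measuring lock/unlock cost under a varying amount of
// critical-section work (state.range(0), in ns) and, where priorities are
// swept, a varying number of distinct Mutex priorities (state.range(1)).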
template <typename MutexType>
void BM_Contended(benchmark::State& state) {
  int priority = state.thread_index() % state.range(1);
  ScopedThreadMutexPriority priority_setter(priority);

  struct Shared {
    MutexType mu;
    int data = 0;
  };
  static absl::NoDestructor<Shared> shared;
  int local = 0;
  for (auto _ : state) {
    // Here we model both local work outside of the critical section as well as
    // some work inside of the critical section. The idea is to capture some
    // more or less realistic contention levels.
    // If contention is too low, the benchmark won't measure anything useful.
    // If contention is unrealistically high, the benchmark will favor
    // bad mutex implementations that block and otherwise distract threads
    // from the mutex and shared state for as much as possible.
    // To achieve this, the amount of local work is multiplied by the number
    // of threads to keep the ratio between local work and the critical
    // section approximately constant regardless of the number of threads.
    DelayNs(100 * state.threads(), &local);
    RaiiLocker<MutexType> locker(&shared->mu);
    DelayNs(state.range(0), &shared->data);
  }
}
void SetupBenchmarkArgs(benchmark::internal::Benchmark* bm,
                        bool do_test_priorities) {
  const int max_num_priorities = do_test_priorities ? 2 : 1;
  bm->UseRealTime()
      // ThreadPerCpu poorly handles non-power-of-two CPU counts.
      ->Threads(1)
      ->Threads(2)
      ->Threads(4)
      ->Threads(6)
      ->Threads(8)
      ->Threads(12)
      ->Threads(16)
      ->Threads(24)
      ->Threads(32)
      ->Threads(48)
      ->Threads(64)
      ->Threads(96)
      ->Threads(128)
      ->Threads(192)
      ->Threads(256)
      ->ArgNames({"cs_ns", "num_prios"});
  // Some empirically chosen amounts of work in the critical section.
  // 1 is low contention, 2000 is high contention, with a few values in
  // between.
  for (int critical_section_ns : {1, 20, 50, 200, 2000}) {
    for (int num_priorities = 1; num_priorities <= max_num_priorities;
         num_priorities++) {
      bm->ArgPair(critical_section_ns, num_priorities);
    }
  }
}

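// Instantiate BM_Contended for each mutex type under comparison. Only the
// absl::Mutex variant sweeps the priority dimension, since SpinLock and
// std::mutex have no notion of Mutex priority.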
BENCHMARK_TEMPLATE(BM_Contended, absl::Mutex)
    ->Apply([](benchmark::internal::Benchmark* bm) {
      SetupBenchmarkArgs(bm, /*do_test_priorities=*/true);
    });

BENCHMARK_TEMPLATE(BM_Contended, absl::base_internal::SpinLock)
    ->Apply([](benchmark::internal::Benchmark* bm) {
      SetupBenchmarkArgs(bm, /*do_test_priorities=*/false);
    });

BENCHMARK_TEMPLATE(BM_Contended, std::mutex)
    ->Apply([](benchmark::internal::Benchmark* bm) {
      SetupBenchmarkArgs(bm, /*do_test_priorities=*/false);
    });

// Measure the overhead of conditions on mutex release (when they must be
// evaluated).  Mutex has (some) support for equivalence classes allowing
// Conditions with the same function/argument to potentially not be multiply
// evaluated.
//
// num_classes==0 is used for the special case of every waiter being distinct.
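// For example, two waiters that construct their Condition from the same
// function pointer and the same argument (illustratively, both calling
// absl::Condition(&IsZero, &x)) may fall into one equivalence class, so the
// condition need not be re-evaluated for every such waiter on unlock.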
void BM_ConditionWaiters(benchmark::State& state) {
  int num_classes = state.range(0);
  int num_waiters = state.range(1);

  struct Helper {
    static void Waiter(absl::BlockingCounter* init, absl::Mutex* m, int* p) {
      init->DecrementCount();
      m->LockWhen(absl::Condition(
          static_cast<bool (*)(int*)>([](int* v) { return *v == 0; }), p));
      m->Unlock();
    }
  };

  if (num_classes == 0) {
    // No equivalence classes.
    num_classes = num_waiters;
  }

  absl::BlockingCounter init(num_waiters);
  absl::Mutex mu;
  std::vector<int> equivalence_classes(num_classes, 1);

  // Must be declared last to be destroyed first.
  absl::synchronization_internal::ThreadPool pool(num_waiters);

  for (int i = 0; i < num_waiters; i++) {
    // Mutex considers Conditions with the same function and argument
    // to be equivalent.
    pool.Schedule([&, i] {
      Helper::Waiter(&init, &mu, &equivalence_classes[i % num_classes]);
    });
  }
  init.Wait();

  for (auto _ : state) {
    mu.Lock();
    mu.Unlock();  // Each unlock requires Condition evaluation for our waiters.
  }

  mu.Lock();
  for (int i = 0; i < num_classes; i++) {
    equivalence_classes[i] = 0;
  }
  mu.Unlock();
}

// Some configurations have higher thread limits than others.
#if defined(__linux__) && !defined(ABSL_HAVE_THREAD_SANITIZER)
constexpr int kMaxConditionWaiters = 8192;
#else
constexpr int kMaxConditionWaiters = 1024;
#endif
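// RangePair sweeps num_classes over [0, 2] and num_waiters over
// [1, kMaxConditionWaiters]; Google Benchmark expands each range
// geometrically (roughly in powers of its default range multiplier, 8),
// always including both endpoints.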
BENCHMARK(BM_ConditionWaiters)->RangePair(0, 2, 1, kMaxConditionWaiters);

}  // namespace