xref: /aosp_15_r20/external/webrtc/third_party/abseil-cpp/absl/synchronization/mutex_benchmark.cc (revision d9f758449e529ab9291ac668be2861e7a55c2422)
1 // Copyright 2017 The Abseil Authors.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      https://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
#include <atomic>
#include <cstdint>
#include <limits>
#include <mutex>  // NOLINT(build/c++11)
#include <vector>

#include "absl/base/config.h"
#include "absl/base/internal/cycleclock.h"
#include "absl/base/internal/spinlock.h"
#include "absl/synchronization/blocking_counter.h"
#include "absl/synchronization/internal/thread_pool.h"
#include "absl/synchronization/mutex.h"
#include "benchmark/benchmark.h"
26 
27 namespace {
28 
BM_Mutex(benchmark::State & state)29 void BM_Mutex(benchmark::State& state) {
30   static absl::Mutex* mu = new absl::Mutex;
31   for (auto _ : state) {
32     absl::MutexLock lock(mu);
33   }
34 }
35 BENCHMARK(BM_Mutex)->UseRealTime()->Threads(1)->ThreadPerCpu();
36 
DelayNs(int64_t ns,int * data)37 static void DelayNs(int64_t ns, int* data) {
38   int64_t end = absl::base_internal::CycleClock::Now() +
39                 ns * absl::base_internal::CycleClock::Frequency() / 1e9;
40   while (absl::base_internal::CycleClock::Now() < end) {
41     ++(*data);
42     benchmark::DoNotOptimize(*data);
43   }
44 }
45 
// Minimal scope guard: acquires `mu` on construction and releases it on
// destruction. The generic version targets Abseil-style mutexes exposing
// Lock()/Unlock(); the specialization below adapts to std::mutex's
// lock()/unlock() naming.
template <typename MutexType>
class RaiiLocker {
 public:
  explicit RaiiLocker(MutexType* mu) : held_(mu) { held_->Lock(); }
  ~RaiiLocker() { held_->Unlock(); }

 private:
  MutexType* held_;
};

// std::mutex spells its operations in lowercase, so it needs its own guard.
template <>
class RaiiLocker<std::mutex> {
 public:
  explicit RaiiLocker(std::mutex* mu) : held_(mu) { held_->lock(); }
  ~RaiiLocker() { held_->unlock(); }

 private:
  std::mutex* held_;
};
63 
// RAII object to change the Mutex priority of the running thread.
//
// Only the priority stored in the thread's Abseil ThreadIdentity is touched;
// the OS scheduler priority is left alone. This influences absl::Mutex
// queueing decisions for this thread (see BM_MutexEnqueue below).
class ScopedThreadMutexPriority {
 public:
  explicit ScopedThreadMutexPriority(int priority) {
    absl::base_internal::ThreadIdentity* identity =
        absl::synchronization_internal::GetOrCreateCurrentThreadIdentity();
    identity->per_thread_synch.priority = priority;
    // Bump next_priority_read_cycles to the infinite future so that the
    // implementation doesn't re-read the thread's actual scheduler priority
    // and replace our temporary scoped priority.
    identity->per_thread_synch.next_priority_read_cycles =
        std::numeric_limits<int64_t>::max();
  }
  ~ScopedThreadMutexPriority() {
    // Reset the "next priority read time" back to the infinite past so that
    // the next time the Mutex implementation wants to know this thread's
    // priority, it re-reads it from the OS instead of using our overridden
    // priority.
    absl::synchronization_internal::GetOrCreateCurrentThreadIdentity()
        ->per_thread_synch.next_priority_read_cycles =
        std::numeric_limits<int64_t>::min();
  }
};
87 
// Benchmarks the slow (queueing) path of absl::Mutex acquisition: every
// iteration each thread enqueues itself behind a held mutex while all other
// participants are already blocked on it.
void BM_MutexEnqueue(benchmark::State& state) {
  // In the "multiple priorities" variant of the benchmark, one of the
  // threads runs with Mutex priority 0 while the rest run at elevated priority.
  // This benchmarks the performance impact of the presence of a low priority
  // waiter when a higher priority waiter adds itself of the queue
  // (b/175224064).
  //
  // NOTE: The actual scheduler priority is not modified in this benchmark:
  // all of the threads get CPU slices with the same priority. Only the
  // Mutex queueing behavior is modified.
  const bool multiple_priorities = state.range(0);
  ScopedThreadMutexPriority priority_setter(
      (multiple_priorities && state.thread_index() != 0) ? 1 : 0);

  // State shared by all benchmark threads; leaked so it outlives them.
  struct Shared {
    absl::Mutex mu;
    std::atomic<int> looping_threads{0};   // threads inside the batch loop
    std::atomic<int> blocked_threads{0};   // threads blocked in absl sync code
    std::atomic<bool> thread_has_mutex{false};
  };
  static Shared* shared = new Shared;

  // Set up 'blocked_threads' to count how many threads are currently blocked
  // in Abseil synchronization code.
  //
  // NOTE: Blocking done within the Google Benchmark library itself (e.g.
  // the barrier which synchronizes threads entering and exiting the benchmark
  // loop) does _not_ get registered in this counter. This is because Google
  // Benchmark uses its own synchronization primitives based on std::mutex, not
  // Abseil synchronization primitives. If at some point the benchmark library
  // merges into Abseil, this code may break.
  absl::synchronization_internal::PerThreadSem::SetThreadBlockedCounter(
      &shared->blocked_threads);

  // The benchmark framework may run several iterations in the same process,
  // reusing the same static-initialized 'shared' object. Given the semantics
  // of the members, here, we expect everything to be reset to zero by the
  // end of any iteration. Assert that's the case, just to be sure.
  ABSL_RAW_CHECK(
      shared->looping_threads.load(std::memory_order_relaxed) == 0 &&
          shared->blocked_threads.load(std::memory_order_relaxed) == 0 &&
          !shared->thread_has_mutex.load(std::memory_order_relaxed),
      "Shared state isn't zeroed at start of benchmark iteration");

  static constexpr int kBatchSize = 1000;
  while (state.KeepRunningBatch(kBatchSize)) {
    shared->looping_threads.fetch_add(1);
    for (int i = 0; i < kBatchSize; i++) {
      {
        absl::MutexLock l(&shared->mu);
        shared->thread_has_mutex.store(true, std::memory_order_relaxed);
        // Spin until all other threads are either out of the benchmark loop
        // or blocked on the mutex. This ensures that the mutex queue is kept
        // at its maximal length to benchmark the performance of queueing on
        // a highly contended mutex.
        while (shared->looping_threads.load(std::memory_order_relaxed) -
                   shared->blocked_threads.load(std::memory_order_relaxed) !=
               1) {
        }
        // NOTE(review): this store uses the default seq_cst ordering while the
        // store(true) above is relaxed — presumably just conservative; confirm
        // before weakening.
        shared->thread_has_mutex.store(false);
      }
      // Spin until some other thread has acquired the mutex before we block
      // again. This ensures that we always go through the slow (queueing)
      // acquisition path rather than reacquiring the mutex we just released.
      while (!shared->thread_has_mutex.load(std::memory_order_relaxed) &&
             shared->looping_threads.load(std::memory_order_relaxed) > 1) {
      }
    }
    // The benchmark framework uses a barrier to ensure that all of the threads
    // complete their benchmark loop together before any of the threads exit
    // the loop. So, we need to remove ourselves from the "looping threads"
    // counter here before potentially blocking on that barrier. Otherwise,
    // another thread spinning above might wait forever for this thread to
    // block on the mutex while we in fact are waiting to exit.
    shared->looping_threads.fetch_add(-1);
  }
  // Detach the per-thread blocked counter before leaving the benchmark.
  absl::synchronization_internal::PerThreadSem::SetThreadBlockedCounter(
      nullptr);
}
167 
// Run the queueing benchmark at several contention levels, both with a
// single Mutex priority (Arg(false)) and with one low-priority waiter among
// elevated-priority ones (Arg(true)).
BENCHMARK(BM_MutexEnqueue)
    ->Threads(4)
    ->Threads(64)
    ->Threads(128)
    ->Threads(512)
    ->ArgName("multiple_priorities")
    ->Arg(false)
    ->Arg(true);
176 
177 template <typename MutexType>
BM_Contended(benchmark::State & state)178 void BM_Contended(benchmark::State& state) {
179   int priority = state.thread_index() % state.range(1);
180   ScopedThreadMutexPriority priority_setter(priority);
181 
182   struct Shared {
183     MutexType mu;
184     int data = 0;
185   };
186   static auto* shared = new Shared;
187   int local = 0;
188   for (auto _ : state) {
189     // Here we model both local work outside of the critical section as well as
190     // some work inside of the critical section. The idea is to capture some
191     // more or less realisitic contention levels.
192     // If contention is too low, the benchmark won't measure anything useful.
193     // If contention is unrealistically high, the benchmark will favor
194     // bad mutex implementations that block and otherwise distract threads
195     // from the mutex and shared state for as much as possible.
196     // To achieve this amount of local work is multiplied by number of threads
197     // to keep ratio between local work and critical section approximately
198     // equal regardless of number of threads.
199     DelayNs(100 * state.threads(), &local);
200     RaiiLocker<MutexType> locker(&shared->mu);
201     DelayNs(state.range(0), &shared->data);
202   }
203 }
SetupBenchmarkArgs(benchmark::internal::Benchmark * bm,bool do_test_priorities)204 void SetupBenchmarkArgs(benchmark::internal::Benchmark* bm,
205                         bool do_test_priorities) {
206   const int max_num_priorities = do_test_priorities ? 2 : 1;
207   bm->UseRealTime()
208       // ThreadPerCpu poorly handles non-power-of-two CPU counts.
209       ->Threads(1)
210       ->Threads(2)
211       ->Threads(4)
212       ->Threads(6)
213       ->Threads(8)
214       ->Threads(12)
215       ->Threads(16)
216       ->Threads(24)
217       ->Threads(32)
218       ->Threads(48)
219       ->Threads(64)
220       ->Threads(96)
221       ->Threads(128)
222       ->Threads(192)
223       ->Threads(256)
224       ->ArgNames({"cs_ns", "num_prios"});
225   // Some empirically chosen amounts of work in critical section.
226   // 1 is low contention, 2000 is high contention and few values in between.
227   for (int critical_section_ns : {1, 20, 50, 200, 2000}) {
228     for (int num_priorities = 1; num_priorities <= max_num_priorities;
229          num_priorities++) {
230       bm->ArgPair(critical_section_ns, num_priorities);
231     }
232   }
233 }
234 
// Register BM_Contended for each mutex type. Priority variation is only
// exercised for absl::Mutex (do_test_priorities=true below); the other two
// types get a single priority class.
BENCHMARK_TEMPLATE(BM_Contended, absl::Mutex)
    ->Apply([](benchmark::internal::Benchmark* bm) {
      SetupBenchmarkArgs(bm, /*do_test_priorities=*/true);
    });

BENCHMARK_TEMPLATE(BM_Contended, absl::base_internal::SpinLock)
    ->Apply([](benchmark::internal::Benchmark* bm) {
      SetupBenchmarkArgs(bm, /*do_test_priorities=*/false);
    });

BENCHMARK_TEMPLATE(BM_Contended, std::mutex)
    ->Apply([](benchmark::internal::Benchmark* bm) {
      SetupBenchmarkArgs(bm, /*do_test_priorities=*/false);
    });
249 
// Measure the overhead of conditions on mutex release (when they must be
// evaluated).  Mutex has (some) support for equivalence classes allowing
// Conditions with the same function/argument to potentially not be multiply
// evaluated.
//
// num_classes==0 is used for the special case of every waiter being distinct.
void BM_ConditionWaiters(benchmark::State& state) {
  int num_classes = state.range(0);
  int num_waiters = state.range(1);

  struct Helper {
    // Signals `init` that this waiter has started, then blocks on `m` until
    // *p == 0.
    static void Waiter(absl::BlockingCounter* init, absl::Mutex* m, int* p) {
      init->DecrementCount();
      m->LockWhen(absl::Condition(
          static_cast<bool (*)(int*)>([](int* v) { return *v == 0; }), p));
      m->Unlock();
    }
  };

  if (num_classes == 0) {
    // No equivalence classes.
    num_classes = num_waiters;
  }

  absl::BlockingCounter init(num_waiters);
  absl::Mutex mu;
  // One condition variable slot per class; 1 keeps the waiters blocked.
  std::vector<int> equivalence_classes(num_classes, 1);

  // Must be declared last to be destroyed first, so the pool's threads are
  // joined before `mu` and `equivalence_classes` go away.
  absl::synchronization_internal::ThreadPool pool(num_waiters);

  for (int i = 0; i < num_waiters; i++) {
    // Mutex considers Conditions with the same function and argument
    // to be equivalent.
    pool.Schedule([&, i] {
      Helper::Waiter(&init, &mu, &equivalence_classes[i % num_classes]);
    });
  }
  // Wait until every waiter has at least started (DecrementCount precedes
  // LockWhen, so some may not be queued on `mu` yet).
  init.Wait();

  for (auto _ : state) {
    mu.Lock();
    mu.Unlock();  // Each unlock requires Condition evaluation for our waiters.
  }

  // Make every condition true so all waiters can wake and exit; the pool
  // destructor then joins them.
  mu.Lock();
  for (int i = 0; i < num_classes; i++) {
    equivalence_classes[i] = 0;
  }
  mu.Unlock();
}
301 
// Some configurations have higher thread limits than others.
#if defined(__linux__) && !defined(ABSL_HAVE_THREAD_SANITIZER)
constexpr int kMaxConditionWaiters = 8192;
#else
constexpr int kMaxConditionWaiters = 1024;
#endif
// Sweeps num_classes over [0, 2] and num_waiters from 1 up to
// kMaxConditionWaiters (geometric steps); num_classes == 0 means every
// waiter is distinct.
BENCHMARK(BM_ConditionWaiters)->RangePair(0, 2, 1, kMaxConditionWaiters);
309 
310 }  // namespace
311