// Copyright 2017 The Abseil Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <atomic>
#include <cstdint>
#include <limits>
#include <mutex>  // NOLINT(build/c++11)
#include <vector>

#include "absl/base/config.h"
#include "absl/base/internal/cycleclock.h"
#include "absl/base/internal/raw_logging.h"
#include "absl/base/internal/spinlock.h"
#include "absl/base/internal/thread_identity.h"
#include "absl/base/no_destructor.h"
#include "absl/synchronization/blocking_counter.h"
#include "absl/synchronization/internal/create_thread_identity.h"
#include "absl/synchronization/internal/per_thread_sem.h"
#include "absl/synchronization/internal/thread_pool.h"
#include "absl/synchronization/mutex.h"
#include "benchmark/benchmark.h"

namespace {

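// Measures the cost of acquiring and releasing a Mutex with no work done
// while holding it: uncontended in the Threads(1) variant, contended on a
// single shared Mutex in the ThreadPerCpu variant.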
void BM_Mutex(benchmark::State& state) {
  static absl::NoDestructor<absl::Mutex> mu;
  for (auto _ : state) {
    absl::MutexLock lock(mu.get());
  }
}
BENCHMARK(BM_Mutex)->UseRealTime()->Threads(1)->ThreadPerCpu();

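// Same as BM_Mutex, but acquires the shared Mutex in reader (shared) mode.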
void BM_ReaderLock(benchmark::State& state) {
  static absl::NoDestructor<absl::Mutex> mu;
  for (auto _ : state) {
    absl::ReaderMutexLock lock(mu.get());
  }
}
BENCHMARK(BM_ReaderLock)->UseRealTime()->Threads(1)->ThreadPerCpu();

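// Registered single-threaded, so TryLock always succeeds; this measures the
// uncontended TryLock/Unlock fast path.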
void BM_TryLock(benchmark::State& state) {
  absl::Mutex mu;
  for (auto _ : state) {
    if (mu.TryLock()) {
      mu.Unlock();
    }
  }
}
BENCHMARK(BM_TryLock);

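// Like BM_TryLock, but takes a shared Mutex in reader mode. In the
// multi-threaded variants ReaderTryLock may occasionally fail, in which case
// the iteration simply skips the unlock.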
void BM_ReaderTryLock(benchmark::State& state) {
  static absl::NoDestructor<absl::Mutex> mu;
  for (auto _ : state) {
    if (mu->ReaderTryLock()) {
      mu->ReaderUnlock();
    }
  }
}
BENCHMARK(BM_ReaderTryLock)->UseRealTime()->Threads(1)->ThreadPerCpu();

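// Busy-spins for approximately `ns` nanoseconds. The deadline is computed by
// converting the delay to CycleClock ticks via CycleClock::Frequency()
// (ticks per second). Each pass increments *data under DoNotOptimize so the
// compiler cannot elide the loop.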
static void DelayNs(int64_t ns, int* data) {
  int64_t end = absl::base_internal::CycleClock::Now() +
                ns * absl::base_internal::CycleClock::Frequency() / 1e9;
  while (absl::base_internal::CycleClock::Now() < end) {
    ++(*data);
    benchmark::DoNotOptimize(*data);
  }
}

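// Minimal RAII lock guard for mutex types exposing Lock()/Unlock()
// (absl::Mutex and absl::base_internal::SpinLock here).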
template <typename MutexType>
class RaiiLocker {
 public:
  explicit RaiiLocker(MutexType* mu) : mu_(mu) { mu_->Lock(); }
  ~RaiiLocker() { mu_->Unlock(); }
 private:
  MutexType* mu_;
};

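// Specialization adapting std::mutex, whose members are lock()/unlock().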
template <>
class RaiiLocker<std::mutex> {
 public:
  explicit RaiiLocker(std::mutex* mu) : mu_(mu) { mu_->lock(); }
  ~RaiiLocker() { mu_->unlock(); }
 private:
  std::mutex* mu_;
};

// RAII object to change the Mutex priority of the running thread.
class ScopedThreadMutexPriority {
 public:
  explicit ScopedThreadMutexPriority(int priority) {
    absl::base_internal::ThreadIdentity* identity =
        absl::synchronization_internal::GetOrCreateCurrentThreadIdentity();
    identity->per_thread_synch.priority = priority;
    // Bump next_priority_read_cycles to the infinite future so that the
    // implementation doesn't re-read the thread's actual scheduler priority
    // and replace our temporary scoped priority.
    identity->per_thread_synch.next_priority_read_cycles =
        std::numeric_limits<int64_t>::max();
  }
  ~ScopedThreadMutexPriority() {
    // Reset the "next priority read time" back to the infinite past so that
    // the next time the Mutex implementation wants to know this thread's
    // priority, it re-reads it from the OS instead of using our overridden
    // priority.
    absl::synchronization_internal::GetOrCreateCurrentThreadIdentity()
        ->per_thread_synch.next_priority_read_cycles =
        std::numeric_limits<int64_t>::min();
  }
};

void BM_MutexEnqueue(benchmark::State& state) {
  // In the "multiple priorities" variant of the benchmark, one of the
  // threads runs with Mutex priority 0 while the rest run at elevated
  // priority. This benchmarks the performance impact of the presence of a
  // low-priority waiter when a higher-priority waiter adds itself to the
  // queue (b/175224064).
  //
  // NOTE: The actual scheduler priority is not modified in this benchmark:
  // all of the threads get CPU slices with the same priority. Only the
  // Mutex queueing behavior is modified.
  const bool multiple_priorities = state.range(0);
  ScopedThreadMutexPriority priority_setter(
      (multiple_priorities && state.thread_index() != 0) ? 1 : 0);

  struct Shared {
    absl::Mutex mu;
    std::atomic<int> looping_threads{0};
    std::atomic<int> blocked_threads{0};
    std::atomic<bool> thread_has_mutex{false};
  };
  static absl::NoDestructor<Shared> shared;

  // Set up 'blocked_threads' to count how many threads are currently blocked
  // in Abseil synchronization code.
  //
  // NOTE: Blocking done within the Google Benchmark library itself (e.g.
  // the barrier which synchronizes threads entering and exiting the benchmark
  // loop) does _not_ get registered in this counter. This is because Google
  // Benchmark uses its own synchronization primitives based on std::mutex,
  // not Abseil synchronization primitives. If at some point the benchmark
  // library merges into Abseil, this code may break.
  absl::synchronization_internal::PerThreadSem::SetThreadBlockedCounter(
      &shared->blocked_threads);

  // The benchmark framework may run several iterations in the same process,
  // reusing the same static-initialized 'shared' object. Given the semantics
  // of the members here, we expect everything to be reset to zero by the
  // end of any iteration. Assert that's the case, just to be sure.
  ABSL_RAW_CHECK(
      shared->looping_threads.load(std::memory_order_relaxed) == 0 &&
          shared->blocked_threads.load(std::memory_order_relaxed) == 0 &&
          !shared->thread_has_mutex.load(std::memory_order_relaxed),
      "Shared state isn't zeroed at start of benchmark iteration");

  static constexpr int kBatchSize = 1000;
  while (state.KeepRunningBatch(kBatchSize)) {
    shared->looping_threads.fetch_add(1);
    for (int i = 0; i < kBatchSize; i++) {
      {
        absl::MutexLock l(&shared->mu);
        shared->thread_has_mutex.store(true, std::memory_order_relaxed);
        // Spin until all other threads are either out of the benchmark loop
        // or blocked on the mutex. This ensures that the mutex queue is kept
        // at its maximal length to benchmark the performance of queueing on
        // a highly contended mutex.
        while (shared->looping_threads.load(std::memory_order_relaxed) -
                   shared->blocked_threads.load(std::memory_order_relaxed) !=
               1) {
        }
        shared->thread_has_mutex.store(false);
      }
      // Spin until some other thread has acquired the mutex before we block
      // again. This ensures that we always go through the slow (queueing)
      // acquisition path rather than reacquiring the mutex we just released.
      while (!shared->thread_has_mutex.load(std::memory_order_relaxed) &&
             shared->looping_threads.load(std::memory_order_relaxed) > 1) {
      }
    }
    // The benchmark framework uses a barrier to ensure that all of the
    // threads complete their benchmark loop together before any of the
    // threads exit the loop. So, we need to remove ourselves from the
    // "looping threads" counter here before potentially blocking on that
    // barrier. Otherwise, another thread spinning above might wait forever
    // for this thread to block on the mutex while we in fact are waiting to
    // exit.
    shared->looping_threads.fetch_add(-1);
  }
  absl::synchronization_internal::PerThreadSem::SetThreadBlockedCounter(
      nullptr);
}

BENCHMARK(BM_MutexEnqueue)
    ->Threads(4)
    ->Threads(64)
    ->Threads(128)
    ->Threads(512)
    ->ArgName("multiple_priorities")
    ->Arg(false)
    ->Arg(true);

template <typename MutexType>
void BM_Contended(benchmark::State& state) {
  int priority = state.thread_index() % state.range(1);
  ScopedThreadMutexPriority priority_setter(priority);

  struct Shared {
    MutexType mu;
    int data = 0;
  };
  static absl::NoDestructor<Shared> shared;
  int local = 0;
  for (auto _ : state) {
    // Here we model both local work outside of the critical section as well
    // as some work inside of the critical section. The idea is to capture
    // more or less realistic contention levels.
    // If contention is too low, the benchmark won't measure anything useful.
    // If contention is unrealistically high, the benchmark will favor
    // bad mutex implementations that block and otherwise distract threads
    // from the mutex and shared state as much as possible.
    // To achieve this, the amount of local work is multiplied by the number
    // of threads to keep the ratio between local work and the critical
    // section roughly constant regardless of the number of threads.
    DelayNs(100 * state.threads(), &local);
    RaiiLocker<MutexType> locker(&shared->mu);
    DelayNs(state.range(0), &shared->data);
  }
}
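
// Registers the argument grid for the contended benchmarks below: a sweep of
// thread counts crossed with (critical-section length in ns, number of
// distinct Mutex priorities) pairs.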
void SetupBenchmarkArgs(benchmark::internal::Benchmark* bm,
                        bool do_test_priorities) {
  const int max_num_priorities = do_test_priorities ? 2 : 1;
  bm->UseRealTime()
      // ThreadPerCpu poorly handles non-power-of-two CPU counts.
      ->Threads(1)
      ->Threads(2)
      ->Threads(4)
      ->Threads(6)
      ->Threads(8)
      ->Threads(12)
      ->Threads(16)
      ->Threads(24)
      ->Threads(32)
      ->Threads(48)
      ->Threads(64)
      ->Threads(96)
      ->Threads(128)
      ->Threads(192)
      ->Threads(256)
      ->ArgNames({"cs_ns", "num_prios"});
  // Some empirically chosen amounts of work in the critical section:
  // 1 ns is low contention, 2000 ns is high contention, with a few values
  // in between.
  for (int critical_section_ns : {1, 20, 50, 200, 2000}) {
    for (int num_priorities = 1; num_priorities <= max_num_priorities;
         num_priorities++) {
      bm->ArgPair(critical_section_ns, num_priorities);
    }
  }
}

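// The registrations below produce benchmark names along the lines of
//   BM_Contended<absl::Mutex>/cs_ns:200/num_prios:2/real_time/threads:64
// (the exact naming is up to the benchmark library); individual cases can be
// selected with the library's --benchmark_filter flag.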
BENCHMARK_TEMPLATE(BM_Contended, absl::Mutex)
    ->Apply([](benchmark::internal::Benchmark* bm) {
      SetupBenchmarkArgs(bm, /*do_test_priorities=*/true);
    });

BENCHMARK_TEMPLATE(BM_Contended, absl::base_internal::SpinLock)
    ->Apply([](benchmark::internal::Benchmark* bm) {
      SetupBenchmarkArgs(bm, /*do_test_priorities=*/false);
    });

BENCHMARK_TEMPLATE(BM_Contended, std::mutex)
    ->Apply([](benchmark::internal::Benchmark* bm) {
      SetupBenchmarkArgs(bm, /*do_test_priorities=*/false);
    });

// Measure the overhead of conditions on mutex release (when they must be
// evaluated). Mutex has (some) support for equivalence classes, allowing
// Conditions with the same function and argument to avoid being evaluated
// multiple times.
//
// num_classes == 0 is used for the special case of every waiter being
// distinct.
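//
// For example (a sketch; `Pred` and `x` are illustrative names), these two
// Conditions fall into the same equivalence class because they use the same
// function pointer and argument:
//
//   bool Pred(int* v);
//   int x;
//   absl::Condition c1(&Pred, &x);
//   absl::Condition c2(&Pred, &x);  // Same class as c1.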
void BM_ConditionWaiters(benchmark::State& state) {
  int num_classes = state.range(0);
  int num_waiters = state.range(1);

  struct Helper {
    static void Waiter(absl::BlockingCounter* init, absl::Mutex* m, int* p) {
      init->DecrementCount();
      m->LockWhen(absl::Condition(
          static_cast<bool (*)(int*)>([](int* v) { return *v == 0; }), p));
      m->Unlock();
    }
  };

  if (num_classes == 0) {
    // No equivalence classes.
    num_classes = num_waiters;
  }

  absl::BlockingCounter init(num_waiters);
  absl::Mutex mu;
  std::vector<int> equivalence_classes(num_classes, 1);

  // Must be declared last to be destroyed first.
  absl::synchronization_internal::ThreadPool pool(num_waiters);

  for (int i = 0; i < num_waiters; i++) {
    // Mutex considers Conditions with the same function and argument
    // to be equivalent.
    pool.Schedule([&, i] {
      Helper::Waiter(&init, &mu, &equivalence_classes[i % num_classes]);
    });
  }
  init.Wait();

  for (auto _ : state) {
    mu.Lock();
    mu.Unlock();  // Each unlock requires Condition evaluation for our waiters.
  }

  mu.Lock();
  for (int i = 0; i < num_classes; i++) {
    equivalence_classes[i] = 0;
  }
  mu.Unlock();
}

// Some configurations have higher thread limits than others.
#if defined(__linux__) && !defined(ABSL_HAVE_THREAD_SANITIZER)
constexpr int kMaxConditionWaiters = 8192;
#else
constexpr int kMaxConditionWaiters = 1024;
#endif
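// RangePair sweeps the two arguments (num_classes, num_waiters) between the
// bounds given below; the benchmark library expands each range
// multiplicatively (in powers of 8 by default) between its endpoints.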
BENCHMARK(BM_ConditionWaiters)->RangePair(0, 2, 1, kMaxConditionWaiters);

}  // namespace