// Copyright 2023 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#![allow(missing_docs, unused_results, clippy::unwrap_used)]

use criterion::{black_box, criterion_group, criterion_main, Criterion};
use handle_map::*;
use std::sync::Arc;
use std::thread;
use std::time::{Duration, Instant};

// For the sake of these benchmarks, we just want
// to be able to allocate an almost-arbitrary number of handles
// if needed by the particular benchmark.
const MAX_ACTIVE_HANDLES: u32 = u32::MAX - 1;

// The default number of threads+shards to use
// in multi-threaded benchmarks, chosen
// to represent a typical multi-threaded use-case.
const DEFAULT_SHARDS_AND_THREADS: u8 = 8;

/// We're using u8's for the wrapped object type to deliberately ignore
/// costs related to accessing the underlying data - we just care about
/// benchmarking handle-map operations.
type BenchHandleMap = HandleMap<u8>;

fn build_handle_map(num_shards: u8) -> BenchHandleMap {
    let dimensions = HandleMapDimensions {
        num_shards,
        max_active_handles: MAX_ACTIVE_HANDLES,
    };
    HandleMap::with_dimensions(dimensions)
}

/// Benchmark for repeated reads on a single thread
fn single_threaded_read_benchmark(c: &mut Criterion) {
    // The number of shards should not matter much here.
    let handle_map = build_handle_map(8);
    // Pre-populate the map with a single handle.
    let handle = handle_map.allocate(|| 0xFF).unwrap();
    let handle_map_ref = &handle_map;
    // Perform repeated reads
    let _ = c.bench_function("single-threaded reads", |b| {
        b.iter(|| {
            let guard = handle_map_ref.get(black_box(handle)).unwrap();
            black_box(*guard)
        })
    });
}

/// Benchmark for repeated writes on a single thread
fn single_threaded_write_benchmark(c: &mut Criterion) {
    // The number of shards should not matter much here.
    let handle_map = build_handle_map(8);
    // Pre-populate the map with a single handle.
    let handle = handle_map.allocate(|| 0xFF).unwrap();
    let handle_map_ref = &handle_map;
    // Perform repeated writes
    c.bench_function("single-threaded writes", |b| {
        b.iter(|| {
            let mut guard = handle_map_ref.get_mut(black_box(handle)).unwrap();
            *guard *= black_box(0);
        })
    });
}

/// Benchmark for repeated allocations on a single thread
#[allow(unused_must_use)]
fn single_threaded_allocate_benchmark(c: &mut Criterion) {
    // The number of shards here is chosen to be quasi-reasonable.
    // Realistically, performance will fluctuate somewhat with this
    // due to the difference in the number of internal hash-map collisions,
    // but ultimately we only care about the asymptotic behavior.
    let handle_map = build_handle_map(8);
    let handle_map_ref = &handle_map;
    // Perform repeated allocations
    c.bench_function("single-threaded allocations", |b| {
        b.iter(|| {
            handle_map_ref.allocate(|| black_box(0xEE));
        })
    });
}

/// Benchmark for repeated allocation->deallocation on a single thread
/// starting from an empty map.
#[allow(unused_must_use)]
fn single_threaded_allocate_deallocate_near_empty_benchmark(c: &mut Criterion) {
    // The number of shards should not matter much here.
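    // Each iteration allocates a handle and immediately deallocates it, so the map
    // stays (nearly) empty for the whole run; this isolates the fixed per-operation
    // cost from the occupancy-dependent costs measured by the two benchmarks below.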
    let handle_map = build_handle_map(8);
    let handle_map_ref = &handle_map;
    // Perform repeated allocation/deallocation pairs
    c.bench_function(
        "single-threaded allocate/deallocate pairs (empty init state)",
        |b| {
            b.iter(|| {
                let handle = handle_map_ref.allocate(|| black_box(0xDD)).unwrap();
                handle_map_ref.deallocate(handle);
            })
        },
    );
}

/// Benchmark for repeated allocation->deallocation starting
/// from a map with a thousand elements per shard.
#[allow(unused_must_use)]
fn single_threaded_allocate_deallocate_one_thousand_benchmark(c: &mut Criterion) {
    let handle_map = build_handle_map(8);
    for _ in 0..(8 * 1000) {
        handle_map.allocate(|| black_box(0xFF)).unwrap();
    }
    let handle_map_ref = &handle_map;
    // Perform repeated allocation/deallocation pairs
    c.bench_function(
        "single-threaded allocate/deallocate pairs (init one thousand elements per shard)",
        |b| {
            b.iter(|| {
                let handle = handle_map_ref.allocate(|| black_box(0xDD)).unwrap();
                handle_map_ref.deallocate(handle);
            })
        },
    );
}

/// Benchmark for repeated allocation->deallocation starting
/// from a map with a million elements per shard.
#[allow(unused_must_use)]
fn single_threaded_allocate_deallocate_one_million_benchmark(c: &mut Criterion) {
    let handle_map = build_handle_map(8);
    for _ in 0..(8 * 1000000) {
        handle_map.allocate(|| black_box(0xFF)).unwrap();
    }
    let handle_map_ref = &handle_map;
    // Perform repeated allocation/deallocation pairs
    c.bench_function(
        "single-threaded allocate/deallocate pairs (init one million elements per shard)",
        |b| {
            b.iter(|| {
                let handle = handle_map_ref.allocate(|| black_box(0xDD)).unwrap();
                handle_map_ref.deallocate(handle);
            })
        },
    );
}

/// Helper function for creating a multi-threaded benchmark
/// with the set number of threads. Untimed initialization of state
/// should occur prior to this call.
/// This function will repeatedly execute the benchmark code on all threads.
fn bench_multithreaded<F>(
    name: &str,
    num_threads: u8,
    benchable_fn_ref: Arc<F>,
    c: &mut Criterion,
) where
    F: Fn() + Send + Sync + 'static,
{
    c.bench_function(name, move |b| {
        b.iter_custom(|iters| {
            // The returned results from the threads will be their timings
            let mut join_handles = Vec::new();
            for _ in 0..num_threads {
                let benchable_fn_ref_clone = benchable_fn_ref.clone();
                let join_handle = thread::spawn(move || {
                    let start = Instant::now();
                    for _ in 0..iters {
                        benchable_fn_ref_clone();
                    }
                    start.elapsed()
                });
                join_handles.push(join_handle);
            }
            // Threads spawned, now collect the results
            let mut total_duration = Duration::from_nanos(0);
            for join_handle in join_handles {
                let thread_duration = join_handle.join().unwrap();
                total_duration += thread_duration;
            }
            // Report the average per-thread duration, so the result stays comparable
            // to a single-threaded measurement of `iters` iterations.
            total_duration /= num_threads as u32;
            total_duration
        })
    });
}

/// Defines a number of reads/writes to perform [relative
/// to other operations performed in a given test]
#[derive(Debug, Clone, Copy)]
struct ReadWriteCount {
    num_reads: usize,
    num_writes: usize,
}

const READS_ONLY: ReadWriteCount = ReadWriteCount {
    num_reads: 4,
    num_writes: 0,
};

const READ_LEANING: ReadWriteCount = ReadWriteCount {
    num_reads: 3,
    num_writes: 1,
};

const BALANCED: ReadWriteCount = ReadWriteCount {
    num_reads: 2,
    num_writes: 2,
};

const WRITE_LEANING: ReadWriteCount = ReadWriteCount {
    num_reads: 1,
    num_writes: 3,
};

const WRITES_ONLY: ReadWriteCount = ReadWriteCount {
    num_reads: 0,
    num_writes: 4,
};

const READ_WRITE_COUNTS: [ReadWriteCount; 5] = [
    READS_ONLY,
    READ_LEANING,
    BALANCED,
    WRITE_LEANING,
    WRITES_ONLY,
];

/// Benchmarks a repeated allocate/[X writes]/[Y reads]/deallocate workflow across
/// the default number of threads and shards.
fn lifecycle_read_and_write_multithreaded(per_object_rw_count: ReadWriteCount, c: &mut Criterion) {
    let name = format!(
        "lifecycle_read_and_write_multithreaded(reads/object: {}, writes/object: {})",
        per_object_rw_count.num_reads, per_object_rw_count.num_writes
    );
    // We need an Arc to be able to pass this off to `bench_multithreaded`.
    let handle_map = Arc::new(build_handle_map(DEFAULT_SHARDS_AND_THREADS));
    bench_multithreaded(
        &name,
        DEFAULT_SHARDS_AND_THREADS,
        Arc::new(move || {
            let handle = handle_map.allocate(|| black_box(0xFF)).unwrap();
            for _ in 0..per_object_rw_count.num_writes {
                let mut write_guard = handle_map.get_mut(handle).unwrap();
                *write_guard *= black_box(1);
            }
            for _ in 0..per_object_rw_count.num_reads {
                // Reads use the shared (non-mut) accessor.
                let read_guard = handle_map.get(handle).unwrap();
                black_box(*read_guard);
            }
            handle_map.deallocate(handle).unwrap();
        }),
        c,
    );
}

fn lifecycle_benchmarks(c: &mut Criterion) {
    for read_write_count in READ_WRITE_COUNTS {
        // Using 8 separate shards to roughly match the number of active threads.
        lifecycle_read_and_write_multithreaded(read_write_count, c);
    }
}

/// Benchmarks repeated cycles of accessing the given number of distinct objects, each
/// with the given number of reads and writes across the given number of threads.
fn read_and_write_contention_multithreaded(
    num_threads: u8,
    num_distinct_objects: u32,
    per_object_rw_count: ReadWriteCount,
    c: &mut Criterion,
) {
    let name = format!(
        "read_and_write_contention_multithreaded(threads: {}, objects: {}, reads/object: {}, writes/object: {})",
        num_threads,
        num_distinct_objects,
        per_object_rw_count.num_reads,
        per_object_rw_count.num_writes
    );
    // Default to 8 shards; this ultimately doesn't matter much since we only ever access a few shards.
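    // Note: the handles are allocated once, up front, and then shared by every worker
    // thread; since the callers below always use fewer distinct objects than threads,
    // the timed loop should exercise contention on the same underlying entries.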
    // This needs to be `'static` in order to pass a ref off to `bench_multithreaded`.
    let handle_map = Arc::new(build_handle_map(8));
    let mut handles = Vec::new();
    for i in 0..num_distinct_objects {
        let handle = handle_map.allocate(|| i as u8).unwrap();
        handles.push(handle);
    }
    bench_multithreaded(
        &name,
        num_threads,
        Arc::new(move || {
            // Deliberately performing all writes first, with
            // the goal of getting staggered timings from
            // all of the threads for subsequent iterations.
            // We also perform reads and writes across all objects
            // in batches to get something resembling uniform access.
            for _ in 0..per_object_rw_count.num_writes {
                for handle in handles.iter() {
                    let mut write_guard = handle_map.get_mut(*handle).unwrap();
                    *write_guard *= black_box(1);
                }
            }
            for _ in 0..per_object_rw_count.num_reads {
                for handle in handles.iter() {
                    let read_guard = handle_map.get(*handle).unwrap();
                    black_box(*read_guard);
                }
            }
        }),
        c,
    );
}

fn read_write_contention_benchmarks(c: &mut Criterion) {
    for num_threads in [2, 4, 8].iter() {
        for num_distinct_objects in [1u32, 2u32, 4u32].iter() {
            // Only benchmark cases with fewer distinct objects than threads,
            // since those are the cases where accesses actually contend.
            if num_distinct_objects >= &(*num_threads as u32) {
                continue;
            }
            for read_write_count in READ_WRITE_COUNTS {
                read_and_write_contention_multithreaded(
                    *num_threads,
                    *num_distinct_objects,
                    read_write_count,
                    c,
                );
            }
        }
    }
}

/// Benchmarks allocation (starting from empty) for the default number of shards and threads
fn allocator_contention_benchmark(c: &mut Criterion) {
    let name = "allocator_contention_multithreaded";
    // We need an Arc to pass this off to `bench_multithreaded`
    let handle_map = Arc::new(build_handle_map(DEFAULT_SHARDS_AND_THREADS));
    bench_multithreaded(
        name,
        DEFAULT_SHARDS_AND_THREADS,
        Arc::new(move || {
            handle_map.allocate(|| black_box(0xFF)).unwrap();
        }),
        c,
    );
}

criterion_group!(
    benches,
    single_threaded_read_benchmark,
    single_threaded_write_benchmark,
    single_threaded_allocate_benchmark,
    single_threaded_allocate_deallocate_near_empty_benchmark,
    single_threaded_allocate_deallocate_one_thousand_benchmark,
    single_threaded_allocate_deallocate_one_million_benchmark,
    lifecycle_benchmarks,
    read_write_contention_benchmarks,
    allocator_contention_benchmark,
);
criterion_main!(benches);
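
// Usage sketch (assumes the usual Criterion setup, i.e. this file registered as a
// `[[bench]]` target with `harness = false` in Cargo.toml): run the full suite with
// `cargo bench`, or a single benchmark by name, e.g. `cargo bench -- "single-threaded reads"`.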