xref: /aosp_15_r20/external/abseil-cpp/absl/random/internal/nanobenchmark.cc (revision 9356374a3709195abf420251b3e825997ff56c0f)
1*9356374aSAndroid Build Coastguard Worker // Copyright 2017 Google Inc. All Rights Reserved.
2*9356374aSAndroid Build Coastguard Worker //
3*9356374aSAndroid Build Coastguard Worker // Licensed under the Apache License, Version 2.0 (the "License");
4*9356374aSAndroid Build Coastguard Worker // you may not use this file except in compliance with the License.
5*9356374aSAndroid Build Coastguard Worker // You may obtain a copy of the License at
6*9356374aSAndroid Build Coastguard Worker //
7*9356374aSAndroid Build Coastguard Worker //     https://www.apache.org/licenses/LICENSE-2.0
8*9356374aSAndroid Build Coastguard Worker //
9*9356374aSAndroid Build Coastguard Worker // Unless required by applicable law or agreed to in writing, software
10*9356374aSAndroid Build Coastguard Worker // distributed under the License is distributed on an "AS IS" BASIS,
11*9356374aSAndroid Build Coastguard Worker // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12*9356374aSAndroid Build Coastguard Worker // See the License for the specific language governing permissions and
13*9356374aSAndroid Build Coastguard Worker // limitations under the License.
14*9356374aSAndroid Build Coastguard Worker 
15*9356374aSAndroid Build Coastguard Worker #include "absl/random/internal/nanobenchmark.h"
16*9356374aSAndroid Build Coastguard Worker 
17*9356374aSAndroid Build Coastguard Worker #include <sys/types.h>
18*9356374aSAndroid Build Coastguard Worker 
19*9356374aSAndroid Build Coastguard Worker #include <algorithm>  // sort
20*9356374aSAndroid Build Coastguard Worker #include <atomic>
21*9356374aSAndroid Build Coastguard Worker #include <cstddef>
22*9356374aSAndroid Build Coastguard Worker #include <cstdint>
23*9356374aSAndroid Build Coastguard Worker #include <cstdlib>
24*9356374aSAndroid Build Coastguard Worker #include <cstring>  // memcpy
25*9356374aSAndroid Build Coastguard Worker #include <limits>
26*9356374aSAndroid Build Coastguard Worker #include <string>
27*9356374aSAndroid Build Coastguard Worker #include <utility>
28*9356374aSAndroid Build Coastguard Worker #include <vector>
29*9356374aSAndroid Build Coastguard Worker 
30*9356374aSAndroid Build Coastguard Worker #include "absl/base/attributes.h"
31*9356374aSAndroid Build Coastguard Worker #include "absl/base/internal/raw_logging.h"
32*9356374aSAndroid Build Coastguard Worker #include "absl/random/internal/platform.h"
33*9356374aSAndroid Build Coastguard Worker #include "absl/random/internal/randen_engine.h"
34*9356374aSAndroid Build Coastguard Worker 
35*9356374aSAndroid Build Coastguard Worker // OS
36*9356374aSAndroid Build Coastguard Worker #if defined(_WIN32) || defined(_WIN64)
37*9356374aSAndroid Build Coastguard Worker #define ABSL_OS_WIN
38*9356374aSAndroid Build Coastguard Worker #include <windows.h>  // NOLINT
39*9356374aSAndroid Build Coastguard Worker 
40*9356374aSAndroid Build Coastguard Worker #elif defined(__ANDROID__)
41*9356374aSAndroid Build Coastguard Worker #define ABSL_OS_ANDROID
42*9356374aSAndroid Build Coastguard Worker 
43*9356374aSAndroid Build Coastguard Worker #elif defined(__linux__)
44*9356374aSAndroid Build Coastguard Worker #define ABSL_OS_LINUX
45*9356374aSAndroid Build Coastguard Worker #include <sched.h>        // NOLINT
46*9356374aSAndroid Build Coastguard Worker #include <sys/syscall.h>  // NOLINT
47*9356374aSAndroid Build Coastguard Worker #endif
48*9356374aSAndroid Build Coastguard Worker 
49*9356374aSAndroid Build Coastguard Worker #if defined(ABSL_ARCH_X86_64) && !defined(ABSL_OS_WIN)
50*9356374aSAndroid Build Coastguard Worker #include <cpuid.h>  // NOLINT
51*9356374aSAndroid Build Coastguard Worker #endif
52*9356374aSAndroid Build Coastguard Worker 
53*9356374aSAndroid Build Coastguard Worker // __ppc_get_timebase_freq
54*9356374aSAndroid Build Coastguard Worker #if defined(ABSL_ARCH_PPC)
55*9356374aSAndroid Build Coastguard Worker #include <sys/platform/ppc.h>  // NOLINT
56*9356374aSAndroid Build Coastguard Worker #endif
57*9356374aSAndroid Build Coastguard Worker 
58*9356374aSAndroid Build Coastguard Worker // clock_gettime
59*9356374aSAndroid Build Coastguard Worker #if defined(ABSL_ARCH_ARM) || defined(ABSL_ARCH_AARCH64)
60*9356374aSAndroid Build Coastguard Worker #include <time.h>  // NOLINT
61*9356374aSAndroid Build Coastguard Worker #endif
62*9356374aSAndroid Build Coastguard Worker 
63*9356374aSAndroid Build Coastguard Worker // ABSL_RANDOM_INTERNAL_ATTRIBUTE_NEVER_INLINE prevents inlining of the method.
64*9356374aSAndroid Build Coastguard Worker #if ABSL_HAVE_ATTRIBUTE(noinline) || (defined(__GNUC__) && !defined(__clang__))
65*9356374aSAndroid Build Coastguard Worker #define ABSL_RANDOM_INTERNAL_ATTRIBUTE_NEVER_INLINE __attribute__((noinline))
66*9356374aSAndroid Build Coastguard Worker #elif defined(_MSC_VER)
67*9356374aSAndroid Build Coastguard Worker #define ABSL_RANDOM_INTERNAL_ATTRIBUTE_NEVER_INLINE __declspec(noinline)
68*9356374aSAndroid Build Coastguard Worker #else
69*9356374aSAndroid Build Coastguard Worker #define ABSL_RANDOM_INTERNAL_ATTRIBUTE_NEVER_INLINE
70*9356374aSAndroid Build Coastguard Worker #endif
71*9356374aSAndroid Build Coastguard Worker 
72*9356374aSAndroid Build Coastguard Worker namespace absl {
73*9356374aSAndroid Build Coastguard Worker ABSL_NAMESPACE_BEGIN
74*9356374aSAndroid Build Coastguard Worker namespace random_internal_nanobenchmark {
75*9356374aSAndroid Build Coastguard Worker namespace {
76*9356374aSAndroid Build Coastguard Worker 
77*9356374aSAndroid Build Coastguard Worker // For code folding.
78*9356374aSAndroid Build Coastguard Worker namespace platform {
79*9356374aSAndroid Build Coastguard Worker #if defined(ABSL_ARCH_X86_64)
80*9356374aSAndroid Build Coastguard Worker 
81*9356374aSAndroid Build Coastguard Worker // TODO(janwas): Merge with the one in randen_hwaes.cc?
Cpuid(const uint32_t level,const uint32_t count,uint32_t * ABSL_RANDOM_INTERNAL_RESTRICT abcd)82*9356374aSAndroid Build Coastguard Worker void Cpuid(const uint32_t level, const uint32_t count,
83*9356374aSAndroid Build Coastguard Worker            uint32_t* ABSL_RANDOM_INTERNAL_RESTRICT abcd) {
84*9356374aSAndroid Build Coastguard Worker #if defined(ABSL_OS_WIN)
85*9356374aSAndroid Build Coastguard Worker   int regs[4];
86*9356374aSAndroid Build Coastguard Worker   __cpuidex(regs, level, count);
87*9356374aSAndroid Build Coastguard Worker   for (int i = 0; i < 4; ++i) {
88*9356374aSAndroid Build Coastguard Worker     abcd[i] = regs[i];
89*9356374aSAndroid Build Coastguard Worker   }
90*9356374aSAndroid Build Coastguard Worker #else
91*9356374aSAndroid Build Coastguard Worker   uint32_t a, b, c, d;
92*9356374aSAndroid Build Coastguard Worker   __cpuid_count(level, count, a, b, c, d);
93*9356374aSAndroid Build Coastguard Worker   abcd[0] = a;
94*9356374aSAndroid Build Coastguard Worker   abcd[1] = b;
95*9356374aSAndroid Build Coastguard Worker   abcd[2] = c;
96*9356374aSAndroid Build Coastguard Worker   abcd[3] = d;
97*9356374aSAndroid Build Coastguard Worker #endif
98*9356374aSAndroid Build Coastguard Worker }
99*9356374aSAndroid Build Coastguard Worker 
BrandString()100*9356374aSAndroid Build Coastguard Worker std::string BrandString() {
101*9356374aSAndroid Build Coastguard Worker   char brand_string[49];
102*9356374aSAndroid Build Coastguard Worker   uint32_t abcd[4];
103*9356374aSAndroid Build Coastguard Worker 
104*9356374aSAndroid Build Coastguard Worker   // Check if brand string is supported (it is on all reasonable Intel/AMD)
105*9356374aSAndroid Build Coastguard Worker   Cpuid(0x80000000U, 0, abcd);
106*9356374aSAndroid Build Coastguard Worker   if (abcd[0] < 0x80000004U) {
107*9356374aSAndroid Build Coastguard Worker     return std::string();
108*9356374aSAndroid Build Coastguard Worker   }
109*9356374aSAndroid Build Coastguard Worker 
110*9356374aSAndroid Build Coastguard Worker   for (int i = 0; i < 3; ++i) {
111*9356374aSAndroid Build Coastguard Worker     Cpuid(0x80000002U + i, 0, abcd);
112*9356374aSAndroid Build Coastguard Worker     memcpy(brand_string + i * 16, &abcd, sizeof(abcd));
113*9356374aSAndroid Build Coastguard Worker   }
114*9356374aSAndroid Build Coastguard Worker   brand_string[48] = 0;
115*9356374aSAndroid Build Coastguard Worker   return brand_string;
116*9356374aSAndroid Build Coastguard Worker }
117*9356374aSAndroid Build Coastguard Worker 
118*9356374aSAndroid Build Coastguard Worker // Returns the frequency quoted inside the brand string. This does not
119*9356374aSAndroid Build Coastguard Worker // account for throttling nor Turbo Boost.
NominalClockRate()120*9356374aSAndroid Build Coastguard Worker double NominalClockRate() {
121*9356374aSAndroid Build Coastguard Worker   const std::string& brand_string = BrandString();
122*9356374aSAndroid Build Coastguard Worker   // Brand strings include the maximum configured frequency. These prefixes are
123*9356374aSAndroid Build Coastguard Worker   // defined by Intel CPUID documentation.
124*9356374aSAndroid Build Coastguard Worker   const char* prefixes[3] = {"MHz", "GHz", "THz"};
125*9356374aSAndroid Build Coastguard Worker   const double multipliers[3] = {1E6, 1E9, 1E12};
126*9356374aSAndroid Build Coastguard Worker   for (size_t i = 0; i < 3; ++i) {
127*9356374aSAndroid Build Coastguard Worker     const size_t pos_prefix = brand_string.find(prefixes[i]);
128*9356374aSAndroid Build Coastguard Worker     if (pos_prefix != std::string::npos) {
129*9356374aSAndroid Build Coastguard Worker       const size_t pos_space = brand_string.rfind(' ', pos_prefix - 1);
130*9356374aSAndroid Build Coastguard Worker       if (pos_space != std::string::npos) {
131*9356374aSAndroid Build Coastguard Worker         const std::string digits =
132*9356374aSAndroid Build Coastguard Worker             brand_string.substr(pos_space + 1, pos_prefix - pos_space - 1);
133*9356374aSAndroid Build Coastguard Worker         return std::stod(digits) * multipliers[i];
134*9356374aSAndroid Build Coastguard Worker       }
135*9356374aSAndroid Build Coastguard Worker     }
136*9356374aSAndroid Build Coastguard Worker   }
137*9356374aSAndroid Build Coastguard Worker 
138*9356374aSAndroid Build Coastguard Worker   return 0.0;
139*9356374aSAndroid Build Coastguard Worker }
140*9356374aSAndroid Build Coastguard Worker 
141*9356374aSAndroid Build Coastguard Worker #endif  // ABSL_ARCH_X86_64
142*9356374aSAndroid Build Coastguard Worker }  // namespace platform
143*9356374aSAndroid Build Coastguard Worker 
144*9356374aSAndroid Build Coastguard Worker // Prevents the compiler from eliding the computations that led to "output".
145*9356374aSAndroid Build Coastguard Worker template <class T>
PreventElision(T && output)146*9356374aSAndroid Build Coastguard Worker inline void PreventElision(T&& output) {
147*9356374aSAndroid Build Coastguard Worker #ifndef ABSL_OS_WIN
148*9356374aSAndroid Build Coastguard Worker   // Works by indicating to the compiler that "output" is being read and
149*9356374aSAndroid Build Coastguard Worker   // modified. The +r constraint avoids unnecessary writes to memory, but only
150*9356374aSAndroid Build Coastguard Worker   // works for built-in types (typically FuncOutput).
151*9356374aSAndroid Build Coastguard Worker   asm volatile("" : "+r"(output) : : "memory");
152*9356374aSAndroid Build Coastguard Worker #else
153*9356374aSAndroid Build Coastguard Worker   // MSVC does not support inline assembly anymore (and never supported GCC's
154*9356374aSAndroid Build Coastguard Worker   // RTL constraints). Self-assignment with #pragma optimize("off") might be
155*9356374aSAndroid Build Coastguard Worker   // expected to prevent elision, but it does not with MSVC 2015. Type-punning
156*9356374aSAndroid Build Coastguard Worker   // with volatile pointers generates inefficient code on MSVC 2017.
157*9356374aSAndroid Build Coastguard Worker   static std::atomic<T> dummy(T{});
158*9356374aSAndroid Build Coastguard Worker   dummy.store(output, std::memory_order_relaxed);
159*9356374aSAndroid Build Coastguard Worker #endif
160*9356374aSAndroid Build Coastguard Worker }
161*9356374aSAndroid Build Coastguard Worker 
162*9356374aSAndroid Build Coastguard Worker namespace timer {
163*9356374aSAndroid Build Coastguard Worker 
164*9356374aSAndroid Build Coastguard Worker // Start/Stop return absolute timestamps and must be placed immediately before
165*9356374aSAndroid Build Coastguard Worker // and after the region to measure. We provide separate Start/Stop functions
166*9356374aSAndroid Build Coastguard Worker // because they use different fences.
167*9356374aSAndroid Build Coastguard Worker //
168*9356374aSAndroid Build Coastguard Worker // Background: RDTSC is not 'serializing'; earlier instructions may complete
169*9356374aSAndroid Build Coastguard Worker // after it, and/or later instructions may complete before it. 'Fences' ensure
170*9356374aSAndroid Build Coastguard Worker // regions' elapsed times are independent of such reordering. The only
171*9356374aSAndroid Build Coastguard Worker // documented unprivileged serializing instruction is CPUID, which acts as a
172*9356374aSAndroid Build Coastguard Worker // full fence (no reordering across it in either direction). Unfortunately
173*9356374aSAndroid Build Coastguard Worker // the latency of CPUID varies wildly (perhaps made worse by not initializing
174*9356374aSAndroid Build Coastguard Worker // its EAX input). Because it cannot reliably be deducted from the region's
175*9356374aSAndroid Build Coastguard Worker // elapsed time, it must not be included in the region to measure (i.e.
176*9356374aSAndroid Build Coastguard Worker // between the two RDTSC).
177*9356374aSAndroid Build Coastguard Worker //
178*9356374aSAndroid Build Coastguard Worker // The newer RDTSCP is sometimes described as serializing, but it actually
179*9356374aSAndroid Build Coastguard Worker // only serves as a half-fence with release semantics. Although all
180*9356374aSAndroid Build Coastguard Worker // instructions in the region will complete before the final timestamp is
181*9356374aSAndroid Build Coastguard Worker // captured, subsequent instructions may leak into the region and increase the
182*9356374aSAndroid Build Coastguard Worker // elapsed time. Inserting another fence after the final RDTSCP would prevent
183*9356374aSAndroid Build Coastguard Worker // such reordering without affecting the measured region.
184*9356374aSAndroid Build Coastguard Worker //
185*9356374aSAndroid Build Coastguard Worker // Fortunately, such a fence exists. The LFENCE instruction is only documented
186*9356374aSAndroid Build Coastguard Worker // to delay later loads until earlier loads are visible. However, Intel's
187*9356374aSAndroid Build Coastguard Worker // reference manual says it acts as a full fence (waiting until all earlier
188*9356374aSAndroid Build Coastguard Worker // instructions have completed, and delaying later instructions until it
189*9356374aSAndroid Build Coastguard Worker // completes). AMD assigns the same behavior to MFENCE.
190*9356374aSAndroid Build Coastguard Worker //
191*9356374aSAndroid Build Coastguard Worker // We need a fence before the initial RDTSC to prevent earlier instructions
192*9356374aSAndroid Build Coastguard Worker // from leaking into the region, and arguably another after RDTSC to avoid
193*9356374aSAndroid Build Coastguard Worker // region instructions from completing before the timestamp is recorded.
194*9356374aSAndroid Build Coastguard Worker // When surrounded by fences, the additional RDTSCP half-fence provides no
195*9356374aSAndroid Build Coastguard Worker // benefit, so the initial timestamp can be recorded via RDTSC, which has
196*9356374aSAndroid Build Coastguard Worker // lower overhead than RDTSCP because it does not read TSC_AUX. In summary,
197*9356374aSAndroid Build Coastguard Worker // we define Start = LFENCE/RDTSC/LFENCE; Stop = RDTSCP/LFENCE.
198*9356374aSAndroid Build Coastguard Worker //
199*9356374aSAndroid Build Coastguard Worker // Using Start+Start leads to higher variance and overhead than Stop+Stop.
200*9356374aSAndroid Build Coastguard Worker // However, Stop+Stop includes an LFENCE in the region measurements, which
201*9356374aSAndroid Build Coastguard Worker // adds a delay dependent on earlier loads. The combination of Start+Stop
202*9356374aSAndroid Build Coastguard Worker // is faster than Start+Start and more consistent than Stop+Stop because
203*9356374aSAndroid Build Coastguard Worker // the first LFENCE already delayed subsequent loads before the measured
204*9356374aSAndroid Build Coastguard Worker // region. This combination seems not to have been considered in prior work:
205*9356374aSAndroid Build Coastguard Worker // http://akaros.cs.berkeley.edu/lxr/akaros/kern/arch/x86/rdtsc_test.c
206*9356374aSAndroid Build Coastguard Worker //
207*9356374aSAndroid Build Coastguard Worker // Note: performance counters can measure 'exact' instructions-retired or
208*9356374aSAndroid Build Coastguard Worker // (unhalted) cycle counts. The RDPMC instruction is not serializing and also
209*9356374aSAndroid Build Coastguard Worker // requires fences. Unfortunately, it is not accessible on all OSes and we
210*9356374aSAndroid Build Coastguard Worker // prefer to avoid kernel-mode drivers. Performance counters are also affected
211*9356374aSAndroid Build Coastguard Worker // by several under/over-count errata, so we use the TSC instead.
212*9356374aSAndroid Build Coastguard Worker 
213*9356374aSAndroid Build Coastguard Worker // Returns a 64-bit timestamp in unit of 'ticks'; to convert to seconds,
214*9356374aSAndroid Build Coastguard Worker // divide by InvariantTicksPerSecond.
Start64()215*9356374aSAndroid Build Coastguard Worker inline uint64_t Start64() {
216*9356374aSAndroid Build Coastguard Worker   uint64_t t;
217*9356374aSAndroid Build Coastguard Worker #if defined(ABSL_ARCH_PPC)
218*9356374aSAndroid Build Coastguard Worker   asm volatile("mfspr %0, %1" : "=r"(t) : "i"(268));
219*9356374aSAndroid Build Coastguard Worker #elif defined(ABSL_ARCH_X86_64)
220*9356374aSAndroid Build Coastguard Worker #if defined(ABSL_OS_WIN)
221*9356374aSAndroid Build Coastguard Worker   _ReadWriteBarrier();
222*9356374aSAndroid Build Coastguard Worker   _mm_lfence();
223*9356374aSAndroid Build Coastguard Worker   _ReadWriteBarrier();
224*9356374aSAndroid Build Coastguard Worker   t = __rdtsc();
225*9356374aSAndroid Build Coastguard Worker   _ReadWriteBarrier();
226*9356374aSAndroid Build Coastguard Worker   _mm_lfence();
227*9356374aSAndroid Build Coastguard Worker   _ReadWriteBarrier();
228*9356374aSAndroid Build Coastguard Worker #else
229*9356374aSAndroid Build Coastguard Worker   asm volatile(
230*9356374aSAndroid Build Coastguard Worker       "lfence\n\t"
231*9356374aSAndroid Build Coastguard Worker       "rdtsc\n\t"
232*9356374aSAndroid Build Coastguard Worker       "shl $32, %%rdx\n\t"
233*9356374aSAndroid Build Coastguard Worker       "or %%rdx, %0\n\t"
234*9356374aSAndroid Build Coastguard Worker       "lfence"
235*9356374aSAndroid Build Coastguard Worker       : "=a"(t)
236*9356374aSAndroid Build Coastguard Worker       :
237*9356374aSAndroid Build Coastguard Worker       // "memory" avoids reordering. rdx = TSC >> 32.
238*9356374aSAndroid Build Coastguard Worker       // "cc" = flags modified by SHL.
239*9356374aSAndroid Build Coastguard Worker       : "rdx", "memory", "cc");
240*9356374aSAndroid Build Coastguard Worker #endif
241*9356374aSAndroid Build Coastguard Worker #else
242*9356374aSAndroid Build Coastguard Worker   // Fall back to OS - unsure how to reliably query cntvct_el0 frequency.
243*9356374aSAndroid Build Coastguard Worker   timespec ts;
244*9356374aSAndroid Build Coastguard Worker   clock_gettime(CLOCK_REALTIME, &ts);
245*9356374aSAndroid Build Coastguard Worker   t = ts.tv_sec * 1000000000LL + ts.tv_nsec;
246*9356374aSAndroid Build Coastguard Worker #endif
247*9356374aSAndroid Build Coastguard Worker   return t;
248*9356374aSAndroid Build Coastguard Worker }
249*9356374aSAndroid Build Coastguard Worker 
Stop64()250*9356374aSAndroid Build Coastguard Worker inline uint64_t Stop64() {
251*9356374aSAndroid Build Coastguard Worker   uint64_t t;
252*9356374aSAndroid Build Coastguard Worker #if defined(ABSL_ARCH_X86_64)
253*9356374aSAndroid Build Coastguard Worker #if defined(ABSL_OS_WIN)
254*9356374aSAndroid Build Coastguard Worker   _ReadWriteBarrier();
255*9356374aSAndroid Build Coastguard Worker   unsigned aux;
256*9356374aSAndroid Build Coastguard Worker   t = __rdtscp(&aux);
257*9356374aSAndroid Build Coastguard Worker   _ReadWriteBarrier();
258*9356374aSAndroid Build Coastguard Worker   _mm_lfence();
259*9356374aSAndroid Build Coastguard Worker   _ReadWriteBarrier();
260*9356374aSAndroid Build Coastguard Worker #else
261*9356374aSAndroid Build Coastguard Worker   // Use inline asm because __rdtscp generates code to store TSC_AUX (ecx).
262*9356374aSAndroid Build Coastguard Worker   asm volatile(
263*9356374aSAndroid Build Coastguard Worker       "rdtscp\n\t"
264*9356374aSAndroid Build Coastguard Worker       "shl $32, %%rdx\n\t"
265*9356374aSAndroid Build Coastguard Worker       "or %%rdx, %0\n\t"
266*9356374aSAndroid Build Coastguard Worker       "lfence"
267*9356374aSAndroid Build Coastguard Worker       : "=a"(t)
268*9356374aSAndroid Build Coastguard Worker       :
269*9356374aSAndroid Build Coastguard Worker       // "memory" avoids reordering. rcx = TSC_AUX. rdx = TSC >> 32.
270*9356374aSAndroid Build Coastguard Worker       // "cc" = flags modified by SHL.
271*9356374aSAndroid Build Coastguard Worker       : "rcx", "rdx", "memory", "cc");
272*9356374aSAndroid Build Coastguard Worker #endif
273*9356374aSAndroid Build Coastguard Worker #else
274*9356374aSAndroid Build Coastguard Worker   t = Start64();
275*9356374aSAndroid Build Coastguard Worker #endif
276*9356374aSAndroid Build Coastguard Worker   return t;
277*9356374aSAndroid Build Coastguard Worker }
278*9356374aSAndroid Build Coastguard Worker 
279*9356374aSAndroid Build Coastguard Worker // Returns a 32-bit timestamp with about 4 cycles less overhead than
280*9356374aSAndroid Build Coastguard Worker // Start64. Only suitable for measuring very short regions because the
281*9356374aSAndroid Build Coastguard Worker // timestamp overflows about once a second.
Start32()282*9356374aSAndroid Build Coastguard Worker inline uint32_t Start32() {
283*9356374aSAndroid Build Coastguard Worker   uint32_t t;
284*9356374aSAndroid Build Coastguard Worker #if defined(ABSL_ARCH_X86_64)
285*9356374aSAndroid Build Coastguard Worker #if defined(ABSL_OS_WIN)
286*9356374aSAndroid Build Coastguard Worker   _ReadWriteBarrier();
287*9356374aSAndroid Build Coastguard Worker   _mm_lfence();
288*9356374aSAndroid Build Coastguard Worker   _ReadWriteBarrier();
289*9356374aSAndroid Build Coastguard Worker   t = static_cast<uint32_t>(__rdtsc());
290*9356374aSAndroid Build Coastguard Worker   _ReadWriteBarrier();
291*9356374aSAndroid Build Coastguard Worker   _mm_lfence();
292*9356374aSAndroid Build Coastguard Worker   _ReadWriteBarrier();
293*9356374aSAndroid Build Coastguard Worker #else
294*9356374aSAndroid Build Coastguard Worker   asm volatile(
295*9356374aSAndroid Build Coastguard Worker       "lfence\n\t"
296*9356374aSAndroid Build Coastguard Worker       "rdtsc\n\t"
297*9356374aSAndroid Build Coastguard Worker       "lfence"
298*9356374aSAndroid Build Coastguard Worker       : "=a"(t)
299*9356374aSAndroid Build Coastguard Worker       :
300*9356374aSAndroid Build Coastguard Worker       // "memory" avoids reordering. rdx = TSC >> 32.
301*9356374aSAndroid Build Coastguard Worker       : "rdx", "memory");
302*9356374aSAndroid Build Coastguard Worker #endif
303*9356374aSAndroid Build Coastguard Worker #else
304*9356374aSAndroid Build Coastguard Worker   t = static_cast<uint32_t>(Start64());
305*9356374aSAndroid Build Coastguard Worker #endif
306*9356374aSAndroid Build Coastguard Worker   return t;
307*9356374aSAndroid Build Coastguard Worker }
308*9356374aSAndroid Build Coastguard Worker 
Stop32()309*9356374aSAndroid Build Coastguard Worker inline uint32_t Stop32() {
310*9356374aSAndroid Build Coastguard Worker   uint32_t t;
311*9356374aSAndroid Build Coastguard Worker #if defined(ABSL_ARCH_X86_64)
312*9356374aSAndroid Build Coastguard Worker #if defined(ABSL_OS_WIN)
313*9356374aSAndroid Build Coastguard Worker   _ReadWriteBarrier();
314*9356374aSAndroid Build Coastguard Worker   unsigned aux;
315*9356374aSAndroid Build Coastguard Worker   t = static_cast<uint32_t>(__rdtscp(&aux));
316*9356374aSAndroid Build Coastguard Worker   _ReadWriteBarrier();
317*9356374aSAndroid Build Coastguard Worker   _mm_lfence();
318*9356374aSAndroid Build Coastguard Worker   _ReadWriteBarrier();
319*9356374aSAndroid Build Coastguard Worker #else
320*9356374aSAndroid Build Coastguard Worker   // Use inline asm because __rdtscp generates code to store TSC_AUX (ecx).
321*9356374aSAndroid Build Coastguard Worker   asm volatile(
322*9356374aSAndroid Build Coastguard Worker       "rdtscp\n\t"
323*9356374aSAndroid Build Coastguard Worker       "lfence"
324*9356374aSAndroid Build Coastguard Worker       : "=a"(t)
325*9356374aSAndroid Build Coastguard Worker       :
326*9356374aSAndroid Build Coastguard Worker       // "memory" avoids reordering. rcx = TSC_AUX. rdx = TSC >> 32.
327*9356374aSAndroid Build Coastguard Worker       : "rcx", "rdx", "memory");
328*9356374aSAndroid Build Coastguard Worker #endif
329*9356374aSAndroid Build Coastguard Worker #else
330*9356374aSAndroid Build Coastguard Worker   t = static_cast<uint32_t>(Stop64());
331*9356374aSAndroid Build Coastguard Worker #endif
332*9356374aSAndroid Build Coastguard Worker   return t;
333*9356374aSAndroid Build Coastguard Worker }
334*9356374aSAndroid Build Coastguard Worker 
335*9356374aSAndroid Build Coastguard Worker }  // namespace timer
336*9356374aSAndroid Build Coastguard Worker 
337*9356374aSAndroid Build Coastguard Worker namespace robust_statistics {
338*9356374aSAndroid Build Coastguard Worker 
339*9356374aSAndroid Build Coastguard Worker // Sorts integral values in ascending order (e.g. for Mode). About 3x faster
340*9356374aSAndroid Build Coastguard Worker // than std::sort for input distributions with very few unique values.
341*9356374aSAndroid Build Coastguard Worker template <class T>
CountingSort(T * values,size_t num_values)342*9356374aSAndroid Build Coastguard Worker void CountingSort(T* values, size_t num_values) {
343*9356374aSAndroid Build Coastguard Worker   // Unique values and their frequency (similar to flat_map).
344*9356374aSAndroid Build Coastguard Worker   using Unique = std::pair<T, int>;
345*9356374aSAndroid Build Coastguard Worker   std::vector<Unique> unique;
346*9356374aSAndroid Build Coastguard Worker   for (size_t i = 0; i < num_values; ++i) {
347*9356374aSAndroid Build Coastguard Worker     const T value = values[i];
348*9356374aSAndroid Build Coastguard Worker     const auto pos =
349*9356374aSAndroid Build Coastguard Worker         std::find_if(unique.begin(), unique.end(),
350*9356374aSAndroid Build Coastguard Worker                      [value](const Unique u) { return u.first == value; });
351*9356374aSAndroid Build Coastguard Worker     if (pos == unique.end()) {
352*9356374aSAndroid Build Coastguard Worker       unique.push_back(std::make_pair(value, 1));
353*9356374aSAndroid Build Coastguard Worker     } else {
354*9356374aSAndroid Build Coastguard Worker       ++pos->second;
355*9356374aSAndroid Build Coastguard Worker     }
356*9356374aSAndroid Build Coastguard Worker   }
357*9356374aSAndroid Build Coastguard Worker 
358*9356374aSAndroid Build Coastguard Worker   // Sort in ascending order of value (pair.first).
359*9356374aSAndroid Build Coastguard Worker   std::sort(unique.begin(), unique.end());
360*9356374aSAndroid Build Coastguard Worker 
361*9356374aSAndroid Build Coastguard Worker   // Write that many copies of each unique value to the array.
362*9356374aSAndroid Build Coastguard Worker   T* ABSL_RANDOM_INTERNAL_RESTRICT p = values;
363*9356374aSAndroid Build Coastguard Worker   for (const auto& value_count : unique) {
364*9356374aSAndroid Build Coastguard Worker     std::fill_n(p, value_count.second, value_count.first);
365*9356374aSAndroid Build Coastguard Worker     p += value_count.second;
366*9356374aSAndroid Build Coastguard Worker   }
367*9356374aSAndroid Build Coastguard Worker   ABSL_RAW_CHECK(p == values + num_values, "Did not produce enough output");
368*9356374aSAndroid Build Coastguard Worker }
369*9356374aSAndroid Build Coastguard Worker 
370*9356374aSAndroid Build Coastguard Worker // @return i in [idx_begin, idx_begin + half_count) that minimizes
371*9356374aSAndroid Build Coastguard Worker // sorted[i + half_count] - sorted[i].
372*9356374aSAndroid Build Coastguard Worker template <typename T>
MinRange(const T * const ABSL_RANDOM_INTERNAL_RESTRICT sorted,const size_t idx_begin,const size_t half_count)373*9356374aSAndroid Build Coastguard Worker size_t MinRange(const T* const ABSL_RANDOM_INTERNAL_RESTRICT sorted,
374*9356374aSAndroid Build Coastguard Worker                 const size_t idx_begin, const size_t half_count) {
375*9356374aSAndroid Build Coastguard Worker   T min_range = (std::numeric_limits<T>::max)();
376*9356374aSAndroid Build Coastguard Worker   size_t min_idx = 0;
377*9356374aSAndroid Build Coastguard Worker 
378*9356374aSAndroid Build Coastguard Worker   for (size_t idx = idx_begin; idx < idx_begin + half_count; ++idx) {
379*9356374aSAndroid Build Coastguard Worker     ABSL_RAW_CHECK(sorted[idx] <= sorted[idx + half_count], "Not sorted");
380*9356374aSAndroid Build Coastguard Worker     const T range = sorted[idx + half_count] - sorted[idx];
381*9356374aSAndroid Build Coastguard Worker     if (range < min_range) {
382*9356374aSAndroid Build Coastguard Worker       min_range = range;
383*9356374aSAndroid Build Coastguard Worker       min_idx = idx;
384*9356374aSAndroid Build Coastguard Worker     }
385*9356374aSAndroid Build Coastguard Worker   }
386*9356374aSAndroid Build Coastguard Worker 
387*9356374aSAndroid Build Coastguard Worker   return min_idx;
388*9356374aSAndroid Build Coastguard Worker }
389*9356374aSAndroid Build Coastguard Worker 
390*9356374aSAndroid Build Coastguard Worker // Returns an estimate of the mode by calling MinRange on successively
391*9356374aSAndroid Build Coastguard Worker // halved intervals. "sorted" must be in ascending order. This is the
392*9356374aSAndroid Build Coastguard Worker // Half Sample Mode estimator proposed by Bickel in "On a fast, robust
393*9356374aSAndroid Build Coastguard Worker // estimator of the mode", with complexity O(N log N). The mode is less
394*9356374aSAndroid Build Coastguard Worker // affected by outliers in highly-skewed distributions than the median.
395*9356374aSAndroid Build Coastguard Worker // The averaging operation below assumes "T" is an unsigned integer type.
396*9356374aSAndroid Build Coastguard Worker template <typename T>
ModeOfSorted(const T * const ABSL_RANDOM_INTERNAL_RESTRICT sorted,const size_t num_values)397*9356374aSAndroid Build Coastguard Worker T ModeOfSorted(const T* const ABSL_RANDOM_INTERNAL_RESTRICT sorted,
398*9356374aSAndroid Build Coastguard Worker                const size_t num_values) {
399*9356374aSAndroid Build Coastguard Worker   size_t idx_begin = 0;
400*9356374aSAndroid Build Coastguard Worker   size_t half_count = num_values / 2;
401*9356374aSAndroid Build Coastguard Worker   while (half_count > 1) {
402*9356374aSAndroid Build Coastguard Worker     idx_begin = MinRange(sorted, idx_begin, half_count);
403*9356374aSAndroid Build Coastguard Worker     half_count >>= 1;
404*9356374aSAndroid Build Coastguard Worker   }
405*9356374aSAndroid Build Coastguard Worker 
406*9356374aSAndroid Build Coastguard Worker   const T x = sorted[idx_begin + 0];
407*9356374aSAndroid Build Coastguard Worker   if (half_count == 0) {
408*9356374aSAndroid Build Coastguard Worker     return x;
409*9356374aSAndroid Build Coastguard Worker   }
410*9356374aSAndroid Build Coastguard Worker   ABSL_RAW_CHECK(half_count == 1, "Should stop at half_count=1");
411*9356374aSAndroid Build Coastguard Worker   const T average = (x + sorted[idx_begin + 1] + 1) / 2;
412*9356374aSAndroid Build Coastguard Worker   return average;
413*9356374aSAndroid Build Coastguard Worker }
414*9356374aSAndroid Build Coastguard Worker 
415*9356374aSAndroid Build Coastguard Worker // Returns the mode. Side effect: sorts "values".
416*9356374aSAndroid Build Coastguard Worker template <typename T>
Mode(T * values,const size_t num_values)417*9356374aSAndroid Build Coastguard Worker T Mode(T* values, const size_t num_values) {
418*9356374aSAndroid Build Coastguard Worker   CountingSort(values, num_values);
419*9356374aSAndroid Build Coastguard Worker   return ModeOfSorted(values, num_values);
420*9356374aSAndroid Build Coastguard Worker }
421*9356374aSAndroid Build Coastguard Worker 
422*9356374aSAndroid Build Coastguard Worker template <typename T, size_t N>
Mode(T (& values)[N])423*9356374aSAndroid Build Coastguard Worker T Mode(T (&values)[N]) {
424*9356374aSAndroid Build Coastguard Worker   return Mode(&values[0], N);
425*9356374aSAndroid Build Coastguard Worker }
426*9356374aSAndroid Build Coastguard Worker 
427*9356374aSAndroid Build Coastguard Worker // Returns the median value. Side effect: sorts "values".
428*9356374aSAndroid Build Coastguard Worker template <typename T>
Median(T * values,const size_t num_values)429*9356374aSAndroid Build Coastguard Worker T Median(T* values, const size_t num_values) {
430*9356374aSAndroid Build Coastguard Worker   ABSL_RAW_CHECK(num_values != 0, "Empty input");
431*9356374aSAndroid Build Coastguard Worker   std::sort(values, values + num_values);
432*9356374aSAndroid Build Coastguard Worker   const size_t half = num_values / 2;
433*9356374aSAndroid Build Coastguard Worker   // Odd count: return middle
434*9356374aSAndroid Build Coastguard Worker   if (num_values % 2) {
435*9356374aSAndroid Build Coastguard Worker     return values[half];
436*9356374aSAndroid Build Coastguard Worker   }
437*9356374aSAndroid Build Coastguard Worker   // Even count: return average of middle two.
438*9356374aSAndroid Build Coastguard Worker   return (values[half] + values[half - 1] + 1) / 2;
439*9356374aSAndroid Build Coastguard Worker }
440*9356374aSAndroid Build Coastguard Worker 
441*9356374aSAndroid Build Coastguard Worker // Returns a robust measure of variability.
442*9356374aSAndroid Build Coastguard Worker template <typename T>
MedianAbsoluteDeviation(const T * values,const size_t num_values,const T median)443*9356374aSAndroid Build Coastguard Worker T MedianAbsoluteDeviation(const T* values, const size_t num_values,
444*9356374aSAndroid Build Coastguard Worker                           const T median) {
445*9356374aSAndroid Build Coastguard Worker   ABSL_RAW_CHECK(num_values != 0, "Empty input");
446*9356374aSAndroid Build Coastguard Worker   std::vector<T> abs_deviations;
447*9356374aSAndroid Build Coastguard Worker   abs_deviations.reserve(num_values);
448*9356374aSAndroid Build Coastguard Worker   for (size_t i = 0; i < num_values; ++i) {
449*9356374aSAndroid Build Coastguard Worker     const int64_t abs = std::abs(int64_t(values[i]) - int64_t(median));
450*9356374aSAndroid Build Coastguard Worker     abs_deviations.push_back(static_cast<T>(abs));
451*9356374aSAndroid Build Coastguard Worker   }
452*9356374aSAndroid Build Coastguard Worker   return Median(abs_deviations.data(), num_values);
453*9356374aSAndroid Build Coastguard Worker }
454*9356374aSAndroid Build Coastguard Worker 
455*9356374aSAndroid Build Coastguard Worker }  // namespace robust_statistics
456*9356374aSAndroid Build Coastguard Worker 
457*9356374aSAndroid Build Coastguard Worker // Ticks := platform-specific timer values (CPU cycles on x86). Must be
458*9356374aSAndroid Build Coastguard Worker // unsigned to guarantee wraparound on overflow. 32 bit timers are faster to
459*9356374aSAndroid Build Coastguard Worker // read than 64 bit.
460*9356374aSAndroid Build Coastguard Worker using Ticks = uint32_t;
461*9356374aSAndroid Build Coastguard Worker 
462*9356374aSAndroid Build Coastguard Worker // Returns timer overhead / minimum measurable difference.
TimerResolution()463*9356374aSAndroid Build Coastguard Worker Ticks TimerResolution() {
464*9356374aSAndroid Build Coastguard Worker   // Nested loop avoids exceeding stack/L1 capacity.
465*9356374aSAndroid Build Coastguard Worker   Ticks repetitions[Params::kTimerSamples];
466*9356374aSAndroid Build Coastguard Worker   for (size_t rep = 0; rep < Params::kTimerSamples; ++rep) {
467*9356374aSAndroid Build Coastguard Worker     Ticks samples[Params::kTimerSamples];
468*9356374aSAndroid Build Coastguard Worker     for (size_t i = 0; i < Params::kTimerSamples; ++i) {
469*9356374aSAndroid Build Coastguard Worker       const Ticks t0 = timer::Start32();
470*9356374aSAndroid Build Coastguard Worker       const Ticks t1 = timer::Stop32();
471*9356374aSAndroid Build Coastguard Worker       samples[i] = t1 - t0;
472*9356374aSAndroid Build Coastguard Worker     }
473*9356374aSAndroid Build Coastguard Worker     repetitions[rep] = robust_statistics::Mode(samples);
474*9356374aSAndroid Build Coastguard Worker   }
475*9356374aSAndroid Build Coastguard Worker   return robust_statistics::Mode(repetitions);
476*9356374aSAndroid Build Coastguard Worker }
477*9356374aSAndroid Build Coastguard Worker 
478*9356374aSAndroid Build Coastguard Worker static const Ticks timer_resolution = TimerResolution();
479*9356374aSAndroid Build Coastguard Worker 
480*9356374aSAndroid Build Coastguard Worker // Estimates the expected value of "lambda" values with a variable number of
481*9356374aSAndroid Build Coastguard Worker // samples until the variability "rel_mad" is less than "max_rel_mad".
482*9356374aSAndroid Build Coastguard Worker template <class Lambda>
SampleUntilStable(const double max_rel_mad,double * rel_mad,const Params & p,const Lambda & lambda)483*9356374aSAndroid Build Coastguard Worker Ticks SampleUntilStable(const double max_rel_mad, double* rel_mad,
484*9356374aSAndroid Build Coastguard Worker                         const Params& p, const Lambda& lambda) {
485*9356374aSAndroid Build Coastguard Worker   auto measure_duration = [&lambda]() -> Ticks {
486*9356374aSAndroid Build Coastguard Worker     const Ticks t0 = timer::Start32();
487*9356374aSAndroid Build Coastguard Worker     lambda();
488*9356374aSAndroid Build Coastguard Worker     const Ticks t1 = timer::Stop32();
489*9356374aSAndroid Build Coastguard Worker     return t1 - t0;
490*9356374aSAndroid Build Coastguard Worker   };
491*9356374aSAndroid Build Coastguard Worker 
492*9356374aSAndroid Build Coastguard Worker   // Choose initial samples_per_eval based on a single estimated duration.
493*9356374aSAndroid Build Coastguard Worker   Ticks est = measure_duration();
494*9356374aSAndroid Build Coastguard Worker   static const double ticks_per_second = InvariantTicksPerSecond();
495*9356374aSAndroid Build Coastguard Worker   const size_t ticks_per_eval = ticks_per_second * p.seconds_per_eval;
496*9356374aSAndroid Build Coastguard Worker   size_t samples_per_eval = ticks_per_eval / est;
497*9356374aSAndroid Build Coastguard Worker   samples_per_eval = (std::max)(samples_per_eval, p.min_samples_per_eval);
498*9356374aSAndroid Build Coastguard Worker 
499*9356374aSAndroid Build Coastguard Worker   std::vector<Ticks> samples;
500*9356374aSAndroid Build Coastguard Worker   samples.reserve(1 + samples_per_eval);
501*9356374aSAndroid Build Coastguard Worker   samples.push_back(est);
502*9356374aSAndroid Build Coastguard Worker 
503*9356374aSAndroid Build Coastguard Worker   // Percentage is too strict for tiny differences, so also allow a small
504*9356374aSAndroid Build Coastguard Worker   // absolute "median absolute deviation".
505*9356374aSAndroid Build Coastguard Worker   const Ticks max_abs_mad = (timer_resolution + 99) / 100;
506*9356374aSAndroid Build Coastguard Worker   *rel_mad = 0.0;  // ensure initialized
507*9356374aSAndroid Build Coastguard Worker 
508*9356374aSAndroid Build Coastguard Worker   for (size_t eval = 0; eval < p.max_evals; ++eval, samples_per_eval *= 2) {
509*9356374aSAndroid Build Coastguard Worker     samples.reserve(samples.size() + samples_per_eval);
510*9356374aSAndroid Build Coastguard Worker     for (size_t i = 0; i < samples_per_eval; ++i) {
511*9356374aSAndroid Build Coastguard Worker       const Ticks r = measure_duration();
512*9356374aSAndroid Build Coastguard Worker       samples.push_back(r);
513*9356374aSAndroid Build Coastguard Worker     }
514*9356374aSAndroid Build Coastguard Worker 
515*9356374aSAndroid Build Coastguard Worker     if (samples.size() >= p.min_mode_samples) {
516*9356374aSAndroid Build Coastguard Worker       est = robust_statistics::Mode(samples.data(), samples.size());
517*9356374aSAndroid Build Coastguard Worker     } else {
518*9356374aSAndroid Build Coastguard Worker       // For "few" (depends also on the variance) samples, Median is safer.
519*9356374aSAndroid Build Coastguard Worker       est = robust_statistics::Median(samples.data(), samples.size());
520*9356374aSAndroid Build Coastguard Worker     }
521*9356374aSAndroid Build Coastguard Worker     ABSL_RAW_CHECK(est != 0, "Estimator returned zero duration");
522*9356374aSAndroid Build Coastguard Worker 
523*9356374aSAndroid Build Coastguard Worker     // Median absolute deviation (mad) is a robust measure of 'variability'.
524*9356374aSAndroid Build Coastguard Worker     const Ticks abs_mad = robust_statistics::MedianAbsoluteDeviation(
525*9356374aSAndroid Build Coastguard Worker         samples.data(), samples.size(), est);
526*9356374aSAndroid Build Coastguard Worker     *rel_mad = static_cast<double>(static_cast<int>(abs_mad)) / est;
527*9356374aSAndroid Build Coastguard Worker 
528*9356374aSAndroid Build Coastguard Worker     if (*rel_mad <= max_rel_mad || abs_mad <= max_abs_mad) {
529*9356374aSAndroid Build Coastguard Worker       if (p.verbose) {
530*9356374aSAndroid Build Coastguard Worker         ABSL_RAW_LOG(INFO,
531*9356374aSAndroid Build Coastguard Worker                      "%6zu samples => %5u (abs_mad=%4u, rel_mad=%4.2f%%)\n",
532*9356374aSAndroid Build Coastguard Worker                      samples.size(), est, abs_mad, *rel_mad * 100.0);
533*9356374aSAndroid Build Coastguard Worker       }
534*9356374aSAndroid Build Coastguard Worker       return est;
535*9356374aSAndroid Build Coastguard Worker     }
536*9356374aSAndroid Build Coastguard Worker   }
537*9356374aSAndroid Build Coastguard Worker 
538*9356374aSAndroid Build Coastguard Worker   if (p.verbose) {
539*9356374aSAndroid Build Coastguard Worker     ABSL_RAW_LOG(WARNING,
540*9356374aSAndroid Build Coastguard Worker                  "rel_mad=%4.2f%% still exceeds %4.2f%% after %6zu samples.\n",
541*9356374aSAndroid Build Coastguard Worker                  *rel_mad * 100.0, max_rel_mad * 100.0, samples.size());
542*9356374aSAndroid Build Coastguard Worker   }
543*9356374aSAndroid Build Coastguard Worker   return est;
544*9356374aSAndroid Build Coastguard Worker }
545*9356374aSAndroid Build Coastguard Worker 
546*9356374aSAndroid Build Coastguard Worker using InputVec = std::vector<FuncInput>;
547*9356374aSAndroid Build Coastguard Worker 
548*9356374aSAndroid Build Coastguard Worker // Returns vector of unique input values.
UniqueInputs(const FuncInput * inputs,const size_t num_inputs)549*9356374aSAndroid Build Coastguard Worker InputVec UniqueInputs(const FuncInput* inputs, const size_t num_inputs) {
550*9356374aSAndroid Build Coastguard Worker   InputVec unique(inputs, inputs + num_inputs);
551*9356374aSAndroid Build Coastguard Worker   std::sort(unique.begin(), unique.end());
552*9356374aSAndroid Build Coastguard Worker   unique.erase(std::unique(unique.begin(), unique.end()), unique.end());
553*9356374aSAndroid Build Coastguard Worker   return unique;
554*9356374aSAndroid Build Coastguard Worker }
555*9356374aSAndroid Build Coastguard Worker 
556*9356374aSAndroid Build Coastguard Worker // Returns how often we need to call func for sufficient precision, or zero
557*9356374aSAndroid Build Coastguard Worker // on failure (e.g. the elapsed time is too long for a 32-bit tick count).
NumSkip(const Func func,const void * arg,const InputVec & unique,const Params & p)558*9356374aSAndroid Build Coastguard Worker size_t NumSkip(const Func func, const void* arg, const InputVec& unique,
559*9356374aSAndroid Build Coastguard Worker                const Params& p) {
560*9356374aSAndroid Build Coastguard Worker   // Min elapsed ticks for any input.
561*9356374aSAndroid Build Coastguard Worker   Ticks min_duration = ~0u;
562*9356374aSAndroid Build Coastguard Worker 
563*9356374aSAndroid Build Coastguard Worker   for (const FuncInput input : unique) {
564*9356374aSAndroid Build Coastguard Worker     // Make sure a 32-bit timer is sufficient.
565*9356374aSAndroid Build Coastguard Worker     const uint64_t t0 = timer::Start64();
566*9356374aSAndroid Build Coastguard Worker     PreventElision(func(arg, input));
567*9356374aSAndroid Build Coastguard Worker     const uint64_t t1 = timer::Stop64();
568*9356374aSAndroid Build Coastguard Worker     const uint64_t elapsed = t1 - t0;
569*9356374aSAndroid Build Coastguard Worker     if (elapsed >= (1ULL << 30)) {
570*9356374aSAndroid Build Coastguard Worker       ABSL_RAW_LOG(WARNING,
571*9356374aSAndroid Build Coastguard Worker                    "Measurement failed: need 64-bit timer for input=%zu\n",
572*9356374aSAndroid Build Coastguard Worker                    static_cast<size_t>(input));
573*9356374aSAndroid Build Coastguard Worker       return 0;
574*9356374aSAndroid Build Coastguard Worker     }
575*9356374aSAndroid Build Coastguard Worker 
576*9356374aSAndroid Build Coastguard Worker     double rel_mad;
577*9356374aSAndroid Build Coastguard Worker     const Ticks total = SampleUntilStable(
578*9356374aSAndroid Build Coastguard Worker         p.target_rel_mad, &rel_mad, p,
579*9356374aSAndroid Build Coastguard Worker         [func, arg, input]() { PreventElision(func(arg, input)); });
580*9356374aSAndroid Build Coastguard Worker     min_duration = (std::min)(min_duration, total - timer_resolution);
581*9356374aSAndroid Build Coastguard Worker   }
582*9356374aSAndroid Build Coastguard Worker 
583*9356374aSAndroid Build Coastguard Worker   // Number of repetitions required to reach the target resolution.
584*9356374aSAndroid Build Coastguard Worker   const size_t max_skip = p.precision_divisor;
585*9356374aSAndroid Build Coastguard Worker   // Number of repetitions given the estimated duration.
586*9356374aSAndroid Build Coastguard Worker   const size_t num_skip =
587*9356374aSAndroid Build Coastguard Worker       min_duration == 0 ? 0 : (max_skip + min_duration - 1) / min_duration;
588*9356374aSAndroid Build Coastguard Worker   if (p.verbose) {
589*9356374aSAndroid Build Coastguard Worker     ABSL_RAW_LOG(INFO, "res=%u max_skip=%zu min_dur=%u num_skip=%zu\n",
590*9356374aSAndroid Build Coastguard Worker                  timer_resolution, max_skip, min_duration, num_skip);
591*9356374aSAndroid Build Coastguard Worker   }
592*9356374aSAndroid Build Coastguard Worker   return num_skip;
593*9356374aSAndroid Build Coastguard Worker }
594*9356374aSAndroid Build Coastguard Worker 
595*9356374aSAndroid Build Coastguard Worker // Replicates inputs until we can omit "num_skip" occurrences of an input.
ReplicateInputs(const FuncInput * inputs,const size_t num_inputs,const size_t num_unique,const size_t num_skip,const Params & p)596*9356374aSAndroid Build Coastguard Worker InputVec ReplicateInputs(const FuncInput* inputs, const size_t num_inputs,
597*9356374aSAndroid Build Coastguard Worker                          const size_t num_unique, const size_t num_skip,
598*9356374aSAndroid Build Coastguard Worker                          const Params& p) {
599*9356374aSAndroid Build Coastguard Worker   InputVec full;
600*9356374aSAndroid Build Coastguard Worker   if (num_unique == 1) {
601*9356374aSAndroid Build Coastguard Worker     full.assign(p.subset_ratio * num_skip, inputs[0]);
602*9356374aSAndroid Build Coastguard Worker     return full;
603*9356374aSAndroid Build Coastguard Worker   }
604*9356374aSAndroid Build Coastguard Worker 
605*9356374aSAndroid Build Coastguard Worker   full.reserve(p.subset_ratio * num_skip * num_inputs);
606*9356374aSAndroid Build Coastguard Worker   for (size_t i = 0; i < p.subset_ratio * num_skip; ++i) {
607*9356374aSAndroid Build Coastguard Worker     full.insert(full.end(), inputs, inputs + num_inputs);
608*9356374aSAndroid Build Coastguard Worker   }
609*9356374aSAndroid Build Coastguard Worker   absl::random_internal::randen_engine<uint32_t> rng;
610*9356374aSAndroid Build Coastguard Worker   std::shuffle(full.begin(), full.end(), rng);
611*9356374aSAndroid Build Coastguard Worker   return full;
612*9356374aSAndroid Build Coastguard Worker }
613*9356374aSAndroid Build Coastguard Worker 
614*9356374aSAndroid Build Coastguard Worker // Copies the "full" to "subset" in the same order, but with "num_skip"
615*9356374aSAndroid Build Coastguard Worker // randomly selected occurrences of "input_to_skip" removed.
FillSubset(const InputVec & full,const FuncInput input_to_skip,const size_t num_skip,InputVec * subset)616*9356374aSAndroid Build Coastguard Worker void FillSubset(const InputVec& full, const FuncInput input_to_skip,
617*9356374aSAndroid Build Coastguard Worker                 const size_t num_skip, InputVec* subset) {
618*9356374aSAndroid Build Coastguard Worker   const size_t count = std::count(full.begin(), full.end(), input_to_skip);
619*9356374aSAndroid Build Coastguard Worker   // Generate num_skip random indices: which occurrence to skip.
620*9356374aSAndroid Build Coastguard Worker   std::vector<uint32_t> omit;
621*9356374aSAndroid Build Coastguard Worker   // Replacement for std::iota, not yet available in MSVC builds.
622*9356374aSAndroid Build Coastguard Worker   omit.reserve(count);
623*9356374aSAndroid Build Coastguard Worker   for (size_t i = 0; i < count; ++i) {
624*9356374aSAndroid Build Coastguard Worker     omit.push_back(i);
625*9356374aSAndroid Build Coastguard Worker   }
626*9356374aSAndroid Build Coastguard Worker   // omit[] is the same on every call, but that's OK because they identify the
627*9356374aSAndroid Build Coastguard Worker   // Nth instance of input_to_skip, so the position within full[] differs.
628*9356374aSAndroid Build Coastguard Worker   absl::random_internal::randen_engine<uint32_t> rng;
629*9356374aSAndroid Build Coastguard Worker   std::shuffle(omit.begin(), omit.end(), rng);
630*9356374aSAndroid Build Coastguard Worker   omit.resize(num_skip);
631*9356374aSAndroid Build Coastguard Worker   std::sort(omit.begin(), omit.end());
632*9356374aSAndroid Build Coastguard Worker 
633*9356374aSAndroid Build Coastguard Worker   uint32_t occurrence = ~0u;  // 0 after preincrement
634*9356374aSAndroid Build Coastguard Worker   size_t idx_omit = 0;        // cursor within omit[]
635*9356374aSAndroid Build Coastguard Worker   size_t idx_subset = 0;      // cursor within *subset
636*9356374aSAndroid Build Coastguard Worker   for (const FuncInput next : full) {
637*9356374aSAndroid Build Coastguard Worker     if (next == input_to_skip) {
638*9356374aSAndroid Build Coastguard Worker       ++occurrence;
639*9356374aSAndroid Build Coastguard Worker       // Haven't removed enough already
640*9356374aSAndroid Build Coastguard Worker       if (idx_omit < num_skip) {
641*9356374aSAndroid Build Coastguard Worker         // This one is up for removal
642*9356374aSAndroid Build Coastguard Worker         if (occurrence == omit[idx_omit]) {
643*9356374aSAndroid Build Coastguard Worker           ++idx_omit;
644*9356374aSAndroid Build Coastguard Worker           continue;
645*9356374aSAndroid Build Coastguard Worker         }
646*9356374aSAndroid Build Coastguard Worker       }
647*9356374aSAndroid Build Coastguard Worker     }
648*9356374aSAndroid Build Coastguard Worker     if (idx_subset < subset->size()) {
649*9356374aSAndroid Build Coastguard Worker       (*subset)[idx_subset++] = next;
650*9356374aSAndroid Build Coastguard Worker     }
651*9356374aSAndroid Build Coastguard Worker   }
652*9356374aSAndroid Build Coastguard Worker   ABSL_RAW_CHECK(idx_subset == subset->size(), "idx_subset not at end");
653*9356374aSAndroid Build Coastguard Worker   ABSL_RAW_CHECK(idx_omit == omit.size(), "idx_omit not at end");
654*9356374aSAndroid Build Coastguard Worker   ABSL_RAW_CHECK(occurrence == count - 1, "occurrence not at end");
655*9356374aSAndroid Build Coastguard Worker }
656*9356374aSAndroid Build Coastguard Worker 
657*9356374aSAndroid Build Coastguard Worker // Returns total ticks elapsed for all inputs.
TotalDuration(const Func func,const void * arg,const InputVec * inputs,const Params & p,double * max_rel_mad)658*9356374aSAndroid Build Coastguard Worker Ticks TotalDuration(const Func func, const void* arg, const InputVec* inputs,
659*9356374aSAndroid Build Coastguard Worker                     const Params& p, double* max_rel_mad) {
660*9356374aSAndroid Build Coastguard Worker   double rel_mad;
661*9356374aSAndroid Build Coastguard Worker   const Ticks duration =
662*9356374aSAndroid Build Coastguard Worker       SampleUntilStable(p.target_rel_mad, &rel_mad, p, [func, arg, inputs]() {
663*9356374aSAndroid Build Coastguard Worker         for (const FuncInput input : *inputs) {
664*9356374aSAndroid Build Coastguard Worker           PreventElision(func(arg, input));
665*9356374aSAndroid Build Coastguard Worker         }
666*9356374aSAndroid Build Coastguard Worker       });
667*9356374aSAndroid Build Coastguard Worker   *max_rel_mad = (std::max)(*max_rel_mad, rel_mad);
668*9356374aSAndroid Build Coastguard Worker   return duration;
669*9356374aSAndroid Build Coastguard Worker }
670*9356374aSAndroid Build Coastguard Worker 
671*9356374aSAndroid Build Coastguard Worker // (Nearly) empty Func for measuring timer overhead/resolution.
672*9356374aSAndroid Build Coastguard Worker ABSL_RANDOM_INTERNAL_ATTRIBUTE_NEVER_INLINE FuncOutput
EmptyFunc(const void * arg,const FuncInput input)673*9356374aSAndroid Build Coastguard Worker EmptyFunc(const void* arg, const FuncInput input) {
674*9356374aSAndroid Build Coastguard Worker   return input;
675*9356374aSAndroid Build Coastguard Worker }
676*9356374aSAndroid Build Coastguard Worker 
677*9356374aSAndroid Build Coastguard Worker // Returns overhead of accessing inputs[] and calling a function; this will
678*9356374aSAndroid Build Coastguard Worker // be deducted from future TotalDuration return values.
Overhead(const void * arg,const InputVec * inputs,const Params & p)679*9356374aSAndroid Build Coastguard Worker Ticks Overhead(const void* arg, const InputVec* inputs, const Params& p) {
680*9356374aSAndroid Build Coastguard Worker   double rel_mad;
681*9356374aSAndroid Build Coastguard Worker   // Zero tolerance because repeatability is crucial and EmptyFunc is fast.
682*9356374aSAndroid Build Coastguard Worker   return SampleUntilStable(0.0, &rel_mad, p, [arg, inputs]() {
683*9356374aSAndroid Build Coastguard Worker     for (const FuncInput input : *inputs) {
684*9356374aSAndroid Build Coastguard Worker       PreventElision(EmptyFunc(arg, input));
685*9356374aSAndroid Build Coastguard Worker     }
686*9356374aSAndroid Build Coastguard Worker   });
687*9356374aSAndroid Build Coastguard Worker }
688*9356374aSAndroid Build Coastguard Worker 
689*9356374aSAndroid Build Coastguard Worker }  // namespace
690*9356374aSAndroid Build Coastguard Worker 
PinThreadToCPU(int cpu)691*9356374aSAndroid Build Coastguard Worker void PinThreadToCPU(int cpu) {
692*9356374aSAndroid Build Coastguard Worker   // We might migrate to another CPU before pinning below, but at least cpu
693*9356374aSAndroid Build Coastguard Worker   // will be one of the CPUs on which this thread ran.
694*9356374aSAndroid Build Coastguard Worker #if defined(ABSL_OS_WIN)
695*9356374aSAndroid Build Coastguard Worker   if (cpu < 0) {
696*9356374aSAndroid Build Coastguard Worker     cpu = static_cast<int>(GetCurrentProcessorNumber());
697*9356374aSAndroid Build Coastguard Worker     ABSL_RAW_CHECK(cpu >= 0, "PinThreadToCPU detect failed");
698*9356374aSAndroid Build Coastguard Worker     if (cpu >= 64) {
699*9356374aSAndroid Build Coastguard Worker       // NOTE: On wine, at least, GetCurrentProcessorNumber() sometimes returns
700*9356374aSAndroid Build Coastguard Worker       // a value > 64, which is out of range. When this happens, log a message
701*9356374aSAndroid Build Coastguard Worker       // and don't set a cpu affinity.
702*9356374aSAndroid Build Coastguard Worker       ABSL_RAW_LOG(ERROR, "Invalid CPU number: %d", cpu);
703*9356374aSAndroid Build Coastguard Worker       return;
704*9356374aSAndroid Build Coastguard Worker     }
705*9356374aSAndroid Build Coastguard Worker   } else if (cpu >= 64) {
706*9356374aSAndroid Build Coastguard Worker     // User specified an explicit CPU affinity > the valid range.
707*9356374aSAndroid Build Coastguard Worker     ABSL_RAW_LOG(FATAL, "Invalid CPU number: %d", cpu);
708*9356374aSAndroid Build Coastguard Worker   }
709*9356374aSAndroid Build Coastguard Worker   const DWORD_PTR prev = SetThreadAffinityMask(GetCurrentThread(), 1ULL << cpu);
710*9356374aSAndroid Build Coastguard Worker   ABSL_RAW_CHECK(prev != 0, "SetAffinity failed");
711*9356374aSAndroid Build Coastguard Worker #elif defined(ABSL_OS_LINUX) && !defined(ABSL_OS_ANDROID)
712*9356374aSAndroid Build Coastguard Worker   if (cpu < 0) {
713*9356374aSAndroid Build Coastguard Worker     cpu = sched_getcpu();
714*9356374aSAndroid Build Coastguard Worker     ABSL_RAW_CHECK(cpu >= 0, "PinThreadToCPU detect failed");
715*9356374aSAndroid Build Coastguard Worker   }
716*9356374aSAndroid Build Coastguard Worker   const pid_t pid = 0;  // current thread
717*9356374aSAndroid Build Coastguard Worker   cpu_set_t set;
718*9356374aSAndroid Build Coastguard Worker   CPU_ZERO(&set);
719*9356374aSAndroid Build Coastguard Worker   CPU_SET(cpu, &set);
720*9356374aSAndroid Build Coastguard Worker   const int err = sched_setaffinity(pid, sizeof(set), &set);
721*9356374aSAndroid Build Coastguard Worker   ABSL_RAW_CHECK(err == 0, "SetAffinity failed");
722*9356374aSAndroid Build Coastguard Worker #endif
723*9356374aSAndroid Build Coastguard Worker }
724*9356374aSAndroid Build Coastguard Worker 
725*9356374aSAndroid Build Coastguard Worker // Returns tick rate. Invariant means the tick counter frequency is independent
726*9356374aSAndroid Build Coastguard Worker // of CPU throttling or sleep. May be expensive, caller should cache the result.
InvariantTicksPerSecond()727*9356374aSAndroid Build Coastguard Worker double InvariantTicksPerSecond() {
728*9356374aSAndroid Build Coastguard Worker #if defined(ABSL_ARCH_PPC)
729*9356374aSAndroid Build Coastguard Worker   return __ppc_get_timebase_freq();
730*9356374aSAndroid Build Coastguard Worker #elif defined(ABSL_ARCH_X86_64)
731*9356374aSAndroid Build Coastguard Worker   // We assume the TSC is invariant; it is on all recent Intel/AMD CPUs.
732*9356374aSAndroid Build Coastguard Worker   return platform::NominalClockRate();
733*9356374aSAndroid Build Coastguard Worker #else
734*9356374aSAndroid Build Coastguard Worker   // Fall back to clock_gettime nanoseconds.
735*9356374aSAndroid Build Coastguard Worker   return 1E9;
736*9356374aSAndroid Build Coastguard Worker #endif
737*9356374aSAndroid Build Coastguard Worker }
738*9356374aSAndroid Build Coastguard Worker 
MeasureImpl(const Func func,const void * arg,const size_t num_skip,const InputVec & unique,const InputVec & full,const Params & p,Result * results)739*9356374aSAndroid Build Coastguard Worker size_t MeasureImpl(const Func func, const void* arg, const size_t num_skip,
740*9356374aSAndroid Build Coastguard Worker                    const InputVec& unique, const InputVec& full,
741*9356374aSAndroid Build Coastguard Worker                    const Params& p, Result* results) {
742*9356374aSAndroid Build Coastguard Worker   const float mul = 1.0f / static_cast<int>(num_skip);
743*9356374aSAndroid Build Coastguard Worker 
744*9356374aSAndroid Build Coastguard Worker   InputVec subset(full.size() - num_skip);
745*9356374aSAndroid Build Coastguard Worker   const Ticks overhead = Overhead(arg, &full, p);
746*9356374aSAndroid Build Coastguard Worker   const Ticks overhead_skip = Overhead(arg, &subset, p);
747*9356374aSAndroid Build Coastguard Worker   if (overhead < overhead_skip) {
748*9356374aSAndroid Build Coastguard Worker     ABSL_RAW_LOG(WARNING, "Measurement failed: overhead %u < %u\n", overhead,
749*9356374aSAndroid Build Coastguard Worker                  overhead_skip);
750*9356374aSAndroid Build Coastguard Worker     return 0;
751*9356374aSAndroid Build Coastguard Worker   }
752*9356374aSAndroid Build Coastguard Worker 
753*9356374aSAndroid Build Coastguard Worker   if (p.verbose) {
754*9356374aSAndroid Build Coastguard Worker     ABSL_RAW_LOG(INFO, "#inputs=%5zu,%5zu overhead=%5u,%5u\n", full.size(),
755*9356374aSAndroid Build Coastguard Worker                  subset.size(), overhead, overhead_skip);
756*9356374aSAndroid Build Coastguard Worker   }
757*9356374aSAndroid Build Coastguard Worker 
758*9356374aSAndroid Build Coastguard Worker   double max_rel_mad = 0.0;
759*9356374aSAndroid Build Coastguard Worker   const Ticks total = TotalDuration(func, arg, &full, p, &max_rel_mad);
760*9356374aSAndroid Build Coastguard Worker 
761*9356374aSAndroid Build Coastguard Worker   for (size_t i = 0; i < unique.size(); ++i) {
762*9356374aSAndroid Build Coastguard Worker     FillSubset(full, unique[i], num_skip, &subset);
763*9356374aSAndroid Build Coastguard Worker     const Ticks total_skip = TotalDuration(func, arg, &subset, p, &max_rel_mad);
764*9356374aSAndroid Build Coastguard Worker 
765*9356374aSAndroid Build Coastguard Worker     if (total < total_skip) {
766*9356374aSAndroid Build Coastguard Worker       ABSL_RAW_LOG(WARNING, "Measurement failed: total %u < %u\n", total,
767*9356374aSAndroid Build Coastguard Worker                    total_skip);
768*9356374aSAndroid Build Coastguard Worker       return 0;
769*9356374aSAndroid Build Coastguard Worker     }
770*9356374aSAndroid Build Coastguard Worker 
771*9356374aSAndroid Build Coastguard Worker     const Ticks duration = (total - overhead) - (total_skip - overhead_skip);
772*9356374aSAndroid Build Coastguard Worker     results[i].input = unique[i];
773*9356374aSAndroid Build Coastguard Worker     results[i].ticks = duration * mul;
774*9356374aSAndroid Build Coastguard Worker     results[i].variability = max_rel_mad;
775*9356374aSAndroid Build Coastguard Worker   }
776*9356374aSAndroid Build Coastguard Worker 
777*9356374aSAndroid Build Coastguard Worker   return unique.size();
778*9356374aSAndroid Build Coastguard Worker }
779*9356374aSAndroid Build Coastguard Worker 
Measure(const Func func,const void * arg,const FuncInput * inputs,const size_t num_inputs,Result * results,const Params & p)780*9356374aSAndroid Build Coastguard Worker size_t Measure(const Func func, const void* arg, const FuncInput* inputs,
781*9356374aSAndroid Build Coastguard Worker                const size_t num_inputs, Result* results, const Params& p) {
782*9356374aSAndroid Build Coastguard Worker   ABSL_RAW_CHECK(num_inputs != 0, "No inputs");
783*9356374aSAndroid Build Coastguard Worker 
784*9356374aSAndroid Build Coastguard Worker   const InputVec unique = UniqueInputs(inputs, num_inputs);
785*9356374aSAndroid Build Coastguard Worker   const size_t num_skip = NumSkip(func, arg, unique, p);  // never 0
786*9356374aSAndroid Build Coastguard Worker   if (num_skip == 0) return 0;  // NumSkip already printed error message
787*9356374aSAndroid Build Coastguard Worker 
788*9356374aSAndroid Build Coastguard Worker   const InputVec full =
789*9356374aSAndroid Build Coastguard Worker       ReplicateInputs(inputs, num_inputs, unique.size(), num_skip, p);
790*9356374aSAndroid Build Coastguard Worker 
791*9356374aSAndroid Build Coastguard Worker   // MeasureImpl may fail up to p.max_measure_retries times.
792*9356374aSAndroid Build Coastguard Worker   for (size_t i = 0; i < p.max_measure_retries; i++) {
793*9356374aSAndroid Build Coastguard Worker     auto result = MeasureImpl(func, arg, num_skip, unique, full, p, results);
794*9356374aSAndroid Build Coastguard Worker     if (result != 0) {
795*9356374aSAndroid Build Coastguard Worker       return result;
796*9356374aSAndroid Build Coastguard Worker     }
797*9356374aSAndroid Build Coastguard Worker   }
798*9356374aSAndroid Build Coastguard Worker   // All retries failed. (Unusual)
799*9356374aSAndroid Build Coastguard Worker   return 0;
800*9356374aSAndroid Build Coastguard Worker }
801*9356374aSAndroid Build Coastguard Worker 
802*9356374aSAndroid Build Coastguard Worker }  // namespace random_internal_nanobenchmark
803*9356374aSAndroid Build Coastguard Worker ABSL_NAMESPACE_END
804*9356374aSAndroid Build Coastguard Worker }  // namespace absl
805