1*01826a49SYabin Cui /*
2*01826a49SYabin Cui * Copyright (c) Meta Platforms, Inc. and affiliates.
3*01826a49SYabin Cui * All rights reserved.
4*01826a49SYabin Cui *
5*01826a49SYabin Cui * This source code is licensed under both the BSD-style license (found in the
6*01826a49SYabin Cui * LICENSE file in the root directory of this source tree) and the GPLv2 (found
7*01826a49SYabin Cui * in the COPYING file in the root directory of this source tree).
8*01826a49SYabin Cui * You may select, at your option, one of the above-listed licenses.
9*01826a49SYabin Cui */
10*01826a49SYabin Cui
11*01826a49SYabin Cui
12*01826a49SYabin Cui
13*01826a49SYabin Cui /* *************************************
14*01826a49SYabin Cui * Includes
15*01826a49SYabin Cui ***************************************/
16*01826a49SYabin Cui #include <stdlib.h> /* malloc, free */
17*01826a49SYabin Cui #include <string.h> /* memset */
18*01826a49SYabin Cui #include <assert.h> /* assert */
19*01826a49SYabin Cui
20*01826a49SYabin Cui #include "timefn.h" /* UTIL_time_t, UTIL_getTime */
21*01826a49SYabin Cui #include "benchfn.h"
22*01826a49SYabin Cui
23*01826a49SYabin Cui
24*01826a49SYabin Cui /* *************************************
25*01826a49SYabin Cui * Constants
26*01826a49SYabin Cui ***************************************/
/* duration of one timing loop, expressed in two units */
#define TIMELOOP_MICROSEC     SEC_TO_MICRO          /* 1 second */
#define TIMELOOP_NANOSEC      (1*1000000000ULL)     /* 1 second */

/* size suffixes : written after a number, so that `4 KB` expands to `4 *(1 <<10)` */
#define KB  *(1 <<10)
#define MB  *(1 <<20)
#define GB  *(1U<<30)
33*01826a49SYabin Cui
34*01826a49SYabin Cui
35*01826a49SYabin Cui /* *************************************
36*01826a49SYabin Cui * Debug errors
37*01826a49SYabin Cui ***************************************/
/* DEBUGOUTPUT() writes to stderr only when DEBUG is defined with a value >= 1.
 * In every other build it expands to nothing, so its arguments are not evaluated. */
#if !defined(DEBUG) || (DEBUG < 1)
#  define DEBUGOUTPUT(...)
#else
#  include <stdio.h>   /* fprintf */
#  define DISPLAY(...)       fprintf(stderr, __VA_ARGS__)
#  define DEBUGOUTPUT(...)   { if (DEBUG) DISPLAY(__VA_ARGS__); }
#endif
45*01826a49SYabin Cui
46*01826a49SYabin Cui
/* RETURN_QUIET_ERROR() :
 * Leaves the current function, returning `retValue`.
 * The location and the printf-style message (variadic part) are only emitted
 * through DEBUGOUTPUT(), hence nothing is displayed in non-debug builds.
 * Wrapped in do { } while (0) so that the macro behaves as a single statement,
 * including in `if (cond) RETURN_QUIET_ERROR(...); else ...` constructions. */
#define RETURN_QUIET_ERROR(retValue, ...) do {        \
    DEBUGOUTPUT("%s: %i: \n", __FILE__, __LINE__);    \
    DEBUGOUTPUT("Error : ");                          \
    DEBUGOUTPUT(__VA_ARGS__);                         \
    DEBUGOUTPUT(" \n");                               \
    return retValue;                                  \
} while (0)
55*01826a49SYabin Cui
/* CONTROL() :
 * Abort execution if condition `c` is not met.
 * The failed condition is reported through DEBUGOUTPUT() before calling abort().
 * Wrapped in do { } while (0) so that the macro behaves as a single statement,
 * including in `if (cond) CONTROL(x); else ...` constructions. */
#define CONTROL(c) do {                               \
    if (!(c)) {                                       \
        DEBUGOUTPUT("error: %s \n", #c);              \
        abort();                                      \
    }                                                 \
} while (0)
58*01826a49SYabin Cui
59*01826a49SYabin Cui
60*01826a49SYabin Cui /* *************************************
61*01826a49SYabin Cui * Benchmarking an arbitrary function
62*01826a49SYabin Cui ***************************************/
63*01826a49SYabin Cui
/* BMK_isSuccessful_runOutcome() :
 * @return : 1 when `outcome` carries no error tag, 0 otherwise. */
int BMK_isSuccessful_runOutcome(BMK_runOutcome_t outcome)
{
    int const errorTag = outcome.error_tag_never_ever_use_directly;
    return (errorTag == 0);
}
68*01826a49SYabin Cui
69*01826a49SYabin Cui /* warning : this function will stop program execution if outcome is invalid !
70*01826a49SYabin Cui * check outcome validity first, using BMK_isValid_runResult() */
BMK_extract_runTime(BMK_runOutcome_t outcome)71*01826a49SYabin Cui BMK_runTime_t BMK_extract_runTime(BMK_runOutcome_t outcome)
72*01826a49SYabin Cui {
73*01826a49SYabin Cui CONTROL(outcome.error_tag_never_ever_use_directly == 0);
74*01826a49SYabin Cui return outcome.internal_never_ever_use_directly;
75*01826a49SYabin Cui }
76*01826a49SYabin Cui
/* BMK_extract_errorResult() :
 * @return : the error value stored inside `outcome`.
 * warning : execution is aborted (see CONTROL) when `outcome` carries no error tag,
 *           i.e. this accessor is only valid on a failed outcome. */
size_t BMK_extract_errorResult(BMK_runOutcome_t outcome)
{
    CONTROL(outcome.error_tag_never_ever_use_directly != 0);
    {   size_t const errorValue = outcome.error_result_never_ever_use_directly;
        return errorValue;
    }
}
82*01826a49SYabin Cui
BMK_runOutcome_error(size_t errorResult)83*01826a49SYabin Cui static BMK_runOutcome_t BMK_runOutcome_error(size_t errorResult)
84*01826a49SYabin Cui {
85*01826a49SYabin Cui BMK_runOutcome_t b;
86*01826a49SYabin Cui memset(&b, 0, sizeof(b));
87*01826a49SYabin Cui b.error_tag_never_ever_use_directly = 1;
88*01826a49SYabin Cui b.error_result_never_ever_use_directly = errorResult;
89*01826a49SYabin Cui return b;
90*01826a49SYabin Cui }
91*01826a49SYabin Cui
BMK_setValid_runTime(BMK_runTime_t runTime)92*01826a49SYabin Cui static BMK_runOutcome_t BMK_setValid_runTime(BMK_runTime_t runTime)
93*01826a49SYabin Cui {
94*01826a49SYabin Cui BMK_runOutcome_t outcome;
95*01826a49SYabin Cui outcome.error_tag_never_ever_use_directly = 0;
96*01826a49SYabin Cui outcome.internal_never_ever_use_directly = runTime;
97*01826a49SYabin Cui return outcome;
98*01826a49SYabin Cui }
99*01826a49SYabin Cui
100*01826a49SYabin Cui
101*01826a49SYabin Cui /* initFn will be measured once, benchFn will be measured `nbLoops` times */
102*01826a49SYabin Cui /* initFn is optional, provide NULL if none */
103*01826a49SYabin Cui /* benchFn must return a size_t value that errorFn can interpret */
104*01826a49SYabin Cui /* takes # of blocks and list of size & stuff for each. */
105*01826a49SYabin Cui /* can report result of benchFn for each block into blockResult. */
106*01826a49SYabin Cui /* blockResult is optional, provide NULL if this information is not required */
107*01826a49SYabin Cui /* note : time per loop can be reported as zero if run time < timer resolution */
BMK_benchFunction(BMK_benchParams_t p,unsigned nbLoops)108*01826a49SYabin Cui BMK_runOutcome_t BMK_benchFunction(BMK_benchParams_t p,
109*01826a49SYabin Cui unsigned nbLoops)
110*01826a49SYabin Cui {
111*01826a49SYabin Cui nbLoops += !nbLoops; /* minimum nbLoops is 1 */
112*01826a49SYabin Cui
113*01826a49SYabin Cui /* init */
114*01826a49SYabin Cui { size_t i;
115*01826a49SYabin Cui for(i = 0; i < p.blockCount; i++) {
116*01826a49SYabin Cui memset(p.dstBuffers[i], 0xE5, p.dstCapacities[i]); /* warm up and erase result buffer */
117*01826a49SYabin Cui } }
118*01826a49SYabin Cui
119*01826a49SYabin Cui /* benchmark */
120*01826a49SYabin Cui { size_t dstSize = 0;
121*01826a49SYabin Cui UTIL_time_t const clockStart = UTIL_getTime();
122*01826a49SYabin Cui unsigned loopNb, blockNb;
123*01826a49SYabin Cui if (p.initFn != NULL) p.initFn(p.initPayload);
124*01826a49SYabin Cui for (loopNb = 0; loopNb < nbLoops; loopNb++) {
125*01826a49SYabin Cui for (blockNb = 0; blockNb < p.blockCount; blockNb++) {
126*01826a49SYabin Cui size_t const res = p.benchFn(p.srcBuffers[blockNb], p.srcSizes[blockNb],
127*01826a49SYabin Cui p.dstBuffers[blockNb], p.dstCapacities[blockNb],
128*01826a49SYabin Cui p.benchPayload);
129*01826a49SYabin Cui if (loopNb == 0) {
130*01826a49SYabin Cui if (p.blockResults != NULL) p.blockResults[blockNb] = res;
131*01826a49SYabin Cui if ((p.errorFn != NULL) && (p.errorFn(res))) {
132*01826a49SYabin Cui RETURN_QUIET_ERROR(BMK_runOutcome_error(res),
133*01826a49SYabin Cui "Function benchmark failed on block %u (of size %u) with error %i",
134*01826a49SYabin Cui blockNb, (unsigned)p.srcSizes[blockNb], (int)res);
135*01826a49SYabin Cui }
136*01826a49SYabin Cui dstSize += res;
137*01826a49SYabin Cui } }
138*01826a49SYabin Cui } /* for (loopNb = 0; loopNb < nbLoops; loopNb++) */
139*01826a49SYabin Cui
140*01826a49SYabin Cui { PTime const totalTime = UTIL_clockSpanNano(clockStart);
141*01826a49SYabin Cui BMK_runTime_t rt;
142*01826a49SYabin Cui rt.nanoSecPerRun = (double)totalTime / nbLoops;
143*01826a49SYabin Cui rt.sumOfReturn = dstSize;
144*01826a49SYabin Cui return BMK_setValid_runTime(rt);
145*01826a49SYabin Cui } }
146*01826a49SYabin Cui }
147*01826a49SYabin Cui
148*01826a49SYabin Cui
149*01826a49SYabin Cui /* ==== Benchmarking any function, providing intermediate results ==== */
150*01826a49SYabin Cui
151*01826a49SYabin Cui struct BMK_timedFnState_s {
152*01826a49SYabin Cui PTime timeSpent_ns;
153*01826a49SYabin Cui PTime timeBudget_ns;
154*01826a49SYabin Cui PTime runBudget_ns;
155*01826a49SYabin Cui BMK_runTime_t fastestRun;
156*01826a49SYabin Cui unsigned nbLoops;
157*01826a49SYabin Cui UTIL_time_t coolTime;
158*01826a49SYabin Cui }; /* typedef'd to BMK_timedFnState_t within bench.h */
159*01826a49SYabin Cui
BMK_createTimedFnState(unsigned total_ms,unsigned run_ms)160*01826a49SYabin Cui BMK_timedFnState_t* BMK_createTimedFnState(unsigned total_ms, unsigned run_ms)
161*01826a49SYabin Cui {
162*01826a49SYabin Cui BMK_timedFnState_t* const r = (BMK_timedFnState_t*)malloc(sizeof(*r));
163*01826a49SYabin Cui if (r == NULL) return NULL; /* malloc() error */
164*01826a49SYabin Cui BMK_resetTimedFnState(r, total_ms, run_ms);
165*01826a49SYabin Cui return r;
166*01826a49SYabin Cui }
167*01826a49SYabin Cui
/* BMK_freeTimedFnState() :
 * Hands `state` back to free(). */
void BMK_freeTimedFnState(BMK_timedFnState_t* state)
{
    free(state);
}
169*01826a49SYabin Cui
170*01826a49SYabin Cui BMK_timedFnState_t*
BMK_initStatic_timedFnState(void * buffer,size_t size,unsigned total_ms,unsigned run_ms)171*01826a49SYabin Cui BMK_initStatic_timedFnState(void* buffer, size_t size, unsigned total_ms, unsigned run_ms)
172*01826a49SYabin Cui {
173*01826a49SYabin Cui typedef char check_size[ 2 * (sizeof(BMK_timedFnState_shell) >= sizeof(struct BMK_timedFnState_s)) - 1]; /* static assert : a compilation failure indicates that BMK_timedFnState_shell is not large enough */
174*01826a49SYabin Cui typedef struct { check_size c; BMK_timedFnState_t tfs; } tfs_align; /* force tfs to be aligned at its next best position */
175*01826a49SYabin Cui size_t const tfs_alignment = offsetof(tfs_align, tfs); /* provides the minimal alignment restriction for BMK_timedFnState_t */
176*01826a49SYabin Cui BMK_timedFnState_t* const r = (BMK_timedFnState_t*)buffer;
177*01826a49SYabin Cui if (buffer == NULL) return NULL;
178*01826a49SYabin Cui if (size < sizeof(struct BMK_timedFnState_s)) return NULL;
179*01826a49SYabin Cui if ((size_t)buffer % tfs_alignment) return NULL; /* buffer must be properly aligned */
180*01826a49SYabin Cui BMK_resetTimedFnState(r, total_ms, run_ms);
181*01826a49SYabin Cui return r;
182*01826a49SYabin Cui }
183*01826a49SYabin Cui
/* BMK_resetTimedFnState() :
 * (Re)initializes `timedFnState` for a new benchmark session.
 * total_ms : time budget for the whole session; 0 is replaced by 1.
 * run_ms   : time budget for one measurement; 0 is replaced by 1,
 *            and the value is capped to the total budget.
 * Time spent restarts from 0, nbLoops restarts from 1,
 * and fastestRun is set to a value larger than any expected measurement. */
void BMK_resetTimedFnState(BMK_timedFnState_t* timedFnState, unsigned total_ms, unsigned run_ms)
{
    unsigned const totalBudget_ms = (total_ms == 0) ? 1 : total_ms;
    unsigned runBudget_ms = (run_ms == 0) ? 1 : run_ms;
    if (runBudget_ms > totalBudget_ms) runBudget_ms = totalBudget_ms;

    timedFnState->nbLoops = 1;
    timedFnState->timeSpent_ns = 0;
    /* TIMELOOP_NANOSEC / 1000 == nb of nanoseconds per millisecond */
    timedFnState->timeBudget_ns = (PTime)totalBudget_ms * TIMELOOP_NANOSEC / 1000;
    timedFnState->runBudget_ns = (PTime)runBudget_ms * TIMELOOP_NANOSEC / 1000;
    /* sentinel : hopefully large enough, must be larger than any potential measurement */
    timedFnState->fastestRun.nanoSecPerRun = (double)TIMELOOP_NANOSEC * 2000000000;
    timedFnState->fastestRun.sumOfReturn = (size_t)(-1LL);
    timedFnState->coolTime = UTIL_getTime();
}
197*01826a49SYabin Cui
198*01826a49SYabin Cui /* Tells if nb of seconds set in timedFnState for all runs is spent.
199*01826a49SYabin Cui * note : this function will return 1 if BMK_benchFunctionTimed() has actually errored. */
BMK_isCompleted_TimedFn(const BMK_timedFnState_t * timedFnState)200*01826a49SYabin Cui int BMK_isCompleted_TimedFn(const BMK_timedFnState_t* timedFnState)
201*01826a49SYabin Cui {
202*01826a49SYabin Cui return (timedFnState->timeSpent_ns >= timedFnState->timeBudget_ns);
203*01826a49SYabin Cui }
204*01826a49SYabin Cui
205*01826a49SYabin Cui
/* MIN() : smallest of 2 values. Each argument may be evaluated twice.
 * Any previous definition is dropped first. */
#undef MIN
#define MIN(a,b)   ( (a) < (b) ? (a) : (b) )

/* half of one timing loop */
#define MINUSABLETIME  (TIMELOOP_NANOSEC / 2)   /* 0.5 seconds */
210*01826a49SYabin Cui
BMK_benchTimedFn(BMK_timedFnState_t * cont,BMK_benchParams_t p)211*01826a49SYabin Cui BMK_runOutcome_t BMK_benchTimedFn(BMK_timedFnState_t* cont,
212*01826a49SYabin Cui BMK_benchParams_t p)
213*01826a49SYabin Cui {
214*01826a49SYabin Cui PTime const runBudget_ns = cont->runBudget_ns;
215*01826a49SYabin Cui PTime const runTimeMin_ns = runBudget_ns / 2;
216*01826a49SYabin Cui int completed = 0;
217*01826a49SYabin Cui BMK_runTime_t bestRunTime = cont->fastestRun;
218*01826a49SYabin Cui
219*01826a49SYabin Cui while (!completed) {
220*01826a49SYabin Cui BMK_runOutcome_t const runResult = BMK_benchFunction(p, cont->nbLoops);
221*01826a49SYabin Cui
222*01826a49SYabin Cui if(!BMK_isSuccessful_runOutcome(runResult)) { /* error : move out */
223*01826a49SYabin Cui return runResult;
224*01826a49SYabin Cui }
225*01826a49SYabin Cui
226*01826a49SYabin Cui { BMK_runTime_t const newRunTime = BMK_extract_runTime(runResult);
227*01826a49SYabin Cui double const loopDuration_ns = newRunTime.nanoSecPerRun * cont->nbLoops;
228*01826a49SYabin Cui
229*01826a49SYabin Cui cont->timeSpent_ns += (unsigned long long)loopDuration_ns;
230*01826a49SYabin Cui
231*01826a49SYabin Cui /* estimate nbLoops for next run to last approximately 1 second */
232*01826a49SYabin Cui if (loopDuration_ns > ((double)runBudget_ns / 50)) {
233*01826a49SYabin Cui double const fastestRun_ns = MIN(bestRunTime.nanoSecPerRun, newRunTime.nanoSecPerRun);
234*01826a49SYabin Cui cont->nbLoops = (unsigned)((double)runBudget_ns / fastestRun_ns) + 1;
235*01826a49SYabin Cui } else {
236*01826a49SYabin Cui /* previous run was too short : blindly increase workload by x multiplier */
237*01826a49SYabin Cui const unsigned multiplier = 10;
238*01826a49SYabin Cui assert(cont->nbLoops < ((unsigned)-1) / multiplier); /* avoid overflow */
239*01826a49SYabin Cui cont->nbLoops *= multiplier;
240*01826a49SYabin Cui }
241*01826a49SYabin Cui
242*01826a49SYabin Cui if(loopDuration_ns < (double)runTimeMin_ns) {
243*01826a49SYabin Cui /* don't report results for which benchmark run time was too small : increased risks of rounding errors */
244*01826a49SYabin Cui assert(completed == 0);
245*01826a49SYabin Cui continue;
246*01826a49SYabin Cui } else {
247*01826a49SYabin Cui if(newRunTime.nanoSecPerRun < bestRunTime.nanoSecPerRun) {
248*01826a49SYabin Cui bestRunTime = newRunTime;
249*01826a49SYabin Cui }
250*01826a49SYabin Cui completed = 1;
251*01826a49SYabin Cui }
252*01826a49SYabin Cui }
253*01826a49SYabin Cui } /* while (!completed) */
254*01826a49SYabin Cui
255*01826a49SYabin Cui return BMK_setValid_runTime(bestRunTime);
256*01826a49SYabin Cui }
257