1#!/usr/bin/python 2# encoding: utf-8 3 4# Copyright 2017 Google Inc. 5# 6# Use of this source code is governed by a BSD-style license that can be found 7# in the LICENSE file. 8# 9# This is an A/B test utility script used by calmbench.py 10# 11# For each bench, we get a distribution of min_ms measurements from nanobench. 12# From that, we try to recover the 1/3 and 2/3 quantiles of the distribution. 13# If range (1/3 quantile, 2/3 quantile) is completely disjoint between A and B, 14# we report that as a regression. 15# 16# The more measurements we have for a bench, the more accurate our quantiles 17# are. However, taking more measurements is time consuming. Hence we'll prune 18# out benches and only take more measurements for benches whose current quantile 19# ranges are disjoint. 20# 21# P.S. The current script is brute forcely translated from a ruby script. So it 22# may be ugly... 23 24 25from __future__ import print_function 26import re 27import os 28import sys 29import time 30import json 31import subprocess 32import shlex 33import multiprocessing 34import traceback 35from argparse import ArgumentParser 36from multiprocessing import Process 37from threading import Thread 38from threading import Lock 39from pdb import set_trace 40 41 42HELP = """ 43\033[31mPlease call calmbench.py to drive this script if you're not doing so. 44This script is not supposed to be used by itself. (At least, it's not easy to 45use by itself. The calmbench bots may use this script directly.) 46\033[0m 47""" 48 49FACTOR = 3 # lower/upper quantile factor 50DIFF_T = 0.99 # different enough threshold 51TERM = 10 # terminate after this no. of iterations without suspect changes 52MAXTRY = 30 # max number of nanobench tries to narrow down suspects 53 54UNITS = "ns µs ms s".split() 55 56 57timesLock = Lock() 58timesA = {} 59timesB = {} 60 61 62def parse_args(): 63 parser = ArgumentParser(description=HELP) 64 65 parser.add_argument('outdir', type=str, help="output directory") 66 parser.add_argument('a', type=str, help="name of A") 67 parser.add_argument('b', type=str, help="name of B") 68 parser.add_argument('nano_a', type=str, help="path to A's nanobench binary") 69 parser.add_argument('nano_b', type=str, help="path to B's nanobench binary") 70 parser.add_argument('arg_a', type=str, help="args for A's nanobench run") 71 parser.add_argument('arg_b', type=str, help="args for B's nanobench run") 72 parser.add_argument('repeat', type=int, help="number of initial runs") 73 parser.add_argument('skip_b', type=str, help=("whether to skip running B" 74 " ('true' or 'false')")) 75 parser.add_argument('config', type=str, help="nanobenh config") 76 parser.add_argument('threads', type=int, help="number of threads to run") 77 parser.add_argument('noinit', type=str, help=("whether to skip running B" 78 " ('true' or 'false')")) 79 80 parser.add_argument('--concise', dest='concise', action="store_true", 81 help="If set, no verbose thread info will be printed.") 82 parser.set_defaults(concise=False) 83 84 # Additional args for bots 85 BHELP = "bot specific options" 86 parser.add_argument('--githash', type=str, default="", help=BHELP) 87 parser.add_argument('--keys', type=str, default=[], nargs='+', help=BHELP) 88 89 args = parser.parse_args() 90 args.skip_b = args.skip_b == "true" 91 args.noinit = args.noinit == "true" 92 93 if args.threads == -1: 94 args.threads = 1 95 if args.config in ["8888", "565"]: # multi-thread for CPU only 96 args.threads = max(1, multiprocessing.cpu_count() / 2) 97 98 return args 99 100def append_dict_sorted_array(dict_array, key, value): 101 if key not in dict_array: 102 dict_array[key] = [] 103 dict_array[key].append(value) 104 dict_array[key].sort() 105 106 107def add_time(args, name, bench, t, unit): 108 normalized_t = t * 1000 ** UNITS.index(unit); 109 if name.startswith(args.a): 110 append_dict_sorted_array(timesA, bench, normalized_t) 111 else: 112 append_dict_sorted_array(timesB, bench, normalized_t) 113 114 115def append_times_from_file(args, name, filename): 116 with open(filename) as f: 117 lines = f.readlines() 118 for line in lines: 119 items = line.split() 120 if len(items) > 10: 121 bench = items[10] 122 matches = re.search("([+-]?\d*.?\d+)(s|ms|µs|ns)", items[3]) 123 if (not matches or items[9] != args.config): 124 continue 125 time_num = matches.group(1) 126 time_unit = matches.group(2) 127 add_time(args, name, bench, float(time_num), time_unit) 128 129 130class ThreadWithException(Thread): 131 def __init__(self, target): 132 super(ThreadWithException, self).__init__(target = target) 133 self.exception = None 134 135 def run(self): 136 try: 137 self._Thread__target(*self._Thread__args, **self._Thread__kwargs) 138 except BaseException as e: 139 self.exception = e 140 141 def join(self, timeout=None): 142 super(ThreadWithException, self).join(timeout) 143 144 145class ThreadRunner: 146 """Simplest and stupidiest threaded executer.""" 147 def __init__(self, args): 148 self.concise = args.concise 149 self.threads = [] 150 151 def add(self, args, fn): 152 if len(self.threads) >= args.threads: 153 self.wait() 154 t = ThreadWithException(target = fn) 155 t.daemon = True 156 self.threads.append(t) 157 t.start() 158 159 def wait(self): 160 def spin(): 161 i = 0 162 spinners = [". ", ".. ", "..."] 163 while len(self.threads) > 0: 164 timesLock.acquire() 165 sys.stderr.write( 166 "\r" + spinners[i % len(spinners)] + 167 " (%d threads running)" % len(self.threads) + 168 " \r" # spaces for erasing characters 169 ) 170 timesLock.release() 171 time.sleep(0.5) 172 i += 1 173 174 if not self.concise: 175 ts = Thread(target = spin); 176 ts.start() 177 178 for t in self.threads: 179 t.join() 180 181 exceptions = [] 182 for t in self.threads: 183 if t.exception: 184 exceptions.append(t.exception) 185 186 self.threads = [] 187 188 if not self.concise: 189 ts.join() 190 191 if len(exceptions): 192 for exc in exceptions: 193 print(exc) 194 raise exceptions[0] 195 196 197def split_arg(arg): 198 raw = shlex.split(arg) 199 result = [] 200 for r in raw: 201 if '~' in r: 202 result.append(os.path.expanduser(r)) 203 else: 204 result.append(r) 205 return result 206 207 208def run(args, threadRunner, name, nano, arg, i): 209 def task(): 210 file_i = "%s/%s.out%d" % (args.outdir, name, i) 211 212 should_run = not args.noinit and not (name == args.b and args.skip_b) 213 if i <= 0: 214 should_run = True # always run for suspects 215 216 if should_run: 217 if i > 0: 218 timesLock.acquire() 219 print("Init run %d for %s..." % (i, name)) 220 timesLock.release() 221 subprocess.check_call(["touch", file_i]) 222 with open(file_i, 'w') as f: 223 subprocess.check_call([nano] + split_arg(arg) + 224 ["--config", args.config], stderr=f, stdout=f) 225 226 timesLock.acquire() 227 append_times_from_file(args, name, file_i) 228 timesLock.release() 229 230 threadRunner.add(args, task) 231 232 233def init_run(args): 234 threadRunner = ThreadRunner(args) 235 for i in range(1, max(args.repeat, args.threads / 2) + 1): 236 run(args, threadRunner, args.a, args.nano_a, args.arg_a, i) 237 run(args, threadRunner, args.b, args.nano_b, args.arg_b, i) 238 threadRunner.wait() 239 240 241def get_lower_upper(values): 242 i = max(0, (len(values) - 1) / FACTOR) 243 return values[i], values[-i - 1] 244 245 246def different_enough(lower1, upper2): 247 return upper2 < DIFF_T * lower1 248 249 250# TODO(liyuqian): we used this hacky criteria mainly because that I didn't have 251# time to study more rigorous statistical tests. We should adopt a more rigorous 252# test in the future. 253def get_suspects(): 254 suspects = [] 255 for bench in timesA.keys(): 256 if bench not in timesB: 257 continue 258 lowerA, upperA = get_lower_upper(timesA[bench]) 259 lowerB, upperB = get_lower_upper(timesB[bench]) 260 if different_enough(lowerA, upperB) or different_enough(lowerB, upperA): 261 suspects.append(bench) 262 return suspects 263 264 265def process_bench_pattern(s): 266 if ".skp" in s: # skp bench won't match their exact names... 267 return "^\"" + s[0:(s.index(".skp") + 3)] + "\"" 268 else: 269 return "^\"" + s + "\"$" 270 271 272def suspects_arg(suspects): 273 patterns = map(process_bench_pattern, suspects) 274 return " --match " + (" ".join(patterns)) 275 276 277def median(array): 278 return array[len(array) / 2] 279 280 281def regression(bench): 282 a = median(timesA[bench]) 283 b = median(timesB[bench]) 284 if (a == 0): # bad bench, just return no regression 285 return 1 286 return b / a 287 288 289def percentage(x): 290 return (x - 1) * 100 291 292 293def format_r(r): 294 return ('%6.2f' % percentage(r)) + "%" 295 296 297def normalize_r(r): 298 if r > 1.0: 299 return r - 1.0 300 else: 301 return 1.0 - 1/r 302 303 304def test(): 305 args = parse_args() 306 307 init_run(args) 308 last_unchanged_iter = 0 309 last_suspect_number = -1 310 tryCnt = 0 311 it = 0 312 while tryCnt < MAXTRY: 313 it += 1 314 suspects = get_suspects() 315 if len(suspects) != last_suspect_number: 316 last_suspect_number = len(suspects) 317 last_unchanged_iter = it 318 if (len(suspects) == 0 or it - last_unchanged_iter >= TERM): 319 break 320 321 print("Number of suspects at iteration %d: %d" % (it, len(suspects))) 322 threadRunner = ThreadRunner(args) 323 for j in range(1, max(1, args.threads / 2) + 1): 324 run(args, threadRunner, args.a, args.nano_a, 325 args.arg_a + suspects_arg(suspects), -j) 326 run(args, threadRunner, args.b, args.nano_b, 327 args.arg_b + suspects_arg(suspects), -j) 328 tryCnt += 1 329 threadRunner.wait() 330 331 suspects = get_suspects() 332 if len(suspects) == 0: 333 print(("%s and %s does not seem to have significant " + \ 334 "performance differences.") % (args.a, args.b)) 335 else: 336 suspects.sort(key = regression) 337 print("%s (compared to %s) is likely" % (args.a, args.b)) 338 for suspect in suspects: 339 r = regression(suspect) 340 if r < 1: 341 print("\033[31m %s slower in %s\033[0m" % (format_r(1/r), suspect)) 342 else: 343 print("\033[32m %s faster in %s\033[0m" % (format_r(r), suspect)) 344 345 with open("%s/bench_%s_%s.json" % (args.outdir, args.a, args.b), 'w') as f: 346 results = {} 347 for bench in timesA: 348 r = regression(bench) if bench in suspects else 1.0 349 results[bench] = { 350 args.config: { 351 "signed_regression": normalize_r(r), 352 "lower_quantile_ms": get_lower_upper(timesA[bench])[0] * 1e-6, 353 "upper_quantile_ms": get_lower_upper(timesA[bench])[1] * 1e-6, 354 "options": { 355 # TODO(liyuqian): let ab.py call nanobench with --outResultsFile so 356 # nanobench could generate the json for us that's exactly the same 357 # as that being used by perf bots. Currently, we cannot guarantee 358 # that bench is the name (e.g., bench may have additional resolution 359 # information appended after name). 360 "name": bench 361 } 362 } 363 } 364 365 output = {"results": results} 366 if args.githash: 367 output["gitHash"] = args.githash 368 if args.keys: 369 keys = {} 370 for i in range(len(args.keys) / 2): 371 keys[args.keys[i * 2]] = args.keys[i * 2 + 1] 372 output["key"] = keys 373 f.write(json.dumps(output, indent=4)) 374 print(("\033[36mJSON results available in %s\033[0m" % f.name)) 375 376 with open("%s/bench_%s_%s.csv" % (args.outdir, args.a, args.b), 'w') as out: 377 out.write(("bench, significant?, raw regresion, " + 378 "%(A)s quantile (ns), %(B)s quantile (ns), " + 379 "%(A)s (ns), %(B)s (ns)\n") % {'A': args.a, 'B': args.b}) 380 for bench in suspects + timesA.keys(): 381 if (bench not in timesA or bench not in timesB): 382 continue 383 ta = timesA[bench] 384 tb = timesB[bench] 385 out.write( 386 "%s, %s, %f, " % (bench, bench in suspects, regression(bench)) + 387 ' '.join(map(str, get_lower_upper(ta))) + ", " + 388 ' '.join(map(str, get_lower_upper(tb))) + ", " + 389 ("%s, %s\n" % (' '.join(map(str, ta)), ' '.join(map(str, tb)))) 390 ) 391 print(("\033[36m" + 392 "Compared %d benches. " + 393 "%d of them seem to be significantly differrent." + 394 "\033[0m") % 395 (len([x for x in timesA if x in timesB]), len(suspects))) 396 print("\033[36mPlease see detailed bench results in %s\033[0m" % out.name) 397 398 399if __name__ == "__main__": 400 try: 401 test() 402 except Exception as e: 403 print(e) 404 print(HELP) 405 traceback.print_exc() 406 raise e 407