# xref: /aosp_15_r20/external/skia/tools/calmbench/ab.py (revision c8dee2aa9b3f27cf6c858bd81872bdeb2c07ed17)
1#!/usr/bin/python
2# encoding: utf-8
3
4# Copyright 2017 Google Inc.
5#
6# Use of this source code is governed by a BSD-style license that can be found
7# in the LICENSE file.
8#
9# This is an A/B test utility script used by calmbench.py
10#
11# For each bench, we get a distribution of min_ms measurements from nanobench.
12# From that, we try to recover the 1/3 and 2/3 quantiles of the distribution.
13# If range (1/3 quantile, 2/3 quantile) is completely disjoint between A and B,
14# we report that as a regression.
15#
16# The more measurements we have for a bench, the more accurate our quantiles
17# are. However, taking more measurements is time consuming. Hence we'll prune
18# out benches and only take more measurements for benches whose current quantile
19# ranges are disjoint.
20#
21# P.S. The current script is brute forcely translated from a ruby script. So it
22# may be ugly...
23
24
25from __future__ import print_function
26import re
27import os
28import sys
29import time
30import json
31import subprocess
32import shlex
33import multiprocessing
34import traceback
35from argparse import ArgumentParser
36from multiprocessing import Process
37from threading import Thread
38from threading import Lock
39from pdb import set_trace
40
41
# Red (ANSI-colored) warning used both as the --help description and printed
# on any crash in __main__.
HELP = """
\033[31mPlease call calmbench.py to drive this script if you're not doing so.
This script is not supposed to be used by itself. (At least, it's not easy to
use by itself. The calmbench bots may use this script directly.)
\033[0m
"""

# Tuning knobs for the suspect-narrowing loop.
FACTOR  = 3     # lower/upper quantile factor
DIFF_T  = 0.99  # different enough threshold
TERM    = 10    # terminate after this no. of iterations without suspect changes
MAXTRY  = 30    # max number of nanobench tries to narrow down suspects

# Time units in ascending order; index i corresponds to a factor of 1000**i ns.
UNITS   = "ns µs ms s".split()


# Shared measurement tables (bench name -> sorted list of times, normalized to
# ns in add_time), guarded by timesLock.
timesLock = Lock()
timesA  = {}
timesB  = {}
60
61
def parse_args():
  """Parse command-line arguments for the A/B driver.

  Returns the argparse Namespace with skip_b/noinit converted from the
  'true'/'false' strings to bools, and threads resolved to a concrete
  positive integer (auto-detected for CPU configs when -1 is given).
  """
  parser = ArgumentParser(description=HELP)

  parser.add_argument('outdir', type=str, help="output directory")
  parser.add_argument('a', type=str, help="name of A")
  parser.add_argument('b', type=str, help="name of B")
  parser.add_argument('nano_a', type=str, help="path to A's nanobench binary")
  parser.add_argument('nano_b', type=str, help="path to B's nanobench binary")
  parser.add_argument('arg_a', type=str, help="args for A's nanobench run")
  parser.add_argument('arg_b', type=str, help="args for B's nanobench run")
  parser.add_argument('repeat', type=int, help="number of initial runs")
  parser.add_argument('skip_b', type=str, help=("whether to skip running B"
                                                " ('true' or 'false')"))
  parser.add_argument('config', type=str, help="nanobench config")
  parser.add_argument('threads', type=int, help="number of threads to run")
  parser.add_argument('noinit', type=str, help=("whether to skip the initial"
                                                " runs ('true' or 'false')"))

  parser.add_argument('--concise', dest='concise', action="store_true",
      help="If set, no verbose thread info will be printed.")
  parser.set_defaults(concise=False)

  # Additional args for bots
  BHELP = "bot specific options"
  parser.add_argument('--githash', type=str, default="", help=BHELP)
  parser.add_argument('--keys', type=str, default=[], nargs='+', help=BHELP)

  args = parser.parse_args()
  args.skip_b = args.skip_b == "true"
  args.noinit = args.noinit == "true"

  if args.threads == -1:
    args.threads = 1
    if args.config in ["8888", "565"]: # multi-thread for CPU only
      # Floor division keeps this an int (a float here breaks range()
      # and comparisons on Python 3).
      args.threads = max(1, multiprocessing.cpu_count() // 2)

  return args
99
def append_dict_sorted_array(dict_array, key, value):
  """Insert value into dict_array[key], keeping that list sorted ascending."""
  bucket = dict_array.setdefault(key, [])
  bucket.append(value)
  bucket.sort()
105
106
def add_time(args, name, bench, t, unit):
  """Record one measurement for bench, normalized to ns, in A's or B's table.

  UNITS is ordered ns, µs, ms, s, so each step up multiplies by 1000.
  The run name decides the table: names starting with args.a go to timesA,
  everything else to timesB.
  """
  in_ns = t * 1000 ** UNITS.index(unit)
  table = timesA if name.startswith(args.a) else timesB
  append_dict_sorted_array(table, bench, in_ns)
113
114
# Pattern for nanobench's time column, e.g. "7.48ms": an optionally signed
# decimal number immediately followed by a unit. Compiled once; the decimal
# point is escaped -- the original pattern's bare '.' matched any character,
# so e.g. "1x5ms" would have captured "1x5" and crashed float().
_TIME_RE = re.compile(r"([+-]?\d*\.?\d+)(s|ms|µs|ns)")


def append_times_from_file(args, name, filename):
  """Parse one nanobench output file and record its timings via add_time.

  Only lines with more than 10 whitespace-separated columns are considered:
  column 3 holds the time ("<num><unit>"), column 9 the config (must equal
  args.config), and column 10 the bench name.
  """
  with open(filename) as f:
    lines = f.readlines()
  for line in lines:
    items = line.split()
    if len(items) > 10:
      bench = items[10]
      matches = _TIME_RE.search(items[3])
      if not matches or items[9] != args.config:
        continue
      time_num = matches.group(1)
      time_unit = matches.group(2)
      add_time(args, name, bench, float(time_num), time_unit)
128
129
class ThreadWithException(Thread):
  """A Thread that records, rather than swallows, its target's exception.

  threading.Thread's default run() lets exceptions escape to the thread's
  (invisible) top level; here they are captured in self.exception so that
  ThreadRunner.wait() can inspect and re-raise them after join().
  """

  def __init__(self, target):
    super(ThreadWithException, self).__init__(target = target)
    # Keep our own reference to the callable: the base class stores it in a
    # private attribute whose name differs between Python 2 (_Thread__target)
    # and Python 3 (_target), so relying on the mangled name is fragile.
    self._target_callable = target
    self.exception = None

  def run(self):
    try:
      # Targets are scheduled without arguments (see ThreadRunner.add).
      self._target_callable()
    except BaseException as e:
      self.exception = e

  def join(self, timeout=None):
    super(ThreadWithException, self).join(timeout)
143
144
class ThreadRunner:
  """Simplest and stupidiest threaded executer.

  Runs at most args.threads tasks concurrently: add() drains the whole pool
  (via wait()) whenever it is full, and wait() joins every running thread,
  then re-raises the first exception any of them recorded.
  """
  def __init__(self, args):
    # args.concise suppresses the stderr spinner in wait().
    self.concise = args.concise
    self.threads = []  # currently-running ThreadWithException instances

  def add(self, args, fn):
    # Throttle: once args.threads tasks are outstanding, wait for all of
    # them before starting another.
    if len(self.threads) >= args.threads:
      self.wait()
    t = ThreadWithException(target = fn)
    t.daemon = True
    self.threads.append(t)
    t.start()

  def wait(self):
    # Progress spinner written to stderr while workers are still running.
    def spin():
      i = 0
      spinners = [".  ", ".. ", "..."]
      while len(self.threads) > 0:
        # timesLock also serializes worker prints, so hold it while writing
        # to avoid interleaved terminal output.
        timesLock.acquire()
        sys.stderr.write(
            "\r" + spinners[i % len(spinners)] +
            " (%d threads running)" % len(self.threads) +
            "           \r" # spaces for erasing characters
        )
        timesLock.release()
        time.sleep(0.5)
        i += 1

    if not self.concise:
      ts = Thread(target = spin);
      ts.start()

    for t in self.threads:
      t.join()

    # Collect exceptions recorded by ThreadWithException.run().
    exceptions = []
    for t in self.threads:
      if t.exception:
        exceptions.append(t.exception)

    # Emptying self.threads is what makes the spinner loop terminate.
    self.threads = []

    if not self.concise:
      ts.join()

    # Report every failure, then re-raise the first one.
    if len(exceptions):
      for exc in exceptions:
        print(exc)
      raise exceptions[0]
195
196
def split_arg(arg):
  """Shell-split arg, expanding '~' in any token to the user's home dir."""
  return [os.path.expanduser(token) if '~' in token else token
          for token in shlex.split(arg)]
206
207
def run(args, threadRunner, name, nano, arg, i):
  """Schedule one nanobench invocation (for A or B) on the thread runner.

  i > 0 marks an initial run; i <= 0 marks a suspect-narrowing run, which is
  always executed even when --noinit or skip_b would otherwise skip it.
  Output is written to <outdir>/<name>.out<i> and then parsed into the
  shared timesA/timesB tables.
  """
  def task():
    file_i = "%s/%s.out%d" % (args.outdir, name, i)

    should_run = not args.noinit and not (name == args.b and args.skip_b)
    if i <= 0:
      should_run = True # always run for suspects

    if should_run:
      if i > 0:
        timesLock.acquire()
        print("Init run %d for %s..." % (i, name))
        timesLock.release()
      # open(..., 'w') creates/truncates the file itself, so no separate
      # "touch" subprocess (which was also non-portable) is needed.
      with open(file_i, 'w') as f:
        subprocess.check_call([nano] + split_arg(arg) +
                              ["--config", args.config], stderr=f, stdout=f)

    timesLock.acquire()
    append_times_from_file(args, name, file_i)
    timesLock.release()

  threadRunner.add(args, task)
231
232
def init_run(args):
  """Kick off the initial A/B nanobench runs and wait for them all.

  Performs max(args.repeat, args.threads // 2) rounds so the thread pool is
  kept busy even when repeat is small.
  """
  threadRunner = ThreadRunner(args)
  # Floor division keeps the bound an int (range() rejects floats on
  # Python 3).
  for i in range(1, max(args.repeat, args.threads // 2) + 1):
    run(args, threadRunner, args.a, args.nano_a, args.arg_a, i)
    run(args, threadRunner, args.b, args.nano_b, args.arg_b, i)
  threadRunner.wait()
239
240
def get_lower_upper(values):
  """Return (lower, upper) quantile estimates of a sorted, non-empty list.

  Picks elements symmetrically 1/FACTOR of the way in from each end.
  """
  # Floor division: the result is used as a list index, and a float index
  # raises TypeError on Python 3.
  i = max(0, (len(values) - 1) // FACTOR)
  return values[i], values[-i - 1]
244
245
def different_enough(lower1, upper2):
  """True when upper2 falls below lower1 scaled down by the DIFF_T margin."""
  return DIFF_T * lower1 > upper2
248
249
250# TODO(liyuqian): we used this hacky criteria mainly because that I didn't have
251# time to study more rigorous statistical tests. We should adopt a more rigorous
252# test in the future.
# TODO(liyuqian): we used this hacky criteria mainly because that I didn't have
# time to study more rigorous statistical tests. We should adopt a more rigorous
# test in the future.
def get_suspects():
  """Return benches whose A and B quantile ranges are disjoint enough."""
  suspects = []
  for bench, a_values in timesA.items():
    b_values = timesB.get(bench)
    if b_values is None:
      continue
    lower_a, upper_a = get_lower_upper(a_values)
    lower_b, upper_b = get_lower_upper(b_values)
    if different_enough(lower_a, upper_b) or \
       different_enough(lower_b, upper_a):
      suspects.append(bench)
  return suspects
263
264
def process_bench_pattern(s):
  """Build a nanobench --match regex for bench name s.

  .skp benches get an open-ended prefix pattern (their reported names won't
  match exactly); everything else is anchored on both ends.
  """
  if ".skp" not in s:
    return "^\"" + s + "\"$"
  # NOTE(review): index + 3 keeps only up to ".sk" (drops the final 'p');
  # as a prefix regex it still matches the full name -- confirm intended.
  return "^\"" + s[:s.index(".skp") + 3] + "\""
270
271
def suspects_arg(suspects):
  """Build the ' --match <patterns>' nanobench argument for these benches."""
  patterns = [process_bench_pattern(s) for s in suspects]
  return " --match " + " ".join(patterns)
275
276
def median(array):
  """Return the upper median (index len // 2) of a sorted, non-empty list."""
  # Floor division: a float index raises TypeError on Python 3.
  return array[len(array) // 2]
279
280
def regression(bench):
  """Return bench's B/A median-time ratio (1 when A's median is zero)."""
  median_a = median(timesA[bench])
  median_b = median(timesB[bench])
  if not median_a: # bad bench, just return no regression
    return 1
  return median_b / median_a
287
288
def percentage(x):
  """Convert a ratio to a percent delta, e.g. 1.25 -> 25.0."""
  return 100 * (x - 1)
291
292
def format_r(r):
  """Format ratio r as a fixed-width percentage string, e.g. ' 25.00%'."""
  return "%6.2f%%" % percentage(r)
295
296
def normalize_r(r):
  """Return r - 1.0 when r > 1.0, otherwise 1.0 - 1/r.

  Maps a ratio to a signed value that is symmetric around zero, e.g.
  2.0 -> 1.0 and 0.5 -> -1.0.
  """
  if r > 1.0:
    return r - 1.0
  return 1.0 - 1 / r
302
303
def test():
  """Top-level A/B flow: initial runs, iterative suspect narrowing, then
  terminal, JSON, and CSV reports."""
  args = parse_args()

  init_run(args)
  last_unchanged_iter = 0
  last_suspect_number = -1
  tryCnt = 0
  it = 0
  while tryCnt < MAXTRY:
    it += 1
    suspects = get_suspects()
    if len(suspects) != last_suspect_number:
      last_suspect_number = len(suspects)
      last_unchanged_iter = it
    # Stop once nothing is suspicious, or the suspect set has been stable
    # for TERM iterations.
    if (len(suspects) == 0 or it - last_unchanged_iter >= TERM):
      break

    print("Number of suspects at iteration %d: %d" % (it, len(suspects)))
    threadRunner = ThreadRunner(args)
    # Floor division keeps the bound an int so range() works on Python 3.
    for j in range(1, max(1, args.threads // 2) + 1):
      run(args, threadRunner, args.a, args.nano_a,
          args.arg_a + suspects_arg(suspects), -j)
      run(args, threadRunner, args.b, args.nano_b,
          args.arg_b + suspects_arg(suspects), -j)
      tryCnt += 1
    threadRunner.wait()

  suspects = get_suspects()
  if len(suspects) == 0:
    print(("%s and %s do not seem to have significant " + \
           "performance differences.") % (args.a, args.b))
  else:
    suspects.sort(key = regression)
    print("%s (compared to %s) is likely" % (args.a, args.b))
    for suspect in suspects:
      r = regression(suspect)
      if r < 1:
        print("\033[31m  %s slower in %s\033[0m" % (format_r(1/r), suspect))
      else:
        print("\033[32m  %s faster in %s\033[0m" % (format_r(r), suspect))

  with open("%s/bench_%s_%s.json" % (args.outdir, args.a, args.b), 'w') as f:
    results = {}
    for bench in timesA:
      r = regression(bench) if bench in suspects else 1.0
      results[bench] = {
        args.config: {
          "signed_regression": normalize_r(r),
          # Times are stored in ns; 1e-6 converts them to ms.
          "lower_quantile_ms": get_lower_upper(timesA[bench])[0] * 1e-6,
          "upper_quantile_ms": get_lower_upper(timesA[bench])[1] * 1e-6,
          "options": {
            # TODO(liyuqian): let ab.py call nanobench with --outResultsFile so
            # nanobench could generate the json for us that's exactly the same
            # as that being used by perf bots. Currently, we cannot guarantee
            # that bench is the name (e.g., bench may have additional resolution
            # information appended after name).
            "name": bench
          }
        }
      }

    output = {"results": results}
    if args.githash:
      output["gitHash"] = args.githash
    if args.keys:
      keys = {}
      # --keys is a flat [key1, value1, key2, value2, ...] list.
      for i in range(len(args.keys) // 2):
        keys[args.keys[i * 2]] = args.keys[i * 2 + 1]
      output["key"] = keys
    f.write(json.dumps(output, indent=4))
    print(("\033[36mJSON results available in %s\033[0m" % f.name))

  with open("%s/bench_%s_%s.csv" % (args.outdir, args.a, args.b), 'w') as out:
    out.write(("bench, significant?, raw regression, " +
                   "%(A)s quantile (ns), %(B)s quantile (ns), " +
                   "%(A)s (ns), %(B)s (ns)\n") % {'A': args.a, 'B': args.b})
    # list(...) because dict.keys() cannot be concatenated to a list on
    # Python 3. Suspects appear first, then again among timesA's keys,
    # matching the original report layout.
    for bench in suspects + list(timesA.keys()):
      if (bench not in timesA or bench not in timesB):
        continue
      ta = timesA[bench]
      tb = timesB[bench]
      out.write(
          "%s, %s, %f, " % (bench, bench in suspects, regression(bench)) +
          ' '.join(map(str, get_lower_upper(ta))) + ", " +
          ' '.join(map(str, get_lower_upper(tb))) + ", " +
          ("%s, %s\n" % (' '.join(map(str, ta)), ' '.join(map(str, tb))))
      )
    print(("\033[36m" +
           "Compared %d benches. " +
           "%d of them seem to be significantly different." +
           "\033[0m") %
           (len([x for x in timesA if x in timesB]), len(suspects)))
    print("\033[36mPlease see detailed bench results in %s\033[0m" % out.name)
397
398
if __name__ == "__main__":
  try:
    test()
  except Exception as e:
    # Print the error plus the "use calmbench.py" hint, then re-raise so the
    # process exits non-zero. A bare raise preserves the original traceback
    # on both Python 2 and 3 ("raise e" discards it on Python 2).
    print(e)
    print(HELP)
    traceback.print_exc()
    raise
407