#!/usr/bin/env python3

"""
compare.py - versatile benchmark output compare tool
"""

import argparse
from argparse import ArgumentParser
import json
import os
import sys
import unittest

import gbench
from gbench import util, report


def check_inputs(in1, in2, flags):
    """
    Perform checking on the user-provided inputs and diagnose any abnormalities.
    """
    in1_kind, in1_err = util.classify_input_file(in1)
    in2_kind, in2_err = util.classify_input_file(in2)
    output_file = util.find_benchmark_flag('--benchmark_out=', flags)
    output_type = util.find_benchmark_flag('--benchmark_out_format=', flags)
    if in1_kind == util.IT_Executable and in2_kind == util.IT_Executable and output_file:
        print(("WARNING: '--benchmark_out=%s' will be passed to both "
               "benchmarks, causing it to be overwritten") % output_file)
    if in1_kind == util.IT_JSON and in2_kind == util.IT_JSON:
        # When both sides are JSON, the only supported flag is
        # --benchmark_filter=
        for flag in util.remove_benchmark_flags('--benchmark_filter=', flags):
            print("WARNING: passing %s has no effect since both "
                  "inputs are JSON" % flag)
    if output_type is not None and output_type != 'json':
        print(("ERROR: passing '--benchmark_out_format=%s' to 'compare.py'"
               " is not supported.") % output_type)
        sys.exit(1)


def create_parser():
    parser = ArgumentParser(
        description='versatile benchmark output compare tool')

    parser.add_argument(
        '-a',
        '--display_aggregates_only',
        dest='display_aggregates_only',
        action="store_true",
        help="If there are repetitions, by default we display everything: the "
             "actual runs and the computed aggregates. Sometimes it is "
             "desirable to view only the aggregates, e.g. when there are a "
             "lot of repetitions. Note that only the display is affected; "
             "internally, all the actual runs are still used, e.g. for the "
             "U test.")

    parser.add_argument(
        '--no-color',
        dest='color',
        default=True,
        action="store_false",
        help="Do not use colors in the terminal output"
    )

    parser.add_argument(
        '-d',
        '--dump_to_json',
        dest='dump_to_json',
        help="Additionally, dump the benchmark comparison output to this file in JSON format.")

    utest = parser.add_argument_group()
    utest.add_argument(
        '--no-utest',
        dest='utest',
        default=True,
        action="store_false",
        help=("The tool can do a two-tailed Mann-Whitney U test with the null "
              "hypothesis that it is equally likely that a randomly selected "
              "value from one sample will be less than or greater than a "
              "randomly selected value from a second sample.\n"
              "WARNING: requires a **LARGE** number of repetitions (no fewer "
              "than {}) to be meaningful!\n"
              "The test is done by default if at least {} repetitions were "
              "run.\n"
              "This option disables the U test.").format(
            report.UTEST_OPTIMAL_REPETITIONS, report.UTEST_MIN_REPETITIONS))
    alpha_default = 0.05
    utest.add_argument(
        "--alpha",
        dest='utest_alpha',
        default=alpha_default,
        type=float,
        help=("significance level alpha. If the calculated p-value is below "
              "this value, the result is said to be statistically significant "
              "and the null hypothesis is rejected.\n(default: %0.4f)") %
        alpha_default)

    subparsers = parser.add_subparsers(
        help='This tool has multiple modes of operation:',
        dest='mode')

    parser_a = subparsers.add_parser(
        'benchmarks',
        help='The simplest use case: compare all the output of these two benchmarks')
    baseline = parser_a.add_argument_group(
        'baseline', 'The benchmark baseline')
    baseline.add_argument(
        'test_baseline',
        metavar='test_baseline',
        type=argparse.FileType('r'),
        nargs=1,
        help='A benchmark executable or JSON output file')
    contender = parser_a.add_argument_group(
        'contender', 'The benchmark that will be compared against the baseline')
    contender.add_argument(
        'test_contender',
        metavar='test_contender',
        type=argparse.FileType('r'),
        nargs=1,
        help='A benchmark executable or JSON output file')
    parser_a.add_argument(
        'benchmark_options',
        metavar='benchmark_options',
        nargs=argparse.REMAINDER,
        help='Arguments to pass when running benchmark executables')

    parser_b = subparsers.add_parser(
        'filters', help='Compare filter one with filter two of a single benchmark')
    baseline = parser_b.add_argument_group(
        'baseline', 'The benchmark baseline')
    baseline.add_argument(
        'test',
        metavar='test',
        type=argparse.FileType('r'),
        nargs=1,
        help='A benchmark executable or JSON output file')
    baseline.add_argument(
        'filter_baseline',
        metavar='filter_baseline',
        type=str,
        nargs=1,
        help='The first filter, which will be used as the baseline')
    contender = parser_b.add_argument_group(
        'contender', 'The benchmark that will be compared against the baseline')
    contender.add_argument(
        'filter_contender',
        metavar='filter_contender',
        type=str,
        nargs=1,
        help='The second filter, which will be compared against the baseline')
    parser_b.add_argument(
        'benchmark_options',
        metavar='benchmark_options',
        nargs=argparse.REMAINDER,
        help='Arguments to pass when running benchmark executables')

    parser_c = subparsers.add_parser(
        'benchmarksfiltered',
        help='Compare filter one of the first benchmark with filter two of the second benchmark')
    baseline = parser_c.add_argument_group(
        'baseline', 'The benchmark baseline')
    baseline.add_argument(
        'test_baseline',
        metavar='test_baseline',
        type=argparse.FileType('r'),
        nargs=1,
        help='A benchmark executable or JSON output file')
    baseline.add_argument(
        'filter_baseline',
        metavar='filter_baseline',
        type=str,
        nargs=1,
        help='The first filter, which will be used as the baseline')
    contender = parser_c.add_argument_group(
        'contender', 'The benchmark that will be compared against the baseline')
    contender.add_argument(
        'test_contender',
        metavar='test_contender',
        type=argparse.FileType('r'),
        nargs=1,
        help='The second benchmark executable or JSON output file, which will be compared against the baseline')
    contender.add_argument(
        'filter_contender',
        metavar='filter_contender',
        type=str,
        nargs=1,
        help='The second filter, which will be compared against the baseline')
    parser_c.add_argument(
        'benchmark_options',
        metavar='benchmark_options',
        nargs=argparse.REMAINDER,
        help='Arguments to pass when running benchmark executables')

    return parser
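

# Illustrative invocations of the three modes defined above (the benchmark
# paths and filter regexes shown are hypothetical, only meant to show the
# expected argument shapes):
#
#   compare.py benchmarks ./old_build/bench ./new_build/bench
#   compare.py filters ./build/bench BM_RegexBaseline BM_RegexContender
#   compare.py benchmarksfiltered ./old_build/bench BM_Foo ./new_build/bench BM_Foo
#
# Anything after the positional arguments (or after a literal '--') is
# collected into 'benchmark_options' and forwarded to the benchmark
# executables when they are run.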


def main():
    # Parse the command line flags.
    parser = create_parser()
    args, unknown_args = parser.parse_known_args()
    if args.mode is None:
        parser.print_help()
        sys.exit(1)
    assert not unknown_args
    benchmark_options = args.benchmark_options

    if args.mode == 'benchmarks':
        test_baseline = args.test_baseline[0].name
        test_contender = args.test_contender[0].name
        filter_baseline = ''
        filter_contender = ''

        # NOTE: if test_baseline == test_contender, you are analyzing the stdev

        description = 'Comparing %s to %s' % (test_baseline, test_contender)
    elif args.mode == 'filters':
        test_baseline = args.test[0].name
        test_contender = args.test[0].name
        filter_baseline = args.filter_baseline[0]
        filter_contender = args.filter_contender[0]

        # NOTE: if filter_baseline == filter_contender, you are analyzing the
        # stdev

        description = 'Comparing %s to %s (from %s)' % (
            filter_baseline, filter_contender, args.test[0].name)
    elif args.mode == 'benchmarksfiltered':
        test_baseline = args.test_baseline[0].name
        test_contender = args.test_contender[0].name
        filter_baseline = args.filter_baseline[0]
        filter_contender = args.filter_contender[0]

        # NOTE: if test_baseline == test_contender and
        # filter_baseline == filter_contender, you are analyzing the stdev

        description = 'Comparing %s (from %s) to %s (from %s)' % (
            filter_baseline, test_baseline, filter_contender, test_contender)
    else:
        # Should never happen.
        print("Unrecognized mode of operation: '%s'" % args.mode)
        parser.print_help()
        sys.exit(1)

    check_inputs(test_baseline, test_contender, benchmark_options)

    if args.display_aggregates_only:
        benchmark_options += ['--benchmark_display_aggregates_only=true']

    options_baseline = []
    options_contender = []

    if filter_baseline and filter_contender:
        options_baseline = ['--benchmark_filter=%s' % filter_baseline]
        options_contender = ['--benchmark_filter=%s' % filter_contender]

    # Run the benchmarks and report the results.
    json1 = json1_orig = gbench.util.sort_benchmark_results(
        gbench.util.run_or_load_benchmark(
            test_baseline, benchmark_options + options_baseline))
    json2 = json2_orig = gbench.util.sort_benchmark_results(
        gbench.util.run_or_load_benchmark(
            test_contender, benchmark_options + options_contender))

    # Now, filter the benchmarks so that the difference report can work.
    if filter_baseline and filter_contender:
        replacement = '[%s vs. %s]' % (filter_baseline, filter_contender)
        json1 = gbench.report.filter_benchmark(
            json1_orig, filter_baseline, replacement)
        json2 = gbench.report.filter_benchmark(
            json2_orig, filter_contender, replacement)

    diff_report = gbench.report.get_difference_report(
        json1, json2, args.utest)
    output_lines = gbench.report.print_difference_report(
        diff_report,
        args.display_aggregates_only,
        args.utest, args.utest_alpha, args.color)
    print(description)
    for ln in output_lines:
        print(ln)

    # Optionally, dump the diff report to JSON.
    if args.dump_to_json is not None:
        with open(args.dump_to_json, 'w') as f_json:
            json.dump(diff_report, f_json)
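

# The tests below exercise only the command-line parsing logic. They rely on
# the sample JSON outputs under gbench/Inputs/ (test1_run1.json and
# test1_run2.json) being present, because argparse.FileType('r') opens each
# positional test file while parsing.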
class TestParser(unittest.TestCase):
    def setUp(self):
        self.parser = create_parser()
        testInputs = os.path.join(
            os.path.dirname(
                os.path.realpath(__file__)),
            'gbench',
            'Inputs')
        self.testInput0 = os.path.join(testInputs, 'test1_run1.json')
        self.testInput1 = os.path.join(testInputs, 'test1_run2.json')

    def test_benchmarks_basic(self):
        parsed = self.parser.parse_args(
            ['benchmarks', self.testInput0, self.testInput1])
        self.assertFalse(parsed.display_aggregates_only)
        self.assertTrue(parsed.utest)
        self.assertEqual(parsed.mode, 'benchmarks')
        self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
        self.assertEqual(parsed.test_contender[0].name, self.testInput1)
        self.assertFalse(parsed.benchmark_options)

    def test_benchmarks_basic_without_utest(self):
        parsed = self.parser.parse_args(
            ['--no-utest', 'benchmarks', self.testInput0, self.testInput1])
        self.assertFalse(parsed.display_aggregates_only)
        self.assertFalse(parsed.utest)
        self.assertEqual(parsed.utest_alpha, 0.05)
        self.assertEqual(parsed.mode, 'benchmarks')
        self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
        self.assertEqual(parsed.test_contender[0].name, self.testInput1)
        self.assertFalse(parsed.benchmark_options)

    def test_benchmarks_basic_display_aggregates_only(self):
        parsed = self.parser.parse_args(
            ['-a', 'benchmarks', self.testInput0, self.testInput1])
        self.assertTrue(parsed.display_aggregates_only)
        self.assertTrue(parsed.utest)
        self.assertEqual(parsed.mode, 'benchmarks')
        self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
        self.assertEqual(parsed.test_contender[0].name, self.testInput1)
        self.assertFalse(parsed.benchmark_options)

    def test_benchmarks_basic_with_utest_alpha(self):
        parsed = self.parser.parse_args(
            ['--alpha=0.314', 'benchmarks', self.testInput0, self.testInput1])
        self.assertFalse(parsed.display_aggregates_only)
        self.assertTrue(parsed.utest)
        self.assertEqual(parsed.utest_alpha, 0.314)
        self.assertEqual(parsed.mode, 'benchmarks')
        self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
        self.assertEqual(parsed.test_contender[0].name, self.testInput1)
        self.assertFalse(parsed.benchmark_options)

    def test_benchmarks_basic_without_utest_with_utest_alpha(self):
        parsed = self.parser.parse_args(
            ['--no-utest', '--alpha=0.314', 'benchmarks',
             self.testInput0, self.testInput1])
        self.assertFalse(parsed.display_aggregates_only)
        self.assertFalse(parsed.utest)
        self.assertEqual(parsed.utest_alpha, 0.314)
        self.assertEqual(parsed.mode, 'benchmarks')
        self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
        self.assertEqual(parsed.test_contender[0].name, self.testInput1)
        self.assertFalse(parsed.benchmark_options)

    def test_benchmarks_with_remainder(self):
        parsed = self.parser.parse_args(
            ['benchmarks', self.testInput0, self.testInput1, 'd'])
        self.assertFalse(parsed.display_aggregates_only)
        self.assertTrue(parsed.utest)
        self.assertEqual(parsed.mode, 'benchmarks')
        self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
        self.assertEqual(parsed.test_contender[0].name, self.testInput1)
        self.assertEqual(parsed.benchmark_options, ['d'])

    def test_benchmarks_with_remainder_after_doubleminus(self):
        parsed = self.parser.parse_args(
            ['benchmarks', self.testInput0, self.testInput1, '--', 'e'])
        self.assertFalse(parsed.display_aggregates_only)
        self.assertTrue(parsed.utest)
        self.assertEqual(parsed.mode, 'benchmarks')
        self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
        self.assertEqual(parsed.test_contender[0].name, self.testInput1)
        self.assertEqual(parsed.benchmark_options, ['e'])

    def test_filters_basic(self):
        parsed = self.parser.parse_args(
            ['filters', self.testInput0, 'c', 'd'])
        self.assertFalse(parsed.display_aggregates_only)
        self.assertTrue(parsed.utest)
        self.assertEqual(parsed.mode, 'filters')
        self.assertEqual(parsed.test[0].name, self.testInput0)
        self.assertEqual(parsed.filter_baseline[0], 'c')
        self.assertEqual(parsed.filter_contender[0], 'd')
        self.assertFalse(parsed.benchmark_options)

    def test_filters_with_remainder(self):
        parsed = self.parser.parse_args(
            ['filters', self.testInput0, 'c', 'd', 'e'])
        self.assertFalse(parsed.display_aggregates_only)
        self.assertTrue(parsed.utest)
        self.assertEqual(parsed.mode, 'filters')
        self.assertEqual(parsed.test[0].name, self.testInput0)
        self.assertEqual(parsed.filter_baseline[0], 'c')
        self.assertEqual(parsed.filter_contender[0], 'd')
        self.assertEqual(parsed.benchmark_options, ['e'])

    def test_filters_with_remainder_after_doubleminus(self):
        parsed = self.parser.parse_args(
            ['filters', self.testInput0, 'c', 'd', '--', 'f'])
        self.assertFalse(parsed.display_aggregates_only)
        self.assertTrue(parsed.utest)
        self.assertEqual(parsed.mode, 'filters')
        self.assertEqual(parsed.test[0].name, self.testInput0)
        self.assertEqual(parsed.filter_baseline[0], 'c')
        self.assertEqual(parsed.filter_contender[0], 'd')
        self.assertEqual(parsed.benchmark_options, ['f'])

    def test_benchmarksfiltered_basic(self):
        parsed = self.parser.parse_args(
            ['benchmarksfiltered', self.testInput0, 'c', self.testInput1, 'e'])
        self.assertFalse(parsed.display_aggregates_only)
        self.assertTrue(parsed.utest)
        self.assertEqual(parsed.mode, 'benchmarksfiltered')
        self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
        self.assertEqual(parsed.filter_baseline[0], 'c')
        self.assertEqual(parsed.test_contender[0].name, self.testInput1)
        self.assertEqual(parsed.filter_contender[0], 'e')
        self.assertFalse(parsed.benchmark_options)

    def test_benchmarksfiltered_with_remainder(self):
        parsed = self.parser.parse_args(
            ['benchmarksfiltered', self.testInput0, 'c',
             self.testInput1, 'e', 'f'])
        self.assertFalse(parsed.display_aggregates_only)
        self.assertTrue(parsed.utest)
        self.assertEqual(parsed.mode, 'benchmarksfiltered')
        self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
        self.assertEqual(parsed.filter_baseline[0], 'c')
        self.assertEqual(parsed.test_contender[0].name, self.testInput1)
        self.assertEqual(parsed.filter_contender[0], 'e')
        self.assertEqual(parsed.benchmark_options[0], 'f')

    def test_benchmarksfiltered_with_remainder_after_doubleminus(self):
        parsed = self.parser.parse_args(
            ['benchmarksfiltered', self.testInput0, 'c',
             self.testInput1, 'e', '--', 'g'])
        self.assertFalse(parsed.display_aggregates_only)
        self.assertTrue(parsed.utest)
        self.assertEqual(parsed.mode, 'benchmarksfiltered')
        self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
        self.assertEqual(parsed.filter_baseline[0], 'c')
        self.assertEqual(parsed.test_contender[0].name, self.testInput1)
        self.assertEqual(parsed.filter_contender[0], 'e')
        self.assertEqual(parsed.benchmark_options[0], 'g')


if __name__ == '__main__':
    # unittest.main()
    main()

# vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4
# kate: tab-width: 4; replace-tabs on; indent-width 4; tab-indents: off;
# kate: indent-mode python; remove-trailing-spaces modified;