1#!/usr/bin/env python3 2 3# type: ignore 4 5""" 6compare.py - versatile benchmark output compare tool 7""" 8 9import argparse 10import json 11import os 12import sys 13import unittest 14from argparse import ArgumentParser 15 16import gbench 17from gbench import report, util 18 19 20def check_inputs(in1, in2, flags): 21 """ 22 Perform checking on the user provided inputs and diagnose any abnormalities 23 """ 24 in1_kind, in1_err = util.classify_input_file(in1) 25 in2_kind, in2_err = util.classify_input_file(in2) 26 output_file = util.find_benchmark_flag("--benchmark_out=", flags) 27 output_type = util.find_benchmark_flag("--benchmark_out_format=", flags) 28 if ( 29 in1_kind == util.IT_Executable 30 and in2_kind == util.IT_Executable 31 and output_file 32 ): 33 print( 34 ( 35 "WARNING: '--benchmark_out=%s' will be passed to both " 36 "benchmarks causing it to be overwritten" 37 ) 38 % output_file 39 ) 40 if in1_kind == util.IT_JSON and in2_kind == util.IT_JSON: 41 # When both sides are JSON the only supported flag is 42 # --benchmark_filter= 43 for flag in util.remove_benchmark_flags("--benchmark_filter=", flags): 44 print( 45 "WARNING: passing %s has no effect since both " 46 "inputs are JSON" % flag 47 ) 48 if output_type is not None and output_type != "json": 49 print( 50 ( 51 "ERROR: passing '--benchmark_out_format=%s' to 'compare.py`" 52 " is not supported." 53 ) 54 % output_type 55 ) 56 sys.exit(1) 57 58 59def create_parser(): 60 parser = ArgumentParser( 61 description="versatile benchmark output compare tool" 62 ) 63 64 parser.add_argument( 65 "-a", 66 "--display_aggregates_only", 67 dest="display_aggregates_only", 68 action="store_true", 69 help="If there are repetitions, by default, we display everything - the" 70 " actual runs, and the aggregates computed. Sometimes, it is " 71 "desirable to only view the aggregates. E.g. when there are a lot " 72 "of repetitions. Do note that only the display is affected. " 73 "Internally, all the actual runs are still used, e.g. for U test.", 74 ) 75 76 parser.add_argument( 77 "--no-color", 78 dest="color", 79 default=True, 80 action="store_false", 81 help="Do not use colors in the terminal output", 82 ) 83 84 parser.add_argument( 85 "-d", 86 "--dump_to_json", 87 dest="dump_to_json", 88 help="Additionally, dump benchmark comparison output to this file in JSON format.", 89 ) 90 91 utest = parser.add_argument_group() 92 utest.add_argument( 93 "--no-utest", 94 dest="utest", 95 default=True, 96 action="store_false", 97 help="The tool can do a two-tailed Mann-Whitney U test with the null hypothesis that it is equally likely that a randomly selected value from one sample will be less than or greater than a randomly selected value from a second sample.\nWARNING: requires **LARGE** (no less than {}) number of repetitions to be meaningful!\nThe test is being done by default, if at least {} repetitions were done.\nThis option can disable the U Test.".format( 98 report.UTEST_OPTIMAL_REPETITIONS, report.UTEST_MIN_REPETITIONS 99 ), 100 ) 101 alpha_default = 0.05 102 utest.add_argument( 103 "--alpha", 104 dest="utest_alpha", 105 default=alpha_default, 106 type=float, 107 help=( 108 "significance level alpha. if the calculated p-value is below this value, then the result is said to be statistically significant and the null hypothesis is rejected.\n(default: %0.4f)" 109 ) 110 % alpha_default, 111 ) 112 113 subparsers = parser.add_subparsers( 114 help="This tool has multiple modes of operation:", dest="mode" 115 ) 116 117 parser_a = subparsers.add_parser( 118 "benchmarks", 119 help="The most simple use-case, compare all the output of these two benchmarks", 120 ) 121 baseline = parser_a.add_argument_group("baseline", "The benchmark baseline") 122 baseline.add_argument( 123 "test_baseline", 124 metavar="test_baseline", 125 type=argparse.FileType("r"), 126 nargs=1, 127 help="A benchmark executable or JSON output file", 128 ) 129 contender = parser_a.add_argument_group( 130 "contender", "The benchmark that will be compared against the baseline" 131 ) 132 contender.add_argument( 133 "test_contender", 134 metavar="test_contender", 135 type=argparse.FileType("r"), 136 nargs=1, 137 help="A benchmark executable or JSON output file", 138 ) 139 parser_a.add_argument( 140 "benchmark_options", 141 metavar="benchmark_options", 142 nargs=argparse.REMAINDER, 143 help="Arguments to pass when running benchmark executables", 144 ) 145 146 parser_b = subparsers.add_parser( 147 "filters", help="Compare filter one with the filter two of benchmark" 148 ) 149 baseline = parser_b.add_argument_group("baseline", "The benchmark baseline") 150 baseline.add_argument( 151 "test", 152 metavar="test", 153 type=argparse.FileType("r"), 154 nargs=1, 155 help="A benchmark executable or JSON output file", 156 ) 157 baseline.add_argument( 158 "filter_baseline", 159 metavar="filter_baseline", 160 type=str, 161 nargs=1, 162 help="The first filter, that will be used as baseline", 163 ) 164 contender = parser_b.add_argument_group( 165 "contender", "The benchmark that will be compared against the baseline" 166 ) 167 contender.add_argument( 168 "filter_contender", 169 metavar="filter_contender", 170 type=str, 171 nargs=1, 172 help="The second filter, that will be compared against the baseline", 173 ) 174 parser_b.add_argument( 175 "benchmark_options", 176 metavar="benchmark_options", 177 nargs=argparse.REMAINDER, 178 help="Arguments to pass when running benchmark executables", 179 ) 180 181 parser_c = subparsers.add_parser( 182 "benchmarksfiltered", 183 help="Compare filter one of first benchmark with filter two of the second benchmark", 184 ) 185 baseline = parser_c.add_argument_group("baseline", "The benchmark baseline") 186 baseline.add_argument( 187 "test_baseline", 188 metavar="test_baseline", 189 type=argparse.FileType("r"), 190 nargs=1, 191 help="A benchmark executable or JSON output file", 192 ) 193 baseline.add_argument( 194 "filter_baseline", 195 metavar="filter_baseline", 196 type=str, 197 nargs=1, 198 help="The first filter, that will be used as baseline", 199 ) 200 contender = parser_c.add_argument_group( 201 "contender", "The benchmark that will be compared against the baseline" 202 ) 203 contender.add_argument( 204 "test_contender", 205 metavar="test_contender", 206 type=argparse.FileType("r"), 207 nargs=1, 208 help="The second benchmark executable or JSON output file, that will be compared against the baseline", 209 ) 210 contender.add_argument( 211 "filter_contender", 212 metavar="filter_contender", 213 type=str, 214 nargs=1, 215 help="The second filter, that will be compared against the baseline", 216 ) 217 parser_c.add_argument( 218 "benchmark_options", 219 metavar="benchmark_options", 220 nargs=argparse.REMAINDER, 221 help="Arguments to pass when running benchmark executables", 222 ) 223 224 return parser 225 226 227def main(): 228 # Parse the command line flags 229 parser = create_parser() 230 args, unknown_args = parser.parse_known_args() 231 if args.mode is None: 232 parser.print_help() 233 exit(1) 234 assert not unknown_args 235 benchmark_options = args.benchmark_options 236 237 if args.mode == "benchmarks": 238 test_baseline = args.test_baseline[0].name 239 test_contender = args.test_contender[0].name 240 filter_baseline = "" 241 filter_contender = "" 242 243 # NOTE: if test_baseline == test_contender, you are analyzing the stdev 244 245 description = "Comparing %s to %s" % (test_baseline, test_contender) 246 elif args.mode == "filters": 247 test_baseline = args.test[0].name 248 test_contender = args.test[0].name 249 filter_baseline = args.filter_baseline[0] 250 filter_contender = args.filter_contender[0] 251 252 # NOTE: if filter_baseline == filter_contender, you are analyzing the 253 # stdev 254 255 description = "Comparing %s to %s (from %s)" % ( 256 filter_baseline, 257 filter_contender, 258 args.test[0].name, 259 ) 260 elif args.mode == "benchmarksfiltered": 261 test_baseline = args.test_baseline[0].name 262 test_contender = args.test_contender[0].name 263 filter_baseline = args.filter_baseline[0] 264 filter_contender = args.filter_contender[0] 265 266 # NOTE: if test_baseline == test_contender and 267 # filter_baseline == filter_contender, you are analyzing the stdev 268 269 description = "Comparing %s (from %s) to %s (from %s)" % ( 270 filter_baseline, 271 test_baseline, 272 filter_contender, 273 test_contender, 274 ) 275 else: 276 # should never happen 277 print("Unrecognized mode of operation: '%s'" % args.mode) 278 parser.print_help() 279 exit(1) 280 281 check_inputs(test_baseline, test_contender, benchmark_options) 282 283 if args.display_aggregates_only: 284 benchmark_options += ["--benchmark_display_aggregates_only=true"] 285 286 options_baseline = [] 287 options_contender = [] 288 289 if filter_baseline and filter_contender: 290 options_baseline = ["--benchmark_filter=%s" % filter_baseline] 291 options_contender = ["--benchmark_filter=%s" % filter_contender] 292 293 # Run the benchmarks and report the results 294 json1 = json1_orig = gbench.util.sort_benchmark_results( 295 gbench.util.run_or_load_benchmark( 296 test_baseline, benchmark_options + options_baseline 297 ) 298 ) 299 json2 = json2_orig = gbench.util.sort_benchmark_results( 300 gbench.util.run_or_load_benchmark( 301 test_contender, benchmark_options + options_contender 302 ) 303 ) 304 305 # Now, filter the benchmarks so that the difference report can work 306 if filter_baseline and filter_contender: 307 replacement = "[%s vs. %s]" % (filter_baseline, filter_contender) 308 json1 = gbench.report.filter_benchmark( 309 json1_orig, filter_baseline, replacement 310 ) 311 json2 = gbench.report.filter_benchmark( 312 json2_orig, filter_contender, replacement 313 ) 314 315 diff_report = gbench.report.get_difference_report(json1, json2, args.utest) 316 output_lines = gbench.report.print_difference_report( 317 diff_report, 318 args.display_aggregates_only, 319 args.utest, 320 args.utest_alpha, 321 args.color, 322 ) 323 print(description) 324 for ln in output_lines: 325 print(ln) 326 327 # Optionally, diff and output to JSON 328 if args.dump_to_json is not None: 329 with open(args.dump_to_json, "w") as f_json: 330 json.dump(diff_report, f_json, indent=1) 331 332 333class TestParser(unittest.TestCase): 334 def setUp(self): 335 self.parser = create_parser() 336 testInputs = os.path.join( 337 os.path.dirname(os.path.realpath(__file__)), "gbench", "Inputs" 338 ) 339 self.testInput0 = os.path.join(testInputs, "test1_run1.json") 340 self.testInput1 = os.path.join(testInputs, "test1_run2.json") 341 342 def test_benchmarks_basic(self): 343 parsed = self.parser.parse_args( 344 ["benchmarks", self.testInput0, self.testInput1] 345 ) 346 self.assertFalse(parsed.display_aggregates_only) 347 self.assertTrue(parsed.utest) 348 self.assertEqual(parsed.mode, "benchmarks") 349 self.assertEqual(parsed.test_baseline[0].name, self.testInput0) 350 self.assertEqual(parsed.test_contender[0].name, self.testInput1) 351 self.assertFalse(parsed.benchmark_options) 352 353 def test_benchmarks_basic_without_utest(self): 354 parsed = self.parser.parse_args( 355 ["--no-utest", "benchmarks", self.testInput0, self.testInput1] 356 ) 357 self.assertFalse(parsed.display_aggregates_only) 358 self.assertFalse(parsed.utest) 359 self.assertEqual(parsed.utest_alpha, 0.05) 360 self.assertEqual(parsed.mode, "benchmarks") 361 self.assertEqual(parsed.test_baseline[0].name, self.testInput0) 362 self.assertEqual(parsed.test_contender[0].name, self.testInput1) 363 self.assertFalse(parsed.benchmark_options) 364 365 def test_benchmarks_basic_display_aggregates_only(self): 366 parsed = self.parser.parse_args( 367 ["-a", "benchmarks", self.testInput0, self.testInput1] 368 ) 369 self.assertTrue(parsed.display_aggregates_only) 370 self.assertTrue(parsed.utest) 371 self.assertEqual(parsed.mode, "benchmarks") 372 self.assertEqual(parsed.test_baseline[0].name, self.testInput0) 373 self.assertEqual(parsed.test_contender[0].name, self.testInput1) 374 self.assertFalse(parsed.benchmark_options) 375 376 def test_benchmarks_basic_with_utest_alpha(self): 377 parsed = self.parser.parse_args( 378 ["--alpha=0.314", "benchmarks", self.testInput0, self.testInput1] 379 ) 380 self.assertFalse(parsed.display_aggregates_only) 381 self.assertTrue(parsed.utest) 382 self.assertEqual(parsed.utest_alpha, 0.314) 383 self.assertEqual(parsed.mode, "benchmarks") 384 self.assertEqual(parsed.test_baseline[0].name, self.testInput0) 385 self.assertEqual(parsed.test_contender[0].name, self.testInput1) 386 self.assertFalse(parsed.benchmark_options) 387 388 def test_benchmarks_basic_without_utest_with_utest_alpha(self): 389 parsed = self.parser.parse_args( 390 [ 391 "--no-utest", 392 "--alpha=0.314", 393 "benchmarks", 394 self.testInput0, 395 self.testInput1, 396 ] 397 ) 398 self.assertFalse(parsed.display_aggregates_only) 399 self.assertFalse(parsed.utest) 400 self.assertEqual(parsed.utest_alpha, 0.314) 401 self.assertEqual(parsed.mode, "benchmarks") 402 self.assertEqual(parsed.test_baseline[0].name, self.testInput0) 403 self.assertEqual(parsed.test_contender[0].name, self.testInput1) 404 self.assertFalse(parsed.benchmark_options) 405 406 def test_benchmarks_with_remainder(self): 407 parsed = self.parser.parse_args( 408 ["benchmarks", self.testInput0, self.testInput1, "d"] 409 ) 410 self.assertFalse(parsed.display_aggregates_only) 411 self.assertTrue(parsed.utest) 412 self.assertEqual(parsed.mode, "benchmarks") 413 self.assertEqual(parsed.test_baseline[0].name, self.testInput0) 414 self.assertEqual(parsed.test_contender[0].name, self.testInput1) 415 self.assertEqual(parsed.benchmark_options, ["d"]) 416 417 def test_benchmarks_with_remainder_after_doubleminus(self): 418 parsed = self.parser.parse_args( 419 ["benchmarks", self.testInput0, self.testInput1, "--", "e"] 420 ) 421 self.assertFalse(parsed.display_aggregates_only) 422 self.assertTrue(parsed.utest) 423 self.assertEqual(parsed.mode, "benchmarks") 424 self.assertEqual(parsed.test_baseline[0].name, self.testInput0) 425 self.assertEqual(parsed.test_contender[0].name, self.testInput1) 426 self.assertEqual(parsed.benchmark_options, ["e"]) 427 428 def test_filters_basic(self): 429 parsed = self.parser.parse_args(["filters", self.testInput0, "c", "d"]) 430 self.assertFalse(parsed.display_aggregates_only) 431 self.assertTrue(parsed.utest) 432 self.assertEqual(parsed.mode, "filters") 433 self.assertEqual(parsed.test[0].name, self.testInput0) 434 self.assertEqual(parsed.filter_baseline[0], "c") 435 self.assertEqual(parsed.filter_contender[0], "d") 436 self.assertFalse(parsed.benchmark_options) 437 438 def test_filters_with_remainder(self): 439 parsed = self.parser.parse_args( 440 ["filters", self.testInput0, "c", "d", "e"] 441 ) 442 self.assertFalse(parsed.display_aggregates_only) 443 self.assertTrue(parsed.utest) 444 self.assertEqual(parsed.mode, "filters") 445 self.assertEqual(parsed.test[0].name, self.testInput0) 446 self.assertEqual(parsed.filter_baseline[0], "c") 447 self.assertEqual(parsed.filter_contender[0], "d") 448 self.assertEqual(parsed.benchmark_options, ["e"]) 449 450 def test_filters_with_remainder_after_doubleminus(self): 451 parsed = self.parser.parse_args( 452 ["filters", self.testInput0, "c", "d", "--", "f"] 453 ) 454 self.assertFalse(parsed.display_aggregates_only) 455 self.assertTrue(parsed.utest) 456 self.assertEqual(parsed.mode, "filters") 457 self.assertEqual(parsed.test[0].name, self.testInput0) 458 self.assertEqual(parsed.filter_baseline[0], "c") 459 self.assertEqual(parsed.filter_contender[0], "d") 460 self.assertEqual(parsed.benchmark_options, ["f"]) 461 462 def test_benchmarksfiltered_basic(self): 463 parsed = self.parser.parse_args( 464 ["benchmarksfiltered", self.testInput0, "c", self.testInput1, "e"] 465 ) 466 self.assertFalse(parsed.display_aggregates_only) 467 self.assertTrue(parsed.utest) 468 self.assertEqual(parsed.mode, "benchmarksfiltered") 469 self.assertEqual(parsed.test_baseline[0].name, self.testInput0) 470 self.assertEqual(parsed.filter_baseline[0], "c") 471 self.assertEqual(parsed.test_contender[0].name, self.testInput1) 472 self.assertEqual(parsed.filter_contender[0], "e") 473 self.assertFalse(parsed.benchmark_options) 474 475 def test_benchmarksfiltered_with_remainder(self): 476 parsed = self.parser.parse_args( 477 [ 478 "benchmarksfiltered", 479 self.testInput0, 480 "c", 481 self.testInput1, 482 "e", 483 "f", 484 ] 485 ) 486 self.assertFalse(parsed.display_aggregates_only) 487 self.assertTrue(parsed.utest) 488 self.assertEqual(parsed.mode, "benchmarksfiltered") 489 self.assertEqual(parsed.test_baseline[0].name, self.testInput0) 490 self.assertEqual(parsed.filter_baseline[0], "c") 491 self.assertEqual(parsed.test_contender[0].name, self.testInput1) 492 self.assertEqual(parsed.filter_contender[0], "e") 493 self.assertEqual(parsed.benchmark_options[0], "f") 494 495 def test_benchmarksfiltered_with_remainder_after_doubleminus(self): 496 parsed = self.parser.parse_args( 497 [ 498 "benchmarksfiltered", 499 self.testInput0, 500 "c", 501 self.testInput1, 502 "e", 503 "--", 504 "g", 505 ] 506 ) 507 self.assertFalse(parsed.display_aggregates_only) 508 self.assertTrue(parsed.utest) 509 self.assertEqual(parsed.mode, "benchmarksfiltered") 510 self.assertEqual(parsed.test_baseline[0].name, self.testInput0) 511 self.assertEqual(parsed.filter_baseline[0], "c") 512 self.assertEqual(parsed.test_contender[0].name, self.testInput1) 513 self.assertEqual(parsed.filter_contender[0], "e") 514 self.assertEqual(parsed.benchmark_options[0], "g") 515 516 517if __name__ == "__main__": 518 # unittest.main() 519 main() 520 521# vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4 522# kate: tab-width: 4; replace-tabs on; indent-width 4; tab-indents: off; 523# kate: indent-mode python; remove-trailing-spaces modified; 524