#!/usr/bin/env python3

# type: ignore

"""
compare.py - versatile benchmark output compare tool
"""

import argparse
import json
import os
import sys
import unittest
from argparse import ArgumentParser

import gbench
from gbench import report, util


def check_inputs(in1, in2, flags):
    """
    Validate the user-provided inputs and diagnose any abnormalities.
    """
    in1_kind, in1_err = util.classify_input_file(in1)
    in2_kind, in2_err = util.classify_input_file(in2)
    output_file = util.find_benchmark_flag("--benchmark_out=", flags)
    output_type = util.find_benchmark_flag("--benchmark_out_format=", flags)
    if (
        in1_kind == util.IT_Executable
        and in2_kind == util.IT_Executable
        and output_file
    ):
        print(
            (
                "WARNING: '--benchmark_out=%s' will be passed to both "
                "benchmarks, causing it to be overwritten"
            )
            % output_file
        )
    if in1_kind == util.IT_JSON and in2_kind == util.IT_JSON:
        # When both sides are JSON the only supported flag is
        # --benchmark_filter=
        for flag in util.remove_benchmark_flags("--benchmark_filter=", flags):
            print(
                "WARNING: passing %s has no effect since both "
                "inputs are JSON" % flag
            )
    if output_type is not None and output_type != "json":
        print(
            (
                "ERROR: passing '--benchmark_out_format=%s' to 'compare.py'"
                " is not supported."
            )
            % output_type
        )
        sys.exit(1)


def create_parser():
    parser = ArgumentParser(
        description="versatile benchmark output compare tool"
    )

    parser.add_argument(
        "-a",
        "--display_aggregates_only",
        dest="display_aggregates_only",
        action="store_true",
        help="If there are repetitions, by default we display everything: the"
        " actual runs and the computed aggregates. Sometimes it is "
        "desirable to view only the aggregates, e.g. when there are many "
        "repetitions. Note that only the display is affected; "
        "internally, all the actual runs are still used, e.g. for the U test.",
    )

    parser.add_argument(
        "--no-color",
        dest="color",
        default=True,
        action="store_false",
        help="Do not use colors in the terminal output",
    )

    parser.add_argument(
        "-d",
        "--dump_to_json",
        dest="dump_to_json",
        help="Additionally, dump the benchmark comparison output to this file in JSON format.",
    )

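    # Options controlling the Mann-Whitney U test performed on the results.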
    utest = parser.add_argument_group()
    utest.add_argument(
        "--no-utest",
        dest="utest",
        default=True,
        action="store_false",
        help=(
            "The tool can run a two-tailed Mann-Whitney U test with the "
            "null hypothesis that it is equally likely that a randomly "
            "selected value from one sample will be less than or greater "
            "than a randomly selected value from a second sample.\n"
            "WARNING: requires a **LARGE** number of repetitions (no fewer "
            "than {}) to be meaningful!\n"
            "The test is performed by default if at least {} repetitions "
            "were done.\n"
            "This option disables the U test."
        ).format(
            report.UTEST_OPTIMAL_REPETITIONS, report.UTEST_MIN_REPETITIONS
        ),
    )
    alpha_default = 0.05
    utest.add_argument(
        "--alpha",
        dest="utest_alpha",
        default=alpha_default,
        type=float,
        help=(
            "Significance level alpha. If the calculated p-value is below "
            "this value, the result is said to be statistically significant "
            "and the null hypothesis is rejected.\n(default: %0.4f)"
        )
        % alpha_default,
    )

    subparsers = parser.add_subparsers(
        help="This tool has multiple modes of operation:", dest="mode"
    )

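    # Mode 1 ('benchmarks'): compare the full output of two benchmark runs,
    # each given as an executable or a JSON file.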
    parser_a = subparsers.add_parser(
        "benchmarks",
        help="The simplest use case: compare the full output of two benchmarks",
    )
    baseline = parser_a.add_argument_group("baseline", "The benchmark baseline")
    baseline.add_argument(
        "test_baseline",
        metavar="test_baseline",
        type=argparse.FileType("r"),
        nargs=1,
        help="A benchmark executable or JSON output file",
    )
    contender = parser_a.add_argument_group(
        "contender", "The benchmark that will be compared against the baseline"
    )
    contender.add_argument(
        "test_contender",
        metavar="test_contender",
        type=argparse.FileType("r"),
        nargs=1,
        help="A benchmark executable or JSON output file",
    )
    parser_a.add_argument(
        "benchmark_options",
        metavar="benchmark_options",
        nargs=argparse.REMAINDER,
        help="Arguments to pass when running benchmark executables",
    )

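    # Mode 2 ('filters'): compare two filters applied to a single benchmark.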
    parser_b = subparsers.add_parser(
        "filters",
        help="Compare one filter with another filter of the same benchmark",
    )
    baseline = parser_b.add_argument_group("baseline", "The benchmark baseline")
    baseline.add_argument(
        "test",
        metavar="test",
        type=argparse.FileType("r"),
        nargs=1,
        help="A benchmark executable or JSON output file",
    )
    baseline.add_argument(
        "filter_baseline",
        metavar="filter_baseline",
        type=str,
        nargs=1,
        help="The first filter, which will be used as the baseline",
    )
    contender = parser_b.add_argument_group(
        "contender", "The benchmark that will be compared against the baseline"
    )
    contender.add_argument(
        "filter_contender",
        metavar="filter_contender",
        type=str,
        nargs=1,
        help="The second filter, which will be compared against the baseline",
    )
    parser_b.add_argument(
        "benchmark_options",
        metavar="benchmark_options",
        nargs=argparse.REMAINDER,
        help="Arguments to pass when running benchmark executables",
    )

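    # Mode 3 ('benchmarksfiltered'): compare a filter of one benchmark run
    # against a (possibly different) filter of another run.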
    parser_c = subparsers.add_parser(
        "benchmarksfiltered",
        help="Compare one filter of the first benchmark with another filter of the second benchmark",
    )
    baseline = parser_c.add_argument_group("baseline", "The benchmark baseline")
    baseline.add_argument(
        "test_baseline",
        metavar="test_baseline",
        type=argparse.FileType("r"),
        nargs=1,
        help="A benchmark executable or JSON output file",
    )
    baseline.add_argument(
        "filter_baseline",
        metavar="filter_baseline",
        type=str,
        nargs=1,
        help="The first filter, which will be used as the baseline",
    )
    contender = parser_c.add_argument_group(
        "contender", "The benchmark that will be compared against the baseline"
    )
    contender.add_argument(
        "test_contender",
        metavar="test_contender",
        type=argparse.FileType("r"),
        nargs=1,
        help="The second benchmark executable or JSON output file, which will be compared against the baseline",
    )
    contender.add_argument(
        "filter_contender",
        metavar="filter_contender",
        type=str,
        nargs=1,
        help="The second filter, which will be compared against the baseline",
    )
    parser_c.add_argument(
        "benchmark_options",
        metavar="benchmark_options",
        nargs=argparse.REMAINDER,
        help="Arguments to pass when running benchmark executables",
    )

    return parser


def main():
    # Parse the command-line flags.
    parser = create_parser()
    args, unknown_args = parser.parse_known_args()
    if args.mode is None:
        parser.print_help()
        sys.exit(1)
    assert not unknown_args
    benchmark_options = args.benchmark_options

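    # Resolve the baseline/contender inputs and filters for the selected mode.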
    if args.mode == "benchmarks":
        test_baseline = args.test_baseline[0].name
        test_contender = args.test_contender[0].name
        filter_baseline = ""
        filter_contender = ""

        # NOTE: if test_baseline == test_contender, you are analyzing the stdev

        description = "Comparing %s to %s" % (test_baseline, test_contender)
    elif args.mode == "filters":
        test_baseline = args.test[0].name
        test_contender = args.test[0].name
        filter_baseline = args.filter_baseline[0]
        filter_contender = args.filter_contender[0]

        # NOTE: if filter_baseline == filter_contender, you are analyzing the
        # stdev

        description = "Comparing %s to %s (from %s)" % (
            filter_baseline,
            filter_contender,
            args.test[0].name,
        )
    elif args.mode == "benchmarksfiltered":
        test_baseline = args.test_baseline[0].name
        test_contender = args.test_contender[0].name
        filter_baseline = args.filter_baseline[0]
        filter_contender = args.filter_contender[0]

        # NOTE: if test_baseline == test_contender and
        # filter_baseline == filter_contender, you are analyzing the stdev

        description = "Comparing %s (from %s) to %s (from %s)" % (
            filter_baseline,
            test_baseline,
            filter_contender,
            test_contender,
        )
    else:
        # should never happen
        print("Unrecognized mode of operation: '%s'" % args.mode)
        parser.print_help()
        sys.exit(1)

    check_inputs(test_baseline, test_contender, benchmark_options)

    if args.display_aggregates_only:
        benchmark_options += ["--benchmark_display_aggregates_only=true"]

    options_baseline = []
    options_contender = []

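    # In the filter-based modes, restrict each run to its own filter via
    # --benchmark_filter.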
    if filter_baseline and filter_contender:
        options_baseline = ["--benchmark_filter=%s" % filter_baseline]
        options_contender = ["--benchmark_filter=%s" % filter_contender]

    # Run the benchmarks and report the results
    json1 = json1_orig = gbench.util.sort_benchmark_results(
        gbench.util.run_or_load_benchmark(
            test_baseline, benchmark_options + options_baseline
        )
    )
    json2 = json2_orig = gbench.util.sort_benchmark_results(
        gbench.util.run_or_load_benchmark(
            test_contender, benchmark_options + options_contender
        )
    )

    # Now, filter the benchmarks so that the difference report can work
    if filter_baseline and filter_contender:
        replacement = "[%s vs. %s]" % (filter_baseline, filter_contender)
        json1 = gbench.report.filter_benchmark(
            json1_orig, filter_baseline, replacement
        )
        json2 = gbench.report.filter_benchmark(
            json2_orig, filter_contender, replacement
        )

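    # Compute the per-benchmark differences (including the U test, if
    # enabled) and render them for the terminal.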
    diff_report = gbench.report.get_difference_report(json1, json2, args.utest)
    output_lines = gbench.report.print_difference_report(
        diff_report,
        args.display_aggregates_only,
        args.utest,
        args.utest_alpha,
        args.color,
    )
    print(description)
    for ln in output_lines:
        print(ln)

    # Optionally, diff and output to JSON
    if args.dump_to_json is not None:
        with open(args.dump_to_json, "w") as f_json:
            json.dump(diff_report, f_json, indent=1)


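# Unit tests for the argument parser. They are not run by default; see the
# __main__ block at the bottom of this file.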
class TestParser(unittest.TestCase):
    def setUp(self):
        self.parser = create_parser()
        testInputs = os.path.join(
            os.path.dirname(os.path.realpath(__file__)), "gbench", "Inputs"
        )
        self.testInput0 = os.path.join(testInputs, "test1_run1.json")
        self.testInput1 = os.path.join(testInputs, "test1_run2.json")

    def test_benchmarks_basic(self):
        parsed = self.parser.parse_args(
            ["benchmarks", self.testInput0, self.testInput1]
        )
        self.assertFalse(parsed.display_aggregates_only)
        self.assertTrue(parsed.utest)
        self.assertEqual(parsed.mode, "benchmarks")
        self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
        self.assertEqual(parsed.test_contender[0].name, self.testInput1)
        self.assertFalse(parsed.benchmark_options)

    def test_benchmarks_basic_without_utest(self):
        parsed = self.parser.parse_args(
            ["--no-utest", "benchmarks", self.testInput0, self.testInput1]
        )
        self.assertFalse(parsed.display_aggregates_only)
        self.assertFalse(parsed.utest)
        self.assertEqual(parsed.utest_alpha, 0.05)
        self.assertEqual(parsed.mode, "benchmarks")
        self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
        self.assertEqual(parsed.test_contender[0].name, self.testInput1)
        self.assertFalse(parsed.benchmark_options)

    def test_benchmarks_basic_display_aggregates_only(self):
        parsed = self.parser.parse_args(
            ["-a", "benchmarks", self.testInput0, self.testInput1]
        )
        self.assertTrue(parsed.display_aggregates_only)
        self.assertTrue(parsed.utest)
        self.assertEqual(parsed.mode, "benchmarks")
        self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
        self.assertEqual(parsed.test_contender[0].name, self.testInput1)
        self.assertFalse(parsed.benchmark_options)

    def test_benchmarks_basic_with_utest_alpha(self):
        parsed = self.parser.parse_args(
            ["--alpha=0.314", "benchmarks", self.testInput0, self.testInput1]
        )
        self.assertFalse(parsed.display_aggregates_only)
        self.assertTrue(parsed.utest)
        self.assertEqual(parsed.utest_alpha, 0.314)
        self.assertEqual(parsed.mode, "benchmarks")
        self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
        self.assertEqual(parsed.test_contender[0].name, self.testInput1)
        self.assertFalse(parsed.benchmark_options)

    def test_benchmarks_basic_without_utest_with_utest_alpha(self):
        parsed = self.parser.parse_args(
            [
                "--no-utest",
                "--alpha=0.314",
                "benchmarks",
                self.testInput0,
                self.testInput1,
            ]
        )
        self.assertFalse(parsed.display_aggregates_only)
        self.assertFalse(parsed.utest)
        self.assertEqual(parsed.utest_alpha, 0.314)
        self.assertEqual(parsed.mode, "benchmarks")
        self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
        self.assertEqual(parsed.test_contender[0].name, self.testInput1)
        self.assertFalse(parsed.benchmark_options)

    def test_benchmarks_with_remainder(self):
        parsed = self.parser.parse_args(
            ["benchmarks", self.testInput0, self.testInput1, "d"]
        )
        self.assertFalse(parsed.display_aggregates_only)
        self.assertTrue(parsed.utest)
        self.assertEqual(parsed.mode, "benchmarks")
        self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
        self.assertEqual(parsed.test_contender[0].name, self.testInput1)
        self.assertEqual(parsed.benchmark_options, ["d"])

    def test_benchmarks_with_remainder_after_doubleminus(self):
        parsed = self.parser.parse_args(
            ["benchmarks", self.testInput0, self.testInput1, "--", "e"]
        )
        self.assertFalse(parsed.display_aggregates_only)
        self.assertTrue(parsed.utest)
        self.assertEqual(parsed.mode, "benchmarks")
        self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
        self.assertEqual(parsed.test_contender[0].name, self.testInput1)
        self.assertEqual(parsed.benchmark_options, ["e"])

    def test_filters_basic(self):
        parsed = self.parser.parse_args(["filters", self.testInput0, "c", "d"])
        self.assertFalse(parsed.display_aggregates_only)
        self.assertTrue(parsed.utest)
        self.assertEqual(parsed.mode, "filters")
        self.assertEqual(parsed.test[0].name, self.testInput0)
        self.assertEqual(parsed.filter_baseline[0], "c")
        self.assertEqual(parsed.filter_contender[0], "d")
        self.assertFalse(parsed.benchmark_options)

    def test_filters_with_remainder(self):
        parsed = self.parser.parse_args(
            ["filters", self.testInput0, "c", "d", "e"]
        )
        self.assertFalse(parsed.display_aggregates_only)
        self.assertTrue(parsed.utest)
        self.assertEqual(parsed.mode, "filters")
        self.assertEqual(parsed.test[0].name, self.testInput0)
        self.assertEqual(parsed.filter_baseline[0], "c")
        self.assertEqual(parsed.filter_contender[0], "d")
        self.assertEqual(parsed.benchmark_options, ["e"])

    def test_filters_with_remainder_after_doubleminus(self):
        parsed = self.parser.parse_args(
            ["filters", self.testInput0, "c", "d", "--", "f"]
        )
        self.assertFalse(parsed.display_aggregates_only)
        self.assertTrue(parsed.utest)
        self.assertEqual(parsed.mode, "filters")
        self.assertEqual(parsed.test[0].name, self.testInput0)
        self.assertEqual(parsed.filter_baseline[0], "c")
        self.assertEqual(parsed.filter_contender[0], "d")
        self.assertEqual(parsed.benchmark_options, ["f"])

    def test_benchmarksfiltered_basic(self):
        parsed = self.parser.parse_args(
            ["benchmarksfiltered", self.testInput0, "c", self.testInput1, "e"]
        )
        self.assertFalse(parsed.display_aggregates_only)
        self.assertTrue(parsed.utest)
        self.assertEqual(parsed.mode, "benchmarksfiltered")
        self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
        self.assertEqual(parsed.filter_baseline[0], "c")
        self.assertEqual(parsed.test_contender[0].name, self.testInput1)
        self.assertEqual(parsed.filter_contender[0], "e")
        self.assertFalse(parsed.benchmark_options)

    def test_benchmarksfiltered_with_remainder(self):
        parsed = self.parser.parse_args(
            [
                "benchmarksfiltered",
                self.testInput0,
                "c",
                self.testInput1,
                "e",
                "f",
            ]
        )
        self.assertFalse(parsed.display_aggregates_only)
        self.assertTrue(parsed.utest)
        self.assertEqual(parsed.mode, "benchmarksfiltered")
        self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
        self.assertEqual(parsed.filter_baseline[0], "c")
        self.assertEqual(parsed.test_contender[0].name, self.testInput1)
        self.assertEqual(parsed.filter_contender[0], "e")
        self.assertEqual(parsed.benchmark_options[0], "f")

    def test_benchmarksfiltered_with_remainder_after_doubleminus(self):
        parsed = self.parser.parse_args(
            [
                "benchmarksfiltered",
                self.testInput0,
                "c",
                self.testInput1,
                "e",
                "--",
                "g",
            ]
        )
        self.assertFalse(parsed.display_aggregates_only)
        self.assertTrue(parsed.utest)
        self.assertEqual(parsed.mode, "benchmarksfiltered")
        self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
        self.assertEqual(parsed.filter_baseline[0], "c")
        self.assertEqual(parsed.test_contender[0].name, self.testInput1)
        self.assertEqual(parsed.filter_contender[0], "e")
        self.assertEqual(parsed.benchmark_options[0], "g")


if __name__ == "__main__":
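    # To run the TestParser unit tests instead of the tool, swap the two
    # lines below (call unittest.main() and comment out main()).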
    # unittest.main()
    main()

# vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4
# kate: tab-width: 4; replace-tabs on; indent-width 4; tab-indents: off;
# kate: indent-mode python; remove-trailing-spaces modified;