#!/usr/bin/env python3

"""
compare.py - versatile benchmark output compare tool
"""

import argparse
from argparse import ArgumentParser
import json
import sys
import os
import unittest

import gbench
from gbench import util, report
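
# Typical invocations (the subcommands and argument order follow create_parser()
# below; the <...> placeholders are illustrative):
#
#   compare.py [-a] [--no-utest] [--alpha=0.05] benchmarks <test_baseline> <test_contender> [benchmark_options...]
#   compare.py filters <test> <filter_baseline> <filter_contender> [benchmark_options...]
#   compare.py benchmarksfiltered <test_baseline> <filter_baseline> <test_contender> <filter_contender> [benchmark_options...]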


def check_inputs(in1, in2, flags):
    """
    Perform checks on the user-provided inputs and diagnose any abnormalities.
    """
    in1_kind, in1_err = util.classify_input_file(in1)
    in2_kind, in2_err = util.classify_input_file(in2)
    output_file = util.find_benchmark_flag('--benchmark_out=', flags)
    output_type = util.find_benchmark_flag('--benchmark_out_format=', flags)
    if in1_kind == util.IT_Executable and in2_kind == util.IT_Executable and output_file:
        print(("WARNING: '--benchmark_out=%s' will be passed to both "
               "benchmarks, causing it to be overwritten") % output_file)
    if in1_kind == util.IT_JSON and in2_kind == util.IT_JSON:
        # When both sides are JSON the only supported flag is
        # --benchmark_filter=
        for flag in util.remove_benchmark_flags('--benchmark_filter=', flags):
            print("WARNING: passing %s has no effect since both "
                  "inputs are JSON" % flag)
    if output_type is not None and output_type != 'json':
        print(("ERROR: passing '--benchmark_out_format=%s' to 'compare.py'"
               " is not supported.") % output_type)
        sys.exit(1)


def create_parser():
    parser = ArgumentParser(
        description='versatile benchmark output compare tool')

    parser.add_argument(
        '-a',
        '--display_aggregates_only',
        dest='display_aggregates_only',
        action="store_true",
        help="If there are repetitions, by default we display everything: the"
             " actual runs and the computed aggregates. Sometimes it is "
             "desirable to only view the aggregates, e.g. when there are a lot "
             "of repetitions. Note that only the display is affected; "
             "internally, all the actual runs are still used, e.g. for the U test.")

    parser.add_argument(
        '--no-color',
        dest='color',
        default=True,
        action="store_false",
        help="Do not use colors in the terminal output"
    )

    parser.add_argument(
        '-d',
        '--dump_to_json',
        dest='dump_to_json',
        help="Additionally, dump the benchmark comparison output to this file in JSON format.")

    utest = parser.add_argument_group()
    utest.add_argument(
        '--no-utest',
        dest='utest',
        default=True,
        action="store_false",
        help="The tool can do a two-tailed Mann-Whitney U test with the null "
             "hypothesis that it is equally likely that a randomly selected "
             "value from one sample will be less than or greater than a "
             "randomly selected value from a second sample.\n"
             "WARNING: requires a **LARGE** (no less than {}) number of "
             "repetitions to be meaningful!\n"
             "The test is done by default if at least {} repetitions were run.\n"
             "This option disables the U test.".format(
                 report.UTEST_OPTIMAL_REPETITIONS, report.UTEST_MIN_REPETITIONS))
    alpha_default = 0.05
    utest.add_argument(
        "--alpha",
        dest='utest_alpha',
        default=alpha_default,
        type=float,
        help=("Significance level alpha. If the calculated p-value is below "
              "this value, the result is said to be statistically significant "
              "and the null hypothesis is rejected.\n(default: %0.4f)") %
        alpha_default)

    subparsers = parser.add_subparsers(
        help='This tool has multiple modes of operation:',
        dest='mode')

    parser_a = subparsers.add_parser(
        'benchmarks',
        help='The simplest use case: compare all the output of the two benchmarks')
    baseline = parser_a.add_argument_group(
        'baseline', 'The benchmark baseline')
    baseline.add_argument(
        'test_baseline',
        metavar='test_baseline',
        type=argparse.FileType('r'),
        nargs=1,
        help='A benchmark executable or JSON output file')
    contender = parser_a.add_argument_group(
        'contender', 'The benchmark that will be compared against the baseline')
    contender.add_argument(
        'test_contender',
        metavar='test_contender',
        type=argparse.FileType('r'),
        nargs=1,
        help='A benchmark executable or JSON output file')
    parser_a.add_argument(
        'benchmark_options',
        metavar='benchmark_options',
        nargs=argparse.REMAINDER,
        help='Arguments to pass when running benchmark executables')

    parser_b = subparsers.add_parser(
        'filters',
        help='Compare one filter of a benchmark against another filter of the same benchmark')
    baseline = parser_b.add_argument_group(
        'baseline', 'The benchmark baseline')
    baseline.add_argument(
        'test',
        metavar='test',
        type=argparse.FileType('r'),
        nargs=1,
        help='A benchmark executable or JSON output file')
    baseline.add_argument(
        'filter_baseline',
        metavar='filter_baseline',
        type=str,
        nargs=1,
        help='The first filter, which will be used as the baseline')
    contender = parser_b.add_argument_group(
        'contender', 'The benchmark that will be compared against the baseline')
    contender.add_argument(
        'filter_contender',
        metavar='filter_contender',
        type=str,
        nargs=1,
        help='The second filter, which will be compared against the baseline')
    parser_b.add_argument(
        'benchmark_options',
        metavar='benchmark_options',
        nargs=argparse.REMAINDER,
        help='Arguments to pass when running benchmark executables')

    parser_c = subparsers.add_parser(
        'benchmarksfiltered',
        help='Compare one filter of the first benchmark against another filter of the second benchmark')
    baseline = parser_c.add_argument_group(
        'baseline', 'The benchmark baseline')
    baseline.add_argument(
        'test_baseline',
        metavar='test_baseline',
        type=argparse.FileType('r'),
        nargs=1,
        help='A benchmark executable or JSON output file')
    baseline.add_argument(
        'filter_baseline',
        metavar='filter_baseline',
        type=str,
        nargs=1,
        help='The first filter, which will be used as the baseline')
    contender = parser_c.add_argument_group(
        'contender', 'The benchmark that will be compared against the baseline')
    contender.add_argument(
        'test_contender',
        metavar='test_contender',
        type=argparse.FileType('r'),
        nargs=1,
        help='The second benchmark executable or JSON output file, which will be compared against the baseline')
    contender.add_argument(
        'filter_contender',
        metavar='filter_contender',
        type=str,
        nargs=1,
        help='The second filter, which will be compared against the baseline')
    parser_c.add_argument(
        'benchmark_options',
        metavar='benchmark_options',
        nargs=argparse.REMAINDER,
        help='Arguments to pass when running benchmark executables')

    return parser


def main():
    # Parse the command line flags
    parser = create_parser()
    args, unknown_args = parser.parse_known_args()
    if args.mode is None:
        parser.print_help()
        sys.exit(1)
    assert not unknown_args
    benchmark_options = args.benchmark_options

    if args.mode == 'benchmarks':
        test_baseline = args.test_baseline[0].name
        test_contender = args.test_contender[0].name
        filter_baseline = ''
        filter_contender = ''

        # NOTE: if test_baseline == test_contender, you are analyzing the stdev

        description = 'Comparing %s to %s' % (test_baseline, test_contender)
    elif args.mode == 'filters':
        test_baseline = args.test[0].name
        test_contender = args.test[0].name
        filter_baseline = args.filter_baseline[0]
        filter_contender = args.filter_contender[0]

        # NOTE: if filter_baseline == filter_contender, you are analyzing the
        # stdev

        description = 'Comparing %s to %s (from %s)' % (
            filter_baseline, filter_contender, args.test[0].name)
    elif args.mode == 'benchmarksfiltered':
        test_baseline = args.test_baseline[0].name
        test_contender = args.test_contender[0].name
        filter_baseline = args.filter_baseline[0]
        filter_contender = args.filter_contender[0]

        # NOTE: if test_baseline == test_contender and
        # filter_baseline == filter_contender, you are analyzing the stdev

        description = 'Comparing %s (from %s) to %s (from %s)' % (
            filter_baseline, test_baseline, filter_contender, test_contender)
    else:
        # should never happen
        print("Unrecognized mode of operation: '%s'" % args.mode)
        parser.print_help()
        sys.exit(1)

    check_inputs(test_baseline, test_contender, benchmark_options)

    if args.display_aggregates_only:
        benchmark_options += ['--benchmark_display_aggregates_only=true']

    options_baseline = []
    options_contender = []

    if filter_baseline and filter_contender:
        options_baseline = ['--benchmark_filter=%s' % filter_baseline]
        options_contender = ['--benchmark_filter=%s' % filter_contender]

    # Run the benchmarks and report the results
    json1 = json1_orig = gbench.util.sort_benchmark_results(gbench.util.run_or_load_benchmark(
        test_baseline, benchmark_options + options_baseline))
    json2 = json2_orig = gbench.util.sort_benchmark_results(gbench.util.run_or_load_benchmark(
        test_contender, benchmark_options + options_contender))

    # Now, filter the benchmarks so that the difference report can work
    if filter_baseline and filter_contender:
        replacement = '[%s vs. %s]' % (filter_baseline, filter_contender)
        json1 = gbench.report.filter_benchmark(
            json1_orig, filter_baseline, replacement)
        json2 = gbench.report.filter_benchmark(
            json2_orig, filter_contender, replacement)

    diff_report = gbench.report.get_difference_report(
        json1, json2, args.utest)
    output_lines = gbench.report.print_difference_report(
        diff_report,
        args.display_aggregates_only,
        args.utest, args.utest_alpha, args.color)
    print(description)
    for ln in output_lines:
        print(ln)

    # Optionally, diff and output to JSON
    if args.dump_to_json is not None:
        with open(args.dump_to_json, 'w') as f_json:
            json.dump(diff_report, f_json)

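
# The TestParser cases below exercise create_parser(). They can be run with,
# e.g., `python3 -m unittest compare` from this file's directory (assumed
# invocation; by default the script runs main() instead, see __main__ below).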
class TestParser(unittest.TestCase):
    def setUp(self):
        self.parser = create_parser()
        testInputs = os.path.join(
            os.path.dirname(
                os.path.realpath(__file__)),
            'gbench',
            'Inputs')
        self.testInput0 = os.path.join(testInputs, 'test1_run1.json')
        self.testInput1 = os.path.join(testInputs, 'test1_run2.json')

    def test_benchmarks_basic(self):
        parsed = self.parser.parse_args(
            ['benchmarks', self.testInput0, self.testInput1])
        self.assertFalse(parsed.display_aggregates_only)
        self.assertTrue(parsed.utest)
        self.assertEqual(parsed.mode, 'benchmarks')
        self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
        self.assertEqual(parsed.test_contender[0].name, self.testInput1)
        self.assertFalse(parsed.benchmark_options)

    def test_benchmarks_basic_without_utest(self):
        parsed = self.parser.parse_args(
            ['--no-utest', 'benchmarks', self.testInput0, self.testInput1])
        self.assertFalse(parsed.display_aggregates_only)
        self.assertFalse(parsed.utest)
        self.assertEqual(parsed.utest_alpha, 0.05)
        self.assertEqual(parsed.mode, 'benchmarks')
        self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
        self.assertEqual(parsed.test_contender[0].name, self.testInput1)
        self.assertFalse(parsed.benchmark_options)

    def test_benchmarks_basic_display_aggregates_only(self):
        parsed = self.parser.parse_args(
            ['-a', 'benchmarks', self.testInput0, self.testInput1])
        self.assertTrue(parsed.display_aggregates_only)
        self.assertTrue(parsed.utest)
        self.assertEqual(parsed.mode, 'benchmarks')
        self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
        self.assertEqual(parsed.test_contender[0].name, self.testInput1)
        self.assertFalse(parsed.benchmark_options)

    def test_benchmarks_basic_with_utest_alpha(self):
        parsed = self.parser.parse_args(
            ['--alpha=0.314', 'benchmarks', self.testInput0, self.testInput1])
        self.assertFalse(parsed.display_aggregates_only)
        self.assertTrue(parsed.utest)
        self.assertEqual(parsed.utest_alpha, 0.314)
        self.assertEqual(parsed.mode, 'benchmarks')
        self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
        self.assertEqual(parsed.test_contender[0].name, self.testInput1)
        self.assertFalse(parsed.benchmark_options)

    def test_benchmarks_basic_without_utest_with_utest_alpha(self):
        parsed = self.parser.parse_args(
            ['--no-utest', '--alpha=0.314', 'benchmarks', self.testInput0, self.testInput1])
        self.assertFalse(parsed.display_aggregates_only)
        self.assertFalse(parsed.utest)
        self.assertEqual(parsed.utest_alpha, 0.314)
        self.assertEqual(parsed.mode, 'benchmarks')
        self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
        self.assertEqual(parsed.test_contender[0].name, self.testInput1)
        self.assertFalse(parsed.benchmark_options)

    def test_benchmarks_with_remainder(self):
        parsed = self.parser.parse_args(
            ['benchmarks', self.testInput0, self.testInput1, 'd'])
        self.assertFalse(parsed.display_aggregates_only)
        self.assertTrue(parsed.utest)
        self.assertEqual(parsed.mode, 'benchmarks')
        self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
        self.assertEqual(parsed.test_contender[0].name, self.testInput1)
        self.assertEqual(parsed.benchmark_options, ['d'])

    def test_benchmarks_with_remainder_after_doubleminus(self):
        parsed = self.parser.parse_args(
            ['benchmarks', self.testInput0, self.testInput1, '--', 'e'])
        self.assertFalse(parsed.display_aggregates_only)
        self.assertTrue(parsed.utest)
        self.assertEqual(parsed.mode, 'benchmarks')
        self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
        self.assertEqual(parsed.test_contender[0].name, self.testInput1)
        self.assertEqual(parsed.benchmark_options, ['e'])

    def test_filters_basic(self):
        parsed = self.parser.parse_args(
            ['filters', self.testInput0, 'c', 'd'])
        self.assertFalse(parsed.display_aggregates_only)
        self.assertTrue(parsed.utest)
        self.assertEqual(parsed.mode, 'filters')
        self.assertEqual(parsed.test[0].name, self.testInput0)
        self.assertEqual(parsed.filter_baseline[0], 'c')
        self.assertEqual(parsed.filter_contender[0], 'd')
        self.assertFalse(parsed.benchmark_options)

    def test_filters_with_remainder(self):
        parsed = self.parser.parse_args(
            ['filters', self.testInput0, 'c', 'd', 'e'])
        self.assertFalse(parsed.display_aggregates_only)
        self.assertTrue(parsed.utest)
        self.assertEqual(parsed.mode, 'filters')
        self.assertEqual(parsed.test[0].name, self.testInput0)
        self.assertEqual(parsed.filter_baseline[0], 'c')
        self.assertEqual(parsed.filter_contender[0], 'd')
        self.assertEqual(parsed.benchmark_options, ['e'])

    def test_filters_with_remainder_after_doubleminus(self):
        parsed = self.parser.parse_args(
            ['filters', self.testInput0, 'c', 'd', '--', 'f'])
        self.assertFalse(parsed.display_aggregates_only)
        self.assertTrue(parsed.utest)
        self.assertEqual(parsed.mode, 'filters')
        self.assertEqual(parsed.test[0].name, self.testInput0)
        self.assertEqual(parsed.filter_baseline[0], 'c')
        self.assertEqual(parsed.filter_contender[0], 'd')
        self.assertEqual(parsed.benchmark_options, ['f'])

    def test_benchmarksfiltered_basic(self):
        parsed = self.parser.parse_args(
            ['benchmarksfiltered', self.testInput0, 'c', self.testInput1, 'e'])
        self.assertFalse(parsed.display_aggregates_only)
        self.assertTrue(parsed.utest)
        self.assertEqual(parsed.mode, 'benchmarksfiltered')
        self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
        self.assertEqual(parsed.filter_baseline[0], 'c')
        self.assertEqual(parsed.test_contender[0].name, self.testInput1)
        self.assertEqual(parsed.filter_contender[0], 'e')
        self.assertFalse(parsed.benchmark_options)

    def test_benchmarksfiltered_with_remainder(self):
        parsed = self.parser.parse_args(
            ['benchmarksfiltered', self.testInput0, 'c', self.testInput1, 'e', 'f'])
        self.assertFalse(parsed.display_aggregates_only)
        self.assertTrue(parsed.utest)
        self.assertEqual(parsed.mode, 'benchmarksfiltered')
        self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
        self.assertEqual(parsed.filter_baseline[0], 'c')
        self.assertEqual(parsed.test_contender[0].name, self.testInput1)
        self.assertEqual(parsed.filter_contender[0], 'e')
        self.assertEqual(parsed.benchmark_options[0], 'f')

    def test_benchmarksfiltered_with_remainder_after_doubleminus(self):
        parsed = self.parser.parse_args(
            ['benchmarksfiltered', self.testInput0, 'c', self.testInput1, 'e', '--', 'g'])
        self.assertFalse(parsed.display_aggregates_only)
        self.assertTrue(parsed.utest)
        self.assertEqual(parsed.mode, 'benchmarksfiltered')
        self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
        self.assertEqual(parsed.filter_baseline[0], 'c')
        self.assertEqual(parsed.test_contender[0].name, self.testInput1)
        self.assertEqual(parsed.filter_contender[0], 'e')
        self.assertEqual(parsed.benchmark_options[0], 'g')


if __name__ == '__main__':
    # unittest.main()
    main()

# vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4
# kate: tab-width: 4; replace-tabs on; indent-width 4; tab-indents: off;
# kate: indent-mode python; remove-trailing-spaces modified;