import csv
import os
import re
import sys

# This script takes the logs produced by the benchmark scripts (e.g.,
# torchbench.py) and parses it into a CSV file that summarizes what
# is failing and why. It is kept separate from the benchmark script
# emitting a more structured output as it is often more convenient
# to iterate quickly on log files offline instead of having to make
# a change to the benchmark script and then do a full sweep to see
# the updates.
#
# This script is not very well written, feel free to rewrite it as necessary

# Explicit argument validation instead of `assert`: asserts are stripped
# under `python -O`, which would let the script run with a missing argument.
if len(sys.argv) != 2:
    sys.exit(f"usage: {sys.argv[0]} LOGFILE")

# Read the whole log up front; the context manager closes the file handle
# deterministically (the original left it open).
with open(sys.argv[1]) as log_file:
    full_log = log_file.read()

# If the log contains a gist URL, extract it so we can include it in the CSV
gist_url = ""
m = re.search(r"https://gist.github.com/[a-f0-9]+", full_log)
if m is not None:
    gist_url = m.group(0)

# Split the log into an entry per benchmark.  The split regex has two
# capture groups (successful-run name / failed-to-load name), so each
# entry occupies three slots in the resulting list; [1:] drops the
# preamble before the first benchmark.
entries = re.split(
    r"(?:cuda (?:train|eval) +([^ ]+)|WARNING:root:([^ ]+) failed to load)",
    full_log,
)[1:]

# Entries schema example:
# `['hf_Bert', None, ' # PASS\nTIMING: entire_frame_compile:1.80925 backend_compile:6e-05\nDynamo produced 1 graph(s) covering 367 ops\n']`


def chunker(seq, size):
    """Yield successive `size`-length slices of `seq` (last may be shorter)."""
    return (seq[pos : pos + size] for pos in range(0, len(seq), size))


c = 0  # number of entries we failed to classify (reported on stderr at exit)
i = 0  # number of benchmark rows emitted

out = csv.DictWriter(
    sys.stdout,
    [
        "bench",
        "name",
        "result",
        "component",
        "context",
        "explain",
        "frame_time",
        "backend_time",
        "graph_count",
        "op_count",
        "graph_breaks",
        "unique_graph_breaks",
    ],
    dialect="excel",
)
out.writeheader()
# First data row carries only the gist URL so it appears at the top of the CSV.
out.writerow({"explain": gist_url})

# Sometimes backtraces will be in third party code, which results
# in very long file names. Delete the absolute path in this case.
def normalize_file(f):
    """Shorten a traceback file path for the `component` CSV column.

    Backtraces in third-party code produce very long absolute paths;
    report those relative to ``site-packages`` instead.  Anything else
    is reported relative to the current directory.
    """
    if "site-packages/" in f:
        # maxsplit=1: keep everything after the *first* marker.  The
        # original used maxsplit=2 and [1], which silently dropped the
        # tail of the path when "site-packages/" appeared twice.
        return f.split("site-packages/", 1)[1]
    else:
        return os.path.relpath(f)


# Assume we run torchbench, huggingface, timm_models in that order
# (as output doesn't say which suite the benchmark is part of)
# TODO: make this more robust
bench = "torchbench"

# 3 = 1 + number of matches in the entries split regex
for name, name2, log in chunker(entries, 3):
    if name is None:
        # The entry came from the "failed to load" branch of the split regex.
        name = name2
    # The first model of each suite marks the suite boundary (see TODO above).
    if name.startswith("Albert"):
        bench = "huggingface"
    elif name.startswith("adv_inc"):
        bench = "timm_models"

    # Payload that will go into the csv
    r = "UNKNOWN"
    explain = ""
    component = ""
    context = ""

    # NOTE: these checks deliberately run in order so that a later, more
    # specific result overrides an earlier one (a log containing both
    # "PASS" and "TIMEOUT" is reported as a timeout).
    if "PASS" in log:
        r = "PASS"
    if "TIMEOUT" in log:
        r = "FAIL TIMEOUT"
    if "Accuracy failed" in log:
        r = "FAIL ACCURACY"

    # Attempt to extract out useful information from the traceback:
    # keep only the innermost traceback, then pull file/line/context and
    # the exception message out of its last frame.
    log = log.split(
        "The above exception was the direct cause of the following exception"
    )[0]
    split = log.split("Traceback (most recent call last)", maxsplit=1)
    if len(split) == 2:
        log = split[1]
    log = log.split("Original traceback:")[0]
    m = re.search(
        r'File "([^"]+)", line ([0-9]+), in .+\n +(.+)\n([A-Za-z]+(?:Error|Exception|NotImplementedError): ?.*)',
        log,
    )
    if m is not None:
        r = "FAIL"
        component = f"{normalize_file(m.group(1))}:{m.group(2)}"
        context = m.group(3)
        explain = f"{m.group(4)}"
    else:
        # Bare AssertionError carries no message, so it needs its own pattern.
        m = re.search(
            r'File "([^"]+)", line ([0-9]+), in .+\n +(.+)\nAssertionError', log
        )
        if m is not None:
            r = "FAIL"
            component = f"{normalize_file(m.group(1))}:{m.group(2)}"
            context = m.group(3)
            explain = "AssertionError"

    # Sometimes, the benchmark will say FAIL without any useful info
    # See https://github.com/pytorch/torchdynamo/issues/1910
    if "FAIL" in log:
        r = "FAIL"

    if r == "UNKNOWN":
        c += 1

    backend_time = None
    frame_time = None
    # Walrus guard: a "TIMING:" marker that is the last line of the log
    # (no trailing newline) makes the search fail; the original then
    # crashed with AttributeError on `.group(1)` of None.
    if "TIMING:" in log and (tm := re.search("TIMING:(.*)\n", log)):
        result = tm.group(1)
        split_str = result.split("backend_compile:")
        if len(split_str) == 2:
            backend_time = float(split_str[1])
            frame_time = float(split_str[0].split("entire_frame_compile:")[1])

    # Same guard as TIMING above.  NOTE(review): `split_all` is currently
    # unused pending the TODO below.
    if "STATS:" in log and (sm := re.search("STATS:(.*)\n", log)):
        result = sm.group(1)
        # call_* op count: 970 | FakeTensor.__torch_dispatch__:35285 | ProxyTorchDispatchMode.__torch_dispatch__:13339
        split_all = result.split("|")
        # TODO: rewrite this to work with arbitrarily many stats

    graph_count = None
    op_count = None
    graph_breaks = None
    unique_graph_breaks = None
    if m := re.search(
        r"Dynamo produced (\d+) graphs covering (\d+) ops with (\d+) graph breaks \((\d+) unique\)",
        log,
    ):
        graph_count = m.group(1)
        op_count = m.group(2)
        graph_breaks = m.group(3)
        unique_graph_breaks = m.group(4)

    # If the context string is too long, don't put it in the CSV.
    # This is a hack to try to make it more likely that Google Sheets will
    # offer to split columns
    if len(context) > 78:
        context = ""

    # Temporary file names are meaningless, report it's generated code in this
    # case
    if "/tmp/" in component:
        component = "generated code"
        context = ""

    out.writerow(
        {
            "bench": bench,
            "name": name,
            "result": r,
            "component": component,
            "context": context,
            "explain": explain,
            "frame_time": frame_time,
            "backend_time": backend_time,
            "graph_count": graph_count,
            "op_count": op_count,
            "graph_breaks": graph_breaks,
            "unique_graph_breaks": unique_graph_breaks,
        }
    )
    i += 1

if c:
    print(f"failed to classify {c} entries", file=sys.stderr)