#!/usr/bin/env python3
import gc
import importlib
import logging
import os
import re
import sys
import warnings
from collections import namedtuple
from os.path import abspath, exists

import torch

try:
    from .common import BenchmarkRunner, load_yaml_file, main
except ImportError:
    from common import BenchmarkRunner, load_yaml_file, main

from torch._dynamo.testing import collect_results, reduce_to_scalar_loss
from torch._dynamo.utils import clone_inputs


# We are primarily interested in the tf32 datatype
torch.backends.cuda.matmul.allow_tf32 = True

# Enable FX graph caching
if "TORCHINDUCTOR_FX_GRAPH_CACHE" not in os.environ:
    torch._inductor.config.fx_graph_cache = True


def _reassign_parameters(model):
    # torch_geometric models register parameters as plain tensors due to
    # https://github.com/pyg-team/pytorch_geometric/blob/master/torch_geometric/nn/dense/linear.py#L158-L168
    # Since that is an unusual thing to do, we just reassign them to parameters
    def state_dict_hook(module, destination, prefix, local_metadata):
        for name, param in module.named_parameters():
            if isinstance(destination[name], torch.Tensor) and not isinstance(
                destination[name], torch.nn.Parameter
            ):
                destination[name] = torch.nn.Parameter(destination[name])

    model._register_state_dict_hook(state_dict_hook)


def setup_torchbench_cwd():
    original_dir = abspath(os.getcwd())

    os.environ["KALDI_ROOT"] = "/tmp"  # avoids some spam
    for torchbench_dir in (
        "./torchbenchmark",
        "../torchbenchmark",
        "../torchbench",
        "../benchmark",
        "../../torchbenchmark",
        "../../torchbench",
        "../../benchmark",
        "../../../torchbenchmark",
        "../../../torchbench",
        "../../../benchmark",
    ):
        if exists(torchbench_dir):
            break

    if exists(torchbench_dir):
        torchbench_dir = abspath(torchbench_dir)
        os.chdir(torchbench_dir)
        sys.path.append(torchbench_dir)

    return original_dir


def process_hf_reformer_output(out):
    assert isinstance(out, list)
    # second output is unstable
    return [elem for i, elem in enumerate(out) if i != 1]


def process_hf_whisper_output(out):
    out_ret = []
    for i, elem in enumerate(out):
        if i == 0:
            assert isinstance(elem, dict)
            out_ret.append({k: v for k, v in elem.items() if k != "logits"})
        elif i != 1:
            out_ret.append(elem)

    return out_ret


process_train_model_output = {
    "hf_Reformer": process_hf_reformer_output,
    "hf_Whisper": process_hf_whisper_output,
}
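
# A minimal sketch of how the dispatch table above is consumed: the shared
# harness in common.py fetches it through the get_output_amp_train_process_func
# property below and, when training under AMP, filters the unstable outputs
# before comparing accuracy. `name` and `out` are hypothetical placeholders:
#
#   process_fn = process_train_model_output.get(name)
#   if process_fn is not None:
#       out = process_fn(out)
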
class TorchBenchmarkRunner(BenchmarkRunner):
    def __init__(self):
        super().__init__()
        self.suite_name = "torchbench"
        self.optimizer = None

    @property
    def _config(self):
        return load_yaml_file("torchbench.yaml")

    @property
    def _skip(self):
        return self._config["skip"]

    @property
    def _batch_size(self):
        return self._config["batch_size"]

    @property
    def _tolerance(self):
        return self._config["tolerance"]

    @property
    def _require_larger_multiplier_for_smaller_tensor(self):
        return self._config["require_larger_multiplier_for_smaller_tensor"]

    @property
    def _accuracy(self):
        return self._config["accuracy"]

    @property
    def skip_models(self):
        return self._skip["all"]

    @property
    def skip_models_for_cpu(self):
        return self._skip["device"]["cpu"]

    @property
    def skip_models_for_cuda(self):
        return self._skip["device"]["cuda"]

    @property
    def skip_models_for_freezing_cuda(self):
        return self._skip["freezing"]["cuda"]

    @property
    def skip_models_for_freezing_cpu(self):
        return self._skip["freezing"]["cpu"]

    @property
    def slow_models(self):
        return self._config["slow"]

    @property
    def very_slow_models(self):
        return self._config["very_slow"]

    @property
    def non_deterministic_models(self):
        return self._config["non_deterministic"]

    @property
    def get_output_amp_train_process_func(self):
        return process_train_model_output

    @property
    def skip_not_suitable_for_training_models(self):
        return self._skip["test"]["training"]

    @property
    def failing_fx2trt_models(self):
        return self._config["trt_not_yet_working"]

    @property
    def force_amp_for_fp16_bf16_models(self):
        return self._config["dtype"]["force_amp_for_fp16_bf16_models"]

    @property
    def force_fp16_for_bf16_models(self):
        return self._config["dtype"]["force_fp16_for_bf16_models"]

    @property
    def skip_accuracy_checks_large_models_dashboard(self):
        if self.args.dashboard or self.args.accuracy:
            return self._accuracy["skip"]["large_models"]
        return set()

    @property
    def skip_accuracy_check_as_eager_non_deterministic(self):
        if self.args.accuracy and self.args.training:
            return self._accuracy["skip"]["eager_not_deterministic"]
        return set()

    @property
    def skip_multiprocess_models(self):
        return self._skip["multiprocess"]

    @property
    def skip_models_due_to_control_flow(self):
        return self._skip["control_flow"]

    @property
    def guard_on_nn_module_models(self):
        return {
            "vision_maskrcnn",
        }

    @property
    def inline_inbuilt_nn_modules_models(self):
        return {
            "basic_gnn_edgecnn",
            "drq",
            "hf_Reformer",
            "DALLE2_pytorch",
            "hf_BigBird",
            "detectron2_maskrcnn_r_50_fpn",
            "detectron2_maskrcnn_r_101_fpn",
            "vision_maskrcnn",
            "doctr_reco_predictor",
            "hf_T5_generate",
        }
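
    # The config properties above assume a torchbench.yaml shaped roughly as
    # sketched below; the keys are inferred from the lookups in this class and
    # the entries shown are illustrative, not exhaustive:
    #
    #   skip:
    #     all: [...]
    #     device: {cpu: [...], cuda: [...]}
    #     freezing: {cpu: [...], cuda: [...]}
    #     test: {training: [...]}
    #     multiprocess: [...]
    #     control_flow: [...]
    #   batch_size:
    #     training: {model_name: batch_size}
    #     inference: {model_name: batch_size}
    #   tolerance:
    #     higher: [...]
    #     even_higher: [...]
    #     higher_fp16: [...]
    #     higher_bf16: [...]
    #     cosine: [...]
    #   accuracy:
    #     skip: {large_models: [...], eager_not_deterministic: [...]}
    #     max_batch_size: {model_name: batch_size}
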
    def load_model(
        self,
        device,
        model_name,
        batch_size=None,
        part=None,
        extra_args=None,
    ):
        if self.args.enable_activation_checkpointing:
            raise NotImplementedError(
                "Activation checkpointing not implemented for Torchbench models"
            )
        is_training = self.args.training
        use_eval_mode = self.args.use_eval_mode
        dynamic_shapes = self.args.dynamic_shapes
        candidates = [
            f"torchbenchmark.models.{model_name}",
            f"torchbenchmark.canary_models.{model_name}",
            f"torchbenchmark.models.fb.{model_name}",
        ]
        for c in candidates:
            try:
                module = importlib.import_module(c)
                break
            except ModuleNotFoundError as e:
                if e.name != c:
                    raise
        else:
            raise ImportError(f"could not import any of {candidates}")
        benchmark_cls = getattr(module, "Model", None)
        if benchmark_cls is None:
            raise NotImplementedError(f"{model_name}.Model is None")

        if not hasattr(benchmark_cls, "name"):
            benchmark_cls.name = model_name

        cant_change_batch_size = (
            not getattr(benchmark_cls, "ALLOW_CUSTOMIZE_BSIZE", True)
            or model_name in self._config["dont_change_batch_size"]
        )
        if cant_change_batch_size:
            batch_size = None
        if (
            batch_size is None
            and is_training
            and model_name in self._batch_size["training"]
        ):
            batch_size = self._batch_size["training"][model_name]
        elif (
            batch_size is None
            and not is_training
            and model_name in self._batch_size["inference"]
        ):
            batch_size = self._batch_size["inference"][model_name]

        # Control the memory footprint for a few models
        if self.args.accuracy and model_name in self._accuracy["max_batch_size"]:
            batch_size = min(batch_size, self._accuracy["max_batch_size"][model_name])

        # workaround "RuntimeError: not allowed to set torch.backends.cudnn flags"
        torch.backends.__allow_nonbracketed_mutation_flag = True

        if extra_args is None:
            extra_args = []
        if part:
            extra_args += ["--part", part]

        # sam_fast only runs with amp
        if model_name == "sam_fast":
            self.args.amp = True
            self.setup_amp()

        if model_name == "vision_maskrcnn" and is_training:
            # Output of the vision_maskrcnn model is a list of bounding boxes,
            # sorted on the basis of their scores. This makes accuracy
            # comparison hard with torch.compile. torch.compile can cause minor
            # divergences in the output because of how fusion works for amp in
            # TorchInductor compared to eager. Therefore, instead of looking at
            # all the bounding boxes, we compare only the top 4.
            model_kwargs = {"box_detections_per_img": 4}
            benchmark = benchmark_cls(
                test="train",
                device=device,
                batch_size=batch_size,
                extra_args=extra_args,
                model_kwargs=model_kwargs,
            )
            use_eval_mode = True
        elif is_training:
            benchmark = benchmark_cls(
                test="train",
                device=device,
                batch_size=batch_size,
                extra_args=extra_args,
            )
        else:
            benchmark = benchmark_cls(
                test="eval",
                device=device,
                batch_size=batch_size,
                extra_args=extra_args,
            )
        model, example_inputs = benchmark.get_module()
        if model_name in [
            "basic_gnn_edgecnn",
            "basic_gnn_gcn",
            "basic_gnn_sage",
            "basic_gnn_gin",
        ]:
            _reassign_parameters(model)

        # Models that must be in train mode while training
        if is_training and (
            not use_eval_mode or model_name in self._config["only_training"]
        ):
            model.train()
        else:
            model.eval()
        gc.collect()
        batch_size = benchmark.batch_size
        if model_name == "torchrec_dlrm":
            batch_namedtuple = namedtuple(
                "Batch", "dense_features sparse_features labels"
            )
            example_inputs = tuple(
                batch_namedtuple(
                    dense_features=batch.dense_features,
                    sparse_features=batch.sparse_features,
                    labels=batch.labels,
                )
                for batch in example_inputs
            )
        # Torchbench has a quite different setup for yolov3, so we directly
        # pass the right example_inputs
        if model_name == "yolov3":
            example_inputs = (torch.rand(batch_size, 3, 384, 512).to(device),)
        # See https://github.com/pytorch/benchmark/issues/1561
        if model_name == "maml_omniglot":
            batch_size = 5
            assert example_inputs[0].shape[0] == batch_size
        if model_name == "vision_maskrcnn":
            batch_size = 1

        # global current_name, current_device
        # current_device = device
        # current_name = benchmark.name

        if self.args.trace_on_xla:
            # workaround for: https://github.com/pytorch/xla/issues/4174
            import torch_xla  # noqa: F401

        self.validate_model(model, example_inputs)
        return device, benchmark.name, model, example_inputs, batch_size
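
    # A hedged usage sketch of load_model (the real call sites live in
    # common.py's run loop; `parsed_args` is a hypothetical argparse
    # namespace and "resnet50" stands in for any torchbench model name):
    #
    #   runner = TorchBenchmarkRunner()
    #   runner.args = parsed_args
    #   device, name, model, example_inputs, batch_size = runner.load_model(
    #       "cuda", "resnet50", batch_size=None
    #   )
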
    def iter_model_names(self, args):
        from torchbenchmark import _list_canary_model_paths, _list_model_paths

        models = _list_model_paths()
        models += [
            f
            for f in _list_canary_model_paths()
            if os.path.basename(f) in self._config["canary_models"]
        ]
        models.sort()

        start, end = self.get_benchmark_indices(len(models))
        for index, model_path in enumerate(models):
            if index < start or index >= end:
                continue

            model_name = os.path.basename(model_path)
            if (
                not re.search("|".join(args.filter), model_name, re.IGNORECASE)
                or re.search("|".join(args.exclude), model_name, re.IGNORECASE)
                or model_name in args.exclude_exact
                or model_name in self.skip_models
            ):
                continue

            yield model_name

    def pick_grad(self, name, is_training):
        if is_training or name in ("maml",):
            return torch.enable_grad()
        else:
            return torch.no_grad()

    def use_larger_multiplier_for_smaller_tensor(self, name):
        return name in self._require_larger_multiplier_for_smaller_tensor

    def get_tolerance_and_cosine_flag(self, is_training, current_device, name):
        tolerance = 1e-4
        cosine = self.args.cosine
        # Increase the tolerance for torch allclose
        if self.args.float16 or self.args.amp:
            if name in self._tolerance["higher_fp16"]:
                return 1e-2, cosine
            elif name in self._tolerance["even_higher"]:
                return 8 * 1e-2, cosine
            return 1e-3, cosine

        if self.args.bfloat16:
            if name in self._tolerance["higher_bf16"]:
                return 1e-2, cosine

        if is_training and (current_device == "cuda" or current_device == "xpu"):
            tolerance = 1e-3
            if name in self._tolerance["cosine"]:
                cosine = True
            elif name in self._tolerance["higher"]:
                tolerance = 1e-3
            elif name in self._tolerance["even_higher"]:
                tolerance = 8 * 1e-2
        return tolerance, cosine

    def compute_loss(self, pred):
        return reduce_to_scalar_loss(pred)

    def forward_pass(self, mod, inputs, collect_outputs=True):
        with self.autocast(**self.autocast_arg):
            if isinstance(inputs, dict):
                return mod(**inputs)
            else:
                return mod(*inputs)

    def forward_and_backward_pass(self, mod, inputs, collect_outputs=True):
        cloned_inputs = clone_inputs(inputs)
        self.optimizer_zero_grad(mod)
        with self.autocast(**self.autocast_arg):
            if isinstance(cloned_inputs, dict):
                pred = mod(**cloned_inputs)
            else:
                pred = mod(*cloned_inputs)
            loss = self.compute_loss(pred)
        self.grad_scaler.scale(loss).backward()
        self.optimizer_step()
        if collect_outputs:
            return collect_results(mod, pred, loss, cloned_inputs)
        return None


def torchbench_main():
    original_dir = setup_torchbench_cwd()
    logging.basicConfig(level=logging.WARNING)
    warnings.filterwarnings("ignore")
    main(TorchBenchmarkRunner(), original_dir)


if __name__ == "__main__":
    torchbench_main()
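
# Example invocation, as a sketch: the flags are defined by the shared
# dynamo benchmark harness in common.py (run `python torchbench.py --help`
# for the authoritative list), and resnet50 stands in for any model name:
#
#   python torchbench.py --performance --training --backend=inductor --only=resnet50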