# Some models have large datasets that don't fit in memory. Lower the batch
# size to test the accuracy.
batch_size:
  training:
    demucs: 4
    dlrm: 1024
    densenet121: 4
    hf_Reformer: 4
    hf_T5_base: 4
    timm_efficientdet: 1
    llama_v2_7b_16h: 1
    # reduced from 16 due to cudagraphs OOM in the TorchInductor dashboard
    yolov3: 8

  inference:
    timm_efficientdet: 32


dont_change_batch_size:
  - demucs
  - pytorch_struct
  - pyhpc_turbulent_kinetic_energy
  # https://github.com/pytorch/benchmark/pull/1656
  - vision_maskrcnn


tolerance:
  # These models need higher tolerance on GPU because their GPU kernels are
  # non-deterministic.
  higher:
    - alexnet
    - attention_is_all_you_need_pytorch
    - densenet121
    - hf_Albert
    - vgg16
    - mobilenet_v3_large
    - nvidia_deeprecommender
    - timm_efficientdet

  # These models need >1e-3 tolerance
  even_higher:
    - soft_actor_critic
    - tacotron2
    - yolov3
    - timm_efficientdet
    - squeezenet1_1

  higher_fp16:
    - doctr_reco_predictor
    - drq
    - hf_Whisper

  higher_bf16:
    - doctr_reco_predictor
    - drq
    - hf_Whisper

  cosine: []

require_larger_multiplier_for_smaller_tensor:
  - yolov3

# These benchmarks took >600s on an i9-11900K CPU
very_slow: &VERY_SLOW_MODELS
  # 3339s
  - hf_BigBird
  # 3062s
  - hf_Longformer
  # 930s
  - hf_T5


# These benchmarks took >60s on an i9-11900K CPU
slow:
  - *VERY_SLOW_MODELS
  # 137s
  - BERT_pytorch
  # 116s
  - demucs
  # 242s
  - fastNLP_Bert
  # 221s
  - hf_Albert
  # 400s
  - hf_Bart
  # 334s
  - hf_Bert
  # 187s
  - hf_DistilBert
  # 470s
  - hf_GPT2
  # 141s
  - hf_Reformer
  # 317s
  - speech_transformer
  # 99s
  - vision_maskrcnn


non_deterministic:
  # https://github.com/pytorch/pytorch/issues/98355
  - mobilenet_v3_large
  - sam_fast


dtype:
  force_amp_for_fp16_bf16_models:
    - DALLE2_pytorch
    - doctr_det_predictor
    - doctr_reco_predictor
    - Super_SloMo
    - tts_angular
    - pyhpc_turbulent_kinetic_energy
    - detectron2_fcos_r_50_fpn

  force_fp16_for_bf16_models:
    - vision_maskrcnn


# Models in canary_models that we should run anyway
canary_models:
  - torchrec_dlrm


detectron2_models: &DETECTRON2_MODELS
  - detectron2_fasterrcnn_r_101_c4
  - detectron2_fasterrcnn_r_101_dc5
  - detectron2_fasterrcnn_r_101_fpn
  - detectron2_fasterrcnn_r_50_c4
  - detectron2_fasterrcnn_r_50_dc5
  - detectron2_fasterrcnn_r_50_fpn
  - detectron2_maskrcnn_r_101_c4
  - detectron2_maskrcnn_r_101_fpn
  - detectron2_maskrcnn_r_50_fpn


# These models support only train mode, so accuracy checking can't be done in
# eval mode.
only_training:
  - *DETECTRON2_MODELS
  - tts_angular
  - tacotron2
  - demucs
  - hf_Reformer
  - pytorch_struct
  - yolov3


trt_not_yet_working:
  - alexnet
  - resnet18
  - resnet50
  - mobilenet_v2
  - mnasnet1_0
  - squeezenet1_1
  - shufflenetv2_x1_0
  - vgg16
  - resnext50_32x4d


skip:
  all:
    # OOMs (A100 40G)
    - detectron2_maskrcnn
    # TIMEOUT, https://github.com/pytorch/pytorch/issues/98467
    - tacotron2
    # Failing in eager mode
    - hf_clip
    # multi-GPU is not always available on benchmark runners
    - simple_gpt_tp_manual

  device:
    cpu:
      # OOMs
      - hf_T5_generate
      # model is CUDA only
      - cm3leon_generate
      # timeout
      - nanogpt
      # timeout
      - sam
      # model is CUDA only
      - sam_fast
      # model is CUDA only
      - llama_v2_7b_16h
      # flaky
      - stable_diffusion
      # requires FBGEMM, CUDA only
      - torchrec_dlrm
      - simple_gpt
      # works on CUDA, accuracy failure on CPU
      - hf_Whisper
      - stable_diffusion_text_encoder
      - llava
      - moco

    cuda: []

  test:
    training:
      - *DETECTRON2_MODELS
      # not designed for training
      - pyhpc_equation_of_state
      - pyhpc_isoneutral_mixing
      - pyhpc_turbulent_kinetic_energy
      - maml
      - llama
      - llama_v2_7b_16h
      - simple_gpt
      - sam_fast
      # Model's DEFAULT_TRAIN_BSIZE is not implemented
      - cm3leon_generate
      - hf_T5_generate
      - doctr_det_predictor
      - doctr_reco_predictor
      - moondream
      # doesn't fit in memory
      - phi_1_5
      - detectron2_fcos_r_50_fpn

  control_flow:
    - cm3leon_generate
    - detectron2_fcos_r_50_fpn
    - fastNLP_Bert
    - hf_Longformer
    - hf_Reformer
    - hf_T5_generate
    - opacus_cifar10
    - speech_transformer

  # Models that should only run in --multiprocess mode
  multiprocess:
    - simple_gpt

  # For these models, conv-batchnorm fusing causes large numerical churn, so
  # skip them. mnasnet1_0 and shufflenet_v2_x1_0 can pass on CPU; moco is
  # CUDA only.
  freezing:
    cuda:
      - mnasnet1_0
      - moco
      - shufflenet_v2_x1_0
    cpu: []


accuracy:
  skip:
    large_models:
      # Models too large to hold eager, dynamo, and fp64 results simultaneously,
      # even on a 40 GB machine. We have tested accuracy on smaller versions of
      # these models.
      - hf_GPT2_large
      - hf_T5_large
      - timm_vision_transformer_large
      # accuracy https://github.com/pytorch/pytorch/issues/93847
      - maml
      - llama_v2_7b_16h
      - Background_Matting
      - stable_diffusion_unet
    eager_not_deterministic:
      # Models for which deterministic algorithms cannot be turned on in eager mode.
      - Background_Matting
      - pytorch_unet

  max_batch_size:
    hf_GPT2: 2
    pytorch_unet: 2
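
# ---------------------------------------------------------------------------
# Note on the YAML anchors/aliases used above (*VERY_SLOW_MODELS and
# *DETECTRON2_MODELS): when an alias appears as a list item, standard YAML
# loaders produce a nested list rather than splicing the entries in, so a
# consumer of this file has to flatten those lists itself. A minimal sketch,
# assuming PyYAML; the file name and the flatten helper below are illustrative
# and not defined by this config:
#
#   import yaml
#
#   with open("benchmark_config.yaml") as f:  # hypothetical path
#       cfg = yaml.safe_load(f)
#
#   def flatten(entries):
#       # Expand items that are themselves lists (the result of YAML aliases).
#       out = []
#       for entry in entries:
#           if isinstance(entry, list):
#               out.extend(entry)
#           else:
#               out.append(entry)
#       return out
#
#   slow_models = set(flatten(cfg["slow"]))              # includes *VERY_SLOW_MODELS
#   only_training = set(flatten(cfg["only_training"]))   # includes *DETECTRON2_MODELS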