/**
 * Copyright (c) 2016-present, Facebook, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <iostream>
#include <optional>
#include <sstream>
#include <string>
#include <vector>

#include <ATen/ATen.h>
#include "caffe2/core/timer.h"
#include "caffe2/utils/string_utils.h"
#include <torch/csrc/autograd/grad_mode.h>
#include <torch/csrc/jit/mobile/module.h>
#include <torch/csrc/jit/mobile/import.h>
#include <torch/csrc/jit/serialization/import.h>
#include <torch/script.h>

#include <c10/mobile/CPUCachingAllocator.h>

#include <chrono>
using namespace std::chrono;

C10_DEFINE_string(model, "", "The given torch script model to benchmark.");
C10_DEFINE_string(
    input_dims,
    "",
    "Alternative to input_files, if all inputs are simple "
    "float TensorCPUs, specify the dimensions using comma-"
    "separated numbers. If multiple inputs are needed, use "
    "semicolons to separate the dimensions of different "
    "tensors.");
C10_DEFINE_string(input_type, "", "Input type (uint8_t/float/int64)");
C10_DEFINE_string(
    input_memory_format,
    "contiguous_format",
    "Input memory format (contiguous_format/channels_last)");
C10_DEFINE_bool(
  no_inputs,
  false,
  "Whether the model has any input. Will ignore other input arguments if true");
C10_DEFINE_bool(
  use_caching_allocator,
  false,
  "Whether to cache allocations between inference iterations");
C10_DEFINE_int(
    use_bundled_input,
    -1,
    "If set, benchmark will expect the model to have bundled inputs "
    "and will run on the input with this index.");
C10_DEFINE_bool(
  print_output,
  false,
  "Whether to print output with an all-ones input tensor.");
C10_DEFINE_int(warmup, 0, "The number of iterations to warm up.");
C10_DEFINE_int(iter, 10, "The number of iterations to run.");
C10_DEFINE_bool(
  report_pep,
  false,
  "Whether to print performance stats for AI-PEP.");

C10_DEFINE_int(pytext_len, 0, "Length of input sequence.");
C10_DEFINE_bool(vulkan, false, "Whether to use Vulkan backend (GPU).");
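
// Example invocation using the flags above (model.ptl is a placeholder path):
//   speed_benchmark_torch --model=model.ptl \
//     --input_dims="1,3,224,224" --input_type=float \
//     --input_memory_format=contiguous_format --warmup=5 --iter=20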

namespace {

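// Split `string` on `separator`, dropping empty pieces unless `ignore_empty`
// is false.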
std::vector<std::string>
split(char separator, const std::string& string, bool ignore_empty = true) {
  std::vector<std::string> pieces;
  std::stringstream ss(string);
  std::string item;
  while (getline(ss, item, separator)) {
    if (!ignore_empty || !item.empty()) {
      pieces.push_back(std::move(item));
    }
  }
  return pieces;
}

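// Build the benchmark inputs from the --input_dims/--input_type/
// --input_memory_format flags (each tensor is filled with ones), or return an
// empty list when --no_inputs or --use_bundled_input is in effect.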
std::vector<c10::IValue> create_inputs() {
  if (FLAGS_no_inputs) {
    return {};
  }

  if (FLAGS_use_bundled_input >= 0) {
    // Need to get these after the model is loaded.
    return {};
  }

  CAFFE_ENFORCE_GT(FLAGS_input_dims.size(), 0, "Input dims must be specified.");
  CAFFE_ENFORCE_GT(FLAGS_input_type.size(), 0, "Input type must be specified.");

  std::vector<std::string> input_dims_list = split(';', FLAGS_input_dims);
  std::vector<std::string> input_type_list = split(';', FLAGS_input_type);
  std::vector<std::string> input_memory_format_list =
      split(';', FLAGS_input_memory_format);

  CAFFE_ENFORCE_EQ(
      input_dims_list.size(),
      input_type_list.size(),
      "Input dims and type should have the same number of items.");
  CAFFE_ENFORCE_EQ(
      input_dims_list.size(),
      input_memory_format_list.size(),
      "Input dims and format should have the same number of items.");

  std::vector<c10::IValue> inputs;
  for (size_t i = 0; i < input_dims_list.size(); ++i) {
    auto input_dims_str = split(',', input_dims_list[i]);
    std::vector<int64_t> input_dims;
    for (const auto& s : input_dims_str) {
      input_dims.push_back(std::stoi(s));
    }

    at::ScalarType input_type;
    if (input_type_list[i] == "float") {
      input_type = at::ScalarType::Float;
    } else if (input_type_list[i] == "uint8_t") {
      input_type = at::ScalarType::Byte;
    } else if (input_type_list[i] == "int64") {
      input_type = at::ScalarType::Long;
    } else {
      CAFFE_THROW("Unsupported input type: ", input_type_list[i]);
    }

    at::MemoryFormat input_memory_format;
    if (input_memory_format_list[i] == "channels_last") {
      if (input_dims.size() != 4u) {
        CAFFE_THROW(
            "channels_last memory format is only available for 4D tensors!");
      }
      input_memory_format = at::MemoryFormat::ChannelsLast;
    } else if (input_memory_format_list[i] == "contiguous_format") {
      input_memory_format = at::MemoryFormat::Contiguous;
    } else {
      CAFFE_THROW(
          "Unsupported input memory format: ", input_memory_format_list[i]);
    }

    inputs.push_back(
        torch::ones(
            input_dims,
            at::TensorOptions(input_type)
                .memory_format(input_memory_format)));
  }

  if (FLAGS_pytext_len > 0) {
    auto stensor = FLAGS_pytext_len * at::ones({1}, torch::kI64);
    inputs.push_back(stensor);
  }

  return inputs;
}

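// Default runner: forwards the inputs to the module on the CPU.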
template<class T>
class Runner {
 public:
  virtual ~Runner() = default;
  virtual c10::IValue run(
      T& module,
      const std::vector<c10::IValue>& inputs) {
    return module.forward(inputs);
  }
};

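// Vulkan runner: on the first call, creates random tensors with the same
// shapes as the inputs and uploads them to GPU memory; on every call, runs
// forward and copies the first output tensor back to the CPU.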
template<class T>
class vkRunner final : public Runner<T> {
 public:
  virtual ~vkRunner() = default;
  virtual c10::IValue run(
      T& module,
      const std::vector<c10::IValue>& inputs) override {
    if (!module.attr("requires_backend_transfers", at::IValue(true)).toBool()) {
      // No need to transfer input/output backends.
      return module.forward(inputs);
    }

    if (inputs_.empty()) {
      // Upload the input tensor(s) to GPU memory.
      inputs_.reserve(inputs.size());
      for (const auto& input : inputs) {
        if (input.isTensor()) {
          inputs_.emplace_back(at::rand(input.toTensor().sizes()).vulkan());
        } else if (input.isTensorList()) {
          const c10::List<at::Tensor> input_as_list = input.toTensorList();
          c10::List<at::Tensor> input_vk_list;
          input_vk_list.reserve(input_as_list.size());
          for (size_t i = 0; i < input_as_list.size(); ++i) {
            const at::Tensor element = input_as_list.get(i);
            input_vk_list.emplace_back(at::rand(element.sizes()).vulkan());
          }
          inputs_.emplace_back(c10::IValue(input_vk_list));
        } else {
          CAFFE_THROW(
              "Inputs must only contain IValues of type c10::Tensor "
              "or c10::TensorList!");
        }
      }
    }

    // Run, and download the first output tensor to system memory.
    c10::IValue output = module.forward(inputs_);
    if (output.isTensor()) {
      return output.toTensor().cpu();
    } else if (output.isTensorList()) {
      return output.toTensorList().get(0).cpu();
    } else if (output.isList()) {
      return output.toList().get(0).toTensor().cpu();
    } else if (output.isTuple()) {
      return output.toTuple()->elements()[0].toTensor().cpu();
    } else {
      CAFFE_THROW("Outputs must only be either c10::Tensor or c10::TensorList!");
    }
  }

 private:
  std::vector<c10::IValue> inputs_;
};

} // namespace

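// Entry point: parses flags, builds (or extracts bundled) inputs, loads the
// module, then runs warmup iterations followed by timed main iterations.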
int main(int argc, char** argv) {
  c10::SetUsageMessage(
    "Run speed benchmark for a PyTorch model.\n"
    "Example usage:\n"
    "./speed_benchmark_torch"
    " --model=<model_file>"
    " --use_bundled_input=0"
    " --warmup=5"
    " --iter=20");
  if (!c10::ParseCommandLineFlags(&argc, &argv)) {
    std::cerr << "Failed to parse command line flags!" << std::endl;
    return 1;
  }

  std::vector<c10::IValue> inputs = create_inputs();

  c10::InferenceMode mode;
#ifdef BUILD_LITE_INTERPRETER
  auto module = torch::jit::_load_for_mobile(FLAGS_model);
#else
  torch::jit::GraphOptimizerEnabledGuard no_optimizer_guard(false);
  auto module = torch::jit::load(FLAGS_model);
#endif

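  // When --use_bundled_input is set, replace the inputs with the tuple that
  // torch.utils.bundled_inputs packaged into the model at the given index.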
  if (FLAGS_use_bundled_input >= 0) {
    auto get_method = module.find_method("get_all_bundled_inputs");
    if (!get_method) {
      std::cerr << "Model does not have bundled inputs. Before saving," << std::endl
        << "use torch.utils.bundled_inputs.augment_model_with_bundled_inputs." << std::endl;
      return 1;
    }

    auto all_inputs = (*get_method)({}).toList();
    if (static_cast<size_t>(FLAGS_use_bundled_input) >= all_inputs.size()) {
      // NOTE: This check is only to make the error message nicer.
      // The get call below does internal bounds checking.
      std::cerr << "Model has only " << all_inputs.size() << " bundled inputs." << std::endl;
      return 1;
    }
    inputs = all_inputs.get(FLAGS_use_bundled_input).toTupleRef().elements();
  }

#ifdef BUILD_LITE_INTERPRETER
  using ModuleType = torch::jit::mobile::Module;
#else
  using ModuleType = torch::jit::Module;
#endif

  const auto runner = FLAGS_vulkan ? std::make_unique<vkRunner<ModuleType>>()
                                   : std::make_unique<Runner<ModuleType>>();

#ifndef BUILD_LITE_INTERPRETER
  module.eval();
#endif

  if (FLAGS_print_output) {
    std::cout << runner->run(module, inputs) << std::endl;
  }

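  // Optionally reuse CPU allocations across iterations to take allocator
  // overhead out of the timed loop.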
  c10::CPUCachingAllocator caching_allocator;
  std::optional<c10::WithCPUCachingAllocatorGuard> caching_allocator_guard;
  if (FLAGS_use_caching_allocator) {
    caching_allocator_guard.emplace(&caching_allocator);
  }
  std::cout << "Starting benchmark." << std::endl;
  std::cout << "Running warmup runs." << std::endl;
  CAFFE_ENFORCE(
      FLAGS_warmup >= 0,
      "Number of warm up runs should be non-negative, provided ",
      FLAGS_warmup,
      ".");
  for (int i = 0; i < FLAGS_warmup; ++i) {
    runner->run(module, inputs);
  }

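  // Each iteration is timed individually with std::chrono (reported when
  // --report_pep is set), while caffe2::Timer measures the aggregate wall
  // time for the summary below.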
  std::cout << "Main runs." << std::endl;
  CAFFE_ENFORCE(
      FLAGS_iter >= 0,
      "Number of main runs should be non-negative, provided ",
      FLAGS_iter,
      ".");
  caffe2::Timer timer;
  std::vector<float> times;
  auto micros = timer.MicroSeconds();
  for (int i = 0; i < FLAGS_iter; ++i) {
    auto start = high_resolution_clock::now();
    runner->run(module, inputs);
    auto stop = high_resolution_clock::now();
    auto duration = duration_cast<microseconds>(stop - start);
    times.push_back(duration.count());
  }
  micros = timer.MicroSeconds();
  if (FLAGS_report_pep) {
    for (auto t : times) {
      std::cout << "PyTorchObserver {\"type\": \"NET\", \"unit\": \"us\", \"metric\": \"latency\", \"value\": \"" << t << "\"}" << std::endl;
    }
  }
  std::cout << "Main run finished. Microseconds per iter: "
            << micros / FLAGS_iter
            << ". Iters per second: " << 1000.0 * 1000 * FLAGS_iter / micros
            << std::endl;

  return 0;
}