/**
 * Copyright (c) 2016-present, Facebook, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <iostream>
#include <optional>
#include <sstream>
#include <string>
#include <vector>

#include <ATen/ATen.h>
#include "caffe2/core/timer.h"
#include "caffe2/utils/string_utils.h"
#include <torch/csrc/autograd/grad_mode.h>
#include <torch/csrc/jit/mobile/module.h>
#include <torch/csrc/jit/mobile/import.h>
#include <torch/csrc/jit/serialization/import.h>
#include <torch/script.h>

#include <c10/mobile/CPUCachingAllocator.h>

#include <chrono>
using namespace std::chrono;

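// Command-line flags controlling the model path, synthetic input generation,
// and benchmark behavior.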
C10_DEFINE_string(model, "", "The given torch script model to benchmark.");
C10_DEFINE_string(
    input_dims,
    "",
    "Alternative to input_files: if all inputs are simple "
    "float TensorCPUs, specify their dimensions as comma-"
    "separated numbers. If multiple inputs are needed, use "
    "a semicolon to separate the dimensions of the different "
    "tensors.");
C10_DEFINE_string(input_type, "", "Input type (uint8_t/float/int64)");
C10_DEFINE_string(
    input_memory_format,
    "contiguous_format",
    "Input memory format (contiguous_format/channels_last)");
C10_DEFINE_bool(
    no_inputs,
    false,
    "Whether the model takes no inputs. If true, all other input arguments are ignored.");
C10_DEFINE_bool(
    use_caching_allocator,
    false,
    "Whether to cache allocations between inference iterations");
C10_DEFINE_int(
    use_bundled_input,
    -1,
    "If set, the benchmark will expect the model to have bundled inputs "
    "and will run on the input with this index.");
C10_DEFINE_bool(
    print_output,
    false,
    "Whether to print the model output (inputs default to all-ones tensors).");
C10_DEFINE_int(warmup, 0, "The number of iterations to warm up.");
C10_DEFINE_int(iter, 10, "The number of iterations to run.");
C10_DEFINE_bool(
    report_pep,
    false,
    "Whether to print performance stats for AI-PEP.");

C10_DEFINE_int(pytext_len, 0, "Length of input sequence.");
C10_DEFINE_bool(vulkan, false, "Whether to use the Vulkan backend (GPU).");

namespace {

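// Splits `string` on `separator`; used to parse the semicolon/comma separated
// input flags. Empty pieces are dropped unless `ignore_empty` is false.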
std::vector<std::string>
split(char separator, const std::string& string, bool ignore_empty = true) {
  std::vector<std::string> pieces;
  std::stringstream ss(string);
  std::string item;
  while (getline(ss, item, separator)) {
    if (!ignore_empty || !item.empty()) {
      pieces.push_back(std::move(item));
    }
  }
  return pieces;
}

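// Builds the benchmark inputs from the --input_dims/--input_type/
// --input_memory_format flags. Returns an empty vector when the model takes
// no inputs or when bundled inputs will be fetched from the model later.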
std::vector<c10::IValue> create_inputs() {
  if (FLAGS_no_inputs) {
    return {};
  }

  if (FLAGS_use_bundled_input >= 0) {
    // Need to get these after the model is loaded.
    return {};
  }

  CAFFE_ENFORCE_GT(FLAGS_input_dims.size(), 0, "Input dims must be specified.");
  CAFFE_ENFORCE_GT(FLAGS_input_type.size(), 0, "Input type must be specified.");

  std::vector<std::string> input_dims_list = split(';', FLAGS_input_dims);
  std::vector<std::string> input_type_list = split(';', FLAGS_input_type);
  std::vector<std::string> input_memory_format_list =
      split(';', FLAGS_input_memory_format);

  CAFFE_ENFORCE_EQ(
      input_dims_list.size(),
      input_type_list.size(),
      "Input dims and type should have the same number of items.");
  CAFFE_ENFORCE_EQ(
      input_dims_list.size(),
      input_memory_format_list.size(),
      "Input dims and format should have the same number of items.");

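  // For each input spec, build an all-ones tensor with the requested shape,
  // dtype, and memory format.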
  std::vector<c10::IValue> inputs;
  for (size_t i = 0; i < input_dims_list.size(); ++i) {
    auto input_dims_str = split(',', input_dims_list[i]);
    std::vector<int64_t> input_dims;
    for (const auto& s : input_dims_str) {
      input_dims.push_back(std::stoi(s));
    }

    at::ScalarType input_type;
    if (input_type_list[i] == "float") {
      input_type = at::ScalarType::Float;
    } else if (input_type_list[i] == "uint8_t") {
      input_type = at::ScalarType::Byte;
    } else if (input_type_list[i] == "int64") {
      input_type = at::ScalarType::Long;
    } else {
      CAFFE_THROW("Unsupported input type: ", input_type_list[i]);
    }

    at::MemoryFormat input_memory_format;
    if (input_memory_format_list[i] == "channels_last") {
      if (input_dims.size() != 4u) {
        CAFFE_THROW(
            "channels_last memory format only available on 4D tensors!");
      }
      input_memory_format = at::MemoryFormat::ChannelsLast;
    } else if (input_memory_format_list[i] == "contiguous_format") {
      input_memory_format = at::MemoryFormat::Contiguous;
    } else {
      CAFFE_THROW(
          "Unsupported input memory format: ", input_memory_format_list[i]);
    }

    inputs.push_back(
        torch::ones(
            input_dims,
            at::TensorOptions(input_type).memory_format(input_memory_format)));
  }

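  // When --pytext_len is set, append the sequence length as a 1-element
  // int64 tensor.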
  if (FLAGS_pytext_len > 0) {
    auto stensor = FLAGS_pytext_len * at::ones({1}, torch::kI64);
    inputs.push_back(stensor);
  }

  return inputs;
}

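// Default runner: calls Module::forward directly with the provided inputs.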
template <class T>
class Runner {
 public:
  virtual ~Runner() = default;
  virtual c10::IValue run(
      T& module,
      const std::vector<c10::IValue>& inputs) {
    return module.forward(inputs);
  }
};

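// Vulkan runner: on the first call, creates Vulkan-backed tensors matching the
// shapes of the provided inputs, then runs forward with them and copies the
// first output tensor back to CPU memory.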
template <class T>
class vkRunner final : public Runner<T> {
 public:
  virtual ~vkRunner() = default;
  virtual c10::IValue run(
      T& module,
      const std::vector<c10::IValue>& inputs) override {
    if (!module.attr("requires_backend_transfers", at::IValue(true)).toBool()) {
      // No need to transfer input/output backends
      return module.forward(inputs);
    }

    if (inputs_.size() == 0) {
      // Upload the input tensor(s) to GPU memory.
      inputs_.clear();
      inputs_.reserve(inputs.size());
      for (const auto& input : inputs) {
        if (input.isTensor()) {
          inputs_.emplace_back(at::rand(input.toTensor().sizes()).vulkan());
        } else if (input.isTensorList()) {
          const c10::List<at::Tensor> input_as_list = input.toTensorList();
          c10::List<at::Tensor> input_vk_list;
          input_vk_list.reserve(input_as_list.size());
          for (size_t i = 0; i < input_as_list.size(); ++i) {
            const at::Tensor element = input_as_list.get(i);
            input_vk_list.emplace_back(at::rand(element.sizes()).vulkan());
          }
          inputs_.emplace_back(c10::IValue(input_vk_list));
        } else {
          CAFFE_THROW(
              "Inputs must only contain IValues of type c10::Tensor or c10::TensorList!");
        }
      }
    }

    // Run, and download the output tensor to system memory.
    c10::IValue output = module.forward(inputs_);
    if (output.isTensor()) {
      return output.toTensor().cpu();
    } else if (output.isTensorList()) {
      return output.toTensorList().get(0).cpu();
    } else if (output.isList()) {
      return output.toList().get(0).toTensor().cpu();
    } else if (output.isTuple()) {
      return output.toTuple()->elements()[0].toTensor().cpu();
    } else {
      CAFFE_THROW("Outputs must only be either c10::Tensor or c10::TensorList!");
    }
  }

 private:
  std::vector<c10::IValue> inputs_;
};

} // namespace

int main(int argc, char** argv) {
  c10::SetUsageMessage(
      "Run speed benchmark for pytorch model.\n"
      "Example usage:\n"
      "./speed_benchmark_torch"
      " --model=<model_file>"
      " --use_bundled_input=0"
      " --warmup=5"
      " --iter=20");
  if (!c10::ParseCommandLineFlags(&argc, &argv)) {
    std::cerr << "Failed to parse command line flags!" << std::endl;
    return 1;
  }

  std::vector<c10::IValue> inputs = create_inputs();

  c10::InferenceMode mode;
#ifdef BUILD_LITE_INTERPRETER
  auto module = torch::jit::_load_for_mobile(FLAGS_model);
#else
  torch::jit::GraphOptimizerEnabledGuard no_optimizer_guard(false);
  auto module = torch::jit::load(FLAGS_model);
#endif

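  // When --use_bundled_input is set, replace the synthetic inputs with the
  // requested bundled input tuple stored inside the model.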
  if (FLAGS_use_bundled_input >= 0) {
    auto get_method = module.find_method("get_all_bundled_inputs");
    if (!get_method) {
      std::cerr << "Model does not have bundled inputs. Before saving," << std::endl
                << "use torch.utils.bundled_inputs.augment_model_with_bundled_inputs." << std::endl;
      return 1;
    }

    auto all_inputs = (*get_method)({}).toList();
    if (FLAGS_use_bundled_input >= all_inputs.size()) {
      // NOTE: This check is only to make the error message nicer.
      // The get call below does internal bounds checking.
      std::cerr << "Model has only " << all_inputs.size() << " bundled inputs." << std::endl;
      return 1;
    }
    inputs = all_inputs.get(FLAGS_use_bundled_input).toTupleRef().elements();
  }

#ifdef BUILD_LITE_INTERPRETER
  using ModuleType = torch::jit::mobile::Module;
#else
  using ModuleType = torch::jit::Module;
#endif

  const auto runner = FLAGS_vulkan ? std::make_unique<vkRunner<ModuleType>>()
                                   : std::make_unique<Runner<ModuleType>>();

#ifndef BUILD_LITE_INTERPRETER
  module.eval();
#endif

  if (FLAGS_print_output) {
    std::cout << runner->run(module, inputs) << std::endl;
  }

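  // Optionally route CPU allocations through a caching allocator so buffers
  // are reused across inference iterations instead of being freed each time.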
  c10::CPUCachingAllocator caching_allocator;
  std::optional<c10::WithCPUCachingAllocatorGuard> caching_allocator_guard;
  if (FLAGS_use_caching_allocator) {
    caching_allocator_guard.emplace(&caching_allocator);
  }
  std::cout << "Starting benchmark." << std::endl;
  std::cout << "Running warmup runs." << std::endl;
  CAFFE_ENFORCE(
      FLAGS_warmup >= 0,
      "Number of warm up runs should be non-negative, provided ",
      FLAGS_warmup,
      ".");
  for (int i = 0; i < FLAGS_warmup; ++i) {
    runner->run(module, inputs);
  }

  std::cout << "Main runs." << std::endl;
  CAFFE_ENFORCE(
      FLAGS_iter >= 0,
      "Number of main runs should be non-negative, provided ",
      FLAGS_iter,
      ".");
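  // Total wall time over all iterations comes from caffe2::Timer; individual
  // per-iteration latencies are collected separately for --report_pep.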
  caffe2::Timer timer;
  std::vector<float> times;
  auto micros = timer.MicroSeconds();
  for (int i = 0; i < FLAGS_iter; ++i) {
    auto start = high_resolution_clock::now();
    runner->run(module, inputs);
    auto stop = high_resolution_clock::now();
    auto duration = duration_cast<microseconds>(stop - start);
    times.push_back(duration.count());
  }
  micros = timer.MicroSeconds();
  if (FLAGS_report_pep) {
    for (auto t : times) {
      std::cout << "PyTorchObserver {\"type\": \"NET\", \"unit\": \"us\", "
                << "\"metric\": \"latency\", \"value\": \"" << t << "\"}"
                << std::endl;
    }
  }
  std::cout << "Main run finished. Microseconds per iter: "
            << micros / FLAGS_iter
            << ". Iters per second: " << 1000.0 * 1000 * FLAGS_iter / micros
            << std::endl;

  return 0;
}