1 /* Copyright (c) Meta Platforms, Inc. and affiliates.
2 * All rights reserved.
3 * Copyright 2023-2024 Arm Limited and/or its affiliates.
4 *
5 * This source code is licensed under the BSD-style license found in the
6 * LICENSE file in the root directory of this source tree.
7 */
8
#include <errno.h>
#include <stdio.h>
#include <unistd.h>

#include <cinttypes>
#include <cstring>
#include <memory>
#include <vector>

#include <executorch/extension/data_loader/buffer_data_loader.h>
#include <executorch/extension/runner_util/inputs.h>
#include <executorch/runtime/core/memory_allocator.h>
#include <executorch/runtime/executor/program.h>
#include <executorch/runtime/platform/log.h>
#include <executorch/runtime/platform/platform.h>
#include <executorch/runtime/platform/runtime.h>

#include "arm_perf_monitor.h"
24
25 #ifdef SEMIHOSTING
26
27 /**
28 * The input_file_allocation_pool should be large enough to fit the various
29 * input file data used when loading the data files when running semihosting
30 * e.g. the input file data and the pte file data
 * In our unit test flow, we have the capability to provide an entire model to
 * the Corstone-3xx FVP using semihosting. Hence, the input file allocation
 * pool needs to be large enough to hold an entire model and its inputs. On the FVP,
34 * input_data_sec is linked to the DDR, which is large (256MB on
35 * Corstone-300).
36 * If you use semihosting on your HW this can be lowered to fit your
37 * files/memory
38 */
39
// 60 MiB pool handed to input_file_allocator in main(); holds the .pte file
// and all input files read over semihosting. Placed in input_data_sec (DDR
// on the FVP) so it does not consume on-chip memory.
const size_t input_file_allocation_pool_size = 60 * 1024 * 1024;
unsigned char __attribute__((
    section("input_data_sec"),
    aligned(16))) input_file_allocation_pool[input_file_allocation_pool_size];
// Points at the model data once it has been read from file (set in main()
// when the -m argument is parsed).
char* model_pte = nullptr;
45
46 #else
47
48 /**
49 * This header file is generated by the build process based on the .pte file
50 * specified in the ET_PTE_FILE_PATH variable to the cmake build.
 * The behavior of the .pte, its use of operators and delegates, and which of
 * these are included in the bare metal build are also orchestrated by the
 * CMakeLists file. For example usage, see examples/arm/run.sh.
54 *
55 * e.g. This includes the pte as a big chunk of data struct into this file
56 */
57 #include "model_pte.h"
58
59 #endif
60
61 using executorch::aten::ScalarType;
62 using executorch::aten::Tensor;
63 using executorch::aten::TensorImpl;
64 using executorch::extension::BufferCleanup;
65 using executorch::extension::BufferDataLoader;
66 using executorch::runtime::Error;
67 using executorch::runtime::EValue;
68 using executorch::runtime::HierarchicalAllocator;
69 using executorch::runtime::MemoryAllocator;
70 using executorch::runtime::MemoryManager;
71 using executorch::runtime::Method;
72 using executorch::runtime::MethodMeta;
73 using executorch::runtime::Program;
74 using executorch::runtime::Result;
75 using executorch::runtime::Span;
76 using executorch::runtime::Tag;
77 using executorch::runtime::TensorInfo;
78
79 /**
80 * The method_allocation_pool should be large enough to fit the setup, input
81 * used and other data used like the planned memory pool (e.g. memory-planned
 * buffers to use for mutable tensor data). In this example we run on a
 * Corstone-3xx FVP, so we can use a lot of memory in order to run and test
 * large models. If you run on real HW, this should be lowered to fit into
 * your available memory.
86 */
#ifndef ET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE
#define ET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE (20 * 1024 * 1024)
#endif
// Pool backing the method allocator: method setup data, the memory-planned
// buffers and the input tensors are all carved out of this. The size is
// overridable at build time via ET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE.
const size_t method_allocation_pool_size =
    ET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE;
unsigned char __attribute__((
    section("input_data_sec"),
    aligned(16))) method_allocation_pool[method_allocation_pool_size];

/**
 * The temp_allocation_pool is used for allocating temporary data during kernel
 * or delegate execution. This will be reset after each kernel or delegate call.
 * Currently a MemoryAllocator is used but a PlatformMemoryAllocator is probably
 * a better fit.
 */
#ifndef ET_ARM_BAREMETAL_TEMP_ALLOCATOR_POOL_SIZE
#define ET_ARM_BAREMETAL_TEMP_ALLOCATOR_POOL_SIZE (1 * 1024 * 1024)
#endif
const size_t temp_allocation_pool_size =
    ET_ARM_BAREMETAL_TEMP_ALLOCATOR_POOL_SIZE;
unsigned char __attribute__((
    section("input_data_sec"),
    aligned(16))) temp_allocation_pool[temp_allocation_pool_size];
110
// Platform initialization hook; nothing to do on this target.
void et_pal_init(void) {}
112
et_pal_abort(void)113 ET_NORETURN void et_pal_abort(void) {
114 #ifndef SEMIHOSTING
115 __builtin_trap();
116 #else
117 _exit(-1);
118 #endif
119 }
120
121 /**
122 * Emit a log message via platform output (serial port, console, etc).
123 */
void et_pal_emit_log_message(
    ET_UNUSED et_timestamp_t timestamp,
    et_pal_log_level_t level,
    const char* filename,
    ET_UNUSED const char* function,
    size_t line,
    const char* message,
    ET_UNUSED size_t length) {
  // `level` is printed as a single character code via %c; `line` is a size_t,
  // hence %zu. Output goes to stderr so it reaches the serial/console sink.
  fprintf(
      stderr, "%c [executorch:%s:%zu] %s\n", level, filename, line, message);
}
135
136 namespace {
137
138 // Setup our own allocator that can show some extra stuff like used and free
139 // memory info
140 class ArmMemoryAllocator : public executorch::runtime::MemoryAllocator {
141 public:
ArmMemoryAllocator(uint32_t size,uint8_t * base_address)142 ArmMemoryAllocator(uint32_t size, uint8_t* base_address)
143 : MemoryAllocator(size, base_address), used_(0) {}
144
allocate(size_t size,size_t alignment=kDefaultAlignment)145 void* allocate(size_t size, size_t alignment = kDefaultAlignment) override {
146 void* ret = executorch::runtime::MemoryAllocator::allocate(size, alignment);
147 if (ret != nullptr) {
148 // Align with the same code as in MemoryAllocator::allocate() to keep
149 // used_ "in sync" As alignment is expected to be power of 2 (checked by
150 // MemoryAllocator::allocate()) we can check it the lower bits
151 // (same as alignment - 1) is zero or not.
152 if ((size & (alignment - 1)) == 0) {
153 // Already aligned.
154 used_ += size;
155 } else {
156 used_ = (used_ | (alignment - 1)) + 1 + size;
157 }
158 }
159 return ret;
160 }
161
162 // Returns the used size of the allocator's memory buffer.
used_size() const163 size_t used_size() const {
164 return used_;
165 }
166
167 // Returns the free size of the allocator's memory buffer.
free_size() const168 size_t free_size() const {
169 return executorch::runtime::MemoryAllocator::size() - used_;
170 }
171
172 private:
173 size_t used_;
174 };
175
prepare_input_tensors(Method & method,MemoryAllocator & allocator,std::vector<std::pair<char *,size_t>> & input_buffers)176 Result<BufferCleanup> prepare_input_tensors(
177 Method& method,
178 MemoryAllocator& allocator,
179 std::vector<std::pair<char*, size_t>>& input_buffers) {
180 MethodMeta method_meta = method.method_meta();
181 size_t num_inputs = method_meta.num_inputs();
182 size_t num_allocated = 0;
183
184 #ifdef SEMIHOSTING
185 ET_CHECK_OR_RETURN_ERROR(
186 input_buffers.size() > 0 && num_inputs == input_buffers.size(),
187 InvalidArgument,
188 "Wrong number of inputs allocated compared to method");
189 #endif
190
191 void** inputs =
192 static_cast<void**>(allocator.allocate(num_inputs * sizeof(void*)));
193
194 ET_CHECK_OR_RETURN_ERROR(
195 inputs != nullptr,
196 MemoryAllocationFailed,
197 "Could not allocate memory for pointers to input buffers.");
198
199 for (size_t i = 0; i < num_inputs; i++) {
200 auto tag = method_meta.input_tag(i);
201 ET_CHECK_OK_OR_RETURN_ERROR(tag.error());
202
203 if (tag.get() != Tag::Tensor) {
204 ET_LOG(Debug, "Skipping non-tensor input %zu", i);
205 continue;
206 }
207 Result<TensorInfo> tensor_meta = method_meta.input_tensor_meta(i);
208 ET_CHECK_OK_OR_RETURN_ERROR(tensor_meta.error());
209
210 // Input is a tensor. Allocate a buffer for it.
211 void* data_ptr = allocator.allocate(tensor_meta->nbytes());
212 ET_CHECK_OR_RETURN_ERROR(
213 data_ptr != nullptr,
214 MemoryAllocationFailed,
215 "Could not allocate memory for input buffers.");
216 inputs[num_allocated++] = data_ptr;
217
218 Error err = Error::Ok;
219 if (input_buffers.size() > 0) {
220 auto [buffer, buffer_size] = input_buffers.at(i);
221 if (buffer_size != tensor_meta->nbytes()) {
222 ET_LOG(
223 Error,
224 "input size (%d) and tensor size (%d) missmatch!",
225 buffer_size,
226 tensor_meta->nbytes());
227 err = Error::InvalidArgument;
228 } else {
229 ET_LOG(Info, "Copying read input to tensor.");
230 std::memcpy(data_ptr, buffer, buffer_size);
231 }
232 }
233
234 TensorImpl impl = TensorImpl(
235 tensor_meta.get().scalar_type(),
236 tensor_meta.get().sizes().size(),
237 const_cast<TensorImpl::SizesType*>(tensor_meta.get().sizes().data()),
238 data_ptr,
239 const_cast<TensorImpl::DimOrderType*>(
240 tensor_meta.get().dim_order().data()));
241 Tensor t(&impl);
242
243 // If input_buffers.size <= 0, we don't have any input, fill t with 1's.
244 if (input_buffers.size() <= 0) {
245 for (size_t j = 0; j < t.numel(); j++) {
246 switch (t.scalar_type()) {
247 case ScalarType::Int:
248 t.mutable_data_ptr<int>()[j] = 1;
249 break;
250 case ScalarType::Float:
251 t.mutable_data_ptr<float>()[j] = 1.;
252 break;
253 }
254 }
255 }
256
257 err = method.set_input(t, i);
258
259 if (err != Error::Ok) {
260 ET_LOG(
261 Error, "Failed to prepare input %zu: 0x%" PRIx32, i, (uint32_t)err);
262 // The BufferCleanup will free the inputs when it goes out of scope.
263 BufferCleanup cleanup({inputs, num_allocated});
264 return err;
265 }
266 }
267 return BufferCleanup({inputs, num_allocated});
268 }
269
270 #ifdef SEMIHOSTING
271
read_binary_file(const char * filename,MemoryAllocator & allocator)272 std::pair<char*, size_t> read_binary_file(
273 const char* filename,
274 MemoryAllocator& allocator) {
275 FILE* fp = fopen(filename, "rb");
276 if (!fp) {
277 ET_LOG(
278 Fatal,
279 "Could not open file %s (errno: %d) for reading, exiting!",
280 filename,
281 errno);
282 _exit(1);
283 }
284
285 fseek(fp, 0, SEEK_END);
286 auto file_size = ftell(fp);
287 fseek(fp, 0, SEEK_SET);
288
289 char* buffer = static_cast<char*>(allocator.allocate(file_size));
290
291 auto read_size = fread(buffer, 1, file_size, fp);
292 if (read_size != file_size) {
293 ET_LOG(
294 Info,
295 "Failed to read whole file (%), read %zu bytes!",
296 filename,
297 read_size);
298 }
299 fclose(fp);
300 return std::make_pair(buffer, read_size);
301 }
302 #endif
303
304 } // namespace
305
// Entry point: loads the .pte (from semihosting files or the compiled-in
// model_pte.h blob), sets up the allocator pools, runs the first method and
// reports outputs plus per-stage memory usage statistics.
int main(int argc, const char* argv[]) {
#ifdef SEMIHOSTING
  ET_LOG(Info, "Running executor with parameter:");
  if (argc < 7) {
    ET_LOG(Fatal, "Not right number of parameters!");
    ET_LOG(
        Fatal,
        "app -m model.pte -i input.bin [-i input2.bin] -o output_basename");
    ET_LOG(Fatal, "Exiting!");
    _exit(1);
  }
  ET_LOG(Info, " %s", argv[0]);
  // NOTE(review): arguments are logged pairwise; if argc is even the final
  // argv[++i] read is one past the last argument — confirm the harness
  // always passes flag/value pairs.
  for (int i = 1; i < argc; i++) {
    ET_LOG(Info, " %s %s", argv[i], argv[++i]);
  }
#else
  (void)argc;
  (void)argv;
#endif

  executorch::runtime::runtime_init();
  std::vector<std::pair<char*, size_t>> input_buffers;
  // Non-semihosting: size of the compiled-in pte array from model_pte.h.
  // Under SEMIHOSTING model_pte is a char*, so this is just the pointer size
  // until it is overwritten when the -m argument is parsed below.
  size_t pte_size = sizeof(model_pte);

#ifdef SEMIHOSTING
  const char* output_basename = nullptr;
  ArmMemoryAllocator input_file_allocator(
      input_file_allocation_pool_size, input_file_allocation_pool);

  /* parse input parameters */
  for (int i = 0; i < argc; i++) {
    // NOTE(review): nbr_inputs is re-initialized on every iteration, so the
    // "Reading input tensor" log below always prints 1; also a size_t is
    // printed with %d. Move the counter outside the loop to fix.
    size_t nbr_inputs = 0;
    if (std::strcmp(argv[i], "-i") == 0) {
      // input file, read the data into memory
      const char* input_tensor_filename = argv[++i];
      ET_LOG(
          Info,
          "Reading input tensor %d from file %s",
          ++nbr_inputs,
          input_tensor_filename);
      auto [buffer, buffer_size] =
          read_binary_file(input_tensor_filename, input_file_allocator);
      input_buffers.push_back(std::make_pair(buffer, buffer_size));
    } else if (std::strcmp(argv[i], "-m") == 0) {
      const char* pte_filename = argv[++i];
      ET_LOG(Info, "Reading pte model from file %s", pte_filename);
      auto [buffer, buffer_size] =
          read_binary_file(pte_filename, input_file_allocator);
      // Store the model data with the same variable as if it was loaded
      // from compiled in location.
      model_pte = buffer;
      pte_size = buffer_size;
    } else if (std::strcmp(argv[i], "-o") == 0) {
      // store the base filename to write output to.
      // NOTE(review): if -o is absent, output_basename stays nullptr and is
      // later passed to snprintf when writing outputs — confirm the argc
      // check above guarantees it is always provided.
      output_basename = argv[++i];
    }
  }
#endif
  ET_LOG(Info, "Model in %p %c", model_pte, model_pte[0]);
  auto loader = BufferDataLoader(model_pte, pte_size);
  ET_LOG(Info, "Model PTE file loaded. Size: %lu bytes.", pte_size);
  Result<Program> program = Program::load(&loader);
  if (!program.ok()) {
    // NOTE(review): execution continues after a failed load; the program->
    // accesses below assume success. Consider aborting here.
    ET_LOG(
        Info,
        "Program loading failed @ 0x%p: 0x%" PRIx32,
        model_pte,
        program.error());
  }

  ET_LOG(Info, "Model buffer loaded, has %lu methods", program->num_methods());

  // Run the first (and typically only) method in the program.
  const char* method_name = nullptr;
  {
    const auto method_name_result = program->get_method_name(0);
    ET_CHECK_MSG(method_name_result.ok(), "Program has no methods");
    method_name = *method_name_result;
  }
  ET_LOG(Info, "Running method %s", method_name);

  Result<MethodMeta> method_meta = program->method_meta(method_name);
  if (!method_meta.ok()) {
    ET_LOG(
        Info,
        "Failed to get method_meta for %s: 0x%x",
        method_name,
        (unsigned int)method_meta.error());
  }

  ET_LOG(
      Info,
      "Setup Method allocator pool. Size: %lu bytes.",
      method_allocation_pool_size);

  ArmMemoryAllocator method_allocator(
      method_allocation_pool_size, method_allocation_pool);

  std::vector<uint8_t*> planned_buffers; // Owns the memory
  std::vector<Span<uint8_t>> planned_spans; // Passed to the allocator
  size_t num_memory_planned_buffers = method_meta->num_memory_planned_buffers();

  // Snapshot allocator usage before each stage so the per-stage deltas can
  // be reported at the end of the run.
  size_t planned_buffer_membase = method_allocator.used_size();

  for (size_t id = 0; id < num_memory_planned_buffers; ++id) {
    size_t buffer_size =
        static_cast<size_t>(method_meta->memory_planned_buffer_size(id).get());
    ET_LOG(Info, "Setting up planned buffer %zu, size %zu.", id, buffer_size);

    /* Move to it's own allocator when MemoryPlanner is in place. */
    uint8_t* buffer =
        reinterpret_cast<uint8_t*>(method_allocator.allocate(buffer_size));
    planned_buffers.push_back(buffer);
    planned_spans.push_back({planned_buffers.back(), buffer_size});
  }

  size_t planned_buffer_memsize =
      method_allocator.used_size() - planned_buffer_membase;

  HierarchicalAllocator planned_memory(
      {planned_spans.data(), planned_spans.size()});

  ArmMemoryAllocator temp_allocator(
      temp_allocation_pool_size, temp_allocation_pool);

  MemoryManager memory_manager(
      &method_allocator, &planned_memory, &temp_allocator);

  size_t method_loaded_membase = method_allocator.used_size();

  Result<Method> method = program->load_method(method_name, &memory_manager);
  if (!method.ok()) {
    // NOTE(review): as with Program::load above, the failure is only logged
    // and method-> is still dereferenced below.
    ET_LOG(
        Info,
        "Loading of method %s failed with status 0x%" PRIx32,
        method_name,
        method.error());
  }
  size_t method_loaded_memsize =
      method_allocator.used_size() - method_loaded_membase;
  ET_LOG(Info, "Method loaded.");

  ET_LOG(Info, "Preparing inputs...");
  size_t input_membase = method_allocator.used_size();

  auto inputs =
      ::prepare_input_tensors(*method, method_allocator, input_buffers);

  if (!inputs.ok()) {
    ET_LOG(
        Info,
        "Preparing inputs tensors for method %s failed with status 0x%" PRIx32,
        method_name,
        inputs.error());
  }
  size_t input_memsize = method_allocator.used_size() - input_membase;
  ET_LOG(Info, "Input prepared.");

  ET_LOG(Info, "Starting the model execution...");
  size_t executor_membase = method_allocator.used_size();
  // Start/StopMeasurements bracket the run for the performance counters
  // (see arm_perf_monitor.h).
  StartMeasurements();
  Error status = method->execute();
  StopMeasurements();
  size_t executor_memsize = method_allocator.used_size() - executor_membase;

  // Memory usage report: totals per pool, then per-stage deltas.
  ET_LOG(Info, "model_pte_loaded_size: %lu bytes.", pte_size);
#ifdef SEMIHOSTING
  if (input_file_allocator.size() > 0) {
    ET_LOG(
        Info,
        "input_file_allocator_used: %zu / %zu free: %zu ( used: %zu %% ) ",
        input_file_allocator.used_size(),
        input_file_allocator.size(),
        input_file_allocator.free_size(),
        100 * input_file_allocator.used_size() / input_file_allocator.size());
  }
#endif
  if (method_allocator.size() != 0) {
    size_t method_allocator_used = method_allocator.used_size();
    ET_LOG(
        Info,
        "method_allocator_used: %zu / %zu free: %zu ( used: %zu %% ) ",
        method_allocator_used,
        method_allocator.size(),
        method_allocator.free_size(),
        100 * method_allocator_used / method_allocator.size());
    ET_LOG(
        Info, "method_allocator_planned: %zu bytes", planned_buffer_memsize);
    ET_LOG(Info, "method_allocator_loaded: %zu bytes", method_loaded_memsize);
    ET_LOG(Info, "method_allocator_input: %zu bytes", input_memsize);
    ET_LOG(Info, "method_allocator_executor: %zu bytes", executor_memsize);
  }
  if (temp_allocator.size() > 0) {
    ET_LOG(
        Info,
        "temp_allocator_used: %zu / %zu free: %zu ( used: %zu %% ) ",
        temp_allocator.used_size(),
        temp_allocator.size(),
        temp_allocator.free_size(),
        100 * temp_allocator.used_size() / temp_allocator.size());
  }

  if (status != Error::Ok) {
    ET_LOG(
        Info,
        "Execution of method %s failed with status 0x%" PRIx32,
        method_name,
        status);
  } else {
    ET_LOG(Info, "Model executed successfully.");
  }

  // Fetch and report the outputs: printed to stdout on bare metal, written
  // as raw .bin files on the host under semihosting.
  std::vector<EValue> outputs(method->outputs_size());
  ET_LOG(Info, "%zu outputs: ", outputs.size());
  status = method->get_outputs(outputs.data(), outputs.size());
  ET_CHECK(status == Error::Ok);
  for (int i = 0; i < outputs.size(); ++i) {
    Tensor t = outputs[i].toTensor();
#ifndef SEMIHOSTING
    // The output might be collected and parsed so printf() is used instead
    // of ET_LOG() here
    for (int j = 0; j < outputs[i].toTensor().numel(); ++j) {
      if (t.scalar_type() == ScalarType::Int) {
        printf(
            "Output[%d][%d]: %d\n",
            i,
            j,
            outputs[i].toTensor().const_data_ptr<int>()[j]);
      } else {
        printf(
            "Output[%d][%d]: %f\n",
            i,
            j,
            outputs[i].toTensor().const_data_ptr<float>()[j]);
      }
    }
#else
    // Write each output tensor as raw bytes to <output_basename>-<i>.bin.
    char out_filename[255];
    snprintf(out_filename, 255, "%s-%d.bin", output_basename, i);
    ET_LOG(Info, "Writing output to file: %s", out_filename);
    FILE* out_file = fopen(out_filename, "wb");
    // NOTE(review): written_size is never checked, so a partial/failed write
    // goes unnoticed.
    auto written_size = fwrite(
        outputs[i].toTensor().const_data_ptr<char>(),
        1,
        outputs[i].toTensor().nbytes(),
        out_file);
    fclose(out_file);
#endif
  }
// NOTE(review): this label is never jumped to; it looks like a leftover from
// an earlier goto-based error path.
out:
  ET_LOG(Info, "Program complete, exiting.");
#ifdef SEMIHOSTING
  _exit(0);
#endif
  // NOTE(review): "\04" is presumably an end-of-transmission marker for the
  // log-scraping harness — confirm before changing.
  ET_LOG(Info, "\04");
  return 0;
}
562