xref: /aosp_15_r20/external/executorch/examples/arm/executor_runner/arm_executor_runner.cpp (revision 523fa7a60841cd1ecfb9cc4201f1ca8b03ed023a)
1 /* Copyright (c) Meta Platforms, Inc. and affiliates.
2  * All rights reserved.
3  * Copyright 2023-2024 Arm Limited and/or its affiliates.
4  *
5  * This source code is licensed under the BSD-style license found in the
6  * LICENSE file in the root directory of this source tree.
7  */
8 
9 #include <errno.h>
10 #include <stdio.h>
11 #include <unistd.h>
12 #include <memory>
13 #include <vector>
14 
15 #include <executorch/extension/data_loader/buffer_data_loader.h>
16 #include <executorch/extension/runner_util/inputs.h>
17 #include <executorch/runtime/core/memory_allocator.h>
18 #include <executorch/runtime/executor/program.h>
19 #include <executorch/runtime/platform/log.h>
20 #include <executorch/runtime/platform/platform.h>
21 #include <executorch/runtime/platform/runtime.h>
22 
23 #include "arm_perf_monitor.h"
24 
25 #ifdef SEMIHOSTING
26 
27 /**
28  * The input_file_allocation_pool should be large enough to fit the various
29  * input file data used when loading the data files when running semihosting
30  * e.g. the input file data and the pte file data
 * In our unit test flow, we have the capability to provide an entire model to
32  * the Corstone-3xx FVP using semi hosting. Hence, the input file allocation
33  * pool needs to be large enough to take an entire model and input. On the FVP,
34  * input_data_sec is linked to the DDR, which is large (256MB on
35  * Corstone-300).
36  * If you use semihosting on your HW this can be lowered to fit your
37  * files/memory
38  */
39 
// 60 MiB pool backing the semihosting file reads (both the .pte file and the
// input tensors). Placed in input_data_sec, which the linker script maps to
// the large DDR region on the FVP.
const size_t input_file_allocation_pool_size = 60 * 1024 * 1024;
unsigned char __attribute__((
    section("input_data_sec"),
    aligned(16))) input_file_allocation_pool[input_file_allocation_pool_size];
// Points at the model data read from the file given with -m; set in main().
char* model_pte = nullptr;
45 
46 #else
47 
48 /**
49  * This header file is generated by the build process based on the .pte file
50  * specified in the ET_PTE_FILE_PATH variable to the cmake build.
 * The behaviour of the .pte, its use of operators and delegates, and
 * which of those are included in the bare metal build are also orchestrated
 * by the CMakeLists file. For example usage see examples/arm/run.sh
54  *
55  * e.g. This includes the pte as a big chunk of data struct into this file
56  */
57 #include "model_pte.h"
58 
59 #endif
60 
61 using executorch::aten::ScalarType;
62 using executorch::aten::Tensor;
63 using executorch::aten::TensorImpl;
64 using executorch::extension::BufferCleanup;
65 using executorch::extension::BufferDataLoader;
66 using executorch::runtime::Error;
67 using executorch::runtime::EValue;
68 using executorch::runtime::HierarchicalAllocator;
69 using executorch::runtime::MemoryAllocator;
70 using executorch::runtime::MemoryManager;
71 using executorch::runtime::Method;
72 using executorch::runtime::MethodMeta;
73 using executorch::runtime::Program;
74 using executorch::runtime::Result;
75 using executorch::runtime::Span;
76 using executorch::runtime::Tag;
77 using executorch::runtime::TensorInfo;
78 
79 /**
80  * The method_allocation_pool should be large enough to fit the setup, input
81  * used and other data used like the planned memory pool (e.g. memory-planned
82  * buffers to use for mutable tensor data) In this example we run on a
83  * Corstone-3xx FVP so we can use a lot of memory to be able to run and test
 * large models. If you run on HW this should be lowered to fit into your
 * available memory.
86  */
// Pool size is overridable from the build (cmake) via this macro.
#ifndef ET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE
#define ET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE (20 * 1024 * 1024)
#endif
const size_t method_allocation_pool_size =
    ET_ARM_BAREMETAL_METHOD_ALLOCATOR_POOL_SIZE;
// Backing storage for the method allocator; see the sizing note above.
unsigned char __attribute__((
    section("input_data_sec"),
    aligned(16))) method_allocation_pool[method_allocation_pool_size];

/**
 * The temp_allocation_pool is used for allocating temporary data during kernel
 * or delegate execution. This will be reset after each kernel or delegate call.
 * Currently a MemoryAllocator is used but a PlatformMemoryAllocator is probably
 * a better fit
 */
#ifndef ET_ARM_BAREMETAL_TEMP_ALLOCATOR_POOL_SIZE
#define ET_ARM_BAREMETAL_TEMP_ALLOCATOR_POOL_SIZE (1 * 1024 * 1024)
#endif
const size_t temp_allocation_pool_size =
    ET_ARM_BAREMETAL_TEMP_ALLOCATOR_POOL_SIZE;
unsigned char __attribute__((
    section("input_data_sec"),
    aligned(16))) temp_allocation_pool[temp_allocation_pool_size];
110 
et_pal_init(void)111 void et_pal_init(void) {}
112 
/**
 * Platform abort hook: terminate after an unrecoverable runtime error.
 * Without semihosting there is no host to exit to, so trap into the
 * debugger/fault handler; with semihosting, exit via the host with a
 * failure code.
 */
ET_NORETURN void et_pal_abort(void) {
#ifndef SEMIHOSTING
  __builtin_trap();
#else
  _exit(-1);
#endif
}
120 
/**
 * Emit a log message via platform output (serial port, console, etc).
 *
 * `level` is printed with %c as the single-character severity prefix.
 * `timestamp`, `function` and `length` are unused on this target
 * (ET_UNUSED suppresses the warnings).
 */
void et_pal_emit_log_message(
    ET_UNUSED et_timestamp_t timestamp,
    et_pal_log_level_t level,
    const char* filename,
    ET_UNUSED const char* function,
    size_t line,
    const char* message,
    ET_UNUSED size_t length) {
  fprintf(
      stderr, "%c [executorch:%s:%zu] %s\n", level, filename, line, message);
}
135 
136 namespace {
137 
138 // Setup our own allocator that can show some extra stuff like used and free
139 // memory info
140 class ArmMemoryAllocator : public executorch::runtime::MemoryAllocator {
141  public:
ArmMemoryAllocator(uint32_t size,uint8_t * base_address)142   ArmMemoryAllocator(uint32_t size, uint8_t* base_address)
143       : MemoryAllocator(size, base_address), used_(0) {}
144 
allocate(size_t size,size_t alignment=kDefaultAlignment)145   void* allocate(size_t size, size_t alignment = kDefaultAlignment) override {
146     void* ret = executorch::runtime::MemoryAllocator::allocate(size, alignment);
147     if (ret != nullptr) {
148       // Align with the same code as in MemoryAllocator::allocate() to keep
149       // used_ "in sync" As alignment is expected to be power of 2 (checked by
150       // MemoryAllocator::allocate()) we can check it the lower bits
151       // (same as alignment - 1) is zero or not.
152       if ((size & (alignment - 1)) == 0) {
153         // Already aligned.
154         used_ += size;
155       } else {
156         used_ = (used_ | (alignment - 1)) + 1 + size;
157       }
158     }
159     return ret;
160   }
161 
162   // Returns the used size of the allocator's memory buffer.
used_size() const163   size_t used_size() const {
164     return used_;
165   }
166 
167   // Returns the free size of the allocator's memory buffer.
free_size() const168   size_t free_size() const {
169     return executorch::runtime::MemoryAllocator::size() - used_;
170   }
171 
172  private:
173   size_t used_;
174 };
175 
prepare_input_tensors(Method & method,MemoryAllocator & allocator,std::vector<std::pair<char *,size_t>> & input_buffers)176 Result<BufferCleanup> prepare_input_tensors(
177     Method& method,
178     MemoryAllocator& allocator,
179     std::vector<std::pair<char*, size_t>>& input_buffers) {
180   MethodMeta method_meta = method.method_meta();
181   size_t num_inputs = method_meta.num_inputs();
182   size_t num_allocated = 0;
183 
184 #ifdef SEMIHOSTING
185   ET_CHECK_OR_RETURN_ERROR(
186       input_buffers.size() > 0 && num_inputs == input_buffers.size(),
187       InvalidArgument,
188       "Wrong number of inputs allocated compared to method");
189 #endif
190 
191   void** inputs =
192       static_cast<void**>(allocator.allocate(num_inputs * sizeof(void*)));
193 
194   ET_CHECK_OR_RETURN_ERROR(
195       inputs != nullptr,
196       MemoryAllocationFailed,
197       "Could not allocate memory for pointers to input buffers.");
198 
199   for (size_t i = 0; i < num_inputs; i++) {
200     auto tag = method_meta.input_tag(i);
201     ET_CHECK_OK_OR_RETURN_ERROR(tag.error());
202 
203     if (tag.get() != Tag::Tensor) {
204       ET_LOG(Debug, "Skipping non-tensor input %zu", i);
205       continue;
206     }
207     Result<TensorInfo> tensor_meta = method_meta.input_tensor_meta(i);
208     ET_CHECK_OK_OR_RETURN_ERROR(tensor_meta.error());
209 
210     // Input is a tensor. Allocate a buffer for it.
211     void* data_ptr = allocator.allocate(tensor_meta->nbytes());
212     ET_CHECK_OR_RETURN_ERROR(
213         data_ptr != nullptr,
214         MemoryAllocationFailed,
215         "Could not allocate memory for input buffers.");
216     inputs[num_allocated++] = data_ptr;
217 
218     Error err = Error::Ok;
219     if (input_buffers.size() > 0) {
220       auto [buffer, buffer_size] = input_buffers.at(i);
221       if (buffer_size != tensor_meta->nbytes()) {
222         ET_LOG(
223             Error,
224             "input size (%d) and tensor size (%d) missmatch!",
225             buffer_size,
226             tensor_meta->nbytes());
227         err = Error::InvalidArgument;
228       } else {
229         ET_LOG(Info, "Copying read input to tensor.");
230         std::memcpy(data_ptr, buffer, buffer_size);
231       }
232     }
233 
234     TensorImpl impl = TensorImpl(
235         tensor_meta.get().scalar_type(),
236         tensor_meta.get().sizes().size(),
237         const_cast<TensorImpl::SizesType*>(tensor_meta.get().sizes().data()),
238         data_ptr,
239         const_cast<TensorImpl::DimOrderType*>(
240             tensor_meta.get().dim_order().data()));
241     Tensor t(&impl);
242 
243     // If input_buffers.size <= 0, we don't have any input, fill t with 1's.
244     if (input_buffers.size() <= 0) {
245       for (size_t j = 0; j < t.numel(); j++) {
246         switch (t.scalar_type()) {
247           case ScalarType::Int:
248             t.mutable_data_ptr<int>()[j] = 1;
249             break;
250           case ScalarType::Float:
251             t.mutable_data_ptr<float>()[j] = 1.;
252             break;
253         }
254       }
255     }
256 
257     err = method.set_input(t, i);
258 
259     if (err != Error::Ok) {
260       ET_LOG(
261           Error, "Failed to prepare input %zu: 0x%" PRIx32, i, (uint32_t)err);
262       // The BufferCleanup will free the inputs when it goes out of scope.
263       BufferCleanup cleanup({inputs, num_allocated});
264       return err;
265     }
266   }
267   return BufferCleanup({inputs, num_allocated});
268 }
269 
270 #ifdef SEMIHOSTING
271 
read_binary_file(const char * filename,MemoryAllocator & allocator)272 std::pair<char*, size_t> read_binary_file(
273     const char* filename,
274     MemoryAllocator& allocator) {
275   FILE* fp = fopen(filename, "rb");
276   if (!fp) {
277     ET_LOG(
278         Fatal,
279         "Could not open file %s (errno: %d) for reading, exiting!",
280         filename,
281         errno);
282     _exit(1);
283   }
284 
285   fseek(fp, 0, SEEK_END);
286   auto file_size = ftell(fp);
287   fseek(fp, 0, SEEK_SET);
288 
289   char* buffer = static_cast<char*>(allocator.allocate(file_size));
290 
291   auto read_size = fread(buffer, 1, file_size, fp);
292   if (read_size != file_size) {
293     ET_LOG(
294         Info,
295         "Failed to read whole file (%), read %zu bytes!",
296         filename,
297         read_size);
298   }
299   fclose(fp);
300   return std::make_pair(buffer, read_size);
301 }
302 #endif
303 
304 } // namespace
305 
main(int argc,const char * argv[])306 int main(int argc, const char* argv[]) {
307 #ifdef SEMIHOSTING
308   ET_LOG(Info, "Running executor with parameter:");
309   if (argc < 7) {
310     ET_LOG(Fatal, "Not right number of parameters!");
311     ET_LOG(
312         Fatal,
313         "app -m model.pte -i input.bin [-i input2.bin] -o output_basename");
314     ET_LOG(Fatal, "Exiting!");
315     _exit(1);
316   }
317   ET_LOG(Info, "   %s", argv[0]);
318   for (int i = 1; i < argc; i++) {
319     ET_LOG(Info, "   %s %s", argv[i], argv[++i]);
320   }
321 #else
322   (void)argc;
323   (void)argv;
324 #endif
325 
326   executorch::runtime::runtime_init();
327   std::vector<std::pair<char*, size_t>> input_buffers;
328   size_t pte_size = sizeof(model_pte);
329 
330 #ifdef SEMIHOSTING
331   const char* output_basename = nullptr;
332   ArmMemoryAllocator input_file_allocator(
333       input_file_allocation_pool_size, input_file_allocation_pool);
334 
335   /* parse input parameters */
336   for (int i = 0; i < argc; i++) {
337     size_t nbr_inputs = 0;
338     if (std::strcmp(argv[i], "-i") == 0) {
339       // input file, read the data into memory
340       const char* input_tensor_filename = argv[++i];
341       ET_LOG(
342           Info,
343           "Reading input tensor %d from file %s",
344           ++nbr_inputs,
345           input_tensor_filename);
346       auto [buffer, buffer_size] =
347           read_binary_file(input_tensor_filename, input_file_allocator);
348       input_buffers.push_back(std::make_pair(buffer, buffer_size));
349     } else if (std::strcmp(argv[i], "-m") == 0) {
350       const char* pte_filename = argv[++i];
351       ET_LOG(Info, "Reading pte model from file %s", pte_filename);
352       auto [buffer, buffer_size] =
353           read_binary_file(pte_filename, input_file_allocator);
354       // Store the model data with the same variable as if it was loaded
355       // from compiled in location.
356       model_pte = buffer;
357       pte_size = buffer_size;
358     } else if (std::strcmp(argv[i], "-o") == 0) {
359       // store the base filename to write output to.
360       output_basename = argv[++i];
361     }
362   }
363 #endif
364   ET_LOG(Info, "Model in %p %c", model_pte, model_pte[0]);
365   auto loader = BufferDataLoader(model_pte, pte_size);
366   ET_LOG(Info, "Model PTE file loaded. Size: %lu bytes.", pte_size);
367   Result<Program> program = Program::load(&loader);
368   if (!program.ok()) {
369     ET_LOG(
370         Info,
371         "Program loading failed @ 0x%p: 0x%" PRIx32,
372         model_pte,
373         program.error());
374   }
375 
376   ET_LOG(Info, "Model buffer loaded, has %lu methods", program->num_methods());
377 
378   const char* method_name = nullptr;
379   {
380     const auto method_name_result = program->get_method_name(0);
381     ET_CHECK_MSG(method_name_result.ok(), "Program has no methods");
382     method_name = *method_name_result;
383   }
384   ET_LOG(Info, "Running method %s", method_name);
385 
386   Result<MethodMeta> method_meta = program->method_meta(method_name);
387   if (!method_meta.ok()) {
388     ET_LOG(
389         Info,
390         "Failed to get method_meta for %s: 0x%x",
391         method_name,
392         (unsigned int)method_meta.error());
393   }
394 
395   ET_LOG(
396       Info,
397       "Setup Method allocator pool. Size: %lu bytes.",
398       method_allocation_pool_size);
399 
400   ArmMemoryAllocator method_allocator(
401       method_allocation_pool_size, method_allocation_pool);
402 
403   std::vector<uint8_t*> planned_buffers; // Owns the memory
404   std::vector<Span<uint8_t>> planned_spans; // Passed to the allocator
405   size_t num_memory_planned_buffers = method_meta->num_memory_planned_buffers();
406 
407   size_t planned_buffer_membase = method_allocator.used_size();
408 
409   for (size_t id = 0; id < num_memory_planned_buffers; ++id) {
410     size_t buffer_size =
411         static_cast<size_t>(method_meta->memory_planned_buffer_size(id).get());
412     ET_LOG(Info, "Setting up planned buffer %zu, size %zu.", id, buffer_size);
413 
414     /* Move to it's own allocator when MemoryPlanner is in place. */
415     uint8_t* buffer =
416         reinterpret_cast<uint8_t*>(method_allocator.allocate(buffer_size));
417     planned_buffers.push_back(buffer);
418     planned_spans.push_back({planned_buffers.back(), buffer_size});
419   }
420 
421   size_t planned_buffer_memsize =
422       method_allocator.used_size() - planned_buffer_membase;
423 
424   HierarchicalAllocator planned_memory(
425       {planned_spans.data(), planned_spans.size()});
426 
427   ArmMemoryAllocator temp_allocator(
428       temp_allocation_pool_size, temp_allocation_pool);
429 
430   MemoryManager memory_manager(
431       &method_allocator, &planned_memory, &temp_allocator);
432 
433   size_t method_loaded_membase = method_allocator.used_size();
434 
435   Result<Method> method = program->load_method(method_name, &memory_manager);
436   if (!method.ok()) {
437     ET_LOG(
438         Info,
439         "Loading of method %s failed with status 0x%" PRIx32,
440         method_name,
441         method.error());
442   }
443   size_t method_loaded_memsize =
444       method_allocator.used_size() - method_loaded_membase;
445   ET_LOG(Info, "Method loaded.");
446 
447   ET_LOG(Info, "Preparing inputs...");
448   size_t input_membase = method_allocator.used_size();
449 
450   auto inputs =
451       ::prepare_input_tensors(*method, method_allocator, input_buffers);
452 
453   if (!inputs.ok()) {
454     ET_LOG(
455         Info,
456         "Preparing inputs tensors for method %s failed with status 0x%" PRIx32,
457         method_name,
458         inputs.error());
459   }
460   size_t input_memsize = method_allocator.used_size() - input_membase;
461   ET_LOG(Info, "Input prepared.");
462 
463   ET_LOG(Info, "Starting the model execution...");
464   size_t executor_membase = method_allocator.used_size();
465   StartMeasurements();
466   Error status = method->execute();
467   StopMeasurements();
468   size_t executor_memsize = method_allocator.used_size() - executor_membase;
469 
470   ET_LOG(Info, "model_pte_loaded_size:     %lu bytes.", pte_size);
471 #ifdef SEMIHOSTING
472   if (input_file_allocator.size() > 0) {
473     ET_LOG(
474         Info,
475         "input_file_allocator_used: %zu / %zu free: %zu ( used: %zu %% ) ",
476         input_file_allocator.used_size(),
477         input_file_allocator.size(),
478         input_file_allocator.free_size(),
479         100 * input_file_allocator.used_size() / input_file_allocator.size());
480   }
481 #endif
482   if (method_allocator.size() != 0) {
483     size_t method_allocator_used = method_allocator.used_size();
484     ET_LOG(
485         Info,
486         "method_allocator_used:     %zu / %zu  free: %zu ( used: %zu %% ) ",
487         method_allocator_used,
488         method_allocator.size(),
489         method_allocator.free_size(),
490         100 * method_allocator_used / method_allocator.size());
491     ET_LOG(
492         Info, "method_allocator_planned:  %zu bytes", planned_buffer_memsize);
493     ET_LOG(Info, "method_allocator_loaded:   %zu bytes", method_loaded_memsize);
494     ET_LOG(Info, "method_allocator_input:    %zu bytes", input_memsize);
495     ET_LOG(Info, "method_allocator_executor: %zu bytes", executor_memsize);
496   }
497   if (temp_allocator.size() > 0) {
498     ET_LOG(
499         Info,
500         "temp_allocator_used:       %zu / %zu free: %zu ( used: %zu %% ) ",
501         temp_allocator.used_size(),
502         temp_allocator.size(),
503         temp_allocator.free_size(),
504         100 * temp_allocator.used_size() / temp_allocator.size());
505   }
506 
507   if (status != Error::Ok) {
508     ET_LOG(
509         Info,
510         "Execution of method %s failed with status 0x%" PRIx32,
511         method_name,
512         status);
513   } else {
514     ET_LOG(Info, "Model executed successfully.");
515   }
516 
517   std::vector<EValue> outputs(method->outputs_size());
518   ET_LOG(Info, "%zu outputs: ", outputs.size());
519   status = method->get_outputs(outputs.data(), outputs.size());
520   ET_CHECK(status == Error::Ok);
521   for (int i = 0; i < outputs.size(); ++i) {
522     Tensor t = outputs[i].toTensor();
523 #ifndef SEMIHOSTING
524     // The output might be collected and parsed so printf() is used instead
525     // of ET_LOG() here
526     for (int j = 0; j < outputs[i].toTensor().numel(); ++j) {
527       if (t.scalar_type() == ScalarType::Int) {
528         printf(
529             "Output[%d][%d]: %d\n",
530             i,
531             j,
532             outputs[i].toTensor().const_data_ptr<int>()[j]);
533       } else {
534         printf(
535             "Output[%d][%d]: %f\n",
536             i,
537             j,
538             outputs[i].toTensor().const_data_ptr<float>()[j]);
539       }
540     }
541 #else
542     char out_filename[255];
543     snprintf(out_filename, 255, "%s-%d.bin", output_basename, i);
544     ET_LOG(Info, "Writing output to file: %s", out_filename);
545     FILE* out_file = fopen(out_filename, "wb");
546     auto written_size = fwrite(
547         outputs[i].toTensor().const_data_ptr<char>(),
548         1,
549         outputs[i].toTensor().nbytes(),
550         out_file);
551     fclose(out_file);
552 #endif
553   }
554 out:
555   ET_LOG(Info, "Program complete, exiting.");
556 #ifdef SEMIHOSTING
557   _exit(0);
558 #endif
559   ET_LOG(Info, "\04");
560   return 0;
561 }
562