/*
 * Copyright (c) 2024 MediaTek Inc.
 *
 * Licensed under the BSD License (the "License"); you may not use this file
 * except in compliance with the License. See the license file in the root
 * directory of this source tree for more details.
 */

#include "ModelChunk.h"

#include <chrono>
#include <cinttypes>
#include <cstring>
#include <sstream>

#include "executorch/backends/mediatek/runtime/include/NeuronBufferAllocator.h"

#include <executorch/extension/data_loader/file_data_loader.h>
#include <executorch/extension/evalue_util/print_evalue.h>
#include <executorch/runtime/executor/method.h>
#include <executorch/runtime/executor/program.h>
#include <executorch/runtime/platform/log.h>
#include <executorch/runtime/platform/profiler.h>
#include <executorch/runtime/platform/runtime.h>

#define ENSURE_INIT \
  ET_CHECK_MSG(Initialized(), "Error: Model chunk not initialized.");

namespace example {

using executorch::aten::Tensor;
using executorch::aten::TensorImpl;
using executorch::extension::FileDataLoader;
using executorch::runtime::Error;
using executorch::runtime::HierarchicalAllocator;
using executorch::runtime::MemoryAllocator;
using executorch::runtime::MemoryManager;
using executorch::runtime::Method;
using executorch::runtime::MethodMeta;
using executorch::runtime::Program;
using executorch::runtime::Result;
using executorch::runtime::Span;
using executorch::runtime::Tag;

static constexpr size_t kMethodAllocatorPoolSize = 4 * 1024U * 1024U; // 4MB

// ExecuTorch model instance
// The member ordering affects the order of destruction.
struct ModelInstance {
  std::unique_ptr<Program> program;

  std::vector<std::unique_ptr<uint8_t[]>> planned_buffers;
  std::vector<Span<uint8_t>> planned_spans;

  std::vector<uint8_t> method_allocator_pool;
  std::unique_ptr<MemoryAllocator> method_allocator;
  std::unique_ptr<HierarchicalAllocator> planned_memory;
  std::unique_ptr<MemoryManager> memory_manager;

  std::unique_ptr<Method> method;
};

void ModelChunk::Initialize() {
  LoadModels();
  GetModelIoInfo();
  AllocateIoBuffers();
  SetBackendInputs();
  SetBackendOutputs();
  mIsInitialized = true;
}

bool ModelChunk::Initialized() {
  return mIsInitialized;
}

void ModelChunk::Release() {
  ENSURE_INIT
  ReleaseModels();
  ReleaseIoBuffers();
}

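// Runs a single inference pass of the loaded method on the currently bound
// IO buffers and logs the wall-clock execution time.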
void ModelChunk::Run() {
  ENSURE_INIT
  auto beforeExec = std::chrono::high_resolution_clock::now();
  Error status = Error::Ok;
  status = GetModelMethod().execute();
  auto afterExec = std::chrono::high_resolution_clock::now();
  const double elapsedTime =
      std::chrono::duration_cast<std::chrono::microseconds>(
          afterExec - beforeExec)
          .count();
  ET_LOG(Debug, "Inference took %f ms", elapsedTime / 1000.0);
  ET_CHECK_MSG(
      status == Error::Ok,
      "Execution of method failed with status 0x%" PRIx32,
      static_cast<uint32_t>(status));
  ET_LOG(Debug, "Model executed successfully.");
}

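// Switches this chunk to the pre-loaded model compiled for `tokenBatchSize`
// input tokens, then refreshes the IO metadata and rebinds the existing
// buffers to the newly selected method. Returns false if no such model exists
// or the switch did not take effect.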
bool ModelChunk::HotSwapModel(const size_t tokenBatchSize) {
  ENSURE_INIT
  // Save old values
  const auto oldInstanceBatchSize = GetModelId();
  const auto oldTokenBatchSize = mTokenBatchSize;

  if (!HasModel(tokenBatchSize)) {
    ET_LOG(
        Error,
        "Model swap: No model with batchSize=%zu is available",
        tokenBatchSize);
    return false;
  }

  if (oldInstanceBatchSize == tokenBatchSize) {
    ET_LOG(Info, "Model swapping to itself");
    return true;
  }

  SelectModel(tokenBatchSize);

  const auto newInstanceBatchSize = GetModelId();
  if (oldInstanceBatchSize == newInstanceBatchSize) {
    ET_LOG(
        Error,
        "Failed to switch to model with batchSize=%zu. Model currently remains at batchSize=%zu",
        tokenBatchSize,
        oldTokenBatchSize);
    return false;
  }

  // Update model variables
  // Mask length = cache size (length) + num input tokens (token batch size)
  mTokenBatchSize = tokenBatchSize;

  UpdateModelIoInfo();
  SetBackendInputs();
  SetBackendOutputs();
  return true;
}

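// Copies `size` bytes from `data` into the buffer already registered for
// input `index`, after checking that the allocated capacity is sufficient.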
void ModelChunk::SetInputBuffer(
    const void* data,
    const size_t size,
    const size_t index) {
  ENSURE_INIT
  auto& targetBufInfo = mInputBufferInfos[index];
  ET_CHECK_MSG(
      targetBufInfo.nbytes >= size,
      "Error: Input[%zu] has only %zu bytes allocated but needs to be set with size %zu",
      index,
      targetBufInfo.nbytes,
      size);
  std::memcpy(targetBufInfo.data, data, size);
}

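// Registers `bufferInfo` as input `index`. If a buffer is already in place
// for that index, its contents are overwritten by a copy; otherwise the
// provided buffer is shared and used directly by the backend.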
void ModelChunk::SetInputBuffer(
    const BufferInfo& bufferInfo,
    const size_t index) {
  // Allow calling this method before initialization to assign preallocated
  // buffers.
  if (index >= mInputBufferInfos.size()) {
    mInputBufferInfos.resize(index + 1);
  }
  // If the existing buffer has been allocated, memory copy the content.
  // Otherwise, share the input buffer info.
  auto& targetBufInfo = mInputBufferInfos[index];
  if (targetBufInfo.data != nullptr) {
    // Already allocated, do memcpy.
    SetInputBuffer(bufferInfo.data, bufferInfo.nbytesUsed, index);
  } else {
    // Share the buffer info.
    targetBufInfo = bufferInfo;
  }
}

BufferInfo ModelChunk::GetInputBuffer(const size_t index) {
  ENSURE_INIT
  ET_CHECK_MSG(
      index < mInputBufferInfos.size(),
      "Error: Index out of range: %zu",
      index);
  return mInputBufferInfos[index];
}

BufferInfo ModelChunk::GetOutputBuffer(const size_t index) {
  ENSURE_INIT
  ET_CHECK_MSG(
      index < mOutputBufferInfos.size(),
      "Error: Index out of range: %zu",
      index);
  return mOutputBufferInfos[index];
}

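// Logs the shape, size in bytes, and scalar type of every tensor input and
// output of the loaded method.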
void ModelChunk::LogIoSummary() {
  ENSURE_INIT
  const auto& method = GetModelMethod();
  const auto method_meta = method.method_meta();

  auto getShapeStr = [](const auto shape) {
    std::ostringstream ss;
    ss << "(";
    for (size_t i = 0; i < shape.size(); i++) {
      ss << shape[i];
      if (i < shape.size() - 1)
        ss << ", ";
    }
    ss << ")";
    return ss.str();
  };

  ET_LOG(Info, "Model Chunk IO Summary:");

  const size_t input_size = method.inputs_size();
  const size_t output_size = method.outputs_size();

  for (size_t i = 0; i < input_size; i++) {
    if (*method_meta.input_tag(i) != Tag::Tensor) {
      ET_LOG(Info, "  Input %zu: Non-Tensor", i);
      continue;
    }
    const auto nbytes = method_meta.input_tensor_meta(i)->nbytes();
    const auto shape = getShapeStr(method_meta.input_tensor_meta(i)->sizes());
    const auto type =
        static_cast<int>(method_meta.input_tensor_meta(i)->scalar_type());
    ET_LOG(
        Info,
        "  Input %zu: Shape: %s, Size: %zu bytes, Type: %d",
        i,
        shape.c_str(),
        nbytes,
        type);
  }

  for (size_t i = 0; i < output_size; i++) {
    if (*method_meta.output_tag(i) != Tag::Tensor) {
      ET_LOG(Info, "  Output %zu: Non-Tensor", i);
      continue;
    }
    const auto nbytes = method_meta.output_tensor_meta(i)->nbytes();
    const auto shape = getShapeStr(method_meta.output_tensor_meta(i)->sizes());
    const auto type =
        static_cast<int>(method_meta.output_tensor_meta(i)->scalar_type());
    ET_LOG(
        Info,
        "  Output %zu: Shape: %s, Size: %zu bytes, Type: %d",
        i,
        shape.c_str(),
        nbytes,
        type);
  }
}

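// Queries the method metadata to record the byte size required by each model
// input and output. Buffers that are already preallocated keep their
// allocation; only the size used by the model is updated.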
void ModelChunk::GetModelIoInfo() {
  const auto& method = GetModelMethod();
  const auto method_meta = method.method_meta();

  const size_t input_size = method.inputs_size();
  const size_t output_size = method.outputs_size();

  mInputBufferInfos.resize(input_size);
  for (size_t i = 0; i < input_size; i++) {
    if (*method_meta.input_tag(i) != Tag::Tensor) {
      ET_LOG(Info, "Input %zu is not a tensor, skipping", i);
      continue;
    }
    auto& bufInfo = mInputBufferInfos[i];
    const auto nbytes = method_meta.input_tensor_meta(i)->nbytes();
    if (bufInfo.data != nullptr) {
      // Already preallocated, so just update the size used by the model.
      ET_CHECK_MSG(
          bufInfo.nbytes >= nbytes,
          "Error: Model input[%zu] requires size=%zu but only preallocated size=%zu",
          i,
          nbytes,
          bufInfo.nbytes);
      bufInfo.nbytesUsed = nbytes;
      continue;
    }
    bufInfo.nbytes = nbytes;
    bufInfo.nbytesUsed = nbytes;
  }

  mOutputBufferInfos.resize(output_size);
  for (size_t i = 0; i < output_size; i++) {
    if (*method_meta.output_tag(i) != Tag::Tensor) {
      ET_LOG(Info, "Output %zu is not a tensor, skipping", i);
      continue;
    }
    auto& bufInfo = mOutputBufferInfos[i];
    const auto nbytes = method_meta.output_tensor_meta(i)->nbytes();
    if (bufInfo.data != nullptr) {
      // Already preallocated, so just update the size used by the model.
      ET_CHECK_MSG(
          bufInfo.nbytes >= nbytes,
          "Error: Model output[%zu] requires size of %zu but only preallocated size of %zu",
          i,
          nbytes,
          bufInfo.nbytes);
      bufInfo.nbytesUsed = nbytes;
      continue;
    }
    bufInfo.nbytes = nbytes;
    bufInfo.nbytesUsed = nbytes;
  }
}

// Update the IO sizes actually used by the model (e.g. after a model swap),
// keeping the existing allocations.
void ModelChunk::UpdateModelIoInfo() {
  const auto& method = GetModelMethod();
  const auto method_meta = method.method_meta();

  const size_t numModelInputs = method.inputs_size();
  const size_t numModelOutputs = method.outputs_size();

  const size_t numInputBuffers = mInputBufferInfos.size();
  const size_t numOutputBuffers = mOutputBufferInfos.size();

  if (numInputBuffers != numModelInputs) {
    ET_LOG(
        Info,
        "Existing num inputs (%zu) != new num inputs (%zu)",
        numInputBuffers,
        numModelInputs);
  }
  if (numOutputBuffers != numModelOutputs) {
    ET_LOG(
        Info,
        "Existing num outputs (%zu) != new num outputs (%zu)",
        numOutputBuffers,
        numModelOutputs);
  }
  mInputBufferInfos.resize(numModelInputs);
  for (size_t inputIdx = 0; inputIdx < numModelInputs; inputIdx++) {
    auto& sizeAllocated = mInputBufferInfos[inputIdx].nbytes;
    auto& sizeRequired = mInputBufferInfos[inputIdx].nbytesUsed;
    const auto before = sizeRequired;

    // Update
    sizeRequired = method_meta.input_tensor_meta(inputIdx)->nbytes();
    if (sizeAllocated < sizeRequired) {
      ET_LOG(
          Error,
          "Insufficient buffer size for input[%zu]. Requires %zu but only allocated %zu",
          inputIdx,
          sizeRequired,
          sizeAllocated);
    }
    if (before != sizeRequired) {
      ET_LOG(
          Debug,
          "Update input[%zu] size: %zu -> %zu",
          inputIdx,
          before,
          sizeRequired);
    }
  }
  mOutputBufferInfos.resize(numModelOutputs);
  for (size_t outputIdx = 0; outputIdx < numModelOutputs; outputIdx++) {
    auto& sizeAllocated = mOutputBufferInfos[outputIdx].nbytes;
    auto& sizeRequired = mOutputBufferInfos[outputIdx].nbytesUsed;
    const auto before = sizeRequired;

    // Update
    sizeRequired = method_meta.output_tensor_meta(outputIdx)->nbytes();
    if (sizeAllocated < sizeRequired) {
      ET_LOG(
          Error,
          "Insufficient buffer size for output[%zu]. Requires %zu but only allocated %zu",
          outputIdx,
          sizeRequired,
          sizeAllocated);
    }
    if (before != sizeRequired) {
      ET_LOG(
          Debug,
          "Update output[%zu] size: %zu -> %zu",
          outputIdx,
          before,
          sizeRequired);
    }
  }
}

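// Records that the model output at `outputIndex` feeds the model input at
// `inputIndex`, so the two can share a single buffer (see AllocateIoBuffers).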
void ModelChunk::LinkModelIO(
    const size_t inputIndex,
    const size_t outputIndex) {
  mModelOutToInIndexLinks.emplace(outputIndex, inputIndex);
}

std::optional<size_t> ModelChunk::GetLinkedInputIndex(
    const size_t outputIndex) const {
  auto hasKey = [](const auto& map, const auto& key) {
    return map.find(key) != map.end();
  };
  if (hasKey(mModelOutToInIndexLinks, outputIndex))
    return mModelOutToInIndexLinks.at(outputIndex);
  else
    return std::nullopt;
}

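// Wraps each input buffer in a Tensor matching the method's metadata (scalar
// type, sizes, dim order) and registers it as the method's input.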
void ModelChunk::SetBackendInputs() {
  auto& method = GetModelMethod();
  const auto method_meta = method.method_meta();
  const size_t input_size = method.inputs_size();
  for (size_t i = 0; i < input_size; i++) {
    const auto tensor_meta = method_meta.input_tensor_meta(i);
    auto scalar_type = tensor_meta->scalar_type();
    auto sizes_raw = tensor_meta->sizes();
    auto dim = sizes_raw.size();
    auto dim_order_raw = tensor_meta->dim_order();
    std::vector sizes(sizes_raw.begin(), sizes_raw.end());
    std::vector dim_order(dim_order_raw.begin(), dim_order_raw.end());
    auto buffer_data = mInputBufferInfos[i].data;

    TensorImpl impl = TensorImpl(
        scalar_type, dim, sizes.data(), buffer_data, dim_order.data());
    Tensor tensor(&impl);
    const auto error = method.set_input(tensor, i);
    ET_CHECK_MSG(
        error == Error::Ok,
        "Error: 0x%" PRIx32 " setting input %zu.",
        static_cast<uint32_t>(error),
        i);
  }
}

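// Points each of the method's outputs at the corresponding preallocated
// output buffer so the backend writes results into it directly.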
void ModelChunk::SetBackendOutputs() {
  auto& method = GetModelMethod();
  for (size_t i = 0; i < mOutputBufferInfos.size(); i++) {
    auto data = mOutputBufferInfos[i].data;
    const auto nbytes = mOutputBufferInfos[i].nbytes;
    const auto output_err = method.set_output_data_ptr(data, nbytes, i);
    ET_CHECK_MSG(
        output_err == Error::Ok,
        "Error: 0x%" PRIx32 " setting output %zu.",
        static_cast<uint32_t>(output_err),
        i);
  }
}

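// Allocates buffers for all model IO via the Neuron buffer allocator.
// Outputs that were linked to an input with LinkModelIO reuse that input's
// buffer instead of receiving a fresh allocation.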
void ModelChunk::AllocateIoBuffers() {
  auto& buffer_allocator = GET_NEURON_ALLOCATOR;

  // Inputs
  for (auto& inBufInfo : mInputBufferInfos) {
    if (inBufInfo.data != nullptr) {
      continue; // Already allocated
    }
    void* ahwb_data = buffer_allocator.Allocate(inBufInfo.nbytes);
    inBufInfo.data = ahwb_data;
  }

  // Outputs
  const auto numOutputBuffers = mOutputBufferInfos.size();
  for (size_t outputIdx = 0; outputIdx < numOutputBuffers; outputIdx++) {
    auto& outBufInfo = mOutputBufferInfos[outputIdx];
    if (outBufInfo.data != nullptr) {
      continue; // Already allocated
    }
    const auto linkedInputIdx = GetLinkedInputIndex(outputIdx);
    if (linkedInputIdx) {
      const auto& linkedInBufInfo = mInputBufferInfos[*linkedInputIdx];
      // Ensure the linked IO sizes match, then reuse the linked input buffer
      ET_CHECK_MSG(
          outBufInfo.nbytes == linkedInBufInfo.nbytes,
          "Error: Mismatched sizes between linked IO. "
          "Input %zu size is %zu, but Output %zu size is %zu.",
          *linkedInputIdx,
          linkedInBufInfo.nbytes,
          outputIdx,
          outBufInfo.nbytes);
      outBufInfo = linkedInBufInfo;
      continue;
    }
    // Allocate output buffer as usual
    void* ahwb_data = buffer_allocator.Allocate(outBufInfo.nbytes);
    outBufInfo.data = ahwb_data;
  }
}

void ModelChunk::ReleaseIoBuffers() {
  auto& buffer_allocator = GET_NEURON_ALLOCATOR;

  for (size_t i = 0; i < mInputBufferInfos.size(); i++)
    buffer_allocator.RemoveBuffer(mInputBufferInfos[i].data);

  for (size_t i = 0; i < mOutputBufferInfos.size(); i++)
    buffer_allocator.RemoveBuffer(mOutputBufferInfos[i].data);
}

Method& ModelChunk::GetModelMethod() {
  auto modelInstance = reinterpret_cast<ModelInstance*>(GetModelInstance());
  return *(modelInstance->method);
}

// Override the virtual functions
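// Loads the ExecuTorch program at `modelPath`, sets up the method allocator,
// the memory-planned buffers, and the Neuron buffer allocator, then loads the
// program's first method ready for execution. Returns the new ModelInstance,
// or nullptr if the program fails to parse.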
void* ModelChunk::CreateModelInstance(const std::string& modelPath) {
  auto modelInstance = new ModelInstance;

  // Create a loader to get the data of the program file. There are other
  // DataLoaders that use mmap() or point to data that's already in memory, and
  // users can create their own DataLoaders to load from arbitrary sources.
  Result<FileDataLoader> loader = FileDataLoader::from(modelPath.c_str());
  ET_CHECK_MSG(
      loader.ok(),
      "FileDataLoader::from() failed: 0x%" PRIx32,
      static_cast<uint32_t>(loader.error()));

  // Parse the program file. This is immutable, and can also be reused between
  // multiple execution invocations across multiple threads.
  Result<Program> program_loaded = Program::load(&loader.get());
  if (!program_loaded.ok()) {
    ET_LOG(Error, "Failed to parse model file %s", modelPath.c_str());
    delete modelInstance; // Avoid leaking the partially constructed instance.
    return nullptr;
  }
  ET_LOG(Debug, "Model file %s is loaded.", modelPath.c_str());

  // Move the program into persistent storage before calling any of its
  // methods.
  modelInstance->program =
      std::make_unique<Program>(std::move(program_loaded.get()));
  auto& program = modelInstance->program;

  // Use the first method in the program.
  const char* method_name = nullptr;
  {
    const auto method_name_result = program->get_method_name(0);
    ET_CHECK_MSG(method_name_result.ok(), "Program has no methods");
    method_name = *method_name_result;
  }
  ET_LOG(Debug, "Using method %s", method_name);

  // MethodMeta describes the memory requirements of the method.
  Result<MethodMeta> method_meta = program->method_meta(method_name);
  ET_CHECK_MSG(
      method_meta.ok(),
      "Failed to get method_meta for %s: 0x%x",
      method_name,
      (unsigned int)method_meta.error());

  modelInstance->method_allocator_pool.resize(kMethodAllocatorPoolSize);
  modelInstance->method_allocator = std::make_unique<MemoryAllocator>(
      kMethodAllocatorPoolSize, modelInstance->method_allocator_pool.data());
  auto& method_allocator = modelInstance->method_allocator;
  method_allocator->enable_profiling("method allocator");

  auto& planned_buffers = modelInstance->planned_buffers; // Owns the memory
  auto& planned_spans = modelInstance->planned_spans; // Passed to the allocator

  size_t num_memory_planned_buffers = method_meta->num_memory_planned_buffers();
  for (size_t id = 0; id < num_memory_planned_buffers; ++id) {
    // .get() will always succeed because id < num_memory_planned_buffers.
    size_t buffer_size =
        static_cast<size_t>(method_meta->memory_planned_buffer_size(id).get());
    ET_LOG(Debug, "Setting up planned buffer %zu, size %zu.", id, buffer_size);
    planned_buffers.push_back(std::make_unique<uint8_t[]>(buffer_size));
    planned_spans.push_back({planned_buffers.back().get(), buffer_size});
  }
  modelInstance->planned_memory = std::make_unique<HierarchicalAllocator>(
      Span<Span<uint8_t>>{planned_spans.data(), planned_spans.size()});
  auto& planned_memory = modelInstance->planned_memory;

  // Assemble all of the allocators into the MemoryManager that the Executor
  // will use.
  auto& neuron_allocator = GET_NEURON_ALLOCATOR;
  modelInstance->memory_manager = std::make_unique<MemoryManager>(
      method_allocator.get(),
      planned_memory.get(),
      dynamic_cast<MemoryAllocator*>(&neuron_allocator));
  auto& memory_manager = modelInstance->memory_manager;

  ET_LOG(Debug, "Begin loading method %s", method_name);
  Result<Method> method =
      program->load_method(method_name, memory_manager.get());
  ET_CHECK_MSG(
      method.ok(),
      "Loading of method %s failed with status 0x%" PRIx32,
      method_name,
      static_cast<uint32_t>(method.error()));
  ET_LOG(Debug, "Method loaded.");

  modelInstance->method = std::make_unique<Method>(std::move(method.get()));
  return modelInstance;
}

void ModelChunk::ReleaseModelInstance(void* modelInstance) {
  if (modelInstance != nullptr) {
    delete reinterpret_cast<ModelInstance*>(modelInstance);
  }
}

} // namespace example