//
// Copyright © 2017-2023 Arm Ltd and Contributors. All rights reserved.
// SPDX-License-Identifier: MIT
//

#define LOG_TAG "ArmnnDriver"

#include "ArmnnPreparedModel_1_2.hpp"

#include "Utils.hpp"

#include <armnn/Types.hpp>

#include <log/log.h>
#include <OperationsUtils.h>
#include <ExecutionBurstServer.h>
#include <ValidateHal.h>

#include <chrono>
#include <cinttypes>

#ifdef ARMNN_ANDROID_S
#include <LegacyUtils.h>
#endif

using namespace android;
using namespace android::hardware;

namespace {

static const V1_2::Timing g_NoTiming = {.timeOnDevice = UINT64_MAX, .timeInDriver = UINT64_MAX};
using namespace armnn_driver;
using TimePoint = std::chrono::steady_clock::time_point;

TimePoint Now()
{
    return std::chrono::steady_clock::now();
}

unsigned long MicrosecondsDuration(TimePoint endPoint, TimePoint startPoint)
{
    return static_cast<unsigned long>(std::chrono::duration_cast<std::chrono::microseconds>(
                                      endPoint - startPoint).count());
}

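// Two overloads of NotifyCallbackAndCheck follow: the V1_0 callback is only told
// the error status, while the V1_2 callback also receives the output shapes and
// timing information.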
void NotifyCallbackAndCheck(const ::android::sp<V1_0::IExecutionCallback>& callback,
                            V1_0::ErrorStatus errorStatus,
                            std::vector<V1_2::OutputShape>,
                            const V1_2::Timing,
                            std::string callingFunction)
{
    Return<void> returned = callback->notify(errorStatus);
    // This check is required; if the callback fails and isn't checked, it will bring down the service.
    if (!returned.isOk())
    {
        ALOGE("ArmnnDriver::%s: hidl callback failed to return properly: %s",
              callingFunction.c_str(), returned.description().c_str());
    }
}

void NotifyCallbackAndCheck(const ::android::sp<V1_2::IExecutionCallback>& callback,
                            V1_0::ErrorStatus errorStatus,
                            std::vector<V1_2::OutputShape> outputShapes,
                            const V1_2::Timing timing,
                            std::string callingFunction)
{
    Return<void> returned = callback->notify_1_2(errorStatus, outputShapes, timing);
    // This check is required; if the callback fails and isn't checked, it will bring down the service.
    if (!returned.isOk())
    {
        ALOGE("ArmnnDriver::%s: hidl callback failed to return properly: %s",
              callingFunction.c_str(), returned.description().c_str());
    }
}

bool ValidateRequestArgument(const V1_0::RequestArgument& requestArg, const armnn::TensorInfo& tensorInfo)
{
    if (requestArg.dimensions.size() != 0)
    {
        if (requestArg.dimensions.size() != tensorInfo.GetNumDimensions())
        {
            ALOGE("Mismatched dimensions (request argument: %zu, expected: %u)",
                  requestArg.dimensions.size(), tensorInfo.GetNumDimensions());
            return false;
        }

        for (unsigned int d = 0; d < tensorInfo.GetNumDimensions(); ++d)
        {
            if (requestArg.dimensions[d] != 0 && requestArg.dimensions[d] != tensorInfo.GetShape()[d])
            {
                ALOGE("Mismatched size for dimension %u (request argument: %u, expected %u)",
                      d, requestArg.dimensions[d], tensorInfo.GetShape()[d]);
                return false;
            }
        }
    }

    return true;
}

armnn::Tensor GetTensorForRequestArgument(const V1_0::RequestArgument& requestArg,
                                          const armnn::TensorInfo& tensorInfo,
                                          const std::vector<::android::nn::RunTimePoolInfo>& requestPools)
{
    if (!ValidateRequestArgument(requestArg, tensorInfo))
    {
        return armnn::Tensor();
    }

    return armnn::Tensor(tensorInfo, GetMemoryFromPool(requestArg.location, requestPools));
}

inline std::string BuildTensorName(const char* tensorNamePrefix, std::size_t index)
{
    return tensorNamePrefix + std::to_string(index);
}

} // anonymous namespace

using namespace android::hardware;

namespace armnn_driver
{

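// These members are static, so a single request thread and a single thread pool
// are shared by every ArmnnPreparedModel_1_2 instance built for this HAL version.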
template<typename HalVersion>
RequestThread<ArmnnPreparedModel_1_2, HalVersion, CallbackContext_1_2>
    ArmnnPreparedModel_1_2<HalVersion>::m_RequestThread;

template<typename HalVersion>
std::unique_ptr<armnn::Threadpool> ArmnnPreparedModel_1_2<HalVersion>::m_Threadpool(nullptr);

template<typename HalVersion>
template<typename TensorBindingCollection>
void ArmnnPreparedModel_1_2<HalVersion>::DumpTensorsIfRequired(char const* tensorNamePrefix,
                                                               const TensorBindingCollection& tensorBindings)
{
    if (!m_RequestInputsAndOutputsDumpDir.empty())
    {
        const std::string requestName = std::to_string(m_NetworkId) + "_" + std::to_string(m_RequestCount) + ".dump";
        for (std::size_t i = 0u; i < tensorBindings.size(); ++i)
        {
            DumpTensor(m_RequestInputsAndOutputsDumpDir,
                       requestName,
                       BuildTensorName(tensorNamePrefix, i),
                       tensorBindings[i].second);
        }
    }
}

template<typename HalVersion>
ArmnnPreparedModel_1_2<HalVersion>::ArmnnPreparedModel_1_2(armnn::NetworkId networkId,
                                                           armnn::IRuntime* runtime,
                                                           const V1_2::Model& model,
                                                           const std::string& requestInputsAndOutputsDumpDir,
                                                           const bool gpuProfilingEnabled,
                                                           const bool asyncModelExecutionEnabled,
                                                           const unsigned int numberOfThreads,
                                                           const bool importEnabled,
                                                           const bool exportEnabled)
    : m_NetworkId(networkId)
    , m_Runtime(runtime)
    , m_Model(model)
    , m_RequestCount(0)
    , m_RequestInputsAndOutputsDumpDir(requestInputsAndOutputsDumpDir)
    , m_GpuProfilingEnabled(gpuProfilingEnabled)
    , m_AsyncModelExecutionEnabled(asyncModelExecutionEnabled)
    , m_EnableImport(importEnabled)
    , m_EnableExport(exportEnabled)
    , m_PreparedFromCache(false)
{
    // Enable profiling if required.
    m_Runtime->GetProfiler(m_NetworkId)->EnableProfiling(m_GpuProfilingEnabled);

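    // For asynchronous execution, pre-allocate one working memory handle per
    // thread. The first prepared model creates the shared thread pool; later
    // ones simply load their own handles into it.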
    if (m_AsyncModelExecutionEnabled)
    {
        std::vector<std::shared_ptr<armnn::IWorkingMemHandle>> memHandles;
        for (unsigned int i = 0; i < numberOfThreads; ++i)
        {
            memHandles.emplace_back(m_Runtime->CreateWorkingMemHandle(networkId));
        }

        if (!m_Threadpool)
        {
            m_Threadpool = std::make_unique<armnn::Threadpool>(numberOfThreads, runtime, memHandles);
        }
        else
        {
            m_Threadpool->LoadMemHandles(memHandles);
        }

        m_WorkingMemHandle = memHandles.back();
    }
}

template<typename HalVersion>
ArmnnPreparedModel_1_2<HalVersion>::ArmnnPreparedModel_1_2(armnn::NetworkId networkId,
                                                           armnn::IRuntime* runtime,
                                                           const std::string& requestInputsAndOutputsDumpDir,
                                                           const bool gpuProfilingEnabled,
                                                           const bool asyncModelExecutionEnabled,
                                                           const unsigned int numberOfThreads,
                                                           const bool importEnabled,
                                                           const bool exportEnabled,
                                                           const bool preparedFromCache)
    : m_NetworkId(networkId)
    , m_Runtime(runtime)
    , m_RequestCount(0)
    , m_RequestInputsAndOutputsDumpDir(requestInputsAndOutputsDumpDir)
    , m_GpuProfilingEnabled(gpuProfilingEnabled)
    , m_AsyncModelExecutionEnabled(asyncModelExecutionEnabled)
    , m_EnableImport(importEnabled)
    , m_EnableExport(exportEnabled)
    , m_PreparedFromCache(preparedFromCache)
{
    // Enable profiling if required.
    m_Runtime->GetProfiler(m_NetworkId)->EnableProfiling(m_GpuProfilingEnabled);

    if (m_AsyncModelExecutionEnabled)
    {
        std::vector<std::shared_ptr<armnn::IWorkingMemHandle>> memHandles;
        for (unsigned int i = 0; i < numberOfThreads; ++i)
        {
            memHandles.emplace_back(m_Runtime->CreateWorkingMemHandle(networkId));
        }

        if (!m_Threadpool)
        {
            m_Threadpool = std::make_unique<armnn::Threadpool>(numberOfThreads, runtime, memHandles);
        }
        else
        {
            m_Threadpool->LoadMemHandles(memHandles);
        }

        m_WorkingMemHandle = memHandles.back();
    }
}

template<typename HalVersion>
ArmnnPreparedModel_1_2<HalVersion>::~ArmnnPreparedModel_1_2()
{
    // Get a hold of the profiler used by this model.
    std::shared_ptr<armnn::IProfiler> profiler = m_Runtime->GetProfiler(m_NetworkId);
    if (profiler && m_GpuProfilingEnabled)
    {
        // Dump the profiling info to a file if required.
        DumpJsonProfilingIfRequired(m_GpuProfilingEnabled, m_RequestInputsAndOutputsDumpDir, m_NetworkId,
                                    profiler.get());
    }

    // Unload the network associated with this model.
    m_Runtime->UnloadNetwork(m_NetworkId);

    // Unload the network memhandles from the threadpool
    if (m_AsyncModelExecutionEnabled)
    {
        m_Threadpool->UnloadMemHandles(m_NetworkId);
    }
}

template<typename HalVersion>
Return<V1_0::ErrorStatus> ArmnnPreparedModel_1_2<HalVersion>::execute(
    const V1_0::Request& request,
    const ::android::sp<V1_0::IExecutionCallback>& callback)
{
    if (callback.get() == nullptr)
    {
        ALOGE("ArmnnPreparedModel_1_2::execute invalid callback passed");
        return V1_0::ErrorStatus::INVALID_ARGUMENT;
    }

    auto cb = [callback](V1_0::ErrorStatus errorStatus,
                         std::vector<V1_2::OutputShape> outputShapes,
                         const V1_2::Timing& timing,
                         std::string callingFunction)
    {
        NotifyCallbackAndCheck(callback, errorStatus, outputShapes, timing, callingFunction);
    };

    return Execute(request, V1_2::MeasureTiming::NO, cb);
}

template<typename HalVersion>
Return<V1_0::ErrorStatus> ArmnnPreparedModel_1_2<HalVersion>::execute_1_2(
    const V1_0::Request& request,
    V1_2::MeasureTiming measureTiming,
    const sp<V1_2::IExecutionCallback>& callback)
{
    if (callback.get() == nullptr)
    {
        ALOGE("ArmnnPreparedModel_1_2::execute_1_2 invalid callback passed");
        return V1_0::ErrorStatus::INVALID_ARGUMENT;
    }

    auto cb = [callback](V1_0::ErrorStatus errorStatus,
                         std::vector<V1_2::OutputShape> outputShapes,
                         const V1_2::Timing& timing,
                         std::string callingFunction)
    {
        NotifyCallbackAndCheck(callback, errorStatus, outputShapes, timing, callingFunction);
    };

    return Execute(request, measureTiming, cb);
}

template<typename HalVersion>
Return<V1_0::ErrorStatus> ArmnnPreparedModel_1_2<HalVersion>::PrepareMemoryForInputs(
    armnn::InputTensors& inputs,
    const V1_0::Request& request,
    const std::vector<android::nn::RunTimePoolInfo>& memPools)
{
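    // Request inputs are matched to network inputs by position: the i-th request
    // argument binds to the network's i-th input tensor.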
    inputs.reserve(request.inputs.size());
    for (unsigned int i = 0; i < request.inputs.size(); i++)
    {
        const auto& inputArg = request.inputs[i];
        armnn::TensorInfo inputTensorInfo = m_Runtime->GetInputTensorInfo(m_NetworkId, i);
        // inputs (of type InputTensors) is composed of a vector of ConstTensors.
        // Therefore, set all TensorInfo isConstant parameters of input Tensors to true.
        inputTensorInfo.SetConstant();
        auto result = ValidateRequestArgument<V1_0::ErrorStatus, V1_0::Request>(request,
                                                                                inputTensorInfo,
                                                                                inputArg,
                                                                                "input");

        if (result != V1_0::ErrorStatus::NONE)
        {
            return result;
        }

        const armnn::Tensor inputTensor = GetTensorForRequestArgument(inputArg, inputTensorInfo, memPools);

        if (inputTensor.GetMemoryArea() == nullptr)
        {
            ALOGE("Cannot execute request. Error converting request input %u to tensor", i);
            return V1_0::ErrorStatus::GENERAL_FAILURE;
        }

        inputs.emplace_back(i, inputTensor);
    }

    return V1_0::ErrorStatus::NONE;
}

template<typename HalVersion>
Return<V1_0::ErrorStatus> ArmnnPreparedModel_1_2<HalVersion>::PrepareMemoryForOutputs(
    armnn::OutputTensors& outputs,
    std::vector<V1_2::OutputShape>& outputShapes,
    const V1_0::Request& request,
    const std::vector<android::nn::RunTimePoolInfo>& memPools)
{
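    // As with the inputs, outputs are matched by position. Each output is also
    // checked against both the client-declared argument length and the size of
    // the backing memory pool before it is accepted.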
    outputs.reserve(request.outputs.size());
    for (unsigned int i = 0; i < request.outputs.size(); i++)
    {
        const auto& outputArg = request.outputs[i];
        armnn::TensorInfo outputTensorInfo = m_Runtime->GetOutputTensorInfo(m_NetworkId, i);
        auto result = ValidateRequestArgument<V1_0::ErrorStatus, V1_0::Request>(request,
                                                                                outputTensorInfo,
                                                                                outputArg,
                                                                                "output");

        if (result != V1_0::ErrorStatus::NONE)
        {
            return result;
        }

        const armnn::Tensor outputTensor = GetTensorForRequestArgument(outputArg, outputTensorInfo, memPools);
        if (outputTensor.GetMemoryArea() == nullptr)
        {
            ALOGE("Cannot execute request. Error converting request output %u to tensor", i);
            return V1_0::ErrorStatus::GENERAL_FAILURE;
        }

        const size_t outputSize = outputTensorInfo.GetNumBytes();

        if (outputArg.location.length < outputSize)
        {
            ALOGW("ArmnnPreparedModel_1_2::Execute failed: outputArg.location.length < outputSize");
            return V1_0::ErrorStatus::OUTPUT_INSUFFICIENT_SIZE;
        }

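        // The memory pool size accessor differs between Android S (getSize)
        // and earlier releases (getHidlMemory().size()).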
#if !defined(ARMNN_ANDROID_S)
        const size_t bufferSize = memPools.at(outputArg.location.poolIndex).getHidlMemory().size();
        if (bufferSize < outputSize)
        {
            ALOGW("ArmnnPreparedModel_1_2::Execute failed: bufferSize < outputSize");
            return V1_0::ErrorStatus::OUTPUT_INSUFFICIENT_SIZE;
        }
#else
        const size_t bufferSize = memPools.at(outputArg.location.poolIndex).getSize();
        if (bufferSize < outputSize)
        {
            ALOGW("ArmnnPreparedModel_1_2::Execute failed: bufferSize (%s) < outputSize (%s)",
                  std::to_string(bufferSize).c_str(), std::to_string(outputSize).c_str());
            outputShapes[i].isSufficient = false;
            return V1_0::ErrorStatus::OUTPUT_INSUFFICIENT_SIZE;
        }
#endif
        outputs.emplace_back(i, outputTensor);
        outputShapes[i] = ComputeShape(outputTensorInfo);
    }

    return V1_0::ErrorStatus::NONE;
}

template<typename HalVersion>
Return<V1_0::ErrorStatus> ArmnnPreparedModel_1_2<HalVersion>::PrepareMemoryForIO(
    armnn::InputTensors& inputs,
    armnn::OutputTensors& outputs,
    std::vector<android::nn::RunTimePoolInfo>& memPools,
    const V1_0::Request& request,
    CallbackAsync_1_2 callback)
{
#if !defined(ARMNN_ANDROID_S)
    if (!setRunTimePoolInfosFromHidlMemories(&memPools, request.pools))
#else
    if (!setRunTimePoolInfosFromCanonicalMemories(&memPools, uncheckedConvert(request.pools)))
#endif
    {
        callback(V1_0::ErrorStatus::GENERAL_FAILURE, {}, g_NoTiming, "ArmnnPreparedModel_1_2::execute");
        return V1_0::ErrorStatus::GENERAL_FAILURE;
    }
    // add the inputs and outputs with their data
    try
    {
        if (PrepareMemoryForInputs(inputs, request, memPools) != V1_0::ErrorStatus::NONE)
        {
            callback(V1_0::ErrorStatus::GENERAL_FAILURE, {}, g_NoTiming, "ArmnnPreparedModel_1_2::execute");
            return V1_0::ErrorStatus::GENERAL_FAILURE;
        }

        std::vector<V1_2::OutputShape> outputShapes(request.outputs.size());

        auto errorStatus = PrepareMemoryForOutputs(outputs, outputShapes, request, memPools);
        if (errorStatus != V1_0::ErrorStatus::NONE)
        {
            callback(errorStatus,
                     outputShapes,
                     g_NoTiming,
                     "ArmnnPreparedModel_1_2::Execute");
            return errorStatus;
        }
    }
    catch (armnn::Exception& e)
    {
        ALOGW("armnn::Exception caught while preparing for EnqueueWorkload: %s", e.what());
        callback(V1_0::ErrorStatus::GENERAL_FAILURE, {}, g_NoTiming, "ArmnnPreparedModel_1_2::execute");
        return V1_0::ErrorStatus::GENERAL_FAILURE;
    }
    catch (std::exception& e)
    {
        ALOGE("std::exception caught while preparing for EnqueueWorkload: %s", e.what());
        callback(V1_0::ErrorStatus::GENERAL_FAILURE, {}, g_NoTiming, "ArmnnPreparedModel_1_2::execute");
        return V1_0::ErrorStatus::GENERAL_FAILURE;
    }

    return V1_0::ErrorStatus::NONE;
}

template<typename HalVersion>
Return<void> ArmnnPreparedModel_1_2<HalVersion>::executeSynchronously(const V1_0::Request& request,
                                                                      V1_2::MeasureTiming measureTiming,
                                                                      executeSynchronously_cb cb)
{
    if (!m_PreparedFromCache)
    {
        ALOGV("ArmnnPreparedModel_1_2::executeSynchronously(): %s", GetModelSummary(m_Model).c_str());
    }
    m_RequestCount++;

    if (cb == nullptr)
    {
        ALOGE("ArmnnPreparedModel_1_2::executeSynchronously invalid callback passed");
        return Void();
    }

    TimePoint driverStart;

    if (measureTiming == V1_2::MeasureTiming::YES)
    {
        driverStart = Now();
    }

    if (!m_PreparedFromCache && !android::nn::validateRequest(request, m_Model))
    {
        ALOGE("ArmnnPreparedModel_1_2::executeSynchronously invalid request model");
        cb(V1_0::ErrorStatus::INVALID_ARGUMENT, {}, g_NoTiming);
        return Void();
    }

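    // Adapt the synchronous HIDL completion callback to the asynchronous
    // callback signature used by PrepareMemoryForIO and ExecuteGraph.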
    auto cbWrapper = [cb](V1_0::ErrorStatus errorStatus,
                          std::vector<V1_2::OutputShape> outputShapes,
                          const V1_2::Timing& timing,
                          std::string)
    {
        cb(errorStatus, outputShapes, timing);
    };

    // map the memory pool into shared pointers
    // use a shared memory pools vector on the heap, as it is passed to the request thread
    auto memPools = std::make_shared<std::vector<android::nn::RunTimePoolInfo>>();

    // allocate the tensors on the heap, as they are passed to the request thread
    auto inputs = std::make_shared<armnn::InputTensors>();
    auto outputs = std::make_shared<armnn::OutputTensors>();

    auto prepareStatus = PrepareMemoryForIO(*inputs, *outputs, *memPools, request, cbWrapper);
    if (prepareStatus != V1_0::ErrorStatus::NONE)
    {
        return Void();
    }

    ALOGV("ArmnnPreparedModel_1_2::executeSynchronously() before Execution");

    CallbackContext_1_2 cbCtx;
    cbCtx.callback = cbWrapper;
    cbCtx.ctx.measureTimings = measureTiming;
    cbCtx.ctx.driverStart = driverStart;
    ExecuteGraph(memPools, *inputs, *outputs, cbCtx);

    return Void();
}

template<typename HalVersion>
template<typename CallbackContext>
bool ArmnnPreparedModel_1_2<HalVersion>::ExecuteGraph(
    std::shared_ptr<std::vector<::android::nn::RunTimePoolInfo>>& pMemPools,
    armnn::InputTensors& inputTensors,
    armnn::OutputTensors& outputTensors,
    CallbackContext cb)
{
    ALOGV("ArmnnPreparedModel_1_2::ExecuteGraph(...)");

    TimePoint driverEnd, deviceStart, deviceEnd;
    // Capture the graph execution start time.
    std::chrono::time_point<std::chrono::system_clock> graphExecutionStart = std::chrono::system_clock::now();

    DumpTensorsIfRequired("Input", inputTensors);

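    // The output shapes come straight from the network's tensor infos, so they
    // can be computed up front and reported back unchanged after execution.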
    std::vector<V1_2::OutputShape> outputShapes(outputTensors.size());
    for (unsigned int i = 0; i < outputTensors.size(); i++)
    {
        std::pair<int, armnn::Tensor> outputTensorPair = outputTensors[i];
        const armnn::Tensor outputTensor = outputTensorPair.second;
        const armnn::TensorInfo outputTensorInfo = outputTensor.GetInfo();

        outputShapes[i] = ComputeShape(outputTensorInfo);
    }

    // run it
    try
    {
        if (cb.ctx.measureTimings == V1_2::MeasureTiming::YES)
        {
            deviceStart = Now();
        }

        armnn::Status status;
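        // With asynchronous execution enabled, run through the working-memory-handle
        // API; otherwise go through EnqueueWorkload, optionally importing input and
        // output memory to avoid copies.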
        if (m_AsyncModelExecutionEnabled)
        {
            ALOGW("ArmnnPreparedModel_1_2::ExecuteGraph m_AsyncModelExecutionEnabled true");
            status = m_Runtime->Execute(*m_WorkingMemHandle, inputTensors, outputTensors);
        }
        else
        {
            ALOGW("ArmnnPreparedModel_1_2::ExecuteGraph m_AsyncModelExecutionEnabled false");

            // Create a vector of Input and Output Ids which can be imported. An empty vector means all will be copied.
            std::vector<armnn::ImportedInputId> importedInputIds;
            if (m_EnableImport)
            {
                importedInputIds = m_Runtime->ImportInputs(m_NetworkId, inputTensors, armnn::MemorySource::Malloc);
            }
            std::vector<armnn::ImportedOutputId> importedOutputIds;
            if (m_EnableExport)
            {
                importedOutputIds = m_Runtime->ImportOutputs(m_NetworkId, outputTensors, armnn::MemorySource::Malloc);
            }
            status = m_Runtime->EnqueueWorkload(m_NetworkId, inputTensors, outputTensors,
                                                importedInputIds, importedOutputIds);
        }

        if (cb.ctx.measureTimings == V1_2::MeasureTiming::YES)
        {
            deviceEnd = Now();
        }
        if (status != armnn::Status::Success)
        {
            ALOGW("EnqueueWorkload failed");
            cb.callback(V1_0::ErrorStatus::GENERAL_FAILURE, {}, g_NoTiming,
                        "ArmnnPreparedModel_1_2::ExecuteGraph");
            return false;
        }
    }
    catch (armnn::Exception& e)
    {
        ALOGW("armnn::Exception caught from EnqueueWorkload: %s", e.what());
        cb.callback(V1_0::ErrorStatus::GENERAL_FAILURE, {}, g_NoTiming, "ArmnnPreparedModel_1_2::ExecuteGraph");
        return false;
    }
    catch (std::exception& e)
    {
        ALOGE("std::exception caught from EnqueueWorkload: %s", e.what());
        cb.callback(V1_0::ErrorStatus::GENERAL_FAILURE, {}, g_NoTiming, "ArmnnPreparedModel_1_2::ExecuteGraph");
        return false;
    }

    CommitPools(*pMemPools);

    DumpTensorsIfRequired("Output", outputTensors);

    if (cb.ctx.measureTimings == V1_2::MeasureTiming::YES)
    {
        driverEnd = Now();
        V1_2::Timing timing;
        timing.timeOnDevice = MicrosecondsDuration(deviceEnd, deviceStart);
        timing.timeInDriver = MicrosecondsDuration(driverEnd, cb.ctx.driverStart);
        ALOGV("ArmnnPreparedModel_1_2::execute timing - Device = %lu Driver = %lu",
              static_cast<unsigned long>(timing.timeOnDevice), static_cast<unsigned long>(timing.timeInDriver));
        cb.callback(V1_0::ErrorStatus::NONE, outputShapes, timing, "ArmnnPreparedModel_1_2::ExecuteGraph");
    }
    else
    {
        cb.callback(V1_0::ErrorStatus::NONE, outputShapes, g_NoTiming, "ArmnnPreparedModel_1_2::ExecuteGraph");
    }

    // Log the total time in this call. This is a good number to compare to that printed out by
    // RuntimeImpl::EnqueueWorkload. The difference should be the execution overhead of the driver.
    ALOGI("ArmnnPreparedModel_1_2::ExecuteGraph Execution time = %lld µs",
          std::chrono::duration_cast<std::chrono::microseconds>
          (std::chrono::system_clock::now() - graphExecutionStart).count());
    return true;
}

template<typename HalVersion>
bool ArmnnPreparedModel_1_2<HalVersion>::ExecuteWithDummyInputs(unsigned int numInputs, unsigned int numOutputs)
{
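    // Runs a single inference on zero-initialised dummy buffers, e.g. to check
    // that a prepared network can actually execute without a real client request.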
    std::vector<std::vector<char>> storage;
    armnn::InputTensors inputTensors;
    for (unsigned int i = 0; i < numInputs; i++)
    {
        armnn::TensorInfo inputTensorInfo = m_Runtime->GetInputTensorInfo(m_NetworkId, i);
        // inputTensors (of type InputTensors) is composed of a vector of ConstTensors.
        // Therefore, set all TensorInfo isConstant parameters of input Tensors to true.
        inputTensorInfo.SetConstant();

        storage.emplace_back(inputTensorInfo.GetNumBytes());
        const armnn::ConstTensor inputTensor(inputTensorInfo, storage.back().data());

        inputTensors.emplace_back(i, inputTensor);
    }

    armnn::OutputTensors outputTensors;
    for (unsigned int i = 0; i < numOutputs; i++)
    {
        const armnn::TensorInfo outputTensorInfo = m_Runtime->GetOutputTensorInfo(m_NetworkId, i);
        storage.emplace_back(outputTensorInfo.GetNumBytes());
        const armnn::Tensor outputTensor(outputTensorInfo, storage.back().data());

        outputTensors.emplace_back(i, outputTensor);
    }

    auto nullCallback = [](V1_0::ErrorStatus, std::vector<V1_2::OutputShape>, const V1_2::Timing&, std::string) {};
    CallbackContext_1_2 callbackContext;
    callbackContext.callback = nullCallback;
    callbackContext.ctx.measureTimings = V1_2::MeasureTiming::NO;
    auto memPools = std::make_shared<std::vector<::android::nn::RunTimePoolInfo>>();
    return ExecuteGraph(memPools,
                        inputTensors,
                        outputTensors,
                        callbackContext);
}

template<typename HalVersion>
Return<V1_0::ErrorStatus> ArmnnPreparedModel_1_2<HalVersion>::Execute(const V1_0::Request& request,
                                                                      V1_2::MeasureTiming measureTiming,
                                                                      CallbackAsync_1_2 callback)
{
    ExecutionContext_1_2 ctx;
    if (measureTiming == V1_2::MeasureTiming::YES)
    {
        ctx.measureTimings = measureTiming;
        ctx.driverStart = Now();
    }

    if (!m_PreparedFromCache)
    {
        ALOGV("ArmnnPreparedModel_1_2::execute(): %s", GetModelSummary(m_Model).c_str());
    }
    m_RequestCount++;

    if (!m_PreparedFromCache && !android::nn::validateRequest(request, m_Model))
    {
        callback(V1_0::ErrorStatus::INVALID_ARGUMENT, {}, g_NoTiming, "ArmnnPreparedModel_1_2::execute");
        return V1_0::ErrorStatus::INVALID_ARGUMENT;
    }

    if (!m_RequestInputsAndOutputsDumpDir.empty())
    {
        ALOGD("Dumping inputs and outputs for request %" PRIuPTR, reinterpret_cast<std::uintptr_t>(&callback));
    }

    // map the memory pool into shared pointers
    // use a shared memory pools vector on the heap, as it is passed to the request thread
    auto memPools = std::make_shared<std::vector<android::nn::RunTimePoolInfo>>();

    // allocate the tensors on the heap, as they are passed to the request thread
    auto inputTensors = std::make_shared<armnn::InputTensors>();
    auto outputTensors = std::make_shared<armnn::OutputTensors>();

    auto prepareStatus = PrepareMemoryForIO(*inputTensors, *outputTensors, *memPools, request, callback);
    switch (prepareStatus)
    {
        case V1_0::ErrorStatus::OUTPUT_INSUFFICIENT_SIZE:
            // The callback has already reported the insufficient outputs, so the
            // launch itself is considered successful.
            return V1_0::ErrorStatus::NONE;
        case V1_0::ErrorStatus::GENERAL_FAILURE:
            return V1_0::ErrorStatus::GENERAL_FAILURE;
        default:
        {}
    }

    // post the request for asynchronous execution
    CallbackContext_1_2 cb;
    cb.callback = callback;
    cb.ctx = ctx;

    if (m_AsyncModelExecutionEnabled)
    {
        ALOGV("ArmnnPreparedModel_1_2::execute(...) before ScheduleGraphForExecution");
        ScheduleGraphForExecution(memPools, inputTensors, outputTensors, cb);
        ALOGV("ArmnnPreparedModel_1_2::execute(...) after ScheduleGraphForExecution");
        return V1_0::ErrorStatus::NONE;
    }

    ALOGV("ArmnnPreparedModel_1_2::execute(...) before PostMsg");
    m_RequestThread.PostMsg(this, memPools, inputTensors, outputTensors, cb);
    ALOGV("ArmnnPreparedModel_1_2::execute(...) after PostMsg");
    return V1_0::ErrorStatus::NONE;
}

template<typename HalVersion>
Return<void> ArmnnPreparedModel_1_2<HalVersion>::configureExecutionBurst(
    const sp<V1_2::IBurstCallback>& callback,
    const MQDescriptorSync<V1_2::FmqRequestDatum>& requestChannel,
    const MQDescriptorSync<V1_2::FmqResultDatum>& resultChannel,
    V1_2::IPreparedModel::configureExecutionBurst_cb cb)
{
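    // Delegate burst handling to the NNAPI ExecutionBurstServer, which services
    // requests arriving over the supplied fast message queues by calling back
    // into this prepared model.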
    ALOGV("ArmnnPreparedModel_1_2::configureExecutionBurst");
    const sp<V1_2::IBurstContext> burst = ExecutionBurstServer::create(callback,
                                                                       requestChannel,
                                                                       resultChannel,
                                                                       this);

    if (burst == nullptr)
    {
        cb(V1_0::ErrorStatus::GENERAL_FAILURE, {});
    }
    else
    {
        cb(V1_0::ErrorStatus::NONE, burst);
    }
    return Void();
}

/// Schedule the graph prepared from the request for execution
template<typename HalVersion>
template<typename CallbackContext>
void ArmnnPreparedModel_1_2<HalVersion>::ScheduleGraphForExecution(
    std::shared_ptr<std::vector<::android::nn::RunTimePoolInfo>>& pMemPools,
    std::shared_ptr<armnn::InputTensors>& inputTensors,
    std::shared_ptr<armnn::OutputTensors>& outputTensors,
    CallbackContext callbackContext)
{
    ALOGV("ArmnnPreparedModel_1_2::ScheduleGraphForExecution(...)");

    DumpTensorsIfRequired("Input", *inputTensors);

    unsigned int outputTensorSize = outputTensors.get()->size();
    std::vector<V1_2::OutputShape> outputShapes(outputTensorSize);
    for (unsigned int i = 0; i < outputTensorSize; i++)
    {
        std::pair<int, armnn::Tensor> outputTensorPair = outputTensors.get()->at(i);
        const armnn::Tensor outputTensor = outputTensorPair.second;
        const armnn::TensorInfo outputTensorInfo = outputTensor.GetInfo();

        outputShapes[i] = ComputeShape(outputTensorInfo);
    }

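    // Bundle the memory pools, shapes and tensors into a callback object so they
    // stay alive until the thread pool has run the inference and invoked Notify().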
    auto tpCb = std::make_shared<
        ArmnnThreadPoolCallback_1_2<CallbackContext_1_2>>(this,
                                                          pMemPools,
                                                          outputShapes,
                                                          inputTensors,
                                                          outputTensors,
                                                          callbackContext);

    m_Threadpool->Schedule(m_NetworkId,
                           *tpCb->m_InputTensors,
                           *tpCb->m_OutputTensors,
                           armnn::QosExecPriority::Medium,
                           tpCb);
    ALOGV("ArmnnPreparedModel_1_2::ScheduleGraphForExecution end");
}

template<typename HalVersion>
template<typename CallbackContext>
void ArmnnPreparedModel_1_2<HalVersion>::ArmnnThreadPoolCallback_1_2<CallbackContext>::Notify(
    armnn::Status status, armnn::InferenceTimingPair timeTaken)
{
    ALOGV("ArmnnPreparedModel_1_2::ArmnnThreadPoolCallback_1_2 Notify");

    TimePoint driverEnd;

    CommitPools(*m_MemPools);

    m_Model->DumpTensorsIfRequired("Output", *m_OutputTensors);

    if (status != armnn::Status::Success)
    {
        ALOGW("ArmnnThreadPoolCallback::Notify EnqueueWorkload failed");
        m_CallbackContext.callback(
            V1_0::ErrorStatus::GENERAL_FAILURE, {}, g_NoTiming, "ArmnnPreparedModel::ExecuteGraph");
        return;
    }

    if (m_CallbackContext.ctx.measureTimings == V1_2::MeasureTiming::YES)
    {
        driverEnd = std::chrono::steady_clock::now();
        V1_2::Timing timing;
        timing.timeOnDevice = MicrosecondsDuration(timeTaken.second, timeTaken.first);
        timing.timeInDriver = MicrosecondsDuration(driverEnd, m_CallbackContext.ctx.driverStart);
        ALOGV("ArmnnPreparedModel_1_2::execute timing - Device = %lu Driver = %lu",
              static_cast<unsigned long>(timing.timeOnDevice), static_cast<unsigned long>(timing.timeInDriver));
        m_CallbackContext.callback(
            V1_0::ErrorStatus::NONE, m_OutputShapes, timing, "ArmnnPreparedModel_1_2::ExecuteGraph");
    }
    else
    {
        m_CallbackContext.callback(
            V1_0::ErrorStatus::NONE, m_OutputShapes, g_NoTiming, "ArmnnPreparedModel_1_2::ExecuteGraph");
    }
    return;
}

#if defined(ARMNN_ANDROID_NN_V1_2) || defined(ARMNN_ANDROID_NN_V1_3)
template class ArmnnPreparedModel_1_2<hal_1_2::HalPolicy>;
template bool ArmnnPreparedModel_1_2<hal_1_2::HalPolicy>::ExecuteGraph<CallbackContext_1_2>(
    std::shared_ptr<std::vector<::android::nn::RunTimePoolInfo>>& pMemPools,
    armnn::InputTensors& pInputTensors,
    armnn::OutputTensors& pOutputTensors,
    CallbackContext_1_2 cb);

template void ArmnnPreparedModel_1_2<hal_1_2::HalPolicy>::ScheduleGraphForExecution<CallbackContext_1_2>(
    std::shared_ptr<std::vector<::android::nn::RunTimePoolInfo>>& pMemPools,
    std::shared_ptr<armnn::InputTensors>& inputTensors,
    std::shared_ptr<armnn::OutputTensors>& outputTensors,
    CallbackContext_1_2 callbackContext);
#endif

} // namespace armnn_driver