xref: /aosp_15_r20/external/armnn/src/backends/cl/test/ClImportTensorHandleTests.cpp (revision 89c4ff92f2867872bb9e2354d150bf0c8c502810)
1 //
2 // Copyright © 2021, 2023 Arm Ltd and Contributors. All rights reserved.
3 // SPDX-License-Identifier: MIT
4 //
5 
6 #include <arm_compute/runtime/CL/functions/CLActivationLayer.h>
7 
8 #include <cl/ClImportTensorHandle.hpp>
9 #include <cl/ClImportTensorHandleFactory.hpp>
10 #include <cl/test/ClContextControlFixture.hpp>
11 
12 #include <doctest/doctest.h>
13 
14 #include <armnn/IRuntime.hpp>
15 #include <armnn/INetwork.hpp>
16 #include "Network.hpp"
17 
18 using namespace armnn;
19 
20 TEST_SUITE("ClImportTensorHandleTests")
21 {
22 TEST_CASE_FIXTURE(ClContextControlFixture, "ClMallocImport")
23 {
24     ClImportTensorHandleFactory handleFactory(static_cast<MemorySourceFlags>(MemorySource::Malloc),
25                                               static_cast<MemorySourceFlags>(MemorySource::Malloc));
26 
27     TensorInfo info({ 1, 24, 16, 3 }, DataType::Float32);
28     unsigned int numElements = info.GetNumElements();
29 
30     // create TensorHandle for memory import
31     auto handle = handleFactory.CreateTensorHandle(info);
32 
33     // Get CLtensor
34     arm_compute::CLTensor& tensor = PolymorphicDowncast<ClImportTensorHandle*>(handle.get())->GetTensor();
35 
36     // Create and configure activation function
37     const arm_compute::ActivationLayerInfo act_info(arm_compute::ActivationLayerInfo::ActivationFunction::RELU);
38     arm_compute::CLActivationLayer act_func;
39     act_func.configure(&tensor, nullptr, act_info);
40 
41     // Allocate user memory
42     const size_t totalBytes = tensor.info()->total_size();
43     const size_t alignment =
44         arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>();
45     size_t space = totalBytes + alignment + alignment;
46     auto testData = std::make_unique<uint8_t[]>(space);
47     void* alignedPtr = testData.get();
48     CHECK(std::align(alignment, totalBytes, alignedPtr, space));
49 
50     // Import memory
51     CHECK(handle->Import(alignedPtr, armnn::MemorySource::Malloc));
52 
53     // Input with negative values
54     auto* typedPtr = reinterpret_cast<float*>(alignedPtr);
55     std::fill_n(typedPtr, numElements, -5.0f);
56 
57     // Execute function and sync
58     act_func.run();
59     arm_compute::CLScheduler::get().sync();
60 
61     // Validate result by checking that the output has no negative values
62     for(unsigned int i = 0; i < numElements; ++i)
63     {
64         CHECK(typedPtr[i] == 0);
65     }
66 }
67 
68 TEST_CASE_FIXTURE(ClContextControlFixture, "ClIncorrectMemorySourceImport")
69 {
70     ClImportTensorHandleFactory handleFactory(static_cast<MemorySourceFlags>(MemorySource::Malloc),
71                                               static_cast<MemorySourceFlags>(MemorySource::Malloc));
72 
73     TensorInfo info({ 1, 24, 16, 3 }, DataType::Float32);
74 
75     // create TensorHandle for memory import
76     auto handle = handleFactory.CreateTensorHandle(info);
77 
78     // Get CLtensor
79     arm_compute::CLTensor& tensor = PolymorphicDowncast<ClImportTensorHandle*>(handle.get())->GetTensor();
80 
81     // Allocate user memory
82     const size_t totalBytes = tensor.info()->total_size();
83     const size_t alignment =
84         arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>();
85     size_t space = totalBytes + alignment + alignment;
86     auto testData = std::make_unique<uint8_t[]>(space);
87     void* alignedPtr = testData.get();
88     CHECK(std::align(alignment, totalBytes, alignedPtr, space));
89 
90     // Import memory
91     CHECK_THROWS_AS(handle->Import(alignedPtr, armnn::MemorySource::Undefined), MemoryImportException);
92 }
93 
94 TEST_CASE_FIXTURE(ClContextControlFixture, "ClInvalidMemorySourceImport")
95 {
96     MemorySource invalidMemSource = static_cast<MemorySource>(256);
97     ClImportTensorHandleFactory handleFactory(static_cast<MemorySourceFlags>(invalidMemSource),
98                                               static_cast<MemorySourceFlags>(invalidMemSource));
99 
100     TensorInfo info({ 1, 2, 2, 1 }, DataType::Float32);
101 
102     // create TensorHandle for memory import
103     auto handle = handleFactory.CreateTensorHandle(info);
104 
105     // Allocate user memory
106     std::vector<float> inputData
107     {
108         1.0f, 2.0f, 3.0f, 4.0f
109     };
110 
111     // Import non-support memory
112     CHECK_THROWS_AS(handle->Import(inputData.data(), invalidMemSource), MemoryImportException);
113 }
114 
115 TEST_CASE_FIXTURE(ClContextControlFixture, "ClImportEndToEnd")
116 {
117     // Create runtime in which test will run
118     IRuntime::CreationOptions options;
119     IRuntimePtr runtime(armnn::IRuntime::Create(options));
120 
121     // build up the structure of the network
122     INetworkPtr net(INetwork::Create());
123 
124     IConnectableLayer* input = net->AddInputLayer(0, "Input");
125 
126     ActivationDescriptor descriptor;
127     descriptor.m_Function = ActivationFunction::ReLu;
128     IConnectableLayer* activation = net->AddActivationLayer(descriptor, "Activation");
129 
130     IConnectableLayer* output = net->AddOutputLayer(0, "Output");
131 
132     input->GetOutputSlot(0).Connect(activation->GetInputSlot(0));
133     activation->GetOutputSlot(0).Connect(output->GetInputSlot(0));
134 
135     TensorInfo tensorInfo = TensorInfo({ 1, 24, 16, 3 }, DataType::Float32);
136     unsigned int numElements = tensorInfo.GetNumElements();
137     size_t totalBytes = numElements * sizeof(float);
138 
139     input->GetOutputSlot(0).SetTensorInfo(tensorInfo);
140     activation->GetOutputSlot(0).SetTensorInfo(tensorInfo);
141 
142     // Optimize the network
143     OptimizerOptionsOpaque optOptions;
144     optOptions.SetImportEnabled(true);
145     optOptions.SetExportEnabled(true);
146     std::vector<armnn::BackendId> backends = {armnn::Compute::GpuAcc};
147     IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec(), optOptions);
148     CHECK(optNet);
149 
150     // Loads it into the runtime.
151     NetworkId netId;
152     std::string ignoredErrorMessage;
153     // Enable Importing
154     INetworkProperties networkProperties(false, MemorySource::Malloc, MemorySource::Malloc);
155     runtime->LoadNetwork(netId, std::move(optNet), ignoredErrorMessage, networkProperties);
156 
157     // Creates structures for input & output
158     const size_t alignment =
159         arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>();
160     size_t space = totalBytes + alignment + alignment;
161     auto inputData = std::make_unique<uint8_t[]>(space);
162     void* alignedInputPtr = inputData.get();
163     CHECK(std::align(alignment, totalBytes, alignedInputPtr, space));
164 
165     // Input with negative values
166     auto* intputPtr = reinterpret_cast<float*>(alignedInputPtr);
167     std::fill_n(intputPtr, numElements, -5.0f);
168 
169     auto outputData = std::make_unique<uint8_t[]>(space);
170     void* alignedOutputPtr = outputData.get();
171     CHECK(std::align(alignment, totalBytes, alignedOutputPtr, space));
172     auto* outputPtr = reinterpret_cast<float*>(alignedOutputPtr);
173     std::fill_n(outputPtr, numElements, -10.0f);
174 
175     TensorInfo inputTensorInfo = runtime->GetInputTensorInfo(netId, 0);
176     inputTensorInfo.SetConstant(true);
177     InputTensors inputTensors
178     {
179         {0,armnn::ConstTensor(inputTensorInfo, alignedInputPtr)},
180     };
181     OutputTensors outputTensors
182     {
183         {0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), alignedOutputPtr)}
184     };
185 
186     runtime->GetProfiler(netId)->EnableProfiling(true);
187 
188     // Do the inference
189     runtime->EnqueueWorkload(netId, inputTensors, outputTensors);
190 
191     // Retrieve the Profiler.Print() output to get the workload execution
192     ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
193     std::stringstream ss;
194     profilerManager.GetProfiler()->Print(ss);;
195     std::string dump = ss.str();
196 
197     // Contains ActivationWorkload
198     std::size_t found = dump.find("ActivationWorkload");
199     CHECK(found != std::string::npos);
200 
201     // Contains SyncMemGeneric
202     found = dump.find("SyncMemGeneric");
203     CHECK(found != std::string::npos);
204 
205     // Does not contain CopyMemGeneric
206     found = dump.find("CopyMemGeneric");
207     CHECK(found == std::string::npos);
208 
209     runtime->UnloadNetwork(netId);
210 
211     // Check output is as expected
212     // Validate result by checking that the output has no negative values
213     auto* outputResult = reinterpret_cast<float*>(alignedOutputPtr);
214     CHECK(outputResult);
215     for(unsigned int i = 0; i < numElements; ++i)
216     {
217         CHECK(outputResult[i] >= 0);
218     }
219 }
220 
221 TEST_CASE_FIXTURE(ClContextControlFixture, "ClCanBeImported")
222 {
223     ClImportTensorHandleFactory handleFactory(static_cast<MemorySourceFlags>(MemorySource::Malloc),
224                                               static_cast<MemorySourceFlags>(MemorySource::Malloc));
225 
226     TensorInfo info({ 1, 24, 16, 3 }, DataType::Float32);
227 
228     // create TensorHandle for memory import
229     auto handle = handleFactory.CreateTensorHandle(info, DataLayout::NHWC);
230 
231     // Get CLtensor
232     arm_compute::CLTensor& tensor = PolymorphicDowncast<ClImportTensorHandle*>(handle.get())->GetTensor();
233 
234     // Allocate user memory
235     const size_t totalBytes = tensor.info()->total_size();
236     const size_t alignment =
237             arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>();
238     size_t space = totalBytes + alignment + alignment;
239     auto testData = std::make_unique<uint8_t[]>(space);
240     void* alignedPtr = testData.get();
241     CHECK(std::align(alignment, totalBytes, alignedPtr, space));
242 
243     // Import memory
244     CHECK_THROWS_AS(handle->CanBeImported(alignedPtr, armnn::MemorySource::Undefined), MemoryImportException);
245 
246 }
247 
248 TEST_CASE("ClCanBeImportedAlignedMemory")
249 {
250     ClImportTensorHandleFactory handleFactory(static_cast<MemorySourceFlags>(MemorySource::Malloc),
251                                               static_cast<MemorySourceFlags>(MemorySource::Malloc));
252 
253     TensorInfo info({ 1, 1, 1, 1 }, DataType::Float32);
254 
255     // create TensorHandle (Memory Managed status is irrelevant)
256     auto handle = handleFactory.CreateTensorHandle(info, DataLayout::NHWC);
257     // Get CLtensor
258     arm_compute::CLTensor& tensor = PolymorphicDowncast<ClImportTensorHandle*>(handle.get())->GetTensor();
259 
260     // Create an aligned buffer
261     const size_t totalBytes = tensor.info()->total_size();
262     const size_t alignment =
263             arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>();
264     size_t space = totalBytes + alignment + alignment;
265     auto testData = std::make_unique<uint8_t[]>(space);
266     void* alignedPtr = testData.get();
267     CHECK(std::align(alignment, totalBytes, alignedPtr, space));
268 
269     // Check aligned buffers return true
270     CHECK(handle->CanBeImported(alignedPtr, MemorySource::Malloc) == true);
271 
272     // Due to the nature of how GPU memory is mapped it is entirely possible for memory which is misaligned on cpu
273     // to be successfully import on GPU. As such there is no way to create a misaligned pointer that will always fail.
274     // Rather it will succeed on some devices and fail on others. As long as a correctly aligned buffer returns true
275     // we can be confident that it will be successfully imported. All other cases will need to be handled by the user.
276 }
277 
278 TEST_CASE_FIXTURE(ClContextControlFixture, "ClForceImportConv2dEndToEnd")
279 {
280     // Create runtime in which test will run
281     IRuntime::CreationOptions options;
282     IRuntimePtr runtime(armnn::IRuntime::Create(options));
283 
284     // build up the structure of the network
285     INetworkPtr network(INetwork::Create());
286 
287     armnn::TensorInfo inputInfo({ 1, 3, 4, 1 }, DataType::Float32);
288     armnn::TensorInfo kernelInfo({ 1, 3, 3, 1 }, DataType::Float32);
289     armnn::TensorInfo outputInfo({ 1, 3, 4, 1 }, DataType::Float32);
290 
291     kernelInfo.SetConstant(true);
292 
293     std::vector<float> kernel =
294     {
295         4, 5, 6,
296         0, 0, 0,
297         3, 2, 1
298     };
299 
300     const std::vector<float> expectedOutput =
301     {
302         23, 41, 33, 21,
303         44, 65, 76, 52,
304         82, 85, 79, 42
305     };
306 
307     unsigned int numElements = inputInfo.GetNumElements();
308     size_t totalBytes = numElements * sizeof(float);
309 
310     IConnectableLayer* const inputLayer = network->AddInputLayer(0, "input");
311     ARMNN_ASSERT(inputLayer);
312 
313     armnn::ConstTensor weights(kernelInfo, kernel);
314 
315     armnn::Convolution2dDescriptor convDesc2d;
316     convDesc2d.m_StrideX = 1;
317     convDesc2d.m_StrideY = 1;
318     convDesc2d.m_PadLeft = 1;
319     convDesc2d.m_PadRight = 1;
320     convDesc2d.m_PadTop = 1;
321     convDesc2d.m_PadBottom = 1;
322     convDesc2d.m_DataLayout = DataLayout::NHWC;
323 
324     armnn::IConnectableLayer* const convLayer = network->AddConvolution2dLayer(convDesc2d, "conv");
325     armnn::IConnectableLayer* weightsLayer = network->AddConstantLayer(weights);
326 
327     ARMNN_ASSERT(convLayer);
328 
329     weightsLayer->GetOutputSlot(0).SetTensorInfo(weights.GetInfo());
330     weightsLayer->GetOutputSlot(0).Connect(convLayer->GetInputSlot(1u));
331 
332     inputLayer->GetOutputSlot(0).Connect(convLayer->GetInputSlot(0));
333     inputLayer->GetOutputSlot(0).SetTensorInfo(inputInfo);
334 
335     IConnectableLayer* output = network->AddOutputLayer(0, "output");
336     convLayer->GetOutputSlot(0).Connect(output->GetInputSlot(0));
337     convLayer->GetOutputSlot(0).SetTensorInfo(outputInfo);
338 
339     // Optimize the network
340     OptimizerOptionsOpaque optOptions;
341     optOptions.SetImportEnabled(false);
342     optOptions.SetExportEnabled(false);
343     std::vector<armnn::BackendId> backends = {armnn::Compute::GpuAcc};
344     IOptimizedNetworkPtr optNet = Optimize(*network, backends, runtime->GetDeviceSpec(), optOptions);
345     CHECK(optNet);
346 
347     // Loads it into the runtime.
348     NetworkId netId;
349     std::string ignoredErrorMessage;
350     // Enable Importing
351     INetworkProperties networkProperties(false, MemorySource::Undefined, MemorySource::Undefined);
352     runtime->LoadNetwork(netId, std::move(optNet), ignoredErrorMessage, networkProperties);
353 
354     // Creates structures for input & output
355     const size_t alignment =
356         arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>();
357     size_t space = totalBytes + alignment + alignment;
358     auto inputData = std::make_unique<uint8_t[]>(space);
359     void* alignedInputPtr = inputData.get();
360     CHECK(std::align(alignment, totalBytes, alignedInputPtr, space));
361 
362     // Input with negative values
363     auto* inputPtr = reinterpret_cast<float*>(alignedInputPtr);
364     inputPtr[0] = 1;
365     inputPtr[1] = 5;
366     inputPtr[2] = 2;
367     inputPtr[3] = 3;
368     inputPtr[4] = 8;
369     inputPtr[5] = 7;
370     inputPtr[6] = 3;
371     inputPtr[7] = 6;
372     inputPtr[8] = 3;
373     inputPtr[9] = 3;
374     inputPtr[10] = 9;
375     inputPtr[11] = 1;
376 
377 
378     auto outputData = std::make_unique<uint8_t[]>(space);
379     void* alignedOutputPtr = outputData.get();
380     CHECK(std::align(alignment, totalBytes, alignedOutputPtr, space));
381     auto* outputPtr = reinterpret_cast<float*>(alignedOutputPtr);
382     std::fill_n(outputPtr, numElements, -10.0f);
383 
384     TensorInfo inputTensorInfo = runtime->GetInputTensorInfo(netId, 0);
385     inputTensorInfo.SetConstant(true);
386     InputTensors inputTensors
387     {
388         {0,armnn::ConstTensor(inputTensorInfo, alignedInputPtr)},
389     };
390     OutputTensors outputTensors
391     {
392         {0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), alignedOutputPtr)}
393     };
394 
395     runtime->GetProfiler(netId)->EnableProfiling(true);
396 
397     INFO("Run ImportInputs");
398     std::vector<ImportedInputId> importedInputIds =
399         runtime->ImportInputs(netId, inputTensors, MemorySource::Malloc);
400     // We expect the import to have succeeded.
401     CHECK(importedInputIds.size() == 1);
402     std::vector<ImportedOutputId> importedOutputIds =
403         runtime->ImportOutputs(netId, outputTensors, MemorySource::Malloc);
404     // We expect the import to have succeeded.
405     CHECK(importedOutputIds.size() == 1);
406     // Do the inference
407     runtime->EnqueueWorkload(netId, InputTensors(), OutputTensors(), importedInputIds, importedOutputIds);
408 
409     // Retrieve the Profiler.Print() output to get the workload execution
410     ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
411     std::stringstream ss;
412     profilerManager.GetProfiler()->Print(ss);;
413     std::string dump = ss.str();
414 
415     // Contains Convolution2dWorkload
416     std::size_t found = dump.find("Convolution2dWorkload");
417     CHECK(found != std::string::npos);
418 
419     // Contains SyncMemGeneric
420     found = dump.find("SyncMemGeneric");
421     CHECK(found != std::string::npos);
422 
423     // Does not contain CopyMemGeneric
424     found = dump.find("CopyMemGeneric");
425     CHECK(found == std::string::npos);
426 
427     runtime->UnloadNetwork(netId);
428 
429     // Check output is as expected
430     // Validate result by checking that the output has no negative values
431     auto* outputResult = reinterpret_cast<float*>(alignedOutputPtr);
432     CHECK(outputResult);
433 
434     // Check the output is correct
435     CHECK(std::equal(outputResult, outputResult + numElements, expectedOutput.begin(), expectedOutput.end()));
436 }
437 
438 TEST_CASE_FIXTURE(ClContextControlFixture, "ClForceImportConvertFp16toFp32EndToEnd")
439 {
440     using namespace half_float::literal;
441 
442     // Create runtime in which test will run
443     IRuntime::CreationOptions options;
444     IRuntimePtr runtime(armnn::IRuntime::Create(options));
445 
446     // build up the structure of the network
447     NetworkImpl network;
448 
449     armnn::TensorInfo inputInfo({1, 3, 2, 3}, armnn::DataType::Float16);
450     armnn::TensorInfo outputTensorInfo({1, 3, 2, 3}, armnn::DataType::Float32);
451 
452     std::vector<float> expectedOutput =
453     {
454         -37.5f, -15.2f, -8.76f, -2.0f, -1.5f, -1.3f, -0.5f, -0.4f, 0.0f,
455         1.0f, 0.4f, 0.5f, 1.3f, 1.5f, 2.0f, 8.76f, 15.2f, 37.5f
456     };
457 
458     unsigned int numElements = inputInfo.GetNumElements();
459     size_t totalBytesInput = numElements * sizeof(Half);
460     size_t totalBytesOutput = numElements * sizeof(float);
461 
462     IConnectableLayer* const inputLayer = network.AddInputLayer(0, "input");
463     ARMNN_ASSERT(inputLayer);
464 
465     armnn::IConnectableLayer* const convLayer = network.AddConvertFp16ToFp32Layer("convert");
466     ARMNN_ASSERT(convLayer);
467 
468     inputLayer->GetOutputSlot(0).Connect(convLayer->GetInputSlot(0));
469     inputLayer->GetOutputSlot(0).SetTensorInfo(inputInfo);
470 
471     IConnectableLayer* output = network.AddOutputLayer(0, "output");
472     convLayer->GetOutputSlot(0).Connect(output->GetInputSlot(0));
473     convLayer->GetOutputSlot(0).SetTensorInfo(outputTensorInfo);
474 
475     // Optimize the network
476     OptimizerOptionsOpaque optOptions;
477     optOptions.SetImportEnabled(false);
478     optOptions.SetExportEnabled(false);
479     std::vector<armnn::BackendId> backends = {armnn::Compute::GpuAcc};
480     IOptimizedNetworkPtr optNet = Optimize(network.GetGraph(), backends, runtime->GetDeviceSpec(), optOptions);
481     CHECK(optNet);
482 
483     // Loads it into the runtime.
484     NetworkId netId;
485     std::string ignoredErrorMessage;
486     // Enable Importing
487     INetworkProperties networkProperties(false, MemorySource::Undefined, MemorySource::Undefined);
488     runtime->LoadNetwork(netId, std::move(optNet), ignoredErrorMessage, networkProperties);
489 
490     // Creates structures for input & output
491     const size_t alignment =
492         arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>();
493     size_t spaceInput = totalBytesInput + alignment + alignment;
494     size_t spaceOutput = totalBytesOutput + alignment + alignment;
495     auto inputData = std::make_unique<uint8_t[]>(spaceInput);
496     void* alignedInputPtr = inputData.get();
497     CHECK(std::align(alignment, totalBytesInput, alignedInputPtr, spaceInput));
498 
499     // Input with negative values
500     auto* inputPtr = reinterpret_cast<Half*>(alignedInputPtr);
501     inputPtr[0] = -37.5_h;
502     inputPtr[1] = -15.2_h;
503     inputPtr[2] = -8.76_h;
504     inputPtr[3] = -2.0_h;
505     inputPtr[4] = -1.5_h;
506     inputPtr[5] = -1.3_h;
507     inputPtr[6] = -0.5_h;
508     inputPtr[7] = -0.4_h;
509     inputPtr[8] = 0.0_h;
510     inputPtr[9] = 1.0_h;
511     inputPtr[10] = 0.4_h;
512     inputPtr[11] = 0.5_h;
513     inputPtr[12] = 1.3_h;
514     inputPtr[13] = 1.5_h;
515     inputPtr[14] = 2.0_h;
516     inputPtr[15] = 8.76_h;
517     inputPtr[16] = 15.2_h;
518     inputPtr[17] = 37.5_h;
519 
520     auto outputData = std::make_unique<uint8_t[]>(spaceOutput);
521     void* alignedOutputPtr = outputData.get();
522     CHECK(std::align(alignment, totalBytesOutput, alignedOutputPtr, spaceOutput));
523     auto* outputPtr = reinterpret_cast<float*>(alignedOutputPtr);
524     std::fill_n(outputPtr, numElements, -10.0f);
525 
526     TensorInfo inputTensorInfo = runtime->GetInputTensorInfo(netId, 0);
527     inputTensorInfo.SetConstant(true);
528     InputTensors inputTensors
529     {
530         {0,armnn::ConstTensor(inputTensorInfo, alignedInputPtr)},
531     };
532     OutputTensors outputTensors
533     {
534         {0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), alignedOutputPtr)}
535     };
536 
537     runtime->GetProfiler(netId)->EnableProfiling(true);
538 
539     INFO("Run ImportInputs");
540     std::vector<ImportedInputId> importedInputIds =
541         runtime->ImportInputs(netId, inputTensors, MemorySource::Malloc);
542     // We expect the import to have succeeded.
543     CHECK(importedInputIds.size() == 1);
544     std::vector<ImportedOutputId> importedOutputIds =
545         runtime->ImportOutputs(netId, outputTensors, MemorySource::Malloc);
546     // We expect the import to have succeeded.
547     CHECK(importedOutputIds.size() == 1);
548 
549     // Do the inference
550     runtime->EnqueueWorkload(netId, InputTensors(), OutputTensors(), importedInputIds, importedOutputIds);
551 
552     // Retrieve the Profiler.Print() output to get the workload execution
553     ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
554     std::stringstream ss;
555     profilerManager.GetProfiler()->Print(ss);;
556     std::string dump = ss.str();
557 
558     // Contains Convolution2dWorkload
559     std::size_t found = dump.find("ConvertFp16ToFp32Workload");
560     CHECK(found != std::string::npos);
561 
562     // Contains SyncMemGeneric
563     found = dump.find("SyncMemGeneric");
564     CHECK(found != std::string::npos);
565 
566     // Does not contain CopyMemGeneric
567     found = dump.find("CopyMemGeneric");
568     CHECK(found == std::string::npos);
569 
570     runtime->UnloadNetwork(netId);
571 
572     // Check output is as expected
573     // Validate result by checking that the output has no negative values
574     auto* outputResult = reinterpret_cast<float*>(alignedOutputPtr);
575     CHECK(outputResult);
576 
577     // Check the output is correct
578     for (size_t i = 0; i < numElements; ++i)
579     {
580         DOCTEST_CHECK_MESSAGE(outputResult[i] == doctest::Approx(expectedOutput[i]).epsilon(0.0004),
581                               "outputValue[" << i << "]: " << outputResult[i] << " != " << expectedOutput[i]);
582     }
583 }
584 
585 
586 TEST_CASE_FIXTURE(ClContextControlFixture, "ClForceImportConvertFp32toFp16EndToEnd")
587 {
588     using namespace half_float::literal;
589 
590     // Create runtime in which test will run
591     IRuntime::CreationOptions options;
592     IRuntimePtr runtime(armnn::IRuntime::Create(options));
593 
594     // build up the structure of the network
595     NetworkImpl network;
596 
597     armnn::TensorInfo inputInfo({1, 3, 2, 3}, armnn::DataType::Float32);
598     armnn::TensorInfo outputTensorInfo({1, 3, 2, 3}, armnn::DataType::Float16);
599 
600     std::vector<Half> expectedOutput =
601     {
602         -37.5_h, -15.2_h, -8.76_h, -2.0_h, -1.5_h, -1.3_h, -0.5_h, -0.4_h, 0.0_h,
603         1.0_h, 0.4_h, 0.5_h, 1.3_h, 1.5_h, 2.0_h, 8.76_h, 15.2_h, 37.5_h
604     };
605 
606     unsigned int numElements = inputInfo.GetNumElements();
607     size_t totalBytesInput = numElements * sizeof(float);
608     size_t totalBytesOutput = numElements * sizeof(Half);
609 
610     IConnectableLayer* const inputLayer = network.AddInputLayer(0, "input");
611     ARMNN_ASSERT(inputLayer);
612 
613     armnn::IConnectableLayer* const convLayer = network.AddConvertFp32ToFp16Layer("convert");
614     ARMNN_ASSERT(convLayer);
615 
616     inputLayer->GetOutputSlot(0).Connect(convLayer->GetInputSlot(0));
617     inputLayer->GetOutputSlot(0).SetTensorInfo(inputInfo);
618 
619     IConnectableLayer* output = network.AddOutputLayer(0, "output");
620     convLayer->GetOutputSlot(0).Connect(output->GetInputSlot(0));
621     convLayer->GetOutputSlot(0).SetTensorInfo(outputTensorInfo);
622 
623     // Optimize the network
624     OptimizerOptionsOpaque optOptions;
625     optOptions.SetImportEnabled(false);
626     optOptions.SetExportEnabled(false);
627     std::vector<armnn::BackendId> backends = {armnn::Compute::GpuAcc};
628     IOptimizedNetworkPtr optNet = Optimize(network.GetGraph(), backends, runtime->GetDeviceSpec(), optOptions);
629     CHECK(optNet);
630 
631     // Loads it into the runtime.
632     NetworkId netId;
633     std::string ignoredErrorMessage;
634     // Enable Importing
635     INetworkProperties networkProperties(false, MemorySource::Undefined, MemorySource::Undefined);
636     runtime->LoadNetwork(netId, std::move(optNet), ignoredErrorMessage, networkProperties);
637 
638     // Creates structures for input & output
639     const size_t alignment =
640         arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>();
641     size_t spaceInput = totalBytesInput + alignment + alignment;
642     size_t spaceOutput = totalBytesOutput + alignment + alignment;
643     auto inputData = std::make_unique<uint8_t[]>(spaceInput);
644     void* alignedInputPtr = inputData.get();
645     CHECK(std::align(alignment, totalBytesInput, alignedInputPtr, spaceInput));
646 
647     // Input with negative values
648     auto* inputPtr = reinterpret_cast<float*>(alignedInputPtr);
649     inputPtr[0] = -37.5f;
650     inputPtr[1] = -15.2f;
651     inputPtr[2] = -8.76f;
652     inputPtr[3] = -2.0f;
653     inputPtr[4] = -1.5f;
654     inputPtr[5] = -1.3f;
655     inputPtr[6] = -0.5f;
656     inputPtr[7] = -0.4f;
657     inputPtr[8] = 0.0f;
658     inputPtr[9] = 1.0f;
659     inputPtr[10] = 0.4f;
660     inputPtr[11] = 0.5f;
661     inputPtr[12] = 1.3f;
662     inputPtr[13] = 1.5f;
663     inputPtr[14] = 2.0f;
664     inputPtr[15] = 8.76f;
665     inputPtr[16] = 15.2f;
666     inputPtr[17] = 37.5f;
667 
668     auto outputData = std::make_unique<uint8_t[]>(spaceOutput);
669     void* alignedOutputPtr = outputData.get();
670     CHECK(std::align(alignment, totalBytesOutput, alignedOutputPtr, spaceOutput));
671     auto* outputPtr = reinterpret_cast<Half*>(alignedOutputPtr);
672     std::fill_n(outputPtr, numElements, -10.0f);
673 
674     TensorInfo inputTensorInfo = runtime->GetInputTensorInfo(netId, 0);
675     inputTensorInfo.SetConstant(true);
676     InputTensors inputTensors
677     {
678         {0,armnn::ConstTensor(inputTensorInfo, alignedInputPtr)},
679     };
680     OutputTensors outputTensors
681     {
682         {0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), alignedOutputPtr)}
683     };
684 
685     runtime->GetProfiler(netId)->EnableProfiling(true);
686 
687     INFO("Run ImportInputs");
688     std::vector<ImportedInputId> importedInputIds =
689         runtime->ImportInputs(netId, inputTensors, MemorySource::Malloc);
690     // We expect the import to have succeeded.
691     CHECK(importedInputIds.size() == 1);
692     std::vector<ImportedOutputId> importedOutputIds =
693         runtime->ImportOutputs(netId, outputTensors, MemorySource::Malloc);
694     // We expect the import to have succeeded.
695     CHECK(importedOutputIds.size() == 1);
696 
697     // Do the inference
698     runtime->EnqueueWorkload(netId, InputTensors(), OutputTensors(), importedInputIds, importedOutputIds);
699 
700     // Retrieve the Profiler.Print() output to get the workload execution
701     ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
702     std::stringstream ss;
703     profilerManager.GetProfiler()->Print(ss);;
704     std::string dump = ss.str();
705 
706     // Contains Convolution2dWorkload
707     std::size_t found = dump.find("ConvertFp32ToFp16Workload");
708     CHECK(found != std::string::npos);
709 
710     // Contains SyncMemGeneric
711     found = dump.find("SyncMemGeneric");
712     CHECK(found != std::string::npos);
713 
714     // Does not contain CopyMemGeneric
715     found = dump.find("CopyMemGeneric");
716     CHECK(found == std::string::npos);
717 
718     runtime->UnloadNetwork(netId);
719 
720     // Check output is as expected
721     // Validate result by checking that the output has no negative values
722     auto* outputResult = reinterpret_cast<Half*>(alignedOutputPtr);
723     CHECK(outputResult);
724 
725     // Check the output is correct
726     CHECK(std::equal(outputResult, outputResult + numElements, expectedOutput.begin(), expectedOutput.end()));
727 }
728 
729 TEST_CASE_FIXTURE(ClContextControlFixture, "ClForceImportSimpleConvertFp32toFp16EndToEnd")
730 {
731     using namespace half_float::literal;
732     // End-to-end check: one Float32 value is converted to Float16 on GpuAcc using pre-imported input/output buffers.
733     // Create runtime in which test will run
734     IRuntime::CreationOptions options;
735     IRuntimePtr runtime(armnn::IRuntime::Create(options));
736 
737     // Build up the structure of the network: input -> ConvertFp32ToFp16 -> output
738     NetworkImpl network;
739 
740     armnn::TensorInfo inputInfo({1}, armnn::DataType::Float32);
741     armnn::TensorInfo outputTensorInfo({1}, armnn::DataType::Float16);
742 
743     std::vector<Half> expectedOutput = { 1.0_h }; // half-precision counterpart of the 1.0f input written below
744 
745     unsigned int numElements = inputInfo.GetNumElements();
746     size_t totalBytesInput = numElements * sizeof(float);
747     size_t totalBytesOutput = numElements * sizeof(Half);
748 
749     IConnectableLayer* const inputLayer = network.AddInputLayer(0, "input");
750     ARMNN_ASSERT(inputLayer);
751 
752     armnn::IConnectableLayer* const convLayer = network.AddConvertFp32ToFp16Layer("convert");
753     ARMNN_ASSERT(convLayer);
754 
755     inputLayer->GetOutputSlot(0).Connect(convLayer->GetInputSlot(0));
756     inputLayer->GetOutputSlot(0).SetTensorInfo(inputInfo);
757 
758     IConnectableLayer* output = network.AddOutputLayer(0, "output");
759     convLayer->GetOutputSlot(0).Connect(output->GetInputSlot(0));
760     convLayer->GetOutputSlot(0).SetTensorInfo(outputTensorInfo);
761 
762     // Optimize the network for GpuAcc, with import and export disabled in the optimizer options
763     OptimizerOptionsOpaque optOptions;
764     optOptions.SetImportEnabled(false);
765     optOptions.SetExportEnabled(false);
766     std::vector<armnn::BackendId> backends = {armnn::Compute::GpuAcc};
767     IOptimizedNetworkPtr optNet = Optimize(network.GetGraph(), backends, runtime->GetDeviceSpec(), optOptions);
768     CHECK(optNet);
769 
770     // Loads it into the runtime.
771     NetworkId netId;
772     std::string ignoredErrorMessage;
773     // Both memory sources are Undefined at load time; importing is requested explicitly via ImportInputs/ImportOutputs
774     INetworkProperties networkProperties(false, MemorySource::Undefined, MemorySource::Undefined);
775     runtime->LoadNetwork(netId, std::move(optNet), ignoredErrorMessage, networkProperties);
776 
777     // Creates structures for input & output, over-allocated by two alignment units so std::align can fit the payload
778     const size_t alignment =
779         arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>();
780     size_t spaceInput = totalBytesInput + alignment + alignment;
781     size_t spaceOutput = totalBytesOutput + alignment + alignment;
782     auto inputData = std::make_unique<uint8_t[]>(spaceInput);
783     void* alignedInputPtr = inputData.get();
784     CHECK(std::align(alignment, totalBytesInput, alignedInputPtr, spaceInput));
785 
786     // Single positive input value
787     auto* inputPtr = reinterpret_cast<float*>(alignedInputPtr);
788     inputPtr[0] = 1.0f;
789 
790     auto outputData = std::make_unique<uint8_t[]>(spaceOutput);
791     void* alignedOutputPtr = outputData.get();
792     CHECK(std::align(alignment, totalBytesOutput, alignedOutputPtr, spaceOutput));
793     auto* outputPtr = reinterpret_cast<Half*>(alignedOutputPtr);
794     std::fill_n(outputPtr, numElements, -10.0f); // sentinel that differs from the expected result
795 
796     TensorInfo inputTensorInfo = runtime->GetInputTensorInfo(netId, 0);
797     inputTensorInfo.SetConstant(true);
798     InputTensors inputTensors
799     {
800         {0,armnn::ConstTensor(inputTensorInfo, alignedInputPtr)},
801     };
802     OutputTensors outputTensors
803     {
804         {0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), alignedOutputPtr)}
805     };
806 
807     runtime->GetProfiler(netId)->EnableProfiling(true);
808 
809     INFO("Run ImportInputs");
810     std::vector<ImportedInputId> importedInputIds =
811         runtime->ImportInputs(netId, inputTensors, MemorySource::Malloc);
812     CHECK(importedInputIds.size() == 1);
813     std::vector<ImportedOutputId> importedOutputIds =
814         runtime->ImportOutputs(netId, outputTensors, MemorySource::Malloc);
815     CHECK(importedOutputIds.size() == 1);
816 
817     // Do the inference using only the pre-imported ids (empty InputTensors/OutputTensors are passed)
818     runtime->EnqueueWorkload(netId, InputTensors(), OutputTensors(), importedInputIds, importedOutputIds);
819 
820     // Retrieve the Profiler.Print() output to get the workload execution
821     ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
822     std::stringstream ss;
823     profilerManager.GetProfiler()->Print(ss);; // NOTE(review): the second ';' is a stray empty statement
824     std::string dump = ss.str();
825 
826     // Contains ConvertFp32ToFp16Workload
827     std::size_t found = dump.find("ConvertFp32ToFp16Workload");
828     CHECK(found != std::string::npos);
829 
830     // Contains SyncMemGeneric
831     found = dump.find("SyncMemGeneric");
832     CHECK(found != std::string::npos);
833 
834     // Does not contain CopyMemGeneric
835     found = dump.find("CopyMemGeneric");
836     CHECK(found == std::string::npos);
837 
838     runtime->UnloadNetwork(netId);
839 
840     // Check output is as expected
841     // Validate result by comparing the imported output buffer against expectedOutput
842     auto* outputResult = reinterpret_cast<Half*>(alignedOutputPtr);
843     CHECK(outputResult);
844 
845     // Check the output is correct
846     CHECK(std::equal(outputResult, outputResult + numElements, expectedOutput.begin(), expectedOutput.end()));
847 }
848 
849 TEST_CASE_FIXTURE(ClContextControlFixture, "ClForceImportRepeatedInferencesEndToEndTest")
850 {
851 /*
852  * This test checks the Forced Import functionality when using repeated inferences that
853  * require switching from importing to copy. For the first inference we create aligned pointers and check they are
854  * imported correctly. For the second we use fresh buffers without PreImporting and expect CopyMemGeneric to appear.
855  */
856     // Create runtime in which test will run
857     IRuntime::CreationOptions options;
858     IRuntimePtr runtime(armnn::IRuntime::Create(options));
859 
860     // Build up the structure of the network: input + constant weights -> Convolution2d -> output
861     INetworkPtr network(INetwork::Create());
862 
863     armnn::TensorInfo inputInfo({ 1, 3, 4, 1 }, DataType::Float32);
864     armnn::TensorInfo kernelInfo({ 1, 3, 3, 1 }, DataType::Float32);
865     armnn::TensorInfo outputInfo({ 1, 3, 4, 1 }, DataType::Float32);
866 
867     kernelInfo.SetConstant(true);
868 
869     std::vector<float> kernel =
870     {
871         4, 5, 6,
872         0, 0, 0,
873         3, 2, 1
874     };
875 
876     const std::vector<float> expectedOutput =
877     {
878         23, 41, 33, 21,
879         44, 65, 76, 52,
880         82, 85, 79, 42
881     };
882 
883     unsigned int numElements = inputInfo.GetNumElements();
884     size_t totalBytes = numElements * sizeof(float);
885 
886     IConnectableLayer* const inputLayer = network->AddInputLayer(0, "input");
887     ARMNN_ASSERT(inputLayer);
888 
889     armnn::ConstTensor weights(kernelInfo, kernel);
890 
891     armnn::Convolution2dDescriptor convDesc2d;
892     convDesc2d.m_StrideX = 1;
893     convDesc2d.m_StrideY = 1;
894     convDesc2d.m_PadLeft = 1;
895     convDesc2d.m_PadRight = 1;
896     convDesc2d.m_PadTop = 1;
897     convDesc2d.m_PadBottom = 1;
898     convDesc2d.m_DataLayout = DataLayout::NHWC;
899     armnn::IConnectableLayer* const convLayer = network->AddConvolution2dLayer(convDesc2d, "conv");
900     ARMNN_ASSERT(convLayer);
901 
902     armnn::IConnectableLayer* weightsLayer = network->AddConstantLayer(weights);
903 
904     weightsLayer->GetOutputSlot(0).SetTensorInfo(weights.GetInfo());
905     weightsLayer->GetOutputSlot(0).Connect(convLayer->GetInputSlot(1u));
906 
907     inputLayer->GetOutputSlot(0).Connect(convLayer->GetInputSlot(0));
908     inputLayer->GetOutputSlot(0).SetTensorInfo(inputInfo);
909 
910     IConnectableLayer* output = network->AddOutputLayer(0, "output");
911     convLayer->GetOutputSlot(0).Connect(output->GetInputSlot(0));
912     convLayer->GetOutputSlot(0).SetTensorInfo(outputInfo);
913 
914     // Optimize the network for GpuAcc, with import and export disabled in the optimizer options
915     OptimizerOptionsOpaque optOptions;
916     optOptions.SetImportEnabled(false);
917     optOptions.SetExportEnabled(false);
918     std::vector<armnn::BackendId> backends = {armnn::Compute::GpuAcc};
919     IOptimizedNetworkPtr optNet = Optimize(*network, backends, runtime->GetDeviceSpec(), optOptions);
920     CHECK(optNet);
921 
922     // Loads it into the runtime.
923     NetworkId netId;
924     std::string ignoredErrorMessage;
925     // Both memory sources are Undefined at load time; importing is requested explicitly via ImportInputs/ImportOutputs
926     INetworkProperties networkProperties(false, MemorySource::Undefined, MemorySource::Undefined);
927     runtime->LoadNetwork(netId, std::move(optNet), ignoredErrorMessage, networkProperties);
928 
929     // Creates structures for input & output, over-allocated by two alignment units so std::align can fit the payload
930     const size_t alignment =
931         arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>();
932     size_t space = totalBytes + alignment + alignment;
933     auto inputData = std::make_unique<uint8_t[]>(space);
934     void* alignedInputPtr = inputData.get();
935     CHECK(std::align(alignment, totalBytes, alignedInputPtr, space));
936 
937     // Fill input with values
938     auto* inputPtr = reinterpret_cast<float*>(alignedInputPtr);
939     inputPtr[0] = 1;
940     inputPtr[1] = 5;
941     inputPtr[2] = 2;
942     inputPtr[3] = 3;
943     inputPtr[4] = 8;
944     inputPtr[5] = 7;
945     inputPtr[6] = 3;
946     inputPtr[7] = 6;
947     inputPtr[8] = 3;
948     inputPtr[9] = 3;
949     inputPtr[10] = 9;
950     inputPtr[11] = 1;
951 
952 
953     auto outputData = std::make_unique<uint8_t[]>(space);
954     void* alignedOutputPtr = outputData.get();
955     CHECK(std::align(alignment, totalBytes, alignedOutputPtr, space));
956     auto* outputPtr = reinterpret_cast<float*>(alignedOutputPtr);
957     std::fill_n(outputPtr, numElements, -10.0f); // sentinel that differs from every expected value
958 
959     TensorInfo inputTensorInfo = runtime->GetInputTensorInfo(netId, 0);
960     inputTensorInfo.SetConstant(true);
961     InputTensors inputTensors
962     {
963         {0,armnn::ConstTensor(inputTensorInfo, alignedInputPtr)},
964     };
965     OutputTensors outputTensors
966     {
967         {0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), alignedOutputPtr)}
968     };
969 
970     runtime->GetProfiler(netId)->EnableProfiling(true);
971 
972     INFO("Run ImportInputs");
973     std::vector<ImportedInputId> importedInputIds =
974         runtime->ImportInputs(netId, inputTensors, MemorySource::Malloc);
975     // We expect the import to have succeeded.
976     CHECK(importedInputIds.size() == 1);
977     std::vector<ImportedOutputId> importedOutputIds =
978         runtime->ImportOutputs(netId, outputTensors, MemorySource::Malloc);
979     // We expect the import to have succeeded.
980     CHECK(importedOutputIds.size() == 1);
981 
982     // Do the inference using only the pre-imported ids (empty InputTensors/OutputTensors are passed)
983     runtime->EnqueueWorkload(netId, InputTensors(), OutputTensors(), importedInputIds, importedOutputIds);
984 
985     // Retrieve the Profiler.AnalyzeEventsAndWriteResults() output to get the workload execution
986     ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
987     std::stringstream ss;
988     profilerManager.GetProfiler()->AnalyzeEventsAndWriteResults(ss);
989     std::string dump = ss.str();
990 
991     // Contains Convolution2dWorkload
992     std::size_t found = dump.find("Convolution2dWorkload");
993     CHECK(found != std::string::npos);
994 
995     // Contains SyncMemGeneric
996     found = dump.find("SyncMemGeneric");
997     CHECK(found != std::string::npos);
998 
999     // Does not contain CopyMemGeneric
1000     found = dump.find("CopyMemGeneric");
1001     CHECK(found == std::string::npos);
1002 
1003     // Sync the outputs so we can read the data
1004     arm_compute::CLScheduler::get().sync();
1005 
1006     // Check output is as expected
1007     auto* outputResult = reinterpret_cast<float*>(alignedOutputPtr);
1008     CHECK(outputResult);
1009     CHECK(std::equal(outputResult, outputResult + numElements, expectedOutput.begin(), expectedOutput.end()));
1010 
1011     // Repeat the inference, with new tensors and without using PreImporting to force it to fall back to copying
1012 
1013     // Creates structures for input & output (not passed through std::align). NOTE(review): `space` was reduced by the aligns above
1014     auto inputDataCopy = std::make_unique<uint8_t[]>(space);
1015     void* copyInputPtr = inputDataCopy.get();
1016 
1017     // Fill input with the same values as the first inference
1018     auto* inputCopyPtr = reinterpret_cast<float*>(copyInputPtr);
1019     inputCopyPtr[0] = 1;
1020     inputCopyPtr[1] = 5;
1021     inputCopyPtr[2] = 2;
1022     inputCopyPtr[3] = 3;
1023     inputCopyPtr[4] = 8;
1024     inputCopyPtr[5] = 7;
1025     inputCopyPtr[6] = 3;
1026     inputCopyPtr[7] = 6;
1027     inputCopyPtr[8] = 3;
1028     inputCopyPtr[9] = 3;
1029     inputCopyPtr[10] = 9;
1030     inputCopyPtr[11] = 1;
1031 
1032     // Output pre-filled with -10.0f
1033     auto outputDataCopy = std::make_unique<uint8_t[]>(space);
1034     void* copyOutputPtr = outputDataCopy.get();
1035     auto* outputCopyPtr = reinterpret_cast<float*>(copyOutputPtr);
1036     std::fill_n(outputCopyPtr, numElements, -10.0f);
1037 
1038     InputTensors inputTensorsCopy
1039     {
1040         {0,armnn::ConstTensor(inputTensorInfo, copyInputPtr)},
1041     };
1042     OutputTensors outputTensorsCopy
1043     {
1044         {0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), copyOutputPtr)}
1045     };
1046 
1047     // Do the inference without any pre-imported input/output ids
1048     runtime->EnqueueWorkload(netId, inputTensorsCopy, outputTensorsCopy);
1049     // Sync the outputs so we can read the data
1050     arm_compute::CLScheduler::get().sync();
1051 
1052     // Check the output is correct
1053     outputResult = reinterpret_cast<float*>(copyOutputPtr);
1054     CHECK(outputResult);
1055     CHECK(std::equal(outputResult, outputResult + numElements, expectedOutput.begin(), expectedOutput.end()));
1056 
1057     // Query the profiler again. NOTE(review): `ss` is not cleared, so `dump` is the first report followed by this one
1058     profilerManager.GetProfiler()->AnalyzeEventsAndWriteResults(ss);
1059     dump = ss.str();
1060 
1061     // Contains Convolution2dWorkload
1062     found = dump.find("Convolution2dWorkload");
1063     CHECK(found != std::string::npos);
1064 
1065     // Should still contain the SyncMemGeneric (it was already found in the first report)
1066     found = dump.find("SyncMemGeneric");
1067     CHECK(found != std::string::npos);
1068 
1069     // Should now also contain a CopyMemGeneric (it was absent from the first report)
1070     found = dump.find("CopyMemGeneric");
1071     CHECK(found != std::string::npos);
1072     runtime->UnloadNetwork(netId);
1073 }
1074 
1075 TEST_CASE_FIXTURE(ClContextControlFixture, "ClForceImportRepeatedInferencesInvertedEndToEndTest")
1076 {
1077 /*
1078  * This test is similar to the test above but instead of importing and then copying, we start by copying and then do
1079  * the import. The same network, input values and expected output are used for both inferences.
1080  */
1081     // Create runtime in which test will run
1082     IRuntime::CreationOptions options;
1083     IRuntimePtr runtime(armnn::IRuntime::Create(options));
1084 
1085     // Build up the structure of the network: input + constant weights -> Convolution2d -> output
1086     INetworkPtr network(INetwork::Create());
1087 
1088     armnn::TensorInfo inputInfo({ 1, 3, 4, 1 }, DataType::Float32);
1089     armnn::TensorInfo kernelInfo({ 1, 3, 3, 1 }, DataType::Float32);
1090     armnn::TensorInfo outputInfo({ 1, 3, 4, 1 }, DataType::Float32);
1091 
1092     kernelInfo.SetConstant(true);
1093 
1094     std::vector<float> kernel =
1095     {
1096         4, 5, 6,
1097         0, 0, 0,
1098         3, 2, 1
1099     };
1100 
1101     const std::vector<float> expectedOutput =
1102     {
1103         23, 41, 33, 21,
1104         44, 65, 76, 52,
1105         82, 85, 79, 42
1106     };
1107 
1108     unsigned int numElements = inputInfo.GetNumElements();
1109     size_t totalBytes = numElements * sizeof(float);
1110 
1111     IConnectableLayer* const inputLayer = network->AddInputLayer(0, "input");
1112     ARMNN_ASSERT(inputLayer);
1113 
1114     armnn::ConstTensor weights(kernelInfo, kernel);
1115 
1116     armnn::Convolution2dDescriptor convDesc2d;
1117     convDesc2d.m_StrideX = 1;
1118     convDesc2d.m_StrideY = 1;
1119     convDesc2d.m_PadLeft = 1;
1120     convDesc2d.m_PadRight = 1;
1121     convDesc2d.m_PadTop = 1;
1122     convDesc2d.m_PadBottom = 1;
1123     convDesc2d.m_DataLayout = DataLayout::NHWC;
1124 
1125     armnn::IConnectableLayer* const convLayer = network->AddConvolution2dLayer(convDesc2d, "conv");
1126     ARMNN_ASSERT(convLayer);
1127 
1128     armnn::IConnectableLayer* weightsLayer = network->AddConstantLayer(weights);
1129 
1130     weightsLayer->GetOutputSlot(0).SetTensorInfo(weights.GetInfo());
1131     weightsLayer->GetOutputSlot(0).Connect(convLayer->GetInputSlot(1u));
1132 
1133     inputLayer->GetOutputSlot(0).Connect(convLayer->GetInputSlot(0));
1134     inputLayer->GetOutputSlot(0).SetTensorInfo(inputInfo);
1135 
1136     IConnectableLayer* output = network->AddOutputLayer(0, "output");
1137     convLayer->GetOutputSlot(0).Connect(output->GetInputSlot(0));
1138     convLayer->GetOutputSlot(0).SetTensorInfo(outputInfo);
1139 
1140     // Optimize the network for GpuAcc, with import and export disabled in the optimizer options
1141     OptimizerOptionsOpaque optOptions;
1142     optOptions.SetImportEnabled(false);
1143     optOptions.SetExportEnabled(false);
1144     std::vector<armnn::BackendId> backends = {armnn::Compute::GpuAcc};
1145     IOptimizedNetworkPtr optNet = Optimize(*network, backends, runtime->GetDeviceSpec(), optOptions);
1146     CHECK(optNet);
1147 
1148     // Loads it into the runtime.
1149     NetworkId netId;
1150     std::string ignoredErrorMessage;
1151     // Both memory sources are Undefined at load time; importing is requested explicitly via ImportInputs/ImportOutputs
1152     INetworkProperties networkProperties(false, MemorySource::Undefined, MemorySource::Undefined);
1153     runtime->LoadNetwork(netId, std::move(optNet), ignoredErrorMessage, networkProperties);
1154 
1155     // Creates structures for input & output; the copy-path buffers are used as allocated (not passed through std::align)
1156     const size_t alignment =
1157         arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>();
1158     size_t space = totalBytes + alignment + alignment;
1159     auto inputData = std::make_unique<uint8_t[]>(space);
1160     void* copyInputPtr = inputData.get();
1161 
1162     // Fill input with values
1163     auto* inputPtr = reinterpret_cast<float*>(copyInputPtr);
1164     inputPtr[0] = 1;
1165     inputPtr[1] = 5;
1166     inputPtr[2] = 2;
1167     inputPtr[3] = 3;
1168     inputPtr[4] = 8;
1169     inputPtr[5] = 7;
1170     inputPtr[6] = 3;
1171     inputPtr[7] = 6;
1172     inputPtr[8] = 3;
1173     inputPtr[9] = 3;
1174     inputPtr[10] = 9;
1175     inputPtr[11] = 1;
1176 
1177     // Create output buffer and fill it with -10.0f
1178     auto outputData = std::make_unique<uint8_t[]>(space);
1179     void* copyOutputPtr = outputData.get();
1180     auto* outputPtr = reinterpret_cast<float*>(copyOutputPtr);
1181     std::fill_n(outputPtr, numElements, -10.0f);
1182 
1183     TensorInfo inputTensorInfo = runtime->GetInputTensorInfo(netId, 0);
1184     inputTensorInfo.SetConstant(true);
1185     InputTensors inputTensors
1186     {
1187         {0,armnn::ConstTensor(inputTensorInfo, copyInputPtr)},
1188     };
1189     OutputTensors outputTensors
1190     {
1191         {0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), copyOutputPtr)}
1192     };
1193 
1194     runtime->GetProfiler(netId)->EnableProfiling(true);
1195 
1196     // Do the inference without any pre-imported inputs/outputs
1197     runtime->EnqueueWorkload(netId, inputTensors, outputTensors);
1198 
1199     // Retrieve the Profiler.AnalyzeEventsAndWriteResults() output to get the workload execution
1200     ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
1201     std::stringstream ss;
1202     profilerManager.GetProfiler()->AnalyzeEventsAndWriteResults(ss);
1203     std::string dump = ss.str();
1204 
1205     // Contains Convolution2dWorkload
1206     std::size_t found = dump.find("Convolution2dWorkload");
1207     CHECK(found != std::string::npos);
1208 
1209     // Does not contain SyncMemGeneric
1210     found = dump.find("SyncMemGeneric");
1211     CHECK(found == std::string::npos);
1212 
1213     // Does contain CopyMemGeneric
1214     found = dump.find("CopyMemGeneric");
1215     CHECK(found != std::string::npos);
1216 
1217     // Sync the outputs so we can read the data
1218     arm_compute::CLScheduler::get().sync();
1219 
1220     // Check output is as expected
1221     auto* outputResult = reinterpret_cast<float*>(copyOutputPtr);
1222     CHECK(outputResult);
1223     CHECK(std::equal(outputResult, outputResult + numElements, expectedOutput.begin(), expectedOutput.end()));
1224 
1225     // Repeat the inference, with new tensors and while using pre-importing to force it to import
1226 
1227     // Creates aligned structures for input & output (each std::align call also reduces `space` by the bytes it skips)
1228     auto inputDataImport = std::make_unique<uint8_t[]>(space);
1229     void* alignedInputImportPtr = inputDataImport.get();
1230     CHECK(std::align(alignment, totalBytes, alignedInputImportPtr, space));
1231 
1232     // Fill input with the same values as the first inference
1233     auto* inputImportPtr = reinterpret_cast<float*>(alignedInputImportPtr);
1234     inputImportPtr[0] = 1;
1235     inputImportPtr[1] = 5;
1236     inputImportPtr[2] = 2;
1237     inputImportPtr[3] = 3;
1238     inputImportPtr[4] = 8;
1239     inputImportPtr[5] = 7;
1240     inputImportPtr[6] = 3;
1241     inputImportPtr[7] = 6;
1242     inputImportPtr[8] = 3;
1243     inputImportPtr[9] = 3;
1244     inputImportPtr[10] = 9;
1245     inputImportPtr[11] = 1;
1246 
1247     // Output pre-filled with -10.0f
1248     auto outputDataImport = std::make_unique<uint8_t[]>(space);
1249     void* alignedOutputImportPtr = outputDataImport.get();
1250     CHECK(std::align(alignment, totalBytes, alignedOutputImportPtr, space));
1251     auto* outputImportPtr = reinterpret_cast<float*>(alignedOutputImportPtr);
1252     std::fill_n(outputImportPtr, numElements, -10.0f);
1253 
1254     InputTensors inputTensorsImport
1255     {
1256         {0,armnn::ConstTensor(inputTensorInfo, alignedInputImportPtr)},
1257     };
1258     OutputTensors outputTensorsImport
1259     {
1260         {0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), alignedOutputImportPtr)}
1261     };
1262 
1263     INFO("Run ImportInputs");
1264     std::vector<ImportedInputId> importedInputIds =
1265         runtime->ImportInputs(netId, inputTensorsImport, MemorySource::Malloc);
1266     CHECK(importedInputIds.size() == 1);
1267     std::vector<ImportedOutputId> importedOutputIds =
1268         runtime->ImportOutputs(netId, outputTensorsImport, MemorySource::Malloc);
1269     CHECK(importedOutputIds.size() == 1);
1270 
1271     // Do the inference with pre-imported inputs/outputs
1272     runtime->EnqueueWorkload(netId, InputTensors(), OutputTensors(), importedInputIds, importedOutputIds);
1273     // Sync the outputs so we can read the data
1274     arm_compute::CLScheduler::get().sync();
1275 
1276     // Check the output is correct
1277     outputResult = reinterpret_cast<float*>(alignedOutputImportPtr);
1278     CHECK(outputResult);
1279     CHECK(std::equal(outputResult, outputResult + numElements, expectedOutput.begin(), expectedOutput.end()));
1280 
1281 
1282     // Query the profiler again. NOTE(review): `ss` is not cleared, so `dump` is the first report followed by this one
1283     profilerManager.GetProfiler()->AnalyzeEventsAndWriteResults(ss);
1284     dump = ss.str();
1285 
1286     // Contains Convolution2dWorkload
1287     found = dump.find("Convolution2dWorkload");
1288     CHECK(found != std::string::npos);
1289 
1290     // Should now contain the SyncMemGeneric (it was absent from the first report)
1291     found = dump.find("SyncMemGeneric");
1292     CHECK(found != std::string::npos);
1293 
1294     // Should still contain a CopyMemGeneric from the first inference
1295     found = dump.find("CopyMemGeneric");
1296     CHECK(found != std::string::npos);
1297     runtime->UnloadNetwork(netId);
1298 }
1299 
1300 }
1301