//
// Copyright © 2020-2023 Arm Ltd and Contributors. All rights reserved.
// SPDX-License-Identifier: MIT
//

#include <CommonTestUtils.hpp>

#include <GraphUtils.hpp>

#include <doctest/doctest.h>

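// These tests build a small Add/Sub network, assign GpuAcc as the preferred backend
// and hint the Sub layer to CpuAcc. They then check that the optimizer inserts
// MemCopy layers at the backend boundary, that the hinted layer runs as a Neon
// workload, and that results are correct with import/export enabled and disabled.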
12 TEST_SUITE("ClFallback")
13 {
14 TEST_CASE("ClImportEnabledFallbackToNeon")
15 {
16     using namespace armnn;
17 
18     IRuntime::CreationOptions options;
19     IRuntimePtr runtime(IRuntime::Create(options));
20 
21     // Builds up the structure of the network.
22     INetworkPtr net(INetwork::Create());
23 
24     IConnectableLayer* input0 = net->AddInputLayer(0, "input0");
25     IConnectableLayer* input1 = net->AddInputLayer(1, "input1");
26     IConnectableLayer* input2 = net->AddInputLayer(2, "input2");
27     IConnectableLayer* add = net->AddElementwiseBinaryLayer(BinaryOperation::Add, "add");
28     IConnectableLayer* sub = net->AddElementwiseBinaryLayer(BinaryOperation::Sub, "sub");
29     IConnectableLayer* output = net->AddOutputLayer(0, "output");
30 
31     input0->GetOutputSlot(0).Connect(add->GetInputSlot(0));
32     input1->GetOutputSlot(0).Connect(add->GetInputSlot(1));
33     input2->GetOutputSlot(0).Connect(sub->GetInputSlot(0));
34     add->GetOutputSlot(0).Connect(sub->GetInputSlot(1));
35     sub->GetOutputSlot(0).Connect(output->GetInputSlot(0));
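    // Resulting topology:
    //   input0 --.
    //            +-- add --.
    //   input1 --'          +-- sub -- output
    //   input2 -------------'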

    TensorInfo info = TensorInfo({ 1, 2, 4, 2 }, DataType::Float32);
    info.SetConstant(true);

    input0->GetOutputSlot(0).SetTensorInfo(info);
    input1->GetOutputSlot(0).SetTensorInfo(info);
    input2->GetOutputSlot(0).SetTensorInfo(info);
    add->GetOutputSlot(0).SetTensorInfo(info);
    sub->GetOutputSlot(0).SetTensorInfo(info);

    std::vector<BackendId> backends = { Compute::GpuAcc, Compute::CpuAcc };
    // Use BackendSelectionHint to specify CpuAcc for Subtraction layer
    sub->BackendSelectionHint(backends[1]);

    // optimize the network
    OptimizerOptionsOpaque optOptions;
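    // Enabling import/export lets the runtime map suitably aligned user buffers
    // directly instead of copying them into backend-managed memory.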
    optOptions.SetImportEnabled(true);
    optOptions.SetExportEnabled(true);
    IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec(), optOptions);

    Graph& graph = GetGraphForTesting(optNet.get());

    armnn::Layer* const layer0 = GetFirstLayerWithName(graph, "input0");
    armnn::Layer* const layer1 = GetFirstLayerWithName(graph, "input1");
    armnn::Layer* const layer2 = GetFirstLayerWithName(graph, "input2");
    armnn::Layer* const layer3 = GetFirstLayerWithName(graph, "add");
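    // The optimizer inserts a MemCopy layer at the backend boundary, named
    // "[ <producer> (<slot>) -> <consumer> (<slot>) ]"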
    armnn::Layer* const layer4 = GetFirstLayerWithName(graph, "[ add (0) -> sub (1) ]");
    armnn::Layer* const layer5 = GetFirstLayerWithName(graph, "sub");
    armnn::Layer* const layer6 = GetFirstLayerWithName(graph, "output");

    // Checks order is valid.
    CHECK(CheckOrder(graph, layer0, layer1));
    CHECK(CheckOrder(graph, layer1, layer2));
    CHECK(CheckOrder(graph, layer2, layer3));
    CHECK(CheckOrder(graph, layer3, layer4));
    CHECK(CheckOrder(graph, layer4, layer5));
    CHECK(CheckOrder(graph, layer5, layer6));

    // A MemCopy layer bridges the GpuAcc and CpuAcc backends
    CHECK((layer4->GetType() == LayerType::MemCopy));

    // The backend hint was honoured
    CHECK((layer5->GetBackendId() == Compute::CpuAcc));

    // Load it into the runtime. It should pass.
    NetworkId netId;
    std::string ignoredErrorMessage;
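    // Malloc memory sources mark the I/O as host-allocated pointers that the
    // runtime may import/export rather than copy.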
    INetworkProperties networkProperties(false, MemorySource::Malloc, MemorySource::Malloc);
    runtime->LoadNetwork(netId, std::move(optNet), ignoredErrorMessage, networkProperties);

    // Creates structures for input & output
    std::vector<float> inputValue0
    {
        1.0f, 1.0f, 2.0f, 2.0f, 2.0f, 3.0f, 4.0f, 4.0f, 5.0f, 5.0f, 6.0f, 6.0f, 1.0f, 1.0f, 2.0f, 2.0f
    };
    std::vector<float> inputValue1
    {
        0.0f, 1.0f, 1.0f, 2.0f, 3.0f, 3.0f, 3.0f, 4.0f, 4.0f, 5.0f, 5.0f, 6.0f, 0.0f, 1.0f, 1.0f, 2.0f
    };
    std::vector<float> inputData2
    {
        12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 12.0f, 11.0f, 10.0f, 9.0f
    };

    std::vector<float> outputData(16);

    std::vector<float> expectedOutput
    {
        11.0f, 9.0f, 7.0f, 5.0f, 3.0f, 1.0f, -1.0f, -3.0f, -5.0f, -7.0f, -9.0f, -11.0f, 11.0f, 9.0f, 7.0f, 5.0f
    };

    // Prepare aligned data
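    // The CL backend can only import buffers that meet its alignment requirement,
    // so over-allocate and use std::align to get 64-byte aligned pointers.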
    unsigned int numElements = info.GetNumElements();
    size_t totalBytes = numElements * sizeof(float);
    const size_t alignment = 64;
    size_t space = totalBytes + alignment + alignment;
    auto inputData0 = std::make_unique<uint8_t[]>(space);
    void* alignedInputPtr0 = inputData0.get();
    CHECK(std::align(alignment, totalBytes, alignedInputPtr0, space));

    auto* inputPtr0 = reinterpret_cast<float*>(alignedInputPtr0);
    std::copy(inputValue0.begin(), inputValue0.end(), inputPtr0);

    auto inputData1 = std::make_unique<uint8_t[]>(space);
    void* alignedInputPtr1 = inputData1.get();
    CHECK(std::align(alignment, totalBytes, alignedInputPtr1, space));

    auto* inputPtr1 = reinterpret_cast<float*>(alignedInputPtr1);
    std::copy(inputValue1.begin(), inputValue1.end(), inputPtr1);

    InputTensors inputTensors
    {
        { 0, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), alignedInputPtr0) },
        { 1, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 1), alignedInputPtr1) },
        { 2, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 2), inputData2.data()) }
    };
    OutputTensors outputTensors
    {
        { 0, armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), outputData.data()) }
    };

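    // Enable profiling so workload names appear in the profiler dump checked below.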
    runtime->GetProfiler(netId)->EnableProfiling(true);

    // Do the inference
    runtime->EnqueueWorkload(netId, inputTensors, outputTensors);

    // Retrieve the Profiler.Print() output to get the workload execution
    ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
    std::stringstream ss;
    profilerManager.GetProfiler()->Print(ss);
    std::string dump = ss.str();

    // The subtraction was executed on CpuAcc as a Neon workload
    std::size_t found = dump.find("NeonSubtractionWorkload_Execute");
    CHECK(found != std::string::npos);

    // The boundary MemCopy was executed as CopyMemGeneric
    found = dump.find("CopyMemGeneric");
    CHECK(found != std::string::npos);

    // Check output is as expected
    CHECK(outputData == expectedOutput);

    runtime->UnloadNetwork(netId);
}

TEST_CASE("ClImportDisabledFallbackToNeon")
{
    using namespace armnn;

    IRuntime::CreationOptions options;
    IRuntimePtr runtime(IRuntime::Create(options));

    // Builds up the structure of the network.
    INetworkPtr net(INetwork::Create());

    IConnectableLayer* input0 = net->AddInputLayer(0, "input0");
    IConnectableLayer* input1 = net->AddInputLayer(1, "input1");
    IConnectableLayer* input2 = net->AddInputLayer(2, "input2");
    IConnectableLayer* add = net->AddElementwiseBinaryLayer(BinaryOperation::Add, "add");
    IConnectableLayer* sub = net->AddElementwiseBinaryLayer(BinaryOperation::Sub, "sub");
    IConnectableLayer* output = net->AddOutputLayer(0, "output");

    input0->GetOutputSlot(0).Connect(add->GetInputSlot(0));
    input1->GetOutputSlot(0).Connect(add->GetInputSlot(1));
    input2->GetOutputSlot(0).Connect(sub->GetInputSlot(0));
    add->GetOutputSlot(0).Connect(sub->GetInputSlot(1));
    sub->GetOutputSlot(0).Connect(output->GetInputSlot(0));

    TensorInfo info = TensorInfo({ 1, 2, 3, 2 }, DataType::Float32);
    info.SetConstant(true);

    input0->GetOutputSlot(0).SetTensorInfo(info);
    input1->GetOutputSlot(0).SetTensorInfo(info);
    input2->GetOutputSlot(0).SetTensorInfo(info);
    add->GetOutputSlot(0).SetTensorInfo(info);
    sub->GetOutputSlot(0).SetTensorInfo(info);

    std::vector<BackendId> backends = { Compute::GpuAcc, Compute::CpuAcc };
    // Use BackendSelectionHint to specify CpuAcc for Subtraction layer
    sub->BackendSelectionHint(backends[1]);

    // optimize the network
    OptimizerOptionsOpaque optOptions;
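    // Import/export are left at their defaults (disabled), so ordinary heap
    // vectors can be used for the inputs and outputs below.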
    IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec(), optOptions);

    Graph& graph = GetGraphForTesting(optNet.get());

    armnn::Layer* const layer0 = GetFirstLayerWithName(graph, "input0");
    armnn::Layer* const layer1 = GetFirstLayerWithName(graph, "input1");
    armnn::Layer* const layer2 = GetFirstLayerWithName(graph, "input2");
    armnn::Layer* const layer3 = GetFirstLayerWithName(graph, "add");
    armnn::Layer* const layer4 = GetFirstLayerWithName(graph, "[ add (0) -> sub (1) ]");
    armnn::Layer* const layer5 = GetFirstLayerWithName(graph, "sub");
    armnn::Layer* const layer6 = GetFirstLayerWithName(graph, "output");

    // Checks order is valid.
    CHECK(CheckOrder(graph, layer0, layer1));
    CHECK(CheckOrder(graph, layer1, layer2));
    CHECK(CheckOrder(graph, layer2, layer3));
    CHECK(CheckOrder(graph, layer3, layer4));
    CHECK(CheckOrder(graph, layer4, layer5));
    CHECK(CheckOrder(graph, layer5, layer6));

    // A MemCopy layer bridges the GpuAcc and CpuAcc backends
    CHECK((layer4->GetType() == LayerType::MemCopy));

    // The backend hint was honoured
    CHECK((layer5->GetBackendId() == Compute::CpuAcc));

    // Load it into the runtime. It should pass.
    NetworkId netId;
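    // Plain LoadNetwork without INetworkProperties: memory import stays disabled.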
    runtime->LoadNetwork(netId, std::move(optNet));

    // Creates structures for input & output
    std::vector<float> inputData0
    {
        1.0f, 1.0f, 2.0f, 2.0f, 2.0f, 3.0f, 4.0f, 4.0f, 5.0f, 5.0f, 6.0f, 6.0f
    };
    std::vector<float> inputData1
    {
        0.0f, 1.0f, 1.0f, 2.0f, 3.0f, 3.0f, 3.0f, 4.0f, 4.0f, 5.0f, 5.0f, 6.0f
    };
    std::vector<float> inputData2
    {
        12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f
    };

    std::vector<float> outputData(12);

    std::vector<float> expectedOutput
    {
        11.0f, 9.0f, 7.0f, 5.0f, 3.0f, 1.0f, -1.0f, -3.0f, -5.0f, -7.0f, -9.0f, -11.0f
    };

    InputTensors inputTensors
    {
        { 0, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), inputData0.data()) },
        { 1, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 1), inputData1.data()) },
        { 2, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 2), inputData2.data()) }
    };
    OutputTensors outputTensors
    {
        { 0, armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), outputData.data()) }
    };

    runtime->GetProfiler(netId)->EnableProfiling(true);

    // Do the inference
    runtime->EnqueueWorkload(netId, inputTensors, outputTensors);

    // Retrieve the Profiler.Print() output to get the workload execution
    ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
    std::stringstream ss;
    profilerManager.GetProfiler()->Print(ss);
    std::string dump = ss.str();

    // The subtraction was executed on CpuAcc as a Neon workload
    std::size_t found = dump.find("NeonSubtractionWorkload_Execute");
    CHECK(found != std::string::npos);

    // The boundary MemCopy was executed as CopyMemGeneric
    found = dump.find("CopyMemGeneric");
    CHECK(found != std::string::npos);

    // Check output is as expected
    CHECK(outputData == expectedOutput);
}

285 TEST_CASE("ClImportEnabledFallbackSubgraphToNeon")
286 {
287     using namespace armnn;
288 
289     IRuntime::CreationOptions options;
290     IRuntimePtr runtime(IRuntime::Create(options));
291 
292     // Builds up the structure of the network.
293     INetworkPtr net(INetwork::Create());
294 
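    // 2x2 max pooling (the descriptor defaults to Max) with stride 2:
    // shape { 1, 2, 4, 2 } (NCHW) -> { 1, 2, 2, 1 }.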
    Pooling2dDescriptor desc;
    desc.m_PoolWidth = 2;
    desc.m_PoolHeight = 2;
    desc.m_StrideX = 2;
    desc.m_StrideY = 2;

    IConnectableLayer* input0 = net->AddInputLayer(0, "input0");
    IConnectableLayer* input1 = net->AddInputLayer(1, "input1");
    IConnectableLayer* input2 = net->AddInputLayer(2, "input2");
    IConnectableLayer* add = net->AddElementwiseBinaryLayer(BinaryOperation::Add, "add");
    IConnectableLayer* sub = net->AddElementwiseBinaryLayer(BinaryOperation::Sub, "sub");
    IConnectableLayer* pooling = net->AddPooling2dLayer(desc, "pooling");
    IConnectableLayer* output = net->AddOutputLayer(0, "output");

    input0->GetOutputSlot(0).Connect(add->GetInputSlot(0));
    input1->GetOutputSlot(0).Connect(add->GetInputSlot(1));
    input2->GetOutputSlot(0).Connect(sub->GetInputSlot(0));
    add->GetOutputSlot(0).Connect(sub->GetInputSlot(1));
    sub->GetOutputSlot(0).Connect(pooling->GetInputSlot(0));
    pooling->GetOutputSlot(0).Connect(output->GetInputSlot(0));

    TensorInfo info = TensorInfo({ 1, 2, 4, 2 }, DataType::Float32);
    info.SetConstant(true);
    TensorInfo poolingInfo = TensorInfo({ 1, 2, 2, 1 }, DataType::Float32);

    input0->GetOutputSlot(0).SetTensorInfo(info);
    input1->GetOutputSlot(0).SetTensorInfo(info);
    input2->GetOutputSlot(0).SetTensorInfo(info);
    add->GetOutputSlot(0).SetTensorInfo(info);
    sub->GetOutputSlot(0).SetTensorInfo(info);
    pooling->GetOutputSlot(0).SetTensorInfo(poolingInfo);

    std::vector<BackendId> backends = { Compute::GpuAcc, Compute::CpuAcc };
    // Use BackendSelectionHint to specify CpuAcc for Subtraction layer
    sub->BackendSelectionHint(backends[1]);

    // optimize the network
    OptimizerOptionsOpaque optOptions;
    optOptions.SetImportEnabled(true);
    optOptions.SetExportEnabled(true);
    IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec(), optOptions);

    Graph& graph = GetGraphForTesting(optNet.get());

    armnn::Layer* const layer0 = GetFirstLayerWithName(graph, "input0");
    armnn::Layer* const layer1 = GetFirstLayerWithName(graph, "input1");
    armnn::Layer* const layer2 = GetFirstLayerWithName(graph, "input2");
    armnn::Layer* const layer3 = GetFirstLayerWithName(graph, "add");
    armnn::Layer* const layer4 = GetFirstLayerWithName(graph, "[ add (0) -> sub (1) ]");
    armnn::Layer* const layer5 = GetFirstLayerWithName(graph, "sub");
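    // A second MemCopy returns sub's output to GpuAcc for the pooling layer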
    armnn::Layer* const layer6 = GetFirstLayerWithName(graph, "[ sub (0) -> pooling (0) ]");
    armnn::Layer* const layer7 = GetFirstLayerWithName(graph, "pooling");
    armnn::Layer* const layer8 = GetFirstLayerWithName(graph, "output");

    // Checks order is valid.
    CHECK(CheckOrder(graph, layer0, layer1));
    CHECK(CheckOrder(graph, layer1, layer2));
    CHECK(CheckOrder(graph, layer2, layer3));
    CHECK(CheckOrder(graph, layer3, layer4));
    CHECK(CheckOrder(graph, layer4, layer5));
    CHECK(CheckOrder(graph, layer5, layer6));
    CHECK(CheckOrder(graph, layer6, layer7));
    CHECK(CheckOrder(graph, layer7, layer8));

    // MemCopy layers bridge the GpuAcc and CpuAcc backends in both directions
    CHECK((layer4->GetType() == LayerType::MemCopy));
    CHECK((layer6->GetType() == LayerType::MemCopy));

    // The backend hint was honoured
    CHECK((layer5->GetBackendId() == Compute::CpuAcc));

    // Load it into the runtime. It should pass.
    NetworkId netId;
    std::string ignoredErrorMessage;
    INetworkProperties networkProperties(false, MemorySource::Malloc, MemorySource::Malloc);
    runtime->LoadNetwork(netId, std::move(optNet), ignoredErrorMessage, networkProperties);

    // Creates structures for input & output
    std::vector<float> inputValue0
    {
        1.0f, 1.0f, 2.0f, 2.0f, 2.0f, 3.0f, 4.0f, 4.0f, 5.0f, 5.0f, 6.0f, 6.0f, 1.0f, 1.0f, 2.0f, 2.0f
    };
    std::vector<float> inputValue1
    {
        0.0f, 1.0f, 1.0f, 2.0f, 3.0f, 3.0f, 3.0f, 4.0f, 4.0f, 5.0f, 5.0f, 6.0f, 0.0f, 1.0f, 1.0f, 2.0f
    };
    std::vector<float> inputData2
    {
        12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 12.0f, 11.0f, 10.0f, 9.0f
    };

    std::vector<float> outputData(4);

    std::vector<float> expectedOutput{ 11.0f, 3.0f, -5.0f, 11.0f };

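    // Prepare 64-byte aligned input buffers for import, as in the first test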
    unsigned int numElements = info.GetNumElements();
    size_t totalBytes = numElements * sizeof(float);
    const size_t alignment = 64;
    size_t space = totalBytes + alignment + alignment;
    auto inputData0 = std::make_unique<uint8_t[]>(space);
    void* alignedInputPtr0 = inputData0.get();
    CHECK(std::align(alignment, totalBytes, alignedInputPtr0, space));

    auto* inputPtr0 = reinterpret_cast<float*>(alignedInputPtr0);
    std::copy(inputValue0.begin(), inputValue0.end(), inputPtr0);

    auto inputData1 = std::make_unique<uint8_t[]>(space);
    void* alignedInputPtr1 = inputData1.get();
    CHECK(std::align(alignment, totalBytes, alignedInputPtr1, space));

    auto* inputPtr1 = reinterpret_cast<float*>(alignedInputPtr1);
    std::copy(inputValue1.begin(), inputValue1.end(), inputPtr1);

    InputTensors inputTensors
    {
        { 0, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), alignedInputPtr0) },
        { 1, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 1), alignedInputPtr1) },
        { 2, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 2), inputData2.data()) }
    };
    OutputTensors outputTensors
    {
        { 0, armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), outputData.data()) }
    };

    runtime->GetProfiler(netId)->EnableProfiling(true);

    // Do the inference
    runtime->EnqueueWorkload(netId, inputTensors, outputTensors);

    // Retrieve the Profiler.Print() output to get the workload execution
    ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
    std::stringstream ss;
    profilerManager.GetProfiler()->Print(ss);
    std::string dump = ss.str();

    // The subtraction was executed on CpuAcc as a Neon workload
    std::size_t found = dump.find("NeonSubtractionWorkload_Execute");
    CHECK(found != std::string::npos);

    // Correctly switch back to GpuAcc for the pooling
    found = dump.find("ClPooling2dWorkload_Execute");
    CHECK(found != std::string::npos);

    // The boundary MemCopy was executed as CopyMemGeneric
    found = dump.find("CopyMemGeneric");
    CHECK(found != std::string::npos);

    // Check output is as expected
    CHECK(outputData == expectedOutput);

    runtime->UnloadNetwork(netId);
}

TEST_CASE("ClImportDisabledFallbackSubgraphToNeon")
{
    using namespace armnn;

    IRuntime::CreationOptions options;
    IRuntimePtr runtime(IRuntime::Create(options));

    // Builds up the structure of the network.
    INetworkPtr net(INetwork::Create());

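    // Default-constructed pooling descriptor; this test exercises the second
    // backend transition (CpuAcc sub back to GpuAcc pooling) rather than the
    // pooling parameters themselves.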
    Pooling2dDescriptor desc;

    IConnectableLayer* input0 = net->AddInputLayer(0, "input0");
    IConnectableLayer* input1 = net->AddInputLayer(1, "input1");
    IConnectableLayer* input2 = net->AddInputLayer(2, "input2");
    IConnectableLayer* add = net->AddElementwiseBinaryLayer(BinaryOperation::Add, "add");
    IConnectableLayer* sub = net->AddElementwiseBinaryLayer(BinaryOperation::Sub, "sub");
    IConnectableLayer* pooling = net->AddPooling2dLayer(desc, "pooling");
    IConnectableLayer* output = net->AddOutputLayer(0, "output");

    input0->GetOutputSlot(0).Connect(add->GetInputSlot(0));
    input1->GetOutputSlot(0).Connect(add->GetInputSlot(1));
    input2->GetOutputSlot(0).Connect(sub->GetInputSlot(0));
    add->GetOutputSlot(0).Connect(sub->GetInputSlot(1));
    sub->GetOutputSlot(0).Connect(pooling->GetInputSlot(0));
    pooling->GetOutputSlot(0).Connect(output->GetInputSlot(0));

    TensorInfo info = TensorInfo({ 1, 2, 3, 2 }, DataType::Float32);
    info.SetConstant(true);
    TensorInfo poolingInfo = TensorInfo({ 1, 2, 1, 1 }, DataType::Float32);

    input0->GetOutputSlot(0).SetTensorInfo(info);
    input1->GetOutputSlot(0).SetTensorInfo(info);
    input2->GetOutputSlot(0).SetTensorInfo(info);
    add->GetOutputSlot(0).SetTensorInfo(info);
    sub->GetOutputSlot(0).SetTensorInfo(info);
    pooling->GetOutputSlot(0).SetTensorInfo(poolingInfo);

    std::vector<BackendId> backends = { Compute::GpuAcc, Compute::CpuAcc };
    // Use BackendSelectionHint to specify CpuAcc for Subtraction layer
    sub->BackendSelectionHint(backends[1]);

    // optimize the network
    OptimizerOptionsOpaque optOptions;
    IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec(), optOptions);

    Graph& graph = GetGraphForTesting(optNet.get());

    armnn::Layer* const layer0 = GetFirstLayerWithName(graph, "input0");
    armnn::Layer* const layer1 = GetFirstLayerWithName(graph, "input1");
    armnn::Layer* const layer2 = GetFirstLayerWithName(graph, "input2");
    armnn::Layer* const layer3 = GetFirstLayerWithName(graph, "add");
    armnn::Layer* const layer4 = GetFirstLayerWithName(graph, "[ add (0) -> sub (1) ]");
    armnn::Layer* const layer5 = GetFirstLayerWithName(graph, "sub");
    armnn::Layer* const layer6 = GetFirstLayerWithName(graph, "[ sub (0) -> pooling (0) ]");
    armnn::Layer* const layer7 = GetFirstLayerWithName(graph, "pooling");
    armnn::Layer* const layer8 = GetFirstLayerWithName(graph, "output");

    // Checks order is valid.
    CHECK(CheckOrder(graph, layer0, layer1));
    CHECK(CheckOrder(graph, layer1, layer2));
    CHECK(CheckOrder(graph, layer2, layer3));
    CHECK(CheckOrder(graph, layer3, layer4));
    CHECK(CheckOrder(graph, layer4, layer5));
    CHECK(CheckOrder(graph, layer5, layer6));
    CHECK(CheckOrder(graph, layer6, layer7));
    CHECK(CheckOrder(graph, layer7, layer8));

    // MemCopy layers bridge the GpuAcc and CpuAcc backends in both directions
    CHECK((layer4->GetType() == LayerType::MemCopy));
    CHECK((layer6->GetType() == LayerType::MemCopy));

    // The backend hint was honoured
    CHECK((layer5->GetBackendId() == Compute::CpuAcc));

    // Load it into the runtime. It should pass.
    NetworkId netId;
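    // Plain LoadNetwork without INetworkProperties: memory import stays disabled.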
    runtime->LoadNetwork(netId, std::move(optNet));

    // Creates structures for input & output
    std::vector<float> inputData0
    {
        1.0f, 1.0f, 2.0f, 2.0f, 2.0f, 3.0f, 4.0f, 4.0f, 5.0f, 5.0f, 6.0f, 6.0f
    };
    std::vector<float> inputData1
    {
        0.0f, 1.0f, 1.0f, 2.0f, 3.0f, 3.0f, 3.0f, 4.0f, 4.0f, 5.0f, 5.0f, 6.0f
    };
    std::vector<float> inputData2
    {
        12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f
    };

    std::vector<float> outputData(2);

    std::vector<float> expectedOutput{ 11.0f, -1.0f };

    InputTensors inputTensors
    {
        { 0, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), inputData0.data()) },
        { 1, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 1), inputData1.data()) },
        { 2, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 2), inputData2.data()) }
    };
    OutputTensors outputTensors
    {
        { 0, armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), outputData.data()) }
    };

    runtime->GetProfiler(netId)->EnableProfiling(true);

    // Do the inference
    runtime->EnqueueWorkload(netId, inputTensors, outputTensors);

    // Retrieve the Profiler.Print() output to get the workload execution
    ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
    std::stringstream ss;
    profilerManager.GetProfiler()->Print(ss);
    std::string dump = ss.str();

    // The subtraction was executed on CpuAcc as a Neon workload
    std::size_t found = dump.find("NeonSubtractionWorkload_Execute");
    CHECK(found != std::string::npos);

    // Correctly switch back to GpuAcc for the pooling
    found = dump.find("ClPooling2dWorkload_Execute");
    CHECK(found != std::string::npos);

    // The boundary MemCopy was executed as CopyMemGeneric
    found = dump.find("CopyMemGeneric");
    CHECK(found != std::string::npos);

    // Check output is as expected
    CHECK(outputData == expectedOutput);
}

}