//
// Copyright © 2020-2023 Arm Ltd and Contributors. All rights reserved.
// SPDX-License-Identifier: MIT
//

#include <CommonTestUtils.hpp>

#include <GraphUtils.hpp>

#include <doctest/doctest.h>

TEST_SUITE("ClFallback")
{
TEST_CASE("ClImportEnabledFallbackToNeon")
{
    using namespace armnn;

    IRuntime::CreationOptions options;
    IRuntimePtr runtime(IRuntime::Create(options));

    // Builds up the structure of the network.
    INetworkPtr net(INetwork::Create());

    IConnectableLayer* input0 = net->AddInputLayer(0, "input0");
    IConnectableLayer* input1 = net->AddInputLayer(1, "input1");
    IConnectableLayer* input2 = net->AddInputLayer(2, "input2");
    IConnectableLayer* add = net->AddElementwiseBinaryLayer(BinaryOperation::Add, "add");
    IConnectableLayer* sub = net->AddElementwiseBinaryLayer(BinaryOperation::Sub, "sub");
    IConnectableLayer* output = net->AddOutputLayer(0, "output");

    input0->GetOutputSlot(0).Connect(add->GetInputSlot(0));
    input1->GetOutputSlot(0).Connect(add->GetInputSlot(1));
    input2->GetOutputSlot(0).Connect(sub->GetInputSlot(0));
    add->GetOutputSlot(0).Connect(sub->GetInputSlot(1));
    sub->GetOutputSlot(0).Connect(output->GetInputSlot(0));

    TensorInfo info = TensorInfo({ 1, 2, 4, 2 }, DataType::Float32);
    info.SetConstant(true);

    input0->GetOutputSlot(0).SetTensorInfo(info);
    input1->GetOutputSlot(0).SetTensorInfo(info);
    input2->GetOutputSlot(0).SetTensorInfo(info);
    add->GetOutputSlot(0).SetTensorInfo(info);
    sub->GetOutputSlot(0).SetTensorInfo(info);

    std::vector<BackendId> backends = { Compute::GpuAcc, Compute::CpuAcc };
    // Use BackendSelectionHint to specify CpuAcc for Subtraction layer
    sub->BackendSelectionHint(backends[1]);

    // optimize the network
    OptimizerOptionsOpaque optOptions;
    optOptions.SetImportEnabled(true);
    optOptions.SetExportEnabled(true);
    IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec(), optOptions);

    Graph& graph = GetGraphForTesting(optNet.get());

    armnn::Layer* const layer0 = GetFirstLayerWithName(graph, "input0");
    armnn::Layer* const layer1 = GetFirstLayerWithName(graph, "input1");
    armnn::Layer* const layer2 = GetFirstLayerWithName(graph, "input2");
    armnn::Layer* const layer3 = GetFirstLayerWithName(graph, "add");
    armnn::Layer* const layer4 = GetFirstLayerWithName(graph, "[ add (0) -> sub (1) ]");
    armnn::Layer* const layer5 = GetFirstLayerWithName(graph, "sub");
    armnn::Layer* const layer6 = GetFirstLayerWithName(graph, "output");

    // Checks order is valid.
    CHECK(CheckOrder(graph, layer0, layer1));
    CHECK(CheckOrder(graph, layer1, layer2));
    CHECK(CheckOrder(graph, layer2, layer3));
    CHECK(CheckOrder(graph, layer3, layer4));
    CHECK(CheckOrder(graph, layer4, layer5));
    CHECK(CheckOrder(graph, layer5, layer6));

    // Use memory import between backends
    CHECK((layer4->GetType() == LayerType::MemCopy));

    // Correctly use backend hint
    CHECK((layer5->GetBackendId() == Compute::CpuAcc));

    // Load it into the runtime. It should pass.
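    // With import and export enabled, the runtime reads and writes the caller's
    // buffers in place, so the input data below is copied into 64-byte-aligned,
    // Malloc-backed buffers before inference.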
    NetworkId netId;
    std::string ignoredErrorMessage;
    INetworkProperties networkProperties(false, MemorySource::Malloc, MemorySource::Malloc);
    runtime->LoadNetwork(netId, std::move(optNet), ignoredErrorMessage, networkProperties);

    // Creates structures for input & output
    std::vector<float> inputValue0
    {
        1.0f, 1.0f, 2.0f, 2.0f, 2.0f, 3.0f, 4.0f, 4.0f, 5.0f, 5.0f, 6.0f, 6.0f, 1.0f, 1.0f, 2.0f, 2.0f
    };
    std::vector<float> inputValue1
    {
        0.0f, 1.0f, 1.0f, 2.0f, 3.0f, 3.0f, 3.0f, 4.0f, 4.0f, 5.0f, 5.0f, 6.0f, 0.0f, 1.0f, 1.0f, 2.0f
    };
    std::vector<float> inputData2
    {
        12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 12.0f, 11.0f, 10.0f, 9.0f
    };

    std::vector<float> outputData(16);

    std::vector<float> expectedOutput
    {
        11.0f, 9.0f, 7.0f, 5.0f, 3.0f, 1.0f, -1.0f, -3.0f, -5.0f, -7.0f, -9.0f, -11.0f, 11.0f, 9.0f, 7.0f, 5.0f
    };

    // Prepare aligned data
    unsigned int numElements = info.GetNumElements();
    size_t totalBytes = numElements * sizeof(float);
    const size_t alignment = 64;
    size_t space = totalBytes + alignment + alignment;
    auto inputData0 = std::make_unique<uint8_t[]>(space);
    void* alignedInputPtr0 = inputData0.get();
    CHECK(std::align(alignment, totalBytes, alignedInputPtr0, space));

    auto* inputPtr0 = reinterpret_cast<float*>(alignedInputPtr0);
    std::copy(inputValue0.begin(), inputValue0.end(), inputPtr0);

    auto inputData1 = std::make_unique<uint8_t[]>(space);
    void* alignedInputPtr1 = inputData1.get();
    CHECK(std::align(alignment, totalBytes, alignedInputPtr1, space));

    auto* inputPtr1 = reinterpret_cast<float*>(alignedInputPtr1);
    std::copy(inputValue1.begin(), inputValue1.end(), inputPtr1);

    InputTensors inputTensors
    {
        { 0, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), alignedInputPtr0) },
        { 1, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 1), alignedInputPtr1) },
        { 2, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 2), inputData2.data()) }
    };
    OutputTensors outputTensors
    {
        { 0, armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), outputData.data()) }
    };

    runtime->GetProfiler(netId)->EnableProfiling(true);

    // Do the inference
    runtime->EnqueueWorkload(netId, inputTensors, outputTensors);

    // Retrieve the Profiler.Print() output to get the workload execution
    ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
    std::stringstream ss;
    profilerManager.GetProfiler()->Print(ss);
    std::string dump = ss.str();

    // Executed Subtraction using CpuAcc
    std::size_t found = dump.find("NeonSubtractionWorkload_Execute");
    CHECK(found != std::string::npos);

    // Contains CopyMemGeneric
    found = dump.find("CopyMemGeneric");
    CHECK(found != std::string::npos);

    // Check output is as expected
    CHECK(outputData == expectedOutput);

    runtime->UnloadNetwork(netId);
}

TEST_CASE("ClImportDisabledFallbackToNeon")
{
    using namespace armnn;

    IRuntime::CreationOptions options;
    IRuntimePtr runtime(IRuntime::Create(options));

    // Builds up the structure of the network.
    INetworkPtr net(INetwork::Create());

    IConnectableLayer* input0 = net->AddInputLayer(0, "input0");
    IConnectableLayer* input1 = net->AddInputLayer(1, "input1");
    IConnectableLayer* input2 = net->AddInputLayer(2, "input2");
    IConnectableLayer* add = net->AddElementwiseBinaryLayer(BinaryOperation::Add, "add");
    IConnectableLayer* sub = net->AddElementwiseBinaryLayer(BinaryOperation::Sub, "sub");
    IConnectableLayer* output = net->AddOutputLayer(0, "output");

    input0->GetOutputSlot(0).Connect(add->GetInputSlot(0));
    input1->GetOutputSlot(0).Connect(add->GetInputSlot(1));
    input2->GetOutputSlot(0).Connect(sub->GetInputSlot(0));
    add->GetOutputSlot(0).Connect(sub->GetInputSlot(1));
    sub->GetOutputSlot(0).Connect(output->GetInputSlot(0));

    TensorInfo info = TensorInfo({ 1, 2, 3, 2 }, DataType::Float32);
    info.SetConstant(true);

    input0->GetOutputSlot(0).SetTensorInfo(info);
    input1->GetOutputSlot(0).SetTensorInfo(info);
    input2->GetOutputSlot(0).SetTensorInfo(info);
    add->GetOutputSlot(0).SetTensorInfo(info);
    sub->GetOutputSlot(0).SetTensorInfo(info);

    std::vector<BackendId> backends = { Compute::GpuAcc, Compute::CpuAcc };
    // Use BackendSelectionHint to specify CpuAcc for Subtraction layer
    sub->BackendSelectionHint(backends[1]);

    // optimize the network
    OptimizerOptionsOpaque optOptions;
    IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec(), optOptions);

    Graph& graph = GetGraphForTesting(optNet.get());

    armnn::Layer* const layer0 = GetFirstLayerWithName(graph, "input0");
    armnn::Layer* const layer1 = GetFirstLayerWithName(graph, "input1");
    armnn::Layer* const layer2 = GetFirstLayerWithName(graph, "input2");
    armnn::Layer* const layer3 = GetFirstLayerWithName(graph, "add");
    armnn::Layer* const layer4 = GetFirstLayerWithName(graph, "[ add (0) -> sub (1) ]");
    armnn::Layer* const layer5 = GetFirstLayerWithName(graph, "sub");
    armnn::Layer* const layer6 = GetFirstLayerWithName(graph, "output");

    // Checks order is valid.
    CHECK(CheckOrder(graph, layer0, layer1));
    CHECK(CheckOrder(graph, layer1, layer2));
    CHECK(CheckOrder(graph, layer2, layer3));
    CHECK(CheckOrder(graph, layer3, layer4));
    CHECK(CheckOrder(graph, layer4, layer5));
    CHECK(CheckOrder(graph, layer5, layer6));

    // Use memory import between backends
    CHECK((layer4->GetType() == LayerType::MemCopy));

    // Correctly use backend hint
    CHECK((layer5->GetBackendId() == Compute::CpuAcc));

    // Load it into the runtime. It should pass.
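    // Import was not enabled when optimizing this network, so the runtime falls
    // back to copying, and plain std::vector buffers can be used directly.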
    NetworkId netId;
    runtime->LoadNetwork(netId, std::move(optNet));

    // Creates structures for input & output
    std::vector<float> inputData0
    {
        1.0f, 1.0f, 2.0f, 2.0f, 2.0f, 3.0f, 4.0f, 4.0f, 5.0f, 5.0f, 6.0f, 6.0f
    };
    std::vector<float> inputData1
    {
        0.0f, 1.0f, 1.0f, 2.0f, 3.0f, 3.0f, 3.0f, 4.0f, 4.0f, 5.0f, 5.0f, 6.0f
    };
    std::vector<float> inputData2
    {
        12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f
    };

    std::vector<float> outputData(12);

    std::vector<float> expectedOutput
    {
        11.0f, 9.0f, 7.0f, 5.0f, 3.0f, 1.0f, -1.0f, -3.0f, -5.0f, -7.0f, -9.0f, -11.0f
    };

    InputTensors inputTensors
    {
        { 0, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), inputData0.data()) },
        { 1, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 1), inputData1.data()) },
        { 2, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 2), inputData2.data()) }
    };
    OutputTensors outputTensors
    {
        { 0, armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), outputData.data()) }
    };

    runtime->GetProfiler(netId)->EnableProfiling(true);

    // Do the inference
    runtime->EnqueueWorkload(netId, inputTensors, outputTensors);

    // Retrieve the Profiler.Print() output to get the workload execution
    ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
    std::stringstream ss;
    profilerManager.GetProfiler()->Print(ss);
    std::string dump = ss.str();

    // Executed Subtraction using CpuAcc
    std::size_t found = dump.find("NeonSubtractionWorkload_Execute");
    CHECK(found != std::string::npos);

    // Contains CopyMemGeneric
    found = dump.find("CopyMemGeneric");
    CHECK(found != std::string::npos);

    // Check output is as expected
    CHECK(outputData == expectedOutput);
}

TEST_CASE("ClImportEnabledFallbackSubgraphToNeon")
{
    using namespace armnn;

    IRuntime::CreationOptions options;
    IRuntimePtr runtime(IRuntime::Create(options));

    // Builds up the structure of the network.
    INetworkPtr net(INetwork::Create());

    Pooling2dDescriptor desc;
    desc.m_PoolWidth = 2;
    desc.m_PoolHeight = 2;
    desc.m_StrideX = 2;
    desc.m_StrideY = 2;

    IConnectableLayer* input0 = net->AddInputLayer(0, "input0");
    IConnectableLayer* input1 = net->AddInputLayer(1, "input1");
    IConnectableLayer* input2 = net->AddInputLayer(2, "input2");
    IConnectableLayer* add = net->AddElementwiseBinaryLayer(BinaryOperation::Add, "add");
    IConnectableLayer* sub = net->AddElementwiseBinaryLayer(BinaryOperation::Sub, "sub");
    IConnectableLayer* pooling = net->AddPooling2dLayer(desc, "pooling");
    IConnectableLayer* output = net->AddOutputLayer(0, "output");

    input0->GetOutputSlot(0).Connect(add->GetInputSlot(0));
    input1->GetOutputSlot(0).Connect(add->GetInputSlot(1));
    input2->GetOutputSlot(0).Connect(sub->GetInputSlot(0));
    add->GetOutputSlot(0).Connect(sub->GetInputSlot(1));
    sub->GetOutputSlot(0).Connect(pooling->GetInputSlot(0));
    pooling->GetOutputSlot(0).Connect(output->GetInputSlot(0));

    TensorInfo info = TensorInfo({ 1, 2, 4, 2 }, DataType::Float32);
    info.SetConstant(true);
    TensorInfo poolingInfo = TensorInfo({ 1, 2, 2, 1 }, DataType::Float32);

    input0->GetOutputSlot(0).SetTensorInfo(info);
    input1->GetOutputSlot(0).SetTensorInfo(info);
    input2->GetOutputSlot(0).SetTensorInfo(info);
    add->GetOutputSlot(0).SetTensorInfo(info);
    sub->GetOutputSlot(0).SetTensorInfo(info);
    pooling->GetOutputSlot(0).SetTensorInfo(poolingInfo);

    std::vector<BackendId> backends = { Compute::GpuAcc, Compute::CpuAcc };
    // Use BackendSelectionHint to specify CpuAcc for Subtraction layer
    sub->BackendSelectionHint(backends[1]);

    // optimize the network
    OptimizerOptionsOpaque optOptions;
    optOptions.SetImportEnabled(true);
    optOptions.SetExportEnabled(true);
    IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec(), optOptions);

    Graph& graph = GetGraphForTesting(optNet.get());

    armnn::Layer* const layer0 = GetFirstLayerWithName(graph, "input0");
    armnn::Layer* const layer1 = GetFirstLayerWithName(graph, "input1");
    armnn::Layer* const layer2 = GetFirstLayerWithName(graph, "input2");
    armnn::Layer* const layer3 = GetFirstLayerWithName(graph, "add");
    armnn::Layer* const layer4 = GetFirstLayerWithName(graph, "[ add (0) -> sub (1) ]");
    armnn::Layer* const layer5 = GetFirstLayerWithName(graph, "sub");
    armnn::Layer* const layer6 = GetFirstLayerWithName(graph, "[ sub (0) -> pooling (0) ]");
    armnn::Layer* const layer7 = GetFirstLayerWithName(graph, "pooling");
    armnn::Layer* const layer8 = GetFirstLayerWithName(graph, "output");

    // Checks order is valid.
    CHECK(CheckOrder(graph, layer0, layer1));
    CHECK(CheckOrder(graph, layer1, layer2));
    CHECK(CheckOrder(graph, layer2, layer3));
    CHECK(CheckOrder(graph, layer3, layer4));
    CHECK(CheckOrder(graph, layer4, layer5));
    CHECK(CheckOrder(graph, layer5, layer6));
    CHECK(CheckOrder(graph, layer6, layer7));
    CHECK(CheckOrder(graph, layer7, layer8));

    // Use memory import between backends
    CHECK((layer4->GetType() == LayerType::MemCopy));
    CHECK((layer6->GetType() == LayerType::MemCopy));

    // Correctly use backend hint
    CHECK((layer5->GetBackendId() == Compute::CpuAcc));

    // Load it into the runtime. It should pass.
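    // As in ClImportEnabledFallbackToNeon, import/export requires the input
    // buffers to be Malloc-backed and 64-byte aligned.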
    NetworkId netId;
    std::string ignoredErrorMessage;
    INetworkProperties networkProperties(false, MemorySource::Malloc, MemorySource::Malloc);
    runtime->LoadNetwork(netId, std::move(optNet), ignoredErrorMessage, networkProperties);

    // Creates structures for input & output
    std::vector<float> inputValue0
    {
        1.0f, 1.0f, 2.0f, 2.0f, 2.0f, 3.0f, 4.0f, 4.0f, 5.0f, 5.0f, 6.0f, 6.0f, 1.0f, 1.0f, 2.0f, 2.0f
    };
    std::vector<float> inputValue1
    {
        0.0f, 1.0f, 1.0f, 2.0f, 3.0f, 3.0f, 3.0f, 4.0f, 4.0f, 5.0f, 5.0f, 6.0f, 0.0f, 1.0f, 1.0f, 2.0f
    };
    std::vector<float> inputData2
    {
        12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 12.0f, 11.0f, 10.0f, 9.0f
    };

    std::vector<float> outputData(4);

    std::vector<float> expectedOutput{ 11.0f, 3.0f, -5.0f, 11.0f };

    // Prepare aligned data
    unsigned int numElements = info.GetNumElements();
    size_t totalBytes = numElements * sizeof(float);
    const size_t alignment = 64;
    size_t space = totalBytes + alignment + alignment;
    auto inputData0 = std::make_unique<uint8_t[]>(space);
    void* alignedInputPtr0 = inputData0.get();
    CHECK(std::align(alignment, totalBytes, alignedInputPtr0, space));

    auto* inputPtr0 = reinterpret_cast<float*>(alignedInputPtr0);
    std::copy(inputValue0.begin(), inputValue0.end(), inputPtr0);

    auto inputData1 = std::make_unique<uint8_t[]>(space);
    void* alignedInputPtr1 = inputData1.get();
    CHECK(std::align(alignment, totalBytes, alignedInputPtr1, space));

    auto* inputPtr1 = reinterpret_cast<float*>(alignedInputPtr1);
    std::copy(inputValue1.begin(), inputValue1.end(), inputPtr1);

    InputTensors inputTensors
    {
        { 0, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), alignedInputPtr0) },
        { 1, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 1), alignedInputPtr1) },
        { 2, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 2), inputData2.data()) }
    };
    OutputTensors outputTensors
    {
        { 0, armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), outputData.data()) }
    };

    runtime->GetProfiler(netId)->EnableProfiling(true);

    // Do the inference
    runtime->EnqueueWorkload(netId, inputTensors, outputTensors);

    // Retrieve the Profiler.Print() output to get the workload execution
    ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
    std::stringstream ss;
    profilerManager.GetProfiler()->Print(ss);
    std::string dump = ss.str();

    // Executed Subtraction using CpuAcc
    std::size_t found = dump.find("NeonSubtractionWorkload_Execute");
    CHECK(found != std::string::npos);

    // Correctly switched back to GpuAcc for the pooling layer
    found = dump.find("ClPooling2dWorkload_Execute");
    CHECK(found != std::string::npos);

    // Contains CopyMemGeneric
    found = dump.find("CopyMemGeneric");
    CHECK(found != std::string::npos);

    // Check output is as expected
    CHECK(outputData == expectedOutput);

    runtime->UnloadNetwork(netId);
}

TEST_CASE("ClImportDisableFallbackSubgraphToNeon")
{
    using namespace armnn;

    IRuntime::CreationOptions options;
    IRuntimePtr runtime(IRuntime::Create(options));

    // Builds up the structure of the network.
    INetworkPtr net(INetwork::Create());

    Pooling2dDescriptor desc;

    IConnectableLayer* input0 = net->AddInputLayer(0, "input0");
    IConnectableLayer* input1 = net->AddInputLayer(1, "input1");
    IConnectableLayer* input2 = net->AddInputLayer(2, "input2");
    IConnectableLayer* add = net->AddElementwiseBinaryLayer(BinaryOperation::Add, "add");
    IConnectableLayer* sub = net->AddElementwiseBinaryLayer(BinaryOperation::Sub, "sub");
    IConnectableLayer* pooling = net->AddPooling2dLayer(desc, "pooling");
    IConnectableLayer* output = net->AddOutputLayer(0, "output");

    input0->GetOutputSlot(0).Connect(add->GetInputSlot(0));
    input1->GetOutputSlot(0).Connect(add->GetInputSlot(1));
    input2->GetOutputSlot(0).Connect(sub->GetInputSlot(0));
    add->GetOutputSlot(0).Connect(sub->GetInputSlot(1));
    sub->GetOutputSlot(0).Connect(pooling->GetInputSlot(0));
    pooling->GetOutputSlot(0).Connect(output->GetInputSlot(0));

    TensorInfo info = TensorInfo({ 1, 2, 3, 2 }, DataType::Float32);
    info.SetConstant(true);
    TensorInfo poolingInfo = TensorInfo({ 1, 2, 1, 1 }, DataType::Float32);

    input0->GetOutputSlot(0).SetTensorInfo(info);
    input1->GetOutputSlot(0).SetTensorInfo(info);
    input2->GetOutputSlot(0).SetTensorInfo(info);
    add->GetOutputSlot(0).SetTensorInfo(info);
    sub->GetOutputSlot(0).SetTensorInfo(info);
    pooling->GetOutputSlot(0).SetTensorInfo(poolingInfo);

    std::vector<BackendId> backends = { Compute::GpuAcc, Compute::CpuAcc };
    // Use BackendSelectionHint to specify CpuAcc for Subtraction layer
    sub->BackendSelectionHint(backends[1]);

    // optimize the network
    OptimizerOptionsOpaque optOptions;
    IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec(), optOptions);

    Graph& graph = GetGraphForTesting(optNet.get());

    armnn::Layer* const layer0 = GetFirstLayerWithName(graph, "input0");
    armnn::Layer* const layer1 = GetFirstLayerWithName(graph, "input1");
    armnn::Layer* const layer2 = GetFirstLayerWithName(graph, "input2");
    armnn::Layer* const layer3 = GetFirstLayerWithName(graph, "add");
    armnn::Layer* const layer4 = GetFirstLayerWithName(graph, "[ add (0) -> sub (1) ]");
    armnn::Layer* const layer5 = GetFirstLayerWithName(graph, "sub");
    armnn::Layer* const layer6 = GetFirstLayerWithName(graph, "[ sub (0) -> pooling (0) ]");
    armnn::Layer* const layer7 = GetFirstLayerWithName(graph, "pooling");
    armnn::Layer* const layer8 = GetFirstLayerWithName(graph, "output");

    // Checks order is valid.
    CHECK(CheckOrder(graph, layer0, layer1));
    CHECK(CheckOrder(graph, layer1, layer2));
    CHECK(CheckOrder(graph, layer2, layer3));
    CHECK(CheckOrder(graph, layer3, layer4));
    CHECK(CheckOrder(graph, layer4, layer5));
    CHECK(CheckOrder(graph, layer5, layer6));
    CHECK(CheckOrder(graph, layer6, layer7));
    CHECK(CheckOrder(graph, layer7, layer8));

    // Use memory import between backends
    CHECK((layer4->GetType() == LayerType::MemCopy));
    CHECK((layer6->GetType() == LayerType::MemCopy));

    // Correctly use backend hint
    CHECK((layer5->GetBackendId() == Compute::CpuAcc));

    // Load it into the runtime. It should pass.
    NetworkId netId;
    runtime->LoadNetwork(netId, std::move(optNet));

    // Creates structures for input & output
    std::vector<float> inputData0
    {
        1.0f, 1.0f, 2.0f, 2.0f, 2.0f, 3.0f, 4.0f, 4.0f, 5.0f, 5.0f, 6.0f, 6.0f
    };
    std::vector<float> inputData1
    {
        0.0f, 1.0f, 1.0f, 2.0f, 3.0f, 3.0f, 3.0f, 4.0f, 4.0f, 5.0f, 5.0f, 6.0f
    };
    std::vector<float> inputData2
    {
        12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f
    };

    std::vector<float> outputData(2);

    std::vector<float> expectedOutput{ 11.0f, -1.0f };

    InputTensors inputTensors
    {
        { 0, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), inputData0.data()) },
        { 1, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 1), inputData1.data()) },
        { 2, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 2), inputData2.data()) }
    };
    OutputTensors outputTensors
    {
        { 0, armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), outputData.data()) }
    };

    runtime->GetProfiler(netId)->EnableProfiling(true);

    // Do the inference
    runtime->EnqueueWorkload(netId, inputTensors, outputTensors);

    // Retrieve the Profiler.Print() output to get the workload execution
    ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
    std::stringstream ss;
    profilerManager.GetProfiler()->Print(ss);
    std::string dump = ss.str();

    // Executed Subtraction using CpuAcc
    std::size_t found = dump.find("NeonSubtractionWorkload_Execute");
    CHECK(found != std::string::npos);

    // Correctly switched back to GpuAcc for the pooling layer
    found = dump.find("ClPooling2dWorkload_Execute");
    CHECK(found != std::string::npos);

    // Contains CopyMemGeneric
    found = dump.find("CopyMemGeneric");
    CHECK(found != std::string::npos);

    // Check output is as expected
    CHECK(outputData == expectedOutput);
}

}