1 // 2 // Copyright © 2021, 2023 Arm Ltd and Contributors. All rights reserved. 3 // SPDX-License-Identifier: MIT 4 // 5 6 #include <arm_compute/runtime/CL/functions/CLActivationLayer.h> 7 8 #include <cl/ClImportTensorHandle.hpp> 9 #include <cl/ClImportTensorHandleFactory.hpp> 10 #include <cl/test/ClContextControlFixture.hpp> 11 12 #include <doctest/doctest.h> 13 14 #include <armnn/IRuntime.hpp> 15 #include <armnn/INetwork.hpp> 16 #include "Network.hpp" 17 18 using namespace armnn; 19 20 TEST_SUITE("ClImportTensorHandleTests") 21 { 22 TEST_CASE_FIXTURE(ClContextControlFixture, "ClMallocImport") 23 { 24 ClImportTensorHandleFactory handleFactory(static_cast<MemorySourceFlags>(MemorySource::Malloc), 25 static_cast<MemorySourceFlags>(MemorySource::Malloc)); 26 27 TensorInfo info({ 1, 24, 16, 3 }, DataType::Float32); 28 unsigned int numElements = info.GetNumElements(); 29 30 // create TensorHandle for memory import 31 auto handle = handleFactory.CreateTensorHandle(info); 32 33 // Get CLtensor 34 arm_compute::CLTensor& tensor = PolymorphicDowncast<ClImportTensorHandle*>(handle.get())->GetTensor(); 35 36 // Create and configure activation function 37 const arm_compute::ActivationLayerInfo act_info(arm_compute::ActivationLayerInfo::ActivationFunction::RELU); 38 arm_compute::CLActivationLayer act_func; 39 act_func.configure(&tensor, nullptr, act_info); 40 41 // Allocate user memory 42 const size_t totalBytes = tensor.info()->total_size(); 43 const size_t alignment = 44 arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>(); 45 size_t space = totalBytes + alignment + alignment; 46 auto testData = std::make_unique<uint8_t[]>(space); 47 void* alignedPtr = testData.get(); 48 CHECK(std::align(alignment, totalBytes, alignedPtr, space)); 49 50 // Import memory 51 CHECK(handle->Import(alignedPtr, armnn::MemorySource::Malloc)); 52 53 // Input with negative values 54 auto* typedPtr = reinterpret_cast<float*>(alignedPtr); 55 std::fill_n(typedPtr, numElements, -5.0f); 56 57 // Execute function and sync 58 act_func.run(); 59 arm_compute::CLScheduler::get().sync(); 60 61 // Validate result by checking that the output has no negative values 62 for(unsigned int i = 0; i < numElements; ++i) 63 { 64 CHECK(typedPtr[i] == 0); 65 } 66 } 67 68 TEST_CASE_FIXTURE(ClContextControlFixture, "ClIncorrectMemorySourceImport") 69 { 70 ClImportTensorHandleFactory handleFactory(static_cast<MemorySourceFlags>(MemorySource::Malloc), 71 static_cast<MemorySourceFlags>(MemorySource::Malloc)); 72 73 TensorInfo info({ 1, 24, 16, 3 }, DataType::Float32); 74 75 // create TensorHandle for memory import 76 auto handle = handleFactory.CreateTensorHandle(info); 77 78 // Get CLtensor 79 arm_compute::CLTensor& tensor = PolymorphicDowncast<ClImportTensorHandle*>(handle.get())->GetTensor(); 80 81 // Allocate user memory 82 const size_t totalBytes = tensor.info()->total_size(); 83 const size_t alignment = 84 arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>(); 85 size_t space = totalBytes + alignment + alignment; 86 auto testData = std::make_unique<uint8_t[]>(space); 87 void* alignedPtr = testData.get(); 88 CHECK(std::align(alignment, totalBytes, alignedPtr, space)); 89 90 // Import memory 91 CHECK_THROWS_AS(handle->Import(alignedPtr, armnn::MemorySource::Undefined), MemoryImportException); 92 } 93 94 TEST_CASE_FIXTURE(ClContextControlFixture, "ClInvalidMemorySourceImport") 95 { 96 MemorySource invalidMemSource = static_cast<MemorySource>(256); 97 ClImportTensorHandleFactory handleFactory(static_cast<MemorySourceFlags>(invalidMemSource), 98 static_cast<MemorySourceFlags>(invalidMemSource)); 99 100 TensorInfo info({ 1, 2, 2, 1 }, DataType::Float32); 101 102 // create TensorHandle for memory import 103 auto handle = handleFactory.CreateTensorHandle(info); 104 105 // Allocate user memory 106 std::vector<float> inputData 107 { 108 1.0f, 2.0f, 3.0f, 4.0f 109 }; 110 111 // Import non-support memory 112 CHECK_THROWS_AS(handle->Import(inputData.data(), invalidMemSource), MemoryImportException); 113 } 114 115 TEST_CASE_FIXTURE(ClContextControlFixture, "ClImportEndToEnd") 116 { 117 // Create runtime in which test will run 118 IRuntime::CreationOptions options; 119 IRuntimePtr runtime(armnn::IRuntime::Create(options)); 120 121 // build up the structure of the network 122 INetworkPtr net(INetwork::Create()); 123 124 IConnectableLayer* input = net->AddInputLayer(0, "Input"); 125 126 ActivationDescriptor descriptor; 127 descriptor.m_Function = ActivationFunction::ReLu; 128 IConnectableLayer* activation = net->AddActivationLayer(descriptor, "Activation"); 129 130 IConnectableLayer* output = net->AddOutputLayer(0, "Output"); 131 132 input->GetOutputSlot(0).Connect(activation->GetInputSlot(0)); 133 activation->GetOutputSlot(0).Connect(output->GetInputSlot(0)); 134 135 TensorInfo tensorInfo = TensorInfo({ 1, 24, 16, 3 }, DataType::Float32); 136 unsigned int numElements = tensorInfo.GetNumElements(); 137 size_t totalBytes = numElements * sizeof(float); 138 139 input->GetOutputSlot(0).SetTensorInfo(tensorInfo); 140 activation->GetOutputSlot(0).SetTensorInfo(tensorInfo); 141 142 // Optimize the network 143 OptimizerOptionsOpaque optOptions; 144 optOptions.SetImportEnabled(true); 145 optOptions.SetExportEnabled(true); 146 std::vector<armnn::BackendId> backends = {armnn::Compute::GpuAcc}; 147 IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec(), optOptions); 148 CHECK(optNet); 149 150 // Loads it into the runtime. 151 NetworkId netId; 152 std::string ignoredErrorMessage; 153 // Enable Importing 154 INetworkProperties networkProperties(false, MemorySource::Malloc, MemorySource::Malloc); 155 runtime->LoadNetwork(netId, std::move(optNet), ignoredErrorMessage, networkProperties); 156 157 // Creates structures for input & output 158 const size_t alignment = 159 arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>(); 160 size_t space = totalBytes + alignment + alignment; 161 auto inputData = std::make_unique<uint8_t[]>(space); 162 void* alignedInputPtr = inputData.get(); 163 CHECK(std::align(alignment, totalBytes, alignedInputPtr, space)); 164 165 // Input with negative values 166 auto* intputPtr = reinterpret_cast<float*>(alignedInputPtr); 167 std::fill_n(intputPtr, numElements, -5.0f); 168 169 auto outputData = std::make_unique<uint8_t[]>(space); 170 void* alignedOutputPtr = outputData.get(); 171 CHECK(std::align(alignment, totalBytes, alignedOutputPtr, space)); 172 auto* outputPtr = reinterpret_cast<float*>(alignedOutputPtr); 173 std::fill_n(outputPtr, numElements, -10.0f); 174 175 TensorInfo inputTensorInfo = runtime->GetInputTensorInfo(netId, 0); 176 inputTensorInfo.SetConstant(true); 177 InputTensors inputTensors 178 { 179 {0,armnn::ConstTensor(inputTensorInfo, alignedInputPtr)}, 180 }; 181 OutputTensors outputTensors 182 { 183 {0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), alignedOutputPtr)} 184 }; 185 186 runtime->GetProfiler(netId)->EnableProfiling(true); 187 188 // Do the inference 189 runtime->EnqueueWorkload(netId, inputTensors, outputTensors); 190 191 // Retrieve the Profiler.Print() output to get the workload execution 192 ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance(); 193 std::stringstream ss; 194 profilerManager.GetProfiler()->Print(ss);; 195 std::string dump = ss.str(); 196 197 // Contains ActivationWorkload 198 std::size_t found = dump.find("ActivationWorkload"); 199 CHECK(found != std::string::npos); 200 201 // Contains SyncMemGeneric 202 found = dump.find("SyncMemGeneric"); 203 CHECK(found != std::string::npos); 204 205 // Does not contain CopyMemGeneric 206 found = dump.find("CopyMemGeneric"); 207 CHECK(found == std::string::npos); 208 209 runtime->UnloadNetwork(netId); 210 211 // Check output is as expected 212 // Validate result by checking that the output has no negative values 213 auto* outputResult = reinterpret_cast<float*>(alignedOutputPtr); 214 CHECK(outputResult); 215 for(unsigned int i = 0; i < numElements; ++i) 216 { 217 CHECK(outputResult[i] >= 0); 218 } 219 } 220 221 TEST_CASE_FIXTURE(ClContextControlFixture, "ClCanBeImported") 222 { 223 ClImportTensorHandleFactory handleFactory(static_cast<MemorySourceFlags>(MemorySource::Malloc), 224 static_cast<MemorySourceFlags>(MemorySource::Malloc)); 225 226 TensorInfo info({ 1, 24, 16, 3 }, DataType::Float32); 227 228 // create TensorHandle for memory import 229 auto handle = handleFactory.CreateTensorHandle(info, DataLayout::NHWC); 230 231 // Get CLtensor 232 arm_compute::CLTensor& tensor = PolymorphicDowncast<ClImportTensorHandle*>(handle.get())->GetTensor(); 233 234 // Allocate user memory 235 const size_t totalBytes = tensor.info()->total_size(); 236 const size_t alignment = 237 arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>(); 238 size_t space = totalBytes + alignment + alignment; 239 auto testData = std::make_unique<uint8_t[]>(space); 240 void* alignedPtr = testData.get(); 241 CHECK(std::align(alignment, totalBytes, alignedPtr, space)); 242 243 // Import memory 244 CHECK_THROWS_AS(handle->CanBeImported(alignedPtr, armnn::MemorySource::Undefined), MemoryImportException); 245 246 } 247 248 TEST_CASE("ClCanBeImportedAlignedMemory") 249 { 250 ClImportTensorHandleFactory handleFactory(static_cast<MemorySourceFlags>(MemorySource::Malloc), 251 static_cast<MemorySourceFlags>(MemorySource::Malloc)); 252 253 TensorInfo info({ 1, 1, 1, 1 }, DataType::Float32); 254 255 // create TensorHandle (Memory Managed status is irrelevant) 256 auto handle = handleFactory.CreateTensorHandle(info, DataLayout::NHWC); 257 // Get CLtensor 258 arm_compute::CLTensor& tensor = PolymorphicDowncast<ClImportTensorHandle*>(handle.get())->GetTensor(); 259 260 // Create an aligned buffer 261 const size_t totalBytes = tensor.info()->total_size(); 262 const size_t alignment = 263 arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>(); 264 size_t space = totalBytes + alignment + alignment; 265 auto testData = std::make_unique<uint8_t[]>(space); 266 void* alignedPtr = testData.get(); 267 CHECK(std::align(alignment, totalBytes, alignedPtr, space)); 268 269 // Check aligned buffers return true 270 CHECK(handle->CanBeImported(alignedPtr, MemorySource::Malloc) == true); 271 272 // Due to the nature of how GPU memory is mapped it is entirely possible for memory which is misaligned on cpu 273 // to be successfully import on GPU. As such there is no way to create a misaligned pointer that will always fail. 274 // Rather it will succeed on some devices and fail on others. As long as a correctly aligned buffer returns true 275 // we can be confident that it will be successfully imported. All other cases will need to be handled by the user. 276 } 277 278 TEST_CASE_FIXTURE(ClContextControlFixture, "ClForceImportConv2dEndToEnd") 279 { 280 // Create runtime in which test will run 281 IRuntime::CreationOptions options; 282 IRuntimePtr runtime(armnn::IRuntime::Create(options)); 283 284 // build up the structure of the network 285 INetworkPtr network(INetwork::Create()); 286 287 armnn::TensorInfo inputInfo({ 1, 3, 4, 1 }, DataType::Float32); 288 armnn::TensorInfo kernelInfo({ 1, 3, 3, 1 }, DataType::Float32); 289 armnn::TensorInfo outputInfo({ 1, 3, 4, 1 }, DataType::Float32); 290 291 kernelInfo.SetConstant(true); 292 293 std::vector<float> kernel = 294 { 295 4, 5, 6, 296 0, 0, 0, 297 3, 2, 1 298 }; 299 300 const std::vector<float> expectedOutput = 301 { 302 23, 41, 33, 21, 303 44, 65, 76, 52, 304 82, 85, 79, 42 305 }; 306 307 unsigned int numElements = inputInfo.GetNumElements(); 308 size_t totalBytes = numElements * sizeof(float); 309 310 IConnectableLayer* const inputLayer = network->AddInputLayer(0, "input"); 311 ARMNN_ASSERT(inputLayer); 312 313 armnn::ConstTensor weights(kernelInfo, kernel); 314 315 armnn::Convolution2dDescriptor convDesc2d; 316 convDesc2d.m_StrideX = 1; 317 convDesc2d.m_StrideY = 1; 318 convDesc2d.m_PadLeft = 1; 319 convDesc2d.m_PadRight = 1; 320 convDesc2d.m_PadTop = 1; 321 convDesc2d.m_PadBottom = 1; 322 convDesc2d.m_DataLayout = DataLayout::NHWC; 323 324 armnn::IConnectableLayer* const convLayer = network->AddConvolution2dLayer(convDesc2d, "conv"); 325 armnn::IConnectableLayer* weightsLayer = network->AddConstantLayer(weights); 326 327 ARMNN_ASSERT(convLayer); 328 329 weightsLayer->GetOutputSlot(0).SetTensorInfo(weights.GetInfo()); 330 weightsLayer->GetOutputSlot(0).Connect(convLayer->GetInputSlot(1u)); 331 332 inputLayer->GetOutputSlot(0).Connect(convLayer->GetInputSlot(0)); 333 inputLayer->GetOutputSlot(0).SetTensorInfo(inputInfo); 334 335 IConnectableLayer* output = network->AddOutputLayer(0, "output"); 336 convLayer->GetOutputSlot(0).Connect(output->GetInputSlot(0)); 337 convLayer->GetOutputSlot(0).SetTensorInfo(outputInfo); 338 339 // Optimize the network 340 OptimizerOptionsOpaque optOptions; 341 optOptions.SetImportEnabled(false); 342 optOptions.SetExportEnabled(false); 343 std::vector<armnn::BackendId> backends = {armnn::Compute::GpuAcc}; 344 IOptimizedNetworkPtr optNet = Optimize(*network, backends, runtime->GetDeviceSpec(), optOptions); 345 CHECK(optNet); 346 347 // Loads it into the runtime. 348 NetworkId netId; 349 std::string ignoredErrorMessage; 350 // Enable Importing 351 INetworkProperties networkProperties(false, MemorySource::Undefined, MemorySource::Undefined); 352 runtime->LoadNetwork(netId, std::move(optNet), ignoredErrorMessage, networkProperties); 353 354 // Creates structures for input & output 355 const size_t alignment = 356 arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>(); 357 size_t space = totalBytes + alignment + alignment; 358 auto inputData = std::make_unique<uint8_t[]>(space); 359 void* alignedInputPtr = inputData.get(); 360 CHECK(std::align(alignment, totalBytes, alignedInputPtr, space)); 361 362 // Input with negative values 363 auto* inputPtr = reinterpret_cast<float*>(alignedInputPtr); 364 inputPtr[0] = 1; 365 inputPtr[1] = 5; 366 inputPtr[2] = 2; 367 inputPtr[3] = 3; 368 inputPtr[4] = 8; 369 inputPtr[5] = 7; 370 inputPtr[6] = 3; 371 inputPtr[7] = 6; 372 inputPtr[8] = 3; 373 inputPtr[9] = 3; 374 inputPtr[10] = 9; 375 inputPtr[11] = 1; 376 377 378 auto outputData = std::make_unique<uint8_t[]>(space); 379 void* alignedOutputPtr = outputData.get(); 380 CHECK(std::align(alignment, totalBytes, alignedOutputPtr, space)); 381 auto* outputPtr = reinterpret_cast<float*>(alignedOutputPtr); 382 std::fill_n(outputPtr, numElements, -10.0f); 383 384 TensorInfo inputTensorInfo = runtime->GetInputTensorInfo(netId, 0); 385 inputTensorInfo.SetConstant(true); 386 InputTensors inputTensors 387 { 388 {0,armnn::ConstTensor(inputTensorInfo, alignedInputPtr)}, 389 }; 390 OutputTensors outputTensors 391 { 392 {0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), alignedOutputPtr)} 393 }; 394 395 runtime->GetProfiler(netId)->EnableProfiling(true); 396 397 INFO("Run ImportInputs"); 398 std::vector<ImportedInputId> importedInputIds = 399 runtime->ImportInputs(netId, inputTensors, MemorySource::Malloc); 400 // We expect the import to have succeeded. 401 CHECK(importedInputIds.size() == 1); 402 std::vector<ImportedOutputId> importedOutputIds = 403 runtime->ImportOutputs(netId, outputTensors, MemorySource::Malloc); 404 // We expect the import to have succeeded. 405 CHECK(importedOutputIds.size() == 1); 406 // Do the inference 407 runtime->EnqueueWorkload(netId, InputTensors(), OutputTensors(), importedInputIds, importedOutputIds); 408 409 // Retrieve the Profiler.Print() output to get the workload execution 410 ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance(); 411 std::stringstream ss; 412 profilerManager.GetProfiler()->Print(ss);; 413 std::string dump = ss.str(); 414 415 // Contains Convolution2dWorkload 416 std::size_t found = dump.find("Convolution2dWorkload"); 417 CHECK(found != std::string::npos); 418 419 // Contains SyncMemGeneric 420 found = dump.find("SyncMemGeneric"); 421 CHECK(found != std::string::npos); 422 423 // Does not contain CopyMemGeneric 424 found = dump.find("CopyMemGeneric"); 425 CHECK(found == std::string::npos); 426 427 runtime->UnloadNetwork(netId); 428 429 // Check output is as expected 430 // Validate result by checking that the output has no negative values 431 auto* outputResult = reinterpret_cast<float*>(alignedOutputPtr); 432 CHECK(outputResult); 433 434 // Check the output is correct 435 CHECK(std::equal(outputResult, outputResult + numElements, expectedOutput.begin(), expectedOutput.end())); 436 } 437 438 TEST_CASE_FIXTURE(ClContextControlFixture, "ClForceImportConvertFp16toFp32EndToEnd") 439 { 440 using namespace half_float::literal; 441 442 // Create runtime in which test will run 443 IRuntime::CreationOptions options; 444 IRuntimePtr runtime(armnn::IRuntime::Create(options)); 445 446 // build up the structure of the network 447 NetworkImpl network; 448 449 armnn::TensorInfo inputInfo({1, 3, 2, 3}, armnn::DataType::Float16); 450 armnn::TensorInfo outputTensorInfo({1, 3, 2, 3}, armnn::DataType::Float32); 451 452 std::vector<float> expectedOutput = 453 { 454 -37.5f, -15.2f, -8.76f, -2.0f, -1.5f, -1.3f, -0.5f, -0.4f, 0.0f, 455 1.0f, 0.4f, 0.5f, 1.3f, 1.5f, 2.0f, 8.76f, 15.2f, 37.5f 456 }; 457 458 unsigned int numElements = inputInfo.GetNumElements(); 459 size_t totalBytesInput = numElements * sizeof(Half); 460 size_t totalBytesOutput = numElements * sizeof(float); 461 462 IConnectableLayer* const inputLayer = network.AddInputLayer(0, "input"); 463 ARMNN_ASSERT(inputLayer); 464 465 armnn::IConnectableLayer* const convLayer = network.AddConvertFp16ToFp32Layer("convert"); 466 ARMNN_ASSERT(convLayer); 467 468 inputLayer->GetOutputSlot(0).Connect(convLayer->GetInputSlot(0)); 469 inputLayer->GetOutputSlot(0).SetTensorInfo(inputInfo); 470 471 IConnectableLayer* output = network.AddOutputLayer(0, "output"); 472 convLayer->GetOutputSlot(0).Connect(output->GetInputSlot(0)); 473 convLayer->GetOutputSlot(0).SetTensorInfo(outputTensorInfo); 474 475 // Optimize the network 476 OptimizerOptionsOpaque optOptions; 477 optOptions.SetImportEnabled(false); 478 optOptions.SetExportEnabled(false); 479 std::vector<armnn::BackendId> backends = {armnn::Compute::GpuAcc}; 480 IOptimizedNetworkPtr optNet = Optimize(network.GetGraph(), backends, runtime->GetDeviceSpec(), optOptions); 481 CHECK(optNet); 482 483 // Loads it into the runtime. 484 NetworkId netId; 485 std::string ignoredErrorMessage; 486 // Enable Importing 487 INetworkProperties networkProperties(false, MemorySource::Undefined, MemorySource::Undefined); 488 runtime->LoadNetwork(netId, std::move(optNet), ignoredErrorMessage, networkProperties); 489 490 // Creates structures for input & output 491 const size_t alignment = 492 arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>(); 493 size_t spaceInput = totalBytesInput + alignment + alignment; 494 size_t spaceOutput = totalBytesOutput + alignment + alignment; 495 auto inputData = std::make_unique<uint8_t[]>(spaceInput); 496 void* alignedInputPtr = inputData.get(); 497 CHECK(std::align(alignment, totalBytesInput, alignedInputPtr, spaceInput)); 498 499 // Input with negative values 500 auto* inputPtr = reinterpret_cast<Half*>(alignedInputPtr); 501 inputPtr[0] = -37.5_h; 502 inputPtr[1] = -15.2_h; 503 inputPtr[2] = -8.76_h; 504 inputPtr[3] = -2.0_h; 505 inputPtr[4] = -1.5_h; 506 inputPtr[5] = -1.3_h; 507 inputPtr[6] = -0.5_h; 508 inputPtr[7] = -0.4_h; 509 inputPtr[8] = 0.0_h; 510 inputPtr[9] = 1.0_h; 511 inputPtr[10] = 0.4_h; 512 inputPtr[11] = 0.5_h; 513 inputPtr[12] = 1.3_h; 514 inputPtr[13] = 1.5_h; 515 inputPtr[14] = 2.0_h; 516 inputPtr[15] = 8.76_h; 517 inputPtr[16] = 15.2_h; 518 inputPtr[17] = 37.5_h; 519 520 auto outputData = std::make_unique<uint8_t[]>(spaceOutput); 521 void* alignedOutputPtr = outputData.get(); 522 CHECK(std::align(alignment, totalBytesOutput, alignedOutputPtr, spaceOutput)); 523 auto* outputPtr = reinterpret_cast<float*>(alignedOutputPtr); 524 std::fill_n(outputPtr, numElements, -10.0f); 525 526 TensorInfo inputTensorInfo = runtime->GetInputTensorInfo(netId, 0); 527 inputTensorInfo.SetConstant(true); 528 InputTensors inputTensors 529 { 530 {0,armnn::ConstTensor(inputTensorInfo, alignedInputPtr)}, 531 }; 532 OutputTensors outputTensors 533 { 534 {0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), alignedOutputPtr)} 535 }; 536 537 runtime->GetProfiler(netId)->EnableProfiling(true); 538 539 INFO("Run ImportInputs"); 540 std::vector<ImportedInputId> importedInputIds = 541 runtime->ImportInputs(netId, inputTensors, MemorySource::Malloc); 542 // We expect the import to have succeeded. 543 CHECK(importedInputIds.size() == 1); 544 std::vector<ImportedOutputId> importedOutputIds = 545 runtime->ImportOutputs(netId, outputTensors, MemorySource::Malloc); 546 // We expect the import to have succeeded. 547 CHECK(importedOutputIds.size() == 1); 548 549 // Do the inference 550 runtime->EnqueueWorkload(netId, InputTensors(), OutputTensors(), importedInputIds, importedOutputIds); 551 552 // Retrieve the Profiler.Print() output to get the workload execution 553 ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance(); 554 std::stringstream ss; 555 profilerManager.GetProfiler()->Print(ss);; 556 std::string dump = ss.str(); 557 558 // Contains Convolution2dWorkload 559 std::size_t found = dump.find("ConvertFp16ToFp32Workload"); 560 CHECK(found != std::string::npos); 561 562 // Contains SyncMemGeneric 563 found = dump.find("SyncMemGeneric"); 564 CHECK(found != std::string::npos); 565 566 // Does not contain CopyMemGeneric 567 found = dump.find("CopyMemGeneric"); 568 CHECK(found == std::string::npos); 569 570 runtime->UnloadNetwork(netId); 571 572 // Check output is as expected 573 // Validate result by checking that the output has no negative values 574 auto* outputResult = reinterpret_cast<float*>(alignedOutputPtr); 575 CHECK(outputResult); 576 577 // Check the output is correct 578 for (size_t i = 0; i < numElements; ++i) 579 { 580 DOCTEST_CHECK_MESSAGE(outputResult[i] == doctest::Approx(expectedOutput[i]).epsilon(0.0004), 581 "outputValue[" << i << "]: " << outputResult[i] << " != " << expectedOutput[i]); 582 } 583 } 584 585 586 TEST_CASE_FIXTURE(ClContextControlFixture, "ClForceImportConvertFp32toFp16EndToEnd") 587 { 588 using namespace half_float::literal; 589 590 // Create runtime in which test will run 591 IRuntime::CreationOptions options; 592 IRuntimePtr runtime(armnn::IRuntime::Create(options)); 593 594 // build up the structure of the network 595 NetworkImpl network; 596 597 armnn::TensorInfo inputInfo({1, 3, 2, 3}, armnn::DataType::Float32); 598 armnn::TensorInfo outputTensorInfo({1, 3, 2, 3}, armnn::DataType::Float16); 599 600 std::vector<Half> expectedOutput = 601 { 602 -37.5_h, -15.2_h, -8.76_h, -2.0_h, -1.5_h, -1.3_h, -0.5_h, -0.4_h, 0.0_h, 603 1.0_h, 0.4_h, 0.5_h, 1.3_h, 1.5_h, 2.0_h, 8.76_h, 15.2_h, 37.5_h 604 }; 605 606 unsigned int numElements = inputInfo.GetNumElements(); 607 size_t totalBytesInput = numElements * sizeof(float); 608 size_t totalBytesOutput = numElements * sizeof(Half); 609 610 IConnectableLayer* const inputLayer = network.AddInputLayer(0, "input"); 611 ARMNN_ASSERT(inputLayer); 612 613 armnn::IConnectableLayer* const convLayer = network.AddConvertFp32ToFp16Layer("convert"); 614 ARMNN_ASSERT(convLayer); 615 616 inputLayer->GetOutputSlot(0).Connect(convLayer->GetInputSlot(0)); 617 inputLayer->GetOutputSlot(0).SetTensorInfo(inputInfo); 618 619 IConnectableLayer* output = network.AddOutputLayer(0, "output"); 620 convLayer->GetOutputSlot(0).Connect(output->GetInputSlot(0)); 621 convLayer->GetOutputSlot(0).SetTensorInfo(outputTensorInfo); 622 623 // Optimize the network 624 OptimizerOptionsOpaque optOptions; 625 optOptions.SetImportEnabled(false); 626 optOptions.SetExportEnabled(false); 627 std::vector<armnn::BackendId> backends = {armnn::Compute::GpuAcc}; 628 IOptimizedNetworkPtr optNet = Optimize(network.GetGraph(), backends, runtime->GetDeviceSpec(), optOptions); 629 CHECK(optNet); 630 631 // Loads it into the runtime. 632 NetworkId netId; 633 std::string ignoredErrorMessage; 634 // Enable Importing 635 INetworkProperties networkProperties(false, MemorySource::Undefined, MemorySource::Undefined); 636 runtime->LoadNetwork(netId, std::move(optNet), ignoredErrorMessage, networkProperties); 637 638 // Creates structures for input & output 639 const size_t alignment = 640 arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>(); 641 size_t spaceInput = totalBytesInput + alignment + alignment; 642 size_t spaceOutput = totalBytesOutput + alignment + alignment; 643 auto inputData = std::make_unique<uint8_t[]>(spaceInput); 644 void* alignedInputPtr = inputData.get(); 645 CHECK(std::align(alignment, totalBytesInput, alignedInputPtr, spaceInput)); 646 647 // Input with negative values 648 auto* inputPtr = reinterpret_cast<float*>(alignedInputPtr); 649 inputPtr[0] = -37.5f; 650 inputPtr[1] = -15.2f; 651 inputPtr[2] = -8.76f; 652 inputPtr[3] = -2.0f; 653 inputPtr[4] = -1.5f; 654 inputPtr[5] = -1.3f; 655 inputPtr[6] = -0.5f; 656 inputPtr[7] = -0.4f; 657 inputPtr[8] = 0.0f; 658 inputPtr[9] = 1.0f; 659 inputPtr[10] = 0.4f; 660 inputPtr[11] = 0.5f; 661 inputPtr[12] = 1.3f; 662 inputPtr[13] = 1.5f; 663 inputPtr[14] = 2.0f; 664 inputPtr[15] = 8.76f; 665 inputPtr[16] = 15.2f; 666 inputPtr[17] = 37.5f; 667 668 auto outputData = std::make_unique<uint8_t[]>(spaceOutput); 669 void* alignedOutputPtr = outputData.get(); 670 CHECK(std::align(alignment, totalBytesOutput, alignedOutputPtr, spaceOutput)); 671 auto* outputPtr = reinterpret_cast<Half*>(alignedOutputPtr); 672 std::fill_n(outputPtr, numElements, -10.0f); 673 674 TensorInfo inputTensorInfo = runtime->GetInputTensorInfo(netId, 0); 675 inputTensorInfo.SetConstant(true); 676 InputTensors inputTensors 677 { 678 {0,armnn::ConstTensor(inputTensorInfo, alignedInputPtr)}, 679 }; 680 OutputTensors outputTensors 681 { 682 {0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), alignedOutputPtr)} 683 }; 684 685 runtime->GetProfiler(netId)->EnableProfiling(true); 686 687 INFO("Run ImportInputs"); 688 std::vector<ImportedInputId> importedInputIds = 689 runtime->ImportInputs(netId, inputTensors, MemorySource::Malloc); 690 // We expect the import to have succeeded. 691 CHECK(importedInputIds.size() == 1); 692 std::vector<ImportedOutputId> importedOutputIds = 693 runtime->ImportOutputs(netId, outputTensors, MemorySource::Malloc); 694 // We expect the import to have succeeded. 695 CHECK(importedOutputIds.size() == 1); 696 697 // Do the inference 698 runtime->EnqueueWorkload(netId, InputTensors(), OutputTensors(), importedInputIds, importedOutputIds); 699 700 // Retrieve the Profiler.Print() output to get the workload execution 701 ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance(); 702 std::stringstream ss; 703 profilerManager.GetProfiler()->Print(ss);; 704 std::string dump = ss.str(); 705 706 // Contains Convolution2dWorkload 707 std::size_t found = dump.find("ConvertFp32ToFp16Workload"); 708 CHECK(found != std::string::npos); 709 710 // Contains SyncMemGeneric 711 found = dump.find("SyncMemGeneric"); 712 CHECK(found != std::string::npos); 713 714 // Does not contain CopyMemGeneric 715 found = dump.find("CopyMemGeneric"); 716 CHECK(found == std::string::npos); 717 718 runtime->UnloadNetwork(netId); 719 720 // Check output is as expected 721 // Validate result by checking that the output has no negative values 722 auto* outputResult = reinterpret_cast<Half*>(alignedOutputPtr); 723 CHECK(outputResult); 724 725 // Check the output is correct 726 CHECK(std::equal(outputResult, outputResult + numElements, expectedOutput.begin(), expectedOutput.end())); 727 } 728 729 TEST_CASE_FIXTURE(ClContextControlFixture, "ClForceImportSimpleConvertFp32toFp16EndToEnd") 730 { 731 using namespace half_float::literal; 732 733 // Create runtime in which test will run 734 IRuntime::CreationOptions options; 735 IRuntimePtr runtime(armnn::IRuntime::Create(options)); 736 737 // build up the structure of the network 738 NetworkImpl network; 739 740 armnn::TensorInfo inputInfo({1}, armnn::DataType::Float32); 741 armnn::TensorInfo outputTensorInfo({1}, armnn::DataType::Float16); 742 743 std::vector<Half> expectedOutput = { 1.0_h }; 744 745 unsigned int numElements = inputInfo.GetNumElements(); 746 size_t totalBytesInput = numElements * sizeof(float); 747 size_t totalBytesOutput = numElements * sizeof(Half); 748 749 IConnectableLayer* const inputLayer = network.AddInputLayer(0, "input"); 750 ARMNN_ASSERT(inputLayer); 751 752 armnn::IConnectableLayer* const convLayer = network.AddConvertFp32ToFp16Layer("convert"); 753 ARMNN_ASSERT(convLayer); 754 755 inputLayer->GetOutputSlot(0).Connect(convLayer->GetInputSlot(0)); 756 inputLayer->GetOutputSlot(0).SetTensorInfo(inputInfo); 757 758 IConnectableLayer* output = network.AddOutputLayer(0, "output"); 759 convLayer->GetOutputSlot(0).Connect(output->GetInputSlot(0)); 760 convLayer->GetOutputSlot(0).SetTensorInfo(outputTensorInfo); 761 762 // Optimize the network 763 OptimizerOptionsOpaque optOptions; 764 optOptions.SetImportEnabled(false); 765 optOptions.SetExportEnabled(false); 766 std::vector<armnn::BackendId> backends = {armnn::Compute::GpuAcc}; 767 IOptimizedNetworkPtr optNet = Optimize(network.GetGraph(), backends, runtime->GetDeviceSpec(), optOptions); 768 CHECK(optNet); 769 770 // Loads it into the runtime. 771 NetworkId netId; 772 std::string ignoredErrorMessage; 773 // Enable Importing 774 INetworkProperties networkProperties(false, MemorySource::Undefined, MemorySource::Undefined); 775 runtime->LoadNetwork(netId, std::move(optNet), ignoredErrorMessage, networkProperties); 776 777 // Creates structures for input & output 778 const size_t alignment = 779 arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>(); 780 size_t spaceInput = totalBytesInput + alignment + alignment; 781 size_t spaceOutput = totalBytesOutput + alignment + alignment; 782 auto inputData = std::make_unique<uint8_t[]>(spaceInput); 783 void* alignedInputPtr = inputData.get(); 784 CHECK(std::align(alignment, totalBytesInput, alignedInputPtr, spaceInput)); 785 786 // Input with negative values 787 auto* inputPtr = reinterpret_cast<float*>(alignedInputPtr); 788 inputPtr[0] = 1.0f; 789 790 auto outputData = std::make_unique<uint8_t[]>(spaceOutput); 791 void* alignedOutputPtr = outputData.get(); 792 CHECK(std::align(alignment, totalBytesOutput, alignedOutputPtr, spaceOutput)); 793 auto* outputPtr = reinterpret_cast<Half*>(alignedOutputPtr); 794 std::fill_n(outputPtr, numElements, -10.0f); 795 796 TensorInfo inputTensorInfo = runtime->GetInputTensorInfo(netId, 0); 797 inputTensorInfo.SetConstant(true); 798 InputTensors inputTensors 799 { 800 {0,armnn::ConstTensor(inputTensorInfo, alignedInputPtr)}, 801 }; 802 OutputTensors outputTensors 803 { 804 {0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), alignedOutputPtr)} 805 }; 806 807 runtime->GetProfiler(netId)->EnableProfiling(true); 808 809 INFO("Run ImportInputs"); 810 std::vector<ImportedInputId> importedInputIds = 811 runtime->ImportInputs(netId, inputTensors, MemorySource::Malloc); 812 CHECK(importedInputIds.size() == 1); 813 std::vector<ImportedOutputId> importedOutputIds = 814 runtime->ImportOutputs(netId, outputTensors, MemorySource::Malloc); 815 CHECK(importedOutputIds.size() == 1); 816 817 // Do the inference 818 runtime->EnqueueWorkload(netId, InputTensors(), OutputTensors(), importedInputIds, importedOutputIds); 819 820 // Retrieve the Profiler.Print() output to get the workload execution 821 ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance(); 822 std::stringstream ss; 823 profilerManager.GetProfiler()->Print(ss);; 824 std::string dump = ss.str(); 825 826 // Contains Convolution2dWorkload 827 std::size_t found = dump.find("ConvertFp32ToFp16Workload"); 828 CHECK(found != std::string::npos); 829 830 // Contains SyncMemGeneric 831 found = dump.find("SyncMemGeneric"); 832 CHECK(found != std::string::npos); 833 834 // Does not contain CopyMemGeneric 835 found = dump.find("CopyMemGeneric"); 836 CHECK(found == std::string::npos); 837 838 runtime->UnloadNetwork(netId); 839 840 // Check output is as expected 841 // Validate result by checking that the output has no negative values 842 auto* outputResult = reinterpret_cast<Half*>(alignedOutputPtr); 843 CHECK(outputResult); 844 845 // Check the output is correct 846 CHECK(std::equal(outputResult, outputResult + numElements, expectedOutput.begin(), expectedOutput.end())); 847 } 848 849 TEST_CASE_FIXTURE(ClContextControlFixture, "ClForceImportRepeatedInferencesEndToEndTest") 850 { 851 /* 852 * This is a test to check the functionality of the Forced Import functionality when using repeated inferences that 853 * require switching from importing to copy. For the first inference we create aligned Pointers and check they are 854 * imported correctly. For the second we use similar pointers but don't use PreImporting. 855 */ 856 // Create runtime in which test will run 857 IRuntime::CreationOptions options; 858 IRuntimePtr runtime(armnn::IRuntime::Create(options)); 859 860 // build up the structure of the network 861 INetworkPtr network(INetwork::Create()); 862 863 armnn::TensorInfo inputInfo({ 1, 3, 4, 1 }, DataType::Float32); 864 armnn::TensorInfo kernelInfo({ 1, 3, 3, 1 }, DataType::Float32); 865 armnn::TensorInfo outputInfo({ 1, 3, 4, 1 }, DataType::Float32); 866 867 kernelInfo.SetConstant(true); 868 869 std::vector<float> kernel = 870 { 871 4, 5, 6, 872 0, 0, 0, 873 3, 2, 1 874 }; 875 876 const std::vector<float> expectedOutput = 877 { 878 23, 41, 33, 21, 879 44, 65, 76, 52, 880 82, 85, 79, 42 881 }; 882 883 unsigned int numElements = inputInfo.GetNumElements(); 884 size_t totalBytes = numElements * sizeof(float); 885 886 IConnectableLayer* const inputLayer = network->AddInputLayer(0, "input"); 887 ARMNN_ASSERT(inputLayer); 888 889 armnn::ConstTensor weights(kernelInfo, kernel); 890 891 armnn::Convolution2dDescriptor convDesc2d; 892 convDesc2d.m_StrideX = 1; 893 convDesc2d.m_StrideY = 1; 894 convDesc2d.m_PadLeft = 1; 895 convDesc2d.m_PadRight = 1; 896 convDesc2d.m_PadTop = 1; 897 convDesc2d.m_PadBottom = 1; 898 convDesc2d.m_DataLayout = DataLayout::NHWC; 899 armnn::IConnectableLayer* const convLayer = network->AddConvolution2dLayer(convDesc2d, "conv"); 900 ARMNN_ASSERT(convLayer); 901 902 armnn::IConnectableLayer* weightsLayer = network->AddConstantLayer(weights); 903 904 weightsLayer->GetOutputSlot(0).SetTensorInfo(weights.GetInfo()); 905 weightsLayer->GetOutputSlot(0).Connect(convLayer->GetInputSlot(1u)); 906 907 inputLayer->GetOutputSlot(0).Connect(convLayer->GetInputSlot(0)); 908 inputLayer->GetOutputSlot(0).SetTensorInfo(inputInfo); 909 910 IConnectableLayer* output = network->AddOutputLayer(0, "output"); 911 convLayer->GetOutputSlot(0).Connect(output->GetInputSlot(0)); 912 convLayer->GetOutputSlot(0).SetTensorInfo(outputInfo); 913 914 // Optimize the network 915 OptimizerOptionsOpaque optOptions; 916 optOptions.SetImportEnabled(false); 917 optOptions.SetExportEnabled(false); 918 std::vector<armnn::BackendId> backends = {armnn::Compute::GpuAcc}; 919 IOptimizedNetworkPtr optNet = Optimize(*network, backends, runtime->GetDeviceSpec(), optOptions); 920 CHECK(optNet); 921 922 // Loads it into the runtime. 923 NetworkId netId; 924 std::string ignoredErrorMessage; 925 // Enable Importing 926 INetworkProperties networkProperties(false, MemorySource::Undefined, MemorySource::Undefined); 927 runtime->LoadNetwork(netId, std::move(optNet), ignoredErrorMessage, networkProperties); 928 929 // Creates structures for input & output 930 const size_t alignment = 931 arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>(); 932 size_t space = totalBytes + alignment + alignment; 933 auto inputData = std::make_unique<uint8_t[]>(space); 934 void* alignedInputPtr = inputData.get(); 935 CHECK(std::align(alignment, totalBytes, alignedInputPtr, space)); 936 937 // Fill input with values 938 auto* inputPtr = reinterpret_cast<float*>(alignedInputPtr); 939 inputPtr[0] = 1; 940 inputPtr[1] = 5; 941 inputPtr[2] = 2; 942 inputPtr[3] = 3; 943 inputPtr[4] = 8; 944 inputPtr[5] = 7; 945 inputPtr[6] = 3; 946 inputPtr[7] = 6; 947 inputPtr[8] = 3; 948 inputPtr[9] = 3; 949 inputPtr[10] = 9; 950 inputPtr[11] = 1; 951 952 953 auto outputData = std::make_unique<uint8_t[]>(space); 954 void* alignedOutputPtr = outputData.get(); 955 CHECK(std::align(alignment, totalBytes, alignedOutputPtr, space)); 956 auto* outputPtr = reinterpret_cast<float*>(alignedOutputPtr); 957 std::fill_n(outputPtr, numElements, -10.0f); 958 959 TensorInfo inputTensorInfo = runtime->GetInputTensorInfo(netId, 0); 960 inputTensorInfo.SetConstant(true); 961 InputTensors inputTensors 962 { 963 {0,armnn::ConstTensor(inputTensorInfo, alignedInputPtr)}, 964 }; 965 OutputTensors outputTensors 966 { 967 {0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), alignedOutputPtr)} 968 }; 969 970 runtime->GetProfiler(netId)->EnableProfiling(true); 971 972 INFO("Run ImportInputs"); 973 std::vector<ImportedInputId> importedInputIds = 974 runtime->ImportInputs(netId, inputTensors, MemorySource::Malloc); 975 // We expect the import to have succeeded. 976 CHECK(importedInputIds.size() == 1); 977 std::vector<ImportedOutputId> importedOutputIds = 978 runtime->ImportOutputs(netId, outputTensors, MemorySource::Malloc); 979 // We expect the import to have succeeded. 980 CHECK(importedOutputIds.size() == 1); 981 982 // Do the inference 983 runtime->EnqueueWorkload(netId, InputTensors(), OutputTensors(), importedInputIds, importedOutputIds); 984 985 // Retrieve the Profiler.AnalyzeEventsAndWriteResults() output to get the workload execution 986 ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance(); 987 std::stringstream ss; 988 profilerManager.GetProfiler()->AnalyzeEventsAndWriteResults(ss); 989 std::string dump = ss.str(); 990 991 // Contains Convolution2dWorkload 992 std::size_t found = dump.find("Convolution2dWorkload"); 993 CHECK(found != std::string::npos); 994 995 // Contains SyncMemGeneric 996 found = dump.find("SyncMemGeneric"); 997 CHECK(found != std::string::npos); 998 999 // Does not contain CopyMemGeneric 1000 found = dump.find("CopyMemGeneric"); 1001 CHECK(found == std::string::npos); 1002 1003 // Sync the outputs so we can read the data 1004 arm_compute::CLScheduler::get().sync(); 1005 1006 // Check output is as expected 1007 auto* outputResult = reinterpret_cast<float*>(alignedOutputPtr); 1008 CHECK(outputResult); 1009 CHECK(std::equal(outputResult, outputResult + numElements, expectedOutput.begin(), expectedOutput.end())); 1010 1011 // Repeat the inference, with new tensors and without using PreImporting to force it to fall back to copying 1012 1013 // Creates structures for input & output 1014 auto inputDataCopy = std::make_unique<uint8_t[]>(space); 1015 void* copyInputPtr = inputDataCopy.get(); 1016 1017 // Fill input with values 1018 auto* inputCopyPtr = reinterpret_cast<float*>(copyInputPtr); 1019 inputCopyPtr[0] = 1; 1020 inputCopyPtr[1] = 5; 1021 inputCopyPtr[2] = 2; 1022 inputCopyPtr[3] = 3; 1023 inputCopyPtr[4] = 8; 1024 inputCopyPtr[5] = 7; 1025 inputCopyPtr[6] = 3; 1026 inputCopyPtr[7] = 6; 1027 inputCopyPtr[8] = 3; 1028 inputCopyPtr[9] = 3; 1029 inputCopyPtr[10] = 9; 1030 inputCopyPtr[11] = 1; 1031 1032 // Output pre-filled with -10.0f 1033 auto outputDataCopy = std::make_unique<uint8_t[]>(space); 1034 void* copyOutputPtr = outputDataCopy.get(); 1035 auto* outputCopyPtr = reinterpret_cast<float*>(copyOutputPtr); 1036 std::fill_n(outputCopyPtr, numElements, -10.0f); 1037 1038 InputTensors inputTensorsCopy 1039 { 1040 {0,armnn::ConstTensor(inputTensorInfo, copyInputPtr)}, 1041 }; 1042 OutputTensors outputTensorsCopy 1043 { 1044 {0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), copyOutputPtr)} 1045 }; 1046 1047 // Do the inference without any pre-imported input/output ids 1048 runtime->EnqueueWorkload(netId, inputTensorsCopy, outputTensorsCopy); 1049 // Sync the outputs so we can read the data 1050 arm_compute::CLScheduler::get().sync(); 1051 1052 // Check the output is correct 1053 outputResult = reinterpret_cast<float*>(copyOutputPtr); 1054 CHECK(outputResult); 1055 CHECK(std::equal(outputResult, outputResult + numElements, expectedOutput.begin(), expectedOutput.end())); 1056 1057 // Query the profiler again, this will contain the results of both inferences 1058 profilerManager.GetProfiler()->AnalyzeEventsAndWriteResults(ss); 1059 dump = ss.str(); 1060 1061 // Contains Convolution2dWorkload 1062 found = dump.find("Convolution2dWorkload"); 1063 CHECK(found != std::string::npos); 1064 1065 // Should still contain the SyncMemGeneric 1066 found = dump.find("SyncMemGeneric"); 1067 CHECK(found != std::string::npos); 1068 1069 // Should now also contain a CopyMemGeneric 1070 found = dump.find("CopyMemGeneric"); 1071 CHECK(found != std::string::npos); 1072 runtime->UnloadNetwork(netId); 1073 } 1074 1075 TEST_CASE_FIXTURE(ClContextControlFixture, "ClForceImportRepeatedInferencesInvertedEndToEndTest") 1076 { 1077 /* 1078 * This test is similar to the test above but instead of importing and then copying, we start by copying and then do 1079 * the import. 1080 */ 1081 // Create runtime in which test will run 1082 IRuntime::CreationOptions options; 1083 IRuntimePtr runtime(armnn::IRuntime::Create(options)); 1084 1085 // build up the structure of the network 1086 INetworkPtr network(INetwork::Create()); 1087 1088 armnn::TensorInfo inputInfo({ 1, 3, 4, 1 }, DataType::Float32); 1089 armnn::TensorInfo kernelInfo({ 1, 3, 3, 1 }, DataType::Float32); 1090 armnn::TensorInfo outputInfo({ 1, 3, 4, 1 }, DataType::Float32); 1091 1092 kernelInfo.SetConstant(true); 1093 1094 std::vector<float> kernel = 1095 { 1096 4, 5, 6, 1097 0, 0, 0, 1098 3, 2, 1 1099 }; 1100 1101 const std::vector<float> expectedOutput = 1102 { 1103 23, 41, 33, 21, 1104 44, 65, 76, 52, 1105 82, 85, 79, 42 1106 }; 1107 1108 unsigned int numElements = inputInfo.GetNumElements(); 1109 size_t totalBytes = numElements * sizeof(float); 1110 1111 IConnectableLayer* const inputLayer = network->AddInputLayer(0, "input"); 1112 ARMNN_ASSERT(inputLayer); 1113 1114 armnn::ConstTensor weights(kernelInfo, kernel); 1115 1116 armnn::Convolution2dDescriptor convDesc2d; 1117 convDesc2d.m_StrideX = 1; 1118 convDesc2d.m_StrideY = 1; 1119 convDesc2d.m_PadLeft = 1; 1120 convDesc2d.m_PadRight = 1; 1121 convDesc2d.m_PadTop = 1; 1122 convDesc2d.m_PadBottom = 1; 1123 convDesc2d.m_DataLayout = DataLayout::NHWC; 1124 1125 armnn::IConnectableLayer* const convLayer = network->AddConvolution2dLayer(convDesc2d, "conv"); 1126 ARMNN_ASSERT(convLayer); 1127 1128 armnn::IConnectableLayer* weightsLayer = network->AddConstantLayer(weights); 1129 1130 weightsLayer->GetOutputSlot(0).SetTensorInfo(weights.GetInfo()); 1131 weightsLayer->GetOutputSlot(0).Connect(convLayer->GetInputSlot(1u)); 1132 1133 inputLayer->GetOutputSlot(0).Connect(convLayer->GetInputSlot(0)); 1134 inputLayer->GetOutputSlot(0).SetTensorInfo(inputInfo); 1135 1136 IConnectableLayer* output = network->AddOutputLayer(0, "output"); 1137 convLayer->GetOutputSlot(0).Connect(output->GetInputSlot(0)); 1138 convLayer->GetOutputSlot(0).SetTensorInfo(outputInfo); 1139 1140 // Optimize the network 1141 OptimizerOptionsOpaque optOptions; 1142 optOptions.SetImportEnabled(false); 1143 optOptions.SetExportEnabled(false); 1144 std::vector<armnn::BackendId> backends = {armnn::Compute::GpuAcc}; 1145 IOptimizedNetworkPtr optNet = Optimize(*network, backends, runtime->GetDeviceSpec(), optOptions); 1146 CHECK(optNet); 1147 1148 // Loads it into the runtime. 1149 NetworkId netId; 1150 std::string ignoredErrorMessage; 1151 // Enable Importing 1152 INetworkProperties networkProperties(false, MemorySource::Undefined, MemorySource::Undefined); 1153 runtime->LoadNetwork(netId, std::move(optNet), ignoredErrorMessage, networkProperties); 1154 1155 // Creates structures for input & output 1156 const size_t alignment = 1157 arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>(); 1158 size_t space = totalBytes + alignment + alignment; 1159 auto inputData = std::make_unique<uint8_t[]>(space); 1160 void* copyInputPtr = inputData.get(); 1161 1162 // Fill input with values 1163 auto* inputPtr = reinterpret_cast<float*>(copyInputPtr); 1164 inputPtr[0] = 1; 1165 inputPtr[1] = 5; 1166 inputPtr[2] = 2; 1167 inputPtr[3] = 3; 1168 inputPtr[4] = 8; 1169 inputPtr[5] = 7; 1170 inputPtr[6] = 3; 1171 inputPtr[7] = 6; 1172 inputPtr[8] = 3; 1173 inputPtr[9] = 3; 1174 inputPtr[10] = 9; 1175 inputPtr[11] = 1; 1176 1177 // Create output buffer and fill it with -10.0f 1178 auto outputData = std::make_unique<uint8_t[]>(space); 1179 void* copyOutputPtr = outputData.get(); 1180 auto* outputPtr = reinterpret_cast<float*>(copyOutputPtr); 1181 std::fill_n(outputPtr, numElements, -10.0f); 1182 1183 TensorInfo inputTensorInfo = runtime->GetInputTensorInfo(netId, 0); 1184 inputTensorInfo.SetConstant(true); 1185 InputTensors inputTensors 1186 { 1187 {0,armnn::ConstTensor(inputTensorInfo, copyInputPtr)}, 1188 }; 1189 OutputTensors outputTensors 1190 { 1191 {0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), copyOutputPtr)} 1192 }; 1193 1194 runtime->GetProfiler(netId)->EnableProfiling(true); 1195 1196 // Do the inference without any pre-imported inputs/outputs 1197 runtime->EnqueueWorkload(netId, inputTensors, outputTensors); 1198 1199 // Retrieve the Profiler.AnalyzeEventsAndWriteResults() output to get the workload execution 1200 ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance(); 1201 std::stringstream ss; 1202 profilerManager.GetProfiler()->AnalyzeEventsAndWriteResults(ss); 1203 std::string dump = ss.str(); 1204 1205 // Contains Convolution2dWorkload 1206 std::size_t found = dump.find("Convolution2dWorkload"); 1207 CHECK(found != std::string::npos); 1208 1209 // Does not contain SyncMemGeneric 1210 found = dump.find("SyncMemGeneric"); 1211 CHECK(found == std::string::npos); 1212 1213 // Does contain CopyMemGeneric 1214 found = dump.find("CopyMemGeneric"); 1215 CHECK(found != std::string::npos); 1216 1217 // Sync the outputs so we can read the data 1218 arm_compute::CLScheduler::get().sync(); 1219 1220 // Check output is as expected 1221 auto* outputResult = reinterpret_cast<float*>(copyOutputPtr); 1222 CHECK(outputResult); 1223 CHECK(std::equal(outputResult, outputResult + numElements, expectedOutput.begin(), expectedOutput.end())); 1224 1225 // Repeat the inference, with new tensors and while using pre-importing to force it to import 1226 1227 // Creates structures for input & output 1228 auto inputDataImport = std::make_unique<uint8_t[]>(space); 1229 void* alignedInputImportPtr = inputDataImport.get(); 1230 CHECK(std::align(alignment, totalBytes, alignedInputImportPtr, space)); 1231 1232 // Fill input with values 1233 auto* inputImportPtr = reinterpret_cast<float*>(alignedInputImportPtr); 1234 inputImportPtr[0] = 1; 1235 inputImportPtr[1] = 5; 1236 inputImportPtr[2] = 2; 1237 inputImportPtr[3] = 3; 1238 inputImportPtr[4] = 8; 1239 inputImportPtr[5] = 7; 1240 inputImportPtr[6] = 3; 1241 inputImportPtr[7] = 6; 1242 inputImportPtr[8] = 3; 1243 inputImportPtr[9] = 3; 1244 inputImportPtr[10] = 9; 1245 inputImportPtr[11] = 1; 1246 1247 // Output pre-filled with -10.0f 1248 auto outputDataImport = std::make_unique<uint8_t[]>(space); 1249 void* alignedOutputImportPtr = outputDataImport.get(); 1250 CHECK(std::align(alignment, totalBytes, alignedOutputImportPtr, space)); 1251 auto* outputImportPtr = reinterpret_cast<float*>(alignedOutputImportPtr); 1252 std::fill_n(outputImportPtr, numElements, -10.0f); 1253 1254 InputTensors inputTensorsImport 1255 { 1256 {0,armnn::ConstTensor(inputTensorInfo, alignedInputImportPtr)}, 1257 }; 1258 OutputTensors outputTensorsImport 1259 { 1260 {0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), alignedOutputImportPtr)} 1261 }; 1262 1263 INFO("Run ImportInputs"); 1264 std::vector<ImportedInputId> importedInputIds = 1265 runtime->ImportInputs(netId, inputTensorsImport, MemorySource::Malloc); 1266 CHECK(importedInputIds.size() == 1); 1267 std::vector<ImportedOutputId> importedOutputIds = 1268 runtime->ImportOutputs(netId, outputTensorsImport, MemorySource::Malloc); 1269 CHECK(importedOutputIds.size() == 1); 1270 1271 // Do the inference with pre-imported inputs/outputs 1272 runtime->EnqueueWorkload(netId, InputTensors(), OutputTensors(), importedInputIds, importedOutputIds); 1273 // Sync the outputs so we can read the data 1274 arm_compute::CLScheduler::get().sync(); 1275 1276 // Check the output is correct 1277 outputResult = reinterpret_cast<float*>(alignedOutputImportPtr); 1278 CHECK(outputResult); 1279 CHECK(std::equal(outputResult, outputResult + numElements, expectedOutput.begin(), expectedOutput.end())); 1280 1281 1282 // Query the profiler again, this will contain the results of both inferences 1283 profilerManager.GetProfiler()->AnalyzeEventsAndWriteResults(ss); 1284 dump = ss.str(); 1285 1286 // Contains Convolution2dWorkload 1287 found = dump.find("Convolution2dWorkload"); 1288 CHECK(found != std::string::npos); 1289 1290 // Should now contain the SyncMemGeneric 1291 found = dump.find("SyncMemGeneric"); 1292 CHECK(found != std::string::npos); 1293 1294 // Should still contain a CopyMemGeneric from the first inference 1295 found = dump.find("CopyMemGeneric"); 1296 CHECK(found != std::string::npos); 1297 runtime->UnloadNetwork(netId); 1298 } 1299 1300 } 1301