/*
 * Copyright (C) 2021 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <android-base/logging.h>
#include <android-base/unique_fd.h>
#include <android/hardware_buffer.h>
#include <gtest/gtest.h>
#include <vulkan/vulkan.h>
#include <vulkan/vulkan_android.h>

#include <algorithm>
#include <cctype>
#include <cmath>
#include <cstring>
#include <memory>
#include <optional>
#include <string>
#include <tuple>
#include <utility>
#include <vector>

#include "TestNeuralNetworksWrapper.h"

#ifndef NNTEST_ONLY_PUBLIC_API
#include "Manager.h"
#endif

namespace android::nn {
namespace {

using Type = test_wrapper::Type;
using OperandType = test_wrapper::OperandType;
using Result = test_wrapper::Result;

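// Dimensions of the test operands, the number of pipeline iterations, and the cap on the number
// of mismatched elements reported per iteration.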
constexpr uint32_t kOperandSizeX = 256;
constexpr uint32_t kOperandSizeY = 256;
constexpr uint32_t kOperandLength = kOperandSizeX * kOperandSizeY;
constexpr uint32_t kNumberOfIterationsToTest = 100;
constexpr uint32_t kMaxNumberOfPrintedErrors = 10;

// This file implements a test suite that exercises a GPU -> NNAPI pipeline using AHardwareBuffer
// and sync fence. One pass of the pipeline involves the following three stages:
//
//   - GPU: Invoke the compute shader to clear all elements in the output buffer to the value "1"
//          of the corresponding element type. Because the GPU may not natively support
//          float16/int8/uint8 data types, we pack each data type into a 4-byte chunk as uint32_t
//          and pass it to the shader. E.g., float16 will be packed as 0x3c003c00 -- the float16
//          value "1" (0x3c00) repeated twice. The compute shader will use this 4-byte chunk to
//          clear the data in the output buffer (see CLEAR_DATA in the compute shader code).
//
//          The GPU workload will output directly to an AHardwareBuffer and export an Android sync
//          fence.
//
//   - NNAPI: Execute a broadcast ADD operation
//
//                output = ADD(input, const, act)
//
//            where "input" and "output" are of size [kOperandSizeY, kOperandSizeX], "const" and
//            "act" are model constant operands, "const" is of size [1] and value "1" of the
//            corresponding element type, and "act" = 0. The ADD operation will increment each
//            element in the input tensor by 1.
//
//            The NNAPI executor takes the GPU output AHardwareBuffer as its input memory,
//            and directly outputs to another AHardwareBuffer. We use startComputeWithDependencies
//            to wait on the sync fence from the GPU workload. If supported, the NNAPI executor will
//            emit a sync fence; otherwise, it will wait until the workload is finished.
//
//   - Check: Verify that each element in the resulting tensor is 1 + 1 = 2.
//
// We use the introspection API to run the pipeline with each individual driver. Because this test
// is added in NNAPI feature level 5, we exclude devices with a lower feature level. We expect that
// if the driver successfully prepares the model, it should finish execution without an error.
//
// The pipeline is tested with four data types: float32, float16, quant8_asymm, and
// quant8_asymm_signed. These data types are chosen to make sure that a driver is likely to
// support at least one of them.
//
// For each configuration, we run the pipeline for kNumberOfIterationsToTest iterations.

const std::vector<uint32_t> kComputeShader =
#include "shaders/TestGpuNnapi.comp.spv.inl"
        ;

// The expected element value in the final NNAPI output AHardwareBuffer.
constexpr uint32_t kExpectedResultInInt = 2;

// Helper template for information related to a primary tensor data type. Only four
// specializations exist for this template: Type::TENSOR_FLOAT32, Type::TENSOR_FLOAT16,
// Type::TENSOR_QUANT8_ASYMM, and Type::TENSOR_QUANT8_ASYMM_SIGNED. Each specialization
// corresponds to a primary data type for the testing pipeline.
//
// Each template specialization defines the following fields:
//   - ElementType: The corresponding C++ type. Use sizeof(ElementType) to get the element size.
//   - kIsQuantized: Whether the data type is a quantized type or not.
//   - kClearData: The CLEAR_DATA used in the compute shader.
//   - kTolerance: The absolute tolerance used to check the computation result.
template <Type dataType>
struct TestTypeHelper;
template <>
struct TestTypeHelper<Type::TENSOR_FLOAT32> {
    using ElementType = float;
    static constexpr bool kIsQuantized = false;
    // One float32 of value (1.0) packed into uint32_t
    static constexpr uint32_t kClearData = 0x3f800000;
    static constexpr double kTolerance = 1e-6;
};
template <>
struct TestTypeHelper<Type::TENSOR_FLOAT16> {
    using ElementType = _Float16;
    static constexpr bool kIsQuantized = false;
    // Two float16 of value (1.0) packed into uint32_t
    static constexpr uint32_t kClearData = 0x3c003c00;
    static constexpr double kTolerance = 1e-3;
};
template <>
struct TestTypeHelper<Type::TENSOR_QUANT8_ASYMM> {
    using ElementType = uint8_t;
    static constexpr bool kIsQuantized = true;
    // Four uint8_t of value (1) packed into uint32_t
    static constexpr uint32_t kClearData = 0x01010101;
    static constexpr double kTolerance = 0;
};
template <>
struct TestTypeHelper<Type::TENSOR_QUANT8_ASYMM_SIGNED> {
    using ElementType = int8_t;
    static constexpr bool kIsQuantized = true;
    // Four int8_t of value (1) packed into uint32_t
    static constexpr uint32_t kClearData = 0x01010101;
    static constexpr double kTolerance = 0;
};

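// Returns whether "requestedExtension" is present in the list of enumerated Vulkan extensions.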
bool isExtensionSupported(const std::vector<VkExtensionProperties>& supportedExtensions,
                          const char* requestedExtension) {
    return std::any_of(supportedExtensions.begin(), supportedExtensions.end(),
                       [requestedExtension](const auto& extension) {
                           return strcmp(extension.extensionName, requestedExtension) == 0;
                       });
}

// Records the workgroup size and the group counts of dispatching the compute shader.
struct DispatchSize {
    uint32_t workgroupSize;
    uint32_t groupCountX;
    uint32_t groupCountY;
};

// Choose an appropriate dispatch size. We are using a square workgroup size.
template <Type dataType>
DispatchSize chooseDispatchSize(const VkPhysicalDeviceLimits& limits) {
    // Compute the number of invocations along each dimension.
    const uint32_t elementSize = sizeof(typename TestTypeHelper<dataType>::ElementType);
    const uint32_t numberOfElementsPerInvocation = sizeof(uint32_t) / elementSize;
    const uint32_t workgroupInvocationsX = kOperandSizeX / numberOfElementsPerInvocation;
    const uint32_t workgroupInvocationsY = kOperandSizeY;

    // Make sure the workgroup size does not exceed the number of invocations along the X and Y
    // dimensions.
    uint32_t workgroupSize = std::min(workgroupInvocationsX, workgroupInvocationsY);

    // Make sure the workgroup size does not exceed the device limit along the X and Y dimensions.
    workgroupSize = std::min<uint32_t>(workgroupSize, limits.maxComputeWorkGroupSize[0]);
    workgroupSize = std::min<uint32_t>(workgroupSize, limits.maxComputeWorkGroupSize[1]);

    // Make sure the total number of invocations does not exceed the device limit.
    uint32_t maxSquareWorkGroupSize =
            static_cast<uint32_t>(std::sqrt(limits.maxComputeWorkGroupInvocations));
    workgroupSize = std::min(workgroupSize, maxSquareWorkGroupSize);

    // Round down to a power of 2. This is to make sure workgroupInvocationsX and
    // workgroupInvocationsY are divisible by the workgroup size so that we don't need to apply
    // bounds checks in the shader.
    uint32_t power = static_cast<uint32_t>(std::log2(static_cast<float>(workgroupSize)));
    workgroupSize = 1u << power;
    CHECK(workgroupInvocationsX % workgroupSize == 0);
    CHECK(workgroupInvocationsY % workgroupSize == 0);

    return {
            .workgroupSize = workgroupSize,
            .groupCountX = workgroupInvocationsX / workgroupSize,
            .groupCountY = workgroupInvocationsY / workgroupSize,
    };
}

// Find the first memory index that satisfies the requirements
// See VkAndroidHardwareBufferPropertiesANDROID::memoryTypeBits for the semantics of
// "memoryTypeBitsRequirement"
std::optional<uint32_t> findMemoryType(const VkPhysicalDeviceMemoryProperties& properties,
                                       uint32_t memoryTypeBitsRequirement,
                                       VkDeviceSize sizeRequirement) {
    for (uint32_t memoryIndex = 0; memoryIndex < VK_MAX_MEMORY_TYPES; ++memoryIndex) {
        const uint32_t memoryTypeBits = (1 << memoryIndex);
        const bool isRequiredMemoryType = memoryTypeBitsRequirement & memoryTypeBits;
        const uint32_t heapIndex = properties.memoryTypes[memoryIndex].heapIndex;
        const bool isLargeEnough = properties.memoryHeaps[heapIndex].size >= sizeRequirement;
        if (isRequiredMemoryType && isLargeEnough) return memoryIndex;
    }

    // failed to find memory type.
    return std::nullopt;
}

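// Records a buffer memory barrier that synchronizes access to "buffer" and transfers ownership
// between the given queue families (e.g. to or from VK_QUEUE_FAMILY_FOREIGN_EXT for the
// AHardwareBuffer-backed output buffer).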
void addBufferTransitionBarrier(VkCommandBuffer commandBuffer, VkBuffer buffer,
                                VkPipelineStageFlags srcStageMask,
                                VkPipelineStageFlags dstStageMask, VkAccessFlags srcAccessMask,
                                VkAccessFlags dstAccessMask, uint32_t srcQueue, uint32_t dstQueue) {
    const VkBufferMemoryBarrier bufferBarrier = {
            .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER,
            .pNext = nullptr,
            .srcAccessMask = srcAccessMask,
            .dstAccessMask = dstAccessMask,
            .srcQueueFamilyIndex = srcQueue,
            .dstQueueFamilyIndex = dstQueue,
            .buffer = buffer,
            .offset = 0,
            .size = VK_WHOLE_SIZE,
    };
    vkCmdPipelineBarrier(commandBuffer, srcStageMask, dstStageMask, 0, 0, nullptr, 1,
                         &bufferBarrier, 0, nullptr);
}

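// Allocates a BLOB-format AHardwareBuffer with the given size and usage bits. Skips the test if
// the device fails to allocate the buffer.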
void allocateBlobAhwb(uint32_t size, uint64_t usage, AHardwareBuffer** outAhwb) {
    AHardwareBuffer_Desc desc = {
            .width = size,
            .height = 1u,
            .layers = 1u,
            .format = AHARDWAREBUFFER_FORMAT_BLOB,
            .usage = usage,
    };
    if (AHardwareBuffer_allocate(&desc, outAhwb) != 0) {
        GTEST_SKIP() << "Device failed to allocate Android hardware buffer";
    }
}

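// A pair of the NNAPI device name and the device handle, used as the gtest parameter.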
using NameAndDevice = std::pair<const char*, const ANeuralNetworksDevice*>;

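// Collects the NNAPI devices to test with. Devices with a feature level lower than 5 are
// excluded because this test was added in NNAPI feature level 5.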
void getNnapiDevices(std::vector<NameAndDevice>* outDevices) {
    // Get the number of available NNAPI devices
    uint32_t numDevices = 0;
    ASSERT_EQ(ANeuralNetworks_getDeviceCount(&numDevices), ANEURALNETWORKS_NO_ERROR);

    std::vector<NameAndDevice> devices;
    for (uint32_t i = 0; i < numDevices; i++) {
        // Get device
        ANeuralNetworksDevice* device;
        ASSERT_EQ(ANeuralNetworks_getDevice(/*devIndex=*/i, &device), ANEURALNETWORKS_NO_ERROR);

        // Get device name
        const char* deviceName = nullptr;
        ASSERT_EQ(ANeuralNetworksDevice_getName(device, &deviceName), ANEURALNETWORKS_NO_ERROR);

        // Check device feature level. This test is added in NNAPI feature level 5, so skip if the
        // device is of a lower feature level.
        int64_t featureLevel;
        ASSERT_EQ(ANeuralNetworksDevice_getFeatureLevel(device, &featureLevel),
                  ANEURALNETWORKS_NO_ERROR);
        if (featureLevel < ANEURALNETWORKS_FEATURE_LEVEL_5) {
            continue;
        }

        devices.emplace_back(deviceName, device);
    }
    *outDevices = std::move(devices);
}

std::vector<NameAndDevice> getNnapiDevices() {
    std::vector<NameAndDevice> devices;
    getNnapiDevices(&devices);
    return devices;
}

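// Converts an NNAPI device name into a valid gtest test-name suffix.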
std::string printGpuNnapiTest(const testing::TestParamInfo<NameAndDevice>& info) {
    std::string name = info.param.first;
    // gtest test names must only contain alphanumeric characters
    std::replace_if(
            name.begin(), name.end(), [](char c) { return !std::isalnum(c); }, '_');
    return name;
}

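// Sets up a Vulkan compute pipeline whose only job is to clear the output AHardwareBuffer to the
// CLEAR_DATA value of the given data type, and to export an Android sync fence for the GPU work.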
template <Type dataType>
class VulkanComputePipeline {
   public:
    // Returns the created object on success, or nullptr on failure.
    static std::unique_ptr<VulkanComputePipeline> create(AHardwareBuffer* output) {
        auto pipeline = std::make_unique<VulkanComputePipeline>();
        pipeline->initialize(output);
        return pipeline->mIsValid ? std::move(pipeline) : nullptr;
    }

    ~VulkanComputePipeline() {
        if (mDevice != VK_NULL_HANDLE) {
            vkDestroyFence(mDevice, mFence, nullptr);
            vkDestroyPipeline(mDevice, mPipeline, nullptr);
            vkDestroyDescriptorSetLayout(mDevice, mDescriptorSetLayout, nullptr);
            vkDestroyPipelineLayout(mDevice, mPipelineLayout, nullptr);
            vkFreeMemory(mDevice, mOutputBufferMemory, nullptr);
            vkDestroyBuffer(mDevice, mOutputBuffer, nullptr);
            vkDestroyShaderModule(mDevice, mShaderModule, nullptr);
            vkDestroyCommandPool(mDevice, mCommandPool, nullptr);
            vkDestroyDescriptorPool(mDevice, mDescriptorPool, nullptr);
        }
        vkDestroyDevice(mDevice, nullptr);
        vkDestroyInstance(mInstance, nullptr);
    }

    // Returns {success, sync_fd}
    std::pair<bool, base::unique_fd> run() {
        bool success = false;
        base::unique_fd outSyncFd;
        runInternal(&success, &outSyncFd);
        return {success, std::move(outSyncFd)};
    }

   private:
    void initialize(AHardwareBuffer* output) {
        // Create instance
        const VkApplicationInfo applicationDesc = {
                .sType = VK_STRUCTURE_TYPE_APPLICATION_INFO,
                .pApplicationName = "TestGpuNnapi",
                .applicationVersion = VK_MAKE_VERSION(1, 0, 0),
                .apiVersion = VK_API_VERSION_1_1,
        };
        const VkInstanceCreateInfo instanceDesc = {
                .sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO,
                .pApplicationInfo = &applicationDesc,
                .enabledLayerCount = 0,
                .ppEnabledLayerNames = nullptr,
                .enabledExtensionCount = 0,
                .ppEnabledExtensionNames = nullptr,
        };
        ASSERT_EQ(vkCreateInstance(&instanceDesc, nullptr, &mInstance), VK_SUCCESS);

        // Enumerate physical devices
        uint32_t numberOfDevices = 0;
        ASSERT_EQ(vkEnumeratePhysicalDevices(mInstance, &numberOfDevices, nullptr), VK_SUCCESS);
        std::vector<VkPhysicalDevice> physicalDevices(numberOfDevices);
        ASSERT_EQ(vkEnumeratePhysicalDevices(mInstance, &numberOfDevices, physicalDevices.data()),
                  VK_SUCCESS);

        // Pick the first device with a compute queue
        for (const auto& physicalDevice : physicalDevices) {
            uint32_t numberOfQueueFamilies = 0;
            vkGetPhysicalDeviceQueueFamilyProperties(physicalDevice, &numberOfQueueFamilies,
                                                     nullptr);
            std::vector<VkQueueFamilyProperties> queueFamilies(numberOfQueueFamilies);
            vkGetPhysicalDeviceQueueFamilyProperties(physicalDevice, &numberOfQueueFamilies,
                                                     queueFamilies.data());

            uint32_t pickedQueueFamilyIndex = 0;
            bool hasComputeQueue = false;
            for (uint32_t i = 0; i < queueFamilies.size(); i++) {
                if (queueFamilies[i].queueFlags & VK_QUEUE_COMPUTE_BIT) {
                    pickedQueueFamilyIndex = i;
                    hasComputeQueue = true;
                    break;
                }
            }
            if (!hasComputeQueue) continue;
            mPhysicalDevice = physicalDevice;
            mQueueFamilyIndex = pickedQueueFamilyIndex;
            break;
        }
        if (mPhysicalDevice == VK_NULL_HANDLE) {
            GTEST_SKIP() << "No device can handle a compute queue";
        }

        // Get physical device properties
        vkGetPhysicalDeviceProperties(mPhysicalDevice, &mPhysicalDeviceProperties);
        vkGetPhysicalDeviceMemoryProperties(mPhysicalDevice, &mPhysicalDeviceMemoryProperties);

        // Check physical device version
        if (mPhysicalDeviceProperties.apiVersion < VK_API_VERSION_1_1) {
            GTEST_SKIP() << "Device API version too low";
        }

        // Check if the physical device is able to handle the compute work
        const auto dispatchSize = chooseDispatchSize<dataType>(mPhysicalDeviceProperties.limits);
        if (mPhysicalDeviceProperties.limits.maxComputeWorkGroupCount[0] <
            dispatchSize.groupCountX) {
            GTEST_SKIP() << "Device cannot handle " << dispatchSize.groupCountX
                         << " workgroups for the X dimension";
        }
        if (mPhysicalDeviceProperties.limits.maxComputeWorkGroupCount[1] <
            dispatchSize.groupCountY) {
            GTEST_SKIP() << "Device cannot handle " << dispatchSize.groupCountY
                         << " workgroups for the Y dimension";
        }

        // Enumerate device extensions
        uint32_t numberOfExtensions = 0;
        ASSERT_EQ(vkEnumerateDeviceExtensionProperties(mPhysicalDevice, nullptr,
                                                       &numberOfExtensions, nullptr),
                  VK_SUCCESS);
        std::vector<VkExtensionProperties> extensions(numberOfExtensions);
        ASSERT_EQ(vkEnumerateDeviceExtensionProperties(mPhysicalDevice, nullptr,
                                                       &numberOfExtensions, extensions.data()),
                  VK_SUCCESS);

        // Required device extensions
        std::vector<const char*> requiredDeviceExtensions = {
                // The following extensions are required to import an AHardwareBuffer to Vulkan
                VK_ANDROID_EXTERNAL_MEMORY_ANDROID_HARDWARE_BUFFER_EXTENSION_NAME,
                VK_EXT_QUEUE_FAMILY_FOREIGN_EXTENSION_NAME,
                VK_KHR_GET_MEMORY_REQUIREMENTS_2_EXTENSION_NAME,
                VK_KHR_BIND_MEMORY_2_EXTENSION_NAME,
                VK_KHR_EXTERNAL_MEMORY_EXTENSION_NAME,
                // The following extensions are required to export a sync fence
                VK_KHR_EXTERNAL_FENCE_FD_EXTENSION_NAME,
                VK_KHR_MAINTENANCE1_EXTENSION_NAME,
        };
        for (const char* requiredDeviceExtension : requiredDeviceExtensions) {
            if (!isExtensionSupported(extensions, requiredDeviceExtension)) {
                GTEST_SKIP() << "Device extension " << requiredDeviceExtension
                             << " is not supported";
            }
        }

        // Check external memory properties
        const VkPhysicalDeviceExternalBufferInfo externalBufferInfo = {
                .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTERNAL_BUFFER_INFO,
                .pNext = nullptr,
                .flags = 0u,
                .usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
                .handleType = VK_EXTERNAL_MEMORY_HANDLE_TYPE_ANDROID_HARDWARE_BUFFER_BIT_ANDROID,
        };
        VkExternalBufferProperties externalBufferProperties = {
                .sType = VK_STRUCTURE_TYPE_EXTERNAL_BUFFER_PROPERTIES,
        };
        vkGetPhysicalDeviceExternalBufferProperties(mPhysicalDevice, &externalBufferInfo,
                                                    &externalBufferProperties);
        if (!(externalBufferProperties.externalMemoryProperties.externalMemoryFeatures &
              VK_EXTERNAL_MEMORY_FEATURE_IMPORTABLE_BIT)) {
            GTEST_SKIP() << "Device is not able to import Android hardware buffer";
        }
        ASSERT_FALSE(externalBufferProperties.externalMemoryProperties.externalMemoryFeatures &
                     VK_EXTERNAL_MEMORY_FEATURE_DEDICATED_ONLY_BIT);

        // Check external fence properties
        const VkPhysicalDeviceExternalFenceInfo externalFenceInfo = {
                .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTERNAL_FENCE_INFO,
                .pNext = nullptr,
                .handleType = VK_EXTERNAL_FENCE_HANDLE_TYPE_SYNC_FD_BIT,
        };
        VkExternalFenceProperties externalFenceProperties;
        vkGetPhysicalDeviceExternalFenceProperties(mPhysicalDevice, &externalFenceInfo,
                                                   &externalFenceProperties);
        if (!(externalFenceProperties.externalFenceFeatures &
              VK_EXTERNAL_FENCE_FEATURE_EXPORTABLE_BIT)) {
            GTEST_SKIP() << "Device is not able to export Android sync fence FD";
        }

        // Create logical device
        const float queuePriority = 1.0f;
        const VkDeviceQueueCreateInfo queueDesc = {
                .sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO,
                .queueFamilyIndex = mQueueFamilyIndex,
                .queueCount = 1,
                .pQueuePriorities = &queuePriority,
        };
        const VkDeviceCreateInfo deviceDesc = {
                .sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO,
                .queueCreateInfoCount = 1,
                .pQueueCreateInfos = &queueDesc,
                .enabledExtensionCount = static_cast<uint32_t>(requiredDeviceExtensions.size()),
                .ppEnabledExtensionNames = requiredDeviceExtensions.data(),
                .pEnabledFeatures = nullptr,
        };
        ASSERT_EQ(vkCreateDevice(mPhysicalDevice, &deviceDesc, nullptr, &mDevice), VK_SUCCESS);
        vkGetDeviceQueue(mDevice, mQueueFamilyIndex, 0, &mQueue);

        // Get extension function pointers
        mPfnVkGetFenceFdKHR = reinterpret_cast<PFN_vkGetFenceFdKHR>(
                vkGetDeviceProcAddr(mDevice, "vkGetFenceFdKHR"));
        ASSERT_NE(mPfnVkGetFenceFdKHR, nullptr);

        // Create descriptor pool
        const std::vector<VkDescriptorPoolSize> descriptorPoolSizes = {
                {
                        .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
                        .descriptorCount = 1,
                },
        };
        const VkDescriptorPoolCreateInfo descriptorPoolCreateInfo = {
                .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO,
                .maxSets = 1,
                .poolSizeCount = static_cast<uint32_t>(descriptorPoolSizes.size()),
                .pPoolSizes = descriptorPoolSizes.data(),
        };
        ASSERT_EQ(vkCreateDescriptorPool(mDevice, &descriptorPoolCreateInfo, nullptr,
                                         &mDescriptorPool),
                  VK_SUCCESS);

        // Create descriptor set layout
        const std::vector<VkDescriptorSetLayoutBinding> descriptorsetLayoutBinding = {
                {
                        .binding = 0,  // output buffer
                        .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
                        .descriptorCount = 1,
                        .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
                },

        };
        const VkDescriptorSetLayoutCreateInfo descriptorsetLayoutDesc = {
                .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO,
                .bindingCount = static_cast<uint32_t>(descriptorsetLayoutBinding.size()),
                .pBindings = descriptorsetLayoutBinding.data(),
        };
        ASSERT_EQ(vkCreateDescriptorSetLayout(mDevice, &descriptorsetLayoutDesc, nullptr,
                                              &mDescriptorSetLayout),
                  VK_SUCCESS);

        // Allocate descriptor set
        const VkDescriptorSetAllocateInfo descriptorSetAllocateInfo = {
                .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO,
                .descriptorPool = mDescriptorPool,
                .descriptorSetCount = 1,
                .pSetLayouts = &mDescriptorSetLayout,
        };
        ASSERT_EQ(vkAllocateDescriptorSets(mDevice, &descriptorSetAllocateInfo, &mDescriptorSet),
                  VK_SUCCESS);

        // Check the output AHardwareBuffer format and usage bits
        AHardwareBuffer_Desc desc;
        AHardwareBuffer_describe(output, &desc);
        ASSERT_EQ(desc.format, AHARDWAREBUFFER_FORMAT_BLOB);
        ASSERT_TRUE(desc.usage & AHARDWAREBUFFER_USAGE_GPU_DATA_BUFFER);

        // Get AHardwareBuffer properties
        VkAndroidHardwareBufferPropertiesANDROID properties = {
                .sType = VK_STRUCTURE_TYPE_ANDROID_HARDWARE_BUFFER_PROPERTIES_ANDROID,
                .pNext = nullptr,
        };
        ASSERT_EQ(vkGetAndroidHardwareBufferPropertiesANDROID(mDevice, output, &properties),
                  VK_SUCCESS);

        // Create the output buffer with AHardwareBuffer memory
        const VkExternalMemoryBufferCreateInfo externalMemoryBufferCreateInfo = {
                .sType = VK_STRUCTURE_TYPE_EXTERNAL_MEMORY_BUFFER_CREATE_INFO,
                .pNext = nullptr,
                .handleTypes = VK_EXTERNAL_MEMORY_HANDLE_TYPE_ANDROID_HARDWARE_BUFFER_BIT_ANDROID,
        };
        const VkBufferCreateInfo bufferCreateInfo = {
                .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
                .pNext = &externalMemoryBufferCreateInfo,
                .flags = 0u,
                .size = desc.width,
                .usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
                .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
                .queueFamilyIndexCount = 0u,
                .pQueueFamilyIndices = nullptr,
        };
        ASSERT_EQ(vkCreateBuffer(mDevice, &bufferCreateInfo, nullptr, &mOutputBuffer), VK_SUCCESS);

        // Find a proper memory type
        const auto maybeMemoryTypeIndex =
                findMemoryType(mPhysicalDeviceMemoryProperties, properties.memoryTypeBits,
                               properties.allocationSize);
        if (!maybeMemoryTypeIndex.has_value()) {
            GTEST_SKIP() << "None of the memory types is suitable for allocation";
        }

        // Import the AHardwareBuffer memory
        const VkImportAndroidHardwareBufferInfoANDROID importMemoryAllocateInfo = {
                .sType = VK_STRUCTURE_TYPE_IMPORT_ANDROID_HARDWARE_BUFFER_INFO_ANDROID,
                .pNext = nullptr,
                .buffer = output,
        };
        const VkMemoryAllocateInfo memoryAllocInfo = {
                .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
                .pNext = &importMemoryAllocateInfo,
                .allocationSize = properties.allocationSize,
                .memoryTypeIndex = maybeMemoryTypeIndex.value(),
        };
        const auto allocationResult =
                vkAllocateMemory(mDevice, &memoryAllocInfo, nullptr, &mOutputBufferMemory);
        // Memory allocation may fail if the size exceeds the upper limit of a single allocation
        // that the platform supports
        if (allocationResult == VK_ERROR_OUT_OF_DEVICE_MEMORY) {
            GTEST_SKIP() << "Unable to allocate device memory of " << properties.allocationSize
                         << " bytes";
        }
        ASSERT_EQ(allocationResult, VK_SUCCESS);

        // Bind the memory with the buffer
        ASSERT_EQ(vkBindBufferMemory(mDevice, mOutputBuffer, mOutputBufferMemory, 0), VK_SUCCESS);

        // Update the descriptor sets
        const VkDescriptorBufferInfo outputBufferDesc = {
                .buffer = mOutputBuffer,
                .offset = 0,
                .range = VK_WHOLE_SIZE,
        };
        const std::vector<VkWriteDescriptorSet> writeDst = {
                {
                        .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
                        .pNext = nullptr,
                        .dstSet = mDescriptorSet,
                        .dstBinding = 0,  // output buffer
                        .dstArrayElement = 0,
                        .descriptorCount = 1,
                        .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
                        .pImageInfo = nullptr,
                        .pBufferInfo = &outputBufferDesc,
                        .pTexelBufferView = nullptr,
                },
        };
        vkUpdateDescriptorSets(mDevice, writeDst.size(), writeDst.data(), 0, nullptr);

        // Create shader module
        const VkShaderModuleCreateInfo shaderDesc = {
                .sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO,
                .flags = 0,
                .codeSize = kComputeShader.size() * sizeof(uint32_t),
                .pCode = kComputeShader.data(),
        };
        ASSERT_EQ(vkCreateShaderModule(mDevice, &shaderDesc, nullptr, &mShaderModule), VK_SUCCESS);

        // Create pipeline layout
        const VkPipelineLayoutCreateInfo layoutDesc = {
                .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
                .setLayoutCount = 1,
                .pSetLayouts = &mDescriptorSetLayout,
                .pushConstantRangeCount = 0,
                .pPushConstantRanges = nullptr,
        };
        ASSERT_EQ(vkCreatePipelineLayout(mDevice, &layoutDesc, nullptr, &mPipelineLayout),
                  VK_SUCCESS);

        // Create compute pipeline
        const uint32_t specializationData[] = {
                dispatchSize.workgroupSize,            // local_size_x
                dispatchSize.workgroupSize,            // local_size_y
                TestTypeHelper<dataType>::kClearData,  // CLEAR_DATA
        };
        const std::vector<VkSpecializationMapEntry> specializationMap = {
                // {constantID, offset, size}
                {0, 0 * sizeof(uint32_t), sizeof(uint32_t)},
                {1, 1 * sizeof(uint32_t), sizeof(uint32_t)},
                {2, 2 * sizeof(uint32_t), sizeof(uint32_t)},
        };
        const VkSpecializationInfo specializationInfo = {
                .mapEntryCount = static_cast<uint32_t>(specializationMap.size()),
                .pMapEntries = specializationMap.data(),
                .dataSize = sizeof(specializationData),
                .pData = specializationData,
        };
        const VkComputePipelineCreateInfo pipelineDesc = {
                .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
                .stage =
                        {
                                .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
                                .stage = VK_SHADER_STAGE_COMPUTE_BIT,
                                .module = mShaderModule,
                                .pName = "main",
                                .pSpecializationInfo = &specializationInfo,
                        },
                .layout = mPipelineLayout,
        };
        ASSERT_EQ(vkCreateComputePipelines(mDevice, VK_NULL_HANDLE, 1, &pipelineDesc, nullptr,
                                           &mPipeline),
                  VK_SUCCESS);

        // Create command pool
        const VkCommandPoolCreateInfo cmdpoolDesc = {
                .sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO,
                .flags = 0u,
                .queueFamilyIndex = mQueueFamilyIndex,
        };
        ASSERT_EQ(vkCreateCommandPool(mDevice, &cmdpoolDesc, nullptr, &mCommandPool), VK_SUCCESS);

        // Create a command buffer
        const VkCommandBufferAllocateInfo cmdBufferCreateInfo = {
                .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO,
                .pNext = nullptr,
                .commandPool = mCommandPool,
                .level = VK_COMMAND_BUFFER_LEVEL_PRIMARY,
                .commandBufferCount = 1,
        };
        ASSERT_EQ(vkAllocateCommandBuffers(mDevice, &cmdBufferCreateInfo, &mCommandBuffer),
                  VK_SUCCESS);

        // Record command buffer
        const VkCommandBufferBeginInfo commandBufferBeginInfo = {
                .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO,
                .pNext = nullptr,
                .flags = 0,
                .pInheritanceInfo = nullptr,
        };
        ASSERT_EQ(vkBeginCommandBuffer(mCommandBuffer, &commandBufferBeginInfo), VK_SUCCESS);

        // Buffer barrier to acquire the ownership of the output buffer
        addBufferTransitionBarrier(mCommandBuffer, mOutputBuffer, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT,
                                   VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0,
                                   VK_ACCESS_SHADER_WRITE_BIT, VK_QUEUE_FAMILY_FOREIGN_EXT,
                                   mQueueFamilyIndex);

        // Setup resources
        vkCmdBindPipeline(mCommandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, mPipeline);
        vkCmdBindDescriptorSets(mCommandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, mPipelineLayout, 0,
                                1, &mDescriptorSet, 0, nullptr);

        // Dispatch compute
        vkCmdDispatch(mCommandBuffer, dispatchSize.groupCountX, dispatchSize.groupCountY, 1);

        // Buffer barrier to release the ownership of the output buffer
        addBufferTransitionBarrier(mCommandBuffer, mOutputBuffer,
                                   VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
                                   VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, VK_ACCESS_SHADER_WRITE_BIT,
                                   0, mQueueFamilyIndex, VK_QUEUE_FAMILY_FOREIGN_EXT);

        // Finish recording the command buffer
        ASSERT_EQ(vkEndCommandBuffer(mCommandBuffer), VK_SUCCESS);

        // Create fence
        const VkExportFenceCreateInfo exportFenceCreateInfo = {
                .sType = VK_STRUCTURE_TYPE_EXPORT_FENCE_CREATE_INFO,
                .pNext = nullptr,
                .handleTypes = VK_EXTERNAL_FENCE_HANDLE_TYPE_SYNC_FD_BIT,
        };
        const VkFenceCreateInfo fenceCreateInfo = {
                .sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO,
                .pNext = &exportFenceCreateInfo,
                .flags = 0,
        };
        ASSERT_EQ(vkCreateFence(mDevice, &fenceCreateInfo, nullptr, &mFence), VK_SUCCESS);

        mIsValid = true;
    }

    void runInternal(bool* outSuccess, base::unique_fd* outSyncFd) {
        *outSuccess = false;

        // Submit to queue
        const VkSubmitInfo submitInfo = {
                .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO,
                .waitSemaphoreCount = 0,
                .pWaitSemaphores = nullptr,
                .pWaitDstStageMask = nullptr,
                .commandBufferCount = 1,
                .pCommandBuffers = &mCommandBuffer,
                .signalSemaphoreCount = 0,
                .pSignalSemaphores = nullptr,
        };
        ASSERT_EQ(vkResetFences(mDevice, 1, &mFence), VK_SUCCESS);
        ASSERT_EQ(vkQueueSubmit(mQueue, 1, &submitInfo, mFence), VK_SUCCESS);

        // Export an Android sync fence FD
        int syncFd = -1;
        const VkFenceGetFdInfoKHR fenceGetFdInfo = {
                .sType = VK_STRUCTURE_TYPE_FENCE_GET_FD_INFO_KHR,
                .pNext = nullptr,
                .fence = mFence,
                .handleType = VK_EXTERNAL_FENCE_HANDLE_TYPE_SYNC_FD_BIT,
        };
        ASSERT_EQ(mPfnVkGetFenceFdKHR(mDevice, &fenceGetFdInfo, &syncFd), VK_SUCCESS);
        *outSyncFd = base::unique_fd(syncFd);

        *outSuccess = true;
    }

    // Instance
    VkInstance mInstance = VK_NULL_HANDLE;

    // Physical device and queue family
    VkPhysicalDevice mPhysicalDevice = VK_NULL_HANDLE;
    VkPhysicalDeviceProperties mPhysicalDeviceProperties{};
    VkPhysicalDeviceMemoryProperties mPhysicalDeviceMemoryProperties{};
    uint32_t mQueueFamilyIndex = 0;

    // Logical device and queue
    VkDevice mDevice = VK_NULL_HANDLE;
    VkQueue mQueue = VK_NULL_HANDLE;

    // Extension functions
    PFN_vkGetFenceFdKHR mPfnVkGetFenceFdKHR = nullptr;

    // Resource descriptors
    VkDescriptorPool mDescriptorPool = VK_NULL_HANDLE;
    VkDescriptorSetLayout mDescriptorSetLayout = VK_NULL_HANDLE;
    VkDescriptorSet mDescriptorSet = VK_NULL_HANDLE;

    // Output buffer
    VkBuffer mOutputBuffer = VK_NULL_HANDLE;
    VkDeviceMemory mOutputBufferMemory = VK_NULL_HANDLE;

    // Compute pipeline
    VkShaderModule mShaderModule = VK_NULL_HANDLE;
    VkPipelineLayout mPipelineLayout = VK_NULL_HANDLE;
    VkPipeline mPipeline = VK_NULL_HANDLE;

    // Command buffer
    VkCommandPool mCommandPool = VK_NULL_HANDLE;
    VkCommandBuffer mCommandBuffer = VK_NULL_HANDLE;
    VkFence mFence = VK_NULL_HANDLE;

    bool mIsValid = false;
};

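// Builds a single-operation NNAPI model (broadcast ADD with a constant of value "1"), compiles it
// for the given device, and executes it with AHardwareBuffer-backed input/output memories and
// fenced dependencies.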
template <Type dataType>
class NnapiExecutor {
   public:
    // Returns the created object on success, or nullptr on failure.
    static std::unique_ptr<NnapiExecutor> create(const ANeuralNetworksDevice* device,
                                                 AHardwareBuffer* input, AHardwareBuffer* output) {
        auto nnapi = std::make_unique<NnapiExecutor>(input, output);
        nnapi->initialize(device);
        return nnapi->mIsValid ? std::move(nnapi) : nullptr;
    }

    // Prefer NnapiExecutor::create
    NnapiExecutor(AHardwareBuffer* input, AHardwareBuffer* output)
        : mInputMemory(input), mOutputMemory(output) {}

    // Returns {success, sync_fd}
    std::pair<bool, base::unique_fd> run(const base::unique_fd& inSyncFd) {
        bool success = false;
        base::unique_fd outSyncFd;
        runInternal(inSyncFd, &success, &outSyncFd);
        return {success, std::move(outSyncFd)};
    }

   private:
    using ElementType = typename TestTypeHelper<dataType>::ElementType;

    void initialize(const ANeuralNetworksDevice* device) {
        ASSERT_TRUE(mInputMemory.isValid());
        ASSERT_TRUE(mOutputMemory.isValid());

        // Model input
        const float scale = TestTypeHelper<dataType>::kIsQuantized ? 1.0f : 0.0f;
        const OperandType tensorType(dataType, {kOperandSizeY, kOperandSizeX}, scale,
                                     /*zeroPoint=*/0);
        uint32_t inputTensor = mModel.addOperand(&tensorType);

        // Constant tensor
        const OperandType constTensorType(dataType, {1}, scale, /*zeroPoint=*/0);
        const ElementType constTensorData = static_cast<ElementType>(1);
        uint32_t constTensor =
                mModel.addConstantOperand<ElementType>(&constTensorType, constTensorData);

        // Activation (NONE)
        const OperandType activationType(Type::INT32, {});
        uint32_t activationScalar = mModel.addConstantOperand<int32_t>(&activationType, 0);

        // Model output
        uint32_t outputTensor = mModel.addOperand(&tensorType);

        // Model operation
        mModel.addOperation(ANEURALNETWORKS_ADD, {inputTensor, constTensor, activationScalar},
                            {outputTensor});

        // Finish model
        mModel.identifyInputsAndOutputs({inputTensor}, {outputTensor});
        mModel.relaxComputationFloat32toFloat16(/*isRelax=*/true);
        ASSERT_TRUE(mModel.isValid());
        ASSERT_EQ(mModel.finish(), Result::NO_ERROR);

        // Create compilation for the target device
        Result result;
        std::tie(result, mCompilation) =
                test_wrapper::Compilation::createForDevice(&mModel, device);
        ASSERT_EQ(result, Result::NO_ERROR);

        // Finish the compilation
        result = mCompilation.finish();
        if (result != Result::NO_ERROR) {
            GTEST_SKIP() << "Model is not supported by the device";
        }

        mIsValid = true;
    }

    void runInternal(const base::unique_fd& inSyncFd, bool* outSuccess,
                     base::unique_fd* outSyncFd) {
        *outSuccess = false;

        // Setup execution
        mExecution = std::make_unique<test_wrapper::Execution>(&mCompilation);
        ASSERT_EQ(mExecution->setInputFromMemory(/*index=*/0, &mInputMemory, /*offset=*/0,
                                                 kOperandLength * sizeof(ElementType)),
                  Result::NO_ERROR);
        ASSERT_EQ(mExecution->setOutputFromMemory(/*index=*/0, &mOutputMemory, /*offset=*/0,
                                                  kOperandLength * sizeof(ElementType)),
                  Result::NO_ERROR);

        // Setup dependencies
        std::vector<const test_wrapper::Event*> dependencies;
        test_wrapper::Event start;
        // The sync fence from Vulkan may not be valid if the GPU workload has already finished
        // prior to exporting the fence.
        if (inSyncFd.ok()) {
            start = test_wrapper::Event(inSyncFd.get());
            ASSERT_TRUE(start.isValid());
            dependencies = {&start};
        }

        // Fenced compute
        test_wrapper::Event finished;
        mExecution->startComputeWithDependencies(dependencies, /*infinite timeout*/ 0, &finished);

        // Get the output sync fence if supported; otherwise, wait until the execution is finished
        int syncFd = -1;
        finished.getSyncFenceFd(&syncFd);
        if (syncFd == -1) {
            ASSERT_EQ(finished.wait(), Result::NO_ERROR);
        }
        *outSyncFd = base::unique_fd(syncFd);
        *outSuccess = true;
    }

    test_wrapper::Model mModel;
    test_wrapper::Compilation mCompilation;
    std::unique_ptr<test_wrapper::Execution> mExecution;
    test_wrapper::Memory mInputMemory, mOutputMemory;
    bool mIsValid = false;
};

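// Parameterized test fixture that runs the full GPU -> NNAPI pipeline against each NNAPI device
// returned by getNnapiDevices().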
class GpuNnapiTest : public testing::TestWithParam<NameAndDevice> {
   protected:
    void TearDown() override {
        if (mGpuOutput) {
            AHardwareBuffer_release(mGpuOutput);
        }
        if (mNnapiOutput) {
            AHardwareBuffer_release(mNnapiOutput);
        }
    }

    template <Type dataType>
    void runTest() {
#ifndef NNTEST_ONLY_PUBLIC_API
        if (DeviceManager::get()->getUseCpuOnly()) {
            GTEST_SKIP();
        }
#endif

        // Allocate hardware buffers for GPU and NNAPI outputs
        const size_t size = kOperandLength * sizeof(typename TestTypeHelper<dataType>::ElementType);
        allocateBlobAhwb(
                size, AHARDWAREBUFFER_USAGE_GPU_DATA_BUFFER | AHARDWAREBUFFER_USAGE_CPU_READ_OFTEN,
                &mGpuOutput);
        allocateBlobAhwb(
                size, AHARDWAREBUFFER_USAGE_CPU_READ_OFTEN | AHARDWAREBUFFER_USAGE_CPU_WRITE_OFTEN,
                &mNnapiOutput);
        if (mGpuOutput == nullptr || mNnapiOutput == nullptr) return;

        // Create Vulkan compute pipeline
        auto vulkan = VulkanComputePipeline<dataType>::create(mGpuOutput);
        if (vulkan == nullptr) return;

        // Create NNAPI executor
        auto nnapi = NnapiExecutor<dataType>::create(kDevice, mGpuOutput, mNnapiOutput);
        if (nnapi == nullptr) return;

        // Run the test repeatedly for kNumberOfIterationsToTest iterations
        for (uint32_t i = 0; i < kNumberOfIterationsToTest; i++) {
            auto [gpuSuccess, gpuSyncFd] = vulkan->run();
            ASSERT_TRUE(gpuSuccess);

            auto [nnapiSuccess, nnapiSyncFd] = nnapi->run(gpuSyncFd);
            ASSERT_TRUE(nnapiSuccess);

            const double tolerance = TestTypeHelper<dataType>::kTolerance;
            checkResults<dataType>(std::move(nnapiSyncFd), tolerance);
        }
    }

    template <Type dataType>
    void checkResults(base::unique_fd syncFd, double tolerance) {
        using ElementType = typename TestTypeHelper<dataType>::ElementType;

        // Lock the buffer with the sync fence
        // AHardwareBuffer_lock will take ownership of the sync fence and close it even on errors
        void* data;
        ASSERT_EQ(AHardwareBuffer_lock(mNnapiOutput, AHARDWAREBUFFER_USAGE_CPU_READ_OFTEN,
                                       syncFd.release(), /*rect=*/nullptr, &data),
                  0);

        // Compare the actual results with the expected value
        uint32_t numberOfErrors = 0;
        const ElementType expected = static_cast<ElementType>(kExpectedResultInInt);
        for (uint32_t i = 0; i < kOperandLength; i++) {
            const ElementType actual = reinterpret_cast<ElementType*>(data)[i];

            // We expect the absolute difference in double to be within the tolerance.
            const double expected_f64 = static_cast<double>(expected);
            const double actual_f64 = static_cast<double>(actual);
            const double diff = std::abs(expected_f64 - actual_f64);
            if (diff > tolerance) {
                // Print at most kMaxNumberOfPrintedErrors errors by EXPECT_NEAR
                if (numberOfErrors < kMaxNumberOfPrintedErrors) {
                    EXPECT_NEAR(actual_f64, expected_f64, tolerance)
                            << "When comparing element [" << i / kOperandSizeX << ", "
                            << i % kOperandSizeX << "]";
                }
                numberOfErrors++;
            }
        }
        EXPECT_EQ(numberOfErrors, 0u);
        ASSERT_EQ(AHardwareBuffer_unlock(mNnapiOutput, /*fence=*/nullptr), 0);
    }

    // The NNAPI device under test
    const ANeuralNetworksDevice* kDevice = GetParam().second;

    AHardwareBuffer* mGpuOutput = nullptr;
    AHardwareBuffer* mNnapiOutput = nullptr;
};

TEST_P(GpuNnapiTest, Float32) {
    runTest<Type::TENSOR_FLOAT32>();
}
TEST_P(GpuNnapiTest, Float16) {
    runTest<Type::TENSOR_FLOAT16>();
}
TEST_P(GpuNnapiTest, Quant8Asymm) {
    runTest<Type::TENSOR_QUANT8_ASYMM>();
}
TEST_P(GpuNnapiTest, Quant8AsymmSigned) {
    runTest<Type::TENSOR_QUANT8_ASYMM_SIGNED>();
}

INSTANTIATE_TEST_SUITE_P(TestGpuNnapi, GpuNnapiTest, testing::ValuesIn(getNnapiDevices()),
                         printGpuNnapiTest);

}  // namespace
}  // namespace android::nn