xref: /aosp_15_r20/external/angle/src/libANGLE/renderer/vulkan/CLCommandQueueVk.cpp (revision 8975f5c5ed3d1c378011245431ada316dfb6f244)
1 //
2 // Copyright 2021 The ANGLE Project Authors. All rights reserved.
3 // Use of this source code is governed by a BSD-style license that can be
4 // found in the LICENSE file.
5 //
6 // CLCommandQueueVk.cpp: Implements the class methods for CLCommandQueueVk.
7 
8 #include "common/PackedCLEnums_autogen.h"
9 #include "common/PackedEnums.h"
10 
11 #include "libANGLE/cl_types.h"
12 #include "libANGLE/renderer/vulkan/CLCommandQueueVk.h"
13 #include "libANGLE/renderer/vulkan/CLContextVk.h"
14 #include "libANGLE/renderer/vulkan/CLDeviceVk.h"
15 #include "libANGLE/renderer/vulkan/CLKernelVk.h"
16 #include "libANGLE/renderer/vulkan/CLMemoryVk.h"
17 #include "libANGLE/renderer/vulkan/CLProgramVk.h"
18 #include "libANGLE/renderer/vulkan/CLSamplerVk.h"
19 #include "libANGLE/renderer/vulkan/cl_types.h"
20 #include "libANGLE/renderer/vulkan/clspv_utils.h"
21 #include "libANGLE/renderer/vulkan/vk_cache_utils.h"
22 #include "libANGLE/renderer/vulkan/vk_cl_utils.h"
23 #include "libANGLE/renderer/vulkan/vk_renderer.h"
24 #include "libANGLE/renderer/vulkan/vk_wrapper.h"
25 
26 #include "libANGLE/CLBuffer.h"
27 #include "libANGLE/CLCommandQueue.h"
28 #include "libANGLE/CLContext.h"
29 #include "libANGLE/CLEvent.h"
30 #include "libANGLE/CLImage.h"
31 #include "libANGLE/CLKernel.h"
32 #include "libANGLE/CLSampler.h"
33 #include "libANGLE/cl_utils.h"
34 
35 #include "spirv/unified1/NonSemanticClspvReflection.h"
36 #include "vulkan/vulkan_core.h"
37 
38 namespace rx
39 {
40 
41 class CLAsyncFinishTask : public angle::Closure
42 {
43   public:
CLAsyncFinishTask(CLCommandQueueVk * queueVk)44     CLAsyncFinishTask(CLCommandQueueVk *queueVk) : mQueueVk(queueVk) {}
45 
operator ()()46     void operator()() override
47     {
48         ANGLE_TRACE_EVENT0("gpu.angle", "CLCommandQueueVk::finish (async)");
49         if (IsError(mQueueVk->finish()))
50         {
51             ERR() << "Async finish (clFlush) failed for queue (" << mQueueVk << ")!";
52         }
53     }
54 
55   private:
56     CLCommandQueueVk *mQueueVk;
57 };
58 
CLCommandQueueVk(const cl::CommandQueue & commandQueue)59 CLCommandQueueVk::CLCommandQueueVk(const cl::CommandQueue &commandQueue)
60     : CLCommandQueueImpl(commandQueue),
61       mContext(&commandQueue.getContext().getImpl<CLContextVk>()),
62       mDevice(&commandQueue.getDevice().getImpl<CLDeviceVk>()),
63       mPrintfBuffer(nullptr),
64       mComputePassCommands(nullptr),
65       mCurrentQueueSerialIndex(kInvalidQueueSerialIndex),
66       mHasAnyCommandsPendingSubmission(false),
67       mNeedPrintfHandling(false),
68       mPrintfInfos(nullptr)
69 {}
70 
init()71 angle::Result CLCommandQueueVk::init()
72 {
73     ANGLE_CL_IMPL_TRY_ERROR(
74         vk::OutsideRenderPassCommandBuffer::InitializeCommandPool(
75             mContext, &mCommandPool.outsideRenderPassPool,
76             mContext->getRenderer()->getQueueFamilyIndex(), getProtectionType()),
77         CL_OUT_OF_RESOURCES);
78 
79     ANGLE_CL_IMPL_TRY_ERROR(mContext->getRenderer()->getOutsideRenderPassCommandBufferHelper(
80                                 mContext, &mCommandPool.outsideRenderPassPool,
81                                 &mOutsideRenderPassCommandsAllocator, &mComputePassCommands),
82                             CL_OUT_OF_RESOURCES);
83 
84     // Generate initial QueueSerial for command buffer helper
85     ANGLE_CL_IMPL_TRY_ERROR(
86         mContext->getRenderer()->allocateQueueSerialIndex(&mCurrentQueueSerialIndex),
87         CL_OUT_OF_RESOURCES);
88     mComputePassCommands->setQueueSerial(
89         mCurrentQueueSerialIndex,
90         mContext->getRenderer()->generateQueueSerial(mCurrentQueueSerialIndex));
91 
92     // Initialize serials to be valid but appear submitted and finished.
93     mLastFlushedQueueSerial   = QueueSerial(mCurrentQueueSerialIndex, Serial());
94     mLastSubmittedQueueSerial = mLastFlushedQueueSerial;
95 
96     return angle::Result::Continue;
97 }
98 
~CLCommandQueueVk()99 CLCommandQueueVk::~CLCommandQueueVk()
100 {
101     ASSERT(mComputePassCommands->empty());
102     ASSERT(!mNeedPrintfHandling);
103 
104     if (mPrintfBuffer)
105     {
106         mPrintfBuffer->release();
107     }
108 
109     VkDevice vkDevice = mContext->getDevice();
110 
111     if (mCurrentQueueSerialIndex != kInvalidQueueSerialIndex)
112     {
113         mContext->getRenderer()->releaseQueueSerialIndex(mCurrentQueueSerialIndex);
114         mCurrentQueueSerialIndex = kInvalidQueueSerialIndex;
115     }
116 
117     // Recycle the current command buffers
118     mContext->getRenderer()->recycleOutsideRenderPassCommandBufferHelper(&mComputePassCommands);
119     mCommandPool.outsideRenderPassPool.destroy(vkDevice);
120 }
121 
setProperty(cl::CommandQueueProperties properties,cl_bool enable)122 angle::Result CLCommandQueueVk::setProperty(cl::CommandQueueProperties properties, cl_bool enable)
123 {
124     // NOTE: "clSetCommandQueueProperty" has been deprecated as of OpenCL 1.1
125     // http://man.opencl.org/deprecated.html
126     return angle::Result::Continue;
127 }
128 
enqueueReadBuffer(const cl::Buffer & buffer,bool blocking,size_t offset,size_t size,void * ptr,const cl::EventPtrs & waitEvents,CLEventImpl::CreateFunc * eventCreateFunc)129 angle::Result CLCommandQueueVk::enqueueReadBuffer(const cl::Buffer &buffer,
130                                                   bool blocking,
131                                                   size_t offset,
132                                                   size_t size,
133                                                   void *ptr,
134                                                   const cl::EventPtrs &waitEvents,
135                                                   CLEventImpl::CreateFunc *eventCreateFunc)
136 {
137     std::scoped_lock<std::mutex> sl(mCommandQueueMutex);
138 
139     ANGLE_TRY(processWaitlist(waitEvents));
140     CLBufferVk *bufferVk = &buffer.getImpl<CLBufferVk>();
141 
142     if (blocking)
143     {
144         ANGLE_TRY(finishInternal());
145         ANGLE_TRY(bufferVk->copyTo(ptr, offset, size));
146 
147         ANGLE_TRY(createEvent(eventCreateFunc, cl::ExecutionStatus::Complete));
148     }
149     else
150     {
151         // Stage a transfer routine
152         HostTransferConfig transferConfig;
153         transferConfig.type       = CL_COMMAND_READ_BUFFER;
154         transferConfig.offset     = offset;
155         transferConfig.size       = size;
156         transferConfig.dstHostPtr = ptr;
157         ANGLE_TRY(addToHostTransferList(bufferVk, transferConfig));
158 
159         ANGLE_TRY(createEvent(eventCreateFunc, cl::ExecutionStatus::Queued));
160     }
161 
162     return angle::Result::Continue;
163 }
164 
enqueueWriteBuffer(const cl::Buffer & buffer,bool blocking,size_t offset,size_t size,const void * ptr,const cl::EventPtrs & waitEvents,CLEventImpl::CreateFunc * eventCreateFunc)165 angle::Result CLCommandQueueVk::enqueueWriteBuffer(const cl::Buffer &buffer,
166                                                    bool blocking,
167                                                    size_t offset,
168                                                    size_t size,
169                                                    const void *ptr,
170                                                    const cl::EventPtrs &waitEvents,
171                                                    CLEventImpl::CreateFunc *eventCreateFunc)
172 {
173     std::scoped_lock<std::mutex> sl(mCommandQueueMutex);
174 
175     ANGLE_TRY(processWaitlist(waitEvents));
176 
177     auto bufferVk = &buffer.getImpl<CLBufferVk>();
178 
179     if (blocking)
180     {
181         ANGLE_TRY(finishInternal());
182         ANGLE_TRY(bufferVk->copyFrom(ptr, offset, size));
183     }
184     else
185     {
186         // Stage a transfer routine
187         HostTransferConfig config;
188         config.type       = CL_COMMAND_WRITE_BUFFER;
189         config.offset     = offset;
190         config.size       = size;
191         config.srcHostPtr = ptr;
192         ANGLE_TRY(addToHostTransferList(bufferVk, config));
193     }
194 
195     ANGLE_TRY(createEvent(eventCreateFunc, cl::ExecutionStatus::Complete));
196 
197     return angle::Result::Continue;
198 }
199 
enqueueReadBufferRect(const cl::Buffer & buffer,bool blocking,const cl::MemOffsets & bufferOrigin,const cl::MemOffsets & hostOrigin,const cl::Coordinate & region,size_t bufferRowPitch,size_t bufferSlicePitch,size_t hostRowPitch,size_t hostSlicePitch,void * ptr,const cl::EventPtrs & waitEvents,CLEventImpl::CreateFunc * eventCreateFunc)200 angle::Result CLCommandQueueVk::enqueueReadBufferRect(const cl::Buffer &buffer,
201                                                       bool blocking,
202                                                       const cl::MemOffsets &bufferOrigin,
203                                                       const cl::MemOffsets &hostOrigin,
204                                                       const cl::Coordinate &region,
205                                                       size_t bufferRowPitch,
206                                                       size_t bufferSlicePitch,
207                                                       size_t hostRowPitch,
208                                                       size_t hostSlicePitch,
209                                                       void *ptr,
210                                                       const cl::EventPtrs &waitEvents,
211                                                       CLEventImpl::CreateFunc *eventCreateFunc)
212 {
213     std::scoped_lock<std::mutex> sl(mCommandQueueMutex);
214 
215     ANGLE_TRY(processWaitlist(waitEvents));
216     auto bufferVk = &buffer.getImpl<CLBufferVk>();
217 
218     ANGLE_TRY(finishInternal());
219 
220     cl::BufferRect bufferRect{cl::Offset{bufferOrigin.x, bufferOrigin.y, bufferOrigin.z},
221                               cl::Extents{region.x, region.y, region.z}, bufferRowPitch,
222                               bufferSlicePitch, 1};
223 
224     cl::BufferRect ptrRect{cl::Offset{hostOrigin.x, hostOrigin.y, hostOrigin.z},
225                            cl::Extents{region.x, region.y, region.z}, hostRowPitch, hostSlicePitch,
226                            1};
227 
228     ANGLE_TRY(bufferVk->getRect(bufferRect, ptrRect, ptr));
229 
230     ANGLE_TRY(createEvent(eventCreateFunc, cl::ExecutionStatus::Complete));
231     return angle::Result::Continue;
232 }
233 
enqueueWriteBufferRect(const cl::Buffer & buffer,bool blocking,const cl::MemOffsets & bufferOrigin,const cl::MemOffsets & hostOrigin,const cl::Coordinate & region,size_t bufferRowPitch,size_t bufferSlicePitch,size_t hostRowPitch,size_t hostSlicePitch,const void * ptr,const cl::EventPtrs & waitEvents,CLEventImpl::CreateFunc * eventCreateFunc)234 angle::Result CLCommandQueueVk::enqueueWriteBufferRect(const cl::Buffer &buffer,
235                                                        bool blocking,
236                                                        const cl::MemOffsets &bufferOrigin,
237                                                        const cl::MemOffsets &hostOrigin,
238                                                        const cl::Coordinate &region,
239                                                        size_t bufferRowPitch,
240                                                        size_t bufferSlicePitch,
241                                                        size_t hostRowPitch,
242                                                        size_t hostSlicePitch,
243                                                        const void *ptr,
244                                                        const cl::EventPtrs &waitEvents,
245                                                        CLEventImpl::CreateFunc *eventCreateFunc)
246 {
247     std::scoped_lock<std::mutex> sl(mCommandQueueMutex);
248 
249     ANGLE_TRY(processWaitlist(waitEvents));
250     auto bufferVk = &buffer.getImpl<CLBufferVk>();
251 
252     ANGLE_TRY(finishInternal());
253 
254     cl::BufferRect bufferRect{cl::Offset{bufferOrigin.x, bufferOrigin.y, bufferOrigin.z},
255                               cl::Extents{region.x, region.y, region.z}, bufferRowPitch,
256                               bufferSlicePitch, 1};
257 
258     cl::BufferRect ptrRect{cl::Offset{hostOrigin.x, hostOrigin.y, hostOrigin.z},
259                            cl::Extents{region.x, region.y, region.z}, hostRowPitch, hostSlicePitch,
260                            1};
261 
262     ANGLE_TRY(bufferVk->setRect(ptr, ptrRect, bufferRect));
263 
264     ANGLE_TRY(createEvent(eventCreateFunc, cl::ExecutionStatus::Complete));
265     return angle::Result::Continue;
266 }
267 
enqueueCopyBuffer(const cl::Buffer & srcBuffer,const cl::Buffer & dstBuffer,size_t srcOffset,size_t dstOffset,size_t size,const cl::EventPtrs & waitEvents,CLEventImpl::CreateFunc * eventCreateFunc)268 angle::Result CLCommandQueueVk::enqueueCopyBuffer(const cl::Buffer &srcBuffer,
269                                                   const cl::Buffer &dstBuffer,
270                                                   size_t srcOffset,
271                                                   size_t dstOffset,
272                                                   size_t size,
273                                                   const cl::EventPtrs &waitEvents,
274                                                   CLEventImpl::CreateFunc *eventCreateFunc)
275 {
276     std::scoped_lock<std::mutex> sl(mCommandQueueMutex);
277 
278     ANGLE_TRY(processWaitlist(waitEvents));
279 
280     CLBufferVk *srcBufferVk = &srcBuffer.getImpl<CLBufferVk>();
281     CLBufferVk *dstBufferVk = &dstBuffer.getImpl<CLBufferVk>();
282 
283     vk::CommandBufferAccess access;
284     if (srcBufferVk->isSubBuffer() && dstBufferVk->isSubBuffer() &&
285         (srcBufferVk->getParent() == dstBufferVk->getParent()))
286     {
287         // this is a self copy
288         access.onBufferSelfCopy(&srcBufferVk->getBuffer());
289     }
290     else
291     {
292         access.onBufferTransferRead(&srcBufferVk->getBuffer());
293         access.onBufferTransferWrite(&dstBufferVk->getBuffer());
294     }
295 
296     vk::OutsideRenderPassCommandBuffer *commandBuffer;
297     ANGLE_TRY(getCommandBuffer(access, &commandBuffer));
298 
299     VkBufferCopy copyRegion = {srcOffset, dstOffset, size};
300     // update the offset in the case of sub-buffers
301     if (srcBufferVk->getOffset())
302     {
303         copyRegion.srcOffset += srcBufferVk->getOffset();
304     }
305     if (dstBufferVk->getOffset())
306     {
307         copyRegion.dstOffset += dstBufferVk->getOffset();
308     }
309     commandBuffer->copyBuffer(srcBufferVk->getBuffer().getBuffer(),
310                               dstBufferVk->getBuffer().getBuffer(), 1, &copyRegion);
311 
312     ANGLE_TRY(createEvent(eventCreateFunc, cl::ExecutionStatus::Queued));
313 
314     return angle::Result::Continue;
315 }
316 
enqueueCopyBufferRect(const cl::Buffer & srcBuffer,const cl::Buffer & dstBuffer,const cl::MemOffsets & srcOrigin,const cl::MemOffsets & dstOrigin,const cl::Coordinate & region,size_t srcRowPitch,size_t srcSlicePitch,size_t dstRowPitch,size_t dstSlicePitch,const cl::EventPtrs & waitEvents,CLEventImpl::CreateFunc * eventCreateFunc)317 angle::Result CLCommandQueueVk::enqueueCopyBufferRect(const cl::Buffer &srcBuffer,
318                                                       const cl::Buffer &dstBuffer,
319                                                       const cl::MemOffsets &srcOrigin,
320                                                       const cl::MemOffsets &dstOrigin,
321                                                       const cl::Coordinate &region,
322                                                       size_t srcRowPitch,
323                                                       size_t srcSlicePitch,
324                                                       size_t dstRowPitch,
325                                                       size_t dstSlicePitch,
326                                                       const cl::EventPtrs &waitEvents,
327                                                       CLEventImpl::CreateFunc *eventCreateFunc)
328 {
329     std::scoped_lock<std::mutex> sl(mCommandQueueMutex);
330     ANGLE_TRY(processWaitlist(waitEvents));
331     ANGLE_TRY(finishInternal());
332 
333     cl::BufferRect srcRect{cl::Offset{srcOrigin.x, srcOrigin.y, srcOrigin.z},
334                            cl::Extents{region.x, region.y, region.z}, srcRowPitch, srcSlicePitch,
335                            1};
336 
337     cl::BufferRect dstRect{cl::Offset{dstOrigin.x, dstOrigin.y, dstOrigin.z},
338                            cl::Extents{region.x, region.y, region.z}, dstRowPitch, dstSlicePitch,
339                            1};
340 
341     auto srcBufferVk    = &srcBuffer.getImpl<CLBufferVk>();
342     auto dstBufferVk    = &dstBuffer.getImpl<CLBufferVk>();
343     uint8_t *mapPointer = nullptr;
344     ANGLE_TRY(srcBufferVk->map(mapPointer));
345     ASSERT(mapPointer);
346     ANGLE_TRY(dstBufferVk->setRect(static_cast<const void *>(mapPointer), srcRect, dstRect));
347 
348     ANGLE_TRY(createEvent(eventCreateFunc, cl::ExecutionStatus::Complete));
349     return angle::Result::Continue;
350 }
351 
enqueueFillBuffer(const cl::Buffer & buffer,const void * pattern,size_t patternSize,size_t offset,size_t size,const cl::EventPtrs & waitEvents,CLEventImpl::CreateFunc * eventCreateFunc)352 angle::Result CLCommandQueueVk::enqueueFillBuffer(const cl::Buffer &buffer,
353                                                   const void *pattern,
354                                                   size_t patternSize,
355                                                   size_t offset,
356                                                   size_t size,
357                                                   const cl::EventPtrs &waitEvents,
358                                                   CLEventImpl::CreateFunc *eventCreateFunc)
359 {
360     std::scoped_lock<std::mutex> sl(mCommandQueueMutex);
361 
362     ANGLE_TRY(processWaitlist(waitEvents));
363 
364     CLBufferVk *bufferVk = &buffer.getImpl<CLBufferVk>();
365     if (mComputePassCommands->usesBuffer(bufferVk->getBuffer()))
366     {
367         ANGLE_TRY(finishInternal());
368     }
369 
370     ANGLE_TRY(bufferVk->fillWithPattern(pattern, patternSize, offset, size));
371 
372     ANGLE_TRY(createEvent(eventCreateFunc, cl::ExecutionStatus::Complete));
373 
374     return angle::Result::Continue;
375 }
376 
enqueueMapBuffer(const cl::Buffer & buffer,bool blocking,cl::MapFlags mapFlags,size_t offset,size_t size,const cl::EventPtrs & waitEvents,CLEventImpl::CreateFunc * eventCreateFunc,void * & mapPtr)377 angle::Result CLCommandQueueVk::enqueueMapBuffer(const cl::Buffer &buffer,
378                                                  bool blocking,
379                                                  cl::MapFlags mapFlags,
380                                                  size_t offset,
381                                                  size_t size,
382                                                  const cl::EventPtrs &waitEvents,
383                                                  CLEventImpl::CreateFunc *eventCreateFunc,
384                                                  void *&mapPtr)
385 {
386     std::scoped_lock<std::mutex> sl(mCommandQueueMutex);
387 
388     ANGLE_TRY(processWaitlist(waitEvents));
389 
390     cl::ExecutionStatus eventComplete = cl::ExecutionStatus::Queued;
391     if (blocking || !eventCreateFunc)
392     {
393         ANGLE_TRY(finishInternal());
394         eventComplete = cl::ExecutionStatus::Complete;
395     }
396 
397     CLBufferVk *bufferVk = &buffer.getImpl<CLBufferVk>();
398     uint8_t *mapPointer  = nullptr;
399     if (buffer.getFlags().intersects(CL_MEM_USE_HOST_PTR))
400     {
401         ANGLE_TRY(finishInternal());
402         mapPointer = static_cast<uint8_t *>(buffer.getHostPtr()) + offset;
403         ANGLE_TRY(bufferVk->copyTo(mapPointer, offset, size));
404         eventComplete = cl::ExecutionStatus::Complete;
405     }
406     else
407     {
408         ANGLE_TRY(bufferVk->map(mapPointer, offset));
409     }
410     mapPtr = static_cast<void *>(mapPointer);
411 
412     if (bufferVk->isCurrentlyInUse())
413     {
414         eventComplete = cl::ExecutionStatus::Queued;
415     }
416     ANGLE_TRY(createEvent(eventCreateFunc, eventComplete));
417 
418     return angle::Result::Continue;
419 }
420 
copyImageToFromBuffer(CLImageVk & imageVk,vk::BufferHelper & buffer,const cl::MemOffsets & origin,const cl::Coordinate & region,size_t bufferOffset,ImageBufferCopyDirection direction)421 angle::Result CLCommandQueueVk::copyImageToFromBuffer(CLImageVk &imageVk,
422                                                       vk::BufferHelper &buffer,
423                                                       const cl::MemOffsets &origin,
424                                                       const cl::Coordinate &region,
425                                                       size_t bufferOffset,
426                                                       ImageBufferCopyDirection direction)
427 {
428     vk::CommandBufferAccess access;
429     vk::OutsideRenderPassCommandBuffer *commandBuffer;
430     VkImageAspectFlags aspectFlags = imageVk.getImage().getAspectFlags();
431     if (direction == ImageBufferCopyDirection::ToBuffer)
432     {
433         access.onImageTransferRead(aspectFlags, &imageVk.getImage());
434         access.onBufferTransferWrite(&buffer);
435     }
436     else
437     {
438         access.onImageTransferWrite(gl::LevelIndex(0), 1, 0,
439                                     static_cast<uint32_t>(imageVk.getArraySize()), aspectFlags,
440                                     &imageVk.getImage());
441         access.onBufferTransferRead(&buffer);
442     }
443     ANGLE_TRY(getCommandBuffer(access, &commandBuffer));
444 
445     VkBufferImageCopy copyRegion = {};
446     copyRegion.bufferOffset      = bufferOffset;
447     copyRegion.bufferRowLength   = 0;
448     copyRegion.bufferImageHeight = 0;
449     copyRegion.imageExtent       = cl_vk::GetExtent(imageVk.getExtentForCopy(region));
450     copyRegion.imageOffset       = cl_vk::GetOffset(imageVk.getOffsetForCopy(origin));
451     copyRegion.imageSubresource  = imageVk.getSubresourceLayersForCopy(
452         origin, region, imageVk.getType(), ImageCopyWith::Buffer);
453     if (imageVk.isWritable())
454     {
455         // We need an execution barrier if image can be written to by kernel
456         ANGLE_TRY(insertBarrier());
457     }
458 
459     if (direction == ImageBufferCopyDirection::ToBuffer)
460     {
461         commandBuffer->copyImageToBuffer(imageVk.getImage().getImage(),
462                                          VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
463                                          buffer.getBuffer().getHandle(), 1, &copyRegion);
464     }
465     else
466     {
467         commandBuffer->copyBufferToImage(buffer.getBuffer().getHandle(),
468                                          imageVk.getImage().getImage(),
469                                          VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, 1, &copyRegion);
470     }
471 
472     return angle::Result::Continue;
473 }
474 
addToHostTransferList(CLBufferVk * srcBuffer,CLCommandQueueVk::HostTransferConfig transferConfig)475 angle::Result CLCommandQueueVk::addToHostTransferList(
476     CLBufferVk *srcBuffer,
477     CLCommandQueueVk::HostTransferConfig transferConfig)
478 {
479     // TODO(aannestrand): Flush here if we reach some max-transfer-buffer heuristic
480     // http://anglebug.com/377545840
481 
482     cl::Memory *transferBufferHandle =
483         cl::Buffer::Cast(this->mContext->getFrontendObject().createBuffer(
484             nullptr, cl::MemFlags{CL_MEM_READ_WRITE}, srcBuffer->getSize(), nullptr));
485     if (transferBufferHandle == nullptr)
486     {
487         ANGLE_CL_RETURN_ERROR(CL_OUT_OF_RESOURCES);
488     }
489     HostTransferEntry transferEntry{transferConfig, cl::MemoryPtr{transferBufferHandle}};
490     mHostTransferList.emplace_back(transferEntry);
491 
492     // Release initialization reference, lifetime controlled by RefPointer.
493     mHostTransferList.back().transferBufferHandle->release();
494 
495     // We need an execution barrier if buffer can be written to by kernel
496     if (!mComputePassCommands->getCommandBuffer().empty() && srcBuffer->isWritable())
497     {
498         // TODO(aannestrand): Look into combining these kernel execution barriers
499         // http://anglebug.com/377545840
500         VkMemoryBarrier memoryBarrier = {VK_STRUCTURE_TYPE_MEMORY_BARRIER, nullptr,
501                                          VK_ACCESS_SHADER_WRITE_BIT,
502                                          VK_ACCESS_MEMORY_READ_BIT | VK_ACCESS_MEMORY_WRITE_BIT};
503         mComputePassCommands->getCommandBuffer().pipelineBarrier(
504             VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, 1,
505             &memoryBarrier, 0, nullptr, 0, nullptr);
506     }
507 
508     // Enqueue blit/transfer cmd
509     VkPipelineStageFlags srcStageMask = {};
510     VkPipelineStageFlags dstStageMask = {};
511     VkMemoryBarrier memBarrier        = {};
512     memBarrier.sType                  = VK_STRUCTURE_TYPE_MEMORY_BARRIER;
513     CLBufferVk &transferBufferHandleVk =
514         mHostTransferList.back().transferBufferHandle->getImpl<CLBufferVk>();
515     switch (transferConfig.type)
516     {
517         case CL_COMMAND_WRITE_BUFFER:
518         {
519             VkBufferCopy copyRegion = {transferConfig.offset, transferConfig.offset,
520                                        transferConfig.size};
521             ANGLE_TRY(transferBufferHandleVk.copyFrom(transferConfig.srcHostPtr,
522                                                       transferConfig.offset, transferConfig.size));
523             copyRegion.srcOffset += transferBufferHandleVk.getOffset();
524             copyRegion.dstOffset += srcBuffer->getOffset();
525             mComputePassCommands->getCommandBuffer().copyBuffer(
526                 transferBufferHandleVk.getBuffer().getBuffer(), srcBuffer->getBuffer().getBuffer(),
527                 1, &copyRegion);
528 
529             srcStageMask             = VK_PIPELINE_STAGE_TRANSFER_BIT;
530             dstStageMask             = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT;
531             memBarrier.srcAccessMask = VK_ACCESS_MEMORY_WRITE_BIT;
532             memBarrier.dstAccessMask = VK_ACCESS_MEMORY_READ_BIT;
533             break;
534         }
535         case CL_COMMAND_READ_BUFFER:
536         {
537             VkBufferCopy copyRegion = {transferConfig.offset, transferConfig.offset,
538                                        transferConfig.size};
539             copyRegion.srcOffset += srcBuffer->getOffset();
540             copyRegion.dstOffset += transferBufferHandleVk.getOffset();
541             mComputePassCommands->getCommandBuffer().copyBuffer(
542                 srcBuffer->getBuffer().getBuffer(), transferBufferHandleVk.getBuffer().getBuffer(),
543                 1, &copyRegion);
544 
545             srcStageMask             = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT;
546             dstStageMask             = VK_PIPELINE_STAGE_HOST_BIT;
547             memBarrier.srcAccessMask = VK_ACCESS_MEMORY_WRITE_BIT;
548             memBarrier.dstAccessMask = VK_ACCESS_MEMORY_READ_BIT;
549             break;
550         }
551         default:
552             UNIMPLEMENTED();
553             break;
554     }
555 
556     // TODO(aannestrand): Look into combining these transfer barriers
557     // http://anglebug.com/377545840
558     mComputePassCommands->getCommandBuffer().pipelineBarrier(srcStageMask, dstStageMask, 0, 1,
559                                                              &memBarrier, 0, nullptr, 0, nullptr);
560 
561     return angle::Result::Continue;
562 }
563 
addToHostTransferList(CLImageVk * srcImage,CLCommandQueueVk::HostTransferConfig transferConfig)564 angle::Result CLCommandQueueVk::addToHostTransferList(
565     CLImageVk *srcImage,
566     CLCommandQueueVk::HostTransferConfig transferConfig)
567 {
568     // TODO(aannestrand): Flush here if we reach some max-transfer-buffer heuristic
569     // http://anglebug.com/377545840
570 
571     cl::Memory *transferBufferHandle =
572         cl::Buffer::Cast(this->mContext->getFrontendObject().createBuffer(
573             nullptr, cl::MemFlags{CL_MEM_READ_WRITE}, srcImage->getSize(), nullptr));
574     if (transferBufferHandle == nullptr)
575     {
576         ANGLE_CL_RETURN_ERROR(CL_OUT_OF_RESOURCES);
577     }
578 
579     HostTransferEntry transferEntry{transferConfig, cl::MemoryPtr{transferBufferHandle}};
580     mHostTransferList.emplace_back(transferEntry);
581 
582     // Release initialization reference, lifetime controlled by RefPointer.
583     mHostTransferList.back().transferBufferHandle->release();
584 
585     // Enqueue blit
586     CLBufferVk &transferBufferHandleVk =
587         mHostTransferList.back().transferBufferHandle->getImpl<CLBufferVk>();
588     ANGLE_TRY(copyImageToFromBuffer(*srcImage, transferBufferHandleVk.getBuffer(),
589                                     transferConfig.origin, transferConfig.region, 0,
590                                     ImageBufferCopyDirection::ToBuffer));
591 
592     return angle::Result::Continue;
593 }
594 
enqueueReadImage(const cl::Image & image,bool blocking,const cl::MemOffsets & origin,const cl::Coordinate & region,size_t rowPitch,size_t slicePitch,void * ptr,const cl::EventPtrs & waitEvents,CLEventImpl::CreateFunc * eventCreateFunc)595 angle::Result CLCommandQueueVk::enqueueReadImage(const cl::Image &image,
596                                                  bool blocking,
597                                                  const cl::MemOffsets &origin,
598                                                  const cl::Coordinate &region,
599                                                  size_t rowPitch,
600                                                  size_t slicePitch,
601                                                  void *ptr,
602                                                  const cl::EventPtrs &waitEvents,
603                                                  CLEventImpl::CreateFunc *eventCreateFunc)
604 {
605     std::scoped_lock<std::mutex> sl(mCommandQueueMutex);
606     CLImageVk &imageVk = image.getImpl<CLImageVk>();
607     size_t size        = (region.x * region.y * region.z * imageVk.getElementSize());
608 
609     ANGLE_TRY(processWaitlist(waitEvents));
610 
611     if (imageVk.isStagingBufferInitialized() == false)
612     {
613         ANGLE_TRY(imageVk.createStagingBuffer(imageVk.getSize()));
614     }
615 
616     if (blocking)
617     {
618         ANGLE_TRY(copyImageToFromBuffer(imageVk, imageVk.getStagingBuffer(), origin, region, 0,
619                                         ImageBufferCopyDirection::ToBuffer));
620         ANGLE_TRY(finishInternal());
621         if (rowPitch == 0 && slicePitch == 0)
622         {
623             ANGLE_TRY(imageVk.copyStagingTo(ptr, 0, size));
624         }
625         else
626         {
627             ANGLE_TRY(imageVk.copyStagingToFromWithPitch(ptr, region, rowPitch, slicePitch,
628                                                          StagingBufferCopyDirection::ToHost));
629         }
630         ANGLE_TRY(createEvent(eventCreateFunc, cl::ExecutionStatus::Complete));
631     }
632     else
633     {
634         // Create a transfer buffer and push it in update list
635         HostTransferConfig transferConfig;
636         transferConfig.type       = CL_COMMAND_READ_IMAGE;
637         transferConfig.size       = size;
638         transferConfig.dstHostPtr = ptr;
639         transferConfig.origin     = origin;
640         transferConfig.region     = region;
641         ANGLE_TRY(addToHostTransferList(&imageVk, transferConfig));
642 
643         ANGLE_TRY(createEvent(eventCreateFunc, cl::ExecutionStatus::Queued));
644     }
645 
646     return angle::Result::Continue;
647 }
648 
enqueueWriteImage(const cl::Image & image,bool blocking,const cl::MemOffsets & origin,const cl::Coordinate & region,size_t inputRowPitch,size_t inputSlicePitch,const void * ptr,const cl::EventPtrs & waitEvents,CLEventImpl::CreateFunc * eventCreateFunc)649 angle::Result CLCommandQueueVk::enqueueWriteImage(const cl::Image &image,
650                                                   bool blocking,
651                                                   const cl::MemOffsets &origin,
652                                                   const cl::Coordinate &region,
653                                                   size_t inputRowPitch,
654                                                   size_t inputSlicePitch,
655                                                   const void *ptr,
656                                                   const cl::EventPtrs &waitEvents,
657                                                   CLEventImpl::CreateFunc *eventCreateFunc)
658 {
659     std::scoped_lock<std::mutex> sl(mCommandQueueMutex);
660     ANGLE_TRY(processWaitlist(waitEvents));
661 
662     CLImageVk &imageVk = image.getImpl<CLImageVk>();
663     size_t size        = (region.x * region.y * region.z * imageVk.getElementSize());
664     cl::ExecutionStatus eventInitialState = cl::ExecutionStatus::Queued;
665     if (imageVk.isStagingBufferInitialized() == false)
666     {
667         ANGLE_TRY(imageVk.createStagingBuffer(imageVk.getSize()));
668     }
669 
670     if (inputRowPitch == 0 && inputSlicePitch == 0)
671     {
672         ANGLE_TRY(imageVk.copyStagingFrom((void *)ptr, 0, size));
673     }
674     else
675     {
676         ANGLE_TRY(imageVk.copyStagingToFromWithPitch((void *)ptr, region, inputRowPitch,
677                                                      inputSlicePitch,
678                                                      StagingBufferCopyDirection::ToStagingBuffer));
679     }
680 
681     ANGLE_TRY(copyImageToFromBuffer(imageVk, imageVk.getStagingBuffer(), origin, region, 0,
682                                     ImageBufferCopyDirection::ToImage));
683 
684     if (blocking)
685     {
686         ANGLE_TRY(finishInternal());
687         eventInitialState = cl::ExecutionStatus::Complete;
688     }
689 
690     ANGLE_TRY(createEvent(eventCreateFunc, eventInitialState));
691 
692     return angle::Result::Continue;
693 }
694 
enqueueCopyImage(const cl::Image & srcImage,const cl::Image & dstImage,const cl::MemOffsets & srcOrigin,const cl::MemOffsets & dstOrigin,const cl::Coordinate & region,const cl::EventPtrs & waitEvents,CLEventImpl::CreateFunc * eventCreateFunc)695 angle::Result CLCommandQueueVk::enqueueCopyImage(const cl::Image &srcImage,
696                                                  const cl::Image &dstImage,
697                                                  const cl::MemOffsets &srcOrigin,
698                                                  const cl::MemOffsets &dstOrigin,
699                                                  const cl::Coordinate &region,
700                                                  const cl::EventPtrs &waitEvents,
701                                                  CLEventImpl::CreateFunc *eventCreateFunc)
702 {
703     std::scoped_lock<std::mutex> sl(mCommandQueueMutex);
704     ANGLE_TRY(processWaitlist(waitEvents));
705 
706     auto srcImageVk = &srcImage.getImpl<CLImageVk>();
707     auto dstImageVk = &dstImage.getImpl<CLImageVk>();
708 
709     vk::CommandBufferAccess access;
710     vk::OutsideRenderPassCommandBuffer *commandBuffer;
711     VkImageAspectFlags dstAspectFlags = srcImageVk->getImage().getAspectFlags();
712     VkImageAspectFlags srcAspectFlags = dstImageVk->getImage().getAspectFlags();
713     access.onImageTransferWrite(gl::LevelIndex(0), 1, 0, 1, dstAspectFlags,
714                                 &dstImageVk->getImage());
715     access.onImageTransferRead(srcAspectFlags, &srcImageVk->getImage());
716     ANGLE_TRY(getCommandBuffer(access, &commandBuffer));
717 
718     VkImageCopy copyRegion    = {};
719     copyRegion.extent         = cl_vk::GetExtent(srcImageVk->getExtentForCopy(region));
720     copyRegion.srcOffset      = cl_vk::GetOffset(srcImageVk->getOffsetForCopy(srcOrigin));
721     copyRegion.dstOffset      = cl_vk::GetOffset(dstImageVk->getOffsetForCopy(dstOrigin));
722     copyRegion.srcSubresource = srcImageVk->getSubresourceLayersForCopy(
723         srcOrigin, region, dstImageVk->getType(), ImageCopyWith::Image);
724     copyRegion.dstSubresource = dstImageVk->getSubresourceLayersForCopy(
725         dstOrigin, region, srcImageVk->getType(), ImageCopyWith::Image);
726     if (srcImageVk->isWritable() || dstImageVk->isWritable())
727     {
728         // We need an execution barrier if buffer can be written to by kernel
729         ANGLE_TRY(insertBarrier());
730     }
731 
732     commandBuffer->copyImage(
733         srcImageVk->getImage().getImage(), VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
734         dstImageVk->getImage().getImage(), VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, 1, &copyRegion);
735 
736     ANGLE_TRY(createEvent(eventCreateFunc, cl::ExecutionStatus::Queued));
737 
738     return angle::Result::Continue;
739 }
740 
enqueueFillImage(const cl::Image & image,const void * fillColor,const cl::MemOffsets & origin,const cl::Coordinate & region,const cl::EventPtrs & waitEvents,CLEventImpl::CreateFunc * eventCreateFunc)741 angle::Result CLCommandQueueVk::enqueueFillImage(const cl::Image &image,
742                                                  const void *fillColor,
743                                                  const cl::MemOffsets &origin,
744                                                  const cl::Coordinate &region,
745                                                  const cl::EventPtrs &waitEvents,
746                                                  CLEventImpl::CreateFunc *eventCreateFunc)
747 {
748     CLImageVk &imageVk = image.getImpl<CLImageVk>();
749     PixelColor packedColor;
750     cl::Extents extent = imageVk.getImageExtent();
751 
752     imageVk.packPixels(fillColor, &packedColor);
753 
754     ANGLE_TRY(enqueueWaitForEvents(waitEvents));
755 
756     if (imageVk.isStagingBufferInitialized() == false)
757     {
758         ANGLE_TRY(imageVk.createStagingBuffer(imageVk.getSize()));
759     }
760 
761     ANGLE_TRY(copyImageToFromBuffer(imageVk, imageVk.getStagingBuffer(), cl::kMemOffsetsZero,
762                                     {extent.width, extent.height, extent.depth}, 0,
763                                     ImageBufferCopyDirection::ToBuffer));
764     ANGLE_TRY(finishInternal());
765 
766     uint8_t *mapPointer = nullptr;
767     ANGLE_TRY(imageVk.map(mapPointer, 0));
768     imageVk.fillImageWithColor(origin, region, mapPointer, &packedColor);
769     imageVk.unmap();
770     mapPointer = nullptr;
771     ANGLE_TRY(copyImageToFromBuffer(imageVk, imageVk.getStagingBuffer(), cl::kMemOffsetsZero,
772                                     {extent.width, extent.height, extent.depth}, 0,
773                                     ImageBufferCopyDirection::ToImage));
774 
775     ANGLE_TRY(createEvent(eventCreateFunc, cl::ExecutionStatus::Queued));
776 
777     return angle::Result::Continue;
778 }
779 
enqueueCopyImageToBuffer(const cl::Image & srcImage,const cl::Buffer & dstBuffer,const cl::MemOffsets & srcOrigin,const cl::Coordinate & region,size_t dstOffset,const cl::EventPtrs & waitEvents,CLEventImpl::CreateFunc * eventCreateFunc)780 angle::Result CLCommandQueueVk::enqueueCopyImageToBuffer(const cl::Image &srcImage,
781                                                          const cl::Buffer &dstBuffer,
782                                                          const cl::MemOffsets &srcOrigin,
783                                                          const cl::Coordinate &region,
784                                                          size_t dstOffset,
785                                                          const cl::EventPtrs &waitEvents,
786                                                          CLEventImpl::CreateFunc *eventCreateFunc)
787 {
788     std::scoped_lock<std::mutex> sl(mCommandQueueMutex);
789     CLImageVk &srcImageVk   = srcImage.getImpl<CLImageVk>();
790     CLBufferVk &dstBufferVk = dstBuffer.getImpl<CLBufferVk>();
791 
792     ANGLE_TRY(processWaitlist(waitEvents));
793 
794     ANGLE_TRY(copyImageToFromBuffer(srcImageVk, dstBufferVk.getBuffer(), srcOrigin, region,
795                                     dstOffset, ImageBufferCopyDirection::ToBuffer));
796 
797     ANGLE_TRY(createEvent(eventCreateFunc, cl::ExecutionStatus::Queued));
798 
799     return angle::Result::Continue;
800 }
801 
enqueueCopyBufferToImage(const cl::Buffer & srcBuffer,const cl::Image & dstImage,size_t srcOffset,const cl::MemOffsets & dstOrigin,const cl::Coordinate & region,const cl::EventPtrs & waitEvents,CLEventImpl::CreateFunc * eventCreateFunc)802 angle::Result CLCommandQueueVk::enqueueCopyBufferToImage(const cl::Buffer &srcBuffer,
803                                                          const cl::Image &dstImage,
804                                                          size_t srcOffset,
805                                                          const cl::MemOffsets &dstOrigin,
806                                                          const cl::Coordinate &region,
807                                                          const cl::EventPtrs &waitEvents,
808                                                          CLEventImpl::CreateFunc *eventCreateFunc)
809 {
810     std::scoped_lock<std::mutex> sl(mCommandQueueMutex);
811     CLBufferVk &srcBufferVk = srcBuffer.getImpl<CLBufferVk>();
812     CLImageVk &dstImageVk   = dstImage.getImpl<CLImageVk>();
813 
814     ANGLE_TRY(processWaitlist(waitEvents));
815 
816     ANGLE_TRY(copyImageToFromBuffer(dstImageVk, srcBufferVk.getBuffer(), dstOrigin, region,
817                                     srcOffset, ImageBufferCopyDirection::ToImage));
818 
819     ANGLE_TRY(createEvent(eventCreateFunc, cl::ExecutionStatus::Queued));
820 
821     return angle::Result::Continue;
822 }
823 
enqueueMapImage(const cl::Image & image,bool blocking,cl::MapFlags mapFlags,const cl::MemOffsets & origin,const cl::Coordinate & region,size_t * imageRowPitch,size_t * imageSlicePitch,const cl::EventPtrs & waitEvents,CLEventImpl::CreateFunc * eventCreateFunc,void * & mapPtr)824 angle::Result CLCommandQueueVk::enqueueMapImage(const cl::Image &image,
825                                                 bool blocking,
826                                                 cl::MapFlags mapFlags,
827                                                 const cl::MemOffsets &origin,
828                                                 const cl::Coordinate &region,
829                                                 size_t *imageRowPitch,
830                                                 size_t *imageSlicePitch,
831                                                 const cl::EventPtrs &waitEvents,
832                                                 CLEventImpl::CreateFunc *eventCreateFunc,
833                                                 void *&mapPtr)
834 {
835     ANGLE_TRY(enqueueWaitForEvents(waitEvents));
836 
837     // TODO: Look into better enqueue handling of this map-op if non-blocking
838     // https://anglebug.com/376722715
839     CLImageVk *imageVk = &image.getImpl<CLImageVk>();
840     cl::Extents extent = imageVk->getImageExtent();
841     if (blocking)
842     {
843         ANGLE_TRY(finishInternal());
844     }
845 
846     mComputePassCommands->imageRead(mContext, imageVk->getImage().getAspectFlags(),
847                                     vk::ImageLayout::TransferSrc, &imageVk->getImage());
848 
849     if (imageVk->isStagingBufferInitialized() == false)
850     {
851         ANGLE_TRY(imageVk->createStagingBuffer(imageVk->getSize()));
852     }
853 
854     ANGLE_TRY(copyImageToFromBuffer(*imageVk, imageVk->getStagingBuffer(), cl::kMemOffsetsZero,
855                                     {extent.width, extent.height, extent.depth}, 0,
856                                     ImageBufferCopyDirection::ToBuffer));
857     ANGLE_TRY(finishInternal());
858 
859     uint8_t *mapPointer = nullptr;
860     size_t elementSize  = imageVk->getElementSize();
861     size_t rowPitch     = (extent.width * elementSize);
862     size_t offset =
863         (origin.x * elementSize) + (origin.y * rowPitch) + (origin.z * extent.height * rowPitch);
864     size_t size = (region.x * region.y * region.z * elementSize);
865 
866     if (image.getFlags().intersects(CL_MEM_USE_HOST_PTR))
867     {
868         mapPointer = static_cast<uint8_t *>(image.getHostPtr()) + offset;
869         ANGLE_TRY(imageVk->copyTo(mapPointer, offset, size));
870     }
871     else
872     {
873         ANGLE_TRY(imageVk->map(mapPointer, offset));
874     }
875     mapPtr = static_cast<void *>(mapPointer);
876 
877     *imageRowPitch = rowPitch;
878 
879     switch (imageVk->getDescriptor().type)
880     {
881         case cl::MemObjectType::Image1D:
882         case cl::MemObjectType::Image1D_Buffer:
883         case cl::MemObjectType::Image2D:
884             if (imageSlicePitch != nullptr)
885             {
886                 *imageSlicePitch = 0;
887             }
888             break;
889         case cl::MemObjectType::Image2D_Array:
890         case cl::MemObjectType::Image3D:
891             *imageSlicePitch = (extent.height * (*imageRowPitch));
892             break;
893         case cl::MemObjectType::Image1D_Array:
894             *imageSlicePitch = *imageRowPitch;
895             break;
896         default:
897             UNREACHABLE();
898             break;
899     }
900 
901     ANGLE_TRY(createEvent(eventCreateFunc, cl::ExecutionStatus::Complete));
902 
903     return angle::Result::Continue;
904 }
905 
enqueueUnmapMemObject(const cl::Memory & memory,void * mappedPtr,const cl::EventPtrs & waitEvents,CLEventImpl::CreateFunc * eventCreateFunc)906 angle::Result CLCommandQueueVk::enqueueUnmapMemObject(const cl::Memory &memory,
907                                                       void *mappedPtr,
908                                                       const cl::EventPtrs &waitEvents,
909                                                       CLEventImpl::CreateFunc *eventCreateFunc)
910 {
911     std::scoped_lock<std::mutex> sl(mCommandQueueMutex);
912 
913     ANGLE_TRY(processWaitlist(waitEvents));
914 
915     cl::ExecutionStatus eventComplete = cl::ExecutionStatus::Queued;
916     if (!eventCreateFunc)
917     {
918         ANGLE_TRY(finishInternal());
919         eventComplete = cl::ExecutionStatus::Complete;
920     }
921 
922     if (memory.getType() == cl::MemObjectType::Buffer)
923     {
924         CLBufferVk &bufferVk = memory.getImpl<CLBufferVk>();
925         if (memory.getFlags().intersects(CL_MEM_USE_HOST_PTR))
926         {
927             ANGLE_TRY(finishInternal());
928             ANGLE_TRY(bufferVk.copyFrom(memory.getHostPtr(), 0, bufferVk.getSize()));
929             eventComplete = cl::ExecutionStatus::Complete;
930         }
931     }
932     else if (memory.getType() != cl::MemObjectType::Pipe)
933     {
934         // of image type
935         CLImageVk &imageVk = memory.getImpl<CLImageVk>();
936         if (memory.getFlags().intersects(CL_MEM_USE_HOST_PTR))
937         {
938             uint8_t *mapPointer = static_cast<uint8_t *>(memory.getHostPtr());
939             ANGLE_TRY(imageVk.copyStagingFrom(mapPointer, 0, imageVk.getSize()));
940         }
941         cl::Extents extent = imageVk.getImageExtent();
942         ANGLE_TRY(copyImageToFromBuffer(imageVk, imageVk.getStagingBuffer(), cl::kMemOffsetsZero,
943                                         {extent.width, extent.height, extent.depth}, 0,
944                                         ImageBufferCopyDirection::ToImage));
945         ANGLE_TRY(finishInternal());
946         eventComplete = cl::ExecutionStatus::Complete;
947     }
948     else
949     {
950         // mem object type pipe is not supported and creation of such an object should have
951         // failed
952         UNREACHABLE();
953     }
954 
955     memory.getImpl<CLMemoryVk>().unmap();
956     ANGLE_TRY(createEvent(eventCreateFunc, eventComplete));
957 
958     return angle::Result::Continue;
959 }
960 
enqueueMigrateMemObjects(const cl::MemoryPtrs & memObjects,cl::MemMigrationFlags flags,const cl::EventPtrs & waitEvents,CLEventImpl::CreateFunc * eventCreateFunc)961 angle::Result CLCommandQueueVk::enqueueMigrateMemObjects(const cl::MemoryPtrs &memObjects,
962                                                          cl::MemMigrationFlags flags,
963                                                          const cl::EventPtrs &waitEvents,
964                                                          CLEventImpl::CreateFunc *eventCreateFunc)
965 {
966     std::scoped_lock<std::mutex> sl(mCommandQueueMutex);
967 
968     ANGLE_TRY(processWaitlist(waitEvents));
969 
970     if (mCommandQueue.getContext().getDevices().size() > 1)
971     {
972         // TODO(aannestrand): Later implement support to allow migration of mem objects across
973         // different devices. http://anglebug.com/377942759
974         UNIMPLEMENTED();
975         ANGLE_CL_RETURN_ERROR(CL_OUT_OF_RESOURCES);
976     }
977 
978     ANGLE_TRY(createEvent(eventCreateFunc, cl::ExecutionStatus::Complete));
979 
980     return angle::Result::Continue;
981 }
982 
enqueueNDRangeKernel(const cl::Kernel & kernel,const cl::NDRange & ndrange,const cl::EventPtrs & waitEvents,CLEventImpl::CreateFunc * eventCreateFunc)983 angle::Result CLCommandQueueVk::enqueueNDRangeKernel(const cl::Kernel &kernel,
984                                                      const cl::NDRange &ndrange,
985                                                      const cl::EventPtrs &waitEvents,
986                                                      CLEventImpl::CreateFunc *eventCreateFunc)
987 {
988     std::scoped_lock<std::mutex> sl(mCommandQueueMutex);
989 
990     ANGLE_TRY(processWaitlist(waitEvents));
991 
992     cl::WorkgroupCount workgroupCount;
993     vk::PipelineCacheAccess pipelineCache;
994     vk::PipelineHelper *pipelineHelper = nullptr;
995     CLKernelVk &kernelImpl             = kernel.getImpl<CLKernelVk>();
996 
997     // Here, we create-update-bind the kernel's descriptor set, put push-constants in cmd
998     // buffer, capture kernel resources, and handle kernel execution dependencies
999     ANGLE_TRY(processKernelResources(kernelImpl, ndrange, workgroupCount));
1000 
1001     // Fetch or create compute pipeline (if we miss in cache)
1002     ANGLE_CL_IMPL_TRY_ERROR(mContext->getRenderer()->getPipelineCache(mContext, &pipelineCache),
1003                             CL_OUT_OF_RESOURCES);
1004     ANGLE_TRY(kernelImpl.getOrCreateComputePipeline(
1005         &pipelineCache, ndrange, mCommandQueue.getDevice(), &pipelineHelper, &workgroupCount));
1006 
1007     mComputePassCommands->retainResource(pipelineHelper);
1008 
1009     mComputePassCommands->getCommandBuffer().bindComputePipeline(pipelineHelper->getPipeline());
1010     mComputePassCommands->getCommandBuffer().dispatch(workgroupCount[0], workgroupCount[1],
1011                                                       workgroupCount[2]);
1012 
1013     ANGLE_TRY(createEvent(eventCreateFunc, cl::ExecutionStatus::Queued));
1014 
1015     return angle::Result::Continue;
1016 }
1017 
enqueueTask(const cl::Kernel & kernel,const cl::EventPtrs & waitEvents,CLEventImpl::CreateFunc * eventCreateFunc)1018 angle::Result CLCommandQueueVk::enqueueTask(const cl::Kernel &kernel,
1019                                             const cl::EventPtrs &waitEvents,
1020                                             CLEventImpl::CreateFunc *eventCreateFunc)
1021 {
1022     constexpr size_t globalWorkSize[3] = {1, 0, 0};
1023     constexpr size_t localWorkSize[3]  = {1, 0, 0};
1024     cl::NDRange ndrange(1, nullptr, globalWorkSize, localWorkSize);
1025     return enqueueNDRangeKernel(kernel, ndrange, waitEvents, eventCreateFunc);
1026 }
1027 
enqueueNativeKernel(cl::UserFunc userFunc,void * args,size_t cbArgs,const cl::BufferPtrs & buffers,const std::vector<size_t> bufferPtrOffsets,const cl::EventPtrs & waitEvents,CLEventImpl::CreateFunc * eventCreateFunc)1028 angle::Result CLCommandQueueVk::enqueueNativeKernel(cl::UserFunc userFunc,
1029                                                     void *args,
1030                                                     size_t cbArgs,
1031                                                     const cl::BufferPtrs &buffers,
1032                                                     const std::vector<size_t> bufferPtrOffsets,
1033                                                     const cl::EventPtrs &waitEvents,
1034                                                     CLEventImpl::CreateFunc *eventCreateFunc)
1035 {
1036     UNIMPLEMENTED();
1037     ANGLE_CL_RETURN_ERROR(CL_OUT_OF_RESOURCES);
1038 }
1039 
enqueueMarkerWithWaitList(const cl::EventPtrs & waitEvents,CLEventImpl::CreateFunc * eventCreateFunc)1040 angle::Result CLCommandQueueVk::enqueueMarkerWithWaitList(const cl::EventPtrs &waitEvents,
1041                                                           CLEventImpl::CreateFunc *eventCreateFunc)
1042 {
1043     std::scoped_lock<std::mutex> sl(mCommandQueueMutex);
1044 
1045     ANGLE_TRY(processWaitlist(waitEvents));
1046     ANGLE_TRY(createEvent(eventCreateFunc, cl::ExecutionStatus::Queued));
1047 
1048     return angle::Result::Continue;
1049 }
1050 
enqueueMarker(CLEventImpl::CreateFunc & eventCreateFunc)1051 angle::Result CLCommandQueueVk::enqueueMarker(CLEventImpl::CreateFunc &eventCreateFunc)
1052 {
1053     std::scoped_lock<std::mutex> sl(mCommandQueueMutex);
1054 
1055     // This deprecated API is essentially a super-set of clEnqueueBarrier, where we also return
1056     // an event object (i.e. marker) since clEnqueueBarrier does not provide this
1057     ANGLE_TRY(insertBarrier());
1058 
1059     ANGLE_TRY(createEvent(&eventCreateFunc, cl::ExecutionStatus::Queued));
1060 
1061     return angle::Result::Continue;
1062 }
1063 
enqueueWaitForEvents(const cl::EventPtrs & events)1064 angle::Result CLCommandQueueVk::enqueueWaitForEvents(const cl::EventPtrs &events)
1065 {
1066     std::scoped_lock<std::mutex> sl(mCommandQueueMutex);
1067 
1068     // Unlike clWaitForEvents, this routine is non-blocking
1069     ANGLE_TRY(processWaitlist(events));
1070 
1071     return angle::Result::Continue;
1072 }
1073 
enqueueBarrierWithWaitList(const cl::EventPtrs & waitEvents,CLEventImpl::CreateFunc * eventCreateFunc)1074 angle::Result CLCommandQueueVk::enqueueBarrierWithWaitList(const cl::EventPtrs &waitEvents,
1075                                                            CLEventImpl::CreateFunc *eventCreateFunc)
1076 {
1077     std::scoped_lock<std::mutex> sl(mCommandQueueMutex);
1078 
1079     // The barrier command either waits for a list of events to complete, or if the list is
1080     // empty it waits for all commands previously enqueued in command_queue to complete before
1081     // it completes
1082     if (waitEvents.empty())
1083     {
1084         ANGLE_TRY(insertBarrier());
1085     }
1086     else
1087     {
1088         ANGLE_TRY(processWaitlist(waitEvents));
1089     }
1090 
1091     ANGLE_TRY(createEvent(eventCreateFunc, cl::ExecutionStatus::Queued));
1092 
1093     return angle::Result::Continue;
1094 }
1095 
insertBarrier()1096 angle::Result CLCommandQueueVk::insertBarrier()
1097 {
1098     VkMemoryBarrier memoryBarrier = {VK_STRUCTURE_TYPE_MEMORY_BARRIER, nullptr,
1099                                      VK_ACCESS_SHADER_WRITE_BIT,
1100                                      VK_ACCESS_MEMORY_READ_BIT | VK_ACCESS_MEMORY_WRITE_BIT};
1101     mComputePassCommands->getCommandBuffer().pipelineBarrier(
1102         VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, 1,
1103         &memoryBarrier, 0, nullptr, 0, nullptr);
1104 
1105     return angle::Result::Continue;
1106 }
1107 
enqueueBarrier()1108 angle::Result CLCommandQueueVk::enqueueBarrier()
1109 {
1110     std::scoped_lock<std::mutex> sl(mCommandQueueMutex);
1111 
1112     ANGLE_TRY(insertBarrier());
1113 
1114     return angle::Result::Continue;
1115 }
1116 
flush()1117 angle::Result CLCommandQueueVk::flush()
1118 {
1119     ANGLE_TRACE_EVENT0("gpu.angle", "CLCommandQueueVk::flush");
1120 
1121     // Non-blocking finish
1122     // TODO: Ideally we should try to find better impl. to avoid spawning a submit-thread/Task here
1123     // https://anglebug.com/42267107
1124     std::shared_ptr<angle::WaitableEvent> asyncEvent =
1125         getPlatform()->postMultiThreadWorkerTask(std::make_shared<CLAsyncFinishTask>(this));
1126     ASSERT(asyncEvent != nullptr);
1127 
1128     return angle::Result::Continue;
1129 }
1130 
finish()1131 angle::Result CLCommandQueueVk::finish()
1132 {
1133     std::scoped_lock<std::mutex> sl(mCommandQueueMutex);
1134 
1135     ANGLE_TRACE_EVENT0("gpu.angle", "CLCommandQueueVk::finish");
1136 
1137     // Blocking finish
1138     return finishInternal();
1139 }
1140 
syncHostBuffers()1141 angle::Result CLCommandQueueVk::syncHostBuffers()
1142 {
1143     if (!mHostTransferList.empty())
1144     {
1145         for (const HostTransferEntry &hostTransferEntry : mHostTransferList)
1146         {
1147             const HostTransferConfig &transferConfig = hostTransferEntry.transferConfig;
1148             CLBufferVk &transferBufferVk =
1149                 hostTransferEntry.transferBufferHandle->getImpl<CLBufferVk>();
1150             switch (hostTransferEntry.transferConfig.type)
1151             {
1152                 case CL_COMMAND_READ_BUFFER:
1153                 case CL_COMMAND_READ_IMAGE:
1154                     ANGLE_TRY(transferBufferVk.copyTo(transferConfig.dstHostPtr,
1155                                                       transferConfig.offset, transferConfig.size));
1156                     break;
1157                 default:
1158                     UNIMPLEMENTED();
1159                     break;
1160             }
1161         }
1162     }
1163     mHostTransferList.clear();
1164 
1165     return angle::Result::Continue;
1166 }
1167 
addMemoryDependencies(cl::Memory * clMem)1168 angle::Result CLCommandQueueVk::addMemoryDependencies(cl::Memory *clMem)
1169 {
1170     cl::Memory *parentMem = clMem->getParent() ? clMem->getParent().get() : nullptr;
1171 
1172     // Take an usage count
1173     mMemoryCaptures.emplace_back(clMem);
1174 
1175     // Handle possible resource RAW hazard
1176     bool insertBarrier = false;
1177     if (clMem->getFlags().intersects(CL_MEM_READ_WRITE))
1178     {
1179         // Texel buffers have backing buffer obects
1180         if (mDependencyTracker.contains(clMem) || mDependencyTracker.contains(parentMem) ||
1181             mDependencyTracker.size() == kMaxDependencyTrackerSize)
1182         {
1183             insertBarrier = true;
1184             mDependencyTracker.clear();
1185         }
1186         mDependencyTracker.insert(clMem);
1187         if (parentMem)
1188         {
1189             mDependencyTracker.insert(parentMem);
1190         }
1191     }
1192 
1193     // Insert a layout transition for images
1194     if (cl::IsImageType(clMem->getType()))
1195     {
1196         CLImageVk &vkMem = clMem->getImpl<CLImageVk>();
1197         mComputePassCommands->imageWrite(mContext, gl::LevelIndex(0), 0, 1,
1198                                          vkMem.getImage().getAspectFlags(),
1199                                          vk::ImageLayout::ComputeShaderWrite, &vkMem.getImage());
1200     }
1201     else if (insertBarrier && cl::IsBufferType(clMem->getType()))
1202     {
1203         CLBufferVk &vkMem = clMem->getImpl<CLBufferVk>();
1204 
1205         mComputePassCommands->bufferWrite(VK_ACCESS_SHADER_WRITE_BIT,
1206                                           vk::PipelineStage::ComputeShader, &vkMem.getBuffer());
1207     }
1208 
1209     return angle::Result::Continue;
1210 }
1211 
processKernelResources(CLKernelVk & kernelVk,const cl::NDRange & ndrange,const cl::WorkgroupCount & workgroupCount)1212 angle::Result CLCommandQueueVk::processKernelResources(CLKernelVk &kernelVk,
1213                                                        const cl::NDRange &ndrange,
1214                                                        const cl::WorkgroupCount &workgroupCount)
1215 {
1216     bool needsBarrier = false;
1217     const CLProgramVk::DeviceProgramData *devProgramData =
1218         kernelVk.getProgram()->getDeviceProgramData(mCommandQueue.getDevice().getNative());
1219     ASSERT(devProgramData != nullptr);
1220 
1221     // Set the descriptor set layouts and allocate descriptor sets
1222     // The descriptor set layouts are setup in the order of their appearance, as Vulkan requires
1223     // them to point to valid handles.
1224     angle::EnumIterator<DescriptorSetIndex> layoutIndex(DescriptorSetIndex::LiteralSampler);
1225     for (DescriptorSetIndex index : angle::AllEnums<DescriptorSetIndex>())
1226     {
1227         if (!kernelVk.getDescriptorSetLayoutDesc(index).empty())
1228         {
1229             // Setup the descriptor layout
1230             ANGLE_CL_IMPL_TRY_ERROR(mContext->getDescriptorSetLayoutCache()->getDescriptorSetLayout(
1231                                         mContext, kernelVk.getDescriptorSetLayoutDesc(index),
1232                                         &kernelVk.getDescriptorSetLayouts()[*layoutIndex]),
1233                                     CL_INVALID_OPERATION);
1234 
1235             ANGLE_CL_IMPL_TRY_ERROR(
1236                 kernelVk.getProgram()->getMetaDescriptorPool(index).bindCachedDescriptorPool(
1237                     mContext, kernelVk.getDescriptorSetLayoutDesc(index), 1,
1238                     mContext->getDescriptorSetLayoutCache(),
1239                     &kernelVk.getProgram()->getDynamicDescriptorPoolPointer(index)),
1240                 CL_INVALID_OPERATION);
1241 
1242             // Allocate descriptor set
1243             ANGLE_TRY(kernelVk.allocateDescriptorSet(index, layoutIndex, mComputePassCommands));
1244             ++layoutIndex;
1245         }
1246     }
1247 
1248     // Setup the pipeline layout
1249     ANGLE_CL_IMPL_TRY_ERROR(kernelVk.initPipelineLayout(), CL_INVALID_OPERATION);
1250 
1251     // Push global offset data
1252     const VkPushConstantRange *globalOffsetRange = devProgramData->getGlobalOffsetRange();
1253     if (globalOffsetRange != nullptr)
1254     {
1255         mComputePassCommands->getCommandBuffer().pushConstants(
1256             kernelVk.getPipelineLayout(), VK_SHADER_STAGE_COMPUTE_BIT, globalOffsetRange->offset,
1257             globalOffsetRange->size, ndrange.globalWorkOffset.data());
1258     }
1259 
1260     // Push global size data
1261     const VkPushConstantRange *globalSizeRange = devProgramData->getGlobalSizeRange();
1262     if (globalSizeRange != nullptr)
1263     {
1264         mComputePassCommands->getCommandBuffer().pushConstants(
1265             kernelVk.getPipelineLayout(), VK_SHADER_STAGE_COMPUTE_BIT, globalSizeRange->offset,
1266             globalSizeRange->size, ndrange.globalWorkSize.data());
1267     }
1268 
1269     // Push region offset data.
1270     const VkPushConstantRange *regionOffsetRange = devProgramData->getRegionOffsetRange();
1271     if (regionOffsetRange != nullptr)
1272     {
1273         // We dont support non-uniform batches yet in ANGLE, this field also represents global
1274         // offset for NDR in uniform cases. Update this when non-uniform batches are supported.
1275         // https://github.com/google/clspv/blob/main/docs/OpenCLCOnVulkan.md#module-scope-push-constants
1276         mComputePassCommands->getCommandBuffer().pushConstants(
1277             kernelVk.getPipelineLayout(), VK_SHADER_STAGE_COMPUTE_BIT, regionOffsetRange->offset,
1278             regionOffsetRange->size, ndrange.globalWorkOffset.data());
1279     }
1280 
1281     // Push region group offset data.
1282     const VkPushConstantRange *regionGroupOffsetRange = devProgramData->getRegionGroupOffsetRange();
1283     if (regionGroupOffsetRange != nullptr)
1284     {
1285         // We dont support non-uniform batches yet in ANGLE, and based on clspv doc/notes:
1286         // "only required when non-uniform NDRanges are supported"
1287         // For now, we set this field to zeros until we later support non-uniform.
1288         // https://github.com/google/clspv/blob/main/docs/OpenCLCOnVulkan.md#module-scope-push-constants
1289         uint32_t regionGroupOffsets[3] = {0, 0, 0};
1290         mComputePassCommands->getCommandBuffer().pushConstants(
1291             kernelVk.getPipelineLayout(), VK_SHADER_STAGE_COMPUTE_BIT,
1292             regionGroupOffsetRange->offset, regionGroupOffsetRange->size, &regionGroupOffsets);
1293     }
1294 
1295     // Push enqueued local size
1296     const VkPushConstantRange *enqueuedLocalSizeRange = devProgramData->getEnqueuedLocalSizeRange();
1297     if (enqueuedLocalSizeRange != nullptr)
1298     {
1299         mComputePassCommands->getCommandBuffer().pushConstants(
1300             kernelVk.getPipelineLayout(), VK_SHADER_STAGE_COMPUTE_BIT,
1301             enqueuedLocalSizeRange->offset, enqueuedLocalSizeRange->size,
1302             ndrange.localWorkSize.data());
1303     }
1304 
1305     // Push number of workgroups
1306     const VkPushConstantRange *numWorkgroupsRange = devProgramData->getNumWorkgroupsRange();
1307     if (devProgramData->reflectionData.pushConstants.contains(
1308             NonSemanticClspvReflectionPushConstantNumWorkgroups))
1309     {
1310         uint32_t numWorkgroups[3] = {workgroupCount[0], workgroupCount[1], workgroupCount[2]};
1311         mComputePassCommands->getCommandBuffer().pushConstants(
1312             kernelVk.getPipelineLayout(), VK_SHADER_STAGE_COMPUTE_BIT, numWorkgroupsRange->offset,
1313             numWorkgroupsRange->size, &numWorkgroups);
1314     }
1315 
1316     // Retain kernel object until we finish executing it later
1317     mKernelCaptures.push_back(cl::KernelPtr{&kernelVk.getFrontendObject()});
1318 
1319     // Process each kernel argument/resource
1320     vk::DescriptorSetArray<UpdateDescriptorSetsBuilder> updateDescriptorSetsBuilders;
1321     CLKernelArguments args = kernelVk.getArgs();
1322     for (size_t index = 0; index < args.size(); index++)
1323     {
1324         const auto &arg = args.at(index);
1325         UpdateDescriptorSetsBuilder &kernelArgDescSetBuilder =
1326             updateDescriptorSetsBuilders[DescriptorSetIndex::KernelArguments];
1327         switch (arg.type)
1328         {
1329             case NonSemanticClspvReflectionArgumentUniform:
1330             case NonSemanticClspvReflectionArgumentStorageBuffer:
1331             {
1332                 cl::Memory *clMem = cl::Buffer::Cast(*static_cast<const cl_mem *>(arg.handle));
1333                 CLBufferVk &vkMem = clMem->getImpl<CLBufferVk>();
1334 
1335                 ANGLE_TRY(addMemoryDependencies(clMem));
1336 
1337                 // Update buffer/descriptor info
1338                 VkDescriptorBufferInfo &bufferInfo =
1339                     kernelArgDescSetBuilder.allocDescriptorBufferInfo();
1340                 bufferInfo.range  = clMem->getSize();
1341                 bufferInfo.offset = clMem->getOffset();
1342                 bufferInfo.buffer = vkMem.getBuffer().getBuffer().getHandle();
1343                 VkWriteDescriptorSet &writeDescriptorSet =
1344                     kernelArgDescSetBuilder.allocWriteDescriptorSet();
1345                 writeDescriptorSet.descriptorCount = 1;
1346                 writeDescriptorSet.descriptorType =
1347                     arg.type == NonSemanticClspvReflectionArgumentUniform
1348                         ? VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER
1349                         : VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
1350                 writeDescriptorSet.pBufferInfo = &bufferInfo;
1351                 writeDescriptorSet.sType       = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
1352                 writeDescriptorSet.dstSet =
1353                     kernelVk.getDescriptorSet(DescriptorSetIndex::KernelArguments);
1354                 writeDescriptorSet.dstBinding = arg.descriptorBinding;
1355                 break;
1356             }
1357             case NonSemanticClspvReflectionArgumentPodPushConstant:
1358             {
1359                 // Spec requires the size and offset to be multiple of 4, round up for size and
1360                 // round down for offset to ensure this
1361                 uint32_t offset = roundDownPow2(arg.pushConstOffset, 4u);
1362                 uint32_t size =
1363                     roundUpPow2(arg.pushConstOffset + arg.pushConstantSize, 4u) - offset;
1364                 ASSERT(offset + size <= kernelVk.getPodArgumentsData().size());
1365                 mComputePassCommands->getCommandBuffer().pushConstants(
1366                     kernelVk.getPipelineLayout(), VK_SHADER_STAGE_COMPUTE_BIT, offset, size,
1367                     &kernelVk.getPodArgumentsData()[offset]);
1368                 break;
1369             }
1370             case NonSemanticClspvReflectionArgumentSampler:
1371             {
1372                 cl::Sampler *clSampler =
1373                     cl::Sampler::Cast(*static_cast<const cl_sampler *>(arg.handle));
1374                 CLSamplerVk &vkSampler = clSampler->getImpl<CLSamplerVk>();
1375                 VkDescriptorImageInfo &samplerInfo =
1376                     kernelArgDescSetBuilder.allocDescriptorImageInfo();
1377                 samplerInfo.sampler = vkSampler.getSamplerHelper().get().getHandle();
1378                 VkWriteDescriptorSet &writeDescriptorSet =
1379                     kernelArgDescSetBuilder.allocWriteDescriptorSet();
1380                 writeDescriptorSet.descriptorCount = 1;
1381                 writeDescriptorSet.descriptorType  = VK_DESCRIPTOR_TYPE_SAMPLER;
1382                 writeDescriptorSet.pImageInfo      = &samplerInfo;
1383                 writeDescriptorSet.sType           = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
1384                 writeDescriptorSet.dstSet =
1385                     kernelVk.getDescriptorSet(DescriptorSetIndex::KernelArguments);
1386                 writeDescriptorSet.dstBinding = arg.descriptorBinding;
1387 
1388                 const VkPushConstantRange *samplerMaskRange =
1389                     devProgramData->getNormalizedSamplerMaskRange(index);
1390                 if (samplerMaskRange != nullptr)
1391                 {
1392                     if (clSampler->getNormalizedCoords() == false)
1393                     {
1394                         ANGLE_TRY(vkSampler.createNormalized());
1395                         samplerInfo.sampler =
1396                             vkSampler.getSamplerHelperNormalized().get().getHandle();
1397                     }
1398                     uint32_t mask = vkSampler.getSamplerMask();
1399                     mComputePassCommands->getCommandBuffer().pushConstants(
1400                         kernelVk.getPipelineLayout(), VK_SHADER_STAGE_COMPUTE_BIT,
1401                         samplerMaskRange->offset, samplerMaskRange->size, &mask);
1402                 }
1403                 break;
1404             }
1405             case NonSemanticClspvReflectionArgumentStorageImage:
1406             case NonSemanticClspvReflectionArgumentSampledImage:
1407             {
1408                 cl::Memory *clMem = cl::Image::Cast(*static_cast<const cl_mem *>(arg.handle));
1409                 CLImageVk &vkMem  = clMem->getImpl<CLImageVk>();
1410 
1411                 ANGLE_TRY(addMemoryDependencies(clMem));
1412 
1413                 cl_image_format imageFormat = vkMem.getFormat();
1414                 const VkPushConstantRange *imageDataChannelOrderRange =
1415                     devProgramData->getImageDataChannelOrderRange(index);
1416                 if (imageDataChannelOrderRange != nullptr)
1417                 {
1418                     mComputePassCommands->getCommandBuffer().pushConstants(
1419                         kernelVk.getPipelineLayout(), VK_SHADER_STAGE_COMPUTE_BIT,
1420                         imageDataChannelOrderRange->offset, imageDataChannelOrderRange->size,
1421                         &imageFormat.image_channel_order);
1422                 }
1423 
1424                 const VkPushConstantRange *imageDataChannelDataTypeRange =
1425                     devProgramData->getImageDataChannelDataTypeRange(index);
1426                 if (imageDataChannelDataTypeRange != nullptr)
1427                 {
1428                     mComputePassCommands->getCommandBuffer().pushConstants(
1429                         kernelVk.getPipelineLayout(), VK_SHADER_STAGE_COMPUTE_BIT,
1430                         imageDataChannelDataTypeRange->offset, imageDataChannelDataTypeRange->size,
1431                         &imageFormat.image_channel_data_type);
1432                 }
1433 
1434                 // Update image/descriptor info
1435                 VkDescriptorImageInfo &imageInfo =
1436                     kernelArgDescSetBuilder.allocDescriptorImageInfo();
1437                 imageInfo.imageLayout =
1438                     arg.type == NonSemanticClspvReflectionArgumentStorageImage
1439                         ? VK_IMAGE_LAYOUT_GENERAL
1440                         : vkMem.getImage().getCurrentLayout(mContext->getRenderer());
1441                 imageInfo.imageView = vkMem.getImageView().getHandle();
1442                 imageInfo.sampler   = VK_NULL_HANDLE;
1443                 VkWriteDescriptorSet &writeDescriptorSet =
1444                     kernelArgDescSetBuilder.allocWriteDescriptorSet();
1445                 writeDescriptorSet.descriptorCount = 1;
1446                 writeDescriptorSet.descriptorType =
1447                     arg.type == NonSemanticClspvReflectionArgumentStorageImage
1448                         ? VK_DESCRIPTOR_TYPE_STORAGE_IMAGE
1449                         : VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE;
1450                 writeDescriptorSet.pImageInfo = &imageInfo;
1451                 writeDescriptorSet.sType      = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
1452                 writeDescriptorSet.dstSet =
1453                     kernelVk.getDescriptorSet(DescriptorSetIndex::KernelArguments);
1454                 writeDescriptorSet.dstBinding = arg.descriptorBinding;
1455                 break;
1456             }
1457             case NonSemanticClspvReflectionArgumentUniformTexelBuffer:
1458             case NonSemanticClspvReflectionArgumentStorageTexelBuffer:
1459             {
1460                 cl::Memory *clMem = cl::Image::Cast(*static_cast<const cl_mem *>(arg.handle));
1461                 CLImageVk &vkMem  = clMem->getImpl<CLImageVk>();
1462 
1463                 ANGLE_TRY(addMemoryDependencies(clMem));
1464 
1465                 VkBufferView &bufferView           = kernelArgDescSetBuilder.allocBufferView();
1466                 const vk::BufferView *vkBufferView = nullptr;
1467                 ANGLE_TRY(vkMem.getBufferView(&vkBufferView));
1468                 bufferView = vkBufferView->getHandle();
1469 
1470                 VkWriteDescriptorSet &writeDescriptorSet =
1471                     kernelArgDescSetBuilder.allocWriteDescriptorSet();
1472                 writeDescriptorSet.descriptorCount = 1;
1473                 writeDescriptorSet.descriptorType =
1474                     arg.type == NonSemanticClspvReflectionArgumentStorageTexelBuffer
1475                         ? VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER
1476                         : VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER;
1477                 writeDescriptorSet.pImageInfo = nullptr;
1478                 writeDescriptorSet.sType      = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
1479                 writeDescriptorSet.dstSet =
1480                     kernelVk.getDescriptorSet(DescriptorSetIndex::KernelArguments);
1481                 writeDescriptorSet.dstBinding       = arg.descriptorBinding;
1482                 writeDescriptorSet.pTexelBufferView = &bufferView;
1483 
1484                 break;
1485             }
1486             case NonSemanticClspvReflectionArgumentPodUniform:
1487             case NonSemanticClspvReflectionArgumentPointerUniform:
1488             case NonSemanticClspvReflectionArgumentPodStorageBuffer:
1489             case NonSemanticClspvReflectionArgumentPointerPushConstant:
1490             default:
1491             {
1492                 UNIMPLEMENTED();
1493                 break;
1494             }
1495         }
1496     }
1497 
1498     // process the printf storage buffer
1499     if (kernelVk.usesPrintf())
1500     {
1501         UpdateDescriptorSetsBuilder &printfDescSetBuilder =
1502             updateDescriptorSetsBuilders[DescriptorSetIndex::Printf];
1503 
1504         cl::Memory *clMem   = cl::Buffer::Cast(getOrCreatePrintfBuffer());
1505         CLBufferVk &vkMem   = clMem->getImpl<CLBufferVk>();
1506         uint8_t *mapPointer = nullptr;
1507         ANGLE_TRY(vkMem.map(mapPointer, 0));
1508         // The spec calls out *The first 4 bytes of the buffer should be zero-initialized.*
1509         memset(mapPointer, 0, 4);
1510 
1511         auto &bufferInfo  = printfDescSetBuilder.allocDescriptorBufferInfo();
1512         bufferInfo.range  = clMem->getSize();
1513         bufferInfo.offset = clMem->getOffset();
1514         bufferInfo.buffer = vkMem.getBuffer().getBuffer().getHandle();
1515 
1516         auto &writeDescriptorSet           = printfDescSetBuilder.allocWriteDescriptorSet();
1517         writeDescriptorSet.descriptorCount = 1;
1518         writeDescriptorSet.descriptorType  = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
1519         writeDescriptorSet.pBufferInfo     = &bufferInfo;
1520         writeDescriptorSet.sType           = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
1521         writeDescriptorSet.dstSet          = kernelVk.getDescriptorSet(DescriptorSetIndex::Printf);
1522         writeDescriptorSet.dstBinding      = kernelVk.getProgram()
1523                                             ->getDeviceProgramData(kernelVk.getKernelName().c_str())
1524                                             ->reflectionData.printfBufferStorage.binding;
1525 
1526         mNeedPrintfHandling = true;
1527         mPrintfInfos        = kernelVk.getProgram()->getPrintfDescriptors(kernelVk.getKernelName());
1528     }
1529 
1530     angle::EnumIterator<DescriptorSetIndex> descriptorSetIndex(DescriptorSetIndex::LiteralSampler);
1531     for (DescriptorSetIndex index : angle::AllEnums<DescriptorSetIndex>())
1532     {
1533         if (!kernelVk.getDescriptorSetLayoutDesc(index).empty())
1534         {
1535             mContext->getPerfCounters().writeDescriptorSets =
1536                 updateDescriptorSetsBuilders[index].flushDescriptorSetUpdates(
1537                     mContext->getRenderer()->getDevice());
1538 
1539             VkDescriptorSet descriptorSet = kernelVk.getDescriptorSet(index);
1540             mComputePassCommands->getCommandBuffer().bindDescriptorSets(
1541                 kernelVk.getPipelineLayout(), VK_PIPELINE_BIND_POINT_COMPUTE, *descriptorSetIndex,
1542                 1, &descriptorSet, 0, nullptr);
1543 
1544             ++descriptorSetIndex;
1545         }
1546     }
1547 
1548     if (needsBarrier)
1549     {
1550         ANGLE_TRY(insertBarrier());
1551     }
1552 
1553     return angle::Result::Continue;
1554 }
1555 
flushComputePassCommands()1556 angle::Result CLCommandQueueVk::flushComputePassCommands()
1557 {
1558     if (mComputePassCommands->empty())
1559     {
1560         return angle::Result::Continue;
1561     }
1562 
1563     // Flush any host visible buffers by adding appropriate barriers
1564     if (mComputePassCommands->getAndResetHasHostVisibleBufferWrite())
1565     {
1566         // Make sure all writes to host-visible buffers are flushed.
1567         VkMemoryBarrier memoryBarrier = {};
1568         memoryBarrier.sType           = VK_STRUCTURE_TYPE_MEMORY_BARRIER;
1569         memoryBarrier.srcAccessMask   = VK_ACCESS_MEMORY_WRITE_BIT;
1570         memoryBarrier.dstAccessMask   = VK_ACCESS_HOST_READ_BIT | VK_ACCESS_HOST_WRITE_BIT;
1571 
1572         mComputePassCommands->getCommandBuffer().memoryBarrier(
1573             VK_PIPELINE_STAGE_TRANSFER_BIT | VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
1574             VK_PIPELINE_STAGE_HOST_BIT, memoryBarrier);
1575     }
1576 
1577     mLastFlushedQueueSerial = mComputePassCommands->getQueueSerial();
1578     // Here, we flush our compute cmds to RendererVk's primary command buffer
1579     ANGLE_TRY(mContext->getRenderer()->flushOutsideRPCommands(
1580         mContext, getProtectionType(), egl::ContextPriority::Medium, &mComputePassCommands));
1581 
1582     mHasAnyCommandsPendingSubmission = true;
1583 
1584     mContext->getPerfCounters().flushedOutsideRenderPassCommandBuffers++;
1585 
1586     // Generate new serial for next batch of cmds
1587     mComputePassCommands->setQueueSerial(
1588         mCurrentQueueSerialIndex,
1589         mContext->getRenderer()->generateQueueSerial(mCurrentQueueSerialIndex));
1590 
1591     return angle::Result::Continue;
1592 }
1593 
processWaitlist(const cl::EventPtrs & waitEvents)1594 angle::Result CLCommandQueueVk::processWaitlist(const cl::EventPtrs &waitEvents)
1595 {
1596     if (!waitEvents.empty())
1597     {
1598         bool insertedBarrier = false;
1599         for (const cl::EventPtr &event : waitEvents)
1600         {
1601             if (event->getImpl<CLEventVk>().isUserEvent() ||
1602                 event->getCommandQueue() != &mCommandQueue)
1603             {
1604                 // We cannot use a barrier in these cases, therefore defer the event
1605                 // handling till submission time
1606                 // TODO: Perhaps we could utilize VkEvents here instead and have GPU wait(s)
1607                 // https://anglebug.com/42267109
1608                 mDependantEvents.push_back(event);
1609             }
1610             else if (event->getCommandQueue() == &mCommandQueue && !insertedBarrier)
1611             {
1612                 // As long as there is at least one dependant command in same queue,
1613                 // we just need to insert one execution barrier
1614                 ANGLE_TRY(insertBarrier());
1615 
1616                 insertedBarrier = true;
1617             }
1618         }
1619     }
1620     return angle::Result::Continue;
1621 }
1622 
submitCommands()1623 angle::Result CLCommandQueueVk::submitCommands()
1624 {
1625     ANGLE_TRACE_EVENT0("gpu.angle", "CLCommandQueueVk::submitCommands()");
1626 
1627     // Kick off renderer submit
1628     ANGLE_TRY(mContext->getRenderer()->submitCommands(mContext, getProtectionType(),
1629                                                       egl::ContextPriority::Medium, nullptr,
1630                                                       nullptr, mLastFlushedQueueSerial));
1631 
1632     mLastSubmittedQueueSerial = mLastFlushedQueueSerial;
1633 
1634     // Now that we have submitted commands, some of pending garbage may no longer pending
1635     // and should be moved to garbage list.
1636     mContext->getRenderer()->cleanupPendingSubmissionGarbage();
1637 
1638     mHasAnyCommandsPendingSubmission = false;
1639 
1640     return angle::Result::Continue;
1641 }
1642 
createEvent(CLEventImpl::CreateFunc * createFunc,cl::ExecutionStatus initialStatus)1643 angle::Result CLCommandQueueVk::createEvent(CLEventImpl::CreateFunc *createFunc,
1644                                             cl::ExecutionStatus initialStatus)
1645 {
1646     if (createFunc != nullptr)
1647     {
1648         *createFunc = [this, initialStatus](const cl::Event &event) {
1649             auto eventVk = new (std::nothrow) CLEventVk(event);
1650             if (eventVk == nullptr)
1651             {
1652                 ERR() << "Failed to create event obj!";
1653                 ANGLE_CL_SET_ERROR(CL_OUT_OF_HOST_MEMORY);
1654                 return CLEventImpl::Ptr(nullptr);
1655             }
1656 
1657             if (initialStatus == cl::ExecutionStatus::Complete)
1658             {
1659                 // Submission finished at this point, just set event to complete
1660                 if (IsError(eventVk->setStatusAndExecuteCallback(cl::ToCLenum(initialStatus))))
1661                 {
1662                     ANGLE_CL_SET_ERROR(CL_OUT_OF_RESOURCES);
1663                 }
1664             }
1665             else if (mCommandQueue.getProperties().intersects(CL_QUEUE_PROFILING_ENABLE))
1666             {
1667                 // We also block for profiling so that we get timestamps per-command
1668                 if (IsError(mCommandQueue.getImpl<CLCommandQueueVk>().finish()))
1669                 {
1670                     ANGLE_CL_SET_ERROR(CL_OUT_OF_RESOURCES);
1671                 }
1672                 // Submission finished at this point, just set event to complete
1673                 if (IsError(eventVk->setStatusAndExecuteCallback(CL_COMPLETE)))
1674                 {
1675                     ANGLE_CL_SET_ERROR(CL_OUT_OF_RESOURCES);
1676                 }
1677             }
1678             else
1679             {
1680                 eventVk->setQueueSerial(mComputePassCommands->getQueueSerial());
1681                 // Save a reference to this event
1682                 mAssociatedEvents.push_back(cl::EventPtr{&eventVk->getFrontendObject()});
1683             }
1684 
1685             return CLEventImpl::Ptr(eventVk);
1686         };
1687     }
1688     return angle::Result::Continue;
1689 }
1690 
finishInternal()1691 angle::Result CLCommandQueueVk::finishInternal()
1692 {
1693     for (cl::EventPtr event : mAssociatedEvents)
1694     {
1695         ANGLE_TRY(event->getImpl<CLEventVk>().setStatusAndExecuteCallback(CL_SUBMITTED));
1696     }
1697 
1698     if (!mComputePassCommands->empty())
1699     {
1700         // If we still have dependant events, handle them now
1701         if (!mDependantEvents.empty())
1702         {
1703             for (const auto &depEvent : mDependantEvents)
1704             {
1705                 if (depEvent->getImpl<CLEventVk>().isUserEvent())
1706                 {
1707                     // We just wait here for user to set the event object
1708                     cl_int status = CL_QUEUED;
1709                     ANGLE_TRY(depEvent->getImpl<CLEventVk>().waitForUserEventStatus());
1710                     ANGLE_TRY(depEvent->getImpl<CLEventVk>().getCommandExecutionStatus(status));
1711                     if (status < 0)
1712                     {
1713                         ERR() << "Invalid dependant user-event (" << depEvent.get()
1714                               << ") status encountered!";
1715                         mComputePassCommands->getCommandBuffer().reset();
1716                         ANGLE_CL_RETURN_ERROR(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST);
1717                     }
1718                 }
1719                 else
1720                 {
1721                     // Otherwise, we just need to submit/finish for dependant event queues
1722                     // here that are not associated with this queue
1723                     ANGLE_TRY(depEvent->getCommandQueue()->finish());
1724                 }
1725             }
1726             mDependantEvents.clear();
1727         }
1728 
1729         ANGLE_TRY(flushComputePassCommands());
1730     }
1731 
1732     for (cl::EventPtr event : mAssociatedEvents)
1733     {
1734         ANGLE_TRY(event->getImpl<CLEventVk>().setStatusAndExecuteCallback(CL_RUNNING));
1735     }
1736 
1737     if (mHasAnyCommandsPendingSubmission)
1738     {
1739         // Submit and wait for fence
1740         ANGLE_TRY(submitCommands());
1741         ANGLE_TRY(mContext->getRenderer()->finishQueueSerial(mContext, mLastSubmittedQueueSerial));
1742 
1743         // Ensure any resources are synced back to host on GPU completion
1744         ANGLE_TRY(syncHostBuffers());
1745     }
1746 
1747     if (mNeedPrintfHandling)
1748     {
1749         ANGLE_TRY(processPrintfBuffer());
1750         mNeedPrintfHandling = false;
1751     }
1752 
1753     for (cl::EventPtr event : mAssociatedEvents)
1754     {
1755         ANGLE_TRY(event->getImpl<CLEventVk>().setStatusAndExecuteCallback(CL_COMPLETE));
1756     }
1757 
1758     mMemoryCaptures.clear();
1759     mAssociatedEvents.clear();
1760     mDependencyTracker.clear();
1761     mKernelCaptures.clear();
1762 
1763     return angle::Result::Continue;
1764 }
1765 
1766 // Helper function to insert appropriate memory barriers before accessing the resources in the
1767 // command buffer.
onResourceAccess(const vk::CommandBufferAccess & access)1768 angle::Result CLCommandQueueVk::onResourceAccess(const vk::CommandBufferAccess &access)
1769 {
1770     // Buffers
1771     for (const vk::CommandBufferBufferAccess &bufferAccess : access.getReadBuffers())
1772     {
1773         if (mComputePassCommands->usesBufferForWrite(*bufferAccess.buffer))
1774         {
1775             // read buffers only need a new command buffer if previously used for write
1776             ANGLE_TRY(flush());
1777         }
1778 
1779         mComputePassCommands->bufferRead(bufferAccess.accessType, bufferAccess.stage,
1780                                          bufferAccess.buffer);
1781     }
1782 
1783     for (const vk::CommandBufferBufferAccess &bufferAccess : access.getWriteBuffers())
1784     {
1785         if (mComputePassCommands->usesBuffer(*bufferAccess.buffer))
1786         {
1787             // write buffers always need a new command buffer
1788             ANGLE_TRY(flush());
1789         }
1790 
1791         mComputePassCommands->bufferWrite(bufferAccess.accessType, bufferAccess.stage,
1792                                           bufferAccess.buffer);
1793         if (bufferAccess.buffer->isHostVisible())
1794         {
1795             // currently all are host visible so nothing to do
1796         }
1797     }
1798 
1799     for (const vk::CommandBufferBufferExternalAcquireRelease &bufferAcquireRelease :
1800          access.getExternalAcquireReleaseBuffers())
1801     {
1802         mComputePassCommands->retainResourceForWrite(bufferAcquireRelease.buffer);
1803     }
1804 
1805     for (const vk::CommandBufferResourceAccess &resourceAccess : access.getAccessResources())
1806     {
1807         mComputePassCommands->retainResource(resourceAccess.resource);
1808     }
1809 
1810     return angle::Result::Continue;
1811 }
1812 
processPrintfBuffer()1813 angle::Result CLCommandQueueVk::processPrintfBuffer()
1814 {
1815     ASSERT(mPrintfBuffer);
1816     ASSERT(mNeedPrintfHandling);
1817     ASSERT(mPrintfInfos);
1818 
1819     cl::Memory *clMem = cl::Buffer::Cast(getOrCreatePrintfBuffer());
1820     CLBufferVk &vkMem = clMem->getImpl<CLBufferVk>();
1821 
1822     unsigned char *data = nullptr;
1823     ANGLE_TRY(vkMem.map(data, 0));
1824     ANGLE_TRY(ClspvProcessPrintfBuffer(data, vkMem.getSize(), mPrintfInfos));
1825     vkMem.unmap();
1826 
1827     return angle::Result::Continue;
1828 }
1829 
1830 // A single CL buffer is setup for every command queue of size kPrintfBufferSize. This can be
1831 // expanded later, if more storage is needed.
getOrCreatePrintfBuffer()1832 cl_mem CLCommandQueueVk::getOrCreatePrintfBuffer()
1833 {
1834     if (!mPrintfBuffer)
1835     {
1836         mPrintfBuffer = cl::Buffer::Cast(mContext->getFrontendObject().createBuffer(
1837             nullptr, cl::MemFlags(CL_MEM_READ_WRITE), kPrintfBufferSize, nullptr));
1838     }
1839     return mPrintfBuffer;
1840 }
1841 
hasUserEventDependency() const1842 bool CLCommandQueueVk::hasUserEventDependency() const
1843 {
1844     return std::any_of(mDependantEvents.begin(), mDependantEvents.end(),
1845                        [](const cl::EventPtr event) { return event->isUserEvent(); });
1846 }
1847 
1848 }  // namespace rx
1849