//
// Copyright 2021 The ANGLE Project Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
//
// CLCommandQueueVk.cpp: Implements the class methods for CLCommandQueueVk.

#include "common/PackedCLEnums_autogen.h"
#include "common/PackedEnums.h"

#include "libANGLE/cl_types.h"
#include "libANGLE/renderer/vulkan/CLCommandQueueVk.h"
#include "libANGLE/renderer/vulkan/CLContextVk.h"
#include "libANGLE/renderer/vulkan/CLDeviceVk.h"
#include "libANGLE/renderer/vulkan/CLKernelVk.h"
#include "libANGLE/renderer/vulkan/CLMemoryVk.h"
#include "libANGLE/renderer/vulkan/CLProgramVk.h"
#include "libANGLE/renderer/vulkan/CLSamplerVk.h"
#include "libANGLE/renderer/vulkan/cl_types.h"
#include "libANGLE/renderer/vulkan/clspv_utils.h"
#include "libANGLE/renderer/vulkan/vk_cache_utils.h"
#include "libANGLE/renderer/vulkan/vk_cl_utils.h"
#include "libANGLE/renderer/vulkan/vk_renderer.h"
#include "libANGLE/renderer/vulkan/vk_wrapper.h"

#include "libANGLE/CLBuffer.h"
#include "libANGLE/CLCommandQueue.h"
#include "libANGLE/CLContext.h"
#include "libANGLE/CLEvent.h"
#include "libANGLE/CLImage.h"
#include "libANGLE/CLKernel.h"
#include "libANGLE/CLSampler.h"
#include "libANGLE/cl_utils.h"

#include "spirv/unified1/NonSemanticClspvReflection.h"
#include "vulkan/vulkan_core.h"

namespace rx
{

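// Helper closure that performs a blocking finish() on a worker thread. flush() below posts
// one of these so that clFlush can return immediately while the submission completes
// asynchronously, e.g.:
//
//   getPlatform()->postMultiThreadWorkerTask(std::make_shared<CLAsyncFinishTask>(this));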
class CLAsyncFinishTask : public angle::Closure
{
  public:
    CLAsyncFinishTask(CLCommandQueueVk *queueVk) : mQueueVk(queueVk) {}

    void operator()() override
    {
        ANGLE_TRACE_EVENT0("gpu.angle", "CLCommandQueueVk::finish (async)");
        if (IsError(mQueueVk->finish()))
        {
            ERR() << "Async finish (clFlush) failed for queue (" << mQueueVk << ")!";
        }
    }

  private:
    CLCommandQueueVk *mQueueVk;
};

CLCommandQueueVk::CLCommandQueueVk(const cl::CommandQueue &commandQueue)
    : CLCommandQueueImpl(commandQueue),
      mContext(&commandQueue.getContext().getImpl<CLContextVk>()),
      mDevice(&commandQueue.getDevice().getImpl<CLDeviceVk>()),
      mPrintfBuffer(nullptr),
      mComputePassCommands(nullptr),
      mCurrentQueueSerialIndex(kInvalidQueueSerialIndex),
      mHasAnyCommandsPendingSubmission(false),
      mNeedPrintfHandling(false),
      mPrintfInfos(nullptr)
{}

angle::Result CLCommandQueueVk::init()
{
    ANGLE_CL_IMPL_TRY_ERROR(
        vk::OutsideRenderPassCommandBuffer::InitializeCommandPool(
            mContext, &mCommandPool.outsideRenderPassPool,
            mContext->getRenderer()->getQueueFamilyIndex(), getProtectionType()),
        CL_OUT_OF_RESOURCES);

    ANGLE_CL_IMPL_TRY_ERROR(mContext->getRenderer()->getOutsideRenderPassCommandBufferHelper(
                                mContext, &mCommandPool.outsideRenderPassPool,
                                &mOutsideRenderPassCommandsAllocator, &mComputePassCommands),
                            CL_OUT_OF_RESOURCES);

    // Generate initial QueueSerial for command buffer helper
    ANGLE_CL_IMPL_TRY_ERROR(
        mContext->getRenderer()->allocateQueueSerialIndex(&mCurrentQueueSerialIndex),
        CL_OUT_OF_RESOURCES);
    mComputePassCommands->setQueueSerial(
        mCurrentQueueSerialIndex,
        mContext->getRenderer()->generateQueueSerial(mCurrentQueueSerialIndex));

    // Initialize serials to be valid but appear submitted and finished.
    mLastFlushedQueueSerial = QueueSerial(mCurrentQueueSerialIndex, Serial());
    mLastSubmittedQueueSerial = mLastFlushedQueueSerial;

    return angle::Result::Continue;
}

CLCommandQueueVk::~CLCommandQueueVk()
{
    ASSERT(mComputePassCommands->empty());
    ASSERT(!mNeedPrintfHandling);

    if (mPrintfBuffer)
    {
        mPrintfBuffer->release();
    }

    VkDevice vkDevice = mContext->getDevice();

    if (mCurrentQueueSerialIndex != kInvalidQueueSerialIndex)
    {
        mContext->getRenderer()->releaseQueueSerialIndex(mCurrentQueueSerialIndex);
        mCurrentQueueSerialIndex = kInvalidQueueSerialIndex;
    }

    // Recycle the current command buffers
    mContext->getRenderer()->recycleOutsideRenderPassCommandBufferHelper(&mComputePassCommands);
    mCommandPool.outsideRenderPassPool.destroy(vkDevice);
}

angle::Result CLCommandQueueVk::setProperty(cl::CommandQueueProperties properties, cl_bool enable)
{
    // NOTE: "clSetCommandQueueProperty" has been deprecated as of OpenCL 1.1
    // http://man.opencl.org/deprecated.html
    return angle::Result::Continue;
}

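// A blocking read finishes all pending work and copies out synchronously. A non-blocking
// read records a GPU copy into an internal transfer buffer instead, and the final copy to
// the user pointer is deferred until the queue is finished (see syncHostBuffers() below).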
angle::Result CLCommandQueueVk::enqueueReadBuffer(const cl::Buffer &buffer,
                                                  bool blocking,
                                                  size_t offset,
                                                  size_t size,
                                                  void *ptr,
                                                  const cl::EventPtrs &waitEvents,
                                                  CLEventImpl::CreateFunc *eventCreateFunc)
{
    std::scoped_lock<std::mutex> sl(mCommandQueueMutex);

    ANGLE_TRY(processWaitlist(waitEvents));
    CLBufferVk *bufferVk = &buffer.getImpl<CLBufferVk>();

    if (blocking)
    {
        ANGLE_TRY(finishInternal());
        ANGLE_TRY(bufferVk->copyTo(ptr, offset, size));

        ANGLE_TRY(createEvent(eventCreateFunc, cl::ExecutionStatus::Complete));
    }
    else
    {
        // Stage a transfer routine
        HostTransferConfig transferConfig;
        transferConfig.type = CL_COMMAND_READ_BUFFER;
        transferConfig.offset = offset;
        transferConfig.size = size;
        transferConfig.dstHostPtr = ptr;
        ANGLE_TRY(addToHostTransferList(bufferVk, transferConfig));

        ANGLE_TRY(createEvent(eventCreateFunc, cl::ExecutionStatus::Queued));
    }

    return angle::Result::Continue;
}

angle::Result CLCommandQueueVk::enqueueWriteBuffer(const cl::Buffer &buffer,
                                                   bool blocking,
                                                   size_t offset,
                                                   size_t size,
                                                   const void *ptr,
                                                   const cl::EventPtrs &waitEvents,
                                                   CLEventImpl::CreateFunc *eventCreateFunc)
{
    std::scoped_lock<std::mutex> sl(mCommandQueueMutex);

    ANGLE_TRY(processWaitlist(waitEvents));

    auto bufferVk = &buffer.getImpl<CLBufferVk>();

    if (blocking)
    {
        ANGLE_TRY(finishInternal());
        ANGLE_TRY(bufferVk->copyFrom(ptr, offset, size));
    }
    else
    {
        // Stage a transfer routine
        HostTransferConfig config;
        config.type = CL_COMMAND_WRITE_BUFFER;
        config.offset = offset;
        config.size = size;
        config.srcHostPtr = ptr;
        ANGLE_TRY(addToHostTransferList(bufferVk, config));
    }

    ANGLE_TRY(createEvent(eventCreateFunc, cl::ExecutionStatus::Complete));

    return angle::Result::Continue;
}

angle::Result CLCommandQueueVk::enqueueReadBufferRect(const cl::Buffer &buffer,
                                                      bool blocking,
                                                      const cl::MemOffsets &bufferOrigin,
                                                      const cl::MemOffsets &hostOrigin,
                                                      const cl::Coordinate &region,
                                                      size_t bufferRowPitch,
                                                      size_t bufferSlicePitch,
                                                      size_t hostRowPitch,
                                                      size_t hostSlicePitch,
                                                      void *ptr,
                                                      const cl::EventPtrs &waitEvents,
                                                      CLEventImpl::CreateFunc *eventCreateFunc)
{
    std::scoped_lock<std::mutex> sl(mCommandQueueMutex);

    ANGLE_TRY(processWaitlist(waitEvents));
    auto bufferVk = &buffer.getImpl<CLBufferVk>();

    ANGLE_TRY(finishInternal());

    cl::BufferRect bufferRect{cl::Offset{bufferOrigin.x, bufferOrigin.y, bufferOrigin.z},
                              cl::Extents{region.x, region.y, region.z}, bufferRowPitch,
                              bufferSlicePitch, 1};

    cl::BufferRect ptrRect{cl::Offset{hostOrigin.x, hostOrigin.y, hostOrigin.z},
                           cl::Extents{region.x, region.y, region.z}, hostRowPitch, hostSlicePitch,
                           1};

    ANGLE_TRY(bufferVk->getRect(bufferRect, ptrRect, ptr));

    ANGLE_TRY(createEvent(eventCreateFunc, cl::ExecutionStatus::Complete));
    return angle::Result::Continue;
}

angle::Result CLCommandQueueVk::enqueueWriteBufferRect(const cl::Buffer &buffer,
                                                       bool blocking,
                                                       const cl::MemOffsets &bufferOrigin,
                                                       const cl::MemOffsets &hostOrigin,
                                                       const cl::Coordinate &region,
                                                       size_t bufferRowPitch,
                                                       size_t bufferSlicePitch,
                                                       size_t hostRowPitch,
                                                       size_t hostSlicePitch,
                                                       const void *ptr,
                                                       const cl::EventPtrs &waitEvents,
                                                       CLEventImpl::CreateFunc *eventCreateFunc)
{
    std::scoped_lock<std::mutex> sl(mCommandQueueMutex);

    ANGLE_TRY(processWaitlist(waitEvents));
    auto bufferVk = &buffer.getImpl<CLBufferVk>();

    ANGLE_TRY(finishInternal());

    cl::BufferRect bufferRect{cl::Offset{bufferOrigin.x, bufferOrigin.y, bufferOrigin.z},
                              cl::Extents{region.x, region.y, region.z}, bufferRowPitch,
                              bufferSlicePitch, 1};

    cl::BufferRect ptrRect{cl::Offset{hostOrigin.x, hostOrigin.y, hostOrigin.z},
                           cl::Extents{region.x, region.y, region.z}, hostRowPitch, hostSlicePitch,
                           1};

    ANGLE_TRY(bufferVk->setRect(ptr, ptrRect, bufferRect));

    ANGLE_TRY(createEvent(eventCreateFunc, cl::ExecutionStatus::Complete));
    return angle::Result::Continue;
}

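// When both operands are sub-buffers of the same parent buffer, the copy is declared as a
// self-copy on the shared backing buffer so the access tracker records the proper hazard;
// otherwise a plain transfer read/write pair is declared.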
angle::Result CLCommandQueueVk::enqueueCopyBuffer(const cl::Buffer &srcBuffer,
                                                  const cl::Buffer &dstBuffer,
                                                  size_t srcOffset,
                                                  size_t dstOffset,
                                                  size_t size,
                                                  const cl::EventPtrs &waitEvents,
                                                  CLEventImpl::CreateFunc *eventCreateFunc)
{
    std::scoped_lock<std::mutex> sl(mCommandQueueMutex);

    ANGLE_TRY(processWaitlist(waitEvents));

    CLBufferVk *srcBufferVk = &srcBuffer.getImpl<CLBufferVk>();
    CLBufferVk *dstBufferVk = &dstBuffer.getImpl<CLBufferVk>();

    vk::CommandBufferAccess access;
    if (srcBufferVk->isSubBuffer() && dstBufferVk->isSubBuffer() &&
        (srcBufferVk->getParent() == dstBufferVk->getParent()))
    {
        // This is a self-copy
        access.onBufferSelfCopy(&srcBufferVk->getBuffer());
    }
    else
    {
        access.onBufferTransferRead(&srcBufferVk->getBuffer());
        access.onBufferTransferWrite(&dstBufferVk->getBuffer());
    }

    vk::OutsideRenderPassCommandBuffer *commandBuffer;
    ANGLE_TRY(getCommandBuffer(access, &commandBuffer));

    VkBufferCopy copyRegion = {srcOffset, dstOffset, size};
    // Update the offsets in the case of sub-buffers
    if (srcBufferVk->getOffset())
    {
        copyRegion.srcOffset += srcBufferVk->getOffset();
    }
    if (dstBufferVk->getOffset())
    {
        copyRegion.dstOffset += dstBufferVk->getOffset();
    }
    commandBuffer->copyBuffer(srcBufferVk->getBuffer().getBuffer(),
                              dstBufferVk->getBuffer().getBuffer(), 1, &copyRegion);

    ANGLE_TRY(createEvent(eventCreateFunc, cl::ExecutionStatus::Queued));

    return angle::Result::Continue;
}

angle::Result CLCommandQueueVk::enqueueCopyBufferRect(const cl::Buffer &srcBuffer,
                                                      const cl::Buffer &dstBuffer,
                                                      const cl::MemOffsets &srcOrigin,
                                                      const cl::MemOffsets &dstOrigin,
                                                      const cl::Coordinate &region,
                                                      size_t srcRowPitch,
                                                      size_t srcSlicePitch,
                                                      size_t dstRowPitch,
                                                      size_t dstSlicePitch,
                                                      const cl::EventPtrs &waitEvents,
                                                      CLEventImpl::CreateFunc *eventCreateFunc)
{
    std::scoped_lock<std::mutex> sl(mCommandQueueMutex);
    ANGLE_TRY(processWaitlist(waitEvents));
    ANGLE_TRY(finishInternal());

    cl::BufferRect srcRect{cl::Offset{srcOrigin.x, srcOrigin.y, srcOrigin.z},
                           cl::Extents{region.x, region.y, region.z}, srcRowPitch, srcSlicePitch,
                           1};

    cl::BufferRect dstRect{cl::Offset{dstOrigin.x, dstOrigin.y, dstOrigin.z},
                           cl::Extents{region.x, region.y, region.z}, dstRowPitch, dstSlicePitch,
                           1};

    auto srcBufferVk = &srcBuffer.getImpl<CLBufferVk>();
    auto dstBufferVk = &dstBuffer.getImpl<CLBufferVk>();
    uint8_t *mapPointer = nullptr;
    ANGLE_TRY(srcBufferVk->map(mapPointer));
    ASSERT(mapPointer);
    ANGLE_TRY(dstBufferVk->setRect(static_cast<const void *>(mapPointer), srcRect, dstRect));

    ANGLE_TRY(createEvent(eventCreateFunc, cl::ExecutionStatus::Complete));
    return angle::Result::Continue;
}

angle::Result CLCommandQueueVk::enqueueFillBuffer(const cl::Buffer &buffer,
                                                  const void *pattern,
                                                  size_t patternSize,
                                                  size_t offset,
                                                  size_t size,
                                                  const cl::EventPtrs &waitEvents,
                                                  CLEventImpl::CreateFunc *eventCreateFunc)
{
    std::scoped_lock<std::mutex> sl(mCommandQueueMutex);

    ANGLE_TRY(processWaitlist(waitEvents));

    CLBufferVk *bufferVk = &buffer.getImpl<CLBufferVk>();
    if (mComputePassCommands->usesBuffer(bufferVk->getBuffer()))
    {
        ANGLE_TRY(finishInternal());
    }

    ANGLE_TRY(bufferVk->fillWithPattern(pattern, patternSize, offset, size));

    ANGLE_TRY(createEvent(eventCreateFunc, cl::ExecutionStatus::Complete));

    return angle::Result::Continue;
}

angle::Result CLCommandQueueVk::enqueueMapBuffer(const cl::Buffer &buffer,
                                                 bool blocking,
                                                 cl::MapFlags mapFlags,
                                                 size_t offset,
                                                 size_t size,
                                                 const cl::EventPtrs &waitEvents,
                                                 CLEventImpl::CreateFunc *eventCreateFunc,
                                                 void *&mapPtr)
{
    std::scoped_lock<std::mutex> sl(mCommandQueueMutex);

    ANGLE_TRY(processWaitlist(waitEvents));

    cl::ExecutionStatus eventComplete = cl::ExecutionStatus::Queued;
    if (blocking || !eventCreateFunc)
    {
        ANGLE_TRY(finishInternal());
        eventComplete = cl::ExecutionStatus::Complete;
    }

    CLBufferVk *bufferVk = &buffer.getImpl<CLBufferVk>();
    uint8_t *mapPointer = nullptr;
    if (buffer.getFlags().intersects(CL_MEM_USE_HOST_PTR))
    {
        ANGLE_TRY(finishInternal());
        mapPointer = static_cast<uint8_t *>(buffer.getHostPtr()) + offset;
        ANGLE_TRY(bufferVk->copyTo(mapPointer, offset, size));
        eventComplete = cl::ExecutionStatus::Complete;
    }
    else
    {
        ANGLE_TRY(bufferVk->map(mapPointer, offset));
    }
    mapPtr = static_cast<void *>(mapPointer);

    if (bufferVk->isCurrentlyInUse())
    {
        eventComplete = cl::ExecutionStatus::Queued;
    }
    ANGLE_TRY(createEvent(eventCreateFunc, eventComplete));

    return angle::Result::Continue;
}

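// Records a Vulkan buffer<->image copy into the compute pass command buffer. Note that
// bufferRowLength and bufferImageHeight are left at 0, which in Vulkan means the buffer
// data is tightly packed to the extent of the copied region.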
angle::Result CLCommandQueueVk::copyImageToFromBuffer(CLImageVk &imageVk,
                                                      vk::BufferHelper &buffer,
                                                      const cl::MemOffsets &origin,
                                                      const cl::Coordinate &region,
                                                      size_t bufferOffset,
                                                      ImageBufferCopyDirection direction)
{
    vk::CommandBufferAccess access;
    vk::OutsideRenderPassCommandBuffer *commandBuffer;
    VkImageAspectFlags aspectFlags = imageVk.getImage().getAspectFlags();
    if (direction == ImageBufferCopyDirection::ToBuffer)
    {
        access.onImageTransferRead(aspectFlags, &imageVk.getImage());
        access.onBufferTransferWrite(&buffer);
    }
    else
    {
        access.onImageTransferWrite(gl::LevelIndex(0), 1, 0,
                                    static_cast<uint32_t>(imageVk.getArraySize()), aspectFlags,
                                    &imageVk.getImage());
        access.onBufferTransferRead(&buffer);
    }
    ANGLE_TRY(getCommandBuffer(access, &commandBuffer));

    VkBufferImageCopy copyRegion = {};
    copyRegion.bufferOffset = bufferOffset;
    copyRegion.bufferRowLength = 0;
    copyRegion.bufferImageHeight = 0;
    copyRegion.imageExtent = cl_vk::GetExtent(imageVk.getExtentForCopy(region));
    copyRegion.imageOffset = cl_vk::GetOffset(imageVk.getOffsetForCopy(origin));
    copyRegion.imageSubresource = imageVk.getSubresourceLayersForCopy(
        origin, region, imageVk.getType(), ImageCopyWith::Buffer);
    if (imageVk.isWritable())
    {
        // We need an execution barrier if the image can be written to by a kernel
        ANGLE_TRY(insertBarrier());
    }

    if (direction == ImageBufferCopyDirection::ToBuffer)
    {
        commandBuffer->copyImageToBuffer(imageVk.getImage().getImage(),
                                         VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
                                         buffer.getBuffer().getHandle(), 1, &copyRegion);
    }
    else
    {
        commandBuffer->copyBufferToImage(buffer.getBuffer().getHandle(),
                                         imageVk.getImage().getImage(),
                                         VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, 1, &copyRegion);
    }

    return angle::Result::Continue;
}

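// Stages a non-blocking host transfer: an internal transfer buffer is created to shadow the
// source memory object, the GPU-side copy is recorded into the compute pass, and the final
// copy to/from the user pointer is deferred to syncHostBuffers().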
angle::Result CLCommandQueueVk::addToHostTransferList(
    CLBufferVk *srcBuffer,
    CLCommandQueueVk::HostTransferConfig transferConfig)
{
    // TODO(aannestrand): Flush here if we reach some max-transfer-buffer heuristic
    // http://anglebug.com/377545840

    cl::Memory *transferBufferHandle =
        cl::Buffer::Cast(this->mContext->getFrontendObject().createBuffer(
            nullptr, cl::MemFlags{CL_MEM_READ_WRITE}, srcBuffer->getSize(), nullptr));
    if (transferBufferHandle == nullptr)
    {
        ANGLE_CL_RETURN_ERROR(CL_OUT_OF_RESOURCES);
    }
    HostTransferEntry transferEntry{transferConfig, cl::MemoryPtr{transferBufferHandle}};
    mHostTransferList.emplace_back(transferEntry);

    // Release initialization reference, lifetime controlled by RefPointer.
    mHostTransferList.back().transferBufferHandle->release();

    // We need an execution barrier if the buffer can be written to by a kernel
    if (!mComputePassCommands->getCommandBuffer().empty() && srcBuffer->isWritable())
    {
        // TODO(aannestrand): Look into combining these kernel execution barriers
        // http://anglebug.com/377545840
        VkMemoryBarrier memoryBarrier = {VK_STRUCTURE_TYPE_MEMORY_BARRIER, nullptr,
                                         VK_ACCESS_SHADER_WRITE_BIT,
                                         VK_ACCESS_MEMORY_READ_BIT | VK_ACCESS_MEMORY_WRITE_BIT};
        mComputePassCommands->getCommandBuffer().pipelineBarrier(
            VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, 1,
            &memoryBarrier, 0, nullptr, 0, nullptr);
    }

    // Enqueue blit/transfer cmd
    VkPipelineStageFlags srcStageMask = {};
    VkPipelineStageFlags dstStageMask = {};
    VkMemoryBarrier memBarrier = {};
    memBarrier.sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER;
    CLBufferVk &transferBufferHandleVk =
        mHostTransferList.back().transferBufferHandle->getImpl<CLBufferVk>();
    switch (transferConfig.type)
    {
        case CL_COMMAND_WRITE_BUFFER:
        {
            VkBufferCopy copyRegion = {transferConfig.offset, transferConfig.offset,
                                       transferConfig.size};
            ANGLE_TRY(transferBufferHandleVk.copyFrom(transferConfig.srcHostPtr,
                                                      transferConfig.offset, transferConfig.size));
            copyRegion.srcOffset += transferBufferHandleVk.getOffset();
            copyRegion.dstOffset += srcBuffer->getOffset();
            mComputePassCommands->getCommandBuffer().copyBuffer(
                transferBufferHandleVk.getBuffer().getBuffer(), srcBuffer->getBuffer().getBuffer(),
                1, &copyRegion);

            srcStageMask = VK_PIPELINE_STAGE_TRANSFER_BIT;
            dstStageMask = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT;
            memBarrier.srcAccessMask = VK_ACCESS_MEMORY_WRITE_BIT;
            memBarrier.dstAccessMask = VK_ACCESS_MEMORY_READ_BIT;
            break;
        }
        case CL_COMMAND_READ_BUFFER:
        {
            VkBufferCopy copyRegion = {transferConfig.offset, transferConfig.offset,
                                       transferConfig.size};
            copyRegion.srcOffset += srcBuffer->getOffset();
            copyRegion.dstOffset += transferBufferHandleVk.getOffset();
            mComputePassCommands->getCommandBuffer().copyBuffer(
                srcBuffer->getBuffer().getBuffer(), transferBufferHandleVk.getBuffer().getBuffer(),
                1, &copyRegion);

            srcStageMask = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT;
            dstStageMask = VK_PIPELINE_STAGE_HOST_BIT;
            memBarrier.srcAccessMask = VK_ACCESS_MEMORY_WRITE_BIT;
            memBarrier.dstAccessMask = VK_ACCESS_MEMORY_READ_BIT;
            break;
        }
        default:
            UNIMPLEMENTED();
            break;
    }

    // TODO(aannestrand): Look into combining these transfer barriers
    // http://anglebug.com/377545840
    mComputePassCommands->getCommandBuffer().pipelineBarrier(srcStageMask, dstStageMask, 0, 1,
                                                             &memBarrier, 0, nullptr, 0, nullptr);

    return angle::Result::Continue;
}

angle::Result CLCommandQueueVk::addToHostTransferList(
    CLImageVk *srcImage,
    CLCommandQueueVk::HostTransferConfig transferConfig)
{
    // TODO(aannestrand): Flush here if we reach some max-transfer-buffer heuristic
    // http://anglebug.com/377545840

    cl::Memory *transferBufferHandle =
        cl::Buffer::Cast(this->mContext->getFrontendObject().createBuffer(
            nullptr, cl::MemFlags{CL_MEM_READ_WRITE}, srcImage->getSize(), nullptr));
    if (transferBufferHandle == nullptr)
    {
        ANGLE_CL_RETURN_ERROR(CL_OUT_OF_RESOURCES);
    }

    HostTransferEntry transferEntry{transferConfig, cl::MemoryPtr{transferBufferHandle}};
    mHostTransferList.emplace_back(transferEntry);

    // Release initialization reference, lifetime controlled by RefPointer.
    mHostTransferList.back().transferBufferHandle->release();

    // Enqueue blit
    CLBufferVk &transferBufferHandleVk =
        mHostTransferList.back().transferBufferHandle->getImpl<CLBufferVk>();
    ANGLE_TRY(copyImageToFromBuffer(*srcImage, transferBufferHandleVk.getBuffer(),
                                    transferConfig.origin, transferConfig.region, 0,
                                    ImageBufferCopyDirection::ToBuffer));

    return angle::Result::Continue;
}

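// Image reads go through the image's staging buffer: the image is copied into the staging
// buffer on the GPU, then copied out to the user pointer, either tightly packed or honoring
// the caller's row/slice pitch. The non-blocking path stages a host transfer instead.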
angle::Result CLCommandQueueVk::enqueueReadImage(const cl::Image &image,
                                                 bool blocking,
                                                 const cl::MemOffsets &origin,
                                                 const cl::Coordinate &region,
                                                 size_t rowPitch,
                                                 size_t slicePitch,
                                                 void *ptr,
                                                 const cl::EventPtrs &waitEvents,
                                                 CLEventImpl::CreateFunc *eventCreateFunc)
{
    std::scoped_lock<std::mutex> sl(mCommandQueueMutex);
    CLImageVk &imageVk = image.getImpl<CLImageVk>();
    size_t size = (region.x * region.y * region.z * imageVk.getElementSize());

    ANGLE_TRY(processWaitlist(waitEvents));

    if (imageVk.isStagingBufferInitialized() == false)
    {
        ANGLE_TRY(imageVk.createStagingBuffer(imageVk.getSize()));
    }

    if (blocking)
    {
        ANGLE_TRY(copyImageToFromBuffer(imageVk, imageVk.getStagingBuffer(), origin, region, 0,
                                        ImageBufferCopyDirection::ToBuffer));
        ANGLE_TRY(finishInternal());
        if (rowPitch == 0 && slicePitch == 0)
        {
            ANGLE_TRY(imageVk.copyStagingTo(ptr, 0, size));
        }
        else
        {
            ANGLE_TRY(imageVk.copyStagingToFromWithPitch(ptr, region, rowPitch, slicePitch,
                                                         StagingBufferCopyDirection::ToHost));
        }
        ANGLE_TRY(createEvent(eventCreateFunc, cl::ExecutionStatus::Complete));
    }
    else
    {
        // Create a transfer buffer and push it in the update list
        HostTransferConfig transferConfig;
        transferConfig.type = CL_COMMAND_READ_IMAGE;
        transferConfig.size = size;
        transferConfig.dstHostPtr = ptr;
        transferConfig.origin = origin;
        transferConfig.region = region;
        ANGLE_TRY(addToHostTransferList(&imageVk, transferConfig));

        ANGLE_TRY(createEvent(eventCreateFunc, cl::ExecutionStatus::Queued));
    }

    return angle::Result::Continue;
}

angle::Result CLCommandQueueVk::enqueueWriteImage(const cl::Image &image,
                                                  bool blocking,
                                                  const cl::MemOffsets &origin,
                                                  const cl::Coordinate &region,
                                                  size_t inputRowPitch,
                                                  size_t inputSlicePitch,
                                                  const void *ptr,
                                                  const cl::EventPtrs &waitEvents,
                                                  CLEventImpl::CreateFunc *eventCreateFunc)
{
    std::scoped_lock<std::mutex> sl(mCommandQueueMutex);
    ANGLE_TRY(processWaitlist(waitEvents));

    CLImageVk &imageVk = image.getImpl<CLImageVk>();
    size_t size = (region.x * region.y * region.z * imageVk.getElementSize());
    cl::ExecutionStatus eventInitialState = cl::ExecutionStatus::Queued;
    if (imageVk.isStagingBufferInitialized() == false)
    {
        ANGLE_TRY(imageVk.createStagingBuffer(imageVk.getSize()));
    }

    if (inputRowPitch == 0 && inputSlicePitch == 0)
    {
        ANGLE_TRY(imageVk.copyStagingFrom((void *)ptr, 0, size));
    }
    else
    {
        ANGLE_TRY(imageVk.copyStagingToFromWithPitch((void *)ptr, region, inputRowPitch,
                                                     inputSlicePitch,
                                                     StagingBufferCopyDirection::ToStagingBuffer));
    }

    ANGLE_TRY(copyImageToFromBuffer(imageVk, imageVk.getStagingBuffer(), origin, region, 0,
                                    ImageBufferCopyDirection::ToImage));

    if (blocking)
    {
        ANGLE_TRY(finishInternal());
        eventInitialState = cl::ExecutionStatus::Complete;
    }

    ANGLE_TRY(createEvent(eventCreateFunc, eventInitialState));

    return angle::Result::Continue;
}

angle::Result CLCommandQueueVk::enqueueCopyImage(const cl::Image &srcImage,
                                                 const cl::Image &dstImage,
                                                 const cl::MemOffsets &srcOrigin,
                                                 const cl::MemOffsets &dstOrigin,
                                                 const cl::Coordinate &region,
                                                 const cl::EventPtrs &waitEvents,
                                                 CLEventImpl::CreateFunc *eventCreateFunc)
{
    std::scoped_lock<std::mutex> sl(mCommandQueueMutex);
    ANGLE_TRY(processWaitlist(waitEvents));

    auto srcImageVk = &srcImage.getImpl<CLImageVk>();
    auto dstImageVk = &dstImage.getImpl<CLImageVk>();

    vk::CommandBufferAccess access;
    vk::OutsideRenderPassCommandBuffer *commandBuffer;
    VkImageAspectFlags srcAspectFlags = srcImageVk->getImage().getAspectFlags();
    VkImageAspectFlags dstAspectFlags = dstImageVk->getImage().getAspectFlags();
    access.onImageTransferWrite(gl::LevelIndex(0), 1, 0, 1, dstAspectFlags,
                                &dstImageVk->getImage());
    access.onImageTransferRead(srcAspectFlags, &srcImageVk->getImage());
    ANGLE_TRY(getCommandBuffer(access, &commandBuffer));

    VkImageCopy copyRegion = {};
    copyRegion.extent = cl_vk::GetExtent(srcImageVk->getExtentForCopy(region));
    copyRegion.srcOffset = cl_vk::GetOffset(srcImageVk->getOffsetForCopy(srcOrigin));
    copyRegion.dstOffset = cl_vk::GetOffset(dstImageVk->getOffsetForCopy(dstOrigin));
    copyRegion.srcSubresource = srcImageVk->getSubresourceLayersForCopy(
        srcOrigin, region, dstImageVk->getType(), ImageCopyWith::Image);
    copyRegion.dstSubresource = dstImageVk->getSubresourceLayersForCopy(
        dstOrigin, region, srcImageVk->getType(), ImageCopyWith::Image);
    if (srcImageVk->isWritable() || dstImageVk->isWritable())
    {
        // We need an execution barrier if either image can be written to by a kernel
        ANGLE_TRY(insertBarrier());
    }

    commandBuffer->copyImage(
        srcImageVk->getImage().getImage(), VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
        dstImageVk->getImage().getImage(), VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, 1, &copyRegion);

    ANGLE_TRY(createEvent(eventCreateFunc, cl::ExecutionStatus::Queued));

    return angle::Result::Continue;
}

angle::Result CLCommandQueueVk::enqueueFillImage(const cl::Image &image,
                                                 const void *fillColor,
                                                 const cl::MemOffsets &origin,
                                                 const cl::Coordinate &region,
                                                 const cl::EventPtrs &waitEvents,
                                                 CLEventImpl::CreateFunc *eventCreateFunc)
{
    CLImageVk &imageVk = image.getImpl<CLImageVk>();
    PixelColor packedColor;
    cl::Extents extent = imageVk.getImageExtent();

    imageVk.packPixels(fillColor, &packedColor);

    ANGLE_TRY(enqueueWaitForEvents(waitEvents));

    if (imageVk.isStagingBufferInitialized() == false)
    {
        ANGLE_TRY(imageVk.createStagingBuffer(imageVk.getSize()));
    }

    ANGLE_TRY(copyImageToFromBuffer(imageVk, imageVk.getStagingBuffer(), cl::kMemOffsetsZero,
                                    {extent.width, extent.height, extent.depth}, 0,
                                    ImageBufferCopyDirection::ToBuffer));
    ANGLE_TRY(finishInternal());

    uint8_t *mapPointer = nullptr;
    ANGLE_TRY(imageVk.map(mapPointer, 0));
    imageVk.fillImageWithColor(origin, region, mapPointer, &packedColor);
    imageVk.unmap();
    mapPointer = nullptr;
    ANGLE_TRY(copyImageToFromBuffer(imageVk, imageVk.getStagingBuffer(), cl::kMemOffsetsZero,
                                    {extent.width, extent.height, extent.depth}, 0,
                                    ImageBufferCopyDirection::ToImage));

    ANGLE_TRY(createEvent(eventCreateFunc, cl::ExecutionStatus::Queued));

    return angle::Result::Continue;
}

angle::Result CLCommandQueueVk::enqueueCopyImageToBuffer(const cl::Image &srcImage,
                                                         const cl::Buffer &dstBuffer,
                                                         const cl::MemOffsets &srcOrigin,
                                                         const cl::Coordinate &region,
                                                         size_t dstOffset,
                                                         const cl::EventPtrs &waitEvents,
                                                         CLEventImpl::CreateFunc *eventCreateFunc)
{
    std::scoped_lock<std::mutex> sl(mCommandQueueMutex);
    CLImageVk &srcImageVk = srcImage.getImpl<CLImageVk>();
    CLBufferVk &dstBufferVk = dstBuffer.getImpl<CLBufferVk>();

    ANGLE_TRY(processWaitlist(waitEvents));

    ANGLE_TRY(copyImageToFromBuffer(srcImageVk, dstBufferVk.getBuffer(), srcOrigin, region,
                                    dstOffset, ImageBufferCopyDirection::ToBuffer));

    ANGLE_TRY(createEvent(eventCreateFunc, cl::ExecutionStatus::Queued));

    return angle::Result::Continue;
}

angle::Result CLCommandQueueVk::enqueueCopyBufferToImage(const cl::Buffer &srcBuffer,
                                                         const cl::Image &dstImage,
                                                         size_t srcOffset,
                                                         const cl::MemOffsets &dstOrigin,
                                                         const cl::Coordinate &region,
                                                         const cl::EventPtrs &waitEvents,
                                                         CLEventImpl::CreateFunc *eventCreateFunc)
{
    std::scoped_lock<std::mutex> sl(mCommandQueueMutex);
    CLBufferVk &srcBufferVk = srcBuffer.getImpl<CLBufferVk>();
    CLImageVk &dstImageVk = dstImage.getImpl<CLImageVk>();

    ANGLE_TRY(processWaitlist(waitEvents));

    ANGLE_TRY(copyImageToFromBuffer(dstImageVk, srcBufferVk.getBuffer(), dstOrigin, region,
                                    srcOffset, ImageBufferCopyDirection::ToImage));

    ANGLE_TRY(createEvent(eventCreateFunc, cl::ExecutionStatus::Queued));

    return angle::Result::Continue;
}

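// Mapping an image stages the whole image into its staging buffer and returns a pointer
// into that staging copy (or into the host pointer for CL_MEM_USE_HOST_PTR). Pitches are
// derived from the image extent, e.g. rowPitch = extent.width * elementSize.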
angle::Result CLCommandQueueVk::enqueueMapImage(const cl::Image &image,
                                                bool blocking,
                                                cl::MapFlags mapFlags,
                                                const cl::MemOffsets &origin,
                                                const cl::Coordinate &region,
                                                size_t *imageRowPitch,
                                                size_t *imageSlicePitch,
                                                const cl::EventPtrs &waitEvents,
                                                CLEventImpl::CreateFunc *eventCreateFunc,
                                                void *&mapPtr)
{
    ANGLE_TRY(enqueueWaitForEvents(waitEvents));

    // TODO: Look into better enqueue handling of this map-op if non-blocking
    // https://anglebug.com/376722715
    CLImageVk *imageVk = &image.getImpl<CLImageVk>();
    cl::Extents extent = imageVk->getImageExtent();
    if (blocking)
    {
        ANGLE_TRY(finishInternal());
    }

    mComputePassCommands->imageRead(mContext, imageVk->getImage().getAspectFlags(),
                                    vk::ImageLayout::TransferSrc, &imageVk->getImage());

    if (imageVk->isStagingBufferInitialized() == false)
    {
        ANGLE_TRY(imageVk->createStagingBuffer(imageVk->getSize()));
    }

    ANGLE_TRY(copyImageToFromBuffer(*imageVk, imageVk->getStagingBuffer(), cl::kMemOffsetsZero,
                                    {extent.width, extent.height, extent.depth}, 0,
                                    ImageBufferCopyDirection::ToBuffer));
    ANGLE_TRY(finishInternal());

    uint8_t *mapPointer = nullptr;
    size_t elementSize = imageVk->getElementSize();
    size_t rowPitch = (extent.width * elementSize);
    size_t offset =
        (origin.x * elementSize) + (origin.y * rowPitch) + (origin.z * extent.height * rowPitch);
    size_t size = (region.x * region.y * region.z * elementSize);

    if (image.getFlags().intersects(CL_MEM_USE_HOST_PTR))
    {
        mapPointer = static_cast<uint8_t *>(image.getHostPtr()) + offset;
        ANGLE_TRY(imageVk->copyTo(mapPointer, offset, size));
    }
    else
    {
        ANGLE_TRY(imageVk->map(mapPointer, offset));
    }
    mapPtr = static_cast<void *>(mapPointer);

    *imageRowPitch = rowPitch;

    switch (imageVk->getDescriptor().type)
    {
        case cl::MemObjectType::Image1D:
        case cl::MemObjectType::Image1D_Buffer:
        case cl::MemObjectType::Image2D:
            if (imageSlicePitch != nullptr)
            {
                *imageSlicePitch = 0;
            }
            break;
        case cl::MemObjectType::Image2D_Array:
        case cl::MemObjectType::Image3D:
            *imageSlicePitch = (extent.height * (*imageRowPitch));
            break;
        case cl::MemObjectType::Image1D_Array:
            *imageSlicePitch = *imageRowPitch;
            break;
        default:
            UNREACHABLE();
            break;
    }

    ANGLE_TRY(createEvent(eventCreateFunc, cl::ExecutionStatus::Complete));

    return angle::Result::Continue;
}

angle::Result CLCommandQueueVk::enqueueUnmapMemObject(const cl::Memory &memory,
                                                      void *mappedPtr,
                                                      const cl::EventPtrs &waitEvents,
                                                      CLEventImpl::CreateFunc *eventCreateFunc)
{
    std::scoped_lock<std::mutex> sl(mCommandQueueMutex);

    ANGLE_TRY(processWaitlist(waitEvents));

    cl::ExecutionStatus eventComplete = cl::ExecutionStatus::Queued;
    if (!eventCreateFunc)
    {
        ANGLE_TRY(finishInternal());
        eventComplete = cl::ExecutionStatus::Complete;
    }

    if (memory.getType() == cl::MemObjectType::Buffer)
    {
        CLBufferVk &bufferVk = memory.getImpl<CLBufferVk>();
        if (memory.getFlags().intersects(CL_MEM_USE_HOST_PTR))
        {
            ANGLE_TRY(finishInternal());
            ANGLE_TRY(bufferVk.copyFrom(memory.getHostPtr(), 0, bufferVk.getSize()));
            eventComplete = cl::ExecutionStatus::Complete;
        }
    }
    else if (memory.getType() != cl::MemObjectType::Pipe)
    {
        // Otherwise the object is an image type
        CLImageVk &imageVk = memory.getImpl<CLImageVk>();
        if (memory.getFlags().intersects(CL_MEM_USE_HOST_PTR))
        {
            uint8_t *mapPointer = static_cast<uint8_t *>(memory.getHostPtr());
            ANGLE_TRY(imageVk.copyStagingFrom(mapPointer, 0, imageVk.getSize()));
        }
        cl::Extents extent = imageVk.getImageExtent();
        ANGLE_TRY(copyImageToFromBuffer(imageVk, imageVk.getStagingBuffer(), cl::kMemOffsetsZero,
                                        {extent.width, extent.height, extent.depth}, 0,
                                        ImageBufferCopyDirection::ToImage));
        ANGLE_TRY(finishInternal());
        eventComplete = cl::ExecutionStatus::Complete;
    }
    else
    {
        // The Pipe mem object type is not supported; creation of such an object should have
        // failed
        UNREACHABLE();
    }

    memory.getImpl<CLMemoryVk>().unmap();
    ANGLE_TRY(createEvent(eventCreateFunc, eventComplete));

    return angle::Result::Continue;
}

angle::Result CLCommandQueueVk::enqueueMigrateMemObjects(const cl::MemoryPtrs &memObjects,
                                                         cl::MemMigrationFlags flags,
                                                         const cl::EventPtrs &waitEvents,
                                                         CLEventImpl::CreateFunc *eventCreateFunc)
{
    std::scoped_lock<std::mutex> sl(mCommandQueueMutex);

    ANGLE_TRY(processWaitlist(waitEvents));

    if (mCommandQueue.getContext().getDevices().size() > 1)
    {
        // TODO(aannestrand): Later implement support to allow migration of mem objects across
        // different devices. http://anglebug.com/377942759
        UNIMPLEMENTED();
        ANGLE_CL_RETURN_ERROR(CL_OUT_OF_RESOURCES);
    }

    ANGLE_TRY(createEvent(eventCreateFunc, cl::ExecutionStatus::Complete));

    return angle::Result::Continue;
}

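// Dispatch flow: process the kernel's resources (descriptor sets, push constants, execution
// dependencies), fetch or create the compute pipeline, then bind it and dispatch the
// workgroup grid into the current compute pass.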
angle::Result CLCommandQueueVk::enqueueNDRangeKernel(const cl::Kernel &kernel,
                                                     const cl::NDRange &ndrange,
                                                     const cl::EventPtrs &waitEvents,
                                                     CLEventImpl::CreateFunc *eventCreateFunc)
{
    std::scoped_lock<std::mutex> sl(mCommandQueueMutex);

    ANGLE_TRY(processWaitlist(waitEvents));

    cl::WorkgroupCount workgroupCount;
    vk::PipelineCacheAccess pipelineCache;
    vk::PipelineHelper *pipelineHelper = nullptr;
    CLKernelVk &kernelImpl = kernel.getImpl<CLKernelVk>();

    // Here, we create-update-bind the kernel's descriptor set, put push constants in the cmd
    // buffer, capture kernel resources, and handle kernel execution dependencies
    ANGLE_TRY(processKernelResources(kernelImpl, ndrange, workgroupCount));

    // Fetch or create the compute pipeline (if we miss in the cache)
    ANGLE_CL_IMPL_TRY_ERROR(mContext->getRenderer()->getPipelineCache(mContext, &pipelineCache),
                            CL_OUT_OF_RESOURCES);
    ANGLE_TRY(kernelImpl.getOrCreateComputePipeline(
        &pipelineCache, ndrange, mCommandQueue.getDevice(), &pipelineHelper, &workgroupCount));

    mComputePassCommands->retainResource(pipelineHelper);

    mComputePassCommands->getCommandBuffer().bindComputePipeline(pipelineHelper->getPipeline());
    mComputePassCommands->getCommandBuffer().dispatch(workgroupCount[0], workgroupCount[1],
                                                      workgroupCount[2]);

    ANGLE_TRY(createEvent(eventCreateFunc, cl::ExecutionStatus::Queued));

    return angle::Result::Continue;
}

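// Per the OpenCL spec, clEnqueueTask is equivalent to clEnqueueNDRangeKernel with
// work_dim = 1, global_work_offset = NULL, global_work_size[0] = 1 and
// local_work_size[0] = 1, so this simply forwards with such an NDRange.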
angle::Result CLCommandQueueVk::enqueueTask(const cl::Kernel &kernel,
                                            const cl::EventPtrs &waitEvents,
                                            CLEventImpl::CreateFunc *eventCreateFunc)
{
    constexpr size_t globalWorkSize[3] = {1, 0, 0};
    constexpr size_t localWorkSize[3] = {1, 0, 0};
    cl::NDRange ndrange(1, nullptr, globalWorkSize, localWorkSize);
    return enqueueNDRangeKernel(kernel, ndrange, waitEvents, eventCreateFunc);
}

angle::Result CLCommandQueueVk::enqueueNativeKernel(cl::UserFunc userFunc,
                                                    void *args,
                                                    size_t cbArgs,
                                                    const cl::BufferPtrs &buffers,
                                                    const std::vector<size_t> bufferPtrOffsets,
                                                    const cl::EventPtrs &waitEvents,
                                                    CLEventImpl::CreateFunc *eventCreateFunc)
{
    UNIMPLEMENTED();
    ANGLE_CL_RETURN_ERROR(CL_OUT_OF_RESOURCES);
}

angle::Result CLCommandQueueVk::enqueueMarkerWithWaitList(const cl::EventPtrs &waitEvents,
                                                          CLEventImpl::CreateFunc *eventCreateFunc)
{
    std::scoped_lock<std::mutex> sl(mCommandQueueMutex);

    ANGLE_TRY(processWaitlist(waitEvents));
    ANGLE_TRY(createEvent(eventCreateFunc, cl::ExecutionStatus::Queued));

    return angle::Result::Continue;
}

angle::Result CLCommandQueueVk::enqueueMarker(CLEventImpl::CreateFunc &eventCreateFunc)
{
    std::scoped_lock<std::mutex> sl(mCommandQueueMutex);

    // This deprecated API is essentially a superset of clEnqueueBarrier: it additionally
    // returns an event object (i.e. a marker), which clEnqueueBarrier does not provide
    ANGLE_TRY(insertBarrier());

    ANGLE_TRY(createEvent(&eventCreateFunc, cl::ExecutionStatus::Queued));

    return angle::Result::Continue;
}

angle::Result CLCommandQueueVk::enqueueWaitForEvents(const cl::EventPtrs &events)
{
    std::scoped_lock<std::mutex> sl(mCommandQueueMutex);

    // Unlike clWaitForEvents, this routine is non-blocking
    ANGLE_TRY(processWaitlist(events));

    return angle::Result::Continue;
}

angle::Result CLCommandQueueVk::enqueueBarrierWithWaitList(const cl::EventPtrs &waitEvents,
                                                           CLEventImpl::CreateFunc *eventCreateFunc)
{
    std::scoped_lock<std::mutex> sl(mCommandQueueMutex);

    // The barrier command either waits for a list of events to complete, or, if the list is
    // empty, waits for all commands previously enqueued in the command queue to complete
    // before it completes
    if (waitEvents.empty())
    {
        ANGLE_TRY(insertBarrier());
    }
    else
    {
        ANGLE_TRY(processWaitlist(waitEvents));
    }

    ANGLE_TRY(createEvent(eventCreateFunc, cl::ExecutionStatus::Queued));

    return angle::Result::Continue;
}

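// Records a full compute-to-compute memory barrier into the compute pass, making prior
// shader writes visible to any subsequently recorded reads and writes.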
angle::Result CLCommandQueueVk::insertBarrier()
{
    VkMemoryBarrier memoryBarrier = {VK_STRUCTURE_TYPE_MEMORY_BARRIER, nullptr,
                                     VK_ACCESS_SHADER_WRITE_BIT,
                                     VK_ACCESS_MEMORY_READ_BIT | VK_ACCESS_MEMORY_WRITE_BIT};
    mComputePassCommands->getCommandBuffer().pipelineBarrier(
        VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, 1,
        &memoryBarrier, 0, nullptr, 0, nullptr);

    return angle::Result::Continue;
}

angle::Result CLCommandQueueVk::enqueueBarrier()
{
    std::scoped_lock<std::mutex> sl(mCommandQueueMutex);

    ANGLE_TRY(insertBarrier());

    return angle::Result::Continue;
}

angle::Result CLCommandQueueVk::flush()
{
    ANGLE_TRACE_EVENT0("gpu.angle", "CLCommandQueueVk::flush");

    // Non-blocking finish
    // TODO: Ideally we should find a better implementation that avoids spawning a
    // submit-thread/task here. https://anglebug.com/42267107
    std::shared_ptr<angle::WaitableEvent> asyncEvent =
        getPlatform()->postMultiThreadWorkerTask(std::make_shared<CLAsyncFinishTask>(this));
    ASSERT(asyncEvent != nullptr);

    return angle::Result::Continue;
}

angle::Result CLCommandQueueVk::finish()
{
    std::scoped_lock<std::mutex> sl(mCommandQueueMutex);

    ANGLE_TRACE_EVENT0("gpu.angle", "CLCommandQueueVk::finish");

    // Blocking finish
    return finishInternal();
}

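// Drains the host transfer list once GPU work is complete: results that were copied into
// the internal transfer buffers are copied out to the user pointers recorded at enqueue
// time (currently only the read-buffer/read-image directions need host-side work).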
angle::Result CLCommandQueueVk::syncHostBuffers()
{
    if (!mHostTransferList.empty())
    {
        for (const HostTransferEntry &hostTransferEntry : mHostTransferList)
        {
            const HostTransferConfig &transferConfig = hostTransferEntry.transferConfig;
            CLBufferVk &transferBufferVk =
                hostTransferEntry.transferBufferHandle->getImpl<CLBufferVk>();
            switch (hostTransferEntry.transferConfig.type)
            {
                case CL_COMMAND_READ_BUFFER:
                case CL_COMMAND_READ_IMAGE:
                    ANGLE_TRY(transferBufferVk.copyTo(transferConfig.dstHostPtr,
                                                      transferConfig.offset, transferConfig.size));
                    break;
                default:
                    UNIMPLEMENTED();
                    break;
            }
        }
    }
    mHostTransferList.clear();

    return angle::Result::Continue;
}

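// Tracks memory objects used by a kernel to detect read-after-write hazards within the
// current compute pass; on a hazard (or when the tracker reaches its size limit), an
// execution barrier is inserted and the tracker is reset.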
angle::Result CLCommandQueueVk::addMemoryDependencies(cl::Memory *clMem)
{
    cl::Memory *parentMem = clMem->getParent() ? clMem->getParent().get() : nullptr;

    // Take a usage count
    mMemoryCaptures.emplace_back(clMem);

    // Handle possible resource RAW hazard
    bool insertBarrier = false;
    if (clMem->getFlags().intersects(CL_MEM_READ_WRITE))
    {
        // Texel buffers have backing buffer objects
        if (mDependencyTracker.contains(clMem) || mDependencyTracker.contains(parentMem) ||
            mDependencyTracker.size() == kMaxDependencyTrackerSize)
        {
            insertBarrier = true;
            mDependencyTracker.clear();
        }
        mDependencyTracker.insert(clMem);
        if (parentMem)
        {
            mDependencyTracker.insert(parentMem);
        }
    }

    // Insert a layout transition for images
    if (cl::IsImageType(clMem->getType()))
    {
        CLImageVk &vkMem = clMem->getImpl<CLImageVk>();
        mComputePassCommands->imageWrite(mContext, gl::LevelIndex(0), 0, 1,
                                         vkMem.getImage().getAspectFlags(),
                                         vk::ImageLayout::ComputeShaderWrite, &vkMem.getImage());
    }
    else if (insertBarrier && cl::IsBufferType(clMem->getType()))
    {
        CLBufferVk &vkMem = clMem->getImpl<CLBufferVk>();

        mComputePassCommands->bufferWrite(VK_ACCESS_SHADER_WRITE_BIT,
                                          vk::PipelineStage::ComputeShader, &vkMem.getBuffer());
    }

    return angle::Result::Continue;
}

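// Prepares everything a dispatch needs besides the pipeline itself: descriptor set layouts
// and sets, the pipeline layout, the clspv-defined push constants (global offset/size,
// region offsets, workgroup info), and one descriptor write per kernel argument.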
processKernelResources(CLKernelVk & kernelVk,const cl::NDRange & ndrange,const cl::WorkgroupCount & workgroupCount)1212 angle::Result CLCommandQueueVk::processKernelResources(CLKernelVk &kernelVk,
1213 const cl::NDRange &ndrange,
1214 const cl::WorkgroupCount &workgroupCount)
1215 {
1216 bool needsBarrier = false;
1217 const CLProgramVk::DeviceProgramData *devProgramData =
1218 kernelVk.getProgram()->getDeviceProgramData(mCommandQueue.getDevice().getNative());
1219 ASSERT(devProgramData != nullptr);
1220
1221 // Set the descriptor set layouts and allocate descriptor sets
1222 // The descriptor set layouts are setup in the order of their appearance, as Vulkan requires
1223 // them to point to valid handles.
1224 angle::EnumIterator<DescriptorSetIndex> layoutIndex(DescriptorSetIndex::LiteralSampler);
1225 for (DescriptorSetIndex index : angle::AllEnums<DescriptorSetIndex>())
1226 {
1227 if (!kernelVk.getDescriptorSetLayoutDesc(index).empty())
1228 {
1229 // Setup the descriptor layout
1230 ANGLE_CL_IMPL_TRY_ERROR(mContext->getDescriptorSetLayoutCache()->getDescriptorSetLayout(
1231 mContext, kernelVk.getDescriptorSetLayoutDesc(index),
1232 &kernelVk.getDescriptorSetLayouts()[*layoutIndex]),
1233 CL_INVALID_OPERATION);
1234
1235 ANGLE_CL_IMPL_TRY_ERROR(
1236 kernelVk.getProgram()->getMetaDescriptorPool(index).bindCachedDescriptorPool(
1237 mContext, kernelVk.getDescriptorSetLayoutDesc(index), 1,
1238 mContext->getDescriptorSetLayoutCache(),
1239 &kernelVk.getProgram()->getDynamicDescriptorPoolPointer(index)),
1240 CL_INVALID_OPERATION);
1241
1242 // Allocate descriptor set
1243 ANGLE_TRY(kernelVk.allocateDescriptorSet(index, layoutIndex, mComputePassCommands));
1244 ++layoutIndex;
1245 }
1246 }
1247
1248 // Setup the pipeline layout
1249 ANGLE_CL_IMPL_TRY_ERROR(kernelVk.initPipelineLayout(), CL_INVALID_OPERATION);
1250
1251 // Push global offset data
1252 const VkPushConstantRange *globalOffsetRange = devProgramData->getGlobalOffsetRange();
1253 if (globalOffsetRange != nullptr)
1254 {
1255 mComputePassCommands->getCommandBuffer().pushConstants(
1256 kernelVk.getPipelineLayout(), VK_SHADER_STAGE_COMPUTE_BIT, globalOffsetRange->offset,
1257 globalOffsetRange->size, ndrange.globalWorkOffset.data());
1258 }
1259
1260 // Push global size data
1261 const VkPushConstantRange *globalSizeRange = devProgramData->getGlobalSizeRange();
1262 if (globalSizeRange != nullptr)
1263 {
1264 mComputePassCommands->getCommandBuffer().pushConstants(
1265 kernelVk.getPipelineLayout(), VK_SHADER_STAGE_COMPUTE_BIT, globalSizeRange->offset,
1266 globalSizeRange->size, ndrange.globalWorkSize.data());
1267 }
1268
1269 // Push region offset data.
1270 const VkPushConstantRange *regionOffsetRange = devProgramData->getRegionOffsetRange();
1271 if (regionOffsetRange != nullptr)
1272 {
1273 // We dont support non-uniform batches yet in ANGLE, this field also represents global
1274 // offset for NDR in uniform cases. Update this when non-uniform batches are supported.
1275 // https://github.com/google/clspv/blob/main/docs/OpenCLCOnVulkan.md#module-scope-push-constants
1276 mComputePassCommands->getCommandBuffer().pushConstants(
1277 kernelVk.getPipelineLayout(), VK_SHADER_STAGE_COMPUTE_BIT, regionOffsetRange->offset,
1278 regionOffsetRange->size, ndrange.globalWorkOffset.data());
1279 }
1280
1281 // Push region group offset data.
1282 const VkPushConstantRange *regionGroupOffsetRange = devProgramData->getRegionGroupOffsetRange();
1283 if (regionGroupOffsetRange != nullptr)
1284 {
1285 // We dont support non-uniform batches yet in ANGLE, and based on clspv doc/notes:
1286 // "only required when non-uniform NDRanges are supported"
1287 // For now, we set this field to zeros until we later support non-uniform.
1288 // https://github.com/google/clspv/blob/main/docs/OpenCLCOnVulkan.md#module-scope-push-constants
1289 uint32_t regionGroupOffsets[3] = {0, 0, 0};
1290 mComputePassCommands->getCommandBuffer().pushConstants(
1291 kernelVk.getPipelineLayout(), VK_SHADER_STAGE_COMPUTE_BIT,
1292 regionGroupOffsetRange->offset, regionGroupOffsetRange->size, ®ionGroupOffsets);
1293 }
1294
1295 // Push enqueued local size
1296 const VkPushConstantRange *enqueuedLocalSizeRange = devProgramData->getEnqueuedLocalSizeRange();
1297 if (enqueuedLocalSizeRange != nullptr)
1298 {
1299 mComputePassCommands->getCommandBuffer().pushConstants(
1300 kernelVk.getPipelineLayout(), VK_SHADER_STAGE_COMPUTE_BIT,
1301 enqueuedLocalSizeRange->offset, enqueuedLocalSizeRange->size,
1302 ndrange.localWorkSize.data());
1303 }
1304
1305 // Push number of workgroups
1306 const VkPushConstantRange *numWorkgroupsRange = devProgramData->getNumWorkgroupsRange();
1307 if (devProgramData->reflectionData.pushConstants.contains(
1308 NonSemanticClspvReflectionPushConstantNumWorkgroups))
1309 {
1310 uint32_t numWorkgroups[3] = {workgroupCount[0], workgroupCount[1], workgroupCount[2]};
1311 mComputePassCommands->getCommandBuffer().pushConstants(
1312 kernelVk.getPipelineLayout(), VK_SHADER_STAGE_COMPUTE_BIT, numWorkgroupsRange->offset,
1313 numWorkgroupsRange->size, &numWorkgroups);
1314 }
1315
1316 // Retain kernel object until we finish executing it later
1317 mKernelCaptures.push_back(cl::KernelPtr{&kernelVk.getFrontendObject()});
1318
1319 // Process each kernel argument/resource
1320 vk::DescriptorSetArray<UpdateDescriptorSetsBuilder> updateDescriptorSetsBuilders;
1321 CLKernelArguments args = kernelVk.getArgs();
1322 for (size_t index = 0; index < args.size(); index++)
1323 {
1324 const auto &arg = args.at(index);
1325 UpdateDescriptorSetsBuilder &kernelArgDescSetBuilder =
1326 updateDescriptorSetsBuilders[DescriptorSetIndex::KernelArguments];
1327 switch (arg.type)
1328 {
1329 case NonSemanticClspvReflectionArgumentUniform:
1330 case NonSemanticClspvReflectionArgumentStorageBuffer:
1331 {
1332 cl::Memory *clMem = cl::Buffer::Cast(*static_cast<const cl_mem *>(arg.handle));
1333 CLBufferVk &vkMem = clMem->getImpl<CLBufferVk>();
1334
1335 ANGLE_TRY(addMemoryDependencies(clMem));
1336
1337 // Update buffer/descriptor info
1338 VkDescriptorBufferInfo &bufferInfo =
1339 kernelArgDescSetBuilder.allocDescriptorBufferInfo();
1340 bufferInfo.range = clMem->getSize();
1341 bufferInfo.offset = clMem->getOffset();
1342 bufferInfo.buffer = vkMem.getBuffer().getBuffer().getHandle();
1343 VkWriteDescriptorSet &writeDescriptorSet =
1344 kernelArgDescSetBuilder.allocWriteDescriptorSet();
1345 writeDescriptorSet.descriptorCount = 1;
1346 writeDescriptorSet.descriptorType =
1347 arg.type == NonSemanticClspvReflectionArgumentUniform
1348 ? VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER
1349 : VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
1350 writeDescriptorSet.pBufferInfo = &bufferInfo;
1351 writeDescriptorSet.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
1352 writeDescriptorSet.dstSet =
1353 kernelVk.getDescriptorSet(DescriptorSetIndex::KernelArguments);
1354 writeDescriptorSet.dstBinding = arg.descriptorBinding;
1355 break;
1356 }
1357 case NonSemanticClspvReflectionArgumentPodPushConstant:
1358 {
1359 // Spec requires the size and offset to be multiple of 4, round up for size and
1360 // round down for offset to ensure this
1361 uint32_t offset = roundDownPow2(arg.pushConstOffset, 4u);
1362 uint32_t size =
1363 roundUpPow2(arg.pushConstOffset + arg.pushConstantSize, 4u) - offset;
1364 ASSERT(offset + size <= kernelVk.getPodArgumentsData().size());
1365 mComputePassCommands->getCommandBuffer().pushConstants(
1366 kernelVk.getPipelineLayout(), VK_SHADER_STAGE_COMPUTE_BIT, offset, size,
1367 &kernelVk.getPodArgumentsData()[offset]);
1368 break;
1369 }
1370 case NonSemanticClspvReflectionArgumentSampler:
1371 {
1372 cl::Sampler *clSampler =
1373 cl::Sampler::Cast(*static_cast<const cl_sampler *>(arg.handle));
1374 CLSamplerVk &vkSampler = clSampler->getImpl<CLSamplerVk>();
1375 VkDescriptorImageInfo &samplerInfo =
1376 kernelArgDescSetBuilder.allocDescriptorImageInfo();
1377 samplerInfo.sampler = vkSampler.getSamplerHelper().get().getHandle();
1378 VkWriteDescriptorSet &writeDescriptorSet =
1379 kernelArgDescSetBuilder.allocWriteDescriptorSet();
1380 writeDescriptorSet.descriptorCount = 1;
1381 writeDescriptorSet.descriptorType = VK_DESCRIPTOR_TYPE_SAMPLER;
1382 writeDescriptorSet.pImageInfo = &samplerInfo;
1383 writeDescriptorSet.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
1384 writeDescriptorSet.dstSet =
1385 kernelVk.getDescriptorSet(DescriptorSetIndex::KernelArguments);
1386 writeDescriptorSet.dstBinding = arg.descriptorBinding;
1387
1388 const VkPushConstantRange *samplerMaskRange =
1389 devProgramData->getNormalizedSamplerMaskRange(index);
1390 if (samplerMaskRange != nullptr)
1391 {
1392 if (clSampler->getNormalizedCoords() == false)
1393 {
1394 ANGLE_TRY(vkSampler.createNormalized());
1395 samplerInfo.sampler =
1396 vkSampler.getSamplerHelperNormalized().get().getHandle();
1397 }
1398 uint32_t mask = vkSampler.getSamplerMask();
1399 mComputePassCommands->getCommandBuffer().pushConstants(
1400 kernelVk.getPipelineLayout(), VK_SHADER_STAGE_COMPUTE_BIT,
1401 samplerMaskRange->offset, samplerMaskRange->size, &mask);
1402 }
1403 break;
1404 }
            case NonSemanticClspvReflectionArgumentStorageImage:
            case NonSemanticClspvReflectionArgumentSampledImage:
            {
                cl::Memory *clMem = cl::Image::Cast(*static_cast<const cl_mem *>(arg.handle));
                CLImageVk &vkMem = clMem->getImpl<CLImageVk>();

                ANGLE_TRY(addMemoryDependencies(clMem));

                cl_image_format imageFormat = vkMem.getFormat();
                const VkPushConstantRange *imageDataChannelOrderRange =
                    devProgramData->getImageDataChannelOrderRange(index);
                if (imageDataChannelOrderRange != nullptr)
                {
                    mComputePassCommands->getCommandBuffer().pushConstants(
                        kernelVk.getPipelineLayout(), VK_SHADER_STAGE_COMPUTE_BIT,
                        imageDataChannelOrderRange->offset, imageDataChannelOrderRange->size,
                        &imageFormat.image_channel_order);
                }

                const VkPushConstantRange *imageDataChannelDataTypeRange =
                    devProgramData->getImageDataChannelDataTypeRange(index);
                if (imageDataChannelDataTypeRange != nullptr)
                {
                    mComputePassCommands->getCommandBuffer().pushConstants(
                        kernelVk.getPipelineLayout(), VK_SHADER_STAGE_COMPUTE_BIT,
                        imageDataChannelDataTypeRange->offset, imageDataChannelDataTypeRange->size,
                        &imageFormat.image_channel_data_type);
                }

                // Update the image/descriptor info
                VkDescriptorImageInfo &imageInfo =
                    kernelArgDescSetBuilder.allocDescriptorImageInfo();
                imageInfo.imageLayout =
                    arg.type == NonSemanticClspvReflectionArgumentStorageImage
                        ? VK_IMAGE_LAYOUT_GENERAL
                        : vkMem.getImage().getCurrentLayout(mContext->getRenderer());
                imageInfo.imageView = vkMem.getImageView().getHandle();
                imageInfo.sampler = VK_NULL_HANDLE;
                VkWriteDescriptorSet &writeDescriptorSet =
                    kernelArgDescSetBuilder.allocWriteDescriptorSet();
                writeDescriptorSet.descriptorCount = 1;
                writeDescriptorSet.descriptorType =
                    arg.type == NonSemanticClspvReflectionArgumentStorageImage
                        ? VK_DESCRIPTOR_TYPE_STORAGE_IMAGE
                        : VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE;
                writeDescriptorSet.pImageInfo = &imageInfo;
                writeDescriptorSet.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
                writeDescriptorSet.dstSet =
                    kernelVk.getDescriptorSet(DescriptorSetIndex::KernelArguments);
                writeDescriptorSet.dstBinding = arg.descriptorBinding;
                break;
            }
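            // Texel-buffer image arguments are bound through a VkBufferView created over
            // the image's backing buffer.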
            case NonSemanticClspvReflectionArgumentUniformTexelBuffer:
            case NonSemanticClspvReflectionArgumentStorageTexelBuffer:
            {
                cl::Memory *clMem = cl::Image::Cast(*static_cast<const cl_mem *>(arg.handle));
                CLImageVk &vkMem = clMem->getImpl<CLImageVk>();

                ANGLE_TRY(addMemoryDependencies(clMem));

                VkBufferView &bufferView = kernelArgDescSetBuilder.allocBufferView();
                const vk::BufferView *vkBufferView = nullptr;
                ANGLE_TRY(vkMem.getBufferView(&vkBufferView));
                bufferView = vkBufferView->getHandle();

                VkWriteDescriptorSet &writeDescriptorSet =
                    kernelArgDescSetBuilder.allocWriteDescriptorSet();
                writeDescriptorSet.descriptorCount = 1;
                writeDescriptorSet.descriptorType =
                    arg.type == NonSemanticClspvReflectionArgumentStorageTexelBuffer
                        ? VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER
                        : VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER;
                writeDescriptorSet.pImageInfo = nullptr;
                writeDescriptorSet.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
                writeDescriptorSet.dstSet =
                    kernelVk.getDescriptorSet(DescriptorSetIndex::KernelArguments);
                writeDescriptorSet.dstBinding = arg.descriptorBinding;
                writeDescriptorSet.pTexelBufferView = &bufferView;

                break;
            }
            case NonSemanticClspvReflectionArgumentPodUniform:
            case NonSemanticClspvReflectionArgumentPointerUniform:
            case NonSemanticClspvReflectionArgumentPodStorageBuffer:
            case NonSemanticClspvReflectionArgumentPointerPushConstant:
            default:
            {
                UNIMPLEMENTED();
                break;
            }
        }
    }

    // Process the printf storage buffer
    if (kernelVk.usesPrintf())
    {
        UpdateDescriptorSetsBuilder &printfDescSetBuilder =
            updateDescriptorSetsBuilders[DescriptorSetIndex::Printf];

        cl::Memory *clMem = cl::Buffer::Cast(getOrCreatePrintfBuffer());
        CLBufferVk &vkMem = clMem->getImpl<CLBufferVk>();
        uint8_t *mapPointer = nullptr;
        ANGLE_TRY(vkMem.map(mapPointer, 0));
        // The spec calls out "The first 4 bytes of the buffer should be zero-initialized."
        memset(mapPointer, 0, 4);

        auto &bufferInfo = printfDescSetBuilder.allocDescriptorBufferInfo();
        bufferInfo.range = clMem->getSize();
        bufferInfo.offset = clMem->getOffset();
        bufferInfo.buffer = vkMem.getBuffer().getBuffer().getHandle();

        auto &writeDescriptorSet = printfDescSetBuilder.allocWriteDescriptorSet();
        writeDescriptorSet.descriptorCount = 1;
        writeDescriptorSet.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
        writeDescriptorSet.pBufferInfo = &bufferInfo;
        writeDescriptorSet.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
        writeDescriptorSet.dstSet = kernelVk.getDescriptorSet(DescriptorSetIndex::Printf);
        writeDescriptorSet.dstBinding = kernelVk.getProgram()
                                            ->getDeviceProgramData(kernelVk.getKernelName().c_str())
                                            ->reflectionData.printfBufferStorage.binding;

        mNeedPrintfHandling = true;
        mPrintfInfos = kernelVk.getProgram()->getPrintfDescriptors(kernelVk.getKernelName());
    }

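    // Flush the accumulated descriptor updates and bind every non-empty descriptor set.
    // Vulkan set indices are assigned compactly: the bind index only advances when a set
    // with a non-empty layout is bound.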
    angle::EnumIterator<DescriptorSetIndex> descriptorSetIndex(DescriptorSetIndex::LiteralSampler);
    for (DescriptorSetIndex index : angle::AllEnums<DescriptorSetIndex>())
    {
        if (!kernelVk.getDescriptorSetLayoutDesc(index).empty())
        {
            mContext->getPerfCounters().writeDescriptorSets =
                updateDescriptorSetsBuilders[index].flushDescriptorSetUpdates(
                    mContext->getRenderer()->getDevice());

            VkDescriptorSet descriptorSet = kernelVk.getDescriptorSet(index);
            mComputePassCommands->getCommandBuffer().bindDescriptorSets(
                kernelVk.getPipelineLayout(), VK_PIPELINE_BIND_POINT_COMPUTE, *descriptorSetIndex,
                1, &descriptorSet, 0, nullptr);

            ++descriptorSetIndex;
        }
    }

    if (needsBarrier)
    {
        ANGLE_TRY(insertBarrier());
    }

    return angle::Result::Continue;
}

angle::Result CLCommandQueueVk::flushComputePassCommands()
{
    if (mComputePassCommands->empty())
    {
        return angle::Result::Continue;
    }

    // Flush any host-visible buffer writes by adding the appropriate barrier
    if (mComputePassCommands->getAndResetHasHostVisibleBufferWrite())
    {
        // Make sure all writes to host-visible buffers are flushed to the host.
        VkMemoryBarrier memoryBarrier = {};
        memoryBarrier.sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER;
        memoryBarrier.srcAccessMask = VK_ACCESS_MEMORY_WRITE_BIT;
        memoryBarrier.dstAccessMask = VK_ACCESS_HOST_READ_BIT | VK_ACCESS_HOST_WRITE_BIT;

        mComputePassCommands->getCommandBuffer().memoryBarrier(
            VK_PIPELINE_STAGE_TRANSFER_BIT | VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
            VK_PIPELINE_STAGE_HOST_BIT, memoryBarrier);
    }

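    // Remember which serial is being handed off so a later submit can wait on exactly this
    // batch of commands.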
    mLastFlushedQueueSerial = mComputePassCommands->getQueueSerial();
    // Here, we flush our compute commands into the renderer's primary command buffer
    ANGLE_TRY(mContext->getRenderer()->flushOutsideRPCommands(
        mContext, getProtectionType(), egl::ContextPriority::Medium, &mComputePassCommands));

    mHasAnyCommandsPendingSubmission = true;

    mContext->getPerfCounters().flushedOutsideRenderPassCommandBuffers++;

    // Generate a new serial for the next batch of commands
    mComputePassCommands->setQueueSerial(
        mCurrentQueueSerialIndex,
        mContext->getRenderer()->generateQueueSerial(mCurrentQueueSerialIndex));

    return angle::Result::Continue;
}

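// Processes the event wait list of a command: wait events that belong to this queue collapse
// into a single execution barrier, while user events and events from other queues are deferred
// and resolved at submission time.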
angle::Result CLCommandQueueVk::processWaitlist(const cl::EventPtrs &waitEvents)
{
    if (!waitEvents.empty())
    {
        bool insertedBarrier = false;
        for (const cl::EventPtr &event : waitEvents)
        {
            if (event->getImpl<CLEventVk>().isUserEvent() ||
                event->getCommandQueue() != &mCommandQueue)
            {
                // We cannot use a barrier in these cases, so defer the event handling
                // until submission time.
                // TODO: Perhaps we could utilize VkEvents here instead and have GPU wait(s)
                // https://anglebug.com/42267109
                mDependantEvents.push_back(event);
            }
            else if (event->getCommandQueue() == &mCommandQueue && !insertedBarrier)
            {
                // As long as there is at least one dependent command in the same queue,
                // a single execution barrier is sufficient.
                ANGLE_TRY(insertBarrier());

                insertedBarrier = true;
            }
        }
    }
    return angle::Result::Continue;
}

angle::Result CLCommandQueueVk::submitCommands()
{
    ANGLE_TRACE_EVENT0("gpu.angle", "CLCommandQueueVk::submitCommands()");

    // Kick off the renderer submit
    ANGLE_TRY(mContext->getRenderer()->submitCommands(mContext, getProtectionType(),
                                                      egl::ContextPriority::Medium, nullptr,
                                                      nullptr, mLastFlushedQueueSerial));

    mLastSubmittedQueueSerial = mLastFlushedQueueSerial;

    // Now that the commands are submitted, some of the pending garbage may no longer be
    // pending and should be moved to the garbage list.
    mContext->getRenderer()->cleanupPendingSubmissionGarbage();

    mHasAnyCommandsPendingSubmission = false;

    return angle::Result::Continue;
}

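// Sets up the CreateFunc through which the frontend instantiates the backend event. On
// profiling queues the queue is finished immediately so that timestamps are captured
// per-command; otherwise the event is tagged with the current queue serial and tracked
// until completion.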
angle::Result CLCommandQueueVk::createEvent(CLEventImpl::CreateFunc *createFunc,
                                            cl::ExecutionStatus initialStatus)
{
    if (createFunc != nullptr)
    {
        *createFunc = [this, initialStatus](const cl::Event &event) {
            auto eventVk = new (std::nothrow) CLEventVk(event);
            if (eventVk == nullptr)
            {
                ERR() << "Failed to create event object!";
                ANGLE_CL_SET_ERROR(CL_OUT_OF_HOST_MEMORY);
                return CLEventImpl::Ptr(nullptr);
            }

            if (initialStatus == cl::ExecutionStatus::Complete)
            {
                // Submission already finished at this point, just set the event to complete
                if (IsError(eventVk->setStatusAndExecuteCallback(cl::ToCLenum(initialStatus))))
                {
                    ANGLE_CL_SET_ERROR(CL_OUT_OF_RESOURCES);
                }
            }
            else if (mCommandQueue.getProperties().intersects(CL_QUEUE_PROFILING_ENABLE))
            {
                // We also block for profiling so that we get timestamps per-command
                if (IsError(mCommandQueue.getImpl<CLCommandQueueVk>().finish()))
                {
                    ANGLE_CL_SET_ERROR(CL_OUT_OF_RESOURCES);
                }
                // Submission finished at this point, just set the event to complete
                if (IsError(eventVk->setStatusAndExecuteCallback(CL_COMPLETE)))
                {
                    ANGLE_CL_SET_ERROR(CL_OUT_OF_RESOURCES);
                }
            }
            else
            {
                eventVk->setQueueSerial(mComputePassCommands->getQueueSerial());
                // Save a reference to this event
                mAssociatedEvents.push_back(cl::EventPtr{&eventVk->getFrontendObject()});
            }

            return CLEventImpl::Ptr(eventVk);
        };
    }
    return angle::Result::Continue;
}

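// Drains the queue: resolves deferred (dependent) events, flushes and submits any recorded
// compute commands, waits for the GPU to reach the submitted serial, then handles printf
// output and completes the associated events.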
angle::Result CLCommandQueueVk::finishInternal()
{
    for (const cl::EventPtr &event : mAssociatedEvents)
    {
        ANGLE_TRY(event->getImpl<CLEventVk>().setStatusAndExecuteCallback(CL_SUBMITTED));
    }

    if (!mComputePassCommands->empty())
    {
        // If we still have dependent events, handle them now
        if (!mDependantEvents.empty())
        {
            for (const auto &depEvent : mDependantEvents)
            {
                if (depEvent->getImpl<CLEventVk>().isUserEvent())
                {
                    // Wait here for the user to set the event status
                    cl_int status = CL_QUEUED;
                    ANGLE_TRY(depEvent->getImpl<CLEventVk>().waitForUserEventStatus());
                    ANGLE_TRY(depEvent->getImpl<CLEventVk>().getCommandExecutionStatus(status));
                    if (status < 0)
                    {
                        ERR() << "Invalid dependent user-event (" << depEvent.get()
                              << ") status encountered!";
                        mComputePassCommands->getCommandBuffer().reset();
                        ANGLE_CL_RETURN_ERROR(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST);
                    }
                }
                else
                {
                    // Otherwise, submit/finish the dependent events' queues that are not
                    // associated with this queue
                    ANGLE_TRY(depEvent->getCommandQueue()->finish());
                }
            }
            mDependantEvents.clear();
        }

        ANGLE_TRY(flushComputePassCommands());
    }

    for (const cl::EventPtr &event : mAssociatedEvents)
    {
        ANGLE_TRY(event->getImpl<CLEventVk>().setStatusAndExecuteCallback(CL_RUNNING));
    }

    if (mHasAnyCommandsPendingSubmission)
    {
        // Submit and wait for the fence
        ANGLE_TRY(submitCommands());
        ANGLE_TRY(mContext->getRenderer()->finishQueueSerial(mContext, mLastSubmittedQueueSerial));

        // Ensure any resources are synced back to the host on GPU completion
        ANGLE_TRY(syncHostBuffers());
    }

    if (mNeedPrintfHandling)
    {
        ANGLE_TRY(processPrintfBuffer());
        mNeedPrintfHandling = false;
    }

    for (const cl::EventPtr &event : mAssociatedEvents)
    {
        ANGLE_TRY(event->getImpl<CLEventVk>().setStatusAndExecuteCallback(CL_COMPLETE));
    }

    mMemoryCaptures.clear();
    mAssociatedEvents.clear();
    mDependencyTracker.clear();
    mKernelCaptures.clear();

    return angle::Result::Continue;
}

// Helper function to insert appropriate memory barriers before accessing resources in the
// command buffer.
angle::Result CLCommandQueueVk::onResourceAccess(const vk::CommandBufferAccess &access)
{
    // Buffers
    for (const vk::CommandBufferBufferAccess &bufferAccess : access.getReadBuffers())
    {
        if (mComputePassCommands->usesBufferForWrite(*bufferAccess.buffer))
        {
            // Read buffers only need a new command buffer if previously used for write
            ANGLE_TRY(flush());
        }

        mComputePassCommands->bufferRead(bufferAccess.accessType, bufferAccess.stage,
                                         bufferAccess.buffer);
    }

    for (const vk::CommandBufferBufferAccess &bufferAccess : access.getWriteBuffers())
    {
        if (mComputePassCommands->usesBuffer(*bufferAccess.buffer))
        {
            // Write buffers always need a new command buffer
            ANGLE_TRY(flush());
        }

        mComputePassCommands->bufferWrite(bufferAccess.accessType, bufferAccess.stage,
                                          bufferAccess.buffer);
        if (bufferAccess.buffer->isHostVisible())
        {
            // Currently all buffers are host-visible, so there is nothing to do here
        }
    }

    for (const vk::CommandBufferBufferExternalAcquireRelease &bufferAcquireRelease :
         access.getExternalAcquireReleaseBuffers())
    {
        mComputePassCommands->retainResourceForWrite(bufferAcquireRelease.buffer);
    }

    for (const vk::CommandBufferResourceAccess &resourceAccess : access.getAccessResources())
    {
        mComputePassCommands->retainResource(resourceAccess.resource);
    }

    return angle::Result::Continue;
}

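// Maps the printf storage buffer and hands it to the clspv utility that decodes the packed
// printf records and emits the formatted output.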
angle::Result CLCommandQueueVk::processPrintfBuffer()
{
    ASSERT(mPrintfBuffer);
    ASSERT(mNeedPrintfHandling);
    ASSERT(mPrintfInfos);

    cl::Memory *clMem = cl::Buffer::Cast(getOrCreatePrintfBuffer());
    CLBufferVk &vkMem = clMem->getImpl<CLBufferVk>();

    unsigned char *data = nullptr;
    ANGLE_TRY(vkMem.map(data, 0));
    ANGLE_TRY(ClspvProcessPrintfBuffer(data, vkMem.getSize(), mPrintfInfos));
    vkMem.unmap();

    return angle::Result::Continue;
}

// A single CL buffer of size kPrintfBufferSize is set up for each command queue. This can be
// expanded later if more storage is needed.
cl_mem CLCommandQueueVk::getOrCreatePrintfBuffer()
{
    if (!mPrintfBuffer)
    {
        mPrintfBuffer = cl::Buffer::Cast(mContext->getFrontendObject().createBuffer(
            nullptr, cl::MemFlags(CL_MEM_READ_WRITE), kPrintfBufferSize, nullptr));
    }
    return mPrintfBuffer;
}

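// Returns true if any deferred event this queue still depends on is a user event.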
bool CLCommandQueueVk::hasUserEventDependency() const
{
    return std::any_of(mDependantEvents.begin(), mDependantEvents.end(),
                       [](const cl::EventPtr &event) { return event->isUserEvent(); });
}

}  // namespace rx