/*
 * Copyright © 2019 Raspberry Pi Ltd
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "v3dv_private.h"
#include "v3dv_meta_common.h"

#include "compiler/nir/nir_builder.h"
#include "util/u_pack_color.h"
#include "vk_common_entrypoints.h"

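/* The meta pipeline caches below are keyed by fixed-size binary blobs that
 * are hashed and compared bytewise, so key structs must be fully
 * zero-initialized (including any padding) before they are used for lookup.
 */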
static uint32_t
meta_blit_key_hash(const void *key)
{
   return _mesa_hash_data(key, V3DV_META_BLIT_CACHE_KEY_SIZE);
}

static bool
meta_blit_key_compare(const void *key1, const void *key2)
{
   return memcmp(key1, key2, V3DV_META_BLIT_CACHE_KEY_SIZE) == 0;
}

static bool
texel_buffer_shader_copy(struct v3dv_cmd_buffer *cmd_buffer,
                         VkImageAspectFlags aspect,
                         struct v3dv_image *image,
                         VkFormat dst_format,
                         VkFormat src_format,
                         struct v3dv_buffer *buffer,
                         uint32_t buffer_bpp,
                         VkColorComponentFlags cmask,
                         VkComponentMapping *cswizzle,
                         uint32_t region_count,
                         const VkBufferImageCopy2 *regions);

static bool
create_blit_pipeline_layout(struct v3dv_device *device,
                            VkDescriptorSetLayout *descriptor_set_layout,
                            VkPipelineLayout *pipeline_layout)
{
   VkResult result;

   if (*descriptor_set_layout == 0) {
      VkDescriptorSetLayoutBinding descriptor_set_layout_binding = {
         .binding = 0,
         .descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
         .descriptorCount = 1,
         .stageFlags = VK_SHADER_STAGE_FRAGMENT_BIT,
      };
      VkDescriptorSetLayoutCreateInfo descriptor_set_layout_info = {
         .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO,
         .bindingCount = 1,
         .pBindings = &descriptor_set_layout_binding,
      };
      result =
         v3dv_CreateDescriptorSetLayout(v3dv_device_to_handle(device),
                                        &descriptor_set_layout_info,
                                        &device->vk.alloc,
                                        descriptor_set_layout);
      if (result != VK_SUCCESS)
         return false;
   }

   assert(*pipeline_layout == 0);
   VkPipelineLayoutCreateInfo pipeline_layout_info = {
      .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
      .setLayoutCount = 1,
      .pSetLayouts = descriptor_set_layout,
      .pushConstantRangeCount = 1,
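      /* 20 bytes of vertex-stage push constants: presumably the source
       * coordinate box consumed by the meta blit vertex shader (four floats
       * plus a Z coordinate for 3D blits).
       */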
      .pPushConstantRanges =
         &(VkPushConstantRange) { VK_SHADER_STAGE_VERTEX_BIT, 0, 20 },
   };

   result =
      v3dv_CreatePipelineLayout(v3dv_device_to_handle(device),
                                &pipeline_layout_info,
                                &device->vk.alloc,
                                pipeline_layout);
   return result == VK_SUCCESS;
}

void
v3dv_meta_blit_init(struct v3dv_device *device)
{
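   /* One blit pipeline cache per dimensionality of the source image
    * (1D, 2D and 3D), since the blit pipeline varies with it.
    */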
   for (uint32_t i = 0; i < 3; i++) {
      device->meta.blit.cache[i] =
         _mesa_hash_table_create(NULL,
                                 meta_blit_key_hash,
                                 meta_blit_key_compare);
   }

   create_blit_pipeline_layout(device,
                               &device->meta.blit.ds_layout,
                               &device->meta.blit.p_layout);
}

static void
destroy_meta_blit_pipeline(VkDevice vk_device,
                           uint64_t obj,
                           VkAllocationCallbacks *alloc)
{
   struct v3dv_meta_blit_pipeline *p =
      (struct v3dv_meta_blit_pipeline *)(uintptr_t) obj;
   v3dv_DestroyPipeline(vk_device, p->pipeline, alloc);
   v3dv_DestroyRenderPass(vk_device, p->pass, alloc);
   v3dv_DestroyRenderPass(vk_device, p->pass_no_load, alloc);
   vk_free(alloc, p);
}

void
v3dv_meta_blit_finish(struct v3dv_device *device)
{
   VkDevice _device = v3dv_device_to_handle(device);

   for (uint32_t i = 0; i < 3; i++) {
      hash_table_foreach(device->meta.blit.cache[i], entry) {
         destroy_meta_blit_pipeline(_device, (uintptr_t)entry->data,
                                    &device->vk.alloc);
      }
      _mesa_hash_table_destroy(device->meta.blit.cache[i], NULL);
   }

   if (device->meta.blit.p_layout) {
      v3dv_DestroyPipelineLayout(_device, device->meta.blit.p_layout,
                                 &device->vk.alloc);
   }

   if (device->meta.blit.ds_layout) {
      v3dv_DestroyDescriptorSetLayout(_device, device->meta.blit.ds_layout,
                                      &device->vk.alloc);
   }
}

static uint32_t
meta_texel_buffer_copy_key_hash(const void *key)
{
   return _mesa_hash_data(key, V3DV_META_TEXEL_BUFFER_COPY_CACHE_KEY_SIZE);
}

static bool
meta_texel_buffer_copy_key_compare(const void *key1, const void *key2)
{
   return memcmp(key1, key2, V3DV_META_TEXEL_BUFFER_COPY_CACHE_KEY_SIZE) == 0;
}

static bool
create_texel_buffer_copy_pipeline_layout(struct v3dv_device *device,
                                         VkDescriptorSetLayout *ds_layout,
                                         VkPipelineLayout *p_layout)
{
   VkResult result;

   if (*ds_layout == 0) {
      VkDescriptorSetLayoutBinding ds_layout_binding = {
         .binding = 0,
         .descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER,
         .descriptorCount = 1,
         .stageFlags = VK_SHADER_STAGE_FRAGMENT_BIT,
      };
      VkDescriptorSetLayoutCreateInfo ds_layout_info = {
         .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO,
         .bindingCount = 1,
         .pBindings = &ds_layout_binding,
      };
      result =
         v3dv_CreateDescriptorSetLayout(v3dv_device_to_handle(device),
                                        &ds_layout_info,
                                        &device->vk.alloc,
                                        ds_layout);
      if (result != VK_SUCCESS)
         return false;
   }

   assert(*p_layout == 0);
   /* FIXME: this abuses the API a bit, since not all of our copy pipelines
    * have a geometry shader. We could create 2 different pipeline layouts,
    * but this works for us for now.
    */
#define TEXEL_BUFFER_COPY_FS_BOX_PC_OFFSET      0
#define TEXEL_BUFFER_COPY_FS_STRIDE_PC_OFFSET  16
#define TEXEL_BUFFER_COPY_FS_OFFSET_PC_OFFSET  20
#define TEXEL_BUFFER_COPY_GS_LAYER_PC_OFFSET   24
   VkPushConstantRange ranges[2] = {
      { VK_SHADER_STAGE_FRAGMENT_BIT, 0, 24 },
      { VK_SHADER_STAGE_GEOMETRY_BIT, 24, 4 },
   };

   VkPipelineLayoutCreateInfo p_layout_info = {
      .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
      .setLayoutCount = 1,
      .pSetLayouts = ds_layout,
      .pushConstantRangeCount = 2,
      .pPushConstantRanges = ranges,
   };

   result =
      v3dv_CreatePipelineLayout(v3dv_device_to_handle(device),
                                &p_layout_info,
                                &device->vk.alloc,
                                p_layout);
   return result == VK_SUCCESS;
}

void
v3dv_meta_texel_buffer_copy_init(struct v3dv_device *device)
{
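   /* As with the blit cache: one pipeline cache per image dimensionality
    * (1D, 2D and 3D).
    */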
   for (uint32_t i = 0; i < 3; i++) {
      device->meta.texel_buffer_copy.cache[i] =
         _mesa_hash_table_create(NULL,
                                 meta_texel_buffer_copy_key_hash,
                                 meta_texel_buffer_copy_key_compare);
   }

   create_texel_buffer_copy_pipeline_layout(
      device,
      &device->meta.texel_buffer_copy.ds_layout,
      &device->meta.texel_buffer_copy.p_layout);
}

static void
destroy_meta_texel_buffer_copy_pipeline(VkDevice vk_device,
                                        uint64_t obj,
                                        VkAllocationCallbacks *alloc)
{
   struct v3dv_meta_texel_buffer_copy_pipeline *p =
      (struct v3dv_meta_texel_buffer_copy_pipeline *)(uintptr_t) obj;
   v3dv_DestroyPipeline(vk_device, p->pipeline, alloc);
   v3dv_DestroyRenderPass(vk_device, p->pass, alloc);
   v3dv_DestroyRenderPass(vk_device, p->pass_no_load, alloc);
   vk_free(alloc, p);
}

void
v3dv_meta_texel_buffer_copy_finish(struct v3dv_device *device)
{
   VkDevice _device = v3dv_device_to_handle(device);

   for (uint32_t i = 0; i < 3; i++) {
      hash_table_foreach(device->meta.texel_buffer_copy.cache[i], entry) {
         destroy_meta_texel_buffer_copy_pipeline(_device, (uintptr_t)entry->data,
                                                 &device->vk.alloc);
      }
      _mesa_hash_table_destroy(device->meta.texel_buffer_copy.cache[i], NULL);
   }

   if (device->meta.texel_buffer_copy.p_layout) {
      v3dv_DestroyPipelineLayout(_device, device->meta.texel_buffer_copy.p_layout,
                                 &device->vk.alloc);
   }

   if (device->meta.texel_buffer_copy.ds_layout) {
      v3dv_DestroyDescriptorSetLayout(_device, device->meta.texel_buffer_copy.ds_layout,
                                      &device->vk.alloc);
   }
}

static VkFormat
get_compatible_tlb_format(VkFormat format)
{
   switch (format) {
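   /* These mappings are used for raw copies, so all that matters is that the
    * replacement format is TLB-renderable and has the same bpp as the
    * original; the channel interpretation is irrelevant since we never
    * convert the underlying data.
    */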
   case VK_FORMAT_R8G8B8A8_SNORM:
      return VK_FORMAT_R8G8B8A8_UINT;

   case VK_FORMAT_R8G8_SNORM:
      return VK_FORMAT_R8G8_UINT;

   case VK_FORMAT_R8_SNORM:
      return VK_FORMAT_R8_UINT;

   case VK_FORMAT_A8B8G8R8_SNORM_PACK32:
      return VK_FORMAT_A8B8G8R8_UINT_PACK32;

   case VK_FORMAT_R16_UNORM:
   case VK_FORMAT_R16_SNORM:
      return VK_FORMAT_R16_UINT;

   case VK_FORMAT_R16G16_UNORM:
   case VK_FORMAT_R16G16_SNORM:
      return VK_FORMAT_R16G16_UINT;

   case VK_FORMAT_R16G16B16A16_UNORM:
   case VK_FORMAT_R16G16B16A16_SNORM:
      return VK_FORMAT_R16G16B16A16_UINT;

   case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32:
      return VK_FORMAT_R32_SFLOAT;

   /* We can't render to compressed formats using the TLB, so instead we use
    * a compatible format with the same bpp as the compressed format. Because
    * the compressed format's bpp is for a full block (i.e. 4x4 pixels in the
    * case of ETC), when we implement copies with the compatible format we
    * will have to divide offsets and dimensions on the compressed image by
    * the compressed block size.
    */
   case VK_FORMAT_ETC2_R8G8B8A8_UNORM_BLOCK:
   case VK_FORMAT_ETC2_R8G8B8A8_SRGB_BLOCK:
   case VK_FORMAT_EAC_R11G11_UNORM_BLOCK:
   case VK_FORMAT_EAC_R11G11_SNORM_BLOCK:
   case VK_FORMAT_BC2_UNORM_BLOCK:
   case VK_FORMAT_BC2_SRGB_BLOCK:
   case VK_FORMAT_BC3_SRGB_BLOCK:
   case VK_FORMAT_BC3_UNORM_BLOCK:
   case VK_FORMAT_ASTC_4x4_UNORM_BLOCK:
   case VK_FORMAT_ASTC_4x4_SRGB_BLOCK:
   case VK_FORMAT_ASTC_5x4_UNORM_BLOCK:
   case VK_FORMAT_ASTC_5x4_SRGB_BLOCK:
   case VK_FORMAT_ASTC_5x5_UNORM_BLOCK:
   case VK_FORMAT_ASTC_5x5_SRGB_BLOCK:
   case VK_FORMAT_ASTC_6x5_UNORM_BLOCK:
   case VK_FORMAT_ASTC_6x5_SRGB_BLOCK:
   case VK_FORMAT_ASTC_6x6_UNORM_BLOCK:
   case VK_FORMAT_ASTC_6x6_SRGB_BLOCK:
   case VK_FORMAT_ASTC_8x5_UNORM_BLOCK:
   case VK_FORMAT_ASTC_8x5_SRGB_BLOCK:
   case VK_FORMAT_ASTC_8x6_UNORM_BLOCK:
   case VK_FORMAT_ASTC_8x6_SRGB_BLOCK:
   case VK_FORMAT_ASTC_8x8_UNORM_BLOCK:
   case VK_FORMAT_ASTC_8x8_SRGB_BLOCK:
   case VK_FORMAT_ASTC_10x5_UNORM_BLOCK:
   case VK_FORMAT_ASTC_10x5_SRGB_BLOCK:
   case VK_FORMAT_ASTC_10x6_UNORM_BLOCK:
   case VK_FORMAT_ASTC_10x6_SRGB_BLOCK:
   case VK_FORMAT_ASTC_10x8_UNORM_BLOCK:
   case VK_FORMAT_ASTC_10x8_SRGB_BLOCK:
   case VK_FORMAT_ASTC_10x10_UNORM_BLOCK:
   case VK_FORMAT_ASTC_10x10_SRGB_BLOCK:
   case VK_FORMAT_ASTC_12x10_UNORM_BLOCK:
   case VK_FORMAT_ASTC_12x10_SRGB_BLOCK:
   case VK_FORMAT_ASTC_12x12_UNORM_BLOCK:
   case VK_FORMAT_ASTC_12x12_SRGB_BLOCK:
      return VK_FORMAT_R32G32B32A32_UINT;

   case VK_FORMAT_ETC2_R8G8B8A1_UNORM_BLOCK:
   case VK_FORMAT_ETC2_R8G8B8A1_SRGB_BLOCK:
   case VK_FORMAT_ETC2_R8G8B8_UNORM_BLOCK:
   case VK_FORMAT_ETC2_R8G8B8_SRGB_BLOCK:
   case VK_FORMAT_EAC_R11_UNORM_BLOCK:
   case VK_FORMAT_EAC_R11_SNORM_BLOCK:
   case VK_FORMAT_BC1_RGB_UNORM_BLOCK:
   case VK_FORMAT_BC1_RGB_SRGB_BLOCK:
   case VK_FORMAT_BC1_RGBA_UNORM_BLOCK:
   case VK_FORMAT_BC1_RGBA_SRGB_BLOCK:
      return VK_FORMAT_R16G16B16A16_UINT;

   default:
      return VK_FORMAT_UNDEFINED;
   }
}

/**
 * Checks if we can implement an image copy or clear operation using the TLB
 * hardware.
 *
 * The extent and miplevel are only used to validate tile stores (to match the
 * region to store against the miplevel dimensions and avoid cases where the
 * region to store is not aligned to tile boundaries). If extent is NULL no
 * checks are done (which is fine if the image will only be used for a TLB
 * load or when we know in advance that the store will be for the entire size
 * of the image miplevel).
 *
 * For TLB copies we are doing a per-plane copy, so for multi-plane formats,
 * the compatible format will be single-plane.
 */
bool
v3dv_meta_can_use_tlb(struct v3dv_image *image,
                      uint8_t plane,
                      uint8_t miplevel,
                      const VkOffset3D *offset,
                      const VkExtent3D *extent,
                      VkFormat *compat_format)
{
   if (offset->x != 0 || offset->y != 0)
      return false;

   /* FIXME: this is suboptimal, what we really want to check is that the
    * extent of the region to copy is the full slice or a multiple of the
    * tile size.
    */
   if (extent) {
      struct v3d_resource_slice *slice = &image->planes[plane].slices[miplevel];
      if (slice->width != extent->width || slice->height != extent->height)
         return false;
   }

   if (image->format->planes[plane].rt_type != V3D_OUTPUT_IMAGE_FORMAT_NO) {
      if (compat_format)
         *compat_format = image->planes[plane].vk_format;
      return true;
   }

   /* If the image format is not TLB-supported, then check if we can use
    * a compatible format instead.
    */
   if (compat_format) {
      *compat_format = get_compatible_tlb_format(image->planes[plane].vk_format);
      if (*compat_format != VK_FORMAT_UNDEFINED) {
         assert(vk_format_get_plane_count(*compat_format) == 1);
         return true;
      }
   }

   return false;
}

/* Implements a copy using the TLB.
 *
 * This only works if we are copying from offset (0,0), since a TLB store for
 * tile (x,y) will be written at the same tile offset into the destination.
 * When this requirement is not met, we need to use a blit instead.
 *
 * Returns true if the implementation supports the requested operation (even
 * if it failed to process it, for example, due to an out-of-memory error).
 */
static bool
copy_image_to_buffer_tlb(struct v3dv_cmd_buffer *cmd_buffer,
                         struct v3dv_buffer *buffer,
                         struct v3dv_image *image,
                         const VkBufferImageCopy2 *region)
{
   VkFormat fb_format;
   uint8_t plane = v3dv_plane_from_aspect(region->imageSubresource.aspectMask);
   assert(plane < image->plane_count);

   if (!v3dv_meta_can_use_tlb(image, plane, region->imageSubresource.mipLevel,
                              &region->imageOffset, &region->imageExtent,
                              &fb_format)) {
      return false;
   }

   uint32_t internal_type, internal_bpp;
   v3dv_X(cmd_buffer->device, get_internal_type_bpp_for_image_aspects)
      (fb_format, region->imageSubresource.aspectMask,
       &internal_type, &internal_bpp);

   uint32_t num_layers;
   if (image->vk.image_type != VK_IMAGE_TYPE_3D) {
      num_layers = vk_image_subresource_layer_count(&image->vk,
                                                    &region->imageSubresource);
   } else {
      num_layers = region->imageExtent.depth;
   }
   assert(num_layers > 0);

   struct v3dv_job *job =
      v3dv_cmd_buffer_start_job(cmd_buffer, -1, V3DV_JOB_TYPE_GPU_CL);
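   /* Failing to allocate the job means we are out of memory; the copy itself
    * is supported, so return true per this function's contract.
    */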
   if (!job)
      return true;

   /* Handle copy from compressed format using a compatible format */
   const uint32_t block_w =
      vk_format_get_blockwidth(image->planes[plane].vk_format);
   const uint32_t block_h =
      vk_format_get_blockheight(image->planes[plane].vk_format);
   const uint32_t width = DIV_ROUND_UP(region->imageExtent.width, block_w);
   const uint32_t height = DIV_ROUND_UP(region->imageExtent.height, block_h);

   v3dv_job_start_frame(job, width, height, num_layers, false, true, 1,
                        internal_bpp, 4 * v3d_internal_bpp_words(internal_bpp),
                        false);

   struct v3dv_meta_framebuffer framebuffer;
   v3dv_X(job->device, meta_framebuffer_init)(&framebuffer, fb_format,
                                              internal_type, &job->frame_tiling);

   v3dv_X(job->device, job_emit_binning_flush)(job);
   v3dv_X(job->device, meta_emit_copy_image_to_buffer_rcl)
      (job, buffer, image, &framebuffer, region);

   v3dv_cmd_buffer_finish_job(cmd_buffer);

   return true;
}

static bool
blit_shader(struct v3dv_cmd_buffer *cmd_buffer,
            struct v3dv_image *dst,
            VkFormat dst_format,
            struct v3dv_image *src,
            VkFormat src_format,
            VkColorComponentFlags cmask,
            VkComponentMapping *cswizzle,
            const VkImageBlit2 *region,
            VkFilter filter,
            bool dst_is_padded_image);


/**
 * A structure that contains all the information we may need in various
 * processes involving image to buffer copies implemented with blit paths.
 */
struct image_to_buffer_info {
   /* Source image info */
   VkFormat src_format;
   uint8_t plane;
   VkColorComponentFlags cmask;
   VkComponentMapping cswizzle;
   VkImageAspectFlags src_copy_aspect;
   uint32_t block_width;
   uint32_t block_height;

   /* Destination buffer info */
   VkFormat dst_format;
   uint32_t buf_width;
   uint32_t buf_height;
   uint32_t buf_bpp;
   VkImageAspectFlags dst_copy_aspect;
};

static VkImageBlit2
blit_region_for_image_to_buffer(const VkOffset3D *offset,
                                const VkExtent3D *extent,
                                uint32_t mip_level,
                                uint32_t base_layer,
                                uint32_t layer_offset,
                                struct image_to_buffer_info *info)
{
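   /* Offsets and extents are given in texels of the source image format;
    * dividing by the block dimensions converts them to units of the
    * uncompressed compatible format used for the blit (for uncompressed
    * sources the block size is 1x1 and this is a no-op).
    */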
   VkImageBlit2 output = {
      .sType = VK_STRUCTURE_TYPE_IMAGE_BLIT_2,
      .srcSubresource = {
         .aspectMask = info->src_copy_aspect,
         .mipLevel = mip_level,
         .baseArrayLayer = base_layer + layer_offset,
         .layerCount = 1,
      },
      .srcOffsets = {
         {
            DIV_ROUND_UP(offset->x, info->block_width),
            DIV_ROUND_UP(offset->y, info->block_height),
            offset->z + layer_offset,
         },
         {
            DIV_ROUND_UP(offset->x + extent->width, info->block_width),
            DIV_ROUND_UP(offset->y + extent->height, info->block_height),
            offset->z + layer_offset + 1,
         },
      },
      .dstSubresource = {
         .aspectMask = info->dst_copy_aspect,
         .mipLevel = 0,
         .baseArrayLayer = 0,
         .layerCount = 1,
      },
      .dstOffsets = {
         { 0, 0, 0 },
         {
            DIV_ROUND_UP(extent->width, info->block_width),
            DIV_ROUND_UP(extent->height, info->block_height),
            1
         },
      },
   };

   return output;
}

/**
 * Produces an image_to_buffer_info struct from a VkBufferImageCopy2 that we
 * can use to implement image to buffer copies with blit paths.
 *
 * Returns false if the copy operation can't be implemented with a blit.
 */
static bool
gather_image_to_buffer_info(struct v3dv_cmd_buffer *cmd_buffer,
                            struct v3dv_image *image,
                            const VkBufferImageCopy2 *region,
                            struct image_to_buffer_info *out_info)
{
   bool supported = false;

   VkImageAspectFlags dst_copy_aspect = region->imageSubresource.aspectMask;
   /* For multi-planar images we copy one plane at a time using an image alias
    * with a color aspect for each plane.
    */
   if (image->plane_count > 1)
      dst_copy_aspect = VK_IMAGE_ASPECT_COLOR_BIT;

   VkImageAspectFlags src_copy_aspect = region->imageSubresource.aspectMask;
   uint8_t plane = v3dv_plane_from_aspect(src_copy_aspect);
   assert(plane < image->plane_count);

   /* Generally, the bpp of the data in the buffer matches that of the
    * source image. The exception is the case where we are copying
    * stencil (8bpp) to a combined d24s8 image (32bpp).
    */
   uint32_t buffer_bpp = image->planes[plane].cpp;

   /* Because we are going to implement the copy as a blit, we need to create
    * a linear image from the destination buffer and we also want our blit
    * source and destination formats to be the same (to avoid any format
    * conversions), so we choose a canonical format that matches the
    * source image bpp.
    *
    * The exception to the above is copying from combined depth/stencil images
    * because we are copying only one aspect of the image, so we need to set up
    * our formats, color write mask and source swizzle mask to match that.
    */
   VkFormat dst_format;
   VkFormat src_format;
   VkColorComponentFlags cmask = 0; /* All components */
   VkComponentMapping cswizzle = {
      .r = VK_COMPONENT_SWIZZLE_IDENTITY,
      .g = VK_COMPONENT_SWIZZLE_IDENTITY,
      .b = VK_COMPONENT_SWIZZLE_IDENTITY,
      .a = VK_COMPONENT_SWIZZLE_IDENTITY,
   };
   switch (buffer_bpp) {
   case 16:
      assert(dst_copy_aspect == VK_IMAGE_ASPECT_COLOR_BIT);
      dst_format = VK_FORMAT_R32G32B32A32_UINT;
      src_format = dst_format;
      break;
   case 8:
      assert(dst_copy_aspect == VK_IMAGE_ASPECT_COLOR_BIT);
      dst_format = VK_FORMAT_R16G16B16A16_UINT;
      src_format = dst_format;
      break;
   case 4:
      switch (dst_copy_aspect) {
      case VK_IMAGE_ASPECT_COLOR_BIT:
         src_format = VK_FORMAT_R8G8B8A8_UINT;
         dst_format = VK_FORMAT_R8G8B8A8_UINT;
         break;
      case VK_IMAGE_ASPECT_DEPTH_BIT:
         assert(image->plane_count == 1);
         assert(image->vk.format == VK_FORMAT_D32_SFLOAT ||
                image->vk.format == VK_FORMAT_D24_UNORM_S8_UINT ||
                image->vk.format == VK_FORMAT_X8_D24_UNORM_PACK32);
         if (image->vk.format == VK_FORMAT_D32_SFLOAT) {
            src_format = VK_FORMAT_R32_UINT;
            dst_format = VK_FORMAT_R32_UINT;
         } else {
            /* We want to write depth in the buffer in the first 24 bits,
             * however, the hardware has depth in bits 8-31, so swizzle the
             * source components to match what we want. Also, we don't
             * want to write bits 24-31 in the destination.
             */
            src_format = VK_FORMAT_R8G8B8A8_UINT;
            dst_format = VK_FORMAT_R8G8B8A8_UINT;
            cmask = VK_COLOR_COMPONENT_R_BIT |
                    VK_COLOR_COMPONENT_G_BIT |
                    VK_COLOR_COMPONENT_B_BIT;
            cswizzle.r = VK_COMPONENT_SWIZZLE_G;
            cswizzle.g = VK_COMPONENT_SWIZZLE_B;
            cswizzle.b = VK_COMPONENT_SWIZZLE_A;
            cswizzle.a = VK_COMPONENT_SWIZZLE_ZERO;
         }
         break;
      case VK_IMAGE_ASPECT_STENCIL_BIT:
         assert(image->plane_count == 1);
         assert(dst_copy_aspect == VK_IMAGE_ASPECT_STENCIL_BIT);
         assert(image->vk.format == VK_FORMAT_D24_UNORM_S8_UINT);
         /* Copying from S8D24. We want to write 8-bit stencil values only,
          * so adjust the buffer bpp for that. Since the hardware stores stencil
          * in the LSB, we can just do a RGBA8UI to R8UI blit.
          */
         src_format = VK_FORMAT_R8G8B8A8_UINT;
         dst_format = VK_FORMAT_R8_UINT;
         buffer_bpp = 1;
         break;
      default:
         unreachable("unsupported aspect");
         return supported;
      };
      break;
   case 2:
      assert(dst_copy_aspect == VK_IMAGE_ASPECT_COLOR_BIT ||
             dst_copy_aspect == VK_IMAGE_ASPECT_DEPTH_BIT);
      dst_format = VK_FORMAT_R16_UINT;
      src_format = dst_format;
      break;
   case 1:
      assert(dst_copy_aspect == VK_IMAGE_ASPECT_COLOR_BIT);
      dst_format = VK_FORMAT_R8_UINT;
      src_format = dst_format;
      break;
   default:
      unreachable("unsupported bit-size");
      return supported;
   };

   /* The hardware doesn't support linear depth/stencil stores, so we
    * implement copies of depth/stencil aspect as color copies using a
    * compatible color format.
    */
   assert(vk_format_is_color(src_format));
   assert(vk_format_is_color(dst_format));
   dst_copy_aspect = VK_IMAGE_ASPECT_COLOR_BIT;

   /* We should be able to handle the blit if we got this far */
   supported = true;

   /* Obtain the 2D buffer region spec */
   uint32_t buf_width, buf_height;
   if (region->bufferRowLength == 0)
      buf_width = region->imageExtent.width;
   else
      buf_width = region->bufferRowLength;

   if (region->bufferImageHeight == 0)
      buf_height = region->imageExtent.height;
   else
      buf_height = region->bufferImageHeight;

   /* If the image is compressed, the bpp refers to blocks, not pixels */
   uint32_t block_width =
      vk_format_get_blockwidth(image->planes[plane].vk_format);
   uint32_t block_height =
      vk_format_get_blockheight(image->planes[plane].vk_format);
   buf_width = DIV_ROUND_UP(buf_width, block_width);
   buf_height = DIV_ROUND_UP(buf_height, block_height);

   out_info->src_format = src_format;
   out_info->dst_format = dst_format;
   out_info->src_copy_aspect = src_copy_aspect;
   out_info->dst_copy_aspect = dst_copy_aspect;
   out_info->buf_width = buf_width;
   out_info->buf_height = buf_height;
   out_info->buf_bpp = buffer_bpp;
   out_info->block_width = block_width;
   out_info->block_height = block_height;
   out_info->cmask = cmask;
   out_info->cswizzle = cswizzle;
   out_info->plane = plane;

   return supported;
}

/* Creates a linear image to alias buffer memory. It also includes that image
 * as a private object in the cmd_buffer.
 *
 * This is used for cases where we want to implement an image to buffer copy,
 * but we need to rely on a mechanism that uses an image as destination, like
 * blitting.
 */
static VkResult
create_image_from_buffer(struct v3dv_cmd_buffer *cmd_buffer,
                         struct v3dv_buffer *buffer,
                         const VkBufferImageCopy2 *region,
                         struct image_to_buffer_info *info,
                         uint32_t layer,
                         VkImage *out_image)
{
   VkImageCreateInfo image_info = {
      .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO,
      .imageType = VK_IMAGE_TYPE_2D,
      .format = info->dst_format,
      .extent = { info->buf_width, info->buf_height, 1 },
      .mipLevels = 1,
      .arrayLayers = 1,
      .samples = VK_SAMPLE_COUNT_1_BIT,
      .tiling = VK_IMAGE_TILING_LINEAR,
      .usage = VK_IMAGE_USAGE_TRANSFER_DST_BIT,
      .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
      .queueFamilyIndexCount = 0,
      .initialLayout = VK_IMAGE_LAYOUT_GENERAL,
   };

   VkResult result;
   struct v3dv_device *device = cmd_buffer->device;
   VkDevice _device = v3dv_device_to_handle(device);

   VkImage buffer_image;
   result =
      v3dv_CreateImage(_device, &image_info, &device->vk.alloc, &buffer_image);
   if (result != VK_SUCCESS)
      return result;

   *out_image = buffer_image;

   v3dv_cmd_buffer_add_private_obj(
      cmd_buffer, (uintptr_t)buffer_image,
      (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyImage);

   /* Bind the buffer memory to the image. */
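   /* Layers in the buffer are tightly packed 2D regions of
    * buf_width x buf_height elements of buf_bpp bytes each, so advance the
    * base offset by one such region per layer.
    */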
   VkDeviceSize buffer_offset = buffer->mem_offset + region->bufferOffset +
      layer * info->buf_width * info->buf_height * info->buf_bpp;

   result =
      vk_common_BindImageMemory(_device, buffer_image,
                                v3dv_device_memory_to_handle(buffer->mem),
                                buffer_offset);
   return result;
}

/**
 * Creates an image with a single mip level that aliases the memory of a
 * mip level in another image, re-interpreting the memory with an uncompressed
 * format. The image is added to the command buffer as a private object for
 * disposal.
 */
static bool
create_image_mip_level_alias(struct v3dv_cmd_buffer *cmd_buffer,
                             struct v3dv_image *image,
                             VkFormat format,
                             uint32_t plane,
                             uint32_t mip_level,
                             uint32_t layer,
                             VkImage *alias)
{
   VkResult result;
   assert(!vk_format_is_compressed(format));

   struct v3dv_device *device = cmd_buffer->device;
   VkDevice vk_device = v3dv_device_to_handle(device);
   uint32_t mip_width = image->planes[plane].slices[mip_level].width;
   uint32_t mip_height = image->planes[plane].slices[mip_level].height;

   uint32_t block_width =
      vk_format_get_blockwidth(image->planes[plane].vk_format);
   uint32_t block_height =
      vk_format_get_blockheight(image->planes[plane].vk_format);

   VkImageCreateInfo info = {
      .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO,
      .imageType = image->vk.image_type,
      .format = format,
      .extent = { DIV_ROUND_UP(mip_width, block_width),
                  DIV_ROUND_UP(mip_height, block_height),
                  1 },
      .mipLevels = 1,
      .arrayLayers = 1,
      .samples = image->vk.samples,
      .tiling = image->tiled ? VK_IMAGE_TILING_OPTIMAL : VK_IMAGE_TILING_LINEAR,
      .usage = VK_IMAGE_USAGE_TRANSFER_SRC_BIT,
      .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
      .queueFamilyIndexCount = 0,
      .initialLayout = VK_IMAGE_LAYOUT_GENERAL,
   };
   result = v3dv_CreateImage(vk_device, &info, &device->vk.alloc, alias);
   if (result != VK_SUCCESS)
      return false;

   /* The alias we have just created has just one mip, but we may be aliasing
    * any mip in the original image. Because the slice setup changes based on
    * the mip (particularly, for mips >= 2 it uses power of 2 sizes internally)
    * and this can influence the tiling layout selected for the slice, we want
    * to make sure we copy the slice description from the actual mip level in
    * the original image, and then rewrite any fields that we need for the
    * alias. Particularly, we want to make the offset 0 because we are going to
    * bind the underlying image memory exactly at the start of the selected mip.
    * We also want to relax the image alignment requirements to the minimum
    * (the one imposed by the Texture Base Address field) since we may not be
    * aliasing a level 0 (for which we typically want a page alignment for
    * optimal performance).
    */
   V3DV_FROM_HANDLE(v3dv_image, v3dv_alias, *alias);
   v3dv_alias->planes[plane].slices[0] = image->planes[plane].slices[mip_level];
   v3dv_alias->planes[plane].slices[0].width = info.extent.width;
   v3dv_alias->planes[plane].slices[0].height = info.extent.height;
   v3dv_alias->planes[plane].slices[0].offset = 0;
   v3dv_alias->planes[plane].alignment = 64;

   v3dv_cmd_buffer_add_private_obj(
      cmd_buffer, (uintptr_t)*alias,
      (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyImage);

   result =
      vk_common_BindImageMemory(vk_device, *alias,
                                v3dv_device_memory_to_handle(image->planes[plane].mem),
                                v3dv_layer_offset(image, mip_level, layer, plane));
   return result == VK_SUCCESS;
}

/**
 * Returns true if the implementation supports the requested operation (even if
 * it failed to process it, for example, due to an out-of-memory error).
 */
static bool
copy_image_to_buffer_blit(struct v3dv_cmd_buffer *cmd_buffer,
                          struct v3dv_buffer *buffer,
                          struct v3dv_image *image,
                          const VkBufferImageCopy2 *region)
{
   bool handled = false;
   struct image_to_buffer_info info;

   /* This path uses a shader blit which doesn't support linear images. Return
    * early to avoid all the heavy lifting in preparation for the
    * blit_shader() call that is bound to fail in that scenario.
    */
   if (!image->tiled && image->vk.image_type != VK_IMAGE_TYPE_1D) {
      return handled;
   }

   handled = gather_image_to_buffer_info(cmd_buffer, image, region,
                                         &info);

   if (!handled)
      return handled;

   /* We should be able to handle the blit if we got this far */
   handled = true;

   /* Compute layers to copy */
   uint32_t num_layers;
   if (image->vk.image_type != VK_IMAGE_TYPE_3D) {
      num_layers = vk_image_subresource_layer_count(&image->vk,
                                                    &region->imageSubresource);
   } else {
      num_layers = region->imageExtent.depth;
   }
   assert(num_layers > 0);

   /* Copy requested layers */
   VkResult result;
   VkImageBlit2 blit_region;
   uint32_t mip_level = region->imageSubresource.mipLevel;
   uint32_t base_layer = region->imageSubresource.baseArrayLayer;
   for (uint32_t i = 0; i < num_layers; i++) {
      uint32_t layer_offset = i;

      if (vk_format_is_compressed(image->vk.format)) {
         /* Our blit interface can see the real format of the images to detect
          * copies between compressed and uncompressed images and adapt the
          * blit region accordingly. Here we are just doing a raw copy of
          * compressed data, but we are passing an uncompressed view of the
          * buffer for the blit destination image (since compressed formats are
          * not renderable), so we also want to provide an uncompressed view of
          * the source image.
          *
          * It is important that we create the alias over the selected mip
          * level (instead of aliasing the entire image) because an uncompressed
          * view of the image won't have the same number of mip levels as the
          * original image and the implicit mip size calculations the hw will
          * do to sample from a non-zero mip level may not match exactly between
          * compressed and uncompressed views.
          */
         VkImage alias;
         if (!create_image_mip_level_alias(cmd_buffer, image, info.dst_format,
                                           info.plane, mip_level,
                                           base_layer + layer_offset,
                                           &alias)) {
            return handled;
         }

         /* We are aliasing the selected mip level and layer with a
          * single-mip and single-layer image.
          */
         image = v3dv_image_from_handle(alias);
         mip_level = 0;
         base_layer = 0;
         layer_offset = 0;
      }

      /* Create the destination blit image from the destination buffer */
      VkImage buffer_image;
      result =
         create_image_from_buffer(cmd_buffer, buffer, region, &info,
                                  i, &buffer_image);
      if (result != VK_SUCCESS)
         return handled;

      /* Blit-copy the requested image extent.
       *
       * Since we are copying, the blit must use the same format on the
       * destination and source images to avoid format conversions. The
       * only exception is copying stencil, which we upload to a R8UI source
       * image, but that we need to blit to a S8D24 destination (the only
       * stencil format we support).
       */
      blit_region =
         blit_region_for_image_to_buffer(&region->imageOffset,
                                         &region->imageExtent,
                                         mip_level, base_layer, layer_offset,
                                         &info);

      handled = blit_shader(cmd_buffer,
                            v3dv_image_from_handle(buffer_image),
                            info.dst_format,
                            image, info.src_format,
                            info.cmask, &info.cswizzle,
                            &blit_region, VK_FILTER_NEAREST, false);
      if (!handled) {
         /* This is unexpected, we should have a supported blit spec */
         unreachable("Unable to blit buffer to destination image");
         return false;
      }
   }

   assert(handled);
   return true;
}

static bool
copy_image_linear_texel_buffer(struct v3dv_cmd_buffer *cmd_buffer,
                               struct v3dv_image *dst,
                               struct v3dv_image *src,
                               const VkImageCopy2 *region);

static VkImageCopy2
image_copy_region_for_image_to_buffer(const VkBufferImageCopy2 *region,
                                      struct image_to_buffer_info *info,
                                      uint32_t layer)
{
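   /* As in blit_region_for_image_to_buffer(), convert offsets and extents
    * from texels to compressed-block units.
    */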
   VkImageCopy2 output = {
      .sType = VK_STRUCTURE_TYPE_IMAGE_COPY_2,
      .srcSubresource = {
         .aspectMask = info->src_copy_aspect,
         .mipLevel = region->imageSubresource.mipLevel,
         .baseArrayLayer = region->imageSubresource.baseArrayLayer + layer,
         .layerCount = 1,
      },
      .srcOffset = {
            DIV_ROUND_UP(region->imageOffset.x, info->block_width),
            DIV_ROUND_UP(region->imageOffset.y, info->block_height),
            region->imageOffset.z,
      },
      .dstSubresource = {
         .aspectMask = info->dst_copy_aspect,
         .mipLevel = 0,
         .baseArrayLayer = 0,
         .layerCount = 1,
      },
      .dstOffset = { 0, 0, 0 },
      .extent = {
         DIV_ROUND_UP(region->imageExtent.width, info->block_width),
         DIV_ROUND_UP(region->imageExtent.height, info->block_height),
         1
      },
   };

   return output;
}

/**
 * Returns true if the implementation supports the requested operation (even if
 * it failed to process it, for example, due to an out-of-memory error).
 */
static bool
copy_image_to_buffer_texel_buffer(struct v3dv_cmd_buffer *cmd_buffer,
                                  struct v3dv_buffer *dst_buffer,
                                  struct v3dv_image *src_image,
                                  const VkBufferImageCopy2 *region)
{
   bool handled = false;
   VkImage dst_buffer_image;
   struct image_to_buffer_info info;

   /* This is a requirement for copy_image_linear_texel_buffer below. We
    * check it in advance in order to do an early return.
    */
   if (src_image->tiled)
      return false;

   handled =
      gather_image_to_buffer_info(cmd_buffer, src_image, region,
                                  &info);
   if (!handled)
      return handled;

   /* At this point the implementation should support the copy; any failure
    * below is for a different reason, such as an out-of-memory error.
    */
   handled = true;

   uint32_t num_layers;
   if (src_image->vk.image_type != VK_IMAGE_TYPE_3D) {
      num_layers = vk_image_subresource_layer_count(&src_image->vk,
                                                    &region->imageSubresource);
   } else {
      num_layers = region->imageExtent.depth;
   }
   assert(num_layers > 0);

   VkResult result;
   VkImageCopy2 image_region;
   for (uint32_t layer = 0; layer < num_layers; layer++) {
      /* Create the destination image from the destination buffer */
      result =
         create_image_from_buffer(cmd_buffer, dst_buffer, region, &info,
                                  layer, &dst_buffer_image);
      if (result != VK_SUCCESS)
         return handled;

      image_region =
         image_copy_region_for_image_to_buffer(region, &info, layer);

      handled =
         copy_image_linear_texel_buffer(cmd_buffer,
                                        v3dv_image_from_handle(dst_buffer_image),
                                        src_image, &image_region);
   }

   return handled;
}

VKAPI_ATTR void VKAPI_CALL
v3dv_CmdCopyImageToBuffer2(VkCommandBuffer commandBuffer,
                           const VkCopyImageToBufferInfo2 *info)
{
   V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
   V3DV_FROM_HANDLE(v3dv_image, image, info->srcImage);
   V3DV_FROM_HANDLE(v3dv_buffer, buffer, info->dstBuffer);

   assert(image->vk.samples == VK_SAMPLE_COUNT_1_BIT);

   cmd_buffer->state.is_transfer = true;

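   /* Try the fastest path first (direct TLB stores), then fall back to a
    * shader blit, and finally to the texel buffer copy path for linear
    * images.
    */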
   for (uint32_t i = 0; i < info->regionCount; i++) {
      const VkBufferImageCopy2 *region = &info->pRegions[i];

      if (copy_image_to_buffer_tlb(cmd_buffer, buffer, image, region))
         continue;

      if (copy_image_to_buffer_blit(cmd_buffer, buffer, image, region))
         continue;

      if (copy_image_to_buffer_texel_buffer(cmd_buffer, buffer, image, region))
         continue;

      unreachable("Unsupported image to buffer copy.");
   }
   cmd_buffer->state.is_transfer = false;
}

/**
 * Returns true if the implementation supports the requested operation (even if
 * it failed to process it, for example, due to an out-of-memory error).
 */
static bool
copy_image_tfu(struct v3dv_cmd_buffer *cmd_buffer,
               struct v3dv_image *dst,
               struct v3dv_image *src,
               const VkImageCopy2 *region)
{
   if (V3D_DBG(DISABLE_TFU)) {
      perf_debug("Copy images: TFU disabled, fallbacks could be slower.\n");
      return false;
   }

   /* Destination can't be raster format */
   if (!dst->tiled)
      return false;

   /* We can only do full copies, so if the format is D24S8 both aspects need
    * to be copied. We only need to check the dst format because the spec
    * states that depth/stencil formats must match exactly.
    */
   if (dst->vk.format == VK_FORMAT_D24_UNORM_S8_UINT) {
       const VkImageAspectFlags ds_aspects = VK_IMAGE_ASPECT_DEPTH_BIT |
                                             VK_IMAGE_ASPECT_STENCIL_BIT;
       if (region->dstSubresource.aspectMask != ds_aspects)
          return false;
   }

   /* Don't handle copies between uncompressed and compressed formats for now.
    *
    * FIXME: we should be able to handle these easily, but there is no coverage
    * in CTS at the moment that makes such copies with full images (which we
    * require here), only partial copies. Also, in that case the code below that
    * checks for "dst image complete" requires some changes, since it is
    * checking against the region dimensions, which are in units of the source
    * image format.
    */
   if (vk_format_is_compressed(dst->vk.format) !=
       vk_format_is_compressed(src->vk.format)) {
      return false;
   }

   /* Source region must start at (0,0) */
   if (region->srcOffset.x != 0 || region->srcOffset.y != 0)
      return false;

   /* Destination image must be complete */
   if (region->dstOffset.x != 0 || region->dstOffset.y != 0)
      return false;

   uint8_t src_plane =
      v3dv_plane_from_aspect(region->srcSubresource.aspectMask);
   uint8_t dst_plane =
      v3dv_plane_from_aspect(region->dstSubresource.aspectMask);

   const uint32_t dst_mip_level = region->dstSubresource.mipLevel;
   uint32_t dst_width = u_minify(dst->planes[dst_plane].width, dst_mip_level);
   uint32_t dst_height = u_minify(dst->planes[dst_plane].height, dst_mip_level);
   if (region->extent.width != dst_width || region->extent.height != dst_height)
      return false;

   /* From vkCmdCopyImage:
    *
    *   "When copying between compressed and uncompressed formats the extent
    *    members represent the texel dimensions of the source image and not
    *    the destination."
    */
   const uint32_t block_w =
      vk_format_get_blockwidth(src->planes[src_plane].vk_format);
   const uint32_t block_h =
      vk_format_get_blockheight(src->planes[src_plane].vk_format);
   uint32_t width = DIV_ROUND_UP(region->extent.width, block_w);
   uint32_t height = DIV_ROUND_UP(region->extent.height, block_h);

   /* Account for sample count */
   assert(dst->vk.samples == src->vk.samples);
   if (dst->vk.samples > VK_SAMPLE_COUNT_1_BIT) {
      assert(dst->vk.samples == VK_SAMPLE_COUNT_4_BIT);
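      /* 4x MSAA surfaces are stored with 2x2 samples per pixel, so the
       * physical dimensions of the surface are twice the logical ones in
       * each axis.
       */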
      width *= 2;
      height *= 2;
   }

   /* The TFU unit doesn't handle format conversions so we need the formats to
    * match. On the other hand, vkCmdCopyImage allows different color formats
    * on the source and destination images, but only if they are texel
    * compatible. For us, this means that we can effectively ignore different
    * formats and just make the copy using either of them, since we are just
    * moving raw data and not making any conversions.
    *
    * Also, the formats supported by the TFU unit are limited, but again, since
    * we are only doing raw copies here without interpreting or converting
    * the underlying pixel data according to its format, we can always choose
    * to use compatible formats that are supported with the TFU unit.
    */
   assert(dst->planes[dst_plane].cpp == src->planes[src_plane].cpp);
   const struct v3dv_format *format =
      v3dv_get_compatible_tfu_format(cmd_buffer->device,
                                     dst->planes[dst_plane].cpp, NULL);

   /* Emit a TFU job for each layer to blit */
   const uint32_t layer_count = dst->vk.image_type != VK_IMAGE_TYPE_3D ?
      vk_image_subresource_layer_count(&dst->vk, &region->dstSubresource) :
      region->extent.depth;
   const uint32_t src_mip_level = region->srcSubresource.mipLevel;

   const uint32_t base_src_layer = src->vk.image_type != VK_IMAGE_TYPE_3D ?
      region->srcSubresource.baseArrayLayer : region->srcOffset.z;
   const uint32_t base_dst_layer = dst->vk.image_type != VK_IMAGE_TYPE_3D ?
      region->dstSubresource.baseArrayLayer : region->dstOffset.z;
   for (uint32_t i = 0; i < layer_count; i++) {
      const uint32_t dst_offset =
         dst->planes[dst_plane].mem->bo->offset +
         v3dv_layer_offset(dst, dst_mip_level, base_dst_layer + i, dst_plane);
      const uint32_t src_offset =
         src->planes[src_plane].mem->bo->offset +
         v3dv_layer_offset(src, src_mip_level, base_src_layer + i, src_plane);

      const struct v3d_resource_slice *dst_slice =
         &dst->planes[dst_plane].slices[dst_mip_level];
      const struct v3d_resource_slice *src_slice =
         &src->planes[src_plane].slices[src_mip_level];

      v3dv_X(cmd_buffer->device, meta_emit_tfu_job)(
         cmd_buffer,
         dst->planes[dst_plane].mem->bo->handle,
         dst_offset,
         dst_slice->tiling,
         dst_slice->padded_height,
         dst->planes[dst_plane].cpp,
         src->planes[src_plane].mem->bo->handle,
         src_offset,
         src_slice->tiling,
         src_slice->tiling == V3D_TILING_RASTER ?
                              src_slice->stride : src_slice->padded_height,
         src->planes[src_plane].cpp,
         /* All compatible TFU formats are single-plane */
         width, height, &format->planes[0]);
   }

   return true;
}

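/* Thin exported wrapper so other parts of the driver can reach the static
 * TFU copy path above.
 */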
inline bool
v3dv_cmd_buffer_copy_image_tfu(struct v3dv_cmd_buffer *cmd_buffer,
                               struct v3dv_image *dst,
                               struct v3dv_image *src,
                               const VkImageCopy2 *region)
{
   return copy_image_tfu(cmd_buffer, dst, src, region);
}

/**
 * Returns true if the implementation supports the requested operation (even if
 * it failed to process it, for example, due to an out-of-memory error).
 */
static bool
copy_image_tlb(struct v3dv_cmd_buffer *cmd_buffer,
               struct v3dv_image *dst,
               struct v3dv_image *src,
               const VkImageCopy2 *region)
{
   uint8_t src_plane =
      v3dv_plane_from_aspect(region->srcSubresource.aspectMask);
   assert(src_plane < src->plane_count);
   uint8_t dst_plane =
      v3dv_plane_from_aspect(region->dstSubresource.aspectMask);
   assert(dst_plane < dst->plane_count);

   VkFormat fb_format;
   if (!v3dv_meta_can_use_tlb(src, src_plane, region->srcSubresource.mipLevel,
                              &region->srcOffset, NULL, &fb_format) ||
       !v3dv_meta_can_use_tlb(dst, dst_plane, region->dstSubresource.mipLevel,
                              &region->dstOffset, &region->extent, &fb_format)) {
      return false;
   }

   /* We can't do TLB stores of linear D/S */
   if (!dst->tiled && vk_format_is_depth_or_stencil(fb_format))
      return false;

   /* From the Vulkan spec, VkImageCopy valid usage:
    *
    *    "If neither the calling command's srcImage nor the calling command's
    *     dstImage has a multi-planar image format then the aspectMask member
    *     of srcSubresource and dstSubresource must match."
    */
   assert(src->plane_count != 1 || dst->plane_count != 1 ||
          region->dstSubresource.aspectMask ==
          region->srcSubresource.aspectMask);
   uint32_t internal_type, internal_bpp;
   v3dv_X(cmd_buffer->device, get_internal_type_bpp_for_image_aspects)
      (fb_format, region->dstSubresource.aspectMask,
       &internal_type, &internal_bpp);

   /* From the Vulkan spec with VK_KHR_maintenance1, VkImageCopy valid usage:
    *
    * "The number of slices of the extent (for 3D) or layers of the
    *  srcSubresource (for non-3D) must match the number of slices of the
    *  extent (for 3D) or layers of the dstSubresource (for non-3D)."
    */
   assert((src->vk.image_type != VK_IMAGE_TYPE_3D ?
           vk_image_subresource_layer_count(&src->vk, &region->srcSubresource) :
           region->extent.depth) ==
          (dst->vk.image_type != VK_IMAGE_TYPE_3D ?
           vk_image_subresource_layer_count(&dst->vk, &region->dstSubresource) :
           region->extent.depth));
   uint32_t num_layers;
   if (dst->vk.image_type != VK_IMAGE_TYPE_3D) {
      num_layers = vk_image_subresource_layer_count(&dst->vk,
                                                    &region->dstSubresource);
   } else {
      num_layers = region->extent.depth;
   }
   assert(num_layers > 0);

   struct v3dv_job *job =
      v3dv_cmd_buffer_start_job(cmd_buffer, -1, V3DV_JOB_TYPE_GPU_CL);
1358    if (!job)
1359       return true;
1360 
1361    /* Handle copy to compressed image using compatible format */
1362    const uint32_t block_w =
1363       vk_format_get_blockwidth(dst->planes[dst_plane].vk_format);
1364    const uint32_t block_h =
1365       vk_format_get_blockheight(dst->planes[dst_plane].vk_format);
1366    const uint32_t width = DIV_ROUND_UP(region->extent.width, block_w);
1367    const uint32_t height = DIV_ROUND_UP(region->extent.height, block_h);
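   /* Worked example: copying a 31x18 texel region of an ETC2 format
    * (4x4 blocks) renders a DIV_ROUND_UP(31, 4) x DIV_ROUND_UP(18, 4) =
    * 8x5 frame, one rendered pixel per compressed block.
    */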
1368 
1369    v3dv_job_start_frame(job, width, height, num_layers, false, true, 1,
1370                         internal_bpp, 4 * v3d_internal_bpp_words(internal_bpp),
1371                         src->vk.samples > VK_SAMPLE_COUNT_1_BIT);
1372 
1373    struct v3dv_meta_framebuffer framebuffer;
1374    v3dv_X(job->device, meta_framebuffer_init)(&framebuffer, fb_format,
1375                                               internal_type, &job->frame_tiling);
1376 
1377    v3dv_X(job->device, job_emit_binning_flush)(job);
1378    v3dv_X(job->device, meta_emit_copy_image_rcl)(job, dst, src, &framebuffer, region);
1379 
1380    v3dv_cmd_buffer_finish_job(cmd_buffer);
1381 
1382    return true;
1383 }
1384 
1385 /**
1386  * Takes the image provided as argument and creates a new image that has
1387  * the same specification and aliases the same memory storage, except that:
1388  *
1389  *   - It has the uncompressed format passed in.
1390  *   - Its original width/height are scaled by the factors passed in.
1391  *
1392  * This is useful to implement copies from compressed images using the blit
1393  * path. The idea is that we create uncompressed "image views" of both the
1394  * source and destination images using the uncompressed format and then we
1395  * define the copy blit in terms of that format.
1396  */
1397 static struct v3dv_image *
1398 create_image_alias(struct v3dv_cmd_buffer *cmd_buffer,
1399                    struct v3dv_image *src,
1400                    float width_scale,
1401                    float height_scale,
1402                    VkFormat format)
1403 {
1404    assert(!vk_format_is_compressed(format));
1405    /* We don't support ycbcr compressed formats */
1406    assert(src->plane_count == 1);
1407 
1408    VkDevice _device = v3dv_device_to_handle(cmd_buffer->device);
1409 
1410    VkImageCreateInfo info = {
1411       .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO,
1412       .imageType = src->vk.image_type,
1413       .format = format,
1414       .extent = {
1415          .width = src->vk.extent.width * width_scale,
1416          .height = src->vk.extent.height * height_scale,
1417          .depth = src->vk.extent.depth,
1418       },
1419       .mipLevels = src->vk.mip_levels,
1420       .arrayLayers = src->vk.array_layers,
1421       .samples = src->vk.samples,
1422       .tiling = src->tiled ? VK_IMAGE_TILING_OPTIMAL : VK_IMAGE_TILING_LINEAR,
1423       .usage = src->vk.usage,
1424    };
1425 
1426    VkImage _image;
1427    VkResult result =
1428       v3dv_CreateImage(_device, &info, &cmd_buffer->device->vk.alloc, &_image);
1429    if (result != VK_SUCCESS) {
1430       v3dv_flag_oom(cmd_buffer, NULL);
1431       return NULL;
1432    }
1433 
1434    v3dv_cmd_buffer_add_private_obj(
1435       cmd_buffer, (uintptr_t)_image,
1436       (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyImage);
1437 
1438    struct v3dv_image *image = v3dv_image_from_handle(_image);
1439    image->planes[0].mem = src->planes[0].mem;
1440    image->planes[0].mem_offset = src->planes[0].mem_offset;
1441    return image;
1442 }
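/* A minimal usage sketch (hypothetical values): aliasing a 64x64
 * ETC2_RGBA8 image as 16x16 RGBA32UI, one uncompressed texel per 4x4
 * block, covering the same 4 KB of storage:
 *
 *    struct v3dv_image *alias =
 *       create_image_alias(cmd_buffer, src, 1.0f / 4.0f, 1.0f / 4.0f,
 *                          VK_FORMAT_R32G32B32A32_UINT);
 */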
1443 
1444 /**
1445  * Returns true if the implementation supports the requested operation (even if
1446  * it failed to process it, for example, due to an out-of-memory error).
1447  */
1448 static bool
1449 copy_image_blit(struct v3dv_cmd_buffer *cmd_buffer,
1450                 struct v3dv_image *dst,
1451                 struct v3dv_image *src,
1452                 const VkImageCopy2 *region)
1453 {
1454    if (!src->tiled && src->vk.image_type != VK_IMAGE_TYPE_1D)
1455       return false;
1456 
1457    uint8_t src_plane =
1458       v3dv_plane_from_aspect(region->srcSubresource.aspectMask);
1459    assert(src_plane < src->plane_count);
1460    uint8_t dst_plane =
1461       v3dv_plane_from_aspect(region->dstSubresource.aspectMask);
1462    assert(dst_plane < dst->plane_count);
1463 
1464    const uint32_t src_block_w =
1465       vk_format_get_blockwidth(src->planes[src_plane].vk_format);
1466    const uint32_t src_block_h =
1467       vk_format_get_blockheight(src->planes[src_plane].vk_format);
1468    const uint32_t dst_block_w =
1469       vk_format_get_blockwidth(dst->planes[dst_plane].vk_format);
1470    const uint32_t dst_block_h =
1471       vk_format_get_blockheight(dst->planes[dst_plane].vk_format);
1472    const float block_scale_w = (float)src_block_w / (float)dst_block_w;
1473    const float block_scale_h = (float)src_block_h / (float)dst_block_h;
1474 
1475    /* We need to choose a single format for the blit to ensure that this is
1476     * really a copy and there are no format conversions going on. Since we
1477     * are going to blit, we need to make sure that the selected format can be
1478     * both rendered to and textured from.
1479     */
1480    VkFormat format;
1481    float src_scale_w = 1.0f;
1482    float src_scale_h = 1.0f;
1483    float dst_scale_w = block_scale_w;
1484    float dst_scale_h = block_scale_h;
1485    if (vk_format_is_compressed(src->vk.format)) {
1486       /* If we are copying from a compressed format we should be aware that we
1487        * are going to texture from the source image, and the texture setup
1488        * knows the actual size of the image, so we need to choose a format
1489        * that has a per-texel (not per-block) bpp that is compatible for that
1490        * image size. For example, for a source image with size Bw*WxBh*H
1491        * and format ETC2_RGBA8_UNORM copied to a WxH image of format RGBA32UI,
1492        * each of the Bw*WxBh*H texels in the compressed source image is 8-bit
1493        * (which translates to a 128-bit 4x4 RGBA32 block when uncompressed),
1494        * so we could specify a blit with size Bw*WxBh*H and a format with
1495        * a bpp of 8-bit per texel (R8_UINT).
1496        *
1497        * Unfortunately, when copying from a format like ETC2_RGB8A1_UNORM,
1498        * which is 64-bit per texel, then we would need a 4-bit format, which
1499        * which is 64-bit per block (4-bit per texel), we would need a 4-bit
1500        * format, which we don't have, so instead we still choose an 8-bit
1501        * format and apply a divisor to the row dimensions of the blit, since
1502        * we are copying two texels per item.
1503        * Generally, we can choose any format so long as we compute appropriate
1504        * divisors for the width and height depending on the source image's
1505        * bpp.
1506        */
1507       assert(src->planes[src_plane].cpp == dst->planes[dst_plane].cpp);
1508 
1509       format = VK_FORMAT_R32G32_UINT;
1510       switch (src->planes[src_plane].cpp) {
1511       case 16:
1512          format = VK_FORMAT_R32G32B32A32_UINT;
1513          break;
1514       case 8:
1515          format = VK_FORMAT_R16G16B16A16_UINT;
1516          break;
1517       default:
1518          unreachable("Unsupported compressed format");
1519       }
1520 
1521       /* Create image views of the src/dst images that we can interpret in
1522        * terms of the canonical format.
1523        */
1524       src_scale_w /= src_block_w;
1525       src_scale_h /= src_block_h;
1526       dst_scale_w /= src_block_w;
1527       dst_scale_h /= src_block_h;
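      /* E.g. for an ETC2_RGBA8 source (cpp = 16, 4x4 blocks) this selects
       * VK_FORMAT_R32G32B32A32_UINT and src_scale_w = src_scale_h = 1/4,
       * aliasing each 4x4 compressed block as a single RGBA32UI texel.
       */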
1528 
1529       src = create_image_alias(cmd_buffer, src,
1530                                src_scale_w, src_scale_h, format);
1531 
1532       dst = create_image_alias(cmd_buffer, dst,
1533                                dst_scale_w, dst_scale_h, format);
1534    } else {
1535       format = src->format->planes[src_plane].rt_type != V3D_OUTPUT_IMAGE_FORMAT_NO ?
1536          src->planes[src_plane].vk_format :
1537          get_compatible_tlb_format(src->planes[src_plane].vk_format);
1538       if (format == VK_FORMAT_UNDEFINED)
1539          return false;
1540 
1541       const struct v3dv_format *f = v3dv_X(cmd_buffer->device, get_format)(format);
1542       assert(f->plane_count < 2);
1543       if (!f->plane_count || f->planes[0].tex_type == TEXTURE_DATA_FORMAT_NO)
1544          return false;
1545    }
1546 
1547    /* Given an uncompressed image with size WxH, if we copy it to a compressed
1548     * image, it will result in an image with size W*bWxH*bH, where bW and bH
1549     * are the compressed format's block width and height. This means that
1550     * copies between compressed and uncompressed images involve different
1551     * image sizes, and therefore, we need to take that into account when
1552     * setting up the source and destination blit regions below, so they are
1553     * consistent from the point of view of the single compatible format
1554     * selected for the copy.
1555     *
1556     * We should take into account that the dimensions of the region provided
1557     * to the copy command are specified in terms of the source image. With that
1558     * in mind, below we adjust the blit destination region to be consistent with
1559     * the source region for the compatible format, so basically, we apply
1560     * the block scale factor to the destination offset provided by the copy
1561     * command (because it is specified in terms of the destination image, not
1562     * the source), and then we just add the region copy dimensions to that
1563     * (since the region dimensions are already specified in terms of the source
1564     * image).
1565     */
1566    uint32_t region_width = region->extent.width * src_scale_w;
1567    uint32_t region_height = region->extent.height * src_scale_h;
1568    if (src_block_w > 1)
1569       region_width = util_next_power_of_two(region_width);
1570    if (src_block_h > 1)
1571       region_height = util_next_power_of_two(region_height);
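   /* E.g. a 12x8 texel region of the 4x4-block source above scales to
    * 3x2 items, which the power-of-two rounding widens to 4x2.
    */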
1572 
1573    const VkOffset3D src_start = {
1574       region->srcOffset.x * src_scale_w,
1575       region->srcOffset.y * src_scale_h,
1576       region->srcOffset.z,
1577    };
1578    const VkOffset3D src_end = {
1579       src_start.x + region_width,
1580       src_start.y + region_height,
1581       src_start.z + region->extent.depth,
1582    };
1583 
1584    const VkOffset3D dst_start = {
1585       region->dstOffset.x * dst_scale_w,
1586       region->dstOffset.y * dst_scale_h,
1587       region->dstOffset.z,
1588    };
1589    const VkOffset3D dst_end = {
1590       dst_start.x + region_width,
1591       dst_start.y + region_height,
1592       dst_start.z + region->extent.depth,
1593    };
1594 
1595    const VkImageBlit2 blit_region = {
1596       .sType = VK_STRUCTURE_TYPE_IMAGE_BLIT_2,
1597       .srcSubresource = region->srcSubresource,
1598       .srcOffsets = { src_start, src_end },
1599       .dstSubresource = region->dstSubresource,
1600       .dstOffsets = { dst_start, dst_end },
1601    };
1602    bool handled = blit_shader(cmd_buffer,
1603                               dst, format,
1604                               src, format,
1605                               0, NULL,
1606                               &blit_region, VK_FILTER_NEAREST, true);
1607 
1608    /* We should have selected formats that we can blit */
1609    assert(handled);
1610    return handled;
1611 }
1612 
1613 static bool
1614 copy_image_linear_texel_buffer(struct v3dv_cmd_buffer *cmd_buffer,
1615                                struct v3dv_image *dst,
1616                                struct v3dv_image *src,
1617                                const VkImageCopy2 *region)
1618 {
1619    if (src->tiled)
1620       return false;
1621 
1622    /* Implementations are allowed to restrict linear images like this */
1623    assert(region->srcOffset.z == 0);
1624    assert(region->dstOffset.z == 0);
1625    assert(region->srcSubresource.mipLevel == 0);
1626    assert(region->srcSubresource.baseArrayLayer == 0);
1627    assert(region->srcSubresource.layerCount == 1);
1628    assert(region->dstSubresource.mipLevel == 0);
1629    assert(region->dstSubresource.baseArrayLayer == 0);
1630    assert(region->dstSubresource.layerCount == 1);
1631 
1632    uint8_t src_plane =
1633       v3dv_plane_from_aspect(region->srcSubresource.aspectMask);
1634    uint8_t dst_plane =
1635       v3dv_plane_from_aspect(region->dstSubresource.aspectMask);
1636 
1637    assert(src->planes[src_plane].cpp == dst->planes[dst_plane].cpp);
1638    const uint32_t bpp = src->planes[src_plane].cpp;
1639 
1640    VkFormat format;
1641    switch (bpp) {
1642    case 16:
1643       format = VK_FORMAT_R32G32B32A32_UINT;
1644       break;
1645    case 8:
1646       format = VK_FORMAT_R16G16B16A16_UINT;
1647       break;
1648    case 4:
1649       format = VK_FORMAT_R8G8B8A8_UINT;
1650       break;
1651    case 2:
1652       format = VK_FORMAT_R16_UINT;
1653       break;
1654    case 1:
1655       format = VK_FORMAT_R8_UINT;
1656       break;
1657    default:
1658       unreachable("unsupported bit-size");
1659       return false;
1660    }
1661 
1662    VkComponentMapping ident_swizzle = {
1663       .r = VK_COMPONENT_SWIZZLE_IDENTITY,
1664       .g = VK_COMPONENT_SWIZZLE_IDENTITY,
1665       .b = VK_COMPONENT_SWIZZLE_IDENTITY,
1666       .a = VK_COMPONENT_SWIZZLE_IDENTITY,
1667    };
1668 
1669    const uint32_t buf_stride = src->planes[src_plane].slices[0].stride;
1670    const VkDeviceSize buf_offset =
1671       region->srcOffset.y * buf_stride + region->srcOffset.x * bpp;
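   /* Worked example: with srcOffset = (2, 3), a 256-byte stride and
    * bpp = 4, the copy starts at byte 3 * 256 + 2 * 4 = 776 of the
    * linear source plane.
    */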
1672 
1673    struct v3dv_buffer src_buffer;
1674    vk_object_base_init(&cmd_buffer->device->vk, &src_buffer.base,
1675                        VK_OBJECT_TYPE_BUFFER);
1676 
1677    const struct VkBufferCreateInfo buf_create_info = {
1678       .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
1679       .size = src->planes[src_plane].size,
1680       .usage = VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT,
1681       .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
1682    };
1683    v3dv_buffer_init(cmd_buffer->device, &buf_create_info, &src_buffer,
1684                     src->planes[src_plane].alignment);
1685 
1686    const VkBindBufferMemoryInfo buf_bind_info = {
1687       .sType = VK_STRUCTURE_TYPE_BIND_BUFFER_MEMORY_INFO,
1688       .buffer = v3dv_buffer_to_handle(&src_buffer),
1689       .memory = v3dv_device_memory_to_handle(src->planes[src_plane].mem),
1690       .memoryOffset = src->planes[src_plane].mem_offset +
1691          v3dv_layer_offset(src, 0, 0, src_plane),
1692    };
1693    v3dv_buffer_bind_memory(&buf_bind_info);
1694 
1695    const VkBufferImageCopy2 copy_region = {
1696       .sType = VK_STRUCTURE_TYPE_BUFFER_IMAGE_COPY_2,
1697       .pNext = NULL,
1698       .bufferOffset = buf_offset,
1699       .bufferRowLength = buf_stride / bpp,
1700       .bufferImageHeight = src->vk.extent.height,
1701       .imageSubresource = region->dstSubresource,
1702       .imageOffset = region->dstOffset,
1703       .imageExtent = region->extent,
1704    };
1705 
1706    return texel_buffer_shader_copy(cmd_buffer,
1707                                    region->dstSubresource.aspectMask,
1708                                    dst,
1709                                    format,
1710                                    format,
1711                                    &src_buffer,
1712                                    src->planes[src_plane].cpp,
1713                                    0 /* color mask: full */, &ident_swizzle,
1714                                    1, &copy_region);
1715 }
1716 
1717 VKAPI_ATTR void VKAPI_CALL
1718 v3dv_CmdCopyImage2(VkCommandBuffer commandBuffer,
1719                    const VkCopyImageInfo2 *info)
1721 {
1722    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
1723    V3DV_FROM_HANDLE(v3dv_image, src, info->srcImage);
1724    V3DV_FROM_HANDLE(v3dv_image, dst, info->dstImage);
1725 
1726    assert(src->vk.samples == dst->vk.samples);
1727 
1728    cmd_buffer->state.is_transfer = true;
1729 
1730    for (uint32_t i = 0; i < info->regionCount; i++) {
1731       const VkImageCopy2 *region = &info->pRegions[i];
1732       if (copy_image_tfu(cmd_buffer, dst, src, region))
1733          continue;
1734       if (copy_image_tlb(cmd_buffer, dst, src, region))
1735          continue;
1736       if (copy_image_blit(cmd_buffer, dst, src, region))
1737          continue;
1738       if (copy_image_linear_texel_buffer(cmd_buffer, dst, src, region))
1739          continue;
1740       unreachable("Image copy not supported");
1741    }
1742 
1743    cmd_buffer->state.is_transfer = false;
1744 }
1745 
1746 VKAPI_ATTR void VKAPI_CALL
1747 v3dv_CmdCopyBuffer2(VkCommandBuffer commandBuffer,
1748                     const VkCopyBufferInfo2 *pCopyBufferInfo)
1749 {
1750    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
1751    V3DV_FROM_HANDLE(v3dv_buffer, src_buffer, pCopyBufferInfo->srcBuffer);
1752    V3DV_FROM_HANDLE(v3dv_buffer, dst_buffer, pCopyBufferInfo->dstBuffer);
1753 
1754    cmd_buffer->state.is_transfer = true;
1755 
1756    for (uint32_t i = 0; i < pCopyBufferInfo->regionCount; i++) {
1757       v3dv_X(cmd_buffer->device, meta_copy_buffer)
1758          (cmd_buffer,
1759           dst_buffer->mem->bo, dst_buffer->mem_offset,
1760           src_buffer->mem->bo, src_buffer->mem_offset,
1761           &pCopyBufferInfo->pRegions[i]);
1762    }
1763 
1764    cmd_buffer->state.is_transfer = false;
1765 }
1766 
1767 static void
1768 destroy_update_buffer_cb(VkDevice _device,
1769                          uint64_t pobj,
1770                          VkAllocationCallbacks *alloc)
1771 {
1772    V3DV_FROM_HANDLE(v3dv_device, device, _device);
1773    struct v3dv_bo *bo = (struct v3dv_bo *)((uintptr_t) pobj);
1774    v3dv_bo_free(device, bo);
1775 }
1776 
1777 VKAPI_ATTR void VKAPI_CALL
1778 v3dv_CmdUpdateBuffer(VkCommandBuffer commandBuffer,
1779                      VkBuffer dstBuffer,
1780                      VkDeviceSize dstOffset,
1781                      VkDeviceSize dataSize,
1782                      const void *pData)
1783 {
1784    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
1785    V3DV_FROM_HANDLE(v3dv_buffer, dst_buffer, dstBuffer);
1786 
1787    struct v3dv_bo *src_bo =
1788       v3dv_bo_alloc(cmd_buffer->device, dataSize, "vkCmdUpdateBuffer", true);
1789    if (!src_bo) {
1790       fprintf(stderr, "Failed to allocate BO for vkCmdUpdateBuffer.\n");
1791       return;
1792    }
1793 
1794    bool ok = v3dv_bo_map(cmd_buffer->device, src_bo, src_bo->size);
1795    if (!ok) {
1796       fprintf(stderr, "Failed to map BO for vkCmdUpdateBuffer.\n");
1797       return;
1798    }
1799 
1800    cmd_buffer->state.is_transfer = true;
1801 
1802    memcpy(src_bo->map, pData, dataSize);
1803 
1804    v3dv_bo_unmap(cmd_buffer->device, src_bo);
1805 
1806    VkBufferCopy2 region = {
1807       .sType = VK_STRUCTURE_TYPE_BUFFER_COPY_2,
1808       .srcOffset = 0,
1809       .dstOffset = dstOffset,
1810       .size = dataSize,
1811    };
1812    struct v3dv_job *copy_job =
1813       v3dv_X(cmd_buffer->device, meta_copy_buffer)
1814       (cmd_buffer, dst_buffer->mem->bo, dst_buffer->mem_offset,
1815        src_bo, 0, &region);
1816 
1817    if (copy_job) {
1818       v3dv_cmd_buffer_add_private_obj(
1819          cmd_buffer, (uint64_t)(uintptr_t)src_bo, destroy_update_buffer_cb);
1820    }
1821 
1822    cmd_buffer->state.is_transfer = false;
1823 }
1824 
1825 VKAPI_ATTR void VKAPI_CALL
1826 v3dv_CmdFillBuffer(VkCommandBuffer commandBuffer,
1827                    VkBuffer dstBuffer,
1828                    VkDeviceSize dstOffset,
1829                    VkDeviceSize size,
1830                    uint32_t data)
1831 {
1832    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
1833    V3DV_FROM_HANDLE(v3dv_buffer, dst_buffer, dstBuffer);
1834 
1835    cmd_buffer->state.is_transfer = true;
1836 
1837    struct v3dv_bo *bo = dst_buffer->mem->bo;
1838 
1839    /* From the Vulkan spec:
1840     *
1841     *   "If VK_WHOLE_SIZE is used and the remaining size of the buffer is not
1842     *    a multiple of 4, then the nearest smaller multiple is used."
1843     */
1844    if (size == VK_WHOLE_SIZE) {
1845       size = dst_buffer->size - dstOffset;
1846       size -= size % 4;
1847    }
1848 
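   /* E.g. filling a 1000-byte buffer from dstOffset = 10 leaves 990
    * bytes, which rounds down to 988, the nearest smaller multiple of 4.
    */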
1849    v3dv_X(cmd_buffer->device, meta_fill_buffer)
1850       (cmd_buffer, bo, dstOffset, size, data);
1851 
1852    cmd_buffer->state.is_transfer = false;
1853 }
1854 
1855 /**
1856  * Returns true if the implementation supports the requested operation (even if
1857  * it failed to process it, for example, due to an out-of-memory error).
1858  */
1859 static bool
1860 copy_buffer_to_image_tfu(struct v3dv_cmd_buffer *cmd_buffer,
1861                          struct v3dv_image *image,
1862                          struct v3dv_buffer *buffer,
1863                          const VkBufferImageCopy2 *region)
1864 {
1865    if (V3D_DBG(DISABLE_TFU)) {
1866       perf_debug("Copy buffer to image: TFU disabled, fallbacks could be slower.\n");
1867       return false;
1868    }
1869 
1870    assert(image->vk.samples == VK_SAMPLE_COUNT_1_BIT);
1871 
1872    /* Destination can't be raster format */
1873    if (!image->tiled)
1874       return false;
1875 
1876    /* We can't copy D24S8 because buffer to image copies only copy one aspect
1877     * at a time, and the TFU copies full images. Also, the V3D depth bits
1878     * for both D24S8 and D24X8 are stored in the 24 MSBs of each 32-bit
1879     * word, but the Vulkan spec specifies the buffer data the other way
1880     * around, so it is not a straight copy: we would have to swizzle the
1881     * channels, which the TFU can't do.
1882     */
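   /* Illustratively, per the comment above and Vulkan's packing rules,
    * the mismatched 32-bit word layouts would be roughly:
    *
    *    V3D word:      depth in bits 31:8
    *    Vulkan buffer: depth in bits 23:0 (high bits unused/undefined)
    */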
1883    if (image->vk.format == VK_FORMAT_D24_UNORM_S8_UINT ||
1884        image->vk.format == VK_FORMAT_X8_D24_UNORM_PACK32) {
1885          return false;
1886    }
1887 
1888    /* Region must include full slice */
1889    const uint32_t offset_x = region->imageOffset.x;
1890    const uint32_t offset_y = region->imageOffset.y;
1891    if (offset_x != 0 || offset_y != 0)
1892       return false;
1893 
1894    uint32_t width, height;
1895    if (region->bufferRowLength == 0)
1896       width = region->imageExtent.width;
1897    else
1898       width = region->bufferRowLength;
1899 
1900    if (region->bufferImageHeight == 0)
1901       height = region->imageExtent.height;
1902    else
1903       height = region->bufferImageHeight;
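   /* Per the spec, zero means the buffer is tightly packed, so row length
    * and image height default to the image extent: e.g. a 100x60 copy
    * with both fields at zero reads rows of exactly 100 texels.
    */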
1904 
1905    const uint8_t plane =
1906       v3dv_plane_from_aspect(region->imageSubresource.aspectMask);
1907 
1908    const uint32_t mip_level = region->imageSubresource.mipLevel;
1909    const struct v3d_resource_slice *slice = &image->planes[plane].slices[mip_level];
1910 
1911    if (width != slice->width || height != slice->height)
1912       return false;
1913 
1914    /* Handle region semantics for compressed images */
1915    const uint32_t block_w =
1916       vk_format_get_blockwidth(image->planes[plane].vk_format);
1917    const uint32_t block_h =
1918       vk_format_get_blockheight(image->planes[plane].vk_format);
1919    width = DIV_ROUND_UP(width, block_w);
1920    height = DIV_ROUND_UP(height, block_h);
1921 
1922    /* Format must be supported for texturing via the TFU. Since we are just
1923     * copying raw data and not converting between pixel formats, we can ignore
1924     * the image's format and choose a compatible TFU format for the image
1925     * texel size instead, which expands the list of formats we can handle here.
1926     */
1927    const struct v3dv_format *format =
1928       v3dv_get_compatible_tfu_format(cmd_buffer->device,
1929                                      image->planes[plane].cpp, NULL);
1930    /* We only use single-plane formats with the TFU */
1931    assert(format->plane_count == 1);
1932    const struct v3dv_format_plane *format_plane = &format->planes[0];
1933 
1934    uint32_t num_layers;
1935    if (image->vk.image_type != VK_IMAGE_TYPE_3D) {
1936       num_layers = vk_image_subresource_layer_count(&image->vk,
1937                                                     &region->imageSubresource);
1938    } else {
1939       num_layers = region->imageExtent.depth;
1940    }
1941    assert(num_layers > 0);
1942 
1943    assert(image->planes[plane].mem && image->planes[plane].mem->bo);
1944    const struct v3dv_bo *dst_bo = image->planes[plane].mem->bo;
1945 
1946    assert(buffer->mem && buffer->mem->bo);
1947    const struct v3dv_bo *src_bo = buffer->mem->bo;
1948 
1949    /* Emit a TFU job per layer to copy */
1950    const uint32_t buffer_stride = width * image->planes[plane].cpp;
1951    for (int i = 0; i < num_layers; i++) {
1952       uint32_t layer;
1953       if (image->vk.image_type != VK_IMAGE_TYPE_3D)
1954          layer = region->imageSubresource.baseArrayLayer + i;
1955       else
1956          layer = region->imageOffset.z + i;
1957 
1958       const uint32_t buffer_offset =
1959          buffer->mem_offset + region->bufferOffset +
1960          height * buffer_stride * i;
1961       const uint32_t src_offset = src_bo->offset + buffer_offset;
1962 
1963       const uint32_t dst_offset =
1964          dst_bo->offset + v3dv_layer_offset(image, mip_level, layer, plane);
1965 
1966       v3dv_X(cmd_buffer->device, meta_emit_tfu_job)(
1967              cmd_buffer,
1968              dst_bo->handle,
1969              dst_offset,
1970              slice->tiling,
1971              slice->padded_height,
1972              image->planes[plane].cpp,
1973              src_bo->handle,
1974              src_offset,
1975              V3D_TILING_RASTER,
1976              width,
1977              1,
1978              width, height, format_plane);
1979    }
1980 
1981    return true;
1982 }
1983 
1984 /**
1985  * Returns true if the implementation supports the requested operation (even if
1986  * it failed to process it, for example, due to an out-of-memory error).
1987  */
1988 static bool
1989 copy_buffer_to_image_tlb(struct v3dv_cmd_buffer *cmd_buffer,
1990                          struct v3dv_image *image,
1991                          struct v3dv_buffer *buffer,
1992                          const VkBufferImageCopy2 *region)
1993 {
1994    VkFormat fb_format;
1995    uint8_t plane = v3dv_plane_from_aspect(region->imageSubresource.aspectMask);
1996    assert(plane < image->plane_count);
1997 
1998    if (!v3dv_meta_can_use_tlb(image, plane, region->imageSubresource.mipLevel,
1999                               &region->imageOffset, &region->imageExtent,
2000                               &fb_format)) {
2001       return false;
2002    }
2003 
2004    /* From the Vulkan spec for VkBufferImageCopy2:
2005     *
2006     *   "The aspectMask member of imageSubresource must only have a
2007     *    single bit set."
2008     *
2009     * For us this has relevant implications because we can't do TLB stores
2010     * of linear depth/stencil so we work around this by loading D/S data to the
2011     * color tile buffer using a compatible color format (see
2012     * emit_copy_buffer_to_layer_per_tile_list and choose_tlb_format functions).
2013     * However, when we are copying a single aspect to a combined D/S image
2014     * we need to preserve the other aspect, and for that we will still use the
2015     * D/S tile buffer to load and store the aspect of the image we need to
2016     * preserve, so in this case we are still constrained by the hw restriction
2017     * for linear D/S stores.
2018     */
2019    assert(util_bitcount(region->imageSubresource.aspectMask) == 1);
2020    if (!image->tiled &&
2021        vk_format_has_depth(fb_format) &&
2022        vk_format_has_stencil(fb_format)) {
2023       return false;
2024    }
2025 
2026    uint32_t internal_type, internal_bpp;
2027    v3dv_X(cmd_buffer->device, get_internal_type_bpp_for_image_aspects)
2028       (fb_format, region->imageSubresource.aspectMask,
2029        &internal_type, &internal_bpp);
2030 
2031    uint32_t num_layers;
2032    if (image->vk.image_type != VK_IMAGE_TYPE_3D) {
2033       num_layers = vk_image_subresource_layer_count(&image->vk,
2034                                                     &region->imageSubresource);
2035    } else {
2036       num_layers = region->imageExtent.depth;
2037    }
2038    assert(num_layers > 0);
2039 
2040    struct v3dv_job *job =
2041       v3dv_cmd_buffer_start_job(cmd_buffer, -1, V3DV_JOB_TYPE_GPU_CL);
2042    if (!job)
2043       return true;
2044 
2045    /* Handle copy to compressed format using a compatible format */
2046    const uint32_t block_w =
2047       vk_format_get_blockwidth(image->planes[plane].vk_format);
2048    const uint32_t block_h =
2049       vk_format_get_blockheight(image->planes[plane].vk_format);
2050    const uint32_t width = DIV_ROUND_UP(region->imageExtent.width, block_w);
2051    const uint32_t height = DIV_ROUND_UP(region->imageExtent.height, block_h);
2052 
2053    v3dv_job_start_frame(job, width, height, num_layers, false, true, 1,
2054                         internal_bpp, 4 * v3d_internal_bpp_words(internal_bpp),
2055                         false);
2056 
2057    struct v3dv_meta_framebuffer framebuffer;
2058    v3dv_X(job->device, meta_framebuffer_init)(&framebuffer, fb_format,
2059                                               internal_type, &job->frame_tiling);
2060 
2061    v3dv_X(job->device, job_emit_binning_flush)(job);
2062    v3dv_X(job->device, meta_emit_copy_buffer_to_image_rcl)
2063       (job, image, buffer, &framebuffer, region);
2064 
2065    v3dv_cmd_buffer_finish_job(cmd_buffer);
2066 
2067    return true;
2068 }
2069 
2070 static bool
2071 create_tiled_image_from_buffer(struct v3dv_cmd_buffer *cmd_buffer,
2072                                struct v3dv_image *image,
2073                                struct v3dv_buffer *buffer,
2074                                const VkBufferImageCopy2 *region)
2075 {
2076    if (copy_buffer_to_image_tfu(cmd_buffer, image, buffer, region))
2077       return true;
2078    if (copy_buffer_to_image_tlb(cmd_buffer, image, buffer, region))
2079       return true;
2080    return false;
2081 }
2082 
2083 static VkResult
2084 create_texel_buffer_copy_descriptor_pool(struct v3dv_cmd_buffer *cmd_buffer)
2085 {
2086    /* If this is not the first pool we create for this command buffer,
2087     * size it based on the size of the currently exhausted pool.
2088     */
2089    uint32_t descriptor_count = 64;
2090    if (cmd_buffer->meta.texel_buffer_copy.dspool != VK_NULL_HANDLE) {
2091       struct v3dv_descriptor_pool *exhausted_pool =
2092          v3dv_descriptor_pool_from_handle(cmd_buffer->meta.texel_buffer_copy.dspool);
2093       descriptor_count = MIN2(exhausted_pool->max_entry_count * 2, 1024);
2094    }
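   /* The pool thus grows geometrically: 64 sets, then 128, 256, 512, and
    * a 1024-set ceiling for every pool created after that.
    */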
2095 
2096    /* Create the descriptor pool */
2097    cmd_buffer->meta.texel_buffer_copy.dspool = VK_NULL_HANDLE;
2098    VkDescriptorPoolSize pool_size = {
2099       .type = VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER,
2100       .descriptorCount = descriptor_count,
2101    };
2102    VkDescriptorPoolCreateInfo info = {
2103       .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO,
2104       .maxSets = descriptor_count,
2105       .poolSizeCount = 1,
2106       .pPoolSizes = &pool_size,
2107       .flags = 0,
2108    };
2109    VkResult result =
2110       v3dv_CreateDescriptorPool(v3dv_device_to_handle(cmd_buffer->device),
2111                                 &info,
2112                                 &cmd_buffer->device->vk.alloc,
2113                                 &cmd_buffer->meta.texel_buffer_copy.dspool);
2114 
2115    if (result == VK_SUCCESS) {
2116       assert(cmd_buffer->meta.texel_buffer_copy.dspool != VK_NULL_HANDLE);
2117       const VkDescriptorPool _pool = cmd_buffer->meta.texel_buffer_copy.dspool;
2118 
2119       v3dv_cmd_buffer_add_private_obj(
2120          cmd_buffer, (uintptr_t) _pool,
2121          (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyDescriptorPool);
2122 
2123       struct v3dv_descriptor_pool *pool =
2124          v3dv_descriptor_pool_from_handle(_pool);
2125       pool->is_driver_internal = true;
2126    }
2127 
2128    return result;
2129 }
2130 
2131 static VkResult
2132 allocate_texel_buffer_copy_descriptor_set(struct v3dv_cmd_buffer *cmd_buffer,
2133                                           VkDescriptorSet *set)
2134 {
2135    /* Make sure we have a descriptor pool */
2136    VkResult result;
2137    if (cmd_buffer->meta.texel_buffer_copy.dspool == VK_NULL_HANDLE) {
2138       result = create_texel_buffer_copy_descriptor_pool(cmd_buffer);
2139       if (result != VK_SUCCESS)
2140          return result;
2141    }
2142    assert(cmd_buffer->meta.texel_buffer_copy.dspool != VK_NULL_HANDLE);
2143 
2144    /* Allocate descriptor set */
2145    struct v3dv_device *device = cmd_buffer->device;
2146    VkDevice _device = v3dv_device_to_handle(device);
2147    VkDescriptorSetAllocateInfo info = {
2148       .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO,
2149       .descriptorPool = cmd_buffer->meta.texel_buffer_copy.dspool,
2150       .descriptorSetCount = 1,
2151       .pSetLayouts = &device->meta.texel_buffer_copy.ds_layout,
2152    };
2153    result = v3dv_AllocateDescriptorSets(_device, &info, set);
2154 
2155    /* If we ran out of pool space, grow the pool and try again */
2156    if (result == VK_ERROR_OUT_OF_POOL_MEMORY) {
2157       result = create_texel_buffer_copy_descriptor_pool(cmd_buffer);
2158       if (result == VK_SUCCESS) {
2159          info.descriptorPool = cmd_buffer->meta.texel_buffer_copy.dspool;
2160          result = v3dv_AllocateDescriptorSets(_device, &info, set);
2161       }
2162    }
2163 
2164    return result;
2165 }
2166 
2167 static void
2168 get_texel_buffer_copy_pipeline_cache_key(VkFormat format,
2169                                          VkColorComponentFlags cmask,
2170                                          VkComponentMapping *cswizzle,
2171                                          bool is_layered,
2172                                          uint8_t *key)
2173 {
2174    memset(key, 0, V3DV_META_TEXEL_BUFFER_COPY_CACHE_KEY_SIZE);
2175 
2176    uint32_t *p = (uint32_t *) key;
2177 
2178    *p = format;
2179    p++;
2180 
2181    *p = cmask;
2182    p++;
2183 
2184    /* Note that we are using a single byte for this, so we could pack
2185     * more data into this 32-bit slot in the future.
2186     */
2187    *p = is_layered ? 1 : 0;
2188    p++;
2189 
2190    memcpy(p, cswizzle, sizeof(VkComponentMapping));
2191    p += sizeof(VkComponentMapping) / sizeof(uint32_t);
2192 
2193    assert(((uint8_t*)p - key) == V3DV_META_TEXEL_BUFFER_COPY_CACHE_KEY_SIZE);
2194 }
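/* The resulting key layout, assuming 4-byte VkFormat and
 * VkColorComponentFlags values and a 16-byte VkComponentMapping:
 *
 *    bytes  0-3   format
 *    bytes  4-7   color write mask
 *    bytes  8-11  layered flag (0 or 1)
 *    bytes 12-27  component swizzle
 *
 * which adds up to the 28-byte V3DV_META_TEXEL_BUFFER_COPY_CACHE_KEY_SIZE
 * checked by the assert above.
 */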
2195 
2196 static bool
2197 create_blit_render_pass(struct v3dv_device *device,
2198                         VkFormat dst_format,
2199                         VkFormat src_format,
2200                         VkRenderPass *pass_load,
2201                         VkRenderPass *pass_no_load);
2202 
2203 static bool
2204 create_pipeline(struct v3dv_device *device,
2205                 struct v3dv_render_pass *pass,
2206                 struct nir_shader *vs_nir,
2207                 struct nir_shader *gs_nir,
2208                 struct nir_shader *fs_nir,
2209                 const VkPipelineVertexInputStateCreateInfo *vi_state,
2210                 const VkPipelineDepthStencilStateCreateInfo *ds_state,
2211                 const VkPipelineColorBlendStateCreateInfo *cb_state,
2212                 const VkPipelineMultisampleStateCreateInfo *ms_state,
2213                 const VkPipelineLayout layout,
2214                 VkPipeline *pipeline);
2215 
2216 static nir_shader *
2217 get_texel_buffer_copy_vs(const nir_shader_compiler_options *options)
2218 {
2219    nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_VERTEX, options,
2220                                                   "meta texel buffer copy vs");
2221    nir_variable *vs_out_pos =
2222       nir_variable_create(b.shader, nir_var_shader_out,
2223                           glsl_vec4_type(), "gl_Position");
2224    vs_out_pos->data.location = VARYING_SLOT_POS;
2225 
2226    nir_def *pos = nir_gen_rect_vertices(&b, NULL, NULL);
2227    nir_store_var(&b, vs_out_pos, pos, 0xf);
2228 
2229    return b.shader;
2230 }
2231 
2232 static nir_shader *
2233 get_texel_buffer_copy_gs(const nir_shader_compiler_options *options)
2234 {
2235    /* FIXME: this creates a geometry shader that takes the index of a single
2236     * layer to copy from push constants, so we need to emit a draw call for
2237     * each layer that we want to copy. We could actually do better and have it
2238     * take a range of layers however, if we were to do this, we would need to
2239     * be careful not to exceed the maximum number of output vertices allowed in
2240     * a geometry shader.
2241     */
2242    nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_GEOMETRY, options,
2243                                                   "meta texel buffer copy gs");
2244    nir_shader *nir = b.shader;
2245    nir->info.inputs_read = 1ull << VARYING_SLOT_POS;
2246    nir->info.outputs_written = (1ull << VARYING_SLOT_POS) |
2247                                (1ull << VARYING_SLOT_LAYER);
2248    nir->info.gs.input_primitive = MESA_PRIM_TRIANGLES;
2249    nir->info.gs.output_primitive = MESA_PRIM_TRIANGLE_STRIP;
2250    nir->info.gs.vertices_in = 3;
2251    nir->info.gs.vertices_out = 3;
2252    nir->info.gs.invocations = 1;
2253    nir->info.gs.active_stream_mask = 0x1;
2254 
2255    /* in vec4 gl_Position[3] */
2256    nir_variable *gs_in_pos =
2257       nir_variable_create(b.shader, nir_var_shader_in,
2258                           glsl_array_type(glsl_vec4_type(), 3, 0),
2259                           "in_gl_Position");
2260    gs_in_pos->data.location = VARYING_SLOT_POS;
2261 
2262    /* out vec4 gl_Position */
2263    nir_variable *gs_out_pos =
2264       nir_variable_create(b.shader, nir_var_shader_out, glsl_vec4_type(),
2265                           "out_gl_Position");
2266    gs_out_pos->data.location = VARYING_SLOT_POS;
2267 
2268    /* out float gl_Layer */
2269    nir_variable *gs_out_layer =
2270       nir_variable_create(b.shader, nir_var_shader_out, glsl_float_type(),
2271                           "out_gl_Layer");
2272    gs_out_layer->data.location = VARYING_SLOT_LAYER;
2273 
2274    /* Emit output triangle */
2275    for (uint32_t i = 0; i < 3; i++) {
2276       /* gl_Position from shader input */
2277       nir_deref_instr *in_pos_i =
2278          nir_build_deref_array_imm(&b, nir_build_deref_var(&b, gs_in_pos), i);
2279       nir_copy_deref(&b, nir_build_deref_var(&b, gs_out_pos), in_pos_i);
2280 
2281       /* gl_Layer from push constants */
2282       nir_def *layer =
2283          nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0),
2284                                 .base = TEXEL_BUFFER_COPY_GS_LAYER_PC_OFFSET,
2285                                 .range = 4);
2286       nir_store_var(&b, gs_out_layer, layer, 0x1);
2287 
2288       nir_emit_vertex(&b, 0);
2289    }
2290 
2291    nir_end_primitive(&b, 0);
2292 
2293    return nir;
2294 }
2295 
2296 static nir_def *
2297 load_frag_coord(nir_builder *b)
2298 {
2299    nir_foreach_shader_in_variable(var, b->shader) {
2300       if (var->data.location == VARYING_SLOT_POS)
2301          return nir_load_var(b, var);
2302    }
2303    nir_variable *pos = nir_variable_create(b->shader, nir_var_shader_in,
2304                                            glsl_vec4_type(), NULL);
2305    pos->data.location = VARYING_SLOT_POS;
2306    return nir_load_var(b, pos);
2307 }
2308 
2309 static uint32_t
2310 component_swizzle_to_nir_swizzle(VkComponentSwizzle comp, VkComponentSwizzle swz)
2311 {
2312    if (swz == VK_COMPONENT_SWIZZLE_IDENTITY)
2313       swz = comp;
2314 
2315    switch (swz) {
2316    case VK_COMPONENT_SWIZZLE_R:
2317       return 0;
2318    case VK_COMPONENT_SWIZZLE_G:
2319       return 1;
2320    case VK_COMPONENT_SWIZZLE_B:
2321       return 2;
2322    case VK_COMPONENT_SWIZZLE_A:
2323       return 3;
2324    default:
2325       unreachable("Invalid swizzle");
2326    };
2327 }
2328 
2329 static nir_shader *
2330 get_texel_buffer_copy_fs(const nir_shader_compiler_options *options,
2331                          VkFormat format, VkComponentMapping *cswizzle)
2332 {
2333    nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_FRAGMENT, options,
2334                                                   "meta texel buffer copy fs");
2335 
2336    /* We only use the copy from texel buffer shader to implement
2337     * copy_buffer_to_image_shader, which always selects a compatible integer
2338     * format for the copy.
2339     */
2340    assert(vk_format_is_int(format));
2341 
2342    /* Fragment shader output color */
2343    nir_variable *fs_out_color =
2344       nir_variable_create(b.shader, nir_var_shader_out,
2345                           glsl_uvec4_type(), "out_color");
2346    fs_out_color->data.location = FRAG_RESULT_DATA0;
2347 
2348    /* Texel buffer input */
2349    const struct glsl_type *sampler_type =
2350       glsl_sampler_type(GLSL_SAMPLER_DIM_BUF, false, false, GLSL_TYPE_UINT);
2351    nir_variable *sampler =
2352       nir_variable_create(b.shader, nir_var_uniform, sampler_type, "texel_buf");
2353    sampler->data.descriptor_set = 0;
2354    sampler->data.binding = 0;
2355 
2356    /* Load the box describing the pixel region we want to copy from the
2357     * texel buffer.
2358     */
2359    nir_def *box =
2360       nir_load_push_constant(&b, 4, 32, nir_imm_int(&b, 0),
2361                              .base = TEXEL_BUFFER_COPY_FS_BOX_PC_OFFSET,
2362                              .range = 16);
2363 
2364    /* Load the buffer stride (this comes in texel units) */
2365    nir_def *stride =
2366       nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0),
2367                              .base = TEXEL_BUFFER_COPY_FS_STRIDE_PC_OFFSET,
2368                              .range = 4);
2369 
2370    /* Load the buffer offset (this comes in texel units) */
2371    nir_def *offset =
2372       nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0),
2373                              .base = TEXEL_BUFFER_COPY_FS_OFFSET_PC_OFFSET,
2374                              .range = 4);
2375 
2376    nir_def *coord = nir_f2i32(&b, load_frag_coord(&b));
2377 
2378    /* Load pixel data from texel buffer based on the x,y offset of the pixel
2379     * within the box. Texel buffers are 1D arrays of texels.
2380     *
2381     * Notice that we already make sure that we only generate fragments that are
2382     * inside the box through the scissor/viewport state, so our offset into the
2383     * texel buffer should always be within its bounds and we don't need
2384     * to add a check for that here.
2385     */
2386    nir_def *x_offset =
2387       nir_isub(&b, nir_channel(&b, coord, 0),
2388                    nir_channel(&b, box, 0));
2389    nir_def *y_offset =
2390       nir_isub(&b, nir_channel(&b, coord, 1),
2391                    nir_channel(&b, box, 1));
2392    nir_def *texel_offset =
2393       nir_iadd(&b, nir_iadd(&b, offset, x_offset),
2394                    nir_imul(&b, y_offset, stride));
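   /* Worked example: a fragment at (13, 7) in a box with origin (8, 4),
    * with offset = 100 texels and stride = 64 texels, fetches texel
    * 100 + (13 - 8) + (7 - 4) * 64 = 297.
    */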
2395 
2396    nir_def *tex_deref = &nir_build_deref_var(&b, sampler)->def;
2397    nir_tex_instr *tex = nir_tex_instr_create(b.shader, 2);
2398    tex->sampler_dim = GLSL_SAMPLER_DIM_BUF;
2399    tex->op = nir_texop_txf;
2400    tex->src[0] = nir_tex_src_for_ssa(nir_tex_src_coord, texel_offset);
2401    tex->src[1] = nir_tex_src_for_ssa(nir_tex_src_texture_deref, tex_deref);
2402    tex->dest_type = nir_type_uint32;
2403    tex->is_array = false;
2404    tex->coord_components = 1;
2405    nir_def_init(&tex->instr, &tex->def, 4, 32);
2406    nir_builder_instr_insert(&b, &tex->instr);
2407 
2408    uint32_t swiz[4];
2409    swiz[0] =
2410       component_swizzle_to_nir_swizzle(VK_COMPONENT_SWIZZLE_R, cswizzle->r);
2411    swiz[1] =
2412       component_swizzle_to_nir_swizzle(VK_COMPONENT_SWIZZLE_G, cswizzle->g);
2413    swiz[2] =
2414       component_swizzle_to_nir_swizzle(VK_COMPONENT_SWIZZLE_B, cswizzle->b);
2415    swiz[3] =
2416       component_swizzle_to_nir_swizzle(VK_COMPONENT_SWIZZLE_A, cswizzle->a);
2417    nir_def *s = nir_swizzle(&b, &tex->def, swiz, 4);
2418    nir_store_var(&b, fs_out_color, s, 0xf);
2419 
2420    return b.shader;
2421 }
2422 
2423 static bool
2424 create_texel_buffer_copy_pipeline(struct v3dv_device *device,
2425                                   VkFormat format,
2426                                   VkColorComponentFlags cmask,
2427                                   VkComponentMapping *cswizzle,
2428                                   bool is_layered,
2429                                   VkRenderPass _pass,
2430                                   VkPipelineLayout pipeline_layout,
2431                                   VkPipeline *pipeline)
2432 {
2433    struct v3dv_render_pass *pass = v3dv_render_pass_from_handle(_pass);
2434 
2435    assert(vk_format_is_color(format));
2436 
2437    const nir_shader_compiler_options *options =
2438       v3dv_pipeline_get_nir_options(&device->devinfo);
2439 
2440    nir_shader *vs_nir = get_texel_buffer_copy_vs(options);
2441    nir_shader *fs_nir = get_texel_buffer_copy_fs(options, format, cswizzle);
2442    nir_shader *gs_nir = is_layered ? get_texel_buffer_copy_gs(options) : NULL;
2443 
2444    const VkPipelineVertexInputStateCreateInfo vi_state = {
2445       .sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO,
2446       .vertexBindingDescriptionCount = 0,
2447       .vertexAttributeDescriptionCount = 0,
2448    };
2449 
2450    VkPipelineDepthStencilStateCreateInfo ds_state = {
2451       .sType = VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO,
2452    };
2453 
2454    VkPipelineColorBlendAttachmentState blend_att_state[1] = { 0 };
2455    blend_att_state[0] = (VkPipelineColorBlendAttachmentState) {
2456       .blendEnable = false,
2457       .colorWriteMask = cmask,
2458    };
2459 
2460    const VkPipelineColorBlendStateCreateInfo cb_state = {
2461       .sType = VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO,
2462       .logicOpEnable = false,
2463       .attachmentCount = 1,
2464       .pAttachments = blend_att_state
2465    };
2466 
2467    const VkPipelineMultisampleStateCreateInfo ms_state = {
2468       .sType = VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO,
2469       .rasterizationSamples = VK_SAMPLE_COUNT_1_BIT,
2470       .sampleShadingEnable = false,
2471       .pSampleMask = NULL,
2472       .alphaToCoverageEnable = false,
2473       .alphaToOneEnable = false,
2474    };
2475 
2476    return create_pipeline(device,
2477                           pass,
2478                           vs_nir, gs_nir, fs_nir,
2479                           &vi_state,
2480                           &ds_state,
2481                           &cb_state,
2482                           &ms_state,
2483                           pipeline_layout,
2484                           pipeline);
2485 }
2486 
2487 static bool
2488 get_copy_texel_buffer_pipeline(
2489    struct v3dv_cmd_buffer *cmd_buffer,
2490    VkFormat format,
2491    VkColorComponentFlags cmask,
2492    VkComponentMapping *cswizzle,
2493    VkImageType image_type,
2494    bool is_layered,
2495    struct v3dv_meta_texel_buffer_copy_pipeline **pipeline)
2496 {
2497    bool ok = true;
2498    struct v3dv_device *device = cmd_buffer->device;
2499 
2500    uint8_t key[V3DV_META_TEXEL_BUFFER_COPY_CACHE_KEY_SIZE];
2501    if (device->instance->meta_cache_enabled) {
2502       get_texel_buffer_copy_pipeline_cache_key(format, cmask, cswizzle, is_layered,
2503                                                key);
2504 
2505       mtx_lock(&device->meta.mtx);
2506       struct hash_entry *entry =
2507          _mesa_hash_table_search(device->meta.texel_buffer_copy.cache[image_type],
2508                                  key);
2509       if (entry) {
2510          mtx_unlock(&device->meta.mtx);
2511          *pipeline = entry->data;
2512          return true;
2513       }
2514    }
2515 
2516    *pipeline = vk_zalloc2(&device->vk.alloc, NULL, sizeof(**pipeline), 8,
2517                           VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
2518 
2519    if (*pipeline == NULL)
2520       goto fail;
2521 
2522    /* The blit render pass is compatible */
2523    ok = create_blit_render_pass(device, format, format,
2524                                 &(*pipeline)->pass,
2525                                 &(*pipeline)->pass_no_load);
2526    if (!ok)
2527       goto fail;
2528 
2529    ok =
2530       create_texel_buffer_copy_pipeline(device,
2531                                         format, cmask, cswizzle, is_layered,
2532                                         (*pipeline)->pass,
2533                                         device->meta.texel_buffer_copy.p_layout,
2534                                         &(*pipeline)->pipeline);
2535    if (!ok)
2536       goto fail;
2537 
2538    if (device->instance->meta_cache_enabled) {
2539       _mesa_hash_table_insert(device->meta.texel_buffer_copy.cache[image_type],
2540                               key, *pipeline);
2541       mtx_unlock(&device->meta.mtx);
2542    } else {
2543       v3dv_cmd_buffer_add_private_obj(
2544          cmd_buffer, (uintptr_t)*pipeline,
2545          (v3dv_cmd_buffer_private_obj_destroy_cb)destroy_meta_texel_buffer_copy_pipeline);
2546    }
2547 
2548    return true;
2549 
2550 fail:
2551    if (device->instance->meta_cache_enabled)
2552       mtx_unlock(&device->meta.mtx);
2553 
2554    VkDevice _device = v3dv_device_to_handle(device);
2555    if (*pipeline) {
2556       if ((*pipeline)->pass)
2557          v3dv_DestroyRenderPass(_device, (*pipeline)->pass, &device->vk.alloc);
2558       if ((*pipeline)->pipeline)
2559          v3dv_DestroyPipeline(_device, (*pipeline)->pipeline, &device->vk.alloc);
2560       vk_free(&device->vk.alloc, *pipeline);
2561       *pipeline = NULL;
2562    }
2563 
2564    return false;
2565 }
2566 
2567 static bool
2568 texel_buffer_shader_copy(struct v3dv_cmd_buffer *cmd_buffer,
2569                          VkImageAspectFlags aspect,
2570                          struct v3dv_image *image,
2571                          VkFormat dst_format,
2572                          VkFormat src_format,
2573                          struct v3dv_buffer *buffer,
2574                          uint32_t buffer_bpp,
2575                          VkColorComponentFlags cmask,
2576                          VkComponentMapping *cswizzle,
2577                          uint32_t region_count,
2578                          const VkBufferImageCopy2 *regions)
2579 {
2580    VkResult result;
2581    bool handled = false;
2582 
2583    assert(cswizzle);
2584 
2585    /* This is a copy path, so we don't handle format conversions. The only
2586     * exception are stencil to D24S8 copies, which are handled as a color
2587     * masked R8->RGBA8 copy.
2588     */
2589    assert(src_format == dst_format ||
2590           (dst_format == VK_FORMAT_R8G8B8A8_UINT &&
2591            src_format == VK_FORMAT_R8_UINT &&
2592            cmask == VK_COLOR_COMPONENT_R_BIT));
2593 
2594    /* We only handle color copies. Callers can copy D/S aspects by using
2595     * a compatible color format and maybe a cmask/cswizzle for D24 formats.
2596     */
2597    if (!vk_format_is_color(dst_format) || !vk_format_is_color(src_format))
2598       return handled;
2599 
2600    /* FIXME: we only handle uncompressed images for now. */
2601    if (vk_format_is_compressed(image->vk.format))
2602       return handled;
2603 
2604    const VkColorComponentFlags full_cmask = VK_COLOR_COMPONENT_R_BIT |
2605                                             VK_COLOR_COMPONENT_G_BIT |
2606                                             VK_COLOR_COMPONENT_B_BIT |
2607                                             VK_COLOR_COMPONENT_A_BIT;
2608    if (cmask == 0)
2609       cmask = full_cmask;
2610 
2611    /* The buffer needs to have VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT
2612     * so we can bind it as a texel buffer. Otherwise, the buffer view
2613     * we create below won't set up the texture state that we need for this.
2614     */
2615    if (!(buffer->usage & VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT)) {
2616       if (v3dv_buffer_format_supports_features(
2617              cmd_buffer->device, src_format,
2618              VK_FORMAT_FEATURE_2_UNIFORM_TEXEL_BUFFER_BIT)) {
2619          buffer->usage |= VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT;
2620       } else {
2621          return handled;
2622       }
2623    }
2624 
2625    /* At this point we should be able to handle the copy unless an unexpected
2626     * error occurs, such as an OOM.
2627     */
2628    handled = true;
2629 
2631    /* Compute the number of layers to copy.
2632     *
2633     * If we are batching (region_count > 1) all our regions have the same
2634     * image subresource so we can take this from the first region. For 3D
2635     * images we require the same depth extent.
2636     */
2637    const VkImageSubresourceLayers *resource = &regions[0].imageSubresource;
2638    uint32_t num_layers;
2639    if (image->vk.image_type != VK_IMAGE_TYPE_3D) {
2640       num_layers = vk_image_subresource_layer_count(&image->vk, resource);
2641    } else {
2642       assert(region_count == 1);
2643       num_layers = regions[0].imageExtent.depth;
2644    }
2645    assert(num_layers > 0);
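   /* For a 3D image each depth slice maps to one framebuffer layer below,
    * e.g. a region with imageExtent.depth == 4 renders through a 4-layer
    * framebuffer, with one render pass per slice.
    */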
2646 
2647    /* Get the texel buffer copy pipeline */
2648    struct v3dv_meta_texel_buffer_copy_pipeline *pipeline = NULL;
2649    bool ok = get_copy_texel_buffer_pipeline(cmd_buffer,
2650                                             dst_format, cmask, cswizzle,
2651                                             image->vk.image_type, num_layers > 1,
2652                                             &pipeline);
2653    if (!ok)
2654       return handled;
2655    assert(pipeline && pipeline->pipeline && pipeline->pass);
2656 
2657    /* Setup descriptor set for the source texel buffer. We don't have to
2658     * register the descriptor as a private command buffer object since
2659     * all descriptors will be freed automatically with the descriptor
2660     * pool.
2661     */
2662    VkDescriptorSet set;
2663    result = allocate_texel_buffer_copy_descriptor_set(cmd_buffer, &set);
2664    if (result != VK_SUCCESS)
2665       return handled;
2666 
2667    /* We can't pass region->bufferOffset here for the offset field because
2668     * the texture base pointer in the texture shader state must be a 64-byte
2669     * aligned value. Instead, we use 0 here and we pass the offset in texels
2670     * as a push constant to the shader.
2671     */
2672    VkDevice _device = v3dv_device_to_handle(cmd_buffer->device);
2673    VkBufferViewCreateInfo buffer_view_info = {
2674       .sType = VK_STRUCTURE_TYPE_BUFFER_VIEW_CREATE_INFO,
2675       .buffer = v3dv_buffer_to_handle(buffer),
2676       .format = src_format,
2677       .offset = 0,
2678       .range = VK_WHOLE_SIZE,
2679    };
2680 
2681    VkBufferView texel_buffer_view;
2682    result = v3dv_CreateBufferView(_device, &buffer_view_info,
2683                                   &cmd_buffer->device->vk.alloc,
2684                                   &texel_buffer_view);
2685    if (result != VK_SUCCESS)
2686       return handled;
2687 
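   /* The buffer view must stay alive until the command buffer completes
    * execution, so its destruction is deferred through the command buffer's
    * private object list below instead of happening when we return.
    */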
2688    v3dv_cmd_buffer_add_private_obj(
2689       cmd_buffer, (uintptr_t)texel_buffer_view,
2690       (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyBufferView);
2691 
2692    VkWriteDescriptorSet write = {
2693       .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
2694       .dstSet = set,
2695       .dstBinding = 0,
2696       .dstArrayElement = 0,
2697       .descriptorCount = 1,
2698       .descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER,
2699       .pTexelBufferView = &texel_buffer_view,
2700    };
2701    v3dv_UpdateDescriptorSets(_device, 1, &write, 0, NULL);
2702 
2703    /* Push command buffer state before starting meta operation */
2704    v3dv_cmd_buffer_meta_state_push(cmd_buffer, true);
2705 
2706    /* Bind common state for all layers and regions  */
2707    VkCommandBuffer _cmd_buffer = v3dv_cmd_buffer_to_handle(cmd_buffer);
2708    v3dv_CmdBindPipeline(_cmd_buffer,
2709                         VK_PIPELINE_BIND_POINT_GRAPHICS,
2710                         pipeline->pipeline);
2711 
2712    v3dv_CmdBindDescriptorSets(_cmd_buffer,
2713                               VK_PIPELINE_BIND_POINT_GRAPHICS,
2714                               cmd_buffer->device->meta.texel_buffer_copy.p_layout,
2715                               0, 1, &set,
2716                               0, NULL);
2717 
2718    /* Setup framebuffer.
2719     *
2720     * For 3D images, this creates a layered framebuffer with a number of
2721     * layers matching the depth extent of the 3D image.
2722     */
2723    uint8_t plane = v3dv_plane_from_aspect(aspect);
2724    uint32_t fb_width = u_minify(image->planes[plane].width, resource->mipLevel);
2725    uint32_t fb_height = u_minify(image->planes[plane].height, resource->mipLevel);
2726 
2727    VkImageViewCreateInfo image_view_info = {
2728       .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO,
2729       .image = v3dv_image_to_handle(image),
2730       .viewType = v3dv_image_type_to_view_type(image->vk.image_type),
2731       .format = dst_format,
2732       .subresourceRange = {
2733          .aspectMask = aspect,
2734          .baseMipLevel = resource->mipLevel,
2735          .levelCount = 1,
2736          .baseArrayLayer = resource->baseArrayLayer,
2737          .layerCount = num_layers,
2738       },
2739    };
2740    VkImageView image_view;
2741    result = v3dv_create_image_view(cmd_buffer->device,
2742                                    &image_view_info, &image_view);
2743    if (result != VK_SUCCESS)
2744       goto fail;
2745 
2746    v3dv_cmd_buffer_add_private_obj(
2747       cmd_buffer, (uintptr_t)image_view,
2748       (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyImageView);
2749 
2750    VkFramebufferCreateInfo fb_info = {
2751       .sType = VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO,
2752       .renderPass = pipeline->pass,
2753       .attachmentCount = 1,
2754       .pAttachments = &image_view,
2755       .width = fb_width,
2756       .height = fb_height,
2757       .layers = num_layers,
2758    };
2759 
2760    VkFramebuffer fb;
2761    result = v3dv_CreateFramebuffer(_device, &fb_info,
2762                                    &cmd_buffer->device->vk.alloc, &fb);
2763    if (result != VK_SUCCESS)
2764       goto fail;
2765 
2766    v3dv_cmd_buffer_add_private_obj(
2767       cmd_buffer, (uintptr_t)fb,
2768       (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyFramebuffer);
2769 
2770    /* For each layer */
2771    for (uint32_t l = 0; l < num_layers; l++) {
2772       /* Start render pass for this layer.
2773        *
2774        * If we only have one region to copy, then we might be able to
2775        * skip the TLB load if it is aligned to tile boundaries. All layers
2776        * copy the same area, so we only need to check this once.
2777        */
2778       bool can_skip_tlb_load = false;
2779       VkRect2D render_area;
2780       if (region_count == 1) {
2781          render_area.offset.x = regions[0].imageOffset.x;
2782          render_area.offset.y = regions[0].imageOffset.y;
2783          render_area.extent.width = regions[0].imageExtent.width;
2784          render_area.extent.height = regions[0].imageExtent.height;
2785 
2786          if (l == 0) {
2787             struct v3dv_render_pass *pipeline_pass =
2788                v3dv_render_pass_from_handle(pipeline->pass);
2789             can_skip_tlb_load =
2790                cmask == full_cmask &&
2791                v3dv_subpass_area_is_tile_aligned(cmd_buffer->device, &render_area,
2792                                                  v3dv_framebuffer_from_handle(fb),
2793                                                  pipeline_pass, 0);
2794          }
2795       } else {
2796          render_area.offset.x = 0;
2797          render_area.offset.y = 0;
2798          render_area.extent.width = fb_width;
2799          render_area.extent.height = fb_height;
2800       }
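      /* When batching multiple regions the render area conservatively covers
       * the whole framebuffer, so the TLB load is never skipped in that case.
       */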
2801 
2802       VkRenderPassBeginInfo rp_info = {
2803          .sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO,
2804          .renderPass = can_skip_tlb_load ? pipeline->pass_no_load :
2805                                            pipeline->pass,
2806          .framebuffer = fb,
2807          .renderArea = render_area,
2808          .clearValueCount = 0,
2809       };
2810 
2811       VkSubpassBeginInfo sp_info = {
2812          .sType = VK_STRUCTURE_TYPE_SUBPASS_BEGIN_INFO,
2813          .contents = VK_SUBPASS_CONTENTS_INLINE,
2814       };
2815 
2816       v3dv_CmdBeginRenderPass2(_cmd_buffer, &rp_info, &sp_info);
2817       struct v3dv_job *job = cmd_buffer->state.job;
2818       if (!job)
2819          goto fail;
2820 
2821       /* If we are using a layered copy we need to specify the layer for the
2822        * Geometry Shader.
2823        */
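      /* The layer index is pushed at offset 24, right after the 24 bytes of
       * per-region data consumed by the fragment shader (see push_data
       * below), so the two push constant ranges don't overlap.
       */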
2824       if (num_layers > 1) {
2825          uint32_t layer = resource->baseArrayLayer + l;
2826          v3dv_CmdPushConstants(_cmd_buffer,
2827                                cmd_buffer->device->meta.texel_buffer_copy.p_layout,
2828                                VK_SHADER_STAGE_GEOMETRY_BIT,
2829                                24, 4, &layer);
2830       }
2831 
2832       /* For each region */
2833       for (uint32_t r = 0; r < region_count; r++) {
2834          const VkBufferImageCopy2 *region = &regions[r];
2835 
2836          /* Obtain the 2D buffer region spec */
2837          uint32_t buf_width, buf_height;
2838          if (region->bufferRowLength == 0)
2839              buf_width = region->imageExtent.width;
2840          else
2841              buf_width = region->bufferRowLength;
2842 
2843          if (region->bufferImageHeight == 0)
2844              buf_height = region->imageExtent.height;
2845          else
2846              buf_height = region->bufferImageHeight;
2847 
2848          const VkViewport viewport = {
2849             .x = region->imageOffset.x,
2850             .y = region->imageOffset.y,
2851             .width = region->imageExtent.width,
2852             .height = region->imageExtent.height,
2853             .minDepth = 0.0f,
2854             .maxDepth = 1.0f
2855          };
2856          v3dv_CmdSetViewport(_cmd_buffer, 0, 1, &viewport);
2857          const VkRect2D scissor = {
2858             .offset = { region->imageOffset.x, region->imageOffset.y },
2859             .extent = { region->imageExtent.width, region->imageExtent.height }
2860          };
2861          v3dv_CmdSetScissor(_cmd_buffer, 0, 1, &scissor);
2862 
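         /* Per-region data for the fragment shader, pushed at offset 0: the
          * destination box (x0, y0, x1, y1) in pixels, the buffer row stride
          * in texels, and the offset in texels of this layer's data in the
          * buffer (bufferOffset is in bytes, hence the division by the bpp).
          */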
2863          const VkDeviceSize buf_offset =
2864             region->bufferOffset / buffer_bpp + l * buf_height * buf_width;
2865          uint32_t push_data[6] = {
2866             region->imageOffset.x,
2867             region->imageOffset.y,
2868             region->imageOffset.x + region->imageExtent.width - 1,
2869             region->imageOffset.y + region->imageExtent.height - 1,
2870             buf_width,
2871             buf_offset,
2872          };
2873 
2874          v3dv_CmdPushConstants(_cmd_buffer,
2875                                cmd_buffer->device->meta.texel_buffer_copy.p_layout,
2876                                VK_SHADER_STAGE_FRAGMENT_BIT,
2877                                0, sizeof(push_data), &push_data);
2878 
2879          v3dv_CmdDraw(_cmd_buffer, 4, 1, 0, 0);
2880       } /* For each region */
2881 
2882       VkSubpassEndInfo sp_end_info = {
2883          .sType = VK_STRUCTURE_TYPE_SUBPASS_END_INFO,
2884       };
2885 
2886       v3dv_CmdEndRenderPass2(_cmd_buffer, &sp_end_info);
2887    } /* For each layer */
2888 
2889 fail:
2890    v3dv_cmd_buffer_meta_state_pop(cmd_buffer, true);
2891    return handled;
2892 }
2893 
2894 /**
2895  * Returns true if the implementation supports the requested operation (even if
2896  * it failed to process it, for example, due to an out-of-memory error).
2897  */
2898 static bool
2899 copy_buffer_to_image_blit(struct v3dv_cmd_buffer *cmd_buffer,
2900                           VkImageAspectFlags aspect,
2901                           struct v3dv_image *image,
2902                           VkFormat dst_format,
2903                           VkFormat src_format,
2904                           struct v3dv_buffer *buffer,
2905                           uint32_t buffer_bpp,
2906                           VkColorComponentFlags cmask,
2907                           VkComponentMapping *cswizzle,
2908                           uint32_t region_count,
2909                           const VkBufferImageCopy2 *regions)
2910 {
2911    /* Since we can't sample linear images we need to upload the linear
2912     * buffer to a tiled image that we can use as a blit source, which
2913     * is slow.
2914     */
2915    perf_debug("Falling back to blit path for buffer to image copy.\n");
2916 
2917    struct v3dv_device *device = cmd_buffer->device;
2918    VkDevice _device = v3dv_device_to_handle(device);
2919    bool handled = true;
2920 
2921    /* Allocate memory for the tiled image. Since we copy layer by layer
2922     * we allocate memory to hold a full layer, which is the worst case.
2923     * For that we create a dummy image with that spec, get memory requirements
2924     * for it and use that information to create the memory allocation.
2925     * We will then reuse this memory store for all the regions we want to
2926     * copy.
2927     */
2928    VkImage dummy_image;
2929    VkImageCreateInfo dummy_info = {
2930       .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO,
2931       .imageType = VK_IMAGE_TYPE_2D,
2932       .format = src_format,
2933       .extent = { image->vk.extent.width, image->vk.extent.height, 1 },
2934       .mipLevels = 1,
2935       .arrayLayers = 1,
2936       .samples = VK_SAMPLE_COUNT_1_BIT,
2937       .tiling = VK_IMAGE_TILING_OPTIMAL,
2938       .usage = VK_IMAGE_USAGE_TRANSFER_SRC_BIT |
2939                VK_IMAGE_USAGE_TRANSFER_DST_BIT,
2940       .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
2941       .queueFamilyIndexCount = 0,
2942       .initialLayout = VK_IMAGE_LAYOUT_GENERAL,
2943    };
2944    VkResult result =
2945       v3dv_CreateImage(_device, &dummy_info, &device->vk.alloc, &dummy_image);
2946    if (result != VK_SUCCESS)
2947       return handled;
2948 
2949    VkMemoryRequirements reqs;
2950    vk_common_GetImageMemoryRequirements(_device, dummy_image, &reqs);
2951    v3dv_DestroyImage(_device, dummy_image, &device->vk.alloc);
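   /* The dummy image was only needed to query the memory requirements; the
    * allocation below is what we keep and rebind for every temporary image.
    */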
2952 
2953    VkDeviceMemory mem;
2954    VkMemoryAllocateInfo alloc_info = {
2955       .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
2956       .allocationSize = reqs.size,
2957       .memoryTypeIndex = 0,
2958    };
2959    result = v3dv_AllocateMemory(_device, &alloc_info, &device->vk.alloc, &mem);
2960    if (result != VK_SUCCESS)
2961       return handled;
2962 
2963    v3dv_cmd_buffer_add_private_obj(
2964       cmd_buffer, (uintptr_t)mem,
2965       (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_FreeMemory);
2966 
2967    /* Obtain the layer count.
2968     *
2969     * If we are batching (region_count > 1) all our regions have the same
2970     * image subresource so we can take this from the first region.
2971     */
2972    uint32_t num_layers;
2973    if (image->vk.image_type != VK_IMAGE_TYPE_3D) {
2974       num_layers = vk_image_subresource_layer_count(&image->vk,
2975                                                     &regions[0].imageSubresource);
2976    } else {
2977       num_layers = regions[0].imageExtent.depth;
2978    }
2979    assert(num_layers > 0);
2980 
2981    /* Sanity check: we can only batch multiple regions together if they have
2982     * the same framebuffer (so the same layer).
2983     */
2984    assert(num_layers == 1 || region_count == 1);
2985 
2986    uint8_t plane = v3dv_plane_from_aspect(aspect);
2987    assert(plane < image->plane_count);
2988 
2989    const uint32_t block_width =
2990       vk_format_get_blockwidth(image->planes[plane].vk_format);
2991    const uint32_t block_height =
2992       vk_format_get_blockheight(image->planes[plane].vk_format);
2993 
2994    /* Copy regions by uploading each region to a temporary tiled image using
2995     * the memory we have just allocated as storage.
2996     */
2997    for (uint32_t r = 0; r < region_count; r++) {
2998       const VkBufferImageCopy2 *region = &regions[r];
2999 
3000       /* Obtain the 2D buffer region spec */
3001       uint32_t buf_width, buf_height;
3002       if (region->bufferRowLength == 0)
3003           buf_width = region->imageExtent.width;
3004       else
3005           buf_width = region->bufferRowLength;
3006 
3007       if (region->bufferImageHeight == 0)
3008           buf_height = region->imageExtent.height;
3009       else
3010           buf_height = region->bufferImageHeight;
3011 
3012       /* If the image is compressed, the bpp refers to blocks, not pixels */
3013       buf_width = buf_width / block_width;
3014       buf_height = buf_height / block_height;
3015 
3016       for (uint32_t i = 0; i < num_layers; i++) {
3017          /* Create the tiled image */
3018          VkImageCreateInfo image_info = {
3019             .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO,
3020             .imageType = VK_IMAGE_TYPE_2D,
3021             .format = src_format,
3022             .extent = { buf_width, buf_height, 1 },
3023             .mipLevels = 1,
3024             .arrayLayers = 1,
3025             .samples = VK_SAMPLE_COUNT_1_BIT,
3026             .tiling = VK_IMAGE_TILING_OPTIMAL,
3027             .usage = VK_IMAGE_USAGE_TRANSFER_SRC_BIT |
3028                      VK_IMAGE_USAGE_TRANSFER_DST_BIT,
3029             .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
3030             .queueFamilyIndexCount = 0,
3031             .initialLayout = VK_IMAGE_LAYOUT_GENERAL,
3032          };
3033 
3034          VkImage buffer_image;
3035          VkResult result =
3036             v3dv_CreateImage(_device, &image_info, &device->vk.alloc,
3037                              &buffer_image);
3038          if (result != VK_SUCCESS)
3039             return handled;
3040 
3041          v3dv_cmd_buffer_add_private_obj(
3042             cmd_buffer, (uintptr_t)buffer_image,
3043             (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyImage);
3044 
3045          result = vk_common_BindImageMemory(_device, buffer_image, mem, 0);
3046          if (result != VK_SUCCESS)
3047             return handled;
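         /* Every temporary image is bound to the same worst-case allocation
          * at offset 0, so the memory store is reused across all regions and
          * layers.
          */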
3048 
3049          /* When copying a multi-plane image the aspect indicates the plane to
3050           * copy. For these, we only copy one plane at a time, which is always
3051           * a color plane.
3052           */
3053          VkImageAspectFlags copy_aspect =
3054             image->plane_count == 1 ? aspect : VK_IMAGE_ASPECT_COLOR_BIT;
3055 
3056          /* Upload buffer contents for the selected layer */
3057          const VkDeviceSize buf_offset_bytes =
3058             region->bufferOffset + i * buf_height * buf_width * buffer_bpp;
3059          const VkBufferImageCopy2 buffer_image_copy = {
3060             .sType = VK_STRUCTURE_TYPE_BUFFER_IMAGE_COPY_2,
3061             .bufferOffset = buf_offset_bytes,
3062             .bufferRowLength = region->bufferRowLength / block_width,
3063             .bufferImageHeight = region->bufferImageHeight / block_height,
3064             .imageSubresource = {
3065                .aspectMask = copy_aspect,
3066                .mipLevel = 0,
3067                .baseArrayLayer = 0,
3068                .layerCount = 1,
3069             },
3070             .imageOffset = { 0, 0, 0 },
3071             .imageExtent = { buf_width, buf_height, 1 }
3072          };
3073          handled =
3074             create_tiled_image_from_buffer(cmd_buffer,
3075                                            v3dv_image_from_handle(buffer_image),
3076                                            buffer, &buffer_image_copy);
3077          if (!handled) {
3078             /* This is unexpected, we should have set up the upload to be
3079              * conformant to a TFU or TLB copy.
3080              */
3081             unreachable("Unable to copy buffer to image through TLB");
3082             return false;
3083          }
3084 
3085          /* Blit-copy the requested image extent from the buffer image to the
3086           * destination image.
3087           *
3088           * Since we are copying, the blit must use the same format on the
3089           * destination and source images to avoid format conversions. The
3090           * only exception is copying stencil, which we upload to a R8UI source
3091           * image, but that we need to blit to a S8D24 destination (the only
3092           * stencil format we support).
3093           */
3094          const VkImageBlit2 blit_region = {
3095             .sType = VK_STRUCTURE_TYPE_IMAGE_BLIT_2,
3096             .srcSubresource = {
3097                .aspectMask = copy_aspect,
3098                .mipLevel = 0,
3099                .baseArrayLayer = 0,
3100                .layerCount = 1,
3101             },
3102             .srcOffsets = {
3103                { 0, 0, 0 },
3104                { region->imageExtent.width, region->imageExtent.height, 1 },
3105             },
3106             .dstSubresource = {
3107                .aspectMask = aspect,
3108                .mipLevel = region->imageSubresource.mipLevel,
3109                .baseArrayLayer = region->imageSubresource.baseArrayLayer + i,
3110                .layerCount = 1,
3111             },
3112             .dstOffsets = {
3113                {
3114                   DIV_ROUND_UP(region->imageOffset.x, block_width),
3115                   DIV_ROUND_UP(region->imageOffset.y, block_height),
3116                   region->imageOffset.z + i,
3117                },
3118                {
3119                   DIV_ROUND_UP(region->imageOffset.x + region->imageExtent.width,
3120                                block_width),
3121                   DIV_ROUND_UP(region->imageOffset.y + region->imageExtent.height,
3122                                block_height),
3123                   region->imageOffset.z + i + 1,
3124                },
3125             },
3126          };
3127 
3128          handled = blit_shader(cmd_buffer,
3129                                image, dst_format,
3130                                v3dv_image_from_handle(buffer_image), src_format,
3131                                cmask, cswizzle,
3132                                &blit_region, VK_FILTER_NEAREST, true);
3133          if (!handled) {
3134             /* This is unexpected, we should have a supported blit spec */
3135             unreachable("Unable to blit buffer to destination image");
3136             return false;
3137          }
3138       }
3139    }
3140 
3141    return handled;
3142 }
3143 
3144 /**
3145  * Returns true if the implementation supports the requested operation (even if
3146  * it failed to process it, for example, due to an out-of-memory error).
3147  */
3148 static bool
3149 copy_buffer_to_image_shader(struct v3dv_cmd_buffer *cmd_buffer,
3150                             struct v3dv_image *image,
3151                             struct v3dv_buffer *buffer,
3152                             uint32_t region_count,
3153                             const VkBufferImageCopy2 *regions,
3154                             bool use_texel_buffer)
3155 {
3156    /* We can only call this with region_count > 1 if we can batch the regions
3157     * together, in which case they share the same image subresource, and so
3158     * the same aspect.
3159     */
3160    VkImageAspectFlags aspect = regions[0].imageSubresource.aspectMask;
3161    const VkImageAspectFlagBits any_plane_aspect =
3162       VK_IMAGE_ASPECT_PLANE_0_BIT |
3163       VK_IMAGE_ASPECT_PLANE_1_BIT |
3164       VK_IMAGE_ASPECT_PLANE_2_BIT;
3165 
3166    bool is_plane_aspect = aspect & any_plane_aspect;
3167 
3168    /* Generally, the bpp of the data in the buffer matches that of the
3169     * destination image. The exception is the case where we are uploading
3170     * stencil (8bpp) to a combined d24s8 image (32bpp).
3171     */
3172    uint8_t plane = v3dv_plane_from_aspect(aspect);
3173    assert(plane < image->plane_count);
3174    uint32_t buf_bpp = image->planes[plane].cpp;
3175 
3176    /* We are about to upload the buffer data to an image so we can then
3177     * blit that to our destination region. Because we are going to implement
3178     * the copy as a blit, we want our blit source and destination formats to be
3179     * the same (to avoid any format conversions), so we choose a canonical
3180     * format that matches the destination image bpp.
3181     */
3182    VkComponentMapping ident_swizzle = {
3183       .r = VK_COMPONENT_SWIZZLE_IDENTITY,
3184       .g = VK_COMPONENT_SWIZZLE_IDENTITY,
3185       .b = VK_COMPONENT_SWIZZLE_IDENTITY,
3186       .a = VK_COMPONENT_SWIZZLE_IDENTITY,
3187    };
3188 
3189    VkComponentMapping cswizzle = ident_swizzle;
3190    VkColorComponentFlags cmask = 0; /* Write all components */
3191    VkFormat src_format;
3192    VkFormat dst_format;
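   /* Canonical uint formats per texel size (D/S aspects get special
    * handling in the 4 bpp case below):
    *   16 bpp -> R32G32B32A32_UINT     8 bpp -> R16G16B16A16_UINT
    *    4 bpp -> R8G8B8A8_UINT         2 bpp -> R16_UINT
    *    1 bpp -> R8_UINT
    */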
3193    switch (buf_bpp) {
3194    case 16:
3195       assert(aspect == VK_IMAGE_ASPECT_COLOR_BIT);
3196       src_format = VK_FORMAT_R32G32B32A32_UINT;
3197       dst_format = src_format;
3198       break;
3199    case 8:
3200       assert(aspect == VK_IMAGE_ASPECT_COLOR_BIT);
3201       src_format = VK_FORMAT_R16G16B16A16_UINT;
3202       dst_format = src_format;
3203       break;
3204    case 4:
3205       switch (aspect) {
3206       case VK_IMAGE_ASPECT_COLOR_BIT:
3207       case VK_IMAGE_ASPECT_PLANE_0_BIT:
3208       case VK_IMAGE_ASPECT_PLANE_1_BIT:
3209       case VK_IMAGE_ASPECT_PLANE_2_BIT:
3210          src_format = VK_FORMAT_R8G8B8A8_UINT;
3211          dst_format = src_format;
3212          break;
3213       case VK_IMAGE_ASPECT_DEPTH_BIT:
3214          assert(image->vk.format == VK_FORMAT_D32_SFLOAT ||
3215                 image->vk.format == VK_FORMAT_D24_UNORM_S8_UINT ||
3216                 image->vk.format == VK_FORMAT_X8_D24_UNORM_PACK32);
3217          src_format = VK_FORMAT_R8G8B8A8_UINT;
3218          dst_format = src_format;
3219 
3220          /* For D24 formats, the Vulkan spec states that the depth component
3221           * in the buffer is stored in the 24 LSBs, but V3D wants it in the
3222           * 24 MSBs.
3223           */
3224          if (image->vk.format == VK_FORMAT_D24_UNORM_S8_UINT ||
3225              image->vk.format == VK_FORMAT_X8_D24_UNORM_PACK32) {
3226             cmask = VK_COLOR_COMPONENT_G_BIT |
3227                     VK_COLOR_COMPONENT_B_BIT |
3228                     VK_COLOR_COMPONENT_A_BIT;
3229             cswizzle.r = VK_COMPONENT_SWIZZLE_R;
3230             cswizzle.g = VK_COMPONENT_SWIZZLE_R;
3231             cswizzle.b = VK_COMPONENT_SWIZZLE_G;
3232             cswizzle.a = VK_COMPONENT_SWIZZLE_B;
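            /* E.g. a texel read back from the buffer as (d0, d1, d2, x) is
             * emitted as (d0, d0, d1, d2); with only GBA writes enabled the
             * destination keeps its stencil/X8 byte and receives d0..d2 in
             * its 24 MSBs.
             */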
3233          }
3234          break;
3235       case VK_IMAGE_ASPECT_STENCIL_BIT:
3236          /* Since we don't support separate stencil this is always a stencil
3237           * copy to a combined depth/stencil image. Because we don't support
3238           * separate stencil images, we interpret the buffer data as a
3239           * color R8UI image, and implement the blit as a compatible color
3240           * blit to an RGBA8UI destination masking out writes to components
3241           * GBA (which map to the D24 component of a S8D24 image).
3242           */
3243          assert(image->vk.format == VK_FORMAT_D24_UNORM_S8_UINT);
3244          buf_bpp = 1;
3245          src_format = VK_FORMAT_R8_UINT;
3246          dst_format = VK_FORMAT_R8G8B8A8_UINT;
3247          cmask = VK_COLOR_COMPONENT_R_BIT;
3248          break;
3249       default:
3250          unreachable("unsupported aspect");
3251          return false;
3252       };
3253       break;
3254    case 2:
3255       assert(aspect == VK_IMAGE_ASPECT_COLOR_BIT ||
3256              aspect == VK_IMAGE_ASPECT_DEPTH_BIT ||
3257              is_plane_aspect);
3258       src_format = VK_FORMAT_R16_UINT;
3259       dst_format = src_format;
3260       break;
3261    case 1:
3262       assert(aspect == VK_IMAGE_ASPECT_COLOR_BIT || is_plane_aspect);
3263       src_format = VK_FORMAT_R8_UINT;
3264       dst_format = src_format;
3265       break;
3266    default:
3267       unreachable("unsupported bit-size");
3268       return false;
3269    }
3270 
3271    if (use_texel_buffer) {
3272       return texel_buffer_shader_copy(cmd_buffer, aspect, image,
3273                                       dst_format, src_format,
3274                                       buffer, buf_bpp,
3275                                       cmask, &cswizzle,
3276                                       region_count, regions);
3277    } else {
3278       return copy_buffer_to_image_blit(cmd_buffer, aspect, image,
3279                                        dst_format, src_format,
3280                                        buffer, buf_bpp,
3281                                        cmask, &cswizzle,
3282                                        region_count, regions);
3283    }
3284 }
3285 
3286 VKAPI_ATTR void VKAPI_CALL
3287 v3dv_CmdCopyBufferToImage2(VkCommandBuffer commandBuffer,
3288                            const VkCopyBufferToImageInfo2 *info)
3289 {
3290    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
3291    V3DV_FROM_HANDLE(v3dv_buffer, buffer, info->srcBuffer);
3292    V3DV_FROM_HANDLE(v3dv_image, image, info->dstImage);
3293 
3294    assert(image->vk.samples == VK_SAMPLE_COUNT_1_BIT);
3295 
3296    cmd_buffer->state.is_transfer = true;
3297 
3298    uint32_t r = 0;
3299    while (r < info->regionCount) {
3300       /* The TFU and TLB paths can only copy one region at a time and the region
3301        * needs to start at the origin. We try these first for the common case
3302        * where we are copying full images, since they should be the fastest.
3303        */
3304       uint32_t batch_size = 1;
3305       if (copy_buffer_to_image_tfu(cmd_buffer, image, buffer, &info->pRegions[r]))
3306          goto handled;
3307 
3308       if (copy_buffer_to_image_tlb(cmd_buffer, image, buffer, &info->pRegions[r]))
3309          goto handled;
3310 
3311       /* Otherwise, we are copying subrects, so we fall back to copying
3312        * via shader and texel buffers and we try to batch the regions
3313        * if possible. We can only batch copies if they have the same
3314        * framebuffer spec, which is mostly determined by the image
3315        * subresource of the region.
3316        */
3317       const VkImageSubresourceLayers *rsc = &info->pRegions[r].imageSubresource;
3318       for (uint32_t s = r + 1; s < info->regionCount; s++) {
3319          const VkImageSubresourceLayers *rsc_s =
3320             &info->pRegions[s].imageSubresource;
3321 
3322          if (memcmp(rsc, rsc_s, sizeof(VkImageSubresourceLayers)) != 0)
3323             break;
3324 
3325          /* For 3D images we also need to check the depth extent */
3326          if (image->vk.image_type == VK_IMAGE_TYPE_3D &&
3327              info->pRegions[s].imageExtent.depth !=
3328              info->pRegions[r].imageExtent.depth) {
3329                break;
3330          }
3331 
3332          batch_size++;
3333       }
3334 
3335       if (copy_buffer_to_image_shader(cmd_buffer, image, buffer,
3336                                       batch_size, &info->pRegions[r], true)) {
3337          goto handled;
3338       }
3339 
3340       /* If we still could not copy, fall back to slower paths.
3341        *
3342        * FIXME: we could try to batch these too, but since they are bound to be
3343        * slow it might not be worth it and we should instead put more effort
3344        * in handling more cases with the other paths.
3345        */
3346       if (copy_buffer_to_image_shader(cmd_buffer, image, buffer,
3347                                       batch_size, &info->pRegions[r], false)) {
3348          goto handled;
3349       }
3350 
3351       unreachable("Unsupported buffer to image copy.");
3352 
3353 handled:
3354       r += batch_size;
3355    }
3356 
3357    cmd_buffer->state.is_transfer = false;
3358 }
3359 
3360 static void
3361 compute_blit_3d_layers(const VkOffset3D *offsets,
3362                        uint32_t *min_layer, uint32_t *max_layer,
3363                        bool *mirror_z);
3364 
3365 /**
3366  * Returns true if the implementation supports the requested operation (even if
3367  * it failed to process it, for example, due to an out-of-memory error).
3368  *
3369  * The TFU blit path doesn't handle scaling so the blit filter parameter can
3370  * be ignored.
3371  */
3372 static bool
3373 blit_tfu(struct v3dv_cmd_buffer *cmd_buffer,
3374          struct v3dv_image *dst,
3375          struct v3dv_image *src,
3376          const VkImageBlit2 *region)
3377 {
3378    if (V3D_DBG(DISABLE_TFU)) {
3379       perf_debug("Blit: TFU disabled, fallbacks could be slower.");
3380       return false;
3381    }
3382 
3383    assert(dst->vk.samples == VK_SAMPLE_COUNT_1_BIT);
3384    assert(src->vk.samples == VK_SAMPLE_COUNT_1_BIT);
3385 
3386    /* From vkCmdBlitImage:
3387     *   "srcImage must not use a format that requires a sampler YCBCR
3388     *    conversion"
3389     *   "dstImage must not use a format that requires a sampler YCBCR
3390     *    conversion"
3391     */
3392    assert(dst->plane_count == 1);
3393    assert(src->plane_count == 1);
3394 
3395    /* Format must match */
3396    if (src->vk.format != dst->vk.format)
3397       return false;
3398 
3399    /* Destination can't be raster format */
3400    if (!dst->tiled)
3401       return false;
3402 
3403    /* Source region must start at (0,0) */
3404    if (region->srcOffsets[0].x != 0 || region->srcOffsets[0].y != 0)
3405       return false;
3406 
3407    /* Destination image must be complete */
3408    if (region->dstOffsets[0].x != 0 || region->dstOffsets[0].y != 0)
3409       return false;
3410 
3411    const uint32_t dst_mip_level = region->dstSubresource.mipLevel;
3412    const uint32_t dst_width = u_minify(dst->vk.extent.width, dst_mip_level);
3413    const uint32_t dst_height = u_minify(dst->vk.extent.height, dst_mip_level);
3414    if (region->dstOffsets[1].x < dst_width - 1 ||
3415        region->dstOffsets[1].y < dst_height - 1) {
3416       return false;
3417    }
3418 
3419    /* No XY scaling */
3420    if (region->srcOffsets[1].x != region->dstOffsets[1].x ||
3421        region->srcOffsets[1].y != region->dstOffsets[1].y) {
3422       return false;
3423    }
3424 
3425    /* If the format is D24S8 both aspects need to be copied, since the TFU
3426     * can't be programmed to copy only one aspect of the image.
3427     */
3428    if (dst->vk.format == VK_FORMAT_D24_UNORM_S8_UINT) {
3429        const VkImageAspectFlags ds_aspects = VK_IMAGE_ASPECT_DEPTH_BIT |
3430                                              VK_IMAGE_ASPECT_STENCIL_BIT;
3431        if (region->dstSubresource.aspectMask != ds_aspects)
3432           return false;
3433    }
3434 
3435    /* Our TFU blits only handle exact copies (they require the same formats
3436     * on input and output, no scaling, etc.), so there are no pixel format
3437     * conversions and we can rewrite the format to use one that is TFU
3438     * compatible based on its texel size.
3439     */
3440    const struct v3dv_format *format =
3441       v3dv_get_compatible_tfu_format(cmd_buffer->device,
3442                                      dst->planes[0].cpp, NULL);
3443 
3444    /* Emit a TFU job for each layer to blit */
3445    assert(vk_image_subresource_layer_count(&dst->vk, &region->dstSubresource) ==
3446           vk_image_subresource_layer_count(&src->vk, &region->srcSubresource));
3447 
3448    uint32_t min_dst_layer;
3449    uint32_t max_dst_layer;
3450    bool dst_mirror_z = false;
3451    if (dst->vk.image_type == VK_IMAGE_TYPE_3D) {
3452       compute_blit_3d_layers(region->dstOffsets,
3453                              &min_dst_layer, &max_dst_layer,
3454                              &dst_mirror_z);
3455    } else {
3456       min_dst_layer = region->dstSubresource.baseArrayLayer;
3457       max_dst_layer = min_dst_layer +
3458                       vk_image_subresource_layer_count(&dst->vk,
3459                                                        &region->dstSubresource);
3460    }
3461 
3462    uint32_t min_src_layer;
3463    uint32_t max_src_layer;
3464    bool src_mirror_z = false;
3465    if (src->vk.image_type == VK_IMAGE_TYPE_3D) {
3466       compute_blit_3d_layers(region->srcOffsets,
3467                              &min_src_layer, &max_src_layer,
3468                              &src_mirror_z);
3469    } else {
3470       min_src_layer = region->srcSubresource.baseArrayLayer;
3471       max_src_layer = min_src_layer +
3472                       vk_image_subresource_layer_count(&src->vk,
3473                                                        &region->srcSubresource);
3474    }
3475 
3476    /* No Z scaling for 3D images (for non-3D images both src and dst must
3477     * have the same layerCount).
3478     */
3479    if (max_dst_layer - min_dst_layer != max_src_layer - min_src_layer)
3480       return false;
3481 
3482    const uint32_t layer_count = max_dst_layer - min_dst_layer;
3483    const uint32_t src_mip_level = region->srcSubresource.mipLevel;
3484    for (uint32_t i = 0; i < layer_count; i++) {
3485       /* Since the TFU path doesn't handle scaling, Z mirroring for 3D images
3486        * only involves reversing the order of the slices.
3487        */
3488       const uint32_t dst_layer =
3489          dst_mirror_z ? max_dst_layer - i - 1 : min_dst_layer + i;
3490       const uint32_t src_layer =
3491          src_mirror_z ? max_src_layer - i - 1 : min_src_layer + i;
3492 
3493       const uint32_t dst_offset =
3494          dst->planes[0].mem->bo->offset + v3dv_layer_offset(dst, dst_mip_level,
3495                                                             dst_layer, 0);
3496       const uint32_t src_offset =
3497          src->planes[0].mem->bo->offset + v3dv_layer_offset(src, src_mip_level,
3498                                                             src_layer, 0);
3499 
3500       const struct v3d_resource_slice *dst_slice = &dst->planes[0].slices[dst_mip_level];
3501       const struct v3d_resource_slice *src_slice = &src->planes[0].slices[src_mip_level];
3502 
3503       v3dv_X(cmd_buffer->device, meta_emit_tfu_job)(
3504          cmd_buffer,
3505          dst->planes[0].mem->bo->handle,
3506          dst_offset,
3507          dst_slice->tiling,
3508          dst_slice->padded_height,
3509          dst->planes[0].cpp,
3510          src->planes[0].mem->bo->handle,
3511          src_offset,
3512          src_slice->tiling,
3513          src_slice->tiling == V3D_TILING_RASTER ?
3514                               src_slice->stride : src_slice->padded_height,
3515          src->planes[0].cpp,
3516          dst_width, dst_height, &format->planes[0]);
3517    }
3518 
3519    return true;
3520 }
3521 
3522 static bool
3523 format_needs_software_int_clamp(VkFormat format)
3524 {
3525    switch (format) {
3526       case VK_FORMAT_A2R10G10B10_UINT_PACK32:
3527       case VK_FORMAT_A2R10G10B10_SINT_PACK32:
3528       case VK_FORMAT_A2B10G10R10_UINT_PACK32:
3529       case VK_FORMAT_A2B10G10R10_SINT_PACK32:
3530          return true;
3531       default:
3532          return false;
3533    };
3534 }
3535 
3536 static void
3537 get_blit_pipeline_cache_key(VkFormat dst_format,
3538                             VkFormat src_format,
3539                             VkColorComponentFlags cmask,
3540                             VkSampleCountFlagBits dst_samples,
3541                             VkSampleCountFlagBits src_samples,
3542                             uint8_t *key)
3543 {
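   /* Key layout (4 32-bit words, matching the assert at the end of this
    * function): word 0 = dst_format; word 1 = src_format when a software
    * integer clamp is needed, 0 otherwise; word 2 = cmask;
    * word 3 = (dst_samples << 8) | src_samples.
    */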
3544    memset(key, 0, V3DV_META_BLIT_CACHE_KEY_SIZE);
3545 
3546    uint32_t *p = (uint32_t *) key;
3547 
3548    *p = dst_format;
3549    p++;
3550 
3551    /* Generally, when blitting from a larger format to a smaller format
3552     * the hardware takes care of clamping the source to the RT range.
3553     * Specifically, for integer formats, this is done by using
3554     * V3D_RENDER_TARGET_CLAMP_INT in the render target setup, however, this
3555     * clamps to the bit-size of the render type, and some formats, such as
3556     * rgb10a2_uint, have a 16-bit type, so it won't do what we need and we
3557     * have to clamp in software. In these cases, we need to amend the blit
3558     * shader with clamp code that depends on both the src and dst formats, so
3559     * we need the src format to be part of the key.
3560     */
3561    *p = format_needs_software_int_clamp(dst_format) ? src_format : 0;
3562    p++;
3563 
3564    *p = cmask;
3565    p++;
3566 
3567    *p = (dst_samples << 8) | src_samples;
3568    p++;
3569 
3570    assert(((uint8_t*)p - key) == V3DV_META_BLIT_CACHE_KEY_SIZE);
3571 }
3572 
3573 static bool
3574 create_blit_render_pass(struct v3dv_device *device,
3575                         VkFormat dst_format,
3576                         VkFormat src_format,
3577                         VkRenderPass *pass_load,
3578                         VkRenderPass *pass_no_load)
3579 {
3580    const bool is_color_blit = vk_format_is_color(dst_format);
3581 
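   /* The same description is used to create two compatible passes: one that
    * loads the attachment, and a "no load" variant for blits whose render
    * area is tile-aligned and can therefore skip the TLB load.
    */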
3582    /* Attachment load operation is specified below */
3583    VkAttachmentDescription2 att = {
3584       .sType = VK_STRUCTURE_TYPE_ATTACHMENT_DESCRIPTION_2,
3585       .format = dst_format,
3586       .samples = VK_SAMPLE_COUNT_1_BIT,
3587       .storeOp = VK_ATTACHMENT_STORE_OP_STORE,
3588       .initialLayout = VK_IMAGE_LAYOUT_GENERAL,
3589       .finalLayout = VK_IMAGE_LAYOUT_GENERAL,
3590    };
3591 
3592    VkAttachmentReference2 att_ref = {
3593       .sType = VK_STRUCTURE_TYPE_ATTACHMENT_REFERENCE_2,
3594       .attachment = 0,
3595       .layout = VK_IMAGE_LAYOUT_GENERAL,
3596    };
3597 
3598    VkSubpassDescription2 subpass = {
3599       .sType = VK_STRUCTURE_TYPE_SUBPASS_DESCRIPTION_2,
3600       .pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS,
3601       .inputAttachmentCount = 0,
3602       .colorAttachmentCount = is_color_blit ? 1 : 0,
3603       .pColorAttachments = is_color_blit ? &att_ref : NULL,
3604       .pResolveAttachments = NULL,
3605       .pDepthStencilAttachment = is_color_blit ? NULL : &att_ref,
3606       .preserveAttachmentCount = 0,
3607       .pPreserveAttachments = NULL,
3608    };
3609 
3610    VkRenderPassCreateInfo2 info = {
3611       .sType = VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO_2,
3612       .attachmentCount = 1,
3613       .pAttachments = &att,
3614       .subpassCount = 1,
3615       .pSubpasses = &subpass,
3616       .dependencyCount = 0,
3617       .pDependencies = NULL,
3618    };
3619 
3620    VkResult result;
3621    att.loadOp = VK_ATTACHMENT_LOAD_OP_LOAD;
3622    result = v3dv_CreateRenderPass2(v3dv_device_to_handle(device),
3623                                    &info, &device->vk.alloc, pass_load);
3624    if (result != VK_SUCCESS)
3625       return false;
3626 
3627    att.loadOp = VK_ATTACHMENT_LOAD_OP_DONT_CARE;
3628    result = v3dv_CreateRenderPass2(v3dv_device_to_handle(device),
3629                                    &info, &device->vk.alloc, pass_no_load);
3630    return result == VK_SUCCESS;
3631 }
3632 
3633 static nir_def *
3634 gen_tex_coords(nir_builder *b)
3635 {
3636    nir_def *tex_box =
3637       nir_load_push_constant(b, 4, 32, nir_imm_int(b, 0), .base = 0, .range = 16);
3638 
3639    nir_def *tex_z =
3640       nir_load_push_constant(b, 1, 32, nir_imm_int(b, 0), .base = 16, .range = 4);
3641 
3642    nir_def *vertex_id = nir_load_vertex_id(b);
3643 
3644    /* vertex 0: src0_x, src0_y
3645     * vertex 1: src0_x, src1_y
3646     * vertex 2: src1_x, src0_y
3647     * vertex 3: src1_x, src1_y
3648     *
3649     * So:
3650     *
3651     * channel 0 is vertex_id < 2 ? src0_x : src1_x
3652     * channel 1 is vertex_id & 1 ? src1_y : src0_y
3653     */
3654 
3655    nir_def *one = nir_imm_int(b, 1);
3656    nir_def *c0cmp = nir_ilt_imm(b, vertex_id, 2);
3657    nir_def *c1cmp = nir_ieq(b, nir_iand(b, vertex_id, one), one);
3658 
3659    nir_def *comp[4];
3660    comp[0] = nir_bcsel(b, c0cmp,
3661                        nir_channel(b, tex_box, 0),
3662                        nir_channel(b, tex_box, 2));
3663 
3664    comp[1] = nir_bcsel(b, c1cmp,
3665                        nir_channel(b, tex_box, 3),
3666                        nir_channel(b, tex_box, 1));
3667    comp[2] = tex_z;
3668    comp[3] = nir_imm_float(b, 1.0f);
3669    return nir_vec(b, comp, 4);
3670 }
3671 
3672 static nir_def *
3673 build_nir_tex_op_read(struct nir_builder *b,
3674                       nir_def *tex_pos,
3675                       enum glsl_base_type tex_type,
3676                       enum glsl_sampler_dim dim)
3677 {
3678    assert(dim != GLSL_SAMPLER_DIM_MS);
3679 
3680    const struct glsl_type *sampler_type =
3681       glsl_sampler_type(dim, false, false, tex_type);
3682    nir_variable *sampler =
3683       nir_variable_create(b->shader, nir_var_uniform, sampler_type, "s_tex");
3684    sampler->data.descriptor_set = 0;
3685    sampler->data.binding = 0;
3686 
3687    nir_def *tex_deref = &nir_build_deref_var(b, sampler)->def;
3688    nir_tex_instr *tex = nir_tex_instr_create(b->shader, 3);
3689    tex->sampler_dim = dim;
3690    tex->op = nir_texop_tex;
3691    tex->src[0] = nir_tex_src_for_ssa(nir_tex_src_coord, tex_pos);
3692    tex->src[1] = nir_tex_src_for_ssa(nir_tex_src_texture_deref, tex_deref);
3693    tex->src[2] = nir_tex_src_for_ssa(nir_tex_src_sampler_deref, tex_deref);
3694    tex->dest_type = nir_get_nir_type_for_glsl_base_type(tex_type);
3695    tex->is_array = glsl_sampler_type_is_array(sampler_type);
3696    tex->coord_components = tex_pos->num_components;
3697 
3698    nir_def_init(&tex->instr, &tex->def, 4, 32);
3699    nir_builder_instr_insert(b, &tex->instr);
3700    return &tex->def;
3701 }
3702 
3703 static nir_def *
3704 build_nir_tex_op_ms_fetch_sample(struct nir_builder *b,
3705                                  nir_variable *sampler,
3706                                  nir_def *tex_deref,
3707                                  enum glsl_base_type tex_type,
3708                                  nir_def *tex_pos,
3709                                  nir_def *sample_idx)
3710 {
3711    nir_tex_instr *tex = nir_tex_instr_create(b->shader, 3);
3712    tex->sampler_dim = GLSL_SAMPLER_DIM_MS;
3713    tex->op = nir_texop_txf_ms;
3714    tex->src[0] = nir_tex_src_for_ssa(nir_tex_src_coord, tex_pos);
3715    tex->src[1] = nir_tex_src_for_ssa(nir_tex_src_texture_deref, tex_deref);
3716    tex->src[2] = nir_tex_src_for_ssa(nir_tex_src_ms_index, sample_idx);
3717    tex->dest_type = nir_get_nir_type_for_glsl_base_type(tex_type);
3718    tex->is_array = false;
3719    tex->coord_components = tex_pos->num_components;
3720 
3721    nir_def_init(&tex->instr, &tex->def, 4, 32);
3722    nir_builder_instr_insert(b, &tex->instr);
3723    return &tex->def;
3724 }
3725 
3726 /* Fetches all samples at the given position and averages them */
3727 static nir_def *
3728 build_nir_tex_op_ms_resolve(struct nir_builder *b,
3729                             nir_def *tex_pos,
3730                             enum glsl_base_type tex_type,
3731                             VkSampleCountFlagBits src_samples)
3732 {
3733    assert(src_samples > VK_SAMPLE_COUNT_1_BIT);
3734    const struct glsl_type *sampler_type =
3735       glsl_sampler_type(GLSL_SAMPLER_DIM_MS, false, false, tex_type);
3736    nir_variable *sampler =
3737       nir_variable_create(b->shader, nir_var_uniform, sampler_type, "s_tex");
3738    sampler->data.descriptor_set = 0;
3739    sampler->data.binding = 0;
3740 
3741    const bool is_int = glsl_base_type_is_integer(tex_type);
3742 
3743    nir_def *tmp = NULL;
3744    nir_def *tex_deref = &nir_build_deref_var(b, sampler)->def;
3745    for (uint32_t i = 0; i < src_samples; i++) {
3746       nir_def *s =
3747          build_nir_tex_op_ms_fetch_sample(b, sampler, tex_deref,
3748                                           tex_type, tex_pos,
3749                                           nir_imm_int(b, i));
3750 
3751       /* For integer formats, the multisample resolve operation is expected to
3752        * return one of the samples; we just return the first one.
3753        */
3754       if (is_int)
3755          return s;
3756 
3757       tmp = i == 0 ? s : nir_fadd(b, tmp, s);
3758    }
3759 
3760    assert(!is_int);
3761    return nir_fmul_imm(b, tmp, 1.0f / src_samples);
3762 }
3763 
3764 /* Fetches the current sample (gl_SampleID) at the given position */
3765 static nir_def *
3766 build_nir_tex_op_ms_read(struct nir_builder *b,
3767                          nir_def *tex_pos,
3768                          enum glsl_base_type tex_type)
3769 {
3770    const struct glsl_type *sampler_type =
3771       glsl_sampler_type(GLSL_SAMPLER_DIM_MS, false, false, tex_type);
3772    nir_variable *sampler =
3773       nir_variable_create(b->shader, nir_var_uniform, sampler_type, "s_tex");
3774    sampler->data.descriptor_set = 0;
3775    sampler->data.binding = 0;
3776 
3777    nir_def *tex_deref = &nir_build_deref_var(b, sampler)->def;
3778 
3779    return build_nir_tex_op_ms_fetch_sample(b, sampler, tex_deref,
3780                                            tex_type, tex_pos,
3781                                            nir_load_sample_id(b));
3782 }
3783 
3784 static nir_def *
3785 build_nir_tex_op(struct nir_builder *b,
3786                  struct v3dv_device *device,
3787                  nir_def *tex_pos,
3788                  enum glsl_base_type tex_type,
3789                  VkSampleCountFlagBits dst_samples,
3790                  VkSampleCountFlagBits src_samples,
3791                  enum glsl_sampler_dim dim)
3792 {
3793    switch (dim) {
3794    case GLSL_SAMPLER_DIM_MS:
3795       assert(src_samples == VK_SAMPLE_COUNT_4_BIT);
3796       /* For multisampled texture sources we need to use fetching instead of
3797        * normalized texture coordinates. We already configured our blit
3798        * coordinates to be in texel units, but here we still need to convert
3799        * them from floating point to integer.
3800        */
3801       tex_pos = nir_f2i32(b, tex_pos);
3802 
3803       if (dst_samples == VK_SAMPLE_COUNT_1_BIT)
3804          return build_nir_tex_op_ms_resolve(b, tex_pos, tex_type, src_samples);
3805       else
3806          return build_nir_tex_op_ms_read(b, tex_pos, tex_type);
3807    default:
3808       assert(src_samples == VK_SAMPLE_COUNT_1_BIT);
3809       return build_nir_tex_op_read(b, tex_pos, tex_type, dim);
3810    }
3811 }
3812 
3813 static nir_shader *
3814 get_blit_vs(const nir_shader_compiler_options *options)
3815 {
3816    nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_VERTEX, options,
3817                                                   "meta blit vs");
3818 
3819    const struct glsl_type *vec4 = glsl_vec4_type();
3820 
3821    nir_variable *vs_out_pos =
3822       nir_variable_create(b.shader, nir_var_shader_out, vec4, "gl_Position");
3823    vs_out_pos->data.location = VARYING_SLOT_POS;
3824 
3825    nir_variable *vs_out_tex_coord =
3826       nir_variable_create(b.shader, nir_var_shader_out, vec4, "out_tex_coord");
3827    vs_out_tex_coord->data.location = VARYING_SLOT_VAR0;
3828    vs_out_tex_coord->data.interpolation = INTERP_MODE_SMOOTH;
3829 
3830    nir_def *pos = nir_gen_rect_vertices(&b, NULL, NULL);
3831    nir_store_var(&b, vs_out_pos, pos, 0xf);
3832 
3833    nir_def *tex_coord = gen_tex_coords(&b);
3834    nir_store_var(&b, vs_out_tex_coord, tex_coord, 0xf);
3835 
3836    return b.shader;
3837 }
3838 
3839 static uint32_t
3840 get_channel_mask_for_sampler_dim(enum glsl_sampler_dim sampler_dim)
3841 {
3842    switch (sampler_dim) {
3843    case GLSL_SAMPLER_DIM_1D: return 0x1;
3844    case GLSL_SAMPLER_DIM_2D: return 0x3;
3845    case GLSL_SAMPLER_DIM_MS: return 0x3;
3846    case GLSL_SAMPLER_DIM_3D: return 0x7;
3847    default:
3848       unreachable("invalid sampler dim");
3849    };
3850 }
3851 
3852 static nir_shader *
3853 get_color_blit_fs(const nir_shader_compiler_options *options,
3854                   struct v3dv_device *device,
3855                   VkFormat dst_format,
3856                   VkFormat src_format,
3857                   VkSampleCountFlagBits dst_samples,
3858                   VkSampleCountFlagBits src_samples,
3859                   enum glsl_sampler_dim sampler_dim)
3860 {
3861    nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_FRAGMENT, options,
3862                                                   "meta blit fs");
3863 
3864    const struct glsl_type *vec4 = glsl_vec4_type();
3865 
3866    nir_variable *fs_in_tex_coord =
3867       nir_variable_create(b.shader, nir_var_shader_in, vec4, "in_tex_coord");
3868    fs_in_tex_coord->data.location = VARYING_SLOT_VAR0;
3869 
3870    const struct glsl_type *fs_out_type =
3871       vk_format_is_sint(dst_format) ? glsl_ivec4_type() :
3872       vk_format_is_uint(dst_format) ? glsl_uvec4_type() :
3873                                       glsl_vec4_type();
3874 
3875    enum glsl_base_type src_base_type =
3876       vk_format_is_sint(src_format) ? GLSL_TYPE_INT :
3877       vk_format_is_uint(src_format) ? GLSL_TYPE_UINT :
3878                                       GLSL_TYPE_FLOAT;
3879 
3880    nir_variable *fs_out_color =
3881       nir_variable_create(b.shader, nir_var_shader_out, fs_out_type, "out_color");
3882    fs_out_color->data.location = FRAG_RESULT_DATA0;
3883 
3884    nir_def *tex_coord = nir_load_var(&b, fs_in_tex_coord);
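   /* Trim the vec4 tex coord varying to the components the sampler
    * dimensionality actually consumes (e.g. mask 0x3 selects xy for 2D).
    */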
3885    const uint32_t channel_mask = get_channel_mask_for_sampler_dim(sampler_dim);
3886    tex_coord = nir_channels(&b, tex_coord, channel_mask);
3887 
3888    nir_def *color = build_nir_tex_op(&b, device, tex_coord, src_base_type,
3889                                          dst_samples, src_samples, sampler_dim);
3890 
3891    /* For integer textures, if the bit-size of the destination is too small to
3892     * hold the source value, Vulkan (CTS) expects the implementation to clamp to the
3893     * maximum value the destination can hold. The hardware can clamp to the
3894     * render target type, which usually matches the component bit-size, but
3895     * there are some cases that won't match, such as rgb10a2, which has a 16-bit
3896     * render target type, so in these cases we need to clamp manually.
3897     */
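   /* For example, with a 10-bit destination component the unsigned clamp
    * bound is (1 << 10) - 1 = 1023, and the signed bounds are
    * [-(1 << 9), (1 << 9) - 1] = [-512, 511].
    */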
3898    if (format_needs_software_int_clamp(dst_format)) {
3899       assert(vk_format_is_int(dst_format));
3900       enum pipe_format src_pformat = vk_format_to_pipe_format(src_format);
3901       enum pipe_format dst_pformat = vk_format_to_pipe_format(dst_format);
3902 
3903       nir_def *c[4];
3904       for (uint32_t i = 0; i < 4; i++) {
3905          c[i] = nir_channel(&b, color, i);
3906 
3907          const uint32_t src_bit_size =
3908             util_format_get_component_bits(src_pformat,
3909                                            UTIL_FORMAT_COLORSPACE_RGB,
3910                                            i);
3911          const uint32_t dst_bit_size =
3912             util_format_get_component_bits(dst_pformat,
3913                                            UTIL_FORMAT_COLORSPACE_RGB,
3914                                            i);
3915 
3916          if (dst_bit_size >= src_bit_size)
3917             continue;
3918 
3919          assert(dst_bit_size > 0);
3920          if (util_format_is_pure_uint(dst_pformat)) {
3921             nir_def *max = nir_imm_int(&b, (1 << dst_bit_size) - 1);
3922             c[i] = nir_umin(&b, c[i], max);
3923          } else {
3924             nir_def *max = nir_imm_int(&b, (1 << (dst_bit_size - 1)) - 1);
3925             nir_def *min = nir_imm_int(&b, -(1 << (dst_bit_size - 1)));
3926             c[i] = nir_imax(&b, nir_imin(&b, c[i], max), min);
3927          }
3928       }
3929 
3930       color = nir_vec4(&b, c[0], c[1], c[2], c[3]);
3931    }
3932 
3933    nir_store_var(&b, fs_out_color, color, 0xf);
3934 
3935    return b.shader;
3936 }
3937 
3938 static bool
3939 create_pipeline(struct v3dv_device *device,
3940                 struct v3dv_render_pass *pass,
3941                 struct nir_shader *vs_nir,
3942                 struct nir_shader *gs_nir,
3943                 struct nir_shader *fs_nir,
3944                 const VkPipelineVertexInputStateCreateInfo *vi_state,
3945                 const VkPipelineDepthStencilStateCreateInfo *ds_state,
3946                 const VkPipelineColorBlendStateCreateInfo *cb_state,
3947                 const VkPipelineMultisampleStateCreateInfo *ms_state,
3948                 const VkPipelineLayout layout,
3949                 VkPipeline *pipeline)
3950 {
3951    struct vk_shader_module vs_m = vk_shader_module_from_nir(vs_nir);
3952    struct vk_shader_module fs_m = vk_shader_module_from_nir(fs_nir);
3953    struct vk_shader_module gs_m;
3954 
3955    uint32_t num_stages = gs_nir ? 3 : 2;
3956 
3957 
3958    VkPipelineShaderStageCreateInfo stages[3] = {
3959       {
3960          .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
3961          .stage = VK_SHADER_STAGE_VERTEX_BIT,
3962          .module = vk_shader_module_to_handle(&vs_m),
3963          .pName = "main",
3964       },
3965       {
3966          .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
3967          .stage = VK_SHADER_STAGE_FRAGMENT_BIT,
3968          .module = vk_shader_module_to_handle(&fs_m),
3969          .pName = "main",
3970       },
3971       {
3972          .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
3973          .stage = VK_SHADER_STAGE_GEOMETRY_BIT,
3974          .module = VK_NULL_HANDLE,
3975          .pName = "main",
3976       },
3977    };
3978 
3979    if (gs_nir) {
3980       gs_m = vk_shader_module_from_nir(gs_nir);
3981       stages[2].module = vk_shader_module_to_handle(&gs_m);
3982    }
3983 
3984    VkGraphicsPipelineCreateInfo info = {
3985       .sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO,
3986 
3987       .stageCount = num_stages,
3988       .pStages = stages,
3989 
3990       .pVertexInputState = vi_state,
3991 
3992       .pInputAssemblyState = &(VkPipelineInputAssemblyStateCreateInfo) {
3993          .sType = VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO,
3994          .topology = VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP,
3995          .primitiveRestartEnable = false,
3996       },
3997 
3998       .pViewportState = &(VkPipelineViewportStateCreateInfo) {
3999          .sType = VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO,
4000          .viewportCount = 1,
4001          .scissorCount = 1,
4002       },
4003 
4004       .pRasterizationState = &(VkPipelineRasterizationStateCreateInfo) {
4005          .sType = VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO,
4006          .rasterizerDiscardEnable = false,
4007          .polygonMode = VK_POLYGON_MODE_FILL,
4008          .cullMode = VK_CULL_MODE_NONE,
4009          .frontFace = VK_FRONT_FACE_COUNTER_CLOCKWISE,
4010          .depthBiasEnable = false,
4011       },
4012 
4013       .pMultisampleState = ms_state,
4014 
4015       .pDepthStencilState = ds_state,
4016 
4017       .pColorBlendState = cb_state,
4018 
4019       /* These meta pipelines declare all state as dynamic.
4020        * As a consequence, vkCmdBindPipeline writes no dynamic state
4021        * to the cmd buffer. Therefore, at the end of the meta operation,
4022        * we need only restore the dynamic state that was set with vkCmdSet*.
4023        */
4024       .pDynamicState = &(VkPipelineDynamicStateCreateInfo) {
4025          .sType = VK_STRUCTURE_TYPE_PIPELINE_DYNAMIC_STATE_CREATE_INFO,
4026          .dynamicStateCount = 8,
4027          .pDynamicStates = (VkDynamicState[]) {
4028             VK_DYNAMIC_STATE_VIEWPORT,
4029             VK_DYNAMIC_STATE_SCISSOR,
4030             VK_DYNAMIC_STATE_STENCIL_COMPARE_MASK,
4031             VK_DYNAMIC_STATE_STENCIL_WRITE_MASK,
4032             VK_DYNAMIC_STATE_STENCIL_REFERENCE,
4033             VK_DYNAMIC_STATE_BLEND_CONSTANTS,
4034             VK_DYNAMIC_STATE_DEPTH_BIAS,
4035             VK_DYNAMIC_STATE_LINE_WIDTH,
4036          },
4037       },
4038 
4039       .flags = 0,
4040       .layout = layout,
4041       .renderPass = v3dv_render_pass_to_handle(pass),
4042       .subpass = 0,
4043    };
4044 
4045    VkResult result =
4046       v3dv_CreateGraphicsPipelines(v3dv_device_to_handle(device),
4047                                    VK_NULL_HANDLE,
4048                                    1, &info,
4049                                    &device->vk.alloc,
4050                                    pipeline);
4051 
4052    ralloc_free(vs_nir);
4053    ralloc_free(gs_nir);
4054    ralloc_free(fs_nir);
4055 
4056    return result == VK_SUCCESS;
4057 }
4058 
4059 static enum glsl_sampler_dim
4060 get_sampler_dim(VkImageType type, VkSampleCountFlagBits src_samples)
4061 {
4062    /* From the Vulkan 1.0 spec, VkImageCreateInfo Valid Usage:
4063     *
4064     *   "If samples is not VK_SAMPLE_COUNT_1_BIT, then imageType must be
4065     *    VK_IMAGE_TYPE_2D, ..."
4066     */
4067    assert(src_samples == VK_SAMPLE_COUNT_1_BIT || type == VK_IMAGE_TYPE_2D);
4068 
4069    switch (type) {
4070    case VK_IMAGE_TYPE_1D: return GLSL_SAMPLER_DIM_1D;
4071    case VK_IMAGE_TYPE_2D:
4072       return src_samples == VK_SAMPLE_COUNT_1_BIT ? GLSL_SAMPLER_DIM_2D :
4073                                                     GLSL_SAMPLER_DIM_MS;
4074    case VK_IMAGE_TYPE_3D: return GLSL_SAMPLER_DIM_3D;
4075    default:
4076       unreachable("Invalid image type");
4077    }
4078 }
4079 
4080 static bool
4081 create_blit_pipeline(struct v3dv_device *device,
4082                      VkFormat dst_format,
4083                      VkFormat src_format,
4084                      VkColorComponentFlags cmask,
4085                      VkImageType src_type,
4086                      VkSampleCountFlagBits dst_samples,
4087                      VkSampleCountFlagBits src_samples,
4088                      VkRenderPass _pass,
4089                      VkPipelineLayout pipeline_layout,
4090                      VkPipeline *pipeline)
4091 {
4092    struct v3dv_render_pass *pass = v3dv_render_pass_from_handle(_pass);
4093 
4094    /* We always rewrite depth/stencil blits to compatible color blits */
4095    assert(vk_format_is_color(dst_format));
4096    assert(vk_format_is_color(src_format));
4097 
4098    const nir_shader_compiler_options *options =
4099       v3dv_pipeline_get_nir_options(&device->devinfo);
4100 
4101    const enum glsl_sampler_dim sampler_dim =
4102       get_sampler_dim(src_type, src_samples);
4103 
4104    nir_shader *vs_nir = get_blit_vs(options);
4105    nir_shader *fs_nir =
4106       get_color_blit_fs(options, device, dst_format, src_format,
4107                         dst_samples, src_samples, sampler_dim);
4108 
4109    const VkPipelineVertexInputStateCreateInfo vi_state = {
4110       .sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO,
4111       .vertexBindingDescriptionCount = 0,
4112       .vertexAttributeDescriptionCount = 0,
4113    };
4114 
4115    VkPipelineDepthStencilStateCreateInfo ds_state = {
4116       .sType = VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO,
4117    };
4118 
4119    VkPipelineColorBlendAttachmentState blend_att_state[1] = { 0 };
4120    blend_att_state[0] = (VkPipelineColorBlendAttachmentState) {
4121       .blendEnable = false,
4122       .colorWriteMask = cmask,
4123    };
4124 
4125    const VkPipelineColorBlendStateCreateInfo cb_state = {
4126       .sType = VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO,
4127       .logicOpEnable = false,
4128       .attachmentCount = 1,
4129       .pAttachments = blend_att_state
4130    };
4131 
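   /* Sample shading is requested for multisampled destinations so the blit
    * fragment shader can be evaluated per sample rather than once per pixel,
    * letting each destination sample be written independently.
    */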
4132    const VkPipelineMultisampleStateCreateInfo ms_state = {
4133       .sType = VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO,
4134       .rasterizationSamples = dst_samples,
4135       .sampleShadingEnable = dst_samples > VK_SAMPLE_COUNT_1_BIT,
4136       .pSampleMask = NULL,
4137       .alphaToCoverageEnable = false,
4138       .alphaToOneEnable = false,
4139    };
4140 
4141    return create_pipeline(device,
4142                           pass,
4143                           vs_nir, NULL, fs_nir,
4144                           &vi_state,
4145                           &ds_state,
4146                           &cb_state,
4147                           &ms_state,
4148                           pipeline_layout,
4149                           pipeline);
4150 }
4151 
4152 /**
4153  * Return a pipeline suitable for blitting the requested aspect given the
4154  * destination and source formats.
4155  */
4156 static bool
4157 get_blit_pipeline(struct v3dv_cmd_buffer *cmd_buffer,
4158                   VkFormat dst_format,
4159                   VkFormat src_format,
4160                   VkColorComponentFlags cmask,
4161                   VkImageType src_type,
4162                   VkSampleCountFlagBits dst_samples,
4163                   VkSampleCountFlagBits src_samples,
4164                   struct v3dv_meta_blit_pipeline **pipeline)
4165 {
4166    bool ok = true;
4167    struct v3dv_device *device = cmd_buffer->device;
4168 
4169    uint8_t key[V3DV_META_BLIT_CACHE_KEY_SIZE];
4170    if (device->instance->meta_cache_enabled) {
4171       get_blit_pipeline_cache_key(dst_format, src_format, cmask,
4172                                   dst_samples, src_samples, key);
4173       mtx_lock(&device->meta.mtx);
4174       struct hash_entry *entry =
4175          _mesa_hash_table_search(device->meta.blit.cache[src_type], &key);
4176       if (entry) {
4177          mtx_unlock(&device->meta.mtx);
4178          *pipeline = entry->data;
4179          return true;
4180       }
4181    }
4182 
4183    *pipeline = vk_zalloc2(&device->vk.alloc, NULL, sizeof(**pipeline), 8,
4184                           VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
4185 
4186    if (*pipeline == NULL)
4187       goto fail;
4188 
4189    ok = create_blit_render_pass(device, dst_format, src_format,
4190                                 &(*pipeline)->pass,
4191                                 &(*pipeline)->pass_no_load);
4192    if (!ok)
4193       goto fail;
4194 
4195    /* Create the pipeline using one of the render passes; they are both
4196     * compatible, so we don't care which one we use here.
4197     */
4198    ok = create_blit_pipeline(device,
4199                              dst_format,
4200                              src_format,
4201                              cmask,
4202                              src_type,
4203                              dst_samples,
4204                              src_samples,
4205                              (*pipeline)->pass,
4206                              device->meta.blit.p_layout,
4207                              &(*pipeline)->pipeline);
4208    if (!ok)
4209       goto fail;
4210 
4211    if (device->instance->meta_cache_enabled) {
4212       memcpy((*pipeline)->key, key, sizeof((*pipeline)->key));
4213       _mesa_hash_table_insert(device->meta.blit.cache[src_type],
4214                               &(*pipeline)->key, *pipeline);
4215       mtx_unlock(&device->meta.mtx);
4216    } else {
4217       v3dv_cmd_buffer_add_private_obj(
4218          cmd_buffer, (uintptr_t)*pipeline,
4219          (v3dv_cmd_buffer_private_obj_destroy_cb)destroy_meta_blit_pipeline);
4220    }
4221 
4222    return true;
4223 
4224 fail:
4225    if (device->instance->meta_cache_enabled)
4226       mtx_unlock(&device->meta.mtx);
4227 
4228    VkDevice _device = v3dv_device_to_handle(device);
4229    if (*pipeline) {
4230       if ((*pipeline)->pass)
4231          v3dv_DestroyRenderPass(_device, (*pipeline)->pass, &device->vk.alloc);
4232       if ((*pipeline)->pass_no_load)
4233          v3dv_DestroyRenderPass(_device, (*pipeline)->pass_no_load, &device->vk.alloc);
4234       if ((*pipeline)->pipeline)
4235          v3dv_DestroyPipeline(_device, (*pipeline)->pipeline, &device->vk.alloc);
4236       vk_free(&device->vk.alloc, *pipeline);
4237       *pipeline = NULL;
4238    }
4239 
4240    return false;
4241 }
4242 
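/* Computes the region (x, y, w, h) covered by a pair of blit offsets, clamped
 * to the image dimensions, and whether the blit is mirrored on each axis.
 * For example, offsets (32, 0) -> (0, 32) on a 64x64 image yield x = 0,
 * w = 32, mirror_x = true and y = 0, h = 32, mirror_y = false.
 */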
4243 static void
4244 compute_blit_box(const VkOffset3D *offsets,
4245                  uint32_t image_w, uint32_t image_h,
4246                  uint32_t *x, uint32_t *y, uint32_t *w, uint32_t *h,
4247                  bool *mirror_x, bool *mirror_y)
4248 {
4249    if (offsets[1].x >= offsets[0].x) {
4250       *mirror_x = false;
4251       *x = MIN2(offsets[0].x, image_w - 1);
4252       *w = MIN2(offsets[1].x - offsets[0].x, image_w - offsets[0].x);
4253    } else {
4254       *mirror_x = true;
4255       *x = MIN2(offsets[1].x, image_w - 1);
4256       *w = MIN2(offsets[0].x - offsets[1].x, image_w - offsets[1].x);
4257    }
4258    if (offsets[1].y >= offsets[0].y) {
4259       *mirror_y = false;
4260       *y = MIN2(offsets[0].y, image_h - 1);
4261       *h = MIN2(offsets[1].y - offsets[0].y, image_h - offsets[0].y);
4262    } else {
4263       *mirror_y = true;
4264       *y = MIN2(offsets[1].y, image_h - 1);
4265       *h = MIN2(offsets[0].y - offsets[1].y, image_h - offsets[1].y);
4266    }
4267 }
4268 
4269 static void
4270 compute_blit_3d_layers(const VkOffset3D *offsets,
4271                        uint32_t *min_layer, uint32_t *max_layer,
4272                        bool *mirror_z)
4273 {
4274    if (offsets[1].z >= offsets[0].z) {
4275       *mirror_z = false;
4276       *min_layer = offsets[0].z;
4277       *max_layer = offsets[1].z;
4278    } else {
4279       *mirror_z = true;
4280       *min_layer = offsets[1].z;
4281       *max_layer = offsets[0].z;
4282    }
4283 }
4284 
4285 static VkResult
4286 create_blit_descriptor_pool(struct v3dv_cmd_buffer *cmd_buffer)
4287 {
4288    /* If this is not the first pool we create for this command buffer,
4289     * size it based on the size of the currently exhausted pool.
4290     */
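   /* e.g. the first pool holds 64 combined image/sampler descriptors and each
    * replacement doubles the exhausted pool's capacity: 64, 128, 256, up to a
    * cap of 1024.
    */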
4291    uint32_t descriptor_count = 64;
4292    if (cmd_buffer->meta.blit.dspool != VK_NULL_HANDLE) {
4293       struct v3dv_descriptor_pool *exhausted_pool =
4294          v3dv_descriptor_pool_from_handle(cmd_buffer->meta.blit.dspool);
4295       descriptor_count = MIN2(exhausted_pool->max_entry_count * 2, 1024);
4296    }
4297 
4298    /* Create the descriptor pool */
4299    cmd_buffer->meta.blit.dspool = VK_NULL_HANDLE;
4300    VkDescriptorPoolSize pool_size = {
4301       .type = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
4302       .descriptorCount = descriptor_count,
4303    };
4304    VkDescriptorPoolCreateInfo info = {
4305       .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO,
4306       .maxSets = descriptor_count,
4307       .poolSizeCount = 1,
4308       .pPoolSizes = &pool_size,
4309       .flags = 0,
4310    };
4311    VkResult result =
4312       v3dv_CreateDescriptorPool(v3dv_device_to_handle(cmd_buffer->device),
4313                                 &info,
4314                                 &cmd_buffer->device->vk.alloc,
4315                                 &cmd_buffer->meta.blit.dspool);
4316 
4317    if (result == VK_SUCCESS) {
4318       assert(cmd_buffer->meta.blit.dspool != VK_NULL_HANDLE);
4319       const VkDescriptorPool _pool = cmd_buffer->meta.blit.dspool;
4320 
4321       v3dv_cmd_buffer_add_private_obj(
4322          cmd_buffer, (uintptr_t) _pool,
4323          (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyDescriptorPool);
4324 
4325       struct v3dv_descriptor_pool *pool =
4326          v3dv_descriptor_pool_from_handle(_pool);
4327       pool->is_driver_internal = true;
4328    }
4329 
4330    return result;
4331 }
4332 
4333 static VkResult
4334 allocate_blit_source_descriptor_set(struct v3dv_cmd_buffer *cmd_buffer,
4335                                     VkDescriptorSet *set)
4336 {
4337    /* Make sure we have a descriptor pool */
4338    VkResult result;
4339    if (cmd_buffer->meta.blit.dspool == VK_NULL_HANDLE) {
4340       result = create_blit_descriptor_pool(cmd_buffer);
4341       if (result != VK_SUCCESS)
4342          return result;
4343    }
4344    assert(cmd_buffer->meta.blit.dspool != VK_NULL_HANDLE);
4345 
4346    /* Allocate descriptor set */
4347    struct v3dv_device *device = cmd_buffer->device;
4348    VkDevice _device = v3dv_device_to_handle(device);
4349    VkDescriptorSetAllocateInfo info = {
4350       .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO,
4351       .descriptorPool = cmd_buffer->meta.blit.dspool,
4352       .descriptorSetCount = 1,
4353       .pSetLayouts = &device->meta.blit.ds_layout,
4354    };
4355    result = v3dv_AllocateDescriptorSets(_device, &info, set);
4356 
4357    /* If we ran out of pool space, grow the pool and try again */
4358    if (result == VK_ERROR_OUT_OF_POOL_MEMORY) {
4359       result = create_blit_descriptor_pool(cmd_buffer);
4360       if (result == VK_SUCCESS) {
4361          info.descriptorPool = cmd_buffer->meta.blit.dspool;
4362          result = v3dv_AllocateDescriptorSets(_device, &info, set);
4363       }
4364    }
4365 
4366    return result;
4367 }
4368 
4369 /**
4370  * Returns true if the implementation supports the requested operation (even if
4371  * it failed to process it, for example, due to an out-of-memory error).
4372  *
4373  * The caller can specify the channels on the destination to be written via the
4374  * cmask parameter (which can be 0 to default to all channels), as well as a
4375  * swizzle to apply to the source via the cswizzle parameter (which can be NULL
4376  * to use the default identity swizzle).
4377  *
4378  * Supports multi-plane formats too.
4379  */
4380 static bool
4381 blit_shader(struct v3dv_cmd_buffer *cmd_buffer,
4382             struct v3dv_image *dst,
4383             VkFormat dst_format,
4384             struct v3dv_image *src,
4385             VkFormat src_format,
4386             VkColorComponentFlags cmask,
4387             VkComponentMapping *cswizzle,
4388             const VkImageBlit2 *region,
4389             VkFilter filter,
4390             bool dst_is_padded_image)
4391 {
4392    bool handled = true;
4393    VkResult result;
4394 
4395    /* Can't sample from linear images */
4396    if (!src->tiled && src->vk.image_type != VK_IMAGE_TYPE_1D) {
4397       return false;
4398    }
4399 
4400    /* Rewrite combined D/S blits to compatible color blits */
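   /* Reinterpreting the texels through a same-size integer color format
    * copies the raw depth/stencil payload bit-for-bit: D16 becomes R16_UINT,
    * D32F becomes R32_UINT, and D24(S8) becomes RGBA8_UINT with a color mask
    * selecting the requested aspects.
    */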
4401    if (vk_format_is_depth_or_stencil(dst_format)) {
4402       assert(src_format == dst_format);
4403       assert(cmask == 0);
4404       switch (dst_format) {
4405       case VK_FORMAT_D16_UNORM:
4406          dst_format = VK_FORMAT_R16_UINT;
4407          break;
4408       case VK_FORMAT_D32_SFLOAT:
4409          dst_format = VK_FORMAT_R32_UINT;
4410          break;
4411       case VK_FORMAT_X8_D24_UNORM_PACK32:
4412       case VK_FORMAT_D24_UNORM_S8_UINT:
4413          if (region->srcSubresource.aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) {
4414             cmask |= VK_COLOR_COMPONENT_G_BIT |
4415                      VK_COLOR_COMPONENT_B_BIT |
4416                      VK_COLOR_COMPONENT_A_BIT;
4417          }
4418          if (region->srcSubresource.aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) {
4419             assert(dst_format == VK_FORMAT_D24_UNORM_S8_UINT);
4420             cmask |= VK_COLOR_COMPONENT_R_BIT;
4421          }
4422          dst_format = VK_FORMAT_R8G8B8A8_UINT;
4423          break;
4424       default:
4425          unreachable("Unsupported depth/stencil format");
4426       }
4427       src_format = dst_format;
4428    }
4429 
4430    uint8_t src_plane =
4431       v3dv_plane_from_aspect(region->srcSubresource.aspectMask);
4432    assert(src_plane < src->plane_count);
4433    uint8_t dst_plane =
4434       v3dv_plane_from_aspect(region->dstSubresource.aspectMask);
4435    assert(dst_plane < dst->plane_count);
4436 
4437    const VkColorComponentFlags full_cmask = VK_COLOR_COMPONENT_R_BIT |
4438                                             VK_COLOR_COMPONENT_G_BIT |
4439                                             VK_COLOR_COMPONENT_B_BIT |
4440                                             VK_COLOR_COMPONENT_A_BIT;
4441    if (cmask == 0)
4442       cmask = full_cmask;
4443 
4444    VkComponentMapping ident_swizzle = {
4445       .r = VK_COMPONENT_SWIZZLE_IDENTITY,
4446       .g = VK_COMPONENT_SWIZZLE_IDENTITY,
4447       .b = VK_COMPONENT_SWIZZLE_IDENTITY,
4448       .a = VK_COMPONENT_SWIZZLE_IDENTITY,
4449    };
4450    if (!cswizzle)
4451       cswizzle = &ident_swizzle;
4452 
4453    /* When we get here from a copy between compressed / uncompressed images
4454     * we choose to specify the destination blit region based on the size
4455     * semantics of the source image of the copy (see copy_image_blit), so we
4456     * need to apply those same semantics here when we compute the size of the
4457     * destination image level.
4458     */
4459    const uint32_t dst_block_w =
4460       vk_format_get_blockwidth(dst->planes[dst_plane].vk_format);
4461    const uint32_t dst_block_h =
4462       vk_format_get_blockheight(dst->planes[dst_plane].vk_format);
4463    const uint32_t src_block_w =
4464       vk_format_get_blockwidth(src->planes[src_plane].vk_format);
4465    const uint32_t src_block_h =
4466       vk_format_get_blockheight(src->planes[src_plane].vk_format);
4467    const uint32_t dst_level_w =
4468       u_minify(DIV_ROUND_UP(dst->vk.extent.width * src_block_w, dst_block_w),
4469                region->dstSubresource.mipLevel);
4470    const uint32_t dst_level_h =
4471       u_minify(DIV_ROUND_UP(dst->vk.extent.height * src_block_h, dst_block_h),
4472                region->dstSubresource.mipLevel);
4473 
4474    const uint32_t src_level_w =
4475       u_minify(src->planes[src_plane].width, region->srcSubresource.mipLevel);
4476    const uint32_t src_level_h =
4477       u_minify(src->planes[src_plane].height, region->srcSubresource.mipLevel);
4478 
4479    assert(src->plane_count == 1 || src->vk.image_type != VK_IMAGE_TYPE_3D);
4480    const uint32_t src_level_d =
4481       u_minify(src->vk.extent.depth, region->srcSubresource.mipLevel);
4482 
4483    uint32_t dst_x, dst_y, dst_w, dst_h;
4484    bool dst_mirror_x, dst_mirror_y;
4485    compute_blit_box(region->dstOffsets,
4486                     dst_level_w, dst_level_h,
4487                     &dst_x, &dst_y, &dst_w, &dst_h,
4488                     &dst_mirror_x, &dst_mirror_y);
4489 
4490    uint32_t src_x, src_y, src_w, src_h;
4491    bool src_mirror_x, src_mirror_y;
4492    compute_blit_box(region->srcOffsets,
4493                     src_level_w, src_level_h,
4494                     &src_x, &src_y, &src_w, &src_h,
4495                     &src_mirror_x, &src_mirror_y);
4496 
4497    uint32_t min_dst_layer;
4498    uint32_t max_dst_layer;
4499    bool dst_mirror_z = false;
4500    if (dst->vk.image_type != VK_IMAGE_TYPE_3D) {
4501       min_dst_layer = region->dstSubresource.baseArrayLayer;
4502       max_dst_layer = min_dst_layer +
4503                       vk_image_subresource_layer_count(&dst->vk,
4504                                                        &region->dstSubresource);
4505    } else {
4506       compute_blit_3d_layers(region->dstOffsets,
4507                              &min_dst_layer, &max_dst_layer,
4508                              &dst_mirror_z);
4509    }
4510 
4511    uint32_t min_src_layer;
4512    uint32_t max_src_layer;
4513    bool src_mirror_z = false;
4514    if (src->vk.image_type != VK_IMAGE_TYPE_3D) {
4515       min_src_layer = region->srcSubresource.baseArrayLayer;
4516       max_src_layer = min_src_layer +
4517                       vk_image_subresource_layer_count(&src->vk,
4518                                                        &region->srcSubresource);
4519    } else {
4520       compute_blit_3d_layers(region->srcOffsets,
4521                              &min_src_layer, &max_src_layer,
4522                              &src_mirror_z);
4523    }
4524 
4525    uint32_t layer_count = max_dst_layer - min_dst_layer;
4526 
4527    /* Translate source blit coordinates to normalized texture coordinates for
4528     * single sampled textures. For multisampled textures we require
4529     * unnormalized coordinates, since we can only do texelFetch on them.
4530     */
4531    float coords[4] =  {
4532       (float)src_x,
4533       (float)src_y,
4534       (float)(src_x + src_w),
4535       (float)(src_y + src_h),
4536    };
4537 
4538    if (src->vk.samples == VK_SAMPLE_COUNT_1_BIT) {
4539       coords[0] /= (float)src_level_w;
4540       coords[1] /= (float)src_level_h;
4541       coords[2] /= (float)src_level_w;
4542       coords[3] /= (float)src_level_h;
4543    }
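   /* e.g. a source box (8, 8) .. (24, 24) on a 64x64 level maps to the
    * normalized coordinates (0.125, 0.125) .. (0.375, 0.375).
    */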
4544 
4545    /* Handle mirroring */
4546    const bool mirror_x = dst_mirror_x != src_mirror_x;
4547    const bool mirror_y = dst_mirror_y != src_mirror_y;
4548    const bool mirror_z = dst_mirror_z != src_mirror_z;
4549    float tex_coords[5] = {
4550       !mirror_x ? coords[0] : coords[2],
4551       !mirror_y ? coords[1] : coords[3],
4552       !mirror_x ? coords[2] : coords[0],
4553       !mirror_y ? coords[3] : coords[1],
4554       /* Z coordinate for 3D blit sources, to be filled for each
4555        * destination layer
4556        */
4557       0.0f
4558    };
4559 
4560    /* For blits from 3D images we also need to compute the slice coordinate to
4561     * sample from, which will change for each layer in the destination.
4562     * Compute the step we should increase for each iteration.
4563     */
4564    const float src_z_step =
4565       (float)(max_src_layer - min_src_layer) / (float)layer_count;
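   /* e.g. blitting an 8-slice 3D source onto 4 destination layers gives
    * src_z_step = 2.0, and layer i samples at depth (i + 0.5) * 2 / 8
    * (see the per-layer loop below).
    */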
4566 
4567    /* Get the blit pipeline */
4568    struct v3dv_meta_blit_pipeline *pipeline = NULL;
4569    bool ok = get_blit_pipeline(cmd_buffer,
4570                                dst_format, src_format, cmask, src->vk.image_type,
4571                                dst->vk.samples, src->vk.samples,
4572                                &pipeline);
4573    if (!ok)
4574       return handled;
4575    assert(pipeline && pipeline->pipeline &&
4576           pipeline->pass && pipeline->pass_no_load);
4577 
4578    struct v3dv_device *device = cmd_buffer->device;
4579    assert(device->meta.blit.ds_layout);
4580 
4581    VkDevice _device = v3dv_device_to_handle(device);
4582    VkCommandBuffer _cmd_buffer = v3dv_cmd_buffer_to_handle(cmd_buffer);
4583 
4584    /* Create sampler for blit source image */
4585    VkSamplerCreateInfo sampler_info = {
4586       .sType = VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO,
4587       .magFilter = filter,
4588       .minFilter = filter,
4589       .addressModeU = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE,
4590       .addressModeV = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE,
4591       .addressModeW = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE,
4592       .mipmapMode = VK_SAMPLER_MIPMAP_MODE_NEAREST,
4593    };
4594    VkSampler sampler;
4595    result = v3dv_CreateSampler(_device, &sampler_info, &device->vk.alloc,
4596                                &sampler);
4597    if (result != VK_SUCCESS)
4598       goto fail;
4599 
4600    v3dv_cmd_buffer_add_private_obj(
4601       cmd_buffer, (uintptr_t)sampler,
4602       (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroySampler);
4603 
4604    /* Push command buffer state before starting meta operation */
4605    v3dv_cmd_buffer_meta_state_push(cmd_buffer, true);
4606 
4607    /* Push state that is common for all layers */
4608    v3dv_CmdBindPipeline(_cmd_buffer,
4609                         VK_PIPELINE_BIND_POINT_GRAPHICS,
4610                         pipeline->pipeline);
4611 
4612    const VkViewport viewport = {
4613       .x = dst_x,
4614       .y = dst_y,
4615       .width = dst_w,
4616       .height = dst_h,
4617       .minDepth = 0.0f,
4618       .maxDepth = 1.0f
4619    };
4620    v3dv_CmdSetViewport(_cmd_buffer, 0, 1, &viewport);
4621 
4622    const VkRect2D scissor = {
4623       .offset = { dst_x, dst_y },
4624       .extent = { dst_w, dst_h }
4625    };
4626    v3dv_CmdSetScissor(_cmd_buffer, 0, 1, &scissor);
4627 
4628    bool can_skip_tlb_load = false;
4629    const VkRect2D render_area = {
4630       .offset = { dst_x, dst_y },
4631       .extent = { dst_w, dst_h },
4632    };
4633 
4634    /* Record per-layer commands */
4635    for (uint32_t i = 0; i < layer_count; i++) {
4636       /* Setup framebuffer */
4637       VkImageViewCreateInfo dst_image_view_info = {
4638          .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO,
4639          .image = v3dv_image_to_handle(dst),
4640          .viewType = v3dv_image_type_to_view_type(dst->vk.image_type),
4641          .format = dst_format,
4642          .subresourceRange = {
4643             .aspectMask = region->dstSubresource.aspectMask,
4644             .baseMipLevel = region->dstSubresource.mipLevel,
4645             .levelCount = 1,
4646             .baseArrayLayer = min_dst_layer + i,
4647             .layerCount = 1
4648          },
4649       };
4650       VkImageView dst_image_view;
4651       result = v3dv_create_image_view(device, &dst_image_view_info,
4652                                       &dst_image_view);
4653       if (result != VK_SUCCESS)
4654          goto fail;
4655 
4656       v3dv_cmd_buffer_add_private_obj(
4657          cmd_buffer, (uintptr_t)dst_image_view,
4658          (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyImageView);
4659 
4660       VkFramebufferCreateInfo fb_info = {
4661          .sType = VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO,
4662          .renderPass = pipeline->pass,
4663          .attachmentCount = 1,
4664          .pAttachments = &dst_image_view,
4665          .width = dst_x + dst_w,
4666          .height = dst_y + dst_h,
4667          .layers = 1,
4668       };
4669 
4670       VkFramebuffer fb;
4671       result = v3dv_CreateFramebuffer(_device, &fb_info,
4672                                       &cmd_buffer->device->vk.alloc, &fb);
4673       if (result != VK_SUCCESS)
4674          goto fail;
4675 
4676       struct v3dv_framebuffer *framebuffer = v3dv_framebuffer_from_handle(fb);
4677       framebuffer->has_edge_padding = fb_info.width == dst_level_w &&
4678                                       fb_info.height == dst_level_h &&
4679                                       dst_is_padded_image;
4680 
4681       v3dv_cmd_buffer_add_private_obj(
4682          cmd_buffer, (uintptr_t)fb,
4683          (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyFramebuffer);
4684 
4685       /* Setup descriptor set for blit source texture. We don't have to
4686        * register the descriptor as a private command buffer object since
4687        * all descriptors will be freed automatically with the descriptor
4688        * pool.
4689        */
4690       VkDescriptorSet set;
4691       result = allocate_blit_source_descriptor_set(cmd_buffer, &set);
4692       if (result != VK_SUCCESS)
4693          goto fail;
4694 
4695       VkImageViewCreateInfo src_image_view_info = {
4696          .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO,
4697          .image = v3dv_image_to_handle(src),
4698          .viewType = v3dv_image_type_to_view_type(src->vk.image_type),
4699          .format = src_format,
4700          .components = *cswizzle,
4701          .subresourceRange = {
4702             .aspectMask = region->srcSubresource.aspectMask,
4703             .baseMipLevel = region->srcSubresource.mipLevel,
4704             .levelCount = 1,
4705             .baseArrayLayer =
4706                src->vk.image_type == VK_IMAGE_TYPE_3D ? 0 : min_src_layer + i,
4707             .layerCount = 1
4708          },
4709       };
4710       VkImageView src_image_view;
4711       result = v3dv_create_image_view(device, &src_image_view_info,
4712                                       &src_image_view);
4713       if (result != VK_SUCCESS)
4714          goto fail;
4715 
4716       v3dv_cmd_buffer_add_private_obj(
4717          cmd_buffer, (uintptr_t)src_image_view,
4718          (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyImageView);
4719 
4720       VkDescriptorImageInfo image_info = {
4721          .sampler = sampler,
4722          .imageView = src_image_view,
4723          .imageLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
4724       };
4725       VkWriteDescriptorSet write = {
4726          .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
4727          .dstSet = set,
4728          .dstBinding = 0,
4729          .dstArrayElement = 0,
4730          .descriptorCount = 1,
4731          .descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
4732          .pImageInfo = &image_info,
4733       };
4734       v3dv_UpdateDescriptorSets(_device, 1, &write, 0, NULL);
4735 
4736       v3dv_CmdBindDescriptorSets(_cmd_buffer,
4737                                  VK_PIPELINE_BIND_POINT_GRAPHICS,
4738                                  device->meta.blit.p_layout,
4739                                  0, 1, &set,
4740                                  0, NULL);
4741 
4742       /* If the region we are about to blit is tile-aligned, then we can
4743        * use the render pass version that won't pre-load the tile buffer
4744        * with the dst image contents before the blit. The exception is when we
4745        * don't have a full color mask, since in that case we need to preserve
4746        * the original value of some of the color components.
4747        *
4748        * Since all layers have the same area, we only need to compute this for
4749        * the first.
4750        */
4751       if (i == 0) {
4752          struct v3dv_render_pass *pipeline_pass =
4753             v3dv_render_pass_from_handle(pipeline->pass);
4754          can_skip_tlb_load =
4755             cmask == full_cmask &&
4756             v3dv_subpass_area_is_tile_aligned(cmd_buffer->device, &render_area,
4757                                               framebuffer, pipeline_pass, 0);
4758       }
4759 
4760       /* Record blit */
4761       VkRenderPassBeginInfo rp_info = {
4762          .sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO,
4763          .renderPass = can_skip_tlb_load ? pipeline->pass_no_load :
4764                                            pipeline->pass,
4765          .framebuffer = fb,
4766          .renderArea = render_area,
4767          .clearValueCount = 0,
4768       };
4769 
4770       VkSubpassBeginInfo sp_info = {
4771          .sType = VK_STRUCTURE_TYPE_SUBPASS_BEGIN_INFO,
4772          .contents = VK_SUBPASS_CONTENTS_INLINE,
4773       };
4774 
4775       v3dv_CmdBeginRenderPass2(_cmd_buffer, &rp_info, &sp_info);
4776       struct v3dv_job *job = cmd_buffer->state.job;
4777       if (!job)
4778          goto fail;
4779 
4780       /* For 3D blits we need to compute the source slice to blit from (the Z
4781        * coordinate of the source sample operation). We want to choose this
4782        * based on the ratio of the depth of the source and the destination
4783        * images, picking the coordinate in the middle of each step.
4784        */
4785       if (src->vk.image_type == VK_IMAGE_TYPE_3D) {
4786          tex_coords[4] =
4787             !mirror_z ?
4788             (min_src_layer + (i + 0.5f) * src_z_step) / (float)src_level_d :
4789             (max_src_layer - (i + 0.5f) * src_z_step) / (float)src_level_d;
4790       }
4791 
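      /* The 20-byte push constant range covers the five floats in
       * tex_coords: x0, y0, x1, y1 and the source z slice.
       */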
4792       v3dv_CmdPushConstants(_cmd_buffer,
4793                             device->meta.blit.p_layout,
4794                             VK_SHADER_STAGE_VERTEX_BIT, 0, 20,
4795                             &tex_coords);
4796 
4797       v3dv_CmdDraw(_cmd_buffer, 4, 1, 0, 0);
4798 
4799       VkSubpassEndInfo sp_end_info = {
4800          .sType = VK_STRUCTURE_TYPE_SUBPASS_END_INFO,
4801       };
4802 
4803       v3dv_CmdEndRenderPass2(_cmd_buffer, &sp_end_info);
4804    }
4805 
4806 fail:
4807    v3dv_cmd_buffer_meta_state_pop(cmd_buffer, true);
4808 
4809    return handled;
4810 }
4811 
4812 VKAPI_ATTR void VKAPI_CALL
4813 v3dv_CmdBlitImage2(VkCommandBuffer commandBuffer,
4814                       const VkBlitImageInfo2 *pBlitImageInfo)
4815 {
4816    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
4817    V3DV_FROM_HANDLE(v3dv_image, src, pBlitImageInfo->srcImage);
4818    V3DV_FROM_HANDLE(v3dv_image, dst, pBlitImageInfo->dstImage);
4819 
4820    /* From vkCmdBlitImage:
4821     *   "srcImage must not use a format that requires a sampler YCBCR
4822     *    conversion"
4823     *   "dstImage must not use a format that requires a sampler YCBCR
4824     *    conversion"
4825     */
4826    assert(src->plane_count == 1);
4827    assert(dst->plane_count == 1);
4828 
4829    /* This command can only happen outside a render pass */
4830    assert(cmd_buffer->state.pass == NULL);
4831    assert(cmd_buffer->state.job == NULL);
4832 
4833    /* From the Vulkan 1.0 spec, vkCmdBlitImage valid usage */
4834    assert(dst->vk.samples == VK_SAMPLE_COUNT_1_BIT &&
4835           src->vk.samples == VK_SAMPLE_COUNT_1_BIT);
4836 
4837    /* We don't export VK_FORMAT_FEATURE_BLIT_DST_BIT on compressed formats */
4838    assert(!vk_format_is_compressed(dst->vk.format));
4839 
4840    cmd_buffer->state.is_transfer = true;
4841 
4842    for (uint32_t i = 0; i < pBlitImageInfo->regionCount; i++) {
4843       const VkImageBlit2 *region = &pBlitImageInfo->pRegions[i];
4844 
4845       if (blit_tfu(cmd_buffer, dst, src, region))
4846          continue;
4847       if (blit_shader(cmd_buffer,
4848                       dst, dst->vk.format,
4849                       src, src->vk.format,
4850                       0, NULL,
4851                       region,
4852                       pBlitImageInfo->filter, true)) {
4853          continue;
4854       }
4855       unreachable("Unsupported blit operation");
4856    }
4857 
4858    cmd_buffer->state.is_transfer = false;
4859 }
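/* For reference, a client call that lands in this entrypoint might look like
 * the following sketch (image handles, layouts and barriers are assumed to
 * be set up by the application):
 *
 *    const VkImageBlit2 region = {
 *       .sType = VK_STRUCTURE_TYPE_IMAGE_BLIT_2,
 *       .srcSubresource = { VK_IMAGE_ASPECT_COLOR_BIT, 0, 0, 1 },
 *       .srcOffsets = { { 0, 0, 0 }, { 256, 256, 1 } },
 *       .dstSubresource = { VK_IMAGE_ASPECT_COLOR_BIT, 0, 0, 1 },
 *       .dstOffsets = { { 0, 0, 0 }, { 128, 128, 1 } },
 *    };
 *    const VkBlitImageInfo2 info = {
 *       .sType = VK_STRUCTURE_TYPE_BLIT_IMAGE_INFO_2,
 *       .srcImage = src_image,
 *       .srcImageLayout = VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
 *       .dstImage = dst_image,
 *       .dstImageLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
 *       .regionCount = 1,
 *       .pRegions = &region,
 *       .filter = VK_FILTER_LINEAR,
 *    };
 *    vkCmdBlitImage2(cmd_buf, &info);
 */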
4860 
4861 static bool
4862 resolve_image_tlb(struct v3dv_cmd_buffer *cmd_buffer,
4863                   struct v3dv_image *dst,
4864                   struct v3dv_image *src,
4865                   const VkImageResolve2 *region)
4866 {
4867    /* No resolve for multi-planar images. Using plane 0 */
4868    assert(dst->plane_count == 1);
4869    assert(src->plane_count == 1);
4870 
4871    if (!v3dv_meta_can_use_tlb(src, 0, region->srcSubresource.mipLevel,
4872                               &region->srcOffset, NULL, NULL) ||
4873        !v3dv_meta_can_use_tlb(dst, 0, region->dstSubresource.mipLevel,
4874                               &region->dstOffset, &region->extent, NULL)) {
4875       return false;
4876    }
4877 
4878    if (!v3dv_X(cmd_buffer->device, format_supports_tlb_resolve)(src->format))
4879       return false;
4880 
4881    const VkFormat fb_format = src->vk.format;
4882 
4883    uint32_t num_layers;
4884    if (dst->vk.image_type != VK_IMAGE_TYPE_3D) {
4885       num_layers = vk_image_subresource_layer_count(&dst->vk,
4886                                                     &region->dstSubresource);
4887    } else {
4888       num_layers = region->extent.depth;
4889    }
4890    assert(num_layers > 0);
4891 
4892    struct v3dv_job *job =
4893       v3dv_cmd_buffer_start_job(cmd_buffer, -1, V3DV_JOB_TYPE_GPU_CL);
4894    if (!job)
4895       return true;
4896 
4897    const uint32_t block_w =
4898       vk_format_get_blockwidth(dst->planes[0].vk_format);
4899    const uint32_t block_h =
4900       vk_format_get_blockheight(dst->planes[0].vk_format);
4901    const uint32_t width = DIV_ROUND_UP(region->extent.width, block_w);
4902    const uint32_t height = DIV_ROUND_UP(region->extent.height, block_h);
4903 
4904    uint32_t internal_type, internal_bpp;
4905    v3dv_X(cmd_buffer->device, get_internal_type_bpp_for_image_aspects)
4906       (fb_format, region->srcSubresource.aspectMask,
4907        &internal_type, &internal_bpp);
4908 
4909    v3dv_job_start_frame(job, width, height, num_layers, false, true, 1,
4910                         internal_bpp, 4 * v3d_internal_bpp_words(internal_bpp),
4911                         true);
4912 
4913    struct v3dv_meta_framebuffer framebuffer;
4914    v3dv_X(job->device, meta_framebuffer_init)(&framebuffer, fb_format,
4915                                               internal_type, &job->frame_tiling);
4916 
4917    v3dv_X(job->device, job_emit_binning_flush)(job);
4918    v3dv_X(job->device, meta_emit_resolve_image_rcl)(job, dst, src,
4919                                                     &framebuffer, region);
4920 
4921    v3dv_cmd_buffer_finish_job(cmd_buffer);
4922    return true;
4923 }
4924 
4925 static bool
4926 resolve_image_blit(struct v3dv_cmd_buffer *cmd_buffer,
4927                    struct v3dv_image *dst,
4928                    struct v3dv_image *src,
4929                    const VkImageResolve2 *region)
4930 {
4931    const VkImageBlit2 blit_region = {
4932       .sType = VK_STRUCTURE_TYPE_IMAGE_BLIT_2,
4933       .srcSubresource = region->srcSubresource,
4934       .srcOffsets = {
4935          region->srcOffset,
4936          {
4937             region->srcOffset.x + region->extent.width,
4938             region->srcOffset.y + region->extent.height,
4939          }
4940       },
4941       .dstSubresource = region->dstSubresource,
4942       .dstOffsets = {
4943          region->dstOffset,
4944          {
4945             region->dstOffset.x + region->extent.width,
4946             region->dstOffset.y + region->extent.height,
4947          }
4948       },
4949    };
4950    return blit_shader(cmd_buffer,
4951                       dst, dst->vk.format,
4952                       src, src->vk.format,
4953                       0, NULL,
4954                       &blit_region, VK_FILTER_NEAREST, true);
4955 }
4956 
4957 VKAPI_ATTR void VKAPI_CALL
4958 v3dv_CmdResolveImage2(VkCommandBuffer commandBuffer,
4959                          const VkResolveImageInfo2 *info)
4960 
4961 {
4962    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
4963    V3DV_FROM_HANDLE(v3dv_image, src, info->srcImage);
4964    V3DV_FROM_HANDLE(v3dv_image, dst, info->dstImage);
4965 
4966    /* This command can only happen outside a render pass */
4967    assert(cmd_buffer->state.pass == NULL);
4968    assert(cmd_buffer->state.job == NULL);
4969 
4970    assert(src->vk.samples == VK_SAMPLE_COUNT_4_BIT);
4971    assert(dst->vk.samples == VK_SAMPLE_COUNT_1_BIT);
4972 
4973    /* We don't support multi-sampled multi-plane images */
4974    assert(src->plane_count == 1);
4975    assert(dst->plane_count == 1);
4976 
4977    cmd_buffer->state.is_transfer = true;
4978 
4979    for (uint32_t i = 0; i < info->regionCount; i++) {
4980       if (resolve_image_tlb(cmd_buffer, dst, src, &info->pRegions[i]))
4981          continue;
4982       if (resolve_image_blit(cmd_buffer, dst, src, &info->pRegions[i]))
4983          continue;
4984       unreachable("Unsupported multisample resolve operation");
4985    }
4986 
4987    cmd_buffer->state.is_transfer = false;
4988 }
4989