/*
 * Copyright © 2019 Raspberry Pi Ltd
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "v3dv_private.h"
#include "v3dv_meta_common.h"

#include "compiler/nir/nir_builder.h"
#include "util/u_pack_color.h"
#include "vk_common_entrypoints.h"

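/* The meta pipeline caches below are keyed by fixed-size binary blobs that
 * are hashed and compared bytewise, so key structs must be fully
 * zero-initialized (including any padding) before they are used for lookup.
 */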
static uint32_t
meta_blit_key_hash(const void *key)
{
   return _mesa_hash_data(key, V3DV_META_BLIT_CACHE_KEY_SIZE);
}

static bool
meta_blit_key_compare(const void *key1, const void *key2)
{
   return memcmp(key1, key2, V3DV_META_BLIT_CACHE_KEY_SIZE) == 0;
}

static bool
texel_buffer_shader_copy(struct v3dv_cmd_buffer *cmd_buffer,
                         VkImageAspectFlags aspect,
                         struct v3dv_image *image,
                         VkFormat dst_format,
                         VkFormat src_format,
                         struct v3dv_buffer *buffer,
                         uint32_t buffer_bpp,
                         VkColorComponentFlags cmask,
                         VkComponentMapping *cswizzle,
                         uint32_t region_count,
                         const VkBufferImageCopy2 *regions);

static bool
create_blit_pipeline_layout(struct v3dv_device *device,
                            VkDescriptorSetLayout *descriptor_set_layout,
                            VkPipelineLayout *pipeline_layout)
{
   VkResult result;

   if (*descriptor_set_layout == 0) {
      VkDescriptorSetLayoutBinding descriptor_set_layout_binding = {
         .binding = 0,
         .descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
         .descriptorCount = 1,
         .stageFlags = VK_SHADER_STAGE_FRAGMENT_BIT,
      };
      VkDescriptorSetLayoutCreateInfo descriptor_set_layout_info = {
         .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO,
         .bindingCount = 1,
         .pBindings = &descriptor_set_layout_binding,
      };
      result =
         v3dv_CreateDescriptorSetLayout(v3dv_device_to_handle(device),
                                        &descriptor_set_layout_info,
                                        &device->vk.alloc,
                                        descriptor_set_layout);
      if (result != VK_SUCCESS)
         return false;
   }

   assert(*pipeline_layout == 0);
   VkPipelineLayoutCreateInfo pipeline_layout_info = {
      .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
      .setLayoutCount = 1,
      .pSetLayouts = descriptor_set_layout,
      .pushConstantRangeCount = 1,
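      /* 20 bytes of vertex-stage push constants: presumably the source
       * coordinate box consumed by the meta blit vertex shader (four floats
       * plus a Z coordinate for 3D blits).
       */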
      .pPushConstantRanges =
         &(VkPushConstantRange) { VK_SHADER_STAGE_VERTEX_BIT, 0, 20 },
   };

   result =
      v3dv_CreatePipelineLayout(v3dv_device_to_handle(device),
                                &pipeline_layout_info,
                                &device->vk.alloc,
                                pipeline_layout);
   return result == VK_SUCCESS;
}

void
v3dv_meta_blit_init(struct v3dv_device *device)
{
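   /* One blit pipeline cache per dimensionality of the source image
    * (1D, 2D and 3D), since the blit pipeline varies with it.
    */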
   for (uint32_t i = 0; i < 3; i++) {
      device->meta.blit.cache[i] =
         _mesa_hash_table_create(NULL,
                                 meta_blit_key_hash,
                                 meta_blit_key_compare);
   }

   create_blit_pipeline_layout(device,
                               &device->meta.blit.ds_layout,
                               &device->meta.blit.p_layout);
}

static void
destroy_meta_blit_pipeline(VkDevice vk_device,
                           uint64_t obj,
                           VkAllocationCallbacks *alloc)
{
   struct v3dv_meta_blit_pipeline *p =
      (struct v3dv_meta_blit_pipeline *)(uintptr_t) obj;
   v3dv_DestroyPipeline(vk_device, p->pipeline, alloc);
   v3dv_DestroyRenderPass(vk_device, p->pass, alloc);
   v3dv_DestroyRenderPass(vk_device, p->pass_no_load, alloc);
   vk_free(alloc, p);
}

void
v3dv_meta_blit_finish(struct v3dv_device *device)
{
   VkDevice _device = v3dv_device_to_handle(device);

   for (uint32_t i = 0; i < 3; i++) {
      hash_table_foreach(device->meta.blit.cache[i], entry) {
         destroy_meta_blit_pipeline(_device, (uintptr_t)entry->data,
                                    &device->vk.alloc);
      }
      _mesa_hash_table_destroy(device->meta.blit.cache[i], NULL);
   }

   if (device->meta.blit.p_layout) {
      v3dv_DestroyPipelineLayout(_device, device->meta.blit.p_layout,
                                 &device->vk.alloc);
   }

   if (device->meta.blit.ds_layout) {
      v3dv_DestroyDescriptorSetLayout(_device, device->meta.blit.ds_layout,
                                      &device->vk.alloc);
   }
}

static uint32_t
meta_texel_buffer_copy_key_hash(const void *key)
{
   return _mesa_hash_data(key, V3DV_META_TEXEL_BUFFER_COPY_CACHE_KEY_SIZE);
}

static bool
meta_texel_buffer_copy_key_compare(const void *key1, const void *key2)
{
   return memcmp(key1, key2, V3DV_META_TEXEL_BUFFER_COPY_CACHE_KEY_SIZE) == 0;
}

static bool
create_texel_buffer_copy_pipeline_layout(struct v3dv_device *device,
                                         VkDescriptorSetLayout *ds_layout,
                                         VkPipelineLayout *p_layout)
{
   VkResult result;

   if (*ds_layout == 0) {
      VkDescriptorSetLayoutBinding ds_layout_binding = {
         .binding = 0,
         .descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER,
         .descriptorCount = 1,
         .stageFlags = VK_SHADER_STAGE_FRAGMENT_BIT,
      };
      VkDescriptorSetLayoutCreateInfo ds_layout_info = {
         .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO,
         .bindingCount = 1,
         .pBindings = &ds_layout_binding,
      };
      result =
         v3dv_CreateDescriptorSetLayout(v3dv_device_to_handle(device),
                                        &ds_layout_info,
                                        &device->vk.alloc,
                                        ds_layout);
      if (result != VK_SUCCESS)
         return false;
   }

   assert(*p_layout == 0);
   /* FIXME: this abuses the API a bit, since not all of our copy pipelines
    * have a geometry shader. We could create 2 different pipeline layouts,
    * but this works for us for now.
    */
#define TEXEL_BUFFER_COPY_FS_BOX_PC_OFFSET      0
#define TEXEL_BUFFER_COPY_FS_STRIDE_PC_OFFSET  16
#define TEXEL_BUFFER_COPY_FS_OFFSET_PC_OFFSET  20
#define TEXEL_BUFFER_COPY_GS_LAYER_PC_OFFSET   24
   VkPushConstantRange ranges[2] = {
      { VK_SHADER_STAGE_FRAGMENT_BIT, 0, 24 },
      { VK_SHADER_STAGE_GEOMETRY_BIT, 24, 4 },
   };

   VkPipelineLayoutCreateInfo p_layout_info = {
      .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
      .setLayoutCount = 1,
      .pSetLayouts = ds_layout,
      .pushConstantRangeCount = 2,
      .pPushConstantRanges = ranges,
   };

   result =
      v3dv_CreatePipelineLayout(v3dv_device_to_handle(device),
                                &p_layout_info,
                                &device->vk.alloc,
                                p_layout);
   return result == VK_SUCCESS;
}

void
v3dv_meta_texel_buffer_copy_init(struct v3dv_device *device)
{
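   /* As with the blit cache: one pipeline cache per image dimensionality
    * (1D, 2D and 3D).
    */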
   for (uint32_t i = 0; i < 3; i++) {
      device->meta.texel_buffer_copy.cache[i] =
         _mesa_hash_table_create(NULL,
                                 meta_texel_buffer_copy_key_hash,
                                 meta_texel_buffer_copy_key_compare);
   }

   create_texel_buffer_copy_pipeline_layout(
      device,
      &device->meta.texel_buffer_copy.ds_layout,
      &device->meta.texel_buffer_copy.p_layout);
}

static void
destroy_meta_texel_buffer_copy_pipeline(VkDevice vk_device,
                                        uint64_t obj,
                                        VkAllocationCallbacks *alloc)
{
   struct v3dv_meta_texel_buffer_copy_pipeline *p =
      (struct v3dv_meta_texel_buffer_copy_pipeline *)(uintptr_t) obj;
   v3dv_DestroyPipeline(vk_device, p->pipeline, alloc);
   v3dv_DestroyRenderPass(vk_device, p->pass, alloc);
   v3dv_DestroyRenderPass(vk_device, p->pass_no_load, alloc);
   vk_free(alloc, p);
}

void
v3dv_meta_texel_buffer_copy_finish(struct v3dv_device *device)
{
   VkDevice _device = v3dv_device_to_handle(device);

   for (uint32_t i = 0; i < 3; i++) {
      hash_table_foreach(device->meta.texel_buffer_copy.cache[i], entry) {
         destroy_meta_texel_buffer_copy_pipeline(_device, (uintptr_t)entry->data,
                                                 &device->vk.alloc);
      }
      _mesa_hash_table_destroy(device->meta.texel_buffer_copy.cache[i], NULL);
   }

   if (device->meta.texel_buffer_copy.p_layout) {
      v3dv_DestroyPipelineLayout(_device, device->meta.texel_buffer_copy.p_layout,
                                 &device->vk.alloc);
   }

   if (device->meta.texel_buffer_copy.ds_layout) {
      v3dv_DestroyDescriptorSetLayout(_device, device->meta.texel_buffer_copy.ds_layout,
                                      &device->vk.alloc);
   }
}

static VkFormat
get_compatible_tlb_format(VkFormat format)
{
   switch (format) {
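   /* These mappings are used for raw copies, so all that matters is that the
    * replacement format is TLB-renderable and has the same bpp as the
    * original; the channel interpretation is irrelevant since we never
    * convert the underlying data.
    */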
   case VK_FORMAT_R8G8B8A8_SNORM:
      return VK_FORMAT_R8G8B8A8_UINT;

   case VK_FORMAT_R8G8_SNORM:
      return VK_FORMAT_R8G8_UINT;

   case VK_FORMAT_R8_SNORM:
      return VK_FORMAT_R8_UINT;

   case VK_FORMAT_A8B8G8R8_SNORM_PACK32:
      return VK_FORMAT_A8B8G8R8_UINT_PACK32;

   case VK_FORMAT_R16_UNORM:
   case VK_FORMAT_R16_SNORM:
      return VK_FORMAT_R16_UINT;

   case VK_FORMAT_R16G16_UNORM:
   case VK_FORMAT_R16G16_SNORM:
      return VK_FORMAT_R16G16_UINT;

   case VK_FORMAT_R16G16B16A16_UNORM:
   case VK_FORMAT_R16G16B16A16_SNORM:
      return VK_FORMAT_R16G16B16A16_UINT;

   case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32:
      return VK_FORMAT_R32_SFLOAT;

   /* We can't render to compressed formats using the TLB, so instead we use
    * a compatible format with the same bpp as the compressed format. Because
    * the compressed format's bpp is for a full block (i.e. 4x4 pixels in the
    * case of ETC), when we implement copies with the compatible format we
    * will have to divide offsets and dimensions on the compressed image by
    * the compressed block size.
    */
   case VK_FORMAT_ETC2_R8G8B8A8_UNORM_BLOCK:
   case VK_FORMAT_ETC2_R8G8B8A8_SRGB_BLOCK:
   case VK_FORMAT_EAC_R11G11_UNORM_BLOCK:
   case VK_FORMAT_EAC_R11G11_SNORM_BLOCK:
   case VK_FORMAT_BC2_UNORM_BLOCK:
   case VK_FORMAT_BC2_SRGB_BLOCK:
   case VK_FORMAT_BC3_SRGB_BLOCK:
   case VK_FORMAT_BC3_UNORM_BLOCK:
   case VK_FORMAT_ASTC_4x4_UNORM_BLOCK:
   case VK_FORMAT_ASTC_4x4_SRGB_BLOCK:
   case VK_FORMAT_ASTC_5x4_UNORM_BLOCK:
   case VK_FORMAT_ASTC_5x4_SRGB_BLOCK:
   case VK_FORMAT_ASTC_5x5_UNORM_BLOCK:
   case VK_FORMAT_ASTC_5x5_SRGB_BLOCK:
   case VK_FORMAT_ASTC_6x5_UNORM_BLOCK:
   case VK_FORMAT_ASTC_6x5_SRGB_BLOCK:
   case VK_FORMAT_ASTC_6x6_UNORM_BLOCK:
   case VK_FORMAT_ASTC_6x6_SRGB_BLOCK:
   case VK_FORMAT_ASTC_8x5_UNORM_BLOCK:
   case VK_FORMAT_ASTC_8x5_SRGB_BLOCK:
   case VK_FORMAT_ASTC_8x6_UNORM_BLOCK:
   case VK_FORMAT_ASTC_8x6_SRGB_BLOCK:
   case VK_FORMAT_ASTC_8x8_UNORM_BLOCK:
   case VK_FORMAT_ASTC_8x8_SRGB_BLOCK:
   case VK_FORMAT_ASTC_10x5_UNORM_BLOCK:
   case VK_FORMAT_ASTC_10x5_SRGB_BLOCK:
   case VK_FORMAT_ASTC_10x6_UNORM_BLOCK:
   case VK_FORMAT_ASTC_10x6_SRGB_BLOCK:
   case VK_FORMAT_ASTC_10x8_UNORM_BLOCK:
   case VK_FORMAT_ASTC_10x8_SRGB_BLOCK:
   case VK_FORMAT_ASTC_10x10_UNORM_BLOCK:
   case VK_FORMAT_ASTC_10x10_SRGB_BLOCK:
   case VK_FORMAT_ASTC_12x10_UNORM_BLOCK:
   case VK_FORMAT_ASTC_12x10_SRGB_BLOCK:
   case VK_FORMAT_ASTC_12x12_UNORM_BLOCK:
   case VK_FORMAT_ASTC_12x12_SRGB_BLOCK:
      return VK_FORMAT_R32G32B32A32_UINT;

   case VK_FORMAT_ETC2_R8G8B8A1_UNORM_BLOCK:
   case VK_FORMAT_ETC2_R8G8B8A1_SRGB_BLOCK:
   case VK_FORMAT_ETC2_R8G8B8_UNORM_BLOCK:
   case VK_FORMAT_ETC2_R8G8B8_SRGB_BLOCK:
   case VK_FORMAT_EAC_R11_UNORM_BLOCK:
   case VK_FORMAT_EAC_R11_SNORM_BLOCK:
   case VK_FORMAT_BC1_RGB_UNORM_BLOCK:
   case VK_FORMAT_BC1_RGB_SRGB_BLOCK:
   case VK_FORMAT_BC1_RGBA_UNORM_BLOCK:
   case VK_FORMAT_BC1_RGBA_SRGB_BLOCK:
      return VK_FORMAT_R16G16B16A16_UINT;

   default:
      return VK_FORMAT_UNDEFINED;
   }
}

/**
 * Checks if we can implement an image copy or clear operation using the TLB
 * hardware.
 *
 * The extent and miplevel are only used to validate tile stores (to match the
 * region to store against the miplevel dimensions and avoid cases where the
 * region to store is not aligned to tile boundaries). If extent is NULL no
 * checks are done (which is fine if the image will only be used for a TLB
 * load or when we know in advance that the store will be for the entire size
 * of the image miplevel).
 *
 * For TLB copies we are doing a per-plane copy, so for multi-plane formats,
 * the compatible format will be single-plane.
 */
bool
v3dv_meta_can_use_tlb(struct v3dv_image *image,
                      uint8_t plane,
                      uint8_t miplevel,
                      const VkOffset3D *offset,
                      const VkExtent3D *extent,
                      VkFormat *compat_format)
{
   if (offset->x != 0 || offset->y != 0)
      return false;

   /* FIXME: this is suboptimal, what we really want to check is that the
    * extent of the region to copy is the full slice or a multiple of the
    * tile size.
    */
   if (extent) {
      struct v3d_resource_slice *slice = &image->planes[plane].slices[miplevel];
      if (slice->width != extent->width || slice->height != extent->height)
         return false;
   }

   if (image->format->planes[plane].rt_type != V3D_OUTPUT_IMAGE_FORMAT_NO) {
      if (compat_format)
         *compat_format = image->planes[plane].vk_format;
      return true;
   }

   /* If the image format is not TLB-supported, then check if we can use
    * a compatible format instead.
    */
   if (compat_format) {
      *compat_format = get_compatible_tlb_format(image->planes[plane].vk_format);
      if (*compat_format != VK_FORMAT_UNDEFINED) {
         assert(vk_format_get_plane_count(*compat_format) == 1);
         return true;
      }
   }

   return false;
}

/* Implements a copy using the TLB.
 *
 * This only works if we are copying from offset (0,0), since a TLB store for
 * tile (x,y) will be written at the same tile offset into the destination.
 * When this requirement is not met, we need to use a blit instead.
 *
 * Returns true if the implementation supports the requested operation (even
 * if it failed to process it, for example, due to an out-of-memory error).
 */
static bool
copy_image_to_buffer_tlb(struct v3dv_cmd_buffer *cmd_buffer,
                         struct v3dv_buffer *buffer,
                         struct v3dv_image *image,
                         const VkBufferImageCopy2 *region)
{
   VkFormat fb_format;
   uint8_t plane = v3dv_plane_from_aspect(region->imageSubresource.aspectMask);
   assert(plane < image->plane_count);

   if (!v3dv_meta_can_use_tlb(image, plane, region->imageSubresource.mipLevel,
                              &region->imageOffset, &region->imageExtent,
                              &fb_format)) {
      return false;
   }

   uint32_t internal_type, internal_bpp;
   v3dv_X(cmd_buffer->device, get_internal_type_bpp_for_image_aspects)
      (fb_format, region->imageSubresource.aspectMask,
       &internal_type, &internal_bpp);

   uint32_t num_layers;
   if (image->vk.image_type != VK_IMAGE_TYPE_3D) {
      num_layers = vk_image_subresource_layer_count(&image->vk,
                                                    &region->imageSubresource);
   } else {
      num_layers = region->imageExtent.depth;
   }
   assert(num_layers > 0);

   struct v3dv_job *job =
      v3dv_cmd_buffer_start_job(cmd_buffer, -1, V3DV_JOB_TYPE_GPU_CL);
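   /* Failing to allocate the job means we are out of memory; the copy itself
    * is supported, so return true per this function's contract.
    */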
   if (!job)
      return true;

   /* Handle copy from compressed format using a compatible format */
   const uint32_t block_w =
      vk_format_get_blockwidth(image->planes[plane].vk_format);
   const uint32_t block_h =
      vk_format_get_blockheight(image->planes[plane].vk_format);
   const uint32_t width = DIV_ROUND_UP(region->imageExtent.width, block_w);
   const uint32_t height = DIV_ROUND_UP(region->imageExtent.height, block_h);

   v3dv_job_start_frame(job, width, height, num_layers, false, true, 1,
                        internal_bpp, 4 * v3d_internal_bpp_words(internal_bpp),
                        false);

   struct v3dv_meta_framebuffer framebuffer;
   v3dv_X(job->device, meta_framebuffer_init)(&framebuffer, fb_format,
                                              internal_type, &job->frame_tiling);

   v3dv_X(job->device, job_emit_binning_flush)(job);
   v3dv_X(job->device, meta_emit_copy_image_to_buffer_rcl)
      (job, buffer, image, &framebuffer, region);

   v3dv_cmd_buffer_finish_job(cmd_buffer);

   return true;
}

static bool
blit_shader(struct v3dv_cmd_buffer *cmd_buffer,
            struct v3dv_image *dst,
            VkFormat dst_format,
            struct v3dv_image *src,
            VkFormat src_format,
            VkColorComponentFlags cmask,
            VkComponentMapping *cswizzle,
            const VkImageBlit2 *region,
            VkFilter filter,
            bool dst_is_padded_image);


/**
 * A structure that contains all the information we may need in various
 * processes involving image to buffer copies implemented with blit paths.
 */
struct image_to_buffer_info {
   /* Source image info */
   VkFormat src_format;
   uint8_t plane;
   VkColorComponentFlags cmask;
   VkComponentMapping cswizzle;
   VkImageAspectFlags src_copy_aspect;
   uint32_t block_width;
   uint32_t block_height;

   /* Destination buffer info */
   VkFormat dst_format;
   uint32_t buf_width;
   uint32_t buf_height;
   uint32_t buf_bpp;
   VkImageAspectFlags dst_copy_aspect;
};

static VkImageBlit2
blit_region_for_image_to_buffer(const VkOffset3D *offset,
                                const VkExtent3D *extent,
                                uint32_t mip_level,
                                uint32_t base_layer,
                                uint32_t layer_offset,
                                struct image_to_buffer_info *info)
{
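   /* Offsets and extents are given in texels of the source image format;
    * dividing by the block dimensions converts them to units of the
    * uncompressed compatible format used for the blit (for uncompressed
    * sources the block size is 1x1 and this is a no-op).
    */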
   VkImageBlit2 output = {
      .sType = VK_STRUCTURE_TYPE_IMAGE_BLIT_2,
      .srcSubresource = {
         .aspectMask = info->src_copy_aspect,
         .mipLevel = mip_level,
         .baseArrayLayer = base_layer + layer_offset,
         .layerCount = 1,
      },
      .srcOffsets = {
         {
            DIV_ROUND_UP(offset->x, info->block_width),
            DIV_ROUND_UP(offset->y, info->block_height),
            offset->z + layer_offset,
         },
         {
            DIV_ROUND_UP(offset->x + extent->width, info->block_width),
            DIV_ROUND_UP(offset->y + extent->height, info->block_height),
            offset->z + layer_offset + 1,
         },
      },
      .dstSubresource = {
         .aspectMask = info->dst_copy_aspect,
         .mipLevel = 0,
         .baseArrayLayer = 0,
         .layerCount = 1,
      },
      .dstOffsets = {
         { 0, 0, 0 },
         {
            DIV_ROUND_UP(extent->width, info->block_width),
            DIV_ROUND_UP(extent->height, info->block_height),
            1
         },
      },
   };

   return output;
}

/**
 * Produces an image_to_buffer_info struct from a VkBufferImageCopy2 that we
 * can use to implement image to buffer copies with blit paths.
 *
 * Returns false if the copy operation can't be implemented with a blit.
 */
static bool
gather_image_to_buffer_info(struct v3dv_cmd_buffer *cmd_buffer,
                            struct v3dv_image *image,
                            const VkBufferImageCopy2 *region,
                            struct image_to_buffer_info *out_info)
{
   bool supported = false;

   VkImageAspectFlags dst_copy_aspect = region->imageSubresource.aspectMask;
   /* For multi-planar images we copy one plane at a time using an image alias
    * with a color aspect for each plane.
    */
   if (image->plane_count > 1)
      dst_copy_aspect = VK_IMAGE_ASPECT_COLOR_BIT;

   VkImageAspectFlags src_copy_aspect = region->imageSubresource.aspectMask;
   uint8_t plane = v3dv_plane_from_aspect(src_copy_aspect);
   assert(plane < image->plane_count);

   /* Generally, the bpp of the data in the buffer matches that of the
    * source image. The exception is the case where we are copying
    * stencil (8bpp) to a combined d24s8 image (32bpp).
    */
   uint32_t buffer_bpp = image->planes[plane].cpp;

   /* Because we are going to implement the copy as a blit, we need to create
    * a linear image from the destination buffer and we also want our blit
    * source and destination formats to be the same (to avoid any format
    * conversions), so we choose a canonical format that matches the
    * source image bpp.
    *
    * The exception to the above is copying from combined depth/stencil images
    * because we are copying only one aspect of the image, so we need to set up
    * our formats, color write mask and source swizzle mask to match that.
    */
   VkFormat dst_format;
   VkFormat src_format;
   VkColorComponentFlags cmask = 0; /* All components */
   VkComponentMapping cswizzle = {
      .r = VK_COMPONENT_SWIZZLE_IDENTITY,
      .g = VK_COMPONENT_SWIZZLE_IDENTITY,
      .b = VK_COMPONENT_SWIZZLE_IDENTITY,
      .a = VK_COMPONENT_SWIZZLE_IDENTITY,
   };
   switch (buffer_bpp) {
   case 16:
      assert(dst_copy_aspect == VK_IMAGE_ASPECT_COLOR_BIT);
      dst_format = VK_FORMAT_R32G32B32A32_UINT;
      src_format = dst_format;
      break;
   case 8:
      assert(dst_copy_aspect == VK_IMAGE_ASPECT_COLOR_BIT);
      dst_format = VK_FORMAT_R16G16B16A16_UINT;
      src_format = dst_format;
      break;
   case 4:
      switch (dst_copy_aspect) {
      case VK_IMAGE_ASPECT_COLOR_BIT:
         src_format = VK_FORMAT_R8G8B8A8_UINT;
         dst_format = VK_FORMAT_R8G8B8A8_UINT;
         break;
      case VK_IMAGE_ASPECT_DEPTH_BIT:
         assert(image->plane_count == 1);
         assert(image->vk.format == VK_FORMAT_D32_SFLOAT ||
                image->vk.format == VK_FORMAT_D24_UNORM_S8_UINT ||
                image->vk.format == VK_FORMAT_X8_D24_UNORM_PACK32);
         if (image->vk.format == VK_FORMAT_D32_SFLOAT) {
            src_format = VK_FORMAT_R32_UINT;
            dst_format = VK_FORMAT_R32_UINT;
         } else {
            /* We want to write depth in the buffer in the first 24 bits,
             * however, the hardware has depth in bits 8-31, so swizzle the
             * source components to match what we want. Also, we don't
             * want to write bits 24-31 in the destination.
             */
            src_format = VK_FORMAT_R8G8B8A8_UINT;
            dst_format = VK_FORMAT_R8G8B8A8_UINT;
            cmask = VK_COLOR_COMPONENT_R_BIT |
                    VK_COLOR_COMPONENT_G_BIT |
                    VK_COLOR_COMPONENT_B_BIT;
            cswizzle.r = VK_COMPONENT_SWIZZLE_G;
            cswizzle.g = VK_COMPONENT_SWIZZLE_B;
            cswizzle.b = VK_COMPONENT_SWIZZLE_A;
            cswizzle.a = VK_COMPONENT_SWIZZLE_ZERO;
         }
         break;
      case VK_IMAGE_ASPECT_STENCIL_BIT:
         assert(image->plane_count == 1);
         assert(dst_copy_aspect == VK_IMAGE_ASPECT_STENCIL_BIT);
         assert(image->vk.format == VK_FORMAT_D24_UNORM_S8_UINT);
         /* Copying from S8D24. We want to write 8-bit stencil values only,
          * so adjust the buffer bpp for that. Since the hardware stores stencil
          * in the LSB, we can just do a RGBA8UI to R8UI blit.
          */
         src_format = VK_FORMAT_R8G8B8A8_UINT;
         dst_format = VK_FORMAT_R8_UINT;
         buffer_bpp = 1;
         break;
      default:
         unreachable("unsupported aspect");
         return supported;
      };
      break;
   case 2:
      assert(dst_copy_aspect == VK_IMAGE_ASPECT_COLOR_BIT ||
             dst_copy_aspect == VK_IMAGE_ASPECT_DEPTH_BIT);
      dst_format = VK_FORMAT_R16_UINT;
      src_format = dst_format;
      break;
   case 1:
      assert(dst_copy_aspect == VK_IMAGE_ASPECT_COLOR_BIT);
      dst_format = VK_FORMAT_R8_UINT;
      src_format = dst_format;
      break;
   default:
      unreachable("unsupported bit-size");
      return supported;
   };

   /* The hardware doesn't support linear depth/stencil stores, so we
    * implement copies of depth/stencil aspect as color copies using a
    * compatible color format.
    */
   assert(vk_format_is_color(src_format));
   assert(vk_format_is_color(dst_format));
   dst_copy_aspect = VK_IMAGE_ASPECT_COLOR_BIT;

   /* We should be able to handle the blit if we got this far */
   supported = true;

   /* Obtain the 2D buffer region spec */
   uint32_t buf_width, buf_height;
   if (region->bufferRowLength == 0)
      buf_width = region->imageExtent.width;
   else
      buf_width = region->bufferRowLength;

   if (region->bufferImageHeight == 0)
      buf_height = region->imageExtent.height;
   else
      buf_height = region->bufferImageHeight;

   /* If the image is compressed, the bpp refers to blocks, not pixels */
   uint32_t block_width =
      vk_format_get_blockwidth(image->planes[plane].vk_format);
   uint32_t block_height =
      vk_format_get_blockheight(image->planes[plane].vk_format);
   buf_width = DIV_ROUND_UP(buf_width, block_width);
   buf_height = DIV_ROUND_UP(buf_height, block_height);

   out_info->src_format = src_format;
   out_info->dst_format = dst_format;
   out_info->src_copy_aspect = src_copy_aspect;
   out_info->dst_copy_aspect = dst_copy_aspect;
   out_info->buf_width = buf_width;
   out_info->buf_height = buf_height;
   out_info->buf_bpp = buffer_bpp;
   out_info->block_width = block_width;
   out_info->block_height = block_height;
   out_info->cmask = cmask;
   out_info->cswizzle = cswizzle;
   out_info->plane = plane;

   return supported;
}

/* Creates a linear image to alias buffer memory. It also includes that image
 * as a private object in the cmd_buffer.
 *
 * This is used for cases where we want to implement an image to buffer copy,
 * but we need to rely on a mechanism that uses an image as destination, like
 * blitting.
 */
static VkResult
create_image_from_buffer(struct v3dv_cmd_buffer *cmd_buffer,
                         struct v3dv_buffer *buffer,
                         const VkBufferImageCopy2 *region,
                         struct image_to_buffer_info *info,
                         uint32_t layer,
                         VkImage *out_image)
{
   VkImageCreateInfo image_info = {
      .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO,
      .imageType = VK_IMAGE_TYPE_2D,
      .format = info->dst_format,
      .extent = { info->buf_width, info->buf_height, 1 },
      .mipLevels = 1,
      .arrayLayers = 1,
      .samples = VK_SAMPLE_COUNT_1_BIT,
      .tiling = VK_IMAGE_TILING_LINEAR,
      .usage = VK_IMAGE_USAGE_TRANSFER_DST_BIT,
      .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
      .queueFamilyIndexCount = 0,
      .initialLayout = VK_IMAGE_LAYOUT_GENERAL,
   };

   VkResult result;
   struct v3dv_device *device = cmd_buffer->device;
   VkDevice _device = v3dv_device_to_handle(device);

   VkImage buffer_image;
   result =
      v3dv_CreateImage(_device, &image_info, &device->vk.alloc, &buffer_image);
   if (result != VK_SUCCESS)
      return result;

   *out_image = buffer_image;

   v3dv_cmd_buffer_add_private_obj(
      cmd_buffer, (uintptr_t)buffer_image,
      (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyImage);

   /* Bind the buffer memory to the image. */
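   /* Layers in the buffer are tightly packed 2D regions of
    * buf_width x buf_height elements of buf_bpp bytes each, so advance the
    * base offset by one such region per layer.
    */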
   VkDeviceSize buffer_offset = buffer->mem_offset + region->bufferOffset +
      layer * info->buf_width * info->buf_height * info->buf_bpp;

   result =
      vk_common_BindImageMemory(_device, buffer_image,
                                v3dv_device_memory_to_handle(buffer->mem),
                                buffer_offset);
   return result;
}

/**
 * Creates an image with a single mip level that aliases the memory of a
 * mip level in another image, re-interpreting the memory with an uncompressed
 * format. The image is added to the command buffer as a private object for
 * disposal.
 */
static bool
create_image_mip_level_alias(struct v3dv_cmd_buffer *cmd_buffer,
                             struct v3dv_image *image,
                             VkFormat format,
                             uint32_t plane,
                             uint32_t mip_level,
                             uint32_t layer,
                             VkImage *alias)
{
   VkResult result;
   assert(!vk_format_is_compressed(format));

   struct v3dv_device *device = cmd_buffer->device;
   VkDevice vk_device = v3dv_device_to_handle(device);
   uint32_t mip_width = image->planes[plane].slices[mip_level].width;
   uint32_t mip_height = image->planes[plane].slices[mip_level].height;

   uint32_t block_width =
      vk_format_get_blockwidth(image->planes[plane].vk_format);
   uint32_t block_height =
      vk_format_get_blockheight(image->planes[plane].vk_format);

   VkImageCreateInfo info = {
      .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO,
      .imageType = image->vk.image_type,
      .format = format,
      .extent = { DIV_ROUND_UP(mip_width, block_width),
                  DIV_ROUND_UP(mip_height, block_height),
                  1 },
      .mipLevels = 1,
      .arrayLayers = 1,
      .samples = image->vk.samples,
      .tiling = image->tiled ? VK_IMAGE_TILING_OPTIMAL : VK_IMAGE_TILING_LINEAR,
      .usage = VK_IMAGE_USAGE_TRANSFER_SRC_BIT,
      .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
      .queueFamilyIndexCount = 0,
      .initialLayout = VK_IMAGE_LAYOUT_GENERAL,
   };
   result = v3dv_CreateImage(vk_device, &info, &device->vk.alloc, alias);
   if (result != VK_SUCCESS)
      return false;

   /* The alias we have just created has just one mip, but we may be aliasing
    * any mip in the original image. Because the slice setup changes based on
    * the mip (particularly, for mips >= 2 it uses power of 2 sizes internally)
    * and this can influence the tiling layout selected for the slice, we want
    * to make sure we copy the slice description from the actual mip level in
    * the original image, and then rewrite any fields that we need for the
    * alias. Particularly, we want to make the offset 0 because we are going to
    * bind the underlying image memory exactly at the start of the selected mip.
    * We also want to relax the image alignment requirements to the minimum
    * (the one imposed by the Texture Base Address field) since we may not be
    * aliasing a level 0 (for which we typically want a page alignment for
    * optimal performance).
    */
   V3DV_FROM_HANDLE(v3dv_image, v3dv_alias, *alias);
   v3dv_alias->planes[plane].slices[0] = image->planes[plane].slices[mip_level];
   v3dv_alias->planes[plane].slices[0].width = info.extent.width;
   v3dv_alias->planes[plane].slices[0].height = info.extent.height;
   v3dv_alias->planes[plane].slices[0].offset = 0;
   v3dv_alias->planes[plane].alignment = 64;

   v3dv_cmd_buffer_add_private_obj(
      cmd_buffer, (uintptr_t)*alias,
      (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyImage);

   result =
      vk_common_BindImageMemory(vk_device, *alias,
                                v3dv_device_memory_to_handle(image->planes[plane].mem),
                                v3dv_layer_offset(image, mip_level, layer, plane));
   return result == VK_SUCCESS;
}

/**
 * Returns true if the implementation supports the requested operation (even if
 * it failed to process it, for example, due to an out-of-memory error).
 */
static bool
copy_image_to_buffer_blit(struct v3dv_cmd_buffer *cmd_buffer,
                          struct v3dv_buffer *buffer,
                          struct v3dv_image *image,
                          const VkBufferImageCopy2 *region)
{
   bool handled = false;
   struct image_to_buffer_info info;

   /* This path uses a shader blit which doesn't support linear images. Return
    * early to avoid all the heavy lifting in preparation for the
    * blit_shader() call that is bound to fail in that scenario.
    */
   if (!image->tiled && image->vk.image_type != VK_IMAGE_TYPE_1D) {
      return handled;
   }

   handled = gather_image_to_buffer_info(cmd_buffer, image, region,
                                         &info);

   if (!handled)
      return handled;

   /* We should be able to handle the blit if we got this far */
   handled = true;

   /* Compute layers to copy */
   uint32_t num_layers;
   if (image->vk.image_type != VK_IMAGE_TYPE_3D) {
      num_layers = vk_image_subresource_layer_count(&image->vk,
                                                    &region->imageSubresource);
   } else {
      num_layers = region->imageExtent.depth;
   }
   assert(num_layers > 0);

   /* Copy requested layers */
   VkResult result;
   VkImageBlit2 blit_region;
   uint32_t mip_level = region->imageSubresource.mipLevel;
   uint32_t base_layer = region->imageSubresource.baseArrayLayer;
   for (uint32_t i = 0; i < num_layers; i++) {
      uint32_t layer_offset = i;

      if (vk_format_is_compressed(image->vk.format)) {
         /* Our blit interface can see the real format of the images to detect
          * copies between compressed and uncompressed images and adapt the
          * blit region accordingly. Here we are just doing a raw copy of
          * compressed data, but we are passing an uncompressed view of the
          * buffer for the blit destination image (since compressed formats are
          * not renderable), so we also want to provide an uncompressed view of
          * the source image.
          *
          * It is important that we create the alias over the selected mip
          * level (instead of aliasing the entire image) because an uncompressed
          * view of the image won't have the same number of mip levels as the
          * original image and the implicit mip size calculations the hw will
          * do to sample from a non-zero mip level may not match exactly between
          * compressed and uncompressed views.
          */
         VkImage alias;
         if (!create_image_mip_level_alias(cmd_buffer, image, info.dst_format,
                                           info.plane, mip_level,
                                           base_layer + layer_offset,
                                           &alias)) {
            return handled;
         }

         /* We are aliasing the selected mip level and layer with a
          * single-mip and single-layer image.
          */
         image = v3dv_image_from_handle(alias);
         mip_level = 0;
         base_layer = 0;
         layer_offset = 0;
      }

      /* Create the destination blit image from the destination buffer */
      VkImage buffer_image;
      result =
         create_image_from_buffer(cmd_buffer, buffer, region, &info,
                                  i, &buffer_image);
      if (result != VK_SUCCESS)
         return handled;

      /* Blit-copy the requested image extent.
       *
       * Since we are copying, the blit must use the same format on the
       * destination and source images to avoid format conversions. The
       * only exception is copying stencil, which we upload to a R8UI source
       * image, but that we need to blit to a S8D24 destination (the only
       * stencil format we support).
       */
      blit_region =
         blit_region_for_image_to_buffer(&region->imageOffset,
                                         &region->imageExtent,
                                         mip_level, base_layer, layer_offset,
                                         &info);

      handled = blit_shader(cmd_buffer,
                            v3dv_image_from_handle(buffer_image),
                            info.dst_format,
                            image, info.src_format,
                            info.cmask, &info.cswizzle,
                            &blit_region, VK_FILTER_NEAREST, false);
      if (!handled) {
         /* This is unexpected, we should have a supported blit spec */
         unreachable("Unable to blit buffer to destination image");
         return false;
      }
   }

   assert(handled);
   return true;
}

static bool
copy_image_linear_texel_buffer(struct v3dv_cmd_buffer *cmd_buffer,
                               struct v3dv_image *dst,
                               struct v3dv_image *src,
                               const VkImageCopy2 *region);

static VkImageCopy2
image_copy_region_for_image_to_buffer(const VkBufferImageCopy2 *region,
                                      struct image_to_buffer_info *info,
                                      uint32_t layer)
{
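   /* As in blit_region_for_image_to_buffer(), convert offsets and extents
    * from texels to compressed-block units.
    */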
   VkImageCopy2 output = {
      .sType = VK_STRUCTURE_TYPE_IMAGE_COPY_2,
      .srcSubresource = {
         .aspectMask = info->src_copy_aspect,
         .mipLevel = region->imageSubresource.mipLevel,
         .baseArrayLayer = region->imageSubresource.baseArrayLayer + layer,
         .layerCount = 1,
      },
      .srcOffset = {
            DIV_ROUND_UP(region->imageOffset.x, info->block_width),
            DIV_ROUND_UP(region->imageOffset.y, info->block_height),
            region->imageOffset.z,
      },
      .dstSubresource = {
         .aspectMask = info->dst_copy_aspect,
         .mipLevel = 0,
         .baseArrayLayer = 0,
         .layerCount = 1,
      },
      .dstOffset = { 0, 0, 0 },
      .extent = {
         DIV_ROUND_UP(region->imageExtent.width, info->block_width),
         DIV_ROUND_UP(region->imageExtent.height, info->block_height),
         1
      },
   };

   return output;
}

/**
 * Returns true if the implementation supports the requested operation (even if
 * it failed to process it, for example, due to an out-of-memory error).
 */
static bool
copy_image_to_buffer_texel_buffer(struct v3dv_cmd_buffer *cmd_buffer,
                                  struct v3dv_buffer *dst_buffer,
                                  struct v3dv_image *src_image,
                                  const VkBufferImageCopy2 *region)
{
   bool handled = false;
   VkImage dst_buffer_image;
   struct image_to_buffer_info info;

   /* This is a requirement for copy_image_linear_texel_buffer below. We
    * check it in advance in order to do an early return.
    */
   if (src_image->tiled)
      return false;

   handled =
      gather_image_to_buffer_info(cmd_buffer, src_image, region,
                                  &info);
   if (!handled)
      return handled;

   /* At this point the implementation should support the copy; any failure
    * below is for a different reason, such as an out-of-memory error.
    */
   handled = true;

   uint32_t num_layers;
   if (src_image->vk.image_type != VK_IMAGE_TYPE_3D) {
      num_layers = vk_image_subresource_layer_count(&src_image->vk,
                                                    &region->imageSubresource);
   } else {
      num_layers = region->imageExtent.depth;
   }
   assert(num_layers > 0);

   VkResult result;
   VkImageCopy2 image_region;
   for (uint32_t layer = 0; layer < num_layers; layer++) {
      /* Create the destination image from the destination buffer */
      result =
         create_image_from_buffer(cmd_buffer, dst_buffer, region, &info,
                                  layer, &dst_buffer_image);
      if (result != VK_SUCCESS)
         return handled;

      image_region =
         image_copy_region_for_image_to_buffer(region, &info, layer);

      handled =
         copy_image_linear_texel_buffer(cmd_buffer,
                                        v3dv_image_from_handle(dst_buffer_image),
                                        src_image, &image_region);
   }

   return handled;
}

VKAPI_ATTR void VKAPI_CALL
v3dv_CmdCopyImageToBuffer2(VkCommandBuffer commandBuffer,
                           const VkCopyImageToBufferInfo2 *info)
{
   V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
   V3DV_FROM_HANDLE(v3dv_image, image, info->srcImage);
   V3DV_FROM_HANDLE(v3dv_buffer, buffer, info->dstBuffer);

   assert(image->vk.samples == VK_SAMPLE_COUNT_1_BIT);

   cmd_buffer->state.is_transfer = true;

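   /* Try the fastest path first (direct TLB stores), then fall back to a
    * shader blit, and finally to the texel buffer copy path for linear
    * images.
    */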
   for (uint32_t i = 0; i < info->regionCount; i++) {
      const VkBufferImageCopy2 *region = &info->pRegions[i];

      if (copy_image_to_buffer_tlb(cmd_buffer, buffer, image, region))
         continue;

      if (copy_image_to_buffer_blit(cmd_buffer, buffer, image, region))
         continue;

      if (copy_image_to_buffer_texel_buffer(cmd_buffer, buffer, image, region))
         continue;

      unreachable("Unsupported image to buffer copy.");
   }
   cmd_buffer->state.is_transfer = false;
}

/**
 * Returns true if the implementation supports the requested operation (even if
 * it failed to process it, for example, due to an out-of-memory error).
 */
static bool
copy_image_tfu(struct v3dv_cmd_buffer *cmd_buffer,
               struct v3dv_image *dst,
               struct v3dv_image *src,
               const VkImageCopy2 *region)
{
   if (V3D_DBG(DISABLE_TFU)) {
      perf_debug("Copy images: TFU disabled, fallbacks could be slower.\n");
      return false;
   }

   /* Destination can't be raster format */
   if (!dst->tiled)
      return false;

   /* We can only do full copies, so if the format is D24S8 both aspects need
    * to be copied. We only need to check the dst format because the spec
    * states that depth/stencil formats must match exactly.
    */
   if (dst->vk.format == VK_FORMAT_D24_UNORM_S8_UINT) {
       const VkImageAspectFlags ds_aspects = VK_IMAGE_ASPECT_DEPTH_BIT |
                                             VK_IMAGE_ASPECT_STENCIL_BIT;
       if (region->dstSubresource.aspectMask != ds_aspects)
          return false;
   }

   /* Don't handle copies between uncompressed and compressed formats for now.
    *
    * FIXME: we should be able to handle these easily, but there is no coverage
    * in CTS at the moment that makes such copies with full images (which we
    * require here), only partial copies. Also, in that case the code below that
    * checks for "dst image complete" requires some changes, since it is
    * checking against the region dimensions, which are in units of the source
    * image format.
    */
   if (vk_format_is_compressed(dst->vk.format) !=
       vk_format_is_compressed(src->vk.format)) {
      return false;
   }

   /* Source region must start at (0,0) */
   if (region->srcOffset.x != 0 || region->srcOffset.y != 0)
      return false;

   /* Destination image must be complete */
   if (region->dstOffset.x != 0 || region->dstOffset.y != 0)
      return false;

   uint8_t src_plane =
      v3dv_plane_from_aspect(region->srcSubresource.aspectMask);
   uint8_t dst_plane =
      v3dv_plane_from_aspect(region->dstSubresource.aspectMask);

   const uint32_t dst_mip_level = region->dstSubresource.mipLevel;
   uint32_t dst_width = u_minify(dst->planes[dst_plane].width, dst_mip_level);
   uint32_t dst_height = u_minify(dst->planes[dst_plane].height, dst_mip_level);
   if (region->extent.width != dst_width || region->extent.height != dst_height)
      return false;

   /* From vkCmdCopyImage:
    *
    *   "When copying between compressed and uncompressed formats the extent
    *    members represent the texel dimensions of the source image and not
    *    the destination."
    */
   const uint32_t block_w =
      vk_format_get_blockwidth(src->planes[src_plane].vk_format);
   const uint32_t block_h =
      vk_format_get_blockheight(src->planes[src_plane].vk_format);
   uint32_t width = DIV_ROUND_UP(region->extent.width, block_w);
   uint32_t height = DIV_ROUND_UP(region->extent.height, block_h);

   /* Account for sample count */
   assert(dst->vk.samples == src->vk.samples);
   if (dst->vk.samples > VK_SAMPLE_COUNT_1_BIT) {
      assert(dst->vk.samples == VK_SAMPLE_COUNT_4_BIT);
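      /* 4x MSAA surfaces are stored with 2x2 samples per pixel, so the
       * physical dimensions of the surface are twice the logical ones in
       * each axis.
       */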
      width *= 2;
      height *= 2;
   }

   /* The TFU unit doesn't handle format conversions so we need the formats to
    * match. On the other hand, vkCmdCopyImage allows different color formats
    * on the source and destination images, but only if they are texel
    * compatible. For us, this means that we can effectively ignore different
    * formats and just make the copy using either of them, since we are just
    * moving raw data and not making any conversions.
    *
    * Also, the formats supported by the TFU unit are limited, but again, since
    * we are only doing raw copies here without interpreting or converting
    * the underlying pixel data according to its format, we can always choose
    * to use compatible formats that are supported with the TFU unit.
    */
   assert(dst->planes[dst_plane].cpp == src->planes[src_plane].cpp);
   const struct v3dv_format *format =
      v3dv_get_compatible_tfu_format(cmd_buffer->device,
                                     dst->planes[dst_plane].cpp, NULL);

   /* Emit a TFU job for each layer to blit */
   const uint32_t layer_count = dst->vk.image_type != VK_IMAGE_TYPE_3D ?
      vk_image_subresource_layer_count(&dst->vk, &region->dstSubresource) :
      region->extent.depth;
   const uint32_t src_mip_level = region->srcSubresource.mipLevel;

   const uint32_t base_src_layer = src->vk.image_type != VK_IMAGE_TYPE_3D ?
      region->srcSubresource.baseArrayLayer : region->srcOffset.z;
   const uint32_t base_dst_layer = dst->vk.image_type != VK_IMAGE_TYPE_3D ?
      region->dstSubresource.baseArrayLayer : region->dstOffset.z;
   for (uint32_t i = 0; i < layer_count; i++) {
      const uint32_t dst_offset =
         dst->planes[dst_plane].mem->bo->offset +
         v3dv_layer_offset(dst, dst_mip_level, base_dst_layer + i, dst_plane);
      const uint32_t src_offset =
         src->planes[src_plane].mem->bo->offset +
         v3dv_layer_offset(src, src_mip_level, base_src_layer + i, src_plane);

      const struct v3d_resource_slice *dst_slice =
         &dst->planes[dst_plane].slices[dst_mip_level];
      const struct v3d_resource_slice *src_slice =
         &src->planes[src_plane].slices[src_mip_level];

      v3dv_X(cmd_buffer->device, meta_emit_tfu_job)(
         cmd_buffer,
         dst->planes[dst_plane].mem->bo->handle,
         dst_offset,
         dst_slice->tiling,
         dst_slice->padded_height,
         dst->planes[dst_plane].cpp,
         src->planes[src_plane].mem->bo->handle,
         src_offset,
         src_slice->tiling,
         src_slice->tiling == V3D_TILING_RASTER ?
                              src_slice->stride : src_slice->padded_height,
         src->planes[src_plane].cpp,
         /* All compatible TFU formats are single-plane */
         width, height, &format->planes[0]);
   }

   return true;
}

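/* Thin exported wrapper so other parts of the driver can reach the static
 * TFU copy path above.
 */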
inline bool
v3dv_cmd_buffer_copy_image_tfu(struct v3dv_cmd_buffer *cmd_buffer,
                               struct v3dv_image *dst,
                               struct v3dv_image *src,
                               const VkImageCopy2 *region)
{
   return copy_image_tfu(cmd_buffer, dst, src, region);
}

/**
 * Returns true if the implementation supports the requested operation (even if
 * it failed to process it, for example, due to an out-of-memory error).
 */
static bool
copy_image_tlb(struct v3dv_cmd_buffer *cmd_buffer,
               struct v3dv_image *dst,
               struct v3dv_image *src,
               const VkImageCopy2 *region)
{
   uint8_t src_plane =
      v3dv_plane_from_aspect(region->srcSubresource.aspectMask);
   assert(src_plane < src->plane_count);
   uint8_t dst_plane =
      v3dv_plane_from_aspect(region->dstSubresource.aspectMask);
   assert(dst_plane < dst->plane_count);

   VkFormat fb_format;
   if (!v3dv_meta_can_use_tlb(src, src_plane, region->srcSubresource.mipLevel,
                              &region->srcOffset, NULL, &fb_format) ||
       !v3dv_meta_can_use_tlb(dst, dst_plane, region->dstSubresource.mipLevel,
                              &region->dstOffset, &region->extent, &fb_format)) {
      return false;
   }

   /* We can't do TLB stores of linear D/S */
   if (!dst->tiled && vk_format_is_depth_or_stencil(fb_format))
      return false;

   /* From the Vulkan spec, VkImageCopy valid usage:
    *
    *    "If neither the calling command's srcImage nor the calling command's
    *     dstImage has a multi-planar image format then the aspectMask member
    *     of srcSubresource and dstSubresource must match."
    */
   assert(src->plane_count != 1 || dst->plane_count != 1 ||
          region->dstSubresource.aspectMask ==
          region->srcSubresource.aspectMask);
   uint32_t internal_type, internal_bpp;
   v3dv_X(cmd_buffer->device, get_internal_type_bpp_for_image_aspects)
      (fb_format, region->dstSubresource.aspectMask,
       &internal_type, &internal_bpp);

   /* From the Vulkan spec with VK_KHR_maintenance1, VkImageCopy valid usage:
    *
    * "The number of slices of the extent (for 3D) or layers of the
    *  srcSubresource (for non-3D) must match the number of slices of the
    *  extent (for 3D) or layers of the dstSubresource (for non-3D)."
    */
   assert((src->vk.image_type != VK_IMAGE_TYPE_3D ?
           vk_image_subresource_layer_count(&src->vk, &region->srcSubresource) :
           region->extent.depth) ==
          (dst->vk.image_type != VK_IMAGE_TYPE_3D ?
           vk_image_subresource_layer_count(&dst->vk, &region->dstSubresource) :
           region->extent.depth));
   uint32_t num_layers;
   if (dst->vk.image_type != VK_IMAGE_TYPE_3D) {
      num_layers = vk_image_subresource_layer_count(&dst->vk,
                                                    &region->dstSubresource);
   } else {
      num_layers = region->extent.depth;
   }
   assert(num_layers > 0);

   struct v3dv_job *job =
      v3dv_cmd_buffer_start_job(cmd_buffer, -1, V3DV_JOB_TYPE_GPU_CL);
1358    if (!job)
1359       return true;
1360 
1361    /* Handle copy to compressed image using compatible format */
1362    const uint32_t block_w =
1363       vk_format_get_blockwidth(dst->planes[dst_plane].vk_format);
1364    const uint32_t block_h =
1365       vk_format_get_blockheight(dst->planes[dst_plane].vk_format);
1366    const uint32_t width = DIV_ROUND_UP(region->extent.width, block_w);
1367    const uint32_t height = DIV_ROUND_UP(region->extent.height, block_h);
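   /* Worked example: copying a 31x18 texel region of an ETC2 format
    * (4x4 blocks) renders a DIV_ROUND_UP(31, 4) x DIV_ROUND_UP(18, 4) =
    * 8x5 frame, one rendered pixel per compressed block.
    */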
1368 
1369    v3dv_job_start_frame(job, width, height, num_layers, false, true, 1,
1370                         internal_bpp, 4 * v3d_internal_bpp_words(internal_bpp),
1371                         src->vk.samples > VK_SAMPLE_COUNT_1_BIT);
1372 
1373    struct v3dv_meta_framebuffer framebuffer;
1374    v3dv_X(job->device, meta_framebuffer_init)(&framebuffer, fb_format,
1375                                               internal_type, &job->frame_tiling);
1376 
1377    v3dv_X(job->device, job_emit_binning_flush)(job);
1378    v3dv_X(job->device, meta_emit_copy_image_rcl)(job, dst, src, &framebuffer, region);
1379 
1380    v3dv_cmd_buffer_finish_job(cmd_buffer);
1381 
1382    return true;
1383 }
1384 
1385 /**
1386  * Takes the image provided as argument and creates a new image that has
1387  * the same specification and aliases the same memory storage, except that:
1388  *
1389  *   - It has the uncompressed format passed in.
1390  *   - Its original width/height are scaled by the factors passed in.
1391  *
1392  * This is useful to implement copies from compressed images using the blit
1393  * path. The idea is that we create uncompressed "image views" of both the
1394  * source and destination images using the uncompressed format and then we
1395  * define the copy blit in terms of that format.
1396  */
1397 static struct v3dv_image *
1398 create_image_alias(struct v3dv_cmd_buffer *cmd_buffer,
1399                    struct v3dv_image *src,
1400                    float width_scale,
1401                    float height_scale,
1402                    VkFormat format)
1403 {
1404    assert(!vk_format_is_compressed(format));
1405    /* We don't support ycbcr compressed formats */
1406    assert(src->plane_count == 1);
1407 
1408    VkDevice _device = v3dv_device_to_handle(cmd_buffer->device);
1409 
1410    VkImageCreateInfo info = {
1411       .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO,
1412       .imageType = src->vk.image_type,
1413       .format = format,
1414       .extent = {
1415          .width = src->vk.extent.width * width_scale,
1416          .height = src->vk.extent.height * height_scale,
1417          .depth = src->vk.extent.depth,
1418       },
1419       .mipLevels = src->vk.mip_levels,
1420       .arrayLayers = src->vk.array_layers,
1421       .samples = src->vk.samples,
1422       .tiling = src->tiled ? VK_IMAGE_TILING_OPTIMAL : VK_IMAGE_TILING_LINEAR,
1423       .usage = src->vk.usage,
1424    };
1425 
1426    VkImage _image;
1427    VkResult result =
1428       v3dv_CreateImage(_device, &info, &cmd_buffer->device->vk.alloc, &_image);
1429    if (result != VK_SUCCESS) {
1430       v3dv_flag_oom(cmd_buffer, NULL);
1431       return NULL;
1432    }
1433 
1434    v3dv_cmd_buffer_add_private_obj(
1435       cmd_buffer, (uintptr_t)_image,
1436       (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyImage);
1437 
1438    struct v3dv_image *image = v3dv_image_from_handle(_image);
1439    image->planes[0].mem = src->planes[0].mem;
1440    image->planes[0].mem_offset = src->planes[0].mem_offset;
1441    return image;
1442 }
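/* A minimal usage sketch (hypothetical values): aliasing a 64x64
 * ETC2_RGBA8 image as 16x16 RGBA32UI, one uncompressed texel per 4x4
 * block, covering the same 4 KB of storage:
 *
 *    struct v3dv_image *alias =
 *       create_image_alias(cmd_buffer, src, 1.0f / 4.0f, 1.0f / 4.0f,
 *                          VK_FORMAT_R32G32B32A32_UINT);
 */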
1443 
1444 /**
1445  * Returns true if the implementation supports the requested operation (even if
1446  * it failed to process it, for example, due to an out-of-memory error).
1447  */
1448 static bool
1449 copy_image_blit(struct v3dv_cmd_buffer *cmd_buffer,
1450                 struct v3dv_image *dst,
1451                 struct v3dv_image *src,
1452                 const VkImageCopy2 *region)
1453 {
1454    if (!src->tiled && src->vk.image_type != VK_IMAGE_TYPE_1D)
1455       return false;
1456 
1457    uint8_t src_plane =
1458       v3dv_plane_from_aspect(region->srcSubresource.aspectMask);
1459    assert(src_plane < src->plane_count);
1460    uint8_t dst_plane =
1461       v3dv_plane_from_aspect(region->dstSubresource.aspectMask);
1462    assert(dst_plane < dst->plane_count);
1463 
1464    const uint32_t src_block_w =
1465       vk_format_get_blockwidth(src->planes[src_plane].vk_format);
1466    const uint32_t src_block_h =
1467       vk_format_get_blockheight(src->planes[src_plane].vk_format);
1468    const uint32_t dst_block_w =
1469       vk_format_get_blockwidth(dst->planes[dst_plane].vk_format);
1470    const uint32_t dst_block_h =
1471       vk_format_get_blockheight(dst->planes[dst_plane].vk_format);
1472    const float block_scale_w = (float)src_block_w / (float)dst_block_w;
1473    const float block_scale_h = (float)src_block_h / (float)dst_block_h;
1474 
1475    /* We need to choose a single format for the blit to ensure that this is
1476     * really a copy and there are no format conversions going on. Since we
1477     * are going to blit, we need to make sure that the selected format can be
1478     * both rendered to and textured from.
1479     */
1480    VkFormat format;
1481    float src_scale_w = 1.0f;
1482    float src_scale_h = 1.0f;
1483    float dst_scale_w = block_scale_w;
1484    float dst_scale_h = block_scale_h;
1485    if (vk_format_is_compressed(src->vk.format)) {
1486       /* If we are copying from a compressed format we should be aware that we
1487        * are going to texture from the source image, and the texture setup
1488        * knows the actual size of the image, so we need to choose a format
1489        * that has a per-texel (not per-block) bpp that is compatible for that
1490        * image size. For example, for a source image with size Bw*WxBh*H
1491        * and format ETC2_RGBA8_UNORM copied to a WxH image of format RGBA32UI,
1492        * each of the Bw*WxBh*H texels in the compressed source image is 8-bit
1493        * (which translates to a 128-bit 4x4 RGBA32 block when uncompressed),
1494        * so we could specify a blit with size Bw*WxBh*H and a format with
1495        * a bpp of 8-bit per texel (R8_UINT).
1496        *
1497        * Unfortunately, when copying from a format like ETC2_RGB8A1_UNORM,
1498        * which is 64-bit per texel, then we would need a 4-bit format, which
1499        * which is 64-bit per block (4-bit per texel), we would need a 4-bit
1500        * format, which we don't have, so instead we still choose an 8-bit
1501        * format and apply a divisor to the row dimensions of the blit, since
1502        * we are copying two texels per item.
1503        * Generally, we can choose any format so long as we compute appropriate
1504        * divisors for the width and height depending on the source image's
1505        * bpp.
1506        */
1507       assert(src->planes[src_plane].cpp == dst->planes[dst_plane].cpp);
1508 
1509       format = VK_FORMAT_R32G32_UINT;
1510       switch (src->planes[src_plane].cpp) {
1511       case 16:
1512          format = VK_FORMAT_R32G32B32A32_UINT;
1513          break;
1514       case 8:
1515          format = VK_FORMAT_R16G16B16A16_UINT;
1516          break;
1517       default:
1518          unreachable("Unsupported compressed format");
1519       }
1520 
1521       /* Create image views of the src/dst images that we can interpret in
1522        * terms of the canonical format.
1523        */
1524       src_scale_w /= src_block_w;
1525       src_scale_h /= src_block_h;
1526       dst_scale_w /= src_block_w;
1527       dst_scale_h /= src_block_h;
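      /* E.g. for an ETC2_RGBA8 source (cpp = 16, 4x4 blocks) this selects
       * VK_FORMAT_R32G32B32A32_UINT and src_scale_w = src_scale_h = 1/4,
       * aliasing each 4x4 compressed block as a single RGBA32UI texel.
       */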
1528 
1529       src = create_image_alias(cmd_buffer, src,
1530                                src_scale_w, src_scale_h, format);
1531 
1532       dst = create_image_alias(cmd_buffer, dst,
1533                                dst_scale_w, dst_scale_h, format);
1534    } else {
1535       format = src->format->planes[src_plane].rt_type != V3D_OUTPUT_IMAGE_FORMAT_NO ?
1536          src->planes[src_plane].vk_format :
1537          get_compatible_tlb_format(src->planes[src_plane].vk_format);
1538       if (format == VK_FORMAT_UNDEFINED)
1539          return false;
1540 
1541       const struct v3dv_format *f = v3dv_X(cmd_buffer->device, get_format)(format);
1542       assert(f->plane_count < 2);
1543       if (!f->plane_count || f->planes[0].tex_type == TEXTURE_DATA_FORMAT_NO)
1544          return false;
1545    }
1546 
1547    /* Given an uncompressed image with size WxH, if we copy it to a compressed
1548     * image, it will result in an image with size W*bWxH*bH, where bW and bH
1549     * are the compressed format's block width and height. This means that
1550     * copies between compressed and uncompressed images involve different
1551     * image sizes, and therefore, we need to take that into account when
1552     * setting up the source and destination blit regions below, so they are
1553     * consistent from the point of view of the single compatible format
1554     * selected for the copy.
1555     *
1556     * We should take into account that the dimensions of the region provided
1557     * to the copy command are specified in terms of the source image. With that
1558     * in mind, below we adjust the blit destination region to be consistent with
1559     * the source region for the compatible format, so basically, we apply
1560     * the block scale factor to the destination offset provided by the copy
1561     * command (because it is specified in terms of the destination image, not
1562     * the source), and then we just add the region copy dimensions to that
1563     * (since the region dimensions are already specified in terms of the source
1564     * image).
1565     */
1566    uint32_t region_width = region->extent.width * src_scale_w;
1567    uint32_t region_height = region->extent.height * src_scale_h;
1568    if (src_block_w > 1)
1569       region_width = util_next_power_of_two(region_width);
1570    if (src_block_h > 1)
1571       region_height = util_next_power_of_two(region_height);
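   /* E.g. a 12x8 texel region of the 4x4-block source above scales to
    * 3x2 items, which the power-of-two rounding widens to 4x2.
    */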
1572 
1573    const VkOffset3D src_start = {
1574       region->srcOffset.x * src_scale_w,
1575       region->srcOffset.y * src_scale_h,
1576       region->srcOffset.z,
1577    };
1578    const VkOffset3D src_end = {
1579       src_start.x + region_width,
1580       src_start.y + region_height,
1581       src_start.z + region->extent.depth,
1582    };
1583 
1584    const VkOffset3D dst_start = {
1585       region->dstOffset.x * dst_scale_w,
1586       region->dstOffset.y * dst_scale_h,
1587       region->dstOffset.z,
1588    };
1589    const VkOffset3D dst_end = {
1590       dst_start.x + region_width,
1591       dst_start.y + region_height,
1592       dst_start.z + region->extent.depth,
1593    };
1594 
1595    const VkImageBlit2 blit_region = {
1596       .sType = VK_STRUCTURE_TYPE_IMAGE_BLIT_2,
1597       .srcSubresource = region->srcSubresource,
1598       .srcOffsets = { src_start, src_end },
1599       .dstSubresource = region->dstSubresource,
1600       .dstOffsets = { dst_start, dst_end },
1601    };
1602    bool handled = blit_shader(cmd_buffer,
1603                               dst, format,
1604                               src, format,
1605                               0, NULL,
1606                               &blit_region, VK_FILTER_NEAREST, true);
1607 
1608    /* We should have selected formats that we can blit */
1609    assert(handled);
1610    return handled;
1611 }
1612 
1613 static bool
1614 copy_image_linear_texel_buffer(struct v3dv_cmd_buffer *cmd_buffer,
1615                                struct v3dv_image *dst,
1616                                struct v3dv_image *src,
1617                                const VkImageCopy2 *region)
1618 {
1619    if (src->tiled)
1620       return false;
1621 
1622    /* Implementations are allowed to restrict linear images like this */
1623    assert(region->srcOffset.z == 0);
1624    assert(region->dstOffset.z == 0);
1625    assert(region->srcSubresource.mipLevel == 0);
1626    assert(region->srcSubresource.baseArrayLayer == 0);
1627    assert(region->srcSubresource.layerCount == 1);
1628    assert(region->dstSubresource.mipLevel == 0);
1629    assert(region->dstSubresource.baseArrayLayer == 0);
1630    assert(region->dstSubresource.layerCount == 1);
1631 
1632    uint8_t src_plane =
1633       v3dv_plane_from_aspect(region->srcSubresource.aspectMask);
1634    uint8_t dst_plane =
1635       v3dv_plane_from_aspect(region->dstSubresource.aspectMask);
1636 
1637    assert(src->planes[src_plane].cpp == dst->planes[dst_plane].cpp);
1638    const uint32_t bpp = src->planes[src_plane].cpp;
1639 
1640    VkFormat format;
1641    switch (bpp) {
1642    case 16:
1643       format = VK_FORMAT_R32G32B32A32_UINT;
1644       break;
1645    case 8:
1646       format = VK_FORMAT_R16G16B16A16_UINT;
1647       break;
1648    case 4:
1649       format = VK_FORMAT_R8G8B8A8_UINT;
1650       break;
1651    case 2:
1652       format = VK_FORMAT_R16_UINT;
1653       break;
1654    case 1:
1655       format = VK_FORMAT_R8_UINT;
1656       break;
1657    default:
1658       unreachable("unsupported bit-size");
1659       return false;
1660    }
1661 
1662    VkComponentMapping ident_swizzle = {
1663       .r = VK_COMPONENT_SWIZZLE_IDENTITY,
1664       .g = VK_COMPONENT_SWIZZLE_IDENTITY,
1665       .b = VK_COMPONENT_SWIZZLE_IDENTITY,
1666       .a = VK_COMPONENT_SWIZZLE_IDENTITY,
1667    };
1668 
1669    const uint32_t buf_stride = src->planes[src_plane].slices[0].stride;
1670    const VkDeviceSize buf_offset =
1671       region->srcOffset.y * buf_stride + region->srcOffset.x * bpp;
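   /* Worked example: with srcOffset = (2, 3), a 256-byte stride and
    * bpp = 4, the copy starts at byte 3 * 256 + 2 * 4 = 776 of the
    * linear source plane.
    */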
1672 
1673    struct v3dv_buffer src_buffer;
1674    vk_object_base_init(&cmd_buffer->device->vk, &src_buffer.base,
1675                        VK_OBJECT_TYPE_BUFFER);
1676 
1677    const struct VkBufferCreateInfo buf_create_info = {
1678       .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
1679       .size = src->planes[src_plane].size,
1680       .usage = VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT,
1681       .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
1682    };
1683    v3dv_buffer_init(cmd_buffer->device, &buf_create_info, &src_buffer,
1684                     src->planes[src_plane].alignment);
1685 
1686    const VkBindBufferMemoryInfo buf_bind_info = {
1687       .sType = VK_STRUCTURE_TYPE_BIND_BUFFER_MEMORY_INFO,
1688       .buffer = v3dv_buffer_to_handle(&src_buffer),
1689       .memory = v3dv_device_memory_to_handle(src->planes[src_plane].mem),
1690       .memoryOffset = src->planes[src_plane].mem_offset +
1691          v3dv_layer_offset(src, 0, 0, src_plane),
1692    };
1693    v3dv_buffer_bind_memory(&buf_bind_info);
1694 
1695    const VkBufferImageCopy2 copy_region = {
1696       .sType = VK_STRUCTURE_TYPE_BUFFER_IMAGE_COPY_2,
1697       .pNext = NULL,
1698       .bufferOffset = buf_offset,
1699       .bufferRowLength = buf_stride / bpp,
1700       .bufferImageHeight = src->vk.extent.height,
1701       .imageSubresource = region->dstSubresource,
1702       .imageOffset = region->dstOffset,
1703       .imageExtent = region->extent,
1704    };
1705 
1706    return texel_buffer_shader_copy(cmd_buffer,
1707                                    region->dstSubresource.aspectMask,
1708                                    dst,
1709                                    format,
1710                                    format,
1711                                    &src_buffer,
1712                                    src->planes[src_plane].cpp,
1713                                    0 /* color mask: full */, &ident_swizzle,
1714                                    1, &copy_region);
1715 }
1716 
1717 VKAPI_ATTR void VKAPI_CALL
1718 v3dv_CmdCopyImage2(VkCommandBuffer commandBuffer,
1719                    const VkCopyImageInfo2 *info)
1721 {
1722    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
1723    V3DV_FROM_HANDLE(v3dv_image, src, info->srcImage);
1724    V3DV_FROM_HANDLE(v3dv_image, dst, info->dstImage);
1725 
1726    assert(src->vk.samples == dst->vk.samples);
1727 
1728    cmd_buffer->state.is_transfer = true;
1729 
1730    for (uint32_t i = 0; i < info->regionCount; i++) {
1731       const VkImageCopy2 *region = &info->pRegions[i];
1732       if (copy_image_tfu(cmd_buffer, dst, src, region))
1733          continue;
1734       if (copy_image_tlb(cmd_buffer, dst, src, region))
1735          continue;
1736       if (copy_image_blit(cmd_buffer, dst, src, region))
1737          continue;
1738       if (copy_image_linear_texel_buffer(cmd_buffer, dst, src, region))
1739          continue;
1740       unreachable("Image copy not supported");
1741    }
1742 
1743    cmd_buffer->state.is_transfer = false;
1744 }
1745 
1746 VKAPI_ATTR void VKAPI_CALL
1747 v3dv_CmdCopyBuffer2(VkCommandBuffer commandBuffer,
1748                     const VkCopyBufferInfo2 *pCopyBufferInfo)
1749 {
1750    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
1751    V3DV_FROM_HANDLE(v3dv_buffer, src_buffer, pCopyBufferInfo->srcBuffer);
1752    V3DV_FROM_HANDLE(v3dv_buffer, dst_buffer, pCopyBufferInfo->dstBuffer);
1753 
1754    cmd_buffer->state.is_transfer = true;
1755 
1756    for (uint32_t i = 0; i < pCopyBufferInfo->regionCount; i++) {
1757       v3dv_X(cmd_buffer->device, meta_copy_buffer)
1758          (cmd_buffer,
1759           dst_buffer->mem->bo, dst_buffer->mem_offset,
1760           src_buffer->mem->bo, src_buffer->mem_offset,
1761           &pCopyBufferInfo->pRegions[i]);
1762    }
1763 
1764    cmd_buffer->state.is_transfer = false;
1765 }
1766 
1767 static void
1768 destroy_update_buffer_cb(VkDevice _device,
1769                          uint64_t pobj,
1770                          VkAllocationCallbacks *alloc)
1771 {
1772    V3DV_FROM_HANDLE(v3dv_device, device, _device);
1773    struct v3dv_bo *bo = (struct v3dv_bo *)((uintptr_t) pobj);
1774    v3dv_bo_free(device, bo);
1775 }
1776 
1777 VKAPI_ATTR void VKAPI_CALL
1778 v3dv_CmdUpdateBuffer(VkCommandBuffer commandBuffer,
1779                      VkBuffer dstBuffer,
1780                      VkDeviceSize dstOffset,
1781                      VkDeviceSize dataSize,
1782                      const void *pData)
1783 {
1784    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
1785    V3DV_FROM_HANDLE(v3dv_buffer, dst_buffer, dstBuffer);
1786 
1787    struct v3dv_bo *src_bo =
1788       v3dv_bo_alloc(cmd_buffer->device, dataSize, "vkCmdUpdateBuffer", true);
1789    if (!src_bo) {
1790       fprintf(stderr, "Failed to allocate BO for vkCmdUpdateBuffer.\n");
1791       return;
1792    }
1793 
1794    bool ok = v3dv_bo_map(cmd_buffer->device, src_bo, src_bo->size);
1795    if (!ok) {
1796       fprintf(stderr, "Failed to map BO for vkCmdUpdateBuffer.\n");
1797       return;
1798    }
1799 
1800    cmd_buffer->state.is_transfer = true;
1801 
1802    memcpy(src_bo->map, pData, dataSize);
1803 
1804    v3dv_bo_unmap(cmd_buffer->device, src_bo);
1805 
1806    VkBufferCopy2 region = {
1807       .sType = VK_STRUCTURE_TYPE_BUFFER_COPY_2,
1808       .srcOffset = 0,
1809       .dstOffset = dstOffset,
1810       .size = dataSize,
1811    };
1812    struct v3dv_job *copy_job =
1813       v3dv_X(cmd_buffer->device, meta_copy_buffer)
1814       (cmd_buffer, dst_buffer->mem->bo, dst_buffer->mem_offset,
1815        src_bo, 0, &region);
1816 
1817    if (copy_job) {
1818       v3dv_cmd_buffer_add_private_obj(
1819          cmd_buffer, (uint64_t)(uintptr_t)src_bo, destroy_update_buffer_cb);
1820    }
1821 
1822    cmd_buffer->state.is_transfer = false;
1823 }
1824 
1825 VKAPI_ATTR void VKAPI_CALL
1826 v3dv_CmdFillBuffer(VkCommandBuffer commandBuffer,
1827                    VkBuffer dstBuffer,
1828                    VkDeviceSize dstOffset,
1829                    VkDeviceSize size,
1830                    uint32_t data)
1831 {
1832    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
1833    V3DV_FROM_HANDLE(v3dv_buffer, dst_buffer, dstBuffer);
1834 
1835    cmd_buffer->state.is_transfer = true;
1836 
1837    struct v3dv_bo *bo = dst_buffer->mem->bo;
1838 
1839    /* From the Vulkan spec:
1840     *
1841     *   "If VK_WHOLE_SIZE is used and the remaining size of the buffer is not
1842     *    a multiple of 4, then the nearest smaller multiple is used."
1843     */
1844    if (size == VK_WHOLE_SIZE) {
1845       size = dst_buffer->size - dstOffset;
1846       size -= size % 4;
1847    }
1848 
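   /* E.g. filling a 1000-byte buffer from dstOffset = 10 leaves 990
    * bytes, which rounds down to 988, the nearest smaller multiple of 4.
    */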
1849    v3dv_X(cmd_buffer->device, meta_fill_buffer)
1850       (cmd_buffer, bo, dstOffset, size, data);
1851 
1852    cmd_buffer->state.is_transfer = false;
1853 }
1854 
1855 /**
1856  * Returns true if the implementation supports the requested operation (even if
1857  * it failed to process it, for example, due to an out-of-memory error).
1858  */
1859 static bool
1860 copy_buffer_to_image_tfu(struct v3dv_cmd_buffer *cmd_buffer,
1861                          struct v3dv_image *image,
1862                          struct v3dv_buffer *buffer,
1863                          const VkBufferImageCopy2 *region)
1864 {
1865    if (V3D_DBG(DISABLE_TFU)) {
1866       perf_debug("Copy buffer to image: TFU disabled, fallbacks could be slower.\n");
1867       return false;
1868    }
1869 
1870    assert(image->vk.samples == VK_SAMPLE_COUNT_1_BIT);
1871 
1872    /* Destination can't be raster format */
1873    if (!image->tiled)
1874       return false;
1875 
1876    /* We can't copy D24S8 because buffer to image copies only copy one aspect
1877     * at a time, and the TFU copies full images. Also, the V3D depth bits
1878     * for both D24S8 and D24X8 are stored in the 24 MSBs of each 32-bit
1879     * word, but the Vulkan spec specifies the buffer data the other way
1880     * around, so it is not a straight copy: we would have to swizzle the
1881     * channels, which the TFU can't do.
1882     */
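   /* Illustratively, per the comment above and Vulkan's packing rules,
    * the mismatched 32-bit word layouts would be roughly:
    *
    *    V3D word:      depth in bits 31:8
    *    Vulkan buffer: depth in bits 23:0 (high bits unused/undefined)
    */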
1883    if (image->vk.format == VK_FORMAT_D24_UNORM_S8_UINT ||
1884        image->vk.format == VK_FORMAT_X8_D24_UNORM_PACK32) {
1885          return false;
1886    }
1887 
1888    /* Region must include full slice */
1889    const uint32_t offset_x = region->imageOffset.x;
1890    const uint32_t offset_y = region->imageOffset.y;
1891    if (offset_x != 0 || offset_y != 0)
1892       return false;
1893 
1894    uint32_t width, height;
1895    if (region->bufferRowLength == 0)
1896       width = region->imageExtent.width;
1897    else
1898       width = region->bufferRowLength;
1899 
1900    if (region->bufferImageHeight == 0)
1901       height = region->imageExtent.height;
1902    else
1903       height = region->bufferImageHeight;
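   /* Per the spec, zero means the buffer is tightly packed, so row length
    * and image height default to the image extent: e.g. a 100x60 copy
    * with both fields at zero reads rows of exactly 100 texels.
    */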
1904 
1905    const uint8_t plane =
1906       v3dv_plane_from_aspect(region->imageSubresource.aspectMask);
1907 
1908    const uint32_t mip_level = region->imageSubresource.mipLevel;
1909    const struct v3d_resource_slice *slice = &image->planes[plane].slices[mip_level];
1910 
1911    if (width != slice->width || height != slice->height)
1912       return false;
1913 
1914    /* Handle region semantics for compressed images */
1915    const uint32_t block_w =
1916       vk_format_get_blockwidth(image->planes[plane].vk_format);
1917    const uint32_t block_h =
1918       vk_format_get_blockheight(image->planes[plane].vk_format);
1919    width = DIV_ROUND_UP(width, block_w);
1920    height = DIV_ROUND_UP(height, block_h);
1921 
1922    /* Format must be supported for texturing via the TFU. Since we are just
1923     * copying raw data and not converting between pixel formats, we can ignore
1924     * the image's format and choose a compatible TFU format for the image
1925     * texel size instead, which expands the list of formats we can handle here.
1926     */
1927    const struct v3dv_format *format =
1928       v3dv_get_compatible_tfu_format(cmd_buffer->device,
1929                                      image->planes[plane].cpp, NULL);
1930    /* We only use single-plane formats with the TFU */
1931    assert(format->plane_count == 1);
1932    const struct v3dv_format_plane *format_plane = &format->planes[0];
1933 
1934    uint32_t num_layers;
1935    if (image->vk.image_type != VK_IMAGE_TYPE_3D) {
1936       num_layers = vk_image_subresource_layer_count(&image->vk,
1937                                                     &region->imageSubresource);
1938    } else {
1939       num_layers = region->imageExtent.depth;
1940    }
1941    assert(num_layers > 0);
1942 
1943    assert(image->planes[plane].mem && image->planes[plane].mem->bo);
1944    const struct v3dv_bo *dst_bo = image->planes[plane].mem->bo;
1945 
1946    assert(buffer->mem && buffer->mem->bo);
1947    const struct v3dv_bo *src_bo = buffer->mem->bo;
1948 
1949    /* Emit a TFU job per layer to copy */
1950    const uint32_t buffer_stride = width * image->planes[plane].cpp;
1951    for (int i = 0; i < num_layers; i++) {
1952       uint32_t layer;
1953       if (image->vk.image_type != VK_IMAGE_TYPE_3D)
1954          layer = region->imageSubresource.baseArrayLayer + i;
1955       else
1956          layer = region->imageOffset.z + i;
1957 
1958       const uint32_t buffer_offset =
1959          buffer->mem_offset + region->bufferOffset +
1960          height * buffer_stride * i;
1961       const uint32_t src_offset = src_bo->offset + buffer_offset;
1962 
1963       const uint32_t dst_offset =
1964          dst_bo->offset + v3dv_layer_offset(image, mip_level, layer, plane);
1965 
1966       v3dv_X(cmd_buffer->device, meta_emit_tfu_job)(
1967              cmd_buffer,
1968              dst_bo->handle,
1969              dst_offset,
1970              slice->tiling,
1971              slice->padded_height,
1972              image->planes[plane].cpp,
1973              src_bo->handle,
1974              src_offset,
1975              V3D_TILING_RASTER,
1976              width,
1977              1,
1978              width, height, format_plane);
1979    }
1980 
1981    return true;
1982 }
1983 
1984 /**
1985  * Returns true if the implementation supports the requested operation (even if
1986  * it failed to process it, for example, due to an out-of-memory error).
1987  */
1988 static bool
1989 copy_buffer_to_image_tlb(struct v3dv_cmd_buffer *cmd_buffer,
1990                          struct v3dv_image *image,
1991                          struct v3dv_buffer *buffer,
1992                          const VkBufferImageCopy2 *region)
1993 {
1994    VkFormat fb_format;
1995    uint8_t plane = v3dv_plane_from_aspect(region->imageSubresource.aspectMask);
1996    assert(plane < image->plane_count);
1997 
1998    if (!v3dv_meta_can_use_tlb(image, plane, region->imageSubresource.mipLevel,
1999                               &region->imageOffset, &region->imageExtent,
2000                               &fb_format)) {
2001       return false;
2002    }
2003 
2004    /* From the Vulkan spec for VkBufferImageCopy2:
2005     *
2006     *   "The aspectMask member of imageSubresource must only have a
2007     *    single bit set."
2008     *
2009     * For us this has relevant implications because we can't do TLB stores
2010     * of linear depth/stencil so we work around this by loading D/S data to the
2011     * color tile buffer using a compatible color format (see
2012     * emit_copy_buffer_to_layer_per_tile_list and choose_tlb_format functions).
2013     * However, when we are copying a single aspect to a combined D/S image
2014     * we need to preserve the other aspect, and for that we will still use the
2015     * D/S tile buffer to load and store the aspect of the image we need to
2016     * preserve, so in this case we are still constrained by the hw restriction
2017     * for linear D/S stores.
2018     */
2019    assert(util_bitcount(region->imageSubresource.aspectMask) == 1);
2020    if (!image->tiled &&
2021        vk_format_has_depth(fb_format) &&
2022        vk_format_has_stencil(fb_format)) {
2023       return false;
2024    }
2025 
2026    uint32_t internal_type, internal_bpp;
2027    v3dv_X(cmd_buffer->device, get_internal_type_bpp_for_image_aspects)
2028       (fb_format, region->imageSubresource.aspectMask,
2029        &internal_type, &internal_bpp);
2030 
2031    uint32_t num_layers;
2032    if (image->vk.image_type != VK_IMAGE_TYPE_3D) {
2033       num_layers = vk_image_subresource_layer_count(&image->vk,
2034                                                     &region->imageSubresource);
2035    } else {
2036       num_layers = region->imageExtent.depth;
2037    }
2038    assert(num_layers > 0);
2039 
2040    struct v3dv_job *job =
2041       v3dv_cmd_buffer_start_job(cmd_buffer, -1, V3DV_JOB_TYPE_GPU_CL);
2042    if (!job)
2043       return true;
2044 
2045    /* Handle copy to compressed format using a compatible format */
2046    const uint32_t block_w =
2047       vk_format_get_blockwidth(image->planes[plane].vk_format);
2048    const uint32_t block_h =
2049       vk_format_get_blockheight(image->planes[plane].vk_format);
2050    const uint32_t width = DIV_ROUND_UP(region->imageExtent.width, block_w);
2051    const uint32_t height = DIV_ROUND_UP(region->imageExtent.height, block_h);
2052 
2053    v3dv_job_start_frame(job, width, height, num_layers, false, true, 1,
2054                         internal_bpp, 4 * v3d_internal_bpp_words(internal_bpp),
2055                         false);
2056 
2057    struct v3dv_meta_framebuffer framebuffer;
2058    v3dv_X(job->device, meta_framebuffer_init)(&framebuffer, fb_format,
2059                                               internal_type, &job->frame_tiling);
2060 
2061    v3dv_X(job->device, job_emit_binning_flush)(job);
2062    v3dv_X(job->device, meta_emit_copy_buffer_to_image_rcl)
2063       (job, image, buffer, &framebuffer, region);
2064 
2065    v3dv_cmd_buffer_finish_job(cmd_buffer);
2066 
2067    return true;
2068 }
2069 
2070 static bool
2071 create_tiled_image_from_buffer(struct v3dv_cmd_buffer *cmd_buffer,
2072                                struct v3dv_image *image,
2073                                struct v3dv_buffer *buffer,
2074                                const VkBufferImageCopy2 *region)
2075 {
2076    if (copy_buffer_to_image_tfu(cmd_buffer, image, buffer, region))
2077       return true;
2078    if (copy_buffer_to_image_tlb(cmd_buffer, image, buffer, region))
2079       return true;
2080    return false;
2081 }
2082 
2083 static VkResult
2084 create_texel_buffer_copy_descriptor_pool(struct v3dv_cmd_buffer *cmd_buffer)
2085 {
2086    /* If this is not the first pool we create for this command buffer,
2087     * size it based on the size of the currently exhausted pool.
2088     */
2089    uint32_t descriptor_count = 64;
2090    if (cmd_buffer->meta.texel_buffer_copy.dspool != VK_NULL_HANDLE) {
2091       struct v3dv_descriptor_pool *exhausted_pool =
2092          v3dv_descriptor_pool_from_handle(cmd_buffer->meta.texel_buffer_copy.dspool);
2093       descriptor_count = MIN2(exhausted_pool->max_entry_count * 2, 1024);
2094    }
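   /* The pool thus grows geometrically: 64 sets, then 128, 256, 512, and
    * a 1024-set ceiling for every pool created after that.
    */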
2095 
2096    /* Create the descriptor pool */
2097    cmd_buffer->meta.texel_buffer_copy.dspool = VK_NULL_HANDLE;
2098    VkDescriptorPoolSize pool_size = {
2099       .type = VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER,
2100       .descriptorCount = descriptor_count,
2101    };
2102    VkDescriptorPoolCreateInfo info = {
2103       .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO,
2104       .maxSets = descriptor_count,
2105       .poolSizeCount = 1,
2106       .pPoolSizes = &pool_size,
2107       .flags = 0,
2108    };
2109    VkResult result =
2110       v3dv_CreateDescriptorPool(v3dv_device_to_handle(cmd_buffer->device),
2111                                 &info,
2112                                 &cmd_buffer->device->vk.alloc,
2113                                 &cmd_buffer->meta.texel_buffer_copy.dspool);
2114 
2115    if (result == VK_SUCCESS) {
2116       assert(cmd_buffer->meta.texel_buffer_copy.dspool != VK_NULL_HANDLE);
2117       const VkDescriptorPool _pool = cmd_buffer->meta.texel_buffer_copy.dspool;
2118 
2119       v3dv_cmd_buffer_add_private_obj(
2120          cmd_buffer, (uintptr_t) _pool,
2121          (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyDescriptorPool);
2122 
2123       struct v3dv_descriptor_pool *pool =
2124          v3dv_descriptor_pool_from_handle(_pool);
2125       pool->is_driver_internal = true;
2126    }
2127 
2128    return result;
2129 }
2130 
2131 static VkResult
2132 allocate_texel_buffer_copy_descriptor_set(struct v3dv_cmd_buffer *cmd_buffer,
2133                                           VkDescriptorSet *set)
2134 {
2135    /* Make sure we have a descriptor pool */
2136    VkResult result;
2137    if (cmd_buffer->meta.texel_buffer_copy.dspool == VK_NULL_HANDLE) {
2138       result = create_texel_buffer_copy_descriptor_pool(cmd_buffer);
2139       if (result != VK_SUCCESS)
2140          return result;
2141    }
2142    assert(cmd_buffer->meta.texel_buffer_copy.dspool != VK_NULL_HANDLE);
2143 
2144    /* Allocate descriptor set */
2145    struct v3dv_device *device = cmd_buffer->device;
2146    VkDevice _device = v3dv_device_to_handle(device);
2147    VkDescriptorSetAllocateInfo info = {
2148       .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO,
2149       .descriptorPool = cmd_buffer->meta.texel_buffer_copy.dspool,
2150       .descriptorSetCount = 1,
2151       .pSetLayouts = &device->meta.texel_buffer_copy.ds_layout,
2152    };
2153    result = v3dv_AllocateDescriptorSets(_device, &info, set);
2154 
2155    /* If we ran out of pool space, grow the pool and try again */
2156    if (result == VK_ERROR_OUT_OF_POOL_MEMORY) {
2157       result = create_texel_buffer_copy_descriptor_pool(cmd_buffer);
2158       if (result == VK_SUCCESS) {
2159          info.descriptorPool = cmd_buffer->meta.texel_buffer_copy.dspool;
2160          result = v3dv_AllocateDescriptorSets(_device, &info, set);
2161       }
2162    }
2163 
2164    return result;
2165 }
2166 
2167 static void
2168 get_texel_buffer_copy_pipeline_cache_key(VkFormat format,
2169                                          VkColorComponentFlags cmask,
2170                                          VkComponentMapping *cswizzle,
2171                                          bool is_layered,
2172                                          uint8_t *key)
2173 {
2174    memset(key, 0, V3DV_META_TEXEL_BUFFER_COPY_CACHE_KEY_SIZE);
2175 
2176    uint32_t *p = (uint32_t *) key;
2177 
2178    *p = format;
2179    p++;
2180 
2181    *p = cmask;
2182    p++;
2183 
2184    /* Note that we are using a single byte for this, so we could pack
2185     * more data into this 32-bit slot in the future.
2186     */
2187    *p = is_layered ? 1 : 0;
2188    p++;
2189 
2190    memcpy(p, cswizzle, sizeof(VkComponentMapping));
2191    p += sizeof(VkComponentMapping) / sizeof(uint32_t);
2192 
2193    assert(((uint8_t*)p - key) == V3DV_META_TEXEL_BUFFER_COPY_CACHE_KEY_SIZE);
2194 }
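/* The resulting key layout, assuming 4-byte VkFormat and
 * VkColorComponentFlags values and a 16-byte VkComponentMapping:
 *
 *    bytes  0-3   format
 *    bytes  4-7   color write mask
 *    bytes  8-11  layered flag (0 or 1)
 *    bytes 12-27  component swizzle
 *
 * which adds up to the 28-byte V3DV_META_TEXEL_BUFFER_COPY_CACHE_KEY_SIZE
 * checked by the assert above.
 */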
2195 
2196 static bool
2197 create_blit_render_pass(struct v3dv_device *device,
2198                         VkFormat dst_format,
2199                         VkFormat src_format,
2200                         VkRenderPass *pass_load,
2201                         VkRenderPass *pass_no_load);
2202 
2203 static bool
2204 create_pipeline(struct v3dv_device *device,
2205                 struct v3dv_render_pass *pass,
2206                 struct nir_shader *vs_nir,
2207                 struct nir_shader *gs_nir,
2208                 struct nir_shader *fs_nir,
2209                 const VkPipelineVertexInputStateCreateInfo *vi_state,
2210                 const VkPipelineDepthStencilStateCreateInfo *ds_state,
2211                 const VkPipelineColorBlendStateCreateInfo *cb_state,
2212                 const VkPipelineMultisampleStateCreateInfo *ms_state,
2213                 const VkPipelineLayout layout,
2214                 VkPipeline *pipeline);
2215 
2216 static nir_shader *
2217 get_texel_buffer_copy_vs(const nir_shader_compiler_options *options)
2218 {
2219    nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_VERTEX, options,
2220                                                   "meta texel buffer copy vs");
2221    nir_variable *vs_out_pos =
2222       nir_variable_create(b.shader, nir_var_shader_out,
2223                           glsl_vec4_type(), "gl_Position");
2224    vs_out_pos->data.location = VARYING_SLOT_POS;
2225 
2226    nir_def *pos = nir_gen_rect_vertices(&b, NULL, NULL);
2227    nir_store_var(&b, vs_out_pos, pos, 0xf);
2228 
2229    return b.shader;
2230 }
2231 
2232 static nir_shader *
2233 get_texel_buffer_copy_gs(const nir_shader_compiler_options *options)
2234 {
2235    /* FIXME: this creates a geometry shader that takes the index of a single
2236     * layer to copy from push constants, so we need to emit a draw call for
2237     * each layer that we want to copy. We could actually do better and have it
2238     * take a range of layers however, if we were to do this, we would need to
2239     * be careful not to exceed the maximum number of output vertices allowed in
2240     * a geometry shader.
2241     */
2242    nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_GEOMETRY, options,
2243                                                   "meta texel buffer copy gs");
2244    nir_shader *nir = b.shader;
2245    nir->info.inputs_read = 1ull << VARYING_SLOT_POS;
2246    nir->info.outputs_written = (1ull << VARYING_SLOT_POS) |
2247                                (1ull << VARYING_SLOT_LAYER);
2248    nir->info.gs.input_primitive = MESA_PRIM_TRIANGLES;
2249    nir->info.gs.output_primitive = MESA_PRIM_TRIANGLE_STRIP;
2250    nir->info.gs.vertices_in = 3;
2251    nir->info.gs.vertices_out = 3;
2252    nir->info.gs.invocations = 1;
2253    nir->info.gs.active_stream_mask = 0x1;
2254 
2255    /* in vec4 gl_Position[3] */
2256    nir_variable *gs_in_pos =
2257       nir_variable_create(b.shader, nir_var_shader_in,
2258                           glsl_array_type(glsl_vec4_type(), 3, 0),
2259                           "in_gl_Position");
2260    gs_in_pos->data.location = VARYING_SLOT_POS;
2261 
2262    /* out vec4 gl_Position */
2263    nir_variable *gs_out_pos =
2264       nir_variable_create(b.shader, nir_var_shader_out, glsl_vec4_type(),
2265                           "out_gl_Position");
2266    gs_out_pos->data.location = VARYING_SLOT_POS;
2267 
2268    /* out float gl_Layer */
2269    nir_variable *gs_out_layer =
2270       nir_variable_create(b.shader, nir_var_shader_out, glsl_float_type(),
2271                           "out_gl_Layer");
2272    gs_out_layer->data.location = VARYING_SLOT_LAYER;
2273 
2274    /* Emit output triangle */
2275    for (uint32_t i = 0; i < 3; i++) {
2276       /* gl_Position from shader input */
2277       nir_deref_instr *in_pos_i =
2278          nir_build_deref_array_imm(&b, nir_build_deref_var(&b, gs_in_pos), i);
2279       nir_copy_deref(&b, nir_build_deref_var(&b, gs_out_pos), in_pos_i);
2280 
2281       /* gl_Layer from push constants */
2282       nir_def *layer =
2283          nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0),
2284                                 .base = TEXEL_BUFFER_COPY_GS_LAYER_PC_OFFSET,
2285                                 .range = 4);
2286       nir_store_var(&b, gs_out_layer, layer, 0x1);
2287 
2288       nir_emit_vertex(&b, 0);
2289    }
2290 
2291    nir_end_primitive(&b, 0);
2292 
2293    return nir;
2294 }
2295 
2296 static nir_def *
2297 load_frag_coord(nir_builder *b)
2298 {
2299    nir_foreach_shader_in_variable(var, b->shader) {
2300       if (var->data.location == VARYING_SLOT_POS)
2301          return nir_load_var(b, var);
2302    }
2303    nir_variable *pos = nir_variable_create(b->shader, nir_var_shader_in,
2304                                            glsl_vec4_type(), NULL);
2305    pos->data.location = VARYING_SLOT_POS;
2306    return nir_load_var(b, pos);
2307 }
2308 
2309 static uint32_t
2310 component_swizzle_to_nir_swizzle(VkComponentSwizzle comp, VkComponentSwizzle swz)
2311 {
2312    if (swz == VK_COMPONENT_SWIZZLE_IDENTITY)
2313       swz = comp;
2314 
2315    switch (swz) {
2316    case VK_COMPONENT_SWIZZLE_R:
2317       return 0;
2318    case VK_COMPONENT_SWIZZLE_G:
2319       return 1;
2320    case VK_COMPONENT_SWIZZLE_B:
2321       return 2;
2322    case VK_COMPONENT_SWIZZLE_A:
2323       return 3;
2324    default:
2325       unreachable("Invalid swizzle");
2326    };
2327 }
2328 
2329 static nir_shader *
2330 get_texel_buffer_copy_fs(const nir_shader_compiler_options *options,
2331                          VkFormat format, VkComponentMapping *cswizzle)
2332 {
2333    nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_FRAGMENT, options,
2334                                                   "meta texel buffer copy fs");
2335 
2336    /* We only use the copy from texel buffer shader to implement
2337     * copy_buffer_to_image_shader, which always selects a compatible integer
2338     * format for the copy.
2339     */
2340    assert(vk_format_is_int(format));
2341 
2342    /* Fragment shader output color */
2343    nir_variable *fs_out_color =
2344       nir_variable_create(b.shader, nir_var_shader_out,
2345                           glsl_uvec4_type(), "out_color");
2346    fs_out_color->data.location = FRAG_RESULT_DATA0;
2347 
2348    /* Texel buffer input */
2349    const struct glsl_type *sampler_type =
2350       glsl_sampler_type(GLSL_SAMPLER_DIM_BUF, false, false, GLSL_TYPE_UINT);
2351    nir_variable *sampler =
2352       nir_variable_create(b.shader, nir_var_uniform, sampler_type, "texel_buf");
2353    sampler->data.descriptor_set = 0;
2354    sampler->data.binding = 0;
2355 
2356    /* Load the box describing the pixel region we want to copy from the
2357     * texel buffer.
2358     */
2359    nir_def *box =
2360       nir_load_push_constant(&b, 4, 32, nir_imm_int(&b, 0),
2361                              .base = TEXEL_BUFFER_COPY_FS_BOX_PC_OFFSET,
2362                              .range = 16);
2363 
2364    /* Load the buffer stride (this comes in texel units) */
2365    nir_def *stride =
2366       nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0),
2367                              .base = TEXEL_BUFFER_COPY_FS_STRIDE_PC_OFFSET,
2368                              .range = 4);
2369 
2370    /* Load the buffer offset (this comes in texel units) */
2371    nir_def *offset =
2372       nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0),
2373                              .base = TEXEL_BUFFER_COPY_FS_OFFSET_PC_OFFSET,
2374                              .range = 4);
2375 
2376    nir_def *coord = nir_f2i32(&b, load_frag_coord(&b));
2377 
2378    /* Load pixel data from texel buffer based on the x,y offset of the pixel
2379     * within the box. Texel buffers are 1D arrays of texels.
2380     *
2381     * Notice that we already make sure that we only generate fragments that are
2382     * inside the box through the scissor/viewport state, so our offset into the
2383     * texel buffer should always be within its bounds and we don't need
2384     * to add a check for that here.
2385     */
2386    nir_def *x_offset =
2387       nir_isub(&b, nir_channel(&b, coord, 0),
2388                    nir_channel(&b, box, 0));
2389    nir_def *y_offset =
2390       nir_isub(&b, nir_channel(&b, coord, 1),
2391                    nir_channel(&b, box, 1));
2392    nir_def *texel_offset =
2393       nir_iadd(&b, nir_iadd(&b, offset, x_offset),
2394                    nir_imul(&b, y_offset, stride));
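   /* Worked example: a fragment at (13, 7) in a box with origin (8, 4),
    * with offset = 100 texels and stride = 64 texels, fetches texel
    * 100 + (13 - 8) + (7 - 4) * 64 = 297.
    */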
2395 
2396    nir_def *tex_deref = &nir_build_deref_var(&b, sampler)->def;
2397    nir_tex_instr *tex = nir_tex_instr_create(b.shader, 2);
2398    tex->sampler_dim = GLSL_SAMPLER_DIM_BUF;
2399    tex->op = nir_texop_txf;
2400    tex->src[0] = nir_tex_src_for_ssa(nir_tex_src_coord, texel_offset);
2401    tex->src[1] = nir_tex_src_for_ssa(nir_tex_src_texture_deref, tex_deref);
2402    tex->dest_type = nir_type_uint32;
2403    tex->is_array = false;
2404    tex->coord_components = 1;
2405    nir_def_init(&tex->instr, &tex->def, 4, 32);
2406    nir_builder_instr_insert(&b, &tex->instr);
2407 
2408    uint32_t swiz[4];
2409    swiz[0] =
2410       component_swizzle_to_nir_swizzle(VK_COMPONENT_SWIZZLE_R, cswizzle->r);
2411    swiz[1] =
2412       component_swizzle_to_nir_swizzle(VK_COMPONENT_SWIZZLE_G, cswizzle->g);
2413    swiz[2] =
2414       component_swizzle_to_nir_swizzle(VK_COMPONENT_SWIZZLE_B, cswizzle->b);
2415    swiz[3] =
2416       component_swizzle_to_nir_swizzle(VK_COMPONENT_SWIZZLE_A, cswizzle->a);
2417    nir_def *s = nir_swizzle(&b, &tex->def, swiz, 4);
2418    nir_store_var(&b, fs_out_color, s, 0xf);
2419 
2420    return b.shader;
2421 }
2422 
2423 static bool
2424 create_texel_buffer_copy_pipeline(struct v3dv_device *device,
2425                                   VkFormat format,
2426                                   VkColorComponentFlags cmask,
2427                                   VkComponentMapping *cswizzle,
2428                                   bool is_layered,
2429                                   VkRenderPass _pass,
2430                                   VkPipelineLayout pipeline_layout,
2431                                   VkPipeline *pipeline)
2432 {
2433    struct v3dv_render_pass *pass = v3dv_render_pass_from_handle(_pass);
2434 
2435    assert(vk_format_is_color(format));
2436 
2437    const nir_shader_compiler_options *options =
2438       v3dv_pipeline_get_nir_options(&device->devinfo);
2439 
2440    nir_shader *vs_nir = get_texel_buffer_copy_vs(options);
2441    nir_shader *fs_nir = get_texel_buffer_copy_fs(options, format, cswizzle);
2442    nir_shader *gs_nir = is_layered ? get_texel_buffer_copy_gs(options) : NULL;
2443 
2444    const VkPipelineVertexInputStateCreateInfo vi_state = {
2445       .sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO,
2446       .vertexBindingDescriptionCount = 0,
2447       .vertexAttributeDescriptionCount = 0,
2448    };
2449 
2450    VkPipelineDepthStencilStateCreateInfo ds_state = {
2451       .sType = VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO,
2452    };
2453 
2454    VkPipelineColorBlendAttachmentState blend_att_state[1] = { 0 };
2455    blend_att_state[0] = (VkPipelineColorBlendAttachmentState) {
2456       .blendEnable = false,
2457       .colorWriteMask = cmask,
2458    };
2459 
2460    const VkPipelineColorBlendStateCreateInfo cb_state = {
2461       .sType = VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO,
2462       .logicOpEnable = false,
2463       .attachmentCount = 1,
2464       .pAttachments = blend_att_state
2465    };
2466 
2467    const VkPipelineMultisampleStateCreateInfo ms_state = {
2468       .sType = VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO,
2469       .rasterizationSamples = VK_SAMPLE_COUNT_1_BIT,
2470       .sampleShadingEnable = false,
2471       .pSampleMask = NULL,
2472       .alphaToCoverageEnable = false,
2473       .alphaToOneEnable = false,
2474    };
2475 
2476    return create_pipeline(device,
2477                           pass,
2478                           vs_nir, gs_nir, fs_nir,
2479                           &vi_state,
2480                           &ds_state,
2481                           &cb_state,
2482                           &ms_state,
2483                           pipeline_layout,
2484                           pipeline);
2485 }
2486 
2487 static bool
2488 get_copy_texel_buffer_pipeline(
2489    struct v3dv_cmd_buffer *cmd_buffer,
2490    VkFormat format,
2491    VkColorComponentFlags cmask,
2492    VkComponentMapping *cswizzle,
2493    VkImageType image_type,
2494    bool is_layered,
2495    struct v3dv_meta_texel_buffer_copy_pipeline **pipeline)
2496 {
2497    bool ok = true;
2498    struct v3dv_device *device = cmd_buffer->device;
2499 
2500    uint8_t key[V3DV_META_TEXEL_BUFFER_COPY_CACHE_KEY_SIZE];
2501    if (device->instance->meta_cache_enabled) {
2502       get_texel_buffer_copy_pipeline_cache_key(format, cmask, cswizzle, is_layered,
2503                                                key);
2504 
2505       mtx_lock(&device->meta.mtx);
2506       struct hash_entry *entry =
2507          _mesa_hash_table_search(device->meta.texel_buffer_copy.cache[image_type],
2508                                  key);
2509       if (entry) {
2510          mtx_unlock(&device->meta.mtx);
2511          *pipeline = entry->data;
2512          return true;
2513       }
2514    }
2515 
2516    *pipeline = vk_zalloc2(&device->vk.alloc, NULL, sizeof(**pipeline), 8,
2517                           VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
2518 
2519    if (*pipeline == NULL)
2520       goto fail;
2521 
2522    /* The blit render pass is compatible */
2523    ok = create_blit_render_pass(device, format, format,
2524                                 &(*pipeline)->pass,
2525                                 &(*pipeline)->pass_no_load);
2526    if (!ok)
2527       goto fail;
2528 
2529    ok =
2530       create_texel_buffer_copy_pipeline(device,
2531                                         format, cmask, cswizzle, is_layered,
2532                                         (*pipeline)->pass,
2533                                         device->meta.texel_buffer_copy.p_layout,
2534                                         &(*pipeline)->pipeline);
2535    if (!ok)
2536       goto fail;
2537 
2538    if (device->instance->meta_cache_enabled) {
2539       _mesa_hash_table_insert(device->meta.texel_buffer_copy.cache[image_type],
2540                               key, *pipeline);
2541       mtx_unlock(&device->meta.mtx);
2542    } else {
2543       v3dv_cmd_buffer_add_private_obj(
2544          cmd_buffer, (uintptr_t)*pipeline,
2545          (v3dv_cmd_buffer_private_obj_destroy_cb)destroy_meta_texel_buffer_copy_pipeline);
2546    }
2547 
2548    return true;
2549 
2550 fail:
2551    if (device->instance->meta_cache_enabled)
2552       mtx_unlock(&device->meta.mtx);
2553 
2554    VkDevice _device = v3dv_device_to_handle(device);
2555    if (*pipeline) {
2556       if ((*pipeline)->pass)
2557          v3dv_DestroyRenderPass(_device, (*pipeline)->pass, &device->vk.alloc);
2558       if ((*pipeline)->pipeline)
2559          v3dv_DestroyPipeline(_device, (*pipeline)->pipeline, &device->vk.alloc);
2560       vk_free(&device->vk.alloc, *pipeline);
2561       *pipeline = NULL;
2562    }
2563 
2564    return false;
2565 }
2566 
2567 static bool
2568 texel_buffer_shader_copy(struct v3dv_cmd_buffer *cmd_buffer,
2569                          VkImageAspectFlags aspect,
2570                          struct v3dv_image *image,
2571                          VkFormat dst_format,
2572                          VkFormat src_format,
2573                          struct v3dv_buffer *buffer,
2574                          uint32_t buffer_bpp,
2575                          VkColorComponentFlags cmask,
2576                          VkComponentMapping *cswizzle,
2577                          uint32_t region_count,
2578                          const VkBufferImageCopy2 *regions)
2579 {
2580    VkResult result;
2581    bool handled = false;
2582 
2583    assert(cswizzle);
2584 
2585    /* This is a copy path, so we don't handle format conversions. The only
2586     * exception are stencil to D24S8 copies, which are handled as a color
2587     * masked R8->RGBA8 copy.
2588     */
2589    assert(src_format == dst_format ||
2590           (dst_format == VK_FORMAT_R8G8B8A8_UINT &&
2591            src_format == VK_FORMAT_R8_UINT &&
2592            cmask == VK_COLOR_COMPONENT_R_BIT));
2593 
2594    /* We only handle color copies. Callers can copy D/S aspects by using
2595     * a compatible color format and maybe a cmask/cswizzle for D24 formats.
2596     */
2597    if (!vk_format_is_color(dst_format) || !vk_format_is_color(src_format))
2598       return handled;
2599 
2600    /* FIXME: we only handle uncompressed images for now. */
2601    if (vk_format_is_compressed(image->vk.format))
2602       return handled;
2603 
2604    const VkColorComponentFlags full_cmask = VK_COLOR_COMPONENT_R_BIT |
2605                                             VK_COLOR_COMPONENT_G_BIT |
2606                                             VK_COLOR_COMPONENT_B_BIT |
2607                                             VK_COLOR_COMPONENT_A_BIT;
2608    if (cmask == 0)
2609       cmask = full_cmask;
2610 
2611    /* The buffer needs to have VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT
2612     * so we can bind it as a texel buffer. Otherwise, the buffer view
2613     * we create below won't set up the texture state that we need for this.
2614     */
2615    if (!(buffer->usage & VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT)) {
2616       if (v3dv_buffer_format_supports_features(
2617              cmd_buffer->device, src_format,
2618              VK_FORMAT_FEATURE_2_UNIFORM_TEXEL_BUFFER_BIT)) {
2619          buffer->usage |= VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT;
2620       } else {
2621          return handled;
2622       }
2623    }
2624 
2625    /* At this point we should be able to handle the copy unless an unexpected
2626     * error occurs, such as an OOM.
2627     */
2628    handled = true;
2629 
2631    /* Compute the number of layers to copy.
2632     *
2633     * If we are batching (region_count > 1) all our regions have the same
2634     * image subresource so we can take this from the first region. For 3D
2635     * images we require the same depth extent.
2636     */
2637    const VkImageSubresourceLayers *resource = &regions[0].imageSubresource;
2638    uint32_t num_layers;
2639    if (image->vk.image_type != VK_IMAGE_TYPE_3D) {
2640       num_layers = vk_image_subresource_layer_count(&image->vk, resource);
2641    } else {
2642       assert(region_count == 1);
2643       num_layers = regions[0].imageExtent.depth;
2644    }
2645    assert(num_layers > 0);
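   /* For a 3D image each depth slice maps to one framebuffer layer below,
    * e.g. a region with imageExtent.depth == 4 renders through a 4-layer
    * framebuffer, with one render pass per slice.
    */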
2646 
2647    /* Get the texel buffer copy pipeline */
2648    struct v3dv_meta_texel_buffer_copy_pipeline *pipeline = NULL;
2649    bool ok = get_copy_texel_buffer_pipeline(cmd_buffer,
2650                                             dst_format, cmask, cswizzle,
2651                                             image->vk.image_type, num_layers > 1,
2652                                             &pipeline);
2653    if (!ok)
2654       return handled;
2655    assert(pipeline && pipeline->pipeline && pipeline->pass);
2656 
2657    /* Setup descriptor set for the source texel buffer. We don't have to
2658     * register the descriptor as a private command buffer object since
2659     * all descriptors will be freed automatically with the descriptor
2660     * pool.
2661     */
2662    VkDescriptorSet set;
2663    result = allocate_texel_buffer_copy_descriptor_set(cmd_buffer, &set);
2664    if (result != VK_SUCCESS)
2665       return handled;
2666 
2667    /* We can't pass region->bufferOffset here for the offset field because
2668     * the texture base pointer in the texture shader state must be a 64-byte
2669     * aligned value. Instead, we use 0 here and we pass the offset in texels
2670     * as a push constant to the shader.
2671     */
2672    VkDevice _device = v3dv_device_to_handle(cmd_buffer->device);
2673    VkBufferViewCreateInfo buffer_view_info = {
2674       .sType = VK_STRUCTURE_TYPE_BUFFER_VIEW_CREATE_INFO,
2675       .buffer = v3dv_buffer_to_handle(buffer),
2676       .format = src_format,
2677       .offset = 0,
2678       .range = VK_WHOLE_SIZE,
2679    };
2680 
2681    VkBufferView texel_buffer_view;
2682    result = v3dv_CreateBufferView(_device, &buffer_view_info,
2683                                   &cmd_buffer->device->vk.alloc,
2684                                   &texel_buffer_view);
2685    if (result != VK_SUCCESS)
2686       return handled;
2687 
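   /* The buffer view must stay alive until the command buffer completes
    * execution, so its destruction is deferred through the command buffer's
    * private object list below instead of happening when we return.
    */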
2688    v3dv_cmd_buffer_add_private_obj(
2689       cmd_buffer, (uintptr_t)texel_buffer_view,
2690       (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyBufferView);
2691 
2692    VkWriteDescriptorSet write = {
2693       .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
2694       .dstSet = set,
2695       .dstBinding = 0,
2696       .dstArrayElement = 0,
2697       .descriptorCount = 1,
2698       .descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER,
2699       .pTexelBufferView = &texel_buffer_view,
2700    };
2701    v3dv_UpdateDescriptorSets(_device, 1, &write, 0, NULL);
2702 
2703    /* Push command buffer state before starting meta operation */
2704    v3dv_cmd_buffer_meta_state_push(cmd_buffer, true);
2705 
2706    /* Bind common state for all layers and regions  */
2707    VkCommandBuffer _cmd_buffer = v3dv_cmd_buffer_to_handle(cmd_buffer);
2708    v3dv_CmdBindPipeline(_cmd_buffer,
2709                         VK_PIPELINE_BIND_POINT_GRAPHICS,
2710                         pipeline->pipeline);
2711 
2712    v3dv_CmdBindDescriptorSets(_cmd_buffer,
2713                               VK_PIPELINE_BIND_POINT_GRAPHICS,
2714                               cmd_buffer->device->meta.texel_buffer_copy.p_layout,
2715                               0, 1, &set,
2716                               0, NULL);
2717 
2718    /* Setup framebuffer.
2719     *
2720     * For 3D images, this creates a layered framebuffer with a number of
2721     * layers matching the depth extent of the 3D image.
2722     */
2723    uint8_t plane = v3dv_plane_from_aspect(aspect);
2724    uint32_t fb_width = u_minify(image->planes[plane].width, resource->mipLevel);
2725    uint32_t fb_height = u_minify(image->planes[plane].height, resource->mipLevel);
2726 
2727    VkImageViewCreateInfo image_view_info = {
2728       .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO,
2729       .image = v3dv_image_to_handle(image),
2730       .viewType = v3dv_image_type_to_view_type(image->vk.image_type),
2731       .format = dst_format,
2732       .subresourceRange = {
2733          .aspectMask = aspect,
2734          .baseMipLevel = resource->mipLevel,
2735          .levelCount = 1,
2736          .baseArrayLayer = resource->baseArrayLayer,
2737          .layerCount = num_layers,
2738       },
2739    };
2740    VkImageView image_view;
2741    result = v3dv_create_image_view(cmd_buffer->device,
2742                                    &image_view_info, &image_view);
2743    if (result != VK_SUCCESS)
2744       goto fail;
2745 
2746    v3dv_cmd_buffer_add_private_obj(
2747       cmd_buffer, (uintptr_t)image_view,
2748       (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyImageView);
2749 
2750    VkFramebufferCreateInfo fb_info = {
2751       .sType = VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO,
2752       .renderPass = pipeline->pass,
2753       .attachmentCount = 1,
2754       .pAttachments = &image_view,
2755       .width = fb_width,
2756       .height = fb_height,
2757       .layers = num_layers,
2758    };
2759 
2760    VkFramebuffer fb;
2761    result = v3dv_CreateFramebuffer(_device, &fb_info,
2762                                    &cmd_buffer->device->vk.alloc, &fb);
2763    if (result != VK_SUCCESS)
2764       goto fail;
2765 
2766    v3dv_cmd_buffer_add_private_obj(
2767       cmd_buffer, (uintptr_t)fb,
2768       (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyFramebuffer);
2769 
2770    /* For each layer */
2771    for (uint32_t l = 0; l < num_layers; l++) {
2772       /* Start render pass for this layer.
2773        *
2774        * If we only have one region to copy, then we might be able to
2775        * skip the TLB load if it is aligned to tile boundaries. All layers
2776        * copy the same area, so we only need to check this once.
2777        */
2778       bool can_skip_tlb_load = false;
2779       VkRect2D render_area;
2780       if (region_count == 1) {
2781          render_area.offset.x = regions[0].imageOffset.x;
2782          render_area.offset.y = regions[0].imageOffset.y;
2783          render_area.extent.width = regions[0].imageExtent.width;
2784          render_area.extent.height = regions[0].imageExtent.height;
2785 
2786          if (l == 0) {
2787             struct v3dv_render_pass *pipeline_pass =
2788                v3dv_render_pass_from_handle(pipeline->pass);
2789             can_skip_tlb_load =
2790                cmask == full_cmask &&
2791                v3dv_subpass_area_is_tile_aligned(cmd_buffer->device, &render_area,
2792                                                  v3dv_framebuffer_from_handle(fb),
2793                                                  pipeline_pass, 0);
2794          }
2795       } else {
2796          render_area.offset.x = 0;
2797          render_area.offset.y = 0;
2798          render_area.extent.width = fb_width;
2799          render_area.extent.height = fb_height;
2800       }
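      /* When batching multiple regions the render area conservatively covers
       * the whole framebuffer, so the TLB load is never skipped in that case.
       */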
2801 
2802       VkRenderPassBeginInfo rp_info = {
2803          .sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO,
2804          .renderPass = can_skip_tlb_load ? pipeline->pass_no_load :
2805                                            pipeline->pass,
2806          .framebuffer = fb,
2807          .renderArea = render_area,
2808          .clearValueCount = 0,
2809       };
2810 
2811       VkSubpassBeginInfo sp_info = {
2812          .sType = VK_STRUCTURE_TYPE_SUBPASS_BEGIN_INFO,
2813          .contents = VK_SUBPASS_CONTENTS_INLINE,
2814       };
2815 
2816       v3dv_CmdBeginRenderPass2(_cmd_buffer, &rp_info, &sp_info);
2817       struct v3dv_job *job = cmd_buffer->state.job;
2818       if (!job)
2819          goto fail;
2820 
2821       /* If we are using a layered copy we need to specify the layer for the
2822        * Geometry Shader.
2823        */
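      /* The layer index is pushed at offset 24, right after the 24 bytes of
       * per-region data consumed by the fragment shader (see push_data
       * below), so the two push constant ranges don't overlap.
       */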
2824       if (num_layers > 1) {
2825          uint32_t layer = resource->baseArrayLayer + l;
2826          v3dv_CmdPushConstants(_cmd_buffer,
2827                                cmd_buffer->device->meta.texel_buffer_copy.p_layout,
2828                                VK_SHADER_STAGE_GEOMETRY_BIT,
2829                                24, 4, &layer);
2830       }
2831 
2832       /* For each region */
2833       for (uint32_t r = 0; r < region_count; r++) {
2834          const VkBufferImageCopy2 *region = &regions[r];
2835 
2836          /* Obtain the 2D buffer region spec */
2837          uint32_t buf_width, buf_height;
2838          if (region->bufferRowLength == 0)
2839              buf_width = region->imageExtent.width;
2840          else
2841              buf_width = region->bufferRowLength;
2842 
2843          if (region->bufferImageHeight == 0)
2844              buf_height = region->imageExtent.height;
2845          else
2846              buf_height = region->bufferImageHeight;
2847 
2848          const VkViewport viewport = {
2849             .x = region->imageOffset.x,
2850             .y = region->imageOffset.y,
2851             .width = region->imageExtent.width,
2852             .height = region->imageExtent.height,
2853             .minDepth = 0.0f,
2854             .maxDepth = 1.0f
2855          };
2856          v3dv_CmdSetViewport(_cmd_buffer, 0, 1, &viewport);
2857          const VkRect2D scissor = {
2858             .offset = { region->imageOffset.x, region->imageOffset.y },
2859             .extent = { region->imageExtent.width, region->imageExtent.height }
2860          };
2861          v3dv_CmdSetScissor(_cmd_buffer, 0, 1, &scissor);
2862 
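         /* Per-region data for the fragment shader, pushed at offset 0: the
          * destination box (x0, y0, x1, y1) in pixels, the buffer row stride
          * in texels, and the offset in texels of this layer's data in the
          * buffer (bufferOffset is in bytes, hence the division by the bpp).
          */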
2863          const VkDeviceSize buf_offset =
2864             region->bufferOffset / buffer_bpp + l * buf_height * buf_width;
2865          uint32_t push_data[6] = {
2866             region->imageOffset.x,
2867             region->imageOffset.y,
2868             region->imageOffset.x + region->imageExtent.width - 1,
2869             region->imageOffset.y + region->imageExtent.height - 1,
2870             buf_width,
2871             buf_offset,
2872          };
2873 
2874          v3dv_CmdPushConstants(_cmd_buffer,
2875                                cmd_buffer->device->meta.texel_buffer_copy.p_layout,
2876                                VK_SHADER_STAGE_FRAGMENT_BIT,
2877                                0, sizeof(push_data), &push_data);
2878 
2879          v3dv_CmdDraw(_cmd_buffer, 4, 1, 0, 0);
2880       } /* For each region */
2881 
2882       VkSubpassEndInfo sp_end_info = {
2883          .sType = VK_STRUCTURE_TYPE_SUBPASS_END_INFO,
2884       };
2885 
2886       v3dv_CmdEndRenderPass2(_cmd_buffer, &sp_end_info);
2887    } /* For each layer */
2888 
2889 fail:
2890    v3dv_cmd_buffer_meta_state_pop(cmd_buffer, true);
2891    return handled;
2892 }
2893 
2894 /**
2895  * Returns true if the implementation supports the requested operation (even if
2896  * it failed to process it, for example, due to an out-of-memory error).
2897  */
2898 static bool
2899 copy_buffer_to_image_blit(struct v3dv_cmd_buffer *cmd_buffer,
2900                           VkImageAspectFlags aspect,
2901                           struct v3dv_image *image,
2902                           VkFormat dst_format,
2903                           VkFormat src_format,
2904                           struct v3dv_buffer *buffer,
2905                           uint32_t buffer_bpp,
2906                           VkColorComponentFlags cmask,
2907                           VkComponentMapping *cswizzle,
2908                           uint32_t region_count,
2909                           const VkBufferImageCopy2 *regions)
2910 {
2911    /* Since we can't sample linear images we need to upload the linear
2912     * buffer to a tiled image that we can use as a blit source, which
2913     * is slow.
2914     */
2915    perf_debug("Falling back to blit path for buffer to image copy.\n");
2916 
2917    struct v3dv_device *device = cmd_buffer->device;
2918    VkDevice _device = v3dv_device_to_handle(device);
2919    bool handled = true;
2920 
2921    /* Allocate memory for the tiled image. Since we copy layer by layer
2922     * we allocate memory to hold a full layer, which is the worst case.
2923     * For that we create a dummy image with that spec, get memory requirements
2924     * for it and use that information to create the memory allocation.
2925     * We will then reuse this memory store for all the regions we want to
2926     * copy.
2927     */
2928    VkImage dummy_image;
2929    VkImageCreateInfo dummy_info = {
2930       .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO,
2931       .imageType = VK_IMAGE_TYPE_2D,
2932       .format = src_format,
2933       .extent = { image->vk.extent.width, image->vk.extent.height, 1 },
2934       .mipLevels = 1,
2935       .arrayLayers = 1,
2936       .samples = VK_SAMPLE_COUNT_1_BIT,
2937       .tiling = VK_IMAGE_TILING_OPTIMAL,
2938       .usage = VK_IMAGE_USAGE_TRANSFER_SRC_BIT |
2939                VK_IMAGE_USAGE_TRANSFER_DST_BIT,
2940       .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
2941       .queueFamilyIndexCount = 0,
2942       .initialLayout = VK_IMAGE_LAYOUT_GENERAL,
2943    };
2944    VkResult result =
2945       v3dv_CreateImage(_device, &dummy_info, &device->vk.alloc, &dummy_image);
2946    if (result != VK_SUCCESS)
2947       return handled;
2948 
2949    VkMemoryRequirements reqs;
2950    vk_common_GetImageMemoryRequirements(_device, dummy_image, &reqs);
2951    v3dv_DestroyImage(_device, dummy_image, &device->vk.alloc);
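   /* The dummy image was only needed to query the memory requirements; the
    * allocation below is what we keep and rebind for every temporary image.
    */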
2952 
2953    VkDeviceMemory mem;
2954    VkMemoryAllocateInfo alloc_info = {
2955       .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
2956       .allocationSize = reqs.size,
2957       .memoryTypeIndex = 0,
2958    };
2959    result = v3dv_AllocateMemory(_device, &alloc_info, &device->vk.alloc, &mem);
2960    if (result != VK_SUCCESS)
2961       return handled;
2962 
2963    v3dv_cmd_buffer_add_private_obj(
2964       cmd_buffer, (uintptr_t)mem,
2965       (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_FreeMemory);
2966 
2967    /* Obtain the layer count.
2968     *
2969     * If we are batching (region_count > 1) all our regions have the same
2970     * image subresource so we can take this from the first region.
2971     */
2972    uint32_t num_layers;
2973    if (image->vk.image_type != VK_IMAGE_TYPE_3D) {
2974       num_layers = vk_image_subresource_layer_count(&image->vk,
2975                                                     &regions[0].imageSubresource);
2976    } else {
2977       num_layers = regions[0].imageExtent.depth;
2978    }
2979    assert(num_layers > 0);
2980 
2981    /* Sanity check: we can only batch multiple regions together if they have
2982     * the same framebuffer (so the same layer).
2983     */
2984    assert(num_layers == 1 || region_count == 1);
2985 
2986    uint8_t plane = v3dv_plane_from_aspect(aspect);
2987    assert(plane < image->plane_count);
2988 
2989    const uint32_t block_width =
2990       vk_format_get_blockwidth(image->planes[plane].vk_format);
2991    const uint32_t block_height =
2992       vk_format_get_blockheight(image->planes[plane].vk_format);
2993 
2994    /* Copy regions by uploading each region to a temporary tiled image using
2995     * the memory we have just allocated as storage.
2996     */
2997    for (uint32_t r = 0; r < region_count; r++) {
2998       const VkBufferImageCopy2 *region = &regions[r];
2999 
3000       /* Obtain the 2D buffer region spec */
3001       uint32_t buf_width, buf_height;
3002       if (region->bufferRowLength == 0)
3003           buf_width = region->imageExtent.width;
3004       else
3005           buf_width = region->bufferRowLength;
3006 
3007       if (region->bufferImageHeight == 0)
3008           buf_height = region->imageExtent.height;
3009       else
3010           buf_height = region->bufferImageHeight;
3011 
3012       /* If the image is compressed, the bpp refers to blocks, not pixels */
3013       buf_width = buf_width / block_width;
3014       buf_height = buf_height / block_height;
3015 
3016       for (uint32_t i = 0; i < num_layers; i++) {
3017          /* Create the tiled image */
3018          VkImageCreateInfo image_info = {
3019             .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO,
3020             .imageType = VK_IMAGE_TYPE_2D,
3021             .format = src_format,
3022             .extent = { buf_width, buf_height, 1 },
3023             .mipLevels = 1,
3024             .arrayLayers = 1,
3025             .samples = VK_SAMPLE_COUNT_1_BIT,
3026             .tiling = VK_IMAGE_TILING_OPTIMAL,
3027             .usage = VK_IMAGE_USAGE_TRANSFER_SRC_BIT |
3028                      VK_IMAGE_USAGE_TRANSFER_DST_BIT,
3029             .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
3030             .queueFamilyIndexCount = 0,
3031             .initialLayout = VK_IMAGE_LAYOUT_GENERAL,
3032          };
3033 
3034          VkImage buffer_image;
3035          VkResult result =
3036             v3dv_CreateImage(_device, &image_info, &device->vk.alloc,
3037                              &buffer_image);
3038          if (result != VK_SUCCESS)
3039             return handled;
3040 
3041          v3dv_cmd_buffer_add_private_obj(
3042             cmd_buffer, (uintptr_t)buffer_image,
3043             (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyImage);
3044 
3045          result = vk_common_BindImageMemory(_device, buffer_image, mem, 0);
3046          if (result != VK_SUCCESS)
3047             return handled;
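         /* Every temporary image is bound to the same worst-case allocation
          * at offset 0, so the memory store is reused across all regions and
          * layers.
          */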
3048 
3049          /* When copying a multi-plane image the aspect indicates the plane to
3050           * copy. For these, we only copy one plane at a time, which is always
3051           * a color plane.
3052           */
3053          VkImageAspectFlags copy_aspect =
3054             image->plane_count == 1 ? aspect : VK_IMAGE_ASPECT_COLOR_BIT;
3055 
3056          /* Upload buffer contents for the selected layer */
3057          const VkDeviceSize buf_offset_bytes =
3058             region->bufferOffset + i * buf_height * buf_width * buffer_bpp;
3059          const VkBufferImageCopy2 buffer_image_copy = {
3060             .sType = VK_STRUCTURE_TYPE_BUFFER_IMAGE_COPY_2,
3061             .bufferOffset = buf_offset_bytes,
3062             .bufferRowLength = region->bufferRowLength / block_width,
3063             .bufferImageHeight = region->bufferImageHeight / block_height,
3064             .imageSubresource = {
3065                .aspectMask = copy_aspect,
3066                .mipLevel = 0,
3067                .baseArrayLayer = 0,
3068                .layerCount = 1,
3069             },
3070             .imageOffset = { 0, 0, 0 },
3071             .imageExtent = { buf_width, buf_height, 1 }
3072          };
3073          handled =
3074             create_tiled_image_from_buffer(cmd_buffer,
3075                                            v3dv_image_from_handle(buffer_image),
3076                                            buffer, &buffer_image_copy);
3077          if (!handled) {
3078             /* This is unexpected, we should have set up the upload to be
3079              * conformant to a TFU or TLB copy.
3080              */
3081             unreachable("Unable to copy buffer to image through TLB");
3082             return false;
3083          }
3084 
3085          /* Blit-copy the requested image extent from the buffer image to the
3086           * destination image.
3087           *
3088           * Since we are copying, the blit must use the same format on the
3089           * destination and source images to avoid format conversions. The
3090           * only exception is copying stencil, which we upload to a R8UI source
3091           * image, but that we need to blit to a S8D24 destination (the only
3092           * stencil format we support).
3093           */
3094          const VkImageBlit2 blit_region = {
3095             .sType = VK_STRUCTURE_TYPE_IMAGE_BLIT_2,
3096             .srcSubresource = {
3097                .aspectMask = copy_aspect,
3098                .mipLevel = 0,
3099                .baseArrayLayer = 0,
3100                .layerCount = 1,
3101             },
3102             .srcOffsets = {
3103                { 0, 0, 0 },
3104                { region->imageExtent.width, region->imageExtent.height, 1 },
3105             },
3106             .dstSubresource = {
3107                .aspectMask = aspect,
3108                .mipLevel = region->imageSubresource.mipLevel,
3109                .baseArrayLayer = region->imageSubresource.baseArrayLayer + i,
3110                .layerCount = 1,
3111             },
3112             .dstOffsets = {
3113                {
3114                   DIV_ROUND_UP(region->imageOffset.x, block_width),
3115                   DIV_ROUND_UP(region->imageOffset.y, block_height),
3116                   region->imageOffset.z + i,
3117                },
3118                {
3119                   DIV_ROUND_UP(region->imageOffset.x + region->imageExtent.width,
3120                                block_width),
3121                   DIV_ROUND_UP(region->imageOffset.y + region->imageExtent.height,
3122                                block_height),
3123                   region->imageOffset.z + i + 1,
3124                },
3125             },
3126          };
3127 
3128          handled = blit_shader(cmd_buffer,
3129                                image, dst_format,
3130                                v3dv_image_from_handle(buffer_image), src_format,
3131                                cmask, cswizzle,
3132                                &blit_region, VK_FILTER_NEAREST, true);
3133          if (!handled) {
3134             /* This is unexpected, we should have a supported blit spec */
3135             unreachable("Unable to blit buffer to destination image");
3136             return false;
3137          }
3138       }
3139    }
3140 
3141    return handled;
3142 }
3143 
3144 /**
3145  * Returns true if the implementation supports the requested operation (even if
3146  * it failed to process it, for example, due to an out-of-memory error).
3147  */
3148 static bool
3149 copy_buffer_to_image_shader(struct v3dv_cmd_buffer *cmd_buffer,
3150                             struct v3dv_image *image,
3151                             struct v3dv_buffer *buffer,
3152                             uint32_t region_count,
3153                             const VkBufferImageCopy2 *regions,
3154                             bool use_texel_buffer)
3155 {
3156    /* We can only call this with region_count > 1 if we can batch the regions
3157     * together, in which case they share the same image subresource, and so
3158     * the same aspect.
3159     */
3160    VkImageAspectFlags aspect = regions[0].imageSubresource.aspectMask;
3161    const VkImageAspectFlagBits any_plane_aspect =
3162       VK_IMAGE_ASPECT_PLANE_0_BIT |
3163       VK_IMAGE_ASPECT_PLANE_1_BIT |
3164       VK_IMAGE_ASPECT_PLANE_2_BIT;
3165 
3166    bool is_plane_aspect = aspect & any_plane_aspect;
3167 
3168    /* Generally, the bpp of the data in the buffer matches that of the
3169     * destination image. The exception is the case where we are uploading
3170     * stencil (8bpp) to a combined d24s8 image (32bpp).
3171     */
3172    uint8_t plane = v3dv_plane_from_aspect(aspect);
3173    assert(plane < image->plane_count);
3174    uint32_t buf_bpp = image->planes[plane].cpp;
3175 
3176    /* We are about to upload the buffer data to an image so we can then
3177     * blit that to our destination region. Because we are going to implement
3178     * the copy as a blit, we want our blit source and destination formats to be
3179     * the same (to avoid any format conversions), so we choose a canonical
3180     * format that matches the destination image bpp.
3181     */
3182    VkComponentMapping ident_swizzle = {
3183       .r = VK_COMPONENT_SWIZZLE_IDENTITY,
3184       .g = VK_COMPONENT_SWIZZLE_IDENTITY,
3185       .b = VK_COMPONENT_SWIZZLE_IDENTITY,
3186       .a = VK_COMPONENT_SWIZZLE_IDENTITY,
3187    };
3188 
3189    VkComponentMapping cswizzle = ident_swizzle;
3190    VkColorComponentFlags cmask = 0; /* Write all components */
3191    VkFormat src_format;
3192    VkFormat dst_format;
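   /* Canonical uint formats per texel size (D/S aspects get special
    * handling in the 4 bpp case below):
    *   16 bpp -> R32G32B32A32_UINT     8 bpp -> R16G16B16A16_UINT
    *    4 bpp -> R8G8B8A8_UINT         2 bpp -> R16_UINT
    *    1 bpp -> R8_UINT
    */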
3193    switch (buf_bpp) {
3194    case 16:
3195       assert(aspect == VK_IMAGE_ASPECT_COLOR_BIT);
3196       src_format = VK_FORMAT_R32G32B32A32_UINT;
3197       dst_format = src_format;
3198       break;
3199    case 8:
3200       assert(aspect == VK_IMAGE_ASPECT_COLOR_BIT);
3201       src_format = VK_FORMAT_R16G16B16A16_UINT;
3202       dst_format = src_format;
3203       break;
3204    case 4:
3205       switch (aspect) {
3206       case VK_IMAGE_ASPECT_COLOR_BIT:
3207       case VK_IMAGE_ASPECT_PLANE_0_BIT:
3208       case VK_IMAGE_ASPECT_PLANE_1_BIT:
3209       case VK_IMAGE_ASPECT_PLANE_2_BIT:
3210          src_format = VK_FORMAT_R8G8B8A8_UINT;
3211          dst_format = src_format;
3212          break;
3213       case VK_IMAGE_ASPECT_DEPTH_BIT:
3214          assert(image->vk.format == VK_FORMAT_D32_SFLOAT ||
3215                 image->vk.format == VK_FORMAT_D24_UNORM_S8_UINT ||
3216                 image->vk.format == VK_FORMAT_X8_D24_UNORM_PACK32);
3217          src_format = VK_FORMAT_R8G8B8A8_UINT;
3218          dst_format = src_format;
3219 
3220          /* For D24 formats, the Vulkan spec states that the depth component
3221           * in the buffer is stored in the 24 LSBs, but V3D wants it in the
3222           * 24 MSBs.
3223           */
3224          if (image->vk.format == VK_FORMAT_D24_UNORM_S8_UINT ||
3225              image->vk.format == VK_FORMAT_X8_D24_UNORM_PACK32) {
3226             cmask = VK_COLOR_COMPONENT_G_BIT |
3227                     VK_COLOR_COMPONENT_B_BIT |
3228                     VK_COLOR_COMPONENT_A_BIT;
3229             cswizzle.r = VK_COMPONENT_SWIZZLE_R;
3230             cswizzle.g = VK_COMPONENT_SWIZZLE_R;
3231             cswizzle.b = VK_COMPONENT_SWIZZLE_G;
3232             cswizzle.a = VK_COMPONENT_SWIZZLE_B;
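            /* E.g. a texel read back from the buffer as (d0, d1, d2, x) is
             * emitted as (d0, d0, d1, d2); with only GBA writes enabled the
             * destination keeps its stencil/X8 byte and receives d0..d2 in
             * its 24 MSBs.
             */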
3233          }
3234          break;
3235       case VK_IMAGE_ASPECT_STENCIL_BIT:
3236          /* Since we don't support separate stencil this is always a stencil
3237           * copy to a combined depth/stencil image. Because we don't support
3238           * separate stencil images, we interpret the buffer data as a
3239           * color R8UI image, and implement the blit as a compatible color
3240           * blit to an RGBA8UI destination masking out writes to components
3241           * GBA (which map to the D24 component of a S8D24 image).
3242           */
3243          assert(image->vk.format == VK_FORMAT_D24_UNORM_S8_UINT);
3244          buf_bpp = 1;
3245          src_format = VK_FORMAT_R8_UINT;
3246          dst_format = VK_FORMAT_R8G8B8A8_UINT;
3247          cmask = VK_COLOR_COMPONENT_R_BIT;
3248          break;
3249       default:
3250          unreachable("unsupported aspect");
3251          return false;
3252       };
3253       break;
3254    case 2:
3255       assert(aspect == VK_IMAGE_ASPECT_COLOR_BIT ||
3256              aspect == VK_IMAGE_ASPECT_DEPTH_BIT ||
3257              is_plane_aspect);
3258       src_format = VK_FORMAT_R16_UINT;
3259       dst_format = src_format;
3260       break;
3261    case 1:
3262       assert(aspect == VK_IMAGE_ASPECT_COLOR_BIT || is_plane_aspect);
3263       src_format = VK_FORMAT_R8_UINT;
3264       dst_format = src_format;
3265       break;
3266    default:
3267       unreachable("unsupported bit-size");
3268       return false;
3269    }
3270 
3271    if (use_texel_buffer) {
3272       return texel_buffer_shader_copy(cmd_buffer, aspect, image,
3273                                       dst_format, src_format,
3274                                       buffer, buf_bpp,
3275                                       cmask, &cswizzle,
3276                                       region_count, regions);
3277    } else {
3278       return copy_buffer_to_image_blit(cmd_buffer, aspect, image,
3279                                        dst_format, src_format,
3280                                        buffer, buf_bpp,
3281                                        cmask, &cswizzle,
3282                                        region_count, regions);
3283    }
3284 }
3285 
3286 VKAPI_ATTR void VKAPI_CALL
3287 v3dv_CmdCopyBufferToImage2(VkCommandBuffer commandBuffer,
3288                            const VkCopyBufferToImageInfo2 *info)
3289 {
3290    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
3291    V3DV_FROM_HANDLE(v3dv_buffer, buffer, info->srcBuffer);
3292    V3DV_FROM_HANDLE(v3dv_image, image, info->dstImage);
3293 
3294    assert(image->vk.samples == VK_SAMPLE_COUNT_1_BIT);
3295 
3296    cmd_buffer->state.is_transfer = true;
3297 
3298    uint32_t r = 0;
3299    while (r < info->regionCount) {
3300       /* The TFU and TLB paths can only copy one region at a time and the region
3301        * needs to start at the origin. We try these first for the common case
3302        * where we are copying full images, since they should be the fastest.
3303        */
3304       uint32_t batch_size = 1;
3305       if (copy_buffer_to_image_tfu(cmd_buffer, image, buffer, &info->pRegions[r]))
3306          goto handled;
3307 
3308       if (copy_buffer_to_image_tlb(cmd_buffer, image, buffer, &info->pRegions[r]))
3309          goto handled;
3310 
3311       /* Otherwise, we are copying subrects, so we fall back to copying
3312        * via shader and texel buffers and we try to batch the regions
3313        * if possible. We can only batch copies if they have the same
3314        * framebuffer spec, which is mostly determined by the image
3315        * subresource of the region.
3316        */
3317       const VkImageSubresourceLayers *rsc = &info->pRegions[r].imageSubresource;
3318       for (uint32_t s = r + 1; s < info->regionCount; s++) {
3319          const VkImageSubresourceLayers *rsc_s =
3320             &info->pRegions[s].imageSubresource;
3321 
3322          if (memcmp(rsc, rsc_s, sizeof(VkImageSubresourceLayers)) != 0)
3323             break;
3324 
3325          /* For 3D images we also need to check the depth extent */
3326          if (image->vk.image_type == VK_IMAGE_TYPE_3D &&
3327              info->pRegions[s].imageExtent.depth !=
3328              info->pRegions[r].imageExtent.depth) {
3329                break;
3330          }
3331 
3332          batch_size++;
3333       }
3334 
3335       if (copy_buffer_to_image_shader(cmd_buffer, image, buffer,
3336                                       batch_size, &info->pRegions[r], true)) {
3337          goto handled;
3338       }
3339 
3340       /* If we still could not copy, fall back to slower paths.
3341        *
3342        * FIXME: we could try to batch these too, but since they are bound to be
3343        * slow it might not be worth it and we should instead put more effort
3344        * in handling more cases with the other paths.
3345        */
3346       if (copy_buffer_to_image_shader(cmd_buffer, image, buffer,
3347                                       batch_size, &info->pRegions[r], false)) {
3348          goto handled;
3349       }
3350 
3351       unreachable("Unsupported buffer to image copy.");
3352 
3353 handled:
3354       r += batch_size;
3355    }
3356 
3357    cmd_buffer->state.is_transfer = false;
3358 }
3359 
3360 static void
3361 compute_blit_3d_layers(const VkOffset3D *offsets,
3362                        uint32_t *min_layer, uint32_t *max_layer,
3363                        bool *mirror_z);
3364 
3365 /**
3366  * Returns true if the implementation supports the requested operation (even if
3367  * it failed to process it, for example, due to an out-of-memory error).
3368  *
3369  * The TFU blit path doesn't handle scaling so the blit filter parameter can
3370  * be ignored.
3371  */
3372 static bool
3373 blit_tfu(struct v3dv_cmd_buffer *cmd_buffer,
3374          struct v3dv_image *dst,
3375          struct v3dv_image *src,
3376          const VkImageBlit2 *region)
3377 {
3378    if (V3D_DBG(DISABLE_TFU)) {
3379       perf_debug("Blit: TFU disabled, fallbacks could be slower.");
3380       return false;
3381    }
3382 
3383    assert(dst->vk.samples == VK_SAMPLE_COUNT_1_BIT);
3384    assert(src->vk.samples == VK_SAMPLE_COUNT_1_BIT);
3385 
3386    /* From vkCmdBlitImage:
3387     *   "srcImage must not use a format that requires a sampler YCBCR
3388     *    conversion"
3389     *   "dstImage must not use a format that requires a sampler YCBCR
3390     *    conversion"
3391     */
3392    assert(dst->plane_count == 1);
3393    assert(src->plane_count == 1);
3394 
3395    /* Format must match */
3396    if (src->vk.format != dst->vk.format)
3397       return false;
3398 
3399    /* Destination can't be raster format */
3400    if (!dst->tiled)
3401       return false;
3402 
3403    /* Source region must start at (0,0) */
3404    if (region->srcOffsets[0].x != 0 || region->srcOffsets[0].y != 0)
3405       return false;
3406 
3407    /* Destination image must be complete */
3408    if (region->dstOffsets[0].x != 0 || region->dstOffsets[0].y != 0)
3409       return false;
3410 
3411    const uint32_t dst_mip_level = region->dstSubresource.mipLevel;
3412    const uint32_t dst_width = u_minify(dst->vk.extent.width, dst_mip_level);
3413    const uint32_t dst_height = u_minify(dst->vk.extent.height, dst_mip_level);
3414    if (region->dstOffsets[1].x < dst_width - 1 ||
3415        region->dstOffsets[1].y < dst_height - 1) {
3416       return false;
3417    }
3418 
3419    /* No XY scaling */
3420    if (region->srcOffsets[1].x != region->dstOffsets[1].x ||
3421        region->srcOffsets[1].y != region->dstOffsets[1].y) {
3422       return false;
3423    }
3424 
3425    /* If the format is D24S8 both aspects need to be copied, since the TFU
3426     * can't be programmed to copy only one aspect of the image.
3427     */
3428    if (dst->vk.format == VK_FORMAT_D24_UNORM_S8_UINT) {
3429        const VkImageAspectFlags ds_aspects = VK_IMAGE_ASPECT_DEPTH_BIT |
3430                                              VK_IMAGE_ASPECT_STENCIL_BIT;
3431        if (region->dstSubresource.aspectMask != ds_aspects)
3432           return false;
3433    }
3434 
3435    /* Our TFU blits only handle exact copies (they require the same formats
3436     * on input and output, no scaling, etc.), so there are no pixel format
3437     * conversions and we can rewrite the format to use one that is TFU
3438     * compatible based on its texel size.
3439     */
3440    const struct v3dv_format *format =
3441       v3dv_get_compatible_tfu_format(cmd_buffer->device,
3442                                      dst->planes[0].cpp, NULL);
3443 
3444    /* Emit a TFU job for each layer to blit */
3445    assert(vk_image_subresource_layer_count(&dst->vk, &region->dstSubresource) ==
3446           vk_image_subresource_layer_count(&src->vk, &region->srcSubresource));
3447 
3448    uint32_t min_dst_layer;
3449    uint32_t max_dst_layer;
3450    bool dst_mirror_z = false;
3451    if (dst->vk.image_type == VK_IMAGE_TYPE_3D) {
3452       compute_blit_3d_layers(region->dstOffsets,
3453                              &min_dst_layer, &max_dst_layer,
3454                              &dst_mirror_z);
3455    } else {
3456       min_dst_layer = region->dstSubresource.baseArrayLayer;
3457       max_dst_layer = min_dst_layer +
3458                       vk_image_subresource_layer_count(&dst->vk,
3459                                                        &region->dstSubresource);
3460    }
3461 
3462    uint32_t min_src_layer;
3463    uint32_t max_src_layer;
3464    bool src_mirror_z = false;
3465    if (src->vk.image_type == VK_IMAGE_TYPE_3D) {
3466       compute_blit_3d_layers(region->srcOffsets,
3467                              &min_src_layer, &max_src_layer,
3468                              &src_mirror_z);
3469    } else {
3470       min_src_layer = region->srcSubresource.baseArrayLayer;
3471       max_src_layer = min_src_layer +
3472                       vk_image_subresource_layer_count(&src->vk,
3473                                                        &region->srcSubresource);
3474    }
3475 
3476    /* No Z scaling for 3D images (for non-3D images both src and dst must
3477     * have the same layerCount).
3478     */
3479    if (max_dst_layer - min_dst_layer != max_src_layer - min_src_layer)
3480       return false;
3481 
3482    const uint32_t layer_count = max_dst_layer - min_dst_layer;
3483    const uint32_t src_mip_level = region->srcSubresource.mipLevel;
3484    for (uint32_t i = 0; i < layer_count; i++) {
3485       /* Since the TFU path doesn't handle scaling, Z mirroring for 3D images
3486        * only involves reversing the order of the slices.
3487        */
3488       const uint32_t dst_layer =
3489          dst_mirror_z ? max_dst_layer - i - 1 : min_dst_layer + i;
3490       const uint32_t src_layer =
3491          src_mirror_z ? max_src_layer - i - 1 : min_src_layer + i;
3492 
3493       const uint32_t dst_offset =
3494          dst->planes[0].mem->bo->offset + v3dv_layer_offset(dst, dst_mip_level,
3495                                                             dst_layer, 0);
3496       const uint32_t src_offset =
3497          src->planes[0].mem->bo->offset + v3dv_layer_offset(src, src_mip_level,
3498                                                             src_layer, 0);
3499 
3500       const struct v3d_resource_slice *dst_slice = &dst->planes[0].slices[dst_mip_level];
3501       const struct v3d_resource_slice *src_slice = &src->planes[0].slices[src_mip_level];
3502 
3503       v3dv_X(cmd_buffer->device, meta_emit_tfu_job)(
3504          cmd_buffer,
3505          dst->planes[0].mem->bo->handle,
3506          dst_offset,
3507          dst_slice->tiling,
3508          dst_slice->padded_height,
3509          dst->planes[0].cpp,
3510          src->planes[0].mem->bo->handle,
3511          src_offset,
3512          src_slice->tiling,
3513          src_slice->tiling == V3D_TILING_RASTER ?
3514                               src_slice->stride : src_slice->padded_height,
3515          src->planes[0].cpp,
3516          dst_width, dst_height, &format->planes[0]);
3517    }
3518 
3519    return true;
3520 }
3521 
3522 static bool
3523 format_needs_software_int_clamp(VkFormat format)
3524 {
3525    switch (format) {
3526       case VK_FORMAT_A2R10G10B10_UINT_PACK32:
3527       case VK_FORMAT_A2R10G10B10_SINT_PACK32:
3528       case VK_FORMAT_A2B10G10R10_UINT_PACK32:
3529       case VK_FORMAT_A2B10G10R10_SINT_PACK32:
3530          return true;
3531       default:
3532          return false;
3533    };
3534 }
3535 
3536 static void
3537 get_blit_pipeline_cache_key(VkFormat dst_format,
3538                             VkFormat src_format,
3539                             VkColorComponentFlags cmask,
3540                             VkSampleCountFlagBits dst_samples,
3541                             VkSampleCountFlagBits src_samples,
3542                             uint8_t *key)
3543 {
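   /* Key layout (4 32-bit words, matching the assert at the end of this
    * function): word 0 = dst_format; word 1 = src_format when a software
    * integer clamp is needed, 0 otherwise; word 2 = cmask;
    * word 3 = (dst_samples << 8) | src_samples.
    */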
3544    memset(key, 0, V3DV_META_BLIT_CACHE_KEY_SIZE);
3545 
3546    uint32_t *p = (uint32_t *) key;
3547 
3548    *p = dst_format;
3549    p++;
3550 
3551    /* Generally, when blitting from a larger format to a smaller format
3552     * the hardware takes care of clamping the source to the RT range.
3553     * Specifically, for integer formats, this is done by using
3554     * V3D_RENDER_TARGET_CLAMP_INT in the render target setup, however, this
3555     * clamps to the bit-size of the render type, and some formats, such as
3556     * rgb10a2_uint, have a 16-bit type, so it won't do what we need and we
3557     * have to clamp in software. In these cases, we need to amend the blit
3558     * shader with clamp code that depends on both the src and dst formats, so
3559     * we need the src format to be part of the key.
3560     */
3561    *p = format_needs_software_int_clamp(dst_format) ? src_format : 0;
3562    p++;
3563 
3564    *p = cmask;
3565    p++;
3566 
3567    *p = (dst_samples << 8) | src_samples;
3568    p++;
3569 
3570    assert(((uint8_t*)p - key) == V3DV_META_BLIT_CACHE_KEY_SIZE);
3571 }
3572 
3573 static bool
3574 create_blit_render_pass(struct v3dv_device *device,
3575                         VkFormat dst_format,
3576                         VkFormat src_format,
3577                         VkRenderPass *pass_load,
3578                         VkRenderPass *pass_no_load)
3579 {
3580    const bool is_color_blit = vk_format_is_color(dst_format);
3581 
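   /* The same description is used to create two compatible passes: one that
    * loads the attachment, and a "no load" variant for blits whose render
    * area is tile-aligned and can therefore skip the TLB load.
    */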
3582    /* Attachment load operation is specified below */
3583    VkAttachmentDescription2 att = {
3584       .sType = VK_STRUCTURE_TYPE_ATTACHMENT_DESCRIPTION_2,
3585       .format = dst_format,
3586       .samples = VK_SAMPLE_COUNT_1_BIT,
3587       .storeOp = VK_ATTACHMENT_STORE_OP_STORE,
3588       .initialLayout = VK_IMAGE_LAYOUT_GENERAL,
3589       .finalLayout = VK_IMAGE_LAYOUT_GENERAL,
3590    };
3591 
3592    VkAttachmentReference2 att_ref = {
3593       .sType = VK_STRUCTURE_TYPE_ATTACHMENT_REFERENCE_2,
3594       .attachment = 0,
3595       .layout = VK_IMAGE_LAYOUT_GENERAL,
3596    };
3597 
3598    VkSubpassDescription2 subpass = {
3599       .sType = VK_STRUCTURE_TYPE_SUBPASS_DESCRIPTION_2,
3600       .pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS,
3601       .inputAttachmentCount = 0,
3602       .colorAttachmentCount = is_color_blit ? 1 : 0,
3603       .pColorAttachments = is_color_blit ? &att_ref : NULL,
3604       .pResolveAttachments = NULL,
3605       .pDepthStencilAttachment = is_color_blit ? NULL : &att_ref,
3606       .preserveAttachmentCount = 0,
3607       .pPreserveAttachments = NULL,
3608    };
3609 
3610    VkRenderPassCreateInfo2 info = {
3611       .sType = VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO_2,
3612       .attachmentCount = 1,
3613       .pAttachments = &att,
3614       .subpassCount = 1,
3615       .pSubpasses = &subpass,
3616       .dependencyCount = 0,
3617       .pDependencies = NULL,
3618    };
3619 
3620    VkResult result;
3621    att.loadOp = VK_ATTACHMENT_LOAD_OP_LOAD;
3622    result = v3dv_CreateRenderPass2(v3dv_device_to_handle(device),
3623                                    &info, &device->vk.alloc, pass_load);
3624    if (result != VK_SUCCESS)
3625       return false;
3626 
3627    att.loadOp = VK_ATTACHMENT_LOAD_OP_DONT_CARE;
3628    result = v3dv_CreateRenderPass2(v3dv_device_to_handle(device),
3629                                    &info, &device->vk.alloc, pass_no_load);
3630    return result == VK_SUCCESS;
3631 }
3632 
3633 static nir_def *
3634 gen_tex_coords(nir_builder *b)
3635 {
3636    nir_def *tex_box =
3637       nir_load_push_constant(b, 4, 32, nir_imm_int(b, 0), .base = 0, .range = 16);
3638 
3639    nir_def *tex_z =
3640       nir_load_push_constant(b, 1, 32, nir_imm_int(b, 0), .base = 16, .range = 4);
3641 
3642    nir_def *vertex_id = nir_load_vertex_id(b);
3643 
3644    /* vertex 0: src0_x, src0_y
3645     * vertex 1: src0_x, src1_y
3646     * vertex 2: src1_x, src0_y
3647     * vertex 3: src1_x, src1_y
3648     *
3649     * So:
3650     *
3651     * channel 0 is vertex_id < 2 ? src0_x : src1_x
3652     * channel 1 is vertex_id & 1 ? src1_y : src0_y
3653     */
3654 
3655    nir_def *one = nir_imm_int(b, 1);
3656    nir_def *c0cmp = nir_ilt_imm(b, vertex_id, 2);
3657    nir_def *c1cmp = nir_ieq(b, nir_iand(b, vertex_id, one), one);
3658 
3659    nir_def *comp[4];
3660    comp[0] = nir_bcsel(b, c0cmp,
3661                        nir_channel(b, tex_box, 0),
3662                        nir_channel(b, tex_box, 2));
3663 
3664    comp[1] = nir_bcsel(b, c1cmp,
3665                        nir_channel(b, tex_box, 3),
3666                        nir_channel(b, tex_box, 1));
3667    comp[2] = tex_z;
3668    comp[3] = nir_imm_float(b, 1.0f);
3669    return nir_vec(b, comp, 4);
3670 }
3671 
3672 static nir_def *
3673 build_nir_tex_op_read(struct nir_builder *b,
3674                       nir_def *tex_pos,
3675                       enum glsl_base_type tex_type,
3676                       enum glsl_sampler_dim dim)
3677 {
3678    assert(dim != GLSL_SAMPLER_DIM_MS);
3679 
3680    const struct glsl_type *sampler_type =
3681       glsl_sampler_type(dim, false, false, tex_type);
3682    nir_variable *sampler =
3683       nir_variable_create(b->shader, nir_var_uniform, sampler_type, "s_tex");
3684    sampler->data.descriptor_set = 0;
3685    sampler->data.binding = 0;
3686 
3687    nir_def *tex_deref = &nir_build_deref_var(b, sampler)->def;
3688    nir_tex_instr *tex = nir_tex_instr_create(b->shader, 3);
3689    tex->sampler_dim = dim;
3690    tex->op = nir_texop_tex;
3691    tex->src[0] = nir_tex_src_for_ssa(nir_tex_src_coord, tex_pos);
3692    tex->src[1] = nir_tex_src_for_ssa(nir_tex_src_texture_deref, tex_deref);
3693    tex->src[2] = nir_tex_src_for_ssa(nir_tex_src_sampler_deref, tex_deref);
3694    tex->dest_type = nir_get_nir_type_for_glsl_base_type(tex_type);
3695    tex->is_array = glsl_sampler_type_is_array(sampler_type);
3696    tex->coord_components = tex_pos->num_components;
3697 
3698    nir_def_init(&tex->instr, &tex->def, 4, 32);
3699    nir_builder_instr_insert(b, &tex->instr);
3700    return &tex->def;
3701 }
3702 
3703 static nir_def *
3704 build_nir_tex_op_ms_fetch_sample(struct nir_builder *b,
3705                                  nir_variable *sampler,
3706                                  nir_def *tex_deref,
3707                                  enum glsl_base_type tex_type,
3708                                  nir_def *tex_pos,
3709                                  nir_def *sample_idx)
3710 {
3711    nir_tex_instr *tex = nir_tex_instr_create(b->shader, 3);
3712    tex->sampler_dim = GLSL_SAMPLER_DIM_MS;
3713    tex->op = nir_texop_txf_ms;
3714    tex->src[0] = nir_tex_src_for_ssa(nir_tex_src_coord, tex_pos);
3715    tex->src[1] = nir_tex_src_for_ssa(nir_tex_src_texture_deref, tex_deref);
3716    tex->src[2] = nir_tex_src_for_ssa(nir_tex_src_ms_index, sample_idx);
3717    tex->dest_type = nir_get_nir_type_for_glsl_base_type(tex_type);
3718    tex->is_array = false;
3719    tex->coord_components = tex_pos->num_components;
3720 
3721    nir_def_init(&tex->instr, &tex->def, 4, 32);
3722    nir_builder_instr_insert(b, &tex->instr);
3723    return &tex->def;
3724 }
3725 
3726 /* Fetches all samples at the given position and averages them */
3727 static nir_def *
3728 build_nir_tex_op_ms_resolve(struct nir_builder *b,
3729                             nir_def *tex_pos,
3730                             enum glsl_base_type tex_type,
3731                             VkSampleCountFlagBits src_samples)
3732 {
3733    assert(src_samples > VK_SAMPLE_COUNT_1_BIT);
3734    const struct glsl_type *sampler_type =
3735       glsl_sampler_type(GLSL_SAMPLER_DIM_MS, false, false, tex_type);
3736    nir_variable *sampler =
3737       nir_variable_create(b->shader, nir_var_uniform, sampler_type, "s_tex");
3738    sampler->data.descriptor_set = 0;
3739    sampler->data.binding = 0;
3740 
3741    const bool is_int = glsl_base_type_is_integer(tex_type);
3742 
3743    nir_def *tmp = NULL;
3744    nir_def *tex_deref = &nir_build_deref_var(b, sampler)->def;
3745    for (uint32_t i = 0; i < src_samples; i++) {
3746       nir_def *s =
3747          build_nir_tex_op_ms_fetch_sample(b, sampler, tex_deref,
3748                                           tex_type, tex_pos,
3749                                           nir_imm_int(b, i));
3750 
3751       /* For integer formats, the multisample resolve operation is expected to
3752        * return one of the samples; we just return the first one.
3753        */
3754       if (is_int)
3755          return s;
3756 
3757       tmp = i == 0 ? s : nir_fadd(b, tmp, s);
3758    }
3759 
3760    assert(!is_int);
3761    return nir_fmul_imm(b, tmp, 1.0f / src_samples);
3762 }
3763 
3764 /* Fetches the current sample (gl_SampleID) at the given position */
3765 static nir_def *
3766 build_nir_tex_op_ms_read(struct nir_builder *b,
3767                          nir_def *tex_pos,
3768                          enum glsl_base_type tex_type)
3769 {
3770    const struct glsl_type *sampler_type =
3771       glsl_sampler_type(GLSL_SAMPLER_DIM_MS, false, false, tex_type);
3772    nir_variable *sampler =
3773       nir_variable_create(b->shader, nir_var_uniform, sampler_type, "s_tex");
3774    sampler->data.descriptor_set = 0;
3775    sampler->data.binding = 0;
3776 
3777    nir_def *tex_deref = &nir_build_deref_var(b, sampler)->def;
3778 
3779    return build_nir_tex_op_ms_fetch_sample(b, sampler, tex_deref,
3780                                            tex_type, tex_pos,
3781                                            nir_load_sample_id(b));
3782 }
3783 
3784 static nir_def *
3785 build_nir_tex_op(struct nir_builder *b,
3786                  struct v3dv_device *device,
3787                  nir_def *tex_pos,
3788                  enum glsl_base_type tex_type,
3789                  VkSampleCountFlagBits dst_samples,
3790                  VkSampleCountFlagBits src_samples,
3791                  enum glsl_sampler_dim dim)
3792 {
3793    switch (dim) {
3794    case GLSL_SAMPLER_DIM_MS:
3795       assert(src_samples == VK_SAMPLE_COUNT_4_BIT);
3796       /* For multisampled texture sources we need to use fetching instead of
3797        * normalized texture coordinates. We already configured our blit
3798        * coordinates to be in texel units, but here we still need to convert
3799        * them from floating point to integer.
3800        */
3801       tex_pos = nir_f2i32(b, tex_pos);
3802 
3803       if (dst_samples == VK_SAMPLE_COUNT_1_BIT)
3804          return build_nir_tex_op_ms_resolve(b, tex_pos, tex_type, src_samples);
3805       else
3806          return build_nir_tex_op_ms_read(b, tex_pos, tex_type);
3807    default:
3808       assert(src_samples == VK_SAMPLE_COUNT_1_BIT);
3809       return build_nir_tex_op_read(b, tex_pos, tex_type, dim);
3810    }
3811 }
3812 
3813 static nir_shader *
3814 get_blit_vs(const nir_shader_compiler_options *options)
3815 {
3816    nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_VERTEX, options,
3817                                                   "meta blit vs");
3818 
3819    const struct glsl_type *vec4 = glsl_vec4_type();
3820 
3821    nir_variable *vs_out_pos =
3822       nir_variable_create(b.shader, nir_var_shader_out, vec4, "gl_Position");
3823    vs_out_pos->data.location = VARYING_SLOT_POS;
3824 
3825    nir_variable *vs_out_tex_coord =
3826       nir_variable_create(b.shader, nir_var_shader_out, vec4, "out_tex_coord");
3827    vs_out_tex_coord->data.location = VARYING_SLOT_VAR0;
3828    vs_out_tex_coord->data.interpolation = INTERP_MODE_SMOOTH;
3829 
3830    nir_def *pos = nir_gen_rect_vertices(&b, NULL, NULL);
3831    nir_store_var(&b, vs_out_pos, pos, 0xf);
3832 
3833    nir_def *tex_coord = gen_tex_coords(&b);
3834    nir_store_var(&b, vs_out_tex_coord, tex_coord, 0xf);
3835 
3836    return b.shader;
3837 }
3838 
3839 static uint32_t
3840 get_channel_mask_for_sampler_dim(enum glsl_sampler_dim sampler_dim)
3841 {
3842    switch (sampler_dim) {
3843    case GLSL_SAMPLER_DIM_1D: return 0x1;
3844    case GLSL_SAMPLER_DIM_2D: return 0x3;
3845    case GLSL_SAMPLER_DIM_MS: return 0x3;
3846    case GLSL_SAMPLER_DIM_3D: return 0x7;
3847    default:
3848       unreachable("invalid sampler dim");
3849    };
3850 }
3851 
3852 static nir_shader *
3853 get_color_blit_fs(const nir_shader_compiler_options *options,
3854                   struct v3dv_device *device,
3855                   VkFormat dst_format,
3856                   VkFormat src_format,
3857                   VkSampleCountFlagBits dst_samples,
3858                   VkSampleCountFlagBits src_samples,
3859                   enum glsl_sampler_dim sampler_dim)
3860 {
3861    nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_FRAGMENT, options,
3862                                                   "meta blit fs");
3863 
3864    const struct glsl_type *vec4 = glsl_vec4_type();
3865 
3866    nir_variable *fs_in_tex_coord =
3867       nir_variable_create(b.shader, nir_var_shader_in, vec4, "in_tex_coord");
3868    fs_in_tex_coord->data.location = VARYING_SLOT_VAR0;
3869 
3870    const struct glsl_type *fs_out_type =
3871       vk_format_is_sint(dst_format) ? glsl_ivec4_type() :
3872       vk_format_is_uint(dst_format) ? glsl_uvec4_type() :
3873                                       glsl_vec4_type();
3874 
3875    enum glsl_base_type src_base_type =
3876       vk_format_is_sint(src_format) ? GLSL_TYPE_INT :
3877       vk_format_is_uint(src_format) ? GLSL_TYPE_UINT :
3878                                       GLSL_TYPE_FLOAT;
3879 
3880    nir_variable *fs_out_color =
3881       nir_variable_create(b.shader, nir_var_shader_out, fs_out_type, "out_color");
3882    fs_out_color->data.location = FRAG_RESULT_DATA0;
3883 
3884    nir_def *tex_coord = nir_load_var(&b, fs_in_tex_coord);
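   /* Trim the vec4 tex coord varying to the components the sampler
    * dimensionality actually consumes (e.g. mask 0x3 selects xy for 2D).
    */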
3885    const uint32_t channel_mask = get_channel_mask_for_sampler_dim(sampler_dim);
3886    tex_coord = nir_channels(&b, tex_coord, channel_mask);
3887 
3888    nir_def *color = build_nir_tex_op(&b, device, tex_coord, src_base_type,
3889                                          dst_samples, src_samples, sampler_dim);
3890 
3891    /* For integer textures, if the bit-size of the destination is too small to
3892     * hold the source value, Vulkan (CTS) expects the implementation to clamp to the
3893     * maximum value the destination can hold. The hardware can clamp to the
3894     * render target type, which usually matches the component bit-size, but
3895     * there are some cases that won't match, such as rgb10a2, which has a 16-bit
3896     * render target type, so in these cases we need to clamp manually.
3897     */
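   /* For example, with a 10-bit destination component the unsigned clamp
    * bound is (1 << 10) - 1 = 1023, and the signed bounds are
    * [-(1 << 9), (1 << 9) - 1] = [-512, 511].
    */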
3898    if (format_needs_software_int_clamp(dst_format)) {
3899       assert(vk_format_is_int(dst_format));
3900       enum pipe_format src_pformat = vk_format_to_pipe_format(src_format);
3901       enum pipe_format dst_pformat = vk_format_to_pipe_format(dst_format);
3902 
3903       nir_def *c[4];
3904       for (uint32_t i = 0; i < 4; i++) {
3905          c[i] = nir_channel(&b, color, i);
3906 
3907          const uint32_t src_bit_size =
3908             util_format_get_component_bits(src_pformat,
3909                                            UTIL_FORMAT_COLORSPACE_RGB,
3910                                            i);
3911          const uint32_t dst_bit_size =
3912             util_format_get_component_bits(dst_pformat,
3913                                            UTIL_FORMAT_COLORSPACE_RGB,
3914                                            i);
3915 
3916          if (dst_bit_size >= src_bit_size)
3917             continue;
3918 
3919          assert(dst_bit_size > 0);
3920          if (util_format_is_pure_uint(dst_pformat)) {
3921             nir_def *max = nir_imm_int(&b, (1 << dst_bit_size) - 1);
3922             c[i] = nir_umin(&b, c[i], max);
3923          } else {
3924             nir_def *max = nir_imm_int(&b, (1 << (dst_bit_size - 1)) - 1);
3925             nir_def *min = nir_imm_int(&b, -(1 << (dst_bit_size - 1)));
3926             c[i] = nir_imax(&b, nir_imin(&b, c[i], max), min);
3927          }
3928       }
3929 
3930       color = nir_vec4(&b, c[0], c[1], c[2], c[3]);
3931    }
3932 
3933    nir_store_var(&b, fs_out_color, color, 0xf);
3934 
3935    return b.shader;
3936 }
3937 
3938 static bool
3939 create_pipeline(struct v3dv_device *device,
3940                 struct v3dv_render_pass *pass,
3941                 struct nir_shader *vs_nir,
3942                 struct nir_shader *gs_nir,
3943                 struct nir_shader *fs_nir,
3944                 const VkPipelineVertexInputStateCreateInfo *vi_state,
3945                 const VkPipelineDepthStencilStateCreateInfo *ds_state,
3946                 const VkPipelineColorBlendStateCreateInfo *cb_state,
3947                 const VkPipelineMultisampleStateCreateInfo *ms_state,
3948                 const VkPipelineLayout layout,
3949                 VkPipeline *pipeline)
3950 {
3951    struct vk_shader_module vs_m = vk_shader_module_from_nir(vs_nir);
3952    struct vk_shader_module fs_m = vk_shader_module_from_nir(fs_nir);
3953    struct vk_shader_module gs_m;
3954 
3955    uint32_t num_stages = gs_nir ? 3 : 2;
3956 
3957 
3958    VkPipelineShaderStageCreateInfo stages[3] = {
3959       {
3960          .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
3961          .stage = VK_SHADER_STAGE_VERTEX_BIT,
3962          .module = vk_shader_module_to_handle(&vs_m),
3963          .pName = "main",
3964       },
3965       {
3966          .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
3967          .stage = VK_SHADER_STAGE_FRAGMENT_BIT,
3968          .module = vk_shader_module_to_handle(&fs_m),
3969          .pName = "main",
3970       },
3971       {
3972          .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
3973          .stage = VK_SHADER_STAGE_GEOMETRY_BIT,
3974          .module = VK_NULL_HANDLE,
3975          .pName = "main",
3976       },
3977    };
3978 
3979    if (gs_nir) {
3980       gs_m = vk_shader_module_from_nir(gs_nir);
3981       stages[2].module = vk_shader_module_to_handle(&gs_m);
3982    }
3983 
3984    VkGraphicsPipelineCreateInfo info = {
3985       .sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO,
3986 
3987       .stageCount = num_stages,
3988       .pStages = stages,
3989 
3990       .pVertexInputState = vi_state,
3991 
3992       .pInputAssemblyState = &(VkPipelineInputAssemblyStateCreateInfo) {
3993          .sType = VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO,
3994          .topology = VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP,
3995          .primitiveRestartEnable = false,
3996       },
3997 
3998       .pViewportState = &(VkPipelineViewportStateCreateInfo) {
3999          .sType = VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO,
4000          .viewportCount = 1,
4001          .scissorCount = 1,
4002       },
4003 
4004       .pRasterizationState = &(VkPipelineRasterizationStateCreateInfo) {
4005          .sType = VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO,
4006          .rasterizerDiscardEnable = false,
4007          .polygonMode = VK_POLYGON_MODE_FILL,
4008          .cullMode = VK_CULL_MODE_NONE,
4009          .frontFace = VK_FRONT_FACE_COUNTER_CLOCKWISE,
4010          .depthBiasEnable = false,
4011       },
4012 
4013       .pMultisampleState = ms_state,
4014 
4015       .pDepthStencilState = ds_state,
4016 
4017       .pColorBlendState = cb_state,
4018 
4019       /* These meta pipelines declare all state as dynamic.
4020        * As a consequence, vkCmdBindPipeline writes no dynamic state
4021        * to the cmd buffer. Therefore, at the end of the meta operation,
4022        * we need only restore the dynamic state that was set with vkCmdSet*.
4023        */
4024       .pDynamicState = &(VkPipelineDynamicStateCreateInfo) {
4025          .sType = VK_STRUCTURE_TYPE_PIPELINE_DYNAMIC_STATE_CREATE_INFO,
4026          .dynamicStateCount = 8,
4027          .pDynamicStates = (VkDynamicState[]) {
4028             VK_DYNAMIC_STATE_VIEWPORT,
4029             VK_DYNAMIC_STATE_SCISSOR,
4030             VK_DYNAMIC_STATE_STENCIL_COMPARE_MASK,
4031             VK_DYNAMIC_STATE_STENCIL_WRITE_MASK,
4032             VK_DYNAMIC_STATE_STENCIL_REFERENCE,
4033             VK_DYNAMIC_STATE_BLEND_CONSTANTS,
4034             VK_DYNAMIC_STATE_DEPTH_BIAS,
4035             VK_DYNAMIC_STATE_LINE_WIDTH,
4036          },
4037       },
4038 
4039       .flags = 0,
4040       .layout = layout,
4041       .renderPass = v3dv_render_pass_to_handle(pass),
4042       .subpass = 0,
4043    };
4044 
4045    VkResult result =
4046       v3dv_CreateGraphicsPipelines(v3dv_device_to_handle(device),
4047                                    VK_NULL_HANDLE,
4048                                    1, &info,
4049                                    &device->vk.alloc,
4050                                    pipeline);
4051 
4052    ralloc_free(vs_nir);
4053    ralloc_free(gs_nir);
4054    ralloc_free(fs_nir);
4055 
4056    return result == VK_SUCCESS;
4057 }
4058 
4059 static enum glsl_sampler_dim
4060 get_sampler_dim(VkImageType type, VkSampleCountFlagBits src_samples)
4061 {
4062    /* From the Vulkan 1.0 spec, VkImageCreateInfo Valid Usage:
4063     *
4064     *   "If samples is not VK_SAMPLE_COUNT_1_BIT, then imageType must be
4065     *    VK_IMAGE_TYPE_2D, ..."
4066     */
4067    assert(src_samples == VK_SAMPLE_COUNT_1_BIT || type == VK_IMAGE_TYPE_2D);
4068 
4069    switch (type) {
4070    case VK_IMAGE_TYPE_1D: return GLSL_SAMPLER_DIM_1D;
4071    case VK_IMAGE_TYPE_2D:
4072       return src_samples == VK_SAMPLE_COUNT_1_BIT ? GLSL_SAMPLER_DIM_2D :
4073                                                     GLSL_SAMPLER_DIM_MS;
4074    case VK_IMAGE_TYPE_3D: return GLSL_SAMPLER_DIM_3D;
4075    default:
4076       unreachable("Invalid image type");
4077    }
4078 }
4079 
4080 static bool
4081 create_blit_pipeline(struct v3dv_device *device,
4082                      VkFormat dst_format,
4083                      VkFormat src_format,
4084                      VkColorComponentFlags cmask,
4085                      VkImageType src_type,
4086                      VkSampleCountFlagBits dst_samples,
4087                      VkSampleCountFlagBits src_samples,
4088                      VkRenderPass _pass,
4089                      VkPipelineLayout pipeline_layout,
4090                      VkPipeline *pipeline)
4091 {
4092    struct v3dv_render_pass *pass = v3dv_render_pass_from_handle(_pass);
4093 
4094    /* We always rewrite depth/stencil blits to compatible color blits */
4095    assert(vk_format_is_color(dst_format));
4096    assert(vk_format_is_color(src_format));
4097 
4098    const nir_shader_compiler_options *options =
4099       v3dv_pipeline_get_nir_options(&device->devinfo);
4100 
4101    const enum glsl_sampler_dim sampler_dim =
4102       get_sampler_dim(src_type, src_samples);
4103 
4104    nir_shader *vs_nir = get_blit_vs(options);
4105    nir_shader *fs_nir =
4106       get_color_blit_fs(options, device, dst_format, src_format,
4107                         dst_samples, src_samples, sampler_dim);
4108 
4109    const VkPipelineVertexInputStateCreateInfo vi_state = {
4110       .sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO,
4111       .vertexBindingDescriptionCount = 0,
4112       .vertexAttributeDescriptionCount = 0,
4113    };
4114 
4115    VkPipelineDepthStencilStateCreateInfo ds_state = {
4116       .sType = VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO,
4117    };
4118 
4119    VkPipelineColorBlendAttachmentState blend_att_state[1] = { 0 };
4120    blend_att_state[0] = (VkPipelineColorBlendAttachmentState) {
4121       .blendEnable = false,
4122       .colorWriteMask = cmask,
4123    };
4124 
4125    const VkPipelineColorBlendStateCreateInfo cb_state = {
4126       .sType = VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO,
4127       .logicOpEnable = false,
4128       .attachmentCount = 1,
4129       .pAttachments = blend_att_state
4130    };
4131 
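   /* Sample shading is requested for multisampled destinations so the blit
    * fragment shader can be evaluated per sample rather than once per pixel,
    * letting each destination sample be written independently.
    */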
4132    const VkPipelineMultisampleStateCreateInfo ms_state = {
4133       .sType = VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO,
4134       .rasterizationSamples = dst_samples,
4135       .sampleShadingEnable = dst_samples > VK_SAMPLE_COUNT_1_BIT,
4136       .pSampleMask = NULL,
4137       .alphaToCoverageEnable = false,
4138       .alphaToOneEnable = false,
4139    };
4140 
4141    return create_pipeline(device,
4142                           pass,
4143                           vs_nir, NULL, fs_nir,
4144                           &vi_state,
4145                           &ds_state,
4146                           &cb_state,
4147                           &ms_state,
4148                           pipeline_layout,
4149                           pipeline);
4150 }
4151 
4152 /**
4153  * Return a pipeline suitable for blitting the requested aspect given the
4154  * destination and source formats.
4155  */
4156 static bool
4157 get_blit_pipeline(struct v3dv_cmd_buffer *cmd_buffer,
4158                   VkFormat dst_format,
4159                   VkFormat src_format,
4160                   VkColorComponentFlags cmask,
4161                   VkImageType src_type,
4162                   VkSampleCountFlagBits dst_samples,
4163                   VkSampleCountFlagBits src_samples,
4164                   struct v3dv_meta_blit_pipeline **pipeline)
4165 {
4166    bool ok = true;
4167    struct v3dv_device *device = cmd_buffer->device;
4168 
4169    uint8_t key[V3DV_META_BLIT_CACHE_KEY_SIZE];
4170    if (device->instance->meta_cache_enabled) {
4171       get_blit_pipeline_cache_key(dst_format, src_format, cmask,
4172                                   dst_samples, src_samples, key);
4173       mtx_lock(&device->meta.mtx);
4174       struct hash_entry *entry =
4175          _mesa_hash_table_search(device->meta.blit.cache[src_type], &key);
4176       if (entry) {
4177          mtx_unlock(&device->meta.mtx);
4178          *pipeline = entry->data;
4179          return true;
4180       }
4181    }
4182 
4183    *pipeline = vk_zalloc2(&device->vk.alloc, NULL, sizeof(**pipeline), 8,
4184                           VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
4185 
4186    if (*pipeline == NULL)
4187       goto fail;
4188 
4189    ok = create_blit_render_pass(device, dst_format, src_format,
4190                                 &(*pipeline)->pass,
4191                                 &(*pipeline)->pass_no_load);
4192    if (!ok)
4193       goto fail;
4194 
4195    /* Create the pipeline using one of the render passes; they are both
4196     * compatible, so we don't care which one we use here.
4197     */
4198    ok = create_blit_pipeline(device,
4199                              dst_format,
4200                              src_format,
4201                              cmask,
4202                              src_type,
4203                              dst_samples,
4204                              src_samples,
4205                              (*pipeline)->pass,
4206                              device->meta.blit.p_layout,
4207                              &(*pipeline)->pipeline);
4208    if (!ok)
4209       goto fail;
4210 
4211    if (device->instance->meta_cache_enabled) {
4212       memcpy((*pipeline)->key, key, sizeof((*pipeline)->key));
4213       _mesa_hash_table_insert(device->meta.blit.cache[src_type],
4214                               &(*pipeline)->key, *pipeline);
4215       mtx_unlock(&device->meta.mtx);
4216    } else {
4217       v3dv_cmd_buffer_add_private_obj(
4218          cmd_buffer, (uintptr_t)*pipeline,
4219          (v3dv_cmd_buffer_private_obj_destroy_cb)destroy_meta_blit_pipeline);
4220    }
4221 
4222    return true;
4223 
4224 fail:
4225    if (device->instance->meta_cache_enabled)
4226       mtx_unlock(&device->meta.mtx);
4227 
4228    VkDevice _device = v3dv_device_to_handle(device);
4229    if (*pipeline) {
4230       if ((*pipeline)->pass)
4231          v3dv_DestroyRenderPass(_device, (*pipeline)->pass, &device->vk.alloc);
4232       if ((*pipeline)->pass_no_load)
4233          v3dv_DestroyRenderPass(_device, (*pipeline)->pass_no_load, &device->vk.alloc);
4234       if ((*pipeline)->pipeline)
4235          v3dv_DestroyPipeline(_device, (*pipeline)->pipeline, &device->vk.alloc);
4236       vk_free(&device->vk.alloc, *pipeline);
4237       *pipeline = NULL;
4238    }
4239 
4240    return false;
4241 }
4242 
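/* Computes the region (x, y, w, h) covered by a pair of blit offsets, clamped
 * to the image dimensions, and whether the blit is mirrored on each axis.
 * For example, offsets (32, 0) -> (0, 32) on a 64x64 image yield x = 0,
 * w = 32, mirror_x = true and y = 0, h = 32, mirror_y = false.
 */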
4243 static void
4244 compute_blit_box(const VkOffset3D *offsets,
4245                  uint32_t image_w, uint32_t image_h,
4246                  uint32_t *x, uint32_t *y, uint32_t *w, uint32_t *h,
4247                  bool *mirror_x, bool *mirror_y)
4248 {
4249    if (offsets[1].x >= offsets[0].x) {
4250       *mirror_x = false;
4251       *x = MIN2(offsets[0].x, image_w - 1);
4252       *w = MIN2(offsets[1].x - offsets[0].x, image_w - offsets[0].x);
4253    } else {
4254       *mirror_x = true;
4255       *x = MIN2(offsets[1].x, image_w - 1);
4256       *w = MIN2(offsets[0].x - offsets[1].x, image_w - offsets[1].x);
4257    }
4258    if (offsets[1].y >= offsets[0].y) {
4259       *mirror_y = false;
4260       *y = MIN2(offsets[0].y, image_h - 1);
4261       *h = MIN2(offsets[1].y - offsets[0].y, image_h - offsets[0].y);
4262    } else {
4263       *mirror_y = true;
4264       *y = MIN2(offsets[1].y, image_h - 1);
4265       *h = MIN2(offsets[0].y - offsets[1].y, image_h - offsets[1].y);
4266    }
4267 }
4268 
4269 static void
4270 compute_blit_3d_layers(const VkOffset3D *offsets,
4271                        uint32_t *min_layer, uint32_t *max_layer,
4272                        bool *mirror_z)
4273 {
4274    if (offsets[1].z >= offsets[0].z) {
4275       *mirror_z = false;
4276       *min_layer = offsets[0].z;
4277       *max_layer = offsets[1].z;
4278    } else {
4279       *mirror_z = true;
4280       *min_layer = offsets[1].z;
4281       *max_layer = offsets[0].z;
4282    }
4283 }
4284 
4285 static VkResult
4286 create_blit_descriptor_pool(struct v3dv_cmd_buffer *cmd_buffer)
4287 {
4288    /* If this is not the first pool we create for this command buffer,
4289     * size it based on the size of the currently exhausted pool.
4290     */
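   /* e.g. the first pool holds 64 combined image/sampler descriptors and each
    * replacement doubles the exhausted pool's capacity: 64, 128, 256, up to a
    * cap of 1024.
    */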
4291    uint32_t descriptor_count = 64;
4292    if (cmd_buffer->meta.blit.dspool != VK_NULL_HANDLE) {
4293       struct v3dv_descriptor_pool *exhausted_pool =
4294          v3dv_descriptor_pool_from_handle(cmd_buffer->meta.blit.dspool);
4295       descriptor_count = MIN2(exhausted_pool->max_entry_count * 2, 1024);
4296    }
4297 
4298    /* Create the descriptor pool */
4299    cmd_buffer->meta.blit.dspool = VK_NULL_HANDLE;
4300    VkDescriptorPoolSize pool_size = {
4301       .type = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
4302       .descriptorCount = descriptor_count,
4303    };
4304    VkDescriptorPoolCreateInfo info = {
4305       .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO,
4306       .maxSets = descriptor_count,
4307       .poolSizeCount = 1,
4308       .pPoolSizes = &pool_size,
4309       .flags = 0,
4310    };
4311    VkResult result =
4312       v3dv_CreateDescriptorPool(v3dv_device_to_handle(cmd_buffer->device),
4313                                 &info,
4314                                 &cmd_buffer->device->vk.alloc,
4315                                 &cmd_buffer->meta.blit.dspool);
4316 
4317    if (result == VK_SUCCESS) {
4318       assert(cmd_buffer->meta.blit.dspool != VK_NULL_HANDLE);
4319       const VkDescriptorPool _pool = cmd_buffer->meta.blit.dspool;
4320 
4321       v3dv_cmd_buffer_add_private_obj(
4322          cmd_buffer, (uintptr_t) _pool,
4323          (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyDescriptorPool);
4324 
4325       struct v3dv_descriptor_pool *pool =
4326          v3dv_descriptor_pool_from_handle(_pool);
4327       pool->is_driver_internal = true;
4328    }
4329 
4330    return result;
4331 }
4332 
4333 static VkResult
4334 allocate_blit_source_descriptor_set(struct v3dv_cmd_buffer *cmd_buffer,
4335                                     VkDescriptorSet *set)
4336 {
4337    /* Make sure we have a descriptor pool */
4338    VkResult result;
4339    if (cmd_buffer->meta.blit.dspool == VK_NULL_HANDLE) {
4340       result = create_blit_descriptor_pool(cmd_buffer);
4341       if (result != VK_SUCCESS)
4342          return result;
4343    }
4344    assert(cmd_buffer->meta.blit.dspool != VK_NULL_HANDLE);
4345 
4346    /* Allocate descriptor set */
4347    struct v3dv_device *device = cmd_buffer->device;
4348    VkDevice _device = v3dv_device_to_handle(device);
4349    VkDescriptorSetAllocateInfo info = {
4350       .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO,
4351       .descriptorPool = cmd_buffer->meta.blit.dspool,
4352       .descriptorSetCount = 1,
4353       .pSetLayouts = &device->meta.blit.ds_layout,
4354    };
4355    result = v3dv_AllocateDescriptorSets(_device, &info, set);
4356 
4357    /* If we ran out of pool space, grow the pool and try again */
4358    if (result == VK_ERROR_OUT_OF_POOL_MEMORY) {
4359       result = create_blit_descriptor_pool(cmd_buffer);
4360       if (result == VK_SUCCESS) {
4361          info.descriptorPool = cmd_buffer->meta.blit.dspool;
4362          result = v3dv_AllocateDescriptorSets(_device, &info, set);
4363       }
4364    }
4365 
4366    return result;
4367 }
4368 
4369 /**
4370  * Returns true if the implementation supports the requested operation (even if
4371  * it failed to process it, for example, due to an out-of-memory error).
4372  *
4373  * The caller can specify the channels on the destination to be written via the
4374  * cmask parameter (which can be 0 to default to all channels), as well as a
4375  * swizzle to apply to the source via the cswizzle parameter (which can be NULL
4376  * to use the default identity swizzle).
4377  *
4378  * Supports multi-plane formats too.
4379  */
4380 static bool
4381 blit_shader(struct v3dv_cmd_buffer *cmd_buffer,
4382             struct v3dv_image *dst,
4383             VkFormat dst_format,
4384             struct v3dv_image *src,
4385             VkFormat src_format,
4386             VkColorComponentFlags cmask,
4387             VkComponentMapping *cswizzle,
4388             const VkImageBlit2 *region,
4389             VkFilter filter,
4390             bool dst_is_padded_image)
4391 {
4392    bool handled = true;
4393    VkResult result;
4394 
4395    /* Can't sample from linear images */
4396    if (!src->tiled && src->vk.image_type != VK_IMAGE_TYPE_1D) {
4397       return false;
4398    }
4399 
4400    /* Rewrite combined D/S blits to compatible color blits */
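   /* Reinterpreting the texels through a same-size integer color format
    * copies the raw depth/stencil payload bit-for-bit: D16 becomes R16_UINT,
    * D32F becomes R32_UINT, and D24(S8) becomes RGBA8_UINT with a color mask
    * selecting the requested aspects.
    */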
4401    if (vk_format_is_depth_or_stencil(dst_format)) {
4402       assert(src_format == dst_format);
4403       assert(cmask == 0);
4404       switch (dst_format) {
4405       case VK_FORMAT_D16_UNORM:
4406          dst_format = VK_FORMAT_R16_UINT;
4407          break;
4408       case VK_FORMAT_D32_SFLOAT:
4409          dst_format = VK_FORMAT_R32_UINT;
4410          break;
4411       case VK_FORMAT_X8_D24_UNORM_PACK32:
4412       case VK_FORMAT_D24_UNORM_S8_UINT:
4413          if (region->srcSubresource.aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) {
4414             cmask |= VK_COLOR_COMPONENT_G_BIT |
4415                      VK_COLOR_COMPONENT_B_BIT |
4416                      VK_COLOR_COMPONENT_A_BIT;
4417          }
4418          if (region->srcSubresource.aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) {
4419             assert(dst_format == VK_FORMAT_D24_UNORM_S8_UINT);
4420             cmask |= VK_COLOR_COMPONENT_R_BIT;
4421          }
4422          dst_format = VK_FORMAT_R8G8B8A8_UINT;
4423          break;
4424       default:
4425          unreachable("Unsupported depth/stencil format");
4426       }
4427       src_format = dst_format;
4428    }
4429 
4430    uint8_t src_plane =
4431       v3dv_plane_from_aspect(region->srcSubresource.aspectMask);
4432    assert(src_plane < src->plane_count);
4433    uint8_t dst_plane =
4434       v3dv_plane_from_aspect(region->dstSubresource.aspectMask);
4435    assert(dst_plane < dst->plane_count);
4436 
4437    const VkColorComponentFlags full_cmask = VK_COLOR_COMPONENT_R_BIT |
4438                                             VK_COLOR_COMPONENT_G_BIT |
4439                                             VK_COLOR_COMPONENT_B_BIT |
4440                                             VK_COLOR_COMPONENT_A_BIT;
4441    if (cmask == 0)
4442       cmask = full_cmask;
4443 
4444    VkComponentMapping ident_swizzle = {
4445       .r = VK_COMPONENT_SWIZZLE_IDENTITY,
4446       .g = VK_COMPONENT_SWIZZLE_IDENTITY,
4447       .b = VK_COMPONENT_SWIZZLE_IDENTITY,
4448       .a = VK_COMPONENT_SWIZZLE_IDENTITY,
4449    };
4450    if (!cswizzle)
4451       cswizzle = &ident_swizzle;
4452 
4453    /* When we get here from a copy between compressed / uncompressed images
4454     * we choose to specify the destination blit region based on the size
4455     * semantics of the source image of the copy (see copy_image_blit), so we
4456     * need to apply those same semantics here when we compute the size of the
4457     * destination image level.
4458     */
4459    const uint32_t dst_block_w =
4460       vk_format_get_blockwidth(dst->planes[dst_plane].vk_format);
4461    const uint32_t dst_block_h =
4462       vk_format_get_blockheight(dst->planes[dst_plane].vk_format);
4463    const uint32_t src_block_w =
4464       vk_format_get_blockwidth(src->planes[src_plane].vk_format);
4465    const uint32_t src_block_h =
4466       vk_format_get_blockheight(src->planes[src_plane].vk_format);
4467    const uint32_t dst_level_w =
4468       u_minify(DIV_ROUND_UP(dst->vk.extent.width * src_block_w, dst_block_w),
4469                region->dstSubresource.mipLevel);
4470    const uint32_t dst_level_h =
4471       u_minify(DIV_ROUND_UP(dst->vk.extent.height * src_block_h, dst_block_h),
4472                region->dstSubresource.mipLevel);
4473 
4474    const uint32_t src_level_w =
4475       u_minify(src->planes[src_plane].width, region->srcSubresource.mipLevel);
4476    const uint32_t src_level_h =
4477       u_minify(src->planes[src_plane].height, region->srcSubresource.mipLevel);
4478 
4479    assert(src->plane_count == 1 || src->vk.image_type != VK_IMAGE_TYPE_3D);
4480    const uint32_t src_level_d =
4481       u_minify(src->vk.extent.depth, region->srcSubresource.mipLevel);
4482 
4483    uint32_t dst_x, dst_y, dst_w, dst_h;
4484    bool dst_mirror_x, dst_mirror_y;
4485    compute_blit_box(region->dstOffsets,
4486                     dst_level_w, dst_level_h,
4487                     &dst_x, &dst_y, &dst_w, &dst_h,
4488                     &dst_mirror_x, &dst_mirror_y);
4489 
4490    uint32_t src_x, src_y, src_w, src_h;
4491    bool src_mirror_x, src_mirror_y;
4492    compute_blit_box(region->srcOffsets,
4493                     src_level_w, src_level_h,
4494                     &src_x, &src_y, &src_w, &src_h,
4495                     &src_mirror_x, &src_mirror_y);
4496 
4497    uint32_t min_dst_layer;
4498    uint32_t max_dst_layer;
4499    bool dst_mirror_z = false;
4500    if (dst->vk.image_type != VK_IMAGE_TYPE_3D) {
4501       min_dst_layer = region->dstSubresource.baseArrayLayer;
4502       max_dst_layer = min_dst_layer +
4503                       vk_image_subresource_layer_count(&dst->vk,
4504                                                        &region->dstSubresource);
4505    } else {
4506       compute_blit_3d_layers(region->dstOffsets,
4507                              &min_dst_layer, &max_dst_layer,
4508                              &dst_mirror_z);
4509    }
4510 
4511    uint32_t min_src_layer;
4512    uint32_t max_src_layer;
4513    bool src_mirror_z = false;
4514    if (src->vk.image_type != VK_IMAGE_TYPE_3D) {
4515       min_src_layer = region->srcSubresource.baseArrayLayer;
4516       max_src_layer = min_src_layer +
4517                       vk_image_subresource_layer_count(&src->vk,
4518                                                        &region->srcSubresource);
4519    } else {
4520       compute_blit_3d_layers(region->srcOffsets,
4521                              &min_src_layer, &max_src_layer,
4522                              &src_mirror_z);
4523    }
4524 
4525    uint32_t layer_count = max_dst_layer - min_dst_layer;
4526 
4527    /* Translate source blit coordinates to normalized texture coordinates for
4528     * single sampled textures. For multisampled textures we require
4529     * unnormalized coordinates, since we can only do texelFetch on them.
4530     */
4531    float coords[4] =  {
4532       (float)src_x,
4533       (float)src_y,
4534       (float)(src_x + src_w),
4535       (float)(src_y + src_h),
4536    };
4537 
4538    if (src->vk.samples == VK_SAMPLE_COUNT_1_BIT) {
4539       coords[0] /= (float)src_level_w;
4540       coords[1] /= (float)src_level_h;
4541       coords[2] /= (float)src_level_w;
4542       coords[3] /= (float)src_level_h;
4543    }
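   /* e.g. a source box (8, 8) .. (24, 24) on a 64x64 level maps to the
    * normalized coordinates (0.125, 0.125) .. (0.375, 0.375).
    */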
4544 
4545    /* Handle mirroring */
4546    const bool mirror_x = dst_mirror_x != src_mirror_x;
4547    const bool mirror_y = dst_mirror_y != src_mirror_y;
4548    const bool mirror_z = dst_mirror_z != src_mirror_z;
4549    float tex_coords[5] = {
4550       !mirror_x ? coords[0] : coords[2],
4551       !mirror_y ? coords[1] : coords[3],
4552       !mirror_x ? coords[2] : coords[0],
4553       !mirror_y ? coords[3] : coords[1],
4554       /* Z coordinate for 3D blit sources, to be filled for each
4555        * destination layer
4556        */
4557       0.0f
4558    };
4559 
4560    /* For blits from 3D images we also need to compute the slice coordinate to
4561     * sample from, which will change for each layer in the destination.
4562     * Compute the step we should increase for each iteration.
4563     */
4564    const float src_z_step =
4565       (float)(max_src_layer - min_src_layer) / (float)layer_count;
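   /* e.g. blitting an 8-slice 3D source onto 4 destination layers gives
    * src_z_step = 2.0, and layer i samples at depth (i + 0.5) * 2 / 8
    * (see the per-layer loop below).
    */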
4566 
4567    /* Get the blit pipeline */
4568    struct v3dv_meta_blit_pipeline *pipeline = NULL;
4569    bool ok = get_blit_pipeline(cmd_buffer,
4570                                dst_format, src_format, cmask, src->vk.image_type,
4571                                dst->vk.samples, src->vk.samples,
4572                                &pipeline);
4573    if (!ok)
4574       return handled;
4575    assert(pipeline && pipeline->pipeline &&
4576           pipeline->pass && pipeline->pass_no_load);
4577 
4578    struct v3dv_device *device = cmd_buffer->device;
4579    assert(device->meta.blit.ds_layout);
4580 
4581    VkDevice _device = v3dv_device_to_handle(device);
4582    VkCommandBuffer _cmd_buffer = v3dv_cmd_buffer_to_handle(cmd_buffer);
4583 
4584    /* Create sampler for blit source image */
4585    VkSamplerCreateInfo sampler_info = {
4586       .sType = VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO,
4587       .magFilter = filter,
4588       .minFilter = filter,
4589       .addressModeU = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE,
4590       .addressModeV = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE,
4591       .addressModeW = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE,
4592       .mipmapMode = VK_SAMPLER_MIPMAP_MODE_NEAREST,
4593    };
4594    VkSampler sampler;
4595    result = v3dv_CreateSampler(_device, &sampler_info, &device->vk.alloc,
4596                                &sampler);
4597    if (result != VK_SUCCESS)
4598       goto fail;
4599 
4600    v3dv_cmd_buffer_add_private_obj(
4601       cmd_buffer, (uintptr_t)sampler,
4602       (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroySampler);
4603 
4604    /* Push command buffer state before starting meta operation */
4605    v3dv_cmd_buffer_meta_state_push(cmd_buffer, true);
4606 
4607    /* Push state that is common for all layers */
4608    v3dv_CmdBindPipeline(_cmd_buffer,
4609                         VK_PIPELINE_BIND_POINT_GRAPHICS,
4610                         pipeline->pipeline);
4611 
4612    const VkViewport viewport = {
4613       .x = dst_x,
4614       .y = dst_y,
4615       .width = dst_w,
4616       .height = dst_h,
4617       .minDepth = 0.0f,
4618       .maxDepth = 1.0f
4619    };
4620    v3dv_CmdSetViewport(_cmd_buffer, 0, 1, &viewport);
4621 
4622    const VkRect2D scissor = {
4623       .offset = { dst_x, dst_y },
4624       .extent = { dst_w, dst_h }
4625    };
4626    v3dv_CmdSetScissor(_cmd_buffer, 0, 1, &scissor);
4627 
4628    bool can_skip_tlb_load = false;
4629    const VkRect2D render_area = {
4630       .offset = { dst_x, dst_y },
4631       .extent = { dst_w, dst_h },
4632    };
4633 
4634    /* Record per-layer commands */
4635    for (uint32_t i = 0; i < layer_count; i++) {
4636       /* Setup framebuffer */
4637       VkImageViewCreateInfo dst_image_view_info = {
4638          .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO,
4639          .image = v3dv_image_to_handle(dst),
4640          .viewType = v3dv_image_type_to_view_type(dst->vk.image_type),
4641          .format = dst_format,
4642          .subresourceRange = {
4643             .aspectMask = region->dstSubresource.aspectMask,
4644             .baseMipLevel = region->dstSubresource.mipLevel,
4645             .levelCount = 1,
4646             .baseArrayLayer = min_dst_layer + i,
4647             .layerCount = 1
4648          },
4649       };
4650       VkImageView dst_image_view;
4651       result = v3dv_create_image_view(device, &dst_image_view_info,
4652                                       &dst_image_view);
4653       if (result != VK_SUCCESS)
4654          goto fail;
4655 
4656       v3dv_cmd_buffer_add_private_obj(
4657          cmd_buffer, (uintptr_t)dst_image_view,
4658          (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyImageView);
4659 
4660       VkFramebufferCreateInfo fb_info = {
4661          .sType = VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO,
4662          .renderPass = pipeline->pass,
4663          .attachmentCount = 1,
4664          .pAttachments = &dst_image_view,
4665          .width = dst_x + dst_w,
4666          .height = dst_y + dst_h,
4667          .layers = 1,
4668       };
4669 
4670       VkFramebuffer fb;
4671       result = v3dv_CreateFramebuffer(_device, &fb_info,
4672                                       &cmd_buffer->device->vk.alloc, &fb);
4673       if (result != VK_SUCCESS)
4674          goto fail;
4675 
4676       struct v3dv_framebuffer *framebuffer = v3dv_framebuffer_from_handle(fb);
4677       framebuffer->has_edge_padding = fb_info.width == dst_level_w &&
4678                                       fb_info.height == dst_level_h &&
4679                                       dst_is_padded_image;
4680 
4681       v3dv_cmd_buffer_add_private_obj(
4682          cmd_buffer, (uintptr_t)fb,
4683          (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyFramebuffer);
4684 
4685       /* Setup descriptor set for blit source texture. We don't have to
4686        * register the descriptor as a private command buffer object since
4687        * all descriptors will be freed automatically with the descriptor
4688        * pool.
4689        */
4690       VkDescriptorSet set;
4691       result = allocate_blit_source_descriptor_set(cmd_buffer, &set);
4692       if (result != VK_SUCCESS)
4693          goto fail;
4694 
4695       VkImageViewCreateInfo src_image_view_info = {
4696          .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO,
4697          .image = v3dv_image_to_handle(src),
4698          .viewType = v3dv_image_type_to_view_type(src->vk.image_type),
4699          .format = src_format,
4700          .components = *cswizzle,
4701          .subresourceRange = {
4702             .aspectMask = region->srcSubresource.aspectMask,
4703             .baseMipLevel = region->srcSubresource.mipLevel,
4704             .levelCount = 1,
4705             .baseArrayLayer =
4706                src->vk.image_type == VK_IMAGE_TYPE_3D ? 0 : min_src_layer + i,
4707             .layerCount = 1
4708          },
4709       };
4710       VkImageView src_image_view;
4711       result = v3dv_create_image_view(device, &src_image_view_info,
4712                                       &src_image_view);
4713       if (result != VK_SUCCESS)
4714          goto fail;
4715 
4716       v3dv_cmd_buffer_add_private_obj(
4717          cmd_buffer, (uintptr_t)src_image_view,
4718          (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyImageView);
4719 
4720       VkDescriptorImageInfo image_info = {
4721          .sampler = sampler,
4722          .imageView = src_image_view,
4723          .imageLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
4724       };
4725       VkWriteDescriptorSet write = {
4726          .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
4727          .dstSet = set,
4728          .dstBinding = 0,
4729          .dstArrayElement = 0,
4730          .descriptorCount = 1,
4731          .descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
4732          .pImageInfo = &image_info,
4733       };
4734       v3dv_UpdateDescriptorSets(_device, 1, &write, 0, NULL);
4735 
4736       v3dv_CmdBindDescriptorSets(_cmd_buffer,
4737                                  VK_PIPELINE_BIND_POINT_GRAPHICS,
4738                                  device->meta.blit.p_layout,
4739                                  0, 1, &set,
4740                                  0, NULL);
4741 
4742       /* If the region we are about to blit is tile-aligned, then we can
4743        * use the render pass version that won't pre-load the tile buffer
4744        * with the dst image contents before the blit. The exception is when we
4745        * don't have a full color mask, since in that case we need to preserve
4746        * the original value of some of the color components.
4747        *
4748        * Since all layers have the same area, we only need to compute this for
4749        * the first.
4750        */
4751       if (i == 0) {
4752          struct v3dv_render_pass *pipeline_pass =
4753             v3dv_render_pass_from_handle(pipeline->pass);
4754          can_skip_tlb_load =
4755             cmask == full_cmask &&
4756             v3dv_subpass_area_is_tile_aligned(cmd_buffer->device, &render_area,
4757                                               framebuffer, pipeline_pass, 0);
4758       }
4759 
4760       /* Record blit */
4761       VkRenderPassBeginInfo rp_info = {
4762          .sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO,
4763          .renderPass = can_skip_tlb_load ? pipeline->pass_no_load :
4764                                            pipeline->pass,
4765          .framebuffer = fb,
4766          .renderArea = render_area,
4767          .clearValueCount = 0,
4768       };
4769 
4770       VkSubpassBeginInfo sp_info = {
4771          .sType = VK_STRUCTURE_TYPE_SUBPASS_BEGIN_INFO,
4772          .contents = VK_SUBPASS_CONTENTS_INLINE,
4773       };
4774 
4775       v3dv_CmdBeginRenderPass2(_cmd_buffer, &rp_info, &sp_info);
4776       struct v3dv_job *job = cmd_buffer->state.job;
4777       if (!job)
4778          goto fail;
4779 
4780       /* For 3D blits we need to compute the source slice to blit from (the Z
4781        * coordinate of the source sample operation). We want to choose this
4782        * based on the ratio of the depth of the source and the destination
4783        * images, picking the coordinate in the middle of each step.
4784        */
4785       if (src->vk.image_type == VK_IMAGE_TYPE_3D) {
4786          tex_coords[4] =
4787             !mirror_z ?
4788             (min_src_layer + (i + 0.5f) * src_z_step) / (float)src_level_d :
4789             (max_src_layer - (i + 0.5f) * src_z_step) / (float)src_level_d;
4790       }
4791 
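      /* The 20-byte push constant range covers the five floats in
       * tex_coords: x0, y0, x1, y1 and the source z slice.
       */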
4792       v3dv_CmdPushConstants(_cmd_buffer,
4793                             device->meta.blit.p_layout,
4794                             VK_SHADER_STAGE_VERTEX_BIT, 0, 20,
4795                             &tex_coords);
4796 
4797       v3dv_CmdDraw(_cmd_buffer, 4, 1, 0, 0);
4798 
4799       VkSubpassEndInfo sp_end_info = {
4800          .sType = VK_STRUCTURE_TYPE_SUBPASS_END_INFO,
4801       };
4802 
4803       v3dv_CmdEndRenderPass2(_cmd_buffer, &sp_end_info);
4804    }
4805 
4806 fail:
4807    v3dv_cmd_buffer_meta_state_pop(cmd_buffer, true);
4808 
4809    return handled;
4810 }
4811 
4812 VKAPI_ATTR void VKAPI_CALL
4813 v3dv_CmdBlitImage2(VkCommandBuffer commandBuffer,
4814                       const VkBlitImageInfo2 *pBlitImageInfo)
4815 {
4816    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
4817    V3DV_FROM_HANDLE(v3dv_image, src, pBlitImageInfo->srcImage);
4818    V3DV_FROM_HANDLE(v3dv_image, dst, pBlitImageInfo->dstImage);
4819 
4820    /* From vkCmdBlitImage:
4821     *   "srcImage must not use a format that requires a sampler YCBCR
4822     *    conversion"
4823     *   "dstImage must not use a format that requires a sampler YCBCR
4824     *    conversion"
4825     */
4826    assert(src->plane_count == 1);
4827    assert(dst->plane_count == 1);
4828 
4829    /* This command can only happen outside a render pass */
4830    assert(cmd_buffer->state.pass == NULL);
4831    assert(cmd_buffer->state.job == NULL);
4832 
4833    /* From the Vulkan 1.0 spec, vkCmdBlitImage valid usage */
4834    assert(dst->vk.samples == VK_SAMPLE_COUNT_1_BIT &&
4835           src->vk.samples == VK_SAMPLE_COUNT_1_BIT);
4836 
4837    /* We don't export VK_FORMAT_FEATURE_BLIT_DST_BIT on compressed formats */
4838    assert(!vk_format_is_compressed(dst->vk.format));
4839 
4840    cmd_buffer->state.is_transfer = true;
4841 
4842    for (uint32_t i = 0; i < pBlitImageInfo->regionCount; i++) {
4843       const VkImageBlit2 *region = &pBlitImageInfo->pRegions[i];
4844 
4845       if (blit_tfu(cmd_buffer, dst, src, region))
4846          continue;
4847       if (blit_shader(cmd_buffer,
4848                       dst, dst->vk.format,
4849                       src, src->vk.format,
4850                       0, NULL,
4851                       region,
4852                       pBlitImageInfo->filter, true)) {
4853          continue;
4854       }
4855       unreachable("Unsupported blit operation");
4856    }
4857 
4858    cmd_buffer->state.is_transfer = false;
4859 }
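/* For reference, a client call that lands in this entrypoint might look like
 * the following sketch (image handles, layouts and barriers are assumed to
 * be set up by the application):
 *
 *    const VkImageBlit2 region = {
 *       .sType = VK_STRUCTURE_TYPE_IMAGE_BLIT_2,
 *       .srcSubresource = { VK_IMAGE_ASPECT_COLOR_BIT, 0, 0, 1 },
 *       .srcOffsets = { { 0, 0, 0 }, { 256, 256, 1 } },
 *       .dstSubresource = { VK_IMAGE_ASPECT_COLOR_BIT, 0, 0, 1 },
 *       .dstOffsets = { { 0, 0, 0 }, { 128, 128, 1 } },
 *    };
 *    const VkBlitImageInfo2 info = {
 *       .sType = VK_STRUCTURE_TYPE_BLIT_IMAGE_INFO_2,
 *       .srcImage = src_image,
 *       .srcImageLayout = VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
 *       .dstImage = dst_image,
 *       .dstImageLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
 *       .regionCount = 1,
 *       .pRegions = &region,
 *       .filter = VK_FILTER_LINEAR,
 *    };
 *    vkCmdBlitImage2(cmd_buf, &info);
 */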
4860 
4861 static bool
4862 resolve_image_tlb(struct v3dv_cmd_buffer *cmd_buffer,
4863                   struct v3dv_image *dst,
4864                   struct v3dv_image *src,
4865                   const VkImageResolve2 *region)
4866 {
4867    /* No resolve for multi-planar images. Using plane 0 */
4868    assert(dst->plane_count == 1);
4869    assert(src->plane_count == 1);
4870 
4871    if (!v3dv_meta_can_use_tlb(src, 0, region->srcSubresource.mipLevel,
4872                               &region->srcOffset, NULL, NULL) ||
4873        !v3dv_meta_can_use_tlb(dst, 0, region->dstSubresource.mipLevel,
4874                               &region->dstOffset, &region->extent, NULL)) {
4875       return false;
4876    }
4877 
4878    if (!v3dv_X(cmd_buffer->device, format_supports_tlb_resolve)(src->format))
4879       return false;
4880 
4881    const VkFormat fb_format = src->vk.format;
4882 
4883    uint32_t num_layers;
4884    if (dst->vk.image_type != VK_IMAGE_TYPE_3D) {
4885       num_layers = vk_image_subresource_layer_count(&dst->vk,
4886                                                     &region->dstSubresource);
4887    } else {
4888       num_layers = region->extent.depth;
4889    }
4890    assert(num_layers > 0);
4891 
4892    struct v3dv_job *job =
4893       v3dv_cmd_buffer_start_job(cmd_buffer, -1, V3DV_JOB_TYPE_GPU_CL);
4894    if (!job)
4895       return true;
4896 
4897    const uint32_t block_w =
4898       vk_format_get_blockwidth(dst->planes[0].vk_format);
4899    const uint32_t block_h =
4900       vk_format_get_blockheight(dst->planes[0].vk_format);
4901    const uint32_t width = DIV_ROUND_UP(region->extent.width, block_w);
4902    const uint32_t height = DIV_ROUND_UP(region->extent.height, block_h);
4903 
4904    uint32_t internal_type, internal_bpp;
4905    v3dv_X(cmd_buffer->device, get_internal_type_bpp_for_image_aspects)
4906       (fb_format, region->srcSubresource.aspectMask,
4907        &internal_type, &internal_bpp);
4908 
4909    v3dv_job_start_frame(job, width, height, num_layers, false, true, 1,
4910                         internal_bpp, 4 * v3d_internal_bpp_words(internal_bpp),
4911                         true);
4912 
4913    struct v3dv_meta_framebuffer framebuffer;
4914    v3dv_X(job->device, meta_framebuffer_init)(&framebuffer, fb_format,
4915                                               internal_type, &job->frame_tiling);
4916 
4917    v3dv_X(job->device, job_emit_binning_flush)(job);
4918    v3dv_X(job->device, meta_emit_resolve_image_rcl)(job, dst, src,
4919                                                     &framebuffer, region);
4920 
4921    v3dv_cmd_buffer_finish_job(cmd_buffer);
4922    return true;
4923 }
4924 
4925 static bool
4926 resolve_image_blit(struct v3dv_cmd_buffer *cmd_buffer,
4927                    struct v3dv_image *dst,
4928                    struct v3dv_image *src,
4929                    const VkImageResolve2 *region)
4930 {
4931    const VkImageBlit2 blit_region = {
4932       .sType = VK_STRUCTURE_TYPE_IMAGE_BLIT_2,
4933       .srcSubresource = region->srcSubresource,
4934       .srcOffsets = {
4935          region->srcOffset,
4936          {
4937             region->srcOffset.x + region->extent.width,
4938             region->srcOffset.y + region->extent.height,
4939          }
4940       },
4941       .dstSubresource = region->dstSubresource,
4942       .dstOffsets = {
4943          region->dstOffset,
4944          {
4945             region->dstOffset.x + region->extent.width,
4946             region->dstOffset.y + region->extent.height,
4947          }
4948       },
4949    };
4950    return blit_shader(cmd_buffer,
4951                       dst, dst->vk.format,
4952                       src, src->vk.format,
4953                       0, NULL,
4954                       &blit_region, VK_FILTER_NEAREST, true);
4955 }
4956 
4957 VKAPI_ATTR void VKAPI_CALL
4958 v3dv_CmdResolveImage2(VkCommandBuffer commandBuffer,
4959                          const VkResolveImageInfo2 *info)
4960 
4961 {
4962    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
4963    V3DV_FROM_HANDLE(v3dv_image, src, info->srcImage);
4964    V3DV_FROM_HANDLE(v3dv_image, dst, info->dstImage);
4965 
4966    /* This command can only happen outside a render pass */
4967    assert(cmd_buffer->state.pass == NULL);
4968    assert(cmd_buffer->state.job == NULL);
4969 
4970    assert(src->vk.samples == VK_SAMPLE_COUNT_4_BIT);
4971    assert(dst->vk.samples == VK_SAMPLE_COUNT_1_BIT);
4972 
4973    /* We don't support multi-sampled multi-plane images */
4974    assert(src->plane_count == 1);
4975    assert(dst->plane_count == 1);
4976 
4977    cmd_buffer->state.is_transfer = true;
4978 
4979    for (uint32_t i = 0; i < info->regionCount; i++) {
4980       if (resolve_image_tlb(cmd_buffer, dst, src, &info->pRegions[i]))
4981          continue;
4982       if (resolve_image_blit(cmd_buffer, dst, src, &info->pRegions[i]))
4983          continue;
4984       unreachable("Unsupported multisample resolve operation");
4985    }
4986 
4987    cmd_buffer->state.is_transfer = false;
4988 }
4989