/*
 * Copyright © 2020 Raspberry Pi Ltd
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "v3dv_private.h"
#include "util/timespec.h"
#include "compiler/nir/nir_builder.h"

static void
kperfmon_create(struct v3dv_device *device,
                struct v3dv_query_pool *pool,
                uint32_t query)
{
   for (uint32_t i = 0; i < pool->perfmon.nperfmons; i++) {
      assert(i * DRM_V3D_MAX_PERF_COUNTERS < pool->perfmon.ncounters);

      struct drm_v3d_perfmon_create req = {
         .ncounters = MIN2(pool->perfmon.ncounters -
                           i * DRM_V3D_MAX_PERF_COUNTERS,
                           DRM_V3D_MAX_PERF_COUNTERS),
      };
      memcpy(req.counters,
             &pool->perfmon.counters[i * DRM_V3D_MAX_PERF_COUNTERS],
             req.ncounters);

      int ret = v3dv_ioctl(device->pdevice->render_fd,
                           DRM_IOCTL_V3D_PERFMON_CREATE,
                           &req);
      if (ret)
         fprintf(stderr, "Failed to create perfmon for query %d: %s\n", query,
                 strerror(errno));

      pool->queries[query].perf.kperfmon_ids[i] = req.id;
   }
}

static void
kperfmon_destroy(struct v3dv_device *device,
                 struct v3dv_query_pool *pool,
                 uint32_t query)
{
   /* Skip destroying if never created */
   if (!pool->queries[query].perf.kperfmon_ids[0])
      return;

   for (uint32_t i = 0; i < pool->perfmon.nperfmons; i++) {
      struct drm_v3d_perfmon_destroy req = {
         .id = pool->queries[query].perf.kperfmon_ids[i]
      };

      int ret = v3dv_ioctl(device->pdevice->render_fd,
                           DRM_IOCTL_V3D_PERFMON_DESTROY,
                           &req);

      if (ret) {
         fprintf(stderr, "Failed to destroy perfmon %u: %s\n",
                 req.id, strerror(errno));
      }
   }
}
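/* Illustrative example for kperfmon_create() above: the kernel caps each
 * perfmon at DRM_V3D_MAX_PERF_COUNTERS counters. If that limit were 32 and
 * a pool enabled 40 counters, pool creation would set nperfmons =
 * DIV_ROUND_UP(40, 32) = 2, and the loop above would create one perfmon for
 * counters[0..31] (ncounters = 32) and a second one for counters[32..39]
 * (ncounters = 8).
 */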
/**
 * Creates a VkBuffer (and VkDeviceMemory) to access a BO.
 */
static VkResult
create_vk_storage_buffer(struct v3dv_device *device,
                         struct v3dv_bo *bo,
                         VkBuffer *vk_buf,
                         VkDeviceMemory *vk_mem)
{
   VkDevice vk_device = v3dv_device_to_handle(device);

   VkBufferCreateInfo buf_info = {
      .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
      .size = bo->size,
      .usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
   };
   VkResult result = v3dv_CreateBuffer(vk_device, &buf_info, NULL, vk_buf);
   if (result != VK_SUCCESS)
      return result;

   /* The VkDeviceMemory object only wraps the BO so we can bind it to the
    * buffer; it does not take ownership of the BO.
    */
   struct v3dv_device_memory *mem =
      vk_object_zalloc(&device->vk, NULL, sizeof(*mem),
                       VK_OBJECT_TYPE_DEVICE_MEMORY);
   if (!mem)
      return VK_ERROR_OUT_OF_HOST_MEMORY;

   mem->bo = bo;
   mem->type = &device->pdevice->memory.memoryTypes[0];

   *vk_mem = v3dv_device_memory_to_handle(mem);
   VkBindBufferMemoryInfo bind_info = {
      .sType = VK_STRUCTURE_TYPE_BIND_BUFFER_MEMORY_INFO,
      .buffer = *vk_buf,
      .memory = *vk_mem,
      .memoryOffset = 0,
   };
   v3dv_BindBufferMemory2(vk_device, 1, &bind_info);

   return VK_SUCCESS;
}

static void
destroy_vk_storage_buffer(struct v3dv_device *device,
                          VkBuffer *vk_buf,
                          VkDeviceMemory *vk_mem)
{
   if (*vk_mem) {
      vk_object_free(&device->vk, NULL,
                     v3dv_device_memory_from_handle(*vk_mem));
      *vk_mem = VK_NULL_HANDLE;
   }

   v3dv_DestroyBuffer(v3dv_device_to_handle(device), *vk_buf, NULL);
   *vk_buf = VK_NULL_HANDLE;
}

/**
 * Allocates descriptor sets to access query pool BO (availability and
 * occlusion query results) from Vulkan pipelines.
 */
static VkResult
create_pool_descriptors(struct v3dv_device *device,
                        struct v3dv_query_pool *pool)
{
   assert(pool->query_type == VK_QUERY_TYPE_OCCLUSION);
   VkDevice vk_device = v3dv_device_to_handle(device);

   VkDescriptorPoolSize pool_size = {
      .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
      .descriptorCount = 1,
   };
   VkDescriptorPoolCreateInfo pool_info = {
      .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO,
      .flags = VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT,
      .maxSets = 1,
      .poolSizeCount = 1,
      .pPoolSizes = &pool_size,
   };
   VkResult result =
      v3dv_CreateDescriptorPool(vk_device, &pool_info, NULL,
                                &pool->meta.descriptor_pool);
   if (result != VK_SUCCESS)
      return result;

   VkDescriptorSetAllocateInfo alloc_info = {
      .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO,
      .descriptorPool = pool->meta.descriptor_pool,
      .descriptorSetCount = 1,
      .pSetLayouts = &device->queries.buf_descriptor_set_layout,
   };
   result = v3dv_AllocateDescriptorSets(vk_device, &alloc_info,
                                        &pool->meta.descriptor_set);
   if (result != VK_SUCCESS)
      return result;

   VkDescriptorBufferInfo desc_buf_info = {
      .buffer = pool->meta.buf,
      .offset = 0,
      .range = VK_WHOLE_SIZE,
   };
   VkWriteDescriptorSet write = {
      .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
      .dstSet = pool->meta.descriptor_set,
      .dstBinding = 0,
      .dstArrayElement = 0,
      .descriptorCount = 1,
      .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
      .pBufferInfo = &desc_buf_info,
   };
   v3dv_UpdateDescriptorSets(vk_device, 1, &write, 0, NULL);

   return VK_SUCCESS;
}

static void
destroy_pool_descriptors(struct v3dv_device *device,
                         struct v3dv_query_pool *pool)
{
   assert(pool->query_type == VK_QUERY_TYPE_OCCLUSION);

   v3dv_FreeDescriptorSets(v3dv_device_to_handle(device),
                           pool->meta.descriptor_pool,
                           1, &pool->meta.descriptor_set);
   pool->meta.descriptor_set = VK_NULL_HANDLE;

   v3dv_DestroyDescriptorPool(v3dv_device_to_handle(device),
                              pool->meta.descriptor_pool, NULL);
   pool->meta.descriptor_pool = VK_NULL_HANDLE;
}
static VkResult
pool_create_meta_resources(struct v3dv_device *device,
                           struct v3dv_query_pool *pool)
{
   VkResult result;

   if (pool->query_type != VK_QUERY_TYPE_OCCLUSION)
      return VK_SUCCESS;

   result = create_vk_storage_buffer(device, pool->occlusion.bo,
                                     &pool->meta.buf, &pool->meta.mem);
   if (result != VK_SUCCESS)
      return result;

   result = create_pool_descriptors(device, pool);
   if (result != VK_SUCCESS)
      return result;

   return VK_SUCCESS;
}

static void
pool_destroy_meta_resources(struct v3dv_device *device,
                            struct v3dv_query_pool *pool)
{
   if (pool->query_type != VK_QUERY_TYPE_OCCLUSION)
      return;

   destroy_pool_descriptors(device, pool);
   destroy_vk_storage_buffer(device, &pool->meta.buf, &pool->meta.mem);
}

VKAPI_ATTR VkResult VKAPI_CALL
v3dv_CreateQueryPool(VkDevice _device,
                     const VkQueryPoolCreateInfo *pCreateInfo,
                     const VkAllocationCallbacks *pAllocator,
                     VkQueryPool *pQueryPool)
{
   V3DV_FROM_HANDLE(v3dv_device, device, _device);

   assert(pCreateInfo->queryType == VK_QUERY_TYPE_OCCLUSION ||
          pCreateInfo->queryType == VK_QUERY_TYPE_TIMESTAMP ||
          pCreateInfo->queryType == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR);
   assert(pCreateInfo->queryCount > 0);

   struct v3dv_query_pool *pool =
      vk_object_zalloc(&device->vk, pAllocator, sizeof(*pool),
                       VK_OBJECT_TYPE_QUERY_POOL);
   if (pool == NULL)
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

   pool->query_type = pCreateInfo->queryType;
   pool->query_count = pCreateInfo->queryCount;

   uint32_t query_idx = 0;
   VkResult result;

   const uint32_t pool_bytes = sizeof(struct v3dv_query) * pool->query_count;
   pool->queries = vk_alloc2(&device->vk.alloc, pAllocator, pool_bytes, 8,
                             VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (pool->queries == NULL) {
      result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
      goto fail;
   }

   switch (pool->query_type) {
   case VK_QUERY_TYPE_OCCLUSION: {
      /* The hardware allows us to set up groups of 16 queries in consecutive
       * 4-byte addresses, requiring only that each group of 16 queries is
       * aligned to a 1024 byte boundary.
       */
      const uint32_t query_groups = DIV_ROUND_UP(pool->query_count, 16);
      uint32_t bo_size = query_groups * 1024;

      /* After the counters we store availability data, 1 byte/query */
      pool->occlusion.avail_offset = bo_size;
      bo_size += pool->query_count;

      pool->occlusion.bo = v3dv_bo_alloc(device, bo_size, "query:o", true);
      if (!pool->occlusion.bo) {
         result = vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY);
         goto fail;
      }
      if (!v3dv_bo_map(device, pool->occlusion.bo, bo_size)) {
         result = vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY);
         goto fail;
      }
      break;
   }
   case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
      const VkQueryPoolPerformanceCreateInfoKHR *pq_info =
         vk_find_struct_const(pCreateInfo->pNext,
                              QUERY_POOL_PERFORMANCE_CREATE_INFO_KHR);

      assert(pq_info);

      pool->perfmon.ncounters = pq_info->counterIndexCount;
      for (uint32_t i = 0; i < pq_info->counterIndexCount; i++)
         pool->perfmon.counters[i] = pq_info->pCounterIndices[i];

      pool->perfmon.nperfmons = DIV_ROUND_UP(pool->perfmon.ncounters,
                                             DRM_V3D_MAX_PERF_COUNTERS);

      assert(pool->perfmon.nperfmons <= V3DV_MAX_PERFMONS);
      break;
   }
   case VK_QUERY_TYPE_TIMESTAMP: {
      /* 8 bytes per query used for the timestamp value. We have all
       * timestamps tightly packed first in the buffer.
       */
      const uint32_t bo_size = pool->query_count * 8;
      pool->timestamp.bo = v3dv_bo_alloc(device, bo_size, "query:t", true);
      if (!pool->timestamp.bo) {
         result = vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY);
         goto fail;
      }
      if (!v3dv_bo_map(device, pool->timestamp.bo, bo_size)) {
         result = vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY);
         goto fail;
      }
      break;
   }
   default:
      unreachable("Unsupported query type");
   }
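   /* Worked example of the occlusion layout above (illustrative numbers):
    * query index 21 falls in group 21 / 16 = 1, slot 21 % 16 = 5, so its
    * 4-byte counter lives at byte offset 1 * 1024 + 5 * 4 = 1044 in the BO
    * and its availability byte at avail_offset + 21. The initialization
    * loop below stores exactly these per-query counter offsets.
    */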
   /* Initialize queries in the pool */
   for (; query_idx < pool->query_count; query_idx++) {
      pool->queries[query_idx].maybe_available = false;
      switch (pool->query_type) {
      case VK_QUERY_TYPE_OCCLUSION: {
         const uint32_t query_group = query_idx / 16;
         const uint32_t query_offset =
            query_group * 1024 + (query_idx % 16) * 4;
         pool->queries[query_idx].occlusion.offset = query_offset;
         break;
      }
      case VK_QUERY_TYPE_TIMESTAMP:
         pool->queries[query_idx].timestamp.offset = query_idx * 8;
         result = vk_sync_create(&device->vk,
                                 &device->pdevice->drm_syncobj_type, 0, 0,
                                 &pool->queries[query_idx].timestamp.sync);
         if (result != VK_SUCCESS)
            goto fail;
         break;
      case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
         result = vk_sync_create(&device->vk,
                                 &device->pdevice->drm_syncobj_type, 0, 0,
                                 &pool->queries[query_idx].perf.last_job_sync);
         if (result != VK_SUCCESS)
            goto fail;
         kperfmon_create(device, pool, query_idx);
         break;
      }
      default:
         unreachable("Unsupported query type");
      }
   }

   /* Create meta resources */
   result = pool_create_meta_resources(device, pool);
   if (result != VK_SUCCESS)
      goto fail;

   *pQueryPool = v3dv_query_pool_to_handle(pool);

   return VK_SUCCESS;

fail:
   if (pool->query_type == VK_QUERY_TYPE_TIMESTAMP) {
      for (uint32_t j = 0; j < query_idx; j++)
         vk_sync_destroy(&device->vk, pool->queries[j].timestamp.sync);
   }

   if (pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
      for (uint32_t j = 0; j < query_idx; j++)
         vk_sync_destroy(&device->vk, pool->queries[j].perf.last_job_sync);
   }

   if (pool->occlusion.bo)
      v3dv_bo_free(device, pool->occlusion.bo);
   if (pool->timestamp.bo)
      v3dv_bo_free(device, pool->timestamp.bo);
   if (pool->queries)
      vk_free2(&device->vk.alloc, pAllocator, pool->queries);
   pool_destroy_meta_resources(device, pool);
   vk_object_free(&device->vk, pAllocator, pool);

   return result;
}

VKAPI_ATTR void VKAPI_CALL
v3dv_DestroyQueryPool(VkDevice _device,
                      VkQueryPool queryPool,
                      const VkAllocationCallbacks *pAllocator)
{
   V3DV_FROM_HANDLE(v3dv_device, device, _device);
   V3DV_FROM_HANDLE(v3dv_query_pool, pool, queryPool);

   if (!pool)
      return;

   if (pool->occlusion.bo)
      v3dv_bo_free(device, pool->occlusion.bo);

   if (pool->timestamp.bo)
      v3dv_bo_free(device, pool->timestamp.bo);

   if (pool->query_type == VK_QUERY_TYPE_TIMESTAMP) {
      for (uint32_t i = 0; i < pool->query_count; i++)
         vk_sync_destroy(&device->vk, pool->queries[i].timestamp.sync);
   }

   if (pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
      for (uint32_t i = 0; i < pool->query_count; i++) {
         kperfmon_destroy(device, pool, i);
         vk_sync_destroy(&device->vk, pool->queries[i].perf.last_job_sync);
      }
   }

   if (pool->queries)
      vk_free2(&device->vk.alloc, pAllocator, pool->queries);

   pool_destroy_meta_resources(device, pool);

   vk_object_free(&device->vk, pAllocator, pool);
}

static void
write_to_buffer(void *dst, uint32_t idx, bool do_64bit, uint64_t value)
{
   if (do_64bit) {
      uint64_t *dst64 = (uint64_t *) dst;
      dst64[idx] = value;
   } else {
      uint32_t *dst32 = (uint32_t *) dst;
      dst32[idx] = (uint32_t) value;
   }
}
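/* Example for write_to_buffer() above: "idx" is a slot index in units of
 * the element size, not a byte offset, so write_to_buffer(data, 2, false, v)
 * stores a 32-bit value at byte offset 8, while the same call with
 * do_64bit = true stores a 64-bit value at byte offset 16. The result
 * writers below rely on this to lay out consecutive result and availability
 * slots per query.
 */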
static VkResult
query_wait_available(struct v3dv_device *device,
                     struct v3dv_query_pool *pool,
                     struct v3dv_query *q,
                     uint32_t query_idx)
{
   /* For occlusion queries we prefer polling the availability BO in a loop
    * over waiting on the query results BO, because the latter would make us
    * wait for any job running queries from the pool, even if those queries
    * do not involve the one we want to wait on.
    */
   if (pool->query_type == VK_QUERY_TYPE_OCCLUSION) {
      uint8_t *q_addr = ((uint8_t *) pool->occlusion.bo->map) +
                        pool->occlusion.avail_offset + query_idx;
      while (*q_addr == 0)
         usleep(250);
      return VK_SUCCESS;
   }

   if (pool->query_type == VK_QUERY_TYPE_TIMESTAMP) {
      if (vk_sync_wait(&device->vk, q->timestamp.sync, 0,
                       VK_SYNC_WAIT_COMPLETE, UINT64_MAX) != VK_SUCCESS) {
         return vk_device_set_lost(&device->vk, "Query job wait failed");
      }
      return VK_SUCCESS;
   }

   assert(pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR);

   /* For performance queries we need to wait for the queue to signal that
    * the query has been submitted for execution before anything else.
    */
   VkResult result = VK_SUCCESS;
   if (!q->maybe_available) {
      struct timespec timeout;
      timespec_get(&timeout, TIME_UTC);
      timespec_add_msec(&timeout, &timeout, 2000);

      mtx_lock(&device->query_mutex);
      while (!q->maybe_available) {
         if (vk_device_is_lost(&device->vk)) {
            result = VK_ERROR_DEVICE_LOST;
            break;
         }

         int ret = cnd_timedwait(&device->query_ended,
                                 &device->query_mutex,
                                 &timeout);
         if (ret != thrd_success) {
            result = vk_device_set_lost(&device->vk, "Query wait failed");
            break;
         }
      }
      mtx_unlock(&device->query_mutex);

      if (result != VK_SUCCESS)
         return result;
   }

   /* For performance queries, we also need to wait for the relevant syncobj
    * to be signaled to ensure completion of the GPU work, even if the query
    * was already flagged as submitted when we got here.
    */
   if (vk_sync_wait(&device->vk, q->perf.last_job_sync, 0,
                    VK_SYNC_WAIT_COMPLETE, UINT64_MAX) != VK_SUCCESS) {
      return vk_device_set_lost(&device->vk, "Query job wait failed");
   }

   return result;
}
static VkResult
query_check_available(struct v3dv_device *device,
                      struct v3dv_query_pool *pool,
                      struct v3dv_query *q,
                      uint32_t query_idx)
{
   /* For occlusion we check the availability BO */
   if (pool->query_type == VK_QUERY_TYPE_OCCLUSION) {
      const uint8_t *q_addr = ((uint8_t *) pool->occlusion.bo->map) +
                              pool->occlusion.avail_offset + query_idx;
      return (*q_addr != 0) ? VK_SUCCESS : VK_NOT_READY;
   }

   /* For timestamp queries, we need to check if the relevant job
    * has completed.
    */
   if (pool->query_type == VK_QUERY_TYPE_TIMESTAMP) {
      if (vk_sync_wait(&device->vk, q->timestamp.sync, 0,
                       VK_SYNC_WAIT_COMPLETE, 0) != VK_SUCCESS) {
         return VK_NOT_READY;
      }
      return VK_SUCCESS;
   }

   /* For performance queries we need to check if the queue has submitted
    * the query for execution at all.
    */
   assert(pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR);
   if (!q->maybe_available)
      return VK_NOT_READY;

   /* We also need to check if the relevant GPU job has completed. The zero
    * timeout makes this a non-blocking poll of the syncobj.
    */
   if (vk_sync_wait(&device->vk, q->perf.last_job_sync, 0,
                    VK_SYNC_WAIT_COMPLETE, 0) != VK_SUCCESS) {
      return VK_NOT_READY;
   }

   return VK_SUCCESS;
}

static VkResult
query_is_available(struct v3dv_device *device,
                   struct v3dv_query_pool *pool,
                   uint32_t query,
                   bool do_wait,
                   bool *available)
{
   struct v3dv_query *q = &pool->queries[query];

   if (do_wait) {
      VkResult result = query_wait_available(device, pool, q, query);
      if (result != VK_SUCCESS) {
         *available = false;
         return result;
      }

      *available = true;
   } else {
      VkResult result = query_check_available(device, pool, q, query);
      assert(result == VK_SUCCESS || result == VK_NOT_READY);
      *available = (result == VK_SUCCESS);
   }

   return VK_SUCCESS;
}

static VkResult
write_occlusion_query_result(struct v3dv_device *device,
                             struct v3dv_query_pool *pool,
                             uint32_t query,
                             bool do_64bit,
                             void *data,
                             uint32_t slot)
{
   assert(pool && pool->query_type == VK_QUERY_TYPE_OCCLUSION);

   if (vk_device_is_lost(&device->vk))
      return VK_ERROR_DEVICE_LOST;

   struct v3dv_query *q = &pool->queries[query];
   assert(pool->occlusion.bo && pool->occlusion.bo->map);

   const uint8_t *query_addr =
      ((uint8_t *) pool->occlusion.bo->map) + q->occlusion.offset;
   write_to_buffer(data, slot, do_64bit, (uint64_t) *((uint32_t *)query_addr));
   return VK_SUCCESS;
}

static VkResult
write_timestamp_query_result(struct v3dv_device *device,
                             struct v3dv_query_pool *pool,
                             uint32_t query,
                             bool do_64bit,
                             void *data,
                             uint32_t slot)
{
   assert(pool && pool->query_type == VK_QUERY_TYPE_TIMESTAMP);

   struct v3dv_query *q = &pool->queries[query];

   const uint8_t *query_addr =
      ((uint8_t *) pool->timestamp.bo->map) + q->timestamp.offset;

   write_to_buffer(data, slot, do_64bit, *((uint64_t *)query_addr));
   return VK_SUCCESS;
}

static VkResult
write_performance_query_result(struct v3dv_device *device,
                               struct v3dv_query_pool *pool,
                               uint32_t query,
                               bool do_64bit,
                               void *data,
                               uint32_t slot)
{
   assert(pool && pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR);

   struct v3dv_query *q = &pool->queries[query];
   uint64_t counter_values[V3D_MAX_PERFCNT];

   for (uint32_t i = 0; i < pool->perfmon.nperfmons; i++) {
      struct drm_v3d_perfmon_get_values req = {
         .id = q->perf.kperfmon_ids[i],
         .values_ptr = (uintptr_t)(&counter_values[i *
                                   DRM_V3D_MAX_PERF_COUNTERS])
      };

      int ret = v3dv_ioctl(device->pdevice->render_fd,
                           DRM_IOCTL_V3D_PERFMON_GET_VALUES,
                           &req);

      if (ret) {
         fprintf(stderr, "failed to get perfmon values: %s\n",
                 strerror(errno));
         return vk_error(device, VK_ERROR_DEVICE_LOST);
      }
   }

   for (uint32_t i = 0; i < pool->perfmon.ncounters; i++)
      write_to_buffer(data, slot + i, do_64bit, counter_values[i]);

   return VK_SUCCESS;
}

static VkResult
write_query_result(struct v3dv_device *device,
                   struct v3dv_query_pool *pool,
                   uint32_t query,
                   bool do_64bit,
                   void *data,
                   uint32_t slot)
{
   switch (pool->query_type) {
   case VK_QUERY_TYPE_OCCLUSION:
      return write_occlusion_query_result(device, pool, query, do_64bit,
                                          data, slot);
   case VK_QUERY_TYPE_TIMESTAMP:
      return write_timestamp_query_result(device, pool, query, do_64bit,
                                          data, slot);
   case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
      return write_performance_query_result(device, pool, query, do_64bit,
                                            data, slot);
   default:
      unreachable("Unsupported query type");
   }
}

static uint32_t
get_query_result_count(struct v3dv_query_pool *pool)
{
   switch (pool->query_type) {
   case VK_QUERY_TYPE_OCCLUSION:
   case VK_QUERY_TYPE_TIMESTAMP:
      return 1;
   case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
      return pool->perfmon.ncounters;
   default:
      unreachable("Unsupported query type");
   }
}
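/* Worked example for the CPU results path below (illustrative): reading an
 * occlusion pool with VK_QUERY_RESULT_64_BIT |
 * VK_QUERY_RESULT_WITH_AVAILABILITY_BIT writes, per query, the counter in
 * slot 0 and the availability flag in slot 1 as 64-bit values (16 bytes)
 * before advancing the output pointer by the caller-provided stride. For a
 * performance query pool, get_query_result_count() returns ncounters, so
 * the availability flag lands in the slot right after the last counter.
 */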
VkResult
v3dv_get_query_pool_results_cpu(struct v3dv_device *device,
                                struct v3dv_query_pool *pool,
                                uint32_t first,
                                uint32_t count,
                                void *data,
                                VkDeviceSize stride,
                                VkQueryResultFlags flags)
{
   assert(first < pool->query_count);
   assert(first + count <= pool->query_count);
   assert(data);

   const bool do_64bit = flags & VK_QUERY_RESULT_64_BIT ||
      pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR;
   const bool do_wait = flags & VK_QUERY_RESULT_WAIT_BIT;
   const bool do_partial = flags & VK_QUERY_RESULT_PARTIAL_BIT;

   uint32_t result_count = get_query_result_count(pool);

   VkResult result = VK_SUCCESS;
   for (uint32_t i = first; i < first + count; i++) {
      bool available = false;
      VkResult query_result =
         query_is_available(device, pool, i, do_wait, &available);
      if (query_result == VK_ERROR_DEVICE_LOST)
         result = VK_ERROR_DEVICE_LOST;

      /**
       * From the Vulkan 1.0 spec:
       *
       *    "If VK_QUERY_RESULT_WAIT_BIT and VK_QUERY_RESULT_PARTIAL_BIT are
       *     both not set then no result values are written to pData for
       *     queries that are in the unavailable state at the time of the
       *     call, and vkGetQueryPoolResults returns VK_NOT_READY. However,
       *     availability state is still written to pData for those queries
       *     if VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set."
       */
      uint32_t slot = 0;

      const bool write_result = available || do_partial;
      if (write_result)
         write_query_result(device, pool, i, do_64bit, data, slot);
      slot += result_count;

      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)
         write_to_buffer(data, slot++, do_64bit, available ? 1u : 0u);

      if (!write_result && result != VK_ERROR_DEVICE_LOST)
         result = VK_NOT_READY;

      data += stride;
   }

   return result;
}

VKAPI_ATTR VkResult VKAPI_CALL
v3dv_GetQueryPoolResults(VkDevice _device,
                         VkQueryPool queryPool,
                         uint32_t firstQuery,
                         uint32_t queryCount,
                         size_t dataSize,
                         void *pData,
                         VkDeviceSize stride,
                         VkQueryResultFlags flags)
{
   V3DV_FROM_HANDLE(v3dv_device, device, _device);
   V3DV_FROM_HANDLE(v3dv_query_pool, pool, queryPool);

   if (vk_device_is_lost(&device->vk))
      return VK_ERROR_DEVICE_LOST;

   return v3dv_get_query_pool_results_cpu(device, pool, firstQuery,
                                          queryCount, pData, stride, flags);
}

/* Emits a series of vkCmdDispatchBase calls to execute all the workgroups
 * required to handle a number of queries considering per-dispatch limits.
 */
static void
cmd_buffer_emit_dispatch_queries(struct v3dv_cmd_buffer *cmd_buffer,
                                 uint32_t query_count)
{
   VkCommandBuffer vk_cmd_buffer = v3dv_cmd_buffer_to_handle(cmd_buffer);

   uint32_t dispatched = 0;
   const uint32_t max_batch_size = 65535;
   while (dispatched < query_count) {
      uint32_t batch_size = MIN2(query_count - dispatched, max_batch_size);
      v3dv_CmdDispatchBase(vk_cmd_buffer, dispatched, 0, 0, batch_size, 1, 1);
      dispatched += batch_size;
   }
}
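/* Example for cmd_buffer_emit_dispatch_queries() above (illustrative): a
 * copy or reset covering 100000 queries exceeds the 65535 workgroup limit
 * of a single dispatch, so it is emitted as
 * vkCmdDispatchBase(baseGroupX = 0, groupCountX = 65535) followed by
 * vkCmdDispatchBase(baseGroupX = 65535, groupCountX = 34465). Since the
 * workgroup id seen by the shader includes the dispatch base, each query
 * in the range is processed exactly once.
 */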
void
v3dv_cmd_buffer_emit_set_query_availability(struct v3dv_cmd_buffer *cmd_buffer,
                                            struct v3dv_query_pool *pool,
                                            uint32_t query, uint32_t count,
                                            uint8_t availability)
{
   assert(pool->query_type == VK_QUERY_TYPE_OCCLUSION ||
          pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR);

   struct v3dv_device *device = cmd_buffer->device;
   VkCommandBuffer vk_cmd_buffer = v3dv_cmd_buffer_to_handle(cmd_buffer);

   /* We are about to emit a compute job to set query availability and we need
    * to ensure this executes after the graphics work using the queries has
    * completed.
    */
   VkMemoryBarrier2 barrier = {
      .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER_2,
      .srcStageMask = VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT,
      .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
   };
   VkDependencyInfo barrier_info = {
      .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
      .memoryBarrierCount = 1,
      .pMemoryBarriers = &barrier,
   };
   v3dv_cmd_buffer_emit_pipeline_barrier(cmd_buffer, &barrier_info);

   /* Dispatch queries */
   v3dv_cmd_buffer_meta_state_push(cmd_buffer, true);

   v3dv_CmdBindPipeline(vk_cmd_buffer,
                        VK_PIPELINE_BIND_POINT_COMPUTE,
                        device->queries.avail_pipeline);

   v3dv_CmdBindDescriptorSets(vk_cmd_buffer,
                              VK_PIPELINE_BIND_POINT_COMPUTE,
                              device->queries.avail_pipeline_layout,
                              0, 1, &pool->meta.descriptor_set,
                              0, NULL);

   struct {
      uint32_t offset;
      uint32_t query;
      uint8_t availability;
   } push_data = { pool->occlusion.avail_offset, query, availability };
   v3dv_CmdPushConstants(vk_cmd_buffer,
                         device->queries.avail_pipeline_layout,
                         VK_SHADER_STAGE_COMPUTE_BIT,
                         0, sizeof(push_data), &push_data);

   cmd_buffer_emit_dispatch_queries(cmd_buffer, count);

   v3dv_cmd_buffer_meta_state_pop(cmd_buffer, false);
}
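/* The push_data struct above matches the availability pipeline layout
 * created in create_query_pipelines(): a 4-byte availability-data offset at
 * push-constant offset 0, a 4-byte base query index at offset 4 and a
 * 1-byte availability value at offset 8, which is exactly where
 * get_set_query_availability_cs() loads them from (.base = 0, 4 and 8).
 */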
static void
cmd_buffer_emit_reset_occlusion_query_pool(struct v3dv_cmd_buffer *cmd_buffer,
                                           struct v3dv_query_pool *pool,
                                           uint32_t query, uint32_t count)
{
   struct v3dv_device *device = cmd_buffer->device;
   VkCommandBuffer vk_cmd_buffer = v3dv_cmd_buffer_to_handle(cmd_buffer);

   /* Ensure the GPU is done with the queries in the graphics queue before
    * we reset in the compute queue.
    */
   VkMemoryBarrier2 barrier = {
      .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER_2,
      .srcStageMask = VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT,
      .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
   };
   VkDependencyInfo barrier_info = {
      .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
      .memoryBarrierCount = 1,
      .pMemoryBarriers = &barrier,
   };
   v3dv_cmd_buffer_emit_pipeline_barrier(cmd_buffer, &barrier_info);

   /* Emit compute reset */
   v3dv_cmd_buffer_meta_state_push(cmd_buffer, true);

   v3dv_CmdBindPipeline(vk_cmd_buffer,
                        VK_PIPELINE_BIND_POINT_COMPUTE,
                        device->queries.reset_occlusion_pipeline);

   v3dv_CmdBindDescriptorSets(vk_cmd_buffer,
                              VK_PIPELINE_BIND_POINT_COMPUTE,
                              device->queries.reset_occlusion_pipeline_layout,
                              0, 1, &pool->meta.descriptor_set,
                              0, NULL);

   struct {
      uint32_t offset;
      uint32_t query;
   } push_data = { pool->occlusion.avail_offset, query };
   v3dv_CmdPushConstants(vk_cmd_buffer,
                         device->queries.reset_occlusion_pipeline_layout,
                         VK_SHADER_STAGE_COMPUTE_BIT,
                         0, sizeof(push_data), &push_data);

   cmd_buffer_emit_dispatch_queries(cmd_buffer, count);

   v3dv_cmd_buffer_meta_state_pop(cmd_buffer, false);

   /* Ensure future work in the graphics queue using the queries doesn't
    * start before the reset has completed.
    */
   barrier = (VkMemoryBarrier2) {
      .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER_2,
      .srcStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
      .dstStageMask = VK_PIPELINE_STAGE_2_EARLY_FRAGMENT_TESTS_BIT,
   };
   barrier_info = (VkDependencyInfo) {
      .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
      .memoryBarrierCount = 1,
      .pMemoryBarriers = &barrier,
   };
   v3dv_cmd_buffer_emit_pipeline_barrier(cmd_buffer, &barrier_info);
}

static void
cmd_buffer_emit_reset_query_pool(struct v3dv_cmd_buffer *cmd_buffer,
                                 struct v3dv_query_pool *pool,
                                 uint32_t first, uint32_t count)
{
   assert(pool->query_type == VK_QUERY_TYPE_OCCLUSION);
   cmd_buffer_emit_reset_occlusion_query_pool(cmd_buffer, pool, first, count);
}

static void
cmd_buffer_emit_reset_query_pool_cpu(struct v3dv_cmd_buffer *cmd_buffer,
                                     struct v3dv_query_pool *pool,
                                     uint32_t first, uint32_t count)
{
   assert(pool->query_type != VK_QUERY_TYPE_OCCLUSION);

   struct v3dv_job *job =
      v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device,
                                     V3DV_JOB_TYPE_CPU_RESET_QUERIES,
                                     cmd_buffer, -1);
   v3dv_return_if_oom(cmd_buffer, NULL);

   job->cpu.query_reset.pool = pool;
   job->cpu.query_reset.first = first;
   job->cpu.query_reset.count = count;

   list_addtail(&job->list_link, &cmd_buffer->jobs);
}

VKAPI_ATTR void VKAPI_CALL
v3dv_CmdResetQueryPool(VkCommandBuffer commandBuffer,
                       VkQueryPool queryPool,
                       uint32_t firstQuery,
                       uint32_t queryCount)
{
   V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
   V3DV_FROM_HANDLE(v3dv_query_pool, pool, queryPool);

   /* Resets can only happen outside a render pass instance so we should not
    * be in the middle of job recording.
    */
   assert(cmd_buffer->state.pass == NULL);
   assert(cmd_buffer->state.job == NULL);

   assert(firstQuery < pool->query_count);
   assert(firstQuery + queryCount <= pool->query_count);

   /* We can reset occlusion queries in the GPU, but for other query types
    * we emit a CPU job that will call v3dv_reset_query_pool_cpu when executed
    * in the queue.
    */
   if (pool->query_type == VK_QUERY_TYPE_OCCLUSION) {
      cmd_buffer_emit_reset_query_pool(cmd_buffer, pool, firstQuery,
                                       queryCount);
   } else {
      cmd_buffer_emit_reset_query_pool_cpu(cmd_buffer, pool, firstQuery,
                                           queryCount);
   }
}
/**
 * Creates a descriptor pool so we can create descriptors for the destination
 * buffers of vkCmdCopyQueryPoolResults for queries where this is implemented
 * on the GPU.
 */
static VkResult
create_storage_buffer_descriptor_pool(struct v3dv_cmd_buffer *cmd_buffer)
{
   /* If this is not the first pool we create for this command buffer,
    * size it based on the size of the currently exhausted pool.
    */
   uint32_t descriptor_count = 32;
   if (cmd_buffer->meta.query.dspool != VK_NULL_HANDLE) {
      struct v3dv_descriptor_pool *exhausted_pool =
         v3dv_descriptor_pool_from_handle(cmd_buffer->meta.query.dspool);
      descriptor_count = MIN2(exhausted_pool->max_entry_count * 2, 1024);
   }

   /* Create the descriptor pool. The descriptor type must match the storage
    * buffer binding of the set layout we allocate from it.
    */
   cmd_buffer->meta.query.dspool = VK_NULL_HANDLE;
   VkDescriptorPoolSize pool_size = {
      .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
      .descriptorCount = descriptor_count,
   };
   VkDescriptorPoolCreateInfo info = {
      .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO,
      .maxSets = descriptor_count,
      .poolSizeCount = 1,
      .pPoolSizes = &pool_size,
      .flags = 0,
   };
   VkResult result =
      v3dv_CreateDescriptorPool(v3dv_device_to_handle(cmd_buffer->device),
                                &info,
                                &cmd_buffer->device->vk.alloc,
                                &cmd_buffer->meta.query.dspool);

   if (result == VK_SUCCESS) {
      assert(cmd_buffer->meta.query.dspool != VK_NULL_HANDLE);
      const VkDescriptorPool vk_pool = cmd_buffer->meta.query.dspool;

      v3dv_cmd_buffer_add_private_obj(
         cmd_buffer, (uintptr_t) vk_pool,
         (v3dv_cmd_buffer_private_obj_destroy_cb)v3dv_DestroyDescriptorPool);

      struct v3dv_descriptor_pool *pool =
         v3dv_descriptor_pool_from_handle(vk_pool);
      pool->is_driver_internal = true;
   }

   return result;
}

static VkResult
allocate_storage_buffer_descriptor_set(struct v3dv_cmd_buffer *cmd_buffer,
                                       VkDescriptorSet *set)
{
   /* Make sure we have a descriptor pool */
   VkResult result;
   if (cmd_buffer->meta.query.dspool == VK_NULL_HANDLE) {
      result = create_storage_buffer_descriptor_pool(cmd_buffer);
      if (result != VK_SUCCESS)
         return result;
   }
   assert(cmd_buffer->meta.query.dspool != VK_NULL_HANDLE);

   /* Allocate descriptor set */
   struct v3dv_device *device = cmd_buffer->device;
   VkDevice vk_device = v3dv_device_to_handle(device);
   VkDescriptorSetAllocateInfo info = {
      .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO,
      .descriptorPool = cmd_buffer->meta.query.dspool,
      .descriptorSetCount = 1,
      .pSetLayouts = &device->queries.buf_descriptor_set_layout,
   };
   result = v3dv_AllocateDescriptorSets(vk_device, &info, set);

   /* If we ran out of pool space, grow the pool and try again */
   if (result == VK_ERROR_OUT_OF_POOL_MEMORY) {
      result = create_storage_buffer_descriptor_pool(cmd_buffer);
      if (result == VK_SUCCESS) {
         info.descriptorPool = cmd_buffer->meta.query.dspool;
         result = v3dv_AllocateDescriptorSets(vk_device, &info, set);
      }
   }

   return result;
}
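/* Example of the pool growth policy above: the first pool for a command
 * buffer holds 32 descriptors; each time a pool is exhausted, its
 * replacement doubles the size (32 -> 64 -> 128 -> ...), capped at 1024
 * entries by the MIN2() in create_storage_buffer_descriptor_pool().
 */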
static uint32_t
copy_pipeline_index_from_flags(VkQueryResultFlags flags)
{
   uint32_t index = 0;
   if (flags & VK_QUERY_RESULT_64_BIT)
      index |= 1;
   if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)
      index |= 2;
   if (flags & VK_QUERY_RESULT_PARTIAL_BIT)
      index |= 4;
   assert(index < 8);
   return index;
}
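/* Worked example for copy_pipeline_index_from_flags() above:
 * VK_QUERY_RESULT_64_BIT maps to bit 0, WITH_AVAILABILITY_BIT to bit 1 and
 * PARTIAL_BIT to bit 2, so flags = 64_BIT | WITH_AVAILABILITY yields index
 * 0b011 = 3. The 3 bits select among the up to 8 specialized copy pipelines
 * stored in device->queries.copy_pipeline[].
 */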
static nir_shader *
get_copy_query_results_cs(const nir_shader_compiler_options *compiler_options,
                          VkQueryResultFlags flags);

static void
cmd_buffer_emit_copy_query_pool_results(struct v3dv_cmd_buffer *cmd_buffer,
                                        struct v3dv_query_pool *pool,
                                        uint32_t first, uint32_t count,
                                        struct v3dv_buffer *buf,
                                        uint32_t offset, uint32_t stride,
                                        VkQueryResultFlags flags)
{
   struct v3dv_device *device = cmd_buffer->device;
   VkDevice vk_device = v3dv_device_to_handle(device);
   VkCommandBuffer vk_cmd_buffer = v3dv_cmd_buffer_to_handle(cmd_buffer);

   /* Create the required copy pipeline if not yet created */
   uint32_t pipeline_idx = copy_pipeline_index_from_flags(flags);
   if (!device->queries.copy_pipeline[pipeline_idx]) {
      const nir_shader_compiler_options *compiler_options =
         v3dv_pipeline_get_nir_options(&device->devinfo);
      nir_shader *copy_query_results_cs_nir =
         get_copy_query_results_cs(compiler_options, flags);
      VkResult result = v3dv_create_compute_pipeline_from_nir(
                           device, copy_query_results_cs_nir,
                           device->queries.copy_pipeline_layout,
                           &device->queries.copy_pipeline[pipeline_idx]);
      ralloc_free(copy_query_results_cs_nir);
      if (result != VK_SUCCESS) {
         fprintf(stderr, "Failed to create copy query results pipeline\n");
         return;
      }
   }

   /* FIXME: do we need this barrier? Since vkCmdEndQuery should've been
    * called and that already waits, maybe we don't (since this is serialized
    * in the compute queue with EndQuery anyway).
    */
   if (flags & VK_QUERY_RESULT_WAIT_BIT) {
      VkMemoryBarrier2 barrier = {
         .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER_2,
         .srcStageMask = VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT,
         .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
      };
      VkDependencyInfo barrier_info = {
         .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
         .memoryBarrierCount = 1,
         .pMemoryBarriers = &barrier,
      };
      v3dv_cmd_buffer_emit_pipeline_barrier(cmd_buffer, &barrier_info);
   }

   /* Allocate and setup descriptor set for output buffer */
   VkDescriptorSet out_buf_descriptor_set;
   VkResult result =
      allocate_storage_buffer_descriptor_set(cmd_buffer,
                                             &out_buf_descriptor_set);
   if (result != VK_SUCCESS) {
      fprintf(stderr, "vkCmdCopyQueryPoolResults failed: "
              "could not allocate descriptor.\n");
      return;
   }

   VkDescriptorBufferInfo desc_buf_info = {
      .buffer = v3dv_buffer_to_handle(buf),
      .offset = 0,
      .range = VK_WHOLE_SIZE,
   };
   VkWriteDescriptorSet write = {
      .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
      .dstSet = out_buf_descriptor_set,
      .dstBinding = 0,
      .dstArrayElement = 0,
      .descriptorCount = 1,
      .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
      .pBufferInfo = &desc_buf_info,
   };
   v3dv_UpdateDescriptorSets(vk_device, 1, &write, 0, NULL);

   /* Dispatch copy */
   v3dv_cmd_buffer_meta_state_push(cmd_buffer, true);

   assert(device->queries.copy_pipeline[pipeline_idx]);
   v3dv_CmdBindPipeline(vk_cmd_buffer,
                        VK_PIPELINE_BIND_POINT_COMPUTE,
                        device->queries.copy_pipeline[pipeline_idx]);

   VkDescriptorSet sets[2] = {
      pool->meta.descriptor_set,
      out_buf_descriptor_set,
   };
   v3dv_CmdBindDescriptorSets(vk_cmd_buffer,
                              VK_PIPELINE_BIND_POINT_COMPUTE,
                              device->queries.copy_pipeline_layout,
                              0, 2, sets, 0, NULL);

   struct {
      uint32_t avail_offset, first, offset, stride, flags;
   } push_data = { pool->occlusion.avail_offset, first, offset, stride, flags };
   v3dv_CmdPushConstants(vk_cmd_buffer,
                         device->queries.copy_pipeline_layout,
                         VK_SHADER_STAGE_COMPUTE_BIT,
                         0, sizeof(push_data), &push_data);

   cmd_buffer_emit_dispatch_queries(cmd_buffer, count);

   v3dv_cmd_buffer_meta_state_pop(cmd_buffer, false);
}

static void
cmd_buffer_emit_copy_query_pool_results_cpu(struct v3dv_cmd_buffer *cmd_buffer,
                                            struct v3dv_query_pool *pool,
                                            uint32_t first,
                                            uint32_t count,
                                            struct v3dv_buffer *dst,
                                            uint32_t offset,
                                            uint32_t stride,
                                            VkQueryResultFlags flags)
{
   struct v3dv_job *job =
      v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device,
                                     V3DV_JOB_TYPE_CPU_COPY_QUERY_RESULTS,
                                     cmd_buffer, -1);
   v3dv_return_if_oom(cmd_buffer, NULL);

   job->cpu.query_copy_results.pool = pool;
   job->cpu.query_copy_results.first = first;
   job->cpu.query_copy_results.count = count;
   job->cpu.query_copy_results.dst = dst;
   job->cpu.query_copy_results.offset = offset;
   job->cpu.query_copy_results.stride = stride;
   job->cpu.query_copy_results.flags = flags;

   list_addtail(&job->list_link, &cmd_buffer->jobs);
}

VKAPI_ATTR void VKAPI_CALL
v3dv_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer,
                             VkQueryPool queryPool,
                             uint32_t firstQuery,
                             uint32_t queryCount,
                             VkBuffer dstBuffer,
                             VkDeviceSize dstOffset,
                             VkDeviceSize stride,
                             VkQueryResultFlags flags)
{
   V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
   V3DV_FROM_HANDLE(v3dv_query_pool, pool, queryPool);
   V3DV_FROM_HANDLE(v3dv_buffer, dst, dstBuffer);

   /* Copies can only happen outside a render pass instance so we should not
    * be in the middle of job recording.
    */
   assert(cmd_buffer->state.pass == NULL);
   assert(cmd_buffer->state.job == NULL);

   assert(firstQuery < pool->query_count);
   assert(firstQuery + queryCount <= pool->query_count);

   /* For occlusion queries we implement the copy on the GPU, but for other
    * queries we emit a CPU job that will call v3dv_get_query_pool_results_cpu
    * when executed in the queue.
    */
   if (pool->query_type == VK_QUERY_TYPE_OCCLUSION) {
      cmd_buffer_emit_copy_query_pool_results(cmd_buffer, pool,
                                              firstQuery, queryCount,
                                              dst, (uint32_t) dstOffset,
                                              (uint32_t) stride, flags);
   } else {
      cmd_buffer_emit_copy_query_pool_results_cpu(cmd_buffer, pool,
                                                  firstQuery, queryCount,
                                                  dst, (uint32_t) dstOffset,
                                                  (uint32_t) stride, flags);
   }
}

VKAPI_ATTR void VKAPI_CALL
v3dv_CmdBeginQuery(VkCommandBuffer commandBuffer,
                   VkQueryPool queryPool,
                   uint32_t query,
                   VkQueryControlFlags flags)
{
   V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
   V3DV_FROM_HANDLE(v3dv_query_pool, pool, queryPool);

   v3dv_cmd_buffer_begin_query(cmd_buffer, pool, query, flags);
}

VKAPI_ATTR void VKAPI_CALL
v3dv_CmdEndQuery(VkCommandBuffer commandBuffer,
                 VkQueryPool queryPool,
                 uint32_t query)
{
   V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
   V3DV_FROM_HANDLE(v3dv_query_pool, pool, queryPool);

   v3dv_cmd_buffer_end_query(cmd_buffer, pool, query);
}

void
v3dv_reset_query_pool_cpu(struct v3dv_device *device,
                          struct v3dv_query_pool *pool,
                          uint32_t first, uint32_t count)
{
   mtx_lock(&device->query_mutex);

   if (pool->query_type == VK_QUERY_TYPE_TIMESTAMP) {
      assert(first + count <= pool->query_count);

      /* Reset timestamp */
      uint8_t *base_addr;
      base_addr = ((uint8_t *) pool->timestamp.bo->map) +
                  pool->queries[first].timestamp.offset;
      memset(base_addr, 0, 8 * count);

      for (uint32_t i = first; i < first + count; i++) {
         if (vk_sync_reset(&device->vk, pool->queries[i].timestamp.sync) != VK_SUCCESS)
            fprintf(stderr, "Failed to reset sync\n");
      }

      mtx_unlock(&device->query_mutex);
      return;
   }

   for (uint32_t i = first; i < first + count; i++) {
      assert(i < pool->query_count);
      struct v3dv_query *q = &pool->queries[i];
      q->maybe_available = false;
      switch (pool->query_type) {
      case VK_QUERY_TYPE_OCCLUSION: {
         /* Reset availability */
         uint8_t *base_addr = ((uint8_t *) pool->occlusion.bo->map) +
                              pool->occlusion.avail_offset + first;
         memset(base_addr, 0, count);

         /* Reset occlusion counter */
         const uint8_t *q_addr = ((uint8_t *) pool->occlusion.bo->map) +
                                 q->occlusion.offset;
         uint32_t *counter = (uint32_t *) q_addr;
         *counter = 0;
         break;
      }
      case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
         kperfmon_destroy(device, pool, i);
         kperfmon_create(device, pool, i);
         if (vk_sync_reset(&device->vk, q->perf.last_job_sync) != VK_SUCCESS)
            fprintf(stderr, "Failed to reset sync\n");
         break;
      default:
         unreachable("Unsupported query type");
      }
   }

   mtx_unlock(&device->query_mutex);
}

VKAPI_ATTR void VKAPI_CALL
v3dv_ResetQueryPool(VkDevice _device,
                    VkQueryPool queryPool,
                    uint32_t firstQuery,
                    uint32_t queryCount)
{
   V3DV_FROM_HANDLE(v3dv_device, device, _device);
   V3DV_FROM_HANDLE(v3dv_query_pool, pool, queryPool);

   v3dv_reset_query_pool_cpu(device, pool, firstQuery, queryCount);
}
VKAPI_ATTR VkResult VKAPI_CALL
v3dv_EnumeratePhysicalDeviceQueueFamilyPerformanceQueryCountersKHR(
   VkPhysicalDevice physicalDevice,
   uint32_t queueFamilyIndex,
   uint32_t *pCounterCount,
   VkPerformanceCounterKHR *pCounters,
   VkPerformanceCounterDescriptionKHR *pCounterDescriptions)
{
   V3DV_FROM_HANDLE(v3dv_physical_device, pDevice, physicalDevice);

   return v3dv_X(pDevice, enumerate_performance_query_counters)(
      pDevice, pCounterCount, pCounters, pCounterDescriptions);
}

VKAPI_ATTR void VKAPI_CALL
v3dv_GetPhysicalDeviceQueueFamilyPerformanceQueryPassesKHR(
   VkPhysicalDevice physicalDevice,
   const VkQueryPoolPerformanceCreateInfoKHR *pPerformanceQueryCreateInfo,
   uint32_t *pNumPasses)
{
   *pNumPasses = DIV_ROUND_UP(pPerformanceQueryCreateInfo->counterIndexCount,
                              DRM_V3D_MAX_PERF_COUNTERS);
}
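/* Illustrative example for the pass count above: reusing the hypothetical
 * 32-counter kernel limit from the kperfmon_create() example, a
 * VkQueryPoolPerformanceCreateInfoKHR with counterIndexCount = 40 would
 * report DIV_ROUND_UP(40, 32) = 2 passes, matching the number of kernel
 * perfmons (nperfmons) that a pool created from it would use.
 */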
VKAPI_ATTR VkResult VKAPI_CALL
v3dv_AcquireProfilingLockKHR(
   VkDevice _device,
   const VkAcquireProfilingLockInfoKHR *pInfo)
{
   return VK_SUCCESS;
}

VKAPI_ATTR void VKAPI_CALL
v3dv_ReleaseProfilingLockKHR(VkDevice device)
{
}

static inline void
nir_set_query_availability(nir_builder *b,
                           nir_def *buf,
                           nir_def *offset,
                           nir_def *query_idx,
                           nir_def *avail)
{
   offset = nir_iadd(b, offset, query_idx); /* we use 1B per query */
   nir_store_ssbo(b, avail, buf, offset, .write_mask = 0x1, .align_mul = 1);
}

static inline nir_def *
nir_get_query_availability(nir_builder *b,
                           nir_def *buf,
                           nir_def *offset,
                           nir_def *query_idx)
{
   offset = nir_iadd(b, offset, query_idx); /* we use 1B per query */
   nir_def *avail = nir_load_ssbo(b, 1, 8, buf, offset, .align_mul = 1);
   return nir_i2i32(b, avail);
}

static nir_shader *
get_set_query_availability_cs(const nir_shader_compiler_options *options)
{
   nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, options,
                                                  "set query availability cs");

   nir_def *buf =
      nir_vulkan_resource_index(&b, 2, 32, nir_imm_int(&b, 0),
                                .desc_set = 0,
                                .binding = 0,
                                .desc_type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER);

   /* This assumes a local size of 1 and a horizontal-only dispatch. If we
    * ever change any of these parameters we need to update how we compute the
    * query index here.
    */
   nir_def *wg_id = nir_channel(&b, nir_load_workgroup_id(&b), 0);

   nir_def *offset =
      nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0),
                             .base = 0, .range = 4);

   nir_def *query_idx =
      nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0),
                             .base = 4, .range = 4);

   nir_def *avail =
      nir_load_push_constant(&b, 1, 8, nir_imm_int(&b, 0),
                             .base = 8, .range = 1);

   query_idx = nir_iadd(&b, query_idx, wg_id);
   nir_set_query_availability(&b, buf, offset, query_idx, avail);

   return b.shader;
}

static inline nir_def *
nir_get_occlusion_counter_offset(nir_builder *b, nir_def *query_idx)
{
   nir_def *query_group = nir_udiv_imm(b, query_idx, 16);
   nir_def *query_group_offset = nir_umod_imm(b, query_idx, 16);
   nir_def *offset = nir_iadd(b, nir_imul_imm(b, query_group, 1024),
                              nir_imul_imm(b, query_group_offset, 4));
   return offset;
}

static inline void
nir_reset_occlusion_counter(nir_builder *b,
                            nir_def *buf,
                            nir_def *query_idx)
{
   nir_def *offset = nir_get_occlusion_counter_offset(b, query_idx);
   nir_def *zero = nir_imm_int(b, 0);
   nir_store_ssbo(b, zero, buf, offset, .write_mask = 0x1, .align_mul = 4);
}

static inline nir_def *
nir_read_occlusion_counter(nir_builder *b,
                           nir_def *buf,
                           nir_def *query_idx)
{
   nir_def *offset = nir_get_occlusion_counter_offset(b, query_idx);
   return nir_load_ssbo(b, 1, 32, buf, offset, .access = 0, .align_mul = 4);
}

static nir_shader *
get_reset_occlusion_query_cs(const nir_shader_compiler_options *options)
{
   nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, options,
                                                  "reset occlusion query cs");

   nir_def *buf =
      nir_vulkan_resource_index(&b, 2, 32, nir_imm_int(&b, 0),
                                .desc_set = 0,
                                .binding = 0,
                                .desc_type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER);

   /* This assumes a local size of 1 and a horizontal-only dispatch. If we
    * ever change any of these parameters we need to update how we compute the
    * query index here.
    */
   nir_def *wg_id = nir_channel(&b, nir_load_workgroup_id(&b), 0);

   nir_def *avail_offset =
      nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0),
                             .base = 0, .range = 4);

   nir_def *base_query_idx =
      nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0),
                             .base = 4, .range = 4);

   nir_def *query_idx = nir_iadd(&b, base_query_idx, wg_id);

   nir_set_query_availability(&b, buf, avail_offset, query_idx,
                              nir_imm_intN_t(&b, 0, 8));
   nir_reset_occlusion_counter(&b, buf, query_idx);

   return b.shader;
}
static void
write_query_buffer(nir_builder *b,
                   nir_def *buf,
                   nir_def **offset,
                   nir_def *value,
                   bool flag_64bit)
{
   if (flag_64bit) {
      /* Create a 64-bit value using a vec2 with the .Y component set to 0
       * so we can write a 64-bit value in a single store.
       */
      nir_def *value64 = nir_vec2(b, value, nir_imm_int(b, 0));
      nir_store_ssbo(b, value64, buf, *offset,
                     .write_mask = 0x3, .align_mul = 8);
      *offset = nir_iadd_imm(b, *offset, 8);
   } else {
      nir_store_ssbo(b, value, buf, *offset,
                     .write_mask = 0x1, .align_mul = 4);
      *offset = nir_iadd_imm(b, *offset, 4);
   }
}

static nir_shader *
get_copy_query_results_cs(const nir_shader_compiler_options *options,
                          VkQueryResultFlags flags)
{
   bool flag_64bit = flags & VK_QUERY_RESULT_64_BIT;
   bool flag_avail = flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT;
   bool flag_partial = flags & VK_QUERY_RESULT_PARTIAL_BIT;

   nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, options,
                                                  "copy query results cs");

   nir_def *buf =
      nir_vulkan_resource_index(&b, 2, 32, nir_imm_int(&b, 0),
                                .desc_set = 0,
                                .binding = 0,
                                .desc_type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER);

   nir_def *buf_out =
      nir_vulkan_resource_index(&b, 2, 32, nir_imm_int(&b, 0),
                                .desc_set = 1,
                                .binding = 0,
                                .desc_type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER);

   /* Read push constants */
   nir_def *avail_offset =
      nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0),
                             .base = 0, .range = 4);

   nir_def *base_query_idx =
      nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0),
                             .base = 4, .range = 4);

   nir_def *base_offset_out =
      nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0),
                             .base = 8, .range = 4);

   nir_def *stride =
      nir_load_push_constant(&b, 1, 32, nir_imm_int(&b, 0),
                             .base = 12, .range = 4);

   /* This assumes a local size of 1 and a horizontal-only dispatch. If we
    * ever change any of these parameters we need to update how we compute the
    * query index here.
    */
   nir_def *wg_id = nir_channel(&b, nir_load_workgroup_id(&b), 0);
   nir_def *query_idx = nir_iadd(&b, base_query_idx, wg_id);

   /* Read query availability if needed */
   nir_def *avail = NULL;
   if (flag_avail || !flag_partial)
      avail = nir_get_query_availability(&b, buf, avail_offset, query_idx);

   /* Write occlusion query result... */
   nir_def *offset =
      nir_iadd(&b, base_offset_out, nir_imul(&b, wg_id, stride));

   /* ...if partial is requested, we always write */
   if (flag_partial) {
      nir_def *query_res = nir_read_occlusion_counter(&b, buf, query_idx);
      write_query_buffer(&b, buf_out, &offset, query_res, flag_64bit);
   } else {
      /* ...otherwise, we only write if the query is available */
      nir_if *if_stmt = nir_push_if(&b, nir_ine_imm(&b, avail, 0));
      nir_def *query_res = nir_read_occlusion_counter(&b, buf, query_idx);
      write_query_buffer(&b, buf_out, &offset, query_res, flag_64bit);
      nir_pop_if(&b, if_stmt);
   }

   /* Write query availability */
   if (flag_avail)
      write_query_buffer(&b, buf_out, &offset, avail, flag_64bit);

   return b.shader;
}
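/* Note on the copy shader above: it reads only the first four push-constant
 * words (availability-data offset, base query index, output offset and
 * stride, at .base 0/4/8/12); the VkQueryResultFlags never reach the shader
 * at runtime. Instead, copy_pipeline_index_from_flags() selects one of up
 * to 8 pipeline variants specialized on those flags at shader-build time.
 */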
static bool
create_query_pipelines(struct v3dv_device *device)
{
   VkResult result;
   VkPipeline pipeline;

   /* Set layout: single storage buffer */
   if (!device->queries.buf_descriptor_set_layout) {
      VkDescriptorSetLayoutBinding descriptor_set_layout_binding = {
         .binding = 0,
         .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
         .descriptorCount = 1,
         .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
      };
      VkDescriptorSetLayoutCreateInfo descriptor_set_layout_info = {
         .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO,
         .bindingCount = 1,
         .pBindings = &descriptor_set_layout_binding,
      };
      result =
         v3dv_CreateDescriptorSetLayout(v3dv_device_to_handle(device),
                                        &descriptor_set_layout_info,
                                        &device->vk.alloc,
                                        &device->queries.buf_descriptor_set_layout);
      if (result != VK_SUCCESS)
         return false;
   }

   /* Set query availability pipeline.
    *
    * Pipeline layout:
    *    - 1 storage buffer for the BO with the query availability.
    *    - Push constants:
    *      0B: offset of the availability info in the buffer (4 bytes)
    *      4B: base query index (4 bytes).
    *      8B: availability (1 byte).
    */
   if (!device->queries.avail_pipeline_layout) {
      VkPipelineLayoutCreateInfo pipeline_layout_info = {
         .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
         .setLayoutCount = 1,
         .pSetLayouts = &device->queries.buf_descriptor_set_layout,
         .pushConstantRangeCount = 1,
         .pPushConstantRanges =
            &(VkPushConstantRange) { VK_SHADER_STAGE_COMPUTE_BIT, 0, 9 },
      };

      result =
         v3dv_CreatePipelineLayout(v3dv_device_to_handle(device),
                                   &pipeline_layout_info,
                                   &device->vk.alloc,
                                   &device->queries.avail_pipeline_layout);
      if (result != VK_SUCCESS)
         return false;
   }

   const nir_shader_compiler_options *compiler_options =
      v3dv_pipeline_get_nir_options(&device->devinfo);

   if (!device->queries.avail_pipeline) {
      nir_shader *set_query_availability_cs_nir =
         get_set_query_availability_cs(compiler_options);
      result = v3dv_create_compute_pipeline_from_nir(
                  device, set_query_availability_cs_nir,
                  device->queries.avail_pipeline_layout, &pipeline);
      ralloc_free(set_query_availability_cs_nir);
      if (result != VK_SUCCESS)
         return false;

      device->queries.avail_pipeline = pipeline;
   }

   /* Reset occlusion query pipeline.
    *
    * Pipeline layout:
    *    - 1 storage buffer for the BO with the occlusion and availability
    *      data.
    *    - Push constants:
    *      0B: offset of the availability info in the buffer (4B)
    *      4B: base query index (4B)
    */
   if (!device->queries.reset_occlusion_pipeline_layout) {
      VkPipelineLayoutCreateInfo pipeline_layout_info = {
         .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
         .setLayoutCount = 1,
         .pSetLayouts = &device->queries.buf_descriptor_set_layout,
         .pushConstantRangeCount = 1,
         .pPushConstantRanges =
            &(VkPushConstantRange) { VK_SHADER_STAGE_COMPUTE_BIT, 0, 8 },
      };

      result =
         v3dv_CreatePipelineLayout(v3dv_device_to_handle(device),
                                   &pipeline_layout_info,
                                   &device->vk.alloc,
                                   &device->queries.reset_occlusion_pipeline_layout);
      if (result != VK_SUCCESS)
         return false;
   }

   if (!device->queries.reset_occlusion_pipeline) {
      nir_shader *reset_occlusion_query_cs_nir =
         get_reset_occlusion_query_cs(compiler_options);
      result = v3dv_create_compute_pipeline_from_nir(
                  device, reset_occlusion_query_cs_nir,
                  device->queries.reset_occlusion_pipeline_layout,
                  &pipeline);
      ralloc_free(reset_occlusion_query_cs_nir);
      if (result != VK_SUCCESS)
         return false;

      device->queries.reset_occlusion_pipeline = pipeline;
   }

   /* Copy query results pipelines.
    *
    * Pipeline layout:
    *    - 1 storage buffer for the BO with the query availability and
    *      occlusion.
    *    - 1 storage buffer for the output.
    *    - Push constants:
    *      0B: offset of the availability info in the buffer (4B)
    *      4B: base query index (4B)
    *      8B: offset into output buffer (4B)
    *      12B: stride (4B)
    *
    * We create multiple specialized pipelines depending on the copy flags
    * to remove conditionals from the copy shader and get more optimized
    * pipelines.
    */
   if (!device->queries.copy_pipeline_layout) {
      VkDescriptorSetLayout set_layouts[2] = {
         device->queries.buf_descriptor_set_layout,
         device->queries.buf_descriptor_set_layout
      };
      VkPipelineLayoutCreateInfo pipeline_layout_info = {
         .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
         .setLayoutCount = 2,
         .pSetLayouts = set_layouts,
         .pushConstantRangeCount = 1,
         .pPushConstantRanges =
            &(VkPushConstantRange) { VK_SHADER_STAGE_COMPUTE_BIT, 0, 16 },
      };

      result =
         v3dv_CreatePipelineLayout(v3dv_device_to_handle(device),
                                   &pipeline_layout_info,
                                   &device->vk.alloc,
                                   &device->queries.copy_pipeline_layout);
      if (result != VK_SUCCESS)
         return false;
   }

   /* Actual copy pipelines are created lazily on demand since there can be
    * up to 8 depending on the flags used; however, it is likely that
    * applications will use the same flags every time and only one pipeline
    * is required.
    */

   return true;
}

static void
destroy_query_pipelines(struct v3dv_device *device)
{
   VkDevice _device = v3dv_device_to_handle(device);

   /* Availability pipeline */
   v3dv_DestroyPipeline(_device, device->queries.avail_pipeline,
                        &device->vk.alloc);
   device->queries.avail_pipeline = VK_NULL_HANDLE;
   v3dv_DestroyPipelineLayout(_device, device->queries.avail_pipeline_layout,
                              &device->vk.alloc);
   device->queries.avail_pipeline_layout = VK_NULL_HANDLE;

   /* Reset occlusion pipeline */
   v3dv_DestroyPipeline(_device, device->queries.reset_occlusion_pipeline,
                        &device->vk.alloc);
   device->queries.reset_occlusion_pipeline = VK_NULL_HANDLE;
   v3dv_DestroyPipelineLayout(_device,
                              device->queries.reset_occlusion_pipeline_layout,
                              &device->vk.alloc);
   device->queries.reset_occlusion_pipeline_layout = VK_NULL_HANDLE;

   /* Copy pipelines */
   for (int i = 0; i < 8; i++) {
      v3dv_DestroyPipeline(_device, device->queries.copy_pipeline[i],
                           &device->vk.alloc);
      device->queries.copy_pipeline[i] = VK_NULL_HANDLE;
   }
   v3dv_DestroyPipelineLayout(_device, device->queries.copy_pipeline_layout,
                              &device->vk.alloc);
   device->queries.copy_pipeline_layout = VK_NULL_HANDLE;

   v3dv_DestroyDescriptorSetLayout(_device,
                                   device->queries.buf_descriptor_set_layout,
                                   &device->vk.alloc);
   device->queries.buf_descriptor_set_layout = VK_NULL_HANDLE;
}

/**
 * Allocates device resources for implementing certain types of queries.
 */
VkResult
v3dv_query_allocate_resources(struct v3dv_device *device)
{
   if (!create_query_pipelines(device))
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

   return VK_SUCCESS;
}

void
v3dv_query_free_resources(struct v3dv_device *device)
{
   destroy_query_pipelines(device);
}