/*
 * Copyright © 2022 Collabora Ltd. and Red Hat Inc.
 * SPDX-License-Identifier: MIT
 */
#include "nvk_query_pool.h"

#include "nvk_buffer.h"
#include "nvk_cmd_buffer.h"
#include "nvk_device.h"
#include "nvk_entrypoints.h"
#include "nvk_event.h"
#include "nvk_mme.h"
#include "nvk_physical_device.h"
#include "nvkmd/nvkmd.h"

#include "vk_common_entrypoints.h"
#include "vk_meta.h"
#include "vk_pipeline.h"

#include "compiler/nir/nir.h"
#include "compiler/nir/nir_builder.h"

#include "util/os_time.h"

#include "nv_push_cl906f.h"
#include "nv_push_cl9097.h"
#include "nv_push_cla0c0.h"
#include "nv_push_clc597.h"

struct nvk_query_report {
   uint64_t value;
   uint64_t timestamp;
};

VKAPI_ATTR VkResult VKAPI_CALL
nvk_CreateQueryPool(VkDevice device,
                    const VkQueryPoolCreateInfo *pCreateInfo,
                    const VkAllocationCallbacks *pAllocator,
                    VkQueryPool *pQueryPool)
{
   VK_FROM_HANDLE(nvk_device, dev, device);
   struct nvk_physical_device *pdev = nvk_device_physical(dev);
   struct nvk_query_pool *pool;
   VkResult result;

   pool = vk_query_pool_create(&dev->vk, pCreateInfo,
                               pAllocator, sizeof(*pool));
   if (!pool)
      return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY);

   /* We place the availability data first, followed by the query reports */
   pool->query_start = align(pool->vk.query_count * sizeof(uint32_t),
                             sizeof(struct nvk_query_report));

   uint32_t reports_per_query;
   switch (pCreateInfo->queryType) {
   case VK_QUERY_TYPE_OCCLUSION:
   case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
      reports_per_query = 2;
      break;
   case VK_QUERY_TYPE_TIMESTAMP:
      reports_per_query = 1;
      break;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      reports_per_query = 2 * util_bitcount(pool->vk.pipeline_statistics);
      break;
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      /* 2 for primitives succeeded, 2 for primitives needed */
      reports_per_query = 4;
      break;
   default:
      unreachable("Unsupported query type");
   }
   pool->query_stride = reports_per_query * sizeof(struct nvk_query_report);

   if (pool->vk.query_count > 0) {
      uint32_t mem_size = pool->query_start +
                          pool->query_stride * pool->vk.query_count;
      result = nvkmd_dev_alloc_mapped_mem(dev->nvkmd, &dev->vk.base,
                                          mem_size, 0 /* align_B */,
                                          NVKMD_MEM_GART, NVKMD_MEM_MAP_RDWR,
                                          &pool->mem);
      if (result != VK_SUCCESS) {
         vk_query_pool_destroy(&dev->vk, pAllocator, &pool->vk);
         return result;
      }

      if (pdev->debug_flags & NVK_DEBUG_ZERO_MEMORY)
         memset(pool->mem->map, 0, mem_size);
   }

   *pQueryPool = nvk_query_pool_to_handle(pool);

   return VK_SUCCESS;
}

VKAPI_ATTR void VKAPI_CALL
nvk_DestroyQueryPool(VkDevice device,
                     VkQueryPool queryPool,
                     const VkAllocationCallbacks *pAllocator)
{
   VK_FROM_HANDLE(nvk_device, dev, device);
   VK_FROM_HANDLE(nvk_query_pool, pool, queryPool);

   if (!pool)
      return;

   if (pool->mem)
      nvkmd_mem_unref(pool->mem);
   vk_query_pool_destroy(&dev->vk, pAllocator, &pool->vk);
}
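/* Pool memory layout (see nvk_CreateQueryPool above):
 *
 *    [0, query_count * 4)     one 32-bit availability word per query
 *    [query_start, ...)       report data, query_stride bytes per query
 *
 * The helpers below translate a query index into the GPU address or CPU
 * mapping of its availability word or its report data.
 */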
static uint64_t
nvk_query_available_addr(struct nvk_query_pool *pool, uint32_t query)
{
   assert(query < pool->vk.query_count);
   return pool->mem->va->addr + query * sizeof(uint32_t);
}

static nir_def *
nvk_nir_available_addr(nir_builder *b, nir_def *pool_addr,
                       nir_def *query)
{
   nir_def *offset = nir_imul_imm(b, query, sizeof(uint32_t));
   return nir_iadd(b, pool_addr, nir_u2u64(b, offset));
}

static uint32_t *
nvk_query_available_map(struct nvk_query_pool *pool, uint32_t query)
{
   assert(query < pool->vk.query_count);
   return (uint32_t *)pool->mem->map + query;
}

static uint64_t
nvk_query_offset(struct nvk_query_pool *pool, uint32_t query)
{
   assert(query < pool->vk.query_count);
   return pool->query_start + query * pool->query_stride;
}

static uint64_t
nvk_query_report_addr(struct nvk_query_pool *pool, uint32_t query)
{
   return pool->mem->va->addr + nvk_query_offset(pool, query);
}

static nir_def *
nvk_nir_query_report_addr(nir_builder *b, nir_def *pool_addr,
                          nir_def *query_start, nir_def *query_stride,
                          nir_def *query)
{
   nir_def *offset =
      nir_iadd(b, query_start, nir_umul_2x32_64(b, query, query_stride));
   return nir_iadd(b, pool_addr, offset);
}

static struct nvk_query_report *
nvk_query_report_map(struct nvk_query_pool *pool, uint32_t query)
{
   return (void *)((char *)pool->mem->map + nvk_query_offset(pool, query));
}

/**
 * Goes through a series of consecutive query indices in the given pool,
 * setting all element values to 0 and emitting them as available.
 */
static void
emit_zero_queries(struct nvk_cmd_buffer *cmd, struct nvk_query_pool *pool,
                  uint32_t first_index, uint32_t num_queries)
{
   switch (pool->vk.query_type) {
   case VK_QUERY_TYPE_OCCLUSION:
   case VK_QUERY_TYPE_TIMESTAMP:
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
   case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT: {
      for (uint32_t i = 0; i < num_queries; i++) {
         uint64_t addr = nvk_query_available_addr(pool, first_index + i);

         struct nv_push *p = nvk_cmd_buffer_push(cmd, 5);
         P_MTHD(p, NV9097, SET_REPORT_SEMAPHORE_A);
         P_NV9097_SET_REPORT_SEMAPHORE_A(p, addr >> 32);
         P_NV9097_SET_REPORT_SEMAPHORE_B(p, addr);
         P_NV9097_SET_REPORT_SEMAPHORE_C(p, 1);
         P_NV9097_SET_REPORT_SEMAPHORE_D(p, {
            .operation = OPERATION_RELEASE,
            .release = RELEASE_AFTER_ALL_PRECEEDING_WRITES_COMPLETE,
            .pipeline_location = PIPELINE_LOCATION_ALL,
            .structure_size = STRUCTURE_SIZE_ONE_WORD,
         });
      }
      break;
   }
   default:
      unreachable("Unsupported query type");
   }
}
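/* Host-side reset only needs to clear the availability words; the report
 * data itself is rewritten the next time the query is begun or a timestamp
 * is written, so it is not touched here.
 */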
VKAPI_ATTR void VKAPI_CALL
nvk_ResetQueryPool(VkDevice device,
                   VkQueryPool queryPool,
                   uint32_t firstQuery,
                   uint32_t queryCount)
{
   VK_FROM_HANDLE(nvk_query_pool, pool, queryPool);

   uint32_t *available = nvk_query_available_map(pool, firstQuery);
   memset(available, 0, queryCount * sizeof(*available));
}

VKAPI_ATTR void VKAPI_CALL
nvk_CmdResetQueryPool(VkCommandBuffer commandBuffer,
                      VkQueryPool queryPool,
                      uint32_t firstQuery,
                      uint32_t queryCount)
{
   VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
   VK_FROM_HANDLE(nvk_query_pool, pool, queryPool);

   for (uint32_t i = 0; i < queryCount; i++) {
      uint64_t addr = nvk_query_available_addr(pool, firstQuery + i);

      struct nv_push *p = nvk_cmd_buffer_push(cmd, 5);
      P_MTHD(p, NV9097, SET_REPORT_SEMAPHORE_A);
      P_NV9097_SET_REPORT_SEMAPHORE_A(p, addr >> 32);
      P_NV9097_SET_REPORT_SEMAPHORE_B(p, addr);
      P_NV9097_SET_REPORT_SEMAPHORE_C(p, 0);
      P_NV9097_SET_REPORT_SEMAPHORE_D(p, {
         .operation = OPERATION_RELEASE,
         .release = RELEASE_AFTER_ALL_PRECEEDING_WRITES_COMPLETE,
         .pipeline_location = PIPELINE_LOCATION_ALL,
         .structure_size = STRUCTURE_SIZE_ONE_WORD,
      });
   }

   /* Wait for the above writes to complete.  This prevents WaW hazards on
    * any later query availability updates and ensures
    * vkCmdCopyQueryPoolResults will see the query as unavailable if it
    * happens before the query is completed again.
    */
   for (uint32_t i = 0; i < queryCount; i++) {
      uint64_t addr = nvk_query_available_addr(pool, firstQuery + i);

      struct nv_push *p = nvk_cmd_buffer_push(cmd, 5);
      __push_mthd(p, SUBC_NV9097, NV906F_SEMAPHOREA);
      P_NV906F_SEMAPHOREA(p, addr >> 32);
      P_NV906F_SEMAPHOREB(p, (addr & UINT32_MAX) >> 2);
      P_NV906F_SEMAPHOREC(p, 0);
      P_NV906F_SEMAPHORED(p, {
         .operation = OPERATION_ACQUIRE,
         .acquire_switch = ACQUIRE_SWITCH_ENABLED,
         .release_size = RELEASE_SIZE_4BYTE,
      });
   }
}

VKAPI_ATTR void VKAPI_CALL
nvk_CmdWriteTimestamp2(VkCommandBuffer commandBuffer,
                       VkPipelineStageFlags2 stage,
                       VkQueryPool queryPool,
                       uint32_t query)
{
   VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
   VK_FROM_HANDLE(nvk_query_pool, pool, queryPool);

   struct nv_push *p = nvk_cmd_buffer_push(cmd, 10);

   uint64_t report_addr = nvk_query_report_addr(pool, query);
   P_MTHD(p, NV9097, SET_REPORT_SEMAPHORE_A);
   P_NV9097_SET_REPORT_SEMAPHORE_A(p, report_addr >> 32);
   P_NV9097_SET_REPORT_SEMAPHORE_B(p, report_addr);
   P_NV9097_SET_REPORT_SEMAPHORE_C(p, 0);
   P_NV9097_SET_REPORT_SEMAPHORE_D(p, {
      .operation = OPERATION_REPORT_ONLY,
      .pipeline_location = vk_stage_flags_to_nv9097_pipeline_location(stage),
      .structure_size = STRUCTURE_SIZE_FOUR_WORDS,
   });

   uint64_t available_addr = nvk_query_available_addr(pool, query);
   P_MTHD(p, NV9097, SET_REPORT_SEMAPHORE_A);
   P_NV9097_SET_REPORT_SEMAPHORE_A(p, available_addr >> 32);
   P_NV9097_SET_REPORT_SEMAPHORE_B(p, available_addr);
   P_NV9097_SET_REPORT_SEMAPHORE_C(p, 1);
   P_NV9097_SET_REPORT_SEMAPHORE_D(p, {
      .operation = OPERATION_RELEASE,
      .release = RELEASE_AFTER_ALL_PRECEEDING_WRITES_COMPLETE,
      .pipeline_location = PIPELINE_LOCATION_ALL,
      .structure_size = STRUCTURE_SIZE_ONE_WORD,
   });

   /* From the Vulkan spec:
    *
    *    "If vkCmdWriteTimestamp2 is called while executing a render pass
    *    instance that has multiview enabled, the timestamp uses N
    *    consecutive query indices in the query pool (starting at query)
    *    where N is the number of bits set in the view mask of the subpass
    *    the command is executed in. The resulting query values are
    *    determined by an implementation-dependent choice of one of the
    *    following behaviors:"
    *
    * In our case, only the first query is used, so we emit zeros for the
    * remaining queries, as described in the first behavior listed in the
    * Vulkan spec:
    *
    *    "The first query is a timestamp value and (if more than one bit is
    *    set in the view mask) zero is written to the remaining queries."
    */
   if (cmd->state.gfx.render.view_mask != 0) {
      const uint32_t num_queries =
         util_bitcount(cmd->state.gfx.render.view_mask);
      if (num_queries > 1)
         emit_zero_queries(cmd, pool, query + 1, num_queries - 1);
   }
}
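/* The table below maps each VkQueryPipelineStatisticFlagBits to the NV9097
 * report type and pipeline location used to sample it.  Compute shader
 * invocations have no 3D report; they are handled separately through an MME
 * macro that reads the counter accumulated in MME shadow scratch registers.
 */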
struct nvk_3d_stat_query {
   VkQueryPipelineStatisticFlagBits flag;
   uint8_t loc;
   uint8_t report;
};

/* This must remain sorted in flag order */
static const struct nvk_3d_stat_query nvk_3d_stat_queries[] = {{
   .flag = VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_VERTICES_BIT,
   .loc = NV9097_SET_REPORT_SEMAPHORE_D_PIPELINE_LOCATION_DATA_ASSEMBLER,
   .report = NV9097_SET_REPORT_SEMAPHORE_D_REPORT_DA_VERTICES_GENERATED,
}, {
   .flag = VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_PRIMITIVES_BIT,
   .loc = NV9097_SET_REPORT_SEMAPHORE_D_PIPELINE_LOCATION_DATA_ASSEMBLER,
   .report = NV9097_SET_REPORT_SEMAPHORE_D_REPORT_DA_PRIMITIVES_GENERATED,
}, {
   .flag = VK_QUERY_PIPELINE_STATISTIC_VERTEX_SHADER_INVOCATIONS_BIT,
   .loc = NV9097_SET_REPORT_SEMAPHORE_D_PIPELINE_LOCATION_VERTEX_SHADER,
   .report = NV9097_SET_REPORT_SEMAPHORE_D_REPORT_VS_INVOCATIONS,
}, {
   .flag = VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_INVOCATIONS_BIT,
   .loc = NV9097_SET_REPORT_SEMAPHORE_D_PIPELINE_LOCATION_GEOMETRY_SHADER,
   .report = NV9097_SET_REPORT_SEMAPHORE_D_REPORT_GS_INVOCATIONS,
}, {
   .flag = VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT,
   .loc = NV9097_SET_REPORT_SEMAPHORE_D_PIPELINE_LOCATION_GEOMETRY_SHADER,
   .report = NV9097_SET_REPORT_SEMAPHORE_D_REPORT_GS_PRIMITIVES_GENERATED,
}, {
   .flag = VK_QUERY_PIPELINE_STATISTIC_CLIPPING_INVOCATIONS_BIT,
   .loc = NV9097_SET_REPORT_SEMAPHORE_D_PIPELINE_LOCATION_VPC, /* TODO */
   .report = NV9097_SET_REPORT_SEMAPHORE_D_REPORT_CLIPPER_INVOCATIONS,
}, {
   .flag = VK_QUERY_PIPELINE_STATISTIC_CLIPPING_PRIMITIVES_BIT,
   .loc = NV9097_SET_REPORT_SEMAPHORE_D_PIPELINE_LOCATION_VPC, /* TODO */
   .report = NV9097_SET_REPORT_SEMAPHORE_D_REPORT_CLIPPER_PRIMITIVES_GENERATED,
}, {
   .flag = VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT,
   .loc = NV9097_SET_REPORT_SEMAPHORE_D_PIPELINE_LOCATION_PIXEL_SHADER,
   .report = NV9097_SET_REPORT_SEMAPHORE_D_REPORT_PS_INVOCATIONS,
}, {
   .flag = VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_CONTROL_SHADER_PATCHES_BIT,
   .loc = NV9097_SET_REPORT_SEMAPHORE_D_PIPELINE_LOCATION_TESSELATION_INIT_SHADER,
   .report = NV9097_SET_REPORT_SEMAPHORE_D_REPORT_TI_INVOCATIONS,
}, {
   .flag = VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_EVALUATION_SHADER_INVOCATIONS_BIT,
   .loc = NV9097_SET_REPORT_SEMAPHORE_D_PIPELINE_LOCATION_TESSELATION_SHADER,
   .report = NV9097_SET_REPORT_SEMAPHORE_D_REPORT_TS_INVOCATIONS,
}, {
   .flag = VK_QUERY_PIPELINE_STATISTIC_COMPUTE_SHADER_INVOCATIONS_BIT,
   .loc = UINT8_MAX,
   .report = UINT8_MAX,
}};

static void
mme_store_global(struct mme_builder *b,
                 struct mme_value64 addr,
                 struct mme_value v)
{
   mme_mthd(b, NV9097_SET_REPORT_SEMAPHORE_A);
   mme_emit_addr64(b, addr);
   mme_emit(b, v);
   mme_emit(b, mme_imm(0x10000000));
}

void
nvk_mme_write_cs_invocations(struct mme_builder *b)
{
   struct mme_value64 dst_addr = mme_load_addr64(b);

   struct mme_value accum_hi = mme_state(b,
      NVC597_SET_MME_SHADOW_SCRATCH(NVK_MME_SCRATCH_CS_INVOCATIONS_HI));
   struct mme_value accum_lo = mme_state(b,
      NVC597_SET_MME_SHADOW_SCRATCH(NVK_MME_SCRATCH_CS_INVOCATIONS_LO));
   struct mme_value64 accum = mme_value64(accum_lo, accum_hi);

   mme_store_global(b, dst_addr, accum.lo);
   mme_store_global(b, mme_add64(b, dst_addr, mme_imm64(4)), accum.hi);
}
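/* Shared implementation of vkCmdBeginQuery* and vkCmdEndQuery*.  Each
 * counter occupies a pair of reports: the begin value goes in the first slot
 * and the end value in the second (end selects the second slot via the
 * report_addr offset below), and results are later computed as end minus
 * begin.  On end, the query's availability word is released to 1 once all
 * preceding writes have landed.
 */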
static void
nvk_cmd_begin_end_query(struct nvk_cmd_buffer *cmd,
                        struct nvk_query_pool *pool,
                        uint32_t query, uint32_t index,
                        bool end)
{
   uint64_t report_addr = nvk_query_report_addr(pool, query) +
                          end * sizeof(struct nvk_query_report);

   uint32_t end_size = 7 * end;

   struct nv_push *p;
   switch (pool->vk.query_type) {
   case VK_QUERY_TYPE_OCCLUSION:
      p = nvk_cmd_buffer_push(cmd, 7 + end_size);

      P_IMMD(p, NV9097, SET_ZPASS_PIXEL_COUNT, !end);

      P_MTHD(p, NV9097, SET_REPORT_SEMAPHORE_A);
      P_NV9097_SET_REPORT_SEMAPHORE_A(p, report_addr >> 32);
      P_NV9097_SET_REPORT_SEMAPHORE_B(p, report_addr);
      P_NV9097_SET_REPORT_SEMAPHORE_C(p, 0);
      P_NV9097_SET_REPORT_SEMAPHORE_D(p, {
         .operation = OPERATION_REPORT_ONLY,
         .pipeline_location = PIPELINE_LOCATION_ALL,
         .report = REPORT_ZPASS_PIXEL_CNT64,
         .structure_size = STRUCTURE_SIZE_FOUR_WORDS,
         .flush_disable = true,
      });
      break;

   case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
      uint32_t stat_count = util_bitcount(pool->vk.pipeline_statistics);
      p = nvk_cmd_buffer_push(cmd, stat_count * 5 + end_size);

      ASSERTED uint32_t stats_left = pool->vk.pipeline_statistics;
      for (uint32_t i = 0; i < ARRAY_SIZE(nvk_3d_stat_queries); i++) {
         const struct nvk_3d_stat_query *sq = &nvk_3d_stat_queries[i];
         if (!(stats_left & sq->flag))
            continue;

         /* The 3D stat queries array MUST be sorted */
         assert(!(stats_left & (sq->flag - 1)));

         if (sq->flag == VK_QUERY_PIPELINE_STATISTIC_COMPUTE_SHADER_INVOCATIONS_BIT) {
            P_1INC(p, NVC597, CALL_MME_MACRO(NVK_MME_WRITE_CS_INVOCATIONS));
            P_INLINE_DATA(p, report_addr >> 32);
            P_INLINE_DATA(p, report_addr);
         } else {
            P_MTHD(p, NV9097, SET_REPORT_SEMAPHORE_A);
            P_NV9097_SET_REPORT_SEMAPHORE_A(p, report_addr >> 32);
            P_NV9097_SET_REPORT_SEMAPHORE_B(p, report_addr);
            P_NV9097_SET_REPORT_SEMAPHORE_C(p, 0);
            P_NV9097_SET_REPORT_SEMAPHORE_D(p, {
               .operation = OPERATION_REPORT_ONLY,
               .pipeline_location = sq->loc,
               .report = sq->report,
               .structure_size = STRUCTURE_SIZE_FOUR_WORDS,
               .flush_disable = true,
            });
         }

         report_addr += 2 * sizeof(struct nvk_query_report);
         stats_left &= ~sq->flag;
      }
      break;
   }

   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: {
      const uint32_t xfb_reports[] = {
         NV9097_SET_REPORT_SEMAPHORE_D_REPORT_STREAMING_PRIMITIVES_SUCCEEDED,
         NV9097_SET_REPORT_SEMAPHORE_D_REPORT_STREAMING_PRIMITIVES_NEEDED,
      };
      p = nvk_cmd_buffer_push(cmd, 5 * ARRAY_SIZE(xfb_reports) + end_size);
      for (uint32_t i = 0; i < ARRAY_SIZE(xfb_reports); ++i) {
         P_MTHD(p, NV9097, SET_REPORT_SEMAPHORE_A);
         P_NV9097_SET_REPORT_SEMAPHORE_A(p, report_addr >> 32);
         P_NV9097_SET_REPORT_SEMAPHORE_B(p, report_addr);
         P_NV9097_SET_REPORT_SEMAPHORE_C(p, 0);
         P_NV9097_SET_REPORT_SEMAPHORE_D(p, {
            .operation = OPERATION_REPORT_ONLY,
            .pipeline_location = PIPELINE_LOCATION_STREAMING_OUTPUT,
            .report = xfb_reports[i],
            .structure_size = STRUCTURE_SIZE_FOUR_WORDS,
            .sub_report = index,
            .flush_disable = true,
         });
         report_addr += 2 * sizeof(struct nvk_query_report);
      }
      break;
   }

   case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
      p = nvk_cmd_buffer_push(cmd, 5 + end_size);

      P_MTHD(p, NV9097, SET_REPORT_SEMAPHORE_A);
      P_NV9097_SET_REPORT_SEMAPHORE_A(p, report_addr >> 32);
      P_NV9097_SET_REPORT_SEMAPHORE_B(p, report_addr);
      P_NV9097_SET_REPORT_SEMAPHORE_C(p, 1);
      P_NV9097_SET_REPORT_SEMAPHORE_D(p, {
         .operation = OPERATION_REPORT_ONLY,
         .pipeline_location = PIPELINE_LOCATION_STREAMING_OUTPUT,
         .report = REPORT_VTG_PRIMITIVES_OUT,
         .sub_report = index,
         .structure_size = STRUCTURE_SIZE_FOUR_WORDS,
         .flush_disable = true,
      });
      break;

   default:
      unreachable("Unsupported query type");
   }

   if (end) {
      P_IMMD(p, NV9097, FLUSH_PENDING_WRITES, 0);

      uint64_t available_addr = nvk_query_available_addr(pool, query);
      P_MTHD(p, NV9097, SET_REPORT_SEMAPHORE_A);
      P_NV9097_SET_REPORT_SEMAPHORE_A(p, available_addr >> 32);
      P_NV9097_SET_REPORT_SEMAPHORE_B(p, available_addr);
      P_NV9097_SET_REPORT_SEMAPHORE_C(p, 1);
      P_NV9097_SET_REPORT_SEMAPHORE_D(p, {
         .operation = OPERATION_RELEASE,
         .release = RELEASE_AFTER_ALL_PRECEEDING_WRITES_COMPLETE,
         .pipeline_location = PIPELINE_LOCATION_ALL,
         .structure_size = STRUCTURE_SIZE_ONE_WORD,
      });
   }
}
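/* The non-indexed vkCmdBeginQuery/vkCmdEndQuery entry points are provided by
 * the common Vulkan runtime (see vk_common_entrypoints.h), which is expected
 * to forward to the indexed variants below with index 0.
 */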
VKAPI_ATTR void VKAPI_CALL
nvk_CmdBeginQueryIndexedEXT(VkCommandBuffer commandBuffer,
                            VkQueryPool queryPool,
                            uint32_t query,
                            VkQueryControlFlags flags,
                            uint32_t index)
{
   VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
   VK_FROM_HANDLE(nvk_query_pool, pool, queryPool);

   nvk_cmd_begin_end_query(cmd, pool, query, index, false);
}

VKAPI_ATTR void VKAPI_CALL
nvk_CmdEndQueryIndexedEXT(VkCommandBuffer commandBuffer,
                          VkQueryPool queryPool,
                          uint32_t query,
                          uint32_t index)
{
   VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
   VK_FROM_HANDLE(nvk_query_pool, pool, queryPool);

   nvk_cmd_begin_end_query(cmd, pool, query, index, true);

   /* From the Vulkan spec:
    *
    *    "If queries are used while executing a render pass instance that has
    *    multiview enabled, the query uses N consecutive query indices in
    *    the query pool (starting at query) where N is the number of bits set
    *    in the view mask in the subpass the query is used in. How the
    *    numerical results of the query are distributed among the queries is
    *    implementation-dependent."
    *
    * In our case, only the first query is used, so we emit zeros for the
    * remaining queries.
    */
   if (cmd->state.gfx.render.view_mask != 0) {
      const uint32_t num_queries =
         util_bitcount(cmd->state.gfx.render.view_mask);
      if (num_queries > 1)
         emit_zero_queries(cmd, pool, query + 1, num_queries - 1);
   }
}

static bool
nvk_query_is_available(struct nvk_query_pool *pool, uint32_t query)
{
   uint32_t *available = nvk_query_available_map(pool, query);
   return p_atomic_read(available) != 0;
}

#define NVK_QUERY_TIMEOUT 2000000000ull

static VkResult
nvk_query_wait_for_available(struct nvk_device *dev,
                             struct nvk_query_pool *pool,
                             uint32_t query)
{
   uint64_t abs_timeout_ns = os_time_get_absolute_timeout(NVK_QUERY_TIMEOUT);

   while (os_time_get_nano() < abs_timeout_ns) {
      if (nvk_query_is_available(pool, query))
         return VK_SUCCESS;

      VkResult status = vk_device_check_status(&dev->vk);
      if (status != VK_SUCCESS)
         return status;
   }

   return vk_device_set_lost(&dev->vk, "query timeout");
}

static void
cpu_write_query_result(void *dst, uint32_t idx,
                       VkQueryResultFlags flags,
                       uint64_t result)
{
   if (flags & VK_QUERY_RESULT_64_BIT) {
      uint64_t *dst64 = dst;
      dst64[idx] = result;
   } else {
      uint32_t *dst32 = dst;
      dst32[idx] = result;
   }
}

static void
cpu_get_query_delta(void *dst, const struct nvk_query_report *src,
                    uint32_t idx, VkQueryResultFlags flags)
{
   uint64_t delta = src[idx * 2 + 1].value - src[idx * 2].value;
   cpu_write_query_result(dst, idx, flags, delta);
}
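/* CPU read-back path for vkGetQueryPoolResults: polls the availability word
 * for each query (spinning with a timeout when VK_QUERY_RESULT_WAIT_BIT is
 * set), then converts the raw begin/end report pairs into the packed layout
 * requested by the application.
 */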
VKAPI_ATTR VkResult VKAPI_CALL
nvk_GetQueryPoolResults(VkDevice device,
                        VkQueryPool queryPool,
                        uint32_t firstQuery,
                        uint32_t queryCount,
                        size_t dataSize,
                        void *pData,
                        VkDeviceSize stride,
                        VkQueryResultFlags flags)
{
   VK_FROM_HANDLE(nvk_device, dev, device);
   VK_FROM_HANDLE(nvk_query_pool, pool, queryPool);

   if (vk_device_is_lost(&dev->vk))
      return VK_ERROR_DEVICE_LOST;

   VkResult status = VK_SUCCESS;
   for (uint32_t i = 0; i < queryCount; i++) {
      const uint32_t query = firstQuery + i;

      bool available = nvk_query_is_available(pool, query);

      if (!available && (flags & VK_QUERY_RESULT_WAIT_BIT)) {
         status = nvk_query_wait_for_available(dev, pool, query);
         if (status != VK_SUCCESS)
            return status;

         available = true;
      }

      bool write_results = available || (flags & VK_QUERY_RESULT_PARTIAL_BIT);

      const struct nvk_query_report *src = nvk_query_report_map(pool, query);
      assert(i * stride < dataSize);
      void *dst = (char *)pData + i * stride;

      uint32_t available_dst_idx = 1;
      switch (pool->vk.query_type) {
      case VK_QUERY_TYPE_OCCLUSION:
      case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
         if (write_results)
            cpu_get_query_delta(dst, src, 0, flags);
         break;
      case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
         uint32_t stat_count = util_bitcount(pool->vk.pipeline_statistics);
         available_dst_idx = stat_count;
         if (write_results) {
            for (uint32_t j = 0; j < stat_count; j++)
               cpu_get_query_delta(dst, src, j, flags);
         }
         break;
      }
      case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: {
         const int prims_succeeded_idx = 0;
         const int prims_needed_idx = 1;
         available_dst_idx = 2;
         if (write_results) {
            cpu_get_query_delta(dst, src, prims_succeeded_idx, flags);
            cpu_get_query_delta(dst, src, prims_needed_idx, flags);
         }
         break;
      }
      case VK_QUERY_TYPE_TIMESTAMP:
         if (write_results)
            cpu_write_query_result(dst, 0, flags, src->timestamp);
         break;
      default:
         unreachable("Unsupported query type");
      }

      if (!write_results)
         status = VK_NOT_READY;

      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)
         cpu_write_query_result(dst, available_dst_idx, flags, available);
   }

   return status;
}

struct nvk_copy_query_push {
   uint64_t pool_addr;
   uint32_t query_start;
   uint32_t query_stride;
   uint32_t first_query;
   uint32_t query_count;
   uint64_t dst_addr;
   uint64_t dst_stride;
   uint32_t flags;
};

static nir_def *
load_struct_var(nir_builder *b, nir_variable *var, uint32_t field)
{
   nir_deref_instr *deref =
      nir_build_deref_struct(b, nir_build_deref_var(b, var), field);
   return nir_load_deref(b, deref);
}

static void
nir_write_query_result(nir_builder *b, nir_def *dst_addr,
                       nir_def *idx, nir_def *flags,
                       nir_def *result)
{
   assert(result->num_components == 1);
   assert(result->bit_size == 64);

   nir_push_if(b, nir_test_mask(b, flags, VK_QUERY_RESULT_64_BIT));
   {
      nir_def *offset = nir_i2i64(b, nir_imul_imm(b, idx, 8));
      nir_store_global(b, nir_iadd(b, dst_addr, offset), 8, result, 0x1);
   }
   nir_push_else(b, NULL);
   {
      nir_def *result32 = nir_u2u32(b, result);
      nir_def *offset = nir_i2i64(b, nir_imul_imm(b, idx, 4));
      nir_store_global(b, nir_iadd(b, dst_addr, offset), 4, result32, 0x1);
   }
   nir_pop_if(b, NULL);
}

static void
nir_get_query_delta(nir_builder *b, nir_def *dst_addr,
                    nir_def *report_addr, nir_def *idx,
                    nir_def *flags)
{
   nir_def *offset =
      nir_imul_imm(b, idx, 2 * sizeof(struct nvk_query_report));
   nir_def *begin_addr =
      nir_iadd(b, report_addr, nir_i2i64(b, offset));
   nir_def *end_addr =
      nir_iadd_imm(b, begin_addr, sizeof(struct nvk_query_report));

   /* nvk_query_report::value is the first uint64_t */
   nir_def *begin = nir_load_global(b, begin_addr, 16, 1, 64);
   nir_def *end = nir_load_global(b, end_addr, 16, 1, 64);

   nir_def *delta = nir_isub(b, end, begin);

   nir_write_query_result(b, dst_addr, idx, flags, delta);
}
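/* vkCmdCopyQueryPoolResults is implemented with an internal compute shader
 * (built below in build_copy_queries_shader): one invocation per query reads
 * the availability word and report pair(s) straight from the pool BO and
 * writes the results to the destination buffer, honoring the 64_BIT,
 * PARTIAL, and WITH_AVAILABILITY flags.  The push constants match
 * struct nvk_copy_query_push.
 */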
static void
nvk_nir_copy_query(nir_builder *b, nir_variable *push, nir_def *i)
{
   nir_def *pool_addr = load_struct_var(b, push, 0);
   nir_def *query_start = nir_u2u64(b, load_struct_var(b, push, 1));
   nir_def *query_stride = load_struct_var(b, push, 2);
   nir_def *first_query = load_struct_var(b, push, 3);
   nir_def *dst_addr = load_struct_var(b, push, 5);
   nir_def *dst_stride = load_struct_var(b, push, 6);
   nir_def *flags = load_struct_var(b, push, 7);

   nir_def *query = nir_iadd(b, first_query, i);

   nir_def *avail_addr = nvk_nir_available_addr(b, pool_addr, query);
   nir_def *available =
      nir_i2b(b, nir_load_global(b, avail_addr, 4, 1, 32));

   nir_def *partial = nir_test_mask(b, flags, VK_QUERY_RESULT_PARTIAL_BIT);
   nir_def *write_results = nir_ior(b, available, partial);

   nir_def *report_addr =
      nvk_nir_query_report_addr(b, pool_addr, query_start, query_stride,
                                query);
   nir_def *dst_offset = nir_imul(b, nir_u2u64(b, i), dst_stride);

   /* Timestamp queries are the only ones that use a single report */
   nir_def *is_timestamp =
      nir_ieq_imm(b, query_stride, sizeof(struct nvk_query_report));

   nir_def *one = nir_imm_int(b, 1);
   nir_def *num_reports;
   nir_push_if(b, is_timestamp);
   {
      nir_push_if(b, write_results);
      {
         /* This is the timestamp case.  We add 8 because we're loading
          * nvk_query_report::timestamp.
          */
         nir_def *timestamp =
            nir_load_global(b, nir_iadd_imm(b, report_addr, 8), 8, 1, 64);

         nir_write_query_result(b, nir_iadd(b, dst_addr, dst_offset),
                                nir_imm_int(b, 0), flags, timestamp);
      }
      nir_pop_if(b, NULL);
   }
   nir_push_else(b, NULL);
   {
      /* Everything that isn't a timestamp has the invariant that the number
       * of destination entries is equal to the query stride divided by the
       * size of two reports.
       */
      num_reports = nir_udiv_imm(b, query_stride,
                                 2 * sizeof(struct nvk_query_report));

      nir_push_if(b, write_results);
      {
         nir_variable *r =
            nir_local_variable_create(b->impl, glsl_uint_type(), "r");
         nir_store_var(b, r, nir_imm_int(b, 0), 0x1);

         nir_push_loop(b);
         {
            nir_break_if(b, nir_ige(b, nir_load_var(b, r), num_reports));

            nir_get_query_delta(b, nir_iadd(b, dst_addr, dst_offset),
                                report_addr, nir_load_var(b, r), flags);

            nir_store_var(b, r, nir_iadd_imm(b, nir_load_var(b, r), 1), 0x1);
         }
         nir_pop_loop(b, NULL);
      }
      nir_pop_if(b, NULL);
   }
   nir_pop_if(b, NULL);

   num_reports = nir_if_phi(b, one, num_reports);

   nir_push_if(b, nir_test_mask(b, flags,
                                VK_QUERY_RESULT_WITH_AVAILABILITY_BIT));
   {
      nir_write_query_result(b, nir_iadd(b, dst_addr, dst_offset),
                             num_reports, flags, nir_b2i64(b, available));
   }
   nir_pop_if(b, NULL);
}

static nir_shader *
build_copy_queries_shader(void)
{
   nir_builder build =
      nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, NULL,
                                     "nvk-meta-copy-queries");
   nir_builder *b = &build;

   struct glsl_struct_field push_fields[] = {
      { .type = glsl_uint64_t_type(), .name = "pool_addr", .offset = 0 },
      { .type = glsl_uint_type(), .name = "query_start", .offset = 8 },
      { .type = glsl_uint_type(), .name = "query_stride", .offset = 12 },
      { .type = glsl_uint_type(), .name = "first_query", .offset = 16 },
      { .type = glsl_uint_type(), .name = "query_count", .offset = 20 },
      { .type = glsl_uint64_t_type(), .name = "dst_addr", .offset = 24 },
      { .type = glsl_uint64_t_type(), .name = "dst_stride", .offset = 32 },
      { .type = glsl_uint_type(), .name = "flags", .offset = 40 },
   };
   const struct glsl_type *push_iface_type =
      glsl_interface_type(push_fields, ARRAY_SIZE(push_fields),
                          GLSL_INTERFACE_PACKING_STD140,
                          false /* row_major */, "push");
   nir_variable *push =
      nir_variable_create(b->shader, nir_var_mem_push_const,
                          push_iface_type, "push");

   b->shader->info.workgroup_size[0] = 32;
   nir_def *wg_id = nir_load_workgroup_id(b);
   nir_def *i = nir_iadd(b, nir_load_subgroup_invocation(b),
                         nir_imul_imm(b, nir_channel(b, wg_id, 0), 32));

   nir_def *query_count = load_struct_var(b, push, 4);
   nir_push_if(b, nir_ilt(b, i, query_count));
   {
      nvk_nir_copy_query(b, push, i);
   }
   nir_pop_if(b, NULL);

   return build.shader;
}
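/* The copy shader is compiled lazily on first use and cached on the device.
 * If two threads race to create it, the cmpxchg below keeps the winner's
 * shader and the loser destroys its own copy.
 */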
static struct nvk_shader *
atomic_set_or_destroy_shader(struct nvk_device *dev,
                             struct nvk_shader **shader_ptr,
                             struct nvk_shader *shader,
                             const VkAllocationCallbacks *alloc)
{
   struct nvk_shader *old_shader =
      p_atomic_cmpxchg(shader_ptr, NULL, shader);
   if (old_shader == NULL) {
      return shader;
   } else {
      vk_shader_destroy(&dev->vk, &shader->vk, alloc);
      return old_shader;
   }
}

static VkResult
get_copy_queries_shader(struct nvk_device *dev,
                        struct nvk_shader **shader_out)
{
   struct nvk_shader *shader = p_atomic_read(&dev->copy_queries);
   if (shader != NULL) {
      *shader_out = shader;
      return VK_SUCCESS;
   }

   nir_shader *nir = build_copy_queries_shader();
   VkResult result = nvk_compile_nir_shader(dev, nir, &dev->vk.alloc,
                                            &shader);
   if (result != VK_SUCCESS)
      return result;

   *shader_out = atomic_set_or_destroy_shader(dev, &dev->copy_queries,
                                              shader, &dev->vk.alloc);

   return VK_SUCCESS;
}

static void
nvk_meta_copy_query_pool_results(struct nvk_cmd_buffer *cmd,
                                 struct nvk_query_pool *pool,
                                 uint32_t first_query,
                                 uint32_t query_count,
                                 uint64_t dst_addr,
                                 uint64_t dst_stride,
                                 VkQueryResultFlags flags)
{
   struct nvk_device *dev = nvk_cmd_buffer_device(cmd);
   struct nvk_shader *shader;
   VkResult result = get_copy_queries_shader(dev, &shader);
   if (result != VK_SUCCESS) {
      vk_command_buffer_set_error(&cmd->vk, result);
      return;
   }

   const struct nvk_copy_query_push push = {
      .pool_addr = pool->mem->va->addr,
      .query_start = pool->query_start,
      .query_stride = pool->query_stride,
      .first_query = first_query,
      .query_count = query_count,
      .dst_addr = dst_addr,
      .dst_stride = dst_stride,
      .flags = flags,
   };

   nvk_cmd_dispatch_shader(cmd, shader, &push, sizeof(push),
                           DIV_ROUND_UP(query_count, 32), 1, 1);
}

VKAPI_ATTR void VKAPI_CALL
nvk_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer,
                            VkQueryPool queryPool,
                            uint32_t firstQuery,
                            uint32_t queryCount,
                            VkBuffer dstBuffer,
                            VkDeviceSize dstOffset,
                            VkDeviceSize stride,
                            VkQueryResultFlags flags)
{
   VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
   VK_FROM_HANDLE(nvk_query_pool, pool, queryPool);
   VK_FROM_HANDLE(nvk_buffer, dst_buffer, dstBuffer);

   if (flags & VK_QUERY_RESULT_WAIT_BIT) {
      for (uint32_t i = 0; i < queryCount; i++) {
         uint64_t avail_addr = nvk_query_available_addr(pool, firstQuery + i);

         struct nv_push *p = nvk_cmd_buffer_push(cmd, 5);
         __push_mthd(p, SUBC_NV9097, NV906F_SEMAPHOREA);
         P_NV906F_SEMAPHOREA(p, avail_addr >> 32);
         P_NV906F_SEMAPHOREB(p, (avail_addr & UINT32_MAX) >> 2);
         P_NV906F_SEMAPHOREC(p, 1);
         P_NV906F_SEMAPHORED(p, {
            .operation = OPERATION_ACQ_GEQ,
            .acquire_switch = ACQUIRE_SWITCH_ENABLED,
            .release_size = RELEASE_SIZE_4BYTE,
         });
      }
   }

   uint64_t dst_addr = nvk_buffer_address(dst_buffer, dstOffset);
   nvk_meta_copy_query_pool_results(cmd, pool, firstQuery, queryCount,
                                    dst_addr, stride, flags);
}