/*
 * Copyright © 2022 Collabora Ltd. and Red Hat Inc.
 * SPDX-License-Identifier: MIT
 */
#include "nvk_shader.h"

#include "nvk_cmd_buffer.h"
#include "nvk_descriptor_set_layout.h"
#include "nvk_device.h"
#include "nvk_physical_device.h"
#include "nvk_sampler.h"

#include "vk_nir_convert_ycbcr.h"
#include "vk_pipeline.h"
#include "vk_pipeline_layout.h"
#include "vk_shader_module.h"
#include "vk_ycbcr_conversion.h"

#include "nak.h"
#include "nir.h"
#include "nir_builder.h"
#include "compiler/spirv/nir_spirv.h"

#include "nv50_ir_driver.h"

#include "util/mesa-sha1.h"
#include "util/u_debug.h"

#include "cla097.h"
#include "clb097.h"
#include "clc397.h"
#include "clc597.h"

static void
shared_var_info(const struct glsl_type *type, unsigned *size, unsigned *align)
{
   assert(glsl_type_is_vector_or_scalar(type));

   uint32_t comp_size = glsl_type_is_boolean(type)
      ? 4 : glsl_get_bit_size(type) / 8;
   unsigned length = glsl_get_vector_elements(type);
   *size = comp_size * length;
   *align = comp_size;
}

VkShaderStageFlags
nvk_nak_stages(const struct nv_device_info *info)
{
   const VkShaderStageFlags all =
      VK_SHADER_STAGE_VERTEX_BIT |
      VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT |
      VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT |
      VK_SHADER_STAGE_GEOMETRY_BIT |
      VK_SHADER_STAGE_FRAGMENT_BIT |
      VK_SHADER_STAGE_COMPUTE_BIT;

   const struct debug_control flags[] = {
      { "vs", BITFIELD64_BIT(MESA_SHADER_VERTEX) },
      { "tcs", BITFIELD64_BIT(MESA_SHADER_TESS_CTRL) },
      { "tes", BITFIELD64_BIT(MESA_SHADER_TESS_EVAL) },
      { "gs", BITFIELD64_BIT(MESA_SHADER_GEOMETRY) },
      { "fs", BITFIELD64_BIT(MESA_SHADER_FRAGMENT) },
      { "cs", BITFIELD64_BIT(MESA_SHADER_COMPUTE) },
      { "all", all },
      { NULL, 0 },
   };

   const char *env_str = getenv("NVK_USE_NAK");
   if (env_str == NULL)
      return info->cls_eng3d >= MAXWELL_A ?
             all : 0;
   else
      return parse_debug_string(env_str, flags);
}

static bool
use_nak(const struct nvk_physical_device *pdev, gl_shader_stage stage)
{
   return nvk_nak_stages(&pdev->info) & mesa_to_vk_shader_stage(stage);
}

uint64_t
nvk_physical_device_compiler_flags(const struct nvk_physical_device *pdev)
{
   bool no_cbufs = pdev->debug_flags & NVK_DEBUG_NO_CBUF;
   bool use_edb_buffer_views = nvk_use_edb_buffer_views(pdev);
   uint64_t prog_debug = nvk_cg_get_prog_debug();
   uint64_t prog_optimize = nvk_cg_get_prog_optimize();
   uint64_t nak_stages = nvk_nak_stages(&pdev->info);
   uint64_t nak_flags = nak_debug_flags(pdev->nak);

   assert(prog_debug <= UINT8_MAX);
   assert(prog_optimize < 16);
   assert(nak_stages <= UINT32_MAX);
   assert(nak_flags <= UINT16_MAX);

   return prog_debug |
          (prog_optimize << 8) |
          ((uint64_t)no_cbufs << 12) |
          ((uint64_t)use_edb_buffer_views << 13) |
          (nak_stages << 16) |
          (nak_flags << 48);
}

static const nir_shader_compiler_options *
nvk_get_nir_options(struct vk_physical_device *vk_pdev,
                    gl_shader_stage stage,
                    UNUSED const struct vk_pipeline_robustness_state *rs)
{
   const struct nvk_physical_device *pdev =
      container_of(vk_pdev, struct nvk_physical_device, vk);

   if (use_nak(pdev, stage))
      return nak_nir_options(pdev->nak);
   else
      return nvk_cg_nir_options(pdev, stage);
}

nir_address_format
nvk_ubo_addr_format(const struct nvk_physical_device *pdev,
                    const struct vk_pipeline_robustness_state *rs)
{
   if (nvk_use_bindless_cbuf(&pdev->info)) {
      return nir_address_format_vec2_index_32bit_offset;
   } else if (rs->null_uniform_buffer_descriptor) {
      /* We need bounds checking for null descriptors */
      return nir_address_format_64bit_bounded_global;
   } else {
      switch (rs->uniform_buffers) {
      case VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_DISABLED_EXT:
         return nir_address_format_64bit_global_32bit_offset;
      case VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_ROBUST_BUFFER_ACCESS_EXT:
      case VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_ROBUST_BUFFER_ACCESS_2_EXT:
         return nir_address_format_64bit_bounded_global;
      default:
         unreachable("Invalid robust buffer access behavior");
      }
   }
}

nir_address_format
nvk_ssbo_addr_format(const struct nvk_physical_device *pdev,
                     const struct vk_pipeline_robustness_state *rs)
{
   if (rs->null_storage_buffer_descriptor) {
      /* We need bounds checking for null descriptors */
      return nir_address_format_64bit_bounded_global;
   } else {
      switch (rs->storage_buffers) {
      case VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_DISABLED_EXT:
         return nir_address_format_64bit_global_32bit_offset;
      case VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_ROBUST_BUFFER_ACCESS_EXT:
      case VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_ROBUST_BUFFER_ACCESS_2_EXT:
         return nir_address_format_64bit_bounded_global;
      default:
         unreachable("Invalid robust buffer access behavior");
      }
   }
}

static struct spirv_to_nir_options
nvk_get_spirv_options(struct vk_physical_device *vk_pdev,
                      UNUSED gl_shader_stage stage,
                      const struct vk_pipeline_robustness_state *rs)
{
   const struct nvk_physical_device *pdev =
      container_of(vk_pdev, struct nvk_physical_device, vk);

   return (struct spirv_to_nir_options) {
      .ssbo_addr_format = nvk_ssbo_addr_format(pdev, rs),
      .phys_ssbo_addr_format = nir_address_format_64bit_global,
      .ubo_addr_format = nvk_ubo_addr_format(pdev, rs),
      .shared_addr_format = nir_address_format_32bit_offset,
      .min_ssbo_alignment = NVK_MIN_SSBO_ALIGNMENT,
      .min_ubo_alignment = nvk_min_cbuf_alignment(&pdev->info),
   };
}

static void
nvk_preprocess_nir(struct vk_physical_device *vk_pdev, nir_shader *nir)
{
   const struct nvk_physical_device *pdev =
      container_of(vk_pdev, struct nvk_physical_device, vk);
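
   /* Lower output (but not input) variable access to temporaries that are
    * copied to the real outputs at the end of the entrypoint, before either
    * back end's own preprocessing runs below.
    */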
   NIR_PASS_V(nir, nir_lower_io_to_temporaries,
              nir_shader_get_entrypoint(nir), true, false);

   if (use_nak(pdev, nir->info.stage))
      nak_preprocess_nir(nir, pdev->nak);
   else
      nvk_cg_preprocess_nir(nir);
}

static void
nvk_populate_fs_key(struct nak_fs_key *key,
                    const struct vk_graphics_pipeline_state *state)
{
   memset(key, 0, sizeof(*key));

   key->sample_info_cb = 0;
   key->sample_locations_offset =
      nvk_root_descriptor_offset(draw.sample_locations);
   key->sample_masks_offset =
      nvk_root_descriptor_offset(draw.sample_masks);

   /* Turn underestimate on when no state is available or if explicitly set */
   if (state == NULL || state->rs == NULL ||
       state->rs->conservative_mode ==
       VK_CONSERVATIVE_RASTERIZATION_MODE_UNDERESTIMATE_EXT)
      key->uses_underestimate = true;

   if (state == NULL)
      return;

   if (state->pipeline_flags &
       VK_PIPELINE_CREATE_2_DEPTH_STENCIL_ATTACHMENT_FEEDBACK_LOOP_BIT_EXT)
      key->zs_self_dep = true;

   /* We force per-sample interpolation whenever sampleShadingEnable is set
    * regardless of minSampleShading or rasterizationSamples.
    *
    * When sampleShadingEnable is set, few guarantees are made about the
    * location of interpolation of the inputs.  The only real guarantees are
    * that the inputs are interpolated within the pixel and that you get at
    * least `rasterizationSamples * minSampleShading` unique positions.
    * Importantly, it does not require that when `rasterizationSamples *
    * minSampleShading <= 1.0` that those positions are at the fragment
    * center.  Therefore, it's valid to just always do per-sample (which maps
    * to CENTROID on NVIDIA hardware) all the time and let the hardware sort
    * it out based on what we set in HYBRID_ANTI_ALIAS_CONTROL::passes.
    *
    * Also, we set HYBRID_ANTI_ALIAS_CONTROL::centroid at draw time based on
    * `rasterizationSamples * minSampleShading` so it should be per-pixel
    * whenever we're running only a single pass.  However, this would still
    * be correct even if it got interpolated at some other sample.
    *
    * The one caveat here is that we have to be careful about
    * gl_SampleMaskIn.  When `nak_fs_key::force_sample_shading = true` we
    * also turn any reads of gl_SampleMaskIn into `1 << gl_SampleID` because
    * the hardware sample mask is actually per-fragment, not per-pass.  We
    * handle this by smashing minSampleShading to 1.0 whenever
    * gl_SampleMaskIn is read.
    */
   const struct vk_multisample_state *ms = state->ms;
   if (ms != NULL && ms->sample_shading_enable)
      key->force_sample_shading = true;
}

static void
nvk_hash_graphics_state(struct vk_physical_device *device,
                        const struct vk_graphics_pipeline_state *state,
                        VkShaderStageFlags stages, blake3_hash blake3_out)
{
   struct mesa_blake3 blake3_ctx;
   _mesa_blake3_init(&blake3_ctx);

   if (stages & VK_SHADER_STAGE_FRAGMENT_BIT) {
      struct nak_fs_key key;
      nvk_populate_fs_key(&key, state);
      _mesa_blake3_update(&blake3_ctx, &key, sizeof(key));

      const bool is_multiview = state->rp->view_mask != 0;
      _mesa_blake3_update(&blake3_ctx, &is_multiview, sizeof(is_multiview));

      /* This doesn't impact the shader compile but it does go in the
       * nvk_shader and gets [de]serialized along with the binary so we
       * need to hash it.
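       * (This is the value nvk_compile_shader() stores in
       * nvk_shader::min_sample_shading.)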
       */
      if (state->ms && state->ms->sample_shading_enable) {
         _mesa_blake3_update(&blake3_ctx, &state->ms->min_sample_shading,
                             sizeof(state->ms->min_sample_shading));
      }
   }

   _mesa_blake3_final(&blake3_ctx, blake3_out);
}

static bool
lower_load_intrinsic(nir_builder *b, nir_intrinsic_instr *load,
                     UNUSED void *_data)
{
   switch (load->intrinsic) {
   case nir_intrinsic_load_ubo: {
      b->cursor = nir_before_instr(&load->instr);

      nir_def *index = load->src[0].ssa;
      nir_def *offset = load->src[1].ssa;
      const enum gl_access_qualifier access = nir_intrinsic_access(load);
      const uint32_t align_mul = nir_intrinsic_align_mul(load);
      const uint32_t align_offset = nir_intrinsic_align_offset(load);

      nir_def *val;
      if (load->src[0].ssa->num_components == 1) {
         val = nir_ldc_nv(b, load->num_components, load->def.bit_size,
                          index, offset, .access = access,
                          .align_mul = align_mul,
                          .align_offset = align_offset);
      } else if (load->src[0].ssa->num_components == 2) {
         nir_def *handle = nir_pack_64_2x32(b, load->src[0].ssa);
         val = nir_ldcx_nv(b, load->num_components, load->def.bit_size,
                           handle, offset, .access = access,
                           .align_mul = align_mul,
                           .align_offset = align_offset);
      } else {
         unreachable("Invalid UBO index");
      }
      nir_def_rewrite_uses(&load->def, val);
      return true;
   }

   case nir_intrinsic_load_global_constant_offset:
   case nir_intrinsic_load_global_constant_bounded: {
      b->cursor = nir_before_instr(&load->instr);

      nir_def *base_addr = load->src[0].ssa;
      nir_def *offset = load->src[1].ssa;

      nir_def *zero = NULL;
      if (load->intrinsic == nir_intrinsic_load_global_constant_bounded) {
         nir_def *bound = load->src[2].ssa;

         unsigned bit_size = load->def.bit_size;
         assert(bit_size >= 8 && bit_size % 8 == 0);
         unsigned byte_size = bit_size / 8;

         zero = nir_imm_zero(b, load->num_components, bit_size);

         unsigned load_size = byte_size * load->num_components;

         nir_def *sat_offset =
            nir_umin(b, offset, nir_imm_int(b, UINT32_MAX - (load_size - 1)));
         nir_def *in_bounds =
            nir_ilt(b, nir_iadd_imm(b, sat_offset, load_size - 1), bound);

         nir_push_if(b, in_bounds);
      }

      nir_def *val =
         nir_build_load_global_constant(b, load->def.num_components,
                                        load->def.bit_size,
                                        nir_iadd(b, base_addr,
                                                 nir_u2u64(b, offset)),
                                        .align_mul = nir_intrinsic_align_mul(load),
                                        .align_offset = nir_intrinsic_align_offset(load));

      if (load->intrinsic == nir_intrinsic_load_global_constant_bounded) {
         nir_pop_if(b, NULL);
         val = nir_if_phi(b, val, zero);
      }

      nir_def_rewrite_uses(&load->def, val);
      return true;
   }

   default:
      return false;
   }
}

struct lower_ycbcr_state {
   uint32_t set_layout_count;
   struct vk_descriptor_set_layout * const *set_layouts;
};

static const struct vk_ycbcr_conversion_state *
lookup_ycbcr_conversion(const void *_state, uint32_t set,
                        uint32_t binding, uint32_t array_index)
{
   const struct lower_ycbcr_state *state = _state;
   assert(set < state->set_layout_count);
   assert(state->set_layouts[set] != NULL);
   const struct nvk_descriptor_set_layout *set_layout =
      vk_to_nvk_descriptor_set_layout(state->set_layouts[set]);
   assert(binding < set_layout->binding_count);

   const struct nvk_descriptor_set_binding_layout *bind_layout =
      &set_layout->binding[binding];

   if (bind_layout->immutable_samplers == NULL)
      return NULL;

   array_index = MIN2(array_index, bind_layout->array_size - 1);

   const struct nvk_sampler *sampler =
      bind_layout->immutable_samplers[array_index];

   return sampler && sampler->vk.ycbcr_conversion ?
          &sampler->vk.ycbcr_conversion->state : NULL;
}

static inline bool
nir_has_image_var(nir_shader *nir)
{
   nir_foreach_image_variable(_, nir)
      return true;

   return false;
}

void
nvk_lower_nir(struct nvk_device *dev, nir_shader *nir,
              const struct vk_pipeline_robustness_state *rs,
              bool is_multiview, uint32_t set_layout_count,
              struct vk_descriptor_set_layout * const *set_layouts,
              struct nvk_cbuf_map *cbuf_map_out)
{
   struct nvk_physical_device *pdev = nvk_device_physical(dev);

   if (nir->info.stage == MESA_SHADER_FRAGMENT) {
      NIR_PASS(_, nir, nir_lower_input_attachments,
               &(nir_input_attachment_options) {
                  .use_fragcoord_sysval = use_nak(pdev, nir->info.stage),
                  .use_layer_id_sysval = use_nak(pdev, nir->info.stage) ||
                                         is_multiview,
                  .use_view_id_for_layer = is_multiview,
               });
   }

   if (nir->info.stage == MESA_SHADER_TESS_EVAL) {
      NIR_PASS(_, nir, nir_lower_patch_vertices,
               nir->info.tess.tcs_vertices_out, NULL);
   }

   const struct lower_ycbcr_state ycbcr_state = {
      .set_layout_count = set_layout_count,
      .set_layouts = set_layouts,
   };
   NIR_PASS(_, nir, nir_vk_lower_ycbcr_tex,
            lookup_ycbcr_conversion, &ycbcr_state);

   nir_lower_compute_system_values_options csv_options = {
      .has_base_workgroup_id = true,
   };
   NIR_PASS(_, nir, nir_lower_compute_system_values, &csv_options);

   /* Lower push constants before lower_descriptors */
   NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_push_const,
            nir_address_format_32bit_offset);

   /* Lower non-uniform access before lower_descriptors */
   enum nir_lower_non_uniform_access_type lower_non_uniform_access_types =
      nir_lower_non_uniform_ubo_access;

   if (pdev->info.cls_eng3d < TURING_A) {
      lower_non_uniform_access_types |= nir_lower_non_uniform_texture_access |
                                        nir_lower_non_uniform_image_access;
   }

   /* In practice, most shaders do not have non-uniform-qualified accesses,
    * so run the cheaper check (which will usually fail) first.
    */
   if (nir_has_non_uniform_access(nir, lower_non_uniform_access_types)) {
      struct nir_lower_non_uniform_access_options opts = {
         .types = lower_non_uniform_access_types,
         .callback = NULL,
      };
      NIR_PASS(_, nir, nir_opt_non_uniform_access);
      NIR_PASS(_, nir, nir_lower_non_uniform_access, &opts);
   }

   /* TODO: Kepler image lowering requires image params to be loaded from
    * the descriptor set which we don't currently support.
    */
   assert(pdev->info.cls_eng3d >= MAXWELL_A || !nir_has_image_var(nir));

   struct nvk_cbuf_map *cbuf_map = NULL;
   if (use_nak(pdev, nir->info.stage) &&
       !(pdev->debug_flags & NVK_DEBUG_NO_CBUF)) {
      cbuf_map = cbuf_map_out;

      /* Large constant support assumes cbufs */
      NIR_PASS(_, nir, nir_opt_large_constants, NULL, 32);
   } else {
      /* Codegen sometimes puts stuff in cbuf 1 and adds 1 to our cbuf
       * indices, so we can't really rely on it for lowering to cbufs and
       * instead place the root descriptors in both cbuf 0 and cbuf 1.
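       * (In the NAK path above, nvk_nir_lower_descriptors fills in
       * cbuf_map_out through the cbuf_map pointer instead.)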
       */
      *cbuf_map_out = (struct nvk_cbuf_map) {
         .cbuf_count = 2,
         .cbufs = {
            { .type = NVK_CBUF_TYPE_ROOT_DESC },
            { .type = NVK_CBUF_TYPE_ROOT_DESC },
         }
      };
   }

   NIR_PASS(_, nir, nvk_nir_lower_descriptors, pdev, rs,
            set_layout_count, set_layouts, cbuf_map);
   NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_global,
            nir_address_format_64bit_global);
   NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_ssbo,
            nvk_ssbo_addr_format(pdev, rs));
   NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_ubo,
            nvk_ubo_addr_format(pdev, rs));
   NIR_PASS(_, nir, nir_shader_intrinsics_pass, lower_load_intrinsic,
            nir_metadata_none, NULL);

   if (!nir->info.shared_memory_explicit_layout) {
      NIR_PASS(_, nir, nir_lower_vars_to_explicit_types,
               nir_var_mem_shared, shared_var_info);
   }
   NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_shared,
            nir_address_format_32bit_offset);

   if (nir->info.zero_initialize_shared_memory && nir->info.shared_size > 0) {
      /* QMD::SHARED_MEMORY_SIZE requires an alignment of 256B so it's safe
       * to align everything up to 16B so we can write whole vec4s.
       */
      nir->info.shared_size = align(nir->info.shared_size, 16);
      NIR_PASS(_, nir, nir_zero_initialize_shared_memory,
               nir->info.shared_size, 16);

      /* We need to call lower_compute_system_values again because
       * nir_zero_initialize_shared_memory generates load_invocation_id
       * which has to be lowered to load_invocation_index.
       */
      NIR_PASS(_, nir, nir_lower_compute_system_values, NULL);
   }
}

#ifndef NDEBUG
static void
nvk_shader_dump(struct nvk_shader *shader)
{
   unsigned pos;

   if (shader->info.stage != MESA_SHADER_COMPUTE) {
      _debug_printf("dumping HDR for %s shader\n",
                    _mesa_shader_stage_to_string(shader->info.stage));
      for (pos = 0; pos < ARRAY_SIZE(shader->info.hdr); ++pos)
         _debug_printf("HDR[%02"PRIxPTR"] = 0x%08x\n",
                       pos * sizeof(shader->info.hdr[0]),
                       shader->info.hdr[pos]);
   }
   _debug_printf("shader binary code (0x%x bytes):", shader->code_size);
   for (pos = 0; pos < shader->code_size / 4; ++pos) {
      if ((pos % 8) == 0)
         _debug_printf("\n");
      _debug_printf("%08x ", ((const uint32_t *)shader->code_ptr)[pos]);
   }
   _debug_printf("\n");
}
#endif

static VkResult
nvk_compile_nir_with_nak(struct nvk_physical_device *pdev,
                         nir_shader *nir,
                         VkShaderCreateFlagsEXT shader_flags,
                         const struct vk_pipeline_robustness_state *rs,
                         const struct nak_fs_key *fs_key,
                         struct nvk_shader *shader)
{
   const bool dump_asm =
      shader_flags & VK_SHADER_CREATE_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_MESA;

   nir_variable_mode robust2_modes = 0;
   if (rs->uniform_buffers ==
       VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_ROBUST_BUFFER_ACCESS_2_EXT)
      robust2_modes |= nir_var_mem_ubo;
   if (rs->storage_buffers ==
       VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_ROBUST_BUFFER_ACCESS_2_EXT)
      robust2_modes |= nir_var_mem_ssbo;

   shader->nak = nak_compile_shader(nir, dump_asm, pdev->nak,
                                    robust2_modes, fs_key);
   shader->info = shader->nak->info;
   shader->code_ptr = shader->nak->code;
   shader->code_size = shader->nak->code_size;

   return VK_SUCCESS;
}

static VkResult
nvk_compile_nir(struct nvk_device *dev, nir_shader *nir,
                VkShaderCreateFlagsEXT shader_flags,
                const struct vk_pipeline_robustness_state *rs,
                const struct nak_fs_key *fs_key,
                struct nvk_shader *shader)
{
   struct nvk_physical_device *pdev = nvk_device_physical(dev);
   VkResult result;

   if (use_nak(pdev, nir->info.stage)) {
      result = nvk_compile_nir_with_nak(pdev, nir, shader_flags,
                                        rs, fs_key, shader);
   } else {
      result = nvk_cg_compile_nir(pdev, nir, fs_key, shader);
   }
   if (result != VK_SUCCESS)
      return result;

   if (nir->constant_data_size > 0) {
      uint32_t data_align =
         nvk_min_cbuf_alignment(&pdev->info);
      uint32_t data_size = align(nir->constant_data_size, data_align);

      void *data = malloc(data_size);
      if (data == NULL)
         return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY);

      memcpy(data, nir->constant_data, nir->constant_data_size);
      assert(nir->constant_data_size <= data_size);
      memset(data + nir->constant_data_size, 0,
             data_size - nir->constant_data_size);

      shader->data_ptr = data;
      shader->data_size = data_size;
   }

   return VK_SUCCESS;
}

static VkResult
nvk_shader_upload(struct nvk_device *dev, struct nvk_shader *shader)
{
   struct nvk_physical_device *pdev = nvk_device_physical(dev);

   uint32_t hdr_size = 0;
   if (shader->info.stage != MESA_SHADER_COMPUTE) {
      if (pdev->info.cls_eng3d >= TURING_A)
         hdr_size = TU102_SHADER_HEADER_SIZE;
      else
         hdr_size = GF100_SHADER_HEADER_SIZE;
   }

   /* Fermi   needs 0x40 alignment
    * Kepler+ needs the first instruction to be 0x80 aligned, so we waste
    * 0x30 bytes.
    */
   int alignment = pdev->info.cls_eng3d >= KEPLER_A ? 0x80 : 0x40;

   uint32_t total_size = 0;
   if (pdev->info.cls_eng3d >= KEPLER_A &&
       pdev->info.cls_eng3d < TURING_A &&
       hdr_size > 0) {
      /* The instructions are what has to be aligned so we need to start at
       * a small offset (0x30 B) into the upload area.
       */
      total_size = alignment - hdr_size;
   }

   const uint32_t hdr_offset = total_size;
   total_size += hdr_size;

   const uint32_t code_offset = total_size;
   assert(code_offset % alignment == 0);
   total_size += shader->code_size;

   uint32_t data_offset = 0;
   if (shader->data_size > 0) {
      uint32_t cbuf_alignment = nvk_min_cbuf_alignment(&pdev->info);
      alignment = MAX2(alignment, cbuf_alignment);
      total_size = align(total_size, cbuf_alignment);
      data_offset = total_size;
      total_size += shader->data_size;
   }

   char *data = malloc(total_size);
   if (data == NULL)
      return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY);

   assert(hdr_size <= sizeof(shader->info.hdr));
   memcpy(data + hdr_offset, shader->info.hdr, hdr_size);
   memcpy(data + code_offset, shader->code_ptr, shader->code_size);

   if (shader->data_size > 0)
      memcpy(data + data_offset, shader->data_ptr, shader->data_size);

#ifndef NDEBUG
   if (debug_get_bool_option("NV50_PROG_DEBUG", false))
      nvk_shader_dump(shader);
#endif

   VkResult result = nvk_heap_upload(dev, &dev->shader_heap, data,
                                     total_size, alignment,
                                     &shader->upload_addr);
   if (result == VK_SUCCESS) {
      shader->upload_size = total_size;
      shader->hdr_addr = shader->upload_addr + hdr_offset;
      if (pdev->info.cls_eng3d < VOLTA_A) {
         const uint64_t heap_base_addr =
            nvk_heap_contiguous_base_address(&dev->shader_heap);
         assert(shader->upload_addr - heap_base_addr < UINT32_MAX);
         shader->hdr_addr -= heap_base_addr;
      }
      shader->data_addr = shader->upload_addr + data_offset;
   }
   free(data);

   return result;
}

static const struct vk_shader_ops nvk_shader_ops;

static void
nvk_shader_destroy(struct vk_device *vk_dev, struct vk_shader *vk_shader,
                   const VkAllocationCallbacks *pAllocator)
{
   struct nvk_device *dev = container_of(vk_dev, struct nvk_device, vk);
   struct nvk_shader *shader = container_of(vk_shader, struct nvk_shader, vk);

   if (shader->upload_size > 0) {
      nvk_heap_free(dev, &dev->shader_heap,
                    shader->upload_addr, shader->upload_size);
   }

   if (shader->nak) {
      nak_shader_bin_destroy(shader->nak);
   } else {
      /* This came from codegen or deserialize, just free it */
      free((void *)shader->code_ptr);
   }

   free((void *)shader->data_ptr);

   vk_shader_free(&dev->vk, pAllocator, &shader->vk);
}

static VkResult
nvk_compile_shader(struct nvk_device *dev,
                   struct vk_shader_compile_info *info,
                   const struct vk_graphics_pipeline_state *state,
                   const VkAllocationCallbacks *
                      pAllocator,
                   struct vk_shader **shader_out)
{
   struct nvk_shader *shader;
   VkResult result;

   /* We consume the NIR, regardless of success or failure */
   nir_shader *nir = info->nir;

   shader = vk_shader_zalloc(&dev->vk, &nvk_shader_ops, info->stage,
                             pAllocator, sizeof(*shader));
   if (shader == NULL) {
      ralloc_free(nir);
      return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY);
   }

   /* TODO: Multiview with ESO */
   const bool is_multiview = state && state->rp->view_mask != 0;

   nvk_lower_nir(dev, nir, info->robustness, is_multiview,
                 info->set_layout_count, info->set_layouts,
                 &shader->cbuf_map);

   struct nak_fs_key fs_key_tmp, *fs_key = NULL;
   if (nir->info.stage == MESA_SHADER_FRAGMENT) {
      nvk_populate_fs_key(&fs_key_tmp, state);
      fs_key = &fs_key_tmp;
   }

   result = nvk_compile_nir(dev, nir, info->flags, info->robustness,
                            fs_key, shader);
   ralloc_free(nir);
   if (result != VK_SUCCESS) {
      nvk_shader_destroy(&dev->vk, &shader->vk, pAllocator);
      return result;
   }

   result = nvk_shader_upload(dev, shader);
   if (result != VK_SUCCESS) {
      nvk_shader_destroy(&dev->vk, &shader->vk, pAllocator);
      return result;
   }

   if (info->stage == MESA_SHADER_FRAGMENT) {
      if (shader->info.fs.uses_sample_shading) {
         shader->min_sample_shading = 1;
      } else if (state != NULL && state->ms != NULL &&
                 state->ms->sample_shading_enable) {
         shader->min_sample_shading =
            CLAMP(state->ms->min_sample_shading, 0, 1);
      } else {
         shader->min_sample_shading = 0;
      }
   }

   *shader_out = &shader->vk;

   return VK_SUCCESS;
}

VkResult
nvk_compile_nir_shader(struct nvk_device *dev, nir_shader *nir,
                       const VkAllocationCallbacks *alloc,
                       struct nvk_shader **shader_out)
{
   struct nvk_physical_device *pdev = nvk_device_physical(dev);

   const struct vk_pipeline_robustness_state rs_none = {
      .uniform_buffers = VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_DISABLED_EXT,
      .storage_buffers = VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_DISABLED_EXT,
      .images = VK_PIPELINE_ROBUSTNESS_IMAGE_BEHAVIOR_ROBUST_IMAGE_ACCESS_2_EXT,
   };

   assert(nir->info.stage == MESA_SHADER_COMPUTE);
   if (nir->options == NULL)
      nir->options = nvk_get_nir_options(&pdev->vk, nir->info.stage, &rs_none);

   struct vk_shader_compile_info info = {
      .stage = nir->info.stage,
      .nir = nir,
      .robustness = &rs_none,
   };

   struct vk_shader *shader;
   VkResult result = nvk_compile_shader(dev, &info, NULL, alloc, &shader);
   if (result != VK_SUCCESS)
      return result;

   *shader_out = container_of(shader, struct nvk_shader, vk);

   return VK_SUCCESS;
}

static VkResult
nvk_compile_shaders(struct vk_device *vk_dev, uint32_t shader_count,
                    struct vk_shader_compile_info *infos,
                    const struct vk_graphics_pipeline_state *state,
                    const VkAllocationCallbacks *pAllocator,
                    struct vk_shader **shaders_out)
{
   struct nvk_device *dev = container_of(vk_dev, struct nvk_device, vk);

   for (uint32_t i = 0; i < shader_count; i++) {
      VkResult result = nvk_compile_shader(dev, &infos[i], state,
                                           pAllocator, &shaders_out[i]);
      if (result != VK_SUCCESS) {
         /* Clean up all the shaders before this point */
         for (uint32_t j = 0; j < i; j++)
            nvk_shader_destroy(&dev->vk, shaders_out[j], pAllocator);

         /* Clean up all the NIR after this point */
         for (uint32_t j = i + 1; j < shader_count; j++)
            ralloc_free(infos[j].nir);

         /* Memset the output array */
         memset(shaders_out, 0, shader_count * sizeof(*shaders_out));

         return result;
      }
   }

   return VK_SUCCESS;
}

static VkResult
nvk_deserialize_shader(struct vk_device *vk_dev, struct blob_reader *blob,
                       uint32_t binary_version,
                       const VkAllocationCallbacks *pAllocator,
                       struct vk_shader **shader_out)
{
   struct nvk_device *dev = container_of(vk_dev, struct nvk_device, vk);
   struct nvk_shader *shader;
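
   /* The reads below must match the order nvk_shader_serialize() writes:
    * info, cbuf_map, min_sample_shading, code_size, data_size, then the raw
    * code and data blobs.
    */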
   VkResult result;

   struct nak_shader_info info;
   blob_copy_bytes(blob, &info, sizeof(info));

   struct nvk_cbuf_map cbuf_map;
   blob_copy_bytes(blob, &cbuf_map, sizeof(cbuf_map));

   float min_sample_shading;
   blob_copy_bytes(blob, &min_sample_shading, sizeof(min_sample_shading));

   const uint32_t code_size = blob_read_uint32(blob);
   const uint32_t data_size = blob_read_uint32(blob);
   if (blob->overrun)
      return vk_error(dev, VK_ERROR_INCOMPATIBLE_SHADER_BINARY_EXT);

   shader = vk_shader_zalloc(&dev->vk, &nvk_shader_ops, info.stage,
                             pAllocator, sizeof(*shader));
   if (shader == NULL)
      return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY);

   shader->info = info;
   shader->cbuf_map = cbuf_map;
   shader->min_sample_shading = min_sample_shading;
   shader->code_size = code_size;
   shader->data_size = data_size;

   shader->code_ptr = malloc(code_size);
   if (shader->code_ptr == NULL) {
      nvk_shader_destroy(&dev->vk, &shader->vk, pAllocator);
      return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY);
   }

   shader->data_ptr = malloc(data_size);
   if (shader->data_ptr == NULL) {
      nvk_shader_destroy(&dev->vk, &shader->vk, pAllocator);
      return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY);
   }

   blob_copy_bytes(blob, (void *)shader->code_ptr, shader->code_size);
   blob_copy_bytes(blob, (void *)shader->data_ptr, shader->data_size);
   if (blob->overrun) {
      nvk_shader_destroy(&dev->vk, &shader->vk, pAllocator);
      return vk_error(dev, VK_ERROR_INCOMPATIBLE_SHADER_BINARY_EXT);
   }

   result = nvk_shader_upload(dev, shader);
   if (result != VK_SUCCESS) {
      nvk_shader_destroy(&dev->vk, &shader->vk, pAllocator);
      return result;
   }

   *shader_out = &shader->vk;

   return VK_SUCCESS;
}

static bool
nvk_shader_serialize(struct vk_device *vk_dev,
                     const struct vk_shader *vk_shader,
                     struct blob *blob)
{
   struct nvk_shader *shader = container_of(vk_shader, struct nvk_shader, vk);

   /* We can't currently cache assembly */
   if (shader->nak != NULL && shader->nak->asm_str != NULL)
      return false;

   blob_write_bytes(blob, &shader->info, sizeof(shader->info));
   blob_write_bytes(blob, &shader->cbuf_map, sizeof(shader->cbuf_map));
   blob_write_bytes(blob, &shader->min_sample_shading,
                    sizeof(shader->min_sample_shading));
   blob_write_uint32(blob, shader->code_size);
   blob_write_uint32(blob, shader->data_size);
   blob_write_bytes(blob, shader->code_ptr, shader->code_size);
   blob_write_bytes(blob, shader->data_ptr, shader->data_size);

   return !blob->out_of_memory;
}

#define WRITE_STR(field, ...) \
   ({ \
      memset(field, 0, sizeof(field)); \
      UNUSED int i = snprintf(field, sizeof(field), __VA_ARGS__); \
      assert(i > 0 && i < sizeof(field)); \
   })

static VkResult
nvk_shader_get_executable_properties(
   UNUSED struct vk_device *device,
   const struct vk_shader *vk_shader,
   uint32_t *executable_count,
   VkPipelineExecutablePropertiesKHR *properties)
{
   struct nvk_shader *shader = container_of(vk_shader, struct nvk_shader, vk);
   VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutablePropertiesKHR, out,
                          properties, executable_count);

   vk_outarray_append_typed(VkPipelineExecutablePropertiesKHR, &out, props) {
      props->stages = mesa_to_vk_shader_stage(shader->info.stage);
      props->subgroupSize = 32;
      WRITE_STR(props->name, "%s",
                _mesa_shader_stage_to_string(shader->info.stage));
      WRITE_STR(props->description, "%s shader",
                _mesa_shader_stage_to_string(shader->info.stage));
   }

   return vk_outarray_status(&out);
}

static VkResult
nvk_shader_get_executable_statistics(
   UNUSED struct vk_device *device,
   const struct vk_shader *vk_shader,
   uint32_t executable_index,
   uint32_t *statistic_count,
   VkPipelineExecutableStatisticKHR *statistics)
{
   struct nvk_shader *shader = container_of(vk_shader, struct nvk_shader, vk);
   VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutableStatisticKHR, out,
                          statistics, statistic_count);

   assert(executable_index == 0);

   vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
      WRITE_STR(stat->name, "Instruction count");
      WRITE_STR(stat->description,
                "Number of instructions used by this shader");
      stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
      stat->value.u64 = shader->info.num_instrs;
   }

   vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
      WRITE_STR(stat->name, "Code Size");
      WRITE_STR(stat->description,
                "Size of the compiled shader binary, in bytes");
      stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
      stat->value.u64 = shader->code_size;
   }

   vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
      WRITE_STR(stat->name, "Number of GPRs");
      WRITE_STR(stat->description, "Number of GPRs used by this pipeline");
      stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
      stat->value.u64 = shader->info.num_gprs;
   }

   vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
      WRITE_STR(stat->name, "SLM Size");
      WRITE_STR(stat->description,
                "Size of shader local (scratch) memory, in bytes");
      stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
      stat->value.u64 = shader->info.slm_size;
   }

   return vk_outarray_status(&out);
}

static bool
write_ir_text(VkPipelineExecutableInternalRepresentationKHR *ir,
              const char *data)
{
   ir->isText = VK_TRUE;

   size_t data_len = strlen(data) + 1;

   if (ir->pData == NULL) {
      ir->dataSize = data_len;
      return true;
   }

   strncpy(ir->pData, data, ir->dataSize);
   if (ir->dataSize < data_len)
      return false;

   ir->dataSize = data_len;
   return true;
}

static VkResult
nvk_shader_get_executable_internal_representations(
   UNUSED struct vk_device *device,
   const struct vk_shader *vk_shader,
   uint32_t executable_index,
   uint32_t *internal_representation_count,
   VkPipelineExecutableInternalRepresentationKHR *internal_representations)
{
   struct nvk_shader *shader = container_of(vk_shader, struct nvk_shader, vk);
   VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutableInternalRepresentationKHR, out,
                          internal_representations,
                          internal_representation_count);
   bool incomplete_text = false;

   assert(executable_index == 0);

   if (shader->nak != NULL && shader->nak->asm_str != NULL) {
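      /* asm_str is only non-NULL for NAK-compiled shaders that were built
       * with VK_SHADER_CREATE_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_MESA
       * (dump_asm in nvk_compile_nir_with_nak).
       */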
      vk_outarray_append_typed(VkPipelineExecutableInternalRepresentationKHR,
                               &out, ir) {
         WRITE_STR(ir->name, "NAK assembly");
         WRITE_STR(ir->description, "NAK assembly");
         if (!write_ir_text(ir, shader->nak->asm_str))
            incomplete_text = true;
      }
   }

   return incomplete_text ? VK_INCOMPLETE : vk_outarray_status(&out);
}

static const struct vk_shader_ops nvk_shader_ops = {
   .destroy = nvk_shader_destroy,
   .serialize = nvk_shader_serialize,
   .get_executable_properties = nvk_shader_get_executable_properties,
   .get_executable_statistics = nvk_shader_get_executable_statistics,
   .get_executable_internal_representations =
      nvk_shader_get_executable_internal_representations,
};

const struct vk_device_shader_ops nvk_device_shader_ops = {
   .get_nir_options = nvk_get_nir_options,
   .get_spirv_options = nvk_get_spirv_options,
   .preprocess_nir = nvk_preprocess_nir,
   .hash_graphics_state = nvk_hash_graphics_state,
   .compile = nvk_compile_shaders,
   .deserialize = nvk_deserialize_shader,
   .cmd_set_dynamic_graphics_state = vk_cmd_set_dynamic_graphics_state,
   .cmd_bind_shaders = nvk_cmd_bind_shaders,
};