/*
 * Copyright 2024 Valve Corporation
 * Copyright 2024 Alyssa Rosenzweig
 * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc.
 * SPDX-License-Identifier: MIT
 */
#include "hk_shader.h"

#include "agx_helpers.h"
#include "agx_nir_lower_gs.h"
#include "glsl_types.h"
#include "nir.h"
#include "nir_builder.h"

#include "agx_bo.h"
#include "hk_cmd_buffer.h"
#include "hk_descriptor_set_layout.h"
#include "hk_device.h"
#include "hk_physical_device.h"
#include "hk_sampler.h"
#include "hk_shader.h"
#include "nir_builder_opcodes.h"
#include "nir_builtin_builder.h"
#include "nir_intrinsics.h"
#include "nir_intrinsics_indices.h"
#include "nir_xfb_info.h"
#include "shader_enums.h"
#include "vk_nir_convert_ycbcr.h"
#include "vk_pipeline.h"
#include "vk_pipeline_layout.h"
#include "vk_shader_module.h"
#include "vk_ycbcr_conversion.h"

#include "asahi/compiler/agx_compile.h"
#include "asahi/lib/agx_linker.h"
#include "asahi/lib/agx_nir_passes.h"
#include "asahi/lib/agx_tilebuffer.h"
#include "asahi/lib/agx_uvs.h"
#include "compiler/spirv/nir_spirv.h"
#include "util/blob.h"
#include "util/hash_table.h"
#include "util/macros.h"
#include "util/mesa-sha1.h"
#include "util/simple_mtx.h"
#include "util/u_debug.h"
#include "vulkan/vulkan_core.h"

struct hk_fs_key {
   bool zs_self_dep;

   /** True if sample shading is forced on via an API knob such as
    * VkPipelineMultisampleStateCreateInfo::minSampleShading
    */
   bool force_sample_shading;

   uint8_t pad[2];
};
static_assert(sizeof(struct hk_fs_key) == 4, "packed");

static void
shared_var_info(const struct glsl_type *type, unsigned *size, unsigned *align)
{
   assert(glsl_type_is_vector_or_scalar(type));

   uint32_t comp_size =
      glsl_type_is_boolean(type) ? 4 : glsl_get_bit_size(type) / 8;
   unsigned length = glsl_get_vector_elements(type);
   *size = comp_size * length, *align = comp_size;
}

uint64_t
hk_physical_device_compiler_flags(const struct hk_physical_device *pdev)
{
   /* TODO compiler flags */
   return 0;
}

const nir_shader_compiler_options *
hk_get_nir_options(struct vk_physical_device *vk_pdev, gl_shader_stage stage,
                   UNUSED const struct vk_pipeline_robustness_state *rs)
{
   return &agx_nir_options;
}

static struct spirv_to_nir_options
hk_get_spirv_options(struct vk_physical_device *vk_pdev,
                     UNUSED gl_shader_stage stage,
                     const struct vk_pipeline_robustness_state *rs)
{
   return (struct spirv_to_nir_options){
      .ssbo_addr_format = hk_buffer_addr_format(rs->storage_buffers),
      .phys_ssbo_addr_format = nir_address_format_64bit_global,
      .ubo_addr_format = hk_buffer_addr_format(rs->uniform_buffers),
      .shared_addr_format = nir_address_format_32bit_offset,
      .min_ssbo_alignment = HK_MIN_SSBO_ALIGNMENT,
      .min_ubo_alignment = HK_MIN_UBO_ALIGNMENT,
   };
}

static bool
lower_halt_to_return(nir_builder *b, nir_instr *instr, UNUSED void *_data)
{
   if (instr->type != nir_instr_type_jump)
      return false;

   nir_jump_instr *jump = nir_instr_as_jump(instr);
   if (jump->type != nir_jump_halt)
      return false;

   assert(b->impl == nir_shader_get_entrypoint(b->shader));
   jump->type = nir_jump_return;
   return true;
}

void
hk_preprocess_nir_internal(struct vk_physical_device *vk_pdev, nir_shader *nir)
{
   /* Must lower before io to temps */
   if (nir->info.stage == MESA_SHADER_FRAGMENT) {
      NIR_PASS(_, nir, nir_lower_terminate_to_demote);
      NIR_PASS(_, nir, nir_shader_instructions_pass, lower_halt_to_return,
               nir_metadata_all, NULL);
      NIR_PASS(_, nir, nir_lower_returns);
   }

   /* Unroll loops before lowering indirects via nir_lower_io_to_temporaries */
   UNUSED bool progress = false;
   NIR_PASS(_, nir, nir_lower_global_vars_to_local);

   do {
      progress = false;

      NIR_PASS(progress, nir, nir_lower_vars_to_ssa);
      NIR_PASS(progress, nir, nir_copy_prop);
      NIR_PASS(progress, nir, nir_opt_dce);
      NIR_PASS(progress, nir, nir_opt_constant_folding);
      NIR_PASS(progress, nir, nir_opt_loop);
      NIR_PASS(progress, nir, nir_opt_loop_unroll);
   } while (progress);

   if (nir->info.stage == MESA_SHADER_FRAGMENT) {
      struct nir_lower_sysvals_to_varyings_options sysvals_opts = {
         .point_coord = true,
      };

      nir_lower_sysvals_to_varyings(nir, &sysvals_opts);
   }

   NIR_PASS(_, nir, nir_lower_system_values);

   /* Gather info before preprocess_nir but after some general lowering, so
    * inputs_read and system_values_read are accurately set.
    */
   nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));

   NIR_PASS_V(nir, nir_lower_io_to_temporaries, nir_shader_get_entrypoint(nir),
              true, false);

   NIR_PASS(_, nir, nir_lower_global_vars_to_local);
   NIR_PASS(_, nir, nir_split_var_copies);
   NIR_PASS(_, nir, nir_split_struct_vars, nir_var_function_temp);

   /* Optimize but allow copies because we haven't lowered them yet */
   agx_preprocess_nir(nir, NULL);

   NIR_PASS(_, nir, nir_lower_load_const_to_scalar);
   NIR_PASS(_, nir, nir_lower_var_copies);
}

static void
hk_preprocess_nir(struct vk_physical_device *vk_pdev, nir_shader *nir)
{
   hk_preprocess_nir_internal(vk_pdev, nir);

   nir_lower_compute_system_values_options csv_options = {
      .has_base_workgroup_id = true,
   };
   NIR_PASS(_, nir, nir_lower_compute_system_values, &csv_options);
}

static void
hk_populate_fs_key(struct hk_fs_key *key,
                   const struct vk_graphics_pipeline_state *state)
{
   memset(key, 0, sizeof(*key));
   if (state == NULL)
      return;

   if (state->pipeline_flags &
       VK_PIPELINE_CREATE_2_DEPTH_STENCIL_ATTACHMENT_FEEDBACK_LOOP_BIT_EXT)
      key->zs_self_dep = true;

   /* We force per-sample interpolation whenever sampleShadingEnable is set
    * regardless of minSampleShading or rasterizationSamples.
    *
    * When sampleShadingEnable is set, few guarantees are made about the
    * location of interpolation of the inputs. The only real guarantees are
    * that the inputs are interpolated within the pixel and that you get at
    * least `rasterizationSamples * minSampleShading` unique positions.
    * Importantly, it does not require that when `rasterizationSamples *
    * minSampleShading <= 1.0` that those positions are at the fragment
    * center. Therefore, it's valid to just always do per-sample all the time.
    *
    * The one caveat here is that we have to be careful about gl_SampleMaskIn.
    * When `hk_fs_key::force_sample_shading = true` we also turn any reads of
    * gl_SampleMaskIn into `1 << gl_SampleID` because the hardware sample mask
    * is actually per-fragment, not per-pass. We handle this by smashing
    * minSampleShading to 1.0 whenever gl_SampleMaskIn is read.
    */
   const struct vk_multisample_state *ms = state->ms;
   if (ms != NULL && ms->sample_shading_enable)
      key->force_sample_shading = true;
}

static void
hk_hash_graphics_state(struct vk_physical_device *device,
                       const struct vk_graphics_pipeline_state *state,
                       VkShaderStageFlags stages, blake3_hash blake3_out)
{
   struct mesa_blake3 blake3_ctx;
   _mesa_blake3_init(&blake3_ctx);

   if (stages & VK_SHADER_STAGE_FRAGMENT_BIT) {
      struct hk_fs_key key;
      hk_populate_fs_key(&key, state);
      _mesa_blake3_update(&blake3_ctx, &key, sizeof(key));

      const bool is_multiview = state->rp->view_mask != 0;
      _mesa_blake3_update(&blake3_ctx, &is_multiview, sizeof(is_multiview));
   }
   _mesa_blake3_final(&blake3_ctx, blake3_out);
}

static bool
lower_load_global_constant_offset_instr(nir_builder *b,
                                        nir_intrinsic_instr *intrin,
                                        void *data)
{
   if (intrin->intrinsic != nir_intrinsic_load_global_constant_offset &&
       intrin->intrinsic != nir_intrinsic_load_global_constant_bounded)
      return false;

   b->cursor = nir_before_instr(&intrin->instr);

   bool *has_soft_fault = data;
   nir_def *base_addr = intrin->src[0].ssa;
   nir_def *offset = intrin->src[1].ssa;

   nir_def *zero = NULL;
   nir_def *in_bounds = NULL;
   if (intrin->intrinsic == nir_intrinsic_load_global_constant_bounded) {
      nir_def *bound = intrin->src[2].ssa;

      unsigned bit_size = intrin->def.bit_size;
      assert(bit_size >= 8 && bit_size % 8 == 0);
      unsigned byte_size = bit_size / 8;

      zero = nir_imm_zero(b, intrin->num_components, bit_size);

      unsigned load_size = byte_size * intrin->num_components;

      nir_def *sat_offset =
         nir_umin(b, offset, nir_imm_int(b, UINT32_MAX - (load_size - 1)));
      in_bounds =
         nir_ilt(b, nir_iadd_imm(b, sat_offset, load_size - 1), bound);

      /* If we do not have soft fault, we branch around the load with a bounds
       * check. This is slow; fortunately, we always have soft fault for
       * release drivers.
       *
       * With soft fault, we speculatively load and smash to zero at the end.
       */
      if (!(*has_soft_fault))
         nir_push_if(b, in_bounds);
   }

   nir_def *val = nir_build_load_global_constant(
      b, intrin->def.num_components, intrin->def.bit_size,
      nir_iadd(b, base_addr, nir_u2u64(b, offset)),
      .align_mul = nir_intrinsic_align_mul(intrin),
      .align_offset = nir_intrinsic_align_offset(intrin),
      .access = nir_intrinsic_access(intrin));

   if (intrin->intrinsic == nir_intrinsic_load_global_constant_bounded) {
      if (*has_soft_fault) {
         val = nir_bcsel(b, in_bounds, val, zero);
      } else {
         nir_pop_if(b, NULL);
         val = nir_if_phi(b, val, zero);
      }
   }

   nir_def_replace(&intrin->def, val);

   return true;
}

struct lower_ycbcr_state {
   uint32_t set_layout_count;
   struct vk_descriptor_set_layout *const *set_layouts;
};

static const struct vk_ycbcr_conversion_state *
lookup_ycbcr_conversion(const void *_state, uint32_t set, uint32_t binding,
                        uint32_t array_index)
{
   const struct lower_ycbcr_state *state = _state;
   assert(set < state->set_layout_count);
   assert(state->set_layouts[set] != NULL);
   const struct hk_descriptor_set_layout *set_layout =
      vk_to_hk_descriptor_set_layout(state->set_layouts[set]);
   assert(binding < set_layout->binding_count);

   const struct hk_descriptor_set_binding_layout *bind_layout =
      &set_layout->binding[binding];

   if (bind_layout->immutable_samplers == NULL)
      return NULL;

   array_index = MIN2(array_index, bind_layout->array_size - 1);

   const struct hk_sampler *sampler =
      bind_layout->immutable_samplers[array_index];

   return sampler && sampler->vk.ycbcr_conversion
             ? &sampler->vk.ycbcr_conversion->state
             : NULL;
}

static inline bool
nir_has_image_var(nir_shader *nir)
{
   nir_foreach_image_variable(_, nir)
      return true;

   return false;
}

static int
glsl_type_size(const struct glsl_type *type, bool bindless)
{
   return glsl_count_attribute_slots(type, false);
}

/*
 * This is the world's worst multiview implementation. We simply duplicate each
 * draw on the CPU side, changing a uniform in between, and then plumb the view
 * index into the layer ID here. Whatever, it works.
 *
 * The "proper" implementation on AGX would use vertex amplification, but a
 * MacBook is not a VR headset.
 */
static void
hk_lower_multiview(nir_shader *nir)
{
   /* If there's an existing layer ID write, ignore it. This avoids validation
    * splat with vk_meta.
    */
   nir_variable *existing = nir_find_variable_with_location(
      nir, nir_var_shader_out, VARYING_SLOT_LAYER);

   if (existing) {
      existing->data.mode = nir_var_shader_temp;
      existing->data.location = 0;
      nir_fixup_deref_modes(nir);
   }

   /* Now write the view index as the layer */
   nir_builder b =
      nir_builder_at(nir_after_impl(nir_shader_get_entrypoint(nir)));

   nir_variable *layer =
      nir_variable_create(nir, nir_var_shader_out, glsl_uint_type(), NULL);
   layer->data.location = VARYING_SLOT_LAYER;

   nir_store_var(&b, layer, nir_load_view_index(&b), nir_component_mask(1));
   b.shader->info.outputs_written |= VARYING_BIT_LAYER;
}

/*
 * KHR_maintenance5 requires that points rasterize with a default point size of
 * 1.0, while our hardware requires an explicit point size write for this.
 * Since topology may be dynamic, we insert an unconditional write if
 * necessary.
 */
static bool
hk_nir_insert_psiz_write(nir_shader *nir)
{
   nir_function_impl *impl = nir_shader_get_entrypoint(nir);

   if (nir->info.outputs_written & VARYING_BIT_PSIZ) {
      nir_metadata_preserve(impl, nir_metadata_all);
      return false;
   }

   nir_builder b = nir_builder_at(nir_after_impl(impl));

   nir_store_output(&b, nir_imm_float(&b, 1.0), nir_imm_int(&b, 0),
                    .write_mask = nir_component_mask(1),
                    .io_semantics.location = VARYING_SLOT_PSIZ,
                    .io_semantics.num_slots = 1,
                    .src_type = nir_type_float32);

   nir->info.outputs_written |= VARYING_BIT_PSIZ;
   nir_metadata_preserve(b.impl, nir_metadata_control_flow);
   return true;
}

static nir_def *
query_custom_border(nir_builder *b, nir_tex_instr *tex)
{
   return nir_build_texture_query(b, tex, nir_texop_custom_border_color_agx, 4,
                                  tex->dest_type, false, false);
}

static nir_def *
has_custom_border(nir_builder *b, nir_tex_instr *tex)
{
   return nir_build_texture_query(b, tex,
                                  nir_texop_has_custom_border_color_agx, 1,
                                  nir_type_bool1, false, false);
}

static bool
lower(nir_builder *b, nir_instr *instr, UNUSED void *_data)
{
   if (instr->type != nir_instr_type_tex)
      return false;

   nir_tex_instr *tex = nir_instr_as_tex(instr);
   if (!nir_tex_instr_need_sampler(tex) || nir_tex_instr_is_query(tex))
      return false;

   /* XXX: this is a really weird edge case, is this even well-defined? */
   if (tex->is_shadow)
      return false;

   b->cursor = nir_after_instr(&tex->instr);
   nir_def *has_custom = has_custom_border(b, tex);

   nir_instr *orig = nir_instr_clone(b->shader, &tex->instr);
   nir_builder_instr_insert(b, orig);
   nir_def *clamp_to_1 = &nir_instr_as_tex(orig)->def;

   nir_push_if(b, has_custom);
   nir_def *replaced = NULL;
   {
      /* Sample again, this time with clamp-to-0 instead of clamp-to-1 */
      nir_instr *clone_instr = nir_instr_clone(b->shader, &tex->instr);
      nir_builder_instr_insert(b, clone_instr);

      nir_tex_instr *tex_0 = nir_instr_as_tex(clone_instr);
      nir_def *clamp_to_0 = &tex_0->def;

      tex_0->backend_flags |= AGX_TEXTURE_FLAG_CLAMP_TO_0;

      /* Grab the border colour */
      nir_def *border = query_custom_border(b, tex_0);

      if (tex->op == nir_texop_tg4) {
         border = nir_replicate(b, nir_channel(b, border, tex->component), 4);
      }

      /* Combine together with the border */
      if (nir_alu_type_get_base_type(tex->dest_type) == nir_type_float &&
          tex->op != nir_texop_tg4) {
         /* For floats, lerp together:
          *
          * For border texels:  (1 * border) + (0 * border) = border
          * For regular texels: (x * border) + (x * (1 - border)) = x.
          *
          * Linear filtering is linear (duh), so lerping is compatible.
          */
         replaced = nir_flrp(b, clamp_to_0, clamp_to_1, border);
      } else {
         /* For integers, just select componentwise since there is no linear
          * filtering. Gathers also use this path since they are unfiltered in
          * each component.
          */
         replaced = nir_bcsel(b, nir_ieq(b, clamp_to_0, clamp_to_1),
                              clamp_to_0, border);
      }
   }
   nir_pop_if(b, NULL);

   /* Put it together with a phi */
   nir_def *phi = nir_if_phi(b, replaced, clamp_to_1);
   nir_def_replace(&tex->def, phi);
   return true;
}

static bool
agx_nir_lower_custom_border(nir_shader *nir)
{
   return nir_shader_instructions_pass(nir, lower, nir_metadata_none, NULL);
}

/*
 * In Vulkan, the VIEWPORT should read 0 in the fragment shader if it is not
 * written by the vertex shader, but in our implementation, the varying would
 * otherwise be undefined. This small pass predicates VIEWPORT reads based on
 * whether the hardware vertex shader writes the VIEWPORT (nonzero UVS index).
 */
static bool
lower_viewport_fs(nir_builder *b, nir_intrinsic_instr *intr, UNUSED void *data)
{
   if (intr->intrinsic != nir_intrinsic_load_input)
      return false;

   nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
   if (sem.location != VARYING_SLOT_VIEWPORT)
      return false;

   b->cursor = nir_after_instr(&intr->instr);
   nir_def *orig = &intr->def;

   nir_def *uvs = nir_load_uvs_index_agx(b, .io_semantics = sem);
   nir_def *def =
      nir_bcsel(b, nir_ine_imm(b, uvs, 0), orig, nir_imm_int(b, 0));

   nir_def_rewrite_uses_after(orig, def, def->parent_instr);
   return true;
}

static bool
lower_subpass_dim(nir_builder *b, nir_instr *instr, UNUSED void *_data)
{
   if (instr->type != nir_instr_type_tex)
      return false;

   nir_tex_instr *tex = nir_instr_as_tex(instr);
   if (tex->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS)
      tex->sampler_dim = GLSL_SAMPLER_DIM_2D;
   else if (tex->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS)
      tex->sampler_dim = GLSL_SAMPLER_DIM_MS;
   else
      return false;

   return true;
}

void
hk_lower_nir(struct hk_device *dev, nir_shader *nir,
             const struct vk_pipeline_robustness_state *rs, bool is_multiview,
             uint32_t set_layout_count,
             struct vk_descriptor_set_layout *const *set_layouts)
{
   if (nir->info.stage == MESA_SHADER_FRAGMENT) {
      NIR_PASS(_, nir, nir_lower_input_attachments,
               &(nir_input_attachment_options){
                  .use_fragcoord_sysval = true,
                  .use_layer_id_sysval = true,
                  .use_view_id_for_layer = is_multiview,
               });

      NIR_PASS(_, nir, nir_shader_instructions_pass, lower_subpass_dim,
               nir_metadata_all, NULL);
      NIR_PASS(_, nir, nir_lower_wpos_center);
   }

   /* XXX: should be last geometry stage, how do I get to that? */
   if (nir->info.stage == MESA_SHADER_VERTEX) {
      if (is_multiview)
         hk_lower_multiview(nir);
   }

   if (nir->info.stage == MESA_SHADER_TESS_EVAL) {
      NIR_PASS(_, nir, nir_lower_patch_vertices,
               nir->info.tess.tcs_vertices_out, NULL);
   }

   const struct lower_ycbcr_state ycbcr_state = {
      .set_layout_count = set_layout_count,
      .set_layouts = set_layouts,
   };
   NIR_PASS(_, nir, nir_vk_lower_ycbcr_tex, lookup_ycbcr_conversion,
            &ycbcr_state);

   /* Lower push constants before lower_descriptors */
   NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_push_const,
            nir_address_format_32bit_offset);

   // NIR_PASS(_, nir, nir_opt_large_constants, NULL, 32);

   /* Images accessed through the texture or PBE hardware are robust, so we
    * don't set lower_image. (There are some sticky details around txf but
    * they're handled by agx_nir_lower_texture). However, image atomics are
    * software so require robustness lowering.
    */
   nir_lower_robust_access_options robustness = {
      .lower_image_atomic = true,
   };

   NIR_PASS(_, nir, nir_lower_robust_access, &robustness);

   /* We must do early lowering before hk_nir_lower_descriptors, since this
    * will create lod_bias_agx instructions.
    */
   NIR_PASS(_, nir, agx_nir_lower_texture_early, true /* support_lod_bias */);
   NIR_PASS(_, nir, agx_nir_lower_custom_border);

   NIR_PASS(_, nir, hk_nir_lower_descriptors, rs, set_layout_count,
            set_layouts);
   NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_global,
            nir_address_format_64bit_global);
   NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_ssbo,
            hk_buffer_addr_format(rs->storage_buffers));
   NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_ubo,
            hk_buffer_addr_format(rs->uniform_buffers));

   bool soft_fault = agx_has_soft_fault(&dev->dev);
   NIR_PASS(_, nir, nir_shader_intrinsics_pass,
            lower_load_global_constant_offset_instr, nir_metadata_none,
            &soft_fault);

   if (!nir->info.shared_memory_explicit_layout) {
      /* There may be garbage in shared_size, but it's the job of
       * nir_lower_vars_to_explicit_types to allocate it. We have to reset to
       * avoid overallocation.
       */
      nir->info.shared_size = 0;

      NIR_PASS(_, nir, nir_lower_vars_to_explicit_types, nir_var_mem_shared,
               shared_var_info);
   }

   NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_shared,
            nir_address_format_32bit_offset);

   if (nir->info.zero_initialize_shared_memory && nir->info.shared_size > 0) {
      /* Align everything up to 16B so we can write whole vec4s. */
      nir->info.shared_size = align(nir->info.shared_size, 16);
      NIR_PASS(_, nir, nir_zero_initialize_shared_memory,
               nir->info.shared_size, 16);

      /* We need to call lower_compute_system_values again because
       * nir_zero_initialize_shared_memory generates load_invocation_id which
       * has to be lowered to load_invocation_index.
       */
      NIR_PASS(_, nir, nir_lower_compute_system_values, NULL);
   }

   /* TODO: we can do indirect VS output */
   nir_variable_mode lower_indirect_modes = 0;
   if (nir->info.stage == MESA_SHADER_FRAGMENT)
      lower_indirect_modes |= nir_var_shader_out;
   else if (nir->info.stage == MESA_SHADER_VERTEX)
      lower_indirect_modes |= nir_var_shader_in | nir_var_shader_out;

   NIR_PASS(_, nir, nir_lower_indirect_derefs, lower_indirect_modes,
            UINT32_MAX);

   NIR_PASS(_, nir, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
            glsl_type_size, nir_lower_io_lower_64bit_to_32);

   if (nir->info.stage == MESA_SHADER_FRAGMENT) {
      NIR_PASS(_, nir, nir_shader_intrinsics_pass, lower_viewport_fs,
               nir_metadata_control_flow, NULL);
   }

   NIR_PASS(_, nir, agx_nir_lower_texture);
   NIR_PASS(_, nir, agx_nir_lower_multisampled_image_store);

   agx_preprocess_nir(nir, dev->dev.libagx);

   NIR_PASS(_, nir, nir_opt_conditional_discard);
   NIR_PASS(_, nir, nir_opt_if,
            nir_opt_if_optimize_phi_true_false | nir_opt_if_avoid_64bit_phis);
}

/* Upload the preamble (if present) to its own BO, eagerly link shaders that
 * only have a single linked variant, and pre-pack the hardware control words
 * that depend only on the compiled shader.
 */
static void
hk_upload_shader(struct hk_device *dev, struct hk_shader *shader)
{
   if (shader->b.info.has_preamble) {
      unsigned offs = shader->b.info.preamble_offset;
      assert(offs < shader->b.binary_size);

      size_t size = shader->b.binary_size - offs;
      assert(size > 0);

      shader->bo = agx_bo_create(&dev->dev, size, 0,
                                 AGX_BO_EXEC | AGX_BO_LOW_VA, "Preamble");
      memcpy(shader->bo->map, shader->b.binary + offs, size);
      shader->preamble_addr = shader->bo->va->addr;
   }

   if (!shader->linked.ht) {
      /* If we only have a single variant, link now. */
      shader->only_linked = hk_fast_link(dev, false, shader, NULL, NULL, 0);
   }

   if (shader->info.stage == MESA_SHADER_FRAGMENT) {
      agx_pack(&shader->frag_face, FRAGMENT_FACE_2, cfg) {
         cfg.conservative_depth =
            agx_translate_depth_layout(shader->b.info.depth_layout);
      }
   }

   agx_pack(&shader->counts, COUNTS, cfg) {
      cfg.uniform_register_count = shader->b.info.push_count;
      cfg.preshader_register_count = shader->b.info.nr_preamble_gprs;
      cfg.sampler_state_register_count = agx_translate_sampler_state_count(
         shader->b.info.uses_txf ? 1 : 0, false);
   }
}

DERIVE_HASH_TABLE(hk_fast_link_key_vs);
DERIVE_HASH_TABLE(hk_fast_link_key_fs);

static VkResult
hk_init_link_ht(struct hk_shader *shader, gl_shader_stage sw_stage)
{
   simple_mtx_init(&shader->linked.lock, mtx_plain);

   bool multiple_variants =
      sw_stage == MESA_SHADER_VERTEX || sw_stage == MESA_SHADER_FRAGMENT;

   if (!multiple_variants)
      return VK_SUCCESS;

   if (sw_stage == MESA_SHADER_VERTEX)
      shader->linked.ht = hk_fast_link_key_vs_table_create(NULL);
   else
      shader->linked.ht = hk_fast_link_key_fs_table_create(NULL);

   return (shader->linked.ht == NULL) ? VK_ERROR_OUT_OF_HOST_MEMORY
                                      : VK_SUCCESS;
}

static VkResult
hk_compile_nir(struct hk_device *dev, const VkAllocationCallbacks *pAllocator,
               nir_shader *nir, VkShaderCreateFlagsEXT shader_flags,
               const struct vk_pipeline_robustness_state *rs,
               const struct hk_fs_key *fs_key, struct hk_shader *shader,
               gl_shader_stage sw_stage, bool hw, nir_xfb_info *xfb_info)
{
   unsigned vs_uniform_base = 0;

   /* For now, only shader objects are supported */
   if (sw_stage == MESA_SHADER_VERTEX) {
      vs_uniform_base =
         6 * DIV_ROUND_UP(
                BITSET_LAST_BIT(shader->info.vs.attrib_components_read), 4);
   } else if (sw_stage == MESA_SHADER_FRAGMENT) {
      shader->info.fs.interp = agx_gather_interp_info(nir);
      shader->info.fs.writes_memory = nir->info.writes_memory;

      /* Discards must be lowered before lowering MSAA to handle discards */
      NIR_PASS(_, nir, agx_nir_lower_discard_zs_emit);

      NIR_PASS(_, nir, agx_nir_lower_fs_output_to_epilog,
               &shader->info.fs.epilog_key);

      NIR_PASS(_, nir, agx_nir_lower_sample_mask);

      if (nir->info.fs.uses_sample_shading) {
         /* Ensure the sample ID is preserved in register */
         nir_builder b =
            nir_builder_at(nir_after_impl(nir_shader_get_entrypoint(nir)));
         nir_export_agx(&b, nir_load_exported_agx(&b, 1, 16, .base = 1),
                        .base = 1);

         NIR_PASS(_, nir, agx_nir_lower_to_per_sample);
      }

      NIR_PASS(_, nir, agx_nir_lower_fs_active_samples_to_register);
      NIR_PASS(_, nir, agx_nir_lower_interpolation);
   } else if (sw_stage == MESA_SHADER_TESS_EVAL) {
      shader->info.ts.ccw = nir->info.tess.ccw;
      shader->info.ts.point_mode = nir->info.tess.point_mode;
      shader->info.ts.spacing = nir->info.tess.spacing;
      shader->info.ts.mode = nir->info.tess._primitive_mode;

      if (nir->info.tess.point_mode) {
         shader->info.ts.out_prim = MESA_PRIM_POINTS;
      } else if (nir->info.tess._primitive_mode == TESS_PRIMITIVE_ISOLINES) {
         shader->info.ts.out_prim = MESA_PRIM_LINES;
      } else {
         shader->info.ts.out_prim = MESA_PRIM_TRIANGLES;
      }

      /* This destroys info so it needs to happen after the gather */
      NIR_PASS(_, nir, agx_nir_lower_tes, dev->dev.libagx, hw);
   } else if (sw_stage == MESA_SHADER_TESS_CTRL) {
      shader->info.tcs.output_patch_size = nir->info.tess.tcs_vertices_out;
      shader->info.tcs.per_vertex_outputs = agx_tcs_per_vertex_outputs(nir);
      shader->info.tcs.nr_patch_outputs =
         util_last_bit(nir->info.patch_outputs_written);
      shader->info.tcs.output_stride = agx_tcs_output_stride(nir);
   }

   uint64_t outputs = nir->info.outputs_written;
   if (!hw &&
       (sw_stage == MESA_SHADER_VERTEX || sw_stage == MESA_SHADER_TESS_EVAL)) {
      nir->info.stage = MESA_SHADER_COMPUTE;
      memset(&nir->info.cs, 0, sizeof(nir->info.cs));
      nir->xfb_info = NULL;
   }

   /* XXX: rename */
   NIR_PASS(_, nir, hk_lower_uvs_index, vs_uniform_base);

#if 0
   /* TODO */
   nir_variable_mode robust2_modes = 0;
   if (rs->uniform_buffers ==
       VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_ROBUST_BUFFER_ACCESS_2_EXT)
      robust2_modes |= nir_var_mem_ubo;
   if (rs->storage_buffers ==
       VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_ROBUST_BUFFER_ACCESS_2_EXT)
      robust2_modes |= nir_var_mem_ssbo;
#endif

   struct agx_shader_key backend_key = {
      .dev = agx_gather_device_key(&dev->dev),
      .reserved_preamble = 128 /* TODO */,
      .libagx = dev->dev.libagx,
      .no_stop = nir->info.stage == MESA_SHADER_FRAGMENT,
      .has_scratch = true,
   };

   /* For now, sample shading is always dynamic. Indicate that. */
   if (nir->info.stage == MESA_SHADER_FRAGMENT &&
       nir->info.fs.uses_sample_shading)
      backend_key.fs.inside_sample_loop = true;

   agx_compile_shader_nir(nir, &backend_key, NULL, &shader->b);

   shader->code_ptr = shader->b.binary;
   shader->code_size = shader->b.binary_size;

   shader->info.stage = sw_stage;
   shader->info.clip_distance_array_size = nir->info.clip_distance_array_size;
   shader->info.cull_distance_array_size = nir->info.cull_distance_array_size;
   shader->b.info.outputs = outputs;

   if (sw_stage == MESA_SHADER_COMPUTE) {
      for (unsigned i = 0; i < 3; ++i)
         shader->info.cs.local_size[i] = nir->info.workgroup_size[i];
   }

   if (xfb_info) {
      assert(xfb_info->output_count < ARRAY_SIZE(shader->info.xfb_outputs));

      memcpy(&shader->info.xfb_info, xfb_info,
             nir_xfb_info_size(xfb_info->output_count));

      typed_memcpy(shader->info.xfb_stride, nir->info.xfb_stride, 4);
   }

   if (nir->constant_data_size > 0) {
      uint32_t data_size = align(nir->constant_data_size, HK_MIN_UBO_ALIGNMENT);

      void *data = malloc(data_size);
      if (data == NULL) {
         ralloc_free(nir);
         return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY);
      }

      memcpy(data, nir->constant_data, nir->constant_data_size);

      assert(nir->constant_data_size <= data_size);
      memset(data + nir->constant_data_size, 0,
             data_size - nir->constant_data_size);

      shader->data_ptr = data;
      shader->data_size = data_size;
   }

   ralloc_free(nir);

   VkResult result = hk_init_link_ht(shader, sw_stage);
   if (result != VK_SUCCESS)
      return vk_error(dev, result);

   hk_upload_shader(dev, shader);

   return VK_SUCCESS;
}

static const struct vk_shader_ops hk_shader_ops;

static void
hk_destroy_linked_shader(struct hk_device *dev,
                         struct hk_linked_shader *linked)
{
   agx_bo_unreference(&dev->dev, linked->b.bo);
   ralloc_free(linked);
}

static void
hk_shader_destroy(struct hk_device *dev, struct hk_shader *s)
{
   free((void *)s->code_ptr);
   free((void *)s->data_ptr);
   agx_bo_unreference(&dev->dev, s->bo);

   simple_mtx_destroy(&s->linked.lock);

   if (s->only_linked)
      hk_destroy_linked_shader(dev, s->only_linked);

   if (s->linked.ht) {
      hash_table_foreach(s->linked.ht, entry) {
         hk_destroy_linked_shader(dev, entry->data);
      }
      _mesa_hash_table_destroy(s->linked.ht, NULL);
   }
}

void
hk_api_shader_destroy(struct vk_device *vk_dev, struct vk_shader *vk_shader,
                      const VkAllocationCallbacks *pAllocator)
{
   struct hk_device *dev = container_of(vk_dev, struct hk_device, vk);
   struct hk_api_shader *obj =
      container_of(vk_shader, struct hk_api_shader, vk);

   hk_foreach_variant(obj, shader) {
      hk_shader_destroy(dev, shader);
   }

   vk_shader_free(&dev->vk, pAllocator, &obj->vk);
}

static void
hk_lower_hw_vs(nir_shader *nir, struct hk_shader *shader)
{
   /* Point size must be clamped; excessively large points don't render
    * properly on G13.
    *
    * Must be synced with pointSizeRange.
    */
   NIR_PASS(_, nir, nir_lower_point_size, 1.0f, 511.95f);

   /* TODO: Optimize out for monolithic? */
   NIR_PASS(_, nir, hk_nir_insert_psiz_write);

   NIR_PASS(_, nir, nir_lower_io_to_scalar, nir_var_shader_out, NULL, NULL);

   NIR_PASS(_, nir, agx_nir_lower_cull_distance_vs);
   NIR_PASS(_, nir, agx_nir_lower_uvs, &shader->info.uvs);

   shader->info.vs.cull_distance_array_size =
      nir->info.cull_distance_array_size;
}

VkResult
hk_compile_shader(struct hk_device *dev, struct vk_shader_compile_info *info,
                  const struct vk_graphics_pipeline_state *state,
                  const VkAllocationCallbacks *pAllocator,
                  struct hk_api_shader **shader_out)
{
   VkResult result;

   /* We consume the NIR, regardless of success or failure */
   nir_shader *nir = info->nir;

   size_t size = sizeof(struct hk_api_shader) +
                 sizeof(struct hk_shader) * hk_num_variants(info->stage);
   struct hk_api_shader *obj = vk_shader_zalloc(&dev->vk, &hk_shader_ops,
                                                info->stage, pAllocator, size);

   if (obj == NULL) {
      ralloc_free(nir);
      return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY);
   }

   /* TODO: Multiview with ESO */
   const bool is_multiview = state && state->rp->view_mask != 0;

   hk_lower_nir(dev, nir, info->robustness, is_multiview,
                info->set_layout_count, info->set_layouts);

   gl_shader_stage sw_stage = nir->info.stage;

   struct hk_fs_key fs_key_tmp, *fs_key = NULL;
   if (sw_stage == MESA_SHADER_FRAGMENT) {
      hk_populate_fs_key(&fs_key_tmp, state);
      fs_key = &fs_key_tmp;

      nir->info.fs.uses_sample_shading |= fs_key->force_sample_shading;

      /* Force late-Z for Z/S self-deps. TODO: There's probably a less silly
       * way to do this.
       */
      if (fs_key->zs_self_dep) {
         nir_builder b =
            nir_builder_at(nir_before_impl(nir_shader_get_entrypoint(nir)));
         nir_discard_if(&b, nir_imm_false(&b));
         nir->info.fs.uses_discard = true;
      }

      NIR_PASS(_, nir, agx_nir_lower_sample_intrinsics, false);
   } else if (sw_stage == MESA_SHADER_TESS_CTRL) {
      NIR_PASS_V(nir, agx_nir_lower_tcs, dev->dev.libagx);
   }

   /* Compile all variants up front */
   if (sw_stage == MESA_SHADER_GEOMETRY) {
      for (unsigned rast_disc = 0; rast_disc < 2; ++rast_disc) {
         struct hk_shader *count_variant = hk_count_gs_variant(obj, rast_disc);
         nir_shader *clone = nir_shader_clone(NULL, nir);

         enum mesa_prim out_prim = MESA_PRIM_MAX;
         nir_shader *count = NULL, *rast = NULL, *pre_gs = NULL;

         NIR_PASS(_, clone, agx_nir_lower_gs, dev->dev.libagx, rast_disc,
                  &count, &rast, &pre_gs, &out_prim,
                  &count_variant->info.gs.count_words);

         if (!rast_disc) {
            struct hk_shader *shader = &obj->variants[HK_GS_VARIANT_RAST];

            hk_lower_hw_vs(rast, shader);
            shader->info.gs.out_prim = out_prim;
         }

         struct {
            nir_shader *in;
            struct hk_shader *out;
         } variants[] = {
            {clone, hk_main_gs_variant(obj, rast_disc)},
            {pre_gs, hk_pre_gs_variant(obj, rast_disc)},
            {count, count_variant},
            {rast_disc ? NULL : rast, &obj->variants[HK_GS_VARIANT_RAST]},
         };

         for (unsigned v = 0; v < ARRAY_SIZE(variants); ++v) {
            if (variants[v].in) {
               result = hk_compile_nir(dev, pAllocator, variants[v].in,
                                       info->flags, info->robustness, NULL,
                                       variants[v].out, sw_stage, true, NULL);
               if (result != VK_SUCCESS) {
                  hk_api_shader_destroy(&dev->vk, &obj->vk, pAllocator);
                  ralloc_free(nir);
                  return result;
               }
            }
         }
      }
   } else if (sw_stage == MESA_SHADER_VERTEX ||
              sw_stage == MESA_SHADER_TESS_EVAL) {
      if (sw_stage == MESA_SHADER_VERTEX) {
         assert(
            !(nir->info.inputs_read & BITFIELD64_MASK(VERT_ATTRIB_GENERIC0)) &&
            "Fixed-function attributes not used in Vulkan");

         NIR_PASS(_, nir, nir_recompute_io_bases, nir_var_shader_in);
      }

      /* the shader_out portion of this is load-bearing even for tess eval */
      NIR_PASS(_, nir, nir_io_add_const_offset_to_base,
               nir_var_shader_in | nir_var_shader_out);

      for (enum hk_vs_variant v = 0; v < HK_VS_VARIANTS; ++v) {
         struct hk_shader *shader = &obj->variants[v];
         bool hw = v == HK_VS_VARIANT_HW;

         /* TODO: Optimize single variant when we know nextStage */
         nir_shader *clone = nir_shader_clone(NULL, nir);

         if (sw_stage == MESA_SHADER_VERTEX) {
            NIR_PASS(_, clone, agx_nir_lower_vs_input_to_prolog,
                     shader->info.vs.attrib_components_read);

            shader->info.vs.attribs_read =
               nir->info.inputs_read >> VERT_ATTRIB_GENERIC0;
         }

         if (hw) {
            hk_lower_hw_vs(clone, shader);
         } else {
            NIR_PASS(_, clone, agx_nir_lower_vs_before_gs, dev->dev.libagx);
         }

         result = hk_compile_nir(dev, pAllocator, clone, info->flags,
                                 info->robustness, fs_key, shader, sw_stage,
                                 hw, nir->xfb_info);
         if (result != VK_SUCCESS) {
            hk_api_shader_destroy(&dev->vk, &obj->vk, pAllocator);
            ralloc_free(nir);
            return result;
         }
      }
   } else {
      struct hk_shader *shader = hk_only_variant(obj);

      /* hk_compile_nir takes ownership of nir */
      result = hk_compile_nir(dev, pAllocator, nir, info->flags,
                              info->robustness, fs_key, shader, sw_stage, true,
                              NULL);
      if (result != VK_SUCCESS) {
         hk_api_shader_destroy(&dev->vk, &obj->vk, pAllocator);
         return result;
      }
   }

   *shader_out = obj;

   return VK_SUCCESS;
}

static VkResult
hk_compile_shaders(struct vk_device *vk_dev, uint32_t shader_count,
                   struct vk_shader_compile_info *infos,
                   const struct vk_graphics_pipeline_state *state,
                   const VkAllocationCallbacks *pAllocator,
                   struct vk_shader **shaders_out)
{
   struct hk_device *dev = container_of(vk_dev, struct hk_device, vk);

   for (uint32_t i = 0; i < shader_count; i++) {
      VkResult result =
         hk_compile_shader(dev, &infos[i], state, pAllocator,
                           (struct hk_api_shader **)&shaders_out[i]);
      if (result != VK_SUCCESS) {
         /* Clean up all the shaders before this point */
         for (uint32_t j = 0; j < i; j++)
            hk_api_shader_destroy(&dev->vk, shaders_out[j], pAllocator);

         /* Clean up all the NIR after this point */
         for (uint32_t j = i + 1; j < shader_count; j++)
            ralloc_free(infos[j].nir);

         /* Memset the output array */
         memset(shaders_out, 0, shader_count * sizeof(*shaders_out));

         return result;
      }
   }

   return VK_SUCCESS;
}

static VkResult
hk_deserialize_shader(struct hk_device *dev, struct blob_reader *blob,
                      struct hk_shader *shader)
{
   struct hk_shader_info info;
   blob_copy_bytes(blob, &info, sizeof(info));

   struct agx_shader_info b_info;
   blob_copy_bytes(blob, &b_info, sizeof(b_info));

   const uint32_t code_size = blob_read_uint32(blob);
   const uint32_t data_size = blob_read_uint32(blob);
   if (blob->overrun)
      return vk_error(dev, VK_ERROR_INCOMPATIBLE_SHADER_BINARY_EXT);

   VkResult result = hk_init_link_ht(shader, info.stage);
   if (result != VK_SUCCESS)
      return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY);

   simple_mtx_init(&shader->linked.lock,
                   mtx_plain);

   shader->b.info = b_info;
   shader->info = info;
   shader->code_size = code_size;
   shader->data_size = data_size;
   shader->b.binary_size = code_size;

   shader->code_ptr = malloc(code_size);
   if (shader->code_ptr == NULL)
      return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY);

   shader->data_ptr = malloc(data_size);
   if (shader->data_ptr == NULL)
      return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY);

   blob_copy_bytes(blob, (void *)shader->code_ptr, shader->code_size);
   blob_copy_bytes(blob, (void *)shader->data_ptr, shader->data_size);
   if (blob->overrun)
      return vk_error(dev, VK_ERROR_INCOMPATIBLE_SHADER_BINARY_EXT);

   shader->b.binary = (void *)shader->code_ptr;
   hk_upload_shader(dev, shader);

   return VK_SUCCESS;
}

static VkResult
hk_deserialize_api_shader(struct vk_device *vk_dev, struct blob_reader *blob,
                          uint32_t binary_version,
                          const VkAllocationCallbacks *pAllocator,
                          struct vk_shader **shader_out)
{
   struct hk_device *dev = container_of(vk_dev, struct hk_device, vk);

   gl_shader_stage stage = blob_read_uint8(blob);
   if (blob->overrun)
      return vk_error(dev, VK_ERROR_INCOMPATIBLE_SHADER_BINARY_EXT);

   size_t size = sizeof(struct hk_api_shader) +
                 sizeof(struct hk_shader) * hk_num_variants(stage);
   struct hk_api_shader *obj =
      vk_shader_zalloc(&dev->vk, &hk_shader_ops, stage, pAllocator, size);

   if (obj == NULL)
      return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY);

   hk_foreach_variant(obj, shader) {
      VkResult result = hk_deserialize_shader(dev, blob, shader);

      if (result != VK_SUCCESS) {
         hk_api_shader_destroy(&dev->vk, &obj->vk, pAllocator);
         return result;
      }
   }

   *shader_out = &obj->vk;
   return VK_SUCCESS;
}

static void
hk_shader_serialize(struct vk_device *vk_dev, const struct hk_shader *shader,
                    struct blob *blob)
{
   blob_write_bytes(blob, &shader->info, sizeof(shader->info));
   blob_write_bytes(blob, &shader->b.info, sizeof(shader->b.info));

   blob_write_uint32(blob, shader->code_size);
   blob_write_uint32(blob, shader->data_size);
   blob_write_bytes(blob, shader->code_ptr, shader->code_size);
   blob_write_bytes(blob, shader->data_ptr, shader->data_size);
}

static bool
hk_api_shader_serialize(struct vk_device *vk_dev,
                        const struct vk_shader *vk_shader, struct blob *blob)
{
   struct hk_api_shader *obj =
      container_of(vk_shader, struct hk_api_shader, vk);

   blob_write_uint8(blob, vk_shader->stage);

   hk_foreach_variant(obj, shader) {
      hk_shader_serialize(vk_dev, shader, blob);
   }

   return !blob->out_of_memory;
}

#define WRITE_STR(field, ...)                                                  \
   ({                                                                          \
      memset(field, 0, sizeof(field));                                         \
      UNUSED int i = snprintf(field, sizeof(field), __VA_ARGS__);              \
      assert(i > 0 && i < sizeof(field));                                      \
   })

static VkResult
hk_shader_get_executable_properties(
   UNUSED struct vk_device *device, const struct vk_shader *vk_shader,
   uint32_t *executable_count, VkPipelineExecutablePropertiesKHR *properties)
{
   struct hk_api_shader *obj =
      container_of(vk_shader, struct hk_api_shader, vk);

   VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutablePropertiesKHR, out, properties,
                          executable_count);

   vk_outarray_append_typed(VkPipelineExecutablePropertiesKHR, &out, props) {
      props->stages = mesa_to_vk_shader_stage(obj->vk.stage);
      props->subgroupSize = 32;
      WRITE_STR(props->name, "%s",
                _mesa_shader_stage_to_string(obj->vk.stage));
      WRITE_STR(props->description, "%s shader",
                _mesa_shader_stage_to_string(obj->vk.stage));
   }

   return vk_outarray_status(&out);
}

static VkResult
hk_shader_get_executable_statistics(
   UNUSED struct vk_device *device, const struct vk_shader *vk_shader,
   uint32_t executable_index, uint32_t *statistic_count,
   VkPipelineExecutableStatisticKHR *statistics)
{
   struct hk_api_shader *obj =
      container_of(vk_shader, struct hk_api_shader, vk);

   VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutableStatisticKHR, out, statistics,
                          statistic_count);

   assert(executable_index == 0);

   /* TODO: find a sane way to report multiple variants and have that play
    * nice with zink.
    */
   struct hk_shader *shader = hk_any_variant(obj);

   vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
      WRITE_STR(stat->name, "Code Size");
      WRITE_STR(stat->description,
                "Size of the compiled shader binary, in bytes");
      stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
      stat->value.u64 = shader->code_size;
   }

   vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
      WRITE_STR(stat->name, "Number of GPRs");
      WRITE_STR(stat->description, "Number of GPRs used by this pipeline");
      stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
      stat->value.u64 = shader->b.info.nr_gprs;
   }

   return vk_outarray_status(&out);
}

static bool
write_ir_text(VkPipelineExecutableInternalRepresentationKHR *ir,
              const char *data)
{
   ir->isText = VK_TRUE;

   size_t data_len = strlen(data) + 1;

   if (ir->pData == NULL) {
      ir->dataSize = data_len;
      return true;
   }

   strncpy(ir->pData, data, ir->dataSize);
   if (ir->dataSize < data_len)
      return false;

   ir->dataSize = data_len;
   return true;
}

static VkResult
hk_shader_get_executable_internal_representations(
   UNUSED struct vk_device *device, const struct vk_shader *vk_shader,
   uint32_t executable_index, uint32_t *internal_representation_count,
   VkPipelineExecutableInternalRepresentationKHR *internal_representations)
{
   VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutableInternalRepresentationKHR, out,
                          internal_representations,
                          internal_representation_count);
   bool incomplete_text = false;

   assert(executable_index == 0);

   /* TODO */
#if 0
   vk_outarray_append_typed(VkPipelineExecutableInternalRepresentationKHR,
                            &out, ir) {
      WRITE_STR(ir->name, "AGX assembly");
      WRITE_STR(ir->description, "AGX assembly");
      if (!write_ir_text(ir, TODO))
         incomplete_text = true;
   }
#endif

   return incomplete_text ?
             VK_INCOMPLETE : vk_outarray_status(&out);
}

static const struct vk_shader_ops hk_shader_ops = {
   .destroy = hk_api_shader_destroy,
   .serialize = hk_api_shader_serialize,
   .get_executable_properties = hk_shader_get_executable_properties,
   .get_executable_statistics = hk_shader_get_executable_statistics,
   .get_executable_internal_representations =
      hk_shader_get_executable_internal_representations,
};

const struct vk_device_shader_ops hk_device_shader_ops = {
   .get_nir_options = hk_get_nir_options,
   .get_spirv_options = hk_get_spirv_options,
   .preprocess_nir = hk_preprocess_nir,
   .hash_graphics_state = hk_hash_graphics_state,
   .compile = hk_compile_shaders,
   .deserialize = hk_deserialize_api_shader,
   .cmd_set_dynamic_graphics_state = vk_cmd_set_dynamic_graphics_state,
   .cmd_bind_shaders = hk_cmd_bind_shaders,
};

/* Link a main shader with optional prolog/epilog parts and pre-bake the USC
 * words used to bind the resulting program.
 */
struct hk_linked_shader *
hk_fast_link(struct hk_device *dev, bool fragment, struct hk_shader *main,
             struct agx_shader_part *prolog, struct agx_shader_part *epilog,
             unsigned nr_samples_shaded)
{
   struct hk_linked_shader *s = rzalloc(NULL, struct hk_linked_shader);
   agx_fast_link(&s->b, &dev->dev, fragment, &main->b, prolog, epilog,
                 nr_samples_shaded);

   if (fragment) {
      agx_pack(&s->fs_counts, FRAGMENT_SHADER_WORD_0, cfg) {
         cfg.cf_binding_count = s->b.cf.nr_bindings;
         cfg.uniform_register_count = main->b.info.push_count;
         cfg.preshader_register_count = main->b.info.nr_preamble_gprs;
         cfg.sampler_state_register_count =
            agx_translate_sampler_state_count(s->b.uses_txf ? 1 : 0, false);
      }
   }

   /* Now that we've linked, bake the USC words to bind this program */
   struct agx_usc_builder b =
      agx_usc_builder(s->usc.data, sizeof(s->usc.data));

   if (main && main->b.info.immediate_size_16) {
      unreachable("todo");
#if 0
      /* XXX: do ahead of time */
      uint64_t ptr = agx_pool_upload_aligned(
         &cmd->pool, s->b.info.immediates, s->b.info.immediate_size_16 * 2,
         64);

      for (unsigned range = 0; range < constant_push_ranges; ++range) {
         unsigned offset = 64 * range;
         assert(offset < s->b.info.immediate_size_16);

         agx_usc_uniform(&b, s->b.info.immediate_base_uniform + offset,
                         MIN2(64, s->b.info.immediate_size_16 - offset),
                         ptr + (offset * 2));
      }
#endif
   }

   agx_usc_push_packed(&b, UNIFORM, dev->rodata.image_heap);

   if (s->b.uses_txf)
      agx_usc_push_packed(&b, SAMPLER, dev->rodata.txf_sampler);

   agx_usc_shared_non_fragment(&b, &main->b.info, 0);
   agx_usc_push_packed(&b, SHADER, s->b.shader);
   agx_usc_push_packed(&b, REGISTERS, s->b.regs);

   if (fragment)
      agx_usc_push_packed(&b, FRAGMENT_PROPERTIES, s->b.fragment_props);

   if (main && main->b.info.has_preamble) {
      agx_usc_pack(&b, PRESHADER, cfg) {
         cfg.code = agx_usc_addr(&dev->dev, main->preamble_addr);
      }
   } else {
      agx_usc_pack(&b, NO_PRESHADER, cfg)
         ;
   }

   s->usc.size = b.head - s->usc.data;
   return s;
}