/*
 * Copyright 2024 Valve Corporation
 * Copyright 2024 Alyssa Rosenzweig
 * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc.
 * SPDX-License-Identifier: MIT
 */
#include "hk_cmd_buffer.h"

#include "agx_bo.h"
#include "agx_device.h"
#include "agx_linker.h"
#include "agx_tilebuffer.h"
#include "agx_usc.h"
#include "hk_buffer.h"
#include "hk_cmd_pool.h"
#include "hk_descriptor_set.h"
#include "hk_descriptor_set_layout.h"
#include "hk_device.h"
#include "hk_device_memory.h"
#include "hk_entrypoints.h"
#include "hk_image_view.h"
#include "hk_physical_device.h"
#include "hk_shader.h"
#include "pool.h"
#include "shader_enums.h"
#include "vk_pipeline_layout.h"
#include "vk_synchronization.h"

#include "util/list.h"
#include "util/macros.h"
#include "util/u_dynarray.h"
#include "vulkan/vulkan_core.h"

static void
hk_descriptor_state_fini(struct hk_cmd_buffer *cmd,
                         struct hk_descriptor_state *desc)
{
   struct hk_cmd_pool *pool = hk_cmd_buffer_pool(cmd);

   for (unsigned i = 0; i < HK_MAX_SETS; i++) {
      vk_free(&pool->vk.alloc, desc->push[i]);
      desc->push[i] = NULL;
   }
}

static void
hk_free_resettable_cmd_buffer(struct hk_cmd_buffer *cmd)
{
   struct hk_cmd_pool *pool = hk_cmd_buffer_pool(cmd);
   struct hk_device *dev = hk_cmd_pool_device(pool);

   hk_descriptor_state_fini(cmd, &cmd->state.gfx.descriptors);
   hk_descriptor_state_fini(cmd, &cmd->state.cs.descriptors);

   hk_cmd_pool_free_bo_list(pool, &cmd->uploader.main.bos);
   hk_cmd_pool_free_usc_bo_list(pool, &cmd->uploader.usc.bos);

   list_for_each_entry_safe(struct hk_cs, it, &cmd->control_streams, node) {
      list_del(&it->node);
      hk_cs_destroy(it);
   }

   util_dynarray_foreach(&cmd->large_bos, struct agx_bo *, bo) {
      agx_bo_unreference(&dev->dev, *bo);
   }

   util_dynarray_clear(&cmd->large_bos);
}

static void
hk_destroy_cmd_buffer(struct vk_command_buffer *vk_cmd_buffer)
{
   struct hk_cmd_buffer *cmd =
      container_of(vk_cmd_buffer, struct hk_cmd_buffer, vk);
   struct hk_cmd_pool *pool = hk_cmd_buffer_pool(cmd);

   hk_free_resettable_cmd_buffer(cmd);
   vk_command_buffer_finish(&cmd->vk);
   vk_free(&pool->vk.alloc, cmd);
}

static VkResult
hk_create_cmd_buffer(struct vk_command_pool *vk_pool, VkCommandBufferLevel level,
                     struct vk_command_buffer **cmd_buffer_out)
{
   struct hk_cmd_pool *pool = container_of(vk_pool, struct hk_cmd_pool, vk);
   struct hk_device *dev = hk_cmd_pool_device(pool);
   struct hk_cmd_buffer *cmd;
   VkResult result;

   cmd = vk_zalloc(&pool->vk.alloc, sizeof(*cmd), 8,
                   VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (cmd == NULL)
      return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY);

   result =
      vk_command_buffer_init(&pool->vk, &cmd->vk, &hk_cmd_buffer_ops, level);
   if (result != VK_SUCCESS) {
      vk_free(&pool->vk.alloc, cmd);
      return result;
   }

   util_dynarray_init(&cmd->large_bos, NULL);

   cmd->vk.dynamic_graphics_state.vi = &cmd->state.gfx._dynamic_vi;
   cmd->vk.dynamic_graphics_state.ms.sample_locations =
      &cmd->state.gfx._dynamic_sl;

   list_inithead(&cmd->uploader.main.bos);
   list_inithead(&cmd->uploader.usc.bos);
   list_inithead(&cmd->control_streams);

   *cmd_buffer_out = &cmd->vk;

   return VK_SUCCESS;
}
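/* Reset the command buffer for reuse: release per-recording resources (BOs,
 * control streams, push descriptor storage), rewind both uploaders, and clear
 * the recorded CPU-side state.
 */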
static void
hk_reset_cmd_buffer(struct vk_command_buffer *vk_cmd_buffer,
                    UNUSED VkCommandBufferResetFlags flags)
{
   struct hk_cmd_buffer *cmd =
      container_of(vk_cmd_buffer, struct hk_cmd_buffer, vk);

   vk_command_buffer_reset(&cmd->vk);
   hk_free_resettable_cmd_buffer(cmd);

   cmd->uploader.main.map = NULL;
   cmd->uploader.main.base = 0;
   cmd->uploader.main.offset = 0;
   cmd->uploader.usc.map = NULL;
   cmd->uploader.usc.base = 0;
   cmd->uploader.usc.offset = 0;

   cmd->current_cs.gfx = NULL;
   cmd->current_cs.cs = NULL;
   cmd->current_cs.post_gfx = NULL;
   cmd->current_cs.pre_gfx = NULL;

   /* TODO: clear pool! */

   memset(&cmd->state, 0, sizeof(cmd->state));
}

const struct vk_command_buffer_ops hk_cmd_buffer_ops = {
   .create = hk_create_cmd_buffer,
   .reset = hk_reset_cmd_buffer,
   .destroy = hk_destroy_cmd_buffer,
};

static VkResult
hk_cmd_buffer_alloc_bo(struct hk_cmd_buffer *cmd, bool usc,
                       struct hk_cmd_bo **bo_out)
{
   VkResult result = hk_cmd_pool_alloc_bo(hk_cmd_buffer_pool(cmd), usc, bo_out);
   if (result != VK_SUCCESS)
      return result;

   if (usc)
      list_addtail(&(*bo_out)->link, &cmd->uploader.usc.bos);
   else
      list_addtail(&(*bo_out)->link, &cmd->uploader.main.bos);

   return VK_SUCCESS;
}

struct agx_ptr
hk_pool_alloc_internal(struct hk_cmd_buffer *cmd, uint32_t size,
                       uint32_t alignment, bool usc)
{
   struct hk_device *dev = hk_cmd_buffer_device(cmd);
   struct hk_uploader *uploader =
      usc ? &cmd->uploader.usc : &cmd->uploader.main;

   /* Specially handle large allocations owned by the command buffer, e.g. used
    * for statically allocated vertex output buffers with geometry shaders.
    */
   if (size > HK_CMD_BO_SIZE) {
      uint32_t flags = usc ? AGX_BO_LOW_VA : 0;
      struct agx_bo *bo =
         agx_bo_create(&dev->dev, size, flags, 0, "Large pool allocation");

      util_dynarray_append(&cmd->large_bos, struct agx_bo *, bo);

      return (struct agx_ptr){
         .gpu = bo->va->addr,
         .cpu = bo->map,
      };
   }

   assert(size <= HK_CMD_BO_SIZE);
   assert(alignment > 0);

   uint32_t offset = align(uploader->offset, alignment);

   assert(offset <= HK_CMD_BO_SIZE);
   if (uploader->map != NULL && size <= HK_CMD_BO_SIZE - offset) {
      uploader->offset = offset + size;

      return (struct agx_ptr){
         .gpu = uploader->base + offset,
         .cpu = uploader->map + offset,
      };
   }

   struct hk_cmd_bo *bo;
   VkResult result = hk_cmd_buffer_alloc_bo(cmd, usc, &bo);
   if (unlikely(result != VK_SUCCESS)) {
      vk_command_buffer_set_error(&cmd->vk, result);
      return (struct agx_ptr){0};
   }

   /* Pick whichever of the current upload BO and the new BO will have more
    * room left to be the BO for the next upload. If our upload size is
    * bigger than the old offset, we're better off burning the whole new
    * upload BO on this one allocation and continuing on the current upload
    * BO.
    */
   if (uploader->map == NULL || size < uploader->offset) {
      uploader->map = bo->bo->map;
      uploader->base = bo->bo->va->addr;
      uploader->offset = size;
   }

   return (struct agx_ptr){
      .gpu = bo->bo->va->addr,
      .cpu = bo->map,
   };
}

uint64_t
hk_pool_upload(struct hk_cmd_buffer *cmd, const void *data, uint32_t size,
               uint32_t alignment)
{
   struct agx_ptr T = hk_pool_alloc(cmd, size, alignment);
   if (unlikely(T.cpu == NULL))
      return 0;

   memcpy(T.cpu, data, size);
   return T.gpu;
}
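/* Usage sketch (illustrative, not taken from a real call site): upload a small
 * block of constants and keep the returned GPU address to bind later. A zero
 * return means the pool allocation failed.
 *
 *    uint32_t consts[4] = {0, 1, 2, 3};
 *    uint64_t addr = hk_pool_upload(cmd, consts, sizeof(consts), 4);
 *    if (addr) {
 *       ... bind addr, e.g. via agx_usc_uniform() ...
 *    }
 */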
VKAPI_ATTR VkResult VKAPI_CALL
hk_BeginCommandBuffer(VkCommandBuffer commandBuffer,
                      const VkCommandBufferBeginInfo *pBeginInfo)
{
   VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer);

   hk_reset_cmd_buffer(&cmd->vk, 0);

   hk_cmd_buffer_begin_compute(cmd, pBeginInfo);
   hk_cmd_buffer_begin_graphics(cmd, pBeginInfo);

   return VK_SUCCESS;
}

VKAPI_ATTR VkResult VKAPI_CALL
hk_EndCommandBuffer(VkCommandBuffer commandBuffer)
{
   VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer);

   assert(cmd->current_cs.gfx == NULL && cmd->current_cs.pre_gfx == NULL &&
          "must end rendering before ending the command buffer");

   hk_cmd_buffer_end_compute(cmd);
   hk_cmd_buffer_end_compute_internal(&cmd->current_cs.post_gfx);

   return vk_command_buffer_get_record_result(&cmd->vk);
}

VKAPI_ATTR void VKAPI_CALL
hk_CmdPipelineBarrier2(VkCommandBuffer commandBuffer,
                       const VkDependencyInfo *pDependencyInfo)
{
   VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer);

   /* The big hammer. We end both compute and graphics batches. Ending compute
    * here is necessary to properly handle graphics->compute dependencies.
    *
    * XXX: perf.
    */
   hk_cmd_buffer_end_compute(cmd);
   hk_cmd_buffer_end_graphics(cmd);
}

void
hk_cmd_bind_shaders(struct vk_command_buffer *vk_cmd, uint32_t stage_count,
                    const gl_shader_stage *stages,
                    struct vk_shader **const shaders)
{
   struct hk_cmd_buffer *cmd = container_of(vk_cmd, struct hk_cmd_buffer, vk);

   for (uint32_t i = 0; i < stage_count; i++) {
      struct hk_api_shader *shader =
         container_of(shaders[i], struct hk_api_shader, vk);

      if (stages[i] == MESA_SHADER_COMPUTE || stages[i] == MESA_SHADER_KERNEL)
         hk_cmd_bind_compute_shader(cmd, shader);
      else
         hk_cmd_bind_graphics_shader(cmd, stages[i], shader);
   }
}

static void
hk_bind_descriptor_sets(UNUSED struct hk_cmd_buffer *cmd,
                        struct hk_descriptor_state *desc,
                        const VkBindDescriptorSetsInfoKHR *info)
{
   VK_FROM_HANDLE(vk_pipeline_layout, pipeline_layout, info->layout);

   /* From the Vulkan 1.3.275 spec:
    *
    *    "When binding a descriptor set (see Descriptor Set Binding) to
    *    set number N...
    *
    *    If, additionally, the previously bound descriptor set for set
    *    N was bound using a pipeline layout not compatible for set N,
    *    then all bindings in sets numbered greater than N are
    *    disturbed."
    *
    * This means that, if some earlier set gets bound in such a way that
    * it changes set_dynamic_buffer_start[s], this binding is implicitly
    * invalidated. Therefore, we can always look at the current value
    * of set_dynamic_buffer_start[s] as the base of our dynamic buffer
    * range and it's only our responsibility to adjust all
    * set_dynamic_buffer_start[p] for p > s as needed.
    */
   uint8_t dyn_buffer_start =
      desc->root.set_dynamic_buffer_start[info->firstSet];

   uint32_t next_dyn_offset = 0;
   for (uint32_t i = 0; i < info->descriptorSetCount; ++i) {
      unsigned s = i + info->firstSet;
      VK_FROM_HANDLE(hk_descriptor_set, set, info->pDescriptorSets[i]);

      if (desc->sets[s] != set) {
         if (set != NULL) {
            desc->root.sets[s] = hk_descriptor_set_addr(set);
            desc->set_sizes[s] = set->size;
         } else {
            desc->root.sets[s] = 0;
            desc->set_sizes[s] = 0;
         }
         desc->sets[s] = set;
         desc->sets_dirty |= BITFIELD_BIT(s);

         /* Binding descriptors invalidates push descriptors */
         desc->push_dirty &= ~BITFIELD_BIT(s);
      }

      desc->root.set_dynamic_buffer_start[s] = dyn_buffer_start;

      if (pipeline_layout->set_layouts[s] != NULL) {
         const struct hk_descriptor_set_layout *set_layout =
            vk_to_hk_descriptor_set_layout(pipeline_layout->set_layouts[s]);

         if (set != NULL && set_layout->dynamic_buffer_count > 0) {
            for (uint32_t j = 0; j < set_layout->dynamic_buffer_count; j++) {
               struct hk_buffer_address addr = set->dynamic_buffers[j];
               addr.base_addr += info->pDynamicOffsets[next_dyn_offset + j];
               desc->root.dynamic_buffers[dyn_buffer_start + j] = addr;
            }
            next_dyn_offset += set->layout->dynamic_buffer_count;
         }

         dyn_buffer_start += set_layout->dynamic_buffer_count;
      } else {
         assert(set == NULL);
      }
   }
   assert(dyn_buffer_start <= HK_MAX_DYNAMIC_BUFFERS);
   assert(next_dyn_offset <= info->dynamicOffsetCount);

   for (uint32_t s = info->firstSet + info->descriptorSetCount;
        s < HK_MAX_SETS; s++)
      desc->root.set_dynamic_buffer_start[s] = dyn_buffer_start;

   desc->root_dirty = true;
}
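/* Illustrative example of the bookkeeping above (hypothetical layouts): if
 * set 0 declares two dynamic buffers and set 1 declares one, binding them
 * starting at firstSet = 0 yields
 *
 *    set_dynamic_buffer_start = { 0, 2, 3, 3, ... }
 *
 * so each set indexes root.dynamic_buffers at its accumulated base, and every
 * set after the last one bound inherits the final base.
 */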
VKAPI_ATTR void VKAPI_CALL
hk_CmdBindDescriptorSets2KHR(
   VkCommandBuffer commandBuffer,
   const VkBindDescriptorSetsInfoKHR *pBindDescriptorSetsInfo)
{
   VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer);

   if (pBindDescriptorSetsInfo->stageFlags & VK_SHADER_STAGE_ALL_GRAPHICS) {
      hk_bind_descriptor_sets(cmd, &cmd->state.gfx.descriptors,
                              pBindDescriptorSetsInfo);
   }

   if (pBindDescriptorSetsInfo->stageFlags & VK_SHADER_STAGE_COMPUTE_BIT) {
      hk_bind_descriptor_sets(cmd, &cmd->state.cs.descriptors,
                              pBindDescriptorSetsInfo);
   }
}

static void
hk_push_constants(UNUSED struct hk_cmd_buffer *cmd,
                  struct hk_descriptor_state *desc,
                  const VkPushConstantsInfoKHR *info)
{
   memcpy(desc->root.push + info->offset, info->pValues, info->size);
   desc->root_dirty = true;
}

VKAPI_ATTR void VKAPI_CALL
hk_CmdPushConstants2KHR(VkCommandBuffer commandBuffer,
                        const VkPushConstantsInfoKHR *pPushConstantsInfo)
{
   VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer);

   if (pPushConstantsInfo->stageFlags & VK_SHADER_STAGE_ALL_GRAPHICS)
      hk_push_constants(cmd, &cmd->state.gfx.descriptors, pPushConstantsInfo);

   if (pPushConstantsInfo->stageFlags & VK_SHADER_STAGE_COMPUTE_BIT)
      hk_push_constants(cmd, &cmd->state.cs.descriptors, pPushConstantsInfo);
}

static struct hk_push_descriptor_set *
hk_cmd_push_descriptors(struct hk_cmd_buffer *cmd,
                        struct hk_descriptor_state *desc, uint32_t set)
{
   assert(set < HK_MAX_SETS);
   if (unlikely(desc->push[set] == NULL)) {
      desc->push[set] =
         vk_zalloc(&cmd->vk.pool->alloc, sizeof(*desc->push[set]), 8,
                   VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
      if (unlikely(desc->push[set] == NULL)) {
         vk_command_buffer_set_error(&cmd->vk, VK_ERROR_OUT_OF_HOST_MEMORY);
         return NULL;
      }
   }

   /* Pushing descriptors replaces whatever sets are bound */
   desc->sets[set] = NULL;
   desc->push_dirty |= BITFIELD_BIT(set);

   return desc->push[set];
}

static void
hk_push_descriptor_set(struct hk_cmd_buffer *cmd,
                       struct hk_descriptor_state *desc,
                       const VkPushDescriptorSetInfoKHR *info)
{
   VK_FROM_HANDLE(vk_pipeline_layout, pipeline_layout, info->layout);

   struct hk_push_descriptor_set *push_set =
      hk_cmd_push_descriptors(cmd, desc, info->set);
   if (unlikely(push_set == NULL))
      return;

   struct hk_descriptor_set_layout *set_layout =
      vk_to_hk_descriptor_set_layout(pipeline_layout->set_layouts[info->set]);

   hk_push_descriptor_set_update(push_set, set_layout,
                                 info->descriptorWriteCount,
                                 info->pDescriptorWrites);
}

VKAPI_ATTR void VKAPI_CALL
hk_CmdPushDescriptorSet2KHR(
   VkCommandBuffer commandBuffer,
   const VkPushDescriptorSetInfoKHR *pPushDescriptorSetInfo)
{
   VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer);

   if (pPushDescriptorSetInfo->stageFlags & VK_SHADER_STAGE_ALL_GRAPHICS) {
      hk_push_descriptor_set(cmd, &cmd->state.gfx.descriptors,
                             pPushDescriptorSetInfo);
   }

   if (pPushDescriptorSetInfo->stageFlags & VK_SHADER_STAGE_COMPUTE_BIT) {
      hk_push_descriptor_set(cmd, &cmd->state.cs.descriptors,
                             pPushDescriptorSetInfo);
   }
}

void
hk_cmd_buffer_flush_push_descriptors(struct hk_cmd_buffer *cmd,
                                     struct hk_descriptor_state *desc)
{
   u_foreach_bit(set_idx, desc->push_dirty) {
      struct hk_push_descriptor_set *push_set = desc->push[set_idx];
      uint64_t push_set_addr = hk_pool_upload(
         cmd, push_set->data, sizeof(push_set->data), HK_MIN_UBO_ALIGNMENT);

      desc->root.sets[set_idx] = push_set_addr;
      desc->set_sizes[set_idx] = sizeof(push_set->data);
   }

   desc->root_dirty = true;
   desc->push_dirty = 0;
}
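/* Template variant of push descriptors: the descriptor state is selected from
 * the template's bind point rather than from stage flags, and the push set is
 * updated from the template-encoded pData.
 */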
VKAPI_ATTR void VKAPI_CALL
hk_CmdPushDescriptorSetWithTemplate2KHR(
   VkCommandBuffer commandBuffer,
   const VkPushDescriptorSetWithTemplateInfoKHR *pPushDescriptorSetWithTemplateInfo)
{
   VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer);
   VK_FROM_HANDLE(vk_descriptor_update_template, template,
                  pPushDescriptorSetWithTemplateInfo->descriptorUpdateTemplate);
   VK_FROM_HANDLE(vk_pipeline_layout, pipeline_layout,
                  pPushDescriptorSetWithTemplateInfo->layout);

   struct hk_descriptor_state *desc =
      hk_get_descriptors_state(cmd, template->bind_point);
   struct hk_push_descriptor_set *push_set = hk_cmd_push_descriptors(
      cmd, desc, pPushDescriptorSetWithTemplateInfo->set);
   if (unlikely(push_set == NULL))
      return;

   struct hk_descriptor_set_layout *set_layout = vk_to_hk_descriptor_set_layout(
      pipeline_layout->set_layouts[pPushDescriptorSetWithTemplateInfo->set]);

   hk_push_descriptor_set_update_template(
      push_set, set_layout, template,
      pPushDescriptorSetWithTemplateInfo->pData);
}

uint64_t
hk_cmd_buffer_upload_root(struct hk_cmd_buffer *cmd,
                          VkPipelineBindPoint bind_point)
{
   struct hk_descriptor_state *desc = hk_get_descriptors_state(cmd, bind_point);
   struct hk_root_descriptor_table *root = &desc->root;

   struct agx_ptr root_ptr = hk_pool_alloc(cmd, sizeof(*root), 8);
   if (!root_ptr.gpu)
      return 0;

   root->root_desc_addr = root_ptr.gpu;

   memcpy(root_ptr.cpu, root, sizeof(*root));
   return root_ptr.gpu;
}

void
hk_usc_upload_spilled_rt_descs(struct agx_usc_builder *b,
                               struct hk_cmd_buffer *cmd)
{
   struct hk_rendering_state *render = &cmd->state.gfx.render;

   /* Upload texture/PBE descriptors for each render target so we can clear
    * spilled render targets.
    */
   struct agx_ptr descs = hk_pool_alloc(
      cmd, AGX_TEXTURE_LENGTH * 2 * render->color_att_count, 64);
   struct agx_texture_packed *desc = descs.cpu;
   if (!desc)
      return;

   for (unsigned i = 0; i < render->color_att_count; ++i) {
      struct hk_image_view *iview = render->color_att[i].iview;
      if (!iview) {
         /* XXX: probably should emit a null descriptor here...? */
         continue;
      }

      memcpy(&desc[(i * 2) + 0], &iview->planes[0].emrt_texture, sizeof(*desc));
      memcpy(&desc[(i * 2) + 1], &iview->planes[0].emrt_pbe, sizeof(*desc));
   }

   desc = descs.cpu;

   /* Bind the base as u0_u1 for bindless access */
   agx_usc_uniform(b, 0, 4, hk_pool_upload(cmd, &descs.gpu, 8, 8));
}
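/* Make sure scratch (spill) memory is available for this shader: grow the
 * device-level scratch allocation for the shader's hardware stage and record
 * in the control stream which stages need main/preamble scratch.
 */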
void
hk_reserve_scratch(struct hk_cmd_buffer *cmd, struct hk_cs *cs,
                   struct hk_shader *s)
{
   struct hk_device *dev = hk_cmd_buffer_device(cmd);

   uint32_t max_scratch_size =
      MAX2(s->b.info.scratch_size, s->b.info.preamble_scratch_size);

   if (max_scratch_size == 0)
      return;

   unsigned preamble_size = (s->b.info.preamble_scratch_size > 0) ? 1 : 0;

   /* Note: this uses the hardware stage, not the software stage */
   hk_device_alloc_scratch(dev, s->b.info.stage, max_scratch_size);

   switch (s->b.info.stage) {
   case PIPE_SHADER_FRAGMENT:
      cs->scratch.fs.main = true;
      cs->scratch.fs.preamble = MAX2(cs->scratch.fs.preamble, preamble_size);
      break;
   case PIPE_SHADER_VERTEX:
      cs->scratch.vs.main = true;
      cs->scratch.vs.preamble = MAX2(cs->scratch.vs.preamble, preamble_size);
      break;
   default:
      cs->scratch.cs.main = true;
      cs->scratch.cs.preamble = MAX2(cs->scratch.cs.preamble, preamble_size);
      break;
   }
}

uint32_t
hk_upload_usc_words(struct hk_cmd_buffer *cmd, struct hk_shader *s,
                    struct hk_linked_shader *linked)
{
   struct hk_device *dev = hk_cmd_buffer_device(cmd);

   enum pipe_shader_type sw_stage = s->info.stage;
   enum pipe_shader_type hw_stage = s->b.info.stage;

   unsigned constant_push_ranges =
      DIV_ROUND_UP(s->b.info.immediate_size_16, 64);
   unsigned push_ranges = 2;
   unsigned stage_ranges = 3;

   size_t usc_size =
      agx_usc_size(constant_push_ranges + push_ranges + stage_ranges + 4);
   struct agx_ptr t = hk_pool_usc_alloc(cmd, usc_size, 64);
   if (!t.cpu)
      return 0;

   struct agx_usc_builder b = agx_usc_builder(t.cpu, usc_size);

   uint64_t root_ptr;

   if (sw_stage == PIPE_SHADER_COMPUTE)
      root_ptr = hk_cmd_buffer_upload_root(cmd, VK_PIPELINE_BIND_POINT_COMPUTE);
   else
      root_ptr = cmd->state.gfx.root;

   static_assert(offsetof(struct hk_root_descriptor_table, root_desc_addr) == 0,
                 "self-reflective");
   agx_usc_uniform(&b, HK_ROOT_UNIFORM, 4, root_ptr);

   if (sw_stage == MESA_SHADER_VERTEX) {
      unsigned count =
         DIV_ROUND_UP(BITSET_LAST_BIT(s->info.vs.attrib_components_read), 4);

      if (count) {
         agx_usc_uniform(
            &b, 0, 4 * count,
            root_ptr + hk_root_descriptor_offset(draw.attrib_base));

         agx_usc_uniform(
            &b, 4 * count, 2 * count,
            root_ptr + hk_root_descriptor_offset(draw.attrib_clamps));
      }

      if (cmd->state.gfx.draw_params)
         agx_usc_uniform(&b, 6 * count, 4, cmd->state.gfx.draw_params);

      if (cmd->state.gfx.draw_id_ptr)
         agx_usc_uniform(&b, (6 * count) + 4, 1, cmd->state.gfx.draw_id_ptr);

      if (hw_stage == MESA_SHADER_COMPUTE) {
         agx_usc_uniform(
            &b, (6 * count) + 8, 4,
            root_ptr + hk_root_descriptor_offset(draw.input_assembly));
      }
   } else if (sw_stage == MESA_SHADER_FRAGMENT) {
      if (agx_tilebuffer_spills(&cmd->state.gfx.render.tilebuffer)) {
         hk_usc_upload_spilled_rt_descs(&b, cmd);
      }

      agx_usc_uniform(
         &b, 4, 8, root_ptr + hk_root_descriptor_offset(draw.blend_constant));

      /* The SHARED state is baked into linked->usc for non-fragment shaders.
       * We don't pass around the information to bake the tilebuffer layout.
       *
       * TODO: We probably could with some refactor.
       */
      agx_usc_push_packed(&b, SHARED, &cmd->state.gfx.render.tilebuffer.usc);
   }

   agx_usc_push_blob(&b, linked->usc.data, linked->usc.size);
   return agx_usc_addr(&dev->dev, t.gpu);
}
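/* Note: agx_usc_uniform() offsets and sizes are counted in 16-bit uniform
 * registers, which is why a 64-bit address occupies 4 slots above and why
 * hk_upload_usc_words_kernel below converts a byte count with
 * DIV_ROUND_UP(data_size, 2). For example (illustrative only), pushing two
 * consecutive 64-bit addresses starting at u0 would look like:
 *
 *    agx_usc_uniform(&b, 0, 4, addr0);
 *    agx_usc_uniform(&b, 4, 4, addr1);
 */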
/* Specialized variant of hk_upload_usc_words for internal dispatches that do
 * not use any state except for some directly mapped uniforms.
 */
uint32_t
hk_upload_usc_words_kernel(struct hk_cmd_buffer *cmd, struct hk_shader *s,
                           void *data, size_t data_size)
{
   struct hk_device *dev = hk_cmd_buffer_device(cmd);
   assert(s->info.stage == MESA_SHADER_COMPUTE);
   assert(s->b.info.scratch_size == 0 && "you shouldn't be spilling!");
   assert(s->b.info.preamble_scratch_size == 0 && "you shouldn't be spilling!");

   unsigned constant_push_ranges =
      DIV_ROUND_UP(s->b.info.immediate_size_16, 64);

   size_t usc_size = agx_usc_size(constant_push_ranges + 7);
   struct agx_ptr t = hk_pool_usc_alloc(cmd, usc_size, 64);
   if (!t.cpu)
      return 0;

   struct agx_usc_builder b = agx_usc_builder(t.cpu, usc_size);

   /* Map the data directly as uniforms starting at u0 */
   agx_usc_uniform(&b, 0, DIV_ROUND_UP(data_size, 2),
                   hk_pool_upload(cmd, data, data_size, 4));

   agx_usc_push_blob(&b, s->only_linked->usc.data, s->only_linked->usc.size);
   return agx_usc_addr(&dev->dev, t.gpu);
}

void
hk_cs_init_graphics(struct hk_cmd_buffer *cmd, struct hk_cs *cs)
{
   struct hk_rendering_state *render = &cmd->state.gfx.render;
   uint8_t *map = cs->current;

   cs->tib = render->tilebuffer;

   /* Assume this is not the first control stream of the render pass, so
    * initially use the partial background program and ZLS control.
    * hk_BeginRendering will override.
    */
   cs->cr = render->cr;
   cs->cr.bg.main = render->cr.bg.partial;
   cs->cr.zls_control = render->cr.zls_control_partial;

   /* Barrier to enforce GPU-CPU coherency, in case this batch is back to back
    * with another that caused stale data to be cached and the CPU wrote to it
    * in the meantime.
    */
   agx_push(map, VDM_BARRIER, cfg) {
      cfg.usc_cache_inval = true;
   }

   struct AGX_PPP_HEADER present = {
      .w_clamp = true,
      .occlusion_query_2 = true,
      .output_unknown = true,
      .varying_word_2 = true,
      .viewport_count = 1, /* irrelevant */
   };

   size_t size = agx_ppp_update_size(&present);
   struct agx_ptr T = hk_pool_alloc(cmd, size, 64);
   if (!T.cpu)
      return;

   struct agx_ppp_update ppp = agx_new_ppp_update(T, size, &present);

   /* clang-format off */
   agx_ppp_push(&ppp, W_CLAMP, cfg) cfg.w_clamp = 1e-10;
   agx_ppp_push(&ppp, FRAGMENT_OCCLUSION_QUERY_2, cfg);
   agx_ppp_push(&ppp, OUTPUT_UNKNOWN, cfg);
   agx_ppp_push(&ppp, VARYING_2, cfg);
   /* clang-format on */

   agx_ppp_fini(&map, &ppp);
   cs->current = map;

   util_dynarray_init(&cs->scissor, NULL);
   util_dynarray_init(&cs->depth_bias, NULL);

   /* All graphics state must be re-emitted in each control stream */
   hk_cmd_buffer_dirty_all(cmd);
}
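/* Guarantee that at least `space` bytes (plus room for a stream link) remain
 * in the control stream, chaining to a freshly allocated block from the
 * command buffer pool when the current one is nearly full.
 */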
void
hk_ensure_cs_has_space(struct hk_cmd_buffer *cmd, struct hk_cs *cs,
                       size_t space)
{
   bool vdm = cs->type == HK_CS_VDM;
   size_t link_length =
      vdm ? AGX_VDM_STREAM_LINK_LENGTH : AGX_CDM_STREAM_LINK_LENGTH;

   /* Assert that we have space for a link tag */
   assert((cs->current + link_length) <= cs->end && "Encoder overflowed");

   /* Always leave room for a link tag, in case we run out of space later,
    * plus padding because VDM apparently overreads?
    *
    * 0x200 is not enough. 0x400 seems to work. 0x800 for safety.
    */
   space += link_length + 0x800;

   /* If there is room in the command buffer, we're done */
   if (likely((cs->end - cs->current) >= space))
      return;

   /* Otherwise, we need to allocate a new command buffer. We use memory owned
    * by the batch to simplify lifetime management for the BO.
    */
   size_t size = 65536;
   struct agx_ptr T = hk_pool_alloc(cmd, size, 256);

   /* Jump from the old control stream to the new control stream */
   if (vdm) {
      agx_pack(cs->current, VDM_STREAM_LINK, cfg) {
         cfg.target_lo = T.gpu & BITFIELD_MASK(32);
         cfg.target_hi = T.gpu >> 32;
      }
   } else {
      agx_pack(cs->current, CDM_STREAM_LINK, cfg) {
         cfg.target_lo = T.gpu & BITFIELD_MASK(32);
         cfg.target_hi = T.gpu >> 32;
      }
   }

   /* Swap out the control stream */
   cs->current = T.cpu;
   cs->end = cs->current + size;
   cs->stream_linked = true;
}
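/* Typical usage sketch (illustrative; the reservation size is hypothetical):
 * reserve worst-case space before encoding, then pack into the stream using
 * the same pattern as hk_cs_init_graphics above.
 *
 *    hk_ensure_cs_has_space(cmd, cs, 0x100);
 *
 *    uint8_t *out = cs->current;
 *    agx_push(out, VDM_BARRIER, cfg) {
 *       cfg.usc_cache_inval = true;
 *    }
 *    cs->current = out;
 */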