/* * Copyright 2023 Alyssa Rosenzweig * Copyright 2023 Valve Corporation * SPDX-License-Identifier: MIT */ #include "shaders/tessellator.h" #include "geometry.h" /* Compatible with util/u_math.h */ static inline uint util_logbase2_ceil(uint n) { if (n <= 1) return 0; else return 32 - clz(n - 1); } /* Swap the two non-provoking vertices third vert in odd triangles. This * generates a vertex ID list with a consistent winding order. * * With prim and flatshade_first, the map : [0, 1, 2] -> [0, 1, 2] is its own * inverse. This lets us reuse it for both vertex fetch and transform feedback. */ uint libagx_map_vertex_in_tri_strip(uint prim, uint vert, bool flatshade_first) { unsigned pv = flatshade_first ? 0 : 2; bool even = (prim & 1) == 0; bool provoking = vert == pv; return (provoking || even) ? vert : ((3 - pv) - vert); } uint64_t libagx_xfb_vertex_address(global struct agx_geometry_params *p, uint base_index, uint vert, uint buffer, uint stride, uint output_offset) { uint index = base_index + vert; uint xfb_offset = (index * stride) + output_offset; return (uintptr_t)(p->xfb_base[buffer]) + xfb_offset; } uint libagx_vertex_id_for_line_loop(uint prim, uint vert, uint num_prims) { /* (0, 1), (1, 2), (2, 0) */ if (prim == (num_prims - 1) && vert == 1) return 0; else return prim + vert; } uint libagx_vertex_id_for_line_class(enum mesa_prim mode, uint prim, uint vert, uint num_prims) { /* Line list, line strip, or line loop */ if (mode == MESA_PRIM_LINE_LOOP && prim == (num_prims - 1) && vert == 1) return 0; if (mode == MESA_PRIM_LINES) prim *= 2; return prim + vert; } uint libagx_vertex_id_for_tri_fan(uint prim, uint vert, bool flatshade_first) { /* Vulkan spec section 20.1.7 gives (i + 1, i + 2, 0) for a provoking * first. OpenGL instead wants (0, i + 1, i + 2) with a provoking last. * Piglit clipflat expects us to switch between these orders depending on * provoking vertex, to avoid trivializing the fan. * * Rotate accordingly. */ if (flatshade_first) { vert = (vert == 2) ? 0 : (vert + 1); } /* The simpler form assuming last is provoking. */ return (vert == 0) ? 0 : prim + vert; } uint libagx_vertex_id_for_tri_class(enum mesa_prim mode, uint prim, uint vert, bool flatshade_first) { if (flatshade_first && mode == MESA_PRIM_TRIANGLE_FAN) { vert = vert + 1; vert = (vert == 3) ? 0 : vert; } if (mode == MESA_PRIM_TRIANGLE_FAN && vert == 0) return 0; if (mode == MESA_PRIM_TRIANGLES) prim *= 3; /* Triangle list, triangle strip, or triangle fan */ if (mode == MESA_PRIM_TRIANGLE_STRIP) { unsigned pv = flatshade_first ? 0 : 2; bool even = (prim & 1) == 0; bool provoking = vert == pv; vert = ((provoking || even) ? vert : ((3 - pv) - vert)); } return prim + vert; } uint libagx_vertex_id_for_line_adj_class(enum mesa_prim mode, uint prim, uint vert) { /* Line list adj or line strip adj */ if (mode == MESA_PRIM_LINES_ADJACENCY) prim *= 4; return prim + vert; } uint libagx_vertex_id_for_tri_strip_adj(uint prim, uint vert, uint num_prims, bool flatshade_first) { /* See Vulkan spec section 20.1.11 "Triangle Strips With Adjancency". * * There are different cases for first/middle/last/only primitives and for * odd/even primitives. Determine which case we're in. */ bool last = prim == (num_prims - 1); bool first = prim == 0; bool even = (prim & 1) == 0; bool even_or_first = even || first; /* When the last vertex is provoking, we rotate the primitives * accordingly. This seems required for OpenGL. */ if (!flatshade_first && !even_or_first) { vert = (vert + 4u) % 6u; } /* Offsets per the spec. The spec lists 6 cases with 6 offsets. Luckily, * there are lots of patterns we can exploit, avoiding a full 6x6 LUT. * * Here we assume the first vertex is provoking, the Vulkan default. */ uint offsets[6] = { 0, first ? 1 : (even ? -2 : 3), even_or_first ? 2 : 4, last ? 5 : 6, even_or_first ? 4 : 2, even_or_first ? 3 : -2, }; /* Ensure NIR can see thru the local array */ uint offset = 0; for (uint i = 1; i < 6; ++i) { if (i == vert) offset = offsets[i]; } /* Finally add to the base of the primitive */ return (prim * 2) + offset; } uint libagx_vertex_id_for_tri_adj_class(enum mesa_prim mode, uint prim, uint vert, uint nr, bool flatshade_first) { /* Tri adj list or tri adj strip */ if (mode == MESA_PRIM_TRIANGLE_STRIP_ADJACENCY) { return libagx_vertex_id_for_tri_strip_adj(prim, vert, nr, flatshade_first); } else { return (6 * prim) + vert; } } uint libagx_vertex_id_for_topology(enum mesa_prim mode, bool flatshade_first, uint prim, uint vert, uint num_prims) { switch (mode) { case MESA_PRIM_POINTS: case MESA_PRIM_LINES: case MESA_PRIM_TRIANGLES: case MESA_PRIM_LINES_ADJACENCY: case MESA_PRIM_TRIANGLES_ADJACENCY: /* Regular primitive: every N vertices defines a primitive */ return (prim * mesa_vertices_per_prim(mode)) + vert; case MESA_PRIM_LINE_LOOP: return libagx_vertex_id_for_line_loop(prim, vert, num_prims); case MESA_PRIM_LINE_STRIP: case MESA_PRIM_LINE_STRIP_ADJACENCY: /* (i, i + 1) or (i, ..., i + 3) */ return prim + vert; case MESA_PRIM_TRIANGLE_STRIP: { /* Order depends on the provoking vert. * * First: (0, 1, 2), (1, 3, 2), (2, 3, 4). * Last: (0, 1, 2), (2, 1, 3), (2, 3, 4). * * Pull the (maybe swapped) vert from the corresponding primitive */ return prim + libagx_map_vertex_in_tri_strip(prim, vert, flatshade_first); } case MESA_PRIM_TRIANGLE_FAN: return libagx_vertex_id_for_tri_fan(prim, vert, flatshade_first); case MESA_PRIM_TRIANGLE_STRIP_ADJACENCY: return libagx_vertex_id_for_tri_strip_adj(prim, vert, num_prims, flatshade_first); default: return 0; } } uint libagx_load_index_buffer_internal(uintptr_t index_buffer, uint32_t index_buffer_range_el, uint id, uint index_size) { bool oob = id >= index_buffer_range_el; /* If the load would be out-of-bounds, load the first element which is * assumed valid. If the application index buffer is empty with robustness2, * index_buffer will point to a zero sink where only the first is valid. */ if (oob) { id = 0; } uint el; if (index_size == 1) { el = ((constant uint8_t *)index_buffer)[id]; } else if (index_size == 2) { el = ((constant uint16_t *)index_buffer)[id]; } else { el = ((constant uint32_t *)index_buffer)[id]; } /* D3D robustness semantics. TODO: Optimize? */ if (oob) { el = 0; } return el; } uint libagx_load_index_buffer(constant struct agx_ia_state *p, uint id, uint index_size) { return libagx_load_index_buffer_internal( p->index_buffer, p->index_buffer_range_el, id, index_size); } /* * Return the ID of the first thread in the workgroup where cond is true, or * 1024 if cond is false across the workgroup. */ static uint first_true_thread_in_workgroup(bool cond, local uint *scratch) { barrier(CLK_LOCAL_MEM_FENCE); scratch[get_sub_group_id()] = ballot(cond); barrier(CLK_LOCAL_MEM_FENCE); uint first_group = ctz(ballot(scratch[get_sub_group_local_id()])); uint off = ctz(first_group < 32 ? scratch[first_group] : 0); return (first_group * 32) + off; } /* * Allocate memory from the heap (thread-safe). Returns the offset into the * heap. The allocation will be word-aligned. */ static inline uint libagx_atomic_alloc(global struct agx_geometry_state *heap, uint size_B) { return atomic_fetch_add((volatile atomic_uint *)(&heap->heap_bottom), align(size_B, 8)); } /* * When unrolling the index buffer for a draw, we translate the old indirect * draws to new indirect draws. This routine allocates the new index buffer and * sets up most of the new draw descriptor. */ static global void * setup_unroll_for_draw(global struct agx_restart_unroll_params *p, constant uint *in_draw, uint draw, enum mesa_prim mode, uint index_size_B) { /* Determine an upper bound on the memory required for the index buffer. * Restarts only decrease the unrolled index buffer size, so the maximum size * is the unrolled size when the input has no restarts. */ uint max_prims = u_decomposed_prims_for_vertices(mode, in_draw[0]); uint max_verts = max_prims * mesa_vertices_per_prim(mode); uint alloc_size = max_verts * index_size_B; /* Allocate unrolled index buffer. Atomic since multiple threads may be * running to handle multidraw in parallel. */ global struct agx_geometry_state *heap = p->heap; uint old_heap_bottom_B = libagx_atomic_alloc(p->heap, alloc_size); /* Regardless of the input stride, we use tightly packed output draws */ global uint *out = &p->out_draws[5 * draw]; /* Setup most of the descriptor. Count will be determined after unroll. */ out[1] = in_draw[1]; /* instance count */ out[2] = old_heap_bottom_B / index_size_B; /* index offset */ out[3] = in_draw[3]; /* index bias */ out[4] = in_draw[4]; /* base instance */ /* Return the index buffer we allocated */ return (global uchar *)heap->heap + old_heap_bottom_B; } #define UNROLL(INDEX, suffix) \ kernel void libagx_unroll_restart_##suffix( \ global struct agx_restart_unroll_params *p, enum mesa_prim mode, \ uint draw, uint tid) \ { \ /* For an indirect multidraw, we are dispatched maxDraws times and \ * terminate trailing invocations. \ */ \ if (p->count && draw >= *(p->count)) \ return; \ \ constant uint *in_draw = \ (constant uint *)(p->draws + (draw * p->draw_stride)); \ \ uint count = in_draw[0]; \ \ local uintptr_t out_ptr, in_ptr; \ if (tid == 0) { \ out_ptr = (uintptr_t)setup_unroll_for_draw(p, in_draw, draw, mode, \ sizeof(INDEX)); \ \ /* Accessed thru local mem because NIR deref is too aggressive */ \ in_ptr = (uintptr_t)(libagx_index_buffer( \ p->index_buffer, p->index_buffer_size_el, in_draw[2], \ sizeof(INDEX), p->zero_sink)); \ } \ \ barrier(CLK_LOCAL_MEM_FENCE); \ global INDEX *out = (global INDEX *)out_ptr; \ \ local uint scratch[32]; \ \ uint out_prims = 0; \ INDEX restart_idx = p->restart_index; \ bool flatshade_first = p->flatshade_first; \ \ uint needle = 0; \ uint per_prim = mesa_vertices_per_prim(mode); \ while (needle < count) { \ /* Search for next restart or the end. Lanes load in parallel. */ \ uint next_restart = needle; \ for (;;) { \ uint idx = next_restart + tid; \ bool restart = \ idx >= count || libagx_load_index_buffer_internal( \ in_ptr, p->index_buffer_size_el, idx, \ sizeof(INDEX)) == restart_idx; \ \ uint next_offs = first_true_thread_in_workgroup(restart, scratch); \ \ next_restart += next_offs; \ if (next_offs < 1024) \ break; \ } \ \ /* Emit up to the next restart. Lanes output in parallel */ \ uint subcount = next_restart - needle; \ uint subprims = u_decomposed_prims_for_vertices(mode, subcount); \ uint out_prims_base = out_prims; \ for (uint i = tid; i < subprims; i += 1024) { \ for (uint vtx = 0; vtx < per_prim; ++vtx) { \ uint id = libagx_vertex_id_for_topology(mode, flatshade_first, \ i, vtx, subprims); \ uint offset = needle + id; \ \ out[((out_prims_base + i) * per_prim) + vtx] = \ libagx_load_index_buffer_internal( \ in_ptr, p->index_buffer_size_el, offset, sizeof(INDEX)); \ } \ } \ \ out_prims += subprims; \ needle = next_restart + 1; \ } \ \ if (tid == 0) \ p->out_draws[(5 * draw) + 0] = out_prims * per_prim; \ } UNROLL(uchar, u8) UNROLL(ushort, u16) UNROLL(uint, u32) uint libagx_setup_xfb_buffer(global struct agx_geometry_params *p, uint i) { global uint *off_ptr = p->xfb_offs_ptrs[i]; if (!off_ptr) return 0; uint off = *off_ptr; p->xfb_base[i] = p->xfb_base_original[i] + off; return off; } /* * Translate EndPrimitive for LINE_STRIP or TRIANGLE_STRIP output prims into * writes into the 32-bit output index buffer. We write the sequence (b, b + 1, * b + 2, ..., b + n - 1, -1), where b (base) is the first vertex in the prim, n * (count) is the number of verts in the prims, and -1 is the prim restart index * used to signal the end of the prim. * * For points, we write index buffers without restart, just as a sideband to * pass data into the vertex shader. */ void libagx_end_primitive(global int *index_buffer, uint total_verts, uint verts_in_prim, uint total_prims, uint invocation_vertex_base, uint invocation_prim_base, uint geometry_base, bool restart) { /* Previous verts/prims are from previous invocations plus earlier * prims in this invocation. For the intra-invocation counts, we * subtract the count for this prim from the inclusive sum NIR gives us. */ uint previous_verts_in_invoc = (total_verts - verts_in_prim); uint previous_verts = invocation_vertex_base + previous_verts_in_invoc; uint previous_prims = restart ? invocation_prim_base + (total_prims - 1) : 0; /* The indices are encoded as: (unrolled ID * output vertices) + vertex. */ uint index_base = geometry_base + previous_verts_in_invoc; /* Index buffer contains 1 index for each vertex and 1 for each prim */ global int *out = &index_buffer[previous_verts + previous_prims]; /* Write out indices for the strip */ for (uint i = 0; i < verts_in_prim; ++i) { out[i] = index_base + i; } if (restart) out[verts_in_prim] = -1; } void libagx_build_gs_draw(global struct agx_geometry_params *p, uint vertices, uint primitives) { global uint *descriptor = p->indirect_desc; global struct agx_geometry_state *state = p->state; /* Setup the indirect draw descriptor */ uint indices = vertices + primitives; /* includes restart indices */ /* Allocate the index buffer */ uint index_buffer_offset_B = state->heap_bottom; p->output_index_buffer = (global uint *)(state->heap + index_buffer_offset_B); state->heap_bottom += (indices * 4); descriptor[0] = indices; /* count */ descriptor[1] = 1; /* instance count */ descriptor[2] = index_buffer_offset_B / 4; /* start */ descriptor[3] = 0; /* index bias */ descriptor[4] = 0; /* start instance */ if (state->heap_bottom > state->heap_size) { global uint *foo = (global uint *)(uintptr_t)0xdeadbeef; *foo = 0x1234; } } void libagx_gs_setup_indirect(global struct agx_gs_setup_indirect_params *gsi, enum mesa_prim mode, uint local_id) { global struct agx_geometry_params *p = gsi->geom; global struct agx_ia_state *ia = gsi->ia; /* Determine the (primitives, instances) grid size. */ uint vertex_count = gsi->draw[0]; uint instance_count = gsi->draw[1]; ia->verts_per_instance = vertex_count; /* Calculate number of primitives input into the GS */ uint prim_per_instance = u_decomposed_prims_for_vertices(mode, vertex_count); p->input_primitives = prim_per_instance * instance_count; /* Invoke VS as (vertices, instances); GS as (primitives, instances) */ p->vs_grid[0] = vertex_count; p->vs_grid[1] = instance_count; p->gs_grid[0] = prim_per_instance; p->gs_grid[1] = instance_count; p->primitives_log2 = util_logbase2_ceil(prim_per_instance); /* If indexing is enabled, the third word is the offset into the index buffer * in elements. Apply that offset now that we have it. For a hardware * indirect draw, the hardware would do this for us, but for software input * assembly we need to do it ourselves. */ if (gsi->index_size_B) { ia->index_buffer = libagx_index_buffer(gsi->index_buffer, gsi->index_buffer_range_el, gsi->draw[2], gsi->index_size_B, gsi->zero_sink); ia->index_buffer_range_el = libagx_index_buffer_range_el(gsi->index_buffer_range_el, gsi->draw[2]); } /* We need to allocate VS and GS count buffers, do so now */ global struct agx_geometry_state *state = p->state; uint vertex_buffer_size = libagx_tcs_in_size(vertex_count * instance_count, gsi->vs_outputs); p->count_buffer = (global uint *)(state->heap + state->heap_bottom); state->heap_bottom += align(p->input_primitives * p->count_buffer_stride, 16); p->input_buffer = (uintptr_t)(state->heap + state->heap_bottom); *(gsi->vertex_buffer) = p->input_buffer; state->heap_bottom += align(vertex_buffer_size, 4); p->input_mask = gsi->vs_outputs; if (state->heap_bottom > state->heap_size) { global uint *foo = (global uint *)(uintptr_t)0x1deadbeef; *foo = 0x1234; } } /* * Returns (work_group_scan_inclusive_add(x), work_group_sum(x)). Implemented * manually with subgroup ops and local memory since Mesa doesn't do those * lowerings yet. */ static uint2 libagx_work_group_scan_inclusive_add(uint x, local uint *scratch) { uint sg_id = get_sub_group_id(); /* Partial prefix sum of the subgroup */ uint sg = sub_group_scan_inclusive_add(x); /* Reduction (sum) for the subgroup */ uint sg_sum = sub_group_broadcast(sg, 31); /* Write out all the subgroups sums */ barrier(CLK_LOCAL_MEM_FENCE); scratch[sg_id] = sg_sum; barrier(CLK_LOCAL_MEM_FENCE); /* Read all the subgroup sums. Thread T in subgroup G reads the sum of all * threads in subgroup T. */ uint other_sum = scratch[get_sub_group_local_id()]; /* Exclusive sum the subgroup sums to get the total before the current group, * which can be added to the total for the current group. */ uint other_sums = sub_group_scan_exclusive_add(other_sum); uint base = sub_group_broadcast(other_sums, sg_id); uint prefix = base + sg; /* Reduce the workgroup using the prefix sum we already did */ uint reduction = sub_group_broadcast(other_sums + other_sum, 31); return (uint2)(prefix, reduction); } kernel void libagx_prefix_sum(global uint *buffer, uint len, uint words, uint word) { local uint scratch[32]; uint tid = get_local_id(0); /* Main loop: complete workgroups processing 1024 values at once */ uint i, count = 0; uint len_remainder = len % 1024; uint len_rounded_down = len - len_remainder; for (i = tid; i < len_rounded_down; i += 1024) { global uint *ptr = &buffer[(i * words) + word]; uint value = *ptr; uint2 sums = libagx_work_group_scan_inclusive_add(value, scratch); *ptr = count + sums[0]; count += sums[1]; } /* The last iteration is special since we won't have a full subgroup unless * the length is divisible by the subgroup size, and we don't advance count. */ global uint *ptr = &buffer[(i * words) + word]; uint value = (tid < len_remainder) ? *ptr : 0; uint scan = libagx_work_group_scan_inclusive_add(value, scratch)[0]; if (tid < len_remainder) { *ptr = count + scan; } } kernel void libagx_prefix_sum_tess(global struct libagx_tess_args *p) { libagx_prefix_sum(p->counts, p->nr_patches, 1 /* words */, 0 /* word */); /* After prefix summing, we know the total # of indices, so allocate the * index buffer now. Elect a thread for the allocation. */ barrier(CLK_LOCAL_MEM_FENCE); if (get_local_id(0) != 0) return; /* The last element of an inclusive prefix sum is the total sum */ uint total = p->counts[p->nr_patches - 1]; /* Allocate 4-byte indices */ uint32_t elsize_B = sizeof(uint32_t); uint32_t size_B = total * elsize_B; uint alloc_B = p->heap->heap_bottom; p->heap->heap_bottom += size_B; p->heap->heap_bottom = align(p->heap->heap_bottom, 8); p->index_buffer = (global uint32_t *)(((uintptr_t)p->heap->heap) + alloc_B); /* ...and now we can generate the API indexed draw */ global uint32_t *desc = p->out_draws; desc[0] = total; /* count */ desc[1] = 1; /* instance_count */ desc[2] = alloc_B / elsize_B; /* start */ desc[3] = 0; /* index_bias */ desc[4] = 0; /* start_instance */ } uintptr_t libagx_vertex_output_address(uintptr_t buffer, uint64_t mask, uint vtx, gl_varying_slot location) { return buffer + libagx_tcs_in_offs(vtx, location, mask); } uintptr_t libagx_geometry_input_address(constant struct agx_geometry_params *p, uint vtx, gl_varying_slot location) { return libagx_vertex_output_address(p->input_buffer, p->input_mask, vtx, location); } unsigned libagx_input_vertices(constant struct agx_ia_state *ia) { return ia->verts_per_instance; }