xref: /aosp_15_r20/external/mesa3d/src/asahi/lib/shaders/geometry.cl (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1*61046927SAndroid Build Coastguard Worker/*
2*61046927SAndroid Build Coastguard Worker * Copyright 2023 Alyssa Rosenzweig
3*61046927SAndroid Build Coastguard Worker * Copyright 2023 Valve Corporation
4*61046927SAndroid Build Coastguard Worker * SPDX-License-Identifier: MIT
5*61046927SAndroid Build Coastguard Worker */
6*61046927SAndroid Build Coastguard Worker
7*61046927SAndroid Build Coastguard Worker#include "shaders/tessellator.h"
8*61046927SAndroid Build Coastguard Worker#include "geometry.h"
9*61046927SAndroid Build Coastguard Worker
10*61046927SAndroid Build Coastguard Worker/* Compatible with util/u_math.h */
11*61046927SAndroid Build Coastguard Workerstatic inline uint
12*61046927SAndroid Build Coastguard Workerutil_logbase2_ceil(uint n)
13*61046927SAndroid Build Coastguard Worker{
14*61046927SAndroid Build Coastguard Worker   if (n <= 1)
15*61046927SAndroid Build Coastguard Worker      return 0;
16*61046927SAndroid Build Coastguard Worker   else
17*61046927SAndroid Build Coastguard Worker      return 32 - clz(n - 1);
18*61046927SAndroid Build Coastguard Worker}
19*61046927SAndroid Build Coastguard Worker
20*61046927SAndroid Build Coastguard Worker/* Swap the two non-provoking vertices third vert in odd triangles. This
21*61046927SAndroid Build Coastguard Worker * generates a vertex ID list with a consistent winding order.
22*61046927SAndroid Build Coastguard Worker *
23*61046927SAndroid Build Coastguard Worker * With prim and flatshade_first, the map : [0, 1, 2] -> [0, 1, 2] is its own
24*61046927SAndroid Build Coastguard Worker * inverse. This lets us reuse it for both vertex fetch and transform feedback.
25*61046927SAndroid Build Coastguard Worker */
26*61046927SAndroid Build Coastguard Workeruint
27*61046927SAndroid Build Coastguard Workerlibagx_map_vertex_in_tri_strip(uint prim, uint vert, bool flatshade_first)
28*61046927SAndroid Build Coastguard Worker{
29*61046927SAndroid Build Coastguard Worker   unsigned pv = flatshade_first ? 0 : 2;
30*61046927SAndroid Build Coastguard Worker
31*61046927SAndroid Build Coastguard Worker   bool even = (prim & 1) == 0;
32*61046927SAndroid Build Coastguard Worker   bool provoking = vert == pv;
33*61046927SAndroid Build Coastguard Worker
34*61046927SAndroid Build Coastguard Worker   return (provoking || even) ? vert : ((3 - pv) - vert);
35*61046927SAndroid Build Coastguard Worker}
36*61046927SAndroid Build Coastguard Worker
37*61046927SAndroid Build Coastguard Workeruint64_t
38*61046927SAndroid Build Coastguard Workerlibagx_xfb_vertex_address(global struct agx_geometry_params *p, uint base_index,
39*61046927SAndroid Build Coastguard Worker                          uint vert, uint buffer, uint stride,
40*61046927SAndroid Build Coastguard Worker                          uint output_offset)
41*61046927SAndroid Build Coastguard Worker{
42*61046927SAndroid Build Coastguard Worker   uint index = base_index + vert;
43*61046927SAndroid Build Coastguard Worker   uint xfb_offset = (index * stride) + output_offset;
44*61046927SAndroid Build Coastguard Worker
45*61046927SAndroid Build Coastguard Worker   return (uintptr_t)(p->xfb_base[buffer]) + xfb_offset;
46*61046927SAndroid Build Coastguard Worker}
47*61046927SAndroid Build Coastguard Worker
48*61046927SAndroid Build Coastguard Workeruint
49*61046927SAndroid Build Coastguard Workerlibagx_vertex_id_for_line_loop(uint prim, uint vert, uint num_prims)
50*61046927SAndroid Build Coastguard Worker{
51*61046927SAndroid Build Coastguard Worker   /* (0, 1), (1, 2), (2, 0) */
52*61046927SAndroid Build Coastguard Worker   if (prim == (num_prims - 1) && vert == 1)
53*61046927SAndroid Build Coastguard Worker      return 0;
54*61046927SAndroid Build Coastguard Worker   else
55*61046927SAndroid Build Coastguard Worker      return prim + vert;
56*61046927SAndroid Build Coastguard Worker}
57*61046927SAndroid Build Coastguard Worker
58*61046927SAndroid Build Coastguard Workeruint
59*61046927SAndroid Build Coastguard Workerlibagx_vertex_id_for_line_class(enum mesa_prim mode, uint prim, uint vert,
60*61046927SAndroid Build Coastguard Worker                                uint num_prims)
61*61046927SAndroid Build Coastguard Worker{
62*61046927SAndroid Build Coastguard Worker   /* Line list, line strip, or line loop */
63*61046927SAndroid Build Coastguard Worker   if (mode == MESA_PRIM_LINE_LOOP && prim == (num_prims - 1) && vert == 1)
64*61046927SAndroid Build Coastguard Worker      return 0;
65*61046927SAndroid Build Coastguard Worker
66*61046927SAndroid Build Coastguard Worker   if (mode == MESA_PRIM_LINES)
67*61046927SAndroid Build Coastguard Worker      prim *= 2;
68*61046927SAndroid Build Coastguard Worker
69*61046927SAndroid Build Coastguard Worker   return prim + vert;
70*61046927SAndroid Build Coastguard Worker}
71*61046927SAndroid Build Coastguard Worker
72*61046927SAndroid Build Coastguard Workeruint
73*61046927SAndroid Build Coastguard Workerlibagx_vertex_id_for_tri_fan(uint prim, uint vert, bool flatshade_first)
74*61046927SAndroid Build Coastguard Worker{
75*61046927SAndroid Build Coastguard Worker   /* Vulkan spec section 20.1.7 gives (i + 1, i + 2, 0) for a provoking
76*61046927SAndroid Build Coastguard Worker    * first. OpenGL instead wants (0, i + 1, i + 2) with a provoking last.
77*61046927SAndroid Build Coastguard Worker    * Piglit clipflat expects us to switch between these orders depending on
78*61046927SAndroid Build Coastguard Worker    * provoking vertex, to avoid trivializing the fan.
79*61046927SAndroid Build Coastguard Worker    *
80*61046927SAndroid Build Coastguard Worker    * Rotate accordingly.
81*61046927SAndroid Build Coastguard Worker    */
82*61046927SAndroid Build Coastguard Worker   if (flatshade_first) {
83*61046927SAndroid Build Coastguard Worker      vert = (vert == 2) ? 0 : (vert + 1);
84*61046927SAndroid Build Coastguard Worker   }
85*61046927SAndroid Build Coastguard Worker
86*61046927SAndroid Build Coastguard Worker   /* The simpler form assuming last is provoking. */
87*61046927SAndroid Build Coastguard Worker   return (vert == 0) ? 0 : prim + vert;
88*61046927SAndroid Build Coastguard Worker}
89*61046927SAndroid Build Coastguard Worker
90*61046927SAndroid Build Coastguard Workeruint
91*61046927SAndroid Build Coastguard Workerlibagx_vertex_id_for_tri_class(enum mesa_prim mode, uint prim, uint vert,
92*61046927SAndroid Build Coastguard Worker                               bool flatshade_first)
93*61046927SAndroid Build Coastguard Worker{
94*61046927SAndroid Build Coastguard Worker   if (flatshade_first && mode == MESA_PRIM_TRIANGLE_FAN) {
95*61046927SAndroid Build Coastguard Worker      vert = vert + 1;
96*61046927SAndroid Build Coastguard Worker      vert = (vert == 3) ? 0 : vert;
97*61046927SAndroid Build Coastguard Worker   }
98*61046927SAndroid Build Coastguard Worker
99*61046927SAndroid Build Coastguard Worker   if (mode == MESA_PRIM_TRIANGLE_FAN && vert == 0)
100*61046927SAndroid Build Coastguard Worker      return 0;
101*61046927SAndroid Build Coastguard Worker
102*61046927SAndroid Build Coastguard Worker   if (mode == MESA_PRIM_TRIANGLES)
103*61046927SAndroid Build Coastguard Worker      prim *= 3;
104*61046927SAndroid Build Coastguard Worker
105*61046927SAndroid Build Coastguard Worker   /* Triangle list, triangle strip, or triangle fan */
106*61046927SAndroid Build Coastguard Worker   if (mode == MESA_PRIM_TRIANGLE_STRIP) {
107*61046927SAndroid Build Coastguard Worker      unsigned pv = flatshade_first ? 0 : 2;
108*61046927SAndroid Build Coastguard Worker
109*61046927SAndroid Build Coastguard Worker      bool even = (prim & 1) == 0;
110*61046927SAndroid Build Coastguard Worker      bool provoking = vert == pv;
111*61046927SAndroid Build Coastguard Worker
112*61046927SAndroid Build Coastguard Worker      vert = ((provoking || even) ? vert : ((3 - pv) - vert));
113*61046927SAndroid Build Coastguard Worker   }
114*61046927SAndroid Build Coastguard Worker
115*61046927SAndroid Build Coastguard Worker   return prim + vert;
116*61046927SAndroid Build Coastguard Worker}
117*61046927SAndroid Build Coastguard Worker
118*61046927SAndroid Build Coastguard Workeruint
119*61046927SAndroid Build Coastguard Workerlibagx_vertex_id_for_line_adj_class(enum mesa_prim mode, uint prim, uint vert)
120*61046927SAndroid Build Coastguard Worker{
121*61046927SAndroid Build Coastguard Worker   /* Line list adj or line strip adj */
122*61046927SAndroid Build Coastguard Worker   if (mode == MESA_PRIM_LINES_ADJACENCY)
123*61046927SAndroid Build Coastguard Worker      prim *= 4;
124*61046927SAndroid Build Coastguard Worker
125*61046927SAndroid Build Coastguard Worker   return prim + vert;
126*61046927SAndroid Build Coastguard Worker}
127*61046927SAndroid Build Coastguard Worker
128*61046927SAndroid Build Coastguard Workeruint
129*61046927SAndroid Build Coastguard Workerlibagx_vertex_id_for_tri_strip_adj(uint prim, uint vert, uint num_prims,
130*61046927SAndroid Build Coastguard Worker                                   bool flatshade_first)
131*61046927SAndroid Build Coastguard Worker{
132*61046927SAndroid Build Coastguard Worker   /* See Vulkan spec section 20.1.11 "Triangle Strips With Adjancency".
133*61046927SAndroid Build Coastguard Worker    *
134*61046927SAndroid Build Coastguard Worker    * There are different cases for first/middle/last/only primitives and for
135*61046927SAndroid Build Coastguard Worker    * odd/even primitives.  Determine which case we're in.
136*61046927SAndroid Build Coastguard Worker    */
137*61046927SAndroid Build Coastguard Worker   bool last = prim == (num_prims - 1);
138*61046927SAndroid Build Coastguard Worker   bool first = prim == 0;
139*61046927SAndroid Build Coastguard Worker   bool even = (prim & 1) == 0;
140*61046927SAndroid Build Coastguard Worker   bool even_or_first = even || first;
141*61046927SAndroid Build Coastguard Worker
142*61046927SAndroid Build Coastguard Worker   /* When the last vertex is provoking, we rotate the primitives
143*61046927SAndroid Build Coastguard Worker    * accordingly. This seems required for OpenGL.
144*61046927SAndroid Build Coastguard Worker    */
145*61046927SAndroid Build Coastguard Worker   if (!flatshade_first && !even_or_first) {
146*61046927SAndroid Build Coastguard Worker      vert = (vert + 4u) % 6u;
147*61046927SAndroid Build Coastguard Worker   }
148*61046927SAndroid Build Coastguard Worker
149*61046927SAndroid Build Coastguard Worker   /* Offsets per the spec. The spec lists 6 cases with 6 offsets. Luckily,
150*61046927SAndroid Build Coastguard Worker    * there are lots of patterns we can exploit, avoiding a full 6x6 LUT.
151*61046927SAndroid Build Coastguard Worker    *
152*61046927SAndroid Build Coastguard Worker    * Here we assume the first vertex is provoking, the Vulkan default.
153*61046927SAndroid Build Coastguard Worker    */
154*61046927SAndroid Build Coastguard Worker   uint offsets[6] = {
155*61046927SAndroid Build Coastguard Worker      0,
156*61046927SAndroid Build Coastguard Worker      first ? 1 : (even ? -2 : 3),
157*61046927SAndroid Build Coastguard Worker      even_or_first ? 2 : 4,
158*61046927SAndroid Build Coastguard Worker      last ? 5 : 6,
159*61046927SAndroid Build Coastguard Worker      even_or_first ? 4 : 2,
160*61046927SAndroid Build Coastguard Worker      even_or_first ? 3 : -2,
161*61046927SAndroid Build Coastguard Worker   };
162*61046927SAndroid Build Coastguard Worker
163*61046927SAndroid Build Coastguard Worker   /* Ensure NIR can see thru the local array */
164*61046927SAndroid Build Coastguard Worker   uint offset = 0;
165*61046927SAndroid Build Coastguard Worker   for (uint i = 1; i < 6; ++i) {
166*61046927SAndroid Build Coastguard Worker      if (i == vert)
167*61046927SAndroid Build Coastguard Worker         offset = offsets[i];
168*61046927SAndroid Build Coastguard Worker   }
169*61046927SAndroid Build Coastguard Worker
170*61046927SAndroid Build Coastguard Worker   /* Finally add to the base of the primitive */
171*61046927SAndroid Build Coastguard Worker   return (prim * 2) + offset;
172*61046927SAndroid Build Coastguard Worker}
173*61046927SAndroid Build Coastguard Worker
174*61046927SAndroid Build Coastguard Workeruint
175*61046927SAndroid Build Coastguard Workerlibagx_vertex_id_for_tri_adj_class(enum mesa_prim mode, uint prim, uint vert,
176*61046927SAndroid Build Coastguard Worker                                   uint nr, bool flatshade_first)
177*61046927SAndroid Build Coastguard Worker{
178*61046927SAndroid Build Coastguard Worker   /* Tri adj list or tri adj strip */
179*61046927SAndroid Build Coastguard Worker   if (mode == MESA_PRIM_TRIANGLE_STRIP_ADJACENCY) {
180*61046927SAndroid Build Coastguard Worker      return libagx_vertex_id_for_tri_strip_adj(prim, vert, nr,
181*61046927SAndroid Build Coastguard Worker                                                flatshade_first);
182*61046927SAndroid Build Coastguard Worker   } else {
183*61046927SAndroid Build Coastguard Worker      return (6 * prim) + vert;
184*61046927SAndroid Build Coastguard Worker   }
185*61046927SAndroid Build Coastguard Worker}
186*61046927SAndroid Build Coastguard Worker
187*61046927SAndroid Build Coastguard Workeruint
188*61046927SAndroid Build Coastguard Workerlibagx_vertex_id_for_topology(enum mesa_prim mode, bool flatshade_first,
189*61046927SAndroid Build Coastguard Worker                              uint prim, uint vert, uint num_prims)
190*61046927SAndroid Build Coastguard Worker{
191*61046927SAndroid Build Coastguard Worker   switch (mode) {
192*61046927SAndroid Build Coastguard Worker   case MESA_PRIM_POINTS:
193*61046927SAndroid Build Coastguard Worker   case MESA_PRIM_LINES:
194*61046927SAndroid Build Coastguard Worker   case MESA_PRIM_TRIANGLES:
195*61046927SAndroid Build Coastguard Worker   case MESA_PRIM_LINES_ADJACENCY:
196*61046927SAndroid Build Coastguard Worker   case MESA_PRIM_TRIANGLES_ADJACENCY:
197*61046927SAndroid Build Coastguard Worker      /* Regular primitive: every N vertices defines a primitive */
198*61046927SAndroid Build Coastguard Worker      return (prim * mesa_vertices_per_prim(mode)) + vert;
199*61046927SAndroid Build Coastguard Worker
200*61046927SAndroid Build Coastguard Worker   case MESA_PRIM_LINE_LOOP:
201*61046927SAndroid Build Coastguard Worker      return libagx_vertex_id_for_line_loop(prim, vert, num_prims);
202*61046927SAndroid Build Coastguard Worker
203*61046927SAndroid Build Coastguard Worker   case MESA_PRIM_LINE_STRIP:
204*61046927SAndroid Build Coastguard Worker   case MESA_PRIM_LINE_STRIP_ADJACENCY:
205*61046927SAndroid Build Coastguard Worker      /* (i, i + 1) or (i, ..., i + 3) */
206*61046927SAndroid Build Coastguard Worker      return prim + vert;
207*61046927SAndroid Build Coastguard Worker
208*61046927SAndroid Build Coastguard Worker   case MESA_PRIM_TRIANGLE_STRIP: {
209*61046927SAndroid Build Coastguard Worker      /* Order depends on the provoking vert.
210*61046927SAndroid Build Coastguard Worker       *
211*61046927SAndroid Build Coastguard Worker       * First: (0, 1, 2), (1, 3, 2), (2, 3, 4).
212*61046927SAndroid Build Coastguard Worker       * Last:  (0, 1, 2), (2, 1, 3), (2, 3, 4).
213*61046927SAndroid Build Coastguard Worker       *
214*61046927SAndroid Build Coastguard Worker       * Pull the (maybe swapped) vert from the corresponding primitive
215*61046927SAndroid Build Coastguard Worker       */
216*61046927SAndroid Build Coastguard Worker      return prim + libagx_map_vertex_in_tri_strip(prim, vert, flatshade_first);
217*61046927SAndroid Build Coastguard Worker   }
218*61046927SAndroid Build Coastguard Worker
219*61046927SAndroid Build Coastguard Worker   case MESA_PRIM_TRIANGLE_FAN:
220*61046927SAndroid Build Coastguard Worker      return libagx_vertex_id_for_tri_fan(prim, vert, flatshade_first);
221*61046927SAndroid Build Coastguard Worker
222*61046927SAndroid Build Coastguard Worker   case MESA_PRIM_TRIANGLE_STRIP_ADJACENCY:
223*61046927SAndroid Build Coastguard Worker      return libagx_vertex_id_for_tri_strip_adj(prim, vert, num_prims,
224*61046927SAndroid Build Coastguard Worker                                                flatshade_first);
225*61046927SAndroid Build Coastguard Worker
226*61046927SAndroid Build Coastguard Worker   default:
227*61046927SAndroid Build Coastguard Worker      return 0;
228*61046927SAndroid Build Coastguard Worker   }
229*61046927SAndroid Build Coastguard Worker}
230*61046927SAndroid Build Coastguard Worker
231*61046927SAndroid Build Coastguard Workeruint
232*61046927SAndroid Build Coastguard Workerlibagx_load_index_buffer_internal(uintptr_t index_buffer,
233*61046927SAndroid Build Coastguard Worker                                  uint32_t index_buffer_range_el, uint id,
234*61046927SAndroid Build Coastguard Worker                                  uint index_size)
235*61046927SAndroid Build Coastguard Worker{
236*61046927SAndroid Build Coastguard Worker   bool oob = id >= index_buffer_range_el;
237*61046927SAndroid Build Coastguard Worker
238*61046927SAndroid Build Coastguard Worker   /* If the load would be out-of-bounds, load the first element which is
239*61046927SAndroid Build Coastguard Worker    * assumed valid. If the application index buffer is empty with robustness2,
240*61046927SAndroid Build Coastguard Worker    * index_buffer will point to a zero sink where only the first is valid.
241*61046927SAndroid Build Coastguard Worker    */
242*61046927SAndroid Build Coastguard Worker   if (oob) {
243*61046927SAndroid Build Coastguard Worker      id = 0;
244*61046927SAndroid Build Coastguard Worker   }
245*61046927SAndroid Build Coastguard Worker
246*61046927SAndroid Build Coastguard Worker   uint el;
247*61046927SAndroid Build Coastguard Worker   if (index_size == 1) {
248*61046927SAndroid Build Coastguard Worker      el = ((constant uint8_t *)index_buffer)[id];
249*61046927SAndroid Build Coastguard Worker   } else if (index_size == 2) {
250*61046927SAndroid Build Coastguard Worker      el = ((constant uint16_t *)index_buffer)[id];
251*61046927SAndroid Build Coastguard Worker   } else {
252*61046927SAndroid Build Coastguard Worker      el = ((constant uint32_t *)index_buffer)[id];
253*61046927SAndroid Build Coastguard Worker   }
254*61046927SAndroid Build Coastguard Worker
255*61046927SAndroid Build Coastguard Worker   /* D3D robustness semantics. TODO: Optimize? */
256*61046927SAndroid Build Coastguard Worker   if (oob) {
257*61046927SAndroid Build Coastguard Worker      el = 0;
258*61046927SAndroid Build Coastguard Worker   }
259*61046927SAndroid Build Coastguard Worker
260*61046927SAndroid Build Coastguard Worker   return el;
261*61046927SAndroid Build Coastguard Worker}
262*61046927SAndroid Build Coastguard Worker
263*61046927SAndroid Build Coastguard Workeruint
264*61046927SAndroid Build Coastguard Workerlibagx_load_index_buffer(constant struct agx_ia_state *p, uint id,
265*61046927SAndroid Build Coastguard Worker                         uint index_size)
266*61046927SAndroid Build Coastguard Worker{
267*61046927SAndroid Build Coastguard Worker   return libagx_load_index_buffer_internal(
268*61046927SAndroid Build Coastguard Worker      p->index_buffer, p->index_buffer_range_el, id, index_size);
269*61046927SAndroid Build Coastguard Worker}
270*61046927SAndroid Build Coastguard Worker
271*61046927SAndroid Build Coastguard Worker/*
272*61046927SAndroid Build Coastguard Worker * Return the ID of the first thread in the workgroup where cond is true, or
273*61046927SAndroid Build Coastguard Worker * 1024 if cond is false across the workgroup.
274*61046927SAndroid Build Coastguard Worker */
275*61046927SAndroid Build Coastguard Workerstatic uint
276*61046927SAndroid Build Coastguard Workerfirst_true_thread_in_workgroup(bool cond, local uint *scratch)
277*61046927SAndroid Build Coastguard Worker{
278*61046927SAndroid Build Coastguard Worker   barrier(CLK_LOCAL_MEM_FENCE);
279*61046927SAndroid Build Coastguard Worker   scratch[get_sub_group_id()] = ballot(cond);
280*61046927SAndroid Build Coastguard Worker   barrier(CLK_LOCAL_MEM_FENCE);
281*61046927SAndroid Build Coastguard Worker
282*61046927SAndroid Build Coastguard Worker   uint first_group = ctz(ballot(scratch[get_sub_group_local_id()]));
283*61046927SAndroid Build Coastguard Worker   uint off = ctz(first_group < 32 ? scratch[first_group] : 0);
284*61046927SAndroid Build Coastguard Worker   return (first_group * 32) + off;
285*61046927SAndroid Build Coastguard Worker}
286*61046927SAndroid Build Coastguard Worker
287*61046927SAndroid Build Coastguard Worker/*
288*61046927SAndroid Build Coastguard Worker * Allocate memory from the heap (thread-safe). Returns the offset into the
289*61046927SAndroid Build Coastguard Worker * heap. The allocation will be word-aligned.
290*61046927SAndroid Build Coastguard Worker */
291*61046927SAndroid Build Coastguard Workerstatic inline uint
292*61046927SAndroid Build Coastguard Workerlibagx_atomic_alloc(global struct agx_geometry_state *heap, uint size_B)
293*61046927SAndroid Build Coastguard Worker{
294*61046927SAndroid Build Coastguard Worker   return atomic_fetch_add((volatile atomic_uint *)(&heap->heap_bottom),
295*61046927SAndroid Build Coastguard Worker                           align(size_B, 8));
296*61046927SAndroid Build Coastguard Worker}
297*61046927SAndroid Build Coastguard Worker
298*61046927SAndroid Build Coastguard Worker/*
299*61046927SAndroid Build Coastguard Worker * When unrolling the index buffer for a draw, we translate the old indirect
300*61046927SAndroid Build Coastguard Worker * draws to new indirect draws. This routine allocates the new index buffer and
301*61046927SAndroid Build Coastguard Worker * sets up most of the new draw descriptor.
302*61046927SAndroid Build Coastguard Worker */
303*61046927SAndroid Build Coastguard Workerstatic global void *
304*61046927SAndroid Build Coastguard Workersetup_unroll_for_draw(global struct agx_restart_unroll_params *p,
305*61046927SAndroid Build Coastguard Worker                      constant uint *in_draw, uint draw, enum mesa_prim mode,
306*61046927SAndroid Build Coastguard Worker                      uint index_size_B)
307*61046927SAndroid Build Coastguard Worker{
308*61046927SAndroid Build Coastguard Worker   /* Determine an upper bound on the memory required for the index buffer.
309*61046927SAndroid Build Coastguard Worker    * Restarts only decrease the unrolled index buffer size, so the maximum size
310*61046927SAndroid Build Coastguard Worker    * is the unrolled size when the input has no restarts.
311*61046927SAndroid Build Coastguard Worker    */
312*61046927SAndroid Build Coastguard Worker   uint max_prims = u_decomposed_prims_for_vertices(mode, in_draw[0]);
313*61046927SAndroid Build Coastguard Worker   uint max_verts = max_prims * mesa_vertices_per_prim(mode);
314*61046927SAndroid Build Coastguard Worker   uint alloc_size = max_verts * index_size_B;
315*61046927SAndroid Build Coastguard Worker
316*61046927SAndroid Build Coastguard Worker   /* Allocate unrolled index buffer. Atomic since multiple threads may be
317*61046927SAndroid Build Coastguard Worker    * running to handle multidraw in parallel.
318*61046927SAndroid Build Coastguard Worker    */
319*61046927SAndroid Build Coastguard Worker   global struct agx_geometry_state *heap = p->heap;
320*61046927SAndroid Build Coastguard Worker   uint old_heap_bottom_B = libagx_atomic_alloc(p->heap, alloc_size);
321*61046927SAndroid Build Coastguard Worker
322*61046927SAndroid Build Coastguard Worker   /* Regardless of the input stride, we use tightly packed output draws */
323*61046927SAndroid Build Coastguard Worker   global uint *out = &p->out_draws[5 * draw];
324*61046927SAndroid Build Coastguard Worker
325*61046927SAndroid Build Coastguard Worker   /* Setup most of the descriptor. Count will be determined after unroll. */
326*61046927SAndroid Build Coastguard Worker   out[1] = in_draw[1];                       /* instance count */
327*61046927SAndroid Build Coastguard Worker   out[2] = old_heap_bottom_B / index_size_B; /* index offset */
328*61046927SAndroid Build Coastguard Worker   out[3] = in_draw[3];                       /* index bias */
329*61046927SAndroid Build Coastguard Worker   out[4] = in_draw[4];                       /* base instance */
330*61046927SAndroid Build Coastguard Worker
331*61046927SAndroid Build Coastguard Worker   /* Return the index buffer we allocated */
332*61046927SAndroid Build Coastguard Worker   return (global uchar *)heap->heap + old_heap_bottom_B;
333*61046927SAndroid Build Coastguard Worker}
334*61046927SAndroid Build Coastguard Worker
335*61046927SAndroid Build Coastguard Worker#define UNROLL(INDEX, suffix)                                                  \
336*61046927SAndroid Build Coastguard Worker   kernel void libagx_unroll_restart_##suffix(                                 \
337*61046927SAndroid Build Coastguard Worker      global struct agx_restart_unroll_params *p, enum mesa_prim mode,         \
338*61046927SAndroid Build Coastguard Worker      uint draw, uint tid)                                                     \
339*61046927SAndroid Build Coastguard Worker   {                                                                           \
340*61046927SAndroid Build Coastguard Worker      /* For an indirect multidraw, we are dispatched maxDraws times and       \
341*61046927SAndroid Build Coastguard Worker       * terminate trailing invocations.                                       \
342*61046927SAndroid Build Coastguard Worker       */                                                                      \
343*61046927SAndroid Build Coastguard Worker      if (p->count && draw >= *(p->count))                                     \
344*61046927SAndroid Build Coastguard Worker         return;                                                               \
345*61046927SAndroid Build Coastguard Worker                                                                               \
346*61046927SAndroid Build Coastguard Worker      constant uint *in_draw =                                                 \
347*61046927SAndroid Build Coastguard Worker         (constant uint *)(p->draws + (draw * p->draw_stride));                \
348*61046927SAndroid Build Coastguard Worker                                                                               \
349*61046927SAndroid Build Coastguard Worker      uint count = in_draw[0];                                                 \
350*61046927SAndroid Build Coastguard Worker                                                                               \
351*61046927SAndroid Build Coastguard Worker      local uintptr_t out_ptr, in_ptr;                                         \
352*61046927SAndroid Build Coastguard Worker      if (tid == 0) {                                                          \
353*61046927SAndroid Build Coastguard Worker         out_ptr = (uintptr_t)setup_unroll_for_draw(p, in_draw, draw, mode,    \
354*61046927SAndroid Build Coastguard Worker                                                    sizeof(INDEX));            \
355*61046927SAndroid Build Coastguard Worker                                                                               \
356*61046927SAndroid Build Coastguard Worker         /* Accessed thru local mem because NIR deref is too aggressive */     \
357*61046927SAndroid Build Coastguard Worker         in_ptr = (uintptr_t)(libagx_index_buffer(                             \
358*61046927SAndroid Build Coastguard Worker            p->index_buffer, p->index_buffer_size_el, in_draw[2],              \
359*61046927SAndroid Build Coastguard Worker            sizeof(INDEX), p->zero_sink));                                     \
360*61046927SAndroid Build Coastguard Worker      }                                                                        \
361*61046927SAndroid Build Coastguard Worker                                                                               \
362*61046927SAndroid Build Coastguard Worker      barrier(CLK_LOCAL_MEM_FENCE);                                            \
363*61046927SAndroid Build Coastguard Worker      global INDEX *out = (global INDEX *)out_ptr;                             \
364*61046927SAndroid Build Coastguard Worker                                                                               \
365*61046927SAndroid Build Coastguard Worker      local uint scratch[32];                                                  \
366*61046927SAndroid Build Coastguard Worker                                                                               \
367*61046927SAndroid Build Coastguard Worker      uint out_prims = 0;                                                      \
368*61046927SAndroid Build Coastguard Worker      INDEX restart_idx = p->restart_index;                                    \
369*61046927SAndroid Build Coastguard Worker      bool flatshade_first = p->flatshade_first;                               \
370*61046927SAndroid Build Coastguard Worker                                                                               \
371*61046927SAndroid Build Coastguard Worker      uint needle = 0;                                                         \
372*61046927SAndroid Build Coastguard Worker      uint per_prim = mesa_vertices_per_prim(mode);                            \
373*61046927SAndroid Build Coastguard Worker      while (needle < count) {                                                 \
374*61046927SAndroid Build Coastguard Worker         /* Search for next restart or the end. Lanes load in parallel. */     \
375*61046927SAndroid Build Coastguard Worker         uint next_restart = needle;                                           \
376*61046927SAndroid Build Coastguard Worker         for (;;) {                                                            \
377*61046927SAndroid Build Coastguard Worker            uint idx = next_restart + tid;                                     \
378*61046927SAndroid Build Coastguard Worker            bool restart =                                                     \
379*61046927SAndroid Build Coastguard Worker               idx >= count || libagx_load_index_buffer_internal(              \
380*61046927SAndroid Build Coastguard Worker                                  in_ptr, p->index_buffer_size_el, idx,        \
381*61046927SAndroid Build Coastguard Worker                                  sizeof(INDEX)) == restart_idx;               \
382*61046927SAndroid Build Coastguard Worker                                                                               \
383*61046927SAndroid Build Coastguard Worker            uint next_offs = first_true_thread_in_workgroup(restart, scratch); \
384*61046927SAndroid Build Coastguard Worker                                                                               \
385*61046927SAndroid Build Coastguard Worker            next_restart += next_offs;                                         \
386*61046927SAndroid Build Coastguard Worker            if (next_offs < 1024)                                              \
387*61046927SAndroid Build Coastguard Worker               break;                                                          \
388*61046927SAndroid Build Coastguard Worker         }                                                                     \
389*61046927SAndroid Build Coastguard Worker                                                                               \
390*61046927SAndroid Build Coastguard Worker         /* Emit up to the next restart. Lanes output in parallel */           \
391*61046927SAndroid Build Coastguard Worker         uint subcount = next_restart - needle;                                \
392*61046927SAndroid Build Coastguard Worker         uint subprims = u_decomposed_prims_for_vertices(mode, subcount);      \
393*61046927SAndroid Build Coastguard Worker         uint out_prims_base = out_prims;                                      \
394*61046927SAndroid Build Coastguard Worker         for (uint i = tid; i < subprims; i += 1024) {                         \
395*61046927SAndroid Build Coastguard Worker            for (uint vtx = 0; vtx < per_prim; ++vtx) {                        \
396*61046927SAndroid Build Coastguard Worker               uint id = libagx_vertex_id_for_topology(mode, flatshade_first,  \
397*61046927SAndroid Build Coastguard Worker                                                       i, vtx, subprims);      \
398*61046927SAndroid Build Coastguard Worker               uint offset = needle + id;                                      \
399*61046927SAndroid Build Coastguard Worker                                                                               \
400*61046927SAndroid Build Coastguard Worker               out[((out_prims_base + i) * per_prim) + vtx] =                  \
401*61046927SAndroid Build Coastguard Worker                  libagx_load_index_buffer_internal(                           \
402*61046927SAndroid Build Coastguard Worker                     in_ptr, p->index_buffer_size_el, offset, sizeof(INDEX));  \
403*61046927SAndroid Build Coastguard Worker            }                                                                  \
404*61046927SAndroid Build Coastguard Worker         }                                                                     \
405*61046927SAndroid Build Coastguard Worker                                                                               \
406*61046927SAndroid Build Coastguard Worker         out_prims += subprims;                                                \
407*61046927SAndroid Build Coastguard Worker         needle = next_restart + 1;                                            \
408*61046927SAndroid Build Coastguard Worker      }                                                                        \
409*61046927SAndroid Build Coastguard Worker                                                                               \
410*61046927SAndroid Build Coastguard Worker      if (tid == 0)                                                            \
411*61046927SAndroid Build Coastguard Worker         p->out_draws[(5 * draw) + 0] = out_prims * per_prim;                  \
412*61046927SAndroid Build Coastguard Worker   }
413*61046927SAndroid Build Coastguard Worker
414*61046927SAndroid Build Coastguard WorkerUNROLL(uchar, u8)
415*61046927SAndroid Build Coastguard WorkerUNROLL(ushort, u16)
416*61046927SAndroid Build Coastguard WorkerUNROLL(uint, u32)
417*61046927SAndroid Build Coastguard Worker
418*61046927SAndroid Build Coastguard Workeruint
419*61046927SAndroid Build Coastguard Workerlibagx_setup_xfb_buffer(global struct agx_geometry_params *p, uint i)
420*61046927SAndroid Build Coastguard Worker{
421*61046927SAndroid Build Coastguard Worker   global uint *off_ptr = p->xfb_offs_ptrs[i];
422*61046927SAndroid Build Coastguard Worker   if (!off_ptr)
423*61046927SAndroid Build Coastguard Worker      return 0;
424*61046927SAndroid Build Coastguard Worker
425*61046927SAndroid Build Coastguard Worker   uint off = *off_ptr;
426*61046927SAndroid Build Coastguard Worker   p->xfb_base[i] = p->xfb_base_original[i] + off;
427*61046927SAndroid Build Coastguard Worker   return off;
428*61046927SAndroid Build Coastguard Worker}
429*61046927SAndroid Build Coastguard Worker
430*61046927SAndroid Build Coastguard Worker/*
431*61046927SAndroid Build Coastguard Worker * Translate EndPrimitive for LINE_STRIP or TRIANGLE_STRIP output prims into
432*61046927SAndroid Build Coastguard Worker * writes into the 32-bit output index buffer. We write the sequence (b, b + 1,
433*61046927SAndroid Build Coastguard Worker * b + 2, ..., b + n - 1, -1), where b (base) is the first vertex in the prim, n
434*61046927SAndroid Build Coastguard Worker * (count) is the number of verts in the prims, and -1 is the prim restart index
435*61046927SAndroid Build Coastguard Worker * used to signal the end of the prim.
436*61046927SAndroid Build Coastguard Worker *
437*61046927SAndroid Build Coastguard Worker * For points, we write index buffers without restart, just as a sideband to
438*61046927SAndroid Build Coastguard Worker * pass data into the vertex shader.
439*61046927SAndroid Build Coastguard Worker */
440*61046927SAndroid Build Coastguard Workervoid
441*61046927SAndroid Build Coastguard Workerlibagx_end_primitive(global int *index_buffer, uint total_verts,
442*61046927SAndroid Build Coastguard Worker                     uint verts_in_prim, uint total_prims,
443*61046927SAndroid Build Coastguard Worker                     uint invocation_vertex_base, uint invocation_prim_base,
444*61046927SAndroid Build Coastguard Worker                     uint geometry_base, bool restart)
445*61046927SAndroid Build Coastguard Worker{
446*61046927SAndroid Build Coastguard Worker   /* Previous verts/prims are from previous invocations plus earlier
447*61046927SAndroid Build Coastguard Worker    * prims in this invocation. For the intra-invocation counts, we
448*61046927SAndroid Build Coastguard Worker    * subtract the count for this prim from the inclusive sum NIR gives us.
449*61046927SAndroid Build Coastguard Worker    */
450*61046927SAndroid Build Coastguard Worker   uint previous_verts_in_invoc = (total_verts - verts_in_prim);
451*61046927SAndroid Build Coastguard Worker   uint previous_verts = invocation_vertex_base + previous_verts_in_invoc;
452*61046927SAndroid Build Coastguard Worker   uint previous_prims = restart ? invocation_prim_base + (total_prims - 1) : 0;
453*61046927SAndroid Build Coastguard Worker
454*61046927SAndroid Build Coastguard Worker   /* The indices are encoded as: (unrolled ID * output vertices) + vertex. */
455*61046927SAndroid Build Coastguard Worker   uint index_base = geometry_base + previous_verts_in_invoc;
456*61046927SAndroid Build Coastguard Worker
457*61046927SAndroid Build Coastguard Worker   /* Index buffer contains 1 index for each vertex and 1 for each prim */
458*61046927SAndroid Build Coastguard Worker   global int *out = &index_buffer[previous_verts + previous_prims];
459*61046927SAndroid Build Coastguard Worker
460*61046927SAndroid Build Coastguard Worker   /* Write out indices for the strip */
461*61046927SAndroid Build Coastguard Worker   for (uint i = 0; i < verts_in_prim; ++i) {
462*61046927SAndroid Build Coastguard Worker      out[i] = index_base + i;
463*61046927SAndroid Build Coastguard Worker   }
464*61046927SAndroid Build Coastguard Worker
465*61046927SAndroid Build Coastguard Worker   if (restart)
466*61046927SAndroid Build Coastguard Worker      out[verts_in_prim] = -1;
467*61046927SAndroid Build Coastguard Worker}
468*61046927SAndroid Build Coastguard Worker
469*61046927SAndroid Build Coastguard Workervoid
470*61046927SAndroid Build Coastguard Workerlibagx_build_gs_draw(global struct agx_geometry_params *p, uint vertices,
471*61046927SAndroid Build Coastguard Worker                     uint primitives)
472*61046927SAndroid Build Coastguard Worker{
473*61046927SAndroid Build Coastguard Worker   global uint *descriptor = p->indirect_desc;
474*61046927SAndroid Build Coastguard Worker   global struct agx_geometry_state *state = p->state;
475*61046927SAndroid Build Coastguard Worker
476*61046927SAndroid Build Coastguard Worker   /* Setup the indirect draw descriptor */
477*61046927SAndroid Build Coastguard Worker   uint indices = vertices + primitives; /* includes restart indices */
478*61046927SAndroid Build Coastguard Worker
479*61046927SAndroid Build Coastguard Worker   /* Allocate the index buffer */
480*61046927SAndroid Build Coastguard Worker   uint index_buffer_offset_B = state->heap_bottom;
481*61046927SAndroid Build Coastguard Worker   p->output_index_buffer =
482*61046927SAndroid Build Coastguard Worker      (global uint *)(state->heap + index_buffer_offset_B);
483*61046927SAndroid Build Coastguard Worker   state->heap_bottom += (indices * 4);
484*61046927SAndroid Build Coastguard Worker
485*61046927SAndroid Build Coastguard Worker   descriptor[0] = indices;                   /* count */
486*61046927SAndroid Build Coastguard Worker   descriptor[1] = 1;                         /* instance count */
487*61046927SAndroid Build Coastguard Worker   descriptor[2] = index_buffer_offset_B / 4; /* start */
488*61046927SAndroid Build Coastguard Worker   descriptor[3] = 0;                         /* index bias */
489*61046927SAndroid Build Coastguard Worker   descriptor[4] = 0;                         /* start instance */
490*61046927SAndroid Build Coastguard Worker
491*61046927SAndroid Build Coastguard Worker   if (state->heap_bottom > state->heap_size) {
492*61046927SAndroid Build Coastguard Worker      global uint *foo = (global uint *)(uintptr_t)0xdeadbeef;
493*61046927SAndroid Build Coastguard Worker      *foo = 0x1234;
494*61046927SAndroid Build Coastguard Worker   }
495*61046927SAndroid Build Coastguard Worker}
496*61046927SAndroid Build Coastguard Worker
497*61046927SAndroid Build Coastguard Workervoid
498*61046927SAndroid Build Coastguard Workerlibagx_gs_setup_indirect(global struct agx_gs_setup_indirect_params *gsi,
499*61046927SAndroid Build Coastguard Worker                         enum mesa_prim mode, uint local_id)
500*61046927SAndroid Build Coastguard Worker{
501*61046927SAndroid Build Coastguard Worker   global struct agx_geometry_params *p = gsi->geom;
502*61046927SAndroid Build Coastguard Worker   global struct agx_ia_state *ia = gsi->ia;
503*61046927SAndroid Build Coastguard Worker
504*61046927SAndroid Build Coastguard Worker   /* Determine the (primitives, instances) grid size. */
505*61046927SAndroid Build Coastguard Worker   uint vertex_count = gsi->draw[0];
506*61046927SAndroid Build Coastguard Worker   uint instance_count = gsi->draw[1];
507*61046927SAndroid Build Coastguard Worker
508*61046927SAndroid Build Coastguard Worker   ia->verts_per_instance = vertex_count;
509*61046927SAndroid Build Coastguard Worker
510*61046927SAndroid Build Coastguard Worker   /* Calculate number of primitives input into the GS */
511*61046927SAndroid Build Coastguard Worker   uint prim_per_instance = u_decomposed_prims_for_vertices(mode, vertex_count);
512*61046927SAndroid Build Coastguard Worker   p->input_primitives = prim_per_instance * instance_count;
513*61046927SAndroid Build Coastguard Worker
514*61046927SAndroid Build Coastguard Worker   /* Invoke VS as (vertices, instances); GS as (primitives, instances) */
515*61046927SAndroid Build Coastguard Worker   p->vs_grid[0] = vertex_count;
516*61046927SAndroid Build Coastguard Worker   p->vs_grid[1] = instance_count;
517*61046927SAndroid Build Coastguard Worker
518*61046927SAndroid Build Coastguard Worker   p->gs_grid[0] = prim_per_instance;
519*61046927SAndroid Build Coastguard Worker   p->gs_grid[1] = instance_count;
520*61046927SAndroid Build Coastguard Worker
521*61046927SAndroid Build Coastguard Worker   p->primitives_log2 = util_logbase2_ceil(prim_per_instance);
522*61046927SAndroid Build Coastguard Worker
523*61046927SAndroid Build Coastguard Worker   /* If indexing is enabled, the third word is the offset into the index buffer
524*61046927SAndroid Build Coastguard Worker    * in elements. Apply that offset now that we have it. For a hardware
525*61046927SAndroid Build Coastguard Worker    * indirect draw, the hardware would do this for us, but for software input
526*61046927SAndroid Build Coastguard Worker    * assembly we need to do it ourselves.
527*61046927SAndroid Build Coastguard Worker    */
528*61046927SAndroid Build Coastguard Worker   if (gsi->index_size_B) {
529*61046927SAndroid Build Coastguard Worker      ia->index_buffer =
530*61046927SAndroid Build Coastguard Worker         libagx_index_buffer(gsi->index_buffer, gsi->index_buffer_range_el,
531*61046927SAndroid Build Coastguard Worker                             gsi->draw[2], gsi->index_size_B, gsi->zero_sink);
532*61046927SAndroid Build Coastguard Worker
533*61046927SAndroid Build Coastguard Worker      ia->index_buffer_range_el =
534*61046927SAndroid Build Coastguard Worker         libagx_index_buffer_range_el(gsi->index_buffer_range_el, gsi->draw[2]);
535*61046927SAndroid Build Coastguard Worker   }
536*61046927SAndroid Build Coastguard Worker
537*61046927SAndroid Build Coastguard Worker   /* We need to allocate VS and GS count buffers, do so now */
538*61046927SAndroid Build Coastguard Worker   global struct agx_geometry_state *state = p->state;
539*61046927SAndroid Build Coastguard Worker
540*61046927SAndroid Build Coastguard Worker   uint vertex_buffer_size =
541*61046927SAndroid Build Coastguard Worker      libagx_tcs_in_size(vertex_count * instance_count, gsi->vs_outputs);
542*61046927SAndroid Build Coastguard Worker
543*61046927SAndroid Build Coastguard Worker   p->count_buffer = (global uint *)(state->heap + state->heap_bottom);
544*61046927SAndroid Build Coastguard Worker   state->heap_bottom +=
545*61046927SAndroid Build Coastguard Worker      align(p->input_primitives * p->count_buffer_stride, 16);
546*61046927SAndroid Build Coastguard Worker
547*61046927SAndroid Build Coastguard Worker   p->input_buffer = (uintptr_t)(state->heap + state->heap_bottom);
548*61046927SAndroid Build Coastguard Worker   *(gsi->vertex_buffer) = p->input_buffer;
549*61046927SAndroid Build Coastguard Worker   state->heap_bottom += align(vertex_buffer_size, 4);
550*61046927SAndroid Build Coastguard Worker
551*61046927SAndroid Build Coastguard Worker   p->input_mask = gsi->vs_outputs;
552*61046927SAndroid Build Coastguard Worker
553*61046927SAndroid Build Coastguard Worker   if (state->heap_bottom > state->heap_size) {
554*61046927SAndroid Build Coastguard Worker      global uint *foo = (global uint *)(uintptr_t)0x1deadbeef;
555*61046927SAndroid Build Coastguard Worker      *foo = 0x1234;
556*61046927SAndroid Build Coastguard Worker   }
557*61046927SAndroid Build Coastguard Worker}
558*61046927SAndroid Build Coastguard Worker
559*61046927SAndroid Build Coastguard Worker/*
560*61046927SAndroid Build Coastguard Worker * Returns (work_group_scan_inclusive_add(x), work_group_sum(x)). Implemented
561*61046927SAndroid Build Coastguard Worker * manually with subgroup ops and local memory since Mesa doesn't do those
562*61046927SAndroid Build Coastguard Worker * lowerings yet.
563*61046927SAndroid Build Coastguard Worker */
564*61046927SAndroid Build Coastguard Workerstatic uint2
565*61046927SAndroid Build Coastguard Workerlibagx_work_group_scan_inclusive_add(uint x, local uint *scratch)
566*61046927SAndroid Build Coastguard Worker{
567*61046927SAndroid Build Coastguard Worker   uint sg_id = get_sub_group_id();
568*61046927SAndroid Build Coastguard Worker
569*61046927SAndroid Build Coastguard Worker   /* Partial prefix sum of the subgroup */
570*61046927SAndroid Build Coastguard Worker   uint sg = sub_group_scan_inclusive_add(x);
571*61046927SAndroid Build Coastguard Worker
572*61046927SAndroid Build Coastguard Worker   /* Reduction (sum) for the subgroup */
573*61046927SAndroid Build Coastguard Worker   uint sg_sum = sub_group_broadcast(sg, 31);
574*61046927SAndroid Build Coastguard Worker
575*61046927SAndroid Build Coastguard Worker   /* Write out all the subgroups sums */
576*61046927SAndroid Build Coastguard Worker   barrier(CLK_LOCAL_MEM_FENCE);
577*61046927SAndroid Build Coastguard Worker   scratch[sg_id] = sg_sum;
578*61046927SAndroid Build Coastguard Worker   barrier(CLK_LOCAL_MEM_FENCE);
579*61046927SAndroid Build Coastguard Worker
580*61046927SAndroid Build Coastguard Worker   /* Read all the subgroup sums. Thread T in subgroup G reads the sum of all
581*61046927SAndroid Build Coastguard Worker    * threads in subgroup T.
582*61046927SAndroid Build Coastguard Worker    */
583*61046927SAndroid Build Coastguard Worker   uint other_sum = scratch[get_sub_group_local_id()];
584*61046927SAndroid Build Coastguard Worker
585*61046927SAndroid Build Coastguard Worker   /* Exclusive sum the subgroup sums to get the total before the current group,
586*61046927SAndroid Build Coastguard Worker    * which can be added to the total for the current group.
587*61046927SAndroid Build Coastguard Worker    */
588*61046927SAndroid Build Coastguard Worker   uint other_sums = sub_group_scan_exclusive_add(other_sum);
589*61046927SAndroid Build Coastguard Worker   uint base = sub_group_broadcast(other_sums, sg_id);
590*61046927SAndroid Build Coastguard Worker   uint prefix = base + sg;
591*61046927SAndroid Build Coastguard Worker
592*61046927SAndroid Build Coastguard Worker   /* Reduce the workgroup using the prefix sum we already did */
593*61046927SAndroid Build Coastguard Worker   uint reduction = sub_group_broadcast(other_sums + other_sum, 31);
594*61046927SAndroid Build Coastguard Worker
595*61046927SAndroid Build Coastguard Worker   return (uint2)(prefix, reduction);
596*61046927SAndroid Build Coastguard Worker}
597*61046927SAndroid Build Coastguard Worker
598*61046927SAndroid Build Coastguard Workerkernel void
599*61046927SAndroid Build Coastguard Workerlibagx_prefix_sum(global uint *buffer, uint len, uint words, uint word)
600*61046927SAndroid Build Coastguard Worker{
601*61046927SAndroid Build Coastguard Worker   local uint scratch[32];
602*61046927SAndroid Build Coastguard Worker   uint tid = get_local_id(0);
603*61046927SAndroid Build Coastguard Worker
604*61046927SAndroid Build Coastguard Worker   /* Main loop: complete workgroups processing 1024 values at once */
605*61046927SAndroid Build Coastguard Worker   uint i, count = 0;
606*61046927SAndroid Build Coastguard Worker   uint len_remainder = len % 1024;
607*61046927SAndroid Build Coastguard Worker   uint len_rounded_down = len - len_remainder;
608*61046927SAndroid Build Coastguard Worker
609*61046927SAndroid Build Coastguard Worker   for (i = tid; i < len_rounded_down; i += 1024) {
610*61046927SAndroid Build Coastguard Worker      global uint *ptr = &buffer[(i * words) + word];
611*61046927SAndroid Build Coastguard Worker      uint value = *ptr;
612*61046927SAndroid Build Coastguard Worker      uint2 sums = libagx_work_group_scan_inclusive_add(value, scratch);
613*61046927SAndroid Build Coastguard Worker
614*61046927SAndroid Build Coastguard Worker      *ptr = count + sums[0];
615*61046927SAndroid Build Coastguard Worker      count += sums[1];
616*61046927SAndroid Build Coastguard Worker   }
617*61046927SAndroid Build Coastguard Worker
618*61046927SAndroid Build Coastguard Worker   /* The last iteration is special since we won't have a full subgroup unless
619*61046927SAndroid Build Coastguard Worker    * the length is divisible by the subgroup size, and we don't advance count.
620*61046927SAndroid Build Coastguard Worker    */
621*61046927SAndroid Build Coastguard Worker   global uint *ptr = &buffer[(i * words) + word];
622*61046927SAndroid Build Coastguard Worker   uint value = (tid < len_remainder) ? *ptr : 0;
623*61046927SAndroid Build Coastguard Worker   uint scan = libagx_work_group_scan_inclusive_add(value, scratch)[0];
624*61046927SAndroid Build Coastguard Worker
625*61046927SAndroid Build Coastguard Worker   if (tid < len_remainder) {
626*61046927SAndroid Build Coastguard Worker      *ptr = count + scan;
627*61046927SAndroid Build Coastguard Worker   }
628*61046927SAndroid Build Coastguard Worker}
629*61046927SAndroid Build Coastguard Worker
630*61046927SAndroid Build Coastguard Workerkernel void
631*61046927SAndroid Build Coastguard Workerlibagx_prefix_sum_tess(global struct libagx_tess_args *p)
632*61046927SAndroid Build Coastguard Worker{
633*61046927SAndroid Build Coastguard Worker   libagx_prefix_sum(p->counts, p->nr_patches, 1 /* words */, 0 /* word */);
634*61046927SAndroid Build Coastguard Worker
635*61046927SAndroid Build Coastguard Worker   /* After prefix summing, we know the total # of indices, so allocate the
636*61046927SAndroid Build Coastguard Worker    * index buffer now. Elect a thread for the allocation.
637*61046927SAndroid Build Coastguard Worker    */
638*61046927SAndroid Build Coastguard Worker   barrier(CLK_LOCAL_MEM_FENCE);
639*61046927SAndroid Build Coastguard Worker   if (get_local_id(0) != 0)
640*61046927SAndroid Build Coastguard Worker      return;
641*61046927SAndroid Build Coastguard Worker
642*61046927SAndroid Build Coastguard Worker   /* The last element of an inclusive prefix sum is the total sum */
643*61046927SAndroid Build Coastguard Worker   uint total = p->counts[p->nr_patches - 1];
644*61046927SAndroid Build Coastguard Worker
645*61046927SAndroid Build Coastguard Worker   /* Allocate 4-byte indices */
646*61046927SAndroid Build Coastguard Worker   uint32_t elsize_B = sizeof(uint32_t);
647*61046927SAndroid Build Coastguard Worker   uint32_t size_B = total * elsize_B;
648*61046927SAndroid Build Coastguard Worker   uint alloc_B = p->heap->heap_bottom;
649*61046927SAndroid Build Coastguard Worker   p->heap->heap_bottom += size_B;
650*61046927SAndroid Build Coastguard Worker   p->heap->heap_bottom = align(p->heap->heap_bottom, 8);
651*61046927SAndroid Build Coastguard Worker
652*61046927SAndroid Build Coastguard Worker   p->index_buffer = (global uint32_t *)(((uintptr_t)p->heap->heap) + alloc_B);
653*61046927SAndroid Build Coastguard Worker
654*61046927SAndroid Build Coastguard Worker   /* ...and now we can generate the API indexed draw */
655*61046927SAndroid Build Coastguard Worker   global uint32_t *desc = p->out_draws;
656*61046927SAndroid Build Coastguard Worker
657*61046927SAndroid Build Coastguard Worker   desc[0] = total;              /* count */
658*61046927SAndroid Build Coastguard Worker   desc[1] = 1;                  /* instance_count */
659*61046927SAndroid Build Coastguard Worker   desc[2] = alloc_B / elsize_B; /* start */
660*61046927SAndroid Build Coastguard Worker   desc[3] = 0;                  /* index_bias */
661*61046927SAndroid Build Coastguard Worker   desc[4] = 0;                  /* start_instance */
662*61046927SAndroid Build Coastguard Worker}
663*61046927SAndroid Build Coastguard Worker
664*61046927SAndroid Build Coastguard Workeruintptr_t
665*61046927SAndroid Build Coastguard Workerlibagx_vertex_output_address(uintptr_t buffer, uint64_t mask, uint vtx,
666*61046927SAndroid Build Coastguard Worker                             gl_varying_slot location)
667*61046927SAndroid Build Coastguard Worker{
668*61046927SAndroid Build Coastguard Worker   return buffer + libagx_tcs_in_offs(vtx, location, mask);
669*61046927SAndroid Build Coastguard Worker}
670*61046927SAndroid Build Coastguard Worker
671*61046927SAndroid Build Coastguard Workeruintptr_t
672*61046927SAndroid Build Coastguard Workerlibagx_geometry_input_address(constant struct agx_geometry_params *p, uint vtx,
673*61046927SAndroid Build Coastguard Worker                              gl_varying_slot location)
674*61046927SAndroid Build Coastguard Worker{
675*61046927SAndroid Build Coastguard Worker   return libagx_vertex_output_address(p->input_buffer, p->input_mask, vtx,
676*61046927SAndroid Build Coastguard Worker                                       location);
677*61046927SAndroid Build Coastguard Worker}
678*61046927SAndroid Build Coastguard Worker
679*61046927SAndroid Build Coastguard Workerunsigned
680*61046927SAndroid Build Coastguard Workerlibagx_input_vertices(constant struct agx_ia_state *ia)
681*61046927SAndroid Build Coastguard Worker{
682*61046927SAndroid Build Coastguard Worker   return ia->verts_per_instance;
683*61046927SAndroid Build Coastguard Worker}
684