/* xref: /aosp_15_r20/external/mesa3d/src/gallium/drivers/freedreno/ir3/ir3_gallium.c
 * (revision 6104692788411f58d303aa86923a9ff6ecaded22)
 */
/*
 * Copyright © 2014 Rob Clark <[email protected]>
 * SPDX-License-Identifier: MIT
 *
 * Authors:
 *    Rob Clark <[email protected]>
 */

#include "pipe/p_screen.h"
#include "pipe/p_state.h"
#include "tgsi/tgsi_dump.h"
#include "util/format/u_format.h"
#include "util/u_inlines.h"
#include "util/u_memory.h"
#include "util/u_string.h"

#include "nir/tgsi_to_nir.h"
#include "nir_serialize.h"

#include "freedreno_context.h"
#include "freedreno_util.h"

#include "ir3/ir3_cache.h"
#include "ir3/ir3_compiler.h"
#include "ir3/ir3_descriptor.h"
#include "ir3/ir3_gallium.h"
#include "ir3/ir3_nir.h"
#include "ir3/ir3_shader.h"

/**
 * The hardware cso for shader state
 *
 * Initially just a container for the ir3_shader, but this is where we'll
 * plumb in async compile.
 */
struct ir3_shader_state {
   struct ir3_shader *shader;

   /* Fence signalled when async compile is completed: */
   struct util_queue_fence ready;
};

/**
 * Should initial variants be compiled synchronously?
 *
 * The only case where util_debug_message() is used in the initial-variants
 * path is with FD_MESA_DEBUG=shaderdb.  So if either debug is disabled (ie.
 * debug.debug_message==NULL), or shaderdb stats are not enabled, we can
 * compile the initial shader variant asynchronously.
 */
static bool
initial_variants_synchronous(struct fd_context *ctx)
{
   return unlikely(ctx->debug.debug_message) || FD_DBG(SHADERDB) ||
          FD_DBG(SERIALC);
}

static void
dump_shader_info(struct ir3_shader_variant *v,
                 struct util_debug_callback *debug)
{
   if (!FD_DBG(SHADERDB))
      return;

   util_debug_message(
      debug, SHADER_INFO,
      "%s shader: %u inst, %u nops, %u non-nops, %u mov, %u cov, "
      "%u dwords, %u last-baryf, %u last-helper, %u half, %u full, %u constlen, "
      "%u cat0, %u cat1, %u cat2, %u cat3, %u cat4, %u cat5, %u cat6, %u cat7, "
      "%u stp, %u ldp, %u sstall, %u (ss), %u systall, %u (sy), %d waves, "
      "%d loops\n",
      ir3_shader_stage(v), v->info.instrs_count, v->info.nops_count,
      v->info.instrs_count - v->info.nops_count, v->info.mov_count,
      v->info.cov_count, v->info.sizedwords, v->info.last_baryf,
      v->info.last_helper, v->info.max_half_reg + 1, v->info.max_reg + 1,
      v->constlen,
      v->info.instrs_per_cat[0], v->info.instrs_per_cat[1],
      v->info.instrs_per_cat[2], v->info.instrs_per_cat[3],
      v->info.instrs_per_cat[4], v->info.instrs_per_cat[5],
      v->info.instrs_per_cat[6], v->info.instrs_per_cat[7],
      v->info.stp_count, v->info.ldp_count, v->info.sstall,
      v->info.ss, v->info.systall, v->info.sy, v->info.max_waves, v->loops);
}

static void
upload_shader_variant(struct ir3_shader_variant *v)
{
   struct ir3_compiler *compiler = v->compiler;

   assert(!v->bo);

   v->bo =
      fd_bo_new(compiler->dev, v->info.size, FD_BO_NOMAP,
                "%s:%s", ir3_shader_stage(v), v->name);

   /* Always include shaders in kernel crash dumps. */
   fd_bo_mark_for_dump(v->bo);

   fd_bo_upload(v->bo, v->bin, 0, v->info.size);
}

struct ir3_shader_variant *
ir3_shader_variant(struct ir3_shader *shader, struct ir3_shader_key key,
                   bool binning_pass, struct util_debug_callback *debug)
{
   struct ir3_shader_variant *v;
   bool created = false;

   MESA_TRACE_FUNC();

   /* Some shader key values may not be used by a given ir3_shader (for
    * example, fragment shader saturates in the vertex shader), so clean out
    * those flags to avoid recompiling.
    */
   ir3_key_clear_unused(&key, shader);

   v = ir3_shader_get_variant(shader, &key, binning_pass, false, &created);

   if (created) {
      if (shader->initial_variants_done) {
         perf_debug_message(debug, SHADER_INFO,
                            "%s shader: recompiling at draw time: global "
                            "0x%08x, vfsamples %x/%x, astc %x/%x\n",
                            ir3_shader_stage(v), key.global, key.vsamples,
                            key.fsamples, key.vastc_srgb, key.fastc_srgb);
      }

      dump_shader_info(v, debug);
      upload_shader_variant(v);

      if (v->binning) {
         upload_shader_variant(v->binning);
         dump_shader_info(v->binning, debug);
      }
   }

   return v;
}

static void
copy_stream_out(struct ir3_stream_output_info *i,
                const struct pipe_stream_output_info *p)
{
   STATIC_ASSERT(ARRAY_SIZE(i->stride) == ARRAY_SIZE(p->stride));
   STATIC_ASSERT(ARRAY_SIZE(i->output) == ARRAY_SIZE(p->output));

   i->streams_written = 0;
   i->num_outputs = p->num_outputs;
   for (int n = 0; n < ARRAY_SIZE(i->stride); n++) {
      i->stride[n] = p->stride[n];
      if (p->stride[n])
         i->streams_written |= BIT(n);
   }

   for (int n = 0; n < ARRAY_SIZE(i->output); n++) {
      i->output[n].register_index = p->output[n].register_index;
      i->output[n].start_component = p->output[n].start_component;
      i->output[n].num_components = p->output[n].num_components;
      i->output[n].output_buffer = p->output[n].output_buffer;
      i->output[n].dst_offset = p->output[n].dst_offset;
      i->output[n].stream = p->output[n].stream;
   }
}

static void
create_initial_variants(struct ir3_shader_state *hwcso,
                        struct util_debug_callback *debug)
{
   struct ir3_shader *shader = hwcso->shader;
   struct ir3_compiler *compiler = shader->compiler;
   nir_shader *nir = shader->nir;

   /* Compile standard variants immediately to try to avoid draw-time stalls
    * to run the compiler.
    */
   struct ir3_shader_key key = {
      .tessellation = IR3_TESS_NONE,
      .ucp_enables = MASK(nir->info.clip_distance_array_size),
      .msaa = true,
   };

   switch (nir->info.stage) {
   case MESA_SHADER_TESS_EVAL:
      key.tessellation = ir3_tess_mode(nir->info.tess._primitive_mode);
      break;

   case MESA_SHADER_TESS_CTRL:
      /* The primitive_mode field, while it exists for TCS, is not
       * populated (since separable shaders between TCS/TES are legal,
       * so TCS wouldn't have access to TES's declaration).  Make a
       * guess so that we shader-db something plausible for TCS.
       */
      if (nir->info.outputs_written & VARYING_BIT_TESS_LEVEL_INNER)
         key.tessellation = IR3_TESS_TRIANGLES;
      else
         key.tessellation = IR3_TESS_ISOLINES;
      break;

   case MESA_SHADER_GEOMETRY:
      key.has_gs = true;
      break;

   default:
      break;
   }

   key.safe_constlen = false;
   struct ir3_shader_variant *v = ir3_shader_variant(shader, key, false, debug);
   if (!v)
      return;

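   /* If the default variant's constlen exceeds what is considered safe,
    * draw-time state combinations may require the safe_constlen fallback,
    * so pre-compile that variant as well:
    */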
   if (v->constlen > compiler->max_const_safe) {
      key.safe_constlen = true;
      ir3_shader_variant(shader, key, false, debug);
   }

   /* For vertex shaders, also compile initial binning pass shader: */
   if (nir->info.stage == MESA_SHADER_VERTEX) {
      key.safe_constlen = false;
      v = ir3_shader_variant(shader, key, true, debug);
      if (!v)
         return;

      if (v->constlen > compiler->max_const_safe) {
         key.safe_constlen = true;
         ir3_shader_variant(shader, key, true, debug);
      }
   }

   shader->initial_variants_done = true;
}

static void
create_initial_variants_async(void *job, void *gdata, int thread_index)
{
   struct ir3_shader_state *hwcso = job;
   struct util_debug_callback debug = {};

   MESA_TRACE_FUNC();

   create_initial_variants(hwcso, &debug);
}

static void
create_initial_compute_variants_async(void *job, void *gdata, int thread_index)
{
   struct ir3_shader_state *hwcso = job;
   struct ir3_shader *shader = hwcso->shader;
   struct util_debug_callback debug = {};
   static struct ir3_shader_key key; /* static is implicitly zeroed */

   MESA_TRACE_FUNC();

   ir3_shader_variant(shader, key, false, &debug);
   shader->initial_variants_done = true;
}

/* a bit annoying that compute-shader and normal shader state objects
 * aren't a bit more aligned.
 */
void *
ir3_shader_compute_state_create(struct pipe_context *pctx,
                                const struct pipe_compute_state *cso)
{
   struct fd_context *ctx = fd_context(pctx);

   /* req_input_mem will only be non-zero for cl kernels (ie. clover).
    * This isn't a perfect test because I guess it is possible (but
    * uncommon) for none of the kernel parameters to be a global,
    * but ctx->set_global_bindings() can't fail, so this is the next
    * best place to fail if we need a newer version of kernel driver:
    */
   if ((cso->req_input_mem > 0) &&
       fd_device_version(ctx->dev) < FD_VERSION_BO_IOVA) {
      return NULL;
   }

   struct ir3_compiler *compiler = ctx->screen->compiler;
   nir_shader *nir;

   if (cso->ir_type == PIPE_SHADER_IR_NIR) {
      /* we take ownership of the reference: */
      nir = (nir_shader *)cso->prog;
   } else if (cso->ir_type == PIPE_SHADER_IR_NIR_SERIALIZED) {
      const nir_shader_compiler_options *options =
            ir3_get_compiler_options(compiler);
      const struct pipe_binary_program_header *hdr = cso->prog;
      struct blob_reader reader;

      blob_reader_init(&reader, hdr->blob, hdr->num_bytes);
      nir = nir_deserialize(NULL, options, &reader);

      ir3_finalize_nir(compiler, nir);
   } else {
      assert(cso->ir_type == PIPE_SHADER_IR_TGSI);
      if (ir3_shader_debug & IR3_DBG_DISASM) {
         tgsi_dump(cso->prog, 0);
      }
      nir = tgsi_to_nir(cso->prog, pctx->screen, false);
   }

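   /* The a6xx+ backend uses bindless descriptors, so lower buffer/image
    * access in the NIR to the bindless model before compiling:
    */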
   if (ctx->screen->gen >= 6)
      ir3_nir_lower_io_to_bindless(nir);

   enum ir3_wavesize_option api_wavesize = IR3_SINGLE_OR_DOUBLE;
   enum ir3_wavesize_option real_wavesize = IR3_SINGLE_OR_DOUBLE;

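   /* If the HW cannot run compute at double threadsize, restrict both the
    * API-visible and the actual wavesize to single:
    */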
   if (ctx->screen->gen >= 6 && !ctx->screen->info->a6xx.supports_double_threadsize) {
      api_wavesize = IR3_SINGLE_ONLY;
      real_wavesize = IR3_SINGLE_ONLY;
   }

   struct ir3_shader *shader =
      ir3_shader_from_nir(compiler, nir, &(struct ir3_shader_options){
                              /* TODO: force to single on a6xx with legacy
                               * ballot extension that uses 64-bit masks
                               */
                              .api_wavesize = api_wavesize,
                              .real_wavesize = real_wavesize,
                          }, NULL);
   shader->cs.req_input_mem = align(cso->req_input_mem, 4) / 4;     /* byte->dword */
   shader->cs.req_local_mem = cso->static_shared_mem;

   struct ir3_shader_state *hwcso = calloc(1, sizeof(*hwcso));

   util_queue_fence_init(&hwcso->ready);
   hwcso->shader = shader;

   /* Immediately compile a standard variant.  We have so few variants in our
    * shaders, that doing so almost eliminates draw-time recompiles.  (This
    * is also how we get data from shader-db's ./run)
    */

   if (initial_variants_synchronous(ctx)) {
      static struct ir3_shader_key key; /* static is implicitly zeroed */
      ir3_shader_variant(shader, key, false, &ctx->debug);
      shader->initial_variants_done = true;
   } else {
      struct fd_screen *screen = ctx->screen;
      util_queue_add_job(&screen->compile_queue, hwcso, &hwcso->ready,
                         create_initial_compute_variants_async, NULL, 0);
   }

   return hwcso;
}

void *
ir3_shader_state_create(struct pipe_context *pctx,
                        const struct pipe_shader_state *cso)
{
   struct fd_context *ctx = fd_context(pctx);
   struct ir3_compiler *compiler = ctx->screen->compiler;
   struct ir3_shader_state *hwcso = calloc(1, sizeof(*hwcso));

   /*
    * Convert to nir (if necessary):
    */

   nir_shader *nir;
   if (cso->type == PIPE_SHADER_IR_NIR) {
      /* we take ownership of the reference: */
      nir = cso->ir.nir;
   } else {
      assert(cso->type == PIPE_SHADER_IR_TGSI);
      if (ir3_shader_debug & IR3_DBG_DISASM) {
         tgsi_dump(cso->tokens, 0);
      }
      nir = tgsi_to_nir(cso->tokens, pctx->screen, false);
   }

   if (ctx->screen->gen >= 6)
      ir3_nir_lower_io_to_bindless(nir);

   /*
    * Create ir3_shader:
    *
    * This part is cheap, it doesn't compile initial variants
    */

   struct ir3_stream_output_info stream_output = {};
   copy_stream_out(&stream_output, &cso->stream_output);

   hwcso->shader =
      ir3_shader_from_nir(compiler, nir, &(struct ir3_shader_options){
                              /* TODO: force to single on a6xx with legacy
                               * ballot extension that uses 64-bit masks
                               */
                              .api_wavesize = IR3_SINGLE_OR_DOUBLE,
                              .real_wavesize = IR3_SINGLE_OR_DOUBLE,
                          },
                          &stream_output);

   /*
    * Create initial variants to avoid draw-time stalls.  This is
    * normally done asynchronously, unless debug is enabled (which
    * will be the case for shader-db)
    */

   util_queue_fence_init(&hwcso->ready);

   if (initial_variants_synchronous(ctx)) {
      create_initial_variants(hwcso, &ctx->debug);
   } else {
      util_queue_add_job(&ctx->screen->compile_queue, hwcso, &hwcso->ready,
                         create_initial_variants_async, NULL, 0);
   }

   return hwcso;
}

void
ir3_shader_state_delete(struct pipe_context *pctx, void *_hwcso)
{
   struct fd_context *ctx = fd_context(pctx);
   struct fd_screen *screen = ctx->screen;
   struct ir3_shader_state *hwcso = _hwcso;
   struct ir3_shader *so = hwcso->shader;

   ir3_cache_invalidate(ctx->shader_cache, hwcso);

   /* util_queue_drop_job() guarantees that either:
    *  1) job did not execute
    *  2) job completed
    *
    * In either case the fence is signaled
    */
   util_queue_drop_job(&screen->compile_queue, &hwcso->ready);

   /* free the uploaded shaders, since this is handled outside of the
    * shared ir3 code (ie. not used by turnip):
    */
   for (struct ir3_shader_variant *v = so->variants; v; v = v->next) {
      fd_bo_del(v->bo);
      v->bo = NULL;

      if (v->binning && v->binning->bo) {
         fd_bo_del(v->binning->bo);
         v->binning->bo = NULL;
      }
   }

   ir3_shader_destroy(so);
   util_queue_fence_destroy(&hwcso->ready);
   free(hwcso);
}

struct ir3_shader *
ir3_get_shader(struct ir3_shader_state *hwcso)
{
   if (!hwcso)
      return NULL;

   MESA_TRACE_FUNC();

   struct ir3_shader *shader = hwcso->shader;
   perf_time (1000, "waited for %s:%s:%s variants",
              _mesa_shader_stage_to_abbrev(shader->type),
              shader->nir->info.name,
              shader->nir->info.label) {
      /* wait for initial variants to compile: */
      util_queue_fence_wait(&hwcso->ready);
   }

   return shader;
}

struct shader_info *
ir3_get_shader_info(struct ir3_shader_state *hwcso)
{
   if (!hwcso)
      return NULL;
   return &hwcso->shader->nir->info;
}

/* fixup dirty shader state in case some "unrelated" (from the state-
 * tracker's perspective) state change causes us to switch to a
 * different variant.
 */
void
ir3_fixup_shader_state(struct pipe_context *pctx, struct ir3_shader_key *key)
{
   struct fd_context *ctx = fd_context(pctx);

   if (!ir3_shader_key_equal(ctx->last.key, key)) {
      if (ir3_shader_key_changes_fs(ctx->last.key, key)) {
         fd_context_dirty_shader(ctx, PIPE_SHADER_FRAGMENT,
                                 FD_DIRTY_SHADER_PROG);
      }

      if (ir3_shader_key_changes_vs(ctx->last.key, key)) {
         fd_context_dirty_shader(ctx, PIPE_SHADER_VERTEX, FD_DIRTY_SHADER_PROG);
      }

      /* NOTE: currently only a6xx has gs/tess, but needs no
       * gs/tess specific lowering.
       */

      *ctx->last.key = *key;
   }
}

static char *
ir3_screen_finalize_nir(struct pipe_screen *pscreen, void *nir)
{
   struct fd_screen *screen = fd_screen(pscreen);

   MESA_TRACE_FUNC();

   ir3_nir_lower_io_to_temporaries(nir);
   ir3_finalize_nir(screen->compiler, nir);

   return NULL;
}

static void
ir3_set_max_shader_compiler_threads(struct pipe_screen *pscreen,
                                    unsigned max_threads)
{
   struct fd_screen *screen = fd_screen(pscreen);

   /* This function doesn't allow a greater number of threads than
    * the queue had at its creation.
    */
   util_queue_adjust_num_threads(&screen->compile_queue, max_threads,
                                 false);
}

static bool
ir3_is_parallel_shader_compilation_finished(struct pipe_screen *pscreen,
                                            void *shader,
                                            enum pipe_shader_type shader_type)
{
   struct ir3_shader_state *hwcso = (struct ir3_shader_state *)shader;

   return util_queue_fence_is_signalled(&hwcso->ready);
}

void
ir3_prog_init(struct pipe_context *pctx)
{
   pctx->create_vs_state = ir3_shader_state_create;
   pctx->delete_vs_state = ir3_shader_state_delete;

   pctx->create_tcs_state = ir3_shader_state_create;
   pctx->delete_tcs_state = ir3_shader_state_delete;

   pctx->create_tes_state = ir3_shader_state_create;
   pctx->delete_tes_state = ir3_shader_state_delete;

   pctx->create_gs_state = ir3_shader_state_create;
   pctx->delete_gs_state = ir3_shader_state_delete;

   pctx->create_fs_state = ir3_shader_state_create;
   pctx->delete_fs_state = ir3_shader_state_delete;
}

void
ir3_screen_init(struct pipe_screen *pscreen)
{
   struct fd_screen *screen = fd_screen(pscreen);

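   /* The bindless_fb_read_* options point the compiler at the descriptor
    * used for framebuffer fetch, placed near the top of the bindless image
    * range (leaving room for one slot per render target):
    */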
   struct ir3_compiler_options options = {
      .bindless_fb_read_descriptor =
         ir3_shader_descriptor_set(PIPE_SHADER_FRAGMENT),
      .bindless_fb_read_slot = IR3_BINDLESS_IMAGE_OFFSET +
                               IR3_BINDLESS_IMAGE_COUNT - 1 - screen->max_rts,
      .dual_color_blend_by_location = screen->driconf.dual_color_blend_by_location,
   };

   if (screen->gen >= 6) {
      options.lower_base_vertex = true;
   }
   screen->compiler =
      ir3_compiler_create(screen->dev, screen->dev_id, screen->info, &options);

   /* TODO do we want to limit things to # of fast cores, or just limit
    * based on total # of both big and little cores.  The little cores
    * tend to be in-order and probably much slower for compiling than
    * big cores.  OTOH if they are sitting idle, maybe it is useful to
    * use them?
    */
   unsigned num_threads = sysconf(_SC_NPROCESSORS_ONLN) / 2;

   /* Create at least one thread - even on single core CPU systems. */
   num_threads = MAX2(1, num_threads);

   util_queue_init(&screen->compile_queue, "ir3q", 64, num_threads,
                   UTIL_QUEUE_INIT_RESIZE_IF_FULL |
                      UTIL_QUEUE_INIT_SET_FULL_THREAD_AFFINITY, NULL);

   pscreen->finalize_nir = ir3_screen_finalize_nir;
   pscreen->set_max_shader_compiler_threads =
      ir3_set_max_shader_compiler_threads;
   pscreen->is_parallel_shader_compilation_finished =
      ir3_is_parallel_shader_compilation_finished;
}

void
ir3_screen_fini(struct pipe_screen *pscreen)
{
   struct fd_screen *screen = fd_screen(pscreen);

   util_queue_destroy(&screen->compile_queue);
   ir3_compiler_destroy(screen->compiler);
   screen->compiler = NULL;
}

void
ir3_update_max_tf_vtx(struct fd_context *ctx,
                      const struct ir3_shader_variant *v)
{
   struct fd_streamout_stateobj *so = &ctx->streamout;
   const struct ir3_stream_output_info *info = &v->stream_output;
   uint32_t maxvtxcnt = 0x7fffffff;

   if (v->stream_output.num_outputs == 0)
      maxvtxcnt = 0;
   if (so->num_targets == 0)
      maxvtxcnt = 0;

   /* offset to write to is:
    *
    *   total_vtxcnt = vtxcnt + offsets[i]
    *   offset = total_vtxcnt * stride[i]
    *
    *   offset =   vtxcnt * stride[i]       ; calculated in shader
    *            + offsets[i] * stride[i]   ; calculated at emit_tfbos()
    *
    * assuming for each vtx, each target buffer will have data written
    * up to 'offset + stride[i]', that leaves maxvtxcnt as:
    *
    *   buffer_size = (maxvtxcnt * stride[i]) + stride[i]
    *   maxvtxcnt   = (buffer_size - stride[i]) / stride[i]
    *
    * but shader is actually doing a less-than (rather than less-than-
    * equal) check, so we can drop the -stride[i].
    *
    * TODO is assumption about `offset + stride[i]` legit?
    */
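   /* For example (hypothetical numbers): a target with stride[i] = 4 dwords
    * (16 bytes) and buffer_size = 1024 bytes limits maxvtxcnt for that
    * buffer to 1024 / 16 = 64 vertices; the final value is the minimum
    * across all bound targets.
    */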
   for (unsigned i = 0; i < so->num_targets; i++) {
      struct pipe_stream_output_target *target = so->targets[i];
      unsigned stride = info->stride[i] * 4; /* convert dwords->bytes */
      if (target) {
         uint32_t max = target->buffer_size / stride;
         maxvtxcnt = MIN2(maxvtxcnt, max);
      }
   }

   ctx->streamout.max_tf_vtx = maxvtxcnt;
}

void
ir3_get_private_mem(struct fd_context *ctx, const struct ir3_shader_variant *so)
{
   uint32_t fibers_per_sp = ctx->screen->info->fibers_per_sp;
   uint32_t num_sp_cores = ctx->screen->info->num_sp_cores;

   uint32_t per_fiber_size = so->pvtmem_size;
   if (per_fiber_size > ctx->pvtmem[so->pvtmem_per_wave].per_fiber_size) {
      if (ctx->pvtmem[so->pvtmem_per_wave].bo)
         fd_bo_del(ctx->pvtmem[so->pvtmem_per_wave].bo);

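      /* Size the scratch BO for the worst case: the per-fiber size scaled
       * to every fiber that can be resident on an SP, rounded up to a
       * 4 KiB boundary, then replicated for each SP core:
       */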
      uint32_t per_sp_size = ALIGN(per_fiber_size * fibers_per_sp, 1 << 12);
      uint32_t total_size = per_sp_size * num_sp_cores;

      ctx->pvtmem[so->pvtmem_per_wave].per_fiber_size = per_fiber_size;
      ctx->pvtmem[so->pvtmem_per_wave].per_sp_size = per_sp_size;
      ctx->pvtmem[so->pvtmem_per_wave].bo = fd_bo_new(
         ctx->screen->dev, total_size, FD_BO_NOMAP, "pvtmem_%s_%d",
         so->pvtmem_per_wave ? "per_wave" : "per_fiber", per_fiber_size);
   }
}
673