/*
 * Copyright © 2014 Rob Clark <[email protected]>
 * SPDX-License-Identifier: MIT
 *
 * Authors:
 *    Rob Clark <[email protected]>
 */

#include "pipe/p_screen.h"
#include "pipe/p_state.h"
#include "tgsi/tgsi_dump.h"
#include "util/format/u_format.h"
#include "util/u_inlines.h"
#include "util/u_memory.h"
#include "util/u_string.h"

#include "nir/tgsi_to_nir.h"
#include "nir_serialize.h"

#include "freedreno_context.h"
#include "freedreno_util.h"

#include "ir3/ir3_cache.h"
#include "ir3/ir3_compiler.h"
#include "ir3/ir3_descriptor.h"
#include "ir3/ir3_gallium.h"
#include "ir3/ir3_nir.h"
#include "ir3/ir3_shader.h"

/**
 * The hardware cso for shader state
 *
 * Initially just a container for the ir3_shader, but this is where we'll
 * plumb in async compile.
 */
struct ir3_shader_state {
   struct ir3_shader *shader;

   /* Fence signalled when async compile is completed: */
   struct util_queue_fence ready;
};

/**
 * Should initial variants be compiled synchronously?
 *
 * The only case where util_debug_message() is used in the initial-variants
 * path is with FD_MESA_DEBUG=shaderdb.  So if either debug is disabled (ie.
 * debug.debug_message==NULL), or shaderdb stats are not enabled, we can
 * compile the initial shader variant asynchronously.
 */
static bool
initial_variants_synchronous(struct fd_context *ctx)
{
   return unlikely(ctx->debug.debug_message) || FD_DBG(SHADERDB) ||
          FD_DBG(SERIALC);
}

static void
dump_shader_info(struct ir3_shader_variant *v,
                 struct util_debug_callback *debug)
{
   if (!FD_DBG(SHADERDB))
      return;

   util_debug_message(
      debug, SHADER_INFO,
      "%s shader: %u inst, %u nops, %u non-nops, %u mov, %u cov, "
      "%u dwords, %u last-baryf, %u last-helper, %u half, %u full, %u constlen, "
      "%u cat0, %u cat1, %u cat2, %u cat3, %u cat4, %u cat5, %u cat6, %u cat7, "
      "%u stp, %u ldp, %u sstall, %u (ss), %u systall, %u (sy), %d waves, "
      "%d loops\n",
      ir3_shader_stage(v), v->info.instrs_count, v->info.nops_count,
      v->info.instrs_count - v->info.nops_count, v->info.mov_count,
      v->info.cov_count, v->info.sizedwords, v->info.last_baryf,
      v->info.last_helper, v->info.max_half_reg + 1, v->info.max_reg + 1,
      v->constlen,
      v->info.instrs_per_cat[0], v->info.instrs_per_cat[1],
      v->info.instrs_per_cat[2], v->info.instrs_per_cat[3],
      v->info.instrs_per_cat[4], v->info.instrs_per_cat[5],
      v->info.instrs_per_cat[6], v->info.instrs_per_cat[7],
      v->info.stp_count, v->info.ldp_count, v->info.sstall,
      v->info.ss, v->info.systall, v->info.sy, v->info.max_waves, v->loops);
}

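/* Allocate a GPU buffer object for a freshly compiled variant and upload its
 * instructions into it.
 */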
static void
upload_shader_variant(struct ir3_shader_variant *v)
{
   struct ir3_compiler *compiler = v->compiler;

   assert(!v->bo);

   v->bo =
      fd_bo_new(compiler->dev, v->info.size, FD_BO_NOMAP,
                "%s:%s", ir3_shader_stage(v), v->name);

   /* Always include shaders in kernel crash dumps. */
   fd_bo_mark_for_dump(v->bo);

   fd_bo_upload(v->bo, v->bin, 0, v->info.size);
}

struct ir3_shader_variant *
ir3_shader_variant(struct ir3_shader *shader, struct ir3_shader_key key,
                   bool binning_pass, struct util_debug_callback *debug)
{
   struct ir3_shader_variant *v;
   bool created = false;

   MESA_TRACE_FUNC();

   /* Some shader key values may not be used by a given ir3_shader (for
    * example, fragment shader saturates in the vertex shader), so clean out
    * those flags to avoid recompiling.
    */
   ir3_key_clear_unused(&key, shader);

   v = ir3_shader_get_variant(shader, &key, binning_pass, false, &created);

   if (created) {
      if (shader->initial_variants_done) {
         perf_debug_message(debug, SHADER_INFO,
                            "%s shader: recompiling at draw time: global "
                            "0x%08x, vfsamples %x/%x, astc %x/%x\n",
                            ir3_shader_stage(v), key.global, key.vsamples,
                            key.fsamples, key.vastc_srgb, key.fastc_srgb);
      }

      dump_shader_info(v, debug);
      upload_shader_variant(v);

      if (v->binning) {
         upload_shader_variant(v->binning);
         dump_shader_info(v->binning, debug);
      }
   }

   return v;
}

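/* Translate the gallium stream-output (transform feedback) description into
 * the ir3 representation, recording which streams are actually written.
 */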
static void
copy_stream_out(struct ir3_stream_output_info *i,
                const struct pipe_stream_output_info *p)
{
   STATIC_ASSERT(ARRAY_SIZE(i->stride) == ARRAY_SIZE(p->stride));
   STATIC_ASSERT(ARRAY_SIZE(i->output) == ARRAY_SIZE(p->output));

   i->streams_written = 0;
   i->num_outputs = p->num_outputs;
   for (int n = 0; n < ARRAY_SIZE(i->stride); n++) {
      i->stride[n] = p->stride[n];
      if (p->stride[n])
         i->streams_written |= BIT(n);
   }

   for (int n = 0; n < ARRAY_SIZE(i->output); n++) {
      i->output[n].register_index = p->output[n].register_index;
      i->output[n].start_component = p->output[n].start_component;
      i->output[n].num_components = p->output[n].num_components;
      i->output[n].output_buffer = p->output[n].output_buffer;
      i->output[n].dst_offset = p->output[n].dst_offset;
      i->output[n].stream = p->output[n].stream;
   }
}

static void
create_initial_variants(struct ir3_shader_state *hwcso,
                        struct util_debug_callback *debug)
{
   struct ir3_shader *shader = hwcso->shader;
   struct ir3_compiler *compiler = shader->compiler;
   nir_shader *nir = shader->nir;

   /* Compile standard variants immediately to try to avoid draw-time stalls
    * to run the compiler.
    */
   struct ir3_shader_key key = {
      .tessellation = IR3_TESS_NONE,
      .ucp_enables = MASK(nir->info.clip_distance_array_size),
      .msaa = true,
   };

   switch (nir->info.stage) {
   case MESA_SHADER_TESS_EVAL:
      key.tessellation = ir3_tess_mode(nir->info.tess._primitive_mode);
      break;

   case MESA_SHADER_TESS_CTRL:
      /* The primitive_mode field, while it exists for TCS, is not
       * populated (since separable shaders between TCS/TES are legal,
       * so TCS wouldn't have access to TES's declaration).  Make a
       * guess so that we shader-db something plausible for TCS.
       */
      if (nir->info.outputs_written & VARYING_BIT_TESS_LEVEL_INNER)
         key.tessellation = IR3_TESS_TRIANGLES;
      else
         key.tessellation = IR3_TESS_ISOLINES;
      break;

   case MESA_SHADER_GEOMETRY:
      key.has_gs = true;
      break;

   default:
      break;
   }

   key.safe_constlen = false;
   struct ir3_shader_variant *v = ir3_shader_variant(shader, key, false, debug);
   if (!v)
      return;

   if (v->constlen > compiler->max_const_safe) {
      key.safe_constlen = true;
      ir3_shader_variant(shader, key, false, debug);
   }

   /* For vertex shaders, also compile initial binning pass shader: */
   if (nir->info.stage == MESA_SHADER_VERTEX) {
      key.safe_constlen = false;
      v = ir3_shader_variant(shader, key, true, debug);
      if (!v)
         return;

      if (v->constlen > compiler->max_const_safe) {
         key.safe_constlen = true;
         ir3_shader_variant(shader, key, true, debug);
      }
   }

   shader->initial_variants_done = true;
}

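/* Job entrypoints run on the screen's compile-queue threads to build the
 * initial variants in the background (see initial_variants_synchronous()).
 */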
static void
create_initial_variants_async(void *job, void *gdata, int thread_index)
{
   struct ir3_shader_state *hwcso = job;
   struct util_debug_callback debug = {};

   MESA_TRACE_FUNC();

   create_initial_variants(hwcso, &debug);
}

static void
create_initial_compute_variants_async(void *job, void *gdata, int thread_index)
{
   struct ir3_shader_state *hwcso = job;
   struct ir3_shader *shader = hwcso->shader;
   struct util_debug_callback debug = {};
   static struct ir3_shader_key key; /* static is implicitly zeroed */

   MESA_TRACE_FUNC();

   ir3_shader_variant(shader, key, false, &debug);
   shader->initial_variants_done = true;
}

/* a bit annoying that compute-shader and normal shader state objects
 * aren't a bit more aligned.
 */
void *
ir3_shader_compute_state_create(struct pipe_context *pctx,
                                const struct pipe_compute_state *cso)
{
   struct fd_context *ctx = fd_context(pctx);

   /* req_input_mem will only be non-zero for cl kernels (ie. clover).
    * This isn't a perfect test because I guess it is possible (but
    * uncommon) for none of the kernel parameters to be a global,
    * but ctx->set_global_bindings() can't fail, so this is the next
    * best place to fail if we need a newer version of kernel driver:
    */
   if ((cso->req_input_mem > 0) &&
       fd_device_version(ctx->dev) < FD_VERSION_BO_IOVA) {
      return NULL;
   }

   struct ir3_compiler *compiler = ctx->screen->compiler;
   nir_shader *nir;

   if (cso->ir_type == PIPE_SHADER_IR_NIR) {
      /* we take ownership of the reference: */
      nir = (nir_shader *)cso->prog;
   } else if (cso->ir_type == PIPE_SHADER_IR_NIR_SERIALIZED) {
      const nir_shader_compiler_options *options =
         ir3_get_compiler_options(compiler);
      const struct pipe_binary_program_header *hdr = cso->prog;
      struct blob_reader reader;

      blob_reader_init(&reader, hdr->blob, hdr->num_bytes);
      nir = nir_deserialize(NULL, options, &reader);

      ir3_finalize_nir(compiler, nir);
   } else {
      assert(cso->ir_type == PIPE_SHADER_IR_TGSI);
      if (ir3_shader_debug & IR3_DBG_DISASM) {
         tgsi_dump(cso->prog, 0);
      }
      nir = tgsi_to_nir(cso->prog, pctx->screen, false);
   }

   if (ctx->screen->gen >= 6)
      ir3_nir_lower_io_to_bindless(nir);

   enum ir3_wavesize_option api_wavesize = IR3_SINGLE_OR_DOUBLE;
   enum ir3_wavesize_option real_wavesize = IR3_SINGLE_OR_DOUBLE;

   if (ctx->screen->gen >= 6 && !ctx->screen->info->a6xx.supports_double_threadsize) {
      api_wavesize = IR3_SINGLE_ONLY;
      real_wavesize = IR3_SINGLE_ONLY;
   }

   struct ir3_shader *shader =
      ir3_shader_from_nir(compiler, nir, &(struct ir3_shader_options){
                             /* TODO: force to single on a6xx with legacy
                              * ballot extension that uses 64-bit masks
                              */
                             .api_wavesize = api_wavesize,
                             .real_wavesize = real_wavesize,
                          }, NULL);
   shader->cs.req_input_mem = align(cso->req_input_mem, 4) / 4; /* byte->dword */
   shader->cs.req_local_mem = cso->static_shared_mem;

   struct ir3_shader_state *hwcso = calloc(1, sizeof(*hwcso));

   util_queue_fence_init(&hwcso->ready);
   hwcso->shader = shader;

   /* Immediately compile a standard variant.  We have so few variants in our
    * shaders that doing so almost eliminates draw-time recompiles.  (This
    * is also how we get data from shader-db's ./run)
    */

   if (initial_variants_synchronous(ctx)) {
      static struct ir3_shader_key key; /* static is implicitly zeroed */
      ir3_shader_variant(shader, key, false, &ctx->debug);
      shader->initial_variants_done = true;
   } else {
      struct fd_screen *screen = ctx->screen;
      util_queue_add_job(&screen->compile_queue, hwcso, &hwcso->ready,
                         create_initial_compute_variants_async, NULL, 0);
   }

   return hwcso;
}

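/* Gallium CSO create hook shared by all non-compute shader stages: converts
 * the IR to NIR if necessary, creates the ir3_shader, and kicks off
 * compilation of the initial variants (synchronously or on the compile
 * queue).
 */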
void *
ir3_shader_state_create(struct pipe_context *pctx,
                        const struct pipe_shader_state *cso)
{
   struct fd_context *ctx = fd_context(pctx);
   struct ir3_compiler *compiler = ctx->screen->compiler;
   struct ir3_shader_state *hwcso = calloc(1, sizeof(*hwcso));

   /*
    * Convert to nir (if necessary):
    */

   nir_shader *nir;
   if (cso->type == PIPE_SHADER_IR_NIR) {
      /* we take ownership of the reference: */
      nir = cso->ir.nir;
   } else {
      assert(cso->type == PIPE_SHADER_IR_TGSI);
      if (ir3_shader_debug & IR3_DBG_DISASM) {
         tgsi_dump(cso->tokens, 0);
      }
      nir = tgsi_to_nir(cso->tokens, pctx->screen, false);
   }

   if (ctx->screen->gen >= 6)
      ir3_nir_lower_io_to_bindless(nir);

   /*
    * Create ir3_shader:
    *
    * This part is cheap, it doesn't compile initial variants
    */

   struct ir3_stream_output_info stream_output = {};
   copy_stream_out(&stream_output, &cso->stream_output);

   hwcso->shader =
      ir3_shader_from_nir(compiler, nir, &(struct ir3_shader_options){
                             /* TODO: force to single on a6xx with legacy
                              * ballot extension that uses 64-bit masks
                              */
                             .api_wavesize = IR3_SINGLE_OR_DOUBLE,
                             .real_wavesize = IR3_SINGLE_OR_DOUBLE,
                          },
                          &stream_output);

   /*
    * Create initial variants to avoid draw-time stalls.  This is
    * normally done asynchronously, unless debug is enabled (which
    * will be the case for shader-db)
    */

   util_queue_fence_init(&hwcso->ready);

   if (initial_variants_synchronous(ctx)) {
      create_initial_variants(hwcso, &ctx->debug);
   } else {
      util_queue_add_job(&ctx->screen->compile_queue, hwcso, &hwcso->ready,
                         create_initial_variants_async, NULL, 0);
   }

   return hwcso;
}

void
ir3_shader_state_delete(struct pipe_context *pctx, void *_hwcso)
{
   struct fd_context *ctx = fd_context(pctx);
   struct fd_screen *screen = ctx->screen;
   struct ir3_shader_state *hwcso = _hwcso;
   struct ir3_shader *so = hwcso->shader;

   ir3_cache_invalidate(ctx->shader_cache, hwcso);

   /* util_queue_drop_job() guarantees that either:
    *  1) job did not execute
    *  2) job completed
    *
    * In either case the fence is signalled
    */
   util_queue_drop_job(&screen->compile_queue, &hwcso->ready);

   /* free the uploaded shaders, since this is handled outside of the
    * shared ir3 code (ie. not used by turnip):
    */
   for (struct ir3_shader_variant *v = so->variants; v; v = v->next) {
      fd_bo_del(v->bo);
      v->bo = NULL;

      if (v->binning && v->binning->bo) {
         fd_bo_del(v->binning->bo);
         v->binning->bo = NULL;
      }
   }

   ir3_shader_destroy(so);
   util_queue_fence_destroy(&hwcso->ready);
   free(hwcso);
}

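/* Return the ir3_shader for a hwcso, blocking until any in-flight initial
 * variant compiles have finished.
 */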
struct ir3_shader *
ir3_get_shader(struct ir3_shader_state *hwcso)
{
   if (!hwcso)
      return NULL;

   MESA_TRACE_FUNC();

   struct ir3_shader *shader = hwcso->shader;
   perf_time (1000, "waited for %s:%s:%s variants",
              _mesa_shader_stage_to_abbrev(shader->type),
              shader->nir->info.name,
              shader->nir->info.label) {
      /* wait for initial variants to compile: */
      util_queue_fence_wait(&hwcso->ready);
   }

   return shader;
}

struct shader_info *
ir3_get_shader_info(struct ir3_shader_state *hwcso)
{
   if (!hwcso)
      return NULL;
   return &hwcso->shader->nir->info;
}

/* fixup dirty shader state in case some "unrelated" (from the state-
 * tracker's perspective) state change causes us to switch to a
 * different variant.
 */
void
ir3_fixup_shader_state(struct pipe_context *pctx, struct ir3_shader_key *key)
{
   struct fd_context *ctx = fd_context(pctx);

   if (!ir3_shader_key_equal(ctx->last.key, key)) {
      if (ir3_shader_key_changes_fs(ctx->last.key, key)) {
         fd_context_dirty_shader(ctx, PIPE_SHADER_FRAGMENT,
                                 FD_DIRTY_SHADER_PROG);
      }

      if (ir3_shader_key_changes_vs(ctx->last.key, key)) {
         fd_context_dirty_shader(ctx, PIPE_SHADER_VERTEX, FD_DIRTY_SHADER_PROG);
      }

      /* NOTE: currently only a6xx has gs/tess, but needs no
       * gs/tess specific lowering.
       */

      *ctx->last.key = *key;
   }
}

static char *
ir3_screen_finalize_nir(struct pipe_screen *pscreen, void *nir)
{
   struct fd_screen *screen = fd_screen(pscreen);

   MESA_TRACE_FUNC();

   ir3_nir_lower_io_to_temporaries(nir);
   ir3_finalize_nir(screen->compiler, nir);

   return NULL;
}

static void
ir3_set_max_shader_compiler_threads(struct pipe_screen *pscreen,
                                    unsigned max_threads)
{
   struct fd_screen *screen = fd_screen(pscreen);

   /* This function doesn't allow a greater number of threads than
    * the queue had at its creation.
    */
   util_queue_adjust_num_threads(&screen->compile_queue, max_threads,
                                 false);
}

static bool
ir3_is_parallel_shader_compilation_finished(struct pipe_screen *pscreen,
                                            void *shader,
                                            enum pipe_shader_type shader_type)
{
   struct ir3_shader_state *hwcso = (struct ir3_shader_state *)shader;

   return util_queue_fence_is_signalled(&hwcso->ready);
}

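/* Hook up the shader-state create/delete entrypoints for the graphics stages
 * (compute is handled separately via ir3_shader_compute_state_create()).
 */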
void
ir3_prog_init(struct pipe_context *pctx)
{
   pctx->create_vs_state = ir3_shader_state_create;
   pctx->delete_vs_state = ir3_shader_state_delete;

   pctx->create_tcs_state = ir3_shader_state_create;
   pctx->delete_tcs_state = ir3_shader_state_delete;

   pctx->create_tes_state = ir3_shader_state_create;
   pctx->delete_tes_state = ir3_shader_state_delete;

   pctx->create_gs_state = ir3_shader_state_create;
   pctx->delete_gs_state = ir3_shader_state_delete;

   pctx->create_fs_state = ir3_shader_state_create;
   pctx->delete_fs_state = ir3_shader_state_delete;
}

void
ir3_screen_init(struct pipe_screen *pscreen)
{
   struct fd_screen *screen = fd_screen(pscreen);

   struct ir3_compiler_options options = {
      .bindless_fb_read_descriptor =
         ir3_shader_descriptor_set(PIPE_SHADER_FRAGMENT),
      .bindless_fb_read_slot = IR3_BINDLESS_IMAGE_OFFSET +
         IR3_BINDLESS_IMAGE_COUNT - 1 - screen->max_rts,
      .dual_color_blend_by_location = screen->driconf.dual_color_blend_by_location,
   };

   if (screen->gen >= 6) {
      options.lower_base_vertex = true;
   }
   screen->compiler =
      ir3_compiler_create(screen->dev, screen->dev_id, screen->info, &options);

   /* TODO do we want to limit things to # of fast cores, or just limit
    * based on total # of both big and little cores.  The little cores
    * tend to be in-order and probably much slower for compiling than
    * big cores.  OTOH if they are sitting idle, maybe it is useful to
    * use them?
    */
   unsigned num_threads = sysconf(_SC_NPROCESSORS_ONLN) / 2;

   /* Create at least one thread - even on single core CPU systems. */
   num_threads = MAX2(1, num_threads);

   util_queue_init(&screen->compile_queue, "ir3q", 64, num_threads,
                   UTIL_QUEUE_INIT_RESIZE_IF_FULL |
                      UTIL_QUEUE_INIT_SET_FULL_THREAD_AFFINITY, NULL);

   pscreen->finalize_nir = ir3_screen_finalize_nir;
   pscreen->set_max_shader_compiler_threads =
      ir3_set_max_shader_compiler_threads;
   pscreen->is_parallel_shader_compilation_finished =
      ir3_is_parallel_shader_compilation_finished;
}

void
ir3_screen_fini(struct pipe_screen *pscreen)
{
   struct fd_screen *screen = fd_screen(pscreen);

   util_queue_destroy(&screen->compile_queue);
   ir3_compiler_destroy(screen->compiler);
   screen->compiler = NULL;
}

void
ir3_update_max_tf_vtx(struct fd_context *ctx,
                      const struct ir3_shader_variant *v)
{
   struct fd_streamout_stateobj *so = &ctx->streamout;
   const struct ir3_stream_output_info *info = &v->stream_output;
   uint32_t maxvtxcnt = 0x7fffffff;

   if (v->stream_output.num_outputs == 0)
      maxvtxcnt = 0;
   if (so->num_targets == 0)
      maxvtxcnt = 0;

   /* offset to write to is:
    *
    *   total_vtxcnt = vtxcnt + offsets[i]
    *   offset = total_vtxcnt * stride[i]
    *
    *   offset =   vtxcnt * stride[i]       ; calculated in shader
    *            + offsets[i] * stride[i]   ; calculated at emit_tfbos()
    *
    * assuming for each vtx, each target buffer will have data written
    * up to 'offset + stride[i]', that leaves maxvtxcnt as:
    *
    *   buffer_size = (maxvtxcnt * stride[i]) + stride[i]
    *   maxvtxcnt   = (buffer_size - stride[i]) / stride[i]
    *
    * but shader is actually doing a less-than (rather than less-than-
    * equal) check, so we can drop the -stride[i].
    *
    * TODO is assumption about `offset + stride[i]` legit?
    */
   for (unsigned i = 0; i < so->num_targets; i++) {
      struct pipe_stream_output_target *target = so->targets[i];
      unsigned stride = info->stride[i] * 4; /* convert dwords->bytes */
      if (target) {
         uint32_t max = target->buffer_size / stride;
         maxvtxcnt = MIN2(maxvtxcnt, max);
      }
   }

   ctx->streamout.max_tf_vtx = maxvtxcnt;
}

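/* Make sure the context's private-memory (scratch) BO is large enough for
 * this variant, growing and replacing it if the per-fiber requirement has
 * increased.  Separate allocations are tracked for the per-wave and
 * per-fiber layouts.
 */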
void
ir3_get_private_mem(struct fd_context *ctx, const struct ir3_shader_variant *so)
{
   uint32_t fibers_per_sp = ctx->screen->info->fibers_per_sp;
   uint32_t num_sp_cores = ctx->screen->info->num_sp_cores;

   uint32_t per_fiber_size = so->pvtmem_size;
   if (per_fiber_size > ctx->pvtmem[so->pvtmem_per_wave].per_fiber_size) {
      if (ctx->pvtmem[so->pvtmem_per_wave].bo)
         fd_bo_del(ctx->pvtmem[so->pvtmem_per_wave].bo);

      uint32_t per_sp_size = ALIGN(per_fiber_size * fibers_per_sp, 1 << 12);
      uint32_t total_size = per_sp_size * num_sp_cores;

      ctx->pvtmem[so->pvtmem_per_wave].per_fiber_size = per_fiber_size;
      ctx->pvtmem[so->pvtmem_per_wave].per_sp_size = per_sp_size;
      ctx->pvtmem[so->pvtmem_per_wave].bo = fd_bo_new(
         ctx->screen->dev, total_size, FD_BO_NOMAP, "pvtmem_%s_%d",
         so->pvtmem_per_wave ? "per_wave" : "per_fiber", per_fiber_size);
   }
}