/*
 * Copyright (C) 2018 Alyssa Rosenzweig
 * Copyright (C) 2020 Collabora Ltd.
 * Copyright © 2017 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "decode.h"

#include "drm-uapi/panfrost_drm.h"

#include "pan_blitter.h"
#include "pan_cmdstream.h"
#include "pan_context.h"
#include "pan_indirect_dispatch.h"
#include "pan_jm.h"
#include "pan_job.h"

#if PAN_ARCH >= 10
#error "JM helpers are only used for gen < 10"
#endif

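/* Pre-allocate the descriptors every batch needs: the framebuffer descriptor
 * (with render targets and the ZS/CRC extension on v5+) and the thread/local
 * storage descriptor, which is embedded in the framebuffer descriptor on
 * Midgard.
 */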
void
GENX(jm_init_batch)(struct panfrost_batch *batch)
{
   /* Reserve the framebuffer and local storage descriptors */
   batch->framebuffer =
#if PAN_ARCH == 4
      pan_pool_alloc_desc(&batch->pool.base, FRAMEBUFFER);
#else
      pan_pool_alloc_desc_aggregate(
         &batch->pool.base, PAN_DESC(FRAMEBUFFER), PAN_DESC(ZS_CRC_EXTENSION),
         PAN_DESC_ARRAY(MAX2(batch->key.nr_cbufs, 1), RENDER_TARGET));
#endif

#if PAN_ARCH >= 6
   batch->tls = pan_pool_alloc_desc(&batch->pool.base, LOCAL_STORAGE);
#else
   /* On Midgard, the TLS is embedded in the FB descriptor */
   batch->tls = batch->framebuffer;

#if PAN_ARCH == 5
   struct mali_framebuffer_pointer_packed ptr;

   pan_pack(ptr.opaque, FRAMEBUFFER_POINTER, cfg) {
      cfg.pointer = batch->framebuffer.gpu;
      cfg.render_target_count = 1; /* a necessary lie */
   }

   batch->tls.gpu = ptr.opaque[0];
#endif
#endif
}

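/* Submit a single job chain to the kernel with DRM_IOCTL_PANFROST_SUBMIT,
 * gathering the BO handles the batch references and wiring up the in/out
 * sync objects. Returns 0 on success or an errno on failure.
 */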
static int
jm_submit_jc(struct panfrost_batch *batch, mali_ptr first_job_desc,
             uint32_t reqs, uint32_t out_sync)
{
   struct panfrost_context *ctx = batch->ctx;
   struct pipe_context *gallium = (struct pipe_context *)ctx;
   struct panfrost_device *dev = pan_device(gallium->screen);
   struct drm_panfrost_submit submit = {
      0,
   };
   uint32_t in_syncs[1];
   uint32_t *bo_handles;
   int ret;

   /* When tracing or syncing, we always need a syncobj to wait on, so fall
    * back to the context's syncobj if the caller didn't provide one. */

   if (!out_sync && dev->debug & (PAN_DBG_TRACE | PAN_DBG_SYNC))
      out_sync = ctx->syncobj;

   submit.out_sync = out_sync;
   submit.jc = first_job_desc;
   submit.requirements = reqs;

   if (ctx->in_sync_fd >= 0) {
      ret = drmSyncobjImportSyncFile(panfrost_device_fd(dev), ctx->in_sync_obj,
                                     ctx->in_sync_fd);
      assert(!ret);

      in_syncs[submit.in_sync_count++] = ctx->in_sync_obj;
      close(ctx->in_sync_fd);
      ctx->in_sync_fd = -1;
   }

   if (submit.in_sync_count)
      submit.in_syncs = (uintptr_t)in_syncs;

   bo_handles = calloc(panfrost_pool_num_bos(&batch->pool) +
                          panfrost_pool_num_bos(&batch->invisible_pool) +
                          batch->num_bos + 2,
                       sizeof(*bo_handles));
   assert(bo_handles);

   pan_bo_access *flags = util_dynarray_begin(&batch->bos);
   unsigned end_bo = util_dynarray_num_elements(&batch->bos, pan_bo_access);

   for (int i = 0; i < end_bo; ++i) {
      if (!flags[i])
         continue;

      assert(submit.bo_handle_count < batch->num_bos);
      bo_handles[submit.bo_handle_count++] = i;

      /* Update the BO access flags so that panfrost_bo_wait() knows
       * about all pending accesses.
       * We only keep the READ/WRITE info since this is all the BO
       * wait logic cares about.
       * We also preserve existing flags as this batch might not
       * be the first one to access the BO.
       */
      struct panfrost_bo *bo = pan_lookup_bo(dev, i);

      bo->gpu_access |= flags[i] & (PAN_BO_ACCESS_RW);
   }

   panfrost_pool_get_bo_handles(&batch->pool,
                                bo_handles + submit.bo_handle_count);
   submit.bo_handle_count += panfrost_pool_num_bos(&batch->pool);
   panfrost_pool_get_bo_handles(&batch->invisible_pool,
                                bo_handles + submit.bo_handle_count);
   submit.bo_handle_count += panfrost_pool_num_bos(&batch->invisible_pool);

   /* Add the tiler heap to the list of accessed BOs if the batch has at
    * least one tiler job. The tiler heap is written by tiler jobs and read
    * by fragment jobs (the polygon list comes from this heap).
    */
   if (batch->jm.jobs.vtc_jc.first_tiler)
      bo_handles[submit.bo_handle_count++] =
         panfrost_bo_handle(dev->tiler_heap);

   /* Always used on Bifrost, occasionally used on Midgard */
   bo_handles[submit.bo_handle_count++] =
      panfrost_bo_handle(dev->sample_positions);

   submit.bo_handles = (u64)(uintptr_t)bo_handles;
   if (ctx->is_noop)
      ret = 0;
   else
      ret = drmIoctl(panfrost_device_fd(dev), DRM_IOCTL_PANFROST_SUBMIT, &submit);
   free(bo_handles);

   if (ret)
      return errno;

   /* Trace the job if we're doing that */
   if (dev->debug & (PAN_DBG_TRACE | PAN_DBG_SYNC)) {
      /* Wait so we can get errors reported back */
      ret = drmSyncobjWait(panfrost_device_fd(dev), &out_sync, 1, INT64_MAX,
                           0, NULL);
      if (ret)
         return errno;

      if (dev->debug & PAN_DBG_TRACE)
         pandecode_jc(dev->decode_ctx, submit.jc, panfrost_device_gpu_id(dev));

      if (dev->debug & PAN_DBG_DUMP)
         pandecode_dump_mappings(dev->decode_ctx);

      /* Jobs won't be complete when blackhole rendering; that's OK */
      if (!ctx->is_noop && dev->debug & PAN_DBG_SYNC)
         pandecode_abort_on_fault(dev->decode_ctx, submit.jc, panfrost_device_gpu_id(dev));
   }

   return 0;
}

/* Submit both vertex/tiler and fragment jobs for a batch, possibly with an
 * out-sync corresponding to the later of the two (since there will be an
 * implicit dependency between them). */

int
GENX(jm_submit_batch)(struct panfrost_batch *batch)
{
   struct pipe_screen *pscreen = batch->ctx->base.screen;
   struct panfrost_device *dev = pan_device(pscreen);
   bool has_draws = batch->jm.jobs.vtc_jc.first_job;
   bool has_tiler = batch->jm.jobs.vtc_jc.first_tiler;
   bool has_frag = panfrost_has_fragment_job(batch);
   uint32_t out_sync = batch->ctx->syncobj;
   int ret = 0;

   unsigned reqs =
      batch->need_job_req_cycle_count ? PANFROST_JD_REQ_CYCLE_COUNT : 0;

   /* Take the submit lock to make sure no tiler jobs from other contexts
    * are inserted between our tiler and fragment jobs; failing to do so
    * could result in tiler heap corruption.
    */
   if (has_tiler)
      pthread_mutex_lock(&dev->submit_lock);

   if (has_draws) {
      ret = jm_submit_jc(batch, batch->jm.jobs.vtc_jc.first_job, reqs,
                         has_frag ? 0 : out_sync);

      if (ret)
         goto done;
   }

   if (has_frag) {
      ret = jm_submit_jc(batch, batch->jm.jobs.frag, reqs | PANFROST_JD_REQ_FS,
                         out_sync);
      if (ret)
         goto done;
   }

done:
   if (has_tiler)
      pthread_mutex_unlock(&dev->submit_lock);

   return ret;
}

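/* Have pan_preload_fb() set up framebuffer preloading and chain any tiler
 * jobs it produces onto the batch. On v6+ no extra jobs are expected (see the
 * assert below), since preloading is handled when the framebuffer descriptor
 * is emitted.
 */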
void
GENX(jm_preload_fb)(struct panfrost_batch *batch, struct pan_fb_info *fb)
{
   struct panfrost_device *dev = pan_device(batch->ctx->base.screen);
   struct panfrost_ptr preload_jobs[2];

   unsigned preload_job_count = GENX(pan_preload_fb)(
      &dev->blitter, &batch->pool.base, fb, 0, batch->tls.gpu, preload_jobs);

   assert(PAN_ARCH < 6 || !preload_job_count);

   for (unsigned j = 0; j < preload_job_count; j++) {
      pan_jc_add_job(&batch->jm.jobs.vtc_jc, MALI_JOB_TYPE_TILER, false, false,
                     0, 0, &preload_jobs[j], true);
   }
}

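/* Allocate and fill the fragment job descriptor for the batch, pointing it at
 * the batch's framebuffer descriptor. The job is recorded in batch->jm so it
 * can be submitted after the vertex/tiler chain.
 */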
void
GENX(jm_emit_fragment_job)(struct panfrost_batch *batch,
                           const struct pan_fb_info *pfb)
{
   struct panfrost_ptr transfer =
      pan_pool_alloc_desc(&batch->pool.base, FRAGMENT_JOB);

   GENX(pan_emit_fragment_job_payload)
   (pfb, batch->framebuffer.gpu, transfer.cpu);

   pan_section_pack(transfer.cpu, FRAGMENT_JOB, HEADER, header) {
      header.type = MALI_JOB_TYPE_FRAGMENT;
      header.index = 1;
   }

   batch->jm.jobs.frag = transfer.gpu;
}

#if PAN_ARCH == 9
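/* Fill a Valhall SHADER_ENVIRONMENT with the per-stage state gathered on the
 * batch: resource tables, thread storage, the shader program pointer and the
 * FAU (push uniform) buffer.
 */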
static void
jm_emit_shader_env(struct panfrost_batch *batch,
                   struct MALI_SHADER_ENVIRONMENT *cfg,
                   enum pipe_shader_type stage, mali_ptr shader_ptr)
{
   cfg->resources = panfrost_emit_resources(batch, stage);
   cfg->thread_storage = batch->tls.gpu;
   cfg->shader = shader_ptr;

   /* Each entry of FAU is 64-bits */
   cfg->fau = batch->push_uniforms[stage];
   cfg->fau_count = DIV_ROUND_UP(batch->nr_push_uniforms[stage], 2);
}
#endif

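/* Build and enqueue a compute job for a (possibly indirect) grid launch. On
 * v7 and earlier this uses the legacy invocation/parameters layout; on v9 it
 * uses the compute payload with workgroup counts and merging hints.
 */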
void
GENX(jm_launch_grid)(struct panfrost_batch *batch,
                     const struct pipe_grid_info *info)
{
   struct panfrost_ptr t = pan_pool_alloc_desc(&batch->pool.base, COMPUTE_JOB);

   /* Invoke according to the grid info */

   unsigned num_wg[3] = {info->grid[0], info->grid[1], info->grid[2]};

   if (info->indirect)
      num_wg[0] = num_wg[1] = num_wg[2] = 1;

#if PAN_ARCH <= 7
   panfrost_pack_work_groups_compute(
      pan_section_ptr(t.cpu, COMPUTE_JOB, INVOCATION), num_wg[0], num_wg[1],
      num_wg[2], info->block[0], info->block[1], info->block[2], false,
      info->indirect != NULL);

   pan_section_pack(t.cpu, COMPUTE_JOB, PARAMETERS, cfg) {
      cfg.job_task_split = util_logbase2_ceil(info->block[0] + 1) +
                           util_logbase2_ceil(info->block[1] + 1) +
                           util_logbase2_ceil(info->block[2] + 1);
   }

   pan_section_pack(t.cpu, COMPUTE_JOB, DRAW, cfg) {
      cfg.state = batch->rsd[PIPE_SHADER_COMPUTE];
      cfg.attributes = batch->attribs[PIPE_SHADER_COMPUTE];
      cfg.attribute_buffers = batch->attrib_bufs[PIPE_SHADER_COMPUTE];
      cfg.thread_storage = batch->tls.gpu;
      cfg.uniform_buffers = batch->uniform_buffers[PIPE_SHADER_COMPUTE];
      cfg.push_uniforms = batch->push_uniforms[PIPE_SHADER_COMPUTE];
      cfg.textures = batch->textures[PIPE_SHADER_COMPUTE];
      cfg.samplers = batch->samplers[PIPE_SHADER_COMPUTE];
   }

#if PAN_ARCH == 4
   pan_section_pack(t.cpu, COMPUTE_JOB, COMPUTE_PADDING, cfg)
      ;
#endif
#else
   struct panfrost_context *ctx = batch->ctx;
   struct panfrost_compiled_shader *cs = ctx->prog[PIPE_SHADER_COMPUTE];

   pan_section_pack(t.cpu, COMPUTE_JOB, PAYLOAD, cfg) {
      cfg.workgroup_size_x = info->block[0];
      cfg.workgroup_size_y = info->block[1];
      cfg.workgroup_size_z = info->block[2];

      cfg.workgroup_count_x = num_wg[0];
      cfg.workgroup_count_y = num_wg[1];
      cfg.workgroup_count_z = num_wg[2];

      jm_emit_shader_env(batch, &cfg.compute, PIPE_SHADER_COMPUTE,
                         batch->rsd[PIPE_SHADER_COMPUTE]);

      /* Workgroups may be merged if the shader does not use barriers
       * or shared memory. This condition is checked against the
       * static shared_size at compile-time. We need to check the
       * variable shared size at launch_grid time, because the
       * compiler doesn't know about that.
       */
      cfg.allow_merging_workgroups = cs->info.cs.allow_merging_workgroups &&
                                     (info->variable_shared_mem == 0);

      cfg.task_increment = 1;
      cfg.task_axis = MALI_TASK_AXIS_Z;
   }
#endif

   unsigned indirect_dep = 0;
#if PAN_GPU_SUPPORTS_DISPATCH_INDIRECT
   if (info->indirect) {
      struct panfrost_device *dev = pan_device(batch->ctx->base.screen);
      struct pan_indirect_dispatch_info indirect = {
         .job = t.gpu,
         .indirect_dim = pan_resource(info->indirect)->image.data.base +
                         info->indirect_offset,
         .num_wg_sysval =
            {
               batch->num_wg_sysval[0],
               batch->num_wg_sysval[1],
               batch->num_wg_sysval[2],
            },
      };

      indirect_dep = GENX(pan_indirect_dispatch_emit)(
         &dev->indirect_dispatch, &batch->pool.base, &batch->jm.jobs.vtc_jc,
         &indirect);
   }
#endif

   pan_jc_add_job(&batch->jm.jobs.vtc_jc, MALI_JOB_TYPE_COMPUTE, true, false,
                  indirect_dep, 0, &t, false);
}

#if PAN_ARCH >= 6
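/* Lazily allocate the tiler heap and tiler context descriptors for the batch,
 * caching the result in the tiler context so repeated draws reuse the same
 * descriptor.
 */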
static mali_ptr
jm_emit_tiler_desc(struct panfrost_batch *batch)
{
   struct panfrost_device *dev = pan_device(batch->ctx->base.screen);
   mali_ptr tiler_desc = PAN_ARCH >= 9 ? batch->tiler_ctx.valhall.desc
                                       : batch->tiler_ctx.bifrost.desc;

   if (tiler_desc)
      return tiler_desc;

   struct panfrost_ptr t = pan_pool_alloc_desc(&batch->pool.base, TILER_HEAP);

   pan_pack(t.cpu, TILER_HEAP, heap) {
      heap.size = panfrost_bo_size(dev->tiler_heap);
      heap.base = dev->tiler_heap->ptr.gpu;
      heap.bottom = dev->tiler_heap->ptr.gpu;
      heap.top = dev->tiler_heap->ptr.gpu + panfrost_bo_size(dev->tiler_heap);
   }

   mali_ptr heap = t.gpu;
   unsigned max_levels = dev->tiler_features.max_levels;
   assert(max_levels >= 2);

   t = pan_pool_alloc_desc(&batch->pool.base, TILER_CONTEXT);
   pan_pack(t.cpu, TILER_CONTEXT, tiler) {
      /* TODO: Select hierarchy mask more effectively */
      tiler.hierarchy_mask = (max_levels >= 8) ? 0xFF : 0x28;

      /* For large framebuffers, disable the smallest bin size to
       * avoid pathological tiler memory usage. Required to avoid OOM
       * on dEQP-GLES31.functional.fbo.no_attachments.maximums.all on
       * Mali-G57.
       */
      if (MAX2(batch->key.width, batch->key.height) >= 4096)
         tiler.hierarchy_mask &= ~1;

      tiler.fb_width = batch->key.width;
      tiler.fb_height = batch->key.height;
      tiler.heap = heap;
      tiler.sample_pattern =
         pan_sample_pattern(util_framebuffer_get_num_samples(&batch->key));
#if PAN_ARCH >= 9
      tiler.first_provoking_vertex =
         pan_tristate_get(batch->first_provoking_vertex);
#endif
   }

   if (PAN_ARCH >= 9)
      batch->tiler_ctx.valhall.desc = t.gpu;
   else
      batch->tiler_ctx.bifrost.desc = t.gpu;

   return t.gpu;
}
#endif

#if PAN_ARCH <= 7
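/* Copy the per-stage descriptor pointers (UBOs, push uniforms, textures,
 * samplers) and instancing parameters that the DRAW sections of vertex and
 * tiler jobs share on Midgard/Bifrost.
 */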
static inline void
jm_emit_draw_descs(struct panfrost_batch *batch, struct MALI_DRAW *d,
                   enum pipe_shader_type st)
{
   d->offset_start = batch->ctx->offset_start;
   d->instance_size =
      batch->ctx->instance_count > 1 ? batch->ctx->padded_count : 1;

   d->uniform_buffers = batch->uniform_buffers[st];
   d->push_uniforms = batch->push_uniforms[st];
   d->textures = batch->textures[st];
   d->samplers = batch->samplers[st];
}

static void
jm_emit_vertex_draw(struct panfrost_batch *batch, void *section)
{
   pan_pack(section, DRAW, cfg) {
      cfg.state = batch->rsd[PIPE_SHADER_VERTEX];
      cfg.attributes = batch->attribs[PIPE_SHADER_VERTEX];
      cfg.attribute_buffers = batch->attrib_bufs[PIPE_SHADER_VERTEX];
      cfg.varyings = batch->varyings.vs;
      cfg.varying_buffers = cfg.varyings ? batch->varyings.bufs : 0;
      cfg.thread_storage = batch->tls.gpu;
      jm_emit_draw_descs(batch, &cfg, PIPE_SHADER_VERTEX);
   }
}

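/* Fill a COMPUTE_JOB used as a vertex job: copy the invocation template, set
 * the task split and emit the vertex DRAW section.
 */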
static void
jm_emit_vertex_job(struct panfrost_batch *batch,
                   const struct pipe_draw_info *info, void *invocation_template,
                   void *job)
{
   void *section = pan_section_ptr(job, COMPUTE_JOB, INVOCATION);
   memcpy(section, invocation_template, pan_size(INVOCATION));

   pan_section_pack(job, COMPUTE_JOB, PARAMETERS, cfg) {
      cfg.job_task_split = 5;
   }

   section = pan_section_ptr(job, COMPUTE_JOB, DRAW);
   jm_emit_vertex_draw(batch, section);

#if PAN_ARCH == 4
   pan_section_pack(job, COMPUTE_JOB, COMPUTE_PADDING, cfg)
      ;
#endif
}
#endif /* PAN_ARCH <= 7 */

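/* Pack the DRAW section of a tiler (or Valhall malloc-vertex) job: culling,
 * occlusion queries and, depending on the architecture, either the Valhall
 * fragment shader environment or the Midgard/Bifrost fragment state pointers.
 */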
static void
jm_emit_tiler_draw(void *out, struct panfrost_batch *batch, bool fs_required,
                   enum mesa_prim prim)
{
   struct panfrost_context *ctx = batch->ctx;
   struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
   bool polygon = (prim == MESA_PRIM_TRIANGLES);

   pan_pack(out, DRAW, cfg) {
      /*
       * From the Gallium documentation,
       * pipe_rasterizer_state::cull_face "indicates which faces of
       * polygons to cull". Points and lines are not considered
       * polygons and should be drawn even if all faces are culled.
       * The hardware does not take primitive type into account when
       * culling, so we need to do that check ourselves.
       */
      cfg.cull_front_face = polygon && (rast->cull_face & PIPE_FACE_FRONT);
      cfg.cull_back_face = polygon && (rast->cull_face & PIPE_FACE_BACK);
      cfg.front_face_ccw = rast->front_ccw;

      if (ctx->occlusion_query && ctx->active_queries) {
         if (ctx->occlusion_query->type == PIPE_QUERY_OCCLUSION_COUNTER)
            cfg.occlusion_query = MALI_OCCLUSION_MODE_COUNTER;
         else
            cfg.occlusion_query = MALI_OCCLUSION_MODE_PREDICATE;

         struct panfrost_resource *rsrc =
            pan_resource(ctx->occlusion_query->rsrc);
         cfg.occlusion = rsrc->image.data.base;
         panfrost_batch_write_rsrc(ctx->batch, rsrc, PIPE_SHADER_FRAGMENT);
      }

#if PAN_ARCH >= 9
      struct panfrost_compiled_shader *fs = ctx->prog[PIPE_SHADER_FRAGMENT];

      cfg.multisample_enable = rast->multisample;
      cfg.sample_mask = rast->multisample ? ctx->sample_mask : 0xFFFF;

      /* Use per-sample shading if required by the API. Also use it
       * when a blend shader is used with multisampling, as this is
       * handled by a single ST_TILE in the blend shader with the
       * current sample ID, requiring per-sample shading.
       */
      cfg.evaluate_per_sample =
         (rast->multisample &&
          ((ctx->min_samples > 1) || ctx->valhall_has_blend_shader));

      cfg.single_sampled_lines = !rast->multisample;

      cfg.vertex_array.packet = true;

      cfg.minimum_z = batch->minimum_z;
      cfg.maximum_z = batch->maximum_z;

      cfg.depth_stencil = batch->depth_stencil;

      if (prim == MESA_PRIM_LINES && rast->line_smooth) {
         cfg.multisample_enable = true;
         cfg.single_sampled_lines = false;
      }

      if (fs_required) {
         bool has_oq = ctx->occlusion_query && ctx->active_queries;

         struct pan_earlyzs_state earlyzs = pan_earlyzs_get(
            fs->earlyzs, ctx->depth_stencil->writes_zs || has_oq,
            ctx->blend->base.alpha_to_coverage,
            ctx->depth_stencil->zs_always_passes);

         cfg.pixel_kill_operation = earlyzs.kill;
         cfg.zs_update_operation = earlyzs.update;

         cfg.allow_forward_pixel_to_kill =
            pan_allow_forward_pixel_to_kill(ctx, fs);
         cfg.allow_forward_pixel_to_be_killed = !fs->info.writes_global;

         /* Mask of render targets that may be written. A render
          * target may be written if the fragment shader writes
          * to it AND it actually exists. If the render target
          * doesn't actually exist, the blend descriptor will be
          * OFF so it may be omitted from the mask.
          *
          * Only set when there is a fragment shader, since
          * otherwise no colour updates are possible.
          */
         cfg.render_target_mask =
            (fs->info.outputs_written >> FRAG_RESULT_DATA0) & ctx->fb_rt_mask;

         /* Also use per-sample shading if required by the shader
          */
         cfg.evaluate_per_sample |= fs->info.fs.sample_shading;

         /* Unlike Bifrost, alpha-to-coverage must be included in
          * this identically-named flag. Confusing, isn't it?
          */
         cfg.shader_modifies_coverage = fs->info.fs.writes_coverage ||
                                        fs->info.fs.can_discard ||
                                        ctx->blend->base.alpha_to_coverage;

         /* Blend descriptors are only accessed by a BLEND
          * instruction on Valhall. It follows that if the
          * fragment shader is omitted, we may also omit the
          * blend descriptors.
          */
         cfg.blend = batch->blend;
         cfg.blend_count = MAX2(batch->key.nr_cbufs, 1);
         cfg.alpha_to_coverage = ctx->blend->base.alpha_to_coverage;

         cfg.overdraw_alpha0 = panfrost_overdraw_alpha(ctx, 0);
         cfg.overdraw_alpha1 = panfrost_overdraw_alpha(ctx, 1);

         jm_emit_shader_env(batch, &cfg.shader, PIPE_SHADER_FRAGMENT,
                            batch->rsd[PIPE_SHADER_FRAGMENT]);
      } else {
         /* These operations need to be FORCE to benefit from the
          * depth-only pass optimizations.
          */
         cfg.pixel_kill_operation = MALI_PIXEL_KILL_FORCE_EARLY;
         cfg.zs_update_operation = MALI_PIXEL_KILL_FORCE_EARLY;

         /* No shader and no blend => no shader or blend
          * reasons to disable FPK. The only FPK-related state
          * not covered is alpha-to-coverage which we don't set
          * without blend.
          */
         cfg.allow_forward_pixel_to_kill = true;

         /* No shader => no shader side effects */
         cfg.allow_forward_pixel_to_be_killed = true;

         /* Alpha isn't written so these are vacuous */
         cfg.overdraw_alpha0 = true;
         cfg.overdraw_alpha1 = true;
      }
#else
      cfg.position = batch->varyings.pos;
      cfg.state = batch->rsd[PIPE_SHADER_FRAGMENT];
      cfg.attributes = batch->attribs[PIPE_SHADER_FRAGMENT];
      cfg.attribute_buffers = batch->attrib_bufs[PIPE_SHADER_FRAGMENT];
      cfg.viewport = batch->viewport;
      cfg.varyings = batch->varyings.fs;
      cfg.varying_buffers = cfg.varyings ? batch->varyings.bufs : 0;
      cfg.thread_storage = batch->tls.gpu;

      /* For all primitives but lines DRAW.flat_shading_vertex must
       * be set to 0 and the provoking vertex is selected with the
       * PRIMITIVE.first_provoking_vertex field.
       */
      if (prim == MESA_PRIM_LINES) {
         /* The logic is inverted across arches. */
         cfg.flat_shading_vertex = rast->flatshade_first ^ (PAN_ARCH <= 5);
      }

      jm_emit_draw_descs(batch, &cfg, PIPE_SHADER_FRAGMENT);
#endif
   }
}

/* Packs a primitive descriptor, mostly common between Midgard/Bifrost tiler
 * jobs and Valhall IDVS jobs
 */
static void
jm_emit_primitive(struct panfrost_batch *batch,
                  const struct pipe_draw_info *info,
                  const struct pipe_draw_start_count_bias *draw,
                  bool secondary_shader, void *out)
{
   struct panfrost_context *ctx = batch->ctx;
   UNUSED struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;

   pan_pack(out, PRIMITIVE, cfg) {
      cfg.draw_mode = pan_draw_mode(info->mode);
      if (panfrost_writes_point_size(ctx))
         cfg.point_size_array_format = MALI_POINT_SIZE_ARRAY_FORMAT_FP16;

#if PAN_ARCH <= 8
      /* For line primitives, PRIMITIVE.first_provoking_vertex must
       * be set to true and the provoking vertex is selected with
       * DRAW.flat_shading_vertex.
       */
      if (u_reduced_prim(info->mode) == MESA_PRIM_LINES)
         cfg.first_provoking_vertex = true;
      else
         cfg.first_provoking_vertex = rast->flatshade_first;

      if (panfrost_is_implicit_prim_restart(info)) {
         cfg.primitive_restart = MALI_PRIMITIVE_RESTART_IMPLICIT;
      } else if (info->primitive_restart) {
         cfg.primitive_restart = MALI_PRIMITIVE_RESTART_EXPLICIT;
         cfg.primitive_restart_index = info->restart_index;
      }

      cfg.job_task_split = 6;
#else
      struct panfrost_compiled_shader *fs = ctx->prog[PIPE_SHADER_FRAGMENT];

      cfg.allow_rotating_primitives = allow_rotating_primitives(fs, info);
      cfg.primitive_restart = info->primitive_restart;

      /* Non-fixed restart indices should have been lowered */
      assert(!cfg.primitive_restart || panfrost_is_implicit_prim_restart(info));
#endif

      cfg.low_depth_cull = rast->depth_clip_near;
      cfg.high_depth_cull = rast->depth_clip_far;

      cfg.index_count = draw->count;
      cfg.index_type = panfrost_translate_index_size(info->index_size);

      if (PAN_ARCH >= 9) {
         /* Base vertex offset on Valhall is used for both
          * indexed and non-indexed draws, in a simple way for
          * either. Handle both cases.
          */
         if (cfg.index_type)
            cfg.base_vertex_offset = draw->index_bias;
         else
            cfg.base_vertex_offset = draw->start;

         /* Indices are moved outside the primitive descriptor
          * on Valhall, so we don't need to set that here
          */
      } else if (cfg.index_type) {
         cfg.base_vertex_offset = draw->index_bias - ctx->offset_start;

#if PAN_ARCH <= 7
         cfg.indices = batch->indices;
#endif
      }

#if PAN_ARCH >= 6
      cfg.secondary_shader = secondary_shader;
#endif
   }
}

#if PAN_ARCH == 9
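/* Fill a Valhall MALLOC_VERTEX job for an IDVS draw: primitive state,
 * instance count, varying allocation sizes, tiler pointer, scissor, indices
 * and the position/varying shader environments.
 */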
static void
jm_emit_malloc_vertex_job(struct panfrost_batch *batch,
                          const struct pipe_draw_info *info,
                          const struct pipe_draw_start_count_bias *draw,
                          bool secondary_shader, void *job)
{
   struct panfrost_context *ctx = batch->ctx;
   struct panfrost_compiled_shader *vs = ctx->prog[PIPE_SHADER_VERTEX];
   struct panfrost_compiled_shader *fs = ctx->prog[PIPE_SHADER_FRAGMENT];

   bool fs_required = panfrost_fs_required(
      fs, ctx->blend, &ctx->pipe_framebuffer, ctx->depth_stencil);

   /* Varying shaders only feed data to the fragment shader, so if we omit
    * the fragment shader, we should omit the varying shader too.
    */
   secondary_shader &= fs_required;

   jm_emit_primitive(batch, info, draw, secondary_shader,
                     pan_section_ptr(job, MALLOC_VERTEX_JOB, PRIMITIVE));

   pan_section_pack(job, MALLOC_VERTEX_JOB, INSTANCE_COUNT, cfg) {
      cfg.count = info->instance_count;
   }

   pan_section_pack(job, MALLOC_VERTEX_JOB, ALLOCATION, cfg) {
      if (secondary_shader) {
         unsigned sz = panfrost_vertex_attribute_stride(vs, fs);
         cfg.vertex_packet_stride = sz + 16;
         cfg.vertex_attribute_stride = sz;
      } else {
         /* Hardware requirement for "no varyings" */
         cfg.vertex_packet_stride = 16;
         cfg.vertex_attribute_stride = 0;
      }
   }

   pan_section_pack(job, MALLOC_VERTEX_JOB, TILER, cfg) {
      cfg.address = jm_emit_tiler_desc(batch);
   }

   STATIC_ASSERT(sizeof(batch->scissor) == pan_size(SCISSOR));
   memcpy(pan_section_ptr(job, MALLOC_VERTEX_JOB, SCISSOR), &batch->scissor,
          pan_size(SCISSOR));

   panfrost_emit_primitive_size(
      ctx, info->mode == MESA_PRIM_POINTS, 0,
      pan_section_ptr(job, MALLOC_VERTEX_JOB, PRIMITIVE_SIZE));

   pan_section_pack(job, MALLOC_VERTEX_JOB, INDICES, cfg) {
      cfg.address = batch->indices;
   }

   jm_emit_tiler_draw(pan_section_ptr(job, MALLOC_VERTEX_JOB, DRAW), batch,
                      fs_required, u_reduced_prim(info->mode));

   pan_section_pack(job, MALLOC_VERTEX_JOB, POSITION, cfg) {
      jm_emit_shader_env(batch, &cfg, PIPE_SHADER_VERTEX,
                         panfrost_get_position_shader(batch, info));
   }

   pan_section_pack(job, MALLOC_VERTEX_JOB, VARYING, cfg) {
      /* If a varying shader is used, we configure it with the same
       * state as the position shader for backwards compatible
       * behaviour with Bifrost. This could be optimized.
       */
      if (!secondary_shader)
         continue;

      jm_emit_shader_env(batch, &cfg, PIPE_SHADER_VERTEX,
                         panfrost_get_varying_shader(batch));
   }
}
#endif

#if PAN_ARCH <= 7
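/* Fill a Midgard/Bifrost TILER job: copy the invocation template, pack the
 * primitive and DRAW sections, point at the tiler descriptor on v6+ and emit
 * the primitive size section.
 */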
static void
jm_emit_tiler_job(struct panfrost_batch *batch,
                  const struct pipe_draw_info *info,
                  const struct pipe_draw_start_count_bias *draw,
                  void *invocation_template, bool secondary_shader, void *job)
{
   struct panfrost_context *ctx = batch->ctx;

   void *section = pan_section_ptr(job, TILER_JOB, INVOCATION);
   memcpy(section, invocation_template, pan_size(INVOCATION));

   jm_emit_primitive(batch, info, draw, secondary_shader,
                     pan_section_ptr(job, TILER_JOB, PRIMITIVE));

   void *prim_size = pan_section_ptr(job, TILER_JOB, PRIMITIVE_SIZE);
   enum mesa_prim prim = u_reduced_prim(info->mode);

#if PAN_ARCH >= 6
   pan_section_pack(job, TILER_JOB, TILER, cfg) {
      cfg.address = jm_emit_tiler_desc(batch);
   }

   pan_section_pack(job, TILER_JOB, PADDING, cfg)
      ;
#endif

   jm_emit_tiler_draw(pan_section_ptr(job, TILER_JOB, DRAW), batch, true, prim);

   panfrost_emit_primitive_size(ctx, prim == MESA_PRIM_POINTS,
                                batch->varyings.psiz, prim_size);
}
#endif

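/* Launch the transform feedback program as a compute-style job covering
 * `count` vertices and `info->instance_count` instances. On v7 and earlier
 * this reuses the vertex job path with varyings temporarily cleared, since
 * XFB compute jobs emit no varyings.
 */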
void
GENX(jm_launch_xfb)(struct panfrost_batch *batch,
                    const struct pipe_draw_info *info, unsigned count)
{
   struct panfrost_ptr t = pan_pool_alloc_desc(&batch->pool.base, COMPUTE_JOB);

#if PAN_ARCH == 9
   pan_section_pack(t.cpu, COMPUTE_JOB, PAYLOAD, cfg) {
      cfg.workgroup_size_x = 1;
      cfg.workgroup_size_y = 1;
      cfg.workgroup_size_z = 1;

      cfg.workgroup_count_x = count;
      cfg.workgroup_count_y = info->instance_count;
      cfg.workgroup_count_z = 1;

      jm_emit_shader_env(batch, &cfg.compute, PIPE_SHADER_VERTEX,
                         batch->rsd[PIPE_SHADER_VERTEX]);

      /* TODO: Indexing. Also, this is a legacy feature... */
      cfg.compute.attribute_offset = batch->ctx->offset_start;

      /* Transform feedback shaders do not use barriers or shared
       * memory, so we may merge workgroups.
       */
      cfg.allow_merging_workgroups = true;
      cfg.task_increment = 1;
      cfg.task_axis = MALI_TASK_AXIS_Z;
   }
#else
   struct mali_invocation_packed invocation;

   panfrost_pack_work_groups_compute(&invocation, 1, count,
                                     info->instance_count, 1, 1, 1,
                                     PAN_ARCH <= 5, false);

   /* No varyings on XFB compute jobs. */
   mali_ptr saved_vs_varyings = batch->varyings.vs;

   batch->varyings.vs = 0;
   jm_emit_vertex_job(batch, info, &invocation, t.cpu);
   batch->varyings.vs = saved_vs_varyings;

#endif
   enum mali_job_type job_type = MALI_JOB_TYPE_COMPUTE;
#if PAN_ARCH <= 5
   job_type = MALI_JOB_TYPE_VERTEX;
#endif
   pan_jc_add_job(&batch->jm.jobs.vtc_jc, job_type, true, false, 0, 0, &t,
                  false);
}

#if PAN_ARCH < 9
/*
 * Push jobs required for the rasterization pipeline. If there are side effects
 * from the vertex shader, these are handled ahead-of-time with a compute
 * shader. This function should not be called if rasterization is skipped.
 */
static void
jm_push_vertex_tiler_jobs(struct panfrost_batch *batch,
                          const struct panfrost_ptr *vertex_job,
                          const struct panfrost_ptr *tiler_job)
{
   unsigned vertex =
      pan_jc_add_job(&batch->jm.jobs.vtc_jc, MALI_JOB_TYPE_VERTEX, false, false,
                     0, 0, vertex_job, false);

   pan_jc_add_job(&batch->jm.jobs.vtc_jc, MALI_JOB_TYPE_TILER, false, false,
                  vertex, 0, tiler_job, false);
}
#endif

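/* Emit and enqueue the jobs for a direct draw: a MALLOC_VERTEX job on
 * Valhall, an INDEXED_VERTEX job for Bifrost IDVS, or a separate vertex and
 * tiler job pair otherwise.
 */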
void
GENX(jm_launch_draw)(struct panfrost_batch *batch,
                     const struct pipe_draw_info *info, unsigned drawid_offset,
                     const struct pipe_draw_start_count_bias *draw,
                     unsigned vertex_count)
{
   struct panfrost_context *ctx = batch->ctx;
   struct panfrost_compiled_shader *vs = ctx->prog[PIPE_SHADER_VERTEX];
   bool secondary_shader = vs->info.vs.secondary_enable;
   bool idvs = vs->info.vs.idvs;

#if PAN_ARCH <= 7
   struct mali_invocation_packed invocation;
   if (info->instance_count > 1) {
      panfrost_pack_work_groups_compute(&invocation, 1, vertex_count,
                                        info->instance_count, 1, 1, 1, true,
                                        false);
   } else {
      pan_pack(&invocation, INVOCATION, cfg) {
         cfg.invocations = vertex_count - 1;
         cfg.size_y_shift = 0;
         cfg.size_z_shift = 0;
         cfg.workgroups_x_shift = 0;
         cfg.workgroups_y_shift = 0;
         cfg.workgroups_z_shift = 32;
         cfg.thread_group_split = MALI_SPLIT_MIN_EFFICIENT;
      }
   }

   /* Emit all sorts of descriptors. */
#endif

   UNUSED struct panfrost_ptr tiler, vertex;

   if (idvs) {
#if PAN_ARCH == 9
      tiler = pan_pool_alloc_desc(&batch->pool.base, MALLOC_VERTEX_JOB);
#elif PAN_ARCH >= 6
      tiler = pan_pool_alloc_desc(&batch->pool.base, INDEXED_VERTEX_JOB);
#else
      unreachable("IDVS is unsupported on Midgard");
#endif
   } else {
      vertex = pan_pool_alloc_desc(&batch->pool.base, COMPUTE_JOB);
      tiler = pan_pool_alloc_desc(&batch->pool.base, TILER_JOB);
   }

#if PAN_ARCH == 9
   assert(idvs && "Memory allocated IDVS required on Valhall");

   jm_emit_malloc_vertex_job(batch, info, draw, secondary_shader, tiler.cpu);

   pan_jc_add_job(&batch->jm.jobs.vtc_jc, MALI_JOB_TYPE_MALLOC_VERTEX, false,
                  false, 0, 0, &tiler, false);
#else
   /* Fire off the draw itself */
   jm_emit_tiler_job(batch, info, draw, &invocation, secondary_shader,
                     tiler.cpu);
   if (idvs) {
#if PAN_ARCH >= 6
      jm_emit_vertex_draw(
         batch, pan_section_ptr(tiler.cpu, INDEXED_VERTEX_JOB, VERTEX_DRAW));

      pan_jc_add_job(&batch->jm.jobs.vtc_jc, MALI_JOB_TYPE_INDEXED_VERTEX,
                     false, false, 0, 0, &tiler, false);
#endif
   } else {
      jm_emit_vertex_job(batch, info, &invocation, vertex.cpu);
      jm_push_vertex_tiler_jobs(batch, &vertex, &tiler);
   }
#endif
}

void
GENX(jm_launch_draw_indirect)(struct panfrost_batch *batch,
                              const struct pipe_draw_info *info,
                              unsigned drawid_offset,
                              const struct pipe_draw_indirect_info *indirect)
{
   unreachable("draw indirect not implemented for jm");
}

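/* Emit a WRITE_VALUE job that stores the system timestamp at the given offset
 * into the destination resource, and mark the resource as written by the
 * batch.
 */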
void
GENX(jm_emit_write_timestamp)(struct panfrost_batch *batch,
                              struct panfrost_resource *dst, unsigned offset)
{
   struct panfrost_ptr job =
      pan_pool_alloc_desc(&batch->pool.base, WRITE_VALUE_JOB);

   pan_section_pack(job.cpu, WRITE_VALUE_JOB, PAYLOAD, cfg) {
      cfg.address = dst->image.data.base + dst->image.data.offset + offset;
      cfg.type = MALI_WRITE_VALUE_TYPE_SYSTEM_TIMESTAMP;
   }

   pan_jc_add_job(&batch->jm.jobs.vtc_jc, MALI_JOB_TYPE_WRITE_VALUE, false,
                  false, 0, 0, &job, false);
   panfrost_batch_write_rsrc(batch, dst, PIPE_SHADER_VERTEX);
}