xref: /aosp_15_r20/external/mesa3d/src/intel/compiler/brw_fs_thread_payload.cpp (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 /*
2  * Copyright © 2006-2022 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
24 #include "brw_fs.h"
25 #include "brw_fs_builder.h"
26 
27 using namespace brw;
28 
vs_thread_payload(const fs_visitor & v)29 vs_thread_payload::vs_thread_payload(const fs_visitor &v)
30 {
31    unsigned r = 0;
32 
33    /* R0: Thread header. */
34    r += reg_unit(v.devinfo);
35 
36    /* R1: URB handles. */
37    urb_handles = brw_ud8_grf(r, 0);
38    r += reg_unit(v.devinfo);
39 
40    num_regs = r;
41 }
42 
tcs_thread_payload(const fs_visitor & v)43 tcs_thread_payload::tcs_thread_payload(const fs_visitor &v)
44 {
45    struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(v.prog_data);
46    struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(v.prog_data);
47    struct brw_tcs_prog_key *tcs_key = (struct brw_tcs_prog_key *) v.key;
48 
49    if (vue_prog_data->dispatch_mode == INTEL_DISPATCH_MODE_TCS_SINGLE_PATCH) {
50       patch_urb_output = brw_ud1_grf(0, 0);
51       primitive_id = brw_vec1_grf(0, 1);
52 
53       /* r1-r4 contain the ICP handles. */
54       icp_handle_start = brw_ud8_grf(1, 0);
55 
56       num_regs = 5;
57    } else {
58       assert(vue_prog_data->dispatch_mode == INTEL_DISPATCH_MODE_TCS_MULTI_PATCH);
59       assert(tcs_key->input_vertices <= BRW_MAX_TCS_INPUT_VERTICES);
60 
61       unsigned r = 0;
62 
63       r += reg_unit(v.devinfo);
64 
65       patch_urb_output = brw_ud8_grf(r, 0);
66       r += reg_unit(v.devinfo);
67 
68       if (tcs_prog_data->include_primitive_id) {
69          primitive_id = brw_vec8_grf(r, 0);
70          r += reg_unit(v.devinfo);
71       }
72 
73       /* ICP handles occupy the next 1-32 registers. */
74       icp_handle_start = brw_ud8_grf(r, 0);
75       r += brw_tcs_prog_key_input_vertices(tcs_key) * reg_unit(v.devinfo);
76 
77       num_regs = r;
78    }
79 }
80 
tes_thread_payload(const fs_visitor & v)81 tes_thread_payload::tes_thread_payload(const fs_visitor &v)
82 {
83    unsigned r = 0;
84 
85    /* R0: Thread Header. */
86    patch_urb_input = retype(brw_vec1_grf(0, 0), BRW_TYPE_UD);
87    primitive_id = brw_vec1_grf(0, 1);
88    r += reg_unit(v.devinfo);
89 
90    /* R1-3: gl_TessCoord.xyz. */
91    for (unsigned i = 0; i < 3; i++) {
92       coords[i] = brw_vec8_grf(r, 0);
93       r += reg_unit(v.devinfo);
94    }
95 
96    /* R4: URB output handles. */
97    urb_output = brw_ud8_grf(r, 0);
98    r += reg_unit(v.devinfo);
99 
100    num_regs = r;
101 }
102 
gs_thread_payload::gs_thread_payload(fs_visitor &v)
{
   struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(v.prog_data);
   struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(v.prog_data);
   const fs_builder bld = fs_builder(&v).at_end();

   /* R0: thread header. */
   unsigned r = reg_unit(v.devinfo);

   /* R1: output URB handles.  Only the low bits hold the handle (24 bits on
    * Gfx20+, 16 bits before), so mask the raw payload register into a vgrf.
    */
   urb_handles = bld.vgrf(BRW_TYPE_UD);
   bld.AND(urb_handles, brw_ud8_grf(r, 0),
         v.devinfo->ver >= 20 ? brw_imm_ud(0xFFFFFF) : brw_imm_ud(0xFFFF));

   /* R1: Instance ID stored in bits 31:27, extracted with a shift. */
   instance_id = bld.vgrf(BRW_TYPE_UD);
   bld.SHR(instance_id, brw_ud8_grf(r, 0), brw_imm_ud(27u));

   r += reg_unit(v.devinfo);

   /* R2: primitive ID, delivered only when the shader uses it. */
   if (gs_prog_data->include_primitive_id) {
      primitive_id = brw_ud8_grf(r, 0);
      r += reg_unit(v.devinfo);
   }

   /* Always enable VUE handles so we can safely use pull model if needed.
    *
    * The push model for a GS uses a ton of register space even for trivial
    * scenarios with just a few inputs, so just make things easier and a bit
    * safer by always having pull model available.
    */
   gs_prog_data->base.include_vue_handles = true;

   /* R3..RN: ICP Handles for each incoming vertex (when using pull model) */
   icp_handle_start = brw_ud8_grf(r, 0);
   r += v.nir->info.gs.vertices_in * reg_unit(v.devinfo);

   num_regs = r;

   /* Use a maximum of 24 registers for push-model inputs. */
   const unsigned max_push_components = 24;

   /* If pushing our inputs would take too many registers, reduce the URB read
    * length (which is in HWords, or 8 registers), and resort to pulling.
    *
    * Note that the GS reads <URB Read Length> HWords for every vertex - so we
    * have to multiply by VerticesIn to obtain the total storage requirement.
    */
   if (8 * vue_prog_data->urb_read_length * v.nir->info.gs.vertices_in >
       max_push_components) {
      vue_prog_data->urb_read_length =
         ROUND_DOWN_TO(max_push_components / v.nir->info.gs.vertices_in, 8) / 8;
   }
}
157 
/* Lay out the fragment-shader thread payload for Gfx20+ (Xe2).
 *
 * Assigns a starting register index to each payload field the hardware will
 * deliver and advances payload.num_regs past all of them.  The layout is
 * SIMD16-based: dispatch widths above 16 get one copy of the per-subspan
 * fields for each SIMD16 half (indexed by j below).  Fields are only present
 * when the corresponding prog_data flag is set.
 *
 * Also sets source_depth_to_render_target when the shader writes
 * gl_FragDepth, since the render target write will then need source depth.
 */
static inline void
setup_fs_payload_gfx20(fs_thread_payload &payload,
                       const fs_visitor &v,
                       bool &source_depth_to_render_target)
{
   struct brw_wm_prog_data *prog_data = brw_wm_prog_data(v.prog_data);
   const unsigned payload_width = 16;
   assert(v.dispatch_width % payload_width == 0);
   assert(v.devinfo->ver >= 20);

   for (unsigned j = 0; j < v.dispatch_width / payload_width; j++) {
      /* R0-1: PS thread payload header, masks and pixel X/Y coordinates. */
      payload.num_regs++;
      payload.subspan_coord_reg[j] = payload.num_regs++;
   }

   for (unsigned j = 0; j < v.dispatch_width / payload_width; j++) {
      /* R2-13: Barycentric interpolation coordinates.  These appear
       * in the same order that they appear in the brw_barycentric_mode
       * enum.  Each set of coordinates occupies 2 64B registers per
       * SIMD16 half.  Coordinates only appear if they were enabled
       * using the "Barycentric Interpolation Mode" bits in WM_STATE.
       */
      for (int i = 0; i < BRW_BARYCENTRIC_MODE_COUNT; ++i) {
         if (prog_data->barycentric_interp_modes & (1 << i)) {
            payload.barycentric_coord_reg[i][j] = payload.num_regs;
            payload.num_regs += payload_width / 4;
         }
      }

      /* R14: Interpolated depth if "Pixel Shader Uses Source Depth" is set. */
      if (prog_data->uses_src_depth) {
         payload.source_depth_reg[j] = payload.num_regs;
         payload.num_regs += payload_width / 8;
      }

      /* R15: Interpolated W if "Pixel Shader Uses Source W" is set. */
      if (prog_data->uses_src_w) {
         payload.source_w_reg[j] = payload.num_regs;
         payload.num_regs += payload_width / 8;
      }

      /* R16: MSAA input coverage mask if "Pixel Shader Uses Input
       * Coverage Mask" is set.
       */
      if (prog_data->uses_sample_mask) {
         payload.sample_mask_in_reg[j] = payload.num_regs;
         payload.num_regs += payload_width / 8;
      }

      /* R19: MSAA position XY offsets if "Position XY Offset Select"
       * is either POSOFFSET_CENTROID or POSOFFSET_SAMPLE.  Note that
       * this is delivered as a single SIMD32 vector, inconsistently
       * with most other PS payload fields.
       *
       * Because it is one SIMD32 vector, it is claimed only on the
       * first iteration (j == 0), not once per SIMD16 half.
       */
      if (prog_data->uses_pos_offset && j == 0) {
         for (unsigned k = 0; k < 2; k++) {
            payload.sample_pos_reg[k] = payload.num_regs;
            payload.num_regs++;
         }
      }

      /* R22: Sample offsets.  Also delivered once, not per half. */
      if (prog_data->uses_sample_offsets && j == 0) {
         payload.sample_offsets_reg = payload.num_regs;
         payload.num_regs += 2;
      }
   }

   /* RP0: Source Depth and/or W Attribute Vertex Deltas and/or
    * Perspective Bary Planes.  These share the same registers, so both
    * fields get the same starting index.
    */
   if (prog_data->uses_depth_w_coefficients ||
       prog_data->uses_pc_bary_coefficients) {
      payload.depth_w_coef_reg = payload.pc_bary_coef_reg = payload.num_regs;
      payload.num_regs += 2 * v.max_polygons;
   }

   /* RP4: Non-Perspective Bary planes. */
   if (prog_data->uses_npc_bary_coefficients) {
      payload.npc_bary_coef_reg = payload.num_regs;
      payload.num_regs += 2 * v.max_polygons;
   }

   /* Writing gl_FragDepth means the RT write needs source depth. */
   if (v.nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
      source_depth_to_render_target = true;
   }
}
246 
/* Lay out the fragment-shader thread payload for Gfx9 through Gfx12.x.
 *
 * Assigns a starting register index to each payload field the hardware will
 * deliver and advances payload.num_regs past all of them.  The payload is
 * built per SIMD8/SIMD16 slice (indexed by j); fields are only present when
 * the corresponding prog_data flag is set.
 *
 * Also sets source_depth_to_render_target when the shader writes
 * gl_FragDepth, since the render target write will then need source depth.
 */
static inline void
setup_fs_payload_gfx9(fs_thread_payload &payload,
                      const fs_visitor &v,
                      bool &source_depth_to_render_target)
{
   struct brw_wm_prog_data *prog_data = brw_wm_prog_data(v.prog_data);

   const unsigned payload_width = MIN2(16, v.dispatch_width);
   assert(v.dispatch_width % payload_width == 0);
   assert(v.devinfo->ver < 20);

   payload.num_regs = 0;

   /* R0: PS thread payload header. */
   payload.num_regs++;

   for (unsigned j = 0; j < v.dispatch_width / payload_width; j++) {
      /* R1: masks, pixel X/Y coordinates. */
      payload.subspan_coord_reg[j] = payload.num_regs++;
   }

   for (unsigned j = 0; j < v.dispatch_width / payload_width; j++) {
      /* R3-26: barycentric interpolation coordinates.  These appear in the
       * same order that they appear in the brw_barycentric_mode enum.  Each
       * set of coordinates occupies 2 registers if dispatch width == 8 and 4
       * registers if dispatch width == 16.  Coordinates only appear if they
       * were enabled using the "Barycentric Interpolation Mode" bits in
       * WM_STATE.
       */
      for (int i = 0; i < BRW_BARYCENTRIC_MODE_COUNT; ++i) {
         if (prog_data->barycentric_interp_modes & (1 << i)) {
            payload.barycentric_coord_reg[i][j] = payload.num_regs;
            payload.num_regs += payload_width / 4;
         }
      }

      /* R27-28: interpolated depth if uses source depth */
      if (prog_data->uses_src_depth) {
         payload.source_depth_reg[j] = payload.num_regs;
         payload.num_regs += payload_width / 8;
      }

      /* R29-30: interpolated W set if GFX6_WM_USES_SOURCE_W. */
      if (prog_data->uses_src_w) {
         payload.source_w_reg[j] = payload.num_regs;
         payload.num_regs += payload_width / 8;
      }

      /* R31: MSAA position offsets. */
      if (prog_data->uses_pos_offset) {
         payload.sample_pos_reg[j] = payload.num_regs;
         payload.num_regs++;
      }

      /* R32-33: MSAA input coverage mask */
      if (prog_data->uses_sample_mask) {
         payload.sample_mask_in_reg[j] = payload.num_regs;
         payload.num_regs += payload_width / 8;
      }
   }

   /* R66: Source Depth and/or W Attribute Vertex Deltas. */
   if (prog_data->uses_depth_w_coefficients) {
      payload.depth_w_coef_reg = payload.num_regs;
      payload.num_regs += v.max_polygons;
   }

   /* R68: Perspective bary planes. */
   if (prog_data->uses_pc_bary_coefficients) {
      payload.pc_bary_coef_reg = payload.num_regs;
      payload.num_regs += v.max_polygons;
   }

   /* R70: Non-perspective bary planes. */
   if (prog_data->uses_npc_bary_coefficients) {
      payload.npc_bary_coef_reg = payload.num_regs;
      payload.num_regs += v.max_polygons;
   }

   /* R72: Sample offsets. */
   if (prog_data->uses_sample_offsets) {
      payload.sample_offsets_reg = payload.num_regs;
      payload.num_regs++;
   }

   /* Writing gl_FragDepth means the RT write needs source depth. */
   if (v.nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
      source_depth_to_render_target = true;
   }
}
336 
fs_thread_payload(const fs_visitor & v,bool & source_depth_to_render_target)337 fs_thread_payload::fs_thread_payload(const fs_visitor &v,
338                                      bool &source_depth_to_render_target)
339   : subspan_coord_reg(),
340     source_depth_reg(),
341     source_w_reg(),
342     aa_dest_stencil_reg(),
343     dest_depth_reg(),
344     sample_pos_reg(),
345     sample_mask_in_reg(),
346     barycentric_coord_reg(),
347     depth_w_coef_reg(),
348     pc_bary_coef_reg(),
349     npc_bary_coef_reg(),
350     sample_offsets_reg()
351 {
352    if (v.devinfo->ver >= 20)
353       setup_fs_payload_gfx20(*this, v, source_depth_to_render_target);
354    else
355       setup_fs_payload_gfx9(*this, v, source_depth_to_render_target);
356 }
357 
cs_thread_payload(const fs_visitor & v)358 cs_thread_payload::cs_thread_payload(const fs_visitor &v)
359 {
360    struct brw_cs_prog_data *prog_data = brw_cs_prog_data(v.prog_data);
361 
362    unsigned r = reg_unit(v.devinfo);
363 
364    /* See nir_setup_uniforms for subgroup_id in earlier versions. */
365    if (v.devinfo->verx10 >= 125) {
366       subgroup_id_ = brw_ud1_grf(0, 2);
367 
368       for (int i = 0; i < 3; i++) {
369          if (prog_data->generate_local_id & (1 << i)) {
370             local_invocation_id[i] = brw_uw8_grf(r, 0);
371             r += reg_unit(v.devinfo);
372             if (v.devinfo->ver < 20 && v.dispatch_width == 32)
373                r += reg_unit(v.devinfo);
374          } else {
375             local_invocation_id[i] = brw_imm_uw(0);
376          }
377       }
378 
379       /* TODO: Fill out uses_btd_stack_ids automatically */
380       if (prog_data->uses_btd_stack_ids)
381          r += reg_unit(v.devinfo);
382    }
383 
384    num_regs = r;
385 }
386 
387 void
load_subgroup_id(const fs_builder & bld,brw_reg & dest) const388 cs_thread_payload::load_subgroup_id(const fs_builder &bld,
389                                     brw_reg &dest) const
390 {
391    auto devinfo = bld.shader->devinfo;
392    dest = retype(dest, BRW_TYPE_UD);
393 
394    if (subgroup_id_.file != BAD_FILE) {
395       assert(devinfo->verx10 >= 125);
396       bld.AND(dest, subgroup_id_, brw_imm_ud(INTEL_MASK(7, 0)));
397    } else {
398       assert(devinfo->verx10 < 125);
399       assert(gl_shader_stage_is_compute(bld.shader->stage));
400       int index = brw_get_subgroup_id_param_index(devinfo,
401                                                   bld.shader->prog_data);
402       bld.MOV(dest, brw_uniform_reg(index, BRW_TYPE_UD));
403    }
404 }
405 
task_mesh_thread_payload::task_mesh_thread_payload(fs_visitor &v)
   : cs_thread_payload(v)
{
   /* Task and Mesh Shader Payloads (SIMD8 and SIMD16)
    *
    *  R0: Header
    *  R1: Local_ID.X[0-7 or 0-15]
    *  R2: Inline Parameter
    *
    * Task and Mesh Shader Payloads (SIMD32)
    *
    *  R0: Header
    *  R1: Local_ID.X[0-15]
    *  R2: Local_ID.X[16-31]
    *  R3: Inline Parameter
    *
    * Local_ID.X values are 16 bits.
    *
    * Inline parameter is optional but always present since we use it to pass
    * the address to descriptors.
    */

   const fs_builder bld = fs_builder(&v).at_end();

   unsigned r = 0;
   /* The cs_thread_payload base constructor only sets subgroup_id_ on
    * verx10 >= 125, so task/mesh requires Gfx12.5+.
    */
   assert(subgroup_id_.file != BAD_FILE);
   /* r0.3 carries the extended parameter. */
   extended_parameter_0 = retype(brw_vec1_grf(0, 3), BRW_TYPE_UD);

   if (v.devinfo->ver >= 20) {
      /* On Gfx20+ the URB output handle is delivered directly in r1.0. */
      urb_output = brw_ud1_grf(1, 0);
   } else {
      urb_output = bld.vgrf(BRW_TYPE_UD);
      /* In both mesh and task shader payload, lower 16 bits of g0.6 is
       * an offset within Slice's Local URB, which says where shader is
       * supposed to output its data.
       */
      bld.AND(urb_output, brw_ud1_grf(0, 6), brw_imm_ud(0xFFFF));
   }

   if (v.stage == MESA_SHADER_MESH) {
      /* g0.7 is Task Shader URB Entry Offset, which contains both an offset
       * within Slice's Local USB (bits 0:15) and a slice selector
       * (bits 16:24). Slice selector can be non zero when mesh shader
       * is spawned on slice other than the one where task shader was run.
       * Bit 24 says that Slice ID is present and bits 16:23 is the Slice ID.
       */
      task_urb_input = brw_ud1_grf(0, 7);
   }
   /* Account for R0 (header). */
   r += reg_unit(v.devinfo);

   /* R1 (and R2 for pre-Gfx20 SIMD32): 16-bit Local_ID.X values. */
   local_index = brw_uw8_grf(r, 0);
   r += reg_unit(v.devinfo);
   if (v.devinfo->ver < 20 && v.dispatch_width == 32)
      r += reg_unit(v.devinfo);

   /* Final register: the inline parameter (descriptor address). */
   inline_parameter = brw_ud1_grf(r, 0);
   r += reg_unit(v.devinfo);

   num_regs = r;
}
466 
bs_thread_payload(const fs_visitor & v)467 bs_thread_payload::bs_thread_payload(const fs_visitor &v)
468 {
469    unsigned r = 0;
470 
471    /* R0: Thread header. */
472    r += reg_unit(v.devinfo);
473 
474    /* R1: Stack IDs. */
475    r += reg_unit(v.devinfo);
476 
477    /* R2: Inline Parameter.  Used for argument addresses. */
478    global_arg_ptr = brw_ud1_grf(r, 0);
479    local_arg_ptr = brw_ud1_grf(r, 2);
480    r += reg_unit(v.devinfo);
481 
482    num_regs = r;
483 }
484 
485 void
load_shader_type(const fs_builder & bld,brw_reg & dest) const486 bs_thread_payload::load_shader_type(const fs_builder &bld, brw_reg &dest) const
487 {
488    brw_reg ud_dest = retype(dest, BRW_TYPE_UD);
489    bld.MOV(ud_dest, retype(brw_vec1_grf(0, 3), ud_dest.type));
490    bld.AND(ud_dest, ud_dest, brw_imm_ud(0xf));
491 }
492