1 /*
2 * Copyright © 2010 Intel Corporation
3 * SPDX-License-Identifier: MIT
4 */
5
6 #include "brw_eu.h"
7 #include "brw_fs.h"
8 #include "brw_fs_builder.h"
9 #include "brw_fs_live_variables.h"
10 #include "brw_nir.h"
11 #include "brw_cfg.h"
12 #include "brw_private.h"
13 #include "intel_nir.h"
14 #include "shader_enums.h"
15 #include "dev/intel_debug.h"
16 #include "dev/intel_wa.h"
17
18 #include <memory>
19
20 using namespace brw;
21
22 static fs_inst *
23 brw_emit_single_fb_write(fs_visitor &s, const fs_builder &bld,
24 brw_reg color0, brw_reg color1,
25 brw_reg src0_alpha, unsigned components)
26 {
27 assert(s.stage == MESA_SHADER_FRAGMENT);
28 struct brw_wm_prog_data *prog_data = brw_wm_prog_data(s.prog_data);
29
30 /* Hand over gl_FragDepth or the payload depth. */
31 const brw_reg dst_depth = fetch_payload_reg(bld, s.fs_payload().dest_depth_reg);
32 brw_reg src_depth, src_stencil;
33
34 if (s.nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
35 src_depth = s.frag_depth;
36
37 if (s.nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL))
38 src_stencil = s.frag_stencil;
39
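/* Note: the ordering of these sources is expected to line up with the
 * FB_WRITE_LOGICAL_SRC_* enum indices; the assert below only checks the
 * count.
 */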
40 const brw_reg sources[] = {
41 color0, color1, src0_alpha, src_depth, dst_depth, src_stencil,
42 (prog_data->uses_omask ? s.sample_mask : brw_reg()),
43 brw_imm_ud(components)
44 };
45 assert(ARRAY_SIZE(sources) - 1 == FB_WRITE_LOGICAL_SRC_COMPONENTS);
46 fs_inst *write = bld.emit(FS_OPCODE_FB_WRITE_LOGICAL, brw_reg(),
47 sources, ARRAY_SIZE(sources));
48
49 if (prog_data->uses_kill) {
50 write->predicate = BRW_PREDICATE_NORMAL;
51 write->flag_subreg = sample_mask_flag_subreg(s);
52 }
53
54 return write;
55 }
56
57 static void
58 brw_do_emit_fb_writes(fs_visitor &s, int nr_color_regions, bool replicate_alpha)
59 {
60 const fs_builder bld = fs_builder(&s).at_end();
61 fs_inst *inst = NULL;
62
63 for (int target = 0; target < nr_color_regions; target++) {
64 /* Skip over outputs that weren't written. */
65 if (s.outputs[target].file == BAD_FILE)
66 continue;
67
68 const fs_builder abld = bld.annotate(
69 ralloc_asprintf(s.mem_ctx, "FB write target %d", target));
70
71 brw_reg src0_alpha;
72 if (replicate_alpha && target != 0)
73 src0_alpha = offset(s.outputs[0], bld, 3);
74
75 inst = brw_emit_single_fb_write(s, abld, s.outputs[target],
76 s.dual_src_output, src0_alpha, 4);
77 inst->target = target;
78 }
79
80 if (inst == NULL) {
81 /* Even if there are no color buffers enabled, we still need to send
82 * alpha out the pipeline to our null renderbuffer to support
83 * alpha-testing, alpha-to-coverage, and so on.
84 */
85 /* FINISHME: Factor out this frequently recurring pattern into a
86 * helper function.
87 */
88 const brw_reg srcs[] = { reg_undef, reg_undef,
89 reg_undef, offset(s.outputs[0], bld, 3) };
90 const brw_reg tmp = bld.vgrf(BRW_TYPE_UD, 4);
91 bld.LOAD_PAYLOAD(tmp, srcs, 4, 0);
92
93 inst = brw_emit_single_fb_write(s, bld, tmp, reg_undef, reg_undef, 4);
94 inst->target = 0;
95 }
96
97 inst->last_rt = true;
98 inst->eot = true;
99 }
100
101 static void
102 brw_emit_fb_writes(fs_visitor &s)
103 {
104 const struct intel_device_info *devinfo = s.devinfo;
105 assert(s.stage == MESA_SHADER_FRAGMENT);
106 struct brw_wm_prog_data *prog_data = brw_wm_prog_data(s.prog_data);
107 brw_wm_prog_key *key = (brw_wm_prog_key*) s.key;
108
109 if (s.nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL)) {
110 /* From the 'Render Target Write message' section of the docs:
111 * "Output Stencil is not supported with SIMD16 Render Target Write
112 * Messages."
113 */
114 if (devinfo->ver >= 20)
115 s.limit_dispatch_width(16, "gl_FragStencilRefARB unsupported "
116 "in SIMD32+ mode.\n");
117 else
118 s.limit_dispatch_width(8, "gl_FragStencilRefARB unsupported "
119 "in SIMD16+ mode.\n");
120 }
121
122 /* ANV doesn't know about the sample mask output during wm key creation,
123 * so we compute whether we need to replicate alpha and emit the
124 * alpha-to-coverage workaround here.
125 */
126 const bool replicate_alpha = key->alpha_test_replicate_alpha ||
127 (key->nr_color_regions > 1 && key->alpha_to_coverage &&
128 s.sample_mask.file == BAD_FILE);
129
130 prog_data->dual_src_blend = (s.dual_src_output.file != BAD_FILE &&
131 s.outputs[0].file != BAD_FILE);
132 assert(!prog_data->dual_src_blend || key->nr_color_regions == 1);
133
134 /* Following condition implements Wa_14017468336:
135 *
136 * "If dual source blend is enabled do not enable SIMD32 dispatch" and
137 * "For a thread dispatched as SIMD32, must not issue SIMD8 message with Last
138 * Render Target Select set."
139 */
140 if (devinfo->ver >= 11 && devinfo->ver <= 12 &&
141 prog_data->dual_src_blend) {
142 /* The dual-source RT write messages fail to release the thread
143 * dependency on ICL and TGL with SIMD32 dispatch, leading to hangs.
144 *
145 * XXX - Emit an extra single-source NULL RT-write marked LastRT in
146 * order to release the thread dependency without disabling
147 * SIMD32.
148 *
149 * The dual-source RT write messages may lead to hangs with SIMD16
150 * dispatch on ICL due to unknown reasons, see
151 * https://gitlab.freedesktop.org/mesa/mesa/-/issues/2183
152 */
153 if (devinfo->ver >= 20)
154 s.limit_dispatch_width(16, "Dual source blending unsupported "
155 "in SIMD32 mode.\n");
156 else
157 s.limit_dispatch_width(8, "Dual source blending unsupported "
158 "in SIMD16 and SIMD32 modes.\n");
159 }
160
161 brw_do_emit_fb_writes(s, key->nr_color_regions, replicate_alpha);
162 }
163
164
165 /** Emits the interpolation for the varying inputs. */
166 static void
167 brw_emit_interpolation_setup(fs_visitor &s)
168 {
169 const struct intel_device_info *devinfo = s.devinfo;
170 const fs_builder bld = fs_builder(&s).at_end();
171 fs_builder abld = bld.annotate("compute pixel centers");
172
173 s.pixel_x = bld.vgrf(BRW_TYPE_F);
174 s.pixel_y = bld.vgrf(BRW_TYPE_F);
175
176 const struct brw_wm_prog_key *wm_key = (brw_wm_prog_key*) s.key;
177 struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(s.prog_data);
178 fs_thread_payload &payload = s.fs_payload();
179
180 brw_reg int_sample_offset_x, int_sample_offset_y; /* Used on Gen12HP+ */
181 brw_reg int_sample_offset_xy; /* Used on Gen8+ */
182 brw_reg half_int_sample_offset_x, half_int_sample_offset_y;
183 if (wm_prog_data->coarse_pixel_dispatch != BRW_ALWAYS) {
184 /* The thread payload only delivers subspan locations (ss0, ss1,
185 * ss2, ...). Since subspans cover 2x2 pixel blocks, we need to
186 * generate 4 pixel coordinates out of each subspan location. We do this
187 * by replicating a subspan coordinate 4 times and adding an offset of 1
188 * in each direction from the initial top left (tl) location to generate
189 * top right (tr = +1 in x), bottom left (bl = +1 in y) and bottom right
190 * (br = +1 in x, +1 in y).
191 *
192 * The locations we build look like this in SIMD8 :
193 *
194 * ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
195 *
196 * The value 0x11001010 is a vector of 8 half bytes (nibbles). It adds the
197 * following to generate the 4 pixel coordinates out of subspan 0:
198 *
199 * 0x
200 * 1 : ss0.y + 1 -> ss0.br.y
201 * 1 : ss0.y + 1 -> ss0.bl.y
202 * 0 : ss0.y + 0 -> ss0.tr.y
203 * 0 : ss0.y + 0 -> ss0.tl.y
204 * 1 : ss0.x + 1 -> ss0.br.x
205 * 0 : ss0.x + 0 -> ss0.bl.x
206 * 1 : ss0.x + 1 -> ss0.tr.x
207 * 0 : ss0.x + 0 -> ss0.tl.x
208 *
209 * By doing a SIMD16 add in a SIMD8 shader, we can generate the 8 pixel
210 * coordinates out of 2 subspan coordinates in a single ADD instruction
211 * (twice the operation above).
212 */
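/* Illustrative example: if subspan 0's top-left pixel sits at
 * (x, y) = (4, 10), adding the 0x11001010 nibble vector to the replicated
 * subspan coordinates yields tl = (4, 10), tr = (5, 10), bl = (4, 11) and
 * br = (5, 11).
 */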
213 int_sample_offset_xy = brw_reg(brw_imm_v(0x11001010));
214 half_int_sample_offset_x = brw_reg(brw_imm_uw(0));
215 half_int_sample_offset_y = brw_reg(brw_imm_uw(0));
216 /* On Gfx12.5, because of regioning restrictions, the interpolation code
217 * is slightly different and works off separate X & Y inputs. The ordering
218 * of the half bytes here is a bit odd, with each subspan replicated
219 * twice and every other element discarded :
220 *
221 * ss0.tl ss0.tl ss0.tr ss0.tr ss0.bl ss0.bl ss0.br ss0.br
222 * X offset: 0 0 1 0 0 0 1 0
223 * Y offset: 0 0 0 0 1 0 1 0
224 */
225 int_sample_offset_x = brw_reg(brw_imm_v(0x01000100));
226 int_sample_offset_y = brw_reg(brw_imm_v(0x01010000));
227 }
228
229 brw_reg int_coarse_offset_x, int_coarse_offset_y; /* Used on Gen12HP+ */
230 brw_reg int_coarse_offset_xy; /* Used on Gen8+ */
231 brw_reg half_int_coarse_offset_x, half_int_coarse_offset_y;
232 if (wm_prog_data->coarse_pixel_dispatch != BRW_NEVER) {
233 /* In coarse pixel dispatch we have to do the same ADD instruction that
234 * we do in normal per pixel dispatch, except this time we're not adding
235 * 1 in each direction, but instead the coarse pixel size.
236 *
237 * The coarse pixel size is delivered as 2 u8 in r1.0
238 */
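/* Note (interpretation): the masks used below are :V immediates, i.e.
 * vectors of signed 4-bit values replicated across the execution size. A
 * 0xf nibble sign-extends to 0xffff for the UW AND, so the AND copies the
 * coarse pixel size into exactly the lanes that received a "+1" offset in
 * the per-pixel case above, and zeroes the others.
 */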
239 struct brw_reg r1_0 = retype(brw_vec1_reg(FIXED_GRF, 1, 0), BRW_TYPE_UB);
240
241 const fs_builder dbld =
242 abld.exec_all().group(MIN2(16, s.dispatch_width) * 2, 0);
243
244 if (devinfo->verx10 >= 125) {
245 /* To build the array of half bytes we do an AND operation with the
246 * right mask in X.
247 */
248 int_coarse_offset_x = dbld.vgrf(BRW_TYPE_UW);
249 dbld.AND(int_coarse_offset_x, byte_offset(r1_0, 0), brw_imm_v(0x0f000f00));
250
251 /* And the right mask in Y. */
252 int_coarse_offset_y = dbld.vgrf(BRW_TYPE_UW);
253 dbld.AND(int_coarse_offset_y, byte_offset(r1_0, 1), brw_imm_v(0x0f0f0000));
254 } else {
255 /* To build the array of half bytes we do an AND operation with the
256 * right mask in X.
257 */
258 int_coarse_offset_x = dbld.vgrf(BRW_TYPE_UW);
259 dbld.AND(int_coarse_offset_x, byte_offset(r1_0, 0), brw_imm_v(0x0000f0f0));
260
261 /* And the right mask in Y. */
262 int_coarse_offset_y = dbld.vgrf(BRW_TYPE_UW);
263 dbld.AND(int_coarse_offset_y, byte_offset(r1_0, 1), brw_imm_v(0xff000000));
264
265 /* Finally OR the 2 registers. */
266 int_coarse_offset_xy = dbld.vgrf(BRW_TYPE_UW);
267 dbld.OR(int_coarse_offset_xy, int_coarse_offset_x, int_coarse_offset_y);
268 }
269
270 /* Also compute the half coarse pixel size used to center the coarse pixels. */
271 half_int_coarse_offset_x = bld.vgrf(BRW_TYPE_UW);
272 half_int_coarse_offset_y = bld.vgrf(BRW_TYPE_UW);
273
274 bld.SHR(half_int_coarse_offset_x, suboffset(r1_0, 0), brw_imm_ud(1));
275 bld.SHR(half_int_coarse_offset_y, suboffset(r1_0, 1), brw_imm_ud(1));
276 }
277
278 brw_reg int_pixel_offset_x, int_pixel_offset_y; /* Used on Gen12HP+ */
279 brw_reg int_pixel_offset_xy; /* Used on Gen8+ */
280 brw_reg half_int_pixel_offset_x, half_int_pixel_offset_y;
281 switch (wm_prog_data->coarse_pixel_dispatch) {
282 case BRW_NEVER:
283 int_pixel_offset_x = int_sample_offset_x;
284 int_pixel_offset_y = int_sample_offset_y;
285 int_pixel_offset_xy = int_sample_offset_xy;
286 half_int_pixel_offset_x = half_int_sample_offset_x;
287 half_int_pixel_offset_y = half_int_sample_offset_y;
288 break;
289
290 case BRW_SOMETIMES: {
291 const fs_builder dbld =
292 abld.exec_all().group(MIN2(16, s.dispatch_width) * 2, 0);
293
294 check_dynamic_msaa_flag(dbld, wm_prog_data,
295 INTEL_MSAA_FLAG_COARSE_RT_WRITES);
296
297 int_pixel_offset_x = dbld.vgrf(BRW_TYPE_UW);
298 set_predicate(BRW_PREDICATE_NORMAL,
299 dbld.SEL(int_pixel_offset_x,
300 int_coarse_offset_x,
301 int_sample_offset_x));
302
303 int_pixel_offset_y = dbld.vgrf(BRW_TYPE_UW);
304 set_predicate(BRW_PREDICATE_NORMAL,
305 dbld.SEL(int_pixel_offset_y,
306 int_coarse_offset_y,
307 int_sample_offset_y));
308
309 int_pixel_offset_xy = dbld.vgrf(BRW_TYPE_UW);
310 set_predicate(BRW_PREDICATE_NORMAL,
311 dbld.SEL(int_pixel_offset_xy,
312 int_coarse_offset_xy,
313 int_sample_offset_xy));
314
315 half_int_pixel_offset_x = bld.vgrf(BRW_TYPE_UW);
316 set_predicate(BRW_PREDICATE_NORMAL,
317 bld.SEL(half_int_pixel_offset_x,
318 half_int_coarse_offset_x,
319 half_int_sample_offset_x));
320
321 half_int_pixel_offset_y = bld.vgrf(BRW_TYPE_UW);
322 set_predicate(BRW_PREDICATE_NORMAL,
323 bld.SEL(half_int_pixel_offset_y,
324 half_int_coarse_offset_y,
325 half_int_sample_offset_y));
326 break;
327 }
328
329 case BRW_ALWAYS:
330 int_pixel_offset_x = int_coarse_offset_x;
331 int_pixel_offset_y = int_coarse_offset_y;
332 int_pixel_offset_xy = int_coarse_offset_xy;
333 half_int_pixel_offset_x = half_int_coarse_offset_x;
334 half_int_pixel_offset_y = half_int_coarse_offset_y;
335 break;
336 }
337
338 for (unsigned i = 0; i < DIV_ROUND_UP(s.dispatch_width, 16); i++) {
339 const fs_builder hbld = abld.group(MIN2(16, s.dispatch_width), i);
340 /* According to the "PS Thread Payload for Normal Dispatch"
341 * pages on the BSpec, subspan X/Y coordinates are stored in
342 * R1.2-R1.5/R2.2-R2.5 on gfx6+, and on R0.10-R0.13/R1.10-R1.13
343 * on gfx20+. gi_reg is the 32B section of the GRF that
344 * contains the subspan coordinates.
345 */
346 const struct brw_reg gi_reg = devinfo->ver >= 20 ? xe2_vec1_grf(i, 8) :
347 brw_vec1_grf(i + 1, 0);
348 const struct brw_reg gi_uw = retype(gi_reg, BRW_TYPE_UW);
349
350 if (devinfo->verx10 >= 125) {
351 const fs_builder dbld =
352 abld.exec_all().group(hbld.dispatch_width() * 2, 0);
353 const brw_reg int_pixel_x = dbld.vgrf(BRW_TYPE_UW);
354 const brw_reg int_pixel_y = dbld.vgrf(BRW_TYPE_UW);
355
356 dbld.ADD(int_pixel_x,
357 brw_reg(stride(suboffset(gi_uw, 4), 2, 8, 0)),
358 int_pixel_offset_x);
359 dbld.ADD(int_pixel_y,
360 brw_reg(stride(suboffset(gi_uw, 5), 2, 8, 0)),
361 int_pixel_offset_y);
362
363 if (wm_prog_data->coarse_pixel_dispatch != BRW_NEVER) {
364 fs_inst *addx = dbld.ADD(int_pixel_x, int_pixel_x,
365 horiz_stride(half_int_pixel_offset_x, 0));
366 fs_inst *addy = dbld.ADD(int_pixel_y, int_pixel_y,
367 horiz_stride(half_int_pixel_offset_y, 0));
368 if (wm_prog_data->coarse_pixel_dispatch != BRW_ALWAYS) {
369 addx->predicate = BRW_PREDICATE_NORMAL;
370 addy->predicate = BRW_PREDICATE_NORMAL;
371 }
372 }
373
374 hbld.MOV(offset(s.pixel_x, hbld, i), horiz_stride(int_pixel_x, 2));
375 hbld.MOV(offset(s.pixel_y, hbld, i), horiz_stride(int_pixel_y, 2));
376
377 } else {
378 /* The "Register Region Restrictions" page says for BDW (and newer,
379 * presumably):
380 *
381 * "When destination spans two registers, the source may be one or
382 * two registers. The destination elements must be evenly split
383 * between the two registers."
384 *
385 * Thus we can do a single add(16) in SIMD8 or an add(32) in SIMD16
386 * to compute our pixel centers.
387 */
388 const fs_builder dbld =
389 abld.exec_all().group(hbld.dispatch_width() * 2, 0);
390 brw_reg int_pixel_xy = dbld.vgrf(BRW_TYPE_UW);
391
392 dbld.ADD(int_pixel_xy,
393 brw_reg(stride(suboffset(gi_uw, 4), 1, 4, 0)),
394 int_pixel_offset_xy);
395
396 hbld.emit(FS_OPCODE_PIXEL_X, offset(s.pixel_x, hbld, i), int_pixel_xy,
397 horiz_stride(half_int_pixel_offset_x, 0));
398 hbld.emit(FS_OPCODE_PIXEL_Y, offset(s.pixel_y, hbld, i), int_pixel_xy,
399 horiz_stride(half_int_pixel_offset_y, 0));
400 }
401 }
402
403 abld = bld.annotate("compute pos.z");
404 brw_reg coarse_z;
405 if (wm_prog_data->coarse_pixel_dispatch != BRW_NEVER &&
406 wm_prog_data->uses_depth_w_coefficients) {
407 /* In coarse pixel mode, the HW doesn't interpolate the Z coordinate
408 * properly. Just as we have to add the coarse pixel size to the pixel
409 * locations, here we recompute the Z value with 2 coefficients on the
410 * X & Y axes.
411 */
412 brw_reg coef_payload = brw_vec8_grf(payload.depth_w_coef_reg, 0);
413 const brw_reg x_start = brw_vec1_grf(coef_payload.nr, 2);
414 const brw_reg y_start = brw_vec1_grf(coef_payload.nr, 6);
415 const brw_reg z_cx = brw_vec1_grf(coef_payload.nr, 1);
416 const brw_reg z_cy = brw_vec1_grf(coef_payload.nr, 0);
417 const brw_reg z_c0 = brw_vec1_grf(coef_payload.nr, 3);
418
419 const brw_reg float_pixel_x = abld.vgrf(BRW_TYPE_F);
420 const brw_reg float_pixel_y = abld.vgrf(BRW_TYPE_F);
421
422 abld.ADD(float_pixel_x, s.pixel_x, negate(x_start));
423 abld.ADD(float_pixel_y, s.pixel_y, negate(y_start));
424
425 /* r1.0 - 0:7 ActualCoarsePixelShadingSize.X */
426 const brw_reg u8_cps_width = brw_reg(retype(brw_vec1_grf(1, 0), BRW_TYPE_UB));
427 /* r1.0 - 15:8 ActualCoarsePixelShadingSize.Y */
428 const brw_reg u8_cps_height = byte_offset(u8_cps_width, 1);
429 const brw_reg u32_cps_width = abld.vgrf(BRW_TYPE_UD);
430 const brw_reg u32_cps_height = abld.vgrf(BRW_TYPE_UD);
431 abld.MOV(u32_cps_width, u8_cps_width);
432 abld.MOV(u32_cps_height, u8_cps_height);
433
434 const brw_reg f_cps_width = abld.vgrf(BRW_TYPE_F);
435 const brw_reg f_cps_height = abld.vgrf(BRW_TYPE_F);
436 abld.MOV(f_cps_width, u32_cps_width);
437 abld.MOV(f_cps_height, u32_cps_height);
438
439 /* Center in the middle of the coarse pixel. */
440 abld.MAD(float_pixel_x, float_pixel_x, brw_imm_f(0.5f), f_cps_width);
441 abld.MAD(float_pixel_y, float_pixel_y, brw_imm_f(0.5f), f_cps_height);
442
443 coarse_z = abld.vgrf(BRW_TYPE_F);
444 abld.MAD(coarse_z, z_c0, z_cx, float_pixel_x);
445 abld.MAD(coarse_z, coarse_z, z_cy, float_pixel_y);
446 }
447
448 if (wm_prog_data->uses_src_depth)
449 s.pixel_z = fetch_payload_reg(bld, payload.source_depth_reg);
450
451 if (wm_prog_data->uses_depth_w_coefficients ||
452 wm_prog_data->uses_src_depth) {
453 brw_reg sample_z = s.pixel_z;
454
455 switch (wm_prog_data->coarse_pixel_dispatch) {
456 case BRW_NEVER:
457 break;
458
459 case BRW_SOMETIMES:
460 assert(wm_prog_data->uses_src_depth);
461 assert(wm_prog_data->uses_depth_w_coefficients);
462 s.pixel_z = abld.vgrf(BRW_TYPE_F);
463
464 /* We re-use the check_dynamic_msaa_flag() call from above */
465 set_predicate(BRW_PREDICATE_NORMAL,
466 abld.SEL(s.pixel_z, coarse_z, sample_z));
467 break;
468
469 case BRW_ALWAYS:
470 assert(!wm_prog_data->uses_src_depth);
471 assert(wm_prog_data->uses_depth_w_coefficients);
472 s.pixel_z = coarse_z;
473 break;
474 }
475 }
476
477 if (wm_prog_data->uses_src_w) {
478 abld = bld.annotate("compute pos.w");
479 s.pixel_w = fetch_payload_reg(abld, payload.source_w_reg);
480 s.wpos_w = bld.vgrf(BRW_TYPE_F);
481 abld.emit(SHADER_OPCODE_RCP, s.wpos_w, s.pixel_w);
482 }
483
484 if (wm_key->persample_interp == BRW_SOMETIMES) {
485 assert(!devinfo->needs_unlit_centroid_workaround);
486
487 const fs_builder ubld = bld.exec_all().group(16, 0);
488 bool loaded_flag = false;
489
490 for (int i = 0; i < BRW_BARYCENTRIC_MODE_COUNT; ++i) {
491 if (!(wm_prog_data->barycentric_interp_modes & BITFIELD_BIT(i)))
492 continue;
493
494 /* The sample mode will always be the top bit set in the perspective
495 * or non-perspective section. In the case where no SAMPLE mode was
496 * requested, wm_prog_data_barycentric_modes() will swap out the top
497 * mode for SAMPLE so this works regardless of whether SAMPLE was
498 * requested or not.
499 */
500 int sample_mode;
501 if (BITFIELD_BIT(i) & BRW_BARYCENTRIC_NONPERSPECTIVE_BITS) {
502 sample_mode = util_last_bit(wm_prog_data->barycentric_interp_modes &
503 BRW_BARYCENTRIC_NONPERSPECTIVE_BITS) - 1;
504 } else {
505 sample_mode = util_last_bit(wm_prog_data->barycentric_interp_modes &
506 BRW_BARYCENTRIC_PERSPECTIVE_BITS) - 1;
507 }
508 assert(wm_prog_data->barycentric_interp_modes &
509 BITFIELD_BIT(sample_mode));
510
511 if (i == sample_mode)
512 continue;
513
514 uint8_t *barys = payload.barycentric_coord_reg[i];
515
516 uint8_t *sample_barys = payload.barycentric_coord_reg[sample_mode];
517 assert(barys[0] && sample_barys[0]);
518
519 if (!loaded_flag) {
520 check_dynamic_msaa_flag(ubld, wm_prog_data,
521 INTEL_MSAA_FLAG_PERSAMPLE_INTERP);
/* Only emit the dynamic MSAA flag check once. */
loaded_flag = true;
522 }
523
524 for (unsigned j = 0; j < s.dispatch_width / 8; j++) {
525 set_predicate(
526 BRW_PREDICATE_NORMAL,
527 ubld.MOV(brw_vec8_grf(barys[j / 2] + (j % 2) * 2, 0),
528 brw_vec8_grf(sample_barys[j / 2] + (j % 2) * 2, 0)));
529 }
530 }
531 }
532
533 for (int i = 0; i < BRW_BARYCENTRIC_MODE_COUNT; ++i) {
534 s.delta_xy[i] = fetch_barycentric_reg(
535 bld, payload.barycentric_coord_reg[i]);
536 }
537
538 uint32_t centroid_modes = wm_prog_data->barycentric_interp_modes &
539 (1 << BRW_BARYCENTRIC_PERSPECTIVE_CENTROID |
540 1 << BRW_BARYCENTRIC_NONPERSPECTIVE_CENTROID);
541
542 if (devinfo->needs_unlit_centroid_workaround && centroid_modes) {
543 /* Get the pixel/sample mask into f0 so that we know which
544 * pixels are lit. Then, for each channel that is unlit,
545 * replace the centroid data with non-centroid data.
546 */
547 for (unsigned i = 0; i < DIV_ROUND_UP(s.dispatch_width, 16); i++) {
548 bld.exec_all().group(1, 0)
549 .MOV(retype(brw_flag_reg(0, i), BRW_TYPE_UW),
550 retype(brw_vec1_grf(1 + i, 7), BRW_TYPE_UW));
551 }
552
553 for (int i = 0; i < BRW_BARYCENTRIC_MODE_COUNT; ++i) {
554 if (!(centroid_modes & (1 << i)))
555 continue;
556
557 const brw_reg centroid_delta_xy = s.delta_xy[i];
558 const brw_reg &pixel_delta_xy = s.delta_xy[i - 1];
559
560 s.delta_xy[i] = bld.vgrf(BRW_TYPE_F, 2);
561
562 for (unsigned c = 0; c < 2; c++) {
563 for (unsigned q = 0; q < s.dispatch_width / 8; q++) {
564 set_predicate(BRW_PREDICATE_NORMAL,
565 bld.quarter(q).SEL(
566 quarter(offset(s.delta_xy[i], bld, c), q),
567 quarter(offset(centroid_delta_xy, bld, c), q),
568 quarter(offset(pixel_delta_xy, bld, c), q)));
569 }
570 }
571 }
572 }
573 }
574
575
576 /**
577 * Emit a replicated-data clear shader that writes the clear color to every
578 * enabled render target using SIMD16 replicated-data FB write messages.
579 */
580 static void
581 brw_emit_repclear_shader(fs_visitor &s)
582 {
583 brw_wm_prog_key *key = (brw_wm_prog_key*) s.key;
584 fs_inst *write = NULL;
585
586 assert(s.devinfo->ver < 20);
587 assert(s.uniforms == 0);
588 assume(key->nr_color_regions > 0);
589
590 brw_reg color_output = retype(brw_vec4_grf(127, 0), BRW_TYPE_UD);
591 brw_reg header = retype(brw_vec8_grf(125, 0), BRW_TYPE_UD);
592
593 /* We pass the clear color as a flat input. Copy it to the output. */
594 brw_reg color_input =
595 brw_make_reg(FIXED_GRF, 2, 3, 0, 0, BRW_TYPE_UD,
596 BRW_VERTICAL_STRIDE_8, BRW_WIDTH_2, BRW_HORIZONTAL_STRIDE_4,
597 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
598
599 const fs_builder bld = fs_builder(&s).at_end();
600 bld.exec_all().group(4, 0).MOV(color_output, color_input);
601
602 if (key->nr_color_regions > 1) {
603 /* Copy g0..g1 as the message header */
604 bld.exec_all().group(16, 0)
605 .MOV(header, retype(brw_vec8_grf(0, 0), BRW_TYPE_UD));
606 }
607
608 for (int i = 0; i < key->nr_color_regions; ++i) {
609 if (i > 0)
610 bld.exec_all().group(1, 0).MOV(component(header, 2), brw_imm_ud(i));
611
612 write = bld.emit(SHADER_OPCODE_SEND);
613 write->resize_sources(3);
614 write->sfid = GFX6_SFID_DATAPORT_RENDER_CACHE;
615 write->src[0] = brw_imm_ud(0);
616 write->src[1] = brw_imm_ud(0);
617 write->src[2] = i == 0 ? color_output : header;
618 write->check_tdr = true;
619 write->send_has_side_effects = true;
620 write->desc = brw_fb_write_desc(s.devinfo, i,
621 BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE_REPLICATED,
622 i == key->nr_color_regions - 1, false);
623
624 /* We can use a headerless message for the first render target */
625 write->header_size = i == 0 ? 0 : 2;
626 write->mlen = 1 + write->header_size;
627 }
628 write->eot = true;
629 write->last_rt = true;
630
631 brw_calculate_cfg(s);
632
633 s.first_non_payload_grf = s.payload().num_regs;
634
635 brw_fs_lower_scoreboard(s);
636 }
637
638 /**
639 * Turn one of the two CENTROID barycentric modes into PIXEL mode.
640 */
641 static enum brw_barycentric_mode
642 centroid_to_pixel(enum brw_barycentric_mode bary)
643 {
644 assert(bary == BRW_BARYCENTRIC_PERSPECTIVE_CENTROID ||
645 bary == BRW_BARYCENTRIC_NONPERSPECTIVE_CENTROID);
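/* This relies on each CENTROID mode being defined immediately after its
 * corresponding PIXEL mode in enum brw_barycentric_mode.
 */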
646 return (enum brw_barycentric_mode) ((unsigned) bary - 1);
647 }
648
649 static void
650 calculate_urb_setup(const struct intel_device_info *devinfo,
651 const struct brw_wm_prog_key *key,
652 struct brw_wm_prog_data *prog_data,
653 const nir_shader *nir,
654 const struct brw_mue_map *mue_map)
655 {
656 memset(prog_data->urb_setup, -1, sizeof(prog_data->urb_setup));
657 memset(prog_data->urb_setup_channel, 0, sizeof(prog_data->urb_setup_channel));
658
659 int urb_next = 0; /* in vec4s */
660
661 const uint64_t inputs_read =
662 nir->info.inputs_read & ~nir->info.per_primitive_inputs;
663
664 /* Figure out where each of the incoming setup attributes lands. */
665 if (key->mesh_input != BRW_NEVER) {
666 /* Per-Primitive Attributes are laid out by Hardware before the regular
667 * attributes, so order them like this to make it easier later to map the
668 * setup into real HW registers.
669 */
670 if (nir->info.per_primitive_inputs) {
671 uint64_t per_prim_inputs_read =
672 nir->info.inputs_read & nir->info.per_primitive_inputs;
673
674 /* In Mesh, PRIMITIVE_SHADING_RATE, VIEWPORT and LAYER slots
675 * are always at the beginning, because they come from MUE
676 * Primitive Header, not Per-Primitive Attributes.
677 */
678 const uint64_t primitive_header_bits = VARYING_BIT_VIEWPORT |
679 VARYING_BIT_LAYER |
680 VARYING_BIT_PRIMITIVE_SHADING_RATE;
681
682 if (mue_map) {
683 unsigned per_prim_start_dw = mue_map->per_primitive_start_dw;
684 unsigned per_prim_size_dw = mue_map->per_primitive_pitch_dw;
685
686 bool reads_header = (per_prim_inputs_read & primitive_header_bits) != 0;
687
688 if (reads_header || mue_map->user_data_in_primitive_header) {
689 /* Primitive Shading Rate, Layer and Viewport live in the same
690 * 4-dwords slot (psr is dword 0, layer is dword 1, and viewport
691 * is dword 2).
692 */
693 if (per_prim_inputs_read & VARYING_BIT_PRIMITIVE_SHADING_RATE)
694 prog_data->urb_setup[VARYING_SLOT_PRIMITIVE_SHADING_RATE] = 0;
695
696 if (per_prim_inputs_read & VARYING_BIT_LAYER)
697 prog_data->urb_setup[VARYING_SLOT_LAYER] = 0;
698
699 if (per_prim_inputs_read & VARYING_BIT_VIEWPORT)
700 prog_data->urb_setup[VARYING_SLOT_VIEWPORT] = 0;
701
702 per_prim_inputs_read &= ~primitive_header_bits;
703 } else {
704 /* If the FS doesn't need the primitive header, then it won't be made
705 * available through SBE_MESH, so we have to skip it when calculating
706 * the offset from the start of the per-primitive data.
707 */
708 per_prim_start_dw += mue_map->per_primitive_header_size_dw;
709 per_prim_size_dw -= mue_map->per_primitive_header_size_dw;
710 }
711
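/* Illustrative example: an attribute whose MUE location starts 20 dwords
 * past per_prim_start_dw gets pos_dw = 20 below, i.e. vec4 slot
 * urb_next + 5 and channel 0.
 */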
712 u_foreach_bit64(i, per_prim_inputs_read) {
713 int start = mue_map->start_dw[i];
714
715 assert(start >= 0);
716 assert(mue_map->len_dw[i] > 0);
717
718 assert(unsigned(start) >= per_prim_start_dw);
719 unsigned pos_dw = unsigned(start) - per_prim_start_dw;
720
721 prog_data->urb_setup[i] = urb_next + pos_dw / 4;
722 prog_data->urb_setup_channel[i] = pos_dw % 4;
723 }
724
725 urb_next = per_prim_size_dw / 4;
726 } else {
727 /* With no MUE map, we never read the primitive header, and
728 * per-primitive attributes won't be packed either, so just lay them
729 * out in varying order.
730 */
731 per_prim_inputs_read &= ~primitive_header_bits;
732
733 for (unsigned i = 0; i < VARYING_SLOT_MAX; i++) {
734 if (per_prim_inputs_read & BITFIELD64_BIT(i)) {
735 prog_data->urb_setup[i] = urb_next++;
736 }
737 }
738
739 /* The actual setup attributes later must be aligned to a full GRF. */
740 urb_next = ALIGN(urb_next, 2);
741 }
742
743 prog_data->num_per_primitive_inputs = urb_next;
744 }
745
746 const uint64_t clip_dist_bits = VARYING_BIT_CLIP_DIST0 |
747 VARYING_BIT_CLIP_DIST1;
748
749 uint64_t unique_fs_attrs = inputs_read & BRW_FS_VARYING_INPUT_MASK;
750
751 if (inputs_read & clip_dist_bits) {
752 assert(!mue_map || mue_map->per_vertex_header_size_dw > 8);
753 unique_fs_attrs &= ~clip_dist_bits;
754 }
755
756 if (mue_map) {
757 unsigned per_vertex_start_dw = mue_map->per_vertex_start_dw;
758 unsigned per_vertex_size_dw = mue_map->per_vertex_pitch_dw;
759
760 /* The Per-Vertex header is available to the fragment shader only if
761 * there's user data in it.
762 */
763 if (!mue_map->user_data_in_vertex_header) {
764 per_vertex_start_dw += 8;
765 per_vertex_size_dw -= 8;
766 }
767
768 /* In Mesh, CLIP_DIST slots are always at the beginning, because
769 * they come from MUE Vertex Header, not Per-Vertex Attributes.
770 */
771 if (inputs_read & clip_dist_bits) {
772 prog_data->urb_setup[VARYING_SLOT_CLIP_DIST0] = urb_next;
773 prog_data->urb_setup[VARYING_SLOT_CLIP_DIST1] = urb_next + 1;
774 } else if (mue_map && mue_map->per_vertex_header_size_dw > 8) {
775 /* Clip distances are in MUE, but we are not reading them in FS. */
776 per_vertex_start_dw += 8;
777 per_vertex_size_dw -= 8;
778 }
779
780 /* Per-Vertex attributes are laid out in order. Because we always link
781 * Mesh and Fragment shaders, the slots written and read by each of
782 * them will match. */
783 u_foreach_bit64(i, unique_fs_attrs) {
784 int start = mue_map->start_dw[i];
785
786 assert(start >= 0);
787 assert(mue_map->len_dw[i] > 0);
788
789 assert(unsigned(start) >= per_vertex_start_dw);
790 unsigned pos_dw = unsigned(start) - per_vertex_start_dw;
791
792 prog_data->urb_setup[i] = urb_next + pos_dw / 4;
793 prog_data->urb_setup_channel[i] = pos_dw % 4;
794 }
795
796 urb_next += per_vertex_size_dw / 4;
797 } else {
798 /* If we don't have an MUE map, just lay down the inputs the FS reads
799 * in varying order, as we do for the legacy pipeline.
800 */
801 if (inputs_read & clip_dist_bits) {
802 prog_data->urb_setup[VARYING_SLOT_CLIP_DIST0] = urb_next++;
803 prog_data->urb_setup[VARYING_SLOT_CLIP_DIST1] = urb_next++;
804 }
805
806 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
807 if (unique_fs_attrs & BITFIELD64_BIT(i))
808 prog_data->urb_setup[i] = urb_next++;
809 }
810 }
811 } else {
812 assert(!nir->info.per_primitive_inputs);
813
814 uint64_t vue_header_bits =
815 VARYING_BIT_PSIZ | VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT;
816
817 uint64_t unique_fs_attrs = inputs_read & BRW_FS_VARYING_INPUT_MASK;
818
819 /* VUE header fields all live in the same URB slot, so we pass them
820 * as a single FS input attribute. We want to only count them once.
821 */
822 if (inputs_read & vue_header_bits) {
823 unique_fs_attrs &= ~vue_header_bits;
824 unique_fs_attrs |= VARYING_BIT_PSIZ;
825 }
826
827 if (util_bitcount64(unique_fs_attrs) <= 16) {
828 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
829 * first 16 varying inputs, so we can put them wherever we want.
830 * Just put them in order.
831 *
832 * This is useful because it means that (a) inputs not used by the
833 * fragment shader won't take up valuable register space, and (b) we
834 * won't have to recompile the fragment shader if it gets paired with
835 * a different vertex (or geometry) shader.
836 *
837 * VUE header fields share the same FS input attribute.
838 */
839 if (inputs_read & vue_header_bits) {
840 if (inputs_read & VARYING_BIT_PSIZ)
841 prog_data->urb_setup[VARYING_SLOT_PSIZ] = urb_next;
842 if (inputs_read & VARYING_BIT_LAYER)
843 prog_data->urb_setup[VARYING_SLOT_LAYER] = urb_next;
844 if (inputs_read & VARYING_BIT_VIEWPORT)
845 prog_data->urb_setup[VARYING_SLOT_VIEWPORT] = urb_next;
846
847 urb_next++;
848 }
849
850 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
851 if (inputs_read & BRW_FS_VARYING_INPUT_MASK & ~vue_header_bits &
852 BITFIELD64_BIT(i)) {
853 prog_data->urb_setup[i] = urb_next++;
854 }
855 }
856 } else {
857 /* We have enough input varyings that the SF/SBE pipeline stage can't
858 * arbitrarily rearrange them to suit our whim; we have to put them
859 * in an order that matches the output of the previous pipeline stage
860 * (geometry or vertex shader).
861 */
862
863 /* Re-compute the VUE map here in the case that the one coming from
864 * geometry has more than one position slot (used for Primitive
865 * Replication).
866 */
867 struct intel_vue_map prev_stage_vue_map;
868 brw_compute_vue_map(devinfo, &prev_stage_vue_map,
869 key->input_slots_valid,
870 nir->info.separate_shader, 1);
871
872 int first_slot =
873 brw_compute_first_urb_slot_required(inputs_read,
874 &prev_stage_vue_map);
875
876 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
877 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
878 slot++) {
879 int varying = prev_stage_vue_map.slot_to_varying[slot];
880 if (varying != BRW_VARYING_SLOT_PAD &&
881 (inputs_read & BRW_FS_VARYING_INPUT_MASK &
882 BITFIELD64_BIT(varying))) {
883 prog_data->urb_setup[varying] = slot - first_slot;
884 }
885 }
886 urb_next = prev_stage_vue_map.num_slots - first_slot;
887 }
888 }
889
890 prog_data->num_varying_inputs = urb_next - prog_data->num_per_primitive_inputs;
891 prog_data->inputs = inputs_read;
892
893 brw_compute_urb_setup_index(prog_data);
894 }
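
/**
 * Return true if the given def has any use other than as an input to a
 * load_frag_coord intrinsic (including uses as an if-condition).
 */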
895 static bool
896 is_used_in_not_interp_frag_coord(nir_def *def)
897 {
898 nir_foreach_use_including_if(src, def) {
899 if (nir_src_is_if(src))
900 return true;
901
902 if (nir_src_parent_instr(src)->type != nir_instr_type_intrinsic)
903 return true;
904
905 nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(nir_src_parent_instr(src));
906 if (intrin->intrinsic != nir_intrinsic_load_frag_coord)
907 return true;
908 }
909
910 return false;
911 }
912
913 /**
914 * Return a bitfield where bit n is set if barycentric interpolation mode n
915 * (see enum brw_barycentric_mode) is needed by the fragment shader.
916 *
917 * We examine the load_barycentric intrinsics rather than looking at input
918 * variables so that we catch interpolateAtCentroid() messages too, which
919 * also need the BRW_BARYCENTRIC_[NON]PERSPECTIVE_CENTROID mode set up.
920 */
921 static unsigned
922 brw_compute_barycentric_interp_modes(const struct intel_device_info *devinfo,
923 const struct brw_wm_prog_key *key,
924 const nir_shader *shader)
925 {
926 unsigned barycentric_interp_modes = 0;
927
928 nir_foreach_function_impl(impl, shader) {
929 nir_foreach_block(block, impl) {
930 nir_foreach_instr(instr, block) {
931 if (instr->type != nir_instr_type_intrinsic)
932 continue;
933
934 nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
935 switch (intrin->intrinsic) {
936 case nir_intrinsic_load_barycentric_pixel:
937 case nir_intrinsic_load_barycentric_centroid:
938 case nir_intrinsic_load_barycentric_sample:
939 case nir_intrinsic_load_barycentric_at_sample:
940 case nir_intrinsic_load_barycentric_at_offset:
941 break;
942 default:
943 continue;
944 }
945
946 /* Ignore WPOS; it doesn't require interpolation. */
947 if (!is_used_in_not_interp_frag_coord(&intrin->def))
948 continue;
949
950 nir_intrinsic_op bary_op = intrin->intrinsic;
951 enum brw_barycentric_mode bary =
952 brw_barycentric_mode(key, intrin);
953
954 barycentric_interp_modes |= 1 << bary;
955
956 if (devinfo->needs_unlit_centroid_workaround &&
957 bary_op == nir_intrinsic_load_barycentric_centroid)
958 barycentric_interp_modes |= 1 << centroid_to_pixel(bary);
959 }
960 }
961 }
962
963 return barycentric_interp_modes;
964 }
965
966 /**
967 * Return a bitfield where bit n is set if barycentric interpolation
968 * mode n (see enum brw_barycentric_mode) is needed by the fragment
969 * shader barycentric intrinsics that take an explicit offset or
970 * sample as argument.
971 */
972 static unsigned
973 brw_compute_offset_barycentric_interp_modes(const struct brw_wm_prog_key *key,
974 const nir_shader *shader)
975 {
976 unsigned barycentric_interp_modes = 0;
977
978 nir_foreach_function_impl(impl, shader) {
979 nir_foreach_block(block, impl) {
980 nir_foreach_instr(instr, block) {
981 if (instr->type != nir_instr_type_intrinsic)
982 continue;
983
984 nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
985 if (intrin->intrinsic == nir_intrinsic_load_barycentric_at_offset ||
986 intrin->intrinsic == nir_intrinsic_load_barycentric_at_sample)
987 barycentric_interp_modes |= 1 << brw_barycentric_mode(key, intrin);
988 }
989 }
990 }
991
992 return barycentric_interp_modes;
993 }
994
995 static void
996 brw_compute_flat_inputs(struct brw_wm_prog_data *prog_data,
997 const nir_shader *shader)
998 {
999 prog_data->flat_inputs = 0;
1000
1001 nir_foreach_shader_in_variable(var, shader) {
1002 /* flat shading */
1003 if (var->data.interpolation != INTERP_MODE_FLAT)
1004 continue;
1005
1006 if (var->data.per_primitive)
1007 continue;
1008
1009 unsigned slots = glsl_count_attribute_slots(var->type, false);
1010 for (unsigned s = 0; s < slots; s++) {
1011 int input_index = prog_data->urb_setup[var->data.location + s];
1012
1013 if (input_index >= 0)
1014 prog_data->flat_inputs |= 1 << input_index;
1015 }
1016 }
1017 }
1018
1019 static uint8_t
1020 computed_depth_mode(const nir_shader *shader)
1021 {
1022 if (shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
1023 switch (shader->info.fs.depth_layout) {
1024 case FRAG_DEPTH_LAYOUT_NONE:
1025 case FRAG_DEPTH_LAYOUT_ANY:
1026 return BRW_PSCDEPTH_ON;
1027 case FRAG_DEPTH_LAYOUT_GREATER:
1028 return BRW_PSCDEPTH_ON_GE;
1029 case FRAG_DEPTH_LAYOUT_LESS:
1030 return BRW_PSCDEPTH_ON_LE;
1031 case FRAG_DEPTH_LAYOUT_UNCHANGED:
1032 /* We initially set this to OFF, but having the shader write the
1033 * depth means we allocate register space in the SEND message. The
1034 * difference between the SEND register count and the OFF state
1035 * programming makes the HW hang.
1036 *
1037 * Removing the depth writes also leads to test failures. So use
1038 * LesserThanOrEqual, which fits writing the same value
1039 * (unchanged/equal).
1040 *
1041 */
1042 return BRW_PSCDEPTH_ON_LE;
1043 }
1044 }
1045 return BRW_PSCDEPTH_OFF;
1046 }
1047
1048 static void
1049 brw_nir_populate_wm_prog_data(nir_shader *shader,
1050 const struct intel_device_info *devinfo,
1051 const struct brw_wm_prog_key *key,
1052 struct brw_wm_prog_data *prog_data,
1053 const struct brw_mue_map *mue_map)
1054 {
1055 prog_data->uses_kill = shader->info.fs.uses_discard;
1056 prog_data->uses_omask = !key->ignore_sample_mask_out &&
1057 (shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK));
1058 prog_data->max_polygons = 1;
1059 prog_data->computed_depth_mode = computed_depth_mode(shader);
1060 prog_data->computed_stencil =
1061 shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL);
1062
1063 prog_data->sample_shading =
1064 shader->info.fs.uses_sample_shading ||
1065 shader->info.outputs_read;
1066
1067 assert(key->multisample_fbo != BRW_NEVER ||
1068 key->persample_interp == BRW_NEVER);
1069
1070 prog_data->persample_dispatch = key->persample_interp;
1071 if (prog_data->sample_shading)
1072 prog_data->persample_dispatch = BRW_ALWAYS;
1073
1074 /* We can only persample dispatch if we have a multisample FBO */
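/* Note: BRW_NEVER < BRW_SOMETIMES < BRW_ALWAYS, so taking MIN2() of the two
 * tristate values acts as a logical AND.
 */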
1075 prog_data->persample_dispatch = MIN2(prog_data->persample_dispatch,
1076 key->multisample_fbo);
1077
1078 /* Currently only the Vulkan API allows alpha_to_coverage to be dynamic. If
1079 * persample_dispatch & multisample_fbo are not dynamic, Anv should be able
1080 * to definitively tell whether alpha_to_coverage is on or off.
1081 */
1082 prog_data->alpha_to_coverage = key->alpha_to_coverage;
1083
1084 prog_data->uses_sample_mask =
1085 BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_SAMPLE_MASK_IN);
1086
1087 /* From the Ivy Bridge PRM documentation for 3DSTATE_PS:
1088 *
1089 * "MSDISPMODE_PERSAMPLE is required in order to select
1090 * POSOFFSET_SAMPLE"
1091 *
1092 * So we can only really get sample positions if we are doing real
1093 * per-sample dispatch. If we need gl_SamplePosition and we don't have
1094 * persample dispatch, we hard-code it to 0.5.
1095 */
1096 prog_data->uses_pos_offset =
1097 prog_data->persample_dispatch != BRW_NEVER &&
1098 (BITSET_TEST(shader->info.system_values_read,
1099 SYSTEM_VALUE_SAMPLE_POS) ||
1100 BITSET_TEST(shader->info.system_values_read,
1101 SYSTEM_VALUE_SAMPLE_POS_OR_CENTER));
1102
1103 prog_data->early_fragment_tests = shader->info.fs.early_fragment_tests;
1104 prog_data->post_depth_coverage = shader->info.fs.post_depth_coverage;
1105 prog_data->inner_coverage = shader->info.fs.inner_coverage;
1106
1107 prog_data->barycentric_interp_modes =
1108 brw_compute_barycentric_interp_modes(devinfo, key, shader);
1109
1110 /* From the BDW PRM documentation for 3DSTATE_WM:
1111 *
1112 * "MSDISPMODE_PERSAMPLE is required in order to select Perspective
1113 * Sample or Non-perspective Sample barycentric coordinates."
1114 *
1115 * So clean up any potentially set sample barycentric mode when not doing
1116 * per-sample dispatch.
1117 */
1118 if (prog_data->persample_dispatch == BRW_NEVER) {
1119 prog_data->barycentric_interp_modes &=
1120 ~BITFIELD_BIT(BRW_BARYCENTRIC_PERSPECTIVE_SAMPLE);
1121 }
1122
1123 if (devinfo->ver >= 20) {
1124 const unsigned offset_bary_modes =
1125 brw_compute_offset_barycentric_interp_modes(key, shader);
1126
1127 prog_data->uses_npc_bary_coefficients =
1128 offset_bary_modes & BRW_BARYCENTRIC_NONPERSPECTIVE_BITS;
1129 prog_data->uses_pc_bary_coefficients =
1130 offset_bary_modes & ~BRW_BARYCENTRIC_NONPERSPECTIVE_BITS;
1131 prog_data->uses_sample_offsets =
1132 offset_bary_modes & ((1 << BRW_BARYCENTRIC_PERSPECTIVE_SAMPLE) |
1133 (1 << BRW_BARYCENTRIC_NONPERSPECTIVE_SAMPLE));
1134 }
1135
1136 prog_data->uses_nonperspective_interp_modes =
1137 (prog_data->barycentric_interp_modes & BRW_BARYCENTRIC_NONPERSPECTIVE_BITS) ||
1138 prog_data->uses_npc_bary_coefficients;
1139
1140 /* The current VK_EXT_graphics_pipeline_library specification requires
1141 * coarse to be specified at compile time. But per-sample interpolation can
1142 * be dynamic. So we should never be in a situation where coarse &
1143 * persample_interp are respectively true & BRW_ALWAYS.
1144 *
1145 * Coarse will be dynamically turned off when persample_interp is active.
1146 */
1147 assert(!key->coarse_pixel || key->persample_interp != BRW_ALWAYS);
1148
1149 prog_data->coarse_pixel_dispatch =
1150 brw_sometimes_invert(prog_data->persample_dispatch);
1151 if (!key->coarse_pixel ||
1152 prog_data->uses_omask ||
1153 prog_data->sample_shading ||
1154 prog_data->uses_sample_mask ||
1155 (prog_data->computed_depth_mode != BRW_PSCDEPTH_OFF) ||
1156 prog_data->computed_stencil) {
1157 prog_data->coarse_pixel_dispatch = BRW_NEVER;
1158 }
1159
1160 /* ICL PRMs, Volume 9: Render Engine, Shared Functions Pixel Interpolater,
1161 * Message Descriptor :
1162 *
1163 * "Message Type. Specifies the type of message being sent when
1164 * pixel-rate evaluation is requested :
1165 *
1166 * Format = U2
1167 * 0: Per Message Offset (eval_snapped with immediate offset)
1168 * 1: Sample Position Offset (eval_sindex)
1169 * 2: Centroid Position Offset (eval_centroid)
1170 * 3: Per Slot Offset (eval_snapped with register offset)
1171 *
1172 * Message Type. Specifies the type of message being sent when
1173 * coarse-rate evaluation is requested :
1174 *
1175 * Format = U2
1176 * 0: Coarse to Pixel Mapping Message (internal message)
1177 * 1: Reserved
1178 * 2: Coarse Centroid Position (eval_centroid)
1179 * 3: Per Slot Coarse Pixel Offset (eval_snapped with register offset)"
1180 *
1181 * The Sample Position Offset is marked as reserved for coarse rate
1182 * evaluation and leads to hangs if we try to use it. So disable coarse
1183 * pixel shading if we have any intrinsic that will result in a pixel
1184 * interpolater message at sample.
1185 */
1186 if (intel_nir_pulls_at_sample(shader))
1187 prog_data->coarse_pixel_dispatch = BRW_NEVER;
1188
1189 /* We choose to always enable VMask prior to XeHP, as it would cause
1190 * us to lose out on the eliminate_find_live_channel() optimization.
1191 */
1192 prog_data->uses_vmask = devinfo->verx10 < 125 ||
1193 shader->info.fs.needs_quad_helper_invocations ||
1194 shader->info.uses_wide_subgroup_intrinsics ||
1195 prog_data->coarse_pixel_dispatch != BRW_NEVER;
1196
1197 prog_data->uses_src_w =
1198 BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_FRAG_COORD);
1199 prog_data->uses_src_depth =
1200 BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_FRAG_COORD) &&
1201 prog_data->coarse_pixel_dispatch != BRW_ALWAYS;
1202 prog_data->uses_depth_w_coefficients = prog_data->uses_pc_bary_coefficients ||
1203 (BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_FRAG_COORD) &&
1204 prog_data->coarse_pixel_dispatch != BRW_NEVER);
1205
1206 calculate_urb_setup(devinfo, key, prog_data, shader, mue_map);
1207 brw_compute_flat_inputs(prog_data, shader);
1208 }
1209
1210 /* From the SKL PRM, Volume 16, Workarounds:
1211 *
1212 * 0877 3D Pixel Shader Hang possible when pixel shader dispatched with
1213 * only header phases (R0-R2)
1214 *
1215 * WA: Enable a non-header phase (e.g. push constant) when dispatch would
1216 * have been header only.
1217 *
1218 * Instead of enabling push constants one can alternatively enable one of the
1219 * inputs. Here one simply chooses "layer" which shouldn't impose much
1220 * overhead.
1221 */
1222 static void
1223 gfx9_ps_header_only_workaround(struct brw_wm_prog_data *wm_prog_data)
1224 {
1225 if (wm_prog_data->num_varying_inputs)
1226 return;
1227
1228 if (wm_prog_data->base.curb_read_length)
1229 return;
1230
1231 wm_prog_data->urb_setup[VARYING_SLOT_LAYER] = 0;
1232 wm_prog_data->num_varying_inputs = 1;
1233
1234 brw_compute_urb_setup_index(wm_prog_data);
1235 }
1236
1237 static void
1238 brw_assign_urb_setup(fs_visitor &s)
1239 {
1240 assert(s.stage == MESA_SHADER_FRAGMENT);
1241
1242 const struct intel_device_info *devinfo = s.devinfo;
1243 struct brw_wm_prog_data *prog_data = brw_wm_prog_data(s.prog_data);
1244
1245 int urb_start = s.payload().num_regs + prog_data->base.curb_read_length;
1246
1247 /* Offset all the urb_setup[] index by the actual position of the
1248 * setup regs, now that the location of the constants has been chosen.
1249 */
1250 foreach_block_and_inst(block, fs_inst, inst, s.cfg) {
1251 for (int i = 0; i < inst->sources; i++) {
1252 if (inst->src[i].file == ATTR) {
1253 /* ATTR brw_reg::nr in the FS is in units of logical scalar
1254 * inputs each of which consumes 16B on Gfx4-Gfx12. In
1255 * single polygon mode this leads to the following layout
1256 * of the vertex setup plane parameters in the ATTR
1257 * register file:
1258 *
1259 * brw_reg::nr Input Comp0 Comp1 Comp2 Comp3
1260 * 0 Attr0.x a1-a0 a2-a0 N/A a0
1261 * 1 Attr0.y a1-a0 a2-a0 N/A a0
1262 * 2 Attr0.z a1-a0 a2-a0 N/A a0
1263 * 3 Attr0.w a1-a0 a2-a0 N/A a0
1264 * 4 Attr1.x a1-a0 a2-a0 N/A a0
1265 * ...
1266 *
1267 * In multipolygon mode that no longer works since
1268 * different channels may be processing polygons with
1269 * different plane parameters, so each parameter above is
1270 * represented as a dispatch_width-wide vector:
1271 *
1272 * brw_reg::nr brw_reg::offset Input Comp0 ... CompN
1273 * 0 0 Attr0.x a1[0]-a0[0] ... a1[N]-a0[N]
1274 * 0 4 * dispatch_width Attr0.x a2[0]-a0[0] ... a2[N]-a0[N]
1275 * 0 8 * dispatch_width Attr0.x N/A ... N/A
1276 * 0 12 * dispatch_width Attr0.x a0[0] ... a0[N]
1277 * 1 0 Attr0.y a1[0]-a0[0] ... a1[N]-a0[N]
1278 * ...
1279 *
1280 * Note that many of the components on a single row above
1281 * are likely to be replicated multiple times (if, say, a
1282 * single SIMD thread is only processing 2 different
1283 * polygons), so plane parameters aren't actually stored
1284 * in GRF memory with that layout to avoid wasting space.
1285 * Instead we compose ATTR register regions with a 2D
1286 * region that walks through the parameters of each
1287 * polygon with the correct stride, reading the parameter
1288 * corresponding to each channel directly from the PS
1289 * thread payload.
1290 *
1291 * The latter layout corresponds to a param_width equal to
1292 * dispatch_width, while the former (scalar parameter)
1293 * layout has a param_width of 1.
1294 *
1295 * Gfx20+ represent plane parameters in a format similar
1296 * to the above, except the parameters are packed in 12B
1297 * and ordered like "a0, a1-a0, a2-a0" instead of the
1298 * above vec4 representation with a missing component.
1299 */
1300 const unsigned param_width = (s.max_polygons > 1 ? s.dispatch_width : 1);
1301
1302 /* Size of a single scalar component of a plane parameter
1303 * in bytes.
1304 */
1305 const unsigned chan_sz = 4;
1306 struct brw_reg reg;
1307 assert(s.max_polygons > 0);
1308
1309 /* Calculate the base register on the thread payload of
1310 * either the block of vertex setup data or the block of
1311 * per-primitive constant data depending on whether we're
1312 * accessing a primitive or vertex input. Also calculate
1313 * the index of the input within that block.
1314 */
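/* Per-primitive attributes are packed two per 32B register (see the
 * num_per_primitive_inputs handling further down), hence the division
 * by 2 when sizing the per-primitive block below.
 */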
1315 const bool per_prim = inst->src[i].nr < prog_data->num_per_primitive_inputs;
1316 const unsigned base = urb_start +
1317 (per_prim ? 0 :
1318 ALIGN(prog_data->num_per_primitive_inputs / 2,
1319 reg_unit(devinfo)) * s.max_polygons);
1320 const unsigned idx = per_prim ? inst->src[i].nr :
1321 inst->src[i].nr - prog_data->num_per_primitive_inputs;
1322
1323 /* Translate the offset within the param_width-wide
1324 * representation described above into an offset and a
1325 * grf, which contains the plane parameters for the first
1326 * polygon processed by the thread.
1327 */
1328 if (devinfo->ver >= 20 && !per_prim) {
1329 /* Gfx20+ is able to pack 5 logical input components
1330 * per 64B register for vertex setup data.
1331 */
1332 const unsigned grf = base + idx / 5 * 2 * s.max_polygons;
1333 assert(inst->src[i].offset / param_width < 12);
1334 const unsigned delta = idx % 5 * 12 +
1335 inst->src[i].offset / (param_width * chan_sz) * chan_sz +
1336 inst->src[i].offset % chan_sz;
1337 reg = byte_offset(retype(brw_vec8_grf(grf, 0), inst->src[i].type),
1338 delta);
1339 } else {
1340 /* Earlier platforms and per-primitive block pack 2 logical
1341 * input components per 32B register.
1342 */
1343 const unsigned grf = base + idx / 2 * s.max_polygons;
1344 assert(inst->src[i].offset / param_width < REG_SIZE / 2);
1345 const unsigned delta = (idx % 2) * (REG_SIZE / 2) +
1346 inst->src[i].offset / (param_width * chan_sz) * chan_sz +
1347 inst->src[i].offset % chan_sz;
1348 reg = byte_offset(retype(brw_vec8_grf(grf, 0), inst->src[i].type),
1349 delta);
1350 }
1351
1352 if (s.max_polygons > 1) {
1353 assert(devinfo->ver >= 12);
1354 /* Misaligned channel strides that would lead to
1355 * cross-channel access in the representation above are
1356 * disallowed.
1357 */
1358 assert(inst->src[i].stride * brw_type_size_bytes(inst->src[i].type) == chan_sz);
1359
1360 /* Number of channels processing the same polygon. */
1361 const unsigned poly_width = s.dispatch_width / s.max_polygons;
1362 assert(s.dispatch_width % s.max_polygons == 0);
1363
1364 /* Accessing a subset of channels of a parameter vector
1365 * starting from "chan" is necessary to handle
1366 * SIMD-lowered instructions though.
1367 */
1368 const unsigned chan = inst->src[i].offset %
1369 (param_width * chan_sz) / chan_sz;
1370 assert(chan < s.dispatch_width);
1371 assert(chan % poly_width == 0);
1372 const unsigned reg_size = reg_unit(devinfo) * REG_SIZE;
1373 reg = byte_offset(reg, chan / poly_width * reg_size);
1374
1375 if (inst->exec_size > poly_width) {
1376 /* Accessing the parameters for multiple polygons.
1377 * Corresponding parameters for different polygons
1378 * are stored a GRF apart on the thread payload, so
1379 * use that as vertical stride.
1380 */
1381 const unsigned vstride = reg_size / brw_type_size_bytes(inst->src[i].type);
1382 assert(vstride <= 32);
1383 assert(chan % poly_width == 0);
1384 reg = stride(reg, vstride, poly_width, 0);
1385 } else {
1386 /* Accessing one parameter for a single polygon --
1387 * Translate to a scalar region.
1388 */
1389 assert(chan % poly_width + inst->exec_size <= poly_width);
1390 reg = stride(reg, 0, 1, 0);
1391 }
1392
1393 } else {
1394 const unsigned width = inst->src[i].stride == 0 ?
1395 1 : MIN2(inst->exec_size, 8);
1396 reg = stride(reg, width * inst->src[i].stride,
1397 width, inst->src[i].stride);
1398 }
1399
1400 reg.abs = inst->src[i].abs;
1401 reg.negate = inst->src[i].negate;
1402 inst->src[i] = reg;
1403 }
1404 }
1405 }
1406
1407 /* Each attribute is 4 setup channels, each of which is half a reg,
1408 * but they may be replicated multiple times for multipolygon
1409 * dispatch.
1410 */
1411 s.first_non_payload_grf += prog_data->num_varying_inputs * 2 * s.max_polygons;
1412
1413 /* Unlike regular attributes, per-primitive attributes have all 4 channels
1414 * in the same slot, so each GRF can store two slots.
1415 */
1416 assert(prog_data->num_per_primitive_inputs % 2 == 0);
1417 s.first_non_payload_grf += prog_data->num_per_primitive_inputs / 2 * s.max_polygons;
1418 }
1419
1420 static bool
1421 run_fs(fs_visitor &s, bool allow_spilling, bool do_rep_send)
1422 {
1423 const struct intel_device_info *devinfo = s.devinfo;
1424 struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(s.prog_data);
1425 brw_wm_prog_key *wm_key = (brw_wm_prog_key *) s.key;
1426 const fs_builder bld = fs_builder(&s).at_end();
1427 const nir_shader *nir = s.nir;
1428
1429 assert(s.stage == MESA_SHADER_FRAGMENT);
1430
1431 s.payload_ = new fs_thread_payload(s, s.source_depth_to_render_target);
1432
1433 if (nir->info.ray_queries > 0)
1434 s.limit_dispatch_width(16, "SIMD32 not supported with ray queries.\n");
1435
1436 if (do_rep_send) {
1437 assert(s.dispatch_width == 16);
1438 brw_emit_repclear_shader(s);
1439 } else {
1440 if (nir->info.inputs_read > 0 ||
1441 BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_FRAG_COORD) ||
1442 (nir->info.outputs_read > 0 && !wm_key->coherent_fb_fetch)) {
1443 brw_emit_interpolation_setup(s);
1444 }
1445
1446 /* We handle discards by keeping track of the still-live pixels in f0.1.
1447 * Initialize it with the dispatched pixels.
1448 */
1449 if (devinfo->ver >= 20 || wm_prog_data->uses_kill) {
1450 const unsigned lower_width = MIN2(s.dispatch_width, 16);
1451 for (unsigned i = 0; i < s.dispatch_width / lower_width; i++) {
1452 /* According to the "PS Thread Payload for Normal
1453 * Dispatch" pages on the BSpec, the dispatch mask is
1454 * stored in R0.15/R1.15 on gfx20+ and in R1.7/R2.7 on
1455 * gfx6+.
1456 */
1457 const brw_reg dispatch_mask =
1458 devinfo->ver >= 20 ? xe2_vec1_grf(i, 15) :
1459 brw_vec1_grf(i + 1, 7);
1460 bld.exec_all().group(1, 0)
1461 .MOV(brw_sample_mask_reg(bld.group(lower_width, i)),
1462 retype(dispatch_mask, BRW_TYPE_UW));
1463 }
1464 }
1465
1466 if (nir->info.writes_memory)
1467 wm_prog_data->has_side_effects = true;
1468
1469 nir_to_brw(&s);
1470
1471 if (s.failed)
1472 return false;
1473
1474 brw_emit_fb_writes(s);
1475
1476 brw_calculate_cfg(s);
1477
1478 brw_fs_optimize(s);
1479
1480 s.assign_curb_setup();
1481
1482 if (devinfo->ver == 9)
1483 gfx9_ps_header_only_workaround(wm_prog_data);
1484
1485 brw_assign_urb_setup(s);
1486
1487 brw_fs_lower_3src_null_dest(s);
1488 brw_fs_workaround_memory_fence_before_eot(s);
1489 brw_fs_workaround_emit_dummy_mov_instruction(s);
1490
1491 brw_allocate_registers(s, allow_spilling);
1492 }
1493
1494 return !s.failed;
1495 }
1496
1497 const unsigned *
1498 brw_compile_fs(const struct brw_compiler *compiler,
1499 struct brw_compile_fs_params *params)
1500 {
1501 struct nir_shader *nir = params->base.nir;
1502 const struct brw_wm_prog_key *key = params->key;
1503 struct brw_wm_prog_data *prog_data = params->prog_data;
1504 bool allow_spilling = params->allow_spilling;
1505 const bool debug_enabled =
1506 brw_should_print_shader(nir, params->base.debug_flag ?
1507 params->base.debug_flag : DEBUG_WM);
1508
1509 prog_data->base.stage = MESA_SHADER_FRAGMENT;
1510 prog_data->base.ray_queries = nir->info.ray_queries;
1511 prog_data->base.total_scratch = 0;
1512
1513 const struct intel_device_info *devinfo = compiler->devinfo;
1514 const unsigned max_subgroup_size = 32;
1515
   brw_nir_apply_key(nir, compiler, &key->base, max_subgroup_size);
   brw_nir_lower_fs_inputs(nir, devinfo, key);
   brw_nir_lower_fs_outputs(nir);

   /* From the SKL PRM, Volume 7, "Alpha Coverage":
    *  "If Pixel Shader outputs oMask, AlphaToCoverage is disabled in
    *   hardware, regardless of the state setting for this feature."
    */
   if (key->alpha_to_coverage != BRW_NEVER) {
      /* Run constant folding first so that the brw_nir_lower_alpha_to_coverage
       * pass can determine the correct source offset of the render target 0
       * store instruction.
       */
      NIR_PASS(_, nir, nir_opt_constant_folding);
      NIR_PASS(_, nir, brw_nir_lower_alpha_to_coverage, key, prog_data);
   }

   NIR_PASS(_, nir, brw_nir_move_interpolation_to_top);
   brw_postprocess_nir(nir, compiler, debug_enabled,
                       key->base.robust_flags);

   brw_nir_populate_wm_prog_data(nir, compiler->devinfo, key, prog_data,
                                 params->mue_map);

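   /* Compile the shader at each supported dispatch width (and, on newer
    * hardware, in the multipolygon dispatch modes), keeping every variant
    * that succeeds.  The SIMD32 variant is only kept if its estimated
    * throughput beats the narrower ones.
    */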
   std::unique_ptr<fs_visitor> v8, v16, v32, vmulti;
   cfg_t *simd8_cfg = NULL, *simd16_cfg = NULL, *simd32_cfg = NULL,
      *multi_cfg = NULL;
   float throughput = 0;
   bool has_spilled = false;

   if (devinfo->ver < 20) {
      v8 = std::make_unique<fs_visitor>(compiler, &params->base, key,
                                        prog_data, nir, 8, 1,
                                        params->base.stats != NULL,
                                        debug_enabled);
      if (!run_fs(*v8, allow_spilling, false /* do_rep_send */)) {
         params->base.error_str = ralloc_strdup(params->base.mem_ctx,
                                                v8->fail_msg);
         return NULL;
      } else if (INTEL_SIMD(FS, 8)) {
         simd8_cfg = v8->cfg;

         assert(v8->payload().num_regs % reg_unit(devinfo) == 0);
         prog_data->base.dispatch_grf_start_reg = v8->payload().num_regs / reg_unit(devinfo);

         const performance &perf = v8->performance_analysis.require();
         throughput = MAX2(throughput, perf.throughput);
         has_spilled = v8->spilled_any_registers;
         allow_spilling = false;
      }
   }

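   /* Coarse pixel shading restricts the usable dispatch widths on pre-Xe2
    * hardware, so narrow the SIMD8 visitor's limit before attempting the
    * wider compiles.
    */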
   if (key->coarse_pixel && devinfo->ver < 20) {
      if (prog_data->dual_src_blend) {
         v8->limit_dispatch_width(8, "SIMD16 coarse pixel shading cannot"
                                  " use SIMD8 messages.\n");
      }
      v8->limit_dispatch_width(16, "SIMD32 not supported with coarse"
                               " pixel shading.\n");
   }

   if (!has_spilled &&
       (!v8 || v8->max_dispatch_width >= 16) &&
       (INTEL_SIMD(FS, 16) || params->use_rep_send)) {
      /* Try a SIMD16 compile */
      v16 = std::make_unique<fs_visitor>(compiler, &params->base, key,
                                         prog_data, nir, 16, 1,
                                         params->base.stats != NULL,
                                         debug_enabled);
      if (v8)
         v16->import_uniforms(v8.get());
      if (!run_fs(*v16, allow_spilling, params->use_rep_send)) {
         brw_shader_perf_log(compiler, params->base.log_data,
                             "SIMD16 shader failed to compile: %s\n",
                             v16->fail_msg);
      } else {
         simd16_cfg = v16->cfg;

         assert(v16->payload().num_regs % reg_unit(devinfo) == 0);
         prog_data->dispatch_grf_start_reg_16 = v16->payload().num_regs / reg_unit(devinfo);

         const performance &perf = v16->performance_analysis.require();
         throughput = MAX2(throughput, perf.throughput);
         has_spilled = v16->spilled_any_registers;
         allow_spilling = false;
      }
   }

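   /* If a SIMD16 compile was attempted but failed, don't bother attempting
    * SIMD32.
    */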
   const bool simd16_failed = v16 && !simd16_cfg;

   /* Currently, the compiler only supports SIMD32 on SNB+ */
   if (!has_spilled &&
       (!v8 || v8->max_dispatch_width >= 32) &&
       (!v16 || v16->max_dispatch_width >= 32) && !params->use_rep_send &&
       !simd16_failed &&
       INTEL_SIMD(FS, 32)) {
      /* Try a SIMD32 compile */
      v32 = std::make_unique<fs_visitor>(compiler, &params->base, key,
                                         prog_data, nir, 32, 1,
                                         params->base.stats != NULL,
                                         debug_enabled);
      if (v8)
         v32->import_uniforms(v8.get());
      else if (v16)
         v32->import_uniforms(v16.get());

      if (!run_fs(*v32, allow_spilling, false)) {
         brw_shader_perf_log(compiler, params->base.log_data,
                             "SIMD32 shader failed to compile: %s\n",
                             v32->fail_msg);
      } else {
         const performance &perf = v32->performance_analysis.require();

         if (!INTEL_DEBUG(DEBUG_DO32) && throughput >= perf.throughput) {
            brw_shader_perf_log(compiler, params->base.log_data,
                                "SIMD32 shader inefficient\n");
         } else {
            simd32_cfg = v32->cfg;

            assert(v32->payload().num_regs % reg_unit(devinfo) == 0);
            prog_data->dispatch_grf_start_reg_32 = v32->payload().num_regs / reg_unit(devinfo);

            throughput = MAX2(throughput, perf.throughput);
         }
      }
   }

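   /* On Gfx12+, also try the multipolygon dispatch modes (quad-SIMD8,
    * dual-SIMD16 and dual-SIMD8), which shade pixels from multiple
    * primitives in a single thread.
    */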
   if (devinfo->ver >= 12 && !has_spilled &&
       params->max_polygons >= 2 && !key->coarse_pixel) {
      fs_visitor *vbase = v8 ? v8.get() : v16 ? v16.get() : v32.get();
      assert(vbase);

      if (devinfo->ver >= 20 &&
          params->max_polygons >= 4 &&
          vbase->max_dispatch_width >= 32 &&
          4 * prog_data->num_varying_inputs <= MAX_VARYING &&
          INTEL_SIMD(FS, 4X8)) {
         /* Try a quad-SIMD8 compile */
         vmulti = std::make_unique<fs_visitor>(compiler, &params->base, key,
                                               prog_data, nir, 32, 4,
                                               params->base.stats != NULL,
                                               debug_enabled);
         vmulti->import_uniforms(vbase);
         if (!run_fs(*vmulti, false, params->use_rep_send)) {
            brw_shader_perf_log(compiler, params->base.log_data,
                                "Quad-SIMD8 shader failed to compile: %s\n",
                                vmulti->fail_msg);
         } else {
            multi_cfg = vmulti->cfg;
            assert(!vmulti->spilled_any_registers);
         }
      }

      if (!multi_cfg && devinfo->ver >= 20 &&
          vbase->max_dispatch_width >= 32 &&
          2 * prog_data->num_varying_inputs <= MAX_VARYING &&
          INTEL_SIMD(FS, 2X16)) {
         /* Try a dual-SIMD16 compile */
         vmulti = std::make_unique<fs_visitor>(compiler, &params->base, key,
                                               prog_data, nir, 32, 2,
                                               params->base.stats != NULL,
                                               debug_enabled);
         vmulti->import_uniforms(vbase);
         if (!run_fs(*vmulti, false, params->use_rep_send)) {
            brw_shader_perf_log(compiler, params->base.log_data,
                                "Dual-SIMD16 shader failed to compile: %s\n",
                                vmulti->fail_msg);
         } else {
            multi_cfg = vmulti->cfg;
            assert(!vmulti->spilled_any_registers);
         }
      }

      if (!multi_cfg && vbase->max_dispatch_width >= 16 &&
          2 * prog_data->num_varying_inputs <= MAX_VARYING &&
          INTEL_SIMD(FS, 2X8)) {
         /* Try a dual-SIMD8 compile */
         vmulti = std::make_unique<fs_visitor>(compiler, &params->base, key,
                                               prog_data, nir, 16, 2,
                                               params->base.stats != NULL,
                                               debug_enabled);
         vmulti->import_uniforms(vbase);
         if (!run_fs(*vmulti, allow_spilling, params->use_rep_send)) {
            brw_shader_perf_log(compiler, params->base.log_data,
                                "Dual-SIMD8 shader failed to compile: %s\n",
                                vmulti->fail_msg);
         } else {
            multi_cfg = vmulti->cfg;
         }
      }

      if (multi_cfg) {
         assert(vmulti->payload().num_regs % reg_unit(devinfo) == 0);
         prog_data->base.dispatch_grf_start_reg = vmulti->payload().num_regs / reg_unit(devinfo);
      }
   }

   /* When the caller requests a repclear shader, they want SIMD16-only */
   if (params->use_rep_send)
      simd8_cfg = NULL;

   fs_generator g(compiler, &params->base, &prog_data->base,
                  MESA_SHADER_FRAGMENT);

   if (unlikely(debug_enabled)) {
      g.enable_debug(ralloc_asprintf(params->base.mem_ctx,
                                     "%s fragment shader %s",
                                     nir->info.label ?
                                        nir->info.label : "unnamed",
                                     nir->info.name));
   }

   struct brw_compile_stats *stats = params->base.stats;
   uint32_t max_dispatch_width = 0;

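   /* Generate assembly for each dispatch mode that produced a CFG.  The
    * first variant generated starts at offset 0; the offsets of the SIMD16
    * and SIMD32 variants are recorded in the prog_data.
    */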
   if (multi_cfg) {
      prog_data->dispatch_multi = vmulti->dispatch_width;
      prog_data->max_polygons = vmulti->max_polygons;
      g.generate_code(multi_cfg, vmulti->dispatch_width, vmulti->shader_stats,
                      vmulti->performance_analysis.require(),
                      stats, vmulti->max_polygons);
      stats = stats ? stats + 1 : NULL;
      max_dispatch_width = vmulti->dispatch_width;

   } else if (simd8_cfg) {
      prog_data->dispatch_8 = true;
      g.generate_code(simd8_cfg, 8, v8->shader_stats,
                      v8->performance_analysis.require(), stats, 1);
      stats = stats ? stats + 1 : NULL;
      max_dispatch_width = 8;
   }

   if (simd16_cfg) {
      prog_data->dispatch_16 = true;
      prog_data->prog_offset_16 = g.generate_code(
         simd16_cfg, 16, v16->shader_stats,
         v16->performance_analysis.require(), stats, 1);
      stats = stats ? stats + 1 : NULL;
      max_dispatch_width = 16;
   }

   if (simd32_cfg) {
      prog_data->dispatch_32 = true;
      prog_data->prog_offset_32 = g.generate_code(
         simd32_cfg, 32, v32->shader_stats,
         v32->performance_analysis.require(), stats, 1);
      stats = stats ? stats + 1 : NULL;
      max_dispatch_width = 32;
   }

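   /* Every stats entry filled in above gets the widest dispatch width that
    * was successfully generated.
    */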
   for (struct brw_compile_stats *s = params->base.stats; s != NULL && s != stats; s++)
      s->max_dispatch_width = max_dispatch_width;

   g.add_const_data(nir->constant_data, nir->constant_data_size);
   return g.get_assembly();
}
