xref: /aosp_15_r20/external/mesa3d/src/intel/compiler/brw_compile_fs.cpp (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 /*
2  * Copyright © 2010 Intel Corporation
3  * SPDX-License-Identifier: MIT
4  */
5 
6 #include "brw_eu.h"
7 #include "brw_fs.h"
8 #include "brw_fs_builder.h"
9 #include "brw_fs_live_variables.h"
10 #include "brw_nir.h"
11 #include "brw_cfg.h"
12 #include "brw_private.h"
13 #include "intel_nir.h"
14 #include "shader_enums.h"
15 #include "dev/intel_debug.h"
16 #include "dev/intel_wa.h"
17 
18 #include <memory>
19 
20 using namespace brw;
21 
22 static fs_inst *
23 brw_emit_single_fb_write(fs_visitor &s, const fs_builder &bld,
24                          brw_reg color0, brw_reg color1,
25                          brw_reg src0_alpha, unsigned components)
26 {
27    assert(s.stage == MESA_SHADER_FRAGMENT);
28    struct brw_wm_prog_data *prog_data = brw_wm_prog_data(s.prog_data);
29 
30    /* Hand over gl_FragDepth or the payload depth. */
31    const brw_reg dst_depth = fetch_payload_reg(bld, s.fs_payload().dest_depth_reg);
32    brw_reg src_depth, src_stencil;
33 
34    if (s.nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
35       src_depth = s.frag_depth;
36 
37    if (s.nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL))
38       src_stencil = s.frag_stencil;
39 
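   /* The order of these sources must match the FB_WRITE_LOGICAL_SRC_*
    * indices consumed when the logical framebuffer write is lowered; the
    * assert below checks that the components immediate lands in the last
    * slot.
    */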
40    const brw_reg sources[] = {
41       color0, color1, src0_alpha, src_depth, dst_depth, src_stencil,
42       (prog_data->uses_omask ? s.sample_mask : brw_reg()),
43       brw_imm_ud(components)
44    };
45    assert(ARRAY_SIZE(sources) - 1 == FB_WRITE_LOGICAL_SRC_COMPONENTS);
46    fs_inst *write = bld.emit(FS_OPCODE_FB_WRITE_LOGICAL, brw_reg(),
47                              sources, ARRAY_SIZE(sources));
48 
49    if (prog_data->uses_kill) {
50       write->predicate = BRW_PREDICATE_NORMAL;
51       write->flag_subreg = sample_mask_flag_subreg(s);
52    }
53 
54    return write;
55 }
56 
57 static void
58 brw_do_emit_fb_writes(fs_visitor &s, int nr_color_regions, bool replicate_alpha)
59 {
60    const fs_builder bld = fs_builder(&s).at_end();
61    fs_inst *inst = NULL;
62 
63    for (int target = 0; target < nr_color_regions; target++) {
64       /* Skip over outputs that weren't written. */
65       if (s.outputs[target].file == BAD_FILE)
66          continue;
67 
68       const fs_builder abld = bld.annotate(
69          ralloc_asprintf(s.mem_ctx, "FB write target %d", target));
70 
71       brw_reg src0_alpha;
72       if (replicate_alpha && target != 0)
73          src0_alpha = offset(s.outputs[0], bld, 3);
74 
75       inst = brw_emit_single_fb_write(s, abld, s.outputs[target],
76                                       s.dual_src_output, src0_alpha, 4);
77       inst->target = target;
78    }
79 
80    if (inst == NULL) {
81       /* Even if there are no color buffers enabled, we still need to send
82        * alpha down the pipeline to our null renderbuffer to support
83        * alpha-testing, alpha-to-coverage, and so on.
84        */
85       /* FINISHME: Factor out this frequently recurring pattern into a
86        * helper function.
87        */
88       const brw_reg srcs[] = { reg_undef, reg_undef,
89                               reg_undef, offset(s.outputs[0], bld, 3) };
90       const brw_reg tmp = bld.vgrf(BRW_TYPE_UD, 4);
91       bld.LOAD_PAYLOAD(tmp, srcs, 4, 0);
92 
93       inst = brw_emit_single_fb_write(s, bld, tmp, reg_undef, reg_undef, 4);
94       inst->target = 0;
95    }
96 
97    inst->last_rt = true;
98    inst->eot = true;
99 }
100 
101 static void
102 brw_emit_fb_writes(fs_visitor &s)
103 {
104    const struct intel_device_info *devinfo = s.devinfo;
105    assert(s.stage == MESA_SHADER_FRAGMENT);
106    struct brw_wm_prog_data *prog_data = brw_wm_prog_data(s.prog_data);
107    brw_wm_prog_key *key = (brw_wm_prog_key*) s.key;
108 
109    if (s.nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL)) {
110       /* From the 'Render Target Write message' section of the docs:
111        * "Output Stencil is not supported with SIMD16 Render Target Write
112        * Messages."
113        */
114       if (devinfo->ver >= 20)
115          s.limit_dispatch_width(16, "gl_FragStencilRefARB unsupported "
116                                 "in SIMD32+ mode.\n");
117       else
118          s.limit_dispatch_width(8, "gl_FragStencilRefARB unsupported "
119                                 "in SIMD16+ mode.\n");
120    }
121 
122    /* ANV doesn't know about the sample mask output during wm key creation,
123     * so we compute here whether we need to replicate alpha and emit the
124     * alpha-to-coverage workaround.
125     */
126    const bool replicate_alpha = key->alpha_test_replicate_alpha ||
127       (key->nr_color_regions > 1 && key->alpha_to_coverage &&
128        s.sample_mask.file == BAD_FILE);
129 
130    prog_data->dual_src_blend = (s.dual_src_output.file != BAD_FILE &&
131                                 s.outputs[0].file != BAD_FILE);
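   /* Dual-source blending only ever writes to render target 0, so it is
    * mutually exclusive with multiple color regions (checked below).
    */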
132    assert(!prog_data->dual_src_blend || key->nr_color_regions == 1);
133 
134    /* The following condition implements Wa_14017468336:
135     *
136     * "If dual source blend is enabled do not enable SIMD32 dispatch" and
137     * "For a thread dispatched as SIMD32, must not issue SIMD8 message with Last
138     *  Render Target Select set."
139     */
140    if (devinfo->ver >= 11 && devinfo->ver <= 12 &&
141        prog_data->dual_src_blend) {
142       /* The dual-source RT write messages fail to release the thread
143        * dependency on ICL and TGL with SIMD32 dispatch, leading to hangs.
144        *
145        * XXX - Emit an extra single-source NULL RT-write marked LastRT in
146        *       order to release the thread dependency without disabling
147        *       SIMD32.
148        *
149        * The dual-source RT write messages may lead to hangs with SIMD16
150        * dispatch on ICL due to unknown reasons, see
151        * https://gitlab.freedesktop.org/mesa/mesa/-/issues/2183
152        */
153       if (devinfo->ver >= 20)
154          s.limit_dispatch_width(16, "Dual source blending unsupported "
155                                 "in SIMD32 mode.\n");
156       else
157          s.limit_dispatch_width(8, "Dual source blending unsupported "
158                                 "in SIMD16 and SIMD32 modes.\n");
159    }
160 
161    brw_do_emit_fb_writes(s, key->nr_color_regions, replicate_alpha);
162 }
163 
164 
165 /** Emits the interpolation for the varying inputs. */
166 static void
167 brw_emit_interpolation_setup(fs_visitor &s)
168 {
169    const struct intel_device_info *devinfo = s.devinfo;
170    const fs_builder bld = fs_builder(&s).at_end();
171    fs_builder abld = bld.annotate("compute pixel centers");
172 
173    s.pixel_x = bld.vgrf(BRW_TYPE_F);
174    s.pixel_y = bld.vgrf(BRW_TYPE_F);
175 
176    const struct brw_wm_prog_key *wm_key = (brw_wm_prog_key*) s.key;
177    struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(s.prog_data);
178    fs_thread_payload &payload = s.fs_payload();
179 
180    brw_reg int_sample_offset_x, int_sample_offset_y; /* Used on Gen12HP+ */
181    brw_reg int_sample_offset_xy; /* Used on Gen8+ */
182    brw_reg half_int_sample_offset_x, half_int_sample_offset_y;
183    if (wm_prog_data->coarse_pixel_dispatch != BRW_ALWAYS) {
184       /* The thread payload only delivers subspan locations (ss0, ss1,
185        * ss2, ...). Since each subspan covers a 2x2 pixel block, we need to
186        * generate 4 pixel coordinates out of each subspan location. We do this
187        * by replicating a subspan coordinate 4 times and adding an offset of 1
188        * in each direction from the initial top left (tl) location to generate
189        * top right (tr = +1 in x), bottom left (bl = +1 in y) and bottom right
190        * (br = +1 in x, +1 in y).
191        *
192        * The locations we build look like this in SIMD8 :
193        *
194        *    ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
195        *
196        * The value 0x11001010 is a vector of 8 half bytes. It adds the
197        * following to generate the 4 pixel coordinates out of subspan0:
198        *
199        *  0x
200        *    1 : ss0.y + 1 -> ss0.br.y
201        *    1 : ss0.y + 1 -> ss0.bl.y
202        *    0 : ss0.y + 0 -> ss0.tr.y
203        *    0 : ss0.y + 0 -> ss0.tl.y
204        *    1 : ss0.x + 1 -> ss0.br.x
205        *    0 : ss0.x + 0 -> ss0.bl.x
206        *    1 : ss0.x + 1 -> ss0.tr.x
207        *    0 : ss0.x + 0 -> ss0.tl.x
208        *
209        * By doing a SIMD16 add in a SIMD8 shader, we can generate the 8 pixel
210        * coordinates out of 2 subspan coordinates in a single ADD instruction
211        * (twice the operation above).
212        */
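      /* Note: brw_imm_v() encodes a packed vector of eight signed 4-bit
       * values (the hardware V immediate type), which is what lets a single
       * immediate supply a different offset to every channel of the ADD.
       */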
213       int_sample_offset_xy = brw_reg(brw_imm_v(0x11001010));
214       half_int_sample_offset_x = brw_reg(brw_imm_uw(0));
215       half_int_sample_offset_y = brw_reg(brw_imm_uw(0));
216       /* On Gfx12.5, because of regioning restrictions, the interpolation code
217        * is slightly different and works off X & Y only inputs. The ordering
218        * of the half bytes here is a bit odd, with each subspan replicated
219        * twice and every other element discarded :
220        *
221        *             ss0.tl ss0.tl ss0.tr ss0.tr ss0.bl ss0.bl ss0.br ss0.br
222        *  X offset:    0      0      1      0      0      0      1      0
223        *  Y offset:    0      0      0      0      1      0      1      0
224        */
225       int_sample_offset_x = brw_reg(brw_imm_v(0x01000100));
226       int_sample_offset_y = brw_reg(brw_imm_v(0x01010000));
227    }
228 
229    brw_reg int_coarse_offset_x, int_coarse_offset_y; /* Used on Gen12HP+ */
230    brw_reg int_coarse_offset_xy; /* Used on Gen8+ */
231    brw_reg half_int_coarse_offset_x, half_int_coarse_offset_y;
232    if (wm_prog_data->coarse_pixel_dispatch != BRW_NEVER) {
233       /* In coarse pixel dispatch we have to do the same ADD instruction that
234        * we do in normal per pixel dispatch, except this time we're not adding
235        * 1 in each direction, but instead the coarse pixel size.
236        *
237        * The coarse pixel size is delivered as 2 u8 in r1.0
238        */
239       struct brw_reg r1_0 = retype(brw_vec1_reg(FIXED_GRF, 1, 0), BRW_TYPE_UB);
240 
241       const fs_builder dbld =
242          abld.exec_all().group(MIN2(16, s.dispatch_width) * 2, 0);
243 
244       if (devinfo->verx10 >= 125) {
245          /* To build the array of half bytes we do an AND operation with the
246           * right mask in X.
247           */
248          int_coarse_offset_x = dbld.vgrf(BRW_TYPE_UW);
249          dbld.AND(int_coarse_offset_x, byte_offset(r1_0, 0), brw_imm_v(0x0f000f00));
250 
251          /* And the right mask in Y. */
252          int_coarse_offset_y = dbld.vgrf(BRW_TYPE_UW);
253          dbld.AND(int_coarse_offset_y, byte_offset(r1_0, 1), brw_imm_v(0x0f0f0000));
254       } else {
255          /* To build the array of half bytes we do an AND operation with the
256           * right mask in X.
257           */
258          int_coarse_offset_x = dbld.vgrf(BRW_TYPE_UW);
259          dbld.AND(int_coarse_offset_x, byte_offset(r1_0, 0), brw_imm_v(0x0000f0f0));
260 
261          /* And the right mask in Y. */
262          int_coarse_offset_y = dbld.vgrf(BRW_TYPE_UW);
263          dbld.AND(int_coarse_offset_y, byte_offset(r1_0, 1), brw_imm_v(0xff000000));
264 
265          /* Finally OR the 2 registers. */
266          int_coarse_offset_xy = dbld.vgrf(BRW_TYPE_UW);
267          dbld.OR(int_coarse_offset_xy, int_coarse_offset_x, int_coarse_offset_y);
268       }
269 
270       /* Also compute the half coarse pixel size used to center coarse pixels. */
271       half_int_coarse_offset_x = bld.vgrf(BRW_TYPE_UW);
272       half_int_coarse_offset_y = bld.vgrf(BRW_TYPE_UW);
273 
274       bld.SHR(half_int_coarse_offset_x, suboffset(r1_0, 0), brw_imm_ud(1));
275       bld.SHR(half_int_coarse_offset_y, suboffset(r1_0, 1), brw_imm_ud(1));
276    }
277 
278    brw_reg int_pixel_offset_x, int_pixel_offset_y; /* Used on Gen12HP+ */
279    brw_reg int_pixel_offset_xy; /* Used on Gen8+ */
280    brw_reg half_int_pixel_offset_x, half_int_pixel_offset_y;
281    switch (wm_prog_data->coarse_pixel_dispatch) {
282    case BRW_NEVER:
283       int_pixel_offset_x = int_sample_offset_x;
284       int_pixel_offset_y = int_sample_offset_y;
285       int_pixel_offset_xy = int_sample_offset_xy;
286       half_int_pixel_offset_x = half_int_sample_offset_x;
287       half_int_pixel_offset_y = half_int_sample_offset_y;
288       break;
289 
290    case BRW_SOMETIMES: {
291       const fs_builder dbld =
292          abld.exec_all().group(MIN2(16, s.dispatch_width) * 2, 0);
293 
294       check_dynamic_msaa_flag(dbld, wm_prog_data,
295                               INTEL_MSAA_FLAG_COARSE_RT_WRITES);
296 
297       int_pixel_offset_x = dbld.vgrf(BRW_TYPE_UW);
298       set_predicate(BRW_PREDICATE_NORMAL,
299                     dbld.SEL(int_pixel_offset_x,
300                              int_coarse_offset_x,
301                              int_sample_offset_x));
302 
303       int_pixel_offset_y = dbld.vgrf(BRW_TYPE_UW);
304       set_predicate(BRW_PREDICATE_NORMAL,
305                     dbld.SEL(int_pixel_offset_y,
306                              int_coarse_offset_y,
307                              int_sample_offset_y));
308 
309       int_pixel_offset_xy = dbld.vgrf(BRW_TYPE_UW);
310       set_predicate(BRW_PREDICATE_NORMAL,
311                     dbld.SEL(int_pixel_offset_xy,
312                              int_coarse_offset_xy,
313                              int_sample_offset_xy));
314 
315       half_int_pixel_offset_x = bld.vgrf(BRW_TYPE_UW);
316       set_predicate(BRW_PREDICATE_NORMAL,
317                     bld.SEL(half_int_pixel_offset_x,
318                             half_int_coarse_offset_x,
319                             half_int_sample_offset_x));
320 
321       half_int_pixel_offset_y = bld.vgrf(BRW_TYPE_UW);
322       set_predicate(BRW_PREDICATE_NORMAL,
323                     bld.SEL(half_int_pixel_offset_y,
324                             half_int_coarse_offset_y,
325                             half_int_sample_offset_y));
326       break;
327    }
328 
329    case BRW_ALWAYS:
330       int_pixel_offset_x = int_coarse_offset_x;
331       int_pixel_offset_y = int_coarse_offset_y;
332       int_pixel_offset_xy = int_coarse_offset_xy;
333       half_int_pixel_offset_x = half_int_coarse_offset_x;
334       half_int_pixel_offset_y = half_int_coarse_offset_y;
335       break;
336    }
337 
338    for (unsigned i = 0; i < DIV_ROUND_UP(s.dispatch_width, 16); i++) {
339       const fs_builder hbld = abld.group(MIN2(16, s.dispatch_width), i);
340       /* According to the "PS Thread Payload for Normal Dispatch"
341        * pages on the BSpec, subspan X/Y coordinates are stored in
342        * R1.2-R1.5/R2.2-R2.5 on gfx6+, and on R0.10-R0.13/R1.10-R1.13
343        * on gfx20+.  gi_reg is the 32B section of the GRF that
344        * contains the subspan coordinates.
345        */
346       const struct brw_reg gi_reg = devinfo->ver >= 20 ? xe2_vec1_grf(i, 8) :
347                                     brw_vec1_grf(i + 1, 0);
348       const struct brw_reg gi_uw = retype(gi_reg, BRW_TYPE_UW);
349 
350       if (devinfo->verx10 >= 125) {
351          const fs_builder dbld =
352             abld.exec_all().group(hbld.dispatch_width() * 2, 0);
353          const brw_reg int_pixel_x = dbld.vgrf(BRW_TYPE_UW);
354          const brw_reg int_pixel_y = dbld.vgrf(BRW_TYPE_UW);
355 
356          dbld.ADD(int_pixel_x,
357                   brw_reg(stride(suboffset(gi_uw, 4), 2, 8, 0)),
358                   int_pixel_offset_x);
359          dbld.ADD(int_pixel_y,
360                   brw_reg(stride(suboffset(gi_uw, 5), 2, 8, 0)),
361                   int_pixel_offset_y);
362 
363          if (wm_prog_data->coarse_pixel_dispatch != BRW_NEVER) {
364             fs_inst *addx = dbld.ADD(int_pixel_x, int_pixel_x,
365                                      horiz_stride(half_int_pixel_offset_x, 0));
366             fs_inst *addy = dbld.ADD(int_pixel_y, int_pixel_y,
367                                      horiz_stride(half_int_pixel_offset_y, 0));
368             if (wm_prog_data->coarse_pixel_dispatch != BRW_ALWAYS) {
369                addx->predicate = BRW_PREDICATE_NORMAL;
370                addy->predicate = BRW_PREDICATE_NORMAL;
371             }
372          }
373 
374          hbld.MOV(offset(s.pixel_x, hbld, i), horiz_stride(int_pixel_x, 2));
375          hbld.MOV(offset(s.pixel_y, hbld, i), horiz_stride(int_pixel_y, 2));
376 
377       } else {
378          /* The "Register Region Restrictions" page says for BDW (and newer,
379           * presumably):
380           *
381           *     "When destination spans two registers, the source may be one or
382           *      two registers. The destination elements must be evenly split
383           *      between the two registers."
384           *
385           * Thus we can do a single add(16) in SIMD8 or an add(32) in SIMD16
386           * to compute our pixel centers.
387           */
388          const fs_builder dbld =
389             abld.exec_all().group(hbld.dispatch_width() * 2, 0);
390          brw_reg int_pixel_xy = dbld.vgrf(BRW_TYPE_UW);
391 
392          dbld.ADD(int_pixel_xy,
393                   brw_reg(stride(suboffset(gi_uw, 4), 1, 4, 0)),
394                   int_pixel_offset_xy);
395 
396          hbld.emit(FS_OPCODE_PIXEL_X, offset(s.pixel_x, hbld, i), int_pixel_xy,
397                                       horiz_stride(half_int_pixel_offset_x, 0));
398          hbld.emit(FS_OPCODE_PIXEL_Y, offset(s.pixel_y, hbld, i), int_pixel_xy,
399                                       horiz_stride(half_int_pixel_offset_y, 0));
400       }
401    }
402 
403    abld = bld.annotate("compute pos.z");
404    brw_reg coarse_z;
405    if (wm_prog_data->coarse_pixel_dispatch != BRW_NEVER &&
406        wm_prog_data->uses_depth_w_coefficients) {
407       /* In coarse pixel mode, the HW doesn't interpolate the Z coordinate
408        * properly. In the same way we have to add the coarse pixel size to
409        * pixel locations, here we recompute the Z value with 2 coefficients
410        * along the X & Y axes.
411        */
412       brw_reg coef_payload = brw_vec8_grf(payload.depth_w_coef_reg, 0);
413       const brw_reg x_start = brw_vec1_grf(coef_payload.nr, 2);
414       const brw_reg y_start = brw_vec1_grf(coef_payload.nr, 6);
415       const brw_reg z_cx    = brw_vec1_grf(coef_payload.nr, 1);
416       const brw_reg z_cy    = brw_vec1_grf(coef_payload.nr, 0);
417       const brw_reg z_c0    = brw_vec1_grf(coef_payload.nr, 3);
418 
419       const brw_reg float_pixel_x = abld.vgrf(BRW_TYPE_F);
420       const brw_reg float_pixel_y = abld.vgrf(BRW_TYPE_F);
421 
422       abld.ADD(float_pixel_x, s.pixel_x, negate(x_start));
423       abld.ADD(float_pixel_y, s.pixel_y, negate(y_start));
424 
425       /* r1.0 - 0:7 ActualCoarsePixelShadingSize.X */
426       const brw_reg u8_cps_width = brw_reg(retype(brw_vec1_grf(1, 0), BRW_TYPE_UB));
427       /* r1.0 - 15:8 ActualCoarsePixelShadingSize.Y */
428       const brw_reg u8_cps_height = byte_offset(u8_cps_width, 1);
429       const brw_reg u32_cps_width = abld.vgrf(BRW_TYPE_UD);
430       const brw_reg u32_cps_height = abld.vgrf(BRW_TYPE_UD);
431       abld.MOV(u32_cps_width, u8_cps_width);
432       abld.MOV(u32_cps_height, u8_cps_height);
433 
434       const brw_reg f_cps_width = abld.vgrf(BRW_TYPE_F);
435       const brw_reg f_cps_height = abld.vgrf(BRW_TYPE_F);
436       abld.MOV(f_cps_width, u32_cps_width);
437       abld.MOV(f_cps_height, u32_cps_height);
438 
439       /* Center in the middle of the coarse pixel. */
440       abld.MAD(float_pixel_x, float_pixel_x, brw_imm_f(0.5f), f_cps_width);
441       abld.MAD(float_pixel_y, float_pixel_y, brw_imm_f(0.5f), f_cps_height);
442 
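      /* Evaluate the depth plane equation z = z_c0 + z_cx * x + z_cy * y at
       * the centered coarse pixel location computed above.
       */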
443       coarse_z = abld.vgrf(BRW_TYPE_F);
444       abld.MAD(coarse_z, z_c0, z_cx, float_pixel_x);
445       abld.MAD(coarse_z, coarse_z, z_cy, float_pixel_y);
446    }
447 
448    if (wm_prog_data->uses_src_depth)
449       s.pixel_z = fetch_payload_reg(bld, payload.source_depth_reg);
450 
451    if (wm_prog_data->uses_depth_w_coefficients ||
452        wm_prog_data->uses_src_depth) {
453       brw_reg sample_z = s.pixel_z;
454 
455       switch (wm_prog_data->coarse_pixel_dispatch) {
456       case BRW_NEVER:
457          break;
458 
459       case BRW_SOMETIMES:
460          assert(wm_prog_data->uses_src_depth);
461          assert(wm_prog_data->uses_depth_w_coefficients);
462          s.pixel_z = abld.vgrf(BRW_TYPE_F);
463 
464          /* We re-use the check_dynamic_msaa_flag() call from above */
465          set_predicate(BRW_PREDICATE_NORMAL,
466                        abld.SEL(s.pixel_z, coarse_z, sample_z));
467          break;
468 
469       case BRW_ALWAYS:
470          assert(!wm_prog_data->uses_src_depth);
471          assert(wm_prog_data->uses_depth_w_coefficients);
472          s.pixel_z = coarse_z;
473          break;
474       }
475    }
476 
477    if (wm_prog_data->uses_src_w) {
478       abld = bld.annotate("compute pos.w");
479       s.pixel_w = fetch_payload_reg(abld, payload.source_w_reg);
480       s.wpos_w = bld.vgrf(BRW_TYPE_F);
481       abld.emit(SHADER_OPCODE_RCP, s.wpos_w, s.pixel_w);
482    }
483 
484    if (wm_key->persample_interp == BRW_SOMETIMES) {
485       assert(!devinfo->needs_unlit_centroid_workaround);
486 
487       const fs_builder ubld = bld.exec_all().group(16, 0);
488       bool loaded_flag = false;
489 
490       for (int i = 0; i < BRW_BARYCENTRIC_MODE_COUNT; ++i) {
491          if (!(wm_prog_data->barycentric_interp_modes & BITFIELD_BIT(i)))
492             continue;
493 
494          /* The sample mode will always be the top bit set in the perspective
495           * or non-perspective section.  In the case where no SAMPLE mode was
496           * requested, wm_prog_data_barycentric_modes() will swap out the top
497           * mode for SAMPLE so this works regardless of whether SAMPLE was
498           * requested or not.
499           */
500          int sample_mode;
501          if (BITFIELD_BIT(i) & BRW_BARYCENTRIC_NONPERSPECTIVE_BITS) {
502             sample_mode = util_last_bit(wm_prog_data->barycentric_interp_modes &
503                                         BRW_BARYCENTRIC_NONPERSPECTIVE_BITS) - 1;
504          } else {
505             sample_mode = util_last_bit(wm_prog_data->barycentric_interp_modes &
506                                         BRW_BARYCENTRIC_PERSPECTIVE_BITS) - 1;
507          }
508          assert(wm_prog_data->barycentric_interp_modes &
509                 BITFIELD_BIT(sample_mode));
510 
511          if (i == sample_mode)
512             continue;
513 
514          uint8_t *barys = payload.barycentric_coord_reg[i];
515 
516          uint8_t *sample_barys = payload.barycentric_coord_reg[sample_mode];
517          assert(barys[0] && sample_barys[0]);
518 
519          if (!loaded_flag) {
520             check_dynamic_msaa_flag(ubld, wm_prog_data,
521                                     INTEL_MSAA_FLAG_PERSAMPLE_INTERP);
522          }
523 
524          for (unsigned j = 0; j < s.dispatch_width / 8; j++) {
525             set_predicate(
526                BRW_PREDICATE_NORMAL,
527                ubld.MOV(brw_vec8_grf(barys[j / 2] + (j % 2) * 2, 0),
528                         brw_vec8_grf(sample_barys[j / 2] + (j % 2) * 2, 0)));
529          }
530       }
531    }
532 
533    for (int i = 0; i < BRW_BARYCENTRIC_MODE_COUNT; ++i) {
534       s.delta_xy[i] = fetch_barycentric_reg(
535          bld, payload.barycentric_coord_reg[i]);
536    }
537 
538    uint32_t centroid_modes = wm_prog_data->barycentric_interp_modes &
539       (1 << BRW_BARYCENTRIC_PERSPECTIVE_CENTROID |
540        1 << BRW_BARYCENTRIC_NONPERSPECTIVE_CENTROID);
541 
542    if (devinfo->needs_unlit_centroid_workaround && centroid_modes) {
543       /* Get the pixel/sample mask into f0 so that we know which
544        * pixels are lit.  Then, for each channel that is unlit,
545        * replace the centroid data with non-centroid data.
546        */
547       for (unsigned i = 0; i < DIV_ROUND_UP(s.dispatch_width, 16); i++) {
548          bld.exec_all().group(1, 0)
549             .MOV(retype(brw_flag_reg(0, i), BRW_TYPE_UW),
550                  retype(brw_vec1_grf(1 + i, 7), BRW_TYPE_UW));
551       }
552 
553       for (int i = 0; i < BRW_BARYCENTRIC_MODE_COUNT; ++i) {
554          if (!(centroid_modes & (1 << i)))
555             continue;
556 
557          const brw_reg centroid_delta_xy = s.delta_xy[i];
558          const brw_reg &pixel_delta_xy = s.delta_xy[i - 1];
559 
560          s.delta_xy[i] = bld.vgrf(BRW_TYPE_F, 2);
561 
562          for (unsigned c = 0; c < 2; c++) {
563             for (unsigned q = 0; q < s.dispatch_width / 8; q++) {
564                set_predicate(BRW_PREDICATE_NORMAL,
565                   bld.quarter(q).SEL(
566                      quarter(offset(s.delta_xy[i], bld, c), q),
567                      quarter(offset(centroid_delta_xy, bld, c), q),
568                      quarter(offset(pixel_delta_xy, bld, c), q)));
569             }
570          }
571       }
572    }
573 }
574 
575 
576 /**
577  * Emit a "replicated data" clear shader: copy the flat clear color input to
578  * the output and write it to every render target with replicated-data messages.
579  */
580 static void
581 brw_emit_repclear_shader(fs_visitor &s)
582 {
583    brw_wm_prog_key *key = (brw_wm_prog_key*) s.key;
584    fs_inst *write = NULL;
585 
586    assert(s.devinfo->ver < 20);
587    assert(s.uniforms == 0);
588    assume(key->nr_color_regions > 0);
589 
590    brw_reg color_output = retype(brw_vec4_grf(127, 0), BRW_TYPE_UD);
591    brw_reg header = retype(brw_vec8_grf(125, 0), BRW_TYPE_UD);
592 
593    /* We pass the clear color as a flat input.  Copy it to the output. */
594    brw_reg color_input =
595       brw_make_reg(FIXED_GRF, 2, 3, 0, 0, BRW_TYPE_UD,
596               BRW_VERTICAL_STRIDE_8, BRW_WIDTH_2, BRW_HORIZONTAL_STRIDE_4,
597               BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
598 
599    const fs_builder bld = fs_builder(&s).at_end();
600    bld.exec_all().group(4, 0).MOV(color_output, color_input);
601 
602    if (key->nr_color_regions > 1) {
603       /* Copy g0..g1 as the message header */
604       bld.exec_all().group(16, 0)
605          .MOV(header, retype(brw_vec8_grf(0, 0), BRW_TYPE_UD));
606    }
607 
608    for (int i = 0; i < key->nr_color_regions; ++i) {
609       if (i > 0)
610          bld.exec_all().group(1, 0).MOV(component(header, 2), brw_imm_ud(i));
611 
612       write = bld.emit(SHADER_OPCODE_SEND);
613       write->resize_sources(3);
614       write->sfid = GFX6_SFID_DATAPORT_RENDER_CACHE;
615       write->src[0] = brw_imm_ud(0);
616       write->src[1] = brw_imm_ud(0);
617       write->src[2] = i == 0 ? color_output : header;
618       write->check_tdr = true;
619       write->send_has_side_effects = true;
620       write->desc = brw_fb_write_desc(s.devinfo, i,
621          BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE_REPLICATED,
622          i == key->nr_color_regions - 1, false);
623 
624       /* We can use a headerless message for the first render target */
625       write->header_size = i == 0 ? 0 : 2;
626       write->mlen = 1 + write->header_size;
627    }
628    write->eot = true;
629    write->last_rt = true;
630 
631    brw_calculate_cfg(s);
632 
633    s.first_non_payload_grf = s.payload().num_regs;
634 
635    brw_fs_lower_scoreboard(s);
636 }
637 
638 /**
639  * Turn one of the two CENTROID barycentric modes into PIXEL mode.
640  */
641 static enum brw_barycentric_mode
642 centroid_to_pixel(enum brw_barycentric_mode bary)
643 {
644    assert(bary == BRW_BARYCENTRIC_PERSPECTIVE_CENTROID ||
645           bary == BRW_BARYCENTRIC_NONPERSPECTIVE_CENTROID);
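   /* enum brw_barycentric_mode is laid out so that each CENTROID mode
    * immediately follows its corresponding PIXEL mode, hence the "- 1".
    */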
646    return (enum brw_barycentric_mode) ((unsigned) bary - 1);
647 }
648 
649 static void
650 calculate_urb_setup(const struct intel_device_info *devinfo,
651                     const struct brw_wm_prog_key *key,
652                     struct brw_wm_prog_data *prog_data,
653                     const nir_shader *nir,
654                     const struct brw_mue_map *mue_map)
655 {
656    memset(prog_data->urb_setup, -1, sizeof(prog_data->urb_setup));
657    memset(prog_data->urb_setup_channel, 0, sizeof(prog_data->urb_setup_channel));
658 
659    int urb_next = 0; /* in vec4s */
660 
661    const uint64_t inputs_read =
662       nir->info.inputs_read & ~nir->info.per_primitive_inputs;
663 
664    /* Figure out where each of the incoming setup attributes lands. */
665    if (key->mesh_input != BRW_NEVER) {
666       /* Per-Primitive Attributes are laid out by Hardware before the regular
667        * attributes, so order them like this to make it easier later to map
668        * the setup into real HW registers.
669        */
670       if (nir->info.per_primitive_inputs) {
671          uint64_t per_prim_inputs_read =
672                nir->info.inputs_read & nir->info.per_primitive_inputs;
673 
674          /* In Mesh, PRIMITIVE_SHADING_RATE, VIEWPORT and LAYER slots
675           * are always at the beginning, because they come from MUE
676           * Primitive Header, not Per-Primitive Attributes.
677           */
678          const uint64_t primitive_header_bits = VARYING_BIT_VIEWPORT |
679                                                 VARYING_BIT_LAYER |
680                                                 VARYING_BIT_PRIMITIVE_SHADING_RATE;
681 
682          if (mue_map) {
683             unsigned per_prim_start_dw = mue_map->per_primitive_start_dw;
684             unsigned per_prim_size_dw = mue_map->per_primitive_pitch_dw;
685 
686             bool reads_header = (per_prim_inputs_read & primitive_header_bits) != 0;
687 
688             if (reads_header || mue_map->user_data_in_primitive_header) {
689                /* Primitive Shading Rate, Layer and Viewport live in the same
690                 * 4-dwords slot (psr is dword 0, layer is dword 1, and viewport
691                 * is dword 2).
692                 */
693                if (per_prim_inputs_read & VARYING_BIT_PRIMITIVE_SHADING_RATE)
694                   prog_data->urb_setup[VARYING_SLOT_PRIMITIVE_SHADING_RATE] = 0;
695 
696                if (per_prim_inputs_read & VARYING_BIT_LAYER)
697                   prog_data->urb_setup[VARYING_SLOT_LAYER] = 0;
698 
699                if (per_prim_inputs_read & VARYING_BIT_VIEWPORT)
700                   prog_data->urb_setup[VARYING_SLOT_VIEWPORT] = 0;
701 
702                per_prim_inputs_read &= ~primitive_header_bits;
703             } else {
704                /* If fs doesn't need primitive header, then it won't be made
705                 * available through SBE_MESH, so we have to skip them when
706                 * calculating offset from start of per-prim data.
707                 */
708                per_prim_start_dw += mue_map->per_primitive_header_size_dw;
709                per_prim_size_dw -= mue_map->per_primitive_header_size_dw;
710             }
711 
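            /* urb_setup[] is indexed in vec4 slots: the dword position within
             * the per-primitive block gives both the slot (pos_dw / 4) and
             * the channel within that slot (pos_dw % 4).
             */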
712             u_foreach_bit64(i, per_prim_inputs_read) {
713                int start = mue_map->start_dw[i];
714 
715                assert(start >= 0);
716                assert(mue_map->len_dw[i] > 0);
717 
718                assert(unsigned(start) >= per_prim_start_dw);
719                unsigned pos_dw = unsigned(start) - per_prim_start_dw;
720 
721                prog_data->urb_setup[i] = urb_next + pos_dw / 4;
722                prog_data->urb_setup_channel[i] = pos_dw % 4;
723             }
724 
725             urb_next = per_prim_size_dw / 4;
726          } else {
727             /* With no MUE map, we never read the primitive header, and
728              * per-primitive attributes won't be packed either, so just lay
729              * them in varying order.
730              */
731             per_prim_inputs_read &= ~primitive_header_bits;
732 
733             for (unsigned i = 0; i < VARYING_SLOT_MAX; i++) {
734                if (per_prim_inputs_read & BITFIELD64_BIT(i)) {
735                   prog_data->urb_setup[i] = urb_next++;
736                }
737             }
738 
739             /* The actual setup attributes that follow must be aligned to a full GRF. */
740             urb_next = ALIGN(urb_next, 2);
741          }
742 
743          prog_data->num_per_primitive_inputs = urb_next;
744       }
745 
746       const uint64_t clip_dist_bits = VARYING_BIT_CLIP_DIST0 |
747                                       VARYING_BIT_CLIP_DIST1;
748 
749       uint64_t unique_fs_attrs = inputs_read & BRW_FS_VARYING_INPUT_MASK;
750 
751       if (inputs_read & clip_dist_bits) {
752          assert(!mue_map || mue_map->per_vertex_header_size_dw > 8);
753          unique_fs_attrs &= ~clip_dist_bits;
754       }
755 
756       if (mue_map) {
757          unsigned per_vertex_start_dw = mue_map->per_vertex_start_dw;
758          unsigned per_vertex_size_dw = mue_map->per_vertex_pitch_dw;
759 
760          /* Per-Vertex header is available to fragment shader only if there's
761           * user data there.
762           */
763          if (!mue_map->user_data_in_vertex_header) {
764             per_vertex_start_dw += 8;
765             per_vertex_size_dw -= 8;
766          }
767 
768          /* In Mesh, CLIP_DIST slots are always at the beginning, because
769           * they come from MUE Vertex Header, not Per-Vertex Attributes.
770           */
771          if (inputs_read & clip_dist_bits) {
772             prog_data->urb_setup[VARYING_SLOT_CLIP_DIST0] = urb_next;
773             prog_data->urb_setup[VARYING_SLOT_CLIP_DIST1] = urb_next + 1;
774          } else if (mue_map && mue_map->per_vertex_header_size_dw > 8) {
775             /* Clip distances are in MUE, but we are not reading them in FS. */
776             per_vertex_start_dw += 8;
777             per_vertex_size_dw -= 8;
778          }
779 
780          /* Per-Vertex attributes are laid out in order.  Because we always link
781           * Mesh and Fragment shaders, which slots are written and read by each
782           * of them will match. */
783          u_foreach_bit64(i, unique_fs_attrs) {
784             int start = mue_map->start_dw[i];
785 
786             assert(start >= 0);
787             assert(mue_map->len_dw[i] > 0);
788 
789             assert(unsigned(start) >= per_vertex_start_dw);
790             unsigned pos_dw = unsigned(start) - per_vertex_start_dw;
791 
792             prog_data->urb_setup[i] = urb_next + pos_dw / 4;
793             prog_data->urb_setup_channel[i] = pos_dw % 4;
794          }
795 
796          urb_next += per_vertex_size_dw / 4;
797       } else {
798          /* If we don't have an MUE map, just lay down the inputs the FS reads
799           * in varying order, as we do for the legacy pipeline.
800           */
801          if (inputs_read & clip_dist_bits) {
802             prog_data->urb_setup[VARYING_SLOT_CLIP_DIST0] = urb_next++;
803             prog_data->urb_setup[VARYING_SLOT_CLIP_DIST1] = urb_next++;
804          }
805 
806          for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
807             if (unique_fs_attrs & BITFIELD64_BIT(i))
808                prog_data->urb_setup[i] = urb_next++;
809          }
810       }
811    } else {
812       assert(!nir->info.per_primitive_inputs);
813 
814       uint64_t vue_header_bits =
815          VARYING_BIT_PSIZ | VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT;
816 
817       uint64_t unique_fs_attrs = inputs_read & BRW_FS_VARYING_INPUT_MASK;
818 
819       /* VUE header fields all live in the same URB slot, so we pass them
820        * as a single FS input attribute.  We want to only count them once.
821        */
822       if (inputs_read & vue_header_bits) {
823          unique_fs_attrs &= ~vue_header_bits;
824          unique_fs_attrs |= VARYING_BIT_PSIZ;
825       }
826 
827       if (util_bitcount64(unique_fs_attrs) <= 16) {
828          /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
829           * first 16 varying inputs, so we can put them wherever we want.
830           * Just put them in order.
831           *
832           * This is useful because it means that (a) inputs not used by the
833           * fragment shader won't take up valuable register space, and (b) we
834           * won't have to recompile the fragment shader if it gets paired with
835           * a different vertex (or geometry) shader.
836           *
837           * VUE header fields share the same FS input attribute.
838           */
839          if (inputs_read & vue_header_bits) {
840             if (inputs_read & VARYING_BIT_PSIZ)
841                prog_data->urb_setup[VARYING_SLOT_PSIZ] = urb_next;
842             if (inputs_read & VARYING_BIT_LAYER)
843                prog_data->urb_setup[VARYING_SLOT_LAYER] = urb_next;
844             if (inputs_read & VARYING_BIT_VIEWPORT)
845                prog_data->urb_setup[VARYING_SLOT_VIEWPORT] = urb_next;
846 
847             urb_next++;
848          }
849 
850          for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
851             if (inputs_read & BRW_FS_VARYING_INPUT_MASK & ~vue_header_bits &
852                 BITFIELD64_BIT(i)) {
853                prog_data->urb_setup[i] = urb_next++;
854             }
855          }
856       } else {
857          /* We have enough input varyings that the SF/SBE pipeline stage can't
858           * arbitrarily rearrange them to suit our whim; we have to put them
859           * in an order that matches the output of the previous pipeline stage
860           * (geometry or vertex shader).
861           */
862 
863          /* Re-compute the VUE map here in the case that the one coming from
864           * geometry has more than one position slot (used for Primitive
865           * Replication).
866           */
867          struct intel_vue_map prev_stage_vue_map;
868          brw_compute_vue_map(devinfo, &prev_stage_vue_map,
869                              key->input_slots_valid,
870                              nir->info.separate_shader, 1);
871 
872          int first_slot =
873             brw_compute_first_urb_slot_required(inputs_read,
874                                                 &prev_stage_vue_map);
875 
876          assert(prev_stage_vue_map.num_slots <= first_slot + 32);
877          for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
878               slot++) {
879             int varying = prev_stage_vue_map.slot_to_varying[slot];
880             if (varying != BRW_VARYING_SLOT_PAD &&
881                 (inputs_read & BRW_FS_VARYING_INPUT_MASK &
882                  BITFIELD64_BIT(varying))) {
883                prog_data->urb_setup[varying] = slot - first_slot;
884             }
885          }
886          urb_next = prev_stage_vue_map.num_slots - first_slot;
887       }
888    }
889 
890    prog_data->num_varying_inputs = urb_next - prog_data->num_per_primitive_inputs;
891    prog_data->inputs = inputs_read;
892 
893    brw_compute_urb_setup_index(prog_data);
894 }
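/* Returns true if the def has any use other than as a source of a
 * load_frag_coord intrinsic, i.e. it actually needs barycentric
 * interpolation set up.
 */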
895 static bool
896 is_used_in_not_interp_frag_coord(nir_def *def)
897 {
898    nir_foreach_use_including_if(src, def) {
899       if (nir_src_is_if(src))
900          return true;
901 
902       if (nir_src_parent_instr(src)->type != nir_instr_type_intrinsic)
903          return true;
904 
905       nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(nir_src_parent_instr(src));
906       if (intrin->intrinsic != nir_intrinsic_load_frag_coord)
907          return true;
908    }
909 
910    return false;
911 }
912 
913 /**
914  * Return a bitfield where bit n is set if barycentric interpolation mode n
915  * (see enum brw_barycentric_mode) is needed by the fragment shader.
916  *
917  * We examine the load_barycentric intrinsics rather than looking at input
918  * variables so that we catch interpolateAtCentroid() messages too, which
919  * also need the BRW_BARYCENTRIC_[NON]PERSPECTIVE_CENTROID mode set up.
920  */
921 static unsigned
922 brw_compute_barycentric_interp_modes(const struct intel_device_info *devinfo,
923                                      const struct brw_wm_prog_key *key,
924                                      const nir_shader *shader)
925 {
926    unsigned barycentric_interp_modes = 0;
927 
928    nir_foreach_function_impl(impl, shader) {
929       nir_foreach_block(block, impl) {
930          nir_foreach_instr(instr, block) {
931             if (instr->type != nir_instr_type_intrinsic)
932                continue;
933 
934             nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
935             switch (intrin->intrinsic) {
936             case nir_intrinsic_load_barycentric_pixel:
937             case nir_intrinsic_load_barycentric_centroid:
938             case nir_intrinsic_load_barycentric_sample:
939             case nir_intrinsic_load_barycentric_at_sample:
940             case nir_intrinsic_load_barycentric_at_offset:
941                break;
942             default:
943                continue;
944             }
945 
946             /* Ignore WPOS; it doesn't require interpolation. */
947             if (!is_used_in_not_interp_frag_coord(&intrin->def))
948                continue;
949 
950             nir_intrinsic_op bary_op = intrin->intrinsic;
951             enum brw_barycentric_mode bary =
952                brw_barycentric_mode(key, intrin);
953 
954             barycentric_interp_modes |= 1 << bary;
955 
956             if (devinfo->needs_unlit_centroid_workaround &&
957                 bary_op == nir_intrinsic_load_barycentric_centroid)
958                barycentric_interp_modes |= 1 << centroid_to_pixel(bary);
959          }
960       }
961    }
962 
963    return barycentric_interp_modes;
964 }
965 
966 /**
967  * Return a bitfield where bit n is set if barycentric interpolation
968  * mode n (see enum brw_barycentric_mode) is needed by the fragment
969  * shader barycentric intrinsics that take an explicit offset or
970  * sample as argument.
971  */
972 static unsigned
973 brw_compute_offset_barycentric_interp_modes(const struct brw_wm_prog_key *key,
974                                             const nir_shader *shader)
975 {
976    unsigned barycentric_interp_modes = 0;
977 
978    nir_foreach_function_impl(impl, shader) {
979       nir_foreach_block(block, impl) {
980          nir_foreach_instr(instr, block) {
981             if (instr->type != nir_instr_type_intrinsic)
982                continue;
983 
984             nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
985             if (intrin->intrinsic == nir_intrinsic_load_barycentric_at_offset ||
986                 intrin->intrinsic == nir_intrinsic_load_barycentric_at_sample)
987                barycentric_interp_modes |= 1 << brw_barycentric_mode(key, intrin);
988          }
989       }
990    }
991 
992    return barycentric_interp_modes;
993 }
994 
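/* Build prog_data->flat_inputs, a bitmask of the FS input attribute indices
 * (urb_setup[] slots) that use flat interpolation.
 */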
995 static void
996 brw_compute_flat_inputs(struct brw_wm_prog_data *prog_data,
997                         const nir_shader *shader)
998 {
999    prog_data->flat_inputs = 0;
1000 
1001    nir_foreach_shader_in_variable(var, shader) {
1002       /* flat shading */
1003       if (var->data.interpolation != INTERP_MODE_FLAT)
1004          continue;
1005 
1006       if (var->data.per_primitive)
1007          continue;
1008 
1009       unsigned slots = glsl_count_attribute_slots(var->type, false);
1010       for (unsigned s = 0; s < slots; s++) {
1011          int input_index = prog_data->urb_setup[var->data.location + s];
1012 
1013          if (input_index >= 0)
1014             prog_data->flat_inputs |= 1 << input_index;
1015       }
1016    }
1017 }
1018 
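/* Translate the shader's depth output layout into the BRW_PSCDEPTH_* value
 * used to program the pixel shader computed depth mode state.
 */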
1019 static uint8_t
1020 computed_depth_mode(const nir_shader *shader)
1021 {
1022    if (shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
1023       switch (shader->info.fs.depth_layout) {
1024       case FRAG_DEPTH_LAYOUT_NONE:
1025       case FRAG_DEPTH_LAYOUT_ANY:
1026          return BRW_PSCDEPTH_ON;
1027       case FRAG_DEPTH_LAYOUT_GREATER:
1028          return BRW_PSCDEPTH_ON_GE;
1029       case FRAG_DEPTH_LAYOUT_LESS:
1030          return BRW_PSCDEPTH_ON_LE;
1031       case FRAG_DEPTH_LAYOUT_UNCHANGED:
1032          /* We initially set this to OFF, but having the shader write the
1033           * depth means we allocate register space in the SEND message. The
1034           * difference between the SEND register count and the OFF state
1035           * programming makes the HW hang.
1036           *
1037           * Removing the depth writes also leads to test failures. So use
1038           * LesserThanOrEqual, which fits writing the same value
1039           * (unchanged/equal).
1040           *
1041           */
1042          return BRW_PSCDEPTH_ON_LE;
1043       }
1044    }
1045    return BRW_PSCDEPTH_OFF;
1046 }
1047 
1048 static void
1049 brw_nir_populate_wm_prog_data(nir_shader *shader,
1050                               const struct intel_device_info *devinfo,
1051                               const struct brw_wm_prog_key *key,
1052                               struct brw_wm_prog_data *prog_data,
1053                               const struct brw_mue_map *mue_map)
1054 {
1055    prog_data->uses_kill = shader->info.fs.uses_discard;
1056    prog_data->uses_omask = !key->ignore_sample_mask_out &&
1057       (shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK));
1058    prog_data->max_polygons = 1;
1059    prog_data->computed_depth_mode = computed_depth_mode(shader);
1060    prog_data->computed_stencil =
1061       shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL);
1062 
1063    prog_data->sample_shading =
1064       shader->info.fs.uses_sample_shading ||
1065       shader->info.outputs_read;
1066 
1067    assert(key->multisample_fbo != BRW_NEVER ||
1068           key->persample_interp == BRW_NEVER);
1069 
1070    prog_data->persample_dispatch = key->persample_interp;
1071    if (prog_data->sample_shading)
1072       prog_data->persample_dispatch = BRW_ALWAYS;
1073 
1074    /* We can only persample dispatch if we have a multisample FBO */
1075    prog_data->persample_dispatch = MIN2(prog_data->persample_dispatch,
1076                                         key->multisample_fbo);
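   /* The tristate values are ordered BRW_NEVER < BRW_SOMETIMES < BRW_ALWAYS,
    * so the MIN2() above clamps per-sample dispatch to whatever the
    * multisample FBO state allows.
    */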
1077 
1078    /* Currently only the Vulkan API allows alpha_to_coverage to be dynamic. If
1079     * persample_dispatch & multisample_fbo are not dynamic, Anv should be able
1080     * to definitively tell whether alpha_to_coverage is on or off.
1081     */
1082    prog_data->alpha_to_coverage = key->alpha_to_coverage;
1083 
1084    prog_data->uses_sample_mask =
1085       BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_SAMPLE_MASK_IN);
1086 
1087    /* From the Ivy Bridge PRM documentation for 3DSTATE_PS:
1088     *
1089     *    "MSDISPMODE_PERSAMPLE is required in order to select
1090     *    POSOFFSET_SAMPLE"
1091     *
1092     * So we can only really get sample positions if we are doing real
1093     * per-sample dispatch.  If we need gl_SamplePosition and we don't have
1094     * persample dispatch, we hard-code it to 0.5.
1095     */
1096    prog_data->uses_pos_offset =
1097       prog_data->persample_dispatch != BRW_NEVER &&
1098       (BITSET_TEST(shader->info.system_values_read,
1099                    SYSTEM_VALUE_SAMPLE_POS) ||
1100        BITSET_TEST(shader->info.system_values_read,
1101                    SYSTEM_VALUE_SAMPLE_POS_OR_CENTER));
1102 
1103    prog_data->early_fragment_tests = shader->info.fs.early_fragment_tests;
1104    prog_data->post_depth_coverage = shader->info.fs.post_depth_coverage;
1105    prog_data->inner_coverage = shader->info.fs.inner_coverage;
1106 
1107    prog_data->barycentric_interp_modes =
1108       brw_compute_barycentric_interp_modes(devinfo, key, shader);
1109 
1110    /* From the BDW PRM documentation for 3DSTATE_WM:
1111     *
1112     *    "MSDISPMODE_PERSAMPLE is required in order to select Perspective
1113     *     Sample or Non- perspective Sample barycentric coordinates."
1114     *
1115     * So cleanup any potentially set sample barycentric mode when not in per
1116     * sample dispatch.
1117     */
1118    if (prog_data->persample_dispatch == BRW_NEVER) {
1119       prog_data->barycentric_interp_modes &=
1120          ~BITFIELD_BIT(BRW_BARYCENTRIC_PERSPECTIVE_SAMPLE);
1121    }
1122 
1123    if (devinfo->ver >= 20) {
1124       const unsigned offset_bary_modes =
1125          brw_compute_offset_barycentric_interp_modes(key, shader);
1126 
1127       prog_data->uses_npc_bary_coefficients =
1128          offset_bary_modes & BRW_BARYCENTRIC_NONPERSPECTIVE_BITS;
1129       prog_data->uses_pc_bary_coefficients =
1130          offset_bary_modes & ~BRW_BARYCENTRIC_NONPERSPECTIVE_BITS;
1131       prog_data->uses_sample_offsets =
1132          offset_bary_modes & ((1 << BRW_BARYCENTRIC_PERSPECTIVE_SAMPLE) |
1133                               (1 << BRW_BARYCENTRIC_NONPERSPECTIVE_SAMPLE));
1134    }
1135 
1136    prog_data->uses_nonperspective_interp_modes =
1137       (prog_data->barycentric_interp_modes & BRW_BARYCENTRIC_NONPERSPECTIVE_BITS) ||
1138       prog_data->uses_npc_bary_coefficients;
1139 
1140    /* The current VK_EXT_graphics_pipeline_library specification requires
1141     * coarse to be specified at compile time. But per sample interpolation can
1142     * be dynamic. So we should never be in a situation where coarse &
1143     * persample_interp are respectively true & BRW_ALWAYS.
1144     *
1145     * Coarse will be dynamically turned off when persample_interp is active.
1146     */
1147    assert(!key->coarse_pixel || key->persample_interp != BRW_ALWAYS);
1148 
1149    prog_data->coarse_pixel_dispatch =
1150       brw_sometimes_invert(prog_data->persample_dispatch);
1151    if (!key->coarse_pixel ||
1152        prog_data->uses_omask ||
1153        prog_data->sample_shading ||
1154        prog_data->uses_sample_mask ||
1155        (prog_data->computed_depth_mode != BRW_PSCDEPTH_OFF) ||
1156        prog_data->computed_stencil) {
1157       prog_data->coarse_pixel_dispatch = BRW_NEVER;
1158    }
1159 
1160    /* ICL PRMs, Volume 9: Render Engine, Shared Functions Pixel Interpolater,
1161     * Message Descriptor :
1162     *
1163     *    "Message Type. Specifies the type of message being sent when
1164     *     pixel-rate evaluation is requested :
1165     *
1166     *     Format = U2
1167     *       0: Per Message Offset (eval_snapped with immediate offset)
1168     *       1: Sample Position Offset (eval_sindex)
1169     *       2: Centroid Position Offset (eval_centroid)
1170     *       3: Per Slot Offset (eval_snapped with register offset)
1171     *
1172     *     Message Type. Specifies the type of message being sent when
1173     *     coarse-rate evaluation is requested :
1174     *
1175     *     Format = U2
1176     *       0: Coarse to Pixel Mapping Message (internal message)
1177     *       1: Reserved
1178     *       2: Coarse Centroid Position (eval_centroid)
1179     *       3: Per Slot Coarse Pixel Offset (eval_snapped with register offset)"
1180     *
1181     * The Sample Position Offset is marked as reserved for coarse rate
1182     * evaluation and leads to hangs if we try to use it. So disable coarse
1183     * pixel shading if we have any intrinsic that will result in a pixel
1184     * interpolater message at sample.
1185     */
1186    if (intel_nir_pulls_at_sample(shader))
1187       prog_data->coarse_pixel_dispatch = BRW_NEVER;
1188 
1189    /* We choose to always enable VMask prior to XeHP, as it would cause
1190     * us to lose out on the eliminate_find_live_channel() optimization.
1191     */
1192    prog_data->uses_vmask = devinfo->verx10 < 125 ||
1193                            shader->info.fs.needs_quad_helper_invocations ||
1194                            shader->info.uses_wide_subgroup_intrinsics ||
1195                            prog_data->coarse_pixel_dispatch != BRW_NEVER;
1196 
1197    prog_data->uses_src_w =
1198       BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_FRAG_COORD);
1199    prog_data->uses_src_depth =
1200       BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_FRAG_COORD) &&
1201       prog_data->coarse_pixel_dispatch != BRW_ALWAYS;
1202    prog_data->uses_depth_w_coefficients = prog_data->uses_pc_bary_coefficients ||
1203       (BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_FRAG_COORD) &&
1204        prog_data->coarse_pixel_dispatch != BRW_NEVER);
1205 
1206    calculate_urb_setup(devinfo, key, prog_data, shader, mue_map);
1207    brw_compute_flat_inputs(prog_data, shader);
1208 }
1209 
1210 /* From the SKL PRM, Volume 16, Workarounds:
1211  *
1212  *   0877  3D   Pixel Shader Hang possible when pixel shader dispatched with
1213  *              only header phases (R0-R2)
1214  *
1215  *   WA: Enable a non-header phase (e.g. push constant) when dispatch would
1216  *       have been header only.
1217  *
1218  * Instead of enabling push constants one can alternatively enable one of the
1219  * inputs. Here one simply chooses "layer" which shouldn't impose much
1220  * overhead.
1221  */
1222 static void
1223 gfx9_ps_header_only_workaround(struct brw_wm_prog_data *wm_prog_data)
1224 {
1225    if (wm_prog_data->num_varying_inputs)
1226       return;
1227 
1228    if (wm_prog_data->base.curb_read_length)
1229       return;
1230 
1231    wm_prog_data->urb_setup[VARYING_SLOT_LAYER] = 0;
1232    wm_prog_data->num_varying_inputs = 1;
1233 
1234    brw_compute_urb_setup_index(wm_prog_data);
1235 }
1236 
1237 static void
1238 brw_assign_urb_setup(fs_visitor &s)
1239 {
1240    assert(s.stage == MESA_SHADER_FRAGMENT);
1241 
1242    const struct intel_device_info *devinfo = s.devinfo;
1243    struct brw_wm_prog_data *prog_data = brw_wm_prog_data(s.prog_data);
1244 
1245    int urb_start = s.payload().num_regs + prog_data->base.curb_read_length;
1246 
1247    /* Offset all the urb_setup[] index by the actual position of the
1248     * setup regs, now that the location of the constants has been chosen.
1249     */
1250    foreach_block_and_inst(block, fs_inst, inst, s.cfg) {
1251       for (int i = 0; i < inst->sources; i++) {
1252          if (inst->src[i].file == ATTR) {
1253             /* ATTR brw_reg::nr in the FS is in units of logical scalar
1254              * inputs each of which consumes 16B on Gfx4-Gfx12.  In
1255              * single polygon mode this leads to the following layout
1256              * of the vertex setup plane parameters in the ATTR
1257              * register file:
1258              *
1259              *  brw_reg::nr   Input   Comp0  Comp1  Comp2  Comp3
1260              *      0       Attr0.x  a1-a0  a2-a0   N/A    a0
1261              *      1       Attr0.y  a1-a0  a2-a0   N/A    a0
1262              *      2       Attr0.z  a1-a0  a2-a0   N/A    a0
1263              *      3       Attr0.w  a1-a0  a2-a0   N/A    a0
1264              *      4       Attr1.x  a1-a0  a2-a0   N/A    a0
1265              *     ...
1266              *
1267              * In multipolygon mode that no longer works since
1268              * different channels may be processing polygons with
1269              * different plane parameters, so each parameter above is
1270              * represented as a dispatch_width-wide vector:
1271              *
1272              *  brw_reg::nr     brw_reg::offset    Input      Comp0     ...    CompN
1273              *      0                 0          Attr0.x  a1[0]-a0[0] ... a1[N]-a0[N]
1274              *      0        4 * dispatch_width  Attr0.x  a2[0]-a0[0] ... a2[N]-a0[N]
1275              *      0        8 * dispatch_width  Attr0.x     N/A      ...     N/A
1276              *      0       12 * dispatch_width  Attr0.x    a0[0]     ...    a0[N]
1277              *      1                 0          Attr0.y  a1[0]-a0[0] ... a1[N]-a0[N]
1278              *     ...
1279              *
1280              * Note that many of the components on a single row above
1281              * are likely to be replicated multiple times (if, say, a
1282              * single SIMD thread is only processing 2 different
1283              * polygons), so plane parameters aren't actually stored
1284              * in GRF memory with that layout to avoid wasting space.
1285              * Instead we compose ATTR register regions with a 2D
1286              * region that walks through the parameters of each
1287              * polygon with the correct stride, reading the parameter
1288              * corresponding to each channel directly from the PS
1289              * thread payload.
1290              *
1291              * The latter layout corresponds to a param_width equal to
1292              * dispatch_width, while the former (scalar parameter)
1293              * layout has a param_width of 1.
1294              *
1295              * Gfx20+ represent plane parameters in a format similar
1296              * to the above, except the parameters are packed in 12B
1297              * and ordered like "a0, a1-a0, a2-a0" instead of the
1298              * above vec4 representation with a missing component.
1299              */
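            /* Illustrative example (hypothetical input, assuming no
             * per-primitive inputs): the a2-a0 parameter of Attr1.y is
             * ATTR nr 5 at byte offset 4 in the scalar (single-polygon)
             * layout above, and at nr 5, byte offset 4 * dispatch_width
             * (64 for SIMD16) in the multipolygon representation.
             */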
1300             const unsigned param_width = (s.max_polygons > 1 ? s.dispatch_width : 1);
1301 
1302             /* Size of a single scalar component of a plane parameter
1303              * in bytes.
1304              */
1305             const unsigned chan_sz = 4;
1306             struct brw_reg reg;
1307             assert(s.max_polygons > 0);
1308 
1309             /* Calculate the base register on the thread payload of
1310              * either the block of vertex setup data or the block of
1311              * per-primitive constant data depending on whether we're
1312              * accessing a primitive or vertex input.  Also calculate
1313              * the index of the input within that block.
1314              */
1315             const bool per_prim = inst->src[i].nr < prog_data->num_per_primitive_inputs;
1316             const unsigned base = urb_start +
1317                (per_prim ? 0 :
1318                 ALIGN(prog_data->num_per_primitive_inputs / 2,
1319                       reg_unit(devinfo)) * s.max_polygons);
1320             const unsigned idx = per_prim ? inst->src[i].nr :
1321                inst->src[i].nr - prog_data->num_per_primitive_inputs;
1322 
1323             /* Translate the offset within the param_width-wide
1324              * representation described above into an offset and a
1325              * grf, which contains the plane parameters for the first
1326              * polygon processed by the thread.
1327              */
1328             if (devinfo->ver >= 20 && !per_prim) {
1329                /* Gfx20+ is able to pack 5 logical input components
1330                 * per 64B register for vertex setup data.
1331                 */
1332                const unsigned grf = base + idx / 5 * 2 * s.max_polygons;
1333                assert(inst->src[i].offset / param_width < 12);
1334                const unsigned delta = idx % 5 * 12 +
1335                   inst->src[i].offset / (param_width * chan_sz) * chan_sz +
1336                   inst->src[i].offset % chan_sz;
1337                reg = byte_offset(retype(brw_vec8_grf(grf, 0), inst->src[i].type),
1338                                  delta);
1339             } else {
1340                /* Earlier platforms and the per-primitive block pack 2 logical
1341                 * input components per 32B register.
1342                 */
1343                const unsigned grf = base + idx / 2 * s.max_polygons;
1344                assert(inst->src[i].offset / param_width < REG_SIZE / 2);
1345                const unsigned delta = (idx % 2) * (REG_SIZE / 2) +
1346                   inst->src[i].offset / (param_width * chan_sz) * chan_sz +
1347                   inst->src[i].offset % chan_sz;
1348                reg = byte_offset(retype(brw_vec8_grf(grf, 0), inst->src[i].type),
1349                                  delta);
1350             }
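            /* Worked example of the address arithmetic above
             * (illustrative values): single-polygon dispatch
             * (param_width == 1, max_polygons == 1), no per-primitive
             * inputs (base == urb_start), vertex input idx == 5 with a
             * source byte offset of 4:
             *
             *  - Pre-Gfx20: grf = base + 5 / 2 = base + 2,
             *    delta = (5 % 2) * 16 + 4 = 20, i.e. Comp1 (a2-a0) in
             *    the second 16B half of that register.
             *
             *  - Gfx20+:    grf = base + 5 / 5 * 2 = base + 2,
             *    delta = (5 % 5) * 12 + 4 = 4, i.e. the a1-a0 term of
             *    the first 12B parameter triple in that register.
             */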
1351 
1352             if (s.max_polygons > 1) {
1353                assert(devinfo->ver >= 12);
1354                /* Misaligned channel strides that would lead to
1355                 * cross-channel access in the representation above are
1356                 * disallowed.
1357                 */
1358                assert(inst->src[i].stride * brw_type_size_bytes(inst->src[i].type) == chan_sz);
1359 
1360                /* Number of channels processing the same polygon. */
1361                const unsigned poly_width = s.dispatch_width / s.max_polygons;
1362                assert(s.dispatch_width % s.max_polygons == 0);
1363 
1364                /* Accessing a subset of channels of a parameter vector
1365                 * starting from "chan" is necessary to handle
1366                 * SIMD-lowered instructions though.
1367                 */
1368                const unsigned chan = inst->src[i].offset %
1369                   (param_width * chan_sz) / chan_sz;
1370                assert(chan < s.dispatch_width);
1371                assert(chan % poly_width == 0);
1372                const unsigned reg_size = reg_unit(devinfo) * REG_SIZE;
1373                reg = byte_offset(reg, chan / poly_width * reg_size);
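            /* For instance (hypothetical SIMD32 dual-polygon shader,
             * poly_width == 16): the second half of a SIMD-lowered
             * instruction carries an extra 16 * chan_sz in its source
             * offset, so chan == 16 and the region is advanced by one
             * reg_size to the second polygon's parameters.
             */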
1374 
1375                if (inst->exec_size > poly_width) {
1376                   /* Accessing the parameters for multiple polygons.
1377                    * Corresponding parameters for different polygons
1378                    * are stored a GRF apart on the thread payload, so
1379                    * use that as vertical stride.
1380                    */
1381                   const unsigned vstride = reg_size / brw_type_size_bytes(inst->src[i].type);
1382                   assert(vstride <= 32);
1383                   assert(chan % poly_width == 0);
1384                   reg = stride(reg, vstride, poly_width, 0);
1385                } else {
1386                   /* Accessing one parameter for a single polygon --
1387                    * Translate to a scalar region.
1388                    */
1389                   assert(chan % poly_width + inst->exec_size <= poly_width);
1390                   reg = stride(reg, 0, 1, 0);
1391                }
1392 
1393             } else {
1394                const unsigned width = inst->src[i].stride == 0 ?
1395                   1 : MIN2(inst->exec_size, 8);
1396                reg = stride(reg, width * inst->src[i].stride,
1397                             width, inst->src[i].stride);
1398             }
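            /* Illustrative resulting regions (hypothetical 32-bit float
             * source with stride 1): in dual-polygon SIMD16 on Gfx12
             * (poly_width == 8) the vstride above is 32 / 4 == 8,
             * giving an <8;8,0> region -- each polygon's 8 channels
             * read a single scalar parameter, with the second polygon's
             * copy one GRF later.  In single-polygon mode the same
             * source becomes a plain <8;8,1> region, or <0;1,0> when
             * the stride is 0.
             */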
1399 
1400             reg.abs = inst->src[i].abs;
1401             reg.negate = inst->src[i].negate;
1402             inst->src[i] = reg;
1403          }
1404       }
1405    }
1406 
1407    /* Each attribute is 4 setup channels, each of which is half a reg,
1408     * but they may be replicated multiple times for multipolygon
1409     * dispatch.
1410     */
1411    s.first_non_payload_grf += prog_data->num_varying_inputs * 2 * s.max_polygons;
1412 
1413    /* Unlike regular attributes, per-primitive attributes have all 4 channels
1414     * in the same slot, so each GRF can store two slots.
1415     */
1416    assert(prog_data->num_per_primitive_inputs % 2 == 0);
1417    s.first_non_payload_grf += prog_data->num_per_primitive_inputs / 2 * s.max_polygons;
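   /* For example (illustrative counts): 8 varying inputs with
    * max_polygons == 2 reserve 8 * 2 * 2 == 32 payload GRFs for vertex
    * setup, and 4 per-primitive inputs reserve another 4 / 2 * 2 == 4.
    */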
1418 }
1419 
1420 static bool
1421 run_fs(fs_visitor &s, bool allow_spilling, bool do_rep_send)
1422 {
1423    const struct intel_device_info *devinfo = s.devinfo;
1424    struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(s.prog_data);
1425    brw_wm_prog_key *wm_key = (brw_wm_prog_key *) s.key;
1426    const fs_builder bld = fs_builder(&s).at_end();
1427    const nir_shader *nir = s.nir;
1428 
1429    assert(s.stage == MESA_SHADER_FRAGMENT);
1430 
1431    s.payload_ = new fs_thread_payload(s, s.source_depth_to_render_target);
1432 
1433    if (nir->info.ray_queries > 0)
1434       s.limit_dispatch_width(16, "SIMD32 not supported with ray queries.\n");
1435 
1436    if (do_rep_send) {
1437       assert(s.dispatch_width == 16);
1438       brw_emit_repclear_shader(s);
1439    } else {
1440       if (nir->info.inputs_read > 0 ||
1441           BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_FRAG_COORD) ||
1442           (nir->info.outputs_read > 0 && !wm_key->coherent_fb_fetch)) {
1443          brw_emit_interpolation_setup(s);
1444       }
1445 
1446       /* We handle discards by keeping track of the still-live pixels in f0.1.
1447        * Initialize it with the dispatched pixels.
1448        */
1449       if (devinfo->ver >= 20 || wm_prog_data->uses_kill) {
1450          const unsigned lower_width = MIN2(s.dispatch_width, 16);
1451          for (unsigned i = 0; i < s.dispatch_width / lower_width; i++) {
1452             /* According to the "PS Thread Payload for Normal
1453              * Dispatch" pages on the BSpec, the dispatch mask is
1454              * stored in R0.15/R1.15 on gfx20+ and in R1.7/R2.7 on
1455              * gfx6+.
1456              */
1457             const brw_reg dispatch_mask =
1458                devinfo->ver >= 20 ? xe2_vec1_grf(i, 15) :
1459                                     brw_vec1_grf(i + 1, 7);
1460             bld.exec_all().group(1, 0)
1461                .MOV(brw_sample_mask_reg(bld.group(lower_width, i)),
1462                     retype(dispatch_mask, BRW_TYPE_UW));
1463          }
1464       }
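      /* Concretely, a SIMD32 shader runs the loop above twice with
       * lower_width == 16, copying g1.7:UW and g2.7:UW (g0.15/g1.15 on
       * Gfx20+) into the corresponding halves of the sample mask.
       */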
1465 
1466       if (nir->info.writes_memory)
1467          wm_prog_data->has_side_effects = true;
1468 
1469       nir_to_brw(&s);
1470 
1471       if (s.failed)
1472          return false;
1473 
1474       brw_emit_fb_writes(s);
1475 
1476       brw_calculate_cfg(s);
1477 
1478       brw_fs_optimize(s);
1479 
1480       s.assign_curb_setup();
1481 
1482       if (devinfo->ver == 9)
1483          gfx9_ps_header_only_workaround(wm_prog_data);
1484 
1485       brw_assign_urb_setup(s);
1486 
1487       brw_fs_lower_3src_null_dest(s);
1488       brw_fs_workaround_memory_fence_before_eot(s);
1489       brw_fs_workaround_emit_dummy_mov_instruction(s);
1490 
1491       brw_allocate_registers(s, allow_spilling);
1492    }
1493 
1494    return !s.failed;
1495 }
1496 
1497 const unsigned *
1498 brw_compile_fs(const struct brw_compiler *compiler,
1499                struct brw_compile_fs_params *params)
1500 {
1501    struct nir_shader *nir = params->base.nir;
1502    const struct brw_wm_prog_key *key = params->key;
1503    struct brw_wm_prog_data *prog_data = params->prog_data;
1504    bool allow_spilling = params->allow_spilling;
1505    const bool debug_enabled =
1506       brw_should_print_shader(nir, params->base.debug_flag ?
1507                                    params->base.debug_flag : DEBUG_WM);
1508 
1509    prog_data->base.stage = MESA_SHADER_FRAGMENT;
1510    prog_data->base.ray_queries = nir->info.ray_queries;
1511    prog_data->base.total_scratch = 0;
1512 
1513    const struct intel_device_info *devinfo = compiler->devinfo;
1514    const unsigned max_subgroup_size = 32;
1515 
1516    brw_nir_apply_key(nir, compiler, &key->base, max_subgroup_size);
1517    brw_nir_lower_fs_inputs(nir, devinfo, key);
1518    brw_nir_lower_fs_outputs(nir);
1519 
1520    /* From the SKL PRM, Volume 7, "Alpha Coverage":
1521     *  "If Pixel Shader outputs oMask, AlphaToCoverage is disabled in
1522     *   hardware, regardless of the state setting for this feature."
1523     */
1524    if (key->alpha_to_coverage != BRW_NEVER) {
1525       /* Run constant fold optimization in order to get the correct source
1526        * offset to determine render target 0 store instruction in
1527        * emit_alpha_to_coverage pass.
1528        */
1529       NIR_PASS(_, nir, nir_opt_constant_folding);
1530       NIR_PASS(_, nir, brw_nir_lower_alpha_to_coverage, key, prog_data);
1531    }
1532 
1533    NIR_PASS(_, nir, brw_nir_move_interpolation_to_top);
1534    brw_postprocess_nir(nir, compiler, debug_enabled,
1535                        key->base.robust_flags);
1536 
1537    brw_nir_populate_wm_prog_data(nir, compiler->devinfo, key, prog_data,
1538                                  params->mue_map);
1539 
1540    std::unique_ptr<fs_visitor> v8, v16, v32, vmulti;
1541    cfg_t *simd8_cfg = NULL, *simd16_cfg = NULL, *simd32_cfg = NULL,
1542       *multi_cfg = NULL;
1543    float throughput = 0;
1544    bool has_spilled = false;
1545 
1546    if (devinfo->ver < 20) {
1547       v8 = std::make_unique<fs_visitor>(compiler, &params->base, key,
1548                                         prog_data, nir, 8, 1,
1549                                         params->base.stats != NULL,
1550                                         debug_enabled);
1551       if (!run_fs(*v8, allow_spilling, false /* do_rep_send */)) {
1552          params->base.error_str = ralloc_strdup(params->base.mem_ctx,
1553                                                 v8->fail_msg);
1554          return NULL;
1555       } else if (INTEL_SIMD(FS, 8)) {
1556          simd8_cfg = v8->cfg;
1557 
1558          assert(v8->payload().num_regs % reg_unit(devinfo) == 0);
1559          prog_data->base.dispatch_grf_start_reg = v8->payload().num_regs / reg_unit(devinfo);
1560 
1561          const performance &perf = v8->performance_analysis.require();
1562          throughput = MAX2(throughput, perf.throughput);
1563          has_spilled = v8->spilled_any_registers;
1564          allow_spilling = false;
1565       }
1566    }
1567 
1568    if (key->coarse_pixel && devinfo->ver < 20) {
1569       if (prog_data->dual_src_blend) {
1570          v8->limit_dispatch_width(8, "SIMD16 coarse pixel shading cannot"
1571                                   " use SIMD8 messages.\n");
1572       }
1573       v8->limit_dispatch_width(16, "SIMD32 not supported with coarse"
1574                                " pixel shading.\n");
1575    }
1576 
1577    if (!has_spilled &&
1578        (!v8 || v8->max_dispatch_width >= 16) &&
1579        (INTEL_SIMD(FS, 16) || params->use_rep_send)) {
1580       /* Try a SIMD16 compile */
1581       v16 = std::make_unique<fs_visitor>(compiler, &params->base, key,
1582                                          prog_data, nir, 16, 1,
1583                                          params->base.stats != NULL,
1584                                          debug_enabled);
1585       if (v8)
1586          v16->import_uniforms(v8.get());
1587       if (!run_fs(*v16, allow_spilling, params->use_rep_send)) {
1588          brw_shader_perf_log(compiler, params->base.log_data,
1589                              "SIMD16 shader failed to compile: %s\n",
1590                              v16->fail_msg);
1591       } else {
1592          simd16_cfg = v16->cfg;
1593 
1594          assert(v16->payload().num_regs % reg_unit(devinfo) == 0);
1595          prog_data->dispatch_grf_start_reg_16 = v16->payload().num_regs / reg_unit(devinfo);
1596 
1597          const performance &perf = v16->performance_analysis.require();
1598          throughput = MAX2(throughput, perf.throughput);
1599          has_spilled = v16->spilled_any_registers;
1600          allow_spilling = false;
1601       }
1602    }
1603 
1604    const bool simd16_failed = v16 && !simd16_cfg;
1605 
1606    /* Currently, the compiler only supports SIMD32 on SNB+ */
1607    if (!has_spilled &&
1608        (!v8 || v8->max_dispatch_width >= 32) &&
1609        (!v16 || v16->max_dispatch_width >= 32) && !params->use_rep_send &&
1610        !simd16_failed &&
1611        INTEL_SIMD(FS, 32)) {
1612       /* Try a SIMD32 compile */
1613       v32 = std::make_unique<fs_visitor>(compiler, &params->base, key,
1614                                          prog_data, nir, 32, 1,
1615                                          params->base.stats != NULL,
1616                                          debug_enabled);
1617       if (v8)
1618          v32->import_uniforms(v8.get());
1619       else if (v16)
1620          v32->import_uniforms(v16.get());
1621 
1622       if (!run_fs(*v32, allow_spilling, false)) {
1623          brw_shader_perf_log(compiler, params->base.log_data,
1624                              "SIMD32 shader failed to compile: %s\n",
1625                              v32->fail_msg);
1626       } else {
1627          const performance &perf = v32->performance_analysis.require();
1628 
1629          if (!INTEL_DEBUG(DEBUG_DO32) && throughput >= perf.throughput) {
1630             brw_shader_perf_log(compiler, params->base.log_data,
1631                                 "SIMD32 shader inefficient\n");
1632          } else {
1633             simd32_cfg = v32->cfg;
1634 
1635             assert(v32->payload().num_regs % reg_unit(devinfo) == 0);
1636             prog_data->dispatch_grf_start_reg_32 = v32->payload().num_regs / reg_unit(devinfo);
1637 
1638             throughput = MAX2(throughput, perf.throughput);
1639          }
1640       }
1641    }
1642 
1643    if (devinfo->ver >= 12 && !has_spilled &&
1644        params->max_polygons >= 2 && !key->coarse_pixel) {
1645       fs_visitor *vbase = v8 ? v8.get() : v16 ? v16.get() : v32.get();
1646       assert(vbase);
1647 
1648       if (devinfo->ver >= 20 &&
1649           params->max_polygons >= 4 &&
1650           vbase->max_dispatch_width >= 32 &&
1651           4 * prog_data->num_varying_inputs <= MAX_VARYING &&
1652           INTEL_SIMD(FS, 4X8)) {
1653          /* Try a quad-SIMD8 compile */
1654          vmulti = std::make_unique<fs_visitor>(compiler, &params->base, key,
1655                                                prog_data, nir, 32, 4,
1656                                                params->base.stats != NULL,
1657                                                debug_enabled);
1658          vmulti->import_uniforms(vbase);
1659          if (!run_fs(*vmulti, false, params->use_rep_send)) {
1660             brw_shader_perf_log(compiler, params->base.log_data,
1661                                 "Quad-SIMD8 shader failed to compile: %s\n",
1662                                 vmulti->fail_msg);
1663          } else {
1664             multi_cfg = vmulti->cfg;
1665             assert(!vmulti->spilled_any_registers);
1666          }
1667       }
1668 
1669       if (!multi_cfg && devinfo->ver >= 20 &&
1670           vbase->max_dispatch_width >= 32 &&
1671           2 * prog_data->num_varying_inputs <= MAX_VARYING &&
1672           INTEL_SIMD(FS, 2X16)) {
1673          /* Try a dual-SIMD16 compile */
1674          vmulti = std::make_unique<fs_visitor>(compiler, &params->base, key,
1675                                                prog_data, nir, 32, 2,
1676                                                params->base.stats != NULL,
1677                                                debug_enabled);
1678          vmulti->import_uniforms(vbase);
1679          if (!run_fs(*vmulti, false, params->use_rep_send)) {
1680             brw_shader_perf_log(compiler, params->base.log_data,
1681                                 "Dual-SIMD16 shader failed to compile: %s\n",
1682                                 vmulti->fail_msg);
1683          } else {
1684             multi_cfg = vmulti->cfg;
1685             assert(!vmulti->spilled_any_registers);
1686          }
1687       }
1688 
1689       if (!multi_cfg && vbase->max_dispatch_width >= 16 &&
1690           2 * prog_data->num_varying_inputs <= MAX_VARYING &&
1691           INTEL_SIMD(FS, 2X8)) {
1692          /* Try a dual-SIMD8 compile */
1693          vmulti = std::make_unique<fs_visitor>(compiler, &params->base, key,
1694                                                prog_data, nir, 16, 2,
1695                                                params->base.stats != NULL,
1696                                                debug_enabled);
1697          vmulti->import_uniforms(vbase);
1698          if (!run_fs(*vmulti, allow_spilling, params->use_rep_send)) {
1699             brw_shader_perf_log(compiler, params->base.log_data,
1700                                 "Dual-SIMD8 shader failed to compile: %s\n",
1701                                 vmulti->fail_msg);
1702          } else {
1703             multi_cfg = vmulti->cfg;
1704          }
1705       }
1706 
1707       if (multi_cfg) {
1708          assert(vmulti->payload().num_regs % reg_unit(devinfo) == 0);
1709          prog_data->base.dispatch_grf_start_reg = vmulti->payload().num_regs / reg_unit(devinfo);
1710       }
1711    }
1712 
1713    /* When the caller requests a repclear shader, they want SIMD16-only */
1714    if (params->use_rep_send)
1715       simd8_cfg = NULL;
1716 
1717    fs_generator g(compiler, &params->base, &prog_data->base,
1718                   MESA_SHADER_FRAGMENT);
1719 
1720    if (unlikely(debug_enabled)) {
1721       g.enable_debug(ralloc_asprintf(params->base.mem_ctx,
1722                                      "%s fragment shader %s",
1723                                      nir->info.label ?
1724                                         nir->info.label : "unnamed",
1725                                      nir->info.name));
1726    }
1727 
1728    struct brw_compile_stats *stats = params->base.stats;
1729    uint32_t max_dispatch_width = 0;
1730 
1731    if (multi_cfg) {
1732       prog_data->dispatch_multi = vmulti->dispatch_width;
1733       prog_data->max_polygons = vmulti->max_polygons;
1734       g.generate_code(multi_cfg, vmulti->dispatch_width, vmulti->shader_stats,
1735                       vmulti->performance_analysis.require(),
1736                       stats, vmulti->max_polygons);
1737       stats = stats ? stats + 1 : NULL;
1738       max_dispatch_width = vmulti->dispatch_width;
1739 
1740    } else if (simd8_cfg) {
1741       prog_data->dispatch_8 = true;
1742       g.generate_code(simd8_cfg, 8, v8->shader_stats,
1743                       v8->performance_analysis.require(), stats, 1);
1744       stats = stats ? stats + 1 : NULL;
1745       max_dispatch_width = 8;
1746    }
1747 
1748    if (simd16_cfg) {
1749       prog_data->dispatch_16 = true;
1750       prog_data->prog_offset_16 = g.generate_code(
1751          simd16_cfg, 16, v16->shader_stats,
1752          v16->performance_analysis.require(), stats, 1);
1753       stats = stats ? stats + 1 : NULL;
1754       max_dispatch_width = 16;
1755    }
1756 
1757    if (simd32_cfg) {
1758       prog_data->dispatch_32 = true;
1759       prog_data->prog_offset_32 = g.generate_code(
1760          simd32_cfg, 32, v32->shader_stats,
1761          v32->performance_analysis.require(), stats, 1);
1762       stats = stats ? stats + 1 : NULL;
1763       max_dispatch_width = 32;
1764    }
1765 
1766    for (struct brw_compile_stats *s = params->base.stats; s != NULL && s != stats; s++)
1767       s->max_dispatch_width = max_dispatch_width;
1768 
1769    g.add_const_data(nir->constant_data, nir->constant_data_size);
1770    return g.get_assembly();
1771 }
1772