/*
 * Copyright © 2010, 2022 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/**
 * @file
 *
 * Lowering of logical send-like opcodes (URB reads/writes, framebuffer
 * writes/reads, sampler messages, surface messages) into hardware SEND
 * instructions with explicit descriptors and payloads.
 */

#include "brw_eu.h"
#include "brw_fs.h"
#include "brw_fs_builder.h"

using namespace brw;

static void
lower_urb_read_logical_send(const fs_builder &bld, fs_inst *inst)
{
   const intel_device_info *devinfo = bld.shader->devinfo;
   const bool per_slot_present =
      inst->src[URB_LOGICAL_SRC_PER_SLOT_OFFSETS].file != BAD_FILE;

   assert(inst->size_written % REG_SIZE == 0);
   assert(inst->header_size == 0);

   brw_reg payload_sources[2];
   unsigned header_size = 0;
   payload_sources[header_size++] = inst->src[URB_LOGICAL_SRC_HANDLE];
   if (per_slot_present)
      payload_sources[header_size++] = inst->src[URB_LOGICAL_SRC_PER_SLOT_OFFSETS];

   brw_reg payload = brw_vgrf(bld.shader->alloc.allocate(header_size),
                              BRW_TYPE_F);
   bld.LOAD_PAYLOAD(payload, payload_sources, header_size, header_size);

   inst->opcode = SHADER_OPCODE_SEND;
   inst->header_size = header_size;

   inst->sfid = BRW_SFID_URB;
   inst->desc = brw_urb_desc(devinfo,
                             GFX8_URB_OPCODE_SIMD8_READ,
                             per_slot_present,
                             false,
                             inst->offset);

   inst->mlen = header_size;
   inst->ex_desc = 0;
   inst->ex_mlen = 0;
   inst->send_is_volatile = true;

   inst->resize_sources(4);

   inst->src[0] = brw_imm_ud(0); /* desc */
   inst->src[1] = brw_imm_ud(0); /* ex_desc */
   inst->src[2] = payload;
   inst->src[3] = brw_null_reg();
}
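
/* The lowerings in this file generally turn a logical opcode into a
 * SHADER_OPCODE_SEND with the source layout visible above:
 *
 *    src[0] = descriptor (immediate zero, or an address sub-register when
 *             part of the descriptor is computed at run time)
 *    src[1] = extended descriptor
 *    src[2] = payload
 *    src[3] = second payload, or the null register when unused
 *
 * Some messages drop src[3] entirely via resize_sources(3); the constant
 * parts of the descriptors ride along in inst->desc / inst->ex_desc.
 */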

static void
lower_urb_read_logical_send_xe2(const fs_builder &bld, fs_inst *inst)
{
   const intel_device_info *devinfo = bld.shader->devinfo;
   assert(devinfo->has_lsc);

   assert(inst->size_written % (REG_SIZE * reg_unit(devinfo)) == 0);
   assert(inst->header_size == 0);

   /* Get the logical send arguments. */
   const brw_reg handle = inst->src[URB_LOGICAL_SRC_HANDLE];

   /* Calculate the total number of components of the payload. */
   const unsigned dst_comps = inst->size_written / (REG_SIZE * reg_unit(devinfo));

   brw_reg payload = bld.vgrf(BRW_TYPE_UD);

   bld.MOV(payload, handle);

   /* The low 24 bits of the URB handle are a byte offset into the URB area.
    * Add the (OWord) offset of the read to this value.
    */
   if (inst->offset) {
      bld.ADD(payload, payload, brw_imm_ud(inst->offset * 16));
      inst->offset = 0;
   }
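
   /* inst->offset counts OWords (16 bytes), so e.g. an offset of 2 becomes
    * a 32-byte immediate added onto the per-lane URB byte address held in
    * the payload.
    */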

   brw_reg offsets = inst->src[URB_LOGICAL_SRC_PER_SLOT_OFFSETS];
   if (offsets.file != BAD_FILE) {
      bld.ADD(payload, payload, offsets);
   }

   inst->sfid = BRW_SFID_URB;

   assert((dst_comps >= 1 && dst_comps <= 4) || dst_comps == 8);

   inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD,
                             LSC_ADDR_SURFTYPE_FLAT, LSC_ADDR_SIZE_A32,
                             LSC_DATA_SIZE_D32, dst_comps /* num_channels */,
                             false /* transpose */,
                             LSC_CACHE(devinfo, LOAD, L1UC_L3UC));

   /* Update the original instruction. */
   inst->opcode = SHADER_OPCODE_SEND;
   inst->mlen = lsc_msg_addr_len(devinfo, LSC_ADDR_SIZE_A32, inst->exec_size);
   inst->ex_mlen = 0;
   inst->header_size = 0;
   inst->send_has_side_effects = true;
   inst->send_is_volatile = false;

   inst->resize_sources(4);

   inst->src[0] = brw_imm_ud(0);
   inst->src[1] = brw_imm_ud(0);

   inst->src[2] = payload;
   inst->src[3] = brw_null_reg();
}

static void
lower_urb_write_logical_send(const fs_builder &bld, fs_inst *inst)
{
   const intel_device_info *devinfo = bld.shader->devinfo;
   const bool per_slot_present =
      inst->src[URB_LOGICAL_SRC_PER_SLOT_OFFSETS].file != BAD_FILE;
   const bool channel_mask_present =
      inst->src[URB_LOGICAL_SRC_CHANNEL_MASK].file != BAD_FILE;

   assert(inst->header_size == 0);

   const unsigned length = 1 + per_slot_present + channel_mask_present +
                           inst->components_read(URB_LOGICAL_SRC_DATA);

   brw_reg *payload_sources = new brw_reg[length];
   brw_reg payload = brw_vgrf(bld.shader->alloc.allocate(length),
                              BRW_TYPE_F);

   unsigned header_size = 0;
   payload_sources[header_size++] = inst->src[URB_LOGICAL_SRC_HANDLE];
   if (per_slot_present)
      payload_sources[header_size++] = inst->src[URB_LOGICAL_SRC_PER_SLOT_OFFSETS];

   if (channel_mask_present)
      payload_sources[header_size++] = inst->src[URB_LOGICAL_SRC_CHANNEL_MASK];

   for (unsigned i = header_size, j = 0; i < length; i++, j++)
      payload_sources[i] = offset(inst->src[URB_LOGICAL_SRC_DATA], bld, j);

   bld.LOAD_PAYLOAD(payload, payload_sources, length, header_size);

   delete [] payload_sources;

   inst->opcode = SHADER_OPCODE_SEND;
   inst->header_size = header_size;
   inst->dst = brw_null_reg();

   inst->sfid = BRW_SFID_URB;
   inst->desc = brw_urb_desc(devinfo,
                             GFX8_URB_OPCODE_SIMD8_WRITE,
                             per_slot_present,
                             channel_mask_present,
                             inst->offset);

   inst->mlen = length;
   inst->ex_desc = 0;
   inst->ex_mlen = 0;
   inst->send_has_side_effects = true;

   inst->resize_sources(4);

   inst->src[0] = brw_imm_ud(0); /* desc */
   inst->src[1] = brw_imm_ud(0); /* ex_desc */
   inst->src[2] = payload;
   inst->src[3] = brw_null_reg();
}

static void
lower_urb_write_logical_send_xe2(const fs_builder &bld, fs_inst *inst)
{
   const intel_device_info *devinfo = bld.shader->devinfo;
   assert(devinfo->has_lsc);

   /* Get the logical send arguments. */
   const brw_reg handle = inst->src[URB_LOGICAL_SRC_HANDLE];
   const brw_reg src = inst->components_read(URB_LOGICAL_SRC_DATA) ?
      inst->src[URB_LOGICAL_SRC_DATA] : brw_reg(brw_imm_ud(0));
   assert(brw_type_size_bytes(src.type) == 4);

   /* Calculate the total number of components of the payload. */
   const unsigned src_comps = MAX2(1, inst->components_read(URB_LOGICAL_SRC_DATA));
   const unsigned src_sz = brw_type_size_bytes(src.type);

   brw_reg payload = bld.vgrf(BRW_TYPE_UD);

   bld.MOV(payload, handle);

   /* The low 24 bits of the URB handle are a byte offset into the URB area.
    * Add the (OWord) offset of the write to this value.
    */
   if (inst->offset) {
      bld.ADD(payload, payload, brw_imm_ud(inst->offset * 16));
      inst->offset = 0;
   }

   brw_reg offsets = inst->src[URB_LOGICAL_SRC_PER_SLOT_OFFSETS];
   if (offsets.file != BAD_FILE) {
      bld.ADD(payload, payload, offsets);
   }

   const brw_reg cmask = inst->src[URB_LOGICAL_SRC_CHANNEL_MASK];
   unsigned mask = 0;

   if (cmask.file != BAD_FILE) {
      assert(cmask.file == IMM);
      assert(cmask.type == BRW_TYPE_UD);
      mask = cmask.ud >> 16;
   }
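
   /* The channel-mask immediate carries the mask in its high 16 bits (the
    * DWord layout the legacy payload path passes through unmodified), hence
    * the shift above; the low bits then serve as the per-component enables
    * handed to lsc_msg_desc() via its num_channels slot for the store_cmask
    * message below.
    */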

   brw_reg payload2 = bld.move_to_vgrf(src, src_comps);
   const unsigned ex_mlen = (src_comps * src_sz * inst->exec_size) / REG_SIZE;

   inst->sfid = BRW_SFID_URB;

   enum lsc_opcode op = mask ? LSC_OP_STORE_CMASK : LSC_OP_STORE;
   inst->desc = lsc_msg_desc(devinfo, op,
                             LSC_ADDR_SURFTYPE_FLAT, LSC_ADDR_SIZE_A32,
                             LSC_DATA_SIZE_D32,
                             mask ? mask : src_comps /* num_channels */,
                             false /* transpose */,
                             LSC_CACHE(devinfo, STORE, L1UC_L3UC));

   /* Update the original instruction. */
   inst->opcode = SHADER_OPCODE_SEND;
   inst->mlen = lsc_msg_addr_len(devinfo, LSC_ADDR_SIZE_A32, inst->exec_size);
   inst->ex_mlen = ex_mlen;
   inst->header_size = 0;
   inst->send_has_side_effects = true;
   inst->send_is_volatile = false;

   inst->resize_sources(4);

   inst->src[0] = brw_imm_ud(0);
   inst->src[1] = brw_imm_ud(0);

   inst->src[2] = payload;
   inst->src[3] = payload2;
}

static void
setup_color_payload(const fs_builder &bld, const brw_wm_prog_key *key,
                    brw_reg *dst, brw_reg color, unsigned components)
{
   if (key->clamp_fragment_color) {
      brw_reg tmp = bld.vgrf(BRW_TYPE_F, 4);
      assert(color.type == BRW_TYPE_F);

      for (unsigned i = 0; i < components; i++)
         set_saturate(true,
                      bld.MOV(offset(tmp, bld, i), offset(color, bld, i)));

      color = tmp;
   }

   for (unsigned i = 0; i < components; i++)
      dst[i] = offset(color, bld, i);
}
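
/* Fragment color clamping above relies on the EU saturate modifier: a
 * saturating float MOV clamps each channel to [0.0, 1.0], so e.g. 1.5
 * becomes 1.0 and -0.25 becomes 0.0 before the color lands in the payload.
 */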

static void
lower_fb_write_logical_send(const fs_builder &bld, fs_inst *inst,
                            const struct brw_wm_prog_data *prog_data,
                            const brw_wm_prog_key *key,
                            const fs_thread_payload &fs_payload)
{
   assert(inst->src[FB_WRITE_LOGICAL_SRC_COMPONENTS].file == IMM);
   const intel_device_info *devinfo = bld.shader->devinfo;
   const brw_reg color0 = inst->src[FB_WRITE_LOGICAL_SRC_COLOR0];
   const brw_reg color1 = inst->src[FB_WRITE_LOGICAL_SRC_COLOR1];
   const brw_reg src0_alpha = inst->src[FB_WRITE_LOGICAL_SRC_SRC0_ALPHA];
   const brw_reg src_depth = inst->src[FB_WRITE_LOGICAL_SRC_SRC_DEPTH];
   const brw_reg dst_depth = inst->src[FB_WRITE_LOGICAL_SRC_DST_DEPTH];
   const brw_reg src_stencil = inst->src[FB_WRITE_LOGICAL_SRC_SRC_STENCIL];
   brw_reg sample_mask = inst->src[FB_WRITE_LOGICAL_SRC_OMASK];
   const unsigned components =
      inst->src[FB_WRITE_LOGICAL_SRC_COMPONENTS].ud;

   assert(inst->target != 0 || src0_alpha.file == BAD_FILE);

   brw_reg sources[15];
   int header_size = 2, payload_header_size;
   unsigned length = 0;

   if (devinfo->ver < 11 &&
      (color1.file != BAD_FILE || key->nr_color_regions > 1)) {

      /* From the Sandy Bridge PRM, volume 4, page 198:
       *
       *     "Dispatched Pixel Enables. One bit per pixel indicating
       *      which pixels were originally enabled when the thread was
       *      dispatched. This field is only required for the end-of-
       *      thread message and on all dual-source messages."
       */
      const fs_builder ubld = bld.exec_all().group(8, 0);

      brw_reg header = ubld.vgrf(BRW_TYPE_UD, 2);
      if (bld.group() < 16) {
         /* The header starts off as g0 and g1 for the first half */
         ubld.group(16, 0).MOV(header, retype(brw_vec8_grf(0, 0),
                                              BRW_TYPE_UD));
      } else {
         /* The header starts off as g0 and g2 for the second half */
         assert(bld.group() < 32);
         const brw_reg header_sources[2] = {
            retype(brw_vec8_grf(0, 0), BRW_TYPE_UD),
            retype(brw_vec8_grf(2, 0), BRW_TYPE_UD),
         };
         ubld.LOAD_PAYLOAD(header, header_sources, 2, 0);

         /* Gfx12 will require additional fix-ups if we ever hit this path. */
         assert(devinfo->ver < 12);
      }

      uint32_t g00_bits = 0;

      /* Set "Source0 Alpha Present to RenderTarget" bit in message
       * header.
       */
      if (src0_alpha.file != BAD_FILE)
         g00_bits |= 1 << 11;

      /* Set computes stencil to render target */
      if (prog_data->computed_stencil)
         g00_bits |= 1 << 14;

      if (g00_bits) {
         /* OR extra bits into g0.0 */
         ubld.group(1, 0).OR(component(header, 0),
                             retype(brw_vec1_grf(0, 0), BRW_TYPE_UD),
                             brw_imm_ud(g00_bits));
      }

      /* Set the render target index for choosing BLEND_STATE. */
      if (inst->target > 0) {
         ubld.group(1, 0).MOV(component(header, 2), brw_imm_ud(inst->target));
      }

      if (prog_data->uses_kill) {
         ubld.group(1, 0).MOV(retype(component(header, 15), BRW_TYPE_UW),
                              brw_sample_mask_reg(bld));
      }

      assert(length == 0);
      sources[0] = header;
      sources[1] = horiz_offset(header, 8);
      length = 2;
   }
   assert(length == 0 || length == 2);
   header_size = length;

   if (fs_payload.aa_dest_stencil_reg[0]) {
      assert(inst->group < 16);
      sources[length] = brw_vgrf(bld.shader->alloc.allocate(1), BRW_TYPE_F);
      bld.group(8, 0).exec_all().annotate("FB write stencil/AA alpha")
         .MOV(sources[length],
              brw_reg(brw_vec8_grf(fs_payload.aa_dest_stencil_reg[0], 0)));
      length++;
   }

   if (src0_alpha.file != BAD_FILE) {
      for (unsigned i = 0; i < bld.dispatch_width() / 8; i++) {
         const fs_builder &ubld = bld.exec_all().group(8, i)
                                    .annotate("FB write src0 alpha");
         const brw_reg tmp = ubld.vgrf(BRW_TYPE_F);
         ubld.MOV(tmp, horiz_offset(src0_alpha, i * 8));
         setup_color_payload(ubld, key, &sources[length], tmp, 1);
         length++;
      }
   }

   if (sample_mask.file != BAD_FILE) {
      const brw_reg tmp = brw_vgrf(bld.shader->alloc.allocate(reg_unit(devinfo)),
                                   BRW_TYPE_UD);

      /* Hand over gl_SampleMask.  Only the lower 16 bits of each channel are
       * relevant.  Since it's unsigned single words, one vgrf is always
       * 16-wide, but only the lower or higher 8 channels will be used by the
       * hardware when doing a SIMD8 write depending on whether we have
       * selected the subspans for the first or second half respectively.
       */
      assert(sample_mask.file != BAD_FILE &&
             brw_type_size_bytes(sample_mask.type) == 4);
      sample_mask.type = BRW_TYPE_UW;
      sample_mask.stride *= 2;

      bld.exec_all().annotate("FB write oMask")
         .MOV(horiz_offset(retype(tmp, BRW_TYPE_UW),
                           inst->group % (16 * reg_unit(devinfo))),
              sample_mask);

      for (unsigned i = 0; i < reg_unit(devinfo); i++)
         sources[length++] = byte_offset(tmp, REG_SIZE * i);
   }
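
   /* Retyping the UD mask to UW and doubling the stride reads only the low
    * 16 bits of each 32-bit channel, packing the per-channel masks into
    * word-sized payload slots.
    */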

   payload_header_size = length;

   setup_color_payload(bld, key, &sources[length], color0, components);
   length += 4;

   if (color1.file != BAD_FILE) {
      setup_color_payload(bld, key, &sources[length], color1, components);
      length += 4;
   }

   if (src_depth.file != BAD_FILE) {
      sources[length] = src_depth;
      length++;
   }

   if (dst_depth.file != BAD_FILE) {
      sources[length] = dst_depth;
      length++;
   }

   if (src_stencil.file != BAD_FILE) {
      assert(bld.dispatch_width() == 8 * reg_unit(devinfo));

      /* XXX: src_stencil is only available on gfx9+. dst_depth is never
       * available on gfx9+. As such it's impossible to have both enabled at the
       * same time and therefore length cannot overrun the array.
       */
      assert(length < 15 * reg_unit(devinfo));

      sources[length] = bld.vgrf(BRW_TYPE_UD);
      bld.exec_all().annotate("FB write OS")
         .MOV(retype(sources[length], BRW_TYPE_UB),
              subscript(src_stencil, BRW_TYPE_UB, 0));
      length++;
   }

   /* Send from the GRF */
   brw_reg payload = brw_vgrf(-1, BRW_TYPE_F);
   fs_inst *load = bld.LOAD_PAYLOAD(payload, sources, length, payload_header_size);
   payload.nr = bld.shader->alloc.allocate(regs_written(load));
   load->dst = payload;

   uint32_t msg_ctl = brw_fb_write_msg_control(inst, prog_data);

   /* XXX - Bit 13 Per-sample PS enable */
   inst->desc =
      (inst->group / 16) << 11 | /* rt slot group */
      brw_fb_write_desc(devinfo, inst->target, msg_ctl, inst->last_rt,
                        0 /* coarse_rt_write */);

   brw_reg desc = brw_imm_ud(0);
   if (prog_data->coarse_pixel_dispatch == BRW_ALWAYS) {
      inst->desc |= (1 << 18);
   } else if (prog_data->coarse_pixel_dispatch == BRW_SOMETIMES) {
      STATIC_ASSERT(INTEL_MSAA_FLAG_COARSE_RT_WRITES == (1 << 18));
      const fs_builder &ubld = bld.exec_all().group(8, 0);
      desc = ubld.vgrf(BRW_TYPE_UD);
      ubld.AND(desc, dynamic_msaa_flags(prog_data),
               brw_imm_ud(INTEL_MSAA_FLAG_COARSE_RT_WRITES));
      desc = component(desc, 0);
   }

   uint32_t ex_desc = 0;
   if (devinfo->ver >= 20) {
      ex_desc = inst->target << 21 |
                (key->nr_color_regions == 0) << 20 |
                (src0_alpha.file != BAD_FILE) << 15 |
                (src_stencil.file != BAD_FILE) << 14 |
                (src_depth.file != BAD_FILE) << 13 |
                (sample_mask.file != BAD_FILE) << 12;
   } else if (devinfo->ver >= 11) {
      /* Set the "Render Target Index" and "Src0 Alpha Present" fields
       * in the extended message descriptor, in lieu of using a header.
       */
      ex_desc = inst->target << 12 | (src0_alpha.file != BAD_FILE) << 15;

      if (key->nr_color_regions == 0)
         ex_desc |= 1 << 20; /* Null Render Target */
   }
   inst->ex_desc = ex_desc;

   inst->opcode = SHADER_OPCODE_SEND;
   inst->resize_sources(3);
   inst->sfid = GFX6_SFID_DATAPORT_RENDER_CACHE;
   inst->src[0] = desc;
   inst->src[1] = brw_imm_ud(0);
   inst->src[2] = payload;
   inst->mlen = regs_written(load);
   inst->ex_mlen = 0;
   inst->header_size = header_size;
   inst->check_tdr = true;
   inst->send_has_side_effects = true;
}
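
/* Part of this descriptor can be dynamic: when coarse pixel dispatch is only
 * sometimes enabled, bit 18 is computed at run time (the AND against
 * INTEL_MSAA_FLAG_COARSE_RT_WRITES above) and carried in src[0], while the
 * compile-time constant bits stay in inst->desc.
 */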

static void
lower_fb_read_logical_send(const fs_builder &bld, fs_inst *inst,
                           const struct brw_wm_prog_data *wm_prog_data)
{
   const intel_device_info *devinfo = bld.shader->devinfo;
   const fs_builder &ubld = bld.exec_all().group(8, 0);
   const unsigned length = 2;
   const brw_reg header = ubld.vgrf(BRW_TYPE_UD, length);

   assert(devinfo->ver >= 9 && devinfo->ver < 20);

   if (bld.group() < 16) {
      ubld.group(16, 0).MOV(header, retype(brw_vec8_grf(0, 0),
                                           BRW_TYPE_UD));
   } else {
      assert(bld.group() < 32);
      const brw_reg header_sources[] = {
         retype(brw_vec8_grf(0, 0), BRW_TYPE_UD),
         retype(brw_vec8_grf(2, 0), BRW_TYPE_UD)
      };
      ubld.LOAD_PAYLOAD(header, header_sources, ARRAY_SIZE(header_sources), 0);

      if (devinfo->ver >= 12) {
         /* On Gfx12 the Viewport and Render Target Array Index fields (AKA
          * Poly 0 Info) are provided in r1.1 instead of r0.0, and the render
          * target message header format was updated accordingly -- However
          * the updated format only works for the lower 16 channels in a
          * SIMD32 thread, since the higher 16 channels want the subspan data
          * from r2 instead of r1, so we need to copy over the contents of
          * r1.1 in order to fix things up.
          */
         ubld.group(1, 0).MOV(component(header, 9),
                              retype(brw_vec1_grf(1, 1), BRW_TYPE_UD));
      }
   }

   /* BSpec 12470 (Gfx8-11), BSpec 47842 (Gfx12+) :
    *
    *   "Must be zero for Render Target Read message."
    *
    * For bits :
    *   - 14 : Stencil Present to Render Target
    *   - 13 : Source Depth Present to Render Target
    *   - 12 : oMask to Render Target
    *   - 11 : Source0 Alpha Present to Render Target
    */
   ubld.group(1, 0).AND(component(header, 0),
                        component(header, 0),
                        brw_imm_ud(~INTEL_MASK(14, 11)));

   inst->resize_sources(4);
   inst->opcode = SHADER_OPCODE_SEND;
   inst->src[0] = brw_imm_ud(0);
   inst->src[1] = brw_imm_ud(0);
   inst->src[2] = header;
   inst->src[3] = brw_reg();
   inst->mlen = length;
   inst->header_size = length;
   inst->sfid = GFX6_SFID_DATAPORT_RENDER_CACHE;
   inst->check_tdr = true;
   inst->desc =
      (inst->group / 16) << 11 | /* rt slot group */
      brw_fb_read_desc(devinfo, inst->target,
                       0 /* msg_control */, inst->exec_size,
                       wm_prog_data->persample_dispatch);
}

static bool
is_high_sampler(const struct intel_device_info *devinfo, const brw_reg &sampler)
{
   return sampler.file != IMM || sampler.ud >= 16;
}

static unsigned
sampler_msg_type(const intel_device_info *devinfo,
                 opcode opcode, bool shadow_compare,
                 bool lod_is_zero, bool has_min_lod)
{
   switch (opcode) {
   case SHADER_OPCODE_TEX_LOGICAL:
      if (devinfo->ver >= 20 && has_min_lod) {
         return shadow_compare ? XE2_SAMPLER_MESSAGE_SAMPLE_COMPARE_MLOD :
                                 XE2_SAMPLER_MESSAGE_SAMPLE_MLOD;
      } else {
         return shadow_compare ? GFX5_SAMPLER_MESSAGE_SAMPLE_COMPARE :
                                 GFX5_SAMPLER_MESSAGE_SAMPLE;
      }
   case FS_OPCODE_TXB_LOGICAL:
      return shadow_compare ? GFX5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE :
                              GFX5_SAMPLER_MESSAGE_SAMPLE_BIAS;
   case SHADER_OPCODE_TXL_LOGICAL:
      assert(!has_min_lod);
      if (lod_is_zero) {
         return shadow_compare ? GFX9_SAMPLER_MESSAGE_SAMPLE_C_LZ :
                                 GFX9_SAMPLER_MESSAGE_SAMPLE_LZ;
      }
      return shadow_compare ? GFX5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE :
                              GFX5_SAMPLER_MESSAGE_SAMPLE_LOD;
   case SHADER_OPCODE_TXS_LOGICAL:
   case SHADER_OPCODE_IMAGE_SIZE_LOGICAL:
      assert(!has_min_lod);
      return GFX5_SAMPLER_MESSAGE_SAMPLE_RESINFO;
   case SHADER_OPCODE_TXD_LOGICAL:
      return shadow_compare ? HSW_SAMPLER_MESSAGE_SAMPLE_DERIV_COMPARE :
                              GFX5_SAMPLER_MESSAGE_SAMPLE_DERIVS;
   case SHADER_OPCODE_TXF_LOGICAL:
      assert(!has_min_lod);
      return lod_is_zero ? GFX9_SAMPLER_MESSAGE_SAMPLE_LD_LZ :
                           GFX5_SAMPLER_MESSAGE_SAMPLE_LD;
   case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
   case SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL:
      assert(!has_min_lod);
      return GFX9_SAMPLER_MESSAGE_SAMPLE_LD2DMS_W;
   case SHADER_OPCODE_TXF_MCS_LOGICAL:
      assert(!has_min_lod);
      return GFX7_SAMPLER_MESSAGE_SAMPLE_LD_MCS;
   case SHADER_OPCODE_LOD_LOGICAL:
      assert(!has_min_lod);
      return GFX5_SAMPLER_MESSAGE_LOD;
   case SHADER_OPCODE_TG4_LOGICAL:
      assert(!has_min_lod);
      return shadow_compare ? GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4_C :
                              GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4;
   case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
      assert(!has_min_lod);
      return shadow_compare ? GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_C :
                              GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO;
   case SHADER_OPCODE_TG4_OFFSET_LOD_LOGICAL:
      assert(!has_min_lod);
      assert(devinfo->ver >= 20);
      return shadow_compare ? XE2_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_L_C :
                              XE2_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_L;
   case SHADER_OPCODE_TG4_OFFSET_BIAS_LOGICAL:
      assert(!has_min_lod);
      assert(devinfo->ver >= 20);
      return XE2_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_B;
   case SHADER_OPCODE_TG4_BIAS_LOGICAL:
      assert(!has_min_lod);
      assert(devinfo->ver >= 20);
      return XE2_SAMPLER_MESSAGE_SAMPLE_GATHER4_B;
   case SHADER_OPCODE_TG4_EXPLICIT_LOD_LOGICAL:
      assert(!has_min_lod);
      assert(devinfo->ver >= 20);
      return shadow_compare ? XE2_SAMPLER_MESSAGE_SAMPLE_GATHER4_L_C :
                              XE2_SAMPLER_MESSAGE_SAMPLE_GATHER4_L;
   case SHADER_OPCODE_TG4_IMPLICIT_LOD_LOGICAL:
      assert(!has_min_lod);
      assert(devinfo->ver >= 20);
      return shadow_compare ? XE2_SAMPLER_MESSAGE_SAMPLE_GATHER4_I_C :
                              XE2_SAMPLER_MESSAGE_SAMPLE_GATHER4_I;
   case SHADER_OPCODE_SAMPLEINFO_LOGICAL:
      assert(!has_min_lod);
      return GFX6_SAMPLER_MESSAGE_SAMPLE_SAMPLEINFO;
   default:
      unreachable("not reached");
   }
}

/**
 * Emit a LOAD_PAYLOAD instruction while ensuring the sources are aligned to
 * the given requested_alignment_sz.
 */
static fs_inst *
emit_load_payload_with_padding(const fs_builder &bld, const brw_reg &dst,
                               const brw_reg *src, unsigned sources,
                               unsigned header_size,
                               unsigned requested_alignment_sz)
{
   unsigned length = 0;
   unsigned num_srcs =
      sources * DIV_ROUND_UP(requested_alignment_sz, bld.dispatch_width());
   brw_reg *src_comps = new brw_reg[num_srcs];

   for (unsigned i = 0; i < header_size; i++)
      src_comps[length++] = src[i];

   for (unsigned i = header_size; i < sources; i++) {
      unsigned src_sz =
         retype(dst, src[i].type).component_size(bld.dispatch_width());
      const enum brw_reg_type padding_payload_type =
         brw_type_with_size(BRW_TYPE_UD, brw_type_size_bits(src[i].type));

      src_comps[length++] = src[i];

      /* Expand the real sources if a component of the requested payload type
       * is larger than the real source component.
       */
      if (src_sz < requested_alignment_sz) {
         for (unsigned j = 0; j < (requested_alignment_sz / src_sz) - 1; j++) {
            src_comps[length++] = retype(brw_reg(), padding_payload_type);
         }
      }
   }

   fs_inst *inst = bld.LOAD_PAYLOAD(dst, src_comps, length, header_size);
   delete[] src_comps;

   return inst;
}
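
/* Worked example: with a SIMD8 16-bit sampler payload on a 32-byte GRF,
 * each component occupies 8 * 2 = 16 bytes, so requested_alignment_sz / 16
 * - 1 = 1 null source of matching bit-size is inserted after every real
 * source, aligning each component to a full register as SIMD8H expects.
 */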

static bool
shader_opcode_needs_header(opcode op)
{
   switch (op) {
   case SHADER_OPCODE_TG4_LOGICAL:
   case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
   case SHADER_OPCODE_TG4_OFFSET_BIAS_LOGICAL:
   case SHADER_OPCODE_TG4_OFFSET_LOD_LOGICAL:
   case SHADER_OPCODE_TG4_BIAS_LOGICAL:
   case SHADER_OPCODE_TG4_EXPLICIT_LOD_LOGICAL:
   case SHADER_OPCODE_TG4_IMPLICIT_LOD_LOGICAL:
   case SHADER_OPCODE_SAMPLEINFO_LOGICAL:
      return true;
   default:
      break;
   }

   return false;
}

static void
lower_sampler_logical_send(const fs_builder &bld, fs_inst *inst,
                           const brw_reg &coordinate,
                           const brw_reg &shadow_c,
                           brw_reg lod, const brw_reg &lod2,
                           const brw_reg &min_lod,
                           const brw_reg &sample_index,
                           const brw_reg &mcs,
                           const brw_reg &surface,
                           const brw_reg &sampler,
                           const brw_reg &surface_handle,
                           const brw_reg &sampler_handle,
                           const brw_reg &tg4_offset,
                           unsigned payload_type_bit_size,
                           unsigned coord_components,
                           unsigned grad_components,
                           bool residency)
{
   const brw_compiler *compiler = bld.shader->compiler;
   const intel_device_info *devinfo = bld.shader->devinfo;
   const enum brw_reg_type payload_type =
      brw_type_with_size(BRW_TYPE_F, payload_type_bit_size);
   const enum brw_reg_type payload_unsigned_type =
      brw_type_with_size(BRW_TYPE_UD, payload_type_bit_size);
   const enum brw_reg_type payload_signed_type =
      brw_type_with_size(BRW_TYPE_D, payload_type_bit_size);
   unsigned reg_width = bld.dispatch_width() / 8;
   unsigned header_size = 0, length = 0;
   opcode op = inst->opcode;
   brw_reg sources[1 + MAX_SAMPLER_MESSAGE_SIZE];
   for (unsigned i = 0; i < ARRAY_SIZE(sources); i++)
      sources[i] = bld.vgrf(payload_type);

   /* We must have exactly one of surface/sampler and surface/sampler_handle */
   assert((surface.file == BAD_FILE) != (surface_handle.file == BAD_FILE));
   assert((sampler.file == BAD_FILE) != (sampler_handle.file == BAD_FILE));

   if (shader_opcode_needs_header(op) || inst->offset != 0 || inst->eot ||
       sampler_handle.file != BAD_FILE ||
       is_high_sampler(devinfo, sampler) ||
       residency) {
      /* For general texture offsets (no txf workaround), we need a header to
       * put them in.
       *
       * TG4 needs to place its channel select in the header, for interaction
       * with ARB_texture_swizzle.  The sampler index is only 4-bits, so for
       * larger sampler numbers we need to offset the Sampler State Pointer in
       * the header.
       */
      brw_reg header = retype(sources[0], BRW_TYPE_UD);
      for (header_size = 0; header_size < reg_unit(devinfo); header_size++)
         sources[length++] = byte_offset(header, REG_SIZE * header_size);

      /* If we're requesting fewer than four channels worth of response,
       * and we have an explicit header, we need to set up the sampler
       * writemask.  It's reversed from normal: 1 means "don't write".
       */
      unsigned reg_count = regs_written(inst) - reg_unit(devinfo) * residency;
      if (!inst->eot && reg_count < 4 * reg_width) {
         assert(reg_count % reg_width == 0);
         unsigned mask = ~((1 << (reg_count / reg_width)) - 1) & 0xf;
         inst->offset |= mask << 12;
      }
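
      /* For example, a fetch that only needs .xy writes two registers per
       * SIMD8 slot, so reg_count / reg_width == 2 and the computed mask is
       * 0b1100: channels z and w are marked "don't write".
       */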

      if (residency)
         inst->offset |= 1 << 23; /* g0.2 bit23 : Pixel Null Mask Enable */

      /* Build the actual header */
      const fs_builder ubld = bld.exec_all().group(8 * reg_unit(devinfo), 0);
      const fs_builder ubld1 = ubld.group(1, 0);
      if (devinfo->ver >= 11)
         ubld.MOV(header, brw_imm_ud(0));
      else
         ubld.MOV(header, retype(brw_vec8_grf(0, 0), BRW_TYPE_UD));
      if (inst->offset) {
         ubld1.MOV(component(header, 2), brw_imm_ud(inst->offset));
      } else if (devinfo->ver < 11 &&
                 bld.shader->stage != MESA_SHADER_VERTEX &&
                 bld.shader->stage != MESA_SHADER_FRAGMENT) {
         /* The vertex and fragment stages have g0.2 set to 0, so
          * header0.2 is 0 when g0 is copied. Other stages may not, so we
          * must set it to 0 to avoid setting undesirable bits in the
          * message.
          */
         ubld1.MOV(component(header, 2), brw_imm_ud(0));
      }

      if (sampler_handle.file != BAD_FILE) {
         /* Bindless sampler handles aren't relative to the sampler state
          * pointer passed into the shader through SAMPLER_STATE_POINTERS_*.
          * Instead, it's an absolute pointer relative to dynamic state base
          * address.
          *
          * Sampler states are 16 bytes each and the pointer we give here has
          * to be 32-byte aligned.  In order to avoid more indirect messages
          * than required, we assume that all bindless sampler states are
          * 32-byte aligned.  This sacrifices a bit of general state base
          * address space but means we can do something more efficient in the
          * shader.
          */
         if (compiler->use_bindless_sampler_offset) {
            assert(devinfo->ver >= 11);
            ubld1.OR(component(header, 3), sampler_handle, brw_imm_ud(1));
         } else {
            ubld1.MOV(component(header, 3), sampler_handle);
         }
      } else if (is_high_sampler(devinfo, sampler)) {
         brw_reg sampler_state_ptr =
            retype(brw_vec1_grf(0, 3), BRW_TYPE_UD);

         /* Gfx11+ sampler message headers include bits in 4:0 which conflict
          * with the ones included in g0.3 bits 4:0.  Mask them out.
          */
         if (devinfo->ver >= 11) {
            sampler_state_ptr = ubld1.vgrf(BRW_TYPE_UD);
            ubld1.AND(sampler_state_ptr,
                      retype(brw_vec1_grf(0, 3), BRW_TYPE_UD),
                      brw_imm_ud(INTEL_MASK(31, 5)));
         }

         if (sampler.file == IMM) {
            assert(sampler.ud >= 16);
            const int sampler_state_size = 16; /* 16 bytes */

            ubld1.ADD(component(header, 3), sampler_state_ptr,
                      brw_imm_ud(16 * (sampler.ud / 16) * sampler_state_size));
         } else {
            brw_reg tmp = ubld1.vgrf(BRW_TYPE_UD);
            ubld1.AND(tmp, sampler, brw_imm_ud(0x0f0));
            ubld1.SHL(tmp, tmp, brw_imm_ud(4));
            ubld1.ADD(component(header, 3), sampler_state_ptr, tmp);
         }
      } else if (devinfo->ver >= 11) {
         /* Gfx11+ sampler message headers include bits in 4:0 which conflict
          * with the ones included in g0.3 bits 4:0.  Mask them out.
          */
         ubld1.AND(component(header, 3),
                   retype(brw_vec1_grf(0, 3), BRW_TYPE_UD),
                   brw_imm_ud(INTEL_MASK(31, 5)));
      }
   }

   const bool lod_is_zero = lod.is_zero();

   /* On Xe2 and newer platforms, min_lod is the first parameter specifically
    * so that a bunch of other, possibly unused, parameters don't need to also
    * be included.
    */
   const unsigned msg_type =
      sampler_msg_type(devinfo, op, inst->shadow_compare, lod_is_zero,
                       min_lod.file != BAD_FILE);

   const bool min_lod_is_first = devinfo->ver >= 20 &&
      (msg_type == XE2_SAMPLER_MESSAGE_SAMPLE_MLOD ||
       msg_type == XE2_SAMPLER_MESSAGE_SAMPLE_COMPARE_MLOD);

   if (min_lod_is_first) {
      assert(min_lod.file != BAD_FILE);
      bld.MOV(sources[length++], min_lod);
   }

   if (shadow_c.file != BAD_FILE) {
      bld.MOV(sources[length], shadow_c);
      length++;
   }

   bool coordinate_done = false;

   /* Set up the LOD info */
   switch (op) {
   case SHADER_OPCODE_TXL_LOGICAL:
      if (lod_is_zero)
         break;
      FALLTHROUGH;
   case FS_OPCODE_TXB_LOGICAL:
   case SHADER_OPCODE_TG4_BIAS_LOGICAL:
   case SHADER_OPCODE_TG4_EXPLICIT_LOD_LOGICAL:
   case SHADER_OPCODE_TG4_OFFSET_LOD_LOGICAL:
   case SHADER_OPCODE_TG4_OFFSET_BIAS_LOGICAL:
      bld.MOV(sources[length], lod);
      length++;
      break;
   case SHADER_OPCODE_TXD_LOGICAL:
      /* TXD should have been lowered in SIMD16 mode (in SIMD32 mode in
       * Xe2+).
       */
      assert(bld.dispatch_width() == (8 * reg_unit(devinfo)));

      /* Load dPdx and the coordinate together:
       * [hdr], [ref], x, dPdx.x, dPdy.x, y, dPdx.y, dPdy.y, z, dPdx.z, dPdy.z
       */
      for (unsigned i = 0; i < coord_components; i++) {
         bld.MOV(sources[length++], offset(coordinate, bld, i));

         /* For cube map array, the coordinate is (u,v,r,ai) but there are
          * only derivatives for (u, v, r).
          */
         if (i < grad_components) {
            bld.MOV(sources[length++], offset(lod, bld, i));
            bld.MOV(sources[length++], offset(lod2, bld, i));
         }
      }

      coordinate_done = true;
      break;
   case SHADER_OPCODE_TXS_LOGICAL:
      sources[length] = retype(sources[length], payload_unsigned_type);
      bld.MOV(sources[length++], lod);
      break;
   case SHADER_OPCODE_IMAGE_SIZE_LOGICAL:
      /* We need an LOD; just use 0 */
      sources[length] = retype(sources[length], payload_unsigned_type);
      bld.MOV(sources[length++], brw_imm_ud(0));
      break;
   case SHADER_OPCODE_TXF_LOGICAL:
      /* On Gfx9 the parameters are intermixed: they are u, v, lod, r. */
      sources[length] = retype(sources[length], payload_signed_type);
      bld.MOV(sources[length++], coordinate);

      if (coord_components >= 2) {
         sources[length] = retype(sources[length], payload_signed_type);
         bld.MOV(sources[length], offset(coordinate, bld, 1));
      } else {
         sources[length] = brw_imm_d(0);
      }
      length++;

      if (!lod_is_zero) {
         sources[length] = retype(sources[length], payload_signed_type);
         bld.MOV(sources[length++], lod);
      }

      for (unsigned i = 2; i < coord_components; i++) {
         sources[length] = retype(sources[length], payload_signed_type);
         bld.MOV(sources[length++], offset(coordinate, bld, i));
      }

      coordinate_done = true;
      break;

   case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
   case SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL:
      sources[length] = retype(sources[length], payload_unsigned_type);
      bld.MOV(sources[length++], sample_index);

      /* Data from the multisample control surface. */
      for (unsigned i = 0; i < 2; ++i) {
         /* The sampler always writes 4/8 registers worth of data but for
          * ld_mcs only the first two registers contain valid data.  So with
          * a 16-bit payload we need to split each 32-bit register into two
          * 16-bit payload components.
          *
          * From the Gfx12HP BSpec: Render Engine - 3D and GPGPU Programs -
          * Shared Functions - 3D Sampler - Messages - Message Format:
          *
          *    ld2dms_w   si  mcs0 mcs1 mcs2  mcs3  u  v  r
          */
         if (op == SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL) {
            brw_reg tmp = offset(mcs, bld, i);
            sources[length] = retype(sources[length], payload_unsigned_type);
            bld.MOV(sources[length++],
                    mcs.file == IMM ? mcs :
                    brw_reg(subscript(tmp, payload_unsigned_type, 0)));

            sources[length] = retype(sources[length], payload_unsigned_type);
            bld.MOV(sources[length++],
                    mcs.file == IMM ? mcs :
                    brw_reg(subscript(tmp, payload_unsigned_type, 1)));
         } else {
            sources[length] = retype(sources[length], payload_unsigned_type);
            bld.MOV(sources[length++],
                    mcs.file == IMM ? mcs : offset(mcs, bld, i));
         }
      }
      FALLTHROUGH;

   case SHADER_OPCODE_TXF_MCS_LOGICAL:
      /* There is no offsetting for this message; just copy in the integer
       * texture coordinates.
       */
      for (unsigned i = 0; i < coord_components; i++) {
         sources[length] = retype(sources[length], payload_signed_type);
         bld.MOV(sources[length++], offset(coordinate, bld, i));
      }

      coordinate_done = true;
      break;
   case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
      /* More crazy intermixing */
      for (unsigned i = 0; i < 2; i++) /* u, v */
         bld.MOV(sources[length++], offset(coordinate, bld, i));

      for (unsigned i = 0; i < 2; i++) { /* offu, offv */
         sources[length] = retype(sources[length], payload_signed_type);
         bld.MOV(sources[length++], offset(tg4_offset, bld, i));
      }

      if (coord_components == 3) /* r if present */
         bld.MOV(sources[length++], offset(coordinate, bld, 2));

      coordinate_done = true;
      break;
   default:
      break;
   }

   /* Set up the coordinate (except for cases where it was done above) */
   if (!coordinate_done) {
      for (unsigned i = 0; i < coord_components; i++)
         bld.MOV(retype(sources[length++], payload_type),
                 offset(coordinate, bld, i));
   }

   if (min_lod.file != BAD_FILE && !min_lod_is_first) {
      /* Account for all of the missing coordinate sources */
      if (op == FS_OPCODE_TXB_LOGICAL && devinfo->ver >= 20) {
         /* Bspec 64985:
          *
          * For sample_b sampler message format:
          *
          * SIMD16H/SIMD32H
          * Param Number   0     1  2  3  4  5
          * Param          BIAS  U  V  R  Ai MLOD
          *
          * SIMD16/SIMD32
          * Param Number   0        1  2  3  4
          * Param          BIAS_AI  U  V  R  MLOD
          */
         length += 3 - coord_components;
      } else if (op == SHADER_OPCODE_TXD_LOGICAL && devinfo->verx10 >= 125) {
         /* On DG2 and newer platforms, sample_d can only be used with 1D and
          * 2D surfaces, so the maximum number of gradient components is 2.
          * In spite of this limitation, the Bspec lists a mysterious R
          * component before the min_lod, so the maximum coordinate components
          * is 3.
          *
          * See bspec 45942, "Enable new message layout for cube array"
          */
         length += 3 - coord_components;
         length += (2 - grad_components) * 2;
      } else {
         length += 4 - coord_components;
         if (op == SHADER_OPCODE_TXD_LOGICAL)
            length += (3 - grad_components) * 2;
      }

      bld.MOV(sources[length++], min_lod);

      /* Wa_14014595444: Populate MLOD as parameter 5 (twice). */
      if (devinfo->verx10 == 125 && op == FS_OPCODE_TXB_LOGICAL &&
          !inst->shadow_compare)
         bld.MOV(sources[length++], min_lod);
   }

   const brw_reg src_payload =
      brw_vgrf(bld.shader->alloc.allocate(length * reg_width),
               BRW_TYPE_F);
   /* In case of 16-bit payload each component takes one full register in
    * both SIMD8H and SIMD16H modes. In both cases one reg can hold 16
    * elements. In SIMD8H case hardware simply expects the components to be
    * padded (i.e., aligned on reg boundary).
    */
   fs_inst *load_payload_inst =
      emit_load_payload_with_padding(bld, src_payload, sources, length,
                                     header_size, REG_SIZE * reg_unit(devinfo));
   unsigned mlen = load_payload_inst->size_written / REG_SIZE;
   unsigned simd_mode = 0;
   if (devinfo->ver < 20) {
      if (payload_type_bit_size == 16) {
         assert(devinfo->ver >= 11);
         simd_mode = inst->exec_size <= 8 ? GFX10_SAMPLER_SIMD_MODE_SIMD8H :
            GFX10_SAMPLER_SIMD_MODE_SIMD16H;
      } else {
         simd_mode = inst->exec_size <= 8 ? BRW_SAMPLER_SIMD_MODE_SIMD8 :
            BRW_SAMPLER_SIMD_MODE_SIMD16;
      }
   } else {
      if (payload_type_bit_size == 16) {
         simd_mode = inst->exec_size <= 16 ? XE2_SAMPLER_SIMD_MODE_SIMD16H :
            XE2_SAMPLER_SIMD_MODE_SIMD32H;
      } else {
         simd_mode = inst->exec_size <= 16 ? XE2_SAMPLER_SIMD_MODE_SIMD16 :
            XE2_SAMPLER_SIMD_MODE_SIMD32;
      }
   }

   /* Generate the SEND. */
   inst->opcode = SHADER_OPCODE_SEND;
   inst->mlen = mlen;
   inst->header_size = header_size;
   inst->sfid = BRW_SFID_SAMPLER;
   unsigned sampler_ret_type = brw_type_size_bits(inst->dst.type) == 16
      ? GFX8_SAMPLER_RETURN_FORMAT_16BITS
      : GFX8_SAMPLER_RETURN_FORMAT_32BITS;
   if (surface.file == IMM &&
       (sampler.file == IMM || sampler_handle.file != BAD_FILE)) {
      inst->desc = brw_sampler_desc(devinfo, surface.ud,
                                    sampler.file == IMM ? sampler.ud % 16 : 0,
                                    msg_type,
                                    simd_mode,
                                    sampler_ret_type);
      inst->src[0] = brw_imm_ud(0);
      inst->src[1] = brw_imm_ud(0);
   } else if (surface_handle.file != BAD_FILE) {
      /* Bindless surface */
      inst->desc = brw_sampler_desc(devinfo,
                                    GFX9_BTI_BINDLESS,
                                    sampler.file == IMM ? sampler.ud % 16 : 0,
                                    msg_type,
                                    simd_mode,
                                    sampler_ret_type);

      /* For bindless samplers, the entire address is included in the message
       * header so we can leave the portion in the message descriptor 0.
       */
      if (sampler_handle.file != BAD_FILE || sampler.file == IMM) {
         inst->src[0] = brw_imm_ud(0);
      } else {
         const fs_builder ubld = bld.group(1, 0).exec_all();
         brw_reg desc = ubld.vgrf(BRW_TYPE_UD);
         ubld.SHL(desc, sampler, brw_imm_ud(8));
         inst->src[0] = component(desc, 0);
      }

      /* We assume that the driver provided the handle in the top 20 bits so
       * we can use the surface handle directly as the extended descriptor.
       */
      inst->src[1] = retype(surface_handle, BRW_TYPE_UD);
      inst->send_ex_bso = compiler->extended_bindless_surface_offset;
   } else {
      /* Immediate portion of the descriptor */
      inst->desc = brw_sampler_desc(devinfo,
                                    0, /* surface */
                                    0, /* sampler */
                                    msg_type,
                                    simd_mode,
                                    sampler_ret_type);
      const fs_builder ubld = bld.group(1, 0).exec_all();
      brw_reg desc = ubld.vgrf(BRW_TYPE_UD);
      if (surface.equals(sampler)) {
         /* This case is common in GL */
         ubld.MUL(desc, surface, brw_imm_ud(0x101));
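         /* Multiplying the shared index by 0x101 replicates it into both the
          * binding-table (bits 7:0) and sampler-index (bits 11:8) fields of
          * the descriptor, since n * 0x101 == n | (n << 8) for n < 256.
          */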
      } else {
         if (sampler_handle.file != BAD_FILE) {
            ubld.MOV(desc, surface);
         } else if (sampler.file == IMM) {
            ubld.OR(desc, surface, brw_imm_ud(sampler.ud << 8));
         } else {
            ubld.SHL(desc, sampler, brw_imm_ud(8));
            ubld.OR(desc, desc, surface);
         }
      }
      ubld.AND(desc, desc, brw_imm_ud(0xfff));

      inst->src[0] = component(desc, 0);
      inst->src[1] = brw_imm_ud(0); /* ex_desc */
   }

   inst->ex_desc = 0;

   inst->src[2] = src_payload;
   inst->resize_sources(3);

   if (inst->eot) {
      /* EOT sampler messages don't make sense to split because it would
       * involve ending half of the thread early.
       */
      assert(inst->group == 0);
      /* We need to use SENDC for EOT sampler messages */
      inst->check_tdr = true;
      inst->send_has_side_effects = true;
   }

   /* Message length > MAX_SAMPLER_MESSAGE_SIZE disallowed by hardware. */
   assert(inst->mlen <= MAX_SAMPLER_MESSAGE_SIZE * reg_unit(devinfo));
}

static unsigned
get_sampler_msg_payload_type_bit_size(const intel_device_info *devinfo,
                                      const fs_inst *inst)
{
   assert(inst);
   const brw_reg *src = inst->src;
   unsigned src_type_size = 0;

   /* All sources need to have the same size, therefore seek the first valid
    * one and take the size from there.
    */
   for (unsigned i = 0; i < TEX_LOGICAL_NUM_SRCS; i++) {
      if (src[i].file != BAD_FILE) {
         src_type_size = brw_type_size_bytes(src[i].type);
         break;
      }
   }

   assert(src_type_size == 2 || src_type_size == 4);

#ifndef NDEBUG
   /* Make sure all sources agree. On gfx12 this doesn't hold when sampling
    * compressed multisampled surfaces. There the payload contains MCS data
    * which is already in 16-bits unlike the other parameters that need forced
    * conversion.
    */
   if (inst->opcode != SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL) {
      for (unsigned i = 0; i < TEX_LOGICAL_NUM_SRCS; i++) {
         assert(src[i].file == BAD_FILE ||
                brw_type_size_bytes(src[i].type) == src_type_size);
      }
   }
#endif

   if (devinfo->verx10 < 125)
      return src_type_size * 8;

   /* Force conversion from 32-bit sources to 16-bit payload. From the XeHP
    * Bspec: 3D and GPGPU Programs - Shared Functions - 3D Sampler - Messages
    * - Message Format [GFX12:HAS:1209977870]:
    *
    *  ld2dms_w       SIMD8H and SIMD16H Only
    *  ld_mcs         SIMD8H and SIMD16H Only
    *  ld2dms         REMOVEDBY(GEN:HAS:1406788836)
    */
   if (inst->opcode == SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL ||
       inst->opcode == SHADER_OPCODE_TXF_MCS_LOGICAL)
      src_type_size = 2;

   return src_type_size * 8;
}

static void
lower_sampler_logical_send(const fs_builder &bld, fs_inst *inst)
{
   const intel_device_info *devinfo = bld.shader->devinfo;
   const brw_reg coordinate = inst->src[TEX_LOGICAL_SRC_COORDINATE];
   const brw_reg shadow_c = inst->src[TEX_LOGICAL_SRC_SHADOW_C];
   const brw_reg lod = inst->src[TEX_LOGICAL_SRC_LOD];
   const brw_reg lod2 = inst->src[TEX_LOGICAL_SRC_LOD2];
   const brw_reg min_lod = inst->src[TEX_LOGICAL_SRC_MIN_LOD];
   const brw_reg sample_index = inst->src[TEX_LOGICAL_SRC_SAMPLE_INDEX];
   const brw_reg mcs = inst->src[TEX_LOGICAL_SRC_MCS];
   const brw_reg surface = inst->src[TEX_LOGICAL_SRC_SURFACE];
   const brw_reg sampler = inst->src[TEX_LOGICAL_SRC_SAMPLER];
   const brw_reg surface_handle = inst->src[TEX_LOGICAL_SRC_SURFACE_HANDLE];
   const brw_reg sampler_handle = inst->src[TEX_LOGICAL_SRC_SAMPLER_HANDLE];
   const brw_reg tg4_offset = inst->src[TEX_LOGICAL_SRC_TG4_OFFSET];
   assert(inst->src[TEX_LOGICAL_SRC_COORD_COMPONENTS].file == IMM);
   const unsigned coord_components = inst->src[TEX_LOGICAL_SRC_COORD_COMPONENTS].ud;
   assert(inst->src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].file == IMM);
   const unsigned grad_components = inst->src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].ud;
   assert(inst->src[TEX_LOGICAL_SRC_RESIDENCY].file == IMM);
   const bool residency = inst->src[TEX_LOGICAL_SRC_RESIDENCY].ud != 0;

   const unsigned msg_payload_type_bit_size =
      get_sampler_msg_payload_type_bit_size(devinfo, inst);

   /* 16-bit payloads are available only on gfx11+ */
   assert(msg_payload_type_bit_size != 16 || devinfo->ver >= 11);

   lower_sampler_logical_send(bld, inst, coordinate,
                              shadow_c, lod, lod2, min_lod,
                              sample_index,
                              mcs, surface, sampler,
                              surface_handle, sampler_handle,
                              tg4_offset,
                              msg_payload_type_bit_size,
                              coord_components, grad_components,
                              residency);
}

/**
 * Predicate the specified instruction on the vector mask.
 */
static void
emit_predicate_on_vector_mask(const fs_builder &bld, fs_inst *inst)
{
   assert(bld.shader->stage == MESA_SHADER_FRAGMENT &&
          bld.group() == inst->group &&
          bld.dispatch_width() == inst->exec_size);

   const fs_builder ubld = bld.exec_all().group(1, 0);

   const fs_visitor &s = *bld.shader;
   const brw_reg vector_mask = ubld.vgrf(BRW_TYPE_UW);
   ubld.UNDEF(vector_mask);
   ubld.emit(SHADER_OPCODE_READ_ARCH_REG, vector_mask, retype(brw_sr0_reg(3),
                                                              BRW_TYPE_UD));
   const unsigned subreg = sample_mask_flag_subreg(s);

   ubld.MOV(brw_flag_subreg(subreg + inst->group / 16), vector_mask);

   if (inst->predicate) {
      assert(inst->predicate == BRW_PREDICATE_NORMAL);
      assert(!inst->predicate_inverse);
      assert(inst->flag_subreg == 0);
      assert(s.devinfo->ver < 20);
      /* Combine the vector mask with the existing predicate by using a
       * vertical predication mode.
       */
      inst->predicate = BRW_PREDICATE_ALIGN1_ALLV;
   } else {
      inst->flag_subreg = subreg;
      inst->predicate = BRW_PREDICATE_NORMAL;
      inst->predicate_inverse = false;
   }
}
1340 
1341 static void
1342 setup_surface_descriptors(const fs_builder &bld, fs_inst *inst, uint32_t desc,
1343                           const brw_reg &surface, const brw_reg &surface_handle)
1344 {
1345    const brw_compiler *compiler = bld.shader->compiler;
1346 
1347    /* We must have exactly one of surface and surface_handle */
1348    assert((surface.file == BAD_FILE) != (surface_handle.file == BAD_FILE));
1349 
1350    if (surface.file == IMM) {
1351       inst->desc = desc | (surface.ud & 0xff);
1352       inst->src[0] = brw_imm_ud(0);
1353       inst->src[1] = brw_imm_ud(0); /* ex_desc */
1354    } else if (surface_handle.file != BAD_FILE) {
1355       /* Bindless surface */
1356       inst->desc = desc | GFX9_BTI_BINDLESS;
1357       inst->src[0] = brw_imm_ud(0);
1358 
1359       /* We assume that the driver provided the handle in the top 20 bits so
1360        * we can use the surface handle directly as the extended descriptor.
1361        */
1362       inst->src[1] = retype(surface_handle, BRW_TYPE_UD);
1363       inst->send_ex_bso = compiler->extended_bindless_surface_offset;
1364    } else {
1365       inst->desc = desc;
1366       const fs_builder ubld = bld.exec_all().group(1, 0);
1367       brw_reg tmp = ubld.vgrf(BRW_TYPE_UD);
1368       ubld.AND(tmp, surface, brw_imm_ud(0xff));
1369       inst->src[0] = component(tmp, 0);
1370       inst->src[1] = brw_imm_ud(0); /* ex_desc */
1371    }
1372 }
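/* A worked example of the immediate-BTI path above; the desc value is
 * assumed purely for illustration. With desc == 0x02106000 and
 * surface == brw_imm_ud(3), the lowering produces:
 *
 *    inst->desc   == 0x02106003    (binding table index in bits 7:0)
 *    inst->src[0] == brw_imm_ud(0) (no dynamic descriptor part)
 *    inst->src[1] == brw_imm_ud(0) (no extended descriptor)
 */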
1373 
1374 static void
1375 setup_lsc_surface_descriptors(const fs_builder &bld, fs_inst *inst,
1376                               uint32_t desc, const brw_reg &surface)
1377 {
1378    const ASSERTED intel_device_info *devinfo = bld.shader->devinfo;
1379    const brw_compiler *compiler = bld.shader->compiler;
1380 
1381    inst->src[0] = brw_imm_ud(0); /* desc */
1382 
1383    enum lsc_addr_surface_type surf_type = lsc_msg_desc_addr_type(devinfo, desc);
1384    switch (surf_type) {
1385    case LSC_ADDR_SURFTYPE_BSS:
1386       inst->send_ex_bso = compiler->extended_bindless_surface_offset;
1387       /* fall-through */
1388    case LSC_ADDR_SURFTYPE_SS:
1389       assert(surface.file != BAD_FILE);
1390       /* We assume that the driver provided the handle in the top 20 bits so
1391        * we can use the surface handle directly as the extended descriptor.
1392        */
1393       inst->src[1] = retype(surface, BRW_TYPE_UD);
1394       break;
1395 
1396    case LSC_ADDR_SURFTYPE_BTI:
1397       assert(surface.file != BAD_FILE);
1398       if (surface.file == IMM) {
1399          inst->src[1] = brw_imm_ud(lsc_bti_ex_desc(devinfo, surface.ud));
1400       } else {
1401          const fs_builder ubld = bld.exec_all().group(1, 0);
1402          brw_reg tmp = ubld.vgrf(BRW_TYPE_UD);
1403          ubld.SHL(tmp, surface, brw_imm_ud(24));
1404          inst->src[1] = component(tmp, 0);
1405       }
1406       break;
1407 
1408    case LSC_ADDR_SURFTYPE_FLAT:
1409       inst->src[1] = brw_imm_ud(0);
1410       break;
1411 
1412    default:
1413       unreachable("Invalid LSC surface address type");
1414    }
1415 }
1416 
1417 static enum lsc_addr_size
1418 lsc_addr_size_for_type(enum brw_reg_type type)
1419 {
1420    switch (brw_type_size_bytes(type)) {
1421    case 2: return LSC_ADDR_SIZE_A16;
1422    case 4: return LSC_ADDR_SIZE_A32;
1423    case 8: return LSC_ADDR_SIZE_A64;
1424    default: unreachable("invalid type size");
1425    }
1426 }
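/* The mapping depends only on the width of the address type, e.g.:
 *
 *    lsc_addr_size_for_type(BRW_TYPE_UW) == LSC_ADDR_SIZE_A16
 *    lsc_addr_size_for_type(BRW_TYPE_UD) == LSC_ADDR_SIZE_A32
 *    lsc_addr_size_for_type(BRW_TYPE_UQ) == LSC_ADDR_SIZE_A64
 */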
1427 
1428 static void
1429 lower_lsc_memory_logical_send(const fs_builder &bld, fs_inst *inst)
1430 {
1431    const intel_device_info *devinfo = bld.shader->devinfo;
1432    assert(devinfo->has_lsc);
1433 
1434    assert(inst->src[MEMORY_LOGICAL_OPCODE].file == IMM);
1435    assert(inst->src[MEMORY_LOGICAL_MODE].file == IMM);
1436    assert(inst->src[MEMORY_LOGICAL_BINDING_TYPE].file == IMM);
1437    assert(inst->src[MEMORY_LOGICAL_COORD_COMPONENTS].file == IMM);
1438    assert(inst->src[MEMORY_LOGICAL_DATA_SIZE].file == IMM);
1439    assert(inst->src[MEMORY_LOGICAL_FLAGS].file == IMM);
1440 
1441    /* Get the logical send arguments. */
1442    const enum lsc_opcode op = (lsc_opcode) inst->src[MEMORY_LOGICAL_OPCODE].ud;
1443    const enum memory_logical_mode mode =
1444       (enum memory_logical_mode) inst->src[MEMORY_LOGICAL_MODE].ud;
1445    const enum lsc_addr_surface_type binding_type =
1446       (enum lsc_addr_surface_type) inst->src[MEMORY_LOGICAL_BINDING_TYPE].ud;
1447    const brw_reg binding = inst->src[MEMORY_LOGICAL_BINDING];
1448    const brw_reg addr = inst->src[MEMORY_LOGICAL_ADDRESS];
1449    const unsigned coord_components =
1450       inst->src[MEMORY_LOGICAL_COORD_COMPONENTS].ud;
1451    enum lsc_data_size data_size =
1452       (enum lsc_data_size) inst->src[MEMORY_LOGICAL_DATA_SIZE].ud;
1453    const unsigned components = inst->src[MEMORY_LOGICAL_COMPONENTS].ud;
1454    const enum memory_flags flags =
1455       (enum memory_flags) inst->src[MEMORY_LOGICAL_FLAGS].ud;
1456    const bool transpose = flags & MEMORY_FLAG_TRANSPOSE;
1457    const bool include_helpers = flags & MEMORY_FLAG_INCLUDE_HELPERS;
1458    const brw_reg data0 = inst->src[MEMORY_LOGICAL_DATA0];
1459    const brw_reg data1 = inst->src[MEMORY_LOGICAL_DATA1];
1460    const bool has_side_effects = inst->has_side_effects();
1461 
1462    const uint32_t data_size_B = lsc_data_size_bytes(data_size);
1463    const enum brw_reg_type data_type =
1464       brw_type_with_size(data0.type, data_size_B * 8);
1465 
1466    const enum lsc_addr_size addr_size = lsc_addr_size_for_type(addr.type);
1467 
1468    brw_reg payload = addr;
1469 
1470    if (addr.file != VGRF || !addr.is_contiguous()) {
1471       if (inst->force_writemask_all) {
1472          const fs_builder dbld = bld.group(bld.shader->dispatch_width, 0);
1473          payload = dbld.move_to_vgrf(addr, coord_components);
1474       } else {
1475          payload = bld.move_to_vgrf(addr, coord_components);
1476       }
1477    }
1478 
1479    unsigned ex_mlen = 0;
1480    brw_reg payload2;
1481    if (data0.file != BAD_FILE) {
1482       if (transpose) {
1483          assert(data1.file == BAD_FILE);
1484 
1485          payload2 = data0;
1486          ex_mlen = DIV_ROUND_UP(components, 8);
1487       } else {
1488          brw_reg data[8];
1489          unsigned size = 0;
1490 
1491          assert(components < 8);
1492 
1493          for (unsigned i = 0; i < components; i++)
1494             data[size++] = offset(data0, inst->exec_size, i);
1495 
1496          if (data1.file != BAD_FILE) {
1497             for (unsigned i = 0; i < components; i++)
1498                data[size++] = offset(data1, inst->exec_size, i);
1499          }
1500 
1501          payload2 = bld.vgrf(data0.type, size);
1502          bld.LOAD_PAYLOAD(payload2, data, size, 0);
1503          ex_mlen = (size * brw_type_size_bytes(data_type) * inst->exec_size) / REG_SIZE;
1504       }
1505    }
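   /* Worked ex_mlen example for the non-transposed path, with values assumed
    * purely for illustration: a SIMD16 store of 2 components of 32-bit data
    * on a platform with 32-byte GRFs gives
    *
    *    ex_mlen = (2 components * 4 bytes * 16 lanes) / 32 = 4 registers
    */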
1506 
1507    /* Bspec: Atomic instruction -> Cache section:
1508     *
1509     *    Atomic messages are always forced to "un-cacheable" in the L1
1510     *    cache.
1511     */
1512    unsigned cache_mode =
1513       lsc_opcode_is_atomic(op) ? (unsigned) LSC_CACHE(devinfo, STORE, L1UC_L3WB) :
1514       lsc_opcode_is_store(op)  ? (unsigned) LSC_CACHE(devinfo, STORE, L1STATE_L3MOCS) :
1515       (unsigned) LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS);
1516 
1517    /* If we're a fragment shader, we have to predicate with the sample mask to
1518     * avoid helper invocations in instructions with side effects, unless they
1519     * are explicitly required.  One exception is for scratch writes - even
1520     * though those have side effects, they represent operations that didn't
1521     * originally have any.  We want to avoid accessing undefined values from
1522     * scratch, so we disable helper invocations entirely there.
1523     *
1524     * There are also special cases when we actually want to run on helpers
1525     * (ray queries).
1526     */
1527    if (bld.shader->stage == MESA_SHADER_FRAGMENT && !transpose) {
1528       if (include_helpers)
1529          emit_predicate_on_vector_mask(bld, inst);
1530       else if (has_side_effects && mode != MEMORY_MODE_SCRATCH)
1531          brw_emit_predicate_on_sample_mask(bld, inst);
1532    }
1533 
1534    switch (mode) {
1535    case MEMORY_MODE_UNTYPED:
1536    case MEMORY_MODE_SCRATCH:
1537       inst->sfid = GFX12_SFID_UGM;
1538       break;
1539    case MEMORY_MODE_TYPED:
1540       inst->sfid = GFX12_SFID_TGM;
1541       break;
1542    case MEMORY_MODE_SHARED_LOCAL:
1543       inst->sfid = GFX12_SFID_SLM;
1544       break;
1545    }
1546    assert(inst->sfid);
1547 
1548    inst->desc = lsc_msg_desc(devinfo, op, binding_type, addr_size, data_size,
1549                              lsc_opcode_has_cmask(op) ?
1550                              (1 << components) - 1 : components,
1551                              transpose, cache_mode);
1552 
1553    /* Set up extended descriptors, fills src[0] and src[1]. */
1554    setup_lsc_surface_descriptors(bld, inst, inst->desc, binding);
1555 
1556    inst->opcode = SHADER_OPCODE_SEND;
1557    inst->mlen = lsc_msg_addr_len(devinfo, addr_size,
1558                                  inst->exec_size * coord_components);
1559    inst->ex_mlen = ex_mlen;
1560    inst->header_size = 0;
1561    inst->send_has_side_effects = has_side_effects;
1562    inst->send_is_volatile = !has_side_effects;
1563 
1564    inst->resize_sources(4);
1565 
1566    /* Finally, the payload */
1567    inst->src[2] = payload;
1568    inst->src[3] = payload2;
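   /* To illustrate the result (case assumed: a SIMD16 A32 untyped load with
    * coord_components == 1): we end up with a SEND on the UGM SFID whose
    * src[0]/src[1] hold the descriptor parts, src[2] the per-lane address
    * payload, and src[3] BAD_FILE, since loads carry no data payload.
    */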
1569 }
1570 
1571 static brw_reg
1572 emit_a64_oword_block_header(const fs_builder &bld, const brw_reg &addr)
1573 {
1574    const fs_builder ubld = bld.exec_all().group(8, 0);
1575 
1576    assert(brw_type_size_bytes(addr.type) == 8 && addr.stride == 0);
1577 
1578    brw_reg expanded_addr = addr;
1579    if (addr.file == UNIFORM) {
1580       /* We can't do stride 1 with the UNIFORM file; it requires stride 0 */
1581       expanded_addr = ubld.vgrf(BRW_TYPE_UQ);
1582       expanded_addr.stride = 0;
1583       ubld.MOV(expanded_addr, retype(addr, BRW_TYPE_UQ));
1584    }
1585 
1586    brw_reg header = ubld.vgrf(BRW_TYPE_UD);
1587    ubld.MOV(header, brw_imm_ud(0));
1588 
1589    /* Use a 2-wide MOV to fill out the address */
1590    brw_reg addr_vec2 = expanded_addr;
1591    addr_vec2.type = BRW_TYPE_UD;
1592    addr_vec2.stride = 1;
1593    ubld.group(2, 0).MOV(header, addr_vec2);
1594 
1595    return header;
1596 }
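/* The returned header GRF ends up laid out as:
 *
 *    DW0: low 32 bits of the A64 address
 *    DW1: high 32 bits of the A64 address
 *    DW2-DW7: zero
 */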
1597 
1598 static void
1599 lower_hdc_memory_logical_send(const fs_builder &bld, fs_inst *inst)
1600 {
1601    const intel_device_info *devinfo = bld.shader->devinfo;
1602    const brw_compiler *compiler = bld.shader->compiler;
1603 
1604    assert(inst->src[MEMORY_LOGICAL_OPCODE].file == IMM);
1605    assert(inst->src[MEMORY_LOGICAL_MODE].file == IMM);
1606    assert(inst->src[MEMORY_LOGICAL_BINDING_TYPE].file == IMM);
1607    assert(inst->src[MEMORY_LOGICAL_COORD_COMPONENTS].file == IMM);
1608    assert(inst->src[MEMORY_LOGICAL_DATA_SIZE].file == IMM);
1609    assert(inst->src[MEMORY_LOGICAL_FLAGS].file == IMM);
1610 
1611    /* Get the logical send arguments. */
1612    const enum lsc_opcode op = (lsc_opcode)inst->src[MEMORY_LOGICAL_OPCODE].ud;
1613    const enum memory_logical_mode mode =
1614       (enum memory_logical_mode) inst->src[MEMORY_LOGICAL_MODE].ud;
1615    enum lsc_addr_surface_type binding_type =
1616       (enum lsc_addr_surface_type) inst->src[MEMORY_LOGICAL_BINDING_TYPE].ud;
1617    brw_reg binding = inst->src[MEMORY_LOGICAL_BINDING];
1618    const brw_reg addr = inst->src[MEMORY_LOGICAL_ADDRESS];
1619    const unsigned coord_components =
1620       inst->src[MEMORY_LOGICAL_COORD_COMPONENTS].ud;
1621    const unsigned alignment = inst->src[MEMORY_LOGICAL_ALIGNMENT].ud;
1622    const unsigned components = inst->src[MEMORY_LOGICAL_COMPONENTS].ud;
1623    const enum memory_flags flags =
1624       (enum memory_flags) inst->src[MEMORY_LOGICAL_FLAGS].ud;
1625    const bool block = flags & MEMORY_FLAG_TRANSPOSE;
1626    const bool include_helpers = flags & MEMORY_FLAG_INCLUDE_HELPERS;
1627    const brw_reg data0 = inst->src[MEMORY_LOGICAL_DATA0];
1628    const brw_reg data1 = inst->src[MEMORY_LOGICAL_DATA1];
1629    const bool has_side_effects = inst->has_side_effects();
1630    const bool has_dest = inst->dst.file != BAD_FILE && !inst->dst.is_null();
1631 
1632    /* Don't predicate scratch writes on the sample mask.  Otherwise,
1633     * FS helper invocations would load undefined values from scratch memory.
1634     * Scratch memory loads/stores are also produced from operations without
1635     * side effects, so they should not behave differently in helper
1636     * invocations.
1637     */
1638    bool allow_sample_mask = has_side_effects && mode != MEMORY_MODE_SCRATCH;
1639 
1640    const enum lsc_data_size data_size =
1641       (enum lsc_data_size) inst->src[MEMORY_LOGICAL_DATA_SIZE].ud;
1642 
1643    /* unpadded data size */
1644    const uint32_t data_bit_size =
1645       data_size == LSC_DATA_SIZE_D8U32 ? 8 :
1646       data_size == LSC_DATA_SIZE_D16U32 ? 16 :
1647       8 * lsc_data_size_bytes(data_size);
1648 
1649    const bool byte_scattered =
1650       data_bit_size < 32 || (alignment != 0 && alignment < 4);
1651    const bool dword_scattered = !byte_scattered && mode == MEMORY_MODE_SCRATCH;
1652    const bool surface_access = !byte_scattered && !dword_scattered && !block;
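   /* Examples of how the non-block message class is chosen above (cases
    * assumed purely for illustration):
    *
    *    D16U32 data (data_bit_size == 16)     -> byte scattered
    *    D32 data with alignment == 2          -> byte scattered
    *    D32 scratch access, alignment >= 4    -> dword scattered
    *    D32 untyped access, alignment >= 4    -> surface access
    */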
1653 
1654    /* SLM block reads must use the 16B-aligned OWord Block Read messages,
1655     * as the unaligned message doesn't exist for SLM.
1656     */
1657    const bool oword_aligned = block && mode == MEMORY_MODE_SHARED_LOCAL;
1658    assert(!oword_aligned || (alignment % 16) == 0);
1659 
1660    enum lsc_addr_size addr_size = lsc_addr_size_for_type(addr.type);
1661    unsigned addr_size_B = coord_components * lsc_addr_size_bytes(addr_size);
1662 
1663    brw_reg header;
1664    fs_builder ubld8 = bld.exec_all().group(8, 0);
1665    fs_builder ubld1 = ubld8.group(1, 0);
1666    if (mode == MEMORY_MODE_SCRATCH) {
1667       header = ubld8.vgrf(BRW_TYPE_UD);
1668       ubld8.emit(SHADER_OPCODE_SCRATCH_HEADER, header, brw_ud8_grf(0, 0));
1669    } else if (block) {
1670       if (addr_size == LSC_ADDR_SIZE_A64) {
1671          header = emit_a64_oword_block_header(bld, addr);
1672       } else {
1673          header = ubld8.vgrf(BRW_TYPE_UD);
1674          ubld8.MOV(header, brw_imm_ud(0));
1675          if (oword_aligned)
1676             ubld1.SHR(component(header, 2), addr, brw_imm_ud(4));
1677          else
1678             ubld1.MOV(component(header, 2), addr);
1679       }
1680    }
1681 
1682    /* If we're a fragment shader, we have to predicate with the sample mask to
1683     * avoid helper invocations in instructions with side effects, unless
1684     * they are explicitly required.
1685     *
1686     * There are also special cases when we actually want to run on helpers
1687     * (ray queries).
1688     */
1689    if (bld.shader->stage == MESA_SHADER_FRAGMENT) {
1690       if (include_helpers)
1691          emit_predicate_on_vector_mask(bld, inst);
1692       else if (allow_sample_mask &&
1693                (header.file == BAD_FILE || !surface_access))
1694          brw_emit_predicate_on_sample_mask(bld, inst);
1695    }
1696 
1697    brw_reg payload, payload2;
1698    unsigned mlen, ex_mlen = 0;
1699 
1700    if (!block) {
1701       brw_reg data[11];
1702       unsigned num_sources = 0;
1703       if (header.file != BAD_FILE)
1704          data[num_sources++] = header;
1705 
1706       for (unsigned i = 0; i < coord_components; i++)
1707          data[num_sources++] = offset(addr, inst->exec_size, i);
1708 
1709       if (data0.file != BAD_FILE) {
1710          for (unsigned i = 0; i < components; i++)
1711             data[num_sources++] = offset(data0, inst->exec_size, i);
1712          if (data1.file != BAD_FILE) {
1713             for (unsigned i = 0; i < components; i++)
1714                data[num_sources++] = offset(data1, inst->exec_size, i);
1715          }
1716       }
1717 
1718       assert(num_sources <= ARRAY_SIZE(data));
1719 
1720       unsigned payload_size_UDs = (header.file != BAD_FILE ? 1 : 0) +
1721                                   (addr_size_B / 4) +
1722                                   (lsc_op_num_data_values(op) * components *
1723                                    lsc_data_size_bytes(data_size) / 4);
1724 
1725       payload = bld.vgrf(BRW_TYPE_UD, payload_size_UDs);
1726       fs_inst *load_payload =
1727          emit_load_payload_with_padding(bld, payload, data, num_sources,
1728                                         header.file != BAD_FILE ? 1 : 0,
1729                                         REG_SIZE);
1730       mlen = load_payload->size_written / REG_SIZE;
1731    } else {
1732       assert(data1.file == BAD_FILE);
1733 
1734       payload = header;
1735       mlen = 1;
1736 
1737       if (data0.file != BAD_FILE) {
1738          payload2 = bld.move_to_vgrf(data0, components);
1739          ex_mlen = components * sizeof(uint32_t) / REG_SIZE;
1740       }
1741    }
1742 
1743 
1744    if (mode == MEMORY_MODE_SHARED_LOCAL) {
1745       binding_type = LSC_ADDR_SURFTYPE_BTI;
1746       binding = brw_imm_ud(GFX7_BTI_SLM);
1747    } else if (mode == MEMORY_MODE_SCRATCH) {
1748       binding_type = LSC_ADDR_SURFTYPE_BTI;
1749       binding = brw_imm_ud(GFX8_BTI_STATELESS_NON_COHERENT);
1750    }
1751 
1752    uint32_t sfid, desc;
1753    if (mode == MEMORY_MODE_TYPED) {
1754       assert(addr_size == LSC_ADDR_SIZE_A32);
1755       assert(!block);
1756 
1757       sfid = HSW_SFID_DATAPORT_DATA_CACHE_1;
1758 
1759       if (lsc_opcode_is_atomic(op)) {
1760          desc = brw_dp_typed_atomic_desc(devinfo, inst->exec_size, inst->group,
1761                                          lsc_op_to_legacy_atomic(op),
1762                                          has_dest);
1763       } else {
1764          desc = brw_dp_typed_surface_rw_desc(devinfo, inst->exec_size,
1765                                              inst->group, components, !has_dest);
1766       }
1767    } else if (addr_size == LSC_ADDR_SIZE_A64) {
1768       assert(binding_type == LSC_ADDR_SURFTYPE_FLAT);
1769       assert(!dword_scattered);
1770 
1771       sfid = HSW_SFID_DATAPORT_DATA_CACHE_1;
1772 
1773       if (lsc_opcode_is_atomic(op)) {
1774          unsigned aop = lsc_op_to_legacy_atomic(op);
1775          if (lsc_opcode_is_atomic_float(op)) {
1776             desc = brw_dp_a64_untyped_atomic_float_desc(devinfo, inst->exec_size,
1777                                                         data_bit_size, aop,
1778                                                         has_dest);
1779          } else {
1780             desc = brw_dp_a64_untyped_atomic_desc(devinfo, inst->exec_size,
1781                                                   data_bit_size, aop,
1782                                                   has_dest);
1783          }
1784       } else if (block) {
1785          desc = brw_dp_a64_oword_block_rw_desc(devinfo, oword_aligned,
1786                                                components, !has_dest);
1787       } else if (byte_scattered) {
1788          desc = brw_dp_a64_byte_scattered_rw_desc(devinfo, inst->exec_size,
1789                                                   data_bit_size, !has_dest);
1790       } else {
1791          desc = brw_dp_a64_untyped_surface_rw_desc(devinfo, inst->exec_size,
1792                                                    components, !has_dest);
1793       }
1794    } else {
1795       assert(binding_type != LSC_ADDR_SURFTYPE_FLAT);
1796 
1797       sfid = surface_access ? HSW_SFID_DATAPORT_DATA_CACHE_1
1798                             : GFX7_SFID_DATAPORT_DATA_CACHE;
1799 
1800       if (lsc_opcode_is_atomic(op)) {
1801          unsigned aop = lsc_op_to_legacy_atomic(op);
1802          if (lsc_opcode_is_atomic_float(op)) {
1803             desc = brw_dp_untyped_atomic_float_desc(devinfo, inst->exec_size,
1804                                                     aop, has_dest);
1805          } else {
1806             desc = brw_dp_untyped_atomic_desc(devinfo, inst->exec_size,
1807                                               aop, has_dest);
1808          }
1809       } else if (block) {
1810          desc = brw_dp_oword_block_rw_desc(devinfo, oword_aligned,
1811                                            components, !has_dest);
1812       } else if (byte_scattered) {
1813          desc = brw_dp_byte_scattered_rw_desc(devinfo, inst->exec_size,
1814                                               data_bit_size, !has_dest);
1815       } else if (dword_scattered) {
1816          desc = brw_dp_dword_scattered_rw_desc(devinfo, inst->exec_size,
1817                                                !has_dest);
1818       } else {
1819          desc = brw_dp_untyped_surface_rw_desc(devinfo, inst->exec_size,
1820                                                components, !has_dest);
1821       }
1822    }
1823 
1824    assert(sfid);
1825 
1826    /* Update the original instruction. */
1827    inst->opcode = SHADER_OPCODE_SEND;
1828    inst->sfid = sfid;
1829    inst->mlen = mlen;
1830    inst->ex_mlen = ex_mlen;
1831    inst->header_size = header.file != BAD_FILE ? 1 : 0;
1832    inst->send_has_side_effects = has_side_effects;
1833    inst->send_is_volatile = !has_side_effects;
1834 
1835    if (block) {
1836       assert(inst->force_writemask_all);
1837       inst->exec_size = components > 8 ? 16 : 8;
1838    }
1839 
1840    inst->resize_sources(4);
1841 
1842    /* Set up descriptors */
1843    switch (binding_type) {
1844    case LSC_ADDR_SURFTYPE_FLAT:
1845       inst->src[0] = brw_imm_ud(0);
1846       inst->src[1] = brw_imm_ud(0);
1847       break;
1848    case LSC_ADDR_SURFTYPE_BSS:
1849       inst->send_ex_bso = compiler->extended_bindless_surface_offset;
1850       /* fall-through */
1851    case LSC_ADDR_SURFTYPE_SS:
1852       desc |= GFX9_BTI_BINDLESS;
1853 
1854       /* We assume that the driver provided the handle in the top 20 bits so
1855        * we can use the surface handle directly as the extended descriptor.
1856        */
1857       inst->src[0] = brw_imm_ud(0);
1858       inst->src[1] = binding;
1859       break;
1860    case LSC_ADDR_SURFTYPE_BTI:
1861       if (binding.file == IMM) {
1862          desc |= binding.ud & 0xff;
1863          inst->src[0] = brw_imm_ud(0);
1864          inst->src[1] = brw_imm_ud(0);
1865       } else {
1866          brw_reg tmp = ubld1.vgrf(BRW_TYPE_UD);
1867          ubld1.AND(tmp, binding, brw_imm_ud(0xff));
1868          inst->src[0] = component(tmp, 0);
1869          inst->src[1] = brw_imm_ud(0);
1870       }
1871       break;
1872    default:
1873       unreachable("Unknown surface type");
1874    }
1875 
1876    inst->desc = desc;
1877 
1878    /* Finally, the payloads */
1879    inst->src[2] = payload;
1880    inst->src[3] = payload2;
1881 }
1882 
1883 static void
1884 lower_lsc_varying_pull_constant_logical_send(const fs_builder &bld,
1885                                              fs_inst *inst)
1886 {
1887    const intel_device_info *devinfo = bld.shader->devinfo;
1888    ASSERTED const brw_compiler *compiler = bld.shader->compiler;
1889 
1890    brw_reg surface        = inst->src[PULL_VARYING_CONSTANT_SRC_SURFACE];
1891    brw_reg surface_handle = inst->src[PULL_VARYING_CONSTANT_SRC_SURFACE_HANDLE];
1892    brw_reg offset_B       = inst->src[PULL_VARYING_CONSTANT_SRC_OFFSET];
1893    brw_reg alignment_B    = inst->src[PULL_VARYING_CONSTANT_SRC_ALIGNMENT];
1894 
1895    /* We are switching from an ALU-like instruction to a send-from-grf
1896     * instruction.  Since sends can't handle strides or
1897     * source modifiers, we have to make a copy of the offset source.
1898     */
1899    brw_reg ubo_offset = bld.move_to_vgrf(offset_B, 1);
1900 
1901    enum lsc_addr_surface_type surf_type =
1902       surface_handle.file == BAD_FILE ?
1903       LSC_ADDR_SURFTYPE_BTI : LSC_ADDR_SURFTYPE_BSS;
1904 
1905    assert(alignment_B.file == IMM);
1906    unsigned alignment = alignment_B.ud;
1907 
1908    inst->opcode = SHADER_OPCODE_SEND;
1909    inst->sfid = GFX12_SFID_UGM;
1910    inst->resize_sources(3);
1911    inst->send_ex_bso = surf_type == LSC_ADDR_SURFTYPE_BSS &&
1912                        compiler->extended_bindless_surface_offset;
1913 
1914    assert(!compiler->indirect_ubos_use_sampler);
1915 
1916    inst->src[0] = brw_imm_ud(0);
1917    inst->src[2] = ubo_offset; /* payload */
1918 
1919    if (alignment >= 4) {
1920       inst->desc =
1921          lsc_msg_desc(devinfo, LSC_OP_LOAD,
1922                       surf_type, LSC_ADDR_SIZE_A32,
1923                       LSC_DATA_SIZE_D32,
1924                       4 /* num_channels */,
1925                       false /* transpose */,
1926                       LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS));
1927       inst->mlen = lsc_msg_addr_len(devinfo, LSC_ADDR_SIZE_A32, inst->exec_size);
1928 
1929       setup_lsc_surface_descriptors(bld, inst, inst->desc,
1930                                     surface.file != BAD_FILE ?
1931                                     surface : surface_handle);
1932    } else {
1933       inst->desc =
1934          lsc_msg_desc(devinfo, LSC_OP_LOAD,
1935                       surf_type, LSC_ADDR_SIZE_A32,
1936                       LSC_DATA_SIZE_D32,
1937                       1 /* num_channels */,
1938                       false /* transpose */,
1939                       LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS));
1940       inst->mlen = lsc_msg_addr_len(devinfo, LSC_ADDR_SIZE_A32, inst->exec_size);
1941 
1942       setup_lsc_surface_descriptors(bld, inst, inst->desc,
1943                                     surface.file != BAD_FILE ?
1944                                     surface : surface_handle);
1945 
1946       /* The byte scattered messages can only read one dword at a time so
1947        * we have to duplicate the message 4 times to read the full vec4.
1948        * Hopefully, dead code elimination will clean up the mess if some of
1949        * them aren't needed.
1950        */
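      /* Illustratively, a vec4 read at ubo_offset becomes four dword reads
       * at byte offsets ubo_offset + 0, + 4, + 8 and + 12, each writing one
       * quarter of the original destination.
       */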
1951       assert(inst->size_written == 16 * inst->exec_size);
1952       inst->size_written /= 4;
1953       for (unsigned c = 1; c < 4; c++) {
1954          /* Emit a copy of the instruction because we're about to modify
1955           * it.  Because this loop starts at 1, we will emit copies for the
1956           * first 3 and the final one will be the modified instruction.
1957           */
1958          bld.emit(*inst);
1959 
1960          /* Offset the source */
1961          inst->src[2] = bld.vgrf(BRW_TYPE_UD);
1962          bld.ADD(inst->src[2], ubo_offset, brw_imm_ud(c * 4));
1963 
1964          /* Offset the destination */
1965          inst->dst = offset(inst->dst, bld, 1);
1966       }
1967    }
1968 }
1969 
1970 static void
1971 lower_varying_pull_constant_logical_send(const fs_builder &bld, fs_inst *inst)
1972 {
1973    const intel_device_info *devinfo = bld.shader->devinfo;
1974    const brw_compiler *compiler = bld.shader->compiler;
1975 
1976    brw_reg surface = inst->src[PULL_VARYING_CONSTANT_SRC_SURFACE];
1977    brw_reg surface_handle = inst->src[PULL_VARYING_CONSTANT_SRC_SURFACE_HANDLE];
1978    brw_reg offset_B = inst->src[PULL_VARYING_CONSTANT_SRC_OFFSET];
1979 
1980    /* We are switching from an ALU-like instruction to a send-from-grf
1981     * instruction.  Since sends can't handle strides or
1982     * source modifiers, we have to make a copy of the offset source.
1983     */
1984    brw_reg ubo_offset = bld.vgrf(BRW_TYPE_UD);
1985    bld.MOV(ubo_offset, offset_B);
1986 
1987    assert(inst->src[PULL_VARYING_CONSTANT_SRC_ALIGNMENT].file == IMM);
1988    unsigned alignment = inst->src[PULL_VARYING_CONSTANT_SRC_ALIGNMENT].ud;
1989 
1990    inst->opcode = SHADER_OPCODE_SEND;
1991    inst->mlen = inst->exec_size / 8;
1992    inst->resize_sources(3);
1993 
1994    /* src[0] & src[1] are filled by setup_surface_descriptors() */
1995    inst->src[2] = ubo_offset; /* payload */
1996 
1997    if (compiler->indirect_ubos_use_sampler) {
1998       const unsigned simd_mode =
1999          inst->exec_size <= 8 ? BRW_SAMPLER_SIMD_MODE_SIMD8 :
2000                                 BRW_SAMPLER_SIMD_MODE_SIMD16;
2001       const uint32_t desc = brw_sampler_desc(devinfo, 0, 0,
2002                                              GFX5_SAMPLER_MESSAGE_SAMPLE_LD,
2003                                              simd_mode, 0);
2004 
2005       inst->sfid = BRW_SFID_SAMPLER;
2006       setup_surface_descriptors(bld, inst, desc, surface, surface_handle);
2007    } else if (alignment >= 4) {
2008       const uint32_t desc =
2009          brw_dp_untyped_surface_rw_desc(devinfo, inst->exec_size,
2010                                         4, /* num_channels */
2011                                         false   /* write */);
2012 
2013       inst->sfid = HSW_SFID_DATAPORT_DATA_CACHE_1;
2014       setup_surface_descriptors(bld, inst, desc, surface, surface_handle);
2015    } else {
2016       const uint32_t desc =
2017          brw_dp_byte_scattered_rw_desc(devinfo, inst->exec_size,
2018                                        32,     /* bit_size */
2019                                        false   /* write */);
2020 
2021       inst->sfid = GFX7_SFID_DATAPORT_DATA_CACHE;
2022       setup_surface_descriptors(bld, inst, desc, surface, surface_handle);
2023 
2024       /* The byte scattered messages can only read one dword at a time so
2025        * we have to duplicate the message 4 times to read the full vec4.
2026        * Hopefully, dead code elimination will clean up the mess if some of
2027        * them aren't needed.
2028        */
2029       assert(inst->size_written == 16 * inst->exec_size);
2030       inst->size_written /= 4;
2031       for (unsigned c = 1; c < 4; c++) {
2032          /* Emit a copy of the instruction because we're about to modify
2033           * it.  Because this loop starts at 1, we will emit copies for the
2034           * first 3 and the final one will be the modified instruction.
2035           */
2036          bld.emit(*inst);
2037 
2038          /* Offset the source */
2039          inst->src[2] = bld.vgrf(BRW_TYPE_UD);
2040          bld.ADD(inst->src[2], ubo_offset, brw_imm_ud(c * 4));
2041 
2042          /* Offset the destination */
2043          inst->dst = offset(inst->dst, bld, 1);
2044       }
2045    }
2046 }
2047 
2048 static void
2049 lower_interpolator_logical_send(const fs_builder &bld, fs_inst *inst,
2050                                 const struct brw_wm_prog_key *wm_prog_key,
2051                                 const struct brw_wm_prog_data *wm_prog_data)
2052 {
2053    const intel_device_info *devinfo = bld.shader->devinfo;
2054 
2055    /* We have to send something */
2056    brw_reg payload = brw_vec8_grf(0, 0);
2057    unsigned mlen = 1;
2058 
2059    unsigned mode;
2060    switch (inst->opcode) {
2061    case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
2062       assert(inst->src[INTERP_SRC_OFFSET].file == BAD_FILE);
2063       mode = GFX7_PIXEL_INTERPOLATOR_LOC_SAMPLE;
2064       break;
2065 
2066    case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
2067       assert(inst->src[INTERP_SRC_OFFSET].file == BAD_FILE);
2068       mode = GFX7_PIXEL_INTERPOLATOR_LOC_SHARED_OFFSET;
2069       break;
2070 
2071    case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
2072       payload = inst->src[INTERP_SRC_OFFSET];
2073       mlen = 2 * inst->exec_size / 8;
2074       mode = GFX7_PIXEL_INTERPOLATOR_LOC_PER_SLOT_OFFSET;
2075       break;
2076 
2077    default:
2078       unreachable("Invalid interpolator instruction");
2079    }
2080 
2081    const bool dynamic_mode =
2082       inst->src[INTERP_SRC_DYNAMIC_MODE].file != BAD_FILE;
2083 
2084    brw_reg desc = inst->src[INTERP_SRC_MSG_DESC];
2085    uint32_t desc_imm =
2086       brw_pixel_interp_desc(devinfo,
2087                             /* Leave the mode at 0 if persample_dispatch is
2088                              * dynamic, it will be ORed in below.
2089                              */
2090                             dynamic_mode ? 0 : mode,
2091                             inst->pi_noperspective,
2092                             false /* coarse_pixel_rate */,
2093                             inst->exec_size, inst->group);
2094 
2095    if (wm_prog_data->coarse_pixel_dispatch == BRW_ALWAYS) {
2096       desc_imm |= (1 << 15);
2097    } else if (wm_prog_data->coarse_pixel_dispatch == BRW_SOMETIMES) {
2098       STATIC_ASSERT(INTEL_MSAA_FLAG_COARSE_PI_MSG == (1 << 15));
2099       brw_reg orig_desc = desc;
2100       const fs_builder &ubld = bld.exec_all().group(8, 0);
2101       desc = ubld.vgrf(BRW_TYPE_UD);
2102       ubld.AND(desc, dynamic_msaa_flags(wm_prog_data),
2103                brw_imm_ud(INTEL_MSAA_FLAG_COARSE_PI_MSG));
2104 
2105       /* And, if it's AT_OFFSET, we might have a non-trivial descriptor */
2106       if (orig_desc.file == IMM) {
2107          desc_imm |= orig_desc.ud;
2108       } else {
2109          ubld.OR(desc, desc, orig_desc);
2110       }
2111    }
2112 
2113    /* If persample_dispatch is dynamic, select the interpolation mode
2114     * dynamically and OR into the descriptor to complete the static part
2115     * generated by brw_pixel_interp_desc().
2116     *
2117     * Why does this work? If you look at the SKL PRMs, Volume 7:
2118     * 3D-Media-GPGPU, Shared Functions Pixel Interpolater, you'll see that
2119     *
2120     *   - "Per Message Offset” Message Descriptor
2121     *   - “Sample Position Offset” Message Descriptor
2122     *
2123     * have different formats. Fortunately, a fragment shader dispatched at
2124     * pixel rate, will have gl_SampleID = 0 & gl_NumSamples = 1. So the value
2125     * we pack in “Sample Position Offset” will be a 0 and will cover the X/Y
2126     * components of "Per Message Offset”, which will give us the pixel offset 0x0.
2127     */
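   /* Concretely, the predicated MOV/OR sequence below produces one of the
    * following descriptor values (a sketch of the two outcomes, not a new
    * mode):
    *
    *    desc | (GFX7_PIXEL_INTERPOLATOR_LOC_SAMPLE << 12)
    *    desc | (GFX7_PIXEL_INTERPOLATOR_LOC_SHARED_OFFSET << 12)
    *
    * selected by the per-sample predicate set up when the NIR was emitted.
    */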
2128    if (dynamic_mode) {
2129       brw_reg orig_desc = desc;
2130       const fs_builder &ubld = bld.exec_all().group(8, 0);
2131       desc = ubld.vgrf(BRW_TYPE_UD);
2132 
2133       /* The predicate should have been built in brw_fs_nir.cpp when emitting
2134        * NIR code. This guarantees that we do not have incorrect interactions
2135        * with the flag register holding the predication result.
2136        */
2137       if (orig_desc.file == IMM) {
2138          /* Not using SEL here because we would generate an instruction with 2
2139           * immediate sources, which is not supported by HW.
2140           */
2141          set_predicate_inv(BRW_PREDICATE_NORMAL, false,
2142                            ubld.MOV(desc, brw_imm_ud(orig_desc.ud |
2143                                                      GFX7_PIXEL_INTERPOLATOR_LOC_SAMPLE << 12)));
2144          set_predicate_inv(BRW_PREDICATE_NORMAL, true,
2145                            ubld.MOV(desc, brw_imm_ud(orig_desc.ud |
2146                                                      GFX7_PIXEL_INTERPOLATOR_LOC_SHARED_OFFSET << 12)));
2147       } else {
2148          set_predicate_inv(BRW_PREDICATE_NORMAL, false,
2149                            ubld.OR(desc, orig_desc,
2150                                    brw_imm_ud(GFX7_PIXEL_INTERPOLATOR_LOC_SAMPLE << 12)));
2151          set_predicate_inv(BRW_PREDICATE_NORMAL, true,
2152                            ubld.OR(desc, orig_desc,
2153                                    brw_imm_ud(GFX7_PIXEL_INTERPOLATOR_LOC_SHARED_OFFSET << 12)));
2154       }
2155    }
2156 
2157    inst->opcode = SHADER_OPCODE_SEND;
2158    inst->sfid = GFX7_SFID_PIXEL_INTERPOLATOR;
2159    inst->desc = desc_imm;
2160    inst->ex_desc = 0;
2161    inst->mlen = mlen;
2162    inst->ex_mlen = 0;
2163    inst->send_has_side_effects = false;
2164    inst->send_is_volatile = false;
2165 
2166    inst->resize_sources(3);
2167    inst->src[0] = component(desc, 0);
2168    inst->src[1] = brw_imm_ud(0); /* ex_desc */
2169    inst->src[2] = payload;
2170 }
2171 
2172 static void
2173 lower_btd_logical_send(const fs_builder &bld, fs_inst *inst)
2174 {
2175    const intel_device_info *devinfo = bld.shader->devinfo;
2176    brw_reg global_addr = inst->src[0];
2177    const brw_reg btd_record = inst->src[1];
2178 
2179    const unsigned unit = reg_unit(devinfo);
2180    const unsigned mlen = 2 * unit;
2181    const fs_builder ubld = bld.exec_all();
2182    brw_reg header = ubld.vgrf(BRW_TYPE_UD, 2 * unit);
2183 
2184    ubld.MOV(header, brw_imm_ud(0));
2185    switch (inst->opcode) {
2186    case SHADER_OPCODE_BTD_SPAWN_LOGICAL:
2187       assert(brw_type_size_bytes(global_addr.type) == 8 &&
2188              global_addr.stride == 0);
2189       global_addr.type = BRW_TYPE_UD;
2190       global_addr.stride = 1;
2191       ubld.group(2, 0).MOV(header, global_addr);
2192       break;
2193 
2194    case SHADER_OPCODE_BTD_RETIRE_LOGICAL:
2195       /* The bottom bit is the Stack ID release bit */
2196       ubld.group(1, 0).MOV(header, brw_imm_ud(1));
2197       break;
2198 
2199    default:
2200       unreachable("Invalid BTD message");
2201    }
2202 
2203    /* Stack IDs are always in R1 regardless of whether we're coming from a
2204     * bindless shader or a regular compute shader.
2205     */
2206    brw_reg stack_ids = retype(offset(header, bld, 1), BRW_TYPE_UW);
2207    bld.exec_all().MOV(stack_ids, retype(brw_vec8_grf(1 * unit, 0),
2208                                         BRW_TYPE_UW));
2209 
2210    unsigned ex_mlen = 0;
2211    brw_reg payload;
2212    if (inst->opcode == SHADER_OPCODE_BTD_SPAWN_LOGICAL) {
2213       ex_mlen = 2 * (inst->exec_size / 8);
2214       payload = bld.move_to_vgrf(btd_record, 1);
2215    } else {
2216       assert(inst->opcode == SHADER_OPCODE_BTD_RETIRE_LOGICAL);
2217       /* All these messages take a BTD and things complain if we don't provide
2218        * one for RETIRE.  However, it shouldn't ever actually get used so fill
2219        * it with zero.
2220        */
2221       ex_mlen = 2 * (inst->exec_size / 8);
2222       payload = bld.move_to_vgrf(brw_imm_uq(0), 1);
2223    }
2224 
2225    /* Update the original instruction. */
2226    inst->opcode = SHADER_OPCODE_SEND;
2227    inst->mlen = mlen;
2228    inst->ex_mlen = ex_mlen;
2229    inst->header_size = 0; /* HW docs require has_header = false */
2230    inst->send_has_side_effects = true;
2231    inst->send_is_volatile = false;
2232 
2233    /* Set up SFID and descriptors */
2234    inst->sfid = GEN_RT_SFID_BINDLESS_THREAD_DISPATCH;
2235    inst->desc = brw_btd_spawn_desc(devinfo, inst->exec_size,
2236                                    GEN_RT_BTD_MESSAGE_SPAWN);
2237    inst->resize_sources(4);
2238    inst->src[0] = brw_imm_ud(0); /* desc */
2239    inst->src[1] = brw_imm_ud(0); /* ex_desc */
2240    inst->src[2] = header;
2241    inst->src[3] = payload;
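   /* Summary of the resulting message shape: the first header GRF holds the
    * BTD global address (SPAWN) or the Stack ID release bit (RETIRE), the
    * second holds the per-lane stack IDs copied from R1, and the extended
    * payload carries the BTD record (zero-filled for RETIRE).
    */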
2242 }
2243 
2244 static void
2245 lower_trace_ray_logical_send(const fs_builder &bld, fs_inst *inst)
2246 {
2247    const intel_device_info *devinfo = bld.shader->devinfo;
2248    /* The emit_uniformize() in brw_fs_nir.cpp will generate a horizontal
2249     * stride of 0. Below we're doing a MOV() in SIMD2. Since we can't use UQ/Q
2250     * types on Gfx12.5, we need to tweak the stride with a value of 1 dword
2251     * so that the MOV operates on 2 components rather than twice the same
2252     * component.
2253     */
2254    brw_reg globals_addr = retype(inst->src[RT_LOGICAL_SRC_GLOBALS], BRW_TYPE_UD);
2255    globals_addr.stride = 1;
2256    const brw_reg bvh_level =
2257       inst->src[RT_LOGICAL_SRC_BVH_LEVEL].file == IMM ?
2258       inst->src[RT_LOGICAL_SRC_BVH_LEVEL] :
2259       bld.move_to_vgrf(inst->src[RT_LOGICAL_SRC_BVH_LEVEL],
2260                        inst->components_read(RT_LOGICAL_SRC_BVH_LEVEL));
2261    const brw_reg trace_ray_control =
2262       inst->src[RT_LOGICAL_SRC_TRACE_RAY_CONTROL].file == IMM ?
2263       inst->src[RT_LOGICAL_SRC_TRACE_RAY_CONTROL] :
2264       bld.move_to_vgrf(inst->src[RT_LOGICAL_SRC_TRACE_RAY_CONTROL],
2265                        inst->components_read(RT_LOGICAL_SRC_TRACE_RAY_CONTROL));
2266    const brw_reg synchronous_src = inst->src[RT_LOGICAL_SRC_SYNCHRONOUS];
2267    assert(synchronous_src.file == IMM);
2268    const bool synchronous = synchronous_src.ud;
2269 
2270    const unsigned unit = reg_unit(devinfo);
2271    const unsigned mlen = unit;
2272    const fs_builder ubld = bld.exec_all();
2273    brw_reg header = ubld.vgrf(BRW_TYPE_UD);
2274    ubld.MOV(header, brw_imm_ud(0));
2275    ubld.group(2, 0).MOV(header, globals_addr);
2276    if (synchronous)
2277       ubld.group(1, 0).MOV(byte_offset(header, 16), brw_imm_ud(synchronous));
2278 
2279    const unsigned ex_mlen = inst->exec_size / 8;
2280    brw_reg payload = bld.vgrf(BRW_TYPE_UD);
2281    if (bvh_level.file == IMM &&
2282        trace_ray_control.file == IMM) {
2283       uint32_t high = devinfo->ver >= 20 ? 10 : 9;
2284       bld.MOV(payload, brw_imm_ud(SET_BITS(trace_ray_control.ud, high, 8) |
2285                                   (bvh_level.ud & 0x7)));
2286    } else {
2287       bld.SHL(payload, trace_ray_control, brw_imm_ud(8));
2288       bld.OR(payload, payload, bvh_level);
2289    }
2290 
2291    /* When doing synchronous traversal, the HW implicitly computes the
2292     * stack_id using the following formula:
2293     *
2294     *    EUID[3:0] & THREAD_ID[2:0] & SIMD_LANE_ID[3:0]
2295     *
2296     * Only in the asynchronous case do we need to set the stack_id from the
2297     * payload register.
2298     */
2299    if (!synchronous) {
2300       bld.AND(subscript(payload, BRW_TYPE_UW, 1),
2301               retype(brw_vec8_grf(1 * unit, 0), BRW_TYPE_UW),
2302               brw_imm_uw(0x7ff));
2303    }
2304 
2305    /* Update the original instruction. */
2306    inst->opcode = SHADER_OPCODE_SEND;
2307    inst->mlen = mlen;
2308    inst->ex_mlen = ex_mlen;
2309    inst->header_size = 0; /* HW docs require has_header = false */
2310    inst->send_has_side_effects = true;
2311    inst->send_is_volatile = false;
2312 
2313    /* Set up SFID and descriptors */
2314    inst->sfid = GEN_RT_SFID_RAY_TRACE_ACCELERATOR;
2315    inst->desc = brw_rt_trace_ray_desc(devinfo, inst->exec_size);
2316    inst->resize_sources(4);
2317    inst->src[0] = brw_imm_ud(0); /* desc */
2318    inst->src[1] = brw_imm_ud(0); /* ex_desc */
2319    inst->src[2] = header;
2320    inst->src[3] = payload;
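   /* Summary of the per-lane payload built above: bits 2:0 hold the BVH
    * level, bits 9:8 (10:8 on Xe2) the trace ray control, and in the
    * asynchronous case the second 16-bit word holds the stack ID taken
    * from R1.
    */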
2321 }
2322 
2323 static void
2324 lower_get_buffer_size(const fs_builder &bld, fs_inst *inst)
2325 {
2326    const intel_device_info *devinfo = bld.shader->devinfo;
2327    /* Since we can only execute this instruction on uniform bti/surface
2328     * handles, brw_fs_nir.cpp should already have limited this to SIMD8
2329     * (SIMD16 on Xe2+). */
2330    assert(inst->exec_size == (devinfo->ver < 20 ? 8 : 16));
2331 
2332    brw_reg surface = inst->src[GET_BUFFER_SIZE_SRC_SURFACE];
2333    brw_reg surface_handle = inst->src[GET_BUFFER_SIZE_SRC_SURFACE_HANDLE];
2334    brw_reg lod = inst->src[GET_BUFFER_SIZE_SRC_LOD];
2335 
2336    inst->opcode = SHADER_OPCODE_SEND;
2337    inst->mlen = inst->exec_size / 8;
2338    inst->resize_sources(3);
2339    inst->ex_mlen = 0;
2340    inst->ex_desc = 0;
2341 
2342    /* src[0] & src[1] are filled by setup_surface_descriptors() */
2343    inst->src[2] = lod;
2344 
2345    const uint32_t return_format = GFX8_SAMPLER_RETURN_FORMAT_32BITS;
2346 
2347    const uint32_t desc = brw_sampler_desc(devinfo, 0, 0,
2348                                           GFX5_SAMPLER_MESSAGE_SAMPLE_RESINFO,
2349                                           BRW_SAMPLER_SIMD_MODE_SIMD8,
2350                                           return_format);
2351 
2352    inst->dst = retype(inst->dst, BRW_TYPE_UW);
2353    inst->sfid = BRW_SFID_SAMPLER;
2354    setup_surface_descriptors(bld, inst, desc, surface, surface_handle);
2355 }
2356 
2357 bool
2358 brw_fs_lower_logical_sends(fs_visitor &s)
2359 {
2360    const intel_device_info *devinfo = s.devinfo;
2361    bool progress = false;
2362 
2363    foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
2364       const fs_builder ibld(&s, block, inst);
2365 
2366       switch (inst->opcode) {
2367       case FS_OPCODE_FB_WRITE_LOGICAL:
2368          assert(s.stage == MESA_SHADER_FRAGMENT);
2369          lower_fb_write_logical_send(ibld, inst,
2370                                      brw_wm_prog_data(s.prog_data),
2371                                      (const brw_wm_prog_key *)s.key,
2372                                      s.fs_payload());
2373          break;
2374 
2375       case FS_OPCODE_FB_READ_LOGICAL:
2376          lower_fb_read_logical_send(ibld, inst, brw_wm_prog_data(s.prog_data));
2377          break;
2378 
2379       case SHADER_OPCODE_TEX_LOGICAL:
2380       case SHADER_OPCODE_TXD_LOGICAL:
2381       case SHADER_OPCODE_TXF_LOGICAL:
2382       case SHADER_OPCODE_TXL_LOGICAL:
2383       case SHADER_OPCODE_TXS_LOGICAL:
2384       case SHADER_OPCODE_IMAGE_SIZE_LOGICAL:
2385       case FS_OPCODE_TXB_LOGICAL:
2386       case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
2387       case SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL:
2388       case SHADER_OPCODE_TXF_MCS_LOGICAL:
2389       case SHADER_OPCODE_LOD_LOGICAL:
2390       case SHADER_OPCODE_TG4_LOGICAL:
2391       case SHADER_OPCODE_TG4_BIAS_LOGICAL:
2392       case SHADER_OPCODE_TG4_EXPLICIT_LOD_LOGICAL:
2393       case SHADER_OPCODE_TG4_IMPLICIT_LOD_LOGICAL:
2394       case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
2395       case SHADER_OPCODE_TG4_OFFSET_LOD_LOGICAL:
2396       case SHADER_OPCODE_TG4_OFFSET_BIAS_LOGICAL:
2397       case SHADER_OPCODE_SAMPLEINFO_LOGICAL:
2398          lower_sampler_logical_send(ibld, inst);
2399          break;
2400 
2401       case SHADER_OPCODE_GET_BUFFER_SIZE:
2402          lower_get_buffer_size(ibld, inst);
2403          break;
2404 
2405       case SHADER_OPCODE_MEMORY_LOAD_LOGICAL:
2406       case SHADER_OPCODE_MEMORY_STORE_LOGICAL:
2407       case SHADER_OPCODE_MEMORY_ATOMIC_LOGICAL:
2408          if (devinfo->ver >= 20 ||
2409              (devinfo->has_lsc &&
2410               inst->src[MEMORY_LOGICAL_MODE].ud != MEMORY_MODE_TYPED))
2411             lower_lsc_memory_logical_send(ibld, inst);
2412          else
2413             lower_hdc_memory_logical_send(ibld, inst);
2414          break;
2415 
2416       case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL:
2417          if (devinfo->has_lsc && !s.compiler->indirect_ubos_use_sampler)
2418             lower_lsc_varying_pull_constant_logical_send(ibld, inst);
2419          else
2420             lower_varying_pull_constant_logical_send(ibld, inst);
2421          break;
2422 
2423       case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
2424       case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
2425       case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
2426          lower_interpolator_logical_send(ibld, inst,
2427                                          (const brw_wm_prog_key *)s.key,
2428                                          brw_wm_prog_data(s.prog_data));
2429          break;
2430 
2431       case SHADER_OPCODE_BTD_SPAWN_LOGICAL:
2432       case SHADER_OPCODE_BTD_RETIRE_LOGICAL:
2433          lower_btd_logical_send(ibld, inst);
2434          break;
2435 
2436       case RT_OPCODE_TRACE_RAY_LOGICAL:
2437          lower_trace_ray_logical_send(ibld, inst);
2438          break;
2439 
2440       case SHADER_OPCODE_URB_READ_LOGICAL:
2441          if (devinfo->ver < 20)
2442             lower_urb_read_logical_send(ibld, inst);
2443          else
2444             lower_urb_read_logical_send_xe2(ibld, inst);
2445          break;
2446 
2447       case SHADER_OPCODE_URB_WRITE_LOGICAL:
2448          if (devinfo->ver < 20)
2449             lower_urb_write_logical_send(ibld, inst);
2450          else
2451             lower_urb_write_logical_send_xe2(ibld, inst);
2452 
2453          break;
2454 
2455       default:
2456          continue;
2457       }
2458 
2459       progress = true;
2460    }
2461 
2462    if (progress)
2463       s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
2464 
2465    return progress;
2466 }
2467 
2468 /**
2469  * Turns the generic expression-style uniform pull constant load instruction
2470  * into a hardware-specific series of instructions for loading a pull
2471  * constant.
2472  *
2473  * The expression style allows the CSE pass before this to optimize out
2474  * repeated loads from the same offset, and gives the pre-register-allocation
2475  * scheduling full flexibility, while the conversion to native instructions
2476  * allows the post-register-allocation scheduler the best information
2477  * possible.
2478  *
2479  * Note that execution masking for setting up pull constant loads is special:
2480  * the channels that need to be written are unrelated to the current execution
2481  * mask, since a later instruction will use one of the result channels as a
2482  * source operand for all 8 or 16 of its channels.
2483  */
2484 bool
2485 brw_fs_lower_uniform_pull_constant_loads(fs_visitor &s)
2486 {
2487    const intel_device_info *devinfo = s.devinfo;
2488    bool progress = false;
2489 
2490    foreach_block_and_inst (block, fs_inst, inst, s.cfg) {
2491       if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2492          continue;
2493 
2494       const brw_reg surface = inst->src[PULL_UNIFORM_CONSTANT_SRC_SURFACE];
2495       const brw_reg surface_handle = inst->src[PULL_UNIFORM_CONSTANT_SRC_SURFACE_HANDLE];
2496       const brw_reg offset_B = inst->src[PULL_UNIFORM_CONSTANT_SRC_OFFSET];
2497       const brw_reg size_B = inst->src[PULL_UNIFORM_CONSTANT_SRC_SIZE];
2498       assert(surface.file == BAD_FILE || surface_handle.file == BAD_FILE);
2499       assert(offset_B.file == IMM);
2500       assert(size_B.file == IMM);
2501 
2502       if (devinfo->has_lsc) {
2503          const fs_builder ubld =
2504             fs_builder(&s, block, inst).group(8, 0).exec_all();
2505 
2506          const brw_reg payload = ubld.vgrf(BRW_TYPE_UD);
2507          ubld.MOV(payload, offset_B);
2508 
2509          inst->sfid = GFX12_SFID_UGM;
2510          inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD,
2511                                    surface_handle.file == BAD_FILE ?
2512                                    LSC_ADDR_SURFTYPE_BTI :
2513                                    LSC_ADDR_SURFTYPE_BSS,
2514                                    LSC_ADDR_SIZE_A32,
2515                                    LSC_DATA_SIZE_D32,
2516                                    inst->size_written / 4,
2517                                    true /* transpose */,
2518                                    LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS));
2519 
2520          /* Update the original instruction. */
2521          inst->opcode = SHADER_OPCODE_SEND;
2522          inst->mlen = lsc_msg_addr_len(devinfo, LSC_ADDR_SIZE_A32, 1);
2523          inst->send_ex_bso = surface_handle.file != BAD_FILE &&
2524                              s.compiler->extended_bindless_surface_offset;
2525          inst->ex_mlen = 0;
2526          inst->header_size = 0;
2527          inst->send_has_side_effects = false;
2528          inst->send_is_volatile = true;
2529          inst->exec_size = 1;
2530 
2531          /* Finally, the payload */
2532 
2533          inst->resize_sources(3);
2534          setup_lsc_surface_descriptors(ubld, inst, inst->desc,
2535                                        surface.file != BAD_FILE ?
2536                                        surface : surface_handle);
2537          inst->src[2] = payload;
2538 
2539          s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
2540       } else {
2541          const fs_builder ubld = fs_builder(&s, block, inst).exec_all();
2542          brw_reg header = fs_builder(&s, 8).exec_all().vgrf(BRW_TYPE_UD);
2543 
2544          ubld.group(8, 0).MOV(header,
2545                               retype(brw_vec8_grf(0, 0), BRW_TYPE_UD));
2546          ubld.group(1, 0).MOV(component(header, 2),
2547                               brw_imm_ud(offset_B.ud / 16));
2548 
2549          inst->sfid = GFX6_SFID_DATAPORT_CONSTANT_CACHE;
2550          inst->opcode = SHADER_OPCODE_SEND;
2551          inst->header_size = 1;
2552          inst->mlen = 1;
2553 
2554          uint32_t desc =
2555             brw_dp_oword_block_rw_desc(devinfo, true /* align_16B */,
2556                                        size_B.ud / 4, false /* write */);
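         /* Worked example with assumed values: offset_B.ud == 64 and
          * size_B.ud == 32 give header DW2 == 64 / 16 == 4 (the offset in
          * OWord units) and a descriptor for an aligned OWord block read of
          * 32 / 4 == 8 dwords.
          */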
2557 
2558          inst->resize_sources(4);
2559 
2560          setup_surface_descriptors(ubld, inst, desc, surface, surface_handle);
2561 
2562          inst->src[2] = header;
2563          inst->src[3] = brw_reg(); /* unused for reads */
2564 
2565          s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
2566       }
2567 
2568       progress = true;
2569    }
2570 
2571    return progress;
2572 }
2573