1 /*
2 * Copyright © 2010, 2022 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
/**
 * @file
 *
 * Lowering of logical SEND-style opcodes (URB reads and writes, framebuffer
 * writes and reads, sampler messages, and memory/LSC messages) into actual
 * hardware SEND instructions with explicit descriptors and payloads.
 */
27
28 #include "brw_eu.h"
29 #include "brw_fs.h"
30 #include "brw_fs_builder.h"
31
32 using namespace brw;
33
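/**
 * Lower a URB read logical opcode into a legacy (pre-LSC) URB SIMD8 read
 * SEND message, building the header payload from the URB handle and the
 * optional per-slot offsets.
 */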
34 static void
lower_urb_read_logical_send(const fs_builder &bld, fs_inst *inst)
36 {
37 const intel_device_info *devinfo = bld.shader->devinfo;
38 const bool per_slot_present =
39 inst->src[URB_LOGICAL_SRC_PER_SLOT_OFFSETS].file != BAD_FILE;
40
41 assert(inst->size_written % REG_SIZE == 0);
42 assert(inst->header_size == 0);
43
44 brw_reg payload_sources[2];
45 unsigned header_size = 0;
46 payload_sources[header_size++] = inst->src[URB_LOGICAL_SRC_HANDLE];
47 if (per_slot_present)
48 payload_sources[header_size++] = inst->src[URB_LOGICAL_SRC_PER_SLOT_OFFSETS];
49
50 brw_reg payload = brw_vgrf(bld.shader->alloc.allocate(header_size),
51 BRW_TYPE_F);
52 bld.LOAD_PAYLOAD(payload, payload_sources, header_size, header_size);
53
54 inst->opcode = SHADER_OPCODE_SEND;
55 inst->header_size = header_size;
56
57 inst->sfid = BRW_SFID_URB;
58 inst->desc = brw_urb_desc(devinfo,
59 GFX8_URB_OPCODE_SIMD8_READ,
60 per_slot_present,
61 false,
62 inst->offset);
63
64 inst->mlen = header_size;
65 inst->ex_desc = 0;
66 inst->ex_mlen = 0;
67 inst->send_is_volatile = true;
68
69 inst->resize_sources(4);
70
71 inst->src[0] = brw_imm_ud(0); /* desc */
72 inst->src[1] = brw_imm_ud(0); /* ex_desc */
73 inst->src[2] = payload;
74 inst->src[3] = brw_null_reg();
75 }
76
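/**
 * Xe2+ variant of the URB read lowering: URB access goes through the LSC,
 * so the handle plus offsets are folded into a flat A32 byte address and an
 * LSC load message is emitted instead of a legacy URB read.
 */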
77 static void
lower_urb_read_logical_send_xe2(const fs_builder &bld, fs_inst *inst)
79 {
80 const intel_device_info *devinfo = bld.shader->devinfo;
81 assert(devinfo->has_lsc);
82
83 assert(inst->size_written % (REG_SIZE * reg_unit(devinfo)) == 0);
84 assert(inst->header_size == 0);
85
86 /* Get the logical send arguments. */
87 const brw_reg handle = inst->src[URB_LOGICAL_SRC_HANDLE];
88
89 /* Calculate the total number of components of the payload. */
90 const unsigned dst_comps = inst->size_written / (REG_SIZE * reg_unit(devinfo));
91
92 brw_reg payload = bld.vgrf(BRW_TYPE_UD);
93
94 bld.MOV(payload, handle);
95
   /* The low 24 bits of the URB handle are a byte offset into the URB area.
    * Add the (OWord) offset of the read to this value.
    */
99 if (inst->offset) {
100 bld.ADD(payload, payload, brw_imm_ud(inst->offset * 16));
101 inst->offset = 0;
102 }
103
104 brw_reg offsets = inst->src[URB_LOGICAL_SRC_PER_SLOT_OFFSETS];
105 if (offsets.file != BAD_FILE) {
106 bld.ADD(payload, payload, offsets);
107 }
108
109 inst->sfid = BRW_SFID_URB;
110
111 assert((dst_comps >= 1 && dst_comps <= 4) || dst_comps == 8);
112
113 inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD,
114 LSC_ADDR_SURFTYPE_FLAT, LSC_ADDR_SIZE_A32,
115 LSC_DATA_SIZE_D32, dst_comps /* num_channels */,
116 false /* transpose */,
117 LSC_CACHE(devinfo, LOAD, L1UC_L3UC));
118
119 /* Update the original instruction. */
120 inst->opcode = SHADER_OPCODE_SEND;
121 inst->mlen = lsc_msg_addr_len(devinfo, LSC_ADDR_SIZE_A32, inst->exec_size);
122 inst->ex_mlen = 0;
123 inst->header_size = 0;
124 inst->send_has_side_effects = true;
125 inst->send_is_volatile = false;
126
127 inst->resize_sources(4);
128
   inst->src[0] = brw_imm_ud(0); /* desc */
   inst->src[1] = brw_imm_ud(0); /* ex_desc */
131
132 inst->src[2] = payload;
133 inst->src[3] = brw_null_reg();
134 }
135
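/**
 * Lower a URB write logical opcode into a legacy (pre-LSC) URB SIMD8 write
 * SEND message, assembling the handle, optional per-slot offsets, optional
 * channel mask and the data into a single payload.
 */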
136 static void
lower_urb_write_logical_send(const fs_builder &bld, fs_inst *inst)
138 {
139 const intel_device_info *devinfo = bld.shader->devinfo;
140 const bool per_slot_present =
141 inst->src[URB_LOGICAL_SRC_PER_SLOT_OFFSETS].file != BAD_FILE;
142 const bool channel_mask_present =
143 inst->src[URB_LOGICAL_SRC_CHANNEL_MASK].file != BAD_FILE;
144
145 assert(inst->header_size == 0);
146
147 const unsigned length = 1 + per_slot_present + channel_mask_present +
148 inst->components_read(URB_LOGICAL_SRC_DATA);
149
150 brw_reg *payload_sources = new brw_reg[length];
151 brw_reg payload = brw_vgrf(bld.shader->alloc.allocate(length),
152 BRW_TYPE_F);
153
154 unsigned header_size = 0;
155 payload_sources[header_size++] = inst->src[URB_LOGICAL_SRC_HANDLE];
156 if (per_slot_present)
157 payload_sources[header_size++] = inst->src[URB_LOGICAL_SRC_PER_SLOT_OFFSETS];
158
159 if (channel_mask_present)
160 payload_sources[header_size++] = inst->src[URB_LOGICAL_SRC_CHANNEL_MASK];
161
162 for (unsigned i = header_size, j = 0; i < length; i++, j++)
163 payload_sources[i] = offset(inst->src[URB_LOGICAL_SRC_DATA], bld, j);
164
165 bld.LOAD_PAYLOAD(payload, payload_sources, length, header_size);
166
167 delete [] payload_sources;
168
169 inst->opcode = SHADER_OPCODE_SEND;
170 inst->header_size = header_size;
171 inst->dst = brw_null_reg();
172
173 inst->sfid = BRW_SFID_URB;
174 inst->desc = brw_urb_desc(devinfo,
175 GFX8_URB_OPCODE_SIMD8_WRITE,
176 per_slot_present,
177 channel_mask_present,
178 inst->offset);
179
180 inst->mlen = length;
181 inst->ex_desc = 0;
182 inst->ex_mlen = 0;
183 inst->send_has_side_effects = true;
184
185 inst->resize_sources(4);
186
187 inst->src[0] = brw_imm_ud(0); /* desc */
188 inst->src[1] = brw_imm_ud(0); /* ex_desc */
189 inst->src[2] = payload;
190 inst->src[3] = brw_null_reg();
191 }
192
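/**
 * Xe2+ variant of the URB write lowering, emitting an LSC store (or
 * store_cmask) message with the address in the first payload and the data
 * in the second.
 */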
193 static void
lower_urb_write_logical_send_xe2(const fs_builder &bld, fs_inst *inst)
195 {
196 const intel_device_info *devinfo = bld.shader->devinfo;
197 assert(devinfo->has_lsc);
198
199 /* Get the logical send arguments. */
200 const brw_reg handle = inst->src[URB_LOGICAL_SRC_HANDLE];
201 const brw_reg src = inst->components_read(URB_LOGICAL_SRC_DATA) ?
202 inst->src[URB_LOGICAL_SRC_DATA] : brw_reg(brw_imm_ud(0));
203 assert(brw_type_size_bytes(src.type) == 4);
204
205 /* Calculate the total number of components of the payload. */
206 const unsigned src_comps = MAX2(1, inst->components_read(URB_LOGICAL_SRC_DATA));
207 const unsigned src_sz = brw_type_size_bytes(src.type);
208
209 brw_reg payload = bld.vgrf(BRW_TYPE_UD);
210
211 bld.MOV(payload, handle);
212
   /* The low 24 bits of the URB handle are a byte offset into the URB area.
    * Add the (OWord) offset of the write to this value.
    */
216 if (inst->offset) {
217 bld.ADD(payload, payload, brw_imm_ud(inst->offset * 16));
218 inst->offset = 0;
219 }
220
221 brw_reg offsets = inst->src[URB_LOGICAL_SRC_PER_SLOT_OFFSETS];
222 if (offsets.file != BAD_FILE) {
223 bld.ADD(payload, payload, offsets);
224 }
225
226 const brw_reg cmask = inst->src[URB_LOGICAL_SRC_CHANNEL_MASK];
227 unsigned mask = 0;
228
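   /* The channel mask immediate carries the mask in its upper 16 bits (the
    * position the legacy URB write messages expect it in), so shift it back
    * down to recover the raw per-component mask for the LSC cmask encoding.
    */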
229 if (cmask.file != BAD_FILE) {
230 assert(cmask.file == IMM);
231 assert(cmask.type == BRW_TYPE_UD);
232 mask = cmask.ud >> 16;
233 }
234
235 brw_reg payload2 = bld.move_to_vgrf(src, src_comps);
236 const unsigned ex_mlen = (src_comps * src_sz * inst->exec_size) / REG_SIZE;
237
238 inst->sfid = BRW_SFID_URB;
239
240 enum lsc_opcode op = mask ? LSC_OP_STORE_CMASK : LSC_OP_STORE;
241 inst->desc = lsc_msg_desc(devinfo, op,
242 LSC_ADDR_SURFTYPE_FLAT, LSC_ADDR_SIZE_A32,
243 LSC_DATA_SIZE_D32,
244 mask ? mask : src_comps /* num_channels */,
245 false /* transpose */,
246 LSC_CACHE(devinfo, STORE, L1UC_L3UC));
247
248
249 /* Update the original instruction. */
250 inst->opcode = SHADER_OPCODE_SEND;
251 inst->mlen = lsc_msg_addr_len(devinfo, LSC_ADDR_SIZE_A32, inst->exec_size);
252 inst->ex_mlen = ex_mlen;
253 inst->header_size = 0;
254 inst->send_has_side_effects = true;
255 inst->send_is_volatile = false;
256
257 inst->resize_sources(4);
258
   inst->src[0] = brw_imm_ud(0); /* desc */
   inst->src[1] = brw_imm_ud(0); /* ex_desc */
261
262 inst->src[2] = payload;
263 inst->src[3] = payload2;
264 }
265
266 static void
setup_color_payload(const fs_builder &bld, const brw_wm_prog_key *key,
                    brw_reg *dst, brw_reg color, unsigned components)
269 {
270 if (key->clamp_fragment_color) {
271 brw_reg tmp = bld.vgrf(BRW_TYPE_F, 4);
272 assert(color.type == BRW_TYPE_F);
273
274 for (unsigned i = 0; i < components; i++)
275 set_saturate(true,
276 bld.MOV(offset(tmp, bld, i), offset(color, bld, i)));
277
278 color = tmp;
279 }
280
281 for (unsigned i = 0; i < components; i++)
282 dst[i] = offset(color, bld, i);
283 }
284
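/**
 * Expand a color source into per-component payload sources, first clamping
 * each component to [0, 1] when the key requires clamp_fragment_color.
 */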
285 static void
lower_fb_write_logical_send(const fs_builder &bld, fs_inst *inst,
                            const struct brw_wm_prog_data *prog_data,
                            const brw_wm_prog_key *key,
                            const fs_thread_payload &fs_payload)
290 {
291 assert(inst->src[FB_WRITE_LOGICAL_SRC_COMPONENTS].file == IMM);
292 const intel_device_info *devinfo = bld.shader->devinfo;
293 const brw_reg color0 = inst->src[FB_WRITE_LOGICAL_SRC_COLOR0];
294 const brw_reg color1 = inst->src[FB_WRITE_LOGICAL_SRC_COLOR1];
295 const brw_reg src0_alpha = inst->src[FB_WRITE_LOGICAL_SRC_SRC0_ALPHA];
296 const brw_reg src_depth = inst->src[FB_WRITE_LOGICAL_SRC_SRC_DEPTH];
297 const brw_reg dst_depth = inst->src[FB_WRITE_LOGICAL_SRC_DST_DEPTH];
298 const brw_reg src_stencil = inst->src[FB_WRITE_LOGICAL_SRC_SRC_STENCIL];
299 brw_reg sample_mask = inst->src[FB_WRITE_LOGICAL_SRC_OMASK];
300 const unsigned components =
301 inst->src[FB_WRITE_LOGICAL_SRC_COMPONENTS].ud;
302
303 assert(inst->target != 0 || src0_alpha.file == BAD_FILE);
304
305 brw_reg sources[15];
306 int header_size = 2, payload_header_size;
307 unsigned length = 0;
308
309 if (devinfo->ver < 11 &&
310 (color1.file != BAD_FILE || key->nr_color_regions > 1)) {
311
312 /* From the Sandy Bridge PRM, volume 4, page 198:
313 *
314 * "Dispatched Pixel Enables. One bit per pixel indicating
315 * which pixels were originally enabled when the thread was
316 * dispatched. This field is only required for the end-of-
317 * thread message and on all dual-source messages."
318 */
319 const fs_builder ubld = bld.exec_all().group(8, 0);
320
321 brw_reg header = ubld.vgrf(BRW_TYPE_UD, 2);
322 if (bld.group() < 16) {
323 /* The header starts off as g0 and g1 for the first half */
324 ubld.group(16, 0).MOV(header, retype(brw_vec8_grf(0, 0),
325 BRW_TYPE_UD));
326 } else {
327 /* The header starts off as g0 and g2 for the second half */
328 assert(bld.group() < 32);
329 const brw_reg header_sources[2] = {
330 retype(brw_vec8_grf(0, 0), BRW_TYPE_UD),
331 retype(brw_vec8_grf(2, 0), BRW_TYPE_UD),
332 };
333 ubld.LOAD_PAYLOAD(header, header_sources, 2, 0);
334
335 /* Gfx12 will require additional fix-ups if we ever hit this path. */
336 assert(devinfo->ver < 12);
337 }
338
339 uint32_t g00_bits = 0;
340
341 /* Set "Source0 Alpha Present to RenderTarget" bit in message
342 * header.
343 */
344 if (src0_alpha.file != BAD_FILE)
345 g00_bits |= 1 << 11;
346
347 /* Set computes stencil to render target */
348 if (prog_data->computed_stencil)
349 g00_bits |= 1 << 14;
350
351 if (g00_bits) {
352 /* OR extra bits into g0.0 */
353 ubld.group(1, 0).OR(component(header, 0),
354 retype(brw_vec1_grf(0, 0), BRW_TYPE_UD),
355 brw_imm_ud(g00_bits));
356 }
357
358 /* Set the render target index for choosing BLEND_STATE. */
359 if (inst->target > 0) {
360 ubld.group(1, 0).MOV(component(header, 2), brw_imm_ud(inst->target));
361 }
362
363 if (prog_data->uses_kill) {
364 ubld.group(1, 0).MOV(retype(component(header, 15), BRW_TYPE_UW),
365 brw_sample_mask_reg(bld));
366 }
367
368 assert(length == 0);
369 sources[0] = header;
370 sources[1] = horiz_offset(header, 8);
371 length = 2;
372 }
373 assert(length == 0 || length == 2);
374 header_size = length;
375
376 if (fs_payload.aa_dest_stencil_reg[0]) {
377 assert(inst->group < 16);
378 sources[length] = brw_vgrf(bld.shader->alloc.allocate(1), BRW_TYPE_F);
379 bld.group(8, 0).exec_all().annotate("FB write stencil/AA alpha")
380 .MOV(sources[length],
381 brw_reg(brw_vec8_grf(fs_payload.aa_dest_stencil_reg[0], 0)));
382 length++;
383 }
384
385 if (src0_alpha.file != BAD_FILE) {
386 for (unsigned i = 0; i < bld.dispatch_width() / 8; i++) {
387 const fs_builder &ubld = bld.exec_all().group(8, i)
388 .annotate("FB write src0 alpha");
389 const brw_reg tmp = ubld.vgrf(BRW_TYPE_F);
390 ubld.MOV(tmp, horiz_offset(src0_alpha, i * 8));
391 setup_color_payload(ubld, key, &sources[length], tmp, 1);
392 length++;
393 }
394 }
395
396 if (sample_mask.file != BAD_FILE) {
397 const brw_reg tmp = brw_vgrf(bld.shader->alloc.allocate(reg_unit(devinfo)),
398 BRW_TYPE_UD);
399
      /* Hand over gl_SampleMask.  Only the lower 16 bits of each channel are
       * relevant.  Since it's unsigned single words, one vgrf is always
       * 16-wide, but only the lower or higher 8 channels will be used by the
       * hardware when doing a SIMD8 write, depending on whether we have
       * selected the subspans for the first or second half respectively.
       */
406 assert(sample_mask.file != BAD_FILE &&
407 brw_type_size_bytes(sample_mask.type) == 4);
408 sample_mask.type = BRW_TYPE_UW;
409 sample_mask.stride *= 2;
410
411 bld.exec_all().annotate("FB write oMask")
412 .MOV(horiz_offset(retype(tmp, BRW_TYPE_UW),
413 inst->group % (16 * reg_unit(devinfo))),
414 sample_mask);
415
416 for (unsigned i = 0; i < reg_unit(devinfo); i++)
417 sources[length++] = byte_offset(tmp, REG_SIZE * i);
418 }
419
420 payload_header_size = length;
421
422 setup_color_payload(bld, key, &sources[length], color0, components);
423 length += 4;
424
425 if (color1.file != BAD_FILE) {
426 setup_color_payload(bld, key, &sources[length], color1, components);
427 length += 4;
428 }
429
430 if (src_depth.file != BAD_FILE) {
431 sources[length] = src_depth;
432 length++;
433 }
434
435 if (dst_depth.file != BAD_FILE) {
436 sources[length] = dst_depth;
437 length++;
438 }
439
440 if (src_stencil.file != BAD_FILE) {
441 assert(bld.dispatch_width() == 8 * reg_unit(devinfo));
442
443 /* XXX: src_stencil is only available on gfx9+. dst_depth is never
444 * available on gfx9+. As such it's impossible to have both enabled at the
445 * same time and therefore length cannot overrun the array.
446 */
447 assert(length < 15 * reg_unit(devinfo));
448
449 sources[length] = bld.vgrf(BRW_TYPE_UD);
450 bld.exec_all().annotate("FB write OS")
451 .MOV(retype(sources[length], BRW_TYPE_UB),
452 subscript(src_stencil, BRW_TYPE_UB, 0));
453 length++;
454 }
455
456 /* Send from the GRF */
457 brw_reg payload = brw_vgrf(-1, BRW_TYPE_F);
458 fs_inst *load = bld.LOAD_PAYLOAD(payload, sources, length, payload_header_size);
459 payload.nr = bld.shader->alloc.allocate(regs_written(load));
460 load->dst = payload;
461
462 uint32_t msg_ctl = brw_fb_write_msg_control(inst, prog_data);
463
464 /* XXX - Bit 13 Per-sample PS enable */
465 inst->desc =
466 (inst->group / 16) << 11 | /* rt slot group */
467 brw_fb_write_desc(devinfo, inst->target, msg_ctl, inst->last_rt,
468 0 /* coarse_rt_write */);
469
470 brw_reg desc = brw_imm_ud(0);
471 if (prog_data->coarse_pixel_dispatch == BRW_ALWAYS) {
472 inst->desc |= (1 << 18);
473 } else if (prog_data->coarse_pixel_dispatch == BRW_SOMETIMES) {
474 STATIC_ASSERT(INTEL_MSAA_FLAG_COARSE_RT_WRITES == (1 << 18));
475 const fs_builder &ubld = bld.exec_all().group(8, 0);
476 desc = ubld.vgrf(BRW_TYPE_UD);
477 ubld.AND(desc, dynamic_msaa_flags(prog_data),
478 brw_imm_ud(INTEL_MSAA_FLAG_COARSE_RT_WRITES));
479 desc = component(desc, 0);
480 }
481
482 uint32_t ex_desc = 0;
483 if (devinfo->ver >= 20) {
484 ex_desc = inst->target << 21 |
485 (key->nr_color_regions == 0) << 20 |
486 (src0_alpha.file != BAD_FILE) << 15 |
487 (src_stencil.file != BAD_FILE) << 14 |
488 (src_depth.file != BAD_FILE) << 13 |
489 (sample_mask.file != BAD_FILE) << 12;
490 } else if (devinfo->ver >= 11) {
491 /* Set the "Render Target Index" and "Src0 Alpha Present" fields
492 * in the extended message descriptor, in lieu of using a header.
493 */
494 ex_desc = inst->target << 12 | (src0_alpha.file != BAD_FILE) << 15;
495
496 if (key->nr_color_regions == 0)
497 ex_desc |= 1 << 20; /* Null Render Target */
498 }
499 inst->ex_desc = ex_desc;
500
501 inst->opcode = SHADER_OPCODE_SEND;
502 inst->resize_sources(3);
503 inst->sfid = GFX6_SFID_DATAPORT_RENDER_CACHE;
504 inst->src[0] = desc;
505 inst->src[1] = brw_imm_ud(0);
506 inst->src[2] = payload;
507 inst->mlen = regs_written(load);
508 inst->ex_mlen = 0;
509 inst->header_size = header_size;
510 inst->check_tdr = true;
511 inst->send_has_side_effects = true;
512 }
513
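/**
 * Lower a framebuffer (render target) read logical opcode into a render
 * cache SEND with a two-register header based on g0/g1 (or g0/g2 for the
 * second half of a SIMD32 thread).
 */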
514 static void
lower_fb_read_logical_send(const fs_builder &bld, fs_inst *inst,
                           const struct brw_wm_prog_data *wm_prog_data)
517 {
518 const intel_device_info *devinfo = bld.shader->devinfo;
519 const fs_builder &ubld = bld.exec_all().group(8, 0);
520 const unsigned length = 2;
521 const brw_reg header = ubld.vgrf(BRW_TYPE_UD, length);
522
523 assert(devinfo->ver >= 9 && devinfo->ver < 20);
524
525 if (bld.group() < 16) {
526 ubld.group(16, 0).MOV(header, retype(brw_vec8_grf(0, 0),
527 BRW_TYPE_UD));
528 } else {
529 assert(bld.group() < 32);
530 const brw_reg header_sources[] = {
531 retype(brw_vec8_grf(0, 0), BRW_TYPE_UD),
532 retype(brw_vec8_grf(2, 0), BRW_TYPE_UD)
533 };
534 ubld.LOAD_PAYLOAD(header, header_sources, ARRAY_SIZE(header_sources), 0);
535
536 if (devinfo->ver >= 12) {
537 /* On Gfx12 the Viewport and Render Target Array Index fields (AKA
538 * Poly 0 Info) are provided in r1.1 instead of r0.0, and the render
539 * target message header format was updated accordingly -- However
540 * the updated format only works for the lower 16 channels in a
541 * SIMD32 thread, since the higher 16 channels want the subspan data
542 * from r2 instead of r1, so we need to copy over the contents of
543 * r1.1 in order to fix things up.
544 */
545 ubld.group(1, 0).MOV(component(header, 9),
546 retype(brw_vec1_grf(1, 1), BRW_TYPE_UD));
547 }
548 }
549
550 /* BSpec 12470 (Gfx8-11), BSpec 47842 (Gfx12+) :
551 *
552 * "Must be zero for Render Target Read message."
553 *
554 * For bits :
555 * - 14 : Stencil Present to Render Target
556 * - 13 : Source Depth Present to Render Target
557 * - 12 : oMask to Render Target
558 * - 11 : Source0 Alpha Present to Render Target
559 */
560 ubld.group(1, 0).AND(component(header, 0),
561 component(header, 0),
562 brw_imm_ud(~INTEL_MASK(14, 11)));
563
564 inst->resize_sources(4);
565 inst->opcode = SHADER_OPCODE_SEND;
   inst->src[0] = brw_imm_ud(0); /* desc */
   inst->src[1] = brw_imm_ud(0); /* ex_desc */
568 inst->src[2] = header;
569 inst->src[3] = brw_reg();
570 inst->mlen = length;
571 inst->header_size = length;
572 inst->sfid = GFX6_SFID_DATAPORT_RENDER_CACHE;
573 inst->check_tdr = true;
574 inst->desc =
575 (inst->group / 16) << 11 | /* rt slot group */
576 brw_fb_read_desc(devinfo, inst->target,
577 0 /* msg_control */, inst->exec_size,
578 wm_prog_data->persample_dispatch);
579 }
580
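/* The sampler index field in the message descriptor is only 4 bits, so any
 * sampler that is (or might be) >= 16 needs the Sampler State Pointer in the
 * message header to be offset instead.
 */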
581 static bool
is_high_sampler(const struct intel_device_info *devinfo, const brw_reg &sampler)
583 {
584 return sampler.file != IMM || sampler.ud >= 16;
585 }
586
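/**
 * Map a logical texturing opcode onto the hardware sampler message type,
 * taking shadow comparison, a known-zero LOD and min-LOD presence into
 * account.
 */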
587 static unsigned
sampler_msg_type(const intel_device_info *devinfo,
                 opcode opcode, bool shadow_compare,
                 bool lod_is_zero, bool has_min_lod)
591 {
592 switch (opcode) {
593 case SHADER_OPCODE_TEX_LOGICAL:
594 if (devinfo->ver >= 20 && has_min_lod) {
595 return shadow_compare ? XE2_SAMPLER_MESSAGE_SAMPLE_COMPARE_MLOD :
596 XE2_SAMPLER_MESSAGE_SAMPLE_MLOD;
597 } else {
598 return shadow_compare ? GFX5_SAMPLER_MESSAGE_SAMPLE_COMPARE :
599 GFX5_SAMPLER_MESSAGE_SAMPLE;
600 }
601 case FS_OPCODE_TXB_LOGICAL:
602 return shadow_compare ? GFX5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE :
603 GFX5_SAMPLER_MESSAGE_SAMPLE_BIAS;
604 case SHADER_OPCODE_TXL_LOGICAL:
605 assert(!has_min_lod);
606 if (lod_is_zero) {
607 return shadow_compare ? GFX9_SAMPLER_MESSAGE_SAMPLE_C_LZ :
608 GFX9_SAMPLER_MESSAGE_SAMPLE_LZ;
609 }
610 return shadow_compare ? GFX5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE :
611 GFX5_SAMPLER_MESSAGE_SAMPLE_LOD;
612 case SHADER_OPCODE_TXS_LOGICAL:
613 case SHADER_OPCODE_IMAGE_SIZE_LOGICAL:
614 assert(!has_min_lod);
615 return GFX5_SAMPLER_MESSAGE_SAMPLE_RESINFO;
616 case SHADER_OPCODE_TXD_LOGICAL:
617 return shadow_compare ? HSW_SAMPLER_MESSAGE_SAMPLE_DERIV_COMPARE :
618 GFX5_SAMPLER_MESSAGE_SAMPLE_DERIVS;
619 case SHADER_OPCODE_TXF_LOGICAL:
620 assert(!has_min_lod);
621 return lod_is_zero ? GFX9_SAMPLER_MESSAGE_SAMPLE_LD_LZ :
622 GFX5_SAMPLER_MESSAGE_SAMPLE_LD;
623 case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
624 case SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL:
625 assert(!has_min_lod);
626 return GFX9_SAMPLER_MESSAGE_SAMPLE_LD2DMS_W;
627 case SHADER_OPCODE_TXF_MCS_LOGICAL:
628 assert(!has_min_lod);
629 return GFX7_SAMPLER_MESSAGE_SAMPLE_LD_MCS;
630 case SHADER_OPCODE_LOD_LOGICAL:
631 assert(!has_min_lod);
632 return GFX5_SAMPLER_MESSAGE_LOD;
633 case SHADER_OPCODE_TG4_LOGICAL:
634 assert(!has_min_lod);
635 return shadow_compare ? GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4_C :
636 GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4;
637 break;
638 case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
639 assert(!has_min_lod);
640 return shadow_compare ? GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_C :
641 GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO;
642 case SHADER_OPCODE_TG4_OFFSET_LOD_LOGICAL:
643 assert(!has_min_lod);
644 assert(devinfo->ver >= 20);
645 return shadow_compare ? XE2_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_L_C:
646 XE2_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_L;
647 case SHADER_OPCODE_TG4_OFFSET_BIAS_LOGICAL:
648 assert(!has_min_lod);
649 assert(devinfo->ver >= 20);
650 return XE2_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_B;
651 case SHADER_OPCODE_TG4_BIAS_LOGICAL:
652 assert(!has_min_lod);
653 assert(devinfo->ver >= 20);
654 return XE2_SAMPLER_MESSAGE_SAMPLE_GATHER4_B;
655 case SHADER_OPCODE_TG4_EXPLICIT_LOD_LOGICAL:
656 assert(!has_min_lod);
657 assert(devinfo->ver >= 20);
658 return shadow_compare ? XE2_SAMPLER_MESSAGE_SAMPLE_GATHER4_L_C :
659 XE2_SAMPLER_MESSAGE_SAMPLE_GATHER4_L;
660 case SHADER_OPCODE_TG4_IMPLICIT_LOD_LOGICAL:
661 assert(!has_min_lod);
662 assert(devinfo->ver >= 20);
663 return shadow_compare ? XE2_SAMPLER_MESSAGE_SAMPLE_GATHER4_I_C :
664 XE2_SAMPLER_MESSAGE_SAMPLE_GATHER4_I;
665 case SHADER_OPCODE_SAMPLEINFO_LOGICAL:
666 assert(!has_min_lod);
667 return GFX6_SAMPLER_MESSAGE_SAMPLE_SAMPLEINFO;
668 default:
669 unreachable("not reached");
670 }
671 }
672
673 /**
674 * Emit a LOAD_PAYLOAD instruction while ensuring the sources are aligned to
675 * the given requested_alignment_sz.
676 */
677 static fs_inst *
emit_load_payload_with_padding(const fs_builder &bld, const brw_reg &dst,
                               const brw_reg *src, unsigned sources,
                               unsigned header_size,
                               unsigned requested_alignment_sz)
682 {
683 unsigned length = 0;
684 unsigned num_srcs =
685 sources * DIV_ROUND_UP(requested_alignment_sz, bld.dispatch_width());
686 brw_reg *src_comps = new brw_reg[num_srcs];
687
688 for (unsigned i = 0; i < header_size; i++)
689 src_comps[length++] = src[i];
690
691 for (unsigned i = header_size; i < sources; i++) {
692 unsigned src_sz =
693 retype(dst, src[i].type).component_size(bld.dispatch_width());
694 const enum brw_reg_type padding_payload_type =
695 brw_type_with_size(BRW_TYPE_UD, brw_type_size_bits(src[i].type));
696
697 src_comps[length++] = src[i];
698
699 /* Expand the real sources if component of requested payload type is
700 * larger than real source component.
701 */
702 if (src_sz < requested_alignment_sz) {
703 for (unsigned j = 0; j < (requested_alignment_sz / src_sz) - 1; j++) {
704 src_comps[length++] = retype(brw_reg(), padding_payload_type);
705 }
706 }
707 }
708
709 fs_inst *inst = bld.LOAD_PAYLOAD(dst, src_comps, length, header_size);
710 delete[] src_comps;
711
712 return inst;
713 }
714
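/* Texturing opcodes that always need a message header, e.g. the gather4
 * variants place their channel select in the header for interaction with
 * ARB_texture_swizzle.
 */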
715 static bool
shader_opcode_needs_header(opcode op)
717 {
718 switch (op) {
719 case SHADER_OPCODE_TG4_LOGICAL:
720 case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
721 case SHADER_OPCODE_TG4_OFFSET_BIAS_LOGICAL:
722 case SHADER_OPCODE_TG4_OFFSET_LOD_LOGICAL:
723 case SHADER_OPCODE_TG4_BIAS_LOGICAL:
724 case SHADER_OPCODE_TG4_EXPLICIT_LOD_LOGICAL:
725 case SHADER_OPCODE_TG4_IMPLICIT_LOD_LOGICAL:
726 case SHADER_OPCODE_SAMPLEINFO_LOGICAL:
727 return true;
728 default:
729 break;
730 }
731
732 return false;
733 }
734
735 static void
lower_sampler_logical_send(const fs_builder &bld, fs_inst *inst,
                           const brw_reg &coordinate,
                           const brw_reg &shadow_c,
                           brw_reg lod, const brw_reg &lod2,
                           const brw_reg &min_lod,
                           const brw_reg &sample_index,
                           const brw_reg &mcs,
                           const brw_reg &surface,
                           const brw_reg &sampler,
                           const brw_reg &surface_handle,
                           const brw_reg &sampler_handle,
                           const brw_reg &tg4_offset,
                           unsigned payload_type_bit_size,
                           unsigned coord_components,
                           unsigned grad_components,
                           bool residency)
752 {
753 const brw_compiler *compiler = bld.shader->compiler;
754 const intel_device_info *devinfo = bld.shader->devinfo;
755 const enum brw_reg_type payload_type =
756 brw_type_with_size(BRW_TYPE_F, payload_type_bit_size);
757 const enum brw_reg_type payload_unsigned_type =
758 brw_type_with_size(BRW_TYPE_UD, payload_type_bit_size);
759 const enum brw_reg_type payload_signed_type =
760 brw_type_with_size(BRW_TYPE_D, payload_type_bit_size);
761 unsigned reg_width = bld.dispatch_width() / 8;
762 unsigned header_size = 0, length = 0;
763 opcode op = inst->opcode;
764 brw_reg sources[1 + MAX_SAMPLER_MESSAGE_SIZE];
765 for (unsigned i = 0; i < ARRAY_SIZE(sources); i++)
766 sources[i] = bld.vgrf(payload_type);
767
768 /* We must have exactly one of surface/sampler and surface/sampler_handle */
769 assert((surface.file == BAD_FILE) != (surface_handle.file == BAD_FILE));
770 assert((sampler.file == BAD_FILE) != (sampler_handle.file == BAD_FILE));
771
772 if (shader_opcode_needs_header(op) || inst->offset != 0 || inst->eot ||
773 sampler_handle.file != BAD_FILE ||
774 is_high_sampler(devinfo, sampler) ||
775 residency) {
776 /* For general texture offsets (no txf workaround), we need a header to
777 * put them in.
778 *
779 * TG4 needs to place its channel select in the header, for interaction
780 * with ARB_texture_swizzle. The sampler index is only 4-bits, so for
781 * larger sampler numbers we need to offset the Sampler State Pointer in
782 * the header.
783 */
784 brw_reg header = retype(sources[0], BRW_TYPE_UD);
785 for (header_size = 0; header_size < reg_unit(devinfo); header_size++)
786 sources[length++] = byte_offset(header, REG_SIZE * header_size);
787
788 /* If we're requesting fewer than four channels worth of response,
789 * and we have an explicit header, we need to set up the sampler
790 * writemask. It's reversed from normal: 1 means "don't write".
791 */
792 unsigned reg_count = regs_written(inst) - reg_unit(devinfo) * residency;
793 if (!inst->eot && reg_count < 4 * reg_width) {
794 assert(reg_count % reg_width == 0);
795 unsigned mask = ~((1 << (reg_count / reg_width)) - 1) & 0xf;
796 inst->offset |= mask << 12;
797 }
798
799 if (residency)
800 inst->offset |= 1 << 23; /* g0.2 bit23 : Pixel Null Mask Enable */
801
802 /* Build the actual header */
803 const fs_builder ubld = bld.exec_all().group(8 * reg_unit(devinfo), 0);
804 const fs_builder ubld1 = ubld.group(1, 0);
805 if (devinfo->ver >= 11)
806 ubld.MOV(header, brw_imm_ud(0));
807 else
808 ubld.MOV(header, retype(brw_vec8_grf(0, 0), BRW_TYPE_UD));
809 if (inst->offset) {
810 ubld1.MOV(component(header, 2), brw_imm_ud(inst->offset));
811 } else if (devinfo->ver < 11 &&
812 bld.shader->stage != MESA_SHADER_VERTEX &&
813 bld.shader->stage != MESA_SHADER_FRAGMENT) {
814 /* The vertex and fragment stages have g0.2 set to 0, so
815 * header0.2 is 0 when g0 is copied. Other stages may not, so we
816 * must set it to 0 to avoid setting undesirable bits in the
817 * message.
818 */
819 ubld1.MOV(component(header, 2), brw_imm_ud(0));
820 }
821
822 if (sampler_handle.file != BAD_FILE) {
823 /* Bindless sampler handles aren't relative to the sampler state
824 * pointer passed into the shader through SAMPLER_STATE_POINTERS_*.
825 * Instead, it's an absolute pointer relative to dynamic state base
826 * address.
827 *
828 * Sampler states are 16 bytes each and the pointer we give here has
829 * to be 32-byte aligned. In order to avoid more indirect messages
830 * than required, we assume that all bindless sampler states are
831 * 32-byte aligned. This sacrifices a bit of general state base
832 * address space but means we can do something more efficient in the
833 * shader.
834 */
835 if (compiler->use_bindless_sampler_offset) {
836 assert(devinfo->ver >= 11);
837 ubld1.OR(component(header, 3), sampler_handle, brw_imm_ud(1));
838 } else {
839 ubld1.MOV(component(header, 3), sampler_handle);
840 }
841 } else if (is_high_sampler(devinfo, sampler)) {
842 brw_reg sampler_state_ptr =
843 retype(brw_vec1_grf(0, 3), BRW_TYPE_UD);
844
845 /* Gfx11+ sampler message headers include bits in 4:0 which conflict
846 * with the ones included in g0.3 bits 4:0. Mask them out.
847 */
848 if (devinfo->ver >= 11) {
849 sampler_state_ptr = ubld1.vgrf(BRW_TYPE_UD);
850 ubld1.AND(sampler_state_ptr,
851 retype(brw_vec1_grf(0, 3), BRW_TYPE_UD),
852 brw_imm_ud(INTEL_MASK(31, 5)));
853 }
854
855 if (sampler.file == IMM) {
856 assert(sampler.ud >= 16);
857 const int sampler_state_size = 16; /* 16 bytes */
858
859 ubld1.ADD(component(header, 3), sampler_state_ptr,
860 brw_imm_ud(16 * (sampler.ud / 16) * sampler_state_size));
861 } else {
862 brw_reg tmp = ubld1.vgrf(BRW_TYPE_UD);
863 ubld1.AND(tmp, sampler, brw_imm_ud(0x0f0));
864 ubld1.SHL(tmp, tmp, brw_imm_ud(4));
865 ubld1.ADD(component(header, 3), sampler_state_ptr, tmp);
866 }
867 } else if (devinfo->ver >= 11) {
868 /* Gfx11+ sampler message headers include bits in 4:0 which conflict
869 * with the ones included in g0.3 bits 4:0. Mask them out.
870 */
871 ubld1.AND(component(header, 3),
872 retype(brw_vec1_grf(0, 3), BRW_TYPE_UD),
873 brw_imm_ud(INTEL_MASK(31, 5)));
874 }
875 }
876
877 const bool lod_is_zero = lod.is_zero();
878
879 /* On Xe2 and newer platforms, min_lod is the first parameter specifically
880 * so that a bunch of other, possibly unused, parameters don't need to also
881 * be included.
882 */
883 const unsigned msg_type =
884 sampler_msg_type(devinfo, op, inst->shadow_compare, lod_is_zero,
885 min_lod.file != BAD_FILE);
886
887 const bool min_lod_is_first = devinfo->ver >= 20 &&
888 (msg_type == XE2_SAMPLER_MESSAGE_SAMPLE_MLOD ||
889 msg_type == XE2_SAMPLER_MESSAGE_SAMPLE_COMPARE_MLOD);
890
891 if (min_lod_is_first) {
892 assert(min_lod.file != BAD_FILE);
893 bld.MOV(sources[length++], min_lod);
894 }
895
896 if (shadow_c.file != BAD_FILE) {
897 bld.MOV(sources[length], shadow_c);
898 length++;
899 }
900
901 bool coordinate_done = false;
902
903 /* Set up the LOD info */
904 switch (op) {
905 case SHADER_OPCODE_TXL_LOGICAL:
906 if (lod_is_zero)
907 break;
908 FALLTHROUGH;
909 case FS_OPCODE_TXB_LOGICAL:
910 case SHADER_OPCODE_TG4_BIAS_LOGICAL:
911 case SHADER_OPCODE_TG4_EXPLICIT_LOD_LOGICAL:
912 case SHADER_OPCODE_TG4_OFFSET_LOD_LOGICAL:
913 case SHADER_OPCODE_TG4_OFFSET_BIAS_LOGICAL:
914 bld.MOV(sources[length], lod);
915 length++;
916 break;
917 case SHADER_OPCODE_TXD_LOGICAL:
918 /* TXD should have been lowered in SIMD16 mode (in SIMD32 mode in
919 * Xe2+).
920 */
921 assert(bld.dispatch_width() == (8 * reg_unit(devinfo)));
922
923 /* Load dPdx and the coordinate together:
924 * [hdr], [ref], x, dPdx.x, dPdy.x, y, dPdx.y, dPdy.y, z, dPdx.z, dPdy.z
925 */
926 for (unsigned i = 0; i < coord_components; i++) {
927 bld.MOV(sources[length++], offset(coordinate, bld, i));
928
929 /* For cube map array, the coordinate is (u,v,r,ai) but there are
930 * only derivatives for (u, v, r).
931 */
932 if (i < grad_components) {
933 bld.MOV(sources[length++], offset(lod, bld, i));
934 bld.MOV(sources[length++], offset(lod2, bld, i));
935 }
936 }
937
938 coordinate_done = true;
939 break;
940 case SHADER_OPCODE_TXS_LOGICAL:
941 sources[length] = retype(sources[length], payload_unsigned_type);
942 bld.MOV(sources[length++], lod);
943 break;
944 case SHADER_OPCODE_IMAGE_SIZE_LOGICAL:
945 /* We need an LOD; just use 0 */
946 sources[length] = retype(sources[length], payload_unsigned_type);
947 bld.MOV(sources[length++], brw_imm_ud(0));
948 break;
949 case SHADER_OPCODE_TXF_LOGICAL:
      /* On Gfx9 the parameters are intermixed: they are u, v, lod, r. */
951 sources[length] = retype(sources[length], payload_signed_type);
952 bld.MOV(sources[length++], coordinate);
953
954 if (coord_components >= 2) {
955 sources[length] = retype(sources[length], payload_signed_type);
956 bld.MOV(sources[length], offset(coordinate, bld, 1));
957 } else {
958 sources[length] = brw_imm_d(0);
959 }
960 length++;
961
962 if (!lod_is_zero) {
963 sources[length] = retype(sources[length], payload_signed_type);
964 bld.MOV(sources[length++], lod);
965 }
966
967 for (unsigned i = 2; i < coord_components; i++) {
968 sources[length] = retype(sources[length], payload_signed_type);
969 bld.MOV(sources[length++], offset(coordinate, bld, i));
970 }
971
972 coordinate_done = true;
973 break;
974
975 case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
976 case SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL:
977 sources[length] = retype(sources[length], payload_unsigned_type);
978 bld.MOV(sources[length++], sample_index);
979
980 /* Data from the multisample control surface. */
981 for (unsigned i = 0; i < 2; ++i) {
         /* The sampler always writes 4/8 registers worth of data, but for
          * ld_mcs only the first two registers contain valid data.  So with
          * a 16-bit payload we need to split the two 32-bit registers into
          * four 16-bit payload components.
          *
          * From the Gfx12HP BSpec: Render Engine - 3D and GPGPU Programs -
          * Shared Functions - 3D Sampler - Messages - Message Format:
          *
          *    ld2dms_w   si  mcs0  mcs1  mcs2  mcs3  u  v  r
          */
992 if (op == SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL) {
993 brw_reg tmp = offset(mcs, bld, i);
994 sources[length] = retype(sources[length], payload_unsigned_type);
995 bld.MOV(sources[length++],
996 mcs.file == IMM ? mcs :
997 brw_reg(subscript(tmp, payload_unsigned_type, 0)));
998
999 sources[length] = retype(sources[length], payload_unsigned_type);
1000 bld.MOV(sources[length++],
1001 mcs.file == IMM ? mcs :
1002 brw_reg(subscript(tmp, payload_unsigned_type, 1)));
1003 } else {
1004 sources[length] = retype(sources[length], payload_unsigned_type);
1005 bld.MOV(sources[length++],
1006 mcs.file == IMM ? mcs : offset(mcs, bld, i));
1007 }
1008 }
1009 FALLTHROUGH;
1010
1011 case SHADER_OPCODE_TXF_MCS_LOGICAL:
1012 /* There is no offsetting for this message; just copy in the integer
1013 * texture coordinates.
1014 */
1015 for (unsigned i = 0; i < coord_components; i++) {
1016 sources[length] = retype(sources[length], payload_signed_type);
1017 bld.MOV(sources[length++], offset(coordinate, bld, i));
1018 }
1019
1020 coordinate_done = true;
1021 break;
1022 case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
1023 /* More crazy intermixing */
1024 for (unsigned i = 0; i < 2; i++) /* u, v */
1025 bld.MOV(sources[length++], offset(coordinate, bld, i));
1026
1027 for (unsigned i = 0; i < 2; i++) { /* offu, offv */
1028 sources[length] = retype(sources[length], payload_signed_type);
1029 bld.MOV(sources[length++], offset(tg4_offset, bld, i));
1030 }
1031
1032 if (coord_components == 3) /* r if present */
1033 bld.MOV(sources[length++], offset(coordinate, bld, 2));
1034
1035 coordinate_done = true;
1036 break;
1037 default:
1038 break;
1039 }
1040
1041 /* Set up the coordinate (except for cases where it was done above) */
1042 if (!coordinate_done) {
1043 for (unsigned i = 0; i < coord_components; i++)
1044 bld.MOV(retype(sources[length++], payload_type),
1045 offset(coordinate, bld, i));
1046 }
1047
1048 if (min_lod.file != BAD_FILE && !min_lod_is_first) {
1049 /* Account for all of the missing coordinate sources */
1050 if (op == FS_OPCODE_TXB_LOGICAL && devinfo->ver >= 20) {
1051 /* Bspec 64985:
1052 *
1053 * For sample_b sampler message format:
1054 *
1055 * SIMD16H/SIMD32H
1056 * Param Number 0 1 2 3 4 5
1057 * Param BIAS U V R Ai MLOD
1058 *
1059 * SIMD16/SIMD32
1060 * Param Number 0 1 2 3 4
1061 * Param BIAS_AI U V R MLOD
1062 */
1063 length += 3 - coord_components;
1064 } else if (op == SHADER_OPCODE_TXD_LOGICAL && devinfo->verx10 >= 125) {
1065 /* On DG2 and newer platforms, sample_d can only be used with 1D and
1066 * 2D surfaces, so the maximum number of gradient components is 2.
1067 * In spite of this limitation, the Bspec lists a mysterious R
1068 * component before the min_lod, so the maximum coordinate components
1069 * is 3.
1070 *
1071 * See bspec 45942, "Enable new message layout for cube array"
1072 */
1073 length += 3 - coord_components;
1074 length += (2 - grad_components) * 2;
1075 } else {
1076 length += 4 - coord_components;
1077 if (op == SHADER_OPCODE_TXD_LOGICAL)
1078 length += (3 - grad_components) * 2;
1079 }
1080
1081 bld.MOV(sources[length++], min_lod);
1082
1083 /* Wa_14014595444: Populate MLOD as parameter 5 (twice). */
1084 if (devinfo->verx10 == 125 && op == FS_OPCODE_TXB_LOGICAL &&
1085 !inst->shadow_compare)
1086 bld.MOV(sources[length++], min_lod);
1087 }
1088
1089 const brw_reg src_payload =
1090 brw_vgrf(bld.shader->alloc.allocate(length * reg_width),
1091 BRW_TYPE_F);
1092 /* In case of 16-bit payload each component takes one full register in
1093 * both SIMD8H and SIMD16H modes. In both cases one reg can hold 16
1094 * elements. In SIMD8H case hardware simply expects the components to be
1095 * padded (i.e., aligned on reg boundary).
1096 */
1097 fs_inst *load_payload_inst =
1098 emit_load_payload_with_padding(bld, src_payload, sources, length,
1099 header_size, REG_SIZE * reg_unit(devinfo));
1100 unsigned mlen = load_payload_inst->size_written / REG_SIZE;
1101 unsigned simd_mode = 0;
1102 if (devinfo->ver < 20) {
1103 if (payload_type_bit_size == 16) {
1104 assert(devinfo->ver >= 11);
1105 simd_mode = inst->exec_size <= 8 ? GFX10_SAMPLER_SIMD_MODE_SIMD8H :
1106 GFX10_SAMPLER_SIMD_MODE_SIMD16H;
1107 } else {
1108 simd_mode = inst->exec_size <= 8 ? BRW_SAMPLER_SIMD_MODE_SIMD8 :
1109 BRW_SAMPLER_SIMD_MODE_SIMD16;
1110 }
1111 } else {
1112 if (payload_type_bit_size == 16) {
1113 simd_mode = inst->exec_size <= 16 ? XE2_SAMPLER_SIMD_MODE_SIMD16H :
1114 XE2_SAMPLER_SIMD_MODE_SIMD32H;
1115 } else {
1116 simd_mode = inst->exec_size <= 16 ? XE2_SAMPLER_SIMD_MODE_SIMD16 :
1117 XE2_SAMPLER_SIMD_MODE_SIMD32;
1118 }
1119 }
1120
1121 /* Generate the SEND. */
1122 inst->opcode = SHADER_OPCODE_SEND;
1123 inst->mlen = mlen;
1124 inst->header_size = header_size;
1125 inst->sfid = BRW_SFID_SAMPLER;
1126 uint sampler_ret_type = brw_type_size_bits(inst->dst.type) == 16
1127 ? GFX8_SAMPLER_RETURN_FORMAT_16BITS
1128 : GFX8_SAMPLER_RETURN_FORMAT_32BITS;
1129 if (surface.file == IMM &&
1130 (sampler.file == IMM || sampler_handle.file != BAD_FILE)) {
1131 inst->desc = brw_sampler_desc(devinfo, surface.ud,
1132 sampler.file == IMM ? sampler.ud % 16 : 0,
1133 msg_type,
1134 simd_mode,
1135 sampler_ret_type);
1136 inst->src[0] = brw_imm_ud(0);
1137 inst->src[1] = brw_imm_ud(0);
1138 } else if (surface_handle.file != BAD_FILE) {
1139 /* Bindless surface */
1140 inst->desc = brw_sampler_desc(devinfo,
1141 GFX9_BTI_BINDLESS,
1142 sampler.file == IMM ? sampler.ud % 16 : 0,
1143 msg_type,
1144 simd_mode,
1145 sampler_ret_type);
1146
1147 /* For bindless samplers, the entire address is included in the message
1148 * header so we can leave the portion in the message descriptor 0.
1149 */
1150 if (sampler_handle.file != BAD_FILE || sampler.file == IMM) {
1151 inst->src[0] = brw_imm_ud(0);
1152 } else {
1153 const fs_builder ubld = bld.group(1, 0).exec_all();
1154 brw_reg desc = ubld.vgrf(BRW_TYPE_UD);
1155 ubld.SHL(desc, sampler, brw_imm_ud(8));
1156 inst->src[0] = component(desc, 0);
1157 }
1158
1159 /* We assume that the driver provided the handle in the top 20 bits so
1160 * we can use the surface handle directly as the extended descriptor.
1161 */
1162 inst->src[1] = retype(surface_handle, BRW_TYPE_UD);
1163 inst->send_ex_bso = compiler->extended_bindless_surface_offset;
1164 } else {
1165 /* Immediate portion of the descriptor */
1166 inst->desc = brw_sampler_desc(devinfo,
1167 0, /* surface */
1168 0, /* sampler */
1169 msg_type,
1170 simd_mode,
1171 sampler_ret_type);
1172 const fs_builder ubld = bld.group(1, 0).exec_all();
1173 brw_reg desc = ubld.vgrf(BRW_TYPE_UD);
1174 if (surface.equals(sampler)) {
1175 /* This case is common in GL */
1176 ubld.MUL(desc, surface, brw_imm_ud(0x101));
1177 } else {
1178 if (sampler_handle.file != BAD_FILE) {
1179 ubld.MOV(desc, surface);
1180 } else if (sampler.file == IMM) {
1181 ubld.OR(desc, surface, brw_imm_ud(sampler.ud << 8));
1182 } else {
1183 ubld.SHL(desc, sampler, brw_imm_ud(8));
1184 ubld.OR(desc, desc, surface);
1185 }
1186 }
1187 ubld.AND(desc, desc, brw_imm_ud(0xfff));
1188
1189 inst->src[0] = component(desc, 0);
1190 inst->src[1] = brw_imm_ud(0); /* ex_desc */
1191 }
1192
1193 inst->ex_desc = 0;
1194
1195 inst->src[2] = src_payload;
1196 inst->resize_sources(3);
1197
1198 if (inst->eot) {
1199 /* EOT sampler messages don't make sense to split because it would
1200 * involve ending half of the thread early.
1201 */
1202 assert(inst->group == 0);
1203 /* We need to use SENDC for EOT sampler messages */
1204 inst->check_tdr = true;
1205 inst->send_has_side_effects = true;
1206 }
1207
1208 /* Message length > MAX_SAMPLER_MESSAGE_SIZE disallowed by hardware. */
1209 assert(inst->mlen <= MAX_SAMPLER_MESSAGE_SIZE * reg_unit(devinfo));
1210 }
1211
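/**
 * Determine whether the sampler message payload should be built out of
 * 16-bit or 32-bit parameters, based on the source types and the platform's
 * message format restrictions.
 */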
1212 static unsigned
get_sampler_msg_payload_type_bit_size(const intel_device_info *devinfo,
                                      const fs_inst *inst)
1215 {
1216 assert(inst);
1217 const brw_reg *src = inst->src;
1218 unsigned src_type_size = 0;
1219
1220 /* All sources need to have the same size, therefore seek the first valid
1221 * and take the size from there.
1222 */
1223 for (unsigned i = 0; i < TEX_LOGICAL_NUM_SRCS; i++) {
1224 if (src[i].file != BAD_FILE) {
1225 src_type_size = brw_type_size_bytes(src[i].type);
1226 break;
1227 }
1228 }
1229
1230 assert(src_type_size == 2 || src_type_size == 4);
1231
1232 #ifndef NDEBUG
1233 /* Make sure all sources agree. On gfx12 this doesn't hold when sampling
1234 * compressed multisampled surfaces. There the payload contains MCS data
1235 * which is already in 16-bits unlike the other parameters that need forced
1236 * conversion.
1237 */
1238 if (inst->opcode != SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL) {
1239 for (unsigned i = 0; i < TEX_LOGICAL_NUM_SRCS; i++) {
1240 assert(src[i].file == BAD_FILE ||
1241 brw_type_size_bytes(src[i].type) == src_type_size);
1242 }
1243 }
1244 #endif
1245
1246 if (devinfo->verx10 < 125)
1247 return src_type_size * 8;
1248
1249 /* Force conversion from 32-bit sources to 16-bit payload. From the XeHP Bspec:
1250 * 3D and GPGPU Programs - Shared Functions - 3D Sampler - Messages - Message
1251 * Format [GFX12:HAS:1209977870] *
1252 *
1253 * ld2dms_w SIMD8H and SIMD16H Only
1254 * ld_mcs SIMD8H and SIMD16H Only
1255 * ld2dms REMOVEDBY(GEN:HAS:1406788836)
1256 */
1257 if (inst->opcode == SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL ||
1258 inst->opcode == SHADER_OPCODE_TXF_MCS_LOGICAL)
1259 src_type_size = 2;
1260
1261 return src_type_size * 8;
1262 }
1263
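/**
 * Gather the texturing sources of the logical instruction and forward them
 * to the main sampler lowering routine above.
 */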
1264 static void
lower_sampler_logical_send(const fs_builder &bld, fs_inst *inst)
1266 {
1267 const intel_device_info *devinfo = bld.shader->devinfo;
1268 const brw_reg coordinate = inst->src[TEX_LOGICAL_SRC_COORDINATE];
1269 const brw_reg shadow_c = inst->src[TEX_LOGICAL_SRC_SHADOW_C];
1270 const brw_reg lod = inst->src[TEX_LOGICAL_SRC_LOD];
1271 const brw_reg lod2 = inst->src[TEX_LOGICAL_SRC_LOD2];
1272 const brw_reg min_lod = inst->src[TEX_LOGICAL_SRC_MIN_LOD];
1273 const brw_reg sample_index = inst->src[TEX_LOGICAL_SRC_SAMPLE_INDEX];
1274 const brw_reg mcs = inst->src[TEX_LOGICAL_SRC_MCS];
1275 const brw_reg surface = inst->src[TEX_LOGICAL_SRC_SURFACE];
1276 const brw_reg sampler = inst->src[TEX_LOGICAL_SRC_SAMPLER];
1277 const brw_reg surface_handle = inst->src[TEX_LOGICAL_SRC_SURFACE_HANDLE];
1278 const brw_reg sampler_handle = inst->src[TEX_LOGICAL_SRC_SAMPLER_HANDLE];
1279 const brw_reg tg4_offset = inst->src[TEX_LOGICAL_SRC_TG4_OFFSET];
1280 assert(inst->src[TEX_LOGICAL_SRC_COORD_COMPONENTS].file == IMM);
1281 const unsigned coord_components = inst->src[TEX_LOGICAL_SRC_COORD_COMPONENTS].ud;
1282 assert(inst->src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].file == IMM);
1283 const unsigned grad_components = inst->src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].ud;
1284 assert(inst->src[TEX_LOGICAL_SRC_RESIDENCY].file == IMM);
1285 const bool residency = inst->src[TEX_LOGICAL_SRC_RESIDENCY].ud != 0;
1286
1287 const unsigned msg_payload_type_bit_size =
1288 get_sampler_msg_payload_type_bit_size(devinfo, inst);
1289
1290 /* 16-bit payloads are available only on gfx11+ */
1291 assert(msg_payload_type_bit_size != 16 || devinfo->ver >= 11);
1292
1293 lower_sampler_logical_send(bld, inst, coordinate,
1294 shadow_c, lod, lod2, min_lod,
1295 sample_index,
1296 mcs, surface, sampler,
1297 surface_handle, sampler_handle,
1298 tg4_offset,
1299 msg_payload_type_bit_size,
1300 coord_components, grad_components,
1301 residency);
1302 }
1303
1304 /**
1305 * Predicate the specified instruction on the vector mask.
1306 */
1307 static void
emit_predicate_on_vector_mask(const fs_builder &bld, fs_inst *inst)
1309 {
1310 assert(bld.shader->stage == MESA_SHADER_FRAGMENT &&
1311 bld.group() == inst->group &&
1312 bld.dispatch_width() == inst->exec_size);
1313
1314 const fs_builder ubld = bld.exec_all().group(1, 0);
1315
1316 const fs_visitor &s = *bld.shader;
1317 const brw_reg vector_mask = ubld.vgrf(BRW_TYPE_UW);
1318 ubld.UNDEF(vector_mask);
1319 ubld.emit(SHADER_OPCODE_READ_ARCH_REG, vector_mask, retype(brw_sr0_reg(3),
1320 BRW_TYPE_UD));
1321 const unsigned subreg = sample_mask_flag_subreg(s);
1322
1323 ubld.MOV(brw_flag_subreg(subreg + inst->group / 16), vector_mask);
1324
1325 if (inst->predicate) {
1326 assert(inst->predicate == BRW_PREDICATE_NORMAL);
1327 assert(!inst->predicate_inverse);
1328 assert(inst->flag_subreg == 0);
1329 assert(s.devinfo->ver < 20);
1330 /* Combine the vector mask with the existing predicate by using a
1331 * vertical predication mode.
1332 */
1333 inst->predicate = BRW_PREDICATE_ALIGN1_ALLV;
1334 } else {
1335 inst->flag_subreg = subreg;
1336 inst->predicate = BRW_PREDICATE_NORMAL;
1337 inst->predicate_inverse = false;
1338 }
1339 }
1340
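/**
 * Fill out the surface-related parts of a SEND: the binding table index or
 * bindless surface handle ends up in the descriptor (src[0]) and extended
 * descriptor (src[1]).
 */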
1341 static void
setup_surface_descriptors(const fs_builder &bld, fs_inst *inst, uint32_t desc,
                          const brw_reg &surface, const brw_reg &surface_handle)
1344 {
1345 const brw_compiler *compiler = bld.shader->compiler;
1346
1347 /* We must have exactly one of surface and surface_handle */
1348 assert((surface.file == BAD_FILE) != (surface_handle.file == BAD_FILE));
1349
1350 if (surface.file == IMM) {
1351 inst->desc = desc | (surface.ud & 0xff);
1352 inst->src[0] = brw_imm_ud(0);
1353 inst->src[1] = brw_imm_ud(0); /* ex_desc */
1354 } else if (surface_handle.file != BAD_FILE) {
1355 /* Bindless surface */
1356 inst->desc = desc | GFX9_BTI_BINDLESS;
1357 inst->src[0] = brw_imm_ud(0);
1358
1359 /* We assume that the driver provided the handle in the top 20 bits so
1360 * we can use the surface handle directly as the extended descriptor.
1361 */
1362 inst->src[1] = retype(surface_handle, BRW_TYPE_UD);
1363 inst->send_ex_bso = compiler->extended_bindless_surface_offset;
1364 } else {
1365 inst->desc = desc;
1366 const fs_builder ubld = bld.exec_all().group(1, 0);
1367 brw_reg tmp = ubld.vgrf(BRW_TYPE_UD);
1368 ubld.AND(tmp, surface, brw_imm_ud(0xff));
1369 inst->src[0] = component(tmp, 0);
1370 inst->src[1] = brw_imm_ud(0); /* ex_desc */
1371 }
1372 }
1373
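/**
 * LSC counterpart of setup_surface_descriptors(): encode the surface state,
 * BTI or flat address type into src[0]/src[1] according to the LSC
 * descriptor layout.
 */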
1374 static void
setup_lsc_surface_descriptors(const fs_builder &bld, fs_inst *inst,
                              uint32_t desc, const brw_reg &surface)
1377 {
1378 const ASSERTED intel_device_info *devinfo = bld.shader->devinfo;
1379 const brw_compiler *compiler = bld.shader->compiler;
1380
1381 inst->src[0] = brw_imm_ud(0); /* desc */
1382
1383 enum lsc_addr_surface_type surf_type = lsc_msg_desc_addr_type(devinfo, desc);
1384 switch (surf_type) {
1385 case LSC_ADDR_SURFTYPE_BSS:
1386 inst->send_ex_bso = compiler->extended_bindless_surface_offset;
1387 /* fall-through */
1388 case LSC_ADDR_SURFTYPE_SS:
1389 assert(surface.file != BAD_FILE);
1390 /* We assume that the driver provided the handle in the top 20 bits so
1391 * we can use the surface handle directly as the extended descriptor.
1392 */
1393 inst->src[1] = retype(surface, BRW_TYPE_UD);
1394 break;
1395
1396 case LSC_ADDR_SURFTYPE_BTI:
1397 assert(surface.file != BAD_FILE);
1398 if (surface.file == IMM) {
1399 inst->src[1] = brw_imm_ud(lsc_bti_ex_desc(devinfo, surface.ud));
1400 } else {
1401 const fs_builder ubld = bld.exec_all().group(1, 0);
1402 brw_reg tmp = ubld.vgrf(BRW_TYPE_UD);
1403 ubld.SHL(tmp, surface, brw_imm_ud(24));
1404 inst->src[1] = component(tmp, 0);
1405 }
1406 break;
1407
1408 case LSC_ADDR_SURFTYPE_FLAT:
1409 inst->src[1] = brw_imm_ud(0);
1410 break;
1411
1412 default:
1413 unreachable("Invalid LSC surface address type");
1414 }
1415 }
1416
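/* Translate the size of an address register type into the corresponding LSC
 * address size.
 */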
1417 static enum lsc_addr_size
lsc_addr_size_for_type(enum brw_reg_type type)
1419 {
1420 switch (brw_type_size_bytes(type)) {
1421 case 2: return LSC_ADDR_SIZE_A16;
1422 case 4: return LSC_ADDR_SIZE_A32;
1423 case 8: return LSC_ADDR_SIZE_A64;
1424 default: unreachable("invalid type size");
1425 }
1426 }
1427
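/**
 * Lower a generic memory logical opcode (load, store or atomic targeting
 * UGM, TGM or SLM) into an LSC SEND message.
 */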
1428 static void
lower_lsc_memory_logical_send(const fs_builder &bld, fs_inst *inst)
1430 {
1431 const intel_device_info *devinfo = bld.shader->devinfo;
1432 assert(devinfo->has_lsc);
1433
1434 assert(inst->src[MEMORY_LOGICAL_OPCODE].file == IMM);
1435 assert(inst->src[MEMORY_LOGICAL_MODE].file == IMM);
1436 assert(inst->src[MEMORY_LOGICAL_BINDING_TYPE].file == IMM);
1437 assert(inst->src[MEMORY_LOGICAL_COORD_COMPONENTS].file == IMM);
1438 assert(inst->src[MEMORY_LOGICAL_DATA_SIZE].file == IMM);
1439 assert(inst->src[MEMORY_LOGICAL_FLAGS].file == IMM);
1440
1441 /* Get the logical send arguments. */
1442 const enum lsc_opcode op = (lsc_opcode) inst->src[MEMORY_LOGICAL_OPCODE].ud;
1443 const enum memory_logical_mode mode =
1444 (enum memory_logical_mode) inst->src[MEMORY_LOGICAL_MODE].ud;
1445 const enum lsc_addr_surface_type binding_type =
1446 (enum lsc_addr_surface_type) inst->src[MEMORY_LOGICAL_BINDING_TYPE].ud;
1447 const brw_reg binding = inst->src[MEMORY_LOGICAL_BINDING];
1448 const brw_reg addr = inst->src[MEMORY_LOGICAL_ADDRESS];
1449 const unsigned coord_components =
1450 inst->src[MEMORY_LOGICAL_COORD_COMPONENTS].ud;
1451 enum lsc_data_size data_size =
1452 (enum lsc_data_size) inst->src[MEMORY_LOGICAL_DATA_SIZE].ud;
1453 const unsigned components = inst->src[MEMORY_LOGICAL_COMPONENTS].ud;
1454 const enum memory_flags flags =
1455 (enum memory_flags) inst->src[MEMORY_LOGICAL_FLAGS].ud;
1456 const bool transpose = flags & MEMORY_FLAG_TRANSPOSE;
1457 const bool include_helpers = flags & MEMORY_FLAG_INCLUDE_HELPERS;
1458 const brw_reg data0 = inst->src[MEMORY_LOGICAL_DATA0];
1459 const brw_reg data1 = inst->src[MEMORY_LOGICAL_DATA1];
1460 const bool has_side_effects = inst->has_side_effects();
1461
1462 const uint32_t data_size_B = lsc_data_size_bytes(data_size);
1463 const enum brw_reg_type data_type =
1464 brw_type_with_size(data0.type, data_size_B * 8);
1465
1466 const enum lsc_addr_size addr_size = lsc_addr_size_for_type(addr.type);
1467
1468 brw_reg payload = addr;
1469
1470 if (addr.file != VGRF || !addr.is_contiguous()) {
1471 if (inst->force_writemask_all) {
1472 const fs_builder dbld = bld.group(bld.shader->dispatch_width, 0);
1473 payload = dbld.move_to_vgrf(addr, coord_components);
1474 } else {
1475 payload = bld.move_to_vgrf(addr, coord_components);
1476 }
1477 }
1478
1479 unsigned ex_mlen = 0;
1480 brw_reg payload2;
1481 if (data0.file != BAD_FILE) {
1482 if (transpose) {
1483 assert(data1.file == BAD_FILE);
1484
1485 payload2 = data0;
1486 ex_mlen = DIV_ROUND_UP(components, 8);
1487 } else {
1488 brw_reg data[8];
1489 unsigned size = 0;
1490
1491 assert(components < 8);
1492
1493 for (unsigned i = 0; i < components; i++)
1494 data[size++] = offset(data0, inst->exec_size, i);
1495
1496 if (data1.file != BAD_FILE) {
1497 for (unsigned i = 0; i < components; i++)
1498 data[size++] = offset(data1, inst->exec_size, i);
1499 }
1500
1501 payload2 = bld.vgrf(data0.type, size);
1502 bld.LOAD_PAYLOAD(payload2, data, size, 0);
1503 ex_mlen = (size * brw_type_size_bytes(data_type) * inst->exec_size) / REG_SIZE;
1504 }
1505 }
1506
1507 /* Bspec: Atomic instruction -> Cache section:
1508 *
1509 * Atomic messages are always forced to "un-cacheable" in the L1
1510 * cache.
1511 */
1512 unsigned cache_mode =
1513 lsc_opcode_is_atomic(op) ? (unsigned) LSC_CACHE(devinfo, STORE, L1UC_L3WB) :
1514 lsc_opcode_is_store(op) ? (unsigned) LSC_CACHE(devinfo, STORE, L1STATE_L3MOCS) :
1515 (unsigned) LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS);
1516
1517 /* If we're a fragment shader, we have to predicate with the sample mask to
1518 * avoid helper invocations in instructions with side effects, unless they
1519 * are explicitly required. One exception is for scratch writes - even
1520 * though those have side effects, they represent operations that didn't
1521 * originally have any. We want to avoid accessing undefined values from
1522 * scratch, so we disable helper invocations entirely there.
1523 *
1524 * There are also special cases when we actually want to run on helpers
1525 * (ray queries).
1526 */
1527 if (bld.shader->stage == MESA_SHADER_FRAGMENT && !transpose) {
1528 if (include_helpers)
1529 emit_predicate_on_vector_mask(bld, inst);
1530 else if (has_side_effects && mode != MEMORY_MODE_SCRATCH)
1531 brw_emit_predicate_on_sample_mask(bld, inst);
1532 }
1533
1534 switch (mode) {
1535 case MEMORY_MODE_UNTYPED:
1536 case MEMORY_MODE_SCRATCH:
1537 inst->sfid = GFX12_SFID_UGM;
1538 break;
1539 case MEMORY_MODE_TYPED:
1540 inst->sfid = GFX12_SFID_TGM;
1541 break;
1542 case MEMORY_MODE_SHARED_LOCAL:
1543 inst->sfid = GFX12_SFID_SLM;
1544 break;
1545 }
1546 assert(inst->sfid);
1547
1548 inst->desc = lsc_msg_desc(devinfo, op, binding_type, addr_size, data_size,
1549 lsc_opcode_has_cmask(op) ?
1550 (1 << components) - 1 : components,
1551 transpose, cache_mode);
1552
1553   /* Set up the surface descriptors; this fills src[0] and src[1]. */
1554 setup_lsc_surface_descriptors(bld, inst, inst->desc, binding);
1555
1556 inst->opcode = SHADER_OPCODE_SEND;
1557 inst->mlen = lsc_msg_addr_len(devinfo, addr_size,
1558 inst->exec_size * coord_components);
1559 inst->ex_mlen = ex_mlen;
1560 inst->header_size = 0;
1561 inst->send_has_side_effects = has_side_effects;
1562 inst->send_is_volatile = !has_side_effects;
1563
1564 inst->resize_sources(4);
1565
1566 /* Finally, the payload */
1567 inst->src[2] = payload;
1568 inst->src[3] = payload2;
1569 }
1570
1571 static brw_reg
1572 emit_a64_oword_block_header(const fs_builder &bld, const brw_reg &addr)
1573 {
1574 const fs_builder ubld = bld.exec_all().group(8, 0);
1575
1576 assert(brw_type_size_bytes(addr.type) == 8 && addr.stride == 0);
1577
1578 brw_reg expanded_addr = addr;
1579 if (addr.file == UNIFORM) {
1580      /* We can't do stride 1 with the UNIFORM file; it requires stride 0, so expand to a VGRF. */
1581 expanded_addr = ubld.vgrf(BRW_TYPE_UQ);
1582 expanded_addr.stride = 0;
1583 ubld.MOV(expanded_addr, retype(addr, BRW_TYPE_UQ));
1584 }
1585
1586 brw_reg header = ubld.vgrf(BRW_TYPE_UD);
1587 ubld.MOV(header, brw_imm_ud(0));
1588
1589 /* Use a 2-wide MOV to fill out the address */
1590 brw_reg addr_vec2 = expanded_addr;
1591 addr_vec2.type = BRW_TYPE_UD;
1592 addr_vec2.stride = 1;
1593 ubld.group(2, 0).MOV(header, addr_vec2);
1594
1595 return header;
1596 }
1597
1598 static void
1599 lower_hdc_memory_logical_send(const fs_builder &bld, fs_inst *inst)
1600 {
1601 const intel_device_info *devinfo = bld.shader->devinfo;
1602 const brw_compiler *compiler = bld.shader->compiler;
1603
1604 assert(inst->src[MEMORY_LOGICAL_OPCODE].file == IMM);
1605 assert(inst->src[MEMORY_LOGICAL_MODE].file == IMM);
1606 assert(inst->src[MEMORY_LOGICAL_BINDING_TYPE].file == IMM);
1607 assert(inst->src[MEMORY_LOGICAL_COORD_COMPONENTS].file == IMM);
1608 assert(inst->src[MEMORY_LOGICAL_DATA_SIZE].file == IMM);
1609 assert(inst->src[MEMORY_LOGICAL_FLAGS].file == IMM);
1610
1611 /* Get the logical send arguments. */
1612 const enum lsc_opcode op = (lsc_opcode)inst->src[MEMORY_LOGICAL_OPCODE].ud;
1613 const enum memory_logical_mode mode =
1614 (enum memory_logical_mode) inst->src[MEMORY_LOGICAL_MODE].ud;
1615 enum lsc_addr_surface_type binding_type =
1616 (enum lsc_addr_surface_type) inst->src[MEMORY_LOGICAL_BINDING_TYPE].ud;
1617 brw_reg binding = inst->src[MEMORY_LOGICAL_BINDING];
1618 const brw_reg addr = inst->src[MEMORY_LOGICAL_ADDRESS];
1619 const unsigned coord_components =
1620 inst->src[MEMORY_LOGICAL_COORD_COMPONENTS].ud;
1621 const unsigned alignment = inst->src[MEMORY_LOGICAL_ALIGNMENT].ud;
1622 const unsigned components = inst->src[MEMORY_LOGICAL_COMPONENTS].ud;
1623 const enum memory_flags flags =
1624 (enum memory_flags) inst->src[MEMORY_LOGICAL_FLAGS].ud;
1625 const bool block = flags & MEMORY_FLAG_TRANSPOSE;
1626 const bool include_helpers = flags & MEMORY_FLAG_INCLUDE_HELPERS;
1627 const brw_reg data0 = inst->src[MEMORY_LOGICAL_DATA0];
1628 const brw_reg data1 = inst->src[MEMORY_LOGICAL_DATA1];
1629 const bool has_side_effects = inst->has_side_effects();
1630 const bool has_dest = inst->dst.file != BAD_FILE && !inst->dst.is_null();
1631
1632 /* Don't predicate scratch writes on the sample mask. Otherwise,
1633 * FS helper invocations would load undefined values from scratch memory.
1634    * Scratch loads/stores are also produced from operations without
1635    * side-effects, so they should not behave differently in helper
1636    * invocations.
1637 */
1638 bool allow_sample_mask = has_side_effects && mode != MEMORY_MODE_SCRATCH;
1639
1640 const enum lsc_data_size data_size =
1641 (enum lsc_data_size) inst->src[MEMORY_LOGICAL_DATA_SIZE].ud;
1642
1643 /* unpadded data size */
1644 const uint32_t data_bit_size =
1645 data_size == LSC_DATA_SIZE_D8U32 ? 8 :
1646 data_size == LSC_DATA_SIZE_D16U32 ? 16 :
1647 8 * lsc_data_size_bytes(data_size);
1648
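   /* Sub-DWord data or less-than-DWord alignment forces the byte scattered
    * messages; scratch otherwise uses DWord scattered, and the remaining
    * cases use the (un)typed surface messages.
    */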
1649 const bool byte_scattered =
1650 data_bit_size < 32 || (alignment != 0 && alignment < 4);
1651 const bool dword_scattered = !byte_scattered && mode == MEMORY_MODE_SCRATCH;
1652 const bool surface_access = !byte_scattered && !dword_scattered && !block;
1653
1654 /* SLM block reads must use the 16B-aligned OWord Block Read messages,
1655 * as the unaligned message doesn't exist for SLM.
1656 */
1657 const bool oword_aligned = block && mode == MEMORY_MODE_SHARED_LOCAL;
1658 assert(!oword_aligned || (alignment % 16) == 0);
1659
1660 enum lsc_addr_size addr_size = lsc_addr_size_for_type(addr.type);
1661 unsigned addr_size_B = coord_components * lsc_addr_size_bytes(addr_size);
1662
1663 brw_reg header;
1664 fs_builder ubld8 = bld.exec_all().group(8, 0);
1665 fs_builder ubld1 = ubld8.group(1, 0);
1666 if (mode == MEMORY_MODE_SCRATCH) {
1667 header = ubld8.vgrf(BRW_TYPE_UD);
1668 ubld8.emit(SHADER_OPCODE_SCRATCH_HEADER, header, brw_ud8_grf(0, 0));
1669 } else if (block) {
1670 if (addr_size == LSC_ADDR_SIZE_A64) {
1671 header = emit_a64_oword_block_header(bld, addr);
1672 } else {
1673 header = ubld8.vgrf(BRW_TYPE_UD);
1674 ubld8.MOV(header, brw_imm_ud(0));
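         /* DWord 2 of the header holds the block offset: in OWords for the
          * 16B-aligned message, in bytes otherwise.
          */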
1675 if (oword_aligned)
1676 ubld1.SHR(component(header, 2), addr, brw_imm_ud(4));
1677 else
1678 ubld1.MOV(component(header, 2), addr);
1679 }
1680 }
1681
1682 /* If we're a fragment shader, we have to predicate with the sample mask to
1683    * avoid helper invocations in instructions with side effects, unless
1684    * they are explicitly required.
1685 *
1686 * There are also special cases when we actually want to run on helpers
1687 * (ray queries).
1688 */
1689 if (bld.shader->stage == MESA_SHADER_FRAGMENT) {
1690 if (include_helpers)
1691 emit_predicate_on_vector_mask(bld, inst);
1692 else if (allow_sample_mask &&
1693 (header.file == BAD_FILE || !surface_access))
1694 brw_emit_predicate_on_sample_mask(bld, inst);
1695 }
1696
1697 brw_reg payload, payload2;
1698 unsigned mlen, ex_mlen = 0;
1699
1700 if (!block) {
1701 brw_reg data[11];
1702 unsigned num_sources = 0;
1703 if (header.file != BAD_FILE)
1704 data[num_sources++] = header;
1705
1706 for (unsigned i = 0; i < coord_components; i++)
1707 data[num_sources++] = offset(addr, inst->exec_size, i);
1708
1709 if (data0.file != BAD_FILE) {
1710 for (unsigned i = 0; i < components; i++)
1711 data[num_sources++] = offset(data0, inst->exec_size, i);
1712 if (data1.file != BAD_FILE) {
1713 for (unsigned i = 0; i < components; i++)
1714 data[num_sources++] = offset(data1, inst->exec_size, i);
1715 }
1716 }
1717
1718 assert(num_sources <= ARRAY_SIZE(data));
1719
1720 unsigned payload_size_UDs = (header.file != BAD_FILE ? 1 : 0) +
1721 (addr_size_B / 4) +
1722 (lsc_op_num_data_values(op) * components *
1723 lsc_data_size_bytes(data_size) / 4);
1724
1725 payload = bld.vgrf(BRW_TYPE_UD, payload_size_UDs);
1726 fs_inst *load_payload =
1727 emit_load_payload_with_padding(bld, payload, data, num_sources,
1728 header.file != BAD_FILE ? 1 : 0,
1729 REG_SIZE);
1730 mlen = load_payload->size_written / REG_SIZE;
1731 } else {
1732 assert(data1.file == BAD_FILE);
1733
1734 payload = header;
1735 mlen = 1;
1736
1737 if (data0.file != BAD_FILE) {
1738 payload2 = bld.move_to_vgrf(data0, components);
1739 ex_mlen = components * sizeof(uint32_t) / REG_SIZE;
1740 }
1741 }
1742
1743
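   /* On HDC, SLM and scratch accesses go through fixed binding table
    * entries.
    */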
1744 if (mode == MEMORY_MODE_SHARED_LOCAL) {
1745 binding_type = LSC_ADDR_SURFTYPE_BTI;
1746 binding = brw_imm_ud(GFX7_BTI_SLM);
1747 } else if (mode == MEMORY_MODE_SCRATCH) {
1748 binding_type = LSC_ADDR_SURFTYPE_BTI;
1749 binding = brw_imm_ud(GFX8_BTI_STATELESS_NON_COHERENT);
1750 }
1751
1752 uint32_t sfid, desc;
1753 if (mode == MEMORY_MODE_TYPED) {
1754 assert(addr_size == LSC_ADDR_SIZE_A32);
1755 assert(!block);
1756
1757 sfid = HSW_SFID_DATAPORT_DATA_CACHE_1;
1758
1759 if (lsc_opcode_is_atomic(op)) {
1760 desc = brw_dp_typed_atomic_desc(devinfo, inst->exec_size, inst->group,
1761 lsc_op_to_legacy_atomic(op),
1762 has_dest);
1763 } else {
1764 desc = brw_dp_typed_surface_rw_desc(devinfo, inst->exec_size,
1765 inst->group, components, !has_dest);
1766 }
1767 } else if (addr_size == LSC_ADDR_SIZE_A64) {
1768 assert(binding_type == LSC_ADDR_SURFTYPE_FLAT);
1769 assert(!dword_scattered);
1770
1771 sfid = HSW_SFID_DATAPORT_DATA_CACHE_1;
1772
1773 if (lsc_opcode_is_atomic(op)) {
1774 unsigned aop = lsc_op_to_legacy_atomic(op);
1775 if (lsc_opcode_is_atomic_float(op)) {
1776 desc = brw_dp_a64_untyped_atomic_float_desc(devinfo, inst->exec_size,
1777 data_bit_size, aop,
1778 has_dest);
1779 } else {
1780 desc = brw_dp_a64_untyped_atomic_desc(devinfo, inst->exec_size,
1781 data_bit_size, aop,
1782 has_dest);
1783 }
1784 } else if (block) {
1785 desc = brw_dp_a64_oword_block_rw_desc(devinfo, oword_aligned,
1786 components, !has_dest);
1787 } else if (byte_scattered) {
1788 desc = brw_dp_a64_byte_scattered_rw_desc(devinfo, inst->exec_size,
1789 data_bit_size, !has_dest);
1790 } else {
1791 desc = brw_dp_a64_untyped_surface_rw_desc(devinfo, inst->exec_size,
1792 components, !has_dest);
1793 }
1794 } else {
1795 assert(binding_type != LSC_ADDR_SURFTYPE_FLAT);
1796
1797 sfid = surface_access ? HSW_SFID_DATAPORT_DATA_CACHE_1
1798 : GFX7_SFID_DATAPORT_DATA_CACHE;
1799
1800 if (lsc_opcode_is_atomic(op)) {
1801 unsigned aop = lsc_op_to_legacy_atomic(op);
1802 if (lsc_opcode_is_atomic_float(op)) {
1803 desc = brw_dp_untyped_atomic_float_desc(devinfo, inst->exec_size,
1804 aop, has_dest);
1805 } else {
1806 desc = brw_dp_untyped_atomic_desc(devinfo, inst->exec_size,
1807 aop, has_dest);
1808 }
1809 } else if (block) {
1810 desc = brw_dp_oword_block_rw_desc(devinfo, oword_aligned,
1811 components, !has_dest);
1812 } else if (byte_scattered) {
1813 desc = brw_dp_byte_scattered_rw_desc(devinfo, inst->exec_size,
1814 data_bit_size, !has_dest);
1815 } else if (dword_scattered) {
1816 desc = brw_dp_dword_scattered_rw_desc(devinfo, inst->exec_size,
1817 !has_dest);
1818 } else {
1819 desc = brw_dp_untyped_surface_rw_desc(devinfo, inst->exec_size,
1820 components, !has_dest);
1821 }
1822 }
1823
1824 assert(sfid);
1825
1826 /* Update the original instruction. */
1827 inst->opcode = SHADER_OPCODE_SEND;
1828 inst->sfid = sfid;
1829 inst->mlen = mlen;
1830 inst->ex_mlen = ex_mlen;
1831 inst->header_size = header.file != BAD_FILE ? 1 : 0;
1832 inst->send_has_side_effects = has_side_effects;
1833 inst->send_is_volatile = !has_side_effects;
1834
1835 if (block) {
1836 assert(inst->force_writemask_all);
1837 inst->exec_size = components > 8 ? 16 : 8;
1838 }
1839
1840 inst->resize_sources(4);
1841
1842 /* Set up descriptors */
1843 switch (binding_type) {
1844 case LSC_ADDR_SURFTYPE_FLAT:
1845 inst->src[0] = brw_imm_ud(0);
1846 inst->src[1] = brw_imm_ud(0);
1847 break;
1848 case LSC_ADDR_SURFTYPE_BSS:
1849 inst->send_ex_bso = compiler->extended_bindless_surface_offset;
1850 /* fall-through */
1851 case LSC_ADDR_SURFTYPE_SS:
1852 desc |= GFX9_BTI_BINDLESS;
1853
1854 /* We assume that the driver provided the handle in the top 20 bits so
1855 * we can use the surface handle directly as the extended descriptor.
1856 */
1857 inst->src[0] = brw_imm_ud(0);
1858 inst->src[1] = binding;
1859 break;
1860 case LSC_ADDR_SURFTYPE_BTI:
1861 if (binding.file == IMM) {
1862 desc |= binding.ud & 0xff;
1863 inst->src[0] = brw_imm_ud(0);
1864 inst->src[1] = brw_imm_ud(0);
1865 } else {
1866 brw_reg tmp = ubld1.vgrf(BRW_TYPE_UD);
1867 ubld1.AND(tmp, binding, brw_imm_ud(0xff));
1868 inst->src[0] = component(tmp, 0);
1869 inst->src[1] = brw_imm_ud(0);
1870 }
1871 break;
1872 default:
1873 unreachable("Unknown surface type");
1874 }
1875
1876 inst->desc = desc;
1877
1878 /* Finally, the payloads */
1879 inst->src[2] = payload;
1880 inst->src[3] = payload2;
1881 }
1882
1883 static void
1884 lower_lsc_varying_pull_constant_logical_send(const fs_builder &bld,
1885 fs_inst *inst)
1886 {
1887 const intel_device_info *devinfo = bld.shader->devinfo;
1888 ASSERTED const brw_compiler *compiler = bld.shader->compiler;
1889
1890 brw_reg surface = inst->src[PULL_VARYING_CONSTANT_SRC_SURFACE];
1891 brw_reg surface_handle = inst->src[PULL_VARYING_CONSTANT_SRC_SURFACE_HANDLE];
1892 brw_reg offset_B = inst->src[PULL_VARYING_CONSTANT_SRC_OFFSET];
1893 brw_reg alignment_B = inst->src[PULL_VARYING_CONSTANT_SRC_ALIGNMENT];
1894
1895 /* We are switching the instruction from an ALU-like instruction to a
1896 * send-from-grf instruction. Since sends can't handle strides or
1897 * source modifiers, we have to make a copy of the offset source.
1898 */
1899 brw_reg ubo_offset = bld.move_to_vgrf(offset_B, 1);
1900
1901 enum lsc_addr_surface_type surf_type =
1902 surface_handle.file == BAD_FILE ?
1903 LSC_ADDR_SURFTYPE_BTI : LSC_ADDR_SURFTYPE_BSS;
1904
1905 assert(alignment_B.file == IMM);
1906 unsigned alignment = alignment_B.ud;
1907
1908 inst->opcode = SHADER_OPCODE_SEND;
1909 inst->sfid = GFX12_SFID_UGM;
1910 inst->resize_sources(3);
1911 inst->send_ex_bso = surf_type == LSC_ADDR_SURFTYPE_BSS &&
1912 compiler->extended_bindless_surface_offset;
1913
1914 assert(!compiler->indirect_ubos_use_sampler);
1915
1916 inst->src[0] = brw_imm_ud(0);
1917 inst->src[2] = ubo_offset; /* payload */
1918
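   /* With DWord (or better) alignment, a single 4-channel LSC load fetches
    * the whole vec4; otherwise fall back to four single-channel loads below.
    */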
1919 if (alignment >= 4) {
1920 inst->desc =
1921 lsc_msg_desc(devinfo, LSC_OP_LOAD,
1922 surf_type, LSC_ADDR_SIZE_A32,
1923 LSC_DATA_SIZE_D32,
1924 4 /* num_channels */,
1925 false /* transpose */,
1926 LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS));
1927 inst->mlen = lsc_msg_addr_len(devinfo, LSC_ADDR_SIZE_A32, inst->exec_size);
1928
1929 setup_lsc_surface_descriptors(bld, inst, inst->desc,
1930 surface.file != BAD_FILE ?
1931 surface : surface_handle);
1932 } else {
1933 inst->desc =
1934 lsc_msg_desc(devinfo, LSC_OP_LOAD,
1935 surf_type, LSC_ADDR_SIZE_A32,
1936 LSC_DATA_SIZE_D32,
1937 1 /* num_channels */,
1938 false /* transpose */,
1939 LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS));
1940 inst->mlen = lsc_msg_addr_len(devinfo, LSC_ADDR_SIZE_A32, inst->exec_size);
1941
1942 setup_lsc_surface_descriptors(bld, inst, inst->desc,
1943 surface.file != BAD_FILE ?
1944 surface : surface_handle);
1945
1946 /* The byte scattered messages can only read one dword at a time so
1947 * we have to duplicate the message 4 times to read the full vec4.
1948 * Hopefully, dead code will clean up the mess if some of them aren't
1949 * needed.
1950 */
1951 assert(inst->size_written == 16 * inst->exec_size);
1952 inst->size_written /= 4;
1953 for (unsigned c = 1; c < 4; c++) {
1954 /* Emit a copy of the instruction because we're about to modify
1955 * it. Because this loop starts at 1, we will emit copies for the
1956 * first 3 and the final one will be the modified instruction.
1957 */
1958 bld.emit(*inst);
1959
1960 /* Offset the source */
1961 inst->src[2] = bld.vgrf(BRW_TYPE_UD);
1962 bld.ADD(inst->src[2], ubo_offset, brw_imm_ud(c * 4));
1963
1964 /* Offset the destination */
1965 inst->dst = offset(inst->dst, bld, 1);
1966 }
1967 }
1968 }
1969
1970 static void
1971 lower_varying_pull_constant_logical_send(const fs_builder &bld, fs_inst *inst)
1972 {
1973 const intel_device_info *devinfo = bld.shader->devinfo;
1974 const brw_compiler *compiler = bld.shader->compiler;
1975
1976 brw_reg surface = inst->src[PULL_VARYING_CONSTANT_SRC_SURFACE];
1977 brw_reg surface_handle = inst->src[PULL_VARYING_CONSTANT_SRC_SURFACE_HANDLE];
1978 brw_reg offset_B = inst->src[PULL_VARYING_CONSTANT_SRC_OFFSET];
1979
1980 /* We are switching the instruction from an ALU-like instruction to a
1981 * send-from-grf instruction. Since sends can't handle strides or
1982 * source modifiers, we have to make a copy of the offset source.
1983 */
1984 brw_reg ubo_offset = bld.vgrf(BRW_TYPE_UD);
1985 bld.MOV(ubo_offset, offset_B);
1986
1987 assert(inst->src[PULL_VARYING_CONSTANT_SRC_ALIGNMENT].file == IMM);
1988 unsigned alignment = inst->src[PULL_VARYING_CONSTANT_SRC_ALIGNMENT].ud;
1989
1990 inst->opcode = SHADER_OPCODE_SEND;
1991 inst->mlen = inst->exec_size / 8;
1992 inst->resize_sources(3);
1993
1994 /* src[0] & src[1] are filled by setup_surface_descriptors() */
1995 inst->src[2] = ubo_offset; /* payload */
1996
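   /* Pick the message type: the sampler LD message when indirect UBOs go
    * through the sampler, a 4-channel untyped surface read for DWord-aligned
    * loads, or byte scattered reads (one DWord at a time) otherwise.
    */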
1997 if (compiler->indirect_ubos_use_sampler) {
1998 const unsigned simd_mode =
1999 inst->exec_size <= 8 ? BRW_SAMPLER_SIMD_MODE_SIMD8 :
2000 BRW_SAMPLER_SIMD_MODE_SIMD16;
2001 const uint32_t desc = brw_sampler_desc(devinfo, 0, 0,
2002 GFX5_SAMPLER_MESSAGE_SAMPLE_LD,
2003 simd_mode, 0);
2004
2005 inst->sfid = BRW_SFID_SAMPLER;
2006 setup_surface_descriptors(bld, inst, desc, surface, surface_handle);
2007 } else if (alignment >= 4) {
2008 const uint32_t desc =
2009 brw_dp_untyped_surface_rw_desc(devinfo, inst->exec_size,
2010 4, /* num_channels */
2011 false /* write */);
2012
2013 inst->sfid = HSW_SFID_DATAPORT_DATA_CACHE_1;
2014 setup_surface_descriptors(bld, inst, desc, surface, surface_handle);
2015 } else {
2016 const uint32_t desc =
2017 brw_dp_byte_scattered_rw_desc(devinfo, inst->exec_size,
2018 32, /* bit_size */
2019 false /* write */);
2020
2021 inst->sfid = GFX7_SFID_DATAPORT_DATA_CACHE;
2022 setup_surface_descriptors(bld, inst, desc, surface, surface_handle);
2023
2024 /* The byte scattered messages can only read one dword at a time so
2025 * we have to duplicate the message 4 times to read the full vec4.
2026 * Hopefully, dead code will clean up the mess if some of them aren't
2027 * needed.
2028 */
2029 assert(inst->size_written == 16 * inst->exec_size);
2030 inst->size_written /= 4;
2031 for (unsigned c = 1; c < 4; c++) {
2032 /* Emit a copy of the instruction because we're about to modify
2033 * it. Because this loop starts at 1, we will emit copies for the
2034 * first 3 and the final one will be the modified instruction.
2035 */
2036 bld.emit(*inst);
2037
2038 /* Offset the source */
2039 inst->src[2] = bld.vgrf(BRW_TYPE_UD);
2040 bld.ADD(inst->src[2], ubo_offset, brw_imm_ud(c * 4));
2041
2042 /* Offset the destination */
2043 inst->dst = offset(inst->dst, bld, 1);
2044 }
2045 }
2046 }
2047
2048 static void
2049 lower_interpolator_logical_send(const fs_builder &bld, fs_inst *inst,
2050 const struct brw_wm_prog_key *wm_prog_key,
2051 const struct brw_wm_prog_data *wm_prog_data)
2052 {
2053 const intel_device_info *devinfo = bld.shader->devinfo;
2054
2055 /* We have to send something */
2056 brw_reg payload = brw_vec8_grf(0, 0);
2057 unsigned mlen = 1;
2058
2059 unsigned mode;
2060 switch (inst->opcode) {
2061 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
2062 assert(inst->src[INTERP_SRC_OFFSET].file == BAD_FILE);
2063 mode = GFX7_PIXEL_INTERPOLATOR_LOC_SAMPLE;
2064 break;
2065
2066 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
2067 assert(inst->src[INTERP_SRC_OFFSET].file == BAD_FILE);
2068 mode = GFX7_PIXEL_INTERPOLATOR_LOC_SHARED_OFFSET;
2069 break;
2070
2071 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
2072 payload = inst->src[INTERP_SRC_OFFSET];
2073 mlen = 2 * inst->exec_size / 8;
2074 mode = GFX7_PIXEL_INTERPOLATOR_LOC_PER_SLOT_OFFSET;
2075 break;
2076
2077 default:
2078 unreachable("Invalid interpolator instruction");
2079 }
2080
2081 const bool dynamic_mode =
2082 inst->src[INTERP_SRC_DYNAMIC_MODE].file != BAD_FILE;
2083
2084 brw_reg desc = inst->src[INTERP_SRC_MSG_DESC];
2085 uint32_t desc_imm =
2086 brw_pixel_interp_desc(devinfo,
2087 /* Leave the mode at 0 if persample_dispatch is
2088 * dynamic, it will be ORed in below.
2089 */
2090 dynamic_mode ? 0 : mode,
2091 inst->pi_noperspective,
2092 false /* coarse_pixel_rate */,
2093 inst->exec_size, inst->group);
2094
2095 if (wm_prog_data->coarse_pixel_dispatch == BRW_ALWAYS) {
2096 desc_imm |= (1 << 15);
2097 } else if (wm_prog_data->coarse_pixel_dispatch == BRW_SOMETIMES) {
2098 STATIC_ASSERT(INTEL_MSAA_FLAG_COARSE_PI_MSG == (1 << 15));
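      /* INTEL_MSAA_FLAG_COARSE_PI_MSG is defined to line up with bit 15 of
       * the message descriptor, so the dynamic flag can be masked out and
       * ORed straight into the descriptor.
       */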
2099 brw_reg orig_desc = desc;
2100 const fs_builder &ubld = bld.exec_all().group(8, 0);
2101 desc = ubld.vgrf(BRW_TYPE_UD);
2102 ubld.AND(desc, dynamic_msaa_flags(wm_prog_data),
2103 brw_imm_ud(INTEL_MSAA_FLAG_COARSE_PI_MSG));
2104
2105 /* And, if it's AT_OFFSET, we might have a non-trivial descriptor */
2106 if (orig_desc.file == IMM) {
2107 desc_imm |= orig_desc.ud;
2108 } else {
2109 ubld.OR(desc, desc, orig_desc);
2110 }
2111 }
2112
2113 /* If persample_dispatch is dynamic, select the interpolation mode
2114 * dynamically and OR into the descriptor to complete the static part
2115 * generated by brw_pixel_interp_desc().
2116 *
2117 * Why does this work? If you look at the SKL PRMs, Volume 7:
2118 * 3D-Media-GPGPU, Shared Functions Pixel Interpolater, you'll see that
2119 *
2120 * - "Per Message Offset” Message Descriptor
2121 * - “Sample Position Offset” Message Descriptor
2122 *
2123    * have different formats. Fortunately, a fragment shader dispatched at
2124    * pixel rate will have gl_SampleID = 0 & gl_NumSamples = 1. So the value
2125 * we pack in “Sample Position Offset” will be a 0 and will cover the X/Y
2126 * components of "Per Message Offset”, which will give us the pixel offset 0x0.
2127 */
2128 if (dynamic_mode) {
2129 brw_reg orig_desc = desc;
2130 const fs_builder &ubld = bld.exec_all().group(8, 0);
2131 desc = ubld.vgrf(BRW_TYPE_UD);
2132
2133 /* The predicate should have been built in brw_fs_nir.cpp when emitting
2134 * NIR code. This guarantees that we do not have incorrect interactions
2135 * with the flag register holding the predication result.
2136 */
2137 if (orig_desc.file == IMM) {
2138 /* Not using SEL here because we would generate an instruction with 2
2139 * immediate sources which is not supported by HW.
2140 */
2141 set_predicate_inv(BRW_PREDICATE_NORMAL, false,
2142 ubld.MOV(desc, brw_imm_ud(orig_desc.ud |
2143 GFX7_PIXEL_INTERPOLATOR_LOC_SAMPLE << 12)));
2144 set_predicate_inv(BRW_PREDICATE_NORMAL, true,
2145 ubld.MOV(desc, brw_imm_ud(orig_desc.ud |
2146 GFX7_PIXEL_INTERPOLATOR_LOC_SHARED_OFFSET << 12)));
2147 } else {
2148 set_predicate_inv(BRW_PREDICATE_NORMAL, false,
2149 ubld.OR(desc, orig_desc,
2150 brw_imm_ud(GFX7_PIXEL_INTERPOLATOR_LOC_SAMPLE << 12)));
2151 set_predicate_inv(BRW_PREDICATE_NORMAL, true,
2152 ubld.OR(desc, orig_desc,
2153 brw_imm_ud(GFX7_PIXEL_INTERPOLATOR_LOC_SHARED_OFFSET << 12)));
2154 }
2155 }
2156
2157 inst->opcode = SHADER_OPCODE_SEND;
2158 inst->sfid = GFX7_SFID_PIXEL_INTERPOLATOR;
2159 inst->desc = desc_imm;
2160 inst->ex_desc = 0;
2161 inst->mlen = mlen;
2162 inst->ex_mlen = 0;
2163 inst->send_has_side_effects = false;
2164 inst->send_is_volatile = false;
2165
2166 inst->resize_sources(3);
2167 inst->src[0] = component(desc, 0);
2168 inst->src[1] = brw_imm_ud(0); /* ex_desc */
2169 inst->src[2] = payload;
2170 }
2171
2172 static void
2173 lower_btd_logical_send(const fs_builder &bld, fs_inst *inst)
2174 {
2175 const intel_device_info *devinfo = bld.shader->devinfo;
2176 brw_reg global_addr = inst->src[0];
2177 const brw_reg btd_record = inst->src[1];
2178
2179 const unsigned unit = reg_unit(devinfo);
2180 const unsigned mlen = 2 * unit;
2181 const fs_builder ubld = bld.exec_all();
2182 brw_reg header = ubld.vgrf(BRW_TYPE_UD, 2 * unit);
2183
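   /* Build the header: the first register holds the 64-bit global address
    * (SPAWN) or the Stack ID release bit (RETIRE).
    */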
2184 ubld.MOV(header, brw_imm_ud(0));
2185 switch (inst->opcode) {
2186 case SHADER_OPCODE_BTD_SPAWN_LOGICAL:
2187 assert(brw_type_size_bytes(global_addr.type) == 8 &&
2188 global_addr.stride == 0);
2189 global_addr.type = BRW_TYPE_UD;
2190 global_addr.stride = 1;
2191 ubld.group(2, 0).MOV(header, global_addr);
2192 break;
2193
2194 case SHADER_OPCODE_BTD_RETIRE_LOGICAL:
2195 /* The bottom bit is the Stack ID release bit */
2196 ubld.group(1, 0).MOV(header, brw_imm_ud(1));
2197 break;
2198
2199 default:
2200 unreachable("Invalid BTD message");
2201 }
2202
2203 /* Stack IDs are always in R1 regardless of whether we're coming from a
2204 * bindless shader or a regular compute shader.
2205 */
2206 brw_reg stack_ids = retype(offset(header, bld, 1), BRW_TYPE_UW);
2207 bld.exec_all().MOV(stack_ids, retype(brw_vec8_grf(1 * unit, 0),
2208 BRW_TYPE_UW));
2209
2210 unsigned ex_mlen = 0;
2211 brw_reg payload;
2212 if (inst->opcode == SHADER_OPCODE_BTD_SPAWN_LOGICAL) {
2213 ex_mlen = 2 * (inst->exec_size / 8);
2214 payload = bld.move_to_vgrf(btd_record, 1);
2215 } else {
2216 assert(inst->opcode == SHADER_OPCODE_BTD_RETIRE_LOGICAL);
2217 /* All these messages take a BTD and things complain if we don't provide
2218 * one for RETIRE. However, it shouldn't ever actually get used so fill
2219 * it with zero.
2220 */
2221 ex_mlen = 2 * (inst->exec_size / 8);
2222 payload = bld.move_to_vgrf(brw_imm_uq(0), 1);
2223 }
2224
2225 /* Update the original instruction. */
2226 inst->opcode = SHADER_OPCODE_SEND;
2227 inst->mlen = mlen;
2228 inst->ex_mlen = ex_mlen;
2229 inst->header_size = 0; /* HW docs require has_header = false */
2230 inst->send_has_side_effects = true;
2231 inst->send_is_volatile = false;
2232
2233 /* Set up SFID and descriptors */
2234 inst->sfid = GEN_RT_SFID_BINDLESS_THREAD_DISPATCH;
2235 inst->desc = brw_btd_spawn_desc(devinfo, inst->exec_size,
2236 GEN_RT_BTD_MESSAGE_SPAWN);
2237 inst->resize_sources(4);
2238 inst->src[0] = brw_imm_ud(0); /* desc */
2239 inst->src[1] = brw_imm_ud(0); /* ex_desc */
2240 inst->src[2] = header;
2241 inst->src[3] = payload;
2242 }
2243
2244 static void
2245 lower_trace_ray_logical_send(const fs_builder &bld, fs_inst *inst)
2246 {
2247 const intel_device_info *devinfo = bld.shader->devinfo;
2248    /* The emit_uniformize() in brw_fs_nir.cpp will generate a horizontal
2249     * stride of 0. Below we're doing a MOV() in SIMD2. Since we can't use
2250     * UQ/Q types on Gfx12.5, we need to tweak the stride to 1 dword so that
2251     * the MOV operates on 2 components rather than twice on the same
2252     * component.
2253 */
2254 brw_reg globals_addr = retype(inst->src[RT_LOGICAL_SRC_GLOBALS], BRW_TYPE_UD);
2255 globals_addr.stride = 1;
2256 const brw_reg bvh_level =
2257 inst->src[RT_LOGICAL_SRC_BVH_LEVEL].file == IMM ?
2258 inst->src[RT_LOGICAL_SRC_BVH_LEVEL] :
2259 bld.move_to_vgrf(inst->src[RT_LOGICAL_SRC_BVH_LEVEL],
2260 inst->components_read(RT_LOGICAL_SRC_BVH_LEVEL));
2261 const brw_reg trace_ray_control =
2262 inst->src[RT_LOGICAL_SRC_TRACE_RAY_CONTROL].file == IMM ?
2263 inst->src[RT_LOGICAL_SRC_TRACE_RAY_CONTROL] :
2264 bld.move_to_vgrf(inst->src[RT_LOGICAL_SRC_TRACE_RAY_CONTROL],
2265 inst->components_read(RT_LOGICAL_SRC_TRACE_RAY_CONTROL));
2266 const brw_reg synchronous_src = inst->src[RT_LOGICAL_SRC_SYNCHRONOUS];
2267 assert(synchronous_src.file == IMM);
2268 const bool synchronous = synchronous_src.ud;
2269
2270 const unsigned unit = reg_unit(devinfo);
2271 const unsigned mlen = unit;
2272 const fs_builder ubld = bld.exec_all();
2273 brw_reg header = ubld.vgrf(BRW_TYPE_UD);
2274 ubld.MOV(header, brw_imm_ud(0));
2275 ubld.group(2, 0).MOV(header, globals_addr);
2276 if (synchronous)
2277 ubld.group(1, 0).MOV(byte_offset(header, 16), brw_imm_ud(synchronous));
2278
2279 const unsigned ex_mlen = inst->exec_size / 8;
2280 brw_reg payload = bld.vgrf(BRW_TYPE_UD);
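   /* Each lane's payload DWord packs the BVH level in bits 2:0 and the
    * trace-ray control in bits 9:8 (10:8 on Xe2).
    */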
2281 if (bvh_level.file == IMM &&
2282 trace_ray_control.file == IMM) {
2283 uint32_t high = devinfo->ver >= 20 ? 10 : 9;
2284 bld.MOV(payload, brw_imm_ud(SET_BITS(trace_ray_control.ud, high, 8) |
2285 (bvh_level.ud & 0x7)));
2286 } else {
2287 bld.SHL(payload, trace_ray_control, brw_imm_ud(8));
2288 bld.OR(payload, payload, bvh_level);
2289 }
2290
2291 /* When doing synchronous traversal, the HW implicitly computes the
2292    * stack_id using the following formula:
2293 *
2294 * EUID[3:0] & THREAD_ID[2:0] & SIMD_LANE_ID[3:0]
2295 *
2296    * Only in the asynchronous case do we need to set the stack_id from the
2297    * payload register.
2298 */
2299 if (!synchronous) {
2300 bld.AND(subscript(payload, BRW_TYPE_UW, 1),
2301 retype(brw_vec8_grf(1 * unit, 0), BRW_TYPE_UW),
2302 brw_imm_uw(0x7ff));
2303 }
2304
2305 /* Update the original instruction. */
2306 inst->opcode = SHADER_OPCODE_SEND;
2307 inst->mlen = mlen;
2308 inst->ex_mlen = ex_mlen;
2309 inst->header_size = 0; /* HW docs require has_header = false */
2310 inst->send_has_side_effects = true;
2311 inst->send_is_volatile = false;
2312
2313 /* Set up SFID and descriptors */
2314 inst->sfid = GEN_RT_SFID_RAY_TRACE_ACCELERATOR;
2315 inst->desc = brw_rt_trace_ray_desc(devinfo, inst->exec_size);
2316 inst->resize_sources(4);
2317 inst->src[0] = brw_imm_ud(0); /* desc */
2318 inst->src[1] = brw_imm_ud(0); /* ex_desc */
2319 inst->src[2] = header;
2320 inst->src[3] = payload;
2321 }
2322
2323 static void
2324 lower_get_buffer_size(const fs_builder &bld, fs_inst *inst)
2325 {
2326 const intel_device_info *devinfo = bld.shader->devinfo;
2327    /* Since we can only execute this instruction on uniform bti/surface handles,
2328     * brw_fs_nir.cpp should already have limited this to SIMD8 (SIMD16 on Xe2).
2329     */
2330 assert(inst->exec_size == (devinfo->ver < 20 ? 8 : 16));
2331
2332 brw_reg surface = inst->src[GET_BUFFER_SIZE_SRC_SURFACE];
2333 brw_reg surface_handle = inst->src[GET_BUFFER_SIZE_SRC_SURFACE_HANDLE];
2334 brw_reg lod = inst->src[GET_BUFFER_SIZE_SRC_LOD];
2335
2336 inst->opcode = SHADER_OPCODE_SEND;
2337 inst->mlen = inst->exec_size / 8;
2338 inst->resize_sources(3);
2339 inst->ex_mlen = 0;
2340 inst->ex_desc = 0;
2341
2342 /* src[0] & src[1] are filled by setup_surface_descriptors() */
2343 inst->src[2] = lod;
2344
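   /* Buffer sizes are queried with the sampler's resinfo message; the LOD
    * goes in the payload.
    */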
2345 const uint32_t return_format = GFX8_SAMPLER_RETURN_FORMAT_32BITS;
2346
2347 const uint32_t desc = brw_sampler_desc(devinfo, 0, 0,
2348 GFX5_SAMPLER_MESSAGE_SAMPLE_RESINFO,
2349 BRW_SAMPLER_SIMD_MODE_SIMD8,
2350 return_format);
2351
2352 inst->dst = retype(inst->dst, BRW_TYPE_UW);
2353 inst->sfid = BRW_SFID_SAMPLER;
2354 setup_surface_descriptors(bld, inst, desc, surface, surface_handle);
2355 }
2356
2357 bool
2358 brw_fs_lower_logical_sends(fs_visitor &s)
2359 {
2360 const intel_device_info *devinfo = s.devinfo;
2361 bool progress = false;
2362
2363 foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
2364 const fs_builder ibld(&s, block, inst);
2365
2366 switch (inst->opcode) {
2367 case FS_OPCODE_FB_WRITE_LOGICAL:
2368 assert(s.stage == MESA_SHADER_FRAGMENT);
2369 lower_fb_write_logical_send(ibld, inst,
2370 brw_wm_prog_data(s.prog_data),
2371 (const brw_wm_prog_key *)s.key,
2372 s.fs_payload());
2373 break;
2374
2375 case FS_OPCODE_FB_READ_LOGICAL:
2376 lower_fb_read_logical_send(ibld, inst, brw_wm_prog_data(s.prog_data));
2377 break;
2378
2379 case SHADER_OPCODE_TEX_LOGICAL:
2380 case SHADER_OPCODE_TXD_LOGICAL:
2381 case SHADER_OPCODE_TXF_LOGICAL:
2382 case SHADER_OPCODE_TXL_LOGICAL:
2383 case SHADER_OPCODE_TXS_LOGICAL:
2384 case SHADER_OPCODE_IMAGE_SIZE_LOGICAL:
2385 case FS_OPCODE_TXB_LOGICAL:
2386 case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
2387 case SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL:
2388 case SHADER_OPCODE_TXF_MCS_LOGICAL:
2389 case SHADER_OPCODE_LOD_LOGICAL:
2390 case SHADER_OPCODE_TG4_LOGICAL:
2391 case SHADER_OPCODE_TG4_BIAS_LOGICAL:
2392 case SHADER_OPCODE_TG4_EXPLICIT_LOD_LOGICAL:
2393 case SHADER_OPCODE_TG4_IMPLICIT_LOD_LOGICAL:
2394 case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
2395 case SHADER_OPCODE_TG4_OFFSET_LOD_LOGICAL:
2396 case SHADER_OPCODE_TG4_OFFSET_BIAS_LOGICAL:
2397 case SHADER_OPCODE_SAMPLEINFO_LOGICAL:
2398 lower_sampler_logical_send(ibld, inst);
2399 break;
2400
2401 case SHADER_OPCODE_GET_BUFFER_SIZE:
2402 lower_get_buffer_size(ibld, inst);
2403 break;
2404
2405 case SHADER_OPCODE_MEMORY_LOAD_LOGICAL:
2406 case SHADER_OPCODE_MEMORY_STORE_LOGICAL:
2407 case SHADER_OPCODE_MEMORY_ATOMIC_LOGICAL:
2408 if (devinfo->ver >= 20 ||
2409 (devinfo->has_lsc &&
2410 inst->src[MEMORY_LOGICAL_MODE].ud != MEMORY_MODE_TYPED))
2411 lower_lsc_memory_logical_send(ibld, inst);
2412 else
2413 lower_hdc_memory_logical_send(ibld, inst);
2414 break;
2415
2416 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL:
2417 if (devinfo->has_lsc && !s.compiler->indirect_ubos_use_sampler)
2418 lower_lsc_varying_pull_constant_logical_send(ibld, inst);
2419 else
2420 lower_varying_pull_constant_logical_send(ibld, inst);
2421 break;
2422
2423 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
2424 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
2425 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
2426 lower_interpolator_logical_send(ibld, inst,
2427 (const brw_wm_prog_key *)s.key,
2428 brw_wm_prog_data(s.prog_data));
2429 break;
2430
2431 case SHADER_OPCODE_BTD_SPAWN_LOGICAL:
2432 case SHADER_OPCODE_BTD_RETIRE_LOGICAL:
2433 lower_btd_logical_send(ibld, inst);
2434 break;
2435
2436 case RT_OPCODE_TRACE_RAY_LOGICAL:
2437 lower_trace_ray_logical_send(ibld, inst);
2438 break;
2439
2440 case SHADER_OPCODE_URB_READ_LOGICAL:
2441 if (devinfo->ver < 20)
2442 lower_urb_read_logical_send(ibld, inst);
2443 else
2444 lower_urb_read_logical_send_xe2(ibld, inst);
2445 break;
2446
2447 case SHADER_OPCODE_URB_WRITE_LOGICAL:
2448 if (devinfo->ver < 20)
2449 lower_urb_write_logical_send(ibld, inst);
2450 else
2451 lower_urb_write_logical_send_xe2(ibld, inst);
2452
2453 break;
2454
2455 default:
2456 continue;
2457 }
2458
2459 progress = true;
2460 }
2461
2462 if (progress)
2463 s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
2464
2465 return progress;
2466 }
2467
2468 /**
2469 * Turns the generic expression-style uniform pull constant load instruction
2470 * into a hardware-specific series of instructions for loading a pull
2471 * constant.
2472 *
2473 * The expression style allows the CSE pass before this to optimize out
2474 * repeated loads from the same offset, and gives the pre-register-allocation
2475 * scheduling full flexibility, while the conversion to native instructions
2476 * allows the post-register-allocation scheduler the best information
2477 * possible.
2478 *
2479 * Note that execution masking for setting up pull constant loads is special:
2480 * the channels that need to be written are unrelated to the current execution
2481 * mask, since a later instruction will use one of the result channels as a
2482 * source operand for all 8 or 16 of its channels.
2483 */
2484 bool
2485 brw_fs_lower_uniform_pull_constant_loads(fs_visitor &s)
2486 {
2487 const intel_device_info *devinfo = s.devinfo;
2488 bool progress = false;
2489
2490 foreach_block_and_inst (block, fs_inst, inst, s.cfg) {
2491 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2492 continue;
2493
2494 const brw_reg surface = inst->src[PULL_UNIFORM_CONSTANT_SRC_SURFACE];
2495 const brw_reg surface_handle = inst->src[PULL_UNIFORM_CONSTANT_SRC_SURFACE_HANDLE];
2496 const brw_reg offset_B = inst->src[PULL_UNIFORM_CONSTANT_SRC_OFFSET];
2497 const brw_reg size_B = inst->src[PULL_UNIFORM_CONSTANT_SRC_SIZE];
2498 assert(surface.file == BAD_FILE || surface_handle.file == BAD_FILE);
2499 assert(offset_B.file == IMM);
2500 assert(size_B.file == IMM);
2501
2502 if (devinfo->has_lsc) {
2503 const fs_builder ubld =
2504 fs_builder(&s, block, inst).group(8, 0).exec_all();
2505
2506 const brw_reg payload = ubld.vgrf(BRW_TYPE_UD);
2507 ubld.MOV(payload, offset_B);
2508
2509 inst->sfid = GFX12_SFID_UGM;
2510 inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD,
2511 surface_handle.file == BAD_FILE ?
2512 LSC_ADDR_SURFTYPE_BTI :
2513 LSC_ADDR_SURFTYPE_BSS,
2514 LSC_ADDR_SIZE_A32,
2515 LSC_DATA_SIZE_D32,
2516 inst->size_written / 4,
2517 true /* transpose */,
2518 LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS));
2519
2520 /* Update the original instruction. */
2521 inst->opcode = SHADER_OPCODE_SEND;
2522 inst->mlen = lsc_msg_addr_len(devinfo, LSC_ADDR_SIZE_A32, 1);
2523 inst->send_ex_bso = surface_handle.file != BAD_FILE &&
2524 s.compiler->extended_bindless_surface_offset;
2525 inst->ex_mlen = 0;
2526 inst->header_size = 0;
2527 inst->send_has_side_effects = false;
2528 inst->send_is_volatile = true;
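         /* The transposed (block) load reads all its DWords from a single
          * address, so the send executes as SIMD1.
          */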
2529 inst->exec_size = 1;
2530
2531 /* Finally, the payload */
2532
2533 inst->resize_sources(3);
2534 setup_lsc_surface_descriptors(ubld, inst, inst->desc,
2535 surface.file != BAD_FILE ?
2536 surface : surface_handle);
2537 inst->src[2] = payload;
2538
2539 s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
2540 } else {
2541 const fs_builder ubld = fs_builder(&s, block, inst).exec_all();
2542 brw_reg header = fs_builder(&s, 8).exec_all().vgrf(BRW_TYPE_UD);
2543
2544 ubld.group(8, 0).MOV(header,
2545 retype(brw_vec8_grf(0, 0), BRW_TYPE_UD));
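         /* DWord 2 of the header holds the constant buffer offset in OWords. */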
2546 ubld.group(1, 0).MOV(component(header, 2),
2547 brw_imm_ud(offset_B.ud / 16));
2548
2549 inst->sfid = GFX6_SFID_DATAPORT_CONSTANT_CACHE;
2550 inst->opcode = SHADER_OPCODE_SEND;
2551 inst->header_size = 1;
2552 inst->mlen = 1;
2553
2554 uint32_t desc =
2555 brw_dp_oword_block_rw_desc(devinfo, true /* align_16B */,
2556 size_B.ud / 4, false /* write */);
2557
2558 inst->resize_sources(4);
2559
2560 setup_surface_descriptors(ubld, inst, desc, surface, surface_handle);
2561
2562 inst->src[2] = header;
2563 inst->src[3] = brw_reg(); /* unused for reads */
2564
2565 s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
2566 }
2567
2568 progress = true;
2569 }
2570
2571 return progress;
2572 }
2573