/*
 * Copyright © 2010 Intel Corporation
 * SPDX-License-Identifier: MIT
 */

#include "brw_fs.h"
#include "brw_fs_builder.h"

using namespace brw;

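/**
 * Return true if \p inst is a mixed-precision float instruction with a
 * 32-bit float destination, i.e. an instruction that writes F but reads at
 * least one HF source.
 */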
static bool
is_mixed_float_with_fp32_dst(const fs_inst *inst)
{
   if (inst->dst.type != BRW_TYPE_F)
      return false;

   for (int i = 0; i < inst->sources; i++) {
      if (inst->src[i].type == BRW_TYPE_HF)
         return true;
   }

   return false;
}

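/**
 * Return true if \p inst is a mixed-precision float instruction with a
 * packed (stride 1) half-float destination and at least one 32-bit float
 * source.
 */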
static bool
is_mixed_float_with_packed_fp16_dst(const fs_inst *inst)
{
   if (inst->dst.type != BRW_TYPE_HF || inst->dst.stride != 1)
      return false;

   for (int i = 0; i < inst->sources; i++) {
      if (inst->src[i].type == BRW_TYPE_F)
         return true;
   }

   return false;
}

/**
 * Get the closest allowed SIMD width for instruction \p inst accounting for
 * some common regioning and execution control restrictions that apply to FPU
 * instructions.  These restrictions don't necessarily have any relevance to
 * instructions not executed by the FPU pipeline like extended math, control
 * flow or send message instructions.
 *
 * For virtual opcodes it's really up to the instruction -- in some cases
 * (e.g. where a virtual instruction unrolls into a simple sequence of FPU
 * instructions) it may simplify virtual instruction lowering if we can
 * enforce FPU-like regioning restrictions already on the virtual instruction,
 * in other cases (e.g. virtual send-like instructions) this may be
 * excessively restrictive.
 */
static unsigned
get_fpu_lowered_simd_width(const fs_visitor *shader,
                           const fs_inst *inst)
{
   const struct brw_compiler *compiler = shader->compiler;
   const struct intel_device_info *devinfo = compiler->devinfo;

   /* Maximum execution size representable in the instruction controls. */
   unsigned max_width = MIN2(32, inst->exec_size);

   /* Number of channels per polygon handled by a multipolygon PS shader. */
   const unsigned poly_width = shader->dispatch_width /
                               MAX2(1, shader->max_polygons);

   /* Number of registers that will be read by an ATTR source if
    * present for multipolygon PS shaders, since the PS vertex setup
    * data for each polygon is stored in different contiguous GRFs.
    */
   const unsigned attr_reg_count = (shader->stage != MESA_SHADER_FRAGMENT ||
                                    shader->max_polygons < 2 ? 0 :
                                    DIV_ROUND_UP(inst->exec_size,
                                                 poly_width) * reg_unit(devinfo));

   /* According to the PRMs:
    *  "A. In Direct Addressing mode, a source cannot span more than 2
    *      adjacent GRF registers.
    *   B. A destination cannot span more than 2 adjacent GRF registers."
    *
    * Look for the source or destination with the largest register region
    * which is the one that is going to limit the overall execution size of
    * the instruction due to this rule.
    */
   unsigned reg_count = DIV_ROUND_UP(inst->size_written, REG_SIZE);

   for (unsigned i = 0; i < inst->sources; i++)
      reg_count = MAX3(reg_count, DIV_ROUND_UP(inst->size_read(i), REG_SIZE),
                       (inst->src[i].file == ATTR ? attr_reg_count : 0));

   /* Calculate the maximum execution size of the instruction based on the
    * factor by which it goes over the hardware limit of 2 GRFs.
    */
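   /* For example, a SIMD16 instruction whose largest region spans twice
    * max_reg_count would be limited to SIMD8 here.
    */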
   const unsigned max_reg_count = 2 * reg_unit(devinfo);
   if (reg_count > max_reg_count)
      max_width = MIN2(max_width, inst->exec_size / DIV_ROUND_UP(reg_count, max_reg_count));

   /* From the IVB PRMs (applies to HSW too):
    *  "Instructions with condition modifiers must not use SIMD32."
    *
    * From the BDW PRMs (applies to later hardware too):
    *  "Ternary instruction with condition modifiers must not use SIMD32."
    */
   if (inst->conditional_mod && inst->is_3src(compiler) && devinfo->ver < 12)
      max_width = MIN2(max_width, 16);

   /* From the IVB PRMs (applies to other devices that don't have the
    * intel_device_info::supports_simd16_3src flag set):
    *  "In Align16 access mode, SIMD16 is not allowed for DW operations and
    *   SIMD8 is not allowed for DF operations."
    */
   if (inst->is_3src(compiler) && !devinfo->supports_simd16_3src)
      max_width = MIN2(max_width, inst->exec_size / reg_count);

   if (inst->opcode != BRW_OPCODE_MOV) {
      /* From the SKL PRM, Special Restrictions for Handling Mixed Mode
       * Float Operations:
       *
       *    "No SIMD16 in mixed mode when destination is f32. Instruction
       *     execution size must be no more than 8."
       *
       * Testing indicates that this restriction does not apply to MOVs.
       */
      if (is_mixed_float_with_fp32_dst(inst) && devinfo->ver < 20)
         max_width = MIN2(max_width, 8);

      /* From the SKL PRM, Special Restrictions for Handling Mixed Mode
       * Float Operations:
       *
       *    "No SIMD16 in mixed mode when destination is packed f16 for both
       *     Align1 and Align16."
       */
      if (is_mixed_float_with_packed_fp16_dst(inst) && devinfo->ver < 20)
         max_width = MIN2(max_width, 8);
   }

   /* Only power-of-two execution sizes are representable in the instruction
    * control fields.
    */
   return 1 << util_logbase2(max_width);
}

/**
 * Get the maximum allowed SIMD width for instruction \p inst accounting for
 * various payload size restrictions that apply to sampler message
 * instructions.
 *
 * This is only intended to provide a maximum theoretical bound for the
 * execution size of the message based on the number of argument components
 * alone, which in most cases will determine whether the SIMD8 or SIMD16
 * variant of the message can be used, though some messages may have
 * additional restrictions not accounted for here (e.g. pre-ILK hardware uses
 * the message length to determine the exact SIMD width and argument count,
 * which makes a number of sampler message combinations impossible to
 * represent).
 *
 * Note: On platforms with monolithic SIMD16 the possible SIMD widths double,
 * changing from (SIMD8, SIMD16) to (SIMD16, SIMD32).
 */
static unsigned
get_sampler_lowered_simd_width(const struct intel_device_info *devinfo,
                               const fs_inst *inst)
{
   /* If we have a min_lod parameter on anything other than a simple sample
    * message, it will push it over 5 arguments and we have to fall back to
    * SIMD8.
    */
   if (inst->opcode != SHADER_OPCODE_TEX_LOGICAL &&
       inst->components_read(TEX_LOGICAL_SRC_MIN_LOD))
      return devinfo->ver < 20 ? 8 : 16;

   /* On Gfx9+ the LOD argument is for free if we're able to use the LZ
    * variant of the TXL or TXF message.
    */
   const bool implicit_lod = (inst->opcode == SHADER_OPCODE_TXL_LOGICAL ||
                              inst->opcode == SHADER_OPCODE_TXF_LOGICAL) &&
                             inst->src[TEX_LOGICAL_SRC_LOD].is_zero();

   /* Calculate the total number of argument components that need to be passed
    * to the sampler unit.
    */
   assert(inst->src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].file == IMM);
   const unsigned grad_components =
      inst->src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].ud;
   assert(inst->src[TEX_LOGICAL_SRC_COORD_COMPONENTS].file == IMM);
   const unsigned coord_components =
      inst->src[TEX_LOGICAL_SRC_COORD_COMPONENTS].ud;

   unsigned num_payload_components =
      coord_components +
      inst->components_read(TEX_LOGICAL_SRC_SHADOW_C) +
      (implicit_lod ? 0 : inst->components_read(TEX_LOGICAL_SRC_LOD)) +
      inst->components_read(TEX_LOGICAL_SRC_LOD2) +
      inst->components_read(TEX_LOGICAL_SRC_SAMPLE_INDEX) +
      (inst->opcode == SHADER_OPCODE_TG4_OFFSET_LOGICAL ?
       inst->components_read(TEX_LOGICAL_SRC_TG4_OFFSET) : 0) +
      inst->components_read(TEX_LOGICAL_SRC_MCS) +
      inst->components_read(TEX_LOGICAL_SRC_MIN_LOD);

   if (inst->opcode == FS_OPCODE_TXB_LOGICAL && devinfo->ver >= 20) {
      num_payload_components += 3 - coord_components;
   } else if (inst->opcode == SHADER_OPCODE_TXD_LOGICAL &&
              devinfo->verx10 >= 125 && devinfo->ver < 20) {
      num_payload_components +=
         3 - coord_components + (2 - grad_components) * 2;
   } else {
      num_payload_components += 4 - coord_components;
      if (inst->opcode == SHADER_OPCODE_TXD_LOGICAL)
         num_payload_components += (3 - grad_components) * 2;
   }

   const unsigned simd_limit = reg_unit(devinfo) *
      (num_payload_components > MAX_SAMPLER_MESSAGE_SIZE / 2 ? 8 : 16);

   /* SIMD16 (SIMD32 on Xe2) messages with more than five arguments exceed the
    * maximum message size supported by the sampler, regardless of whether a
    * header is provided or not.
    */
   return MIN2(inst->exec_size, simd_limit);
}

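/**
 * Return true if either the destination or any source of \p inst is of
 * half-float type.
 */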
static bool
is_half_float_src_dst(const fs_inst *inst)
{
   if (inst->dst.type == BRW_TYPE_HF)
      return true;

   for (int i = 0; i < inst->sources; i++) {
      if (inst->src[i].type == BRW_TYPE_HF)
         return true;
   }

   return false;
}

/**
 * Get the closest native SIMD width supported by the hardware for instruction
 * \p inst.  The instruction will be left untouched by
 * brw_fs_lower_simd_width() if the returned value is equal to the original
 * execution size.
 */
unsigned
brw_fs_get_lowered_simd_width(const fs_visitor *shader, const fs_inst *inst)
{
   const struct brw_compiler *compiler = shader->compiler;
   const struct intel_device_info *devinfo = compiler->devinfo;

   switch (inst->opcode) {
   case BRW_OPCODE_DP4A:
   case BRW_OPCODE_MOV:
   case BRW_OPCODE_SEL:
   case BRW_OPCODE_NOT:
   case BRW_OPCODE_AND:
   case BRW_OPCODE_OR:
   case BRW_OPCODE_XOR:
   case BRW_OPCODE_SHR:
   case BRW_OPCODE_SHL:
   case BRW_OPCODE_ASR:
   case BRW_OPCODE_ROR:
   case BRW_OPCODE_ROL:
   case BRW_OPCODE_CMPN:
   case BRW_OPCODE_CSEL:
   case BRW_OPCODE_BFREV:
   case BRW_OPCODE_BFE:
   case BRW_OPCODE_ADD:
   case BRW_OPCODE_MUL:
   case BRW_OPCODE_AVG:
   case BRW_OPCODE_FRC:
   case BRW_OPCODE_RNDU:
   case BRW_OPCODE_RNDD:
   case BRW_OPCODE_RNDE:
   case BRW_OPCODE_RNDZ:
   case BRW_OPCODE_LZD:
   case BRW_OPCODE_FBH:
   case BRW_OPCODE_FBL:
   case BRW_OPCODE_CBIT:
   case BRW_OPCODE_MAD:
   case BRW_OPCODE_LRP:
   case BRW_OPCODE_ADD3:
   case FS_OPCODE_PACK:
   case SHADER_OPCODE_SEL_EXEC:
   case SHADER_OPCODE_CLUSTER_BROADCAST:
   case SHADER_OPCODE_MOV_RELOC_IMM:
   case BRW_OPCODE_CMP:
   case BRW_OPCODE_BFI1:
   case BRW_OPCODE_BFI2:
      return get_fpu_lowered_simd_width(shader, inst);

   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS: {
      /* Xe2+: BSpec 56797
       *
       * Math operation rules when half-floats are used on both source and
       * destination operands and both source and destination are packed.
       *
       * The execution size must be 16.
       */
      if (is_half_float_src_dst(inst))
         return devinfo->ver < 20 ? MIN2(8,  inst->exec_size) :
                                    MIN2(16, inst->exec_size);
      return MIN2(16, inst->exec_size);
   }

   case SHADER_OPCODE_POW: {
      /* SIMD16 is only allowed on Gfx7+. Extended Math Function is limited
       * to SIMD8 with half-float operands.
       */
      if (is_half_float_src_dst(inst))
         return MIN2(8,  inst->exec_size);
      return MIN2(16, inst->exec_size);
   }

   case SHADER_OPCODE_USUB_SAT:
   case SHADER_OPCODE_ISUB_SAT:
      return get_fpu_lowered_simd_width(shader, inst);

   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
      /* Integer division is limited to SIMD8 on all generations. */
      return MIN2(8, inst->exec_size);

   case BRW_OPCODE_PLN:
   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
   case FS_OPCODE_PACK_HALF_2x16_SPLIT:
   case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
   case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
   case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
   case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL:
   case FS_OPCODE_DDX_COARSE:
   case FS_OPCODE_DDX_FINE:
   case FS_OPCODE_DDY_COARSE:
   case FS_OPCODE_DDY_FINE:
      return MIN2(16, inst->exec_size);

   case SHADER_OPCODE_MULH:
      /* MULH is lowered to the MUL/MACH sequence using the accumulator, which
       * is 8-wide on Gfx7+.
       */
      return devinfo->ver >= 20 ? 16 : 8;

   case FS_OPCODE_FB_WRITE_LOGICAL:
      if (devinfo->ver >= 20) {
         /* Dual-source FB writes are unsupported in SIMD32 mode. */
         return (inst->src[FB_WRITE_LOGICAL_SRC_COLOR1].file != BAD_FILE ?
                 16 : MIN2(32, inst->exec_size));
      } else {
         /* Dual-source FB writes are unsupported in SIMD16 mode. */
         return (inst->src[FB_WRITE_LOGICAL_SRC_COLOR1].file != BAD_FILE ?
                 8 : MIN2(16, inst->exec_size));
      }

   case FS_OPCODE_FB_READ_LOGICAL:
      return MIN2(16, inst->exec_size);

   case SHADER_OPCODE_TEX_LOGICAL:
   case SHADER_OPCODE_TXF_MCS_LOGICAL:
   case SHADER_OPCODE_LOD_LOGICAL:
   case SHADER_OPCODE_TG4_LOGICAL:
   case SHADER_OPCODE_SAMPLEINFO_LOGICAL:
   case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
   case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
   case SHADER_OPCODE_TG4_BIAS_LOGICAL:
   case SHADER_OPCODE_TG4_EXPLICIT_LOD_LOGICAL:
   case SHADER_OPCODE_TG4_IMPLICIT_LOD_LOGICAL:
   case SHADER_OPCODE_TG4_OFFSET_LOD_LOGICAL:
   case SHADER_OPCODE_TG4_OFFSET_BIAS_LOGICAL:
   case SHADER_OPCODE_TXL_LOGICAL:
   case FS_OPCODE_TXB_LOGICAL:
   case SHADER_OPCODE_TXF_LOGICAL:
   case SHADER_OPCODE_TXS_LOGICAL:
      return get_sampler_lowered_simd_width(devinfo, inst);

   case SHADER_OPCODE_MEMORY_LOAD_LOGICAL:
   case SHADER_OPCODE_MEMORY_STORE_LOGICAL:
   case SHADER_OPCODE_MEMORY_ATOMIC_LOGICAL:
      if (devinfo->ver >= 20)
         return inst->exec_size;

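      /* Typed surface messages are limited to SIMD8 here (Xe2+ already
       * returned above).
       */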
      if (inst->src[MEMORY_LOGICAL_MODE].ud == MEMORY_MODE_TYPED)
         return 8;

      /* HDC A64 atomics are limited to SIMD8 */
      if (!devinfo->has_lsc &&
          inst->src[MEMORY_LOGICAL_BINDING_TYPE].ud == LSC_ADDR_SURFTYPE_FLAT
          && lsc_opcode_is_atomic((enum lsc_opcode)
                                  inst->src[MEMORY_LOGICAL_OPCODE].ud))
         return 8;

      return MIN2(16, inst->exec_size);

   /* On gfx12 parameters are fixed to 16-bit values and therefore they always
    * fit regardless of the execution size.
    */
   case SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL:
      return MIN2(16, inst->exec_size);

   case SHADER_OPCODE_TXD_LOGICAL:
      /* TXD is unsupported in SIMD16 mode prior to Xe2. SIMD32 is still
       * unsupported on Xe2.
       */
      return devinfo->ver < 20 ? 8 : 16;

   case SHADER_OPCODE_URB_READ_LOGICAL:
   case SHADER_OPCODE_URB_WRITE_LOGICAL:
      return MIN2(devinfo->ver < 20 ? 8 : 16, inst->exec_size);

   case SHADER_OPCODE_QUAD_SWIZZLE: {
      const unsigned swiz = inst->src[1].ud;
      return (is_uniform(inst->src[0]) ?
                 get_fpu_lowered_simd_width(shader, inst) :
              devinfo->ver < 11 && brw_type_size_bytes(inst->src[0].type) == 4 ? 8 :
              swiz == BRW_SWIZZLE_XYXY || swiz == BRW_SWIZZLE_ZWZW ? 4 :
              get_fpu_lowered_simd_width(shader, inst));
   }
   case SHADER_OPCODE_MOV_INDIRECT: {
      /* From IVB and HSW PRMs:
       *
       * "2. When the destination requires two registers and the sources are
       *  indirect, the sources must use 1x1 regioning mode."
       *
       * In case of DF instructions in HSW/IVB, the exec_size is limited by
       * the EU decompression logic not handling VxH indirect addressing
       * correctly.
       */
      const unsigned max_size = 2 * REG_SIZE;
      /* Prior to Broadwell, we only have 8 address subregisters. */
      return MIN3(16,
                  max_size / (inst->dst.stride * brw_type_size_bytes(inst->dst.type)),
                  inst->exec_size);
   }

   case SHADER_OPCODE_LOAD_PAYLOAD: {
      const unsigned reg_count =
         DIV_ROUND_UP(inst->dst.component_size(inst->exec_size), REG_SIZE);

      if (reg_count > 2) {
         /* Only LOAD_PAYLOAD instructions with per-channel destination region
          * can be easily lowered (which excludes headers and heterogeneous
          * types).
          */
         assert(!inst->header_size);
         for (unsigned i = 0; i < inst->sources; i++)
            assert(brw_type_size_bits(inst->dst.type) == brw_type_size_bits(inst->src[i].type) ||
                   inst->src[i].file == BAD_FILE);

         return inst->exec_size / DIV_ROUND_UP(reg_count, 2);
      } else {
         return inst->exec_size;
      }
   }
   default:
      return inst->exec_size;
   }
}

/**
 * Return true if splitting out the group of channels of instruction \p inst
 * given by lbld.group() requires allocating a temporary for the i-th source
 * of the lowered instruction.
 */
static inline bool
needs_src_copy(const fs_builder &lbld, const fs_inst *inst, unsigned i)
{
   /* The indirectly indexed register stays the same even if we split the
    * instruction.
    */
   if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0)
      return false;

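   /* A temporary is needed unless the source is either periodic within the
    * lowered dispatch width or a single component that isn't being widened.
    * It is also needed whenever the source overlaps a flag register written
    * by the instruction.
    */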
   return !(is_periodic(inst->src[i], lbld.dispatch_width()) ||
            (inst->components_read(i) == 1 &&
             lbld.dispatch_width() <= inst->exec_size)) ||
          (inst->flags_written(lbld.shader->devinfo) &
           brw_fs_flag_mask(inst->src[i], brw_type_size_bytes(inst->src[i].type)));
}

/**
 * Extract the data that would be consumed by the channel group given by
 * lbld.group() from the i-th source region of instruction \p inst and return
 * it as result in packed form.
 */
static brw_reg
emit_unzip(const fs_builder &lbld, fs_inst *inst, unsigned i)
{
   assert(lbld.group() >= inst->group);

   /* Specified channel group from the source region. */
   const brw_reg src = horiz_offset(inst->src[i], lbld.group() - inst->group);

   if (needs_src_copy(lbld, inst, i)) {
      const unsigned num_components = inst->components_read(i);
      const brw_reg tmp = lbld.vgrf(inst->src[i].type, num_components);

      brw_reg comps[num_components];
      for (unsigned k = 0; k < num_components; ++k)
         comps[k] = offset(src, inst->exec_size, k);
      lbld.VEC(tmp, comps, num_components);

      return tmp;
   } else if (is_periodic(inst->src[i], lbld.dispatch_width()) ||
              (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0)) {
      /* The source is invariant for all dispatch_width-wide groups of the
       * original region.
       *
       * The src[0] of MOV_INDIRECT is invariant regardless of the execution
       * size.
       */
      return inst->src[i];

   } else {
      /* We can just point the lowered instruction at the right channel group
       * from the original region.
       */
      return src;
   }
}

/**
 * Return true if splitting out the group of channels of instruction \p inst
 * given by lbld.group() requires allocating a temporary for the destination
 * of the lowered instruction and copying the data back to the original
 * destination region.
 */
static inline bool
needs_dst_copy(const fs_builder &lbld, const fs_inst *inst)
{
   if (inst->dst.is_null())
      return false;

   /* If the instruction writes more than one component we'll have to shuffle
    * the results of multiple lowered instructions in order to make sure that
    * they end up arranged correctly in the original destination region.
    */
   if (inst->size_written > inst->dst.component_size(inst->exec_size))
      return true;

   for (unsigned i = 0; i < inst->sources; i++) {
      /* If we already made a copy of the source for other reasons there won't
       * be any overlap with the destination.
       */
      if (needs_src_copy(lbld, inst, i))
         continue;

      /* In order to keep the logic simple we emit a copy whenever the
       * destination region doesn't exactly match an overlapping source, which
       * may point at the source and destination not being aligned group by
       * group which could cause one of the lowered instructions to overwrite
       * the data read from the same source by other lowered instructions.
       */
      if (regions_overlap(inst->dst, inst->size_written,
                          inst->src[i], inst->size_read(i)) &&
          !inst->dst.equals(inst->src[i]))
         return true;
   }

   return false;
}

/**
 * Insert data from a packed temporary into the channel group given by
 * lbld.group() of the destination region of instruction \p inst and return
 * the temporary as result.  Any copy instructions that are required for
 * unzipping the previous value (in the case of partial writes) will be
 * inserted using \p lbld_before and any copy instructions required for
 * zipping up the destination of \p inst will be inserted using \p lbld_after.
 */
static brw_reg
emit_zip(const fs_builder &lbld_before, const fs_builder &lbld_after,
         fs_inst *inst)
{
   assert(lbld_before.dispatch_width() == lbld_after.dispatch_width());
   assert(lbld_before.group() == lbld_after.group());
   assert(lbld_after.group() >= inst->group);

   const struct intel_device_info *devinfo = lbld_before.shader->devinfo;

   /* Specified channel group from the destination region. */
   const brw_reg dst = horiz_offset(inst->dst, lbld_after.group() - inst->group);

   if (!needs_dst_copy(lbld_after, inst)) {
      /* No need to allocate a temporary for the lowered instruction, just
       * take the right group of channels from the original region.
       */
      return dst;
   }

   /* Deal with the residency data part later */
   const unsigned residency_size = inst->has_sampler_residency() ?
      (reg_unit(devinfo) * REG_SIZE) : 0;
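   /* Number of components written by the original instruction, not counting
    * the trailing sampler residency data.
    */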
   const unsigned dst_size = (inst->size_written - residency_size) /
      inst->dst.component_size(inst->exec_size);

   const brw_reg tmp = lbld_after.vgrf(inst->dst.type,
                                       dst_size + inst->has_sampler_residency());

   if (inst->predicate) {
      /* Handle predication by copying the original contents of the
       * destination into the temporary before emitting the lowered
       * instruction.
       */
      for (unsigned k = 0; k < dst_size; ++k) {
         lbld_before.MOV(offset(tmp, lbld_before, k),
                         offset(dst, inst->exec_size, k));
      }
   }

   for (unsigned k = 0; k < dst_size; ++k) {
      /* Copy the (split) temp into the original (larger) destination */
      lbld_after.MOV(offset(dst, inst->exec_size, k),
                     offset(tmp, lbld_after, k));
   }

   if (inst->has_sampler_residency()) {
      /* Sampler messages with residency need special attention. The first
       * lane of the last component holds the Pixel Null Mask (bits 0:15)
       * along with some upper bits we need to discard (bits 16:31). We have
       * to build a single 32-bit value for the SIMD32 message out of two
       * SIMD16 16-bit values.
       */
      const fs_builder rbld = lbld_after.exec_all().group(1, 0);
      brw_reg local_res_reg = component(
         retype(offset(tmp, lbld_before, dst_size), BRW_TYPE_UW), 0);
      brw_reg final_res_reg =
         retype(byte_offset(inst->dst,
                            inst->size_written - residency_size +
                            lbld_after.group() / 8), BRW_TYPE_UW);
      rbld.MOV(final_res_reg, local_res_reg);
   }

   return tmp;
}

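/**
 * Split every instruction whose execution size exceeds the SIMD width
 * reported by brw_fs_get_lowered_simd_width() into a sequence of narrower
 * instructions, unzipping the sources and zipping up the destination of each
 * channel group as needed.
 */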
bool
brw_fs_lower_simd_width(fs_visitor &s)
{
   bool progress = false;

   foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
      const unsigned lower_width = brw_fs_get_lowered_simd_width(&s, inst);

      /* No splitting required */
      if (lower_width == inst->exec_size)
         continue;

      assert(lower_width < inst->exec_size);

      /* Builder matching the original instruction. */
      const fs_builder bld = fs_builder(&s).at_end();
      const fs_builder ibld =
         bld.at(block, inst).exec_all(inst->force_writemask_all)
            .group(inst->exec_size, inst->group / inst->exec_size);

      /* Split the copies in chunks of the execution width of either the
       * original or the lowered instruction, whichever is lower.
       */
      const unsigned n = DIV_ROUND_UP(inst->exec_size, lower_width);
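      /* E.g. a SIMD32 instruction lowered to SIMD8 is replaced by n == 4
       * SIMD8 instructions, one per group of 8 channels.
       */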
      const unsigned residency_size = inst->has_sampler_residency() ?
         (reg_unit(s.devinfo) * REG_SIZE) : 0;
      const unsigned dst_size =
         (inst->size_written - residency_size) /
         inst->dst.component_size(inst->exec_size);

      assert(!inst->writes_accumulator && !inst->mlen);

      /* Inserting the zip, unzip, and duplicated instructions in all of
       * the right spots is somewhat tricky.  All of the unzip and any
       * instructions from the zip which unzip the destination prior to
       * writing need to happen before all of the per-group instructions
       * and the zip instructions need to happen after.  In order to sort
       * this all out, we insert the unzip instructions before \p inst,
       * insert the per-group instructions after \p inst (i.e. before
       * inst->next), and insert the zip instructions before the
       * instruction after \p inst.  Since we are inserting instructions
       * after \p inst, inst->next is a moving target and we need to save
       * it off here so that we insert the zip instructions in the right
       * place.
       *
       * Since we're inserting split instructions after after_inst, the
       * instructions will end up in the reverse order that we insert them.
       * However, certain render target writes require that the low group
       * instructions come before the high group.  From the Ivy Bridge PRM
       * Vol. 4, Pt. 1, Section 3.9.11:
       *
       *    "If multiple SIMD8 Dual Source messages are delivered by the
       *    pixel shader thread, each SIMD8_DUALSRC_LO message must be
       *    issued before the SIMD8_DUALSRC_HI message with the same Slot
       *    Group Select setting."
       *
       * And, from Section 3.9.11.1 of the same PRM:
       *
       *    "When SIMD32 or SIMD16 PS threads send render target writes
       *    with multiple SIMD8 and SIMD16 messages, the following must
       *    hold:
       *
       *    All the slots (as described above) must have a corresponding
       *    render target write irrespective of the slot's validity. A slot
       *    is considered valid when at least one sample is enabled. For
       *    example, a SIMD16 PS thread must send two SIMD8 render target
       *    writes to cover all the slots.
       *
       *    PS thread must send SIMD render target write messages with
       *    increasing slot numbers. For example, SIMD16 thread has
       *    Slot[15:0] and if two SIMD8 render target writes are used, the
       *    first SIMD8 render target write must send Slot[7:0] and the
       *    next one must send Slot[15:8]."
       *
       * In order to make low group instructions come before high group
       * instructions (this is required for some render target writes), we
       * split from the highest group to lowest.
       */
      exec_node *const after_inst = inst->next;
      for (int i = n - 1; i >= 0; i--) {
         /* Emit a copy of the original instruction with the lowered width.
          * If the EOT flag was set throw it away except for the last
          * instruction to avoid killing the thread prematurely.
          */
         fs_inst split_inst = *inst;
         split_inst.exec_size = lower_width;
         split_inst.eot = inst->eot && i == int(n - 1);

         /* Select the correct channel enables for the i-th group, then
          * transform the sources and destination and emit the lowered
          * instruction.
          */
         const fs_builder lbld = ibld.group(lower_width, i);

         for (unsigned j = 0; j < inst->sources; j++)
            split_inst.src[j] = emit_unzip(lbld.at(block, inst), inst, j);

         split_inst.dst = emit_zip(lbld.at(block, inst),
                                   lbld.at(block, after_inst), inst);
         split_inst.size_written =
            split_inst.dst.component_size(lower_width) * dst_size +
            residency_size;

         lbld.at(block, inst->next).emit(split_inst);
      }

      inst->remove(block);
      progress = true;
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);

   return progress;
}