xref: /aosp_15_r20/external/mesa3d/src/intel/compiler/elk/elk_eu_emit.c (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 /*
2  Copyright (C) Intel Corp.  2006.  All Rights Reserved.
3  Intel funded Tungsten Graphics to
4  develop this 3D driver.
5 
6  Permission is hereby granted, free of charge, to any person obtaining
7  a copy of this software and associated documentation files (the
8  "Software"), to deal in the Software without restriction, including
9  without limitation the rights to use, copy, modify, merge, publish,
10  distribute, sublicense, and/or sell copies of the Software, and to
11  permit persons to whom the Software is furnished to do so, subject to
12  the following conditions:
13 
14  The above copyright notice and this permission notice (including the
15  next paragraph) shall be included in all copies or substantial
16  portions of the Software.
17 
18  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21  IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 
26  **********************************************************************/
27  /*
28   * Authors:
29   *   Keith Whitwell <[email protected]>
30   */
31 
32 
33 #include "elk_eu_defines.h"
34 #include "elk_eu.h"
35 
36 #include "util/ralloc.h"
37 
38 /**
39  * Prior to Sandybridge, the SEND instruction accepted non-MRF source
40  * registers, implicitly moving the operand to a message register.
41  *
42  * On Sandybridge, this is no longer the case.  This function performs the
43  * explicit move; it should be called before emitting a SEND instruction.
44  */
45 void
46 elk_gfx6_resolve_implied_move(struct elk_codegen *p,
47 			  struct elk_reg *src,
48 			  unsigned msg_reg_nr)
49 {
50    const struct intel_device_info *devinfo = p->devinfo;
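   /* Gfx4-5 hardware performs the implied move to the MRF itself, so there
    * is nothing to resolve on those platforms.
    */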
51    if (devinfo->ver < 6)
52       return;
53 
54    if (src->file == ELK_MESSAGE_REGISTER_FILE)
55       return;
56 
57    if (src->file != ELK_ARCHITECTURE_REGISTER_FILE || src->nr != ELK_ARF_NULL) {
58       elk_push_insn_state(p);
59       elk_set_default_exec_size(p, ELK_EXECUTE_8);
60       elk_set_default_mask_control(p, ELK_MASK_DISABLE);
61       elk_set_default_compression_control(p, ELK_COMPRESSION_NONE);
62       elk_MOV(p, retype(elk_message_reg(msg_reg_nr), ELK_REGISTER_TYPE_UD),
63 	      retype(*src, ELK_REGISTER_TYPE_UD));
64       elk_pop_insn_state(p);
65    }
66    *src = elk_message_reg(msg_reg_nr);
67 }
68 
69 static void
70 gfx7_convert_mrf_to_grf(struct elk_codegen *p, struct elk_reg *reg)
71 {
72    /* From the Ivybridge PRM, Volume 4 Part 3, page 218 ("send"):
73     * "The send with EOT should use register space R112-R127 for <src>. This is
74     *  to enable loading of a new thread into the same slot while the message
75     *  with EOT for current thread is pending dispatch."
76     *
77     * Since we're pretending to have 16 MRFs anyway, we may as well use the
78     * registers required for messages with EOT.
79     */
80    const struct intel_device_info *devinfo = p->devinfo;
81    if (devinfo->ver >= 7 && reg->file == ELK_MESSAGE_REGISTER_FILE) {
82       reg->file = ELK_GENERAL_REGISTER_FILE;
83       reg->nr += GFX7_MRF_HACK_START;
84    }
85 }
86 
87 void
88 elk_set_dest(struct elk_codegen *p, elk_inst *inst, struct elk_reg dest)
89 {
90    const struct intel_device_info *devinfo = p->devinfo;
91 
92    if (dest.file == ELK_MESSAGE_REGISTER_FILE)
93       assert((dest.nr & ~ELK_MRF_COMPR4) < ELK_MAX_MRF(devinfo->ver));
94    else if (dest.file == ELK_GENERAL_REGISTER_FILE)
95       assert(dest.nr < XE2_MAX_GRF);
96 
97    /* The hardware has a restriction where a destination of size Byte with
98     * a stride of 1 is only allowed for a packed byte MOV. For any other
99     * instruction, the stride must be at least 2, even when the destination
100     * is the NULL register.
101     */
102    if (dest.file == ELK_ARCHITECTURE_REGISTER_FILE &&
103        dest.nr == ELK_ARF_NULL &&
104        type_sz(dest.type) == 1 &&
105        dest.hstride == ELK_HORIZONTAL_STRIDE_1) {
106       dest.hstride = ELK_HORIZONTAL_STRIDE_2;
107    }
108 
109    gfx7_convert_mrf_to_grf(p, &dest);
110 
111    elk_inst_set_dst_file_type(devinfo, inst, dest.file, dest.type);
112    elk_inst_set_dst_address_mode(devinfo, inst, dest.address_mode);
113 
114    if (dest.address_mode == ELK_ADDRESS_DIRECT) {
115       elk_inst_set_dst_da_reg_nr(devinfo, inst, phys_nr(devinfo, dest));
116 
117       if (elk_inst_access_mode(devinfo, inst) == ELK_ALIGN_1) {
118          elk_inst_set_dst_da1_subreg_nr(devinfo, inst, phys_subnr(devinfo, dest));
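         /* A destination region may not use a horizontal stride of 0;
          * promote it to the minimum legal stride of 1.
          */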
119          if (dest.hstride == ELK_HORIZONTAL_STRIDE_0)
120             dest.hstride = ELK_HORIZONTAL_STRIDE_1;
121          elk_inst_set_dst_hstride(devinfo, inst, dest.hstride);
122       } else {
123          elk_inst_set_dst_da16_subreg_nr(devinfo, inst, dest.subnr / 16);
124          elk_inst_set_da16_writemask(devinfo, inst, dest.writemask);
125          if (dest.file == ELK_GENERAL_REGISTER_FILE ||
126              dest.file == ELK_MESSAGE_REGISTER_FILE) {
127             assert(dest.writemask != 0);
128          }
129          /* From the Ivybridge PRM, Vol 4, Part 3, Section 5.2.4.1:
130           *    Although Dst.HorzStride is a don't care for Align16, HW needs
131           *    this to be programmed as "01".
132           */
133          elk_inst_set_dst_hstride(devinfo, inst, 1);
134       }
135    } else {
136       elk_inst_set_dst_ia_subreg_nr(devinfo, inst, phys_subnr(devinfo, dest));
137 
138       /* These are different sizes in align1 vs align16:
139        */
140       if (elk_inst_access_mode(devinfo, inst) == ELK_ALIGN_1) {
141          elk_inst_set_dst_ia1_addr_imm(devinfo, inst,
142                                        dest.indirect_offset);
143          if (dest.hstride == ELK_HORIZONTAL_STRIDE_0)
144             dest.hstride = ELK_HORIZONTAL_STRIDE_1;
145          elk_inst_set_dst_hstride(devinfo, inst, dest.hstride);
146       } else {
147          elk_inst_set_dst_ia16_addr_imm(devinfo, inst,
148                                         dest.indirect_offset);
149          /* even ignored in da16, still need to set as '01' */
150          elk_inst_set_dst_hstride(devinfo, inst, 1);
151       }
152    }
153 
154    /* Generators should set a default exec_size of either 8 (SIMD4x2 or SIMD8)
155     * or 16 (SIMD16), as that's normally correct.  However, when dealing with
156     * small registers, it can be useful for us to automatically reduce it to
157     * match the register size.
158     */
159    if (p->automatic_exec_sizes) {
160       /*
161        * In platforms that support fp64 we can emit instructions with a width
162        * of 4 that need two SIMD8 registers and an exec_size of 8 or 16. In
163        * these cases we need to make sure that these instructions have their
164        * exec sizes set properly when they are emitted and we can't rely on
165        * this code to fix it.
166        */
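      /* For example, a MOV whose destination region is a single component
       * (width 1) gets its execution size reduced to 1 here, while a
       * width-4 fp64 operand is left for the caller to size explicitly,
       * as described above.
       */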
167       bool fix_exec_size;
168       if (devinfo->ver >= 6)
169          fix_exec_size = dest.width < ELK_EXECUTE_4;
170       else
171          fix_exec_size = dest.width < ELK_EXECUTE_8;
172 
173       if (fix_exec_size)
174          elk_inst_set_exec_size(devinfo, inst, dest.width);
175    }
176 }
177 
178 void
179 elk_set_src0(struct elk_codegen *p, elk_inst *inst, struct elk_reg reg)
180 {
181    const struct intel_device_info *devinfo = p->devinfo;
182 
183    if (reg.file == ELK_MESSAGE_REGISTER_FILE)
184       assert((reg.nr & ~ELK_MRF_COMPR4) < ELK_MAX_MRF(devinfo->ver));
185    else if (reg.file == ELK_GENERAL_REGISTER_FILE)
186       assert(reg.nr < XE2_MAX_GRF);
187 
188    gfx7_convert_mrf_to_grf(p, &reg);
189 
190    if (devinfo->ver >= 6 &&
191        (elk_inst_opcode(p->isa, inst) == ELK_OPCODE_SEND ||
192         elk_inst_opcode(p->isa, inst) == ELK_OPCODE_SENDC)) {
193       /* Any source modifiers or regions will be ignored, since this just
194        * identifies the MRF/GRF to start reading the message contents from.
195        * Check for some likely failures.
196        */
197       assert(!reg.negate);
198       assert(!reg.abs);
199       assert(reg.address_mode == ELK_ADDRESS_DIRECT);
200    }
201 
202    elk_inst_set_src0_file_type(devinfo, inst, reg.file, reg.type);
203    elk_inst_set_src0_abs(devinfo, inst, reg.abs);
204    elk_inst_set_src0_negate(devinfo, inst, reg.negate);
205    elk_inst_set_src0_address_mode(devinfo, inst, reg.address_mode);
206 
207    if (reg.file == ELK_IMMEDIATE_VALUE) {
208       if (reg.type == ELK_REGISTER_TYPE_DF ||
209           elk_inst_opcode(p->isa, inst) == ELK_OPCODE_DIM)
210          elk_inst_set_imm_df(devinfo, inst, reg.df);
211       else if (reg.type == ELK_REGISTER_TYPE_UQ ||
212                reg.type == ELK_REGISTER_TYPE_Q)
213          elk_inst_set_imm_uq(devinfo, inst, reg.u64);
214       else
215          elk_inst_set_imm_ud(devinfo, inst, reg.ud);
216 
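      /* A 64-bit immediate occupies the encoding bits normally used for
       * src1, so the fields below can only be written for smaller
       * immediates; a non-present src1 is expected to mirror src0's
       * register file and type when src0 is an immediate.
       */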
217       if (type_sz(reg.type) < 8) {
218          elk_inst_set_src1_reg_file(devinfo, inst,
219                                     ELK_ARCHITECTURE_REGISTER_FILE);
220          elk_inst_set_src1_reg_hw_type(devinfo, inst,
221                                        elk_inst_src0_reg_hw_type(devinfo, inst));
222       }
223    } else {
224       if (reg.address_mode == ELK_ADDRESS_DIRECT) {
225          elk_inst_set_src0_da_reg_nr(devinfo, inst, phys_nr(devinfo, reg));
226          if (elk_inst_access_mode(devinfo, inst) == ELK_ALIGN_1) {
227             elk_inst_set_src0_da1_subreg_nr(devinfo, inst, phys_subnr(devinfo, reg));
228          } else {
229             elk_inst_set_src0_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
230          }
231       } else {
232          elk_inst_set_src0_ia_subreg_nr(devinfo, inst, phys_subnr(devinfo, reg));
233 
234          if (elk_inst_access_mode(devinfo, inst) == ELK_ALIGN_1) {
235             elk_inst_set_src0_ia1_addr_imm(devinfo, inst, reg.indirect_offset);
236          } else {
237             elk_inst_set_src0_ia16_addr_imm(devinfo, inst, reg.indirect_offset);
238          }
239       }
240 
241       if (elk_inst_access_mode(devinfo, inst) == ELK_ALIGN_1) {
242          if (reg.width == ELK_WIDTH_1 &&
243              elk_inst_exec_size(devinfo, inst) == ELK_EXECUTE_1) {
244             elk_inst_set_src0_hstride(devinfo, inst, ELK_HORIZONTAL_STRIDE_0);
245             elk_inst_set_src0_width(devinfo, inst, ELK_WIDTH_1);
246             elk_inst_set_src0_vstride(devinfo, inst, ELK_VERTICAL_STRIDE_0);
247          } else {
248             elk_inst_set_src0_hstride(devinfo, inst, reg.hstride);
249             elk_inst_set_src0_width(devinfo, inst, reg.width);
250             elk_inst_set_src0_vstride(devinfo, inst, reg.vstride);
251          }
252       } else {
253          elk_inst_set_src0_da16_swiz_x(devinfo, inst,
254             ELK_GET_SWZ(reg.swizzle, ELK_CHANNEL_X));
255          elk_inst_set_src0_da16_swiz_y(devinfo, inst,
256             ELK_GET_SWZ(reg.swizzle, ELK_CHANNEL_Y));
257          elk_inst_set_src0_da16_swiz_z(devinfo, inst,
258             ELK_GET_SWZ(reg.swizzle, ELK_CHANNEL_Z));
259          elk_inst_set_src0_da16_swiz_w(devinfo, inst,
260             ELK_GET_SWZ(reg.swizzle, ELK_CHANNEL_W));
261 
262          if (reg.vstride == ELK_VERTICAL_STRIDE_8) {
263             /* This is an oddity of the fact we're using the same
264              * descriptions for registers in align_16 as align_1:
265              */
266             elk_inst_set_src0_vstride(devinfo, inst, ELK_VERTICAL_STRIDE_4);
267          } else if (devinfo->verx10 == 70 &&
268                     reg.type == ELK_REGISTER_TYPE_DF &&
269                     reg.vstride == ELK_VERTICAL_STRIDE_2) {
270             /* From SNB PRM:
271              *
272              * "For Align16 access mode, only encodings of 0000 and 0011
273              *  are allowed. Other codes are reserved."
274              *
275              * Presumably the DevSNB behavior applies to IVB as well.
276              */
277             elk_inst_set_src0_vstride(devinfo, inst, ELK_VERTICAL_STRIDE_4);
278          } else {
279             elk_inst_set_src0_vstride(devinfo, inst, reg.vstride);
280          }
281       }
282    }
283 }
284 
285 
286 void
287 elk_set_src1(struct elk_codegen *p, elk_inst *inst, struct elk_reg reg)
288 {
289    const struct intel_device_info *devinfo = p->devinfo;
290 
291    if (reg.file == ELK_GENERAL_REGISTER_FILE)
292       assert(reg.nr < XE2_MAX_GRF);
293 
294    {
295       /* From the IVB PRM Vol. 4, Pt. 3, Section 3.3.3.5:
296        *
297        *    "Accumulator registers may be accessed explicitly as src0
298        *    operands only."
299        */
300       assert(reg.file != ELK_ARCHITECTURE_REGISTER_FILE ||
301              reg.nr != ELK_ARF_ACCUMULATOR);
302 
303       gfx7_convert_mrf_to_grf(p, &reg);
304       assert(reg.file != ELK_MESSAGE_REGISTER_FILE);
305 
306       elk_inst_set_src1_file_type(devinfo, inst, reg.file, reg.type);
307       elk_inst_set_src1_abs(devinfo, inst, reg.abs);
308       elk_inst_set_src1_negate(devinfo, inst, reg.negate);
309 
310       /* Only src1 can be immediate in two-argument instructions.
311        */
312       assert(elk_inst_src0_reg_file(devinfo, inst) != ELK_IMMEDIATE_VALUE);
313 
314       if (reg.file == ELK_IMMEDIATE_VALUE) {
315          /* two-argument instructions can only use 32-bit immediates */
316          assert(type_sz(reg.type) < 8);
317          elk_inst_set_imm_ud(devinfo, inst, reg.ud);
318       } else {
319          /* This is a hardware restriction, which may or may not be lifted
320           * in the future:
321           */
322          assert (reg.address_mode == ELK_ADDRESS_DIRECT);
323          /* assert (reg.file == ELK_GENERAL_REGISTER_FILE); */
324 
325          elk_inst_set_src1_da_reg_nr(devinfo, inst, phys_nr(devinfo, reg));
326          if (elk_inst_access_mode(devinfo, inst) == ELK_ALIGN_1) {
327             elk_inst_set_src1_da1_subreg_nr(devinfo, inst, phys_subnr(devinfo, reg));
328          } else {
329             elk_inst_set_src1_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
330          }
331 
332          if (elk_inst_access_mode(devinfo, inst) == ELK_ALIGN_1) {
333             if (reg.width == ELK_WIDTH_1 &&
334                 elk_inst_exec_size(devinfo, inst) == ELK_EXECUTE_1) {
335                elk_inst_set_src1_hstride(devinfo, inst, ELK_HORIZONTAL_STRIDE_0);
336                elk_inst_set_src1_width(devinfo, inst, ELK_WIDTH_1);
337                elk_inst_set_src1_vstride(devinfo, inst, ELK_VERTICAL_STRIDE_0);
338             } else {
339                elk_inst_set_src1_hstride(devinfo, inst, reg.hstride);
340                elk_inst_set_src1_width(devinfo, inst, reg.width);
341                elk_inst_set_src1_vstride(devinfo, inst, reg.vstride);
342             }
343          } else {
344             elk_inst_set_src1_da16_swiz_x(devinfo, inst,
345                ELK_GET_SWZ(reg.swizzle, ELK_CHANNEL_X));
346             elk_inst_set_src1_da16_swiz_y(devinfo, inst,
347                ELK_GET_SWZ(reg.swizzle, ELK_CHANNEL_Y));
348             elk_inst_set_src1_da16_swiz_z(devinfo, inst,
349                ELK_GET_SWZ(reg.swizzle, ELK_CHANNEL_Z));
350             elk_inst_set_src1_da16_swiz_w(devinfo, inst,
351                ELK_GET_SWZ(reg.swizzle, ELK_CHANNEL_W));
352 
353             if (reg.vstride == ELK_VERTICAL_STRIDE_8) {
354                /* This is an oddity of the fact we're using the same
355                 * descriptions for registers in align_16 as align_1:
356                 */
357                elk_inst_set_src1_vstride(devinfo, inst, ELK_VERTICAL_STRIDE_4);
358             } else if (devinfo->verx10 == 70 &&
359                        reg.type == ELK_REGISTER_TYPE_DF &&
360                        reg.vstride == ELK_VERTICAL_STRIDE_2) {
361                /* From SNB PRM:
362                 *
363                 * "For Align16 access mode, only encodings of 0000 and 0011
364                 *  are allowed. Other codes are reserved."
365                 *
366                 * Presumably the DevSNB behavior applies to IVB as well.
367                 */
368                elk_inst_set_src1_vstride(devinfo, inst, ELK_VERTICAL_STRIDE_4);
369             } else {
370                elk_inst_set_src1_vstride(devinfo, inst, reg.vstride);
371             }
372          }
373       }
374    }
375 }
376 
377 /**
378  * Specify the descriptor and extended descriptor immediate for a SEND(C)
379  * message instruction.
380  */
381 void
382 elk_set_desc_ex(struct elk_codegen *p, elk_inst *inst,
383                 unsigned desc, unsigned ex_desc)
384 {
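   /* The platforms elk targets predate split-send extended descriptors, so
    * ex_desc is not encoded here; only the immediate message descriptor is
    * programmed.
    */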
385    const struct intel_device_info *devinfo = p->devinfo;
386    assert(elk_inst_opcode(p->isa, inst) == ELK_OPCODE_SEND ||
387           elk_inst_opcode(p->isa, inst) == ELK_OPCODE_SENDC);
388    elk_inst_set_src1_file_type(devinfo, inst,
389                                ELK_IMMEDIATE_VALUE, ELK_REGISTER_TYPE_UD);
390    elk_inst_set_send_desc(devinfo, inst, desc);
391 }
392 
393 static void elk_set_math_message( struct elk_codegen *p,
394 				  elk_inst *inst,
395 				  unsigned function,
396 				  unsigned integer_type,
397 				  bool low_precision,
398 				  unsigned dataType )
399 {
400    const struct intel_device_info *devinfo = p->devinfo;
401    unsigned msg_length;
402    unsigned response_length;
403 
404    /* Infer message length from the function */
405    switch (function) {
406    case ELK_MATH_FUNCTION_POW:
407    case ELK_MATH_FUNCTION_INT_DIV_QUOTIENT:
408    case ELK_MATH_FUNCTION_INT_DIV_REMAINDER:
409    case ELK_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
410       msg_length = 2;
411       break;
412    default:
413       msg_length = 1;
414       break;
415    }
416 
417    /* Infer response length from the function */
418    switch (function) {
419    case ELK_MATH_FUNCTION_SINCOS:
420    case ELK_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
421       response_length = 2;
422       break;
423    default:
424       response_length = 1;
425       break;
426    }
427 
428    elk_set_desc(p, inst, elk_message_desc(
429                    devinfo, msg_length, response_length, false));
430 
431    elk_inst_set_sfid(devinfo, inst, ELK_SFID_MATH);
432    elk_inst_set_math_msg_function(devinfo, inst, function);
433    elk_inst_set_math_msg_signed_int(devinfo, inst, integer_type);
434    elk_inst_set_math_msg_precision(devinfo, inst, low_precision);
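   /* Math here is a message to the MATH shared function, where saturation is
    * requested through the descriptor, so fold the instruction's saturate
    * bit into the message and clear it on the instruction itself.
    */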
435    elk_inst_set_math_msg_saturate(devinfo, inst, elk_inst_saturate(devinfo, inst));
436    elk_inst_set_math_msg_data_type(devinfo, inst, dataType);
437    elk_inst_set_saturate(devinfo, inst, 0);
438 }
439 
440 
441 static void elk_set_ff_sync_message(struct elk_codegen *p,
442 				    elk_inst *insn,
443 				    bool allocate,
444 				    unsigned response_length,
445 				    bool end_of_thread)
446 {
447    const struct intel_device_info *devinfo = p->devinfo;
448 
449    elk_set_desc(p, insn, elk_message_desc(
450                    devinfo, 1, response_length, true));
451 
452    elk_inst_set_sfid(devinfo, insn, ELK_SFID_URB);
453    elk_inst_set_eot(devinfo, insn, end_of_thread);
454    elk_inst_set_urb_opcode(devinfo, insn, 1); /* FF_SYNC */
455    elk_inst_set_urb_allocate(devinfo, insn, allocate);
456    /* The following fields are not used by FF_SYNC: */
457    elk_inst_set_urb_global_offset(devinfo, insn, 0);
458    elk_inst_set_urb_swizzle_control(devinfo, insn, 0);
459    elk_inst_set_urb_used(devinfo, insn, 0);
460    elk_inst_set_urb_complete(devinfo, insn, 0);
461 }
462 
463 static void elk_set_urb_message( struct elk_codegen *p,
464 				 elk_inst *insn,
465                                  enum elk_urb_write_flags flags,
466 				 unsigned msg_length,
467 				 unsigned response_length,
468 				 unsigned offset,
469 				 unsigned swizzle_control )
470 {
471    const struct intel_device_info *devinfo = p->devinfo;
472 
473    assert(devinfo->ver < 7 || swizzle_control != ELK_URB_SWIZZLE_TRANSPOSE);
474    assert(devinfo->ver < 7 || !(flags & ELK_URB_WRITE_ALLOCATE));
475    assert(devinfo->ver >= 7 || !(flags & ELK_URB_WRITE_PER_SLOT_OFFSET));
476 
477    elk_set_desc(p, insn, elk_message_desc(
478                    devinfo, msg_length, response_length, true));
479 
480    elk_inst_set_sfid(devinfo, insn, ELK_SFID_URB);
481    elk_inst_set_eot(devinfo, insn, !!(flags & ELK_URB_WRITE_EOT));
482 
483    if (flags & ELK_URB_WRITE_OWORD) {
484       assert(msg_length == 2); /* header + one OWORD of data */
485       elk_inst_set_urb_opcode(devinfo, insn, ELK_URB_OPCODE_WRITE_OWORD);
486    } else {
487       elk_inst_set_urb_opcode(devinfo, insn, ELK_URB_OPCODE_WRITE_HWORD);
488    }
489 
490    elk_inst_set_urb_global_offset(devinfo, insn, offset);
491    elk_inst_set_urb_swizzle_control(devinfo, insn, swizzle_control);
492 
493    if (devinfo->ver < 8) {
494       elk_inst_set_urb_complete(devinfo, insn, !!(flags & ELK_URB_WRITE_COMPLETE));
495    }
496 
497    if (devinfo->ver < 7) {
498       elk_inst_set_urb_allocate(devinfo, insn, !!(flags & ELK_URB_WRITE_ALLOCATE));
499       elk_inst_set_urb_used(devinfo, insn, !(flags & ELK_URB_WRITE_UNUSED));
500    } else {
501       elk_inst_set_urb_per_slot_offset(devinfo, insn,
502          !!(flags & ELK_URB_WRITE_PER_SLOT_OFFSET));
503    }
504 }
505 
506 static void
507 gfx7_set_dp_scratch_message(struct elk_codegen *p,
508                             elk_inst *inst,
509                             bool write,
510                             bool dword,
511                             bool invalidate_after_read,
512                             unsigned num_regs,
513                             unsigned addr_offset,
514                             unsigned mlen,
515                             unsigned rlen,
516                             bool header_present)
517 {
518    const struct intel_device_info *devinfo = p->devinfo;
519    assert(num_regs == 1 || num_regs == 2 || num_regs == 4 ||
520           (devinfo->ver >= 8 && num_regs == 8));
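   /* The block size field encodes the register count as log2(num_regs) on
    * Gfx8+, and as num_regs - 1 on Gfx7.
    */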
521    const unsigned block_size = (devinfo->ver >= 8 ? util_logbase2(num_regs) :
522                                 num_regs - 1);
523 
524    elk_set_desc(p, inst, elk_message_desc(
525                    devinfo, mlen, rlen, header_present));
526 
527    elk_inst_set_sfid(devinfo, inst, GFX7_SFID_DATAPORT_DATA_CACHE);
528    elk_inst_set_dp_category(devinfo, inst, 1); /* Scratch Block Read/Write msgs */
529    elk_inst_set_scratch_read_write(devinfo, inst, write);
530    elk_inst_set_scratch_type(devinfo, inst, dword);
531    elk_inst_set_scratch_invalidate_after_read(devinfo, inst, invalidate_after_read);
532    elk_inst_set_scratch_block_size(devinfo, inst, block_size);
533    elk_inst_set_scratch_addr_offset(devinfo, inst, addr_offset);
534 }
535 
536 static void
537 elk_inst_set_state(const struct elk_isa_info *isa,
538                    elk_inst *insn,
539                    const struct elk_insn_state *state)
540 {
541    const struct intel_device_info *devinfo = isa->devinfo;
542 
543    elk_inst_set_exec_size(devinfo, insn, state->exec_size);
544    elk_inst_set_group(devinfo, insn, state->group);
545    elk_inst_set_compression(devinfo, insn, state->compressed);
546    elk_inst_set_access_mode(devinfo, insn, state->access_mode);
547    elk_inst_set_mask_control(devinfo, insn, state->mask_control);
548    elk_inst_set_saturate(devinfo, insn, state->saturate);
549    elk_inst_set_pred_control(devinfo, insn, state->predicate);
550    elk_inst_set_pred_inv(devinfo, insn, state->pred_inv);
551 
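   /* 3-src Align16 instructions keep the flag register and subregister in
    * dedicated fields, so they are programmed through different setters than
    * ordinary instructions.
    */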
552    if (elk_is_3src(isa, elk_inst_opcode(isa, insn)) &&
553        state->access_mode == ELK_ALIGN_16) {
554       elk_inst_set_3src_a16_flag_subreg_nr(devinfo, insn, state->flag_subreg % 2);
555       if (devinfo->ver >= 7)
556          elk_inst_set_3src_a16_flag_reg_nr(devinfo, insn, state->flag_subreg / 2);
557    } else {
558       elk_inst_set_flag_subreg_nr(devinfo, insn, state->flag_subreg % 2);
559       if (devinfo->ver >= 7)
560          elk_inst_set_flag_reg_nr(devinfo, insn, state->flag_subreg / 2);
561    }
562 
563    if (devinfo->ver >= 6)
564       elk_inst_set_acc_wr_control(devinfo, insn, state->acc_wr_control);
565 }
566 
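/* Reserve space for nr_insn instructions in p->store, aligning the start as
 * requested and zero-filling any padding.  The returned pointer points into
 * p->store and is only valid until a later append reallocates the store.
 */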
567 static elk_inst *
568 elk_append_insns(struct elk_codegen *p, unsigned nr_insn, unsigned alignment)
569 {
570    assert(util_is_power_of_two_or_zero(sizeof(elk_inst)));
571    assert(util_is_power_of_two_or_zero(alignment));
572    const unsigned align_insn = MAX2(alignment / sizeof(elk_inst), 1);
573    const unsigned start_insn = ALIGN(p->nr_insn, align_insn);
574    const unsigned new_nr_insn = start_insn + nr_insn;
575 
576    if (p->store_size < new_nr_insn) {
577       p->store_size = util_next_power_of_two(new_nr_insn * sizeof(elk_inst));
578       p->store = reralloc(p->mem_ctx, p->store, elk_inst, p->store_size);
579    }
580 
581    /* Memset any padding due to alignment to 0.  We don't want to be hashing
582     * or caching a bunch of random bits we got from a memory allocation.
583     */
584    if (p->nr_insn < start_insn) {
585       memset(&p->store[p->nr_insn], 0,
586              (start_insn - p->nr_insn) * sizeof(elk_inst));
587    }
588 
589    assert(p->next_insn_offset == p->nr_insn * sizeof(elk_inst));
590    p->nr_insn = new_nr_insn;
591    p->next_insn_offset = new_nr_insn * sizeof(elk_inst);
592 
593    return &p->store[start_insn];
594 }
595 
596 void
597 elk_realign(struct elk_codegen *p, unsigned alignment)
598 {
599    elk_append_insns(p, 0, alignment);
600 }
601 
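/* Copy an arbitrary blob into the instruction store, zero-padding the tail
 * of the last instruction slot; returns the byte offset of the data within
 * p->store.
 */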
602 int
603 elk_append_data(struct elk_codegen *p, void *data,
604                 unsigned size, unsigned alignment)
605 {
606    unsigned nr_insn = DIV_ROUND_UP(size, sizeof(elk_inst));
607    void *dst = elk_append_insns(p, nr_insn, alignment);
608    memcpy(dst, data, size);
609 
610    /* If it's not a whole number of instructions, memset the end */
611    if (size < nr_insn * sizeof(elk_inst))
612       memset(dst + size, 0, nr_insn * sizeof(elk_inst) - size);
613 
614    return dst - (void *)p->store;
615 }
616 
617 #define next_insn elk_next_insn
618 elk_inst *
619 elk_next_insn(struct elk_codegen *p, unsigned opcode)
620 {
621    elk_inst *insn = elk_append_insns(p, 1, sizeof(elk_inst));
622 
623    memset(insn, 0, sizeof(*insn));
624    elk_inst_set_opcode(p->isa, insn, opcode);
625 
626    /* Apply the default instruction state */
627    elk_inst_set_state(p->isa, insn, p->current);
628 
629    return insn;
630 }
631 
632 void
633 elk_add_reloc(struct elk_codegen *p, uint32_t id,
634               enum elk_shader_reloc_type type,
635               uint32_t offset, uint32_t delta)
636 {
637    if (p->num_relocs + 1 > p->reloc_array_size) {
638       p->reloc_array_size = MAX2(16, p->reloc_array_size * 2);
639       p->relocs = reralloc(p->mem_ctx, p->relocs,
640                            struct elk_shader_reloc, p->reloc_array_size);
641    }
642 
643    p->relocs[p->num_relocs++] = (struct elk_shader_reloc) {
644       .id = id,
645       .type = type,
646       .offset = offset,
647       .delta = delta,
648    };
649 }
650 
651 static elk_inst *
652 elk_alu1(struct elk_codegen *p, unsigned opcode,
653          struct elk_reg dest, struct elk_reg src)
654 {
655    elk_inst *insn = next_insn(p, opcode);
656    elk_set_dest(p, insn, dest);
657    elk_set_src0(p, insn, src);
658    return insn;
659 }
660 
661 static elk_inst *
662 elk_alu2(struct elk_codegen *p, unsigned opcode,
663          struct elk_reg dest, struct elk_reg src0, struct elk_reg src1)
664 {
665    /* 64-bit immediates are only supported on 1-src instructions */
666    assert(src0.file != ELK_IMMEDIATE_VALUE || type_sz(src0.type) <= 4);
667    assert(src1.file != ELK_IMMEDIATE_VALUE || type_sz(src1.type) <= 4);
668 
669    elk_inst *insn = next_insn(p, opcode);
670    elk_set_dest(p, insn, dest);
671    elk_set_src0(p, insn, src0);
672    elk_set_src1(p, insn, src1);
673    return insn;
674 }
675 
676 static int
677 get_3src_subreg_nr(struct elk_reg reg)
678 {
679    /* Normally, SubRegNum is in bytes (0..31).  However, 3-src instructions
680     * use 32-bit units (components 0..7).  Since they only support F/D/UD
681     * types, this doesn't lose any flexibility, but uses fewer bits.
682     */
683    return reg.subnr / 4;
684 }
685 
686 static enum gfx10_align1_3src_vertical_stride
687 to_3src_align1_vstride(const struct intel_device_info *devinfo,
688                        enum elk_vertical_stride vstride)
689 {
690    switch (vstride) {
691    case ELK_VERTICAL_STRIDE_0:
692       return ELK_ALIGN1_3SRC_VERTICAL_STRIDE_0;
693    case ELK_VERTICAL_STRIDE_2:
694       return ELK_ALIGN1_3SRC_VERTICAL_STRIDE_2;
695    case ELK_VERTICAL_STRIDE_4:
696       return ELK_ALIGN1_3SRC_VERTICAL_STRIDE_4;
697    case ELK_VERTICAL_STRIDE_8:
698    case ELK_VERTICAL_STRIDE_16:
699       return ELK_ALIGN1_3SRC_VERTICAL_STRIDE_8;
700    default:
701       unreachable("invalid vstride");
702    }
703 }
704 
705 
706 static enum gfx10_align1_3src_src_horizontal_stride
707 to_3src_align1_hstride(enum elk_horizontal_stride hstride)
708 {
709    switch (hstride) {
710    case ELK_HORIZONTAL_STRIDE_0:
711       return ELK_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_0;
712    case ELK_HORIZONTAL_STRIDE_1:
713       return ELK_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_1;
714    case ELK_HORIZONTAL_STRIDE_2:
715       return ELK_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_2;
716    case ELK_HORIZONTAL_STRIDE_4:
717       return ELK_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_4;
718    default:
719       unreachable("invalid hstride");
720    }
721 }
722 
723 static elk_inst *
724 elk_alu3(struct elk_codegen *p, unsigned opcode, struct elk_reg dest,
725          struct elk_reg src0, struct elk_reg src1, struct elk_reg src2)
726 {
727    const struct intel_device_info *devinfo = p->devinfo;
728    elk_inst *inst = next_insn(p, opcode);
729 
730    gfx7_convert_mrf_to_grf(p, &dest);
731 
732    assert(dest.nr < XE2_MAX_GRF);
733 
734    assert(src0.file == ELK_IMMEDIATE_VALUE || src0.nr < XE2_MAX_GRF);
735    assert(src1.file != ELK_IMMEDIATE_VALUE && src1.nr < XE2_MAX_GRF);
736    assert(src2.file == ELK_IMMEDIATE_VALUE || src2.nr < XE2_MAX_GRF);
737    assert(dest.address_mode == ELK_ADDRESS_DIRECT);
738    assert(src0.address_mode == ELK_ADDRESS_DIRECT);
739    assert(src1.address_mode == ELK_ADDRESS_DIRECT);
740    assert(src2.address_mode == ELK_ADDRESS_DIRECT);
741 
742    assert(dest.file == ELK_GENERAL_REGISTER_FILE ||
743           dest.file == ELK_MESSAGE_REGISTER_FILE);
744    assert(dest.type == ELK_REGISTER_TYPE_F  ||
745           dest.type == ELK_REGISTER_TYPE_DF ||
746           dest.type == ELK_REGISTER_TYPE_D  ||
747           dest.type == ELK_REGISTER_TYPE_UD ||
748           (dest.type == ELK_REGISTER_TYPE_HF && devinfo->ver >= 8));
749    if (devinfo->ver == 6) {
750       elk_inst_set_3src_a16_dst_reg_file(devinfo, inst,
751                                          dest.file == ELK_MESSAGE_REGISTER_FILE);
752    }
753    elk_inst_set_3src_dst_reg_nr(devinfo, inst, dest.nr);
754    elk_inst_set_3src_a16_dst_subreg_nr(devinfo, inst, dest.subnr / 4);
755    elk_inst_set_3src_a16_dst_writemask(devinfo, inst, dest.writemask);
756 
757    assert(src0.file == ELK_GENERAL_REGISTER_FILE);
758    elk_inst_set_3src_a16_src0_swizzle(devinfo, inst, src0.swizzle);
759    elk_inst_set_3src_a16_src0_subreg_nr(devinfo, inst, get_3src_subreg_nr(src0));
760    elk_inst_set_3src_src0_reg_nr(devinfo, inst, src0.nr);
761    elk_inst_set_3src_src0_abs(devinfo, inst, src0.abs);
762    elk_inst_set_3src_src0_negate(devinfo, inst, src0.negate);
763    elk_inst_set_3src_a16_src0_rep_ctrl(devinfo, inst,
764                                        src0.vstride == ELK_VERTICAL_STRIDE_0);
765 
766    assert(src1.file == ELK_GENERAL_REGISTER_FILE);
767    elk_inst_set_3src_a16_src1_swizzle(devinfo, inst, src1.swizzle);
768    elk_inst_set_3src_a16_src1_subreg_nr(devinfo, inst, get_3src_subreg_nr(src1));
769    elk_inst_set_3src_src1_reg_nr(devinfo, inst, src1.nr);
770    elk_inst_set_3src_src1_abs(devinfo, inst, src1.abs);
771    elk_inst_set_3src_src1_negate(devinfo, inst, src1.negate);
772    elk_inst_set_3src_a16_src1_rep_ctrl(devinfo, inst,
773                                        src1.vstride == ELK_VERTICAL_STRIDE_0);
774 
775    assert(src2.file == ELK_GENERAL_REGISTER_FILE);
776    elk_inst_set_3src_a16_src2_swizzle(devinfo, inst, src2.swizzle);
777    elk_inst_set_3src_a16_src2_subreg_nr(devinfo, inst, get_3src_subreg_nr(src2));
778    elk_inst_set_3src_src2_reg_nr(devinfo, inst, src2.nr);
779    elk_inst_set_3src_src2_abs(devinfo, inst, src2.abs);
780    elk_inst_set_3src_src2_negate(devinfo, inst, src2.negate);
781    elk_inst_set_3src_a16_src2_rep_ctrl(devinfo, inst,
782                                        src2.vstride == ELK_VERTICAL_STRIDE_0);
783 
784    if (devinfo->ver >= 7) {
785       /* Set both the source and destination types based on dest.type,
786        * ignoring the source register types.  The MAD and LRP emitters ensure
787        * that all four types are float.  The BFE and BFI2 emitters, however,
788        * may send us mixed D and UD types and want us to ignore that and use
789        * the destination type.
790        */
791       elk_inst_set_3src_a16_src_type(devinfo, inst, dest.type);
792       elk_inst_set_3src_a16_dst_type(devinfo, inst, dest.type);
793 
794       /* From the Bspec, 3D Media GPGPU, Instruction fields, srcType:
795        *
796        *    "Three source instructions can use operands with mixed-mode
797        *     precision. When SrcType field is set to :f or :hf it defines
798        *     precision for source 0 only, and fields Src1Type and Src2Type
799        *     define precision for other source operands:
800        *
801        *     0b = :f. Single precision Float (32-bit).
802        *     1b = :hf. Half precision Float (16-bit)."
803        */
804       if (src1.type == ELK_REGISTER_TYPE_HF)
805          elk_inst_set_3src_a16_src1_type(devinfo, inst, 1);
806 
807       if (src2.type == ELK_REGISTER_TYPE_HF)
808          elk_inst_set_3src_a16_src2_type(devinfo, inst, 1);
809    }
810 
811    return inst;
812 }
813 
814 /***********************************************************************
815  * Convenience routines.
816  */
817 #define ALU1(OP)					\
818 elk_inst *elk_##OP(struct elk_codegen *p,		\
819 	      struct elk_reg dest,			\
820 	      struct elk_reg src0)   			\
821 {							\
822    return elk_alu1(p, ELK_OPCODE_##OP, dest, src0);    	\
823 }
824 
825 #define ALU2(OP)					\
826 elk_inst *elk_##OP(struct elk_codegen *p,		\
827 	      struct elk_reg dest,			\
828 	      struct elk_reg src0,			\
829 	      struct elk_reg src1)   			\
830 {							\
831    return elk_alu2(p, ELK_OPCODE_##OP, dest, src0, src1);	\
832 }
833 
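/* elk_alu3() encodes an Align16 scalar (vstride 0) source using rep_ctrl,
 * which replicates a single component, so the swizzle is forced to XXXX here
 * to keep the region description consistent.
 */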
834 #define ALU3(OP)					\
835 elk_inst *elk_##OP(struct elk_codegen *p,		\
836 	      struct elk_reg dest,			\
837 	      struct elk_reg src0,			\
838 	      struct elk_reg src1,			\
839 	      struct elk_reg src2)   			\
840 {                                                       \
841    if (p->current->access_mode == ELK_ALIGN_16) {       \
842       if (src0.vstride == ELK_VERTICAL_STRIDE_0)        \
843          src0.swizzle = ELK_SWIZZLE_XXXX;               \
844       if (src1.vstride == ELK_VERTICAL_STRIDE_0)        \
845          src1.swizzle = ELK_SWIZZLE_XXXX;               \
846       if (src2.vstride == ELK_VERTICAL_STRIDE_0)        \
847          src2.swizzle = ELK_SWIZZLE_XXXX;               \
848    }                                                    \
849    return elk_alu3(p, ELK_OPCODE_##OP, dest, src0, src1, src2);	\
850 }
851 
852 #define ALU3F(OP)                                               \
853 elk_inst *elk_##OP(struct elk_codegen *p,         \
854                                  struct elk_reg dest,           \
855                                  struct elk_reg src0,           \
856                                  struct elk_reg src1,           \
857                                  struct elk_reg src2)           \
858 {                                                               \
859    assert(dest.type == ELK_REGISTER_TYPE_F ||                   \
860           dest.type == ELK_REGISTER_TYPE_DF);                   \
861    if (dest.type == ELK_REGISTER_TYPE_F) {                      \
862       assert(src0.type == ELK_REGISTER_TYPE_F);                 \
863       assert(src1.type == ELK_REGISTER_TYPE_F);                 \
864       assert(src2.type == ELK_REGISTER_TYPE_F);                 \
865    } else if (dest.type == ELK_REGISTER_TYPE_DF) {              \
866       assert(src0.type == ELK_REGISTER_TYPE_DF);                \
867       assert(src1.type == ELK_REGISTER_TYPE_DF);                \
868       assert(src2.type == ELK_REGISTER_TYPE_DF);                \
869    }                                                            \
870                                                                 \
871    if (p->current->access_mode == ELK_ALIGN_16) {               \
872       if (src0.vstride == ELK_VERTICAL_STRIDE_0)                \
873          src0.swizzle = ELK_SWIZZLE_XXXX;                       \
874       if (src1.vstride == ELK_VERTICAL_STRIDE_0)                \
875          src1.swizzle = ELK_SWIZZLE_XXXX;                       \
876       if (src2.vstride == ELK_VERTICAL_STRIDE_0)                \
877          src2.swizzle = ELK_SWIZZLE_XXXX;                       \
878    }                                                            \
879    return elk_alu3(p, ELK_OPCODE_##OP, dest, src0, src1, src2); \
880 }
881 
882 ALU2(SEL)
883 ALU1(NOT)
884 ALU2(AND)
885 ALU2(OR)
886 ALU2(XOR)
887 ALU2(SHR)
888 ALU2(SHL)
889 ALU1(DIM)
890 ALU2(ASR)
891 ALU3(CSEL)
892 ALU1(FRC)
893 ALU1(RNDD)
894 ALU1(RNDE)
895 ALU1(RNDU)
896 ALU1(RNDZ)
897 ALU2(MAC)
898 ALU2(MACH)
899 ALU1(LZD)
900 ALU2(DP4)
901 ALU2(DPH)
902 ALU2(DP3)
903 ALU2(DP2)
904 ALU3(MAD)
905 ALU3F(LRP)
906 ALU1(BFREV)
907 ALU3(BFE)
908 ALU2(BFI1)
909 ALU3(BFI2)
910 ALU1(FBH)
911 ALU1(FBL)
912 ALU1(CBIT)
913 ALU2(ADDC)
914 ALU2(SUBB)
915 
916 elk_inst *
917 elk_MOV(struct elk_codegen *p, struct elk_reg dest, struct elk_reg src0)
918 {
919    const struct intel_device_info *devinfo = p->devinfo;
920 
921    /* When converting F->DF on IVB/BYT, every odd source channel is ignored.
922     * To avoid the problems that causes, we use an <X,2,0> source region to
923     * read each element twice.
924     */
925    if (devinfo->verx10 == 70 &&
926        elk_get_default_access_mode(p) == ELK_ALIGN_1 &&
927        dest.type == ELK_REGISTER_TYPE_DF &&
928        (src0.type == ELK_REGISTER_TYPE_F ||
929         src0.type == ELK_REGISTER_TYPE_D ||
930         src0.type == ELK_REGISTER_TYPE_UD) &&
931        !has_scalar_region(src0)) {
932       assert(src0.vstride == src0.width + src0.hstride);
933       src0.vstride = src0.hstride;
934       src0.width = ELK_WIDTH_2;
935       src0.hstride = ELK_HORIZONTAL_STRIDE_0;
936    }
937 
938    return elk_alu1(p, ELK_OPCODE_MOV, dest, src0);
939 }
940 
941 elk_inst *
942 elk_ADD(struct elk_codegen *p, struct elk_reg dest,
943         struct elk_reg src0, struct elk_reg src1)
944 {
945    /* 6.2.2: add */
946    if (src0.type == ELK_REGISTER_TYPE_F ||
947        (src0.file == ELK_IMMEDIATE_VALUE &&
948 	src0.type == ELK_REGISTER_TYPE_VF)) {
949       assert(src1.type != ELK_REGISTER_TYPE_UD);
950       assert(src1.type != ELK_REGISTER_TYPE_D);
951    }
952 
953    if (src1.type == ELK_REGISTER_TYPE_F ||
954        (src1.file == ELK_IMMEDIATE_VALUE &&
955 	src1.type == ELK_REGISTER_TYPE_VF)) {
956       assert(src0.type != ELK_REGISTER_TYPE_UD);
957       assert(src0.type != ELK_REGISTER_TYPE_D);
958    }
959 
960    return elk_alu2(p, ELK_OPCODE_ADD, dest, src0, src1);
961 }
962 
963 elk_inst *
964 elk_AVG(struct elk_codegen *p, struct elk_reg dest,
965         struct elk_reg src0, struct elk_reg src1)
966 {
967    assert(dest.type == src0.type);
968    assert(src0.type == src1.type);
969    switch (src0.type) {
970    case ELK_REGISTER_TYPE_B:
971    case ELK_REGISTER_TYPE_UB:
972    case ELK_REGISTER_TYPE_W:
973    case ELK_REGISTER_TYPE_UW:
974    case ELK_REGISTER_TYPE_D:
975    case ELK_REGISTER_TYPE_UD:
976       break;
977    default:
978       unreachable("Bad type for elk_AVG");
979    }
980 
981    return elk_alu2(p, ELK_OPCODE_AVG, dest, src0, src1);
982 }
983 
984 elk_inst *
985 elk_MUL(struct elk_codegen *p, struct elk_reg dest,
986         struct elk_reg src0, struct elk_reg src1)
987 {
988    /* 6.32.38: mul */
989    if (src0.type == ELK_REGISTER_TYPE_D ||
990        src0.type == ELK_REGISTER_TYPE_UD ||
991        src1.type == ELK_REGISTER_TYPE_D ||
992        src1.type == ELK_REGISTER_TYPE_UD) {
993       assert(dest.type != ELK_REGISTER_TYPE_F);
994    }
995 
996    if (src0.type == ELK_REGISTER_TYPE_F ||
997        (src0.file == ELK_IMMEDIATE_VALUE &&
998 	src0.type == ELK_REGISTER_TYPE_VF)) {
999       assert(src1.type != ELK_REGISTER_TYPE_UD);
1000       assert(src1.type != ELK_REGISTER_TYPE_D);
1001    }
1002 
1003    if (src1.type == ELK_REGISTER_TYPE_F ||
1004        (src1.file == ELK_IMMEDIATE_VALUE &&
1005 	src1.type == ELK_REGISTER_TYPE_VF)) {
1006       assert(src0.type != ELK_REGISTER_TYPE_UD);
1007       assert(src0.type != ELK_REGISTER_TYPE_D);
1008    }
1009 
1010    assert(src0.file != ELK_ARCHITECTURE_REGISTER_FILE ||
1011 	  src0.nr != ELK_ARF_ACCUMULATOR);
1012    assert(src1.file != ELK_ARCHITECTURE_REGISTER_FILE ||
1013 	  src1.nr != ELK_ARF_ACCUMULATOR);
1014 
1015    return elk_alu2(p, ELK_OPCODE_MUL, dest, src0, src1);
1016 }
1017 
1018 elk_inst *
1019 elk_LINE(struct elk_codegen *p, struct elk_reg dest,
1020          struct elk_reg src0, struct elk_reg src1)
1021 {
1022    src0.vstride = ELK_VERTICAL_STRIDE_0;
1023    src0.width = ELK_WIDTH_1;
1024    src0.hstride = ELK_HORIZONTAL_STRIDE_0;
1025    return elk_alu2(p, ELK_OPCODE_LINE, dest, src0, src1);
1026 }
1027 
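/* PLN evaluates a plane equation: src0 supplies the coefficients and is
 * normalized to a <0;1,0> scalar region, while src1 supplies the per-channel
 * inputs as a full <8;8,1> region.
 */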
1028 elk_inst *
1029 elk_PLN(struct elk_codegen *p, struct elk_reg dest,
1030         struct elk_reg src0, struct elk_reg src1)
1031 {
1032    src0.vstride = ELK_VERTICAL_STRIDE_0;
1033    src0.width = ELK_WIDTH_1;
1034    src0.hstride = ELK_HORIZONTAL_STRIDE_0;
1035    src1.vstride = ELK_VERTICAL_STRIDE_8;
1036    src1.width = ELK_WIDTH_8;
1037    src1.hstride = ELK_HORIZONTAL_STRIDE_1;
1038    return elk_alu2(p, ELK_OPCODE_PLN, dest, src0, src1);
1039 }
1040 
1041 elk_inst *
1042 elk_F32TO16(struct elk_codegen *p, struct elk_reg dst, struct elk_reg src)
1043 {
1044    assert(p->devinfo->ver == 7);
1045 
1046    /* The F32TO16 instruction doesn't support 32-bit destination types in
1047     * Align1 mode.  Gfx7 (only) does zero out the high 16 bits in Align16
1048     * mode as an undocumented feature.
1049     */
1050    if (ELK_ALIGN_16 == elk_get_default_access_mode(p)) {
1051       assert(dst.type == ELK_REGISTER_TYPE_UD);
1052    } else {
1053       assert(dst.type == ELK_REGISTER_TYPE_W ||
1054              dst.type == ELK_REGISTER_TYPE_UW);
1055    }
1056 
1057    return elk_alu1(p, ELK_OPCODE_F32TO16, dst, src);
1058 }
1059 
1060 elk_inst *
1061 elk_F16TO32(struct elk_codegen *p, struct elk_reg dst, struct elk_reg src)
1062 {
1063    assert(p->devinfo->ver == 7);
1064 
1065    if (ELK_ALIGN_16 == elk_get_default_access_mode(p)) {
1066       assert(src.type == ELK_REGISTER_TYPE_UD);
1067    } else {
1068       /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
1069        *
1070        *   Because this instruction does not have a 16-bit floating-point
1071        *   type, the source data type must be Word (W). The destination type
1072        *   must be F (Float).
1073        */
1074       assert(src.type == ELK_REGISTER_TYPE_W ||
1075              src.type == ELK_REGISTER_TYPE_UW);
1076    }
1077 
1078    return elk_alu1(p, ELK_OPCODE_F16TO32, dst, src);
1079 }
1080 
1081 
1082 void elk_NOP(struct elk_codegen *p)
1083 {
1084    elk_inst *insn = next_insn(p, ELK_OPCODE_NOP);
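   /* Wipe the default instruction state that next_insn() applied so the NOP
    * encodes as all zeros apart from its opcode.
    */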
1085    memset(insn, 0, sizeof(*insn));
1086    elk_inst_set_opcode(p->isa, insn, ELK_OPCODE_NOP);
1087 }
1088 
1089 /***********************************************************************
1090  * Comparisons, if/else/endif
1091  */
1092 
1093 elk_inst *
1094 elk_JMPI(struct elk_codegen *p, struct elk_reg index,
1095          unsigned predicate_control)
1096 {
1097    const struct intel_device_info *devinfo = p->devinfo;
1098    struct elk_reg ip = elk_ip_reg();
1099    elk_inst *inst = elk_alu2(p, ELK_OPCODE_JMPI, ip, ip, index);
1100 
1101    elk_inst_set_exec_size(devinfo, inst, ELK_EXECUTE_1);
1102    elk_inst_set_qtr_control(devinfo, inst, ELK_COMPRESSION_NONE);
1103    elk_inst_set_mask_control(devinfo, inst, ELK_MASK_DISABLE);
1104    elk_inst_set_pred_control(devinfo, inst, predicate_control);
1105 
1106    return inst;
1107 }
1108 
1109 static void
1110 push_if_stack(struct elk_codegen *p, elk_inst *inst)
1111 {
1112    p->if_stack[p->if_stack_depth] = inst - p->store;
1113 
1114    p->if_stack_depth++;
1115    if (p->if_stack_array_size <= p->if_stack_depth) {
1116       p->if_stack_array_size *= 2;
1117       p->if_stack = reralloc(p->mem_ctx, p->if_stack, int,
1118 			     p->if_stack_array_size);
1119    }
1120 }
1121 
1122 static elk_inst *
1123 pop_if_stack(struct elk_codegen *p)
1124 {
1125    p->if_stack_depth--;
1126    return &p->store[p->if_stack[p->if_stack_depth]];
1127 }
1128 
1129 static void
1130 push_loop_stack(struct elk_codegen *p, elk_inst *inst)
1131 {
1132    if (p->loop_stack_array_size <= (p->loop_stack_depth + 1)) {
1133       p->loop_stack_array_size *= 2;
1134       p->loop_stack = reralloc(p->mem_ctx, p->loop_stack, int,
1135 			       p->loop_stack_array_size);
1136       p->if_depth_in_loop = reralloc(p->mem_ctx, p->if_depth_in_loop, int,
1137 				     p->loop_stack_array_size);
1138    }
1139 
1140    p->loop_stack[p->loop_stack_depth] = inst - p->store;
1141    p->loop_stack_depth++;
1142    p->if_depth_in_loop[p->loop_stack_depth] = 0;
1143 }
1144 
1145 static elk_inst *
1146 get_inner_do_insn(struct elk_codegen *p)
1147 {
1148    return &p->store[p->loop_stack[p->loop_stack_depth - 1]];
1149 }
1150 
1151 /* EU takes the value from the flag register and pushes it onto some
1152  * sort of a stack (presumably merging with any flag value already on
1153  * the stack).  Within an if block, the flags at the top of the stack
1154  * control execution on each channel of the unit, eg. on each of the
1155  * 16 pixel values in our wm programs.
1156  *
1157  * When the matching 'else' instruction is reached (presumably by
1158  * countdown of the instruction count patched in by our ELSE/ENDIF
1159  * functions), the relevant flags are inverted.
1160  *
1161  * When the matching 'endif' instruction is reached, the flags are
1162  * popped off.  If the stack is now empty, normal execution resumes.
1163  */
1164 elk_inst *
1165 elk_IF(struct elk_codegen *p, unsigned execute_size)
1166 {
1167    const struct intel_device_info *devinfo = p->devinfo;
1168    elk_inst *insn;
1169 
1170    insn = next_insn(p, ELK_OPCODE_IF);
1171 
1172    /* Override the defaults for this instruction:
1173     */
1174    if (devinfo->ver < 6) {
1175       elk_set_dest(p, insn, elk_ip_reg());
1176       elk_set_src0(p, insn, elk_ip_reg());
1177       elk_set_src1(p, insn, elk_imm_d(0x0));
1178    } else if (devinfo->ver == 6) {
1179       elk_set_dest(p, insn, elk_imm_w(0));
1180       elk_inst_set_gfx6_jump_count(devinfo, insn, 0);
1181       elk_set_src0(p, insn, vec1(retype(elk_null_reg(), ELK_REGISTER_TYPE_D)));
1182       elk_set_src1(p, insn, vec1(retype(elk_null_reg(), ELK_REGISTER_TYPE_D)));
1183    } else if (devinfo->ver == 7) {
1184       elk_set_dest(p, insn, vec1(retype(elk_null_reg(), ELK_REGISTER_TYPE_D)));
1185       elk_set_src0(p, insn, vec1(retype(elk_null_reg(), ELK_REGISTER_TYPE_D)));
1186       elk_set_src1(p, insn, elk_imm_w(0));
1187       elk_inst_set_jip(devinfo, insn, 0);
1188       elk_inst_set_uip(devinfo, insn, 0);
1189    } else {
1190       elk_set_dest(p, insn, vec1(retype(elk_null_reg(), ELK_REGISTER_TYPE_D)));
1191       elk_set_src0(p, insn, elk_imm_d(0));
1192       elk_inst_set_jip(devinfo, insn, 0);
1193       elk_inst_set_uip(devinfo, insn, 0);
1194    }
1195 
1196    elk_inst_set_exec_size(devinfo, insn, execute_size);
1197    elk_inst_set_qtr_control(devinfo, insn, ELK_COMPRESSION_NONE);
1198    elk_inst_set_pred_control(devinfo, insn, ELK_PREDICATE_NORMAL);
1199    elk_inst_set_mask_control(devinfo, insn, ELK_MASK_ENABLE);
1200    if (!p->single_program_flow && devinfo->ver < 6)
1201       elk_inst_set_thread_control(devinfo, insn, ELK_THREAD_SWITCH);
1202 
1203    push_if_stack(p, insn);
1204    p->if_depth_in_loop[p->loop_stack_depth]++;
1205    return insn;
1206 }
1207 
1208 /* This function is only used for gfx6-style IF instructions with an
1209  * embedded comparison (conditional modifier).  It is not used on gfx7.
1210  */
1211 elk_inst *
1212 elk_gfx6_IF(struct elk_codegen *p, enum elk_conditional_mod conditional,
1213 	struct elk_reg src0, struct elk_reg src1)
1214 {
1215    const struct intel_device_info *devinfo = p->devinfo;
1216    elk_inst *insn;
1217 
1218    insn = next_insn(p, ELK_OPCODE_IF);
1219 
1220    elk_set_dest(p, insn, elk_imm_w(0));
1221    elk_inst_set_exec_size(devinfo, insn, elk_get_default_exec_size(p));
1222    elk_inst_set_gfx6_jump_count(devinfo, insn, 0);
1223    elk_set_src0(p, insn, src0);
1224    elk_set_src1(p, insn, src1);
1225 
1226    assert(elk_inst_qtr_control(devinfo, insn) == ELK_COMPRESSION_NONE);
1227    assert(elk_inst_pred_control(devinfo, insn) == ELK_PREDICATE_NONE);
1228    elk_inst_set_cond_modifier(devinfo, insn, conditional);
1229 
1230    push_if_stack(p, insn);
1231    return insn;
1232 }
1233 
1234 /**
1235  * In single-program-flow (SPF) mode, convert IF and ELSE into ADDs.
1236  */
1237 static void
1238 convert_IF_ELSE_to_ADD(struct elk_codegen *p,
1239                        elk_inst *if_inst, elk_inst *else_inst)
1240 {
1241    const struct intel_device_info *devinfo = p->devinfo;
1242 
1243    /* The next instruction (where the ENDIF would be, if it existed) */
1244    elk_inst *next_inst = &p->store[p->nr_insn];
1245 
1246    assert(p->single_program_flow);
1247    assert(if_inst != NULL && elk_inst_opcode(p->isa, if_inst) == ELK_OPCODE_IF);
1248    assert(else_inst == NULL || elk_inst_opcode(p->isa, else_inst) == ELK_OPCODE_ELSE);
1249    assert(elk_inst_exec_size(devinfo, if_inst) == ELK_EXECUTE_1);
1250 
1251    /* Convert IF to an ADD instruction that moves the instruction pointer
1252     * to the first instruction of the ELSE block.  If there is no ELSE
1253     * block, point to where ENDIF would be.  Reverse the predicate.
1254     *
1255     * There's no need to execute an ENDIF since we don't need to do any
1256     * stack operations, and if we're currently executing, we just want to
1257     * continue normally.
1258     */
1259    elk_inst_set_opcode(p->isa, if_inst, ELK_OPCODE_ADD);
1260    elk_inst_set_pred_inv(devinfo, if_inst, true);
1261 
1262    if (else_inst != NULL) {
1263       /* Convert ELSE to an ADD instruction that points where the ENDIF
1264        * would be.
1265        */
1266       elk_inst_set_opcode(p->isa, else_inst, ELK_OPCODE_ADD);
1267 
1268       elk_inst_set_imm_ud(devinfo, if_inst, (else_inst - if_inst + 1) * 16);
1269       elk_inst_set_imm_ud(devinfo, else_inst, (next_inst - else_inst) * 16);
1270    } else {
1271       elk_inst_set_imm_ud(devinfo, if_inst, (next_inst - if_inst) * 16);
1272    }
1273 }
1274 
1275 /**
1276  * Patch IF and ELSE instructions with appropriate jump targets.
1277  */
1278 static void
1279 patch_IF_ELSE(struct elk_codegen *p,
1280               elk_inst *if_inst, elk_inst *else_inst, elk_inst *endif_inst)
1281 {
1282    const struct intel_device_info *devinfo = p->devinfo;
1283 
1284    /* We shouldn't be patching IF and ELSE instructions in single program flow
1285     * mode when gen < 6, because in single program flow mode on those
1286     * platforms, we convert flow control instructions to conditional ADDs that
1287     * operate on IP (see elk_ENDIF).
1288     *
1289     * However, on Gfx6, writing to IP doesn't work in single program flow mode
1290     * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
1291     * not be updated by non-flow control instructions.").  And on later
1292     * platforms, there is no significant benefit to converting control flow
1293     * instructions to conditional ADDs.  So we do patch IF and ELSE
1294     * instructions in single program flow mode on those platforms.
1295     */
1296    if (devinfo->ver < 6)
1297       assert(!p->single_program_flow);
1298 
1299    assert(if_inst != NULL && elk_inst_opcode(p->isa, if_inst) == ELK_OPCODE_IF);
1300    assert(endif_inst != NULL);
1301    assert(else_inst == NULL || elk_inst_opcode(p->isa, else_inst) == ELK_OPCODE_ELSE);
1302 
1303    unsigned br = elk_jump_scale(devinfo);
1304 
1305    assert(elk_inst_opcode(p->isa, endif_inst) == ELK_OPCODE_ENDIF);
1306    elk_inst_set_exec_size(devinfo, endif_inst, elk_inst_exec_size(devinfo, if_inst));
1307 
1308    if (else_inst == NULL) {
1309       /* Patch IF -> ENDIF */
1310       if (devinfo->ver < 6) {
1311 	 /* Turn it into an IFF, which means no mask stack operations for
1312 	  * all-false and jumping past the ENDIF.
1313 	  */
1314          elk_inst_set_opcode(p->isa, if_inst, ELK_OPCODE_IFF);
1315          elk_inst_set_gfx4_jump_count(devinfo, if_inst,
1316                                       br * (endif_inst - if_inst + 1));
1317          elk_inst_set_gfx4_pop_count(devinfo, if_inst, 0);
1318       } else if (devinfo->ver == 6) {
1319 	 /* As of gfx6, there is no IFF and IF must point to the ENDIF. */
1320          elk_inst_set_gfx6_jump_count(devinfo, if_inst, br*(endif_inst - if_inst));
1321       } else {
1322          elk_inst_set_uip(devinfo, if_inst, br * (endif_inst - if_inst));
1323          elk_inst_set_jip(devinfo, if_inst, br * (endif_inst - if_inst));
1324       }
1325    } else {
1326       elk_inst_set_exec_size(devinfo, else_inst, elk_inst_exec_size(devinfo, if_inst));
1327 
1328       /* Patch IF -> ELSE */
1329       if (devinfo->ver < 6) {
1330          elk_inst_set_gfx4_jump_count(devinfo, if_inst,
1331                                       br * (else_inst - if_inst));
1332          elk_inst_set_gfx4_pop_count(devinfo, if_inst, 0);
1333       } else if (devinfo->ver == 6) {
1334          elk_inst_set_gfx6_jump_count(devinfo, if_inst,
1335                                       br * (else_inst - if_inst + 1));
1336       }
1337 
1338       /* Patch ELSE -> ENDIF */
1339       if (devinfo->ver < 6) {
1340 	 /* ELK_OPCODE_ELSE pre-gfx6 should point just past the
1341 	  * matching ENDIF.
1342 	  */
1343          elk_inst_set_gfx4_jump_count(devinfo, else_inst,
1344                                       br * (endif_inst - else_inst + 1));
1345          elk_inst_set_gfx4_pop_count(devinfo, else_inst, 1);
1346       } else if (devinfo->ver == 6) {
1347 	 /* ELK_OPCODE_ELSE on gfx6 should point to the matching ENDIF. */
1348          elk_inst_set_gfx6_jump_count(devinfo, else_inst,
1349                                       br * (endif_inst - else_inst));
1350       } else {
1351 	 /* The IF instruction's JIP should point just past the ELSE */
1352          elk_inst_set_jip(devinfo, if_inst, br * (else_inst - if_inst + 1));
1353 	 /* The IF instruction's UIP and ELSE's JIP should point to ENDIF */
1354          elk_inst_set_uip(devinfo, if_inst, br * (endif_inst - if_inst));
1355 
1356          if (devinfo->ver >= 8) {
1357             /* Set the ELSE instruction to use branch_ctrl with a join
1358              * jump target pointing at the NOP inserted right before
1359              * the ENDIF instruction in order to make sure it is
1360              * executed in all cases, since attempting to do the same
1361              * as on other generations could cause the EU to jump at
1362              * the instruction immediately after the ENDIF due to
1363              * Wa_220160235, which could cause the program to continue
1364              * running with all channels disabled.
1365              */
1366             elk_inst_set_jip(devinfo, else_inst, br * (endif_inst - else_inst - 1));
1367             elk_inst_set_branch_control(devinfo, else_inst, true);
1368          } else {
1369             elk_inst_set_jip(devinfo, else_inst, br * (endif_inst - else_inst));
1370          }
1371 
1372          if (devinfo->ver >= 8) {
1373             /* Since we don't set branch_ctrl on Gfx11+, the ELSE's
1374              * JIP and UIP both should point to ENDIF on those
1375              * platforms.
1376              */
1377             elk_inst_set_uip(devinfo, else_inst, br * (endif_inst - else_inst));
1378          }
1379       }
1380    }
1381 }
1382 
1383 void
1384 elk_ELSE(struct elk_codegen *p)
1385 {
1386    const struct intel_device_info *devinfo = p->devinfo;
1387    elk_inst *insn;
1388 
1389    insn = next_insn(p, ELK_OPCODE_ELSE);
1390 
1391    if (devinfo->ver < 6) {
1392       elk_set_dest(p, insn, elk_ip_reg());
1393       elk_set_src0(p, insn, elk_ip_reg());
1394       elk_set_src1(p, insn, elk_imm_d(0x0));
1395    } else if (devinfo->ver == 6) {
1396       elk_set_dest(p, insn, elk_imm_w(0));
1397       elk_inst_set_gfx6_jump_count(devinfo, insn, 0);
1398       elk_set_src0(p, insn, retype(elk_null_reg(), ELK_REGISTER_TYPE_D));
1399       elk_set_src1(p, insn, retype(elk_null_reg(), ELK_REGISTER_TYPE_D));
1400    } else if (devinfo->ver == 7) {
1401       elk_set_dest(p, insn, retype(elk_null_reg(), ELK_REGISTER_TYPE_D));
1402       elk_set_src0(p, insn, retype(elk_null_reg(), ELK_REGISTER_TYPE_D));
1403       elk_set_src1(p, insn, elk_imm_w(0));
1404       elk_inst_set_jip(devinfo, insn, 0);
1405       elk_inst_set_uip(devinfo, insn, 0);
1406    } else {
1407       elk_set_dest(p, insn, retype(elk_null_reg(), ELK_REGISTER_TYPE_D));
1408       elk_set_src0(p, insn, elk_imm_d(0));
1409       elk_inst_set_jip(devinfo, insn, 0);
1410       elk_inst_set_uip(devinfo, insn, 0);
1411    }
1412 
1413    elk_inst_set_qtr_control(devinfo, insn, ELK_COMPRESSION_NONE);
1414    elk_inst_set_mask_control(devinfo, insn, ELK_MASK_ENABLE);
1415    if (!p->single_program_flow && devinfo->ver < 6)
1416       elk_inst_set_thread_control(devinfo, insn, ELK_THREAD_SWITCH);
1417 
1418    push_if_stack(p, insn);
1419 }
1420 
1421 void
1422 elk_ENDIF(struct elk_codegen *p)
1423 {
1424    const struct intel_device_info *devinfo = p->devinfo;
1425    elk_inst *insn = NULL;
1426    elk_inst *else_inst = NULL;
1427    elk_inst *if_inst = NULL;
1428    elk_inst *tmp;
1429    bool emit_endif = true;
1430 
1431    assert(p->if_stack_depth > 0);
1432 
1433    if (devinfo->ver >= 8 &&
1434        elk_inst_opcode(p->isa, &p->store[p->if_stack[
1435                              p->if_stack_depth - 1]]) == ELK_OPCODE_ELSE) {
1436       /* Insert a NOP to be specified as join instruction within the
1437        * ELSE block, which is valid for an ELSE instruction with
1438        * branch_ctrl on.  The ELSE instruction will be set to jump
1439        * here instead of to the ENDIF instruction, since attempting to
1440        * do the latter would prevent the ENDIF from being executed in
1441        * some cases due to Wa_220160235, which could cause the program
1442        * to continue running with all channels disabled.
1443        */
1444       elk_NOP(p);
1445    }
1446 
1447    /* In single program flow mode, we can express IF and ELSE instructions
1448     * equivalently as ADD instructions that operate on IP.  On platforms prior
1449     * to Gfx6, flow control instructions cause an implied thread switch, so
1450     * this is a significant savings.
1451     *
1452     * However, on Gfx6, writing to IP doesn't work in single program flow mode
1453     * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
1454     * not be updated by non-flow control instructions.").  And on later
1455     * platforms, there is no significant benefit to converting control flow
1456     * instructions to conditional ADDs.  So we only do this trick on Gfx4 and
1457     * Gfx5.
1458     */
1459    if (devinfo->ver < 6 && p->single_program_flow)
1460       emit_endif = false;
1461 
1462    /*
1463     * A single next_insn() may change the base address of the instruction
1464     * store memory (p->store), so call it first, before turning any saved
1465     * index back into an instruction store pointer.
1466     */
1467    if (emit_endif)
1468       insn = next_insn(p, ELK_OPCODE_ENDIF);
1469 
1470    /* Pop the IF and (optional) ELSE instructions from the stack */
1471    p->if_depth_in_loop[p->loop_stack_depth]--;
1472    tmp = pop_if_stack(p);
1473    if (elk_inst_opcode(p->isa, tmp) == ELK_OPCODE_ELSE) {
1474       else_inst = tmp;
1475       tmp = pop_if_stack(p);
1476    }
1477    if_inst = tmp;
1478 
1479    if (!emit_endif) {
1480       /* ENDIF is useless; don't bother emitting it. */
1481       convert_IF_ELSE_to_ADD(p, if_inst, else_inst);
1482       return;
1483    }
1484 
1485    if (devinfo->ver < 6) {
1486       elk_set_dest(p, insn, retype(elk_null_reg(), ELK_REGISTER_TYPE_D));
1487       elk_set_src0(p, insn, retype(elk_null_reg(), ELK_REGISTER_TYPE_D));
1488       elk_set_src1(p, insn, elk_imm_d(0x0));
1489    } else if (devinfo->ver == 6) {
1490       elk_set_dest(p, insn, elk_imm_w(0));
1491       elk_set_src0(p, insn, retype(elk_null_reg(), ELK_REGISTER_TYPE_D));
1492       elk_set_src1(p, insn, retype(elk_null_reg(), ELK_REGISTER_TYPE_D));
1493    } else if (devinfo->ver == 7) {
1494       elk_set_dest(p, insn, retype(elk_null_reg(), ELK_REGISTER_TYPE_D));
1495       elk_set_src0(p, insn, retype(elk_null_reg(), ELK_REGISTER_TYPE_D));
1496       elk_set_src1(p, insn, elk_imm_w(0));
1497    } else {
1498       elk_set_src0(p, insn, elk_imm_d(0));
1499    }
1500 
1501    elk_inst_set_qtr_control(devinfo, insn, ELK_COMPRESSION_NONE);
1502    elk_inst_set_mask_control(devinfo, insn, ELK_MASK_ENABLE);
1503    if (devinfo->ver < 6)
1504       elk_inst_set_thread_control(devinfo, insn, ELK_THREAD_SWITCH);
1505 
1506    /* Also pop item off the stack in the endif instruction: */
1507    if (devinfo->ver < 6) {
1508       elk_inst_set_gfx4_jump_count(devinfo, insn, 0);
1509       elk_inst_set_gfx4_pop_count(devinfo, insn, 1);
1510    } else if (devinfo->ver == 6) {
1511       elk_inst_set_gfx6_jump_count(devinfo, insn, 2);
1512    } else {
1513       elk_inst_set_jip(devinfo, insn, 2);
1514    }
1515    patch_IF_ELSE(p, if_inst, else_inst, insn);
1516 }
1517 
1518 elk_inst *
1519 elk_BREAK(struct elk_codegen *p)
1520 {
1521    const struct intel_device_info *devinfo = p->devinfo;
1522    elk_inst *insn;
1523 
1524    insn = next_insn(p, ELK_OPCODE_BREAK);
1525    if (devinfo->ver >= 8) {
1526       elk_set_dest(p, insn, retype(elk_null_reg(), ELK_REGISTER_TYPE_D));
1527       elk_set_src0(p, insn, elk_imm_d(0x0));
1528    } else if (devinfo->ver >= 6) {
1529       elk_set_dest(p, insn, retype(elk_null_reg(), ELK_REGISTER_TYPE_D));
1530       elk_set_src0(p, insn, retype(elk_null_reg(), ELK_REGISTER_TYPE_D));
1531       elk_set_src1(p, insn, elk_imm_d(0x0));
1532    } else {
1533       elk_set_dest(p, insn, elk_ip_reg());
1534       elk_set_src0(p, insn, elk_ip_reg());
1535       elk_set_src1(p, insn, elk_imm_d(0x0));
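      /* Pre-Gfx6, BREAK must also pop the per-channel mask stack entries
       * pushed by any IF blocks nested inside the loop being exited.
       */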
1536       elk_inst_set_gfx4_pop_count(devinfo, insn,
1537                                   p->if_depth_in_loop[p->loop_stack_depth]);
1538    }
1539    elk_inst_set_qtr_control(devinfo, insn, ELK_COMPRESSION_NONE);
1540    elk_inst_set_exec_size(devinfo, insn, elk_get_default_exec_size(p));
1541 
1542    return insn;
1543 }
1544 
1545 elk_inst *
1546 elk_CONT(struct elk_codegen *p)
1547 {
1548    const struct intel_device_info *devinfo = p->devinfo;
1549    elk_inst *insn;
1550 
1551    insn = next_insn(p, ELK_OPCODE_CONTINUE);
1552    elk_set_dest(p, insn, elk_ip_reg());
1553    if (devinfo->ver >= 8) {
1554       elk_set_src0(p, insn, elk_imm_d(0x0));
1555    } else {
1556       elk_set_src0(p, insn, elk_ip_reg());
1557       elk_set_src1(p, insn, elk_imm_d(0x0));
1558    }
1559 
1560    if (devinfo->ver < 6) {
1561       elk_inst_set_gfx4_pop_count(devinfo, insn,
1562                                   p->if_depth_in_loop[p->loop_stack_depth]);
1563    }
1564    elk_inst_set_qtr_control(devinfo, insn, ELK_COMPRESSION_NONE);
1565    elk_inst_set_exec_size(devinfo, insn, elk_get_default_exec_size(p));
1566    return insn;
1567 }
1568 
1569 elk_inst *
1570 elk_HALT(struct elk_codegen *p)
1571 {
1572    const struct intel_device_info *devinfo = p->devinfo;
1573    elk_inst *insn;
1574 
1575    insn = next_insn(p, ELK_OPCODE_HALT);
1576    elk_set_dest(p, insn, retype(elk_null_reg(), ELK_REGISTER_TYPE_D));
1577    if (devinfo->ver < 6) {
1578       /* From the Gfx4 PRM:
1579        *
1580        *    "IP register must be put (for example, by the assembler) at <dst>
1581        *    and <src0> locations."
1582        */
1583       elk_set_dest(p, insn, elk_ip_reg());
1584       elk_set_src0(p, insn, elk_ip_reg());
1585       elk_set_src1(p, insn, elk_imm_d(0x0)); /* exitcode updated later. */
1586    } else if (devinfo->ver < 8) {
1587       elk_set_src0(p, insn, retype(elk_null_reg(), ELK_REGISTER_TYPE_D));
1588       elk_set_src1(p, insn, elk_imm_d(0x0)); /* UIP and JIP, updated later. */
1589    } else {
1590       assert(devinfo->ver == 8);
1591       elk_set_src0(p, insn, elk_imm_d(0x0));
1592    }
1593 
1594    elk_inst_set_qtr_control(devinfo, insn, ELK_COMPRESSION_NONE);
1595    elk_inst_set_exec_size(devinfo, insn, elk_get_default_exec_size(p));
1596    return insn;
1597 }
1598 
1599 /* DO/WHILE loop:
1600  *
1601  * The DO/WHILE is just an unterminated loop -- break or continue are
1602  * used for control within the loop.  We have a few ways they can be
1603  * done.
1604  *
1605  * For uniform control flow, the WHILE is just a jump (an ADD ip, ip, jip),
1606  * so no DO instruction is needed.
1607  *
1608  * For non-uniform control flow pre-gfx6, there's a DO instruction to
1609  * push the mask, and a WHILE to jump back, and BREAK to get out and
1610  * pop the mask.
1611  *
1612  * For gfx6, there's no more mask stack, so no need for DO.  WHILE
1613  * just points back to the first instruction of the loop.
1614  */
1615 elk_inst *
1616 elk_DO(struct elk_codegen *p, unsigned execute_size)
1617 {
1618    const struct intel_device_info *devinfo = p->devinfo;
1619 
1620    if (devinfo->ver >= 6 || p->single_program_flow) {
1621       push_loop_stack(p, &p->store[p->nr_insn]);
1622       return &p->store[p->nr_insn];
1623    } else {
1624       elk_inst *insn = next_insn(p, ELK_OPCODE_DO);
1625 
1626       push_loop_stack(p, insn);
1627 
1628       /* Override the defaults for this instruction:
1629        */
1630       elk_set_dest(p, insn, elk_null_reg());
1631       elk_set_src0(p, insn, elk_null_reg());
1632       elk_set_src1(p, insn, elk_null_reg());
1633 
1634       elk_inst_set_qtr_control(devinfo, insn, ELK_COMPRESSION_NONE);
1635       elk_inst_set_exec_size(devinfo, insn, execute_size);
1636       elk_inst_set_pred_control(devinfo, insn, ELK_PREDICATE_NONE);
1637 
1638       return insn;
1639    }
1640 }
1641 
1642 /**
1643  * For pre-gfx6, we patch BREAK/CONT instructions to point at the WHILE
1644  * instruction here.
1645  *
1646  * For gfx6+, see elk_set_uip_jip(), which doesn't care so much about the loop
1647  * nesting, since it can always just point to the end of the block/current loop.
1648  */
1649 static void
1650 elk_patch_break_cont(struct elk_codegen *p, elk_inst *while_inst)
1651 {
1652    const struct intel_device_info *devinfo = p->devinfo;
1653    elk_inst *do_inst = get_inner_do_insn(p);
1654    elk_inst *inst;
1655    unsigned br = elk_jump_scale(devinfo);
1656 
1657    assert(devinfo->ver < 6);
1658 
1659    for (inst = while_inst - 1; inst != do_inst; inst--) {
1660       /* If the jump count is != 0, that means that this instruction has already
1661        * been patched because it's part of a loop inside of the one we're
1662        * patching.
1663        */
1664       if (elk_inst_opcode(p->isa, inst) == ELK_OPCODE_BREAK &&
1665           elk_inst_gfx4_jump_count(devinfo, inst) == 0) {
1666          elk_inst_set_gfx4_jump_count(devinfo, inst, br*((while_inst - inst) + 1));
1667       } else if (elk_inst_opcode(p->isa, inst) == ELK_OPCODE_CONTINUE &&
1668                  elk_inst_gfx4_jump_count(devinfo, inst) == 0) {
1669          elk_inst_set_gfx4_jump_count(devinfo, inst, br * (while_inst - inst));
1670       }
1671    }
1672 }
1673 
1674 elk_inst *
1675 elk_WHILE(struct elk_codegen *p)
1676 {
1677    const struct intel_device_info *devinfo = p->devinfo;
1678    elk_inst *insn, *do_insn;
1679    unsigned br = elk_jump_scale(devinfo);
1680 
1681    if (devinfo->ver >= 6) {
1682       insn = next_insn(p, ELK_OPCODE_WHILE);
1683       do_insn = get_inner_do_insn(p);
1684 
1685       if (devinfo->ver >= 8) {
1686          elk_set_dest(p, insn, retype(elk_null_reg(), ELK_REGISTER_TYPE_D));
1687          elk_set_src0(p, insn, elk_imm_d(0));
1688          elk_inst_set_jip(devinfo, insn, br * (do_insn - insn));
1689       } else if (devinfo->ver == 7) {
1690          elk_set_dest(p, insn, retype(elk_null_reg(), ELK_REGISTER_TYPE_D));
1691          elk_set_src0(p, insn, retype(elk_null_reg(), ELK_REGISTER_TYPE_D));
1692          elk_set_src1(p, insn, elk_imm_w(0));
1693          elk_inst_set_jip(devinfo, insn, br * (do_insn - insn));
1694       } else {
1695          elk_set_dest(p, insn, elk_imm_w(0));
1696          elk_inst_set_gfx6_jump_count(devinfo, insn, br * (do_insn - insn));
1697          elk_set_src0(p, insn, retype(elk_null_reg(), ELK_REGISTER_TYPE_D));
1698          elk_set_src1(p, insn, retype(elk_null_reg(), ELK_REGISTER_TYPE_D));
1699       }
1700 
1701       elk_inst_set_exec_size(devinfo, insn, elk_get_default_exec_size(p));
1702 
1703    } else {
1704       if (p->single_program_flow) {
1705 	 insn = next_insn(p, ELK_OPCODE_ADD);
1706          do_insn = get_inner_do_insn(p);
1707 
1708 	 elk_set_dest(p, insn, elk_ip_reg());
1709 	 elk_set_src0(p, insn, elk_ip_reg());
1710 	 elk_set_src1(p, insn, elk_imm_d((do_insn - insn) * 16));
1711          elk_inst_set_exec_size(devinfo, insn, ELK_EXECUTE_1);
1712       } else {
1713 	 insn = next_insn(p, ELK_OPCODE_WHILE);
1714          do_insn = get_inner_do_insn(p);
1715 
1716          assert(elk_inst_opcode(p->isa, do_insn) == ELK_OPCODE_DO);
1717 
1718 	 elk_set_dest(p, insn, elk_ip_reg());
1719 	 elk_set_src0(p, insn, elk_ip_reg());
1720 	 elk_set_src1(p, insn, elk_imm_d(0));
1721 
1722          elk_inst_set_exec_size(devinfo, insn, elk_inst_exec_size(devinfo, do_insn));
1723          elk_inst_set_gfx4_jump_count(devinfo, insn, br * (do_insn - insn + 1));
1724          elk_inst_set_gfx4_pop_count(devinfo, insn, 0);
1725 
1726 	 elk_patch_break_cont(p, insn);
1727       }
1728    }
1729    elk_inst_set_qtr_control(devinfo, insn, ELK_COMPRESSION_NONE);
1730 
1731    p->loop_stack_depth--;
1732 
1733    return insn;
1734 }
1735 
1736 /* FORWARD JUMPS:
1737  */
1738 void elk_land_fwd_jump(struct elk_codegen *p, int jmp_insn_idx)
1739 {
1740    const struct intel_device_info *devinfo = p->devinfo;
1741    elk_inst *jmp_insn = &p->store[jmp_insn_idx];
1742    unsigned jmpi = 1;
1743 
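   /* JMPI jump counts are in 64-bit chunks on Gfx5+ (two per instruction) and
    * in whole instructions on Gfx4.
    */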
1744    if (devinfo->ver >= 5)
1745       jmpi = 2;
1746 
1747    assert(elk_inst_opcode(p->isa, jmp_insn) == ELK_OPCODE_JMPI);
1748    assert(elk_inst_src1_reg_file(devinfo, jmp_insn) == ELK_IMMEDIATE_VALUE);
1749 
1750    elk_inst_set_gfx4_jump_count(devinfo, jmp_insn,
1751                                 jmpi * (p->nr_insn - jmp_insn_idx - 1));
1752 }
1753 
1754 /* To integrate with the above, it makes sense that the comparison
1755  * instruction should populate the flag register.  It might be simpler
1756  * just to use the flag reg for most WM tasks?
1757  */
1758 void elk_CMP(struct elk_codegen *p,
1759 	     struct elk_reg dest,
1760 	     unsigned conditional,
1761 	     struct elk_reg src0,
1762 	     struct elk_reg src1)
1763 {
1764    const struct intel_device_info *devinfo = p->devinfo;
1765    elk_inst *insn = next_insn(p, ELK_OPCODE_CMP);
1766 
1767    elk_inst_set_cond_modifier(devinfo, insn, conditional);
1768    elk_set_dest(p, insn, dest);
1769    elk_set_src0(p, insn, src0);
1770    elk_set_src1(p, insn, src1);
1771 
1772    /* Item WaCMPInstNullDstForcesThreadSwitch in the Haswell Bspec workarounds
1773     * page says:
1774     *    "Any CMP instruction with a null destination must use a {switch}."
1775     *
1776     * It also applies to other Gfx7 platforms (IVB, BYT) even though it isn't
1777     * mentioned on their work-arounds pages.
1778     */
1779    if (devinfo->ver == 7) {
1780       if (dest.file == ELK_ARCHITECTURE_REGISTER_FILE &&
1781           dest.nr == ELK_ARF_NULL) {
1782          elk_inst_set_thread_control(devinfo, insn, ELK_THREAD_SWITCH);
1783       }
1784    }
1785 }
1786 
1787 void elk_CMPN(struct elk_codegen *p,
1788               struct elk_reg dest,
1789               unsigned conditional,
1790               struct elk_reg src0,
1791               struct elk_reg src1)
1792 {
1793    const struct intel_device_info *devinfo = p->devinfo;
1794    elk_inst *insn = next_insn(p, ELK_OPCODE_CMPN);
1795 
1796    elk_inst_set_cond_modifier(devinfo, insn, conditional);
1797    elk_set_dest(p, insn, dest);
1798    elk_set_src0(p, insn, src0);
1799    elk_set_src1(p, insn, src1);
1800 
1801    /* Page 166 of the Ivy Bridge PRM Volume 4 part 3 (Execution Unit ISA)
1802     * says:
1803     *
1804     *    If the destination is the null register, the {Switch} instruction
1805     *    option must be used.
1806     *
1807     * Page 77 of the Haswell PRM Volume 2b contains the same text.
1808     */
1809    if (devinfo->ver == 7) {
1810       if (dest.file == ELK_ARCHITECTURE_REGISTER_FILE &&
1811           dest.nr == ELK_ARF_NULL) {
1812          elk_inst_set_thread_control(devinfo, insn, ELK_THREAD_SWITCH);
1813       }
1814    }
1815 }
1816 
1817 /***********************************************************************
1818  * Helpers for the various SEND message types:
1819  */
1820 
1821 /** Extended math function, float[8].
1822  */
1823 void elk_gfx4_math(struct elk_codegen *p,
1824 	       struct elk_reg dest,
1825 	       unsigned function,
1826 	       unsigned msg_reg_nr,
1827 	       struct elk_reg src,
1828 	       unsigned precision )
1829 {
1830    const struct intel_device_info *devinfo = p->devinfo;
1831    elk_inst *insn = next_insn(p, ELK_OPCODE_SEND);
1832    unsigned data_type;
1833    if (has_scalar_region(src)) {
1834       data_type = ELK_MATH_DATA_SCALAR;
1835    } else {
1836       data_type = ELK_MATH_DATA_VECTOR;
1837    }
1838 
1839    assert(devinfo->ver < 6);
1840 
1841    /* Example code doesn't set predicate_control for send
1842     * instructions.
1843     */
1844    elk_inst_set_pred_control(devinfo, insn, 0);
1845    elk_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
1846 
1847    elk_set_dest(p, insn, dest);
1848    elk_set_src0(p, insn, src);
1849    elk_set_math_message(p,
1850                         insn,
1851                         function,
1852                         src.type == ELK_REGISTER_TYPE_D,
1853                         precision,
1854                         data_type);
1855 }
1856 
1857 void elk_gfx6_math(struct elk_codegen *p,
1858 	       struct elk_reg dest,
1859 	       unsigned function,
1860 	       struct elk_reg src0,
1861 	       struct elk_reg src1)
1862 {
1863    const struct intel_device_info *devinfo = p->devinfo;
1864    elk_inst *insn = next_insn(p, ELK_OPCODE_MATH);
1865 
1866    assert(devinfo->ver >= 6);
1867 
1868    assert(dest.file == ELK_GENERAL_REGISTER_FILE ||
1869           (devinfo->ver >= 7 && dest.file == ELK_MESSAGE_REGISTER_FILE));
1870 
1871    assert(dest.hstride == ELK_HORIZONTAL_STRIDE_1);
1872    if (devinfo->ver == 6) {
1873       assert(src0.hstride == ELK_HORIZONTAL_STRIDE_1);
1874       assert(src1.hstride == ELK_HORIZONTAL_STRIDE_1);
1875    }
1876 
1877    if (function == ELK_MATH_FUNCTION_INT_DIV_QUOTIENT ||
1878        function == ELK_MATH_FUNCTION_INT_DIV_REMAINDER ||
1879        function == ELK_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
1880       assert(src0.type != ELK_REGISTER_TYPE_F);
1881       assert(src1.type != ELK_REGISTER_TYPE_F);
1882       assert(src1.file == ELK_GENERAL_REGISTER_FILE ||
1883              (devinfo->ver >= 8 && src1.file == ELK_IMMEDIATE_VALUE));
1884       /* From BSpec 6647/47428 "[Instruction] Extended Math Function":
1885        *     INT DIV function does not support source modifiers.
1886        */
1887       assert(!src0.negate);
1888       assert(!src0.abs);
1889       assert(!src1.negate);
1890       assert(!src1.abs);
1891    } else {
1892       assert(src0.type == ELK_REGISTER_TYPE_F);
1893       assert(src1.type == ELK_REGISTER_TYPE_F);
1894    }
1895 
1896    /* Source modifiers are ignored for extended math instructions on Gfx6. */
1897    if (devinfo->ver == 6) {
1898       assert(!src0.negate);
1899       assert(!src0.abs);
1900       assert(!src1.negate);
1901       assert(!src1.abs);
1902    }
1903 
1904    elk_inst_set_math_function(devinfo, insn, function);
1905 
1906    elk_set_dest(p, insn, dest);
1907    elk_set_src0(p, insn, src0);
1908    elk_set_src1(p, insn, src1);
1909 }
1910 
1911 /**
1912  * Return the right surface index to access the thread scratch space using
1913  * stateless dataport messages.
1914  */
1915 unsigned
1916 elk_scratch_surface_idx(const struct elk_codegen *p)
1917 {
1918    /* The scratch space is thread-local so IA coherency is unnecessary. */
1919    if (p->devinfo->ver >= 8)
1920       return GFX8_BTI_STATELESS_NON_COHERENT;
1921    else
1922       return ELK_BTI_STATELESS;
1923 }
1924 
1925 /**
1926  * Write a block of OWORDs (half a GRF each) to the scratch buffer,
1927  * using a constant offset per channel.
1928  *
1929  * The offset must be aligned to oword size (16 bytes).  Used for
1930  * register spilling.
1931  */
1932 void elk_oword_block_write_scratch(struct elk_codegen *p,
1933 				   struct elk_reg mrf,
1934 				   int num_regs,
1935 				   unsigned offset)
1936 {
1937    const struct intel_device_info *devinfo = p->devinfo;
1938    const unsigned target_cache =
1939       (devinfo->ver >= 7 ? GFX7_SFID_DATAPORT_DATA_CACHE :
1940        devinfo->ver >= 6 ? GFX6_SFID_DATAPORT_RENDER_CACHE :
1941        ELK_SFID_DATAPORT_WRITE);
1942    uint32_t msg_type;
1943 
1944    if (devinfo->ver >= 6)
1945       offset /= 16;
1946 
1947    mrf = retype(mrf, ELK_REGISTER_TYPE_UD);
1948 
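   /* Message length: one header register plus num_regs registers of data. */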
1949    const unsigned mlen = 1 + num_regs;
1950 
1951    /* Set up the message header.  This is g0, with g0.2 filled with
1952     * the offset.  We don't want to leave our offset around in g0 or
1953     * it'll screw up texture samples, so set it up inside the message
1954     * reg.
1955     */
1956    {
1957       elk_push_insn_state(p);
1958       elk_set_default_exec_size(p, ELK_EXECUTE_8);
1959       elk_set_default_mask_control(p, ELK_MASK_DISABLE);
1960       elk_set_default_compression_control(p, ELK_COMPRESSION_NONE);
1961 
1962       elk_MOV(p, mrf, retype(elk_vec8_grf(0, 0), ELK_REGISTER_TYPE_UD));
1963 
1964       /* set message header global offset field (reg 0, element 2) */
1965       elk_set_default_exec_size(p, ELK_EXECUTE_1);
1966       elk_MOV(p,
1967 	      retype(elk_vec1_reg(ELK_MESSAGE_REGISTER_FILE,
1968 				  mrf.nr,
1969 				  2), ELK_REGISTER_TYPE_UD),
1970 	      elk_imm_ud(offset));
1971 
1972       elk_pop_insn_state(p);
1973    }
1974 
1975    {
1976       struct elk_reg dest;
1977       elk_inst *insn = next_insn(p, ELK_OPCODE_SEND);
1978       int send_commit_msg;
1979       struct elk_reg src_header = retype(elk_vec8_grf(0, 0),
1980 					 ELK_REGISTER_TYPE_UW);
1981 
1982       elk_inst_set_sfid(devinfo, insn, target_cache);
1983       elk_inst_set_compression(devinfo, insn, false);
1984 
1985       if (elk_inst_exec_size(devinfo, insn) >= 16)
1986 	 src_header = vec16(src_header);
1987 
1988       assert(elk_inst_pred_control(devinfo, insn) == ELK_PREDICATE_NONE);
1989       if (devinfo->ver < 6)
1990          elk_inst_set_base_mrf(devinfo, insn, mrf.nr);
1991 
1992       /* Until gfx6, writes followed by reads from the same location
1993        * are not guaranteed to be ordered unless write_commit is set.
1994        * If set, then a no-op write is issued to the destination
1995        * register to set a dependency, and a read from the destination
1996        * can be used to ensure the ordering.
1997        *
1998        * For gfx6, only writes between different threads need ordering
1999        * protection.  Our use of DP writes is all about register
2000        * spilling within a thread.
2001        */
2002       if (devinfo->ver >= 6) {
2003 	 dest = retype(vec16(elk_null_reg()), ELK_REGISTER_TYPE_UW);
2004 	 send_commit_msg = 0;
2005       } else {
2006 	 dest = src_header;
2007 	 send_commit_msg = 1;
2008       }
2009 
2010       elk_set_dest(p, insn, dest);
2011       if (devinfo->ver >= 6) {
2012 	 elk_set_src0(p, insn, mrf);
2013       } else {
2014 	 elk_set_src0(p, insn, elk_null_reg());
2015       }
2016 
2017       if (devinfo->ver >= 6)
2018 	 msg_type = GFX6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
2019       else
2020 	 msg_type = ELK_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
2021 
2022       elk_set_desc(p, insn,
2023                    elk_message_desc(devinfo, mlen, send_commit_msg, true) |
2024                    elk_dp_write_desc(devinfo, elk_scratch_surface_idx(p),
2025                                      ELK_DATAPORT_OWORD_BLOCK_DWORDS(num_regs * 8),
2026                                      msg_type, send_commit_msg));
2027    }
2028 }
2029 
2030 
2031 /**
2032  * Read a block of owords (half a GRF each) from the scratch buffer
2033  * using a constant index per channel.
2034  *
2035  * Offset must be aligned to oword size (16 bytes).  Used for register
2036  * spilling.
2037  */
2038 void
2039 elk_oword_block_read_scratch(struct elk_codegen *p,
2040 			     struct elk_reg dest,
2041 			     struct elk_reg mrf,
2042 			     int num_regs,
2043 			     unsigned offset)
2044 {
2045    const struct intel_device_info *devinfo = p->devinfo;
2046 
2047    if (devinfo->ver >= 6)
2048       offset /= 16;
2049 
2050    if (p->devinfo->ver >= 7) {
2051       /* On gen 7 and above, we no longer have message registers and we can
2052        * send from any register we want.  By using the destination register
2053        * for the message, we guarantee that the implied message write won't
2054        * accidentally overwrite anything.  This has been a problem because
2055        * the MRF registers and source for the final FB write are both fixed
2056        * and may overlap.
2057        */
2058       mrf = retype(dest, ELK_REGISTER_TYPE_UD);
2059    } else {
2060       mrf = retype(mrf, ELK_REGISTER_TYPE_UD);
2061    }
2062    dest = retype(dest, ELK_REGISTER_TYPE_UW);
2063 
2064    const unsigned rlen = num_regs;
2065    const unsigned target_cache =
2066       (devinfo->ver >= 7 ? GFX7_SFID_DATAPORT_DATA_CACHE :
2067        devinfo->ver >= 6 ? GFX6_SFID_DATAPORT_RENDER_CACHE :
2068        ELK_SFID_DATAPORT_READ);
2069 
2070    {
2071       elk_push_insn_state(p);
2072       elk_set_default_exec_size(p, ELK_EXECUTE_8);
2073       elk_set_default_compression_control(p, ELK_COMPRESSION_NONE);
2074       elk_set_default_mask_control(p, ELK_MASK_DISABLE);
2075 
2076       elk_MOV(p, mrf, retype(elk_vec8_grf(0, 0), ELK_REGISTER_TYPE_UD));
2077 
2078       /* set message header global offset field (reg 0, element 2) */
2079       elk_set_default_exec_size(p, ELK_EXECUTE_1);
2080       elk_MOV(p, get_element_ud(mrf, 2), elk_imm_ud(offset));
2081 
2082       elk_pop_insn_state(p);
2083    }
2084 
2085    {
2086       elk_inst *insn = next_insn(p, ELK_OPCODE_SEND);
2087 
2088       elk_inst_set_sfid(devinfo, insn, target_cache);
2089       assert(elk_inst_pred_control(devinfo, insn) == 0);
2090       elk_inst_set_compression(devinfo, insn, false);
2091 
2092       elk_set_dest(p, insn, dest);	/* UW? */
2093       if (devinfo->ver >= 6) {
2094 	 elk_set_src0(p, insn, mrf);
2095       } else {
2096 	 elk_set_src0(p, insn, elk_null_reg());
2097          elk_inst_set_base_mrf(devinfo, insn, mrf.nr);
2098       }
2099 
2100       elk_set_desc(p, insn,
2101                    elk_message_desc(devinfo, 1, rlen, true) |
2102                    elk_dp_read_desc(devinfo, elk_scratch_surface_idx(p),
2103                                     ELK_DATAPORT_OWORD_BLOCK_DWORDS(num_regs * 8),
2104                                     ELK_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
2105                                     ELK_DATAPORT_READ_TARGET_RENDER_CACHE));
2106    }
2107 }
2108 
2109 void
2110 elk_gfx7_block_read_scratch(struct elk_codegen *p,
2111                         struct elk_reg dest,
2112                         int num_regs,
2113                         unsigned offset)
2114 {
2115    elk_inst *insn = next_insn(p, ELK_OPCODE_SEND);
2116    assert(elk_inst_pred_control(p->devinfo, insn) == ELK_PREDICATE_NONE);
2117 
2118    elk_set_dest(p, insn, retype(dest, ELK_REGISTER_TYPE_UW));
2119 
2120    /* The HW requires that the header is present; this is to get the g0.5
2121     * scratch offset.
2122     */
2123    elk_set_src0(p, insn, elk_vec8_grf(0, 0));
2124 
2125    /* According to the docs, offset is "A 12-bit HWord offset into the memory
2126     * Immediate Memory buffer as specified by binding table 0xFF."  An HWORD
2127     * is 32 bytes, which happens to be the size of a register.
2128     */
2129    offset /= REG_SIZE;
2130    assert(offset < (1 << 12));
2131 
2132    gfx7_set_dp_scratch_message(p, insn,
2133                                false, /* scratch read */
2134                                false, /* OWords */
2135                                false, /* invalidate after read */
2136                                num_regs,
2137                                offset,
2138                                1,        /* mlen: just g0 */
2139                                num_regs, /* rlen */
2140                                true);    /* header present */
2141 }
2142 
2143 /**
2144  * Read float[4] vectors from the data port constant cache.
2145  * Location (in buffer) should be a multiple of 16.
2146  * Used for fetching shader constants.
2147  */
2148 void elk_oword_block_read(struct elk_codegen *p,
2149 			  struct elk_reg dest,
2150 			  struct elk_reg mrf,
2151 			  uint32_t offset,
2152 			  uint32_t bind_table_index)
2153 {
2154    const struct intel_device_info *devinfo = p->devinfo;
2155    const unsigned target_cache =
2156       (devinfo->ver >= 6 ? GFX6_SFID_DATAPORT_CONSTANT_CACHE :
2157        ELK_SFID_DATAPORT_READ);
2158    const unsigned exec_size = 1 << elk_get_default_exec_size(p);
2159 
2160    /* On newer hardware, offset is in units of owords. */
2161    if (devinfo->ver >= 6)
2162       offset /= 16;
2163 
2164    mrf = retype(mrf, ELK_REGISTER_TYPE_UD);
2165 
2166    elk_push_insn_state(p);
2167    elk_set_default_predicate_control(p, ELK_PREDICATE_NONE);
2168    elk_set_default_flag_reg(p, 0, 0);
2169    elk_set_default_compression_control(p, ELK_COMPRESSION_NONE);
2170    elk_set_default_mask_control(p, ELK_MASK_DISABLE);
2171 
2172    elk_push_insn_state(p);
2173    elk_set_default_exec_size(p, ELK_EXECUTE_8);
2174    elk_MOV(p, mrf, retype(elk_vec8_grf(0, 0), ELK_REGISTER_TYPE_UD));
2175 
2176    /* set message header global offset field (reg 0, element 2) */
2177    elk_set_default_exec_size(p, ELK_EXECUTE_1);
2178    elk_MOV(p,
2179 	   retype(elk_vec1_reg(ELK_MESSAGE_REGISTER_FILE,
2180 			       mrf.nr,
2181 			       2), ELK_REGISTER_TYPE_UD),
2182 	   elk_imm_ud(offset));
2183    elk_pop_insn_state(p);
2184 
2185    elk_inst *insn = next_insn(p, ELK_OPCODE_SEND);
2186 
2187    elk_inst_set_sfid(devinfo, insn, target_cache);
2188 
2189    /* cast dest to a uword[8] vector */
2190    dest = retype(vec8(dest), ELK_REGISTER_TYPE_UW);
2191 
2192    elk_set_dest(p, insn, dest);
2193    if (devinfo->ver >= 6) {
2194       elk_set_src0(p, insn, mrf);
2195    } else {
2196       elk_set_src0(p, insn, elk_null_reg());
2197       elk_inst_set_base_mrf(devinfo, insn, mrf.nr);
2198    }
2199 
2200    elk_set_desc(p, insn,
2201                 elk_message_desc(devinfo, 1, DIV_ROUND_UP(exec_size, 8), true) |
2202                 elk_dp_read_desc(devinfo, bind_table_index,
2203                                  ELK_DATAPORT_OWORD_BLOCK_DWORDS(exec_size),
2204                                  ELK_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
2205                                  ELK_DATAPORT_READ_TARGET_DATA_CACHE));
2206 
2207    elk_pop_insn_state(p);
2208 }
2209 
2210 elk_inst *
2211 elk_fb_WRITE(struct elk_codegen *p,
2212              struct elk_reg payload,
2213              struct elk_reg implied_header,
2214              unsigned msg_control,
2215              unsigned binding_table_index,
2216              unsigned msg_length,
2217              unsigned response_length,
2218              bool eot,
2219              bool last_render_target,
2220              bool header_present)
2221 {
2222    const struct intel_device_info *devinfo = p->devinfo;
2223    const unsigned target_cache =
2224       (devinfo->ver >= 6 ? GFX6_SFID_DATAPORT_RENDER_CACHE :
2225        ELK_SFID_DATAPORT_WRITE);
2226    elk_inst *insn;
2227    struct elk_reg dest, src0;
2228 
2229    if (elk_get_default_exec_size(p) >= ELK_EXECUTE_16)
2230       dest = retype(vec16(elk_null_reg()), ELK_REGISTER_TYPE_UW);
2231    else
2232       dest = retype(vec8(elk_null_reg()), ELK_REGISTER_TYPE_UW);
2233 
2234    if (devinfo->ver >= 6) {
2235       insn = next_insn(p, ELK_OPCODE_SENDC);
2236    } else {
2237       insn = next_insn(p, ELK_OPCODE_SEND);
2238    }
2239    elk_inst_set_sfid(devinfo, insn, target_cache);
2240    elk_inst_set_compression(devinfo, insn, false);
2241 
2242    if (devinfo->ver >= 6) {
2243       /* headerless version, just submit color payload */
2244       src0 = payload;
2245    } else {
2246       assert(payload.file == ELK_MESSAGE_REGISTER_FILE);
2247       elk_inst_set_base_mrf(devinfo, insn, payload.nr);
2248       src0 = implied_header;
2249    }
2250 
2251    elk_set_dest(p, insn, dest);
2252    elk_set_src0(p, insn, src0);
2253    elk_set_desc(p, insn,
2254                 elk_message_desc(devinfo, msg_length, response_length,
2255                                  header_present) |
2256                 elk_fb_write_desc(devinfo, binding_table_index, msg_control,
2257                                   last_render_target,
2258                                   false /* coarse_write */));
2259    elk_inst_set_eot(devinfo, insn, eot);
2260 
2261    return insn;
2262 }
2263 
2264 /**
2265  * Texture sample instruction.
2266  * Note: the msg_type plus msg_length values determine exactly what kind
2267  * of sampling operation is performed.  See volume 4, page 161 of docs.
2268  */
2269 void elk_SAMPLE(struct elk_codegen *p,
2270 		struct elk_reg dest,
2271 		unsigned msg_reg_nr,
2272 		struct elk_reg src0,
2273 		unsigned binding_table_index,
2274 		unsigned sampler,
2275 		unsigned msg_type,
2276 		unsigned response_length,
2277 		unsigned msg_length,
2278 		unsigned header_present,
2279 		unsigned simd_mode,
2280 		unsigned return_format)
2281 {
2282    const struct intel_device_info *devinfo = p->devinfo;
2283    elk_inst *insn;
2284 
2285    if (msg_reg_nr != -1)
2286       elk_gfx6_resolve_implied_move(p, &src0, msg_reg_nr);
2287 
2288    insn = next_insn(p, ELK_OPCODE_SEND);
2289    elk_inst_set_sfid(devinfo, insn, ELK_SFID_SAMPLER);
2290    elk_inst_set_pred_control(devinfo, insn, ELK_PREDICATE_NONE); /* XXX */
2291 
2292    /* From the 965 PRM (volume 4, part 1, section 14.2.41):
2293     *
2294     *    "Instruction compression is not allowed for this instruction (that
2295     *     is, send). The hardware behavior is undefined if this instruction is
2296     *     set as compressed. However, compress control can be set to "SecHalf"
2297     *     to affect the EMask generation."
2298     *
2299     * No similar wording is found in later PRMs, but there are examples
2300     * utilizing send with SecHalf.  More importantly, SIMD8 sampler messages
2301     * are allowed in SIMD16 mode and they could not work without SecHalf.  For
2302     * these reasons, we allow ELK_COMPRESSION_2NDHALF here.
2303     */
2304    elk_inst_set_compression(devinfo, insn, false);
2305 
2306    if (devinfo->ver < 6)
2307       elk_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
2308 
2309    elk_set_dest(p, insn, dest);
2310    elk_set_src0(p, insn, src0);
2311    elk_set_desc(p, insn,
2312                 elk_message_desc(devinfo, msg_length, response_length,
2313                                  header_present) |
2314                 elk_sampler_desc(devinfo, binding_table_index, sampler,
2315                                  msg_type, simd_mode, return_format));
2316 }
2317 
2318 /* Adjust the message header's sampler state pointer to
2319  * select the correct group of 16 samplers.
2320  */
2321 void elk_adjust_sampler_state_pointer(struct elk_codegen *p,
2322                                       struct elk_reg header,
2323                                       struct elk_reg sampler_index)
2324 {
2325    /* The "Sampler Index" field can only store values between 0 and 15.
2326     * However, we can add an offset to the "Sampler State Pointer"
2327     * field, effectively selecting a different set of 16 samplers.
2328     *
2329     * The "Sampler State Pointer" needs to be aligned to a 32-byte
2330     * offset, and each sampler state is only 16-bytes, so we can't
2331     * exclusively use the offset - we have to use both.
2332     */
2333 
2334    const struct intel_device_info *devinfo = p->devinfo;
2335 
2336    if (sampler_index.file == ELK_IMMEDIATE_VALUE) {
2337       const int sampler_state_size = 16; /* 16 bytes */
2338       uint32_t sampler = sampler_index.ud;
2339 
2340       if (sampler >= 16) {
2341          assert(devinfo->verx10 >= 75);
2342          elk_ADD(p,
2343                  get_element_ud(header, 3),
2344                  get_element_ud(elk_vec8_grf(0, 0), 3),
2345                  elk_imm_ud(16 * (sampler / 16) * sampler_state_size));
2346       }
2347    } else {
2348       /* Non-const sampler array indexing case */
2349       if (devinfo->verx10 <= 70) {
2350          return;
2351       }
2352 
2353       struct elk_reg temp = get_element_ud(header, 3);
2354 
2355       elk_push_insn_state(p);
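      /* (sampler & 0xf0) << 4 == (sampler / 16) * 16 * 16 bytes, i.e. the
       * same sampler-state offset computed in the immediate case above.
       */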
2356       elk_AND(p, temp, get_element_ud(sampler_index, 0), elk_imm_ud(0x0f0));
2357       elk_SHL(p, temp, temp, elk_imm_ud(4));
2358       elk_ADD(p,
2359               get_element_ud(header, 3),
2360               get_element_ud(elk_vec8_grf(0, 0), 3),
2361               temp);
2362       elk_pop_insn_state(p);
2363    }
2364 }
2365 
2366 /* All these variables are pretty confusing - we might be better off
2367  * using bitmasks and macros for this, in the old style.  Or perhaps
2368  * just having the caller instantiate the fields in dword3 itself.
2369  */
2370 void elk_urb_WRITE(struct elk_codegen *p,
2371 		   struct elk_reg dest,
2372 		   unsigned msg_reg_nr,
2373 		   struct elk_reg src0,
2374                    enum elk_urb_write_flags flags,
2375 		   unsigned msg_length,
2376 		   unsigned response_length,
2377 		   unsigned offset,
2378 		   unsigned swizzle)
2379 {
2380    const struct intel_device_info *devinfo = p->devinfo;
2381    elk_inst *insn;
2382 
2383    elk_gfx6_resolve_implied_move(p, &src0, msg_reg_nr);
2384 
2385    if (devinfo->ver >= 7 && !(flags & ELK_URB_WRITE_USE_CHANNEL_MASKS)) {
2386       /* Enable Channel Masks in the URB_WRITE_HWORD message header */
2387       elk_push_insn_state(p);
2388       elk_set_default_access_mode(p, ELK_ALIGN_1);
2389       elk_set_default_mask_control(p, ELK_MASK_DISABLE);
2390       elk_set_default_exec_size(p, ELK_EXECUTE_1);
2391       elk_OR(p, retype(elk_vec1_reg(ELK_MESSAGE_REGISTER_FILE, msg_reg_nr, 5),
2392 		       ELK_REGISTER_TYPE_UD),
2393 	        retype(elk_vec1_grf(0, 5), ELK_REGISTER_TYPE_UD),
2394 		elk_imm_ud(0xff00));
2395       elk_pop_insn_state(p);
2396    }
2397 
2398    insn = next_insn(p, ELK_OPCODE_SEND);
2399 
2400    assert(msg_length < ELK_MAX_MRF(devinfo->ver));
2401 
2402    elk_set_dest(p, insn, dest);
2403    elk_set_src0(p, insn, src0);
2404    elk_set_src1(p, insn, elk_imm_d(0));
2405 
2406    if (devinfo->ver < 6)
2407       elk_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
2408 
2409    elk_set_urb_message(p,
2410 		       insn,
2411 		       flags,
2412 		       msg_length,
2413 		       response_length,
2414 		       offset,
2415 		       swizzle);
2416 }
2417 
2418 void
2419 elk_send_indirect_message(struct elk_codegen *p,
2420                           unsigned sfid,
2421                           struct elk_reg dst,
2422                           struct elk_reg payload,
2423                           struct elk_reg desc,
2424                           unsigned desc_imm,
2425                           bool eot)
2426 {
2427    const struct intel_device_info *devinfo = p->devinfo;
2428    struct elk_inst *send;
2429 
2430    dst = retype(dst, ELK_REGISTER_TYPE_UW);
2431 
2432    assert(desc.type == ELK_REGISTER_TYPE_UD);
2433 
2434    if (desc.file == ELK_IMMEDIATE_VALUE) {
2435       send = next_insn(p, ELK_OPCODE_SEND);
2436       elk_set_src0(p, send, retype(payload, ELK_REGISTER_TYPE_UD));
2437       elk_set_desc(p, send, desc.ud | desc_imm);
2438    } else {
2439       struct elk_reg addr = retype(elk_address_reg(0), ELK_REGISTER_TYPE_UD);
2440 
2441       elk_push_insn_state(p);
2442       elk_set_default_access_mode(p, ELK_ALIGN_1);
2443       elk_set_default_mask_control(p, ELK_MASK_DISABLE);
2444       elk_set_default_exec_size(p, ELK_EXECUTE_1);
2445       elk_set_default_predicate_control(p, ELK_PREDICATE_NONE);
2446       elk_set_default_flag_reg(p, 0, 0);
2447 
2448       /* Load the indirect descriptor to an address register using OR so the
2449        * caller can specify additional descriptor bits with the desc_imm
2450        * immediate.
2451        */
2452       elk_OR(p, addr, desc, elk_imm_ud(desc_imm));
2453 
2454       elk_pop_insn_state(p);
2455 
2456       send = next_insn(p, ELK_OPCODE_SEND);
2457       elk_set_src0(p, send, retype(payload, ELK_REGISTER_TYPE_UD));
2458       elk_set_src1(p, send, addr);
2459    }
2460 
2461    elk_set_dest(p, send, dst);
2462    elk_inst_set_sfid(devinfo, send, sfid);
2463    elk_inst_set_eot(devinfo, send, eot);
2464 }
2465 
2466 static void
2467 elk_send_indirect_surface_message(struct elk_codegen *p,
2468                                   unsigned sfid,
2469                                   struct elk_reg dst,
2470                                   struct elk_reg payload,
2471                                   struct elk_reg surface,
2472                                   unsigned desc_imm)
2473 {
2474    if (surface.file != ELK_IMMEDIATE_VALUE) {
2475       struct elk_reg addr = retype(elk_address_reg(0), ELK_REGISTER_TYPE_UD);
2476 
2477       elk_push_insn_state(p);
2478       elk_set_default_access_mode(p, ELK_ALIGN_1);
2479       elk_set_default_mask_control(p, ELK_MASK_DISABLE);
2480       elk_set_default_exec_size(p, ELK_EXECUTE_1);
2481       elk_set_default_predicate_control(p, ELK_PREDICATE_NONE);
2482       elk_set_default_flag_reg(p, 0, 0);
2483 
2484       /* Mask out invalid bits from the surface index to avoid hangs e.g. when
2485        * some surface array is accessed out of bounds.
2486        */
2487       elk_AND(p, addr,
2488               suboffset(vec1(retype(surface, ELK_REGISTER_TYPE_UD)),
2489                         ELK_GET_SWZ(surface.swizzle, 0)),
2490               elk_imm_ud(0xff));
2491 
2492       elk_pop_insn_state(p);
2493 
2494       surface = addr;
2495    }
2496 
2497    elk_send_indirect_message(p, sfid, dst, payload, surface, desc_imm, false);
2498 }
2499 
2500 static bool
2501 while_jumps_before_offset(const struct intel_device_info *devinfo,
2502                           elk_inst *insn, int while_offset, int start_offset)
2503 {
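   /* A loop-closing WHILE has a negative jump delta; scale converts it from
    * jump units back to bytes so it can be compared with store offsets.
    */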
2504    int scale = 16 / elk_jump_scale(devinfo);
2505    int jip = devinfo->ver == 6 ? elk_inst_gfx6_jump_count(devinfo, insn)
2506                                : elk_inst_jip(devinfo, insn);
2507    assert(jip < 0);
2508    return while_offset + jip * scale <= start_offset;
2509 }
2510 
2511 
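/* Scan forward from start_offset for the instruction that ends the current
 * block: an ELSE, ENDIF or HALT at the same nesting depth, or the WHILE
 * closing the innermost loop.  Returns 0 if no block end is found.
 */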
2512 static int
2513 elk_find_next_block_end(struct elk_codegen *p, int start_offset)
2514 {
2515    int offset;
2516    void *store = p->store;
2517    const struct intel_device_info *devinfo = p->devinfo;
2518 
2519    int depth = 0;
2520 
2521    for (offset = next_offset(devinfo, store, start_offset);
2522         offset < p->next_insn_offset;
2523         offset = next_offset(devinfo, store, offset)) {
2524       elk_inst *insn = store + offset;
2525 
2526       switch (elk_inst_opcode(p->isa, insn)) {
2527       case ELK_OPCODE_IF:
2528          depth++;
2529          break;
2530       case ELK_OPCODE_ENDIF:
2531          if (depth == 0)
2532             return offset;
2533          depth--;
2534          break;
2535       case ELK_OPCODE_WHILE:
2536          /* If the while doesn't jump before our instruction, it's the end
2537           * of a sibling do...while loop.  Ignore it.
2538           */
2539          if (!while_jumps_before_offset(devinfo, insn, offset, start_offset))
2540             continue;
2541          FALLTHROUGH;
2542       case ELK_OPCODE_ELSE:
2543       case ELK_OPCODE_HALT:
2544          if (depth == 0)
2545             return offset;
2546          break;
2547       default:
2548          break;
2549       }
2550    }
2551 
2552    return 0;
2553 }
2554 
2555 /* There is no DO instruction on gfx6, so to find the end of the loop
2556  * we have to see if the loop is jumping back before our start
2557  * instruction.
2558  */
2559 static int
2560 elk_find_loop_end(struct elk_codegen *p, int start_offset)
2561 {
2562    const struct intel_device_info *devinfo = p->devinfo;
2563    int offset;
2564    void *store = p->store;
2565 
2566    assert(devinfo->ver >= 6);
2567 
2568    /* Always start after the instruction (such as a WHILE) we're trying to fix
2569     * up.
2570     */
2571    for (offset = next_offset(devinfo, store, start_offset);
2572         offset < p->next_insn_offset;
2573         offset = next_offset(devinfo, store, offset)) {
2574       elk_inst *insn = store + offset;
2575 
2576       if (elk_inst_opcode(p->isa, insn) == ELK_OPCODE_WHILE) {
2577 	 if (while_jumps_before_offset(devinfo, insn, offset, start_offset))
2578 	    return offset;
2579       }
2580    }
2581    assert(!"not reached");
2582    return start_offset;
2583 }
2584 
2585 /* After program generation, go back and update the UIP and JIP of
2586  * BREAK, CONT, and HALT instructions to their correct locations.
2587  */
2588 void
2589 elk_set_uip_jip(struct elk_codegen *p, int start_offset)
2590 {
2591    const struct intel_device_info *devinfo = p->devinfo;
2592    int offset;
2593    int br = elk_jump_scale(devinfo);
2594    int scale = 16 / br;
2595    void *store = p->store;
2596 
2597    if (devinfo->ver < 6)
2598       return;
2599 
2600    for (offset = start_offset; offset < p->next_insn_offset; offset += 16) {
2601       elk_inst *insn = store + offset;
2602       assert(elk_inst_cmpt_control(devinfo, insn) == 0);
2603 
2604       switch (elk_inst_opcode(p->isa, insn)) {
2605       case ELK_OPCODE_BREAK: {
2606          int block_end_offset = elk_find_next_block_end(p, offset);
2607          assert(block_end_offset != 0);
2608          elk_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
2609 	 /* Gfx7 UIP points to WHILE; Gfx6 points just after it */
2610          elk_inst_set_uip(devinfo, insn,
2611 	    (elk_find_loop_end(p, offset) - offset +
2612              (devinfo->ver == 6 ? 16 : 0)) / scale);
2613 	 break;
2614       }
2615 
2616       case ELK_OPCODE_CONTINUE: {
2617          int block_end_offset = elk_find_next_block_end(p, offset);
2618          assert(block_end_offset != 0);
2619          elk_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
2620          elk_inst_set_uip(devinfo, insn,
2621             (elk_find_loop_end(p, offset) - offset) / scale);
2622 
2623          assert(elk_inst_uip(devinfo, insn) != 0);
2624          assert(elk_inst_jip(devinfo, insn) != 0);
2625 	 break;
2626       }
2627 
2628       case ELK_OPCODE_ENDIF: {
2629          int block_end_offset = elk_find_next_block_end(p, offset);
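         /* An ENDIF with no enclosing block end just falls through to the
          * next instruction, i.e. a jump of one instruction.
          */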
2630          int32_t jump = (block_end_offset == 0) ?
2631                         1 * br : (block_end_offset - offset) / scale;
2632          if (devinfo->ver >= 7)
2633             elk_inst_set_jip(devinfo, insn, jump);
2634          else
2635             elk_inst_set_gfx6_jump_count(devinfo, insn, jump);
2636 	 break;
2637       }
2638 
2639       case ELK_OPCODE_HALT: {
2640 	 /* From the Sandy Bridge PRM (volume 4, part 2, section 8.3.19):
2641 	  *
2642 	  *    "In case of the halt instruction not inside any conditional
2643 	  *     code block, the value of <JIP> and <UIP> should be the
2644 	  *     same. In case of the halt instruction inside conditional code
2645 	  *     block, the <UIP> should be the end of the program, and the
2646 	  *     <JIP> should be end of the most inner conditional code block."
2647 	  *
2648 	  * The uip will have already been set by whoever set up the
2649 	  * instruction.
2650 	  */
2651          int block_end_offset = elk_find_next_block_end(p, offset);
2652 	 if (block_end_offset == 0) {
2653             elk_inst_set_jip(devinfo, insn, elk_inst_uip(devinfo, insn));
2654 	 } else {
2655             elk_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
2656 	 }
2657          assert(elk_inst_uip(devinfo, insn) != 0);
2658          assert(elk_inst_jip(devinfo, insn) != 0);
2659 	 break;
2660       }
2661 
2662       default:
2663          break;
2664       }
2665    }
2666 }
2667 
2668 void elk_ff_sync(struct elk_codegen *p,
2669 		   struct elk_reg dest,
2670 		   unsigned msg_reg_nr,
2671 		   struct elk_reg src0,
2672 		   bool allocate,
2673 		   unsigned response_length,
2674 		   bool eot)
2675 {
2676    const struct intel_device_info *devinfo = p->devinfo;
2677    elk_inst *insn;
2678 
2679    elk_gfx6_resolve_implied_move(p, &src0, msg_reg_nr);
2680 
2681    insn = next_insn(p, ELK_OPCODE_SEND);
2682    elk_set_dest(p, insn, dest);
2683    elk_set_src0(p, insn, src0);
2684    elk_set_src1(p, insn, elk_imm_d(0));
2685 
2686    if (devinfo->ver < 6)
2687       elk_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
2688 
2689    elk_set_ff_sync_message(p,
2690 			   insn,
2691 			   allocate,
2692 			   response_length,
2693 			   eot);
2694 }
2695 
2696 /**
2697  * Emit the SEND instruction necessary to generate stream output data on Gfx6
2698  * (for transform feedback).
2699  *
2700  * If send_commit_msg is true, this is the last piece of stream output data
2701  * from this thread, so send the data as a committed write.  According to the
2702  * Sandy Bridge PRM (volume 2 part 1, section 4.5.1):
2703  *
2704  *   "Prior to End of Thread with a URB_WRITE, the kernel must ensure all
2705  *   writes are complete by sending the final write as a committed write."
2706  */
2707 void
2708 elk_svb_write(struct elk_codegen *p,
2709               struct elk_reg dest,
2710               unsigned msg_reg_nr,
2711               struct elk_reg src0,
2712               unsigned binding_table_index,
2713               bool   send_commit_msg)
2714 {
2715    const struct intel_device_info *devinfo = p->devinfo;
2716    assert(devinfo->ver == 6);
2717    const unsigned target_cache = GFX6_SFID_DATAPORT_RENDER_CACHE;
2718    elk_inst *insn;
2719 
2720    elk_gfx6_resolve_implied_move(p, &src0, msg_reg_nr);
2721 
2722    insn = next_insn(p, ELK_OPCODE_SEND);
2723    elk_inst_set_sfid(devinfo, insn, target_cache);
2724    elk_set_dest(p, insn, dest);
2725    elk_set_src0(p, insn, src0);
2726    elk_set_desc(p, insn,
2727                 elk_message_desc(devinfo, 1, send_commit_msg, true) |
2728                 elk_dp_write_desc(devinfo, binding_table_index,
2729                                   0, /* msg_control: ignored */
2730                                   GFX6_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE,
2731                                   send_commit_msg)); /* send_commit_msg */
2732 }
2733 
2734 static unsigned
2735 elk_surface_payload_size(unsigned num_channels,
2736                          unsigned exec_size /**< 0 for SIMD4x2 */)
2737 {
2738    if (exec_size == 0)
2739       return 1; /* SIMD4x2 */
2740    else if (exec_size <= 8)
2741       return num_channels;
2742    else
2743       return 2 * num_channels;
2744 }
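
/* For illustration (derived directly from the helper above): an untyped
 * SIMD16 access of 4 channels needs 2 * 4 = 8 payload/response registers,
 * the same access at SIMD8 needs 4, and the SIMD4x2 case (exec_size == 0)
 * always packs its channels into a single register.
 */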
2745 
2746 void
2747 elk_untyped_atomic(struct elk_codegen *p,
2748                    struct elk_reg dst,
2749                    struct elk_reg payload,
2750                    struct elk_reg surface,
2751                    unsigned atomic_op,
2752                    unsigned msg_length,
2753                    bool response_expected,
2754                    bool header_present)
2755 {
2756    const struct intel_device_info *devinfo = p->devinfo;
2757    const unsigned sfid = (devinfo->verx10 >= 75 ?
2758                           HSW_SFID_DATAPORT_DATA_CACHE_1 :
2759                           GFX7_SFID_DATAPORT_DATA_CACHE);
2760    const bool align1 = elk_get_default_access_mode(p) == ELK_ALIGN_1;
2761    /* SIMD4x2 untyped atomic instructions only exist on HSW+ */
2762    const bool has_simd4x2 = devinfo->verx10 >= 75;
2763    const unsigned exec_size = align1 ? 1 << elk_get_default_exec_size(p) :
2764                               has_simd4x2 ? 0 : 8;
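   /* Note: elk_get_default_exec_size() returns the log2-style execution size
    * encoding (e.g. ELK_EXECUTE_8 is 3), so "1 << ..." recovers the actual
    * channel count, while exec_size == 0 is the SIMD4x2 special case handled
    * by elk_surface_payload_size().
    */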
2765    const unsigned response_length =
2766       elk_surface_payload_size(response_expected, exec_size);
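   /* An atomic returns at most a single channel (the old value), which is
    * why response_expected (0 or 1) doubles as the channel count here.
    */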
2767    const unsigned desc =
2768       elk_message_desc(devinfo, msg_length, response_length, header_present) |
2769       elk_dp_untyped_atomic_desc(devinfo, exec_size, atomic_op,
2770                                  response_expected);
2771    /* Mask out unused components -- This is especially important in Align16
2772     * mode on generations that don't have native support for SIMD4x2 atomics,
2773     * because unused but enabled components will cause the dataport to perform
2774     * additional atomic operations on the addresses that happen to be in the
2775     * uninitialized Y, Z and W coordinates of the payload.
2776     */
2777    const unsigned mask = align1 ? WRITEMASK_XYZW : WRITEMASK_X;
2778 
2779    elk_send_indirect_surface_message(p, sfid, elk_writemask(dst, mask),
2780                                      payload, surface, desc);
2781 }
2782 
2783 void
2784 elk_untyped_surface_read(struct elk_codegen *p,
2785                          struct elk_reg dst,
2786                          struct elk_reg payload,
2787                          struct elk_reg surface,
2788                          unsigned msg_length,
2789                          unsigned num_channels)
2790 {
2791    const struct intel_device_info *devinfo = p->devinfo;
2792    const unsigned sfid = (devinfo->verx10 >= 75 ?
2793                           HSW_SFID_DATAPORT_DATA_CACHE_1 :
2794                           GFX7_SFID_DATAPORT_DATA_CACHE);
2795    const bool align1 = elk_get_default_access_mode(p) == ELK_ALIGN_1;
2796    const unsigned exec_size = align1 ? 1 << elk_get_default_exec_size(p) : 0;
2797    const unsigned response_length =
2798       elk_surface_payload_size(num_channels, exec_size);
2799    const unsigned desc =
2800       elk_message_desc(devinfo, msg_length, response_length, false) |
2801       elk_dp_untyped_surface_rw_desc(devinfo, exec_size, num_channels, false);
2802 
2803    elk_send_indirect_surface_message(p, sfid, dst, payload, surface, desc);
2804 }
2805 
2806 void
2807 elk_untyped_surface_write(struct elk_codegen *p,
2808                           struct elk_reg payload,
2809                           struct elk_reg surface,
2810                           unsigned msg_length,
2811                           unsigned num_channels,
2812                           bool header_present)
2813 {
2814    const struct intel_device_info *devinfo = p->devinfo;
2815    const unsigned sfid = (devinfo->verx10 >= 75 ?
2816                           HSW_SFID_DATAPORT_DATA_CACHE_1 :
2817                           GFX7_SFID_DATAPORT_DATA_CACHE);
2818    const bool align1 = elk_get_default_access_mode(p) == ELK_ALIGN_1;
2819    /* SIMD4x2 untyped surface write instructions only exist on HSW+ */
2820    const bool has_simd4x2 = devinfo->verx10 >= 75;
2821    const unsigned exec_size = align1 ? 1 << elk_get_default_exec_size(p) :
2822                               has_simd4x2 ? 0 : 8;
2823    const unsigned desc =
2824       elk_message_desc(devinfo, msg_length, 0, header_present) |
2825       elk_dp_untyped_surface_rw_desc(devinfo, exec_size, num_channels, true);
2826    /* Mask out unused components -- See comment in elk_untyped_atomic(). */
2827    const unsigned mask = !has_simd4x2 && !align1 ? WRITEMASK_X : WRITEMASK_XYZW;
2828 
2829    elk_send_indirect_surface_message(p, sfid, elk_writemask(elk_null_reg(), mask),
2830                                      payload, surface, desc);
2831 }
2832 
2833 static void
2834 elk_set_memory_fence_message(struct elk_codegen *p,
2835                              struct elk_inst *insn,
2836                              enum elk_message_target sfid,
2837                              bool commit_enable,
2838                              unsigned bti)
2839 {
2840    const struct intel_device_info *devinfo = p->devinfo;
2841 
2842    elk_set_desc(p, insn, elk_message_desc(
2843                    devinfo, 1, (commit_enable ? 1 : 0), true));
2844 
2845    elk_inst_set_sfid(devinfo, insn, sfid);
2846 
2847    switch (sfid) {
2848    case GFX6_SFID_DATAPORT_RENDER_CACHE:
2849       elk_inst_set_dp_msg_type(devinfo, insn, GFX7_DATAPORT_RC_MEMORY_FENCE);
2850       break;
2851    case GFX7_SFID_DATAPORT_DATA_CACHE:
2852       elk_inst_set_dp_msg_type(devinfo, insn, GFX7_DATAPORT_DC_MEMORY_FENCE);
2853       break;
2854    default:
2855       unreachable("Not reached");
2856    }
2857 
2858    if (commit_enable)
2859       elk_inst_set_dp_msg_control(devinfo, insn, 1 << 5);
2860 
2861    assert(bti == 0);
2862    elk_inst_set_binding_table_index(devinfo, insn, bti);
2863 }
2864 
2865 void
2866 elk_memory_fence(struct elk_codegen *p,
2867                  struct elk_reg dst,
2868                  struct elk_reg src,
2869                  enum elk_opcode send_op,
2870                  enum elk_message_target sfid,
2871                  uint32_t desc,
2872                  bool commit_enable,
2873                  unsigned bti)
2874 {
2875    const struct intel_device_info *devinfo = p->devinfo;
2876 
2877    dst = retype(vec1(dst), ELK_REGISTER_TYPE_UW);
2878    src = retype(vec1(src), ELK_REGISTER_TYPE_UD);
2879 
2880    /* Set dst as destination for dependency tracking, the MEMORY_FENCE
2881     * message doesn't write anything back.
2882     */
2883    struct elk_inst *insn = next_insn(p, send_op);
2884    elk_inst_set_mask_control(devinfo, insn, ELK_MASK_DISABLE);
2885    elk_inst_set_exec_size(devinfo, insn, ELK_EXECUTE_1);
2886    elk_set_dest(p, insn, dst);
2887    elk_set_src0(p, insn, src);
2888 
2889    elk_set_memory_fence_message(p, insn, sfid, commit_enable, bti);
2890 }
2891 
2892 void
2893 elk_find_live_channel(struct elk_codegen *p, struct elk_reg dst, bool last)
2894 {
2895    const struct intel_device_info *devinfo = p->devinfo;
2896    const unsigned exec_size = 1 << elk_get_default_exec_size(p);
2897    const unsigned qtr_control = elk_get_default_group(p) / 8;
2898    elk_inst *inst;
2899 
2900    assert(devinfo->ver == 7);
2901 
2902    elk_push_insn_state(p);
2903 
2904    /* The flag register is only used on Gfx7 in align1 mode, so avoid setting
2905     * unnecessary bits in the instruction words, get the information we need
2906     * and reset the default flag register. This allows more instructions to be
2907     * compacted.
2908     */
2909    const unsigned flag_subreg = p->current->flag_subreg;
2910    elk_set_default_flag_reg(p, 0, 0);
2911 
2912    if (elk_get_default_access_mode(p) == ELK_ALIGN_1) {
2913       elk_set_default_mask_control(p, ELK_MASK_DISABLE);
2914 
2915       const struct elk_reg flag = elk_flag_subreg(flag_subreg);
2916 
2917       elk_set_default_exec_size(p, ELK_EXECUTE_1);
2918       elk_MOV(p, retype(flag, ELK_REGISTER_TYPE_UD), elk_imm_ud(0));
2919 
2920       /* Run enough instructions returning zero with execution masking and
2921        * a conditional modifier enabled in order to get the full execution
2922        * mask in f1.0.  We could use a single 32-wide move here if it
2923        * weren't for the hardware bug that causes channel enables to
2924        * be applied incorrectly to the second half of 32-wide instructions
2925        * on Gfx7.
2926        */
2927       const unsigned lower_size = MIN2(16, exec_size);
2928       for (unsigned i = 0; i < exec_size / lower_size; i++) {
2929          inst = elk_MOV(p, retype(elk_null_reg(), ELK_REGISTER_TYPE_UW),
2930                         elk_imm_uw(0));
2931          elk_inst_set_mask_control(devinfo, inst, ELK_MASK_ENABLE);
2932          elk_inst_set_group(devinfo, inst, lower_size * i + 8 * qtr_control);
2933          elk_inst_set_cond_modifier(devinfo, inst, ELK_CONDITIONAL_Z);
2934          elk_inst_set_exec_size(devinfo, inst, cvt(lower_size) - 1);
2935          elk_inst_set_flag_reg_nr(devinfo, inst, flag_subreg / 2);
2936          elk_inst_set_flag_subreg_nr(devinfo, inst, flag_subreg % 2);
2937       }
2938 
2939       /* Find the first bit set in the exec_size-wide portion of the flag
2940        * register that was updated by the last sequence of MOV
2941        * instructions.
2942        */
2943       const enum elk_reg_type type = elk_int_type(exec_size / 8, false);
2944       elk_set_default_exec_size(p, ELK_EXECUTE_1);
2945       if (!last) {
2946          inst = elk_FBL(p, vec1(dst), byte_offset(retype(flag, type), qtr_control));
2947       } else {
2948          inst = elk_LZD(p, vec1(dst), byte_offset(retype(flag, type), qtr_control));
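         /* LZD yields the number of leading zeros, so the index of the last
          * (highest) live channel is 31 - LZD(flag); the negated source plus
          * the ADD of 31 below compute exactly that.
          */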
2949          struct elk_reg neg = vec1(dst);
2950          neg.negate = true;
2951          inst = elk_ADD(p, vec1(dst), neg, elk_imm_uw(31));
2952       }
2953    } else {
2954       elk_set_default_mask_control(p, ELK_MASK_DISABLE);
2955 
2956       /* Overwrite the destination first without and then with execution
2957        * masking to find out which of the channels is active.
2958        */
2959       elk_push_insn_state(p);
2960       elk_set_default_exec_size(p, ELK_EXECUTE_4);
2961       elk_MOV(p, elk_writemask(vec4(dst), WRITEMASK_X),
2962               elk_imm_ud(1));
2963 
2964       inst = elk_MOV(p, elk_writemask(vec4(dst), WRITEMASK_X),
2965                      elk_imm_ud(0));
2966       elk_pop_insn_state(p);
2967       elk_inst_set_mask_control(devinfo, inst, ELK_MASK_ENABLE);
2968    }
2969 
2970    elk_pop_insn_state(p);
2971 }
2972 
2973 void
2974 elk_broadcast(struct elk_codegen *p,
2975               struct elk_reg dst,
2976               struct elk_reg src,
2977               struct elk_reg idx)
2978 {
2979    const struct intel_device_info *devinfo = p->devinfo;
2980    const bool align1 = elk_get_default_access_mode(p) == ELK_ALIGN_1;
2981    elk_inst *inst;
2982 
2983    elk_push_insn_state(p);
2984    elk_set_default_mask_control(p, ELK_MASK_DISABLE);
2985    elk_set_default_exec_size(p, align1 ? ELK_EXECUTE_1 : ELK_EXECUTE_4);
2986 
2987    assert(src.file == ELK_GENERAL_REGISTER_FILE &&
2988           src.address_mode == ELK_ADDRESS_DIRECT);
2989    assert(!src.abs && !src.negate);
2990 
2991    /* Gen12.5 adds the following region restriction:
2992     *
2993     *    "Vx1 and VxH indirect addressing for Float, Half-Float, Double-Float
2994     *    and Quad-Word data must not be used."
2995     *
2996     * We require the source and destination types to match, so we stomp
2997     * both to an unsigned integer type.
2998     */
2999    assert(src.type == dst.type);
3000    src.type = dst.type = elk_reg_type_from_bit_size(type_sz(src.type) * 8,
3001                                                     ELK_REGISTER_TYPE_UD);
3002 
3003    if ((src.vstride == 0 && (src.hstride == 0 || !align1)) ||
3004        idx.file == ELK_IMMEDIATE_VALUE) {
3005       /* Trivial, the source is already uniform or the index is a constant.
3006        * We will typically not get here if the optimizer is doing its job, but
3007        * asserting would be mean.
3008        */
3009       const unsigned i = idx.file == ELK_IMMEDIATE_VALUE ? idx.ud : 0;
3010       src = align1 ? stride(suboffset(src, i), 0, 1, 0) :
3011                      stride(suboffset(src, 4 * i), 0, 4, 1);
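      /* In Align1 this builds a <0;1,0> scalar region at element i, which
       * replicates that single element to every channel; in Align16 it is
       * the analogous <0;4,1> region at vec4 slot i.
       */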
3012 
3013       if (type_sz(src.type) > 4 && !devinfo->has_64bit_int) {
3014          elk_MOV(p, subscript(dst, ELK_REGISTER_TYPE_D, 0),
3015                     subscript(src, ELK_REGISTER_TYPE_D, 0));
3016          elk_MOV(p, subscript(dst, ELK_REGISTER_TYPE_D, 1),
3017                     subscript(src, ELK_REGISTER_TYPE_D, 1));
3018       } else {
3019          elk_MOV(p, dst, src);
3020       }
3021    } else {
3022       /* From the Haswell PRM section "Register Region Restrictions":
3023        *
3024        *    "The lower bits of the AddressImmediate must not overflow to
3025        *    change the register address.  The lower 5 bits of Address
3026        *    Immediate when added to lower 5 bits of address register gives
3027        *    the sub-register offset. The upper bits of Address Immediate
3028        *    when added to upper bits of address register gives the register
3029        *    address. Any overflow from sub-register offset is dropped."
3030        *
3031        * Fortunately, for broadcast, we never have a sub-register offset so
3032        * this isn't an issue.
3033        */
3034       assert(src.subnr == 0);
3035 
3036       if (align1) {
3037          const struct elk_reg addr =
3038             retype(elk_address_reg(0), ELK_REGISTER_TYPE_UD);
3039          unsigned offset = src.nr * REG_SIZE + src.subnr;
3040          /* Limit in bytes of the signed indirect addressing immediate. */
3041          const unsigned limit = 512;
3042 
3043          elk_push_insn_state(p);
3044          elk_set_default_mask_control(p, ELK_MASK_DISABLE);
3045          elk_set_default_predicate_control(p, ELK_PREDICATE_NONE);
3046          elk_set_default_flag_reg(p, 0, 0);
3047 
3048          /* Take into account the component size and horizontal stride. */
3049          assert(src.vstride == src.hstride + src.width);
3050          elk_SHL(p, addr, vec1(idx),
3051                  elk_imm_ud(util_logbase2(type_sz(src.type)) +
3052                             src.hstride - 1));
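         /* Worked example (assuming a 4-byte type with a unit horizontal
          * stride, i.e. hstride encoding 1): the shift is
          * log2(4) + 1 - 1 = 2, so addr ends up holding idx * 4, the byte
          * offset of channel idx within the source region.
          */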
3053 
3054          /* We can only address up to limit bytes using the indirect
3055           * addressing immediate, account for the difference if the source
3056           * register is above this limit.
3057           */
3058          if (offset >= limit) {
3059             elk_ADD(p, addr, addr, elk_imm_ud(offset - offset % limit));
3060             offset = offset % limit;
3061          }
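         /* For instance, with REG_SIZE == 32 and the source in r70, offset
          * starts at 2240; the ADD folds the 512-aligned part (2048) into
          * the address register and the remaining 192 bytes are left for
          * the indirect addressing immediate used below.
          */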
3062 
3063          elk_pop_insn_state(p);
3064 
3065          /* Use indirect addressing to fetch the specified component. */
3066          if (type_sz(src.type) > 4 &&
3067              (devinfo->platform == INTEL_PLATFORM_CHV || !devinfo->has_64bit_int)) {
3068             /* From the Cherryview PRM Vol 7. "Register Region Restrictions":
3069              *
3070              *    "When source or destination datatype is 64b or operation is
3071              *    integer DWord multiply, indirect addressing must not be
3072              *    used."
3073              *
3074             * To work around both of these issues, we do two integer MOVs
3075             * instead of one 64-bit MOV.  Because no double value should ever
3076              * cross a register boundary, it's safe to use the immediate
3077              * offset in the indirect here to handle adding 4 bytes to the
3078              * offset and avoid the extra ADD to the register file.
3079              */
3080             elk_MOV(p, subscript(dst, ELK_REGISTER_TYPE_D, 0),
3081                        retype(elk_vec1_indirect(addr.subnr, offset),
3082                               ELK_REGISTER_TYPE_D));
3083             elk_MOV(p, subscript(dst, ELK_REGISTER_TYPE_D, 1),
3084                        retype(elk_vec1_indirect(addr.subnr, offset + 4),
3085                               ELK_REGISTER_TYPE_D));
3086          } else {
3087             elk_MOV(p, dst,
3088                     retype(elk_vec1_indirect(addr.subnr, offset), src.type));
3089          }
3090       } else {
3091          /* In SIMD4x2 mode the index can be either zero or one, replicate it
3092          /* In SIMD4x2 mode the index can be either zero or one; replicate it
3093           */
3094          inst = elk_MOV(p,
3095                         elk_null_reg(),
3096                         stride(elk_swizzle(idx, ELK_SWIZZLE_XXXX), 4, 4, 1));
3097          elk_inst_set_pred_control(devinfo, inst, ELK_PREDICATE_NONE);
3098          elk_inst_set_cond_modifier(devinfo, inst, ELK_CONDITIONAL_NZ);
3099          elk_inst_set_flag_reg_nr(devinfo, inst, 1);
3100 
3101          /* and use predicated SEL to pick the right channel. */
3102          inst = elk_SEL(p, dst,
3103                         stride(suboffset(src, 4), 4, 4, 1),
3104                         stride(src, 4, 4, 1));
3105          elk_inst_set_pred_control(devinfo, inst, ELK_PREDICATE_NORMAL);
3106          elk_inst_set_flag_reg_nr(devinfo, inst, 1);
3107       }
3108    }
3109 
3110    elk_pop_insn_state(p);
3111 }
3112 
3113 
3114 /**
3115  * Emit the SEND message for a barrier
3116  */
3117 void
3118 elk_barrier(struct elk_codegen *p, struct elk_reg src)
3119 {
3120    const struct intel_device_info *devinfo = p->devinfo;
3121    struct elk_inst *inst;
3122 
3123    assert(devinfo->ver >= 7);
3124 
3125    elk_push_insn_state(p);
3126    elk_set_default_access_mode(p, ELK_ALIGN_1);
3127    inst = next_insn(p, ELK_OPCODE_SEND);
3128    elk_set_dest(p, inst, retype(elk_null_reg(), ELK_REGISTER_TYPE_UW));
3129    elk_set_src0(p, inst, src);
3130    elk_set_src1(p, inst, elk_null_reg());
3131    elk_set_desc(p, inst, elk_message_desc(devinfo,
3132                                           1 * reg_unit(devinfo), 0, false));
3133 
3134    elk_inst_set_sfid(devinfo, inst, ELK_SFID_MESSAGE_GATEWAY);
3135    elk_inst_set_gateway_subfuncid(devinfo, inst,
3136                                   ELK_MESSAGE_GATEWAY_SFID_BARRIER_MSG);
3137 
3138    elk_inst_set_mask_control(devinfo, inst, ELK_MASK_DISABLE);
3139    elk_pop_insn_state(p);
3140 }
3141 
3142 
3143 /**
3144  * Emit the wait instruction for a barrier
3145  */
3146 void
3147 elk_WAIT(struct elk_codegen *p)
3148 {
3149    const struct intel_device_info *devinfo = p->devinfo;
3150    struct elk_inst *insn;
3151 
3152    struct elk_reg src = elk_notification_reg();
3153 
3154    insn = next_insn(p, ELK_OPCODE_WAIT);
3155    elk_set_dest(p, insn, src);
3156    elk_set_src0(p, insn, src);
3157    elk_set_src1(p, insn, elk_null_reg());
3158 
3159    elk_inst_set_exec_size(devinfo, insn, ELK_EXECUTE_1);
3160    elk_inst_set_mask_control(devinfo, insn, ELK_MASK_DISABLE);
3161 }
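
/* A workgroup barrier is typically built from the two helpers above: a
 * thread first signals the message gateway with elk_barrier() and then
 * stalls on its notification register with elk_WAIT() until the whole
 * group has signalled.  Sketch of the expected usage (barrier_payload is a
 * placeholder for the caller's message header register):
 *
 *    elk_barrier(p, barrier_payload);
 *    elk_WAIT(p);
 */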
3162 
3163 void
3164 elk_float_controls_mode(struct elk_codegen *p,
3165                         unsigned mode, unsigned mask)
3166 {
3167    assert(p->current->mask_control == ELK_MASK_DISABLE);
3168 
3169    /* From the Skylake PRM, Volume 7, page 760:
3170     *  "Implementation Restriction on Register Access: When the control
3171     *   register is used as an explicit source and/or destination, hardware
3172     *   does not ensure execution pipeline coherency. Software must set the
3173     *   thread control field to ‘switch’ for an instruction that uses
3174     *   control register as an explicit operand."
3175     */
3176    elk_inst *inst = elk_AND(p, elk_cr0_reg(0), elk_cr0_reg(0),
3177                             elk_imm_ud(~mask));
3178    elk_inst_set_exec_size(p->devinfo, inst, ELK_EXECUTE_1);
3179    elk_inst_set_thread_control(p->devinfo, inst, ELK_THREAD_SWITCH);
3180 
3181    if (mode) {
3182       elk_inst *inst_or = elk_OR(p, elk_cr0_reg(0), elk_cr0_reg(0),
3183                                  elk_imm_ud(mode));
3184       elk_inst_set_exec_size(p->devinfo, inst_or, ELK_EXECUTE_1);
3185       elk_inst_set_thread_control(p->devinfo, inst_or, ELK_THREAD_SWITCH);
3186    }
3187 }
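
/* Hypothetical usage sketch (the two argument names below are placeholders,
 * not constants defined in this file): to change one field of cr0, a caller
 * passes the field's full bit mask and the desired encoding, e.g.
 *
 *    elk_float_controls_mode(p, new_mode_bits, mode_field_mask);
 *
 * The AND above always clears the masked bits, and the OR is skipped when
 * the new value happens to be zero.
 */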
3188 
3189 void
3190 elk_update_reloc_imm(const struct elk_isa_info *isa,
3191                      elk_inst *inst,
3192                      uint32_t value)
3193 {
3194    const struct intel_device_info *devinfo = isa->devinfo;
3195 
3196    /* Sanity check that the instruction is a MOV of an immediate */
3197    assert(elk_inst_opcode(isa, inst) == ELK_OPCODE_MOV);
3198    assert(elk_inst_src0_reg_file(devinfo, inst) == ELK_IMMEDIATE_VALUE);
3199 
3200    /* If it was compacted, we can't safely rewrite */
3201    assert(elk_inst_cmpt_control(devinfo, inst) == 0);
3202 
3203    elk_inst_set_imm_ud(devinfo, inst, value);
3204 }
3205 
3206 /* A default value for constants that will be patched at run-time.
3207  * We pick an arbitrary value that prevents instruction compaction.
3208  */
3209 #define DEFAULT_PATCH_IMM 0x4a7cc037
3210 
3211 void
3212 elk_MOV_reloc_imm(struct elk_codegen *p,
3213                   struct elk_reg dst,
3214                   enum elk_reg_type src_type,
3215                   uint32_t id)
3216 {
3217    assert(type_sz(src_type) == 4);
3218    assert(type_sz(dst.type) == 4);
3219 
3220    elk_add_reloc(p, id, ELK_SHADER_RELOC_TYPE_MOV_IMM,
3221                  p->next_insn_offset, 0);
3222 
3223    elk_MOV(p, dst, retype(elk_imm_ud(DEFAULT_PATCH_IMM), src_type));
3224 }
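
/* Sketch of how the two relocation helpers pair up (variable names are
 * placeholders): the compiler emits
 *
 *    elk_MOV_reloc_imm(p, dst, ELK_REGISTER_TYPE_UD, reloc_id);
 *
 * which records an ELK_SHADER_RELOC_TYPE_MOV_IMM entry at the MOV's offset,
 * and later, presumably when the final value is known, the caller patches
 * the placeholder DEFAULT_PATCH_IMM with
 *
 *    elk_update_reloc_imm(isa, inst_at_offset, actual_value);
 */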
3225