1 /*
2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28 * Authors:
29 * Keith Whitwell <[email protected]>
30 */
31
32
33 #include "elk_eu_defines.h"
34 #include "elk_eu.h"
35
36 #include "util/ralloc.h"
37
38 /**
39 * Prior to Sandybridge, the SEND instruction accepted non-MRF source
40 * registers, implicitly moving the operand to a message register.
41 *
42 * On Sandybridge, this is no longer the case. This function performs the
43 * explicit move; it should be called before emitting a SEND instruction.
44 */
45 void
46 elk_gfx6_resolve_implied_move(struct elk_codegen *p,
47 struct elk_reg *src,
48 unsigned msg_reg_nr)
49 {
50 const struct intel_device_info *devinfo = p->devinfo;
51 if (devinfo->ver < 6)
52 return;
53
54 if (src->file == ELK_MESSAGE_REGISTER_FILE)
55 return;
56
57 if (src->file != ELK_ARCHITECTURE_REGISTER_FILE || src->nr != ELK_ARF_NULL) {
58 elk_push_insn_state(p);
59 elk_set_default_exec_size(p, ELK_EXECUTE_8);
60 elk_set_default_mask_control(p, ELK_MASK_DISABLE);
61 elk_set_default_compression_control(p, ELK_COMPRESSION_NONE);
62 elk_MOV(p, retype(elk_message_reg(msg_reg_nr), ELK_REGISTER_TYPE_UD),
63 retype(*src, ELK_REGISTER_TYPE_UD));
64 elk_pop_insn_state(p);
65 }
66 *src = elk_message_reg(msg_reg_nr);
67 }
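/* A minimal usage sketch (not from this file; "payload" is a hypothetical
 * name): before emitting a SEND on Gfx6+, resolve the implied move so the
 * message payload ends up in the expected message register.
 *
 *    struct elk_reg payload = ...;                  // GRF holding the message
 *    elk_gfx6_resolve_implied_move(p, &payload, 1); // copies payload to m1
 *    // payload now refers to m1; emit the SEND with it as src0.
 */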
68
69 static void
70 gfx7_convert_mrf_to_grf(struct elk_codegen *p, struct elk_reg *reg)
71 {
72 /* From the Ivybridge PRM, Volume 4 Part 3, page 218 ("send"):
73 * "The send with EOT should use register space R112-R127 for <src>. This is
74 * to enable loading of a new thread into the same slot while the message
75 * with EOT for current thread is pending dispatch."
76 *
77 * Since we're pretending to have 16 MRFs anyway, we may as well use the
78 * registers required for messages with EOT.
79 */
80 const struct intel_device_info *devinfo = p->devinfo;
81 if (devinfo->ver >= 7 && reg->file == ELK_MESSAGE_REGISTER_FILE) {
82 reg->file = ELK_GENERAL_REGISTER_FILE;
83 reg->nr += GFX7_MRF_HACK_START;
84 }
85 }
86
87 void
88 elk_set_dest(struct elk_codegen *p, elk_inst *inst, struct elk_reg dest)
89 {
90 const struct intel_device_info *devinfo = p->devinfo;
91
92 if (dest.file == ELK_MESSAGE_REGISTER_FILE)
93 assert((dest.nr & ~ELK_MRF_COMPR4) < ELK_MAX_MRF(devinfo->ver));
94 else if (dest.file == ELK_GENERAL_REGISTER_FILE)
95 assert(dest.nr < XE2_MAX_GRF);
96
97 /* The hardware has a restriction where a destination of size Byte with
98 * a stride of 1 is only allowed for a packed byte MOV. For any other
99 * instruction, the stride must be at least 2, even when the destination
100 * is the NULL register.
101 */
102 if (dest.file == ELK_ARCHITECTURE_REGISTER_FILE &&
103 dest.nr == ELK_ARF_NULL &&
104 type_sz(dest.type) == 1 &&
105 dest.hstride == ELK_HORIZONTAL_STRIDE_1) {
106 dest.hstride = ELK_HORIZONTAL_STRIDE_2;
107 }
108
109 gfx7_convert_mrf_to_grf(p, &dest);
110
111 elk_inst_set_dst_file_type(devinfo, inst, dest.file, dest.type);
112 elk_inst_set_dst_address_mode(devinfo, inst, dest.address_mode);
113
114 if (dest.address_mode == ELK_ADDRESS_DIRECT) {
115 elk_inst_set_dst_da_reg_nr(devinfo, inst, phys_nr(devinfo, dest));
116
117 if (elk_inst_access_mode(devinfo, inst) == ELK_ALIGN_1) {
118 elk_inst_set_dst_da1_subreg_nr(devinfo, inst, phys_subnr(devinfo, dest));
119 if (dest.hstride == ELK_HORIZONTAL_STRIDE_0)
120 dest.hstride = ELK_HORIZONTAL_STRIDE_1;
121 elk_inst_set_dst_hstride(devinfo, inst, dest.hstride);
122 } else {
123 elk_inst_set_dst_da16_subreg_nr(devinfo, inst, dest.subnr / 16);
124 elk_inst_set_da16_writemask(devinfo, inst, dest.writemask);
125 if (dest.file == ELK_GENERAL_REGISTER_FILE ||
126 dest.file == ELK_MESSAGE_REGISTER_FILE) {
127 assert(dest.writemask != 0);
128 }
129 /* From the Ivybridge PRM, Vol 4, Part 3, Section 5.2.4.1:
130 * Although Dst.HorzStride is a don't care for Align16, HW needs
131 * this to be programmed as "01".
132 */
133 elk_inst_set_dst_hstride(devinfo, inst, 1);
134 }
135 } else {
136 elk_inst_set_dst_ia_subreg_nr(devinfo, inst, phys_subnr(devinfo, dest));
137
138 /* These are different sizes in align1 vs align16:
139 */
140 if (elk_inst_access_mode(devinfo, inst) == ELK_ALIGN_1) {
141 elk_inst_set_dst_ia1_addr_imm(devinfo, inst,
142 dest.indirect_offset);
143 if (dest.hstride == ELK_HORIZONTAL_STRIDE_0)
144 dest.hstride = ELK_HORIZONTAL_STRIDE_1;
145 elk_inst_set_dst_hstride(devinfo, inst, dest.hstride);
146 } else {
147 elk_inst_set_dst_ia16_addr_imm(devinfo, inst,
148 dest.indirect_offset);
149 /* Even though this is ignored in DA16 mode, it still needs to be set to '01'. */
150 elk_inst_set_dst_hstride(devinfo, inst, 1);
151 }
152 }
153
154 /* Generators should set a default exec_size of either 8 (SIMD4x2 or SIMD8)
155 * or 16 (SIMD16), as that's normally correct. However, when dealing with
156 * small registers, it can be useful for us to automatically reduce it to
157 * match the register size.
158 */
159 if (p->automatic_exec_sizes) {
160 /*
161 * On platforms that support fp64, we can emit instructions with a width
162 * of 4 that need two SIMD8 registers and an exec_size of 8 or 16. In
163 * these cases we need to make sure that these instructions have their
164 * exec sizes set properly when they are emitted and we can't rely on
165 * this code to fix it.
166 */
167 bool fix_exec_size;
168 if (devinfo->ver >= 6)
169 fix_exec_size = dest.width < ELK_EXECUTE_4;
170 else
171 fix_exec_size = dest.width < ELK_EXECUTE_8;
172
173 if (fix_exec_size)
174 elk_inst_set_exec_size(devinfo, inst, dest.width);
175 }
176 }
177
178 void
179 elk_set_src0(struct elk_codegen *p, elk_inst *inst, struct elk_reg reg)
180 {
181 const struct intel_device_info *devinfo = p->devinfo;
182
183 if (reg.file == ELK_MESSAGE_REGISTER_FILE)
184 assert((reg.nr & ~ELK_MRF_COMPR4) < ELK_MAX_MRF(devinfo->ver));
185 else if (reg.file == ELK_GENERAL_REGISTER_FILE)
186 assert(reg.nr < XE2_MAX_GRF);
187
188 gfx7_convert_mrf_to_grf(p, &reg);
189
190 if (devinfo->ver >= 6 &&
191 (elk_inst_opcode(p->isa, inst) == ELK_OPCODE_SEND ||
192 elk_inst_opcode(p->isa, inst) == ELK_OPCODE_SENDC)) {
193 /* Any source modifiers or regions will be ignored, since this just
194 * identifies the MRF/GRF to start reading the message contents from.
195 * Check for some likely failures.
196 */
197 assert(!reg.negate);
198 assert(!reg.abs);
199 assert(reg.address_mode == ELK_ADDRESS_DIRECT);
200 }
201
202 elk_inst_set_src0_file_type(devinfo, inst, reg.file, reg.type);
203 elk_inst_set_src0_abs(devinfo, inst, reg.abs);
204 elk_inst_set_src0_negate(devinfo, inst, reg.negate);
205 elk_inst_set_src0_address_mode(devinfo, inst, reg.address_mode);
206
207 if (reg.file == ELK_IMMEDIATE_VALUE) {
208 if (reg.type == ELK_REGISTER_TYPE_DF ||
209 elk_inst_opcode(p->isa, inst) == ELK_OPCODE_DIM)
210 elk_inst_set_imm_df(devinfo, inst, reg.df);
211 else if (reg.type == ELK_REGISTER_TYPE_UQ ||
212 reg.type == ELK_REGISTER_TYPE_Q)
213 elk_inst_set_imm_uq(devinfo, inst, reg.u64);
214 else
215 elk_inst_set_imm_ud(devinfo, inst, reg.ud);
216
217 if (type_sz(reg.type) < 8) {
218 elk_inst_set_src1_reg_file(devinfo, inst,
219 ELK_ARCHITECTURE_REGISTER_FILE);
220 elk_inst_set_src1_reg_hw_type(devinfo, inst,
221 elk_inst_src0_reg_hw_type(devinfo, inst));
222 }
223 } else {
224 if (reg.address_mode == ELK_ADDRESS_DIRECT) {
225 elk_inst_set_src0_da_reg_nr(devinfo, inst, phys_nr(devinfo, reg));
226 if (elk_inst_access_mode(devinfo, inst) == ELK_ALIGN_1) {
227 elk_inst_set_src0_da1_subreg_nr(devinfo, inst, phys_subnr(devinfo, reg));
228 } else {
229 elk_inst_set_src0_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
230 }
231 } else {
232 elk_inst_set_src0_ia_subreg_nr(devinfo, inst, phys_subnr(devinfo, reg));
233
234 if (elk_inst_access_mode(devinfo, inst) == ELK_ALIGN_1) {
235 elk_inst_set_src0_ia1_addr_imm(devinfo, inst, reg.indirect_offset);
236 } else {
237 elk_inst_set_src0_ia16_addr_imm(devinfo, inst, reg.indirect_offset);
238 }
239 }
240
241 if (elk_inst_access_mode(devinfo, inst) == ELK_ALIGN_1) {
242 if (reg.width == ELK_WIDTH_1 &&
243 elk_inst_exec_size(devinfo, inst) == ELK_EXECUTE_1) {
244 elk_inst_set_src0_hstride(devinfo, inst, ELK_HORIZONTAL_STRIDE_0);
245 elk_inst_set_src0_width(devinfo, inst, ELK_WIDTH_1);
246 elk_inst_set_src0_vstride(devinfo, inst, ELK_VERTICAL_STRIDE_0);
247 } else {
248 elk_inst_set_src0_hstride(devinfo, inst, reg.hstride);
249 elk_inst_set_src0_width(devinfo, inst, reg.width);
250 elk_inst_set_src0_vstride(devinfo, inst, reg.vstride);
251 }
252 } else {
253 elk_inst_set_src0_da16_swiz_x(devinfo, inst,
254 ELK_GET_SWZ(reg.swizzle, ELK_CHANNEL_X));
255 elk_inst_set_src0_da16_swiz_y(devinfo, inst,
256 ELK_GET_SWZ(reg.swizzle, ELK_CHANNEL_Y));
257 elk_inst_set_src0_da16_swiz_z(devinfo, inst,
258 ELK_GET_SWZ(reg.swizzle, ELK_CHANNEL_Z));
259 elk_inst_set_src0_da16_swiz_w(devinfo, inst,
260 ELK_GET_SWZ(reg.swizzle, ELK_CHANNEL_W));
261
262 if (reg.vstride == ELK_VERTICAL_STRIDE_8) {
263 /* This is an oddity of the fact that we're using the same
264 * register descriptions in align_16 as in align_1:
265 */
266 elk_inst_set_src0_vstride(devinfo, inst, ELK_VERTICAL_STRIDE_4);
267 } else if (devinfo->verx10 == 70 &&
268 reg.type == ELK_REGISTER_TYPE_DF &&
269 reg.vstride == ELK_VERTICAL_STRIDE_2) {
270 /* From SNB PRM:
271 *
272 * "For Align16 access mode, only encodings of 0000 and 0011
273 * are allowed. Other codes are reserved."
274 *
275 * Presumably the DevSNB behavior applies to IVB as well.
276 */
277 elk_inst_set_src0_vstride(devinfo, inst, ELK_VERTICAL_STRIDE_4);
278 } else {
279 elk_inst_set_src0_vstride(devinfo, inst, reg.vstride);
280 }
281 }
282 }
283 }
284
285
286 void
287 elk_set_src1(struct elk_codegen *p, elk_inst *inst, struct elk_reg reg)
288 {
289 const struct intel_device_info *devinfo = p->devinfo;
290
291 if (reg.file == ELK_GENERAL_REGISTER_FILE)
292 assert(reg.nr < XE2_MAX_GRF);
293
294 {
295 /* From the IVB PRM Vol. 4, Pt. 3, Section 3.3.3.5:
296 *
297 * "Accumulator registers may be accessed explicitly as src0
298 * operands only."
299 */
300 assert(reg.file != ELK_ARCHITECTURE_REGISTER_FILE ||
301 reg.nr != ELK_ARF_ACCUMULATOR);
302
303 gfx7_convert_mrf_to_grf(p, &reg);
304 assert(reg.file != ELK_MESSAGE_REGISTER_FILE);
305
306 elk_inst_set_src1_file_type(devinfo, inst, reg.file, reg.type);
307 elk_inst_set_src1_abs(devinfo, inst, reg.abs);
308 elk_inst_set_src1_negate(devinfo, inst, reg.negate);
309
310 /* Only src1 can be an immediate in two-argument instructions.
311 */
312 assert(elk_inst_src0_reg_file(devinfo, inst) != ELK_IMMEDIATE_VALUE);
313
314 if (reg.file == ELK_IMMEDIATE_VALUE) {
315 /* two-argument instructions can only use 32-bit immediates */
316 assert(type_sz(reg.type) < 8);
317 elk_inst_set_imm_ud(devinfo, inst, reg.ud);
318 } else {
319 /* This is a hardware restriction, which may or may not be lifted
320 * in the future:
321 */
322 assert (reg.address_mode == ELK_ADDRESS_DIRECT);
323 /* assert (reg.file == ELK_GENERAL_REGISTER_FILE); */
324
325 elk_inst_set_src1_da_reg_nr(devinfo, inst, phys_nr(devinfo, reg));
326 if (elk_inst_access_mode(devinfo, inst) == ELK_ALIGN_1) {
327 elk_inst_set_src1_da1_subreg_nr(devinfo, inst, phys_subnr(devinfo, reg));
328 } else {
329 elk_inst_set_src1_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
330 }
331
332 if (elk_inst_access_mode(devinfo, inst) == ELK_ALIGN_1) {
333 if (reg.width == ELK_WIDTH_1 &&
334 elk_inst_exec_size(devinfo, inst) == ELK_EXECUTE_1) {
335 elk_inst_set_src1_hstride(devinfo, inst, ELK_HORIZONTAL_STRIDE_0);
336 elk_inst_set_src1_width(devinfo, inst, ELK_WIDTH_1);
337 elk_inst_set_src1_vstride(devinfo, inst, ELK_VERTICAL_STRIDE_0);
338 } else {
339 elk_inst_set_src1_hstride(devinfo, inst, reg.hstride);
340 elk_inst_set_src1_width(devinfo, inst, reg.width);
341 elk_inst_set_src1_vstride(devinfo, inst, reg.vstride);
342 }
343 } else {
344 elk_inst_set_src1_da16_swiz_x(devinfo, inst,
345 ELK_GET_SWZ(reg.swizzle, ELK_CHANNEL_X));
346 elk_inst_set_src1_da16_swiz_y(devinfo, inst,
347 ELK_GET_SWZ(reg.swizzle, ELK_CHANNEL_Y));
348 elk_inst_set_src1_da16_swiz_z(devinfo, inst,
349 ELK_GET_SWZ(reg.swizzle, ELK_CHANNEL_Z));
350 elk_inst_set_src1_da16_swiz_w(devinfo, inst,
351 ELK_GET_SWZ(reg.swizzle, ELK_CHANNEL_W));
352
353 if (reg.vstride == ELK_VERTICAL_STRIDE_8) {
354 /* This is an oddity of the fact that we're using the same
355 * register descriptions in align_16 as in align_1:
356 */
357 elk_inst_set_src1_vstride(devinfo, inst, ELK_VERTICAL_STRIDE_4);
358 } else if (devinfo->verx10 == 70 &&
359 reg.type == ELK_REGISTER_TYPE_DF &&
360 reg.vstride == ELK_VERTICAL_STRIDE_2) {
361 /* From SNB PRM:
362 *
363 * "For Align16 access mode, only encodings of 0000 and 0011
364 * are allowed. Other codes are reserved."
365 *
366 * Presumably the DevSNB behavior applies to IVB as well.
367 */
368 elk_inst_set_src1_vstride(devinfo, inst, ELK_VERTICAL_STRIDE_4);
369 } else {
370 elk_inst_set_src1_vstride(devinfo, inst, reg.vstride);
371 }
372 }
373 }
374 }
375 }
376
377 /**
378 * Specify the descriptor and extended descriptor immediate for a SEND(C)
379 * message instruction.
380 */
381 void
382 elk_set_desc_ex(struct elk_codegen *p, elk_inst *inst,
383 unsigned desc, unsigned ex_desc)
384 {
385 const struct intel_device_info *devinfo = p->devinfo;
386 assert(elk_inst_opcode(p->isa, inst) == ELK_OPCODE_SEND ||
387 elk_inst_opcode(p->isa, inst) == ELK_OPCODE_SENDC);
388 elk_inst_set_src1_file_type(devinfo, inst,
389 ELK_IMMEDIATE_VALUE, ELK_REGISTER_TYPE_UD);
390 elk_inst_set_send_desc(devinfo, inst, desc);
391 }
392
393 static void elk_set_math_message( struct elk_codegen *p,
394 elk_inst *inst,
395 unsigned function,
396 unsigned integer_type,
397 bool low_precision,
398 unsigned dataType )
399 {
400 const struct intel_device_info *devinfo = p->devinfo;
401 unsigned msg_length;
402 unsigned response_length;
403
404 /* Infer message length from the function */
405 switch (function) {
406 case ELK_MATH_FUNCTION_POW:
407 case ELK_MATH_FUNCTION_INT_DIV_QUOTIENT:
408 case ELK_MATH_FUNCTION_INT_DIV_REMAINDER:
409 case ELK_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
410 msg_length = 2;
411 break;
412 default:
413 msg_length = 1;
414 break;
415 }
416
417 /* Infer response length from the function */
418 switch (function) {
419 case ELK_MATH_FUNCTION_SINCOS:
420 case ELK_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
421 response_length = 2;
422 break;
423 default:
424 response_length = 1;
425 break;
426 }
427
428 elk_set_desc(p, inst, elk_message_desc(
429 devinfo, msg_length, response_length, false));
430
431 elk_inst_set_sfid(devinfo, inst, ELK_SFID_MATH);
432 elk_inst_set_math_msg_function(devinfo, inst, function);
433 elk_inst_set_math_msg_signed_int(devinfo, inst, integer_type);
434 elk_inst_set_math_msg_precision(devinfo, inst, low_precision);
435 elk_inst_set_math_msg_saturate(devinfo, inst, elk_inst_saturate(devinfo, inst));
436 elk_inst_set_math_msg_data_type(devinfo, inst, dataType);
437 elk_inst_set_saturate(devinfo, inst, 0);
438 }
439
440
441 static void elk_set_ff_sync_message(struct elk_codegen *p,
442 elk_inst *insn,
443 bool allocate,
444 unsigned response_length,
445 bool end_of_thread)
446 {
447 const struct intel_device_info *devinfo = p->devinfo;
448
449 elk_set_desc(p, insn, elk_message_desc(
450 devinfo, 1, response_length, true));
451
452 elk_inst_set_sfid(devinfo, insn, ELK_SFID_URB);
453 elk_inst_set_eot(devinfo, insn, end_of_thread);
454 elk_inst_set_urb_opcode(devinfo, insn, 1); /* FF_SYNC */
455 elk_inst_set_urb_allocate(devinfo, insn, allocate);
456 /* The following fields are not used by FF_SYNC: */
457 elk_inst_set_urb_global_offset(devinfo, insn, 0);
458 elk_inst_set_urb_swizzle_control(devinfo, insn, 0);
459 elk_inst_set_urb_used(devinfo, insn, 0);
460 elk_inst_set_urb_complete(devinfo, insn, 0);
461 }
462
463 static void elk_set_urb_message( struct elk_codegen *p,
464 elk_inst *insn,
465 enum elk_urb_write_flags flags,
466 unsigned msg_length,
467 unsigned response_length,
468 unsigned offset,
469 unsigned swizzle_control )
470 {
471 const struct intel_device_info *devinfo = p->devinfo;
472
473 assert(devinfo->ver < 7 || swizzle_control != ELK_URB_SWIZZLE_TRANSPOSE);
474 assert(devinfo->ver < 7 || !(flags & ELK_URB_WRITE_ALLOCATE));
475 assert(devinfo->ver >= 7 || !(flags & ELK_URB_WRITE_PER_SLOT_OFFSET));
476
477 elk_set_desc(p, insn, elk_message_desc(
478 devinfo, msg_length, response_length, true));
479
480 elk_inst_set_sfid(devinfo, insn, ELK_SFID_URB);
481 elk_inst_set_eot(devinfo, insn, !!(flags & ELK_URB_WRITE_EOT));
482
483 if (flags & ELK_URB_WRITE_OWORD) {
484 assert(msg_length == 2); /* header + one OWORD of data */
485 elk_inst_set_urb_opcode(devinfo, insn, ELK_URB_OPCODE_WRITE_OWORD);
486 } else {
487 elk_inst_set_urb_opcode(devinfo, insn, ELK_URB_OPCODE_WRITE_HWORD);
488 }
489
490 elk_inst_set_urb_global_offset(devinfo, insn, offset);
491 elk_inst_set_urb_swizzle_control(devinfo, insn, swizzle_control);
492
493 if (devinfo->ver < 8) {
494 elk_inst_set_urb_complete(devinfo, insn, !!(flags & ELK_URB_WRITE_COMPLETE));
495 }
496
497 if (devinfo->ver < 7) {
498 elk_inst_set_urb_allocate(devinfo, insn, !!(flags & ELK_URB_WRITE_ALLOCATE));
499 elk_inst_set_urb_used(devinfo, insn, !(flags & ELK_URB_WRITE_UNUSED));
500 } else {
501 elk_inst_set_urb_per_slot_offset(devinfo, insn,
502 !!(flags & ELK_URB_WRITE_PER_SLOT_OFFSET));
503 }
504 }
505
506 static void
507 gfx7_set_dp_scratch_message(struct elk_codegen *p,
508 elk_inst *inst,
509 bool write,
510 bool dword,
511 bool invalidate_after_read,
512 unsigned num_regs,
513 unsigned addr_offset,
514 unsigned mlen,
515 unsigned rlen,
516 bool header_present)
517 {
518 const struct intel_device_info *devinfo = p->devinfo;
519 assert(num_regs == 1 || num_regs == 2 || num_regs == 4 ||
520 (devinfo->ver >= 8 && num_regs == 8));
521 const unsigned block_size = (devinfo->ver >= 8 ? util_logbase2(num_regs) :
522 num_regs - 1);
523
524 elk_set_desc(p, inst, elk_message_desc(
525 devinfo, mlen, rlen, header_present));
526
527 elk_inst_set_sfid(devinfo, inst, GFX7_SFID_DATAPORT_DATA_CACHE);
528 elk_inst_set_dp_category(devinfo, inst, 1); /* Scratch Block Read/Write msgs */
529 elk_inst_set_scratch_read_write(devinfo, inst, write);
530 elk_inst_set_scratch_type(devinfo, inst, dword);
531 elk_inst_set_scratch_invalidate_after_read(devinfo, inst, invalidate_after_read);
532 elk_inst_set_scratch_block_size(devinfo, inst, block_size);
533 elk_inst_set_scratch_addr_offset(devinfo, inst, addr_offset);
534 }
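/* Illustration (not from this file): with num_regs == 4, the encoded
 * block_size is num_regs - 1 == 3 on Gfx7, and util_logbase2(4) == 2 on
 * Gfx8+.
 */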
535
536 static void
537 elk_inst_set_state(const struct elk_isa_info *isa,
538 elk_inst *insn,
539 const struct elk_insn_state *state)
540 {
541 const struct intel_device_info *devinfo = isa->devinfo;
542
543 elk_inst_set_exec_size(devinfo, insn, state->exec_size);
544 elk_inst_set_group(devinfo, insn, state->group);
545 elk_inst_set_compression(devinfo, insn, state->compressed);
546 elk_inst_set_access_mode(devinfo, insn, state->access_mode);
547 elk_inst_set_mask_control(devinfo, insn, state->mask_control);
548 elk_inst_set_saturate(devinfo, insn, state->saturate);
549 elk_inst_set_pred_control(devinfo, insn, state->predicate);
550 elk_inst_set_pred_inv(devinfo, insn, state->pred_inv);
551
552 if (elk_is_3src(isa, elk_inst_opcode(isa, insn)) &&
553 state->access_mode == ELK_ALIGN_16) {
554 elk_inst_set_3src_a16_flag_subreg_nr(devinfo, insn, state->flag_subreg % 2);
555 if (devinfo->ver >= 7)
556 elk_inst_set_3src_a16_flag_reg_nr(devinfo, insn, state->flag_subreg / 2);
557 } else {
558 elk_inst_set_flag_subreg_nr(devinfo, insn, state->flag_subreg % 2);
559 if (devinfo->ver >= 7)
560 elk_inst_set_flag_reg_nr(devinfo, insn, state->flag_subreg / 2);
561 }
562
563 if (devinfo->ver >= 6)
564 elk_inst_set_acc_wr_control(devinfo, insn, state->acc_wr_control);
565 }
566
567 static elk_inst *
568 elk_append_insns(struct elk_codegen *p, unsigned nr_insn, unsigned alignment)
569 {
570 assert(util_is_power_of_two_or_zero(sizeof(elk_inst)));
571 assert(util_is_power_of_two_or_zero(alignment));
572 const unsigned align_insn = MAX2(alignment / sizeof(elk_inst), 1);
573 const unsigned start_insn = ALIGN(p->nr_insn, align_insn);
574 const unsigned new_nr_insn = start_insn + nr_insn;
575
576 if (p->store_size < new_nr_insn) {
577 p->store_size = util_next_power_of_two(new_nr_insn * sizeof(elk_inst));
578 p->store = reralloc(p->mem_ctx, p->store, elk_inst, p->store_size);
579 }
580
581 /* Memset any padding due to alignment to 0. We don't want to be hashing
582 * or caching a bunch of random bits we got from a memory allocation.
583 */
584 if (p->nr_insn < start_insn) {
585 memset(&p->store[p->nr_insn], 0,
586 (start_insn - p->nr_insn) * sizeof(elk_inst));
587 }
588
589 assert(p->next_insn_offset == p->nr_insn * sizeof(elk_inst));
590 p->nr_insn = new_nr_insn;
591 p->next_insn_offset = new_nr_insn * sizeof(elk_inst);
592
593 return &p->store[start_insn];
594 }
595
596 void
597 elk_realign(struct elk_codegen *p, unsigned alignment)
598 {
599 elk_append_insns(p, 0, alignment);
600 }
601
602 int
603 elk_append_data(struct elk_codegen *p, void *data,
604 unsigned size, unsigned alignment)
605 {
606 unsigned nr_insn = DIV_ROUND_UP(size, sizeof(elk_inst));
607 void *dst = elk_append_insns(p, nr_insn, alignment);
608 memcpy(dst, data, size);
609
610 /* If it's not a whole number of instructions, memset the end */
611 if (size < nr_insn * sizeof(elk_inst))
612 memset(dst + size, 0, nr_insn * sizeof(elk_inst) - size);
613
614 return dst - (void *)p->store;
615 }
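/* A hedged usage sketch (not from this file; "table" is a hypothetical blob):
 * append out-of-band data to the program store, 16-byte aligned, and keep its
 * byte offset from the start of p->store.
 *
 *    static const uint32_t table[4] = { 1, 2, 3, 4 };
 *    int offset = elk_append_data(p, (void *)table, sizeof(table), 16);
 */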
616
617 #define next_insn elk_next_insn
618 elk_inst *
619 elk_next_insn(struct elk_codegen *p, unsigned opcode)
620 {
621 elk_inst *insn = elk_append_insns(p, 1, sizeof(elk_inst));
622
623 memset(insn, 0, sizeof(*insn));
624 elk_inst_set_opcode(p->isa, insn, opcode);
625
626 /* Apply the default instruction state */
627 elk_inst_set_state(p->isa, insn, p->current);
628
629 return insn;
630 }
631
632 void
633 elk_add_reloc(struct elk_codegen *p, uint32_t id,
634 enum elk_shader_reloc_type type,
635 uint32_t offset, uint32_t delta)
636 {
637 if (p->num_relocs + 1 > p->reloc_array_size) {
638 p->reloc_array_size = MAX2(16, p->reloc_array_size * 2);
639 p->relocs = reralloc(p->mem_ctx, p->relocs,
640 struct elk_shader_reloc, p->reloc_array_size);
641 }
642
643 p->relocs[p->num_relocs++] = (struct elk_shader_reloc) {
644 .id = id,
645 .type = type,
646 .offset = offset,
647 .delta = delta,
648 };
649 }
650
651 static elk_inst *
652 elk_alu1(struct elk_codegen *p, unsigned opcode,
653 struct elk_reg dest, struct elk_reg src)
654 {
655 elk_inst *insn = next_insn(p, opcode);
656 elk_set_dest(p, insn, dest);
657 elk_set_src0(p, insn, src);
658 return insn;
659 }
660
661 static elk_inst *
662 elk_alu2(struct elk_codegen *p, unsigned opcode,
663 struct elk_reg dest, struct elk_reg src0, struct elk_reg src1)
664 {
665 /* 64-bit immediates are only supported on 1-src instructions */
666 assert(src0.file != ELK_IMMEDIATE_VALUE || type_sz(src0.type) <= 4);
667 assert(src1.file != ELK_IMMEDIATE_VALUE || type_sz(src1.type) <= 4);
668
669 elk_inst *insn = next_insn(p, opcode);
670 elk_set_dest(p, insn, dest);
671 elk_set_src0(p, insn, src0);
672 elk_set_src1(p, insn, src1);
673 return insn;
674 }
675
676 static int
677 get_3src_subreg_nr(struct elk_reg reg)
678 {
679 /* Normally, SubRegNum is in bytes (0..31). However, 3-src instructions
680 * use 32-bit units (components 0..7). Since they only support F/D/UD
681 * types, this doesn't lose any flexibility, but uses fewer bits.
682 */
683 return reg.subnr / 4;
684 }
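/* Illustration (not from this file): for a 32-bit type, a byte subnr of 8
 * selects component 2, so get_3src_subreg_nr() returns 8 / 4 = 2.
 */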
685
686 static enum gfx10_align1_3src_vertical_stride
687 to_3src_align1_vstride(const struct intel_device_info *devinfo,
688 enum elk_vertical_stride vstride)
689 {
690 switch (vstride) {
691 case ELK_VERTICAL_STRIDE_0:
692 return ELK_ALIGN1_3SRC_VERTICAL_STRIDE_0;
693 case ELK_VERTICAL_STRIDE_2:
694 return ELK_ALIGN1_3SRC_VERTICAL_STRIDE_2;
695 case ELK_VERTICAL_STRIDE_4:
696 return ELK_ALIGN1_3SRC_VERTICAL_STRIDE_4;
697 case ELK_VERTICAL_STRIDE_8:
698 case ELK_VERTICAL_STRIDE_16:
699 return ELK_ALIGN1_3SRC_VERTICAL_STRIDE_8;
700 default:
701 unreachable("invalid vstride");
702 }
703 }
704
705
706 static enum gfx10_align1_3src_src_horizontal_stride
707 to_3src_align1_hstride(enum elk_horizontal_stride hstride)
708 {
709 switch (hstride) {
710 case ELK_HORIZONTAL_STRIDE_0:
711 return ELK_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_0;
712 case ELK_HORIZONTAL_STRIDE_1:
713 return ELK_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_1;
714 case ELK_HORIZONTAL_STRIDE_2:
715 return ELK_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_2;
716 case ELK_HORIZONTAL_STRIDE_4:
717 return ELK_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_4;
718 default:
719 unreachable("invalid hstride");
720 }
721 }
722
723 static elk_inst *
724 elk_alu3(struct elk_codegen *p, unsigned opcode, struct elk_reg dest,
725 struct elk_reg src0, struct elk_reg src1, struct elk_reg src2)
726 {
727 const struct intel_device_info *devinfo = p->devinfo;
728 elk_inst *inst = next_insn(p, opcode);
729
730 gfx7_convert_mrf_to_grf(p, &dest);
731
732 assert(dest.nr < XE2_MAX_GRF);
733
734 assert(src0.file == ELK_IMMEDIATE_VALUE || src0.nr < XE2_MAX_GRF);
735 assert(src1.file != ELK_IMMEDIATE_VALUE && src1.nr < XE2_MAX_GRF);
736 assert(src2.file == ELK_IMMEDIATE_VALUE || src2.nr < XE2_MAX_GRF);
737 assert(dest.address_mode == ELK_ADDRESS_DIRECT);
738 assert(src0.address_mode == ELK_ADDRESS_DIRECT);
739 assert(src1.address_mode == ELK_ADDRESS_DIRECT);
740 assert(src2.address_mode == ELK_ADDRESS_DIRECT);
741
742 assert(dest.file == ELK_GENERAL_REGISTER_FILE ||
743 dest.file == ELK_MESSAGE_REGISTER_FILE);
744 assert(dest.type == ELK_REGISTER_TYPE_F ||
745 dest.type == ELK_REGISTER_TYPE_DF ||
746 dest.type == ELK_REGISTER_TYPE_D ||
747 dest.type == ELK_REGISTER_TYPE_UD ||
748 (dest.type == ELK_REGISTER_TYPE_HF && devinfo->ver >= 8));
749 if (devinfo->ver == 6) {
750 elk_inst_set_3src_a16_dst_reg_file(devinfo, inst,
751 dest.file == ELK_MESSAGE_REGISTER_FILE);
752 }
753 elk_inst_set_3src_dst_reg_nr(devinfo, inst, dest.nr);
754 elk_inst_set_3src_a16_dst_subreg_nr(devinfo, inst, dest.subnr / 4);
755 elk_inst_set_3src_a16_dst_writemask(devinfo, inst, dest.writemask);
756
757 assert(src0.file == ELK_GENERAL_REGISTER_FILE);
758 elk_inst_set_3src_a16_src0_swizzle(devinfo, inst, src0.swizzle);
759 elk_inst_set_3src_a16_src0_subreg_nr(devinfo, inst, get_3src_subreg_nr(src0));
760 elk_inst_set_3src_src0_reg_nr(devinfo, inst, src0.nr);
761 elk_inst_set_3src_src0_abs(devinfo, inst, src0.abs);
762 elk_inst_set_3src_src0_negate(devinfo, inst, src0.negate);
763 elk_inst_set_3src_a16_src0_rep_ctrl(devinfo, inst,
764 src0.vstride == ELK_VERTICAL_STRIDE_0);
765
766 assert(src1.file == ELK_GENERAL_REGISTER_FILE);
767 elk_inst_set_3src_a16_src1_swizzle(devinfo, inst, src1.swizzle);
768 elk_inst_set_3src_a16_src1_subreg_nr(devinfo, inst, get_3src_subreg_nr(src1));
769 elk_inst_set_3src_src1_reg_nr(devinfo, inst, src1.nr);
770 elk_inst_set_3src_src1_abs(devinfo, inst, src1.abs);
771 elk_inst_set_3src_src1_negate(devinfo, inst, src1.negate);
772 elk_inst_set_3src_a16_src1_rep_ctrl(devinfo, inst,
773 src1.vstride == ELK_VERTICAL_STRIDE_0);
774
775 assert(src2.file == ELK_GENERAL_REGISTER_FILE);
776 elk_inst_set_3src_a16_src2_swizzle(devinfo, inst, src2.swizzle);
777 elk_inst_set_3src_a16_src2_subreg_nr(devinfo, inst, get_3src_subreg_nr(src2));
778 elk_inst_set_3src_src2_reg_nr(devinfo, inst, src2.nr);
779 elk_inst_set_3src_src2_abs(devinfo, inst, src2.abs);
780 elk_inst_set_3src_src2_negate(devinfo, inst, src2.negate);
781 elk_inst_set_3src_a16_src2_rep_ctrl(devinfo, inst,
782 src2.vstride == ELK_VERTICAL_STRIDE_0);
783
784 if (devinfo->ver >= 7) {
785 /* Set both the source and destination types based on dest.type,
786 * ignoring the source register types. The MAD and LRP emitters ensure
787 * that all four types are float. The BFE and BFI2 emitters, however,
788 * may send us mixed D and UD types and want us to ignore that and use
789 * the destination type.
790 */
791 elk_inst_set_3src_a16_src_type(devinfo, inst, dest.type);
792 elk_inst_set_3src_a16_dst_type(devinfo, inst, dest.type);
793
794 /* From the Bspec, 3D Media GPGPU, Instruction fields, srcType:
795 *
796 * "Three source instructions can use operands with mixed-mode
797 * precision. When SrcType field is set to :f or :hf it defines
798 * precision for source 0 only, and fields Src1Type and Src2Type
799 * define precision for other source operands:
800 *
801 * 0b = :f. Single precision Float (32-bit).
802 * 1b = :hf. Half precision Float (16-bit)."
803 */
804 if (src1.type == ELK_REGISTER_TYPE_HF)
805 elk_inst_set_3src_a16_src1_type(devinfo, inst, 1);
806
807 if (src2.type == ELK_REGISTER_TYPE_HF)
808 elk_inst_set_3src_a16_src2_type(devinfo, inst, 1);
809 }
810
811 return inst;
812 }
813
814 /***********************************************************************
815 * Convenience routines.
816 */
817 #define ALU1(OP) \
818 elk_inst *elk_##OP(struct elk_codegen *p, \
819 struct elk_reg dest, \
820 struct elk_reg src0) \
821 { \
822 return elk_alu1(p, ELK_OPCODE_##OP, dest, src0); \
823 }
824
825 #define ALU2(OP) \
826 elk_inst *elk_##OP(struct elk_codegen *p, \
827 struct elk_reg dest, \
828 struct elk_reg src0, \
829 struct elk_reg src1) \
830 { \
831 return elk_alu2(p, ELK_OPCODE_##OP, dest, src0, src1); \
832 }
833
834 #define ALU3(OP) \
835 elk_inst *elk_##OP(struct elk_codegen *p, \
836 struct elk_reg dest, \
837 struct elk_reg src0, \
838 struct elk_reg src1, \
839 struct elk_reg src2) \
840 { \
841 if (p->current->access_mode == ELK_ALIGN_16) { \
842 if (src0.vstride == ELK_VERTICAL_STRIDE_0) \
843 src0.swizzle = ELK_SWIZZLE_XXXX; \
844 if (src1.vstride == ELK_VERTICAL_STRIDE_0) \
845 src1.swizzle = ELK_SWIZZLE_XXXX; \
846 if (src2.vstride == ELK_VERTICAL_STRIDE_0) \
847 src2.swizzle = ELK_SWIZZLE_XXXX; \
848 } \
849 return elk_alu3(p, ELK_OPCODE_##OP, dest, src0, src1, src2); \
850 }
851
852 #define ALU3F(OP) \
853 elk_inst *elk_##OP(struct elk_codegen *p, \
854 struct elk_reg dest, \
855 struct elk_reg src0, \
856 struct elk_reg src1, \
857 struct elk_reg src2) \
858 { \
859 assert(dest.type == ELK_REGISTER_TYPE_F || \
860 dest.type == ELK_REGISTER_TYPE_DF); \
861 if (dest.type == ELK_REGISTER_TYPE_F) { \
862 assert(src0.type == ELK_REGISTER_TYPE_F); \
863 assert(src1.type == ELK_REGISTER_TYPE_F); \
864 assert(src2.type == ELK_REGISTER_TYPE_F); \
865 } else if (dest.type == ELK_REGISTER_TYPE_DF) { \
866 assert(src0.type == ELK_REGISTER_TYPE_DF); \
867 assert(src1.type == ELK_REGISTER_TYPE_DF); \
868 assert(src2.type == ELK_REGISTER_TYPE_DF); \
869 } \
870 \
871 if (p->current->access_mode == ELK_ALIGN_16) { \
872 if (src0.vstride == ELK_VERTICAL_STRIDE_0) \
873 src0.swizzle = ELK_SWIZZLE_XXXX; \
874 if (src1.vstride == ELK_VERTICAL_STRIDE_0) \
875 src1.swizzle = ELK_SWIZZLE_XXXX; \
876 if (src2.vstride == ELK_VERTICAL_STRIDE_0) \
877 src2.swizzle = ELK_SWIZZLE_XXXX; \
878 } \
879 return elk_alu3(p, ELK_OPCODE_##OP, dest, src0, src1, src2); \
880 }
881
882 ALU2(SEL)
883 ALU1(NOT)
884 ALU2(AND)
885 ALU2(OR)
886 ALU2(XOR)
887 ALU2(SHR)
888 ALU2(SHL)
889 ALU1(DIM)
890 ALU2(ASR)
891 ALU3(CSEL)
892 ALU1(FRC)
893 ALU1(RNDD)
894 ALU1(RNDE)
895 ALU1(RNDU)
896 ALU1(RNDZ)
897 ALU2(MAC)
898 ALU2(MACH)
899 ALU1(LZD)
900 ALU2(DP4)
901 ALU2(DPH)
902 ALU2(DP3)
903 ALU2(DP2)
904 ALU3(MAD)
905 ALU3F(LRP)
906 ALU1(BFREV)
907 ALU3(BFE)
908 ALU2(BFI1)
909 ALU3(BFI2)
910 ALU1(FBH)
911 ALU1(FBL)
912 ALU1(CBIT)
913 ALU2(ADDC)
914 ALU2(SUBB)
915
916 elk_inst *
917 elk_MOV(struct elk_codegen *p, struct elk_reg dest, struct elk_reg src0)
918 {
919 const struct intel_device_info *devinfo = p->devinfo;
920
921 /* When converting F->DF on IVB/BYT, every odd source channel is ignored.
922 * To avoid the problems that causes, we use an <X,2,0> source region to
923 * read each element twice.
924 */
925 if (devinfo->verx10 == 70 &&
926 elk_get_default_access_mode(p) == ELK_ALIGN_1 &&
927 dest.type == ELK_REGISTER_TYPE_DF &&
928 (src0.type == ELK_REGISTER_TYPE_F ||
929 src0.type == ELK_REGISTER_TYPE_D ||
930 src0.type == ELK_REGISTER_TYPE_UD) &&
931 !has_scalar_region(src0)) {
932 assert(src0.vstride == src0.width + src0.hstride);
933 src0.vstride = src0.hstride;
934 src0.width = ELK_WIDTH_2;
935 src0.hstride = ELK_HORIZONTAL_STRIDE_0;
936 }
937
938 return elk_alu1(p, ELK_OPCODE_MOV, dest, src0);
939 }
940
941 elk_inst *
942 elk_ADD(struct elk_codegen *p, struct elk_reg dest,
943 struct elk_reg src0, struct elk_reg src1)
944 {
945 /* 6.2.2: add */
946 if (src0.type == ELK_REGISTER_TYPE_F ||
947 (src0.file == ELK_IMMEDIATE_VALUE &&
948 src0.type == ELK_REGISTER_TYPE_VF)) {
949 assert(src1.type != ELK_REGISTER_TYPE_UD);
950 assert(src1.type != ELK_REGISTER_TYPE_D);
951 }
952
953 if (src1.type == ELK_REGISTER_TYPE_F ||
954 (src1.file == ELK_IMMEDIATE_VALUE &&
955 src1.type == ELK_REGISTER_TYPE_VF)) {
956 assert(src0.type != ELK_REGISTER_TYPE_UD);
957 assert(src0.type != ELK_REGISTER_TYPE_D);
958 }
959
960 return elk_alu2(p, ELK_OPCODE_ADD, dest, src0, src1);
961 }
962
963 elk_inst *
964 elk_AVG(struct elk_codegen *p, struct elk_reg dest,
965 struct elk_reg src0, struct elk_reg src1)
966 {
967 assert(dest.type == src0.type);
968 assert(src0.type == src1.type);
969 switch (src0.type) {
970 case ELK_REGISTER_TYPE_B:
971 case ELK_REGISTER_TYPE_UB:
972 case ELK_REGISTER_TYPE_W:
973 case ELK_REGISTER_TYPE_UW:
974 case ELK_REGISTER_TYPE_D:
975 case ELK_REGISTER_TYPE_UD:
976 break;
977 default:
978 unreachable("Bad type for elk_AVG");
979 }
980
981 return elk_alu2(p, ELK_OPCODE_AVG, dest, src0, src1);
982 }
983
984 elk_inst *
985 elk_MUL(struct elk_codegen *p, struct elk_reg dest,
986 struct elk_reg src0, struct elk_reg src1)
987 {
988 /* 6.32.38: mul */
989 if (src0.type == ELK_REGISTER_TYPE_D ||
990 src0.type == ELK_REGISTER_TYPE_UD ||
991 src1.type == ELK_REGISTER_TYPE_D ||
992 src1.type == ELK_REGISTER_TYPE_UD) {
993 assert(dest.type != ELK_REGISTER_TYPE_F);
994 }
995
996 if (src0.type == ELK_REGISTER_TYPE_F ||
997 (src0.file == ELK_IMMEDIATE_VALUE &&
998 src0.type == ELK_REGISTER_TYPE_VF)) {
999 assert(src1.type != ELK_REGISTER_TYPE_UD);
1000 assert(src1.type != ELK_REGISTER_TYPE_D);
1001 }
1002
1003 if (src1.type == ELK_REGISTER_TYPE_F ||
1004 (src1.file == ELK_IMMEDIATE_VALUE &&
1005 src1.type == ELK_REGISTER_TYPE_VF)) {
1006 assert(src0.type != ELK_REGISTER_TYPE_UD);
1007 assert(src0.type != ELK_REGISTER_TYPE_D);
1008 }
1009
1010 assert(src0.file != ELK_ARCHITECTURE_REGISTER_FILE ||
1011 src0.nr != ELK_ARF_ACCUMULATOR);
1012 assert(src1.file != ELK_ARCHITECTURE_REGISTER_FILE ||
1013 src1.nr != ELK_ARF_ACCUMULATOR);
1014
1015 return elk_alu2(p, ELK_OPCODE_MUL, dest, src0, src1);
1016 }
1017
1018 elk_inst *
1019 elk_LINE(struct elk_codegen *p, struct elk_reg dest,
1020 struct elk_reg src0, struct elk_reg src1)
1021 {
1022 src0.vstride = ELK_VERTICAL_STRIDE_0;
1023 src0.width = ELK_WIDTH_1;
1024 src0.hstride = ELK_HORIZONTAL_STRIDE_0;
1025 return elk_alu2(p, ELK_OPCODE_LINE, dest, src0, src1);
1026 }
1027
1028 elk_inst *
1029 elk_PLN(struct elk_codegen *p, struct elk_reg dest,
1030 struct elk_reg src0, struct elk_reg src1)
1031 {
1032 src0.vstride = ELK_VERTICAL_STRIDE_0;
1033 src0.width = ELK_WIDTH_1;
1034 src0.hstride = ELK_HORIZONTAL_STRIDE_0;
1035 src1.vstride = ELK_VERTICAL_STRIDE_8;
1036 src1.width = ELK_WIDTH_8;
1037 src1.hstride = ELK_HORIZONTAL_STRIDE_1;
1038 return elk_alu2(p, ELK_OPCODE_PLN, dest, src0, src1);
1039 }
1040
1041 elk_inst *
1042 elk_F32TO16(struct elk_codegen *p, struct elk_reg dst, struct elk_reg src)
1043 {
1044 assert(p->devinfo->ver == 7);
1045
1046 /* The F32TO16 instruction doesn't support 32-bit destination types in
1047 * Align1 mode. Gfx7 (only) does zero out the high 16 bits in Align16
1048 * mode as an undocumented feature.
1049 */
1050 if (ELK_ALIGN_16 == elk_get_default_access_mode(p)) {
1051 assert(dst.type == ELK_REGISTER_TYPE_UD);
1052 } else {
1053 assert(dst.type == ELK_REGISTER_TYPE_W ||
1054 dst.type == ELK_REGISTER_TYPE_UW);
1055 }
1056
1057 return elk_alu1(p, ELK_OPCODE_F32TO16, dst, src);
1058 }
1059
1060 elk_inst *
1061 elk_F16TO32(struct elk_codegen *p, struct elk_reg dst, struct elk_reg src)
1062 {
1063 assert(p->devinfo->ver == 7);
1064
1065 if (ELK_ALIGN_16 == elk_get_default_access_mode(p)) {
1066 assert(src.type == ELK_REGISTER_TYPE_UD);
1067 } else {
1068 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
1069 *
1070 * Because this instruction does not have a 16-bit floating-point
1071 * type, the source data type must be Word (W). The destination type
1072 * must be F (Float).
1073 */
1074 assert(src.type == ELK_REGISTER_TYPE_W ||
1075 src.type == ELK_REGISTER_TYPE_UW);
1076 }
1077
1078 return elk_alu1(p, ELK_OPCODE_F16TO32, dst, src);
1079 }
1080
1081
1082 void elk_NOP(struct elk_codegen *p)
1083 {
1084 elk_inst *insn = next_insn(p, ELK_OPCODE_NOP);
1085 memset(insn, 0, sizeof(*insn));
1086 elk_inst_set_opcode(p->isa, insn, ELK_OPCODE_NOP);
1087 }
1088
1089 /***********************************************************************
1090 * Comparisons, if/else/endif
1091 */
1092
1093 elk_inst *
1094 elk_JMPI(struct elk_codegen *p, struct elk_reg index,
1095 unsigned predicate_control)
1096 {
1097 const struct intel_device_info *devinfo = p->devinfo;
1098 struct elk_reg ip = elk_ip_reg();
1099 elk_inst *inst = elk_alu2(p, ELK_OPCODE_JMPI, ip, ip, index);
1100
1101 elk_inst_set_exec_size(devinfo, inst, ELK_EXECUTE_1);
1102 elk_inst_set_qtr_control(devinfo, inst, ELK_COMPRESSION_NONE);
1103 elk_inst_set_mask_control(devinfo, inst, ELK_MASK_DISABLE);
1104 elk_inst_set_pred_control(devinfo, inst, predicate_control);
1105
1106 return inst;
1107 }
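/* A hedged usage sketch (not from this file): a predicated jump over a block
 * of instructions; the jump distance is typically patched in afterwards, so
 * an immediate zero placeholder is common.
 *
 *    elk_inst *jmp = elk_JMPI(p, elk_imm_d(0), ELK_PREDICATE_NORMAL);
 *    // ... emit the skipped block, then patch jmp's jump distance.
 */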
1108
1109 static void
1110 push_if_stack(struct elk_codegen *p, elk_inst *inst)
1111 {
1112 p->if_stack[p->if_stack_depth] = inst - p->store;
1113
1114 p->if_stack_depth++;
1115 if (p->if_stack_array_size <= p->if_stack_depth) {
1116 p->if_stack_array_size *= 2;
1117 p->if_stack = reralloc(p->mem_ctx, p->if_stack, int,
1118 p->if_stack_array_size);
1119 }
1120 }
1121
1122 static elk_inst *
1123 pop_if_stack(struct elk_codegen *p)
1124 {
1125 p->if_stack_depth--;
1126 return &p->store[p->if_stack[p->if_stack_depth]];
1127 }
1128
1129 static void
1130 push_loop_stack(struct elk_codegen *p, elk_inst *inst)
1131 {
1132 if (p->loop_stack_array_size <= (p->loop_stack_depth + 1)) {
1133 p->loop_stack_array_size *= 2;
1134 p->loop_stack = reralloc(p->mem_ctx, p->loop_stack, int,
1135 p->loop_stack_array_size);
1136 p->if_depth_in_loop = reralloc(p->mem_ctx, p->if_depth_in_loop, int,
1137 p->loop_stack_array_size);
1138 }
1139
1140 p->loop_stack[p->loop_stack_depth] = inst - p->store;
1141 p->loop_stack_depth++;
1142 p->if_depth_in_loop[p->loop_stack_depth] = 0;
1143 }
1144
1145 static elk_inst *
1146 get_inner_do_insn(struct elk_codegen *p)
1147 {
1148 return &p->store[p->loop_stack[p->loop_stack_depth - 1]];
1149 }
1150
1151 /* EU takes the value from the flag register and pushes it onto some
1152 * sort of a stack (presumably merging with any flag value already on
1153 * the stack). Within an if block, the flags at the top of the stack
1154 * control execution on each channel of the unit, e.g. on each of the
1155 * 16 pixel values in our wm programs.
1156 *
1157 * When the matching 'else' instruction is reached (presumably by
1158 * countdown of the instruction count patched in by our ELSE/ENDIF
1159 * functions), the relevant flags are inverted.
1160 *
1161 * When the matching 'endif' instruction is reached, the flags are
1162 * popped off. If the stack is now empty, normal execution resumes.
1163 */
1164 elk_inst *
1165 elk_IF(struct elk_codegen *p, unsigned execute_size)
1166 {
1167 const struct intel_device_info *devinfo = p->devinfo;
1168 elk_inst *insn;
1169
1170 insn = next_insn(p, ELK_OPCODE_IF);
1171
1172 /* Override the defaults for this instruction:
1173 */
1174 if (devinfo->ver < 6) {
1175 elk_set_dest(p, insn, elk_ip_reg());
1176 elk_set_src0(p, insn, elk_ip_reg());
1177 elk_set_src1(p, insn, elk_imm_d(0x0));
1178 } else if (devinfo->ver == 6) {
1179 elk_set_dest(p, insn, elk_imm_w(0));
1180 elk_inst_set_gfx6_jump_count(devinfo, insn, 0);
1181 elk_set_src0(p, insn, vec1(retype(elk_null_reg(), ELK_REGISTER_TYPE_D)));
1182 elk_set_src1(p, insn, vec1(retype(elk_null_reg(), ELK_REGISTER_TYPE_D)));
1183 } else if (devinfo->ver == 7) {
1184 elk_set_dest(p, insn, vec1(retype(elk_null_reg(), ELK_REGISTER_TYPE_D)));
1185 elk_set_src0(p, insn, vec1(retype(elk_null_reg(), ELK_REGISTER_TYPE_D)));
1186 elk_set_src1(p, insn, elk_imm_w(0));
1187 elk_inst_set_jip(devinfo, insn, 0);
1188 elk_inst_set_uip(devinfo, insn, 0);
1189 } else {
1190 elk_set_dest(p, insn, vec1(retype(elk_null_reg(), ELK_REGISTER_TYPE_D)));
1191 elk_set_src0(p, insn, elk_imm_d(0));
1192 elk_inst_set_jip(devinfo, insn, 0);
1193 elk_inst_set_uip(devinfo, insn, 0);
1194 }
1195
1196 elk_inst_set_exec_size(devinfo, insn, execute_size);
1197 elk_inst_set_qtr_control(devinfo, insn, ELK_COMPRESSION_NONE);
1198 elk_inst_set_pred_control(devinfo, insn, ELK_PREDICATE_NORMAL);
1199 elk_inst_set_mask_control(devinfo, insn, ELK_MASK_ENABLE);
1200 if (!p->single_program_flow && devinfo->ver < 6)
1201 elk_inst_set_thread_control(devinfo, insn, ELK_THREAD_SWITCH);
1202
1203 push_if_stack(p, insn);
1204 p->if_depth_in_loop[p->loop_stack_depth]++;
1205 return insn;
1206 }
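/* A hedged sketch of how the IF/ELSE/ENDIF helpers pair up (not from this
 * file); the CMP that loads the flag register beforehand is omitted:
 *
 *    elk_IF(p, ELK_EXECUTE_8);
 *    // ... "then" block ...
 *    elk_ELSE(p);
 *    // ... "else" block ...
 *    elk_ENDIF(p);   // pops the stack and patches the jump targets
 */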
1207
1208 /* This function is only used for gfx6-style IF instructions with an
1209 * embedded comparison (conditional modifier). It is not used on gfx7.
1210 */
1211 elk_inst *
1212 elk_gfx6_IF(struct elk_codegen *p, enum elk_conditional_mod conditional,
1213 struct elk_reg src0, struct elk_reg src1)
1214 {
1215 const struct intel_device_info *devinfo = p->devinfo;
1216 elk_inst *insn;
1217
1218 insn = next_insn(p, ELK_OPCODE_IF);
1219
1220 elk_set_dest(p, insn, elk_imm_w(0));
1221 elk_inst_set_exec_size(devinfo, insn, elk_get_default_exec_size(p));
1222 elk_inst_set_gfx6_jump_count(devinfo, insn, 0);
1223 elk_set_src0(p, insn, src0);
1224 elk_set_src1(p, insn, src1);
1225
1226 assert(elk_inst_qtr_control(devinfo, insn) == ELK_COMPRESSION_NONE);
1227 assert(elk_inst_pred_control(devinfo, insn) == ELK_PREDICATE_NONE);
1228 elk_inst_set_cond_modifier(devinfo, insn, conditional);
1229
1230 push_if_stack(p, insn);
1231 return insn;
1232 }
1233
1234 /**
1235 * In single-program-flow (SPF) mode, convert IF and ELSE into ADDs.
1236 */
1237 static void
1238 convert_IF_ELSE_to_ADD(struct elk_codegen *p,
1239 elk_inst *if_inst, elk_inst *else_inst)
1240 {
1241 const struct intel_device_info *devinfo = p->devinfo;
1242
1243 /* The next instruction (where the ENDIF would be, if it existed) */
1244 elk_inst *next_inst = &p->store[p->nr_insn];
1245
1246 assert(p->single_program_flow);
1247 assert(if_inst != NULL && elk_inst_opcode(p->isa, if_inst) == ELK_OPCODE_IF);
1248 assert(else_inst == NULL || elk_inst_opcode(p->isa, else_inst) == ELK_OPCODE_ELSE);
1249 assert(elk_inst_exec_size(devinfo, if_inst) == ELK_EXECUTE_1);
1250
1251 /* Convert IF to an ADD instruction that moves the instruction pointer
1252 * to the first instruction of the ELSE block. If there is no ELSE
1253 * block, point to where ENDIF would be. Reverse the predicate.
1254 *
1255 * There's no need to execute an ENDIF since we don't need to do any
1256 * stack operations, and if we're currently executing, we just want to
1257 * continue normally.
1258 */
1259 elk_inst_set_opcode(p->isa, if_inst, ELK_OPCODE_ADD);
1260 elk_inst_set_pred_inv(devinfo, if_inst, true);
1261
1262 if (else_inst != NULL) {
1263 /* Convert ELSE to an ADD instruction that points where the ENDIF
1264 * would be.
1265 */
1266 elk_inst_set_opcode(p->isa, else_inst, ELK_OPCODE_ADD);
1267
1268 elk_inst_set_imm_ud(devinfo, if_inst, (else_inst - if_inst + 1) * 16);
1269 elk_inst_set_imm_ud(devinfo, else_inst, (next_inst - else_inst) * 16);
1270 } else {
1271 elk_inst_set_imm_ud(devinfo, if_inst, (next_inst - if_inst) * 16);
1272 }
1273 }
1274
1275 /**
1276 * Patch IF and ELSE instructions with appropriate jump targets.
1277 */
1278 static void
1279 patch_IF_ELSE(struct elk_codegen *p,
1280 elk_inst *if_inst, elk_inst *else_inst, elk_inst *endif_inst)
1281 {
1282 const struct intel_device_info *devinfo = p->devinfo;
1283
1284 /* We shouldn't be patching IF and ELSE instructions in single program flow
1285 * mode when gen < 6, because in single program flow mode on those
1286 * platforms, we convert flow control instructions to conditional ADDs that
1287 * operate on IP (see elk_ENDIF).
1288 *
1289 * However, on Gfx6, writing to IP doesn't work in single program flow mode
1290 * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
1291 * not be updated by non-flow control instructions."). And on later
1292 * platforms, there is no significant benefit to converting control flow
1293 * instructions to conditional ADDs. So we do patch IF and ELSE
1294 * instructions in single program flow mode on those platforms.
1295 */
1296 if (devinfo->ver < 6)
1297 assert(!p->single_program_flow);
1298
1299 assert(if_inst != NULL && elk_inst_opcode(p->isa, if_inst) == ELK_OPCODE_IF);
1300 assert(endif_inst != NULL);
1301 assert(else_inst == NULL || elk_inst_opcode(p->isa, else_inst) == ELK_OPCODE_ELSE);
1302
1303 unsigned br = elk_jump_scale(devinfo);
1304
1305 assert(elk_inst_opcode(p->isa, endif_inst) == ELK_OPCODE_ENDIF);
1306 elk_inst_set_exec_size(devinfo, endif_inst, elk_inst_exec_size(devinfo, if_inst));
1307
1308 if (else_inst == NULL) {
1309 /* Patch IF -> ENDIF */
1310 if (devinfo->ver < 6) {
1311 /* Turn it into an IFF, which means no mask stack operations for
1312 * all-false and jumping past the ENDIF.
1313 */
1314 elk_inst_set_opcode(p->isa, if_inst, ELK_OPCODE_IFF);
1315 elk_inst_set_gfx4_jump_count(devinfo, if_inst,
1316 br * (endif_inst - if_inst + 1));
1317 elk_inst_set_gfx4_pop_count(devinfo, if_inst, 0);
1318 } else if (devinfo->ver == 6) {
1319 /* As of gfx6, there is no IFF and IF must point to the ENDIF. */
1320 elk_inst_set_gfx6_jump_count(devinfo, if_inst, br*(endif_inst - if_inst));
1321 } else {
1322 elk_inst_set_uip(devinfo, if_inst, br * (endif_inst - if_inst));
1323 elk_inst_set_jip(devinfo, if_inst, br * (endif_inst - if_inst));
1324 }
1325 } else {
1326 elk_inst_set_exec_size(devinfo, else_inst, elk_inst_exec_size(devinfo, if_inst));
1327
1328 /* Patch IF -> ELSE */
1329 if (devinfo->ver < 6) {
1330 elk_inst_set_gfx4_jump_count(devinfo, if_inst,
1331 br * (else_inst - if_inst));
1332 elk_inst_set_gfx4_pop_count(devinfo, if_inst, 0);
1333 } else if (devinfo->ver == 6) {
1334 elk_inst_set_gfx6_jump_count(devinfo, if_inst,
1335 br * (else_inst - if_inst + 1));
1336 }
1337
1338 /* Patch ELSE -> ENDIF */
1339 if (devinfo->ver < 6) {
1340 /* ELK_OPCODE_ELSE pre-gfx6 should point just past the
1341 * matching ENDIF.
1342 */
1343 elk_inst_set_gfx4_jump_count(devinfo, else_inst,
1344 br * (endif_inst - else_inst + 1));
1345 elk_inst_set_gfx4_pop_count(devinfo, else_inst, 1);
1346 } else if (devinfo->ver == 6) {
1347 /* ELK_OPCODE_ELSE on gfx6 should point to the matching ENDIF. */
1348 elk_inst_set_gfx6_jump_count(devinfo, else_inst,
1349 br * (endif_inst - else_inst));
1350 } else {
1351 /* The IF instruction's JIP should point just past the ELSE */
1352 elk_inst_set_jip(devinfo, if_inst, br * (else_inst - if_inst + 1));
1353 /* The IF instruction's UIP and ELSE's JIP should point to ENDIF */
1354 elk_inst_set_uip(devinfo, if_inst, br * (endif_inst - if_inst));
1355
1356 if (devinfo->ver >= 8) {
1357 /* Set the ELSE instruction to use branch_ctrl with a join
1358 * jump target pointing at the NOP inserted right before
1359 * the ENDIF instruction in order to make sure it is
1360 * executed in all cases, since attempting to do the same
1361 * as on other generations could cause the EU to jump at
1362 * the instruction immediately after the ENDIF due to
1363 * Wa_220160235, which could cause the program to continue
1364 * running with all channels disabled.
1365 */
1366 elk_inst_set_jip(devinfo, else_inst, br * (endif_inst - else_inst - 1));
1367 elk_inst_set_branch_control(devinfo, else_inst, true);
1368 } else {
1369 elk_inst_set_jip(devinfo, else_inst, br * (endif_inst - else_inst));
1370 }
1371
1372 if (devinfo->ver >= 8) {
1373 /* Since we don't set branch_ctrl on Gfx11+, the ELSE's
1374 * JIP and UIP both should point to ENDIF on those
1375 * platforms.
1376 */
1377 elk_inst_set_uip(devinfo, else_inst, br * (endif_inst - else_inst));
1378 }
1379 }
1380 }
1381 }
1382
1383 void
1384 elk_ELSE(struct elk_codegen *p)
1385 {
1386 const struct intel_device_info *devinfo = p->devinfo;
1387 elk_inst *insn;
1388
1389 insn = next_insn(p, ELK_OPCODE_ELSE);
1390
1391 if (devinfo->ver < 6) {
1392 elk_set_dest(p, insn, elk_ip_reg());
1393 elk_set_src0(p, insn, elk_ip_reg());
1394 elk_set_src1(p, insn, elk_imm_d(0x0));
1395 } else if (devinfo->ver == 6) {
1396 elk_set_dest(p, insn, elk_imm_w(0));
1397 elk_inst_set_gfx6_jump_count(devinfo, insn, 0);
1398 elk_set_src0(p, insn, retype(elk_null_reg(), ELK_REGISTER_TYPE_D));
1399 elk_set_src1(p, insn, retype(elk_null_reg(), ELK_REGISTER_TYPE_D));
1400 } else if (devinfo->ver == 7) {
1401 elk_set_dest(p, insn, retype(elk_null_reg(), ELK_REGISTER_TYPE_D));
1402 elk_set_src0(p, insn, retype(elk_null_reg(), ELK_REGISTER_TYPE_D));
1403 elk_set_src1(p, insn, elk_imm_w(0));
1404 elk_inst_set_jip(devinfo, insn, 0);
1405 elk_inst_set_uip(devinfo, insn, 0);
1406 } else {
1407 elk_set_dest(p, insn, retype(elk_null_reg(), ELK_REGISTER_TYPE_D));
1408 elk_set_src0(p, insn, elk_imm_d(0));
1409 elk_inst_set_jip(devinfo, insn, 0);
1410 elk_inst_set_uip(devinfo, insn, 0);
1411 }
1412
1413 elk_inst_set_qtr_control(devinfo, insn, ELK_COMPRESSION_NONE);
1414 elk_inst_set_mask_control(devinfo, insn, ELK_MASK_ENABLE);
1415 if (!p->single_program_flow && devinfo->ver < 6)
1416 elk_inst_set_thread_control(devinfo, insn, ELK_THREAD_SWITCH);
1417
1418 push_if_stack(p, insn);
1419 }
1420
1421 void
1422 elk_ENDIF(struct elk_codegen *p)
1423 {
1424 const struct intel_device_info *devinfo = p->devinfo;
1425 elk_inst *insn = NULL;
1426 elk_inst *else_inst = NULL;
1427 elk_inst *if_inst = NULL;
1428 elk_inst *tmp;
1429 bool emit_endif = true;
1430
1431 assert(p->if_stack_depth > 0);
1432
1433 if (devinfo->ver >= 8 &&
1434 elk_inst_opcode(p->isa, &p->store[p->if_stack[
1435 p->if_stack_depth - 1]]) == ELK_OPCODE_ELSE) {
1436 /* Insert a NOP to be specified as join instruction within the
1437 * ELSE block, which is valid for an ELSE instruction with
1438 * branch_ctrl on. The ELSE instruction will be set to jump
1439 * here instead of to the ENDIF instruction, since attempting to
1440 * do the latter would prevent the ENDIF from being executed in
1441 * some cases due to Wa_220160235, which could cause the program
1442 * to continue running with all channels disabled.
1443 */
1444 elk_NOP(p);
1445 }
1446
1447 /* In single program flow mode, we can express IF and ELSE instructions
1448 * equivalently as ADD instructions that operate on IP. On platforms prior
1449 * to Gfx6, flow control instructions cause an implied thread switch, so
1450 * this is a significant savings.
1451 *
1452 * However, on Gfx6, writing to IP doesn't work in single program flow mode
1453 * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
1454 * not be updated by non-flow control instructions."). And on later
1455 * platforms, there is no significant benefit to converting control flow
1456 * instructions to conditional ADDs. So we only do this trick on Gfx4 and
1457 * Gfx5.
1458 */
1459 if (devinfo->ver < 6 && p->single_program_flow)
1460 emit_endif = false;
1461
1462 /*
1463 * A single next_insn() may change the base address of the instruction
1464 * store memory (p->store), so call it first, before referencing the
1465 * instruction store pointer through an index.
1466 */
1467 if (emit_endif)
1468 insn = next_insn(p, ELK_OPCODE_ENDIF);
1469
1470 /* Pop the IF and (optional) ELSE instructions from the stack */
1471 p->if_depth_in_loop[p->loop_stack_depth]--;
1472 tmp = pop_if_stack(p);
1473 if (elk_inst_opcode(p->isa, tmp) == ELK_OPCODE_ELSE) {
1474 else_inst = tmp;
1475 tmp = pop_if_stack(p);
1476 }
1477 if_inst = tmp;
1478
1479 if (!emit_endif) {
1480 /* ENDIF is useless; don't bother emitting it. */
1481 convert_IF_ELSE_to_ADD(p, if_inst, else_inst);
1482 return;
1483 }
1484
1485 if (devinfo->ver < 6) {
1486 elk_set_dest(p, insn, retype(elk_null_reg(), ELK_REGISTER_TYPE_D));
1487 elk_set_src0(p, insn, retype(elk_null_reg(), ELK_REGISTER_TYPE_D));
1488 elk_set_src1(p, insn, elk_imm_d(0x0));
1489 } else if (devinfo->ver == 6) {
1490 elk_set_dest(p, insn, elk_imm_w(0));
1491 elk_set_src0(p, insn, retype(elk_null_reg(), ELK_REGISTER_TYPE_D));
1492 elk_set_src1(p, insn, retype(elk_null_reg(), ELK_REGISTER_TYPE_D));
1493 } else if (devinfo->ver == 7) {
1494 elk_set_dest(p, insn, retype(elk_null_reg(), ELK_REGISTER_TYPE_D));
1495 elk_set_src0(p, insn, retype(elk_null_reg(), ELK_REGISTER_TYPE_D));
1496 elk_set_src1(p, insn, elk_imm_w(0));
1497 } else {
1498 elk_set_src0(p, insn, elk_imm_d(0));
1499 }
1500
1501 elk_inst_set_qtr_control(devinfo, insn, ELK_COMPRESSION_NONE);
1502 elk_inst_set_mask_control(devinfo, insn, ELK_MASK_ENABLE);
1503 if (devinfo->ver < 6)
1504 elk_inst_set_thread_control(devinfo, insn, ELK_THREAD_SWITCH);
1505
1506 /* Also pop item off the stack in the endif instruction: */
1507 if (devinfo->ver < 6) {
1508 elk_inst_set_gfx4_jump_count(devinfo, insn, 0);
1509 elk_inst_set_gfx4_pop_count(devinfo, insn, 1);
1510 } else if (devinfo->ver == 6) {
1511 elk_inst_set_gfx6_jump_count(devinfo, insn, 2);
1512 } else {
1513 elk_inst_set_jip(devinfo, insn, 2);
1514 }
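   /* Now that the ENDIF exists, go back and point the matching IF (and
    * ELSE, if any) at it.
    */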
1515 patch_IF_ELSE(p, if_inst, else_inst, insn);
1516 }
1517
1518 elk_inst *
1519 elk_BREAK(struct elk_codegen *p)
1520 {
1521 const struct intel_device_info *devinfo = p->devinfo;
1522 elk_inst *insn;
1523
1524 insn = next_insn(p, ELK_OPCODE_BREAK);
1525 if (devinfo->ver >= 8) {
1526 elk_set_dest(p, insn, retype(elk_null_reg(), ELK_REGISTER_TYPE_D));
1527 elk_set_src0(p, insn, elk_imm_d(0x0));
1528 } else if (devinfo->ver >= 6) {
1529 elk_set_dest(p, insn, retype(elk_null_reg(), ELK_REGISTER_TYPE_D));
1530 elk_set_src0(p, insn, retype(elk_null_reg(), ELK_REGISTER_TYPE_D));
1531 elk_set_src1(p, insn, elk_imm_d(0x0));
1532 } else {
1533 elk_set_dest(p, insn, elk_ip_reg());
1534 elk_set_src0(p, insn, elk_ip_reg());
1535 elk_set_src1(p, insn, elk_imm_d(0x0));
1536 elk_inst_set_gfx4_pop_count(devinfo, insn,
1537 p->if_depth_in_loop[p->loop_stack_depth]);
1538 }
1539 elk_inst_set_qtr_control(devinfo, insn, ELK_COMPRESSION_NONE);
1540 elk_inst_set_exec_size(devinfo, insn, elk_get_default_exec_size(p));
1541
1542 return insn;
1543 }
1544
1545 elk_inst *
1546 elk_CONT(struct elk_codegen *p)
1547 {
1548 const struct intel_device_info *devinfo = p->devinfo;
1549 elk_inst *insn;
1550
1551 insn = next_insn(p, ELK_OPCODE_CONTINUE);
1552 elk_set_dest(p, insn, elk_ip_reg());
1553 if (devinfo->ver >= 8) {
1554 elk_set_src0(p, insn, elk_imm_d(0x0));
1555 } else {
1556 elk_set_src0(p, insn, elk_ip_reg());
1557 elk_set_src1(p, insn, elk_imm_d(0x0));
1558 }
1559
1560 if (devinfo->ver < 6) {
1561 elk_inst_set_gfx4_pop_count(devinfo, insn,
1562 p->if_depth_in_loop[p->loop_stack_depth]);
1563 }
1564 elk_inst_set_qtr_control(devinfo, insn, ELK_COMPRESSION_NONE);
1565 elk_inst_set_exec_size(devinfo, insn, elk_get_default_exec_size(p));
1566 return insn;
1567 }
1568
1569 elk_inst *
1570 elk_HALT(struct elk_codegen *p)
1571 {
1572 const struct intel_device_info *devinfo = p->devinfo;
1573 elk_inst *insn;
1574
1575 insn = next_insn(p, ELK_OPCODE_HALT);
1576 elk_set_dest(p, insn, retype(elk_null_reg(), ELK_REGISTER_TYPE_D));
1577 if (devinfo->ver < 6) {
1578 /* From the Gfx4 PRM:
1579 *
1580 * "IP register must be put (for example, by the assembler) at <dst>
1581  *    and <src0> locations."
1582 */
1583 elk_set_dest(p, insn, elk_ip_reg());
1584 elk_set_src0(p, insn, elk_ip_reg());
1585 elk_set_src1(p, insn, elk_imm_d(0x0)); /* exitcode updated later. */
1586 } else if (devinfo->ver < 8) {
1587 elk_set_src0(p, insn, retype(elk_null_reg(), ELK_REGISTER_TYPE_D));
1588 elk_set_src1(p, insn, elk_imm_d(0x0)); /* UIP and JIP, updated later. */
1589 } else {
1590 assert(devinfo->ver == 8);
1591 elk_set_src0(p, insn, elk_imm_d(0x0));
1592 }
1593
1594 elk_inst_set_qtr_control(devinfo, insn, ELK_COMPRESSION_NONE);
1595 elk_inst_set_exec_size(devinfo, insn, elk_get_default_exec_size(p));
1596 return insn;
1597 }
1598
1599 /* DO/WHILE loop:
1600 *
1601 * The DO/WHILE is just an unterminated loop -- break or continue are
1602 * used for control within the loop. We have a few ways they can be
1603 * done.
1604 *
1605 * For uniform control flow, the WHILE is just a jump, so ADD ip, ip,
1606 * jip and no DO instruction.
1607 *
1608 * For non-uniform control flow pre-gfx6, there's a DO instruction to
1609 * push the mask, and a WHILE to jump back, and BREAK to get out and
1610 * pop the mask.
1611 *
1612 * For gfx6, there's no more mask stack, so no need for DO. WHILE
1613 * just points back to the first instruction of the loop.
1614 */
1615 elk_inst *
1616 elk_DO(struct elk_codegen *p, unsigned execute_size)
1617 {
1618 const struct intel_device_info *devinfo = p->devinfo;
1619
1620 if (devinfo->ver >= 6 || p->single_program_flow) {
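      /* On gfx6+ (and in single-program-flow mode) no DO instruction is
       * emitted; just remember where the loop starts so that elk_WHILE()
       * can compute its backward jump.
       */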
1621 push_loop_stack(p, &p->store[p->nr_insn]);
1622 return &p->store[p->nr_insn];
1623 } else {
1624 elk_inst *insn = next_insn(p, ELK_OPCODE_DO);
1625
1626 push_loop_stack(p, insn);
1627
1628 /* Override the defaults for this instruction:
1629 */
1630 elk_set_dest(p, insn, elk_null_reg());
1631 elk_set_src0(p, insn, elk_null_reg());
1632 elk_set_src1(p, insn, elk_null_reg());
1633
1634 elk_inst_set_qtr_control(devinfo, insn, ELK_COMPRESSION_NONE);
1635 elk_inst_set_exec_size(devinfo, insn, execute_size);
1636 elk_inst_set_pred_control(devinfo, insn, ELK_PREDICATE_NONE);
1637
1638 return insn;
1639 }
1640 }
1641
1642 /**
1643 * For pre-gfx6, we patch BREAK/CONT instructions to point at the WHILE
1644 * instruction here.
1645 *
1646 * For gfx6+, see elk_set_uip_jip(), which doesn't care so much about the loop
1647 * nesting, since it can always just point to the end of the block/current loop.
1648 */
1649 static void
1650 elk_patch_break_cont(struct elk_codegen *p, elk_inst *while_inst)
1651 {
1652 const struct intel_device_info *devinfo = p->devinfo;
1653 elk_inst *do_inst = get_inner_do_insn(p);
1654 elk_inst *inst;
1655 unsigned br = elk_jump_scale(devinfo);
1656
1657 assert(devinfo->ver < 6);
1658
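   /* Walk backwards from the WHILE to the DO, patching any unpatched BREAK
    * and CONTINUE.  A BREAK jumps one instruction past the WHILE (hence the
    * extra +1 below) so it leaves the loop, while a CONTINUE jumps to the
    * WHILE itself.
    */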
1659 for (inst = while_inst - 1; inst != do_inst; inst--) {
1660 /* If the jump count is != 0, that means that this instruction has already
1661 * been patched because it's part of a loop inside of the one we're
1662 * patching.
1663 */
1664 if (elk_inst_opcode(p->isa, inst) == ELK_OPCODE_BREAK &&
1665 elk_inst_gfx4_jump_count(devinfo, inst) == 0) {
1666 elk_inst_set_gfx4_jump_count(devinfo, inst, br*((while_inst - inst) + 1));
1667 } else if (elk_inst_opcode(p->isa, inst) == ELK_OPCODE_CONTINUE &&
1668 elk_inst_gfx4_jump_count(devinfo, inst) == 0) {
1669 elk_inst_set_gfx4_jump_count(devinfo, inst, br * (while_inst - inst));
1670 }
1671 }
1672 }
1673
1674 elk_inst *
1675 elk_WHILE(struct elk_codegen *p)
1676 {
1677 const struct intel_device_info *devinfo = p->devinfo;
1678 elk_inst *insn, *do_insn;
1679 unsigned br = elk_jump_scale(devinfo);
1680
1681 if (devinfo->ver >= 6) {
1682 insn = next_insn(p, ELK_OPCODE_WHILE);
1683 do_insn = get_inner_do_insn(p);
1684
1685 if (devinfo->ver >= 8) {
1686 elk_set_dest(p, insn, retype(elk_null_reg(), ELK_REGISTER_TYPE_D));
1687 elk_set_src0(p, insn, elk_imm_d(0));
1688 elk_inst_set_jip(devinfo, insn, br * (do_insn - insn));
1689 } else if (devinfo->ver == 7) {
1690 elk_set_dest(p, insn, retype(elk_null_reg(), ELK_REGISTER_TYPE_D));
1691 elk_set_src0(p, insn, retype(elk_null_reg(), ELK_REGISTER_TYPE_D));
1692 elk_set_src1(p, insn, elk_imm_w(0));
1693 elk_inst_set_jip(devinfo, insn, br * (do_insn - insn));
1694 } else {
1695 elk_set_dest(p, insn, elk_imm_w(0));
1696 elk_inst_set_gfx6_jump_count(devinfo, insn, br * (do_insn - insn));
1697 elk_set_src0(p, insn, retype(elk_null_reg(), ELK_REGISTER_TYPE_D));
1698 elk_set_src1(p, insn, retype(elk_null_reg(), ELK_REGISTER_TYPE_D));
1699 }
1700
1701 elk_inst_set_exec_size(devinfo, insn, elk_get_default_exec_size(p));
1702
1703 } else {
1704 if (p->single_program_flow) {
1705 insn = next_insn(p, ELK_OPCODE_ADD);
1706 do_insn = get_inner_do_insn(p);
1707
1708 elk_set_dest(p, insn, elk_ip_reg());
1709 elk_set_src0(p, insn, elk_ip_reg());
1710 elk_set_src1(p, insn, elk_imm_d((do_insn - insn) * 16));
1711 elk_inst_set_exec_size(devinfo, insn, ELK_EXECUTE_1);
1712 } else {
1713 insn = next_insn(p, ELK_OPCODE_WHILE);
1714 do_insn = get_inner_do_insn(p);
1715
1716 assert(elk_inst_opcode(p->isa, do_insn) == ELK_OPCODE_DO);
1717
1718 elk_set_dest(p, insn, elk_ip_reg());
1719 elk_set_src0(p, insn, elk_ip_reg());
1720 elk_set_src1(p, insn, elk_imm_d(0));
1721
1722 elk_inst_set_exec_size(devinfo, insn, elk_inst_exec_size(devinfo, do_insn));
1723 elk_inst_set_gfx4_jump_count(devinfo, insn, br * (do_insn - insn + 1));
1724 elk_inst_set_gfx4_pop_count(devinfo, insn, 0);
1725
1726 elk_patch_break_cont(p, insn);
1727 }
1728 }
1729 elk_inst_set_qtr_control(devinfo, insn, ELK_COMPRESSION_NONE);
1730
1731 p->loop_stack_depth--;
1732
1733 return insn;
1734 }
1735
1736 /* FORWARD JUMPS:
1737 */
1738 void elk_land_fwd_jump(struct elk_codegen *p, int jmp_insn_idx)
1739 {
1740 const struct intel_device_info *devinfo = p->devinfo;
1741 elk_inst *jmp_insn = &p->store[jmp_insn_idx];
1742 unsigned jmpi = 1;
1743
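   /* Gfx5+ measures jump distances in 8-byte units (two per 16-byte
    * instruction), so the distance below is scaled by jmpi.
    */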
1744 if (devinfo->ver >= 5)
1745 jmpi = 2;
1746
1747 assert(elk_inst_opcode(p->isa, jmp_insn) == ELK_OPCODE_JMPI);
1748 assert(elk_inst_src1_reg_file(devinfo, jmp_insn) == ELK_IMMEDIATE_VALUE);
1749
1750 elk_inst_set_gfx4_jump_count(devinfo, jmp_insn,
1751 jmpi * (p->nr_insn - jmp_insn_idx - 1));
1752 }
1753
1754 /* To integrate with the above, it makes sense that the comparison
1755 * instruction should populate the flag register. It might be simpler
1756 * just to use the flag reg for most WM tasks?
1757 */
1758 void elk_CMP(struct elk_codegen *p,
1759 struct elk_reg dest,
1760 unsigned conditional,
1761 struct elk_reg src0,
1762 struct elk_reg src1)
1763 {
1764 const struct intel_device_info *devinfo = p->devinfo;
1765 elk_inst *insn = next_insn(p, ELK_OPCODE_CMP);
1766
1767 elk_inst_set_cond_modifier(devinfo, insn, conditional);
1768 elk_set_dest(p, insn, dest);
1769 elk_set_src0(p, insn, src0);
1770 elk_set_src1(p, insn, src1);
1771
1772 /* Item WaCMPInstNullDstForcesThreadSwitch in the Haswell Bspec workarounds
1773 * page says:
1774 * "Any CMP instruction with a null destination must use a {switch}."
1775 *
1776 * It also applies to other Gfx7 platforms (IVB, BYT) even though it isn't
1777 * mentioned on their work-arounds pages.
1778 */
1779 if (devinfo->ver == 7) {
1780 if (dest.file == ELK_ARCHITECTURE_REGISTER_FILE &&
1781 dest.nr == ELK_ARF_NULL) {
1782 elk_inst_set_thread_control(devinfo, insn, ELK_THREAD_SWITCH);
1783 }
1784 }
1785 }
1786
1787 void elk_CMPN(struct elk_codegen *p,
1788 struct elk_reg dest,
1789 unsigned conditional,
1790 struct elk_reg src0,
1791 struct elk_reg src1)
1792 {
1793 const struct intel_device_info *devinfo = p->devinfo;
1794 elk_inst *insn = next_insn(p, ELK_OPCODE_CMPN);
1795
1796 elk_inst_set_cond_modifier(devinfo, insn, conditional);
1797 elk_set_dest(p, insn, dest);
1798 elk_set_src0(p, insn, src0);
1799 elk_set_src1(p, insn, src1);
1800
1801 /* Page 166 of the Ivy Bridge PRM Volume 4 part 3 (Execution Unit ISA)
1802 * says:
1803 *
1804 * If the destination is the null register, the {Switch} instruction
1805 * option must be used.
1806 *
1807 * Page 77 of the Haswell PRM Volume 2b contains the same text.
1808 */
1809 if (devinfo->ver == 7) {
1810 if (dest.file == ELK_ARCHITECTURE_REGISTER_FILE &&
1811 dest.nr == ELK_ARF_NULL) {
1812 elk_inst_set_thread_control(devinfo, insn, ELK_THREAD_SWITCH);
1813 }
1814 }
1815 }
1816
1817 /***********************************************************************
1818 * Helpers for the various SEND message types:
1819 */
1820
1821 /** Extended math function, float[8].
1822 */
1823 void elk_gfx4_math(struct elk_codegen *p,
1824 struct elk_reg dest,
1825 unsigned function,
1826 unsigned msg_reg_nr,
1827 struct elk_reg src,
1828 unsigned precision )
1829 {
1830 const struct intel_device_info *devinfo = p->devinfo;
1831 elk_inst *insn = next_insn(p, ELK_OPCODE_SEND);
1832 unsigned data_type;
1833 if (has_scalar_region(src)) {
1834 data_type = ELK_MATH_DATA_SCALAR;
1835 } else {
1836 data_type = ELK_MATH_DATA_VECTOR;
1837 }
1838
1839 assert(devinfo->ver < 6);
1840
1841 /* Example code doesn't set predicate_control for send
1842 * instructions.
1843 */
1844 elk_inst_set_pred_control(devinfo, insn, 0);
1845 elk_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
1846
1847 elk_set_dest(p, insn, dest);
1848 elk_set_src0(p, insn, src);
1849 elk_set_math_message(p,
1850 insn,
1851 function,
1852 src.type == ELK_REGISTER_TYPE_D,
1853 precision,
1854 data_type);
1855 }
1856
1857 void elk_gfx6_math(struct elk_codegen *p,
1858 struct elk_reg dest,
1859 unsigned function,
1860 struct elk_reg src0,
1861 struct elk_reg src1)
1862 {
1863 const struct intel_device_info *devinfo = p->devinfo;
1864 elk_inst *insn = next_insn(p, ELK_OPCODE_MATH);
1865
1866 assert(devinfo->ver >= 6);
1867
1868 assert(dest.file == ELK_GENERAL_REGISTER_FILE ||
1869 (devinfo->ver >= 7 && dest.file == ELK_MESSAGE_REGISTER_FILE));
1870
1871 assert(dest.hstride == ELK_HORIZONTAL_STRIDE_1);
1872 if (devinfo->ver == 6) {
1873 assert(src0.hstride == ELK_HORIZONTAL_STRIDE_1);
1874 assert(src1.hstride == ELK_HORIZONTAL_STRIDE_1);
1875 }
1876
1877 if (function == ELK_MATH_FUNCTION_INT_DIV_QUOTIENT ||
1878 function == ELK_MATH_FUNCTION_INT_DIV_REMAINDER ||
1879 function == ELK_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
1880 assert(src0.type != ELK_REGISTER_TYPE_F);
1881 assert(src1.type != ELK_REGISTER_TYPE_F);
1882 assert(src1.file == ELK_GENERAL_REGISTER_FILE ||
1883 (devinfo->ver >= 8 && src1.file == ELK_IMMEDIATE_VALUE));
1884 /* From BSpec 6647/47428 "[Instruction] Extended Math Function":
1885 * INT DIV function does not support source modifiers.
1886 */
1887 assert(!src0.negate);
1888 assert(!src0.abs);
1889 assert(!src1.negate);
1890 assert(!src1.abs);
1891 } else {
1892 assert(src0.type == ELK_REGISTER_TYPE_F);
1893 assert(src1.type == ELK_REGISTER_TYPE_F);
1894 }
1895
1896 /* Source modifiers are ignored for extended math instructions on Gfx6. */
1897 if (devinfo->ver == 6) {
1898 assert(!src0.negate);
1899 assert(!src0.abs);
1900 assert(!src1.negate);
1901 assert(!src1.abs);
1902 }
1903
1904 elk_inst_set_math_function(devinfo, insn, function);
1905
1906 elk_set_dest(p, insn, dest);
1907 elk_set_src0(p, insn, src0);
1908 elk_set_src1(p, insn, src1);
1909 }
1910
1911 /**
1912 * Return the right surface index to access the thread scratch space using
1913 * stateless dataport messages.
1914 */
1915 unsigned
1916 elk_scratch_surface_idx(const struct elk_codegen *p)
1917 {
1918 /* The scratch space is thread-local so IA coherency is unnecessary. */
1919 if (p->devinfo->ver >= 8)
1920 return GFX8_BTI_STATELESS_NON_COHERENT;
1921 else
1922 return ELK_BTI_STATELESS;
1923 }
1924
1925 /**
1926  * Write a block of OWORDs (half a GRF each) to the scratch buffer,
1927 * using a constant offset per channel.
1928 *
1929 * The offset must be aligned to oword size (16 bytes). Used for
1930 * register spilling.
1931 */
1932 void elk_oword_block_write_scratch(struct elk_codegen *p,
1933 struct elk_reg mrf,
1934 int num_regs,
1935 unsigned offset)
1936 {
1937 const struct intel_device_info *devinfo = p->devinfo;
1938 const unsigned target_cache =
1939 (devinfo->ver >= 7 ? GFX7_SFID_DATAPORT_DATA_CACHE :
1940 devinfo->ver >= 6 ? GFX6_SFID_DATAPORT_RENDER_CACHE :
1941 ELK_SFID_DATAPORT_WRITE);
1942 uint32_t msg_type;
1943
1944 if (devinfo->ver >= 6)
1945 offset /= 16;
1946
1947 mrf = retype(mrf, ELK_REGISTER_TYPE_UD);
1948
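   /* Message length: one register for the header built below plus the
    * num_regs payload registers being spilled.
    */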
1949 const unsigned mlen = 1 + num_regs;
1950
1951 /* Set up the message header. This is g0, with g0.2 filled with
1952 * the offset. We don't want to leave our offset around in g0 or
1953 * it'll screw up texture samples, so set it up inside the message
1954 * reg.
1955 */
1956 {
1957 elk_push_insn_state(p);
1958 elk_set_default_exec_size(p, ELK_EXECUTE_8);
1959 elk_set_default_mask_control(p, ELK_MASK_DISABLE);
1960 elk_set_default_compression_control(p, ELK_COMPRESSION_NONE);
1961
1962 elk_MOV(p, mrf, retype(elk_vec8_grf(0, 0), ELK_REGISTER_TYPE_UD));
1963
1964 /* set message header global offset field (reg 0, element 2) */
1965 elk_set_default_exec_size(p, ELK_EXECUTE_1);
1966 elk_MOV(p,
1967 retype(elk_vec1_reg(ELK_MESSAGE_REGISTER_FILE,
1968 mrf.nr,
1969 2), ELK_REGISTER_TYPE_UD),
1970 elk_imm_ud(offset));
1971
1972 elk_pop_insn_state(p);
1973 }
1974
1975 {
1976 struct elk_reg dest;
1977 elk_inst *insn = next_insn(p, ELK_OPCODE_SEND);
1978 int send_commit_msg;
1979 struct elk_reg src_header = retype(elk_vec8_grf(0, 0),
1980 ELK_REGISTER_TYPE_UW);
1981
1982 elk_inst_set_sfid(devinfo, insn, target_cache);
1983 elk_inst_set_compression(devinfo, insn, false);
1984
1985 if (elk_inst_exec_size(devinfo, insn) >= 16)
1986 src_header = vec16(src_header);
1987
1988 assert(elk_inst_pred_control(devinfo, insn) == ELK_PREDICATE_NONE);
1989 if (devinfo->ver < 6)
1990 elk_inst_set_base_mrf(devinfo, insn, mrf.nr);
1991
1992 /* Until gfx6, writes followed by reads from the same location
1993 * are not guaranteed to be ordered unless write_commit is set.
1994 * If set, then a no-op write is issued to the destination
1995 * register to set a dependency, and a read from the destination
1996 * can be used to ensure the ordering.
1997 *
1998 * For gfx6, only writes between different threads need ordering
1999 * protection. Our use of DP writes is all about register
2000 * spilling within a thread.
2001 */
2002 if (devinfo->ver >= 6) {
2003 dest = retype(vec16(elk_null_reg()), ELK_REGISTER_TYPE_UW);
2004 send_commit_msg = 0;
2005 } else {
2006 dest = src_header;
2007 send_commit_msg = 1;
2008 }
2009
2010 elk_set_dest(p, insn, dest);
2011 if (devinfo->ver >= 6) {
2012 elk_set_src0(p, insn, mrf);
2013 } else {
2014 elk_set_src0(p, insn, elk_null_reg());
2015 }
2016
2017 if (devinfo->ver >= 6)
2018 msg_type = GFX6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
2019 else
2020 msg_type = ELK_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
2021
2022 elk_set_desc(p, insn,
2023 elk_message_desc(devinfo, mlen, send_commit_msg, true) |
2024 elk_dp_write_desc(devinfo, elk_scratch_surface_idx(p),
2025 ELK_DATAPORT_OWORD_BLOCK_DWORDS(num_regs * 8),
2026 msg_type, send_commit_msg));
2027 }
2028 }
2029
2030
2031 /**
2032 * Read a block of owords (half a GRF each) from the scratch buffer
2033 * using a constant index per channel.
2034 *
2035 * Offset must be aligned to oword size (16 bytes). Used for register
2036 * spilling.
2037 */
2038 void
2039 elk_oword_block_read_scratch(struct elk_codegen *p,
2040 struct elk_reg dest,
2041 struct elk_reg mrf,
2042 int num_regs,
2043 unsigned offset)
2044 {
2045 const struct intel_device_info *devinfo = p->devinfo;
2046
2047 if (devinfo->ver >= 6)
2048 offset /= 16;
2049
2050 if (p->devinfo->ver >= 7) {
2051 /* On gen 7 and above, we no longer have message registers and we can
2052 * send from any register we want. By using the destination register
2053 * for the message, we guarantee that the implied message write won't
2054 * accidentally overwrite anything. This has been a problem because
2055 * the MRF registers and source for the final FB write are both fixed
2056 * and may overlap.
2057 */
2058 mrf = retype(dest, ELK_REGISTER_TYPE_UD);
2059 } else {
2060 mrf = retype(mrf, ELK_REGISTER_TYPE_UD);
2061 }
2062 dest = retype(dest, ELK_REGISTER_TYPE_UW);
2063
2064 const unsigned rlen = num_regs;
2065 const unsigned target_cache =
2066 (devinfo->ver >= 7 ? GFX7_SFID_DATAPORT_DATA_CACHE :
2067 devinfo->ver >= 6 ? GFX6_SFID_DATAPORT_RENDER_CACHE :
2068 ELK_SFID_DATAPORT_READ);
2069
2070 {
2071 elk_push_insn_state(p);
2072 elk_set_default_exec_size(p, ELK_EXECUTE_8);
2073 elk_set_default_compression_control(p, ELK_COMPRESSION_NONE);
2074 elk_set_default_mask_control(p, ELK_MASK_DISABLE);
2075
2076 elk_MOV(p, mrf, retype(elk_vec8_grf(0, 0), ELK_REGISTER_TYPE_UD));
2077
2078 /* set message header global offset field (reg 0, element 2) */
2079 elk_set_default_exec_size(p, ELK_EXECUTE_1);
2080 elk_MOV(p, get_element_ud(mrf, 2), elk_imm_ud(offset));
2081
2082 elk_pop_insn_state(p);
2083 }
2084
2085 {
2086 elk_inst *insn = next_insn(p, ELK_OPCODE_SEND);
2087
2088 elk_inst_set_sfid(devinfo, insn, target_cache);
2089 assert(elk_inst_pred_control(devinfo, insn) == 0);
2090 elk_inst_set_compression(devinfo, insn, false);
2091
2092 elk_set_dest(p, insn, dest); /* UW? */
2093 if (devinfo->ver >= 6) {
2094 elk_set_src0(p, insn, mrf);
2095 } else {
2096 elk_set_src0(p, insn, elk_null_reg());
2097 elk_inst_set_base_mrf(devinfo, insn, mrf.nr);
2098 }
2099
2100 elk_set_desc(p, insn,
2101 elk_message_desc(devinfo, 1, rlen, true) |
2102 elk_dp_read_desc(devinfo, elk_scratch_surface_idx(p),
2103 ELK_DATAPORT_OWORD_BLOCK_DWORDS(num_regs * 8),
2104 ELK_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
2105 ELK_DATAPORT_READ_TARGET_RENDER_CACHE));
2106 }
2107 }
2108
2109 void
2110 elk_gfx7_block_read_scratch(struct elk_codegen *p,
2111 struct elk_reg dest,
2112 int num_regs,
2113 unsigned offset)
2114 {
2115 elk_inst *insn = next_insn(p, ELK_OPCODE_SEND);
2116 assert(elk_inst_pred_control(p->devinfo, insn) == ELK_PREDICATE_NONE);
2117
2118 elk_set_dest(p, insn, retype(dest, ELK_REGISTER_TYPE_UW));
2119
2120 /* The HW requires that the header is present; this is to get the g0.5
2121 * scratch offset.
2122 */
2123 elk_set_src0(p, insn, elk_vec8_grf(0, 0));
2124
2125 /* According to the docs, offset is "A 12-bit HWord offset into the memory
2126 * Immediate Memory buffer as specified by binding table 0xFF." An HWORD
2127 * is 32 bytes, which happens to be the size of a register.
2128 */
2129 offset /= REG_SIZE;
2130 assert(offset < (1 << 12));
2131
2132 gfx7_set_dp_scratch_message(p, insn,
2133 false, /* scratch read */
2134 false, /* OWords */
2135 false, /* invalidate after read */
2136 num_regs,
2137 offset,
2138 1, /* mlen: just g0 */
2139 num_regs, /* rlen */
2140 true); /* header present */
2141 }
2142
2143 /**
2144 * Read float[4] vectors from the data port constant cache.
2145 * Location (in buffer) should be a multiple of 16.
2146 * Used for fetching shader constants.
2147 */
2148 void elk_oword_block_read(struct elk_codegen *p,
2149 struct elk_reg dest,
2150 struct elk_reg mrf,
2151 uint32_t offset,
2152 uint32_t bind_table_index)
2153 {
2154 const struct intel_device_info *devinfo = p->devinfo;
2155 const unsigned target_cache =
2156 (devinfo->ver >= 6 ? GFX6_SFID_DATAPORT_CONSTANT_CACHE :
2157 ELK_SFID_DATAPORT_READ);
2158 const unsigned exec_size = 1 << elk_get_default_exec_size(p);
2159
2160 /* On newer hardware, offset is in units of owords. */
2161 if (devinfo->ver >= 6)
2162 offset /= 16;
2163
2164 mrf = retype(mrf, ELK_REGISTER_TYPE_UD);
2165
2166 elk_push_insn_state(p);
2167 elk_set_default_predicate_control(p, ELK_PREDICATE_NONE);
2168 elk_set_default_flag_reg(p, 0, 0);
2169 elk_set_default_compression_control(p, ELK_COMPRESSION_NONE);
2170 elk_set_default_mask_control(p, ELK_MASK_DISABLE);
2171
2172 elk_push_insn_state(p);
2173 elk_set_default_exec_size(p, ELK_EXECUTE_8);
2174 elk_MOV(p, mrf, retype(elk_vec8_grf(0, 0), ELK_REGISTER_TYPE_UD));
2175
2176 /* set message header global offset field (reg 0, element 2) */
2177 elk_set_default_exec_size(p, ELK_EXECUTE_1);
2178 elk_MOV(p,
2179 retype(elk_vec1_reg(ELK_MESSAGE_REGISTER_FILE,
2180 mrf.nr,
2181 2), ELK_REGISTER_TYPE_UD),
2182 elk_imm_ud(offset));
2183 elk_pop_insn_state(p);
2184
2185 elk_inst *insn = next_insn(p, ELK_OPCODE_SEND);
2186
2187 elk_inst_set_sfid(devinfo, insn, target_cache);
2188
2189 /* cast dest to a uword[8] vector */
2190 dest = retype(vec8(dest), ELK_REGISTER_TYPE_UW);
2191
2192 elk_set_dest(p, insn, dest);
2193 if (devinfo->ver >= 6) {
2194 elk_set_src0(p, insn, mrf);
2195 } else {
2196 elk_set_src0(p, insn, elk_null_reg());
2197 elk_inst_set_base_mrf(devinfo, insn, mrf.nr);
2198 }
2199
2200 elk_set_desc(p, insn,
2201 elk_message_desc(devinfo, 1, DIV_ROUND_UP(exec_size, 8), true) |
2202 elk_dp_read_desc(devinfo, bind_table_index,
2203 ELK_DATAPORT_OWORD_BLOCK_DWORDS(exec_size),
2204 ELK_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
2205 ELK_DATAPORT_READ_TARGET_DATA_CACHE));
2206
2207 elk_pop_insn_state(p);
2208 }
2209
2210 elk_inst *
2211 elk_fb_WRITE(struct elk_codegen *p,
2212 struct elk_reg payload,
2213 struct elk_reg implied_header,
2214 unsigned msg_control,
2215 unsigned binding_table_index,
2216 unsigned msg_length,
2217 unsigned response_length,
2218 bool eot,
2219 bool last_render_target,
2220 bool header_present)
2221 {
2222 const struct intel_device_info *devinfo = p->devinfo;
2223 const unsigned target_cache =
2224 (devinfo->ver >= 6 ? GFX6_SFID_DATAPORT_RENDER_CACHE :
2225 ELK_SFID_DATAPORT_WRITE);
2226 elk_inst *insn;
2227 struct elk_reg dest, src0;
2228
2229 if (elk_get_default_exec_size(p) >= ELK_EXECUTE_16)
2230 dest = retype(vec16(elk_null_reg()), ELK_REGISTER_TYPE_UW);
2231 else
2232 dest = retype(vec8(elk_null_reg()), ELK_REGISTER_TYPE_UW);
2233
2234 if (devinfo->ver >= 6) {
2235 insn = next_insn(p, ELK_OPCODE_SENDC);
2236 } else {
2237 insn = next_insn(p, ELK_OPCODE_SEND);
2238 }
2239 elk_inst_set_sfid(devinfo, insn, target_cache);
2240 elk_inst_set_compression(devinfo, insn, false);
2241
2242 if (devinfo->ver >= 6) {
2243 /* headerless version, just submit color payload */
2244 src0 = payload;
2245 } else {
2246 assert(payload.file == ELK_MESSAGE_REGISTER_FILE);
2247 elk_inst_set_base_mrf(devinfo, insn, payload.nr);
2248 src0 = implied_header;
2249 }
2250
2251 elk_set_dest(p, insn, dest);
2252 elk_set_src0(p, insn, src0);
2253 elk_set_desc(p, insn,
2254 elk_message_desc(devinfo, msg_length, response_length,
2255 header_present) |
2256 elk_fb_write_desc(devinfo, binding_table_index, msg_control,
2257 last_render_target,
2258 false /* coarse_write */));
2259 elk_inst_set_eot(devinfo, insn, eot);
2260
2261 return insn;
2262 }
2263
2264 /**
2265 * Texture sample instruction.
2266 * Note: the msg_type plus msg_length values determine exactly what kind
2267 * of sampling operation is performed. See volume 4, page 161 of docs.
2268 */
2269 void elk_SAMPLE(struct elk_codegen *p,
2270 struct elk_reg dest,
2271 unsigned msg_reg_nr,
2272 struct elk_reg src0,
2273 unsigned binding_table_index,
2274 unsigned sampler,
2275 unsigned msg_type,
2276 unsigned response_length,
2277 unsigned msg_length,
2278 unsigned header_present,
2279 unsigned simd_mode,
2280 unsigned return_format)
2281 {
2282 const struct intel_device_info *devinfo = p->devinfo;
2283 elk_inst *insn;
2284
2285 if (msg_reg_nr != -1)
2286 elk_gfx6_resolve_implied_move(p, &src0, msg_reg_nr);
2287
2288 insn = next_insn(p, ELK_OPCODE_SEND);
2289 elk_inst_set_sfid(devinfo, insn, ELK_SFID_SAMPLER);
2290 elk_inst_set_pred_control(devinfo, insn, ELK_PREDICATE_NONE); /* XXX */
2291
2292 /* From the 965 PRM (volume 4, part 1, section 14.2.41):
2293 *
2294 * "Instruction compression is not allowed for this instruction (that
2295 * is, send). The hardware behavior is undefined if this instruction is
2296 * set as compressed. However, compress control can be set to "SecHalf"
2297 * to affect the EMask generation."
2298 *
2299 * No similar wording is found in later PRMs, but there are examples
2300 * utilizing send with SecHalf. More importantly, SIMD8 sampler messages
2301 * are allowed in SIMD16 mode and they could not work without SecHalf. For
2302 * these reasons, we allow ELK_COMPRESSION_2NDHALF here.
2303 */
2304 elk_inst_set_compression(devinfo, insn, false);
2305
2306 if (devinfo->ver < 6)
2307 elk_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
2308
2309 elk_set_dest(p, insn, dest);
2310 elk_set_src0(p, insn, src0);
2311 elk_set_desc(p, insn,
2312 elk_message_desc(devinfo, msg_length, response_length,
2313 header_present) |
2314 elk_sampler_desc(devinfo, binding_table_index, sampler,
2315 msg_type, simd_mode, return_format));
2316 }
2317
2318 /* Adjust the message header's sampler state pointer to
2319 * select the correct group of 16 samplers.
2320 */
2321 void elk_adjust_sampler_state_pointer(struct elk_codegen *p,
2322 struct elk_reg header,
2323 struct elk_reg sampler_index)
2324 {
2325 /* The "Sampler Index" field can only store values between 0 and 15.
2326 * However, we can add an offset to the "Sampler State Pointer"
2327 * field, effectively selecting a different set of 16 samplers.
2328 *
2329 * The "Sampler State Pointer" needs to be aligned to a 32-byte
2330 * offset, and each sampler state is only 16-bytes, so we can't
2331 * exclusively use the offset - we have to use both.
2332 */
2333
2334 const struct intel_device_info *devinfo = p->devinfo;
2335
2336 if (sampler_index.file == ELK_IMMEDIATE_VALUE) {
2337 const int sampler_state_size = 16; /* 16 bytes */
2338 uint32_t sampler = sampler_index.ud;
2339
2340 if (sampler >= 16) {
2341 assert(devinfo->verx10 >= 75);
2342 elk_ADD(p,
2343 get_element_ud(header, 3),
2344 get_element_ud(elk_vec8_grf(0, 0), 3),
2345 elk_imm_ud(16 * (sampler / 16) * sampler_state_size));
2346 }
2347 } else {
2348 /* Non-const sampler array indexing case */
2349 if (devinfo->verx10 <= 70) {
2350 return;
2351 }
2352
2353 struct elk_reg temp = get_element_ud(header, 3);
2354
2355 elk_push_insn_state(p);
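      /* (sampler & 0xf0) << 4 is the byte offset of the selected group of 16
       * samplers (16 states of 16 bytes each), matching the immediate case
       * above.
       */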
2356 elk_AND(p, temp, get_element_ud(sampler_index, 0), elk_imm_ud(0x0f0));
2357 elk_SHL(p, temp, temp, elk_imm_ud(4));
2358 elk_ADD(p,
2359 get_element_ud(header, 3),
2360 get_element_ud(elk_vec8_grf(0, 0), 3),
2361 temp);
2362 elk_pop_insn_state(p);
2363 }
2364 }
2365
2366 /* All these variables are pretty confusing - we might be better off
2367 * using bitmasks and macros for this, in the old style. Or perhaps
2368 * just having the caller instantiate the fields in dword3 itself.
2369 */
2370 void elk_urb_WRITE(struct elk_codegen *p,
2371 struct elk_reg dest,
2372 unsigned msg_reg_nr,
2373 struct elk_reg src0,
2374 enum elk_urb_write_flags flags,
2375 unsigned msg_length,
2376 unsigned response_length,
2377 unsigned offset,
2378 unsigned swizzle)
2379 {
2380 const struct intel_device_info *devinfo = p->devinfo;
2381 elk_inst *insn;
2382
2383 elk_gfx6_resolve_implied_move(p, &src0, msg_reg_nr);
2384
2385 if (devinfo->ver >= 7 && !(flags & ELK_URB_WRITE_USE_CHANNEL_MASKS)) {
2386 /* Enable Channel Masks in the URB_WRITE_HWORD message header */
2387 elk_push_insn_state(p);
2388 elk_set_default_access_mode(p, ELK_ALIGN_1);
2389 elk_set_default_mask_control(p, ELK_MASK_DISABLE);
2390 elk_set_default_exec_size(p, ELK_EXECUTE_1);
2391 elk_OR(p, retype(elk_vec1_reg(ELK_MESSAGE_REGISTER_FILE, msg_reg_nr, 5),
2392 ELK_REGISTER_TYPE_UD),
2393 retype(elk_vec1_grf(0, 5), ELK_REGISTER_TYPE_UD),
2394 elk_imm_ud(0xff00));
2395 elk_pop_insn_state(p);
2396 }
2397
2398 insn = next_insn(p, ELK_OPCODE_SEND);
2399
2400 assert(msg_length < ELK_MAX_MRF(devinfo->ver));
2401
2402 elk_set_dest(p, insn, dest);
2403 elk_set_src0(p, insn, src0);
2404 elk_set_src1(p, insn, elk_imm_d(0));
2405
2406 if (devinfo->ver < 6)
2407 elk_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
2408
2409 elk_set_urb_message(p,
2410 insn,
2411 flags,
2412 msg_length,
2413 response_length,
2414 offset,
2415 swizzle);
2416 }
2417
2418 void
2419 elk_send_indirect_message(struct elk_codegen *p,
2420 unsigned sfid,
2421 struct elk_reg dst,
2422 struct elk_reg payload,
2423 struct elk_reg desc,
2424 unsigned desc_imm,
2425 bool eot)
2426 {
2427 const struct intel_device_info *devinfo = p->devinfo;
2428 struct elk_inst *send;
2429
2430 dst = retype(dst, ELK_REGISTER_TYPE_UW);
2431
2432 assert(desc.type == ELK_REGISTER_TYPE_UD);
2433
2434 if (desc.file == ELK_IMMEDIATE_VALUE) {
2435 send = next_insn(p, ELK_OPCODE_SEND);
2436 elk_set_src0(p, send, retype(payload, ELK_REGISTER_TYPE_UD));
2437 elk_set_desc(p, send, desc.ud | desc_imm);
2438 } else {
2439 struct elk_reg addr = retype(elk_address_reg(0), ELK_REGISTER_TYPE_UD);
2440
2441 elk_push_insn_state(p);
2442 elk_set_default_access_mode(p, ELK_ALIGN_1);
2443 elk_set_default_mask_control(p, ELK_MASK_DISABLE);
2444 elk_set_default_exec_size(p, ELK_EXECUTE_1);
2445 elk_set_default_predicate_control(p, ELK_PREDICATE_NONE);
2446 elk_set_default_flag_reg(p, 0, 0);
2447
2448 /* Load the indirect descriptor to an address register using OR so the
2449 * caller can specify additional descriptor bits with the desc_imm
2450 * immediate.
2451 */
2452 elk_OR(p, addr, desc, elk_imm_ud(desc_imm));
2453
2454 elk_pop_insn_state(p);
2455
2456 send = next_insn(p, ELK_OPCODE_SEND);
2457 elk_set_src0(p, send, retype(payload, ELK_REGISTER_TYPE_UD));
2458 elk_set_src1(p, send, addr);
2459 }
2460
2461 elk_set_dest(p, send, dst);
2462 elk_inst_set_sfid(devinfo, send, sfid);
2463 elk_inst_set_eot(devinfo, send, eot);
2464 }
2465
2466 static void
2467 elk_send_indirect_surface_message(struct elk_codegen *p,
2468 unsigned sfid,
2469 struct elk_reg dst,
2470 struct elk_reg payload,
2471 struct elk_reg surface,
2472 unsigned desc_imm)
2473 {
2474 if (surface.file != ELK_IMMEDIATE_VALUE) {
2475 struct elk_reg addr = retype(elk_address_reg(0), ELK_REGISTER_TYPE_UD);
2476
2477 elk_push_insn_state(p);
2478 elk_set_default_access_mode(p, ELK_ALIGN_1);
2479 elk_set_default_mask_control(p, ELK_MASK_DISABLE);
2480 elk_set_default_exec_size(p, ELK_EXECUTE_1);
2481 elk_set_default_predicate_control(p, ELK_PREDICATE_NONE);
2482 elk_set_default_flag_reg(p, 0, 0);
2483
2484 /* Mask out invalid bits from the surface index to avoid hangs e.g. when
2485 * some surface array is accessed out of bounds.
2486 */
2487 elk_AND(p, addr,
2488 suboffset(vec1(retype(surface, ELK_REGISTER_TYPE_UD)),
2489 ELK_GET_SWZ(surface.swizzle, 0)),
2490 elk_imm_ud(0xff));
2491
2492 elk_pop_insn_state(p);
2493
2494 surface = addr;
2495 }
2496
2497 elk_send_indirect_message(p, sfid, dst, payload, surface, desc_imm, false);
2498 }
2499
2500 static bool
2501 while_jumps_before_offset(const struct intel_device_info *devinfo,
2502 elk_inst *insn, int while_offset, int start_offset)
2503 {
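   /* jip is stored in elk_jump_scale() units; scale converts it back to
    * bytes (each instruction is 16 bytes) for comparison with the offsets.
    */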
2504 int scale = 16 / elk_jump_scale(devinfo);
2505 int jip = devinfo->ver == 6 ? elk_inst_gfx6_jump_count(devinfo, insn)
2506 : elk_inst_jip(devinfo, insn);
2507 assert(jip < 0);
2508 return while_offset + jip * scale <= start_offset;
2509 }
2510
2511
2512 static int
2513 elk_find_next_block_end(struct elk_codegen *p, int start_offset)
2514 {
2515 int offset;
2516 void *store = p->store;
2517 const struct intel_device_info *devinfo = p->devinfo;
2518
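   /* Track IF/ENDIF nesting so we only stop at control flow belonging to the
    * block containing start_offset.
    */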
2519 int depth = 0;
2520
2521 for (offset = next_offset(devinfo, store, start_offset);
2522 offset < p->next_insn_offset;
2523 offset = next_offset(devinfo, store, offset)) {
2524 elk_inst *insn = store + offset;
2525
2526 switch (elk_inst_opcode(p->isa, insn)) {
2527 case ELK_OPCODE_IF:
2528 depth++;
2529 break;
2530 case ELK_OPCODE_ENDIF:
2531 if (depth == 0)
2532 return offset;
2533 depth--;
2534 break;
2535 case ELK_OPCODE_WHILE:
2536 /* If the while doesn't jump before our instruction, it's the end
2537 * of a sibling do...while loop. Ignore it.
2538 */
2539 if (!while_jumps_before_offset(devinfo, insn, offset, start_offset))
2540 continue;
2541 FALLTHROUGH;
2542 case ELK_OPCODE_ELSE:
2543 case ELK_OPCODE_HALT:
2544 if (depth == 0)
2545 return offset;
2546 break;
2547 default:
2548 break;
2549 }
2550 }
2551
2552 return 0;
2553 }
2554
2555 /* There is no DO instruction on gfx6, so to find the end of the loop
2556 * we have to see if the loop is jumping back before our start
2557 * instruction.
2558 */
2559 static int
2560 elk_find_loop_end(struct elk_codegen *p, int start_offset)
2561 {
2562 const struct intel_device_info *devinfo = p->devinfo;
2563 int offset;
2564 void *store = p->store;
2565
2566 assert(devinfo->ver >= 6);
2567
2568 /* Always start after the instruction (such as a WHILE) we're trying to fix
2569 * up.
2570 */
2571 for (offset = next_offset(devinfo, store, start_offset);
2572 offset < p->next_insn_offset;
2573 offset = next_offset(devinfo, store, offset)) {
2574 elk_inst *insn = store + offset;
2575
2576 if (elk_inst_opcode(p->isa, insn) == ELK_OPCODE_WHILE) {
2577 if (while_jumps_before_offset(devinfo, insn, offset, start_offset))
2578 return offset;
2579 }
2580 }
2581 assert(!"not reached");
2582 return start_offset;
2583 }
2584
2585 /* After program generation, go back and update the UIP and JIP of
2586 * BREAK, CONT, and HALT instructions to their correct locations.
2587 */
2588 void
2589 elk_set_uip_jip(struct elk_codegen *p, int start_offset)
2590 {
2591 const struct intel_device_info *devinfo = p->devinfo;
2592 int offset;
2593 int br = elk_jump_scale(devinfo);
2594 int scale = 16 / br;
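   /* br is the number of jump units per 16-byte instruction, so dividing a
    * byte distance by scale converts it into jump units.
    */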
2595 void *store = p->store;
2596
2597 if (devinfo->ver < 6)
2598 return;
2599
2600 for (offset = start_offset; offset < p->next_insn_offset; offset += 16) {
2601 elk_inst *insn = store + offset;
2602 assert(elk_inst_cmpt_control(devinfo, insn) == 0);
2603
2604 switch (elk_inst_opcode(p->isa, insn)) {
2605 case ELK_OPCODE_BREAK: {
2606 int block_end_offset = elk_find_next_block_end(p, offset);
2607 assert(block_end_offset != 0);
2608 elk_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
2609 /* Gfx7 UIP points to WHILE; Gfx6 points just after it */
2610 elk_inst_set_uip(devinfo, insn,
2611 (elk_find_loop_end(p, offset) - offset +
2612 (devinfo->ver == 6 ? 16 : 0)) / scale);
2613 break;
2614 }
2615
2616 case ELK_OPCODE_CONTINUE: {
2617 int block_end_offset = elk_find_next_block_end(p, offset);
2618 assert(block_end_offset != 0);
2619 elk_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
2620 elk_inst_set_uip(devinfo, insn,
2621 (elk_find_loop_end(p, offset) - offset) / scale);
2622
2623 assert(elk_inst_uip(devinfo, insn) != 0);
2624 assert(elk_inst_jip(devinfo, insn) != 0);
2625 break;
2626 }
2627
2628 case ELK_OPCODE_ENDIF: {
2629 int block_end_offset = elk_find_next_block_end(p, offset);
2630 int32_t jump = (block_end_offset == 0) ?
2631 1 * br : (block_end_offset - offset) / scale;
2632 if (devinfo->ver >= 7)
2633 elk_inst_set_jip(devinfo, insn, jump);
2634 else
2635 elk_inst_set_gfx6_jump_count(devinfo, insn, jump);
2636 break;
2637 }
2638
2639 case ELK_OPCODE_HALT: {
2640 /* From the Sandy Bridge PRM (volume 4, part 2, section 8.3.19):
2641 *
2642 * "In case of the halt instruction not inside any conditional
2643 * code block, the value of <JIP> and <UIP> should be the
2644 * same. In case of the halt instruction inside conditional code
2645 * block, the <UIP> should be the end of the program, and the
2646 * <JIP> should be end of the most inner conditional code block."
2647 *
2648 * The uip will have already been set by whoever set up the
2649 * instruction.
2650 */
2651 int block_end_offset = elk_find_next_block_end(p, offset);
2652 if (block_end_offset == 0) {
2653 elk_inst_set_jip(devinfo, insn, elk_inst_uip(devinfo, insn));
2654 } else {
2655 elk_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
2656 }
2657 assert(elk_inst_uip(devinfo, insn) != 0);
2658 assert(elk_inst_jip(devinfo, insn) != 0);
2659 break;
2660 }
2661
2662 default:
2663 break;
2664 }
2665 }
2666 }
2667
2668 void elk_ff_sync(struct elk_codegen *p,
2669 struct elk_reg dest,
2670 unsigned msg_reg_nr,
2671 struct elk_reg src0,
2672 bool allocate,
2673 unsigned response_length,
2674 bool eot)
2675 {
2676 const struct intel_device_info *devinfo = p->devinfo;
2677 elk_inst *insn;
2678
2679 elk_gfx6_resolve_implied_move(p, &src0, msg_reg_nr);
2680
2681 insn = next_insn(p, ELK_OPCODE_SEND);
2682 elk_set_dest(p, insn, dest);
2683 elk_set_src0(p, insn, src0);
2684 elk_set_src1(p, insn, elk_imm_d(0));
2685
2686 if (devinfo->ver < 6)
2687 elk_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
2688
2689 elk_set_ff_sync_message(p,
2690 insn,
2691 allocate,
2692 response_length,
2693 eot);
2694 }
2695
2696 /**
2697 * Emit the SEND instruction necessary to generate stream output data on Gfx6
2698 * (for transform feedback).
2699 *
2700 * If send_commit_msg is true, this is the last piece of stream output data
2701 * from this thread, so send the data as a committed write. According to the
2702 * Sandy Bridge PRM (volume 2 part 1, section 4.5.1):
2703 *
2704 * "Prior to End of Thread with a URB_WRITE, the kernel must ensure all
2705 * writes are complete by sending the final write as a committed write."
2706 */
2707 void
2708 elk_svb_write(struct elk_codegen *p,
2709 struct elk_reg dest,
2710 unsigned msg_reg_nr,
2711 struct elk_reg src0,
2712 unsigned binding_table_index,
2713 bool send_commit_msg)
2714 {
2715 const struct intel_device_info *devinfo = p->devinfo;
2716 assert(devinfo->ver == 6);
2717 const unsigned target_cache = GFX6_SFID_DATAPORT_RENDER_CACHE;
2718 elk_inst *insn;
2719
2720 elk_gfx6_resolve_implied_move(p, &src0, msg_reg_nr);
2721
2722 insn = next_insn(p, ELK_OPCODE_SEND);
2723 elk_inst_set_sfid(devinfo, insn, target_cache);
2724 elk_set_dest(p, insn, dest);
2725 elk_set_src0(p, insn, src0);
2726 elk_set_desc(p, insn,
2727 elk_message_desc(devinfo, 1, send_commit_msg, true) |
2728 elk_dp_write_desc(devinfo, binding_table_index,
2729 0, /* msg_control: ignored */
2730 GFX6_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE,
2731 send_commit_msg)); /* send_commit_msg */
2732 }
2733
2734 static unsigned
2735 elk_surface_payload_size(unsigned num_channels,
2736 unsigned exec_size /**< 0 for SIMD4x2 */)
2737 {
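   /* One payload/response register per channel for SIMD8 or narrower, two
    * per channel for SIMD16; a single register covers the SIMD4x2 case.
    */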
2738 if (exec_size == 0)
2739 return 1; /* SIMD4x2 */
2740 else if (exec_size <= 8)
2741 return num_channels;
2742 else
2743 return 2 * num_channels;
2744 }
2745
2746 void
2747 elk_untyped_atomic(struct elk_codegen *p,
2748 struct elk_reg dst,
2749 struct elk_reg payload,
2750 struct elk_reg surface,
2751 unsigned atomic_op,
2752 unsigned msg_length,
2753 bool response_expected,
2754 bool header_present)
2755 {
2756 const struct intel_device_info *devinfo = p->devinfo;
2757 const unsigned sfid = (devinfo->verx10 >= 75 ?
2758 HSW_SFID_DATAPORT_DATA_CACHE_1 :
2759 GFX7_SFID_DATAPORT_DATA_CACHE);
2760 const bool align1 = elk_get_default_access_mode(p) == ELK_ALIGN_1;
2761 /* SIMD4x2 untyped atomic instructions only exist on HSW+ */
2762 const bool has_simd4x2 = devinfo->verx10 >= 75;
2763 const unsigned exec_size = align1 ? 1 << elk_get_default_exec_size(p) :
2764 has_simd4x2 ? 0 : 8;
2765 const unsigned response_length =
2766 elk_surface_payload_size(response_expected, exec_size);
2767 const unsigned desc =
2768 elk_message_desc(devinfo, msg_length, response_length, header_present) |
2769 elk_dp_untyped_atomic_desc(devinfo, exec_size, atomic_op,
2770 response_expected);
2771 /* Mask out unused components -- This is especially important in Align16
2772 * mode on generations that don't have native support for SIMD4x2 atomics,
2773 * because unused but enabled components will cause the dataport to perform
2774 * additional atomic operations on the addresses that happen to be in the
2775 * uninitialized Y, Z and W coordinates of the payload.
2776 */
2777 const unsigned mask = align1 ? WRITEMASK_XYZW : WRITEMASK_X;
2778
2779 elk_send_indirect_surface_message(p, sfid, elk_writemask(dst, mask),
2780 payload, surface, desc);
2781 }
2782
2783 void
2784 elk_untyped_surface_read(struct elk_codegen *p,
2785 struct elk_reg dst,
2786 struct elk_reg payload,
2787 struct elk_reg surface,
2788 unsigned msg_length,
2789 unsigned num_channels)
2790 {
2791 const struct intel_device_info *devinfo = p->devinfo;
2792 const unsigned sfid = (devinfo->verx10 >= 75 ?
2793 HSW_SFID_DATAPORT_DATA_CACHE_1 :
2794 GFX7_SFID_DATAPORT_DATA_CACHE);
2795 const bool align1 = elk_get_default_access_mode(p) == ELK_ALIGN_1;
2796 const unsigned exec_size = align1 ? 1 << elk_get_default_exec_size(p) : 0;
2797 const unsigned response_length =
2798 elk_surface_payload_size(num_channels, exec_size);
2799 const unsigned desc =
2800 elk_message_desc(devinfo, msg_length, response_length, false) |
2801 elk_dp_untyped_surface_rw_desc(devinfo, exec_size, num_channels, false);
2802
2803 elk_send_indirect_surface_message(p, sfid, dst, payload, surface, desc);
2804 }
2805
2806 void
2807 elk_untyped_surface_write(struct elk_codegen *p,
2808 struct elk_reg payload,
2809 struct elk_reg surface,
2810 unsigned msg_length,
2811 unsigned num_channels,
2812 bool header_present)
2813 {
2814 const struct intel_device_info *devinfo = p->devinfo;
2815 const unsigned sfid = (devinfo->verx10 >= 75 ?
2816 HSW_SFID_DATAPORT_DATA_CACHE_1 :
2817 GFX7_SFID_DATAPORT_DATA_CACHE);
2818 const bool align1 = elk_get_default_access_mode(p) == ELK_ALIGN_1;
2819 /* SIMD4x2 untyped surface write instructions only exist on HSW+ */
2820 const bool has_simd4x2 = devinfo->verx10 >= 75;
2821 const unsigned exec_size = align1 ? 1 << elk_get_default_exec_size(p) :
2822 has_simd4x2 ? 0 : 8;
2823 const unsigned desc =
2824 elk_message_desc(devinfo, msg_length, 0, header_present) |
2825 elk_dp_untyped_surface_rw_desc(devinfo, exec_size, num_channels, true);
2826 /* Mask out unused components -- See comment in elk_untyped_atomic(). */
2827 const unsigned mask = !has_simd4x2 && !align1 ? WRITEMASK_X : WRITEMASK_XYZW;
2828
2829 elk_send_indirect_surface_message(p, sfid, elk_writemask(elk_null_reg(), mask),
2830 payload, surface, desc);
2831 }
2832
2833 static void
2834 elk_set_memory_fence_message(struct elk_codegen *p,
2835 struct elk_inst *insn,
2836 enum elk_message_target sfid,
2837 bool commit_enable,
2838 unsigned bti)
2839 {
2840 const struct intel_device_info *devinfo = p->devinfo;
2841
2842 elk_set_desc(p, insn, elk_message_desc(
2843 devinfo, 1, (commit_enable ? 1 : 0), true));
2844
2845 elk_inst_set_sfid(devinfo, insn, sfid);
2846
2847 switch (sfid) {
2848 case GFX6_SFID_DATAPORT_RENDER_CACHE:
2849 elk_inst_set_dp_msg_type(devinfo, insn, GFX7_DATAPORT_RC_MEMORY_FENCE);
2850 break;
2851 case GFX7_SFID_DATAPORT_DATA_CACHE:
2852 elk_inst_set_dp_msg_type(devinfo, insn, GFX7_DATAPORT_DC_MEMORY_FENCE);
2853 break;
2854 default:
2855 unreachable("Not reached");
2856 }
2857
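   /* Commit enable appears to live in bit 5 of the data port message
    * control field.
    */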
2858 if (commit_enable)
2859 elk_inst_set_dp_msg_control(devinfo, insn, 1 << 5);
2860
2861 assert(bti == 0);
2862 elk_inst_set_binding_table_index(devinfo, insn, bti);
2863 }
2864
2865 void
2866 elk_memory_fence(struct elk_codegen *p,
2867 struct elk_reg dst,
2868 struct elk_reg src,
2869 enum elk_opcode send_op,
2870 enum elk_message_target sfid,
2871 uint32_t desc,
2872 bool commit_enable,
2873 unsigned bti)
2874 {
2875 const struct intel_device_info *devinfo = p->devinfo;
2876
2877 dst = retype(vec1(dst), ELK_REGISTER_TYPE_UW);
2878 src = retype(vec1(src), ELK_REGISTER_TYPE_UD);
2879
2880 /* Set dst as destination for dependency tracking, the MEMORY_FENCE
2881 * message doesn't write anything back.
2882 */
2883 struct elk_inst *insn = next_insn(p, send_op);
2884 elk_inst_set_mask_control(devinfo, insn, ELK_MASK_DISABLE);
2885 elk_inst_set_exec_size(devinfo, insn, ELK_EXECUTE_1);
2886 elk_set_dest(p, insn, dst);
2887 elk_set_src0(p, insn, src);
2888
2889 elk_set_memory_fence_message(p, insn, sfid, commit_enable, bti);
2890 }
2891
2892 void
2893 elk_find_live_channel(struct elk_codegen *p, struct elk_reg dst, bool last)
2894 {
2895 const struct intel_device_info *devinfo = p->devinfo;
2896 const unsigned exec_size = 1 << elk_get_default_exec_size(p);
2897 const unsigned qtr_control = elk_get_default_group(p) / 8;
2898 elk_inst *inst;
2899
2900 assert(devinfo->ver == 7);
2901
2902 elk_push_insn_state(p);
2903
2904 /* The flag register is only used on Gfx7 in align1 mode, so avoid setting
2905 * unnecessary bits in the instruction words, get the information we need
2906 * and reset the default flag register. This allows more instructions to be
2907 * compacted.
2908 */
2909 const unsigned flag_subreg = p->current->flag_subreg;
2910 elk_set_default_flag_reg(p, 0, 0);
2911
2912 if (elk_get_default_access_mode(p) == ELK_ALIGN_1) {
2913 elk_set_default_mask_control(p, ELK_MASK_DISABLE);
2914
2915 const struct elk_reg flag = elk_flag_subreg(flag_subreg);
2916
2917 elk_set_default_exec_size(p, ELK_EXECUTE_1);
2918 elk_MOV(p, retype(flag, ELK_REGISTER_TYPE_UD), elk_imm_ud(0));
2919
2920 /* Run enough instructions returning zero with execution masking and
2921 * a conditional modifier enabled in order to get the full execution
2922 * mask in f1.0. We could use a single 32-wide move here if it
2923       * weren't for the hardware bug that causes channel enables to
2924 * be applied incorrectly to the second half of 32-wide instructions
2925 * on Gfx7.
2926 */
2927 const unsigned lower_size = MIN2(16, exec_size);
2928 for (unsigned i = 0; i < exec_size / lower_size; i++) {
2929 inst = elk_MOV(p, retype(elk_null_reg(), ELK_REGISTER_TYPE_UW),
2930 elk_imm_uw(0));
2931 elk_inst_set_mask_control(devinfo, inst, ELK_MASK_ENABLE);
2932 elk_inst_set_group(devinfo, inst, lower_size * i + 8 * qtr_control);
2933 elk_inst_set_cond_modifier(devinfo, inst, ELK_CONDITIONAL_Z);
2934 elk_inst_set_exec_size(devinfo, inst, cvt(lower_size) - 1);
2935 elk_inst_set_flag_reg_nr(devinfo, inst, flag_subreg / 2);
2936 elk_inst_set_flag_subreg_nr(devinfo, inst, flag_subreg % 2);
2937 }
2938
2939 /* Find the first bit set in the exec_size-wide portion of the flag
2940 * register that was updated by the last sequence of MOV
2941 * instructions.
2942 */
2943 const enum elk_reg_type type = elk_int_type(exec_size / 8, false);
2944 elk_set_default_exec_size(p, ELK_EXECUTE_1);
2945 if (!last) {
2946 inst = elk_FBL(p, vec1(dst), byte_offset(retype(flag, type), qtr_control));
2947 } else {
2948 inst = elk_LZD(p, vec1(dst), byte_offset(retype(flag, type), qtr_control));
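         /* LZD counts leading zeros from bit 31, so the last live channel is
          * 31 minus that count; compute it with a negated source and an ADD.
          */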
2949 struct elk_reg neg = vec1(dst);
2950 neg.negate = true;
2951 inst = elk_ADD(p, vec1(dst), neg, elk_imm_uw(31));
2952 }
2953 } else {
2954 elk_set_default_mask_control(p, ELK_MASK_DISABLE);
2955
2956 /* Overwrite the destination without and with execution masking to
2957 * find out which of the channels is active.
2958 */
2959 elk_push_insn_state(p);
2960 elk_set_default_exec_size(p, ELK_EXECUTE_4);
2961 elk_MOV(p, elk_writemask(vec4(dst), WRITEMASK_X),
2962 elk_imm_ud(1));
2963
2964 inst = elk_MOV(p, elk_writemask(vec4(dst), WRITEMASK_X),
2965 elk_imm_ud(0));
2966 elk_pop_insn_state(p);
2967 elk_inst_set_mask_control(devinfo, inst, ELK_MASK_ENABLE);
2968 }
2969
2970 elk_pop_insn_state(p);
2971 }
2972
2973 void
2974 elk_broadcast(struct elk_codegen *p,
2975 struct elk_reg dst,
2976 struct elk_reg src,
2977 struct elk_reg idx)
2978 {
2979 const struct intel_device_info *devinfo = p->devinfo;
2980 const bool align1 = elk_get_default_access_mode(p) == ELK_ALIGN_1;
2981 elk_inst *inst;
2982
2983 elk_push_insn_state(p);
2984 elk_set_default_mask_control(p, ELK_MASK_DISABLE);
2985 elk_set_default_exec_size(p, align1 ? ELK_EXECUTE_1 : ELK_EXECUTE_4);
2986
2987 assert(src.file == ELK_GENERAL_REGISTER_FILE &&
2988 src.address_mode == ELK_ADDRESS_DIRECT);
2989 assert(!src.abs && !src.negate);
2990
2991 /* Gen12.5 adds the following region restriction:
2992 *
2993 * "Vx1 and VxH indirect addressing for Float, Half-Float, Double-Float
2994 * and Quad-Word data must not be used."
2995 *
2996 * We require the source and destination types to match so stomp to an
2997 * unsigned integer type.
2998 */
2999 assert(src.type == dst.type);
3000 src.type = dst.type = elk_reg_type_from_bit_size(type_sz(src.type) * 8,
3001 ELK_REGISTER_TYPE_UD);
3002
3003 if ((src.vstride == 0 && (src.hstride == 0 || !align1)) ||
3004 idx.file == ELK_IMMEDIATE_VALUE) {
3005 /* Trivial, the source is already uniform or the index is a constant.
3006 * We will typically not get here if the optimizer is doing its job, but
3007 * asserting would be mean.
3008 */
3009 const unsigned i = idx.file == ELK_IMMEDIATE_VALUE ? idx.ud : 0;
3010 src = align1 ? stride(suboffset(src, i), 0, 1, 0) :
3011 stride(suboffset(src, 4 * i), 0, 4, 1);
3012
3013 if (type_sz(src.type) > 4 && !devinfo->has_64bit_int) {
3014 elk_MOV(p, subscript(dst, ELK_REGISTER_TYPE_D, 0),
3015 subscript(src, ELK_REGISTER_TYPE_D, 0));
3016 elk_MOV(p, subscript(dst, ELK_REGISTER_TYPE_D, 1),
3017 subscript(src, ELK_REGISTER_TYPE_D, 1));
3018 } else {
3019 elk_MOV(p, dst, src);
3020 }
3021 } else {
3022 /* From the Haswell PRM section "Register Region Restrictions":
3023 *
3024 * "The lower bits of the AddressImmediate must not overflow to
3025 * change the register address. The lower 5 bits of Address
3026 * Immediate when added to lower 5 bits of address register gives
3027 * the sub-register offset. The upper bits of Address Immediate
3028 * when added to upper bits of address register gives the register
3029 * address. Any overflow from sub-register offset is dropped."
3030 *
3031 * Fortunately, for broadcast, we never have a sub-register offset so
3032 * this isn't an issue.
3033 */
3034 assert(src.subnr == 0);
3035
3036 if (align1) {
3037 const struct elk_reg addr =
3038 retype(elk_address_reg(0), ELK_REGISTER_TYPE_UD);
3039 unsigned offset = src.nr * REG_SIZE + src.subnr;
3040 /* Limit in bytes of the signed indirect addressing immediate. */
3041 const unsigned limit = 512;
3042
3043 elk_push_insn_state(p);
3044 elk_set_default_mask_control(p, ELK_MASK_DISABLE);
3045 elk_set_default_predicate_control(p, ELK_PREDICATE_NONE);
3046 elk_set_default_flag_reg(p, 0, 0);
3047
3048 /* Take into account the component size and horizontal stride. */
3049 assert(src.vstride == src.hstride + src.width);
3050 elk_SHL(p, addr, vec1(idx),
3051 elk_imm_ud(util_logbase2(type_sz(src.type)) +
3052 src.hstride - 1));
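/* E.g. for a packed 32-bit source (hstride encoding of 1) the shift
 * amount above is log2(4) + 1 - 1 = 2, so addr ends up holding idx * 4,
 * the byte offset of the selected component within the region.
 */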
3053
3054 /* We can only address up to limit bytes using the indirect
3055 * addressing immediate, so account for the difference if the source
3056 * register is above this limit.
3057 */
3058 if (offset >= limit) {
3059 elk_ADD(p, addr, addr, elk_imm_ud(offset - offset % limit));
3060 offset = offset % limit;
3061 }
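/* E.g. a source in g20 gives offset = 20 * 32 = 640 >= 512, so 512 is
 * folded into the address register by the ADD above and the remaining
 * 128 bytes stay in the indirect immediate.
 */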
3062
3063 elk_pop_insn_state(p);
3064
3065 /* Use indirect addressing to fetch the specified component. */
3066 if (type_sz(src.type) > 4 &&
3067 (devinfo->platform == INTEL_PLATFORM_CHV || !devinfo->has_64bit_int)) {
3068 /* From the Cherryview PRM Vol 7. "Register Region Restrictions":
3069 *
3070 * "When source or destination datatype is 64b or operation is
3071 * integer DWord multiply, indirect addressing must not be
3072 * used."
3073 *
3074 * To work around this restriction, we do two integer MOVs
3075 * instead of one 64-bit MOV. Because no double value should ever
3076 * cross a register boundary, it's safe to use the immediate
3077 * offset in the indirect here to handle adding 4 bytes to the
3078 * offset and avoid the extra ADD to the register file.
3079 */
3080 elk_MOV(p, subscript(dst, ELK_REGISTER_TYPE_D, 0),
3081 retype(elk_vec1_indirect(addr.subnr, offset),
3082 ELK_REGISTER_TYPE_D));
3083 elk_MOV(p, subscript(dst, ELK_REGISTER_TYPE_D, 1),
3084 retype(elk_vec1_indirect(addr.subnr, offset + 4),
3085 ELK_REGISTER_TYPE_D));
3086 } else {
3087 elk_MOV(p, dst,
3088 retype(elk_vec1_indirect(addr.subnr, offset), src.type));
3089 }
3090 } else {
3091 /* In SIMD4x2 mode the index can be either zero or one; replicate it
3092 * to all bits of a flag register,
3093 */
3094 inst = elk_MOV(p,
3095 elk_null_reg(),
3096 stride(elk_swizzle(idx, ELK_SWIZZLE_XXXX), 4, 4, 1));
3097 elk_inst_set_pred_control(devinfo, inst, ELK_PREDICATE_NONE);
3098 elk_inst_set_cond_modifier(devinfo, inst, ELK_CONDITIONAL_NZ);
3099 elk_inst_set_flag_reg_nr(devinfo, inst, 1);
3100
3101 /* and use predicated SEL to pick the right channel. */
3102 inst = elk_SEL(p, dst,
3103 stride(suboffset(src, 4), 4, 4, 1),
3104 stride(src, 4, 4, 1));
3105 elk_inst_set_pred_control(devinfo, inst, ELK_PREDICATE_NORMAL);
3106 elk_inst_set_flag_reg_nr(devinfo, inst, 1);
3107 }
3108 }
3109
3110 elk_pop_insn_state(p);
3111 }
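/* A minimal usage sketch (hypothetical caller, not part of this file),
 * assuming elk_vec8_grf() behaves like its brw counterpart: broadcast
 * component 2 of g4 to a scalar UD destination in g10 via the trivial
 * immediate-index path above.
 *
 *    struct elk_reg dst = retype(elk_vec8_grf(10, 0), ELK_REGISTER_TYPE_UD);
 *    struct elk_reg src = retype(elk_vec8_grf(4, 0), ELK_REGISTER_TYPE_UD);
 *    elk_broadcast(p, vec1(dst), src, elk_imm_ud(2));
 */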
3112
3113
3114 /**
3115 * Emit the SEND message for a barrier
3116 */
3117 void
3118 elk_barrier(struct elk_codegen *p, struct elk_reg src)
3119 {
3120 const struct intel_device_info *devinfo = p->devinfo;
3121 struct elk_inst *inst;
3122
3123 assert(devinfo->ver >= 7);
3124
3125 elk_push_insn_state(p);
3126 elk_set_default_access_mode(p, ELK_ALIGN_1);
3127 inst = next_insn(p, ELK_OPCODE_SEND);
3128 elk_set_dest(p, inst, retype(elk_null_reg(), ELK_REGISTER_TYPE_UW));
3129 elk_set_src0(p, inst, src);
3130 elk_set_src1(p, inst, elk_null_reg());
3131 elk_set_desc(p, inst, elk_message_desc(devinfo,
3132 1 * reg_unit(devinfo), 0, false));
3133
3134 elk_inst_set_sfid(devinfo, inst, ELK_SFID_MESSAGE_GATEWAY);
3135 elk_inst_set_gateway_subfuncid(devinfo, inst,
3136 ELK_MESSAGE_GATEWAY_SFID_BARRIER_MSG);
3137
3138 elk_inst_set_mask_control(devinfo, inst, ELK_MASK_DISABLE);
3139 elk_pop_insn_state(p);
3140 }
3141
3142
3143 /**
3144 * Emit the wait instruction for a barrier
3145 */
3146 void
3147 elk_WAIT(struct elk_codegen *p)
3148 {
3149 const struct intel_device_info *devinfo = p->devinfo;
3150 struct elk_inst *insn;
3151
3152 struct elk_reg src = elk_notification_reg();
3153
3154 insn = next_insn(p, ELK_OPCODE_WAIT);
3155 elk_set_dest(p, insn, src);
3156 elk_set_src0(p, insn, src);
3157 elk_set_src1(p, insn, elk_null_reg());
3158
3159 elk_inst_set_exec_size(devinfo, insn, ELK_EXECUTE_1);
3160 elk_inst_set_mask_control(devinfo, insn, ELK_MASK_DISABLE);
3161 }
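/* The two helpers above are normally emitted back to back (hypothetical
 * caller sketch; barrier_payload stands for the caller-provided message
 * header register): the SEND signals the gateway barrier and the WAIT
 * then stalls the thread on the notification register until every thread
 * in the group has arrived.
 *
 *    elk_barrier(p, barrier_payload);
 *    elk_WAIT(p);
 */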
3162
3163 void
3164 elk_float_controls_mode(struct elk_codegen *p,
3165 unsigned mode, unsigned mask)
3166 {
3167 assert(p->current->mask_control == ELK_MASK_DISABLE);
3168
3169 /* From the Skylake PRM, Volume 7, page 760:
3170 * "Implementation Restriction on Register Access: When the control
3171 * register is used as an explicit source and/or destination, hardware
3172 * does not ensure execution pipeline coherency. Software must set the
3173 * thread control field to ‘switch’ for an instruction that uses
3174 * control register as an explicit operand."
3175 */
3176 elk_inst *inst = elk_AND(p, elk_cr0_reg(0), elk_cr0_reg(0),
3177 elk_imm_ud(~mask));
3178 elk_inst_set_exec_size(p->devinfo, inst, ELK_EXECUTE_1);
3179 elk_inst_set_thread_control(p->devinfo, inst, ELK_THREAD_SWITCH);
3180
3181 if (mode) {
3182 elk_inst *inst_or = elk_OR(p, elk_cr0_reg(0), elk_cr0_reg(0),
3183 elk_imm_ud(mode));
3184 elk_inst_set_exec_size(p->devinfo, inst_or, ELK_EXECUTE_1);
3185 elk_inst_set_thread_control(p->devinfo, inst_or, ELK_THREAD_SWITCH);
3186 }
3187 }
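/* Usage note: `mask` selects the cr0.0 bits to reprogram and `mode` holds
 * their new value, so the AND/OR pair above performs a read-modify-write
 * of the floating-point control register. A hypothetical caller switching
 * the rounding mode might look like the following, where rnd_bits and
 * rnd_mask are placeholders for the appropriate cr0 rounding-mode value
 * and field mask:
 *
 *    elk_set_default_mask_control(p, ELK_MASK_DISABLE);
 *    elk_float_controls_mode(p, rnd_bits, rnd_mask);
 */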
3188
3189 void
3190 elk_update_reloc_imm(const struct elk_isa_info *isa,
3191 elk_inst *inst,
3192 uint32_t value)
3193 {
3194 const struct intel_device_info *devinfo = isa->devinfo;
3195
3196 /* Sanity check that the instruction is a MOV of an immediate */
3197 assert(elk_inst_opcode(isa, inst) == ELK_OPCODE_MOV);
3198 assert(elk_inst_src0_reg_file(devinfo, inst) == ELK_IMMEDIATE_VALUE);
3199
3200 /* If it was compacted, we can't safely rewrite */
3201 assert(elk_inst_cmpt_control(devinfo, inst) == 0);
3202
3203 elk_inst_set_imm_ud(devinfo, inst, value);
3204 }
3205
3206 /* A default value for constants that will be patched at run-time.
3207 * We pick an arbitrary value that prevents instruction compaction.
3208 */
3209 #define DEFAULT_PATCH_IMM 0x4a7cc037
3210
3211 void
3212 elk_MOV_reloc_imm(struct elk_codegen *p,
3213 struct elk_reg dst,
3214 enum elk_reg_type src_type,
3215 uint32_t id)
3216 {
3217 assert(type_sz(src_type) == 4);
3218 assert(type_sz(dst.type) == 4);
3219
3220 elk_add_reloc(p, id, ELK_SHADER_RELOC_TYPE_MOV_IMM,
3221 p->next_insn_offset, 0);
3222
3223 elk_MOV(p, dst, retype(elk_imm_ud(DEFAULT_PATCH_IMM), src_type));
3224 }
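/* Taken together, elk_MOV_reloc_imm() emits a MOV of DEFAULT_PATCH_IMM and
 * records an ELK_SHADER_RELOC_TYPE_MOV_IMM relocation at the instruction's
 * offset; once the real value is known, elk_update_reloc_imm() patches the
 * immediate in place. A hypothetical patching loop (relocs, num_relocs,
 * code, my_id and real_value are placeholders for the caller's own
 * bookkeeping):
 *
 *    for (unsigned i = 0; i < num_relocs; i++) {
 *       if (relocs[i].id == my_id)
 *          elk_update_reloc_imm(isa, (elk_inst *)(code + relocs[i].offset),
 *                               real_value);
 *    }
 */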
3225