1 /* Copyright © 2011 Intel Corporation
2 *
3 * Permission is hereby granted, free of charge, to any person obtaining a
4 * copy of this software and associated documentation files (the "Software"),
5 * to deal in the Software without restriction, including without limitation
6 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
7 * and/or sell copies of the Software, and to permit persons to whom the
8 * Software is furnished to do so, subject to the following conditions:
9 *
10 * The above copyright notice and this permission notice (including the next
11 * paragraph) shall be included in all copies or substantial portions of the
12 * Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
20 * IN THE SOFTWARE.
21 */
22
23 #include "elk_vec4.h"
24 #include "elk_cfg.h"
25 #include "elk_eu.h"
26 #include "elk_disasm_info.h"
27 #include "dev/intel_debug.h"
28 #include "util/mesa-sha1.h"
29
30 using namespace elk;
31
32 static void
33 generate_math1_gfx4(struct elk_codegen *p,
34 vec4_instruction *inst,
35 struct elk_reg dst,
36 struct elk_reg src)
37 {
38 elk_gfx4_math(p,
39 dst,
40 elk_math_function(inst->opcode),
41 inst->base_mrf,
42 src,
43 ELK_MATH_PRECISION_FULL);
44 }
45
46 static void
47 check_gfx6_math_src_arg(struct elk_reg src)
48 {
49 /* Source swizzles are ignored. */
50 assert(!src.abs);
51 assert(!src.negate);
52 assert(src.swizzle == ELK_SWIZZLE_XYZW);
53 }
54
55 static void
56 generate_math_gfx6(struct elk_codegen *p,
57 vec4_instruction *inst,
58 struct elk_reg dst,
59 struct elk_reg src0,
60 struct elk_reg src1)
61 {
62 /* Can't do writemask because math can't be align16. */
63 assert(dst.writemask == WRITEMASK_XYZW);
64 /* Source swizzles are ignored. */
65 check_gfx6_math_src_arg(src0);
66 if (src1.file == ELK_GENERAL_REGISTER_FILE)
67 check_gfx6_math_src_arg(src1);
68
69 elk_set_default_access_mode(p, ELK_ALIGN_1);
70 elk_gfx6_math(p, dst, elk_math_function(inst->opcode), src0, src1);
71 elk_set_default_access_mode(p, ELK_ALIGN_16);
72 }
73
74 static void
75 generate_math2_gfx4(struct elk_codegen *p,
76 vec4_instruction *inst,
77 struct elk_reg dst,
78 struct elk_reg src0,
79 struct elk_reg src1)
80 {
81 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
82 * "Message Payload":
83 *
84 * "Operand0[7]. For the INT DIV functions, this operand is the
85 * denominator."
86 * ...
87 * "Operand1[7]. For the INT DIV functions, this operand is the
88 * numerator."
89 */
90 bool is_int_div = inst->opcode != ELK_SHADER_OPCODE_POW;
91 struct elk_reg &op0 = is_int_div ? src1 : src0;
92 struct elk_reg &op1 = is_int_div ? src0 : src1;
93
94 elk_push_insn_state(p);
95 elk_set_default_saturate(p, false);
96 elk_set_default_predicate_control(p, ELK_PREDICATE_NONE);
97 elk_set_default_flag_reg(p, 0, 0);
98 elk_MOV(p, retype(elk_message_reg(inst->base_mrf + 1), op1.type), op1);
99 elk_pop_insn_state(p);
100
101 elk_gfx4_math(p,
102 dst,
103 elk_math_function(inst->opcode),
104 inst->base_mrf,
105 op0,
106 ELK_MATH_PRECISION_FULL);
107 }
108
109 static void
110 generate_tex(struct elk_codegen *p,
111 struct elk_vue_prog_data *prog_data,
112 gl_shader_stage stage,
113 vec4_instruction *inst,
114 struct elk_reg dst,
115 struct elk_reg src,
116 struct elk_reg surface_index,
117 struct elk_reg sampler_index)
118 {
119 const struct intel_device_info *devinfo = p->devinfo;
120 int msg_type = -1;
121
122 if (devinfo->ver >= 5) {
123 switch (inst->opcode) {
124 case ELK_SHADER_OPCODE_TEX:
125 case ELK_SHADER_OPCODE_TXL:
126 if (inst->shadow_compare) {
127 msg_type = GFX5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE;
128 } else {
129 msg_type = GFX5_SAMPLER_MESSAGE_SAMPLE_LOD;
130 }
131 break;
132 case ELK_SHADER_OPCODE_TXD:
133 if (inst->shadow_compare) {
134 /* Gfx7.5+. Otherwise, lowered by elk_lower_texture_gradients(). */
135 assert(devinfo->verx10 == 75);
136 msg_type = HSW_SAMPLER_MESSAGE_SAMPLE_DERIV_COMPARE;
137 } else {
138 msg_type = GFX5_SAMPLER_MESSAGE_SAMPLE_DERIVS;
139 }
140 break;
141 case ELK_SHADER_OPCODE_TXF:
142 msg_type = GFX5_SAMPLER_MESSAGE_SAMPLE_LD;
143 break;
144 case ELK_SHADER_OPCODE_TXF_CMS:
145 if (devinfo->ver >= 7)
146 msg_type = GFX7_SAMPLER_MESSAGE_SAMPLE_LD2DMS;
147 else
148 msg_type = GFX5_SAMPLER_MESSAGE_SAMPLE_LD;
149 break;
150 case ELK_SHADER_OPCODE_TXF_MCS:
151 assert(devinfo->ver >= 7);
152 msg_type = GFX7_SAMPLER_MESSAGE_SAMPLE_LD_MCS;
153 break;
154 case ELK_SHADER_OPCODE_TXS:
155 msg_type = GFX5_SAMPLER_MESSAGE_SAMPLE_RESINFO;
156 break;
157 case ELK_SHADER_OPCODE_TG4:
158 if (inst->shadow_compare) {
159 msg_type = GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4_C;
160 } else {
161 msg_type = GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4;
162 }
163 break;
164 case ELK_SHADER_OPCODE_TG4_OFFSET:
165 if (inst->shadow_compare) {
166 msg_type = GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_C;
167 } else {
168 msg_type = GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO;
169 }
170 break;
171 case ELK_SHADER_OPCODE_SAMPLEINFO:
172 msg_type = GFX6_SAMPLER_MESSAGE_SAMPLE_SAMPLEINFO;
173 break;
174 default:
175 unreachable("should not get here: invalid vec4 texture opcode");
176 }
177 } else {
178 switch (inst->opcode) {
179 case ELK_SHADER_OPCODE_TEX:
180 case ELK_SHADER_OPCODE_TXL:
181 if (inst->shadow_compare) {
182 msg_type = ELK_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_LOD_COMPARE;
183 assert(inst->mlen == 3);
184 } else {
185 msg_type = ELK_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_LOD;
186 assert(inst->mlen == 2);
187 }
188 break;
189 case ELK_SHADER_OPCODE_TXD:
190 /* There is no sample_d_c message; comparisons are done manually. */
191 msg_type = ELK_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_GRADIENTS;
192 assert(inst->mlen == 4);
193 break;
194 case ELK_SHADER_OPCODE_TXF:
195 msg_type = ELK_SAMPLER_MESSAGE_SIMD4X2_LD;
196 assert(inst->mlen == 2);
197 break;
198 case ELK_SHADER_OPCODE_TXS:
199 msg_type = ELK_SAMPLER_MESSAGE_SIMD4X2_RESINFO;
200 assert(inst->mlen == 2);
201 break;
202 default:
203 unreachable("should not get here: invalid vec4 texture opcode");
204 }
205 }
206
207 assert(msg_type != -1);
208
209 assert(sampler_index.type == ELK_REGISTER_TYPE_UD);
210
211 /* Load the message header if present. If there's a texture offset, we need
212 * to set it up explicitly and load the offset bitfield. Otherwise, we can
213 * use an implied move from g0 to the first message register.
214 */
215 if (inst->header_size != 0) {
216 if (devinfo->ver < 6 && !inst->offset) {
217 /* Set up an implied move from g0 to the MRF. */
218 src = elk_vec8_grf(0, 0);
219 } else {
220 struct elk_reg header =
221 retype(elk_message_reg(inst->base_mrf), ELK_REGISTER_TYPE_UD);
222 uint32_t dw2 = 0;
223
224 /* Explicitly set up the message header by copying g0 to the MRF. */
225 elk_push_insn_state(p);
226 elk_set_default_mask_control(p, ELK_MASK_DISABLE);
227 elk_MOV(p, header, retype(elk_vec8_grf(0, 0), ELK_REGISTER_TYPE_UD));
228
229 elk_set_default_access_mode(p, ELK_ALIGN_1);
230
231 if (inst->offset)
232 /* Set the texel offset bits in DWord 2. */
233 dw2 = inst->offset;
234
235 /* The VS, DS, and FS stages have the g0.2 payload delivered as 0,
236 * so header0.2 is 0 when g0 is copied. The HS and GS stages do
237     * not, so we must set it to 0 to avoid setting undesirable bits
238 * in the message header.
239 */
240 if (dw2 ||
241 stage == MESA_SHADER_TESS_CTRL ||
242 stage == MESA_SHADER_GEOMETRY) {
243 elk_MOV(p, get_element_ud(header, 2), elk_imm_ud(dw2));
244 }
245
246 elk_adjust_sampler_state_pointer(p, header, sampler_index);
247 elk_pop_insn_state(p);
248 }
249 }
250
251 uint32_t return_format;
252
253 switch (dst.type) {
254 case ELK_REGISTER_TYPE_D:
255 return_format = ELK_SAMPLER_RETURN_FORMAT_SINT32;
256 break;
257 case ELK_REGISTER_TYPE_UD:
258 return_format = ELK_SAMPLER_RETURN_FORMAT_UINT32;
259 break;
260 default:
261 return_format = ELK_SAMPLER_RETURN_FORMAT_FLOAT32;
262 break;
263 }
264
265 /* Stomp the resinfo output type to UINT32. On gens 4-5, the output type
266 * is set as part of the message descriptor. On gfx4, the PRM seems to
267 * allow UINT32 and FLOAT32 (i965 PRM, Vol. 4 Section 4.8.1.1), but on
268 * later gens UINT32 is required. Once you hit Sandy Bridge, the bit is
269 * gone from the message descriptor entirely and you just get UINT32 all
270     * the time regardless. Since we can really only do non-UINT32 on gfx4,
271 * just stomp it to UINT32 all the time.
272 */
273 if (inst->opcode == ELK_SHADER_OPCODE_TXS)
274 return_format = ELK_SAMPLER_RETURN_FORMAT_UINT32;
275
276 if (surface_index.file == ELK_IMMEDIATE_VALUE &&
277 sampler_index.file == ELK_IMMEDIATE_VALUE) {
278 uint32_t surface = surface_index.ud;
279 uint32_t sampler = sampler_index.ud;
280
281 elk_SAMPLE(p,
282 dst,
283 inst->base_mrf,
284 src,
285 surface,
286 sampler % 16,
287 msg_type,
288 1, /* response length */
289 inst->mlen,
290 inst->header_size != 0,
291 ELK_SAMPLER_SIMD_MODE_SIMD4X2,
292 return_format);
293 } else {
294 /* Non-constant sampler index. */
295
296 struct elk_reg addr = vec1(retype(elk_address_reg(0), ELK_REGISTER_TYPE_UD));
297 struct elk_reg surface_reg = vec1(retype(surface_index, ELK_REGISTER_TYPE_UD));
298 struct elk_reg sampler_reg = vec1(retype(sampler_index, ELK_REGISTER_TYPE_UD));
299
300 elk_push_insn_state(p);
301 elk_set_default_mask_control(p, ELK_MASK_DISABLE);
302 elk_set_default_access_mode(p, ELK_ALIGN_1);
303
304 if (elk_regs_equal(&surface_reg, &sampler_reg)) {
305 elk_MUL(p, addr, sampler_reg, elk_imm_uw(0x101));
306 } else {
307 if (sampler_reg.file == ELK_IMMEDIATE_VALUE) {
308 elk_OR(p, addr, surface_reg, elk_imm_ud(sampler_reg.ud << 8));
309 } else {
310 elk_SHL(p, addr, sampler_reg, elk_imm_ud(8));
311 elk_OR(p, addr, addr, surface_reg);
312 }
313 }
314 elk_AND(p, addr, addr, elk_imm_ud(0xfff));
315
316 elk_pop_insn_state(p);
317
318 if (inst->base_mrf != -1)
319 elk_gfx6_resolve_implied_move(p, &src, inst->base_mrf);
320
321 /* dst = send(offset, a0.0 | <descriptor>) */
322 elk_send_indirect_message(
323 p, ELK_SFID_SAMPLER, dst, src, addr,
324 elk_message_desc(devinfo, inst->mlen, 1, inst->header_size) |
325 elk_sampler_desc(devinfo,
326 0 /* surface */,
327 0 /* sampler */,
328 msg_type,
329 ELK_SAMPLER_SIMD_MODE_SIMD4X2,
330 return_format),
331 false /* EOT */);
332
333    /* The visitor knows more than we do about the surface limit required,
334     * so it has already done the marking.
335 */
336 }
337 }
338
339 static void
340 generate_vs_urb_write(struct elk_codegen *p, vec4_instruction *inst)
341 {
342 elk_urb_WRITE(p,
343 elk_null_reg(), /* dest */
344 inst->base_mrf, /* starting mrf reg nr */
345 elk_vec8_grf(0, 0), /* src */
346 inst->urb_write_flags,
347 inst->mlen,
348 0, /* response len */
349 inst->offset, /* urb destination offset */
350 ELK_URB_SWIZZLE_INTERLEAVE);
351 }
352
353 static void
354 generate_gs_urb_write(struct elk_codegen *p, vec4_instruction *inst)
355 {
356 struct elk_reg src = elk_message_reg(inst->base_mrf);
357 elk_urb_WRITE(p,
358 elk_null_reg(), /* dest */
359 inst->base_mrf, /* starting mrf reg nr */
360 src,
361 inst->urb_write_flags,
362 inst->mlen,
363 0, /* response len */
364 inst->offset, /* urb destination offset */
365 ELK_URB_SWIZZLE_INTERLEAVE);
366 }
367
368 static void
369 generate_gs_urb_write_allocate(struct elk_codegen *p, vec4_instruction *inst)
370 {
371 struct elk_reg src = elk_message_reg(inst->base_mrf);
372
373    /* Use the temporary passed in src0 as the writeback register */
374 elk_urb_WRITE(p,
375 inst->src[0].as_elk_reg(), /* dest */
376 inst->base_mrf, /* starting mrf reg nr */
377 src,
378 ELK_URB_WRITE_ALLOCATE_COMPLETE,
379 inst->mlen,
380 1, /* response len */
381 inst->offset, /* urb destination offset */
382 ELK_URB_SWIZZLE_INTERLEAVE);
383
384 /* Now put allocated urb handle in dst.0 */
385 elk_push_insn_state(p);
386 elk_set_default_access_mode(p, ELK_ALIGN_1);
387 elk_set_default_mask_control(p, ELK_MASK_DISABLE);
388 elk_MOV(p, get_element_ud(inst->dst.as_elk_reg(), 0),
389 get_element_ud(inst->src[0].as_elk_reg(), 0));
390 elk_pop_insn_state(p);
391 }
392
393 static void
394 generate_gs_thread_end(struct elk_codegen *p, vec4_instruction *inst)
395 {
396 struct elk_reg src = elk_message_reg(inst->base_mrf);
397 elk_urb_WRITE(p,
398 elk_null_reg(), /* dest */
399 inst->base_mrf, /* starting mrf reg nr */
400 src,
401 ELK_URB_WRITE_EOT | inst->urb_write_flags,
402 inst->mlen,
403 0, /* response len */
404 0, /* urb destination offset */
405 ELK_URB_SWIZZLE_INTERLEAVE);
406 }
407
408 static void
409 generate_gs_set_write_offset(struct elk_codegen *p,
410 struct elk_reg dst,
411 struct elk_reg src0,
412 struct elk_reg src1)
413 {
414 /* From p22 of volume 4 part 2 of the Ivy Bridge PRM (2.4.3.1 Message
415 * Header: M0.3):
416 *
417 * Slot 0 Offset. This field, after adding to the Global Offset field
418 * in the message descriptor, specifies the offset (in 256-bit units)
419 * from the start of the URB entry, as referenced by URB Handle 0, at
420 * which the data will be accessed.
421 *
422 * Similar text describes DWORD M0.4, which is slot 1 offset.
423 *
424 * Therefore, we want to multiply DWORDs 0 and 4 of src0 (the x components
425 * of the register for geometry shader invocations 0 and 1) by the
426 * immediate value in src1, and store the result in DWORDs 3 and 4 of dst.
427 *
428 * We can do this with the following EU instruction:
429 *
430 * mul(2) dst.3<1>UD src0<8;2,4>UD src1<...>UW { Align1 WE_all }
431 */
432 elk_push_insn_state(p);
433 elk_set_default_access_mode(p, ELK_ALIGN_1);
434 elk_set_default_mask_control(p, ELK_MASK_DISABLE);
435 assert(p->devinfo->ver >= 7 &&
436 src1.file == ELK_IMMEDIATE_VALUE &&
437 src1.type == ELK_REGISTER_TYPE_UD &&
438 src1.ud <= USHRT_MAX);
439 if (src0.file == ELK_IMMEDIATE_VALUE) {
440 elk_MOV(p, suboffset(stride(dst, 2, 2, 1), 3),
441 elk_imm_ud(src0.ud * src1.ud));
442 } else {
443 if (src1.file == ELK_IMMEDIATE_VALUE) {
444 src1 = elk_imm_uw(src1.ud);
445 }
446 elk_MUL(p, suboffset(stride(dst, 2, 2, 1), 3), stride(src0, 8, 2, 4),
447 retype(src1, ELK_REGISTER_TYPE_UW));
448 }
449 elk_pop_insn_state(p);
450 }
451
452 static void
453 generate_gs_set_vertex_count(struct elk_codegen *p,
454 struct elk_reg dst,
455 struct elk_reg src)
456 {
457 elk_push_insn_state(p);
458 elk_set_default_mask_control(p, ELK_MASK_DISABLE);
459
460 /* If we think of the src and dst registers as composed of 8 DWORDs each,
461 * we want to pick up the contents of DWORDs 0 and 4 from src, truncate
462 * them to WORDs, and then pack them into DWORD 2 of dst.
463 *
464 * It's easier to get the EU to do this if we think of the src and dst
465 * registers as composed of 16 WORDS each; then, we want to pick up the
466 * contents of WORDs 0 and 8 from src, and pack them into WORDs 4 and 5
467 * of dst.
468 *
469 * We can do that by the following EU instruction:
470 *
471 * mov (2) dst.4<1>:uw src<8;1,0>:uw { Align1, Q1, NoMask }
472 */
473 elk_set_default_access_mode(p, ELK_ALIGN_1);
474 elk_MOV(p,
475 suboffset(stride(retype(dst, ELK_REGISTER_TYPE_UW), 2, 2, 1), 4),
476 stride(retype(src, ELK_REGISTER_TYPE_UW), 8, 1, 0));
477
478 elk_pop_insn_state(p);
479 }
480
481 static void
482 generate_gs_svb_write(struct elk_codegen *p,
483 vec4_instruction *inst,
484 struct elk_reg dst,
485 struct elk_reg src0,
486 struct elk_reg src1)
487 {
488 int binding = inst->sol_binding;
489 bool final_write = inst->sol_final_write;
490
491 elk_push_insn_state(p);
492 elk_set_default_exec_size(p, ELK_EXECUTE_4);
493 /* Copy Vertex data into M0.x */
494 elk_MOV(p, stride(dst, 4, 4, 1),
495 stride(retype(src0, ELK_REGISTER_TYPE_UD), 4, 4, 1));
496 elk_pop_insn_state(p);
497
498 elk_push_insn_state(p);
499 /* Send SVB Write */
500 elk_svb_write(p,
501 final_write ? src1 : elk_null_reg(), /* dest == src1 */
502 1, /* msg_reg_nr */
503 dst, /* src0 == previous dst */
504 ELK_GFX6_SOL_BINDING_START + binding, /* binding_table_index */
505 final_write); /* send_commit_msg */
506
507 /* Finally, wait for the write commit to occur so that we can proceed to
508 * other things safely.
509 *
510 * From the Sandybridge PRM, Volume 4, Part 1, Section 3.3:
511 *
512 * The write commit does not modify the destination register, but
513 * merely clears the dependency associated with the destination
514 * register. Thus, a simple “mov” instruction using the register as a
515 * source is sufficient to wait for the write commit to occur.
516 */
517 if (final_write) {
518 elk_MOV(p, src1, src1);
519 }
520 elk_pop_insn_state(p);
521 }
522
523 static void
524 generate_gs_svb_set_destination_index(struct elk_codegen *p,
525 vec4_instruction *inst,
526 struct elk_reg dst,
527 struct elk_reg src)
528 {
529 int vertex = inst->sol_vertex;
530 elk_push_insn_state(p);
531 elk_set_default_access_mode(p, ELK_ALIGN_1);
532 elk_set_default_mask_control(p, ELK_MASK_DISABLE);
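   /* DWord 5 of the SVB write message header holds the destination index
    * for this streamed-out vertex; copy it from the requested element of src.
    */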
533 elk_MOV(p, get_element_ud(dst, 5), get_element_ud(src, vertex));
534 elk_pop_insn_state(p);
535 }
536
537 static void
538 generate_gs_set_dword_2(struct elk_codegen *p,
539 struct elk_reg dst,
540 struct elk_reg src)
541 {
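   /* Copy the low DWord of src into DWord 2 of dst (align1, NoMask), leaving
    * the other channels of dst untouched.
    */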
542 elk_push_insn_state(p);
543 elk_set_default_access_mode(p, ELK_ALIGN_1);
544 elk_set_default_mask_control(p, ELK_MASK_DISABLE);
545 elk_MOV(p, suboffset(vec1(dst), 2), suboffset(vec1(src), 0));
546 elk_pop_insn_state(p);
547 }
548
549 static void
550 generate_gs_prepare_channel_masks(struct elk_codegen *p,
551 struct elk_reg dst)
552 {
553 /* We want to left shift just DWORD 4 (the x component belonging to the
554 * second geometry shader invocation) by 4 bits. So generate the
555 * instruction:
556 *
557 * shl(1) dst.4<1>UD dst.4<0,1,0>UD 4UD { align1 WE_all }
558 */
559 dst = suboffset(vec1(dst), 4);
560 elk_push_insn_state(p);
561 elk_set_default_access_mode(p, ELK_ALIGN_1);
562 elk_set_default_mask_control(p, ELK_MASK_DISABLE);
563 elk_SHL(p, dst, dst, elk_imm_ud(4));
564 elk_pop_insn_state(p);
565 }
566
567 static void
568 generate_gs_set_channel_masks(struct elk_codegen *p,
569 struct elk_reg dst,
570 struct elk_reg src)
571 {
572 /* From p21 of volume 4 part 2 of the Ivy Bridge PRM (2.4.3.1 Message
573 * Header: M0.5):
574 *
575 * 15 Vertex 1 DATA [3] / Vertex 0 DATA[7] Channel Mask
576 *
577 * When Swizzle Control = URB_INTERLEAVED this bit controls Vertex 1
578 * DATA[3], when Swizzle Control = URB_NOSWIZZLE this bit controls
579 * Vertex 0 DATA[7]. This bit is ANDed with the corresponding
580 * channel enable to determine the final channel enable. For the
581 * URB_READ_OWORD & URB_READ_HWORD messages, when final channel
582 * enable is 1 it indicates that Vertex 1 DATA [3] will be included
583 * in the writeback message. For the URB_WRITE_OWORD &
584 * URB_WRITE_HWORD messages, when final channel enable is 1 it
585 * indicates that Vertex 1 DATA [3] will be written to the surface.
586 *
587 * 0: Vertex 1 DATA [3] / Vertex 0 DATA[7] channel not included
588     *       1: Vertex 1 DATA [3] / Vertex 0 DATA[7] channel included
589 *
590 * 14 Vertex 1 DATA [2] Channel Mask
591 * 13 Vertex 1 DATA [1] Channel Mask
592 * 12 Vertex 1 DATA [0] Channel Mask
593 * 11 Vertex 0 DATA [3] Channel Mask
594 * 10 Vertex 0 DATA [2] Channel Mask
595 * 9 Vertex 0 DATA [1] Channel Mask
596 * 8 Vertex 0 DATA [0] Channel Mask
597 *
598 * (This is from a section of the PRM that is agnostic to the particular
599 * type of shader being executed, so "Vertex 0" and "Vertex 1" refer to
600 * geometry shader invocations 0 and 1, respectively). Since we have the
601 * enable flags for geometry shader invocation 0 in bits 3:0 of DWORD 0,
602     * and the enable flags for geometry shader invocation 1 in bits 7:4 of
603 * DWORD 4, we just need to OR them together and store the result in bits
604 * 15:8 of DWORD 5.
605 *
606 * It's easier to get the EU to do this if we think of the src and dst
607 * registers as composed of 32 bytes each; then, we want to pick up the
608 * contents of bytes 0 and 16 from src, OR them together, and store them in
609 * byte 21.
610 *
611 * We can do that by the following EU instruction:
612 *
613 * or(1) dst.21<1>UB src<0,1,0>UB src.16<0,1,0>UB { align1 WE_all }
614 *
615 * Note: this relies on the source register having zeros in (a) bits 7:4 of
616 * DWORD 0 and (b) bits 3:0 of DWORD 4. We can rely on (b) because the
617 * source register was prepared by ELK_GS_OPCODE_PREPARE_CHANNEL_MASKS (which
618 * shifts DWORD 4 left by 4 bits), and we can rely on (a) because prior to
619 * the execution of ELK_GS_OPCODE_PREPARE_CHANNEL_MASKS, DWORDs 0 and 4 need to
620 * contain valid channel mask values (which are in the range 0x0-0xf).
621 */
622 dst = retype(dst, ELK_REGISTER_TYPE_UB);
623 src = retype(src, ELK_REGISTER_TYPE_UB);
624 elk_push_insn_state(p);
625 elk_set_default_access_mode(p, ELK_ALIGN_1);
626 elk_set_default_mask_control(p, ELK_MASK_DISABLE);
627 elk_OR(p, suboffset(vec1(dst), 21), vec1(src), suboffset(vec1(src), 16));
628 elk_pop_insn_state(p);
629 }
630
631 static void
632 generate_gs_get_instance_id(struct elk_codegen *p,
633 struct elk_reg dst)
634 {
635 /* We want to right shift R0.0 & R0.1 by GFX7_GS_PAYLOAD_INSTANCE_ID_SHIFT
636 * and store into dst.0 & dst.4. So generate the instruction:
637 *
638 * shr(8) dst<1> R0<1,4,0> GFX7_GS_PAYLOAD_INSTANCE_ID_SHIFT { align1 WE_normal 1Q }
639 */
640 elk_push_insn_state(p);
641 elk_set_default_access_mode(p, ELK_ALIGN_1);
642 dst = retype(dst, ELK_REGISTER_TYPE_UD);
643 struct elk_reg r0(retype(elk_vec8_grf(0, 0), ELK_REGISTER_TYPE_UD));
644 elk_SHR(p, dst, stride(r0, 1, 4, 0),
645 elk_imm_ud(GFX7_GS_PAYLOAD_INSTANCE_ID_SHIFT));
646 elk_pop_insn_state(p);
647 }
648
649 static void
650 generate_gs_ff_sync_set_primitives(struct elk_codegen *p,
651 struct elk_reg dst,
652 struct elk_reg src0,
653 struct elk_reg src1,
654 struct elk_reg src2)
655 {
656 elk_push_insn_state(p);
657 elk_set_default_access_mode(p, ELK_ALIGN_1);
658 /* Save src0 data in 16:31 bits of dst.0 */
659 elk_AND(p, suboffset(vec1(dst), 0), suboffset(vec1(src0), 0),
660 elk_imm_ud(0xffffu));
661 elk_SHL(p, suboffset(vec1(dst), 0), suboffset(vec1(dst), 0), elk_imm_ud(16));
662 /* Save src1 data in 0:15 bits of dst.0 */
663 elk_AND(p, suboffset(vec1(src2), 0), suboffset(vec1(src1), 0),
664 elk_imm_ud(0xffffu));
665 elk_OR(p, suboffset(vec1(dst), 0),
666 suboffset(vec1(dst), 0),
667 suboffset(vec1(src2), 0));
668 elk_pop_insn_state(p);
669 }
670
671 static void
672 generate_gs_ff_sync(struct elk_codegen *p,
673 vec4_instruction *inst,
674 struct elk_reg dst,
675 struct elk_reg src0,
676 struct elk_reg src1)
677 {
678 /* This opcode uses an implied MRF register for:
679     *  - the header of the ff_sync message, so it is expected to have been
680     *    initialized to r0 before calling here.
681 * - the destination where we will write the allocated URB handle.
682 */
683 struct elk_reg header =
684 retype(elk_message_reg(inst->base_mrf), ELK_REGISTER_TYPE_UD);
685
686 /* Overwrite dword 0 of the header (SO vertices to write) and
687 * dword 1 (number of primitives written).
688 */
689 elk_push_insn_state(p);
690 elk_set_default_mask_control(p, ELK_MASK_DISABLE);
691 elk_set_default_access_mode(p, ELK_ALIGN_1);
692 elk_MOV(p, get_element_ud(header, 0), get_element_ud(src1, 0));
693 elk_MOV(p, get_element_ud(header, 1), get_element_ud(src0, 0));
694 elk_pop_insn_state(p);
695
696 /* Allocate URB handle in dst */
697 elk_ff_sync(p,
698 dst,
699 0,
700 header,
701 1, /* allocate */
702 1, /* response length */
703 0 /* eot */);
704
705 /* Now put allocated urb handle in header.0 */
706 elk_push_insn_state(p);
707 elk_set_default_access_mode(p, ELK_ALIGN_1);
708 elk_set_default_mask_control(p, ELK_MASK_DISABLE);
709 elk_MOV(p, get_element_ud(header, 0), get_element_ud(dst, 0));
710
711 /* src1 is not an immediate when we use transform feedback */
712 if (src1.file != ELK_IMMEDIATE_VALUE) {
713 elk_set_default_exec_size(p, ELK_EXECUTE_4);
714 elk_MOV(p, elk_vec4_grf(src1.nr, 0), elk_vec4_grf(dst.nr, 1));
715 }
716
717 elk_pop_insn_state(p);
718 }
719
720 static void
721 generate_gs_set_primitive_id(struct elk_codegen *p, struct elk_reg dst)
722 {
723 /* In gfx6, PrimitiveID is delivered in R0.1 of the payload */
724 struct elk_reg src = elk_vec8_grf(0, 0);
725 elk_push_insn_state(p);
726 elk_set_default_mask_control(p, ELK_MASK_DISABLE);
727 elk_set_default_access_mode(p, ELK_ALIGN_1);
728 elk_MOV(p, get_element_ud(dst, 0), get_element_ud(src, 1));
729 elk_pop_insn_state(p);
730 }
731
732 static void
733 generate_tcs_get_instance_id(struct elk_codegen *p, struct elk_reg dst)
734 {
735 const struct intel_device_info *devinfo = p->devinfo;
736 const bool ivb = devinfo->platform == INTEL_PLATFORM_IVB ||
737 devinfo->platform == INTEL_PLATFORM_BYT;
738
739 /* "Instance Count" comes as part of the payload in r0.2 bits 23:17.
740 *
741     * Since we operate in SIMD4x2 mode, we need to run half as many threads
742 * as necessary. So we assign (2i + 1, 2i) as the thread counts. We
743 * shift right by one less to accomplish the multiplication by two.
744 */
745 dst = retype(dst, ELK_REGISTER_TYPE_UD);
746 struct elk_reg r0(retype(elk_vec8_grf(0, 0), ELK_REGISTER_TYPE_UD));
747
748 elk_push_insn_state(p);
749 elk_set_default_access_mode(p, ELK_ALIGN_1);
750
751 const int mask = ivb ? INTEL_MASK(22, 16) : INTEL_MASK(23, 17);
752 const int shift = ivb ? 16 : 17;
753
754 elk_AND(p, get_element_ud(dst, 0), get_element_ud(r0, 2), elk_imm_ud(mask));
755 elk_SHR(p, get_element_ud(dst, 0), get_element_ud(dst, 0),
756 elk_imm_ud(shift - 1));
757 elk_ADD(p, get_element_ud(dst, 4), get_element_ud(dst, 0), elk_imm_ud(1));
758
759 elk_pop_insn_state(p);
760 }
761
762 static void
763 generate_tcs_urb_write(struct elk_codegen *p,
764 vec4_instruction *inst,
765 struct elk_reg urb_header)
766 {
767 const struct intel_device_info *devinfo = p->devinfo;
768
769 elk_inst *send = elk_next_insn(p, ELK_OPCODE_SEND);
770 elk_set_dest(p, send, elk_null_reg());
771 elk_set_src0(p, send, urb_header);
772 elk_set_desc(p, send, elk_message_desc(devinfo, inst->mlen, 0, true));
773
774 elk_inst_set_sfid(devinfo, send, ELK_SFID_URB);
775 elk_inst_set_urb_opcode(devinfo, send, ELK_URB_OPCODE_WRITE_OWORD);
776 elk_inst_set_urb_global_offset(devinfo, send, inst->offset);
777 if (inst->urb_write_flags & ELK_URB_WRITE_EOT) {
778 elk_inst_set_eot(devinfo, send, 1);
779 } else {
780 elk_inst_set_urb_per_slot_offset(devinfo, send, 1);
781 elk_inst_set_urb_swizzle_control(devinfo, send, ELK_URB_SWIZZLE_INTERLEAVE);
782 }
783
784 /* what happens to swizzles? */
785 }
786
787
788 static void
789 generate_tcs_input_urb_offsets(struct elk_codegen *p,
790 struct elk_reg dst,
791 struct elk_reg vertex,
792 struct elk_reg offset)
793 {
794 /* Generates an URB read/write message header for HS/DS operation.
795 * Inputs are a vertex index, and a byte offset from the beginning of
796 * the vertex. */
797
798 /* If `vertex` is not an immediate, we clobber a0.0 */
799
800 assert(vertex.file == ELK_IMMEDIATE_VALUE || vertex.file == ELK_GENERAL_REGISTER_FILE);
801 assert(vertex.type == ELK_REGISTER_TYPE_UD || vertex.type == ELK_REGISTER_TYPE_D);
802
803 assert(dst.file == ELK_GENERAL_REGISTER_FILE);
804
805 elk_push_insn_state(p);
806 elk_set_default_access_mode(p, ELK_ALIGN_1);
807 elk_set_default_mask_control(p, ELK_MASK_DISABLE);
808 elk_MOV(p, dst, elk_imm_ud(0));
809
810 /* m0.5 bits 8-15 are channel enables */
811 elk_MOV(p, get_element_ud(dst, 5), elk_imm_ud(0xff00));
812
813 /* m0.0-0.1: URB handles */
814 if (vertex.file == ELK_IMMEDIATE_VALUE) {
815 uint32_t vertex_index = vertex.ud;
816 struct elk_reg index_reg = elk_vec1_grf(
817 1 + (vertex_index >> 3), vertex_index & 7);
818
819 elk_MOV(p, vec2(get_element_ud(dst, 0)),
820 retype(index_reg, ELK_REGISTER_TYPE_UD));
821 } else {
822 /* Use indirect addressing. ICP Handles are DWords (single channels
823 * of a register) and start at g1.0.
824 *
825 * In order to start our region at g1.0, we add 8 to the vertex index,
826 * effectively skipping over the 8 channels in g0.0. This gives us a
827 * DWord offset to the ICP Handle.
828 *
829 * Indirect addressing works in terms of bytes, so we then multiply
830 * the DWord offset by 4 (by shifting left by 2).
831 */
832 struct elk_reg addr = elk_address_reg(0);
833
834 /* bottom half: m0.0 = g[1.0 + vertex.0]UD */
835 elk_ADD(p, addr, retype(get_element_ud(vertex, 0), ELK_REGISTER_TYPE_UW),
836 elk_imm_uw(0x8));
837 elk_SHL(p, addr, addr, elk_imm_uw(2));
838 elk_MOV(p, get_element_ud(dst, 0), deref_1ud(elk_indirect(0, 0), 0));
839
840 /* top half: m0.1 = g[1.0 + vertex.4]UD */
841 elk_ADD(p, addr, retype(get_element_ud(vertex, 4), ELK_REGISTER_TYPE_UW),
842 elk_imm_uw(0x8));
843 elk_SHL(p, addr, addr, elk_imm_uw(2));
844 elk_MOV(p, get_element_ud(dst, 1), deref_1ud(elk_indirect(0, 0), 0));
845 }
846
847 /* m0.3-0.4: 128bit-granular offsets into the URB from the handles */
848 if (offset.file != ARF)
849 elk_MOV(p, vec2(get_element_ud(dst, 3)), stride(offset, 4, 1, 0));
850
851 elk_pop_insn_state(p);
852 }
853
854
855 static void
856 generate_tcs_output_urb_offsets(struct elk_codegen *p,
857 struct elk_reg dst,
858 struct elk_reg write_mask,
859 struct elk_reg offset)
860 {
861 /* Generates an URB read/write message header for HS/DS operation, for the patch URB entry. */
862 assert(dst.file == ELK_GENERAL_REGISTER_FILE || dst.file == ELK_MESSAGE_REGISTER_FILE);
863
864 assert(write_mask.file == ELK_IMMEDIATE_VALUE);
865 assert(write_mask.type == ELK_REGISTER_TYPE_UD);
866
867 elk_push_insn_state(p);
868
869 elk_set_default_access_mode(p, ELK_ALIGN_1);
870 elk_set_default_mask_control(p, ELK_MASK_DISABLE);
871 elk_MOV(p, dst, elk_imm_ud(0));
872
873 unsigned mask = write_mask.ud;
874
875 /* m0.5 bits 15:12 and 11:8 are channel enables */
876 elk_MOV(p, get_element_ud(dst, 5), elk_imm_ud((mask << 8) | (mask << 12)));
877
878 /* HS patch URB handle is delivered in r0.0 */
879 struct elk_reg urb_handle = elk_vec1_grf(0, 0);
880
881 /* m0.0-0.1: URB handles */
882 elk_MOV(p, vec2(get_element_ud(dst, 0)),
883 retype(urb_handle, ELK_REGISTER_TYPE_UD));
884
885 /* m0.3-0.4: 128bit-granular offsets into the URB from the handles */
886 if (offset.file != ARF)
887 elk_MOV(p, vec2(get_element_ud(dst, 3)), stride(offset, 4, 1, 0));
888
889 elk_pop_insn_state(p);
890 }
891
892 static void
893 generate_tes_create_input_read_header(struct elk_codegen *p,
894 struct elk_reg dst)
895 {
896 elk_push_insn_state(p);
897 elk_set_default_access_mode(p, ELK_ALIGN_1);
898 elk_set_default_mask_control(p, ELK_MASK_DISABLE);
899
900 /* Initialize the register to 0 */
901 elk_MOV(p, dst, elk_imm_ud(0));
902
903 /* Enable all the channels in m0.5 bits 15:8 */
904 elk_MOV(p, get_element_ud(dst, 5), elk_imm_ud(0xff00));
905
906 /* Copy g1.3 (the patch URB handle) to m0.0 and m0.1. For safety,
907 * mask out irrelevant "Reserved" bits, as they're not marked MBZ.
908 */
909 elk_AND(p, vec2(get_element_ud(dst, 0)),
910 retype(elk_vec1_grf(1, 3), ELK_REGISTER_TYPE_UD),
911 elk_imm_ud(0x1fff));
912 elk_pop_insn_state(p);
913 }
914
915 static void
916 generate_tes_add_indirect_urb_offset(struct elk_codegen *p,
917 struct elk_reg dst,
918 struct elk_reg header,
919 struct elk_reg offset)
920 {
921 elk_push_insn_state(p);
922 elk_set_default_access_mode(p, ELK_ALIGN_1);
923 elk_set_default_mask_control(p, ELK_MASK_DISABLE);
924
925 elk_MOV(p, dst, header);
926
927 /* Uniforms will have a stride <0;4,1>, and we need to convert to <0;1,0>.
928 * Other values get <4;1,0>.
929 */
930 struct elk_reg restrided_offset;
931 if (offset.vstride == ELK_VERTICAL_STRIDE_0 &&
932 offset.width == ELK_WIDTH_4 &&
933 offset.hstride == ELK_HORIZONTAL_STRIDE_1) {
934 restrided_offset = stride(offset, 0, 1, 0);
935 } else {
936 restrided_offset = stride(offset, 4, 1, 0);
937 }
938
939 /* m0.3-0.4: 128-bit-granular offsets into the URB from the handles */
940 elk_MOV(p, vec2(get_element_ud(dst, 3)), restrided_offset);
941
942 elk_pop_insn_state(p);
943 }
944
945 static void
946 generate_vec4_urb_read(struct elk_codegen *p,
947 vec4_instruction *inst,
948 struct elk_reg dst,
949 struct elk_reg header)
950 {
951 const struct intel_device_info *devinfo = p->devinfo;
952
953 assert(header.file == ELK_GENERAL_REGISTER_FILE);
954 assert(header.type == ELK_REGISTER_TYPE_UD);
955
956 elk_inst *send = elk_next_insn(p, ELK_OPCODE_SEND);
957 elk_set_dest(p, send, dst);
958 elk_set_src0(p, send, header);
959
960 elk_set_desc(p, send, elk_message_desc(devinfo, 1, 1, true));
961
962 elk_inst_set_sfid(devinfo, send, ELK_SFID_URB);
963 elk_inst_set_urb_opcode(devinfo, send, ELK_URB_OPCODE_READ_OWORD);
964 elk_inst_set_urb_swizzle_control(devinfo, send, ELK_URB_SWIZZLE_INTERLEAVE);
965 elk_inst_set_urb_per_slot_offset(devinfo, send, 1);
966
967 elk_inst_set_urb_global_offset(devinfo, send, inst->offset);
968 }
969
970 static void
971 generate_tcs_release_input(struct elk_codegen *p,
972 struct elk_reg header,
973 struct elk_reg vertex,
974 struct elk_reg is_unpaired)
975 {
976 const struct intel_device_info *devinfo = p->devinfo;
977
978 assert(vertex.file == ELK_IMMEDIATE_VALUE);
979 assert(vertex.type == ELK_REGISTER_TYPE_UD);
980
981 /* m0.0-0.1: URB handles */
982 struct elk_reg urb_handles =
983 retype(elk_vec2_grf(1 + (vertex.ud >> 3), vertex.ud & 7),
984 ELK_REGISTER_TYPE_UD);
985
986 elk_push_insn_state(p);
987 elk_set_default_access_mode(p, ELK_ALIGN_1);
988 elk_set_default_mask_control(p, ELK_MASK_DISABLE);
989 elk_MOV(p, header, elk_imm_ud(0));
990 elk_MOV(p, vec2(get_element_ud(header, 0)), urb_handles);
991 elk_pop_insn_state(p);
992
993 elk_inst *send = elk_next_insn(p, ELK_OPCODE_SEND);
994 elk_set_dest(p, send, elk_null_reg());
995 elk_set_src0(p, send, header);
996 elk_set_desc(p, send, elk_message_desc(devinfo, 1, 0, true));
997
998 elk_inst_set_sfid(devinfo, send, ELK_SFID_URB);
999 elk_inst_set_urb_opcode(devinfo, send, ELK_URB_OPCODE_READ_OWORD);
1000 elk_inst_set_urb_complete(devinfo, send, 1);
1001 elk_inst_set_urb_swizzle_control(devinfo, send, is_unpaired.ud ?
1002 ELK_URB_SWIZZLE_NONE :
1003 ELK_URB_SWIZZLE_INTERLEAVE);
1004 }
1005
1006 static void
1007 generate_tcs_thread_end(struct elk_codegen *p, vec4_instruction *inst)
1008 {
1009 struct elk_reg header = elk_message_reg(inst->base_mrf);
1010
1011 elk_push_insn_state(p);
1012 elk_set_default_access_mode(p, ELK_ALIGN_1);
1013 elk_set_default_mask_control(p, ELK_MASK_DISABLE);
1014 elk_MOV(p, header, elk_imm_ud(0));
1015 elk_MOV(p, get_element_ud(header, 5), elk_imm_ud(WRITEMASK_X << 8));
1016 elk_MOV(p, get_element_ud(header, 0),
1017 retype(elk_vec1_grf(0, 0), ELK_REGISTER_TYPE_UD));
1018 elk_MOV(p, elk_message_reg(inst->base_mrf + 1), elk_imm_ud(0u));
1019 elk_pop_insn_state(p);
1020
1021 elk_urb_WRITE(p,
1022 elk_null_reg(), /* dest */
1023 inst->base_mrf, /* starting mrf reg nr */
1024 header,
1025 ELK_URB_WRITE_EOT | ELK_URB_WRITE_OWORD |
1026 ELK_URB_WRITE_USE_CHANNEL_MASKS,
1027 inst->mlen,
1028 0, /* response len */
1029 0, /* urb destination offset */
1030 0);
1031 }
1032
1033 static void
1034 generate_tes_get_primitive_id(struct elk_codegen *p, struct elk_reg dst)
1035 {
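   /* The DS thread payload delivers PrimitiveID in g1.7. */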
1036 elk_push_insn_state(p);
1037 elk_set_default_access_mode(p, ELK_ALIGN_1);
1038 elk_MOV(p, dst, retype(elk_vec1_grf(1, 7), ELK_REGISTER_TYPE_D));
1039 elk_pop_insn_state(p);
1040 }
1041
1042 static void
1043 generate_tcs_get_primitive_id(struct elk_codegen *p, struct elk_reg dst)
1044 {
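   /* The HS thread payload delivers PrimitiveID in r0.1. */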
1045 elk_push_insn_state(p);
1046 elk_set_default_access_mode(p, ELK_ALIGN_1);
1047 elk_MOV(p, dst, retype(elk_vec1_grf(0, 1), ELK_REGISTER_TYPE_UD));
1048 elk_pop_insn_state(p);
1049 }
1050
1051 static void
1052 generate_tcs_create_barrier_header(struct elk_codegen *p,
1053 struct elk_vue_prog_data *prog_data,
1054 struct elk_reg dst)
1055 {
1056 const struct intel_device_info *devinfo = p->devinfo;
1057 const bool ivb = devinfo->platform == INTEL_PLATFORM_IVB ||
1058 devinfo->platform == INTEL_PLATFORM_BYT;
1059 struct elk_reg m0_2 = get_element_ud(dst, 2);
1060 unsigned instances = ((struct elk_tcs_prog_data *) prog_data)->instances;
1061
1062 elk_push_insn_state(p);
1063 elk_set_default_access_mode(p, ELK_ALIGN_1);
1064 elk_set_default_mask_control(p, ELK_MASK_DISABLE);
1065
1066 /* Zero the message header */
1067 elk_MOV(p, retype(dst, ELK_REGISTER_TYPE_UD), elk_imm_ud(0u));
1068
1069 /* Copy "Barrier ID" from r0.2, bits 16:13 (Gfx7.5+) or 15:12 (Gfx7) */
1070 elk_AND(p, m0_2,
1071 retype(elk_vec1_grf(0, 2), ELK_REGISTER_TYPE_UD),
1072 elk_imm_ud(ivb ? INTEL_MASK(15, 12) : INTEL_MASK(16, 13)));
1073
1074 /* Shift it up to bits 27:24. */
1075 elk_SHL(p, m0_2, get_element_ud(dst, 2), elk_imm_ud(ivb ? 12 : 11));
1076
1077 /* Set the Barrier Count and the enable bit */
1078 elk_OR(p, m0_2, m0_2, elk_imm_ud(instances << 9 | (1 << 15)));
1079
1080 elk_pop_insn_state(p);
1081 }
1082
1083 static void
1084 generate_oword_dual_block_offsets(struct elk_codegen *p,
1085 struct elk_reg m1,
1086 struct elk_reg index)
1087 {
1088 int second_vertex_offset;
1089
1090 if (p->devinfo->ver >= 6)
1091 second_vertex_offset = 1;
1092 else
1093 second_vertex_offset = 16;
1094
1095 m1 = retype(m1, ELK_REGISTER_TYPE_D);
1096
1097 /* Set up M1 (message payload). Only the block offsets in M1.0 and
1098 * M1.4 are used, and the rest are ignored.
1099 */
1100 struct elk_reg m1_0 = suboffset(vec1(m1), 0);
1101 struct elk_reg m1_4 = suboffset(vec1(m1), 4);
1102 struct elk_reg index_0 = suboffset(vec1(index), 0);
1103 struct elk_reg index_4 = suboffset(vec1(index), 4);
1104
1105 elk_push_insn_state(p);
1106 elk_set_default_mask_control(p, ELK_MASK_DISABLE);
1107 elk_set_default_access_mode(p, ELK_ALIGN_1);
1108
1109 elk_MOV(p, m1_0, index_0);
1110
1111 if (index.file == ELK_IMMEDIATE_VALUE) {
1112 index_4.ud += second_vertex_offset;
1113 elk_MOV(p, m1_4, index_4);
1114 } else {
1115 elk_ADD(p, m1_4, index_4, elk_imm_d(second_vertex_offset));
1116 }
1117
1118 elk_pop_insn_state(p);
1119 }
1120
1121 static void
1122 generate_unpack_flags(struct elk_codegen *p,
1123 struct elk_reg dst)
1124 {
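   /* Split the low byte of f0.0: bits 3:0 go to dst.0 and bits 7:4 (shifted
    * back down) go to dst.4, one nibble per SIMD4x2 half.
    */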
1125 elk_push_insn_state(p);
1126 elk_set_default_mask_control(p, ELK_MASK_DISABLE);
1127 elk_set_default_access_mode(p, ELK_ALIGN_1);
1128
1129 struct elk_reg flags = elk_flag_reg(0, 0);
1130 struct elk_reg dst_0 = suboffset(vec1(dst), 0);
1131 struct elk_reg dst_4 = suboffset(vec1(dst), 4);
1132
1133 elk_AND(p, dst_0, flags, elk_imm_ud(0x0f));
1134 elk_AND(p, dst_4, flags, elk_imm_ud(0xf0));
1135 elk_SHR(p, dst_4, dst_4, elk_imm_ud(4));
1136
1137 elk_pop_insn_state(p);
1138 }
1139
1140 static void
1141 generate_scratch_read(struct elk_codegen *p,
1142 vec4_instruction *inst,
1143 struct elk_reg dst,
1144 struct elk_reg index)
1145 {
1146 const struct intel_device_info *devinfo = p->devinfo;
1147 struct elk_reg header = elk_vec8_grf(0, 0);
1148
1149 elk_gfx6_resolve_implied_move(p, &header, inst->base_mrf);
1150
1151 generate_oword_dual_block_offsets(p, elk_message_reg(inst->base_mrf + 1),
1152 index);
1153
1154 uint32_t msg_type;
1155
1156 if (devinfo->ver >= 6)
1157 msg_type = GFX6_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
1158 else if (devinfo->verx10 >= 45)
1159 msg_type = G45_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
1160 else
1161 msg_type = ELK_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
1162
1163 const unsigned target_cache =
1164 devinfo->ver >= 7 ? GFX7_SFID_DATAPORT_DATA_CACHE :
1165 devinfo->ver >= 6 ? GFX6_SFID_DATAPORT_RENDER_CACHE :
1166 ELK_SFID_DATAPORT_READ;
1167
1168 /* Each of the 8 channel enables is considered for whether each
1169 * dword is written.
1170 */
1171 elk_inst *send = elk_next_insn(p, ELK_OPCODE_SEND);
1172 elk_inst_set_sfid(devinfo, send, target_cache);
1173 elk_set_dest(p, send, dst);
1174 elk_set_src0(p, send, header);
1175 if (devinfo->ver < 6)
1176 elk_inst_set_cond_modifier(devinfo, send, inst->base_mrf);
1177 elk_set_desc(p, send,
1178 elk_message_desc(devinfo, 2, 1, true) |
1179 elk_dp_read_desc(devinfo,
1180 elk_scratch_surface_idx(p),
1181 ELK_DATAPORT_OWORD_DUAL_BLOCK_1OWORD,
1182 msg_type, ELK_DATAPORT_READ_TARGET_RENDER_CACHE));
1183 }
1184
1185 static void
1186 generate_scratch_write(struct elk_codegen *p,
1187 vec4_instruction *inst,
1188 struct elk_reg dst,
1189 struct elk_reg src,
1190 struct elk_reg index)
1191 {
1192 const struct intel_device_info *devinfo = p->devinfo;
1193 const unsigned target_cache =
1194 (devinfo->ver >= 7 ? GFX7_SFID_DATAPORT_DATA_CACHE :
1195 devinfo->ver >= 6 ? GFX6_SFID_DATAPORT_RENDER_CACHE :
1196 ELK_SFID_DATAPORT_WRITE);
1197 struct elk_reg header = elk_vec8_grf(0, 0);
1198 bool write_commit;
1199
1200 /* If the instruction is predicated, we'll predicate the send, not
1201 * the header setup.
1202 */
1203 elk_push_insn_state(p);
1204 elk_set_default_predicate_control(p, ELK_PREDICATE_NONE);
1205 elk_set_default_flag_reg(p, 0, 0);
1206
1207 elk_gfx6_resolve_implied_move(p, &header, inst->base_mrf);
1208
1209 generate_oword_dual_block_offsets(p, elk_message_reg(inst->base_mrf + 1),
1210 index);
1211
1212 elk_MOV(p,
1213 retype(elk_message_reg(inst->base_mrf + 2), ELK_REGISTER_TYPE_D),
1214 retype(src, ELK_REGISTER_TYPE_D));
1215
1216 elk_pop_insn_state(p);
1217
1218 uint32_t msg_type;
1219
1220 if (devinfo->ver >= 7)
1221 msg_type = GFX7_DATAPORT_DC_OWORD_DUAL_BLOCK_WRITE;
1222 else if (devinfo->ver == 6)
1223 msg_type = GFX6_DATAPORT_WRITE_MESSAGE_OWORD_DUAL_BLOCK_WRITE;
1224 else
1225 msg_type = ELK_DATAPORT_WRITE_MESSAGE_OWORD_DUAL_BLOCK_WRITE;
1226
1227 elk_set_default_predicate_control(p, inst->predicate);
1228
1229 /* Pre-gfx6, we have to specify write commits to ensure ordering
1230 * between reads and writes within a thread. Afterwards, that's
1231 * guaranteed and write commits only matter for inter-thread
1232 * synchronization.
1233 */
1234 if (devinfo->ver >= 6) {
1235 write_commit = false;
1236 } else {
1237 /* The visitor set up our destination register to be g0. This
1238 * means that when the next read comes along, we will end up
1239 * reading from g0 and causing a block on the write commit. For
1240 * write-after-read, we are relying on the value of the previous
1241 * read being used (and thus blocking on completion) before our
1242 * write is executed. This means we have to be careful in
1243 * instruction scheduling to not violate this assumption.
1244 */
1245 write_commit = true;
1246 }
1247
1248 /* Each of the 8 channel enables is considered for whether each
1249 * dword is written.
1250 */
1251 elk_inst *send = elk_next_insn(p, ELK_OPCODE_SEND);
1252 elk_inst_set_sfid(p->devinfo, send, target_cache);
1253 elk_set_dest(p, send, dst);
1254 elk_set_src0(p, send, header);
1255 if (devinfo->ver < 6)
1256 elk_inst_set_cond_modifier(p->devinfo, send, inst->base_mrf);
1257 elk_set_desc(p, send,
1258 elk_message_desc(devinfo, 3, write_commit, true) |
1259 elk_dp_write_desc(devinfo,
1260 elk_scratch_surface_idx(p),
1261 ELK_DATAPORT_OWORD_DUAL_BLOCK_1OWORD,
1262 msg_type,
1263 write_commit));
1264 }
1265
1266 static void
1267 generate_pull_constant_load(struct elk_codegen *p,
1268 vec4_instruction *inst,
1269 struct elk_reg dst,
1270 struct elk_reg index,
1271 struct elk_reg offset)
1272 {
1273 const struct intel_device_info *devinfo = p->devinfo;
1274 const unsigned target_cache =
1275 (devinfo->ver >= 6 ? GFX6_SFID_DATAPORT_SAMPLER_CACHE :
1276 ELK_SFID_DATAPORT_READ);
1277 assert(index.file == ELK_IMMEDIATE_VALUE &&
1278 index.type == ELK_REGISTER_TYPE_UD);
1279 uint32_t surf_index = index.ud;
1280
1281 struct elk_reg header = elk_vec8_grf(0, 0);
1282
1283 elk_gfx6_resolve_implied_move(p, &header, inst->base_mrf);
1284
1285 if (devinfo->ver >= 6) {
1286 if (offset.file == ELK_IMMEDIATE_VALUE) {
1287 elk_MOV(p, retype(elk_message_reg(inst->base_mrf + 1),
1288 ELK_REGISTER_TYPE_D),
1289 elk_imm_d(offset.ud >> 4));
1290 } else {
1291 elk_SHR(p, retype(elk_message_reg(inst->base_mrf + 1),
1292 ELK_REGISTER_TYPE_D),
1293 offset, elk_imm_d(4));
1294 }
1295 } else {
1296 elk_MOV(p, retype(elk_message_reg(inst->base_mrf + 1),
1297 ELK_REGISTER_TYPE_D),
1298 offset);
1299 }
1300
1301 uint32_t msg_type;
1302
1303 if (devinfo->ver >= 6)
1304 msg_type = GFX6_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
1305 else if (devinfo->verx10 >= 45)
1306 msg_type = G45_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
1307 else
1308 msg_type = ELK_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
1309
1310 /* Each of the 8 channel enables is considered for whether each
1311 * dword is written.
1312 */
1313 elk_inst *send = elk_next_insn(p, ELK_OPCODE_SEND);
1314 elk_inst_set_sfid(devinfo, send, target_cache);
1315 elk_set_dest(p, send, dst);
1316 elk_set_src0(p, send, header);
1317 if (devinfo->ver < 6)
1318 elk_inst_set_cond_modifier(p->devinfo, send, inst->base_mrf);
1319 elk_set_desc(p, send,
1320 elk_message_desc(devinfo, 2, 1, true) |
1321 elk_dp_read_desc(devinfo, surf_index,
1322 ELK_DATAPORT_OWORD_DUAL_BLOCK_1OWORD,
1323 msg_type,
1324 ELK_DATAPORT_READ_TARGET_DATA_CACHE));
1325 }
1326
1327 static void
1328 generate_get_buffer_size(struct elk_codegen *p,
1329 vec4_instruction *inst,
1330 struct elk_reg dst,
1331 struct elk_reg src,
1332 struct elk_reg surf_index)
1333 {
1334 assert(p->devinfo->ver >= 7);
1335 assert(surf_index.type == ELK_REGISTER_TYPE_UD &&
1336 surf_index.file == ELK_IMMEDIATE_VALUE);
1337
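   /* Buffer size queries are implemented as a resinfo sampler message with a
    * SINT32 return format.
    */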
1338 elk_SAMPLE(p,
1339 dst,
1340 inst->base_mrf,
1341 src,
1342 surf_index.ud,
1343 0,
1344 GFX5_SAMPLER_MESSAGE_SAMPLE_RESINFO,
1345 1, /* response length */
1346 inst->mlen,
1347 inst->header_size > 0,
1348 ELK_SAMPLER_SIMD_MODE_SIMD4X2,
1349 ELK_SAMPLER_RETURN_FORMAT_SINT32);
1350 }
1351
1352 static void
1353 generate_pull_constant_load_gfx7(struct elk_codegen *p,
1354 vec4_instruction *inst,
1355 struct elk_reg dst,
1356 struct elk_reg surf_index,
1357 struct elk_reg offset)
1358 {
1359 const struct intel_device_info *devinfo = p->devinfo;
1360 assert(surf_index.type == ELK_REGISTER_TYPE_UD);
1361
1362 if (surf_index.file == ELK_IMMEDIATE_VALUE) {
1363
1364 elk_inst *insn = elk_next_insn(p, ELK_OPCODE_SEND);
1365 elk_inst_set_sfid(devinfo, insn, ELK_SFID_SAMPLER);
1366 elk_set_dest(p, insn, dst);
1367 elk_set_src0(p, insn, offset);
1368 elk_set_desc(p, insn,
1369 elk_message_desc(devinfo, inst->mlen, 1, inst->header_size) |
1370 elk_sampler_desc(devinfo, surf_index.ud,
1371 0, /* LD message ignores sampler unit */
1372 GFX5_SAMPLER_MESSAGE_SAMPLE_LD,
1373 ELK_SAMPLER_SIMD_MODE_SIMD4X2, 0));
1374 } else {
1375
1376 struct elk_reg addr = vec1(retype(elk_address_reg(0), ELK_REGISTER_TYPE_UD));
1377
1378 elk_push_insn_state(p);
1379 elk_set_default_mask_control(p, ELK_MASK_DISABLE);
1380 elk_set_default_access_mode(p, ELK_ALIGN_1);
1381
1382 /* a0.0 = surf_index & 0xff */
1383 elk_inst *insn_and = elk_next_insn(p, ELK_OPCODE_AND);
1384 elk_inst_set_exec_size(devinfo, insn_and, ELK_EXECUTE_1);
1385 elk_set_dest(p, insn_and, addr);
1386 elk_set_src0(p, insn_and, vec1(retype(surf_index, ELK_REGISTER_TYPE_UD)));
1387 elk_set_src1(p, insn_and, elk_imm_ud(0x0ff));
1388
1389 elk_pop_insn_state(p);
1390
1391 /* dst = send(offset, a0.0 | <descriptor>) */
1392 elk_send_indirect_message(
1393 p, ELK_SFID_SAMPLER, dst, offset, addr,
1394 elk_message_desc(devinfo, inst->mlen, 1, inst->header_size) |
1395 elk_sampler_desc(devinfo,
1396 0 /* surface */,
1397 0 /* sampler */,
1398 GFX5_SAMPLER_MESSAGE_SAMPLE_LD,
1399 ELK_SAMPLER_SIMD_MODE_SIMD4X2,
1400 0),
1401 false /* EOT */);
1402 }
1403 }
1404
1405 static void
1406 generate_mov_indirect(struct elk_codegen *p,
1407 vec4_instruction *,
1408 struct elk_reg dst, struct elk_reg reg,
1409 struct elk_reg indirect)
1410 {
1411 assert(indirect.type == ELK_REGISTER_TYPE_UD);
1412 assert(p->devinfo->ver >= 6);
1413
1414 unsigned imm_byte_offset = reg.nr * REG_SIZE + reg.subnr * (REG_SIZE / 2);
1415
1416 /* This instruction acts in align1 mode */
1417 assert(dst.writemask == WRITEMASK_XYZW);
1418
1419 if (indirect.file == ELK_IMMEDIATE_VALUE) {
1420 imm_byte_offset += indirect.ud;
1421
1422 reg.nr = imm_byte_offset / REG_SIZE;
1423 reg.subnr = (imm_byte_offset / (REG_SIZE / 2)) % 2;
1424 unsigned shift = (imm_byte_offset / 4) % 4;
1425 reg.swizzle += ELK_SWIZZLE4(shift, shift, shift, shift);
1426
1427 elk_MOV(p, dst, reg);
1428 } else {
1429 elk_push_insn_state(p);
1430 elk_set_default_access_mode(p, ELK_ALIGN_1);
1431 elk_set_default_mask_control(p, ELK_MASK_DISABLE);
1432
1433 struct elk_reg addr = vec8(elk_address_reg(0));
1434
1435 /* We need to move the indirect value into the address register. In
1436 * order to make things make some sense, we want to respect at least the
1437 * X component of the swizzle. In order to do that, we need to convert
1438 * the subnr (probably 0) to an align1 subnr and add in the swizzle.
1439 */
1440 assert(elk_is_single_value_swizzle(indirect.swizzle));
1441 indirect.subnr = (indirect.subnr * 4 + ELK_GET_SWZ(indirect.swizzle, 0));
1442
1443 /* We then use a region of <8,4,0>:uw to pick off the first 2 bytes of
1444 * the indirect and splat it out to all four channels of the given half
1445 * of a0.
1446 */
1447 indirect.subnr *= 2;
1448 indirect = stride(retype(indirect, ELK_REGISTER_TYPE_UW), 8, 4, 0);
1449 elk_ADD(p, addr, indirect, elk_imm_uw(imm_byte_offset));
1450
1451 /* Now we need to incorporate the swizzle from the source register */
1452 if (reg.swizzle != ELK_SWIZZLE_XXXX) {
1453 uint32_t uv_swiz = ELK_GET_SWZ(reg.swizzle, 0) << 2 |
1454 ELK_GET_SWZ(reg.swizzle, 1) << 6 |
1455 ELK_GET_SWZ(reg.swizzle, 2) << 10 |
1456 ELK_GET_SWZ(reg.swizzle, 3) << 14;
1457 uv_swiz |= uv_swiz << 16;
1458
1459 elk_ADD(p, addr, addr, elk_imm_uv(uv_swiz));
1460 }
1461
1462 elk_MOV(p, dst, retype(elk_VxH_indirect(0, 0), reg.type));
1463
1464 elk_pop_insn_state(p);
1465 }
1466 }
1467
1468 static void
1469 generate_zero_oob_push_regs(struct elk_codegen *p,
1470 struct elk_stage_prog_data *prog_data,
1471 struct elk_reg scratch,
1472 struct elk_reg bit_mask_in)
1473 {
1474 const uint64_t want_zero = prog_data->zero_push_reg;
1475 assert(want_zero);
1476
1477 assert(bit_mask_in.file == ELK_GENERAL_REGISTER_FILE);
1478 assert(ELK_GET_SWZ(bit_mask_in.swizzle, 1) ==
1479 ELK_GET_SWZ(bit_mask_in.swizzle, 0) + 1);
1480 bit_mask_in.subnr += ELK_GET_SWZ(bit_mask_in.swizzle, 0) * 4;
1481 bit_mask_in.type = ELK_REGISTER_TYPE_W;
1482
1483 /* Scratch should be 3 registers in the GRF */
1484 assert(scratch.file == ELK_GENERAL_REGISTER_FILE);
1485 scratch = vec8(scratch);
1486 struct elk_reg mask_w16 = retype(scratch, ELK_REGISTER_TYPE_W);
1487 struct elk_reg mask_d16 = retype(byte_offset(scratch, REG_SIZE),
1488 ELK_REGISTER_TYPE_D);
1489
1490 elk_push_insn_state(p);
1491 elk_set_default_access_mode(p, ELK_ALIGN_1);
1492 elk_set_default_mask_control(p, ELK_MASK_DISABLE);
1493
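   /* Expand each bit of the runtime mask into a full dword of all-0s or
    * all-1s: shift the bit up into the sign position of a word, then
    * arithmetic-shift right so it fills the element, and AND the result
    * into the corresponding push-constant register below.
    */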
1494 for (unsigned i = 0; i < 64; i++) {
1495 if (i % 16 == 0 && (want_zero & BITFIELD64_RANGE(i, 16))) {
1496 elk_set_default_exec_size(p, ELK_EXECUTE_8);
1497 elk_SHL(p, suboffset(mask_w16, 8),
1498 vec1(byte_offset(bit_mask_in, i / 8)),
1499 elk_imm_v(0x01234567));
1500 elk_SHL(p, mask_w16, suboffset(mask_w16, 8), elk_imm_w(8));
1501
1502 elk_set_default_exec_size(p, ELK_EXECUTE_16);
1503 elk_ASR(p, mask_d16, mask_w16, elk_imm_w(15));
1504 }
1505
1506 if (want_zero & BITFIELD64_BIT(i)) {
1507 unsigned push_start = prog_data->dispatch_grf_start_reg;
1508 struct elk_reg push_reg =
1509 retype(elk_vec8_grf(push_start + i, 0), ELK_REGISTER_TYPE_D);
1510
1511 elk_set_default_exec_size(p, ELK_EXECUTE_8);
1512 elk_AND(p, push_reg, push_reg, vec1(suboffset(mask_d16, i)));
1513 }
1514 }
1515
1516 elk_pop_insn_state(p);
1517 }
1518
1519 static void
1520 generate_code(struct elk_codegen *p,
1521 const struct elk_compiler *compiler,
1522 const struct elk_compile_params *params,
1523 const nir_shader *nir,
1524 struct elk_vue_prog_data *prog_data,
1525 const struct elk_cfg_t *cfg,
1526 const performance &perf,
1527 struct elk_compile_stats *stats,
1528 bool debug_enabled)
1529 {
1530 const struct intel_device_info *devinfo = p->devinfo;
1531 const char *stage_abbrev = _mesa_shader_stage_to_abbrev(nir->info.stage);
1532 struct elk_disasm_info *elk_disasm_info = elk_disasm_initialize(p->isa, cfg);
1533
1534 /* `send_count` explicitly does not include spills or fills, as we'd
1535 * like to use it as a metric for intentional memory access or other
1536 * shared function use. Otherwise, subtle changes to scheduling or
1537 * register allocation could cause it to fluctuate wildly - and that
1538 * effect is already counted in spill/fill counts.
1539 */
1540 int spill_count = 0, fill_count = 0;
1541 int loop_count = 0, send_count = 0;
1542
1543 foreach_block_and_inst (block, vec4_instruction, inst, cfg) {
1544 struct elk_reg src[3], dst;
1545
1546 if (unlikely(debug_enabled))
1547 elk_disasm_annotate(elk_disasm_info, inst, p->next_insn_offset);
1548
1549 for (unsigned int i = 0; i < 3; i++) {
1550 src[i] = inst->src[i].as_elk_reg();
1551 }
1552 dst = inst->dst.as_elk_reg();
1553
1554 elk_set_default_predicate_control(p, inst->predicate);
1555 elk_set_default_predicate_inverse(p, inst->predicate_inverse);
1556 elk_set_default_flag_reg(p, inst->flag_subreg / 2, inst->flag_subreg % 2);
1557 elk_set_default_saturate(p, inst->saturate);
1558 elk_set_default_mask_control(p, inst->force_writemask_all);
1559 elk_set_default_acc_write_control(p, inst->writes_accumulator);
1560
1561 assert(inst->group % inst->exec_size == 0);
1562 assert(inst->group % 4 == 0);
1563
1564 /* There are some instructions where the destination is 64-bit
1565 * but we retype it to a smaller type. In that case, we cannot
1566 * double the exec_size.
1567 */
1568 const bool is_df = (get_exec_type_size(inst) == 8 ||
1569 inst->dst.type == ELK_REGISTER_TYPE_DF) &&
1570 inst->opcode != ELK_VEC4_OPCODE_PICK_LOW_32BIT &&
1571 inst->opcode != ELK_VEC4_OPCODE_PICK_HIGH_32BIT &&
1572 inst->opcode != ELK_VEC4_OPCODE_SET_LOW_32BIT &&
1573 inst->opcode != ELK_VEC4_OPCODE_SET_HIGH_32BIT;
1574
1575 unsigned exec_size = inst->exec_size;
1576 if (devinfo->verx10 == 70 && is_df)
1577 exec_size *= 2;
1578
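      /* ELK_EXECUTE_* values encode log2 of the execution size (8 -> 3,
       * 16 -> 4); cvt() is assumed to be the usual 1 + log2 helper for
       * power-of-two sizes, hence the "- 1" below.
       */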
1579 elk_set_default_exec_size(p, cvt(exec_size) - 1);
1580
1581 if (!inst->force_writemask_all)
1582 elk_set_default_group(p, inst->group);
1583
1584 assert(inst->base_mrf + inst->mlen <= ELK_MAX_MRF(devinfo->ver));
1585 assert(inst->mlen <= ELK_MAX_MSG_LENGTH);
1586
1587 unsigned pre_emit_nr_insn = p->nr_insn;
1588
1589 switch (inst->opcode) {
1590 case ELK_VEC4_OPCODE_UNPACK_UNIFORM:
1591 case ELK_OPCODE_MOV:
1592 case ELK_VEC4_OPCODE_MOV_FOR_SCRATCH:
1593 elk_MOV(p, dst, src[0]);
1594 break;
1595 case ELK_OPCODE_ADD:
1596 elk_ADD(p, dst, src[0], src[1]);
1597 break;
1598 case ELK_OPCODE_MUL:
1599 elk_MUL(p, dst, src[0], src[1]);
1600 break;
1601 case ELK_OPCODE_MACH:
1602 elk_MACH(p, dst, src[0], src[1]);
1603 break;
1604
1605 case ELK_OPCODE_MAD:
1606 assert(devinfo->ver >= 6);
1607 elk_MAD(p, dst, src[0], src[1], src[2]);
1608 break;
1609
1610 case ELK_OPCODE_FRC:
1611 elk_FRC(p, dst, src[0]);
1612 break;
1613 case ELK_OPCODE_RNDD:
1614 elk_RNDD(p, dst, src[0]);
1615 break;
1616 case ELK_OPCODE_RNDE:
1617 elk_RNDE(p, dst, src[0]);
1618 break;
1619 case ELK_OPCODE_RNDZ:
1620 elk_RNDZ(p, dst, src[0]);
1621 break;
1622
1623 case ELK_OPCODE_AND:
1624 elk_AND(p, dst, src[0], src[1]);
1625 break;
1626 case ELK_OPCODE_OR:
1627 elk_OR(p, dst, src[0], src[1]);
1628 break;
1629 case ELK_OPCODE_XOR:
1630 elk_XOR(p, dst, src[0], src[1]);
1631 break;
1632 case ELK_OPCODE_NOT:
1633 elk_NOT(p, dst, src[0]);
1634 break;
1635 case ELK_OPCODE_ASR:
1636 elk_ASR(p, dst, src[0], src[1]);
1637 break;
1638 case ELK_OPCODE_SHR:
1639 elk_SHR(p, dst, src[0], src[1]);
1640 break;
1641 case ELK_OPCODE_SHL:
1642 elk_SHL(p, dst, src[0], src[1]);
1643 break;
1644
1645 case ELK_OPCODE_CMP:
1646 elk_CMP(p, dst, inst->conditional_mod, src[0], src[1]);
1647 break;
1648 case ELK_OPCODE_CMPN:
1649 elk_CMPN(p, dst, inst->conditional_mod, src[0], src[1]);
1650 break;
1651 case ELK_OPCODE_SEL:
1652 elk_SEL(p, dst, src[0], src[1]);
1653 break;
1654
1655 case ELK_OPCODE_DPH:
1656 elk_DPH(p, dst, src[0], src[1]);
1657 break;
1658
1659 case ELK_OPCODE_DP4:
1660 elk_DP4(p, dst, src[0], src[1]);
1661 break;
1662
1663 case ELK_OPCODE_DP3:
1664 elk_DP3(p, dst, src[0], src[1]);
1665 break;
1666
1667 case ELK_OPCODE_DP2:
1668 elk_DP2(p, dst, src[0], src[1]);
1669 break;
1670
1671 case ELK_OPCODE_F32TO16:
1672 assert(devinfo->ver >= 7);
1673 elk_F32TO16(p, dst, src[0]);
1674 break;
1675
1676 case ELK_OPCODE_F16TO32:
1677 assert(devinfo->ver >= 7);
1678 elk_F16TO32(p, dst, src[0]);
1679 break;
1680
1681 case ELK_OPCODE_LRP:
1682 assert(devinfo->ver >= 6);
1683 elk_LRP(p, dst, src[0], src[1], src[2]);
1684 break;
1685
1686 case ELK_OPCODE_BFREV:
1687 assert(devinfo->ver >= 7);
1688 elk_BFREV(p, retype(dst, ELK_REGISTER_TYPE_UD),
1689 retype(src[0], ELK_REGISTER_TYPE_UD));
1690 break;
1691 case ELK_OPCODE_FBH:
1692 assert(devinfo->ver >= 7);
1693 elk_FBH(p, retype(dst, src[0].type), src[0]);
1694 break;
1695 case ELK_OPCODE_FBL:
1696 assert(devinfo->ver >= 7);
1697 elk_FBL(p, retype(dst, ELK_REGISTER_TYPE_UD),
1698 retype(src[0], ELK_REGISTER_TYPE_UD));
1699 break;
1700 case ELK_OPCODE_LZD:
1701 elk_LZD(p, dst, src[0]);
1702 break;
1703 case ELK_OPCODE_CBIT:
1704 assert(devinfo->ver >= 7);
1705 elk_CBIT(p, retype(dst, ELK_REGISTER_TYPE_UD),
1706 retype(src[0], ELK_REGISTER_TYPE_UD));
1707 break;
1708 case ELK_OPCODE_ADDC:
1709 assert(devinfo->ver >= 7);
1710 elk_ADDC(p, dst, src[0], src[1]);
1711 break;
1712 case ELK_OPCODE_SUBB:
1713 assert(devinfo->ver >= 7);
1714 elk_SUBB(p, dst, src[0], src[1]);
1715 break;
1716 case ELK_OPCODE_MAC:
1717 elk_MAC(p, dst, src[0], src[1]);
1718 break;
1719
1720 case ELK_OPCODE_BFE:
1721 assert(devinfo->ver >= 7);
1722 elk_BFE(p, dst, src[0], src[1], src[2]);
1723 break;
1724
1725 case ELK_OPCODE_BFI1:
1726 assert(devinfo->ver >= 7);
1727 elk_BFI1(p, dst, src[0], src[1]);
1728 break;
1729 case ELK_OPCODE_BFI2:
1730 assert(devinfo->ver >= 7);
1731 elk_BFI2(p, dst, src[0], src[1], src[2]);
1732 break;
1733
1734 case ELK_OPCODE_IF:
1735 if (!inst->src[0].is_null()) {
1736 /* The instruction has an embedded compare (only allowed on gfx6) */
1737 assert(devinfo->ver == 6);
1738 elk_gfx6_IF(p, inst->conditional_mod, src[0], src[1]);
1739 } else {
1740 elk_inst *if_inst = elk_IF(p, ELK_EXECUTE_8);
1741 elk_inst_set_pred_control(p->devinfo, if_inst, inst->predicate);
1742 }
1743 break;
1744
1745 case ELK_OPCODE_ELSE:
1746 elk_ELSE(p);
1747 break;
1748 case ELK_OPCODE_ENDIF:
1749 elk_ENDIF(p);
1750 break;
1751
1752 case ELK_OPCODE_DO:
1753 elk_DO(p, ELK_EXECUTE_8);
1754 break;
1755
1756 case ELK_OPCODE_BREAK:
1757 elk_BREAK(p);
1758 elk_set_default_predicate_control(p, ELK_PREDICATE_NONE);
1759 break;
1760 case ELK_OPCODE_CONTINUE:
1761 elk_CONT(p);
1762 elk_set_default_predicate_control(p, ELK_PREDICATE_NONE);
1763 break;
1764
1765 case ELK_OPCODE_WHILE:
1766 elk_WHILE(p);
1767 loop_count++;
1768 break;
1769
1770 case ELK_SHADER_OPCODE_RCP:
1771 case ELK_SHADER_OPCODE_RSQ:
1772 case ELK_SHADER_OPCODE_SQRT:
1773 case ELK_SHADER_OPCODE_EXP2:
1774 case ELK_SHADER_OPCODE_LOG2:
1775 case ELK_SHADER_OPCODE_SIN:
1776 case ELK_SHADER_OPCODE_COS:
1777 assert(inst->conditional_mod == ELK_CONDITIONAL_NONE);
1778 if (devinfo->ver >= 7) {
1779 elk_gfx6_math(p, dst, elk_math_function(inst->opcode), src[0],
1780 elk_null_reg());
1781 } else if (devinfo->ver == 6) {
1782 generate_math_gfx6(p, inst, dst, src[0], elk_null_reg());
1783 } else {
1784 generate_math1_gfx4(p, inst, dst, src[0]);
1785 send_count++;
1786 }
1787 break;
1788
1789 case ELK_SHADER_OPCODE_POW:
1790 case ELK_SHADER_OPCODE_INT_QUOTIENT:
1791 case ELK_SHADER_OPCODE_INT_REMAINDER:
1792 assert(inst->conditional_mod == ELK_CONDITIONAL_NONE);
1793 if (devinfo->ver >= 7) {
1794 elk_gfx6_math(p, dst, elk_math_function(inst->opcode), src[0], src[1]);
1795 } else if (devinfo->ver == 6) {
1796 generate_math_gfx6(p, inst, dst, src[0], src[1]);
1797 } else {
1798 generate_math2_gfx4(p, inst, dst, src[0], src[1]);
1799 send_count++;
1800 }
1801 break;
1802
1803 case ELK_SHADER_OPCODE_TEX:
1804 case ELK_SHADER_OPCODE_TXD:
1805 case ELK_SHADER_OPCODE_TXF:
1806 case ELK_SHADER_OPCODE_TXF_CMS:
1807 case ELK_SHADER_OPCODE_TXF_CMS_W:
1808 case ELK_SHADER_OPCODE_TXF_MCS:
1809 case ELK_SHADER_OPCODE_TXL:
1810 case ELK_SHADER_OPCODE_TXS:
1811 case ELK_SHADER_OPCODE_TG4:
1812 case ELK_SHADER_OPCODE_TG4_OFFSET:
1813 case ELK_SHADER_OPCODE_SAMPLEINFO:
1814 generate_tex(p, prog_data, nir->info.stage,
1815 inst, dst, src[0], src[1], src[2]);
1816 send_count++;
1817 break;
1818
1819 case ELK_SHADER_OPCODE_GET_BUFFER_SIZE:
1820 generate_get_buffer_size(p, inst, dst, src[0], src[1]);
1821 send_count++;
1822 break;
1823
1824 case ELK_VEC4_VS_OPCODE_URB_WRITE:
1825 generate_vs_urb_write(p, inst);
1826 send_count++;
1827 break;
1828
1829 case ELK_SHADER_OPCODE_GFX4_SCRATCH_READ:
1830 generate_scratch_read(p, inst, dst, src[0]);
1831 fill_count++;
1832 break;
1833
1834 case ELK_SHADER_OPCODE_GFX4_SCRATCH_WRITE:
1835 generate_scratch_write(p, inst, dst, src[0], src[1]);
1836 spill_count++;
1837 break;
1838
1839 case ELK_VS_OPCODE_PULL_CONSTANT_LOAD:
1840 generate_pull_constant_load(p, inst, dst, src[0], src[1]);
1841 send_count++;
1842 break;
1843
1844 case ELK_VS_OPCODE_PULL_CONSTANT_LOAD_GFX7:
1845 generate_pull_constant_load_gfx7(p, inst, dst, src[0], src[1]);
1846 send_count++;
1847 break;
1848
1849 case ELK_VEC4_GS_OPCODE_URB_WRITE:
1850 generate_gs_urb_write(p, inst);
1851 send_count++;
1852 break;
1853
1854 case ELK_VEC4_GS_OPCODE_URB_WRITE_ALLOCATE:
1855 generate_gs_urb_write_allocate(p, inst);
1856 send_count++;
1857 break;
1858
1859 case ELK_GS_OPCODE_SVB_WRITE:
1860 generate_gs_svb_write(p, inst, dst, src[0], src[1]);
1861 send_count++;
1862 break;
1863
1864 case ELK_GS_OPCODE_SVB_SET_DST_INDEX:
1865 generate_gs_svb_set_destination_index(p, inst, dst, src[0]);
1866 break;
1867
1868 case ELK_GS_OPCODE_THREAD_END:
1869 generate_gs_thread_end(p, inst);
1870 send_count++;
1871 break;
1872
1873 case ELK_GS_OPCODE_SET_WRITE_OFFSET:
1874 generate_gs_set_write_offset(p, dst, src[0], src[1]);
1875 break;
1876
1877 case ELK_GS_OPCODE_SET_VERTEX_COUNT:
1878 generate_gs_set_vertex_count(p, dst, src[0]);
1879 break;
1880
1881 case ELK_GS_OPCODE_FF_SYNC:
1882 generate_gs_ff_sync(p, inst, dst, src[0], src[1]);
1883 send_count++;
1884 break;
1885
1886 case ELK_GS_OPCODE_FF_SYNC_SET_PRIMITIVES:
1887 generate_gs_ff_sync_set_primitives(p, dst, src[0], src[1], src[2]);
1888 break;
1889
1890 case ELK_GS_OPCODE_SET_PRIMITIVE_ID:
1891 generate_gs_set_primitive_id(p, dst);
1892 break;
1893
1894 case ELK_GS_OPCODE_SET_DWORD_2:
1895 generate_gs_set_dword_2(p, dst, src[0]);
1896 break;
1897
1898 case ELK_GS_OPCODE_PREPARE_CHANNEL_MASKS:
1899 generate_gs_prepare_channel_masks(p, dst);
1900 break;
1901
1902 case ELK_GS_OPCODE_SET_CHANNEL_MASKS:
1903 generate_gs_set_channel_masks(p, dst, src[0]);
1904 break;
1905
1906 case ELK_GS_OPCODE_GET_INSTANCE_ID:
1907 generate_gs_get_instance_id(p, dst);
1908 break;
1909
1910 case ELK_VEC4_OPCODE_UNTYPED_ATOMIC:
1911 assert(src[2].file == ELK_IMMEDIATE_VALUE);
1912 elk_untyped_atomic(p, dst, src[0], src[1], src[2].ud, inst->mlen,
1913 !inst->dst.is_null(), inst->header_size);
1914 send_count++;
1915 break;
1916
1917 case ELK_VEC4_OPCODE_UNTYPED_SURFACE_READ:
1918 assert(!inst->header_size);
1919 assert(src[2].file == ELK_IMMEDIATE_VALUE);
1920 elk_untyped_surface_read(p, dst, src[0], src[1], inst->mlen,
1921 src[2].ud);
1922 send_count++;
1923 break;
1924
1925 case ELK_VEC4_OPCODE_UNTYPED_SURFACE_WRITE:
1926 assert(src[2].file == ELK_IMMEDIATE_VALUE);
1927 elk_untyped_surface_write(p, src[0], src[1], inst->mlen,
1928 src[2].ud, inst->header_size);
1929 send_count++;
1930 break;
1931
1932 case ELK_SHADER_OPCODE_MEMORY_FENCE:
1933 elk_memory_fence(p, dst, src[0], ELK_OPCODE_SEND,
1934 elk_message_target(inst->sfid),
1935 inst->desc,
1936 /* commit_enable */ false,
1937 /* bti */ 0);
1938 send_count++;
1939 break;
1940
1941 case ELK_SHADER_OPCODE_FIND_LIVE_CHANNEL:
1942 elk_find_live_channel(p, dst, false);
1943 break;
1944
1945 case ELK_SHADER_OPCODE_BROADCAST:
1946 assert(inst->force_writemask_all);
1947 elk_broadcast(p, dst, src[0], src[1]);
1948 break;
1949
1950 case ELK_VS_OPCODE_UNPACK_FLAGS_SIMD4X2:
1951 generate_unpack_flags(p, dst);
1952 break;
1953
1954 case ELK_VEC4_OPCODE_MOV_BYTES: {
1955 /* Moves the low byte from each channel, using an Align1 access mode
1956 * and a <4,1,0> source region.
1957 */
1958 assert(src[0].type == ELK_REGISTER_TYPE_UB ||
1959 src[0].type == ELK_REGISTER_TYPE_B);
1960
1961 elk_set_default_access_mode(p, ELK_ALIGN_1);
1962 src[0].vstride = ELK_VERTICAL_STRIDE_4;
1963 src[0].width = ELK_WIDTH_1;
1964 src[0].hstride = ELK_HORIZONTAL_STRIDE_0;
1965 elk_MOV(p, dst, src[0]);
1966 elk_set_default_access_mode(p, ELK_ALIGN_16);
1967 break;
1968 }
1969
1970 case ELK_VEC4_OPCODE_DOUBLE_TO_F32:
1971 case ELK_VEC4_OPCODE_DOUBLE_TO_D32:
1972 case ELK_VEC4_OPCODE_DOUBLE_TO_U32: {
1973 assert(type_sz(src[0].type) == 8);
1974 assert(type_sz(dst.type) == 8);
1975
1976 elk_reg_type dst_type;
1977
1978 switch (inst->opcode) {
1979 case ELK_VEC4_OPCODE_DOUBLE_TO_F32:
1980 dst_type = ELK_REGISTER_TYPE_F;
1981 break;
1982 case ELK_VEC4_OPCODE_DOUBLE_TO_D32:
1983 dst_type = ELK_REGISTER_TYPE_D;
1984 break;
1985 case ELK_VEC4_OPCODE_DOUBLE_TO_U32:
1986 dst_type = ELK_REGISTER_TYPE_UD;
1987 break;
1988 default:
1989 unreachable("Not supported conversion");
1990 }
1991 dst = retype(dst, dst_type);
1992
1993 elk_set_default_access_mode(p, ELK_ALIGN_1);
1994
1995          /* When converting DF->F we normally set the destination stride to 2
1996           * to satisfy alignment requirements.  On IVB/BYT, however, each DF
1997           * conversion implicitly writes two floats, the first being the
1998           * converted value, so the destination stride must be 1 rather than 2.
1999           */
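         /* Illustratively: with four DF channels on IVB/BYT, a stride-1
          * destination still receives f0, <junk>, f1, <junk>, f2, <junk>,
          * f3, <junk>, i.e. the converted values land at the same even dword
          * offsets that an explicit stride of 2 produces on later platforms.
          */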
2000 struct elk_reg spread_dst;
2001 if (devinfo->verx10 == 70)
2002 spread_dst = stride(dst, 8, 4, 1);
2003 else
2004 spread_dst = stride(dst, 8, 4, 2);
2005
2006 elk_MOV(p, spread_dst, src[0]);
2007
2008 elk_set_default_access_mode(p, ELK_ALIGN_16);
2009 break;
2010 }
2011
2012 case ELK_VEC4_OPCODE_TO_DOUBLE: {
2013 assert(type_sz(src[0].type) == 4);
2014 assert(type_sz(dst.type) == 8);
2015
2016 elk_set_default_access_mode(p, ELK_ALIGN_1);
2017
2018 elk_MOV(p, dst, src[0]);
2019
2020 elk_set_default_access_mode(p, ELK_ALIGN_16);
2021 break;
2022 }
2023
2024 case ELK_VEC4_OPCODE_PICK_LOW_32BIT:
2025 case ELK_VEC4_OPCODE_PICK_HIGH_32BIT: {
2026 /* Stores the low/high 32-bit of each 64-bit element in src[0] into
2027 * dst using ALIGN1 mode and a <8,4,2>:UD region on the source.
2028 */
2029 assert(type_sz(src[0].type) == 8);
2030 assert(type_sz(dst.type) == 4);
2031
2032 elk_set_default_access_mode(p, ELK_ALIGN_1);
2033
2034 dst = retype(dst, ELK_REGISTER_TYPE_UD);
2035 dst.hstride = ELK_HORIZONTAL_STRIDE_1;
2036
2037 src[0] = retype(src[0], ELK_REGISTER_TYPE_UD);
2038 if (inst->opcode == ELK_VEC4_OPCODE_PICK_HIGH_32BIT)
2039 src[0] = suboffset(src[0], 1);
2040 src[0] = spread(src[0], 2);
2041 elk_MOV(p, dst, src[0]);
2042
2043 elk_set_default_access_mode(p, ELK_ALIGN_16);
2044 break;
2045 }
2046
2047 case ELK_VEC4_OPCODE_SET_LOW_32BIT:
2048 case ELK_VEC4_OPCODE_SET_HIGH_32BIT: {
2049 /* Reads consecutive 32-bit elements from src[0] and writes
2050 * them to the low/high 32-bit of each 64-bit element in dst.
2051 */
2052 assert(type_sz(src[0].type) == 4);
2053 assert(type_sz(dst.type) == 8);
2054
2055 elk_set_default_access_mode(p, ELK_ALIGN_1);
2056
2057 dst = retype(dst, ELK_REGISTER_TYPE_UD);
2058 if (inst->opcode == ELK_VEC4_OPCODE_SET_HIGH_32BIT)
2059 dst = suboffset(dst, 1);
2060 dst.hstride = ELK_HORIZONTAL_STRIDE_2;
2061
2062 src[0] = retype(src[0], ELK_REGISTER_TYPE_UD);
2063 elk_MOV(p, dst, src[0]);
2064
2065 elk_set_default_access_mode(p, ELK_ALIGN_16);
2066 break;
2067 }
2068
2069 case ELK_VEC4_OPCODE_PACK_BYTES: {
2070 /* Is effectively:
2071 *
2072 * mov(8) dst<16,4,1>:UB src<4,1,0>:UB
2073 *
2074        * but a destination's only regioning parameter is its horizontal stride,
2075        * so instead we have to use two instructions:
2076        *
2077        *   mov(4) dst<1>:UB     src<4,1,0>:UB
2078        *   mov(4) dst.16<1>:UB  src.16<4,1,0>:UB
2079        *
2080        * which pack four bytes each from the low and high four DWords.
2081 */
2082 assert(util_is_power_of_two_nonzero(dst.writemask));
2083 unsigned offset = __builtin_ctz(dst.writemask);
2084
2085 dst.type = ELK_REGISTER_TYPE_UB;
2086
2087 elk_set_default_access_mode(p, ELK_ALIGN_1);
2088
2089 src[0].type = ELK_REGISTER_TYPE_UB;
2090 src[0].vstride = ELK_VERTICAL_STRIDE_4;
2091 src[0].width = ELK_WIDTH_1;
2092 src[0].hstride = ELK_HORIZONTAL_STRIDE_0;
2093 dst.subnr = offset * 4;
2094 struct elk_inst *insn = elk_MOV(p, dst, src[0]);
2095 elk_inst_set_exec_size(p->devinfo, insn, ELK_EXECUTE_4);
2096 elk_inst_set_no_dd_clear(p->devinfo, insn, true);
2097 elk_inst_set_no_dd_check(p->devinfo, insn, inst->no_dd_check);
2098
2099 src[0].subnr = 16;
2100 dst.subnr = 16 + offset * 4;
2101 insn = elk_MOV(p, dst, src[0]);
2102 elk_inst_set_exec_size(p->devinfo, insn, ELK_EXECUTE_4);
2103 elk_inst_set_no_dd_clear(p->devinfo, insn, inst->no_dd_clear);
2104 elk_inst_set_no_dd_check(p->devinfo, insn, true);
2105
2106 elk_set_default_access_mode(p, ELK_ALIGN_16);
2107 break;
2108 }
2109
2110 case ELK_VEC4_OPCODE_ZERO_OOB_PUSH_REGS:
2111 generate_zero_oob_push_regs(p, &prog_data->base, dst, src[0]);
2112 break;
2113
2114 case ELK_VEC4_TCS_OPCODE_URB_WRITE:
2115 generate_tcs_urb_write(p, inst, src[0]);
2116 send_count++;
2117 break;
2118
2119 case ELK_VEC4_OPCODE_URB_READ:
2120 generate_vec4_urb_read(p, inst, dst, src[0]);
2121 send_count++;
2122 break;
2123
2124 case ELK_VEC4_TCS_OPCODE_SET_INPUT_URB_OFFSETS:
2125 generate_tcs_input_urb_offsets(p, dst, src[0], src[1]);
2126 break;
2127
2128 case ELK_VEC4_TCS_OPCODE_SET_OUTPUT_URB_OFFSETS:
2129 generate_tcs_output_urb_offsets(p, dst, src[0], src[1]);
2130 break;
2131
2132 case ELK_TCS_OPCODE_GET_INSTANCE_ID:
2133 generate_tcs_get_instance_id(p, dst);
2134 break;
2135
2136 case ELK_TCS_OPCODE_GET_PRIMITIVE_ID:
2137 generate_tcs_get_primitive_id(p, dst);
2138 break;
2139
2140 case ELK_TCS_OPCODE_CREATE_BARRIER_HEADER:
2141 generate_tcs_create_barrier_header(p, prog_data, dst);
2142 break;
2143
2144 case ELK_TES_OPCODE_CREATE_INPUT_READ_HEADER:
2145 generate_tes_create_input_read_header(p, dst);
2146 break;
2147
2148 case ELK_TES_OPCODE_ADD_INDIRECT_URB_OFFSET:
2149 generate_tes_add_indirect_urb_offset(p, dst, src[0], src[1]);
2150 break;
2151
2152 case ELK_TES_OPCODE_GET_PRIMITIVE_ID:
2153 generate_tes_get_primitive_id(p, dst);
2154 break;
2155
2156 case ELK_TCS_OPCODE_SRC0_010_IS_ZERO:
2157 /* If src_reg had stride like elk_fs_reg, we wouldn't need this. */
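         /* Presumably this MOV to the null register exists only to generate a
          * flag write: the conditional modifier attached to the IR instruction
          * is applied to it by the generic handling after this switch.
          */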
2158 elk_MOV(p, elk_null_reg(), stride(src[0], 0, 1, 0));
2159 break;
2160
2161 case ELK_TCS_OPCODE_RELEASE_INPUT:
2162 generate_tcs_release_input(p, dst, src[0], src[1]);
2163 send_count++;
2164 break;
2165
2166 case ELK_TCS_OPCODE_THREAD_END:
2167 generate_tcs_thread_end(p, inst);
2168 send_count++;
2169 break;
2170
2171 case ELK_SHADER_OPCODE_BARRIER:
2172 elk_barrier(p, src[0]);
2173 elk_WAIT(p);
2174 send_count++;
2175 break;
2176
2177 case ELK_SHADER_OPCODE_MOV_INDIRECT:
2178 generate_mov_indirect(p, inst, dst, src[0], src[1]);
2179 break;
2180
2181 case ELK_OPCODE_DIM:
2182 assert(devinfo->verx10 == 75);
2183 assert(src[0].type == ELK_REGISTER_TYPE_DF);
2184 assert(dst.type == ELK_REGISTER_TYPE_DF);
2185 elk_DIM(p, dst, retype(src[0], ELK_REGISTER_TYPE_F));
2186 break;
2187
2188 case ELK_SHADER_OPCODE_RND_MODE: {
2189 assert(src[0].file == ELK_IMMEDIATE_VALUE);
2190          /*
2191           * Changes the floating-point rounding mode by updating the control
2192           * register field defined at bits cr0.0[5-6].
2193           */
2194 enum elk_rnd_mode mode =
2195 (enum elk_rnd_mode) (src[0].d << ELK_CR0_RND_MODE_SHIFT);
2196 elk_float_controls_mode(p, mode, ELK_CR0_RND_MODE_MASK);
2197 }
2198 break;
2199
2200 default:
2201 unreachable("Unsupported opcode");
2202 }
2203
2204 if (inst->opcode == ELK_VEC4_OPCODE_PACK_BYTES) {
2205 /* Handled dependency hints in the generator. */
2206
2207 assert(!inst->conditional_mod);
2208 } else if (inst->no_dd_clear || inst->no_dd_check || inst->conditional_mod) {
2209 assert(p->nr_insn == pre_emit_nr_insn + 1 ||
2210 !"conditional_mod, no_dd_check, or no_dd_clear set for IR "
2211 "emitting more than 1 instruction");
2212
2213 elk_inst *last = &p->store[pre_emit_nr_insn];
2214
2215 if (inst->conditional_mod)
2216 elk_inst_set_cond_modifier(p->devinfo, last, inst->conditional_mod);
2217 elk_inst_set_no_dd_clear(p->devinfo, last, inst->no_dd_clear);
2218 elk_inst_set_no_dd_check(p->devinfo, last, inst->no_dd_check);
2219 }
2220 }
2221
2222 elk_set_uip_jip(p, 0);
2223
2224 /* end of program sentinel */
2225 elk_disasm_new_inst_group(elk_disasm_info, p->next_insn_offset);
2226
2227 #ifndef NDEBUG
2228 bool validated =
2229 #else
2230 if (unlikely(debug_enabled))
2231 #endif
2232 elk_validate_instructions(&compiler->isa, p->store,
2233 0, p->next_insn_offset,
2234 elk_disasm_info);
2235
2236 int before_size = p->next_insn_offset;
2237 elk_compact_instructions(p, 0, elk_disasm_info);
2238 int after_size = p->next_insn_offset;
2239
2240 bool dump_shader_bin = elk_should_dump_shader_bin();
2241 unsigned char sha1[21];
2242 char sha1buf[41];
2243
2244 if (unlikely(debug_enabled || dump_shader_bin)) {
2245 _mesa_sha1_compute(p->store, p->next_insn_offset, sha1);
2246 _mesa_sha1_format(sha1buf, sha1);
2247 }
2248
2249 if (unlikely(dump_shader_bin))
2250 elk_dump_shader_bin(p->store, 0, p->next_insn_offset, sha1buf);
2251
2252 if (unlikely(debug_enabled)) {
2253 fprintf(stderr, "Native code for %s %s shader %s (src_hash 0x%08x) (sha1 %s):\n",
2254 nir->info.label ? nir->info.label : "unnamed",
2255 _mesa_shader_stage_to_string(nir->info.stage), nir->info.name,
2256 params->source_hash, sha1buf);
2257
2258 fprintf(stderr, "%s vec4 shader: %d instructions. %d loops. %u cycles. %d:%d "
2259 "spills:fills, %u sends. Compacted %d to %d bytes (%.0f%%)\n",
2260 stage_abbrev, before_size / 16, loop_count, perf.latency,
2261 spill_count, fill_count, send_count, before_size, after_size,
2262 100.0f * (before_size - after_size) / before_size);
2263
2264 /* overriding the shader makes elk_disasm_info invalid */
2265 if (!elk_try_override_assembly(p, 0, sha1buf)) {
2266 elk_dump_assembly(p->store, 0, p->next_insn_offset,
2267 elk_disasm_info, perf.block_latency);
2268 } else {
2269 fprintf(stderr, "Successfully overrode shader with sha1 %s\n\n", sha1buf);
2270 }
2271 }
2272 ralloc_free(elk_disasm_info);
2273 assert(validated);
2274
2275 elk_shader_debug_log(compiler, params->log_data,
2276 "%s vec4 shader: %d inst, %d loops, %u cycles, "
2277 "%d:%d spills:fills, %u sends, "
2278 "compacted %d to %d bytes.\n",
2279 stage_abbrev, before_size / 16,
2280 loop_count, perf.latency, spill_count,
2281 fill_count, send_count, before_size, after_size);
2282 if (stats) {
2283 stats->dispatch_width = 0;
2284 stats->max_dispatch_width = 0;
2285 stats->instructions = before_size / 16;
2286 stats->sends = send_count;
2287 stats->loops = loop_count;
2288 stats->cycles = perf.latency;
2289 stats->spills = spill_count;
2290 stats->fills = fill_count;
2291 }
2292 }
2293
2294 extern "C" const unsigned *
2295 elk_vec4_generate_assembly(const struct elk_compiler *compiler,
2296 const struct elk_compile_params *params,
2297 const nir_shader *nir,
2298 struct elk_vue_prog_data *prog_data,
2299 const struct elk_cfg_t *cfg,
2300 const performance &perf,
2301 bool debug_enabled)
2302 {
2303 struct elk_codegen *p = rzalloc(params->mem_ctx, struct elk_codegen);
2304 elk_init_codegen(&compiler->isa, p, params->mem_ctx);
2305 elk_set_default_access_mode(p, ELK_ALIGN_16);
2306
2307 generate_code(p, compiler, params,
2308 nir, prog_data, cfg, perf,
2309 params->stats, debug_enabled);
2310
2311 assert(prog_data->base.const_data_size == 0);
2312 if (nir->constant_data_size > 0) {
2313 prog_data->base.const_data_size = nir->constant_data_size;
2314 prog_data->base.const_data_offset =
2315 elk_append_data(p, nir->constant_data, nir->constant_data_size, 32);
2316 }
2317
2318 return elk_get_program(p, &prog_data->base.program_size);
2319 }
2320