/* Copyright © 2011 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "elk_vec4.h"
#include "elk_cfg.h"
#include "elk_eu.h"
#include "elk_disasm_info.h"
#include "dev/intel_debug.h"
#include "util/mesa-sha1.h"

using namespace elk;

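/* Single-source math on Gfx4 goes through the message-based extended math
 * unit; the operand is staged in the MRF starting at inst->base_mrf.
 */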
static void
generate_math1_gfx4(struct elk_codegen *p,
                    vec4_instruction *inst,
                    struct elk_reg dst,
                    struct elk_reg src)
{
   elk_gfx4_math(p,
                 dst,
                 elk_math_function(inst->opcode),
                 inst->base_mrf,
                 src,
                 ELK_MATH_PRECISION_FULL);
}

static void
check_gfx6_math_src_arg(struct elk_reg src)
{
   /* Source swizzles are ignored. */
   assert(!src.abs);
   assert(!src.negate);
   assert(src.swizzle == ELK_SWIZZLE_XYZW);
}

static void
generate_math_gfx6(struct elk_codegen *p,
                   vec4_instruction *inst,
                   struct elk_reg dst,
                   struct elk_reg src0,
                   struct elk_reg src1)
{
   /* Can't do writemask because math can't be align16. */
   assert(dst.writemask == WRITEMASK_XYZW);
   /* Source swizzles are ignored. */
   check_gfx6_math_src_arg(src0);
   if (src1.file == ELK_GENERAL_REGISTER_FILE)
      check_gfx6_math_src_arg(src1);

   elk_set_default_access_mode(p, ELK_ALIGN_1);
   elk_gfx6_math(p, dst, elk_math_function(inst->opcode), src0, src1);
   elk_set_default_access_mode(p, ELK_ALIGN_16);
}

static void
generate_math2_gfx4(struct elk_codegen *p,
                    vec4_instruction *inst,
                    struct elk_reg dst,
                    struct elk_reg src0,
                    struct elk_reg src1)
{
   /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
    * "Message Payload":
    *
    * "Operand0[7].  For the INT DIV functions, this operand is the
    *  denominator."
    *  ...
    * "Operand1[7].  For the INT DIV functions, this operand is the
    *  numerator."
    */
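   /* Two-source math here is either POW or one of the INT DIV functions
    * (quotient/remainder), so anything that isn't POW takes the swapped
    * INT DIV operand order described above.
    */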
   bool is_int_div = inst->opcode != ELK_SHADER_OPCODE_POW;
   struct elk_reg &op0 = is_int_div ? src1 : src0;
   struct elk_reg &op1 = is_int_div ? src0 : src1;

   elk_push_insn_state(p);
   elk_set_default_saturate(p, false);
   elk_set_default_predicate_control(p, ELK_PREDICATE_NONE);
   elk_set_default_flag_reg(p, 0, 0);
   elk_MOV(p, retype(elk_message_reg(inst->base_mrf + 1), op1.type), op1);
   elk_pop_insn_state(p);

   elk_gfx4_math(p,
                 dst,
                 elk_math_function(inst->opcode),
                 inst->base_mrf,
                 op0,
                 ELK_MATH_PRECISION_FULL);
}

static void
generate_tex(struct elk_codegen *p,
             struct elk_vue_prog_data *prog_data,
             gl_shader_stage stage,
             vec4_instruction *inst,
             struct elk_reg dst,
             struct elk_reg src,
             struct elk_reg surface_index,
             struct elk_reg sampler_index)
{
   const struct intel_device_info *devinfo = p->devinfo;
   int msg_type = -1;

   if (devinfo->ver >= 5) {
      switch (inst->opcode) {
      case ELK_SHADER_OPCODE_TEX:
      case ELK_SHADER_OPCODE_TXL:
         if (inst->shadow_compare) {
            msg_type = GFX5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE;
         } else {
            msg_type = GFX5_SAMPLER_MESSAGE_SAMPLE_LOD;
         }
         break;
      case ELK_SHADER_OPCODE_TXD:
         if (inst->shadow_compare) {
            /* Gfx7.5+.  Otherwise, lowered by elk_lower_texture_gradients(). */
            assert(devinfo->verx10 == 75);
            msg_type = HSW_SAMPLER_MESSAGE_SAMPLE_DERIV_COMPARE;
         } else {
            msg_type = GFX5_SAMPLER_MESSAGE_SAMPLE_DERIVS;
         }
         break;
      case ELK_SHADER_OPCODE_TXF:
         msg_type = GFX5_SAMPLER_MESSAGE_SAMPLE_LD;
         break;
      case ELK_SHADER_OPCODE_TXF_CMS:
         if (devinfo->ver >= 7)
            msg_type = GFX7_SAMPLER_MESSAGE_SAMPLE_LD2DMS;
         else
            msg_type = GFX5_SAMPLER_MESSAGE_SAMPLE_LD;
         break;
      case ELK_SHADER_OPCODE_TXF_MCS:
         assert(devinfo->ver >= 7);
         msg_type = GFX7_SAMPLER_MESSAGE_SAMPLE_LD_MCS;
         break;
      case ELK_SHADER_OPCODE_TXS:
         msg_type = GFX5_SAMPLER_MESSAGE_SAMPLE_RESINFO;
         break;
      case ELK_SHADER_OPCODE_TG4:
         if (inst->shadow_compare) {
            msg_type = GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4_C;
         } else {
            msg_type = GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4;
         }
         break;
      case ELK_SHADER_OPCODE_TG4_OFFSET:
         if (inst->shadow_compare) {
            msg_type = GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_C;
         } else {
            msg_type = GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO;
         }
         break;
      case ELK_SHADER_OPCODE_SAMPLEINFO:
         msg_type = GFX6_SAMPLER_MESSAGE_SAMPLE_SAMPLEINFO;
         break;
      default:
         unreachable("should not get here: invalid vec4 texture opcode");
      }
   } else {
      switch (inst->opcode) {
      case ELK_SHADER_OPCODE_TEX:
      case ELK_SHADER_OPCODE_TXL:
         if (inst->shadow_compare) {
            msg_type = ELK_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_LOD_COMPARE;
            assert(inst->mlen == 3);
         } else {
            msg_type = ELK_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_LOD;
            assert(inst->mlen == 2);
         }
         break;
      case ELK_SHADER_OPCODE_TXD:
         /* There is no sample_d_c message; comparisons are done manually. */
         msg_type = ELK_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_GRADIENTS;
         assert(inst->mlen == 4);
         break;
      case ELK_SHADER_OPCODE_TXF:
         msg_type = ELK_SAMPLER_MESSAGE_SIMD4X2_LD;
         assert(inst->mlen == 2);
         break;
      case ELK_SHADER_OPCODE_TXS:
         msg_type = ELK_SAMPLER_MESSAGE_SIMD4X2_RESINFO;
         assert(inst->mlen == 2);
         break;
      default:
         unreachable("should not get here: invalid vec4 texture opcode");
      }
   }

   assert(msg_type != -1);

   assert(sampler_index.type == ELK_REGISTER_TYPE_UD);

   /* Load the message header if present.  If there's a texture offset, we need
    * to set it up explicitly and load the offset bitfield.  Otherwise, we can
    * use an implied move from g0 to the first message register.
    */
   if (inst->header_size != 0) {
      if (devinfo->ver < 6 && !inst->offset) {
         /* Set up an implied move from g0 to the MRF. */
         src = elk_vec8_grf(0, 0);
      } else {
         struct elk_reg header =
            retype(elk_message_reg(inst->base_mrf), ELK_REGISTER_TYPE_UD);
         uint32_t dw2 = 0;

         /* Explicitly set up the message header by copying g0 to the MRF. */
         elk_push_insn_state(p);
         elk_set_default_mask_control(p, ELK_MASK_DISABLE);
         elk_MOV(p, header, retype(elk_vec8_grf(0, 0), ELK_REGISTER_TYPE_UD));

         elk_set_default_access_mode(p, ELK_ALIGN_1);

         if (inst->offset)
            /* Set the texel offset bits in DWord 2. */
            dw2 = inst->offset;

         /* The VS, DS, and FS stages have the g0.2 payload delivered as 0,
          * so header0.2 is 0 when g0 is copied.  The HS and GS stages do
          * not, so we must set it to 0 to avoid setting undesirable bits
          * in the message header.
          */
         if (dw2 ||
             stage == MESA_SHADER_TESS_CTRL ||
             stage == MESA_SHADER_GEOMETRY) {
            elk_MOV(p, get_element_ud(header, 2), elk_imm_ud(dw2));
         }

         elk_adjust_sampler_state_pointer(p, header, sampler_index);
         elk_pop_insn_state(p);
      }
   }

   uint32_t return_format;

   switch (dst.type) {
   case ELK_REGISTER_TYPE_D:
      return_format = ELK_SAMPLER_RETURN_FORMAT_SINT32;
      break;
   case ELK_REGISTER_TYPE_UD:
      return_format = ELK_SAMPLER_RETURN_FORMAT_UINT32;
      break;
   default:
      return_format = ELK_SAMPLER_RETURN_FORMAT_FLOAT32;
      break;
   }

   /* Stomp the resinfo output type to UINT32.  On gens 4-5, the output type
    * is set as part of the message descriptor.  On gfx4, the PRM seems to
    * allow UINT32 and FLOAT32 (i965 PRM, Vol. 4 Section 4.8.1.1), but on
    * later gens UINT32 is required.  Once you hit Sandy Bridge, the bit is
    * gone from the message descriptor entirely and you just get UINT32 all
    * the time regardless.  Since we can really only do non-UINT32 on gfx4,
    * just stomp it to UINT32 all the time.
    */
   if (inst->opcode == ELK_SHADER_OPCODE_TXS)
      return_format = ELK_SAMPLER_RETURN_FORMAT_UINT32;

   if (surface_index.file == ELK_IMMEDIATE_VALUE &&
       sampler_index.file == ELK_IMMEDIATE_VALUE) {
      uint32_t surface = surface_index.ud;
      uint32_t sampler = sampler_index.ud;

      elk_SAMPLE(p,
                 dst,
                 inst->base_mrf,
                 src,
                 surface,
                 sampler % 16,
                 msg_type,
                 1, /* response length */
                 inst->mlen,
                 inst->header_size != 0,
                 ELK_SAMPLER_SIMD_MODE_SIMD4X2,
                 return_format);
   } else {
      /* Non-constant sampler index. */

      struct elk_reg addr = vec1(retype(elk_address_reg(0), ELK_REGISTER_TYPE_UD));
      struct elk_reg surface_reg = vec1(retype(surface_index, ELK_REGISTER_TYPE_UD));
      struct elk_reg sampler_reg = vec1(retype(sampler_index, ELK_REGISTER_TYPE_UD));

      elk_push_insn_state(p);
      elk_set_default_mask_control(p, ELK_MASK_DISABLE);
      elk_set_default_access_mode(p, ELK_ALIGN_1);

      if (elk_regs_equal(&surface_reg, &sampler_reg)) {
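         /* sampler == surface: multiplying by 0x101 replicates the index
          * into bits 15:8 and 7:0 at once, since s * 0x101 == (s << 8) | s
          * for any s < 256.
          */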
         elk_MUL(p, addr, sampler_reg, elk_imm_uw(0x101));
      } else {
         if (sampler_reg.file == ELK_IMMEDIATE_VALUE) {
            elk_OR(p, addr, surface_reg, elk_imm_ud(sampler_reg.ud << 8));
         } else {
            elk_SHL(p, addr, sampler_reg, elk_imm_ud(8));
            elk_OR(p, addr, addr, surface_reg);
         }
      }
      elk_AND(p, addr, addr, elk_imm_ud(0xfff));

      elk_pop_insn_state(p);

      if (inst->base_mrf != -1)
         elk_gfx6_resolve_implied_move(p, &src, inst->base_mrf);

      /* dst = send(offset, a0.0 | <descriptor>) */
      elk_send_indirect_message(
         p, ELK_SFID_SAMPLER, dst, src, addr,
         elk_message_desc(devinfo, inst->mlen, 1, inst->header_size) |
         elk_sampler_desc(devinfo,
                          0 /* surface */,
                          0 /* sampler */,
                          msg_type,
                          ELK_SAMPLER_SIMD_MODE_SIMD4X2,
                          return_format),
         false /* EOT */);

      /* The visitor knows more than we do about the surface limit required,
       * so it has already done the marking.
       */
   }
}

static void
generate_vs_urb_write(struct elk_codegen *p, vec4_instruction *inst)
{
   elk_urb_WRITE(p,
                 elk_null_reg(), /* dest */
                 inst->base_mrf, /* starting mrf reg nr */
                 elk_vec8_grf(0, 0), /* src */
                 inst->urb_write_flags,
                 inst->mlen,
                 0,             /* response len */
                 inst->offset,  /* urb destination offset */
                 ELK_URB_SWIZZLE_INTERLEAVE);
}

static void
generate_gs_urb_write(struct elk_codegen *p, vec4_instruction *inst)
{
   struct elk_reg src = elk_message_reg(inst->base_mrf);
   elk_urb_WRITE(p,
                 elk_null_reg(), /* dest */
                 inst->base_mrf, /* starting mrf reg nr */
                 src,
                 inst->urb_write_flags,
                 inst->mlen,
                 0,             /* response len */
                 inst->offset,  /* urb destination offset */
                 ELK_URB_SWIZZLE_INTERLEAVE);
}

static void
generate_gs_urb_write_allocate(struct elk_codegen *p, vec4_instruction *inst)
{
   struct elk_reg src = elk_message_reg(inst->base_mrf);

   /* We use the temporary passed in src0 as the writeback register */
   elk_urb_WRITE(p,
                 inst->src[0].as_elk_reg(), /* dest */
                 inst->base_mrf, /* starting mrf reg nr */
                 src,
                 ELK_URB_WRITE_ALLOCATE_COMPLETE,
                 inst->mlen,
                 1, /* response len */
                 inst->offset,  /* urb destination offset */
                 ELK_URB_SWIZZLE_INTERLEAVE);

   /* Now put the allocated urb handle in dst.0 */
   elk_push_insn_state(p);
   elk_set_default_access_mode(p, ELK_ALIGN_1);
   elk_set_default_mask_control(p, ELK_MASK_DISABLE);
   elk_MOV(p, get_element_ud(inst->dst.as_elk_reg(), 0),
           get_element_ud(inst->src[0].as_elk_reg(), 0));
   elk_pop_insn_state(p);
}

static void
generate_gs_thread_end(struct elk_codegen *p, vec4_instruction *inst)
{
   struct elk_reg src = elk_message_reg(inst->base_mrf);
   elk_urb_WRITE(p,
                 elk_null_reg(), /* dest */
                 inst->base_mrf, /* starting mrf reg nr */
                 src,
                 ELK_URB_WRITE_EOT | inst->urb_write_flags,
                 inst->mlen,
                 0,              /* response len */
                 0,              /* urb destination offset */
                 ELK_URB_SWIZZLE_INTERLEAVE);
}

static void
generate_gs_set_write_offset(struct elk_codegen *p,
                             struct elk_reg dst,
                             struct elk_reg src0,
                             struct elk_reg src1)
{
   /* From p22 of volume 4 part 2 of the Ivy Bridge PRM (2.4.3.1 Message
    * Header: M0.3):
    *
    *     Slot 0 Offset. This field, after adding to the Global Offset field
    *     in the message descriptor, specifies the offset (in 256-bit units)
    *     from the start of the URB entry, as referenced by URB Handle 0, at
    *     which the data will be accessed.
    *
    * Similar text describes DWORD M0.4, which is slot 1 offset.
    *
    * Therefore, we want to multiply DWORDs 0 and 4 of src0 (the x components
    * of the register for geometry shader invocations 0 and 1) by the
    * immediate value in src1, and store the result in DWORDs 3 and 4 of dst.
    *
    * We can do this with the following EU instruction:
    *
    *     mul(2) dst.3<1>UD src0<8;2,4>UD src1<...>UW   { Align1 WE_all }
    */
   elk_push_insn_state(p);
   elk_set_default_access_mode(p, ELK_ALIGN_1);
   elk_set_default_mask_control(p, ELK_MASK_DISABLE);
   assert(p->devinfo->ver >= 7 &&
          src1.file == ELK_IMMEDIATE_VALUE &&
          src1.type == ELK_REGISTER_TYPE_UD &&
          src1.ud <= USHRT_MAX);
   if (src0.file == ELK_IMMEDIATE_VALUE) {
      elk_MOV(p, suboffset(stride(dst, 2, 2, 1), 3),
              elk_imm_ud(src0.ud * src1.ud));
   } else {
      if (src1.file == ELK_IMMEDIATE_VALUE) {
         src1 = elk_imm_uw(src1.ud);
      }
      elk_MUL(p, suboffset(stride(dst, 2, 2, 1), 3), stride(src0, 8, 2, 4),
              retype(src1, ELK_REGISTER_TYPE_UW));
   }
   elk_pop_insn_state(p);
}

static void
generate_gs_set_vertex_count(struct elk_codegen *p,
                             struct elk_reg dst,
                             struct elk_reg src)
{
   elk_push_insn_state(p);
   elk_set_default_mask_control(p, ELK_MASK_DISABLE);

   /* If we think of the src and dst registers as composed of 8 DWORDs each,
    * we want to pick up the contents of DWORDs 0 and 4 from src, truncate
    * them to WORDs, and then pack them into DWORD 2 of dst.
    *
    * It's easier to get the EU to do this if we think of the src and dst
    * registers as composed of 16 WORDs each; then, we want to pick up the
    * contents of WORDs 0 and 8 from src, and pack them into WORDs 4 and 5
    * of dst.
    *
    * We can do that with the following EU instruction:
    *
    *     mov (2) dst.4<1>:uw src<8;1,0>:uw   { Align1, Q1, NoMask }
    */
   elk_set_default_access_mode(p, ELK_ALIGN_1);
   elk_MOV(p,
           suboffset(stride(retype(dst, ELK_REGISTER_TYPE_UW), 2, 2, 1), 4),
           stride(retype(src, ELK_REGISTER_TYPE_UW), 8, 1, 0));

   elk_pop_insn_state(p);
}

static void
generate_gs_svb_write(struct elk_codegen *p,
                      vec4_instruction *inst,
                      struct elk_reg dst,
                      struct elk_reg src0,
                      struct elk_reg src1)
{
   int binding = inst->sol_binding;
   bool final_write = inst->sol_final_write;

   elk_push_insn_state(p);
   elk_set_default_exec_size(p, ELK_EXECUTE_4);
   /* Copy Vertex data into M0.x */
   elk_MOV(p, stride(dst, 4, 4, 1),
           stride(retype(src0, ELK_REGISTER_TYPE_UD), 4, 4, 1));
   elk_pop_insn_state(p);

   elk_push_insn_state(p);
   /* Send SVB Write */
   elk_svb_write(p,
                 final_write ? src1 : elk_null_reg(), /* dest == src1 */
                 1, /* msg_reg_nr */
                 dst, /* src0 == previous dst */
                 ELK_GFX6_SOL_BINDING_START + binding, /* binding_table_index */
                 final_write); /* send_commit_msg */

   /* Finally, wait for the write commit to occur so that we can proceed to
    * other things safely.
    *
    * From the Sandybridge PRM, Volume 4, Part 1, Section 3.3:
    *
    *   The write commit does not modify the destination register, but
    *   merely clears the dependency associated with the destination
    *   register. Thus, a simple “mov” instruction using the register as a
    *   source is sufficient to wait for the write commit to occur.
    */
   if (final_write) {
      elk_MOV(p, src1, src1);
   }
   elk_pop_insn_state(p);
}

static void
generate_gs_svb_set_destination_index(struct elk_codegen *p,
                                      vec4_instruction *inst,
                                      struct elk_reg dst,
                                      struct elk_reg src)
{
   int vertex = inst->sol_vertex;
   elk_push_insn_state(p);
   elk_set_default_access_mode(p, ELK_ALIGN_1);
   elk_set_default_mask_control(p, ELK_MASK_DISABLE);
   elk_MOV(p, get_element_ud(dst, 5), get_element_ud(src, vertex));
   elk_pop_insn_state(p);
}

static void
generate_gs_set_dword_2(struct elk_codegen *p,
                        struct elk_reg dst,
                        struct elk_reg src)
{
   elk_push_insn_state(p);
   elk_set_default_access_mode(p, ELK_ALIGN_1);
   elk_set_default_mask_control(p, ELK_MASK_DISABLE);
   elk_MOV(p, suboffset(vec1(dst), 2), suboffset(vec1(src), 0));
   elk_pop_insn_state(p);
}

static void
generate_gs_prepare_channel_masks(struct elk_codegen *p,
                                  struct elk_reg dst)
{
   /* We want to left shift just DWORD 4 (the x component belonging to the
    * second geometry shader invocation) by 4 bits.  So generate the
    * instruction:
    *
    *     shl(1) dst.4<1>UD dst.4<0,1,0>UD 4UD { align1 WE_all }
    */
   dst = suboffset(vec1(dst), 4);
   elk_push_insn_state(p);
   elk_set_default_access_mode(p, ELK_ALIGN_1);
   elk_set_default_mask_control(p, ELK_MASK_DISABLE);
   elk_SHL(p, dst, dst, elk_imm_ud(4));
   elk_pop_insn_state(p);
}

static void
generate_gs_set_channel_masks(struct elk_codegen *p,
                              struct elk_reg dst,
                              struct elk_reg src)
{
   /* From p21 of volume 4 part 2 of the Ivy Bridge PRM (2.4.3.1 Message
    * Header: M0.5):
    *
    *     15 Vertex 1 DATA [3] / Vertex 0 DATA[7] Channel Mask
    *
    *        When Swizzle Control = URB_INTERLEAVED this bit controls Vertex 1
    *        DATA[3], when Swizzle Control = URB_NOSWIZZLE this bit controls
    *        Vertex 0 DATA[7].  This bit is ANDed with the corresponding
    *        channel enable to determine the final channel enable.  For the
    *        URB_READ_OWORD & URB_READ_HWORD messages, when final channel
    *        enable is 1 it indicates that Vertex 1 DATA [3] will be included
    *        in the writeback message.  For the URB_WRITE_OWORD &
    *        URB_WRITE_HWORD messages, when final channel enable is 1 it
    *        indicates that Vertex 1 DATA [3] will be written to the surface.
    *
    *        0: Vertex 1 DATA [3] / Vertex 0 DATA[7] channel not included
    *        1: Vertex 1 DATA [3] / Vertex 0 DATA[7] channel included
    *
    *     14 Vertex 1 DATA [2] Channel Mask
    *     13 Vertex 1 DATA [1] Channel Mask
    *     12 Vertex 1 DATA [0] Channel Mask
    *     11 Vertex 0 DATA [3] Channel Mask
    *     10 Vertex 0 DATA [2] Channel Mask
    *      9 Vertex 0 DATA [1] Channel Mask
    *      8 Vertex 0 DATA [0] Channel Mask
    *
    * (This is from a section of the PRM that is agnostic to the particular
    * type of shader being executed, so "Vertex 0" and "Vertex 1" refer to
    * geometry shader invocations 0 and 1, respectively).  Since we have the
    * enable flags for geometry shader invocation 0 in bits 3:0 of DWORD 0,
    * and the enable flags for geometry shader invocation 1 in bits 7:4 of
    * DWORD 4, we just need to OR them together and store the result in bits
    * 15:8 of DWORD 5.
    *
    * It's easier to get the EU to do this if we think of the src and dst
    * registers as composed of 32 bytes each; then, we want to pick up the
    * contents of bytes 0 and 16 from src, OR them together, and store them in
    * byte 21.
    *
    * We can do that with the following EU instruction:
    *
    *     or(1) dst.21<1>UB src<0,1,0>UB src.16<0,1,0>UB { align1 WE_all }
    *
    * Note: this relies on the source register having zeros in (a) bits 7:4 of
    * DWORD 0 and (b) bits 3:0 of DWORD 4.  We can rely on (b) because the
    * source register was prepared by ELK_GS_OPCODE_PREPARE_CHANNEL_MASKS (which
    * shifts DWORD 4 left by 4 bits), and we can rely on (a) because prior to
    * the execution of ELK_GS_OPCODE_PREPARE_CHANNEL_MASKS, DWORDs 0 and 4 need to
    * contain valid channel mask values (which are in the range 0x0-0xf).
    */
   dst = retype(dst, ELK_REGISTER_TYPE_UB);
   src = retype(src, ELK_REGISTER_TYPE_UB);
   elk_push_insn_state(p);
   elk_set_default_access_mode(p, ELK_ALIGN_1);
   elk_set_default_mask_control(p, ELK_MASK_DISABLE);
   elk_OR(p, suboffset(vec1(dst), 21), vec1(src), suboffset(vec1(src), 16));
   elk_pop_insn_state(p);
}

static void
generate_gs_get_instance_id(struct elk_codegen *p,
                            struct elk_reg dst)
{
   /* We want to right shift R0.0 & R0.1 by GFX7_GS_PAYLOAD_INSTANCE_ID_SHIFT
    * and store into dst.0 & dst.4. So generate the instruction:
    *
    *     shr(8) dst<1> R0<1,4,0> GFX7_GS_PAYLOAD_INSTANCE_ID_SHIFT { align1 WE_normal 1Q }
    */
   elk_push_insn_state(p);
   elk_set_default_access_mode(p, ELK_ALIGN_1);
   dst = retype(dst, ELK_REGISTER_TYPE_UD);
   struct elk_reg r0(retype(elk_vec8_grf(0, 0), ELK_REGISTER_TYPE_UD));
   elk_SHR(p, dst, stride(r0, 1, 4, 0),
           elk_imm_ud(GFX7_GS_PAYLOAD_INSTANCE_ID_SHIFT));
   elk_pop_insn_state(p);
}

static void
generate_gs_ff_sync_set_primitives(struct elk_codegen *p,
                                   struct elk_reg dst,
                                   struct elk_reg src0,
                                   struct elk_reg src1,
                                   struct elk_reg src2)
{
   elk_push_insn_state(p);
   elk_set_default_access_mode(p, ELK_ALIGN_1);
   /* Save src0 data in bits 31:16 of dst.0 */
   elk_AND(p, suboffset(vec1(dst), 0), suboffset(vec1(src0), 0),
           elk_imm_ud(0xffffu));
   elk_SHL(p, suboffset(vec1(dst), 0), suboffset(vec1(dst), 0), elk_imm_ud(16));
   /* Save src1 data in bits 15:0 of dst.0 */
   elk_AND(p, suboffset(vec1(src2), 0), suboffset(vec1(src1), 0),
           elk_imm_ud(0xffffu));
   elk_OR(p, suboffset(vec1(dst), 0),
          suboffset(vec1(dst), 0),
          suboffset(vec1(src2), 0));
   elk_pop_insn_state(p);
}

static void
generate_gs_ff_sync(struct elk_codegen *p,
                    vec4_instruction *inst,
                    struct elk_reg dst,
                    struct elk_reg src0,
                    struct elk_reg src1)
{
   /* This opcode uses an implied MRF register for:
    *  - the header of the ff_sync message.  As such, it is expected to be
    *    initialized to r0 before calling here.
    *  - the destination where we will write the allocated URB handle.
    */
   struct elk_reg header =
      retype(elk_message_reg(inst->base_mrf), ELK_REGISTER_TYPE_UD);

   /* Overwrite dword 0 of the header (SO vertices to write) and
    * dword 1 (number of primitives written).
    */
   elk_push_insn_state(p);
   elk_set_default_mask_control(p, ELK_MASK_DISABLE);
   elk_set_default_access_mode(p, ELK_ALIGN_1);
   elk_MOV(p, get_element_ud(header, 0), get_element_ud(src1, 0));
   elk_MOV(p, get_element_ud(header, 1), get_element_ud(src0, 0));
   elk_pop_insn_state(p);

   /* Allocate URB handle in dst */
   elk_ff_sync(p,
               dst,
               0,
               header,
               1, /* allocate */
               1, /* response length */
               0 /* eot */);

   /* Now put the allocated urb handle in header.0 */
   elk_push_insn_state(p);
   elk_set_default_access_mode(p, ELK_ALIGN_1);
   elk_set_default_mask_control(p, ELK_MASK_DISABLE);
   elk_MOV(p, get_element_ud(header, 0), get_element_ud(dst, 0));

   /* src1 is not an immediate when we use transform feedback */
   if (src1.file != ELK_IMMEDIATE_VALUE) {
      elk_set_default_exec_size(p, ELK_EXECUTE_4);
      elk_MOV(p, elk_vec4_grf(src1.nr, 0), elk_vec4_grf(dst.nr, 1));
   }

   elk_pop_insn_state(p);
}

static void
generate_gs_set_primitive_id(struct elk_codegen *p, struct elk_reg dst)
{
   /* In gfx6, PrimitiveID is delivered in R0.1 of the payload */
   struct elk_reg src = elk_vec8_grf(0, 0);
   elk_push_insn_state(p);
   elk_set_default_mask_control(p, ELK_MASK_DISABLE);
   elk_set_default_access_mode(p, ELK_ALIGN_1);
   elk_MOV(p, get_element_ud(dst, 0), get_element_ud(src, 1));
   elk_pop_insn_state(p);
}

static void
generate_tcs_get_instance_id(struct elk_codegen *p, struct elk_reg dst)
{
   const struct intel_device_info *devinfo = p->devinfo;
   const bool ivb = devinfo->platform == INTEL_PLATFORM_IVB ||
                    devinfo->platform == INTEL_PLATFORM_BYT;

   /* "Instance Count" comes as part of the payload in r0.2 bits 23:17.
    *
    * Since we operate in SIMD4x2 mode, we only need to run half as many
    * threads as necessary.  So we assign (2i + 1, 2i) as the thread counts.
    * We shift right by one less to accomplish the multiplication by two.
    */
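   /* Concretely: the AND below leaves the payload field i still shifted up
    * in place, and shifting right by (shift - 1) brings it down as 2i; the
    * ADD then puts 2i + 1 in dst.4.  E.g. i = 3 yields dst.0 = 6, dst.4 = 7.
    */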
   dst = retype(dst, ELK_REGISTER_TYPE_UD);
   struct elk_reg r0(retype(elk_vec8_grf(0, 0), ELK_REGISTER_TYPE_UD));

   elk_push_insn_state(p);
   elk_set_default_access_mode(p, ELK_ALIGN_1);

   const int mask = ivb ? INTEL_MASK(22, 16) : INTEL_MASK(23, 17);
   const int shift = ivb ? 16 : 17;

   elk_AND(p, get_element_ud(dst, 0), get_element_ud(r0, 2), elk_imm_ud(mask));
   elk_SHR(p, get_element_ud(dst, 0), get_element_ud(dst, 0),
           elk_imm_ud(shift - 1));
   elk_ADD(p, get_element_ud(dst, 4), get_element_ud(dst, 0), elk_imm_ud(1));

   elk_pop_insn_state(p);
}

static void
generate_tcs_urb_write(struct elk_codegen *p,
                       vec4_instruction *inst,
                       struct elk_reg urb_header)
{
   const struct intel_device_info *devinfo = p->devinfo;

   elk_inst *send = elk_next_insn(p, ELK_OPCODE_SEND);
   elk_set_dest(p, send, elk_null_reg());
   elk_set_src0(p, send, urb_header);
   elk_set_desc(p, send, elk_message_desc(devinfo, inst->mlen, 0, true));

   elk_inst_set_sfid(devinfo, send, ELK_SFID_URB);
   elk_inst_set_urb_opcode(devinfo, send, ELK_URB_OPCODE_WRITE_OWORD);
   elk_inst_set_urb_global_offset(devinfo, send, inst->offset);
   if (inst->urb_write_flags & ELK_URB_WRITE_EOT) {
      elk_inst_set_eot(devinfo, send, 1);
   } else {
      elk_inst_set_urb_per_slot_offset(devinfo, send, 1);
      elk_inst_set_urb_swizzle_control(devinfo, send, ELK_URB_SWIZZLE_INTERLEAVE);
   }

   /* what happens to swizzles? */
}


static void
generate_tcs_input_urb_offsets(struct elk_codegen *p,
                               struct elk_reg dst,
                               struct elk_reg vertex,
                               struct elk_reg offset)
{
   /* Generates an URB read/write message header for HS/DS operation.
    * Inputs are a vertex index, and a byte offset from the beginning of
    * the vertex.
    */

   /* If `vertex` is not an immediate, we clobber a0.0 */

   assert(vertex.file == ELK_IMMEDIATE_VALUE || vertex.file == ELK_GENERAL_REGISTER_FILE);
   assert(vertex.type == ELK_REGISTER_TYPE_UD || vertex.type == ELK_REGISTER_TYPE_D);

   assert(dst.file == ELK_GENERAL_REGISTER_FILE);

   elk_push_insn_state(p);
   elk_set_default_access_mode(p, ELK_ALIGN_1);
   elk_set_default_mask_control(p, ELK_MASK_DISABLE);
   elk_MOV(p, dst, elk_imm_ud(0));

   /* m0.5 bits 8-15 are channel enables */
   elk_MOV(p, get_element_ud(dst, 5), elk_imm_ud(0xff00));

   /* m0.0-0.1: URB handles */
   if (vertex.file == ELK_IMMEDIATE_VALUE) {
      uint32_t vertex_index = vertex.ud;
      struct elk_reg index_reg = elk_vec1_grf(
            1 + (vertex_index >> 3), vertex_index & 7);

      elk_MOV(p, vec2(get_element_ud(dst, 0)),
              retype(index_reg, ELK_REGISTER_TYPE_UD));
   } else {
      /* Use indirect addressing.  ICP Handles are DWords (single channels
       * of a register) and start at g1.0.
       *
       * In order to start our region at g1.0, we add 8 to the vertex index,
       * effectively skipping over the 8 channels in g0.0.  This gives us a
       * DWord offset to the ICP Handle.
       *
       * Indirect addressing works in terms of bytes, so we then multiply
       * the DWord offset by 4 (by shifting left by 2).
       */
      struct elk_reg addr = elk_address_reg(0);

      /* bottom half: m0.0 = g[1.0 + vertex.0]UD */
      elk_ADD(p, addr, retype(get_element_ud(vertex, 0), ELK_REGISTER_TYPE_UW),
              elk_imm_uw(0x8));
      elk_SHL(p, addr, addr, elk_imm_uw(2));
      elk_MOV(p, get_element_ud(dst, 0), deref_1ud(elk_indirect(0, 0), 0));

      /* top half: m0.1 = g[1.0 + vertex.4]UD */
      elk_ADD(p, addr, retype(get_element_ud(vertex, 4), ELK_REGISTER_TYPE_UW),
              elk_imm_uw(0x8));
      elk_SHL(p, addr, addr, elk_imm_uw(2));
      elk_MOV(p, get_element_ud(dst, 1), deref_1ud(elk_indirect(0, 0), 0));
   }

   /* m0.3-0.4: 128-bit-granular offsets into the URB from the handles */
   if (offset.file != ARF)
      elk_MOV(p, vec2(get_element_ud(dst, 3)), stride(offset, 4, 1, 0));

   elk_pop_insn_state(p);
}


static void
generate_tcs_output_urb_offsets(struct elk_codegen *p,
                                struct elk_reg dst,
                                struct elk_reg write_mask,
                                struct elk_reg offset)
{
   /* Generates an URB read/write message header for HS/DS operation, for the
    * patch URB entry.
    */
   assert(dst.file == ELK_GENERAL_REGISTER_FILE || dst.file == ELK_MESSAGE_REGISTER_FILE);

   assert(write_mask.file == ELK_IMMEDIATE_VALUE);
   assert(write_mask.type == ELK_REGISTER_TYPE_UD);

   elk_push_insn_state(p);

   elk_set_default_access_mode(p, ELK_ALIGN_1);
   elk_set_default_mask_control(p, ELK_MASK_DISABLE);
   elk_MOV(p, dst, elk_imm_ud(0));

   unsigned mask = write_mask.ud;

   /* m0.5 bits 15:12 and 11:8 are channel enables */
   elk_MOV(p, get_element_ud(dst, 5), elk_imm_ud((mask << 8) | (mask << 12)));

   /* HS patch URB handle is delivered in r0.0 */
   struct elk_reg urb_handle = elk_vec1_grf(0, 0);

   /* m0.0-0.1: URB handles */
   elk_MOV(p, vec2(get_element_ud(dst, 0)),
           retype(urb_handle, ELK_REGISTER_TYPE_UD));

   /* m0.3-0.4: 128-bit-granular offsets into the URB from the handles */
   if (offset.file != ARF)
      elk_MOV(p, vec2(get_element_ud(dst, 3)), stride(offset, 4, 1, 0));

   elk_pop_insn_state(p);
}

static void
generate_tes_create_input_read_header(struct elk_codegen *p,
                                      struct elk_reg dst)
{
   elk_push_insn_state(p);
   elk_set_default_access_mode(p, ELK_ALIGN_1);
   elk_set_default_mask_control(p, ELK_MASK_DISABLE);

   /* Initialize the register to 0 */
   elk_MOV(p, dst, elk_imm_ud(0));

   /* Enable all the channels in m0.5 bits 15:8 */
   elk_MOV(p, get_element_ud(dst, 5), elk_imm_ud(0xff00));

   /* Copy g1.3 (the patch URB handle) to m0.0 and m0.1.  For safety,
    * mask out irrelevant "Reserved" bits, as they're not marked MBZ.
    */
   elk_AND(p, vec2(get_element_ud(dst, 0)),
           retype(elk_vec1_grf(1, 3), ELK_REGISTER_TYPE_UD),
           elk_imm_ud(0x1fff));
   elk_pop_insn_state(p);
}

static void
generate_tes_add_indirect_urb_offset(struct elk_codegen *p,
                                     struct elk_reg dst,
                                     struct elk_reg header,
                                     struct elk_reg offset)
{
   elk_push_insn_state(p);
   elk_set_default_access_mode(p, ELK_ALIGN_1);
   elk_set_default_mask_control(p, ELK_MASK_DISABLE);

   elk_MOV(p, dst, header);

   /* Uniforms will have a stride <0;4,1>, and we need to convert to <0;1,0>.
    * Other values get <4;1,0>.
    */
   struct elk_reg restrided_offset;
   if (offset.vstride == ELK_VERTICAL_STRIDE_0 &&
       offset.width == ELK_WIDTH_4 &&
       offset.hstride == ELK_HORIZONTAL_STRIDE_1) {
      restrided_offset = stride(offset, 0, 1, 0);
   } else {
      restrided_offset = stride(offset, 4, 1, 0);
   }

   /* m0.3-0.4: 128-bit-granular offsets into the URB from the handles */
   elk_MOV(p, vec2(get_element_ud(dst, 3)), restrided_offset);

   elk_pop_insn_state(p);
}

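/* Read one register's worth of data back from the URB entry described by
 * `header` (URB handles in m0.0-0.1, per-slot offsets in m0.3-0.4), with the
 * instruction's constant offset applied as the global URB offset.
 */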
static void
generate_vec4_urb_read(struct elk_codegen *p,
                       vec4_instruction *inst,
                       struct elk_reg dst,
                       struct elk_reg header)
{
   const struct intel_device_info *devinfo = p->devinfo;

   assert(header.file == ELK_GENERAL_REGISTER_FILE);
   assert(header.type == ELK_REGISTER_TYPE_UD);

   elk_inst *send = elk_next_insn(p, ELK_OPCODE_SEND);
   elk_set_dest(p, send, dst);
   elk_set_src0(p, send, header);

   elk_set_desc(p, send, elk_message_desc(devinfo, 1, 1, true));

   elk_inst_set_sfid(devinfo, send, ELK_SFID_URB);
   elk_inst_set_urb_opcode(devinfo, send, ELK_URB_OPCODE_READ_OWORD);
   elk_inst_set_urb_swizzle_control(devinfo, send, ELK_URB_SWIZZLE_INTERLEAVE);
   elk_inst_set_urb_per_slot_offset(devinfo, send, 1);

   elk_inst_set_urb_global_offset(devinfo, send, inst->offset);
}

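/* Release the paired input (ICP) URB handles: a header-only URB read with
 * the "complete" bit set and a zero response length tells the hardware we
 * are done with the entry.
 */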
static void
generate_tcs_release_input(struct elk_codegen *p,
                           struct elk_reg header,
                           struct elk_reg vertex,
                           struct elk_reg is_unpaired)
{
   const struct intel_device_info *devinfo = p->devinfo;

   assert(vertex.file == ELK_IMMEDIATE_VALUE);
   assert(vertex.type == ELK_REGISTER_TYPE_UD);

   /* m0.0-0.1: URB handles */
   struct elk_reg urb_handles =
      retype(elk_vec2_grf(1 + (vertex.ud >> 3), vertex.ud & 7),
             ELK_REGISTER_TYPE_UD);

   elk_push_insn_state(p);
   elk_set_default_access_mode(p, ELK_ALIGN_1);
   elk_set_default_mask_control(p, ELK_MASK_DISABLE);
   elk_MOV(p, header, elk_imm_ud(0));
   elk_MOV(p, vec2(get_element_ud(header, 0)), urb_handles);
   elk_pop_insn_state(p);

   elk_inst *send = elk_next_insn(p, ELK_OPCODE_SEND);
   elk_set_dest(p, send, elk_null_reg());
   elk_set_src0(p, send, header);
   elk_set_desc(p, send, elk_message_desc(devinfo, 1, 0, true));

   elk_inst_set_sfid(devinfo, send, ELK_SFID_URB);
   elk_inst_set_urb_opcode(devinfo, send, ELK_URB_OPCODE_READ_OWORD);
   elk_inst_set_urb_complete(devinfo, send, 1);
   elk_inst_set_urb_swizzle_control(devinfo, send, is_unpaired.ud ?
                                    ELK_URB_SWIZZLE_NONE :
                                    ELK_URB_SWIZZLE_INTERLEAVE);
}

static void
generate_tcs_thread_end(struct elk_codegen *p, vec4_instruction *inst)
{
   struct elk_reg header = elk_message_reg(inst->base_mrf);

   elk_push_insn_state(p);
   elk_set_default_access_mode(p, ELK_ALIGN_1);
   elk_set_default_mask_control(p, ELK_MASK_DISABLE);
   elk_MOV(p, header, elk_imm_ud(0));
   elk_MOV(p, get_element_ud(header, 5), elk_imm_ud(WRITEMASK_X << 8));
   elk_MOV(p, get_element_ud(header, 0),
           retype(elk_vec1_grf(0, 0), ELK_REGISTER_TYPE_UD));
   elk_MOV(p, elk_message_reg(inst->base_mrf + 1), elk_imm_ud(0u));
   elk_pop_insn_state(p);

   elk_urb_WRITE(p,
                 elk_null_reg(), /* dest */
                 inst->base_mrf, /* starting mrf reg nr */
                 header,
                 ELK_URB_WRITE_EOT | ELK_URB_WRITE_OWORD |
                 ELK_URB_WRITE_USE_CHANNEL_MASKS,
                 inst->mlen,
                 0,              /* response len */
                 0,              /* urb destination offset */
                 0);
}

static void
generate_tes_get_primitive_id(struct elk_codegen *p, struct elk_reg dst)
{
   elk_push_insn_state(p);
   elk_set_default_access_mode(p, ELK_ALIGN_1);
   elk_MOV(p, dst, retype(elk_vec1_grf(1, 7), ELK_REGISTER_TYPE_D));
   elk_pop_insn_state(p);
}

static void
generate_tcs_get_primitive_id(struct elk_codegen *p, struct elk_reg dst)
{
   elk_push_insn_state(p);
   elk_set_default_access_mode(p, ELK_ALIGN_1);
   elk_MOV(p, dst, retype(elk_vec1_grf(0, 1), ELK_REGISTER_TYPE_UD));
   elk_pop_insn_state(p);
}

static void
generate_tcs_create_barrier_header(struct elk_codegen *p,
                                   struct elk_vue_prog_data *prog_data,
                                   struct elk_reg dst)
{
   const struct intel_device_info *devinfo = p->devinfo;
   const bool ivb = devinfo->platform == INTEL_PLATFORM_IVB ||
                    devinfo->platform == INTEL_PLATFORM_BYT;
   struct elk_reg m0_2 = get_element_ud(dst, 2);
   unsigned instances = ((struct elk_tcs_prog_data *) prog_data)->instances;

   elk_push_insn_state(p);
   elk_set_default_access_mode(p, ELK_ALIGN_1);
   elk_set_default_mask_control(p, ELK_MASK_DISABLE);

   /* Zero the message header */
   elk_MOV(p, retype(dst, ELK_REGISTER_TYPE_UD), elk_imm_ud(0u));

   /* Copy "Barrier ID" from r0.2, bits 16:13 (Gfx7.5+) or 15:12 (Gfx7) */
   elk_AND(p, m0_2,
           retype(elk_vec1_grf(0, 2), ELK_REGISTER_TYPE_UD),
           elk_imm_ud(ivb ? INTEL_MASK(15, 12) : INTEL_MASK(16, 13)));

   /* Shift it up to bits 27:24. */
1076 
1077    /* Set the Barrier Count and the enable bit */
1078    elk_OR(p, m0_2, m0_2, elk_imm_ud(instances << 9 | (1 << 15)));
1079 
1080    elk_pop_insn_state(p);
1081 }

static void
generate_oword_dual_block_offsets(struct elk_codegen *p,
                                  struct elk_reg m1,
                                  struct elk_reg index)
{
   int second_vertex_offset;

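   /* The second slot's offset is one unit away on Gfx6+ but 16 away before
    * that, presumably because these message offsets are counted in OWords
    * on Gfx6+ and in bytes (16 B = one OWord) on earlier gens.
    */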
   if (p->devinfo->ver >= 6)
      second_vertex_offset = 1;
   else
      second_vertex_offset = 16;

   m1 = retype(m1, ELK_REGISTER_TYPE_D);

   /* Set up M1 (message payload).  Only the block offsets in M1.0 and
    * M1.4 are used, and the rest are ignored.
    */
   struct elk_reg m1_0 = suboffset(vec1(m1), 0);
   struct elk_reg m1_4 = suboffset(vec1(m1), 4);
   struct elk_reg index_0 = suboffset(vec1(index), 0);
   struct elk_reg index_4 = suboffset(vec1(index), 4);

   elk_push_insn_state(p);
   elk_set_default_mask_control(p, ELK_MASK_DISABLE);
   elk_set_default_access_mode(p, ELK_ALIGN_1);

   elk_MOV(p, m1_0, index_0);

   if (index.file == ELK_IMMEDIATE_VALUE) {
      index_4.ud += second_vertex_offset;
      elk_MOV(p, m1_4, index_4);
   } else {
      elk_ADD(p, m1_4, index_4, elk_imm_d(second_vertex_offset));
   }

   elk_pop_insn_state(p);
}

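/* Unpack the flag register: copy bits 3:0 of f0.0 into dst.0 and bits 7:4,
 * shifted down, into dst.4, one nibble per SIMD4x2 half.
 */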
static void
generate_unpack_flags(struct elk_codegen *p,
                      struct elk_reg dst)
{
   elk_push_insn_state(p);
   elk_set_default_mask_control(p, ELK_MASK_DISABLE);
   elk_set_default_access_mode(p, ELK_ALIGN_1);

   struct elk_reg flags = elk_flag_reg(0, 0);
   struct elk_reg dst_0 = suboffset(vec1(dst), 0);
   struct elk_reg dst_4 = suboffset(vec1(dst), 4);

   elk_AND(p, dst_0, flags, elk_imm_ud(0x0f));
   elk_AND(p, dst_4, flags, elk_imm_ud(0xf0));
   elk_SHR(p, dst_4, dst_4, elk_imm_ud(4));

   elk_pop_insn_state(p);
}

static void
generate_scratch_read(struct elk_codegen *p,
                      vec4_instruction *inst,
                      struct elk_reg dst,
                      struct elk_reg index)
{
   const struct intel_device_info *devinfo = p->devinfo;
   struct elk_reg header = elk_vec8_grf(0, 0);

   elk_gfx6_resolve_implied_move(p, &header, inst->base_mrf);

   generate_oword_dual_block_offsets(p, elk_message_reg(inst->base_mrf + 1),
                                     index);

   uint32_t msg_type;

   if (devinfo->ver >= 6)
      msg_type = GFX6_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
   else if (devinfo->verx10 >= 45)
      msg_type = G45_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
   else
      msg_type = ELK_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;

   const unsigned target_cache =
      devinfo->ver >= 7 ? GFX7_SFID_DATAPORT_DATA_CACHE :
      devinfo->ver >= 6 ? GFX6_SFID_DATAPORT_RENDER_CACHE :
      ELK_SFID_DATAPORT_READ;

   /* Each of the 8 channel enables is considered for whether each
    * dword is written.
    */
   elk_inst *send = elk_next_insn(p, ELK_OPCODE_SEND);
   elk_inst_set_sfid(devinfo, send, target_cache);
   elk_set_dest(p, send, dst);
   elk_set_src0(p, send, header);
   if (devinfo->ver < 6)
      elk_inst_set_cond_modifier(devinfo, send, inst->base_mrf);
   elk_set_desc(p, send,
                elk_message_desc(devinfo, 2, 1, true) |
                elk_dp_read_desc(devinfo,
                                 elk_scratch_surface_idx(p),
                                 ELK_DATAPORT_OWORD_DUAL_BLOCK_1OWORD,
                                 msg_type, ELK_DATAPORT_READ_TARGET_RENDER_CACHE));
}

static void
generate_scratch_write(struct elk_codegen *p,
                       vec4_instruction *inst,
                       struct elk_reg dst,
                       struct elk_reg src,
                       struct elk_reg index)
{
   const struct intel_device_info *devinfo = p->devinfo;
   const unsigned target_cache =
      (devinfo->ver >= 7 ? GFX7_SFID_DATAPORT_DATA_CACHE :
       devinfo->ver >= 6 ? GFX6_SFID_DATAPORT_RENDER_CACHE :
       ELK_SFID_DATAPORT_WRITE);
   struct elk_reg header = elk_vec8_grf(0, 0);
   bool write_commit;

   /* If the instruction is predicated, we'll predicate the send, not
    * the header setup.
    */
   elk_push_insn_state(p);
   elk_set_default_predicate_control(p, ELK_PREDICATE_NONE);
   elk_set_default_flag_reg(p, 0, 0);

   elk_gfx6_resolve_implied_move(p, &header, inst->base_mrf);

   generate_oword_dual_block_offsets(p, elk_message_reg(inst->base_mrf + 1),
                                     index);

   elk_MOV(p,
           retype(elk_message_reg(inst->base_mrf + 2), ELK_REGISTER_TYPE_D),
           retype(src, ELK_REGISTER_TYPE_D));

   elk_pop_insn_state(p);

   uint32_t msg_type;

   if (devinfo->ver >= 7)
      msg_type = GFX7_DATAPORT_DC_OWORD_DUAL_BLOCK_WRITE;
   else if (devinfo->ver == 6)
      msg_type = GFX6_DATAPORT_WRITE_MESSAGE_OWORD_DUAL_BLOCK_WRITE;
   else
      msg_type = ELK_DATAPORT_WRITE_MESSAGE_OWORD_DUAL_BLOCK_WRITE;

   elk_set_default_predicate_control(p, inst->predicate);

   /* Pre-gfx6, we have to specify write commits to ensure ordering
    * between reads and writes within a thread.  Afterwards, that's
    * guaranteed and write commits only matter for inter-thread
    * synchronization.
    */
   if (devinfo->ver >= 6) {
      write_commit = false;
   } else {
      /* The visitor set up our destination register to be g0.  This
       * means that when the next read comes along, we will end up
       * reading from g0 and causing a block on the write commit.  For
       * write-after-read, we are relying on the value of the previous
       * read being used (and thus blocking on completion) before our
       * write is executed.  This means we have to be careful in
       * instruction scheduling to not violate this assumption.
       */
      write_commit = true;
   }

   /* Each of the 8 channel enables is considered for whether each
    * dword is written.
    */
   elk_inst *send = elk_next_insn(p, ELK_OPCODE_SEND);
   elk_inst_set_sfid(p->devinfo, send, target_cache);
   elk_set_dest(p, send, dst);
   elk_set_src0(p, send, header);
   if (devinfo->ver < 6)
      elk_inst_set_cond_modifier(p->devinfo, send, inst->base_mrf);
   elk_set_desc(p, send,
                elk_message_desc(devinfo, 3, write_commit, true) |
                elk_dp_write_desc(devinfo,
                                  elk_scratch_surface_idx(p),
                                  ELK_DATAPORT_OWORD_DUAL_BLOCK_1OWORD,
                                  msg_type,
                                  write_commit));
}

static void
generate_pull_constant_load(struct elk_codegen *p,
                            vec4_instruction *inst,
                            struct elk_reg dst,
                            struct elk_reg index,
                            struct elk_reg offset)
{
   const struct intel_device_info *devinfo = p->devinfo;
   const unsigned target_cache =
      (devinfo->ver >= 6 ? GFX6_SFID_DATAPORT_SAMPLER_CACHE :
       ELK_SFID_DATAPORT_READ);
   assert(index.file == ELK_IMMEDIATE_VALUE &&
          index.type == ELK_REGISTER_TYPE_UD);
   uint32_t surf_index = index.ud;

   struct elk_reg header = elk_vec8_grf(0, 0);

   elk_gfx6_resolve_implied_move(p, &header, inst->base_mrf);

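   /* Gfx6+ takes the offset in OWord (16-byte) units, hence the shift by
    * four below; earlier gens appear to take the byte offset directly.
    */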
   if (devinfo->ver >= 6) {
      if (offset.file == ELK_IMMEDIATE_VALUE) {
         elk_MOV(p, retype(elk_message_reg(inst->base_mrf + 1),
                           ELK_REGISTER_TYPE_D),
                 elk_imm_d(offset.ud >> 4));
      } else {
         elk_SHR(p, retype(elk_message_reg(inst->base_mrf + 1),
                           ELK_REGISTER_TYPE_D),
                 offset, elk_imm_d(4));
      }
   } else {
      elk_MOV(p, retype(elk_message_reg(inst->base_mrf + 1),
                        ELK_REGISTER_TYPE_D),
              offset);
   }

   uint32_t msg_type;

   if (devinfo->ver >= 6)
      msg_type = GFX6_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
   else if (devinfo->verx10 >= 45)
      msg_type = G45_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
   else
      msg_type = ELK_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;

   /* Each of the 8 channel enables is considered for whether each
    * dword is written.
    */
   elk_inst *send = elk_next_insn(p, ELK_OPCODE_SEND);
   elk_inst_set_sfid(devinfo, send, target_cache);
   elk_set_dest(p, send, dst);
   elk_set_src0(p, send, header);
   if (devinfo->ver < 6)
      elk_inst_set_cond_modifier(p->devinfo, send, inst->base_mrf);
   elk_set_desc(p, send,
                elk_message_desc(devinfo, 2, 1, true) |
                elk_dp_read_desc(devinfo, surf_index,
                                 ELK_DATAPORT_OWORD_DUAL_BLOCK_1OWORD,
                                 msg_type,
                                 ELK_DATAPORT_READ_TARGET_DATA_CACHE));
}

1327 static void
1328 generate_get_buffer_size(struct elk_codegen *p,
1329                          vec4_instruction *inst,
1330                          struct elk_reg dst,
1331                          struct elk_reg src,
1332                          struct elk_reg surf_index)
1333 {
1334    assert(p->devinfo->ver >= 7);
1335    assert(surf_index.type == ELK_REGISTER_TYPE_UD &&
1336           surf_index.file == ELK_IMMEDIATE_VALUE);
1337 
1338    elk_SAMPLE(p,
1339               dst,
1340               inst->base_mrf,
1341               src,
1342               surf_index.ud,
1343               0,
1344               GFX5_SAMPLER_MESSAGE_SAMPLE_RESINFO,
1345               1, /* response length */
1346               inst->mlen,
1347               inst->header_size > 0,
1348               ELK_SAMPLER_SIMD_MODE_SIMD4X2,
1349               ELK_SAMPLER_RETURN_FORMAT_SINT32);
1350 }
1351 
1352 static void
1353 generate_pull_constant_load_gfx7(struct elk_codegen *p,
1354                                  vec4_instruction *inst,
1355                                  struct elk_reg dst,
1356                                  struct elk_reg surf_index,
1357                                  struct elk_reg offset)
1358 {
1359    const struct intel_device_info *devinfo = p->devinfo;
1360    assert(surf_index.type == ELK_REGISTER_TYPE_UD);
1361 
1362    if (surf_index.file == ELK_IMMEDIATE_VALUE) {
1363 
1364       elk_inst *insn = elk_next_insn(p, ELK_OPCODE_SEND);
1365       elk_inst_set_sfid(devinfo, insn, ELK_SFID_SAMPLER);
1366       elk_set_dest(p, insn, dst);
1367       elk_set_src0(p, insn, offset);
1368       elk_set_desc(p, insn,
1369                    elk_message_desc(devinfo, inst->mlen, 1, inst->header_size) |
1370                    elk_sampler_desc(devinfo, surf_index.ud,
1371                                     0, /* LD message ignores sampler unit */
1372                                     GFX5_SAMPLER_MESSAGE_SAMPLE_LD,
1373                                     ELK_SAMPLER_SIMD_MODE_SIMD4X2, 0));
1374    } else {
1375 
1376       struct elk_reg addr = vec1(retype(elk_address_reg(0), ELK_REGISTER_TYPE_UD));
1377 
1378       elk_push_insn_state(p);
1379       elk_set_default_mask_control(p, ELK_MASK_DISABLE);
1380       elk_set_default_access_mode(p, ELK_ALIGN_1);
1381 
1382       /* a0.0 = surf_index & 0xff */
1383       elk_inst *insn_and = elk_next_insn(p, ELK_OPCODE_AND);
1384       elk_inst_set_exec_size(devinfo, insn_and, ELK_EXECUTE_1);
1385       elk_set_dest(p, insn_and, addr);
1386       elk_set_src0(p, insn_and, vec1(retype(surf_index, ELK_REGISTER_TYPE_UD)));
1387       elk_set_src1(p, insn_and, elk_imm_ud(0x0ff));
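      /* A hedged note on the 0xff mask: the binding-table index presumably
       * occupies the low 8 bits of the sampler message descriptor, so only
       * those bits of surf_index are OR'ed into the indirect send below.
       */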
1388 
1389       elk_pop_insn_state(p);
1390 
1391       /* dst = send(offset, a0.0 | <descriptor>) */
1392       elk_send_indirect_message(
1393          p, ELK_SFID_SAMPLER, dst, offset, addr,
1394          elk_message_desc(devinfo, inst->mlen, 1, inst->header_size) |
1395          elk_sampler_desc(devinfo,
1396                           0 /* surface */,
1397                           0 /* sampler */,
1398                           GFX5_SAMPLER_MESSAGE_SAMPLE_LD,
1399                           ELK_SAMPLER_SIMD_MODE_SIMD4X2,
1400                           0),
1401          false /* EOT */);
1402    }
1403 }
1404 
1405 static void
1406 generate_mov_indirect(struct elk_codegen *p,
1407                       vec4_instruction *,
1408                       struct elk_reg dst, struct elk_reg reg,
1409                       struct elk_reg indirect)
1410 {
1411    assert(indirect.type == ELK_REGISTER_TYPE_UD);
1412    assert(p->devinfo->ver >= 6);
1413 
1414    unsigned imm_byte_offset = reg.nr * REG_SIZE + reg.subnr * (REG_SIZE / 2);
1415 
1416    /* This instruction acts in align1 mode */
1417    assert(dst.writemask == WRITEMASK_XYZW);
1418 
1419    if (indirect.file == ELK_IMMEDIATE_VALUE) {
1420       imm_byte_offset += indirect.ud;
1421 
1422       reg.nr = imm_byte_offset / REG_SIZE;
1423       reg.subnr = (imm_byte_offset / (REG_SIZE / 2)) % 2;
1424       unsigned shift = (imm_byte_offset / 4) % 4;
1425       reg.swizzle += ELK_SWIZZLE4(shift, shift, shift, shift);
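      /* An illustrative example of the arithmetic above: for
       * imm_byte_offset == 44, reg.nr == 44 / 32 == 1,
       * reg.subnr == (44 / 16) % 2 == 0, and shift == (44 / 4) % 4 == 3,
       * so each swizzle component is advanced by three dwords.
       */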
1426 
1427       elk_MOV(p, dst, reg);
1428    } else {
1429       elk_push_insn_state(p);
1430       elk_set_default_access_mode(p, ELK_ALIGN_1);
1431       elk_set_default_mask_control(p, ELK_MASK_DISABLE);
1432 
1433       struct elk_reg addr = vec8(elk_address_reg(0));
1434 
1435       /* We need to move the indirect value into the address register.  To
1436        * make this behave sensibly, we want to respect at least the X
1437        * component of the swizzle.  Doing so requires converting the subnr
1438        * (probably 0) to an align1 subnr and adding in the swizzle.
1439        */
1440       assert(elk_is_single_value_swizzle(indirect.swizzle));
1441       indirect.subnr = (indirect.subnr * 4 + ELK_GET_SWZ(indirect.swizzle, 0));
1442 
1443       /* We then use a region of <8,4,0>:uw to pick off the first 2 bytes of
1444        * the indirect and splat it out to all four channels of the given half
1445        * of a0.
1446        */
1447       indirect.subnr *= 2;
1448       indirect = stride(retype(indirect, ELK_REGISTER_TYPE_UW), 8, 4, 0);
1449       elk_ADD(p, addr, indirect, elk_imm_uw(imm_byte_offset));
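      /* At this point each used channel of a0 holds the byte address of the
       * X component: e.g. an indirect value of 8 with imm_byte_offset == 32
       * yields 40, i.e. the third dword of g1 (an informal sketch, not a
       * statement from the PRM).
       */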
1450 
1451       /* Now we need to incorporate the swizzle from the source register */
1452       if (reg.swizzle != ELK_SWIZZLE_XXXX) {
1453          uint32_t uv_swiz = ELK_GET_SWZ(reg.swizzle, 0) << 2 |
1454                             ELK_GET_SWZ(reg.swizzle, 1) << 6 |
1455                             ELK_GET_SWZ(reg.swizzle, 2) << 10 |
1456                             ELK_GET_SWZ(reg.swizzle, 3) << 14;
1457          uv_swiz |= uv_swiz << 16;
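         /* Each 4-bit field of the UV immediate built above holds a
          * per-channel byte offset: component c selects dword c, i.e. byte
          * 4 * c, which the << 2 / << 6 / << 10 / << 14 shifts place in
          * nibbles 0-3 (duplicated for the second group of channels).  A
          * YZWX swizzle, for example, would encode offsets 4, 8, 12, 0.
          */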
1458 
1459          elk_ADD(p, addr, addr, elk_imm_uv(uv_swiz));
1460       }
1461 
1462       elk_MOV(p, dst, retype(elk_VxH_indirect(0, 0), reg.type));
1463 
1464       elk_pop_insn_state(p);
1465    }
1466 }
1467 
1468 static void
1469 generate_zero_oob_push_regs(struct elk_codegen *p,
1470                             struct elk_stage_prog_data *prog_data,
1471                             struct elk_reg scratch,
1472                             struct elk_reg bit_mask_in)
1473 {
1474    const uint64_t want_zero = prog_data->zero_push_reg;
1475    assert(want_zero);
1476 
1477    assert(bit_mask_in.file == ELK_GENERAL_REGISTER_FILE);
1478    assert(ELK_GET_SWZ(bit_mask_in.swizzle, 1) ==
1479           ELK_GET_SWZ(bit_mask_in.swizzle, 0) + 1);
1480    bit_mask_in.subnr += ELK_GET_SWZ(bit_mask_in.swizzle, 0) * 4;
1481    bit_mask_in.type = ELK_REGISTER_TYPE_W;
1482 
1483    /* Scratch should be 3 registers in the GRF */
1484    assert(scratch.file == ELK_GENERAL_REGISTER_FILE);
1485    scratch = vec8(scratch);
1486    struct elk_reg mask_w16 = retype(scratch, ELK_REGISTER_TYPE_W);
1487    struct elk_reg mask_d16 = retype(byte_offset(scratch, REG_SIZE),
1488                                     ELK_REGISTER_TYPE_D);
1489 
1490    elk_push_insn_state(p);
1491    elk_set_default_access_mode(p, ELK_ALIGN_1);
1492    elk_set_default_mask_control(p, ELK_MASK_DISABLE);
1493 
1494    for (unsigned i = 0; i < 64; i++) {
1495       if (i % 16 == 0 && (want_zero & BITFIELD64_RANGE(i, 16))) {
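         /* An informal reading of the trick below: elk_imm_v(0x01234567) is
          * a per-channel shift vector (7, 6, ..., 0), so after the second
          * SHL lane j holds mask << (15 - j), placing mask bit j in the sign
          * bit; the ASR by 15 then splats that bit, leaving 0 or ~0 in each
          * dword of mask_d16.
          */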
1496          elk_set_default_exec_size(p, ELK_EXECUTE_8);
1497          elk_SHL(p, suboffset(mask_w16, 8),
1498                     vec1(byte_offset(bit_mask_in, i / 8)),
1499                     elk_imm_v(0x01234567));
1500          elk_SHL(p, mask_w16, suboffset(mask_w16, 8), elk_imm_w(8));
1501 
1502          elk_set_default_exec_size(p, ELK_EXECUTE_16);
1503          elk_ASR(p, mask_d16, mask_w16, elk_imm_w(15));
1504       }
1505 
1506       if (want_zero & BITFIELD64_BIT(i)) {
1507          unsigned push_start = prog_data->dispatch_grf_start_reg;
1508          struct elk_reg push_reg =
1509             retype(elk_vec8_grf(push_start + i, 0), ELK_REGISTER_TYPE_D);
1510 
1511          elk_set_default_exec_size(p, ELK_EXECUTE_8);
1512          elk_AND(p, push_reg, push_reg, vec1(suboffset(mask_d16, i % 16)));
1513       }
1514    }
1515 
1516    elk_pop_insn_state(p);
1517 }
1518 
1519 static void
1520 generate_code(struct elk_codegen *p,
1521               const struct elk_compiler *compiler,
1522               const struct elk_compile_params *params,
1523               const nir_shader *nir,
1524               struct elk_vue_prog_data *prog_data,
1525               const struct elk_cfg_t *cfg,
1526               const performance &perf,
1527               struct elk_compile_stats *stats,
1528               bool debug_enabled)
1529 {
1530    const struct intel_device_info *devinfo = p->devinfo;
1531    const char *stage_abbrev = _mesa_shader_stage_to_abbrev(nir->info.stage);
1532    struct elk_disasm_info *elk_disasm_info = elk_disasm_initialize(p->isa, cfg);
1533 
1534    /* `send_count` explicitly does not include spills or fills, as we'd
1535     * like to use it as a metric for intentional memory access or other
1536     * shared function use.  Otherwise, subtle changes to scheduling or
1537     * register allocation could cause it to fluctuate wildly - and that
1538     * effect is already counted in spill/fill counts.
1539     */
1540    int spill_count = 0, fill_count = 0;
1541    int loop_count = 0, send_count = 0;
1542 
1543    foreach_block_and_inst (block, vec4_instruction, inst, cfg) {
1544       struct elk_reg src[3], dst;
1545 
1546       if (unlikely(debug_enabled))
1547          elk_disasm_annotate(elk_disasm_info, inst, p->next_insn_offset);
1548 
1549       for (unsigned int i = 0; i < 3; i++) {
1550          src[i] = inst->src[i].as_elk_reg();
1551       }
1552       dst = inst->dst.as_elk_reg();
1553 
1554       elk_set_default_predicate_control(p, inst->predicate);
1555       elk_set_default_predicate_inverse(p, inst->predicate_inverse);
1556       elk_set_default_flag_reg(p, inst->flag_subreg / 2, inst->flag_subreg % 2);
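      /* inst->flag_subreg is a linear index; dividing by 2 selects the flag
       * register and the remainder the 16-bit subregister, so e.g. 3 maps
       * to f1.1.
       */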
1557       elk_set_default_saturate(p, inst->saturate);
1558       elk_set_default_mask_control(p, inst->force_writemask_all);
1559       elk_set_default_acc_write_control(p, inst->writes_accumulator);
1560 
1561       assert(inst->group % inst->exec_size == 0);
1562       assert(inst->group % 4 == 0);
1563 
1564       /* There are some instructions where the destination is 64-bit
1565        * but we retype it to a smaller type. In that case, we cannot
1566        * double the exec_size.
1567        */
1568       const bool is_df = (get_exec_type_size(inst) == 8 ||
1569                           inst->dst.type == ELK_REGISTER_TYPE_DF) &&
1570                          inst->opcode != ELK_VEC4_OPCODE_PICK_LOW_32BIT &&
1571                          inst->opcode != ELK_VEC4_OPCODE_PICK_HIGH_32BIT &&
1572                          inst->opcode != ELK_VEC4_OPCODE_SET_LOW_32BIT &&
1573                          inst->opcode != ELK_VEC4_OPCODE_SET_HIGH_32BIT;
1574 
1575       unsigned exec_size = inst->exec_size;
1576       if (devinfo->verx10 == 70 && is_df)
1577          exec_size *= 2;
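      /* The doubling above reflects an IVB/BYT quirk: the execution size of
       * a DF instruction there counts 32-bit channels, so a logical SIMD4 DF
       * operation presumably has to be emitted with exec_size 8.
       */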
1578 
1579       elk_set_default_exec_size(p, cvt(exec_size) - 1);
1580 
1581       if (!inst->force_writemask_all)
1582          elk_set_default_group(p, inst->group);
1583 
1584       assert(inst->base_mrf + inst->mlen <= ELK_MAX_MRF(devinfo->ver));
1585       assert(inst->mlen <= ELK_MAX_MSG_LENGTH);
1586 
1587       unsigned pre_emit_nr_insn = p->nr_insn;
1588 
1589       switch (inst->opcode) {
1590       case ELK_VEC4_OPCODE_UNPACK_UNIFORM:
1591       case ELK_OPCODE_MOV:
1592       case ELK_VEC4_OPCODE_MOV_FOR_SCRATCH:
1593          elk_MOV(p, dst, src[0]);
1594          break;
1595       case ELK_OPCODE_ADD:
1596          elk_ADD(p, dst, src[0], src[1]);
1597          break;
1598       case ELK_OPCODE_MUL:
1599          elk_MUL(p, dst, src[0], src[1]);
1600          break;
1601       case ELK_OPCODE_MACH:
1602          elk_MACH(p, dst, src[0], src[1]);
1603          break;
1604 
1605       case ELK_OPCODE_MAD:
1606          assert(devinfo->ver >= 6);
1607          elk_MAD(p, dst, src[0], src[1], src[2]);
1608          break;
1609 
1610       case ELK_OPCODE_FRC:
1611          elk_FRC(p, dst, src[0]);
1612          break;
1613       case ELK_OPCODE_RNDD:
1614          elk_RNDD(p, dst, src[0]);
1615          break;
1616       case ELK_OPCODE_RNDE:
1617          elk_RNDE(p, dst, src[0]);
1618          break;
1619       case ELK_OPCODE_RNDZ:
1620          elk_RNDZ(p, dst, src[0]);
1621          break;
1622 
1623       case ELK_OPCODE_AND:
1624          elk_AND(p, dst, src[0], src[1]);
1625          break;
1626       case ELK_OPCODE_OR:
1627          elk_OR(p, dst, src[0], src[1]);
1628          break;
1629       case ELK_OPCODE_XOR:
1630          elk_XOR(p, dst, src[0], src[1]);
1631          break;
1632       case ELK_OPCODE_NOT:
1633          elk_NOT(p, dst, src[0]);
1634          break;
1635       case ELK_OPCODE_ASR:
1636          elk_ASR(p, dst, src[0], src[1]);
1637          break;
1638       case ELK_OPCODE_SHR:
1639          elk_SHR(p, dst, src[0], src[1]);
1640          break;
1641       case ELK_OPCODE_SHL:
1642          elk_SHL(p, dst, src[0], src[1]);
1643          break;
1644 
1645       case ELK_OPCODE_CMP:
1646          elk_CMP(p, dst, inst->conditional_mod, src[0], src[1]);
1647          break;
1648       case ELK_OPCODE_CMPN:
1649          elk_CMPN(p, dst, inst->conditional_mod, src[0], src[1]);
1650          break;
1651       case ELK_OPCODE_SEL:
1652          elk_SEL(p, dst, src[0], src[1]);
1653          break;
1654 
1655       case ELK_OPCODE_DPH:
1656          elk_DPH(p, dst, src[0], src[1]);
1657          break;
1658 
1659       case ELK_OPCODE_DP4:
1660          elk_DP4(p, dst, src[0], src[1]);
1661          break;
1662 
1663       case ELK_OPCODE_DP3:
1664          elk_DP3(p, dst, src[0], src[1]);
1665          break;
1666 
1667       case ELK_OPCODE_DP2:
1668          elk_DP2(p, dst, src[0], src[1]);
1669          break;
1670 
1671       case ELK_OPCODE_F32TO16:
1672          assert(devinfo->ver >= 7);
1673          elk_F32TO16(p, dst, src[0]);
1674          break;
1675 
1676       case ELK_OPCODE_F16TO32:
1677          assert(devinfo->ver >= 7);
1678          elk_F16TO32(p, dst, src[0]);
1679          break;
1680 
1681       case ELK_OPCODE_LRP:
1682          assert(devinfo->ver >= 6);
1683          elk_LRP(p, dst, src[0], src[1], src[2]);
1684          break;
1685 
1686       case ELK_OPCODE_BFREV:
1687          assert(devinfo->ver >= 7);
1688          elk_BFREV(p, retype(dst, ELK_REGISTER_TYPE_UD),
1689                    retype(src[0], ELK_REGISTER_TYPE_UD));
1690          break;
1691       case ELK_OPCODE_FBH:
1692          assert(devinfo->ver >= 7);
1693          elk_FBH(p, retype(dst, src[0].type), src[0]);
1694          break;
1695       case ELK_OPCODE_FBL:
1696          assert(devinfo->ver >= 7);
1697          elk_FBL(p, retype(dst, ELK_REGISTER_TYPE_UD),
1698                  retype(src[0], ELK_REGISTER_TYPE_UD));
1699          break;
1700       case ELK_OPCODE_LZD:
1701          elk_LZD(p, dst, src[0]);
1702          break;
1703       case ELK_OPCODE_CBIT:
1704          assert(devinfo->ver >= 7);
1705          elk_CBIT(p, retype(dst, ELK_REGISTER_TYPE_UD),
1706                   retype(src[0], ELK_REGISTER_TYPE_UD));
1707          break;
1708       case ELK_OPCODE_ADDC:
1709          assert(devinfo->ver >= 7);
1710          elk_ADDC(p, dst, src[0], src[1]);
1711          break;
1712       case ELK_OPCODE_SUBB:
1713          assert(devinfo->ver >= 7);
1714          elk_SUBB(p, dst, src[0], src[1]);
1715          break;
1716       case ELK_OPCODE_MAC:
1717          elk_MAC(p, dst, src[0], src[1]);
1718          break;
1719 
1720       case ELK_OPCODE_BFE:
1721          assert(devinfo->ver >= 7);
1722          elk_BFE(p, dst, src[0], src[1], src[2]);
1723          break;
1724 
1725       case ELK_OPCODE_BFI1:
1726          assert(devinfo->ver >= 7);
1727          elk_BFI1(p, dst, src[0], src[1]);
1728          break;
1729       case ELK_OPCODE_BFI2:
1730          assert(devinfo->ver >= 7);
1731          elk_BFI2(p, dst, src[0], src[1], src[2]);
1732          break;
1733 
1734       case ELK_OPCODE_IF:
1735          if (!inst->src[0].is_null()) {
1736             /* The instruction has an embedded compare (only allowed on gfx6) */
1737             assert(devinfo->ver == 6);
1738             elk_gfx6_IF(p, inst->conditional_mod, src[0], src[1]);
1739          } else {
1740             elk_inst *if_inst = elk_IF(p, ELK_EXECUTE_8);
1741             elk_inst_set_pred_control(p->devinfo, if_inst, inst->predicate);
1742          }
1743          break;
1744 
1745       case ELK_OPCODE_ELSE:
1746          elk_ELSE(p);
1747          break;
1748       case ELK_OPCODE_ENDIF:
1749          elk_ENDIF(p);
1750          break;
1751 
1752       case ELK_OPCODE_DO:
1753          elk_DO(p, ELK_EXECUTE_8);
1754          break;
1755 
1756       case ELK_OPCODE_BREAK:
1757          elk_BREAK(p);
1758          elk_set_default_predicate_control(p, ELK_PREDICATE_NONE);
1759          break;
1760       case ELK_OPCODE_CONTINUE:
1761          elk_CONT(p);
1762          elk_set_default_predicate_control(p, ELK_PREDICATE_NONE);
1763          break;
1764 
1765       case ELK_OPCODE_WHILE:
1766          elk_WHILE(p);
1767          loop_count++;
1768          break;
1769 
1770       case ELK_SHADER_OPCODE_RCP:
1771       case ELK_SHADER_OPCODE_RSQ:
1772       case ELK_SHADER_OPCODE_SQRT:
1773       case ELK_SHADER_OPCODE_EXP2:
1774       case ELK_SHADER_OPCODE_LOG2:
1775       case ELK_SHADER_OPCODE_SIN:
1776       case ELK_SHADER_OPCODE_COS:
1777          assert(inst->conditional_mod == ELK_CONDITIONAL_NONE);
1778          if (devinfo->ver >= 7) {
1779             elk_gfx6_math(p, dst, elk_math_function(inst->opcode), src[0],
1780                       elk_null_reg());
1781          } else if (devinfo->ver == 6) {
1782             generate_math_gfx6(p, inst, dst, src[0], elk_null_reg());
1783          } else {
1784             generate_math1_gfx4(p, inst, dst, src[0]);
1785             send_count++;
1786          }
1787          break;
1788 
1789       case ELK_SHADER_OPCODE_POW:
1790       case ELK_SHADER_OPCODE_INT_QUOTIENT:
1791       case ELK_SHADER_OPCODE_INT_REMAINDER:
1792          assert(inst->conditional_mod == ELK_CONDITIONAL_NONE);
1793          if (devinfo->ver >= 7) {
1794             elk_gfx6_math(p, dst, elk_math_function(inst->opcode), src[0], src[1]);
1795          } else if (devinfo->ver == 6) {
1796             generate_math_gfx6(p, inst, dst, src[0], src[1]);
1797          } else {
1798             generate_math2_gfx4(p, inst, dst, src[0], src[1]);
1799             send_count++;
1800          }
1801          break;
1802 
1803       case ELK_SHADER_OPCODE_TEX:
1804       case ELK_SHADER_OPCODE_TXD:
1805       case ELK_SHADER_OPCODE_TXF:
1806       case ELK_SHADER_OPCODE_TXF_CMS:
1807       case ELK_SHADER_OPCODE_TXF_CMS_W:
1808       case ELK_SHADER_OPCODE_TXF_MCS:
1809       case ELK_SHADER_OPCODE_TXL:
1810       case ELK_SHADER_OPCODE_TXS:
1811       case ELK_SHADER_OPCODE_TG4:
1812       case ELK_SHADER_OPCODE_TG4_OFFSET:
1813       case ELK_SHADER_OPCODE_SAMPLEINFO:
1814          generate_tex(p, prog_data, nir->info.stage,
1815                       inst, dst, src[0], src[1], src[2]);
1816          send_count++;
1817          break;
1818 
1819       case ELK_SHADER_OPCODE_GET_BUFFER_SIZE:
1820          generate_get_buffer_size(p, inst, dst, src[0], src[1]);
1821          send_count++;
1822          break;
1823 
1824       case ELK_VEC4_VS_OPCODE_URB_WRITE:
1825          generate_vs_urb_write(p, inst);
1826          send_count++;
1827          break;
1828 
1829       case ELK_SHADER_OPCODE_GFX4_SCRATCH_READ:
1830          generate_scratch_read(p, inst, dst, src[0]);
1831          fill_count++;
1832          break;
1833 
1834       case ELK_SHADER_OPCODE_GFX4_SCRATCH_WRITE:
1835          generate_scratch_write(p, inst, dst, src[0], src[1]);
1836          spill_count++;
1837          break;
1838 
1839       case ELK_VS_OPCODE_PULL_CONSTANT_LOAD:
1840          generate_pull_constant_load(p, inst, dst, src[0], src[1]);
1841          send_count++;
1842          break;
1843 
1844       case ELK_VS_OPCODE_PULL_CONSTANT_LOAD_GFX7:
1845          generate_pull_constant_load_gfx7(p, inst, dst, src[0], src[1]);
1846          send_count++;
1847          break;
1848 
1849       case ELK_VEC4_GS_OPCODE_URB_WRITE:
1850          generate_gs_urb_write(p, inst);
1851          send_count++;
1852          break;
1853 
1854       case ELK_VEC4_GS_OPCODE_URB_WRITE_ALLOCATE:
1855          generate_gs_urb_write_allocate(p, inst);
1856          send_count++;
1857          break;
1858 
1859       case ELK_GS_OPCODE_SVB_WRITE:
1860          generate_gs_svb_write(p, inst, dst, src[0], src[1]);
1861          send_count++;
1862          break;
1863 
1864       case ELK_GS_OPCODE_SVB_SET_DST_INDEX:
1865          generate_gs_svb_set_destination_index(p, inst, dst, src[0]);
1866          break;
1867 
1868       case ELK_GS_OPCODE_THREAD_END:
1869          generate_gs_thread_end(p, inst);
1870          send_count++;
1871          break;
1872 
1873       case ELK_GS_OPCODE_SET_WRITE_OFFSET:
1874          generate_gs_set_write_offset(p, dst, src[0], src[1]);
1875          break;
1876 
1877       case ELK_GS_OPCODE_SET_VERTEX_COUNT:
1878          generate_gs_set_vertex_count(p, dst, src[0]);
1879          break;
1880 
1881       case ELK_GS_OPCODE_FF_SYNC:
1882          generate_gs_ff_sync(p, inst, dst, src[0], src[1]);
1883          send_count++;
1884          break;
1885 
1886       case ELK_GS_OPCODE_FF_SYNC_SET_PRIMITIVES:
1887          generate_gs_ff_sync_set_primitives(p, dst, src[0], src[1], src[2]);
1888          break;
1889 
1890       case ELK_GS_OPCODE_SET_PRIMITIVE_ID:
1891          generate_gs_set_primitive_id(p, dst);
1892          break;
1893 
1894       case ELK_GS_OPCODE_SET_DWORD_2:
1895          generate_gs_set_dword_2(p, dst, src[0]);
1896          break;
1897 
1898       case ELK_GS_OPCODE_PREPARE_CHANNEL_MASKS:
1899          generate_gs_prepare_channel_masks(p, dst);
1900          break;
1901 
1902       case ELK_GS_OPCODE_SET_CHANNEL_MASKS:
1903          generate_gs_set_channel_masks(p, dst, src[0]);
1904          break;
1905 
1906       case ELK_GS_OPCODE_GET_INSTANCE_ID:
1907          generate_gs_get_instance_id(p, dst);
1908          break;
1909 
1910       case ELK_VEC4_OPCODE_UNTYPED_ATOMIC:
1911          assert(src[2].file == ELK_IMMEDIATE_VALUE);
1912          elk_untyped_atomic(p, dst, src[0], src[1], src[2].ud, inst->mlen,
1913                             !inst->dst.is_null(), inst->header_size);
1914          send_count++;
1915          break;
1916 
1917       case ELK_VEC4_OPCODE_UNTYPED_SURFACE_READ:
1918          assert(!inst->header_size);
1919          assert(src[2].file == ELK_IMMEDIATE_VALUE);
1920          elk_untyped_surface_read(p, dst, src[0], src[1], inst->mlen,
1921                                   src[2].ud);
1922          send_count++;
1923          break;
1924 
1925       case ELK_VEC4_OPCODE_UNTYPED_SURFACE_WRITE:
1926          assert(src[2].file == ELK_IMMEDIATE_VALUE);
1927          elk_untyped_surface_write(p, src[0], src[1], inst->mlen,
1928                                    src[2].ud, inst->header_size);
1929          send_count++;
1930          break;
1931 
1932       case ELK_SHADER_OPCODE_MEMORY_FENCE:
1933          elk_memory_fence(p, dst, src[0], ELK_OPCODE_SEND,
1934                           elk_message_target(inst->sfid),
1935                           inst->desc,
1936                           /* commit_enable */ false,
1937                           /* bti */ 0);
1938          send_count++;
1939          break;
1940 
1941       case ELK_SHADER_OPCODE_FIND_LIVE_CHANNEL:
1942          elk_find_live_channel(p, dst, false);
1943          break;
1944 
1945       case ELK_SHADER_OPCODE_BROADCAST:
1946          assert(inst->force_writemask_all);
1947          elk_broadcast(p, dst, src[0], src[1]);
1948          break;
1949 
1950       case ELK_VS_OPCODE_UNPACK_FLAGS_SIMD4X2:
1951          generate_unpack_flags(p, dst);
1952          break;
1953 
1954       case ELK_VEC4_OPCODE_MOV_BYTES: {
1955          /* Moves the low byte from each channel, using an Align1 access mode
1956           * and a <4,1,0> source region.
1957           */
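         /* As a sketch of the region below: with type UB, vstride 4,
          * width 1, and hstride 0, successive channels read bytes 0, 4,
          * 8, ... of src[0], i.e. the low byte of each dword.
          */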
1958          assert(src[0].type == ELK_REGISTER_TYPE_UB ||
1959                 src[0].type == ELK_REGISTER_TYPE_B);
1960 
1961          elk_set_default_access_mode(p, ELK_ALIGN_1);
1962          src[0].vstride = ELK_VERTICAL_STRIDE_4;
1963          src[0].width = ELK_WIDTH_1;
1964          src[0].hstride = ELK_HORIZONTAL_STRIDE_0;
1965          elk_MOV(p, dst, src[0]);
1966          elk_set_default_access_mode(p, ELK_ALIGN_16);
1967          break;
1968       }
1969 
1970       case ELK_VEC4_OPCODE_DOUBLE_TO_F32:
1971       case ELK_VEC4_OPCODE_DOUBLE_TO_D32:
1972       case ELK_VEC4_OPCODE_DOUBLE_TO_U32: {
1973          assert(type_sz(src[0].type) == 8);
1974          assert(type_sz(dst.type) == 8);
1975 
1976          elk_reg_type dst_type;
1977 
1978          switch (inst->opcode) {
1979          case ELK_VEC4_OPCODE_DOUBLE_TO_F32:
1980             dst_type = ELK_REGISTER_TYPE_F;
1981             break;
1982          case ELK_VEC4_OPCODE_DOUBLE_TO_D32:
1983             dst_type = ELK_REGISTER_TYPE_D;
1984             break;
1985          case ELK_VEC4_OPCODE_DOUBLE_TO_U32:
1986             dst_type = ELK_REGISTER_TYPE_UD;
1987             break;
1988          default:
1989             unreachable("Unsupported conversion");
1990          }
1991          dst = retype(dst, dst_type);
1992 
1993          elk_set_default_access_mode(p, ELK_ALIGN_1);
1994 
1995          /* When converting DF->F, we normally set the destination stride to
1996           * 2 to satisfy an alignment requirement.  But on IVB/BYT, each DF
1997           * write implicitly produces two floats, the first of which is the
1998           * converted value, so a stride of 1 is sufficient there.
1999           */
2000          struct elk_reg spread_dst;
2001          if (devinfo->verx10 == 70)
2002             spread_dst = stride(dst, 8, 4, 1);
2003          else
2004             spread_dst = stride(dst, 8, 4, 2);
2005 
2006          elk_MOV(p, spread_dst, src[0]);
2007 
2008          elk_set_default_access_mode(p, ELK_ALIGN_16);
2009          break;
2010       }
2011 
2012       case ELK_VEC4_OPCODE_TO_DOUBLE: {
2013          assert(type_sz(src[0].type) == 4);
2014          assert(type_sz(dst.type) == 8);
2015 
2016          elk_set_default_access_mode(p, ELK_ALIGN_1);
2017 
2018          elk_MOV(p, dst, src[0]);
2019 
2020          elk_set_default_access_mode(p, ELK_ALIGN_16);
2021          break;
2022       }
2023 
2024       case ELK_VEC4_OPCODE_PICK_LOW_32BIT:
2025       case ELK_VEC4_OPCODE_PICK_HIGH_32BIT: {
2026          /* Stores the low/high 32-bit of each 64-bit element in src[0] into
2027           * dst using ALIGN1 mode and a <8,4,2>:UD region on the source.
2028           */
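         /* As a sketch: spread(src, 2) yields hstride 2, so the region reads
          * dwords 0, 2, 4, ... (the low half of each DF), and the
          * suboffset(..., 1) applied for PICK_HIGH shifts that to dwords
          * 1, 3, 5, ...
          */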
2029          assert(type_sz(src[0].type) == 8);
2030          assert(type_sz(dst.type) == 4);
2031 
2032          elk_set_default_access_mode(p, ELK_ALIGN_1);
2033 
2034          dst = retype(dst, ELK_REGISTER_TYPE_UD);
2035          dst.hstride = ELK_HORIZONTAL_STRIDE_1;
2036 
2037          src[0] = retype(src[0], ELK_REGISTER_TYPE_UD);
2038          if (inst->opcode == ELK_VEC4_OPCODE_PICK_HIGH_32BIT)
2039             src[0] = suboffset(src[0], 1);
2040          src[0] = spread(src[0], 2);
2041          elk_MOV(p, dst, src[0]);
2042 
2043          elk_set_default_access_mode(p, ELK_ALIGN_16);
2044          break;
2045       }
2046 
2047       case ELK_VEC4_OPCODE_SET_LOW_32BIT:
2048       case ELK_VEC4_OPCODE_SET_HIGH_32BIT: {
2049          /* Reads consecutive 32-bit elements from src[0] and writes
2050           * them to the low/high 32-bit of each 64-bit element in dst.
2051           */
2052          assert(type_sz(src[0].type) == 4);
2053          assert(type_sz(dst.type) == 8);
2054 
2055          elk_set_default_access_mode(p, ELK_ALIGN_1);
2056 
2057          dst = retype(dst, ELK_REGISTER_TYPE_UD);
2058          if (inst->opcode == ELK_VEC4_OPCODE_SET_HIGH_32BIT)
2059             dst = suboffset(dst, 1);
2060          dst.hstride = ELK_HORIZONTAL_STRIDE_2;
2061 
2062          src[0] = retype(src[0], ELK_REGISTER_TYPE_UD);
2063          elk_MOV(p, dst, src[0]);
2064 
2065          elk_set_default_access_mode(p, ELK_ALIGN_16);
2066          break;
2067       }
2068 
2069       case ELK_VEC4_OPCODE_PACK_BYTES: {
2070          /* Is effectively:
2071           *
2072           *   mov(8) dst<16,4,1>:UB src<4,1,0>:UB
2073           *
2074           * but the only regioning a destination supports is horizontal
2075           * stride, so instead we have to use two instructions:
2076           *
2077           *   mov(4) dst<1>:UB     src<4,1,0>:UB
2078           *   mov(4) dst.16<1>:UB  src.16<4,1,0>:UB
2079           *
2080           * which pack the four bytes from the low and the high four dwords.
2081           */
2082          assert(util_is_power_of_two_nonzero(dst.writemask));
2083          unsigned offset = __builtin_ctz(dst.writemask);
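         /* For example, a writemask of only .z has bit 2 set, so
          * __builtin_ctz() yields offset == 2 and the packed bytes land at
          * dst.subnr == 8 below.
          */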
2084 
2085          dst.type = ELK_REGISTER_TYPE_UB;
2086 
2087          elk_set_default_access_mode(p, ELK_ALIGN_1);
2088 
2089          src[0].type = ELK_REGISTER_TYPE_UB;
2090          src[0].vstride = ELK_VERTICAL_STRIDE_4;
2091          src[0].width = ELK_WIDTH_1;
2092          src[0].hstride = ELK_HORIZONTAL_STRIDE_0;
2093          dst.subnr = offset * 4;
2094          struct elk_inst *insn = elk_MOV(p, dst, src[0]);
2095          elk_inst_set_exec_size(p->devinfo, insn, ELK_EXECUTE_4);
2096          elk_inst_set_no_dd_clear(p->devinfo, insn, true);
2097          elk_inst_set_no_dd_check(p->devinfo, insn, inst->no_dd_check);
2098 
2099          src[0].subnr = 16;
2100          dst.subnr = 16 + offset * 4;
2101          insn = elk_MOV(p, dst, src[0]);
2102          elk_inst_set_exec_size(p->devinfo, insn, ELK_EXECUTE_4);
2103          elk_inst_set_no_dd_clear(p->devinfo, insn, inst->no_dd_clear);
2104          elk_inst_set_no_dd_check(p->devinfo, insn, true);
2105 
2106          elk_set_default_access_mode(p, ELK_ALIGN_16);
2107          break;
2108       }
2109 
2110       case ELK_VEC4_OPCODE_ZERO_OOB_PUSH_REGS:
2111          generate_zero_oob_push_regs(p, &prog_data->base, dst, src[0]);
2112          break;
2113 
2114       case ELK_VEC4_TCS_OPCODE_URB_WRITE:
2115          generate_tcs_urb_write(p, inst, src[0]);
2116          send_count++;
2117          break;
2118 
2119       case ELK_VEC4_OPCODE_URB_READ:
2120          generate_vec4_urb_read(p, inst, dst, src[0]);
2121          send_count++;
2122          break;
2123 
2124       case ELK_VEC4_TCS_OPCODE_SET_INPUT_URB_OFFSETS:
2125          generate_tcs_input_urb_offsets(p, dst, src[0], src[1]);
2126          break;
2127 
2128       case ELK_VEC4_TCS_OPCODE_SET_OUTPUT_URB_OFFSETS:
2129          generate_tcs_output_urb_offsets(p, dst, src[0], src[1]);
2130          break;
2131 
2132       case ELK_TCS_OPCODE_GET_INSTANCE_ID:
2133          generate_tcs_get_instance_id(p, dst);
2134          break;
2135 
2136       case ELK_TCS_OPCODE_GET_PRIMITIVE_ID:
2137          generate_tcs_get_primitive_id(p, dst);
2138          break;
2139 
2140       case ELK_TCS_OPCODE_CREATE_BARRIER_HEADER:
2141          generate_tcs_create_barrier_header(p, prog_data, dst);
2142          break;
2143 
2144       case ELK_TES_OPCODE_CREATE_INPUT_READ_HEADER:
2145          generate_tes_create_input_read_header(p, dst);
2146          break;
2147 
2148       case ELK_TES_OPCODE_ADD_INDIRECT_URB_OFFSET:
2149          generate_tes_add_indirect_urb_offset(p, dst, src[0], src[1]);
2150          break;
2151 
2152       case ELK_TES_OPCODE_GET_PRIMITIVE_ID:
2153          generate_tes_get_primitive_id(p, dst);
2154          break;
2155 
2156       case ELK_TCS_OPCODE_SRC0_010_IS_ZERO:
2157          /* If src_reg had stride like elk_fs_reg, we wouldn't need this. */
2158          elk_MOV(p, elk_null_reg(), stride(src[0], 0, 1, 0));
2159          break;
2160 
2161       case ELK_TCS_OPCODE_RELEASE_INPUT:
2162          generate_tcs_release_input(p, dst, src[0], src[1]);
2163          send_count++;
2164          break;
2165 
2166       case ELK_TCS_OPCODE_THREAD_END:
2167          generate_tcs_thread_end(p, inst);
2168          send_count++;
2169          break;
2170 
2171       case ELK_SHADER_OPCODE_BARRIER:
2172          elk_barrier(p, src[0]);
2173          elk_WAIT(p);
2174          send_count++;
2175          break;
2176 
2177       case ELK_SHADER_OPCODE_MOV_INDIRECT:
2178          generate_mov_indirect(p, inst, dst, src[0], src[1]);
2179          break;
2180 
2181       case ELK_OPCODE_DIM:
2182          assert(devinfo->verx10 == 75);
2183          assert(src[0].type == ELK_REGISTER_TYPE_DF);
2184          assert(dst.type == ELK_REGISTER_TYPE_DF);
2185          elk_DIM(p, dst, retype(src[0], ELK_REGISTER_TYPE_F));
2186          break;
2187 
2188       case ELK_SHADER_OPCODE_RND_MODE: {
2189          assert(src[0].file == ELK_IMMEDIATE_VALUE);
2190          /*
2191           * Changes the floating-point rounding mode by updating the control
2192           * register field defined at bits cr0.0[5:6].
2193           */
2194          enum elk_rnd_mode mode =
2195             (enum elk_rnd_mode) (src[0].d << ELK_CR0_RND_MODE_SHIFT);
2196          elk_float_controls_mode(p, mode, ELK_CR0_RND_MODE_MASK);
2197          break;
2198       }
2199 
2200       default:
2201          unreachable("Unsupported opcode");
2202       }
2203 
2204       if (inst->opcode == ELK_VEC4_OPCODE_PACK_BYTES) {
2205          /* Handled dependency hints in the generator. */
2206 
2207          assert(!inst->conditional_mod);
2208       } else if (inst->no_dd_clear || inst->no_dd_check || inst->conditional_mod) {
2209          assert(p->nr_insn == pre_emit_nr_insn + 1 ||
2210                 !"conditional_mod, no_dd_check, or no_dd_clear set for IR "
2211                  "emitting more than 1 instruction");
2212 
2213          elk_inst *last = &p->store[pre_emit_nr_insn];
2214 
2215          if (inst->conditional_mod)
2216             elk_inst_set_cond_modifier(p->devinfo, last, inst->conditional_mod);
2217          elk_inst_set_no_dd_clear(p->devinfo, last, inst->no_dd_clear);
2218          elk_inst_set_no_dd_check(p->devinfo, last, inst->no_dd_check);
2219       }
2220    }
2221 
2222    elk_set_uip_jip(p, 0);
2223 
2224    /* end of program sentinel */
2225    elk_disasm_new_inst_group(elk_disasm_info, p->next_insn_offset);
2226 
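   /* A note on the preprocessor construct below: in debug builds the
    * validator always runs and its result feeds the assert(validated)
    * further down; with NDEBUG the result is unused, so the call is only
    * made when debug output is enabled.
    */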
2227 #ifndef NDEBUG
2228    bool validated =
2229 #else
2230    if (unlikely(debug_enabled))
2231 #endif
2232       elk_validate_instructions(&compiler->isa, p->store,
2233                                 0, p->next_insn_offset,
2234                                 elk_disasm_info);
2235 
2236    int before_size = p->next_insn_offset;
2237    elk_compact_instructions(p, 0, elk_disasm_info);
2238    int after_size = p->next_insn_offset;
2239 
2240    bool dump_shader_bin = elk_should_dump_shader_bin();
2241    unsigned char sha1[21];
2242    char sha1buf[41];
2243 
2244    if (unlikely(debug_enabled || dump_shader_bin)) {
2245       _mesa_sha1_compute(p->store, p->next_insn_offset, sha1);
2246       _mesa_sha1_format(sha1buf, sha1);
2247    }
2248 
2249    if (unlikely(dump_shader_bin))
2250       elk_dump_shader_bin(p->store, 0, p->next_insn_offset, sha1buf);
2251 
2252    if (unlikely(debug_enabled)) {
2253       fprintf(stderr, "Native code for %s %s shader %s (src_hash 0x%08x) (sha1 %s):\n",
2254             nir->info.label ? nir->info.label : "unnamed",
2255             _mesa_shader_stage_to_string(nir->info.stage), nir->info.name,
2256             params->source_hash, sha1buf);
2257 
2258       fprintf(stderr, "%s vec4 shader: %d instructions. %d loops. %u cycles. %d:%d "
2259                      "spills:fills, %u sends. Compacted %d to %d bytes (%.0f%%)\n",
2260             stage_abbrev, before_size / 16, loop_count, perf.latency,
2261             spill_count, fill_count, send_count, before_size, after_size,
2262             100.0f * (before_size - after_size) / before_size);
2263 
2264       /* overriding the shader makes elk_disasm_info invalid */
2265       if (!elk_try_override_assembly(p, 0, sha1buf)) {
2266          elk_dump_assembly(p->store, 0, p->next_insn_offset,
2267                            elk_disasm_info, perf.block_latency);
2268       } else {
2269          fprintf(stderr, "Successfully overrode shader with sha1 %s\n\n", sha1buf);
2270       }
2271    }
2272    ralloc_free(elk_disasm_info);
2273    assert(validated);
2274 
2275    elk_shader_debug_log(compiler, params->log_data,
2276                         "%s vec4 shader: %d inst, %d loops, %u cycles, "
2277                         "%d:%d spills:fills, %u sends, "
2278                         "compacted %d to %d bytes.\n",
2279                         stage_abbrev, before_size / 16,
2280                         loop_count, perf.latency, spill_count,
2281                         fill_count, send_count, before_size, after_size);
2282    if (stats) {
2283       stats->dispatch_width = 0;
2284       stats->max_dispatch_width = 0;
2285       stats->instructions = before_size / 16;
2286       stats->sends = send_count;
2287       stats->loops = loop_count;
2288       stats->cycles = perf.latency;
2289       stats->spills = spill_count;
2290       stats->fills = fill_count;
2291    }
2292 }
2293 
2294 extern "C" const unsigned *
2295 elk_vec4_generate_assembly(const struct elk_compiler *compiler,
2296                            const struct elk_compile_params *params,
2297                            const nir_shader *nir,
2298                            struct elk_vue_prog_data *prog_data,
2299                            const struct elk_cfg_t *cfg,
2300                            const performance &perf,
2301                            bool debug_enabled)
2302 {
2303    struct elk_codegen *p = rzalloc(params->mem_ctx, struct elk_codegen);
2304    elk_init_codegen(&compiler->isa, p, params->mem_ctx);
2305    elk_set_default_access_mode(p, ELK_ALIGN_16);
2306 
2307    generate_code(p, compiler, params,
2308                  nir, prog_data, cfg, perf,
2309                  params->stats, debug_enabled);
2310 
2311    assert(prog_data->base.const_data_size == 0);
2312    if (nir->constant_data_size > 0) {
2313       prog_data->base.const_data_size = nir->constant_data_size;
2314       prog_data->base.const_data_offset =
2315          elk_append_data(p, nir->constant_data, nir->constant_data_size, 32);
2316    }
2317 
2318    return elk_get_program(p, &prog_data->base.program_size);
2319 }
2320