1 /* -*- c++ -*- */
2 /*
3 * Copyright © 2010-2015 Intel Corporation
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
22 * IN THE SOFTWARE.
23 */
24
25 #ifndef ELK_IR_FS_H
26 #define ELK_IR_FS_H
27
28 #include "elk_shader.h"
29
30 class elk_fs_inst;
31
32 class elk_fs_reg : public elk_backend_reg {
33 public:
34 DECLARE_RALLOC_CXX_OPERATORS(elk_fs_reg)
35
36 void init();
37
38 elk_fs_reg();
39 elk_fs_reg(struct ::elk_reg reg);
40 elk_fs_reg(enum elk_reg_file file, unsigned nr);
41 elk_fs_reg(enum elk_reg_file file, unsigned nr, enum elk_reg_type type);
42
43 bool equals(const elk_fs_reg &r) const;
44 bool negative_equals(const elk_fs_reg &r) const;
45 bool is_contiguous() const;
46
47 /**
48 * Return the size in bytes of a single logical component of the
49 * register assuming the given execution width.
50 */
51 unsigned component_size(unsigned width) const;
52
53 /** Register region horizontal stride */
54 uint8_t stride;
55 };
56
57 static inline elk_fs_reg
negate(elk_fs_reg reg)58 negate(elk_fs_reg reg)
59 {
60 assert(reg.file != IMM);
61 reg.negate = !reg.negate;
62 return reg;
63 }
64
65 static inline elk_fs_reg
retype(elk_fs_reg reg,enum elk_reg_type type)66 retype(elk_fs_reg reg, enum elk_reg_type type)
67 {
68 reg.type = type;
69 return reg;
70 }
71
72 static inline elk_fs_reg
byte_offset(elk_fs_reg reg,unsigned delta)73 byte_offset(elk_fs_reg reg, unsigned delta)
74 {
75 switch (reg.file) {
76 case BAD_FILE:
77 break;
78 case VGRF:
79 case ATTR:
80 case UNIFORM:
81 reg.offset += delta;
82 break;
83 case MRF: {
84 const unsigned suboffset = reg.offset + delta;
85 reg.nr += suboffset / REG_SIZE;
86 reg.offset = suboffset % REG_SIZE;
87 break;
88 }
89 case ARF:
90 case FIXED_GRF: {
91 const unsigned suboffset = reg.subnr + delta;
92 reg.nr += suboffset / REG_SIZE;
93 reg.subnr = suboffset % REG_SIZE;
94 break;
95 }
96 case IMM:
97 default:
98 assert(delta == 0);
99 }
100 return reg;
101 }
102
103 static inline elk_fs_reg
horiz_offset(const elk_fs_reg & reg,unsigned delta)104 horiz_offset(const elk_fs_reg ®, unsigned delta)
105 {
106 switch (reg.file) {
107 case BAD_FILE:
108 case UNIFORM:
109 case IMM:
110 /* These only have a single component that is implicitly splatted. A
111 * horizontal offset should be a harmless no-op.
112 * XXX - Handle vector immediates correctly.
113 */
114 return reg;
115 case VGRF:
116 case MRF:
117 case ATTR:
118 return byte_offset(reg, delta * reg.stride * type_sz(reg.type));
119 case ARF:
120 case FIXED_GRF:
121 if (reg.is_null()) {
122 return reg;
123 } else {
124 const unsigned hstride = reg.hstride ? 1 << (reg.hstride - 1) : 0;
125 const unsigned vstride = reg.vstride ? 1 << (reg.vstride - 1) : 0;
126 const unsigned width = 1 << reg.width;
127
128 if (delta % width == 0) {
129 return byte_offset(reg, delta / width * vstride * type_sz(reg.type));
130 } else {
131 assert(vstride == hstride * width);
132 return byte_offset(reg, delta * hstride * type_sz(reg.type));
133 }
134 }
135 }
136 unreachable("Invalid register file");
137 }
138
139 static inline elk_fs_reg
offset(elk_fs_reg reg,unsigned width,unsigned delta)140 offset(elk_fs_reg reg, unsigned width, unsigned delta)
141 {
142 switch (reg.file) {
143 case BAD_FILE:
144 break;
145 case ARF:
146 case FIXED_GRF:
147 case MRF:
148 case VGRF:
149 case ATTR:
150 case UNIFORM:
151 return byte_offset(reg, delta * reg.component_size(width));
152 case IMM:
153 assert(delta == 0);
154 }
155 return reg;
156 }
157
158 /**
159 * Get the scalar channel of \p reg given by \p idx and replicate it to all
160 * channels of the result.
161 */
162 static inline elk_fs_reg
component(elk_fs_reg reg,unsigned idx)163 component(elk_fs_reg reg, unsigned idx)
164 {
165 reg = horiz_offset(reg, idx);
166 reg.stride = 0;
167 if (reg.file == ARF || reg.file == FIXED_GRF) {
168 reg.vstride = ELK_VERTICAL_STRIDE_0;
169 reg.width = ELK_WIDTH_1;
170 reg.hstride = ELK_HORIZONTAL_STRIDE_0;
171 }
172 return reg;
173 }
174
175 /**
176 * Return an integer identifying the discrete address space a register is
177 * contained in. A register is by definition fully contained in the single
178 * reg_space it belongs to, so two registers with different reg_space ids are
179 * guaranteed not to overlap. Most register files are a single reg_space of
180 * its own, only the VGRF and ATTR files are composed of multiple discrete
181 * address spaces, one for each allocation and input attribute respectively.
182 */
183 static inline uint32_t
reg_space(const elk_fs_reg & r)184 reg_space(const elk_fs_reg &r)
185 {
186 return r.file << 16 | (r.file == VGRF || r.file == ATTR ? r.nr : 0);
187 }
188
189 /**
190 * Return the base offset in bytes of a register relative to the start of its
191 * reg_space().
192 */
193 static inline unsigned
reg_offset(const elk_fs_reg & r)194 reg_offset(const elk_fs_reg &r)
195 {
196 return (r.file == VGRF || r.file == IMM || r.file == ATTR ? 0 : r.nr) *
197 (r.file == UNIFORM ? 4 : REG_SIZE) + r.offset +
198 (r.file == ARF || r.file == FIXED_GRF ? r.subnr : 0);
199 }
200
201 /**
202 * Return the amount of padding in bytes left unused between individual
203 * components of register \p r due to a (horizontal) stride value greater than
204 * one, or zero if components are tightly packed in the register file.
205 */
206 static inline unsigned
reg_padding(const elk_fs_reg & r)207 reg_padding(const elk_fs_reg &r)
208 {
209 const unsigned stride = ((r.file != ARF && r.file != FIXED_GRF) ? r.stride :
210 r.hstride == 0 ? 0 :
211 1 << (r.hstride - 1));
212 return (MAX2(1, stride) - 1) * type_sz(r.type);
213 }
214
215 /* Do not call this directly. Call regions_overlap() instead. */
216 static inline bool
regions_overlap_MRF(const elk_fs_reg & r,unsigned dr,const elk_fs_reg & s,unsigned ds)217 regions_overlap_MRF(const elk_fs_reg &r, unsigned dr, const elk_fs_reg &s, unsigned ds)
218 {
219 if (r.nr & ELK_MRF_COMPR4) {
220 elk_fs_reg t = r;
221 t.nr &= ~ELK_MRF_COMPR4;
222 /* COMPR4 regions are translated by the hardware during decompression
223 * into two separate half-regions 4 MRFs apart from each other.
224 *
225 * Note: swapping s and t in this parameter list eliminates one possible
226 * level of recursion (since the s in the called versions of
227 * regions_overlap_MRF can't be COMPR4), and that makes the compiled
228 * code a lot smaller.
229 */
230 return regions_overlap_MRF(s, ds, t, dr / 2) ||
231 regions_overlap_MRF(s, ds, byte_offset(t, 4 * REG_SIZE), dr / 2);
232 } else if (s.nr & ELK_MRF_COMPR4) {
233 return regions_overlap_MRF(s, ds, r, dr);
234 }
235
236 return !((r.nr * REG_SIZE + r.offset + dr) <= (s.nr * REG_SIZE + s.offset) ||
237 (s.nr * REG_SIZE + s.offset + ds) <= (r.nr * REG_SIZE + r.offset));
238 }
239
240 /**
241 * Return whether the register region starting at \p r and spanning \p dr
242 * bytes could potentially overlap the register region starting at \p s and
243 * spanning \p ds bytes.
244 */
245 static inline bool
regions_overlap(const elk_fs_reg & r,unsigned dr,const elk_fs_reg & s,unsigned ds)246 regions_overlap(const elk_fs_reg &r, unsigned dr, const elk_fs_reg &s, unsigned ds)
247 {
248 if (r.file != s.file)
249 return false;
250
251 if (r.file == VGRF) {
252 return r.nr == s.nr &&
253 !(r.offset + dr <= s.offset || s.offset + ds <= r.offset);
254 } else if (r.file != MRF) {
255 return !(reg_offset(r) + dr <= reg_offset(s) ||
256 reg_offset(s) + ds <= reg_offset(r));
257 } else {
258 return regions_overlap_MRF(r, dr, s, ds);
259 }
260 }
261
262 /**
263 * Check that the register region given by r [r.offset, r.offset + dr[
264 * is fully contained inside the register region given by s
265 * [s.offset, s.offset + ds[.
266 */
267 static inline bool
region_contained_in(const elk_fs_reg & r,unsigned dr,const elk_fs_reg & s,unsigned ds)268 region_contained_in(const elk_fs_reg &r, unsigned dr, const elk_fs_reg &s, unsigned ds)
269 {
270 return reg_space(r) == reg_space(s) &&
271 reg_offset(r) >= reg_offset(s) &&
272 reg_offset(r) + dr <= reg_offset(s) + ds;
273 }
274
275 /**
276 * Return whether the given register region is n-periodic, i.e. whether the
277 * original region remains invariant after shifting it by \p n scalar
278 * channels.
279 */
280 static inline bool
is_periodic(const elk_fs_reg & reg,unsigned n)281 is_periodic(const elk_fs_reg ®, unsigned n)
282 {
283 if (reg.file == BAD_FILE || reg.is_null()) {
284 return true;
285
286 } else if (reg.file == IMM) {
287 const unsigned period = (reg.type == ELK_REGISTER_TYPE_UV ||
288 reg.type == ELK_REGISTER_TYPE_V ? 8 :
289 reg.type == ELK_REGISTER_TYPE_VF ? 4 :
290 1);
291 return n % period == 0;
292
293 } else if (reg.file == ARF || reg.file == FIXED_GRF) {
294 const unsigned period = (reg.hstride == 0 && reg.vstride == 0 ? 1 :
295 reg.vstride == 0 ? 1 << reg.width :
296 ~0);
297 return n % period == 0;
298
299 } else {
300 return reg.stride == 0;
301 }
302 }
303
304 static inline bool
is_uniform(const elk_fs_reg & reg)305 is_uniform(const elk_fs_reg ®)
306 {
307 return is_periodic(reg, 1);
308 }
309
310 /**
311 * Get the specified 8-component quarter of a register.
312 */
313 static inline elk_fs_reg
quarter(const elk_fs_reg & reg,unsigned idx)314 quarter(const elk_fs_reg ®, unsigned idx)
315 {
316 assert(idx < 4);
317 return horiz_offset(reg, 8 * idx);
318 }
319
320 /**
321 * Reinterpret each channel of register \p reg as a vector of values of the
322 * given smaller type and take the i-th subcomponent from each.
323 */
324 static inline elk_fs_reg
subscript(elk_fs_reg reg,elk_reg_type type,unsigned i)325 subscript(elk_fs_reg reg, elk_reg_type type, unsigned i)
326 {
327 assert((i + 1) * type_sz(type) <= type_sz(reg.type));
328
329 if (reg.file == ARF || reg.file == FIXED_GRF) {
330 /* The stride is encoded inconsistently for fixed GRF and ARF registers
331 * as the log2 of the actual vertical and horizontal strides.
332 */
333 const int delta = util_logbase2(type_sz(reg.type)) -
334 util_logbase2(type_sz(type));
335 reg.hstride += (reg.hstride ? delta : 0);
336 reg.vstride += (reg.vstride ? delta : 0);
337
338 } else if (reg.file == IMM) {
339 unsigned bit_size = type_sz(type) * 8;
340 reg.u64 >>= i * bit_size;
341 reg.u64 &= BITFIELD64_MASK(bit_size);
342 if (bit_size <= 16)
343 reg.u64 |= reg.u64 << 16;
344 return retype(reg, type);
345 } else {
346 reg.stride *= type_sz(reg.type) / type_sz(type);
347 }
348
349 return byte_offset(retype(reg, type), i * type_sz(type));
350 }
351
352 static inline elk_fs_reg
horiz_stride(elk_fs_reg reg,unsigned s)353 horiz_stride(elk_fs_reg reg, unsigned s)
354 {
355 reg.stride *= s;
356 return reg;
357 }
358
359 static const elk_fs_reg reg_undef;
360
361 class elk_fs_inst : public elk_backend_instruction {
362 elk_fs_inst &operator=(const elk_fs_inst &);
363
364 void init(enum elk_opcode opcode, uint8_t exec_width, const elk_fs_reg &dst,
365 const elk_fs_reg *src, unsigned sources);
366
367 public:
368 DECLARE_RALLOC_CXX_OPERATORS(elk_fs_inst)
369
370 elk_fs_inst();
371 elk_fs_inst(enum elk_opcode opcode, uint8_t exec_size);
372 elk_fs_inst(enum elk_opcode opcode, uint8_t exec_size, const elk_fs_reg &dst);
373 elk_fs_inst(enum elk_opcode opcode, uint8_t exec_size, const elk_fs_reg &dst,
374 const elk_fs_reg &src0);
375 elk_fs_inst(enum elk_opcode opcode, uint8_t exec_size, const elk_fs_reg &dst,
376 const elk_fs_reg &src0, const elk_fs_reg &src1);
377 elk_fs_inst(enum elk_opcode opcode, uint8_t exec_size, const elk_fs_reg &dst,
378 const elk_fs_reg &src0, const elk_fs_reg &src1, const elk_fs_reg &src2);
379 elk_fs_inst(enum elk_opcode opcode, uint8_t exec_size, const elk_fs_reg &dst,
380 const elk_fs_reg src[], unsigned sources);
381 elk_fs_inst(const elk_fs_inst &that);
382 ~elk_fs_inst();
383
384 void resize_sources(uint8_t num_sources);
385
386 bool is_send_from_grf() const;
387 bool is_payload(unsigned arg) const;
388 bool is_partial_write() const;
389 unsigned components_read(unsigned i) const;
390 unsigned size_read(int arg) const;
391 bool can_do_source_mods(const struct intel_device_info *devinfo) const;
392 bool can_do_cmod();
393 bool can_change_types() const;
394 bool has_source_and_destination_hazard() const;
395 unsigned implied_mrf_writes() const;
396
397 /**
398 * Return whether \p arg is a control source of a virtual instruction which
399 * shouldn't contribute to the execution type and usual regioning
400 * restriction calculations of arithmetic instructions.
401 */
402 bool is_control_source(unsigned arg) const;
403
404 /**
405 * Return the subset of flag registers read by the instruction as a bitset
406 * with byte granularity.
407 */
408 unsigned flags_read(const intel_device_info *devinfo) const;
409
410 /**
411 * Return the subset of flag registers updated by the instruction (either
412 * partially or fully) as a bitset with byte granularity.
413 */
414 unsigned flags_written(const intel_device_info *devinfo) const;
415
416 /**
417 * Return true if this instruction is a sampler message gathering residency
418 * data.
419 */
420 bool has_sampler_residency() const;
421
422 elk_fs_reg dst;
423 elk_fs_reg *src;
424
425 uint8_t sources; /**< Number of elk_fs_reg sources. */
426
427 bool last_rt:1;
428 bool pi_noperspective:1; /**< Pixel interpolator noperspective flag */
429 bool keep_payload_trailing_zeros;
430 };
431
432 /**
433 * Make the execution of \p inst dependent on the evaluation of a possibly
434 * inverted predicate.
435 */
436 static inline elk_fs_inst *
set_predicate_inv(enum elk_predicate pred,bool inverse,elk_fs_inst * inst)437 set_predicate_inv(enum elk_predicate pred, bool inverse,
438 elk_fs_inst *inst)
439 {
440 inst->predicate = pred;
441 inst->predicate_inverse = inverse;
442 return inst;
443 }
444
445 /**
446 * Make the execution of \p inst dependent on the evaluation of a predicate.
447 */
448 static inline elk_fs_inst *
set_predicate(enum elk_predicate pred,elk_fs_inst * inst)449 set_predicate(enum elk_predicate pred, elk_fs_inst *inst)
450 {
451 return set_predicate_inv(pred, false, inst);
452 }
453
454 /**
455 * Write the result of evaluating the condition given by \p mod to a flag
456 * register.
457 */
458 static inline elk_fs_inst *
set_condmod(enum elk_conditional_mod mod,elk_fs_inst * inst)459 set_condmod(enum elk_conditional_mod mod, elk_fs_inst *inst)
460 {
461 inst->conditional_mod = mod;
462 return inst;
463 }
464
465 /**
466 * Clamp the result of \p inst to the saturation range of its destination
467 * datatype.
468 */
469 static inline elk_fs_inst *
set_saturate(bool saturate,elk_fs_inst * inst)470 set_saturate(bool saturate, elk_fs_inst *inst)
471 {
472 inst->saturate = saturate;
473 return inst;
474 }
475
476 /**
477 * Return the number of dataflow registers written by the instruction (either
478 * fully or partially) counted from 'floor(reg_offset(inst->dst) /
479 * register_size)'. The somewhat arbitrary register size unit is 4B for the
480 * UNIFORM and IMM files and 32B for all other files.
481 */
482 inline unsigned
regs_written(const elk_fs_inst * inst)483 regs_written(const elk_fs_inst *inst)
484 {
485 assert(inst->dst.file != UNIFORM && inst->dst.file != IMM);
486 return DIV_ROUND_UP(reg_offset(inst->dst) % REG_SIZE +
487 inst->size_written -
488 MIN2(inst->size_written, reg_padding(inst->dst)),
489 REG_SIZE);
490 }
491
492 /**
493 * Return the number of dataflow registers read by the instruction (either
494 * fully or partially) counted from 'floor(reg_offset(inst->src[i]) /
495 * register_size)'. The somewhat arbitrary register size unit is 4B for the
496 * UNIFORM files and 32B for all other files.
497 */
498 inline unsigned
regs_read(const elk_fs_inst * inst,unsigned i)499 regs_read(const elk_fs_inst *inst, unsigned i)
500 {
501 if (inst->src[i].file == IMM)
502 return 1;
503
504 const unsigned reg_size = inst->src[i].file == UNIFORM ? 4 : REG_SIZE;
505 return DIV_ROUND_UP(reg_offset(inst->src[i]) % reg_size +
506 inst->size_read(i) -
507 MIN2(inst->size_read(i), reg_padding(inst->src[i])),
508 reg_size);
509 }
510
511 static inline enum elk_reg_type
get_exec_type(const elk_fs_inst * inst)512 get_exec_type(const elk_fs_inst *inst)
513 {
514 elk_reg_type exec_type = ELK_REGISTER_TYPE_B;
515
516 for (int i = 0; i < inst->sources; i++) {
517 if (inst->src[i].file != BAD_FILE &&
518 !inst->is_control_source(i)) {
519 const elk_reg_type t = get_exec_type(inst->src[i].type);
520 if (type_sz(t) > type_sz(exec_type))
521 exec_type = t;
522 else if (type_sz(t) == type_sz(exec_type) &&
523 elk_reg_type_is_floating_point(t))
524 exec_type = t;
525 }
526 }
527
528 if (exec_type == ELK_REGISTER_TYPE_B)
529 exec_type = inst->dst.type;
530
531 assert(exec_type != ELK_REGISTER_TYPE_B);
532
533 /* Promotion of the execution type to 32-bit for conversions from or to
534 * half-float seems to be consistent with the following text from the
535 * Cherryview PRM Vol. 7, "Execution Data Type":
536 *
537 * "When single precision and half precision floats are mixed between
538 * source operands or between source and destination operand [..] single
539 * precision float is the execution datatype."
540 *
541 * and from "Register Region Restrictions":
542 *
543 * "Conversion between Integer and HF (Half Float) must be DWord aligned
544 * and strided by a DWord on the destination."
545 */
546 if (type_sz(exec_type) == 2 &&
547 inst->dst.type != exec_type) {
548 if (exec_type == ELK_REGISTER_TYPE_HF)
549 exec_type = ELK_REGISTER_TYPE_F;
550 else if (inst->dst.type == ELK_REGISTER_TYPE_HF)
551 exec_type = ELK_REGISTER_TYPE_D;
552 }
553
554 return exec_type;
555 }
556
557 static inline unsigned
get_exec_type_size(const elk_fs_inst * inst)558 get_exec_type_size(const elk_fs_inst *inst)
559 {
560 return type_sz(get_exec_type(inst));
561 }
562
563 static inline bool
is_send(const elk_fs_inst * inst)564 is_send(const elk_fs_inst *inst)
565 {
566 return inst->mlen || inst->is_send_from_grf();
567 }
568
569 /**
570 * Return whether the instruction isn't an ALU instruction and cannot be
571 * assumed to complete in-order.
572 */
573 static inline bool
is_unordered(const intel_device_info * devinfo,const elk_fs_inst * inst)574 is_unordered(const intel_device_info *devinfo, const elk_fs_inst *inst)
575 {
576 return is_send(inst) || inst->is_math() ||
577 (devinfo->has_64bit_float_via_math_pipe &&
578 (get_exec_type(inst) == ELK_REGISTER_TYPE_DF ||
579 inst->dst.type == ELK_REGISTER_TYPE_DF));
580 }
581
582 /**
583 * Return whether the following regioning restriction applies to the specified
584 * instruction. From the Cherryview PRM Vol 7. "Register Region
585 * Restrictions":
586 *
587 * "When source or destination datatype is 64b or operation is integer DWord
588 * multiply, regioning in Align1 must follow these rules:
589 *
590 * 1. Source and Destination horizontal stride must be aligned to the same qword.
591 * 2. Regioning must ensure Src.Vstride = Src.Width * Src.Hstride.
592 * 3. Source and Destination offset must be the same, except the case of
593 * scalar source."
594 */
595 static inline bool
has_dst_aligned_region_restriction(const intel_device_info * devinfo,const elk_fs_inst * inst,elk_reg_type dst_type)596 has_dst_aligned_region_restriction(const intel_device_info *devinfo,
597 const elk_fs_inst *inst,
598 elk_reg_type dst_type)
599 {
600 const elk_reg_type exec_type = get_exec_type(inst);
601 /* Even though the hardware spec claims that "integer DWord multiply"
602 * operations are restricted, empirical evidence and the behavior of the
603 * simulator suggest that only 32x32-bit integer multiplication is
604 * restricted.
605 */
606 const bool is_dword_multiply = !elk_reg_type_is_floating_point(exec_type) &&
607 ((inst->opcode == ELK_OPCODE_MUL &&
608 MIN2(type_sz(inst->src[0].type), type_sz(inst->src[1].type)) >= 4) ||
609 (inst->opcode == ELK_OPCODE_MAD &&
610 MIN2(type_sz(inst->src[1].type), type_sz(inst->src[2].type)) >= 4));
611
612 if (type_sz(dst_type) > 4 || type_sz(exec_type) > 4 ||
613 (type_sz(exec_type) == 4 && is_dword_multiply))
614 return devinfo->platform == INTEL_PLATFORM_CHV;
615
616 else
617 return false;
618 }
619
620 static inline bool
has_dst_aligned_region_restriction(const intel_device_info * devinfo,const elk_fs_inst * inst)621 has_dst_aligned_region_restriction(const intel_device_info *devinfo,
622 const elk_fs_inst *inst)
623 {
624 return has_dst_aligned_region_restriction(devinfo, inst, inst->dst.type);
625 }
626
627 /**
628 * Return whether the LOAD_PAYLOAD instruction is a plain copy of bits from
629 * the specified register file into a VGRF.
630 *
631 * This implies identity register regions without any source-destination
632 * overlap, but otherwise has no implications on the location of sources and
633 * destination in the register file: Gathering any number of portions from
634 * multiple virtual registers in any order is allowed.
635 */
636 inline bool
is_copy_payload(elk_reg_file file,const elk_fs_inst * inst)637 is_copy_payload(elk_reg_file file, const elk_fs_inst *inst)
638 {
639 if (inst->opcode != ELK_SHADER_OPCODE_LOAD_PAYLOAD ||
640 inst->is_partial_write() || inst->saturate ||
641 inst->dst.file != VGRF)
642 return false;
643
644 for (unsigned i = 0; i < inst->sources; i++) {
645 if (inst->src[i].file != file ||
646 inst->src[i].abs || inst->src[i].negate)
647 return false;
648
649 if (!inst->src[i].is_contiguous())
650 return false;
651
652 if (regions_overlap(inst->dst, inst->size_written,
653 inst->src[i], inst->size_read(i)))
654 return false;
655 }
656
657 return true;
658 }
659
660 /**
661 * Like is_copy_payload(), but the instruction is required to copy a single
662 * contiguous block of registers from the given register file into the
663 * destination without any reordering.
664 */
665 inline bool
is_identity_payload(elk_reg_file file,const elk_fs_inst * inst)666 is_identity_payload(elk_reg_file file, const elk_fs_inst *inst) {
667 if (is_copy_payload(file, inst)) {
668 elk_fs_reg reg = inst->src[0];
669
670 for (unsigned i = 0; i < inst->sources; i++) {
671 reg.type = inst->src[i].type;
672 if (!inst->src[i].equals(reg))
673 return false;
674
675 reg = byte_offset(reg, inst->size_read(i));
676 }
677
678 return true;
679 } else {
680 return false;
681 }
682 }
683
684 /**
685 * Like is_copy_payload(), but the instruction is required to source data from
686 * at least two disjoint VGRFs.
687 *
688 * This doesn't necessarily rule out the elimination of this instruction
689 * through register coalescing, but due to limitations of the register
690 * coalesce pass it might be impossible to do so directly until a later stage,
691 * when the LOAD_PAYLOAD instruction is unrolled into a sequence of MOV
692 * instructions.
693 */
694 inline bool
is_multi_copy_payload(const elk_fs_inst * inst)695 is_multi_copy_payload(const elk_fs_inst *inst) {
696 if (is_copy_payload(VGRF, inst)) {
697 for (unsigned i = 0; i < inst->sources; i++) {
698 if (inst->src[i].nr != inst->src[0].nr)
699 return true;
700 }
701 }
702
703 return false;
704 }
705
706 /**
707 * Like is_identity_payload(), but the instruction is required to copy the
708 * whole contents of a single VGRF into the destination.
709 *
710 * This means that there is a good chance that the instruction will be
711 * eliminated through register coalescing, but it's neither a necessary nor a
712 * sufficient condition for that to happen -- E.g. consider the case where
713 * source and destination registers diverge due to other instructions in the
714 * program overwriting part of their contents, which isn't something we can
715 * predict up front based on a cheap strictly local test of the copy
716 * instruction.
717 */
718 inline bool
is_coalescing_payload(const elk::simple_allocator & alloc,const elk_fs_inst * inst)719 is_coalescing_payload(const elk::simple_allocator &alloc, const elk_fs_inst *inst)
720 {
721 return is_identity_payload(VGRF, inst) &&
722 inst->src[0].offset == 0 &&
723 alloc.sizes[inst->src[0].nr] * REG_SIZE == inst->size_written;
724 }
725
726 bool
727 elk_has_bank_conflict(const struct elk_isa_info *isa, const elk_fs_inst *inst);
728
729 #endif
730