/*
 * Copyright © 2014 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/** @file elk_fs_combine_constants.cpp
 *
 * This file contains the opt_combine_constants() pass that runs after the
 * regular optimization loop. It passes over the instruction list and
 * selectively promotes immediate values to registers by emitting a mov(1)
 * instruction.
 *
 * This is particularly useful on Gen 7, because a few instructions can be
 * coissued (i.e., issued in the same cycle as another thread on the same EU
 * issues an instruction) under some circumstances, one of which is that they
 * cannot use immediate values.
 */

#include "elk_fs.h"
#include "elk_fs_builder.h"
#include "elk_cfg.h"
#include "util/half_float.h"

using namespace elk;

static const bool debug = false;

namespace {

enum PACKED interpreted_type {
   float_only = 0,
   integer_only,
   either_type
};

struct value {
   /** Raw bit pattern of the value. */
   nir_const_value value;

   /** Instruction that uses this instance of the value. */
   unsigned instr_index;

   /** Size, in bits, of the value. */
   uint8_t bit_size;

   /**
    * Which source of instr is this value?
    *
    * \note This field is not actually used by \c elk_combine_constants, but
    * it is generally very useful to callers.
    */
   uint8_t src;

   /**
    * In what ways can instr interpret this value?
    *
    * Choices are floating-point only, integer only, or either type.
    */
   enum interpreted_type type;

   /**
    * Only try to make a single source non-constant.
    *
    * On some architectures, some instructions require that all sources be
    * non-constant.  For example, the multiply-accumulate instruction on
    * Intel GPUs up to Gen11 requires that all sources be non-constant.
    * Other instructions, like the selection instruction, allow one constant
    * source.
    *
    * If a single constant source is allowed, set this flag to true.
    *
    * If an instruction allows a single constant and it has only a single
    * constant to begin with, it should be included.  Various places in
    * \c combine_constants will assume that there are multiple constants if
    * \c ::allow_one_constant is set.  This may even be enforced by in-code
    * assertions.
    */
   bool allow_one_constant;

   /**
    * Restrict values that can reach this value to not include negations.
    *
    * This is useful for instructions that cannot have source modifiers.  For
    * example, on Intel GPUs the integer source of a shift instruction (e.g.,
    * SHL) can have a source modifier, but the integer source of the bitfield
    * insertion instruction (i.e., BFI2) cannot.  A pair of these instructions
    * might have sources that are negations of each other.  Using this flag
    * will ensure that the BFI2 does not have a negated source, but the SHL
    * might.
    */
   bool no_negations;

   /**
    * \name UtilCombineConstantsPrivate
    * Private data used only by elk_combine_constants
    *
    * Any data stored in these fields will be overwritten by the call to
    * \c elk_combine_constants.  No assumptions should be made about the
    * state of these fields after that function returns.
    */
   /**@{*/
   /** Mask of negations that can be generated from this value. */
   uint8_t reachable_mask;

   /** Mask of negations that can generate this value. */
   uint8_t reaching_mask;

   /**
    * Value with the next source from the same instruction.
    *
    * This pointer may be \c NULL.  If it is not \c NULL, it will form a
    * singly-linked circular list of values.  The list is unordered.  That is,
    * as the list is iterated, the \c ::src values will be in arbitrary order.
    *
    * \todo Is it even possible for there to be more than two elements in this
    * list?  This pass does not operate on vecN instructions or intrinsics, so
    * the theoretical limit should be three.  However, instructions with all
    * constant sources should have been folded away.
    */
   struct value *next_src;
   /**@}*/
};

struct combine_constants_value {
   /** Raw bit pattern of the constant loaded. */
   nir_const_value value;

   /**
    * Index of the first user.
    *
    * This is the offset into \c combine_constants_result::user_map of the
    * first user of this value.
    */
   unsigned first_user;

   /** Number of users of this value. */
   unsigned num_users;

   /** Size, in bits, of the value. */
   uint8_t bit_size;
};

struct combine_constants_user {
   /** Index into the array of values passed to elk_combine_constants. */
   unsigned index;

   /**
    * Manner in which the value should be interpreted in the instruction.
    *
    * This is only useful when ::negate is set.  Unless the corresponding
    * value::type is \c either_type, this field must have the same value as
    * value::type.
    */
   enum interpreted_type type;

   /** Should this value be negated to generate the original value? */
   bool negate;
};

class combine_constants_result {
public:
   combine_constants_result(unsigned num_candidates, bool &success)
      : num_values_to_emit(0), user_map(NULL)
   {
      user_map = (struct combine_constants_user *) calloc(num_candidates,
                                                          sizeof(user_map[0]));

      /* In the worst case, the number of output values will be equal to the
       * number of input values.  Allocate a buffer that is known to be large
       * enough now, and it can be reduced later.
       */
      values_to_emit =
         (struct combine_constants_value *) calloc(num_candidates,
                                                   sizeof(values_to_emit[0]));

      success = (user_map != NULL && values_to_emit != NULL);
   }

   combine_constants_result(const combine_constants_result &) = delete;

   ~combine_constants_result()
   {
      free(values_to_emit);
      free(user_map);
   }

   combine_constants_result & operator=(const combine_constants_result &) = delete;

   void append_value(const nir_const_value &value, unsigned bit_size)
   {
      values_to_emit[num_values_to_emit].value = value;
      values_to_emit[num_values_to_emit].first_user = 0;
      values_to_emit[num_values_to_emit].num_users = 0;
      values_to_emit[num_values_to_emit].bit_size = bit_size;
      num_values_to_emit++;
   }

   unsigned num_values_to_emit;
   struct combine_constants_value *values_to_emit;

   struct combine_constants_user *user_map;
};

#define VALUE_INDEX                  0
#define FLOAT_NEG_INDEX              1
#define INT_NEG_INDEX                2
#define MAX_NUM_REACHABLE            3

#define VALUE_EXISTS                 (1 << VALUE_INDEX)
#define FLOAT_NEG_EXISTS             (1 << FLOAT_NEG_INDEX)
#define INT_NEG_EXISTS               (1 << INT_NEG_INDEX)
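/* For illustration: a 32-bit candidate holding 1.0f and tagged either_type
 * has both negations available (-1.0f by flipping the sign bit, and the
 * two's-complement negation of the bit pattern 0x3f800000), so
 * calculate_masks() below would set both FLOAT_NEG_EXISTS and
 * INT_NEG_EXISTS in its reachable_mask.
 */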

static bool
negation_exists(nir_const_value v, unsigned bit_size,
                enum interpreted_type base_type)
{
   /* either_type does not make sense in this context. */
   assert(base_type == float_only || base_type == integer_only);

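   /* Integer zero and INT*_MIN are excluded below: negating zero is an
    * identity, and -INT*_MIN wraps back to INT*_MIN in two's complement.
    * Float NaNs are excluded because a negated NaN is still a NaN.
    */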
   switch (bit_size) {
   case 8:
      if (base_type == float_only)
         return false;
      else
         return v.i8 != 0 && v.i8 != INT8_MIN;

   case 16:
      if (base_type == float_only)
         return !util_is_half_nan(v.i16);
      else
         return v.i16 != 0 && v.i16 != INT16_MIN;

   case 32:
      if (base_type == float_only)
         return !isnan(v.f32);
      else
         return v.i32 != 0 && v.i32 != INT32_MIN;

   case 64:
      if (base_type == float_only)
         return !isnan(v.f64);
      else
         return v.i64 != 0 && v.i64 != INT64_MIN;

   default:
      unreachable("unsupported bit-size should have already been filtered.");
   }
}

static nir_const_value
negate(nir_const_value v, unsigned bit_size, enum interpreted_type base_type)
{
   /* either_type does not make sense in this context. */
   assert(base_type == float_only || base_type == integer_only);

   nir_const_value ret = { 0, };

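   /* Floating-point negation is a pure sign-bit flip: XOR with INT*_MIN
    * toggles only the most significant bit of the bit pattern.  Integer
    * negation is ordinary two's-complement negation.
    */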
   switch (bit_size) {
   case 8:
      assert(base_type == integer_only);
      ret.i8 = -v.i8;
      break;

   case 16:
      if (base_type == float_only)
         ret.u16 = v.u16 ^ INT16_MIN;
      else
         ret.i16 = -v.i16;
      break;

   case 32:
      if (base_type == float_only)
         ret.u32 = v.u32 ^ INT32_MIN;
      else
         ret.i32 = -v.i32;
      break;

   case 64:
      if (base_type == float_only)
         ret.u64 = v.u64 ^ INT64_MIN;
      else
         ret.i64 = -v.i64;
      break;

   default:
      unreachable("unsupported bit-size should have already been filtered.");
   }

   return ret;
}

static nir_const_value
absolute(nir_const_value v, unsigned bit_size, enum interpreted_type base_type)
{
   /* either_type does not make sense in this context. */
   assert(base_type == float_only || base_type == integer_only);

   nir_const_value ret = { 0, };

   switch (bit_size) {
   case 8:
      assert(base_type == integer_only);
      ret.i8 = abs(v.i8);
      break;

   case 16:
      if (base_type == float_only)
         ret.u16 = v.u16 & 0x7fff;
      else
         ret.i16 = abs(v.i16);
      break;

   case 32:
      if (base_type == float_only)
         ret.f32 = fabs(v.f32);
      else
         ret.i32 = abs(v.i32);
      break;

   case 64:
      if (base_type == float_only)
         ret.f64 = fabs(v.f64);
      else {
         if (sizeof(v.i64) == sizeof(long int)) {
            ret.i64 = labs((long int) v.i64);
         } else {
            assert(sizeof(v.i64) == sizeof(long long int));
            ret.i64 = llabs((long long int) v.i64);
         }
      }
      break;

   default:
      unreachable("unsupported bit-size should have already been filtered.");
   }

   return ret;
}

static void
calculate_masks(nir_const_value v, enum interpreted_type type,
                unsigned bit_size, uint8_t *reachable_mask,
                uint8_t *reaching_mask)
{
   *reachable_mask = 0;
   *reaching_mask = 0;

   /* Calculate the extended reachable mask. */
   if (type == float_only || type == either_type) {
      if (negation_exists(v, bit_size, float_only))
         *reachable_mask |= FLOAT_NEG_EXISTS;
   }

   if (type == integer_only || type == either_type) {
      if (negation_exists(v, bit_size, integer_only))
         *reachable_mask |= INT_NEG_EXISTS;
   }

   /* Calculate the extended reaching mask.  All of the "is this negation
    * possible" checks were already done for the reachable_mask, so reuse
    * that data.
    */
   if (type == float_only || type == either_type) {
      if (*reachable_mask & FLOAT_NEG_EXISTS)
         *reaching_mask |= FLOAT_NEG_EXISTS;
   }

   if (type == integer_only || type == either_type) {
      if (*reachable_mask & INT_NEG_EXISTS)
         *reaching_mask |= INT_NEG_EXISTS;
   }
}

static void
calculate_reachable_values(nir_const_value v,
                           unsigned bit_size,
                           unsigned reachable_mask,
                           nir_const_value *reachable_values)
{
   memset(reachable_values, 0, MAX_NUM_REACHABLE * sizeof(reachable_values[0]));

   reachable_values[VALUE_INDEX] = v;

   if (reachable_mask & INT_NEG_EXISTS) {
      const nir_const_value neg = negate(v, bit_size, integer_only);

      reachable_values[INT_NEG_INDEX] = neg;
   }

   if (reachable_mask & FLOAT_NEG_EXISTS) {
      const nir_const_value neg = negate(v, bit_size, float_only);

      reachable_values[FLOAT_NEG_INDEX] = neg;
   }
}

static bool
value_equal(nir_const_value a, nir_const_value b, unsigned bit_size)
{
   switch (bit_size) {
   case 8:
      return a.u8 == b.u8;
   case 16:
      return a.u16 == b.u16;
   case 32:
      return a.u32 == b.u32;
   case 64:
      return a.u64 == b.u64;
   default:
      unreachable("unsupported bit-size should have already been filtered.");
   }
}

/** Can these values be the same with one level of negation? */
static bool
value_can_equal(const nir_const_value *from, uint8_t reachable_mask,
                nir_const_value to, uint8_t reaching_mask,
                unsigned bit_size)
{
   const uint8_t combined_mask = reachable_mask & reaching_mask;
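   /* A negated match only counts when the negation can both be generated
    * from the source value (reachable_mask) and generate the destination
    * value (reaching_mask); intersecting the two masks enforces both
    * conditions at once.
    */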

   return value_equal(from[VALUE_INDEX], to, bit_size) ||
          ((combined_mask & INT_NEG_EXISTS) &&
           value_equal(from[INT_NEG_INDEX], to, bit_size)) ||
          ((combined_mask & FLOAT_NEG_EXISTS) &&
           value_equal(from[FLOAT_NEG_INDEX], to, bit_size));
}

static void
preprocess_candidates(struct value *candidates, unsigned num_candidates)
{
   /* Calculate the reaching_mask and reachable_mask for each candidate. */
   for (unsigned i = 0; i < num_candidates; i++) {
      calculate_masks(candidates[i].value,
                      candidates[i].type,
                      candidates[i].bit_size,
                      &candidates[i].reachable_mask,
                      &candidates[i].reaching_mask);

      /* If negations are not allowed, then only the original value is
       * reaching.
       */
      if (candidates[i].no_negations)
         candidates[i].reaching_mask = 0;
   }

   for (unsigned i = 0; i < num_candidates; i++)
      candidates[i].next_src = NULL;

   for (unsigned i = 0; i < num_candidates - 1; i++) {
      if (candidates[i].next_src != NULL)
         continue;

      struct value *prev = &candidates[i];

      for (unsigned j = i + 1; j < num_candidates; j++) {
         if (candidates[i].instr_index == candidates[j].instr_index) {
            prev->next_src = &candidates[j];
            prev = prev->next_src;
         }
      }

      /* Close the cycle. */
      if (prev != &candidates[i])
         prev->next_src = &candidates[i];
   }
}

static bool
reaching_value_exists(const struct value *c,
                      const struct combine_constants_value *values,
                      unsigned num_values)
{
   nir_const_value reachable_values[MAX_NUM_REACHABLE];

   calculate_reachable_values(c->value, c->bit_size, c->reaching_mask,
                              reachable_values);

   /* Check to see if the value is already in the result set. */
   for (unsigned j = 0; j < num_values; j++) {
      if (c->bit_size == values[j].bit_size &&
          value_can_equal(reachable_values, c->reaching_mask,
                          values[j].value, c->reaching_mask,
                          c->bit_size)) {
         return true;
      }
   }

   return false;
}

static combine_constants_result *
combine_constants_greedy(struct value *candidates, unsigned num_candidates)
{
   bool success;
   combine_constants_result *result =
      new combine_constants_result(num_candidates, success);
   if (result == NULL || !success) {
      delete result;
      return NULL;
   }

   BITSET_WORD *remain =
      (BITSET_WORD *) calloc(BITSET_WORDS(num_candidates), sizeof(remain[0]));

   if (remain == NULL) {
      delete result;
      return NULL;
   }

   memset(remain, 0xff, BITSET_WORDS(num_candidates) * sizeof(remain[0]));
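   /* Every bit in remain starts set: a set bit means the corresponding
    * candidate has not yet been assigned to an output value.  Bits are
    * cleared as the passes below consume candidates.
    */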

   /* Operate in three passes.  The first pass handles all values that must be
    * emitted and for which a negation cannot exist.
    */
   unsigned i;
   for (i = 0; i < num_candidates; i++) {
      if (candidates[i].allow_one_constant ||
          (candidates[i].reaching_mask & (FLOAT_NEG_EXISTS | INT_NEG_EXISTS))) {
         continue;
      }

      /* Check to see if the value is already in the result set. */
      bool found = false;
      const unsigned num_values = result->num_values_to_emit;
      for (unsigned j = 0; j < num_values; j++) {
         if (candidates[i].bit_size == result->values_to_emit[j].bit_size &&
             value_equal(candidates[i].value,
                         result->values_to_emit[j].value,
                         candidates[i].bit_size)) {
            found = true;
            break;
         }
      }

      if (!found)
         result->append_value(candidates[i].value, candidates[i].bit_size);

      BITSET_CLEAR(remain, i);
   }

   /* The second pass handles all values that must be emitted and for which a
    * negation can exist.
    */
   BITSET_FOREACH_SET(i, remain, num_candidates) {
      if (candidates[i].allow_one_constant)
         continue;

      assert(candidates[i].reaching_mask & (FLOAT_NEG_EXISTS | INT_NEG_EXISTS));

      if (!reaching_value_exists(&candidates[i], result->values_to_emit,
                                 result->num_values_to_emit)) {
         result->append_value(absolute(candidates[i].value,
                                       candidates[i].bit_size,
                                       candidates[i].type),
                              candidates[i].bit_size);
      }

      BITSET_CLEAR(remain, i);
   }

   /* The third pass handles all of the values that may not have to be
    * emitted.  These are the values where allow_one_constant is set.
    */
   BITSET_FOREACH_SET(i, remain, num_candidates) {
      assert(candidates[i].allow_one_constant);

      /* The BITSET_FOREACH_SET macro does not detect changes to the bitset
       * that occur within the current word.  Since code in this loop may
       * clear bits from the set, re-test here.
       */
      if (!BITSET_TEST(remain, i))
         continue;

      assert(candidates[i].next_src != NULL);

      const struct value *const other_candidate = candidates[i].next_src;
      const unsigned j = other_candidate - candidates;

      if (!reaching_value_exists(&candidates[i], result->values_to_emit,
                                 result->num_values_to_emit)) {
         /* Before emitting a value, see if a match for the other source of
          * the instruction exists.
          */
         if (!reaching_value_exists(&candidates[j], result->values_to_emit,
                                    result->num_values_to_emit)) {
            result->append_value(candidates[i].value, candidates[i].bit_size);
         }
      }

      /* Mark both sources as handled. */
      BITSET_CLEAR(remain, i);
      BITSET_CLEAR(remain, j);
   }

   /* As noted above, there will never be more values in the output than in
    * the input.  If there are fewer values, reduce the size of the
    * allocation.
    */
   if (result->num_values_to_emit < num_candidates) {
      result->values_to_emit = (struct combine_constants_value *)
         realloc(result->values_to_emit, sizeof(result->values_to_emit[0]) *
                 result->num_values_to_emit);

      /* Is it even possible for a reducing realloc to fail? */
      assert(result->values_to_emit != NULL);
   }

   /* Create the mapping from "combined" constants to the list of candidates
    * passed in by the caller.
    */
   memset(remain, 0xff, BITSET_WORDS(num_candidates) * sizeof(remain[0]));

   unsigned total_users = 0;

   const unsigned num_values = result->num_values_to_emit;
   for (unsigned value_idx = 0; value_idx < num_values; value_idx++) {
      result->values_to_emit[value_idx].first_user = total_users;

      uint8_t reachable_mask;
      uint8_t unused_mask;

      calculate_masks(result->values_to_emit[value_idx].value, either_type,
                      result->values_to_emit[value_idx].bit_size,
                      &reachable_mask, &unused_mask);

      nir_const_value reachable_values[MAX_NUM_REACHABLE];

      calculate_reachable_values(result->values_to_emit[value_idx].value,
                                 result->values_to_emit[value_idx].bit_size,
                                 reachable_mask, reachable_values);

      for (unsigned i = 0; i < num_candidates; i++) {
         bool matched = false;

         if (!BITSET_TEST(remain, i))
            continue;

         if (candidates[i].bit_size != result->values_to_emit[value_idx].bit_size)
            continue;

         if (value_equal(candidates[i].value, result->values_to_emit[value_idx].value,
                         result->values_to_emit[value_idx].bit_size)) {
            result->user_map[total_users].index = i;
            result->user_map[total_users].type = candidates[i].type;
            result->user_map[total_users].negate = false;
            total_users++;

            matched = true;
            BITSET_CLEAR(remain, i);
         } else {
            const uint8_t combined_mask = reachable_mask &
                                          candidates[i].reaching_mask;

            enum interpreted_type type = either_type;

            if ((combined_mask & INT_NEG_EXISTS) &&
                value_equal(candidates[i].value,
                            reachable_values[INT_NEG_INDEX],
                            candidates[i].bit_size)) {
               type = integer_only;
            }

            if (type == either_type &&
                (combined_mask & FLOAT_NEG_EXISTS) &&
                value_equal(candidates[i].value,
                            reachable_values[FLOAT_NEG_INDEX],
                            candidates[i].bit_size)) {
               type = float_only;
            }

            if (type != either_type) {
               /* Finding a match on this path implies that the user must
                * allow source negations.
                */
               assert(!candidates[i].no_negations);

               result->user_map[total_users].index = i;
               result->user_map[total_users].type = type;
               result->user_map[total_users].negate = true;
               total_users++;

               matched = true;
               BITSET_CLEAR(remain, i);
            }
         }

         /* Mark the other source of instructions that can have a constant
          * source.  Selection is the prime example of this, and we want to
          * avoid generating sequences like bcsel(a, fneg(b), ineg(c)).
          *
          * This also makes sure that the assertion (below) that *all* values
          * were processed holds even when some values may be allowed to
          * remain as constants.
          *
          * FINISHME: There may be value in only doing this when type ==
          * either_type.  If both sources are loaded, a register allocator may
          * be able to make a better choice about which value to "spill"
          * (i.e., replace with an immediate) under heavy register pressure.
          */
         if (matched && candidates[i].allow_one_constant) {
            const struct value *const other_src = candidates[i].next_src;
            const unsigned idx = other_src - candidates;

            assert(idx < num_candidates);
            BITSET_CLEAR(remain, idx);
         }
      }

      assert(total_users > result->values_to_emit[value_idx].first_user);
      result->values_to_emit[value_idx].num_users =
         total_users - result->values_to_emit[value_idx].first_user;
   }

   /* Verify that all of the values were emitted by the loop above.  If any
    * bits are still set in remain, then some value was not emitted.  The use
    * of memset to populate remain prevents the use of a more performant loop.
    */
#ifndef NDEBUG
   bool pass = true;

   BITSET_FOREACH_SET(i, remain, num_candidates) {
      fprintf(stderr, "candidate %d was not processed: { "
              ".b = %s, "
              ".f32 = %f, .f64 = %g, "
              ".i8 = %d, .u8 = 0x%02x, "
              ".i16 = %d, .u16 = 0x%04x, "
              ".i32 = %d, .u32 = 0x%08x, "
              ".i64 = %" PRId64 ", .u64 = 0x%016" PRIx64 " }\n",
              i,
              candidates[i].value.b ? "true" : "false",
              candidates[i].value.f32, candidates[i].value.f64,
              candidates[i].value.i8,  candidates[i].value.u8,
              candidates[i].value.i16, candidates[i].value.u16,
              candidates[i].value.i32, candidates[i].value.u32,
              candidates[i].value.i64, candidates[i].value.u64);
      pass = false;
   }

   assert(pass && "All values should have been processed.");
#endif

   free(remain);

   return result;
}

static combine_constants_result *
elk_combine_constants(struct value *candidates, unsigned num_candidates)
{
   preprocess_candidates(candidates, num_candidates);

   return combine_constants_greedy(candidates, num_candidates);
}

/* Returns whether an instruction could co-issue if its immediate source were
 * replaced with a GRF source.
 */
static bool
could_coissue(const struct intel_device_info *devinfo, const elk_fs_inst *inst)
{
   assert(inst->opcode == ELK_OPCODE_MOV ||
          inst->opcode == ELK_OPCODE_CMP ||
          inst->opcode == ELK_OPCODE_ADD ||
          inst->opcode == ELK_OPCODE_MUL);

   if (devinfo->ver != 7)
      return false;

   /* Only float instructions can coissue.  We don't have a great
    * understanding of whether or not something like float(int(a) + int(b))
    * would be considered float (based on the destination type) or integer
    * (based on the source types), so we take the conservative choice of
    * only promoting when both destination and source are float.
    */
   return inst->dst.type == ELK_REGISTER_TYPE_F &&
          inst->src[0].type == ELK_REGISTER_TYPE_F;
}

/**
 * Box for storing elk_fs_inst and some other necessary data
 *
 * \sa box_instruction
 */
struct fs_inst_box {
   elk_fs_inst *inst;
   unsigned ip;
   elk_bblock_t *block;
   bool must_promote;
};

/** A box for putting fs_regs in a linked list. */
struct reg_link {
   DECLARE_RALLOC_CXX_OPERATORS(reg_link)

   reg_link(elk_fs_inst *inst, unsigned src, bool negate, enum interpreted_type type)
   : inst(inst), src(src), negate(negate), type(type) {}

   struct exec_node link;
   elk_fs_inst *inst;
   uint8_t src;
   bool negate;
   enum interpreted_type type;
};

static struct exec_node *
link(void *mem_ctx, elk_fs_inst *inst, unsigned src, bool negate,
     enum interpreted_type type)
{
   reg_link *l = new(mem_ctx) reg_link(inst, src, negate, type);
   return &l->link;
}

/**
 * Information about an immediate value.
 */
struct imm {
   /** The common ancestor of all blocks using this immediate value. */
   elk_bblock_t *block;

   /**
    * The instruction generating the immediate value, if all uses are contained
    * within a single basic block. Otherwise, NULL.
    */
   elk_fs_inst *inst;

   /**
    * A list of fs_regs that refer to this immediate.  If we promote it, we'll
    * have to patch these up to refer to the new GRF.
    */
   exec_list *uses;

   /** The immediate value */
   union {
      char bytes[8];
      double df;
      int64_t d64;
      float f;
      int32_t d;
      int16_t w;
   };
   uint8_t size;

   /** When promoting half-float we need to account for certain restrictions */
   bool is_half_float;

   /**
    * The GRF register and subregister number where we've decided to store the
    * constant value.
    */
   uint8_t subreg_offset;
   uint16_t nr;

   /** The number of coissuable instructions using this immediate. */
   uint16_t uses_by_coissue;

   /**
    * Whether this constant is used by an instruction that can't handle an
    * immediate source (and already has to be promoted to a GRF).
    */
   bool must_promote;

   /** Is the value used only in a single basic block? */
   bool used_in_single_block;

   uint16_t first_use_ip;
   uint16_t last_use_ip;
};

/** The working set of information about immediates. */
struct table {
   struct value *values;
   int size;
   int num_values;

   struct imm *imm;
   int len;

   struct fs_inst_box *boxes;
   unsigned num_boxes;
   unsigned size_boxes;
};

static struct value *
new_value(struct table *table, void *mem_ctx)
{
   if (table->num_values == table->size) {
      table->size *= 2;
      table->values = reralloc(mem_ctx, table->values, struct value, table->size);
   }
   return &table->values[table->num_values++];
}

/**
 * Store an instruction with some other data in a table.
 *
 * \returns the index into the dynamic array of boxes for the instruction.
 */
static unsigned
box_instruction(struct table *table, void *mem_ctx, elk_fs_inst *inst,
                unsigned ip, elk_bblock_t *block, bool must_promote)
{
   /* It is common for box_instruction to be called consecutively for each
    * source of an instruction.  As a result, the most common case for finding
    * an instruction in the table is when that instruction was the last one
    * added.  Search the list back to front.
    */
   for (unsigned i = table->num_boxes; i > 0; /* empty */) {
      i--;

      if (table->boxes[i].inst == inst)
         return i;
   }

   if (table->num_boxes == table->size_boxes) {
      table->size_boxes *= 2;
      table->boxes = reralloc(mem_ctx, table->boxes, fs_inst_box,
                              table->size_boxes);
   }

   assert(table->num_boxes < table->size_boxes);

   const unsigned idx = table->num_boxes++;
   fs_inst_box *ib = &table->boxes[idx];

   ib->inst = inst;
   ib->block = block;
   ib->ip = ip;
   ib->must_promote = must_promote;

   return idx;
}

/**
 * Comparator used for sorting an array of imm structures.
 *
 * We sort by basic block number, then last use IP, then first use IP (least
 * to greatest). This sorting causes immediates live in the same area to be
 * allocated to the same register in the hopes that all values will be dead
 * about the same time and the register can be reused.
 */
static int
compare(const void *_a, const void *_b)
{
   const struct imm *a = (const struct imm *)_a,
                    *b = (const struct imm *)_b;

   int block_diff = a->block->num - b->block->num;
   if (block_diff)
      return block_diff;

   int end_diff = a->last_use_ip - b->last_use_ip;
   if (end_diff)
      return end_diff;

   return a->first_use_ip - b->first_use_ip;
}

static struct elk_reg
build_imm_reg_for_copy(struct imm *imm)
{
   switch (imm->size) {
   case 8:
      return elk_imm_d(imm->d64);
   case 4:
      return elk_imm_d(imm->d);
   case 2:
      return elk_imm_w(imm->w);
   default:
      unreachable("not implemented");
   }
}

static inline uint32_t
get_alignment_for_imm(const struct imm *imm)
{
   if (imm->is_half_float)
      return 4; /* At least MAD seems to require this */
   else
      return imm->size;
}

static void
add_candidate_immediate(struct table *table, elk_fs_inst *inst, unsigned ip,
                        unsigned i,
                        bool must_promote,
                        bool allow_one_constant,
                        elk_bblock_t *block,
                        const struct intel_device_info *devinfo,
                        void *const_ctx)
{
   struct value *v = new_value(table, const_ctx);

   unsigned box_idx = box_instruction(table, const_ctx, inst, ip, block,
                                      must_promote);

   v->value.u64 = inst->src[i].d64;
   v->bit_size = 8 * type_sz(inst->src[i].type);
   v->instr_index = box_idx;
   v->src = i;
   v->allow_one_constant = allow_one_constant;

   /* Right-shift instructions are special.  They can have source modifiers,
    * but changing the type can change the semantics of the instruction.
    * Only allow negations on a right shift if the source type is already
    * signed.
    */
   v->no_negations = !inst->can_do_source_mods(devinfo) ||
                     ((inst->opcode == ELK_OPCODE_SHR ||
                       inst->opcode == ELK_OPCODE_ASR) &&
                      elk_reg_type_is_unsigned_integer(inst->src[i].type));

   switch (inst->src[i].type) {
   case ELK_REGISTER_TYPE_DF:
   case ELK_REGISTER_TYPE_NF:
   case ELK_REGISTER_TYPE_F:
   case ELK_REGISTER_TYPE_HF:
      v->type = float_only;
      break;

   case ELK_REGISTER_TYPE_UQ:
   case ELK_REGISTER_TYPE_Q:
   case ELK_REGISTER_TYPE_UD:
   case ELK_REGISTER_TYPE_D:
   case ELK_REGISTER_TYPE_UW:
   case ELK_REGISTER_TYPE_W:
      v->type = integer_only;
      break;

   case ELK_REGISTER_TYPE_VF:
   case ELK_REGISTER_TYPE_UV:
   case ELK_REGISTER_TYPE_V:
   case ELK_REGISTER_TYPE_UB:
   case ELK_REGISTER_TYPE_B:
   default:
      unreachable("not reached");
   }

   /* It is safe to change the type of the operands of a select instruction
    * that has no conditional modifier, no source modifiers, and no saturate
    * modifier.
    */
   if (inst->opcode == ELK_OPCODE_SEL &&
       inst->conditional_mod == ELK_CONDITIONAL_NONE &&
       !inst->src[0].negate && !inst->src[0].abs &&
       !inst->src[1].negate && !inst->src[1].abs &&
       !inst->saturate) {
      v->type = either_type;
   }
}

struct register_allocation {
   /** VGRF for storing values. */
   unsigned nr;

   /**
    * Mask of currently available slots in this register.
    *
    * Each register has 16 16-bit slots.  Allocations require 1, 2, or 4
    * slots for word, double-word, or quad-word values, respectively.
    */
   uint16_t avail;
};

static elk_fs_reg
allocate_slots(struct register_allocation *regs, unsigned num_regs,
               unsigned bytes, unsigned align_bytes,
               elk::simple_allocator &alloc)
{
   assert(bytes == 2 || bytes == 4 || bytes == 8);
   assert(align_bytes == 2 || align_bytes == 4 || align_bytes == 8);

   const unsigned words = bytes / 2;
   const unsigned align_words = align_bytes / 2;
   const uint16_t mask = (1U << words) - 1;
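   /* Example: an 8-byte value needs words == 4 contiguous free slots, so
    * mask == 0b1111.  Shifting avail right by j and testing (x & mask) ==
    * mask below checks that slots j through j+3 are all free.
    */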

   for (unsigned i = 0; i < num_regs; i++) {
      for (unsigned j = 0; j <= (16 - words); j += align_words) {
         const uint16_t x = regs[i].avail >> j;

         if ((x & mask) == mask) {
            if (regs[i].nr == UINT_MAX)
               regs[i].nr = alloc.allocate(1);

            regs[i].avail &= ~(mask << j);

            elk_fs_reg reg(VGRF, regs[i].nr);
            reg.offset = j * 2;

            return reg;
         }
      }
   }

   unreachable("No free slots found.");
}

static void
deallocate_slots(struct register_allocation *regs, unsigned num_regs,
                 unsigned reg_nr, unsigned subreg_offset, unsigned bytes)
{
   assert(bytes == 2 || bytes == 4 || bytes == 8);
   assert(subreg_offset % 2 == 0);
   assert(subreg_offset + bytes <= 32);

   const unsigned words = bytes / 2;
   const unsigned offset = subreg_offset / 2;
   const uint16_t mask = ((1U << words) - 1) << offset;

   for (unsigned i = 0; i < num_regs; i++) {
      if (regs[i].nr == reg_nr) {
         regs[i].avail |= mask;
         return;
      }
   }

   unreachable("No such register found.");
}

static void
parcel_out_registers(struct imm *imm, unsigned len, const elk_bblock_t *cur_block,
                     struct register_allocation *regs, unsigned num_regs,
                     elk::simple_allocator &alloc, unsigned ver)
{
   /* Each basic block has two distinct sets of constants.  There is the set
    * of constants that only have uses in that block, and there is the set of
    * constants that have uses after that block.
    *
    * Allocation proceeds in three passes.
    *
    * 1. Allocate space for the values that are used outside this block.
    *
    * 2. Allocate space for the values that are used only in this block.
    *
    * 3. Deallocate the space for the values that are used only in this block.
    */

   for (unsigned pass = 0; pass < 2; pass++) {
      const bool used_in_single_block = pass != 0;

      for (unsigned i = 0; i < len; i++) {
         if (imm[i].block == cur_block &&
             imm[i].used_in_single_block == used_in_single_block) {
            /* From the BDW and CHV PRM, 3D Media GPGPU, Special Restrictions:
             *
             *   "In Align16 mode, the channel selects and channel enables apply
             *    to a pair of half-floats, because these parameters are defined
             *    for DWord elements ONLY. This is applicable when both source
             *    and destination are half-floats."
             *
             * This means that Align16 instructions that use promoted HF
             * immediates and use a <0,1,0>:HF region would read 2 HF slots
             * instead of replicating the single one we want. To avoid this, we
             * always populate both HF slots within a DWord with the constant.
             */
            const unsigned width = ver == 8 && imm[i].is_half_float ? 2 : 1;

            const elk_fs_reg reg = allocate_slots(regs, num_regs,
                                                  imm[i].size * width,
                                                  get_alignment_for_imm(&imm[i]),
                                                  alloc);

            imm[i].nr = reg.nr;
            imm[i].subreg_offset = reg.offset;
         }
      }
   }

   for (unsigned i = 0; i < len; i++) {
      if (imm[i].block == cur_block && imm[i].used_in_single_block) {
         const unsigned width = ver == 8 && imm[i].is_half_float ? 2 : 1;

         deallocate_slots(regs, num_regs, imm[i].nr, imm[i].subreg_offset,
                          imm[i].size * width);
      }
   }
}

} /* namespace */

bool
elk_fs_visitor::opt_combine_constants()
{
   void *const_ctx = ralloc_context(NULL);

   struct table table;

   /* For each of the dynamic arrays in the table, allocate about a page of
    * memory.  On LP64 systems, this gives 126 value objects and 169
    * fs_inst_box objects.  Even larger shaders that have been observed
    * rarely need more than 20 or 30 values.  Most smaller shaders, which is
    * most shaders, need at most a couple dozen fs_inst_box.
    */
   table.size = (4096 - (5 * sizeof(void *))) / sizeof(struct value);
   table.num_values = 0;
   table.values = ralloc_array(const_ctx, struct value, table.size);

   table.size_boxes = (4096 - (5 * sizeof(void *))) / sizeof(struct fs_inst_box);
   table.num_boxes = 0;
   table.boxes = ralloc_array(const_ctx, fs_inst_box, table.size_boxes);

   const elk::idom_tree &idom = idom_analysis.require();
   unsigned ip = -1;
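   /* ip is pre-incremented at the top of the instruction walk below, so
    * starting at -1 (which wraps, since ip is unsigned) makes the first
    * instruction's IP 0.
    */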

   /* Make a pass through all instructions and count the number of times each
    * constant is used by coissuable instructions or instructions that cannot
    * take immediate arguments.
    */
   foreach_block_and_inst(block, elk_fs_inst, inst, cfg) {
      ip++;

      switch (inst->opcode) {
      case ELK_SHADER_OPCODE_INT_QUOTIENT:
      case ELK_SHADER_OPCODE_INT_REMAINDER:
      case ELK_SHADER_OPCODE_POW:
         if (inst->src[0].file == IMM) {
            assert(inst->opcode != ELK_SHADER_OPCODE_POW);

            add_candidate_immediate(&table, inst, ip, 0, true, false, block,
                                    devinfo, const_ctx);
         }

         if (inst->src[1].file == IMM && devinfo->ver < 8) {
            add_candidate_immediate(&table, inst, ip, 1, true, false, block,
                                    devinfo, const_ctx);
         }

         break;

      case ELK_OPCODE_MAD: {
         for (int i = 0; i < inst->sources; i++) {
            if (inst->src[i].file != IMM)
               continue;

            add_candidate_immediate(&table, inst, ip, i, true, false, block,
                                    devinfo, const_ctx);
         }

         break;
      }

      case ELK_OPCODE_BFE:
      case ELK_OPCODE_BFI2:
      case ELK_OPCODE_LRP:
         for (int i = 0; i < inst->sources; i++) {
            if (inst->src[i].file != IMM)
               continue;

            add_candidate_immediate(&table, inst, ip, i, true, false, block,
                                    devinfo, const_ctx);
         }

         break;

      case ELK_OPCODE_SEL:
         if (inst->src[0].file == IMM) {
            /* It is possible to have src0 be immediate but src1 not be
             * immediate for the non-commutative conditional modifiers (e.g.,
             * G).
             */
            if (inst->conditional_mod == ELK_CONDITIONAL_NONE ||
                /* Only GE and L are commutative. */
                inst->conditional_mod == ELK_CONDITIONAL_GE ||
                inst->conditional_mod == ELK_CONDITIONAL_L) {
               assert(inst->src[1].file == IMM);

               add_candidate_immediate(&table, inst, ip, 0, true, true, block,
                                       devinfo, const_ctx);
               add_candidate_immediate(&table, inst, ip, 1, true, true, block,
                                       devinfo, const_ctx);
            } else {
               add_candidate_immediate(&table, inst, ip, 0, true, false, block,
                                       devinfo, const_ctx);
            }
         }
         break;

      case ELK_OPCODE_ASR:
      case ELK_OPCODE_BFI1:
      case ELK_OPCODE_SHL:
      case ELK_OPCODE_SHR:
         if (inst->src[0].file == IMM) {
            add_candidate_immediate(&table, inst, ip, 0, true, false, block,
                                    devinfo, const_ctx);
         }
         break;

      case ELK_OPCODE_MOV:
         if (could_coissue(devinfo, inst) && inst->src[0].file == IMM) {
            add_candidate_immediate(&table, inst, ip, 0, false, false, block,
                                    devinfo, const_ctx);
         }
         break;

      case ELK_OPCODE_CMP:
      case ELK_OPCODE_ADD:
      case ELK_OPCODE_MUL:
         assert(inst->src[0].file != IMM);

         if (could_coissue(devinfo, inst) && inst->src[1].file == IMM) {
            add_candidate_immediate(&table, inst, ip, 1, false, false, block,
                                    devinfo, const_ctx);
         }
         break;

      default:
         break;
      }
   }

   if (table.num_values == 0) {
      ralloc_free(const_ctx);
      return false;
   }

   combine_constants_result *result =
      elk_combine_constants(table.values, table.num_values);

   table.imm = ralloc_array(const_ctx, struct imm, result->num_values_to_emit);
   table.len = 0;

   for (unsigned i = 0; i < result->num_values_to_emit; i++) {
      struct imm *imm = &table.imm[table.len];

      imm->block = NULL;
      imm->inst = NULL;
      imm->d64 = result->values_to_emit[i].value.u64;
      imm->size = result->values_to_emit[i].bit_size / 8;

      imm->uses_by_coissue = 0;
      imm->must_promote = false;
      imm->is_half_float = false;

      imm->first_use_ip = UINT16_MAX;
      imm->last_use_ip = 0;

      imm->uses = new(const_ctx) exec_list;

      const unsigned first_user = result->values_to_emit[i].first_user;
      const unsigned last_user = first_user +
         result->values_to_emit[i].num_users;

      for (unsigned j = first_user; j < last_user; j++) {
         const unsigned idx = table.values[result->user_map[j].index].instr_index;
         fs_inst_box *const ib = &table.boxes[idx];

         const unsigned src = table.values[result->user_map[j].index].src;

         imm->uses->push_tail(link(const_ctx, ib->inst, src,
                                   result->user_map[j].negate,
                                   result->user_map[j].type));

         if (ib->must_promote)
            imm->must_promote = true;
         else
            imm->uses_by_coissue++;

         if (imm->block == NULL) {
            /* Block should only be NULL on the first pass.  On the first
             * pass, inst should also be NULL.
             */
            assert(imm->inst == NULL);

            imm->inst = ib->inst;
            imm->block = ib->block;
            imm->first_use_ip = ib->ip;
            imm->last_use_ip = ib->ip;
            imm->used_in_single_block = true;
         } else {
            elk_bblock_t *intersection = idom.intersect(ib->block,
                                                        imm->block);

            if (ib->block != imm->block)
               imm->used_in_single_block = false;

            if (imm->first_use_ip > ib->ip) {
               imm->first_use_ip = ib->ip;

               /* If the first-use instruction is to be tracked, block must be
                * the block that contains it.  The old block was read in the
                * idom.intersect call above, so it is safe to overwrite it
                * here.
                */
               imm->inst = ib->inst;
               imm->block = ib->block;
            }

            if (imm->last_use_ip < ib->ip)
               imm->last_use_ip = ib->ip;

            /* The common dominator is not the block that contains the
             * first-use instruction, so don't track that instruction.  The
             * load instruction will be added in the common dominator block
             * instead of before the first-use instruction.
             */
            if (intersection != imm->block)
               imm->inst = NULL;

            imm->block = intersection;
         }

         if (ib->inst->src[src].type == ELK_REGISTER_TYPE_HF)
            imm->is_half_float = true;
      }

      /* Remove constants from the table that don't have enough uses to make
       * them profitable to store in a register.
       */
      if (imm->must_promote || imm->uses_by_coissue >= 4)
         table.len++;
   }

   delete result;

   if (table.len == 0) {
      ralloc_free(const_ctx);
      return false;
   }
   if (cfg->num_blocks != 1)
      qsort(table.imm, table.len, sizeof(struct imm), compare);

   if (devinfo->ver > 7) {
      struct register_allocation *regs =
         (struct register_allocation *) calloc(table.len, sizeof(regs[0]));

      for (int i = 0; i < table.len; i++) {
         regs[i].nr = UINT_MAX;
         regs[i].avail = 0xffff;
      }

      foreach_block(block, cfg) {
         parcel_out_registers(table.imm, table.len, block, regs, table.len,
                              alloc, devinfo->ver);
      }

      free(regs);
   } else {
      elk_fs_reg reg(VGRF, alloc.allocate(1));
      reg.stride = 0;

      for (int i = 0; i < table.len; i++) {
         struct imm *imm = &table.imm[i];

         /* Put the immediate in an offset aligned to its size. Some
          * instructions seem to have additional alignment requirements, so
          * account for that too.
          */
         reg.offset = ALIGN(reg.offset, get_alignment_for_imm(imm));

         /* Ensure we have enough space in the register to copy the immediate */
         if (reg.offset + imm->size > REG_SIZE) {
            reg.nr = alloc.allocate(1);
            reg.offset = 0;
         }

         imm->nr = reg.nr;
         imm->subreg_offset = reg.offset;

         reg.offset += imm->size;
      }
   }
1478 
1479    bool rebuild_cfg = false;
1480 
1481    /* Insert MOVs to load the constant values into GRFs. */
1482    for (int i = 0; i < table.len; i++) {
1483       struct imm *imm = &table.imm[i];
1484 
1485       /* Insert it either before the instruction that generated the immediate
1486        * or after the last non-control flow instruction of the common ancestor.
1487        */
1488       exec_node *n;
1489       elk_bblock_t *insert_block;
1490       if (imm->inst != nullptr) {
1491          n = imm->inst;
1492          insert_block = imm->block;
1493       } else {
1494          if (imm->block->start()->opcode == ELK_OPCODE_DO) {
1495             /* DO blocks are weird. They can contain only the single DO
1496              * instruction. As a result, MOV instructions cannot be added to
1497              * the DO block.
1498              */
1499             elk_bblock_t *next_block = imm->block->next();
1500             if (next_block->starts_with_control_flow()) {
               /* This is the difficult case. It occurs for code like
                *
                *    do {
                *       do {
                *          ...
                *       } while (...);
                *    } while (...);
                *
                * when the MOV instructions need to be inserted between the
                * two DO instructions.
                *
                * To handle this scenario properly, a new block would have to
                * be inserted, which would require updating arbitrarily many
                * CONTINUE, BREAK, and WHILE instructions to point at the new
                * block.
                *
                * Getting all of that right is unlikely. Instead, insert the
                * MOV instructions in the known-wrong place and rebuild the
                * CFG at the end of the pass.
                */
               insert_block = imm->block;
               n = insert_block->last_non_control_flow_inst()->next;

               rebuild_cfg = true;
            } else {
               insert_block = next_block;
               n = insert_block->start();
            }
         } else {
            insert_block = imm->block;
            n = insert_block->last_non_control_flow_inst()->next;
         }
      }

      /* From the BDW and CHV PRM, 3D Media GPGPU, Special Restrictions:
       *
       *   "In Align16 mode, the channel selects and channel enables apply to a
       *    pair of half-floats, because these parameters are defined for DWord
       *    elements ONLY. This is applicable when both source and destination
       *    are half-floats."
       *
       * This means that Align16 instructions that use promoted HF immediates
       * and use a <0,1,0>:HF region would read 2 HF slots instead of
       * replicating the single one we want. To avoid this, we always populate
       * both HF slots within a DWord with the constant.
       */
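      /* For example (illustrative values): promoting 1.0HF (0x3C00) then
       * writes 0x3C003C00 into the destination DWord, so whichever HF half
       * of the DWord a region ends up reading still yields 1.0.
       */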
      const uint32_t width = devinfo->ver == 8 && imm->is_half_float ? 2 : 1;
      const fs_builder ibld = fs_builder(this, width).at(insert_block, n).exec_all();

      elk_fs_reg reg(VGRF, imm->nr);
      reg.offset = imm->subreg_offset;
      reg.stride = 0;

      /* Put the immediate in an offset aligned to its size. Some instructions
       * seem to have additional alignment requirements, so account for that
       * too.
       */
      assert(reg.offset == ALIGN(reg.offset, get_alignment_for_imm(imm)));

      struct elk_reg imm_reg = build_imm_reg_for_copy(imm);

      /* Ensure we have enough space in the register to copy the immediate */
      assert(reg.offset + type_sz(imm_reg.type) * width <= REG_SIZE);

      ibld.MOV(retype(reg, imm_reg.type), imm_reg);
   }
   shader_stats.promoted_constants = table.len;

   /* Rewrite the immediate sources to refer to the new GRFs. */
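   /* For example (illustrative register numbers and syntax), a use such as
    *
    *    sel.ge(8)   g10<1>F   g4<8,8,1>F   1.0F
    *
    * becomes, together with the MOV emitted above,
    *
    *    mov(1)      g120<1>F  1.0F
    *    ...
    *    sel.ge(8)   g10<1>F   g4<8,8,1>F   g120<0,1,0>F
    */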
   for (int i = 0; i < table.len; i++) {
      foreach_list_typed(reg_link, link, link, table.imm[i].uses) {
         elk_fs_reg *reg = &link->inst->src[link->src];

         if (link->inst->opcode == ELK_OPCODE_SEL) {
            if (link->type == either_type) {
               /* Do not change the register type. */
            } else if (link->type == integer_only) {
               reg->type = elk_int_type(type_sz(reg->type), true);
            } else {
               assert(link->type == float_only);

               switch (type_sz(reg->type)) {
               case 2:
                  reg->type = ELK_REGISTER_TYPE_HF;
                  break;
               case 4:
                  reg->type = ELK_REGISTER_TYPE_F;
                  break;
               case 8:
                  reg->type = ELK_REGISTER_TYPE_DF;
                  break;
               default:
                  unreachable("Bad type size");
               }
            }
         } else if ((link->inst->opcode == ELK_OPCODE_SHL ||
                     link->inst->opcode == ELK_OPCODE_ASR) &&
                    link->negate) {
            reg->type = elk_int_type(type_sz(reg->type), true);
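            /* Presumably required because the stored constant may be the
             * absolute value with the sign recovered via the negate source
             * modifier, which only makes sense on a signed integer type for
             * these shifts.
             */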
         }

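         /* The checks below compare magnitudes: a negated use keeps the
          * stored bits and resolves the sign through the negate source
          * modifier, while the unsigned cases assert that no negation is
          * present at all.
          */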
#if MESA_DEBUG
         switch (reg->type) {
         case ELK_REGISTER_TYPE_DF:
            assert((isnan(reg->df) && isnan(table.imm[i].df)) ||
                   (fabs(reg->df) == fabs(table.imm[i].df)));
            break;
         case ELK_REGISTER_TYPE_F:
            assert((isnan(reg->f) && isnan(table.imm[i].f)) ||
                   (fabsf(reg->f) == fabsf(table.imm[i].f)));
            break;
         case ELK_REGISTER_TYPE_HF:
            assert((isnan(_mesa_half_to_float(reg->d & 0xffffu)) &&
                    isnan(_mesa_half_to_float(table.imm[i].w))) ||
                   (fabsf(_mesa_half_to_float(reg->d & 0xffffu)) ==
                    fabsf(_mesa_half_to_float(table.imm[i].w))));
            break;
         case ELK_REGISTER_TYPE_Q:
            assert(abs(reg->d64) == abs(table.imm[i].d64));
            break;
         case ELK_REGISTER_TYPE_UQ:
            assert(!link->negate);
            assert(reg->d64 == table.imm[i].d64);
            break;
         case ELK_REGISTER_TYPE_D:
            assert(abs(reg->d) == abs(table.imm[i].d));
            break;
         case ELK_REGISTER_TYPE_UD:
            assert(!link->negate);
            assert(reg->d == table.imm[i].d);
            break;
         case ELK_REGISTER_TYPE_W:
            assert(abs((int16_t) (reg->d & 0xffff)) == table.imm[i].w);
            break;
         case ELK_REGISTER_TYPE_UW:
            assert(!link->negate);
            assert((reg->ud & 0xffffu) == (uint16_t) table.imm[i].w);
            break;
         default:
            break;
         }
#endif

         assert(link->inst->can_do_source_mods(devinfo) || !link->negate);

         reg->file = VGRF;
         reg->offset = table.imm[i].subreg_offset;
         reg->stride = 0;
         reg->negate = link->negate;
         reg->nr = table.imm[i].nr;
      }
   }

   /* Fixup any SEL instructions that still have src0 as an immediate.  Also
    * fix the types of any SEL instruction that has a negation on one of its
    * sources: adding the negation may have changed the type of that source,
    * so the other source (and the destination) must be changed to match.
    */
   for (unsigned i = 0; i < table.num_boxes; i++) {
      elk_fs_inst *inst = table.boxes[i].inst;

      if (inst->opcode != ELK_OPCODE_SEL)
         continue;

      /* If both sources have negation, the types had better be the same! */
      assert(!inst->src[0].negate || !inst->src[1].negate ||
             inst->src[0].type == inst->src[1].type);

      /* If either source has a negation, force the type of the other source
       * and the type of the result to be the same.
       */
      if (inst->src[0].negate) {
         inst->src[1].type = inst->src[0].type;
         inst->dst.type = inst->src[0].type;
      }

      if (inst->src[1].negate) {
         inst->src[0].type = inst->src[1].type;
         inst->dst.type = inst->src[1].type;
      }

      if (inst->src[0].file != IMM)
         continue;

      assert(inst->src[1].file != IMM);
      assert(inst->conditional_mod == ELK_CONDITIONAL_NONE ||
             inst->conditional_mod == ELK_CONDITIONAL_GE ||
             inst->conditional_mod == ELK_CONDITIONAL_L);

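      /* sel.ge and sel.l compute max and min respectively (NaN handling
       * aside), so swapping the two sources below does not require the
       * conditional mod to change.
       */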
      elk_fs_reg temp = inst->src[0];
      inst->src[0] = inst->src[1];
      inst->src[1] = temp;

      /* If this was predicated, flipping operands means we also need to flip
       * the predicate.
       */
      if (inst->conditional_mod == ELK_CONDITIONAL_NONE)
         inst->predicate_inverse = !inst->predicate_inverse;
   }

   if (debug) {
      for (int i = 0; i < table.len; i++) {
         struct imm *imm = &table.imm[i];

         fprintf(stderr,
                 "0x%016" PRIx64 " - block %3d, reg %3d sub %2d, "
                 "Uses: (%2d, %2d), IP: %4d to %4d, length %4d\n",
                 (uint64_t)(imm->d & BITFIELD64_MASK(imm->size * 8)),
                 imm->block->num,
                 imm->nr,
                 imm->subreg_offset,
                 imm->must_promote,
                 imm->uses_by_coissue,
                 imm->first_use_ip,
                 imm->last_use_ip,
                 imm->last_use_ip - imm->first_use_ip);
      }
   }

   if (rebuild_cfg) {
      /* When the CFG is initially built, the instructions are removed from
       * the list of instructions stored in elk_fs_visitor -- the same exec_node
       * is used for membership in that list and in a block list.  So we need
       * to pull them back before rebuilding the CFG.
       */
      assert(exec_list_length(&instructions) == 0);
      foreach_block(block, cfg) {
         exec_list_append(&instructions, &block->instructions);
      }

      delete cfg;
      cfg = NULL;
      calculate_cfg();
   }

   ralloc_free(const_ctx);

   invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES |
                       (rebuild_cfg ? DEPENDENCY_BLOCKS : DEPENDENCY_NOTHING));

   return true;
}