xref: /aosp_15_r20/external/mesa3d/src/compiler/glsl/lower_packing_builtins.cpp (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 /*
2  * Copyright © 2012 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21  * DEALINGS IN THE SOFTWARE.
22  */
23 
24 #include "ir.h"
25 #include "ir_builder.h"
26 #include "ir_optimization.h"
27 #include "ir_rvalue_visitor.h"
28 
/**
 * Bit flags selecting which pack/unpack built-ins the lowering pass rewrites.
 *
 * Each LOWER_PACK_* / LOWER_UNPACK_* bit enables lowering of the matching
 * GLSL built-in.  The two LOWER_PACK_USE_* bits do not name a built-in; they
 * only select whether the generated IR uses bitfield_insert (BFI) and
 * bitfield_extract (BFE) instead of shift/mask sequences.
 */
enum lower_packing_builtins_op {
   LOWER_PACK_UNPACK_NONE               = 0,

   /* packSnorm2x16 / unpackSnorm2x16 */
   LOWER_PACK_SNORM_2x16                = 1 << 0,
   LOWER_UNPACK_SNORM_2x16              = 1 << 1,

   /* packUnorm2x16 / unpackUnorm2x16 */
   LOWER_PACK_UNORM_2x16                = 1 << 2,
   LOWER_UNPACK_UNORM_2x16              = 1 << 3,

   /* packHalf2x16 / unpackHalf2x16 */
   LOWER_PACK_HALF_2x16                 = 1 << 4,
   LOWER_UNPACK_HALF_2x16               = 1 << 5,

   /* packSnorm4x8 / unpackSnorm4x8 */
   LOWER_PACK_SNORM_4x8                 = 1 << 6,
   LOWER_UNPACK_SNORM_4x8               = 1 << 7,

   /* packUnorm4x8 / unpackUnorm4x8 */
   LOWER_PACK_UNORM_4x8                 = 1 << 8,
   LOWER_UNPACK_UNORM_4x8               = 1 << 9,

   /* Code-generation modifiers rather than lowering selectors. */
   LOWER_PACK_USE_BFI                   = 1 << 10,
   LOWER_PACK_USE_BFE                   = 1 << 11,
};
50 
51 namespace {
52 
53 using namespace ir_builder;
54 
55 /**
 * A visitor that lowers built-in floating-point pack/unpack expressions
 * such as packSnorm2x16.
58  */
59 class lower_packing_builtins_visitor : public ir_rvalue_visitor {
60 public:
61    /**
62     * \param op_mask is a bitmask of `enum lower_packing_builtins_op`
63     */
lower_packing_builtins_visitor(int op_mask)64    explicit lower_packing_builtins_visitor(int op_mask)
65       : op_mask(op_mask),
66         progress(false)
67    {
68       factory.instructions = &factory_instructions;
69    }
70 
~lower_packing_builtins_visitor()71    virtual ~lower_packing_builtins_visitor()
72    {
73       assert(factory_instructions.is_empty());
74    }
75 
76    lower_packing_builtins_visitor(const lower_packing_builtins_visitor &) = delete;
77    lower_packing_builtins_visitor & operator=(const lower_packing_builtins_visitor &) = delete;
78 
get_progress()79    bool get_progress() { return progress; }
80 
handle_rvalue(ir_rvalue ** rvalue)81    void handle_rvalue(ir_rvalue **rvalue)
82    {
83       if (!*rvalue)
84 	 return;
85 
86       ir_expression *expr = (*rvalue)->as_expression();
87       if (!expr)
88 	 return;
89 
90       enum lower_packing_builtins_op lowering_op =
91          choose_lowering_op(expr->operation);
92 
93       if (lowering_op == LOWER_PACK_UNPACK_NONE)
94          return;
95 
96       setup_factory(ralloc_parent(expr));
97 
98       ir_rvalue *op0 = expr->operands[0];
99       ralloc_steal(factory.mem_ctx, op0);
100 
101       switch (lowering_op) {
102       case LOWER_PACK_SNORM_2x16:
103          *rvalue = lower_pack_snorm_2x16(op0);
104          break;
105       case LOWER_PACK_SNORM_4x8:
106          *rvalue = lower_pack_snorm_4x8(op0);
107          break;
108       case LOWER_PACK_UNORM_2x16:
109          *rvalue = lower_pack_unorm_2x16(op0);
110          break;
111       case LOWER_PACK_UNORM_4x8:
112          *rvalue = lower_pack_unorm_4x8(op0);
113          break;
114       case LOWER_PACK_HALF_2x16:
115          *rvalue = lower_pack_half_2x16(op0);
116          break;
117       case LOWER_UNPACK_SNORM_2x16:
118          *rvalue = lower_unpack_snorm_2x16(op0);
119          break;
120       case LOWER_UNPACK_SNORM_4x8:
121          *rvalue = lower_unpack_snorm_4x8(op0);
122          break;
123       case LOWER_UNPACK_UNORM_2x16:
124          *rvalue = lower_unpack_unorm_2x16(op0);
125          break;
126       case LOWER_UNPACK_UNORM_4x8:
127          *rvalue = lower_unpack_unorm_4x8(op0);
128          break;
129       case LOWER_UNPACK_HALF_2x16:
130          *rvalue = lower_unpack_half_2x16(op0);
131          break;
132       case LOWER_PACK_UNPACK_NONE:
133       case LOWER_PACK_USE_BFI:
134       case LOWER_PACK_USE_BFE:
135          assert(!"not reached");
136          break;
137       }
138 
139       teardown_factory();
140       progress = true;
141    }
142 
143 private:
   /* Bitmask of `enum lower_packing_builtins_op` flags given to the
    * constructor; fixed for the lifetime of the visitor.
    */
   const int op_mask;
   /* Becomes true once handle_rvalue() has replaced an expression. */
   bool progress;
   /* IR builder; its `instructions` pointer is aimed at
    * factory_instructions by the constructor.
    */
   ir_factory factory;
   /* Staging list for instructions emitted while lowering one expression. */
   exec_list factory_instructions;
148 
149    /**
150     * Determine the needed lowering operation by filtering \a expr_op
151     * through \ref op_mask.
152     */
153    enum lower_packing_builtins_op
choose_lowering_op(ir_expression_operation expr_op)154    choose_lowering_op(ir_expression_operation expr_op)
155    {
156       /* C++ regards int and enum as fundamentally different types.
157        * So, we can't simply return from each case; we must cast the return
158        * value.
159        */
160       int result;
161 
162       switch (expr_op) {
163       case ir_unop_pack_snorm_2x16:
164          result = op_mask & LOWER_PACK_SNORM_2x16;
165          break;
166       case ir_unop_pack_snorm_4x8:
167          result = op_mask & LOWER_PACK_SNORM_4x8;
168          break;
169       case ir_unop_pack_unorm_2x16:
170          result = op_mask & LOWER_PACK_UNORM_2x16;
171          break;
172       case ir_unop_pack_unorm_4x8:
173          result = op_mask & LOWER_PACK_UNORM_4x8;
174          break;
175       case ir_unop_pack_half_2x16:
176          result = op_mask & LOWER_PACK_HALF_2x16;
177          break;
178       case ir_unop_unpack_snorm_2x16:
179          result = op_mask & LOWER_UNPACK_SNORM_2x16;
180          break;
181       case ir_unop_unpack_snorm_4x8:
182          result = op_mask & LOWER_UNPACK_SNORM_4x8;
183          break;
184       case ir_unop_unpack_unorm_2x16:
185          result = op_mask & LOWER_UNPACK_UNORM_2x16;
186          break;
187       case ir_unop_unpack_unorm_4x8:
188          result = op_mask & LOWER_UNPACK_UNORM_4x8;
189          break;
190       case ir_unop_unpack_half_2x16:
191          result = op_mask & LOWER_UNPACK_HALF_2x16;
192          break;
193       default:
194          result = LOWER_PACK_UNPACK_NONE;
195          break;
196       }
197 
198       return static_cast<enum lower_packing_builtins_op>(result);
199    }
200 
201    void
setup_factory(void * mem_ctx)202    setup_factory(void *mem_ctx)
203    {
204       assert(factory.mem_ctx == NULL);
205       assert(factory.instructions->is_empty());
206 
207       factory.mem_ctx = mem_ctx;
208    }
209 
210    void
teardown_factory()211    teardown_factory()
212    {
213       base_ir->insert_before(factory.instructions);
214       assert(factory.instructions->is_empty());
215       factory.mem_ctx = NULL;
216    }
217 
218    template <typename T>
219    ir_constant*
constant(T x)220    constant(T x)
221    {
222       return factory.constant(x);
223    }
224 
225    /**
226     * \brief Pack two uint16's into a single uint32.
227     *
228     * Interpret the given uvec2 as a uint16 pair. Pack the pair into a uint32
229     * where the least significant bits specify the first element of the pair.
230     * Return the uint32.
231     */
232    ir_rvalue*
pack_uvec2_to_uint(ir_rvalue * uvec2_rval)233    pack_uvec2_to_uint(ir_rvalue *uvec2_rval)
234    {
235       assert(uvec2_rval->type == &glsl_type_builtin_uvec2);
236 
237       /* uvec2 u = UVEC2_RVAL; */
238       ir_variable *u = factory.make_temp(&glsl_type_builtin_uvec2,
239                                          "tmp_pack_uvec2_to_uint");
240       factory.emit(assign(u, uvec2_rval));
241 
242       if (op_mask & LOWER_PACK_USE_BFI) {
243          return bitfield_insert(bit_and(swizzle_x(u), constant(0xffffu)),
244                                 swizzle_y(u),
245                                 constant(16u),
246                                 constant(16u));
247       }
248 
249       /* return (u.y << 16) | (u.x & 0xffff); */
250       return bit_or(lshift(swizzle_y(u), constant(16u)),
251                     bit_and(swizzle_x(u), constant(0xffffu)));
252    }
253 
254    /**
255     * \brief Pack four uint8's into a single uint32.
256     *
    * Interpret the given uvec4 as a uint8 4-tuple. Pack the 4-tuple into a
258     * uint32 where the least significant bits specify the first element of the
259     * 4-tuple. Return the uint32.
260     */
261    ir_rvalue*
pack_uvec4_to_uint(ir_rvalue * uvec4_rval)262    pack_uvec4_to_uint(ir_rvalue *uvec4_rval)
263    {
264       assert(uvec4_rval->type == &glsl_type_builtin_uvec4);
265 
266       ir_variable *u = factory.make_temp(&glsl_type_builtin_uvec4,
267                                          "tmp_pack_uvec4_to_uint");
268 
269       if (op_mask & LOWER_PACK_USE_BFI) {
270          /* uvec4 u = UVEC4_RVAL; */
271          factory.emit(assign(u, uvec4_rval));
272 
273          return bitfield_insert(bitfield_insert(
274                                    bitfield_insert(
275                                       bit_and(swizzle_x(u), constant(0xffu)),
276                                       swizzle_y(u), constant(8u), constant(8u)),
277                                    swizzle_z(u), constant(16u), constant(8u)),
278                                 swizzle_w(u), constant(24u), constant(8u));
279       }
280 
281       /* uvec4 u = UVEC4_RVAL & 0xff */
282       factory.emit(assign(u, bit_and(uvec4_rval, constant(0xffu))));
283 
284       /* return (u.w << 24) | (u.z << 16) | (u.y << 8) | u.x; */
285       return bit_or(bit_or(lshift(swizzle_w(u), constant(24u)),
286                            lshift(swizzle_z(u), constant(16u))),
287                     bit_or(lshift(swizzle_y(u), constant(8u)),
288                            swizzle_x(u)));
289    }
290 
291    /**
292     * \brief Unpack a uint32 into two uint16's.
293     *
294     * Interpret the given uint32 as a uint16 pair where the uint32's least
295     * significant bits specify the pair's first element. Return the uint16
296     * pair as a uvec2.
297     */
298    ir_rvalue*
unpack_uint_to_uvec2(ir_rvalue * uint_rval)299    unpack_uint_to_uvec2(ir_rvalue *uint_rval)
300    {
301       assert(uint_rval->type == &glsl_type_builtin_uint);
302 
303       /* uint u = UINT_RVAL; */
304       ir_variable *u = factory.make_temp(&glsl_type_builtin_uint,
305                                           "tmp_unpack_uint_to_uvec2_u");
306       factory.emit(assign(u, uint_rval));
307 
308       /* uvec2 u2; */
309       ir_variable *u2 = factory.make_temp(&glsl_type_builtin_uvec2,
310                                            "tmp_unpack_uint_to_uvec2_u2");
311 
312       /* u2.x = u & 0xffffu; */
313       factory.emit(assign(u2, bit_and(u, constant(0xffffu)), WRITEMASK_X));
314 
315       /* u2.y = u >> 16u; */
316       factory.emit(assign(u2, rshift(u, constant(16u)), WRITEMASK_Y));
317 
318       return deref(u2).val;
319    }
320 
321    /**
322     * \brief Unpack a uint32 into two int16's.
323     *
324     * Specifically each 16-bit value is sign-extended to the full width of an
325     * int32 on return.
326     */
327    ir_rvalue *
unpack_uint_to_ivec2(ir_rvalue * uint_rval)328    unpack_uint_to_ivec2(ir_rvalue *uint_rval)
329    {
330       assert(uint_rval->type == &glsl_type_builtin_uint);
331 
332       if (!(op_mask & LOWER_PACK_USE_BFE)) {
333          return rshift(lshift(u2i(unpack_uint_to_uvec2(uint_rval)),
334                               constant(16u)),
335                        constant(16u));
336       }
337 
338       ir_variable *i = factory.make_temp(&glsl_type_builtin_int,
339                                          "tmp_unpack_uint_to_ivec2_i");
340       factory.emit(assign(i, u2i(uint_rval)));
341 
342       /* ivec2 i2; */
343       ir_variable *i2 = factory.make_temp(&glsl_type_builtin_ivec2,
344                                           "tmp_unpack_uint_to_ivec2_i2");
345 
346       factory.emit(assign(i2, bitfield_extract(i, constant(0), constant(16)),
347                           WRITEMASK_X));
348       factory.emit(assign(i2, bitfield_extract(i, constant(16), constant(16)),
349                           WRITEMASK_Y));
350 
351       return deref(i2).val;
352    }
353 
354    /**
355     * \brief Unpack a uint32 into four uint8's.
356     *
357     * Interpret the given uint32 as a uint8 4-tuple where the uint32's least
358     * significant bits specify the 4-tuple's first element. Return the uint8
359     * 4-tuple as a uvec4.
360     */
361    ir_rvalue*
unpack_uint_to_uvec4(ir_rvalue * uint_rval)362    unpack_uint_to_uvec4(ir_rvalue *uint_rval)
363    {
364       assert(uint_rval->type == &glsl_type_builtin_uint);
365 
366       /* uint u = UINT_RVAL; */
367       ir_variable *u = factory.make_temp(&glsl_type_builtin_uint,
368                                           "tmp_unpack_uint_to_uvec4_u");
369       factory.emit(assign(u, uint_rval));
370 
371       /* uvec4 u4; */
372       ir_variable *u4 = factory.make_temp(&glsl_type_builtin_uvec4,
373                                            "tmp_unpack_uint_to_uvec4_u4");
374 
375       /* u4.x = u & 0xffu; */
376       factory.emit(assign(u4, bit_and(u, constant(0xffu)), WRITEMASK_X));
377 
378       if (op_mask & LOWER_PACK_USE_BFE) {
379          /* u4.y = bitfield_extract(u, 8, 8); */
380          factory.emit(assign(u4, bitfield_extract(u, constant(8u), constant(8u)),
381                              WRITEMASK_Y));
382 
383          /* u4.z = bitfield_extract(u, 16, 8); */
384          factory.emit(assign(u4, bitfield_extract(u, constant(16u), constant(8u)),
385                              WRITEMASK_Z));
386       } else {
387          /* u4.y = (u >> 8u) & 0xffu; */
388          factory.emit(assign(u4, bit_and(rshift(u, constant(8u)),
389                                          constant(0xffu)), WRITEMASK_Y));
390 
391          /* u4.z = (u >> 16u) & 0xffu; */
392          factory.emit(assign(u4, bit_and(rshift(u, constant(16u)),
393                                          constant(0xffu)), WRITEMASK_Z));
394       }
395 
396       /* u4.w = (u >> 24u) */
397       factory.emit(assign(u4, rshift(u, constant(24u)), WRITEMASK_W));
398 
399       return deref(u4).val;
400    }
401 
402    /**
403     * \brief Unpack a uint32 into four int8's.
404     *
405     * Specifically each 8-bit value is sign-extended to the full width of an
406     * int32 on return.
407     */
408    ir_rvalue *
unpack_uint_to_ivec4(ir_rvalue * uint_rval)409    unpack_uint_to_ivec4(ir_rvalue *uint_rval)
410    {
411       assert(uint_rval->type == &glsl_type_builtin_uint);
412 
413       if (!(op_mask & LOWER_PACK_USE_BFE)) {
414          return rshift(lshift(u2i(unpack_uint_to_uvec4(uint_rval)),
415                               constant(24u)),
416                        constant(24u));
417       }
418 
419       ir_variable *i = factory.make_temp(&glsl_type_builtin_int,
420                                          "tmp_unpack_uint_to_ivec4_i");
421       factory.emit(assign(i, u2i(uint_rval)));
422 
423       /* ivec4 i4; */
424       ir_variable *i4 = factory.make_temp(&glsl_type_builtin_ivec4,
425                                           "tmp_unpack_uint_to_ivec4_i4");
426 
427       factory.emit(assign(i4, bitfield_extract(i, constant(0), constant(8)),
428                           WRITEMASK_X));
429       factory.emit(assign(i4, bitfield_extract(i, constant(8), constant(8)),
430                           WRITEMASK_Y));
431       factory.emit(assign(i4, bitfield_extract(i, constant(16), constant(8)),
432                           WRITEMASK_Z));
433       factory.emit(assign(i4, bitfield_extract(i, constant(24), constant(8)),
434                           WRITEMASK_W));
435 
436       return deref(i4).val;
437    }
438 
439    /**
440     * \brief Lower a packSnorm2x16 expression.
441     *
442     * \param vec2_rval is packSnorm2x16's input
443     * \return packSnorm2x16's output as a uint rvalue
444     */
445    ir_rvalue*
lower_pack_snorm_2x16(ir_rvalue * vec2_rval)446    lower_pack_snorm_2x16(ir_rvalue *vec2_rval)
447    {
448       /* From page 88 (94 of pdf) of the GLSL ES 3.00 spec:
449        *
450        *    highp uint packSnorm2x16(vec2 v)
451        *    --------------------------------
452        *    First, converts each component of the normalized floating-point value
453        *    v into 16-bit integer values. Then, the results are packed into the
454        *    returned 32-bit unsigned integer.
455        *
456        *    The conversion for component c of v to fixed point is done as
457        *    follows:
458        *
459        *       packSnorm2x16: round(clamp(c, -1, +1) * 32767.0)
460        *
461        *    The first component of the vector will be written to the least
462        *    significant bits of the output; the last component will be written to
463        *    the most significant bits.
464        *
465        * This function generates IR that approximates the following pseudo-GLSL:
466        *
467        *     return pack_uvec2_to_uint(
468        *         uvec2(ivec2(
469        *           round(clamp(VEC2_RVALUE, -1.0f, 1.0f) * 32767.0f))));
470        *
471        * It is necessary to first convert the vec2 to ivec2 rather than directly
472        * converting vec2 to uvec2 because the latter conversion is undefined.
473        * From page 56 (62 of pdf) of the GLSL ES 3.00 spec: "It is undefined to
474        * convert a negative floating point value to an uint".
475        */
476       assert(vec2_rval->type == &glsl_type_builtin_vec2);
477 
478       ir_rvalue *result = pack_uvec2_to_uint(
479             i2u(f2i(round_even(mul(clamp(vec2_rval,
480                                          constant(-1.0f),
481                                          constant(1.0f)),
482                                    constant(32767.0f))))));
483 
484       assert(result->type == &glsl_type_builtin_uint);
485       return result;
486    }
487 
488    /**
489     * \brief Lower a packSnorm4x8 expression.
490     *
491     * \param vec4_rval is packSnorm4x8's input
492     * \return packSnorm4x8's output as a uint rvalue
493     */
494    ir_rvalue*
lower_pack_snorm_4x8(ir_rvalue * vec4_rval)495    lower_pack_snorm_4x8(ir_rvalue *vec4_rval)
496    {
497       /* From page 137 (143 of pdf) of the GLSL 4.30 spec:
498        *
499        *    highp uint packSnorm4x8(vec4 v)
500        *    -------------------------------
501        *    First, converts each component of the normalized floating-point value
502        *    v into 8-bit integer values. Then, the results are packed into the
503        *    returned 32-bit unsigned integer.
504        *
505        *    The conversion for component c of v to fixed point is done as
506        *    follows:
507        *
508        *       packSnorm4x8: round(clamp(c, -1, +1) * 127.0)
509        *
510        *    The first component of the vector will be written to the least
511        *    significant bits of the output; the last component will be written to
512        *    the most significant bits.
513        *
514        * This function generates IR that approximates the following pseudo-GLSL:
515        *
516        *     return pack_uvec4_to_uint(
517        *         uvec4(ivec4(
518        *           round(clamp(VEC4_RVALUE, -1.0f, 1.0f) * 127.0f))));
519        *
520        * It is necessary to first convert the vec4 to ivec4 rather than directly
521        * converting vec4 to uvec4 because the latter conversion is undefined.
522        * From page 87 (93 of pdf) of the GLSL 4.30 spec: "It is undefined to
523        * convert a negative floating point value to an uint".
524        */
525       assert(vec4_rval->type == &glsl_type_builtin_vec4);
526 
527       ir_rvalue *result = pack_uvec4_to_uint(
528             i2u(f2i(round_even(mul(clamp(vec4_rval,
529                                          constant(-1.0f),
530                                          constant(1.0f)),
531                                    constant(127.0f))))));
532 
533       assert(result->type == &glsl_type_builtin_uint);
534       return result;
535    }
536 
537    /**
538     * \brief Lower an unpackSnorm2x16 expression.
539     *
540     * \param uint_rval is unpackSnorm2x16's input
541     * \return unpackSnorm2x16's output as a vec2 rvalue
542     */
543    ir_rvalue*
lower_unpack_snorm_2x16(ir_rvalue * uint_rval)544    lower_unpack_snorm_2x16(ir_rvalue *uint_rval)
545    {
546       /* From page 88 (94 of pdf) of the GLSL ES 3.00 spec:
547        *
548        *    highp vec2 unpackSnorm2x16 (highp uint p)
549        *    -----------------------------------------
550        *    First, unpacks a single 32-bit unsigned integer p into a pair of
551        *    16-bit unsigned integers. Then, each component is converted to
552        *    a normalized floating-point value to generate the returned
553        *    two-component vector.
554        *
555        *    The conversion for unpacked fixed-point value f to floating point is
556        *    done as follows:
557        *
558        *       unpackSnorm2x16: clamp(f / 32767.0, -1,+1)
559        *
560        *    The first component of the returned vector will be extracted from the
561        *    least significant bits of the input; the last component will be
562        *    extracted from the most significant bits.
563        *
564        * This function generates IR that approximates the following pseudo-GLSL:
565        *
566        *    return clamp(
567        *       ((ivec2(unpack_uint_to_uvec2(UINT_RVALUE)) << 16) >> 16) / 32767.0f,
568        *       -1.0f, 1.0f);
569        *
570        * The above IR may appear unnecessarily complex, but the intermediate
571        * conversion to ivec2 and the bit shifts are necessary to correctly unpack
572        * negative floats.
573        *
574        * To see why, consider packing and then unpacking vec2(-1.0, 0.0).
575        * packSnorm2x16 encodes -1.0 as the int16 0xffff. During unpacking, we
576        * place that int16 into an int32, which results in the *positive* integer
577        * 0x0000ffff.  The int16's sign bit becomes, in the int32, the rather
578        * unimportant bit 16. We must now extend the int16's sign bit into bits
579        * 17-32, which is accomplished by left-shifting then right-shifting.
580        */
581 
582       assert(uint_rval->type == &glsl_type_builtin_uint);
583 
584       ir_rvalue *result =
585         clamp(div(i2f(unpack_uint_to_ivec2(uint_rval)),
586                   constant(32767.0f)),
587               constant(-1.0f),
588               constant(1.0f));
589 
590       assert(result->type == &glsl_type_builtin_vec2);
591       return result;
592    }
593 
594    /**
595     * \brief Lower an unpackSnorm4x8 expression.
596     *
597     * \param uint_rval is unpackSnorm4x8's input
598     * \return unpackSnorm4x8's output as a vec4 rvalue
599     */
600    ir_rvalue*
lower_unpack_snorm_4x8(ir_rvalue * uint_rval)601    lower_unpack_snorm_4x8(ir_rvalue *uint_rval)
602    {
603       /* From page 137 (143 of pdf) of the GLSL 4.30 spec:
604        *
605        *    highp vec4 unpackSnorm4x8 (highp uint p)
606        *    ----------------------------------------
607        *    First, unpacks a single 32-bit unsigned integer p into four
608        *    8-bit unsigned integers. Then, each component is converted to
609        *    a normalized floating-point value to generate the returned
610        *    four-component vector.
611        *
612        *    The conversion for unpacked fixed-point value f to floating point is
613        *    done as follows:
614        *
615        *       unpackSnorm4x8: clamp(f / 127.0, -1, +1)
616        *
617        *    The first component of the returned vector will be extracted from the
618        *    least significant bits of the input; the last component will be
619        *    extracted from the most significant bits.
620        *
621        * This function generates IR that approximates the following pseudo-GLSL:
622        *
623        *    return clamp(
624        *       ((ivec4(unpack_uint_to_uvec4(UINT_RVALUE)) << 24) >> 24) / 127.0f,
625        *       -1.0f, 1.0f);
626        *
627        * The above IR may appear unnecessarily complex, but the intermediate
628        * conversion to ivec4 and the bit shifts are necessary to correctly unpack
629        * negative floats.
630        *
631        * To see why, consider packing and then unpacking vec4(-1.0, 0.0, 0.0,
632        * 0.0). packSnorm4x8 encodes -1.0 as the int8 0xff. During unpacking, we
633        * place that int8 into an int32, which results in the *positive* integer
634        * 0x000000ff.  The int8's sign bit becomes, in the int32, the rather
635        * unimportant bit 8. We must now extend the int8's sign bit into bits
636        * 9-32, which is accomplished by left-shifting then right-shifting.
637        */
638 
639       assert(uint_rval->type == &glsl_type_builtin_uint);
640 
641       ir_rvalue *result =
642         clamp(div(i2f(unpack_uint_to_ivec4(uint_rval)),
643                   constant(127.0f)),
644               constant(-1.0f),
645               constant(1.0f));
646 
647       assert(result->type == &glsl_type_builtin_vec4);
648       return result;
649    }
650 
651    /**
652     * \brief Lower a packUnorm2x16 expression.
653     *
654     * \param vec2_rval is packUnorm2x16's input
655     * \return packUnorm2x16's output as a uint rvalue
656     */
657    ir_rvalue*
lower_pack_unorm_2x16(ir_rvalue * vec2_rval)658    lower_pack_unorm_2x16(ir_rvalue *vec2_rval)
659    {
660       /* From page 88 (94 of pdf) of the GLSL ES 3.00 spec:
661        *
662        *    highp uint packUnorm2x16 (vec2 v)
663        *    ---------------------------------
664        *    First, converts each component of the normalized floating-point value
665        *    v into 16-bit integer values. Then, the results are packed into the
666        *    returned 32-bit unsigned integer.
667        *
668        *    The conversion for component c of v to fixed point is done as
669        *    follows:
670        *
671        *       packUnorm2x16: round(clamp(c, 0, +1) * 65535.0)
672        *
673        *    The first component of the vector will be written to the least
674        *    significant bits of the output; the last component will be written to
675        *    the most significant bits.
676        *
677        * This function generates IR that approximates the following pseudo-GLSL:
678        *
679        *     return pack_uvec2_to_uint(uvec2(
680        *                round(clamp(VEC2_RVALUE, 0.0f, 1.0f) * 65535.0f)));
681        *
682        * Here it is safe to directly convert the vec2 to uvec2 because the vec2
683        * has been clamped to a non-negative range.
684        */
685 
686       assert(vec2_rval->type == &glsl_type_builtin_vec2);
687 
688       ir_rvalue *result = pack_uvec2_to_uint(
689          f2u(round_even(mul(saturate(vec2_rval), constant(65535.0f)))));
690 
691       assert(result->type == &glsl_type_builtin_uint);
692       return result;
693    }
694 
695    /**
696     * \brief Lower a packUnorm4x8 expression.
697     *
698     * \param vec4_rval is packUnorm4x8's input
699     * \return packUnorm4x8's output as a uint rvalue
700     */
701    ir_rvalue*
lower_pack_unorm_4x8(ir_rvalue * vec4_rval)702    lower_pack_unorm_4x8(ir_rvalue *vec4_rval)
703    {
704       /* From page 137 (143 of pdf) of the GLSL 4.30 spec:
705        *
706        *    highp uint packUnorm4x8 (vec4 v)
707        *    --------------------------------
708        *    First, converts each component of the normalized floating-point value
709        *    v into 8-bit integer values. Then, the results are packed into the
710        *    returned 32-bit unsigned integer.
711        *
712        *    The conversion for component c of v to fixed point is done as
713        *    follows:
714        *
715        *       packUnorm4x8: round(clamp(c, 0, +1) * 255.0)
716        *
717        *    The first component of the vector will be written to the least
718        *    significant bits of the output; the last component will be written to
719        *    the most significant bits.
720        *
721        * This function generates IR that approximates the following pseudo-GLSL:
722        *
723        *     return pack_uvec4_to_uint(uvec4(
724        *                round(clamp(VEC2_RVALUE, 0.0f, 1.0f) * 255.0f)));
725        *
726        * Here it is safe to directly convert the vec4 to uvec4 because the vec4
727        * has been clamped to a non-negative range.
728        */
729 
730       assert(vec4_rval->type == &glsl_type_builtin_vec4);
731 
732       ir_rvalue *result = pack_uvec4_to_uint(
733          f2u(round_even(mul(saturate(vec4_rval), constant(255.0f)))));
734 
735       assert(result->type == &glsl_type_builtin_uint);
736       return result;
737    }
738 
739    /**
740     * \brief Lower an unpackUnorm2x16 expression.
741     *
742     * \param uint_rval is unpackUnorm2x16's input
743     * \return unpackUnorm2x16's output as a vec2 rvalue
744     */
745    ir_rvalue*
lower_unpack_unorm_2x16(ir_rvalue * uint_rval)746    lower_unpack_unorm_2x16(ir_rvalue *uint_rval)
747    {
748       /* From page 89 (95 of pdf) of the GLSL ES 3.00 spec:
749        *
750        *    highp vec2 unpackUnorm2x16 (highp uint p)
751        *    -----------------------------------------
752        *    First, unpacks a single 32-bit unsigned integer p into a pair of
753        *    16-bit unsigned integers. Then, each component is converted to
754        *    a normalized floating-point value to generate the returned
755        *    two-component vector.
756        *
757        *    The conversion for unpacked fixed-point value f to floating point is
758        *    done as follows:
759        *
760        *       unpackUnorm2x16: f / 65535.0
761        *
762        *    The first component of the returned vector will be extracted from the
763        *    least significant bits of the input; the last component will be
764        *    extracted from the most significant bits.
765        *
766        * This function generates IR that approximates the following pseudo-GLSL:
767        *
768        *     return vec2(unpack_uint_to_uvec2(UINT_RVALUE)) / 65535.0;
769        */
770 
771       assert(uint_rval->type == &glsl_type_builtin_uint);
772 
773       ir_rvalue *result = div(u2f(unpack_uint_to_uvec2(uint_rval)),
774                               constant(65535.0f));
775 
776       assert(result->type == &glsl_type_builtin_vec2);
777       return result;
778    }
779 
780    /**
781     * \brief Lower an unpackUnorm4x8 expression.
782     *
783     * \param uint_rval is unpackUnorm4x8's input
784     * \return unpackUnorm4x8's output as a vec4 rvalue
785     */
786    ir_rvalue*
lower_unpack_unorm_4x8(ir_rvalue * uint_rval)787    lower_unpack_unorm_4x8(ir_rvalue *uint_rval)
788    {
789       /* From page 137 (143 of pdf) of the GLSL 4.30 spec:
790        *
791        *    highp vec4 unpackUnorm4x8 (highp uint p)
792        *    ----------------------------------------
793        *    First, unpacks a single 32-bit unsigned integer p into four
794        *    8-bit unsigned integers. Then, each component is converted to
795        *    a normalized floating-point value to generate the returned
796        *    two-component vector.
797        *
798        *    The conversion for unpacked fixed-point value f to floating point is
799        *    done as follows:
800        *
801        *       unpackUnorm4x8: f / 255.0
802        *
803        *    The first component of the returned vector will be extracted from the
804        *    least significant bits of the input; the last component will be
805        *    extracted from the most significant bits.
806        *
807        * This function generates IR that approximates the following pseudo-GLSL:
808        *
809        *     return vec4(unpack_uint_to_uvec4(UINT_RVALUE)) / 255.0;
810        */
811 
812       assert(uint_rval->type == &glsl_type_builtin_uint);
813 
814       ir_rvalue *result = div(u2f(unpack_uint_to_uvec4(uint_rval)),
815                               constant(255.0f));
816 
817       assert(result->type == &glsl_type_builtin_vec4);
818       return result;
819    }
820 
821    /**
822     * \brief Lower the component-wise calculation of packHalf2x16.
823     *
824     * \param f_rval is one component of packHafl2x16's input
825     * \param e_rval is the unshifted exponent bits of f_rval
826     * \param m_rval is the unshifted mantissa bits of f_rval
827     *
828     * \return a uint rvalue that encodes a float16 in its lower 16 bits
829     */
830    ir_rvalue*
pack_half_1x16_nosign(ir_rvalue * f_rval,ir_rvalue * e_rval,ir_rvalue * m_rval)831    pack_half_1x16_nosign(ir_rvalue *f_rval,
832                          ir_rvalue *e_rval,
833                          ir_rvalue *m_rval)
834    {
835       assert(e_rval->type == &glsl_type_builtin_uint);
836       assert(m_rval->type == &glsl_type_builtin_uint);
837 
838       /* uint u16; */
839       ir_variable *u16 = factory.make_temp(&glsl_type_builtin_uint,
840                                            "tmp_pack_half_1x16_u16");
841 
842       /* float f = FLOAT_RVAL; */
843       ir_variable *f = factory.make_temp(&glsl_type_builtin_float,
844                                           "tmp_pack_half_1x16_f");
845       factory.emit(assign(f, f_rval));
846 
847       /* uint e = E_RVAL; */
848       ir_variable *e = factory.make_temp(&glsl_type_builtin_uint,
849                                           "tmp_pack_half_1x16_e");
850       factory.emit(assign(e, e_rval));
851 
852       /* uint m = M_RVAL; */
853       ir_variable *m = factory.make_temp(&glsl_type_builtin_uint,
854                                           "tmp_pack_half_1x16_m");
855       factory.emit(assign(m, m_rval));
856 
857       /* Preliminaries
858        * -------------
859        *
860        * For a float16, the bit layout is:
861        *
862        *   sign:     15
863        *   exponent: 10:14
864        *   mantissa: 0:9
865        *
866        * Let f16 be a float16 value. The sign, exponent, and mantissa
867        * determine its value thus:
868        *
869        *   if e16 = 0 and m16 = 0, then zero:       (-1)^s16 * 0                               (1)
870        *   if e16 = 0 and m16!= 0, then subnormal:  (-1)^s16 * 2^(e16 - 14) * (m16 / 2^10)     (2)
871        *   if 0 < e16 < 31, then normal:            (-1)^s16 * 2^(e16 - 15) * (1 + m16 / 2^10) (3)
872        *   if e16 = 31 and m16 = 0, then infinite:  (-1)^s16 * inf                             (4)
873        *   if e16 = 31 and m16 != 0, then           NaN                                        (5)
874        *
875        * where 0 <= m16 < 2^10.
876        *
877        * For a float32, the bit layout is:
878        *
879        *   sign:     31
880        *   exponent: 23:30
881        *   mantissa: 0:22
882        *
883        * Let f32 be a float32 value. The sign, exponent, and mantissa
884        * determine its value thus:
885        *
886        *   if e32 = 0 and m32 = 0, then zero:        (-1)^s * 0                                (10)
887        *   if e32 = 0 and m32 != 0, then subnormal:  (-1)^s * 2^(e32 - 126) * (m32 / 2^23)     (11)
888        *   if 0 < e32 < 255, then normal:            (-1)^s * 2^(e32 - 127) * (1 + m32 / 2^23) (12)
889        *   if e32 = 255 and m32 = 0, then infinite:  (-1)^s * inf                              (13)
890        *   if e32 = 255 and m32 != 0, then           NaN                                       (14)
891        *
892        * where 0 <= m32 < 2^23.
893        *
894        * The minimum and maximum normal float16 values are
895        *
896        *   min_norm16 = 2^(1 - 15) * (1 + 0 / 2^10) = 2^(-14)   (20)
897        *   max_norm16 = 2^(30 - 15) * (1 + 1023 / 2^10)         (21)
898        *
899        * The step at max_norm16 is
900        *
901        *   max_step16 = 2^5                                     (22)
902        *
903        * Observe that the float16 boundary values in equations 20-21 lie in the
904        * range of normal float32 values.
905        *
906        *
907        * Rounding Behavior
908        * -----------------
909        * Not all float32 values can be exactly represented as a float16. We
910        * round all such intermediate float32 values to the nearest float16; if
911        * the float32 is exactly between to float16 values, we round to the one
912        * with an even mantissa. This rounding behavior has several benefits:
913        *
914        *   - It has no sign bias.
915        *
916        *   - It reproduces the behavior of real hardware: opcode F32TO16 in Intel's
917        *     GPU ISA.
918        *
919        *   - By reproducing the behavior of the GPU (at least on Intel hardware),
920        *     compile-time evaluation of constant packHalf2x16 GLSL expressions will
921        *     result in the same value as if the expression were executed on the
922        *     GPU.
923        *
924        * Calculation
925        * -----------
926        * Our task is to compute s16, e16, m16 given f32.  Since this function
927        * ignores the sign bit, assume that s32 = s16 = 0.  There are several
928        * cases consider.
929        */
930 
931       factory.emit(
932 
933          /* Case 1) f32 is NaN
934           *
935           *   The resultant f16 will also be NaN.
936           */
937 
938          /* if (e32 == 255 && m32 != 0) { */
939          if_tree(logic_and(equal(e, constant(0xffu << 23u)),
940                            logic_not(equal(m, constant(0u)))),
941 
942             assign(u16, constant(0x7fffu)),
943 
944          /* Case 2) f32 lies in the range [0, min_norm16).
945           *
946           *   The resultant float16 will be either zero, subnormal, or normal.
947           *
948           *   Solving
949           *
950           *     f32 = min_norm16       (30)
951           *
952           *   gives
953           *
954           *     e32 = 113 and m32 = 0  (31)
955           *
956           *   Therefore this case occurs if and only if
957           *
958           *     e32 < 113              (32)
959           */
960 
961          /* } else if (e32 < 113) { */
962          if_tree(less(e, constant(113u << 23u)),
963 
964             /* u16 = uint(round_to_even(abs(f32) * float(1u << 24u))); */
965             assign(u16, f2u(round_even(mul(expr(ir_unop_abs, f),
966                                            constant((float) (1 << 24)))))),
967 
968          /* Case 3) f32 lies in the range
969           *         [min_norm16, max_norm16 + max_step16).
970           *
971           *   The resultant float16 will be either normal or infinite.
972           *
973           *   Solving
974           *
975           *     f32 = max_norm16 + max_step16           (40)
976           *         = 2^15 * (1 + 1023 / 2^10) + 2^5    (41)
977           *         = 2^16                              (42)
978           *   gives
979           *
980           *     e32 = 143 and m32 = 0                   (43)
981           *
982           *   We already solved the boundary condition f32 = min_norm16 above
983           *   in equation 31. Therefore this case occurs if and only if
984           *
985           *     113 <= e32 and e32 < 143
986           */
987 
988          /* } else if (e32 < 143) { */
989          if_tree(less(e, constant(143u << 23u)),
990 
991             /* The addition below handles the case where the mantissa rounds
992              * up to 1024 and bumps the exponent.
993              *
994              * u16 = ((e - (112u << 23u)) >> 13u)
995              *     + round_to_even((float(m) / (1u << 13u));
996              */
997             assign(u16, add(rshift(sub(e, constant(112u << 23u)),
998                                    constant(13u)),
999                             f2u(round_even(
1000                                   div(u2f(m), constant((float) (1 << 13))))))),
1001 
1002          /* Case 4) f32 lies in the range [max_norm16 + max_step16, inf].
1003           *
1004           *   The resultant float16 will be infinite.
1005           *
1006           *   The cases above caught all float32 values in the range
1007           *   [0, max_norm16 + max_step16), so this is the fall-through case.
1008           */
1009 
1010          /* } else { */
1011 
1012             assign(u16, constant(31u << 10u))))));
1013 
1014          /* } */
1015 
1016        return deref(u16).val;
1017    }
1018 
1019    /**
1020     * \brief Lower a packHalf2x16 expression.
1021     *
1022     * \param vec2_rval is packHalf2x16's input
1023     * \return packHalf2x16's output as a uint rvalue
1024     */
1025    ir_rvalue*
lower_pack_half_2x16(ir_rvalue * vec2_rval)1026    lower_pack_half_2x16(ir_rvalue *vec2_rval)
1027    {
1028       /* From page 89 (95 of pdf) of the GLSL ES 3.00 spec:
1029        *
1030        *    highp uint packHalf2x16 (mediump vec2 v)
1031        *    ----------------------------------------
1032        *    Returns an unsigned integer obtained by converting the components of
1033        *    a two-component floating-point vector to the 16-bit floating-point
1034        *    representation found in the OpenGL ES Specification, and then packing
1035        *    these two 16-bit integers into a 32-bit unsigned integer.
1036        *
1037        *    The first vector component specifies the 16 least- significant bits
1038        *    of the result; the second component specifies the 16 most-significant
1039        *    bits.
1040        */
1041 
1042       assert(vec2_rval->type == &glsl_type_builtin_vec2);
1043 
1044       /* vec2 f = VEC2_RVAL; */
1045       ir_variable *f = factory.make_temp(&glsl_type_builtin_vec2,
1046                                          "tmp_pack_half_2x16_f");
1047       factory.emit(assign(f, vec2_rval));
1048 
1049       /* uvec2 f32 = bitcast_f2u(f); */
1050       ir_variable *f32 = factory.make_temp(&glsl_type_builtin_uvec2,
1051                                             "tmp_pack_half_2x16_f32");
1052       factory.emit(assign(f32, expr(ir_unop_bitcast_f2u, f)));
1053 
1054       /* uvec2 f16; */
1055       ir_variable *f16 = factory.make_temp(&glsl_type_builtin_uvec2,
1056                                         "tmp_pack_half_2x16_f16");
1057 
1058       /* Get f32's unshifted exponent bits.
1059        *
1060        *   uvec2 e = f32 & 0x7f800000u;
1061        */
1062       ir_variable *e = factory.make_temp(&glsl_type_builtin_uvec2,
1063                                           "tmp_pack_half_2x16_e");
1064       factory.emit(assign(e, bit_and(f32, constant(0x7f800000u))));
1065 
1066       /* Get f32's unshifted mantissa bits.
1067        *
1068        *   uvec2 m = f32 & 0x007fffffu;
1069        */
1070       ir_variable *m = factory.make_temp(&glsl_type_builtin_uvec2,
1071                                           "tmp_pack_half_2x16_m");
1072       factory.emit(assign(m, bit_and(f32, constant(0x007fffffu))));
1073 
1074       /* Set f16's exponent and mantissa bits.
1075        *
1076        *   f16.x = pack_half_1x16_nosign(e.x, m.x);
1077        *   f16.y = pack_half_1y16_nosign(e.y, m.y);
1078        */
1079       factory.emit(assign(f16, pack_half_1x16_nosign(swizzle_x(f),
1080                                                      swizzle_x(e),
1081                                                      swizzle_x(m)),
1082                            WRITEMASK_X));
1083       factory.emit(assign(f16, pack_half_1x16_nosign(swizzle_y(f),
1084                                                      swizzle_y(e),
1085                                                      swizzle_y(m)),
1086                            WRITEMASK_Y));
1087 
1088       /* Set f16's sign bits.
1089        *
1090        *   f16 |= (f32 & (1u << 31u) >> 16u;
1091        */
1092       factory.emit(
1093          assign(f16, bit_or(f16,
1094                             rshift(bit_and(f32, constant(1u << 31u)),
1095                                    constant(16u)))));
1096 
1097 
1098       /* return (f16.y << 16u) | f16.x; */
1099       ir_rvalue *result = bit_or(lshift(swizzle_y(f16),
1100                                         constant(16u)),
1101                                  swizzle_x(f16));
1102 
1103       assert(result->type == &glsl_type_builtin_uint);
1104       return result;
1105    }
1106 
1107    /**
1108     * \brief Lower the component-wise calculation of unpackHalf2x16.
1109     *
1110     * Given a uint that encodes a float16 in its lower 16 bits, this function
1111     * returns a uint that encodes a float32 with the same value. The sign bit
1112     * of the float16 is ignored.
1113     *
1114     * \param e_rval is the unshifted exponent bits of a float16
1115     * \param m_rval is the unshifted mantissa bits of a float16
1116     * \param a uint rvalue that encodes a float32
1117     */
1118    ir_rvalue*
unpack_half_1x16_nosign(ir_rvalue * e_rval,ir_rvalue * m_rval)1119    unpack_half_1x16_nosign(ir_rvalue *e_rval, ir_rvalue *m_rval)
1120    {
1121       assert(e_rval->type == &glsl_type_builtin_uint);
1122       assert(m_rval->type == &glsl_type_builtin_uint);
1123 
1124       /* uint u32; */
1125       ir_variable *u32 = factory.make_temp(&glsl_type_builtin_uint,
1126                                            "tmp_unpack_half_1x16_u32");
1127 
1128       /* uint e = E_RVAL; */
1129       ir_variable *e = factory.make_temp(&glsl_type_builtin_uint,
1130                                           "tmp_unpack_half_1x16_e");
1131       factory.emit(assign(e, e_rval));
1132 
1133       /* uint m = M_RVAL; */
1134       ir_variable *m = factory.make_temp(&glsl_type_builtin_uint,
1135                                           "tmp_unpack_half_1x16_m");
1136       factory.emit(assign(m, m_rval));
1137 
1138       /* Preliminaries
1139        * -------------
1140        *
1141        * For a float16, the bit layout is:
1142        *
1143        *   sign:     15
1144        *   exponent: 10:14
1145        *   mantissa: 0:9
1146        *
1147        * Let f16 be a float16 value. The sign, exponent, and mantissa
1148        * determine its value thus:
1149        *
1150        *   if e16 = 0 and m16 = 0, then zero:       (-1)^s16 * 0                               (1)
1151        *   if e16 = 0 and m16!= 0, then subnormal:  (-1)^s16 * 2^(e16 - 14) * (m16 / 2^10)     (2)
1152        *   if 0 < e16 < 31, then normal:            (-1)^s16 * 2^(e16 - 15) * (1 + m16 / 2^10) (3)
1153        *   if e16 = 31 and m16 = 0, then infinite:  (-1)^s16 * inf                             (4)
1154        *   if e16 = 31 and m16 != 0, then           NaN                                        (5)
1155        *
1156        * where 0 <= m16 < 2^10.
1157        *
1158        * For a float32, the bit layout is:
1159        *
1160        *   sign: 31
1161        *   exponent: 23:30
1162        *   mantissa: 0:22
1163        *
1164        * Let f32 be a float32 value. The sign, exponent, and mantissa
1165        * determine its value thus:
1166        *
1167        *   if e32 = 0 and m32 = 0, then zero:        (-1)^s * 0                                (10)
1168        *   if e32 = 0 and m32 != 0, then subnormal:  (-1)^s * 2^(e32 - 126) * (m32 / 2^23)     (11)
1169        *   if 0 < e32 < 255, then normal:            (-1)^s * 2^(e32 - 127) * (1 + m32 / 2^23) (12)
1170        *   if e32 = 255 and m32 = 0, then infinite:  (-1)^s * inf                              (13)
1171        *   if e32 = 255 and m32 != 0, then           NaN                                       (14)
1172        *
1173        * where 0 <= m32 < 2^23.
1174        *
1175        * Calculation
1176        * -----------
1177        * Our task is to compute s32, e32, m32 given f16.  Since this function
1178        * ignores the sign bit, assume that s32 = s16 = 0.  There are several
1179        * cases consider.
1180        */
1181 
1182       factory.emit(
1183 
1184          /* Case 1) f16 is zero or subnormal.
1185           *
1186           *   The simplest method of calcuating f32 in this case is
1187           *
1188           *     f32 = f16                       (20)
1189           *         = 2^(-14) * (m16 / 2^10)    (21)
1190           *         = m16 / 2^(-24)             (22)
1191           */
1192 
1193          /* if (e16 == 0) { */
1194          if_tree(equal(e, constant(0u)),
1195 
1196             /* u32 = bitcast_f2u(float(m) / float(1 << 24)); */
1197             assign(u32, expr(ir_unop_bitcast_f2u,
1198                                 div(u2f(m), constant((float)(1 << 24))))),
1199 
1200          /* Case 2) f16 is normal.
1201           *
1202           *   The equation
1203           *
1204           *     f32 = f16                              (30)
1205           *     2^(e32 - 127) * (1 + m32 / 2^23) =     (31)
1206           *       2^(e16 - 15) * (1 + m16 / 2^10)
1207           *
1208           *   can be decomposed into two
1209           *
1210           *     2^(e32 - 127) = 2^(e16 - 15)           (32)
1211           *     1 + m32 / 2^23 = 1 + m16 / 2^10        (33)
1212           *
1213           *   which solve to
1214           *
1215           *     e32 = e16 + 112                        (34)
1216           *     m32 = m16 * 2^13                       (35)
1217           */
1218 
1219          /* } else if (e16 < 31)) { */
1220          if_tree(less(e, constant(31u << 10u)),
1221 
1222               /* u32 = ((e + (112 << 10)) | m) << 13;
1223                */
1224               assign(u32, lshift(bit_or(add(e, constant(112u << 10u)), m),
1225                                  constant(13u))),
1226 
1227 
1228          /* Case 3) f16 is infinite. */
1229          if_tree(equal(m, constant(0u)),
1230 
1231                  assign(u32, constant(255u << 23u)),
1232 
1233          /* Case 4) f16 is NaN. */
1234          /* } else { */
1235 
1236             assign(u32, constant(0x7fffffffu))))));
1237 
1238          /* } */
1239 
1240       return deref(u32).val;
1241    }
1242 
1243    /**
1244     * \brief Lower an unpackHalf2x16 expression.
1245     *
1246     * \param uint_rval is unpackHalf2x16's input
1247     * \return unpackHalf2x16's output as a vec2 rvalue
1248     */
1249    ir_rvalue*
lower_unpack_half_2x16(ir_rvalue * uint_rval)1250    lower_unpack_half_2x16(ir_rvalue *uint_rval)
1251    {
1252       /* From page 89 (95 of pdf) of the GLSL ES 3.00 spec:
1253        *
1254        *    mediump vec2 unpackHalf2x16 (highp uint v)
1255        *    ------------------------------------------
1256        *    Returns a two-component floating-point vector with components
1257        *    obtained by unpacking a 32-bit unsigned integer into a pair of 16-bit
1258        *    values, interpreting those values as 16-bit floating-point numbers
1259        *    according to the OpenGL ES Specification, and converting them to
1260        *    32-bit floating-point values.
1261        *
1262        *    The first component of the vector is obtained from the
1263        *    16 least-significant bits of v; the second component is obtained
1264        *    from the 16 most-significant bits of v.
1265        */
1266       assert(uint_rval->type == &glsl_type_builtin_uint);
1267 
1268       /* uint u = RVALUE;
1269        * uvec2 f16 = uvec2(u.x & 0xffff, u.y >> 16);
1270        */
1271       ir_variable *f16 = factory.make_temp(&glsl_type_builtin_uvec2,
1272                                             "tmp_unpack_half_2x16_f16");
1273       factory.emit(assign(f16, unpack_uint_to_uvec2(uint_rval)));
1274 
1275       /* uvec2 f32; */
1276       ir_variable *f32 = factory.make_temp(&glsl_type_builtin_uvec2,
1277                                             "tmp_unpack_half_2x16_f32");
1278 
1279       /* Get f16's unshifted exponent bits.
1280        *
1281        *    uvec2 e = f16 & 0x7c00u;
1282        */
1283       ir_variable *e = factory.make_temp(&glsl_type_builtin_uvec2,
1284                                           "tmp_unpack_half_2x16_e");
1285       factory.emit(assign(e, bit_and(f16, constant(0x7c00u))));
1286 
1287       /* Get f16's unshifted mantissa bits.
1288        *
1289        *    uvec2 m = f16 & 0x03ffu;
1290        */
1291       ir_variable *m = factory.make_temp(&glsl_type_builtin_uvec2,
1292                                           "tmp_unpack_half_2x16_m");
1293       factory.emit(assign(m, bit_and(f16, constant(0x03ffu))));
1294 
1295       /* Set f32's exponent and mantissa bits.
1296        *
1297        *   f32.x = unpack_half_1x16_nosign(e.x, m.x);
1298        *   f32.y = unpack_half_1x16_nosign(e.y, m.y);
1299        */
1300       factory.emit(assign(f32, unpack_half_1x16_nosign(swizzle_x(e),
1301                                                        swizzle_x(m)),
1302                            WRITEMASK_X));
1303       factory.emit(assign(f32, unpack_half_1x16_nosign(swizzle_y(e),
1304                                                        swizzle_y(m)),
1305                            WRITEMASK_Y));
1306 
1307       /* Set f32's sign bit.
1308        *
1309        *    f32 |= (f16 & 0x8000u) << 16u;
1310        */
1311       factory.emit(assign(f32, bit_or(f32,
1312                                        lshift(bit_and(f16,
1313                                                       constant(0x8000u)),
1314                                               constant(16u)))));
1315 
1316       /* return bitcast_u2f(f32); */
1317       ir_rvalue *result = expr(ir_unop_bitcast_u2f, f32);
1318       assert(result->type == &glsl_type_builtin_vec2);
1319       return result;
1320    }
1321 };
1322 
1323 } // namespace anonymous
1324 
1325 /**
1326  * \brief Lower the builtin packing functions.
1327  */
1328 bool
lower_packing_builtins(exec_list * instructions,bool has_shading_language_packing,bool has_gpu_shader5,bool has_half_float_packing)1329 lower_packing_builtins(exec_list *instructions,
1330                        bool has_shading_language_packing,
1331                        bool has_gpu_shader5,
1332                        bool has_half_float_packing)
1333 {
1334    if (!has_shading_language_packing)
1335       return false;
1336 
1337    int op_mask = LOWER_PACK_SNORM_2x16 |
1338                  LOWER_UNPACK_SNORM_2x16 |
1339                  LOWER_PACK_UNORM_2x16 |
1340                  LOWER_UNPACK_UNORM_2x16 |
1341                  LOWER_PACK_SNORM_4x8 |
1342                  LOWER_UNPACK_SNORM_4x8 |
1343                  LOWER_UNPACK_UNORM_4x8 |
1344                  LOWER_PACK_UNORM_4x8;
1345 
1346    if (has_gpu_shader5)
1347       op_mask |= LOWER_PACK_USE_BFI | LOWER_PACK_USE_BFE;
1348 
1349    if (!has_half_float_packing)
1350       op_mask |= LOWER_PACK_HALF_2x16 | LOWER_UNPACK_HALF_2x16;
1351 
1352    lower_packing_builtins_visitor v(op_mask);
1353    visit_list_elements(&v, instructions, true);
1354    return v.get_progress();
1355 }
1356