1 /*
2 * Copyright © 2012 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21 * DEALINGS IN THE SOFTWARE.
22 */
23
24 #include "ir.h"
25 #include "ir_builder.h"
26 #include "ir_optimization.h"
27 #include "ir_rvalue_visitor.h"
28
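/**
 * \brief Bitmask flags that control which pack/unpack builtins are lowered.
 *
 * The LOWER_{PACK,UNPACK}_* flags select the built-in expressions to lower;
 * LOWER_PACK_USE_BFI and LOWER_PACK_USE_BFE are modifiers that allow the
 * generated IR to use bitfieldInsert and bitfieldExtract.
 */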
29 enum lower_packing_builtins_op {
30 LOWER_PACK_UNPACK_NONE = 0x0000,
31
32 LOWER_PACK_SNORM_2x16 = 0x0001,
33 LOWER_UNPACK_SNORM_2x16 = 0x0002,
34
35 LOWER_PACK_UNORM_2x16 = 0x0004,
36 LOWER_UNPACK_UNORM_2x16 = 0x0008,
37
38 LOWER_PACK_HALF_2x16 = 0x0010,
39 LOWER_UNPACK_HALF_2x16 = 0x0020,
40
41 LOWER_PACK_SNORM_4x8 = 0x0040,
42 LOWER_UNPACK_SNORM_4x8 = 0x0080,
43
44 LOWER_PACK_UNORM_4x8 = 0x0100,
45 LOWER_UNPACK_UNORM_4x8 = 0x0200,
46
47 LOWER_PACK_USE_BFI = 0x0400,
48 LOWER_PACK_USE_BFE = 0x0800,
49 };
50
51 namespace {
52
53 using namespace ir_builder;
54
55 /**
56 * A visitor that lowers built-in floating-point pack/unpack expressions
57 * such as packSnorm2x16.
58 */
59 class lower_packing_builtins_visitor : public ir_rvalue_visitor {
60 public:
61 /**
62 * \param op_mask is a bitmask of `enum lower_packing_builtins_op`
63 */
64 explicit lower_packing_builtins_visitor(int op_mask)
65 : op_mask(op_mask),
66 progress(false)
67 {
68 factory.instructions = &factory_instructions;
69 }
70
71 virtual ~lower_packing_builtins_visitor()
72 {
73 assert(factory_instructions.is_empty());
74 }
75
76 lower_packing_builtins_visitor(const lower_packing_builtins_visitor &) = delete;
77 lower_packing_builtins_visitor & operator=(const lower_packing_builtins_visitor &) = delete;
78
79 bool get_progress() { return progress; }
80
81 void handle_rvalue(ir_rvalue **rvalue)
82 {
83 if (!*rvalue)
84 return;
85
86 ir_expression *expr = (*rvalue)->as_expression();
87 if (!expr)
88 return;
89
90 enum lower_packing_builtins_op lowering_op =
91 choose_lowering_op(expr->operation);
92
93 if (lowering_op == LOWER_PACK_UNPACK_NONE)
94 return;
95
96 setup_factory(ralloc_parent(expr));
97
98 ir_rvalue *op0 = expr->operands[0];
99 ralloc_steal(factory.mem_ctx, op0);
100
101 switch (lowering_op) {
102 case LOWER_PACK_SNORM_2x16:
103 *rvalue = lower_pack_snorm_2x16(op0);
104 break;
105 case LOWER_PACK_SNORM_4x8:
106 *rvalue = lower_pack_snorm_4x8(op0);
107 break;
108 case LOWER_PACK_UNORM_2x16:
109 *rvalue = lower_pack_unorm_2x16(op0);
110 break;
111 case LOWER_PACK_UNORM_4x8:
112 *rvalue = lower_pack_unorm_4x8(op0);
113 break;
114 case LOWER_PACK_HALF_2x16:
115 *rvalue = lower_pack_half_2x16(op0);
116 break;
117 case LOWER_UNPACK_SNORM_2x16:
118 *rvalue = lower_unpack_snorm_2x16(op0);
119 break;
120 case LOWER_UNPACK_SNORM_4x8:
121 *rvalue = lower_unpack_snorm_4x8(op0);
122 break;
123 case LOWER_UNPACK_UNORM_2x16:
124 *rvalue = lower_unpack_unorm_2x16(op0);
125 break;
126 case LOWER_UNPACK_UNORM_4x8:
127 *rvalue = lower_unpack_unorm_4x8(op0);
128 break;
129 case LOWER_UNPACK_HALF_2x16:
130 *rvalue = lower_unpack_half_2x16(op0);
131 break;
132 case LOWER_PACK_UNPACK_NONE:
133 case LOWER_PACK_USE_BFI:
134 case LOWER_PACK_USE_BFE:
135 assert(!"not reached");
136 break;
137 }
138
139 teardown_factory();
140 progress = true;
141 }
142
143 private:
144 const int op_mask;
145 bool progress;
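/* The factory emits lowered instructions into factory_instructions;
 * teardown_factory() splices them into the IR before the current
 * instruction.
 */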
146 ir_factory factory;
147 exec_list factory_instructions;
148
149 /**
150 * Determine the needed lowering operation by filtering \a expr_op
151 * through \ref op_mask.
152 */
153 enum lower_packing_builtins_op
154 choose_lowering_op(ir_expression_operation expr_op)
155 {
156 /* C++ regards int and enum as fundamentally different types.
157 * So, we can't simply return from each case; we must cast the return
158 * value.
159 */
160 int result;
161
162 switch (expr_op) {
163 case ir_unop_pack_snorm_2x16:
164 result = op_mask & LOWER_PACK_SNORM_2x16;
165 break;
166 case ir_unop_pack_snorm_4x8:
167 result = op_mask & LOWER_PACK_SNORM_4x8;
168 break;
169 case ir_unop_pack_unorm_2x16:
170 result = op_mask & LOWER_PACK_UNORM_2x16;
171 break;
172 case ir_unop_pack_unorm_4x8:
173 result = op_mask & LOWER_PACK_UNORM_4x8;
174 break;
175 case ir_unop_pack_half_2x16:
176 result = op_mask & LOWER_PACK_HALF_2x16;
177 break;
178 case ir_unop_unpack_snorm_2x16:
179 result = op_mask & LOWER_UNPACK_SNORM_2x16;
180 break;
181 case ir_unop_unpack_snorm_4x8:
182 result = op_mask & LOWER_UNPACK_SNORM_4x8;
183 break;
184 case ir_unop_unpack_unorm_2x16:
185 result = op_mask & LOWER_UNPACK_UNORM_2x16;
186 break;
187 case ir_unop_unpack_unorm_4x8:
188 result = op_mask & LOWER_UNPACK_UNORM_4x8;
189 break;
190 case ir_unop_unpack_half_2x16:
191 result = op_mask & LOWER_UNPACK_HALF_2x16;
192 break;
193 default:
194 result = LOWER_PACK_UNPACK_NONE;
195 break;
196 }
197
198 return static_cast<enum lower_packing_builtins_op>(result);
199 }
200
201 void
202 setup_factory(void *mem_ctx)
203 {
204 assert(factory.mem_ctx == NULL);
205 assert(factory.instructions->is_empty());
206
207 factory.mem_ctx = mem_ctx;
208 }
209
210 void
211 teardown_factory()
212 {
213 base_ir->insert_before(factory.instructions);
214 assert(factory.instructions->is_empty());
215 factory.mem_ctx = NULL;
216 }
217
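/**
 * \brief Convenience wrapper that builds an ir_constant through the factory.
 */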
218 template <typename T>
219 ir_constant*
220 constant(T x)
221 {
222 return factory.constant(x);
223 }
224
225 /**
226 * \brief Pack two uint16's into a single uint32.
227 *
228 * Interpret the given uvec2 as a uint16 pair. Pack the pair into a uint32
229 * where the least significant bits specify the first element of the pair.
230 * Return the uint32.
231 */
232 ir_rvalue*
233 pack_uvec2_to_uint(ir_rvalue *uvec2_rval)
234 {
235 assert(uvec2_rval->type == &glsl_type_builtin_uvec2);
236
237 /* uvec2 u = UVEC2_RVAL; */
238 ir_variable *u = factory.make_temp(&glsl_type_builtin_uvec2,
239 "tmp_pack_uvec2_to_uint");
240 factory.emit(assign(u, uvec2_rval));
241
242 if (op_mask & LOWER_PACK_USE_BFI) {
243 return bitfield_insert(bit_and(swizzle_x(u), constant(0xffffu)),
244 swizzle_y(u),
245 constant(16u),
246 constant(16u));
247 }
248
249 /* return (u.y << 16) | (u.x & 0xffff); */
250 return bit_or(lshift(swizzle_y(u), constant(16u)),
251 bit_and(swizzle_x(u), constant(0xffffu)));
252 }
253
254 /**
255 * \brief Pack four uint8's into a single uint32.
256 *
257 * Interpret the given uvec4 as a uint8 4-tuple. Pack the 4-tuple into a
258 * uint32 where the least significant bits specify the first element of the
259 * 4-tuple. Return the uint32.
260 */
261 ir_rvalue*
262 pack_uvec4_to_uint(ir_rvalue *uvec4_rval)
263 {
264 assert(uvec4_rval->type == &glsl_type_builtin_uvec4);
265
266 ir_variable *u = factory.make_temp(&glsl_type_builtin_uvec4,
267 "tmp_pack_uvec4_to_uint");
268
269 if (op_mask & LOWER_PACK_USE_BFI) {
270 /* uvec4 u = UVEC4_RVAL; */
271 factory.emit(assign(u, uvec4_rval));
272
273 return bitfield_insert(bitfield_insert(
274 bitfield_insert(
275 bit_and(swizzle_x(u), constant(0xffu)),
276 swizzle_y(u), constant(8u), constant(8u)),
277 swizzle_z(u), constant(16u), constant(8u)),
278 swizzle_w(u), constant(24u), constant(8u));
279 }
280
281 /* uvec4 u = UVEC4_RVAL & 0xff */
282 factory.emit(assign(u, bit_and(uvec4_rval, constant(0xffu))));
283
284 /* return (u.w << 24) | (u.z << 16) | (u.y << 8) | u.x; */
285 return bit_or(bit_or(lshift(swizzle_w(u), constant(24u)),
286 lshift(swizzle_z(u), constant(16u))),
287 bit_or(lshift(swizzle_y(u), constant(8u)),
288 swizzle_x(u)));
289 }
290
291 /**
292 * \brief Unpack a uint32 into two uint16's.
293 *
294 * Interpret the given uint32 as a uint16 pair where the uint32's least
295 * significant bits specify the pair's first element. Return the uint16
296 * pair as a uvec2.
297 */
298 ir_rvalue*
299 unpack_uint_to_uvec2(ir_rvalue *uint_rval)
300 {
301 assert(uint_rval->type == &glsl_type_builtin_uint);
302
303 /* uint u = UINT_RVAL; */
304 ir_variable *u = factory.make_temp(&glsl_type_builtin_uint,
305 "tmp_unpack_uint_to_uvec2_u");
306 factory.emit(assign(u, uint_rval));
307
308 /* uvec2 u2; */
309 ir_variable *u2 = factory.make_temp(&glsl_type_builtin_uvec2,
310 "tmp_unpack_uint_to_uvec2_u2");
311
312 /* u2.x = u & 0xffffu; */
313 factory.emit(assign(u2, bit_and(u, constant(0xffffu)), WRITEMASK_X));
314
315 /* u2.y = u >> 16u; */
316 factory.emit(assign(u2, rshift(u, constant(16u)), WRITEMASK_Y));
317
318 return deref(u2).val;
319 }
320
321 /**
322 * \brief Unpack a uint32 into two int16's.
323 *
324 * Specifically each 16-bit value is sign-extended to the full width of an
325 * int32 on return.
326 */
327 ir_rvalue *
328 unpack_uint_to_ivec2(ir_rvalue *uint_rval)
329 {
330 assert(uint_rval->type == &glsl_type_builtin_uint);
331
332 if (!(op_mask & LOWER_PACK_USE_BFE)) {
333 return rshift(lshift(u2i(unpack_uint_to_uvec2(uint_rval)),
334 constant(16u)),
335 constant(16u));
336 }
337
338 ir_variable *i = factory.make_temp(&glsl_type_builtin_int,
339 "tmp_unpack_uint_to_ivec2_i");
340 factory.emit(assign(i, u2i(uint_rval)));
341
342 /* ivec2 i2; */
343 ir_variable *i2 = factory.make_temp(&glsl_type_builtin_ivec2,
344 "tmp_unpack_uint_to_ivec2_i2");
345
346 factory.emit(assign(i2, bitfield_extract(i, constant(0), constant(16)),
347 WRITEMASK_X));
348 factory.emit(assign(i2, bitfield_extract(i, constant(16), constant(16)),
349 WRITEMASK_Y));
350
351 return deref(i2).val;
352 }
353
354 /**
355 * \brief Unpack a uint32 into four uint8's.
356 *
357 * Interpret the given uint32 as a uint8 4-tuple where the uint32's least
358 * significant bits specify the 4-tuple's first element. Return the uint8
359 * 4-tuple as a uvec4.
360 */
361 ir_rvalue*
362 unpack_uint_to_uvec4(ir_rvalue *uint_rval)
363 {
364 assert(uint_rval->type == &glsl_type_builtin_uint);
365
366 /* uint u = UINT_RVAL; */
367 ir_variable *u = factory.make_temp(&glsl_type_builtin_uint,
368 "tmp_unpack_uint_to_uvec4_u");
369 factory.emit(assign(u, uint_rval));
370
371 /* uvec4 u4; */
372 ir_variable *u4 = factory.make_temp(&glsl_type_builtin_uvec4,
373 "tmp_unpack_uint_to_uvec4_u4");
374
375 /* u4.x = u & 0xffu; */
376 factory.emit(assign(u4, bit_and(u, constant(0xffu)), WRITEMASK_X));
377
378 if (op_mask & LOWER_PACK_USE_BFE) {
379 /* u4.y = bitfield_extract(u, 8, 8); */
380 factory.emit(assign(u4, bitfield_extract(u, constant(8u), constant(8u)),
381 WRITEMASK_Y));
382
383 /* u4.z = bitfield_extract(u, 16, 8); */
384 factory.emit(assign(u4, bitfield_extract(u, constant(16u), constant(8u)),
385 WRITEMASK_Z));
386 } else {
387 /* u4.y = (u >> 8u) & 0xffu; */
388 factory.emit(assign(u4, bit_and(rshift(u, constant(8u)),
389 constant(0xffu)), WRITEMASK_Y));
390
391 /* u4.z = (u >> 16u) & 0xffu; */
392 factory.emit(assign(u4, bit_and(rshift(u, constant(16u)),
393 constant(0xffu)), WRITEMASK_Z));
394 }
395
396 /* u4.w = (u >> 24u) */
397 factory.emit(assign(u4, rshift(u, constant(24u)), WRITEMASK_W));
398
399 return deref(u4).val;
400 }
401
402 /**
403 * \brief Unpack a uint32 into four int8's.
404 *
405 * Specifically each 8-bit value is sign-extended to the full width of an
406 * int32 on return.
407 */
408 ir_rvalue *
409 unpack_uint_to_ivec4(ir_rvalue *uint_rval)
410 {
411 assert(uint_rval->type == &glsl_type_builtin_uint);
412
413 if (!(op_mask & LOWER_PACK_USE_BFE)) {
414 return rshift(lshift(u2i(unpack_uint_to_uvec4(uint_rval)),
415 constant(24u)),
416 constant(24u));
417 }
418
419 ir_variable *i = factory.make_temp(&glsl_type_builtin_int,
420 "tmp_unpack_uint_to_ivec4_i");
421 factory.emit(assign(i, u2i(uint_rval)));
422
423 /* ivec4 i4; */
424 ir_variable *i4 = factory.make_temp(&glsl_type_builtin_ivec4,
425 "tmp_unpack_uint_to_ivec4_i4");
426
427 factory.emit(assign(i4, bitfield_extract(i, constant(0), constant(8)),
428 WRITEMASK_X));
429 factory.emit(assign(i4, bitfield_extract(i, constant(8), constant(8)),
430 WRITEMASK_Y));
431 factory.emit(assign(i4, bitfield_extract(i, constant(16), constant(8)),
432 WRITEMASK_Z));
433 factory.emit(assign(i4, bitfield_extract(i, constant(24), constant(8)),
434 WRITEMASK_W));
435
436 return deref(i4).val;
437 }
438
439 /**
440 * \brief Lower a packSnorm2x16 expression.
441 *
442 * \param vec2_rval is packSnorm2x16's input
443 * \return packSnorm2x16's output as a uint rvalue
444 */
445 ir_rvalue*
446 lower_pack_snorm_2x16(ir_rvalue *vec2_rval)
447 {
448 /* From page 88 (94 of pdf) of the GLSL ES 3.00 spec:
449 *
450 * highp uint packSnorm2x16(vec2 v)
451 * --------------------------------
452 * First, converts each component of the normalized floating-point value
453 * v into 16-bit integer values. Then, the results are packed into the
454 * returned 32-bit unsigned integer.
455 *
456 * The conversion for component c of v to fixed point is done as
457 * follows:
458 *
459 * packSnorm2x16: round(clamp(c, -1, +1) * 32767.0)
460 *
461 * The first component of the vector will be written to the least
462 * significant bits of the output; the last component will be written to
463 * the most significant bits.
464 *
465 * This function generates IR that approximates the following pseudo-GLSL:
466 *
467 * return pack_uvec2_to_uint(
468 * uvec2(ivec2(
469 * round(clamp(VEC2_RVALUE, -1.0f, 1.0f) * 32767.0f))));
470 *
471 * It is necessary to first convert the vec2 to ivec2 rather than directly
472 * converting vec2 to uvec2 because the latter conversion is undefined.
473 * From page 56 (62 of pdf) of the GLSL ES 3.00 spec: "It is undefined to
474 * convert a negative floating point value to an uint".
475 */
476 assert(vec2_rval->type == &glsl_type_builtin_vec2);
477
478 ir_rvalue *result = pack_uvec2_to_uint(
479 i2u(f2i(round_even(mul(clamp(vec2_rval,
480 constant(-1.0f),
481 constant(1.0f)),
482 constant(32767.0f))))));
483
484 assert(result->type == &glsl_type_builtin_uint);
485 return result;
486 }
487
488 /**
489 * \brief Lower a packSnorm4x8 expression.
490 *
491 * \param vec4_rval is packSnorm4x8's input
492 * \return packSnorm4x8's output as a uint rvalue
493 */
494 ir_rvalue*
495 lower_pack_snorm_4x8(ir_rvalue *vec4_rval)
496 {
497 /* From page 137 (143 of pdf) of the GLSL 4.30 spec:
498 *
499 * highp uint packSnorm4x8(vec4 v)
500 * -------------------------------
501 * First, converts each component of the normalized floating-point value
502 * v into 8-bit integer values. Then, the results are packed into the
503 * returned 32-bit unsigned integer.
504 *
505 * The conversion for component c of v to fixed point is done as
506 * follows:
507 *
508 * packSnorm4x8: round(clamp(c, -1, +1) * 127.0)
509 *
510 * The first component of the vector will be written to the least
511 * significant bits of the output; the last component will be written to
512 * the most significant bits.
513 *
514 * This function generates IR that approximates the following pseudo-GLSL:
515 *
516 * return pack_uvec4_to_uint(
517 * uvec4(ivec4(
518 * round(clamp(VEC4_RVALUE, -1.0f, 1.0f) * 127.0f))));
519 *
520 * It is necessary to first convert the vec4 to ivec4 rather than directly
521 * converting vec4 to uvec4 because the latter conversion is undefined.
522 * From page 87 (93 of pdf) of the GLSL 4.30 spec: "It is undefined to
523 * convert a negative floating point value to an uint".
524 */
525 assert(vec4_rval->type == &glsl_type_builtin_vec4);
526
527 ir_rvalue *result = pack_uvec4_to_uint(
528 i2u(f2i(round_even(mul(clamp(vec4_rval,
529 constant(-1.0f),
530 constant(1.0f)),
531 constant(127.0f))))));
532
533 assert(result->type == &glsl_type_builtin_uint);
534 return result;
535 }
536
537 /**
538 * \brief Lower an unpackSnorm2x16 expression.
539 *
540 * \param uint_rval is unpackSnorm2x16's input
541 * \return unpackSnorm2x16's output as a vec2 rvalue
542 */
543 ir_rvalue*
544 lower_unpack_snorm_2x16(ir_rvalue *uint_rval)
545 {
546 /* From page 88 (94 of pdf) of the GLSL ES 3.00 spec:
547 *
548 * highp vec2 unpackSnorm2x16 (highp uint p)
549 * -----------------------------------------
550 * First, unpacks a single 32-bit unsigned integer p into a pair of
551 * 16-bit unsigned integers. Then, each component is converted to
552 * a normalized floating-point value to generate the returned
553 * two-component vector.
554 *
555 * The conversion for unpacked fixed-point value f to floating point is
556 * done as follows:
557 *
558 * unpackSnorm2x16: clamp(f / 32767.0, -1,+1)
559 *
560 * The first component of the returned vector will be extracted from the
561 * least significant bits of the input; the last component will be
562 * extracted from the most significant bits.
563 *
564 * This function generates IR that approximates the following pseudo-GLSL:
565 *
566 * return clamp(
567 * ((ivec2(unpack_uint_to_uvec2(UINT_RVALUE)) << 16) >> 16) / 32767.0f,
568 * -1.0f, 1.0f);
569 *
570 * The above IR may appear unnecessarily complex, but the intermediate
571 * conversion to ivec2 and the bit shifts are necessary to correctly unpack
572 * negative floats.
573 *
574 * To see why, consider packing and then unpacking vec2(-1.0, 0.0).
575 * packSnorm2x16 encodes -1.0 as the int16 0xffff. During unpacking, we
576 * place that int16 into an int32, which results in the *positive* integer
577 * 0x0000ffff. The int16's sign bit becomes, in the int32, the rather
578 * unimportant bit 16. We must now extend the int16's sign bit into bits
579 * 17-32, which is accomplished by left-shifting then right-shifting.
580 */
581
582 assert(uint_rval->type == &glsl_type_builtin_uint);
583
584 ir_rvalue *result =
585 clamp(div(i2f(unpack_uint_to_ivec2(uint_rval)),
586 constant(32767.0f)),
587 constant(-1.0f),
588 constant(1.0f));
589
590 assert(result->type == &glsl_type_builtin_vec2);
591 return result;
592 }
593
594 /**
595 * \brief Lower an unpackSnorm4x8 expression.
596 *
597 * \param uint_rval is unpackSnorm4x8's input
598 * \return unpackSnorm4x8's output as a vec4 rvalue
599 */
600 ir_rvalue*
601 lower_unpack_snorm_4x8(ir_rvalue *uint_rval)
602 {
603 /* From page 137 (143 of pdf) of the GLSL 4.30 spec:
604 *
605 * highp vec4 unpackSnorm4x8 (highp uint p)
606 * ----------------------------------------
607 * First, unpacks a single 32-bit unsigned integer p into four
608 * 8-bit unsigned integers. Then, each component is converted to
609 * a normalized floating-point value to generate the returned
610 * four-component vector.
611 *
612 * The conversion for unpacked fixed-point value f to floating point is
613 * done as follows:
614 *
615 * unpackSnorm4x8: clamp(f / 127.0, -1, +1)
616 *
617 * The first component of the returned vector will be extracted from the
618 * least significant bits of the input; the last component will be
619 * extracted from the most significant bits.
620 *
621 * This function generates IR that approximates the following pseudo-GLSL:
622 *
623 * return clamp(
624 * ((ivec4(unpack_uint_to_uvec4(UINT_RVALUE)) << 24) >> 24) / 127.0f,
625 * -1.0f, 1.0f);
626 *
627 * The above IR may appear unnecessarily complex, but the intermediate
628 * conversion to ivec4 and the bit shifts are necessary to correctly unpack
629 * negative floats.
630 *
631 * To see why, consider packing and then unpacking vec4(-1.0, 0.0, 0.0,
632 * 0.0). packSnorm4x8 encodes -1.0 as the int8 0xff. During unpacking, we
633 * place that int8 into an int32, which results in the *positive* integer
634 * 0x000000ff. The int8's sign bit becomes, in the int32, the rather
635 * unimportant bit 8. We must now extend the int8's sign bit into bits
636 * 9-32, which is accomplished by left-shifting then right-shifting.
637 */
638
639 assert(uint_rval->type == &glsl_type_builtin_uint);
640
641 ir_rvalue *result =
642 clamp(div(i2f(unpack_uint_to_ivec4(uint_rval)),
643 constant(127.0f)),
644 constant(-1.0f),
645 constant(1.0f));
646
647 assert(result->type == &glsl_type_builtin_vec4);
648 return result;
649 }
650
651 /**
652 * \brief Lower a packUnorm2x16 expression.
653 *
654 * \param vec2_rval is packUnorm2x16's input
655 * \return packUnorm2x16's output as a uint rvalue
656 */
657 ir_rvalue*
658 lower_pack_unorm_2x16(ir_rvalue *vec2_rval)
659 {
660 /* From page 88 (94 of pdf) of the GLSL ES 3.00 spec:
661 *
662 * highp uint packUnorm2x16 (vec2 v)
663 * ---------------------------------
664 * First, converts each component of the normalized floating-point value
665 * v into 16-bit integer values. Then, the results are packed into the
666 * returned 32-bit unsigned integer.
667 *
668 * The conversion for component c of v to fixed point is done as
669 * follows:
670 *
671 * packUnorm2x16: round(clamp(c, 0, +1) * 65535.0)
672 *
673 * The first component of the vector will be written to the least
674 * significant bits of the output; the last component will be written to
675 * the most significant bits.
676 *
677 * This function generates IR that approximates the following pseudo-GLSL:
678 *
679 * return pack_uvec2_to_uint(uvec2(
680 * round(clamp(VEC2_RVALUE, 0.0f, 1.0f) * 65535.0f)));
681 *
682 * Here it is safe to directly convert the vec2 to uvec2 because the vec2
683 * has been clamped to a non-negative range.
684 */
685
686 assert(vec2_rval->type == &glsl_type_builtin_vec2);
687
688 ir_rvalue *result = pack_uvec2_to_uint(
689 f2u(round_even(mul(saturate(vec2_rval), constant(65535.0f)))));
690
691 assert(result->type == &glsl_type_builtin_uint);
692 return result;
693 }
694
695 /**
696 * \brief Lower a packUnorm4x8 expression.
697 *
698 * \param vec4_rval is packUnorm4x8's input
699 * \return packUnorm4x8's output as a uint rvalue
700 */
701 ir_rvalue*
702 lower_pack_unorm_4x8(ir_rvalue *vec4_rval)
703 {
704 /* From page 137 (143 of pdf) of the GLSL 4.30 spec:
705 *
706 * highp uint packUnorm4x8 (vec4 v)
707 * --------------------------------
708 * First, converts each component of the normalized floating-point value
709 * v into 8-bit integer values. Then, the results are packed into the
710 * returned 32-bit unsigned integer.
711 *
712 * The conversion for component c of v to fixed point is done as
713 * follows:
714 *
715 * packUnorm4x8: round(clamp(c, 0, +1) * 255.0)
716 *
717 * The first component of the vector will be written to the least
718 * significant bits of the output; the last component will be written to
719 * the most significant bits.
720 *
721 * This function generates IR that approximates the following pseudo-GLSL:
722 *
723 * return pack_uvec4_to_uint(uvec4(
724 * round(clamp(VEC4_RVALUE, 0.0f, 1.0f) * 255.0f)));
725 *
726 * Here it is safe to directly convert the vec4 to uvec4 because the vec4
727 * has been clamped to a non-negative range.
728 */
729
730 assert(vec4_rval->type == &glsl_type_builtin_vec4);
731
732 ir_rvalue *result = pack_uvec4_to_uint(
733 f2u(round_even(mul(saturate(vec4_rval), constant(255.0f)))));
734
735 assert(result->type == &glsl_type_builtin_uint);
736 return result;
737 }
738
739 /**
740 * \brief Lower an unpackUnorm2x16 expression.
741 *
742 * \param uint_rval is unpackUnorm2x16's input
743 * \return unpackUnorm2x16's output as a vec2 rvalue
744 */
745 ir_rvalue*
746 lower_unpack_unorm_2x16(ir_rvalue *uint_rval)
747 {
748 /* From page 89 (95 of pdf) of the GLSL ES 3.00 spec:
749 *
750 * highp vec2 unpackUnorm2x16 (highp uint p)
751 * -----------------------------------------
752 * First, unpacks a single 32-bit unsigned integer p into a pair of
753 * 16-bit unsigned integers. Then, each component is converted to
754 * a normalized floating-point value to generate the returned
755 * two-component vector.
756 *
757 * The conversion for unpacked fixed-point value f to floating point is
758 * done as follows:
759 *
760 * unpackUnorm2x16: f / 65535.0
761 *
762 * The first component of the returned vector will be extracted from the
763 * least significant bits of the input; the last component will be
764 * extracted from the most significant bits.
765 *
766 * This function generates IR that approximates the following pseudo-GLSL:
767 *
768 * return vec2(unpack_uint_to_uvec2(UINT_RVALUE)) / 65535.0;
769 */
770
771 assert(uint_rval->type == &glsl_type_builtin_uint);
772
773 ir_rvalue *result = div(u2f(unpack_uint_to_uvec2(uint_rval)),
774 constant(65535.0f));
775
776 assert(result->type == &glsl_type_builtin_vec2);
777 return result;
778 }
779
780 /**
781 * \brief Lower an unpackUnorm4x8 expression.
782 *
783 * \param uint_rval is unpackUnorm4x8's input
784 * \return unpackUnorm4x8's output as a vec4 rvalue
785 */
786 ir_rvalue*
787 lower_unpack_unorm_4x8(ir_rvalue *uint_rval)
788 {
789 /* From page 137 (143 of pdf) of the GLSL 4.30 spec:
790 *
791 * highp vec4 unpackUnorm4x8 (highp uint p)
792 * ----------------------------------------
793 * First, unpacks a single 32-bit unsigned integer p into four
794 * 8-bit unsigned integers. Then, each component is converted to
795 * a normalized floating-point value to generate the returned
796 * four-component vector.
797 *
798 * The conversion for unpacked fixed-point value f to floating point is
799 * done as follows:
800 *
801 * unpackUnorm4x8: f / 255.0
802 *
803 * The first component of the returned vector will be extracted from the
804 * least significant bits of the input; the last component will be
805 * extracted from the most significant bits.
806 *
807 * This function generates IR that approximates the following pseudo-GLSL:
808 *
809 * return vec4(unpack_uint_to_uvec4(UINT_RVALUE)) / 255.0;
810 */
811
812 assert(uint_rval->type == &glsl_type_builtin_uint);
813
814 ir_rvalue *result = div(u2f(unpack_uint_to_uvec4(uint_rval)),
815 constant(255.0f));
816
817 assert(result->type == &glsl_type_builtin_vec4);
818 return result;
819 }
820
821 /**
822 * \brief Lower the component-wise calculation of packHalf2x16.
823 *
824 * \param f_rval is one component of packHalf2x16's input
825 * \param e_rval is the unshifted exponent bits of f_rval
826 * \param m_rval is the unshifted mantissa bits of f_rval
827 *
828 * \return a uint rvalue that encodes a float16 in its lower 16 bits
829 */
830 ir_rvalue*
831 pack_half_1x16_nosign(ir_rvalue *f_rval,
832 ir_rvalue *e_rval,
833 ir_rvalue *m_rval)
834 {
835 assert(e_rval->type == &glsl_type_builtin_uint);
836 assert(m_rval->type == &glsl_type_builtin_uint);
837
838 /* uint u16; */
839 ir_variable *u16 = factory.make_temp(&glsl_type_builtin_uint,
840 "tmp_pack_half_1x16_u16");
841
842 /* float f = FLOAT_RVAL; */
843 ir_variable *f = factory.make_temp(&glsl_type_builtin_float,
844 "tmp_pack_half_1x16_f");
845 factory.emit(assign(f, f_rval));
846
847 /* uint e = E_RVAL; */
848 ir_variable *e = factory.make_temp(&glsl_type_builtin_uint,
849 "tmp_pack_half_1x16_e");
850 factory.emit(assign(e, e_rval));
851
852 /* uint m = M_RVAL; */
853 ir_variable *m = factory.make_temp(&glsl_type_builtin_uint,
854 "tmp_pack_half_1x16_m");
855 factory.emit(assign(m, m_rval));
856
857 /* Preliminaries
858 * -------------
859 *
860 * For a float16, the bit layout is:
861 *
862 * sign: 15
863 * exponent: 10:14
864 * mantissa: 0:9
865 *
866 * Let f16 be a float16 value. The sign, exponent, and mantissa
867 * determine its value thus:
868 *
869 * if e16 = 0 and m16 = 0, then zero: (-1)^s16 * 0 (1)
870 * if e16 = 0 and m16!= 0, then subnormal: (-1)^s16 * 2^(e16 - 14) * (m16 / 2^10) (2)
871 * if 0 < e16 < 31, then normal: (-1)^s16 * 2^(e16 - 15) * (1 + m16 / 2^10) (3)
872 * if e16 = 31 and m16 = 0, then infinite: (-1)^s16 * inf (4)
873 * if e16 = 31 and m16 != 0, then NaN (5)
874 *
875 * where 0 <= m16 < 2^10.
876 *
877 * For a float32, the bit layout is:
878 *
879 * sign: 31
880 * exponent: 23:30
881 * mantissa: 0:22
882 *
883 * Let f32 be a float32 value. The sign, exponent, and mantissa
884 * determine its value thus:
885 *
886 * if e32 = 0 and m32 = 0, then zero: (-1)^s * 0 (10)
887 * if e32 = 0 and m32 != 0, then subnormal: (-1)^s * 2^(e32 - 126) * (m32 / 2^23) (11)
888 * if 0 < e32 < 255, then normal: (-1)^s * 2^(e32 - 127) * (1 + m32 / 2^23) (12)
889 * if e32 = 255 and m32 = 0, then infinite: (-1)^s * inf (13)
890 * if e32 = 255 and m32 != 0, then NaN (14)
891 *
892 * where 0 <= m32 < 2^23.
893 *
894 * The minimum and maximum normal float16 values are
895 *
896 * min_norm16 = 2^(1 - 15) * (1 + 0 / 2^10) = 2^(-14) (20)
897 * max_norm16 = 2^(30 - 15) * (1 + 1023 / 2^10) (21)
898 *
899 * The step at max_norm16 is
900 *
901 * max_step16 = 2^5 (22)
902 *
903 * Observe that the float16 boundary values in equations 20-21 lie in the
904 * range of normal float32 values.
905 *
906 *
907 * Rounding Behavior
908 * -----------------
909 * Not all float32 values can be exactly represented as a float16. We
910 * round all such intermediate float32 values to the nearest float16; if
911 * the float32 is exactly between two float16 values, we round to the one
912 * with an even mantissa. This rounding behavior has several benefits:
913 *
914 * - It has no sign bias.
915 *
916 * - It reproduces the behavior of real hardware: opcode F32TO16 in Intel's
917 * GPU ISA.
918 *
919 * - By reproducing the behavior of the GPU (at least on Intel hardware),
920 * compile-time evaluation of constant packHalf2x16 GLSL expressions will
921 * result in the same value as if the expression were executed on the
922 * GPU.
923 *
924 * Calculation
925 * -----------
926 * Our task is to compute s16, e16, m16 given f32. Since this function
927 * ignores the sign bit, assume that s32 = s16 = 0. There are several
928 * cases to consider.
929 */
930
931 factory.emit(
932
933 /* Case 1) f32 is NaN
934 *
935 * The resultant f16 will also be NaN.
936 */
937
938 /* if (e32 == 255 && m32 != 0) { */
939 if_tree(logic_and(equal(e, constant(0xffu << 23u)),
940 logic_not(equal(m, constant(0u)))),
941
942 assign(u16, constant(0x7fffu)),
943
944 /* Case 2) f32 lies in the range [0, min_norm16).
945 *
946 * The resultant float16 will be either zero, subnormal, or normal.
947 *
948 * Solving
949 *
950 * f32 = min_norm16 (30)
951 *
952 * gives
953 *
954 * e32 = 113 and m32 = 0 (31)
955 *
956 * Therefore this case occurs if and only if
957 *
958 * e32 < 113 (32)
959 */
960
961 /* } else if (e32 < 113) { */
962 if_tree(less(e, constant(113u << 23u)),
963
964 /* u16 = uint(round_to_even(abs(f32) * float(1u << 24u))); */
965 assign(u16, f2u(round_even(mul(expr(ir_unop_abs, f),
966 constant((float) (1 << 24)))))),
967
968 /* Case 3) f32 lies in the range
969 * [min_norm16, max_norm16 + max_step16).
970 *
971 * The resultant float16 will be either normal or infinite.
972 *
973 * Solving
974 *
975 * f32 = max_norm16 + max_step16 (40)
976 * = 2^15 * (1 + 1023 / 2^10) + 2^5 (41)
977 * = 2^16 (42)
978 * gives
979 *
980 * e32 = 143 and m32 = 0 (43)
981 *
982 * We already solved the boundary condition f32 = min_norm16 above
983 * in equation 31. Therefore this case occurs if and only if
984 *
985 * 113 <= e32 and e32 < 143
986 */
987
988 /* } else if (e32 < 143) { */
989 if_tree(less(e, constant(143u << 23u)),
990
991 /* The addition below handles the case where the mantissa rounds
992 * up to 1024 and bumps the exponent.
993 *
994 * u16 = ((e - (112u << 23u)) >> 13u)
995 * + round_to_even(float(m) / (1u << 13u));
996 */
997 assign(u16, add(rshift(sub(e, constant(112u << 23u)),
998 constant(13u)),
999 f2u(round_even(
1000 div(u2f(m), constant((float) (1 << 13))))))),
1001
1002 /* Case 4) f32 lies in the range [max_norm16 + max_step16, inf].
1003 *
1004 * The resultant float16 will be infinite.
1005 *
1006 * The cases above caught all float32 values in the range
1007 * [0, max_norm16 + max_step16), so this is the fall-through case.
1008 */
1009
1010 /* } else { */
1011
1012 assign(u16, constant(31u << 10u))))));
1013
1014 /* } */
1015
1016 return deref(u16).val;
1017 }
1018
1019 /**
1020 * \brief Lower a packHalf2x16 expression.
1021 *
1022 * \param vec2_rval is packHalf2x16's input
1023 * \return packHalf2x16's output as a uint rvalue
1024 */
1025 ir_rvalue*
1026 lower_pack_half_2x16(ir_rvalue *vec2_rval)
1027 {
1028 /* From page 89 (95 of pdf) of the GLSL ES 3.00 spec:
1029 *
1030 * highp uint packHalf2x16 (mediump vec2 v)
1031 * ----------------------------------------
1032 * Returns an unsigned integer obtained by converting the components of
1033 * a two-component floating-point vector to the 16-bit floating-point
1034 * representation found in the OpenGL ES Specification, and then packing
1035 * these two 16-bit integers into a 32-bit unsigned integer.
1036 *
1037 * The first vector component specifies the 16 least-significant bits
1038 * of the result; the second component specifies the 16 most-significant
1039 * bits.
1040 */
1041
1042 assert(vec2_rval->type == &glsl_type_builtin_vec2);
1043
1044 /* vec2 f = VEC2_RVAL; */
1045 ir_variable *f = factory.make_temp(&glsl_type_builtin_vec2,
1046 "tmp_pack_half_2x16_f");
1047 factory.emit(assign(f, vec2_rval));
1048
1049 /* uvec2 f32 = bitcast_f2u(f); */
1050 ir_variable *f32 = factory.make_temp(&glsl_type_builtin_uvec2,
1051 "tmp_pack_half_2x16_f32");
1052 factory.emit(assign(f32, expr(ir_unop_bitcast_f2u, f)));
1053
1054 /* uvec2 f16; */
1055 ir_variable *f16 = factory.make_temp(&glsl_type_builtin_uvec2,
1056 "tmp_pack_half_2x16_f16");
1057
1058 /* Get f32's unshifted exponent bits.
1059 *
1060 * uvec2 e = f32 & 0x7f800000u;
1061 */
1062 ir_variable *e = factory.make_temp(&glsl_type_builtin_uvec2,
1063 "tmp_pack_half_2x16_e");
1064 factory.emit(assign(e, bit_and(f32, constant(0x7f800000u))));
1065
1066 /* Get f32's unshifted mantissa bits.
1067 *
1068 * uvec2 m = f32 & 0x007fffffu;
1069 */
1070 ir_variable *m = factory.make_temp(&glsl_type_builtin_uvec2,
1071 "tmp_pack_half_2x16_m");
1072 factory.emit(assign(m, bit_and(f32, constant(0x007fffffu))));
1073
1074 /* Set f16's exponent and mantissa bits.
1075 *
1076 * f16.x = pack_half_1x16_nosign(e.x, m.x);
1077 * f16.y = pack_half_1x16_nosign(e.y, m.y);
1078 */
1079 factory.emit(assign(f16, pack_half_1x16_nosign(swizzle_x(f),
1080 swizzle_x(e),
1081 swizzle_x(m)),
1082 WRITEMASK_X));
1083 factory.emit(assign(f16, pack_half_1x16_nosign(swizzle_y(f),
1084 swizzle_y(e),
1085 swizzle_y(m)),
1086 WRITEMASK_Y));
1087
1088 /* Set f16's sign bits.
1089 *
1090 * f16 |= (f32 & (1u << 31u)) >> 16u;
1091 */
1092 factory.emit(
1093 assign(f16, bit_or(f16,
1094 rshift(bit_and(f32, constant(1u << 31u)),
1095 constant(16u)))));
1096
1097
1098 /* return (f16.y << 16u) | f16.x; */
1099 ir_rvalue *result = bit_or(lshift(swizzle_y(f16),
1100 constant(16u)),
1101 swizzle_x(f16));
1102
1103 assert(result->type == &glsl_type_builtin_uint);
1104 return result;
1105 }
1106
1107 /**
1108 * \brief Lower the component-wise calculation of unpackHalf2x16.
1109 *
1110 * Given a uint that encodes a float16 in its lower 16 bits, this function
1111 * returns a uint that encodes a float32 with the same value. The sign bit
1112 * of the float16 is ignored.
1113 *
1114 * \param e_rval is the unshifted exponent bits of a float16
1115 * \param m_rval is the unshifted mantissa bits of a float16
1116 * \return a uint rvalue that encodes a float32
1117 */
1118 ir_rvalue*
1119 unpack_half_1x16_nosign(ir_rvalue *e_rval, ir_rvalue *m_rval)
1120 {
1121 assert(e_rval->type == &glsl_type_builtin_uint);
1122 assert(m_rval->type == &glsl_type_builtin_uint);
1123
1124 /* uint u32; */
1125 ir_variable *u32 = factory.make_temp(&glsl_type_builtin_uint,
1126 "tmp_unpack_half_1x16_u32");
1127
1128 /* uint e = E_RVAL; */
1129 ir_variable *e = factory.make_temp(&glsl_type_builtin_uint,
1130 "tmp_unpack_half_1x16_e");
1131 factory.emit(assign(e, e_rval));
1132
1133 /* uint m = M_RVAL; */
1134 ir_variable *m = factory.make_temp(&glsl_type_builtin_uint,
1135 "tmp_unpack_half_1x16_m");
1136 factory.emit(assign(m, m_rval));
1137
1138 /* Preliminaries
1139 * -------------
1140 *
1141 * For a float16, the bit layout is:
1142 *
1143 * sign: 15
1144 * exponent: 10:14
1145 * mantissa: 0:9
1146 *
1147 * Let f16 be a float16 value. The sign, exponent, and mantissa
1148 * determine its value thus:
1149 *
1150 * if e16 = 0 and m16 = 0, then zero: (-1)^s16 * 0 (1)
1151 * if e16 = 0 and m16!= 0, then subnormal: (-1)^s16 * 2^(e16 - 14) * (m16 / 2^10) (2)
1152 * if 0 < e16 < 31, then normal: (-1)^s16 * 2^(e16 - 15) * (1 + m16 / 2^10) (3)
1153 * if e16 = 31 and m16 = 0, then infinite: (-1)^s16 * inf (4)
1154 * if e16 = 31 and m16 != 0, then NaN (5)
1155 *
1156 * where 0 <= m16 < 2^10.
1157 *
1158 * For a float32, the bit layout is:
1159 *
1160 * sign: 31
1161 * exponent: 23:30
1162 * mantissa: 0:22
1163 *
1164 * Let f32 be a float32 value. The sign, exponent, and mantissa
1165 * determine its value thus:
1166 *
1167 * if e32 = 0 and m32 = 0, then zero: (-1)^s * 0 (10)
1168 * if e32 = 0 and m32 != 0, then subnormal: (-1)^s * 2^(e32 - 126) * (m32 / 2^23) (11)
1169 * if 0 < e32 < 255, then normal: (-1)^s * 2^(e32 - 127) * (1 + m32 / 2^23) (12)
1170 * if e32 = 255 and m32 = 0, then infinite: (-1)^s * inf (13)
1171 * if e32 = 255 and m32 != 0, then NaN (14)
1172 *
1173 * where 0 <= m32 < 2^23.
1174 *
1175 * Calculation
1176 * -----------
1177 * Our task is to compute s32, e32, m32 given f16. Since this function
1178 * ignores the sign bit, assume that s32 = s16 = 0. There are several
1179 * cases to consider.
1180 */
1181
1182 factory.emit(
1183
1184 /* Case 1) f16 is zero or subnormal.
1185 *
1186 * The simplest method of calculating f32 in this case is
1187 *
1188 * f32 = f16 (20)
1189 * = 2^(-14) * (m16 / 2^10) (21)
1190 * = m16 / 2^24 (22)
1191 */
1192
1193 /* if (e16 == 0) { */
1194 if_tree(equal(e, constant(0u)),
1195
1196 /* u32 = bitcast_f2u(float(m) / float(1 << 24)); */
1197 assign(u32, expr(ir_unop_bitcast_f2u,
1198 div(u2f(m), constant((float)(1 << 24))))),
1199
1200 /* Case 2) f16 is normal.
1201 *
1202 * The equation
1203 *
1204 * f32 = f16 (30)
1205 * 2^(e32 - 127) * (1 + m32 / 2^23) = (31)
1206 * 2^(e16 - 15) * (1 + m16 / 2^10)
1207 *
1208 * can be decomposed into two
1209 *
1210 * 2^(e32 - 127) = 2^(e16 - 15) (32)
1211 * 1 + m32 / 2^23 = 1 + m16 / 2^10 (33)
1212 *
1213 * which solve to
1214 *
1215 * e32 = e16 + 112 (34)
1216 * m32 = m16 * 2^13 (35)
1217 */
1218
1219 /* } else if (e16 < 31)) { */
1220 if_tree(less(e, constant(31u << 10u)),
1221
1222 /* u32 = ((e + (112 << 10)) | m) << 13;
1223 */
1224 assign(u32, lshift(bit_or(add(e, constant(112u << 10u)), m),
1225 constant(13u))),
1226
1227
1228 /* Case 3) f16 is infinite. */
1229 if_tree(equal(m, constant(0u)),
1230
1231 assign(u32, constant(255u << 23u)),
1232
1233 /* Case 4) f16 is NaN. */
1234 /* } else { */
1235
1236 assign(u32, constant(0x7fffffffu))))));
1237
1238 /* } */
1239
1240 return deref(u32).val;
1241 }
1242
1243 /**
1244 * \brief Lower an unpackHalf2x16 expression.
1245 *
1246 * \param uint_rval is unpackHalf2x16's input
1247 * \return unpackHalf2x16's output as a vec2 rvalue
1248 */
1249 ir_rvalue*
1250 lower_unpack_half_2x16(ir_rvalue *uint_rval)
1251 {
1252 /* From page 89 (95 of pdf) of the GLSL ES 3.00 spec:
1253 *
1254 * mediump vec2 unpackHalf2x16 (highp uint v)
1255 * ------------------------------------------
1256 * Returns a two-component floating-point vector with components
1257 * obtained by unpacking a 32-bit unsigned integer into a pair of 16-bit
1258 * values, interpreting those values as 16-bit floating-point numbers
1259 * according to the OpenGL ES Specification, and converting them to
1260 * 32-bit floating-point values.
1261 *
1262 * The first component of the vector is obtained from the
1263 * 16 least-significant bits of v; the second component is obtained
1264 * from the 16 most-significant bits of v.
1265 */
1266 assert(uint_rval->type == &glsl_type_builtin_uint);
1267
1268 /* uint u = RVALUE;
1269 * uvec2 f16 = uvec2(u & 0xffff, u >> 16);
1270 */
1271 ir_variable *f16 = factory.make_temp(&glsl_type_builtin_uvec2,
1272 "tmp_unpack_half_2x16_f16");
1273 factory.emit(assign(f16, unpack_uint_to_uvec2(uint_rval)));
1274
1275 /* uvec2 f32; */
1276 ir_variable *f32 = factory.make_temp(&glsl_type_builtin_uvec2,
1277 "tmp_unpack_half_2x16_f32");
1278
1279 /* Get f16's unshifted exponent bits.
1280 *
1281 * uvec2 e = f16 & 0x7c00u;
1282 */
1283 ir_variable *e = factory.make_temp(&glsl_type_builtin_uvec2,
1284 "tmp_unpack_half_2x16_e");
1285 factory.emit(assign(e, bit_and(f16, constant(0x7c00u))));
1286
1287 /* Get f16's unshifted mantissa bits.
1288 *
1289 * uvec2 m = f16 & 0x03ffu;
1290 */
1291 ir_variable *m = factory.make_temp(&glsl_type_builtin_uvec2,
1292 "tmp_unpack_half_2x16_m");
1293 factory.emit(assign(m, bit_and(f16, constant(0x03ffu))));
1294
1295 /* Set f32's exponent and mantissa bits.
1296 *
1297 * f32.x = unpack_half_1x16_nosign(e.x, m.x);
1298 * f32.y = unpack_half_1x16_nosign(e.y, m.y);
1299 */
1300 factory.emit(assign(f32, unpack_half_1x16_nosign(swizzle_x(e),
1301 swizzle_x(m)),
1302 WRITEMASK_X));
1303 factory.emit(assign(f32, unpack_half_1x16_nosign(swizzle_y(e),
1304 swizzle_y(m)),
1305 WRITEMASK_Y));
1306
1307 /* Set f32's sign bit.
1308 *
1309 * f32 |= (f16 & 0x8000u) << 16u;
1310 */
1311 factory.emit(assign(f32, bit_or(f32,
1312 lshift(bit_and(f16,
1313 constant(0x8000u)),
1314 constant(16u)))));
1315
1316 /* return bitcast_u2f(f32); */
1317 ir_rvalue *result = expr(ir_unop_bitcast_u2f, f32);
1318 assert(result->type == &glsl_type_builtin_vec2);
1319 return result;
1320 }
1321 };
1322
1323 } // namespace anonymous
1324
1325 /**
1326 * \brief Lower the builtin packing functions.
1327 */
1328 bool
1329 lower_packing_builtins(exec_list *instructions,
1330 bool has_shading_language_packing,
1331 bool has_gpu_shader5,
1332 bool has_half_float_packing)
1333 {
1334 if (!has_shading_language_packing)
1335 return false;
1336
1337 int op_mask = LOWER_PACK_SNORM_2x16 |
1338 LOWER_UNPACK_SNORM_2x16 |
1339 LOWER_PACK_UNORM_2x16 |
1340 LOWER_UNPACK_UNORM_2x16 |
1341 LOWER_PACK_SNORM_4x8 |
1342 LOWER_UNPACK_SNORM_4x8 |
1343 LOWER_UNPACK_UNORM_4x8 |
1344 LOWER_PACK_UNORM_4x8;
1345
1346 if (has_gpu_shader5)
1347 op_mask |= LOWER_PACK_USE_BFI | LOWER_PACK_USE_BFE;
1348
1349 if (!has_half_float_packing)
1350 op_mask |= LOWER_PACK_HALF_2x16 | LOWER_UNPACK_HALF_2x16;
1351
1352 lower_packing_builtins_visitor v(op_mask);
1353 visit_list_elements(&v, instructions, true);
1354 return v.get_progress();
1355 }
1356