1 /*
2 * Copyright © 2012 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_eu_compact.c
25 *
26 * Instruction compaction is a feature of gm45 and newer hardware that allows
27 * for a smaller instruction encoding.
28 *
29 * The instruction cache is on the order of 32KB, and many programs generate
30 * far more instructions than that. The instruction cache is built to barely
31 * keep up with instruction dispatch ability in cache hit cases -- L1
32 * instruction cache misses that still hit in the next level could limit
33 * throughput by around 50%.
34 *
35 * The idea of instruction compaction is that most instructions use a tiny
36 * subset of the GPU functionality, so we can encode what would be a 16 byte
37 * instruction in 8 bytes using some lookup tables for various fields.
38 */
39
40 #include <string.h>
41
42 #include "brw_compat.h"
43 #include "brw_context.h"
44 #include "brw_eu.h"
45
/* Gen6 control-index lookup table.
 *
 * The control_index stored in a compacted instruction selects one of these
 * 32 patterns.  As assembled by set_control_index(), bits 15:0 of an entry
 * mirror bits 23:8 of instruction dword 0, and bit 16 mirrors bit 31 of
 * dword 0.  The array position of each entry is the encoded index, so the
 * order must not change.
 */
static const uint32_t gen6_control_index_table[32] = {
   [0]  = 0b00000000000000000,
   [1]  = 0b01000000000000000,
   [2]  = 0b00110000000000000,
   [3]  = 0b00000000100000000,
   [4]  = 0b00010000000000000,
   [5]  = 0b00001000100000000,
   [6]  = 0b00000000100000010,
   [7]  = 0b00000000000000010,
   [8]  = 0b01000000100000000,
   [9]  = 0b01010000000000000,
   [10] = 0b10110000000000000,
   [11] = 0b00100000000000000,
   [12] = 0b11010000000000000,
   [13] = 0b11000000000000000,
   [14] = 0b01001000100000000,
   [15] = 0b01000000000001000,
   [16] = 0b01000000000000100,
   [17] = 0b00000000000001000,
   [18] = 0b00000000000000100,
   [19] = 0b00111000100000000,
   [20] = 0b00001000100000010,
   [21] = 0b00110000100000000,
   [22] = 0b00110000000000001,
   [23] = 0b00100000000000001,
   [24] = 0b00110000000000010,
   [25] = 0b00110000000000101,
   [26] = 0b00110000000001001,
   [27] = 0b00110000000010000,
   [28] = 0b00110000000000011,
   [29] = 0b00110000000000100,
   [30] = 0b00110000100001000,
   [31] = 0b00100000000001001,
};
80
/* Gen6 datatype-index lookup table.
 *
 * As assembled by set_datatype_index(), bits 14:0 of an entry mirror bits
 * 14:0 of the instruction's bits1 dword, and bits 17:15 mirror bits 31:29
 * of that dword.  The array position of each entry is the encoded index, so
 * the order must not change.  Entries 23 and 25 hold the same pattern; the
 * lookup loop always returns the first match.
 */
static const uint32_t gen6_datatype_table[32] = {
   [0]  = 0b001001110000000000,
   [1]  = 0b001000110000100000,
   [2]  = 0b001001110000000001,
   [3]  = 0b001000000001100000,
   [4]  = 0b001010110100101001,
   [5]  = 0b001000000110101101,
   [6]  = 0b001100011000101100,
   [7]  = 0b001011110110101101,
   [8]  = 0b001000000111101100,
   [9]  = 0b001000000001100001,
   [10] = 0b001000110010100101,
   [11] = 0b001000000001000001,
   [12] = 0b001000001000110001,
   [13] = 0b001000001000101001,
   [14] = 0b001000000000100000,
   [15] = 0b001000001000110010,
   [16] = 0b001010010100101001,
   [17] = 0b001011010010100101,
   [18] = 0b001000000110100101,
   [19] = 0b001100011000101001,
   [20] = 0b001011011000101100,
   [21] = 0b001011010110100101,
   [22] = 0b001011110110100101,
   [23] = 0b001111011110111101,
   [24] = 0b001111011110111100,
   [25] = 0b001111011110111101,
   [26] = 0b001111011110011101,
   [27] = 0b001111011110111110,
   [28] = 0b001000000000100001,
   [29] = 0b001000000000100010,
   [30] = 0b001001111111011101,
   [31] = 0b001000001110111110,
};
115
/* Gen6 subregister-index lookup table.
 *
 * As assembled by set_subreg_index(), bits 4:0 of an entry hold the
 * destination subregister number, bits 9:5 the src0 subregister number and
 * bits 14:10 the src1 subregister number.  The array position of each entry
 * is the encoded index, so the order must not change.
 */
static const uint32_t gen6_subreg_table[32] = {
   [0]  = 0b000000000000000,
   [1]  = 0b000000000000100,
   [2]  = 0b000000110000000,
   [3]  = 0b111000000000000,
   [4]  = 0b011110000001000,
   [5]  = 0b000010000000000,
   [6]  = 0b000000000010000,
   [7]  = 0b000110000001100,
   [8]  = 0b001000000000000,
   [9]  = 0b000001000000000,
   [10] = 0b000001010010100,
   [11] = 0b000000001010110,
   [12] = 0b010000000000000,
   [13] = 0b110000000000000,
   [14] = 0b000100000000000,
   [15] = 0b000000010000000,
   [16] = 0b000000000001000,
   [17] = 0b100000000000000,
   [18] = 0b000001010000000,
   [19] = 0b001010000000000,
   [20] = 0b001100000000000,
   [21] = 0b000000001010100,
   [22] = 0b101101010010100,
   [23] = 0b010100000000000,
   [24] = 0b000000010001111,
   [25] = 0b011000000000000,
   [26] = 0b111110000000000,
   [27] = 0b101000000000000,
   [28] = 0b000000000001111,
   [29] = 0b000100010001111,
   [30] = 0b001000010001111,
   [31] = 0b000110000000000,
};
150
/* Gen6 source-index lookup table, shared by src0 and src1.
 *
 * As assembled by set_src0_index() and set_src1_index(), each 12-bit entry
 * mirrors bits 24:13 of the instruction's bits2 (src0) or bits3 (src1)
 * dword.  The array position of each entry is the encoded index, so the
 * order must not change.
 */
static const uint32_t gen6_src_index_table[32] = {
   [0]  = 0b000000000000,
   [1]  = 0b010110001000,
   [2]  = 0b010001101000,
   [3]  = 0b001000101000,
   [4]  = 0b011010010000,
   [5]  = 0b000100100000,
   [6]  = 0b010001101100,
   [7]  = 0b010101110000,
   [8]  = 0b011001111000,
   [9]  = 0b001100101000,
   [10] = 0b010110001100,
   [11] = 0b001000100000,
   [12] = 0b010110001010,
   [13] = 0b000000000010,
   [14] = 0b010101010000,
   [15] = 0b010101101000,
   [16] = 0b111101001100,
   [17] = 0b111100101100,
   [18] = 0b011001110000,
   [19] = 0b010110001001,
   [20] = 0b010101011000,
   [21] = 0b001101001000,
   [22] = 0b010000101100,
   [23] = 0b010000000000,
   [24] = 0b001101110000,
   [25] = 0b001100010000,
   [26] = 0b001100000000,
   [27] = 0b010001101010,
   [28] = 0b001101111000,
   [29] = 0b000001110000,
   [30] = 0b001100100000,
   [31] = 0b001101010000,
};
185
/* Gen7 control-index lookup table.
 *
 * Same layout as the gen6 table in bits 16:0 (bits 15:0 mirror bits 23:8 of
 * instruction dword 0, bit 16 mirrors bit 31 of dword 0).  On gen7,
 * set_control_index() additionally folds bits 26:25 of dword 2 (the flag
 * register bits) into bits 18:17.  The array position of each entry is the
 * encoded index, so the order must not change.
 */
static const uint32_t gen7_control_index_table[32] = {
   [0]  = 0b0000000000000000010,
   [1]  = 0b0000100000000000000,
   [2]  = 0b0000100000000000001,
   [3]  = 0b0000100000000000010,
   [4]  = 0b0000100000000000011,
   [5]  = 0b0000100000000000100,
   [6]  = 0b0000100000000000101,
   [7]  = 0b0000100000000000111,
   [8]  = 0b0000100000000001000,
   [9]  = 0b0000100000000001001,
   [10] = 0b0000100000000001101,
   [11] = 0b0000110000000000000,
   [12] = 0b0000110000000000001,
   [13] = 0b0000110000000000010,
   [14] = 0b0000110000000000011,
   [15] = 0b0000110000000000100,
   [16] = 0b0000110000000000101,
   [17] = 0b0000110000000000111,
   [18] = 0b0000110000000001001,
   [19] = 0b0000110000000001101,
   [20] = 0b0000110000000010000,
   [21] = 0b0000110000100000000,
   [22] = 0b0001000000000000000,
   [23] = 0b0001000000000000010,
   [24] = 0b0001000000000000100,
   [25] = 0b0001000000100000000,
   [26] = 0b0010110000000000000,
   [27] = 0b0010110000000010000,
   [28] = 0b0011000000000000000,
   [29] = 0b0011000000100000000,
   [30] = 0b0101000000000000000,
   [31] = 0b0101000000100000000,
};
220
/* Gen7 datatype-index lookup table.
 *
 * As assembled by set_datatype_index(), bits 14:0 of an entry mirror bits
 * 14:0 of the instruction's bits1 dword, and bits 17:15 mirror bits 31:29
 * of that dword.  The array position of each entry is the encoded index, so
 * the order must not change.
 */
static const uint32_t gen7_datatype_table[32] = {
   [0]  = 0b001000000000000001,
   [1]  = 0b001000000000100000,
   [2]  = 0b001000000000100001,
   [3]  = 0b001000000001100001,
   [4]  = 0b001000000010111101,
   [5]  = 0b001000001011111101,
   [6]  = 0b001000001110100001,
   [7]  = 0b001000001110100101,
   [8]  = 0b001000001110111101,
   [9]  = 0b001000010000100001,
   [10] = 0b001000110000100000,
   [11] = 0b001000110000100001,
   [12] = 0b001001010010100101,
   [13] = 0b001001110010100100,
   [14] = 0b001001110010100101,
   [15] = 0b001111001110111101,
   [16] = 0b001111011110011101,
   [17] = 0b001111011110111100,
   [18] = 0b001111011110111101,
   [19] = 0b001111111110111100,
   [20] = 0b000000001000001100,
   [21] = 0b001000000000111101,
   [22] = 0b001000000010100101,
   [23] = 0b001000010000100000,
   [24] = 0b001001010010100100,
   [25] = 0b001001110010000100,
   [26] = 0b001010010100001001,
   [27] = 0b001101111110111101,
   [28] = 0b001111111110111101,
   [29] = 0b001011110110101100,
   [30] = 0b001010010100101000,
   [31] = 0b001010110100101000,
};
255
/* Gen7 subregister-index lookup table.
 *
 * As assembled by set_subreg_index(), bits 4:0 of an entry hold the
 * destination subregister number, bits 9:5 the src0 subregister number and
 * bits 14:10 the src1 subregister number.  The array position of each entry
 * is the encoded index, so the order must not change.
 */
static const uint32_t gen7_subreg_table[32] = {
   [0]  = 0b000000000000000,
   [1]  = 0b000000000000001,
   [2]  = 0b000000000001000,
   [3]  = 0b000000000001111,
   [4]  = 0b000000000010000,
   [5]  = 0b000000010000000,
   [6]  = 0b000000100000000,
   [7]  = 0b000000110000000,
   [8]  = 0b000001000000000,
   [9]  = 0b000001000010000,
   [10] = 0b000010100000000,
   [11] = 0b001000000000000,
   [12] = 0b001000000000001,
   [13] = 0b001000010000001,
   [14] = 0b001000010000010,
   [15] = 0b001000010000011,
   [16] = 0b001000010000100,
   [17] = 0b001000010000111,
   [18] = 0b001000010001000,
   [19] = 0b001000010001110,
   [20] = 0b001000010001111,
   [21] = 0b001000110000000,
   [22] = 0b001000111101000,
   [23] = 0b010000000000000,
   [24] = 0b010000110000000,
   [25] = 0b011000000000000,
   [26] = 0b011110010000111,
   [27] = 0b100000000000000,
   [28] = 0b101000000000000,
   [29] = 0b110000000000000,
   [30] = 0b111000000000000,
   [31] = 0b111000000011100,
};
290
/* Gen7 source-index lookup table, shared by src0 and src1.
 *
 * As assembled by set_src0_index() and set_src1_index(), each 12-bit entry
 * mirrors bits 24:13 of the instruction's bits2 (src0) or bits3 (src1)
 * dword.  The array position of each entry is the encoded index, so the
 * order must not change.
 */
static const uint32_t gen7_src_index_table[32] = {
   [0]  = 0b000000000000,
   [1]  = 0b000000000010,
   [2]  = 0b000000010000,
   [3]  = 0b000000010010,
   [4]  = 0b000000011000,
   [5]  = 0b000000100000,
   [6]  = 0b000000101000,
   [7]  = 0b000001001000,
   [8]  = 0b000001010000,
   [9]  = 0b000001110000,
   [10] = 0b000001111000,
   [11] = 0b001100000000,
   [12] = 0b001100000010,
   [13] = 0b001100001000,
   [14] = 0b001100010000,
   [15] = 0b001100010010,
   [16] = 0b001100100000,
   [17] = 0b001100101000,
   [18] = 0b001100111000,
   [19] = 0b001101000000,
   [20] = 0b001101000010,
   [21] = 0b001101001000,
   [22] = 0b001101010000,
   [23] = 0b001101100000,
   [24] = 0b001101101000,
   [25] = 0b001101110000,
   [26] = 0b001101110001,
   [27] = 0b001101111000,
   [28] = 0b010001101000,
   [29] = 0b010001101001,
   [30] = 0b010001101010,
   [31] = 0b010110001000,
};
325
/* Lookup tables currently in effect.  They start out unset and are pointed
 * at the gen6 or gen7 tables by brw_init_compaction_tables(); every
 * compaction and uncompaction helper in this file reads through them.
 */
static const uint32_t *control_index_table = NULL;
static const uint32_t *datatype_table = NULL;
static const uint32_t *subreg_table = NULL;
static const uint32_t *src_index_table = NULL;
330
331 static bool
set_control_index(struct intel_context * intel,struct brw_compact_instruction * dst,struct brw_instruction * src)332 set_control_index(struct intel_context *intel,
333 struct brw_compact_instruction *dst,
334 struct brw_instruction *src)
335 {
336 uint32_t *src_u32 = (uint32_t *)src;
337 uint32_t uncompacted = 0;
338
339 uncompacted |= ((src_u32[0] >> 8) & 0xffff) << 0;
340 uncompacted |= ((src_u32[0] >> 31) & 0x1) << 16;
341 /* On gen7, the flag register number gets integrated into the control
342 * index.
343 */
344 if (intel->gen >= 7)
345 uncompacted |= ((src_u32[2] >> 25) & 0x3) << 17;
346
347 for (int i = 0; i < 32; i++) {
348 if (control_index_table[i] == uncompacted) {
349 dst->dw0.control_index = i;
350 return true;
351 }
352 }
353
354 return false;
355 }
356
357 static bool
set_datatype_index(struct brw_compact_instruction * dst,struct brw_instruction * src)358 set_datatype_index(struct brw_compact_instruction *dst,
359 struct brw_instruction *src)
360 {
361 uint32_t uncompacted = 0;
362
363 uncompacted |= src->bits1.ud & 0x7fff;
364 uncompacted |= (src->bits1.ud >> 29) << 15;
365
366 for (int i = 0; i < 32; i++) {
367 if (datatype_table[i] == uncompacted) {
368 dst->dw0.data_type_index = i;
369 return true;
370 }
371 }
372
373 return false;
374 }
375
376 static bool
set_subreg_index(struct brw_compact_instruction * dst,struct brw_instruction * src)377 set_subreg_index(struct brw_compact_instruction *dst,
378 struct brw_instruction *src)
379 {
380 uint32_t uncompacted = 0;
381
382 uncompacted |= src->bits1.da1.dest_subreg_nr << 0;
383 uncompacted |= src->bits2.da1.src0_subreg_nr << 5;
384 uncompacted |= src->bits3.da1.src1_subreg_nr << 10;
385
386 for (int i = 0; i < 32; i++) {
387 if (subreg_table[i] == uncompacted) {
388 dst->dw0.sub_reg_index = i;
389 return true;
390 }
391 }
392
393 return false;
394 }
395
396 static bool
get_src_index(uint32_t uncompacted,uint32_t * compacted)397 get_src_index(uint32_t uncompacted,
398 uint32_t *compacted)
399 {
400 for (int i = 0; i < 32; i++) {
401 if (src_index_table[i] == uncompacted) {
402 *compacted = i;
403 return true;
404 }
405 }
406
407 return false;
408 }
409
410 static bool
set_src0_index(struct brw_compact_instruction * dst,struct brw_instruction * src)411 set_src0_index(struct brw_compact_instruction *dst,
412 struct brw_instruction *src)
413 {
414 uint32_t compacted, uncompacted = 0;
415
416 uncompacted |= (src->bits2.ud >> 13) & 0xfff;
417
418 if (!get_src_index(uncompacted, &compacted))
419 return false;
420
421 dst->dw0.src0_index = compacted & 0x3;
422 dst->dw1.src0_index = compacted >> 2;
423
424 return true;
425 }
426
427 static bool
set_src1_index(struct brw_compact_instruction * dst,struct brw_instruction * src)428 set_src1_index(struct brw_compact_instruction *dst,
429 struct brw_instruction *src)
430 {
431 uint32_t compacted, uncompacted = 0;
432
433 uncompacted |= (src->bits3.ud >> 13) & 0xfff;
434
435 if (!get_src_index(uncompacted, &compacted))
436 return false;
437
438 dst->dw1.src1_index = compacted;
439
440 return true;
441 }
442
443 /**
444 * Tries to compact instruction src into dst.
445 *
446 * It doesn't modify dst unless src is compactable, which is relied on by
447 * brw_compact_instructions().
448 */
449 bool
brw_try_compact_instruction(struct brw_compile * p,struct brw_compact_instruction * dst,struct brw_instruction * src)450 brw_try_compact_instruction(struct brw_compile *p,
451 struct brw_compact_instruction *dst,
452 struct brw_instruction *src)
453 {
454 struct brw_context *brw = p->brw;
455 struct intel_context *intel = &brw->intel;
456 struct brw_compact_instruction temp;
457
458 if (src->header.opcode == BRW_OPCODE_IF ||
459 src->header.opcode == BRW_OPCODE_ELSE ||
460 src->header.opcode == BRW_OPCODE_ENDIF ||
461 src->header.opcode == BRW_OPCODE_HALT ||
462 src->header.opcode == BRW_OPCODE_DO ||
463 src->header.opcode == BRW_OPCODE_WHILE) {
464 /* FINISHME: The fixup code below, and brw_set_uip_jip and friends, needs
465 * to be able to handle compacted flow control instructions..
466 */
467 return false;
468 }
469
470 /* FINISHME: immediates */
471 if (src->bits1.da1.src0_reg_file == BRW_IMMEDIATE_VALUE ||
472 src->bits1.da1.src1_reg_file == BRW_IMMEDIATE_VALUE)
473 return false;
474
475 memset(&temp, 0, sizeof(temp));
476
477 temp.dw0.opcode = src->header.opcode;
478 temp.dw0.debug_control = src->header.debug_control;
479 if (!set_control_index(intel, &temp, src))
480 return false;
481 if (!set_datatype_index(&temp, src))
482 return false;
483 if (!set_subreg_index(&temp, src))
484 return false;
485 temp.dw0.acc_wr_control = src->header.acc_wr_control;
486 temp.dw0.conditionalmod = src->header.destreg__conditionalmod;
487 if (intel->gen <= 6)
488 temp.dw0.flag_subreg_nr = src->bits2.da1.flag_subreg_nr;
489 temp.dw0.cmpt_ctrl = 1;
490 if (!set_src0_index(&temp, src))
491 return false;
492 if (!set_src1_index(&temp, src))
493 return false;
494 temp.dw1.dst_reg_nr = src->bits1.da1.dest_reg_nr;
495 temp.dw1.src0_reg_nr = src->bits2.da1.src0_reg_nr;
496 temp.dw1.src1_reg_nr = src->bits3.da1.src1_reg_nr;
497
498 *dst = temp;
499
500 return true;
501 }
502
503 static void
set_uncompacted_control(struct intel_context * intel,struct brw_instruction * dst,struct brw_compact_instruction * src)504 set_uncompacted_control(struct intel_context *intel,
505 struct brw_instruction *dst,
506 struct brw_compact_instruction *src)
507 {
508 uint32_t *dst_u32 = (uint32_t *)dst;
509 uint32_t uncompacted = control_index_table[src->dw0.control_index];
510
511 dst_u32[0] |= ((uncompacted >> 0) & 0xffff) << 8;
512 dst_u32[0] |= ((uncompacted >> 16) & 0x1) << 31;
513
514 if (intel->gen >= 7)
515 dst_u32[2] |= ((uncompacted >> 17) & 0x3) << 25;
516 }
517
518 static void
set_uncompacted_datatype(struct brw_instruction * dst,struct brw_compact_instruction * src)519 set_uncompacted_datatype(struct brw_instruction *dst,
520 struct brw_compact_instruction *src)
521 {
522 uint32_t uncompacted = datatype_table[src->dw0.data_type_index];
523
524 dst->bits1.ud &= ~(0x7 << 29);
525 dst->bits1.ud |= ((uncompacted >> 15) & 0x7) << 29;
526 dst->bits1.ud &= ~0x7fff;
527 dst->bits1.ud |= uncompacted & 0x7fff;
528 }
529
530 static void
set_uncompacted_subreg(struct brw_instruction * dst,struct brw_compact_instruction * src)531 set_uncompacted_subreg(struct brw_instruction *dst,
532 struct brw_compact_instruction *src)
533 {
534 uint32_t uncompacted = subreg_table[src->dw0.sub_reg_index];
535
536 dst->bits1.da1.dest_subreg_nr = (uncompacted >> 0) & 0x1f;
537 dst->bits2.da1.src0_subreg_nr = (uncompacted >> 5) & 0x1f;
538 dst->bits3.da1.src1_subreg_nr = (uncompacted >> 10) & 0x1f;
539 }
540
541 static void
set_uncompacted_src0(struct brw_instruction * dst,struct brw_compact_instruction * src)542 set_uncompacted_src0(struct brw_instruction *dst,
543 struct brw_compact_instruction *src)
544 {
545 uint32_t compacted = src->dw0.src0_index | src->dw1.src0_index << 2;
546 uint32_t uncompacted = src_index_table[compacted];
547
548 dst->bits2.ud |= uncompacted << 13;
549 }
550
551 static void
set_uncompacted_src1(struct brw_instruction * dst,struct brw_compact_instruction * src)552 set_uncompacted_src1(struct brw_instruction *dst,
553 struct brw_compact_instruction *src)
554 {
555 uint32_t uncompacted = src_index_table[src->dw1.src1_index];
556
557 dst->bits3.ud |= uncompacted << 13;
558 }
559
560 void
brw_uncompact_instruction(struct intel_context * intel,struct brw_instruction * dst,struct brw_compact_instruction * src)561 brw_uncompact_instruction(struct intel_context *intel,
562 struct brw_instruction *dst,
563 struct brw_compact_instruction *src)
564 {
565 memset(dst, 0, sizeof(*dst));
566
567 dst->header.opcode = src->dw0.opcode;
568 dst->header.debug_control = src->dw0.debug_control;
569
570 set_uncompacted_control(intel, dst, src);
571 set_uncompacted_datatype(dst, src);
572 set_uncompacted_subreg(dst, src);
573 dst->header.acc_wr_control = src->dw0.acc_wr_control;
574 dst->header.destreg__conditionalmod = src->dw0.conditionalmod;
575 if (intel->gen <= 6)
576 dst->bits2.da1.flag_subreg_nr = src->dw0.flag_subreg_nr;
577 set_uncompacted_src0(dst, src);
578 set_uncompacted_src1(dst, src);
579 dst->bits1.da1.dest_reg_nr = src->dw1.dst_reg_nr;
580 dst->bits2.da1.src0_reg_nr = src->dw1.src0_reg_nr;
581 dst->bits3.da1.src1_reg_nr = src->dw1.src1_reg_nr;
582 }
583
brw_debug_compact_uncompact(struct intel_context * intel,struct brw_instruction * orig,struct brw_instruction * uncompacted)584 void brw_debug_compact_uncompact(struct intel_context *intel,
585 struct brw_instruction *orig,
586 struct brw_instruction *uncompacted)
587 {
588 fprintf(stderr, "Instruction compact/uncompact changed (gen%d):\n",
589 intel->gen);
590
591 fprintf(stderr, " before: ");
592 brw_disasm(stderr, orig, intel->gen);
593
594 fprintf(stderr, " after: ");
595 brw_disasm(stderr, uncompacted, intel->gen);
596
597 uint32_t *before_bits = (uint32_t *)orig;
598 uint32_t *after_bits = (uint32_t *)uncompacted;
599 printf(" changed bits:\n");
600 for (int i = 0; i < 128; i++) {
601 uint32_t before = before_bits[i / 32] & (1 << (i & 31));
602 uint32_t after = after_bits[i / 32] & (1 << (i & 31));
603
604 if (before != after) {
605 printf(" bit %d, %s to %s\n", i,
606 before ? "set" : "unset",
607 after ? "set" : "unset");
608 }
609 }
610 }
611
/**
 * Returns how many more instructions had been compacted before
 * \p old_target_ip than before \p old_ip.
 *
 * Both arguments index \p compacted_counts.  The result is negative when
 * the count recorded for the target is smaller than the one for \p old_ip.
 */
static int
compacted_between(int old_ip, int old_target_ip, int *compacted_counts)
{
   return compacted_counts[old_target_ip] - compacted_counts[old_ip];
}
619
620 static void
update_uip_jip(struct brw_instruction * insn,int this_old_ip,int * compacted_counts)621 update_uip_jip(struct brw_instruction *insn, int this_old_ip,
622 int *compacted_counts)
623 {
624 int target_old_ip;
625
626 target_old_ip = this_old_ip + insn->bits3.break_cont.jip;
627 insn->bits3.break_cont.jip -= compacted_between(this_old_ip,
628 target_old_ip,
629 compacted_counts);
630
631 target_old_ip = this_old_ip + insn->bits3.break_cont.uip;
632 insn->bits3.break_cont.uip -= compacted_between(this_old_ip,
633 target_old_ip,
634 compacted_counts);
635 }
636
637 void
brw_init_compaction_tables(struct intel_context * intel)638 brw_init_compaction_tables(struct intel_context *intel)
639 {
640 assert(gen6_control_index_table[ARRAY_SIZE(gen6_control_index_table) - 1] != 0);
641 assert(gen6_datatype_table[ARRAY_SIZE(gen6_datatype_table) - 1] != 0);
642 assert(gen6_subreg_table[ARRAY_SIZE(gen6_subreg_table) - 1] != 0);
643 assert(gen6_src_index_table[ARRAY_SIZE(gen6_src_index_table) - 1] != 0);
644 assert(gen7_control_index_table[ARRAY_SIZE(gen6_control_index_table) - 1] != 0);
645 assert(gen7_datatype_table[ARRAY_SIZE(gen6_datatype_table) - 1] != 0);
646 assert(gen7_subreg_table[ARRAY_SIZE(gen6_subreg_table) - 1] != 0);
647 assert(gen7_src_index_table[ARRAY_SIZE(gen6_src_index_table) - 1] != 0);
648
649 switch (intel->gen) {
650 case 7:
651 control_index_table = gen7_control_index_table;
652 datatype_table = gen7_datatype_table;
653 subreg_table = gen7_subreg_table;
654 src_index_table = gen7_src_index_table;
655 break;
656 case 6:
657 control_index_table = gen6_control_index_table;
658 datatype_table = gen6_datatype_table;
659 subreg_table = gen6_subreg_table;
660 src_index_table = gen6_src_index_table;
661 break;
662 default:
663 return;
664 }
665 }
666
667 void
brw_compact_instructions(struct brw_compile * p)668 brw_compact_instructions(struct brw_compile *p)
669 {
670 struct brw_context *brw = p->brw;
671 struct intel_context *intel = &brw->intel;
672 void *store = p->store;
673 /* For an instruction at byte offset 8*i before compaction, this is the number
674 * of compacted instructions that preceded it.
675 */
676 int compacted_counts[p->next_insn_offset / 8];
677 /* For an instruction at byte offset 8*i after compaction, this is the
678 * 8-byte offset it was at before compaction.
679 */
680 int old_ip[p->next_insn_offset / 8];
681
682 if (intel->gen < 6)
683 return;
684
685 int src_offset;
686 int offset = 0;
687 int compacted_count = 0;
688 for (src_offset = 0; src_offset < p->nr_insn * 16;) {
689 struct brw_instruction *src = store + src_offset;
690 void *dst = store + offset;
691
692 old_ip[offset / 8] = src_offset / 8;
693 compacted_counts[src_offset / 8] = compacted_count;
694
695 struct brw_instruction saved = *src;
696
697 if (!src->header.cmpt_control &&
698 brw_try_compact_instruction(p, dst, src)) {
699 compacted_count++;
700
701 if (INTEL_DEBUG) {
702 struct brw_instruction uncompacted;
703 brw_uncompact_instruction(intel, &uncompacted, dst);
704 if (memcmp(&saved, &uncompacted, sizeof(uncompacted))) {
705 brw_debug_compact_uncompact(intel, &saved, &uncompacted);
706 }
707 }
708
709 offset += 8;
710 src_offset += 16;
711 } else {
712 int size = src->header.cmpt_control ? 8 : 16;
713
714 /* It appears that the end of thread SEND instruction needs to be
715 * aligned, or the GPU hangs.
716 */
717 if ((src->header.opcode == BRW_OPCODE_SEND ||
718 src->header.opcode == BRW_OPCODE_SENDC) &&
719 src->bits3.generic.end_of_thread &&
720 (offset & 8) != 0) {
721 struct brw_compact_instruction *align = store + offset;
722 memset(align, 0, sizeof(*align));
723 align->dw0.opcode = BRW_OPCODE_NOP;
724 align->dw0.cmpt_ctrl = 1;
725 offset += 8;
726 old_ip[offset / 8] = src_offset / 8;
727 dst = store + offset;
728 }
729
730 /* If we didn't compact this intruction, we need to move it down into
731 * place.
732 */
733 if (offset != src_offset) {
734 memmove(dst, src, size);
735 }
736 offset += size;
737 src_offset += size;
738 }
739 }
740
741 /* Fix up control flow offsets. */
742 p->next_insn_offset = offset;
743 for (offset = 0; offset < p->next_insn_offset;) {
744 struct brw_instruction *insn = store + offset;
745 int this_old_ip = old_ip[offset / 8];
746 int this_compacted_count = compacted_counts[this_old_ip];
747 int target_old_ip, target_compacted_count;
748
749 switch (insn->header.opcode) {
750 case BRW_OPCODE_BREAK:
751 case BRW_OPCODE_CONTINUE:
752 case BRW_OPCODE_HALT:
753 update_uip_jip(insn, this_old_ip, compacted_counts);
754 break;
755
756 case BRW_OPCODE_IF:
757 case BRW_OPCODE_ELSE:
758 case BRW_OPCODE_ENDIF:
759 case BRW_OPCODE_WHILE:
760 if (intel->gen == 6) {
761 target_old_ip = this_old_ip + insn->bits1.branch_gen6.jump_count;
762 target_compacted_count = compacted_counts[target_old_ip];
763 insn->bits1.branch_gen6.jump_count -= (target_compacted_count -
764 this_compacted_count);
765 } else {
766 update_uip_jip(insn, this_old_ip, compacted_counts);
767 }
768 break;
769 }
770
771 if (insn->header.cmpt_control) {
772 offset += 8;
773 } else {
774 offset += 16;
775 }
776 }
777
778 /* p->nr_insn is counting the number of uncompacted instructions still, so
779 * divide. We do want to be sure there's a valid instruction in any
780 * alignment padding, so that the next compression pass (for the FS 8/16
781 * compile passes) parses correctly.
782 */
783 if (p->next_insn_offset & 8) {
784 struct brw_compact_instruction *align = store + offset;
785 memset(align, 0, sizeof(*align));
786 align->dw0.opcode = BRW_OPCODE_NOP;
787 align->dw0.cmpt_ctrl = 1;
788 p->next_insn_offset += 8;
789 }
790 p->nr_insn = p->next_insn_offset / 16;
791
792 if (0) {
793 fprintf(stdout, "dumping compacted program\n");
794 brw_dump_compile(p, stdout, 0, p->next_insn_offset);
795
796 int cmp = 0;
797 for (offset = 0; offset < p->next_insn_offset;) {
798 struct brw_instruction *insn = store + offset;
799
800 if (insn->header.cmpt_control) {
801 offset += 8;
802 cmp++;
803 } else {
804 offset += 16;
805 }
806 }
807 fprintf(stderr, "%db/%db saved (%d%%)\n", cmp * 8, offset + cmp * 8,
808 cmp * 8 * 100 / (offset + cmp * 8));
809 }
810 }
811