1 /*
2 * Copyright © 2020 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "elk_eu.h"
25 #include "elk_fs.h"
26 #include "elk_vec4.h"
27 #include "elk_cfg.h"
28
29 using namespace elk;
30
31 namespace {
/**
 * Enumeration of the asynchronous hardware units that may execute work in
 * parallel on behalf of a shader thread.  Each instruction's runtime is
 * charged to exactly one of these (plus the always-involved EU front-end).
 */
enum intel_eu_unit {
   EU_UNIT_FE,        /**< EU front-end. */
   EU_UNIT_FPU,       /**< EU FPU0 (co-issue to FPU1 is currently not modeled here). */
   EU_UNIT_EM,        /**< Extended Math unit (AKA FPU1 on Gfx8-11, part of the EU on Gfx6+). */
   EU_UNIT_SAMPLER,   /**< Sampler shared function. */
   EU_UNIT_PI,        /**< Pixel Interpolator shared function. */
   EU_UNIT_URB,       /**< Unified Return Buffer shared function. */
   EU_UNIT_DP_DC,     /**< Data Port Data Cache shared function. */
   EU_UNIT_DP_RC,     /**< Data Port Render Cache shared function. */
   EU_UNIT_DP_CC,     /**< Data Port Constant Cache shared function. */
   EU_UNIT_GATEWAY,   /**< Message Gateway shared function. */
   EU_UNIT_SPAWNER,   /**< Thread Spawner shared function. */
   /* EU_UNIT_VME, */
   /* EU_UNIT_CRE, */
   EU_NUM_UNITS,      /**< Number of asynchronous units currently tracked. */
   /** Dummy unit for instructions that don't consume runtime from the above. */
   EU_UNIT_NULL = EU_NUM_UNITS
};
66
/**
 * Enumeration representing a computation result another computation can
 * potentially depend on.
 *
 * Each enumerator below is the base ID of a contiguous range of dependency
 * slots; the size of each range is visible in the offset added to compute
 * the next base (e.g. 24 MRF registers, 12 accumulators, 8 flag
 * subregisters, 32 SBID tokens per direction).
 */
enum intel_eu_dependency_id {
   /** Register part of the GRF.  One slot per GRF register. */
   EU_DEPENDENCY_ID_GRF0 = 0,
   /** Register part of the MRF.  Only used on Gfx4-6. */
   EU_DEPENDENCY_ID_MRF0 = EU_DEPENDENCY_ID_GRF0 + XE2_MAX_GRF,
   /** Address register part of the ARF (single slot). */
   EU_DEPENDENCY_ID_ADDR0 = EU_DEPENDENCY_ID_MRF0 + 24,
   /** Accumulator register part of the ARF. */
   EU_DEPENDENCY_ID_ACCUM0 = EU_DEPENDENCY_ID_ADDR0 + 1,
   /** Flag register part of the ARF. */
   EU_DEPENDENCY_ID_FLAG0 = EU_DEPENDENCY_ID_ACCUM0 + 12,
   /** SBID token write completion.  Only used on Gfx12+. */
   EU_DEPENDENCY_ID_SBID_WR0 = EU_DEPENDENCY_ID_FLAG0 + 8,
   /** SBID token read completion.  Only used on Gfx12+. */
   EU_DEPENDENCY_ID_SBID_RD0 = EU_DEPENDENCY_ID_SBID_WR0 + 32,
   /** Number of computation dependencies currently tracked. */
   EU_NUM_DEPENDENCY_IDS = EU_DEPENDENCY_ID_SBID_RD0 + 32
};
89
90 /**
91 * State of our modeling of the program execution.
92 */
93 struct state {
state__anoncc437f620111::state94 state() : unit_ready(), dep_ready(), unit_busy(), weight(1.0) {}
95 /**
96 * Time at which a given unit will be ready to execute the next
97 * computation, in clock units.
98 */
99 unsigned unit_ready[EU_NUM_UNITS];
100 /**
101 * Time at which an instruction dependent on a given dependency ID will
102 * be ready to execute, in clock units.
103 */
104 unsigned dep_ready[EU_NUM_DEPENDENCY_IDS];
105 /**
106 * Aggregated utilization of a given unit excluding idle cycles,
107 * in clock units.
108 */
109 float unit_busy[EU_NUM_UNITS];
110 /**
111 * Factor of the overhead of a computation accounted for in the
112 * aggregated utilization calculation.
113 */
114 float weight;
115 };
116
/**
 * Information derived from an IR instruction used to compute performance
 * estimates.  Allows the timing calculation to work on both FS and VEC4
 * instructions.
 *
 * All sizes below are expressed in GRF (register) units, obtained by
 * dividing byte sizes by REG_SIZE and rounding up.
 */
struct instruction_info {
   /** Gather timing-relevant facts from a scalar (FS) IR instruction. */
   instruction_info(const struct elk_isa_info *isa, const elk_fs_inst *inst) :
      isa(isa), devinfo(isa->devinfo), op(inst->opcode),
      td(inst->dst.type), sd(DIV_ROUND_UP(inst->size_written, REG_SIZE)),
      tx(get_exec_type(inst)), sx(0), ss(0),
      /* Charge a bank-conflict penalty proportional to the destination
       * size when the helper reports a conflict for this instruction.
       */
      sc(elk_has_bank_conflict(isa, inst) ? sd : 0),
      desc(inst->desc), sfid(inst->sfid)
   {
      /* Compute the maximum source size. */
      for (unsigned i = 0; i < inst->sources; i++)
         ss = MAX2(ss, DIV_ROUND_UP(inst->size_read(i), REG_SIZE));

      /* Convert the execution size to GRF units. */
      sx = DIV_ROUND_UP(inst->exec_size * type_sz(tx), REG_SIZE);

      /* 32x32 integer multiplication has half the usual ALU throughput.
       * Treat it as double-precision.
       */
      if ((inst->opcode == ELK_OPCODE_MUL || inst->opcode == ELK_OPCODE_MAD) &&
          !elk_reg_type_is_floating_point(tx) && type_sz(tx) == 4 &&
          type_sz(inst->src[0].type) == type_sz(inst->src[1].type))
         tx = elk_int_type(8, tx == ELK_REGISTER_TYPE_D);
   }

   /** Gather timing-relevant facts from a vec4 IR instruction. */
   instruction_info(const struct elk_isa_info *isa,
                    const vec4_instruction *inst) :
      isa(isa), devinfo(isa->devinfo), op(inst->opcode),
      td(inst->dst.type), sd(DIV_ROUND_UP(inst->size_written, REG_SIZE)),
      /* No bank-conflict modeling for vec4 (sc is always zero here). */
      tx(get_exec_type(inst)), sx(0), ss(0), sc(0),
      desc(inst->desc), sfid(inst->sfid)
   {
      /* Compute the maximum source size. */
      for (unsigned i = 0; i < ARRAY_SIZE(inst->src); i++)
         ss = MAX2(ss, DIV_ROUND_UP(inst->size_read(i), REG_SIZE));

      /* Convert the execution size to GRF units. */
      sx = DIV_ROUND_UP(inst->exec_size * type_sz(tx), REG_SIZE);

      /* 32x32 integer multiplication has half the usual ALU throughput.
       * Treat it as double-precision.
       */
      if ((inst->opcode == ELK_OPCODE_MUL || inst->opcode == ELK_OPCODE_MAD) &&
          !elk_reg_type_is_floating_point(tx) && type_sz(tx) == 4 &&
          type_sz(inst->src[0].type) == type_sz(inst->src[1].type))
         tx = elk_int_type(8, tx == ELK_REGISTER_TYPE_D);
   }

   /** ISA encoding information */
   const struct elk_isa_info *isa;
   /** Device information. */
   const struct intel_device_info *devinfo;
   /** Instruction opcode. */
   elk_opcode op;
   /** Destination type. */
   elk_reg_type td;
   /** Destination size in GRF units. */
   unsigned sd;
   /** Execution type. */
   elk_reg_type tx;
   /** Execution size in GRF units. */
   unsigned sx;
   /** Source size (maximum over all sources), in GRF units. */
   unsigned ss;
   /** Bank conflict penalty size in GRF units (equal to sd if non-zero). */
   unsigned sc;
   /** Send message descriptor. */
   uint32_t desc;
   /** Send message shared function ID. */
   uint8_t sfid;
};
191
192 /**
193 * Timing information of an instruction used to estimate the performance of
194 * the program.
195 */
196 struct perf_desc {
perf_desc__anoncc437f620111::perf_desc197 perf_desc(enum intel_eu_unit u, int df, int db,
198 int ls, int ld, int la, int lf) :
199 u(u), df(df), db(db), ls(ls), ld(ld), la(la), lf(lf) {}
200
201 /**
202 * Back-end unit its runtime shall be accounted to, in addition to the
203 * EU front-end which is always assumed to be involved.
204 */
205 enum intel_eu_unit u;
206 /**
207 * Overhead cycles from the time that the EU front-end starts executing
208 * the instruction until it's ready to execute the next instruction.
209 */
210 int df;
211 /**
212 * Overhead cycles from the time that the back-end starts executing the
213 * instruction until it's ready to execute the next instruction.
214 */
215 int db;
216 /**
217 * Latency cycles from the time that the back-end starts executing the
218 * instruction until its sources have been read from the register file.
219 */
220 int ls;
221 /**
222 * Latency cycles from the time that the back-end starts executing the
223 * instruction until its regular destination has been written to the
224 * register file.
225 */
226 int ld;
227 /**
228 * Latency cycles from the time that the back-end starts executing the
229 * instruction until its accumulator destination has been written to the
230 * ARF file.
231 *
232 * Note that this is an approximation of the real behavior of
233 * accumulating instructions in the hardware: Instead of modeling a pair
234 * of back-to-back accumulating instructions as a first computation with
235 * latency equal to ld followed by another computation with a
236 * mid-pipeline stall (e.g. after the "M" part of a MAC instruction), we
237 * model the stall as if it occurred at the top of the pipeline, with
238 * the latency of the accumulator computation offset accordingly.
239 */
240 int la;
241 /**
242 * Latency cycles from the time that the back-end starts executing the
243 * instruction until its flag destination has been written to the ARF
244 * file.
245 */
246 int lf;
247 };
248
249 /**
250 * Compute the timing information of an instruction based on any relevant
251 * information from the IR and a number of parameters specifying a linear
252 * approximation: Parameter X_Y specifies the derivative of timing X
253 * relative to info field Y, while X_1 specifies the independent term of
254 * the approximation of timing X.
255 */
256 perf_desc
calculate_desc(const instruction_info & info,enum intel_eu_unit u,int df_1,int df_sd,int df_sc,int db_1,int db_sx,int ls_1,int ld_1,int la_1,int lf_1,int l_ss,int l_sd)257 calculate_desc(const instruction_info &info, enum intel_eu_unit u,
258 int df_1, int df_sd, int df_sc,
259 int db_1, int db_sx,
260 int ls_1, int ld_1, int la_1, int lf_1,
261 int l_ss, int l_sd)
262 {
263 return perf_desc(u, df_1 + df_sd * int(info.sd) + df_sc * int(info.sc),
264 db_1 + db_sx * int(info.sx),
265 ls_1 + l_ss * int(info.ss),
266 ld_1 + l_ss * int(info.ss) + l_sd * int(info.sd),
267 la_1, lf_1);
268 }
269
270 /**
271 * Compute the timing information of an instruction based on any relevant
272 * information from the IR and a number of linear approximation parameters
273 * hard-coded for each IR instruction.
274 *
275 * Most timing parameters are obtained from the multivariate linear
276 * regression of a sample of empirical timings measured using the tm0
277 * register (as can be done today by using the shader_time debugging
278 * option). The Gfx4-5 math timings are obtained from BSpec Volume 5c.3
279 * "Shared Functions - Extended Math", Section 3.2 "Performance".
280 * Parameters marked XXX shall be considered low-quality, they're possibly
281 * high variance or completely guessed in cases where experimental data was
282 * unavailable.
283 */
284 const perf_desc
instruction_desc(const instruction_info & info)285 instruction_desc(const instruction_info &info)
286 {
287 const struct intel_device_info *devinfo = info.devinfo;
288
289 switch (info.op) {
290 case ELK_OPCODE_SEL:
291 case ELK_OPCODE_NOT:
292 case ELK_OPCODE_AND:
293 case ELK_OPCODE_OR:
294 case ELK_OPCODE_XOR:
295 case ELK_OPCODE_SHR:
296 case ELK_OPCODE_SHL:
297 case ELK_OPCODE_DIM:
298 case ELK_OPCODE_ASR:
299 case ELK_OPCODE_CMPN:
300 case ELK_OPCODE_F16TO32:
301 case ELK_OPCODE_BFREV:
302 case ELK_OPCODE_BFI1:
303 case ELK_OPCODE_AVG:
304 case ELK_OPCODE_FRC:
305 case ELK_OPCODE_RNDU:
306 case ELK_OPCODE_RNDD:
307 case ELK_OPCODE_RNDE:
308 case ELK_OPCODE_RNDZ:
309 case ELK_OPCODE_MAC:
310 case ELK_OPCODE_MACH:
311 case ELK_OPCODE_LZD:
312 case ELK_OPCODE_FBH:
313 case ELK_OPCODE_FBL:
314 case ELK_OPCODE_CBIT:
315 case ELK_OPCODE_ADDC:
316 case ELK_OPCODE_SUBB:
317 case ELK_OPCODE_SAD2:
318 case ELK_OPCODE_SADA2:
319 case ELK_OPCODE_LINE:
320 case ELK_OPCODE_NOP:
321 case ELK_SHADER_OPCODE_CLUSTER_BROADCAST:
322 case ELK_SHADER_OPCODE_SCRATCH_HEADER:
323 case ELK_FS_OPCODE_DDX_COARSE:
324 case ELK_FS_OPCODE_DDX_FINE:
325 case ELK_FS_OPCODE_DDY_COARSE:
326 case ELK_FS_OPCODE_PIXEL_X:
327 case ELK_FS_OPCODE_PIXEL_Y:
328 case ELK_FS_OPCODE_SET_SAMPLE_ID:
329 case ELK_VEC4_OPCODE_MOV_BYTES:
330 case ELK_VEC4_OPCODE_UNPACK_UNIFORM:
331 case ELK_VEC4_OPCODE_DOUBLE_TO_F32:
332 case ELK_VEC4_OPCODE_DOUBLE_TO_D32:
333 case ELK_VEC4_OPCODE_DOUBLE_TO_U32:
334 case ELK_VEC4_OPCODE_TO_DOUBLE:
335 case ELK_VEC4_OPCODE_PICK_LOW_32BIT:
336 case ELK_VEC4_OPCODE_PICK_HIGH_32BIT:
337 case ELK_VEC4_OPCODE_SET_LOW_32BIT:
338 case ELK_VEC4_OPCODE_SET_HIGH_32BIT:
339 case ELK_VEC4_OPCODE_ZERO_OOB_PUSH_REGS:
340 case ELK_GS_OPCODE_SET_DWORD_2:
341 case ELK_GS_OPCODE_SET_WRITE_OFFSET:
342 case ELK_GS_OPCODE_SET_VERTEX_COUNT:
343 case ELK_GS_OPCODE_PREPARE_CHANNEL_MASKS:
344 case ELK_GS_OPCODE_SET_CHANNEL_MASKS:
345 case ELK_GS_OPCODE_GET_INSTANCE_ID:
346 case ELK_GS_OPCODE_SET_PRIMITIVE_ID:
347 case ELK_GS_OPCODE_SVB_SET_DST_INDEX:
348 case ELK_TCS_OPCODE_SRC0_010_IS_ZERO:
349 case ELK_TCS_OPCODE_GET_PRIMITIVE_ID:
350 case ELK_TES_OPCODE_GET_PRIMITIVE_ID:
351 case ELK_SHADER_OPCODE_READ_SR_REG:
352 if (devinfo->ver >= 8) {
353 if (type_sz(info.tx) > 4)
354 return calculate_desc(info, EU_UNIT_FPU, 0, 4, 0, 0, 4,
355 0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
356 else
357 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
358 0, 8, 4, 12, 0, 0);
359 } else if (devinfo->verx10 >= 75) {
360 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
361 0, 10, 6 /* XXX */, 16, 0, 0);
362 } else {
363 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
364 0, 12, 8 /* XXX */, 18, 0, 0);
365 }
366
367 case ELK_OPCODE_MOV:
368 case ELK_OPCODE_CMP:
369 case ELK_OPCODE_ADD:
370 case ELK_OPCODE_MUL:
371 case ELK_SHADER_OPCODE_MOV_RELOC_IMM:
372 case ELK_VEC4_OPCODE_MOV_FOR_SCRATCH:
373 if (devinfo->ver >= 8) {
374 if (type_sz(info.tx) > 4)
375 return calculate_desc(info, EU_UNIT_FPU, 0, 4, 0, 0, 4,
376 0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
377 else
378 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
379 0, 8, 4, 12, 0, 0);
380 } else if (devinfo->verx10 >= 75) {
381 if (info.tx == ELK_REGISTER_TYPE_F)
382 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
383 0, 12, 8 /* XXX */, 18, 0, 0);
384 else
385 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
386 0, 10, 6 /* XXX */, 16, 0, 0);
387 } else if (devinfo->ver >= 7) {
388 if (info.tx == ELK_REGISTER_TYPE_F)
389 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
390 0, 14, 10 /* XXX */, 20, 0, 0);
391 else
392 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
393 0, 12, 8 /* XXX */, 18, 0, 0);
394 } else {
395 return calculate_desc(info, EU_UNIT_FPU, 0, 2 /* XXX */, 0,
396 0, 2 /* XXX */,
397 0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */,
398 0, 0);
399 }
400
401 case ELK_OPCODE_BFE:
402 case ELK_OPCODE_BFI2:
403 case ELK_OPCODE_CSEL:
404 if (devinfo->ver >= 8)
405 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
406 0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
407 else if (devinfo->verx10 >= 75)
408 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
409 0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
410 else if (devinfo->ver >= 7)
411 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
412 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
413 else
414 abort();
415
416 case ELK_OPCODE_MAD:
417 if (devinfo->ver >= 8) {
418 if (type_sz(info.tx) > 4)
419 return calculate_desc(info, EU_UNIT_FPU, 0, 4, 1, 0, 4,
420 0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
421 else
422 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
423 0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
424 } else if (devinfo->verx10 >= 75) {
425 if (info.tx == ELK_REGISTER_TYPE_F)
426 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
427 0, 12, 8 /* XXX */, 18, 0, 0);
428 else
429 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
430 0, 10, 6 /* XXX */, 16, 0, 0);
431 } else if (devinfo->ver >= 7) {
432 if (info.tx == ELK_REGISTER_TYPE_F)
433 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
434 0, 14, 10 /* XXX */, 20, 0, 0);
435 else
436 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
437 0, 12, 8 /* XXX */, 18, 0, 0);
438 } else if (devinfo->ver >= 6) {
439 return calculate_desc(info, EU_UNIT_FPU, 0, 2 /* XXX */, 1 /* XXX */,
440 0, 2 /* XXX */,
441 0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */,
442 0, 0);
443 } else {
444 abort();
445 }
446
447 case ELK_OPCODE_F32TO16:
448 if (devinfo->ver >= 8)
449 return calculate_desc(info, EU_UNIT_FPU, 0, 4, 0, 0, 4,
450 0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
451 else if (devinfo->verx10 >= 75)
452 return calculate_desc(info, EU_UNIT_FPU, 0, 4, 0, 0, 4,
453 0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
454 else if (devinfo->ver >= 7)
455 return calculate_desc(info, EU_UNIT_FPU, 0, 4, 0, 0, 4,
456 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
457 else
458 abort();
459
460 case ELK_OPCODE_DP4:
461 case ELK_OPCODE_DPH:
462 case ELK_OPCODE_DP3:
463 case ELK_OPCODE_DP2:
464 if (devinfo->ver >= 8)
465 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
466 0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
467 else if (devinfo->verx10 >= 75)
468 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
469 0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
470 else
471 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
472 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
473
474 case ELK_SHADER_OPCODE_RCP:
475 case ELK_SHADER_OPCODE_RSQ:
476 case ELK_SHADER_OPCODE_SQRT:
477 case ELK_SHADER_OPCODE_EXP2:
478 case ELK_SHADER_OPCODE_LOG2:
479 case ELK_SHADER_OPCODE_SIN:
480 case ELK_SHADER_OPCODE_COS:
481 case ELK_SHADER_OPCODE_POW:
482 case ELK_SHADER_OPCODE_INT_QUOTIENT:
483 case ELK_SHADER_OPCODE_INT_REMAINDER:
484 if (devinfo->ver >= 6) {
485 switch (info.op) {
486 case ELK_SHADER_OPCODE_RCP:
487 case ELK_SHADER_OPCODE_RSQ:
488 case ELK_SHADER_OPCODE_SQRT:
489 case ELK_SHADER_OPCODE_EXP2:
490 case ELK_SHADER_OPCODE_LOG2:
491 case ELK_SHADER_OPCODE_SIN:
492 case ELK_SHADER_OPCODE_COS:
493 if (devinfo->ver >= 8)
494 return calculate_desc(info, EU_UNIT_EM, -2, 4, 0, 0, 4,
495 0, 16, 0, 0, 0, 0);
496 else if (devinfo->verx10 >= 75)
497 return calculate_desc(info, EU_UNIT_EM, 0, 2, 0, 0, 2,
498 0, 12, 0, 0, 0, 0);
499 else
500 return calculate_desc(info, EU_UNIT_EM, 0, 2, 0, 0, 2,
501 0, 14, 0, 0, 0, 0);
502
503 case ELK_SHADER_OPCODE_POW:
504 if (devinfo->ver >= 8)
505 return calculate_desc(info, EU_UNIT_EM, -2, 4, 0, 0, 8,
506 0, 24, 0, 0, 0, 0);
507 else if (devinfo->verx10 >= 75)
508 return calculate_desc(info, EU_UNIT_EM, 0, 2, 0, 0, 4,
509 0, 20, 0, 0, 0, 0);
510 else
511 return calculate_desc(info, EU_UNIT_EM, 0, 2, 0, 0, 4,
512 0, 22, 0, 0, 0, 0);
513
514 case ELK_SHADER_OPCODE_INT_QUOTIENT:
515 case ELK_SHADER_OPCODE_INT_REMAINDER:
516 return calculate_desc(info, EU_UNIT_EM, 2, 0, 0, 26, 0,
517 0, 28 /* XXX */, 0, 0, 0, 0);
518
519 default:
520 abort();
521 }
522 } else {
523 switch (info.op) {
524 case ELK_SHADER_OPCODE_RCP:
525 return calculate_desc(info, EU_UNIT_EM, 2, 0, 0, 0, 8,
526 0, 22, 0, 0, 0, 8);
527
528 case ELK_SHADER_OPCODE_RSQ:
529 return calculate_desc(info, EU_UNIT_EM, 2, 0, 0, 0, 16,
530 0, 44, 0, 0, 0, 8);
531
532 case ELK_SHADER_OPCODE_INT_QUOTIENT:
533 case ELK_SHADER_OPCODE_SQRT:
534 case ELK_SHADER_OPCODE_LOG2:
535 return calculate_desc(info, EU_UNIT_EM, 2, 0, 0, 0, 24,
536 0, 66, 0, 0, 0, 8);
537
538 case ELK_SHADER_OPCODE_INT_REMAINDER:
539 case ELK_SHADER_OPCODE_EXP2:
540 return calculate_desc(info, EU_UNIT_EM, 2, 0, 0, 0, 32,
541 0, 88, 0, 0, 0, 8);
542
543 case ELK_SHADER_OPCODE_SIN:
544 case ELK_SHADER_OPCODE_COS:
545 return calculate_desc(info, EU_UNIT_EM, 2, 0, 0, 0, 48,
546 0, 132, 0, 0, 0, 8);
547
548 case ELK_SHADER_OPCODE_POW:
549 return calculate_desc(info, EU_UNIT_EM, 2, 0, 0, 0, 64,
550 0, 176, 0, 0, 0, 8);
551
552 default:
553 abort();
554 }
555 }
556
557 case ELK_OPCODE_DO:
558 if (devinfo->ver >= 6)
559 return calculate_desc(info, EU_UNIT_NULL, 0, 0, 0, 0, 0,
560 0, 0, 0, 0, 0, 0);
561 else
562 return calculate_desc(info, EU_UNIT_NULL, 2 /* XXX */, 0, 0, 0, 0,
563 0, 0, 0, 0, 0, 0);
564
565 case ELK_OPCODE_IF:
566 case ELK_OPCODE_ELSE:
567 case ELK_OPCODE_ENDIF:
568 case ELK_OPCODE_WHILE:
569 case ELK_OPCODE_BREAK:
570 case ELK_OPCODE_CONTINUE:
571 case ELK_OPCODE_HALT:
572 if (devinfo->ver >= 8)
573 return calculate_desc(info, EU_UNIT_NULL, 8, 0, 0, 0, 0,
574 0, 0, 0, 0, 0, 0);
575 else if (devinfo->verx10 >= 75)
576 return calculate_desc(info, EU_UNIT_NULL, 6, 0, 0, 0, 0,
577 0, 0, 0, 0, 0, 0);
578 else
579 return calculate_desc(info, EU_UNIT_NULL, 2, 0, 0, 0, 0,
580 0, 0, 0, 0, 0, 0);
581
582 case ELK_FS_OPCODE_LINTERP:
583 if (devinfo->ver >= 8)
584 return calculate_desc(info, EU_UNIT_FPU, 0, 4, 0, 0, 4,
585 0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
586 else if (devinfo->verx10 >= 75)
587 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
588 0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
589 else
590 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
591 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
592
593 case ELK_OPCODE_LRP:
594 if (devinfo->ver >= 8)
595 return calculate_desc(info, EU_UNIT_FPU, 0, 4, 1, 0, 4,
596 0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
597 else if (devinfo->verx10 >= 75)
598 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
599 0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
600 else if (devinfo->ver >= 6)
601 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
602 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
603 else
604 abort();
605
606 case ELK_FS_OPCODE_PACK_HALF_2x16_SPLIT:
607 if (devinfo->ver >= 8)
608 return calculate_desc(info, EU_UNIT_FPU, 16, 6, 0, 0, 6,
609 0, 8 /* XXX */, 4 /* XXX */,
610 12 /* XXX */, 0, 0);
611 else if (devinfo->verx10 >= 75)
612 return calculate_desc(info, EU_UNIT_FPU, 20, 6, 0, 0, 6,
613 0, 10 /* XXX */, 6 /* XXX */,
614 16 /* XXX */, 0, 0);
615 else if (devinfo->ver >= 7)
616 return calculate_desc(info, EU_UNIT_FPU, 24, 6, 0, 0, 6,
617 0, 12 /* XXX */, 8 /* XXX */,
618 18 /* XXX */, 0, 0);
619 else
620 abort();
621
622 case ELK_SHADER_OPCODE_MOV_INDIRECT:
623 if (devinfo->ver >= 8)
624 return calculate_desc(info, EU_UNIT_FPU, 34, 0, 0, 34, 0,
625 0, 8 /* XXX */, 4 /* XXX */,
626 12 /* XXX */, 0, 0);
627 else if (devinfo->verx10 >= 75)
628 return calculate_desc(info, EU_UNIT_FPU, 34, 0, 0, 34, 0,
629 0, 10 /* XXX */, 6 /* XXX */,
630 16 /* XXX */, 0, 0);
631 else
632 return calculate_desc(info, EU_UNIT_FPU, 34, 0, 0, 34, 0,
633 0, 12 /* XXX */, 8 /* XXX */,
634 18 /* XXX */, 0, 0);
635
636 case ELK_SHADER_OPCODE_BROADCAST:
637 if (devinfo->ver >= 8)
638 return calculate_desc(info, EU_UNIT_FPU, 18, 0, 0, 4, 0,
639 0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
640 else if (devinfo->verx10 >= 75)
641 return calculate_desc(info, EU_UNIT_FPU, 18, 0, 0, 4, 0,
642 0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
643 else if (devinfo->ver >= 7)
644 return calculate_desc(info, EU_UNIT_FPU, 20, 0, 0, 4, 0,
645 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
646 else
647 abort();
648
649 case ELK_SHADER_OPCODE_FIND_LIVE_CHANNEL:
650 case ELK_SHADER_OPCODE_FIND_LAST_LIVE_CHANNEL:
651 if (devinfo->ver >= 8)
652 return calculate_desc(info, EU_UNIT_FPU, 2, 0, 0, 2, 0,
653 0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
654 else if (devinfo->verx10 >= 75)
655 return calculate_desc(info, EU_UNIT_FPU, 36, 0, 0, 6, 0,
656 0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
657 else if (devinfo->ver >= 7)
658 return calculate_desc(info, EU_UNIT_FPU, 40, 0, 0, 6, 0,
659 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
660 else
661 abort();
662
663 case ELK_SHADER_OPCODE_RND_MODE:
664 case ELK_SHADER_OPCODE_FLOAT_CONTROL_MODE:
665 if (devinfo->ver >= 8)
666 return calculate_desc(info, EU_UNIT_FPU, 20 /* XXX */, 0, 0,
667 4 /* XXX */, 0,
668 0, 0, 0, 0, 0, 0);
669 else if (devinfo->verx10 >= 75)
670 return calculate_desc(info, EU_UNIT_FPU, 24 /* XXX */, 0, 0,
671 4 /* XXX */, 0,
672 0, 0, 0, 0, 0, 0);
673 else if (devinfo->ver >= 6)
674 return calculate_desc(info, EU_UNIT_FPU, 28 /* XXX */, 0, 0,
675 4 /* XXX */, 0,
676 0, 0, 0, 0, 0, 0);
677 else
678 abort();
679
680 case ELK_SHADER_OPCODE_SHUFFLE:
681 if (devinfo->ver >= 8)
682 return calculate_desc(info, EU_UNIT_FPU, 42 /* XXX */, 0, 0,
683 42 /* XXX */, 0,
684 0, 8 /* XXX */, 4 /* XXX */,
685 12 /* XXX */, 0, 0);
686 else if (devinfo->verx10 >= 75)
687 return calculate_desc(info, EU_UNIT_FPU, 0, 44 /* XXX */, 0,
688 0, 44 /* XXX */,
689 0, 10 /* XXX */, 6 /* XXX */,
690 16 /* XXX */, 0, 0);
691 else if (devinfo->ver >= 6)
692 return calculate_desc(info, EU_UNIT_FPU, 0, 46 /* XXX */, 0,
693 0, 46 /* XXX */,
694 0, 12 /* XXX */, 8 /* XXX */,
695 18 /* XXX */, 0, 0);
696 else
697 abort();
698
699 case ELK_SHADER_OPCODE_SEL_EXEC:
700 if (devinfo->ver >= 8)
701 return calculate_desc(info, EU_UNIT_FPU, 8 /* XXX */, 4 /* XXX */, 0,
702 0, 4 /* XXX */,
703 0, 8 /* XXX */, 4 /* XXX */,
704 12 /* XXX */, 0, 0);
705 else if (devinfo->verx10 >= 75)
706 return calculate_desc(info, EU_UNIT_FPU, 10 /* XXX */, 4 /* XXX */, 0,
707 0, 4 /* XXX */,
708 0, 10 /* XXX */, 6 /* XXX */,
709 16 /* XXX */, 0, 0);
710 else
711 return calculate_desc(info, EU_UNIT_FPU, 12 /* XXX */, 4 /* XXX */, 0,
712 0, 4 /* XXX */,
713 0, 12 /* XXX */, 8 /* XXX */,
714 18 /* XXX */, 0, 0);
715
716 case ELK_SHADER_OPCODE_QUAD_SWIZZLE:
717 if (devinfo->ver >= 8)
718 return calculate_desc(info, EU_UNIT_FPU, 0 /* XXX */, 8 /* XXX */, 0,
719 0, 8 /* XXX */,
720 0, 8 /* XXX */, 4 /* XXX */,
721 12 /* XXX */, 0, 0);
722 else if (devinfo->verx10 >= 75)
723 return calculate_desc(info, EU_UNIT_FPU, 0 /* XXX */, 8 /* XXX */, 0,
724 0, 8 /* XXX */,
725 0, 10 /* XXX */, 6 /* XXX */,
726 16 /* XXX */, 0, 0);
727 else
728 return calculate_desc(info, EU_UNIT_FPU, 0 /* XXX */, 8 /* XXX */, 0,
729 0, 8 /* XXX */,
730 0, 12 /* XXX */, 8 /* XXX */,
731 18 /* XXX */, 0, 0);
732
733 case ELK_FS_OPCODE_DDY_FINE:
734 if (devinfo->ver >= 8)
735 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
736 0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
737 else if (devinfo->verx10 >= 75)
738 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
739 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
740 else
741 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
742 0, 14, 10 /* XXX */, 20 /* XXX */, 0, 0);
743
744 case ELK_FS_OPCODE_LOAD_LIVE_CHANNELS:
745 if (devinfo->ver >= 8)
746 return calculate_desc(info, EU_UNIT_FPU, 0, 2 /* XXX */, 0,
747 0, 2 /* XXX */,
748 0, 0, 0, 8 /* XXX */, 0, 0);
749 else
750 abort();
751
752 case ELK_VEC4_OPCODE_PACK_BYTES:
753 if (devinfo->ver >= 8)
754 return calculate_desc(info, EU_UNIT_FPU, 4 /* XXX */, 0, 0,
755 4 /* XXX */, 0,
756 0, 8 /* XXX */, 4 /* XXX */, 12 /* XXX */,
757 0, 0);
758 else if (devinfo->verx10 >= 75)
759 return calculate_desc(info, EU_UNIT_FPU, 4 /* XXX */, 0, 0,
760 4 /* XXX */, 0,
761 0, 10 /* XXX */, 6 /* XXX */, 16 /* XXX */,
762 0, 0);
763 else
764 return calculate_desc(info, EU_UNIT_FPU, 4 /* XXX */, 0, 0,
765 4 /* XXX */, 0,
766 0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */,
767 0, 0);
768
769 case ELK_VS_OPCODE_UNPACK_FLAGS_SIMD4X2:
770 case ELK_TCS_OPCODE_GET_INSTANCE_ID:
771 case ELK_VEC4_TCS_OPCODE_SET_INPUT_URB_OFFSETS:
772 case ELK_VEC4_TCS_OPCODE_SET_OUTPUT_URB_OFFSETS:
773 case ELK_TES_OPCODE_CREATE_INPUT_READ_HEADER:
774 if (devinfo->ver >= 8)
775 return calculate_desc(info, EU_UNIT_FPU, 22 /* XXX */, 0, 0,
776 6 /* XXX */, 0,
777 0, 8 /* XXX */, 4 /* XXX */, 12 /* XXX */,
778 0, 0);
779 else if (devinfo->verx10 >= 75)
780 return calculate_desc(info, EU_UNIT_FPU, 26 /* XXX */, 0, 0,
781 6 /* XXX */, 0,
782 0, 10 /* XXX */, 6 /* XXX */, 16 /* XXX */,
783 0, 0);
784 else
785 return calculate_desc(info, EU_UNIT_FPU, 30 /* XXX */, 0, 0,
786 6 /* XXX */, 0,
787 0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */,
788 0, 0);
789
790 case ELK_GS_OPCODE_FF_SYNC_SET_PRIMITIVES:
791 case ELK_TCS_OPCODE_CREATE_BARRIER_HEADER:
792 if (devinfo->ver >= 8)
793 return calculate_desc(info, EU_UNIT_FPU, 32 /* XXX */, 0, 0,
794 8 /* XXX */, 0,
795 0, 8 /* XXX */, 4 /* XXX */, 12 /* XXX */,
796 0, 0);
797 else if (devinfo->verx10 >= 75)
798 return calculate_desc(info, EU_UNIT_FPU, 38 /* XXX */, 0, 0,
799 8 /* XXX */, 0,
800 0, 10 /* XXX */, 6 /* XXX */, 16 /* XXX */,
801 0, 0);
802 else if (devinfo->ver >= 6)
803 return calculate_desc(info, EU_UNIT_FPU, 44 /* XXX */, 0, 0,
804 8 /* XXX */, 0,
805 0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */,
806 0, 0);
807 else
808 abort();
809
810 case ELK_TES_OPCODE_ADD_INDIRECT_URB_OFFSET:
811 if (devinfo->ver >= 8)
812 return calculate_desc(info, EU_UNIT_FPU, 12 /* XXX */, 0, 0,
813 4 /* XXX */, 0,
814 0, 8 /* XXX */, 4 /* XXX */, 12 /* XXX */,
815 0, 0);
816 else if (devinfo->verx10 >= 75)
817 return calculate_desc(info, EU_UNIT_FPU, 14 /* XXX */, 0, 0,
818 4 /* XXX */, 0,
819 0, 10 /* XXX */, 6 /* XXX */, 16 /* XXX */,
820 0, 0);
821 else if (devinfo->ver >= 7)
822 return calculate_desc(info, EU_UNIT_FPU, 16 /* XXX */, 0, 0,
823 4 /* XXX */, 0,
824 0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */,
825 0, 0);
826 else
827 abort();
828
829 case ELK_SHADER_OPCODE_TEX:
830 case ELK_FS_OPCODE_TXB:
831 case ELK_SHADER_OPCODE_TXD:
832 case ELK_SHADER_OPCODE_TXF:
833 case ELK_SHADER_OPCODE_TXF_LZ:
834 case ELK_SHADER_OPCODE_TXL:
835 case ELK_SHADER_OPCODE_TXL_LZ:
836 case ELK_SHADER_OPCODE_TXF_CMS:
837 case ELK_SHADER_OPCODE_TXF_CMS_W:
838 case ELK_SHADER_OPCODE_TXF_UMS:
839 case ELK_SHADER_OPCODE_TXF_MCS:
840 case ELK_SHADER_OPCODE_TXS:
841 case ELK_SHADER_OPCODE_LOD:
842 case ELK_SHADER_OPCODE_GET_BUFFER_SIZE:
843 case ELK_SHADER_OPCODE_TG4:
844 case ELK_SHADER_OPCODE_TG4_OFFSET:
845 case ELK_SHADER_OPCODE_SAMPLEINFO:
846 case ELK_FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GFX4:
847 return calculate_desc(info, EU_UNIT_SAMPLER, 2, 0, 0, 0, 16 /* XXX */,
848 8 /* XXX */, 750 /* XXX */, 0, 0,
849 2 /* XXX */, 0);
850
851 case ELK_VEC4_OPCODE_URB_READ:
852 case ELK_VEC4_VS_OPCODE_URB_WRITE:
853 case ELK_VEC4_GS_OPCODE_URB_WRITE:
854 case ELK_VEC4_GS_OPCODE_URB_WRITE_ALLOCATE:
855 case ELK_GS_OPCODE_THREAD_END:
856 case ELK_GS_OPCODE_FF_SYNC:
857 case ELK_VEC4_TCS_OPCODE_URB_WRITE:
858 case ELK_TCS_OPCODE_RELEASE_INPUT:
859 case ELK_TCS_OPCODE_THREAD_END:
860 return calculate_desc(info, EU_UNIT_URB, 2, 0, 0, 0, 6 /* XXX */,
861 32 /* XXX */, 200 /* XXX */, 0, 0, 0, 0);
862
863 case ELK_SHADER_OPCODE_MEMORY_FENCE:
864 case ELK_SHADER_OPCODE_INTERLOCK:
865 switch (info.sfid) {
866 case GFX6_SFID_DATAPORT_RENDER_CACHE:
867 if (devinfo->ver >= 7)
868 return calculate_desc(info, EU_UNIT_DP_RC, 2, 0, 0, 30 /* XXX */, 0,
869 10 /* XXX */, 300 /* XXX */, 0, 0, 0, 0);
870 else
871 abort();
872
873 case ELK_SFID_URB:
874 case GFX7_SFID_DATAPORT_DATA_CACHE:
875 case HSW_SFID_DATAPORT_DATA_CACHE_1:
876 if (devinfo->ver >= 7)
877 return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0, 30 /* XXX */, 0,
878 10 /* XXX */, 100 /* XXX */, 0, 0, 0, 0);
879 else
880 abort();
881
882 default:
883 abort();
884 }
885
886 case ELK_SHADER_OPCODE_GFX4_SCRATCH_READ:
887 case ELK_SHADER_OPCODE_GFX4_SCRATCH_WRITE:
888 case ELK_SHADER_OPCODE_GFX7_SCRATCH_READ:
889 return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0, 0, 8 /* XXX */,
890 10 /* XXX */, 100 /* XXX */, 0, 0, 0, 0);
891
892 case ELK_VEC4_OPCODE_UNTYPED_ATOMIC:
893 if (devinfo->ver >= 7)
894 return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0,
895 30 /* XXX */, 400 /* XXX */,
896 10 /* XXX */, 100 /* XXX */, 0, 0,
897 0, 400 /* XXX */);
898 else
899 abort();
900
901 case ELK_VEC4_OPCODE_UNTYPED_SURFACE_READ:
902 case ELK_VEC4_OPCODE_UNTYPED_SURFACE_WRITE:
903 if (devinfo->ver >= 7)
904 return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0,
905 0, 20 /* XXX */,
906 10 /* XXX */, 100 /* XXX */, 0, 0,
907 0, 0);
908 else
909 abort();
910
911 case ELK_FS_OPCODE_FB_WRITE:
912 case ELK_FS_OPCODE_REP_FB_WRITE:
913 return calculate_desc(info, EU_UNIT_DP_RC, 2, 0, 0, 0, 450 /* XXX */,
914 10 /* XXX */, 300 /* XXX */, 0, 0, 0, 0);
915
916 case ELK_GS_OPCODE_SVB_WRITE:
917 if (devinfo->ver >= 6)
918 return calculate_desc(info, EU_UNIT_DP_RC, 2 /* XXX */, 0, 0,
919 0, 450 /* XXX */,
920 10 /* XXX */, 300 /* XXX */, 0, 0,
921 0, 0);
922 else
923 abort();
924
925 case ELK_FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
926 return calculate_desc(info, EU_UNIT_DP_CC, 2, 0, 0, 0, 16 /* XXX */,
927 10 /* XXX */, 100 /* XXX */, 0, 0, 0, 0);
928
929 case ELK_VS_OPCODE_PULL_CONSTANT_LOAD:
930 case ELK_VS_OPCODE_PULL_CONSTANT_LOAD_GFX7:
931 return calculate_desc(info, EU_UNIT_SAMPLER, 2, 0, 0, 0, 16,
932 8, 750, 0, 0, 2, 0);
933
934 case ELK_FS_OPCODE_INTERPOLATE_AT_SAMPLE:
935 case ELK_FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
936 case ELK_FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
937 if (devinfo->ver >= 7)
938 return calculate_desc(info, EU_UNIT_PI, 2, 0, 0, 14 /* XXX */, 0,
939 0, 90 /* XXX */, 0, 0, 0, 0);
940 else
941 abort();
942
943 case ELK_SHADER_OPCODE_BARRIER:
944 if (devinfo->ver >= 7)
945 return calculate_desc(info, EU_UNIT_GATEWAY, 90 /* XXX */, 0, 0,
946 0 /* XXX */, 0,
947 0, 0, 0, 0, 0, 0);
948 else
949 abort();
950
951 case ELK_CS_OPCODE_CS_TERMINATE:
952 if (devinfo->ver >= 7)
953 return calculate_desc(info, EU_UNIT_SPAWNER, 2, 0, 0, 0 /* XXX */, 0,
954 10 /* XXX */, 0, 0, 0, 0, 0);
955 else
956 abort();
957
958 case ELK_SHADER_OPCODE_SEND:
959 switch (info.sfid) {
960 case GFX6_SFID_DATAPORT_CONSTANT_CACHE:
961 if (devinfo->ver >= 7) {
962 /* See ELK_FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD */
963 return calculate_desc(info, EU_UNIT_DP_CC, 2, 0, 0, 0, 16 /* XXX */,
964 10 /* XXX */, 100 /* XXX */, 0, 0, 0, 0);
965 } else {
966 abort();
967 }
968 case GFX6_SFID_DATAPORT_RENDER_CACHE:
969 if (devinfo->ver >= 7) {
970 switch (elk_dp_desc_msg_type(devinfo, info.desc)) {
971 case GFX7_DATAPORT_RC_TYPED_ATOMIC_OP:
972 return calculate_desc(info, EU_UNIT_DP_RC, 2, 0, 0,
973 30 /* XXX */, 450 /* XXX */,
974 10 /* XXX */, 100 /* XXX */,
975 0, 0, 0, 400 /* XXX */);
976 default:
977 return calculate_desc(info, EU_UNIT_DP_RC, 2, 0, 0,
978 0, 450 /* XXX */,
979 10 /* XXX */, 300 /* XXX */, 0, 0,
980 0, 0);
981 }
982 } else if (devinfo->ver >= 6) {
983 return calculate_desc(info, EU_UNIT_DP_RC, 2 /* XXX */, 0, 0,
984 0, 450 /* XXX */,
985 10 /* XXX */, 300 /* XXX */, 0, 0, 0, 0);
986 } else {
987 abort();
988 }
989 case ELK_SFID_SAMPLER: {
990 if (devinfo->ver >= 6)
991 return calculate_desc(info, EU_UNIT_SAMPLER, 2, 0, 0, 0, 16,
992 8, 750, 0, 0, 2, 0);
993 else
994 abort();
995 }
996 case GFX7_SFID_DATAPORT_DATA_CACHE:
997 case HSW_SFID_DATAPORT_DATA_CACHE_1:
998 if (devinfo->verx10 >= 75) {
999 switch (elk_dp_desc_msg_type(devinfo, info.desc)) {
1000 case HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP:
1001 case HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP_SIMD4X2:
1002 case HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP_SIMD4X2:
1003 case HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP:
1004 return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0,
1005 30 /* XXX */, 400 /* XXX */,
1006 10 /* XXX */, 100 /* XXX */, 0, 0,
1007 0, 400 /* XXX */);
1008
1009 default:
1010 return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0,
1011 0, 20 /* XXX */,
1012 10 /* XXX */, 100 /* XXX */, 0, 0,
1013 0, 0);
1014 }
1015 } else if (devinfo->ver >= 7) {
1016 switch (elk_dp_desc_msg_type(devinfo, info.desc)) {
1017 case GFX7_DATAPORT_DC_UNTYPED_ATOMIC_OP:
1018 return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0,
1019 30 /* XXX */, 400 /* XXX */,
1020 10 /* XXX */, 100 /* XXX */,
1021 0, 0, 0, 400 /* XXX */);
1022 default:
1023 return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0,
1024 0, 20 /* XXX */,
1025 10 /* XXX */, 100 /* XXX */, 0, 0,
1026 0, 0);
1027 }
1028 } else {
1029 abort();
1030 }
1031
1032 case GFX7_SFID_PIXEL_INTERPOLATOR:
1033 if (devinfo->ver >= 7)
1034 return calculate_desc(info, EU_UNIT_PI, 2, 0, 0, 14 /* XXX */, 0,
1035 0, 90 /* XXX */, 0, 0, 0, 0);
1036 else
1037 abort();
1038
1039 case ELK_SFID_URB:
1040 return calculate_desc(info, EU_UNIT_URB, 2, 0, 0, 0, 6 /* XXX */,
1041 32 /* XXX */, 200 /* XXX */, 0, 0, 0, 0);
1042
1043 default:
1044 abort();
1045 }
1046
1047 case ELK_SHADER_OPCODE_UNDEF:
1048 case ELK_SHADER_OPCODE_HALT_TARGET:
1049 case ELK_FS_OPCODE_SCHEDULING_FENCE:
1050 return calculate_desc(info, EU_UNIT_NULL, 0, 0, 0, 0, 0,
1051 0, 0, 0, 0, 0, 0);
1052
1053 default:
1054 abort();
1055 }
1056 }
1057
1058 /**
1059 * Model the performance behavior of a stall on the specified dependency
1060 * ID.
1061 */
1062 void
stall_on_dependency(state & st,enum intel_eu_dependency_id id)1063 stall_on_dependency(state &st, enum intel_eu_dependency_id id)
1064 {
1065 if (id < ARRAY_SIZE(st.dep_ready))
1066 st.unit_ready[EU_UNIT_FE] = MAX2(st.unit_ready[EU_UNIT_FE],
1067 st.dep_ready[id]);
1068 }
1069
1070 /**
1071 * Model the performance behavior of the front-end and back-end while
1072 * executing an instruction with the specified timing information, assuming
1073 * all dependencies are already clear.
1074 */
1075 void
execute_instruction(state & st,const perf_desc & perf)1076 execute_instruction(state &st, const perf_desc &perf)
1077 {
1078 /* Compute the time at which the front-end will be ready to execute the
1079 * next instruction.
1080 */
1081 st.unit_ready[EU_UNIT_FE] += perf.df;
1082
1083 if (perf.u < EU_NUM_UNITS) {
1084 /* Wait for the back-end to be ready to execute this instruction. */
1085 st.unit_ready[EU_UNIT_FE] = MAX2(st.unit_ready[EU_UNIT_FE],
1086 st.unit_ready[perf.u]);
1087
1088 /* Compute the time at which the back-end will be ready to execute
1089 * the next instruction, and update the back-end utilization.
1090 */
1091 st.unit_ready[perf.u] = st.unit_ready[EU_UNIT_FE] + perf.db;
1092 st.unit_busy[perf.u] += perf.db * st.weight;
1093 }
1094 }
1095
1096 /**
1097 * Model the performance behavior of a read dependency provided by an
1098 * instruction.
1099 */
1100 void
mark_read_dependency(state & st,const perf_desc & perf,enum intel_eu_dependency_id id)1101 mark_read_dependency(state &st, const perf_desc &perf,
1102 enum intel_eu_dependency_id id)
1103 {
1104 if (id < ARRAY_SIZE(st.dep_ready))
1105 st.dep_ready[id] = st.unit_ready[EU_UNIT_FE] + perf.ls;
1106 }
1107
1108 /**
1109 * Model the performance behavior of a write dependency provided by an
1110 * instruction.
1111 */
1112 void
mark_write_dependency(state & st,const perf_desc & perf,enum intel_eu_dependency_id id)1113 mark_write_dependency(state &st, const perf_desc &perf,
1114 enum intel_eu_dependency_id id)
1115 {
1116 if (id >= EU_DEPENDENCY_ID_ACCUM0 && id < EU_DEPENDENCY_ID_FLAG0)
1117 st.dep_ready[id] = st.unit_ready[EU_UNIT_FE] + perf.la;
1118 else if (id >= EU_DEPENDENCY_ID_FLAG0 && id < EU_DEPENDENCY_ID_SBID_WR0)
1119 st.dep_ready[id] = st.unit_ready[EU_UNIT_FE] + perf.lf;
1120 else if (id < ARRAY_SIZE(st.dep_ready))
1121 st.dep_ready[id] = st.unit_ready[EU_UNIT_FE] + perf.ld;
1122 }
1123
1124 /**
1125 * Return the dependency ID of a elk_backend_reg, offset by \p delta GRFs.
1126 */
1127 enum intel_eu_dependency_id
reg_dependency_id(const intel_device_info * devinfo,const elk_backend_reg & r,const int delta)1128 reg_dependency_id(const intel_device_info *devinfo, const elk_backend_reg &r,
1129 const int delta)
1130 {
1131 if (r.file == VGRF) {
1132 const unsigned i = r.nr + r.offset / REG_SIZE + delta;
1133 assert(i < EU_DEPENDENCY_ID_MRF0 - EU_DEPENDENCY_ID_GRF0);
1134 return intel_eu_dependency_id(EU_DEPENDENCY_ID_GRF0 + i);
1135
1136 } else if (r.file == FIXED_GRF) {
1137 const unsigned i = r.nr + delta;
1138 assert(i < EU_DEPENDENCY_ID_MRF0 - EU_DEPENDENCY_ID_GRF0);
1139 return intel_eu_dependency_id(EU_DEPENDENCY_ID_GRF0 + i);
1140
1141 } else if (r.file == MRF && devinfo->ver >= 7) {
1142 const unsigned i = GFX7_MRF_HACK_START +
1143 r.nr + r.offset / REG_SIZE + delta;
1144 assert(i < EU_DEPENDENCY_ID_MRF0 - EU_DEPENDENCY_ID_GRF0);
1145 return intel_eu_dependency_id(EU_DEPENDENCY_ID_GRF0 + i);
1146
1147 } else if (r.file == MRF && devinfo->ver < 7) {
1148 const unsigned i = (r.nr & ~ELK_MRF_COMPR4) +
1149 r.offset / REG_SIZE + delta;
1150 assert(i < EU_DEPENDENCY_ID_ADDR0 - EU_DEPENDENCY_ID_MRF0);
1151 return intel_eu_dependency_id(EU_DEPENDENCY_ID_MRF0 + i);
1152
1153 } else if (r.file == ARF && r.nr >= ELK_ARF_ADDRESS &&
1154 r.nr < ELK_ARF_ACCUMULATOR) {
1155 assert(delta == 0);
1156 return EU_DEPENDENCY_ID_ADDR0;
1157
1158 } else if (r.file == ARF && r.nr >= ELK_ARF_ACCUMULATOR &&
1159 r.nr < ELK_ARF_FLAG) {
1160 const unsigned i = r.nr - ELK_ARF_ACCUMULATOR + delta;
1161 assert(i < EU_DEPENDENCY_ID_FLAG0 - EU_DEPENDENCY_ID_ACCUM0);
1162 return intel_eu_dependency_id(EU_DEPENDENCY_ID_ACCUM0 + i);
1163
1164 } else {
1165 return EU_NUM_DEPENDENCY_IDS;
1166 }
1167 }
1168
1169 /**
1170 * Return the dependency ID of flag register starting at offset \p i.
1171 */
1172 enum intel_eu_dependency_id
flag_dependency_id(unsigned i)1173 flag_dependency_id(unsigned i)
1174 {
1175 assert(i < EU_DEPENDENCY_ID_SBID_WR0 - EU_DEPENDENCY_ID_FLAG0);
1176 return intel_eu_dependency_id(EU_DEPENDENCY_ID_FLAG0 + i);
1177 }
1178
1179 /**
1180 * Return the implicit accumulator register accessed by channel \p i of the
1181 * instruction.
1182 */
1183 unsigned
accum_reg_of_channel(const intel_device_info * devinfo,const elk_backend_instruction * inst,elk_reg_type tx,unsigned i)1184 accum_reg_of_channel(const intel_device_info *devinfo,
1185 const elk_backend_instruction *inst,
1186 elk_reg_type tx, unsigned i)
1187 {
1188 assert(inst->reads_accumulator_implicitly() ||
1189 inst->writes_accumulator_implicitly(devinfo));
1190 const unsigned offset = (inst->group + i) * type_sz(tx) *
1191 (devinfo->ver < 7 || elk_reg_type_is_floating_point(tx) ? 1 : 2);
1192 return offset / (reg_unit(devinfo) * REG_SIZE) % 2;
1193 }
1194
/**
 * Model the performance behavior of an FS back-end instruction.
 *
 * Order matters: first stall the front-end on every source and destination
 * dependency that isn't ready yet, then execute the instruction, and
 * finally record the new ready times of the dependencies it provides.
 */
void
issue_fs_inst(state &st, const struct elk_isa_info *isa,
              const elk_backend_instruction *be_inst)
{
   const struct intel_device_info *devinfo = isa->devinfo;
   const elk_fs_inst *inst = static_cast<const elk_fs_inst *>(be_inst);
   const instruction_info info(isa, inst);
   const perf_desc perf = instruction_desc(info);

   /* Stall on any source dependencies. */
   for (unsigned i = 0; i < inst->sources; i++) {
      for (unsigned j = 0; j < regs_read(inst, i); j++)
         stall_on_dependency(
            st, reg_dependency_id(devinfo, inst->src[i], j));
   }

   /* Implicit accumulator reads depend on every accumulator register
    * covered by the channels this instruction executes.
    */
   if (inst->reads_accumulator_implicitly()) {
      for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
           j <= accum_reg_of_channel(devinfo, inst, info.tx,
                                     inst->exec_size - 1); j++)
         stall_on_dependency(
            st, reg_dependency_id(devinfo, elk_acc_reg(8), j));
   }

   /* Legacy MRF-based sends read their payload from the MRF file. */
   if (is_send(inst) && inst->base_mrf != -1) {
      for (unsigned j = 0; j < inst->mlen; j++)
         stall_on_dependency(
            st, reg_dependency_id(
               devinfo, elk_uvec_mrf(8, inst->base_mrf, 0), j));
   }

   /* Stall on each flag subregister this instruction reads. */
   if (const unsigned mask = inst->flags_read(devinfo)) {
      for (unsigned i = 0; i < sizeof(mask) * CHAR_BIT; i++) {
         if (mask & (1 << i))
            stall_on_dependency(st, flag_dependency_id(i));
      }
   }

   /* Stall on any write dependencies.  Skipped entirely when the
    * instruction is marked no_dd_check (dependency checking disabled).
    */
   if (!inst->no_dd_check) {
      if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) {
         for (unsigned j = 0; j < regs_written(inst); j++)
            stall_on_dependency(
               st, reg_dependency_id(devinfo, inst->dst, j));
      }

      if (inst->writes_accumulator_implicitly(devinfo)) {
         for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
              j <= accum_reg_of_channel(devinfo, inst, info.tx,
                                        inst->exec_size - 1); j++)
            stall_on_dependency(
               st, reg_dependency_id(devinfo, elk_acc_reg(8), j));
      }

      if (const unsigned mask = inst->flags_written(devinfo)) {
         for (unsigned i = 0; i < sizeof(mask) * CHAR_BIT; i++) {
            if (mask & (1 << i))
               stall_on_dependency(st, flag_dependency_id(i));
         }
      }
   }

   /* Execute the instruction. */
   execute_instruction(st, perf);

   /* Mark any source dependencies.  Only the payload sources of a send
    * remain in use after issue, for perf.ls cycles.
    */
   if (inst->is_send_from_grf()) {
      for (unsigned i = 0; i < inst->sources; i++) {
         if (inst->is_payload(i)) {
            for (unsigned j = 0; j < regs_read(inst, i); j++)
               mark_read_dependency(
                  st, perf, reg_dependency_id(devinfo, inst->src[i], j));
         }
      }
   }

   if (is_send(inst) && inst->base_mrf != -1) {
      for (unsigned j = 0; j < inst->mlen; j++)
         mark_read_dependency(st, perf,
            reg_dependency_id(devinfo, elk_uvec_mrf(8, inst->base_mrf, 0), j));
   }

   /* Mark any destination dependencies. */
   if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) {
      for (unsigned j = 0; j < regs_written(inst); j++) {
         mark_write_dependency(st, perf,
                               reg_dependency_id(devinfo, inst->dst, j));
      }
   }

   if (inst->writes_accumulator_implicitly(devinfo)) {
      for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
           j <= accum_reg_of_channel(devinfo, inst, info.tx,
                                     inst->exec_size - 1); j++)
         mark_write_dependency(st, perf,
                               reg_dependency_id(devinfo, elk_acc_reg(8), j));
   }

   if (const unsigned mask = inst->flags_written(devinfo)) {
      for (unsigned i = 0; i < sizeof(mask) * CHAR_BIT; i++) {
         if (mask & (1 << i))
            mark_write_dependency(st, perf, flag_dependency_id(i));
      }
   }
}
1303
/**
 * Model the performance behavior of a VEC4 back-end instruction.
 *
 * Same three-phase structure as issue_fs_inst(): stall on unready
 * dependencies, execute, then mark the dependencies this instruction
 * provides.  Differences from the FS path: sources are a fixed-size array,
 * MRF payloads are not gated on is_send(), and only a single flag
 * dependency ID (EU_DEPENDENCY_ID_FLAG0) is tracked.
 */
void
issue_vec4_instruction(state &st, const struct elk_isa_info *isa,
                       const elk_backend_instruction *be_inst)
{
   const struct intel_device_info *devinfo = isa->devinfo;
   const vec4_instruction *inst =
      static_cast<const vec4_instruction *>(be_inst);
   const instruction_info info(isa, inst);
   const perf_desc perf = instruction_desc(info);

   /* Stall on any source dependencies. */
   for (unsigned i = 0; i < ARRAY_SIZE(inst->src); i++) {
      for (unsigned j = 0; j < regs_read(inst, i); j++)
         stall_on_dependency(
            st, reg_dependency_id(devinfo, inst->src[i], j));
   }

   /* Implicit accumulator reads depend on every accumulator register
    * covered by the channels this instruction executes.
    */
   if (inst->reads_accumulator_implicitly()) {
      for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
           j <= accum_reg_of_channel(devinfo, inst, info.tx,
                                     inst->exec_size - 1); j++)
         stall_on_dependency(
            st, reg_dependency_id(devinfo, elk_acc_reg(8), j));
   }

   /* MRF-based message payload. */
   if (inst->base_mrf != -1) {
      for (unsigned j = 0; j < inst->mlen; j++)
         stall_on_dependency(
            st, reg_dependency_id(
               devinfo, elk_uvec_mrf(8, inst->base_mrf, 0), j));
   }

   if (inst->reads_flag())
      stall_on_dependency(st, EU_DEPENDENCY_ID_FLAG0);

   /* Stall on any write dependencies.  Skipped entirely when the
    * instruction is marked no_dd_check (dependency checking disabled).
    */
   if (!inst->no_dd_check) {
      if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) {
         for (unsigned j = 0; j < regs_written(inst); j++)
            stall_on_dependency(
               st, reg_dependency_id(devinfo, inst->dst, j));
      }

      if (inst->writes_accumulator_implicitly(devinfo)) {
         for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
              j <= accum_reg_of_channel(devinfo, inst, info.tx,
                                        inst->exec_size - 1); j++)
            stall_on_dependency(
               st, reg_dependency_id(devinfo, elk_acc_reg(8), j));
      }

      if (inst->writes_flag(devinfo))
         stall_on_dependency(st, EU_DEPENDENCY_ID_FLAG0);
   }

   /* Execute the instruction. */
   execute_instruction(st, perf);

   /* Mark any source dependencies.  Send payloads remain in use after
    * issue, for perf.ls cycles.
    */
   if (inst->is_send_from_grf()) {
      for (unsigned i = 0; i < ARRAY_SIZE(inst->src); i++) {
         for (unsigned j = 0; j < regs_read(inst, i); j++)
            mark_read_dependency(
               st, perf, reg_dependency_id(devinfo, inst->src[i], j));
      }
   }

   if (inst->base_mrf != -1) {
      for (unsigned j = 0; j < inst->mlen; j++)
         mark_read_dependency(st, perf,
            reg_dependency_id(devinfo, elk_uvec_mrf(8, inst->base_mrf, 0), j));
   }

   /* Mark any destination dependencies. */
   if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) {
      for (unsigned j = 0; j < regs_written(inst); j++) {
         mark_write_dependency(st, perf,
                               reg_dependency_id(devinfo, inst->dst, j));
      }
   }

   if (inst->writes_accumulator_implicitly(devinfo)) {
      for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
           j <= accum_reg_of_channel(devinfo, inst, info.tx,
                                     inst->exec_size - 1); j++)
         mark_write_dependency(st, perf,
                               reg_dependency_id(devinfo, elk_acc_reg(8), j));
   }

   if (inst->writes_flag(devinfo))
      mark_write_dependency(st, perf, EU_DEPENDENCY_ID_FLAG0);
}
1399
1400 /**
1401 * Calculate the maximum possible throughput of the program compatible with
1402 * the cycle-count utilization estimated for each asynchronous unit, in
1403 * threads-per-cycle units.
1404 */
1405 float
calculate_thread_throughput(const state & st,float busy)1406 calculate_thread_throughput(const state &st, float busy)
1407 {
1408 for (unsigned i = 0; i < EU_NUM_UNITS; i++)
1409 busy = MAX2(busy, st.unit_busy[i]);
1410
1411 return 1.0 / busy;
1412 }
1413
/**
 * Estimate the performance of the specified shader.
 *
 * Walks the CFG in order, issuing each instruction into the scheduling
 * state via \p issue_instruction and accumulating the weighted front-end
 * cycle count.  Fills in \p p's per-block latencies, total latency and
 * throughput.
 */
void
calculate_performance(performance &p, const elk_backend_shader *s,
                      void (*issue_instruction)(
                         state &, const struct elk_isa_info *,
                         const elk_backend_instruction *),
                      unsigned dispatch_width)
{
   /* XXX - Note that the previous version of this code used worst-case
    *       scenario estimation of branching divergence for SIMD32 shaders,
    *       but this heuristic was removed to improve performance in common
    *       scenarios. Wider shader variants are less optimal when divergence
    *       is high, e.g. when application renders complex scene on a small
    *       surface. It is assumed that such renders are short, so their
    *       time doesn't matter and when it comes to the overall performance,
    *       they are dominated by more optimal larger renders.
    *
    *       It's possible that we could do better with divergence analysis
    *       by isolating branches which are 100% uniform.
    *
    *       Plumbing the trip counts from NIR loop analysis would allow us
    *       to do a better job regarding the loop weights.
    *
    *       In the meantime use values that roughly match the control flow
    *       weights used elsewhere in the compiler back-end.
    */
   /* NOTE(review): with discard_weight == 1.0 the HALT/HALT_TARGET weight
    * adjustments below are identity operations; the mechanism is kept in
    * place so a non-unit weight can be plugged in.
    */
   const float discard_weight = 1.0;
   const float loop_weight = 10;
   /* Number of ELK_OPCODE_HALT instructions seen so far. */
   unsigned halt_count = 0;
   /* Weighted front-end cycles accumulated so far. */
   unsigned elapsed = 0;
   state st;

   foreach_block(block, s->cfg) {
      const unsigned elapsed0 = elapsed;

      foreach_inst_in_block(elk_backend_instruction, inst, block) {
         const unsigned clock0 = st.unit_ready[EU_UNIT_FE];

         issue_instruction(st, &s->compiler->isa, inst);

         /* Leaving the region between a HALT and its HALT_TARGET: undo the
          * discard weighting applied at the first HALT.
          */
         if (inst->opcode == ELK_SHADER_OPCODE_HALT_TARGET && halt_count)
            st.weight /= discard_weight;

         /* Charge this instruction's front-end cycles, scaled by the
          * current execution-frequency weight.
          */
         elapsed += (st.unit_ready[EU_UNIT_FE] - clock0) * st.weight;

         /* Loop bodies are weighted up by an assumed trip count; code after
          * the first HALT is weighted by discard_weight.
          */
         if (inst->opcode == ELK_OPCODE_DO)
            st.weight *= loop_weight;
         else if (inst->opcode == ELK_OPCODE_WHILE)
            st.weight /= loop_weight;
         else if (inst->opcode == ELK_OPCODE_HALT && !halt_count++)
            st.weight *= discard_weight;
      }

      p.block_latency[block->num] = elapsed - elapsed0;
   }

   p.latency = elapsed;
   p.throughput = dispatch_width * calculate_thread_throughput(st, elapsed);
}
1475 }
1476
/**
 * Build the performance analysis for a scalar (FS back-end) shader,
 * modeled at the visitor's dispatch width.
 */
elk::performance::performance(const elk_fs_visitor *v) :
   block_latency(new unsigned[v->cfg->num_blocks])
{
   calculate_performance(*this, v, issue_fs_inst, v->dispatch_width);
}
1482
/**
 * Build the performance analysis for a vec4 back-end shader.  Vec4
 * programs are modeled at a fixed dispatch width of 8.
 */
elk::performance::performance(const vec4_visitor *v) :
   block_latency(new unsigned[v->cfg->num_blocks])
{
   calculate_performance(*this, v, issue_vec4_instruction, 8);
}
1488
/* Free the per-block latency array allocated by the constructors. */
elk::performance::~performance()
{
   delete[] block_latency;
}
1493