xref: /aosp_15_r20/external/mesa3d/src/intel/compiler/elk/elk_ir_performance.cpp (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 /*
2  * Copyright © 2020 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
24 #include "elk_eu.h"
25 #include "elk_fs.h"
26 #include "elk_vec4.h"
27 #include "elk_cfg.h"
28 
29 using namespace elk;
30 
31 namespace {
32    /**
33     * Enumeration representing the various asynchronous units that can run
34     * computations in parallel on behalf of a shader thread.
35     */
36    enum intel_eu_unit {
37       /** EU front-end. */
38       EU_UNIT_FE,
39       /** EU FPU0 (Note that co-issue to FPU1 is currently not modeled here). */
40       EU_UNIT_FPU,
41       /** Extended Math unit (AKA FPU1 on Gfx8-11, part of the EU on Gfx6+). */
42       EU_UNIT_EM,
43       /** Sampler shared function. */
44       EU_UNIT_SAMPLER,
45       /** Pixel Interpolator shared function. */
46       EU_UNIT_PI,
47       /** Unified Return Buffer shared function. */
48       EU_UNIT_URB,
49       /** Data Port Data Cache shared function. */
50       EU_UNIT_DP_DC,
51       /** Data Port Render Cache shared function. */
52       EU_UNIT_DP_RC,
53       /** Data Port Constant Cache shared function. */
54       EU_UNIT_DP_CC,
55       /** Message Gateway shared function. */
56       EU_UNIT_GATEWAY,
57       /** Thread Spawner shared function. */
58       EU_UNIT_SPAWNER,
59       /* EU_UNIT_VME, */
60       /* EU_UNIT_CRE, */
61       /** Number of asynchronous units currently tracked. */
62       EU_NUM_UNITS,
63       /** Dummy unit for instructions that don't consume runtime from the above. */
64       EU_UNIT_NULL = EU_NUM_UNITS
65    };
66 
67    /**
68     * Enumeration representing a computation result another computation can
69     * potentially depend on.
70     */
71    enum intel_eu_dependency_id {
72       /* Register part of the GRF. */
73       EU_DEPENDENCY_ID_GRF0 = 0,
74       /* Register part of the MRF.  Only used on Gfx4-6. */
75       EU_DEPENDENCY_ID_MRF0 = EU_DEPENDENCY_ID_GRF0 + XE2_MAX_GRF,
76       /* Address register part of the ARF. */
77       EU_DEPENDENCY_ID_ADDR0 = EU_DEPENDENCY_ID_MRF0 + 24,
78       /* Accumulator register part of the ARF. */
79       EU_DEPENDENCY_ID_ACCUM0 = EU_DEPENDENCY_ID_ADDR0 + 1,
80       /* Flag register part of the ARF. */
81       EU_DEPENDENCY_ID_FLAG0 = EU_DEPENDENCY_ID_ACCUM0 + 12,
82       /* SBID token write completion.  Only used on Gfx12+. */
83       EU_DEPENDENCY_ID_SBID_WR0 = EU_DEPENDENCY_ID_FLAG0 + 8,
84       /* SBID token read completion.  Only used on Gfx12+. */
85       EU_DEPENDENCY_ID_SBID_RD0 = EU_DEPENDENCY_ID_SBID_WR0 + 32,
86       /* Number of computation dependencies currently tracked. */
87       EU_NUM_DEPENDENCY_IDS = EU_DEPENDENCY_ID_SBID_RD0 + 32
88    };
89 
90    /**
91     * State of our modeling of the program execution.
92     */
93    struct state {
state__anoncc437f620111::state94       state() : unit_ready(), dep_ready(), unit_busy(), weight(1.0) {}
95       /**
96        * Time at which a given unit will be ready to execute the next
97        * computation, in clock units.
98        */
99       unsigned unit_ready[EU_NUM_UNITS];
100       /**
101        * Time at which an instruction dependent on a given dependency ID will
102        * be ready to execute, in clock units.
103        */
104       unsigned dep_ready[EU_NUM_DEPENDENCY_IDS];
105       /**
106        * Aggregated utilization of a given unit excluding idle cycles,
107        * in clock units.
108        */
109       float unit_busy[EU_NUM_UNITS];
110       /**
111        * Factor of the overhead of a computation accounted for in the
112        * aggregated utilization calculation.
113        */
114       float weight;
115    };
116 
117    /**
118     * Information derived from an IR instruction used to compute performance
119     * estimates.  Allows the timing calculation to work on both FS and VEC4
120     * instructions.
121     */
122    struct instruction_info {
instruction_info__anoncc437f620111::instruction_info123       instruction_info(const struct elk_isa_info *isa, const elk_fs_inst *inst) :
124          isa(isa), devinfo(isa->devinfo), op(inst->opcode),
125          td(inst->dst.type), sd(DIV_ROUND_UP(inst->size_written, REG_SIZE)),
126          tx(get_exec_type(inst)), sx(0), ss(0),
127          sc(elk_has_bank_conflict(isa, inst) ? sd : 0),
128          desc(inst->desc), sfid(inst->sfid)
129       {
130          for (unsigned i = 0; i < inst->sources; i++)
131             ss = MAX2(ss, DIV_ROUND_UP(inst->size_read(i), REG_SIZE));
132 
133          /* Convert the execution size to GRF units. */
134          sx = DIV_ROUND_UP(inst->exec_size * type_sz(tx), REG_SIZE);
135 
136          /* 32x32 integer multiplication has half the usual ALU throughput.
137           * Treat it as double-precision.
138           */
139          if ((inst->opcode == ELK_OPCODE_MUL || inst->opcode == ELK_OPCODE_MAD) &&
140              !elk_reg_type_is_floating_point(tx) && type_sz(tx) == 4 &&
141              type_sz(inst->src[0].type) == type_sz(inst->src[1].type))
142             tx = elk_int_type(8, tx == ELK_REGISTER_TYPE_D);
143       }
144 
instruction_info__anoncc437f620111::instruction_info145       instruction_info(const struct elk_isa_info *isa,
146                        const vec4_instruction *inst) :
147          isa(isa), devinfo(isa->devinfo), op(inst->opcode),
148          td(inst->dst.type), sd(DIV_ROUND_UP(inst->size_written, REG_SIZE)),
149          tx(get_exec_type(inst)), sx(0), ss(0), sc(0),
150          desc(inst->desc), sfid(inst->sfid)
151       {
152          /* Compute the maximum source size. */
153          for (unsigned i = 0; i < ARRAY_SIZE(inst->src); i++)
154             ss = MAX2(ss, DIV_ROUND_UP(inst->size_read(i), REG_SIZE));
155 
156          /* Convert the execution size to GRF units. */
157          sx = DIV_ROUND_UP(inst->exec_size * type_sz(tx), REG_SIZE);
158 
159          /* 32x32 integer multiplication has half the usual ALU throughput.
160           * Treat it as double-precision.
161           */
162          if ((inst->opcode == ELK_OPCODE_MUL || inst->opcode == ELK_OPCODE_MAD) &&
163              !elk_reg_type_is_floating_point(tx) && type_sz(tx) == 4 &&
164              type_sz(inst->src[0].type) == type_sz(inst->src[1].type))
165             tx = elk_int_type(8, tx == ELK_REGISTER_TYPE_D);
166       }
167 
168       /** ISA encoding information */
169       const struct elk_isa_info *isa;
170       /** Device information. */
171       const struct intel_device_info *devinfo;
172       /** Instruction opcode. */
173       elk_opcode op;
174       /** Destination type. */
175       elk_reg_type td;
176       /** Destination size in GRF units. */
177       unsigned sd;
178       /** Execution type. */
179       elk_reg_type tx;
180       /** Execution size in GRF units. */
181       unsigned sx;
182       /** Source size. */
183       unsigned ss;
184       /** Bank conflict penalty size in GRF units (equal to sd if non-zero). */
185       unsigned sc;
186       /** Send message descriptor. */
187       uint32_t desc;
188       /** Send message shared function ID. */
189       uint8_t sfid;
190    };
191 
192    /**
193     * Timing information of an instruction used to estimate the performance of
194     * the program.
195     */
196    struct perf_desc {
perf_desc__anoncc437f620111::perf_desc197       perf_desc(enum intel_eu_unit u, int df, int db,
198                 int ls, int ld, int la, int lf) :
199          u(u), df(df), db(db), ls(ls), ld(ld), la(la), lf(lf) {}
200 
201       /**
202        * Back-end unit its runtime shall be accounted to, in addition to the
203        * EU front-end which is always assumed to be involved.
204        */
205       enum intel_eu_unit u;
206       /**
207        * Overhead cycles from the time that the EU front-end starts executing
208        * the instruction until it's ready to execute the next instruction.
209        */
210       int df;
211       /**
212        * Overhead cycles from the time that the back-end starts executing the
213        * instruction until it's ready to execute the next instruction.
214        */
215       int db;
216       /**
217        * Latency cycles from the time that the back-end starts executing the
218        * instruction until its sources have been read from the register file.
219        */
220       int ls;
221       /**
222        * Latency cycles from the time that the back-end starts executing the
223        * instruction until its regular destination has been written to the
224        * register file.
225        */
226       int ld;
227       /**
228        * Latency cycles from the time that the back-end starts executing the
229        * instruction until its accumulator destination has been written to the
230        * ARF file.
231        *
232        * Note that this is an approximation of the real behavior of
233        * accumulating instructions in the hardware: Instead of modeling a pair
234        * of back-to-back accumulating instructions as a first computation with
235        * latency equal to ld followed by another computation with a
236        * mid-pipeline stall (e.g. after the "M" part of a MAC instruction), we
237        * model the stall as if it occurred at the top of the pipeline, with
238        * the latency of the accumulator computation offset accordingly.
239        */
240       int la;
241       /**
242        * Latency cycles from the time that the back-end starts executing the
243        * instruction until its flag destination has been written to the ARF
244        * file.
245        */
246       int lf;
247    };
248 
249    /**
250     * Compute the timing information of an instruction based on any relevant
251     * information from the IR and a number of parameters specifying a linear
252     * approximation: Parameter X_Y specifies the derivative of timing X
253     * relative to info field Y, while X_1 specifies the independent term of
254     * the approximation of timing X.
255     */
256    perf_desc
calculate_desc(const instruction_info & info,enum intel_eu_unit u,int df_1,int df_sd,int df_sc,int db_1,int db_sx,int ls_1,int ld_1,int la_1,int lf_1,int l_ss,int l_sd)257    calculate_desc(const instruction_info &info, enum intel_eu_unit u,
258                   int df_1, int df_sd, int df_sc,
259                   int db_1, int db_sx,
260                   int ls_1, int ld_1, int la_1, int lf_1,
261                   int l_ss, int l_sd)
262    {
263       return perf_desc(u, df_1 + df_sd * int(info.sd) + df_sc * int(info.sc),
264                           db_1 + db_sx * int(info.sx),
265                           ls_1 + l_ss * int(info.ss),
266                           ld_1 + l_ss * int(info.ss) + l_sd * int(info.sd),
267                           la_1, lf_1);
268    }
269 
270    /**
271     * Compute the timing information of an instruction based on any relevant
272     * information from the IR and a number of linear approximation parameters
273     * hard-coded for each IR instruction.
274     *
275     * Most timing parameters are obtained from the multivariate linear
276     * regression of a sample of empirical timings measured using the tm0
277     * register (as can be done today by using the shader_time debugging
278     * option).  The Gfx4-5 math timings are obtained from BSpec Volume 5c.3
279     * "Shared Functions - Extended Math", Section 3.2 "Performance".
280     * Parameters marked XXX shall be considered low-quality, they're possibly
281     * high variance or completely guessed in cases where experimental data was
282     * unavailable.
283     */
284    const perf_desc
instruction_desc(const instruction_info & info)285    instruction_desc(const instruction_info &info)
286    {
287       const struct intel_device_info *devinfo = info.devinfo;
288 
289       switch (info.op) {
290       case ELK_OPCODE_SEL:
291       case ELK_OPCODE_NOT:
292       case ELK_OPCODE_AND:
293       case ELK_OPCODE_OR:
294       case ELK_OPCODE_XOR:
295       case ELK_OPCODE_SHR:
296       case ELK_OPCODE_SHL:
297       case ELK_OPCODE_DIM:
298       case ELK_OPCODE_ASR:
299       case ELK_OPCODE_CMPN:
300       case ELK_OPCODE_F16TO32:
301       case ELK_OPCODE_BFREV:
302       case ELK_OPCODE_BFI1:
303       case ELK_OPCODE_AVG:
304       case ELK_OPCODE_FRC:
305       case ELK_OPCODE_RNDU:
306       case ELK_OPCODE_RNDD:
307       case ELK_OPCODE_RNDE:
308       case ELK_OPCODE_RNDZ:
309       case ELK_OPCODE_MAC:
310       case ELK_OPCODE_MACH:
311       case ELK_OPCODE_LZD:
312       case ELK_OPCODE_FBH:
313       case ELK_OPCODE_FBL:
314       case ELK_OPCODE_CBIT:
315       case ELK_OPCODE_ADDC:
316       case ELK_OPCODE_SUBB:
317       case ELK_OPCODE_SAD2:
318       case ELK_OPCODE_SADA2:
319       case ELK_OPCODE_LINE:
320       case ELK_OPCODE_NOP:
321       case ELK_SHADER_OPCODE_CLUSTER_BROADCAST:
322       case ELK_SHADER_OPCODE_SCRATCH_HEADER:
323       case ELK_FS_OPCODE_DDX_COARSE:
324       case ELK_FS_OPCODE_DDX_FINE:
325       case ELK_FS_OPCODE_DDY_COARSE:
326       case ELK_FS_OPCODE_PIXEL_X:
327       case ELK_FS_OPCODE_PIXEL_Y:
328       case ELK_FS_OPCODE_SET_SAMPLE_ID:
329       case ELK_VEC4_OPCODE_MOV_BYTES:
330       case ELK_VEC4_OPCODE_UNPACK_UNIFORM:
331       case ELK_VEC4_OPCODE_DOUBLE_TO_F32:
332       case ELK_VEC4_OPCODE_DOUBLE_TO_D32:
333       case ELK_VEC4_OPCODE_DOUBLE_TO_U32:
334       case ELK_VEC4_OPCODE_TO_DOUBLE:
335       case ELK_VEC4_OPCODE_PICK_LOW_32BIT:
336       case ELK_VEC4_OPCODE_PICK_HIGH_32BIT:
337       case ELK_VEC4_OPCODE_SET_LOW_32BIT:
338       case ELK_VEC4_OPCODE_SET_HIGH_32BIT:
339       case ELK_VEC4_OPCODE_ZERO_OOB_PUSH_REGS:
340       case ELK_GS_OPCODE_SET_DWORD_2:
341       case ELK_GS_OPCODE_SET_WRITE_OFFSET:
342       case ELK_GS_OPCODE_SET_VERTEX_COUNT:
343       case ELK_GS_OPCODE_PREPARE_CHANNEL_MASKS:
344       case ELK_GS_OPCODE_SET_CHANNEL_MASKS:
345       case ELK_GS_OPCODE_GET_INSTANCE_ID:
346       case ELK_GS_OPCODE_SET_PRIMITIVE_ID:
347       case ELK_GS_OPCODE_SVB_SET_DST_INDEX:
348       case ELK_TCS_OPCODE_SRC0_010_IS_ZERO:
349       case ELK_TCS_OPCODE_GET_PRIMITIVE_ID:
350       case ELK_TES_OPCODE_GET_PRIMITIVE_ID:
351       case ELK_SHADER_OPCODE_READ_SR_REG:
352          if (devinfo->ver >= 8) {
353             if (type_sz(info.tx) > 4)
354                return calculate_desc(info, EU_UNIT_FPU, 0, 4, 0, 0, 4,
355                                      0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
356             else
357                return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
358                                      0, 8, 4, 12, 0, 0);
359          } else if (devinfo->verx10 >= 75) {
360             return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
361                                   0, 10, 6 /* XXX */, 16, 0, 0);
362          } else {
363             return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
364                                   0, 12, 8 /* XXX */, 18, 0, 0);
365          }
366 
367       case ELK_OPCODE_MOV:
368       case ELK_OPCODE_CMP:
369       case ELK_OPCODE_ADD:
370       case ELK_OPCODE_MUL:
371       case ELK_SHADER_OPCODE_MOV_RELOC_IMM:
372       case ELK_VEC4_OPCODE_MOV_FOR_SCRATCH:
373          if (devinfo->ver >= 8) {
374             if (type_sz(info.tx) > 4)
375                return calculate_desc(info, EU_UNIT_FPU, 0, 4, 0, 0, 4,
376                                      0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
377             else
378                return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
379                                      0, 8, 4, 12, 0, 0);
380          } else if (devinfo->verx10 >= 75) {
381             if (info.tx == ELK_REGISTER_TYPE_F)
382                return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
383                                      0, 12, 8 /* XXX */, 18, 0, 0);
384             else
385                return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
386                                      0, 10, 6 /* XXX */, 16, 0, 0);
387          } else if (devinfo->ver >= 7) {
388             if (info.tx == ELK_REGISTER_TYPE_F)
389                return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
390                                      0, 14, 10 /* XXX */, 20, 0, 0);
391             else
392                return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
393                                      0, 12, 8 /* XXX */, 18, 0, 0);
394          } else {
395             return calculate_desc(info, EU_UNIT_FPU, 0, 2 /* XXX */, 0,
396                                   0, 2 /* XXX */,
397                                   0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */,
398                                   0, 0);
399          }
400 
401       case ELK_OPCODE_BFE:
402       case ELK_OPCODE_BFI2:
403       case ELK_OPCODE_CSEL:
404          if (devinfo->ver >= 8)
405             return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
406                                   0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
407          else if (devinfo->verx10 >= 75)
408             return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
409                                   0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
410          else if (devinfo->ver >= 7)
411             return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
412                                   0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
413          else
414             abort();
415 
416       case ELK_OPCODE_MAD:
417          if (devinfo->ver >= 8) {
418             if (type_sz(info.tx) > 4)
419                return calculate_desc(info, EU_UNIT_FPU, 0, 4, 1, 0, 4,
420                                      0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
421             else
422                return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
423                                      0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
424          } else if (devinfo->verx10 >= 75) {
425             if (info.tx == ELK_REGISTER_TYPE_F)
426                return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
427                                      0, 12, 8 /* XXX */, 18, 0, 0);
428             else
429                return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
430                                      0, 10, 6 /* XXX */, 16, 0, 0);
431          } else if (devinfo->ver >= 7) {
432             if (info.tx == ELK_REGISTER_TYPE_F)
433                return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
434                                      0, 14, 10 /* XXX */, 20, 0, 0);
435             else
436                return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
437                                      0, 12, 8 /* XXX */, 18, 0, 0);
438          } else if (devinfo->ver >= 6) {
439             return calculate_desc(info, EU_UNIT_FPU, 0, 2 /* XXX */, 1 /* XXX */,
440                                   0, 2 /* XXX */,
441                                   0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */,
442                                   0, 0);
443          } else {
444             abort();
445          }
446 
447       case ELK_OPCODE_F32TO16:
448          if (devinfo->ver >= 8)
449             return calculate_desc(info, EU_UNIT_FPU, 0, 4, 0, 0, 4,
450                                   0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
451          else if (devinfo->verx10 >= 75)
452             return calculate_desc(info, EU_UNIT_FPU, 0, 4, 0, 0, 4,
453                                   0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
454          else if (devinfo->ver >= 7)
455             return calculate_desc(info, EU_UNIT_FPU, 0, 4, 0, 0, 4,
456                                   0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
457          else
458             abort();
459 
460       case ELK_OPCODE_DP4:
461       case ELK_OPCODE_DPH:
462       case ELK_OPCODE_DP3:
463       case ELK_OPCODE_DP2:
464          if (devinfo->ver >= 8)
465             return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
466                                   0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
467          else if (devinfo->verx10 >= 75)
468             return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
469                                   0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
470          else
471             return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
472                                   0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
473 
474       case ELK_SHADER_OPCODE_RCP:
475       case ELK_SHADER_OPCODE_RSQ:
476       case ELK_SHADER_OPCODE_SQRT:
477       case ELK_SHADER_OPCODE_EXP2:
478       case ELK_SHADER_OPCODE_LOG2:
479       case ELK_SHADER_OPCODE_SIN:
480       case ELK_SHADER_OPCODE_COS:
481       case ELK_SHADER_OPCODE_POW:
482       case ELK_SHADER_OPCODE_INT_QUOTIENT:
483       case ELK_SHADER_OPCODE_INT_REMAINDER:
484          if (devinfo->ver >= 6) {
485             switch (info.op) {
486             case ELK_SHADER_OPCODE_RCP:
487             case ELK_SHADER_OPCODE_RSQ:
488             case ELK_SHADER_OPCODE_SQRT:
489             case ELK_SHADER_OPCODE_EXP2:
490             case ELK_SHADER_OPCODE_LOG2:
491             case ELK_SHADER_OPCODE_SIN:
492             case ELK_SHADER_OPCODE_COS:
493                if (devinfo->ver >= 8)
494                   return calculate_desc(info, EU_UNIT_EM, -2, 4, 0, 0, 4,
495                                         0, 16, 0, 0, 0, 0);
496                else if (devinfo->verx10 >= 75)
497                   return calculate_desc(info, EU_UNIT_EM, 0, 2, 0, 0, 2,
498                                         0, 12, 0, 0, 0, 0);
499                else
500                   return calculate_desc(info, EU_UNIT_EM, 0, 2, 0, 0, 2,
501                                         0, 14, 0, 0, 0, 0);
502 
503             case ELK_SHADER_OPCODE_POW:
504                if (devinfo->ver >= 8)
505                   return calculate_desc(info, EU_UNIT_EM, -2, 4, 0, 0, 8,
506                                         0, 24, 0, 0, 0, 0);
507                else if (devinfo->verx10 >= 75)
508                   return calculate_desc(info, EU_UNIT_EM, 0, 2, 0, 0, 4,
509                                         0, 20, 0, 0, 0, 0);
510                else
511                   return calculate_desc(info, EU_UNIT_EM, 0, 2, 0, 0, 4,
512                                         0, 22, 0, 0, 0, 0);
513 
514             case ELK_SHADER_OPCODE_INT_QUOTIENT:
515             case ELK_SHADER_OPCODE_INT_REMAINDER:
516                return calculate_desc(info, EU_UNIT_EM, 2, 0, 0, 26, 0,
517                                      0, 28 /* XXX */, 0, 0, 0, 0);
518 
519             default:
520                abort();
521             }
522          } else {
523             switch (info.op) {
524             case ELK_SHADER_OPCODE_RCP:
525                return calculate_desc(info, EU_UNIT_EM, 2, 0, 0, 0, 8,
526                                      0, 22, 0, 0, 0, 8);
527 
528             case ELK_SHADER_OPCODE_RSQ:
529                return calculate_desc(info, EU_UNIT_EM, 2, 0, 0, 0, 16,
530                                      0, 44, 0, 0, 0, 8);
531 
532             case ELK_SHADER_OPCODE_INT_QUOTIENT:
533             case ELK_SHADER_OPCODE_SQRT:
534             case ELK_SHADER_OPCODE_LOG2:
535                return calculate_desc(info, EU_UNIT_EM, 2, 0, 0, 0, 24,
536                                      0, 66, 0, 0, 0, 8);
537 
538             case ELK_SHADER_OPCODE_INT_REMAINDER:
539             case ELK_SHADER_OPCODE_EXP2:
540                return calculate_desc(info, EU_UNIT_EM, 2, 0, 0, 0, 32,
541                                      0, 88, 0, 0, 0, 8);
542 
543             case ELK_SHADER_OPCODE_SIN:
544             case ELK_SHADER_OPCODE_COS:
545                return calculate_desc(info, EU_UNIT_EM, 2, 0, 0, 0, 48,
546                                      0, 132, 0, 0, 0, 8);
547 
548             case ELK_SHADER_OPCODE_POW:
549                return calculate_desc(info, EU_UNIT_EM, 2, 0, 0, 0, 64,
550                                      0, 176, 0, 0, 0, 8);
551 
552             default:
553                abort();
554             }
555          }
556 
557       case ELK_OPCODE_DO:
558          if (devinfo->ver >= 6)
559             return calculate_desc(info, EU_UNIT_NULL, 0, 0, 0, 0, 0,
560                                   0, 0, 0, 0, 0, 0);
561          else
562             return calculate_desc(info, EU_UNIT_NULL, 2 /* XXX */, 0, 0, 0, 0,
563                                   0, 0, 0, 0, 0, 0);
564 
565       case ELK_OPCODE_IF:
566       case ELK_OPCODE_ELSE:
567       case ELK_OPCODE_ENDIF:
568       case ELK_OPCODE_WHILE:
569       case ELK_OPCODE_BREAK:
570       case ELK_OPCODE_CONTINUE:
571       case ELK_OPCODE_HALT:
572          if (devinfo->ver >= 8)
573             return calculate_desc(info, EU_UNIT_NULL, 8, 0, 0, 0, 0,
574                                   0, 0, 0, 0, 0, 0);
575          else if (devinfo->verx10 >= 75)
576             return calculate_desc(info, EU_UNIT_NULL, 6, 0, 0, 0, 0,
577                                   0, 0, 0, 0, 0, 0);
578          else
579             return calculate_desc(info, EU_UNIT_NULL, 2, 0, 0, 0, 0,
580                                   0, 0, 0, 0, 0, 0);
581 
582       case ELK_FS_OPCODE_LINTERP:
583          if (devinfo->ver >= 8)
584             return calculate_desc(info, EU_UNIT_FPU, 0, 4, 0, 0, 4,
585                                   0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
586          else if (devinfo->verx10 >= 75)
587             return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
588                                   0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
589          else
590             return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
591                                   0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
592 
593       case ELK_OPCODE_LRP:
594          if (devinfo->ver >= 8)
595             return calculate_desc(info, EU_UNIT_FPU, 0, 4, 1, 0, 4,
596                                   0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
597          else if (devinfo->verx10 >= 75)
598             return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
599                                   0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
600          else if (devinfo->ver >= 6)
601             return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
602                                   0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
603          else
604             abort();
605 
606       case ELK_FS_OPCODE_PACK_HALF_2x16_SPLIT:
607          if (devinfo->ver >= 8)
608             return calculate_desc(info, EU_UNIT_FPU, 16, 6, 0, 0, 6,
609                                   0, 8 /* XXX */, 4 /* XXX */,
610                                   12 /* XXX */, 0, 0);
611          else if (devinfo->verx10 >= 75)
612             return calculate_desc(info, EU_UNIT_FPU, 20, 6, 0, 0, 6,
613                                   0, 10 /* XXX */, 6 /* XXX */,
614                                   16 /* XXX */, 0, 0);
615          else if (devinfo->ver >= 7)
616             return calculate_desc(info, EU_UNIT_FPU, 24, 6, 0, 0, 6,
617                                   0, 12 /* XXX */, 8 /* XXX */,
618                                   18 /* XXX */, 0, 0);
619          else
620             abort();
621 
622       case ELK_SHADER_OPCODE_MOV_INDIRECT:
623          if (devinfo->ver >= 8)
624             return calculate_desc(info, EU_UNIT_FPU, 34, 0, 0, 34, 0,
625                                   0, 8 /* XXX */, 4 /* XXX */,
626                                   12 /* XXX */, 0, 0);
627          else if (devinfo->verx10 >= 75)
628             return calculate_desc(info, EU_UNIT_FPU, 34, 0, 0, 34, 0,
629                                   0, 10 /* XXX */, 6 /* XXX */,
630                                   16 /* XXX */, 0, 0);
631          else
632             return calculate_desc(info, EU_UNIT_FPU, 34, 0, 0, 34, 0,
633                                   0, 12 /* XXX */, 8 /* XXX */,
634                                   18 /* XXX */, 0, 0);
635 
636       case ELK_SHADER_OPCODE_BROADCAST:
637          if (devinfo->ver >= 8)
638             return calculate_desc(info, EU_UNIT_FPU, 18, 0, 0, 4, 0,
639                                   0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
640          else if (devinfo->verx10 >= 75)
641             return calculate_desc(info, EU_UNIT_FPU, 18, 0, 0, 4, 0,
642                                   0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
643          else if (devinfo->ver >= 7)
644             return calculate_desc(info, EU_UNIT_FPU, 20, 0, 0, 4, 0,
645                                   0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
646          else
647             abort();
648 
649       case ELK_SHADER_OPCODE_FIND_LIVE_CHANNEL:
650       case ELK_SHADER_OPCODE_FIND_LAST_LIVE_CHANNEL:
651          if (devinfo->ver >= 8)
652             return calculate_desc(info, EU_UNIT_FPU, 2, 0, 0, 2, 0,
653                                   0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
654          else if (devinfo->verx10 >= 75)
655             return calculate_desc(info, EU_UNIT_FPU, 36, 0, 0, 6, 0,
656                                   0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
657          else if (devinfo->ver >= 7)
658             return calculate_desc(info, EU_UNIT_FPU, 40, 0, 0, 6, 0,
659                                   0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
660          else
661             abort();
662 
663       case ELK_SHADER_OPCODE_RND_MODE:
664       case ELK_SHADER_OPCODE_FLOAT_CONTROL_MODE:
665          if (devinfo->ver >= 8)
666             return calculate_desc(info, EU_UNIT_FPU, 20 /* XXX */, 0, 0,
667                                   4 /* XXX */, 0,
668                                   0, 0, 0, 0, 0, 0);
669          else if (devinfo->verx10 >= 75)
670             return calculate_desc(info, EU_UNIT_FPU, 24 /* XXX */, 0, 0,
671                                   4 /* XXX */, 0,
672                                   0, 0, 0, 0, 0, 0);
673          else if (devinfo->ver >= 6)
674             return calculate_desc(info, EU_UNIT_FPU, 28 /* XXX */, 0, 0,
675                                   4 /* XXX */, 0,
676                                   0, 0, 0, 0, 0, 0);
677          else
678             abort();
679 
680       case ELK_SHADER_OPCODE_SHUFFLE:
681          if (devinfo->ver >= 8)
682             return calculate_desc(info, EU_UNIT_FPU, 42 /* XXX */, 0, 0,
683                                   42 /* XXX */, 0,
684                                   0, 8 /* XXX */, 4 /* XXX */,
685                                   12 /* XXX */, 0, 0);
686          else if (devinfo->verx10 >= 75)
687             return calculate_desc(info, EU_UNIT_FPU, 0, 44 /* XXX */, 0,
688                                   0, 44 /* XXX */,
689                                   0, 10 /* XXX */, 6 /* XXX */,
690                                   16 /* XXX */, 0, 0);
691          else if (devinfo->ver >= 6)
692             return calculate_desc(info, EU_UNIT_FPU, 0, 46 /* XXX */, 0,
693                                   0, 46 /* XXX */,
694                                   0, 12 /* XXX */, 8 /* XXX */,
695                                   18 /* XXX */, 0, 0);
696          else
697             abort();
698 
699       case ELK_SHADER_OPCODE_SEL_EXEC:
700          if (devinfo->ver >= 8)
701             return calculate_desc(info, EU_UNIT_FPU, 8 /* XXX */, 4 /* XXX */, 0,
702                                   0, 4 /* XXX */,
703                                   0, 8 /* XXX */, 4 /* XXX */,
704                                   12 /* XXX */, 0, 0);
705          else if (devinfo->verx10 >= 75)
706             return calculate_desc(info, EU_UNIT_FPU, 10 /* XXX */, 4 /* XXX */, 0,
707                                   0, 4 /* XXX */,
708                                   0, 10 /* XXX */, 6 /* XXX */,
709                                   16 /* XXX */, 0, 0);
710          else
711             return calculate_desc(info, EU_UNIT_FPU, 12 /* XXX */, 4 /* XXX */, 0,
712                                   0, 4 /* XXX */,
713                                   0, 12 /* XXX */, 8 /* XXX */,
714                                   18 /* XXX */, 0, 0);
715 
716       case ELK_SHADER_OPCODE_QUAD_SWIZZLE:
717          if (devinfo->ver >= 8)
718             return calculate_desc(info, EU_UNIT_FPU, 0 /* XXX */, 8 /* XXX */, 0,
719                                   0, 8 /* XXX */,
720                                   0, 8 /* XXX */, 4 /* XXX */,
721                                   12 /* XXX */, 0, 0);
722          else if (devinfo->verx10 >= 75)
723             return calculate_desc(info, EU_UNIT_FPU, 0 /* XXX */, 8 /* XXX */, 0,
724                                   0, 8 /* XXX */,
725                                   0, 10 /* XXX */, 6 /* XXX */,
726                                   16 /* XXX */, 0, 0);
727          else
728             return calculate_desc(info, EU_UNIT_FPU, 0 /* XXX */, 8 /* XXX */, 0,
729                                   0, 8 /* XXX */,
730                                   0, 12 /* XXX */, 8 /* XXX */,
731                                   18 /* XXX */, 0, 0);
732 
733       case ELK_FS_OPCODE_DDY_FINE:
734          if (devinfo->ver >= 8)
735             return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
736                                   0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
737          else if (devinfo->verx10 >= 75)
738             return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
739                                   0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
740          else
741             return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
742                                   0, 14, 10 /* XXX */, 20 /* XXX */, 0, 0);
743 
744       case ELK_FS_OPCODE_LOAD_LIVE_CHANNELS:
745          if (devinfo->ver >= 8)
746             return calculate_desc(info, EU_UNIT_FPU, 0, 2 /* XXX */, 0,
747                                   0, 2 /* XXX */,
748                                   0, 0, 0, 8 /* XXX */, 0, 0);
749          else
750             abort();
751 
752       case ELK_VEC4_OPCODE_PACK_BYTES:
753          if (devinfo->ver >= 8)
754             return calculate_desc(info, EU_UNIT_FPU, 4 /* XXX */, 0, 0,
755                                   4 /* XXX */, 0,
756                                   0, 8 /* XXX */, 4 /* XXX */, 12 /* XXX */,
757                                   0, 0);
758          else if (devinfo->verx10 >= 75)
759             return calculate_desc(info, EU_UNIT_FPU, 4 /* XXX */, 0, 0,
760                                   4 /* XXX */, 0,
761                                   0, 10 /* XXX */, 6 /* XXX */, 16 /* XXX */,
762                                   0, 0);
763          else
764             return calculate_desc(info, EU_UNIT_FPU, 4 /* XXX */, 0, 0,
765                                   4 /* XXX */, 0,
766                                   0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */,
767                                   0, 0);
768 
769       case ELK_VS_OPCODE_UNPACK_FLAGS_SIMD4X2:
770       case ELK_TCS_OPCODE_GET_INSTANCE_ID:
771       case ELK_VEC4_TCS_OPCODE_SET_INPUT_URB_OFFSETS:
772       case ELK_VEC4_TCS_OPCODE_SET_OUTPUT_URB_OFFSETS:
773       case ELK_TES_OPCODE_CREATE_INPUT_READ_HEADER:
774          if (devinfo->ver >= 8)
775             return calculate_desc(info, EU_UNIT_FPU, 22 /* XXX */, 0, 0,
776                                   6 /* XXX */, 0,
777                                   0, 8 /* XXX */, 4 /* XXX */, 12 /* XXX */,
778                                   0, 0);
779          else if (devinfo->verx10 >= 75)
780             return calculate_desc(info, EU_UNIT_FPU, 26 /* XXX */, 0, 0,
781                                   6 /* XXX */, 0,
782                                   0, 10 /* XXX */, 6 /* XXX */, 16 /* XXX */,
783                                   0, 0);
784          else
785             return calculate_desc(info, EU_UNIT_FPU, 30 /* XXX */, 0, 0,
786                                   6 /* XXX */, 0,
787                                   0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */,
788                                   0, 0);
789 
790       case ELK_GS_OPCODE_FF_SYNC_SET_PRIMITIVES:
791       case ELK_TCS_OPCODE_CREATE_BARRIER_HEADER:
792          if (devinfo->ver >= 8)
793             return calculate_desc(info, EU_UNIT_FPU, 32 /* XXX */, 0, 0,
794                                   8 /* XXX */, 0,
795                                   0, 8 /* XXX */, 4 /* XXX */, 12 /* XXX */,
796                                   0, 0);
797          else if (devinfo->verx10 >= 75)
798             return calculate_desc(info, EU_UNIT_FPU, 38 /* XXX */, 0, 0,
799                                   8 /* XXX */, 0,
800                                   0, 10 /* XXX */, 6 /* XXX */, 16 /* XXX */,
801                                   0, 0);
802          else if (devinfo->ver >= 6)
803             return calculate_desc(info, EU_UNIT_FPU, 44 /* XXX */, 0, 0,
804                                   8 /* XXX */, 0,
805                                   0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */,
806                                   0, 0);
807          else
808             abort();
809 
810       case ELK_TES_OPCODE_ADD_INDIRECT_URB_OFFSET:
811          if (devinfo->ver >= 8)
812             return calculate_desc(info, EU_UNIT_FPU, 12 /* XXX */, 0, 0,
813                                   4 /* XXX */, 0,
814                                   0, 8 /* XXX */, 4 /* XXX */, 12 /* XXX */,
815                                   0, 0);
816          else if (devinfo->verx10 >= 75)
817             return calculate_desc(info, EU_UNIT_FPU, 14 /* XXX */, 0, 0,
818                                   4 /* XXX */, 0,
819                                   0, 10 /* XXX */, 6 /* XXX */, 16 /* XXX */,
820                                   0, 0);
821          else if (devinfo->ver >= 7)
822             return calculate_desc(info, EU_UNIT_FPU, 16 /* XXX */, 0, 0,
823                                   4 /* XXX */, 0,
824                                   0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */,
825                                   0, 0);
826          else
827             abort();
828 
829       case ELK_SHADER_OPCODE_TEX:
830       case ELK_FS_OPCODE_TXB:
831       case ELK_SHADER_OPCODE_TXD:
832       case ELK_SHADER_OPCODE_TXF:
833       case ELK_SHADER_OPCODE_TXF_LZ:
834       case ELK_SHADER_OPCODE_TXL:
835       case ELK_SHADER_OPCODE_TXL_LZ:
836       case ELK_SHADER_OPCODE_TXF_CMS:
837       case ELK_SHADER_OPCODE_TXF_CMS_W:
838       case ELK_SHADER_OPCODE_TXF_UMS:
839       case ELK_SHADER_OPCODE_TXF_MCS:
840       case ELK_SHADER_OPCODE_TXS:
841       case ELK_SHADER_OPCODE_LOD:
842       case ELK_SHADER_OPCODE_GET_BUFFER_SIZE:
843       case ELK_SHADER_OPCODE_TG4:
844       case ELK_SHADER_OPCODE_TG4_OFFSET:
845       case ELK_SHADER_OPCODE_SAMPLEINFO:
846       case ELK_FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GFX4:
847          return calculate_desc(info, EU_UNIT_SAMPLER, 2, 0, 0, 0, 16 /* XXX */,
848                                8 /* XXX */, 750 /* XXX */, 0, 0,
849                                2 /* XXX */, 0);
850 
851       case ELK_VEC4_OPCODE_URB_READ:
852       case ELK_VEC4_VS_OPCODE_URB_WRITE:
853       case ELK_VEC4_GS_OPCODE_URB_WRITE:
854       case ELK_VEC4_GS_OPCODE_URB_WRITE_ALLOCATE:
855       case ELK_GS_OPCODE_THREAD_END:
856       case ELK_GS_OPCODE_FF_SYNC:
857       case ELK_VEC4_TCS_OPCODE_URB_WRITE:
858       case ELK_TCS_OPCODE_RELEASE_INPUT:
859       case ELK_TCS_OPCODE_THREAD_END:
860          return calculate_desc(info, EU_UNIT_URB, 2, 0, 0, 0, 6 /* XXX */,
861                                32 /* XXX */, 200 /* XXX */, 0, 0, 0, 0);
862 
863       case ELK_SHADER_OPCODE_MEMORY_FENCE:
864       case ELK_SHADER_OPCODE_INTERLOCK:
865          switch (info.sfid) {
866          case GFX6_SFID_DATAPORT_RENDER_CACHE:
867             if (devinfo->ver >= 7)
868                return calculate_desc(info, EU_UNIT_DP_RC, 2, 0, 0, 30 /* XXX */, 0,
869                                      10 /* XXX */, 300 /* XXX */, 0, 0, 0, 0);
870             else
871                abort();
872 
873          case ELK_SFID_URB:
874          case GFX7_SFID_DATAPORT_DATA_CACHE:
875          case HSW_SFID_DATAPORT_DATA_CACHE_1:
876             if (devinfo->ver >= 7)
877                return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0, 30 /* XXX */, 0,
878                                      10 /* XXX */, 100 /* XXX */, 0, 0, 0, 0);
879             else
880                abort();
881 
882          default:
883             abort();
884          }
885 
886       case ELK_SHADER_OPCODE_GFX4_SCRATCH_READ:
887       case ELK_SHADER_OPCODE_GFX4_SCRATCH_WRITE:
888       case ELK_SHADER_OPCODE_GFX7_SCRATCH_READ:
889          return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0, 0, 8 /* XXX */,
890                                10 /* XXX */, 100 /* XXX */, 0, 0, 0, 0);
891 
892       case ELK_VEC4_OPCODE_UNTYPED_ATOMIC:
893          if (devinfo->ver >= 7)
894             return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0,
895                                   30 /* XXX */, 400 /* XXX */,
896                                   10 /* XXX */, 100 /* XXX */, 0, 0,
897                                   0, 400 /* XXX */);
898          else
899             abort();
900 
901       case ELK_VEC4_OPCODE_UNTYPED_SURFACE_READ:
902       case ELK_VEC4_OPCODE_UNTYPED_SURFACE_WRITE:
903          if (devinfo->ver >= 7)
904             return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0,
905                                   0, 20 /* XXX */,
906                                   10 /* XXX */, 100 /* XXX */, 0, 0,
907                                   0, 0);
908          else
909             abort();
910 
911       case ELK_FS_OPCODE_FB_WRITE:
912       case ELK_FS_OPCODE_REP_FB_WRITE:
913          return calculate_desc(info, EU_UNIT_DP_RC, 2, 0, 0, 0, 450 /* XXX */,
914                                10 /* XXX */, 300 /* XXX */, 0, 0, 0, 0);
915 
916       case ELK_GS_OPCODE_SVB_WRITE:
917          if (devinfo->ver >= 6)
918             return calculate_desc(info, EU_UNIT_DP_RC, 2 /* XXX */, 0, 0,
919                                   0, 450 /* XXX */,
920                                   10 /* XXX */, 300 /* XXX */, 0, 0,
921                                   0, 0);
922          else
923             abort();
924 
925       case ELK_FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
926          return calculate_desc(info, EU_UNIT_DP_CC, 2, 0, 0, 0, 16 /* XXX */,
927                                10 /* XXX */, 100 /* XXX */, 0, 0, 0, 0);
928 
929       case ELK_VS_OPCODE_PULL_CONSTANT_LOAD:
930       case ELK_VS_OPCODE_PULL_CONSTANT_LOAD_GFX7:
931          return calculate_desc(info, EU_UNIT_SAMPLER, 2, 0, 0, 0, 16,
932                                8, 750, 0, 0, 2, 0);
933 
934       case ELK_FS_OPCODE_INTERPOLATE_AT_SAMPLE:
935       case ELK_FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
936       case ELK_FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
937          if (devinfo->ver >= 7)
938             return calculate_desc(info, EU_UNIT_PI, 2, 0, 0, 14 /* XXX */, 0,
939                                   0, 90 /* XXX */, 0, 0, 0, 0);
940          else
941             abort();
942 
943       case ELK_SHADER_OPCODE_BARRIER:
944          if (devinfo->ver >= 7)
945             return calculate_desc(info, EU_UNIT_GATEWAY, 90 /* XXX */, 0, 0,
946                                   0 /* XXX */, 0,
947                                   0, 0, 0, 0, 0, 0);
948          else
949             abort();
950 
951       case ELK_CS_OPCODE_CS_TERMINATE:
952          if (devinfo->ver >= 7)
953             return calculate_desc(info, EU_UNIT_SPAWNER, 2, 0, 0, 0 /* XXX */, 0,
954                                   10 /* XXX */, 0, 0, 0, 0, 0);
955          else
956             abort();
957 
958       case ELK_SHADER_OPCODE_SEND:
959          switch (info.sfid) {
960          case GFX6_SFID_DATAPORT_CONSTANT_CACHE:
961             if (devinfo->ver >= 7) {
962                /* See ELK_FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD */
963                return calculate_desc(info, EU_UNIT_DP_CC, 2, 0, 0, 0, 16 /* XXX */,
964                                      10 /* XXX */, 100 /* XXX */, 0, 0, 0, 0);
965             } else {
966                abort();
967             }
968          case GFX6_SFID_DATAPORT_RENDER_CACHE:
969             if (devinfo->ver >= 7) {
970                switch (elk_dp_desc_msg_type(devinfo, info.desc)) {
971                case GFX7_DATAPORT_RC_TYPED_ATOMIC_OP:
972                   return calculate_desc(info, EU_UNIT_DP_RC, 2, 0, 0,
973                                         30 /* XXX */, 450 /* XXX */,
974                                         10 /* XXX */, 100 /* XXX */,
975                                         0, 0, 0, 400 /* XXX */);
976                default:
977                   return calculate_desc(info, EU_UNIT_DP_RC, 2, 0, 0,
978                                         0, 450 /* XXX */,
979                                         10 /* XXX */, 300 /* XXX */, 0, 0,
980                                         0, 0);
981                }
982             } else if (devinfo->ver >= 6)  {
983                return calculate_desc(info, EU_UNIT_DP_RC, 2 /* XXX */, 0, 0,
984                                      0, 450 /* XXX */,
985                                      10 /* XXX */, 300 /* XXX */, 0, 0, 0, 0);
986             } else {
987                abort();
988             }
989          case ELK_SFID_SAMPLER: {
990             if (devinfo->ver >= 6)
991                return calculate_desc(info, EU_UNIT_SAMPLER, 2, 0, 0, 0, 16,
992                                      8, 750, 0, 0, 2, 0);
993             else
994                abort();
995          }
996          case GFX7_SFID_DATAPORT_DATA_CACHE:
997          case HSW_SFID_DATAPORT_DATA_CACHE_1:
998             if (devinfo->verx10 >= 75) {
999                switch (elk_dp_desc_msg_type(devinfo, info.desc)) {
1000                case HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP:
1001                case HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP_SIMD4X2:
1002                case HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP_SIMD4X2:
1003                case HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP:
1004                   return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0,
1005                                         30 /* XXX */, 400 /* XXX */,
1006                                         10 /* XXX */, 100 /* XXX */, 0, 0,
1007                                         0, 400 /* XXX */);
1008 
1009                default:
1010                   return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0,
1011                                         0, 20 /* XXX */,
1012                                         10 /* XXX */, 100 /* XXX */, 0, 0,
1013                                         0, 0);
1014                }
1015             } else if (devinfo->ver >= 7) {
1016                switch (elk_dp_desc_msg_type(devinfo, info.desc)) {
1017                case GFX7_DATAPORT_DC_UNTYPED_ATOMIC_OP:
1018                   return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0,
1019                                         30 /* XXX */, 400 /* XXX */,
1020                                         10 /* XXX */, 100 /* XXX */,
1021                                         0, 0, 0, 400 /* XXX */);
1022                default:
1023                   return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0,
1024                                         0, 20 /* XXX */,
1025                                         10 /* XXX */, 100 /* XXX */, 0, 0,
1026                                         0, 0);
1027                }
1028             } else {
1029                abort();
1030             }
1031 
1032          case GFX7_SFID_PIXEL_INTERPOLATOR:
1033             if (devinfo->ver >= 7)
1034                return calculate_desc(info, EU_UNIT_PI, 2, 0, 0, 14 /* XXX */, 0,
1035                                      0, 90 /* XXX */, 0, 0, 0, 0);
1036             else
1037                abort();
1038 
1039          case ELK_SFID_URB:
1040             return calculate_desc(info, EU_UNIT_URB, 2, 0, 0, 0, 6 /* XXX */,
1041                                   32 /* XXX */, 200 /* XXX */, 0, 0, 0, 0);
1042 
1043          default:
1044             abort();
1045          }
1046 
1047       case ELK_SHADER_OPCODE_UNDEF:
1048       case ELK_SHADER_OPCODE_HALT_TARGET:
1049       case ELK_FS_OPCODE_SCHEDULING_FENCE:
1050          return calculate_desc(info, EU_UNIT_NULL, 0, 0, 0, 0, 0,
1051                                0, 0, 0, 0, 0, 0);
1052 
1053       default:
1054          abort();
1055       }
1056    }
1057 
1058    /**
1059     * Model the performance behavior of a stall on the specified dependency
1060     * ID.
1061     */
1062    void
stall_on_dependency(state & st,enum intel_eu_dependency_id id)1063    stall_on_dependency(state &st, enum intel_eu_dependency_id id)
1064    {
1065       if (id < ARRAY_SIZE(st.dep_ready))
1066          st.unit_ready[EU_UNIT_FE] = MAX2(st.unit_ready[EU_UNIT_FE],
1067                                        st.dep_ready[id]);
1068    }
1069 
1070    /**
1071     * Model the performance behavior of the front-end and back-end while
1072     * executing an instruction with the specified timing information, assuming
1073     * all dependencies are already clear.
1074     */
1075    void
execute_instruction(state & st,const perf_desc & perf)1076    execute_instruction(state &st, const perf_desc &perf)
1077    {
1078       /* Compute the time at which the front-end will be ready to execute the
1079        * next instruction.
1080        */
1081       st.unit_ready[EU_UNIT_FE] += perf.df;
1082 
1083       if (perf.u < EU_NUM_UNITS) {
1084          /* Wait for the back-end to be ready to execute this instruction. */
1085          st.unit_ready[EU_UNIT_FE] = MAX2(st.unit_ready[EU_UNIT_FE],
1086                                        st.unit_ready[perf.u]);
1087 
1088          /* Compute the time at which the back-end will be ready to execute
1089           * the next instruction, and update the back-end utilization.
1090           */
1091          st.unit_ready[perf.u] = st.unit_ready[EU_UNIT_FE] + perf.db;
1092          st.unit_busy[perf.u] += perf.db * st.weight;
1093       }
1094    }
1095 
1096    /**
1097     * Model the performance behavior of a read dependency provided by an
1098     * instruction.
1099     */
1100    void
mark_read_dependency(state & st,const perf_desc & perf,enum intel_eu_dependency_id id)1101    mark_read_dependency(state &st, const perf_desc &perf,
1102                         enum intel_eu_dependency_id id)
1103    {
1104       if (id < ARRAY_SIZE(st.dep_ready))
1105          st.dep_ready[id] = st.unit_ready[EU_UNIT_FE] + perf.ls;
1106    }
1107 
1108    /**
1109     * Model the performance behavior of a write dependency provided by an
1110     * instruction.
1111     */
1112    void
mark_write_dependency(state & st,const perf_desc & perf,enum intel_eu_dependency_id id)1113    mark_write_dependency(state &st, const perf_desc &perf,
1114                          enum intel_eu_dependency_id id)
1115    {
1116       if (id >= EU_DEPENDENCY_ID_ACCUM0 && id < EU_DEPENDENCY_ID_FLAG0)
1117          st.dep_ready[id] = st.unit_ready[EU_UNIT_FE] + perf.la;
1118       else if (id >= EU_DEPENDENCY_ID_FLAG0 && id < EU_DEPENDENCY_ID_SBID_WR0)
1119          st.dep_ready[id] = st.unit_ready[EU_UNIT_FE] + perf.lf;
1120       else if (id < ARRAY_SIZE(st.dep_ready))
1121          st.dep_ready[id] = st.unit_ready[EU_UNIT_FE] + perf.ld;
1122    }
1123 
1124    /**
1125     * Return the dependency ID of a elk_backend_reg, offset by \p delta GRFs.
1126     */
1127    enum intel_eu_dependency_id
reg_dependency_id(const intel_device_info * devinfo,const elk_backend_reg & r,const int delta)1128    reg_dependency_id(const intel_device_info *devinfo, const elk_backend_reg &r,
1129                      const int delta)
1130    {
1131       if (r.file == VGRF) {
1132          const unsigned i = r.nr + r.offset / REG_SIZE + delta;
1133          assert(i < EU_DEPENDENCY_ID_MRF0 - EU_DEPENDENCY_ID_GRF0);
1134          return intel_eu_dependency_id(EU_DEPENDENCY_ID_GRF0 + i);
1135 
1136       } else if (r.file == FIXED_GRF) {
1137          const unsigned i = r.nr + delta;
1138          assert(i < EU_DEPENDENCY_ID_MRF0 - EU_DEPENDENCY_ID_GRF0);
1139          return intel_eu_dependency_id(EU_DEPENDENCY_ID_GRF0 + i);
1140 
1141       } else if (r.file == MRF && devinfo->ver >= 7) {
1142          const unsigned i = GFX7_MRF_HACK_START +
1143                             r.nr + r.offset / REG_SIZE + delta;
1144          assert(i < EU_DEPENDENCY_ID_MRF0 - EU_DEPENDENCY_ID_GRF0);
1145          return intel_eu_dependency_id(EU_DEPENDENCY_ID_GRF0 + i);
1146 
1147       } else if (r.file == MRF && devinfo->ver < 7) {
1148          const unsigned i = (r.nr & ~ELK_MRF_COMPR4) +
1149                             r.offset / REG_SIZE + delta;
1150          assert(i < EU_DEPENDENCY_ID_ADDR0 - EU_DEPENDENCY_ID_MRF0);
1151          return intel_eu_dependency_id(EU_DEPENDENCY_ID_MRF0 + i);
1152 
1153       } else if (r.file == ARF && r.nr >= ELK_ARF_ADDRESS &&
1154                  r.nr < ELK_ARF_ACCUMULATOR) {
1155          assert(delta == 0);
1156          return EU_DEPENDENCY_ID_ADDR0;
1157 
1158       } else if (r.file == ARF && r.nr >= ELK_ARF_ACCUMULATOR &&
1159                  r.nr < ELK_ARF_FLAG) {
1160          const unsigned i = r.nr - ELK_ARF_ACCUMULATOR + delta;
1161          assert(i < EU_DEPENDENCY_ID_FLAG0 - EU_DEPENDENCY_ID_ACCUM0);
1162          return intel_eu_dependency_id(EU_DEPENDENCY_ID_ACCUM0 + i);
1163 
1164       } else {
1165          return EU_NUM_DEPENDENCY_IDS;
1166       }
1167    }
1168 
1169    /**
1170     * Return the dependency ID of flag register starting at offset \p i.
1171     */
1172    enum intel_eu_dependency_id
flag_dependency_id(unsigned i)1173    flag_dependency_id(unsigned i)
1174    {
1175       assert(i < EU_DEPENDENCY_ID_SBID_WR0 - EU_DEPENDENCY_ID_FLAG0);
1176       return intel_eu_dependency_id(EU_DEPENDENCY_ID_FLAG0 + i);
1177    }
1178 
1179    /**
1180     * Return the implicit accumulator register accessed by channel \p i of the
1181     * instruction.
1182     */
1183    unsigned
accum_reg_of_channel(const intel_device_info * devinfo,const elk_backend_instruction * inst,elk_reg_type tx,unsigned i)1184    accum_reg_of_channel(const intel_device_info *devinfo,
1185                         const elk_backend_instruction *inst,
1186                         elk_reg_type tx, unsigned i)
1187    {
1188       assert(inst->reads_accumulator_implicitly() ||
1189              inst->writes_accumulator_implicitly(devinfo));
1190       const unsigned offset = (inst->group + i) * type_sz(tx) *
1191          (devinfo->ver < 7 || elk_reg_type_is_floating_point(tx) ? 1 : 2);
1192       return offset / (reg_unit(devinfo) * REG_SIZE) % 2;
1193    }
1194 
1195    /**
1196     * Model the performance behavior of an FS back-end instruction.
1197     */
1198    void
issue_fs_inst(state & st,const struct elk_isa_info * isa,const elk_backend_instruction * be_inst)1199    issue_fs_inst(state &st, const struct elk_isa_info *isa,
1200                  const elk_backend_instruction *be_inst)
1201    {
1202       const struct intel_device_info *devinfo = isa->devinfo;
1203       const elk_fs_inst *inst = static_cast<const elk_fs_inst *>(be_inst);
1204       const instruction_info info(isa, inst);
1205       const perf_desc perf = instruction_desc(info);
1206 
1207       /* Stall on any source dependencies. */
1208       for (unsigned i = 0; i < inst->sources; i++) {
1209          for (unsigned j = 0; j < regs_read(inst, i); j++)
1210             stall_on_dependency(
1211                st, reg_dependency_id(devinfo, inst->src[i], j));
1212       }
1213 
1214       if (inst->reads_accumulator_implicitly()) {
1215          for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
1216               j <= accum_reg_of_channel(devinfo, inst, info.tx,
1217                                         inst->exec_size - 1); j++)
1218             stall_on_dependency(
1219                st, reg_dependency_id(devinfo, elk_acc_reg(8), j));
1220       }
1221 
1222       if (is_send(inst) && inst->base_mrf != -1) {
1223          for (unsigned j = 0; j < inst->mlen; j++)
1224             stall_on_dependency(
1225                st, reg_dependency_id(
1226                   devinfo, elk_uvec_mrf(8, inst->base_mrf, 0), j));
1227       }
1228 
1229       if (const unsigned mask = inst->flags_read(devinfo)) {
1230          for (unsigned i = 0; i < sizeof(mask) * CHAR_BIT; i++) {
1231             if (mask & (1 << i))
1232                stall_on_dependency(st, flag_dependency_id(i));
1233          }
1234       }
1235 
1236       /* Stall on any write dependencies. */
1237       if (!inst->no_dd_check) {
1238          if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) {
1239             for (unsigned j = 0; j < regs_written(inst); j++)
1240                stall_on_dependency(
1241                   st, reg_dependency_id(devinfo, inst->dst, j));
1242          }
1243 
1244          if (inst->writes_accumulator_implicitly(devinfo)) {
1245             for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
1246                  j <= accum_reg_of_channel(devinfo, inst, info.tx,
1247                                            inst->exec_size - 1); j++)
1248                stall_on_dependency(
1249                   st, reg_dependency_id(devinfo, elk_acc_reg(8), j));
1250          }
1251 
1252          if (const unsigned mask = inst->flags_written(devinfo)) {
1253             for (unsigned i = 0; i < sizeof(mask) * CHAR_BIT; i++) {
1254                if (mask & (1 << i))
1255                   stall_on_dependency(st, flag_dependency_id(i));
1256             }
1257          }
1258       }
1259 
1260       /* Execute the instruction. */
1261       execute_instruction(st, perf);
1262 
1263       /* Mark any source dependencies. */
1264       if (inst->is_send_from_grf()) {
1265          for (unsigned i = 0; i < inst->sources; i++) {
1266             if (inst->is_payload(i)) {
1267                for (unsigned j = 0; j < regs_read(inst, i); j++)
1268                   mark_read_dependency(
1269                      st, perf, reg_dependency_id(devinfo, inst->src[i], j));
1270             }
1271          }
1272       }
1273 
1274       if (is_send(inst) && inst->base_mrf != -1) {
1275          for (unsigned j = 0; j < inst->mlen; j++)
1276             mark_read_dependency(st, perf,
1277                reg_dependency_id(devinfo, elk_uvec_mrf(8, inst->base_mrf, 0), j));
1278       }
1279 
1280       /* Mark any destination dependencies. */
1281       if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) {
1282          for (unsigned j = 0; j < regs_written(inst); j++) {
1283             mark_write_dependency(st, perf,
1284                                   reg_dependency_id(devinfo, inst->dst, j));
1285          }
1286       }
1287 
1288       if (inst->writes_accumulator_implicitly(devinfo)) {
1289          for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
1290               j <= accum_reg_of_channel(devinfo, inst, info.tx,
1291                                         inst->exec_size - 1); j++)
1292             mark_write_dependency(st, perf,
1293                                   reg_dependency_id(devinfo, elk_acc_reg(8), j));
1294       }
1295 
1296       if (const unsigned mask = inst->flags_written(devinfo)) {
1297          for (unsigned i = 0; i < sizeof(mask) * CHAR_BIT; i++) {
1298             if (mask & (1 << i))
1299                mark_write_dependency(st, perf, flag_dependency_id(i));
1300          }
1301       }
1302    }
1303 
1304    /**
1305     * Model the performance behavior of a VEC4 back-end instruction.
1306     */
1307    void
issue_vec4_instruction(state & st,const struct elk_isa_info * isa,const elk_backend_instruction * be_inst)1308    issue_vec4_instruction(state &st, const struct elk_isa_info *isa,
1309                           const elk_backend_instruction *be_inst)
1310    {
1311       const struct intel_device_info *devinfo = isa->devinfo;
1312       const vec4_instruction *inst =
1313          static_cast<const vec4_instruction *>(be_inst);
1314       const instruction_info info(isa, inst);
1315       const perf_desc perf = instruction_desc(info);
1316 
1317       /* Stall on any source dependencies. */
1318       for (unsigned i = 0; i < ARRAY_SIZE(inst->src); i++) {
1319          for (unsigned j = 0; j < regs_read(inst, i); j++)
1320             stall_on_dependency(
1321                st, reg_dependency_id(devinfo, inst->src[i], j));
1322       }
1323 
1324       if (inst->reads_accumulator_implicitly()) {
1325          for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
1326               j <= accum_reg_of_channel(devinfo, inst, info.tx,
1327                                         inst->exec_size - 1); j++)
1328             stall_on_dependency(
1329                st, reg_dependency_id(devinfo, elk_acc_reg(8), j));
1330       }
1331 
1332       if (inst->base_mrf != -1) {
1333          for (unsigned j = 0; j < inst->mlen; j++)
1334             stall_on_dependency(
1335                st, reg_dependency_id(
1336                   devinfo, elk_uvec_mrf(8, inst->base_mrf, 0), j));
1337       }
1338 
1339       if (inst->reads_flag())
1340          stall_on_dependency(st, EU_DEPENDENCY_ID_FLAG0);
1341 
1342       /* Stall on any write dependencies. */
1343       if (!inst->no_dd_check) {
1344          if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) {
1345             for (unsigned j = 0; j < regs_written(inst); j++)
1346                stall_on_dependency(
1347                   st, reg_dependency_id(devinfo, inst->dst, j));
1348          }
1349 
1350          if (inst->writes_accumulator_implicitly(devinfo)) {
1351             for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
1352                  j <= accum_reg_of_channel(devinfo, inst, info.tx,
1353                                            inst->exec_size - 1); j++)
1354                stall_on_dependency(
1355                   st, reg_dependency_id(devinfo, elk_acc_reg(8), j));
1356          }
1357 
1358          if (inst->writes_flag(devinfo))
1359             stall_on_dependency(st, EU_DEPENDENCY_ID_FLAG0);
1360       }
1361 
1362       /* Execute the instruction. */
1363       execute_instruction(st, perf);
1364 
1365       /* Mark any source dependencies. */
1366       if (inst->is_send_from_grf()) {
1367          for (unsigned i = 0; i < ARRAY_SIZE(inst->src); i++) {
1368             for (unsigned j = 0; j < regs_read(inst, i); j++)
1369                mark_read_dependency(
1370                   st, perf, reg_dependency_id(devinfo, inst->src[i], j));
1371          }
1372       }
1373 
1374       if (inst->base_mrf != -1) {
1375          for (unsigned j = 0; j < inst->mlen; j++)
1376             mark_read_dependency(st, perf,
1377                reg_dependency_id(devinfo, elk_uvec_mrf(8, inst->base_mrf, 0), j));
1378       }
1379 
1380       /* Mark any destination dependencies. */
1381       if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) {
1382          for (unsigned j = 0; j < regs_written(inst); j++) {
1383             mark_write_dependency(st, perf,
1384                                   reg_dependency_id(devinfo, inst->dst, j));
1385          }
1386       }
1387 
1388       if (inst->writes_accumulator_implicitly(devinfo)) {
1389          for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
1390               j <= accum_reg_of_channel(devinfo, inst, info.tx,
1391                                         inst->exec_size - 1); j++)
1392             mark_write_dependency(st, perf,
1393                                   reg_dependency_id(devinfo, elk_acc_reg(8), j));
1394       }
1395 
1396       if (inst->writes_flag(devinfo))
1397          mark_write_dependency(st, perf, EU_DEPENDENCY_ID_FLAG0);
1398    }
1399 
1400    /**
1401     * Calculate the maximum possible throughput of the program compatible with
1402     * the cycle-count utilization estimated for each asynchronous unit, in
1403     * threads-per-cycle units.
1404     */
1405    float
calculate_thread_throughput(const state & st,float busy)1406    calculate_thread_throughput(const state &st, float busy)
1407    {
1408       for (unsigned i = 0; i < EU_NUM_UNITS; i++)
1409          busy = MAX2(busy, st.unit_busy[i]);
1410 
1411       return 1.0 / busy;
1412    }
1413 
1414    /**
1415     * Estimate the performance of the specified shader.
1416     */
1417    void
calculate_performance(performance & p,const elk_backend_shader * s,void (* issue_instruction)(state &,const struct elk_isa_info *,const elk_backend_instruction *),unsigned dispatch_width)1418    calculate_performance(performance &p, const elk_backend_shader *s,
1419                          void (*issue_instruction)(
1420                             state &, const struct elk_isa_info *,
1421                             const elk_backend_instruction *),
1422                          unsigned dispatch_width)
1423    {
1424       /* XXX - Note that the previous version of this code used worst-case
1425        *       scenario estimation of branching divergence for SIMD32 shaders,
1426        *       but this heuristic was removed to improve performance in common
1427        *       scenarios. Wider shader variants are less optimal when divergence
1428        *       is high, e.g. when application renders complex scene on a small
1429        *       surface. It is assumed that such renders are short, so their
1430        *       time doesn't matter and when it comes to the overall performance,
1431        *       they are dominated by more optimal larger renders.
1432        *
1433        *       It's possible that we could do better with divergence analysis
1434        *       by isolating branches which are 100% uniform.
1435        *
1436        *       Plumbing the trip counts from NIR loop analysis would allow us
1437        *       to do a better job regarding the loop weights.
1438        *
1439        *       In the meantime use values that roughly match the control flow
1440        *       weights used elsewhere in the compiler back-end.
1441        */
1442       const float discard_weight = 1.0;
1443       const float loop_weight = 10;
1444       unsigned halt_count = 0;
1445       unsigned elapsed = 0;
1446       state st;
1447 
1448       foreach_block(block, s->cfg) {
1449          const unsigned elapsed0 = elapsed;
1450 
1451          foreach_inst_in_block(elk_backend_instruction, inst, block) {
1452             const unsigned clock0 = st.unit_ready[EU_UNIT_FE];
1453 
1454             issue_instruction(st, &s->compiler->isa, inst);
1455 
1456             if (inst->opcode == ELK_SHADER_OPCODE_HALT_TARGET && halt_count)
1457                st.weight /= discard_weight;
1458 
1459             elapsed += (st.unit_ready[EU_UNIT_FE] - clock0) * st.weight;
1460 
1461             if (inst->opcode == ELK_OPCODE_DO)
1462                st.weight *= loop_weight;
1463             else if (inst->opcode == ELK_OPCODE_WHILE)
1464                st.weight /= loop_weight;
1465             else if (inst->opcode == ELK_OPCODE_HALT && !halt_count++)
1466                st.weight *= discard_weight;
1467          }
1468 
1469          p.block_latency[block->num] = elapsed - elapsed0;
1470       }
1471 
1472       p.latency = elapsed;
1473       p.throughput = dispatch_width * calculate_thread_throughput(st, elapsed);
1474    }
1475 }
1476 
performance(const elk_fs_visitor * v)1477 elk::performance::performance(const elk_fs_visitor *v) :
1478    block_latency(new unsigned[v->cfg->num_blocks])
1479 {
1480    calculate_performance(*this, v, issue_fs_inst, v->dispatch_width);
1481 }
1482 
performance(const vec4_visitor * v)1483 elk::performance::performance(const vec4_visitor *v) :
1484    block_latency(new unsigned[v->cfg->num_blocks])
1485 {
1486    calculate_performance(*this, v, issue_vec4_instruction, 8);
1487 }
1488 
~performance()1489 elk::performance::~performance()
1490 {
1491    delete[] block_latency;
1492 }
1493