/*
 * Copyright © 2010 - 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#ifndef BRW_COMPILER_H
#define BRW_COMPILER_H

#include <stdio.h>
#include "c11/threads.h"
#include "dev/intel_device_info.h"
#include "isl/isl.h"
#include "util/macros.h"
#include "util/mesa-sha1.h"
#include "util/enum_operators.h"
#include "util/ralloc.h"
#include "util/u_math.h"
#include "util/u_printf.h"
#include "brw_isa_info.h"
#include "intel_shader_enums.h"

#ifdef __cplusplus
extern "C" {
#endif

struct ra_regs;
struct nir_shader;
struct shader_info;

struct nir_shader_compiler_options;
typedef struct nir_shader nir_shader;

#define REG_CLASS_COUNT 20
struct brw_compiler {
   const struct intel_device_info *devinfo;

   /* This lock must be taken if the compiler is to be modified in any way,
    * including adding something to the ralloc child list.
    */
   mtx_t mutex;

   struct brw_isa_info isa;

   struct {
      struct ra_regs *regs;

      /**
       * Array of the ra classes for the unaligned contiguous register
       * block sizes used, indexed by register size.
       */
      struct ra_class *classes[REG_CLASS_COUNT];
   } fs_reg_set;

   void (*shader_debug_log)(void *, unsigned *id, const char *str, ...) PRINTFLIKE(3, 4);
   void (*shader_perf_log)(void *, unsigned *id, const char *str, ...) PRINTFLIKE(3, 4);

   bool use_tcs_multi_patch;
   struct nir_shader_compiler_options *nir_options[MESA_ALL_SHADER_STAGES];

   /**
    * Apply workarounds for SIN and COS output range problems.
    * This can negatively impact performance.
    */
   bool precise_trig;

   /**
    * Whether indirect UBO loads should use the sampler or go through the
    * data/constant cache.  For the sampler, UBO surface states have to be set
    * up with VK_FORMAT_R32G32B32A32_FLOAT, whereas if they go through the
    * constant or data cache, UBOs must use VK_FORMAT_RAW.
    */
   bool indirect_ubos_use_sampler;

   /**
    * Gfx12.5+ has a bit in the SEND instruction extending the bindless
    * surface offset range from 20 to 26 bits, effectively giving us 4GB of
    * bindless surface descriptors instead of the previous 64MB.
    */
   bool extended_bindless_surface_offset;

   /**
    * Gfx11+ has a bit in dword 3 of the sampler message header that
    * indicates whether the sampler handle is relative to the dynamic state
    * base address (0) or the bindless sampler base address (1). The driver
    * can select this.
    */
   bool use_bindless_sampler_offset;

   /**
    * Should DPAS instructions be lowered?
    *
    * This will be set for all platforms before Gfx12.5. It may also be set
    * on platforms that support DPAS, for testing purposes.
    */
   bool lower_dpas;

   /**
    * Calling the ra_allocate function after each register spill can take
    * several minutes. This option speeds up shader compilation by spilling
    * more registers after the ra_allocate failure. Required for
    * Cyberpunk 2077, which uses a watchdog thread to terminate the process
    * in case the render thread hasn't responded within 2 minutes.
    */
   int spilling_rate;

   struct nir_shader *clc_shader;

   struct {
      unsigned mue_header_packing;
      bool mue_compaction;
   } mesh;
};

#define brw_shader_debug_log(compiler, data, fmt, ... ) do {    \
   static unsigned id = 0;                                      \
   compiler->shader_debug_log(data, &id, fmt, ##__VA_ARGS__);   \
} while (0)

#define brw_shader_perf_log(compiler, data, fmt, ... ) do {     \
   static unsigned id = 0;                                      \
   compiler->shader_perf_log(data, &id, fmt, ##__VA_ARGS__);    \
} while (0)
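
/* A minimal usage sketch of the macros above, assuming a driver-owned opaque
 * "log_data" pointer and a callback like the following.  Each call site gets
 * its own static message id, which the callback can use to de-duplicate
 * repeated messages:
 *
 *    static void
 *    my_debug_log(void *data, unsigned *id, const char *fmt, ...)
 *    {
 *       va_list args;
 *       va_start(args, fmt);
 *       vfprintf(stderr, fmt, args);
 *       va_end(args);
 *    }
 *
 *    compiler->shader_debug_log = my_debug_log;
 *    ...
 *    brw_shader_perf_log(compiler, log_data, "%u spills in shader\n", 4);
 */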

/**
 * We use a constant subgroup size of 32.  It really only needs to be a
 * maximum and, since we do SIMD32 for compute shaders in some cases, it
 * needs to be at least 32.  SIMD8 and SIMD16 shaders will still claim a
 * subgroup size of 32 but will act as if 16 or 24 of those channels are
 * disabled.
 */
#define BRW_SUBGROUP_SIZE 32

static inline bool
brw_shader_stage_is_bindless(gl_shader_stage stage)
{
   return stage >= MESA_SHADER_RAYGEN &&
          stage <= MESA_SHADER_CALLABLE;
}

static inline bool
brw_shader_stage_requires_bindless_resources(gl_shader_stage stage)
{
   return brw_shader_stage_is_bindless(stage) || gl_shader_stage_is_mesh(stage);
}

/**
 * Program key structures.
 *
 * When drawing, we look for the currently bound shaders in the program
 * cache.  This is essentially a hash table lookup, and these are the keys.
 *
 * Sometimes OpenGL features specified as state need to be simulated via
 * shader code, due to a mismatch between the API and the hardware.  This
 * is often referred to as "non-orthogonal state" or "NOS".  We store NOS
 * in the program key so it's considered when searching for a program.  If
 * we haven't seen a particular combination before, we have to recompile a
 * new specialized version.
 *
 * Shader compilation should not look up state in gl_context directly, but
 * instead use the copy in the program key.  This guarantees recompiles will
 * happen correctly.
 *
 *  @{
 */

#define BRW_MAX_SAMPLERS 32

/* Provide explicit padding for each member, to ensure that the compiler
 * initializes every bit in the shader cache keys.  The keys will be compared
 * with memcmp.
 */
PRAGMA_DIAGNOSTIC_PUSH
PRAGMA_DIAGNOSTIC_ERROR(-Wpadded)

enum brw_robustness_flags {
   BRW_ROBUSTNESS_UBO  = BITFIELD_BIT(0),
   BRW_ROBUSTNESS_SSBO = BITFIELD_BIT(1),
};

struct brw_base_prog_key {
   unsigned program_string_id;

   enum brw_robustness_flags robust_flags:2;

   unsigned padding:22;

   /**
    * Apply workarounds for SIN and COS input range problems.
    * This limits the input range for SIN and COS to [-2π, 2π] to
    * avoid precision issues.
    */
   bool limit_trig_input_range;
};

/**
 * OpenGL attribute slots fall in [0, VERT_ATTRIB_MAX - 1] with the range
 * [VERT_ATTRIB_GENERIC0, VERT_ATTRIB_MAX - 1] reserved for up to 16 user
 * input vertex attributes. In Vulkan, we expose up to 28 user vertex input
 * attributes that are mapped to slots also starting at VERT_ATTRIB_GENERIC0.
 */
#define MAX_GL_VERT_ATTRIB     VERT_ATTRIB_MAX
#define MAX_VK_VERT_ATTRIB     (VERT_ATTRIB_GENERIC0 + 28)

/** The program key for Vertex Shaders. */
struct brw_vs_prog_key {
   struct brw_base_prog_key base;
};

/** The program key for Tessellation Control Shaders. */
struct brw_tcs_prog_key
{
   struct brw_base_prog_key base;

   /** A bitfield of per-vertex outputs written. */
   uint64_t outputs_written;

   enum tess_primitive_mode _tes_primitive_mode;

   /** Number of input vertices, 0 means dynamic */
   unsigned input_vertices;

   /** A bitfield of per-patch outputs written. */
   uint32_t patch_outputs_written;

   uint32_t padding;
};

#define BRW_MAX_TCS_INPUT_VERTICES (32)

static inline uint32_t
brw_tcs_prog_key_input_vertices(const struct brw_tcs_prog_key *key)
{
   return key->input_vertices != 0 ?
          key->input_vertices : BRW_MAX_TCS_INPUT_VERTICES;
}

/** The program key for Tessellation Evaluation Shaders. */
struct brw_tes_prog_key
{
   struct brw_base_prog_key base;

   /** A bitfield of per-vertex inputs read. */
   uint64_t inputs_read;

   /** A bitfield of per-patch inputs read. */
   uint32_t patch_inputs_read;

   uint32_t padding;
};

/** The program key for Geometry Shaders. */
struct brw_gs_prog_key
{
   struct brw_base_prog_key base;
};

struct brw_task_prog_key
{
   struct brw_base_prog_key base;
};

struct brw_mesh_prog_key
{
   struct brw_base_prog_key base;

   bool compact_mue:1;
   unsigned padding:31;
};

enum brw_sometimes {
   BRW_NEVER = 0,
   BRW_SOMETIMES,
   BRW_ALWAYS
};

static inline enum brw_sometimes
brw_sometimes_invert(enum brw_sometimes x)
{
   return (enum brw_sometimes)((int)BRW_ALWAYS - (int)x);
}
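
/* Worked example: brw_sometimes_invert() swaps ALWAYS and NEVER and leaves
 * SOMETIMES in place, since BRW_ALWAYS - BRW_NEVER == 2,
 * BRW_ALWAYS - BRW_SOMETIMES == 1 and BRW_ALWAYS - BRW_ALWAYS == 0:
 *
 *    brw_sometimes_invert(BRW_NEVER)     == BRW_ALWAYS
 *    brw_sometimes_invert(BRW_SOMETIMES) == BRW_SOMETIMES
 *    brw_sometimes_invert(BRW_ALWAYS)    == BRW_NEVER
 */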

/** The program key for Fragment/Pixel Shaders. */
struct brw_wm_prog_key {
   struct brw_base_prog_key base;

   uint64_t input_slots_valid;
   uint8_t color_outputs_valid;

   /* Some collection of BRW_WM_IZ_* */
   bool flat_shade:1;
   unsigned nr_color_regions:5;
   bool alpha_test_replicate_alpha:1;
   enum brw_sometimes alpha_to_coverage:2;
   bool clamp_fragment_color:1;

   bool force_dual_color_blend:1;

   /** Whether or not inputs are interpolated at sample rate by default
    *
    * This corresponds to the sample shading API bit in Vulkan or OpenGL which
    * controls how inputs with no interpolation qualifier are interpolated.
    * This is distinct from the way that using gl_SampleID or similar requires
    * us to run per-sample.  Even when running per-sample due to gl_SampleID,
    * we may still interpolate unqualified inputs at the pixel center.
    */
   enum brw_sometimes persample_interp:2;

   /* Whether or not we are running on a multisampled framebuffer */
   enum brw_sometimes multisample_fbo:2;

   /* Whether the preceding shader stage is mesh */
   enum brw_sometimes mesh_input:2;

   bool coherent_fb_fetch:1;
   bool ignore_sample_mask_out:1;
   bool coarse_pixel:1;
   bool null_push_constant_tbimr_workaround:1;

   uint64_t padding:35;
};

struct brw_cs_prog_key {
   struct brw_base_prog_key base;
};

struct brw_bs_prog_key {
   struct brw_base_prog_key base;

   /* Represents enum brw_rt_ray_flags values given at pipeline creation
    * to be combined with the ray_flags handed to the traceRayEXT() calls by
    * the shader.
    */
   uint32_t pipeline_ray_flags;
};

/* brw_any_prog_key is any of the keys that map to an API stage */
union brw_any_prog_key {
   struct brw_base_prog_key base;
   struct brw_vs_prog_key vs;
   struct brw_tcs_prog_key tcs;
   struct brw_tes_prog_key tes;
   struct brw_gs_prog_key gs;
   struct brw_wm_prog_key wm;
   struct brw_cs_prog_key cs;
   struct brw_bs_prog_key bs;
   struct brw_task_prog_key task;
   struct brw_mesh_prog_key mesh;
};

PRAGMA_DIAGNOSTIC_POP

/** Max number of render targets in a shader */
#define BRW_MAX_DRAW_BUFFERS 8

struct brw_ubo_range
{
   uint16_t block;

   /* In units of 32-byte registers */
   uint8_t start;
   uint8_t length;
};

/* We reserve the first 2^16 values for builtins */
#define BRW_PARAM_IS_BUILTIN(param) (((param) & 0xffff0000) == 0)

enum brw_param_builtin {
   BRW_PARAM_BUILTIN_ZERO,

   BRW_PARAM_BUILTIN_CLIP_PLANE_0_X,
   BRW_PARAM_BUILTIN_CLIP_PLANE_0_Y,
   BRW_PARAM_BUILTIN_CLIP_PLANE_0_Z,
   BRW_PARAM_BUILTIN_CLIP_PLANE_0_W,
   BRW_PARAM_BUILTIN_CLIP_PLANE_1_X,
   BRW_PARAM_BUILTIN_CLIP_PLANE_1_Y,
   BRW_PARAM_BUILTIN_CLIP_PLANE_1_Z,
   BRW_PARAM_BUILTIN_CLIP_PLANE_1_W,
   BRW_PARAM_BUILTIN_CLIP_PLANE_2_X,
   BRW_PARAM_BUILTIN_CLIP_PLANE_2_Y,
   BRW_PARAM_BUILTIN_CLIP_PLANE_2_Z,
   BRW_PARAM_BUILTIN_CLIP_PLANE_2_W,
   BRW_PARAM_BUILTIN_CLIP_PLANE_3_X,
   BRW_PARAM_BUILTIN_CLIP_PLANE_3_Y,
   BRW_PARAM_BUILTIN_CLIP_PLANE_3_Z,
   BRW_PARAM_BUILTIN_CLIP_PLANE_3_W,
   BRW_PARAM_BUILTIN_CLIP_PLANE_4_X,
   BRW_PARAM_BUILTIN_CLIP_PLANE_4_Y,
   BRW_PARAM_BUILTIN_CLIP_PLANE_4_Z,
   BRW_PARAM_BUILTIN_CLIP_PLANE_4_W,
   BRW_PARAM_BUILTIN_CLIP_PLANE_5_X,
   BRW_PARAM_BUILTIN_CLIP_PLANE_5_Y,
   BRW_PARAM_BUILTIN_CLIP_PLANE_5_Z,
   BRW_PARAM_BUILTIN_CLIP_PLANE_5_W,
   BRW_PARAM_BUILTIN_CLIP_PLANE_6_X,
   BRW_PARAM_BUILTIN_CLIP_PLANE_6_Y,
   BRW_PARAM_BUILTIN_CLIP_PLANE_6_Z,
   BRW_PARAM_BUILTIN_CLIP_PLANE_6_W,
   BRW_PARAM_BUILTIN_CLIP_PLANE_7_X,
   BRW_PARAM_BUILTIN_CLIP_PLANE_7_Y,
   BRW_PARAM_BUILTIN_CLIP_PLANE_7_Z,
   BRW_PARAM_BUILTIN_CLIP_PLANE_7_W,

   BRW_PARAM_BUILTIN_TESS_LEVEL_OUTER_X,
   BRW_PARAM_BUILTIN_TESS_LEVEL_OUTER_Y,
   BRW_PARAM_BUILTIN_TESS_LEVEL_OUTER_Z,
   BRW_PARAM_BUILTIN_TESS_LEVEL_OUTER_W,
   BRW_PARAM_BUILTIN_TESS_LEVEL_INNER_X,
   BRW_PARAM_BUILTIN_TESS_LEVEL_INNER_Y,

   BRW_PARAM_BUILTIN_PATCH_VERTICES_IN,

   BRW_PARAM_BUILTIN_BASE_WORK_GROUP_ID_X,
   BRW_PARAM_BUILTIN_BASE_WORK_GROUP_ID_Y,
   BRW_PARAM_BUILTIN_BASE_WORK_GROUP_ID_Z,
   BRW_PARAM_BUILTIN_SUBGROUP_ID,
   BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_X,
   BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_Y,
   BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_Z,
   BRW_PARAM_BUILTIN_WORK_DIM,
};

#define BRW_PARAM_BUILTIN_CLIP_PLANE(idx, comp) \
   (BRW_PARAM_BUILTIN_CLIP_PLANE_0_X + ((idx) << 2) + (comp))

#define BRW_PARAM_BUILTIN_IS_CLIP_PLANE(param)  \
   ((param) >= BRW_PARAM_BUILTIN_CLIP_PLANE_0_X && \
    (param) <= BRW_PARAM_BUILTIN_CLIP_PLANE_7_W)

#define BRW_PARAM_BUILTIN_CLIP_PLANE_IDX(param) \
   (((param) - BRW_PARAM_BUILTIN_CLIP_PLANE_0_X) >> 2)

#define BRW_PARAM_BUILTIN_CLIP_PLANE_COMP(param) \
   (((param) - BRW_PARAM_BUILTIN_CLIP_PLANE_0_X) & 0x3)
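
/* Worked example: the clip-plane builtins occupy four consecutive enum
 * values per plane, so the macros above reduce to shifts and masks.  For
 * plane 3, component 2 (the Z component):
 *
 *    enum brw_param_builtin p = BRW_PARAM_BUILTIN_CLIP_PLANE(3, 2);
 *    assert(p == BRW_PARAM_BUILTIN_CLIP_PLANE_3_Z);
 *    assert(BRW_PARAM_BUILTIN_IS_CLIP_PLANE(p));
 *    assert(BRW_PARAM_BUILTIN_CLIP_PLANE_IDX(p) == 3);
 *    assert(BRW_PARAM_BUILTIN_CLIP_PLANE_COMP(p) == 2);
 */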

#define BRW_MAX_EMBEDDED_SAMPLERS (4096)

enum brw_shader_reloc_id {
   BRW_SHADER_RELOC_CONST_DATA_ADDR_LOW,
   BRW_SHADER_RELOC_CONST_DATA_ADDR_HIGH,
   BRW_SHADER_RELOC_SHADER_START_OFFSET,
   BRW_SHADER_RELOC_RESUME_SBT_ADDR_LOW,
   BRW_SHADER_RELOC_RESUME_SBT_ADDR_HIGH,
   BRW_SHADER_RELOC_DESCRIPTORS_ADDR_HIGH,
   BRW_SHADER_RELOC_DESCRIPTORS_BUFFER_ADDR_HIGH,
   BRW_SHADER_RELOC_EMBEDDED_SAMPLER_HANDLE,
   BRW_SHADER_RELOC_LAST_EMBEDDED_SAMPLER_HANDLE =
   BRW_SHADER_RELOC_EMBEDDED_SAMPLER_HANDLE + BRW_MAX_EMBEDDED_SAMPLERS - 1,
   BRW_SHADER_RELOC_PRINTF_BUFFER_ADDR_LOW,
   BRW_SHADER_RELOC_PRINTF_BUFFER_ADDR_HIGH,
   BRW_SHADER_RELOC_PRINTF_BASE_IDENTIFIER,
};

enum brw_shader_reloc_type {
   /** An arbitrary 32-bit value */
   BRW_SHADER_RELOC_TYPE_U32,
   /** A MOV instruction with an immediate source */
   BRW_SHADER_RELOC_TYPE_MOV_IMM,
};

/** Represents a code relocation
 *
 * Relocatable constants are immediates in the code which we want to be able
 * to replace post-compile with the actual value.
 */
struct brw_shader_reloc {
   /** The 32-bit ID of the relocatable constant */
   uint32_t id;

   /** Type of this relocation */
   enum brw_shader_reloc_type type;

   /** The offset in the shader to the relocated value
    *
    * For MOV_IMM relocs, this is an offset to the MOV instruction.  This
    * allows us to do some sanity checking while we update the value.
    */
   uint32_t offset;

   /** Value to be added to the relocated value before it is written */
   uint32_t delta;
};

/** A value to write to a relocation */
struct brw_shader_reloc_value {
   /** The 32-bit ID of the relocatable constant */
   uint32_t id;

   /** The value with which to replace the relocated immediate */
   uint32_t value;
};
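
/* A minimal usage sketch, assuming a driver-owned 64-bit "addr" and a
 * "program" copy of the assembly: after compilation, the driver patches the
 * relocatable immediates with brw_write_shader_relocs(), declared further
 * below.
 *
 *    struct brw_shader_reloc_value values[] = {
 *       { .id = BRW_SHADER_RELOC_CONST_DATA_ADDR_LOW,  .value = addr & 0xffffffff },
 *       { .id = BRW_SHADER_RELOC_CONST_DATA_ADDR_HIGH, .value = addr >> 32 },
 *    };
 *    brw_write_shader_relocs(&compiler->isa, program, prog_data,
 *                            values, ARRAY_SIZE(values));
 */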

struct brw_stage_prog_data {
   struct brw_ubo_range ubo_ranges[4];

   unsigned nr_params;       /**< number of float params/constants */

   gl_shader_stage stage;

   /* zero_push_reg is a bitfield which indicates what push registers (if any)
    * should be zeroed by SW at the start of the shader.  The corresponding
    * push_reg_mask_param specifies the param index (in 32-bit units) where
    * the actual runtime 64-bit mask will be pushed.  The shader will zero
    * push reg i if
    *
    *    reg_used & zero_push_reg & ~*push_reg_mask_param & (1ull << i)
    *
    * If this field is set, brw_compiler::compact_params must be false.
    */
   uint64_t zero_push_reg;
   unsigned push_reg_mask_param;

   unsigned curb_read_length;
   unsigned total_scratch;
   unsigned total_shared;

   unsigned program_size;

   unsigned const_data_size;
   unsigned const_data_offset;

   unsigned num_relocs;
   const struct brw_shader_reloc *relocs;

   /** Does this program pull from any UBO or other constant buffers? */
   bool has_ubo_pull;

   /** Number of ray query objects in this shader. */
   unsigned ray_queries;

   /**
    * Register where the thread expects to find input data from the URB
    * (typically uniforms, followed by vertex or fragment attributes).
    */
   unsigned dispatch_grf_start_reg;

   bool use_alt_mode; /**< Use ALT floating point mode?  Otherwise, IEEE. */

   /* 32-bit identifiers for all push/pull parameters.  These can be anything
    * the driver wishes them to be; the core of the back-end compiler simply
    * re-arranges them.  The one restriction is that the bottom 2^16 values
    * are reserved for builtins defined in the brw_param_builtin enum defined
    * above.
    */
   uint32_t *param;

   /* Whether the shader uses atomic operations. */
   bool uses_atomic_load_store;

   /* Printf descriptions contained by the shader */
   uint32_t printf_info_count;
   u_printf_info *printf_info;
};

static inline uint32_t *
brw_stage_prog_data_add_params(struct brw_stage_prog_data *prog_data,
                               unsigned nr_new_params)
{
   unsigned old_nr_params = prog_data->nr_params;
   prog_data->nr_params += nr_new_params;
   prog_data->param = reralloc(ralloc_parent(prog_data->param),
                               prog_data->param, uint32_t,
                               prog_data->nr_params);
   return prog_data->param + old_nr_params;
}
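
/* A minimal usage sketch: drivers append parameter slots and then fill in
 * the 32-bit identifiers, e.g. to push the base workgroup ID:
 *
 *    uint32_t *param = brw_stage_prog_data_add_params(prog_data, 3);
 *    param[0] = BRW_PARAM_BUILTIN_BASE_WORK_GROUP_ID_X;
 *    param[1] = BRW_PARAM_BUILTIN_BASE_WORK_GROUP_ID_Y;
 *    param[2] = BRW_PARAM_BUILTIN_BASE_WORK_GROUP_ID_Z;
 */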

void
brw_stage_prog_data_add_printf(struct brw_stage_prog_data *prog_data,
                               void *mem_ctx,
                               const u_printf_info *print);

enum brw_barycentric_mode {
   BRW_BARYCENTRIC_PERSPECTIVE_PIXEL       = 0,
   BRW_BARYCENTRIC_PERSPECTIVE_CENTROID    = 1,
   BRW_BARYCENTRIC_PERSPECTIVE_SAMPLE      = 2,
   BRW_BARYCENTRIC_NONPERSPECTIVE_PIXEL    = 3,
   BRW_BARYCENTRIC_NONPERSPECTIVE_CENTROID = 4,
   BRW_BARYCENTRIC_NONPERSPECTIVE_SAMPLE   = 5,
   BRW_BARYCENTRIC_MODE_COUNT              = 6
};
#define BRW_BARYCENTRIC_PERSPECTIVE_BITS \
   ((1 << BRW_BARYCENTRIC_PERSPECTIVE_PIXEL) | \
    (1 << BRW_BARYCENTRIC_PERSPECTIVE_CENTROID) | \
    (1 << BRW_BARYCENTRIC_PERSPECTIVE_SAMPLE))
#define BRW_BARYCENTRIC_NONPERSPECTIVE_BITS \
   ((1 << BRW_BARYCENTRIC_NONPERSPECTIVE_PIXEL) | \
    (1 << BRW_BARYCENTRIC_NONPERSPECTIVE_CENTROID) | \
    (1 << BRW_BARYCENTRIC_NONPERSPECTIVE_SAMPLE))

enum brw_pixel_shader_computed_depth_mode {
   BRW_PSCDEPTH_OFF   = 0, /* PS does not compute depth */
   BRW_PSCDEPTH_ON    = 1, /* PS computes depth; no guarantee about value */
   BRW_PSCDEPTH_ON_GE = 2, /* PS guarantees output depth >= source depth */
   BRW_PSCDEPTH_ON_LE = 3, /* PS guarantees output depth <= source depth */
};

/* Data about a particular attempt to compile a program.  Note that
 * there can be many of these, each in a different GL state
 * corresponding to a different brw_wm_prog_key struct, with different
 * compiled programs.
 */
struct brw_wm_prog_data {
   struct brw_stage_prog_data base;

   unsigned num_per_primitive_inputs;
   unsigned num_varying_inputs;

   uint8_t dispatch_grf_start_reg_16;
   uint8_t dispatch_grf_start_reg_32;
   uint32_t prog_offset_16;
   uint32_t prog_offset_32;

   uint8_t computed_depth_mode;

   /**
    * Number of polygons handled in parallel by the multi-polygon PS
    * kernel.
    */
   uint8_t max_polygons;

   /**
    * Dispatch width of the multi-polygon PS kernel, or 0 if no
    * multi-polygon kernel was built.
    */
   uint8_t dispatch_multi;

   bool computed_stencil;
   bool early_fragment_tests;
   bool post_depth_coverage;
   bool inner_coverage;
   bool dispatch_8;
   bool dispatch_16;
   bool dispatch_32;
   bool dual_src_blend;
   bool uses_pos_offset;
   bool uses_omask;
   bool uses_kill;
   bool uses_src_depth;
   bool uses_src_w;
   bool uses_depth_w_coefficients;
   bool uses_pc_bary_coefficients;
   bool uses_npc_bary_coefficients;
   bool uses_sample_offsets;
   bool uses_sample_mask;
   bool uses_vmask;
   bool has_side_effects;
   bool pulls_bary;

   bool contains_flat_varying;
   bool contains_noperspective_varying;

   /** True if the shader wants sample shading
    *
    * This corresponds to whether or not gl_SampleID, gl_SamplePosition, or
    * a sample-qualified input is used in the shader.  It is independent of
    * GL_MIN_SAMPLE_SHADING_VALUE in GL or minSampleShading in Vulkan.
    */
   bool sample_shading;

   /** Min sample shading value
    *
    * Not used by the compiler, but useful for restoring from the cache. The
    * driver is expected to write the value it wants.
    */
   float min_sample_shading;

   /** Should this shader be dispatched per-sample */
   enum brw_sometimes persample_dispatch;

   /**
    * Shader is run at the coarse pixel shading dispatch rate (3DSTATE_CPS).
    */
   enum brw_sometimes coarse_pixel_dispatch;

   /**
    * Shader writes the SampleMask and this is AND-ed with the API's
    * SampleMask to generate a new coverage mask.
    */
   enum brw_sometimes alpha_to_coverage;

   unsigned msaa_flags_param;

   /**
    * Mask of which interpolation modes are required by the fragment shader.
    * Those interpolations are delivered as part of the thread payload. Used
    * in hardware setup on gfx6+.
    */
   uint32_t barycentric_interp_modes;

   /**
    * Whether nonperspective interpolation modes are used, either in
    * barycentric_interp_modes or by the fragment shader through interpolator
    * messages.
    */
   bool uses_nonperspective_interp_modes;

   /**
    * Mask of which FS inputs are marked flat by the shader source.  This is
    * needed for setting up 3DSTATE_SF/SBE.
    */
   uint32_t flat_inputs;

   /**
    * The FS inputs
    */
   uint64_t inputs;

   /**
    * Map from gl_varying_slot to the position within the FS setup data
    * payload where the varying's attribute vertex deltas should be delivered.
    * For varying slots that are not used by the FS, the value is -1.
    */
   int urb_setup[VARYING_SLOT_MAX];
   int urb_setup_channel[VARYING_SLOT_MAX];

   /**
    * Cache structure into the urb_setup array above that contains the
    * attribute numbers of active varyings out of urb_setup.
    * The actual count is stored in urb_setup_attribs_count.
    */
   uint8_t urb_setup_attribs[VARYING_SLOT_MAX];
   uint8_t urb_setup_attribs_count;
};

#ifdef GFX_VERx10

#if GFX_VERx10 >= 200

/** Returns the SIMD width corresponding to a given KSP index
 *
 * The "Variable Pixel Dispatch" table in the PRM (which can be found, for
 * example in Vol. 7 of the SKL PRM) has a mapping from dispatch widths to
 * kernel start pointer (KSP) indices that is based on what dispatch widths
 * are enabled.  This function provides, effectively, the reverse mapping.
 *
 * If the given KSP is enabled, a SIMD width of 8, 16, or 32 is
 * returned.  Note that for a multipolygon dispatch kernel 8 is always
 * returned, since multipolygon kernels use the "_8" fields from
 * brw_wm_prog_data regardless of their SIMD width.  If the KSP is
 * invalid, 0 is returned.
 */
static inline unsigned
brw_fs_simd_width_for_ksp(unsigned ksp_idx, bool enabled, unsigned width_sel)
{
   assert(ksp_idx < 2);
   return !enabled ? 0 :
          width_sel ? 32 :
          16;
}

#define brw_wm_state_simd_width_for_ksp(wm_state, ksp_idx)              \
        (ksp_idx == 0 && (wm_state).Kernel0MaximumPolysperThread ? 8 :  \
         ksp_idx == 0 ? brw_fs_simd_width_for_ksp(ksp_idx, (wm_state).Kernel0Enable, \
                                                  (wm_state).Kernel0SIMDWidth): \
         brw_fs_simd_width_for_ksp(ksp_idx, (wm_state).Kernel1Enable,   \
                                   (wm_state).Kernel1SIMDWidth))

#else

/** Returns the SIMD width corresponding to a given KSP index
 *
 * The "Variable Pixel Dispatch" table in the PRM (which can be found, for
 * example in Vol. 7 of the SKL PRM) has a mapping from dispatch widths to
 * kernel start pointer (KSP) indices that is based on what dispatch widths
 * are enabled.  This function provides, effectively, the reverse mapping.
 *
 * If the given KSP is valid with respect to the SIMD8/16/32 enables, a SIMD
 * width of 8, 16, or 32 is returned.  If the KSP is invalid, 0 is returned.
 */
static inline unsigned
brw_fs_simd_width_for_ksp(unsigned ksp_idx, bool simd8_enabled,
                          bool simd16_enabled, bool simd32_enabled)
{
   /* This function strictly ignores contiguous dispatch */
   switch (ksp_idx) {
   case 0:
      return simd8_enabled ? 8 :
             (simd16_enabled && !simd32_enabled) ? 16 :
             (simd32_enabled && !simd16_enabled) ? 32 : 0;
   case 1:
      return (simd32_enabled && (simd16_enabled || simd8_enabled)) ? 32 : 0;
   case 2:
      return (simd16_enabled && (simd32_enabled || simd8_enabled)) ? 16 : 0;
   default:
      unreachable("Invalid KSP index");
   }
}

#define brw_wm_state_simd_width_for_ksp(wm_state, ksp_idx)              \
   brw_fs_simd_width_for_ksp((ksp_idx), (wm_state)._8PixelDispatchEnable, \
                             (wm_state)._16PixelDispatchEnable, \
                             (wm_state)._32PixelDispatchEnable)
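
/* Worked example of the reverse mapping above: with SIMD8 and SIMD16
 * enabled but SIMD32 disabled, the "Variable Pixel Dispatch" table places
 * the SIMD8 kernel at KSP 0 and the SIMD16 kernel at KSP 2, leaving KSP 1
 * unused:
 *
 *    brw_fs_simd_width_for_ksp(0, true, true, false) == 8
 *    brw_fs_simd_width_for_ksp(1, true, true, false) == 0
 *    brw_fs_simd_width_for_ksp(2, true, true, false) == 16
 */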

#endif

#endif

#define brw_wm_state_has_ksp(wm_state, ksp_idx) \
   (brw_wm_state_simd_width_for_ksp((wm_state), (ksp_idx)) != 0)

static inline uint32_t
_brw_wm_prog_data_prog_offset(const struct brw_wm_prog_data *prog_data,
                              unsigned simd_width)
{
   switch (simd_width) {
   case 8: return 0;
   case 16: return prog_data->prog_offset_16;
   case 32: return prog_data->prog_offset_32;
   default: return 0;
   }
}

#define brw_wm_prog_data_prog_offset(prog_data, wm_state, ksp_idx) \
   _brw_wm_prog_data_prog_offset(prog_data, \
      brw_wm_state_simd_width_for_ksp(wm_state, ksp_idx))

static inline uint8_t
_brw_wm_prog_data_dispatch_grf_start_reg(const struct brw_wm_prog_data *prog_data,
                                         unsigned simd_width)
{
   switch (simd_width) {
   case 8: return prog_data->base.dispatch_grf_start_reg;
   case 16: return prog_data->dispatch_grf_start_reg_16;
   case 32: return prog_data->dispatch_grf_start_reg_32;
   default: return 0;
   }
}

#define brw_wm_prog_data_dispatch_grf_start_reg(prog_data, wm_state, ksp_idx) \
   _brw_wm_prog_data_dispatch_grf_start_reg(prog_data, \
      brw_wm_state_simd_width_for_ksp(wm_state, ksp_idx))

static inline bool
brw_wm_prog_data_is_persample(const struct brw_wm_prog_data *prog_data,
                              enum intel_msaa_flags pushed_msaa_flags)
{
   if (pushed_msaa_flags & INTEL_MSAA_FLAG_ENABLE_DYNAMIC) {
      if (!(pushed_msaa_flags & INTEL_MSAA_FLAG_MULTISAMPLE_FBO))
         return false;

      if (prog_data->sample_shading)
         assert(pushed_msaa_flags & INTEL_MSAA_FLAG_PERSAMPLE_DISPATCH);

      if (pushed_msaa_flags & INTEL_MSAA_FLAG_PERSAMPLE_DISPATCH)
         assert(prog_data->persample_dispatch != BRW_NEVER);
      else
         assert(prog_data->persample_dispatch != BRW_ALWAYS);

      return (pushed_msaa_flags & INTEL_MSAA_FLAG_PERSAMPLE_DISPATCH) != 0;
   }

   assert(prog_data->persample_dispatch == BRW_ALWAYS ||
          prog_data->persample_dispatch == BRW_NEVER);

   return prog_data->persample_dispatch;
}

static inline uint32_t
wm_prog_data_barycentric_modes(const struct brw_wm_prog_data *prog_data,
                               enum intel_msaa_flags pushed_msaa_flags)
{
   uint32_t modes = prog_data->barycentric_interp_modes;

   /* In the non-dynamic case, we can just return the modes computed at
    * compilation time.
    */
   if (!(pushed_msaa_flags & INTEL_MSAA_FLAG_ENABLE_DYNAMIC))
      return modes;

   if (pushed_msaa_flags & INTEL_MSAA_FLAG_PERSAMPLE_INTERP) {
      assert(prog_data->persample_dispatch == BRW_ALWAYS ||
             (pushed_msaa_flags & INTEL_MSAA_FLAG_PERSAMPLE_DISPATCH));

      /* Making dynamic per-sample interpolation work is a bit tricky.  The
       * hardware will hang if SAMPLE is requested but per-sample dispatch is
       * not enabled.  This means we can't preemptively add SAMPLE to the
       * barycentrics bitfield.  Instead, we have to add it late and only
       * on-demand.  Annoyingly, changing the number of barycentrics requested
       * changes the whole PS shader payload so we very much don't want to do
       * that.  Instead, if the dynamic per-sample interpolation flag is set,
       * we check to see if SAMPLE was requested and, if not, replace the
       * highest barycentric bit in the [non]perspective grouping (CENTROID,
       * if it exists, else PIXEL) with SAMPLE.  The shader will stomp all the
       * barycentrics in the shader with SAMPLE so it really doesn't matter
       * which one we replace.  The important thing is that we keep the number
       * of barycentrics in each [non]perspective grouping the same.
       */
      if ((modes & BRW_BARYCENTRIC_PERSPECTIVE_BITS) &&
          !(modes & BITFIELD_BIT(BRW_BARYCENTRIC_PERSPECTIVE_SAMPLE))) {
         int sample_mode =
            util_last_bit(modes & BRW_BARYCENTRIC_PERSPECTIVE_BITS) - 1;
         assert(modes & BITFIELD_BIT(sample_mode));

         modes &= ~BITFIELD_BIT(sample_mode);
         modes |= BITFIELD_BIT(BRW_BARYCENTRIC_PERSPECTIVE_SAMPLE);
      }

      if ((modes & BRW_BARYCENTRIC_NONPERSPECTIVE_BITS) &&
          !(modes & BITFIELD_BIT(BRW_BARYCENTRIC_NONPERSPECTIVE_SAMPLE))) {
         int sample_mode =
            util_last_bit(modes & BRW_BARYCENTRIC_NONPERSPECTIVE_BITS) - 1;
         assert(modes & BITFIELD_BIT(sample_mode));

         modes &= ~BITFIELD_BIT(sample_mode);
         modes |= BITFIELD_BIT(BRW_BARYCENTRIC_NONPERSPECTIVE_SAMPLE);
      }
   } else {
      /* If we're not using per-sample interpolation, we need to disable the
       * per-sample bits.
       *
       * SKL PRMs, Volume 2a: Command Reference: Instructions,
       * 3DSTATE_WM:Barycentric Interpolation Mode:
       *
       *    "MSDISPMODE_PERSAMPLE is required in order to select Perspective
       *     Sample or Non-perspective Sample barycentric coordinates."
       */
      uint32_t sample_bits = (BITFIELD_BIT(BRW_BARYCENTRIC_PERSPECTIVE_SAMPLE) |
                              BITFIELD_BIT(BRW_BARYCENTRIC_NONPERSPECTIVE_SAMPLE));
      uint32_t requested_sample = modes & sample_bits;
      modes &= ~sample_bits;
      /*
       * If the shader requested some sample modes and we have to disable
       * them, make sure we add the pixel variant back so as not to mess up
       * the thread payload.
       *
       * Why does this work out? Because of the ordering in the thread payload:
       *
       *   R7:10  Perspective Centroid Barycentric
       *   R11:14 Perspective Sample Barycentric
       *   R15:18 Linear Pixel Location Barycentric
       *
       * In the backend, when persample dispatch is dynamic, we always select
       * the sample barycentric and turn off the pixel location (even if
       * requested through intrinsics). That way, when we dynamically select
       * pixel or sample dispatch, the barycentrics always match, since the
       * pixel location barycentric register offset will align with the sample
       * barycentric.
       */
      if (requested_sample) {
         if (requested_sample & BITFIELD_BIT(BRW_BARYCENTRIC_PERSPECTIVE_SAMPLE))
            modes |= BITFIELD_BIT(BRW_BARYCENTRIC_PERSPECTIVE_PIXEL);
         if (requested_sample & BITFIELD_BIT(BRW_BARYCENTRIC_NONPERSPECTIVE_SAMPLE))
            modes |= BITFIELD_BIT(BRW_BARYCENTRIC_NONPERSPECTIVE_PIXEL);
      }
   }

   return modes;
}

static inline bool
brw_wm_prog_data_is_coarse(const struct brw_wm_prog_data *prog_data,
                           enum intel_msaa_flags pushed_msaa_flags)
{
   if (pushed_msaa_flags & INTEL_MSAA_FLAG_ENABLE_DYNAMIC) {
      if (pushed_msaa_flags & INTEL_MSAA_FLAG_COARSE_RT_WRITES)
         assert(prog_data->coarse_pixel_dispatch != BRW_NEVER);
      else
         assert(prog_data->coarse_pixel_dispatch != BRW_ALWAYS);

      return pushed_msaa_flags & INTEL_MSAA_FLAG_COARSE_RT_WRITES;
   }

   assert(prog_data->coarse_pixel_dispatch == BRW_ALWAYS ||
          prog_data->coarse_pixel_dispatch == BRW_NEVER);

   return prog_data->coarse_pixel_dispatch;
}

struct brw_push_const_block {
   unsigned dwords;     /* Dword count, not reg aligned */
   unsigned regs;
   unsigned size;       /* Bytes, register aligned */
};

struct brw_cs_prog_data {
   struct brw_stage_prog_data base;

   unsigned local_size[3];

   /* Program offsets for the 8/16/32 SIMD variants.  Multiple variants are
    * kept when using variable group size, and the right one can only be
    * decided at dispatch time.
    */
   unsigned prog_offset[3];

   /* Bitmask indicating which program offsets are valid. */
   unsigned prog_mask;

   /* Bitmask indicating which programs have spilled. */
   unsigned prog_spilled;

   bool uses_barrier;
   bool uses_num_work_groups;
   bool uses_inline_data;
   bool uses_btd_stack_ids;
   bool uses_systolic;
   uint8_t generate_local_id;
   enum intel_compute_walk_order walk_order;

   struct {
      struct brw_push_const_block cross_thread;
      struct brw_push_const_block per_thread;
   } push;
};

static inline uint32_t
brw_cs_prog_data_prog_offset(const struct brw_cs_prog_data *prog_data,
                             unsigned dispatch_width)
{
   assert(dispatch_width == 8 ||
          dispatch_width == 16 ||
          dispatch_width == 32);
   const unsigned index = dispatch_width / 16;
   assert(prog_data->prog_mask & (1 << index));
   return prog_data->prog_offset[index];
}
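
/* The index computation above relies on integer division: 8 / 16 == 0,
 * 16 / 16 == 1 and 32 / 16 == 2, so the SIMD8/16/32 variants land in
 * prog_offset[0], prog_offset[1] and prog_offset[2] respectively.
 */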

struct brw_bs_prog_data {
   struct brw_stage_prog_data base;

   /** SIMD size of the root shader */
   uint8_t simd_size;

   /** Maximum stack size of all shaders */
   uint32_t max_stack_size;

   /** Offset into the shader where the resume SBT is located */
   uint32_t resume_sbt_offset;

   /** Number of resume shaders */
   uint32_t num_resume_shaders;
};

/**
 * Enum representing the i965-specific vertex results that don't correspond
 * exactly to any element of gl_varying_slot.  The values of this enum are
 * assigned such that they don't conflict with gl_varying_slot.
 */
typedef enum
{
   BRW_VARYING_SLOT_PAD = VARYING_SLOT_MAX,
   BRW_VARYING_SLOT_COUNT
} brw_varying_slot;

/**
 * Bitmask indicating which fragment shader inputs represent varyings (and
 * hence have to be delivered to the fragment shader by the SF/SBE stage).
 */
#define BRW_FS_VARYING_INPUT_MASK \
   (BITFIELD64_RANGE(0, VARYING_SLOT_MAX) & \
    ~VARYING_BIT_POS & ~VARYING_BIT_FACE)

void brw_print_vue_map(FILE *fp, const struct intel_vue_map *vue_map,
                       gl_shader_stage stage);

/**
 * Convert a VUE slot number into a byte offset within the VUE.
 */
static inline unsigned brw_vue_slot_to_offset(unsigned slot)
{
   return 16*slot;
}

/**
 * Convert a vertex output (brw_varying_slot) into a byte offset within the
 * VUE.
 */
static inline unsigned
brw_varying_to_offset(const struct intel_vue_map *vue_map, unsigned varying)
{
   return brw_vue_slot_to_offset(vue_map->varying_to_slot[varying]);
}
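
/* Worked example, assuming a VUE map that places VARYING_SLOT_COL0 in
 * slot 3: each VUE slot is one vec4, i.e. 16 bytes, so slot 3 starts at
 * byte offset 48.
 *
 *    brw_vue_slot_to_offset(3) == 48
 *    brw_varying_to_offset(vue_map, VARYING_SLOT_COL0) == 48
 */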

void brw_compute_vue_map(const struct intel_device_info *devinfo,
                         struct intel_vue_map *vue_map,
                         uint64_t slots_valid,
                         bool separate_shader,
                         uint32_t pos_slots);

void brw_compute_tess_vue_map(struct intel_vue_map *const vue_map,
                              uint64_t slots_valid,
                              uint32_t is_patch);

struct brw_vue_prog_data {
   struct brw_stage_prog_data base;
   struct intel_vue_map vue_map;

   /** Should the hardware deliver input VUE handles for URB pull loads? */
   bool include_vue_handles;

   unsigned urb_read_length;
   unsigned total_grf;

   uint32_t clip_distance_mask;
   uint32_t cull_distance_mask;

   /* Used for calculating urb partitions.  In the VS, this is the size of the
    * URB entry used for both input and output to the thread.  In the GS, this
    * is the size of the URB entry used for output.
    */
   unsigned urb_entry_size;

   enum intel_shader_dispatch_mode dispatch_mode;
};

struct brw_vs_prog_data {
   struct brw_vue_prog_data base;

   uint64_t inputs_read;
   uint64_t double_inputs_read;

   unsigned nr_attribute_slots;

   bool uses_vertexid;
   bool uses_instanceid;
   bool uses_is_indexed_draw;
   bool uses_firstvertex;
   bool uses_baseinstance;
   bool uses_drawid;
};

struct brw_tcs_prog_data
{
   struct brw_vue_prog_data base;

   /** Should the non-SINGLE_PATCH payload provide primitive ID? */
   bool include_primitive_id;

   /** Number of vertices in the output patch */
   int instances;

   /** Track patch count threshold */
   int patch_count_threshold;
};


struct brw_tes_prog_data
{
   struct brw_vue_prog_data base;

   enum intel_tess_partitioning partitioning;
   enum intel_tess_output_topology output_topology;
   enum intel_tess_domain domain;
   bool include_primitive_id;
};

struct brw_gs_prog_data
{
   struct brw_vue_prog_data base;

   unsigned vertices_in;

   /**
    * Size of an output vertex, measured in HWORDS (32 bytes).
    */
   unsigned output_vertex_size_hwords;

   unsigned output_topology;

   /**
    * Size of the control data (cut bits or StreamID bits), in hwords (32
    * bytes).  0 if there is no control data.
    */
   unsigned control_data_header_size_hwords;

   /**
    * Format of the control data (either GFX7_GS_CONTROL_DATA_FORMAT_GSCTL_SID
    * if the control data is StreamID bits, or
    * GFX7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT if the control data is cut bits).
    * Ignored if control_data_header_size is 0.
    */
   unsigned control_data_format;

   bool include_primitive_id;

   /**
    * The number of vertices emitted, if constant - otherwise -1.
    */
   int static_vertex_count;

   int invocations;
};

struct brw_tue_map {
   uint32_t size_dw;

   uint32_t per_task_data_start_dw;
};

struct brw_mue_map {
   int32_t start_dw[VARYING_SLOT_MAX];
   uint32_t len_dw[VARYING_SLOT_MAX];
   uint32_t per_primitive_indices_dw;

   uint32_t size_dw;

   uint32_t max_primitives;
   uint32_t per_primitive_start_dw;
   uint32_t per_primitive_header_size_dw;
   uint32_t per_primitive_data_size_dw;
   uint32_t per_primitive_pitch_dw;
   bool user_data_in_primitive_header;

   uint32_t max_vertices;
   uint32_t per_vertex_start_dw;
   uint32_t per_vertex_header_size_dw;
   uint32_t per_vertex_data_size_dw;
   uint32_t per_vertex_pitch_dw;
   bool user_data_in_vertex_header;
};

struct brw_task_prog_data {
   struct brw_cs_prog_data base;
   struct brw_tue_map map;
   bool uses_drawid;
};

enum brw_mesh_index_format {
   BRW_INDEX_FORMAT_U32,
   BRW_INDEX_FORMAT_U888X,
};

struct brw_mesh_prog_data {
   struct brw_cs_prog_data base;
   struct brw_mue_map map;

   uint32_t clip_distance_mask;
   uint32_t cull_distance_mask;
   uint16_t primitive_type;

   enum brw_mesh_index_format index_format;

   bool uses_drawid;
   bool autostrip_enable;
};

/* brw_any_prog_data is prog_data for any stage that maps to an API stage */
union brw_any_prog_data {
   struct brw_stage_prog_data base;
   struct brw_vue_prog_data vue;
   struct brw_vs_prog_data vs;
   struct brw_tcs_prog_data tcs;
   struct brw_tes_prog_data tes;
   struct brw_gs_prog_data gs;
   struct brw_wm_prog_data wm;
   struct brw_cs_prog_data cs;
   struct brw_bs_prog_data bs;
   struct brw_task_prog_data task;
   struct brw_mesh_prog_data mesh;
};

#define DEFINE_PROG_DATA_DOWNCAST(STAGE, CHECK)                            \
static inline struct brw_##STAGE##_prog_data *                             \
brw_##STAGE##_prog_data(struct brw_stage_prog_data *prog_data)             \
{                                                                          \
   if (prog_data)                                                          \
      assert(CHECK);                                                       \
   return (struct brw_##STAGE##_prog_data *) prog_data;                    \
}                                                                          \
static inline const struct brw_##STAGE##_prog_data *                       \
brw_##STAGE##_prog_data_const(const struct brw_stage_prog_data *prog_data) \
{                                                                          \
   if (prog_data)                                                          \
      assert(CHECK);                                                       \
   return (const struct brw_##STAGE##_prog_data *) prog_data;              \
}

DEFINE_PROG_DATA_DOWNCAST(vs,  prog_data->stage == MESA_SHADER_VERTEX)
DEFINE_PROG_DATA_DOWNCAST(tcs, prog_data->stage == MESA_SHADER_TESS_CTRL)
DEFINE_PROG_DATA_DOWNCAST(tes, prog_data->stage == MESA_SHADER_TESS_EVAL)
DEFINE_PROG_DATA_DOWNCAST(gs,  prog_data->stage == MESA_SHADER_GEOMETRY)
DEFINE_PROG_DATA_DOWNCAST(wm,  prog_data->stage == MESA_SHADER_FRAGMENT)
DEFINE_PROG_DATA_DOWNCAST(cs,  gl_shader_stage_uses_workgroup(prog_data->stage))
DEFINE_PROG_DATA_DOWNCAST(bs,  brw_shader_stage_is_bindless(prog_data->stage))

DEFINE_PROG_DATA_DOWNCAST(vue, prog_data->stage == MESA_SHADER_VERTEX ||
                               prog_data->stage == MESA_SHADER_TESS_CTRL ||
                               prog_data->stage == MESA_SHADER_TESS_EVAL ||
                               prog_data->stage == MESA_SHADER_GEOMETRY)

DEFINE_PROG_DATA_DOWNCAST(task, prog_data->stage == MESA_SHADER_TASK)
DEFINE_PROG_DATA_DOWNCAST(mesh, prog_data->stage == MESA_SHADER_MESH)

#undef DEFINE_PROG_DATA_DOWNCAST

struct brw_compile_stats {
   uint32_t dispatch_width; /**< 0 for vec4 */
   uint32_t max_polygons;
   uint32_t max_dispatch_width;
   uint32_t instructions;
   uint32_t sends;
   uint32_t loops;
   uint32_t cycles;
   uint32_t spills;
   uint32_t fills;
   uint32_t max_live_registers;
};

/** @} */

struct brw_compiler *
brw_compiler_create(void *mem_ctx, const struct intel_device_info *devinfo);

/**
 * Returns a compiler configuration for use with disk shader cache
 *
 * This value only needs to change for settings that can cause different
 * program generation between two runs on the same hardware.
 *
 * For example, it doesn't need to be different for gen 8 and gen 9 hardware,
 * but it does need to be different if INTEL_DEBUG=nocompact is or isn't used.
 */
uint64_t
brw_get_compiler_config_value(const struct brw_compiler *compiler);

/* Provides a string sha1 hash of all device information fields that could
 * affect shader compilation.
 */
void
brw_device_sha1(char *hex, const struct intel_device_info *devinfo);

/* For callers computing their own UUID or hash.  Hashes all device
 * information fields that could affect shader compilation into the provided
 * sha1_ctx.
 */
void
brw_device_sha1_update(struct mesa_sha1 *sha1_ctx,
                       const struct intel_device_info *devinfo);

unsigned
brw_prog_data_size(gl_shader_stage stage);

unsigned
brw_prog_key_size(gl_shader_stage stage);

struct brw_compile_params {
   void *mem_ctx;

   nir_shader *nir;

   struct brw_compile_stats *stats;

   void *log_data;

   char *error_str;

   uint64_t debug_flag;

   uint32_t source_hash;
};

/**
 * Parameters for compiling a vertex shader.
 *
 * Some of these will be modified during the shader compilation.
 */
struct brw_compile_vs_params {
   struct brw_compile_params base;

   const struct brw_vs_prog_key *key;
   struct brw_vs_prog_data *prog_data;
};

/**
 * Compile a vertex shader.
 *
 * Returns the final assembly and updates the parameters structure.
 */
const unsigned *
brw_compile_vs(const struct brw_compiler *compiler,
               struct brw_compile_vs_params *params);
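
/* A minimal usage sketch (error handling shortened), assuming driver-owned
 * "devinfo" and "nir".  A driver typically creates one brw_compiler per
 * device and reuses it for all compilations:
 *
 *    void *mem_ctx = ralloc_context(NULL);
 *    struct brw_compiler *compiler = brw_compiler_create(mem_ctx, devinfo);
 *
 *    struct brw_vs_prog_key key = {0};
 *    struct brw_vs_prog_data prog_data = {0};
 *    struct brw_compile_vs_params params = {
 *       .base = { .mem_ctx = mem_ctx, .nir = nir },
 *       .key = &key,
 *       .prog_data = &prog_data,
 *    };
 *    const unsigned *assembly = brw_compile_vs(compiler, &params);
 *    if (assembly == NULL)
 *       fprintf(stderr, "VS compile failed: %s\n", params.base.error_str);
 */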

/**
 * Parameters for compiling a tessellation control shader.
 *
 * Some of these will be modified during the shader compilation.
 */
struct brw_compile_tcs_params {
   struct brw_compile_params base;

   const struct brw_tcs_prog_key *key;
   struct brw_tcs_prog_data *prog_data;
};

/**
 * Compile a tessellation control shader.
 *
 * Returns the final assembly and updates the parameters structure.
 */
const unsigned *
brw_compile_tcs(const struct brw_compiler *compiler,
                struct brw_compile_tcs_params *params);

/**
 * Parameters for compiling a tessellation evaluation shader.
 *
 * Some of these will be modified during the shader compilation.
 */
struct brw_compile_tes_params {
   struct brw_compile_params base;

   const struct brw_tes_prog_key *key;
   struct brw_tes_prog_data *prog_data;
   const struct intel_vue_map *input_vue_map;
};

/**
 * Compile a tessellation evaluation shader.
 *
 * Returns the final assembly and updates the parameters structure.
 */
const unsigned *
brw_compile_tes(const struct brw_compiler *compiler,
                struct brw_compile_tes_params *params);

/**
 * Parameters for compiling a geometry shader.
 *
 * Some of these will be modified during the shader compilation.
 */
struct brw_compile_gs_params {
   struct brw_compile_params base;

   const struct brw_gs_prog_key *key;
   struct brw_gs_prog_data *prog_data;
};

/**
 * Compile a geometry shader.
 *
 * Returns the final assembly and updates the parameters structure.
 */
const unsigned *
brw_compile_gs(const struct brw_compiler *compiler,
               struct brw_compile_gs_params *params);

struct brw_compile_task_params {
   struct brw_compile_params base;

   const struct brw_task_prog_key *key;
   struct brw_task_prog_data *prog_data;
};

const unsigned *
brw_compile_task(const struct brw_compiler *compiler,
                 struct brw_compile_task_params *params);

struct brw_compile_mesh_params {
   struct brw_compile_params base;

   const struct brw_mesh_prog_key *key;
   struct brw_mesh_prog_data *prog_data;
   const struct brw_tue_map *tue_map;
};

const unsigned *
brw_compile_mesh(const struct brw_compiler *compiler,
                 struct brw_compile_mesh_params *params);

/**
 * Parameters for compiling a fragment shader.
 *
 * Some of these will be modified during the shader compilation.
 */
struct brw_compile_fs_params {
   struct brw_compile_params base;

   const struct brw_wm_prog_key *key;
   struct brw_wm_prog_data *prog_data;

   const struct intel_vue_map *vue_map;
   const struct brw_mue_map *mue_map;

   bool allow_spilling;
   bool use_rep_send;
   uint8_t max_polygons;
};

/**
 * Compile a fragment shader.
 *
 * Returns the final assembly and updates the parameters structure.
 */
const unsigned *
brw_compile_fs(const struct brw_compiler *compiler,
               struct brw_compile_fs_params *params);

/**
 * Parameters for compiling a compute shader.
 *
 * Some of these will be modified during the shader compilation.
 */
struct brw_compile_cs_params {
   struct brw_compile_params base;

   const struct brw_cs_prog_key *key;
   struct brw_cs_prog_data *prog_data;
};

/**
 * Compile a compute shader.
 *
 * Returns the final assembly and updates the parameters structure.
 */
const unsigned *
brw_compile_cs(const struct brw_compiler *compiler,
               struct brw_compile_cs_params *params);

/**
 * Parameters for compiling a Bindless shader.
 *
 * Some of these will be modified during the shader compilation.
 */
struct brw_compile_bs_params {
   struct brw_compile_params base;

   const struct brw_bs_prog_key *key;
   struct brw_bs_prog_data *prog_data;

   unsigned num_resume_shaders;
   struct nir_shader **resume_shaders;
};

/**
 * Compile a Bindless shader.
 *
 * Returns the final assembly and updates the parameters structure.
 */
const unsigned *
brw_compile_bs(const struct brw_compiler *compiler,
               struct brw_compile_bs_params *params);

void brw_debug_key_recompile(const struct brw_compiler *c, void *log,
                             gl_shader_stage stage,
                             const struct brw_base_prog_key *old_key,
                             const struct brw_base_prog_key *key);

unsigned
brw_cs_push_const_total_size(const struct brw_cs_prog_data *cs_prog_data,
                             unsigned threads);

void
brw_write_shader_relocs(const struct brw_isa_info *isa,
                        void *program,
                        const struct brw_stage_prog_data *prog_data,
                        struct brw_shader_reloc_value *values,
                        unsigned num_values);

/**
 * Get the dispatch information for a shader to be used with GPGPU_WALKER and
 * similar instructions.
 *
 * If override_local_size is not NULL, it must point to a 3-element array
 * that will override the value from prog_data->local_size.  This is used by
 * ARB_compute_variable_group_size, where the size is set only at dispatch
 * time (so prog_data is outdated).
 */
struct intel_cs_dispatch_info
brw_cs_get_dispatch_info(const struct intel_device_info *devinfo,
                         const struct brw_cs_prog_data *prog_data,
                         const unsigned *override_local_size);

/**
 * Return true if the given shader stage is dispatched contiguously by the
 * relevant fixed function starting from channel 0 of the SIMD thread, which
 * implies that the dispatch mask of a thread can be assumed to have the form
 * '2^n - 1' for some n.
 */
static inline bool
brw_stage_has_packed_dispatch(ASSERTED const struct intel_device_info *devinfo,
                              gl_shader_stage stage, unsigned max_polygons,
                              const struct brw_stage_prog_data *prog_data)
{
   /* The code below makes assumptions about the hardware's thread dispatch
    * behavior that could be proven wrong in future generations -- Make sure
    * to do a full test run with brw_fs_test_dispatch_packing() hooked up to
    * the NIR front-end before changing this assertion. It can be temporarily
    * enabled by setting the macro below to true.
    */
   #define ENABLE_FS_TEST_DISPATCH_PACKING false
   assert(devinfo->ver <= 20);

   switch (stage) {
   case MESA_SHADER_FRAGMENT: {
      /* The PSD discards subspans coming in with no lit samples, which in the
       * per-pixel shading case implies that each subspan will either be fully
       * lit (due to the VMask being used to allow derivative computations),
       * or not dispatched at all.  In per-sample dispatch mode individual
       * samples from the same subspan have a fixed relative location within
       * the SIMD thread, so dispatch of unlit samples cannot be avoided in
       * general and we should return false.
       */
      const struct brw_wm_prog_data *wm_prog_data =
         (const struct brw_wm_prog_data *)prog_data;
      return devinfo->verx10 < 125 &&
             !wm_prog_data->persample_dispatch &&
             wm_prog_data->uses_vmask &&
             max_polygons < 2;
   }
   case MESA_SHADER_COMPUTE:
      /* Compute shaders will be spawned with either a fully enabled dispatch
       * mask or with whatever bottom/right execution mask was given to the
       * GPGPU walker command to be used along the workgroup edges -- In both
       * cases the dispatch mask is required to be tightly packed for our
       * invocation index calculations to work.
       */
      return true;
   default:
      /* Most remaining fixed functions are limited to use a packed dispatch
       * mask due to the hardware representation of the dispatch mask as a
       * single counter representing the number of enabled channels.
       */
      return true;
   }
}

/**
 * Computes the first varying slot in the URB produced by the previous stage
 * that is used in the next stage. We do this by testing the varying slots in
 * the previous stage's vue map against the inputs read in the next stage.
 *
 * Note that:
 *
 * - Each URB offset contains two varying slots and we can only skip a
 *   full offset if both slots are unused, so the value we return here is always
 *   rounded down to the closest multiple of two.
 *
 * - gl_Layer and gl_ViewportIndex don't have their own varying slots, they are
 *   part of the vue header, so if these are read we can't skip anything.
 */
static inline int
brw_compute_first_urb_slot_required(uint64_t inputs_read,
                                    const struct intel_vue_map *prev_stage_vue_map)
{
   if ((inputs_read & (VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT | VARYING_BIT_PRIMITIVE_SHADING_RATE)) == 0) {
      for (int i = 0; i < prev_stage_vue_map->num_slots; i++) {
         int varying = prev_stage_vue_map->slot_to_varying[i];
         if (varying != BRW_VARYING_SLOT_PAD && varying > 0 &&
             (inputs_read & BITFIELD64_BIT(varying)) != 0) {
            return ROUND_DOWN_TO(i, 2);
         }
      }
   }

   return 0;
}
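
/* Worked example: suppose the previous stage's VUE map has VARYING_SLOT_POS
 * in slot 0, VARYING_SLOT_TEX0 in slot 1 and VARYING_SLOT_TEX1 in slot 2,
 * and the consuming stage reads only TEX1.  The first used slot is 2,
 * already a multiple of two, so the function returns 2 and one full URB
 * offset (slots 0-1) can be skipped.  If TEX0 were also read, the first
 * used slot would be 1, rounded down to 0.
 */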

/* From InlineData in 3DSTATE_TASK_SHADER_DATA and 3DSTATE_MESH_SHADER_DATA. */
#define BRW_TASK_MESH_INLINE_DATA_SIZE_DW 8

/* InlineData[0-1] is used for the Vulkan descriptor. */
#define BRW_TASK_MESH_PUSH_CONSTANTS_START_DW 2

#define BRW_TASK_MESH_PUSH_CONSTANTS_SIZE_DW \
   (BRW_TASK_MESH_INLINE_DATA_SIZE_DW - BRW_TASK_MESH_PUSH_CONSTANTS_START_DW)

/**
 * This enum is used as the base index of the nir_load_topology_id_intel
 * intrinsic. This is used to return different values based on some aspect of
 * the topology of the device.
 */
enum brw_topology_id
{
   /* A value based on the DSS identifier the shader is currently running on.
    * Be mindful that the DSS ID can be higher than the total number of DSS on
    * the device. This is because of the fusing that can occur on different
    * parts.
    */
   BRW_TOPOLOGY_ID_DSS,

   /* A value composed of the EU ID, thread ID & SIMD lane ID. */
   BRW_TOPOLOGY_ID_EU_THREAD_SIMD,
};

#ifdef __cplusplus
} /* extern "C" */
#endif

#endif /* BRW_COMPILER_H */