xref: /aosp_15_r20/external/mesa3d/src/gallium/drivers/crocus/crocus_state.c (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 /*
2  * Copyright © 2017 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice shall be included
12  * in all copies or substantial portions of the Software.
13  *
14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
15  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
17  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20  * DEALINGS IN THE SOFTWARE.
21  */
22 
23 /**
24  * @file crocus_state.c
25  *
26  * ============================= GENXML CODE =============================
27  *              [This file is compiled once per generation.]
28  * =======================================================================
29  *
30  * This is the main state upload code.
31  *
32  * Gallium uses Constant State Objects, or CSOs, for most state.  Large,
33  * complex, or highly reusable state can be created once, and bound and
34  * rebound multiple times.  This is modeled with the pipe->create_*_state()
35  * and pipe->bind_*_state() hooks.  Highly dynamic or inexpensive state is
36  * streamed out on the fly, via pipe->set_*_state() hooks.
37  *
38  * OpenGL involves frequently mutating context state, which is mirrored in
39  * core Mesa by highly mutable data structures.  However, most applications
40  * typically draw the same things over and over - from frame to frame, most
41  * of the same objects are still visible and need to be redrawn.  So, rather
42  * than inventing new state all the time, applications usually mutate to swap
43  * between known states that we've seen before.
44  *
45  * Gallium isolates us from this mutation by tracking API state, and
46  * distilling it into a set of Constant State Objects, or CSOs.  Large,
47  * complex, or typically reusable state can be created once, then reused
48  * multiple times.  Drivers can create and store their own associated data.
49  * This create/bind model corresponds to the pipe->create_*_state() and
50  * pipe->bind_*_state() driver hooks.
51  *
52  * Some state is cheap to create, or expected to be highly dynamic.  Rather
53  * than creating and caching piles of CSOs for these, Gallium simply streams
54  * them out, via the pipe->set_*_state() driver hooks.
55  *
56  * To reduce draw time overhead, we try to compute as much state at create
57  * time as possible.  Wherever possible, we translate the Gallium pipe state
58  * to 3DSTATE commands, and store those commands in the CSO.  At draw time,
59  * we can simply memcpy them into a batch buffer.
60  *
61  * No hardware matches the abstraction perfectly, so some commands require
62  * information from multiple CSOs.  In this case, we can store two copies
63  * of the packet (one in each CSO), and simply | together their DWords at
64  * draw time.  Sometimes the second set is trivial (one or two fields), so
65  * we simply pack it at draw time.
66  *
67  * There are two main components in the file below.  First, the CSO hooks
68  * create/bind/track state.  The second are the draw-time upload functions,
69  * crocus_upload_render_state() and crocus_upload_compute_state(), which read
70  * the context state and emit the commands into the actual batch.
71  */
72 
73 #include <errno.h>
74 #include <stdio.h>
75 
76 #if HAVE_VALGRIND
77 #include <memcheck.h>
78 #include <valgrind.h>
79 #define VG(x) x
80 #else
81 #define VG(x)
82 #endif
83 
84 #include "drm-uapi/i915_drm.h"
85 #include "intel/common/intel_compute_slm.h"
86 #include "intel/common/intel_l3_config.h"
87 #include "intel/common/intel_sample_positions.h"
88 #include "intel/compiler/elk/elk_compiler.h"
89 #include "compiler/shader_info.h"
90 #include "pipe/p_context.h"
91 #include "pipe/p_defines.h"
92 #include "pipe/p_screen.h"
93 #include "pipe/p_state.h"
94 #include "util/format/u_format.h"
95 #include "util/half_float.h"
96 #include "util/u_dual_blend.h"
97 #include "util/u_framebuffer.h"
98 #include "util/u_helpers.h"
99 #include "util/u_inlines.h"
100 #include "util/u_memory.h"
101 #include "util/u_prim.h"
102 #include "util/u_transfer.h"
103 #include "util/u_upload_mgr.h"
104 #include "util/u_viewport.h"
105 #include "crocus_batch.h"
106 #include "crocus_context.h"
107 #include "crocus_defines.h"
108 #include "crocus_pipe.h"
109 #include "crocus_resource.h"
110 
111 #include "crocus_genx_macros.h"
112 #include "intel/common/intel_genX_state_elk.h"
113 #include "intel/common/intel_guardband.h"
114 #include "main/macros.h" /* UNCLAMPED_* */
115 
/**
 * Statically assert that PIPE_* enums match the hardware packets.
 * (As long as they match, we don't need to translate them.)
 */
UNUSED static void pipe_asserts()
{
#define PIPE_ASSERT(x) STATIC_ASSERT((int)x)

   /* pipe_logicop happens to match the hardware. */
   PIPE_ASSERT(PIPE_LOGICOP_CLEAR == LOGICOP_CLEAR);
   PIPE_ASSERT(PIPE_LOGICOP_NOR == LOGICOP_NOR);
   PIPE_ASSERT(PIPE_LOGICOP_AND_INVERTED == LOGICOP_AND_INVERTED);
   PIPE_ASSERT(PIPE_LOGICOP_COPY_INVERTED == LOGICOP_COPY_INVERTED);
   PIPE_ASSERT(PIPE_LOGICOP_AND_REVERSE == LOGICOP_AND_REVERSE);
   PIPE_ASSERT(PIPE_LOGICOP_INVERT == LOGICOP_INVERT);
   PIPE_ASSERT(PIPE_LOGICOP_XOR == LOGICOP_XOR);
   PIPE_ASSERT(PIPE_LOGICOP_NAND == LOGICOP_NAND);
   PIPE_ASSERT(PIPE_LOGICOP_AND == LOGICOP_AND);
   PIPE_ASSERT(PIPE_LOGICOP_EQUIV == LOGICOP_EQUIV);
   PIPE_ASSERT(PIPE_LOGICOP_NOOP == LOGICOP_NOOP);
   PIPE_ASSERT(PIPE_LOGICOP_OR_INVERTED == LOGICOP_OR_INVERTED);
   PIPE_ASSERT(PIPE_LOGICOP_COPY == LOGICOP_COPY);
   PIPE_ASSERT(PIPE_LOGICOP_OR_REVERSE == LOGICOP_OR_REVERSE);
   PIPE_ASSERT(PIPE_LOGICOP_OR == LOGICOP_OR);
   PIPE_ASSERT(PIPE_LOGICOP_SET == LOGICOP_SET);

   /* pipe_blendfactor happens to match the hardware. */
   PIPE_ASSERT(PIPE_BLENDFACTOR_ONE == BLENDFACTOR_ONE);
   PIPE_ASSERT(PIPE_BLENDFACTOR_SRC_COLOR == BLENDFACTOR_SRC_COLOR);
   PIPE_ASSERT(PIPE_BLENDFACTOR_SRC_ALPHA == BLENDFACTOR_SRC_ALPHA);
   PIPE_ASSERT(PIPE_BLENDFACTOR_DST_ALPHA == BLENDFACTOR_DST_ALPHA);
   PIPE_ASSERT(PIPE_BLENDFACTOR_DST_COLOR == BLENDFACTOR_DST_COLOR);
   PIPE_ASSERT(PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE == BLENDFACTOR_SRC_ALPHA_SATURATE);
   PIPE_ASSERT(PIPE_BLENDFACTOR_CONST_COLOR == BLENDFACTOR_CONST_COLOR);
   PIPE_ASSERT(PIPE_BLENDFACTOR_CONST_ALPHA == BLENDFACTOR_CONST_ALPHA);
   PIPE_ASSERT(PIPE_BLENDFACTOR_SRC1_COLOR == BLENDFACTOR_SRC1_COLOR);
   PIPE_ASSERT(PIPE_BLENDFACTOR_SRC1_ALPHA == BLENDFACTOR_SRC1_ALPHA);
   PIPE_ASSERT(PIPE_BLENDFACTOR_ZERO == BLENDFACTOR_ZERO);
   PIPE_ASSERT(PIPE_BLENDFACTOR_INV_SRC_COLOR == BLENDFACTOR_INV_SRC_COLOR);
   PIPE_ASSERT(PIPE_BLENDFACTOR_INV_SRC_ALPHA == BLENDFACTOR_INV_SRC_ALPHA);
   PIPE_ASSERT(PIPE_BLENDFACTOR_INV_DST_ALPHA == BLENDFACTOR_INV_DST_ALPHA);
   PIPE_ASSERT(PIPE_BLENDFACTOR_INV_DST_COLOR == BLENDFACTOR_INV_DST_COLOR);
   PIPE_ASSERT(PIPE_BLENDFACTOR_INV_CONST_COLOR == BLENDFACTOR_INV_CONST_COLOR);
   PIPE_ASSERT(PIPE_BLENDFACTOR_INV_CONST_ALPHA == BLENDFACTOR_INV_CONST_ALPHA);
   PIPE_ASSERT(PIPE_BLENDFACTOR_INV_SRC1_COLOR == BLENDFACTOR_INV_SRC1_COLOR);
   PIPE_ASSERT(PIPE_BLENDFACTOR_INV_SRC1_ALPHA == BLENDFACTOR_INV_SRC1_ALPHA);

   /* pipe_blend_func happens to match the hardware. */
   PIPE_ASSERT(PIPE_BLEND_ADD == BLENDFUNCTION_ADD);
   PIPE_ASSERT(PIPE_BLEND_SUBTRACT == BLENDFUNCTION_SUBTRACT);
   PIPE_ASSERT(PIPE_BLEND_REVERSE_SUBTRACT == BLENDFUNCTION_REVERSE_SUBTRACT);
   PIPE_ASSERT(PIPE_BLEND_MIN == BLENDFUNCTION_MIN);
   PIPE_ASSERT(PIPE_BLEND_MAX == BLENDFUNCTION_MAX);

   /* pipe_stencil_op happens to match the hardware.
    * Note the WRAP/SAT swap: Gallium's INCR/DECR saturate, while the
    * hardware's STENCILOP_INCR/DECR wrap.
    */
   PIPE_ASSERT(PIPE_STENCIL_OP_KEEP == STENCILOP_KEEP);
   PIPE_ASSERT(PIPE_STENCIL_OP_ZERO == STENCILOP_ZERO);
   PIPE_ASSERT(PIPE_STENCIL_OP_REPLACE == STENCILOP_REPLACE);
   PIPE_ASSERT(PIPE_STENCIL_OP_INCR == STENCILOP_INCRSAT);
   PIPE_ASSERT(PIPE_STENCIL_OP_DECR == STENCILOP_DECRSAT);
   PIPE_ASSERT(PIPE_STENCIL_OP_INCR_WRAP == STENCILOP_INCR);
   PIPE_ASSERT(PIPE_STENCIL_OP_DECR_WRAP == STENCILOP_DECR);
   PIPE_ASSERT(PIPE_STENCIL_OP_INVERT == STENCILOP_INVERT);

#if GFX_VER >= 6
   /* pipe_sprite_coord_mode happens to match 3DSTATE_SBE */
   PIPE_ASSERT(PIPE_SPRITE_COORD_UPPER_LEFT == UPPERLEFT);
   PIPE_ASSERT(PIPE_SPRITE_COORD_LOWER_LEFT == LOWERLEFT);
#endif
#undef PIPE_ASSERT
}
187 
/**
 * Translate a Gallium/Mesa primitive type to a hardware 3DPRIM_* topology.
 *
 * For MESA_PRIM_PATCHES, the hardware encodes the control-point count in
 * the topology itself (_3DPRIM_PATCHLIST_1 through _PATCHLIST_32), so the
 * table stores _3DPRIM_PATCHLIST_1 - 1 and verts_per_patch is added below.
 */
static unsigned
translate_prim_type(enum mesa_prim prim, uint8_t verts_per_patch)
{
   static const unsigned map[] = {
      [MESA_PRIM_POINTS]                   = _3DPRIM_POINTLIST,
      [MESA_PRIM_LINES]                    = _3DPRIM_LINELIST,
      [MESA_PRIM_LINE_LOOP]                = _3DPRIM_LINELOOP,
      [MESA_PRIM_LINE_STRIP]               = _3DPRIM_LINESTRIP,
      [MESA_PRIM_TRIANGLES]                = _3DPRIM_TRILIST,
      [MESA_PRIM_TRIANGLE_STRIP]           = _3DPRIM_TRISTRIP,
      [MESA_PRIM_TRIANGLE_FAN]             = _3DPRIM_TRIFAN,
      [MESA_PRIM_QUADS]                    = _3DPRIM_QUADLIST,
      [MESA_PRIM_QUAD_STRIP]               = _3DPRIM_QUADSTRIP,
      [MESA_PRIM_POLYGON]                  = _3DPRIM_POLYGON,
#if GFX_VER >= 6
      [MESA_PRIM_LINES_ADJACENCY]          = _3DPRIM_LINELIST_ADJ,
      [MESA_PRIM_LINE_STRIP_ADJACENCY]     = _3DPRIM_LINESTRIP_ADJ,
      [MESA_PRIM_TRIANGLES_ADJACENCY]      = _3DPRIM_TRILIST_ADJ,
      [MESA_PRIM_TRIANGLE_STRIP_ADJACENCY] = _3DPRIM_TRISTRIP_ADJ,
#endif
#if GFX_VER >= 7
      [MESA_PRIM_PATCHES]                  = _3DPRIM_PATCHLIST_1 - 1,
#endif
   };

   /* PATCHLIST topologies are consecutive, so adding the per-patch vertex
    * count selects _3DPRIM_PATCHLIST_<verts_per_patch>.
    */
   return map[prim] + (prim == MESA_PRIM_PATCHES ? verts_per_patch : 0);
}
215 
216 static unsigned
translate_compare_func(enum pipe_compare_func pipe_func)217 translate_compare_func(enum pipe_compare_func pipe_func)
218 {
219    static const unsigned map[] = {
220       [PIPE_FUNC_NEVER]    = COMPAREFUNCTION_NEVER,
221       [PIPE_FUNC_LESS]     = COMPAREFUNCTION_LESS,
222       [PIPE_FUNC_EQUAL]    = COMPAREFUNCTION_EQUAL,
223       [PIPE_FUNC_LEQUAL]   = COMPAREFUNCTION_LEQUAL,
224       [PIPE_FUNC_GREATER]  = COMPAREFUNCTION_GREATER,
225       [PIPE_FUNC_NOTEQUAL] = COMPAREFUNCTION_NOTEQUAL,
226       [PIPE_FUNC_GEQUAL]   = COMPAREFUNCTION_GEQUAL,
227       [PIPE_FUNC_ALWAYS]   = COMPAREFUNCTION_ALWAYS,
228    };
229    return map[pipe_func];
230 }
231 
232 static unsigned
translate_shadow_func(enum pipe_compare_func pipe_func)233 translate_shadow_func(enum pipe_compare_func pipe_func)
234 {
235    /* Gallium specifies the result of shadow comparisons as:
236     *
237     *    1 if ref <op> texel,
238     *    0 otherwise.
239     *
240     * The hardware does:
241     *
242     *    0 if texel <op> ref,
243     *    1 otherwise.
244     *
245     * So we need to flip the operator and also negate.
246     */
247    static const unsigned map[] = {
248       [PIPE_FUNC_NEVER]    = PREFILTEROP_ALWAYS,
249       [PIPE_FUNC_LESS]     = PREFILTEROP_LEQUAL,
250       [PIPE_FUNC_EQUAL]    = PREFILTEROP_NOTEQUAL,
251       [PIPE_FUNC_LEQUAL]   = PREFILTEROP_LESS,
252       [PIPE_FUNC_GREATER]  = PREFILTEROP_GEQUAL,
253       [PIPE_FUNC_NOTEQUAL] = PREFILTEROP_EQUAL,
254       [PIPE_FUNC_GEQUAL]   = PREFILTEROP_GREATER,
255       [PIPE_FUNC_ALWAYS]   = PREFILTEROP_NEVER,
256    };
257    return map[pipe_func];
258 }
259 
260 static unsigned
translate_cull_mode(unsigned pipe_face)261 translate_cull_mode(unsigned pipe_face)
262 {
263    static const unsigned map[4] = {
264       [PIPE_FACE_NONE]           = CULLMODE_NONE,
265       [PIPE_FACE_FRONT]          = CULLMODE_FRONT,
266       [PIPE_FACE_BACK]           = CULLMODE_BACK,
267       [PIPE_FACE_FRONT_AND_BACK] = CULLMODE_BOTH,
268    };
269    return map[pipe_face];
270 }
271 
272 #if GFX_VER >= 6
273 static unsigned
translate_fill_mode(unsigned pipe_polymode)274 translate_fill_mode(unsigned pipe_polymode)
275 {
276    static const unsigned map[4] = {
277       [PIPE_POLYGON_MODE_FILL]           = FILL_MODE_SOLID,
278       [PIPE_POLYGON_MODE_LINE]           = FILL_MODE_WIREFRAME,
279       [PIPE_POLYGON_MODE_POINT]          = FILL_MODE_POINT,
280       [PIPE_POLYGON_MODE_FILL_RECTANGLE] = FILL_MODE_SOLID,
281    };
282    return map[pipe_polymode];
283 }
284 #endif
285 
286 static unsigned
translate_mip_filter(enum pipe_tex_mipfilter pipe_mip)287 translate_mip_filter(enum pipe_tex_mipfilter pipe_mip)
288 {
289    static const unsigned map[] = {
290       [PIPE_TEX_MIPFILTER_NEAREST] = MIPFILTER_NEAREST,
291       [PIPE_TEX_MIPFILTER_LINEAR]  = MIPFILTER_LINEAR,
292       [PIPE_TEX_MIPFILTER_NONE]    = MIPFILTER_NONE,
293    };
294    return map[pipe_mip];
295 }
296 
/**
 * Translate a pipe_tex_wrap mode to a hardware Texture Coordinate Mode.
 *
 * \param either_nearest used for the pre-Gen8 GL_CLAMP special case below;
 *        presumably set when nearest min/mag filtering is in use — confirm
 *        against the sampler-state caller.
 */
static uint32_t
translate_wrap(unsigned pipe_wrap, bool either_nearest)
{
   static const unsigned map[] = {
      [PIPE_TEX_WRAP_REPEAT]                 = TCM_WRAP,
#if GFX_VER == 8
      /* Gen8 has a dedicated half-border mode for GL_CLAMP semantics. */
      [PIPE_TEX_WRAP_CLAMP]                  = TCM_HALF_BORDER,
#else
      [PIPE_TEX_WRAP_CLAMP]                  = TCM_CLAMP_BORDER,
#endif
      [PIPE_TEX_WRAP_CLAMP_TO_EDGE]          = TCM_CLAMP,
      [PIPE_TEX_WRAP_CLAMP_TO_BORDER]        = TCM_CLAMP_BORDER,
      [PIPE_TEX_WRAP_MIRROR_REPEAT]          = TCM_MIRROR,
      [PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE]   = TCM_MIRROR_ONCE,

      /* These are unsupported. */
      [PIPE_TEX_WRAP_MIRROR_CLAMP]           = -1,
      [PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER] = -1,
   };
#if GFX_VER < 8
   /* With nearest filtering, GL_CLAMP behaves like CLAMP_TO_EDGE, so use
    * plain TCM_CLAMP instead of border clamping.
    */
   if (pipe_wrap == PIPE_TEX_WRAP_CLAMP && either_nearest)
      return TCM_CLAMP;
#endif
   return map[pipe_wrap];
}
322 
/**
 * Allocate space in the batch's dynamic state buffer and return a pointer
 * to it.  (Equivalent of elk_state_batch.)
 *
 * If the allocation would exceed STATE_SZ, the batch is flushed (unless
 * wrapping is disallowed); if it would merely outgrow the current BO, the
 * buffer is grown.
 *
 * \param out_offset receives the aligned byte offset of the allocation
 *        within the state buffer.
 * \return a CPU pointer to the allocated space in the mapped state buffer.
 */
static uint32_t *
stream_state(struct crocus_batch *batch,
             unsigned size,
             unsigned alignment,
             uint32_t *out_offset)
{
   uint32_t offset = ALIGN(batch->state.used, alignment);

   if (offset + size >= STATE_SZ && !batch->no_wrap) {
      crocus_batch_flush(batch);
      /* The flush started a fresh batch; recompute the offset. */
      offset = ALIGN(batch->state.used, alignment);
   } else if (offset + size >= batch->state.bo->size) {
      /* Grow by 1.5x, capped at MAX_STATE_SIZE. */
      const unsigned new_size =
         MIN2(batch->state.bo->size + batch->state.bo->size / 2,
              MAX_STATE_SIZE);
      crocus_grow_buffer(batch, true, batch->state.used, new_size);
      assert(offset + size < batch->state.bo->size);
   }

   crocus_record_state_size(batch->state_sizes, offset, size);

   batch->state.used = offset + size;
   *out_offset = offset;

   /* The map is a uint32_t pointer; offset is in bytes. */
   return (uint32_t *)batch->state.map + (offset >> 2);
}
352 
/**
 * stream_state() + memcpy: allocate space in the state buffer, copy
 * \p data into it, and return the offset of the copy.
 */
static uint32_t
emit_state(struct crocus_batch *batch, const void *data, unsigned size,
           unsigned alignment)
{
   unsigned state_offset = 0;
   uint32_t *dest = stream_state(batch, size, alignment, &state_offset);

   if (dest != NULL)
      memcpy(dest, data, size);

   return state_offset;
}
368 
369 #if GFX_VER <= 5
/**
 * Emit 3DSTATE_PIPELINED_POINTERS, pointing the fixed-function pipeline
 * stages (VS, optional GS, CLIP, SF, WM, CC) at their indirect state
 * packets in the batch's state buffer.
 *
 * All offsets are into batch->state.bo; the GS pointer is only valid
 * (and only emitted) when gs_active is set.
 */
static void
upload_pipelined_state_pointers(struct crocus_batch *batch,
                                bool gs_active, uint32_t gs_offset,
                                uint32_t vs_offset, uint32_t sf_offset,
                                uint32_t clip_offset, uint32_t wm_offset, uint32_t cc_offset)
{
#if GFX_VER == 5
   /* Need to flush before changing clip max threads for errata. */
   crocus_emit_cmd(batch, GENX(MI_FLUSH), foo);
#endif

   crocus_emit_cmd(batch, GENX(3DSTATE_PIPELINED_POINTERS), pp) {
      pp.PointertoVSState = ro_bo(batch->state.bo, vs_offset);
      pp.GSEnable = gs_active;
      if (gs_active)
         pp.PointertoGSState = ro_bo(batch->state.bo, gs_offset);
      pp.ClipEnable = true;
      pp.PointertoCLIPState = ro_bo(batch->state.bo, clip_offset);
      pp.PointertoSFState = ro_bo(batch->state.bo, sf_offset);
      pp.PointertoWMState = ro_bo(batch->state.bo, wm_offset);
      pp.PointertoColorCalcState = ro_bo(batch->state.bo, cc_offset);
   }
}
393 
394 #endif
/**
 * Did field 'x' change between 'old_cso' and 'new_cso'?
 *
 * (If so, we may want to set some dirty flags.)
 *
 * cso_changed_memcmp() is the variant for array/struct fields, comparing
 * all sizeof(old_cso->x) bytes.  Both treat a missing old CSO (first bind)
 * as "changed".
 */
#define cso_changed(x) (!old_cso || (old_cso->x != new_cso->x))
#define cso_changed_memcmp(x) \
   (!old_cso || memcmp(old_cso->x, new_cso->x, sizeof(old_cso->x)) != 0)
403 
/**
 * Emit the end-of-pipe sync required before changing STATE_BASE_ADDRESS.
 * No-op on Gen5 and earlier.
 */
static void
flush_before_state_base_change(struct crocus_batch *batch)
{
#if GFX_VER >= 6
   /* Flush before emitting STATE_BASE_ADDRESS.
    *
    * This isn't documented anywhere in the PRM.  However, it seems to be
    * necessary prior to changing the surface state base address.  We've
    * seen issues in Vulkan where we get GPU hangs when using multi-level
    * command buffers which clear depth, reset state base address, and then
    * go render stuff.
    *
    * Normally, in GL, we would trust the kernel to do sufficient stalls
    * and flushes prior to executing our batch.  However, it doesn't seem
    * as if the kernel's flushing is always sufficient and we don't want to
    * rely on it.
    *
    * We make this an end-of-pipe sync instead of a normal flush because we
    * do not know the current status of the GPU.  On Haswell at least,
    * having a fast-clear operation in flight at the same time as a normal
    * rendering operation can cause hangs.  Since the kernel's flushing is
    * insufficient, we need to ensure that any rendering operations from
    * other processes are definitely complete before we try to do our own
    * rendering.  It's a bit of a big hammer but it appears to work.
    */
   const unsigned dc_flush =
      GFX_VER >= 7 ? PIPE_CONTROL_DATA_CACHE_FLUSH : 0;
   crocus_emit_end_of_pipe_sync(batch,
                                "change STATE_BASE_ADDRESS (flushes)",
                                PIPE_CONTROL_RENDER_TARGET_FLUSH |
                                dc_flush |
                                PIPE_CONTROL_DEPTH_CACHE_FLUSH);
#endif
}
438 
/**
 * Emit the cache invalidations required after changing STATE_BASE_ADDRESS.
 * No-op on Gen5 and earlier.
 */
static void
flush_after_state_base_change(struct crocus_batch *batch)
{
   /* After re-setting the surface state base address, we have to do some
    * cache flushing so that the sampler engine will pick up the new
    * SURFACE_STATE objects and binding tables. From the Broadwell PRM,
    * Shared Function > 3D Sampler > State > State Caching (page 96):
    *
    *    Coherency with system memory in the state cache, like the texture
    *    cache is handled partially by software. It is expected that the
    *    command stream or shader will issue Cache Flush operation or
    *    Cache_Flush sampler message to ensure that the L1 cache remains
    *    coherent with system memory.
    *
    *    [...]
    *
    *    Whenever the value of the Dynamic_State_Base_Addr,
    *    Surface_State_Base_Addr are altered, the L1 state cache must be
    *    invalidated to ensure the new surface or sampler state is fetched
    *    from system memory.
    *
    * The PIPE_CONTROL command has a "State Cache Invalidation Enable" bit
    * which, according the PIPE_CONTROL instruction documentation in the
    * Broadwell PRM:
    *
    *    Setting this bit is independent of any other bit in this packet.
    *    This bit controls the invalidation of the L1 and L2 state caches
    *    at the top of the pipe i.e. at the parsing time.
    *
    * Unfortunately, experimentation seems to indicate that state cache
    * invalidation through a PIPE_CONTROL does nothing whatsoever in
    * regards to surface state and binding tables.  Instead, it seems that
    * invalidating the texture cache is what is actually needed.
    *
    * XXX:  As far as we have been able to determine through
    * experimentation, flushing the texture cache appears to be
    * sufficient.  The theory here is that all of the sampling/rendering
    * units cache the binding table in the texture cache.  However, we have
    * yet to be able to actually confirm this.
    */
#if GFX_VER >= 6
   crocus_emit_end_of_pipe_sync(batch,
                                "change STATE_BASE_ADDRESS (invalidates)",
                                PIPE_CONTROL_INSTRUCTION_INVALIDATE |
                                PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
                                PIPE_CONTROL_CONST_CACHE_INVALIDATE |
                                PIPE_CONTROL_STATE_CACHE_INVALIDATE);
#endif
}
488 
489 #if GFX_VER >= 6
/**
 * Store a 32-bit MMIO register to a buffer via MI_STORE_REGISTER_MEM.
 *
 * \param predicated conditional execution of the store; only supported on
 *        Haswell (GFX_VERx10 >= 75) — asserts otherwise.
 */
static void
crocus_store_register_mem32(struct crocus_batch *batch, uint32_t reg,
                            struct crocus_bo *bo, uint32_t offset,
                            bool predicated)
{
   crocus_emit_cmd(batch, GENX(MI_STORE_REGISTER_MEM), srm) {
      srm.RegisterAddress = reg;
      srm.MemoryAddress = ggtt_bo(bo, offset);
#if GFX_VERx10 >= 75
      srm.PredicateEnable = predicated;
#else
      if (predicated)
         unreachable("unsupported predication");
#endif
   }
}
506 
/**
 * Store a 64-bit MMIO register pair to a buffer as two consecutive
 * 32-bit MI_STORE_REGISTER_MEM commands (low DWord first).
 */
static void
crocus_store_register_mem64(struct crocus_batch *batch, uint32_t reg,
                            struct crocus_bo *bo, uint32_t offset,
                            bool predicated)
{
   for (uint32_t half = 0; half < 8; half += 4) {
      crocus_store_register_mem32(batch, reg + half, bo, offset + half,
                                  predicated);
   }
}
515 #endif
516 
517 #if GFX_VER >= 7
/**
 * Load an immediate 32-bit value into an MMIO register via
 * MI_LOAD_REGISTER_IMM.
 */
static void
_crocus_emit_lri(struct crocus_batch *batch, uint32_t reg, uint32_t val)
{
   crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
      lri.RegisterOffset = reg;
      lri.DataDWord      = val;
   }
}
/* Convenience wrapper taking a genxml register name instead of an address. */
#define crocus_emit_lri(b, r, v) _crocus_emit_lri(b, GENX(r##_num), v)
527 
528 #if GFX_VERx10 >= 75
/**
 * Copy one 32-bit MMIO register into another via MI_LOAD_REGISTER_REG
 * (Haswell+ only).
 */
static void
_crocus_emit_lrr(struct crocus_batch *batch, uint32_t dst, uint32_t src)
{
   crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_REG), lrr) {
      lrr.SourceRegisterAddress = src;
      lrr.DestinationRegisterAddress = dst;
   }
}
537 
/* Copy a 32-bit register: thin wrapper around _crocus_emit_lrr(). */
static void
crocus_load_register_reg32(struct crocus_batch *batch, uint32_t dst,
                           uint32_t src)
{
   _crocus_emit_lrr(batch, dst, src);
}
544 
/* Copy a 64-bit register pair as two 32-bit halves (low DWord first). */
static void
crocus_load_register_reg64(struct crocus_batch *batch, uint32_t dst,
                           uint32_t src)
{
   _crocus_emit_lrr(batch, dst, src);
   _crocus_emit_lrr(batch, dst + 4, src + 4);
}
552 #endif
553 
/* Load an immediate into a 32-bit register: wrapper around _crocus_emit_lri(). */
static void
crocus_load_register_imm32(struct crocus_batch *batch, uint32_t reg,
                           uint32_t val)
{
   _crocus_emit_lri(batch, reg, val);
}
560 
/**
 * Load a 64-bit immediate into a register pair via two
 * MI_LOAD_REGISTER_IMM commands (low DWord first).
 */
static void
crocus_load_register_imm64(struct crocus_batch *batch, uint32_t reg,
                           uint64_t val)
{
   _crocus_emit_lri(batch, reg, (uint32_t)val);
   _crocus_emit_lri(batch, reg + 4, (uint32_t)(val >> 32));
}
568 
/**
 * Emit MI_LOAD_REGISTER_MEM to load a 32-bit MMIO register from a buffer.
 *
 * \param offset byte offset into \p bo of the DWord to load.
 */
static void
crocus_load_register_mem32(struct crocus_batch *batch, uint32_t reg,
                           struct crocus_bo *bo, uint32_t offset)
{
   crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
      lrm.RegisterAddress = reg;
      lrm.MemoryAddress = ro_bo(bo, offset);
   }
}
581 
/**
 * Load a 64-bit value from a buffer into a MMIO register via
 * two MI_LOAD_REGISTER_MEM commands (low DWord first).
 */
static void
crocus_load_register_mem64(struct crocus_batch *batch, uint32_t reg,
                           struct crocus_bo *bo, uint32_t offset)
{
   crocus_load_register_mem32(batch, reg + 0, bo, offset + 0);
   crocus_load_register_mem32(batch, reg + 4, bo, offset + 4);
}
593 
594 #if GFX_VERx10 >= 75
/**
 * Write an immediate 32-bit value into a buffer via MI_STORE_DATA_IMM.
 */
static void
crocus_store_data_imm32(struct crocus_batch *batch,
                        struct crocus_bo *bo, uint32_t offset,
                        uint32_t imm)
{
   crocus_emit_cmd(batch, GENX(MI_STORE_DATA_IMM), sdi) {
      sdi.Address = rw_bo(bo, offset);
#if GFX_VER >= 6
      sdi.ImmediateData = imm;
#endif
   }
}
607 
/**
 * Write an immediate 64-bit value into a buffer via MI_STORE_DATA_IMM,
 * packing the command by hand to get the longer (QWord) form.
 */
static void
crocus_store_data_imm64(struct crocus_batch *batch,
                        struct crocus_bo *bo, uint32_t offset,
                        uint64_t imm)
{
   /* Can't use crocus_emit_cmd because MI_STORE_DATA_IMM has a length of
    * 2 in genxml but it's actually variable length and we need 5 DWords.
    */
   void *map = crocus_get_command_space(batch, 4 * 5);
   _crocus_pack_command(batch, GENX(MI_STORE_DATA_IMM), map, sdi) {
      /* DWordLength is encoded as total length minus 2, per MI convention. */
      sdi.DWordLength = 5 - 2;
      sdi.Address = rw_bo(bo, offset);
#if GFX_VER >= 6
      sdi.ImmediateData = imm;
#endif
   }
}
625 #endif
626 
/**
 * Copy \p bytes (a multiple of 4) between two buffers by bouncing each
 * DWord through a scratch MMIO register, using paired
 * MI_LOAD_REGISTER_MEM / MI_STORE_REGISTER_MEM commands.
 */
static void
crocus_copy_mem_mem(struct crocus_batch *batch,
                    struct crocus_bo *dst_bo, uint32_t dst_offset,
                    struct crocus_bo *src_bo, uint32_t src_offset,
                    unsigned bytes)
{
   assert(bytes % 4 == 0);
   assert(dst_offset % 4 == 0);
   assert(src_offset % 4 == 0);

   /* Borrow a register that's safe to clobber between draws. */
#define CROCUS_TEMP_REG 0x2440 /* GEN7_3DPRIM_BASE_VERTEX */
   for (unsigned i = 0; i < bytes; i += 4) {
      crocus_load_register_mem32(batch, CROCUS_TEMP_REG,
                                 src_bo, src_offset + i);
      crocus_store_register_mem32(batch, CROCUS_TEMP_REG,
                                  dst_bo, dst_offset + i, false);
   }
}
645 #endif
646 
/**
 * Gallium CSO for rasterizer state.
 *
 * The uint32_t arrays hold pre-packed 3DSTATE command DWords, built at
 * create time and merged/copied into the batch at draw time (see the
 * file header comment).
 */
struct crocus_rasterizer_state {
   struct pipe_rasterizer_state cso; /* copy of the Gallium state */
#if GFX_VER >= 6
   uint32_t sf[GENX(3DSTATE_SF_length)];
   uint32_t clip[GENX(3DSTATE_CLIP_length)];
#endif
#if GFX_VER >= 8
   uint32_t raster[GENX(3DSTATE_RASTER_length)];
#endif
   uint32_t line_stipple[GENX(3DSTATE_LINE_STIPPLE_length)];

   /* NOTE(review): presumably the number of user clip plane constants the
    * shaders need for lowered clipping — confirm against the users. */
   uint8_t num_clip_plane_consts;
   /* True when the polygon fill mode rasterizes points or lines. */
   bool fill_mode_point_or_line;
};
664 
#if GFX_VER <= 5
/* Indices into the limits[] table below — one per fixed-function URB
 * section, in the order they are laid out in the URB. */
#define URB_VS 0
#define URB_GS 1
#define URB_CLP 2
#define URB_SF 3
#define URB_CS 4

/* Per-section URB allocation limits: minimum/preferred entry counts and
 * entry-size bounds.
 * NOTE(review): entry sizes appear to be in URB rows rather than bytes —
 * confirm against the hardware documentation.
 */
static const struct {
   uint32_t min_nr_entries;       /* smallest entry count that still works */
   uint32_t preferred_nr_entries; /* entry count we aim for */
   uint32_t min_entry_size;
   uint32_t  max_entry_size;
} limits[URB_CS+1] = {
   { 16, 32, 1, 5 },                        /* vs */
   { 4, 8,  1, 5 },                        /* gs */
   { 5, 10,  1, 5 },                        /* clp */
   { 1, 8,  1, 12 },                        /* sf */
   { 1, 4,  1, 32 }                        /* cs */
};
684 
check_urb_layout(struct crocus_context * ice)685 static bool check_urb_layout(struct crocus_context *ice)
686 {
687    ice->urb.vs_start = 0;
688    ice->urb.gs_start = ice->urb.nr_vs_entries * ice->urb.vsize;
689    ice->urb.clip_start = ice->urb.gs_start + ice->urb.nr_gs_entries * ice->urb.vsize;
690    ice->urb.sf_start = ice->urb.clip_start + ice->urb.nr_clip_entries * ice->urb.vsize;
691    ice->urb.cs_start = ice->urb.sf_start + ice->urb.nr_sf_entries * ice->urb.sfsize;
692 
693    return ice->urb.cs_start + ice->urb.nr_cs_entries *
694       ice->urb.csize <= ice->urb.size;
695 }
696 
697 
/**
 * Recalculate the gen4/5 URB fence layout if the requested per-entry sizes
 * no longer fit the current allocation.
 *
 * \param csize   required constant (CS) entry size
 * \param vsize   required vertex (VS/GS/CLIP) entry size
 * \param sfsize  required SF entry size
 *
 * Returns true if the layout changed (the caller must re-emit URB_FENCE),
 * false if the existing allocation is still adequate.
 */
static bool
crocus_calculate_urb_fence(struct crocus_batch *batch, unsigned csize,
                           unsigned vsize, unsigned sfsize)
{
   struct crocus_context *ice = batch->ice;
   /* Clamp each requested size up to the hardware's minimum entry size. */
   if (csize < limits[URB_CS].min_entry_size)
      csize = limits[URB_CS].min_entry_size;

   if (vsize < limits[URB_VS].min_entry_size)
      vsize = limits[URB_VS].min_entry_size;

   if (sfsize < limits[URB_SF].min_entry_size)
      sfsize = limits[URB_SF].min_entry_size;

   /* Recalculate when any entry size grew, or -- if we're currently in
    * constrained mode -- when any size shrank, which may let us escape
    * constrained mode.
    */
   if (ice->urb.vsize < vsize ||
       ice->urb.sfsize < sfsize ||
       ice->urb.csize < csize ||
       (ice->urb.constrained && (ice->urb.vsize > vsize ||
                                 ice->urb.sfsize > sfsize ||
                                 ice->urb.csize > csize))) {


      ice->urb.csize = csize;
      ice->urb.sfsize = sfsize;
      ice->urb.vsize = vsize;

      /* Start optimistically from the preferred entry counts. */
      ice->urb.nr_vs_entries = limits[URB_VS].preferred_nr_entries;
      ice->urb.nr_gs_entries = limits[URB_GS].preferred_nr_entries;
      ice->urb.nr_clip_entries = limits[URB_CLP].preferred_nr_entries;
      ice->urb.nr_sf_entries = limits[URB_SF].preferred_nr_entries;
      ice->urb.nr_cs_entries = limits[URB_CS].preferred_nr_entries;

      ice->urb.constrained = 0;

      if (GFX_VER == 5) {
         /* Gen5: try even larger VS/SF allocations first. */
         ice->urb.nr_vs_entries = 128;
         ice->urb.nr_sf_entries = 48;
         if (check_urb_layout(ice)) {
            goto done;
         } else {
            /* Didn't fit; drop back to the preferred counts. */
            ice->urb.constrained = 1;
            ice->urb.nr_vs_entries = limits[URB_VS].preferred_nr_entries;
            ice->urb.nr_sf_entries = limits[URB_SF].preferred_nr_entries;
         }
      } else if (GFX_VERx10 == 45) {
         /* G45: try a larger VS allocation first. */
         ice->urb.nr_vs_entries = 64;
         if (check_urb_layout(ice)) {
            goto done;
         } else {
            ice->urb.constrained = 1;
            ice->urb.nr_vs_entries = limits[URB_VS].preferred_nr_entries;
         }
      }

      if (!check_urb_layout(ice)) {
         /* The preferred counts don't fit; fall back to the minimums. */
         ice->urb.nr_vs_entries = limits[URB_VS].min_nr_entries;
         ice->urb.nr_gs_entries = limits[URB_GS].min_nr_entries;
         ice->urb.nr_clip_entries = limits[URB_CLP].min_nr_entries;
         ice->urb.nr_sf_entries = limits[URB_SF].min_nr_entries;
         ice->urb.nr_cs_entries = limits[URB_CS].min_nr_entries;

         /* Mark us as operating with constrained nr_entries, so that next
          * time we recalculate we'll resize the fences in the hope of
          * escaping constrained mode and getting back to normal performance.
          */
         ice->urb.constrained = 1;

         if (!check_urb_layout(ice)) {
            /* This is impossible, given the maximal sizes of urb
             * entries and the values for minimum nr of entries
             * provided above.
             */
            fprintf(stderr, "couldn't calculate URB layout!\n");
            exit(1);
         }

         if (INTEL_DEBUG(DEBUG_URB|DEBUG_PERF))
            fprintf(stderr, "URB CONSTRAINED\n");
      }

done:
      if (INTEL_DEBUG(DEBUG_URB))
         fprintf(stderr,
                 "URB fence: %d ..VS.. %d ..GS.. %d ..CLP.. %d ..SF.. %d ..CS.. %d\n",
                 ice->urb.vs_start,
                 ice->urb.gs_start,
                 ice->urb.clip_start,
                 ice->urb.sf_start,
                 ice->urb.cs_start,
                 ice->urb.size);
      return true;
   }
   return false;
}
792 
793 static void
crocus_upload_urb_fence(struct crocus_batch * batch)794 crocus_upload_urb_fence(struct crocus_batch *batch)
795 {
796    uint32_t urb_fence[3];
797    _crocus_pack_command(batch, GENX(URB_FENCE), urb_fence, urb) {
798       urb.VSUnitURBReallocationRequest = 1;
799       urb.GSUnitURBReallocationRequest = 1;
800       urb.CLIPUnitURBReallocationRequest = 1;
801       urb.SFUnitURBReallocationRequest = 1;
802       urb.VFEUnitURBReallocationRequest = 1;
803       urb.CSUnitURBReallocationRequest = 1;
804 
805       urb.VSFence = batch->ice->urb.gs_start;
806       urb.GSFence = batch->ice->urb.clip_start;
807       urb.CLIPFence = batch->ice->urb.sf_start;
808       urb.SFFence = batch->ice->urb.cs_start;
809       urb.CSFence = batch->ice->urb.size;
810    }
811 
812    /* erratum: URB_FENCE must not cross a 64byte cacheline */
813    if ((crocus_batch_bytes_used(batch) & 15) > 12) {
814       int pad = 16 - (crocus_batch_bytes_used(batch) & 15);
815       do {
816          *(uint32_t *)batch->command.map_next = 0;
817          batch->command.map_next += sizeof(uint32_t);
818       } while (--pad);
819    }
820 
821    crocus_batch_emit(batch, urb_fence, sizeof(uint32_t) * 3);
822 }
823 
/**
 * Compute the layout of the CURBE (constant URB entry) space, partitioned
 * into fragment (wm), clip-plane, and vertex (vs) sections, all measured
 * in 512-bit units.
 *
 * Returns true if the layout changed and the CURBE must be re-uploaded.
 */
static bool
calculate_curbe_offsets(struct crocus_batch *batch)
{
   struct crocus_context *ice = batch->ice;

   unsigned nr_fp_regs, nr_vp_regs, nr_clip_regs = 0;
   unsigned total_regs;

   /* Sum the fragment shader's push-constant UBO ranges. */
   nr_fp_regs = 0;
   for (int i = 0; i < 4; i++) {
      const struct elk_ubo_range *range = &ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data->ubo_ranges[i];
      if (range->length == 0)
         continue;

      /* ubo range tracks at 256-bit, we need 512-bit */
      nr_fp_regs += (range->length + 1) / 2;
   }

   /* Six fixed frustum planes plus one per enabled user clip plane, four
    * floats each, rounded up to whole 512-bit (16-float) units.
    */
   if (ice->state.cso_rast->cso.clip_plane_enable) {
      unsigned nr_planes = 6 + util_bitcount(ice->state.cso_rast->cso.clip_plane_enable);
      nr_clip_regs = (nr_planes * 4 + 15) / 16;
   }

   /* Sum the vertex shader's push-constant UBO ranges. */
   nr_vp_regs = 0;
   for (int i = 0; i < 4; i++) {
      const struct elk_ubo_range *range = &ice->shaders.prog[MESA_SHADER_VERTEX]->prog_data->ubo_ranges[i];
      if (range->length == 0)
         continue;

      /* ubo range tracks at 256-bit, we need 512-bit */
      nr_vp_regs += (range->length + 1) / 2;
   }
   if (nr_vp_regs == 0) {
      /* The pre-gen6 VS requires that some push constants get loaded no
       * matter what, or the GPU would hang.
       */
      nr_vp_regs = 1;
   }
   total_regs = nr_fp_regs + nr_vp_regs + nr_clip_regs;

   /* The CURBE allocation size is limited to 32 512-bit units (128 EU
    * registers, or 1024 floats).  See CS_URB_STATE in the gen4 or gen5
    * (volume 1, part 1) PRMs.
    *
    * Note that in elk_fs.cpp we're only loading up to 16 EU registers of
    * values as push constants before spilling to pull constants, and in
    * elk_vec4.cpp we're loading up to 32 registers of push constants.  An EU
    * register is 1/2 of one of these URB entry units, so that leaves us 16 EU
    * regs for clip.
    */
   assert(total_regs <= 32);

   /* Lazy resize: only relayout when a section grew, the clip section
    * changed, or the total shrank far enough (to under a quarter of a
    * sufficiently large allocation) to be worth reclaiming.
    */
   if (nr_fp_regs > ice->curbe.wm_size ||
       nr_vp_regs > ice->curbe.vs_size ||
       nr_clip_regs != ice->curbe.clip_size ||
       (total_regs < ice->curbe.total_size / 4 &&
        ice->curbe.total_size > 16)) {

      GLuint reg = 0;

      /* Calculate a new layout: wm first, then clip, then vs. */
      reg = 0;
      ice->curbe.wm_start = reg;
      ice->curbe.wm_size = nr_fp_regs; reg += nr_fp_regs;
      ice->curbe.clip_start = reg;
      ice->curbe.clip_size = nr_clip_regs; reg += nr_clip_regs;
      ice->curbe.vs_start = reg;
      ice->curbe.vs_size = nr_vp_regs; reg += nr_vp_regs;
      ice->curbe.total_size = reg;

      /* Debug dump of the chosen layout (normally disabled). */
      if (0)
         fprintf(stderr, "curbe wm %d+%d clip %d+%d vs %d+%d\n",
                 ice->curbe.wm_start,
                 ice->curbe.wm_size,
                 ice->curbe.clip_start,
                 ice->curbe.clip_size,
                 ice->curbe.vs_start,
                 ice->curbe.vs_size );
      return true;
   }
   return false;
}
909 
910 static void
upload_shader_consts(struct crocus_context * ice,gl_shader_stage stage,uint32_t * map,unsigned start)911 upload_shader_consts(struct crocus_context *ice,
912                      gl_shader_stage stage,
913                      uint32_t *map,
914                      unsigned start)
915 {
916    struct crocus_compiled_shader *shader = ice->shaders.prog[stage];
917    struct elk_stage_prog_data *prog_data = (void *) shader->prog_data;
918    uint32_t *cmap;
919    bool found = false;
920    unsigned offset = start * 16;
921    int total = 0;
922    for (int i = 0; i < 4; i++) {
923       const struct elk_ubo_range *range = &prog_data->ubo_ranges[i];
924 
925       if (range->length == 0)
926          continue;
927 
928       unsigned block_index = crocus_bti_to_group_index(
929          &shader->bt, CROCUS_SURFACE_GROUP_UBO, range->block);
930       unsigned len = range->length * 8 * sizeof(float);
931       unsigned start = range->start * 8 * sizeof(float);
932       struct pipe_transfer *transfer;
933 
934       cmap = pipe_buffer_map_range(&ice->ctx, ice->state.shaders[stage].constbufs[block_index].buffer,
935                                    ice->state.shaders[stage].constbufs[block_index].buffer_offset + start, len,
936                                    PIPE_MAP_READ | PIPE_MAP_UNSYNCHRONIZED, &transfer);
937       if (cmap)
938          memcpy(&map[offset + (total * 8)], cmap, len);
939       pipe_buffer_unmap(&ice->ctx, transfer);
940       total += range->length;
941       found = true;
942    }
943 
944    if (stage == MESA_SHADER_VERTEX && !found) {
945       /* The pre-gen6 VS requires that some push constants get loaded no
946        * matter what, or the GPU would hang.
947        */
948       unsigned len = 16;
949       memset(&map[offset], 0, len);
950    }
951 }
952 
/* Plane equations for the six fixed frustum clip planes; these are written
 * into the CURBE ahead of any user-defined clip planes (see
 * gen4_upload_curbe).
 */
static const float fixed_plane[6][4] = {
   { 0,    0,   -1, 1 },
   { 0,    0,    1, 1 },
   { 0,   -1,    0, 1 },
   { 0,    1,    0, 1 },
   {-1,    0,    0, 1 },
   { 1,    0,    0, 1 }
};
961 
/**
 * Upload the CURBE buffer (gen4/5 push constants) and emit CONSTANT_BUFFER
 * pointing the hardware at it.
 *
 * The buffer follows the layout chosen by calculate_curbe_offsets():
 * fragment shader constants, then clip planes, then vertex shader
 * constants, each section sized in 512-bit (16-float) units.
 */
static void
gen4_upload_curbe(struct crocus_batch *batch)
{
   struct crocus_context *ice = batch->ice;
   const unsigned sz = ice->curbe.total_size;
   const unsigned buf_sz = sz * 16 * sizeof(float);

   /* Nothing to upload -- still (re-)emit CONSTANT_BUFFER below. */
   if (sz == 0)
      goto emit;

   uint32_t *map;
   u_upload_alloc(ice->ctx.const_uploader, 0, buf_sz, 64,
                  &ice->curbe.curbe_offset, (struct pipe_resource **)&ice->curbe.curbe_res, (void **) &map);

   /* fragment shader constants */
   if (ice->curbe.wm_size) {
      upload_shader_consts(ice, MESA_SHADER_FRAGMENT, map, ice->curbe.wm_start);
   }

   /* clipper constants */
   if (ice->curbe.clip_size) {
      unsigned offset = ice->curbe.clip_start * 16;
      float *fmap = (float *)map;
      unsigned i;
      /* If any planes are going this way, send them all this way:
       */
      for (i = 0; i < 6; i++) {
         fmap[offset + i * 4 + 0] = fixed_plane[i][0];
         fmap[offset + i * 4 + 1] = fixed_plane[i][1];
         fmap[offset + i * 4 + 2] = fixed_plane[i][2];
         fmap[offset + i * 4 + 3] = fixed_plane[i][3];
      }

      /* Append the enabled user clip planes after the six fixed ones. */
      unsigned mask = ice->state.cso_rast->cso.clip_plane_enable;
      struct pipe_clip_state *cp = &ice->state.clip_planes;
      while (mask) {
         const int j = u_bit_scan(&mask);
         fmap[offset + i * 4 + 0] = cp->ucp[j][0];
         fmap[offset + i * 4 + 1] = cp->ucp[j][1];
         fmap[offset + i * 4 + 2] = cp->ucp[j][2];
         fmap[offset + i * 4 + 3] = cp->ucp[j][3];
         i++;
      }
   }

   /* vertex shader constants */
   if (ice->curbe.vs_size) {
      upload_shader_consts(ice, MESA_SHADER_VERTEX, map, ice->curbe.vs_start);
   }
   /* Debug dump of the uploaded constants (normally disabled). */
   if (0) {
      for (int i = 0; i < sz*16; i+=4) {
         float *f = (float *)map;
         fprintf(stderr, "curbe %d.%d: %f %f %f %f\n", i/8, i&4,
                 f[i+0], f[i+1], f[i+2], f[i+3]);
      }
   }

emit:
   /* Point the hardware at the new buffer (or emit an invalid/empty
    * CONSTANT_BUFFER if nothing has ever been uploaded).
    */
   crocus_emit_cmd(batch, GENX(CONSTANT_BUFFER), cb) {
      if (ice->curbe.curbe_res) {
         cb.BufferLength = ice->curbe.total_size - 1;
         cb.Valid = 1;
         cb.BufferStartingAddress = ro_bo(ice->curbe.curbe_res->bo, ice->curbe.curbe_offset);
      }
   }

#if GFX_VER == 4 && GFX_VERx10 != 45
   /* Work around a Broadwater/Crestline depth interpolator bug.  The
    * following sequence will cause GPU hangs:
    *
    * 1. Change state so that all depth related fields in CC_STATE are
    *    disabled, and in WM_STATE, only "PS Use Source Depth" is enabled.
    * 2. Emit a CONSTANT_BUFFER packet.
    * 3. Draw via 3DPRIMITIVE.
    *
    * The recommended workaround is to emit a non-pipelined state change after
    * emitting CONSTANT_BUFFER, in order to drain the windowizer pipeline.
    *
    * We arbitrarily choose 3DSTATE_GLOBAL_DEPTH_CLAMP_OFFSET (as it's small),
    * and always emit it when "PS Use Source Depth" is set.  We could be more
    * precise, but the additional complexity is probably not worth it.
    *
    */
   const struct shader_info *fs_info =
      crocus_get_shader_info(ice, MESA_SHADER_FRAGMENT);

   if (BITSET_TEST(fs_info->system_values_read, SYSTEM_VALUE_FRAG_COORD)) {
      ice->state.global_depth_offset_clamp = 0;
      crocus_emit_cmd(batch, GENX(3DSTATE_GLOBAL_DEPTH_OFFSET_CLAMP), clamp);
   }
#endif
}
1054 #endif
1055 
1056 #if GFX_VER >= 7
1057 
1058 #define IVB_L3SQCREG1_SQGHPCI_DEFAULT     0x00730000
1059 #define VLV_L3SQCREG1_SQGHPCI_DEFAULT     0x00d30000
1060 #define HSW_L3SQCREG1_SQGHPCI_DEFAULT     0x00610000
1061 
/**
 * Program the L3 cache partitioning described by \p cfg.
 *
 * Gen7 programs the L3SQCREG1/L3CNTLREG2/L3CNTLREG3 register trio; gen8
 * uses the single L3CNTLREG.  The pipeline is drained and caches are
 * invalidated around the register writes, as the hardware requires.
 */
static void
setup_l3_config(struct crocus_batch *batch, const struct intel_l3_config *cfg)
{
#if GFX_VER == 7
   const struct intel_device_info *devinfo = &batch->screen->devinfo;
   /* Which L3 clients receive a partition under this configuration. */
   const bool has_dc = cfg->n[INTEL_L3P_DC] || cfg->n[INTEL_L3P_ALL];
   const bool has_is = cfg->n[INTEL_L3P_IS] || cfg->n[INTEL_L3P_RO] ||
                       cfg->n[INTEL_L3P_ALL];
   const bool has_c = cfg->n[INTEL_L3P_C] || cfg->n[INTEL_L3P_RO] ||
                      cfg->n[INTEL_L3P_ALL];
   const bool has_t = cfg->n[INTEL_L3P_T] || cfg->n[INTEL_L3P_RO] ||
                      cfg->n[INTEL_L3P_ALL];
   const bool has_slm = cfg->n[INTEL_L3P_SLM];
#endif

   /* According to the hardware docs, the L3 partitioning can only be changed
    * while the pipeline is completely drained and the caches are flushed,
    * which involves a first PIPE_CONTROL flush which stalls the pipeline...
    */
   crocus_emit_pipe_control_flush(batch, "l3_config",
                                  PIPE_CONTROL_DATA_CACHE_FLUSH |
                                  PIPE_CONTROL_CS_STALL);

   /* ...followed by a second pipelined PIPE_CONTROL that initiates
    * invalidation of the relevant caches.  Note that because RO invalidation
    * happens at the top of the pipeline (i.e. right away as the PIPE_CONTROL
    * command is processed by the CS) we cannot combine it with the previous
    * stalling flush as the hardware documentation suggests, because that
    * would cause the CS to stall on previous rendering *after* RO
    * invalidation and wouldn't prevent the RO caches from being polluted by
    * concurrent rendering before the stall completes.  This intentionally
    * doesn't implement the SKL+ hardware workaround suggesting to enable CS
    * stall on PIPE_CONTROLs with the texture cache invalidation bit set for
    * GPGPU workloads because the previous and subsequent PIPE_CONTROLs
    * already guarantee that there is no concurrent GPGPU kernel execution
    * (see SKL HSD 2132585).
    */
   crocus_emit_pipe_control_flush(batch, "l3 config",
                                  PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
                                  PIPE_CONTROL_CONST_CACHE_INVALIDATE |
                                  PIPE_CONTROL_INSTRUCTION_INVALIDATE |
                                  PIPE_CONTROL_STATE_CACHE_INVALIDATE);

   /* Now send a third stalling flush to make sure that invalidation is
    * complete when the L3 configuration registers are modified.
    */
   crocus_emit_pipe_control_flush(batch, "l3 config",
                                  PIPE_CONTROL_DATA_CACHE_FLUSH |
                                  PIPE_CONTROL_CS_STALL);

#if GFX_VER == 8
   assert(!cfg->n[INTEL_L3P_IS] && !cfg->n[INTEL_L3P_C] && !cfg->n[INTEL_L3P_T]);
   crocus_emit_reg(batch, GENX(L3CNTLREG), reg) {
      reg.SLMEnable = cfg->n[INTEL_L3P_SLM] > 0;
      reg.URBAllocation = cfg->n[INTEL_L3P_URB];
      reg.ROAllocation = cfg->n[INTEL_L3P_RO];
      reg.DCAllocation = cfg->n[INTEL_L3P_DC];
      reg.AllAllocation = cfg->n[INTEL_L3P_ALL];
   }
#else
   assert(!cfg->n[INTEL_L3P_ALL]);

   /* When enabled SLM only uses a portion of the L3 on half of the banks,
    * the matching space on the remaining banks has to be allocated to a
    * client (URB for all validated configurations) set to the
    * lower-bandwidth 2-bank address hashing mode.
    */
   const bool urb_low_bw = has_slm && devinfo->platform != INTEL_PLATFORM_BYT;
   assert(!urb_low_bw || cfg->n[INTEL_L3P_URB] == cfg->n[INTEL_L3P_SLM]);

   /* Minimum number of ways that can be allocated to the URB. */
   const unsigned n0_urb = (devinfo->platform == INTEL_PLATFORM_BYT ? 32 : 0);
   assert(cfg->n[INTEL_L3P_URB] >= n0_urb);

   uint32_t l3sqcr1, l3cr2, l3cr3;

   /* Convert each unallocated client's space to UC, and program the SQ
    * credit defaults (Baytrail uses its own default).
    */
   crocus_pack_state(GENX(L3SQCREG1), &l3sqcr1, reg) {
      reg.ConvertDC_UC = !has_dc;
      reg.ConvertIS_UC = !has_is;
      reg.ConvertC_UC = !has_c;
      reg.ConvertT_UC = !has_t;
#if GFX_VERx10 == 75
      reg.L3SQGeneralPriorityCreditInitialization = SQGPCI_DEFAULT;
#else
      reg.L3SQGeneralPriorityCreditInitialization =
         devinfo->platform == INTEL_PLATFORM_BYT ? BYT_SQGPCI_DEFAULT : SQGPCI_DEFAULT;
#endif
      reg.L3SQHighPriorityCreditInitialization = SQHPCI_DEFAULT;
   };

   crocus_pack_state(GENX(L3CNTLREG2), &l3cr2, reg) {
      reg.SLMEnable = has_slm;
      reg.URBLowBandwidth = urb_low_bw;
      reg.URBAllocation = cfg->n[INTEL_L3P_URB] - n0_urb;
#if !(GFX_VERx10 == 75)
      reg.ALLAllocation = cfg->n[INTEL_L3P_ALL];
#endif
      reg.ROAllocation = cfg->n[INTEL_L3P_RO];
      reg.DCAllocation = cfg->n[INTEL_L3P_DC];
   };

   crocus_pack_state(GENX(L3CNTLREG3), &l3cr3, reg) {
      reg.ISAllocation = cfg->n[INTEL_L3P_IS];
      reg.ISLowBandwidth = 0;
      reg.CAllocation = cfg->n[INTEL_L3P_C];
      reg.CLowBandwidth = 0;
      reg.TAllocation = cfg->n[INTEL_L3P_T];
      reg.TLowBandwidth = 0;
   };

   /* Set up the L3 partitioning. */
   crocus_emit_lri(batch, L3SQCREG1, l3sqcr1);
   crocus_emit_lri(batch, L3CNTLREG2, l3cr2);
   crocus_emit_lri(batch, L3CNTLREG3, l3cr3);

#if GFX_VERx10 == 75
   /* TODO: Fail screen creation if command parser version < 4 */
   uint32_t scratch1, chicken3;
   /* Disable L3 atomics when the DC partition is absent (HSW only). */
   crocus_pack_state(GENX(SCRATCH1), &scratch1, reg) {
      reg.L3AtomicDisable = !has_dc;
   }
   crocus_pack_state(GENX(CHICKEN3), &chicken3, reg) {
      reg.L3AtomicDisableMask = true;
      reg.L3AtomicDisable = !has_dc;
   }
   crocus_emit_lri(batch, SCRATCH1, scratch1);
   crocus_emit_lri(batch, CHICKEN3, chicken3);
#endif
#endif
}
1192 
1193 static void
emit_l3_state(struct crocus_batch * batch,bool compute)1194 emit_l3_state(struct crocus_batch *batch, bool compute)
1195 {
1196    const struct intel_l3_config *const cfg =
1197       compute ? batch->screen->l3_config_cs : batch->screen->l3_config_3d;
1198 
1199    setup_l3_config(batch, cfg);
1200    if (INTEL_DEBUG(DEBUG_L3)) {
1201       intel_dump_l3_config(cfg, stderr);
1202    }
1203 }
1204 
1205 /**
1206  * Emit a PIPE_CONTROL command for gen7 with the CS Stall bit set.
1207  */
1208 static void
gen7_emit_cs_stall_flush(struct crocus_batch * batch)1209 gen7_emit_cs_stall_flush(struct crocus_batch *batch)
1210 {
1211    crocus_emit_pipe_control_write(batch,
1212                                   "workaround",
1213                                   PIPE_CONTROL_CS_STALL
1214                                   | PIPE_CONTROL_WRITE_IMMEDIATE,
1215                                   batch->ice->workaround_bo,
1216                                   batch->ice->workaround_offset, 0);
1217 }
1218 #endif
1219 
/**
 * Emit PIPELINE_SELECT to switch between the 3D and GPGPU (compute)
 * pipelines, including the flush workarounds the switch requires on each
 * generation.
 */
static void
emit_pipeline_select(struct crocus_batch *batch, uint32_t pipeline)
{
#if GFX_VER == 8
   /* From the Broadwell PRM, Volume 2a: Instructions, PIPELINE_SELECT:
    *
    *   Software must clear the COLOR_CALC_STATE Valid field in
    *   3DSTATE_CC_STATE_POINTERS command prior to send a PIPELINE_SELECT
    *   with Pipeline Select set to GPGPU.
    *
    * The internal hardware docs recommend the same workaround for Gfx9
    * hardware too.
    */
   if (pipeline == GPGPU)
      crocus_emit_cmd(batch, GENX(3DSTATE_CC_STATE_POINTERS), t);
#endif

#if GFX_VER >= 6
   /* From "BXML » GT » MI » vol1a GPU Overview » [Instruction]
    * PIPELINE_SELECT [DevBWR+]":
    *
    *    "Project: DEVSNB+
    *
    *     Software must ensure all the write caches are flushed through a
    *     stalling PIPE_CONTROL command followed by another PIPE_CONTROL
    *     command to invalidate read only caches prior to programming
    *     MI_PIPELINE_SELECT command to change the Pipeline Select Mode."
    */
   const unsigned dc_flush =
      GFX_VER >= 7 ? PIPE_CONTROL_DATA_CACHE_FLUSH : 0;
   crocus_emit_pipe_control_flush(batch,
                                  "workaround: PIPELINE_SELECT flushes (1/2)",
                                  PIPE_CONTROL_RENDER_TARGET_FLUSH |
                                  PIPE_CONTROL_DEPTH_CACHE_FLUSH |
                                  dc_flush |
                                  PIPE_CONTROL_CS_STALL);

   crocus_emit_pipe_control_flush(batch,
                                  "workaround: PIPELINE_SELECT flushes (2/2)",
                                  PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
                                  PIPE_CONTROL_CONST_CACHE_INVALIDATE |
                                  PIPE_CONTROL_STATE_CACHE_INVALIDATE |
                                  PIPE_CONTROL_INSTRUCTION_INVALIDATE);
#else
   /* From "BXML » GT » MI » vol1a GPU Overview » [Instruction]
    * PIPELINE_SELECT [DevBWR+]":
    *
    *   Project: PRE-DEVSNB
    *
    *   Software must ensure the current pipeline is flushed via an
    *   MI_FLUSH or PIPE_CONTROL prior to the execution of PIPELINE_SELECT.
    */
   crocus_emit_cmd(batch, GENX(MI_FLUSH), foo);
#endif

   crocus_emit_cmd(batch, GENX(PIPELINE_SELECT), sel) {
      sel.PipelineSelection = pipeline;
   }

#if GFX_VER == 7 && !(GFX_VERx10 == 75)
   /* IVB only: stall, then emit a degenerate (zero-vertex) 3DPRIMITIVE
    * after switching back to the 3D pipeline.
    */
   if (pipeline == _3D) {
      gen7_emit_cs_stall_flush(batch);

      crocus_emit_cmd(batch, GENX(3DPRIMITIVE), prim) {
         prim.PrimitiveTopologyType = _3DPRIM_POINTLIST;
      };
   }
#endif
}
1289 
1290 /**
1291  * The following diagram shows how we partition the URB:
1292  *
1293  *        16kB or 32kB               Rest of the URB space
1294  *   __________-__________   _________________-_________________
1295  *  /                     \ /                                   \
1296  * +-------------------------------------------------------------+
1297  * |  VS/HS/DS/GS/FS Push  |           VS/HS/DS/GS URB           |
1298  * |       Constants       |               Entries               |
1299  * +-------------------------------------------------------------+
1300  *
1301  * Notably, push constants must be stored at the beginning of the URB
1302  * space, while entries can be stored anywhere.  Ivybridge and Haswell
1303  * GT1/GT2 have a maximum constant buffer size of 16kB, while Haswell GT3
1304  * doubles this (32kB).
1305  *
1306  * Ivybridge and Haswell GT1/GT2 allow push constants to be located (and
1307  * sized) in increments of 1kB.  Haswell GT3 requires them to be located and
1308  * sized in increments of 2kB.
1309  *
1310  * Currently we split the constant buffer space evenly among whatever stages
1311  * are active.  This is probably not ideal, but simple.
1312  *
1313  * Ivybridge GT1 and Haswell GT1 have 128kB of URB space.
1314  * Ivybridge GT2 and Haswell GT2 have 256kB of URB space.
1315  * Haswell GT3 has 512kB of URB space.
1316  *
1317  * See "Volume 2a: 3D Pipeline," section 1.8, "Volume 1b: Configurations",
1318  * and the documentation for 3DSTATE_PUSH_CONSTANT_ALLOC_xS.
1319  */
1320 #if GFX_VER >= 7
1321 static void
crocus_alloc_push_constants(struct crocus_batch * batch)1322 crocus_alloc_push_constants(struct crocus_batch *batch)
1323 {
1324    const unsigned push_constant_kb =
1325       batch->screen->devinfo.max_constant_urb_size_kb;
1326    unsigned size_per_stage = push_constant_kb / 5;
1327 
1328    /* For now, we set a static partitioning of the push constant area,
1329     * assuming that all stages could be in use.
1330     *
1331     * TODO: Try lazily allocating the HS/DS/GS sections as needed, and
1332     *       see if that improves performance by offering more space to
1333     *       the VS/FS when those aren't in use.  Also, try dynamically
1334     *       enabling/disabling it like i965 does.  This would be more
1335     *       stalls and may not actually help; we don't know yet.
1336     */
1337    for (int i = 0; i <= MESA_SHADER_FRAGMENT; i++) {
1338       crocus_emit_cmd(batch, GENX(3DSTATE_PUSH_CONSTANT_ALLOC_VS), alloc) {
1339          alloc._3DCommandSubOpcode = 18 + i;
1340          alloc.ConstantBufferOffset = size_per_stage * i;
1341          alloc.ConstantBufferSize = i == MESA_SHADER_FRAGMENT ? (push_constant_kb - 4 * size_per_stage) : size_per_stage;
1342       }
1343    }
1344 
1345    /* From p292 of the Ivy Bridge PRM (11.2.4 3DSTATE_PUSH_CONSTANT_ALLOC_PS):
1346     *
1347     *     A PIPE_CONTROL command with the CS Stall bit set must be programmed
1348     *     in the ring after this instruction.
1349     *
1350     * No such restriction exists for Haswell or Baytrail.
1351     */
1352    if (batch->screen->devinfo.platform == INTEL_PLATFORM_IVB)
1353       gen7_emit_cs_stall_flush(batch);
1354 }
1355 #endif
1356 
1357 /**
1358  * Upload the initial GPU state for a render context.
1359  *
1360  * This sets some invariant state that needs to be programmed a particular
1361  * way, but we never actually change.
1362  */
static void
crocus_init_render_context(struct crocus_batch *batch)
{
   UNUSED const struct intel_device_info *devinfo = &batch->screen->devinfo;

   emit_pipeline_select(batch, _3D);

   /* Emit STATE_SIP with default (all-zero) values. */
   crocus_emit_cmd(batch, GENX(STATE_SIP), foo);

#if GFX_VER >= 7
   emit_l3_state(batch, false);
#endif
#if (GFX_VERx10 == 70 || GFX_VERx10 == 80)
   /* Disable INSTPM's CONSTANT_BUFFER address offsetting (the mask bit
    * must be set for the write to take effect).
    */
   crocus_emit_reg(batch, GENX(INSTPM), reg) {
      reg.CONSTANT_BUFFERAddressOffsetDisable = true;
      reg.CONSTANT_BUFFERAddressOffsetDisableMask = true;
   }
#endif
#if GFX_VER >= 5 || GFX_VERx10 == 45
   /* Use the legacy AA line coverage computation. */
   crocus_emit_cmd(batch, GENX(3DSTATE_AA_LINE_PARAMETERS), foo);
#endif

   /* No polygon stippling offsets are necessary. */
   /* TODO: may need to set an offset for origin-UL framebuffers */
   crocus_emit_cmd(batch, GENX(3DSTATE_POLY_STIPPLE_OFFSET), foo);

#if GFX_VER >= 7
   crocus_alloc_push_constants(batch);
#endif

#if GFX_VER == 8
   /* Set the initial MSAA sample positions. */
   crocus_emit_cmd(batch, GENX(3DSTATE_SAMPLE_PATTERN), pat) {
      INTEL_SAMPLE_POS_1X(pat._1xSample);
      INTEL_SAMPLE_POS_2X(pat._2xSample);
      INTEL_SAMPLE_POS_4X(pat._4xSample);
      INTEL_SAMPLE_POS_8X(pat._8xSample);
   }

   /* Disable chromakeying (it's for media) */
   crocus_emit_cmd(batch, GENX(3DSTATE_WM_CHROMAKEY), foo);

   /* We want regular rendering, not special HiZ operations. */
   crocus_emit_cmd(batch, GENX(3DSTATE_WM_HZ_OP), foo);
#endif
}
1410 
1411 #if GFX_VER >= 7
/**
 * Upload the initial GPU state for a compute context: select the GPGPU
 * pipeline and program the L3 partitioning.
 */
static void
crocus_init_compute_context(struct crocus_batch *batch)
{
   UNUSED const struct intel_device_info *devinfo = &batch->screen->devinfo;

   emit_pipeline_select(batch, GPGPU);

#if GFX_VER >= 7
   emit_l3_state(batch, true);
#endif
}
1423 #endif
1424 
1425 /**
1426  * Generation-specific context state (ice->state.genx->...).
1427  *
1428  * Most state can go in crocus_context directly, but these encode hardware
1429  * packets which vary by generation.
1430  */
struct crocus_genx_state {
   struct {
#if GFX_VER >= 7
      /* ISL-computed parameters for each bound shader image (gen7+). */
      struct isl_image_param image_param[PIPE_MAX_SHADER_IMAGES];
#endif
   } shaders[MESA_SHADER_STAGES];

#if GFX_VER == 8
   /* Whether the gen8 PMA fix is currently enabled -- tracked so it is
    * only reprogrammed when the desired state changes.
    */
   bool pma_fix_enabled;
#endif
};
1442 
1443 /**
1444  * The pipe->set_blend_color() driver hook.
1445  *
1446  * This corresponds to our COLOR_CALC_STATE.
1447  */
1448 static void
crocus_set_blend_color(struct pipe_context * ctx,const struct pipe_blend_color * state)1449 crocus_set_blend_color(struct pipe_context *ctx,
1450                        const struct pipe_blend_color *state)
1451 {
1452    struct crocus_context *ice = (struct crocus_context *) ctx;
1453 
1454    /* Our COLOR_CALC_STATE is exactly pipe_blend_color, so just memcpy */
1455    memcpy(&ice->state.blend_color, state, sizeof(struct pipe_blend_color));
1456 #if GFX_VER <= 5
1457    ice->state.dirty |= CROCUS_DIRTY_GEN4_CONSTANT_COLOR;
1458 #else
1459    ice->state.dirty |= CROCUS_DIRTY_COLOR_CALC_STATE;
1460 #endif
1461 }
1462 
1463 /**
1464  * Gallium CSO for blend state (see pipe_blend_state).
1465  */
struct crocus_blend_state {
#if GFX_VER == 8
   /** Partial 3DSTATE_PS_BLEND */
   uint32_t ps_blend[GENX(3DSTATE_PS_BLEND_length)];
#endif

   /** Copy of the gallium blend state (BLEND_STATE), consulted at emit time */
   struct pipe_blend_state cso;

   /** Bitfield of whether blending is enabled for RT[i] - for aux resolves */
   uint8_t blend_enables;

   /** Bitfield of whether color writes are enabled for RT[i] */
   uint8_t color_write_enables;

   /** Does RT[0] use dual color blending? */
   bool dual_color_blending;
};
1484 
1485 static enum pipe_blendfactor
fix_blendfactor(enum pipe_blendfactor f,bool alpha_to_one)1486 fix_blendfactor(enum pipe_blendfactor f, bool alpha_to_one)
1487 {
1488    if (alpha_to_one) {
1489       if (f == PIPE_BLENDFACTOR_SRC1_ALPHA)
1490          return PIPE_BLENDFACTOR_ONE;
1491 
1492       if (f == PIPE_BLENDFACTOR_INV_SRC1_ALPHA)
1493          return PIPE_BLENDFACTOR_ZERO;
1494    }
1495 
1496    return f;
1497 }
1498 
#if GFX_VER >= 6
/* Gen6+ keeps per-RT blend controls in BLEND_STATE entries, while gen4/5
 * keep them in COLOR_CALC_STATE; alias whichever applies so
 * set_blend_entry_bits() can be shared across generations.
 */
typedef struct GENX(BLEND_STATE_ENTRY) BLEND_ENTRY_GENXML;
#else
typedef struct GENX(COLOR_CALC_STATE) BLEND_ENTRY_GENXML;
#endif
1504 
1505 static bool
can_emit_logic_op(struct crocus_context * ice)1506 can_emit_logic_op(struct crocus_context *ice)
1507 {
1508    /* all pre gen8 have logicop restricted to unorm */
1509    enum pipe_format pformat = PIPE_FORMAT_NONE;
1510    for (unsigned i = 0; i < ice->state.framebuffer.nr_cbufs; i++) {
1511       if (ice->state.framebuffer.cbufs[i]) {
1512          pformat = ice->state.framebuffer.cbufs[i]->format;
1513          break;
1514       }
1515    }
1516    return (pformat == PIPE_FORMAT_NONE || util_format_is_unorm(pformat));
1517 }
1518 
/**
 * Fill in the blend fields of one BLEND_STATE entry (or COLOR_CALC_STATE
 * on Gen4/5) for render target \p idx.
 *
 * Returns true if the RGB and alpha channels use different blend
 * functions/factors, so the caller can set IndependentAlphaBlendEnable.
 */
static bool
set_blend_entry_bits(struct crocus_batch *batch, BLEND_ENTRY_GENXML *entry,
                     struct crocus_blend_state *cso_blend,
                     int idx)
{
   struct crocus_context *ice = batch->ice;
   bool independent_alpha_blend = false;
   /* Without independent blend, every RT shares RT[0]'s state. */
   const struct pipe_rt_blend_state *rt =
      &cso_blend->cso.rt[cso_blend->cso.independent_blend_enable ? idx : 0];
   const unsigned blend_enabled = rt->blend_enable;

   /* Rewrite SRC1_ALPHA factors if alpha-to-one forces alpha to 1.0. */
   enum pipe_blendfactor src_rgb =
      fix_blendfactor(rt->rgb_src_factor, cso_blend->cso.alpha_to_one);
   enum pipe_blendfactor src_alpha =
      fix_blendfactor(rt->alpha_src_factor, cso_blend->cso.alpha_to_one);
   enum pipe_blendfactor dst_rgb =
      fix_blendfactor(rt->rgb_dst_factor, cso_blend->cso.alpha_to_one);
   enum pipe_blendfactor dst_alpha =
      fix_blendfactor(rt->alpha_dst_factor, cso_blend->cso.alpha_to_one);

   if (rt->rgb_func != rt->alpha_func ||
       src_rgb != src_alpha || dst_rgb != dst_alpha)
      independent_alpha_blend = true;
   if (cso_blend->cso.logicop_enable) {
      /* Pre-gen8 can only do logic ops on UNORM targets (see
       * can_emit_logic_op); otherwise the logic op is silently dropped.
       */
      if (GFX_VER >= 8 || can_emit_logic_op(ice)) {
         entry->LogicOpEnable = cso_blend->cso.logicop_enable;
         entry->LogicOpFunction = cso_blend->cso.logicop_func;
      }
   } else if (blend_enabled) {
      if (idx == 0) {
         /* RT[0]: only enable blending for dual-source state if the bound
          * FS actually emits a second color source; otherwise the HW would
          * read garbage for SRC1 factors.
          */
         struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_FRAGMENT];
         struct elk_wm_prog_data *wm_prog_data = (void *) shader->prog_data;
         entry->ColorBufferBlendEnable =
            (!cso_blend->dual_color_blending || wm_prog_data->dual_src_blend);
      } else
         entry->ColorBufferBlendEnable = 1;

      entry->ColorBlendFunction          = rt->rgb_func;
      entry->AlphaBlendFunction          = rt->alpha_func;
      /* The casts prevent warnings about implicit enum type conversions. */
      entry->SourceBlendFactor           = (int) src_rgb;
      entry->SourceAlphaBlendFactor      = (int) src_alpha;
      entry->DestinationBlendFactor      = (int) dst_rgb;
      entry->DestinationAlphaBlendFactor = (int) dst_alpha;
   }
#if GFX_VER <= 5
   /*
    * Gen4/GM45/ILK can't handle have ColorBufferBlendEnable == 0
    * when a dual src blend shader is in use. Setup dummy blending.
    */
   struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_FRAGMENT];
   struct elk_wm_prog_data *wm_prog_data = (void *) shader->prog_data;
   if (idx == 0 && !blend_enabled && wm_prog_data->dual_src_blend) {
      entry->ColorBufferBlendEnable = 1;
      entry->ColorBlendFunction = PIPE_BLEND_ADD;
      entry->AlphaBlendFunction = PIPE_BLEND_ADD;
      entry->SourceBlendFactor = PIPE_BLENDFACTOR_ONE;
      entry->SourceAlphaBlendFactor = PIPE_BLENDFACTOR_ONE;
      entry->DestinationBlendFactor = PIPE_BLENDFACTOR_ZERO;
      entry->DestinationAlphaBlendFactor = PIPE_BLENDFACTOR_ZERO;
   }
#endif
   return independent_alpha_blend;
}
1582 
1583 /**
1584  * The pipe->create_blend_state() driver hook.
1585  *
1586  * Translates a pipe_blend_state into crocus_blend_state.
1587  */
1588 static void *
crocus_create_blend_state(struct pipe_context * ctx,const struct pipe_blend_state * state)1589 crocus_create_blend_state(struct pipe_context *ctx,
1590                           const struct pipe_blend_state *state)
1591 {
1592    struct crocus_blend_state *cso = malloc(sizeof(struct crocus_blend_state));
1593 
1594    cso->blend_enables = 0;
1595    cso->color_write_enables = 0;
1596    STATIC_ASSERT(ELK_MAX_DRAW_BUFFERS <= 8);
1597 
1598    cso->cso = *state;
1599    cso->dual_color_blending = util_blend_state_is_dual(state, 0);
1600 
1601 #if GFX_VER == 8
1602    bool indep_alpha_blend = false;
1603 #endif
1604    for (int i = 0; i < ELK_MAX_DRAW_BUFFERS; i++) {
1605       const struct pipe_rt_blend_state *rt =
1606          &state->rt[state->independent_blend_enable ? i : 0];
1607       if (rt->blend_enable)
1608          cso->blend_enables |= 1u << i;
1609       if (rt->colormask)
1610          cso->color_write_enables |= 1u << i;
1611 #if GFX_VER == 8
1612       enum pipe_blendfactor src_rgb =
1613          fix_blendfactor(rt->rgb_src_factor, state->alpha_to_one);
1614       enum pipe_blendfactor src_alpha =
1615          fix_blendfactor(rt->alpha_src_factor, state->alpha_to_one);
1616       enum pipe_blendfactor dst_rgb =
1617          fix_blendfactor(rt->rgb_dst_factor, state->alpha_to_one);
1618       enum pipe_blendfactor dst_alpha =
1619          fix_blendfactor(rt->alpha_dst_factor, state->alpha_to_one);
1620 
1621       if (rt->rgb_func != rt->alpha_func ||
1622           src_rgb != src_alpha || dst_rgb != dst_alpha)
1623          indep_alpha_blend = true;
1624 #endif
1625    }
1626 
1627 #if GFX_VER == 8
1628    crocus_pack_command(GENX(3DSTATE_PS_BLEND), cso->ps_blend, pb) {
1629       /* pb.HasWriteableRT is filled in at draw time.
1630        * pb.AlphaTestEnable is filled in at draw time.
1631        *
1632        * pb.ColorBufferBlendEnable is filled in at draw time so we can avoid
1633        * setting it when dual color blending without an appropriate shader.
1634        */
1635 
1636       pb.AlphaToCoverageEnable = state->alpha_to_coverage;
1637       pb.IndependentAlphaBlendEnable = indep_alpha_blend;
1638 
1639       /* The casts prevent warnings about implicit enum type conversions. */
1640       pb.SourceBlendFactor =
1641          (int) fix_blendfactor(state->rt[0].rgb_src_factor, state->alpha_to_one);
1642       pb.SourceAlphaBlendFactor =
1643          (int) fix_blendfactor(state->rt[0].alpha_src_factor, state->alpha_to_one);
1644       pb.DestinationBlendFactor =
1645          (int) fix_blendfactor(state->rt[0].rgb_dst_factor, state->alpha_to_one);
1646       pb.DestinationAlphaBlendFactor =
1647          (int) fix_blendfactor(state->rt[0].alpha_dst_factor, state->alpha_to_one);
1648    }
1649 #endif
1650    return cso;
1651 }
1652 
1653 /**
1654  * The pipe->bind_blend_state() driver hook.
1655  *
1656  * Bind a blending CSO and flag related dirty bits.
1657  */
1658 static void
crocus_bind_blend_state(struct pipe_context * ctx,void * state)1659 crocus_bind_blend_state(struct pipe_context *ctx, void *state)
1660 {
1661    struct crocus_context *ice = (struct crocus_context *) ctx;
1662    struct crocus_blend_state *cso = state;
1663 
1664    ice->state.cso_blend = cso;
1665    ice->state.blend_enables = cso ? cso->blend_enables : 0;
1666 
1667    ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_FS;
1668    ice->state.dirty |= CROCUS_DIRTY_WM;
1669 #if GFX_VER >= 6
1670    ice->state.dirty |= CROCUS_DIRTY_GEN6_BLEND_STATE;
1671 #endif
1672 #if GFX_VER >= 7
1673    ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_FS;
1674 #endif
1675 #if GFX_VER == 8
1676    ice->state.dirty |= CROCUS_DIRTY_GEN8_PMA_FIX;
1677    ice->state.dirty |= CROCUS_DIRTY_GEN8_PS_BLEND;
1678 #endif
1679    ice->state.dirty |= CROCUS_DIRTY_COLOR_CALC_STATE;
1680    ice->state.dirty |= CROCUS_DIRTY_RENDER_RESOLVES_AND_FLUSHES;
1681    ice->state.stage_dirty |= ice->state.stage_dirty_for_nos[CROCUS_NOS_BLEND];
1682 }
1683 
1684 /**
1685  * Return true if the FS writes to any color outputs which are not disabled
1686  * via color masking.
1687  */
1688 static bool
has_writeable_rt(const struct crocus_blend_state * cso_blend,const struct shader_info * fs_info)1689 has_writeable_rt(const struct crocus_blend_state *cso_blend,
1690                  const struct shader_info *fs_info)
1691 {
1692    if (!fs_info)
1693       return false;
1694 
1695    unsigned rt_outputs = fs_info->outputs_written >> FRAG_RESULT_DATA0;
1696 
1697    if (fs_info->outputs_written & BITFIELD64_BIT(FRAG_RESULT_COLOR))
1698       rt_outputs = (1 << ELK_MAX_DRAW_BUFFERS) - 1;
1699 
1700    return cso_blend->color_write_enables & rt_outputs;
1701 }
1702 
1703 /**
1704  * Gallium CSO for depth, stencil, and alpha testing state.
1705  */
struct crocus_depth_stencil_alpha_state {
   /** Copy of the Gallium depth/stencil/alpha state. */
   struct pipe_depth_stencil_alpha_state cso;

   /* Cached write-enable flags, consulted by resolve/flush decisions. */
   bool depth_writes_enabled;
   bool stencil_writes_enabled;
};
1712 
1713 /**
1714  * The pipe->create_depth_stencil_alpha_state() driver hook.
1715  *
1716  * We encode most of 3DSTATE_WM_DEPTH_STENCIL, and just save off the alpha
1717  * testing state since we need pieces of it in a variety of places.
1718  */
1719 static void *
crocus_create_zsa_state(struct pipe_context * ctx,const struct pipe_depth_stencil_alpha_state * state)1720 crocus_create_zsa_state(struct pipe_context *ctx,
1721                         const struct pipe_depth_stencil_alpha_state *state)
1722 {
1723    struct crocus_depth_stencil_alpha_state *cso =
1724       malloc(sizeof(struct crocus_depth_stencil_alpha_state));
1725 
1726    bool two_sided_stencil = state->stencil[1].enabled;
1727    cso->cso = *state;
1728 
1729    cso->depth_writes_enabled = state->depth_writemask;
1730    cso->stencil_writes_enabled =
1731       state->stencil[0].writemask != 0 ||
1732       (two_sided_stencil && state->stencil[1].writemask != 0);
1733 
1734    /* The state tracker needs to optimize away EQUAL writes for us. */
1735    assert(!(state->depth_func == PIPE_FUNC_EQUAL && state->depth_writemask));
1736 
1737    return cso;
1738 }
1739 
1740 /**
1741  * The pipe->bind_depth_stencil_alpha_state() driver hook.
1742  *
1743  * Bind a depth/stencil/alpha CSO and flag related dirty bits.
1744  */
static void
crocus_bind_zsa_state(struct pipe_context *ctx, void *state)
{
   struct crocus_context *ice = (struct crocus_context *) ctx;
   struct crocus_depth_stencil_alpha_state *old_cso = ice->state.cso_zsa;
   struct crocus_depth_stencil_alpha_state *new_cso = state;

   if (new_cso) {
      /* cso_changed() compares the named field of old_cso vs. new_cso, so
       * only the packets that actually depend on changed state get dirtied.
       */
      if (cso_changed(cso.alpha_ref_value))
         ice->state.dirty |= CROCUS_DIRTY_COLOR_CALC_STATE;

      if (cso_changed(cso.alpha_enabled))
         ice->state.dirty |= CROCUS_DIRTY_WM;
#if GFX_VER >= 6
      /* On Gen6+ the alpha test lives in BLEND_STATE, so both the enable
       * and the compare function affect that packet.
       */
      if (cso_changed(cso.alpha_enabled))
         ice->state.dirty |= CROCUS_DIRTY_GEN6_BLEND_STATE;

      if (cso_changed(cso.alpha_func))
         ice->state.dirty |= CROCUS_DIRTY_GEN6_BLEND_STATE;
#endif
#if GFX_VER == 8
      if (cso_changed(cso.alpha_enabled))
         ice->state.dirty |= CROCUS_DIRTY_GEN8_PS_BLEND;
#endif

      if (cso_changed(depth_writes_enabled))
         ice->state.dirty |= CROCUS_DIRTY_RENDER_RESOLVES_AND_FLUSHES;

      ice->state.depth_writes_enabled = new_cso->depth_writes_enabled;
      ice->state.stencil_writes_enabled = new_cso->stencil_writes_enabled;

#if GFX_VER <= 5
      /* Gen4/5 fold depth/stencil state into COLOR_CALC_STATE. */
      ice->state.dirty |= CROCUS_DIRTY_COLOR_CALC_STATE;
#endif
   }

   ice->state.cso_zsa = new_cso;
   ice->state.dirty |= CROCUS_DIRTY_CC_VIEWPORT;
#if GFX_VER >= 6
   ice->state.dirty |= CROCUS_DIRTY_GEN6_WM_DEPTH_STENCIL;
#endif
#if GFX_VER == 8
   ice->state.dirty |= CROCUS_DIRTY_GEN8_PMA_FIX;
#endif
   ice->state.stage_dirty |= ice->state.stage_dirty_for_nos[CROCUS_NOS_DEPTH_STENCIL_ALPHA];
}
1791 
#if GFX_VER == 8
/**
 * Evaluate the documented Gfx8 PMA-fix state equation against the current
 * bound state.  Returns true if CACHE_MODE_1::NP_PMA_FIX_ENABLE should be
 * set (see genX(crocus_update_pma_fix) for the actual register write).
 */
static bool
want_pma_fix(struct crocus_context *ice)
{
   UNUSED struct crocus_screen *screen = (void *) ice->ctx.screen;
   UNUSED const struct intel_device_info *devinfo = &screen->devinfo;
   const struct elk_wm_prog_data *wm_prog_data = (void *)
      ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data;
   const struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
   const struct crocus_depth_stencil_alpha_state *cso_zsa = ice->state.cso_zsa;
   const struct crocus_blend_state *cso_blend = ice->state.cso_blend;

   /* In very specific combinations of state, we can instruct Gfx8-9 hardware
    * to avoid stalling at the pixel mask array.  The state equations are
    * documented in these places:
    *
    * - Gfx8 Depth PMA Fix:   CACHE_MODE_1::NP_PMA_FIX_ENABLE
    * - Gfx9 Stencil PMA Fix: CACHE_MODE_0::STC PMA Optimization Enable
    *
    * Both equations share some common elements:
    *
    *    no_hiz_op =
    *       !(3DSTATE_WM_HZ_OP::DepthBufferClear ||
    *         3DSTATE_WM_HZ_OP::DepthBufferResolve ||
    *         3DSTATE_WM_HZ_OP::Hierarchical Depth Buffer Resolve Enable ||
    *         3DSTATE_WM_HZ_OP::StencilBufferClear) &&
    *
    *    killpixels =
    *       3DSTATE_WM::ForceKillPix != ForceOff &&
    *       (3DSTATE_PS_EXTRA::PixelShaderKillsPixels ||
    *        3DSTATE_PS_EXTRA::oMask Present to RenderTarget ||
    *        3DSTATE_PS_BLEND::AlphaToCoverageEnable ||
    *        3DSTATE_PS_BLEND::AlphaTestEnable ||
    *        3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable)
    *
    *    (Technically the stencil PMA treats ForceKillPix differently,
    *     but I think this is a documentation oversight, and we don't
    *     ever use it in this way, so it doesn't matter).
    *
    *    common_pma_fix =
    *       3DSTATE_WM::ForceThreadDispatch != 1 &&
    *       3DSTATE_RASTER::ForceSampleCount == NUMRASTSAMPLES_0 &&
    *       3DSTATE_DEPTH_BUFFER::SURFACE_TYPE != NULL &&
    *       3DSTATE_DEPTH_BUFFER::HIZ Enable &&
    *       3DSTATE_WM::EDSC_Mode != EDSC_PREPS &&
    *       3DSTATE_PS_EXTRA::PixelShaderValid &&
    *       no_hiz_op
    *
    * These are always true:
    *
    *    3DSTATE_RASTER::ForceSampleCount == NUMRASTSAMPLES_0
    *    3DSTATE_PS_EXTRA::PixelShaderValid
    *
    * Also, we never use the normal drawing path for HiZ ops; these are true:
    *
    *    !(3DSTATE_WM_HZ_OP::DepthBufferClear ||
    *      3DSTATE_WM_HZ_OP::DepthBufferResolve ||
    *      3DSTATE_WM_HZ_OP::Hierarchical Depth Buffer Resolve Enable ||
    *      3DSTATE_WM_HZ_OP::StencilBufferClear)
    *
    * This happens sometimes:
    *
    *    3DSTATE_WM::ForceThreadDispatch != 1
    *
    * However, we choose to ignore it as it either agrees with the signal
    * (dispatch was already enabled, so nothing out of the ordinary), or
    * there are no framebuffer attachments (so no depth or HiZ anyway,
    * meaning the PMA signal will already be disabled).
    */

   if (!cso_fb->zsbuf)
      return false;

   struct crocus_resource *zres, *sres;
   crocus_get_depth_stencil_resources(devinfo,
                                      cso_fb->zsbuf->texture, &zres, &sres);

   /* 3DSTATE_DEPTH_BUFFER::SURFACE_TYPE != NULL &&
    * 3DSTATE_DEPTH_BUFFER::HIZ Enable &&
    */
   if (!zres || !crocus_resource_level_has_hiz(zres, cso_fb->zsbuf->u.tex.level))
      return false;

   /* 3DSTATE_WM::EDSC_Mode != EDSC_PREPS */
   if (wm_prog_data->early_fragment_tests)
      return false;

   /* 3DSTATE_WM::ForceKillPix != ForceOff &&
    * (3DSTATE_PS_EXTRA::PixelShaderKillsPixels ||
    *  3DSTATE_PS_EXTRA::oMask Present to RenderTarget ||
    *  3DSTATE_PS_BLEND::AlphaToCoverageEnable ||
    *  3DSTATE_PS_BLEND::AlphaTestEnable ||
    *  3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable)
    */
   bool killpixels = wm_prog_data->uses_kill || wm_prog_data->uses_omask ||
                     cso_blend->cso.alpha_to_coverage || cso_zsa->cso.alpha_enabled;

   /* The Gfx8 depth PMA equation becomes:
    *
    *    depth_writes =
    *       3DSTATE_WM_DEPTH_STENCIL::DepthWriteEnable &&
    *       3DSTATE_DEPTH_BUFFER::DEPTH_WRITE_ENABLE
    *
    *    stencil_writes =
    *       3DSTATE_WM_DEPTH_STENCIL::Stencil Buffer Write Enable &&
    *       3DSTATE_DEPTH_BUFFER::STENCIL_WRITE_ENABLE &&
    *       3DSTATE_STENCIL_BUFFER::STENCIL_BUFFER_ENABLE
    *
    *    Z_PMA_OPT =
    *       common_pma_fix &&
    *       3DSTATE_WM_DEPTH_STENCIL::DepthTestEnable &&
    *       ((killpixels && (depth_writes || stencil_writes)) ||
    *        3DSTATE_PS_EXTRA::PixelShaderComputedDepthMode != PSCDEPTH_OFF)
    *
    */
   if (!cso_zsa->cso.depth_enabled)
      return false;

   /* stencil_writes only count when a stencil buffer (sres) is bound. */
   return wm_prog_data->computed_depth_mode != PSCDEPTH_OFF ||
          (killpixels && (cso_zsa->depth_writes_enabled ||
                          (sres && cso_zsa->stencil_writes_enabled)));
}
#endif
1915 void
genX(crocus_update_pma_fix)1916 genX(crocus_update_pma_fix)(struct crocus_context *ice,
1917                             struct crocus_batch *batch,
1918                             bool enable)
1919 {
1920 #if GFX_VER == 8
1921    struct crocus_genx_state *genx = ice->state.genx;
1922 
1923    if (genx->pma_fix_enabled == enable)
1924       return;
1925 
1926    genx->pma_fix_enabled = enable;
1927 
1928    /* According to the Broadwell PIPE_CONTROL documentation, software should
1929     * emit a PIPE_CONTROL with the CS Stall and Depth Cache Flush bits set
1930     * prior to the LRI.  If stencil buffer writes are enabled, then a Render        * Cache Flush is also necessary.
1931     *
1932     * The Gfx9 docs say to use a depth stall rather than a command streamer
1933     * stall.  However, the hardware seems to violently disagree.  A full
1934     * command streamer stall seems to be needed in both cases.
1935     */
1936    crocus_emit_pipe_control_flush(batch, "PMA fix change (1/2)",
1937                                   PIPE_CONTROL_CS_STALL |
1938                                   PIPE_CONTROL_DEPTH_CACHE_FLUSH |
1939                                   PIPE_CONTROL_RENDER_TARGET_FLUSH);
1940 
1941    crocus_emit_reg(batch, GENX(CACHE_MODE_1), reg) {
1942       reg.NPPMAFixEnable = enable;
1943       reg.NPEarlyZFailsDisable = enable;
1944       reg.NPPMAFixEnableMask = true;
1945       reg.NPEarlyZFailsDisableMask = true;
1946    }
1947 
1948    /* After the LRI, a PIPE_CONTROL with both the Depth Stall and Depth Cache
1949     * Flush bits is often necessary.  We do it regardless because it's easier.
1950     * The render cache flush is also necessary if stencil writes are enabled.
1951     *
1952     * Again, the Gfx9 docs give a different set of flushes but the Broadwell
1953     * flushes seem to work just as well.
1954     */
1955    crocus_emit_pipe_control_flush(batch, "PMA fix change (1/2)",
1956                                   PIPE_CONTROL_DEPTH_STALL |
1957                                   PIPE_CONTROL_DEPTH_CACHE_FLUSH |
1958                                   PIPE_CONTROL_RENDER_TARGET_FLUSH);
1959 #endif
1960 }
1961 
1962 static float
get_line_width(const struct pipe_rasterizer_state * state)1963 get_line_width(const struct pipe_rasterizer_state *state)
1964 {
1965    float line_width = state->line_width;
1966 
1967    /* From the OpenGL 4.4 spec:
1968     *
1969     * "The actual width of non-antialiased lines is determined by rounding
1970     *  the supplied width to the nearest integer, then clamping it to the
1971     *  implementation-dependent maximum non-antialiased line width."
1972     */
1973    if (!state->multisample && !state->line_smooth)
1974       line_width = roundf(state->line_width);
1975 
1976    if (!state->multisample && state->line_smooth && line_width < 1.5f) {
1977       /* For 1 pixel line thickness or less, the general anti-aliasing
1978        * algorithm gives up, and a garbage line is generated.  Setting a
1979        * Line Width of 0.0 specifies the rasterization of the "thinnest"
1980        * (one-pixel-wide), non-antialiased lines.
1981        *
1982        * Lines rendered with zero Line Width are rasterized using the
1983        * "Grid Intersection Quantization" rules as specified by the
1984        * "Zero-Width (Cosmetic) Line Rasterization" section of the docs.
1985        */
1986       /* hack around this for gfx4/5 fps counters in hud. */
1987       line_width = GFX_VER < 6 ? 1.5f : 0.0f;
1988    }
1989    return line_width;
1990 }
1991 
1992 /**
1993  * The pipe->create_rasterizer_state() driver hook.
1994  */
1995 static void *
crocus_create_rasterizer_state(struct pipe_context * ctx,const struct pipe_rasterizer_state * state)1996 crocus_create_rasterizer_state(struct pipe_context *ctx,
1997                                const struct pipe_rasterizer_state *state)
1998 {
1999    struct crocus_rasterizer_state *cso =
2000       malloc(sizeof(struct crocus_rasterizer_state));
2001 
2002    cso->fill_mode_point_or_line =
2003       state->fill_front == PIPE_POLYGON_MODE_LINE ||
2004       state->fill_front == PIPE_POLYGON_MODE_POINT ||
2005       state->fill_back == PIPE_POLYGON_MODE_LINE ||
2006       state->fill_back == PIPE_POLYGON_MODE_POINT;
2007 
2008    if (state->clip_plane_enable != 0)
2009       cso->num_clip_plane_consts = util_logbase2(state->clip_plane_enable) + 1;
2010    else
2011       cso->num_clip_plane_consts = 0;
2012 
2013    cso->cso = *state;
2014 
2015 #if GFX_VER >= 6
2016    float line_width = get_line_width(state);
2017 
2018    crocus_pack_command(GENX(3DSTATE_SF), cso->sf, sf) {
2019       sf.StatisticsEnable = true;
2020       sf.AALineDistanceMode = AALINEDISTANCE_TRUE;
2021       sf.LineEndCapAntialiasingRegionWidth =
2022          state->line_smooth ? _10pixels : _05pixels;
2023       sf.LastPixelEnable = state->line_last_pixel;
2024 #if GFX_VER <= 7
2025       sf.AntialiasingEnable = state->line_smooth;
2026 #endif
2027 #if GFX_VER == 8
2028       struct crocus_screen *screen = (struct crocus_screen *)ctx->screen;
2029       if (screen->devinfo.platform == INTEL_PLATFORM_CHV)
2030          sf.CHVLineWidth = line_width;
2031       else
2032          sf.LineWidth = line_width;
2033 #else
2034       sf.LineWidth = line_width;
2035 #endif
2036       sf.PointWidthSource = state->point_size_per_vertex ? Vertex : State;
2037       sf.PointWidth = state->point_size;
2038 
2039       if (state->flatshade_first) {
2040          sf.TriangleFanProvokingVertexSelect = 1;
2041       } else {
2042          sf.TriangleStripListProvokingVertexSelect = 2;
2043          sf.TriangleFanProvokingVertexSelect = 2;
2044          sf.LineStripListProvokingVertexSelect = 1;
2045       }
2046 
2047 #if GFX_VER == 6
2048       sf.AttributeSwizzleEnable = true;
2049       if (state->sprite_coord_mode == PIPE_SPRITE_COORD_LOWER_LEFT)
2050          sf.PointSpriteTextureCoordinateOrigin = LOWERLEFT;
2051       else
2052          sf.PointSpriteTextureCoordinateOrigin = UPPERLEFT;
2053 #endif
2054 
2055 #if GFX_VER <= 7
2056       sf.FrontWinding = state->front_ccw ? 1 : 0; // Or the other way...
2057 
2058 #if GFX_VER >= 6
2059       sf.GlobalDepthOffsetEnableSolid = state->offset_tri;
2060       sf.GlobalDepthOffsetEnableWireframe = state->offset_line;
2061       sf.GlobalDepthOffsetEnablePoint = state->offset_point;
2062       sf.GlobalDepthOffsetConstant = state->offset_units * 2;
2063       sf.GlobalDepthOffsetScale = state->offset_scale;
2064       sf.GlobalDepthOffsetClamp = state->offset_clamp;
2065 
2066       sf.FrontFaceFillMode = translate_fill_mode(state->fill_front);
2067       sf.BackFaceFillMode = translate_fill_mode(state->fill_back);
2068 #endif
2069 
2070       sf.CullMode = translate_cull_mode(state->cull_face);
2071       sf.ScissorRectangleEnable = true;
2072 
2073 #if GFX_VERx10 == 75
2074       sf.LineStippleEnable = state->line_stipple_enable;
2075 #endif
2076 #endif
2077    }
2078 #endif
2079 
2080 #if GFX_VER == 8
2081    crocus_pack_command(GENX(3DSTATE_RASTER), cso->raster, rr) {
2082       rr.FrontWinding = state->front_ccw ? CounterClockwise : Clockwise;
2083       rr.CullMode = translate_cull_mode(state->cull_face);
2084       rr.FrontFaceFillMode = translate_fill_mode(state->fill_front);
2085       rr.BackFaceFillMode = translate_fill_mode(state->fill_back);
2086       rr.DXMultisampleRasterizationEnable = state->multisample;
2087       rr.GlobalDepthOffsetEnableSolid = state->offset_tri;
2088       rr.GlobalDepthOffsetEnableWireframe = state->offset_line;
2089       rr.GlobalDepthOffsetEnablePoint = state->offset_point;
2090       rr.GlobalDepthOffsetConstant = state->offset_units * 2;
2091       rr.GlobalDepthOffsetScale = state->offset_scale;
2092       rr.GlobalDepthOffsetClamp = state->offset_clamp;
2093       rr.SmoothPointEnable = state->point_smooth;
2094       rr.AntialiasingEnable = state->line_smooth;
2095       rr.ScissorRectangleEnable = state->scissor;
2096       rr.ViewportZClipTestEnable = (state->depth_clip_near || state->depth_clip_far);
2097    }
2098 #endif
2099 
2100 #if GFX_VER >= 6
2101    crocus_pack_command(GENX(3DSTATE_CLIP), cso->clip, cl) {
2102       /* cl.NonPerspectiveBarycentricEnable is filled in at draw time from
2103        * the FS program; cl.ForceZeroRTAIndexEnable is filled in from the FB.
2104        */
2105 #if GFX_VER >= 7
2106       cl.EarlyCullEnable = true;
2107 #endif
2108 
2109 #if GFX_VER == 7
2110       cl.FrontWinding = state->front_ccw ? 1 : 0;
2111       cl.CullMode = translate_cull_mode(state->cull_face);
2112 #endif
2113       cl.UserClipDistanceClipTestEnableBitmask = state->clip_plane_enable;
2114 #if GFX_VER < 8
2115       cl.ViewportZClipTestEnable = (state->depth_clip_near || state->depth_clip_far);
2116 #endif
2117       cl.APIMode = state->clip_halfz ? APIMODE_D3D : APIMODE_OGL;
2118       cl.GuardbandClipTestEnable = true;
2119       cl.ClipEnable = true;
2120       cl.MinimumPointWidth = 0.125;
2121       cl.MaximumPointWidth = 255.875;
2122 
2123 #if GFX_VER == 8
2124       cl.ForceUserClipDistanceClipTestEnableBitmask = true;
2125 #endif
2126 
2127       if (state->flatshade_first) {
2128          cl.TriangleFanProvokingVertexSelect = 1;
2129       } else {
2130          cl.TriangleStripListProvokingVertexSelect = 2;
2131          cl.TriangleFanProvokingVertexSelect = 2;
2132          cl.LineStripListProvokingVertexSelect = 1;
2133       }
2134    }
2135 #endif
2136 
2137    /* Remap from 0..255 back to 1..256 */
2138    const unsigned line_stipple_factor = state->line_stipple_factor + 1;
2139 
2140    crocus_pack_command(GENX(3DSTATE_LINE_STIPPLE), cso->line_stipple, line) {
2141       if (state->line_stipple_enable) {
2142          line.LineStipplePattern = state->line_stipple_pattern;
2143          line.LineStippleInverseRepeatCount = 1.0f / line_stipple_factor;
2144          line.LineStippleRepeatCount = line_stipple_factor;
2145       }
2146    }
2147 
2148    return cso;
2149 }
2150 
2151 /**
2152  * The pipe->bind_rasterizer_state() driver hook.
2153  *
2154  * Bind a rasterizer CSO and flag related dirty bits.
2155  */
static void
crocus_bind_rasterizer_state(struct pipe_context *ctx, void *state)
{
   struct crocus_context *ice = (struct crocus_context *) ctx;
   struct crocus_rasterizer_state *old_cso = ice->state.cso_rast;
   struct crocus_rasterizer_state *new_cso = state;

   if (new_cso) {
      /* Try to avoid re-emitting 3DSTATE_LINE_STIPPLE, it's non-pipelined */
      if (cso_changed_memcmp(line_stipple))
         ice->state.dirty |= CROCUS_DIRTY_LINE_STIPPLE;
#if GFX_VER >= 6
      if (cso_changed(cso.half_pixel_center))
         ice->state.dirty |= CROCUS_DIRTY_GEN6_MULTISAMPLE;
      if (cso_changed(cso.scissor))
         ice->state.dirty |= CROCUS_DIRTY_GEN6_SCISSOR_RECT;
      if (cso_changed(cso.multisample))
         ice->state.dirty |= CROCUS_DIRTY_WM;
#else
      if (cso_changed(cso.scissor))
         ice->state.dirty |= CROCUS_DIRTY_SF_CL_VIEWPORT;
#endif

      if (cso_changed(cso.line_stipple_enable) || cso_changed(cso.poly_stipple_enable))
         ice->state.dirty |= CROCUS_DIRTY_WM;

#if GFX_VER >= 6
      if (cso_changed(cso.rasterizer_discard))
         ice->state.dirty |= CROCUS_DIRTY_STREAMOUT | CROCUS_DIRTY_CLIP;

      if (cso_changed(cso.flatshade_first))
         ice->state.dirty |= CROCUS_DIRTY_STREAMOUT;
#endif

      /* Depth-clip / half-z changes affect the guardband/viewport setup. */
      if (cso_changed(cso.depth_clip_near) || cso_changed(cso.depth_clip_far) ||
          cso_changed(cso.clip_halfz))
         ice->state.dirty |= CROCUS_DIRTY_CC_VIEWPORT;

#if GFX_VER >= 7
      if (cso_changed(cso.sprite_coord_enable) ||
          cso_changed(cso.sprite_coord_mode) ||
          cso_changed(cso.light_twoside))
         ice->state.dirty |= CROCUS_DIRTY_GEN7_SBE;
#endif
#if GFX_VER <= 5
      /* Clip planes are pushed through the CURBE on Gen4/5. */
      if (cso_changed(cso.clip_plane_enable))
         ice->state.dirty |= CROCUS_DIRTY_GEN4_CURBE;
#endif
   }

   ice->state.cso_rast = new_cso;
   ice->state.dirty |= CROCUS_DIRTY_RASTER;
   ice->state.dirty |= CROCUS_DIRTY_CLIP;
#if GFX_VER <= 5
   ice->state.dirty |= CROCUS_DIRTY_GEN4_CLIP_PROG | CROCUS_DIRTY_GEN4_SF_PROG;
   ice->state.dirty |= CROCUS_DIRTY_WM;
#endif
#if GFX_VER <= 6
   ice->state.dirty |= CROCUS_DIRTY_GEN4_FF_GS_PROG;
#endif
   ice->state.stage_dirty |= ice->state.stage_dirty_for_nos[CROCUS_NOS_RASTERIZER];
}
2218 
2219 /**
2220  * Return true if the given wrap mode requires the border color to exist.
2221  *
2222  * (We can skip uploading it if the sampler isn't going to use it.)
2223  */
2224 static bool
wrap_mode_needs_border_color(unsigned wrap_mode)2225 wrap_mode_needs_border_color(unsigned wrap_mode)
2226 {
2227 #if GFX_VER == 8
2228    return wrap_mode == TCM_CLAMP_BORDER || wrap_mode == TCM_HALF_BORDER;
2229 #else
2230    return wrap_mode == TCM_CLAMP_BORDER;
2231 #endif
2232 }
2233 
2234 /**
2235  * Gallium CSO for sampler state.
2236  */
struct crocus_sampler_state {
   /** Copy of the Gallium sampler state. */
   struct pipe_sampler_state pstate;
   /** Border color, uploaded separately when needed. */
   union pipe_color_union border_color;
   /** True if any wrap mode can sample the border color. */
   bool needs_border_color;
   /* Wrap modes translated to hardware TCM_* values. */
   unsigned wrap_s;
   unsigned wrap_t;
   unsigned wrap_r;
   /* May differ from pstate (see the ilo-ported hack in create_sampler_state). */
   unsigned mag_img_filter;
   float min_lod;
};
2247 
2248 /**
2249  * The pipe->create_sampler_state() driver hook.
2250  *
2251  * We fill out SAMPLER_STATE (except for the border color pointer), and
2252  * store that on the CPU.  It doesn't make sense to upload it to a GPU
2253  * buffer object yet, because 3DSTATE_SAMPLER_STATE_POINTERS requires
 * all bound sampler states to be in contiguous memory.
2255  */
static void *
crocus_create_sampler_state(struct pipe_context *ctx,
                            const struct pipe_sampler_state *state)
{
   struct crocus_sampler_state *cso = CALLOC_STRUCT(crocus_sampler_state);

   if (!cso)
      return NULL;

   STATIC_ASSERT(PIPE_TEX_FILTER_NEAREST == MAPFILTER_NEAREST);
   STATIC_ASSERT(PIPE_TEX_FILTER_LINEAR == MAPFILTER_LINEAR);

   /* translate_wrap() takes the filter into account; presumably some wrap
    * modes behave differently with nearest filtering on this hardware.
    */
   bool either_nearest = state->min_img_filter == PIPE_TEX_FILTER_NEAREST ||
      state->mag_img_filter == PIPE_TEX_FILTER_NEAREST;
   cso->wrap_s = translate_wrap(state->wrap_s, either_nearest);
   cso->wrap_t = translate_wrap(state->wrap_t, either_nearest);
   cso->wrap_r = translate_wrap(state->wrap_r, either_nearest);

   cso->pstate = *state;

   memcpy(&cso->border_color, &state->border_color, sizeof(cso->border_color));

   /* Only upload a border color if some wrap mode can actually read it. */
   cso->needs_border_color = wrap_mode_needs_border_color(cso->wrap_s) ||
                             wrap_mode_needs_border_color(cso->wrap_t) ||
                             wrap_mode_needs_border_color(cso->wrap_r);

   cso->min_lod = state->min_lod;
   cso->mag_img_filter = state->mag_img_filter;

   // XXX: explain this code ported from ilo...I don't get it at all...
   /* NOTE(review): with mipmap filtering off but a positive min_lod, this
    * clamps min_lod to 0 and substitutes the minification filter for
    * magnification — presumably to approximate GL's non-mipmapped sampling
    * on this hardware; confirm against the ilo driver history.
    */
   if (state->min_mip_filter == PIPE_TEX_MIPFILTER_NONE &&
       state->min_lod > 0.0f) {
      cso->min_lod = 0.0f;
      cso->mag_img_filter = state->min_img_filter;
   }

   return cso;
}
2294 
2295 /**
2296  * The pipe->bind_sampler_states() driver hook.
2297  */
2298 static void
crocus_bind_sampler_states(struct pipe_context * ctx,enum pipe_shader_type p_stage,unsigned start,unsigned count,void ** states)2299 crocus_bind_sampler_states(struct pipe_context *ctx,
2300                            enum pipe_shader_type p_stage,
2301                            unsigned start, unsigned count,
2302                            void **states)
2303 {
2304    struct crocus_context *ice = (struct crocus_context *) ctx;
2305    gl_shader_stage stage = stage_from_pipe(p_stage);
2306    struct crocus_shader_state *shs = &ice->state.shaders[stage];
2307 
2308    assert(start + count <= CROCUS_MAX_TEXTURE_SAMPLERS);
2309 
2310    bool dirty = false;
2311 
2312    for (int i = 0; i < count; i++) {
2313       if (shs->samplers[start + i] != states[i]) {
2314          shs->samplers[start + i] = states[i];
2315          dirty = true;
2316       }
2317    }
2318 
2319    if (dirty) {
2320 #if GFX_VER <= 5
2321       if (p_stage == PIPE_SHADER_FRAGMENT)
2322          ice->state.dirty |= CROCUS_DIRTY_WM;
2323       else if (p_stage == PIPE_SHADER_VERTEX)
2324          ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_VS;
2325 #endif
2326       ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_SAMPLER_STATES_VS << stage;
2327       ice->state.stage_dirty |= ice->state.stage_dirty_for_nos[CROCUS_NOS_TEXTURES];
2328    }
2329 }
2330 
/* Per-texture wrap-mode overrides applied when packing SAMPLER_STATE;
 * selected in crocus_upload_sampler_states() and consumed by
 * crocus_upload_sampler_state().
 */
enum samp_workaround {
   SAMP_NORMAL,      /* no override: use the CSO's translated wrap modes */
   SAMP_CUBE_CLAMP,  /* force CLAMP on all three coordinates (cube maps) */
   SAMP_CUBE_CUBE,   /* force CUBE on all three coordinates (seamless cubes) */
   SAMP_T_WRAP,      /* force wrap_t to WRAP (1D texture sampling bug) */
};
2337 
/**
 * Pack one SAMPLER_STATE structure into the memory at "map".
 *
 * "border_color_offset" locates the previously uploaded border color in
 * batch state memory (only meaningful when the wrap modes need it).
 * "samp_workaround" may override the CSO's wrap modes (see enum
 * samp_workaround); "first_level" is only consumed on Gen6, where the
 * base mip level lives in SAMPLER_STATE.
 */
static void
crocus_upload_sampler_state(struct crocus_batch *batch,
                            struct crocus_sampler_state *cso,
                            uint32_t border_color_offset,
                            enum samp_workaround samp_workaround,
                            uint32_t first_level,
                            void *map)
{
   struct pipe_sampler_state *state = &cso->pstate;
   uint32_t wrap_s, wrap_t, wrap_r;

   /* Start from the wrap modes translated at CSO creation time... */
   wrap_s = cso->wrap_s;
   wrap_t = cso->wrap_t;
   wrap_r = cso->wrap_r;

   /* ...then apply any per-texture override. */
   switch (samp_workaround) {
   case SAMP_CUBE_CLAMP:
      wrap_s = TCM_CLAMP;
      wrap_t = TCM_CLAMP;
      wrap_r = TCM_CLAMP;
      break;
   case SAMP_CUBE_CUBE:
      wrap_s = TCM_CUBE;
      wrap_t = TCM_CUBE;
      wrap_r = TCM_CUBE;
      break;
   case SAMP_T_WRAP:
      wrap_t = TCM_WRAP;
      break;
   default:
      break;
   }

   _crocus_pack_state(batch, GENX(SAMPLER_STATE), map, samp) {
      samp.TCXAddressControlMode = wrap_s;
      samp.TCYAddressControlMode = wrap_t;
      samp.TCZAddressControlMode = wrap_r;

#if GFX_VER >= 6
      samp.NonnormalizedCoordinateEnable = state->unnormalized_coords;
#endif
      samp.MinModeFilter = state->min_img_filter;
      samp.MagModeFilter = cso->mag_img_filter;
      samp.MipModeFilter = translate_mip_filter(state->min_mip_filter);
      samp.MaximumAnisotropy = RATIO21;

      /* Anisotropic filtering: an N:1 ratio is encoded as (N - 2) / 2,
       * clamped to RATIO161 (16:1).
       */
      if (state->max_anisotropy >= 2) {
         if (state->min_img_filter == PIPE_TEX_FILTER_LINEAR) {
            samp.MinModeFilter = MAPFILTER_ANISOTROPIC;
#if GFX_VER >= 7
            samp.AnisotropicAlgorithm = EWAApproximation;
#endif
         }

         if (state->mag_img_filter == PIPE_TEX_FILTER_LINEAR)
            samp.MagModeFilter = MAPFILTER_ANISOTROPIC;

         samp.MaximumAnisotropy =
            MIN2((state->max_anisotropy - 2) / 2, RATIO161);
      }

      /* Set address rounding bits if not using nearest filtering. */
      if (state->min_img_filter != PIPE_TEX_FILTER_NEAREST) {
         samp.UAddressMinFilterRoundingEnable = true;
         samp.VAddressMinFilterRoundingEnable = true;
         samp.RAddressMinFilterRoundingEnable = true;
      }

      if (state->mag_img_filter != PIPE_TEX_FILTER_NEAREST) {
         samp.UAddressMagFilterRoundingEnable = true;
         samp.VAddressMagFilterRoundingEnable = true;
         samp.RAddressMagFilterRoundingEnable = true;
      }

      if (state->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE)
         samp.ShadowFunction = translate_shadow_func(state->compare_func);

      /* Maximum representable LOD differs by generation. */
      const float hw_max_lod = GFX_VER >= 7 ? 14 : 13;

#if GFX_VER == 8
      samp.LODPreClampMode = CLAMP_MODE_OGL;
#else
      samp.LODPreClampEnable = true;
#endif
      samp.MinLOD = CLAMP(cso->min_lod, 0, hw_max_lod);
      samp.MaxLOD = CLAMP(state->max_lod, 0, hw_max_lod);
      samp.TextureLODBias = CLAMP(state->lod_bias, -16, 15);

#if GFX_VER == 6
      samp.BaseMipLevel = CLAMP(first_level, 0, hw_max_lod);
      samp.MinandMagStateNotEqual = samp.MinModeFilter != samp.MagModeFilter;
#endif

      /* Gen4/5 need a relocation to the border color in the state buffer;
       * Gen6+ take a plain offset.
       */
#if GFX_VER < 6
      samp.BorderColorPointer =
         ro_bo(batch->state.bo, border_color_offset);
#else
      samp.BorderColorPointer = border_color_offset;
#endif
   }
}
2439 
/**
 * Upload a SAMPLER_BORDER_COLOR_STATE for one sampler/texture pair into
 * batch state memory, returning its offset in *bc_offset.
 *
 * The border color may be swizzled to match format-faking (A/LA formats),
 * and the packed layout varies substantially by generation (see the
 * GFX_VER branches below).
 */
static void
crocus_upload_border_color(struct crocus_batch *batch,
                           struct crocus_sampler_state *cso,
                           struct crocus_sampler_view *tex,
                           uint32_t *bc_offset)
{
   /* We may need to swizzle the border color for format faking.
    * A/LA formats are faked as R/RG with 000R or R00G swizzles.
    * This means we need to move the border color's A channel into
    * the R or G channels so that those read swizzles will move it
    * back into A.
    */
   enum pipe_format internal_format = PIPE_FORMAT_NONE;
   union pipe_color_union *color = &cso->border_color;
   union pipe_color_union tmp;
   if (tex) {
      internal_format = tex->res->internal_format;

      if (util_format_is_alpha(internal_format)) {
         /* A -> faked as R with 000R read swizzle: put A in the R slot. */
         unsigned char swz[4] = {
            PIPE_SWIZZLE_0, PIPE_SWIZZLE_0,
            PIPE_SWIZZLE_0, PIPE_SWIZZLE_W,
         };
         util_format_apply_color_swizzle(&tmp, color, swz, true);
         color = &tmp;
      } else if (util_format_is_luminance_alpha(internal_format) &&
                 internal_format != PIPE_FORMAT_L8A8_SRGB) {
         /* LA -> faked as RG with R00G read swizzle: broadcast L, keep A. */
         unsigned char swz[4] = {
            PIPE_SWIZZLE_X, PIPE_SWIZZLE_X,
            PIPE_SWIZZLE_X, PIPE_SWIZZLE_W
         };
         util_format_apply_color_swizzle(&tmp, color, swz, true);
         color = &tmp;
      }
   }
   bool is_integer_format = util_format_is_pure_integer(internal_format);
   unsigned sbc_size = GENX(SAMPLER_BORDER_COLOR_STATE_length) * 4;
   /* Alignment: 64B on Gen8, 512B for Haswell integer formats, else 32B. */
   const int sbc_align = (GFX_VER == 8 ? 64 : ((GFX_VERx10 == 75 && is_integer_format) ? 512 : 32));
   uint32_t *sbc = stream_state(batch, sbc_size, sbc_align, bc_offset);

   struct GENX(SAMPLER_BORDER_COLOR_STATE) state = { 0 };

#define ASSIGN(dst, src)                        \
   do {                                         \
      dst = src;                                \
   } while (0)

#define ASSIGNu16(dst, src)                     \
   do {                                         \
      dst = (uint16_t)src;                      \
   } while (0)

#define ASSIGNu8(dst, src)                      \
   do {                                         \
      dst = (uint8_t)src;                       \
   } while (0)

/* Expand "macro" over the R/G/B/A fields of a given border color type. */
#define BORDER_COLOR_ATTR(macro, _color_type, src)              \
   macro(state.BorderColor ## _color_type ## Red, src[0]);      \
   macro(state.BorderColor ## _color_type ## Green, src[1]);    \
   macro(state.BorderColor ## _color_type ## Blue, src[2]);     \
   macro(state.BorderColor ## _color_type ## Alpha, src[3]);

#if GFX_VER >= 8
   /* On Broadwell, the border color is represented as four 32-bit floats,
    * integers, or unsigned values, interpreted according to the surface
    * format.  This matches the sampler->BorderColor union exactly; just
    * memcpy the values.
    */
   BORDER_COLOR_ATTR(ASSIGN, 32bit, color->ui);
#elif GFX_VERx10 == 75
   if (is_integer_format) {
      const struct util_format_description *format_desc =
         util_format_description(internal_format);

      /* From the Haswell PRM, "Command Reference: Structures", Page 36:
       * "If any color channel is missing from the surface format,
       *  corresponding border color should be programmed as zero and if
       *  alpha channel is missing, corresponding Alpha border color should
       *  be programmed as 1."
       */
      unsigned c[4] = { 0, 0, 0, 1 };
      for (int i = 0; i < 4; i++) {
         if (format_desc->channel[i].size)
            c[i] = color->ui[i];
      }

      switch (format_desc->channel[0].size) {
      case 8:
         /* Copy RGBA in order. */
         BORDER_COLOR_ATTR(ASSIGNu8, 8bit, c);
         break;
      case 10:
         /* R10G10B10A2_UINT is treated like a 16-bit format. */
      case 16:
         BORDER_COLOR_ATTR(ASSIGNu16, 16bit, c);
         break;
      case 32:
         if (format_desc->channel[1].size && !format_desc->channel[2].size) {
            /* Careful inspection of the tables reveals that for RG32 formats,
             * the green channel needs to go where blue normally belongs.
             */
            state.BorderColor32bitRed = c[0];
            state.BorderColor32bitBlue = c[1];
            state.BorderColor32bitAlpha = 1;
         } else {
            /* Copy RGBA in order. */
            BORDER_COLOR_ATTR(ASSIGN, 32bit, c);
         }
         break;
      default:
         assert(!"Invalid number of bits per channel in integer format.");
         break;
      }
   } else {
      BORDER_COLOR_ATTR(ASSIGN, Float, color->f);
   }
#elif GFX_VER == 5 || GFX_VER == 6
   /* Gen5/6 store the border color pre-converted in every format the
    * sampler might need: unorm8, unorm16, snorm16, half-float, and float.
    */
   BORDER_COLOR_ATTR(UNCLAMPED_FLOAT_TO_UBYTE, Unorm, color->f);
   BORDER_COLOR_ATTR(UNCLAMPED_FLOAT_TO_USHORT, Unorm16, color->f);
   BORDER_COLOR_ATTR(UNCLAMPED_FLOAT_TO_SHORT, Snorm16, color->f);

#define MESA_FLOAT_TO_HALF(dst, src)            \
   dst = _mesa_float_to_half(src);

   BORDER_COLOR_ATTR(MESA_FLOAT_TO_HALF, Float16, color->f);

#undef MESA_FLOAT_TO_HALF

   /* snorm8 is derived from the snorm16 values by dropping the low byte. */
   state.BorderColorSnorm8Red   = state.BorderColorSnorm16Red >> 8;
   state.BorderColorSnorm8Green = state.BorderColorSnorm16Green >> 8;
   state.BorderColorSnorm8Blue  = state.BorderColorSnorm16Blue >> 8;
   state.BorderColorSnorm8Alpha = state.BorderColorSnorm16Alpha >> 8;

   BORDER_COLOR_ATTR(ASSIGN, Float, color->f);

#elif GFX_VER == 4
   BORDER_COLOR_ATTR(ASSIGN, , color->f);
#else
   BORDER_COLOR_ATTR(ASSIGN, Float, color->f);
#endif

#undef ASSIGN
#undef BORDER_COLOR_ATTR

   GENX(SAMPLER_BORDER_COLOR_STATE_pack)(batch, sbc, &state);
}
2587 
2588 /**
2589  * Upload the sampler states into a contiguous area of GPU memory, for
2590  * for 3DSTATE_SAMPLER_STATE_POINTERS_*.
2591  *
2592  * Also fill out the border color state pointers.
2593  */
2594 static void
crocus_upload_sampler_states(struct crocus_context * ice,struct crocus_batch * batch,gl_shader_stage stage)2595 crocus_upload_sampler_states(struct crocus_context *ice,
2596                              struct crocus_batch *batch, gl_shader_stage stage)
2597 {
2598    struct crocus_shader_state *shs = &ice->state.shaders[stage];
2599    const struct shader_info *info = crocus_get_shader_info(ice, stage);
2600 
2601    /* We assume the state tracker will call pipe->bind_sampler_states()
2602     * if the program's number of textures changes.
2603     */
2604    unsigned count = info ? BITSET_LAST_BIT(info->textures_used) : 0;
2605 
2606    if (!count)
2607       return;
2608 
2609    /* Assemble the SAMPLER_STATEs into a contiguous table that lives
2610     * in the dynamic state memory zone, so we can point to it via the
2611     * 3DSTATE_SAMPLER_STATE_POINTERS_* commands.
2612     */
2613    unsigned size = count * 4 * GENX(SAMPLER_STATE_length);
2614    uint32_t *map = stream_state(batch, size, 32, &shs->sampler_offset);
2615 
2616    if (unlikely(!map))
2617       return;
2618 
2619    for (int i = 0; i < count; i++) {
2620       struct crocus_sampler_state *state = shs->samplers[i];
2621       struct crocus_sampler_view *tex = shs->textures[i];
2622 
2623       if (!state || !tex) {
2624          memset(map, 0, 4 * GENX(SAMPLER_STATE_length));
2625       } else {
2626          unsigned border_color_offset = 0;
2627          if (state->needs_border_color) {
2628             crocus_upload_border_color(batch, state, tex, &border_color_offset);
2629          }
2630 
2631          enum samp_workaround wa = SAMP_NORMAL;
2632          /* There's a bug in 1D texture sampling - it actually pays
2633           * attention to the wrap_t value, though it should not.
2634           * Override the wrap_t value here to GL_REPEAT to keep
2635           * any nonexistent border pixels from floating in.
2636           */
2637          if (tex->base.target == PIPE_TEXTURE_1D)
2638             wa = SAMP_T_WRAP;
2639          else if (tex->base.target == PIPE_TEXTURE_CUBE ||
2640                   tex->base.target == PIPE_TEXTURE_CUBE_ARRAY) {
2641             /* Cube maps must use the same wrap mode for all three coordinate
2642              * dimensions.  Prior to Haswell, only CUBE and CLAMP are valid.
2643              *
2644              * Ivybridge and Baytrail seem to have problems with CUBE mode and
2645              * integer formats.  Fall back to CLAMP for now.
2646              */
2647             if (state->pstate.seamless_cube_map &&
2648                 !(GFX_VERx10 == 70 && util_format_is_pure_integer(tex->base.format)))
2649                wa = SAMP_CUBE_CUBE;
2650             else
2651                wa = SAMP_CUBE_CLAMP;
2652          }
2653 
2654          uint32_t first_level = 0;
2655          if (tex->base.target != PIPE_BUFFER)
2656             first_level = tex->base.u.tex.first_level;
2657 
2658          crocus_upload_sampler_state(batch, state, border_color_offset, wa, first_level, map);
2659       }
2660 
2661       map += GENX(SAMPLER_STATE_length);
2662    }
2663 }
2664 
2665 /**
2666  * The pipe->create_sampler_view() driver hook.
2667  */
2668 static struct pipe_sampler_view *
crocus_create_sampler_view(struct pipe_context * ctx,struct pipe_resource * tex,const struct pipe_sampler_view * tmpl)2669 crocus_create_sampler_view(struct pipe_context *ctx,
2670                            struct pipe_resource *tex,
2671                            const struct pipe_sampler_view *tmpl)
2672 {
2673    struct crocus_screen *screen = (struct crocus_screen *)ctx->screen;
2674    const struct intel_device_info *devinfo = &screen->devinfo;
2675    struct crocus_sampler_view *isv = calloc(1, sizeof(struct crocus_sampler_view));
2676 
2677    if (!isv)
2678       return NULL;
2679 
2680    /* initialize base object */
2681    isv->base = *tmpl;
2682    isv->base.context = ctx;
2683    isv->base.texture = NULL;
2684    pipe_reference_init(&isv->base.reference, 1);
2685    pipe_resource_reference(&isv->base.texture, tex);
2686 
2687    if (util_format_is_depth_or_stencil(tmpl->format)) {
2688       struct crocus_resource *zres, *sres;
2689       const struct util_format_description *desc =
2690          util_format_description(tmpl->format);
2691 
2692       crocus_get_depth_stencil_resources(devinfo, tex, &zres, &sres);
2693 
2694       tex = util_format_has_depth(desc) ? &zres->base.b : &sres->base.b;
2695 
2696       if (tex->format == PIPE_FORMAT_S8_UINT)
2697          if (GFX_VER == 7 && sres->shadow)
2698             tex = &sres->shadow->base.b;
2699    }
2700 
2701    isv->res = (struct crocus_resource *) tex;
2702 
2703    isl_surf_usage_flags_t usage = ISL_SURF_USAGE_TEXTURE_BIT;
2704 
2705    if (isv->base.target == PIPE_TEXTURE_CUBE ||
2706        isv->base.target == PIPE_TEXTURE_CUBE_ARRAY)
2707       usage |= ISL_SURF_USAGE_CUBE_BIT;
2708 
2709    const struct crocus_format_info fmt =
2710       crocus_format_for_usage(devinfo, tmpl->format, usage);
2711 
2712    enum pipe_swizzle vswz[4] = { tmpl->swizzle_r, tmpl->swizzle_g, tmpl->swizzle_b, tmpl->swizzle_a };
2713    crocus_combine_swizzle(isv->swizzle, fmt.swizzles, vswz);
2714 
2715    /* hardcode stencil swizzles - hw returns 0G01, we want GGGG */
2716    if (GFX_VER < 6 &&
2717        (tmpl->format == PIPE_FORMAT_X32_S8X24_UINT ||
2718         tmpl->format == PIPE_FORMAT_X24S8_UINT)) {
2719       isv->swizzle[0] = tmpl->swizzle_g;
2720       isv->swizzle[1] = tmpl->swizzle_g;
2721       isv->swizzle[2] = tmpl->swizzle_g;
2722       isv->swizzle[3] = tmpl->swizzle_g;
2723    }
2724 
2725    isv->clear_color = isv->res->aux.clear_color;
2726 
2727    isv->view = (struct isl_view) {
2728       .format = fmt.fmt,
2729 #if GFX_VERx10 >= 75
2730       .swizzle = (struct isl_swizzle) {
2731          .r = pipe_to_isl_swizzle(isv->swizzle[0], false),
2732          .g = pipe_to_isl_swizzle(isv->swizzle[1], false),
2733          .b = pipe_to_isl_swizzle(isv->swizzle[2], false),
2734          .a = pipe_to_isl_swizzle(isv->swizzle[3], false),
2735       },
2736 #else
2737       /* swizzling handled in shader code */
2738       .swizzle = ISL_SWIZZLE_IDENTITY,
2739 #endif
2740       .usage = usage,
2741    };
2742 
2743    /* Fill out SURFACE_STATE for this view. */
2744    if (tmpl->target != PIPE_BUFFER) {
2745       isv->view.base_level = tmpl->u.tex.first_level;
2746       isv->view.levels = tmpl->u.tex.last_level - tmpl->u.tex.first_level + 1;
2747 
2748       /* Hardware older than skylake ignores this value */
2749       assert(tex->target != PIPE_TEXTURE_3D || !tmpl->u.tex.first_layer);
2750 
2751       // XXX: do I need to port f9fd0cf4790cb2a530e75d1a2206dbb9d8af7cb2?
2752       isv->view.base_array_layer = tmpl->u.tex.first_layer;
2753       isv->view.array_len =
2754          tmpl->u.tex.last_layer - tmpl->u.tex.first_layer + 1;
2755    }
2756 #if GFX_VER >= 6
2757    /* just create a second view struct for texture gather just in case */
2758    isv->gather_view = isv->view;
2759 
2760 #if GFX_VER == 7
2761    if (fmt.fmt == ISL_FORMAT_R32G32_FLOAT ||
2762        fmt.fmt == ISL_FORMAT_R32G32_SINT ||
2763        fmt.fmt == ISL_FORMAT_R32G32_UINT) {
2764       isv->gather_view.format = ISL_FORMAT_R32G32_FLOAT_LD;
2765 #if GFX_VERx10 >= 75
2766       isv->gather_view.swizzle = (struct isl_swizzle) {
2767          .r = pipe_to_isl_swizzle(isv->swizzle[0], GFX_VERx10 == 75),
2768          .g = pipe_to_isl_swizzle(isv->swizzle[1], GFX_VERx10 == 75),
2769          .b = pipe_to_isl_swizzle(isv->swizzle[2], GFX_VERx10 == 75),
2770          .a = pipe_to_isl_swizzle(isv->swizzle[3], GFX_VERx10 == 75),
2771       };
2772 #endif
2773    }
2774 #endif
2775 #if GFX_VER == 6
2776    /* Sandybridge's gather4 message is broken for integer formats.
2777     * To work around this, we pretend the surface is UNORM for
2778     * 8 or 16-bit formats, and emit shader instructions to recover
2779     * the real INT/UINT value.  For 32-bit formats, we pretend
2780     * the surface is FLOAT, and simply reinterpret the resulting
2781     * bits.
2782     */
2783    switch (fmt.fmt) {
2784    case ISL_FORMAT_R8_SINT:
2785    case ISL_FORMAT_R8_UINT:
2786       isv->gather_view.format = ISL_FORMAT_R8_UNORM;
2787       break;
2788 
2789    case ISL_FORMAT_R16_SINT:
2790    case ISL_FORMAT_R16_UINT:
2791       isv->gather_view.format = ISL_FORMAT_R16_UNORM;
2792       break;
2793 
2794    case ISL_FORMAT_R32_SINT:
2795    case ISL_FORMAT_R32_UINT:
2796       isv->gather_view.format = ISL_FORMAT_R32_FLOAT;
2797       break;
2798 
2799    default:
2800       break;
2801    }
2802 #endif
2803 #endif
2804 
2805    return &isv->base;
2806 }
2807 
2808 static void
crocus_sampler_view_destroy(struct pipe_context * ctx,struct pipe_sampler_view * state)2809 crocus_sampler_view_destroy(struct pipe_context *ctx,
2810                             struct pipe_sampler_view *state)
2811 {
2812    struct crocus_sampler_view *isv = (void *) state;
2813    pipe_resource_reference(&state->texture, NULL);
2814    free(isv);
2815 }
2816 
2817 /**
2818  * The pipe->create_surface() driver hook.
2819  *
2820  * In Gallium nomenclature, "surfaces" are a view of a resource that
2821  * can be bound as a render target or depth/stencil buffer.
2822  */
2823 static struct pipe_surface *
crocus_create_surface(struct pipe_context * ctx,struct pipe_resource * tex,const struct pipe_surface * tmpl)2824 crocus_create_surface(struct pipe_context *ctx,
2825                       struct pipe_resource *tex,
2826                       const struct pipe_surface *tmpl)
2827 {
2828    struct crocus_screen *screen = (struct crocus_screen *)ctx->screen;
2829    const struct intel_device_info *devinfo = &screen->devinfo;
2830 
2831    isl_surf_usage_flags_t usage = 0;
2832    if (tmpl->writable)
2833       usage = ISL_SURF_USAGE_STORAGE_BIT;
2834    else if (util_format_is_depth_or_stencil(tmpl->format))
2835       usage = ISL_SURF_USAGE_DEPTH_BIT;
2836    else
2837       usage = ISL_SURF_USAGE_RENDER_TARGET_BIT;
2838 
2839    const struct crocus_format_info fmt =
2840       crocus_format_for_usage(devinfo, tmpl->format, usage);
2841 
2842    if ((usage & ISL_SURF_USAGE_RENDER_TARGET_BIT) &&
2843        !isl_format_supports_rendering(devinfo, fmt.fmt)) {
2844       /* Framebuffer validation will reject this invalid case, but it
2845        * hasn't had the opportunity yet.  In the meantime, we need to
2846        * avoid hitting ISL asserts about unsupported formats below.
2847        */
2848       return NULL;
2849    }
2850 
2851    struct crocus_surface *surf = calloc(1, sizeof(struct crocus_surface));
2852    struct pipe_surface *psurf = &surf->base;
2853    struct crocus_resource *res = (struct crocus_resource *) tex;
2854 
2855    if (!surf)
2856       return NULL;
2857 
2858    pipe_reference_init(&psurf->reference, 1);
2859    pipe_resource_reference(&psurf->texture, tex);
2860    psurf->context = ctx;
2861    psurf->format = tmpl->format;
2862    psurf->width = tex->width0;
2863    psurf->height = tex->height0;
2864    psurf->u.tex.first_layer = tmpl->u.tex.first_layer;
2865    psurf->u.tex.last_layer = tmpl->u.tex.last_layer;
2866    psurf->u.tex.level = tmpl->u.tex.level;
2867 
2868    uint32_t array_len = tmpl->u.tex.last_layer - tmpl->u.tex.first_layer + 1;
2869 
2870    struct isl_view *view = &surf->view;
2871    *view = (struct isl_view) {
2872       .format = fmt.fmt,
2873       .base_level = tmpl->u.tex.level,
2874       .levels = 1,
2875       .base_array_layer = tmpl->u.tex.first_layer,
2876       .array_len = array_len,
2877       .swizzle = ISL_SWIZZLE_IDENTITY,
2878       .usage = usage,
2879    };
2880 
2881 #if GFX_VER >= 6
2882    struct isl_view *read_view = &surf->read_view;
2883    *read_view = (struct isl_view) {
2884       .format = fmt.fmt,
2885       .base_level = tmpl->u.tex.level,
2886       .levels = 1,
2887       .base_array_layer = tmpl->u.tex.first_layer,
2888       .array_len = array_len,
2889       .swizzle = ISL_SWIZZLE_IDENTITY,
2890       .usage = ISL_SURF_USAGE_TEXTURE_BIT,
2891    };
2892 #endif
2893 
2894    surf->clear_color = res->aux.clear_color;
2895 
2896    /* Bail early for depth/stencil - we don't want SURFACE_STATE for them. */
2897    if (res->surf.usage & (ISL_SURF_USAGE_DEPTH_BIT |
2898                           ISL_SURF_USAGE_STENCIL_BIT))
2899       return psurf;
2900 
2901    if (!isl_format_is_compressed(res->surf.format)) {
2902       memcpy(&surf->surf, &res->surf, sizeof(surf->surf));
2903       uint64_t temp_offset;
2904       uint32_t temp_x, temp_y;
2905 
2906       isl_surf_get_image_offset_B_tile_sa(&res->surf, tmpl->u.tex.level,
2907                                           res->base.b.target == PIPE_TEXTURE_3D ? 0 : tmpl->u.tex.first_layer,
2908                                           res->base.b.target == PIPE_TEXTURE_3D ? tmpl->u.tex.first_layer : 0,
2909                                           &temp_offset, &temp_x, &temp_y);
2910       if (!devinfo->has_surface_tile_offset &&
2911           (temp_x || temp_y)) {
2912          /* Original gfx4 hardware couldn't draw to a non-tile-aligned
2913           * destination.
2914           */
2915          /* move to temp */
2916          struct pipe_resource wa_templ = (struct pipe_resource) {
2917             .width0 = u_minify(res->base.b.width0, tmpl->u.tex.level),
2918             .height0 = u_minify(res->base.b.height0, tmpl->u.tex.level),
2919             .depth0 = 1,
2920             .array_size = 1,
2921             .format = res->base.b.format,
2922             .target = PIPE_TEXTURE_2D,
2923             .bind = (usage & ISL_SURF_USAGE_DEPTH_BIT ? PIPE_BIND_DEPTH_STENCIL : PIPE_BIND_RENDER_TARGET) | PIPE_BIND_SAMPLER_VIEW,
2924          };
2925          surf->align_res = screen->base.resource_create(&screen->base, &wa_templ);
2926          view->base_level = 0;
2927          view->base_array_layer = 0;
2928          view->array_len = 1;
2929          struct crocus_resource *align_res = (struct crocus_resource *)surf->align_res;
2930          memcpy(&surf->surf, &align_res->surf, sizeof(surf->surf));
2931       }
2932       return psurf;
2933    }
2934 
2935    /* The resource has a compressed format, which is not renderable, but we
2936     * have a renderable view format.  We must be attempting to upload blocks
2937     * of compressed data via an uncompressed view.
2938     *
2939     * In this case, we can assume there are no auxiliary buffers, a single
2940     * miplevel, and that the resource is single-sampled.  Gallium may try
2941     * and create an uncompressed view with multiple layers, however.
2942     */
2943    assert(!isl_format_is_compressed(fmt.fmt));
2944    assert(res->surf.samples == 1);
2945    assert(view->levels == 1);
2946 
2947    /* TODO: compressed pbo uploads aren't working here */
2948    pipe_surface_reference(&psurf, NULL);
2949    return NULL;
2950 
2951    uint64_t offset_B = 0;
2952    uint32_t tile_x_sa = 0, tile_y_sa = 0;
2953 
2954    if (view->base_level > 0) {
2955       /* We can't rely on the hardware's miplevel selection with such
2956        * a substantial lie about the format, so we select a single image
2957        * using the Tile X/Y Offset fields.  In this case, we can't handle
2958        * multiple array slices.
2959        *
2960        * On Broadwell, HALIGN and VALIGN are specified in pixels and are
2961        * hard-coded to align to exactly the block size of the compressed
2962        * texture.  This means that, when reinterpreted as a non-compressed
2963        * texture, the tile offsets may be anything and we can't rely on
2964        * X/Y Offset.
2965        *
2966        * Return NULL to force the state tracker to take fallback paths.
2967        */
2968       // TODO: check if the gen7 check is right, originally gen8
2969       if (view->array_len > 1 || GFX_VER == 7) {
2970          pipe_surface_reference(&psurf, NULL);
2971          return NULL;
2972       }
2973 
2974       const bool is_3d = res->surf.dim == ISL_SURF_DIM_3D;
2975       isl_surf_get_image_surf(&screen->isl_dev, &res->surf,
2976                               view->base_level,
2977                               is_3d ? 0 : view->base_array_layer,
2978                               is_3d ? view->base_array_layer : 0,
2979                               &surf->surf,
2980                               &offset_B, &tile_x_sa, &tile_y_sa);
2981 
2982       /* We use address and tile offsets to access a single level/layer
2983        * as a subimage, so reset level/layer so it doesn't offset again.
2984        */
2985       view->base_array_layer = 0;
2986       view->base_level = 0;
2987    } else {
2988       /* Level 0 doesn't require tile offsets, and the hardware can find
2989        * array slices using QPitch even with the format override, so we
2990        * can allow layers in this case.  Copy the original ISL surface.
2991        */
2992       memcpy(&surf->surf, &res->surf, sizeof(surf->surf));
2993    }
2994 
2995    /* Scale down the image dimensions by the block size. */
2996    const struct isl_format_layout *fmtl =
2997       isl_format_get_layout(res->surf.format);
2998    surf->surf.format = fmt.fmt;
2999    surf->surf.logical_level0_px = isl_surf_get_logical_level0_el(&surf->surf);
3000    surf->surf.phys_level0_sa = isl_surf_get_phys_level0_el(&surf->surf);
3001    tile_x_sa /= fmtl->bw;
3002    tile_y_sa /= fmtl->bh;
3003 
3004    psurf->width = surf->surf.logical_level0_px.width;
3005    psurf->height = surf->surf.logical_level0_px.height;
3006 
3007    return psurf;
3008 }
3009 
3010 #if GFX_VER >= 7
3011 static void
fill_default_image_param(struct isl_image_param * param)3012 fill_default_image_param(struct isl_image_param *param)
3013 {
3014    memset(param, 0, sizeof(*param));
3015    /* Set the swizzling shifts to all-ones to effectively disable swizzling --
3016     * See emit_address_calculation() in elk_fs_surface_builder.cpp for a more
3017     * detailed explanation of these parameters.
3018     */
3019    param->swizzling[0] = 0xff;
3020    param->swizzling[1] = 0xff;
3021 }
3022 
3023 static void
fill_buffer_image_param(struct isl_image_param * param,enum pipe_format pfmt,unsigned size)3024 fill_buffer_image_param(struct isl_image_param *param,
3025                         enum pipe_format pfmt,
3026                         unsigned size)
3027 {
3028    const unsigned cpp = util_format_get_blocksize(pfmt);
3029 
3030    fill_default_image_param(param);
3031    param->size[0] = size / cpp;
3032    param->stride[0] = cpp;
3033 }
3034 
3035 #endif
3036 
3037 /**
3038  * The pipe->set_shader_images() driver hook.
3039  */
static void
crocus_set_shader_images(struct pipe_context *ctx,
                         enum pipe_shader_type p_stage,
                         unsigned start_slot, unsigned count,
                         unsigned unbind_num_trailing_slots,
                         const struct pipe_image_view *p_images)
{
#if GFX_VER >= 7
   struct crocus_context *ice = (struct crocus_context *) ctx;
   struct crocus_screen *screen = (struct crocus_screen *)ctx->screen;
   const struct intel_device_info *devinfo = &screen->devinfo;
   gl_shader_stage stage = stage_from_pipe(p_stage);
   struct crocus_shader_state *shs = &ice->state.shaders[stage];
   struct crocus_genx_state *genx = ice->state.genx;
   struct isl_image_param *image_params = genx->shaders[stage].image_param;

   /* Clear the "bound" bits for every slot we are about to rewrite. */
   shs->bound_image_views &= ~u_bit_consecutive(start_slot, count);

   /* NOTE(review): unbind_num_trailing_slots is not processed here, so
    * trailing slots keep their previous contents -- confirm intended.
    */
   for (unsigned i = 0; i < count; i++) {
      struct crocus_image_view *iv = &shs->image[start_slot + i];

      if (p_images && p_images[i].resource) {
         const struct pipe_image_view *img = &p_images[i];
         struct crocus_resource *res = (void *) img->resource;

         util_copy_image_view(&iv->base, img);

         shs->bound_image_views |= 1 << (start_slot + i);

         /* Track usage so resource writes know which stages to flush. */
         res->bind_history |= PIPE_BIND_SHADER_IMAGE;
         res->bind_stages |= 1 << stage;

         isl_surf_usage_flags_t usage = ISL_SURF_USAGE_STORAGE_BIT;
         struct crocus_format_info fmt =
            crocus_format_for_usage(devinfo, img->format, usage);

         struct isl_swizzle swiz = pipe_to_isl_swizzles(fmt.swizzles);
         if (img->shader_access & PIPE_IMAGE_ACCESS_READ) {
            /* On Gen8, try to use typed surfaces reads (which support a
             * limited number of formats), and if not possible, fall back
             * to untyped reads.
             */
            if (!isl_has_matching_typed_storage_image_format(devinfo, fmt.fmt))
               fmt.fmt = ISL_FORMAT_RAW;
            else
               fmt.fmt = isl_lower_storage_image_format(devinfo, fmt.fmt);
         }

         if (res->base.b.target != PIPE_BUFFER) {
            /* Texture image: view the selected level and layer range,
             * and derive shader image params from the underlying surf.
             */
            struct isl_view view = {
               .format = fmt.fmt,
               .base_level = img->u.tex.level,
               .levels = 1,
               .base_array_layer = img->u.tex.first_layer,
               .array_len = img->u.tex.last_layer - img->u.tex.first_layer + 1,
               .swizzle = swiz,
               .usage = usage,
            };

            iv->view = view;

            isl_surf_fill_image_param(&screen->isl_dev,
                                      &image_params[start_slot + i],
                                      &res->surf, &view);
         } else {
            /* Buffer image: no levels/layers; mark the bound byte range
             * as potentially written and fill 1-D params from the size.
             */
            struct isl_view view = {
               .format = fmt.fmt,
               .swizzle = swiz,
               .usage = usage,
            };
            iv->view = view;

            util_range_add(&res->base.b, &res->valid_buffer_range, img->u.buf.offset,
                           img->u.buf.offset + img->u.buf.size);
            fill_buffer_image_param(&image_params[start_slot + i],
                                    img->format, img->u.buf.size);
         }
      } else {
         /* Unbind: drop the old resource reference and reset params. */
         pipe_resource_reference(&iv->base.resource, NULL);
         fill_default_image_param(&image_params[start_slot + i]);
      }
   }

   ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_VS << stage;
   ice->state.dirty |=
      stage == MESA_SHADER_COMPUTE ? CROCUS_DIRTY_COMPUTE_RESOLVES_AND_FLUSHES
                                   : CROCUS_DIRTY_RENDER_RESOLVES_AND_FLUSHES;

   /* Broadwell also needs isl_image_params re-uploaded */
   ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_VS << stage;
   shs->sysvals_need_upload = true;
#endif
}
3133 
3134 
3135 /**
3136  * The pipe->set_sampler_views() driver hook.
3137  */
static void
crocus_set_sampler_views(struct pipe_context *ctx,
                         enum pipe_shader_type p_stage,
                         unsigned start, unsigned count,
                         unsigned unbind_num_trailing_slots,
                         bool take_ownership,
                         struct pipe_sampler_view **views)
{
   struct crocus_context *ice = (struct crocus_context *) ctx;
   gl_shader_stage stage = stage_from_pipe(p_stage);
   struct crocus_shader_state *shs = &ice->state.shaders[stage];

   /* Clear the "bound" bits for every slot being rewritten. */
   shs->bound_sampler_views &= ~u_bit_consecutive(start, count);

   for (unsigned i = 0; i < count; i++) {
      struct pipe_sampler_view *pview = views ? views[i] : NULL;

      if (take_ownership) {
         /* We inherit the caller's reference: drop our old one, then
          * adopt theirs without bumping the refcount.
          */
         pipe_sampler_view_reference((struct pipe_sampler_view **)
                                     &shs->textures[start + i], NULL);
         shs->textures[start + i] = (struct crocus_sampler_view *)pview;
      } else {
         pipe_sampler_view_reference((struct pipe_sampler_view **)
                                     &shs->textures[start + i], pview);
      }

      struct crocus_sampler_view *view = (void *) pview;
      if (view) {
         /* Track usage so resource writes know which stages to flush. */
         view->res->bind_history |= PIPE_BIND_SAMPLER_VIEW;
         view->res->bind_stages |= 1 << stage;

         shs->bound_sampler_views |= 1 << (start + i);
      }
   }
#if GFX_VER == 6
   /* first level parameters to crocus_upload_sampler_state is gfx6 only */
   ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_SAMPLER_STATES_VS << stage;
#endif
   ice->state.stage_dirty |= (CROCUS_STAGE_DIRTY_BINDINGS_VS << stage);
   ice->state.dirty |=
      stage == MESA_SHADER_COMPUTE ? CROCUS_DIRTY_COMPUTE_RESOLVES_AND_FLUSHES
                                   : CROCUS_DIRTY_RENDER_RESOLVES_AND_FLUSHES;
   ice->state.stage_dirty |= ice->state.stage_dirty_for_nos[CROCUS_NOS_TEXTURES];
}
3182 
3183 /**
3184  * The pipe->set_tess_state() driver hook.
3185  */
3186 static void
crocus_set_tess_state(struct pipe_context * ctx,const float default_outer_level[4],const float default_inner_level[2])3187 crocus_set_tess_state(struct pipe_context *ctx,
3188                       const float default_outer_level[4],
3189                       const float default_inner_level[2])
3190 {
3191    struct crocus_context *ice = (struct crocus_context *) ctx;
3192    struct crocus_shader_state *shs = &ice->state.shaders[MESA_SHADER_TESS_CTRL];
3193 
3194    memcpy(&ice->state.default_outer_level[0], &default_outer_level[0], 4 * sizeof(float));
3195    memcpy(&ice->state.default_inner_level[0], &default_inner_level[0], 2 * sizeof(float));
3196 
3197    ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_TCS;
3198    shs->sysvals_need_upload = true;
3199 }
3200 
3201 static void
crocus_set_patch_vertices(struct pipe_context * ctx,uint8_t patch_vertices)3202 crocus_set_patch_vertices(struct pipe_context *ctx, uint8_t patch_vertices)
3203 {
3204    struct crocus_context *ice = (struct crocus_context *) ctx;
3205 
3206    ice->state.patch_vertices = patch_vertices;
3207 }
3208 
3209 static void
crocus_surface_destroy(struct pipe_context * ctx,struct pipe_surface * p_surf)3210 crocus_surface_destroy(struct pipe_context *ctx, struct pipe_surface *p_surf)
3211 {
3212    struct crocus_surface *surf = (void *) p_surf;
3213    pipe_resource_reference(&p_surf->texture, NULL);
3214 
3215    pipe_resource_reference(&surf->align_res, NULL);
3216    free(surf);
3217 }
3218 
3219 static void
crocus_set_clip_state(struct pipe_context * ctx,const struct pipe_clip_state * state)3220 crocus_set_clip_state(struct pipe_context *ctx,
3221                       const struct pipe_clip_state *state)
3222 {
3223    struct crocus_context *ice = (struct crocus_context *) ctx;
3224    struct crocus_shader_state *shs = &ice->state.shaders[MESA_SHADER_VERTEX];
3225    struct crocus_shader_state *gshs = &ice->state.shaders[MESA_SHADER_GEOMETRY];
3226    struct crocus_shader_state *tshs = &ice->state.shaders[MESA_SHADER_TESS_EVAL];
3227 
3228    memcpy(&ice->state.clip_planes, state, sizeof(*state));
3229 
3230 #if GFX_VER <= 5
3231    ice->state.dirty |= CROCUS_DIRTY_GEN4_CURBE;
3232 #endif
3233    ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_VS | CROCUS_STAGE_DIRTY_CONSTANTS_GS |
3234                              CROCUS_STAGE_DIRTY_CONSTANTS_TES;
3235    shs->sysvals_need_upload = true;
3236    gshs->sysvals_need_upload = true;
3237    tshs->sysvals_need_upload = true;
3238 }
3239 
3240 /**
3241  * The pipe->set_polygon_stipple() driver hook.
3242  */
3243 static void
crocus_set_polygon_stipple(struct pipe_context * ctx,const struct pipe_poly_stipple * state)3244 crocus_set_polygon_stipple(struct pipe_context *ctx,
3245                            const struct pipe_poly_stipple *state)
3246 {
3247    struct crocus_context *ice = (struct crocus_context *) ctx;
3248    memcpy(&ice->state.poly_stipple, state, sizeof(*state));
3249    ice->state.dirty |= CROCUS_DIRTY_POLYGON_STIPPLE;
3250 }
3251 
3252 /**
3253  * The pipe->set_sample_mask() driver hook.
3254  */
static void
crocus_set_sample_mask(struct pipe_context *ctx, unsigned sample_mask)
{
   struct crocus_context *ice = (struct crocus_context *) ctx;

   /* We only support 8x MSAA, so we have 8 bits of sample mask.
    * st/mesa may pass us 0xffffffff though, meaning "enable all samples".
    */
   ice->state.sample_mask = sample_mask & 0xff;
   ice->state.dirty |= CROCUS_DIRTY_GEN6_SAMPLE_MASK;
}
3266 
3267 static void
crocus_fill_scissor_rect(struct crocus_context * ice,int idx,struct pipe_scissor_state * ss)3268 crocus_fill_scissor_rect(struct crocus_context *ice,
3269                          int idx,
3270                          struct pipe_scissor_state *ss)
3271 {
3272    struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
3273    struct pipe_rasterizer_state *cso_state = &ice->state.cso_rast->cso;
3274    const struct pipe_viewport_state *vp = &ice->state.viewports[idx];
3275    struct pipe_scissor_state scissor = (struct pipe_scissor_state) {
3276       .minx = MAX2(-fabsf(vp->scale[0]) + vp->translate[0], 0),
3277       .maxx = MIN2( fabsf(vp->scale[0]) + vp->translate[0], cso_fb->width) - 1,
3278       .miny = MAX2(-fabsf(vp->scale[1]) + vp->translate[1], 0),
3279       .maxy = MIN2( fabsf(vp->scale[1]) + vp->translate[1], cso_fb->height) - 1,
3280    };
3281    if (cso_state->scissor) {
3282       struct pipe_scissor_state *s = &ice->state.scissors[idx];
3283       scissor.minx = MAX2(scissor.minx, s->minx);
3284       scissor.miny = MAX2(scissor.miny, s->miny);
3285       scissor.maxx = MIN2(scissor.maxx, s->maxx);
3286       scissor.maxy = MIN2(scissor.maxy, s->maxy);
3287    }
3288    *ss = scissor;
3289 }
3290 
3291 /**
3292  * The pipe->set_scissor_states() driver hook.
3293  *
3294  * This corresponds to our SCISSOR_RECT state structures.  It's an
3295  * exact match, so we just store them, and memcpy them out later.
3296  */
3297 static void
crocus_set_scissor_states(struct pipe_context * ctx,unsigned start_slot,unsigned num_scissors,const struct pipe_scissor_state * rects)3298 crocus_set_scissor_states(struct pipe_context *ctx,
3299                           unsigned start_slot,
3300                           unsigned num_scissors,
3301                           const struct pipe_scissor_state *rects)
3302 {
3303    struct crocus_context *ice = (struct crocus_context *) ctx;
3304 
3305    for (unsigned i = 0; i < num_scissors; i++) {
3306       if (rects[i].minx == rects[i].maxx || rects[i].miny == rects[i].maxy) {
3307          /* If the scissor was out of bounds and got clamped to 0 width/height
3308           * at the bounds, the subtraction of 1 from maximums could produce a
3309           * negative number and thus not clip anything.  Instead, just provide
3310           * a min > max scissor inside the bounds, which produces the expected
3311           * no rendering.
3312           */
3313          ice->state.scissors[start_slot + i] = (struct pipe_scissor_state) {
3314             .minx = 1, .maxx = 0, .miny = 1, .maxy = 0,
3315          };
3316       } else {
3317          ice->state.scissors[start_slot + i] = (struct pipe_scissor_state) {
3318             .minx = rects[i].minx,     .miny = rects[i].miny,
3319             .maxx = rects[i].maxx - 1, .maxy = rects[i].maxy - 1,
3320          };
3321       }
3322    }
3323 
3324 #if GFX_VER < 6
3325    ice->state.dirty |= CROCUS_DIRTY_RASTER; /* SF state */
3326 #else
3327    ice->state.dirty |= CROCUS_DIRTY_GEN6_SCISSOR_RECT;
3328 #endif
3329    ice->state.dirty |= CROCUS_DIRTY_SF_CL_VIEWPORT;
3330 
3331 }
3332 
3333 /**
3334  * The pipe->set_stencil_ref() driver hook.
3335  *
3336  * This is added to 3DSTATE_WM_DEPTH_STENCIL dynamically at draw time.
3337  */
3338 static void
crocus_set_stencil_ref(struct pipe_context * ctx,const struct pipe_stencil_ref ref)3339 crocus_set_stencil_ref(struct pipe_context *ctx,
3340                        const struct pipe_stencil_ref ref)
3341 {
3342    struct crocus_context *ice = (struct crocus_context *) ctx;
3343    ice->state.stencil_ref = ref;
3344    ice->state.dirty |= CROCUS_DIRTY_COLOR_CALC_STATE;
3345 }
3346 
3347 #if GFX_VER == 8
3348 static float
viewport_extent(const struct pipe_viewport_state * state,int axis,float sign)3349 viewport_extent(const struct pipe_viewport_state *state, int axis, float sign)
3350 {
3351    return copysignf(state->scale[axis], sign) + state->translate[axis];
3352 }
3353 #endif
3354 
3355 /**
3356  * The pipe->set_viewport_states() driver hook.
3357  *
3358  * This corresponds to our SF_CLIP_VIEWPORT states.  We can't calculate
3359  * the guardband yet, as we need the framebuffer dimensions, but we can
3360  * at least fill out the rest.
3361  */
static void
crocus_set_viewport_states(struct pipe_context *ctx,
                           unsigned start_slot,
                           unsigned count,
                           const struct pipe_viewport_state *states)
{
   struct crocus_context *ice = (struct crocus_context *) ctx;
   struct crocus_screen *screen = (struct crocus_screen *)ctx->screen;

   memcpy(&ice->state.viewports[start_slot], states, sizeof(*states) * count);

   /* Fix depth test misrenderings by lowering translated depth range */
   if (screen->driconf.lower_depth_range_rate != 1.0f)
      ice->state.viewports[start_slot].translate[2] *=
         screen->driconf.lower_depth_range_rate;

   /* Viewports feed SF_CLIP_VIEWPORT and the scissor rectangles. */
   ice->state.dirty |= CROCUS_DIRTY_SF_CL_VIEWPORT;
   ice->state.dirty |= CROCUS_DIRTY_RASTER;
#if GFX_VER >= 6
   ice->state.dirty |= CROCUS_DIRTY_GEN6_SCISSOR_RECT;
#endif

   /* With depth clipping disabled, CC_VIEWPORT depends on the viewport's
    * depth range, so it needs recomputing too.
    */
   if (ice->state.cso_rast && (!ice->state.cso_rast->cso.depth_clip_near ||
                               !ice->state.cso_rast->cso.depth_clip_far))
      ice->state.dirty |= CROCUS_DIRTY_CC_VIEWPORT;
}
3388 
3389 /**
3390  * The pipe->set_framebuffer_state() driver hook.
3391  *
3392  * Sets the current draw FBO, including color render targets, depth,
3393  * and stencil buffers.
3394  */
static void
crocus_set_framebuffer_state(struct pipe_context *ctx,
                             const struct pipe_framebuffer_state *state)
{
   struct crocus_context *ice = (struct crocus_context *) ctx;
   struct pipe_framebuffer_state *cso = &ice->state.framebuffer;
   struct crocus_screen *screen = (struct crocus_screen *)ctx->screen;
   const struct intel_device_info *devinfo = &screen->devinfo;
#if 0
   struct isl_device *isl_dev = &screen->isl_dev;
   struct crocus_resource *zres;
   struct crocus_resource *stencil_res;
#endif

   unsigned samples = util_framebuffer_get_num_samples(state);
   unsigned layers = util_framebuffer_get_num_layers(state);

#if GFX_VER >= 6
   /* A sample-count change invalidates MSAA-dependent packets. */
   if (cso->samples != samples) {
      ice->state.dirty |= CROCUS_DIRTY_GEN6_MULTISAMPLE;
      ice->state.dirty |= CROCUS_DIRTY_GEN6_SAMPLE_MASK;
      ice->state.dirty |= CROCUS_DIRTY_RASTER;
#if GFX_VERx10 == 75
      ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_FS;
#endif
   }
#endif

#if GFX_VER >= 6 && GFX_VER < 8
   ice->state.dirty |= CROCUS_DIRTY_GEN6_BLEND_STATE;
#endif

   /* Clipping depends on whether the FB is layered at all. */
   if ((cso->layers == 0) != (layers == 0)) {
      ice->state.dirty |= CROCUS_DIRTY_CLIP;
   }

   /* Dimension changes invalidate viewport/guardband-derived state. */
   if (cso->width != state->width || cso->height != state->height) {
      ice->state.dirty |= CROCUS_DIRTY_SF_CL_VIEWPORT;
      ice->state.dirty |= CROCUS_DIRTY_RASTER;
      ice->state.dirty |= CROCUS_DIRTY_DRAWING_RECTANGLE;
#if GFX_VER >= 6
      ice->state.dirty |= CROCUS_DIRTY_GEN6_SCISSOR_RECT;
#endif
   }

   if (cso->zsbuf || state->zsbuf) {
      ice->state.dirty |= CROCUS_DIRTY_DEPTH_BUFFER;

      /* update SF's depth buffer format */
      if (GFX_VER == 7 && cso->zsbuf)
         ice->state.dirty |= CROCUS_DIRTY_RASTER;
   }

   /* wm thread dispatch enable */
   ice->state.dirty |= CROCUS_DIRTY_WM;
   util_copy_framebuffer_state(cso, state);
   cso->samples = samples;
   cso->layers = layers;

   /* Record the HiZ usage of the new depth buffer (if any). */
   if (cso->zsbuf) {
      struct crocus_resource *zres;
      struct crocus_resource *stencil_res;
      enum isl_aux_usage aux_usage = ISL_AUX_USAGE_NONE;
      crocus_get_depth_stencil_resources(devinfo, cso->zsbuf->texture, &zres,
                                         &stencil_res);
      if (zres && crocus_resource_level_has_hiz(zres, cso->zsbuf->u.tex.level)) {
         aux_usage = zres->aux.usage;
      }
      ice->state.hiz_usage = aux_usage;
   }

   /* Render target change */
   ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_FS;

   ice->state.dirty |= CROCUS_DIRTY_RENDER_RESOLVES_AND_FLUSHES;

   ice->state.stage_dirty |= ice->state.stage_dirty_for_nos[CROCUS_NOS_FRAMEBUFFER];
}
3473 
3474 /**
3475  * The pipe->set_constant_buffer() driver hook.
3476  *
3477  * This uploads any constant data in user buffers, and references
3478  * any UBO resources containing constant data.
3479  */
static void
crocus_set_constant_buffer(struct pipe_context *ctx,
                           enum pipe_shader_type p_stage, unsigned index,
                           bool take_ownership,
                           const struct pipe_constant_buffer *input)
{
   struct crocus_context *ice = (struct crocus_context *) ctx;
   gl_shader_stage stage = stage_from_pipe(p_stage);
   struct crocus_shader_state *shs = &ice->state.shaders[stage];
   struct pipe_constant_buffer *cbuf = &shs->constbufs[index];

   util_copy_constant_buffer(&shs->constbufs[index], input, take_ownership);

   if (input && input->buffer_size && (input->buffer || input->user_buffer)) {
      shs->bound_cbufs |= 1u << index;

      if (input->user_buffer) {
         /* User-memory constants must live in a GPU buffer: upload them
          * through the const uploader into a fresh allocation.
          */
         void *map = NULL;
         pipe_resource_reference(&cbuf->buffer, NULL);
         u_upload_alloc(ice->ctx.const_uploader, 0, input->buffer_size, 64,
                        &cbuf->buffer_offset, &cbuf->buffer, (void **) &map);

         if (!cbuf->buffer) {
            /* Allocation was unsuccessful - just unbind */
            crocus_set_constant_buffer(ctx, p_stage, index, false, NULL);
            return;
         }

         assert(map);
         memcpy(map, input->user_buffer, input->buffer_size);
      }
      /* Clamp the bound size so it never runs past the end of the BO. */
      cbuf->buffer_size =
         MIN2(input->buffer_size,
              crocus_resource_bo(cbuf->buffer)->size - cbuf->buffer_offset);

      /* Track usage so resource writes know which stages to flush. */
      struct crocus_resource *res = (void *) cbuf->buffer;
      res->bind_history |= PIPE_BIND_CONSTANT_BUFFER;
      res->bind_stages |= 1 << stage;
   } else {
      shs->bound_cbufs &= ~(1u << index);
   }

   ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_VS << stage;
}
3524 
static void
upload_sysvals(struct crocus_context *ice,
               gl_shader_stage stage)
{
   UNUSED struct crocus_genx_state *genx = ice->state.genx;
   struct crocus_shader_state *shs = &ice->state.shaders[stage];

   struct crocus_compiled_shader *shader = ice->shaders.prog[stage];
   if (!shader || shader->num_system_values == 0)
      return;

   assert(shader->num_cbufs > 0);

   /* By convention the system values occupy the last constant buffer
    * slot the shader uses; each value is a single uint32_t.
    */
   unsigned sysval_cbuf_index = shader->num_cbufs - 1;
   struct pipe_constant_buffer *cbuf = &shs->constbufs[sysval_cbuf_index];
   unsigned upload_size = shader->num_system_values * sizeof(uint32_t);
   uint32_t *map = NULL;

   assert(sysval_cbuf_index < PIPE_MAX_CONSTANT_BUFFERS);
   u_upload_alloc(ice->ctx.const_uploader, 0, upload_size, 64,
                  &cbuf->buffer_offset, &cbuf->buffer, (void **) &map);

   /* Resolve each system value the compiler recorded into a concrete
    * 32-bit word and stream them out in order.
    */
   for (int i = 0; i < shader->num_system_values; i++) {
      uint32_t sysval = shader->system_values[i];
      uint32_t value = 0;

      if (ELK_PARAM_DOMAIN(sysval) == ELK_PARAM_DOMAIN_IMAGE) {
#if GFX_VER >= 7
         /* Image params are read as raw dwords at a byte offset into
          * the isl_image_param struct stashed at bind time.
          */
         unsigned img = ELK_PARAM_IMAGE_IDX(sysval);
         unsigned offset = ELK_PARAM_IMAGE_OFFSET(sysval);
         struct isl_image_param *param =
            &genx->shaders[stage].image_param[img];

         assert(offset < sizeof(struct isl_image_param));
         value = ((uint32_t *) param)[offset];
#endif
      } else if (sysval == ELK_PARAM_BUILTIN_ZERO) {
         value = 0;
      } else if (ELK_PARAM_BUILTIN_IS_CLIP_PLANE(sysval)) {
         int plane = ELK_PARAM_BUILTIN_CLIP_PLANE_IDX(sysval);
         int comp  = ELK_PARAM_BUILTIN_CLIP_PLANE_COMP(sysval);
         value = fui(ice->state.clip_planes.ucp[plane][comp]);
      } else if (sysval == ELK_PARAM_BUILTIN_PATCH_VERTICES_IN) {
         if (stage == MESA_SHADER_TESS_CTRL) {
            value = ice->state.vertices_per_patch;
         } else {
            assert(stage == MESA_SHADER_TESS_EVAL);
            /* TES sees the TCS output patch size when a TCS exists;
             * otherwise fall back to the input patch size.
             */
            const struct shader_info *tcs_info =
               crocus_get_shader_info(ice, MESA_SHADER_TESS_CTRL);
            if (tcs_info)
               value = tcs_info->tess.tcs_vertices_out;
            else
               value = ice->state.vertices_per_patch;
         }
      } else if (sysval >= ELK_PARAM_BUILTIN_TESS_LEVEL_OUTER_X &&
                 sysval <= ELK_PARAM_BUILTIN_TESS_LEVEL_OUTER_W) {
         unsigned i = sysval - ELK_PARAM_BUILTIN_TESS_LEVEL_OUTER_X;
         value = fui(ice->state.default_outer_level[i]);
      } else if (sysval == ELK_PARAM_BUILTIN_TESS_LEVEL_INNER_X) {
         value = fui(ice->state.default_inner_level[0]);
      } else if (sysval == ELK_PARAM_BUILTIN_TESS_LEVEL_INNER_Y) {
         value = fui(ice->state.default_inner_level[1]);
      } else if (sysval >= ELK_PARAM_BUILTIN_WORK_GROUP_SIZE_X &&
                 sysval <= ELK_PARAM_BUILTIN_WORK_GROUP_SIZE_Z) {
         unsigned i = sysval - ELK_PARAM_BUILTIN_WORK_GROUP_SIZE_X;
         value = ice->state.last_block[i];
      } else {
         assert(!"unhandled system value");
      }

      *map++ = value;
   }

   cbuf->buffer_size = upload_size;
   shs->sysvals_need_upload = false;
}
3601 
3602 /**
3603  * The pipe->set_shader_buffers() driver hook.
3604  *
3605  * This binds SSBOs and ABOs.  Unfortunately, we need to stream out
3606  * SURFACE_STATE here, as the buffer offset may change each time.
3607  */
static void
crocus_set_shader_buffers(struct pipe_context *ctx,
                          enum pipe_shader_type p_stage,
                          unsigned start_slot, unsigned count,
                          const struct pipe_shader_buffer *buffers,
                          unsigned writable_bitmask)
{
   struct crocus_context *ice = (struct crocus_context *) ctx;
   gl_shader_stage stage = stage_from_pipe(p_stage);
   struct crocus_shader_state *shs = &ice->state.shaders[stage];

   unsigned modified_bits = u_bit_consecutive(start_slot, count);

   /* Reset bound/writable bits for the affected slots, then install the
    * caller's writable mask (which is relative to start_slot).
    */
   shs->bound_ssbos &= ~modified_bits;
   shs->writable_ssbos &= ~modified_bits;
   shs->writable_ssbos |= writable_bitmask << start_slot;

   for (unsigned i = 0; i < count; i++) {
      if (buffers && buffers[i].buffer) {
         struct crocus_resource *res = (void *) buffers[i].buffer;
         struct pipe_shader_buffer *ssbo = &shs->ssbo[start_slot + i];
         pipe_resource_reference(&ssbo->buffer, &res->base.b);
         ssbo->buffer_offset = buffers[i].buffer_offset;
         /* Clamp the size so the binding never runs past the BO. */
         ssbo->buffer_size =
            MIN2(buffers[i].buffer_size, res->bo->size - ssbo->buffer_offset);

         shs->bound_ssbos |= 1 << (start_slot + i);

         /* Track usage so resource writes know which stages to flush. */
         res->bind_history |= PIPE_BIND_SHADER_BUFFER;
         res->bind_stages |= 1 << stage;

         /* SSBOs may be written; mark the bound range as valid data. */
         util_range_add(&res->base.b, &res->valid_buffer_range, ssbo->buffer_offset,
                        ssbo->buffer_offset + ssbo->buffer_size);
      } else {
         pipe_resource_reference(&shs->ssbo[start_slot + i].buffer, NULL);
      }
   }

   ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_VS << stage;
}
3648 
static void
crocus_delete_state(struct pipe_context *ctx, void *state)
{
   /* Generic destructor for CSOs that own no additional resources. */
   free(state);
}
3654 
3655 /**
3656  * The pipe->set_vertex_buffers() driver hook.
3657  *
3658  * This translates pipe_vertex_buffer to our 3DSTATE_VERTEX_BUFFERS packet.
3659  */
static void
crocus_set_vertex_buffers(struct pipe_context *ctx,
                          unsigned count,
                          const struct pipe_vertex_buffer *buffers)
{
   struct crocus_context *ice = (struct crocus_context *) ctx;
   struct crocus_screen *screen = (struct crocus_screen *) ctx->screen;
   /* Pre-Haswell parts (other than Baytrail) need 2 extra bytes past the
    * end of each vertex buffer -- see the vb_end computation below.
    */
   const unsigned padding =
      (GFX_VERx10 < 75 && screen->devinfo.platform != INTEL_PLATFORM_BYT) * 2;

   util_set_vertex_buffers_mask(ice->state.vertex_buffers, &ice->state.bound_vertex_buffers,
                                buffers, count, true);

   for (unsigned i = 0; i < count; i++) {
      struct pipe_vertex_buffer *state =
         &ice->state.vertex_buffers[i];

      if (!state->is_user_buffer && state->buffer.resource) {
         /* Track usage so resource writes know what to invalidate. */
         struct crocus_resource *res = (void *)state->buffer.resource;
         res->bind_history |= PIPE_BIND_VERTEX_BUFFER;
      }

      /* Cache each buffer's end address (plus workaround padding) for
       * the 3DSTATE_VERTEX_BUFFERS packet emitted at draw time.
       */
      uint32_t end = 0;
      if (state->buffer.resource)
         end = state->buffer.resource->width0 + padding;
      ice->state.vb_end[i] = end;
   }
   ice->state.dirty |= CROCUS_DIRTY_VERTEX_BUFFERS;
}
3689 
3690 #if GFX_VERx10 < 75
get_wa_flags(enum isl_format format)3691 static uint8_t get_wa_flags(enum isl_format format)
3692 {
3693    uint8_t wa_flags = 0;
3694 
3695    switch (format) {
3696    case ISL_FORMAT_R10G10B10A2_USCALED:
3697       wa_flags = ELK_ATTRIB_WA_SCALE;
3698       break;
3699    case ISL_FORMAT_R10G10B10A2_SSCALED:
3700       wa_flags = ELK_ATTRIB_WA_SIGN | ELK_ATTRIB_WA_SCALE;
3701       break;
3702    case ISL_FORMAT_R10G10B10A2_UNORM:
3703       wa_flags = ELK_ATTRIB_WA_NORMALIZE;
3704       break;
3705    case ISL_FORMAT_R10G10B10A2_SNORM:
3706       wa_flags = ELK_ATTRIB_WA_SIGN | ELK_ATTRIB_WA_NORMALIZE;
3707       break;
3708    case ISL_FORMAT_R10G10B10A2_SINT:
3709       wa_flags = ELK_ATTRIB_WA_SIGN;
3710       break;
3711    case ISL_FORMAT_B10G10R10A2_USCALED:
3712       wa_flags = ELK_ATTRIB_WA_SCALE | ELK_ATTRIB_WA_BGRA;
3713       break;
3714    case ISL_FORMAT_B10G10R10A2_SSCALED:
3715       wa_flags = ELK_ATTRIB_WA_SIGN | ELK_ATTRIB_WA_SCALE | ELK_ATTRIB_WA_BGRA;
3716       break;
3717    case ISL_FORMAT_B10G10R10A2_UNORM:
3718       wa_flags = ELK_ATTRIB_WA_NORMALIZE | ELK_ATTRIB_WA_BGRA;
3719       break;
3720    case ISL_FORMAT_B10G10R10A2_SNORM:
3721       wa_flags = ELK_ATTRIB_WA_SIGN | ELK_ATTRIB_WA_NORMALIZE | ELK_ATTRIB_WA_BGRA;
3722       break;
3723    case ISL_FORMAT_B10G10R10A2_SINT:
3724       wa_flags = ELK_ATTRIB_WA_SIGN | ELK_ATTRIB_WA_BGRA;
3725       break;
3726    case ISL_FORMAT_B10G10R10A2_UINT:
3727       wa_flags = ELK_ATTRIB_WA_BGRA;
3728       break;
3729    default:
3730       break;
3731    }
3732    return wa_flags;
3733 }
3734 #endif
3735 
3736 /**
3737  * Gallium CSO for vertex elements.
3738  */
struct crocus_vertex_element_state {
   /* Packed 3DSTATE_VERTEX_ELEMENTS: one header DWord plus up to 33
    * VERTEX_ELEMENT_STATE entries.
    */
   uint32_t vertex_elements[1 + 33 * GENX(VERTEX_ELEMENT_STATE_length)];
#if GFX_VER == 8
   /* Gen8: one packed 3DSTATE_VF_INSTANCING packet per element. */
   uint32_t vf_instancing[33 * GENX(3DSTATE_VF_INSTANCING_length)];
#endif
   /* Alternate last VERTEX_ELEMENT_STATE, swapped in at draw time when
    * the vertex shader needs EdgeFlag.
    */
   uint32_t edgeflag_ve[GENX(VERTEX_ELEMENT_STATE_length)];
#if GFX_VER == 8
   /* Matching 3DSTATE_VF_INSTANCING for the edge-flag element. */
   uint32_t edgeflag_vfi[GENX(3DSTATE_VF_INSTANCING_length)];
#endif
   /* Per-buffer step rates -- presumably instance divisors; verify at
    * the draw-time consumer.
    */
   uint32_t step_rate[16];
   /* Per-element pre-Haswell ELK_ATTRIB_WA_* fixup flags. */
   uint8_t wa_flags[33];
   /* Per-buffer vertex strides in bytes. */
   uint16_t strides[16];
   /* Number of elements supplied by the state tracker. */
   unsigned count;
};
3753 
3754 /**
3755  * The pipe->create_vertex_elements() driver hook.
3756  *
3757  * This translates pipe_vertex_element to our 3DSTATE_VERTEX_ELEMENTS
3758  * and 3DSTATE_VF_INSTANCING commands. The vertex_elements and vf_instancing
3759  * arrays are ready to be emitted at draw time if no EdgeFlag or SGVs are
3760  * needed. In these cases we will need information available at draw time.
3761  * We setup edgeflag_ve and edgeflag_vfi as alternatives last
3762  * 3DSTATE_VERTEX_ELEMENT and 3DSTATE_VF_INSTANCING that can be used at
3763  * draw time if we detect that EdgeFlag is needed by the Vertex Shader.
3764  */
3765 static void *
crocus_create_vertex_elements(struct pipe_context * ctx,unsigned count,const struct pipe_vertex_element * state)3766 crocus_create_vertex_elements(struct pipe_context *ctx,
3767                               unsigned count,
3768                               const struct pipe_vertex_element *state)
3769 {
3770    struct crocus_screen *screen = (struct crocus_screen *)ctx->screen;
3771    const struct intel_device_info *devinfo = &screen->devinfo;
3772    struct crocus_vertex_element_state *cso =
3773       calloc(1, sizeof(struct crocus_vertex_element_state));
3774 
3775    cso->count = count;
3776 
3777    crocus_pack_command(GENX(3DSTATE_VERTEX_ELEMENTS), cso->vertex_elements, ve) {
3778       ve.DWordLength =
3779          1 + GENX(VERTEX_ELEMENT_STATE_length) * MAX2(count, 1) - 2;
3780    }
3781 
3782    uint32_t *ve_pack_dest = &cso->vertex_elements[1];
3783 #if GFX_VER == 8
3784    uint32_t *vfi_pack_dest = cso->vf_instancing;
3785 #endif
3786 
3787    if (count == 0) {
3788       crocus_pack_state(GENX(VERTEX_ELEMENT_STATE), ve_pack_dest, ve) {
3789          ve.Valid = true;
3790          ve.SourceElementFormat = ISL_FORMAT_R32G32B32A32_FLOAT;
3791          ve.Component0Control = VFCOMP_STORE_0;
3792          ve.Component1Control = VFCOMP_STORE_0;
3793          ve.Component2Control = VFCOMP_STORE_0;
3794          ve.Component3Control = VFCOMP_STORE_1_FP;
3795       }
3796 #if GFX_VER == 8
3797       crocus_pack_command(GENX(3DSTATE_VF_INSTANCING), vfi_pack_dest, vi) {
3798       }
3799 #endif
3800    }
3801 
3802    for (int i = 0; i < count; i++) {
3803       const struct crocus_format_info fmt =
3804          crocus_format_for_usage(devinfo, state[i].src_format, 0);
3805       unsigned comp[4] = { VFCOMP_STORE_SRC, VFCOMP_STORE_SRC,
3806                            VFCOMP_STORE_SRC, VFCOMP_STORE_SRC };
3807       enum isl_format actual_fmt = fmt.fmt;
3808 
3809 #if GFX_VERx10 < 75
3810       cso->wa_flags[i] = get_wa_flags(fmt.fmt);
3811 
3812       if (fmt.fmt == ISL_FORMAT_R10G10B10A2_USCALED ||
3813           fmt.fmt == ISL_FORMAT_R10G10B10A2_SSCALED ||
3814           fmt.fmt == ISL_FORMAT_R10G10B10A2_UNORM ||
3815           fmt.fmt == ISL_FORMAT_R10G10B10A2_SNORM ||
3816           fmt.fmt == ISL_FORMAT_R10G10B10A2_SINT ||
3817           fmt.fmt == ISL_FORMAT_B10G10R10A2_USCALED ||
3818           fmt.fmt == ISL_FORMAT_B10G10R10A2_SSCALED ||
3819           fmt.fmt == ISL_FORMAT_B10G10R10A2_UNORM ||
3820           fmt.fmt == ISL_FORMAT_B10G10R10A2_SNORM ||
3821           fmt.fmt == ISL_FORMAT_B10G10R10A2_UINT ||
3822           fmt.fmt == ISL_FORMAT_B10G10R10A2_SINT)
3823          actual_fmt = ISL_FORMAT_R10G10B10A2_UINT;
3824       if (fmt.fmt == ISL_FORMAT_R8G8B8_SINT)
3825          actual_fmt = ISL_FORMAT_R8G8B8A8_SINT;
3826       if (fmt.fmt == ISL_FORMAT_R8G8B8_UINT)
3827          actual_fmt = ISL_FORMAT_R8G8B8A8_UINT;
3828       if (fmt.fmt == ISL_FORMAT_R16G16B16_SINT)
3829          actual_fmt = ISL_FORMAT_R16G16B16A16_SINT;
3830       if (fmt.fmt == ISL_FORMAT_R16G16B16_UINT)
3831          actual_fmt = ISL_FORMAT_R16G16B16A16_UINT;
3832 #endif
3833 
3834       cso->step_rate[state[i].vertex_buffer_index] = state[i].instance_divisor;
3835       cso->strides[state[i].vertex_buffer_index] = state[i].src_stride;
3836 
3837       switch (isl_format_get_num_channels(fmt.fmt)) {
3838       case 0: comp[0] = VFCOMP_STORE_0; FALLTHROUGH;
3839       case 1: comp[1] = VFCOMP_STORE_0; FALLTHROUGH;
3840       case 2: comp[2] = VFCOMP_STORE_0; FALLTHROUGH;
3841       case 3:
3842          comp[3] = isl_format_has_int_channel(fmt.fmt) ? VFCOMP_STORE_1_INT
3843             : VFCOMP_STORE_1_FP;
3844          break;
3845       }
3846       crocus_pack_state(GENX(VERTEX_ELEMENT_STATE), ve_pack_dest, ve) {
3847 #if GFX_VER >= 6
3848          ve.EdgeFlagEnable = false;
3849 #endif
3850          ve.VertexBufferIndex = state[i].vertex_buffer_index;
3851          ve.Valid = true;
3852          ve.SourceElementOffset = state[i].src_offset;
3853          ve.SourceElementFormat = actual_fmt;
3854          ve.Component0Control = comp[0];
3855          ve.Component1Control = comp[1];
3856          ve.Component2Control = comp[2];
3857          ve.Component3Control = comp[3];
3858 #if GFX_VER < 5
3859          ve.DestinationElementOffset = i * 4;
3860 #endif
3861       }
3862 
3863 #if GFX_VER == 8
3864       crocus_pack_command(GENX(3DSTATE_VF_INSTANCING), vfi_pack_dest, vi) {
3865          vi.VertexElementIndex = i;
3866          vi.InstancingEnable = state[i].instance_divisor > 0;
3867          vi.InstanceDataStepRate = state[i].instance_divisor;
3868       }
3869 #endif
3870       ve_pack_dest += GENX(VERTEX_ELEMENT_STATE_length);
3871 #if GFX_VER == 8
3872       vfi_pack_dest += GENX(3DSTATE_VF_INSTANCING_length);
3873 #endif
3874    }
3875 
3876    /* An alternative version of the last VE and VFI is stored so it
3877     * can be used at draw time in case Vertex Shader uses EdgeFlag
3878     */
3879    if (count) {
3880       const unsigned edgeflag_index = count - 1;
3881       const struct crocus_format_info fmt =
3882          crocus_format_for_usage(devinfo, state[edgeflag_index].src_format, 0);
3883       crocus_pack_state(GENX(VERTEX_ELEMENT_STATE), cso->edgeflag_ve, ve) {
3884 #if GFX_VER >= 6
3885          ve.EdgeFlagEnable = true;
3886 #endif
3887          ve.VertexBufferIndex = state[edgeflag_index].vertex_buffer_index;
3888          ve.Valid = true;
3889          ve.SourceElementOffset = state[edgeflag_index].src_offset;
3890          ve.SourceElementFormat = fmt.fmt;
3891          ve.Component0Control = VFCOMP_STORE_SRC;
3892          ve.Component1Control = VFCOMP_STORE_0;
3893          ve.Component2Control = VFCOMP_STORE_0;
3894          ve.Component3Control = VFCOMP_STORE_0;
3895       }
3896 #if GFX_VER == 8
3897       crocus_pack_command(GENX(3DSTATE_VF_INSTANCING), cso->edgeflag_vfi, vi) {
3898          /* The vi.VertexElementIndex of the EdgeFlag Vertex Element is filled
3899           * at draw time, as it should change if SGVs are emitted.
3900           */
3901          vi.InstancingEnable = state[edgeflag_index].instance_divisor > 0;
3902          vi.InstanceDataStepRate = state[edgeflag_index].instance_divisor;
3903       }
3904 #endif
3905    }
3906 
3907    return cso;
3908 }
3909 
/**
 * The pipe->bind_vertex_elements_state() driver hook.
 *
 * Binds the CSO created by crocus_create_vertex_elements() (may be NULL)
 * and flags the vertex-element/buffer state for re-emission.
 */
static void
crocus_bind_vertex_elements_state(struct pipe_context *ctx, void *state)
{
   struct crocus_context *ice = (struct crocus_context *) ctx;
#if GFX_VER == 8
   struct crocus_vertex_element_state *old_cso = ice->state.cso_vertex_elements;
   struct crocus_vertex_element_state *new_cso = state;

   /* NOTE(review): cso_changed() is defined elsewhere in this file and
    * presumably compares the named field between old_cso and new_cso —
    * a change in element count affects Gen8's 3DSTATE_VF_SGVS.
    */
   if (new_cso && cso_changed(count))
      ice->state.dirty |= CROCUS_DIRTY_GEN8_VF_SGVS;
#endif
   ice->state.cso_vertex_elements = state;
   ice->state.dirty |= CROCUS_DIRTY_VERTEX_ELEMENTS | CROCUS_DIRTY_VERTEX_BUFFERS;
   /* Shaders compiled with NOS (non-orthogonal state) depending on vertex
    * elements may need recompiling.
    */
   ice->state.stage_dirty |= ice->state.stage_dirty_for_nos[CROCUS_NOS_VERTEX_ELEMENTS];
}
3928 
#if GFX_VER >= 6
/* Software bookkeeping for streamout primitive counts (used on Gen6,
 * which lacks the SO_WRITE_OFFSET registers of Gen7+).
 */
struct crocus_streamout_counter {
   /* Byte range of not-yet-aggregated snapshot pairs in the counter buffer. */
   uint32_t offset_start;
   uint32_t offset_end;

   /* Primitives accumulated from already-consumed snapshot pairs. */
   uint64_t accum;
};
3936 
/**
 * Gallium CSO for stream output (transform feedback) targets.
 */
struct crocus_stream_output_target {
   struct pipe_stream_output_target base;

   /** Stride (bytes-per-vertex) during this transform feedback operation */
   uint16_t stride;

   /** Has 3DSTATE_SO_BUFFER actually been emitted, zeroing the offsets? */
   bool zeroed;

   /* Buffer (and byte offset within it) holding the saved SO write offset
    * on Gen7+, or the primitive-count snapshots on Gen6.
    */
   struct crocus_resource *offset_res;
   uint32_t offset_offset;

#if GFX_VER == 6
   /* CPU mapping of offset_res, read back by aggregate_stream_counter(). */
   void *prim_map;
   /* Counters for the previous and current streamout intervals. */
   struct crocus_streamout_counter prev_count;
   struct crocus_streamout_counter count;
#endif
#if GFX_VER == 8
   /** Does the next 3DSTATE_SO_BUFFER need to zero the offsets? */
   bool zero_offset;
#endif
};
3962 
#if GFX_VER >= 7
/**
 * Read back the saved streamout write offset for a target and convert it
 * from bytes to a vertex count.
 */
static uint32_t
crocus_get_so_offset(struct pipe_stream_output_target *so)
{
   struct crocus_stream_output_target *tgt = (void *)so;
   struct pipe_transfer *xfer;
   struct pipe_box range;

   /* Map just the dword that holds the saved SO_WRITE_OFFSET value. */
   u_box_1d(tgt->offset_offset, 4, &range);
   void *mapped = so->context->buffer_map(so->context, &tgt->offset_res->base.b,
                                          0, PIPE_MAP_DIRECTLY,
                                          &range, &xfer);
   assert(mapped);

   const uint32_t bytes_written = *(uint32_t *)mapped;
   so->context->buffer_unmap(so->context, xfer);

   /* Byte offset -> number of vertices already written. */
   return bytes_written / tgt->stride;
}
#endif
3982 
#if GFX_VER == 6
static void
compute_vertices_written_so_far(struct crocus_context *ice,
                                struct crocus_stream_output_target *tgt,
                                struct crocus_streamout_counter *count,
                                uint64_t *svbi);

/**
 * Gen6 variant: there are no SO_WRITE_OFFSET registers, so the resume
 * position is derived from the software-accumulated primitive counts.
 */
static uint32_t
crocus_get_so_offset(struct pipe_stream_output_target *so)
{
   struct crocus_stream_output_target *tgt = (void *)so;
   struct crocus_context *ice = (void *)so->context;

   uint64_t vert_written;
   compute_vertices_written_so_far(ice, tgt, &tgt->prev_count, &vert_written);
   return vert_written;
}
#endif
4001 
4002 /**
4003  * The pipe->create_stream_output_target() driver hook.
4004  *
4005  * "Target" here refers to a destination buffer.  We translate this into
4006  * a 3DSTATE_SO_BUFFER packet.  We can handle most fields, but don't yet
4007  * know which buffer this represents, or whether we ought to zero the
4008  * write-offsets, or append.  Those are handled in the set() hook.
4009  */
4010 static struct pipe_stream_output_target *
crocus_create_stream_output_target(struct pipe_context * ctx,struct pipe_resource * p_res,unsigned buffer_offset,unsigned buffer_size)4011 crocus_create_stream_output_target(struct pipe_context *ctx,
4012                                    struct pipe_resource *p_res,
4013                                    unsigned buffer_offset,
4014                                    unsigned buffer_size)
4015 {
4016    struct crocus_resource *res = (void *) p_res;
4017    struct crocus_stream_output_target *cso = calloc(1, sizeof(*cso));
4018    if (!cso)
4019       return NULL;
4020 
4021    res->bind_history |= PIPE_BIND_STREAM_OUTPUT;
4022 
4023    pipe_reference_init(&cso->base.reference, 1);
4024    pipe_resource_reference(&cso->base.buffer, p_res);
4025    cso->base.buffer_offset = buffer_offset;
4026    cso->base.buffer_size = buffer_size;
4027    cso->base.context = ctx;
4028 
4029    util_range_add(&res->base.b, &res->valid_buffer_range, buffer_offset,
4030                   buffer_offset + buffer_size);
4031 #if GFX_VER >= 7
4032    struct crocus_context *ice = (struct crocus_context *) ctx;
4033    void *temp;
4034    u_upload_alloc(ice->ctx.stream_uploader, 0, sizeof(uint32_t), 4,
4035                   &cso->offset_offset,
4036                   (struct pipe_resource **)&cso->offset_res,
4037                   &temp);
4038 #endif
4039 
4040    return &cso->base;
4041 }
4042 
4043 static void
crocus_stream_output_target_destroy(struct pipe_context * ctx,struct pipe_stream_output_target * state)4044 crocus_stream_output_target_destroy(struct pipe_context *ctx,
4045                                     struct pipe_stream_output_target *state)
4046 {
4047    struct crocus_stream_output_target *cso = (void *) state;
4048 
4049    pipe_resource_reference((struct pipe_resource **)&cso->offset_res, NULL);
4050    pipe_resource_reference(&cso->base.buffer, NULL);
4051 
4052    free(cso);
4053 }
4054 
/* MMIO register addresses used for streamout bookkeeping. */
#define GEN6_SO_NUM_PRIMS_WRITTEN       0x2288
#define GEN7_SO_WRITE_OFFSET(n)         (0x5280 + (n) * 4)
4057 
#if GFX_VER == 6
/**
 * Fold GPU-written SO_NUM_PRIMS_WRITTEN snapshots into counter->accum.
 *
 * prim_map holds consecutive 64-bit register snapshots; each consecutive
 * pair (prim_counts[i], prim_counts[i + 1]) brackets one streamout interval,
 * so their difference is the number of primitives written in it.
 */
static void
aggregate_stream_counter(struct crocus_batch *batch, struct crocus_stream_output_target *tgt,
                         struct crocus_streamout_counter *counter)
{
   uint64_t *prim_counts = tgt->prim_map;

   /* If the GPU may still be writing snapshots, flush and wait so the CPU
    * reads finished values from the mapping.
    */
   if (crocus_batch_references(batch, tgt->offset_res->bo)) {
      struct pipe_fence_handle *out_fence = NULL;
      batch->ice->ctx.flush(&batch->ice->ctx, &out_fence, 0);
      batch->screen->base.fence_finish(&batch->screen->base, &batch->ice->ctx, out_fence, UINT64_MAX);
      batch->screen->base.fence_reference(&batch->screen->base, &out_fence, NULL);
   }

   for (unsigned i = counter->offset_start / sizeof(uint64_t); i < counter->offset_end / sizeof(uint64_t); i += 2) {
      counter->accum += prim_counts[i + 1] - prim_counts[i];
   }
   /* NOTE(review): this resets tgt->count even when aggregating
    * tgt->prev_count — presumably intentional since both counters share
    * the same snapshot buffer, but worth confirming.
    */
   tgt->count.offset_start = tgt->count.offset_end = 0;
}
4077 
/**
 * Append a GEN6_SO_NUM_PRIMS_WRITTEN snapshot to the target's counter
 * buffer, lazily allocating the buffer and aggregating when it fills up.
 */
static void
crocus_stream_store_prims_written(struct crocus_batch *batch,
                                  struct crocus_stream_output_target *tgt)
{
   /* First use: allocate a 4096-byte scratch page for snapshots. */
   if (!tgt->offset_res) {
      u_upload_alloc(batch->ice->ctx.stream_uploader, 0, 4096, 4,
                     &tgt->offset_offset,
                     (struct pipe_resource **)&tgt->offset_res,
                     &tgt->prim_map);
      tgt->count.offset_start = tgt->count.offset_end = 0;
   }

   /* No room for another pair of 8-byte snapshots: fold what we have into
    * the accumulators (which rewinds the write cursor to 0).
    */
   if (tgt->count.offset_end + 16 >= 4096) {
      aggregate_stream_counter(batch, tgt, &tgt->prev_count);
      aggregate_stream_counter(batch, tgt, &tgt->count);
   }

   /* Flush so the snapshot is ordered after prior rendering, then store
    * the register into the counter buffer and advance the write cursor.
    */
   crocus_emit_mi_flush(batch);
   crocus_store_register_mem64(batch, GEN6_SO_NUM_PRIMS_WRITTEN,
                               tgt->offset_res->bo,
                               tgt->count.offset_end + tgt->offset_offset, false);
   tgt->count.offset_end += 8;
}
4101 
/**
 * Compute the number of vertices written so far (the SVBI resume value):
 * accumulated primitives times the vertex count of the last xfb primitive.
 */
static void
compute_vertices_written_so_far(struct crocus_context *ice,
                                struct crocus_stream_output_target *tgt,
                                struct crocus_streamout_counter *counter,
                                uint64_t *svbi)
{
   //TODO vertices per prim
   aggregate_stream_counter(&ice->batches[0], tgt, counter);

   *svbi = counter->accum * ice->state.last_xfb_verts_per_prim;
}
#endif
/**
 * The pipe->set_stream_output_targets() driver hook.
 *
 * At this point, we know which targets are bound to a particular index,
 * and also whether we want to append or start over.  We can finish the
 * 3DSTATE_SO_BUFFER packets we started earlier.
 */
static void
crocus_set_stream_output_targets(struct pipe_context *ctx,
                                 unsigned num_targets,
                                 struct pipe_stream_output_target **targets,
                                 const unsigned *offsets)
{
   struct crocus_context *ice = (struct crocus_context *) ctx;
   struct crocus_batch *batch = &ice->batches[CROCUS_BATCH_RENDER];
   struct pipe_stream_output_target *old_tgt[4] = { NULL, NULL, NULL, NULL };
   const bool active = num_targets > 0;
   if (ice->state.streamout_active != active) {
      ice->state.streamout_active = active;
#if GFX_VER >= 7
      ice->state.dirty |= CROCUS_DIRTY_STREAMOUT;
#else
      ice->state.dirty |= CROCUS_DIRTY_GEN4_FF_GS_PROG;
#endif

      /* We only emit 3DSTATE_SO_DECL_LIST when streamout is active, because
       * it's a non-pipelined command.  If we're switching streamout on, we
       * may have missed emitting it earlier, so do so now.  (We're already
       * taking a stall to update 3DSTATE_SO_BUFFERS anyway...)
       */
      if (active) {
#if GFX_VER >= 7
         ice->state.dirty |= CROCUS_DIRTY_SO_DECL_LIST;
#endif
      } else {
         /* Streamout is being turned off: flush so the written data becomes
          * visible to any later use of the buffers.
          */
         uint32_t flush = 0;
         for (int i = 0; i < PIPE_MAX_SO_BUFFERS; i++) {
            struct crocus_stream_output_target *tgt =
               (void *) ice->state.so_target[i];
            if (tgt) {
               struct crocus_resource *res = (void *) tgt->base.buffer;

               flush |= crocus_flush_bits_for_history(res);
               crocus_dirty_for_history(ice, res);
            }
         }
         crocus_emit_pipe_control_flush(&ice->batches[CROCUS_BATCH_RENDER],
                                        "make streamout results visible", flush);
      }
   }

   /* Swap in the new targets, keeping references to the old ones so their
    * write offsets/counters can be finalized below.
    */
   ice->state.so_targets = num_targets;
   for (int i = 0; i < 4; i++) {
      pipe_so_target_reference(&old_tgt[i], ice->state.so_target[i]);
      pipe_so_target_reference(&ice->state.so_target[i],
                               i < num_targets ? targets[i] : NULL);
   }

#if GFX_VER == 6
   /* Gen6: primitive counts are tracked in software (crocus_streamout_counter). */
   bool stored_num_prims = false;
   for (int i = 0; i < PIPE_MAX_SO_BUFFERS; i++) {
      if (num_targets) {
         struct crocus_stream_output_target *tgt =
            (void *) ice->state.so_target[i];

         if (!tgt)
            continue;
         if (offsets[i] == 0) {
            // This means that we're supposed to ignore anything written to
            // the buffer before. We can do this by just clearing out the
            // count of writes to the prim count buffer.
            tgt->count.offset_start = tgt->count.offset_end;
            tgt->count.accum = 0;
            ice->state.svbi = 0;
         } else {
            /* Appending: recompute the SVBI resume position from counters. */
            if (tgt->offset_res) {
               compute_vertices_written_so_far(ice, tgt, &tgt->count, &ice->state.svbi);
               tgt->count.offset_start = tgt->count.offset_end;
            }
         }

         /* One starting snapshot covers all targets for this interval. */
         if (!stored_num_prims) {
            crocus_stream_store_prims_written(batch, tgt);
            stored_num_prims = true;
         }
      } else {
         struct crocus_stream_output_target *tgt =
            (void *) old_tgt[i];
         if (tgt) {
            if (!stored_num_prims) {
               crocus_stream_store_prims_written(batch, tgt);
               stored_num_prims = true;
            }

            if (tgt->offset_res) {
               tgt->prev_count = tgt->count;
            }
         }
      }
      pipe_so_target_reference(&old_tgt[i], NULL);
   }
   ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_GS;
#else
   /* Gen7+: program SO_WRITE_OFFSET registers — zero to start over, or load
    * the saved offset to append; save old targets' offsets on unbind.
    */
   for (int i = 0; i < PIPE_MAX_SO_BUFFERS; i++) {
      if (num_targets) {
         struct crocus_stream_output_target *tgt =
            (void *) ice->state.so_target[i];

         if (offsets[i] == 0) {
#if GFX_VER == 8
            if (tgt)
               tgt->zero_offset = true;
#endif
            crocus_load_register_imm32(batch, GEN7_SO_WRITE_OFFSET(i), 0);
         }
         else if (tgt)
            crocus_load_register_mem32(batch, GEN7_SO_WRITE_OFFSET(i),
                                       tgt->offset_res->bo,
                                       tgt->offset_offset);
      } else {
         struct crocus_stream_output_target *tgt =
            (void *) old_tgt[i];
         if (tgt)
            crocus_store_register_mem32(batch, GEN7_SO_WRITE_OFFSET(i),
                                        tgt->offset_res->bo,
                                        tgt->offset_offset, false);
      }
      pipe_so_target_reference(&old_tgt[i], NULL);
   }
#endif
   /* No need to update 3DSTATE_SO_BUFFER unless SOL is active. */
   if (!active)
      return;
#if GFX_VER >= 7
   ice->state.dirty |= CROCUS_DIRTY_GEN7_SO_BUFFERS;
#elif GFX_VER == 6
   ice->state.dirty |= CROCUS_DIRTY_GEN6_SVBI;
#endif
}
4253 
4254 #endif
4255 
#if GFX_VER >= 7
/**
 * An crocus-vtable helper for encoding the 3DSTATE_SO_DECL_LIST and
 * 3DSTATE_STREAMOUT packets.
 *
 * 3DSTATE_SO_DECL_LIST is a list of shader outputs we want the streamout
 * hardware to record.  We can create it entirely based on the shader, with
 * no dynamic state dependencies.
 *
 * 3DSTATE_STREAMOUT is an annoying mix of shader-based information and
 * state-based settings.  We capture the shader-related ones here, and merge
 * the rest in at draw time.
 *
 * Returns a ralloc'd buffer containing the packed 3DSTATE_STREAMOUT
 * template followed by the 3DSTATE_SO_DECL_LIST; the caller owns it.
 */
static uint32_t *
crocus_create_so_decl_list(const struct pipe_stream_output_info *info,
                           const struct intel_vue_map *vue_map)
{
   struct GENX(SO_DECL) so_decl[PIPE_MAX_VERTEX_STREAMS][128];
   int buffer_mask[PIPE_MAX_VERTEX_STREAMS] = {0, 0, 0, 0};
   int next_offset[PIPE_MAX_VERTEX_STREAMS] = {0, 0, 0, 0};
   int decls[PIPE_MAX_VERTEX_STREAMS] = {0, 0, 0, 0};
   int max_decls = 0;
   STATIC_ASSERT(ARRAY_SIZE(so_decl[0]) >= PIPE_MAX_SO_OUTPUTS);

   memset(so_decl, 0, sizeof(so_decl));

   /* Construct the list of SO_DECLs to be emitted.  The formatting of the
    * command feels strange -- each dword pair contains a SO_DECL per stream.
    */
   for (unsigned i = 0; i < info->num_outputs; i++) {
      const struct pipe_stream_output *output = &info->output[i];
      const int buffer = output->output_buffer;
      const int varying = output->register_index;
      const unsigned stream_id = output->stream;
      assert(stream_id < PIPE_MAX_VERTEX_STREAMS);

      buffer_mask[stream_id] |= 1 << buffer;

      assert(vue_map->varying_to_slot[varying] >= 0);

      /* Mesa doesn't store entries for gl_SkipComponents in the Outputs[]
       * array.  Instead, it simply increments DstOffset for the following
       * input by the number of components that should be skipped.
       *
       * Our hardware is unusual in that it requires us to program SO_DECLs
       * for fake "hole" components, rather than simply taking the offset
       * for each real varying.  Each hole can have size 1, 2, 3, or 4; we
       * program as many size = 4 holes as we can, then a final hole to
       * accommodate the final 1, 2, or 3 remaining.
       */
      int skip_components = output->dst_offset - next_offset[buffer];

      while (skip_components > 0) {
         so_decl[stream_id][decls[stream_id]++] = (struct GENX(SO_DECL)) {
            .HoleFlag = 1,
            .OutputBufferSlot = output->output_buffer,
            .ComponentMask = (1 << MIN2(skip_components, 4)) - 1,
         };
         skip_components -= 4;
      }

      next_offset[buffer] = output->dst_offset + output->num_components;

      so_decl[stream_id][decls[stream_id]++] = (struct GENX(SO_DECL)) {
         .OutputBufferSlot = output->output_buffer,
         .RegisterIndex = vue_map->varying_to_slot[varying],
         .ComponentMask =
            ((1 << output->num_components) - 1) << output->start_component,
      };

      if (decls[stream_id] > max_decls)
         max_decls = decls[stream_id];
   }

   /* SO_DECL_LIST is 3 header DWords plus one DWord pair per entry. */
   unsigned dwords = GENX(3DSTATE_STREAMOUT_length) + (3 + 2 * max_decls);
   uint32_t *map = ralloc_size(NULL, sizeof(uint32_t) * dwords);
   uint32_t *so_decl_map = map + GENX(3DSTATE_STREAMOUT_length);

   crocus_pack_command(GENX(3DSTATE_STREAMOUT), map, sol) {
      int urb_entry_read_offset = 0;
      int urb_entry_read_length = (vue_map->num_slots + 1) / 2 -
         urb_entry_read_offset;

      /* We always read the whole vertex.  This could be reduced at some
       * point by reading less and offsetting the register index in the
       * SO_DECLs.
       */
      sol.Stream0VertexReadOffset = urb_entry_read_offset;
      sol.Stream0VertexReadLength = urb_entry_read_length - 1;
      sol.Stream1VertexReadOffset = urb_entry_read_offset;
      sol.Stream1VertexReadLength = urb_entry_read_length - 1;
      sol.Stream2VertexReadOffset = urb_entry_read_offset;
      sol.Stream2VertexReadLength = urb_entry_read_length - 1;
      sol.Stream3VertexReadOffset = urb_entry_read_offset;
      sol.Stream3VertexReadLength = urb_entry_read_length - 1;

      // TODO: Double-check that stride == 0 means no buffer. Probably this
      // needs to go elsewhere, where the buffer enable stuff is actually
      // known.
#if GFX_VER < 8
      sol.SOBufferEnable0 = !!info->stride[0];
      sol.SOBufferEnable1 = !!info->stride[1];
      sol.SOBufferEnable2 = !!info->stride[2];
      sol.SOBufferEnable3 = !!info->stride[3];
#else
      /* Set buffer pitches; 0 means unbound. */
      sol.Buffer0SurfacePitch = 4 * info->stride[0];
      sol.Buffer1SurfacePitch = 4 * info->stride[1];
      sol.Buffer2SurfacePitch = 4 * info->stride[2];
      sol.Buffer3SurfacePitch = 4 * info->stride[3];
#endif
   }

   crocus_pack_command(GENX(3DSTATE_SO_DECL_LIST), so_decl_map, list) {
      list.DWordLength = 3 + 2 * max_decls - 2;
      list.StreamtoBufferSelects0 = buffer_mask[0];
      list.StreamtoBufferSelects1 = buffer_mask[1];
      list.StreamtoBufferSelects2 = buffer_mask[2];
      list.StreamtoBufferSelects3 = buffer_mask[3];
      list.NumEntries0 = decls[0];
      list.NumEntries1 = decls[1];
      list.NumEntries2 = decls[2];
      list.NumEntries3 = decls[3];
   }

   /* Each SO_DECL_ENTRY interleaves one SO_DECL per stream. */
   for (int i = 0; i < max_decls; i++) {
      crocus_pack_state(GENX(SO_DECL_ENTRY), so_decl_map + 3 + i * 2, entry) {
         entry.Stream0Decl = so_decl[0][i];
         entry.Stream1Decl = so_decl[1][i];
         entry.Stream2Decl = so_decl[2][i];
         entry.Stream3Decl = so_decl[3][i];
      }
   }

   return map;
}
#endif
4393 
#if GFX_VER == 6
/**
 * Emit 3DSTATE_GS_SVB_INDEX packets: index 0 carries the streamed-vertex
 * resume position and a maximum index clamped to the smallest bound buffer;
 * indices 1-3 get safe defaults.
 */
static void
crocus_emit_so_svbi(struct crocus_context *ice)
{
   struct crocus_batch *batch = &ice->batches[CROCUS_BATCH_RENDER];

   /* Clamp to the smallest bound target so we never write past a buffer. */
   unsigned max_vertex = 0xffffffff;
   for (int i = 0; i < PIPE_MAX_SO_BUFFERS; i++) {
      struct crocus_stream_output_target *tgt =
         (void *) ice->state.so_target[i];
      if (tgt)
         max_vertex = MIN2(max_vertex, tgt->base.buffer_size / tgt->stride);
   }

   crocus_emit_cmd(batch, GENX(3DSTATE_GS_SVB_INDEX), svbi) {
      svbi.IndexNumber = 0;
      svbi.StreamedVertexBufferIndex = (uint32_t)ice->state.svbi; /* fix when resuming, based on target's prim count */
      svbi.MaximumIndex = max_vertex;
   }

   /* initialize the rest of the SVBI's to reasonable values so that we don't
    * run out of room writing the regular data.
    */
   for (int i = 1; i < 4; i++) {
      crocus_emit_cmd(batch, GENX(3DSTATE_GS_SVB_INDEX), svbi) {
         svbi.IndexNumber = i;
         svbi.StreamedVertexBufferIndex = 0;
         svbi.MaximumIndex = 0xffffffff;
      }
   }
}

#endif
4427 
4428 
#if GFX_VER >= 6
/**
 * Will the current draw produce points on screen?  True if either face is
 * point-filled, or if the last geometry stage outputs a point topology.
 */
static bool
crocus_is_drawing_points(const struct crocus_context *ice)
{
   const struct crocus_rasterizer_state *rast = ice->state.cso_rast;

   /* Point fill mode on either face means points regardless of topology. */
   if (rast->cso.fill_front == PIPE_POLYGON_MODE_POINT)
      return true;
   if (rast->cso.fill_back == PIPE_POLYGON_MODE_POINT)
      return true;

   /* Otherwise, the last enabled geometry stage decides. */
   if (ice->shaders.prog[MESA_SHADER_GEOMETRY]) {
      const struct elk_gs_prog_data *gs_data =
         (void *) ice->shaders.prog[MESA_SHADER_GEOMETRY]->prog_data;
      return gs_data->output_topology == _3DPRIM_POINTLIST;
   }

   if (ice->shaders.prog[MESA_SHADER_TESS_EVAL]) {
      const struct elk_tes_prog_data *tes_data =
         (void *) ice->shaders.prog[MESA_SHADER_TESS_EVAL]->prog_data;
      return tes_data->output_topology == INTEL_TESS_OUTPUT_TOPOLOGY_POINT;
   }

   return ice->state.prim_mode == MESA_PRIM_POINTS;
}
#endif
4452 
#if GFX_VER >= 6
/**
 * Fill out one SF_OUTPUT_ATTRIBUTE_DETAIL for a fragment-shader input,
 * mapping the FS attribute to its VUE slot (with constant overrides for
 * unwritten attributes) and enabling two-sided-color swizzling when the
 * next slot holds the matching back-face color.  Updates *max_source_attr
 * with the highest VUE slot the SF will read.
 */
static void
get_attr_override(
   struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) *attr,
   const struct intel_vue_map *vue_map,
   int urb_entry_read_offset, int fs_attr,
   bool two_side_color, uint32_t *max_source_attr)
{
   /* Find the VUE slot for this attribute. */
   int slot = vue_map->varying_to_slot[fs_attr];

   /* Viewport and Layer are stored in the VUE header.  We need to override
    * them to zero if earlier stages didn't write them, as GL requires that
    * they read back as zero when not explicitly set.
    */
   if (fs_attr == VARYING_SLOT_VIEWPORT || fs_attr == VARYING_SLOT_LAYER) {
      attr->ComponentOverrideX = true;
      attr->ComponentOverrideW = true;
      attr->ConstantSource = CONST_0000;

      if (!(vue_map->slots_valid & VARYING_BIT_LAYER))
         attr->ComponentOverrideY = true;
      if (!(vue_map->slots_valid & VARYING_BIT_VIEWPORT))
         attr->ComponentOverrideZ = true;

      return;
   }

   /* If there was only a back color written but not front, use back
    * as the color instead of undefined
    */
   if (slot == -1 && fs_attr == VARYING_SLOT_COL0)
      slot = vue_map->varying_to_slot[VARYING_SLOT_BFC0];
   if (slot == -1 && fs_attr == VARYING_SLOT_COL1)
      slot = vue_map->varying_to_slot[VARYING_SLOT_BFC1];

   if (slot == -1) {
      /* This attribute does not exist in the VUE--that means that the vertex
       * shader did not write to it.  This means that either:
       *
       * (a) This attribute is a texture coordinate, and it is going to be
       * replaced with point coordinates (as a consequence of a call to
       * glTexEnvi(GL_POINT_SPRITE, GL_COORD_REPLACE, GL_TRUE)), so the
       * hardware will ignore whatever attribute override we supply.
       *
       * (b) This attribute is read by the fragment shader but not written by
       * the vertex shader, so its value is undefined.  Therefore the
       * attribute override we supply doesn't matter.
       *
       * (c) This attribute is gl_PrimitiveID, and it wasn't written by the
       * previous shader stage.
       *
       * Note that we don't have to worry about the cases where the attribute
       * is gl_PointCoord or is undergoing point sprite coordinate
       * replacement, because in those cases, this function isn't called.
       *
       * In case (c), we need to program the attribute overrides so that the
       * primitive ID will be stored in this slot.  In every other case, the
       * attribute override we supply doesn't matter.  So just go ahead and
       * program primitive ID in every case.
       */
      attr->ComponentOverrideW = true;
      attr->ComponentOverrideX = true;
      attr->ComponentOverrideY = true;
      attr->ComponentOverrideZ = true;
      attr->ConstantSource = PRIM_ID;
      return;
   }

   /* Compute the location of the attribute relative to urb_entry_read_offset.
    * Each increment of urb_entry_read_offset represents a 256-bit value, so
    * it counts for two 128-bit VUE slots.
    */
   int source_attr = slot - 2 * urb_entry_read_offset;
   assert(source_attr >= 0 && source_attr < 32);

   /* If we are doing two-sided color, and the VUE slot following this one
    * represents a back-facing color, then we need to instruct the SF unit to
    * do back-facing swizzling.
    */
   bool swizzling = two_side_color &&
      ((vue_map->slot_to_varying[slot] == VARYING_SLOT_COL0 &&
        vue_map->slot_to_varying[slot+1] == VARYING_SLOT_BFC0) ||
       (vue_map->slot_to_varying[slot] == VARYING_SLOT_COL1 &&
        vue_map->slot_to_varying[slot+1] == VARYING_SLOT_BFC1));

   /* Update max_source_attr.  If swizzling, the SF will read this slot + 1. */
   if (*max_source_attr < source_attr + swizzling)
      *max_source_attr = source_attr + swizzling;

   attr->SourceAttribute = source_attr;
   if (swizzling)
      attr->SwizzleSelect = INPUTATTR_FACING;
}
4547 
4548 static void
calculate_attr_overrides(const struct crocus_context * ice,struct GENX (SF_OUTPUT_ATTRIBUTE_DETAIL)* attr_overrides,uint32_t * point_sprite_enables,uint32_t * urb_entry_read_length,uint32_t * urb_entry_read_offset)4549 calculate_attr_overrides(
4550    const struct crocus_context *ice,
4551    struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) *attr_overrides,
4552    uint32_t *point_sprite_enables,
4553    uint32_t *urb_entry_read_length,
4554    uint32_t *urb_entry_read_offset)
4555 {
4556    const struct elk_wm_prog_data *wm_prog_data = (void *)
4557       ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data;
4558    const struct intel_vue_map *vue_map = ice->shaders.last_vue_map;
4559    const struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast;
4560    uint32_t max_source_attr = 0;
4561    const struct shader_info *fs_info =
4562       crocus_get_shader_info(ice, MESA_SHADER_FRAGMENT);
4563 
4564    int first_slot =
4565       elk_compute_first_urb_slot_required(fs_info->inputs_read, vue_map);
4566 
4567    /* Each URB offset packs two varying slots */
4568    assert(first_slot % 2 == 0);
4569    *urb_entry_read_offset = first_slot / 2;
4570    *point_sprite_enables = 0;
4571 
4572    for (int fs_attr = 0; fs_attr < VARYING_SLOT_MAX; fs_attr++) {
4573       const int input_index = wm_prog_data->urb_setup[fs_attr];
4574 
4575       if (input_index < 0)
4576          continue;
4577 
4578       bool point_sprite = false;
4579       if (crocus_is_drawing_points(ice)) {
4580          if (fs_attr >= VARYING_SLOT_TEX0 &&
4581              fs_attr <= VARYING_SLOT_TEX7 &&
4582              cso_rast->cso.sprite_coord_enable & (1 << (fs_attr - VARYING_SLOT_TEX0)))
4583             point_sprite = true;
4584 
4585          if (fs_attr == VARYING_SLOT_PNTC)
4586             point_sprite = true;
4587 
4588          if (point_sprite)
4589             *point_sprite_enables |= 1U << input_index;
4590       }
4591 
4592       struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) attribute = { 0 };
4593       if (!point_sprite) {
4594          get_attr_override(&attribute, vue_map, *urb_entry_read_offset, fs_attr,
4595                            cso_rast->cso.light_twoside, &max_source_attr);
4596       }
4597 
4598       /* The hardware can only do the overrides on 16 overrides at a
4599        * time, and the other up to 16 have to be lined up so that the
4600        * input index = the output index.  We'll need to do some
4601        * tweaking to make sure that's the case.
4602        */
4603       if (input_index < 16)
4604          attr_overrides[input_index] = attribute;
4605       else
4606          assert(attribute.SourceAttribute == input_index);
4607    }
4608 
4609    /* From the Sandy Bridge PRM, Volume 2, Part 1, documentation for
4610     * 3DSTATE_SF DWord 1 bits 15:11, "Vertex URB Entry Read Length":
4611     *
4612     * "This field should be set to the minimum length required to read the
4613     *  maximum source attribute.  The maximum source attribute is indicated
4614     *  by the maximum value of the enabled Attribute # Source Attribute if
4615     *  Attribute Swizzle Enable is set, Number of Output Attributes-1 if
4616     *  enable is not set.
4617     *  read_length = ceiling((max_source_attr + 1) / 2)
4618     *
4619     *  [errata] Corruption/Hang possible if length programmed larger than
4620     *  recommended"
4621     *
4622     * Similar text exists for Ivy Bridge.
4623     */
4624    *urb_entry_read_length = DIV_ROUND_UP(max_source_attr + 1, 2);
4625 }
4626 #endif
4627 
4628 #if GFX_VER >= 7
/* Emit 3DSTATE_SBE (setup backend): which FS inputs are flat-shaded,
 * which get point-sprite coordinate replacement, the vertex URB read
 * window, and the per-attribute override table.  On GFX8 the override
 * table moves to a separate 3DSTATE_SBE_SWIZ packet.
 */
4629 static void
crocus_emit_sbe(struct crocus_batch * batch,const struct crocus_context * ice)4630 crocus_emit_sbe(struct crocus_batch *batch, const struct crocus_context *ice)
4631 {
4632    const struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast;
4633    const struct elk_wm_prog_data *wm_prog_data = (void *)
4634       ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data;
4635 #if GFX_VER >= 8
4636    struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) attr_overrides[16] = { { 0 } };
4637 #else
   /* Pre-GFX8 the override table lives directly in the SBE packet, so
    * alias the name onto the packet field used in the emit block below.
    * NOTE(review): the macro is never #undef'd after this function —
    * appears harmless here, but confirm nothing later reuses the name.
    */
4638 #define attr_overrides sbe.Attribute
4639 #endif
4640 
4641    uint32_t urb_entry_read_length;
4642    uint32_t urb_entry_read_offset;
4643    uint32_t point_sprite_enables;
4644 
4645    crocus_emit_cmd(batch, GENX(3DSTATE_SBE), sbe) {
4646       sbe.AttributeSwizzleEnable = true;
4647       sbe.NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs;
4648       sbe.PointSpriteTextureCoordinateOrigin = cso_rast->cso.sprite_coord_mode;
4649 
      /* Fills attr_overrides (packet field or local array, per GFX_VER). */
4650       calculate_attr_overrides(ice,
4651                                attr_overrides,
4652                                &point_sprite_enables,
4653                                &urb_entry_read_length,
4654                                &urb_entry_read_offset);
4655       sbe.VertexURBEntryReadOffset = urb_entry_read_offset;
4656       sbe.VertexURBEntryReadLength = urb_entry_read_length;
4657       sbe.ConstantInterpolationEnable = wm_prog_data->flat_inputs;
4658       sbe.PointSpriteTextureCoordinateEnable = point_sprite_enables;
4659 #if GFX_VER >= 8
4660       sbe.ForceVertexURBEntryReadLength = true;
4661       sbe.ForceVertexURBEntryReadOffset = true;
4662 #endif
4663    }
   /* GFX8 carries the attribute swizzle table in its own packet. */
4664 #if GFX_VER >= 8
4665    crocus_emit_cmd(batch, GENX(3DSTATE_SBE_SWIZ), sbes) {
4666       for (int i = 0; i < 16; i++)
4667          sbes.Attribute[i] = attr_overrides[i];
4668    }
4669 #endif
4670 }
4671 #endif
4672 
4673 /* ------------------------------------------------------------------- */
4674 
4675 /**
4676  * Populate VS program key fields based on the current state.
4677  */
4678 static void
crocus_populate_vs_key(const struct crocus_context * ice,const struct shader_info * info,gl_shader_stage last_stage,struct elk_vs_prog_key * key)4679 crocus_populate_vs_key(const struct crocus_context *ice,
4680                        const struct shader_info *info,
4681                        gl_shader_stage last_stage,
4682                        struct elk_vs_prog_key *key)
4683 {
4684    const struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast;
4685 
4686    if (info->clip_distance_array_size == 0 &&
4687        (info->outputs_written & (VARYING_BIT_POS | VARYING_BIT_CLIP_VERTEX)) &&
4688        last_stage == MESA_SHADER_VERTEX)
4689       key->nr_userclip_plane_consts = cso_rast->num_clip_plane_consts;
4690 
4691    if (last_stage == MESA_SHADER_VERTEX &&
4692        info->outputs_written & (VARYING_BIT_PSIZ))
4693       key->clamp_pointsize = 1;
4694 
4695 #if GFX_VER <= 5
4696    key->copy_edgeflag = (cso_rast->cso.fill_back != PIPE_POLYGON_MODE_FILL ||
4697                          cso_rast->cso.fill_front != PIPE_POLYGON_MODE_FILL);
4698    key->point_coord_replace = cso_rast->cso.sprite_coord_enable & 0xff;
4699 #endif
4700 
4701    key->clamp_vertex_color = cso_rast->cso.clamp_vertex_color;
4702 
4703 #if GFX_VERx10 < 75
4704    uint64_t inputs_read = info->inputs_read;
4705    int ve_idx = 0;
4706    while (inputs_read) {
4707       int i = u_bit_scan64(&inputs_read);
4708       key->gl_attrib_wa_flags[i] = ice->state.cso_vertex_elements->wa_flags[ve_idx];
4709       ve_idx++;
4710    }
4711 #endif
4712 }
4713 
/**
 * Populate TCS program key fields based on the current state.
 *
 * No TCS key field currently depends on dynamic context state, so this
 * is intentionally a no-op (kept for symmetry with the other stages).
 */
4717 static void
crocus_populate_tcs_key(const struct crocus_context * ice,struct elk_tcs_prog_key * key)4718 crocus_populate_tcs_key(const struct crocus_context *ice,
4719                         struct elk_tcs_prog_key *key)
4720 {
4721 }
4722 
4723 /**
4724  * Populate TES program key fields based on the current state.
4725  */
4726 static void
crocus_populate_tes_key(const struct crocus_context * ice,const struct shader_info * info,gl_shader_stage last_stage,struct elk_tes_prog_key * key)4727 crocus_populate_tes_key(const struct crocus_context *ice,
4728                         const struct shader_info *info,
4729                         gl_shader_stage last_stage,
4730                         struct elk_tes_prog_key *key)
4731 {
4732    const struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast;
4733 
4734    if (info->clip_distance_array_size == 0 &&
4735        (info->outputs_written & (VARYING_BIT_POS | VARYING_BIT_CLIP_VERTEX)) &&
4736        last_stage == MESA_SHADER_TESS_EVAL)
4737       key->nr_userclip_plane_consts = cso_rast->num_clip_plane_consts;
4738 
4739    if (last_stage == MESA_SHADER_TESS_EVAL &&
4740        info->outputs_written & (VARYING_BIT_PSIZ))
4741       key->clamp_pointsize = 1;
4742 }
4743 
4744 /**
4745  * Populate GS program key fields based on the current state.
4746  */
4747 static void
crocus_populate_gs_key(const struct crocus_context * ice,const struct shader_info * info,gl_shader_stage last_stage,struct elk_gs_prog_key * key)4748 crocus_populate_gs_key(const struct crocus_context *ice,
4749                        const struct shader_info *info,
4750                        gl_shader_stage last_stage,
4751                        struct elk_gs_prog_key *key)
4752 {
4753    const struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast;
4754 
4755    if (info->clip_distance_array_size == 0 &&
4756        (info->outputs_written & (VARYING_BIT_POS | VARYING_BIT_CLIP_VERTEX)) &&
4757        last_stage == MESA_SHADER_GEOMETRY)
4758       key->nr_userclip_plane_consts = cso_rast->num_clip_plane_consts;
4759 
4760    if (last_stage == MESA_SHADER_GEOMETRY &&
4761        info->outputs_written & (VARYING_BIT_PSIZ))
4762       key->clamp_pointsize = 1;
4763 }
4764 
/**
 * Populate FS program key fields based on the current state.
 *
 * Captures the non-shader state that affects fragment shader compilation
 * (depth/stencil/alpha configuration, line antialiasing, MSAA settings,
 * color clamping, dual-source-blend workaround) so compiled variants can
 * be cached per key.
 */
4768 static void
crocus_populate_fs_key(const struct crocus_context * ice,const struct shader_info * info,struct elk_wm_prog_key * key)4769 crocus_populate_fs_key(const struct crocus_context *ice,
4770                        const struct shader_info *info,
4771                        struct elk_wm_prog_key *key)
4772 {
4773    struct crocus_screen *screen = (void *) ice->ctx.screen;
4774    const struct pipe_framebuffer_state *fb = &ice->state.framebuffer;
4775    const struct crocus_depth_stencil_alpha_state *zsa = ice->state.cso_zsa;
4776    const struct crocus_rasterizer_state *rast = ice->state.cso_rast;
4777    const struct crocus_blend_state *blend = ice->state.cso_blend;
4778 
   /* Pre-GFX6: build the WM "IZ" lookup bitfield describing how the
    * depth/stencil/alpha tests interact with the pixel shader.
    */
4779 #if GFX_VER < 6
4780    uint32_t lookup = 0;
4781 
4782    if (info->fs.uses_discard || zsa->cso.alpha_enabled)
4783       lookup |= ELK_WM_IZ_PS_KILL_ALPHATEST_BIT;
4784 
4785    if (info->outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
4786       lookup |= ELK_WM_IZ_PS_COMPUTES_DEPTH_BIT;
4787 
4788    if (fb->zsbuf && zsa->cso.depth_enabled) {
4789       lookup |= ELK_WM_IZ_DEPTH_TEST_ENABLE_BIT;
4790 
4791       if (zsa->cso.depth_writemask)
4792          lookup |= ELK_WM_IZ_DEPTH_WRITE_ENABLE_BIT;
4793 
4794    }
4795    if (zsa->cso.stencil[0].enabled || zsa->cso.stencil[1].enabled) {
4796       lookup |= ELK_WM_IZ_STENCIL_TEST_ENABLE_BIT;
4797       if (zsa->cso.stencil[0].writemask || zsa->cso.stencil[1].writemask)
4798          lookup |= ELK_WM_IZ_STENCIL_WRITE_ENABLE_BIT;
4799    }
4800    key->iz_lookup = lookup;
4801    key->stats_wm = ice->state.stats_wm;
4802 #endif
4803 
   /* Classify line antialiasing as never/sometimes/always: lines are
    * always AA'd when smoothing is on; triangles only when a polygon
    * mode draws a face as lines and culling can't exclude the other face.
    */
4804    uint32_t line_aa = ELK_NEVER;
4805    if (rast->cso.line_smooth) {
4806       int reduced_prim = ice->state.reduced_prim_mode;
4807       if (reduced_prim == MESA_PRIM_LINES)
4808          line_aa = ELK_ALWAYS;
4809       else if (reduced_prim == MESA_PRIM_TRIANGLES) {
4810          if (rast->cso.fill_front == PIPE_POLYGON_MODE_LINE) {
4811             line_aa = ELK_SOMETIMES;
4812 
4813             if (rast->cso.fill_back == PIPE_POLYGON_MODE_LINE ||
4814                 rast->cso.cull_face == PIPE_FACE_BACK)
4815                line_aa = ELK_ALWAYS;
4816          } else if (rast->cso.fill_back == PIPE_POLYGON_MODE_LINE) {
4817             line_aa = ELK_SOMETIMES;
4818 
4819             if (rast->cso.cull_face == PIPE_FACE_FRONT)
4820                line_aa = ELK_ALWAYS;
4821          }
4822       }
4823    }
4824    key->line_aa = line_aa;
4825 
4826    key->nr_color_regions = fb->nr_cbufs;
4827 
4828    key->clamp_fragment_color = rast->cso.clamp_fragment_color;
4829 
4830    key->alpha_to_coverage = blend->cso.alpha_to_coverage ?
4831       ELK_ALWAYS : ELK_NEVER;
4832 
   /* With multiple color buffers and alpha test on, have the compiler
    * replicate the alpha value used by the test (see field name);
    * NOTE(review): exact replication semantics live in the elk compiler.
    */
4833    key->alpha_test_replicate_alpha = fb->nr_cbufs > 1 && zsa->cso.alpha_enabled;
4834 
4835    key->flat_shade = rast->cso.flatshade &&
4836       (info->inputs_read & (VARYING_BIT_COL0 | VARYING_BIT_COL1));
4837 
4838    const bool multisample_fbo = rast->cso.multisample && fb->samples > 1;
4839    key->multisample_fbo = multisample_fbo ? ELK_ALWAYS : ELK_NEVER;
4840    key->persample_interp =
4841       rast->cso.force_persample_interp ? ELK_ALWAYS : ELK_NEVER;
4842 
4843    key->ignore_sample_mask_out = !multisample_fbo;
4844    key->coherent_fb_fetch = false; // TODO: needed?
4845 
4846    key->force_dual_color_blend =
4847       screen->driconf.dual_color_blend_by_location &&
4848       (blend->blend_enables & 1) && blend->dual_color_blending;
4849 
4850 #if GFX_VER <= 5
   /* Pre-GFX6 with multiple render targets: perform the alpha test in
    * the shader (key fields drive the compiler to emit it).
    */
4851    if (fb->nr_cbufs > 1 && zsa->cso.alpha_enabled) {
4852       key->emit_alpha_test = true;
4853       key->alpha_test_func = zsa->cso.alpha_func;
4854       key->alpha_test_ref = zsa->cso.alpha_ref_value;
4855    }
4856 #endif
4857 }
4858 
/**
 * Populate CS program key fields based on the current state.
 *
 * No CS key field currently depends on dynamic context state, so this is
 * intentionally a no-op.
 */
4859 static void
crocus_populate_cs_key(const struct crocus_context * ice,struct elk_cs_prog_key * key)4860 crocus_populate_cs_key(const struct crocus_context *ice,
4861                        struct elk_cs_prog_key *key)
4862 {
4863 }
4864 
/* KSP: the Kernel Start Pointer for a compiled shader.
 *
 * On GFX4 the pointer is a relocation into the shader cache BO, so KSP
 * is a macro producing a ro_bo() reloc; on GFX5+ it is just the cache
 * offset.
 *
 * NOTE(review): the GFX4 macro expands with a trailing ';' — fine in the
 * current statement-context uses (e.g. INIT_THREAD_DISPATCH_FIELDS),
 * but it would break if KSP were ever used inside an expression.
 */
4865 #if GFX_VER == 4
4866 #define KSP(ice, shader) ro_bo((ice)->shaders.cache_bo, (shader)->offset);
4867 #elif GFX_VER >= 5
4868 static uint64_t
KSP(const struct crocus_context * ice,const struct crocus_compiled_shader * shader)4869 KSP(const struct crocus_context *ice, const struct crocus_compiled_shader *shader)
4870 {
4871    return shader->offset;
4872 }
4873 #endif
4874 
/* Shared initializer for the fixed-function shader stage packets:
 * kernel start pointer, binding table size, URB read layout, and
 * per-thread scratch space setup.
 *
 * NOTE(review): the Gen11 remark below looks inherited from the iris
 * driver; crocus targets older generations — confirm relevance.
 *
 * Gen11 workaround table #2056 WABTPPrefetchDisable suggests to disable
 * prefetching of binding tables in A0 and B0 steppings.  XXX: Revisit
 * this WA on C0 stepping.
 *
 * TODO: Fill out SamplerCount for prefetching?
 */
4881 
4882 #define INIT_THREAD_DISPATCH_FIELDS(pkt, prefix, stage)                 \
4883    pkt.KernelStartPointer = KSP(ice, shader);                           \
4884    pkt.BindingTableEntryCount = shader->bt.size_bytes / 4;              \
4885    pkt.FloatingPointMode = prog_data->use_alt_mode;                     \
4886                                                                         \
4887    pkt.DispatchGRFStartRegisterForURBData =                             \
4888       prog_data->dispatch_grf_start_reg;                                \
4889    pkt.prefix##URBEntryReadLength = vue_prog_data->urb_read_length;     \
4890    pkt.prefix##URBEntryReadOffset = 0;                                  \
4891                                                                         \
4892    pkt.StatisticsEnable = true;                                         \
4893    pkt.Enable           = true;                                         \
4894                                                                         \
4895    if (prog_data->total_scratch) {                                      \
4896       struct crocus_bo *bo =                                            \
4897          crocus_get_scratch_space(ice, prog_data->total_scratch, stage); \
4898       pkt.PerThreadScratchSpace = ffs(prog_data->total_scratch) - 11;   \
4899       pkt.ScratchSpaceBasePointer = rw_bo(bo, 0);                       \
4900    }
4901 
4902 /* ------------------------------------------------------------------- */
4903 #if GFX_VER >= 6
/* Push-constant packet opcodes per shader stage, indexed by
 * gl_shader_stage.  Presumably the 3DSTATE_CONSTANT_* subopcode values —
 * confirm against the PRM.  Compute has no such packet (0).
 */
4904 static const uint32_t push_constant_opcodes[] = {
4905    [MESA_SHADER_VERTEX]    = 21,
4906    [MESA_SHADER_TESS_CTRL] = 25, /* HS */
4907    [MESA_SHADER_TESS_EVAL] = 26, /* DS */
4908    [MESA_SHADER_GEOMETRY]  = 22,
4909    [MESA_SHADER_FRAGMENT]  = 23,
4910    [MESA_SHADER_COMPUTE]   = 0,
4911 };
4912 #endif
4913 
4914 static void
emit_sized_null_surface(struct crocus_batch * batch,unsigned width,unsigned height,unsigned layers,unsigned levels,unsigned minimum_array_element,uint32_t * out_offset)4915 emit_sized_null_surface(struct crocus_batch *batch,
4916                         unsigned width, unsigned height,
4917                         unsigned layers, unsigned levels,
4918                         unsigned minimum_array_element,
4919                         uint32_t *out_offset)
4920 {
4921    struct isl_device *isl_dev = &batch->screen->isl_dev;
4922    uint32_t *surf = stream_state(batch, isl_dev->ss.size,
4923                                  isl_dev->ss.align,
4924                                  out_offset);
4925    //TODO gen 6 multisample crash
4926    isl_null_fill_state(isl_dev, surf,
4927                        .size = isl_extent3d(width, height, layers),
4928                        .levels = levels,
4929                        .minimum_array_element = minimum_array_element);
4930 }
/* Stream a minimal 1x1x1 null SURFACE_STATE (levels = 0); offset of the
 * state is returned via *out_offset.
 */
4931 static void
emit_null_surface(struct crocus_batch * batch,uint32_t * out_offset)4932 emit_null_surface(struct crocus_batch *batch,
4933                   uint32_t *out_offset)
4934 {
4935    emit_sized_null_surface(batch, 1, 1, 1, 0, 0, out_offset);
4936 }
4937 
4938 static void
emit_null_fb_surface(struct crocus_batch * batch,struct crocus_context * ice,uint32_t * out_offset)4939 emit_null_fb_surface(struct crocus_batch *batch,
4940                      struct crocus_context *ice,
4941                      uint32_t *out_offset)
4942 {
4943    uint32_t width, height, layers, level, layer;
4944    /* If set_framebuffer_state() was never called, fall back to 1x1x1 */
4945    if (ice->state.framebuffer.width == 0 && ice->state.framebuffer.height == 0) {
4946       emit_null_surface(batch, out_offset);
4947       return;
4948    }
4949 
4950    struct pipe_framebuffer_state *cso = &ice->state.framebuffer;
4951    width = MAX2(cso->width, 1);
4952    height = MAX2(cso->height, 1);
4953    layers = cso->layers ? cso->layers : 1;
4954    level = 0;
4955    layer = 0;
4956 
4957    if (cso->nr_cbufs == 0 && cso->zsbuf) {
4958       width = cso->zsbuf->width;
4959       height = cso->zsbuf->height;
4960       level = cso->zsbuf->u.tex.level;
4961       layer = cso->zsbuf->u.tex.first_layer;
4962    }
4963    emit_sized_null_surface(batch, width, height,
4964                            layers, level, layer,
4965                            out_offset);
4966 }
4967 
/* Fill a SURFACE_STATE at surf_state (already streamed at batch offset
 * addr_offset) for the given resource/surf/view, emitting the main and
 * (if any) auxiliary-surface relocations.
 *
 * adjust_surf requests rewriting the surf/view to a single-image surface
 * for cases the hardware can't address directly (3D slices, GFX4 cube
 * faces) or retyping 1D arrays as 2D.
 */
4968 static void
emit_surface_state(struct crocus_batch * batch,struct crocus_resource * res,const struct isl_surf * in_surf,bool adjust_surf,struct isl_view * in_view,bool writeable,enum isl_aux_usage aux_usage,bool blend_enable,uint32_t write_disables,uint32_t * surf_state,uint32_t addr_offset)4969 emit_surface_state(struct crocus_batch *batch,
4970                    struct crocus_resource *res,
4971                    const struct isl_surf *in_surf,
4972                    bool adjust_surf,
4973                    struct isl_view *in_view,
4974                    bool writeable,
4975                    enum isl_aux_usage aux_usage,
4976                    bool blend_enable,
4977                    uint32_t write_disables,
4978                    uint32_t *surf_state,
4979                    uint32_t addr_offset)
4980 {
4981    struct isl_device *isl_dev = &batch->screen->isl_dev;
4982    uint32_t reloc = RELOC_32BIT;
4983    uint64_t offset_B = res->offset;
4984    uint32_t tile_x_sa = 0, tile_y_sa = 0;
4985 
4986    if (writeable)
4987       reloc |= RELOC_WRITE;
4988 
   /* Work on local copies: the adjustments below rewrite surf/view to
    * point at a single image within the original surface, updating
    * offset_B and the intra-tile x/y offsets accordingly.
    */
4989    struct isl_surf surf = *in_surf;
4990    struct isl_view view = *in_view;
4991    if (adjust_surf) {
4992       if (res->base.b.target == PIPE_TEXTURE_3D && view.array_len == 1) {
4993          isl_surf_get_image_surf(isl_dev, in_surf,
4994                                  view.base_level, 0,
4995                                  view.base_array_layer,
4996                                  &surf, &offset_B,
4997                                  &tile_x_sa, &tile_y_sa);
4998          view.base_array_layer = 0;
4999          view.base_level = 0;
5000       } else if (res->base.b.target == PIPE_TEXTURE_CUBE && GFX_VER == 4) {
5001          isl_surf_get_image_surf(isl_dev, in_surf,
5002                                  view.base_level, view.base_array_layer,
5003                                  0,
5004                                  &surf, &offset_B,
5005                                  &tile_x_sa, &tile_y_sa);
5006          view.base_array_layer = 0;
5007          view.base_level = 0;
5008       } else if (res->base.b.target == PIPE_TEXTURE_1D_ARRAY)
5009          surf.dim = ISL_SURF_DIM_2D;
5010    }
5011 
   /* Pick up the aux (e.g. MCS/HiZ-style) surface and clear color only
    * when aux is actually in use for this view.
    */
5012    union isl_color_value clear_color = { .u32 = { 0, 0, 0, 0 } };
5013    struct crocus_bo *aux_bo = NULL;
5014    uint32_t aux_offset = 0;
5015    struct isl_surf *aux_surf = NULL;
5016    if (aux_usage != ISL_AUX_USAGE_NONE) {
5017       aux_surf = &res->aux.surf;
5018       aux_offset = res->aux.offset;
5019       aux_bo = res->aux.bo;
5020 
5021       clear_color = crocus_resource_get_clear_color(res);
5022    }
5023 
5024    isl_surf_fill_state(isl_dev, surf_state,
5025                        .surf = &surf,
5026                        .view = &view,
5027                        .address = crocus_state_reloc(batch,
5028                                                      addr_offset + isl_dev->ss.addr_offset,
5029                                                      res->bo, offset_B, reloc),
5030                        .aux_surf = aux_surf,
5031                        .aux_usage = aux_usage,
5032                        .aux_address = aux_offset,
5033                        .mocs = crocus_mocs(res->bo, isl_dev),
5034                        .clear_color = clear_color,
5035                        .use_clear_address = false,
5036                        .clear_address = 0,
5037                        .x_offset_sa = tile_x_sa,
5038                        .y_offset_sa = tile_y_sa,
5039 #if GFX_VER <= 5
5040                        .blend_enable = blend_enable,
5041                        .write_disables = write_disables,
5042 #endif
5043       );
5044 
5045    if (aux_surf) {
5046       /* On gen7 and prior, the upper 20 bits of surface state DWORD 6 are the
5047        * upper 20 bits of the GPU address of the MCS buffer; the lower 12 bits
5048        * contain other control information.  Since buffer addresses are always
5049        * on 4k boundaries (and thus have their lower 12 bits zero), we can use
5050        * an ordinary reloc to do the necessary address translation.
5051        *
5052        * FIXME: move to the point of assignment.
5053        */
      /* Aux address field width differs by gen: 64-bit on GFX8, 32-bit
       * earlier; patch the already-filled state with a relocation.
       */
5054       if (GFX_VER == 8) {
5055          uint64_t *aux_addr = (uint64_t *)(surf_state + (isl_dev->ss.aux_addr_offset / 4));
5056          *aux_addr = crocus_state_reloc(batch,
5057                                         addr_offset + isl_dev->ss.aux_addr_offset,
5058                                         aux_bo, *aux_addr,
5059                                         reloc);
5060       } else {
5061          uint32_t *aux_addr = surf_state + (isl_dev->ss.aux_addr_offset / 4);
5062          *aux_addr = crocus_state_reloc(batch,
5063                                         addr_offset + isl_dev->ss.aux_addr_offset,
5064                                         aux_bo, *aux_addr,
5065                                         reloc);
5066       }
5067    }
5068 
5069 }
5070 
5071 static uint32_t
emit_surface(struct crocus_batch * batch,struct crocus_surface * surf,enum isl_aux_usage aux_usage,bool blend_enable,uint32_t write_disables)5072 emit_surface(struct crocus_batch *batch,
5073              struct crocus_surface *surf,
5074              enum isl_aux_usage aux_usage,
5075              bool blend_enable,
5076              uint32_t write_disables)
5077 {
5078    struct isl_device *isl_dev = &batch->screen->isl_dev;
5079    struct crocus_resource *res = (struct crocus_resource *)surf->base.texture;
5080    struct isl_view *view = &surf->view;
5081    uint32_t offset = 0;
5082    enum pipe_texture_target target = res->base.b.target;
5083    bool adjust_surf = false;
5084 
5085    if (GFX_VER == 4 && target == PIPE_TEXTURE_CUBE)
5086       adjust_surf = true;
5087 
5088    if (surf->align_res)
5089       res = (struct crocus_resource *)surf->align_res;
5090 
5091    uint32_t *surf_state = stream_state(batch, isl_dev->ss.size, isl_dev->ss.align, &offset);
5092 
5093    emit_surface_state(batch, res, &surf->surf, adjust_surf, view, true,
5094                       aux_usage, blend_enable,
5095                       write_disables,
5096                       surf_state, offset);
5097    return offset;
5098 }
5099 
5100 static uint32_t
emit_rt_surface(struct crocus_batch * batch,struct crocus_surface * surf,enum isl_aux_usage aux_usage)5101 emit_rt_surface(struct crocus_batch *batch,
5102                 struct crocus_surface *surf,
5103                 enum isl_aux_usage aux_usage)
5104 {
5105    struct isl_device *isl_dev = &batch->screen->isl_dev;
5106    struct crocus_resource *res = (struct crocus_resource *)surf->base.texture;
5107    struct isl_view *view = &surf->read_view;
5108    uint32_t offset = 0;
5109    uint32_t *surf_state = stream_state(batch, isl_dev->ss.size, isl_dev->ss.align, &offset);
5110 
5111    emit_surface_state(batch, res, &surf->surf, true, view, false,
5112                       aux_usage, 0, false,
5113                       surf_state, offset);
5114    return offset;
5115 }
5116 
/* Stream a RAW buffer SURFACE_STATE covering the compute grid-size
 * buffer and return its offset in the batch state stream.  size_B = 12
 * matches three 32-bit grid dimensions — presumably x/y/z; confirm
 * against the grid_size producer.
 */
5117 static uint32_t
emit_grid(struct crocus_context * ice,struct crocus_batch * batch)5118 emit_grid(struct crocus_context *ice,
5119           struct crocus_batch *batch)
5120 {
5121    UNUSED struct isl_device *isl_dev = &batch->screen->isl_dev;
5122    uint32_t offset = 0;
5123    struct crocus_state_ref *grid_ref = &ice->state.grid_size;
5124    uint32_t *surf_state = stream_state(batch, isl_dev->ss.size,
5125                                        isl_dev->ss.align, &offset);
5126    isl_buffer_fill_state(isl_dev, surf_state,
5127                          .address = crocus_state_reloc(batch, offset + isl_dev->ss.addr_offset,
5128                                                        crocus_resource_bo(grid_ref->res),
5129                                                        grid_ref->offset,
5130                                                        RELOC_32BIT),
5131                          .size_B = 12,
5132                          .format = ISL_FORMAT_RAW,
5133                          .stride_B = 1,
5134                          .mocs = crocus_mocs(crocus_resource_bo(grid_ref->res), isl_dev));
5135    return offset;
5136 }
5137 
/* Stream a buffer SURFACE_STATE for a uniform (constant) buffer binding
 * and return its offset in the batch state stream.
 */
5138 static uint32_t
emit_ubo_buffer(struct crocus_context * ice,struct crocus_batch * batch,struct pipe_constant_buffer * buffer)5139 emit_ubo_buffer(struct crocus_context *ice,
5140                 struct crocus_batch *batch,
5141                 struct pipe_constant_buffer *buffer)
5142 {
5143    UNUSED struct isl_device *isl_dev = &batch->screen->isl_dev;
5144    uint32_t offset = 0;
5145 
5146    uint32_t *surf_state = stream_state(batch, isl_dev->ss.size,
5147                                        isl_dev->ss.align, &offset);
   /* NOTE(review): .format = 0 here, while the sibling emit_ssbo_buffer
    * uses ISL_FORMAT_RAW — confirm this difference is intentional before
    * "fixing" it; the value programs hardware surface format bits.
    */
5148    isl_buffer_fill_state(isl_dev, surf_state,
5149                          .address = crocus_state_reloc(batch, offset + isl_dev->ss.addr_offset,
5150                                                        crocus_resource_bo(buffer->buffer),
5151                                                        buffer->buffer_offset,
5152                                                        RELOC_32BIT),
5153                          .size_B = buffer->buffer_size,
5154                          .format = 0,
5155                          .swizzle = ISL_SWIZZLE_IDENTITY,
5156                          .stride_B = 1,
5157                          .mocs = crocus_mocs(crocus_resource_bo(buffer->buffer), isl_dev));
5158 
5159    return offset;
5160 }
5161 
5162 static uint32_t
emit_ssbo_buffer(struct crocus_context * ice,struct crocus_batch * batch,struct pipe_shader_buffer * buffer,bool writeable)5163 emit_ssbo_buffer(struct crocus_context *ice,
5164                  struct crocus_batch *batch,
5165                  struct pipe_shader_buffer *buffer, bool writeable)
5166 {
5167    UNUSED struct isl_device *isl_dev = &batch->screen->isl_dev;
5168    uint32_t offset = 0;
5169    uint32_t reloc = RELOC_32BIT;
5170 
5171    if (writeable)
5172       reloc |= RELOC_WRITE;
5173    uint32_t *surf_state = stream_state(batch, isl_dev->ss.size,
5174                                        isl_dev->ss.align, &offset);
5175    isl_buffer_fill_state(isl_dev, surf_state,
5176                          .address = crocus_state_reloc(batch, offset + isl_dev->ss.addr_offset,
5177                                                        crocus_resource_bo(buffer->buffer),
5178                                                        buffer->buffer_offset,
5179                                                        reloc),
5180                          .size_B = buffer->buffer_size,
5181                          .format = ISL_FORMAT_RAW,
5182                          .swizzle = ISL_SWIZZLE_IDENTITY,
5183                          .stride_B = 1,
5184                          .mocs = crocus_mocs(crocus_resource_bo(buffer->buffer), isl_dev));
5185 
5186    return offset;
5187 }
5188 
/* Stream a SURFACE_STATE for a sampler view (texture or texel buffer)
 * and return its offset in the batch state stream.  for_gather selects
 * the view variant used for textureGather().
 */
5189 static uint32_t
emit_sampler_view(struct crocus_context * ice,struct crocus_batch * batch,bool for_gather,struct crocus_sampler_view * isv)5190 emit_sampler_view(struct crocus_context *ice,
5191                   struct crocus_batch *batch,
5192                   bool for_gather,
5193                   struct crocus_sampler_view *isv)
5194 {
5195    UNUSED struct isl_device *isl_dev = &batch->screen->isl_dev;
5196    uint32_t offset = 0;
5197 
5198    uint32_t *surf_state = stream_state(batch, isl_dev->ss.size,
5199                                        isl_dev->ss.align, &offset);
5200 
5201    if (isv->base.target == PIPE_BUFFER) {
      /* Texel buffer: clamp the bound size to what actually fits in the
       * BO past the resource offset and to the maximum texel-buffer size.
       */
5202       const struct isl_format_layout *fmtl = isl_format_get_layout(isv->view.format);
5203       const unsigned cpp = isv->view.format == ISL_FORMAT_RAW ? 1 : fmtl->bpb / 8;
5204       unsigned final_size =
5205          MIN3(isv->base.u.buf.size, isv->res->bo->size - isv->res->offset,
5206               CROCUS_MAX_TEXTURE_BUFFER_SIZE * cpp);
5207       isl_buffer_fill_state(isl_dev, surf_state,
5208                             .address = crocus_state_reloc(batch, offset + isl_dev->ss.addr_offset,
5209                                                           isv->res->bo,
5210                                                           isv->res->offset + isv->base.u.buf.offset, RELOC_32BIT),
5211                             .size_B = final_size,
5212                             .format = isv->view.format,
5213                             .swizzle = isv->view.swizzle,
5214                             .stride_B = cpp,
5215                             .mocs = crocus_mocs(isv->res->bo, isl_dev)
5216          );
5217    } else {
      /* Regular texture: read-only surface, with aux usage derived from
       * the resource's current compression state.
       */
5218       enum isl_aux_usage aux_usage =
5219          crocus_resource_texture_aux_usage(isv->res);
5220 
5221       emit_surface_state(batch, isv->res, &isv->res->surf, false,
5222                          for_gather ? &isv->gather_view : &isv->view,
5223                          false, aux_usage, false,
5224                          0, surf_state, offset);
5225    }
5226    return offset;
5227 }
5228 
5229 static uint32_t
emit_image_view(struct crocus_context * ice,struct crocus_batch * batch,struct crocus_image_view * iv)5230 emit_image_view(struct crocus_context *ice,
5231                 struct crocus_batch *batch,
5232                 struct crocus_image_view *iv)
5233 {
5234    UNUSED struct isl_device *isl_dev = &batch->screen->isl_dev;
5235    uint32_t offset = 0;
5236 
5237    struct crocus_resource *res = (struct crocus_resource *)iv->base.resource;
5238    uint32_t *surf_state = stream_state(batch, isl_dev->ss.size,
5239                                        isl_dev->ss.align, &offset);
5240    bool write = iv->base.shader_access & PIPE_IMAGE_ACCESS_WRITE;
5241    uint32_t reloc = RELOC_32BIT | (write ? RELOC_WRITE : 0);
5242    if (res->base.b.target == PIPE_BUFFER) {
5243       const struct isl_format_layout *fmtl = isl_format_get_layout(iv->view.format);
5244       const unsigned cpp = iv->view.format == ISL_FORMAT_RAW ? 1 : fmtl->bpb / 8;
5245       unsigned final_size =
5246          MIN3(iv->base.u.buf.size, res->bo->size - res->offset - iv->base.u.buf.offset,
5247               CROCUS_MAX_TEXTURE_BUFFER_SIZE * cpp);
5248       isl_buffer_fill_state(isl_dev, surf_state,
5249                             .address = crocus_state_reloc(batch, offset + isl_dev->ss.addr_offset,
5250                                                           res->bo,
5251                                                           res->offset + iv->base.u.buf.offset, reloc),
5252                             .size_B = final_size,
5253                             .format = iv->view.format,
5254                             .swizzle = iv->view.swizzle,
5255                             .stride_B = cpp,
5256                             .mocs = crocus_mocs(res->bo, isl_dev)
5257          );
5258    } else {
5259       if (iv->view.format == ISL_FORMAT_RAW) {
5260          isl_buffer_fill_state(isl_dev, surf_state,
5261                                .address = crocus_state_reloc(batch, offset + isl_dev->ss.addr_offset,
5262                                                              res->bo,
5263                                                              res->offset, reloc),
5264                                .size_B = res->bo->size - res->offset,
5265                                .format = iv->view.format,
5266                                .swizzle = iv->view.swizzle,
5267                                .stride_B = 1,
5268                                .mocs = crocus_mocs(res->bo, isl_dev),
5269             );
5270 
5271 
5272       } else {
5273          emit_surface_state(batch, res,
5274                             &res->surf, false, &iv->view,
5275                             write, 0, false,
5276                             0, surf_state, offset);
5277       }
5278    }
5279 
5280    return offset;
5281 }
5282 
#if GFX_VER == 6
/* Emit a SURFACE_STATE used by the Gen6 geometry shader to write transform
 * feedback output number @idx.  Returns the surface state offset, or 0 when
 * the output slot is unused or streamout is inactive.
 */
static uint32_t
emit_sol_surface(struct crocus_batch *batch,
                 struct pipe_stream_output_info *so_info,
                 uint32_t idx)
{
   struct crocus_context *ice = batch->ice;

   if (idx >= so_info->num_outputs || !ice->state.streamout_active)
      return 0;
   const struct pipe_stream_output *output = &so_info->output[idx];
   const int buffer = output->output_buffer;
   assert(output->stream == 0);

   struct crocus_resource *buf = (struct crocus_resource *)ice->state.so_target[buffer]->buffer;
   unsigned stride_dwords = so_info->stride[buffer];
   unsigned offset_dwords = ice->state.so_target[buffer]->buffer_offset / 4 + output->dst_offset;

   /* End of the writable range, in dwords from the start of the buffer. */
   size_t size_dwords = (ice->state.so_target[buffer]->buffer_offset + ice->state.so_target[buffer]->buffer_size) / 4;
   unsigned num_vector_components = output->num_components;
   unsigned num_elements;
   /* FIXME: can we rely on core Mesa to ensure that the buffer isn't
    * too big to map using a single binding table entry?
    */
   //   assert((size_dwords - offset_dwords) / stride_dwords
   //          <= ELK_MAX_NUM_BUFFER_ENTRIES);

   if (size_dwords > offset_dwords + num_vector_components) {
      /* There is room for at least 1 transform feedback output in the buffer.
       * Compute the number of additional transform feedback outputs the
       * buffer has room for.
       */
      num_elements =
         (size_dwords - offset_dwords - num_vector_components);
   } else {
      /* There isn't even room for a single transform feedback output in the
       * buffer.  We can't configure the binding table entry to prevent output
       * entirely; we'll have to rely on the geometry shader to detect
       * overflow.  But to minimize the damage in case of a bug, set up the
       * binding table entry to just allow a single output.
       */
      num_elements = 0;
   }
   num_elements += stride_dwords;

   /* Pick the R32 float format matching the output's component count. */
   uint32_t surface_format;
   switch (num_vector_components) {
   case 1:
      surface_format = ISL_FORMAT_R32_FLOAT;
      break;
   case 2:
      surface_format = ISL_FORMAT_R32G32_FLOAT;
      break;
   case 3:
      surface_format = ISL_FORMAT_R32G32B32_FLOAT;
      break;
   case 4:
      surface_format = ISL_FORMAT_R32G32B32A32_FLOAT;
      break;
   default:
      unreachable("Invalid vector size for transform feedback output");
   }

   UNUSED struct isl_device *isl_dev = &batch->screen->isl_dev;
   uint32_t offset = 0;

   uint32_t *surf_state = stream_state(batch, isl_dev->ss.size,
                                       isl_dev->ss.align, &offset);
   /* The GS writes through this surface, hence RELOC_WRITE. */
   isl_buffer_fill_state(isl_dev, surf_state,
                         .address = crocus_state_reloc(batch, offset + isl_dev->ss.addr_offset,
                                                       crocus_resource_bo(&buf->base.b),
                                                       offset_dwords * 4, RELOC_32BIT|RELOC_WRITE),
                         .size_B = num_elements * 4,
                         .stride_B = stride_dwords * 4,
                         .swizzle = ISL_SWIZZLE_IDENTITY,
                         .format = surface_format);
   return offset;
}
#endif
5362 
/* Iterate over all slots of a binding table group that are actually mapped
 * to a binding table index.  Requires a `struct crocus_binding_table *bt`
 * in scope at the point of use.
 */
#define foreach_surface_used(index, group)                      \
   for (int index = 0; index < bt->sizes[group]; index++)       \
      if (crocus_group_index_to_bti(bt, group, index) !=        \
          CROCUS_SURFACE_NOT_USED)
5367 
/* Upload surface states and fill out the binding table offsets for one
 * shader stage (or the fixed-function GS when @ff_gs is set).
 *
 * Entries are written into shader->surf_offset in group order: render
 * targets (FS), render-target reads, compute work groups, Gen6 SOL
 * surfaces, textures, gather textures (pre-Gen8), images, UBOs, SSBOs.
 * This order must match the binding table layout the shader was compiled
 * with, so the sequence of groups below must not be reordered.
 */
static void
crocus_populate_binding_table(struct crocus_context *ice,
                              struct crocus_batch *batch,
                              gl_shader_stage stage, bool ff_gs)
{
   struct crocus_compiled_shader *shader = ff_gs ? ice->shaders.ff_gs_prog : ice->shaders.prog[stage];
   /* The fixed-function GS has no gallium shader state attached. */
   struct crocus_shader_state *shs = ff_gs ? NULL : &ice->state.shaders[stage];
   if (!shader)
      return;

   struct crocus_binding_table *bt = &shader->bt;
   int s = 0;  /* running binding table slot */
   uint32_t *surf_offsets = shader->surf_offset;

#if GFX_VER < 8
   const struct shader_info *info = crocus_get_shader_info(ice, stage);
#endif

   if (stage == MESA_SHADER_FRAGMENT) {
      struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
      /* Note that cso_fb->nr_cbufs == fs_key->nr_color_regions. */
      if (cso_fb->nr_cbufs) {
         for (unsigned i = 0; i < cso_fb->nr_cbufs; i++) {
            uint32_t write_disables = 0;
            bool blend_enable = false;
#if GFX_VER <= 5
            /* On Gen4/5 color write masking and blending live in the
             * surface state rather than separate blend state, so pull
             * them from the bound blend CSO here.
             */
            const struct pipe_rt_blend_state *rt =
               &ice->state.cso_blend->cso.rt[ice->state.cso_blend->cso.independent_blend_enable ? i : 0];
            struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_FRAGMENT];
            struct elk_wm_prog_data *wm_prog_data = (void *) shader->prog_data;
            write_disables |= (rt->colormask & PIPE_MASK_A) ? 0x0 : 0x8;
            write_disables |= (rt->colormask & PIPE_MASK_R) ? 0x0 : 0x4;
            write_disables |= (rt->colormask & PIPE_MASK_G) ? 0x0 : 0x2;
            write_disables |= (rt->colormask & PIPE_MASK_B) ? 0x0 : 0x1;
            /* Gen4/5 can't handle blending off when a dual src blend wm is enabled. */
            blend_enable = rt->blend_enable || wm_prog_data->dual_src_blend;
#endif
            if (cso_fb->cbufs[i]) {
               surf_offsets[s] = emit_surface(batch,
                                              (struct crocus_surface *)cso_fb->cbufs[i],
                                              ice->state.draw_aux_usage[i],
                                              blend_enable,
                                              write_disables);
            } else {
               emit_null_fb_surface(batch, ice, &surf_offsets[s]);
            }
            s++;
         }
      } else {
         /* No color buffers bound: still emit one null RT surface. */
         emit_null_fb_surface(batch, ice, &surf_offsets[s]);
         s++;
      }

      foreach_surface_used(i, CROCUS_SURFACE_GROUP_RENDER_TARGET_READ) {
         struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
         if (cso_fb->cbufs[i]) {
            surf_offsets[s++] = emit_rt_surface(batch,
                                                (struct crocus_surface *)cso_fb->cbufs[i],
                                                ice->state.draw_aux_usage[i]);
         }
      }
   }

   if (stage == MESA_SHADER_COMPUTE) {
      foreach_surface_used(i, CROCUS_SURFACE_GROUP_CS_WORK_GROUPS) {
         surf_offsets[s] = emit_grid(ice, batch);
         s++;
      }
   }

#if GFX_VER == 6
   if (stage == MESA_SHADER_GEOMETRY) {
      /* Stream output info comes from the GS if present, otherwise from
       * the VS whose outputs are being captured.
       */
      struct pipe_stream_output_info *so_info;
      if (ice->shaders.uncompiled[MESA_SHADER_GEOMETRY])
         so_info = &ice->shaders.uncompiled[MESA_SHADER_GEOMETRY]->stream_output;
      else
         so_info = &ice->shaders.uncompiled[MESA_SHADER_VERTEX]->stream_output;

      foreach_surface_used(i, CROCUS_SURFACE_GROUP_SOL) {
         surf_offsets[s] = emit_sol_surface(batch, so_info, i);
         s++;
      }
   }
#endif

   foreach_surface_used(i, CROCUS_SURFACE_GROUP_TEXTURE) {
      struct crocus_sampler_view *view = shs->textures[i];
      if (view)
         surf_offsets[s] = emit_sampler_view(ice, batch, false, view);
      else
         emit_null_surface(batch, &surf_offsets[s]);
      s++;
   }

#if GFX_VER < 8
   /* Pre-Gen8 needs separate surfaces (with the gather view) for
    * textureGather().
    */
   if (info && info->uses_texture_gather) {
      foreach_surface_used(i, CROCUS_SURFACE_GROUP_TEXTURE_GATHER) {
         struct crocus_sampler_view *view = shs->textures[i];
         if (view)
            surf_offsets[s] = emit_sampler_view(ice, batch, true, view);
         else
            emit_null_surface(batch, &surf_offsets[s]);
         s++;
      }
   }
#endif

   foreach_surface_used(i, CROCUS_SURFACE_GROUP_IMAGE) {
      struct crocus_image_view *view = &shs->image[i];
      if (view->base.resource)
         surf_offsets[s] = emit_image_view(ice, batch, view);
      else
         emit_null_surface(batch, &surf_offsets[s]);
      s++;
   }
   foreach_surface_used(i, CROCUS_SURFACE_GROUP_UBO) {
      if (shs->constbufs[i].buffer)
         surf_offsets[s] = emit_ubo_buffer(ice, batch, &shs->constbufs[i]);
      else
         emit_null_surface(batch, &surf_offsets[s]);
      s++;
   }
   foreach_surface_used(i, CROCUS_SURFACE_GROUP_SSBO) {
      if (shs->ssbo[i].buffer)
         surf_offsets[s] = emit_ssbo_buffer(ice, batch, &shs->ssbo[i],
                                            !!(shs->writable_ssbos & (1 << i)));
      else
         emit_null_surface(batch, &surf_offsets[s]);
      s++;
   }

}
5500 /* ------------------------------------------------------------------- */
/* Upload a finished binding table to the batch's state buffer; returns the
 * table's offset, or 0 for an empty table.
 */
static uint32_t
crocus_upload_binding_table(struct crocus_context *ice,
                            struct crocus_batch *batch,
                            uint32_t *table,
                            uint32_t size)
{
   return size ? emit_state(batch, table, size, 32) : 0;
}
5512 
5513 /**
5514  * Possibly emit STATE_BASE_ADDRESS to update Surface State Base Address.
5515  */
5516 
5517 static void
crocus_update_surface_base_address(struct crocus_batch * batch)5518 crocus_update_surface_base_address(struct crocus_batch *batch)
5519 {
5520    if (batch->state_base_address_emitted)
5521       return;
5522 
5523    UNUSED uint32_t mocs = batch->screen->isl_dev.mocs.internal;
5524 
5525    flush_before_state_base_change(batch);
5526 
5527    crocus_emit_cmd(batch, GENX(STATE_BASE_ADDRESS), sba) {
5528       /* Set base addresses */
5529       sba.GeneralStateBaseAddressModifyEnable = true;
5530 
5531 #if GFX_VER >= 6
5532       sba.DynamicStateBaseAddressModifyEnable = true;
5533       sba.DynamicStateBaseAddress = ro_bo(batch->state.bo, 0);
5534 #endif
5535 
5536       sba.SurfaceStateBaseAddressModifyEnable = true;
5537       sba.SurfaceStateBaseAddress = ro_bo(batch->state.bo, 0);
5538 
5539       sba.IndirectObjectBaseAddressModifyEnable = true;
5540 
5541 #if GFX_VER >= 5
5542       sba.InstructionBaseAddressModifyEnable = true;
5543       sba.InstructionBaseAddress = ro_bo(batch->ice->shaders.cache_bo, 0); // TODO!
5544 #endif
5545 
5546       /* Set buffer sizes on Gen8+ or upper bounds on Gen4-7 */
5547 #if GFX_VER == 8
5548       sba.GeneralStateBufferSize   = 0xfffff;
5549       sba.IndirectObjectBufferSize = 0xfffff;
5550       sba.InstructionBufferSize    = 0xfffff;
5551       sba.DynamicStateBufferSize   = MAX_STATE_SIZE;
5552 
5553       sba.GeneralStateBufferSizeModifyEnable    = true;
5554       sba.DynamicStateBufferSizeModifyEnable    = true;
5555       sba.IndirectObjectBufferSizeModifyEnable  = true;
5556       sba.InstructionBuffersizeModifyEnable     = true;
5557 #else
5558       sba.GeneralStateAccessUpperBoundModifyEnable = true;
5559       sba.IndirectObjectAccessUpperBoundModifyEnable = true;
5560 
5561 #if GFX_VER >= 5
5562       sba.InstructionAccessUpperBoundModifyEnable = true;
5563 #endif
5564 
5565 #if GFX_VER >= 6
5566       /* Dynamic state upper bound.  Although the documentation says that
5567        * programming it to zero will cause it to be ignored, that is a lie.
5568        * If this isn't programmed to a real bound, the sampler border color
5569        * pointer is rejected, causing border color to mysteriously fail.
5570        */
5571       sba.DynamicStateAccessUpperBound = ro_bo(NULL, 0xfffff000);
5572       sba.DynamicStateAccessUpperBoundModifyEnable = true;
5573 #else
5574       /* Same idea but using General State Base Address on Gen4-5 */
5575       sba.GeneralStateAccessUpperBound = ro_bo(NULL, 0xfffff000);
5576 #endif
5577 #endif
5578 
5579 #if GFX_VER >= 6
5580       /* The hardware appears to pay attention to the MOCS fields even
5581        * if you don't set the "Address Modify Enable" bit for the base.
5582        */
5583       sba.GeneralStateMOCS            = mocs;
5584       sba.StatelessDataPortAccessMOCS = mocs;
5585       sba.DynamicStateMOCS            = mocs;
5586       sba.IndirectObjectMOCS          = mocs;
5587       sba.InstructionMOCS             = mocs;
5588       sba.SurfaceStateMOCS            = mocs;
5589 #endif
5590    }
5591 
5592    flush_after_state_base_change(batch);
5593 
5594    /* According to section 3.6.1 of VOL1 of the 965 PRM,
5595     * STATE_BASE_ADDRESS updates require a reissue of:
5596     *
5597     * 3DSTATE_PIPELINE_POINTERS
5598     * 3DSTATE_BINDING_TABLE_POINTERS
5599     * MEDIA_STATE_POINTERS
5600     *
5601     * and this continues through Ironlake.  The Sandy Bridge PRM, vol
5602     * 1 part 1 says that the folowing packets must be reissued:
5603     *
5604     * 3DSTATE_CC_POINTERS
5605     * 3DSTATE_BINDING_TABLE_POINTERS
5606     * 3DSTATE_SAMPLER_STATE_POINTERS
5607     * 3DSTATE_VIEWPORT_STATE_POINTERS
5608     * MEDIA_STATE_POINTERS
5609     *
5610     * Those are always reissued following SBA updates anyway (new
5611     * batch time), except in the case of the program cache BO
5612     * changing.  Having a separate state flag makes the sequence more
5613     * obvious.
5614     */
5615 #if GFX_VER <= 5
5616    batch->ice->state.dirty |= CROCUS_DIRTY_GEN5_PIPELINED_POINTERS | CROCUS_DIRTY_GEN5_BINDING_TABLE_POINTERS;
5617 #elif GFX_VER == 6
5618    batch->ice->state.dirty |= CROCUS_DIRTY_GEN5_BINDING_TABLE_POINTERS | CROCUS_DIRTY_GEN6_SAMPLER_STATE_POINTERS;
5619 #endif
5620    batch->state_base_address_emitted = true;
5621 }
5622 
/* Compute the depth range [*zmin, *zmax] for a viewport.  Window-space
 * positions bypass the viewport transform, so the range is simply [0, 1];
 * otherwise defer to the shared gallium helper.
 */
static inline void
crocus_viewport_zmin_zmax(const struct pipe_viewport_state *vp, bool halfz,
                          bool window_space_position, float *zmin, float *zmax)
{
   if (!window_space_position) {
      util_viewport_zmin_zmax(vp, halfz, zmin, zmax);
      return;
   }

   *zmin = 0.f;
   *zmax = 1.f;
}
5634 
/* Collected push-constant buffer ranges for one shader stage, filled by
 * setup_constant_buffers() and consumed by emit_push_constant_packets().
 */
struct push_bos {
   struct {
      struct crocus_address addr; /* start of the range within its BO */
      uint32_t length;            /* read length, in 256-bit (32B) units */
   } buffers[4];
   int buffer_count;    /* number of valid entries in buffers[] */
   uint32_t max_length; /* largest single range length seen */
};
5643 
5644 #if GFX_VER >= 6
5645 static void
setup_constant_buffers(struct crocus_context * ice,struct crocus_batch * batch,int stage,struct push_bos * push_bos)5646 setup_constant_buffers(struct crocus_context *ice,
5647                        struct crocus_batch *batch,
5648                        int stage,
5649                        struct push_bos *push_bos)
5650 {
5651    struct crocus_shader_state *shs = &ice->state.shaders[stage];
5652    struct crocus_compiled_shader *shader = ice->shaders.prog[stage];
5653    struct elk_stage_prog_data *prog_data = (void *) shader->prog_data;
5654 
5655    uint32_t push_range_sum = 0;
5656 
5657    int n = 0;
5658    for (int i = 0; i < 4; i++) {
5659       const struct elk_ubo_range *range = &prog_data->ubo_ranges[i];
5660 
5661       if (range->length == 0)
5662          continue;
5663 
5664       push_range_sum += range->length;
5665 
5666       if (range->length > push_bos->max_length)
5667          push_bos->max_length = range->length;
5668 
5669       /* Range block is a binding table index, map back to UBO index. */
5670       unsigned block_index = crocus_bti_to_group_index(
5671          &shader->bt, CROCUS_SURFACE_GROUP_UBO, range->block);
5672       assert(block_index != CROCUS_SURFACE_NOT_USED);
5673 
5674       struct pipe_constant_buffer *cbuf = &shs->constbufs[block_index];
5675       struct crocus_resource *res = (void *) cbuf->buffer;
5676 
5677       assert(cbuf->buffer_offset % 32 == 0);
5678 
5679       push_bos->buffers[n].length = range->length;
5680       push_bos->buffers[n].addr =
5681          res ? ro_bo(res->bo, range->start * 32 + cbuf->buffer_offset)
5682          : ro_bo(batch->ice->workaround_bo,
5683                  batch->ice->workaround_offset);
5684       n++;
5685    }
5686 
5687    /* From the 3DSTATE_CONSTANT_XS and 3DSTATE_CONSTANT_ALL programming notes:
5688     *
5689     *    "The sum of all four read length fields must be less than or
5690     *    equal to the size of 64."
5691     */
5692    assert(push_range_sum <= 64);
5693 
5694    push_bos->buffer_count = n;
5695 }
5696 
#if GFX_VER == 7
/* Gen7 VS workaround: a depth-stalling PIPE_CONTROL with a dummy immediate
 * write to the workaround BO, emitted before 3DSTATE_CONSTANT_VS on
 * Ivybridge (see the caller in emit_push_constant_packets).
 */
static void
gen7_emit_vs_workaround_flush(struct crocus_batch *batch)
{
   crocus_emit_pipe_control_write(batch,
                                  "vs workaround",
                                  PIPE_CONTROL_WRITE_IMMEDIATE
                                  | PIPE_CONTROL_DEPTH_STALL,
                                  batch->ice->workaround_bo,
                                  batch->ice->workaround_offset, 0);
}
#endif
5709 
/* Emit the 3DSTATE_CONSTANT_XS packet for @stage, programming the push
 * constant ranges previously gathered into @push_bos.  The VS packet
 * template is reused for all stages by overriding _3DCommandSubOpcode.
 */
static void
emit_push_constant_packets(struct crocus_context *ice,
                           struct crocus_batch *batch,
                           int stage,
                           const struct push_bos *push_bos)
{
   struct crocus_compiled_shader *shader = ice->shaders.prog[stage];
   struct elk_stage_prog_data *prog_data = shader ? (void *) shader->prog_data : NULL;
   UNUSED uint32_t mocs = crocus_mocs(NULL, &batch->screen->isl_dev);

#if GFX_VER == 7
   /* Ivybridge needs a flush before VS push constant changes. */
   if (stage == MESA_SHADER_VERTEX) {
      if (batch->screen->devinfo.platform == INTEL_PLATFORM_IVB)
         gen7_emit_vs_workaround_flush(batch);
   }
#endif
   crocus_emit_cmd(batch, GENX(3DSTATE_CONSTANT_VS), pkt) {
      pkt._3DCommandSubOpcode = push_constant_opcodes[stage];
#if GFX_VER >= 7
#if GFX_VER != 8
      /* MOCS is MBZ on Gen8 so we skip it there */
      pkt.ConstantBody.MOCS = mocs;
#endif

      if (prog_data) {
         /* The Skylake PRM contains the following restriction:
          *
          *    "The driver must ensure The following case does not occur
          *     without a flush to the 3D engine: 3DSTATE_CONSTANT_* with
          *     buffer 3 read length equal to zero committed followed by a
          *     3DSTATE_CONSTANT_* with buffer 0 read length not equal to
          *     zero committed."
          *
          * To avoid this, we program the buffers in the highest slots.
          * This way, slot 0 is only used if slot 3 is also used.
          */
         int n = push_bos->buffer_count;
         assert(n <= 4);
#if GFX_VERx10 >= 75
         const unsigned shift = 4 - n;
#else
         const unsigned shift = 0;
#endif
         for (int i = 0; i < n; i++) {
            pkt.ConstantBody.ReadLength[i + shift] =
               push_bos->buffers[i].length;
            pkt.ConstantBody.Buffer[i + shift] = push_bos->buffers[i].addr;
         }
      }
#else
      /* Gen6 only has a single constant buffer slot in the packet. */
      if (prog_data) {
         int n = push_bos->buffer_count;
         assert (n <= 1);
         if (n == 1) {
            pkt.Buffer0Valid = true;
            pkt.ConstantBody.PointertoConstantBuffer0 = push_bos->buffers[0].addr.offset;
            pkt.ConstantBody.ConstantBuffer0ReadLength = push_bos->buffers[0].length - 1;
         }
      }
#endif
   }
}
5772 
5773 #endif
5774 
/* Per-generation home of the depth/stencil fields filled in by
 * set_depth_stencil_bits(): a 3DSTATE packet on Gen8, a separate
 * DEPTH_STENCIL_STATE structure on Gen6-7, and COLOR_CALC_STATE on Gen4-5.
 */
#if GFX_VER == 8
typedef struct GENX(3DSTATE_WM_DEPTH_STENCIL) DEPTH_STENCIL_GENXML;
#elif GFX_VER >= 6
typedef struct GENX(DEPTH_STENCIL_STATE)      DEPTH_STENCIL_GENXML;
#else
typedef struct GENX(COLOR_CALC_STATE)         DEPTH_STENCIL_GENXML;
#endif
5782 
/* Translate the bound gallium depth/stencil/alpha CSO into the
 * generation-appropriate hardware depth-stencil fields (see the
 * DEPTH_STENCIL_GENXML typedef above for which struct that is).
 */
static inline void
set_depth_stencil_bits(struct crocus_context *ice, DEPTH_STENCIL_GENXML *ds)
{
   struct crocus_depth_stencil_alpha_state *cso = ice->state.cso_zsa;
   ds->DepthTestEnable = cso->cso.depth_enabled;
   ds->DepthBufferWriteEnable = cso->cso.depth_writemask;
   ds->DepthTestFunction = translate_compare_func(cso->cso.depth_func);

   /* stencil[0] is the front face; stencil[1] the back face. */
   ds->StencilFailOp = cso->cso.stencil[0].fail_op;
   ds->StencilPassDepthFailOp = cso->cso.stencil[0].zfail_op;
   ds->StencilPassDepthPassOp = cso->cso.stencil[0].zpass_op;
   ds->StencilTestFunction = translate_compare_func(cso->cso.stencil[0].func);

   ds->StencilTestMask = cso->cso.stencil[0].valuemask;
   ds->StencilWriteMask = cso->cso.stencil[0].writemask;

   ds->BackfaceStencilFailOp = cso->cso.stencil[1].fail_op;
   ds->BackfaceStencilPassDepthFailOp = cso->cso.stencil[1].zfail_op;
   ds->BackfaceStencilPassDepthPassOp = cso->cso.stencil[1].zpass_op;
   ds->BackfaceStencilTestFunction = translate_compare_func(cso->cso.stencil[1].func);

   ds->BackfaceStencilTestMask = cso->cso.stencil[1].valuemask;
   ds->BackfaceStencilWriteMask = cso->cso.stencil[1].writemask;
   ds->DoubleSidedStencilEnable = cso->cso.stencil[1].enabled;
   ds->StencilTestEnable = cso->cso.stencil[0].enabled;
   /* Stencil writes are enabled if either face could actually write. */
   ds->StencilBufferWriteEnable =
      cso->cso.stencil[0].writemask != 0 ||
      (cso->cso.stencil[1].enabled && cso->cso.stencil[1].writemask != 0);
}
5812 
/* Pack one VERTEX_BUFFER_STATE entry into *map and advance *map past it.
 * Per-generation differences: Gen8+ uses an explicit BufferSize, while
 * older parts use an inclusive EndAddress (Gen5+) and a per-buffer
 * access type / instance step rate (pre-Gen8).
 */
static void
emit_vertex_buffer_state(struct crocus_batch *batch,
                         unsigned buffer_id,
                         struct crocus_bo *bo,
                         unsigned start_offset,
                         unsigned end_offset,
                         unsigned stride,
                         unsigned step_rate,
                         uint32_t **map)
{
   const unsigned vb_dwords = GENX(VERTEX_BUFFER_STATE_length);
   _crocus_pack_state(batch, GENX(VERTEX_BUFFER_STATE), *map, vb) {
      vb.BufferStartingAddress = ro_bo(bo, start_offset);
#if GFX_VER >= 8
      vb.BufferSize = end_offset - start_offset;
#endif
      vb.VertexBufferIndex = buffer_id;
      vb.BufferPitch = stride;
#if GFX_VER >= 7
      vb.AddressModifyEnable = true;
#endif
#if GFX_VER >= 6
      vb.MOCS = crocus_mocs(bo, &batch->screen->isl_dev);
#endif
#if GFX_VER < 8
      vb.BufferAccessType = step_rate ? INSTANCEDATA : VERTEXDATA;
      vb.InstanceDataStepRate = step_rate;
#if GFX_VER >= 5
      /* EndAddress is inclusive, hence the -1. */
      vb.EndAddress = ro_bo(bo, end_offset - 1);
#endif
#endif
   }
   *map += vb_dwords;
}
5847 
#if GFX_VER >= 6
/* Compute the 3DSTATE_SAMPLE_MASK value: the user's sample mask limited to
 * the samples the framebuffer actually has, or 1 when not multisampling.
 */
static uint32_t
determine_sample_mask(struct crocus_context *ice)
{
   const uint32_t num_samples = ice->state.framebuffer.samples;

   if (num_samples > 1) {
      /* Mask off bits beyond the framebuffer's sample count. */
      return ice->state.sample_mask & ((1 << num_samples) - 1);
   }

   return 1;
}
#endif
5861 
5862 static void
crocus_upload_dirty_render_state(struct crocus_context * ice,struct crocus_batch * batch,const struct pipe_draw_info * draw)5863 crocus_upload_dirty_render_state(struct crocus_context *ice,
5864                                struct crocus_batch *batch,
5865                                const struct pipe_draw_info *draw)
5866 {
5867    uint64_t dirty = ice->state.dirty;
5868    uint64_t stage_dirty = ice->state.stage_dirty;
5869 
5870    if (!(dirty & CROCUS_ALL_DIRTY_FOR_RENDER) &&
5871        !(stage_dirty & CROCUS_ALL_STAGE_DIRTY_FOR_RENDER))
5872       return;
5873 
5874    if (dirty & CROCUS_DIRTY_VF_STATISTICS) {
5875       crocus_emit_cmd(batch, GENX(3DSTATE_VF_STATISTICS), vf) {
5876          vf.StatisticsEnable = true;
5877       }
5878    }
5879 
5880 #if GFX_VER <= 5
5881    if (stage_dirty & (CROCUS_STAGE_DIRTY_CONSTANTS_VS |
5882                       CROCUS_STAGE_DIRTY_CONSTANTS_FS)) {
5883       bool ret = calculate_curbe_offsets(batch);
5884       if (ret) {
5885          dirty |= CROCUS_DIRTY_GEN4_CURBE | CROCUS_DIRTY_WM | CROCUS_DIRTY_CLIP;
5886          stage_dirty |= CROCUS_STAGE_DIRTY_VS;
5887       }
5888    }
5889 
5890    if (dirty & (CROCUS_DIRTY_GEN4_CURBE | CROCUS_DIRTY_RASTER) ||
5891        stage_dirty & CROCUS_STAGE_DIRTY_VS) {
5892      bool ret = crocus_calculate_urb_fence(batch, ice->curbe.total_size,
5893                                            elk_vue_prog_data(ice->shaders.prog[MESA_SHADER_VERTEX]->prog_data)->urb_entry_size,
5894                                            ((struct elk_sf_prog_data *)ice->shaders.sf_prog->prog_data)->urb_entry_size);
5895      if (ret) {
5896 	dirty |= CROCUS_DIRTY_GEN5_PIPELINED_POINTERS | CROCUS_DIRTY_RASTER | CROCUS_DIRTY_CLIP;
5897 	stage_dirty |= CROCUS_STAGE_DIRTY_GS | CROCUS_STAGE_DIRTY_VS;
5898      }
5899    }
5900 #endif
5901    if (dirty & CROCUS_DIRTY_CC_VIEWPORT) {
5902       const struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast;
5903       uint32_t cc_vp_address;
5904 
5905       /* XXX: could avoid streaming for depth_clip [0,1] case. */
5906       uint32_t *cc_vp_map =
5907          stream_state(batch,
5908                       4 * ice->state.num_viewports *
5909                       GENX(CC_VIEWPORT_length), 32, &cc_vp_address);
5910       for (int i = 0; i < ice->state.num_viewports; i++) {
5911          float zmin, zmax;
5912          crocus_viewport_zmin_zmax(&ice->state.viewports[i], cso_rast->cso.clip_halfz,
5913                                  ice->state.window_space_position,
5914                                  &zmin, &zmax);
5915          if (cso_rast->cso.depth_clip_near)
5916             zmin = 0.0;
5917          if (cso_rast->cso.depth_clip_far)
5918             zmax = 1.0;
5919 
5920          crocus_pack_state(GENX(CC_VIEWPORT), cc_vp_map, ccv) {
5921             ccv.MinimumDepth = zmin;
5922             ccv.MaximumDepth = zmax;
5923          }
5924 
5925          cc_vp_map += GENX(CC_VIEWPORT_length);
5926       }
5927 
5928 #if GFX_VER >= 7
5929       crocus_emit_cmd(batch, GENX(3DSTATE_VIEWPORT_STATE_POINTERS_CC), ptr) {
5930          ptr.CCViewportPointer = cc_vp_address;
5931       }
5932 #elif GFX_VER == 6
5933       crocus_emit_cmd(batch, GENX(3DSTATE_VIEWPORT_STATE_POINTERS), vp) {
5934          vp.CCViewportStateChange = 1;
5935          vp.PointertoCC_VIEWPORT = cc_vp_address;
5936       }
5937 #else
5938       ice->state.cc_vp_address = cc_vp_address;
5939       dirty |= CROCUS_DIRTY_COLOR_CALC_STATE;
5940 #endif
5941    }
5942 
5943    if (dirty & CROCUS_DIRTY_SF_CL_VIEWPORT) {
5944       struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
5945 #if GFX_VER >= 7
5946       uint32_t sf_cl_vp_address;
5947       uint32_t *vp_map =
5948          stream_state(batch,
5949                       4 * ice->state.num_viewports *
5950                       GENX(SF_CLIP_VIEWPORT_length), 64, &sf_cl_vp_address);
5951 #else
5952       uint32_t *vp_map =
5953          stream_state(batch,
5954                       4 * ice->state.num_viewports * GENX(SF_VIEWPORT_length),
5955                       32, &ice->state.sf_vp_address);
5956       uint32_t *clip_map =
5957          stream_state(batch,
5958                       4 * ice->state.num_viewports * GENX(CLIP_VIEWPORT_length),
5959                       32, &ice->state.clip_vp_address);
5960 #endif
5961 
5962       for (unsigned i = 0; i < ice->state.num_viewports; i++) {
5963          const struct pipe_viewport_state *state = &ice->state.viewports[i];
5964          float gb_xmin, gb_xmax, gb_ymin, gb_ymax;
5965 
5966 #if GFX_VER == 8
5967          float vp_xmin = viewport_extent(state, 0, -1.0f);
5968          float vp_xmax = viewport_extent(state, 0,  1.0f);
5969          float vp_ymin = viewport_extent(state, 1, -1.0f);
5970          float vp_ymax = viewport_extent(state, 1,  1.0f);
5971 #endif
5972          intel_calculate_guardband_size(0, cso_fb->width, 0, cso_fb->height,
5973                                         state->scale[0], state->scale[1],
5974                                         state->translate[0], state->translate[1],
5975                                         &gb_xmin, &gb_xmax, &gb_ymin, &gb_ymax);
5976 #if GFX_VER >= 7
5977          crocus_pack_state(GENX(SF_CLIP_VIEWPORT), vp_map, vp)
5978 #else
5979          crocus_pack_state(GENX(SF_VIEWPORT), vp_map, vp)
5980 #endif
5981          {
5982             vp.ViewportMatrixElementm00 = state->scale[0];
5983             vp.ViewportMatrixElementm11 = state->scale[1];
5984             vp.ViewportMatrixElementm22 = state->scale[2];
5985             vp.ViewportMatrixElementm30 = state->translate[0];
5986             vp.ViewportMatrixElementm31 = state->translate[1];
5987             vp.ViewportMatrixElementm32 = state->translate[2];
5988 #if GFX_VER < 6
5989             struct pipe_scissor_state scissor;
5990             crocus_fill_scissor_rect(ice, 0, &scissor);
5991             vp.ScissorRectangle.ScissorRectangleXMin = scissor.minx;
5992             vp.ScissorRectangle.ScissorRectangleXMax = scissor.maxx;
5993             vp.ScissorRectangle.ScissorRectangleYMin = scissor.miny;
5994             vp.ScissorRectangle.ScissorRectangleYMax = scissor.maxy;
5995 #endif
5996 
5997 #if GFX_VER >= 7
5998             vp.XMinClipGuardband = gb_xmin;
5999             vp.XMaxClipGuardband = gb_xmax;
6000             vp.YMinClipGuardband = gb_ymin;
6001             vp.YMaxClipGuardband = gb_ymax;
6002 #endif
6003 #if GFX_VER == 8
6004             vp.XMinViewPort = MAX2(vp_xmin, 0);
6005             vp.XMaxViewPort = MIN2(vp_xmax, cso_fb->width) - 1;
6006             vp.YMinViewPort = MAX2(vp_ymin, 0);
6007             vp.YMaxViewPort = MIN2(vp_ymax, cso_fb->height) - 1;
6008 #endif
6009          }
6010 #if GFX_VER < 7
6011          crocus_pack_state(GENX(CLIP_VIEWPORT), clip_map, clip) {
6012             clip.XMinClipGuardband = gb_xmin;
6013             clip.XMaxClipGuardband = gb_xmax;
6014             clip.YMinClipGuardband = gb_ymin;
6015             clip.YMaxClipGuardband = gb_ymax;
6016          }
6017 #endif
6018 #if GFX_VER >= 7
6019          vp_map += GENX(SF_CLIP_VIEWPORT_length);
6020 #else
6021          vp_map += GENX(SF_VIEWPORT_length);
6022          clip_map += GENX(CLIP_VIEWPORT_length);
6023 #endif
6024       }
6025 #if GFX_VER >= 7
6026       crocus_emit_cmd(batch, GENX(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP), ptr) {
6027          ptr.SFClipViewportPointer = sf_cl_vp_address;
6028       }
6029 #elif GFX_VER == 6
6030       crocus_emit_cmd(batch, GENX(3DSTATE_VIEWPORT_STATE_POINTERS), vp) {
6031          vp.SFViewportStateChange = 1;
6032          vp.CLIPViewportStateChange = 1;
6033          vp.PointertoCLIP_VIEWPORT = ice->state.clip_vp_address;
6034          vp.PointertoSF_VIEWPORT = ice->state.sf_vp_address;
6035       }
6036 #endif
6037    }
6038 
6039 #if GFX_VER >= 6
6040    if (dirty & CROCUS_DIRTY_GEN6_URB) {
6041 #if GFX_VER == 6
6042       bool gs_present = ice->shaders.prog[MESA_SHADER_GEOMETRY] != NULL
6043          || ice->shaders.ff_gs_prog;
6044 
6045       struct elk_vue_prog_data *vue_prog_data =
6046          (void *) ice->shaders.prog[MESA_SHADER_VERTEX]->prog_data;
6047       const unsigned vs_size = vue_prog_data->urb_entry_size;
6048       unsigned gs_size = vs_size;
6049       if (ice->shaders.prog[MESA_SHADER_GEOMETRY]) {
6050          struct elk_vue_prog_data *gs_vue_prog_data =
6051             (void *) ice->shaders.prog[MESA_SHADER_GEOMETRY]->prog_data;
6052          gs_size = gs_vue_prog_data->urb_entry_size;
6053       }
6054 
6055       genX(crocus_upload_urb)(batch, vs_size, gs_present, gs_size);
6056 #endif
6057 #if GFX_VER >= 7
6058       const struct intel_device_info *devinfo = &batch->screen->devinfo;
6059       bool gs_present = ice->shaders.prog[MESA_SHADER_GEOMETRY] != NULL;
6060       bool tess_present = ice->shaders.prog[MESA_SHADER_TESS_EVAL] != NULL;
6061       struct intel_urb_config urb_cfg;
6062 
6063       for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++) {
6064          if (!ice->shaders.prog[i]) {
6065             urb_cfg.size[i] = 1;
6066          } else {
6067             struct elk_vue_prog_data *vue_prog_data =
6068                (void *) ice->shaders.prog[i]->prog_data;
6069             urb_cfg.size[i] = vue_prog_data->urb_entry_size;
6070          }
6071          assert(urb_cfg.size[i] != 0);
6072       }
6073 
6074       /* If we're just switching between programs with the same URB requirements,
6075        * skip the rest of the logic.
6076        */
6077       bool no_change = false;
6078       if (ice->urb.vsize == urb_cfg.size[MESA_SHADER_VERTEX] &&
6079           ice->urb.gs_present == gs_present &&
6080           ice->urb.gsize == urb_cfg.size[MESA_SHADER_GEOMETRY] &&
6081           ice->urb.tess_present == tess_present &&
6082           ice->urb.hsize == urb_cfg.size[MESA_SHADER_TESS_CTRL] &&
6083           ice->urb.dsize == urb_cfg.size[MESA_SHADER_TESS_EVAL]) {
6084          no_change = true;
6085       }
6086 
6087       if (!no_change) {
6088          ice->urb.vsize = urb_cfg.size[MESA_SHADER_VERTEX];
6089          ice->urb.gs_present = gs_present;
6090          ice->urb.gsize = urb_cfg.size[MESA_SHADER_GEOMETRY];
6091          ice->urb.tess_present = tess_present;
6092          ice->urb.hsize = urb_cfg.size[MESA_SHADER_TESS_CTRL];
6093          ice->urb.dsize = urb_cfg.size[MESA_SHADER_TESS_EVAL];
6094 
6095          bool constrained;
6096          intel_get_urb_config(devinfo,
6097                               batch->screen->l3_config_3d,
6098                               tess_present,
6099                               gs_present,
6100                               &urb_cfg, NULL, &constrained);
6101 
6102 #if GFX_VER == 7
6103          if (devinfo->platform == INTEL_PLATFORM_IVB)
6104             gen7_emit_vs_workaround_flush(batch);
6105 #endif
6106          for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++) {
6107             crocus_emit_cmd(batch, GENX(3DSTATE_URB_VS), urb) {
6108                urb._3DCommandSubOpcode += i;
6109                urb.VSURBStartingAddress     = urb_cfg.start[i];
6110                urb.VSURBEntryAllocationSize = urb_cfg.size[i] - 1;
6111                urb.VSNumberofURBEntries     = urb_cfg.entries[i];
6112             }
6113          }
6114       }
6115 #endif
6116    }
6117 
6118    if (dirty & CROCUS_DIRTY_GEN6_BLEND_STATE) {
6119       struct crocus_blend_state *cso_blend = ice->state.cso_blend;
6120       struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
6121       struct crocus_depth_stencil_alpha_state *cso_zsa = ice->state.cso_zsa;
6122 
6123       STATIC_ASSERT(GENX(BLEND_STATE_ENTRY_length) == 2);
6124       int rt_dwords =
6125          MAX2(cso_fb->nr_cbufs, 1) * GENX(BLEND_STATE_ENTRY_length);
6126 #if GFX_VER >= 8
6127       rt_dwords += GENX(BLEND_STATE_length);
6128 #endif
6129       uint32_t blend_offset;
6130       uint32_t *blend_map =
6131          stream_state(batch,
6132                       4 * rt_dwords, 64, &blend_offset);
6133 
6134 #if GFX_VER >= 8
6135    struct GENX(BLEND_STATE) be = { 0 };
6136    {
6137 #else
6138    for (int i = 0; i < ELK_MAX_DRAW_BUFFERS; i++) {
6139       struct GENX(BLEND_STATE_ENTRY) entry = { 0 };
6140 #define be entry
6141 #endif
6142 
6143       be.AlphaTestEnable = cso_zsa->cso.alpha_enabled;
6144       be.AlphaTestFunction = translate_compare_func(cso_zsa->cso.alpha_func);
6145       be.AlphaToCoverageEnable = cso_blend->cso.alpha_to_coverage;
6146       be.AlphaToOneEnable = cso_blend->cso.alpha_to_one;
6147       be.AlphaToCoverageDitherEnable = GFX_VER >= 7 && cso_blend->cso.alpha_to_coverage_dither;
6148       be.ColorDitherEnable = cso_blend->cso.dither;
6149 
6150 #if GFX_VER >= 8
6151       for (int i = 0; i < ELK_MAX_DRAW_BUFFERS; i++) {
6152          struct GENX(BLEND_STATE_ENTRY) entry = { 0 };
6153 #else
6154       {
6155 #endif
6156          const struct pipe_rt_blend_state *rt =
6157             &cso_blend->cso.rt[cso_blend->cso.independent_blend_enable ? i : 0];
6158 
6159          be.IndependentAlphaBlendEnable = set_blend_entry_bits(batch, &entry, cso_blend, i) ||
6160             be.IndependentAlphaBlendEnable;
6161 
6162          if (GFX_VER >= 8 || can_emit_logic_op(ice)) {
6163             entry.LogicOpEnable = cso_blend->cso.logicop_enable;
6164             entry.LogicOpFunction = cso_blend->cso.logicop_func;
6165          }
6166 
6167          entry.ColorClampRange = COLORCLAMP_RTFORMAT;
6168          entry.PreBlendColorClampEnable = true;
6169          entry.PostBlendColorClampEnable = true;
6170 
6171          entry.WriteDisableRed   = !(rt->colormask & PIPE_MASK_R);
6172          entry.WriteDisableGreen = !(rt->colormask & PIPE_MASK_G);
6173          entry.WriteDisableBlue  = !(rt->colormask & PIPE_MASK_B);
6174          entry.WriteDisableAlpha = !(rt->colormask & PIPE_MASK_A);
6175 
6176 #if GFX_VER >= 8
6177          GENX(BLEND_STATE_ENTRY_pack)(NULL, &blend_map[1 + i * 2], &entry);
6178 #else
6179          GENX(BLEND_STATE_ENTRY_pack)(NULL, &blend_map[i * 2], &entry);
6180 #endif
6181       }
6182    }
6183 #if GFX_VER >= 8
6184    GENX(BLEND_STATE_pack)(NULL, blend_map, &be);
6185 #endif
6186 #if GFX_VER < 7
6187       crocus_emit_cmd(batch, GENX(3DSTATE_CC_STATE_POINTERS), ptr) {
6188          ptr.PointertoBLEND_STATE = blend_offset;
6189          ptr.BLEND_STATEChange = true;
6190       }
6191 #else
6192       crocus_emit_cmd(batch, GENX(3DSTATE_BLEND_STATE_POINTERS), ptr) {
6193          ptr.BlendStatePointer = blend_offset;
6194 #if GFX_VER >= 8
6195          ptr.BlendStatePointerValid = true;
6196 #endif
6197       }
6198 #endif
6199    }
6200 #endif
6201 
6202    if (dirty & CROCUS_DIRTY_COLOR_CALC_STATE) {
6203       struct crocus_depth_stencil_alpha_state *cso = ice->state.cso_zsa;
6204       UNUSED struct crocus_blend_state *cso_blend = ice->state.cso_blend;
6205       struct pipe_stencil_ref *p_stencil_refs = &ice->state.stencil_ref;
6206       uint32_t cc_offset;
6207       void *cc_map =
6208          stream_state(batch,
6209                       sizeof(uint32_t) * GENX(COLOR_CALC_STATE_length),
6210                       64, &cc_offset);
6211 #if GFX_VER <= 5
6212       dirty |= CROCUS_DIRTY_GEN5_PIPELINED_POINTERS;
6213 #endif
6214       _crocus_pack_state(batch, GENX(COLOR_CALC_STATE), cc_map, cc) {
6215          cc.AlphaTestFormat = ALPHATEST_FLOAT32;
6216          cc.AlphaReferenceValueAsFLOAT32 = cso->cso.alpha_ref_value;
6217 
6218 #if GFX_VER <= 5
6219 
6220          set_depth_stencil_bits(ice, &cc);
6221 
6222          if (cso_blend->cso.logicop_enable) {
6223             if (can_emit_logic_op(ice)) {
6224                cc.LogicOpEnable = cso_blend->cso.logicop_enable;
6225                cc.LogicOpFunction = cso_blend->cso.logicop_func;
6226             }
6227          }
6228          cc.ColorDitherEnable = cso_blend->cso.dither;
6229 
6230          cc.IndependentAlphaBlendEnable = set_blend_entry_bits(batch, &cc, cso_blend, 0);
6231 
6232          if (cso->cso.alpha_enabled && ice->state.framebuffer.nr_cbufs <= 1) {
6233             cc.AlphaTestEnable = cso->cso.alpha_enabled;
6234             cc.AlphaTestFunction = translate_compare_func(cso->cso.alpha_func);
6235          }
6236          cc.StatisticsEnable = ice->state.stats_wm ? 1 : 0;
6237          cc.CCViewportStatePointer = ro_bo(batch->state.bo, ice->state.cc_vp_address);
6238 #else
6239          cc.AlphaTestFormat = ALPHATEST_FLOAT32;
6240          cc.AlphaReferenceValueAsFLOAT32 = cso->cso.alpha_ref_value;
6241 
6242          cc.BlendConstantColorRed   = ice->state.blend_color.color[0];
6243          cc.BlendConstantColorGreen = ice->state.blend_color.color[1];
6244          cc.BlendConstantColorBlue  = ice->state.blend_color.color[2];
6245          cc.BlendConstantColorAlpha = ice->state.blend_color.color[3];
6246 #endif
6247          cc.StencilReferenceValue = p_stencil_refs->ref_value[0];
6248          cc.BackfaceStencilReferenceValue = p_stencil_refs->ref_value[1];
6249       }
6250       ice->shaders.cc_offset = cc_offset;
6251 #if GFX_VER >= 6
6252       crocus_emit_cmd(batch, GENX(3DSTATE_CC_STATE_POINTERS), ptr) {
6253          ptr.ColorCalcStatePointer = cc_offset;
6254 #if GFX_VER != 7
6255          ptr.ColorCalcStatePointerValid = true;
6256 #endif
6257       }
6258 #endif
6259    }
6260 #if GFX_VER <= 5
6261    if (dirty & CROCUS_DIRTY_GEN4_CONSTANT_COLOR) {
6262       crocus_emit_cmd(batch, GENX(3DSTATE_CONSTANT_COLOR), blend_cc) {
6263          blend_cc.BlendConstantColorRed = ice->state.blend_color.color[0];
6264          blend_cc.BlendConstantColorGreen = ice->state.blend_color.color[1];
6265          blend_cc.BlendConstantColorBlue = ice->state.blend_color.color[2];
6266          blend_cc.BlendConstantColorAlpha = ice->state.blend_color.color[3];
6267       }
6268    }
6269 #endif
6270    for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
6271       if (!(stage_dirty & (CROCUS_STAGE_DIRTY_CONSTANTS_VS << stage)))
6272          continue;
6273 
6274       struct crocus_shader_state *shs = &ice->state.shaders[stage];
6275       struct crocus_compiled_shader *shader = ice->shaders.prog[stage];
6276 
6277       if (!shader)
6278          continue;
6279 
6280       if (shs->sysvals_need_upload)
6281          upload_sysvals(ice, stage);
6282 
6283 #if GFX_VER <= 5
6284       dirty |= CROCUS_DIRTY_GEN4_CURBE;
6285 #endif
6286 #if GFX_VER >= 7
6287       struct push_bos push_bos = {};
6288       setup_constant_buffers(ice, batch, stage, &push_bos);
6289 
6290       emit_push_constant_packets(ice, batch, stage, &push_bos);
6291 #endif
6292    }
6293 
6294    for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
6295       if (stage_dirty & (CROCUS_STAGE_DIRTY_BINDINGS_VS << stage)) {
6296          if (ice->shaders.prog[stage]) {
6297 #if GFX_VER <= 6
6298             dirty |= CROCUS_DIRTY_GEN5_BINDING_TABLE_POINTERS;
6299 #endif
6300             crocus_populate_binding_table(ice, batch, stage, false);
6301             ice->shaders.prog[stage]->bind_bo_offset =
6302                crocus_upload_binding_table(ice, batch,
6303                                            ice->shaders.prog[stage]->surf_offset,
6304                                            ice->shaders.prog[stage]->bt.size_bytes);
6305 
6306 #if GFX_VER >= 7
6307             crocus_emit_cmd(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS_VS), ptr) {
6308                ptr._3DCommandSubOpcode = 38 + stage;
6309                ptr.PointertoVSBindingTable = ice->shaders.prog[stage]->bind_bo_offset;
6310             }
6311 #endif
6312 #if GFX_VER == 6
6313          } else if (stage == MESA_SHADER_GEOMETRY && ice->shaders.ff_gs_prog) {
6314             dirty |= CROCUS_DIRTY_GEN5_BINDING_TABLE_POINTERS;
6315             crocus_populate_binding_table(ice, batch, stage, true);
6316             ice->shaders.ff_gs_prog->bind_bo_offset =
6317                crocus_upload_binding_table(ice, batch,
6318                                            ice->shaders.ff_gs_prog->surf_offset,
6319                                            ice->shaders.ff_gs_prog->bt.size_bytes);
6320 #endif
6321          }
6322       }
6323    }
6324 #if GFX_VER <= 6
6325    if (dirty & CROCUS_DIRTY_GEN5_BINDING_TABLE_POINTERS) {
6326       struct crocus_compiled_shader *gs = ice->shaders.prog[MESA_SHADER_GEOMETRY];
6327       if (gs == NULL)
6328          gs = ice->shaders.ff_gs_prog;
6329       crocus_emit_cmd(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS), ptr) {
6330          ptr.PointertoVSBindingTable = ice->shaders.prog[MESA_SHADER_VERTEX]->bind_bo_offset;
6331          ptr.PointertoPSBindingTable = ice->shaders.prog[MESA_SHADER_FRAGMENT]->bind_bo_offset;
6332 #if GFX_VER == 6
6333          ptr.VSBindingTableChange = true;
6334          ptr.PSBindingTableChange = true;
6335          ptr.GSBindingTableChange = gs ? true : false;
6336          ptr.PointertoGSBindingTable = gs ? gs->bind_bo_offset : 0;
6337 #endif
6338       }
6339    }
6340 #endif
6341 
6342    bool sampler_updates = dirty & CROCUS_DIRTY_GEN6_SAMPLER_STATE_POINTERS;
6343    for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
6344       if (!(stage_dirty & (CROCUS_STAGE_DIRTY_SAMPLER_STATES_VS << stage)) ||
6345           !ice->shaders.prog[stage])
6346          continue;
6347 
6348       crocus_upload_sampler_states(ice, batch, stage);
6349 
6350       sampler_updates = true;
6351 
6352 #if GFX_VER >= 7
6353       struct crocus_shader_state *shs = &ice->state.shaders[stage];
6354 
6355       crocus_emit_cmd(batch, GENX(3DSTATE_SAMPLER_STATE_POINTERS_VS), ptr) {
6356          ptr._3DCommandSubOpcode = 43 + stage;
6357          ptr.PointertoVSSamplerState = shs->sampler_offset;
6358       }
6359 #endif
6360    }
6361 
6362    if (sampler_updates) {
6363 #if GFX_VER == 6
6364       struct crocus_shader_state *shs_vs = &ice->state.shaders[MESA_SHADER_VERTEX];
6365       struct crocus_shader_state *shs_gs = &ice->state.shaders[MESA_SHADER_GEOMETRY];
6366       struct crocus_shader_state *shs_fs = &ice->state.shaders[MESA_SHADER_FRAGMENT];
6367       crocus_emit_cmd(batch, GENX(3DSTATE_SAMPLER_STATE_POINTERS), ptr) {
6368          if (ice->shaders.prog[MESA_SHADER_VERTEX] &&
6369              (dirty & CROCUS_DIRTY_GEN6_SAMPLER_STATE_POINTERS ||
6370               stage_dirty & (CROCUS_STAGE_DIRTY_SAMPLER_STATES_VS << MESA_SHADER_VERTEX))) {
6371             ptr.VSSamplerStateChange = true;
6372             ptr.PointertoVSSamplerState = shs_vs->sampler_offset;
6373          }
6374          if (ice->shaders.prog[MESA_SHADER_GEOMETRY] &&
6375              (dirty & CROCUS_DIRTY_GEN6_SAMPLER_STATE_POINTERS ||
6376               stage_dirty & (CROCUS_STAGE_DIRTY_SAMPLER_STATES_VS << MESA_SHADER_GEOMETRY))) {
6377             ptr.GSSamplerStateChange = true;
6378             ptr.PointertoGSSamplerState = shs_gs->sampler_offset;
6379          }
6380          if (ice->shaders.prog[MESA_SHADER_FRAGMENT] &&
6381              (dirty & CROCUS_DIRTY_GEN6_SAMPLER_STATE_POINTERS ||
6382               stage_dirty & (CROCUS_STAGE_DIRTY_SAMPLER_STATES_VS << MESA_SHADER_FRAGMENT))) {
6383             ptr.PSSamplerStateChange = true;
6384             ptr.PointertoPSSamplerState = shs_fs->sampler_offset;
6385          }
6386       }
6387 #endif
6388    }
6389 
6390 #if GFX_VER >= 6
6391    if (dirty & CROCUS_DIRTY_GEN6_MULTISAMPLE) {
6392       crocus_emit_cmd(batch, GENX(3DSTATE_MULTISAMPLE), ms) {
6393          ms.PixelLocation =
6394             ice->state.cso_rast->cso.half_pixel_center ? CENTER : UL_CORNER;
6395          if (ice->state.framebuffer.samples > 0)
6396             ms.NumberofMultisamples = ffs(ice->state.framebuffer.samples) - 1;
6397 #if GFX_VER == 6
6398          INTEL_SAMPLE_POS_4X(ms.Sample);
6399 #elif GFX_VER == 7
6400          switch (ice->state.framebuffer.samples) {
6401          case 1:
6402             INTEL_SAMPLE_POS_1X(ms.Sample);
6403             break;
6404          case 2:
6405             INTEL_SAMPLE_POS_2X(ms.Sample);
6406             break;
6407          case 4:
6408             INTEL_SAMPLE_POS_4X(ms.Sample);
6409             break;
6410          case 8:
6411             INTEL_SAMPLE_POS_8X(ms.Sample);
6412             break;
6413          default:
6414             break;
6415          }
6416 #endif
6417       }
6418    }
6419 
6420    if (dirty & CROCUS_DIRTY_GEN6_SAMPLE_MASK) {
6421       crocus_emit_cmd(batch, GENX(3DSTATE_SAMPLE_MASK), ms) {
6422          ms.SampleMask = determine_sample_mask(ice);
6423       }
6424    }
6425 #endif
6426 
6427 #if GFX_VER >= 7
6428    struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_FRAGMENT];
6429    if ((stage_dirty & CROCUS_STAGE_DIRTY_FS) && shader) {
6430       struct elk_stage_prog_data *prog_data = shader->prog_data;
6431       struct elk_wm_prog_data *wm_prog_data = (void *) shader->prog_data;
6432 
6433       crocus_emit_cmd(batch, GENX(3DSTATE_PS), ps) {
6434 
6435          /* Initialize the execution mask with VMask.  Otherwise, derivatives are
6436           * incorrect for subspans where some of the pixels are unlit.  We believe
6437           * the bit just didn't take effect in previous generations.
6438           */
6439          ps.VectorMaskEnable = GFX_VER >= 8 && wm_prog_data->uses_vmask;
6440 
6441          intel_set_ps_dispatch_state(&ps, &batch->screen->devinfo,
6442                                      wm_prog_data,
6443                                      ice->state.framebuffer.samples,
6444                                      0 /* msaa_flags */);
6445 
6446          ps.DispatchGRFStartRegisterForConstantSetupData0 =
6447             elk_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 0);
6448          ps.DispatchGRFStartRegisterForConstantSetupData1 =
6449             elk_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 1);
6450          ps.DispatchGRFStartRegisterForConstantSetupData2 =
6451             elk_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 2);
6452 
6453          ps.KernelStartPointer0 = KSP(ice, shader) +
6454             elk_wm_prog_data_prog_offset(wm_prog_data, ps, 0);
6455          ps.KernelStartPointer1 = KSP(ice, shader) +
6456             elk_wm_prog_data_prog_offset(wm_prog_data, ps, 1);
6457          ps.KernelStartPointer2 = KSP(ice, shader) +
6458             elk_wm_prog_data_prog_offset(wm_prog_data, ps, 2);
6459 
6460 #if GFX_VERx10 == 75
6461          ps.SampleMask = determine_sample_mask(ice);
6462 #endif
6463          // XXX: WABTPPrefetchDisable, see above, drop at C0
6464          ps.BindingTableEntryCount = shader->bt.size_bytes / 4;
6465          ps.FloatingPointMode = prog_data->use_alt_mode;
6466 #if GFX_VER >= 8
6467          ps.MaximumNumberofThreadsPerPSD =
6468             batch->screen->devinfo.max_threads_per_psd - 2;
6469 #else
6470          ps.MaximumNumberofThreads = batch->screen->devinfo.max_wm_threads - 1;
6471 #endif
6472 
6473          ps.PushConstantEnable = prog_data->ubo_ranges[0].length > 0;
6474 
6475 #if GFX_VER < 8
6476          ps.oMaskPresenttoRenderTarget = wm_prog_data->uses_omask;
6477          ps.DualSourceBlendEnable = wm_prog_data->dual_src_blend && ice->state.cso_blend->dual_color_blending;
6478          ps.AttributeEnable = (wm_prog_data->num_varying_inputs != 0);
6479 #endif
6480          /* From the documentation for this packet:
6481           * "If the PS kernel does not need the Position XY Offsets to
6482           *  compute a Position Value, then this field should be programmed
6483           *  to POSOFFSET_NONE."
6484           *
6485           * "SW Recommendation: If the PS kernel needs the Position Offsets
6486           *  to compute a Position XY value, this field should match Position
6487           *  ZW Interpolation Mode to ensure a consistent position.xyzw
6488           *  computation."
6489           *
6490           * We only require XY sample offsets. So, this recommendation doesn't
6491           * look useful at the moment.  We might need this in future.
6492           */
6493          ps.PositionXYOffsetSelect =
6494             wm_prog_data->uses_pos_offset ? POSOFFSET_SAMPLE : POSOFFSET_NONE;
6495 
6496          if (wm_prog_data->base.total_scratch) {
6497             struct crocus_bo *bo = crocus_get_scratch_space(ice, wm_prog_data->base.total_scratch, MESA_SHADER_FRAGMENT);
6498             ps.PerThreadScratchSpace = ffs(wm_prog_data->base.total_scratch) - 11;
6499             ps.ScratchSpaceBasePointer = rw_bo(bo, 0);
6500          }
6501       }
6502 #if GFX_VER == 8
6503       const struct shader_info *fs_info =
6504          crocus_get_shader_info(ice, MESA_SHADER_FRAGMENT);
6505       crocus_emit_cmd(batch, GENX(3DSTATE_PS_EXTRA), psx) {
6506          psx.PixelShaderValid = true;
6507          psx.PixelShaderComputedDepthMode = wm_prog_data->computed_depth_mode;
6508          psx.PixelShaderKillsPixel = wm_prog_data->uses_kill;
6509          psx.AttributeEnable = wm_prog_data->num_varying_inputs != 0;
6510          psx.PixelShaderUsesSourceDepth = wm_prog_data->uses_src_depth;
6511          psx.PixelShaderUsesSourceW = wm_prog_data->uses_src_w;
6512          psx.PixelShaderIsPerSample =
6513             elk_wm_prog_data_is_persample(wm_prog_data, 0);
6514 
6515          /* _NEW_MULTISAMPLE | ELK_NEW_CONSERVATIVE_RASTERIZATION */
6516          if (wm_prog_data->uses_sample_mask)
6517             psx.PixelShaderUsesInputCoverageMask = true;
6518 
6519          psx.oMaskPresenttoRenderTarget = wm_prog_data->uses_omask;
6520 
6521          /* The stricter cross-primitive coherency guarantees that the hardware
6522           * gives us with the "Accesses UAV" bit set for at least one shader stage
6523           * and the "UAV coherency required" bit set on the 3DPRIMITIVE command
6524           * are redundant within the current image, atomic counter and SSBO GL
6525           * APIs, which all have very loose ordering and coherency requirements
6526           * and generally rely on the application to insert explicit barriers when
6527           * a shader invocation is expected to see the memory writes performed by
6528           * the invocations of some previous primitive.  Regardless of the value
6529           * of "UAV coherency required", the "Accesses UAV" bits will implicitly
6530           * cause an in most cases useless DC flush when the lowermost stage with
6531           * the bit set finishes execution.
6532           *
6533           * It would be nice to disable it, but in some cases we can't because on
6534           * Gfx8+ it also has an influence on rasterization via the PS UAV-only
6535           * signal (which could be set independently from the coherency mechanism
6536           * in the 3DSTATE_WM command on Gfx7), and because in some cases it will
6537           * determine whether the hardware skips execution of the fragment shader
6538           * or not via the ThreadDispatchEnable signal.  However if we know that
6539           * GFX8_PS_BLEND_HAS_WRITEABLE_RT is going to be set and
6540           * GFX8_PSX_PIXEL_SHADER_NO_RT_WRITE is not set it shouldn't make any
6541           * difference so we may just disable it here.
6542           *
6543           * Gfx8 hardware tries to compute ThreadDispatchEnable for us but doesn't
6544           * take into account KillPixels when no depth or stencil writes are
6545           * enabled.  In order for occlusion queries to work correctly with no
6546           * attachments, we need to force-enable here.
6547           *
6548           */
6549          if ((wm_prog_data->has_side_effects || wm_prog_data->uses_kill) &&
6550              !(has_writeable_rt(ice->state.cso_blend, fs_info)))
6551             psx.PixelShaderHasUAV = true;
6552       }
6553 #endif
6554    }
6555 #endif
6556 
6557 #if GFX_VER >= 7
6558    if (ice->state.streamout_active) {
6559       if (dirty & CROCUS_DIRTY_GEN7_SO_BUFFERS) {
6560          for (int i = 0; i < 4; i++) {
6561             struct crocus_stream_output_target *tgt =
6562                (void *) ice->state.so_target[i];
6563 
6564             if (!tgt) {
6565                crocus_emit_cmd(batch, GENX(3DSTATE_SO_BUFFER), sob) {
6566                   sob.SOBufferIndex = i;
6567                   sob.MOCS = crocus_mocs(NULL, &batch->screen->isl_dev);
6568                }
6569                continue;
6570             }
6571             struct crocus_resource *res = (void *) tgt->base.buffer;
6572             uint32_t start = tgt->base.buffer_offset;
6573 #if GFX_VER < 8
6574             uint32_t end = ALIGN(start + tgt->base.buffer_size, 4);
6575 #endif
6576             crocus_emit_cmd(batch, GENX(3DSTATE_SO_BUFFER), sob) {
6577                sob.SOBufferIndex = i;
6578 
6579                sob.SurfaceBaseAddress = rw_bo(res->bo, start);
6580                sob.MOCS = crocus_mocs(res->bo, &batch->screen->isl_dev);
6581 #if GFX_VER < 8
6582                sob.SurfacePitch = tgt->stride;
6583                sob.SurfaceEndAddress = rw_bo(res->bo, end);
6584 #else
6585                sob.SOBufferEnable = true;
6586                sob.StreamOffsetWriteEnable = true;
6587                sob.StreamOutputBufferOffsetAddressEnable = true;
6588 
6589                sob.SurfaceSize = MAX2(tgt->base.buffer_size / 4, 1) - 1;
6590                sob.StreamOutputBufferOffsetAddress =
6591                   rw_bo(crocus_resource_bo(&tgt->offset_res->base.b), tgt->offset_offset);
6592                if (tgt->zero_offset) {
6593                   sob.StreamOffset = 0;
6594                   tgt->zero_offset = false;
6595                } else
6596                   sob.StreamOffset = 0xFFFFFFFF; /* not offset, see above */
6597 #endif
6598             }
6599          }
6600       }
6601 
6602       if ((dirty & CROCUS_DIRTY_SO_DECL_LIST) && ice->state.streamout) {
6603          uint32_t *decl_list =
6604             ice->state.streamout + GENX(3DSTATE_STREAMOUT_length);
6605          crocus_batch_emit(batch, decl_list, 4 * ((decl_list[0] & 0xff) + 2));
6606       }
6607 
6608       if (dirty & CROCUS_DIRTY_STREAMOUT) {
6609          const struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast;
6610 
6611          uint32_t dynamic_sol[GENX(3DSTATE_STREAMOUT_length)];
6612          crocus_pack_command(GENX(3DSTATE_STREAMOUT), dynamic_sol, sol) {
6613             sol.SOFunctionEnable = true;
6614             sol.SOStatisticsEnable = true;
6615 
6616             sol.RenderingDisable = cso_rast->cso.rasterizer_discard &&
6617                                    !ice->state.prims_generated_query_active;
6618             sol.ReorderMode = cso_rast->cso.flatshade_first ? LEADING : TRAILING;
6619          }
6620 
6621          assert(ice->state.streamout);
6622 
6623          crocus_emit_merge(batch, ice->state.streamout, dynamic_sol,
6624                          GENX(3DSTATE_STREAMOUT_length));
6625       }
6626    } else {
6627       if (dirty & CROCUS_DIRTY_STREAMOUT) {
6628          crocus_emit_cmd(batch, GENX(3DSTATE_STREAMOUT), sol);
6629       }
6630    }
6631 #endif
6632 #if GFX_VER == 6
6633    if (ice->state.streamout_active) {
6634       if (dirty & CROCUS_DIRTY_GEN6_SVBI) {
6635          crocus_emit_so_svbi(ice);
6636       }
6637    }
6638 #endif
6639 
6640    if (dirty & CROCUS_DIRTY_CLIP) {
6641 #if GFX_VER < 6
6642       const struct elk_clip_prog_data *clip_prog_data = (struct elk_clip_prog_data *)ice->shaders.clip_prog->prog_data;
6643       struct pipe_rasterizer_state *cso_state = &ice->state.cso_rast->cso;
6644 
6645       uint32_t *clip_ptr = stream_state(batch, GENX(CLIP_STATE_length) * 4, 32, &ice->shaders.clip_offset);
6646       dirty |= CROCUS_DIRTY_GEN5_PIPELINED_POINTERS;
6647       _crocus_pack_state(batch, GENX(CLIP_STATE), clip_ptr, clip) {
6648          clip.KernelStartPointer = KSP(ice, ice->shaders.clip_prog);
6649          clip.FloatingPointMode = FLOATING_POINT_MODE_Alternate;
6650          clip.SingleProgramFlow = true;
6651          clip.GRFRegisterCount = DIV_ROUND_UP(clip_prog_data->total_grf, 16) - 1;
6652 
6653          clip.VertexURBEntryReadLength = clip_prog_data->urb_read_length;
6654          clip.ConstantURBEntryReadLength = clip_prog_data->curb_read_length;
6655 
6656          clip.DispatchGRFStartRegisterForURBData = 1;
6657          clip.VertexURBEntryReadOffset = 0;
6658          clip.ConstantURBEntryReadOffset = ice->curbe.clip_start * 2;
6659 
6660          clip.NumberofURBEntries = batch->ice->urb.nr_clip_entries;
6661          clip.URBEntryAllocationSize = batch->ice->urb.vsize - 1;
6662 
6663          if (batch->ice->urb.nr_clip_entries >= 10) {
6664             /* Half of the URB entries go to each thread, and it has to be an
6665              * even number.
6666              */
6667             assert(batch->ice->urb.nr_clip_entries % 2 == 0);
6668 
6669             /* Although up to 16 concurrent Clip threads are allowed on Ironlake,
6670              * only 2 threads can output VUEs at a time.
6671              */
6672             clip.MaximumNumberofThreads = (GFX_VER == 5 ? 16 : 2) - 1;
6673          } else {
6674             assert(batch->ice->urb.nr_clip_entries >= 5);
6675             clip.MaximumNumberofThreads = 1 - 1;
6676          }
6677          clip.VertexPositionSpace = VPOS_NDCSPACE;
6678          clip.UserClipFlagsMustClipEnable = true;
6679          clip.GuardbandClipTestEnable = true;
6680 
6681          clip.ClipperViewportStatePointer = ro_bo(batch->state.bo, ice->state.clip_vp_address);
6682          clip.ScreenSpaceViewportXMin = -1.0;
6683          clip.ScreenSpaceViewportXMax = 1.0;
6684          clip.ScreenSpaceViewportYMin = -1.0;
6685          clip.ScreenSpaceViewportYMax = 1.0;
6686          clip.ViewportXYClipTestEnable = true;
6687          clip.ViewportZClipTestEnable = (cso_state->depth_clip_near || cso_state->depth_clip_far);
6688 
6689 #if GFX_VER == 5 || GFX_VERx10 == 45
6690          clip.UserClipDistanceClipTestEnableBitmask = cso_state->clip_plane_enable;
6691 #else
6692          /* Up to 6 actual clip flags, plus the 7th for the negative RHW
6693           * workaround.
6694           */
6695          clip.UserClipDistanceClipTestEnableBitmask = (cso_state->clip_plane_enable & 0x3f) | 0x40;
6696 #endif
6697 
6698          clip.APIMode = cso_state->clip_halfz ? APIMODE_D3D : APIMODE_OGL;
6699          clip.GuardbandClipTestEnable = true;
6700 
6701          clip.ClipMode = clip_prog_data->clip_mode;
6702 #if GFX_VERx10 == 45
6703          clip.NegativeWClipTestEnable = true;
6704 #endif
6705       }
6706 
6707 #else //if GFX_VER >= 6
6708       struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast;
6709       const struct elk_wm_prog_data *wm_prog_data = elk_wm_prog_data(ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data );
6710       struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
6711       bool gs_or_tes = ice->shaders.prog[MESA_SHADER_GEOMETRY] ||
6712                        ice->shaders.prog[MESA_SHADER_TESS_EVAL];
6713       bool points_or_lines = cso_rast->fill_mode_point_or_line ||
6714          (gs_or_tes ? ice->shaders.output_topology_is_points_or_lines
6715                     : ice->state.prim_is_points_or_lines);
6716       uint32_t dynamic_clip[GENX(3DSTATE_CLIP_length)];
6717       crocus_pack_command(GENX(3DSTATE_CLIP), &dynamic_clip, cl) {
6718          cl.StatisticsEnable = ice->state.statistics_counters_enabled;
6719          if (cso_rast->cso.rasterizer_discard)
6720             cl.ClipMode = CLIPMODE_REJECT_ALL;
6721          else if (ice->state.window_space_position)
6722             cl.ClipMode = CLIPMODE_ACCEPT_ALL;
6723          else
6724             cl.ClipMode = CLIPMODE_NORMAL;
6725 
6726          cl.PerspectiveDivideDisable = ice->state.window_space_position;
6727          cl.ViewportXYClipTestEnable = !points_or_lines;
6728 
6729          cl.UserClipDistanceCullTestEnableBitmask =
6730             elk_vue_prog_data(ice->shaders.prog[MESA_SHADER_VERTEX]->prog_data)->cull_distance_mask;
6731 
6732          cl.NonPerspectiveBarycentricEnable = wm_prog_data->uses_nonperspective_interp_modes;
6733 
6734          cl.ForceZeroRTAIndexEnable = cso_fb->layers <= 1;
6735          cl.MaximumVPIndex = ice->state.num_viewports - 1;
6736       }
6737       crocus_emit_merge(batch, cso_rast->clip, dynamic_clip,
6738                       ARRAY_SIZE(cso_rast->clip));
6739 #endif
6740    }
6741 
6742    if (stage_dirty & CROCUS_STAGE_DIRTY_VS) {
6743       struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_VERTEX];
6744       const struct elk_vue_prog_data *vue_prog_data = elk_vue_prog_data(shader->prog_data);
6745       const struct elk_stage_prog_data *prog_data = &vue_prog_data->base;
6746 #if GFX_VER == 7
6747       if (batch->screen->devinfo.platform == INTEL_PLATFORM_IVB)
6748          gen7_emit_vs_workaround_flush(batch);
6749 #endif
6750 
6751 
6752 #if GFX_VER == 6
6753       struct push_bos push_bos = {};
6754       setup_constant_buffers(ice, batch, MESA_SHADER_VERTEX, &push_bos);
6755 
6756       emit_push_constant_packets(ice, batch, MESA_SHADER_VERTEX, &push_bos);
6757 #endif
6758 #if GFX_VER >= 6
6759       crocus_emit_cmd(batch, GENX(3DSTATE_VS), vs)
6760 #else
6761       uint32_t *vs_ptr = stream_state(batch,
6762                                       GENX(VS_STATE_length) * 4, 32, &ice->shaders.vs_offset);
6763       dirty |= CROCUS_DIRTY_GEN5_PIPELINED_POINTERS;
6764       _crocus_pack_state(batch, GENX(VS_STATE), vs_ptr, vs)
6765 #endif
6766       {
6767          INIT_THREAD_DISPATCH_FIELDS(vs, Vertex, MESA_SHADER_VERTEX);
6768 
6769          vs.MaximumNumberofThreads = batch->screen->devinfo.max_vs_threads - 1;
6770 
6771 #if GFX_VER < 6
6772          vs.GRFRegisterCount = DIV_ROUND_UP(vue_prog_data->total_grf, 16) - 1;
6773          vs.ConstantURBEntryReadLength = vue_prog_data->base.curb_read_length;
6774          vs.ConstantURBEntryReadOffset = ice->curbe.vs_start * 2;
6775 
6776          vs.NumberofURBEntries = batch->ice->urb.nr_vs_entries >> (GFX_VER == 5 ? 2 : 0);
6777          vs.URBEntryAllocationSize = batch->ice->urb.vsize - 1;
6778 
6779          vs.MaximumNumberofThreads =
6780             CLAMP(batch->ice->urb.nr_vs_entries / 2, 1, batch->screen->devinfo.max_vs_threads) - 1;
6781          vs.StatisticsEnable = false;
6782          vs.SamplerStatePointer = ro_bo(batch->state.bo, ice->state.shaders[MESA_SHADER_VERTEX].sampler_offset);
6783 #endif
6784 #if GFX_VER == 5
6785          /* Force single program flow on Ironlake.  We cannot reliably get
6786           * all applications working without it.  See:
6787           * https://bugs.freedesktop.org/show_bug.cgi?id=29172
6788           *
6789           * The most notable and reliably failing application is the Humus
6790           * demo "CelShading"
6791           */
6792          vs.SingleProgramFlow = true;
6793          vs.SamplerCount = 0; /* hardware requirement */
6794 
6795 #endif
6796 #if GFX_VER >= 8
6797          vs.SIMD8DispatchEnable =
6798             vue_prog_data->dispatch_mode == DISPATCH_MODE_SIMD8;
6799 
6800          vs.UserClipDistanceCullTestEnableBitmask =
6801             vue_prog_data->cull_distance_mask;
6802 #endif
6803       }
6804 
6805 #if GFX_VER == 6
6806       crocus_emit_pipe_control_flush(batch,
6807                                      "post VS const",
6808                                      PIPE_CONTROL_DEPTH_STALL |
6809                                      PIPE_CONTROL_INSTRUCTION_INVALIDATE |
6810                                      PIPE_CONTROL_STATE_CACHE_INVALIDATE);
6811 #endif
6812    }
6813 
6814    if (stage_dirty & CROCUS_STAGE_DIRTY_GS) {
6815       struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_GEOMETRY];
6816       bool active = GFX_VER >= 6 && shader;
6817 #if GFX_VER == 6
6818       struct push_bos push_bos = {};
6819       if (shader)
6820          setup_constant_buffers(ice, batch, MESA_SHADER_GEOMETRY, &push_bos);
6821 
6822       emit_push_constant_packets(ice, batch, MESA_SHADER_GEOMETRY, &push_bos);
6823 #endif
6824 #if GFX_VERx10 == 70
6825    /**
6826     * From Graphics BSpec: 3D-Media-GPGPU Engine > 3D Pipeline Stages >
6827     * Geometry > Geometry Shader > State:
6828     *
6829     *     "Note: Because of corruption in IVB:GT2, software needs to flush the
6830     *     whole fixed function pipeline when the GS enable changes value in
6831     *     the 3DSTATE_GS."
6832     *
6833     * The hardware architects have clarified that in this context "flush the
6834     * whole fixed function pipeline" means to emit a PIPE_CONTROL with the "CS
6835     * Stall" bit set.
6836     */
6837    if (batch->screen->devinfo.gt == 2 && ice->state.gs_enabled != active)
6838       gen7_emit_cs_stall_flush(batch);
6839 #endif
6840 #if GFX_VER >= 6
6841       crocus_emit_cmd(batch, GENX(3DSTATE_GS), gs)
6842 #else
6843       uint32_t *gs_ptr = stream_state(batch,
6844                                       GENX(GS_STATE_length) * 4, 32, &ice->shaders.gs_offset);
6845       dirty |= CROCUS_DIRTY_GEN5_PIPELINED_POINTERS;
6846       _crocus_pack_state(batch, GENX(GS_STATE), gs_ptr, gs)
6847 #endif
6848      {
6849 #if GFX_VER >= 6
6850          if (active) {
6851             const struct elk_gs_prog_data *gs_prog_data = elk_gs_prog_data(shader->prog_data);
6852             const struct elk_vue_prog_data *vue_prog_data = elk_vue_prog_data(shader->prog_data);
6853             const struct elk_stage_prog_data *prog_data = &gs_prog_data->base.base;
6854 
6855             INIT_THREAD_DISPATCH_FIELDS(gs, Vertex, MESA_SHADER_GEOMETRY);
6856 #if GFX_VER >= 7
6857             gs.OutputVertexSize = gs_prog_data->output_vertex_size_hwords * 2 - 1;
6858             gs.OutputTopology = gs_prog_data->output_topology;
6859             gs.ControlDataHeaderSize =
6860                gs_prog_data->control_data_header_size_hwords;
6861 
6862             gs.InstanceControl = gs_prog_data->invocations - 1;
6863             gs.DispatchMode = vue_prog_data->dispatch_mode;
6864 
6865             gs.IncludePrimitiveID = gs_prog_data->include_primitive_id;
6866 
6867             gs.ControlDataFormat = gs_prog_data->control_data_format;
6868 #endif
6869 
6870             /* Note: the meaning of the GEN7_GS_REORDER_TRAILING bit changes between
6871              * Ivy Bridge and Haswell.
6872              *
6873              * On Ivy Bridge, setting this bit causes the vertices of a triangle
6874              * strip to be delivered to the geometry shader in an order that does
6875              * not strictly follow the OpenGL spec, but preserves triangle
6876              * orientation.  For example, if the vertices are (1, 2, 3, 4, 5), then
6877              * the geometry shader sees triangles:
6878              *
6879              * (1, 2, 3), (2, 4, 3), (3, 4, 5)
6880              *
6881              * (Clearing the bit is even worse, because it fails to preserve
6882              * orientation).
6883              *
6884              * Triangle strips with adjacency always ordered in a way that preserves
6885              * Triangle strips with adjacency are always ordered in a way that preserves
6886              * regardless of the setting of this bit.
6887              *
6888              * On Haswell, both triangle strips and triangle strips with adjacency
6889              * are always ordered in a way that preserves triangle orientation.
6890              * Setting this bit causes the ordering to strictly follow the OpenGL
6891              * spec.
6892              *
6893              * So in either case we want to set the bit.  Unfortunately on Ivy
6894              * Bridge this will get the order close to correct but not perfect.
6895              */
6896             gs.ReorderMode = TRAILING;
6897             gs.MaximumNumberofThreads =
6898                GFX_VER == 8 ? (batch->screen->devinfo.max_gs_threads / 2 - 1) :
6899                (batch->screen->devinfo.max_gs_threads - 1);
6900 #if GFX_VER < 7
6901             gs.SOStatisticsEnable = true;
6902             if (gs_prog_data->num_transform_feedback_bindings)
6903                gs.SVBIPayloadEnable = ice->state.streamout_active;
6904 
6905             /* GEN6_GS_SPF_MODE and GEN6_GS_VECTOR_MASK_ENABLE are enabled as it
6906              * was previously done for gen6.
6907              *
6908              * TODO: test with both disabled to see if the HW is behaving
6909              * as expected, like in gen7.
6910              */
6911             gs.SingleProgramFlow = true;
6912             gs.VectorMaskEnable = true;
6913 #endif
6914 #if GFX_VER >= 8
6915             gs.ExpectedVertexCount = gs_prog_data->vertices_in;
6916 
6917             if (gs_prog_data->static_vertex_count != -1) {
6918                gs.StaticOutput = true;
6919                gs.StaticOutputVertexCount = gs_prog_data->static_vertex_count;
6920             }
6921             gs.IncludeVertexHandles = vue_prog_data->include_vue_handles;
6922 
6923             gs.UserClipDistanceCullTestEnableBitmask =
6924                vue_prog_data->cull_distance_mask;
6925 
6926             const int urb_entry_write_offset = 1;
6927             const uint32_t urb_entry_output_length =
6928                DIV_ROUND_UP(vue_prog_data->vue_map.num_slots, 2) -
6929                urb_entry_write_offset;
6930 
6931             gs.VertexURBEntryOutputReadOffset = urb_entry_write_offset;
6932             gs.VertexURBEntryOutputLength = MAX2(urb_entry_output_length, 1);
6933 #endif
6934          }
6935 #endif
6936 #if GFX_VER <= 6
6937          if (!active && ice->shaders.ff_gs_prog) {
6938             const struct elk_ff_gs_prog_data *gs_prog_data = (struct elk_ff_gs_prog_data *)ice->shaders.ff_gs_prog->prog_data;
6939             /* In gen6, transform feedback for the VS stage is done with an
6940              * ad-hoc GS program. This function provides the needed 3DSTATE_GS
6941              * for this.
6942              */
6943             gs.KernelStartPointer = KSP(ice, ice->shaders.ff_gs_prog);
6944             gs.SingleProgramFlow = true;
6945             gs.DispatchGRFStartRegisterForURBData = GFX_VER == 6 ? 2 : 1;
6946             gs.VertexURBEntryReadLength = gs_prog_data->urb_read_length;
6947 
6948 #if GFX_VER <= 5
6949             gs.GRFRegisterCount =
6950                DIV_ROUND_UP(gs_prog_data->total_grf, 16) - 1;
6951             /* ELK_NEW_URB_FENCE */
6952             gs.NumberofURBEntries = batch->ice->urb.nr_gs_entries;
6953             gs.URBEntryAllocationSize = batch->ice->urb.vsize - 1;
6954             gs.MaximumNumberofThreads = batch->ice->urb.nr_gs_entries >= 8 ? 1 : 0;
6955             gs.FloatingPointMode = FLOATING_POINT_MODE_Alternate;
6956 #else
6957             gs.Enable = true;
6958             gs.VectorMaskEnable = true;
6959             gs.SVBIPayloadEnable = true;
6960             gs.SVBIPostIncrementEnable = true;
6961             gs.SVBIPostIncrementValue = gs_prog_data->svbi_postincrement_value;
6962             gs.SOStatisticsEnable = true;
6963             gs.MaximumNumberofThreads = batch->screen->devinfo.max_gs_threads - 1;
6964 #endif
6965          }
6966 #endif
6967          if (!active && !ice->shaders.ff_gs_prog) {
6968 #if GFX_VER < 8
6969             gs.DispatchGRFStartRegisterForURBData = 1;
6970 #if GFX_VER >= 7
6971             gs.IncludeVertexHandles = true;
6972 #endif
6973 #endif
6974          }
6975 #if GFX_VER >= 6
6976          gs.StatisticsEnable = true;
6977 #endif
6978 #if GFX_VER == 5 || GFX_VER == 6
6979          gs.RenderingEnabled = true;
6980 #endif
6981 #if GFX_VER <= 5
6982          gs.MaximumVPIndex = ice->state.num_viewports - 1;
6983 #endif
6984       }
6985       ice->state.gs_enabled = active;
6986    }
6987 
6988 #if GFX_VER >= 7
6989    if (stage_dirty & CROCUS_STAGE_DIRTY_TCS) {
6990       struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_TESS_CTRL];
6991 
6992       if (shader) {
6993          const struct elk_tcs_prog_data *tcs_prog_data = elk_tcs_prog_data(shader->prog_data);
6994          const struct elk_vue_prog_data *vue_prog_data = elk_vue_prog_data(shader->prog_data);
6995          const struct elk_stage_prog_data *prog_data = &tcs_prog_data->base.base;
6996 
6997          crocus_emit_cmd(batch, GENX(3DSTATE_HS), hs) {
6998             INIT_THREAD_DISPATCH_FIELDS(hs, Vertex, MESA_SHADER_TESS_CTRL);
6999             hs.InstanceCount = tcs_prog_data->instances - 1;
7000             hs.IncludeVertexHandles = true;
7001             hs.MaximumNumberofThreads = batch->screen->devinfo.max_tcs_threads - 1;
7002          }
7003       } else {
7004          crocus_emit_cmd(batch, GENX(3DSTATE_HS), hs);
7005       }
7006 
7007    }
7008 
7009    if (stage_dirty & CROCUS_STAGE_DIRTY_TES) {
7010       struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_TESS_EVAL];
7011       if (shader) {
7012          const struct elk_tes_prog_data *tes_prog_data = elk_tes_prog_data(shader->prog_data);
7013          const struct elk_vue_prog_data *vue_prog_data = elk_vue_prog_data(shader->prog_data);
7014          const struct elk_stage_prog_data *prog_data = &tes_prog_data->base.base;
7015 
7016          crocus_emit_cmd(batch, GENX(3DSTATE_TE), te) {
7017             te.Partitioning = tes_prog_data->partitioning;
7018             te.OutputTopology = tes_prog_data->output_topology;
7019             te.TEDomain = tes_prog_data->domain;
7020             te.TEEnable = true;
7021             te.MaximumTessellationFactorOdd = 63.0;
7022             te.MaximumTessellationFactorNotOdd = 64.0;
7023          };
7024          crocus_emit_cmd(batch, GENX(3DSTATE_DS), ds) {
7025             INIT_THREAD_DISPATCH_FIELDS(ds, Patch, MESA_SHADER_TESS_EVAL);
7026 
7027             ds.MaximumNumberofThreads = batch->screen->devinfo.max_tes_threads - 1;
7028             ds.ComputeWCoordinateEnable =
7029                tes_prog_data->domain == INTEL_TESS_DOMAIN_TRI;
7030 
7031 #if GFX_VER >= 8
7032             if (vue_prog_data->dispatch_mode == DISPATCH_MODE_SIMD8)
7033                ds.DispatchMode = DISPATCH_MODE_SIMD8_SINGLE_PATCH;
7034             ds.UserClipDistanceCullTestEnableBitmask =
7035                vue_prog_data->cull_distance_mask;
7036 #endif
7037          };
7038       } else {
7039          crocus_emit_cmd(batch, GENX(3DSTATE_TE), te);
7040          crocus_emit_cmd(batch, GENX(3DSTATE_DS), ds);
7041       }
7042    }
7043 #endif
7044    if (dirty & CROCUS_DIRTY_RASTER) {
7045 
7046 #if GFX_VER < 6
7047       const struct elk_sf_prog_data *sf_prog_data = (struct elk_sf_prog_data *)ice->shaders.sf_prog->prog_data;
7048       struct pipe_rasterizer_state *cso_state = &ice->state.cso_rast->cso;
7049       uint32_t *sf_ptr = stream_state(batch,
7050                                       GENX(SF_STATE_length) * 4, 32, &ice->shaders.sf_offset);
7051       dirty |= CROCUS_DIRTY_GEN5_PIPELINED_POINTERS;
7052       _crocus_pack_state(batch, GENX(SF_STATE), sf_ptr, sf) {
7053          sf.KernelStartPointer = KSP(ice, ice->shaders.sf_prog);
7054          sf.FloatingPointMode = FLOATING_POINT_MODE_Alternate;
7055          sf.GRFRegisterCount = DIV_ROUND_UP(sf_prog_data->total_grf, 16) - 1;
7056          sf.DispatchGRFStartRegisterForURBData = 3;
7057          sf.VertexURBEntryReadOffset = ELK_SF_URB_ENTRY_READ_OFFSET;
7058          sf.VertexURBEntryReadLength = sf_prog_data->urb_read_length;
7059          sf.URBEntryAllocationSize = batch->ice->urb.sfsize - 1;
7060          sf.NumberofURBEntries = batch->ice->urb.nr_sf_entries;
7061          sf.PointRasterizationRule = RASTRULE_UPPER_RIGHT;
7062 
7063          sf.SetupViewportStateOffset = ro_bo(batch->state.bo, ice->state.sf_vp_address);
7064 
7065          sf.MaximumNumberofThreads =
7066             MIN2(GFX_VER == 5 ? 48 : 24, batch->ice->urb.nr_sf_entries) - 1;
7067 
7068          sf.SpritePointEnable = cso_state->point_quad_rasterization;
7069          sf.DestinationOriginHorizontalBias = 0.5;
7070          sf.DestinationOriginVerticalBias = 0.5;
7071 
7072 	 sf.LineEndCapAntialiasingRegionWidth =
7073             cso_state->line_smooth ? _10pixels : _05pixels;
7074          sf.LastPixelEnable = cso_state->line_last_pixel;
7075          sf.AntialiasingEnable = cso_state->line_smooth;
7076 
7077          sf.LineWidth = get_line_width(cso_state);
7078          sf.PointWidth = cso_state->point_size;
7079          sf.PointWidthSource = cso_state->point_size_per_vertex ? Vertex : State;
7080 #if GFX_VERx10 >= 45
7081          sf.AALineDistanceMode = AALINEDISTANCE_TRUE;
7082 #endif
7083          sf.ViewportTransformEnable = true;
7084          sf.FrontWinding = cso_state->front_ccw ? 1 : 0;
7085          sf.ScissorRectangleEnable = true;
7086          sf.CullMode = translate_cull_mode(cso_state->cull_face);
7087 
7088          if (cso_state->flatshade_first) {
7089             sf.TriangleFanProvokingVertexSelect = 1;
7090          } else {
7091             sf.TriangleStripListProvokingVertexSelect = 2;
7092             sf.TriangleFanProvokingVertexSelect = 2;
7093             sf.LineStripListProvokingVertexSelect = 1;
7094          }
7095       }
7096 #else
7097       struct crocus_rasterizer_state *cso = ice->state.cso_rast;
7098       uint32_t dynamic_sf[GENX(3DSTATE_SF_length)];
7099       crocus_pack_command(GENX(3DSTATE_SF), &dynamic_sf, sf) {
7100          sf.ViewportTransformEnable = !ice->state.window_space_position;
7101 
7102 #if GFX_VER == 6
7103          const struct elk_wm_prog_data *wm_prog_data = elk_wm_prog_data(ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data);
7104          uint32_t urb_entry_read_length;
7105          uint32_t urb_entry_read_offset;
7106          uint32_t point_sprite_enables;
7107          calculate_attr_overrides(ice, sf.Attribute, &point_sprite_enables,
7108                                   &urb_entry_read_length,
7109                                   &urb_entry_read_offset);
7110          sf.VertexURBEntryReadLength = urb_entry_read_length;
7111          sf.VertexURBEntryReadOffset = urb_entry_read_offset;
7112          sf.PointSpriteTextureCoordinateEnable = point_sprite_enables;
7113          sf.ConstantInterpolationEnable = wm_prog_data->flat_inputs;
7114          sf.NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs;
7115 #endif
7116 
7117 #if GFX_VER >= 6 && GFX_VER < 8
7118          if (ice->state.framebuffer.samples > 1 && ice->state.cso_rast->cso.multisample)
7119             sf.MultisampleRasterizationMode = MSRASTMODE_ON_PATTERN;
7120 #endif
7121 #if GFX_VER == 7
7122          if (ice->state.framebuffer.zsbuf) {
7123             struct crocus_resource *zres, *sres;
7124                crocus_get_depth_stencil_resources(&batch->screen->devinfo,
7125                                                   ice->state.framebuffer.zsbuf->texture,
7126                                                   &zres, &sres);
7127             /* ANV thinks that the stencil-ness doesn't matter, this is just
7128              * about handling polygon offset scaling.
7129              */
7130             sf.DepthBufferSurfaceFormat = zres ? isl_format_get_depth_format(zres->surf.format, false) : D16_UNORM;
7131          }
7132 #endif
7133       }
7134       crocus_emit_merge(batch, cso->sf, dynamic_sf,
7135                       ARRAY_SIZE(dynamic_sf));
7136 #if GFX_VER == 8
7137       crocus_batch_emit(batch, cso->raster, sizeof(cso->raster));
7138 #endif
7139 #endif
7140    }
7141 
7142    if (dirty & CROCUS_DIRTY_WM) {
7143       struct crocus_rasterizer_state *cso = ice->state.cso_rast;
7144       const struct elk_wm_prog_data *wm_prog_data = elk_wm_prog_data(ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data);
7145       UNUSED bool writes_depth = wm_prog_data->computed_depth_mode != ELK_PSCDEPTH_OFF;
7146       UNUSED const struct shader_info *fs_info =
7147          crocus_get_shader_info(ice, MESA_SHADER_FRAGMENT);
7148 
7149 #if GFX_VER == 6
7150       struct push_bos push_bos = {};
7151       setup_constant_buffers(ice, batch, MESA_SHADER_FRAGMENT, &push_bos);
7152 
7153       emit_push_constant_packets(ice, batch, MESA_SHADER_FRAGMENT, &push_bos);
7154 #endif
7155 #if GFX_VER >= 6
7156       crocus_emit_cmd(batch, GENX(3DSTATE_WM), wm)
7157 #else
7158       uint32_t *wm_ptr = stream_state(batch,
7159                                       GENX(WM_STATE_length) * 4, 32, &ice->shaders.wm_offset);
7160 
7161       dirty |= CROCUS_DIRTY_GEN5_PIPELINED_POINTERS;
7162 
7163       _crocus_pack_state(batch, GENX(WM_STATE), wm_ptr, wm)
7164 #endif
7165      {
7166 #if GFX_VER <= 6
7167          wm._8PixelDispatchEnable = wm_prog_data->dispatch_8;
7168          wm._16PixelDispatchEnable = wm_prog_data->dispatch_16;
7169          wm._32PixelDispatchEnable = wm_prog_data->dispatch_32;
7170 #endif
7171 #if GFX_VER == 4
7172       /* On gen4, we only have one shader kernel */
7173          if (elk_wm_state_has_ksp(wm, 0)) {
7174             wm.KernelStartPointer0 = KSP(ice, ice->shaders.prog[MESA_SHADER_FRAGMENT]);
7175             wm.GRFRegisterCount0 = elk_wm_prog_data_reg_blocks(wm_prog_data, wm, 0);
7176             wm.DispatchGRFStartRegisterForConstantSetupData0 =
7177                wm_prog_data->base.dispatch_grf_start_reg;
7178          }
7179 #elif GFX_VER == 5
7180          wm.KernelStartPointer0 = KSP(ice, ice->shaders.prog[MESA_SHADER_FRAGMENT]) +
7181             elk_wm_prog_data_prog_offset(wm_prog_data, wm, 0);
7182          wm.KernelStartPointer1 = KSP(ice, ice->shaders.prog[MESA_SHADER_FRAGMENT]) +
7183             elk_wm_prog_data_prog_offset(wm_prog_data, wm, 1);
7184          wm.KernelStartPointer2 = KSP(ice, ice->shaders.prog[MESA_SHADER_FRAGMENT]) +
7185             elk_wm_prog_data_prog_offset(wm_prog_data, wm, 2);
7186 
7187          wm.GRFRegisterCount0 = elk_wm_prog_data_reg_blocks(wm_prog_data, wm, 0);
7188          wm.GRFRegisterCount1 = elk_wm_prog_data_reg_blocks(wm_prog_data, wm, 1);
7189          wm.GRFRegisterCount2 = elk_wm_prog_data_reg_blocks(wm_prog_data, wm, 2);
7190 
7191          wm.DispatchGRFStartRegisterForConstantSetupData0 =
7192             wm_prog_data->base.dispatch_grf_start_reg;
7193 #elif GFX_VER == 6
7194          wm.KernelStartPointer0 = KSP(ice, ice->shaders.prog[MESA_SHADER_FRAGMENT]) +
7195             elk_wm_prog_data_prog_offset(wm_prog_data, wm, 0);
7196          wm.KernelStartPointer1 = KSP(ice, ice->shaders.prog[MESA_SHADER_FRAGMENT]) +
7197             elk_wm_prog_data_prog_offset(wm_prog_data, wm, 1);
7198          wm.KernelStartPointer2 = KSP(ice, ice->shaders.prog[MESA_SHADER_FRAGMENT]) +
7199             elk_wm_prog_data_prog_offset(wm_prog_data, wm, 2);
7200 
7201          wm.DispatchGRFStartRegisterForConstantSetupData0 =
7202            elk_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 0);
7203          wm.DispatchGRFStartRegisterForConstantSetupData1 =
7204            elk_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 1);
7205          wm.DispatchGRFStartRegisterForConstantSetupData2 =
7206            elk_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 2);
7207 #endif
7208 #if GFX_VER <= 5
7209          wm.ConstantURBEntryReadLength = wm_prog_data->base.curb_read_length;
7210          wm.ConstantURBEntryReadOffset = ice->curbe.wm_start * 2;
7211          wm.SetupURBEntryReadLength = wm_prog_data->num_varying_inputs * 2;
7212          wm.SetupURBEntryReadOffset = 0;
7213          wm.EarlyDepthTestEnable = true;
7214          wm.LineAntialiasingRegionWidth = _05pixels;
7215          wm.LineEndCapAntialiasingRegionWidth = _10pixels;
7216          wm.DepthCoefficientURBReadOffset = 1;
7217 
7218          if (cso->cso.offset_tri) {
7219             wm.GlobalDepthOffsetEnable = true;
7220 
7221          /* Something weird going on with legacy_global_depth_bias,
7222           * offset_constant, scaling and MRD.  This value passes glean
7223           * but gives some odd results elsewhere (e.g. the
7224           * quad-offset-units test).
7225           */
7226             wm.GlobalDepthOffsetConstant = cso->cso.offset_units * 2;
7227             wm.GlobalDepthOffsetScale = cso->cso.offset_scale;
7228          }
7229          wm.SamplerStatePointer = ro_bo(batch->state.bo,
7230                                         ice->state.shaders[MESA_SHADER_FRAGMENT].sampler_offset);
7231 #endif
7232 
7233          wm.StatisticsEnable = (GFX_VER >= 6 || ice->state.stats_wm) ?
7234             ice->state.statistics_counters_enabled : 0;
7235 
7236 #if GFX_VER >= 6
7237          wm.LineAntialiasingRegionWidth = _10pixels;
7238          wm.LineEndCapAntialiasingRegionWidth = _05pixels;
7239 
7240          wm.PointRasterizationRule = RASTRULE_UPPER_RIGHT;
7241          wm.BarycentricInterpolationMode = wm_prog_data->barycentric_interp_modes;
7242 #endif
7243 #if GFX_VER == 6
7244       wm.DualSourceBlendEnable = wm_prog_data->dual_src_blend &&
7245          ice->state.cso_blend->dual_color_blending;
7246       wm.oMaskPresenttoRenderTarget = wm_prog_data->uses_omask;
7247       wm.NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs;
7248 
7249       /* From the SNB PRM, volume 2 part 1, page 281:
7250        * "If the PS kernel does not need the Position XY Offsets
7251        * to compute a Position XY value, then this field should be
7252        * programmed to POSOFFSET_NONE."
7253        *
7254        * "SW Recommendation: If the PS kernel needs the Position Offsets
7255        * to compute a Position XY value, this field should match Position
7256        * ZW Interpolation Mode to ensure a consistent position.xyzw
7257        * computation."
7258        * We only require XY sample offsets. So, this recommendation doesn't
7259        * look useful at the moment. We might need this in future.
7260        */
7261       if (wm_prog_data->uses_pos_offset)
7262          wm.PositionXYOffsetSelect = POSOFFSET_SAMPLE;
7263       else
7264          wm.PositionXYOffsetSelect = POSOFFSET_NONE;
7265 #endif
7266          wm.LineStippleEnable = cso->cso.line_stipple_enable;
7267          wm.PolygonStippleEnable = cso->cso.poly_stipple_enable;
7268 
7269 #if GFX_VER < 7
7270          if (wm_prog_data->base.use_alt_mode)
7271             wm.FloatingPointMode = FLOATING_POINT_MODE_Alternate;
7272          wm.BindingTableEntryCount = ice->shaders.prog[MESA_SHADER_FRAGMENT]->bt.size_bytes / 4;
7273          wm.MaximumNumberofThreads = batch->screen->devinfo.max_wm_threads - 1;
7274 #endif
7275 
7276 #if GFX_VER < 8
7277 #if GFX_VER >= 6
7278          wm.PixelShaderUsesSourceW = wm_prog_data->uses_src_w;
7279 
7280          struct pipe_framebuffer_state *fb = &ice->state.framebuffer;
7281          if (fb->samples > 1) {
7282             if (cso->cso.multisample)
7283                wm.MultisampleRasterizationMode = MSRASTMODE_ON_PATTERN;
7284             else
7285                wm.MultisampleRasterizationMode = MSRASTMODE_OFF_PIXEL;
7286 
7287             if (elk_wm_prog_data_is_persample(wm_prog_data, 0))
7288                wm.MultisampleDispatchMode = MSDISPMODE_PERSAMPLE;
7289             else
7290                wm.MultisampleDispatchMode = MSDISPMODE_PERPIXEL;
7291          } else {
7292             wm.MultisampleRasterizationMode = MSRASTMODE_OFF_PIXEL;
7293             wm.MultisampleDispatchMode = MSDISPMODE_PERSAMPLE;
7294          }
7295 #endif
7296 
7297          wm.PixelShaderUsesSourceDepth = wm_prog_data->uses_src_depth;
7298 
7299          if (wm_prog_data->uses_kill ||
7300              ice->state.cso_zsa->cso.alpha_enabled ||
7301              ice->state.cso_blend->cso.alpha_to_coverage ||
7302              (GFX_VER >= 6 && wm_prog_data->uses_omask))
7303             wm.PixelShaderKillsPixel = true;
7304 
7305          if (has_writeable_rt(ice->state.cso_blend, fs_info) ||
7306              writes_depth || wm.PixelShaderKillsPixel ||
7307              (GFX_VER >= 6 && wm_prog_data->has_side_effects))
7308             wm.ThreadDispatchEnable = true;
7309 
7310 #if GFX_VER >= 7
7311          wm.PixelShaderComputedDepthMode = wm_prog_data->computed_depth_mode;
7312          wm.PixelShaderUsesInputCoverageMask = wm_prog_data->uses_sample_mask;
7313 #else
7314          if (wm_prog_data->base.total_scratch) {
7315             struct crocus_bo *bo = crocus_get_scratch_space(ice, wm_prog_data->base.total_scratch,
7316                                                             MESA_SHADER_FRAGMENT);
7317             wm.PerThreadScratchSpace = ffs(wm_prog_data->base.total_scratch) - 11;
7318             wm.ScratchSpaceBasePointer = rw_bo(bo, 0);
7319          }
7320 
7321          wm.PixelShaderComputedDepth = writes_depth;
7322 
7323 #endif
7324          /* The "UAV access enable" bits are unnecessary on HSW because they only
7325           * seem to have an effect on the HW-assisted coherency mechanism which we
7326           * don't need, and the rasterization-related UAV_ONLY flag and the
7327           * DISPATCH_ENABLE bit can be set independently from it.
7328           * C.f. gen8_upload_ps_extra().
7329           *
7330           * ELK_NEW_FRAGMENT_PROGRAM | ELK_NEW_FS_PROG_DATA | _NEW_BUFFERS |
7331           * _NEW_COLOR
7332           */
7333 #if GFX_VERx10 == 75
7334          if (!(has_writeable_rt(ice->state.cso_blend, fs_info) || writes_depth) &&
7335              wm_prog_data->has_side_effects)
7336             wm.PSUAVonly = ON;
7337 #endif
7338 #endif
7339 #if GFX_VER >= 7
7340       /* ELK_NEW_FS_PROG_DATA */
7341          if (wm_prog_data->early_fragment_tests)
7342            wm.EarlyDepthStencilControl = EDSC_PREPS;
7343          else if (wm_prog_data->has_side_effects)
7344            wm.EarlyDepthStencilControl = EDSC_PSEXEC;
7345 #endif
7346 #if GFX_VER == 8
7347          /* We could skip this bit if color writes are enabled. */
7348          if (wm_prog_data->has_side_effects || wm_prog_data->uses_kill)
7349             wm.ForceThreadDispatchEnable = ForceON;
7350 #endif
7351       };
7352 
7353 #if GFX_VER <= 5
7354       if (ice->state.global_depth_offset_clamp != cso->cso.offset_clamp) {
7355          crocus_emit_cmd(batch, GENX(3DSTATE_GLOBAL_DEPTH_OFFSET_CLAMP), clamp) {
7356             clamp.GlobalDepthOffsetClamp = cso->cso.offset_clamp;
7357          }
7358          ice->state.global_depth_offset_clamp = cso->cso.offset_clamp;
7359       }
7360 #endif
7361    }
7362 
7363 #if GFX_VER >= 7
7364    if (dirty & CROCUS_DIRTY_GEN7_SBE) {
7365       crocus_emit_sbe(batch, ice);
7366    }
7367 #endif
7368 
7369 #if GFX_VER >= 8
7370    if (dirty & CROCUS_DIRTY_GEN8_PS_BLEND) {
7371       struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_FRAGMENT];
7372       struct crocus_blend_state *cso_blend = ice->state.cso_blend;
7373       struct crocus_depth_stencil_alpha_state *cso_zsa = ice->state.cso_zsa;
7374       struct elk_wm_prog_data *wm_prog_data = (void *) shader->prog_data;
7375       const struct shader_info *fs_info =
7376          crocus_get_shader_info(ice, MESA_SHADER_FRAGMENT);
7377       uint32_t dynamic_pb[GENX(3DSTATE_PS_BLEND_length)];
7378       crocus_pack_command(GENX(3DSTATE_PS_BLEND), &dynamic_pb, pb) {
7379          pb.HasWriteableRT = has_writeable_rt(cso_blend, fs_info);
7380          pb.AlphaTestEnable = cso_zsa->cso.alpha_enabled;
7381          pb.ColorBufferBlendEnable = (cso_blend->blend_enables & 1) &&
7382             (!cso_blend->dual_color_blending || wm_prog_data->dual_src_blend);
7383       }
7384       crocus_emit_merge(batch, cso_blend->ps_blend, dynamic_pb,
7385                         ARRAY_SIZE(cso_blend->ps_blend));
7386    }
7387 #endif
7388 
7389 #if GFX_VER >= 6
7390    if (dirty & CROCUS_DIRTY_GEN6_WM_DEPTH_STENCIL) {
7391 
7392 #if GFX_VER >= 8
7393       crocus_emit_cmd(batch, GENX(3DSTATE_WM_DEPTH_STENCIL), wmds) {
7394          set_depth_stencil_bits(ice, &wmds);
7395       }
7396 #else
7397       uint32_t ds_offset;
7398       void *ds_map = stream_state(batch,
7399                                   sizeof(uint32_t) * GENX(DEPTH_STENCIL_STATE_length),
7400                                   64, &ds_offset);
7401       _crocus_pack_state(batch, GENX(DEPTH_STENCIL_STATE), ds_map, ds) {
7402          set_depth_stencil_bits(ice, &ds);
7403       }
7404 
7405 #if GFX_VER == 6
7406       crocus_emit_cmd(batch, GENX(3DSTATE_CC_STATE_POINTERS), ptr) {
7407          ptr.PointertoDEPTH_STENCIL_STATE = ds_offset;
7408          ptr.DEPTH_STENCIL_STATEChange = true;
7409       }
7410 #else
7411       crocus_emit_cmd(batch, GENX(3DSTATE_DEPTH_STENCIL_STATE_POINTERS), ptr) {
7412          ptr.PointertoDEPTH_STENCIL_STATE = ds_offset;
7413       }
7414 #endif
7415 #endif
7416    }
7417 
7418    if (dirty & CROCUS_DIRTY_GEN6_SCISSOR_RECT) {
7419       /* Align to 64-byte boundary as per anv. */
7420       uint32_t scissor_offset;
7421       struct pipe_scissor_state *scissor_map = (void *)
7422          stream_state(batch, sizeof(struct pipe_scissor_state) * ice->state.num_viewports,
7423                       64, &scissor_offset);
7424       for (int i = 0; i < ice->state.num_viewports; i++) {
7425          struct pipe_scissor_state scissor;
7426          crocus_fill_scissor_rect(ice, i, &scissor);
7427          scissor_map[i] = scissor;
7428       }
7429 
7430       crocus_emit_cmd(batch, GENX(3DSTATE_SCISSOR_STATE_POINTERS), ptr) {
7431          ptr.ScissorRectPointer = scissor_offset;
7432       }
7433    }
7434 #endif
7435 
7436    if (dirty & CROCUS_DIRTY_DEPTH_BUFFER) {
7437       struct isl_device *isl_dev = &batch->screen->isl_dev;
7438 #if GFX_VER >= 6
7439       crocus_emit_depth_stall_flushes(batch);
7440 #endif
7441       void *batch_ptr;
7442       struct crocus_resource *zres, *sres;
7443       struct pipe_framebuffer_state *cso = &ice->state.framebuffer;
7444       batch_ptr = crocus_get_command_space(batch, isl_dev->ds.size);
7445 
7446       struct isl_view view = {
7447                               .base_level = 0,
7448                               .levels = 1,
7449                               .base_array_layer = 0,
7450                               .array_len = 1,
7451                               .swizzle = ISL_SWIZZLE_IDENTITY,
7452       };
7453       struct isl_depth_stencil_hiz_emit_info info = {
7454          .view = &view,
7455          .mocs = crocus_mocs(NULL, isl_dev),
7456       };
7457 
7458       if (cso->zsbuf) {
7459          crocus_get_depth_stencil_resources(&batch->screen->devinfo, cso->zsbuf->texture, &zres, &sres);
7460          struct crocus_surface *zsbuf = (struct crocus_surface *)cso->zsbuf;
7461          if (zsbuf->align_res) {
7462             zres = (struct crocus_resource *)zsbuf->align_res;
7463          }
7464          view.base_level = cso->zsbuf->u.tex.level;
7465          view.base_array_layer = cso->zsbuf->u.tex.first_layer;
7466          view.array_len = cso->zsbuf->u.tex.last_layer - cso->zsbuf->u.tex.first_layer + 1;
7467 
7468          if (zres) {
7469             view.usage |= ISL_SURF_USAGE_DEPTH_BIT;
7470 
7471             info.depth_surf = &zres->surf;
7472             info.depth_address = crocus_command_reloc(batch,
7473                                                       (batch_ptr - batch->command.map) + isl_dev->ds.depth_offset,
7474                                                       zres->bo, 0, RELOC_32BIT);
7475 
7476             info.mocs = crocus_mocs(zres->bo, isl_dev);
7477             view.format = zres->surf.format;
7478 
7479             if (crocus_resource_level_has_hiz(zres, view.base_level)) {
7480                info.hiz_usage = zres->aux.usage;
7481                info.hiz_surf = &zres->aux.surf;
7482                uint64_t hiz_offset = 0;
7483 
7484 #if GFX_VER == 6
7485                /* HiZ surfaces on Sandy Bridge technically don't support
7486                 * mip-mapping.  However, we can fake it by offsetting to the
7487                 * first slice of LOD0 in the HiZ surface.
7488                 */
7489                isl_surf_get_image_offset_B_tile_sa(&zres->aux.surf,
7490                                                    view.base_level, 0, 0,
7491                                                    &hiz_offset, NULL, NULL);
7492 #endif
7493                info.hiz_address = crocus_command_reloc(batch,
7494                                                        (batch_ptr - batch->command.map) + isl_dev->ds.hiz_offset,
7495                                                        zres->aux.bo, zres->aux.offset + hiz_offset,
7496                                                        RELOC_32BIT);
7497                info.depth_clear_value = crocus_resource_get_clear_color(zres).f32[0];
7498             }
7499          }
7500 
7501 #if GFX_VER >= 6
7502          if (sres) {
7503             view.usage |= ISL_SURF_USAGE_STENCIL_BIT;
7504             info.stencil_aux_usage = sres->aux.usage;
7505             info.stencil_surf = &sres->surf;
7506 
7507             uint64_t stencil_offset = 0;
7508 #if GFX_VER == 6
7509             /* Stencil surfaces on Sandy Bridge technically don't support
7510              * mip-mapping.  However, we can fake it by offsetting to the
7511              * first slice of LOD0 in the stencil surface.
7512              */
7513             isl_surf_get_image_offset_B_tile_sa(&sres->surf,
7514                                                 view.base_level, 0, 0,
7515                                                 &stencil_offset, NULL, NULL);
7516 #endif
7517 
7518             info.stencil_address = crocus_command_reloc(batch,
7519                                                         (batch_ptr - batch->command.map) + isl_dev->ds.stencil_offset,
7520                                                         sres->bo, stencil_offset, RELOC_32BIT);
7521             if (!zres) {
7522                view.format = sres->surf.format;
7523                info.mocs = crocus_mocs(sres->bo, isl_dev);
7524             }
7525          }
7526 #endif
7527       }
7528       isl_emit_depth_stencil_hiz_s(isl_dev, batch_ptr, &info);
7529    }
7530 
7531    /* TODO: Disable emitting this until something uses a stipple. */
7532    if (dirty & CROCUS_DIRTY_POLYGON_STIPPLE) {
7533       crocus_emit_cmd(batch, GENX(3DSTATE_POLY_STIPPLE_PATTERN), poly) {
7534          for (int i = 0; i < 32; i++) {
7535             poly.PatternRow[i] = ice->state.poly_stipple.stipple[i];
7536          }
7537       }
7538    }
7539 
7540    if (dirty & CROCUS_DIRTY_LINE_STIPPLE) {
7541       struct crocus_rasterizer_state *cso = ice->state.cso_rast;
7542       crocus_batch_emit(batch, cso->line_stipple, sizeof(cso->line_stipple));
7543    }
7544 
7545 #if GFX_VER >= 8
7546    if (dirty & CROCUS_DIRTY_GEN8_VF_TOPOLOGY) {
7547       crocus_emit_cmd(batch, GENX(3DSTATE_VF_TOPOLOGY), topo) {
7548          topo.PrimitiveTopologyType =
7549             translate_prim_type(draw->mode, ice->state.patch_vertices);
7550       }
7551    }
7552 #endif
7553 
7554 #if GFX_VER <= 5
7555    if (dirty & CROCUS_DIRTY_GEN5_PIPELINED_POINTERS) {
7556       upload_pipelined_state_pointers(batch, ice->shaders.ff_gs_prog ? true : false, ice->shaders.gs_offset,
7557                                       ice->shaders.vs_offset, ice->shaders.sf_offset,
7558                                       ice->shaders.clip_offset, ice->shaders.wm_offset, ice->shaders.cc_offset);
7559       crocus_upload_urb_fence(batch);
7560 
7561       crocus_emit_cmd(batch, GENX(CS_URB_STATE), cs) {
7562         cs.NumberofURBEntries = ice->urb.nr_cs_entries;
7563         cs.URBEntryAllocationSize = ice->urb.csize - 1;
7564       }
7565       dirty |= CROCUS_DIRTY_GEN4_CURBE;
7566    }
7567 #endif
7568    if (dirty & CROCUS_DIRTY_DRAWING_RECTANGLE) {
7569       struct pipe_framebuffer_state *fb = &ice->state.framebuffer;
7570       if (fb->width && fb->height) {
7571          crocus_emit_cmd(batch, GENX(3DSTATE_DRAWING_RECTANGLE), rect) {
7572             rect.ClippedDrawingRectangleXMax = fb->width - 1;
7573             rect.ClippedDrawingRectangleYMax = fb->height - 1;
7574          }
7575       }
7576    }
7577 
7578    if (dirty & CROCUS_DIRTY_VERTEX_BUFFERS) {
7579       const uint32_t user_count = util_bitcount(ice->state.bound_vertex_buffers);
7580       const uint32_t count = user_count +
7581          ice->state.vs_uses_draw_params + ice->state.vs_uses_derived_draw_params;
7582       uint32_t dynamic_bound = ice->state.bound_vertex_buffers;
7583 
7584       if (count) {
7585          const unsigned vb_dwords = GENX(VERTEX_BUFFER_STATE_length);
7586 
7587          uint32_t *map =
7588             crocus_get_command_space(batch, 4 * (1 + vb_dwords * count));
7589          _crocus_pack_command(batch, GENX(3DSTATE_VERTEX_BUFFERS), map, vb) {
7590             vb.DWordLength = (vb_dwords * count + 1) - 2;
7591          }
7592          map += 1;
7593 
7594          uint32_t bound = dynamic_bound;
7595          int i;
7596          while (bound) {
7597             i = u_bit_scan(&bound);
7598             struct pipe_vertex_buffer *buf = &ice->state.vertex_buffers[i];
7599             struct crocus_bo *bo = crocus_resource_bo(buf->buffer.resource);
7600             uint32_t step_rate = ice->state.cso_vertex_elements->step_rate[i];
7601 
7602             emit_vertex_buffer_state(batch, i, bo,
7603                                      buf->buffer_offset,
7604                                      ice->state.vb_end[i],
7605                                      ice->state.cso_vertex_elements->strides[i],
7606                                      step_rate,
7607                                      &map);
7608          }
7609          i = user_count;
7610          if (ice->state.vs_uses_draw_params) {
7611             struct crocus_resource *res = (struct crocus_resource *)ice->draw.draw_params.res;
7612             emit_vertex_buffer_state(batch, i++,
7613                                      res->bo,
7614                                      ice->draw.draw_params.offset,
7615                                      ice->draw.draw_params.res->width0,
7616                                      0, 0, &map);
7617          }
7618          if (ice->state.vs_uses_derived_draw_params) {
7619             struct crocus_resource *res = (struct crocus_resource *)ice->draw.derived_draw_params.res;
7620             emit_vertex_buffer_state(batch, i++,
7621                                      res->bo,
7622                                      ice->draw.derived_draw_params.offset,
7623                                      ice->draw.derived_draw_params.res->width0,
7624                                      0, 0, &map);
7625          }
7626       }
7627    }
7628 
7629    if (dirty & CROCUS_DIRTY_VERTEX_ELEMENTS) {
7630       struct crocus_vertex_element_state *cso = ice->state.cso_vertex_elements;
7631       const unsigned entries = MAX2(cso->count, 1);
7632       if (!(ice->state.vs_needs_sgvs_element ||
7633             ice->state.vs_uses_derived_draw_params ||
7634             ice->state.vs_needs_edge_flag)) {
7635          crocus_batch_emit(batch, cso->vertex_elements, sizeof(uint32_t) *
7636                          (1 + entries * GENX(VERTEX_ELEMENT_STATE_length)));
7637       } else {
7638          uint32_t dynamic_ves[1 + 33 * GENX(VERTEX_ELEMENT_STATE_length)];
7639          const unsigned dyn_count = cso->count +
7640             ice->state.vs_needs_sgvs_element +
7641             ice->state.vs_uses_derived_draw_params;
7642 
7643          crocus_pack_command(GENX(3DSTATE_VERTEX_ELEMENTS),
7644                            &dynamic_ves, ve) {
7645             ve.DWordLength =
7646                1 + GENX(VERTEX_ELEMENT_STATE_length) * dyn_count - 2;
7647          }
7648          memcpy(&dynamic_ves[1], &cso->vertex_elements[1],
7649                 (cso->count - ice->state.vs_needs_edge_flag) *
7650                 GENX(VERTEX_ELEMENT_STATE_length) * sizeof(uint32_t));
7651          uint32_t *ve_pack_dest =
7652             &dynamic_ves[1 + (cso->count - ice->state.vs_needs_edge_flag) *
7653                          GENX(VERTEX_ELEMENT_STATE_length)];
7654 
7655          if (ice->state.vs_needs_sgvs_element) {
7656             uint32_t base_ctrl = ice->state.vs_uses_draw_params ?
7657                                  VFCOMP_STORE_SRC : VFCOMP_STORE_0;
7658             crocus_pack_state(GENX(VERTEX_ELEMENT_STATE), ve_pack_dest, ve) {
7659                ve.Valid = true;
7660                ve.VertexBufferIndex =
7661                   util_bitcount64(ice->state.bound_vertex_buffers);
7662                ve.SourceElementFormat = ISL_FORMAT_R32G32_UINT;
7663                ve.Component0Control = base_ctrl;
7664                ve.Component1Control = base_ctrl;
7665 #if GFX_VER < 8
7666                ve.Component2Control = ice->state.vs_uses_vertexid ? VFCOMP_STORE_VID : VFCOMP_STORE_0;
7667                ve.Component3Control = ice->state.vs_uses_instanceid ? VFCOMP_STORE_IID : VFCOMP_STORE_0;
7668 #else
7669                ve.Component2Control = VFCOMP_STORE_0;
7670                ve.Component3Control = VFCOMP_STORE_0;
7671 #endif
7672 #if GFX_VER < 5
7673                ve.DestinationElementOffset = cso->count * 4;
7674 #endif
7675             }
7676             ve_pack_dest += GENX(VERTEX_ELEMENT_STATE_length);
7677          }
7678          if (ice->state.vs_uses_derived_draw_params) {
7679             crocus_pack_state(GENX(VERTEX_ELEMENT_STATE), ve_pack_dest, ve) {
7680                ve.Valid = true;
7681                ve.VertexBufferIndex =
7682                   util_bitcount64(ice->state.bound_vertex_buffers) +
7683                   ice->state.vs_uses_draw_params;
7684                ve.SourceElementFormat = ISL_FORMAT_R32G32_UINT;
7685                ve.Component0Control = VFCOMP_STORE_SRC;
7686                ve.Component1Control = VFCOMP_STORE_SRC;
7687                ve.Component2Control = VFCOMP_STORE_0;
7688                ve.Component3Control = VFCOMP_STORE_0;
7689 #if GFX_VER < 5
7690                ve.DestinationElementOffset = (cso->count + ice->state.vs_needs_sgvs_element) * 4;
7691 #endif
7692             }
7693             ve_pack_dest += GENX(VERTEX_ELEMENT_STATE_length);
7694          }
7695          if (ice->state.vs_needs_edge_flag) {
7696             for (int i = 0; i < GENX(VERTEX_ELEMENT_STATE_length);  i++)
7697                ve_pack_dest[i] = cso->edgeflag_ve[i];
7698          }
7699 
7700          crocus_batch_emit(batch, &dynamic_ves, sizeof(uint32_t) *
7701                          (1 + dyn_count * GENX(VERTEX_ELEMENT_STATE_length)));
7702       }
7703 
7704 #if GFX_VER == 8
7705       if (!ice->state.vs_needs_edge_flag) {
7706          crocus_batch_emit(batch, cso->vf_instancing, sizeof(uint32_t) *
7707                          entries * GENX(3DSTATE_VF_INSTANCING_length));
7708       } else {
7709          assert(cso->count > 0);
7710          const unsigned edgeflag_index = cso->count - 1;
7711          uint32_t dynamic_vfi[33 * GENX(3DSTATE_VF_INSTANCING_length)];
7712          memcpy(&dynamic_vfi[0], cso->vf_instancing, edgeflag_index *
7713                 GENX(3DSTATE_VF_INSTANCING_length) * sizeof(uint32_t));
7714 
7715          uint32_t *vfi_pack_dest = &dynamic_vfi[0] +
7716             edgeflag_index * GENX(3DSTATE_VF_INSTANCING_length);
7717          crocus_pack_command(GENX(3DSTATE_VF_INSTANCING), vfi_pack_dest, vi) {
7718             vi.VertexElementIndex = edgeflag_index +
7719                ice->state.vs_needs_sgvs_element +
7720                ice->state.vs_uses_derived_draw_params;
7721          }
7722          for (int i = 0; i < GENX(3DSTATE_VF_INSTANCING_length);  i++)
7723             vfi_pack_dest[i] |= cso->edgeflag_vfi[i];
7724 
7725          crocus_batch_emit(batch, &dynamic_vfi[0], sizeof(uint32_t) *
7726                          entries * GENX(3DSTATE_VF_INSTANCING_length));
7727       }
7728 #endif
7729    }
7730 
7731 #if GFX_VER == 8
7732    if (dirty & CROCUS_DIRTY_GEN8_VF_SGVS) {
7733       const struct elk_vs_prog_data *vs_prog_data = (void *)
7734          ice->shaders.prog[MESA_SHADER_VERTEX]->prog_data;
7735       struct crocus_vertex_element_state *cso = ice->state.cso_vertex_elements;
7736 
7737       crocus_emit_cmd(batch, GENX(3DSTATE_VF_SGVS), sgv) {
7738          if (vs_prog_data->uses_vertexid) {
7739             sgv.VertexIDEnable = true;
7740             sgv.VertexIDComponentNumber = 2;
7741             sgv.VertexIDElementOffset =
7742                cso->count - ice->state.vs_needs_edge_flag;
7743          }
7744 
7745          if (vs_prog_data->uses_instanceid) {
7746             sgv.InstanceIDEnable = true;
7747             sgv.InstanceIDComponentNumber = 3;
7748             sgv.InstanceIDElementOffset =
7749                cso->count - ice->state.vs_needs_edge_flag;
7750          }
7751       }
7752    }
7753 #endif
7754 #if GFX_VERx10 >= 75
7755    if (dirty & CROCUS_DIRTY_GEN75_VF) {
7756       crocus_emit_cmd(batch, GENX(3DSTATE_VF), vf) {
7757          if (draw->primitive_restart) {
7758             vf.IndexedDrawCutIndexEnable = true;
7759             vf.CutIndex = draw->restart_index;
7760          }
7761       }
7762    }
7763 #endif
7764 
7765 #if GFX_VER == 8
7766    if (dirty & CROCUS_DIRTY_GEN8_PMA_FIX) {
7767       bool enable = want_pma_fix(ice);
7768       genX(crocus_update_pma_fix)(ice, batch, enable);
7769    }
7770 #endif
7771 
7772 #if GFX_VER <= 5
7773    if (dirty & CROCUS_DIRTY_GEN4_CURBE) {
7774       gen4_upload_curbe(batch);
7775    }
7776 #endif
7777 }
7778 
/**
 * Upload all state needed for a draw call and emit the 3DPRIMITIVE.
 *
 * This flushes dirty render state, (re)emits 3DSTATE_INDEX_BUFFER when the
 * bound index buffer changed, programs the _3DPRIM_* registers for indirect
 * draws (including MI_PREDICATE-based draw-count predication on Gen7+), and
 * finally emits the 3DPRIMITIVE command itself.
 *
 * \param drawid_offset index of this draw within a multi-draw; used to
 *                      compare against the GPU-side draw count buffer.
 * \param indirect      non-NULL for indirect draws (buffer-sourced or
 *                      stream-output-sourced parameters).
 * \param sc            direct draw start/count/index-bias parameters.
 */
static void
crocus_upload_render_state(struct crocus_context *ice,
                           struct crocus_batch *batch,
                           const struct pipe_draw_info *draw,
                           unsigned drawid_offset,
                           const struct pipe_draw_indirect_info *indirect,
                           const struct pipe_draw_start_count_bias *sc)
{
#if GFX_VER >= 7
   /* Whether 3DPRIMITIVE should be predicated (conditional rendering). May
    * also be forced on below when using an indirect draw-count buffer.
    */
   bool use_predicate = ice->state.predicate == CROCUS_PREDICATE_STATE_USE_BIT;
#endif

   /* Don't allow the batch to wrap while we emit draw-related state. */
   batch->no_wrap = true;
   batch->contains_draw = true;

   crocus_update_surface_base_address(batch);

   crocus_upload_dirty_render_state(ice, batch, draw);

   batch->no_wrap = false;
   if (draw->index_size > 0) {
      unsigned offset;
      unsigned size;
      bool emit_index = false;

      if (draw->has_user_indices) {
         /* User-pointer indices: copy them into a GPU-visible upload buffer.
          * Only the [start, start+count) range is uploaded.
          */
         unsigned start_offset = draw->index_size * sc->start;
         u_upload_data(ice->ctx.stream_uploader, 0,
                       sc->count * draw->index_size, 4,
                       (char *)draw->index.user + start_offset,
                       &offset, &ice->state.index_buffer.res);
         /* Bias the buffer start back by start_offset so the hardware's
          * sc->start-based indexing lands on the uploaded data.
          */
         offset -= start_offset;
         size = start_offset + sc->count * draw->index_size;
         emit_index = true;
      } else {
         struct crocus_resource *res = (void *) draw->index.resource;

         /* Resource-backed indices: only re-emit if the buffer changed. */
         if (ice->state.index_buffer.res != draw->index.resource) {
            res->bind_history |= PIPE_BIND_INDEX_BUFFER;
            pipe_resource_reference(&ice->state.index_buffer.res,
                                    draw->index.resource);
            emit_index = true;
         }
         offset = 0;
         size = draw->index.resource->width0;
      }

      /* Even with the same buffer, re-emit if size/format (or, pre-Haswell,
       * the primitive-restart flag baked into 3DSTATE_INDEX_BUFFER) changed.
       */
      if (!emit_index &&
          (ice->state.index_buffer.size != size ||
           ice->state.index_buffer.index_size != draw->index_size
#if GFX_VERx10 < 75
           || ice->state.index_buffer.prim_restart != draw->primitive_restart
#endif
	   )
	  )
         emit_index = true;

      if (emit_index) {
         struct crocus_bo *bo = crocus_resource_bo(ice->state.index_buffer.res);

         crocus_emit_cmd(batch, GENX(3DSTATE_INDEX_BUFFER), ib) {
#if GFX_VERx10 < 75
            /* Pre-Haswell, cut-index lives here; Haswell+ uses 3DSTATE_VF. */
            ib.CutIndexEnable = draw->primitive_restart;
#endif
            /* Maps index_size 1/2/4 bytes to hardware formats 0/1/2. */
            ib.IndexFormat = draw->index_size >> 1;
            ib.BufferStartingAddress = ro_bo(bo, offset);
#if GFX_VER >= 8
            ib.BufferSize = bo->size - offset;
#else
            /* Older gens take an inclusive end address instead of a size. */
            ib.BufferEndingAddress = ro_bo(bo, offset + size - 1);
#endif
#if GFX_VER >= 6
            ib.MOCS = crocus_mocs(bo, &batch->screen->isl_dev);
#endif
         }
         /* Cache what we emitted so redundant re-emits can be skipped. */
         ice->state.index_buffer.size = size;
         ice->state.index_buffer.offset = offset;
         ice->state.index_buffer.index_size = draw->index_size;
#if GFX_VERx10 < 75
         ice->state.index_buffer.prim_restart = draw->primitive_restart;
#endif
      }
   }

/* MMIO offsets of the 3DPRIMITIVE parameter registers, loaded directly for
 * indirect draws via MI_LOAD_REGISTER_MEM/IMM.
 */
#define _3DPRIM_END_OFFSET          0x2420
#define _3DPRIM_START_VERTEX        0x2430
#define _3DPRIM_VERTEX_COUNT        0x2434
#define _3DPRIM_INSTANCE_COUNT      0x2438
#define _3DPRIM_START_INSTANCE      0x243C
#define _3DPRIM_BASE_VERTEX         0x2440

#if GFX_VER >= 7
   if (indirect && !indirect->count_from_stream_output) {
      if (indirect->indirect_draw_count) {
         /* Multi-draw-indirect with a GPU-side draw count: predicate this
          * draw on (drawid_offset < draw count).
          */
         use_predicate = true;

         struct crocus_bo *draw_count_bo =
            crocus_resource_bo(indirect->indirect_draw_count);
         unsigned draw_count_offset =
            indirect->indirect_draw_count_offset;

         crocus_emit_pipe_control_flush(batch,
                                        "ensure indirect draw buffer is flushed",
                                        PIPE_CONTROL_FLUSH_ENABLE);
         if (ice->state.predicate == CROCUS_PREDICATE_STATE_USE_BIT) {
            /* Conditional rendering is also active: combine it (stored in
             * CS_GPR(15)) with the draw-count comparison.
             */
#if GFX_VERx10 >= 75
            struct mi_builder b;
            mi_builder_init(&b, &batch->screen->devinfo, batch);

            /* comparison = draw id < draw count */
            struct mi_value comparison =
               mi_ult(&b, mi_imm(drawid_offset),
                      mi_mem32(ro_bo(draw_count_bo,
                                     draw_count_offset)));
#if GFX_VER == 8
            /* predicate = comparison & conditional rendering predicate */
            mi_store(&b, mi_reg32(MI_PREDICATE_RESULT),
                         mi_iand(&b, comparison, mi_reg32(CS_GPR(15))));
#else
            /* predicate = comparison & conditional rendering predicate */
            struct mi_value pred = mi_iand(&b, comparison,
                                           mi_reg32(CS_GPR(15)));

            /* Pre-Gen8 cannot write MI_PREDICATE_RESULT directly; load the
             * sources and run an MI_PREDICATE to derive it instead.
             */
            mi_store(&b, mi_reg64(MI_PREDICATE_SRC0), pred);
            mi_store(&b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(0));

            unsigned mi_predicate = MI_PREDICATE | MI_PREDICATE_LOADOP_LOADINV |
               MI_PREDICATE_COMBINEOP_SET |
               MI_PREDICATE_COMPAREOP_SRCS_EQUAL;

            crocus_batch_emit(batch, &mi_predicate, sizeof(uint32_t));
#endif
#endif
         } else {
            uint32_t mi_predicate;

            /* Upload the id of the current primitive to MI_PREDICATE_SRC1. */
            crocus_load_register_imm64(batch, MI_PREDICATE_SRC1, drawid_offset);
            /* Upload the current draw count from the draw parameters buffer
             * to MI_PREDICATE_SRC0.
             */
            crocus_load_register_mem32(batch, MI_PREDICATE_SRC0,
                                       draw_count_bo, draw_count_offset);
            /* Zero the top 32-bits of MI_PREDICATE_SRC0 */
            crocus_load_register_imm32(batch, MI_PREDICATE_SRC0 + 4, 0);

            if (drawid_offset == 0) {
               /* First draw: predicate is simply (0 != draw_count). */
               mi_predicate = MI_PREDICATE | MI_PREDICATE_LOADOP_LOADINV |
                  MI_PREDICATE_COMBINEOP_SET |
                  MI_PREDICATE_COMPAREOP_SRCS_EQUAL;
            } else {
               /* While draw_index < draw_count the predicate's result will be
                *  (draw_index == draw_count) ^ TRUE = TRUE
                * When draw_index == draw_count the result is
                *  (TRUE) ^ TRUE = FALSE
                * After this all results will be:
                *  (FALSE) ^ FALSE = FALSE
                */
               mi_predicate = MI_PREDICATE | MI_PREDICATE_LOADOP_LOAD |
                  MI_PREDICATE_COMBINEOP_XOR |
                  MI_PREDICATE_COMPAREOP_SRCS_EQUAL;
            }
            crocus_batch_emit(batch, &mi_predicate, sizeof(uint32_t));
         }
      }

#if GFX_VER >= 7
      /* Load the 3DPRIMITIVE parameters straight from the indirect buffer.
       * Layout matches VkDrawIndexedIndirectCommand/VkDrawIndirectCommand:
       * indexed draws carry a base vertex, non-indexed ones do not.
       */
      struct crocus_bo *bo = crocus_resource_bo(indirect->buffer);
      assert(bo);

      crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
         lrm.RegisterAddress = _3DPRIM_VERTEX_COUNT;
         lrm.MemoryAddress = ro_bo(bo, indirect->offset + 0);
      }
      crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
         lrm.RegisterAddress = _3DPRIM_INSTANCE_COUNT;
         lrm.MemoryAddress = ro_bo(bo, indirect->offset + 4);
      }
      crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
         lrm.RegisterAddress = _3DPRIM_START_VERTEX;
         lrm.MemoryAddress = ro_bo(bo, indirect->offset + 8);
      }
      if (draw->index_size) {
         crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
            lrm.RegisterAddress = _3DPRIM_BASE_VERTEX;
            lrm.MemoryAddress = ro_bo(bo, indirect->offset + 12);
         }
         crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
            lrm.RegisterAddress = _3DPRIM_START_INSTANCE;
            lrm.MemoryAddress = ro_bo(bo, indirect->offset + 16);
         }
      } else {
         crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
            lrm.RegisterAddress = _3DPRIM_START_INSTANCE;
            lrm.MemoryAddress = ro_bo(bo, indirect->offset + 12);
         }
         /* No base vertex in the non-indexed layout; clear the register. */
         crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
            lri.RegisterOffset = _3DPRIM_BASE_VERTEX;
            lri.DataDWord = 0;
         }
      }
#endif
   } else if (indirect && indirect->count_from_stream_output) {
#if GFX_VERx10 >= 75
      /* Transform-feedback draw: derive the vertex count on the GPU from the
       * stream output write offset (bytes written / stride).
       */
      struct crocus_stream_output_target *so =
         (void *) indirect->count_from_stream_output;

      /* XXX: Replace with actual cache tracking */
      crocus_emit_pipe_control_flush(batch,
                                     "draw count from stream output stall",
                                     PIPE_CONTROL_CS_STALL);

      struct mi_builder b;
      mi_builder_init(&b, &batch->screen->devinfo, batch);

      struct crocus_address addr =
         ro_bo(crocus_resource_bo(&so->offset_res->base.b), so->offset_offset);
      struct mi_value offset =
         mi_iadd_imm(&b, mi_mem32(addr), -so->base.buffer_offset);

      mi_store(&b, mi_reg32(_3DPRIM_VERTEX_COUNT),
               mi_udiv32_imm(&b, offset, so->stride));

      /* Remaining draw parameters are constants for this path. */
      _crocus_emit_lri(batch, _3DPRIM_START_VERTEX, 0);
      _crocus_emit_lri(batch, _3DPRIM_BASE_VERTEX, 0);
      _crocus_emit_lri(batch, _3DPRIM_START_INSTANCE, 0);
      _crocus_emit_lri(batch, _3DPRIM_INSTANCE_COUNT, draw->instance_count);
#endif
   }
#else
   /* Indirect draws require Gen7+; callers must not request them earlier. */
   assert(!indirect);
#endif

   crocus_emit_cmd(batch, GENX(3DPRIMITIVE), prim) {
      prim.VertexAccessType = draw->index_size > 0 ? RANDOM : SEQUENTIAL;
#if GFX_VER >= 7
      prim.PredicateEnable = use_predicate;
#endif

      prim.PrimitiveTopologyType = translate_prim_type(ice->state.prim_mode, ice->state.patch_vertices);
      if (indirect) {
         // XXX Probably have to do something for gen6 here?
#if GFX_VER >= 7
         prim.IndirectParameterEnable = true;
#endif
      } else {
#if GFX_VER >= 5
         prim.StartInstanceLocation = draw->start_instance;
#endif
         prim.InstanceCount = draw->instance_count;
         prim.VertexCountPerInstance = sc->count;

         prim.StartVertexLocation = sc->start;

         if (draw->index_size) {
            prim.BaseVertexLocation += sc->index_bias;
         }
      }
   }
}
8039 
8040 #if GFX_VER >= 7
8041 
/**
 * Upload all dirty compute-pipeline state and emit a GPGPU_WALKER to
 * launch the grid described by @grid.
 *
 * Only the sections whose CROCUS_STAGE_DIRTY_* bits are set are
 * re-emitted: system values, the binding table, sampler states,
 * MEDIA_VFE_STATE, the CURBE push-constant buffer, and the interface
 * descriptor.  For indirect dispatches, the grid dimensions are loaded
 * from the indirect buffer into the GPGPU dispatch registers; on gen7
 * an MI_PREDICATE chain additionally skips the walker when any
 * dimension is zero.
 */
static void
crocus_upload_compute_state(struct crocus_context *ice,
                            struct crocus_batch *batch,
                            const struct pipe_grid_info *grid)
{
   const uint64_t stage_dirty = ice->state.stage_dirty;
   struct crocus_screen *screen = batch->screen;
   const struct intel_device_info *devinfo = &screen->devinfo;
   struct crocus_shader_state *shs = &ice->state.shaders[MESA_SHADER_COMPUTE];
   struct crocus_compiled_shader *shader =
      ice->shaders.prog[MESA_SHADER_COMPUTE];
   struct elk_stage_prog_data *prog_data = shader->prog_data;
   struct elk_cs_prog_data *cs_prog_data = (void *) prog_data;
   /* SIMD size and thread count chosen for this workgroup size. */
   const struct intel_cs_dispatch_info dispatch =
      elk_cs_get_dispatch_info(devinfo, cs_prog_data, grid->block);

   crocus_update_surface_base_address(batch);

   /* Upload builtin uniform values (sysvals) if constants are dirty. */
   if ((stage_dirty & CROCUS_STAGE_DIRTY_CONSTANTS_CS) && shs->sysvals_need_upload)
      upload_sysvals(ice, MESA_SHADER_COMPUTE);

   /* Rebuild and re-upload the binding table when CS bindings changed. */
   if (stage_dirty & CROCUS_STAGE_DIRTY_BINDINGS_CS) {
      crocus_populate_binding_table(ice, batch, MESA_SHADER_COMPUTE, false);
      ice->shaders.prog[MESA_SHADER_COMPUTE]->bind_bo_offset =
         crocus_upload_binding_table(ice, batch,
                                     ice->shaders.prog[MESA_SHADER_COMPUTE]->surf_offset,
                                     ice->shaders.prog[MESA_SHADER_COMPUTE]->bt.size_bytes);
   }

   if (stage_dirty & CROCUS_STAGE_DIRTY_SAMPLER_STATES_CS)
      crocus_upload_sampler_states(ice, batch, MESA_SHADER_COMPUTE);

   /* MEDIA_VFE_STATE must also be re-emitted for variable workgroup
    * sizes, since the dispatch parameters depend on grid->block.
    */
   if ((stage_dirty & CROCUS_STAGE_DIRTY_CS) ||
       cs_prog_data->local_size[0] == 0 /* Variable local group size */) {
      /* The MEDIA_VFE_STATE documentation for Gen8+ says:
       *
       *   "A stalling PIPE_CONTROL is required before MEDIA_VFE_STATE unless
       *    the only bits that are changed are scoreboard related: Scoreboard
       *    Enable, Scoreboard Type, Scoreboard Mask, Scoreboard Delta.  For
       *    these scoreboard related states, a MEDIA_STATE_FLUSH is
       *    sufficient."
       */
      crocus_emit_pipe_control_flush(batch,
                                     "workaround: stall before MEDIA_VFE_STATE",
                                     PIPE_CONTROL_CS_STALL);

      crocus_emit_cmd(batch, GENX(MEDIA_VFE_STATE), vfe) {
         if (prog_data->total_scratch) {
            struct crocus_bo *bo =
               crocus_get_scratch_space(ice, prog_data->total_scratch,
                                        MESA_SHADER_COMPUTE);
#if GFX_VER == 8
            /* Broadwell's Per Thread Scratch Space is in the range [0, 11]
             * where 0 = 1k, 1 = 2k, 2 = 4k, ..., 11 = 2M.
             */
            vfe.PerThreadScratchSpace = ffs(prog_data->total_scratch) - 11;
#elif GFX_VERx10 == 75
            /* Haswell's Per Thread Scratch Space is in the range [0, 10]
             * where 0 = 2k, 1 = 4k, 2 = 8k, ..., 10 = 2M.
             */
            vfe.PerThreadScratchSpace = ffs(prog_data->total_scratch) - 12;
#else
            /* Earlier platforms use the range [0, 11] to mean [1kB, 12kB]
             * where 0 = 1kB, 1 = 2kB, 2 = 3kB, ..., 11 = 12kB.
             */
            vfe.PerThreadScratchSpace = prog_data->total_scratch / 1024 - 1;
#endif
            vfe.ScratchSpaceBasePointer = rw_bo(bo, 0);
         }

         /* Allow the maximum thread count across all subslices. */
         vfe.MaximumNumberofThreads =
            devinfo->max_cs_threads * devinfo->subslice_total - 1;
         vfe.ResetGatewayTimer =
            Resettingrelativetimerandlatchingtheglobaltimestamp;
         vfe.BypassGatewayControl = true;
#if GFX_VER == 7
         vfe.GPGPUMode = true;
#endif
#if GFX_VER == 8
         /* NOTE(review): BypassGatewayControl is already set
          * unconditionally a few lines up; this gen8-only assignment
          * looks redundant — confirm against the genxml definition.
          */
         vfe.BypassGatewayControl = true;
#endif
         vfe.NumberofURBEntries = GFX_VER == 8 ? 2 : 0;
         vfe.URBEntryAllocationSize = GFX_VER == 8 ? 2 : 0;

         /* CURBE space for the per-thread push constants (one copy per
          * thread) plus the shared cross-thread constants.
          */
         vfe.CURBEAllocationSize =
            ALIGN(cs_prog_data->push.per_thread.regs * dispatch.threads +
                  cs_prog_data->push.cross_thread.regs, 2);
      }
   }

   /* TODO: Combine subgroup-id with cbuf0 so we can push regular uniforms */
   if ((stage_dirty & CROCUS_STAGE_DIRTY_CS) ||
       cs_prog_data->local_size[0] == 0 /* Variable local group size */) {
      uint32_t curbe_data_offset = 0;
      /* Only the subgroup-id builtin is pushed today; anything more
       * would require the TODO above.
       */
      assert(cs_prog_data->push.cross_thread.dwords == 0 &&
             cs_prog_data->push.per_thread.dwords == 1 &&
             cs_prog_data->base.param[0] == ELK_PARAM_BUILTIN_SUBGROUP_ID);
      const unsigned push_const_size =
         elk_cs_push_const_total_size(cs_prog_data, dispatch.threads);
      uint32_t *curbe_data_map =
         stream_state(batch,
                      ALIGN(push_const_size, 64), 64,
                      &curbe_data_offset);
      assert(curbe_data_map);
      /* Fill with a 0x5a poison pattern before writing the real data. */
      memset(curbe_data_map, 0x5a, ALIGN(push_const_size, 64));
      crocus_fill_cs_push_const_buffer(cs_prog_data, dispatch.threads,
                                       curbe_data_map);

      crocus_emit_cmd(batch, GENX(MEDIA_CURBE_LOAD), curbe) {
         curbe.CURBETotalDataLength = ALIGN(push_const_size, 64);
         curbe.CURBEDataStartAddress = curbe_data_offset;
      }
   }

   /* The interface descriptor ties together the kernel, samplers, and
    * binding table, so any of those changing forces a re-emit.
    */
   if (stage_dirty & (CROCUS_STAGE_DIRTY_SAMPLER_STATES_CS |
                      CROCUS_STAGE_DIRTY_BINDINGS_CS |
                      CROCUS_STAGE_DIRTY_CONSTANTS_CS |
                      CROCUS_STAGE_DIRTY_CS)) {
      uint32_t desc[GENX(INTERFACE_DESCRIPTOR_DATA_length)];
      /* Kernel entry point for the SIMD width chosen by dispatch. */
      const uint64_t ksp = KSP(ice,shader) + elk_cs_prog_data_prog_offset(cs_prog_data, dispatch.simd_size);
      crocus_pack_state(GENX(INTERFACE_DESCRIPTOR_DATA), desc, idd) {
         idd.KernelStartPointer = ksp;
         idd.SamplerStatePointer = shs->sampler_offset;
         idd.BindingTablePointer = ice->shaders.prog[MESA_SHADER_COMPUTE]->bind_bo_offset;
         idd.BindingTableEntryCount = MIN2(shader->bt.size_bytes / 4, 31);
         idd.NumberofThreadsinGPGPUThreadGroup = dispatch.threads;
         idd.ConstantURBEntryReadLength = cs_prog_data->push.per_thread.regs;
         idd.BarrierEnable = cs_prog_data->uses_barrier;
         idd.SharedLocalMemorySize = intel_compute_slm_encode_size(GFX_VER,
                                                                   prog_data->total_shared);
#if GFX_VERx10 >= 75
         idd.CrossThreadConstantDataReadLength = cs_prog_data->push.cross_thread.regs;
#endif
      }

      crocus_emit_cmd(batch, GENX(MEDIA_INTERFACE_DESCRIPTOR_LOAD), load) {
         load.InterfaceDescriptorTotalLength =
            GENX(INTERFACE_DESCRIPTOR_DATA_length) * sizeof(uint32_t);
         load.InterfaceDescriptorDataStartAddress =
            emit_state(batch, desc, sizeof(desc), 64);
      }
   }

/* Registers that source the dispatch dimensions for an indirect
 * GPGPU_WALKER.
 */
#define GPGPU_DISPATCHDIMX 0x2500
#define GPGPU_DISPATCHDIMY 0x2504
#define GPGPU_DISPATCHDIMZ 0x2508

   if (grid->indirect) {
      /* Load the X/Y/Z group counts straight from the indirect buffer. */
      struct crocus_state_ref *grid_size = &ice->state.grid_size;
      struct crocus_bo *bo = crocus_resource_bo(grid_size->res);
      crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
         lrm.RegisterAddress = GPGPU_DISPATCHDIMX;
         lrm.MemoryAddress = ro_bo(bo, grid_size->offset + 0);
      }
      crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
         lrm.RegisterAddress = GPGPU_DISPATCHDIMY;
         lrm.MemoryAddress = ro_bo(bo, grid_size->offset + 4);
      }
      crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
         lrm.RegisterAddress = GPGPU_DISPATCHDIMZ;
         lrm.MemoryAddress = ro_bo(bo, grid_size->offset + 8);
      }

#if GFX_VER == 7
      /* Build a predicate that is set iff any dimension is zero, then
       * invert it, so the predicated GPGPU_WALKER below is skipped for
       * empty dispatches.
       */
      /* Clear upper 32-bits of SRC0 and all 64-bits of SRC1 */
      _crocus_emit_lri(batch, MI_PREDICATE_SRC0 + 4, 0);
      crocus_load_register_imm64(batch, MI_PREDICATE_SRC1, 0);

      /* Load compute_dispatch_indirect_x_size into SRC0 */
      crocus_load_register_mem32(batch, MI_PREDICATE_SRC0, bo, grid_size->offset + 0);

      /* predicate = (compute_dispatch_indirect_x_size == 0); */
      crocus_emit_cmd(batch, GENX(MI_PREDICATE), mip) {
         mip.LoadOperation    = LOAD_LOAD;
         mip.CombineOperation = COMBINE_SET;
         mip.CompareOperation = COMPARE_SRCS_EQUAL;
      };

      /* Load compute_dispatch_indirect_y_size into SRC0 */
      crocus_load_register_mem32(batch, MI_PREDICATE_SRC0, bo, grid_size->offset + 4);

      /* predicate = (compute_dispatch_indirect_y_size == 0); */
      crocus_emit_cmd(batch, GENX(MI_PREDICATE), mip) {
         mip.LoadOperation    = LOAD_LOAD;
         mip.CombineOperation = COMBINE_OR;
         mip.CompareOperation = COMPARE_SRCS_EQUAL;
      };

      /* Load compute_dispatch_indirect_z_size into SRC0 */
      crocus_load_register_mem32(batch, MI_PREDICATE_SRC0, bo, grid_size->offset + 8);

      /* predicate = (compute_dispatch_indirect_z_size == 0); */
      crocus_emit_cmd(batch, GENX(MI_PREDICATE), mip) {
         mip.LoadOperation    = LOAD_LOAD;
         mip.CombineOperation = COMBINE_OR;
         mip.CompareOperation = COMPARE_SRCS_EQUAL;
      };

      /* predicate = !predicate; */
#define COMPARE_FALSE                           1
      crocus_emit_cmd(batch, GENX(MI_PREDICATE), mip) {
         mip.LoadOperation    = LOAD_LOADINV;
         mip.CombineOperation = COMBINE_OR;
         mip.CompareOperation = COMPARE_FALSE;
      }
#endif
   }

   crocus_emit_cmd(batch, GENX(GPGPU_WALKER), ggw) {
      ggw.IndirectParameterEnable    = grid->indirect != NULL;
      ggw.PredicateEnable            = GFX_VER <= 7 && grid->indirect != NULL;
      ggw.SIMDSize                   = dispatch.simd_size / 16;
      ggw.ThreadDepthCounterMaximum  = 0;
      ggw.ThreadHeightCounterMaximum = 0;
      ggw.ThreadWidthCounterMaximum  = dispatch.threads - 1;
      ggw.ThreadGroupIDXDimension    = grid->grid[0];
      ggw.ThreadGroupIDYDimension    = grid->grid[1];
      ggw.ThreadGroupIDZDimension    = grid->grid[2];
      ggw.RightExecutionMask         = dispatch.right_mask;
      ggw.BottomExecutionMask        = 0xffffffff;
   }

   crocus_emit_cmd(batch, GENX(MEDIA_STATE_FLUSH), msf);

   /* Record that this batch now contains real work. */
   batch->contains_draw = true;
}
8267 
8268 #endif /* GFX_VER >= 7 */
8269 
8270 /**
8271  * State module teardown.
8272  */
8273 static void
8274 crocus_destroy_state(struct crocus_context *ice)
8275 {
8276    struct pipe_framebuffer_state *cso = &ice->state.framebuffer;
8277 
8278    pipe_resource_reference(&ice->draw.draw_params.res, NULL);
8279    pipe_resource_reference(&ice->draw.derived_draw_params.res, NULL);
8280 
8281    free(ice->state.genx);
8282 
8283    for (int i = 0; i < 4; i++) {
8284       pipe_so_target_reference(&ice->state.so_target[i], NULL);
8285    }
8286 
8287    util_unreference_framebuffer_state(cso);
8288 
8289    for (int stage = 0; stage < MESA_SHADER_STAGES; stage++) {
8290       struct crocus_shader_state *shs = &ice->state.shaders[stage];
8291       for (int i = 0; i < PIPE_MAX_CONSTANT_BUFFERS; i++) {
8292          pipe_resource_reference(&shs->constbufs[i].buffer, NULL);
8293       }
8294       for (int i = 0; i < PIPE_MAX_SHADER_IMAGES; i++) {
8295          pipe_resource_reference(&shs->image[i].base.resource, NULL);
8296       }
8297       for (int i = 0; i < PIPE_MAX_SHADER_BUFFERS; i++) {
8298          pipe_resource_reference(&shs->ssbo[i].buffer, NULL);
8299       }
8300       for (int i = 0; i < CROCUS_MAX_TEXTURE_SAMPLERS; i++) {
8301          pipe_sampler_view_reference((struct pipe_sampler_view **)
8302                                      &shs->textures[i], NULL);
8303       }
8304    }
8305 
8306    for (int i = 0; i < 16; i++)
8307       pipe_resource_reference(&ice->state.vertex_buffers[i].buffer.resource, NULL);
8308    pipe_resource_reference(&ice->state.grid_size.res, NULL);
8309 
8310    pipe_resource_reference(&ice->state.index_buffer.res, NULL);
8311 }
8312 
8313 /* ------------------------------------------------------------------- */
8314 
/**
 * A buffer resource got a new backing BO (res->bo), e.g. after an
 * invalidating reallocation.  Walk every binding point that may still
 * reference the old BO and either flag the matching dirty bits so the
 * state is re-emitted, or re-bind the resource where the driver holds
 * derived state (SSBOs, the cached index buffer).
 *
 * The walk is narrowed by res->bind_history (which binding points ever
 * saw this resource) and res->bind_stages (which shader stages).
 */
static void
crocus_rebind_buffer(struct crocus_context *ice,
                     struct crocus_resource *res)
{
   struct pipe_context *ctx = &ice->ctx;

   assert(res->base.b.target == PIPE_BUFFER);

   /* Buffers can't be framebuffer attachments, nor display related,
    * and we don't have upstream Clover support.
    */
   assert(!(res->bind_history & (PIPE_BIND_DEPTH_STENCIL |
                                 PIPE_BIND_RENDER_TARGET |
                                 PIPE_BIND_BLENDABLE |
                                 PIPE_BIND_DISPLAY_TARGET |
                                 PIPE_BIND_CURSOR |
                                 PIPE_BIND_COMPUTE_RESOURCE |
                                 PIPE_BIND_GLOBAL)));

   /* Vertex buffers: just flag them dirty so they get re-emitted. */
   if (res->bind_history & PIPE_BIND_VERTEX_BUFFER) {
      uint64_t bound_vbs = ice->state.bound_vertex_buffers;
      while (bound_vbs) {
         const int i = u_bit_scan64(&bound_vbs);
         struct pipe_vertex_buffer *buffer = &ice->state.vertex_buffers[i];

         if (!buffer->is_user_buffer && &res->base.b == buffer->buffer.resource)
            ice->state.dirty |= CROCUS_DIRTY_VERTEX_BUFFERS;
      }
   }

   /* Index buffer: drop the cached reference; presumably it gets
    * re-established on the next indexed draw — TODO confirm.
    */
   if ((res->bind_history & PIPE_BIND_INDEX_BUFFER) &&
       ice->state.index_buffer.res) {
      if (res->bo == crocus_resource_bo(ice->state.index_buffer.res))
         pipe_resource_reference(&ice->state.index_buffer.res, NULL);
   }
   /* There is no need to handle these:
    * - PIPE_BIND_COMMAND_ARGS_BUFFER (emitted for every indirect draw)
    * - PIPE_BIND_QUERY_BUFFER (no persistent state references)
    */

   if (res->bind_history & PIPE_BIND_STREAM_OUTPUT) {
      /* XXX: be careful about resetting vs appending... */
      for (int i = 0; i < 4; i++) {
         if (ice->state.so_target[i] &&
             (ice->state.so_target[i]->buffer == &res->base.b)) {
#if GFX_VER == 6
            /* NOTE(review): gen6 apparently routes SO through the GS
             * stage, hence the GS-bindings dirty bit here — confirm.
             */
            ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_GS;
#else
            ice->state.dirty |= CROCUS_DIRTY_GEN7_SO_BUFFERS;
#endif
         }
      }
   }

   /* Per-stage bindings, restricted to stages that ever saw @res. */
   for (int s = MESA_SHADER_VERTEX; s < MESA_SHADER_STAGES; s++) {
      struct crocus_shader_state *shs = &ice->state.shaders[s];
      enum pipe_shader_type p_stage = stage_to_pipe(s);

      if (!(res->bind_stages & (1 << s)))
         continue;

      if (res->bind_history & PIPE_BIND_CONSTANT_BUFFER) {
         /* Skip constant buffer 0, it's for regular uniforms, not UBOs */
         uint32_t bound_cbufs = shs->bound_cbufs & ~1u;
         while (bound_cbufs) {
            const int i = u_bit_scan(&bound_cbufs);
            struct pipe_constant_buffer *cbuf = &shs->constbufs[i];

            if (res->bo == crocus_resource_bo(cbuf->buffer)) {
               ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_VS << s;
            }
         }
      }

      if (res->bind_history & PIPE_BIND_SHADER_BUFFER) {
         uint32_t bound_ssbos = shs->bound_ssbos;
         while (bound_ssbos) {
            const int i = u_bit_scan(&bound_ssbos);
            struct pipe_shader_buffer *ssbo = &shs->ssbo[i];

            if (res->bo == crocus_resource_bo(ssbo->buffer)) {
               /* Re-bind the SSBO with identical parameters so all
                * derived state picks up the new BO.
                */
               struct pipe_shader_buffer buf = {
                  .buffer = &res->base.b,
                  .buffer_offset = ssbo->buffer_offset,
                  .buffer_size = ssbo->buffer_size,
               };
               crocus_set_shader_buffers(ctx, p_stage, i, 1, &buf,
                                         (shs->writable_ssbos >> i) & 1);
            }
         }
      }

      if (res->bind_history & PIPE_BIND_SAMPLER_VIEW) {
         uint32_t bound_sampler_views = shs->bound_sampler_views;
         while (bound_sampler_views) {
            const int i = u_bit_scan(&bound_sampler_views);
            struct crocus_sampler_view *isv = shs->textures[i];
            struct crocus_bo *bo = isv->res->bo;

            if (res->bo == bo) {
               ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_VS << s;
            }
         }
      }

      if (res->bind_history & PIPE_BIND_SHADER_IMAGE) {
         uint32_t bound_image_views = shs->bound_image_views;
         while (bound_image_views) {
            const int i = u_bit_scan(&bound_image_views);
            struct crocus_image_view *iv = &shs->image[i];
            struct crocus_bo *bo = crocus_resource_bo(iv->base.resource);

            if (res->bo == bo)
               ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_VS << s;
         }
      }
   }
}
8433 
8434 /* ------------------------------------------------------------------- */
8435 
8436 static unsigned
8437 flags_to_post_sync_op(uint32_t flags)
8438 {
8439    if (flags & PIPE_CONTROL_WRITE_IMMEDIATE)
8440       return WriteImmediateData;
8441 
8442    if (flags & PIPE_CONTROL_WRITE_DEPTH_COUNT)
8443       return WritePSDepthCount;
8444 
8445    if (flags & PIPE_CONTROL_WRITE_TIMESTAMP)
8446       return WriteTimestamp;
8447 
8448    return 0;
8449 }
8450 
8451 /*
8452  * Do the given flags have a Post Sync or LRI Post Sync operation?
8453  */
8454 static enum pipe_control_flags
8455 get_post_sync_flags(enum pipe_control_flags flags)
8456 {
8457    flags &= PIPE_CONTROL_WRITE_IMMEDIATE |
8458             PIPE_CONTROL_WRITE_DEPTH_COUNT |
8459             PIPE_CONTROL_WRITE_TIMESTAMP |
8460             PIPE_CONTROL_LRI_POST_SYNC_OP;
8461 
8462    /* Only one "Post Sync Op" is allowed, and it's mutually exclusive with
8463     * "LRI Post Sync Operation".  So more than one bit set would be illegal.
8464     */
8465    assert(util_bitcount(flags) <= 1);
8466 
8467    return flags;
8468 }
8469 
/* True when @batch is the compute/GPGPU batch rather than the render batch.
 * Argument parenthesized for macro hygiene (was `batch->name`).
 */
#define IS_COMPUTE_PIPELINE(batch) ((batch)->name == CROCUS_BATCH_COMPUTE)
8471 
8472 /**
8473  * Emit a series of PIPE_CONTROL commands, taking into account any
8474  * workarounds necessary to actually accomplish the caller's request.
8475  *
8476  * Unless otherwise noted, spec quotations in this function come from:
8477  *
8478  * Synchronization of the 3D Pipeline > PIPE_CONTROL Command > Programming
8479  * Restrictions for PIPE_CONTROL.
8480  *
8481  * You should not use this function directly.  Use the helpers in
8482  * crocus_pipe_control.c instead, which may split the pipe control further.
8483  */
8484 static void
8485 crocus_emit_raw_pipe_control(struct crocus_batch *batch,
8486                              const char *reason,
8487                              uint32_t flags,
8488                              struct crocus_bo *bo,
8489                              uint32_t offset,
8490                              uint64_t imm)
8491 {
8492    UNUSED const struct intel_device_info *devinfo = &batch->screen->devinfo;
8493    enum pipe_control_flags post_sync_flags = get_post_sync_flags(flags);
8494    UNUSED enum pipe_control_flags non_lri_post_sync_flags =
8495       post_sync_flags & ~PIPE_CONTROL_LRI_POST_SYNC_OP;
8496 
8497    /* Recursive PIPE_CONTROL workarounds --------------------------------
8498     * (http://knowyourmeme.com/memes/xzibit-yo-dawg)
8499     *
8500     * We do these first because we want to look at the original operation,
8501     * rather than any workarounds we set.
8502     */
8503 
8504    /* "Flush Types" workarounds ---------------------------------------------
8505     * We do these now because they may add post-sync operations or CS stalls.
8506     */
8507 
8508    if (GFX_VER == 6 && (flags & PIPE_CONTROL_RENDER_TARGET_FLUSH)) {
8509       /* Hardware workaround: SNB B-Spec says:
8510        *
8511        *    "[Dev-SNB{W/A}]: Before a PIPE_CONTROL with Write Cache Flush
8512        *     Enable = 1, a PIPE_CONTROL with any non-zero post-sync-op is
8513        *     required."
8514        */
8515       crocus_emit_post_sync_nonzero_flush(batch);
8516    }
8517 
8518 #if GFX_VER == 8
8519    if (flags & PIPE_CONTROL_VF_CACHE_INVALIDATE) {
8520       /* Project: BDW, SKL+ (stopping at CNL) / Argument: VF Invalidate
8521        *
8522        * "'Post Sync Operation' must be enabled to 'Write Immediate Data' or
8523        *  'Write PS Depth Count' or 'Write Timestamp'."
8524        */
8525       if (!bo) {
8526          flags |= PIPE_CONTROL_WRITE_IMMEDIATE;
8527          post_sync_flags |= PIPE_CONTROL_WRITE_IMMEDIATE;
8528          non_lri_post_sync_flags |= PIPE_CONTROL_WRITE_IMMEDIATE;
8529          bo = batch->ice->workaround_bo;
8530          offset = batch->ice->workaround_offset;
8531       }
8532    }
8533 #endif
8534 
8535 #if GFX_VERx10 < 75
8536    if (flags & PIPE_CONTROL_DEPTH_STALL) {
8537       /* Project: PRE-HSW / Argument: Depth Stall
8538        *
8539        * "The following bits must be clear:
8540        *  - Render Target Cache Flush Enable ([12] of DW1)
8541        *  - Depth Cache Flush Enable ([0] of DW1)"
8542        */
8543       assert(!(flags & (PIPE_CONTROL_RENDER_TARGET_FLUSH |
8544                         PIPE_CONTROL_DEPTH_CACHE_FLUSH)));
8545    }
8546 #endif
8547    if (GFX_VER >= 6 && (flags & PIPE_CONTROL_DEPTH_STALL)) {
8548       /* From the PIPE_CONTROL instruction table, bit 13 (Depth Stall Enable):
8549        *
8550        *    "This bit must be DISABLED for operations other than writing
8551        *     PS_DEPTH_COUNT."
8552        *
8553        * This seems like nonsense.  An Ivybridge workaround requires us to
8554        * emit a PIPE_CONTROL with a depth stall and write immediate post-sync
8555        * operation.  Gen8+ requires us to emit depth stalls and depth cache
8556        * flushes together.  So, it's hard to imagine this means anything other
8557        * than "we originally intended this to be used for PS_DEPTH_COUNT".
8558        *
8559        * We ignore the supposed restriction and do nothing.
8560        */
8561    }
8562 
8563    if (GFX_VERx10 < 75 && (flags & PIPE_CONTROL_DEPTH_CACHE_FLUSH)) {
8564       /* Project: PRE-HSW / Argument: Depth Cache Flush
8565        *
8566        * "Depth Stall must be clear ([13] of DW1)."
8567        */
8568       assert(!(flags & PIPE_CONTROL_DEPTH_STALL));
8569    }
8570 
8571    if (flags & (PIPE_CONTROL_RENDER_TARGET_FLUSH |
8572                 PIPE_CONTROL_STALL_AT_SCOREBOARD)) {
8573       /* From the PIPE_CONTROL instruction table, bit 12 and bit 1:
8574        *
8575        *    "This bit must be DISABLED for End-of-pipe (Read) fences,
8576        *     PS_DEPTH_COUNT or TIMESTAMP queries."
8577        *
8578        * TODO: Implement end-of-pipe checking.
8579        */
8580       assert(!(post_sync_flags & (PIPE_CONTROL_WRITE_DEPTH_COUNT |
8581                                   PIPE_CONTROL_WRITE_TIMESTAMP)));
8582    }
8583 
8584    if (flags & PIPE_CONTROL_STALL_AT_SCOREBOARD) {
8585       /* From the PIPE_CONTROL instruction table, bit 1:
8586        *
8587        *    "This bit is ignored if Depth Stall Enable is set.
8588        *     Further, the render cache is not flushed even if Write Cache
8589        *     Flush Enable bit is set."
8590        *
8591        * We assert that the caller doesn't do this combination, to try and
8592        * prevent mistakes.  It shouldn't hurt the GPU, though.
8593        *
8594        * We skip this check on Gen11+ as the "Stall at Pixel Scoreboard"
8595        * and "Render Target Flush" combo is explicitly required for BTI
8596        * update workarounds.
8597        */
8598       assert(!(flags & (PIPE_CONTROL_DEPTH_STALL |
8599                         PIPE_CONTROL_RENDER_TARGET_FLUSH)));
8600    }
8601 
8602    /* PIPE_CONTROL page workarounds ------------------------------------- */
8603 
8604    if (GFX_VER >= 7 && (flags & PIPE_CONTROL_STATE_CACHE_INVALIDATE)) {
8605       /* From the PIPE_CONTROL page itself:
8606        *
8607        *    "IVB, HSW, BDW
8608        *     Restriction: Pipe_control with CS-stall bit set must be issued
8609        *     before a pipe-control command that has the State Cache
8610        *     Invalidate bit set."
8611        */
8612       flags |= PIPE_CONTROL_CS_STALL;
8613    }
8614 
8615    if ((GFX_VERx10 == 75)) {
8616       /* From the PIPE_CONTROL page itself:
8617        *
8618        *    "HSW - Programming Note: PIPECONTROL with RO Cache Invalidation:
8619        *     Prior to programming a PIPECONTROL command with any of the RO
8620        *     cache invalidation bit set, program a PIPECONTROL flush command
8621        *     with “CS stall” bit and “HDC Flush” bit set."
8622        *
8623        * TODO: Actually implement this.  What's an HDC Flush?
8624        */
8625    }
8626 
8627    if (flags & PIPE_CONTROL_FLUSH_LLC) {
8628       /* From the PIPE_CONTROL instruction table, bit 26 (Flush LLC):
8629        *
8630        *    "Project: ALL
8631        *     SW must always program Post-Sync Operation to "Write Immediate
8632        *     Data" when Flush LLC is set."
8633        *
8634        * For now, we just require the caller to do it.
8635        */
8636       assert(flags & PIPE_CONTROL_WRITE_IMMEDIATE);
8637    }
8638 
8639    /* "Post-Sync Operation" workarounds -------------------------------- */
8640 
8641    /* Project: All / Argument: Global Snapshot Count Reset [19]
8642     *
8643     * "This bit must not be exercised on any product.
8644     *  Requires stall bit ([20] of DW1) set."
8645     *
8646     * We don't use this, so we just assert that it isn't used.  The
8647     * PIPE_CONTROL instruction page indicates that they intended this
8648     * as a debug feature and don't think it is useful in production,
8649     * but it may actually be usable, should we ever want to.
8650     */
8651    assert((flags & PIPE_CONTROL_GLOBAL_SNAPSHOT_COUNT_RESET) == 0);
8652 
8653    if (flags & (PIPE_CONTROL_MEDIA_STATE_CLEAR |
8654                 PIPE_CONTROL_INDIRECT_STATE_POINTERS_DISABLE)) {
8655       /* Project: All / Arguments:
8656        *
8657        * - Generic Media State Clear [16]
8658        * - Indirect State Pointers Disable [16]
8659        *
8660        *    "Requires stall bit ([20] of DW1) set."
8661        *
8662        * Also, the PIPE_CONTROL instruction table, bit 16 (Generic Media
8663        * State Clear) says:
8664        *
8665        *    "PIPECONTROL command with “Command Streamer Stall Enable” must be
8666        *     programmed prior to programming a PIPECONTROL command with "Media
8667        *     State Clear" set in GPGPU mode of operation"
8668        *
8669        * This is a subset of the earlier rule, so there's nothing to do.
8670        */
8671       flags |= PIPE_CONTROL_CS_STALL;
8672    }
8673 
8674    if (flags & PIPE_CONTROL_STORE_DATA_INDEX) {
8675       /* Project: All / Argument: Store Data Index
8676        *
8677        * "Post-Sync Operation ([15:14] of DW1) must be set to something other
8678        *  than '0'."
8679        *
8680        * For now, we just assert that the caller does this.  We might want to
8681        * automatically add a write to the workaround BO...
8682        */
8683       assert(non_lri_post_sync_flags != 0);
8684    }
8685 
8686    if (flags & PIPE_CONTROL_SYNC_GFDT) {
8687       /* Project: All / Argument: Sync GFDT
8688        *
8689        * "Post-Sync Operation ([15:14] of DW1) must be set to something other
8690        *  than '0' or 0x2520[13] must be set."
8691        *
8692        * For now, we just assert that the caller does this.
8693        */
8694       assert(non_lri_post_sync_flags != 0);
8695    }
8696 
8697    if (GFX_VER >= 6 && GFX_VER < 8 && (flags & PIPE_CONTROL_TLB_INVALIDATE)) {
8698       /* Project: SNB, IVB, HSW / Argument: TLB inv
8699        *
8700        * "{All SKUs}{All Steppings}: Post-Sync Operation ([15:14] of DW1)
8701        *  must be set to something other than '0'."
8702        *
8703        * For now, we just assert that the caller does this.
8704        */
8705       assert(non_lri_post_sync_flags != 0);
8706    }
8707 
8708    if (GFX_VER >= 7 && (flags & PIPE_CONTROL_TLB_INVALIDATE)) {
8709       /* Project: IVB+ / Argument: TLB inv
8710        *
8711        *    "Requires stall bit ([20] of DW1) set."
8712        *
8713        * Also, from the PIPE_CONTROL instruction table:
8714        *
8715        *    "Project: SKL+
8716        *     Post Sync Operation or CS stall must be set to ensure a TLB
8717        *     invalidation occurs.  Otherwise no cycle will occur to the TLB
8718        *     cache to invalidate."
8719        *
8720        * This is not a subset of the earlier rule, so there's nothing to do.
8721        */
8722       flags |= PIPE_CONTROL_CS_STALL;
8723    }
8724 #if GFX_VER == 8
8725    if (IS_COMPUTE_PIPELINE(batch)) {
8726       if (post_sync_flags ||
8727           (flags & (PIPE_CONTROL_NOTIFY_ENABLE |
8728                     PIPE_CONTROL_DEPTH_STALL |
8729                     PIPE_CONTROL_RENDER_TARGET_FLUSH |
8730                     PIPE_CONTROL_DEPTH_CACHE_FLUSH |
8731                     PIPE_CONTROL_DATA_CACHE_FLUSH))) {
8732          /* Project: BDW / Arguments:
8733           *
8734           * - LRI Post Sync Operation   [23]
8735           * - Post Sync Op              [15:14]
8736           * - Notify En                 [8]
8737           * - Depth Stall               [13]
8738           * - Render Target Cache Flush [12]
8739           * - Depth Cache Flush         [0]
8740           * - DC Flush Enable           [5]
8741           *
8742           *    "Requires stall bit ([20] of DW) set for all GPGPU and Media
8743           *     Workloads."
8744           *
8745           * (The docs have separate table rows for each bit, with essentially
8746           * the same workaround text.  We've combined them here.)
8747           */
8748          flags |= PIPE_CONTROL_CS_STALL;
8749 
8750          /* Also, from the PIPE_CONTROL instruction table, bit 20:
8751           *
8752           *    "Project: BDW
8753           *     This bit must be always set when PIPE_CONTROL command is
8754           *     programmed by GPGPU and MEDIA workloads, except for the cases
8755           *     when only Read Only Cache Invalidation bits are set (State
8756           *     Cache Invalidation Enable, Instruction cache Invalidation
8757           *     Enable, Texture Cache Invalidation Enable, Constant Cache
8758           *     Invalidation Enable). This is to WA FFDOP CG issue, this WA
8759           *     need not implemented when FF_DOP_CG is disable via "Fixed
8760           *     Function DOP Clock Gate Disable" bit in RC_PSMI_CTRL register."
8761           *
8762           * It sounds like we could avoid CS stalls in some cases, but we
8763           * don't currently bother.  This list isn't exactly the list above,
8764           * either...
8765           */
8766       }
8767    }
8768 #endif
8769    /* Implement the WaCsStallAtEveryFourthPipecontrol workaround on IVB, BYT:
8770     *
8771     * "Every 4th PIPE_CONTROL command, not counting the PIPE_CONTROL with
8772     *  only read-cache-invalidate bit(s) set, must have a CS_STALL bit set."
8773     *
8774     * Note that the kernel does CS stalls between batches, so we only need
8775     * to count them within a batch.  We currently naively count every 4, and
8776     * don't skip the ones with only read-cache-invalidate bits set.  This
8777     * may or may not be a problem...
8778     */
8779    if (GFX_VER == 7 && !(GFX_VERx10 == 75)) {
8780       if (flags & PIPE_CONTROL_CS_STALL) {
8781          /* If we're doing a CS stall, reset the counter and carry on. */
8782          batch->pipe_controls_since_last_cs_stall = 0;
8783       }
8784 
8785       /* If this is the fourth pipe control without a CS stall, do one now. */
8786       if (++batch->pipe_controls_since_last_cs_stall == 4) {
8787          batch->pipe_controls_since_last_cs_stall = 0;
8788          flags |= PIPE_CONTROL_CS_STALL;
8789       }
8790    }
8791 
8792    /* "Stall" workarounds ----------------------------------------------
8793     * These have to come after the earlier ones because we may have added
8794     * some additional CS stalls above.
8795     */
8796 
8797    if (flags & PIPE_CONTROL_CS_STALL) {
8798       /* Project: PRE-SKL, VLV, CHV
8799        *
8800        * "[All Stepping][All SKUs]:
8801        *
8802        *  One of the following must also be set:
8803        *
8804        *  - Render Target Cache Flush Enable ([12] of DW1)
8805        *  - Depth Cache Flush Enable ([0] of DW1)
8806        *  - Stall at Pixel Scoreboard ([1] of DW1)
8807        *  - Depth Stall ([13] of DW1)
8808        *  - Post-Sync Operation ([13] of DW1)
8809        *  - DC Flush Enable ([5] of DW1)"
8810        *
8811        * If we don't already have one of those bits set, we choose to add
8812        * "Stall at Pixel Scoreboard".  Some of the other bits require a
8813        * CS stall as a workaround (see above), which would send us into
8814        * an infinite recursion of PIPE_CONTROLs.  "Stall at Pixel Scoreboard"
8815        * appears to be safe, so we choose that.
8816        */
8817       const uint32_t wa_bits = PIPE_CONTROL_RENDER_TARGET_FLUSH |
8818                                PIPE_CONTROL_DEPTH_CACHE_FLUSH |
8819                                PIPE_CONTROL_WRITE_IMMEDIATE |
8820                                PIPE_CONTROL_WRITE_DEPTH_COUNT |
8821                                PIPE_CONTROL_WRITE_TIMESTAMP |
8822                                PIPE_CONTROL_STALL_AT_SCOREBOARD |
8823                                PIPE_CONTROL_DEPTH_STALL |
8824                                PIPE_CONTROL_DATA_CACHE_FLUSH;
8825       if (!(flags & wa_bits))
8826          flags |= PIPE_CONTROL_STALL_AT_SCOREBOARD;
8827    }
8828 
8829    /* Emit --------------------------------------------------------------- */
8830 
8831    if (INTEL_DEBUG(DEBUG_PIPE_CONTROL)) {
8832       fprintf(stderr,
8833               "  PC [%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%"PRIx64"]: %s\n",
8834               (flags & PIPE_CONTROL_FLUSH_ENABLE) ? "PipeCon " : "",
8835               (flags & PIPE_CONTROL_CS_STALL) ? "CS " : "",
8836               (flags & PIPE_CONTROL_STALL_AT_SCOREBOARD) ? "Scoreboard " : "",
8837               (flags & PIPE_CONTROL_VF_CACHE_INVALIDATE) ? "VF " : "",
8838               (flags & PIPE_CONTROL_RENDER_TARGET_FLUSH) ? "RT " : "",
8839               (flags & PIPE_CONTROL_CONST_CACHE_INVALIDATE) ? "Const " : "",
8840               (flags & PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE) ? "TC " : "",
8841               (flags & PIPE_CONTROL_DATA_CACHE_FLUSH) ? "DC " : "",
8842               (flags & PIPE_CONTROL_DEPTH_CACHE_FLUSH) ? "ZFlush " : "",
8843               (flags & PIPE_CONTROL_DEPTH_STALL) ? "ZStall " : "",
8844               (flags & PIPE_CONTROL_STATE_CACHE_INVALIDATE) ? "State " : "",
8845               (flags & PIPE_CONTROL_TLB_INVALIDATE) ? "TLB " : "",
8846               (flags & PIPE_CONTROL_INSTRUCTION_INVALIDATE) ? "Inst " : "",
8847               (flags & PIPE_CONTROL_MEDIA_STATE_CLEAR) ? "MediaClear " : "",
8848               (flags & PIPE_CONTROL_NOTIFY_ENABLE) ? "Notify " : "",
8849               (flags & PIPE_CONTROL_GLOBAL_SNAPSHOT_COUNT_RESET) ?
8850               "SnapRes" : "",
8851               (flags & PIPE_CONTROL_INDIRECT_STATE_POINTERS_DISABLE) ?
8852               "ISPDis" : "",
8853               (flags & PIPE_CONTROL_WRITE_IMMEDIATE) ? "WriteImm " : "",
8854               (flags & PIPE_CONTROL_WRITE_DEPTH_COUNT) ? "WriteZCount " : "",
8855               (flags & PIPE_CONTROL_WRITE_TIMESTAMP) ? "WriteTimestamp " : "",
8856               imm, reason);
8857    }
8858 
8859    crocus_emit_cmd(batch, GENX(PIPE_CONTROL), pc) {
8860 #if GFX_VER >= 7
8861       pc.LRIPostSyncOperation = NoLRIOperation;
8862       pc.PipeControlFlushEnable = flags & PIPE_CONTROL_FLUSH_ENABLE;
8863       pc.DCFlushEnable = flags & PIPE_CONTROL_DATA_CACHE_FLUSH;
8864 #endif
8865 #if GFX_VER >= 6
8866       pc.StoreDataIndex = 0;
8867       pc.CommandStreamerStallEnable = flags & PIPE_CONTROL_CS_STALL;
8868       pc.GlobalSnapshotCountReset =
8869          flags & PIPE_CONTROL_GLOBAL_SNAPSHOT_COUNT_RESET;
8870       pc.TLBInvalidate = flags & PIPE_CONTROL_TLB_INVALIDATE;
8871       pc.GenericMediaStateClear = flags & PIPE_CONTROL_MEDIA_STATE_CLEAR;
8872       pc.StallAtPixelScoreboard = flags & PIPE_CONTROL_STALL_AT_SCOREBOARD;
8873       pc.RenderTargetCacheFlushEnable =
8874          flags & PIPE_CONTROL_RENDER_TARGET_FLUSH;
8875       pc.DepthCacheFlushEnable = flags & PIPE_CONTROL_DEPTH_CACHE_FLUSH;
8876       pc.StateCacheInvalidationEnable =
8877          flags & PIPE_CONTROL_STATE_CACHE_INVALIDATE;
8878       pc.VFCacheInvalidationEnable = flags & PIPE_CONTROL_VF_CACHE_INVALIDATE;
8879       pc.ConstantCacheInvalidationEnable =
8880          flags & PIPE_CONTROL_CONST_CACHE_INVALIDATE;
8881 #else
8882       pc.WriteCacheFlush = flags & PIPE_CONTROL_RENDER_TARGET_FLUSH;
8883 #endif
8884       pc.PostSyncOperation = flags_to_post_sync_op(flags);
8885       pc.DepthStallEnable = flags & PIPE_CONTROL_DEPTH_STALL;
8886       pc.InstructionCacheInvalidateEnable =
8887          flags & PIPE_CONTROL_INSTRUCTION_INVALIDATE;
8888       pc.NotifyEnable = flags & PIPE_CONTROL_NOTIFY_ENABLE;
8889 #if GFX_VER >= 5 || GFX_VERx10 == 45
8890       pc.IndirectStatePointersDisable =
8891          flags & PIPE_CONTROL_INDIRECT_STATE_POINTERS_DISABLE;
8892 #endif
8893 #if GFX_VER >= 6
8894       pc.TextureCacheInvalidationEnable =
8895          flags & PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
8896 #elif GFX_VER == 5 || GFX_VERx10 == 45
8897       pc.TextureCacheFlushEnable =
8898          flags & PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
8899 #endif
8900       pc.Address = ggtt_bo(bo, offset);
8901       if (GFX_VER < 7 && bo)
8902          pc.DestinationAddressType = DAT_GGTT;
8903       pc.ImmediateData = imm;
8904    }
8905 }
8906 
8907 #if GFX_VER == 6
8908 void
8909 genX(crocus_upload_urb)(struct crocus_batch *batch,
8910                         unsigned vs_size,
8911                         bool gs_present,
8912                         unsigned gs_size)
8913 {
8914    struct crocus_context *ice = batch->ice;
8915    int nr_vs_entries, nr_gs_entries;
8916    int total_urb_size = ice->urb.size * 1024; /* in bytes */
8917    const struct intel_device_info *devinfo = &batch->screen->devinfo;
8918 
8919    /* Calculate how many entries fit in each stage's section of the URB */
8920    if (gs_present) {
8921       nr_vs_entries = (total_urb_size/2) / (vs_size * 128);
8922       nr_gs_entries = (total_urb_size/2) / (gs_size * 128);
8923    } else {
8924       nr_vs_entries = total_urb_size / (vs_size * 128);
8925       nr_gs_entries = 0;
8926    }
8927 
8928    /* Then clamp to the maximum allowed by the hardware */
8929    if (nr_vs_entries > devinfo->urb.max_entries[MESA_SHADER_VERTEX])
8930       nr_vs_entries = devinfo->urb.max_entries[MESA_SHADER_VERTEX];
8931 
8932    if (nr_gs_entries > devinfo->urb.max_entries[MESA_SHADER_GEOMETRY])
8933       nr_gs_entries = devinfo->urb.max_entries[MESA_SHADER_GEOMETRY];
8934 
8935    /* Finally, both must be a multiple of 4 (see 3DSTATE_URB in the PRM). */
8936    ice->urb.nr_vs_entries = ROUND_DOWN_TO(nr_vs_entries, 4);
8937    ice->urb.nr_gs_entries = ROUND_DOWN_TO(nr_gs_entries, 4);
8938 
8939    assert(ice->urb.nr_vs_entries >=
8940           devinfo->urb.min_entries[MESA_SHADER_VERTEX]);
8941    assert(ice->urb.nr_vs_entries % 4 == 0);
8942    assert(ice->urb.nr_gs_entries % 4 == 0);
8943    assert(vs_size <= 5);
8944    assert(gs_size <= 5);
8945 
8946    crocus_emit_cmd(batch, GENX(3DSTATE_URB), urb) {
8947       urb.VSNumberofURBEntries = ice->urb.nr_vs_entries;
8948       urb.VSURBEntryAllocationSize = vs_size - 1;
8949 
8950       urb.GSNumberofURBEntries = ice->urb.nr_gs_entries;
8951       urb.GSURBEntryAllocationSize = gs_size - 1;
8952    };
8953    /* From the PRM Volume 2 part 1, section 1.4.7:
8954     *
8955     *   Because of a urb corruption caused by allocating a previous gsunit’s
8956     *   urb entry to vsunit software is required to send a "GS NULL
8957     *   Fence"(Send URB fence with VS URB size == 1 and GS URB size == 0) plus
8958     *   a dummy DRAW call before any case where VS will be taking over GS URB
8959     *   space.
8960     *
8961     * It is not clear exactly what this means ("URB fence" is a command that
8962     * doesn't exist on Gen6).  So for now we just do a full pipeline flush as
8963     * a workaround.
8964     */
8965    if (ice->urb.gs_present && !gs_present)
8966       crocus_emit_mi_flush(batch);
8967    ice->urb.gs_present = gs_present;
8968 }
8969 #endif
8970 
/**
 * Hook called when a batch's GPU state is lost (e.g. after a hang or
 * context reset) so per-generation state tracking can be invalidated.
 * Currently a no-op: crocus keeps no genxml-level shadow state that
 * needs resetting here.
 */
static void
crocus_lost_genx_state(struct crocus_context *ice, struct crocus_batch *batch)
{
}
8975 
/**
 * Emit MI_REPORT_PERF_COUNT, asking the GPU to snapshot the performance
 * counters into \p bo at \p offset_in_bytes, tagged with \p report_id.
 *
 * The command is only emitted on Gen7+ (the #if guard below); on older
 * generations this is a no-op.
 */
static void
crocus_emit_mi_report_perf_count(struct crocus_batch *batch,
                                 struct crocus_bo *bo,
                                 uint32_t offset_in_bytes,
                                 uint32_t report_id)
{
#if GFX_VER >= 7
   crocus_emit_cmd(batch, GENX(MI_REPORT_PERF_COUNT), mi_rpc) {
      mi_rpc.ReportID = report_id;
      mi_rpc.MemoryAddress = rw_bo(bo, offset_in_bytes);
   }
#endif
}
8989 
8990 /**
8991  * From the PRM, Volume 2a:
8992  *
8993  *    "Indirect State Pointers Disable
8994  *
8995  *    At the completion of the post-sync operation associated with this pipe
8996  *    control packet, the indirect state pointers in the hardware are
8997  *    considered invalid; the indirect pointers are not saved in the context.
8998  *    If any new indirect state commands are executed in the command stream
8999  *    while the pipe control is pending, the new indirect state commands are
9000  *    preserved.
9001  *
9002  *    [DevIVB+]: Using Invalidate State Pointer (ISP) only inhibits context
9003  *    restoring of Push Constant (3DSTATE_CONSTANT_*) commands. Push Constant
9004  *    commands are only considered as Indirect State Pointers. Once ISP is
9005  *    issued in a context, SW must initialize by programming push constant
9006  *    commands for all the shaders (at least to zero length) before attempting
9007  *    any rendering operation for the same context."
9008  *
9009  * 3DSTATE_CONSTANT_* packets are restored during a context restore,
9010  * even though they point to a BO that has been already unreferenced at
9011  * the end of the previous batch buffer. This has been fine so far since
 * we are protected by the scratch page (every address not covered by
9013  * a BO should be pointing to the scratch page). But on CNL, it is
9014  * causing a GPU hang during context restore at the 3DSTATE_CONSTANT_*
9015  * instruction.
9016  *
9017  * The flag "Indirect State Pointers Disable" in PIPE_CONTROL tells the
9018  * hardware to ignore previous 3DSTATE_CONSTANT_* packets during a
9019  * context restore, so the mentioned hang doesn't happen. However,
9020  * software must program push constant commands for all stages prior to
9021  * rendering anything, so we flag them as dirty.
9022  *
9023  * Finally, we also make sure to stall at pixel scoreboard to make sure the
 * constants have been loaded into the EUs prior to disabling the push constants
9025  * so that it doesn't hang a previous 3DPRIMITIVE.
9026  */
9027 #if GFX_VER >= 7
9028 static void
9029 gen7_emit_isp_disable(struct crocus_batch *batch)
9030 {
9031    crocus_emit_raw_pipe_control(batch, "isp disable",
9032                                 PIPE_CONTROL_STALL_AT_SCOREBOARD |
9033                                 PIPE_CONTROL_CS_STALL,
9034                                 NULL, 0, 0);
9035    crocus_emit_raw_pipe_control(batch, "isp disable",
9036                                 PIPE_CONTROL_INDIRECT_STATE_POINTERS_DISABLE |
9037                                 PIPE_CONTROL_CS_STALL,
9038                                 NULL, 0, 0);
9039 
9040    struct crocus_context *ice = batch->ice;
9041    ice->state.stage_dirty |= (CROCUS_STAGE_DIRTY_CONSTANTS_VS |
9042                               CROCUS_STAGE_DIRTY_CONSTANTS_TCS |
9043                               CROCUS_STAGE_DIRTY_CONSTANTS_TES |
9044                               CROCUS_STAGE_DIRTY_CONSTANTS_GS |
9045                               CROCUS_STAGE_DIRTY_CONSTANTS_FS);
9046 }
9047 #endif
9048 
9049 #if GFX_VER >= 7
/**
 * Emit any state required at the very end of a batch, before it is
 * submitted.  Installed as screen->vtbl.finish_batch on Gen7+.
 */
static void
crocus_state_finish_batch(struct crocus_batch *batch)
{
#if GFX_VERx10 == 75
   /* Haswell-only: flush, re-emit the CC state pointer, then do an
    * RT-flush + CS-stall at the end of every render batch.
    * NOTE(review): presumably works around a context-save issue with
    * color-calc state on HSW — confirm against the originating commit.
    */
   if (batch->name == CROCUS_BATCH_RENDER) {
      crocus_emit_mi_flush(batch);
      crocus_emit_cmd(batch, GENX(3DSTATE_CC_STATE_POINTERS), ptr) {
         ptr.ColorCalcStatePointer = batch->ice->shaders.cc_offset;
      }

      crocus_emit_pipe_control_flush(batch, "hsw wa", PIPE_CONTROL_RENDER_TARGET_FLUSH |
                                     PIPE_CONTROL_CS_STALL);
   }
#endif
   /* Drop indirect state pointers so stale push-constant packets aren't
    * replayed on context restore (see gen7_emit_isp_disable).
    */
   gen7_emit_isp_disable(batch);
}
9066 #endif
9067 
9068 static void
9069 crocus_batch_reset_dirty(struct crocus_batch *batch)
9070 {
9071    /* unreference any index buffer so it get reemitted. */
9072    pipe_resource_reference(&batch->ice->state.index_buffer.res, NULL);
9073 
9074    /* for GEN4/5 need to reemit anything that ends up in the state batch that points to anything in the state batch
9075     * as the old state batch won't still be available.
9076     */
9077    batch->ice->state.dirty |= CROCUS_DIRTY_DEPTH_BUFFER |
9078       CROCUS_DIRTY_COLOR_CALC_STATE;
9079 
9080    batch->ice->state.dirty |= CROCUS_DIRTY_VERTEX_ELEMENTS | CROCUS_DIRTY_VERTEX_BUFFERS;
9081 
9082    batch->ice->state.stage_dirty |= CROCUS_ALL_STAGE_DIRTY_BINDINGS;
9083    batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_SAMPLER_STATES_VS;
9084    batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_SAMPLER_STATES_TES;
9085    batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_SAMPLER_STATES_TCS;
9086    batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_SAMPLER_STATES_GS;
9087    batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_SAMPLER_STATES_PS;
9088    batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_SAMPLER_STATES_CS;
9089 
9090    batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_VS;
9091    batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_TES;
9092    batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_TCS;
9093    batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_GS;
9094    batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_FS;
9095    batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_CS;
9096 
9097    batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_VS;
9098    batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_GS;
9099    batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CS;
9100    batch->ice->state.dirty |= CROCUS_DIRTY_CC_VIEWPORT | CROCUS_DIRTY_SF_CL_VIEWPORT;
9101 
9102 #if GFX_VER >= 6
9103    /* SCISSOR_STATE */
9104    batch->ice->state.dirty |= CROCUS_DIRTY_GEN6_BLEND_STATE;
9105    batch->ice->state.dirty |= CROCUS_DIRTY_GEN6_SCISSOR_RECT;
9106    batch->ice->state.dirty |= CROCUS_DIRTY_GEN6_WM_DEPTH_STENCIL;
9107 
9108 #endif
9109 #if GFX_VER <= 5
9110    /* dirty the SF state on gen4/5 */
9111    batch->ice->state.dirty |= CROCUS_DIRTY_RASTER;
9112    batch->ice->state.dirty |= CROCUS_DIRTY_GEN4_CURBE;
9113    batch->ice->state.dirty |= CROCUS_DIRTY_CLIP;
9114    batch->ice->state.dirty |= CROCUS_DIRTY_WM;
9115 #endif
9116 #if GFX_VER >= 7
9117    /* Streamout dirty */
9118    batch->ice->state.dirty |= CROCUS_DIRTY_STREAMOUT;
9119    batch->ice->state.dirty |= CROCUS_DIRTY_SO_DECL_LIST;
9120    batch->ice->state.dirty |= CROCUS_DIRTY_GEN7_SO_BUFFERS;
9121 #endif
9122 }
9123 
9124 #if GFX_VERx10 == 75
/**
 * Return the gallium rasterizer state embedded in the currently-bound
 * rasterizer CSO.  Only compiled on Haswell (GFX_VERx10 == 75); callers
 * must ensure a rasterizer CSO is bound (cso_rast must be non-NULL).
 */
struct pipe_rasterizer_state *crocus_get_rast_state(struct crocus_context *ice)
{
   return &ice->state.cso_rast->cso;
}
9129 #endif
9130 
9131 #if GFX_VER >= 6
9132 static void update_so_strides(struct crocus_context *ice,
9133                               uint16_t *strides)
9134 {
9135    for (int i = 0; i < PIPE_MAX_SO_BUFFERS; i++) {
9136       struct crocus_stream_output_target *so = (void *)ice->state.so_target[i];
9137       if (so)
9138          so->stride = strides[i] * sizeof(uint32_t);
9139    }
9140 }
9141 #endif
9142 
9143 static void crocus_fill_clamp_mask(const struct crocus_sampler_state *samp,
9144                                    int s,
9145                                    uint32_t *clamp_mask)
9146 {
9147 #if GFX_VER < 8
9148    if (samp->pstate.min_img_filter != PIPE_TEX_FILTER_NEAREST &&
9149        samp->pstate.mag_img_filter != PIPE_TEX_FILTER_NEAREST) {
9150       if (samp->pstate.wrap_s == PIPE_TEX_WRAP_CLAMP)
9151          clamp_mask[0] |= (1 << s);
9152       if (samp->pstate.wrap_t == PIPE_TEX_WRAP_CLAMP)
9153          clamp_mask[1] |= (1 << s);
9154       if (samp->pstate.wrap_r == PIPE_TEX_WRAP_CLAMP)
9155          clamp_mask[2] |= (1 << s);
9156    }
9157 #endif
9158 }
9159 
9160 static void
9161 crocus_set_frontend_noop(struct pipe_context *ctx, bool enable)
9162 {
9163    struct crocus_context *ice = (struct crocus_context *) ctx;
9164 
9165    if (crocus_batch_prepare_noop(&ice->batches[CROCUS_BATCH_RENDER], enable)) {
9166       ice->state.dirty |= CROCUS_ALL_DIRTY_FOR_RENDER;
9167       ice->state.stage_dirty |= CROCUS_ALL_STAGE_DIRTY_FOR_RENDER;
9168    }
9169 
9170    if (ice->batch_count == 1)
9171       return;
9172 
9173    if (crocus_batch_prepare_noop(&ice->batches[CROCUS_BATCH_COMPUTE], enable)) {
9174       ice->state.dirty |= CROCUS_ALL_DIRTY_FOR_COMPUTE;
9175       ice->state.stage_dirty |= CROCUS_ALL_STAGE_DIRTY_FOR_COMPUTE;
9176    }
9177 }
9178 
/**
 * Fill in the per-generation screen vtable with this generation's state
 * upload/emit entry points.  This file is compiled once per generation;
 * the preprocessor guards select the hooks each generation supports.
 */
void
genX(crocus_init_screen_state)(struct crocus_screen *screen)
{
   /* Sanity-check that this genxml-compiled code matches the device. */
   assert(screen->devinfo.verx10 == GFX_VERx10);
   assert(screen->devinfo.ver == GFX_VER);
   screen->vtbl.destroy_state = crocus_destroy_state;
   screen->vtbl.init_render_context = crocus_init_render_context;
   screen->vtbl.upload_render_state = crocus_upload_render_state;
#if GFX_VER >= 7
   /* Compute (GPGPU) contexts exist on Gen7+ only. */
   screen->vtbl.init_compute_context = crocus_init_compute_context;
   screen->vtbl.upload_compute_state = crocus_upload_compute_state;
#endif
   screen->vtbl.emit_raw_pipe_control = crocus_emit_raw_pipe_control;
   screen->vtbl.emit_mi_report_perf_count = crocus_emit_mi_report_perf_count;
   screen->vtbl.rebind_buffer = crocus_rebind_buffer;
#if GFX_VERx10 >= 75
   /* Register load/store helpers are Haswell+. */
   screen->vtbl.load_register_reg32 = crocus_load_register_reg32;
   screen->vtbl.load_register_reg64 = crocus_load_register_reg64;
   screen->vtbl.load_register_imm32 = crocus_load_register_imm32;
   screen->vtbl.load_register_imm64 = crocus_load_register_imm64;
   screen->vtbl.store_data_imm32 = crocus_store_data_imm32;
   screen->vtbl.store_data_imm64 = crocus_store_data_imm64;
#endif
#if GFX_VER >= 7
   /* Memory-sourced register loads and streamout declarations (Gen7+). */
   screen->vtbl.load_register_mem32 = crocus_load_register_mem32;
   screen->vtbl.load_register_mem64 = crocus_load_register_mem64;
   screen->vtbl.copy_mem_mem = crocus_copy_mem_mem;
   screen->vtbl.create_so_decl_list = crocus_create_so_decl_list;
#endif
   screen->vtbl.update_surface_base_address = crocus_update_surface_base_address;
#if GFX_VER >= 6
   screen->vtbl.store_register_mem32 = crocus_store_register_mem32;
   screen->vtbl.store_register_mem64 = crocus_store_register_mem64;
#endif
   /* Shader-key population hooks, one per stage. */
   screen->vtbl.populate_vs_key = crocus_populate_vs_key;
   screen->vtbl.populate_tcs_key = crocus_populate_tcs_key;
   screen->vtbl.populate_tes_key = crocus_populate_tes_key;
   screen->vtbl.populate_gs_key = crocus_populate_gs_key;
   screen->vtbl.populate_fs_key = crocus_populate_fs_key;
   screen->vtbl.populate_cs_key = crocus_populate_cs_key;
   screen->vtbl.lost_genx_state = crocus_lost_genx_state;
#if GFX_VER >= 7
   screen->vtbl.finish_batch = crocus_state_finish_batch;
#endif
#if GFX_VER <= 5
   /* Gen4/5 program the URB via fences rather than 3DSTATE_URB. */
   screen->vtbl.upload_urb_fence = crocus_upload_urb_fence;
   screen->vtbl.calculate_urb_fence = crocus_calculate_urb_fence;
#endif
   screen->vtbl.fill_clamp_mask = crocus_fill_clamp_mask;
   screen->vtbl.batch_reset_dirty = crocus_batch_reset_dirty;
   screen->vtbl.translate_prim_type = translate_prim_type;
#if GFX_VER >= 6
   /* Stream-output support starts on Gen6. */
   screen->vtbl.update_so_strides = update_so_strides;
   screen->vtbl.get_so_offset = crocus_get_so_offset;
#endif

   genX(crocus_init_blt)(screen);
}
9237 
/**
 * Install the gallium pipe_context state hooks and initialize the
 * context's default state (dirty bits, sample mask, scissors, etc.).
 */
void
genX(crocus_init_state)(struct crocus_context *ice)
{
   struct pipe_context *ctx = &ice->ctx;

   /* CSO create/bind/delete hooks. */
   ctx->create_blend_state = crocus_create_blend_state;
   ctx->create_depth_stencil_alpha_state = crocus_create_zsa_state;
   ctx->create_rasterizer_state = crocus_create_rasterizer_state;
   ctx->create_sampler_state = crocus_create_sampler_state;
   ctx->create_sampler_view = crocus_create_sampler_view;
   ctx->create_surface = crocus_create_surface;
   ctx->create_vertex_elements_state = crocus_create_vertex_elements;
   ctx->bind_blend_state = crocus_bind_blend_state;
   ctx->bind_depth_stencil_alpha_state = crocus_bind_zsa_state;
   ctx->bind_sampler_states = crocus_bind_sampler_states;
   ctx->bind_rasterizer_state = crocus_bind_rasterizer_state;
   ctx->bind_vertex_elements_state = crocus_bind_vertex_elements_state;
   ctx->delete_blend_state = crocus_delete_state;
   ctx->delete_depth_stencil_alpha_state = crocus_delete_state;
   ctx->delete_rasterizer_state = crocus_delete_state;
   ctx->delete_sampler_state = crocus_delete_state;
   ctx->delete_vertex_elements_state = crocus_delete_state;
   /* Streamed (set_*) state hooks. */
   ctx->set_blend_color = crocus_set_blend_color;
   ctx->set_clip_state = crocus_set_clip_state;
   ctx->set_constant_buffer = crocus_set_constant_buffer;
   ctx->set_shader_buffers = crocus_set_shader_buffers;
   ctx->set_shader_images = crocus_set_shader_images;
   ctx->set_sampler_views = crocus_set_sampler_views;
   ctx->set_tess_state = crocus_set_tess_state;
   ctx->set_patch_vertices = crocus_set_patch_vertices;
   ctx->set_framebuffer_state = crocus_set_framebuffer_state;
   ctx->set_polygon_stipple = crocus_set_polygon_stipple;
   ctx->set_sample_mask = crocus_set_sample_mask;
   ctx->set_scissor_states = crocus_set_scissor_states;
   ctx->set_stencil_ref = crocus_set_stencil_ref;
   ctx->set_vertex_buffers = crocus_set_vertex_buffers;
   ctx->set_viewport_states = crocus_set_viewport_states;
   ctx->sampler_view_destroy = crocus_sampler_view_destroy;
   ctx->surface_destroy = crocus_surface_destroy;
   ctx->draw_vbo = crocus_draw_vbo;
   ctx->launch_grid = crocus_launch_grid;

   ctx->set_frontend_noop = crocus_set_frontend_noop;

#if GFX_VER >= 6
   /* Stream-output hooks exist on Gen6+ only. */
   ctx->create_stream_output_target = crocus_create_stream_output_target;
   ctx->stream_output_target_destroy = crocus_stream_output_target_destroy;
   ctx->set_stream_output_targets = crocus_set_stream_output_targets;
#endif

   /* Everything is dirty at context creation. */
   ice->state.dirty = ~0ull;
   ice->state.stage_dirty = ~0ull;

   ice->state.statistics_counters_enabled = true;

   ice->state.sample_mask = 0xff;
   ice->state.num_viewports = 1;
   /* MESA_PRIM_COUNT marks "no primitive seen yet". */
   ice->state.prim_mode = MESA_PRIM_COUNT;
   ice->state.reduced_prim_mode = MESA_PRIM_COUNT;
   ice->state.genx = calloc(1, sizeof(struct crocus_genx_state));
   ice->draw.derived_params.drawid = -1;

   /* Default all scissor rectangles to be empty regions. */
   for (int i = 0; i < CROCUS_MAX_VIEWPORTS; i++) {
      ice->state.scissors[i] = (struct pipe_scissor_state) {
         .minx = 1, .maxx = 0, .miny = 1, .maxy = 0,
      };
   }
}
9307