1 /*
2 * Copyright © 2017 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included
12 * in all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
15 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20 * DEALINGS IN THE SOFTWARE.
21 */
22
23 /**
24 * @file crocus_state.c
25 *
26 * ============================= GENXML CODE =============================
27 * [This file is compiled once per generation.]
28 * =======================================================================
29 *
30 * This is the main state upload code.
31 *
32 * Gallium uses Constant State Objects, or CSOs, for most state. Large,
33 * complex, or highly reusable state can be created once, and bound and
34 * rebound multiple times. This is modeled with the pipe->create_*_state()
35 * and pipe->bind_*_state() hooks. Highly dynamic or inexpensive state is
36 * streamed out on the fly, via pipe->set_*_state() hooks.
37 *
38 * OpenGL involves frequently mutating context state, which is mirrored in
39 * core Mesa by highly mutable data structures. However, most applications
40 * typically draw the same things over and over - from frame to frame, most
41 * of the same objects are still visible and need to be redrawn. So, rather
42 * than inventing new state all the time, applications usually mutate to swap
43 * between known states that we've seen before.
44 *
45 * Gallium isolates us from this mutation by tracking API state, and
46 * distilling it into a set of Constant State Objects, or CSOs. Large,
47 * complex, or typically reusable state can be created once, then reused
48 * multiple times. Drivers can create and store their own associated data.
49 * This create/bind model corresponds to the pipe->create_*_state() and
50 * pipe->bind_*_state() driver hooks.
51 *
52 * Some state is cheap to create, or expected to be highly dynamic. Rather
53 * than creating and caching piles of CSOs for these, Gallium simply streams
54 * them out, via the pipe->set_*_state() driver hooks.
55 *
56 * To reduce draw time overhead, we try to compute as much state at create
57 * time as possible. Wherever possible, we translate the Gallium pipe state
58 * to 3DSTATE commands, and store those commands in the CSO. At draw time,
59 * we can simply memcpy them into a batch buffer.
60 *
61 * No hardware matches the abstraction perfectly, so some commands require
62 * information from multiple CSOs. In this case, we can store two copies
63 * of the packet (one in each CSO), and simply | together their DWords at
64 * draw time. Sometimes the second set is trivial (one or two fields), so
65 * we simply pack it at draw time.
66 *
67 * There are two main components in the file below. First, the CSO hooks
68 * create/bind/track state. The second are the draw-time upload functions,
69 * crocus_upload_render_state() and crocus_upload_compute_state(), which read
70 * the context state and emit the commands into the actual batch.
71 */
72
73 #include <errno.h>
74 #include <stdio.h>
75
76 #if HAVE_VALGRIND
77 #include <memcheck.h>
78 #include <valgrind.h>
79 #define VG(x) x
80 #else
81 #define VG(x)
82 #endif
83
84 #include "drm-uapi/i915_drm.h"
85 #include "intel/common/intel_compute_slm.h"
86 #include "intel/common/intel_l3_config.h"
87 #include "intel/common/intel_sample_positions.h"
88 #include "intel/compiler/elk/elk_compiler.h"
89 #include "compiler/shader_info.h"
90 #include "pipe/p_context.h"
91 #include "pipe/p_defines.h"
92 #include "pipe/p_screen.h"
93 #include "pipe/p_state.h"
94 #include "util/format/u_format.h"
95 #include "util/half_float.h"
96 #include "util/u_dual_blend.h"
97 #include "util/u_framebuffer.h"
98 #include "util/u_helpers.h"
99 #include "util/u_inlines.h"
100 #include "util/u_memory.h"
101 #include "util/u_prim.h"
102 #include "util/u_transfer.h"
103 #include "util/u_upload_mgr.h"
104 #include "util/u_viewport.h"
105 #include "crocus_batch.h"
106 #include "crocus_context.h"
107 #include "crocus_defines.h"
108 #include "crocus_pipe.h"
109 #include "crocus_resource.h"
110
111 #include "crocus_genx_macros.h"
112 #include "intel/common/intel_genX_state_elk.h"
113 #include "intel/common/intel_guardband.h"
114 #include "main/macros.h" /* UNCLAMPED_* */
115
/**
 * Statically assert that PIPE_* enums match the hardware packets.
 * (As long as they match, we don't need to translate them.)
 *
 * This function is never called; it exists purely so the STATIC_ASSERTs
 * are evaluated at compile time.  Each PIPE_ASSERT wraps an equality
 * comparison, so a mismatch between a Gallium enum and the corresponding
 * genxml value breaks the build.
 */
UNUSED static void pipe_asserts()
{
#define PIPE_ASSERT(x) STATIC_ASSERT((int)x)

   /* pipe_logicop happens to match the hardware. */
   PIPE_ASSERT(PIPE_LOGICOP_CLEAR == LOGICOP_CLEAR);
   PIPE_ASSERT(PIPE_LOGICOP_NOR == LOGICOP_NOR);
   PIPE_ASSERT(PIPE_LOGICOP_AND_INVERTED == LOGICOP_AND_INVERTED);
   PIPE_ASSERT(PIPE_LOGICOP_COPY_INVERTED == LOGICOP_COPY_INVERTED);
   PIPE_ASSERT(PIPE_LOGICOP_AND_REVERSE == LOGICOP_AND_REVERSE);
   PIPE_ASSERT(PIPE_LOGICOP_INVERT == LOGICOP_INVERT);
   PIPE_ASSERT(PIPE_LOGICOP_XOR == LOGICOP_XOR);
   PIPE_ASSERT(PIPE_LOGICOP_NAND == LOGICOP_NAND);
   PIPE_ASSERT(PIPE_LOGICOP_AND == LOGICOP_AND);
   PIPE_ASSERT(PIPE_LOGICOP_EQUIV == LOGICOP_EQUIV);
   PIPE_ASSERT(PIPE_LOGICOP_NOOP == LOGICOP_NOOP);
   PIPE_ASSERT(PIPE_LOGICOP_OR_INVERTED == LOGICOP_OR_INVERTED);
   PIPE_ASSERT(PIPE_LOGICOP_COPY == LOGICOP_COPY);
   PIPE_ASSERT(PIPE_LOGICOP_OR_REVERSE == LOGICOP_OR_REVERSE);
   PIPE_ASSERT(PIPE_LOGICOP_OR == LOGICOP_OR);
   PIPE_ASSERT(PIPE_LOGICOP_SET == LOGICOP_SET);

   /* pipe_blendfactor happens to match the hardware. */
   PIPE_ASSERT(PIPE_BLENDFACTOR_ONE == BLENDFACTOR_ONE);
   PIPE_ASSERT(PIPE_BLENDFACTOR_SRC_COLOR == BLENDFACTOR_SRC_COLOR);
   PIPE_ASSERT(PIPE_BLENDFACTOR_SRC_ALPHA == BLENDFACTOR_SRC_ALPHA);
   PIPE_ASSERT(PIPE_BLENDFACTOR_DST_ALPHA == BLENDFACTOR_DST_ALPHA);
   PIPE_ASSERT(PIPE_BLENDFACTOR_DST_COLOR == BLENDFACTOR_DST_COLOR);
   PIPE_ASSERT(PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE == BLENDFACTOR_SRC_ALPHA_SATURATE);
   PIPE_ASSERT(PIPE_BLENDFACTOR_CONST_COLOR == BLENDFACTOR_CONST_COLOR);
   PIPE_ASSERT(PIPE_BLENDFACTOR_CONST_ALPHA == BLENDFACTOR_CONST_ALPHA);
   PIPE_ASSERT(PIPE_BLENDFACTOR_SRC1_COLOR == BLENDFACTOR_SRC1_COLOR);
   PIPE_ASSERT(PIPE_BLENDFACTOR_SRC1_ALPHA == BLENDFACTOR_SRC1_ALPHA);
   PIPE_ASSERT(PIPE_BLENDFACTOR_ZERO == BLENDFACTOR_ZERO);
   PIPE_ASSERT(PIPE_BLENDFACTOR_INV_SRC_COLOR == BLENDFACTOR_INV_SRC_COLOR);
   PIPE_ASSERT(PIPE_BLENDFACTOR_INV_SRC_ALPHA == BLENDFACTOR_INV_SRC_ALPHA);
   PIPE_ASSERT(PIPE_BLENDFACTOR_INV_DST_ALPHA == BLENDFACTOR_INV_DST_ALPHA);
   PIPE_ASSERT(PIPE_BLENDFACTOR_INV_DST_COLOR == BLENDFACTOR_INV_DST_COLOR);
   PIPE_ASSERT(PIPE_BLENDFACTOR_INV_CONST_COLOR == BLENDFACTOR_INV_CONST_COLOR);
   PIPE_ASSERT(PIPE_BLENDFACTOR_INV_CONST_ALPHA == BLENDFACTOR_INV_CONST_ALPHA);
   PIPE_ASSERT(PIPE_BLENDFACTOR_INV_SRC1_COLOR == BLENDFACTOR_INV_SRC1_COLOR);
   PIPE_ASSERT(PIPE_BLENDFACTOR_INV_SRC1_ALPHA == BLENDFACTOR_INV_SRC1_ALPHA);

   /* pipe_blend_func happens to match the hardware. */
   PIPE_ASSERT(PIPE_BLEND_ADD == BLENDFUNCTION_ADD);
   PIPE_ASSERT(PIPE_BLEND_SUBTRACT == BLENDFUNCTION_SUBTRACT);
   PIPE_ASSERT(PIPE_BLEND_REVERSE_SUBTRACT == BLENDFUNCTION_REVERSE_SUBTRACT);
   PIPE_ASSERT(PIPE_BLEND_MIN == BLENDFUNCTION_MIN);
   PIPE_ASSERT(PIPE_BLEND_MAX == BLENDFUNCTION_MAX);

   /* pipe_stencil_op happens to match the hardware, except that Gallium's
    * INCR/DECR are the saturating variants while the hardware's INCR/DECR
    * wrap (note the crossed INCRSAT/INCR pairings below).
    */
   PIPE_ASSERT(PIPE_STENCIL_OP_KEEP == STENCILOP_KEEP);
   PIPE_ASSERT(PIPE_STENCIL_OP_ZERO == STENCILOP_ZERO);
   PIPE_ASSERT(PIPE_STENCIL_OP_REPLACE == STENCILOP_REPLACE);
   PIPE_ASSERT(PIPE_STENCIL_OP_INCR == STENCILOP_INCRSAT);
   PIPE_ASSERT(PIPE_STENCIL_OP_DECR == STENCILOP_DECRSAT);
   PIPE_ASSERT(PIPE_STENCIL_OP_INCR_WRAP == STENCILOP_INCR);
   PIPE_ASSERT(PIPE_STENCIL_OP_DECR_WRAP == STENCILOP_DECR);
   PIPE_ASSERT(PIPE_STENCIL_OP_INVERT == STENCILOP_INVERT);

#if GFX_VER >= 6
   /* pipe_sprite_coord_mode happens to match 3DSTATE_SBE */
   PIPE_ASSERT(PIPE_SPRITE_COORD_UPPER_LEFT == UPPERLEFT);
   PIPE_ASSERT(PIPE_SPRITE_COORD_LOWER_LEFT == LOWERLEFT);
#endif
#undef PIPE_ASSERT
}
187
188 static unsigned
translate_prim_type(enum mesa_prim prim,uint8_t verts_per_patch)189 translate_prim_type(enum mesa_prim prim, uint8_t verts_per_patch)
190 {
191 static const unsigned map[] = {
192 [MESA_PRIM_POINTS] = _3DPRIM_POINTLIST,
193 [MESA_PRIM_LINES] = _3DPRIM_LINELIST,
194 [MESA_PRIM_LINE_LOOP] = _3DPRIM_LINELOOP,
195 [MESA_PRIM_LINE_STRIP] = _3DPRIM_LINESTRIP,
196 [MESA_PRIM_TRIANGLES] = _3DPRIM_TRILIST,
197 [MESA_PRIM_TRIANGLE_STRIP] = _3DPRIM_TRISTRIP,
198 [MESA_PRIM_TRIANGLE_FAN] = _3DPRIM_TRIFAN,
199 [MESA_PRIM_QUADS] = _3DPRIM_QUADLIST,
200 [MESA_PRIM_QUAD_STRIP] = _3DPRIM_QUADSTRIP,
201 [MESA_PRIM_POLYGON] = _3DPRIM_POLYGON,
202 #if GFX_VER >= 6
203 [MESA_PRIM_LINES_ADJACENCY] = _3DPRIM_LINELIST_ADJ,
204 [MESA_PRIM_LINE_STRIP_ADJACENCY] = _3DPRIM_LINESTRIP_ADJ,
205 [MESA_PRIM_TRIANGLES_ADJACENCY] = _3DPRIM_TRILIST_ADJ,
206 [MESA_PRIM_TRIANGLE_STRIP_ADJACENCY] = _3DPRIM_TRISTRIP_ADJ,
207 #endif
208 #if GFX_VER >= 7
209 [MESA_PRIM_PATCHES] = _3DPRIM_PATCHLIST_1 - 1,
210 #endif
211 };
212
213 return map[prim] + (prim == MESA_PRIM_PATCHES ? verts_per_patch : 0);
214 }
215
216 static unsigned
translate_compare_func(enum pipe_compare_func pipe_func)217 translate_compare_func(enum pipe_compare_func pipe_func)
218 {
219 static const unsigned map[] = {
220 [PIPE_FUNC_NEVER] = COMPAREFUNCTION_NEVER,
221 [PIPE_FUNC_LESS] = COMPAREFUNCTION_LESS,
222 [PIPE_FUNC_EQUAL] = COMPAREFUNCTION_EQUAL,
223 [PIPE_FUNC_LEQUAL] = COMPAREFUNCTION_LEQUAL,
224 [PIPE_FUNC_GREATER] = COMPAREFUNCTION_GREATER,
225 [PIPE_FUNC_NOTEQUAL] = COMPAREFUNCTION_NOTEQUAL,
226 [PIPE_FUNC_GEQUAL] = COMPAREFUNCTION_GEQUAL,
227 [PIPE_FUNC_ALWAYS] = COMPAREFUNCTION_ALWAYS,
228 };
229 return map[pipe_func];
230 }
231
232 static unsigned
translate_shadow_func(enum pipe_compare_func pipe_func)233 translate_shadow_func(enum pipe_compare_func pipe_func)
234 {
235 /* Gallium specifies the result of shadow comparisons as:
236 *
237 * 1 if ref <op> texel,
238 * 0 otherwise.
239 *
240 * The hardware does:
241 *
242 * 0 if texel <op> ref,
243 * 1 otherwise.
244 *
245 * So we need to flip the operator and also negate.
246 */
247 static const unsigned map[] = {
248 [PIPE_FUNC_NEVER] = PREFILTEROP_ALWAYS,
249 [PIPE_FUNC_LESS] = PREFILTEROP_LEQUAL,
250 [PIPE_FUNC_EQUAL] = PREFILTEROP_NOTEQUAL,
251 [PIPE_FUNC_LEQUAL] = PREFILTEROP_LESS,
252 [PIPE_FUNC_GREATER] = PREFILTEROP_GEQUAL,
253 [PIPE_FUNC_NOTEQUAL] = PREFILTEROP_EQUAL,
254 [PIPE_FUNC_GEQUAL] = PREFILTEROP_GREATER,
255 [PIPE_FUNC_ALWAYS] = PREFILTEROP_NEVER,
256 };
257 return map[pipe_func];
258 }
259
260 static unsigned
translate_cull_mode(unsigned pipe_face)261 translate_cull_mode(unsigned pipe_face)
262 {
263 static const unsigned map[4] = {
264 [PIPE_FACE_NONE] = CULLMODE_NONE,
265 [PIPE_FACE_FRONT] = CULLMODE_FRONT,
266 [PIPE_FACE_BACK] = CULLMODE_BACK,
267 [PIPE_FACE_FRONT_AND_BACK] = CULLMODE_BOTH,
268 };
269 return map[pipe_face];
270 }
271
272 #if GFX_VER >= 6
273 static unsigned
translate_fill_mode(unsigned pipe_polymode)274 translate_fill_mode(unsigned pipe_polymode)
275 {
276 static const unsigned map[4] = {
277 [PIPE_POLYGON_MODE_FILL] = FILL_MODE_SOLID,
278 [PIPE_POLYGON_MODE_LINE] = FILL_MODE_WIREFRAME,
279 [PIPE_POLYGON_MODE_POINT] = FILL_MODE_POINT,
280 [PIPE_POLYGON_MODE_FILL_RECTANGLE] = FILL_MODE_SOLID,
281 };
282 return map[pipe_polymode];
283 }
284 #endif
285
286 static unsigned
translate_mip_filter(enum pipe_tex_mipfilter pipe_mip)287 translate_mip_filter(enum pipe_tex_mipfilter pipe_mip)
288 {
289 static const unsigned map[] = {
290 [PIPE_TEX_MIPFILTER_NEAREST] = MIPFILTER_NEAREST,
291 [PIPE_TEX_MIPFILTER_LINEAR] = MIPFILTER_LINEAR,
292 [PIPE_TEX_MIPFILTER_NONE] = MIPFILTER_NONE,
293 };
294 return map[pipe_mip];
295 }
296
297 static uint32_t
translate_wrap(unsigned pipe_wrap,bool either_nearest)298 translate_wrap(unsigned pipe_wrap, bool either_nearest)
299 {
300 static const unsigned map[] = {
301 [PIPE_TEX_WRAP_REPEAT] = TCM_WRAP,
302 #if GFX_VER == 8
303 [PIPE_TEX_WRAP_CLAMP] = TCM_HALF_BORDER,
304 #else
305 [PIPE_TEX_WRAP_CLAMP] = TCM_CLAMP_BORDER,
306 #endif
307 [PIPE_TEX_WRAP_CLAMP_TO_EDGE] = TCM_CLAMP,
308 [PIPE_TEX_WRAP_CLAMP_TO_BORDER] = TCM_CLAMP_BORDER,
309 [PIPE_TEX_WRAP_MIRROR_REPEAT] = TCM_MIRROR,
310 [PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE] = TCM_MIRROR_ONCE,
311
312 /* These are unsupported. */
313 [PIPE_TEX_WRAP_MIRROR_CLAMP] = -1,
314 [PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER] = -1,
315 };
316 #if GFX_VER < 8
317 if (pipe_wrap == PIPE_TEX_WRAP_CLAMP && either_nearest)
318 return TCM_CLAMP;
319 #endif
320 return map[pipe_wrap];
321 }
322
/**
 * Allocate \p size bytes (aligned to \p alignment) in the batch's dynamic
 * state buffer.  Equivalent to elk_state_batch in the classic driver.
 *
 * Returns a CPU pointer into the mapped state buffer, and stores the byte
 * offset of the allocation in *out_offset.
 */
static uint32_t *
stream_state(struct crocus_batch *batch,
             unsigned size,
             unsigned alignment,
             uint32_t *out_offset)
{
   uint32_t offset = ALIGN(batch->state.used, alignment);

   /* Flush (wrap) if we'd run past the fixed wrap point and wrapping is
    * allowed; otherwise, grow the state buffer by 1.5x (capped at
    * MAX_STATE_SIZE) when it's simply too small.
    */
   if (offset + size >= STATE_SZ && !batch->no_wrap) {
      crocus_batch_flush(batch);
      offset = ALIGN(batch->state.used, alignment);
   } else if (offset + size >= batch->state.bo->size) {
      const unsigned new_size =
         MIN2(batch->state.bo->size + batch->state.bo->size / 2,
              MAX_STATE_SIZE);
      crocus_grow_buffer(batch, true, batch->state.used, new_size);
      assert(offset + size < batch->state.bo->size);
   }

   crocus_record_state_size(batch->state_sizes, offset, size);

   batch->state.used = offset + size;
   *out_offset = offset;

   /* offset is in bytes; the map is a DWord pointer, hence >> 2. */
   return (uint32_t *)batch->state.map + (offset >> 2);
}
352
/**
 * stream_state() + memcpy: upload \p size bytes of \p data into the
 * batch's state buffer and return the resulting byte offset.
 */
static uint32_t
emit_state(struct crocus_batch *batch, const void *data, unsigned size,
           unsigned alignment)
{
   uint32_t offset = 0;
   uint32_t *dest = stream_state(batch, size, alignment, &offset);

   if (dest != NULL)
      memcpy(dest, data, size);

   return offset;
}
368
369 #if GFX_VER <= 5
/**
 * Emit 3DSTATE_PIPELINED_POINTERS (gen4/5), pointing each fixed-function
 * pipeline stage at its unit state previously uploaded into the batch's
 * state buffer.
 *
 * Clip is always enabled; GS only when \p gs_active is set.
 */
static void
upload_pipelined_state_pointers(struct crocus_batch *batch,
                                bool gs_active, uint32_t gs_offset,
                                uint32_t vs_offset, uint32_t sf_offset,
                                uint32_t clip_offset, uint32_t wm_offset, uint32_t cc_offset)
{
#if GFX_VER == 5
   /* Need to flush before changing clip max threads for errata. */
   crocus_emit_cmd(batch, GENX(MI_FLUSH), foo);
#endif

   crocus_emit_cmd(batch, GENX(3DSTATE_PIPELINED_POINTERS), pp) {
      pp.PointertoVSState = ro_bo(batch->state.bo, vs_offset);
      pp.GSEnable = gs_active;
      if (gs_active)
         pp.PointertoGSState = ro_bo(batch->state.bo, gs_offset);
      pp.ClipEnable = true;
      pp.PointertoCLIPState = ro_bo(batch->state.bo, clip_offset);
      pp.PointertoSFState = ro_bo(batch->state.bo, sf_offset);
      pp.PointertoWMState = ro_bo(batch->state.bo, wm_offset);
      pp.PointertoColorCalcState = ro_bo(batch->state.bo, cc_offset);
   }
}
393
394 #endif
395 /**
396 * Did field 'x' change between 'old_cso' and 'new_cso'?
397 *
398 * (If so, we may want to set some dirty flags.)
399 */
400 #define cso_changed(x) (!old_cso || (old_cso->x != new_cso->x))
401 #define cso_changed_memcmp(x) \
402 (!old_cso || memcmp(old_cso->x, new_cso->x, sizeof(old_cso->x)) != 0)
403
/**
 * Emit the flushes required before changing STATE_BASE_ADDRESS (gen6+).
 */
static void
flush_before_state_base_change(struct crocus_batch *batch)
{
#if GFX_VER >= 6
   /* Flush before emitting STATE_BASE_ADDRESS.
    *
    * This isn't documented anywhere in the PRM.  However, it seems to be
    * necessary prior to changing the surface state base address.  We've
    * seen issues in Vulkan where we get GPU hangs when using multi-level
    * command buffers which clear depth, reset state base address, and then
    * go render stuff.
    *
    * Normally, in GL, we would trust the kernel to do sufficient stalls
    * and flushes prior to executing our batch.  However, it doesn't seem
    * as if the kernel's flushing is always sufficient and we don't want to
    * rely on it.
    *
    * We make this an end-of-pipe sync instead of a normal flush because we
    * do not know the current status of the GPU.  On Haswell at least,
    * having a fast-clear operation in flight at the same time as a normal
    * rendering operation can cause hangs.  Since the kernel's flushing is
    * insufficient, we need to ensure that any rendering operations from
    * other processes are definitely complete before we try to do our own
    * rendering.  It's a bit of a big hammer but it appears to work.
    */
   const unsigned dc_flush =
      GFX_VER >= 7 ? PIPE_CONTROL_DATA_CACHE_FLUSH : 0;
   crocus_emit_end_of_pipe_sync(batch,
                                "change STATE_BASE_ADDRESS (flushes)",
                                PIPE_CONTROL_RENDER_TARGET_FLUSH |
                                dc_flush |
                                PIPE_CONTROL_DEPTH_CACHE_FLUSH);
#endif
}
438
/**
 * Emit the cache invalidations required after changing STATE_BASE_ADDRESS
 * (gen6+), so stale surface state and binding tables aren't used.
 */
static void
flush_after_state_base_change(struct crocus_batch *batch)
{
   /* After re-setting the surface state base address, we have to do some
    * cache flushing so that the sampler engine will pick up the new
    * SURFACE_STATE objects and binding tables.  From the Broadwell PRM,
    * Shared Function > 3D Sampler > State > State Caching (page 96):
    *
    *    Coherency with system memory in the state cache, like the texture
    *    cache is handled partially by software.  It is expected that the
    *    command stream or shader will issue Cache Flush operation or
    *    Cache_Flush sampler message to ensure that the L1 cache remains
    *    coherent with system memory.
    *
    *    [...]
    *
    *    Whenever the value of the Dynamic_State_Base_Addr,
    *    Surface_State_Base_Addr are altered, the L1 state cache must be
    *    invalidated to ensure the new surface or sampler state is fetched
    *    from system memory.
    *
    * The PIPE_CONTROL command has a "State Cache Invalidation Enable" bit
    * which, according the PIPE_CONTROL instruction documentation in the
    * Broadwell PRM:
    *
    *    Setting this bit is independent of any other bit in this packet.
    *    This bit controls the invalidation of the L1 and L2 state caches
    *    at the top of the pipe i.e. at the parsing time.
    *
    * Unfortunately, experimentation seems to indicate that state cache
    * invalidation through a PIPE_CONTROL does nothing whatsoever in
    * regards to surface state and binding tables.  Instead, it seems that
    * invalidating the texture cache is what is actually needed.
    *
    * XXX: As far as we have been able to determine through
    * experimentation, flushing the texture cache appears to be
    * sufficient.  The theory here is that all of the sampling/rendering
    * units cache the binding table in the texture cache.  However, we have
    * yet to be able to actually confirm this.
    */
#if GFX_VER >= 6
   crocus_emit_end_of_pipe_sync(batch,
                                "change STATE_BASE_ADDRESS (invalidates)",
                                PIPE_CONTROL_INSTRUCTION_INVALIDATE |
                                PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
                                PIPE_CONTROL_CONST_CACHE_INVALIDATE |
                                PIPE_CONTROL_STATE_CACHE_INVALIDATE);
#endif
}
488
489 #if GFX_VER >= 6
/**
 * Emit MI_STORE_REGISTER_MEM to write a 32-bit MMIO register into a buffer.
 *
 * \param predicated make the store conditional on the MI predicate;
 *        only supported on Haswell and later (GFX_VERx10 >= 75).
 */
static void
crocus_store_register_mem32(struct crocus_batch *batch, uint32_t reg,
                            struct crocus_bo *bo, uint32_t offset,
                            bool predicated)
{
   crocus_emit_cmd(batch, GENX(MI_STORE_REGISTER_MEM), srm) {
      srm.RegisterAddress = reg;
      srm.MemoryAddress = ggtt_bo(bo, offset);
#if GFX_VERx10 >= 75
      srm.PredicateEnable = predicated;
#else
      if (predicated)
         unreachable("unsupported predication");
#endif
   }
}
506
/**
 * Store a 64-bit MMIO register to a buffer as two consecutive 32-bit
 * MI_STORE_REGISTER_MEM commands (low DWord first).
 */
static void
crocus_store_register_mem64(struct crocus_batch *batch, uint32_t reg,
                            struct crocus_bo *bo, uint32_t offset,
                            bool predicated)
{
   for (uint32_t dw = 0; dw < 8; dw += 4) {
      crocus_store_register_mem32(batch, reg + dw, bo, offset + dw,
                                  predicated);
   }
}
515 #endif
516
517 #if GFX_VER >= 7
/**
 * Emit MI_LOAD_REGISTER_IMM to write a 32-bit immediate to an MMIO register.
 */
static void
_crocus_emit_lri(struct crocus_batch *batch, uint32_t reg, uint32_t val)
{
   crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
      lri.RegisterOffset = reg;
      lri.DataDWord = val;
   }
}
/* Convenience wrapper taking a genxml register name instead of an address. */
#define crocus_emit_lri(b, r, v) _crocus_emit_lri(b, GENX(r##_num), v)
527
528 #if GFX_VERx10 >= 75
/**
 * Emit MI_LOAD_REGISTER_REG to copy one 32-bit MMIO register to another
 * (Haswell+ only).
 */
static void
_crocus_emit_lrr(struct crocus_batch *batch, uint32_t dst, uint32_t src)
{
   crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_REG), lrr) {
      lrr.SourceRegisterAddress = src;
      lrr.DestinationRegisterAddress = dst;
   }
}
537
/** Copy a 32-bit MMIO register to another register. */
static void
crocus_load_register_reg32(struct crocus_batch *batch, uint32_t dst,
                           uint32_t src)
{
   _crocus_emit_lrr(batch, dst, src);
}
544
/**
 * Copy a 64-bit MMIO register to another register, one 32-bit half at a
 * time (low DWord first).
 */
static void
crocus_load_register_reg64(struct crocus_batch *batch, uint32_t dst,
                           uint32_t src)
{
   for (uint32_t dw = 0; dw < 8; dw += 4)
      _crocus_emit_lrr(batch, dst + dw, src + dw);
}
552 #endif
553
/** Load a 32-bit immediate into an MMIO register. */
static void
crocus_load_register_imm32(struct crocus_batch *batch, uint32_t reg,
                           uint32_t val)
{
   _crocus_emit_lri(batch, reg, val);
}
560
/**
 * Load a 64-bit immediate into an MMIO register pair via two
 * MI_LOAD_REGISTER_IMM commands (low DWord first).
 */
static void
crocus_load_register_imm64(struct crocus_batch *batch, uint32_t reg,
                           uint64_t val)
{
   const uint32_t lo = val & 0xffffffff;
   const uint32_t hi = val >> 32;

   _crocus_emit_lri(batch, reg, lo);
   _crocus_emit_lri(batch, reg + 4, hi);
}
568
/**
 * Emit MI_LOAD_REGISTER_MEM to load a 32-bit MMIO register from a buffer.
 */
static void
crocus_load_register_mem32(struct crocus_batch *batch, uint32_t reg,
                           struct crocus_bo *bo, uint32_t offset)
{
   crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
      lrm.RegisterAddress = reg;
      lrm.MemoryAddress = ro_bo(bo, offset);
   }
}
581
/**
 * Load a 64-bit value from a buffer into an MMIO register pair via two
 * MI_LOAD_REGISTER_MEM commands (low DWord first).
 */
static void
crocus_load_register_mem64(struct crocus_batch *batch, uint32_t reg,
                           struct crocus_bo *bo, uint32_t offset)
{
   for (uint32_t dw = 0; dw < 8; dw += 4)
      crocus_load_register_mem32(batch, reg + dw, bo, offset + dw);
}
593
594 #if GFX_VERx10 >= 75
/**
 * Emit MI_STORE_DATA_IMM to write a 32-bit immediate into a buffer.
 */
static void
crocus_store_data_imm32(struct crocus_batch *batch,
                        struct crocus_bo *bo, uint32_t offset,
                        uint32_t imm)
{
   crocus_emit_cmd(batch, GENX(MI_STORE_DATA_IMM), sdi) {
      sdi.Address = rw_bo(bo, offset);
      /* The guard is always true here (this code is under GFX_VERx10 >= 75);
       * it exists because the genxml field is only defined for gen6+.
       */
#if GFX_VER >= 6
      sdi.ImmediateData = imm;
#endif
   }
}
607
/**
 * Emit MI_STORE_DATA_IMM to write a 64-bit immediate into a buffer.
 */
static void
crocus_store_data_imm64(struct crocus_batch *batch,
                        struct crocus_bo *bo, uint32_t offset,
                        uint64_t imm)
{
   /* Can't use crocus_emit_cmd because MI_STORE_DATA_IMM has a length of
    * 2 in genxml but it's actually variable length and we need 5 DWords.
    */
   void *map = crocus_get_command_space(batch, 4 * 5);
   _crocus_pack_command(batch, GENX(MI_STORE_DATA_IMM), map, sdi) {
      /* DWordLength is encoded as total length minus 2, per MI convention. */
      sdi.DWordLength = 5 - 2;
      sdi.Address = rw_bo(bo, offset);
#if GFX_VER >= 6
      sdi.ImmediateData = imm;
#endif
   }
}
625 #endif
626
/**
 * Copy \p bytes bytes between two BOs, one DWord at a time, by bouncing
 * each DWord through an MMIO register (there is no MI_COPY_MEM_MEM on
 * these gens).
 *
 * All of \p bytes, \p dst_offset, and \p src_offset must be DWord-aligned.
 */
static void
crocus_copy_mem_mem(struct crocus_batch *batch,
                    struct crocus_bo *dst_bo, uint32_t dst_offset,
                    struct crocus_bo *src_bo, uint32_t src_offset,
                    unsigned bytes)
{
   assert(bytes % 4 == 0);
   assert(dst_offset % 4 == 0);
   assert(src_offset % 4 == 0);

/* Scratch register used as the bounce buffer; presumably safe to clobber
 * between draws — TODO confirm no concurrent use of 3DPRIM_BASE_VERTEX.
 */
#define CROCUS_TEMP_REG 0x2440 /* GEN7_3DPRIM_BASE_VERTEX */
   for (unsigned i = 0; i < bytes; i += 4) {
      crocus_load_register_mem32(batch, CROCUS_TEMP_REG,
                                 src_bo, src_offset + i);
      crocus_store_register_mem32(batch, CROCUS_TEMP_REG,
                                  dst_bo, dst_offset + i, false);
   }
}
645 #endif
646
/**
 * Gallium CSO for rasterizer state.
 *
 * Holds a copy of the Gallium state plus pre-packed 3DSTATE command
 * payloads that can be memcpy'd into the batch at draw time.
 */
struct crocus_rasterizer_state {
   struct pipe_rasterizer_state cso;   /* copy of the Gallium state */
#if GFX_VER >= 6
   uint32_t sf[GENX(3DSTATE_SF_length)];       /* pre-packed 3DSTATE_SF */
   uint32_t clip[GENX(3DSTATE_CLIP_length)];   /* pre-packed 3DSTATE_CLIP */
#endif
#if GFX_VER >= 8
   uint32_t raster[GENX(3DSTATE_RASTER_length)]; /* pre-packed 3DSTATE_RASTER */
#endif
   uint32_t line_stipple[GENX(3DSTATE_LINE_STIPPLE_length)];

   /* Number of user clip plane constants needed by the clip distance code. */
   uint8_t num_clip_plane_consts;
   /* True if front or back polygon mode is point or line fill. */
   bool fill_mode_point_or_line;
};
664
665 #if GFX_VER <= 5
666 #define URB_VS 0
667 #define URB_GS 1
668 #define URB_CLP 2
669 #define URB_SF 3
670 #define URB_CS 4
671
/* Per-unit URB allocation limits for gen4/gen5, indexed by URB_*.
 * Entry counts and entry sizes; sizes appear to be in URB rows
 * (NOTE(review): presumably 64-byte units, matching i965 — confirm
 * against the gen4/5 PRM CS_URB_STATE description).
 */
static const struct {
   uint32_t min_nr_entries;
   uint32_t preferred_nr_entries;
   uint32_t min_entry_size;
   uint32_t max_entry_size;
} limits[URB_CS+1] = {
   { 16, 32, 1, 5 },                        /* vs */
   { 4, 8,  1, 5 },                         /* gs */
   { 5, 10,  1, 5 },                        /* clp */
   { 1, 8,  1, 12 },                        /* sf */
   { 1, 4,  1, 32 }                         /* cs */
};
684
/**
 * Compute the start offset of each unit's URB region from the current
 * entry counts/sizes (written into ice->urb as a side effect), and check
 * whether the resulting layout fits in the total URB size.
 *
 * Note that GS and clip entries share the VS entry size (vsize).
 */
static bool check_urb_layout(struct crocus_context *ice)
{
   ice->urb.vs_start = 0;
   ice->urb.gs_start = ice->urb.nr_vs_entries * ice->urb.vsize;
   ice->urb.clip_start = ice->urb.gs_start + ice->urb.nr_gs_entries * ice->urb.vsize;
   ice->urb.sf_start = ice->urb.clip_start + ice->urb.nr_clip_entries * ice->urb.vsize;
   ice->urb.cs_start = ice->urb.sf_start + ice->urb.nr_sf_entries * ice->urb.sfsize;

   return ice->urb.cs_start + ice->urb.nr_cs_entries *
      ice->urb.csize <= ice->urb.size;
}
696
697
/**
 * Recalculate the URB partition for the gen4/5 fixed-function units when
 * the requested entry sizes no longer fit the current layout.
 *
 * \param csize requested CURBE entry size
 * \param vsize requested VS/GS/clip entry size
 * \param sfsize requested SF entry size
 *
 * Returns true if the layout changed (so URB_FENCE must be re-emitted),
 * false if the existing layout is still adequate.  Exits the process if
 * no valid layout can be found (which should be impossible given the
 * minimum entry counts in 'limits').
 */
static bool
crocus_calculate_urb_fence(struct crocus_batch *batch, unsigned csize,
                           unsigned vsize, unsigned sfsize)
{
   struct crocus_context *ice = batch->ice;
   /* Clamp each requested size up to the unit's minimum entry size. */
   if (csize < limits[URB_CS].min_entry_size)
      csize = limits[URB_CS].min_entry_size;

   if (vsize < limits[URB_VS].min_entry_size)
      vsize = limits[URB_VS].min_entry_size;

   if (sfsize < limits[URB_SF].min_entry_size)
      sfsize = limits[URB_SF].min_entry_size;

   /* Recalculate if any entry size grew, or — when in constrained mode —
    * if a size shrank, in the hope of escaping back to the preferred
    * entry counts.
    */
   if (ice->urb.vsize < vsize ||
       ice->urb.sfsize < sfsize ||
       ice->urb.csize < csize ||
       (ice->urb.constrained && (ice->urb.vsize > vsize ||
                                 ice->urb.sfsize > sfsize ||
                                 ice->urb.csize > csize))) {


      ice->urb.csize = csize;
      ice->urb.sfsize = sfsize;
      ice->urb.vsize = vsize;

      /* Start from the preferred entry counts for every unit. */
      ice->urb.nr_vs_entries = limits[URB_VS].preferred_nr_entries;
      ice->urb.nr_gs_entries = limits[URB_GS].preferred_nr_entries;
      ice->urb.nr_clip_entries = limits[URB_CLP].preferred_nr_entries;
      ice->urb.nr_sf_entries = limits[URB_SF].preferred_nr_entries;
      ice->urb.nr_cs_entries = limits[URB_CS].preferred_nr_entries;

      ice->urb.constrained = 0;

      /* Gen5 and G45 have larger URBs; try bigger VS/SF allocations
       * first, falling back to the preferred counts if they don't fit.
       */
      if (GFX_VER == 5) {
         ice->urb.nr_vs_entries = 128;
         ice->urb.nr_sf_entries = 48;
         if (check_urb_layout(ice)) {
            goto done;
         } else {
            ice->urb.constrained = 1;
            ice->urb.nr_vs_entries = limits[URB_VS].preferred_nr_entries;
            ice->urb.nr_sf_entries = limits[URB_SF].preferred_nr_entries;
         }
      } else if (GFX_VERx10 == 45) {
         ice->urb.nr_vs_entries = 64;
         if (check_urb_layout(ice)) {
            goto done;
         } else {
            ice->urb.constrained = 1;
            ice->urb.nr_vs_entries = limits[URB_VS].preferred_nr_entries;
         }
      }

      /* Preferred counts didn't fit either: drop to the minimums. */
      if (!check_urb_layout(ice)) {
         ice->urb.nr_vs_entries = limits[URB_VS].min_nr_entries;
         ice->urb.nr_gs_entries = limits[URB_GS].min_nr_entries;
         ice->urb.nr_clip_entries = limits[URB_CLP].min_nr_entries;
         ice->urb.nr_sf_entries = limits[URB_SF].min_nr_entries;
         ice->urb.nr_cs_entries = limits[URB_CS].min_nr_entries;

         /* Mark us as operating with constrained nr_entries, so that next
          * time we recalculate we'll resize the fences in the hope of
          * escaping constrained mode and getting back to normal performance.
          */
         ice->urb.constrained = 1;

         if (!check_urb_layout(ice)) {
            /* This is impossible, given the maximal sizes of urb
             * entries and the values for minimum nr of entries
             * provided above.
             */
            fprintf(stderr, "couldn't calculate URB layout!\n");
            exit(1);
         }

         if (INTEL_DEBUG(DEBUG_URB|DEBUG_PERF))
            fprintf(stderr, "URB CONSTRAINED\n");
      }

done:
      if (INTEL_DEBUG(DEBUG_URB))
         fprintf(stderr,
                 "URB fence: %d ..VS.. %d ..GS.. %d ..CLP.. %d ..SF.. %d ..CS.. %d\n",
                 ice->urb.vs_start,
                 ice->urb.gs_start,
                 ice->urb.clip_start,
                 ice->urb.sf_start,
                 ice->urb.cs_start,
                 ice->urb.size);
      return true;
   }
   return false;
}
792
793 static void
crocus_upload_urb_fence(struct crocus_batch * batch)794 crocus_upload_urb_fence(struct crocus_batch *batch)
795 {
796 uint32_t urb_fence[3];
797 _crocus_pack_command(batch, GENX(URB_FENCE), urb_fence, urb) {
798 urb.VSUnitURBReallocationRequest = 1;
799 urb.GSUnitURBReallocationRequest = 1;
800 urb.CLIPUnitURBReallocationRequest = 1;
801 urb.SFUnitURBReallocationRequest = 1;
802 urb.VFEUnitURBReallocationRequest = 1;
803 urb.CSUnitURBReallocationRequest = 1;
804
805 urb.VSFence = batch->ice->urb.gs_start;
806 urb.GSFence = batch->ice->urb.clip_start;
807 urb.CLIPFence = batch->ice->urb.sf_start;
808 urb.SFFence = batch->ice->urb.cs_start;
809 urb.CSFence = batch->ice->urb.size;
810 }
811
812 /* erratum: URB_FENCE must not cross a 64byte cacheline */
813 if ((crocus_batch_bytes_used(batch) & 15) > 12) {
814 int pad = 16 - (crocus_batch_bytes_used(batch) & 15);
815 do {
816 *(uint32_t *)batch->command.map_next = 0;
817 batch->command.map_next += sizeof(uint32_t);
818 } while (--pad);
819 }
820
821 crocus_batch_emit(batch, urb_fence, sizeof(uint32_t) * 3);
822 }
823
/**
 * Compute the layout of the gen4/5 CURBE (push constant buffer), carving
 * out regions for the WM (fragment), clip, and VS units.
 *
 * All region sizes are in 512-bit (16-dword) CURBE allocation units.
 *
 * Returns true if the layout changed (so the CURBE contents must be
 * re-uploaded), false if the previously cached layout is still usable.
 */
static bool
calculate_curbe_offsets(struct crocus_batch *batch)
{
   struct crocus_context *ice = batch->ice;

   unsigned nr_fp_regs, nr_vp_regs, nr_clip_regs = 0;
   unsigned total_regs;

   nr_fp_regs = 0;
   for (int i = 0; i < 4; i++) {
      const struct elk_ubo_range *range = &ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data->ubo_ranges[i];
      if (range->length == 0)
         continue;

      /* ubo range tracks at 256-bit, we need 512-bit */
      nr_fp_regs += (range->length + 1) / 2;
   }

   if (ice->state.cso_rast->cso.clip_plane_enable) {
      /* Six fixed planes plus one vec4 per enabled user clip plane;
       * each CURBE unit holds four vec4s.
       */
      unsigned nr_planes = 6 + util_bitcount(ice->state.cso_rast->cso.clip_plane_enable);
      nr_clip_regs = (nr_planes * 4 + 15) / 16;
   }

   nr_vp_regs = 0;
   for (int i = 0; i < 4; i++) {
      const struct elk_ubo_range *range = &ice->shaders.prog[MESA_SHADER_VERTEX]->prog_data->ubo_ranges[i];
      if (range->length == 0)
         continue;

      /* ubo range tracks at 256-bit, we need 512-bit */
      nr_vp_regs += (range->length + 1) / 2;
   }
   if (nr_vp_regs == 0) {
      /* The pre-gen6 VS requires that some push constants get loaded no
       * matter what, or the GPU would hang.
       */
      nr_vp_regs = 1;
   }
   total_regs = nr_fp_regs + nr_vp_regs + nr_clip_regs;

   /* The CURBE allocation size is limited to 32 512-bit units (128 EU
    * registers, or 1024 floats). See CS_URB_STATE in the gen4 or gen5
    * (volume 1, part 1) PRMs.
    *
    * Note that in elk_fs.cpp we're only loading up to 16 EU registers of
    * values as push constants before spilling to pull constants, and in
    * elk_vec4.cpp we're loading up to 32 registers of push constants. An EU
    * register is 1/2 of one of these URB entry units, so that leaves us 16 EU
    * regs for clip.
    */
   assert(total_regs <= 32);

   /* Lazy resize: only recompute the layout when a region grew, the clip
    * region changed size, or the total shrank to less than a quarter of a
    * currently-large (>16 unit) allocation.
    */
   if (nr_fp_regs > ice->curbe.wm_size ||
       nr_vp_regs > ice->curbe.vs_size ||
       nr_clip_regs != ice->curbe.clip_size ||
       (total_regs < ice->curbe.total_size / 4 &&
        ice->curbe.total_size > 16)) {

      GLuint reg = 0;

      /* Calculate a new layout: WM first, then clip, then VS. */
      reg = 0;
      ice->curbe.wm_start = reg;
      ice->curbe.wm_size = nr_fp_regs; reg += nr_fp_regs;
      ice->curbe.clip_start = reg;
      ice->curbe.clip_size = nr_clip_regs; reg += nr_clip_regs;
      ice->curbe.vs_start = reg;
      ice->curbe.vs_size = nr_vp_regs; reg += nr_vp_regs;
      ice->curbe.total_size = reg;

      if (0)
         fprintf(stderr, "curbe wm %d+%d clip %d+%d vs %d+%d\n",
                 ice->curbe.wm_start,
                 ice->curbe.wm_size,
                 ice->curbe.clip_start,
                 ice->curbe.clip_size,
                 ice->curbe.vs_start,
                 ice->curbe.vs_size );
      return true;
   }
   return false;
}
909
910 static void
upload_shader_consts(struct crocus_context * ice,gl_shader_stage stage,uint32_t * map,unsigned start)911 upload_shader_consts(struct crocus_context *ice,
912 gl_shader_stage stage,
913 uint32_t *map,
914 unsigned start)
915 {
916 struct crocus_compiled_shader *shader = ice->shaders.prog[stage];
917 struct elk_stage_prog_data *prog_data = (void *) shader->prog_data;
918 uint32_t *cmap;
919 bool found = false;
920 unsigned offset = start * 16;
921 int total = 0;
922 for (int i = 0; i < 4; i++) {
923 const struct elk_ubo_range *range = &prog_data->ubo_ranges[i];
924
925 if (range->length == 0)
926 continue;
927
928 unsigned block_index = crocus_bti_to_group_index(
929 &shader->bt, CROCUS_SURFACE_GROUP_UBO, range->block);
930 unsigned len = range->length * 8 * sizeof(float);
931 unsigned start = range->start * 8 * sizeof(float);
932 struct pipe_transfer *transfer;
933
934 cmap = pipe_buffer_map_range(&ice->ctx, ice->state.shaders[stage].constbufs[block_index].buffer,
935 ice->state.shaders[stage].constbufs[block_index].buffer_offset + start, len,
936 PIPE_MAP_READ | PIPE_MAP_UNSYNCHRONIZED, &transfer);
937 if (cmap)
938 memcpy(&map[offset + (total * 8)], cmap, len);
939 pipe_buffer_unmap(&ice->ctx, transfer);
940 total += range->length;
941 found = true;
942 }
943
944 if (stage == MESA_SHADER_VERTEX && !found) {
945 /* The pre-gen6 VS requires that some push constants get loaded no
946 * matter what, or the GPU would hang.
947 */
948 unsigned len = 16;
949 memset(&map[offset], 0, len);
950 }
951 }
952
/* Plane equations for the six fixed clip-space planes that the gen4/5
 * clipper always receives ahead of any user-defined clip planes (see
 * gen4_upload_curbe).  Each row is a vec4 (a, b, c, d) plane equation;
 * presumably these correspond to the +/-z, +/-y, +/-x frustum faces —
 * confirm against the i965 gen4 clip setup if it matters.
 */
static const float fixed_plane[6][4] = {
   { 0,    0,   -1, 1 },
   { 0,    0,    1, 1 },
   { 0,   -1,    0, 1 },
   { 0,    1,    0, 1 },
   {-1,    0,    0, 1 },
   { 1,    0,    0, 1 }
};
961
/**
 * Build and upload the gen4/5 CURBE buffer (push constants for the WM,
 * clip, and VS units) using the layout from calculate_curbe_offsets(),
 * then emit a CONSTANT_BUFFER packet pointing at it.
 */
static void
gen4_upload_curbe(struct crocus_batch *batch)
{
   struct crocus_context *ice = batch->ice;
   const unsigned sz = ice->curbe.total_size;       /* in 512-bit units */
   const unsigned buf_sz = sz * 16 * sizeof(float); /* in bytes */

   if (sz == 0)
      goto emit;

   uint32_t *map;
   u_upload_alloc(ice->ctx.const_uploader, 0, buf_sz, 64,
                  &ice->curbe.curbe_offset, (struct pipe_resource **)&ice->curbe.curbe_res, (void **) &map);

   /* fragment shader constants */
   if (ice->curbe.wm_size) {
      upload_shader_consts(ice, MESA_SHADER_FRAGMENT, map, ice->curbe.wm_start);
   }

   /* clipper constants */
   if (ice->curbe.clip_size) {
      unsigned offset = ice->curbe.clip_start * 16;  /* in floats/dwords */
      float *fmap = (float *)map;
      unsigned i;
      /* If any planes are going this way, send them all this way:
       */
      for (i = 0; i < 6; i++) {
         fmap[offset + i * 4 + 0] = fixed_plane[i][0];
         fmap[offset + i * 4 + 1] = fixed_plane[i][1];
         fmap[offset + i * 4 + 2] = fixed_plane[i][2];
         fmap[offset + i * 4 + 3] = fixed_plane[i][3];
      }

      /* Append the enabled user clip planes after the six fixed ones. */
      unsigned mask = ice->state.cso_rast->cso.clip_plane_enable;
      struct pipe_clip_state *cp = &ice->state.clip_planes;
      while (mask) {
         const int j = u_bit_scan(&mask);
         fmap[offset + i * 4 + 0] = cp->ucp[j][0];
         fmap[offset + i * 4 + 1] = cp->ucp[j][1];
         fmap[offset + i * 4 + 2] = cp->ucp[j][2];
         fmap[offset + i * 4 + 3] = cp->ucp[j][3];
         i++;
      }
   }

   /* vertex shader constants */
   if (ice->curbe.vs_size) {
      upload_shader_consts(ice, MESA_SHADER_VERTEX, map, ice->curbe.vs_start);
   }
   if (0) {
      /* Debug dump of the whole CURBE contents; disabled by default. */
      for (int i = 0; i < sz*16; i+=4) {
         float *f = (float *)map;
         fprintf(stderr, "curbe %d.%d: %f %f %f %f\n", i/8, i&4,
                 f[i+0], f[i+1], f[i+2], f[i+3]);
      }
   }

 emit:
   crocus_emit_cmd(batch, GENX(CONSTANT_BUFFER), cb) {
      if (ice->curbe.curbe_res) {
         /* BufferLength is encoded as (size in 512-bit units) - 1. */
         cb.BufferLength = ice->curbe.total_size - 1;
         cb.Valid = 1;
         cb.BufferStartingAddress = ro_bo(ice->curbe.curbe_res->bo, ice->curbe.curbe_offset);
      }
   }

#if GFX_VER == 4 && GFX_VERx10 != 45
   /* Work around a Broadwater/Crestline depth interpolator bug. The
    * following sequence will cause GPU hangs:
    *
    * 1. Change state so that all depth related fields in CC_STATE are
    *    disabled, and in WM_STATE, only "PS Use Source Depth" is enabled.
    * 2. Emit a CONSTANT_BUFFER packet.
    * 3. Draw via 3DPRIMITIVE.
    *
    * The recommended workaround is to emit a non-pipelined state change after
    * emitting CONSTANT_BUFFER, in order to drain the windowizer pipeline.
    *
    * We arbitrarily choose 3DSTATE_GLOBAL_DEPTH_CLAMP_OFFSET (as it's small),
    * and always emit it when "PS Use Source Depth" is set. We could be more
    * precise, but the additional complexity is probably not worth it.
    *
    */
   const struct shader_info *fs_info =
      crocus_get_shader_info(ice, MESA_SHADER_FRAGMENT);

   if (BITSET_TEST(fs_info->system_values_read, SYSTEM_VALUE_FRAG_COORD)) {
      ice->state.global_depth_offset_clamp = 0;
      crocus_emit_cmd(batch, GENX(3DSTATE_GLOBAL_DEPTH_OFFSET_CLAMP), clamp);
   }
#endif
}
1054 #endif
1055
1056 #if GFX_VER >= 7
1057
1058 #define IVB_L3SQCREG1_SQGHPCI_DEFAULT 0x00730000
1059 #define VLV_L3SQCREG1_SQGHPCI_DEFAULT 0x00d30000
1060 #define HSW_L3SQCREG1_SQGHPCI_DEFAULT 0x00610000
1061
/**
 * Program the L3 cache partitioning registers according to \p cfg,
 * surrounding the register writes with the PIPE_CONTROL drain/invalidate
 * sequence the hardware requires before L3 repartitioning.
 */
static void
setup_l3_config(struct crocus_batch *batch, const struct intel_l3_config *cfg)
{
#if GFX_VER == 7
   const struct intel_device_info *devinfo = &batch->screen->devinfo;
   /* Which clients get a piece of L3, either directly or via the combined
    * RO/ALL partitions.
    */
   const bool has_dc = cfg->n[INTEL_L3P_DC] || cfg->n[INTEL_L3P_ALL];
   const bool has_is = cfg->n[INTEL_L3P_IS] || cfg->n[INTEL_L3P_RO] ||
                       cfg->n[INTEL_L3P_ALL];
   const bool has_c = cfg->n[INTEL_L3P_C] || cfg->n[INTEL_L3P_RO] ||
                      cfg->n[INTEL_L3P_ALL];
   const bool has_t = cfg->n[INTEL_L3P_T] || cfg->n[INTEL_L3P_RO] ||
                      cfg->n[INTEL_L3P_ALL];
   const bool has_slm = cfg->n[INTEL_L3P_SLM];
#endif

   /* According to the hardware docs, the L3 partitioning can only be changed
    * while the pipeline is completely drained and the caches are flushed,
    * which involves a first PIPE_CONTROL flush which stalls the pipeline...
    */
   crocus_emit_pipe_control_flush(batch, "l3_config",
                                  PIPE_CONTROL_DATA_CACHE_FLUSH |
                                  PIPE_CONTROL_CS_STALL);

   /* ...followed by a second pipelined PIPE_CONTROL that initiates
    * invalidation of the relevant caches. Note that because RO invalidation
    * happens at the top of the pipeline (i.e. right away as the PIPE_CONTROL
    * command is processed by the CS) we cannot combine it with the previous
    * stalling flush as the hardware documentation suggests, because that
    * would cause the CS to stall on previous rendering *after* RO
    * invalidation and wouldn't prevent the RO caches from being polluted by
    * concurrent rendering before the stall completes. This intentionally
    * doesn't implement the SKL+ hardware workaround suggesting to enable CS
    * stall on PIPE_CONTROLs with the texture cache invalidation bit set for
    * GPGPU workloads because the previous and subsequent PIPE_CONTROLs
    * already guarantee that there is no concurrent GPGPU kernel execution
    * (see SKL HSD 2132585).
    */
   crocus_emit_pipe_control_flush(batch, "l3 config",
                                  PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
                                  PIPE_CONTROL_CONST_CACHE_INVALIDATE |
                                  PIPE_CONTROL_INSTRUCTION_INVALIDATE |
                                  PIPE_CONTROL_STATE_CACHE_INVALIDATE);

   /* Now send a third stalling flush to make sure that invalidation is
    * complete when the L3 configuration registers are modified.
    */
   crocus_emit_pipe_control_flush(batch, "l3 config",
                                  PIPE_CONTROL_DATA_CACHE_FLUSH |
                                  PIPE_CONTROL_CS_STALL);

#if GFX_VER == 8
   /* Gen8 has a single combined L3 control register. */
   assert(!cfg->n[INTEL_L3P_IS] && !cfg->n[INTEL_L3P_C] && !cfg->n[INTEL_L3P_T]);
   crocus_emit_reg(batch, GENX(L3CNTLREG), reg) {
      reg.SLMEnable = cfg->n[INTEL_L3P_SLM] > 0;
      reg.URBAllocation = cfg->n[INTEL_L3P_URB];
      reg.ROAllocation = cfg->n[INTEL_L3P_RO];
      reg.DCAllocation = cfg->n[INTEL_L3P_DC];
      reg.AllAllocation = cfg->n[INTEL_L3P_ALL];
   }
#else
   assert(!cfg->n[INTEL_L3P_ALL]);

   /* When enabled SLM only uses a portion of the L3 on half of the banks,
    * the matching space on the remaining banks has to be allocated to a
    * client (URB for all validated configurations) set to the
    * lower-bandwidth 2-bank address hashing mode.
    */
   const bool urb_low_bw = has_slm && devinfo->platform != INTEL_PLATFORM_BYT;
   assert(!urb_low_bw || cfg->n[INTEL_L3P_URB] == cfg->n[INTEL_L3P_SLM]);

   /* Minimum number of ways that can be allocated to the URB. */
   const unsigned n0_urb = (devinfo->platform == INTEL_PLATFORM_BYT ? 32 : 0);
   assert(cfg->n[INTEL_L3P_URB] >= n0_urb);

   uint32_t l3sqcr1, l3cr2, l3cr3;

   crocus_pack_state(GENX(L3SQCREG1), &l3sqcr1, reg) {
      /* Clients without an L3 partition are converted to uncached. */
      reg.ConvertDC_UC = !has_dc;
      reg.ConvertIS_UC = !has_is;
      reg.ConvertC_UC = !has_c;
      reg.ConvertT_UC = !has_t;
#if GFX_VERx10 == 75
      reg.L3SQGeneralPriorityCreditInitialization = SQGPCI_DEFAULT;
#else
      reg.L3SQGeneralPriorityCreditInitialization =
         devinfo->platform == INTEL_PLATFORM_BYT ? BYT_SQGPCI_DEFAULT : SQGPCI_DEFAULT;
#endif
      reg.L3SQHighPriorityCreditInitialization = SQHPCI_DEFAULT;
   };

   crocus_pack_state(GENX(L3CNTLREG2), &l3cr2, reg) {
      reg.SLMEnable = has_slm;
      reg.URBLowBandwidth = urb_low_bw;
      reg.URBAllocation = cfg->n[INTEL_L3P_URB] - n0_urb;
#if !(GFX_VERx10 == 75)
      reg.ALLAllocation = cfg->n[INTEL_L3P_ALL];
#endif
      reg.ROAllocation = cfg->n[INTEL_L3P_RO];
      reg.DCAllocation = cfg->n[INTEL_L3P_DC];
   };

   crocus_pack_state(GENX(L3CNTLREG3), &l3cr3, reg) {
      reg.ISAllocation = cfg->n[INTEL_L3P_IS];
      reg.ISLowBandwidth = 0;
      reg.CAllocation = cfg->n[INTEL_L3P_C];
      reg.CLowBandwidth = 0;
      reg.TAllocation = cfg->n[INTEL_L3P_T];
      reg.TLowBandwidth = 0;
   };

   /* Set up the L3 partitioning. */
   crocus_emit_lri(batch, L3SQCREG1, l3sqcr1);
   crocus_emit_lri(batch, L3CNTLREG2, l3cr2);
   crocus_emit_lri(batch, L3CNTLREG3, l3cr3);

#if GFX_VERx10 == 75
   /* TODO: Fail screen creation if command parser version < 4 */
   uint32_t scratch1, chicken3;
   crocus_pack_state(GENX(SCRATCH1), &scratch1, reg) {
      reg.L3AtomicDisable = !has_dc;
   }
   crocus_pack_state(GENX(CHICKEN3), &chicken3, reg) {
      reg.L3AtomicDisableMask = true;
      reg.L3AtomicDisable = !has_dc;
   }
   crocus_emit_lri(batch, SCRATCH1, scratch1);
   crocus_emit_lri(batch, CHICKEN3, chicken3);
#endif
#endif
}
1192
1193 static void
emit_l3_state(struct crocus_batch * batch,bool compute)1194 emit_l3_state(struct crocus_batch *batch, bool compute)
1195 {
1196 const struct intel_l3_config *const cfg =
1197 compute ? batch->screen->l3_config_cs : batch->screen->l3_config_3d;
1198
1199 setup_l3_config(batch, cfg);
1200 if (INTEL_DEBUG(DEBUG_L3)) {
1201 intel_dump_l3_config(cfg, stderr);
1202 }
1203 }
1204
1205 /**
1206 * Emit a PIPE_CONTROL command for gen7 with the CS Stall bit set.
1207 */
1208 static void
gen7_emit_cs_stall_flush(struct crocus_batch * batch)1209 gen7_emit_cs_stall_flush(struct crocus_batch *batch)
1210 {
1211 crocus_emit_pipe_control_write(batch,
1212 "workaround",
1213 PIPE_CONTROL_CS_STALL
1214 | PIPE_CONTROL_WRITE_IMMEDIATE,
1215 batch->ice->workaround_bo,
1216 batch->ice->workaround_offset, 0);
1217 }
1218 #endif
1219
/**
 * Emit PIPELINE_SELECT to switch between the 3D and GPGPU pipelines,
 * including the generation-specific flush/invalidate workarounds required
 * before and after the switch.
 */
static void
emit_pipeline_select(struct crocus_batch *batch, uint32_t pipeline)
{
#if GFX_VER == 8
   /* From the Broadwell PRM, Volume 2a: Instructions, PIPELINE_SELECT:
    *
    *   Software must clear the COLOR_CALC_STATE Valid field in
    *   3DSTATE_CC_STATE_POINTERS command prior to send a PIPELINE_SELECT
    *   with Pipeline Select set to GPGPU.
    *
    * The internal hardware docs recommend the same workaround for Gfx9
    * hardware too.
    */
   if (pipeline == GPGPU)
      crocus_emit_cmd(batch, GENX(3DSTATE_CC_STATE_POINTERS), t);
#endif

#if GFX_VER >= 6
   /* From "BXML » GT » MI » vol1a GPU Overview » [Instruction]
    * PIPELINE_SELECT [DevBWR+]":
    *
    *    "Project: DEVSNB+
    *
    *     Software must ensure all the write caches are flushed through a
    *     stalling PIPE_CONTROL command followed by another PIPE_CONTROL
    *     command to invalidate read only caches prior to programming
    *     MI_PIPELINE_SELECT command to change the Pipeline Select Mode."
    */
   const unsigned dc_flush =
      GFX_VER >= 7 ? PIPE_CONTROL_DATA_CACHE_FLUSH : 0;
   crocus_emit_pipe_control_flush(batch,
                                  "workaround: PIPELINE_SELECT flushes (1/2)",
                                  PIPE_CONTROL_RENDER_TARGET_FLUSH |
                                  PIPE_CONTROL_DEPTH_CACHE_FLUSH |
                                  dc_flush |
                                  PIPE_CONTROL_CS_STALL);

   crocus_emit_pipe_control_flush(batch,
                                  "workaround: PIPELINE_SELECT flushes (2/2)",
                                  PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
                                  PIPE_CONTROL_CONST_CACHE_INVALIDATE |
                                  PIPE_CONTROL_STATE_CACHE_INVALIDATE |
                                  PIPE_CONTROL_INSTRUCTION_INVALIDATE);
#else
   /* From "BXML » GT » MI » vol1a GPU Overview » [Instruction]
    * PIPELINE_SELECT [DevBWR+]":
    *
    *   Project: PRE-DEVSNB
    *
    *   Software must ensure the current pipeline is flushed via an
    *   MI_FLUSH or PIPE_CONTROL prior to the execution of PIPELINE_SELECT.
    */
   crocus_emit_cmd(batch, GENX(MI_FLUSH), foo);
#endif

   crocus_emit_cmd(batch, GENX(PIPELINE_SELECT), sel) {
      sel.PipelineSelection = pipeline;
   }

#if GFX_VER == 7 && !(GFX_VERx10 == 75)
   /* IVB/BYT only: after switching back to 3D, stall and emit a dummy
    * point-list 3DPRIMITIVE to settle the pipeline.
    */
   if (pipeline == _3D) {
      gen7_emit_cs_stall_flush(batch);

      crocus_emit_cmd(batch, GENX(3DPRIMITIVE), prim) {
         prim.PrimitiveTopologyType = _3DPRIM_POINTLIST;
      };
   }
#endif
}
1289
1290 /**
1291 * The following diagram shows how we partition the URB:
1292 *
1293 * 16kB or 32kB Rest of the URB space
1294 * __________-__________ _________________-_________________
1295 * / \ / \
1296 * +-------------------------------------------------------------+
1297 * | VS/HS/DS/GS/FS Push | VS/HS/DS/GS URB |
1298 * | Constants | Entries |
1299 * +-------------------------------------------------------------+
1300 *
1301 * Notably, push constants must be stored at the beginning of the URB
1302 * space, while entries can be stored anywhere. Ivybridge and Haswell
1303 * GT1/GT2 have a maximum constant buffer size of 16kB, while Haswell GT3
1304 * doubles this (32kB).
1305 *
1306 * Ivybridge and Haswell GT1/GT2 allow push constants to be located (and
1307 * sized) in increments of 1kB. Haswell GT3 requires them to be located and
1308 * sized in increments of 2kB.
1309 *
1310 * Currently we split the constant buffer space evenly among whatever stages
1311 * are active. This is probably not ideal, but simple.
1312 *
1313 * Ivybridge GT1 and Haswell GT1 have 128kB of URB space.
1314 * Ivybridge GT2 and Haswell GT2 have 256kB of URB space.
1315 * Haswell GT3 has 512kB of URB space.
1316 *
1317 * See "Volume 2a: 3D Pipeline," section 1.8, "Volume 1b: Configurations",
1318 * and the documentation for 3DSTATE_PUSH_CONSTANT_ALLOC_xS.
1319 */
1320 #if GFX_VER >= 7
/**
 * Program a static partitioning of the push constant space among the five
 * shader stages via 3DSTATE_PUSH_CONSTANT_ALLOC_{VS..PS}.
 */
static void
crocus_alloc_push_constants(struct crocus_batch *batch)
{
   const unsigned push_constant_kb =
      batch->screen->devinfo.max_constant_urb_size_kb;
   unsigned size_per_stage = push_constant_kb / 5;

   /* For now, we set a static partitioning of the push constant area,
    * assuming that all stages could be in use.
    *
    * TODO: Try lazily allocating the HS/DS/GS sections as needed, and
    *       see if that improves performance by offering more space to
    *       the VS/FS when those aren't in use.  Also, try dynamically
    *       enabling/disabling it like i965 does.  This would be more
    *       stalls and may not actually help; we don't know yet.
    */
   for (int i = 0; i <= MESA_SHADER_FRAGMENT; i++) {
      crocus_emit_cmd(batch, GENX(3DSTATE_PUSH_CONSTANT_ALLOC_VS), alloc) {
         /* The per-stage ALLOC packets have consecutive subopcodes,
          * starting at 18 for the VS.
          */
         alloc._3DCommandSubOpcode = 18 + i;
         alloc.ConstantBufferOffset = size_per_stage * i;
         /* The FS (last stage) absorbs the rounding remainder. */
         alloc.ConstantBufferSize = i == MESA_SHADER_FRAGMENT ? (push_constant_kb - 4 * size_per_stage) : size_per_stage;
      }
   }

   /* From p292 of the Ivy Bridge PRM (11.2.4 3DSTATE_PUSH_CONSTANT_ALLOC_PS):
    *
    *     A PIPE_CONTROL command with the CS Stall bit set must be programmed
    *     in the ring after this instruction.
    *
    * No such restriction exists for Haswell or Baytrail.
    */
   if (batch->screen->devinfo.platform == INTEL_PLATFORM_IVB)
      gen7_emit_cs_stall_flush(batch);
}
1355 #endif
1356
1357 /**
1358 * Upload the initial GPU state for a render context.
1359 *
1360 * This sets some invariant state that needs to be programmed a particular
1361 * way, but we never actually change.
1362 */
static void
crocus_init_render_context(struct crocus_batch *batch)
{
   UNUSED const struct intel_device_info *devinfo = &batch->screen->devinfo;

   emit_pipeline_select(batch, _3D);

   /* Zeroed STATE_SIP: no system instruction pointer. */
   crocus_emit_cmd(batch, GENX(STATE_SIP), foo);

#if GFX_VER >= 7
   emit_l3_state(batch, false);
#endif
#if (GFX_VERx10 == 70 || GFX_VERx10 == 80)
   /* Make CONSTANT_BUFFER addresses absolute rather than dynamic-state
    * relative on IVB/BDW.
    */
   crocus_emit_reg(batch, GENX(INSTPM), reg) {
      reg.CONSTANT_BUFFERAddressOffsetDisable = true;
      reg.CONSTANT_BUFFERAddressOffsetDisableMask = true;
   }
#endif
#if GFX_VER >= 5 || GFX_VERx10 == 45
   /* Use the legacy AA line coverage computation. */
   crocus_emit_cmd(batch, GENX(3DSTATE_AA_LINE_PARAMETERS), foo);
#endif

   /* No polygon stippling offsets are necessary. */
   /* TODO: may need to set an offset for origin-UL framebuffers */
   crocus_emit_cmd(batch, GENX(3DSTATE_POLY_STIPPLE_OFFSET), foo);

#if GFX_VER >= 7
   crocus_alloc_push_constants(batch);
#endif

#if GFX_VER == 8
   /* Set the initial MSAA sample positions. */
   crocus_emit_cmd(batch, GENX(3DSTATE_SAMPLE_PATTERN), pat) {
      INTEL_SAMPLE_POS_1X(pat._1xSample);
      INTEL_SAMPLE_POS_2X(pat._2xSample);
      INTEL_SAMPLE_POS_4X(pat._4xSample);
      INTEL_SAMPLE_POS_8X(pat._8xSample);
   }

   /* Disable chromakeying (it's for media) */
   crocus_emit_cmd(batch, GENX(3DSTATE_WM_CHROMAKEY), foo);

   /* We want regular rendering, not special HiZ operations. */
   crocus_emit_cmd(batch, GENX(3DSTATE_WM_HZ_OP), foo);
#endif
}
1410
1411 #if GFX_VER >= 7
1412 static void
crocus_init_compute_context(struct crocus_batch * batch)1413 crocus_init_compute_context(struct crocus_batch *batch)
1414 {
1415 UNUSED const struct intel_device_info *devinfo = &batch->screen->devinfo;
1416
1417 emit_pipeline_select(batch, GPGPU);
1418
1419 #if GFX_VER >= 7
1420 emit_l3_state(batch, true);
1421 #endif
1422 }
1423 #endif
1424
1425 /**
1426 * Generation-specific context state (ice->state.genx->...).
1427 *
1428 * Most state can go in crocus_context directly, but these encode hardware
1429 * packets which vary by generation.
1430 */
1431 struct crocus_genx_state {
1432 struct {
1433 #if GFX_VER >= 7
1434 struct isl_image_param image_param[PIPE_MAX_SHADER_IMAGES];
1435 #endif
1436 } shaders[MESA_SHADER_STAGES];
1437
1438 #if GFX_VER == 8
1439 bool pma_fix_enabled;
1440 #endif
1441 };
1442
1443 /**
1444 * The pipe->set_blend_color() driver hook.
1445 *
1446 * This corresponds to our COLOR_CALC_STATE.
1447 */
1448 static void
crocus_set_blend_color(struct pipe_context * ctx,const struct pipe_blend_color * state)1449 crocus_set_blend_color(struct pipe_context *ctx,
1450 const struct pipe_blend_color *state)
1451 {
1452 struct crocus_context *ice = (struct crocus_context *) ctx;
1453
1454 /* Our COLOR_CALC_STATE is exactly pipe_blend_color, so just memcpy */
1455 memcpy(&ice->state.blend_color, state, sizeof(struct pipe_blend_color));
1456 #if GFX_VER <= 5
1457 ice->state.dirty |= CROCUS_DIRTY_GEN4_CONSTANT_COLOR;
1458 #else
1459 ice->state.dirty |= CROCUS_DIRTY_COLOR_CALC_STATE;
1460 #endif
1461 }
1462
1463 /**
1464 * Gallium CSO for blend state (see pipe_blend_state).
1465 */
1466 struct crocus_blend_state {
1467 #if GFX_VER == 8
1468 /** Partial 3DSTATE_PS_BLEND */
1469 uint32_t ps_blend[GENX(3DSTATE_PS_BLEND_length)];
1470 #endif
1471
1472 /** copy of BLEND_STATE */
1473 struct pipe_blend_state cso;
1474
1475 /** Bitfield of whether blending is enabled for RT[i] - for aux resolves */
1476 uint8_t blend_enables;
1477
1478 /** Bitfield of whether color writes are enabled for RT[i] */
1479 uint8_t color_write_enables;
1480
1481 /** Does RT[0] use dual color blending? */
1482 bool dual_color_blending;
1483 };
1484
1485 static enum pipe_blendfactor
fix_blendfactor(enum pipe_blendfactor f,bool alpha_to_one)1486 fix_blendfactor(enum pipe_blendfactor f, bool alpha_to_one)
1487 {
1488 if (alpha_to_one) {
1489 if (f == PIPE_BLENDFACTOR_SRC1_ALPHA)
1490 return PIPE_BLENDFACTOR_ONE;
1491
1492 if (f == PIPE_BLENDFACTOR_INV_SRC1_ALPHA)
1493 return PIPE_BLENDFACTOR_ZERO;
1494 }
1495
1496 return f;
1497 }
1498
#if GFX_VER >= 6
/* Per-RT blend controls live in BLEND_STATE entries on gen6+, but are part
 * of COLOR_CALC_STATE on gen4/5.  Alias the two so set_blend_entry_bits()
 * can be written once for both.
 */
typedef struct GENX(BLEND_STATE_ENTRY) BLEND_ENTRY_GENXML;
#else
typedef struct GENX(COLOR_CALC_STATE) BLEND_ENTRY_GENXML;
#endif
1504
1505 static bool
can_emit_logic_op(struct crocus_context * ice)1506 can_emit_logic_op(struct crocus_context *ice)
1507 {
1508 /* all pre gen8 have logicop restricted to unorm */
1509 enum pipe_format pformat = PIPE_FORMAT_NONE;
1510 for (unsigned i = 0; i < ice->state.framebuffer.nr_cbufs; i++) {
1511 if (ice->state.framebuffer.cbufs[i]) {
1512 pformat = ice->state.framebuffer.cbufs[i]->format;
1513 break;
1514 }
1515 }
1516 return (pformat == PIPE_FORMAT_NONE || util_format_is_unorm(pformat));
1517 }
1518
/**
 * Fill in the blend enable/function/factor fields of a hardware blend
 * entry (BLEND_STATE_ENTRY on gen6+, COLOR_CALC_STATE on gen4/5) for
 * render target \p idx.
 *
 * Returns true if RGB and alpha use different functions or factors,
 * i.e. independent alpha blending must be enabled.
 */
static bool
set_blend_entry_bits(struct crocus_batch *batch, BLEND_ENTRY_GENXML *entry,
                     struct crocus_blend_state *cso_blend,
                     int idx)
{
   struct crocus_context *ice = batch->ice;
   bool independent_alpha_blend = false;
   /* Without independent blending, every RT uses the RT[0] settings. */
   const struct pipe_rt_blend_state *rt =
      &cso_blend->cso.rt[cso_blend->cso.independent_blend_enable ? idx : 0];
   const unsigned blend_enabled = rt->blend_enable;

   enum pipe_blendfactor src_rgb =
      fix_blendfactor(rt->rgb_src_factor, cso_blend->cso.alpha_to_one);
   enum pipe_blendfactor src_alpha =
      fix_blendfactor(rt->alpha_src_factor, cso_blend->cso.alpha_to_one);
   enum pipe_blendfactor dst_rgb =
      fix_blendfactor(rt->rgb_dst_factor, cso_blend->cso.alpha_to_one);
   enum pipe_blendfactor dst_alpha =
      fix_blendfactor(rt->alpha_dst_factor, cso_blend->cso.alpha_to_one);

   if (rt->rgb_func != rt->alpha_func ||
       src_rgb != src_alpha || dst_rgb != dst_alpha)
      independent_alpha_blend = true;
   if (cso_blend->cso.logicop_enable) {
      /* Pre-gen8 can only do logic ops on UNORM targets; otherwise the
       * logic op is silently skipped (see can_emit_logic_op).
       */
      if (GFX_VER >= 8 || can_emit_logic_op(ice)) {
         entry->LogicOpEnable = cso_blend->cso.logicop_enable;
         entry->LogicOpFunction = cso_blend->cso.logicop_func;
      }
   } else if (blend_enabled) {
      if (idx == 0) {
         /* Only enable blending on RT0 when either no dual-source blending
          * was requested, or the bound FS actually emits dual sources.
          */
         struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_FRAGMENT];
         struct elk_wm_prog_data *wm_prog_data = (void *) shader->prog_data;
         entry->ColorBufferBlendEnable =
            (!cso_blend->dual_color_blending || wm_prog_data->dual_src_blend);
      } else
         entry->ColorBufferBlendEnable = 1;

      entry->ColorBlendFunction = rt->rgb_func;
      entry->AlphaBlendFunction = rt->alpha_func;
      /* The casts prevent warnings about implicit enum type conversions. */
      entry->SourceBlendFactor = (int) src_rgb;
      entry->SourceAlphaBlendFactor = (int) src_alpha;
      entry->DestinationBlendFactor = (int) dst_rgb;
      entry->DestinationAlphaBlendFactor = (int) dst_alpha;
   }
#if GFX_VER <= 5
   /*
    * Gen4/GM45/ILK can't handle have ColorBufferBlendEnable == 0
    * when a dual src blend shader is in use. Setup dummy blending.
    */
   struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_FRAGMENT];
   struct elk_wm_prog_data *wm_prog_data = (void *) shader->prog_data;
   if (idx == 0 && !blend_enabled && wm_prog_data->dual_src_blend) {
      entry->ColorBufferBlendEnable = 1;
      entry->ColorBlendFunction = PIPE_BLEND_ADD;
      entry->AlphaBlendFunction = PIPE_BLEND_ADD;
      entry->SourceBlendFactor = PIPE_BLENDFACTOR_ONE;
      entry->SourceAlphaBlendFactor = PIPE_BLENDFACTOR_ONE;
      entry->DestinationBlendFactor = PIPE_BLENDFACTOR_ZERO;
      entry->DestinationAlphaBlendFactor = PIPE_BLENDFACTOR_ZERO;
   }
#endif
   return independent_alpha_blend;
}
1582
1583 /**
1584 * The pipe->create_blend_state() driver hook.
1585 *
1586 * Translates a pipe_blend_state into crocus_blend_state.
1587 */
1588 static void *
crocus_create_blend_state(struct pipe_context * ctx,const struct pipe_blend_state * state)1589 crocus_create_blend_state(struct pipe_context *ctx,
1590 const struct pipe_blend_state *state)
1591 {
1592 struct crocus_blend_state *cso = malloc(sizeof(struct crocus_blend_state));
1593
1594 cso->blend_enables = 0;
1595 cso->color_write_enables = 0;
1596 STATIC_ASSERT(ELK_MAX_DRAW_BUFFERS <= 8);
1597
1598 cso->cso = *state;
1599 cso->dual_color_blending = util_blend_state_is_dual(state, 0);
1600
1601 #if GFX_VER == 8
1602 bool indep_alpha_blend = false;
1603 #endif
1604 for (int i = 0; i < ELK_MAX_DRAW_BUFFERS; i++) {
1605 const struct pipe_rt_blend_state *rt =
1606 &state->rt[state->independent_blend_enable ? i : 0];
1607 if (rt->blend_enable)
1608 cso->blend_enables |= 1u << i;
1609 if (rt->colormask)
1610 cso->color_write_enables |= 1u << i;
1611 #if GFX_VER == 8
1612 enum pipe_blendfactor src_rgb =
1613 fix_blendfactor(rt->rgb_src_factor, state->alpha_to_one);
1614 enum pipe_blendfactor src_alpha =
1615 fix_blendfactor(rt->alpha_src_factor, state->alpha_to_one);
1616 enum pipe_blendfactor dst_rgb =
1617 fix_blendfactor(rt->rgb_dst_factor, state->alpha_to_one);
1618 enum pipe_blendfactor dst_alpha =
1619 fix_blendfactor(rt->alpha_dst_factor, state->alpha_to_one);
1620
1621 if (rt->rgb_func != rt->alpha_func ||
1622 src_rgb != src_alpha || dst_rgb != dst_alpha)
1623 indep_alpha_blend = true;
1624 #endif
1625 }
1626
1627 #if GFX_VER == 8
1628 crocus_pack_command(GENX(3DSTATE_PS_BLEND), cso->ps_blend, pb) {
1629 /* pb.HasWriteableRT is filled in at draw time.
1630 * pb.AlphaTestEnable is filled in at draw time.
1631 *
1632 * pb.ColorBufferBlendEnable is filled in at draw time so we can avoid
1633 * setting it when dual color blending without an appropriate shader.
1634 */
1635
1636 pb.AlphaToCoverageEnable = state->alpha_to_coverage;
1637 pb.IndependentAlphaBlendEnable = indep_alpha_blend;
1638
1639 /* The casts prevent warnings about implicit enum type conversions. */
1640 pb.SourceBlendFactor =
1641 (int) fix_blendfactor(state->rt[0].rgb_src_factor, state->alpha_to_one);
1642 pb.SourceAlphaBlendFactor =
1643 (int) fix_blendfactor(state->rt[0].alpha_src_factor, state->alpha_to_one);
1644 pb.DestinationBlendFactor =
1645 (int) fix_blendfactor(state->rt[0].rgb_dst_factor, state->alpha_to_one);
1646 pb.DestinationAlphaBlendFactor =
1647 (int) fix_blendfactor(state->rt[0].alpha_dst_factor, state->alpha_to_one);
1648 }
1649 #endif
1650 return cso;
1651 }
1652
1653 /**
1654 * The pipe->bind_blend_state() driver hook.
1655 *
1656 * Bind a blending CSO and flag related dirty bits.
1657 */
static void
crocus_bind_blend_state(struct pipe_context *ctx, void *state)
{
   struct crocus_context *ice = (struct crocus_context *) ctx;
   struct crocus_blend_state *cso = state;

   ice->state.cso_blend = cso;
   /* Cache the per-RT enable bits (used for aux resolve decisions);
    * binding NULL clears them.
    */
   ice->state.blend_enables = cso ? cso->blend_enables : 0;

   ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_FS;
   ice->state.dirty |= CROCUS_DIRTY_WM;
#if GFX_VER >= 6
   ice->state.dirty |= CROCUS_DIRTY_GEN6_BLEND_STATE;
#endif
#if GFX_VER >= 7
   ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_FS;
#endif
#if GFX_VER == 8
   ice->state.dirty |= CROCUS_DIRTY_GEN8_PMA_FIX;
   ice->state.dirty |= CROCUS_DIRTY_GEN8_PS_BLEND;
#endif
   ice->state.dirty |= CROCUS_DIRTY_COLOR_CALC_STATE;
   ice->state.dirty |= CROCUS_DIRTY_RENDER_RESOLVES_AND_FLUSHES;
   /* Also flag any shader stages whose compiled variants depend on blend
    * state (tracked via stage_dirty_for_nos).
    */
   ice->state.stage_dirty |= ice->state.stage_dirty_for_nos[CROCUS_NOS_BLEND];
}
1683
1684 /**
1685 * Return true if the FS writes to any color outputs which are not disabled
1686 * via color masking.
1687 */
1688 static bool
has_writeable_rt(const struct crocus_blend_state * cso_blend,const struct shader_info * fs_info)1689 has_writeable_rt(const struct crocus_blend_state *cso_blend,
1690 const struct shader_info *fs_info)
1691 {
1692 if (!fs_info)
1693 return false;
1694
1695 unsigned rt_outputs = fs_info->outputs_written >> FRAG_RESULT_DATA0;
1696
1697 if (fs_info->outputs_written & BITFIELD64_BIT(FRAG_RESULT_COLOR))
1698 rt_outputs = (1 << ELK_MAX_DRAW_BUFFERS) - 1;
1699
1700 return cso_blend->color_write_enables & rt_outputs;
1701 }
1702
1703 /**
1704 * Gallium CSO for depth, stencil, and alpha testing state.
1705 */
struct crocus_depth_stencil_alpha_state {
   /** Complete copy of the Gallium depth/stencil/alpha state. */
   struct pipe_depth_stencil_alpha_state cso;

   /* Cached write-enable summaries (see crocus_create_zsa_state),
    * used elsewhere without re-deriving them from cso.
    */
   bool depth_writes_enabled;
   bool stencil_writes_enabled;
};
1712
1713 /**
1714 * The pipe->create_depth_stencil_alpha_state() driver hook.
1715 *
1716 * We encode most of 3DSTATE_WM_DEPTH_STENCIL, and just save off the alpha
1717 * testing state since we need pieces of it in a variety of places.
1718 */
1719 static void *
crocus_create_zsa_state(struct pipe_context * ctx,const struct pipe_depth_stencil_alpha_state * state)1720 crocus_create_zsa_state(struct pipe_context *ctx,
1721 const struct pipe_depth_stencil_alpha_state *state)
1722 {
1723 struct crocus_depth_stencil_alpha_state *cso =
1724 malloc(sizeof(struct crocus_depth_stencil_alpha_state));
1725
1726 bool two_sided_stencil = state->stencil[1].enabled;
1727 cso->cso = *state;
1728
1729 cso->depth_writes_enabled = state->depth_writemask;
1730 cso->stencil_writes_enabled =
1731 state->stencil[0].writemask != 0 ||
1732 (two_sided_stencil && state->stencil[1].writemask != 0);
1733
1734 /* The state tracker needs to optimize away EQUAL writes for us. */
1735 assert(!(state->depth_func == PIPE_FUNC_EQUAL && state->depth_writemask));
1736
1737 return cso;
1738 }
1739
1740 /**
1741 * The pipe->bind_depth_stencil_alpha_state() driver hook.
1742 *
1743 * Bind a depth/stencil/alpha CSO and flag related dirty bits.
1744 */
static void
crocus_bind_zsa_state(struct pipe_context *ctx, void *state)
{
   struct crocus_context *ice = (struct crocus_context *) ctx;
   struct crocus_depth_stencil_alpha_state *old_cso = ice->state.cso_zsa;
   struct crocus_depth_stencil_alpha_state *new_cso = state;

   if (new_cso) {
      /* cso_changed() is a macro that presumably compares the named field
       * between old_cso and new_cso (see its definition); only flag dirty
       * bits for state that actually differs.
       */
      if (cso_changed(cso.alpha_ref_value))
         ice->state.dirty |= CROCUS_DIRTY_COLOR_CALC_STATE;

      if (cso_changed(cso.alpha_enabled))
         ice->state.dirty |= CROCUS_DIRTY_WM;
#if GFX_VER >= 6
      if (cso_changed(cso.alpha_enabled))
         ice->state.dirty |= CROCUS_DIRTY_GEN6_BLEND_STATE;

      if (cso_changed(cso.alpha_func))
         ice->state.dirty |= CROCUS_DIRTY_GEN6_BLEND_STATE;
#endif
#if GFX_VER == 8
      if (cso_changed(cso.alpha_enabled))
         ice->state.dirty |= CROCUS_DIRTY_GEN8_PS_BLEND;
#endif

      /* Toggling depth writes can change what resolves/flushes we need. */
      if (cso_changed(depth_writes_enabled))
         ice->state.dirty |= CROCUS_DIRTY_RENDER_RESOLVES_AND_FLUSHES;

      ice->state.depth_writes_enabled = new_cso->depth_writes_enabled;
      ice->state.stencil_writes_enabled = new_cso->stencil_writes_enabled;

#if GFX_VER <= 5
      ice->state.dirty |= CROCUS_DIRTY_COLOR_CALC_STATE;
#endif
   }

   /* These are flagged unconditionally, even when unbinding (NULL). */
   ice->state.cso_zsa = new_cso;
   ice->state.dirty |= CROCUS_DIRTY_CC_VIEWPORT;
#if GFX_VER >= 6
   ice->state.dirty |= CROCUS_DIRTY_GEN6_WM_DEPTH_STENCIL;
#endif
#if GFX_VER == 8
   ice->state.dirty |= CROCUS_DIRTY_GEN8_PMA_FIX;
#endif
   ice->state.stage_dirty |= ice->state.stage_dirty_for_nos[CROCUS_NOS_DEPTH_STENCIL_ALPHA];
}
1791
#if GFX_VER == 8
/**
 * Evaluate whether the Gfx8 PMA (pixel mask array) stall-avoidance fix
 * should currently be enabled, based on the bound FS, blend, ZSA, and
 * framebuffer state.  The hardware state equations are spelled out in
 * the comments below.
 */
static bool
want_pma_fix(struct crocus_context *ice)
{
   UNUSED struct crocus_screen *screen = (void *) ice->ctx.screen;
   UNUSED const struct intel_device_info *devinfo = &screen->devinfo;
   const struct elk_wm_prog_data *wm_prog_data = (void *)
      ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data;
   const struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
   const struct crocus_depth_stencil_alpha_state *cso_zsa = ice->state.cso_zsa;
   const struct crocus_blend_state *cso_blend = ice->state.cso_blend;

   /* In very specific combinations of state, we can instruct Gfx8-9 hardware
    * to avoid stalling at the pixel mask array.  The state equations are
    * documented in these places:
    *
    *  - Gfx8 Depth PMA Fix:   CACHE_MODE_1::NP_PMA_FIX_ENABLE
    *  - Gfx9 Stencil PMA Fix: CACHE_MODE_0::STC PMA Optimization Enable
    *
    * Both equations share some common elements:
    *
    *    no_hiz_op =
    *       !(3DSTATE_WM_HZ_OP::DepthBufferClear ||
    *         3DSTATE_WM_HZ_OP::DepthBufferResolve ||
    *         3DSTATE_WM_HZ_OP::Hierarchical Depth Buffer Resolve Enable ||
    *         3DSTATE_WM_HZ_OP::StencilBufferClear) &&
    *
    *    killpixels =
    *       3DSTATE_WM::ForceKillPix != ForceOff &&
    *       (3DSTATE_PS_EXTRA::PixelShaderKillsPixels ||
    *        3DSTATE_PS_EXTRA::oMask Present to RenderTarget ||
    *        3DSTATE_PS_BLEND::AlphaToCoverageEnable ||
    *        3DSTATE_PS_BLEND::AlphaTestEnable ||
    *        3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable)
    *
    *    (Technically the stencil PMA treats ForceKillPix differently,
    *     but I think this is a documentation oversight, and we don't
    *     ever use it in this way, so it doesn't matter).
    *
    *    common_pma_fix =
    *       3DSTATE_WM::ForceThreadDispatch != 1 &&
    *       3DSTATE_RASTER::ForceSampleCount == NUMRASTSAMPLES_0 &&
    *       3DSTATE_DEPTH_BUFFER::SURFACE_TYPE != NULL &&
    *       3DSTATE_DEPTH_BUFFER::HIZ Enable &&
    *       3DSTATE_WM::EDSC_Mode != EDSC_PREPS &&
    *       3DSTATE_PS_EXTRA::PixelShaderValid &&
    *       no_hiz_op
    *
    * These are always true:
    *
    *    3DSTATE_RASTER::ForceSampleCount == NUMRASTSAMPLES_0
    *    3DSTATE_PS_EXTRA::PixelShaderValid
    *
    * Also, we never use the normal drawing path for HiZ ops; these are true:
    *
    *    !(3DSTATE_WM_HZ_OP::DepthBufferClear ||
    *      3DSTATE_WM_HZ_OP::DepthBufferResolve ||
    *      3DSTATE_WM_HZ_OP::Hierarchical Depth Buffer Resolve Enable ||
    *      3DSTATE_WM_HZ_OP::StencilBufferClear)
    *
    * This happens sometimes:
    *
    *    3DSTATE_WM::ForceThreadDispatch != 1
    *
    * However, we choose to ignore it as it either agrees with the signal
    * (dispatch was already enabled, so nothing out of the ordinary), or
    * there are no framebuffer attachments (so no depth or HiZ anyway,
    * meaning the PMA signal will already be disabled).
    */

   if (!cso_fb->zsbuf)
      return false;

   struct crocus_resource *zres, *sres;
   crocus_get_depth_stencil_resources(devinfo,
                                      cso_fb->zsbuf->texture, &zres, &sres);

   /* 3DSTATE_DEPTH_BUFFER::SURFACE_TYPE != NULL &&
    * 3DSTATE_DEPTH_BUFFER::HIZ Enable &&
    */
   if (!zres || !crocus_resource_level_has_hiz(zres, cso_fb->zsbuf->u.tex.level))
      return false;

   /* 3DSTATE_WM::EDSC_Mode != EDSC_PREPS */
   if (wm_prog_data->early_fragment_tests)
      return false;

   /* 3DSTATE_WM::ForceKillPix != ForceOff &&
    * (3DSTATE_PS_EXTRA::PixelShaderKillsPixels ||
    *  3DSTATE_PS_EXTRA::oMask Present to RenderTarget ||
    *  3DSTATE_PS_BLEND::AlphaToCoverageEnable ||
    *  3DSTATE_PS_BLEND::AlphaTestEnable ||
    *  3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable)
    */
   bool killpixels = wm_prog_data->uses_kill || wm_prog_data->uses_omask ||
      cso_blend->cso.alpha_to_coverage || cso_zsa->cso.alpha_enabled;

   /* The Gfx8 depth PMA equation becomes:
    *
    *    depth_writes =
    *       3DSTATE_WM_DEPTH_STENCIL::DepthWriteEnable &&
    *       3DSTATE_DEPTH_BUFFER::DEPTH_WRITE_ENABLE
    *
    *    stencil_writes =
    *       3DSTATE_WM_DEPTH_STENCIL::Stencil Buffer Write Enable &&
    *       3DSTATE_DEPTH_BUFFER::STENCIL_WRITE_ENABLE &&
    *       3DSTATE_STENCIL_BUFFER::STENCIL_BUFFER_ENABLE
    *
    *    Z_PMA_OPT =
    *       common_pma_fix &&
    *       3DSTATE_WM_DEPTH_STENCIL::DepthTestEnable &&
    *       ((killpixels && (depth_writes || stencil_writes)) ||
    *        3DSTATE_PS_EXTRA::PixelShaderComputedDepthMode != PSCDEPTH_OFF)
    *
    */
   if (!cso_zsa->cso.depth_enabled)
      return false;

   return wm_prog_data->computed_depth_mode != PSCDEPTH_OFF ||
          (killpixels && (cso_zsa->depth_writes_enabled ||
                          (sres && cso_zsa->stencil_writes_enabled)));
}
#endif
1915 void
genX(crocus_update_pma_fix)1916 genX(crocus_update_pma_fix)(struct crocus_context *ice,
1917 struct crocus_batch *batch,
1918 bool enable)
1919 {
1920 #if GFX_VER == 8
1921 struct crocus_genx_state *genx = ice->state.genx;
1922
1923 if (genx->pma_fix_enabled == enable)
1924 return;
1925
1926 genx->pma_fix_enabled = enable;
1927
1928 /* According to the Broadwell PIPE_CONTROL documentation, software should
1929 * emit a PIPE_CONTROL with the CS Stall and Depth Cache Flush bits set
1930 * prior to the LRI. If stencil buffer writes are enabled, then a Render * Cache Flush is also necessary.
1931 *
1932 * The Gfx9 docs say to use a depth stall rather than a command streamer
1933 * stall. However, the hardware seems to violently disagree. A full
1934 * command streamer stall seems to be needed in both cases.
1935 */
1936 crocus_emit_pipe_control_flush(batch, "PMA fix change (1/2)",
1937 PIPE_CONTROL_CS_STALL |
1938 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
1939 PIPE_CONTROL_RENDER_TARGET_FLUSH);
1940
1941 crocus_emit_reg(batch, GENX(CACHE_MODE_1), reg) {
1942 reg.NPPMAFixEnable = enable;
1943 reg.NPEarlyZFailsDisable = enable;
1944 reg.NPPMAFixEnableMask = true;
1945 reg.NPEarlyZFailsDisableMask = true;
1946 }
1947
1948 /* After the LRI, a PIPE_CONTROL with both the Depth Stall and Depth Cache
1949 * Flush bits is often necessary. We do it regardless because it's easier.
1950 * The render cache flush is also necessary if stencil writes are enabled.
1951 *
1952 * Again, the Gfx9 docs give a different set of flushes but the Broadwell
1953 * flushes seem to work just as well.
1954 */
1955 crocus_emit_pipe_control_flush(batch, "PMA fix change (1/2)",
1956 PIPE_CONTROL_DEPTH_STALL |
1957 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
1958 PIPE_CONTROL_RENDER_TARGET_FLUSH);
1959 #endif
1960 }
1961
1962 static float
get_line_width(const struct pipe_rasterizer_state * state)1963 get_line_width(const struct pipe_rasterizer_state *state)
1964 {
1965 float line_width = state->line_width;
1966
1967 /* From the OpenGL 4.4 spec:
1968 *
1969 * "The actual width of non-antialiased lines is determined by rounding
1970 * the supplied width to the nearest integer, then clamping it to the
1971 * implementation-dependent maximum non-antialiased line width."
1972 */
1973 if (!state->multisample && !state->line_smooth)
1974 line_width = roundf(state->line_width);
1975
1976 if (!state->multisample && state->line_smooth && line_width < 1.5f) {
1977 /* For 1 pixel line thickness or less, the general anti-aliasing
1978 * algorithm gives up, and a garbage line is generated. Setting a
1979 * Line Width of 0.0 specifies the rasterization of the "thinnest"
1980 * (one-pixel-wide), non-antialiased lines.
1981 *
1982 * Lines rendered with zero Line Width are rasterized using the
1983 * "Grid Intersection Quantization" rules as specified by the
1984 * "Zero-Width (Cosmetic) Line Rasterization" section of the docs.
1985 */
1986 /* hack around this for gfx4/5 fps counters in hud. */
1987 line_width = GFX_VER < 6 ? 1.5f : 0.0f;
1988 }
1989 return line_width;
1990 }
1991
1992 /**
1993 * The pipe->create_rasterizer_state() driver hook.
1994 */
static void *
crocus_create_rasterizer_state(struct pipe_context *ctx,
                               const struct pipe_rasterizer_state *state)
{
   /* NOTE(review): malloc result is not checked before use; an OOM here
    * dereferences NULL.  Other create hooks in this file (e.g. the sampler
    * one) check and return NULL — consider doing the same here.
    */
   struct crocus_rasterizer_state *cso =
      malloc(sizeof(struct crocus_rasterizer_state));

   /* Cached: either face is drawn as points or lines (affects draws). */
   cso->fill_mode_point_or_line =
      state->fill_front == PIPE_POLYGON_MODE_LINE ||
      state->fill_front == PIPE_POLYGON_MODE_POINT ||
      state->fill_back == PIPE_POLYGON_MODE_LINE ||
      state->fill_back == PIPE_POLYGON_MODE_POINT;

   /* Number of clip-plane constants = index of the highest enabled plane
    * plus one (planes below it are uploaded even if individually disabled).
    */
   if (state->clip_plane_enable != 0)
      cso->num_clip_plane_consts = util_logbase2(state->clip_plane_enable) + 1;
   else
      cso->num_clip_plane_consts = 0;

   cso->cso = *state;

#if GFX_VER >= 6
   float line_width = get_line_width(state);

   /* Pre-pack 3DSTATE_SF once at create time; draw time just copies it. */
   crocus_pack_command(GENX(3DSTATE_SF), cso->sf, sf) {
      sf.StatisticsEnable = true;
      sf.AALineDistanceMode = AALINEDISTANCE_TRUE;
      sf.LineEndCapAntialiasingRegionWidth =
         state->line_smooth ? _10pixels : _05pixels;
      sf.LastPixelEnable = state->line_last_pixel;
#if GFX_VER <= 7
      sf.AntialiasingEnable = state->line_smooth;
#endif
#if GFX_VER == 8
      /* Cherryview has a separate wide-line width field. */
      struct crocus_screen *screen = (struct crocus_screen *)ctx->screen;
      if (screen->devinfo.platform == INTEL_PLATFORM_CHV)
         sf.CHVLineWidth = line_width;
      else
         sf.LineWidth = line_width;
#else
      sf.LineWidth = line_width;
#endif
      sf.PointWidthSource = state->point_size_per_vertex ? Vertex : State;
      sf.PointWidth = state->point_size;

      if (state->flatshade_first) {
         sf.TriangleFanProvokingVertexSelect = 1;
      } else {
         sf.TriangleStripListProvokingVertexSelect = 2;
         sf.TriangleFanProvokingVertexSelect = 2;
         sf.LineStripListProvokingVertexSelect = 1;
      }

#if GFX_VER == 6
      sf.AttributeSwizzleEnable = true;
      if (state->sprite_coord_mode == PIPE_SPRITE_COORD_LOWER_LEFT)
         sf.PointSpriteTextureCoordinateOrigin = LOWERLEFT;
      else
         sf.PointSpriteTextureCoordinateOrigin = UPPERLEFT;
#endif

#if GFX_VER <= 7
      sf.FrontWinding = state->front_ccw ? 1 : 0; // Or the other way...

#if GFX_VER >= 6
      sf.GlobalDepthOffsetEnableSolid = state->offset_tri;
      sf.GlobalDepthOffsetEnableWireframe = state->offset_line;
      sf.GlobalDepthOffsetEnablePoint = state->offset_point;
      sf.GlobalDepthOffsetConstant = state->offset_units * 2;
      sf.GlobalDepthOffsetScale = state->offset_scale;
      sf.GlobalDepthOffsetClamp = state->offset_clamp;

      sf.FrontFaceFillMode = translate_fill_mode(state->fill_front);
      sf.BackFaceFillMode = translate_fill_mode(state->fill_back);
#endif

      sf.CullMode = translate_cull_mode(state->cull_face);
      sf.ScissorRectangleEnable = true;

#if GFX_VERx10 == 75
      sf.LineStippleEnable = state->line_stipple_enable;
#endif
#endif
   }
#endif

#if GFX_VER == 8
   /* On Gfx8, rasterization state moved from 3DSTATE_SF into
    * 3DSTATE_RASTER; pre-pack that as well.
    */
   crocus_pack_command(GENX(3DSTATE_RASTER), cso->raster, rr) {
      rr.FrontWinding = state->front_ccw ? CounterClockwise : Clockwise;
      rr.CullMode = translate_cull_mode(state->cull_face);
      rr.FrontFaceFillMode = translate_fill_mode(state->fill_front);
      rr.BackFaceFillMode = translate_fill_mode(state->fill_back);
      rr.DXMultisampleRasterizationEnable = state->multisample;
      rr.GlobalDepthOffsetEnableSolid = state->offset_tri;
      rr.GlobalDepthOffsetEnableWireframe = state->offset_line;
      rr.GlobalDepthOffsetEnablePoint = state->offset_point;
      rr.GlobalDepthOffsetConstant = state->offset_units * 2;
      rr.GlobalDepthOffsetScale = state->offset_scale;
      rr.GlobalDepthOffsetClamp = state->offset_clamp;
      rr.SmoothPointEnable = state->point_smooth;
      rr.AntialiasingEnable = state->line_smooth;
      rr.ScissorRectangleEnable = state->scissor;
      rr.ViewportZClipTestEnable = (state->depth_clip_near || state->depth_clip_far);
   }
#endif

#if GFX_VER >= 6
   crocus_pack_command(GENX(3DSTATE_CLIP), cso->clip, cl) {
      /* cl.NonPerspectiveBarycentricEnable is filled in at draw time from
       * the FS program; cl.ForceZeroRTAIndexEnable is filled in from the FB.
       */
#if GFX_VER >= 7
      cl.EarlyCullEnable = true;
#endif

#if GFX_VER == 7
      cl.FrontWinding = state->front_ccw ? 1 : 0;
      cl.CullMode = translate_cull_mode(state->cull_face);
#endif
      cl.UserClipDistanceClipTestEnableBitmask = state->clip_plane_enable;
#if GFX_VER < 8
      cl.ViewportZClipTestEnable = (state->depth_clip_near || state->depth_clip_far);
#endif
      cl.APIMode = state->clip_halfz ? APIMODE_D3D : APIMODE_OGL;
      cl.GuardbandClipTestEnable = true;
      cl.ClipEnable = true;
      cl.MinimumPointWidth = 0.125;
      cl.MaximumPointWidth = 255.875;

#if GFX_VER == 8
      cl.ForceUserClipDistanceClipTestEnableBitmask = true;
#endif

      if (state->flatshade_first) {
         cl.TriangleFanProvokingVertexSelect = 1;
      } else {
         cl.TriangleStripListProvokingVertexSelect = 2;
         cl.TriangleFanProvokingVertexSelect = 2;
         cl.LineStripListProvokingVertexSelect = 1;
      }
   }
#endif

   /* Remap from 0..255 back to 1..256 */
   const unsigned line_stipple_factor = state->line_stipple_factor + 1;

   crocus_pack_command(GENX(3DSTATE_LINE_STIPPLE), cso->line_stipple, line) {
      if (state->line_stipple_enable) {
         line.LineStipplePattern = state->line_stipple_pattern;
         line.LineStippleInverseRepeatCount = 1.0f / line_stipple_factor;
         line.LineStippleRepeatCount = line_stipple_factor;
      }
   }

   return cso;
}
2150
2151 /**
2152 * The pipe->bind_rasterizer_state() driver hook.
2153 *
2154 * Bind a rasterizer CSO and flag related dirty bits.
2155 */
static void
crocus_bind_rasterizer_state(struct pipe_context *ctx, void *state)
{
   struct crocus_context *ice = (struct crocus_context *) ctx;
   struct crocus_rasterizer_state *old_cso = ice->state.cso_rast;
   struct crocus_rasterizer_state *new_cso = state;

   if (new_cso) {
      /* Try to avoid re-emitting 3DSTATE_LINE_STIPPLE, it's non-pipelined */
      if (cso_changed_memcmp(line_stipple))
         ice->state.dirty |= CROCUS_DIRTY_LINE_STIPPLE;
      /* cso_changed()/cso_changed_memcmp() presumably compare the named
       * field between old_cso and new_cso — see macro definitions.
       */
#if GFX_VER >= 6
      if (cso_changed(cso.half_pixel_center))
         ice->state.dirty |= CROCUS_DIRTY_GEN6_MULTISAMPLE;
      if (cso_changed(cso.scissor))
         ice->state.dirty |= CROCUS_DIRTY_GEN6_SCISSOR_RECT;
      if (cso_changed(cso.multisample))
         ice->state.dirty |= CROCUS_DIRTY_WM;
#else
      if (cso_changed(cso.scissor))
         ice->state.dirty |= CROCUS_DIRTY_SF_CL_VIEWPORT;
#endif

      if (cso_changed(cso.line_stipple_enable) || cso_changed(cso.poly_stipple_enable))
         ice->state.dirty |= CROCUS_DIRTY_WM;

#if GFX_VER >= 6
      if (cso_changed(cso.rasterizer_discard))
         ice->state.dirty |= CROCUS_DIRTY_STREAMOUT | CROCUS_DIRTY_CLIP;

      if (cso_changed(cso.flatshade_first))
         ice->state.dirty |= CROCUS_DIRTY_STREAMOUT;
#endif

      if (cso_changed(cso.depth_clip_near) || cso_changed(cso.depth_clip_far) ||
          cso_changed(cso.clip_halfz))
         ice->state.dirty |= CROCUS_DIRTY_CC_VIEWPORT;

#if GFX_VER >= 7
      if (cso_changed(cso.sprite_coord_enable) ||
          cso_changed(cso.sprite_coord_mode) ||
          cso_changed(cso.light_twoside))
         ice->state.dirty |= CROCUS_DIRTY_GEN7_SBE;
#endif
#if GFX_VER <= 5
      if (cso_changed(cso.clip_plane_enable))
         ice->state.dirty |= CROCUS_DIRTY_GEN4_CURBE;
#endif
   }

   /* Always re-emit the core raster/clip packets, plus the Gfx4/5
    * fixed-function programs that depend on rasterizer state.
    */
   ice->state.cso_rast = new_cso;
   ice->state.dirty |= CROCUS_DIRTY_RASTER;
   ice->state.dirty |= CROCUS_DIRTY_CLIP;
#if GFX_VER <= 5
   ice->state.dirty |= CROCUS_DIRTY_GEN4_CLIP_PROG | CROCUS_DIRTY_GEN4_SF_PROG;
   ice->state.dirty |= CROCUS_DIRTY_WM;
#endif
#if GFX_VER <= 6
   ice->state.dirty |= CROCUS_DIRTY_GEN4_FF_GS_PROG;
#endif
   ice->state.stage_dirty |= ice->state.stage_dirty_for_nos[CROCUS_NOS_RASTERIZER];
}
2218
2219 /**
2220 * Return true if the given wrap mode requires the border color to exist.
2221 *
2222 * (We can skip uploading it if the sampler isn't going to use it.)
2223 */
2224 static bool
wrap_mode_needs_border_color(unsigned wrap_mode)2225 wrap_mode_needs_border_color(unsigned wrap_mode)
2226 {
2227 #if GFX_VER == 8
2228 return wrap_mode == TCM_CLAMP_BORDER || wrap_mode == TCM_HALF_BORDER;
2229 #else
2230 return wrap_mode == TCM_CLAMP_BORDER;
2231 #endif
2232 }
2233
2234 /**
2235 * Gallium CSO for sampler state.
2236 */
struct crocus_sampler_state {
   /* The full Gallium sampler CSO, stored verbatim at create time. */
   struct pipe_sampler_state pstate;
   /* Copy of pstate's border color (may be swizzled at upload time). */
   union pipe_color_union border_color;
   /* True if any wrap mode below samples the border color. */
   bool needs_border_color;
   /* Hardware TCM_* wrap modes from translate_wrap(). */
   unsigned wrap_s;
   unsigned wrap_t;
   unsigned wrap_r;
   /* Effective mag filter / min LOD (possibly overridden — see the
    * MIPFILTER_NONE workaround in crocus_create_sampler_state).
    */
   unsigned mag_img_filter;
   float min_lod;
};
2247
2248 /**
2249 * The pipe->create_sampler_state() driver hook.
2250 *
2251 * We fill out SAMPLER_STATE (except for the border color pointer), and
2252 * store that on the CPU. It doesn't make sense to upload it to a GPU
2253 * buffer object yet, because 3DSTATE_SAMPLER_STATE_POINTERS requires
2254 * all bound sampler states to be in contiguous memor.
2255 */
2256 static void *
crocus_create_sampler_state(struct pipe_context * ctx,const struct pipe_sampler_state * state)2257 crocus_create_sampler_state(struct pipe_context *ctx,
2258 const struct pipe_sampler_state *state)
2259 {
2260 struct crocus_sampler_state *cso = CALLOC_STRUCT(crocus_sampler_state);
2261
2262 if (!cso)
2263 return NULL;
2264
2265 STATIC_ASSERT(PIPE_TEX_FILTER_NEAREST == MAPFILTER_NEAREST);
2266 STATIC_ASSERT(PIPE_TEX_FILTER_LINEAR == MAPFILTER_LINEAR);
2267
2268 bool either_nearest = state->min_img_filter == PIPE_TEX_FILTER_NEAREST ||
2269 state->mag_img_filter == PIPE_TEX_FILTER_NEAREST;
2270 cso->wrap_s = translate_wrap(state->wrap_s, either_nearest);
2271 cso->wrap_t = translate_wrap(state->wrap_t, either_nearest);
2272 cso->wrap_r = translate_wrap(state->wrap_r, either_nearest);
2273
2274 cso->pstate = *state;
2275
2276 memcpy(&cso->border_color, &state->border_color, sizeof(cso->border_color));
2277
2278 cso->needs_border_color = wrap_mode_needs_border_color(cso->wrap_s) ||
2279 wrap_mode_needs_border_color(cso->wrap_t) ||
2280 wrap_mode_needs_border_color(cso->wrap_r);
2281
2282 cso->min_lod = state->min_lod;
2283 cso->mag_img_filter = state->mag_img_filter;
2284
2285 // XXX: explain this code ported from ilo...I don't get it at all...
2286 if (state->min_mip_filter == PIPE_TEX_MIPFILTER_NONE &&
2287 state->min_lod > 0.0f) {
2288 cso->min_lod = 0.0f;
2289 cso->mag_img_filter = state->min_img_filter;
2290 }
2291
2292 return cso;
2293 }
2294
2295 /**
2296 * The pipe->bind_sampler_states() driver hook.
2297 */
2298 static void
crocus_bind_sampler_states(struct pipe_context * ctx,enum pipe_shader_type p_stage,unsigned start,unsigned count,void ** states)2299 crocus_bind_sampler_states(struct pipe_context *ctx,
2300 enum pipe_shader_type p_stage,
2301 unsigned start, unsigned count,
2302 void **states)
2303 {
2304 struct crocus_context *ice = (struct crocus_context *) ctx;
2305 gl_shader_stage stage = stage_from_pipe(p_stage);
2306 struct crocus_shader_state *shs = &ice->state.shaders[stage];
2307
2308 assert(start + count <= CROCUS_MAX_TEXTURE_SAMPLERS);
2309
2310 bool dirty = false;
2311
2312 for (int i = 0; i < count; i++) {
2313 if (shs->samplers[start + i] != states[i]) {
2314 shs->samplers[start + i] = states[i];
2315 dirty = true;
2316 }
2317 }
2318
2319 if (dirty) {
2320 #if GFX_VER <= 5
2321 if (p_stage == PIPE_SHADER_FRAGMENT)
2322 ice->state.dirty |= CROCUS_DIRTY_WM;
2323 else if (p_stage == PIPE_SHADER_VERTEX)
2324 ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_VS;
2325 #endif
2326 ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_SAMPLER_STATES_VS << stage;
2327 ice->state.stage_dirty |= ice->state.stage_dirty_for_nos[CROCUS_NOS_TEXTURES];
2328 }
2329 }
2330
/* Wrap-mode overrides applied at sampler upload time — see the workaround
 * comments in crocus_upload_sampler_states().
 */
enum samp_workaround {
   SAMP_NORMAL,       /* no override; use the CSO's wrap modes */
   SAMP_CUBE_CLAMP,   /* force all three coords to TCM_CLAMP */
   SAMP_CUBE_CUBE,    /* force all three coords to TCM_CUBE */
   SAMP_T_WRAP,       /* force wrap_t to TCM_WRAP (1D texture bug) */
};
2337
/**
 * Pack one SAMPLER_STATE into @map, applying @samp_workaround wrap-mode
 * overrides on top of the CSO's translated wrap modes.  @border_color_offset
 * points at previously-uploaded border color state (0 if unused);
 * @first_level is only consumed on Gfx6 (BaseMipLevel).
 */
static void
crocus_upload_sampler_state(struct crocus_batch *batch,
                            struct crocus_sampler_state *cso,
                            uint32_t border_color_offset,
                            enum samp_workaround samp_workaround,
                            uint32_t first_level,
                            void *map)
{
   struct pipe_sampler_state *state = &cso->pstate;
   uint32_t wrap_s, wrap_t, wrap_r;

   wrap_s = cso->wrap_s;
   wrap_t = cso->wrap_t;
   wrap_r = cso->wrap_r;

   /* Apply per-texture-target overrides (see enum samp_workaround). */
   switch (samp_workaround) {
   case SAMP_CUBE_CLAMP:
      wrap_s = TCM_CLAMP;
      wrap_t = TCM_CLAMP;
      wrap_r = TCM_CLAMP;
      break;
   case SAMP_CUBE_CUBE:
      wrap_s = TCM_CUBE;
      wrap_t = TCM_CUBE;
      wrap_r = TCM_CUBE;
      break;
   case SAMP_T_WRAP:
      wrap_t = TCM_WRAP;
      break;
   default:
      break;
   }

   _crocus_pack_state(batch, GENX(SAMPLER_STATE), map, samp) {
      samp.TCXAddressControlMode = wrap_s;
      samp.TCYAddressControlMode = wrap_t;
      samp.TCZAddressControlMode = wrap_r;

#if GFX_VER >= 6
      samp.NonnormalizedCoordinateEnable = state->unnormalized_coords;
#endif
      samp.MinModeFilter = state->min_img_filter;
      samp.MagModeFilter = cso->mag_img_filter;
      samp.MipModeFilter = translate_mip_filter(state->min_mip_filter);
      samp.MaximumAnisotropy = RATIO21;

      /* Anisotropic filtering kicks in at max_anisotropy >= 2 and only
       * upgrades LINEAR filters.
       */
      if (state->max_anisotropy >= 2) {
         if (state->min_img_filter == PIPE_TEX_FILTER_LINEAR) {
            samp.MinModeFilter = MAPFILTER_ANISOTROPIC;
#if GFX_VER >= 7
            samp.AnisotropicAlgorithm = EWAApproximation;
#endif
         }

         if (state->mag_img_filter == PIPE_TEX_FILTER_LINEAR)
            samp.MagModeFilter = MAPFILTER_ANISOTROPIC;

         /* Map API ratio 2..16 onto the hardware RATIO* encoding. */
         samp.MaximumAnisotropy =
            MIN2((state->max_anisotropy - 2) / 2, RATIO161);
      }

      /* Set address rounding bits if not using nearest filtering. */
      if (state->min_img_filter != PIPE_TEX_FILTER_NEAREST) {
         samp.UAddressMinFilterRoundingEnable = true;
         samp.VAddressMinFilterRoundingEnable = true;
         samp.RAddressMinFilterRoundingEnable = true;
      }

      if (state->mag_img_filter != PIPE_TEX_FILTER_NEAREST) {
         samp.UAddressMagFilterRoundingEnable = true;
         samp.VAddressMagFilterRoundingEnable = true;
         samp.RAddressMagFilterRoundingEnable = true;
      }

      if (state->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE)
         samp.ShadowFunction = translate_shadow_func(state->compare_func);

      /* Hardware LOD clamp range differs by generation. */
      const float hw_max_lod = GFX_VER >= 7 ? 14 : 13;

#if GFX_VER == 8
      samp.LODPreClampMode = CLAMP_MODE_OGL;
#else
      samp.LODPreClampEnable = true;
#endif
      samp.MinLOD = CLAMP(cso->min_lod, 0, hw_max_lod);
      samp.MaxLOD = CLAMP(state->max_lod, 0, hw_max_lod);
      samp.TextureLODBias = CLAMP(state->lod_bias, -16, 15);

#if GFX_VER == 6
      samp.BaseMipLevel = CLAMP(first_level, 0, hw_max_lod);
      samp.MinandMagStateNotEqual = samp.MinModeFilter != samp.MagModeFilter;
#endif

#if GFX_VER < 6
      /* Pre-Gfx6 the border color pointer is a relocation into the
       * dynamic state buffer; later gens take a plain offset.
       */
      samp.BorderColorPointer =
         ro_bo(batch->state.bo, border_color_offset);
#else
      samp.BorderColorPointer = border_color_offset;
#endif
   }
}
2439
/**
 * Stream a SAMPLER_BORDER_COLOR_STATE into the batch's state buffer and
 * return its offset via @bc_offset.  The encoding of the border color is
 * highly generation-dependent (see the #if ladder below).  @tex may be
 * NULL, in which case no format-faking swizzle is applied.
 */
static void
crocus_upload_border_color(struct crocus_batch *batch,
                           struct crocus_sampler_state *cso,
                           struct crocus_sampler_view *tex,
                           uint32_t *bc_offset)
{
   /* We may need to swizzle the border color for format faking.
    * A/LA formats are faked as R/RG with 000R or R00G swizzles.
    * This means we need to move the border color's A channel into
    * the R or G channels so that those read swizzles will move it
    * back into A.
    */
   enum pipe_format internal_format = PIPE_FORMAT_NONE;
   union pipe_color_union *color = &cso->border_color;
   union pipe_color_union tmp;
   if (tex) {
      internal_format = tex->res->internal_format;

      if (util_format_is_alpha(internal_format)) {
         unsigned char swz[4] = {
            PIPE_SWIZZLE_0, PIPE_SWIZZLE_0,
            PIPE_SWIZZLE_0, PIPE_SWIZZLE_W,
         };
         util_format_apply_color_swizzle(&tmp, color, swz, true);
         color = &tmp;
      } else if (util_format_is_luminance_alpha(internal_format) &&
                 internal_format != PIPE_FORMAT_L8A8_SRGB) {
         unsigned char swz[4] = {
            PIPE_SWIZZLE_X, PIPE_SWIZZLE_X,
            PIPE_SWIZZLE_X, PIPE_SWIZZLE_W
         };
         util_format_apply_color_swizzle(&tmp, color, swz, true);
         color = &tmp;
      }
   }
   bool is_integer_format = util_format_is_pure_integer(internal_format);
   unsigned sbc_size = GENX(SAMPLER_BORDER_COLOR_STATE_length) * 4;
   /* Haswell integer border colors need 512-byte alignment; Gfx8 needs 64. */
   const int sbc_align = (GFX_VER == 8 ? 64 : ((GFX_VERx10 == 75 && is_integer_format) ? 512 : 32));
   uint32_t *sbc = stream_state(batch, sbc_size, sbc_align, bc_offset);

   struct GENX(SAMPLER_BORDER_COLOR_STATE) state = { 0 };

/* Helper macros: assign one channel with an optional narrowing cast. */
#define ASSIGN(dst, src) \
   do {                  \
      dst = src;         \
   } while (0)

#define ASSIGNu16(dst, src) \
   do {                     \
      dst = (uint16_t)src;  \
   } while (0)

#define ASSIGNu8(dst, src) \
   do {                    \
      dst = (uint8_t)src;  \
   } while (0)

/* Apply `macro` to all four channels of the named BorderColor field set. */
#define BORDER_COLOR_ATTR(macro, _color_type, src)           \
   macro(state.BorderColor ## _color_type ## Red, src[0]);   \
   macro(state.BorderColor ## _color_type ## Green, src[1]); \
   macro(state.BorderColor ## _color_type ## Blue, src[2]);  \
   macro(state.BorderColor ## _color_type ## Alpha, src[3]);

#if GFX_VER >= 8
   /* On Broadwell, the border color is represented as four 32-bit floats,
    * integers, or unsigned values, interpreted according to the surface
    * format.  This matches the sampler->BorderColor union exactly; just
    * memcpy the values.
    */
   BORDER_COLOR_ATTR(ASSIGN, 32bit, color->ui);
#elif GFX_VERx10 == 75
   if (is_integer_format) {
      const struct util_format_description *format_desc =
         util_format_description(internal_format);

      /* From the Haswell PRM, "Command Reference: Structures", Page 36:
       * "If any color channel is missing from the surface format,
       * corresponding border color should be programmed as zero and if
       * alpha channel is missing, corresponding Alpha border color should
       * be programmed as 1."
       */
      unsigned c[4] = { 0, 0, 0, 1 };
      for (int i = 0; i < 4; i++) {
         if (format_desc->channel[i].size)
            c[i] = color->ui[i];
      }

      switch (format_desc->channel[0].size) {
      case 8:
         /* Copy RGBA in order. */
         BORDER_COLOR_ATTR(ASSIGNu8, 8bit, c);
         break;
      case 10:
         /* R10G10B10A2_UINT is treated like a 16-bit format. */
      case 16:
         BORDER_COLOR_ATTR(ASSIGNu16, 16bit, c);
         break;
      case 32:
         if (format_desc->channel[1].size && !format_desc->channel[2].size) {
            /* Careful inspection of the tables reveals that for RG32 formats,
             * the green channel needs to go where blue normally belongs.
             */
            state.BorderColor32bitRed = c[0];
            state.BorderColor32bitBlue = c[1];
            state.BorderColor32bitAlpha = 1;
         } else {
            /* Copy RGBA in order. */
            BORDER_COLOR_ATTR(ASSIGN, 32bit, c);
         }
         break;
      default:
         assert(!"Invalid number of bits per channel in integer format.");
         break;
      }
   } else {
      BORDER_COLOR_ATTR(ASSIGN, Float, color->f);
   }
#elif GFX_VER == 5 || GFX_VER == 6
   /* Gfx5/6 store pre-converted copies in every representation the
    * sampler might need: unorm8/16, snorm16, half-float, and float.
    */
   BORDER_COLOR_ATTR(UNCLAMPED_FLOAT_TO_UBYTE, Unorm, color->f);
   BORDER_COLOR_ATTR(UNCLAMPED_FLOAT_TO_USHORT, Unorm16, color->f);
   BORDER_COLOR_ATTR(UNCLAMPED_FLOAT_TO_SHORT, Snorm16, color->f);

#define MESA_FLOAT_TO_HALF(dst, src) \
   dst = _mesa_float_to_half(src);

   BORDER_COLOR_ATTR(MESA_FLOAT_TO_HALF, Float16, color->f);

#undef MESA_FLOAT_TO_HALF

   /* snorm8 is derived from snorm16 by dropping the low byte. */
   state.BorderColorSnorm8Red   = state.BorderColorSnorm16Red >> 8;
   state.BorderColorSnorm8Green = state.BorderColorSnorm16Green >> 8;
   state.BorderColorSnorm8Blue  = state.BorderColorSnorm16Blue >> 8;
   state.BorderColorSnorm8Alpha = state.BorderColorSnorm16Alpha >> 8;

   BORDER_COLOR_ATTR(ASSIGN, Float, color->f);

#elif GFX_VER == 4
   BORDER_COLOR_ATTR(ASSIGN, , color->f);
#else
   BORDER_COLOR_ATTR(ASSIGN, Float, color->f);
#endif

#undef ASSIGN
#undef BORDER_COLOR_ATTR

   GENX(SAMPLER_BORDER_COLOR_STATE_pack)(batch, sbc, &state);
}
2587
2588 /**
 * Upload the sampler states into a contiguous area of GPU memory, for
 * 3DSTATE_SAMPLER_STATE_POINTERS_*.
2591 *
2592 * Also fill out the border color state pointers.
2593 */
2594 static void
crocus_upload_sampler_states(struct crocus_context * ice,struct crocus_batch * batch,gl_shader_stage stage)2595 crocus_upload_sampler_states(struct crocus_context *ice,
2596 struct crocus_batch *batch, gl_shader_stage stage)
2597 {
2598 struct crocus_shader_state *shs = &ice->state.shaders[stage];
2599 const struct shader_info *info = crocus_get_shader_info(ice, stage);
2600
2601 /* We assume the state tracker will call pipe->bind_sampler_states()
2602 * if the program's number of textures changes.
2603 */
2604 unsigned count = info ? BITSET_LAST_BIT(info->textures_used) : 0;
2605
2606 if (!count)
2607 return;
2608
2609 /* Assemble the SAMPLER_STATEs into a contiguous table that lives
2610 * in the dynamic state memory zone, so we can point to it via the
2611 * 3DSTATE_SAMPLER_STATE_POINTERS_* commands.
2612 */
2613 unsigned size = count * 4 * GENX(SAMPLER_STATE_length);
2614 uint32_t *map = stream_state(batch, size, 32, &shs->sampler_offset);
2615
2616 if (unlikely(!map))
2617 return;
2618
2619 for (int i = 0; i < count; i++) {
2620 struct crocus_sampler_state *state = shs->samplers[i];
2621 struct crocus_sampler_view *tex = shs->textures[i];
2622
2623 if (!state || !tex) {
2624 memset(map, 0, 4 * GENX(SAMPLER_STATE_length));
2625 } else {
2626 unsigned border_color_offset = 0;
2627 if (state->needs_border_color) {
2628 crocus_upload_border_color(batch, state, tex, &border_color_offset);
2629 }
2630
2631 enum samp_workaround wa = SAMP_NORMAL;
2632 /* There's a bug in 1D texture sampling - it actually pays
2633 * attention to the wrap_t value, though it should not.
2634 * Override the wrap_t value here to GL_REPEAT to keep
2635 * any nonexistent border pixels from floating in.
2636 */
2637 if (tex->base.target == PIPE_TEXTURE_1D)
2638 wa = SAMP_T_WRAP;
2639 else if (tex->base.target == PIPE_TEXTURE_CUBE ||
2640 tex->base.target == PIPE_TEXTURE_CUBE_ARRAY) {
2641 /* Cube maps must use the same wrap mode for all three coordinate
2642 * dimensions. Prior to Haswell, only CUBE and CLAMP are valid.
2643 *
2644 * Ivybridge and Baytrail seem to have problems with CUBE mode and
2645 * integer formats. Fall back to CLAMP for now.
2646 */
2647 if (state->pstate.seamless_cube_map &&
2648 !(GFX_VERx10 == 70 && util_format_is_pure_integer(tex->base.format)))
2649 wa = SAMP_CUBE_CUBE;
2650 else
2651 wa = SAMP_CUBE_CLAMP;
2652 }
2653
2654 uint32_t first_level = 0;
2655 if (tex->base.target != PIPE_BUFFER)
2656 first_level = tex->base.u.tex.first_level;
2657
2658 crocus_upload_sampler_state(batch, state, border_color_offset, wa, first_level, map);
2659 }
2660
2661 map += GENX(SAMPLER_STATE_length);
2662 }
2663 }
2664
/**
 * The pipe->create_sampler_view() driver hook.
 *
 * Allocates a crocus_sampler_view wrapping @tex, resolves the view format
 * and swizzle (including per-generation workarounds), and fills out the
 * isl_view (plus a separate gather view on gfx6+).  Returns NULL on
 * allocation failure or if @tmpl is invalid for this resource.
 */
static struct pipe_sampler_view *
crocus_create_sampler_view(struct pipe_context *ctx,
                           struct pipe_resource *tex,
                           const struct pipe_sampler_view *tmpl)
{
   struct crocus_screen *screen = (struct crocus_screen *)ctx->screen;
   const struct intel_device_info *devinfo = &screen->devinfo;
   struct crocus_sampler_view *isv = calloc(1, sizeof(struct crocus_sampler_view));

   if (!isv)
      return NULL;

   /* initialize base object */
   isv->base = *tmpl;
   isv->base.context = ctx;
   isv->base.texture = NULL;
   pipe_reference_init(&isv->base.reference, 1);
   pipe_resource_reference(&isv->base.texture, tex);

   if (util_format_is_depth_or_stencil(tmpl->format)) {
      /* Depth and stencil live in separate sub-resources here; retarget
       * the view at whichever one the view format selects.
       */
      struct crocus_resource *zres, *sres;
      const struct util_format_description *desc =
         util_format_description(tmpl->format);

      crocus_get_depth_stencil_resources(devinfo, tex, &zres, &sres);

      tex = util_format_has_depth(desc) ? &zres->base.b : &sres->base.b;

      /* On gfx7, sample stencil from its shadow copy when one exists. */
      if (tex->format == PIPE_FORMAT_S8_UINT)
         if (GFX_VER == 7 && sres->shadow)
            tex = &sres->shadow->base.b;
   }

   isv->res = (struct crocus_resource *) tex;

   isl_surf_usage_flags_t usage = ISL_SURF_USAGE_TEXTURE_BIT;

   if (isv->base.target == PIPE_TEXTURE_CUBE ||
       isv->base.target == PIPE_TEXTURE_CUBE_ARRAY)
      usage |= ISL_SURF_USAGE_CUBE_BIT;

   const struct crocus_format_info fmt =
      crocus_format_for_usage(devinfo, tmpl->format, usage);

   /* Compose the caller's swizzle with the format's workaround swizzle. */
   enum pipe_swizzle vswz[4] = { tmpl->swizzle_r, tmpl->swizzle_g, tmpl->swizzle_b, tmpl->swizzle_a };
   crocus_combine_swizzle(isv->swizzle, fmt.swizzles, vswz);

   /* hardcode stencil swizzles - hw returns 0G01, we want GGGG */
   if (GFX_VER < 6 &&
       (tmpl->format == PIPE_FORMAT_X32_S8X24_UINT ||
        tmpl->format == PIPE_FORMAT_X24S8_UINT)) {
      isv->swizzle[0] = tmpl->swizzle_g;
      isv->swizzle[1] = tmpl->swizzle_g;
      isv->swizzle[2] = tmpl->swizzle_g;
      isv->swizzle[3] = tmpl->swizzle_g;
   }

   isv->clear_color = isv->res->aux.clear_color;

   isv->view = (struct isl_view) {
      .format = fmt.fmt,
#if GFX_VERx10 >= 75
      .swizzle = (struct isl_swizzle) {
         .r = pipe_to_isl_swizzle(isv->swizzle[0], false),
         .g = pipe_to_isl_swizzle(isv->swizzle[1], false),
         .b = pipe_to_isl_swizzle(isv->swizzle[2], false),
         .a = pipe_to_isl_swizzle(isv->swizzle[3], false),
      },
#else
      /* swizzling handled in shader code */
      .swizzle = ISL_SWIZZLE_IDENTITY,
#endif
      .usage = usage,
   };

   /* Fill out SURFACE_STATE for this view. */
   if (tmpl->target != PIPE_BUFFER) {
      isv->view.base_level = tmpl->u.tex.first_level;
      isv->view.levels = tmpl->u.tex.last_level - tmpl->u.tex.first_level + 1;

      /* Hardware older than skylake ignores this value */
      assert(tex->target != PIPE_TEXTURE_3D || !tmpl->u.tex.first_layer);

      // XXX: do I need to port f9fd0cf4790cb2a530e75d1a2206dbb9d8af7cb2?
      isv->view.base_array_layer = tmpl->u.tex.first_layer;
      isv->view.array_len =
         tmpl->u.tex.last_layer - tmpl->u.tex.first_layer + 1;
   }
#if GFX_VER >= 6
   /* just create a second view struct for texture gather just in case */
   isv->gather_view = isv->view;

#if GFX_VER == 7
   /* For RG32 formats, the gather view uses the R32G32_FLOAT_LD variant. */
   if (fmt.fmt == ISL_FORMAT_R32G32_FLOAT ||
       fmt.fmt == ISL_FORMAT_R32G32_SINT ||
       fmt.fmt == ISL_FORMAT_R32G32_UINT) {
      isv->gather_view.format = ISL_FORMAT_R32G32_FLOAT_LD;
#if GFX_VERx10 >= 75
      isv->gather_view.swizzle = (struct isl_swizzle) {
         .r = pipe_to_isl_swizzle(isv->swizzle[0], GFX_VERx10 == 75),
         .g = pipe_to_isl_swizzle(isv->swizzle[1], GFX_VERx10 == 75),
         .b = pipe_to_isl_swizzle(isv->swizzle[2], GFX_VERx10 == 75),
         .a = pipe_to_isl_swizzle(isv->swizzle[3], GFX_VERx10 == 75),
      };
#endif
   }
#endif
#if GFX_VER == 6
   /* Sandybridge's gather4 message is broken for integer formats.
    * To work around this, we pretend the surface is UNORM for
    * 8 or 16-bit formats, and emit shader instructions to recover
    * the real INT/UINT value.  For 32-bit formats, we pretend
    * the surface is FLOAT, and simply reinterpret the resulting
    * bits.
    */
   switch (fmt.fmt) {
   case ISL_FORMAT_R8_SINT:
   case ISL_FORMAT_R8_UINT:
      isv->gather_view.format = ISL_FORMAT_R8_UNORM;
      break;

   case ISL_FORMAT_R16_SINT:
   case ISL_FORMAT_R16_UINT:
      isv->gather_view.format = ISL_FORMAT_R16_UNORM;
      break;

   case ISL_FORMAT_R32_SINT:
   case ISL_FORMAT_R32_UINT:
      isv->gather_view.format = ISL_FORMAT_R32_FLOAT;
      break;

   default:
      break;
   }
#endif
#endif

   return &isv->base;
}
2807
2808 static void
crocus_sampler_view_destroy(struct pipe_context * ctx,struct pipe_sampler_view * state)2809 crocus_sampler_view_destroy(struct pipe_context *ctx,
2810 struct pipe_sampler_view *state)
2811 {
2812 struct crocus_sampler_view *isv = (void *) state;
2813 pipe_resource_reference(&state->texture, NULL);
2814 free(isv);
2815 }
2816
/**
 * The pipe->create_surface() driver hook.
 *
 * In Gallium nomenclature, "surfaces" are a view of a resource that
 * can be bound as a render target or depth/stencil buffer.
 *
 * Returns NULL on allocation failure, on an unrenderable render-target
 * format, or for the (currently unsupported) compressed-upload path.
 */
static struct pipe_surface *
crocus_create_surface(struct pipe_context *ctx,
                      struct pipe_resource *tex,
                      const struct pipe_surface *tmpl)
{
   struct crocus_screen *screen = (struct crocus_screen *)ctx->screen;
   const struct intel_device_info *devinfo = &screen->devinfo;

   /* Derive ISL usage from the template: storage, depth, or render target. */
   isl_surf_usage_flags_t usage = 0;
   if (tmpl->writable)
      usage = ISL_SURF_USAGE_STORAGE_BIT;
   else if (util_format_is_depth_or_stencil(tmpl->format))
      usage = ISL_SURF_USAGE_DEPTH_BIT;
   else
      usage = ISL_SURF_USAGE_RENDER_TARGET_BIT;

   const struct crocus_format_info fmt =
      crocus_format_for_usage(devinfo, tmpl->format, usage);

   if ((usage & ISL_SURF_USAGE_RENDER_TARGET_BIT) &&
       !isl_format_supports_rendering(devinfo, fmt.fmt)) {
      /* Framebuffer validation will reject this invalid case, but it
       * hasn't had the opportunity yet.  In the meantime, we need to
       * avoid hitting ISL asserts about unsupported formats below.
       */
      return NULL;
   }

   struct crocus_surface *surf = calloc(1, sizeof(struct crocus_surface));
   struct pipe_surface *psurf = &surf->base;
   struct crocus_resource *res = (struct crocus_resource *) tex;

   if (!surf)
      return NULL;

   /* Fill out the Gallium-facing base surface fields. */
   pipe_reference_init(&psurf->reference, 1);
   pipe_resource_reference(&psurf->texture, tex);
   psurf->context = ctx;
   psurf->format = tmpl->format;
   psurf->width = tex->width0;
   psurf->height = tex->height0;
   psurf->u.tex.first_layer = tmpl->u.tex.first_layer;
   psurf->u.tex.last_layer = tmpl->u.tex.last_layer;
   psurf->u.tex.level = tmpl->u.tex.level;

   uint32_t array_len = tmpl->u.tex.last_layer - tmpl->u.tex.first_layer + 1;

   struct isl_view *view = &surf->view;
   *view = (struct isl_view) {
      .format = fmt.fmt,
      .base_level = tmpl->u.tex.level,
      .levels = 1,
      .base_array_layer = tmpl->u.tex.first_layer,
      .array_len = array_len,
      .swizzle = ISL_SWIZZLE_IDENTITY,
      .usage = usage,
   };

#if GFX_VER >= 6
   /* Also build a texture-usage view of the same subresource, used when
    * the surface needs to be read back as a texture.
    */
   struct isl_view *read_view = &surf->read_view;
   *read_view = (struct isl_view) {
      .format = fmt.fmt,
      .base_level = tmpl->u.tex.level,
      .levels = 1,
      .base_array_layer = tmpl->u.tex.first_layer,
      .array_len = array_len,
      .swizzle = ISL_SWIZZLE_IDENTITY,
      .usage = ISL_SURF_USAGE_TEXTURE_BIT,
   };
#endif

   surf->clear_color = res->aux.clear_color;

   /* Bail early for depth/stencil - we don't want SURFACE_STATE for them. */
   if (res->surf.usage & (ISL_SURF_USAGE_DEPTH_BIT |
                          ISL_SURF_USAGE_STENCIL_BIT))
      return psurf;

   if (!isl_format_is_compressed(res->surf.format)) {
      /* Common case: view format matches the resource's compression class;
       * just reuse the resource's ISL surface.
       */
      memcpy(&surf->surf, &res->surf, sizeof(surf->surf));
      uint64_t temp_offset;
      uint32_t temp_x, temp_y;

      isl_surf_get_image_offset_B_tile_sa(&res->surf, tmpl->u.tex.level,
                                          res->base.b.target == PIPE_TEXTURE_3D ? 0 : tmpl->u.tex.first_layer,
                                          res->base.b.target == PIPE_TEXTURE_3D ? tmpl->u.tex.first_layer : 0,
                                          &temp_offset, &temp_x, &temp_y);
      if (!devinfo->has_surface_tile_offset &&
          (temp_x || temp_y)) {
         /* Original gfx4 hardware couldn't draw to a non-tile-aligned
          * destination.
          */
         /* move to temp */
         struct pipe_resource wa_templ = (struct pipe_resource) {
            .width0 = u_minify(res->base.b.width0, tmpl->u.tex.level),
            .height0 = u_minify(res->base.b.height0, tmpl->u.tex.level),
            .depth0 = 1,
            .array_size = 1,
            .format = res->base.b.format,
            .target = PIPE_TEXTURE_2D,
            .bind = (usage & ISL_SURF_USAGE_DEPTH_BIT ? PIPE_BIND_DEPTH_STENCIL : PIPE_BIND_RENDER_TARGET) | PIPE_BIND_SAMPLER_VIEW,
         };
         /* Render to a single-level temporary; the view then targets it. */
         surf->align_res = screen->base.resource_create(&screen->base, &wa_templ);
         view->base_level = 0;
         view->base_array_layer = 0;
         view->array_len = 1;
         struct crocus_resource *align_res = (struct crocus_resource *)surf->align_res;
         memcpy(&surf->surf, &align_res->surf, sizeof(surf->surf));
      }
      return psurf;
   }

   /* The resource has a compressed format, which is not renderable, but we
    * have a renderable view format.  We must be attempting to upload blocks
    * of compressed data via an uncompressed view.
    *
    * In this case, we can assume there are no auxiliary buffers, a single
    * miplevel, and that the resource is single-sampled.  Gallium may try
    * and create an uncompressed view with multiple layers, however.
    */
   assert(!isl_format_is_compressed(fmt.fmt));
   assert(res->surf.samples == 1);
   assert(view->levels == 1);

   /* TODO: compressed pbo uploads aren't working here */
   pipe_surface_reference(&psurf, NULL);
   return NULL;

   /* NOTE: everything below is intentionally unreachable until the
    * compressed-pbo path above is fixed; it is kept for that purpose.
    */
   uint64_t offset_B = 0;
   uint32_t tile_x_sa = 0, tile_y_sa = 0;

   if (view->base_level > 0) {
      /* We can't rely on the hardware's miplevel selection with such
       * a substantial lie about the format, so we select a single image
       * using the Tile X/Y Offset fields.  In this case, we can't handle
       * multiple array slices.
       *
       * On Broadwell, HALIGN and VALIGN are specified in pixels and are
       * hard-coded to align to exactly the block size of the compressed
       * texture.  This means that, when reinterpreted as a non-compressed
       * texture, the tile offsets may be anything and we can't rely on
       * X/Y Offset.
       *
       * Return NULL to force the state tracker to take fallback paths.
       */
      // TODO: check if the gen7 check is right, originally gen8
      if (view->array_len > 1 || GFX_VER == 7) {
         pipe_surface_reference(&psurf, NULL);
         return NULL;
      }

      const bool is_3d = res->surf.dim == ISL_SURF_DIM_3D;
      isl_surf_get_image_surf(&screen->isl_dev, &res->surf,
                              view->base_level,
                              is_3d ? 0 : view->base_array_layer,
                              is_3d ? view->base_array_layer : 0,
                              &surf->surf,
                              &offset_B, &tile_x_sa, &tile_y_sa);

      /* We use address and tile offsets to access a single level/layer
       * as a subimage, so reset level/layer so it doesn't offset again.
       */
      view->base_array_layer = 0;
      view->base_level = 0;
   } else {
      /* Level 0 doesn't require tile offsets, and the hardware can find
       * array slices using QPitch even with the format override, so we
       * can allow layers in this case.  Copy the original ISL surface.
       */
      memcpy(&surf->surf, &res->surf, sizeof(surf->surf));
   }

   /* Scale down the image dimensions by the block size. */
   const struct isl_format_layout *fmtl =
      isl_format_get_layout(res->surf.format);
   surf->surf.format = fmt.fmt;
   surf->surf.logical_level0_px = isl_surf_get_logical_level0_el(&surf->surf);
   surf->surf.phys_level0_sa = isl_surf_get_phys_level0_el(&surf->surf);
   tile_x_sa /= fmtl->bw;
   tile_y_sa /= fmtl->bh;

   psurf->width = surf->surf.logical_level0_px.width;
   psurf->height = surf->surf.logical_level0_px.height;

   return psurf;
}
3009
3010 #if GFX_VER >= 7
3011 static void
fill_default_image_param(struct isl_image_param * param)3012 fill_default_image_param(struct isl_image_param *param)
3013 {
3014 memset(param, 0, sizeof(*param));
3015 /* Set the swizzling shifts to all-ones to effectively disable swizzling --
3016 * See emit_address_calculation() in elk_fs_surface_builder.cpp for a more
3017 * detailed explanation of these parameters.
3018 */
3019 param->swizzling[0] = 0xff;
3020 param->swizzling[1] = 0xff;
3021 }
3022
3023 static void
fill_buffer_image_param(struct isl_image_param * param,enum pipe_format pfmt,unsigned size)3024 fill_buffer_image_param(struct isl_image_param *param,
3025 enum pipe_format pfmt,
3026 unsigned size)
3027 {
3028 const unsigned cpp = util_format_get_blocksize(pfmt);
3029
3030 fill_default_image_param(param);
3031 param->size[0] = size / cpp;
3032 param->stride[0] = cpp;
3033 }
3034
3035 #endif
3036
/**
 * The pipe->set_shader_images() driver hook.
 *
 * Binds @count image views starting at @start_slot for the given stage,
 * filling in both the isl_view used for SURFACE_STATE and the
 * isl_image_param block delivered to the shader as system values.
 * Only compiled in for GFX_VER >= 7.
 */
static void
crocus_set_shader_images(struct pipe_context *ctx,
                         enum pipe_shader_type p_stage,
                         unsigned start_slot, unsigned count,
                         unsigned unbind_num_trailing_slots,
                         const struct pipe_image_view *p_images)
{
#if GFX_VER >= 7
   struct crocus_context *ice = (struct crocus_context *) ctx;
   struct crocus_screen *screen = (struct crocus_screen *)ctx->screen;
   const struct intel_device_info *devinfo = &screen->devinfo;
   gl_shader_stage stage = stage_from_pipe(p_stage);
   struct crocus_shader_state *shs = &ice->state.shaders[stage];
   struct crocus_genx_state *genx = ice->state.genx;
   struct isl_image_param *image_params = genx->shaders[stage].image_param;

   /* Clear the incoming slots' bound bits; re-set below for live views. */
   shs->bound_image_views &= ~u_bit_consecutive(start_slot, count);

   for (unsigned i = 0; i < count; i++) {
      struct crocus_image_view *iv = &shs->image[start_slot + i];

      if (p_images && p_images[i].resource) {
         const struct pipe_image_view *img = &p_images[i];
         struct crocus_resource *res = (void *) img->resource;

         util_copy_image_view(&iv->base, img);

         shs->bound_image_views |= 1 << (start_slot + i);

         /* Record how this resource is used so later flushes/resolves know. */
         res->bind_history |= PIPE_BIND_SHADER_IMAGE;
         res->bind_stages |= 1 << stage;

         isl_surf_usage_flags_t usage = ISL_SURF_USAGE_STORAGE_BIT;
         struct crocus_format_info fmt =
            crocus_format_for_usage(devinfo, img->format, usage);

         struct isl_swizzle swiz = pipe_to_isl_swizzles(fmt.swizzles);
         if (img->shader_access & PIPE_IMAGE_ACCESS_READ) {
            /* For readable images, try typed surface reads (which support
             * only a limited set of formats); if there is no matching typed
             * format, fall back to untyped (RAW) reads.
             */
            if (!isl_has_matching_typed_storage_image_format(devinfo, fmt.fmt))
               fmt.fmt = ISL_FORMAT_RAW;
            else
               fmt.fmt = isl_lower_storage_image_format(devinfo, fmt.fmt);
         }

         if (res->base.b.target != PIPE_BUFFER) {
            /* Texture image: build a single-level view of the bound layers. */
            struct isl_view view = {
               .format = fmt.fmt,
               .base_level = img->u.tex.level,
               .levels = 1,
               .base_array_layer = img->u.tex.first_layer,
               .array_len = img->u.tex.last_layer - img->u.tex.first_layer + 1,
               .swizzle = swiz,
               .usage = usage,
            };

            iv->view = view;

            isl_surf_fill_image_param(&screen->isl_dev,
                                      &image_params[start_slot + i],
                                      &res->surf, &view);
         } else {
            /* Buffer image: no miplevels or layers, just format/swizzle. */
            struct isl_view view = {
               .format = fmt.fmt,
               .swizzle = swiz,
               .usage = usage,
            };
            iv->view = view;

            /* The bound byte range may now be written by the shader. */
            util_range_add(&res->base.b, &res->valid_buffer_range, img->u.buf.offset,
                           img->u.buf.offset + img->u.buf.size);
            fill_buffer_image_param(&image_params[start_slot + i],
                                    img->format, img->u.buf.size);
         }
      } else {
         /* Unbind: drop the reference and reset params to defaults. */
         pipe_resource_reference(&iv->base.resource, NULL);
         fill_default_image_param(&image_params[start_slot + i]);
      }
   }

   ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_VS << stage;
   ice->state.dirty |=
      stage == MESA_SHADER_COMPUTE ? CROCUS_DIRTY_COMPUTE_RESOLVES_AND_FLUSHES
                                   : CROCUS_DIRTY_RENDER_RESOLVES_AND_FLUSHES;

   /* The image params are delivered via constants, so re-upload those too. */
   ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_VS << stage;
   shs->sysvals_need_upload = true;
#endif
}
3133
3134
3135 /**
3136 * The pipe->set_sampler_views() driver hook.
3137 */
3138 static void
crocus_set_sampler_views(struct pipe_context * ctx,enum pipe_shader_type p_stage,unsigned start,unsigned count,unsigned unbind_num_trailing_slots,bool take_ownership,struct pipe_sampler_view ** views)3139 crocus_set_sampler_views(struct pipe_context *ctx,
3140 enum pipe_shader_type p_stage,
3141 unsigned start, unsigned count,
3142 unsigned unbind_num_trailing_slots,
3143 bool take_ownership,
3144 struct pipe_sampler_view **views)
3145 {
3146 struct crocus_context *ice = (struct crocus_context *) ctx;
3147 gl_shader_stage stage = stage_from_pipe(p_stage);
3148 struct crocus_shader_state *shs = &ice->state.shaders[stage];
3149
3150 shs->bound_sampler_views &= ~u_bit_consecutive(start, count);
3151
3152 for (unsigned i = 0; i < count; i++) {
3153 struct pipe_sampler_view *pview = views ? views[i] : NULL;
3154
3155 if (take_ownership) {
3156 pipe_sampler_view_reference((struct pipe_sampler_view **)
3157 &shs->textures[start + i], NULL);
3158 shs->textures[start + i] = (struct crocus_sampler_view *)pview;
3159 } else {
3160 pipe_sampler_view_reference((struct pipe_sampler_view **)
3161 &shs->textures[start + i], pview);
3162 }
3163
3164 struct crocus_sampler_view *view = (void *) pview;
3165 if (view) {
3166 view->res->bind_history |= PIPE_BIND_SAMPLER_VIEW;
3167 view->res->bind_stages |= 1 << stage;
3168
3169 shs->bound_sampler_views |= 1 << (start + i);
3170 }
3171 }
3172 #if GFX_VER == 6
3173 /* first level parameters to crocus_upload_sampler_state is gfx6 only */
3174 ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_SAMPLER_STATES_VS << stage;
3175 #endif
3176 ice->state.stage_dirty |= (CROCUS_STAGE_DIRTY_BINDINGS_VS << stage);
3177 ice->state.dirty |=
3178 stage == MESA_SHADER_COMPUTE ? CROCUS_DIRTY_COMPUTE_RESOLVES_AND_FLUSHES
3179 : CROCUS_DIRTY_RENDER_RESOLVES_AND_FLUSHES;
3180 ice->state.stage_dirty |= ice->state.stage_dirty_for_nos[CROCUS_NOS_TEXTURES];
3181 }
3182
3183 /**
3184 * The pipe->set_tess_state() driver hook.
3185 */
3186 static void
crocus_set_tess_state(struct pipe_context * ctx,const float default_outer_level[4],const float default_inner_level[2])3187 crocus_set_tess_state(struct pipe_context *ctx,
3188 const float default_outer_level[4],
3189 const float default_inner_level[2])
3190 {
3191 struct crocus_context *ice = (struct crocus_context *) ctx;
3192 struct crocus_shader_state *shs = &ice->state.shaders[MESA_SHADER_TESS_CTRL];
3193
3194 memcpy(&ice->state.default_outer_level[0], &default_outer_level[0], 4 * sizeof(float));
3195 memcpy(&ice->state.default_inner_level[0], &default_inner_level[0], 2 * sizeof(float));
3196
3197 ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_TCS;
3198 shs->sysvals_need_upload = true;
3199 }
3200
3201 static void
crocus_set_patch_vertices(struct pipe_context * ctx,uint8_t patch_vertices)3202 crocus_set_patch_vertices(struct pipe_context *ctx, uint8_t patch_vertices)
3203 {
3204 struct crocus_context *ice = (struct crocus_context *) ctx;
3205
3206 ice->state.patch_vertices = patch_vertices;
3207 }
3208
3209 static void
crocus_surface_destroy(struct pipe_context * ctx,struct pipe_surface * p_surf)3210 crocus_surface_destroy(struct pipe_context *ctx, struct pipe_surface *p_surf)
3211 {
3212 struct crocus_surface *surf = (void *) p_surf;
3213 pipe_resource_reference(&p_surf->texture, NULL);
3214
3215 pipe_resource_reference(&surf->align_res, NULL);
3216 free(surf);
3217 }
3218
3219 static void
crocus_set_clip_state(struct pipe_context * ctx,const struct pipe_clip_state * state)3220 crocus_set_clip_state(struct pipe_context *ctx,
3221 const struct pipe_clip_state *state)
3222 {
3223 struct crocus_context *ice = (struct crocus_context *) ctx;
3224 struct crocus_shader_state *shs = &ice->state.shaders[MESA_SHADER_VERTEX];
3225 struct crocus_shader_state *gshs = &ice->state.shaders[MESA_SHADER_GEOMETRY];
3226 struct crocus_shader_state *tshs = &ice->state.shaders[MESA_SHADER_TESS_EVAL];
3227
3228 memcpy(&ice->state.clip_planes, state, sizeof(*state));
3229
3230 #if GFX_VER <= 5
3231 ice->state.dirty |= CROCUS_DIRTY_GEN4_CURBE;
3232 #endif
3233 ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_VS | CROCUS_STAGE_DIRTY_CONSTANTS_GS |
3234 CROCUS_STAGE_DIRTY_CONSTANTS_TES;
3235 shs->sysvals_need_upload = true;
3236 gshs->sysvals_need_upload = true;
3237 tshs->sysvals_need_upload = true;
3238 }
3239
3240 /**
3241 * The pipe->set_polygon_stipple() driver hook.
3242 */
3243 static void
crocus_set_polygon_stipple(struct pipe_context * ctx,const struct pipe_poly_stipple * state)3244 crocus_set_polygon_stipple(struct pipe_context *ctx,
3245 const struct pipe_poly_stipple *state)
3246 {
3247 struct crocus_context *ice = (struct crocus_context *) ctx;
3248 memcpy(&ice->state.poly_stipple, state, sizeof(*state));
3249 ice->state.dirty |= CROCUS_DIRTY_POLYGON_STIPPLE;
3250 }
3251
/**
 * The pipe->set_sample_mask() driver hook.
 */
static void
crocus_set_sample_mask(struct pipe_context *ctx, unsigned sample_mask)
{
   struct crocus_context *ice = (struct crocus_context *) ctx;

   /* Only the low 8 bits of the sample mask are kept, matching the
    * maximum MSAA count handled here.  st/mesa may pass us 0xffffffff,
    * meaning "enable all samples".
    */
   ice->state.sample_mask = sample_mask & 0xff;
   ice->state.dirty |= CROCUS_DIRTY_GEN6_SAMPLE_MASK;
}
3266
/**
 * Compute the effective scissor rectangle for viewport @idx: the
 * viewport's screen-space extent clamped to the framebuffer, optionally
 * intersected with the user scissor when rasterizer scissoring is on.
 */
static void
crocus_fill_scissor_rect(struct crocus_context *ice,
                         int idx,
                         struct pipe_scissor_state *ss)
{
   struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
   struct pipe_rasterizer_state *cso_state = &ice->state.cso_rast->cso;
   const struct pipe_viewport_state *vp = &ice->state.viewports[idx];
   /* Viewport bounding box: translate +/- |scale| on each axis, clamped
    * to the framebuffer.  max* are inclusive, hence the trailing -1.
    */
   struct pipe_scissor_state scissor = (struct pipe_scissor_state) {
      .minx = MAX2(-fabsf(vp->scale[0]) + vp->translate[0], 0),
      .maxx = MIN2( fabsf(vp->scale[0]) + vp->translate[0], cso_fb->width) - 1,
      .miny = MAX2(-fabsf(vp->scale[1]) + vp->translate[1], 0),
      .maxy = MIN2( fabsf(vp->scale[1]) + vp->translate[1], cso_fb->height) - 1,
   };
   if (cso_state->scissor) {
      /* Intersect with the user-specified scissor rectangle. */
      struct pipe_scissor_state *s = &ice->state.scissors[idx];
      scissor.minx = MAX2(scissor.minx, s->minx);
      scissor.miny = MAX2(scissor.miny, s->miny);
      scissor.maxx = MIN2(scissor.maxx, s->maxx);
      scissor.maxy = MIN2(scissor.maxy, s->maxy);
   }
   *ss = scissor;
}
3290
3291 /**
3292 * The pipe->set_scissor_states() driver hook.
3293 *
3294 * This corresponds to our SCISSOR_RECT state structures. It's an
3295 * exact match, so we just store them, and memcpy them out later.
3296 */
3297 static void
crocus_set_scissor_states(struct pipe_context * ctx,unsigned start_slot,unsigned num_scissors,const struct pipe_scissor_state * rects)3298 crocus_set_scissor_states(struct pipe_context *ctx,
3299 unsigned start_slot,
3300 unsigned num_scissors,
3301 const struct pipe_scissor_state *rects)
3302 {
3303 struct crocus_context *ice = (struct crocus_context *) ctx;
3304
3305 for (unsigned i = 0; i < num_scissors; i++) {
3306 if (rects[i].minx == rects[i].maxx || rects[i].miny == rects[i].maxy) {
3307 /* If the scissor was out of bounds and got clamped to 0 width/height
3308 * at the bounds, the subtraction of 1 from maximums could produce a
3309 * negative number and thus not clip anything. Instead, just provide
3310 * a min > max scissor inside the bounds, which produces the expected
3311 * no rendering.
3312 */
3313 ice->state.scissors[start_slot + i] = (struct pipe_scissor_state) {
3314 .minx = 1, .maxx = 0, .miny = 1, .maxy = 0,
3315 };
3316 } else {
3317 ice->state.scissors[start_slot + i] = (struct pipe_scissor_state) {
3318 .minx = rects[i].minx, .miny = rects[i].miny,
3319 .maxx = rects[i].maxx - 1, .maxy = rects[i].maxy - 1,
3320 };
3321 }
3322 }
3323
3324 #if GFX_VER < 6
3325 ice->state.dirty |= CROCUS_DIRTY_RASTER; /* SF state */
3326 #else
3327 ice->state.dirty |= CROCUS_DIRTY_GEN6_SCISSOR_RECT;
3328 #endif
3329 ice->state.dirty |= CROCUS_DIRTY_SF_CL_VIEWPORT;
3330
3331 }
3332
3333 /**
3334 * The pipe->set_stencil_ref() driver hook.
3335 *
3336 * This is added to 3DSTATE_WM_DEPTH_STENCIL dynamically at draw time.
3337 */
3338 static void
crocus_set_stencil_ref(struct pipe_context * ctx,const struct pipe_stencil_ref ref)3339 crocus_set_stencil_ref(struct pipe_context *ctx,
3340 const struct pipe_stencil_ref ref)
3341 {
3342 struct crocus_context *ice = (struct crocus_context *) ctx;
3343 ice->state.stencil_ref = ref;
3344 ice->state.dirty |= CROCUS_DIRTY_COLOR_CALC_STATE;
3345 }
3346
#if GFX_VER == 8
/**
 * Screen-space coordinate of the viewport edge along @axis in the
 * direction given by @sign: translate +/- scale.
 */
static float
viewport_extent(const struct pipe_viewport_state *state, int axis, float sign)
{
   return state->translate[axis] + copysignf(state->scale[axis], sign);
}
#endif
3354
/**
 * The pipe->set_viewport_states() driver hook.
 *
 * This corresponds to our SF_CLIP_VIEWPORT states.  We can't calculate
 * the guardband yet, as we need the framebuffer dimensions, but we can
 * at least fill out the rest.
 */
static void
crocus_set_viewport_states(struct pipe_context *ctx,
                           unsigned start_slot,
                           unsigned count,
                           const struct pipe_viewport_state *states)
{
   struct crocus_context *ice = (struct crocus_context *) ctx;
   struct crocus_screen *screen = (struct crocus_screen *)ctx->screen;

   memcpy(&ice->state.viewports[start_slot], states, sizeof(*states) * count);

   /* Fix depth test misrenderings by lowering translated depth range */
   if (screen->driconf.lower_depth_range_rate != 1.0f)
      ice->state.viewports[start_slot].translate[2] *=
         screen->driconf.lower_depth_range_rate;

   ice->state.dirty |= CROCUS_DIRTY_SF_CL_VIEWPORT;
   ice->state.dirty |= CROCUS_DIRTY_RASTER;
#if GFX_VER >= 6
   ice->state.dirty |= CROCUS_DIRTY_GEN6_SCISSOR_RECT;
#endif

   /* If depth clipping is partially disabled, the CC viewport (which
    * holds the depth clamp range) must be recalculated too.
    */
   if (ice->state.cso_rast && (!ice->state.cso_rast->cso.depth_clip_near ||
                               !ice->state.cso_rast->cso.depth_clip_far))
      ice->state.dirty |= CROCUS_DIRTY_CC_VIEWPORT;
}
3388
/**
 * The pipe->set_framebuffer_state() driver hook.
 *
 * Sets the current draw FBO, including color render targets, depth,
 * and stencil buffers.
 */
static void
crocus_set_framebuffer_state(struct pipe_context *ctx,
                             const struct pipe_framebuffer_state *state)
{
   struct crocus_context *ice = (struct crocus_context *) ctx;
   struct pipe_framebuffer_state *cso = &ice->state.framebuffer;
   struct crocus_screen *screen = (struct crocus_screen *)ctx->screen;
   const struct intel_device_info *devinfo = &screen->devinfo;
#if 0
   struct isl_device *isl_dev = &screen->isl_dev;
   struct crocus_resource *zres;
   struct crocus_resource *stencil_res;
#endif

   unsigned samples = util_framebuffer_get_num_samples(state);
   unsigned layers = util_framebuffer_get_num_layers(state);

#if GFX_VER >= 6
   /* Sample-count changes affect multisample, mask, and raster state. */
   if (cso->samples != samples) {
      ice->state.dirty |= CROCUS_DIRTY_GEN6_MULTISAMPLE;
      ice->state.dirty |= CROCUS_DIRTY_GEN6_SAMPLE_MASK;
      ice->state.dirty |= CROCUS_DIRTY_RASTER;
#if GFX_VERx10 == 75
      ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_FS;
#endif
   }
#endif

#if GFX_VER >= 6 && GFX_VER < 8
   ice->state.dirty |= CROCUS_DIRTY_GEN6_BLEND_STATE;
#endif

   /* Transitioning between layered and non-layered affects clip state. */
   if ((cso->layers == 0) != (layers == 0)) {
      ice->state.dirty |= CROCUS_DIRTY_CLIP;
   }

   if (cso->width != state->width || cso->height != state->height) {
      ice->state.dirty |= CROCUS_DIRTY_SF_CL_VIEWPORT;
      ice->state.dirty |= CROCUS_DIRTY_RASTER;
      ice->state.dirty |= CROCUS_DIRTY_DRAWING_RECTANGLE;
#if GFX_VER >= 6
      ice->state.dirty |= CROCUS_DIRTY_GEN6_SCISSOR_RECT;
#endif
   }

   if (cso->zsbuf || state->zsbuf) {
      ice->state.dirty |= CROCUS_DIRTY_DEPTH_BUFFER;

      /* update SF's depth buffer format */
      if (GFX_VER == 7 && cso->zsbuf)
         ice->state.dirty |= CROCUS_DIRTY_RASTER;
   }

   /* wm thread dispatch enable */
   ice->state.dirty |= CROCUS_DIRTY_WM;
   util_copy_framebuffer_state(cso, state);
   cso->samples = samples;
   cso->layers = layers;

   if (cso->zsbuf) {
      /* Track whether the bound depth level has HiZ, for later emission. */
      struct crocus_resource *zres;
      struct crocus_resource *stencil_res;
      enum isl_aux_usage aux_usage = ISL_AUX_USAGE_NONE;
      crocus_get_depth_stencil_resources(devinfo, cso->zsbuf->texture, &zres,
                                         &stencil_res);
      if (zres && crocus_resource_level_has_hiz(zres, cso->zsbuf->u.tex.level)) {
         aux_usage = zres->aux.usage;
      }
      ice->state.hiz_usage = aux_usage;
   }

   /* Render target change */
   ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_FS;

   ice->state.dirty |= CROCUS_DIRTY_RENDER_RESOLVES_AND_FLUSHES;

   ice->state.stage_dirty |= ice->state.stage_dirty_for_nos[CROCUS_NOS_FRAMEBUFFER];
}
3473
3474 /**
3475 * The pipe->set_constant_buffer() driver hook.
3476 *
3477 * This uploads any constant data in user buffers, and references
3478 * any UBO resources containing constant data.
3479 */
static void
crocus_set_constant_buffer(struct pipe_context *ctx,
                           enum pipe_shader_type p_stage, unsigned index,
                           bool take_ownership,
                           const struct pipe_constant_buffer *input)
{
   struct crocus_context *ice = (struct crocus_context *) ctx;
   gl_shader_stage stage = stage_from_pipe(p_stage);
   struct crocus_shader_state *shs = &ice->state.shaders[stage];
   /* Alias of the slot we just copied into; mutated below for user data. */
   struct pipe_constant_buffer *cbuf = &shs->constbufs[index];

   util_copy_constant_buffer(&shs->constbufs[index], input, take_ownership);

   if (input && input->buffer_size && (input->buffer || input->user_buffer)) {
      shs->bound_cbufs |= 1u << index;

      if (input->user_buffer) {
         /* User-memory constants: copy them into a GPU-visible upload
          * buffer and point the slot at that instead. */
         void *map = NULL;
         pipe_resource_reference(&cbuf->buffer, NULL);
         u_upload_alloc(ice->ctx.const_uploader, 0, input->buffer_size, 64,
                        &cbuf->buffer_offset, &cbuf->buffer, (void **) &map);

         if (!cbuf->buffer) {
            /* Allocation was unsuccessful - just unbind */
            crocus_set_constant_buffer(ctx, p_stage, index, false, NULL);
            return;
         }

         assert(map);
         memcpy(map, input->user_buffer, input->buffer_size);
      }
      /* Clamp the bound size so it never runs past the end of the BO. */
      cbuf->buffer_size =
         MIN2(input->buffer_size,
              crocus_resource_bo(cbuf->buffer)->size - cbuf->buffer_offset);

      struct crocus_resource *res = (void *) cbuf->buffer;
      res->bind_history |= PIPE_BIND_CONSTANT_BUFFER;
      res->bind_stages |= 1 << stage;
   } else {
      /* Empty or NULL input unbinds the slot. */
      shs->bound_cbufs &= ~(1u << index);
   }

   /* Constants for this stage must be re-emitted. */
   ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_VS << stage;
}
3524
/**
 * Upload the current values of a shader's "system values".
 *
 * The compiler reserves the shader's last constant buffer slot for
 * driver-supplied parameters (user clip planes, default tessellation
 * levels, image params, workgroup sizes, ...).  Resolve each system
 * value to its current value and upload the whole set as one buffer.
 */
static void
upload_sysvals(struct crocus_context *ice,
               gl_shader_stage stage)
{
   UNUSED struct crocus_genx_state *genx = ice->state.genx;
   struct crocus_shader_state *shs = &ice->state.shaders[stage];

   struct crocus_compiled_shader *shader = ice->shaders.prog[stage];
   if (!shader || shader->num_system_values == 0)
      return;

   assert(shader->num_cbufs > 0);

   /* System values always occupy the last constant buffer slot. */
   unsigned sysval_cbuf_index = shader->num_cbufs - 1;
   struct pipe_constant_buffer *cbuf = &shs->constbufs[sysval_cbuf_index];
   unsigned upload_size = shader->num_system_values * sizeof(uint32_t);
   uint32_t *map = NULL;

   assert(sysval_cbuf_index < PIPE_MAX_CONSTANT_BUFFERS);
   u_upload_alloc(ice->ctx.const_uploader, 0, upload_size, 64,
                  &cbuf->buffer_offset, &cbuf->buffer, (void **) &map);
   /* NOTE(review): map is not checked for allocation failure here, unlike
    * crocus_set_constant_buffer() -- confirm the uploader cannot fail. */

   for (int i = 0; i < shader->num_system_values; i++) {
      uint32_t sysval = shader->system_values[i];
      uint32_t value = 0;

      if (ELK_PARAM_DOMAIN(sysval) == ELK_PARAM_DOMAIN_IMAGE) {
#if GFX_VER >= 7
         /* Pull the requested dword out of the image's isl_image_param. */
         unsigned img = ELK_PARAM_IMAGE_IDX(sysval);
         unsigned offset = ELK_PARAM_IMAGE_OFFSET(sysval);
         struct isl_image_param *param =
            &genx->shaders[stage].image_param[img];

         assert(offset < sizeof(struct isl_image_param));
         value = ((uint32_t *) param)[offset];
#endif
      } else if (sysval == ELK_PARAM_BUILTIN_ZERO) {
         value = 0;
      } else if (ELK_PARAM_BUILTIN_IS_CLIP_PLANE(sysval)) {
         /* One component of one user clip plane. */
         int plane = ELK_PARAM_BUILTIN_CLIP_PLANE_IDX(sysval);
         int comp = ELK_PARAM_BUILTIN_CLIP_PLANE_COMP(sysval);
         value = fui(ice->state.clip_planes.ucp[plane][comp]);
      } else if (sysval == ELK_PARAM_BUILTIN_PATCH_VERTICES_IN) {
         if (stage == MESA_SHADER_TESS_CTRL) {
            value = ice->state.vertices_per_patch;
         } else {
            assert(stage == MESA_SHADER_TESS_EVAL);
            /* TES sees the TCS's output vertex count when a TCS is bound;
             * otherwise fall back to the API-specified patch size. */
            const struct shader_info *tcs_info =
               crocus_get_shader_info(ice, MESA_SHADER_TESS_CTRL);
            if (tcs_info)
               value = tcs_info->tess.tcs_vertices_out;
            else
               value = ice->state.vertices_per_patch;
         }
      } else if (sysval >= ELK_PARAM_BUILTIN_TESS_LEVEL_OUTER_X &&
                 sysval <= ELK_PARAM_BUILTIN_TESS_LEVEL_OUTER_W) {
         unsigned i = sysval - ELK_PARAM_BUILTIN_TESS_LEVEL_OUTER_X;
         value = fui(ice->state.default_outer_level[i]);
      } else if (sysval == ELK_PARAM_BUILTIN_TESS_LEVEL_INNER_X) {
         value = fui(ice->state.default_inner_level[0]);
      } else if (sysval == ELK_PARAM_BUILTIN_TESS_LEVEL_INNER_Y) {
         value = fui(ice->state.default_inner_level[1]);
      } else if (sysval >= ELK_PARAM_BUILTIN_WORK_GROUP_SIZE_X &&
                 sysval <= ELK_PARAM_BUILTIN_WORK_GROUP_SIZE_Z) {
         unsigned i = sysval - ELK_PARAM_BUILTIN_WORK_GROUP_SIZE_X;
         value = ice->state.last_block[i];
      } else {
         assert(!"unhandled system value");
      }

      *map++ = value;
   }

   cbuf->buffer_size = upload_size;
   shs->sysvals_need_upload = false;
}
3601
3602 /**
3603 * The pipe->set_shader_buffers() driver hook.
3604 *
3605 * This binds SSBOs and ABOs. Unfortunately, we need to stream out
3606 * SURFACE_STATE here, as the buffer offset may change each time.
3607 */
3608 static void
crocus_set_shader_buffers(struct pipe_context * ctx,enum pipe_shader_type p_stage,unsigned start_slot,unsigned count,const struct pipe_shader_buffer * buffers,unsigned writable_bitmask)3609 crocus_set_shader_buffers(struct pipe_context *ctx,
3610 enum pipe_shader_type p_stage,
3611 unsigned start_slot, unsigned count,
3612 const struct pipe_shader_buffer *buffers,
3613 unsigned writable_bitmask)
3614 {
3615 struct crocus_context *ice = (struct crocus_context *) ctx;
3616 gl_shader_stage stage = stage_from_pipe(p_stage);
3617 struct crocus_shader_state *shs = &ice->state.shaders[stage];
3618
3619 unsigned modified_bits = u_bit_consecutive(start_slot, count);
3620
3621 shs->bound_ssbos &= ~modified_bits;
3622 shs->writable_ssbos &= ~modified_bits;
3623 shs->writable_ssbos |= writable_bitmask << start_slot;
3624
3625 for (unsigned i = 0; i < count; i++) {
3626 if (buffers && buffers[i].buffer) {
3627 struct crocus_resource *res = (void *) buffers[i].buffer;
3628 struct pipe_shader_buffer *ssbo = &shs->ssbo[start_slot + i];
3629 pipe_resource_reference(&ssbo->buffer, &res->base.b);
3630 ssbo->buffer_offset = buffers[i].buffer_offset;
3631 ssbo->buffer_size =
3632 MIN2(buffers[i].buffer_size, res->bo->size - ssbo->buffer_offset);
3633
3634 shs->bound_ssbos |= 1 << (start_slot + i);
3635
3636 res->bind_history |= PIPE_BIND_SHADER_BUFFER;
3637 res->bind_stages |= 1 << stage;
3638
3639 util_range_add(&res->base.b, &res->valid_buffer_range, ssbo->buffer_offset,
3640 ssbo->buffer_offset + ssbo->buffer_size);
3641 } else {
3642 pipe_resource_reference(&shs->ssbo[start_slot + i].buffer, NULL);
3643 }
3644 }
3645
3646 ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_VS << stage;
3647 }
3648
/**
 * Generic pipe->delete_*_state() hook for CSOs that own no GPU resources:
 * the state object is a plain heap allocation, so just free it.
 */
static void
crocus_delete_state(struct pipe_context *ctx, void *state)
{
   free(state);
}
3654
3655 /**
3656 * The pipe->set_vertex_buffers() driver hook.
3657 *
3658 * This translates pipe_vertex_buffer to our 3DSTATE_VERTEX_BUFFERS packet.
3659 */
3660 static void
crocus_set_vertex_buffers(struct pipe_context * ctx,unsigned count,const struct pipe_vertex_buffer * buffers)3661 crocus_set_vertex_buffers(struct pipe_context *ctx,
3662 unsigned count,
3663 const struct pipe_vertex_buffer *buffers)
3664 {
3665 struct crocus_context *ice = (struct crocus_context *) ctx;
3666 struct crocus_screen *screen = (struct crocus_screen *) ctx->screen;
3667 const unsigned padding =
3668 (GFX_VERx10 < 75 && screen->devinfo.platform != INTEL_PLATFORM_BYT) * 2;
3669
3670 util_set_vertex_buffers_mask(ice->state.vertex_buffers, &ice->state.bound_vertex_buffers,
3671 buffers, count, true);
3672
3673 for (unsigned i = 0; i < count; i++) {
3674 struct pipe_vertex_buffer *state =
3675 &ice->state.vertex_buffers[i];
3676
3677 if (!state->is_user_buffer && state->buffer.resource) {
3678 struct crocus_resource *res = (void *)state->buffer.resource;
3679 res->bind_history |= PIPE_BIND_VERTEX_BUFFER;
3680 }
3681
3682 uint32_t end = 0;
3683 if (state->buffer.resource)
3684 end = state->buffer.resource->width0 + padding;
3685 ice->state.vb_end[i] = end;
3686 }
3687 ice->state.dirty |= CROCUS_DIRTY_VERTEX_BUFFERS;
3688 }
3689
3690 #if GFX_VERx10 < 75
get_wa_flags(enum isl_format format)3691 static uint8_t get_wa_flags(enum isl_format format)
3692 {
3693 uint8_t wa_flags = 0;
3694
3695 switch (format) {
3696 case ISL_FORMAT_R10G10B10A2_USCALED:
3697 wa_flags = ELK_ATTRIB_WA_SCALE;
3698 break;
3699 case ISL_FORMAT_R10G10B10A2_SSCALED:
3700 wa_flags = ELK_ATTRIB_WA_SIGN | ELK_ATTRIB_WA_SCALE;
3701 break;
3702 case ISL_FORMAT_R10G10B10A2_UNORM:
3703 wa_flags = ELK_ATTRIB_WA_NORMALIZE;
3704 break;
3705 case ISL_FORMAT_R10G10B10A2_SNORM:
3706 wa_flags = ELK_ATTRIB_WA_SIGN | ELK_ATTRIB_WA_NORMALIZE;
3707 break;
3708 case ISL_FORMAT_R10G10B10A2_SINT:
3709 wa_flags = ELK_ATTRIB_WA_SIGN;
3710 break;
3711 case ISL_FORMAT_B10G10R10A2_USCALED:
3712 wa_flags = ELK_ATTRIB_WA_SCALE | ELK_ATTRIB_WA_BGRA;
3713 break;
3714 case ISL_FORMAT_B10G10R10A2_SSCALED:
3715 wa_flags = ELK_ATTRIB_WA_SIGN | ELK_ATTRIB_WA_SCALE | ELK_ATTRIB_WA_BGRA;
3716 break;
3717 case ISL_FORMAT_B10G10R10A2_UNORM:
3718 wa_flags = ELK_ATTRIB_WA_NORMALIZE | ELK_ATTRIB_WA_BGRA;
3719 break;
3720 case ISL_FORMAT_B10G10R10A2_SNORM:
3721 wa_flags = ELK_ATTRIB_WA_SIGN | ELK_ATTRIB_WA_NORMALIZE | ELK_ATTRIB_WA_BGRA;
3722 break;
3723 case ISL_FORMAT_B10G10R10A2_SINT:
3724 wa_flags = ELK_ATTRIB_WA_SIGN | ELK_ATTRIB_WA_BGRA;
3725 break;
3726 case ISL_FORMAT_B10G10R10A2_UINT:
3727 wa_flags = ELK_ATTRIB_WA_BGRA;
3728 break;
3729 default:
3730 break;
3731 }
3732 return wa_flags;
3733 }
3734 #endif
3735
3736 /**
3737 * Gallium CSO for vertex elements.
3738 */
3739 struct crocus_vertex_element_state {
3740 uint32_t vertex_elements[1 + 33 * GENX(VERTEX_ELEMENT_STATE_length)];
3741 #if GFX_VER == 8
3742 uint32_t vf_instancing[33 * GENX(3DSTATE_VF_INSTANCING_length)];
3743 #endif
3744 uint32_t edgeflag_ve[GENX(VERTEX_ELEMENT_STATE_length)];
3745 #if GFX_VER == 8
3746 uint32_t edgeflag_vfi[GENX(3DSTATE_VF_INSTANCING_length)];
3747 #endif
3748 uint32_t step_rate[16];
3749 uint8_t wa_flags[33];
3750 uint16_t strides[16];
3751 unsigned count;
3752 };
3753
3754 /**
3755 * The pipe->create_vertex_elements() driver hook.
3756 *
3757 * This translates pipe_vertex_element to our 3DSTATE_VERTEX_ELEMENTS
3758 * and 3DSTATE_VF_INSTANCING commands. The vertex_elements and vf_instancing
3759 * arrays are ready to be emitted at draw time if no EdgeFlag or SGVs are
3760 * needed. In these cases we will need information available at draw time.
3761 * We setup edgeflag_ve and edgeflag_vfi as alternatives last
3762 * 3DSTATE_VERTEX_ELEMENT and 3DSTATE_VF_INSTANCING that can be used at
3763 * draw time if we detect that EdgeFlag is needed by the Vertex Shader.
3764 */
3765 static void *
crocus_create_vertex_elements(struct pipe_context * ctx,unsigned count,const struct pipe_vertex_element * state)3766 crocus_create_vertex_elements(struct pipe_context *ctx,
3767 unsigned count,
3768 const struct pipe_vertex_element *state)
3769 {
3770 struct crocus_screen *screen = (struct crocus_screen *)ctx->screen;
3771 const struct intel_device_info *devinfo = &screen->devinfo;
3772 struct crocus_vertex_element_state *cso =
3773 calloc(1, sizeof(struct crocus_vertex_element_state));
3774
3775 cso->count = count;
3776
3777 crocus_pack_command(GENX(3DSTATE_VERTEX_ELEMENTS), cso->vertex_elements, ve) {
3778 ve.DWordLength =
3779 1 + GENX(VERTEX_ELEMENT_STATE_length) * MAX2(count, 1) - 2;
3780 }
3781
3782 uint32_t *ve_pack_dest = &cso->vertex_elements[1];
3783 #if GFX_VER == 8
3784 uint32_t *vfi_pack_dest = cso->vf_instancing;
3785 #endif
3786
3787 if (count == 0) {
3788 crocus_pack_state(GENX(VERTEX_ELEMENT_STATE), ve_pack_dest, ve) {
3789 ve.Valid = true;
3790 ve.SourceElementFormat = ISL_FORMAT_R32G32B32A32_FLOAT;
3791 ve.Component0Control = VFCOMP_STORE_0;
3792 ve.Component1Control = VFCOMP_STORE_0;
3793 ve.Component2Control = VFCOMP_STORE_0;
3794 ve.Component3Control = VFCOMP_STORE_1_FP;
3795 }
3796 #if GFX_VER == 8
3797 crocus_pack_command(GENX(3DSTATE_VF_INSTANCING), vfi_pack_dest, vi) {
3798 }
3799 #endif
3800 }
3801
3802 for (int i = 0; i < count; i++) {
3803 const struct crocus_format_info fmt =
3804 crocus_format_for_usage(devinfo, state[i].src_format, 0);
3805 unsigned comp[4] = { VFCOMP_STORE_SRC, VFCOMP_STORE_SRC,
3806 VFCOMP_STORE_SRC, VFCOMP_STORE_SRC };
3807 enum isl_format actual_fmt = fmt.fmt;
3808
3809 #if GFX_VERx10 < 75
3810 cso->wa_flags[i] = get_wa_flags(fmt.fmt);
3811
3812 if (fmt.fmt == ISL_FORMAT_R10G10B10A2_USCALED ||
3813 fmt.fmt == ISL_FORMAT_R10G10B10A2_SSCALED ||
3814 fmt.fmt == ISL_FORMAT_R10G10B10A2_UNORM ||
3815 fmt.fmt == ISL_FORMAT_R10G10B10A2_SNORM ||
3816 fmt.fmt == ISL_FORMAT_R10G10B10A2_SINT ||
3817 fmt.fmt == ISL_FORMAT_B10G10R10A2_USCALED ||
3818 fmt.fmt == ISL_FORMAT_B10G10R10A2_SSCALED ||
3819 fmt.fmt == ISL_FORMAT_B10G10R10A2_UNORM ||
3820 fmt.fmt == ISL_FORMAT_B10G10R10A2_SNORM ||
3821 fmt.fmt == ISL_FORMAT_B10G10R10A2_UINT ||
3822 fmt.fmt == ISL_FORMAT_B10G10R10A2_SINT)
3823 actual_fmt = ISL_FORMAT_R10G10B10A2_UINT;
3824 if (fmt.fmt == ISL_FORMAT_R8G8B8_SINT)
3825 actual_fmt = ISL_FORMAT_R8G8B8A8_SINT;
3826 if (fmt.fmt == ISL_FORMAT_R8G8B8_UINT)
3827 actual_fmt = ISL_FORMAT_R8G8B8A8_UINT;
3828 if (fmt.fmt == ISL_FORMAT_R16G16B16_SINT)
3829 actual_fmt = ISL_FORMAT_R16G16B16A16_SINT;
3830 if (fmt.fmt == ISL_FORMAT_R16G16B16_UINT)
3831 actual_fmt = ISL_FORMAT_R16G16B16A16_UINT;
3832 #endif
3833
3834 cso->step_rate[state[i].vertex_buffer_index] = state[i].instance_divisor;
3835 cso->strides[state[i].vertex_buffer_index] = state[i].src_stride;
3836
3837 switch (isl_format_get_num_channels(fmt.fmt)) {
3838 case 0: comp[0] = VFCOMP_STORE_0; FALLTHROUGH;
3839 case 1: comp[1] = VFCOMP_STORE_0; FALLTHROUGH;
3840 case 2: comp[2] = VFCOMP_STORE_0; FALLTHROUGH;
3841 case 3:
3842 comp[3] = isl_format_has_int_channel(fmt.fmt) ? VFCOMP_STORE_1_INT
3843 : VFCOMP_STORE_1_FP;
3844 break;
3845 }
3846 crocus_pack_state(GENX(VERTEX_ELEMENT_STATE), ve_pack_dest, ve) {
3847 #if GFX_VER >= 6
3848 ve.EdgeFlagEnable = false;
3849 #endif
3850 ve.VertexBufferIndex = state[i].vertex_buffer_index;
3851 ve.Valid = true;
3852 ve.SourceElementOffset = state[i].src_offset;
3853 ve.SourceElementFormat = actual_fmt;
3854 ve.Component0Control = comp[0];
3855 ve.Component1Control = comp[1];
3856 ve.Component2Control = comp[2];
3857 ve.Component3Control = comp[3];
3858 #if GFX_VER < 5
3859 ve.DestinationElementOffset = i * 4;
3860 #endif
3861 }
3862
3863 #if GFX_VER == 8
3864 crocus_pack_command(GENX(3DSTATE_VF_INSTANCING), vfi_pack_dest, vi) {
3865 vi.VertexElementIndex = i;
3866 vi.InstancingEnable = state[i].instance_divisor > 0;
3867 vi.InstanceDataStepRate = state[i].instance_divisor;
3868 }
3869 #endif
3870 ve_pack_dest += GENX(VERTEX_ELEMENT_STATE_length);
3871 #if GFX_VER == 8
3872 vfi_pack_dest += GENX(3DSTATE_VF_INSTANCING_length);
3873 #endif
3874 }
3875
3876 /* An alternative version of the last VE and VFI is stored so it
3877 * can be used at draw time in case Vertex Shader uses EdgeFlag
3878 */
3879 if (count) {
3880 const unsigned edgeflag_index = count - 1;
3881 const struct crocus_format_info fmt =
3882 crocus_format_for_usage(devinfo, state[edgeflag_index].src_format, 0);
3883 crocus_pack_state(GENX(VERTEX_ELEMENT_STATE), cso->edgeflag_ve, ve) {
3884 #if GFX_VER >= 6
3885 ve.EdgeFlagEnable = true;
3886 #endif
3887 ve.VertexBufferIndex = state[edgeflag_index].vertex_buffer_index;
3888 ve.Valid = true;
3889 ve.SourceElementOffset = state[edgeflag_index].src_offset;
3890 ve.SourceElementFormat = fmt.fmt;
3891 ve.Component0Control = VFCOMP_STORE_SRC;
3892 ve.Component1Control = VFCOMP_STORE_0;
3893 ve.Component2Control = VFCOMP_STORE_0;
3894 ve.Component3Control = VFCOMP_STORE_0;
3895 }
3896 #if GFX_VER == 8
3897 crocus_pack_command(GENX(3DSTATE_VF_INSTANCING), cso->edgeflag_vfi, vi) {
3898 /* The vi.VertexElementIndex of the EdgeFlag Vertex Element is filled
3899 * at draw time, as it should change if SGVs are emitted.
3900 */
3901 vi.InstancingEnable = state[edgeflag_index].instance_divisor > 0;
3902 vi.InstanceDataStepRate = state[edgeflag_index].instance_divisor;
3903 }
3904 #endif
3905 }
3906
3907 return cso;
3908 }
3909
3910 /**
3911 * The pipe->bind_vertex_elements_state() driver hook.
3912 */
static void
crocus_bind_vertex_elements_state(struct pipe_context *ctx, void *state)
{
   struct crocus_context *ice = (struct crocus_context *) ctx;
#if GFX_VER == 8
   /* NOTE(review): cso_changed() presumably compares a field of
    * old_cso/new_cso by name -- confirm the macro definition before
    * renaming these locals. */
   struct crocus_vertex_element_state *old_cso = ice->state.cso_vertex_elements;
   struct crocus_vertex_element_state *new_cso = state;

   /* A changed element count affects Gen8's 3DSTATE_VF_SGVS setup. */
   if (new_cso && cso_changed(count))
      ice->state.dirty |= CROCUS_DIRTY_GEN8_VF_SGVS;
#endif
   ice->state.cso_vertex_elements = state;
   ice->state.dirty |= CROCUS_DIRTY_VERTEX_ELEMENTS | CROCUS_DIRTY_VERTEX_BUFFERS;
   ice->state.stage_dirty |= ice->state.stage_dirty_for_nos[CROCUS_NOS_VERTEX_ELEMENTS];
}
3928
3929 #if GFX_VER >= 6
/* Accumulator for Gen6 streamout primitive counts (see
 * aggregate_stream_counter()). */
struct crocus_streamout_counter {
   /* Byte range within the snapshot buffer holding primitive-count
    * snapshot pairs that have not yet been folded into @accum. */
   uint32_t offset_start;
   uint32_t offset_end;

   /* Total primitives written, accumulated from drained snapshots. */
   uint64_t accum;
};
3936
3937 /**
3938 * Gallium CSO for stream output (transform feedback) targets.
3939 */
struct crocus_stream_output_target {
   struct pipe_stream_output_target base;

   /** Stride (bytes-per-vertex) during this transform feedback operation */
   uint16_t stride;

   /** Has 3DSTATE_SO_BUFFER actually been emitted, zeroing the offsets? */
   bool zeroed;

   /* BO holding the saved SO write offset (Gen7+) or primitive-count
    * snapshots (Gen6), and the offset of our allocation within it. */
   struct crocus_resource *offset_res;
   uint32_t offset_offset;

#if GFX_VER == 6
   /* CPU mapping of offset_res, used to read snapshots back. */
   void *prim_map;
   /* Counters for the previous and current binding of this target. */
   struct crocus_streamout_counter prev_count;
   struct crocus_streamout_counter count;
#endif
#if GFX_VER == 8
   /** Does the next 3DSTATE_SO_BUFFER need to zero the offsets? */
   bool zero_offset;
#endif
};
3962
3963 #if GFX_VER >= 7
3964 static uint32_t
crocus_get_so_offset(struct pipe_stream_output_target * so)3965 crocus_get_so_offset(struct pipe_stream_output_target *so)
3966 {
3967 struct crocus_stream_output_target *tgt = (void *)so;
3968 struct pipe_transfer *transfer;
3969 struct pipe_box box;
3970 uint32_t result;
3971 u_box_1d(tgt->offset_offset, 4, &box);
3972 void *val = so->context->buffer_map(so->context, &tgt->offset_res->base.b,
3973 0, PIPE_MAP_DIRECTLY,
3974 &box, &transfer);
3975 assert(val);
3976 result = *(uint32_t *)val;
3977 so->context->buffer_unmap(so->context, transfer);
3978
3979 return result / tgt->stride;
3980 }
3981 #endif
3982
3983 #if GFX_VER == 6
3984 static void
3985 compute_vertices_written_so_far(struct crocus_context *ice,
3986 struct crocus_stream_output_target *tgt,
3987 struct crocus_streamout_counter *count,
3988 uint64_t *svbi);
3989
3990 static uint32_t
crocus_get_so_offset(struct pipe_stream_output_target * so)3991 crocus_get_so_offset(struct pipe_stream_output_target *so)
3992 {
3993 struct crocus_stream_output_target *tgt = (void *)so;
3994 struct crocus_context *ice = (void *)so->context;
3995
3996 uint64_t vert_written;
3997 compute_vertices_written_so_far(ice, tgt, &tgt->prev_count, &vert_written);
3998 return vert_written;
3999 }
4000 #endif
4001
4002 /**
4003 * The pipe->create_stream_output_target() driver hook.
4004 *
4005 * "Target" here refers to a destination buffer. We translate this into
4006 * a 3DSTATE_SO_BUFFER packet. We can handle most fields, but don't yet
4007 * know which buffer this represents, or whether we ought to zero the
4008 * write-offsets, or append. Those are handled in the set() hook.
4009 */
static struct pipe_stream_output_target *
crocus_create_stream_output_target(struct pipe_context *ctx,
                                   struct pipe_resource *p_res,
                                   unsigned buffer_offset,
                                   unsigned buffer_size)
{
   struct crocus_resource *res = (void *) p_res;
   struct crocus_stream_output_target *cso = calloc(1, sizeof(*cso));
   if (!cso)
      return NULL;

   res->bind_history |= PIPE_BIND_STREAM_OUTPUT;

   pipe_reference_init(&cso->base.reference, 1);
   pipe_resource_reference(&cso->base.buffer, p_res);
   cso->base.buffer_offset = buffer_offset;
   cso->base.buffer_size = buffer_size;
   cso->base.context = ctx;

   /* The whole target range may be overwritten by streamout. */
   util_range_add(&res->base.b, &res->valid_buffer_range, buffer_offset,
                  buffer_offset + buffer_size);
#if GFX_VER >= 7
   /* Allocate a dword to hold this buffer's saved SO write offset; it is
    * read back in crocus_get_so_offset() and reloaded on append. */
   struct crocus_context *ice = (struct crocus_context *) ctx;
   void *temp;
   u_upload_alloc(ice->ctx.stream_uploader, 0, sizeof(uint32_t), 4,
                  &cso->offset_offset,
                  (struct pipe_resource **)&cso->offset_res,
                  &temp);
#endif

   return &cso->base;
}
4042
4043 static void
crocus_stream_output_target_destroy(struct pipe_context * ctx,struct pipe_stream_output_target * state)4044 crocus_stream_output_target_destroy(struct pipe_context *ctx,
4045 struct pipe_stream_output_target *state)
4046 {
4047 struct crocus_stream_output_target *cso = (void *) state;
4048
4049 pipe_resource_reference((struct pipe_resource **)&cso->offset_res, NULL);
4050 pipe_resource_reference(&cso->base.buffer, NULL);
4051
4052 free(cso);
4053 }
4054
4055 #define GEN6_SO_NUM_PRIMS_WRITTEN 0x2288
4056 #define GEN7_SO_WRITE_OFFSET(n) (0x5280 + (n) * 4)
4057
4058 #if GFX_VER == 6
/* Fold the pending primitive-count snapshot pairs recorded for @tgt into
 * @counter->accum, waiting for the GPU first if necessary. */
static void
aggregate_stream_counter(struct crocus_batch *batch, struct crocus_stream_output_target *tgt,
                         struct crocus_streamout_counter *counter)
{
   uint64_t *prim_counts = tgt->prim_map;

   /* If the current batch still references the snapshot BO, the GPU may
    * not have written all snapshots yet: flush and wait before reading
    * them on the CPU. */
   if (crocus_batch_references(batch, tgt->offset_res->bo)) {
      struct pipe_fence_handle *out_fence = NULL;
      batch->ice->ctx.flush(&batch->ice->ctx, &out_fence, 0);
      batch->screen->base.fence_finish(&batch->screen->base, &batch->ice->ctx, out_fence, UINT64_MAX);
      batch->screen->base.fence_reference(&batch->screen->base, &out_fence, NULL);
   }

   /* Snapshots come in (before, after) pairs; sum the deltas. */
   for (unsigned i = counter->offset_start / sizeof(uint64_t); i < counter->offset_end / sizeof(uint64_t); i += 2) {
      counter->accum += prim_counts[i + 1] - prim_counts[i];
   }
   /* NOTE(review): this resets tgt->count's range even when @counter is
    * prev_count -- presumably fine since both counters index the same
    * snapshot buffer, but worth confirming. */
   tgt->count.offset_start = tgt->count.offset_end = 0;
}
4077
/* Record a GEN6_SO_NUM_PRIMS_WRITTEN snapshot for @tgt into its
 * snapshot buffer, allocating/draining the buffer as needed. */
static void
crocus_stream_store_prims_written(struct crocus_batch *batch,
                                  struct crocus_stream_output_target *tgt)
{
   /* Lazily allocate a 4096-byte scratch buffer for snapshots. */
   if (!tgt->offset_res) {
      u_upload_alloc(batch->ice->ctx.stream_uploader, 0, 4096, 4,
                     &tgt->offset_offset,
                     (struct pipe_resource **)&tgt->offset_res,
                     &tgt->prim_map);
      tgt->count.offset_start = tgt->count.offset_end = 0;
   }

   /* About to run out of snapshot space: drain pending snapshots into
    * the accumulators so the write position wraps back to 0. */
   if (tgt->count.offset_end + 16 >= 4096) {
      aggregate_stream_counter(batch, tgt, &tgt->prev_count);
      aggregate_stream_counter(batch, tgt, &tgt->count);
   }

   /* Flush, then store the 64-bit primitive counter at the next slot. */
   crocus_emit_mi_flush(batch);
   crocus_store_register_mem64(batch, GEN6_SO_NUM_PRIMS_WRITTEN,
                               tgt->offset_res->bo,
                               tgt->count.offset_end + tgt->offset_offset, false);
   tgt->count.offset_end += 8;
}
4101
/* Convert @counter's accumulated primitive count into a vertex count,
 * returned in *svbi (the Gen6 streamed-vertex-buffer index). */
static void
compute_vertices_written_so_far(struct crocus_context *ice,
                                struct crocus_stream_output_target *tgt,
                                struct crocus_streamout_counter *counter,
                                uint64_t *svbi)
{
   //TODO vertices per prim
   aggregate_stream_counter(&ice->batches[0], tgt, counter);

   /* Primitives -> vertices using the per-primitive vertex count of the
    * last transform feedback draw. */
   *svbi = counter->accum * ice->state.last_xfb_verts_per_prim;
}
4113 #endif
4114 /**
4115 * The pipe->set_stream_output_targets() driver hook.
4116 *
4117 * At this point, we know which targets are bound to a particular index,
4118 * and also whether we want to append or start over. We can finish the
4119 * 3DSTATE_SO_BUFFER packets we started earlier.
4120 */
static void
crocus_set_stream_output_targets(struct pipe_context *ctx,
                                 unsigned num_targets,
                                 struct pipe_stream_output_target **targets,
                                 const unsigned *offsets)
{
   struct crocus_context *ice = (struct crocus_context *) ctx;
   struct crocus_batch *batch = &ice->batches[CROCUS_BATCH_RENDER];
   struct pipe_stream_output_target *old_tgt[4] = { NULL, NULL, NULL, NULL };
   const bool active = num_targets > 0;
   if (ice->state.streamout_active != active) {
      ice->state.streamout_active = active;
#if GFX_VER >= 7
      ice->state.dirty |= CROCUS_DIRTY_STREAMOUT;
#else
      ice->state.dirty |= CROCUS_DIRTY_GEN4_FF_GS_PROG;
#endif

      /* We only emit 3DSTATE_SO_DECL_LIST when streamout is active, because
       * it's a non-pipelined command. If we're switching streamout on, we
       * may have missed emitting it earlier, so do so now. (We're already
       * taking a stall to update 3DSTATE_SO_BUFFERS anyway...)
       */
      if (active) {
#if GFX_VER >= 7
         ice->state.dirty |= CROCUS_DIRTY_SO_DECL_LIST;
#endif
      } else {
         /* Streamout is being switched off: flush so results written to
          * the old targets become visible to subsequent consumers. */
         uint32_t flush = 0;
         for (int i = 0; i < PIPE_MAX_SO_BUFFERS; i++) {
            struct crocus_stream_output_target *tgt =
               (void *) ice->state.so_target[i];
            if (tgt) {
               struct crocus_resource *res = (void *) tgt->base.buffer;

               flush |= crocus_flush_bits_for_history(res);
               crocus_dirty_for_history(ice, res);
            }
         }
         crocus_emit_pipe_control_flush(&ice->batches[CROCUS_BATCH_RENDER],
                                        "make streamout results visible", flush);
      }
   }

   /* Swap in the new target list, keeping the old targets alive in
    * old_tgt[] until their state is saved below. */
   ice->state.so_targets = num_targets;
   for (int i = 0; i < 4; i++) {
      pipe_so_target_reference(&old_tgt[i], ice->state.so_target[i]);
      pipe_so_target_reference(&ice->state.so_target[i],
                               i < num_targets ? targets[i] : NULL);
   }

#if GFX_VER == 6
   /* Gen6: streamout progress is tracked with CPU-visible primitive-count
    * snapshots (see crocus_stream_store_prims_written()). */
   bool stored_num_prims = false;
   for (int i = 0; i < PIPE_MAX_SO_BUFFERS; i++) {
      if (num_targets) {
         struct crocus_stream_output_target *tgt =
            (void *) ice->state.so_target[i];

         if (!tgt)
            continue;
         if (offsets[i] == 0) {
            // This means that we're supposed to ignore anything written to
            // the buffer before. We can do this by just clearing out the
            // count of writes to the prim count buffer.
            tgt->count.offset_start = tgt->count.offset_end;
            tgt->count.accum = 0;
            ice->state.svbi = 0;
         } else {
            /* Appending: recompute the starting vertex index from the
             * primitives counted so far. */
            if (tgt->offset_res) {
               compute_vertices_written_so_far(ice, tgt, &tgt->count, &ice->state.svbi);
               tgt->count.offset_start = tgt->count.offset_end;
            }
         }

         /* One snapshot per rebind covers all targets. */
         if (!stored_num_prims) {
            crocus_stream_store_prims_written(batch, tgt);
            stored_num_prims = true;
         }
      } else {
         /* Unbinding: snapshot the final count and remember it so a
          * future append can resume from here. */
         struct crocus_stream_output_target *tgt =
            (void *) old_tgt[i];
         if (tgt) {
            if (!stored_num_prims) {
               crocus_stream_store_prims_written(batch, tgt);
               stored_num_prims = true;
            }

            if (tgt->offset_res) {
               tgt->prev_count = tgt->count;
            }
         }
      }
      pipe_so_target_reference(&old_tgt[i], NULL);
   }
   ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_GS;
#else
   /* Gen7+: program the SO_WRITE_OFFSET registers directly.  A zero
    * offset restarts the buffer; a nonzero (append) offset reloads the
    * offset previously saved for that target. */
   for (int i = 0; i < PIPE_MAX_SO_BUFFERS; i++) {
      if (num_targets) {
         struct crocus_stream_output_target *tgt =
            (void *) ice->state.so_target[i];

         if (offsets[i] == 0) {
#if GFX_VER == 8
            if (tgt)
               tgt->zero_offset = true;
#endif
            crocus_load_register_imm32(batch, GEN7_SO_WRITE_OFFSET(i), 0);
         }
         else if (tgt)
            crocus_load_register_mem32(batch, GEN7_SO_WRITE_OFFSET(i),
                                       tgt->offset_res->bo,
                                       tgt->offset_offset);
      } else {
         /* Unbinding: save the current write offset so a future append
          * can resume where this target left off. */
         struct crocus_stream_output_target *tgt =
            (void *) old_tgt[i];
         if (tgt)
            crocus_store_register_mem32(batch, GEN7_SO_WRITE_OFFSET(i),
                                        tgt->offset_res->bo,
                                        tgt->offset_offset, false);
      }
      pipe_so_target_reference(&old_tgt[i], NULL);
   }
#endif
   /* No need to update 3DSTATE_SO_BUFFER unless SOL is active. */
   if (!active)
      return;
#if GFX_VER >= 7
   ice->state.dirty |= CROCUS_DIRTY_GEN7_SO_BUFFERS;
#elif GFX_VER == 6
   ice->state.dirty |= CROCUS_DIRTY_GEN6_SVBI;
#endif
}
4253
4254 #endif
4255
4256 #if GFX_VER >= 7
4257 /**
4258 * An crocus-vtable helper for encoding the 3DSTATE_SO_DECL_LIST and
4259 * 3DSTATE_STREAMOUT packets.
4260 *
4261 * 3DSTATE_SO_DECL_LIST is a list of shader outputs we want the streamout
4262 * hardware to record. We can create it entirely based on the shader, with
4263 * no dynamic state dependencies.
4264 *
4265 * 3DSTATE_STREAMOUT is an annoying mix of shader-based information and
4266 * state-based settings. We capture the shader-related ones here, and merge
4267 * the rest in at draw time.
4268 */
static uint32_t *
crocus_create_so_decl_list(const struct pipe_stream_output_info *info,
                           const struct intel_vue_map *vue_map)
{
   struct GENX(SO_DECL) so_decl[PIPE_MAX_VERTEX_STREAMS][128];
   int buffer_mask[PIPE_MAX_VERTEX_STREAMS] = {0, 0, 0, 0};
   int next_offset[PIPE_MAX_VERTEX_STREAMS] = {0, 0, 0, 0};
   int decls[PIPE_MAX_VERTEX_STREAMS] = {0, 0, 0, 0};
   int max_decls = 0;
   STATIC_ASSERT(ARRAY_SIZE(so_decl[0]) >= PIPE_MAX_SO_OUTPUTS);

   memset(so_decl, 0, sizeof(so_decl));

   /* Construct the list of SO_DECLs to be emitted.  The formatting of the
    * command feels strange -- each dword pair contains a SO_DECL per stream.
    */
   for (unsigned i = 0; i < info->num_outputs; i++) {
      const struct pipe_stream_output *output = &info->output[i];
      const int buffer = output->output_buffer;
      const int varying = output->register_index;
      const unsigned stream_id = output->stream;
      assert(stream_id < PIPE_MAX_VERTEX_STREAMS);

      buffer_mask[stream_id] |= 1 << buffer;

      assert(vue_map->varying_to_slot[varying] >= 0);

      /* Mesa doesn't store entries for gl_SkipComponents in the Outputs[]
       * array.  Instead, it simply increments DstOffset for the following
       * input by the number of components that should be skipped.
       *
       * Our hardware is unusual in that it requires us to program SO_DECLs
       * for fake "hole" components, rather than simply taking the offset
       * for each real varying.  Each hole can have size 1, 2, 3, or 4; we
       * program as many size = 4 holes as we can, then a final hole to
       * accommodate the final 1, 2, or 3 remaining.
       */
      int skip_components = output->dst_offset - next_offset[buffer];

      while (skip_components > 0) {
         so_decl[stream_id][decls[stream_id]++] = (struct GENX(SO_DECL)) {
            .HoleFlag = 1,
            .OutputBufferSlot = output->output_buffer,
            .ComponentMask = (1 << MIN2(skip_components, 4)) - 1,
         };
         skip_components -= 4;
      }

      next_offset[buffer] = output->dst_offset + output->num_components;

      so_decl[stream_id][decls[stream_id]++] = (struct GENX(SO_DECL)) {
         .OutputBufferSlot = output->output_buffer,
         .RegisterIndex = vue_map->varying_to_slot[varying],
         .ComponentMask =
            ((1 << output->num_components) - 1) << output->start_component,
      };

      if (decls[stream_id] > max_decls)
         max_decls = decls[stream_id];
   }

   /* Allocate room for one 3DSTATE_STREAMOUT packet followed by the
    * SO_DECL_LIST: a 3-dword header plus one dword pair (holding all four
    * streams' i-th SO_DECL) per entry.
    */
   unsigned dwords = GENX(3DSTATE_STREAMOUT_length) + (3 + 2 * max_decls);
   uint32_t *map = ralloc_size(NULL, sizeof(uint32_t) * dwords);
   uint32_t *so_decl_map = map + GENX(3DSTATE_STREAMOUT_length);

   crocus_pack_command(GENX(3DSTATE_STREAMOUT), map, sol) {
      int urb_entry_read_offset = 0;
      int urb_entry_read_length = (vue_map->num_slots + 1) / 2 -
         urb_entry_read_offset;

      /* We always read the whole vertex.  This could be reduced at some
       * point by reading less and offsetting the register index in the
       * SO_DECLs.
       */
      sol.Stream0VertexReadOffset = urb_entry_read_offset;
      sol.Stream0VertexReadLength = urb_entry_read_length - 1;
      sol.Stream1VertexReadOffset = urb_entry_read_offset;
      sol.Stream1VertexReadLength = urb_entry_read_length - 1;
      sol.Stream2VertexReadOffset = urb_entry_read_offset;
      sol.Stream2VertexReadLength = urb_entry_read_length - 1;
      sol.Stream3VertexReadOffset = urb_entry_read_offset;
      sol.Stream3VertexReadLength = urb_entry_read_length - 1;

      // TODO: Double-check that stride == 0 means no buffer. Probably this
      // needs to go elsewhere, where the buffer enable stuff is actually
      // known.
#if GFX_VER < 8
      sol.SOBufferEnable0 = !!info->stride[0];
      sol.SOBufferEnable1 = !!info->stride[1];
      sol.SOBufferEnable2 = !!info->stride[2];
      sol.SOBufferEnable3 = !!info->stride[3];
#else
      /* Set buffer pitches; 0 means unbound. */
      sol.Buffer0SurfacePitch = 4 * info->stride[0];
      sol.Buffer1SurfacePitch = 4 * info->stride[1];
      sol.Buffer2SurfacePitch = 4 * info->stride[2];
      sol.Buffer3SurfacePitch = 4 * info->stride[3];
#endif
   }

   crocus_pack_command(GENX(3DSTATE_SO_DECL_LIST), so_decl_map, list) {
      list.DWordLength = 3 + 2 * max_decls - 2;
      list.StreamtoBufferSelects0 = buffer_mask[0];
      list.StreamtoBufferSelects1 = buffer_mask[1];
      list.StreamtoBufferSelects2 = buffer_mask[2];
      list.StreamtoBufferSelects3 = buffer_mask[3];
      list.NumEntries0 = decls[0];
      list.NumEntries1 = decls[1];
      list.NumEntries2 = decls[2];
      list.NumEntries3 = decls[3];
   }

   for (int i = 0; i < max_decls; i++) {
      /* Each SO_DECL_ENTRY dword pair packs the i-th SO_DECL of all four
       * streams side by side.
       */
      crocus_pack_state(GENX(SO_DECL_ENTRY), so_decl_map + 3 + i * 2, entry) {
         entry.Stream0Decl = so_decl[0][i];
         entry.Stream1Decl = so_decl[1][i];
         entry.Stream2Decl = so_decl[2][i];
         entry.Stream3Decl = so_decl[3][i];
      }
   }

   /* Caller owns the ralloc'd buffer (parented to NULL). */
   return map;
}
4392 #endif
4393
4394 #if GFX_VER == 6
/**
 * Emit the Gen6 3DSTATE_GS_SVB_INDEX packets used for transform feedback.
 *
 * Index 0 carries the saved streamed-vertex index plus the tightest
 * capacity limit across the bound SO targets; indices 1-3 are parked at
 * safe values.
 */
static void
crocus_emit_so_svbi(struct crocus_context *ice)
{
   struct crocus_batch *batch = &ice->batches[CROCUS_BATCH_RENDER];

   /* Clamp MaximumIndex to the smallest vertex capacity of any bound
    * stream-output target.
    */
   unsigned max_vertex = 0xffffffff;
   for (int i = 0; i < PIPE_MAX_SO_BUFFERS; i++) {
      struct crocus_stream_output_target *tgt =
         (void *) ice->state.so_target[i];
      if (tgt)
         /* NOTE(review): assumes tgt->stride != 0 for any bound target --
          * confirm against the SO target creation path.
          */
         max_vertex = MIN2(max_vertex, tgt->base.buffer_size / tgt->stride);
   }

   crocus_emit_cmd(batch, GENX(3DSTATE_GS_SVB_INDEX), svbi) {
      svbi.IndexNumber = 0;
      svbi.StreamedVertexBufferIndex = (uint32_t)ice->state.svbi; /* fix when resuming, based on target's prim count */
      svbi.MaximumIndex = max_vertex;
   }

   /* initialize the rest of the SVBI's to reasonable values so that we don't
    * run out of room writing the regular data.
    */
   for (int i = 1; i < 4; i++) {
      crocus_emit_cmd(batch, GENX(3DSTATE_GS_SVB_INDEX), svbi) {
         svbi.IndexNumber = i;
         svbi.StreamedVertexBufferIndex = 0;
         svbi.MaximumIndex = 0xffffffff;
      }
   }
}
4425
4426 #endif
4427
4428
4429 #if GFX_VER >= 6
4430 static bool
crocus_is_drawing_points(const struct crocus_context * ice)4431 crocus_is_drawing_points(const struct crocus_context *ice)
4432 {
4433 const struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast;
4434
4435 if (cso_rast->cso.fill_front == PIPE_POLYGON_MODE_POINT ||
4436 cso_rast->cso.fill_back == PIPE_POLYGON_MODE_POINT)
4437 return true;
4438
4439 if (ice->shaders.prog[MESA_SHADER_GEOMETRY]) {
4440 const struct elk_gs_prog_data *gs_prog_data =
4441 (void *) ice->shaders.prog[MESA_SHADER_GEOMETRY]->prog_data;
4442 return gs_prog_data->output_topology == _3DPRIM_POINTLIST;
4443 } else if (ice->shaders.prog[MESA_SHADER_TESS_EVAL]) {
4444 const struct elk_tes_prog_data *tes_data =
4445 (void *) ice->shaders.prog[MESA_SHADER_TESS_EVAL]->prog_data;
4446 return tes_data->output_topology == INTEL_TESS_OUTPUT_TOPOLOGY_POINT;
4447 } else {
4448 return ice->state.prim_mode == MESA_PRIM_POINTS;
4449 }
4450 }
4451 #endif
4452
4453 #if GFX_VER >= 6
/**
 * Fill one SF_OUTPUT_ATTRIBUTE_DETAIL (attribute override) entry for a
 * fragment-shader input.
 *
 * \param attr                  entry to fill (zero-initialized by caller)
 * \param vue_map               VUE map of the last pre-rasterization stage
 * \param urb_entry_read_offset first 256-bit URB offset being read
 * \param fs_attr               VARYING_SLOT_* the fragment shader reads
 * \param two_side_color        whether two-sided lighting is enabled
 * \param max_source_attr       in/out: highest source attribute referenced,
 *                              later used to size the URB read length
 */
static void
get_attr_override(
   struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) *attr,
   const struct intel_vue_map *vue_map,
   int urb_entry_read_offset, int fs_attr,
   bool two_side_color, uint32_t *max_source_attr)
{
   /* Find the VUE slot for this attribute. */
   int slot = vue_map->varying_to_slot[fs_attr];

   /* Viewport and Layer are stored in the VUE header.  We need to override
    * them to zero if earlier stages didn't write them, as GL requires that
    * they read back as zero when not explicitly set.
    */
   if (fs_attr == VARYING_SLOT_VIEWPORT || fs_attr == VARYING_SLOT_LAYER) {
      attr->ComponentOverrideX = true;
      attr->ComponentOverrideW = true;
      attr->ConstantSource = CONST_0000;

      if (!(vue_map->slots_valid & VARYING_BIT_LAYER))
         attr->ComponentOverrideY = true;
      if (!(vue_map->slots_valid & VARYING_BIT_VIEWPORT))
         attr->ComponentOverrideZ = true;

      return;
   }

   /* If there was only a back color written but not front, use back
    * as the color instead of undefined
    */
   if (slot == -1 && fs_attr == VARYING_SLOT_COL0)
      slot = vue_map->varying_to_slot[VARYING_SLOT_BFC0];
   if (slot == -1 && fs_attr == VARYING_SLOT_COL1)
      slot = vue_map->varying_to_slot[VARYING_SLOT_BFC1];

   if (slot == -1) {
      /* This attribute does not exist in the VUE--that means that the vertex
       * shader did not write to it. This means that either:
       *
       * (a) This attribute is a texture coordinate, and it is going to be
       * replaced with point coordinates (as a consequence of a call to
       * glTexEnvi(GL_POINT_SPRITE, GL_COORD_REPLACE, GL_TRUE)), so the
       * hardware will ignore whatever attribute override we supply.
       *
       * (b) This attribute is read by the fragment shader but not written by
       * the vertex shader, so its value is undefined. Therefore the
       * attribute override we supply doesn't matter.
       *
       * (c) This attribute is gl_PrimitiveID, and it wasn't written by the
       * previous shader stage.
       *
       * Note that we don't have to worry about the cases where the attribute
       * is gl_PointCoord or is undergoing point sprite coordinate
       * replacement, because in those cases, this function isn't called.
       *
       * In case (c), we need to program the attribute overrides so that the
       * primitive ID will be stored in this slot. In every other case, the
       * attribute override we supply doesn't matter. So just go ahead and
       * program primitive ID in every case.
       */
      attr->ComponentOverrideW = true;
      attr->ComponentOverrideX = true;
      attr->ComponentOverrideY = true;
      attr->ComponentOverrideZ = true;
      attr->ConstantSource = PRIM_ID;
      return;
   }

   /* Compute the location of the attribute relative to urb_entry_read_offset.
    * Each increment of urb_entry_read_offset represents a 256-bit value, so
    * it counts for two 128-bit VUE slots.
    */
   int source_attr = slot - 2 * urb_entry_read_offset;
   assert(source_attr >= 0 && source_attr < 32);

   /* If we are doing two-sided color, and the VUE slot following this one
    * represents a back-facing color, then we need to instruct the SF unit to
    * do back-facing swizzling.
    */
   bool swizzling = two_side_color &&
      ((vue_map->slot_to_varying[slot] == VARYING_SLOT_COL0 &&
        vue_map->slot_to_varying[slot+1] == VARYING_SLOT_BFC0) ||
       (vue_map->slot_to_varying[slot] == VARYING_SLOT_COL1 &&
        vue_map->slot_to_varying[slot+1] == VARYING_SLOT_BFC1));

   /* Update max_source_attr.  If swizzling, the SF will read this slot + 1. */
   if (*max_source_attr < source_attr + swizzling)
      *max_source_attr = source_attr + swizzling;

   attr->SourceAttribute = source_attr;
   if (swizzling)
      attr->SwizzleSelect = INPUTATTR_FACING;
}
4547
/**
 * Build the full set of SF/SBE attribute overrides for the bound fragment
 * shader, along with the point-sprite enable mask and the URB read window
 * (offset and length) the hardware should use.
 */
static void
calculate_attr_overrides(
   const struct crocus_context *ice,
   struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) *attr_overrides,
   uint32_t *point_sprite_enables,
   uint32_t *urb_entry_read_length,
   uint32_t *urb_entry_read_offset)
{
   const struct elk_wm_prog_data *wm_prog_data = (void *)
      ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data;
   const struct intel_vue_map *vue_map = ice->shaders.last_vue_map;
   const struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast;
   uint32_t max_source_attr = 0;
   const struct shader_info *fs_info =
      crocus_get_shader_info(ice, MESA_SHADER_FRAGMENT);

   int first_slot =
      elk_compute_first_urb_slot_required(fs_info->inputs_read, vue_map);

   /* Each URB offset packs two varying slots */
   assert(first_slot % 2 == 0);
   *urb_entry_read_offset = first_slot / 2;
   *point_sprite_enables = 0;

   for (int fs_attr = 0; fs_attr < VARYING_SLOT_MAX; fs_attr++) {
      const int input_index = wm_prog_data->urb_setup[fs_attr];

      /* Negative means the FS doesn't read this varying; skip it. */
      if (input_index < 0)
         continue;

      /* When drawing points, enabled TEXn coordinates and gl_PointCoord
       * are replaced with hardware-generated point sprite coordinates, so
       * no attribute override applies to them.
       */
      bool point_sprite = false;
      if (crocus_is_drawing_points(ice)) {
         if (fs_attr >= VARYING_SLOT_TEX0 &&
             fs_attr <= VARYING_SLOT_TEX7 &&
             cso_rast->cso.sprite_coord_enable & (1 << (fs_attr - VARYING_SLOT_TEX0)))
            point_sprite = true;

         if (fs_attr == VARYING_SLOT_PNTC)
            point_sprite = true;

         if (point_sprite)
            *point_sprite_enables |= 1U << input_index;
      }

      struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) attribute = { 0 };
      if (!point_sprite) {
         get_attr_override(&attribute, vue_map, *urb_entry_read_offset, fs_attr,
                           cso_rast->cso.light_twoside, &max_source_attr);
      }

      /* The hardware can only do the overrides on 16 overrides at a
       * time, and the other up to 16 have to be lined up so that the
       * input index = the output index. We'll need to do some
       * tweaking to make sure that's the case.
       */
      if (input_index < 16)
         attr_overrides[input_index] = attribute;
      else
         assert(attribute.SourceAttribute == input_index);
   }

   /* From the Sandy Bridge PRM, Volume 2, Part 1, documentation for
    * 3DSTATE_SF DWord 1 bits 15:11, "Vertex URB Entry Read Length":
    *
    * "This field should be set to the minimum length required to read the
    * maximum source attribute. The maximum source attribute is indicated
    * by the maximum value of the enabled Attribute # Source Attribute if
    * Attribute Swizzle Enable is set, Number of Output Attributes-1 if
    * enable is not set.
    * read_length = ceiling((max_source_attr + 1) / 2)
    *
    * [errata] Corruption/Hang possible if length programmed larger than
    * recommended"
    *
    * Similar text exists for Ivy Bridge.
    */
   *urb_entry_read_length = DIV_ROUND_UP(max_source_attr + 1, 2);
}
4626 #endif
4627
4628 #if GFX_VER >= 7
/**
 * Emit 3DSTATE_SBE (setup backend): attribute swizzles/overrides, point
 * sprite coordinate replacement, and the URB read window for FS inputs.
 */
static void
crocus_emit_sbe(struct crocus_batch *batch, const struct crocus_context *ice)
{
   const struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast;
   const struct elk_wm_prog_data *wm_prog_data = (void *)
      ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data;
#if GFX_VER >= 8
   struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) attr_overrides[16] = { { 0 } };
#else
   /* Before Gen8 the override array lives directly inside 3DSTATE_SBE, so
    * alias the name onto the packet field and let
    * calculate_attr_overrides() fill it in-place.
    */
#define attr_overrides sbe.Attribute
#endif

   uint32_t urb_entry_read_length;
   uint32_t urb_entry_read_offset;
   uint32_t point_sprite_enables;

   crocus_emit_cmd(batch, GENX(3DSTATE_SBE), sbe) {
      sbe.AttributeSwizzleEnable = true;
      sbe.NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs;
      sbe.PointSpriteTextureCoordinateOrigin = cso_rast->cso.sprite_coord_mode;

      calculate_attr_overrides(ice,
                               attr_overrides,
                               &point_sprite_enables,
                               &urb_entry_read_length,
                               &urb_entry_read_offset);
      sbe.VertexURBEntryReadOffset = urb_entry_read_offset;
      sbe.VertexURBEntryReadLength = urb_entry_read_length;
      sbe.ConstantInterpolationEnable = wm_prog_data->flat_inputs;
      sbe.PointSpriteTextureCoordinateEnable = point_sprite_enables;
#if GFX_VER >= 8
      sbe.ForceVertexURBEntryReadLength = true;
      sbe.ForceVertexURBEntryReadOffset = true;
#endif
   }
#if GFX_VER >= 8
   /* On Gen8 the attribute overrides move to a separate SBE_SWIZ packet. */
   crocus_emit_cmd(batch, GENX(3DSTATE_SBE_SWIZ), sbes) {
      for (int i = 0; i < 16; i++)
         sbes.Attribute[i] = attr_overrides[i];
   }
#endif
}
4671 #endif
4672
4673 /* ------------------------------------------------------------------- */
4674
4675 /**
4676 * Populate VS program key fields based on the current state.
4677 */
static void
crocus_populate_vs_key(const struct crocus_context *ice,
                       const struct shader_info *info,
                       gl_shader_stage last_stage,
                       struct elk_vs_prog_key *key)
{
   const struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast;

   /* Lower user clip planes in the VS only when it is the last stage
    * before rasterization and emits no gl_ClipDistance of its own.
    */
   if (info->clip_distance_array_size == 0 &&
       (info->outputs_written & (VARYING_BIT_POS | VARYING_BIT_CLIP_VERTEX)) &&
       last_stage == MESA_SHADER_VERTEX)
      key->nr_userclip_plane_consts = cso_rast->num_clip_plane_consts;

   /* Clamp gl_PointSize in the shader when the VS feeds the rasterizer. */
   if (last_stage == MESA_SHADER_VERTEX &&
       info->outputs_written & (VARYING_BIT_PSIZ))
      key->clamp_pointsize = 1;

#if GFX_VER <= 5
   /* Pre-Gen6: copy edge flags for unfilled polygon modes and do point
    * sprite coordinate replacement in the VS.
    */
   key->copy_edgeflag = (cso_rast->cso.fill_back != PIPE_POLYGON_MODE_FILL ||
                         cso_rast->cso.fill_front != PIPE_POLYGON_MODE_FILL);
   key->point_coord_replace = cso_rast->cso.sprite_coord_enable & 0xff;
#endif

   key->clamp_vertex_color = cso_rast->cso.clamp_vertex_color;

#if GFX_VERx10 < 75
   /* Pre-Haswell needs per-attribute shader workarounds for some vertex
    * formats.  NOTE(review): this pairs inputs_read bits with vertex
    * elements in bind order -- confirm the two orderings always match.
    */
   uint64_t inputs_read = info->inputs_read;
   int ve_idx = 0;
   while (inputs_read) {
      int i = u_bit_scan64(&inputs_read);
      key->gl_attrib_wa_flags[i] = ice->state.cso_vertex_elements->wa_flags[ve_idx];
      ve_idx++;
   }
#endif
}
4713
4714 /**
4715 * Populate TCS program key fields based on the current state.
4716 */
static void
crocus_populate_tcs_key(const struct crocus_context *ice,
                        struct elk_tcs_prog_key *key)
{
   /* Intentionally empty: no dynamic context state currently affects TCS
    * compilation on this driver.
    */
}
4722
4723 /**
4724 * Populate TES program key fields based on the current state.
4725 */
4726 static void
crocus_populate_tes_key(const struct crocus_context * ice,const struct shader_info * info,gl_shader_stage last_stage,struct elk_tes_prog_key * key)4727 crocus_populate_tes_key(const struct crocus_context *ice,
4728 const struct shader_info *info,
4729 gl_shader_stage last_stage,
4730 struct elk_tes_prog_key *key)
4731 {
4732 const struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast;
4733
4734 if (info->clip_distance_array_size == 0 &&
4735 (info->outputs_written & (VARYING_BIT_POS | VARYING_BIT_CLIP_VERTEX)) &&
4736 last_stage == MESA_SHADER_TESS_EVAL)
4737 key->nr_userclip_plane_consts = cso_rast->num_clip_plane_consts;
4738
4739 if (last_stage == MESA_SHADER_TESS_EVAL &&
4740 info->outputs_written & (VARYING_BIT_PSIZ))
4741 key->clamp_pointsize = 1;
4742 }
4743
4744 /**
4745 * Populate GS program key fields based on the current state.
4746 */
4747 static void
crocus_populate_gs_key(const struct crocus_context * ice,const struct shader_info * info,gl_shader_stage last_stage,struct elk_gs_prog_key * key)4748 crocus_populate_gs_key(const struct crocus_context *ice,
4749 const struct shader_info *info,
4750 gl_shader_stage last_stage,
4751 struct elk_gs_prog_key *key)
4752 {
4753 const struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast;
4754
4755 if (info->clip_distance_array_size == 0 &&
4756 (info->outputs_written & (VARYING_BIT_POS | VARYING_BIT_CLIP_VERTEX)) &&
4757 last_stage == MESA_SHADER_GEOMETRY)
4758 key->nr_userclip_plane_consts = cso_rast->num_clip_plane_consts;
4759
4760 if (last_stage == MESA_SHADER_GEOMETRY &&
4761 info->outputs_written & (VARYING_BIT_PSIZ))
4762 key->clamp_pointsize = 1;
4763 }
4764
4765 /**
4766 * Populate FS program key fields based on the current state.
4767 */
static void
crocus_populate_fs_key(const struct crocus_context *ice,
                       const struct shader_info *info,
                       struct elk_wm_prog_key *key)
{
   struct crocus_screen *screen = (void *) ice->ctx.screen;
   const struct pipe_framebuffer_state *fb = &ice->state.framebuffer;
   const struct crocus_depth_stencil_alpha_state *zsa = ice->state.cso_zsa;
   const struct crocus_rasterizer_state *rast = ice->state.cso_rast;
   const struct crocus_blend_state *blend = ice->state.cso_blend;

#if GFX_VER < 6
   /* Pre-Gen6: build the "IZ" lookup bitfield describing discard /
    * alpha-test, shader-computed depth, and depth/stencil test + write
    * enables, which selects the WM variant.
    */
   uint32_t lookup = 0;

   if (info->fs.uses_discard || zsa->cso.alpha_enabled)
      lookup |= ELK_WM_IZ_PS_KILL_ALPHATEST_BIT;

   if (info->outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
      lookup |= ELK_WM_IZ_PS_COMPUTES_DEPTH_BIT;

   if (fb->zsbuf && zsa->cso.depth_enabled) {
      lookup |= ELK_WM_IZ_DEPTH_TEST_ENABLE_BIT;

      if (zsa->cso.depth_writemask)
         lookup |= ELK_WM_IZ_DEPTH_WRITE_ENABLE_BIT;

   }
   if (zsa->cso.stencil[0].enabled || zsa->cso.stencil[1].enabled) {
      lookup |= ELK_WM_IZ_STENCIL_TEST_ENABLE_BIT;
      if (zsa->cso.stencil[0].writemask || zsa->cso.stencil[1].writemask)
         lookup |= ELK_WM_IZ_STENCIL_WRITE_ENABLE_BIT;
   }
   key->iz_lookup = lookup;
   key->stats_wm = ice->state.stats_wm;
#endif

   /* Classify whether antialiased lines can occur: ALWAYS for line prims,
    * SOMETIMES/ALWAYS for triangles in line fill mode depending on which
    * faces are line-filled and which are culled.
    */
   uint32_t line_aa = ELK_NEVER;
   if (rast->cso.line_smooth) {
      int reduced_prim = ice->state.reduced_prim_mode;
      if (reduced_prim == MESA_PRIM_LINES)
         line_aa = ELK_ALWAYS;
      else if (reduced_prim == MESA_PRIM_TRIANGLES) {
         if (rast->cso.fill_front == PIPE_POLYGON_MODE_LINE) {
            line_aa = ELK_SOMETIMES;

            if (rast->cso.fill_back == PIPE_POLYGON_MODE_LINE ||
                rast->cso.cull_face == PIPE_FACE_BACK)
               line_aa = ELK_ALWAYS;
         } else if (rast->cso.fill_back == PIPE_POLYGON_MODE_LINE) {
            line_aa = ELK_SOMETIMES;

            if (rast->cso.cull_face == PIPE_FACE_FRONT)
               line_aa = ELK_ALWAYS;
         }
      }
   }
   key->line_aa = line_aa;

   key->nr_color_regions = fb->nr_cbufs;

   key->clamp_fragment_color = rast->cso.clamp_fragment_color;

   key->alpha_to_coverage = blend->cso.alpha_to_coverage ?
      ELK_ALWAYS : ELK_NEVER;

   /* With multiple render targets and alpha test enabled, RT0's alpha must
    * be replicated for the test.
    */
   key->alpha_test_replicate_alpha = fb->nr_cbufs > 1 && zsa->cso.alpha_enabled;

   key->flat_shade = rast->cso.flatshade &&
      (info->inputs_read & (VARYING_BIT_COL0 | VARYING_BIT_COL1));

   const bool multisample_fbo = rast->cso.multisample && fb->samples > 1;
   key->multisample_fbo = multisample_fbo ? ELK_ALWAYS : ELK_NEVER;
   key->persample_interp =
      rast->cso.force_persample_interp ? ELK_ALWAYS : ELK_NEVER;

   key->ignore_sample_mask_out = !multisample_fbo;
   key->coherent_fb_fetch = false; // TODO: needed?

   key->force_dual_color_blend =
      screen->driconf.dual_color_blend_by_location &&
      (blend->blend_enables & 1) && blend->dual_color_blending;

#if GFX_VER <= 5
   /* Pre-Gen6 with multiple color buffers: emit the alpha test in the
    * shader.  NOTE(review): single-RT alpha test is presumably handled by
    * fixed function here -- confirm.
    */
   if (fb->nr_cbufs > 1 && zsa->cso.alpha_enabled) {
      key->emit_alpha_test = true;
      key->alpha_test_func = zsa->cso.alpha_func;
      key->alpha_test_ref = zsa->cso.alpha_ref_value;
   }
#endif
}
4858
/* Populate CS program key fields based on the current state. */
static void
crocus_populate_cs_key(const struct crocus_context *ice,
                       struct elk_cs_prog_key *key)
{
   /* Intentionally empty: no dynamic context state currently affects CS
    * compilation on this driver.
    */
}
4864
#if GFX_VER == 4
/* Gen4: the kernel start pointer is a read-only relocation into the shader
 * cache BO.  No trailing semicolon in the expansion -- callers write their
 * own, and a stray one would break expression-context use.
 */
#define KSP(ice, shader) ro_bo((ice)->shaders.cache_bo, (shader)->offset)
#elif GFX_VER >= 5
/* Gen5+: the kernel start pointer is simply the compiled shader's offset
 * within the cache.
 */
static uint64_t
KSP(const struct crocus_context *ice, const struct crocus_compiled_shader *shader)
{
   return shader->offset;
}
#endif
4874
4875 /* Gen11 workaround table #2056 WABTPPrefetchDisable suggests to disable
4876 * prefetching of binding tables in A0 and B0 steppings. XXX: Revisit
4877 * this WA on C0 stepping.
4878 *
4879 * TODO: Fill out SamplerCount for prefetching?
4880 */
4881
/* Shared boilerplate for the fixed-function shader stage packets
 * (3DSTATE_VS/HS/DS/GS): kernel start pointer, binding table size, URB
 * dispatch layout, statistics, and per-thread scratch space.  Comments are
 * kept outside the macro -- a '//' comment inside a backslash-continued
 * macro would swallow the following line.
 */
#define INIT_THREAD_DISPATCH_FIELDS(pkt, prefix, stage)                 \
   pkt.KernelStartPointer = KSP(ice, shader);                           \
   pkt.BindingTableEntryCount = shader->bt.size_bytes / 4;              \
   pkt.FloatingPointMode = prog_data->use_alt_mode;                     \
                                                                        \
   pkt.DispatchGRFStartRegisterForURBData =                             \
      prog_data->dispatch_grf_start_reg;                                \
   pkt.prefix##URBEntryReadLength = vue_prog_data->urb_read_length;     \
   pkt.prefix##URBEntryReadOffset = 0;                                  \
                                                                        \
   pkt.StatisticsEnable = true;                                         \
   pkt.Enable = true;                                                   \
                                                                        \
   if (prog_data->total_scratch) {                                      \
      struct crocus_bo *bo =                                            \
         crocus_get_scratch_space(ice, prog_data->total_scratch, stage); \
      pkt.PerThreadScratchSpace = ffs(prog_data->total_scratch) - 11;   \
      pkt.ScratchSpaceBasePointer = rw_bo(bo, 0);                       \
   }
4901
4902 /* ------------------------------------------------------------------- */
4903 #if GFX_VER >= 6
/* Per-stage sub-opcode of the 3DSTATE_CONSTANT_* push-constant commands,
 * indexed by gl_shader_stage.  NOTE(review): compute is 0 -- CS push
 * constants presumably take a different upload path; confirm.
 */
static const uint32_t push_constant_opcodes[] = {
   [MESA_SHADER_VERTEX] = 21,
   [MESA_SHADER_TESS_CTRL] = 25, /* HS */
   [MESA_SHADER_TESS_EVAL] = 26, /* DS */
   [MESA_SHADER_GEOMETRY] = 22,
   [MESA_SHADER_FRAGMENT] = 23,
   [MESA_SHADER_COMPUTE] = 0,
};
4912 #endif
4913
/**
 * Stream out a null SURFACE_STATE with the given dimensions and store its
 * offset within the batch's surface state in *out_offset.
 */
static void
emit_sized_null_surface(struct crocus_batch *batch,
                        unsigned width, unsigned height,
                        unsigned layers, unsigned levels,
                        unsigned minimum_array_element,
                        uint32_t *out_offset)
{
   struct isl_device *isl_dev = &batch->screen->isl_dev;
   uint32_t *surf = stream_state(batch, isl_dev->ss.size,
                                 isl_dev->ss.align,
                                 out_offset);
   //TODO gen 6 multisample crash
   isl_null_fill_state(isl_dev, surf,
                       .size = isl_extent3d(width, height, layers),
                       .levels = levels,
                       .minimum_array_element = minimum_array_element);
}
/* Stream out a minimal 1x1x1 null surface (levels = 0). */
static void
emit_null_surface(struct crocus_batch *batch,
                  uint32_t *out_offset)
{
   emit_sized_null_surface(batch, 1, 1, 1, 0, 0, out_offset);
}
4937
/**
 * Stream out a null surface sized to match the current framebuffer, so
 * that unbound render targets still get in-bounds coordinates.
 */
static void
emit_null_fb_surface(struct crocus_batch *batch,
                     struct crocus_context *ice,
                     uint32_t *out_offset)
{
   uint32_t width, height, layers, level, layer;
   /* If set_framebuffer_state() was never called, fall back to 1x1x1 */
   if (ice->state.framebuffer.width == 0 && ice->state.framebuffer.height == 0) {
      emit_null_surface(batch, out_offset);
      return;
   }

   struct pipe_framebuffer_state *cso = &ice->state.framebuffer;
   width = MAX2(cso->width, 1);
   height = MAX2(cso->height, 1);
   layers = cso->layers ? cso->layers : 1;
   level = 0;
   layer = 0;

   /* Depth/stencil-only framebuffer: size the null surface to match it. */
   if (cso->nr_cbufs == 0 && cso->zsbuf) {
      width = cso->zsbuf->width;
      height = cso->zsbuf->height;
      level = cso->zsbuf->u.tex.level;
      layer = cso->zsbuf->u.tex.first_layer;
   }
   /* NOTE(review): 'level' is passed as the 'levels' (mip count) parameter
    * and 'layer' as 'minimum_array_element' of emit_sized_null_surface --
    * confirm this mapping is intentional.
    */
   emit_sized_null_surface(batch, width, height,
                           layers, level, layer,
                           out_offset);
}
4967
/**
 * Fill a SURFACE_STATE at surf_state (already reserved in the batch at
 * addr_offset), emitting relocations for the main and aux buffers.
 *
 * \param adjust_surf   re-derive a per-image surface for 3D single-slice
 *                      views and Gen4 cube faces, and flatten 1D arrays
 *                      to 2D
 * \param writeable     whether the GPU may write through this view
 *                      (marks the relocation RELOC_WRITE)
 * \param blend_enable  / \param write_disables  only programmed on
 *                      GFX_VER <= 5, where they live in SURFACE_STATE
 */
static void
emit_surface_state(struct crocus_batch *batch,
                   struct crocus_resource *res,
                   const struct isl_surf *in_surf,
                   bool adjust_surf,
                   struct isl_view *in_view,
                   bool writeable,
                   enum isl_aux_usage aux_usage,
                   bool blend_enable,
                   uint32_t write_disables,
                   uint32_t *surf_state,
                   uint32_t addr_offset)
{
   struct isl_device *isl_dev = &batch->screen->isl_dev;
   uint32_t reloc = RELOC_32BIT;
   uint64_t offset_B = res->offset;
   uint32_t tile_x_sa = 0, tile_y_sa = 0;

   if (writeable)
      reloc |= RELOC_WRITE;

   /* Work on local copies; the adjust path below rewrites both. */
   struct isl_surf surf = *in_surf;
   struct isl_view view = *in_view;
   if (adjust_surf) {
      /* Carve out the single selected image as its own surface, folding
       * the level/layer into offset_B (plus intratile x/y offsets), and
       * rebase the view to level 0 / layer 0.
       */
      if (res->base.b.target == PIPE_TEXTURE_3D && view.array_len == 1) {
         isl_surf_get_image_surf(isl_dev, in_surf,
                                 view.base_level, 0,
                                 view.base_array_layer,
                                 &surf, &offset_B,
                                 &tile_x_sa, &tile_y_sa);
         view.base_array_layer = 0;
         view.base_level = 0;
      } else if (res->base.b.target == PIPE_TEXTURE_CUBE && GFX_VER == 4) {
         isl_surf_get_image_surf(isl_dev, in_surf,
                                 view.base_level, view.base_array_layer,
                                 0,
                                 &surf, &offset_B,
                                 &tile_x_sa, &tile_y_sa);
         view.base_array_layer = 0;
         view.base_level = 0;
      } else if (res->base.b.target == PIPE_TEXTURE_1D_ARRAY)
         surf.dim = ISL_SURF_DIM_2D;
   }

   /* Pick up aux (e.g. MCS/HiZ) surface and clear color when in use. */
   union isl_color_value clear_color = { .u32 = { 0, 0, 0, 0 } };
   struct crocus_bo *aux_bo = NULL;
   uint32_t aux_offset = 0;
   struct isl_surf *aux_surf = NULL;
   if (aux_usage != ISL_AUX_USAGE_NONE) {
      aux_surf = &res->aux.surf;
      aux_offset = res->aux.offset;
      aux_bo = res->aux.bo;

      clear_color = crocus_resource_get_clear_color(res);
   }

   isl_surf_fill_state(isl_dev, surf_state,
                       .surf = &surf,
                       .view = &view,
                       .address = crocus_state_reloc(batch,
                                                     addr_offset + isl_dev->ss.addr_offset,
                                                     res->bo, offset_B, reloc),
                       .aux_surf = aux_surf,
                       .aux_usage = aux_usage,
                       .aux_address = aux_offset,
                       .mocs = crocus_mocs(res->bo, isl_dev),
                       .clear_color = clear_color,
                       .use_clear_address = false,
                       .clear_address = 0,
                       .x_offset_sa = tile_x_sa,
                       .y_offset_sa = tile_y_sa,
#if GFX_VER <= 5
                       .blend_enable = blend_enable,
                       .write_disables = write_disables,
#endif
                       );

   if (aux_surf) {
      /* On gen7 and prior, the upper 20 bits of surface state DWORD 6 are the
       * upper 20 bits of the GPU address of the MCS buffer; the lower 12 bits
       * contain other control information.  Since buffer addresses are always
       * on 4k boundaries (and thus have their lower 12 bits zero), we can use
       * an ordinary reloc to do the necessary address translation.
       *
       * FIXME: move to the point of assignment.
       */
      if (GFX_VER == 8) {
         uint64_t *aux_addr = (uint64_t *)(surf_state + (isl_dev->ss.aux_addr_offset / 4));
         *aux_addr = crocus_state_reloc(batch,
                                        addr_offset + isl_dev->ss.aux_addr_offset,
                                        aux_bo, *aux_addr,
                                        reloc);
      } else {
         uint32_t *aux_addr = surf_state + (isl_dev->ss.aux_addr_offset / 4);
         *aux_addr = crocus_state_reloc(batch,
                                        addr_offset + isl_dev->ss.aux_addr_offset,
                                        aux_bo, *aux_addr,
                                        reloc);
      }
   }

}
5070
/**
 * Stream out a writeable SURFACE_STATE for a render-target surface and
 * return its offset in the batch's surface state.
 */
static uint32_t
emit_surface(struct crocus_batch *batch,
             struct crocus_surface *surf,
             enum isl_aux_usage aux_usage,
             bool blend_enable,
             uint32_t write_disables)
{
   struct isl_device *isl_dev = &batch->screen->isl_dev;
   struct crocus_resource *res = (struct crocus_resource *)surf->base.texture;
   struct isl_view *view = &surf->view;
   uint32_t offset = 0;
   enum pipe_texture_target target = res->base.b.target;
   bool adjust_surf = false;

   /* On Gen4, cube render targets get remapped to a single-face image
    * surface (see the adjust path in emit_surface_state).
    */
   if (GFX_VER == 4 && target == PIPE_TEXTURE_CUBE)
      adjust_surf = true;

   /* Use the alignment-shadow resource when one exists. */
   if (surf->align_res)
      res = (struct crocus_resource *)surf->align_res;

   uint32_t *surf_state = stream_state(batch, isl_dev->ss.size, isl_dev->ss.align, &offset);

   /* Render targets are GPU-written: writeable = true. */
   emit_surface_state(batch, res, &surf->surf, adjust_surf, view, true,
                      aux_usage, blend_enable,
                      write_disables,
                      surf_state, offset);
   return offset;
}
5099
5100 static uint32_t
emit_rt_surface(struct crocus_batch * batch,struct crocus_surface * surf,enum isl_aux_usage aux_usage)5101 emit_rt_surface(struct crocus_batch *batch,
5102 struct crocus_surface *surf,
5103 enum isl_aux_usage aux_usage)
5104 {
5105 struct isl_device *isl_dev = &batch->screen->isl_dev;
5106 struct crocus_resource *res = (struct crocus_resource *)surf->base.texture;
5107 struct isl_view *view = &surf->read_view;
5108 uint32_t offset = 0;
5109 uint32_t *surf_state = stream_state(batch, isl_dev->ss.size, isl_dev->ss.align, &offset);
5110
5111 emit_surface_state(batch, res, &surf->surf, true, view, false,
5112 aux_usage, 0, false,
5113 surf_state, offset);
5114 return offset;
5115 }
5116
/**
 * Stream out a RAW buffer SURFACE_STATE over the 12-byte (three dword)
 * grid-size data and return its batch offset.  Presumably consumed by the
 * compute shader for gl_NumWorkGroups -- confirm against dispatch code.
 */
static uint32_t
emit_grid(struct crocus_context *ice,
          struct crocus_batch *batch)
{
   UNUSED struct isl_device *isl_dev = &batch->screen->isl_dev;
   uint32_t offset = 0;
   struct crocus_state_ref *grid_ref = &ice->state.grid_size;
   uint32_t *surf_state = stream_state(batch, isl_dev->ss.size,
                                       isl_dev->ss.align, &offset);
   isl_buffer_fill_state(isl_dev, surf_state,
                         .address = crocus_state_reloc(batch, offset + isl_dev->ss.addr_offset,
                                                       crocus_resource_bo(grid_ref->res),
                                                       grid_ref->offset,
                                                       RELOC_32BIT),
                         .size_B = 12,
                         .format = ISL_FORMAT_RAW,
                         .stride_B = 1,
                         .mocs = crocus_mocs(crocus_resource_bo(grid_ref->res), isl_dev));
   return offset;
}
5137
/**
 * Stream out a SURFACE_STATE for a bound uniform (constant) buffer and
 * return its offset in the batch's surface state area.
 */
static uint32_t
emit_ubo_buffer(struct crocus_context *ice,
                struct crocus_batch *batch,
                struct pipe_constant_buffer *buffer)
{
   UNUSED struct isl_device *isl_dev = &batch->screen->isl_dev;
   uint32_t offset = 0;

   /* Allocate room for one surface state in the batch state buffer. */
   uint32_t *surf_state = stream_state(batch, isl_dev->ss.size,
                                       isl_dev->ss.align, &offset);
   isl_buffer_fill_state(isl_dev, surf_state,
                         /* The relocation targets the address field inside
                          * the surface state we just allocated.
                          */
                         .address = crocus_state_reloc(batch, offset + isl_dev->ss.addr_offset,
                                                       crocus_resource_bo(buffer->buffer),
                                                       buffer->buffer_offset,
                                                       RELOC_32BIT),
                         .size_B = buffer->buffer_size,
                         /* NOTE(review): format 0 (the first isl_format enum
                          * value), unlike the SSBO path which uses
                          * ISL_FORMAT_RAW — confirm this is intentional.
                          */
                         .format = 0,
                         .swizzle = ISL_SWIZZLE_IDENTITY,
                         .stride_B = 1,
                         .mocs = crocus_mocs(crocus_resource_bo(buffer->buffer), isl_dev));

   return offset;
}
5161
5162 static uint32_t
emit_ssbo_buffer(struct crocus_context * ice,struct crocus_batch * batch,struct pipe_shader_buffer * buffer,bool writeable)5163 emit_ssbo_buffer(struct crocus_context *ice,
5164 struct crocus_batch *batch,
5165 struct pipe_shader_buffer *buffer, bool writeable)
5166 {
5167 UNUSED struct isl_device *isl_dev = &batch->screen->isl_dev;
5168 uint32_t offset = 0;
5169 uint32_t reloc = RELOC_32BIT;
5170
5171 if (writeable)
5172 reloc |= RELOC_WRITE;
5173 uint32_t *surf_state = stream_state(batch, isl_dev->ss.size,
5174 isl_dev->ss.align, &offset);
5175 isl_buffer_fill_state(isl_dev, surf_state,
5176 .address = crocus_state_reloc(batch, offset + isl_dev->ss.addr_offset,
5177 crocus_resource_bo(buffer->buffer),
5178 buffer->buffer_offset,
5179 reloc),
5180 .size_B = buffer->buffer_size,
5181 .format = ISL_FORMAT_RAW,
5182 .swizzle = ISL_SWIZZLE_IDENTITY,
5183 .stride_B = 1,
5184 .mocs = crocus_mocs(crocus_resource_bo(buffer->buffer), isl_dev));
5185
5186 return offset;
5187 }
5188
/**
 * Stream out a SURFACE_STATE for a sampler view and return its offset.
 *
 * Buffer textures get a buffer surface; everything else goes through
 * emit_surface_state() with the normal or gather view.
 *
 * \param for_gather  use isv->gather_view instead of the regular view.
 */
static uint32_t
emit_sampler_view(struct crocus_context *ice,
                  struct crocus_batch *batch,
                  bool for_gather,
                  struct crocus_sampler_view *isv)
{
   UNUSED struct isl_device *isl_dev = &batch->screen->isl_dev;
   uint32_t offset = 0;

   uint32_t *surf_state = stream_state(batch, isl_dev->ss.size,
                                       isl_dev->ss.align, &offset);

   if (isv->base.target == PIPE_BUFFER) {
      const struct isl_format_layout *fmtl = isl_format_get_layout(isv->view.format);
      /* RAW buffers are addressed in bytes; otherwise use the texel size. */
      const unsigned cpp = isv->view.format == ISL_FORMAT_RAW ? 1 : fmtl->bpb / 8;
      /* Clamp to the requested size, the space left in the BO past our
       * offset, and the maximum texture buffer size in bytes.
       */
      unsigned final_size =
         MIN3(isv->base.u.buf.size, isv->res->bo->size - isv->res->offset,
              CROCUS_MAX_TEXTURE_BUFFER_SIZE * cpp);
      isl_buffer_fill_state(isl_dev, surf_state,
                            .address = crocus_state_reloc(batch, offset + isl_dev->ss.addr_offset,
                                                          isv->res->bo,
                                                          isv->res->offset + isv->base.u.buf.offset, RELOC_32BIT),
                            .size_B = final_size,
                            .format = isv->view.format,
                            .swizzle = isv->view.swizzle,
                            .stride_B = cpp,
                            .mocs = crocus_mocs(isv->res->bo, isl_dev)
         );
   } else {
      enum isl_aux_usage aux_usage =
         crocus_resource_texture_aux_usage(isv->res);

      /* Not a render target: no blend enable or write disables. */
      emit_surface_state(batch, isv->res, &isv->res->surf, false,
                         for_gather ? &isv->gather_view : &isv->view,
                         false, aux_usage, false,
                         0, surf_state, offset);
   }
   return offset;
}
5228
/**
 * Stream out a SURFACE_STATE for a shader image (load/store image) and
 * return its offset.
 *
 * Three cases: buffer images, typeless (RAW format) images on textures,
 * and regular typed images.  Writable images get a write relocation.
 */
static uint32_t
emit_image_view(struct crocus_context *ice,
                struct crocus_batch *batch,
                struct crocus_image_view *iv)
{
   UNUSED struct isl_device *isl_dev = &batch->screen->isl_dev;
   uint32_t offset = 0;

   struct crocus_resource *res = (struct crocus_resource *)iv->base.resource;
   uint32_t *surf_state = stream_state(batch, isl_dev->ss.size,
                                       isl_dev->ss.align, &offset);
   bool write = iv->base.shader_access & PIPE_IMAGE_ACCESS_WRITE;
   uint32_t reloc = RELOC_32BIT | (write ? RELOC_WRITE : 0);
   if (res->base.b.target == PIPE_BUFFER) {
      const struct isl_format_layout *fmtl = isl_format_get_layout(iv->view.format);
      /* RAW buffers are addressed in bytes; otherwise use the texel size. */
      const unsigned cpp = iv->view.format == ISL_FORMAT_RAW ? 1 : fmtl->bpb / 8;
      /* Clamp to the requested size, what remains in the BO after the
       * resource and view offsets, and the maximum texture buffer size.
       */
      unsigned final_size =
         MIN3(iv->base.u.buf.size, res->bo->size - res->offset - iv->base.u.buf.offset,
              CROCUS_MAX_TEXTURE_BUFFER_SIZE * cpp);
      isl_buffer_fill_state(isl_dev, surf_state,
                            .address = crocus_state_reloc(batch, offset + isl_dev->ss.addr_offset,
                                                          res->bo,
                                                          res->offset + iv->base.u.buf.offset, reloc),
                            .size_B = final_size,
                            .format = iv->view.format,
                            .swizzle = iv->view.swizzle,
                            .stride_B = cpp,
                            .mocs = crocus_mocs(res->bo, isl_dev)
         );
   } else {
      if (iv->view.format == ISL_FORMAT_RAW) {
         /* Typeless access: expose the whole BO past the resource offset
          * as a raw byte buffer.
          */
         isl_buffer_fill_state(isl_dev, surf_state,
                               .address = crocus_state_reloc(batch, offset + isl_dev->ss.addr_offset,
                                                             res->bo,
                                                             res->offset, reloc),
                               .size_B = res->bo->size - res->offset,
                               .format = iv->view.format,
                               .swizzle = iv->view.swizzle,
                               .stride_B = 1,
                               .mocs = crocus_mocs(res->bo, isl_dev),
            );


      } else {
         /* Typed image: emit a full surface state with no aux usage. */
         emit_surface_state(batch, res,
                            &res->surf, false, &iv->view,
                            write, 0, false,
                            0, surf_state, offset);
      }
   }

   return offset;
}
5282
#if GFX_VER == 6
/**
 * Stream out a SURFACE_STATE for one transform feedback (stream output)
 * binding and return its offset (0 if streamout is inactive or the output
 * index is out of range).
 *
 * On gen6, SOL writes happen through binding table entries written by the
 * geometry shader; each output gets its own surface sized so the last
 * element that fits still starts within the buffer.
 */
static uint32_t
emit_sol_surface(struct crocus_batch *batch,
                 struct pipe_stream_output_info *so_info,
                 uint32_t idx)
{
   struct crocus_context *ice = batch->ice;

   if (idx >= so_info->num_outputs || !ice->state.streamout_active)
      return 0;
   const struct pipe_stream_output *output = &so_info->output[idx];
   const int buffer = output->output_buffer;
   assert(output->stream == 0);

   struct crocus_resource *buf = (struct crocus_resource *)ice->state.so_target[buffer]->buffer;
   unsigned stride_dwords = so_info->stride[buffer];
   unsigned offset_dwords = ice->state.so_target[buffer]->buffer_offset / 4 + output->dst_offset;

   /* End of the usable buffer range, in dwords from the BO start. */
   size_t size_dwords = (ice->state.so_target[buffer]->buffer_offset + ice->state.so_target[buffer]->buffer_size) / 4;
   unsigned num_vector_components = output->num_components;
   unsigned num_elements;
   /* FIXME: can we rely on core Mesa to ensure that the buffer isn't
    * too big to map using a single binding table entry?
    */
   // assert((size_dwords - offset_dwords) / stride_dwords
   //        <= ELK_MAX_NUM_BUFFER_ENTRIES);

   if (size_dwords > offset_dwords + num_vector_components) {
      /* There is room for at least 1 transform feedback output in the buffer.
       * Compute the number of additional transform feedback outputs the
       * buffer has room for.
       */
      num_elements =
         (size_dwords - offset_dwords - num_vector_components);
   } else {
      /* There isn't even room for a single transform feedback output in the
       * buffer.  We can't configure the binding table entry to prevent output
       * entirely; we'll have to rely on the geometry shader to detect
       * overflow.  But to minimize the damage in case of a bug, set up the
       * binding table entry to just allow a single output.
       */
      num_elements = 0;
   }
   num_elements += stride_dwords;

   /* Pick the surface format matching the output's component count. */
   uint32_t surface_format;
   switch (num_vector_components) {
   case 1:
      surface_format = ISL_FORMAT_R32_FLOAT;
      break;
   case 2:
      surface_format = ISL_FORMAT_R32G32_FLOAT;
      break;
   case 3:
      surface_format = ISL_FORMAT_R32G32B32_FLOAT;
      break;
   case 4:
      surface_format = ISL_FORMAT_R32G32B32A32_FLOAT;
      break;
   default:
      unreachable("Invalid vector size for transform feedback output");
   }

   UNUSED struct isl_device *isl_dev = &batch->screen->isl_dev;
   uint32_t offset = 0;

   uint32_t *surf_state = stream_state(batch, isl_dev->ss.size,
                                       isl_dev->ss.align, &offset);
   isl_buffer_fill_state(isl_dev, surf_state,
                         .address = crocus_state_reloc(batch, offset + isl_dev->ss.addr_offset,
                                                       crocus_resource_bo(&buf->base.b),
                                                       offset_dwords * 4, RELOC_32BIT|RELOC_WRITE),
                         .size_B = num_elements * 4,
                         .stride_B = stride_dwords * 4,
                         .swizzle = ISL_SWIZZLE_IDENTITY,
                         .format = surface_format);
   return offset;
}
#endif
5362
/* Iterate over the slots of a binding-table surface @group, skipping
 * entries that were compacted out (CROCUS_SURFACE_NOT_USED).  Relies on a
 * `struct crocus_binding_table *bt` being in scope at the use site.
 */
#define foreach_surface_used(index, group) \
   for (int index = 0; index < bt->sizes[group]; index++) \
      if (crocus_group_index_to_bti(bt, group, index) != \
          CROCUS_SURFACE_NOT_USED)
5367
/**
 * Stream out surface states for every binding table entry a shader stage
 * uses, recording the offsets into shader->surf_offset in binding-table
 * order (render targets, RT reads, CS work groups, SOL, textures, gather
 * textures, images, UBOs, SSBOs).
 *
 * \param ff_gs  use the fixed-function GS program instead of
 *               ice->shaders.prog[stage] (gen6 transform feedback); in that
 *               case there is no crocus_shader_state.
 */
static void
crocus_populate_binding_table(struct crocus_context *ice,
                              struct crocus_batch *batch,
                              gl_shader_stage stage, bool ff_gs)
{
   struct crocus_compiled_shader *shader = ff_gs ? ice->shaders.ff_gs_prog : ice->shaders.prog[stage];
   struct crocus_shader_state *shs = ff_gs ? NULL : &ice->state.shaders[stage];
   if (!shader)
      return;

   struct crocus_binding_table *bt = &shader->bt;
   int s = 0;   /* running binding-table slot index */
   uint32_t *surf_offsets = shader->surf_offset;

#if GFX_VER < 8
   const struct shader_info *info = crocus_get_shader_info(ice, stage);
#endif

   if (stage == MESA_SHADER_FRAGMENT) {
      struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
      /* Note that cso_fb->nr_cbufs == fs_key->nr_color_regions. */
      if (cso_fb->nr_cbufs) {
         for (unsigned i = 0; i < cso_fb->nr_cbufs; i++) {
            uint32_t write_disables = 0;
            bool blend_enable = false;
#if GFX_VER <= 5
            /* On gen4/5 color masking and blending live in the surface
             * state rather than separate blend state, so derive them from
             * the bound blend CSO here.
             */
            const struct pipe_rt_blend_state *rt =
               &ice->state.cso_blend->cso.rt[ice->state.cso_blend->cso.independent_blend_enable ? i : 0];
            struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_FRAGMENT];
            struct elk_wm_prog_data *wm_prog_data = (void *) shader->prog_data;
            write_disables |= (rt->colormask & PIPE_MASK_A) ? 0x0 : 0x8;
            write_disables |= (rt->colormask & PIPE_MASK_R) ? 0x0 : 0x4;
            write_disables |= (rt->colormask & PIPE_MASK_G) ? 0x0 : 0x2;
            write_disables |= (rt->colormask & PIPE_MASK_B) ? 0x0 : 0x1;
            /* Gen4/5 can't handle blending off when a dual src blend wm is enabled. */
            blend_enable = rt->blend_enable || wm_prog_data->dual_src_blend;
#endif
            if (cso_fb->cbufs[i]) {
               surf_offsets[s] = emit_surface(batch,
                                              (struct crocus_surface *)cso_fb->cbufs[i],
                                              ice->state.draw_aux_usage[i],
                                              blend_enable,
                                              write_disables);
            } else {
               /* No color buffer bound in this slot: use a null surface. */
               emit_null_fb_surface(batch, ice, &surf_offsets[s]);
            }
            s++;
         }
      } else {
         /* No color buffers at all: a single null render target. */
         emit_null_fb_surface(batch, ice, &surf_offsets[s]);
         s++;
      }

      /* Render-target read surfaces (e.g. framebuffer fetch). */
      foreach_surface_used(i, CROCUS_SURFACE_GROUP_RENDER_TARGET_READ) {
         struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
         if (cso_fb->cbufs[i]) {
            surf_offsets[s++] = emit_rt_surface(batch,
                                                (struct crocus_surface *)cso_fb->cbufs[i],
                                                ice->state.draw_aux_usage[i]);
         }
      }
   }

   if (stage == MESA_SHADER_COMPUTE) {
      /* gl_NumWorkGroups buffer surface. */
      foreach_surface_used(i, CROCUS_SURFACE_GROUP_CS_WORK_GROUPS) {
         surf_offsets[s] = emit_grid(ice, batch);
         s++;
      }
   }

#if GFX_VER == 6
   if (stage == MESA_SHADER_GEOMETRY) {
      /* Transform feedback info comes from the GS if present, else the VS. */
      struct pipe_stream_output_info *so_info;
      if (ice->shaders.uncompiled[MESA_SHADER_GEOMETRY])
         so_info = &ice->shaders.uncompiled[MESA_SHADER_GEOMETRY]->stream_output;
      else
         so_info = &ice->shaders.uncompiled[MESA_SHADER_VERTEX]->stream_output;

      foreach_surface_used(i, CROCUS_SURFACE_GROUP_SOL) {
         surf_offsets[s] = emit_sol_surface(batch, so_info, i);
         s++;
      }
   }
#endif

   foreach_surface_used(i, CROCUS_SURFACE_GROUP_TEXTURE) {
      struct crocus_sampler_view *view = shs->textures[i];
      if (view)
         surf_offsets[s] = emit_sampler_view(ice, batch, false, view);
      else
         emit_null_surface(batch, &surf_offsets[s]);
      s++;
   }

#if GFX_VER < 8
   /* Separate gather views, only when the shader actually gathers. */
   if (info && info->uses_texture_gather) {
      foreach_surface_used(i, CROCUS_SURFACE_GROUP_TEXTURE_GATHER) {
         struct crocus_sampler_view *view = shs->textures[i];
         if (view)
            surf_offsets[s] = emit_sampler_view(ice, batch, true, view);
         else
            emit_null_surface(batch, &surf_offsets[s]);
         s++;
      }
   }
#endif

   foreach_surface_used(i, CROCUS_SURFACE_GROUP_IMAGE) {
      struct crocus_image_view *view = &shs->image[i];
      if (view->base.resource)
         surf_offsets[s] = emit_image_view(ice, batch, view);
      else
         emit_null_surface(batch, &surf_offsets[s]);
      s++;
   }
   foreach_surface_used(i, CROCUS_SURFACE_GROUP_UBO) {
      if (shs->constbufs[i].buffer)
         surf_offsets[s] = emit_ubo_buffer(ice, batch, &shs->constbufs[i]);
      else
         emit_null_surface(batch, &surf_offsets[s]);
      s++;
   }
   foreach_surface_used(i, CROCUS_SURFACE_GROUP_SSBO) {
      if (shs->ssbo[i].buffer)
         surf_offsets[s] = emit_ssbo_buffer(ice, batch, &shs->ssbo[i],
                                            !!(shs->writable_ssbos & (1 << i)));
      else
         emit_null_surface(batch, &surf_offsets[s]);
      s++;
   }

}
5500 /* ------------------------------------------------------------------- */
5501 static uint32_t
crocus_upload_binding_table(struct crocus_context * ice,struct crocus_batch * batch,uint32_t * table,uint32_t size)5502 crocus_upload_binding_table(struct crocus_context *ice,
5503 struct crocus_batch *batch,
5504 uint32_t *table,
5505 uint32_t size)
5506
5507 {
5508 if (size == 0)
5509 return 0;
5510 return emit_state(batch, table, size, 32);
5511 }
5512
5513 /**
5514 * Possibly emit STATE_BASE_ADDRESS to update Surface State Base Address.
5515 */
5516
static void
crocus_update_surface_base_address(struct crocus_batch *batch)
{
   /* Emitted at most once per batch; all relative state pointers
    * (surface states, dynamic state, shader kernels) are based on the
    * addresses programmed here.
    */
   if (batch->state_base_address_emitted)
      return;

   UNUSED uint32_t mocs = batch->screen->isl_dev.mocs.internal;

   /* Flush before and after changing base addresses (see the helpers). */
   flush_before_state_base_change(batch);

   crocus_emit_cmd(batch, GENX(STATE_BASE_ADDRESS), sba) {
      /* Set base addresses */
      sba.GeneralStateBaseAddressModifyEnable = true;

#if GFX_VER >= 6
      /* Dynamic state (viewports, samplers, etc.) lives in the batch's
       * state buffer alongside surface states.
       */
      sba.DynamicStateBaseAddressModifyEnable = true;
      sba.DynamicStateBaseAddress = ro_bo(batch->state.bo, 0);
#endif

      sba.SurfaceStateBaseAddressModifyEnable = true;
      sba.SurfaceStateBaseAddress = ro_bo(batch->state.bo, 0);

      sba.IndirectObjectBaseAddressModifyEnable = true;

#if GFX_VER >= 5
      /* Shader kernels are addressed relative to the program cache BO. */
      sba.InstructionBaseAddressModifyEnable = true;
      sba.InstructionBaseAddress = ro_bo(batch->ice->shaders.cache_bo, 0); // TODO!
#endif

      /* Set buffer sizes on Gen8+ or upper bounds on Gen4-7 */
#if GFX_VER == 8
      sba.GeneralStateBufferSize = 0xfffff;
      sba.IndirectObjectBufferSize = 0xfffff;
      sba.InstructionBufferSize = 0xfffff;
      sba.DynamicStateBufferSize = MAX_STATE_SIZE;

      sba.GeneralStateBufferSizeModifyEnable = true;
      sba.DynamicStateBufferSizeModifyEnable = true;
      sba.IndirectObjectBufferSizeModifyEnable = true;
      sba.InstructionBuffersizeModifyEnable = true;
#else
      sba.GeneralStateAccessUpperBoundModifyEnable = true;
      sba.IndirectObjectAccessUpperBoundModifyEnable = true;

#if GFX_VER >= 5
      sba.InstructionAccessUpperBoundModifyEnable = true;
#endif

#if GFX_VER >= 6
      /* Dynamic state upper bound.  Although the documentation says that
       * programming it to zero will cause it to be ignored, that is a lie.
       * If this isn't programmed to a real bound, the sampler border color
       * pointer is rejected, causing border color to mysteriously fail.
       */
      sba.DynamicStateAccessUpperBound = ro_bo(NULL, 0xfffff000);
      sba.DynamicStateAccessUpperBoundModifyEnable = true;
#else
      /* Same idea but using General State Base Address on Gen4-5 */
      sba.GeneralStateAccessUpperBound = ro_bo(NULL, 0xfffff000);
#endif
#endif

#if GFX_VER >= 6
      /* The hardware appears to pay attention to the MOCS fields even
       * if you don't set the "Address Modify Enable" bit for the base.
       */
      sba.GeneralStateMOCS = mocs;
      sba.StatelessDataPortAccessMOCS = mocs;
      sba.DynamicStateMOCS = mocs;
      sba.IndirectObjectMOCS = mocs;
      sba.InstructionMOCS = mocs;
      sba.SurfaceStateMOCS = mocs;
#endif
   }

   flush_after_state_base_change(batch);

   /* According to section 3.6.1 of VOL1 of the 965 PRM,
    * STATE_BASE_ADDRESS updates require a reissue of:
    *
    *    3DSTATE_PIPELINE_POINTERS
    *    3DSTATE_BINDING_TABLE_POINTERS
    *    MEDIA_STATE_POINTERS
    *
    * and this continues through Ironlake.  The Sandy Bridge PRM, vol
    * 1 part 1 says that the following packets must be reissued:
    *
    *    3DSTATE_CC_POINTERS
    *    3DSTATE_BINDING_TABLE_POINTERS
    *    3DSTATE_SAMPLER_STATE_POINTERS
    *    3DSTATE_VIEWPORT_STATE_POINTERS
    *    MEDIA_STATE_POINTERS
    *
    * Those are always reissued following SBA updates anyway (new
    * batch time), except in the case of the program cache BO
    * changing.  Having a separate state flag makes the sequence more
    * obvious.
    */
#if GFX_VER <= 5
   batch->ice->state.dirty |= CROCUS_DIRTY_GEN5_PIPELINED_POINTERS | CROCUS_DIRTY_GEN5_BINDING_TABLE_POINTERS;
#elif GFX_VER == 6
   batch->ice->state.dirty |= CROCUS_DIRTY_GEN5_BINDING_TABLE_POINTERS | CROCUS_DIRTY_GEN6_SAMPLER_STATE_POINTERS;
#endif
   batch->state_base_address_emitted = true;
}
5622
5623 static inline void
crocus_viewport_zmin_zmax(const struct pipe_viewport_state * vp,bool halfz,bool window_space_position,float * zmin,float * zmax)5624 crocus_viewport_zmin_zmax(const struct pipe_viewport_state *vp, bool halfz,
5625 bool window_space_position, float *zmin, float *zmax)
5626 {
5627 if (window_space_position) {
5628 *zmin = 0.f;
5629 *zmax = 1.f;
5630 return;
5631 }
5632 util_viewport_zmin_zmax(vp, halfz, zmin, zmax);
5633 }
5634
/* Push-constant buffer ranges gathered for one shader stage, used to fill
 * 3DSTATE_CONSTANT_XS packets (up to four ranges per stage).
 */
struct push_bos {
   struct {
      struct crocus_address addr;  /* GPU address of the range's start */
      uint32_t length;             /* read length; presumably in 32-byte
                                    * units (setup uses range->start * 32) —
                                    * confirm against elk_ubo_range */
   } buffers[4];
   int buffer_count;               /* number of valid entries in buffers[] */
   uint32_t max_length;            /* largest single range length */
};
5643
5644 #if GFX_VER >= 6
5645 static void
setup_constant_buffers(struct crocus_context * ice,struct crocus_batch * batch,int stage,struct push_bos * push_bos)5646 setup_constant_buffers(struct crocus_context *ice,
5647 struct crocus_batch *batch,
5648 int stage,
5649 struct push_bos *push_bos)
5650 {
5651 struct crocus_shader_state *shs = &ice->state.shaders[stage];
5652 struct crocus_compiled_shader *shader = ice->shaders.prog[stage];
5653 struct elk_stage_prog_data *prog_data = (void *) shader->prog_data;
5654
5655 uint32_t push_range_sum = 0;
5656
5657 int n = 0;
5658 for (int i = 0; i < 4; i++) {
5659 const struct elk_ubo_range *range = &prog_data->ubo_ranges[i];
5660
5661 if (range->length == 0)
5662 continue;
5663
5664 push_range_sum += range->length;
5665
5666 if (range->length > push_bos->max_length)
5667 push_bos->max_length = range->length;
5668
5669 /* Range block is a binding table index, map back to UBO index. */
5670 unsigned block_index = crocus_bti_to_group_index(
5671 &shader->bt, CROCUS_SURFACE_GROUP_UBO, range->block);
5672 assert(block_index != CROCUS_SURFACE_NOT_USED);
5673
5674 struct pipe_constant_buffer *cbuf = &shs->constbufs[block_index];
5675 struct crocus_resource *res = (void *) cbuf->buffer;
5676
5677 assert(cbuf->buffer_offset % 32 == 0);
5678
5679 push_bos->buffers[n].length = range->length;
5680 push_bos->buffers[n].addr =
5681 res ? ro_bo(res->bo, range->start * 32 + cbuf->buffer_offset)
5682 : ro_bo(batch->ice->workaround_bo,
5683 batch->ice->workaround_offset);
5684 n++;
5685 }
5686
5687 /* From the 3DSTATE_CONSTANT_XS and 3DSTATE_CONSTANT_ALL programming notes:
5688 *
5689 * "The sum of all four read length fields must be less than or
5690 * equal to the size of 64."
5691 */
5692 assert(push_range_sum <= 64);
5693
5694 push_bos->buffer_count = n;
5695 }
5696
#if GFX_VER == 7
/* Emit the "vs workaround" pipe control: a depth stall plus a post-sync
 * immediate write to the workaround BO.  Callers gate this to Ivybridge
 * before emitting 3DSTATE_CONSTANT_VS (see emit_push_constant_packets).
 */
static void
gen7_emit_vs_workaround_flush(struct crocus_batch *batch)
{
   crocus_emit_pipe_control_write(batch,
                                  "vs workaround",
                                  PIPE_CONTROL_WRITE_IMMEDIATE
                                  | PIPE_CONTROL_DEPTH_STALL,
                                  batch->ice->workaround_bo,
                                  batch->ice->workaround_offset, 0);
}
#endif
5709
/**
 * Emit a 3DSTATE_CONSTANT_XS packet for one shader stage, pointing the
 * push-constant hardware at the ranges collected in @push_bos.
 *
 * With a NULL prog_data the packet is still emitted (with empty body) to
 * disable push constants for the stage.
 */
static void
emit_push_constant_packets(struct crocus_context *ice,
                           struct crocus_batch *batch,
                           int stage,
                           const struct push_bos *push_bos)
{
   struct crocus_compiled_shader *shader = ice->shaders.prog[stage];
   struct elk_stage_prog_data *prog_data = shader ? (void *) shader->prog_data : NULL;
   UNUSED uint32_t mocs = crocus_mocs(NULL, &batch->screen->isl_dev);

#if GFX_VER == 7
   if (stage == MESA_SHADER_VERTEX) {
      if (batch->screen->devinfo.platform == INTEL_PLATFORM_IVB)
         gen7_emit_vs_workaround_flush(batch);
   }
#endif
   /* The same packet layout is used for all stages; only the subopcode
    * differs (taken from push_constant_opcodes[]).
    */
   crocus_emit_cmd(batch, GENX(3DSTATE_CONSTANT_VS), pkt) {
      pkt._3DCommandSubOpcode = push_constant_opcodes[stage];
#if GFX_VER >= 7
#if GFX_VER != 8
      /* MOCS is MBZ on Gen8 so we skip it there */
      pkt.ConstantBody.MOCS = mocs;
#endif

      if (prog_data) {
         /* The Skylake PRM contains the following restriction:
          *
          *    "The driver must ensure The following case does not occur
          *     without a flush to the 3D engine: 3DSTATE_CONSTANT_* with
          *     buffer 3 read length equal to zero committed followed by a
          *     3DSTATE_CONSTANT_* with buffer 0 read length not equal to
          *     zero committed."
          *
          * To avoid this, we program the buffers in the highest slots.
          * This way, slot 0 is only used if slot 3 is also used.
          */
         int n = push_bos->buffer_count;
         assert(n <= 4);
#if GFX_VERx10 >= 75
         const unsigned shift = 4 - n;
#else
         const unsigned shift = 0;
#endif
         for (int i = 0; i < n; i++) {
            pkt.ConstantBody.ReadLength[i + shift] =
               push_bos->buffers[i].length;
            pkt.ConstantBody.Buffer[i + shift] = push_bos->buffers[i].addr;
         }
      }
#else
      /* Gen6: only a single constant buffer slot is used here. */
      if (prog_data) {
         int n = push_bos->buffer_count;
         assert (n <= 1);
         if (n == 1) {
            pkt.Buffer0Valid = true;
            pkt.ConstantBody.PointertoConstantBuffer0 = push_bos->buffers[0].addr.offset;
            pkt.ConstantBody.ConstantBuffer0ReadLength = push_bos->buffers[0].length - 1;
         }
      }
#endif
   }
}
5772
5773 #endif
5774
/* The hardware depth/stencil controls live in a different structure per
 * generation: 3DSTATE_WM_DEPTH_STENCIL on gen8, DEPTH_STENCIL_STATE on
 * gen6-7, and COLOR_CALC_STATE on gen4-5.  Alias them under one name so
 * set_depth_stencil_bits() can fill whichever applies.
 */
#if GFX_VER == 8
typedef struct GENX(3DSTATE_WM_DEPTH_STENCIL) DEPTH_STENCIL_GENXML;
#elif GFX_VER >= 6
typedef struct GENX(DEPTH_STENCIL_STATE) DEPTH_STENCIL_GENXML;
#else
typedef struct GENX(COLOR_CALC_STATE) DEPTH_STENCIL_GENXML;
#endif
5782
5783 static inline void
set_depth_stencil_bits(struct crocus_context * ice,DEPTH_STENCIL_GENXML * ds)5784 set_depth_stencil_bits(struct crocus_context *ice, DEPTH_STENCIL_GENXML *ds)
5785 {
5786 struct crocus_depth_stencil_alpha_state *cso = ice->state.cso_zsa;
5787 ds->DepthTestEnable = cso->cso.depth_enabled;
5788 ds->DepthBufferWriteEnable = cso->cso.depth_writemask;
5789 ds->DepthTestFunction = translate_compare_func(cso->cso.depth_func);
5790
5791 ds->StencilFailOp = cso->cso.stencil[0].fail_op;
5792 ds->StencilPassDepthFailOp = cso->cso.stencil[0].zfail_op;
5793 ds->StencilPassDepthPassOp = cso->cso.stencil[0].zpass_op;
5794 ds->StencilTestFunction = translate_compare_func(cso->cso.stencil[0].func);
5795
5796 ds->StencilTestMask = cso->cso.stencil[0].valuemask;
5797 ds->StencilWriteMask = cso->cso.stencil[0].writemask;
5798
5799 ds->BackfaceStencilFailOp = cso->cso.stencil[1].fail_op;
5800 ds->BackfaceStencilPassDepthFailOp = cso->cso.stencil[1].zfail_op;
5801 ds->BackfaceStencilPassDepthPassOp = cso->cso.stencil[1].zpass_op;
5802 ds->BackfaceStencilTestFunction = translate_compare_func(cso->cso.stencil[1].func);
5803
5804 ds->BackfaceStencilTestMask = cso->cso.stencil[1].valuemask;
5805 ds->BackfaceStencilWriteMask = cso->cso.stencil[1].writemask;
5806 ds->DoubleSidedStencilEnable = cso->cso.stencil[1].enabled;
5807 ds->StencilTestEnable = cso->cso.stencil[0].enabled;
5808 ds->StencilBufferWriteEnable =
5809 cso->cso.stencil[0].writemask != 0 ||
5810 (cso->cso.stencil[1].enabled && cso->cso.stencil[1].writemask != 0);
5811 }
5812
/**
 * Pack one VERTEX_BUFFER_STATE into *map and advance the map pointer past
 * it.
 *
 * \param start_offset  byte offset of the first vertex in @bo.
 * \param end_offset    byte offset one past the last addressable byte.
 * \param step_rate     instance data step rate; non-zero selects
 *                      per-instance (INSTANCEDATA) access on pre-gen8.
 * \param map           in/out pointer into the batch; advanced by the
 *                      packed length.
 */
static void
emit_vertex_buffer_state(struct crocus_batch *batch,
                         unsigned buffer_id,
                         struct crocus_bo *bo,
                         unsigned start_offset,
                         unsigned end_offset,
                         unsigned stride,
                         unsigned step_rate,
                         uint32_t **map)
{
   const unsigned vb_dwords = GENX(VERTEX_BUFFER_STATE_length);
   _crocus_pack_state(batch, GENX(VERTEX_BUFFER_STATE), *map, vb) {
      vb.BufferStartingAddress = ro_bo(bo, start_offset);
#if GFX_VER >= 8
      /* Gen8+ takes an explicit size instead of an end address. */
      vb.BufferSize = end_offset - start_offset;
#endif
      vb.VertexBufferIndex = buffer_id;
      vb.BufferPitch = stride;
#if GFX_VER >= 7
      vb.AddressModifyEnable = true;
#endif
#if GFX_VER >= 6
      vb.MOCS = crocus_mocs(bo, &batch->screen->isl_dev);
#endif
#if GFX_VER < 8
      vb.BufferAccessType = step_rate ? INSTANCEDATA : VERTEXDATA;
      vb.InstanceDataStepRate = step_rate;
#if GFX_VER >= 5
      /* End address is inclusive on gen5-7. */
      vb.EndAddress = ro_bo(bo, end_offset - 1);
#endif
#endif
   }
   *map += vb_dwords;
}
5847
#if GFX_VER >= 6
/**
 * Compute the sample mask to program: the API-supplied mask clamped to the
 * samples actually present in the framebuffer (single-sampled uses 0x1).
 */
static uint32_t
determine_sample_mask(struct crocus_context *ice)
{
   const uint32_t num_samples = ice->state.framebuffer.samples;

   if (num_samples > 1) {
      uint32_t fb_mask = (1 << num_samples) - 1;
      return ice->state.sample_mask & fb_mask;
   }

   return 1;
}
#endif
5861
5862 static void
crocus_upload_dirty_render_state(struct crocus_context * ice,struct crocus_batch * batch,const struct pipe_draw_info * draw)5863 crocus_upload_dirty_render_state(struct crocus_context *ice,
5864 struct crocus_batch *batch,
5865 const struct pipe_draw_info *draw)
5866 {
5867 uint64_t dirty = ice->state.dirty;
5868 uint64_t stage_dirty = ice->state.stage_dirty;
5869
5870 if (!(dirty & CROCUS_ALL_DIRTY_FOR_RENDER) &&
5871 !(stage_dirty & CROCUS_ALL_STAGE_DIRTY_FOR_RENDER))
5872 return;
5873
5874 if (dirty & CROCUS_DIRTY_VF_STATISTICS) {
5875 crocus_emit_cmd(batch, GENX(3DSTATE_VF_STATISTICS), vf) {
5876 vf.StatisticsEnable = true;
5877 }
5878 }
5879
5880 #if GFX_VER <= 5
5881 if (stage_dirty & (CROCUS_STAGE_DIRTY_CONSTANTS_VS |
5882 CROCUS_STAGE_DIRTY_CONSTANTS_FS)) {
5883 bool ret = calculate_curbe_offsets(batch);
5884 if (ret) {
5885 dirty |= CROCUS_DIRTY_GEN4_CURBE | CROCUS_DIRTY_WM | CROCUS_DIRTY_CLIP;
5886 stage_dirty |= CROCUS_STAGE_DIRTY_VS;
5887 }
5888 }
5889
5890 if (dirty & (CROCUS_DIRTY_GEN4_CURBE | CROCUS_DIRTY_RASTER) ||
5891 stage_dirty & CROCUS_STAGE_DIRTY_VS) {
5892 bool ret = crocus_calculate_urb_fence(batch, ice->curbe.total_size,
5893 elk_vue_prog_data(ice->shaders.prog[MESA_SHADER_VERTEX]->prog_data)->urb_entry_size,
5894 ((struct elk_sf_prog_data *)ice->shaders.sf_prog->prog_data)->urb_entry_size);
5895 if (ret) {
5896 dirty |= CROCUS_DIRTY_GEN5_PIPELINED_POINTERS | CROCUS_DIRTY_RASTER | CROCUS_DIRTY_CLIP;
5897 stage_dirty |= CROCUS_STAGE_DIRTY_GS | CROCUS_STAGE_DIRTY_VS;
5898 }
5899 }
5900 #endif
5901 if (dirty & CROCUS_DIRTY_CC_VIEWPORT) {
5902 const struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast;
5903 uint32_t cc_vp_address;
5904
5905 /* XXX: could avoid streaming for depth_clip [0,1] case. */
5906 uint32_t *cc_vp_map =
5907 stream_state(batch,
5908 4 * ice->state.num_viewports *
5909 GENX(CC_VIEWPORT_length), 32, &cc_vp_address);
5910 for (int i = 0; i < ice->state.num_viewports; i++) {
5911 float zmin, zmax;
5912 crocus_viewport_zmin_zmax(&ice->state.viewports[i], cso_rast->cso.clip_halfz,
5913 ice->state.window_space_position,
5914 &zmin, &zmax);
5915 if (cso_rast->cso.depth_clip_near)
5916 zmin = 0.0;
5917 if (cso_rast->cso.depth_clip_far)
5918 zmax = 1.0;
5919
5920 crocus_pack_state(GENX(CC_VIEWPORT), cc_vp_map, ccv) {
5921 ccv.MinimumDepth = zmin;
5922 ccv.MaximumDepth = zmax;
5923 }
5924
5925 cc_vp_map += GENX(CC_VIEWPORT_length);
5926 }
5927
5928 #if GFX_VER >= 7
5929 crocus_emit_cmd(batch, GENX(3DSTATE_VIEWPORT_STATE_POINTERS_CC), ptr) {
5930 ptr.CCViewportPointer = cc_vp_address;
5931 }
5932 #elif GFX_VER == 6
5933 crocus_emit_cmd(batch, GENX(3DSTATE_VIEWPORT_STATE_POINTERS), vp) {
5934 vp.CCViewportStateChange = 1;
5935 vp.PointertoCC_VIEWPORT = cc_vp_address;
5936 }
5937 #else
5938 ice->state.cc_vp_address = cc_vp_address;
5939 dirty |= CROCUS_DIRTY_COLOR_CALC_STATE;
5940 #endif
5941 }
5942
5943 if (dirty & CROCUS_DIRTY_SF_CL_VIEWPORT) {
5944 struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
5945 #if GFX_VER >= 7
5946 uint32_t sf_cl_vp_address;
5947 uint32_t *vp_map =
5948 stream_state(batch,
5949 4 * ice->state.num_viewports *
5950 GENX(SF_CLIP_VIEWPORT_length), 64, &sf_cl_vp_address);
5951 #else
5952 uint32_t *vp_map =
5953 stream_state(batch,
5954 4 * ice->state.num_viewports * GENX(SF_VIEWPORT_length),
5955 32, &ice->state.sf_vp_address);
5956 uint32_t *clip_map =
5957 stream_state(batch,
5958 4 * ice->state.num_viewports * GENX(CLIP_VIEWPORT_length),
5959 32, &ice->state.clip_vp_address);
5960 #endif
5961
5962 for (unsigned i = 0; i < ice->state.num_viewports; i++) {
5963 const struct pipe_viewport_state *state = &ice->state.viewports[i];
5964 float gb_xmin, gb_xmax, gb_ymin, gb_ymax;
5965
5966 #if GFX_VER == 8
5967 float vp_xmin = viewport_extent(state, 0, -1.0f);
5968 float vp_xmax = viewport_extent(state, 0, 1.0f);
5969 float vp_ymin = viewport_extent(state, 1, -1.0f);
5970 float vp_ymax = viewport_extent(state, 1, 1.0f);
5971 #endif
5972 intel_calculate_guardband_size(0, cso_fb->width, 0, cso_fb->height,
5973 state->scale[0], state->scale[1],
5974 state->translate[0], state->translate[1],
5975 &gb_xmin, &gb_xmax, &gb_ymin, &gb_ymax);
5976 #if GFX_VER >= 7
5977 crocus_pack_state(GENX(SF_CLIP_VIEWPORT), vp_map, vp)
5978 #else
5979 crocus_pack_state(GENX(SF_VIEWPORT), vp_map, vp)
5980 #endif
5981 {
5982 vp.ViewportMatrixElementm00 = state->scale[0];
5983 vp.ViewportMatrixElementm11 = state->scale[1];
5984 vp.ViewportMatrixElementm22 = state->scale[2];
5985 vp.ViewportMatrixElementm30 = state->translate[0];
5986 vp.ViewportMatrixElementm31 = state->translate[1];
5987 vp.ViewportMatrixElementm32 = state->translate[2];
5988 #if GFX_VER < 6
5989 struct pipe_scissor_state scissor;
5990 crocus_fill_scissor_rect(ice, 0, &scissor);
5991 vp.ScissorRectangle.ScissorRectangleXMin = scissor.minx;
5992 vp.ScissorRectangle.ScissorRectangleXMax = scissor.maxx;
5993 vp.ScissorRectangle.ScissorRectangleYMin = scissor.miny;
5994 vp.ScissorRectangle.ScissorRectangleYMax = scissor.maxy;
5995 #endif
5996
5997 #if GFX_VER >= 7
5998 vp.XMinClipGuardband = gb_xmin;
5999 vp.XMaxClipGuardband = gb_xmax;
6000 vp.YMinClipGuardband = gb_ymin;
6001 vp.YMaxClipGuardband = gb_ymax;
6002 #endif
6003 #if GFX_VER == 8
6004 vp.XMinViewPort = MAX2(vp_xmin, 0);
6005 vp.XMaxViewPort = MIN2(vp_xmax, cso_fb->width) - 1;
6006 vp.YMinViewPort = MAX2(vp_ymin, 0);
6007 vp.YMaxViewPort = MIN2(vp_ymax, cso_fb->height) - 1;
6008 #endif
6009 }
6010 #if GFX_VER < 7
6011 crocus_pack_state(GENX(CLIP_VIEWPORT), clip_map, clip) {
6012 clip.XMinClipGuardband = gb_xmin;
6013 clip.XMaxClipGuardband = gb_xmax;
6014 clip.YMinClipGuardband = gb_ymin;
6015 clip.YMaxClipGuardband = gb_ymax;
6016 }
6017 #endif
6018 #if GFX_VER >= 7
6019 vp_map += GENX(SF_CLIP_VIEWPORT_length);
6020 #else
6021 vp_map += GENX(SF_VIEWPORT_length);
6022 clip_map += GENX(CLIP_VIEWPORT_length);
6023 #endif
6024 }
6025 #if GFX_VER >= 7
6026 crocus_emit_cmd(batch, GENX(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP), ptr) {
6027 ptr.SFClipViewportPointer = sf_cl_vp_address;
6028 }
6029 #elif GFX_VER == 6
6030 crocus_emit_cmd(batch, GENX(3DSTATE_VIEWPORT_STATE_POINTERS), vp) {
6031 vp.SFViewportStateChange = 1;
6032 vp.CLIPViewportStateChange = 1;
6033 vp.PointertoCLIP_VIEWPORT = ice->state.clip_vp_address;
6034 vp.PointertoSF_VIEWPORT = ice->state.sf_vp_address;
6035 }
6036 #endif
6037 }
6038
6039 #if GFX_VER >= 6
6040 if (dirty & CROCUS_DIRTY_GEN6_URB) {
6041 #if GFX_VER == 6
6042 bool gs_present = ice->shaders.prog[MESA_SHADER_GEOMETRY] != NULL
6043 || ice->shaders.ff_gs_prog;
6044
6045 struct elk_vue_prog_data *vue_prog_data =
6046 (void *) ice->shaders.prog[MESA_SHADER_VERTEX]->prog_data;
6047 const unsigned vs_size = vue_prog_data->urb_entry_size;
6048 unsigned gs_size = vs_size;
6049 if (ice->shaders.prog[MESA_SHADER_GEOMETRY]) {
6050 struct elk_vue_prog_data *gs_vue_prog_data =
6051 (void *) ice->shaders.prog[MESA_SHADER_GEOMETRY]->prog_data;
6052 gs_size = gs_vue_prog_data->urb_entry_size;
6053 }
6054
6055 genX(crocus_upload_urb)(batch, vs_size, gs_present, gs_size);
6056 #endif
6057 #if GFX_VER >= 7
6058 const struct intel_device_info *devinfo = &batch->screen->devinfo;
6059 bool gs_present = ice->shaders.prog[MESA_SHADER_GEOMETRY] != NULL;
6060 bool tess_present = ice->shaders.prog[MESA_SHADER_TESS_EVAL] != NULL;
6061 struct intel_urb_config urb_cfg;
6062
6063 for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++) {
6064 if (!ice->shaders.prog[i]) {
6065 urb_cfg.size[i] = 1;
6066 } else {
6067 struct elk_vue_prog_data *vue_prog_data =
6068 (void *) ice->shaders.prog[i]->prog_data;
6069 urb_cfg.size[i] = vue_prog_data->urb_entry_size;
6070 }
6071 assert(urb_cfg.size[i] != 0);
6072 }
6073
6074 /* If we're just switching between programs with the same URB requirements,
6075 * skip the rest of the logic.
6076 */
6077 bool no_change = false;
6078 if (ice->urb.vsize == urb_cfg.size[MESA_SHADER_VERTEX] &&
6079 ice->urb.gs_present == gs_present &&
6080 ice->urb.gsize == urb_cfg.size[MESA_SHADER_GEOMETRY] &&
6081 ice->urb.tess_present == tess_present &&
6082 ice->urb.hsize == urb_cfg.size[MESA_SHADER_TESS_CTRL] &&
6083 ice->urb.dsize == urb_cfg.size[MESA_SHADER_TESS_EVAL]) {
6084 no_change = true;
6085 }
6086
6087 if (!no_change) {
6088 ice->urb.vsize = urb_cfg.size[MESA_SHADER_VERTEX];
6089 ice->urb.gs_present = gs_present;
6090 ice->urb.gsize = urb_cfg.size[MESA_SHADER_GEOMETRY];
6091 ice->urb.tess_present = tess_present;
6092 ice->urb.hsize = urb_cfg.size[MESA_SHADER_TESS_CTRL];
6093 ice->urb.dsize = urb_cfg.size[MESA_SHADER_TESS_EVAL];
6094
6095 bool constrained;
6096 intel_get_urb_config(devinfo,
6097 batch->screen->l3_config_3d,
6098 tess_present,
6099 gs_present,
6100 &urb_cfg, NULL, &constrained);
6101
6102 #if GFX_VER == 7
6103 if (devinfo->platform == INTEL_PLATFORM_IVB)
6104 gen7_emit_vs_workaround_flush(batch);
6105 #endif
6106 for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++) {
6107 crocus_emit_cmd(batch, GENX(3DSTATE_URB_VS), urb) {
6108 urb._3DCommandSubOpcode += i;
6109 urb.VSURBStartingAddress = urb_cfg.start[i];
6110 urb.VSURBEntryAllocationSize = urb_cfg.size[i] - 1;
6111 urb.VSNumberofURBEntries = urb_cfg.entries[i];
6112 }
6113 }
6114 }
6115 #endif
6116 }
6117
6118 if (dirty & CROCUS_DIRTY_GEN6_BLEND_STATE) {
6119 struct crocus_blend_state *cso_blend = ice->state.cso_blend;
6120 struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
6121 struct crocus_depth_stencil_alpha_state *cso_zsa = ice->state.cso_zsa;
6122
6123 STATIC_ASSERT(GENX(BLEND_STATE_ENTRY_length) == 2);
6124 int rt_dwords =
6125 MAX2(cso_fb->nr_cbufs, 1) * GENX(BLEND_STATE_ENTRY_length);
6126 #if GFX_VER >= 8
6127 rt_dwords += GENX(BLEND_STATE_length);
6128 #endif
6129 uint32_t blend_offset;
6130 uint32_t *blend_map =
6131 stream_state(batch,
6132 4 * rt_dwords, 64, &blend_offset);
6133
6134 #if GFX_VER >= 8
6135 struct GENX(BLEND_STATE) be = { 0 };
6136 {
6137 #else
6138 for (int i = 0; i < ELK_MAX_DRAW_BUFFERS; i++) {
6139 struct GENX(BLEND_STATE_ENTRY) entry = { 0 };
6140 #define be entry
6141 #endif
6142
6143 be.AlphaTestEnable = cso_zsa->cso.alpha_enabled;
6144 be.AlphaTestFunction = translate_compare_func(cso_zsa->cso.alpha_func);
6145 be.AlphaToCoverageEnable = cso_blend->cso.alpha_to_coverage;
6146 be.AlphaToOneEnable = cso_blend->cso.alpha_to_one;
6147 be.AlphaToCoverageDitherEnable = GFX_VER >= 7 && cso_blend->cso.alpha_to_coverage_dither;
6148 be.ColorDitherEnable = cso_blend->cso.dither;
6149
6150 #if GFX_VER >= 8
6151 for (int i = 0; i < ELK_MAX_DRAW_BUFFERS; i++) {
6152 struct GENX(BLEND_STATE_ENTRY) entry = { 0 };
6153 #else
6154 {
6155 #endif
6156 const struct pipe_rt_blend_state *rt =
6157 &cso_blend->cso.rt[cso_blend->cso.independent_blend_enable ? i : 0];
6158
6159 be.IndependentAlphaBlendEnable = set_blend_entry_bits(batch, &entry, cso_blend, i) ||
6160 be.IndependentAlphaBlendEnable;
6161
6162 if (GFX_VER >= 8 || can_emit_logic_op(ice)) {
6163 entry.LogicOpEnable = cso_blend->cso.logicop_enable;
6164 entry.LogicOpFunction = cso_blend->cso.logicop_func;
6165 }
6166
6167 entry.ColorClampRange = COLORCLAMP_RTFORMAT;
6168 entry.PreBlendColorClampEnable = true;
6169 entry.PostBlendColorClampEnable = true;
6170
6171 entry.WriteDisableRed = !(rt->colormask & PIPE_MASK_R);
6172 entry.WriteDisableGreen = !(rt->colormask & PIPE_MASK_G);
6173 entry.WriteDisableBlue = !(rt->colormask & PIPE_MASK_B);
6174 entry.WriteDisableAlpha = !(rt->colormask & PIPE_MASK_A);
6175
6176 #if GFX_VER >= 8
6177 GENX(BLEND_STATE_ENTRY_pack)(NULL, &blend_map[1 + i * 2], &entry);
6178 #else
6179 GENX(BLEND_STATE_ENTRY_pack)(NULL, &blend_map[i * 2], &entry);
6180 #endif
6181 }
6182 }
6183 #if GFX_VER >= 8
6184 GENX(BLEND_STATE_pack)(NULL, blend_map, &be);
6185 #endif
6186 #if GFX_VER < 7
6187 crocus_emit_cmd(batch, GENX(3DSTATE_CC_STATE_POINTERS), ptr) {
6188 ptr.PointertoBLEND_STATE = blend_offset;
6189 ptr.BLEND_STATEChange = true;
6190 }
6191 #else
6192 crocus_emit_cmd(batch, GENX(3DSTATE_BLEND_STATE_POINTERS), ptr) {
6193 ptr.BlendStatePointer = blend_offset;
6194 #if GFX_VER >= 8
6195 ptr.BlendStatePointerValid = true;
6196 #endif
6197 }
6198 #endif
6199 }
6200 #endif
6201
6202 if (dirty & CROCUS_DIRTY_COLOR_CALC_STATE) {
6203 struct crocus_depth_stencil_alpha_state *cso = ice->state.cso_zsa;
6204 UNUSED struct crocus_blend_state *cso_blend = ice->state.cso_blend;
6205 struct pipe_stencil_ref *p_stencil_refs = &ice->state.stencil_ref;
6206 uint32_t cc_offset;
6207 void *cc_map =
6208 stream_state(batch,
6209 sizeof(uint32_t) * GENX(COLOR_CALC_STATE_length),
6210 64, &cc_offset);
6211 #if GFX_VER <= 5
6212 dirty |= CROCUS_DIRTY_GEN5_PIPELINED_POINTERS;
6213 #endif
6214 _crocus_pack_state(batch, GENX(COLOR_CALC_STATE), cc_map, cc) {
6215 cc.AlphaTestFormat = ALPHATEST_FLOAT32;
6216 cc.AlphaReferenceValueAsFLOAT32 = cso->cso.alpha_ref_value;
6217
6218 #if GFX_VER <= 5
6219
6220 set_depth_stencil_bits(ice, &cc);
6221
6222 if (cso_blend->cso.logicop_enable) {
6223 if (can_emit_logic_op(ice)) {
6224 cc.LogicOpEnable = cso_blend->cso.logicop_enable;
6225 cc.LogicOpFunction = cso_blend->cso.logicop_func;
6226 }
6227 }
6228 cc.ColorDitherEnable = cso_blend->cso.dither;
6229
6230 cc.IndependentAlphaBlendEnable = set_blend_entry_bits(batch, &cc, cso_blend, 0);
6231
6232 if (cso->cso.alpha_enabled && ice->state.framebuffer.nr_cbufs <= 1) {
6233 cc.AlphaTestEnable = cso->cso.alpha_enabled;
6234 cc.AlphaTestFunction = translate_compare_func(cso->cso.alpha_func);
6235 }
6236 cc.StatisticsEnable = ice->state.stats_wm ? 1 : 0;
6237 cc.CCViewportStatePointer = ro_bo(batch->state.bo, ice->state.cc_vp_address);
6238 #else
6239 cc.AlphaTestFormat = ALPHATEST_FLOAT32;
6240 cc.AlphaReferenceValueAsFLOAT32 = cso->cso.alpha_ref_value;
6241
6242 cc.BlendConstantColorRed = ice->state.blend_color.color[0];
6243 cc.BlendConstantColorGreen = ice->state.blend_color.color[1];
6244 cc.BlendConstantColorBlue = ice->state.blend_color.color[2];
6245 cc.BlendConstantColorAlpha = ice->state.blend_color.color[3];
6246 #endif
6247 cc.StencilReferenceValue = p_stencil_refs->ref_value[0];
6248 cc.BackfaceStencilReferenceValue = p_stencil_refs->ref_value[1];
6249 }
6250 ice->shaders.cc_offset = cc_offset;
6251 #if GFX_VER >= 6
6252 crocus_emit_cmd(batch, GENX(3DSTATE_CC_STATE_POINTERS), ptr) {
6253 ptr.ColorCalcStatePointer = cc_offset;
6254 #if GFX_VER != 7
6255 ptr.ColorCalcStatePointerValid = true;
6256 #endif
6257 }
6258 #endif
6259 }
6260 #if GFX_VER <= 5
6261 if (dirty & CROCUS_DIRTY_GEN4_CONSTANT_COLOR) {
6262 crocus_emit_cmd(batch, GENX(3DSTATE_CONSTANT_COLOR), blend_cc) {
6263 blend_cc.BlendConstantColorRed = ice->state.blend_color.color[0];
6264 blend_cc.BlendConstantColorGreen = ice->state.blend_color.color[1];
6265 blend_cc.BlendConstantColorBlue = ice->state.blend_color.color[2];
6266 blend_cc.BlendConstantColorAlpha = ice->state.blend_color.color[3];
6267 }
6268 }
6269 #endif
6270 for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
6271 if (!(stage_dirty & (CROCUS_STAGE_DIRTY_CONSTANTS_VS << stage)))
6272 continue;
6273
6274 struct crocus_shader_state *shs = &ice->state.shaders[stage];
6275 struct crocus_compiled_shader *shader = ice->shaders.prog[stage];
6276
6277 if (!shader)
6278 continue;
6279
6280 if (shs->sysvals_need_upload)
6281 upload_sysvals(ice, stage);
6282
6283 #if GFX_VER <= 5
6284 dirty |= CROCUS_DIRTY_GEN4_CURBE;
6285 #endif
6286 #if GFX_VER >= 7
6287 struct push_bos push_bos = {};
6288 setup_constant_buffers(ice, batch, stage, &push_bos);
6289
6290 emit_push_constant_packets(ice, batch, stage, &push_bos);
6291 #endif
6292 }
6293
6294 for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
6295 if (stage_dirty & (CROCUS_STAGE_DIRTY_BINDINGS_VS << stage)) {
6296 if (ice->shaders.prog[stage]) {
6297 #if GFX_VER <= 6
6298 dirty |= CROCUS_DIRTY_GEN5_BINDING_TABLE_POINTERS;
6299 #endif
6300 crocus_populate_binding_table(ice, batch, stage, false);
6301 ice->shaders.prog[stage]->bind_bo_offset =
6302 crocus_upload_binding_table(ice, batch,
6303 ice->shaders.prog[stage]->surf_offset,
6304 ice->shaders.prog[stage]->bt.size_bytes);
6305
6306 #if GFX_VER >= 7
6307 crocus_emit_cmd(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS_VS), ptr) {
6308 ptr._3DCommandSubOpcode = 38 + stage;
6309 ptr.PointertoVSBindingTable = ice->shaders.prog[stage]->bind_bo_offset;
6310 }
6311 #endif
6312 #if GFX_VER == 6
6313 } else if (stage == MESA_SHADER_GEOMETRY && ice->shaders.ff_gs_prog) {
6314 dirty |= CROCUS_DIRTY_GEN5_BINDING_TABLE_POINTERS;
6315 crocus_populate_binding_table(ice, batch, stage, true);
6316 ice->shaders.ff_gs_prog->bind_bo_offset =
6317 crocus_upload_binding_table(ice, batch,
6318 ice->shaders.ff_gs_prog->surf_offset,
6319 ice->shaders.ff_gs_prog->bt.size_bytes);
6320 #endif
6321 }
6322 }
6323 }
6324 #if GFX_VER <= 6
6325 if (dirty & CROCUS_DIRTY_GEN5_BINDING_TABLE_POINTERS) {
6326 struct crocus_compiled_shader *gs = ice->shaders.prog[MESA_SHADER_GEOMETRY];
6327 if (gs == NULL)
6328 gs = ice->shaders.ff_gs_prog;
6329 crocus_emit_cmd(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS), ptr) {
6330 ptr.PointertoVSBindingTable = ice->shaders.prog[MESA_SHADER_VERTEX]->bind_bo_offset;
6331 ptr.PointertoPSBindingTable = ice->shaders.prog[MESA_SHADER_FRAGMENT]->bind_bo_offset;
6332 #if GFX_VER == 6
6333 ptr.VSBindingTableChange = true;
6334 ptr.PSBindingTableChange = true;
6335 ptr.GSBindingTableChange = gs ? true : false;
6336 ptr.PointertoGSBindingTable = gs ? gs->bind_bo_offset : 0;
6337 #endif
6338 }
6339 }
6340 #endif
6341
6342 bool sampler_updates = dirty & CROCUS_DIRTY_GEN6_SAMPLER_STATE_POINTERS;
6343 for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
6344 if (!(stage_dirty & (CROCUS_STAGE_DIRTY_SAMPLER_STATES_VS << stage)) ||
6345 !ice->shaders.prog[stage])
6346 continue;
6347
6348 crocus_upload_sampler_states(ice, batch, stage);
6349
6350 sampler_updates = true;
6351
6352 #if GFX_VER >= 7
6353 struct crocus_shader_state *shs = &ice->state.shaders[stage];
6354
6355 crocus_emit_cmd(batch, GENX(3DSTATE_SAMPLER_STATE_POINTERS_VS), ptr) {
6356 ptr._3DCommandSubOpcode = 43 + stage;
6357 ptr.PointertoVSSamplerState = shs->sampler_offset;
6358 }
6359 #endif
6360 }
6361
6362 if (sampler_updates) {
6363 #if GFX_VER == 6
6364 struct crocus_shader_state *shs_vs = &ice->state.shaders[MESA_SHADER_VERTEX];
6365 struct crocus_shader_state *shs_gs = &ice->state.shaders[MESA_SHADER_GEOMETRY];
6366 struct crocus_shader_state *shs_fs = &ice->state.shaders[MESA_SHADER_FRAGMENT];
6367 crocus_emit_cmd(batch, GENX(3DSTATE_SAMPLER_STATE_POINTERS), ptr) {
6368 if (ice->shaders.prog[MESA_SHADER_VERTEX] &&
6369 (dirty & CROCUS_DIRTY_GEN6_SAMPLER_STATE_POINTERS ||
6370 stage_dirty & (CROCUS_STAGE_DIRTY_SAMPLER_STATES_VS << MESA_SHADER_VERTEX))) {
6371 ptr.VSSamplerStateChange = true;
6372 ptr.PointertoVSSamplerState = shs_vs->sampler_offset;
6373 }
6374 if (ice->shaders.prog[MESA_SHADER_GEOMETRY] &&
6375 (dirty & CROCUS_DIRTY_GEN6_SAMPLER_STATE_POINTERS ||
6376 stage_dirty & (CROCUS_STAGE_DIRTY_SAMPLER_STATES_VS << MESA_SHADER_GEOMETRY))) {
6377 ptr.GSSamplerStateChange = true;
6378 ptr.PointertoGSSamplerState = shs_gs->sampler_offset;
6379 }
6380 if (ice->shaders.prog[MESA_SHADER_FRAGMENT] &&
6381 (dirty & CROCUS_DIRTY_GEN6_SAMPLER_STATE_POINTERS ||
6382 stage_dirty & (CROCUS_STAGE_DIRTY_SAMPLER_STATES_VS << MESA_SHADER_FRAGMENT))) {
6383 ptr.PSSamplerStateChange = true;
6384 ptr.PointertoPSSamplerState = shs_fs->sampler_offset;
6385 }
6386 }
6387 #endif
6388 }
6389
6390 #if GFX_VER >= 6
6391 if (dirty & CROCUS_DIRTY_GEN6_MULTISAMPLE) {
6392 crocus_emit_cmd(batch, GENX(3DSTATE_MULTISAMPLE), ms) {
6393 ms.PixelLocation =
6394 ice->state.cso_rast->cso.half_pixel_center ? CENTER : UL_CORNER;
6395 if (ice->state.framebuffer.samples > 0)
6396 ms.NumberofMultisamples = ffs(ice->state.framebuffer.samples) - 1;
6397 #if GFX_VER == 6
6398 INTEL_SAMPLE_POS_4X(ms.Sample);
6399 #elif GFX_VER == 7
6400 switch (ice->state.framebuffer.samples) {
6401 case 1:
6402 INTEL_SAMPLE_POS_1X(ms.Sample);
6403 break;
6404 case 2:
6405 INTEL_SAMPLE_POS_2X(ms.Sample);
6406 break;
6407 case 4:
6408 INTEL_SAMPLE_POS_4X(ms.Sample);
6409 break;
6410 case 8:
6411 INTEL_SAMPLE_POS_8X(ms.Sample);
6412 break;
6413 default:
6414 break;
6415 }
6416 #endif
6417 }
6418 }
6419
6420 if (dirty & CROCUS_DIRTY_GEN6_SAMPLE_MASK) {
6421 crocus_emit_cmd(batch, GENX(3DSTATE_SAMPLE_MASK), ms) {
6422 ms.SampleMask = determine_sample_mask(ice);
6423 }
6424 }
6425 #endif
6426
6427 #if GFX_VER >= 7
6428 struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_FRAGMENT];
6429 if ((stage_dirty & CROCUS_STAGE_DIRTY_FS) && shader) {
6430 struct elk_stage_prog_data *prog_data = shader->prog_data;
6431 struct elk_wm_prog_data *wm_prog_data = (void *) shader->prog_data;
6432
6433 crocus_emit_cmd(batch, GENX(3DSTATE_PS), ps) {
6434
6435 /* Initialize the execution mask with VMask. Otherwise, derivatives are
6436 * incorrect for subspans where some of the pixels are unlit. We believe
6437 * the bit just didn't take effect in previous generations.
6438 */
6439 ps.VectorMaskEnable = GFX_VER >= 8 && wm_prog_data->uses_vmask;
6440
6441 intel_set_ps_dispatch_state(&ps, &batch->screen->devinfo,
6442 wm_prog_data,
6443 ice->state.framebuffer.samples,
6444 0 /* msaa_flags */);
6445
6446 ps.DispatchGRFStartRegisterForConstantSetupData0 =
6447 elk_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 0);
6448 ps.DispatchGRFStartRegisterForConstantSetupData1 =
6449 elk_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 1);
6450 ps.DispatchGRFStartRegisterForConstantSetupData2 =
6451 elk_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 2);
6452
6453 ps.KernelStartPointer0 = KSP(ice, shader) +
6454 elk_wm_prog_data_prog_offset(wm_prog_data, ps, 0);
6455 ps.KernelStartPointer1 = KSP(ice, shader) +
6456 elk_wm_prog_data_prog_offset(wm_prog_data, ps, 1);
6457 ps.KernelStartPointer2 = KSP(ice, shader) +
6458 elk_wm_prog_data_prog_offset(wm_prog_data, ps, 2);
6459
6460 #if GFX_VERx10 == 75
6461 ps.SampleMask = determine_sample_mask(ice);
6462 #endif
6463 // XXX: WABTPPrefetchDisable, see above, drop at C0
6464 ps.BindingTableEntryCount = shader->bt.size_bytes / 4;
6465 ps.FloatingPointMode = prog_data->use_alt_mode;
6466 #if GFX_VER >= 8
6467 ps.MaximumNumberofThreadsPerPSD =
6468 batch->screen->devinfo.max_threads_per_psd - 2;
6469 #else
6470 ps.MaximumNumberofThreads = batch->screen->devinfo.max_wm_threads - 1;
6471 #endif
6472
6473 ps.PushConstantEnable = prog_data->ubo_ranges[0].length > 0;
6474
6475 #if GFX_VER < 8
6476 ps.oMaskPresenttoRenderTarget = wm_prog_data->uses_omask;
6477 ps.DualSourceBlendEnable = wm_prog_data->dual_src_blend && ice->state.cso_blend->dual_color_blending;
6478 ps.AttributeEnable = (wm_prog_data->num_varying_inputs != 0);
6479 #endif
6480 /* From the documentation for this packet:
6481 * "If the PS kernel does not need the Position XY Offsets to
6482 * compute a Position Value, then this field should be programmed
6483 * to POSOFFSET_NONE."
6484 *
6485 * "SW Recommendation: If the PS kernel needs the Position Offsets
6486 * to compute a Position XY value, this field should match Position
6487 * ZW Interpolation Mode to ensure a consistent position.xyzw
6488 * computation."
6489 *
            * We only require XY sample offsets, so this recommendation does not
            * apply at the moment. We might need it in the future.
6492 */
6493 ps.PositionXYOffsetSelect =
6494 wm_prog_data->uses_pos_offset ? POSOFFSET_SAMPLE : POSOFFSET_NONE;
6495
6496 if (wm_prog_data->base.total_scratch) {
6497 struct crocus_bo *bo = crocus_get_scratch_space(ice, wm_prog_data->base.total_scratch, MESA_SHADER_FRAGMENT);
6498 ps.PerThreadScratchSpace = ffs(wm_prog_data->base.total_scratch) - 11;
6499 ps.ScratchSpaceBasePointer = rw_bo(bo, 0);
6500 }
6501 }
6502 #if GFX_VER == 8
6503 const struct shader_info *fs_info =
6504 crocus_get_shader_info(ice, MESA_SHADER_FRAGMENT);
6505 crocus_emit_cmd(batch, GENX(3DSTATE_PS_EXTRA), psx) {
6506 psx.PixelShaderValid = true;
6507 psx.PixelShaderComputedDepthMode = wm_prog_data->computed_depth_mode;
6508 psx.PixelShaderKillsPixel = wm_prog_data->uses_kill;
6509 psx.AttributeEnable = wm_prog_data->num_varying_inputs != 0;
6510 psx.PixelShaderUsesSourceDepth = wm_prog_data->uses_src_depth;
6511 psx.PixelShaderUsesSourceW = wm_prog_data->uses_src_w;
6512 psx.PixelShaderIsPerSample =
6513 elk_wm_prog_data_is_persample(wm_prog_data, 0);
6514
6515 /* _NEW_MULTISAMPLE | ELK_NEW_CONSERVATIVE_RASTERIZATION */
6516 if (wm_prog_data->uses_sample_mask)
6517 psx.PixelShaderUsesInputCoverageMask = true;
6518
6519 psx.oMaskPresenttoRenderTarget = wm_prog_data->uses_omask;
6520
6521 /* The stricter cross-primitive coherency guarantees that the hardware
6522 * gives us with the "Accesses UAV" bit set for at least one shader stage
6523 * and the "UAV coherency required" bit set on the 3DPRIMITIVE command
6524 * are redundant within the current image, atomic counter and SSBO GL
6525 * APIs, which all have very loose ordering and coherency requirements
6526 * and generally rely on the application to insert explicit barriers when
6527 * a shader invocation is expected to see the memory writes performed by
6528 * the invocations of some previous primitive. Regardless of the value
          * of "UAV coherency required", the "Accesses UAV" bits will implicitly
          * cause a DC flush — useless in most cases — when the lowermost stage
          * with the bit set finishes execution.
6532 *
6533 * It would be nice to disable it, but in some cases we can't because on
6534 * Gfx8+ it also has an influence on rasterization via the PS UAV-only
6535 * signal (which could be set independently from the coherency mechanism
6536 * in the 3DSTATE_WM command on Gfx7), and because in some cases it will
6537 * determine whether the hardware skips execution of the fragment shader
6538 * or not via the ThreadDispatchEnable signal. However if we know that
6539 * GFX8_PS_BLEND_HAS_WRITEABLE_RT is going to be set and
6540 * GFX8_PSX_PIXEL_SHADER_NO_RT_WRITE is not set it shouldn't make any
6541 * difference so we may just disable it here.
6542 *
6543 * Gfx8 hardware tries to compute ThreadDispatchEnable for us but doesn't
6544 * take into account KillPixels when no depth or stencil writes are
6545 * enabled. In order for occlusion queries to work correctly with no
6546 * attachments, we need to force-enable here.
6547 *
6548 */
6549 if ((wm_prog_data->has_side_effects || wm_prog_data->uses_kill) &&
6550 !(has_writeable_rt(ice->state.cso_blend, fs_info)))
6551 psx.PixelShaderHasUAV = true;
6552 }
6553 #endif
6554 }
6555 #endif
6556
6557 #if GFX_VER >= 7
6558 if (ice->state.streamout_active) {
6559 if (dirty & CROCUS_DIRTY_GEN7_SO_BUFFERS) {
6560 for (int i = 0; i < 4; i++) {
6561 struct crocus_stream_output_target *tgt =
6562 (void *) ice->state.so_target[i];
6563
6564 if (!tgt) {
6565 crocus_emit_cmd(batch, GENX(3DSTATE_SO_BUFFER), sob) {
6566 sob.SOBufferIndex = i;
6567 sob.MOCS = crocus_mocs(NULL, &batch->screen->isl_dev);
6568 }
6569 continue;
6570 }
6571 struct crocus_resource *res = (void *) tgt->base.buffer;
6572 uint32_t start = tgt->base.buffer_offset;
6573 #if GFX_VER < 8
6574 uint32_t end = ALIGN(start + tgt->base.buffer_size, 4);
6575 #endif
6576 crocus_emit_cmd(batch, GENX(3DSTATE_SO_BUFFER), sob) {
6577 sob.SOBufferIndex = i;
6578
6579 sob.SurfaceBaseAddress = rw_bo(res->bo, start);
6580 sob.MOCS = crocus_mocs(res->bo, &batch->screen->isl_dev);
6581 #if GFX_VER < 8
6582 sob.SurfacePitch = tgt->stride;
6583 sob.SurfaceEndAddress = rw_bo(res->bo, end);
6584 #else
6585 sob.SOBufferEnable = true;
6586 sob.StreamOffsetWriteEnable = true;
6587 sob.StreamOutputBufferOffsetAddressEnable = true;
6588
6589 sob.SurfaceSize = MAX2(tgt->base.buffer_size / 4, 1) - 1;
6590 sob.StreamOutputBufferOffsetAddress =
6591 rw_bo(crocus_resource_bo(&tgt->offset_res->base.b), tgt->offset_offset);
6592 if (tgt->zero_offset) {
6593 sob.StreamOffset = 0;
6594 tgt->zero_offset = false;
6595 } else
6596 sob.StreamOffset = 0xFFFFFFFF; /* not offset, see above */
6597 #endif
6598 }
6599 }
6600 }
6601
6602 if ((dirty & CROCUS_DIRTY_SO_DECL_LIST) && ice->state.streamout) {
6603 uint32_t *decl_list =
6604 ice->state.streamout + GENX(3DSTATE_STREAMOUT_length);
6605 crocus_batch_emit(batch, decl_list, 4 * ((decl_list[0] & 0xff) + 2));
6606 }
6607
6608 if (dirty & CROCUS_DIRTY_STREAMOUT) {
6609 const struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast;
6610
6611 uint32_t dynamic_sol[GENX(3DSTATE_STREAMOUT_length)];
6612 crocus_pack_command(GENX(3DSTATE_STREAMOUT), dynamic_sol, sol) {
6613 sol.SOFunctionEnable = true;
6614 sol.SOStatisticsEnable = true;
6615
6616 sol.RenderingDisable = cso_rast->cso.rasterizer_discard &&
6617 !ice->state.prims_generated_query_active;
6618 sol.ReorderMode = cso_rast->cso.flatshade_first ? LEADING : TRAILING;
6619 }
6620
6621 assert(ice->state.streamout);
6622
6623 crocus_emit_merge(batch, ice->state.streamout, dynamic_sol,
6624 GENX(3DSTATE_STREAMOUT_length));
6625 }
6626 } else {
6627 if (dirty & CROCUS_DIRTY_STREAMOUT) {
6628 crocus_emit_cmd(batch, GENX(3DSTATE_STREAMOUT), sol);
6629 }
6630 }
6631 #endif
6632 #if GFX_VER == 6
6633 if (ice->state.streamout_active) {
6634 if (dirty & CROCUS_DIRTY_GEN6_SVBI) {
6635 crocus_emit_so_svbi(ice);
6636 }
6637 }
6638 #endif
6639
6640 if (dirty & CROCUS_DIRTY_CLIP) {
6641 #if GFX_VER < 6
6642 const struct elk_clip_prog_data *clip_prog_data = (struct elk_clip_prog_data *)ice->shaders.clip_prog->prog_data;
6643 struct pipe_rasterizer_state *cso_state = &ice->state.cso_rast->cso;
6644
6645 uint32_t *clip_ptr = stream_state(batch, GENX(CLIP_STATE_length) * 4, 32, &ice->shaders.clip_offset);
6646 dirty |= CROCUS_DIRTY_GEN5_PIPELINED_POINTERS;
6647 _crocus_pack_state(batch, GENX(CLIP_STATE), clip_ptr, clip) {
6648 clip.KernelStartPointer = KSP(ice, ice->shaders.clip_prog);
6649 clip.FloatingPointMode = FLOATING_POINT_MODE_Alternate;
6650 clip.SingleProgramFlow = true;
6651 clip.GRFRegisterCount = DIV_ROUND_UP(clip_prog_data->total_grf, 16) - 1;
6652
6653 clip.VertexURBEntryReadLength = clip_prog_data->urb_read_length;
6654 clip.ConstantURBEntryReadLength = clip_prog_data->curb_read_length;
6655
6656 clip.DispatchGRFStartRegisterForURBData = 1;
6657 clip.VertexURBEntryReadOffset = 0;
6658 clip.ConstantURBEntryReadOffset = ice->curbe.clip_start * 2;
6659
6660 clip.NumberofURBEntries = batch->ice->urb.nr_clip_entries;
6661 clip.URBEntryAllocationSize = batch->ice->urb.vsize - 1;
6662
6663 if (batch->ice->urb.nr_clip_entries >= 10) {
6664 /* Half of the URB entries go to each thread, and it has to be an
6665 * even number.
6666 */
6667 assert(batch->ice->urb.nr_clip_entries % 2 == 0);
6668
6669 /* Although up to 16 concurrent Clip threads are allowed on Ironlake,
6670 * only 2 threads can output VUEs at a time.
6671 */
6672 clip.MaximumNumberofThreads = (GFX_VER == 5 ? 16 : 2) - 1;
6673 } else {
6674 assert(batch->ice->urb.nr_clip_entries >= 5);
6675 clip.MaximumNumberofThreads = 1 - 1;
6676 }
6677 clip.VertexPositionSpace = VPOS_NDCSPACE;
6678 clip.UserClipFlagsMustClipEnable = true;
6679 clip.GuardbandClipTestEnable = true;
6680
6681 clip.ClipperViewportStatePointer = ro_bo(batch->state.bo, ice->state.clip_vp_address);
6682 clip.ScreenSpaceViewportXMin = -1.0;
6683 clip.ScreenSpaceViewportXMax = 1.0;
6684 clip.ScreenSpaceViewportYMin = -1.0;
6685 clip.ScreenSpaceViewportYMax = 1.0;
6686 clip.ViewportXYClipTestEnable = true;
6687 clip.ViewportZClipTestEnable = (cso_state->depth_clip_near || cso_state->depth_clip_far);
6688
6689 #if GFX_VER == 5 || GFX_VERx10 == 45
6690 clip.UserClipDistanceClipTestEnableBitmask = cso_state->clip_plane_enable;
6691 #else
6692 /* Up to 6 actual clip flags, plus the 7th for the negative RHW
6693 * workaround.
6694 */
6695 clip.UserClipDistanceClipTestEnableBitmask = (cso_state->clip_plane_enable & 0x3f) | 0x40;
6696 #endif
6697
6698 clip.APIMode = cso_state->clip_halfz ? APIMODE_D3D : APIMODE_OGL;
6699 clip.GuardbandClipTestEnable = true;
6700
6701 clip.ClipMode = clip_prog_data->clip_mode;
6702 #if GFX_VERx10 == 45
6703 clip.NegativeWClipTestEnable = true;
6704 #endif
6705 }
6706
6707 #else //if GFX_VER >= 6
6708 struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast;
6709 const struct elk_wm_prog_data *wm_prog_data = elk_wm_prog_data(ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data );
6710 struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
6711 bool gs_or_tes = ice->shaders.prog[MESA_SHADER_GEOMETRY] ||
6712 ice->shaders.prog[MESA_SHADER_TESS_EVAL];
6713 bool points_or_lines = cso_rast->fill_mode_point_or_line ||
6714 (gs_or_tes ? ice->shaders.output_topology_is_points_or_lines
6715 : ice->state.prim_is_points_or_lines);
6716 uint32_t dynamic_clip[GENX(3DSTATE_CLIP_length)];
6717 crocus_pack_command(GENX(3DSTATE_CLIP), &dynamic_clip, cl) {
6718 cl.StatisticsEnable = ice->state.statistics_counters_enabled;
6719 if (cso_rast->cso.rasterizer_discard)
6720 cl.ClipMode = CLIPMODE_REJECT_ALL;
6721 else if (ice->state.window_space_position)
6722 cl.ClipMode = CLIPMODE_ACCEPT_ALL;
6723 else
6724 cl.ClipMode = CLIPMODE_NORMAL;
6725
6726 cl.PerspectiveDivideDisable = ice->state.window_space_position;
6727 cl.ViewportXYClipTestEnable = !points_or_lines;
6728
6729 cl.UserClipDistanceCullTestEnableBitmask =
6730 elk_vue_prog_data(ice->shaders.prog[MESA_SHADER_VERTEX]->prog_data)->cull_distance_mask;
6731
6732 cl.NonPerspectiveBarycentricEnable = wm_prog_data->uses_nonperspective_interp_modes;
6733
6734 cl.ForceZeroRTAIndexEnable = cso_fb->layers <= 1;
6735 cl.MaximumVPIndex = ice->state.num_viewports - 1;
6736 }
6737 crocus_emit_merge(batch, cso_rast->clip, dynamic_clip,
6738 ARRAY_SIZE(cso_rast->clip));
6739 #endif
6740 }
6741
6742 if (stage_dirty & CROCUS_STAGE_DIRTY_VS) {
6743 struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_VERTEX];
6744 const struct elk_vue_prog_data *vue_prog_data = elk_vue_prog_data(shader->prog_data);
6745 const struct elk_stage_prog_data *prog_data = &vue_prog_data->base;
6746 #if GFX_VER == 7
6747 if (batch->screen->devinfo.platform == INTEL_PLATFORM_IVB)
6748 gen7_emit_vs_workaround_flush(batch);
6749 #endif
6750
6751
6752 #if GFX_VER == 6
6753 struct push_bos push_bos = {};
6754 setup_constant_buffers(ice, batch, MESA_SHADER_VERTEX, &push_bos);
6755
6756 emit_push_constant_packets(ice, batch, MESA_SHADER_VERTEX, &push_bos);
6757 #endif
6758 #if GFX_VER >= 6
6759 crocus_emit_cmd(batch, GENX(3DSTATE_VS), vs)
6760 #else
6761 uint32_t *vs_ptr = stream_state(batch,
6762 GENX(VS_STATE_length) * 4, 32, &ice->shaders.vs_offset);
6763 dirty |= CROCUS_DIRTY_GEN5_PIPELINED_POINTERS;
6764 _crocus_pack_state(batch, GENX(VS_STATE), vs_ptr, vs)
6765 #endif
6766 {
6767 INIT_THREAD_DISPATCH_FIELDS(vs, Vertex, MESA_SHADER_VERTEX);
6768
6769 vs.MaximumNumberofThreads = batch->screen->devinfo.max_vs_threads - 1;
6770
6771 #if GFX_VER < 6
6772 vs.GRFRegisterCount = DIV_ROUND_UP(vue_prog_data->total_grf, 16) - 1;
6773 vs.ConstantURBEntryReadLength = vue_prog_data->base.curb_read_length;
6774 vs.ConstantURBEntryReadOffset = ice->curbe.vs_start * 2;
6775
6776 vs.NumberofURBEntries = batch->ice->urb.nr_vs_entries >> (GFX_VER == 5 ? 2 : 0);
6777 vs.URBEntryAllocationSize = batch->ice->urb.vsize - 1;
6778
6779 vs.MaximumNumberofThreads =
6780 CLAMP(batch->ice->urb.nr_vs_entries / 2, 1, batch->screen->devinfo.max_vs_threads) - 1;
6781 vs.StatisticsEnable = false;
6782 vs.SamplerStatePointer = ro_bo(batch->state.bo, ice->state.shaders[MESA_SHADER_VERTEX].sampler_offset);
6783 #endif
6784 #if GFX_VER == 5
6785 /* Force single program flow on Ironlake. We cannot reliably get
6786 * all applications working without it. See:
6787 * https://bugs.freedesktop.org/show_bug.cgi?id=29172
6788 *
6789 * The most notable and reliably failing application is the Humus
6790 * demo "CelShading"
6791 */
6792 vs.SingleProgramFlow = true;
6793 vs.SamplerCount = 0; /* hardware requirement */
6794
6795 #endif
6796 #if GFX_VER >= 8
6797 vs.SIMD8DispatchEnable =
6798 vue_prog_data->dispatch_mode == DISPATCH_MODE_SIMD8;
6799
6800 vs.UserClipDistanceCullTestEnableBitmask =
6801 vue_prog_data->cull_distance_mask;
6802 #endif
6803 }
6804
6805 #if GFX_VER == 6
6806 crocus_emit_pipe_control_flush(batch,
6807 "post VS const",
6808 PIPE_CONTROL_DEPTH_STALL |
6809 PIPE_CONTROL_INSTRUCTION_INVALIDATE |
6810 PIPE_CONTROL_STATE_CACHE_INVALIDATE);
6811 #endif
6812 }
6813
6814 if (stage_dirty & CROCUS_STAGE_DIRTY_GS) {
6815 struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_GEOMETRY];
6816 bool active = GFX_VER >= 6 && shader;
6817 #if GFX_VER == 6
6818 struct push_bos push_bos = {};
6819 if (shader)
6820 setup_constant_buffers(ice, batch, MESA_SHADER_GEOMETRY, &push_bos);
6821
6822 emit_push_constant_packets(ice, batch, MESA_SHADER_GEOMETRY, &push_bos);
6823 #endif
6824 #if GFX_VERx10 == 70
6825 /**
6826 * From Graphics BSpec: 3D-Media-GPGPU Engine > 3D Pipeline Stages >
6827 * Geometry > Geometry Shader > State:
6828 *
6829 * "Note: Because of corruption in IVB:GT2, software needs to flush the
6830 * whole fixed function pipeline when the GS enable changes value in
6831 * the 3DSTATE_GS."
6832 *
6833 * The hardware architects have clarified that in this context "flush the
6834 * whole fixed function pipeline" means to emit a PIPE_CONTROL with the "CS
6835 * Stall" bit set.
6836 */
6837 if (batch->screen->devinfo.gt == 2 && ice->state.gs_enabled != active)
6838 gen7_emit_cs_stall_flush(batch);
6839 #endif
6840 #if GFX_VER >= 6
6841 crocus_emit_cmd(batch, GENX(3DSTATE_GS), gs)
6842 #else
6843 uint32_t *gs_ptr = stream_state(batch,
6844 GENX(GS_STATE_length) * 4, 32, &ice->shaders.gs_offset);
6845 dirty |= CROCUS_DIRTY_GEN5_PIPELINED_POINTERS;
6846 _crocus_pack_state(batch, GENX(GS_STATE), gs_ptr, gs)
6847 #endif
6848 {
6849 #if GFX_VER >= 6
6850 if (active) {
6851 const struct elk_gs_prog_data *gs_prog_data = elk_gs_prog_data(shader->prog_data);
6852 const struct elk_vue_prog_data *vue_prog_data = elk_vue_prog_data(shader->prog_data);
6853 const struct elk_stage_prog_data *prog_data = &gs_prog_data->base.base;
6854
6855 INIT_THREAD_DISPATCH_FIELDS(gs, Vertex, MESA_SHADER_GEOMETRY);
6856 #if GFX_VER >= 7
6857 gs.OutputVertexSize = gs_prog_data->output_vertex_size_hwords * 2 - 1;
6858 gs.OutputTopology = gs_prog_data->output_topology;
6859 gs.ControlDataHeaderSize =
6860 gs_prog_data->control_data_header_size_hwords;
6861
6862 gs.InstanceControl = gs_prog_data->invocations - 1;
6863 gs.DispatchMode = vue_prog_data->dispatch_mode;
6864
6865 gs.IncludePrimitiveID = gs_prog_data->include_primitive_id;
6866
6867 gs.ControlDataFormat = gs_prog_data->control_data_format;
6868 #endif
6869
6870 /* Note: the meaning of the GEN7_GS_REORDER_TRAILING bit changes between
6871 * Ivy Bridge and Haswell.
6872 *
6873 * On Ivy Bridge, setting this bit causes the vertices of a triangle
6874 * strip to be delivered to the geometry shader in an order that does
6875 * not strictly follow the OpenGL spec, but preserves triangle
6876 * orientation. For example, if the vertices are (1, 2, 3, 4, 5), then
6877 * the geometry shader sees triangles:
6878 *
6879 * (1, 2, 3), (2, 4, 3), (3, 4, 5)
6880 *
6881 * (Clearing the bit is even worse, because it fails to preserve
6882 * orientation).
6883 *
* Triangle strips with adjacency are always ordered in a way that preserves
6885 * triangle orientation but does not strictly follow the OpenGL spec,
6886 * regardless of the setting of this bit.
6887 *
6888 * On Haswell, both triangle strips and triangle strips with adjacency
6889 * are always ordered in a way that preserves triangle orientation.
6890 * Setting this bit causes the ordering to strictly follow the OpenGL
6891 * spec.
6892 *
6893 * So in either case we want to set the bit. Unfortunately on Ivy
6894 * Bridge this will get the order close to correct but not perfect.
6895 */
6896 gs.ReorderMode = TRAILING;
6897 gs.MaximumNumberofThreads =
6898 GFX_VER == 8 ? (batch->screen->devinfo.max_gs_threads / 2 - 1) :
6899 (batch->screen->devinfo.max_gs_threads - 1);
6900 #if GFX_VER < 7
6901 gs.SOStatisticsEnable = true;
6902 if (gs_prog_data->num_transform_feedback_bindings)
6903 gs.SVBIPayloadEnable = ice->state.streamout_active;
6904
6905 /* GEN6_GS_SPF_MODE and GEN6_GS_VECTOR_MASK_ENABLE are enabled as it
6906 * was previously done for gen6.
6907 *
6908 * TODO: test with both disabled to see if the HW is behaving
6909 * as expected, like in gen7.
6910 */
6911 gs.SingleProgramFlow = true;
6912 gs.VectorMaskEnable = true;
6913 #endif
6914 #if GFX_VER >= 8
6915 gs.ExpectedVertexCount = gs_prog_data->vertices_in;
6916
6917 if (gs_prog_data->static_vertex_count != -1) {
6918 gs.StaticOutput = true;
6919 gs.StaticOutputVertexCount = gs_prog_data->static_vertex_count;
6920 }
6921 gs.IncludeVertexHandles = vue_prog_data->include_vue_handles;
6922
6923 gs.UserClipDistanceCullTestEnableBitmask =
6924 vue_prog_data->cull_distance_mask;
6925
6926 const int urb_entry_write_offset = 1;
6927 const uint32_t urb_entry_output_length =
6928 DIV_ROUND_UP(vue_prog_data->vue_map.num_slots, 2) -
6929 urb_entry_write_offset;
6930
6931 gs.VertexURBEntryOutputReadOffset = urb_entry_write_offset;
6932 gs.VertexURBEntryOutputLength = MAX2(urb_entry_output_length, 1);
6933 #endif
6934 }
6935 #endif
6936 #if GFX_VER <= 6
6937 if (!active && ice->shaders.ff_gs_prog) {
6938 const struct elk_ff_gs_prog_data *gs_prog_data = (struct elk_ff_gs_prog_data *)ice->shaders.ff_gs_prog->prog_data;
6939 /* In gen6, transform feedback for the VS stage is done with an
6940 * ad-hoc GS program. This function provides the needed 3DSTATE_GS
6941 * for this.
6942 */
6943 gs.KernelStartPointer = KSP(ice, ice->shaders.ff_gs_prog);
6944 gs.SingleProgramFlow = true;
6945 gs.DispatchGRFStartRegisterForURBData = GFX_VER == 6 ? 2 : 1;
6946 gs.VertexURBEntryReadLength = gs_prog_data->urb_read_length;
6947
6948 #if GFX_VER <= 5
6949 gs.GRFRegisterCount =
6950 DIV_ROUND_UP(gs_prog_data->total_grf, 16) - 1;
6951 /* ELK_NEW_URB_FENCE */
6952 gs.NumberofURBEntries = batch->ice->urb.nr_gs_entries;
6953 gs.URBEntryAllocationSize = batch->ice->urb.vsize - 1;
6954 gs.MaximumNumberofThreads = batch->ice->urb.nr_gs_entries >= 8 ? 1 : 0;
6955 gs.FloatingPointMode = FLOATING_POINT_MODE_Alternate;
6956 #else
6957 gs.Enable = true;
6958 gs.VectorMaskEnable = true;
6959 gs.SVBIPayloadEnable = true;
6960 gs.SVBIPostIncrementEnable = true;
6961 gs.SVBIPostIncrementValue = gs_prog_data->svbi_postincrement_value;
6962 gs.SOStatisticsEnable = true;
6963 gs.MaximumNumberofThreads = batch->screen->devinfo.max_gs_threads - 1;
6964 #endif
6965 }
6966 #endif
6967 if (!active && !ice->shaders.ff_gs_prog) {
6968 #if GFX_VER < 8
6969 gs.DispatchGRFStartRegisterForURBData = 1;
6970 #if GFX_VER >= 7
6971 gs.IncludeVertexHandles = true;
6972 #endif
6973 #endif
6974 }
6975 #if GFX_VER >= 6
6976 gs.StatisticsEnable = true;
6977 #endif
6978 #if GFX_VER == 5 || GFX_VER == 6
6979 gs.RenderingEnabled = true;
6980 #endif
6981 #if GFX_VER <= 5
6982 gs.MaximumVPIndex = ice->state.num_viewports - 1;
6983 #endif
6984 }
6985 ice->state.gs_enabled = active;
6986 }
6987
6988 #if GFX_VER >= 7
6989 if (stage_dirty & CROCUS_STAGE_DIRTY_TCS) {
6990 struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_TESS_CTRL];
6991
6992 if (shader) {
6993 const struct elk_tcs_prog_data *tcs_prog_data = elk_tcs_prog_data(shader->prog_data);
6994 const struct elk_vue_prog_data *vue_prog_data = elk_vue_prog_data(shader->prog_data);
6995 const struct elk_stage_prog_data *prog_data = &tcs_prog_data->base.base;
6996
6997 crocus_emit_cmd(batch, GENX(3DSTATE_HS), hs) {
6998 INIT_THREAD_DISPATCH_FIELDS(hs, Vertex, MESA_SHADER_TESS_CTRL);
6999 hs.InstanceCount = tcs_prog_data->instances - 1;
7000 hs.IncludeVertexHandles = true;
7001 hs.MaximumNumberofThreads = batch->screen->devinfo.max_tcs_threads - 1;
7002 }
7003 } else {
7004 crocus_emit_cmd(batch, GENX(3DSTATE_HS), hs);
7005 }
7006
7007 }
7008
7009 if (stage_dirty & CROCUS_STAGE_DIRTY_TES) {
7010 struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_TESS_EVAL];
7011 if (shader) {
7012 const struct elk_tes_prog_data *tes_prog_data = elk_tes_prog_data(shader->prog_data);
7013 const struct elk_vue_prog_data *vue_prog_data = elk_vue_prog_data(shader->prog_data);
7014 const struct elk_stage_prog_data *prog_data = &tes_prog_data->base.base;
7015
7016 crocus_emit_cmd(batch, GENX(3DSTATE_TE), te) {
7017 te.Partitioning = tes_prog_data->partitioning;
7018 te.OutputTopology = tes_prog_data->output_topology;
7019 te.TEDomain = tes_prog_data->domain;
7020 te.TEEnable = true;
7021 te.MaximumTessellationFactorOdd = 63.0;
7022 te.MaximumTessellationFactorNotOdd = 64.0;
7023 };
7024 crocus_emit_cmd(batch, GENX(3DSTATE_DS), ds) {
7025 INIT_THREAD_DISPATCH_FIELDS(ds, Patch, MESA_SHADER_TESS_EVAL);
7026
7027 ds.MaximumNumberofThreads = batch->screen->devinfo.max_tes_threads - 1;
7028 ds.ComputeWCoordinateEnable =
7029 tes_prog_data->domain == INTEL_TESS_DOMAIN_TRI;
7030
7031 #if GFX_VER >= 8
7032 if (vue_prog_data->dispatch_mode == DISPATCH_MODE_SIMD8)
7033 ds.DispatchMode = DISPATCH_MODE_SIMD8_SINGLE_PATCH;
7034 ds.UserClipDistanceCullTestEnableBitmask =
7035 vue_prog_data->cull_distance_mask;
7036 #endif
7037 };
7038 } else {
7039 crocus_emit_cmd(batch, GENX(3DSTATE_TE), te);
7040 crocus_emit_cmd(batch, GENX(3DSTATE_DS), ds);
7041 }
7042 }
7043 #endif
7044 if (dirty & CROCUS_DIRTY_RASTER) {
7045
7046 #if GFX_VER < 6
7047 const struct elk_sf_prog_data *sf_prog_data = (struct elk_sf_prog_data *)ice->shaders.sf_prog->prog_data;
7048 struct pipe_rasterizer_state *cso_state = &ice->state.cso_rast->cso;
7049 uint32_t *sf_ptr = stream_state(batch,
7050 GENX(SF_STATE_length) * 4, 32, &ice->shaders.sf_offset);
7051 dirty |= CROCUS_DIRTY_GEN5_PIPELINED_POINTERS;
7052 _crocus_pack_state(batch, GENX(SF_STATE), sf_ptr, sf) {
7053 sf.KernelStartPointer = KSP(ice, ice->shaders.sf_prog);
7054 sf.FloatingPointMode = FLOATING_POINT_MODE_Alternate;
7055 sf.GRFRegisterCount = DIV_ROUND_UP(sf_prog_data->total_grf, 16) - 1;
7056 sf.DispatchGRFStartRegisterForURBData = 3;
7057 sf.VertexURBEntryReadOffset = ELK_SF_URB_ENTRY_READ_OFFSET;
7058 sf.VertexURBEntryReadLength = sf_prog_data->urb_read_length;
7059 sf.URBEntryAllocationSize = batch->ice->urb.sfsize - 1;
7060 sf.NumberofURBEntries = batch->ice->urb.nr_sf_entries;
7061 sf.PointRasterizationRule = RASTRULE_UPPER_RIGHT;
7062
7063 sf.SetupViewportStateOffset = ro_bo(batch->state.bo, ice->state.sf_vp_address);
7064
7065 sf.MaximumNumberofThreads =
7066 MIN2(GFX_VER == 5 ? 48 : 24, batch->ice->urb.nr_sf_entries) - 1;
7067
7068 sf.SpritePointEnable = cso_state->point_quad_rasterization;
7069 sf.DestinationOriginHorizontalBias = 0.5;
7070 sf.DestinationOriginVerticalBias = 0.5;
7071
7072 sf.LineEndCapAntialiasingRegionWidth =
7073 cso_state->line_smooth ? _10pixels : _05pixels;
7074 sf.LastPixelEnable = cso_state->line_last_pixel;
7075 sf.AntialiasingEnable = cso_state->line_smooth;
7076
7077 sf.LineWidth = get_line_width(cso_state);
7078 sf.PointWidth = cso_state->point_size;
7079 sf.PointWidthSource = cso_state->point_size_per_vertex ? Vertex : State;
7080 #if GFX_VERx10 >= 45
7081 sf.AALineDistanceMode = AALINEDISTANCE_TRUE;
7082 #endif
7083 sf.ViewportTransformEnable = true;
7084 sf.FrontWinding = cso_state->front_ccw ? 1 : 0;
7085 sf.ScissorRectangleEnable = true;
7086 sf.CullMode = translate_cull_mode(cso_state->cull_face);
7087
7088 if (cso_state->flatshade_first) {
7089 sf.TriangleFanProvokingVertexSelect = 1;
7090 } else {
7091 sf.TriangleStripListProvokingVertexSelect = 2;
7092 sf.TriangleFanProvokingVertexSelect = 2;
7093 sf.LineStripListProvokingVertexSelect = 1;
7094 }
7095 }
7096 #else
7097 struct crocus_rasterizer_state *cso = ice->state.cso_rast;
7098 uint32_t dynamic_sf[GENX(3DSTATE_SF_length)];
7099 crocus_pack_command(GENX(3DSTATE_SF), &dynamic_sf, sf) {
7100 sf.ViewportTransformEnable = !ice->state.window_space_position;
7101
7102 #if GFX_VER == 6
7103 const struct elk_wm_prog_data *wm_prog_data = elk_wm_prog_data(ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data);
7104 uint32_t urb_entry_read_length;
7105 uint32_t urb_entry_read_offset;
7106 uint32_t point_sprite_enables;
7107 calculate_attr_overrides(ice, sf.Attribute, &point_sprite_enables,
7108 &urb_entry_read_length,
7109 &urb_entry_read_offset);
7110 sf.VertexURBEntryReadLength = urb_entry_read_length;
7111 sf.VertexURBEntryReadOffset = urb_entry_read_offset;
7112 sf.PointSpriteTextureCoordinateEnable = point_sprite_enables;
7113 sf.ConstantInterpolationEnable = wm_prog_data->flat_inputs;
7114 sf.NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs;
7115 #endif
7116
7117 #if GFX_VER >= 6 && GFX_VER < 8
7118 if (ice->state.framebuffer.samples > 1 && ice->state.cso_rast->cso.multisample)
7119 sf.MultisampleRasterizationMode = MSRASTMODE_ON_PATTERN;
7120 #endif
7121 #if GFX_VER == 7
7122 if (ice->state.framebuffer.zsbuf) {
7123 struct crocus_resource *zres, *sres;
7124 crocus_get_depth_stencil_resources(&batch->screen->devinfo,
7125 ice->state.framebuffer.zsbuf->texture,
7126 &zres, &sres);
7127 /* ANV thinks that the stencil-ness doesn't matter, this is just
7128 * about handling polygon offset scaling.
7129 */
7130 sf.DepthBufferSurfaceFormat = zres ? isl_format_get_depth_format(zres->surf.format, false) : D16_UNORM;
7131 }
7132 #endif
7133 }
7134 crocus_emit_merge(batch, cso->sf, dynamic_sf,
7135 ARRAY_SIZE(dynamic_sf));
7136 #if GFX_VER == 8
7137 crocus_batch_emit(batch, cso->raster, sizeof(cso->raster));
7138 #endif
7139 #endif
7140 }
7141
7142 if (dirty & CROCUS_DIRTY_WM) {
7143 struct crocus_rasterizer_state *cso = ice->state.cso_rast;
7144 const struct elk_wm_prog_data *wm_prog_data = elk_wm_prog_data(ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data);
7145 UNUSED bool writes_depth = wm_prog_data->computed_depth_mode != ELK_PSCDEPTH_OFF;
7146 UNUSED const struct shader_info *fs_info =
7147 crocus_get_shader_info(ice, MESA_SHADER_FRAGMENT);
7148
7149 #if GFX_VER == 6
7150 struct push_bos push_bos = {};
7151 setup_constant_buffers(ice, batch, MESA_SHADER_FRAGMENT, &push_bos);
7152
7153 emit_push_constant_packets(ice, batch, MESA_SHADER_FRAGMENT, &push_bos);
7154 #endif
7155 #if GFX_VER >= 6
7156 crocus_emit_cmd(batch, GENX(3DSTATE_WM), wm)
7157 #else
7158 uint32_t *wm_ptr = stream_state(batch,
7159 GENX(WM_STATE_length) * 4, 32, &ice->shaders.wm_offset);
7160
7161 dirty |= CROCUS_DIRTY_GEN5_PIPELINED_POINTERS;
7162
7163 _crocus_pack_state(batch, GENX(WM_STATE), wm_ptr, wm)
7164 #endif
7165 {
7166 #if GFX_VER <= 6
7167 wm._8PixelDispatchEnable = wm_prog_data->dispatch_8;
7168 wm._16PixelDispatchEnable = wm_prog_data->dispatch_16;
7169 wm._32PixelDispatchEnable = wm_prog_data->dispatch_32;
7170 #endif
7171 #if GFX_VER == 4
7172 /* On gen4, we only have one shader kernel */
7173 if (elk_wm_state_has_ksp(wm, 0)) {
7174 wm.KernelStartPointer0 = KSP(ice, ice->shaders.prog[MESA_SHADER_FRAGMENT]);
7175 wm.GRFRegisterCount0 = elk_wm_prog_data_reg_blocks(wm_prog_data, wm, 0);
7176 wm.DispatchGRFStartRegisterForConstantSetupData0 =
7177 wm_prog_data->base.dispatch_grf_start_reg;
7178 }
7179 #elif GFX_VER == 5
7180 wm.KernelStartPointer0 = KSP(ice, ice->shaders.prog[MESA_SHADER_FRAGMENT]) +
7181 elk_wm_prog_data_prog_offset(wm_prog_data, wm, 0);
7182 wm.KernelStartPointer1 = KSP(ice, ice->shaders.prog[MESA_SHADER_FRAGMENT]) +
7183 elk_wm_prog_data_prog_offset(wm_prog_data, wm, 1);
7184 wm.KernelStartPointer2 = KSP(ice, ice->shaders.prog[MESA_SHADER_FRAGMENT]) +
7185 elk_wm_prog_data_prog_offset(wm_prog_data, wm, 2);
7186
7187 wm.GRFRegisterCount0 = elk_wm_prog_data_reg_blocks(wm_prog_data, wm, 0);
7188 wm.GRFRegisterCount1 = elk_wm_prog_data_reg_blocks(wm_prog_data, wm, 1);
7189 wm.GRFRegisterCount2 = elk_wm_prog_data_reg_blocks(wm_prog_data, wm, 2);
7190
7191 wm.DispatchGRFStartRegisterForConstantSetupData0 =
7192 wm_prog_data->base.dispatch_grf_start_reg;
7193 #elif GFX_VER == 6
7194 wm.KernelStartPointer0 = KSP(ice, ice->shaders.prog[MESA_SHADER_FRAGMENT]) +
7195 elk_wm_prog_data_prog_offset(wm_prog_data, wm, 0);
7196 wm.KernelStartPointer1 = KSP(ice, ice->shaders.prog[MESA_SHADER_FRAGMENT]) +
7197 elk_wm_prog_data_prog_offset(wm_prog_data, wm, 1);
7198 wm.KernelStartPointer2 = KSP(ice, ice->shaders.prog[MESA_SHADER_FRAGMENT]) +
7199 elk_wm_prog_data_prog_offset(wm_prog_data, wm, 2);
7200
7201 wm.DispatchGRFStartRegisterForConstantSetupData0 =
7202 elk_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 0);
7203 wm.DispatchGRFStartRegisterForConstantSetupData1 =
7204 elk_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 1);
7205 wm.DispatchGRFStartRegisterForConstantSetupData2 =
7206 elk_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 2);
7207 #endif
7208 #if GFX_VER <= 5
7209 wm.ConstantURBEntryReadLength = wm_prog_data->base.curb_read_length;
7210 wm.ConstantURBEntryReadOffset = ice->curbe.wm_start * 2;
7211 wm.SetupURBEntryReadLength = wm_prog_data->num_varying_inputs * 2;
7212 wm.SetupURBEntryReadOffset = 0;
7213 wm.EarlyDepthTestEnable = true;
7214 wm.LineAntialiasingRegionWidth = _05pixels;
7215 wm.LineEndCapAntialiasingRegionWidth = _10pixels;
7216 wm.DepthCoefficientURBReadOffset = 1;
7217
7218 if (cso->cso.offset_tri) {
7219 wm.GlobalDepthOffsetEnable = true;
7220
7221 /* Something weird going on with legacy_global_depth_bias,
7222 * offset_constant, scaling and MRD. This value passes glean
7223 * but gives some odd results elsewere (eg. the
7224 * quad-offset-units test).
7225 */
7226 wm.GlobalDepthOffsetConstant = cso->cso.offset_units * 2;
7227 wm.GlobalDepthOffsetScale = cso->cso.offset_scale;
7228 }
7229 wm.SamplerStatePointer = ro_bo(batch->state.bo,
7230 ice->state.shaders[MESA_SHADER_FRAGMENT].sampler_offset);
7231 #endif
7232
7233 wm.StatisticsEnable = (GFX_VER >= 6 || ice->state.stats_wm) ?
7234 ice->state.statistics_counters_enabled : 0;
7235
7236 #if GFX_VER >= 6
7237 wm.LineAntialiasingRegionWidth = _10pixels;
7238 wm.LineEndCapAntialiasingRegionWidth = _05pixels;
7239
7240 wm.PointRasterizationRule = RASTRULE_UPPER_RIGHT;
7241 wm.BarycentricInterpolationMode = wm_prog_data->barycentric_interp_modes;
7242 #endif
7243 #if GFX_VER == 6
7244 wm.DualSourceBlendEnable = wm_prog_data->dual_src_blend &&
7245 ice->state.cso_blend->dual_color_blending;
7246 wm.oMaskPresenttoRenderTarget = wm_prog_data->uses_omask;
7247 wm.NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs;
7248
7249 /* From the SNB PRM, volume 2 part 1, page 281:
7250 * "If the PS kernel does not need the Position XY Offsets
7251 * to compute a Position XY value, then this field should be
7252 * programmed to POSOFFSET_NONE."
7253 *
7254 * "SW Recommendation: If the PS kernel needs the Position Offsets
7255 * to compute a Position XY value, this field should match Position
7256 * ZW Interpolation Mode to ensure a consistent position.xyzw
7257 * computation."
7258 * We only require XY sample offsets. So, this recommendation doesn't
7259 * look useful at the moment. We might need this in future.
7260 */
7261 if (wm_prog_data->uses_pos_offset)
7262 wm.PositionXYOffsetSelect = POSOFFSET_SAMPLE;
7263 else
7264 wm.PositionXYOffsetSelect = POSOFFSET_NONE;
7265 #endif
7266 wm.LineStippleEnable = cso->cso.line_stipple_enable;
7267 wm.PolygonStippleEnable = cso->cso.poly_stipple_enable;
7268
7269 #if GFX_VER < 7
7270 if (wm_prog_data->base.use_alt_mode)
7271 wm.FloatingPointMode = FLOATING_POINT_MODE_Alternate;
7272 wm.BindingTableEntryCount = ice->shaders.prog[MESA_SHADER_FRAGMENT]->bt.size_bytes / 4;
7273 wm.MaximumNumberofThreads = batch->screen->devinfo.max_wm_threads - 1;
7274 #endif
7275
7276 #if GFX_VER < 8
7277 #if GFX_VER >= 6
7278 wm.PixelShaderUsesSourceW = wm_prog_data->uses_src_w;
7279
7280 struct pipe_framebuffer_state *fb = &ice->state.framebuffer;
7281 if (fb->samples > 1) {
7282 if (cso->cso.multisample)
7283 wm.MultisampleRasterizationMode = MSRASTMODE_ON_PATTERN;
7284 else
7285 wm.MultisampleRasterizationMode = MSRASTMODE_OFF_PIXEL;
7286
7287 if (elk_wm_prog_data_is_persample(wm_prog_data, 0))
7288 wm.MultisampleDispatchMode = MSDISPMODE_PERSAMPLE;
7289 else
7290 wm.MultisampleDispatchMode = MSDISPMODE_PERPIXEL;
7291 } else {
7292 wm.MultisampleRasterizationMode = MSRASTMODE_OFF_PIXEL;
7293 wm.MultisampleDispatchMode = MSDISPMODE_PERSAMPLE;
7294 }
7295 #endif
7296
7297 wm.PixelShaderUsesSourceDepth = wm_prog_data->uses_src_depth;
7298
7299 if (wm_prog_data->uses_kill ||
7300 ice->state.cso_zsa->cso.alpha_enabled ||
7301 ice->state.cso_blend->cso.alpha_to_coverage ||
7302 (GFX_VER >= 6 && wm_prog_data->uses_omask))
7303 wm.PixelShaderKillsPixel = true;
7304
7305 if (has_writeable_rt(ice->state.cso_blend, fs_info) ||
7306 writes_depth || wm.PixelShaderKillsPixel ||
7307 (GFX_VER >= 6 && wm_prog_data->has_side_effects))
7308 wm.ThreadDispatchEnable = true;
7309
7310 #if GFX_VER >= 7
7311 wm.PixelShaderComputedDepthMode = wm_prog_data->computed_depth_mode;
7312 wm.PixelShaderUsesInputCoverageMask = wm_prog_data->uses_sample_mask;
7313 #else
7314 if (wm_prog_data->base.total_scratch) {
7315 struct crocus_bo *bo = crocus_get_scratch_space(ice, wm_prog_data->base.total_scratch,
7316 MESA_SHADER_FRAGMENT);
7317 wm.PerThreadScratchSpace = ffs(wm_prog_data->base.total_scratch) - 11;
7318 wm.ScratchSpaceBasePointer = rw_bo(bo, 0);
7319 }
7320
7321 wm.PixelShaderComputedDepth = writes_depth;
7322
7323 #endif
7324 /* The "UAV access enable" bits are unnecessary on HSW because they only
7325 * seem to have an effect on the HW-assisted coherency mechanism which we
7326 * don't need, and the rasterization-related UAV_ONLY flag and the
7327 * DISPATCH_ENABLE bit can be set independently from it.
7328 * C.f. gen8_upload_ps_extra().
7329 *
7330 * ELK_NEW_FRAGMENT_PROGRAM | ELK_NEW_FS_PROG_DATA | _NEW_BUFFERS |
7331 * _NEW_COLOR
7332 */
7333 #if GFX_VERx10 == 75
7334 if (!(has_writeable_rt(ice->state.cso_blend, fs_info) || writes_depth) &&
7335 wm_prog_data->has_side_effects)
7336 wm.PSUAVonly = ON;
7337 #endif
7338 #endif
7339 #if GFX_VER >= 7
7340 /* ELK_NEW_FS_PROG_DATA */
7341 if (wm_prog_data->early_fragment_tests)
7342 wm.EarlyDepthStencilControl = EDSC_PREPS;
7343 else if (wm_prog_data->has_side_effects)
7344 wm.EarlyDepthStencilControl = EDSC_PSEXEC;
7345 #endif
7346 #if GFX_VER == 8
7347 /* We could skip this bit if color writes are enabled. */
7348 if (wm_prog_data->has_side_effects || wm_prog_data->uses_kill)
7349 wm.ForceThreadDispatchEnable = ForceON;
7350 #endif
7351 };
7352
7353 #if GFX_VER <= 5
7354 if (ice->state.global_depth_offset_clamp != cso->cso.offset_clamp) {
7355 crocus_emit_cmd(batch, GENX(3DSTATE_GLOBAL_DEPTH_OFFSET_CLAMP), clamp) {
7356 clamp.GlobalDepthOffsetClamp = cso->cso.offset_clamp;
7357 }
7358 ice->state.global_depth_offset_clamp = cso->cso.offset_clamp;
7359 }
7360 #endif
7361 }
7362
7363 #if GFX_VER >= 7
7364 if (dirty & CROCUS_DIRTY_GEN7_SBE) {
7365 crocus_emit_sbe(batch, ice);
7366 }
7367 #endif
7368
7369 #if GFX_VER >= 8
7370 if (dirty & CROCUS_DIRTY_GEN8_PS_BLEND) {
7371 struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_FRAGMENT];
7372 struct crocus_blend_state *cso_blend = ice->state.cso_blend;
7373 struct crocus_depth_stencil_alpha_state *cso_zsa = ice->state.cso_zsa;
7374 struct elk_wm_prog_data *wm_prog_data = (void *) shader->prog_data;
7375 const struct shader_info *fs_info =
7376 crocus_get_shader_info(ice, MESA_SHADER_FRAGMENT);
7377 uint32_t dynamic_pb[GENX(3DSTATE_PS_BLEND_length)];
7378 crocus_pack_command(GENX(3DSTATE_PS_BLEND), &dynamic_pb, pb) {
7379 pb.HasWriteableRT = has_writeable_rt(cso_blend, fs_info);
7380 pb.AlphaTestEnable = cso_zsa->cso.alpha_enabled;
7381 pb.ColorBufferBlendEnable = (cso_blend->blend_enables & 1) &&
7382 (!cso_blend->dual_color_blending || wm_prog_data->dual_src_blend);
7383 }
7384 crocus_emit_merge(batch, cso_blend->ps_blend, dynamic_pb,
7385 ARRAY_SIZE(cso_blend->ps_blend));
7386 }
7387 #endif
7388
7389 #if GFX_VER >= 6
7390 if (dirty & CROCUS_DIRTY_GEN6_WM_DEPTH_STENCIL) {
7391
7392 #if GFX_VER >= 8
7393 crocus_emit_cmd(batch, GENX(3DSTATE_WM_DEPTH_STENCIL), wmds) {
7394 set_depth_stencil_bits(ice, &wmds);
7395 }
7396 #else
7397 uint32_t ds_offset;
7398 void *ds_map = stream_state(batch,
7399 sizeof(uint32_t) * GENX(DEPTH_STENCIL_STATE_length),
7400 64, &ds_offset);
7401 _crocus_pack_state(batch, GENX(DEPTH_STENCIL_STATE), ds_map, ds) {
7402 set_depth_stencil_bits(ice, &ds);
7403 }
7404
7405 #if GFX_VER == 6
7406 crocus_emit_cmd(batch, GENX(3DSTATE_CC_STATE_POINTERS), ptr) {
7407 ptr.PointertoDEPTH_STENCIL_STATE = ds_offset;
7408 ptr.DEPTH_STENCIL_STATEChange = true;
7409 }
7410 #else
7411 crocus_emit_cmd(batch, GENX(3DSTATE_DEPTH_STENCIL_STATE_POINTERS), ptr) {
7412 ptr.PointertoDEPTH_STENCIL_STATE = ds_offset;
7413 }
7414 #endif
7415 #endif
7416 }
7417
7418 if (dirty & CROCUS_DIRTY_GEN6_SCISSOR_RECT) {
7419 /* Align to 64-byte boundary as per anv. */
7420 uint32_t scissor_offset;
7421 struct pipe_scissor_state *scissor_map = (void *)
7422 stream_state(batch, sizeof(struct pipe_scissor_state) * ice->state.num_viewports,
7423 64, &scissor_offset);
7424 for (int i = 0; i < ice->state.num_viewports; i++) {
7425 struct pipe_scissor_state scissor;
7426 crocus_fill_scissor_rect(ice, i, &scissor);
7427 scissor_map[i] = scissor;
7428 }
7429
7430 crocus_emit_cmd(batch, GENX(3DSTATE_SCISSOR_STATE_POINTERS), ptr) {
7431 ptr.ScissorRectPointer = scissor_offset;
7432 }
7433 }
7434 #endif
7435
7436 if (dirty & CROCUS_DIRTY_DEPTH_BUFFER) {
7437 struct isl_device *isl_dev = &batch->screen->isl_dev;
7438 #if GFX_VER >= 6
7439 crocus_emit_depth_stall_flushes(batch);
7440 #endif
7441 void *batch_ptr;
7442 struct crocus_resource *zres, *sres;
7443 struct pipe_framebuffer_state *cso = &ice->state.framebuffer;
7444 batch_ptr = crocus_get_command_space(batch, isl_dev->ds.size);
7445
7446 struct isl_view view = {
7447 .base_level = 0,
7448 .levels = 1,
7449 .base_array_layer = 0,
7450 .array_len = 1,
7451 .swizzle = ISL_SWIZZLE_IDENTITY,
7452 };
7453 struct isl_depth_stencil_hiz_emit_info info = {
7454 .view = &view,
7455 .mocs = crocus_mocs(NULL, isl_dev),
7456 };
7457
7458 if (cso->zsbuf) {
7459 crocus_get_depth_stencil_resources(&batch->screen->devinfo, cso->zsbuf->texture, &zres, &sres);
7460 struct crocus_surface *zsbuf = (struct crocus_surface *)cso->zsbuf;
7461 if (zsbuf->align_res) {
7462 zres = (struct crocus_resource *)zsbuf->align_res;
7463 }
7464 view.base_level = cso->zsbuf->u.tex.level;
7465 view.base_array_layer = cso->zsbuf->u.tex.first_layer;
7466 view.array_len = cso->zsbuf->u.tex.last_layer - cso->zsbuf->u.tex.first_layer + 1;
7467
7468 if (zres) {
7469 view.usage |= ISL_SURF_USAGE_DEPTH_BIT;
7470
7471 info.depth_surf = &zres->surf;
7472 info.depth_address = crocus_command_reloc(batch,
7473 (batch_ptr - batch->command.map) + isl_dev->ds.depth_offset,
7474 zres->bo, 0, RELOC_32BIT);
7475
7476 info.mocs = crocus_mocs(zres->bo, isl_dev);
7477 view.format = zres->surf.format;
7478
7479 if (crocus_resource_level_has_hiz(zres, view.base_level)) {
7480 info.hiz_usage = zres->aux.usage;
7481 info.hiz_surf = &zres->aux.surf;
7482 uint64_t hiz_offset = 0;
7483
7484 #if GFX_VER == 6
7485 /* HiZ surfaces on Sandy Bridge technically don't support
7486 * mip-mapping. However, we can fake it by offsetting to the
7487 * first slice of LOD0 in the HiZ surface.
7488 */
7489 isl_surf_get_image_offset_B_tile_sa(&zres->aux.surf,
7490 view.base_level, 0, 0,
7491 &hiz_offset, NULL, NULL);
7492 #endif
7493 info.hiz_address = crocus_command_reloc(batch,
7494 (batch_ptr - batch->command.map) + isl_dev->ds.hiz_offset,
7495 zres->aux.bo, zres->aux.offset + hiz_offset,
7496 RELOC_32BIT);
7497 info.depth_clear_value = crocus_resource_get_clear_color(zres).f32[0];
7498 }
7499 }
7500
7501 #if GFX_VER >= 6
7502 if (sres) {
7503 view.usage |= ISL_SURF_USAGE_STENCIL_BIT;
7504 info.stencil_aux_usage = sres->aux.usage;
7505 info.stencil_surf = &sres->surf;
7506
7507 uint64_t stencil_offset = 0;
7508 #if GFX_VER == 6
7509 /* Stencil surfaces on Sandy Bridge technically don't support
7510 * mip-mapping. However, we can fake it by offsetting to the
7511 * first slice of LOD0 in the stencil surface.
7512 */
7513 isl_surf_get_image_offset_B_tile_sa(&sres->surf,
7514 view.base_level, 0, 0,
7515 &stencil_offset, NULL, NULL);
7516 #endif
7517
7518 info.stencil_address = crocus_command_reloc(batch,
7519 (batch_ptr - batch->command.map) + isl_dev->ds.stencil_offset,
7520 sres->bo, stencil_offset, RELOC_32BIT);
7521 if (!zres) {
7522 view.format = sres->surf.format;
7523 info.mocs = crocus_mocs(sres->bo, isl_dev);
7524 }
7525 }
7526 #endif
7527 }
7528 isl_emit_depth_stencil_hiz_s(isl_dev, batch_ptr, &info);
7529 }
7530
7531 /* TODO: Disable emitting this until something uses a stipple. */
7532 if (dirty & CROCUS_DIRTY_POLYGON_STIPPLE) {
7533 crocus_emit_cmd(batch, GENX(3DSTATE_POLY_STIPPLE_PATTERN), poly) {
7534 for (int i = 0; i < 32; i++) {
7535 poly.PatternRow[i] = ice->state.poly_stipple.stipple[i];
7536 }
7537 }
7538 }
7539
7540 if (dirty & CROCUS_DIRTY_LINE_STIPPLE) {
7541 struct crocus_rasterizer_state *cso = ice->state.cso_rast;
7542 crocus_batch_emit(batch, cso->line_stipple, sizeof(cso->line_stipple));
7543 }
7544
7545 #if GFX_VER >= 8
7546 if (dirty & CROCUS_DIRTY_GEN8_VF_TOPOLOGY) {
7547 crocus_emit_cmd(batch, GENX(3DSTATE_VF_TOPOLOGY), topo) {
7548 topo.PrimitiveTopologyType =
7549 translate_prim_type(draw->mode, ice->state.patch_vertices);
7550 }
7551 }
7552 #endif
7553
7554 #if GFX_VER <= 5
7555 if (dirty & CROCUS_DIRTY_GEN5_PIPELINED_POINTERS) {
7556 upload_pipelined_state_pointers(batch, ice->shaders.ff_gs_prog ? true : false, ice->shaders.gs_offset,
7557 ice->shaders.vs_offset, ice->shaders.sf_offset,
7558 ice->shaders.clip_offset, ice->shaders.wm_offset, ice->shaders.cc_offset);
7559 crocus_upload_urb_fence(batch);
7560
7561 crocus_emit_cmd(batch, GENX(CS_URB_STATE), cs) {
7562 cs.NumberofURBEntries = ice->urb.nr_cs_entries;
7563 cs.URBEntryAllocationSize = ice->urb.csize - 1;
7564 }
7565 dirty |= CROCUS_DIRTY_GEN4_CURBE;
7566 }
7567 #endif
7568 if (dirty & CROCUS_DIRTY_DRAWING_RECTANGLE) {
7569 struct pipe_framebuffer_state *fb = &ice->state.framebuffer;
7570 if (fb->width && fb->height) {
7571 crocus_emit_cmd(batch, GENX(3DSTATE_DRAWING_RECTANGLE), rect) {
7572 rect.ClippedDrawingRectangleXMax = fb->width - 1;
7573 rect.ClippedDrawingRectangleYMax = fb->height - 1;
7574 }
7575 }
7576 }
7577
7578 if (dirty & CROCUS_DIRTY_VERTEX_BUFFERS) {
7579 const uint32_t user_count = util_bitcount(ice->state.bound_vertex_buffers);
7580 const uint32_t count = user_count +
7581 ice->state.vs_uses_draw_params + ice->state.vs_uses_derived_draw_params;
7582 uint32_t dynamic_bound = ice->state.bound_vertex_buffers;
7583
7584 if (count) {
7585 const unsigned vb_dwords = GENX(VERTEX_BUFFER_STATE_length);
7586
7587 uint32_t *map =
7588 crocus_get_command_space(batch, 4 * (1 + vb_dwords * count));
7589 _crocus_pack_command(batch, GENX(3DSTATE_VERTEX_BUFFERS), map, vb) {
7590 vb.DWordLength = (vb_dwords * count + 1) - 2;
7591 }
7592 map += 1;
7593
7594 uint32_t bound = dynamic_bound;
7595 int i;
7596 while (bound) {
7597 i = u_bit_scan(&bound);
7598 struct pipe_vertex_buffer *buf = &ice->state.vertex_buffers[i];
7599 struct crocus_bo *bo = crocus_resource_bo(buf->buffer.resource);
7600 uint32_t step_rate = ice->state.cso_vertex_elements->step_rate[i];
7601
7602 emit_vertex_buffer_state(batch, i, bo,
7603 buf->buffer_offset,
7604 ice->state.vb_end[i],
7605 ice->state.cso_vertex_elements->strides[i],
7606 step_rate,
7607 &map);
7608 }
7609 i = user_count;
7610 if (ice->state.vs_uses_draw_params) {
7611 struct crocus_resource *res = (struct crocus_resource *)ice->draw.draw_params.res;
7612 emit_vertex_buffer_state(batch, i++,
7613 res->bo,
7614 ice->draw.draw_params.offset,
7615 ice->draw.draw_params.res->width0,
7616 0, 0, &map);
7617 }
7618 if (ice->state.vs_uses_derived_draw_params) {
7619 struct crocus_resource *res = (struct crocus_resource *)ice->draw.derived_draw_params.res;
7620 emit_vertex_buffer_state(batch, i++,
7621 res->bo,
7622 ice->draw.derived_draw_params.offset,
7623 ice->draw.derived_draw_params.res->width0,
7624 0, 0, &map);
7625 }
7626 }
7627 }
7628
7629 if (dirty & CROCUS_DIRTY_VERTEX_ELEMENTS) {
7630 struct crocus_vertex_element_state *cso = ice->state.cso_vertex_elements;
7631 const unsigned entries = MAX2(cso->count, 1);
7632 if (!(ice->state.vs_needs_sgvs_element ||
7633 ice->state.vs_uses_derived_draw_params ||
7634 ice->state.vs_needs_edge_flag)) {
7635 crocus_batch_emit(batch, cso->vertex_elements, sizeof(uint32_t) *
7636 (1 + entries * GENX(VERTEX_ELEMENT_STATE_length)));
7637 } else {
7638 uint32_t dynamic_ves[1 + 33 * GENX(VERTEX_ELEMENT_STATE_length)];
7639 const unsigned dyn_count = cso->count +
7640 ice->state.vs_needs_sgvs_element +
7641 ice->state.vs_uses_derived_draw_params;
7642
7643 crocus_pack_command(GENX(3DSTATE_VERTEX_ELEMENTS),
7644 &dynamic_ves, ve) {
7645 ve.DWordLength =
7646 1 + GENX(VERTEX_ELEMENT_STATE_length) * dyn_count - 2;
7647 }
7648 memcpy(&dynamic_ves[1], &cso->vertex_elements[1],
7649 (cso->count - ice->state.vs_needs_edge_flag) *
7650 GENX(VERTEX_ELEMENT_STATE_length) * sizeof(uint32_t));
7651 uint32_t *ve_pack_dest =
7652 &dynamic_ves[1 + (cso->count - ice->state.vs_needs_edge_flag) *
7653 GENX(VERTEX_ELEMENT_STATE_length)];
7654
7655 if (ice->state.vs_needs_sgvs_element) {
7656 uint32_t base_ctrl = ice->state.vs_uses_draw_params ?
7657 VFCOMP_STORE_SRC : VFCOMP_STORE_0;
7658 crocus_pack_state(GENX(VERTEX_ELEMENT_STATE), ve_pack_dest, ve) {
7659 ve.Valid = true;
7660 ve.VertexBufferIndex =
7661 util_bitcount64(ice->state.bound_vertex_buffers);
7662 ve.SourceElementFormat = ISL_FORMAT_R32G32_UINT;
7663 ve.Component0Control = base_ctrl;
7664 ve.Component1Control = base_ctrl;
7665 #if GFX_VER < 8
7666 ve.Component2Control = ice->state.vs_uses_vertexid ? VFCOMP_STORE_VID : VFCOMP_STORE_0;
7667 ve.Component3Control = ice->state.vs_uses_instanceid ? VFCOMP_STORE_IID : VFCOMP_STORE_0;
7668 #else
7669 ve.Component2Control = VFCOMP_STORE_0;
7670 ve.Component3Control = VFCOMP_STORE_0;
7671 #endif
7672 #if GFX_VER < 5
7673 ve.DestinationElementOffset = cso->count * 4;
7674 #endif
7675 }
7676 ve_pack_dest += GENX(VERTEX_ELEMENT_STATE_length);
7677 }
7678 if (ice->state.vs_uses_derived_draw_params) {
7679 crocus_pack_state(GENX(VERTEX_ELEMENT_STATE), ve_pack_dest, ve) {
7680 ve.Valid = true;
7681 ve.VertexBufferIndex =
7682 util_bitcount64(ice->state.bound_vertex_buffers) +
7683 ice->state.vs_uses_draw_params;
7684 ve.SourceElementFormat = ISL_FORMAT_R32G32_UINT;
7685 ve.Component0Control = VFCOMP_STORE_SRC;
7686 ve.Component1Control = VFCOMP_STORE_SRC;
7687 ve.Component2Control = VFCOMP_STORE_0;
7688 ve.Component3Control = VFCOMP_STORE_0;
7689 #if GFX_VER < 5
7690 ve.DestinationElementOffset = (cso->count + ice->state.vs_needs_sgvs_element) * 4;
7691 #endif
7692 }
7693 ve_pack_dest += GENX(VERTEX_ELEMENT_STATE_length);
7694 }
7695 if (ice->state.vs_needs_edge_flag) {
7696 for (int i = 0; i < GENX(VERTEX_ELEMENT_STATE_length); i++)
7697 ve_pack_dest[i] = cso->edgeflag_ve[i];
7698 }
7699
7700 crocus_batch_emit(batch, &dynamic_ves, sizeof(uint32_t) *
7701 (1 + dyn_count * GENX(VERTEX_ELEMENT_STATE_length)));
7702 }
7703
7704 #if GFX_VER == 8
7705 if (!ice->state.vs_needs_edge_flag) {
7706 crocus_batch_emit(batch, cso->vf_instancing, sizeof(uint32_t) *
7707 entries * GENX(3DSTATE_VF_INSTANCING_length));
7708 } else {
7709 assert(cso->count > 0);
7710 const unsigned edgeflag_index = cso->count - 1;
7711 uint32_t dynamic_vfi[33 * GENX(3DSTATE_VF_INSTANCING_length)];
7712 memcpy(&dynamic_vfi[0], cso->vf_instancing, edgeflag_index *
7713 GENX(3DSTATE_VF_INSTANCING_length) * sizeof(uint32_t));
7714
7715 uint32_t *vfi_pack_dest = &dynamic_vfi[0] +
7716 edgeflag_index * GENX(3DSTATE_VF_INSTANCING_length);
7717 crocus_pack_command(GENX(3DSTATE_VF_INSTANCING), vfi_pack_dest, vi) {
7718 vi.VertexElementIndex = edgeflag_index +
7719 ice->state.vs_needs_sgvs_element +
7720 ice->state.vs_uses_derived_draw_params;
7721 }
7722 for (int i = 0; i < GENX(3DSTATE_VF_INSTANCING_length); i++)
7723 vfi_pack_dest[i] |= cso->edgeflag_vfi[i];
7724
7725 crocus_batch_emit(batch, &dynamic_vfi[0], sizeof(uint32_t) *
7726 entries * GENX(3DSTATE_VF_INSTANCING_length));
7727 }
7728 #endif
7729 }
7730
7731 #if GFX_VER == 8
7732 if (dirty & CROCUS_DIRTY_GEN8_VF_SGVS) {
7733 const struct elk_vs_prog_data *vs_prog_data = (void *)
7734 ice->shaders.prog[MESA_SHADER_VERTEX]->prog_data;
7735 struct crocus_vertex_element_state *cso = ice->state.cso_vertex_elements;
7736
7737 crocus_emit_cmd(batch, GENX(3DSTATE_VF_SGVS), sgv) {
7738 if (vs_prog_data->uses_vertexid) {
7739 sgv.VertexIDEnable = true;
7740 sgv.VertexIDComponentNumber = 2;
7741 sgv.VertexIDElementOffset =
7742 cso->count - ice->state.vs_needs_edge_flag;
7743 }
7744
7745 if (vs_prog_data->uses_instanceid) {
7746 sgv.InstanceIDEnable = true;
7747 sgv.InstanceIDComponentNumber = 3;
7748 sgv.InstanceIDElementOffset =
7749 cso->count - ice->state.vs_needs_edge_flag;
7750 }
7751 }
7752 }
7753 #endif
7754 #if GFX_VERx10 >= 75
7755 if (dirty & CROCUS_DIRTY_GEN75_VF) {
7756 crocus_emit_cmd(batch, GENX(3DSTATE_VF), vf) {
7757 if (draw->primitive_restart) {
7758 vf.IndexedDrawCutIndexEnable = true;
7759 vf.CutIndex = draw->restart_index;
7760 }
7761 }
7762 }
7763 #endif
7764
7765 #if GFX_VER == 8
7766 if (dirty & CROCUS_DIRTY_GEN8_PMA_FIX) {
7767 bool enable = want_pma_fix(ice);
7768 genX(crocus_update_pma_fix)(ice, batch, enable);
7769 }
7770 #endif
7771
7772 #if GFX_VER <= 5
7773 if (dirty & CROCUS_DIRTY_GEN4_CURBE) {
7774 gen4_upload_curbe(batch);
7775 }
7776 #endif
7777 }
7778
/**
 * Upload render state for a single draw and emit the 3DPRIMITIVE command.
 *
 * Re-uploads any dirty state, (re-)binds the index buffer when needed,
 * programs the indirect-draw parameter registers on GFX_VER >= 7 —
 * including the MI_PREDICATE setup used to skip trailing draws of a
 * multi-draw-indirect with a GPU-side draw count — and finally emits
 * the 3DPRIMITIVE itself.
 */
static void
crocus_upload_render_state(struct crocus_context *ice,
                           struct crocus_batch *batch,
                           const struct pipe_draw_info *draw,
                           unsigned drawid_offset,
                           const struct pipe_draw_indirect_info *indirect,
                           const struct pipe_draw_start_count_bias *sc)
{
#if GFX_VER >= 7
   bool use_predicate = ice->state.predicate == CROCUS_PREDICATE_STATE_USE_BIT;
#endif

   /* The dirty-state upload below must not be split across a batch wrap. */
   batch->no_wrap = true;
   batch->contains_draw = true;

   crocus_update_surface_base_address(batch);

   crocus_upload_dirty_render_state(ice, batch, draw);

   batch->no_wrap = false;
   if (draw->index_size > 0) {
      unsigned offset;
      unsigned size;
      bool emit_index = false;

      if (draw->has_user_indices) {
         /* Copy user-space indices into a GPU-visible upload buffer.  The
          * returned offset is biased back by start_offset so that
          * prim.StartVertexLocation (sc->start) addresses the buffer the
          * same way it addressed the user pointer.
          */
         unsigned start_offset = draw->index_size * sc->start;
         u_upload_data(ice->ctx.stream_uploader, 0,
                       sc->count * draw->index_size, 4,
                       (char *)draw->index.user + start_offset,
                       &offset, &ice->state.index_buffer.res);
         offset -= start_offset;
         size = start_offset + sc->count * draw->index_size;
         emit_index = true;
      } else {
         struct crocus_resource *res = (void *) draw->index.resource;

         /* Only re-emit 3DSTATE_INDEX_BUFFER when the bound resource
          * actually changes.
          */
         if (ice->state.index_buffer.res != draw->index.resource) {
            res->bind_history |= PIPE_BIND_INDEX_BUFFER;
            pipe_resource_reference(&ice->state.index_buffer.res,
                                    draw->index.resource);
            emit_index = true;
         }
         offset = 0;
         size = draw->index.resource->width0;
      }

      /* Even with the same buffer bound, a changed size, index format, or
       * (pre-Haswell, where it lives in 3DSTATE_INDEX_BUFFER) primitive
       * restart setting forces a re-emit.
       */
      if (!emit_index &&
          (ice->state.index_buffer.size != size ||
           ice->state.index_buffer.index_size != draw->index_size
#if GFX_VERx10 < 75
           || ice->state.index_buffer.prim_restart != draw->primitive_restart
#endif
           )
          )
         emit_index = true;

      if (emit_index) {
         struct crocus_bo *bo = crocus_resource_bo(ice->state.index_buffer.res);

         crocus_emit_cmd(batch, GENX(3DSTATE_INDEX_BUFFER), ib) {
#if GFX_VERx10 < 75
            /* Pre-Haswell, cut-index (primitive restart) is part of the
             * index buffer state rather than 3DSTATE_VF.
             */
            ib.CutIndexEnable = draw->primitive_restart;
#endif
            /* 1/2/4-byte index sizes map to hardware format codes 0/1/2. */
            ib.IndexFormat = draw->index_size >> 1;
            ib.BufferStartingAddress = ro_bo(bo, offset);
#if GFX_VER >= 8
            ib.BufferSize = bo->size - offset;
#else
            ib.BufferEndingAddress = ro_bo(bo, offset + size - 1);
#endif
#if GFX_VER >= 6
            ib.MOCS = crocus_mocs(bo, &batch->screen->isl_dev);
#endif
         }
         /* Cache what was emitted so later draws can skip the re-emit. */
         ice->state.index_buffer.size = size;
         ice->state.index_buffer.offset = offset;
         ice->state.index_buffer.index_size = draw->index_size;
#if GFX_VERx10 < 75
         ice->state.index_buffer.prim_restart = draw->primitive_restart;
#endif
      }
   }

   /* MMIO registers holding the 3DPRIMITIVE parameters for indirect draws. */
#define _3DPRIM_END_OFFSET          0x2420
#define _3DPRIM_START_VERTEX        0x2430
#define _3DPRIM_VERTEX_COUNT        0x2434
#define _3DPRIM_INSTANCE_COUNT      0x2438
#define _3DPRIM_START_INSTANCE      0x243C
#define _3DPRIM_BASE_VERTEX         0x2440

#if GFX_VER >= 7
   if (indirect && !indirect->count_from_stream_output) {
      if (indirect->indirect_draw_count) {
         /* Multi-draw-indirect with a GPU-side draw count: predicate away
          * draws whose index is >= the draw count.
          */
         use_predicate = true;

         struct crocus_bo *draw_count_bo =
            crocus_resource_bo(indirect->indirect_draw_count);
         unsigned draw_count_offset =
            indirect->indirect_draw_count_offset;

         crocus_emit_pipe_control_flush(batch,
                                        "ensure indirect draw buffer is flushed",
                                        PIPE_CONTROL_FLUSH_ENABLE);
         if (ice->state.predicate == CROCUS_PREDICATE_STATE_USE_BIT) {
#if GFX_VERx10 >= 75
            /* Conditional rendering is active: AND the draw-count test
             * with the conditional-rendering predicate kept in CS_GPR(15).
             */
            struct mi_builder b;
            mi_builder_init(&b, &batch->screen->devinfo, batch);

            /* comparison = draw id < draw count */
            struct mi_value comparison =
               mi_ult(&b, mi_imm(drawid_offset),
                      mi_mem32(ro_bo(draw_count_bo,
                                     draw_count_offset)));
#if GFX_VER == 8
            /* predicate = comparison & conditional rendering predicate */
            mi_store(&b, mi_reg32(MI_PREDICATE_RESULT),
                     mi_iand(&b, comparison, mi_reg32(CS_GPR(15))));
#else
            /* predicate = comparison & conditional rendering predicate */
            struct mi_value pred = mi_iand(&b, comparison,
                                           mi_reg32(CS_GPR(15)));

            mi_store(&b, mi_reg64(MI_PREDICATE_SRC0), pred);
            mi_store(&b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(0));

            unsigned mi_predicate = MI_PREDICATE | MI_PREDICATE_LOADOP_LOADINV |
               MI_PREDICATE_COMBINEOP_SET |
               MI_PREDICATE_COMPAREOP_SRCS_EQUAL;

            crocus_batch_emit(batch, &mi_predicate, sizeof(uint32_t));
#endif
#endif
         } else {
            uint32_t mi_predicate;

            /* Upload the id of the current primitive to MI_PREDICATE_SRC1. */
            crocus_load_register_imm64(batch, MI_PREDICATE_SRC1, drawid_offset);
            /* Upload the current draw count from the draw parameters buffer
             * to MI_PREDICATE_SRC0.
             */
            crocus_load_register_mem32(batch, MI_PREDICATE_SRC0,
                                       draw_count_bo, draw_count_offset);
            /* Zero the top 32-bits of MI_PREDICATE_SRC0 */
            crocus_load_register_imm32(batch, MI_PREDICATE_SRC0 + 4, 0);

            if (drawid_offset == 0) {
               /* First draw: predicate is simply (draw_count != 0). */
               mi_predicate = MI_PREDICATE | MI_PREDICATE_LOADOP_LOADINV |
                  MI_PREDICATE_COMBINEOP_SET |
                  MI_PREDICATE_COMPAREOP_SRCS_EQUAL;
            } else {
               /* While draw_index < draw_count the predicate's result will be
                *  (draw_index == draw_count) ^ TRUE = TRUE
                * When draw_index == draw_count the result is
                *  (TRUE) ^ TRUE = FALSE
                * After this all results will be:
                *  (FALSE) ^ FALSE = FALSE
                */
               mi_predicate = MI_PREDICATE | MI_PREDICATE_LOADOP_LOAD |
                  MI_PREDICATE_COMBINEOP_XOR |
                  MI_PREDICATE_COMPAREOP_SRCS_EQUAL;
            }
            crocus_batch_emit(batch, &mi_predicate, sizeof(uint32_t));
         }
      }

#if GFX_VER >= 7
      /* Load the 3DPRIMITIVE parameters from the indirect buffer.  The
       * dword layout differs for indexed vs. non-indexed draws (the
       * indexed layout carries an extra base-vertex field).
       */
      struct crocus_bo *bo = crocus_resource_bo(indirect->buffer);
      assert(bo);

      crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
         lrm.RegisterAddress = _3DPRIM_VERTEX_COUNT;
         lrm.MemoryAddress = ro_bo(bo, indirect->offset + 0);
      }
      crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
         lrm.RegisterAddress = _3DPRIM_INSTANCE_COUNT;
         lrm.MemoryAddress = ro_bo(bo, indirect->offset + 4);
      }
      crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
         lrm.RegisterAddress = _3DPRIM_START_VERTEX;
         lrm.MemoryAddress = ro_bo(bo, indirect->offset + 8);
      }
      if (draw->index_size) {
         crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
            lrm.RegisterAddress = _3DPRIM_BASE_VERTEX;
            lrm.MemoryAddress = ro_bo(bo, indirect->offset + 12);
         }
         crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
            lrm.RegisterAddress = _3DPRIM_START_INSTANCE;
            lrm.MemoryAddress = ro_bo(bo, indirect->offset + 16);
         }
      } else {
         crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
            lrm.RegisterAddress = _3DPRIM_START_INSTANCE;
            lrm.MemoryAddress = ro_bo(bo, indirect->offset + 12);
         }
         /* Non-indexed draws have no base vertex; force it to zero. */
         crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
            lri.RegisterOffset = _3DPRIM_BASE_VERTEX;
            lri.DataDWord = 0;
         }
      }
#endif
   } else if (indirect && indirect->count_from_stream_output) {
#if GFX_VERx10 >= 75
      /* Derive the vertex count from the stream-output write offset:
       * (so_offset - buffer_offset) / stride.
       */
      struct crocus_stream_output_target *so =
         (void *) indirect->count_from_stream_output;

      /* XXX: Replace with actual cache tracking */
      crocus_emit_pipe_control_flush(batch,
                                     "draw count from stream output stall",
                                     PIPE_CONTROL_CS_STALL);

      struct mi_builder b;
      mi_builder_init(&b, &batch->screen->devinfo, batch);

      struct crocus_address addr =
         ro_bo(crocus_resource_bo(&so->offset_res->base.b), so->offset_offset);
      struct mi_value offset =
         mi_iadd_imm(&b, mi_mem32(addr), -so->base.buffer_offset);

      mi_store(&b, mi_reg32(_3DPRIM_VERTEX_COUNT),
               mi_udiv32_imm(&b, offset, so->stride));

      _crocus_emit_lri(batch, _3DPRIM_START_VERTEX, 0);
      _crocus_emit_lri(batch, _3DPRIM_BASE_VERTEX, 0);
      _crocus_emit_lri(batch, _3DPRIM_START_INSTANCE, 0);
      _crocus_emit_lri(batch, _3DPRIM_INSTANCE_COUNT, draw->instance_count);
#endif
   }
#else
   assert(!indirect);
#endif

   crocus_emit_cmd(batch, GENX(3DPRIMITIVE), prim) {
      prim.VertexAccessType = draw->index_size > 0 ? RANDOM : SEQUENTIAL;
#if GFX_VER >= 7
      prim.PredicateEnable = use_predicate;
#endif

      prim.PrimitiveTopologyType = translate_prim_type(ice->state.prim_mode, ice->state.patch_vertices);
      if (indirect) {
         /* Parameters come from the registers programmed above. */
         // XXX Probably have to do something for gen6 here?
#if GFX_VER >= 7
         prim.IndirectParameterEnable = true;
#endif
      } else {
#if GFX_VER >= 5
         prim.StartInstanceLocation = draw->start_instance;
#endif
         prim.InstanceCount = draw->instance_count;
         prim.VertexCountPerInstance = sc->count;

         prim.StartVertexLocation = sc->start;

         if (draw->index_size) {
            prim.BaseVertexLocation += sc->index_bias;
         }
      }
   }
}
8039
8040 #if GFX_VER >= 7
8041
/**
 * Upload compute state and emit a GPGPU_WALKER for a grid launch
 * (GFX_VER >= 7 only).
 *
 * Uploads, as dictated by the dirty bits: the binding table, sampler
 * states, MEDIA_VFE_STATE, the CURBE push-constant buffer, and the
 * interface descriptor — then programs the dispatch dimensions (directly
 * or from an indirect buffer) and walks the grid.
 */
static void
crocus_upload_compute_state(struct crocus_context *ice,
                            struct crocus_batch *batch,
                            const struct pipe_grid_info *grid)
{
   const uint64_t stage_dirty = ice->state.stage_dirty;
   struct crocus_screen *screen = batch->screen;
   const struct intel_device_info *devinfo = &screen->devinfo;
   struct crocus_shader_state *shs = &ice->state.shaders[MESA_SHADER_COMPUTE];
   struct crocus_compiled_shader *shader =
      ice->shaders.prog[MESA_SHADER_COMPUTE];
   struct elk_stage_prog_data *prog_data = shader->prog_data;
   struct elk_cs_prog_data *cs_prog_data = (void *) prog_data;
   const struct intel_cs_dispatch_info dispatch =
      elk_cs_get_dispatch_info(devinfo, cs_prog_data, grid->block);

   crocus_update_surface_base_address(batch);
   if ((stage_dirty & CROCUS_STAGE_DIRTY_CONSTANTS_CS) && shs->sysvals_need_upload)
      upload_sysvals(ice, MESA_SHADER_COMPUTE);

   if (stage_dirty & CROCUS_STAGE_DIRTY_BINDINGS_CS) {
      crocus_populate_binding_table(ice, batch, MESA_SHADER_COMPUTE, false);
      ice->shaders.prog[MESA_SHADER_COMPUTE]->bind_bo_offset =
         crocus_upload_binding_table(ice, batch,
                                     ice->shaders.prog[MESA_SHADER_COMPUTE]->surf_offset,
                                     ice->shaders.prog[MESA_SHADER_COMPUTE]->bt.size_bytes);
   }

   if (stage_dirty & CROCUS_STAGE_DIRTY_SAMPLER_STATES_CS)
      crocus_upload_sampler_states(ice, batch, MESA_SHADER_COMPUTE);

   if ((stage_dirty & CROCUS_STAGE_DIRTY_CS) ||
       cs_prog_data->local_size[0] == 0 /* Variable local group size */) {
      /* The MEDIA_VFE_STATE documentation for Gen8+ says:
       *
       *   "A stalling PIPE_CONTROL is required before MEDIA_VFE_STATE unless
       *    the only bits that are changed are scoreboard related: Scoreboard
       *    Enable, Scoreboard Type, Scoreboard Mask, Scoreboard Delta.  For
       *    these scoreboard related states, a MEDIA_STATE_FLUSH is
       *    sufficient."
       */
      crocus_emit_pipe_control_flush(batch,
                                     "workaround: stall before MEDIA_VFE_STATE",
                                     PIPE_CONTROL_CS_STALL);

      crocus_emit_cmd(batch, GENX(MEDIA_VFE_STATE), vfe) {
         if (prog_data->total_scratch) {
            struct crocus_bo *bo =
               crocus_get_scratch_space(ice, prog_data->total_scratch,
                                        MESA_SHADER_COMPUTE);
#if GFX_VER == 8
            /* Broadwell's Per Thread Scratch Space is in the range [0, 11]
             * where 0 = 1k, 1 = 2k, 2 = 4k, ..., 11 = 2M.
             */
            vfe.PerThreadScratchSpace = ffs(prog_data->total_scratch) - 11;
#elif GFX_VERx10 == 75
            /* Haswell's Per Thread Scratch Space is in the range [0, 10]
             * where 0 = 2k, 1 = 4k, 2 = 8k, ..., 10 = 2M.
             */
            vfe.PerThreadScratchSpace = ffs(prog_data->total_scratch) - 12;
#else
            /* Earlier platforms use the range [0, 11] to mean [1kB, 12kB]
             * where 0 = 1kB, 1 = 2kB, 2 = 3kB, ..., 11 = 12kB.
             */
            vfe.PerThreadScratchSpace = prog_data->total_scratch / 1024 - 1;
#endif
            vfe.ScratchSpaceBasePointer = rw_bo(bo, 0);
         }

         vfe.MaximumNumberofThreads =
            devinfo->max_cs_threads * devinfo->subslice_total - 1;
         vfe.ResetGatewayTimer =
            Resettingrelativetimerandlatchingtheglobaltimestamp;
         vfe.BypassGatewayControl = true;
#if GFX_VER == 7
         vfe.GPGPUMode = true;
#endif
#if GFX_VER == 8
         /* NOTE(review): redundant with the unconditional assignment a few
          * lines above — harmless, but looks removable.  Verify against
          * upstream before cleaning up.
          */
         vfe.BypassGatewayControl = true;
#endif
         /* 2 URB entries of size 2 on Gen8; 0 on earlier generations. */
         vfe.NumberofURBEntries = GFX_VER == 8 ? 2 : 0;
         vfe.URBEntryAllocationSize = GFX_VER == 8 ? 2 : 0;

         vfe.CURBEAllocationSize =
            ALIGN(cs_prog_data->push.per_thread.regs * dispatch.threads +
                  cs_prog_data->push.cross_thread.regs, 2);
      }
   }

   /* TODO: Combine subgroup-id with cbuf0 so we can push regular uniforms */
   if ((stage_dirty & CROCUS_STAGE_DIRTY_CS) ||
       cs_prog_data->local_size[0] == 0 /* Variable local group size */) {
      /* Upload the CURBE (push constant) data.  The only pushed value is
       * the per-thread subgroup ID, as the assert below pins down.
       */
      uint32_t curbe_data_offset = 0;
      assert(cs_prog_data->push.cross_thread.dwords == 0 &&
             cs_prog_data->push.per_thread.dwords == 1 &&
             cs_prog_data->base.param[0] == ELK_PARAM_BUILTIN_SUBGROUP_ID);
      const unsigned push_const_size =
         elk_cs_push_const_total_size(cs_prog_data, dispatch.threads);
      uint32_t *curbe_data_map =
         stream_state(batch,
                      ALIGN(push_const_size, 64), 64,
                      &curbe_data_offset);
      assert(curbe_data_map);
      /* 0x5a poison makes uninitialized push-constant reads recognizable. */
      memset(curbe_data_map, 0x5a, ALIGN(push_const_size, 64));
      crocus_fill_cs_push_const_buffer(cs_prog_data, dispatch.threads,
                                       curbe_data_map);

      crocus_emit_cmd(batch, GENX(MEDIA_CURBE_LOAD), curbe) {
         curbe.CURBETotalDataLength = ALIGN(push_const_size, 64);
         curbe.CURBEDataStartAddress = curbe_data_offset;
      }
   }

   /* Re-upload the interface descriptor if anything it references
    * (samplers, bindings, constants, or the shader itself) changed.
    */
   if (stage_dirty & (CROCUS_STAGE_DIRTY_SAMPLER_STATES_CS |
                      CROCUS_STAGE_DIRTY_BINDINGS_CS |
                      CROCUS_STAGE_DIRTY_CONSTANTS_CS |
                      CROCUS_STAGE_DIRTY_CS)) {
      uint32_t desc[GENX(INTERFACE_DESCRIPTOR_DATA_length)];
      /* Kernel start pointer for the SIMD width chosen by dispatch setup. */
      const uint64_t ksp = KSP(ice,shader) + elk_cs_prog_data_prog_offset(cs_prog_data, dispatch.simd_size);
      crocus_pack_state(GENX(INTERFACE_DESCRIPTOR_DATA), desc, idd) {
         idd.KernelStartPointer = ksp;
         idd.SamplerStatePointer = shs->sampler_offset;
         idd.BindingTablePointer = ice->shaders.prog[MESA_SHADER_COMPUTE]->bind_bo_offset;
         idd.BindingTableEntryCount = MIN2(shader->bt.size_bytes / 4, 31);
         idd.NumberofThreadsinGPGPUThreadGroup = dispatch.threads;
         idd.ConstantURBEntryReadLength = cs_prog_data->push.per_thread.regs;
         idd.BarrierEnable = cs_prog_data->uses_barrier;
         idd.SharedLocalMemorySize = intel_compute_slm_encode_size(GFX_VER,
                                                                   prog_data->total_shared);
#if GFX_VERx10 >= 75
         idd.CrossThreadConstantDataReadLength = cs_prog_data->push.cross_thread.regs;
#endif
      }

      crocus_emit_cmd(batch, GENX(MEDIA_INTERFACE_DESCRIPTOR_LOAD), load) {
         load.InterfaceDescriptorTotalLength =
            GENX(INTERFACE_DESCRIPTOR_DATA_length) * sizeof(uint32_t);
         load.InterfaceDescriptorDataStartAddress =
            emit_state(batch, desc, sizeof(desc), 64);
      }
   }

   /* MMIO registers holding the GPGPU_WALKER dispatch dimensions. */
#define GPGPU_DISPATCHDIMX 0x2500
#define GPGPU_DISPATCHDIMY 0x2504
#define GPGPU_DISPATCHDIMZ 0x2508

   if (grid->indirect) {
      /* Load the dispatch dimensions from the indirect buffer. */
      struct crocus_state_ref *grid_size = &ice->state.grid_size;
      struct crocus_bo *bo = crocus_resource_bo(grid_size->res);
      crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
         lrm.RegisterAddress = GPGPU_DISPATCHDIMX;
         lrm.MemoryAddress = ro_bo(bo, grid_size->offset + 0);
      }
      crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
         lrm.RegisterAddress = GPGPU_DISPATCHDIMY;
         lrm.MemoryAddress = ro_bo(bo, grid_size->offset + 4);
      }
      crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
         lrm.RegisterAddress = GPGPU_DISPATCHDIMZ;
         lrm.MemoryAddress = ro_bo(bo, grid_size->offset + 8);
      }

#if GFX_VER == 7
      /* Build a predicate that skips the walker entirely when any of the
       * indirect dispatch dimensions is zero.
       */
      /* Clear upper 32-bits of SRC0 and all 64-bits of SRC1 */
      _crocus_emit_lri(batch, MI_PREDICATE_SRC0 + 4, 0);
      crocus_load_register_imm64(batch, MI_PREDICATE_SRC1, 0);

      /* Load compute_dispatch_indirect_x_size into SRC0 */
      crocus_load_register_mem32(batch, MI_PREDICATE_SRC0, bo, grid_size->offset + 0);

      /* predicate = (compute_dispatch_indirect_x_size == 0); */
      crocus_emit_cmd(batch, GENX(MI_PREDICATE), mip) {
         mip.LoadOperation = LOAD_LOAD;
         mip.CombineOperation = COMBINE_SET;
         mip.CompareOperation = COMPARE_SRCS_EQUAL;
      };

      /* Load compute_dispatch_indirect_y_size into SRC0 */
      crocus_load_register_mem32(batch, MI_PREDICATE_SRC0, bo, grid_size->offset + 4);

      /* predicate = (compute_dispatch_indirect_y_size == 0); */
      crocus_emit_cmd(batch, GENX(MI_PREDICATE), mip) {
         mip.LoadOperation = LOAD_LOAD;
         mip.CombineOperation = COMBINE_OR;
         mip.CompareOperation = COMPARE_SRCS_EQUAL;
      };

      /* Load compute_dispatch_indirect_z_size into SRC0 */
      crocus_load_register_mem32(batch, MI_PREDICATE_SRC0, bo, grid_size->offset + 8);

      /* predicate = (compute_dispatch_indirect_z_size == 0); */
      crocus_emit_cmd(batch, GENX(MI_PREDICATE), mip) {
         mip.LoadOperation = LOAD_LOAD;
         mip.CombineOperation = COMBINE_OR;
         mip.CompareOperation = COMPARE_SRCS_EQUAL;
      };

      /* predicate = !predicate; */
#define COMPARE_FALSE 1
      crocus_emit_cmd(batch, GENX(MI_PREDICATE), mip) {
         mip.LoadOperation = LOAD_LOADINV;
         mip.CombineOperation = COMBINE_OR;
         mip.CompareOperation = COMPARE_FALSE;
      }
#endif
   }

   crocus_emit_cmd(batch, GENX(GPGPU_WALKER), ggw) {
      ggw.IndirectParameterEnable = grid->indirect != NULL;
      /* Only Gen7 sets up the zero-dimension predicate above. */
      ggw.PredicateEnable = GFX_VER <= 7 && grid->indirect != NULL;
      ggw.SIMDSize = dispatch.simd_size / 16;
      ggw.ThreadDepthCounterMaximum = 0;
      ggw.ThreadHeightCounterMaximum = 0;
      ggw.ThreadWidthCounterMaximum = dispatch.threads - 1;
      ggw.ThreadGroupIDXDimension = grid->grid[0];
      ggw.ThreadGroupIDYDimension = grid->grid[1];
      ggw.ThreadGroupIDZDimension = grid->grid[2];
      ggw.RightExecutionMask = dispatch.right_mask;
      ggw.BottomExecutionMask = 0xffffffff;
   }

   crocus_emit_cmd(batch, GENX(MEDIA_STATE_FLUSH), msf);

   batch->contains_draw = true;
}
8267
8268 #endif /* GFX_VER >= 7 */
8269
8270 /**
8271 * State module teardown.
8272 */
8273 static void
8274 crocus_destroy_state(struct crocus_context *ice)
8275 {
8276 struct pipe_framebuffer_state *cso = &ice->state.framebuffer;
8277
8278 pipe_resource_reference(&ice->draw.draw_params.res, NULL);
8279 pipe_resource_reference(&ice->draw.derived_draw_params.res, NULL);
8280
8281 free(ice->state.genx);
8282
8283 for (int i = 0; i < 4; i++) {
8284 pipe_so_target_reference(&ice->state.so_target[i], NULL);
8285 }
8286
8287 util_unreference_framebuffer_state(cso);
8288
8289 for (int stage = 0; stage < MESA_SHADER_STAGES; stage++) {
8290 struct crocus_shader_state *shs = &ice->state.shaders[stage];
8291 for (int i = 0; i < PIPE_MAX_CONSTANT_BUFFERS; i++) {
8292 pipe_resource_reference(&shs->constbufs[i].buffer, NULL);
8293 }
8294 for (int i = 0; i < PIPE_MAX_SHADER_IMAGES; i++) {
8295 pipe_resource_reference(&shs->image[i].base.resource, NULL);
8296 }
8297 for (int i = 0; i < PIPE_MAX_SHADER_BUFFERS; i++) {
8298 pipe_resource_reference(&shs->ssbo[i].buffer, NULL);
8299 }
8300 for (int i = 0; i < CROCUS_MAX_TEXTURE_SAMPLERS; i++) {
8301 pipe_sampler_view_reference((struct pipe_sampler_view **)
8302 &shs->textures[i], NULL);
8303 }
8304 }
8305
8306 for (int i = 0; i < 16; i++)
8307 pipe_resource_reference(&ice->state.vertex_buffers[i].buffer.resource, NULL);
8308 pipe_resource_reference(&ice->state.grid_size.res, NULL);
8309
8310 pipe_resource_reference(&ice->state.index_buffer.res, NULL);
8311 }
8312
8313 /* ------------------------------------------------------------------- */
8314
/**
 * React to a buffer resource's backing BO having changed.
 *
 * Walks everything the buffer's bind_history says it was ever bound to
 * and either flags the relevant dirty bits (so state still pointing at
 * the old BO gets re-emitted) or re-establishes the binding outright.
 */
static void
crocus_rebind_buffer(struct crocus_context *ice,
                     struct crocus_resource *res)
{
   struct pipe_context *ctx = &ice->ctx;

   assert(res->base.b.target == PIPE_BUFFER);

   /* Buffers can't be framebuffer attachments, nor display related,
    * and we don't have upstream Clover support.
    */
   assert(!(res->bind_history & (PIPE_BIND_DEPTH_STENCIL |
                                 PIPE_BIND_RENDER_TARGET |
                                 PIPE_BIND_BLENDABLE |
                                 PIPE_BIND_DISPLAY_TARGET |
                                 PIPE_BIND_CURSOR |
                                 PIPE_BIND_COMPUTE_RESOURCE |
                                 PIPE_BIND_GLOBAL)));

   if (res->bind_history & PIPE_BIND_VERTEX_BUFFER) {
      /* Mark vertex buffer state dirty if any bound VB uses this buffer. */
      uint64_t bound_vbs = ice->state.bound_vertex_buffers;
      while (bound_vbs) {
         const int i = u_bit_scan64(&bound_vbs);
         struct pipe_vertex_buffer *buffer = &ice->state.vertex_buffers[i];

         if (!buffer->is_user_buffer && &res->base.b == buffer->buffer.resource)
            ice->state.dirty |= CROCUS_DIRTY_VERTEX_BUFFERS;
      }
   }

   if ((res->bind_history & PIPE_BIND_INDEX_BUFFER) &&
       ice->state.index_buffer.res) {
      /* Drop the cached index buffer so the next draw re-emits it. */
      if (res->bo == crocus_resource_bo(ice->state.index_buffer.res))
         pipe_resource_reference(&ice->state.index_buffer.res, NULL);
   }
   /* There is no need to handle these:
    * - PIPE_BIND_COMMAND_ARGS_BUFFER (emitted for every indirect draw)
    * - PIPE_BIND_QUERY_BUFFER (no persistent state references)
    */

   if (res->bind_history & PIPE_BIND_STREAM_OUTPUT) {
      /* XXX: be careful about resetting vs appending... */
      for (int i = 0; i < 4; i++) {
         if (ice->state.so_target[i] &&
             (ice->state.so_target[i]->buffer == &res->base.b)) {
            /* Gen6 streams out via the GS binding table; Gen7+ has
             * dedicated SO buffer state.
             */
#if GFX_VER == 6
            ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_GS;
#else
            ice->state.dirty |= CROCUS_DIRTY_GEN7_SO_BUFFERS;
#endif
         }
      }
   }

   /* Per-stage bindings: only visit stages this buffer was bound in. */
   for (int s = MESA_SHADER_VERTEX; s < MESA_SHADER_STAGES; s++) {
      struct crocus_shader_state *shs = &ice->state.shaders[s];
      enum pipe_shader_type p_stage = stage_to_pipe(s);

      if (!(res->bind_stages & (1 << s)))
         continue;

      if (res->bind_history & PIPE_BIND_CONSTANT_BUFFER) {
         /* Skip constant buffer 0, it's for regular uniforms, not UBOs */
         uint32_t bound_cbufs = shs->bound_cbufs & ~1u;
         while (bound_cbufs) {
            const int i = u_bit_scan(&bound_cbufs);
            struct pipe_constant_buffer *cbuf = &shs->constbufs[i];

            if (res->bo == crocus_resource_bo(cbuf->buffer)) {
               ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_VS << s;
            }
         }
      }

      if (res->bind_history & PIPE_BIND_SHADER_BUFFER) {
         uint32_t bound_ssbos = shs->bound_ssbos;
         while (bound_ssbos) {
            const int i = u_bit_scan(&bound_ssbos);
            struct pipe_shader_buffer *ssbo = &shs->ssbo[i];

            if (res->bo == crocus_resource_bo(ssbo->buffer)) {
               /* Re-set the SSBO binding via the normal entry point so it
                * picks up the resource's new BO, preserving the current
                * offset, size, and writability.
                */
               struct pipe_shader_buffer buf = {
                  .buffer = &res->base.b,
                  .buffer_offset = ssbo->buffer_offset,
                  .buffer_size = ssbo->buffer_size,
               };
               crocus_set_shader_buffers(ctx, p_stage, i, 1, &buf,
                                         (shs->writable_ssbos >> i) & 1);
            }
         }
      }

      if (res->bind_history & PIPE_BIND_SAMPLER_VIEW) {
         uint32_t bound_sampler_views = shs->bound_sampler_views;
         while (bound_sampler_views) {
            const int i = u_bit_scan(&bound_sampler_views);
            struct crocus_sampler_view *isv = shs->textures[i];
            struct crocus_bo *bo = isv->res->bo;

            if (res->bo == bo) {
               ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_VS << s;
            }
         }
      }

      if (res->bind_history & PIPE_BIND_SHADER_IMAGE) {
         uint32_t bound_image_views = shs->bound_image_views;
         while (bound_image_views) {
            const int i = u_bit_scan(&bound_image_views);
            struct crocus_image_view *iv = &shs->image[i];
            struct crocus_bo *bo = crocus_resource_bo(iv->base.resource);

            if (res->bo == bo)
               ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_VS << s;
         }
      }
   }
}
8433
8434 /* ------------------------------------------------------------------- */
8435
8436 static unsigned
8437 flags_to_post_sync_op(uint32_t flags)
8438 {
8439 if (flags & PIPE_CONTROL_WRITE_IMMEDIATE)
8440 return WriteImmediateData;
8441
8442 if (flags & PIPE_CONTROL_WRITE_DEPTH_COUNT)
8443 return WritePSDepthCount;
8444
8445 if (flags & PIPE_CONTROL_WRITE_TIMESTAMP)
8446 return WriteTimestamp;
8447
8448 return 0;
8449 }
8450
8451 /*
8452 * Do the given flags have a Post Sync or LRI Post Sync operation?
8453 */
8454 static enum pipe_control_flags
8455 get_post_sync_flags(enum pipe_control_flags flags)
8456 {
8457 flags &= PIPE_CONTROL_WRITE_IMMEDIATE |
8458 PIPE_CONTROL_WRITE_DEPTH_COUNT |
8459 PIPE_CONTROL_WRITE_TIMESTAMP |
8460 PIPE_CONTROL_LRI_POST_SYNC_OP;
8461
8462 /* Only one "Post Sync Op" is allowed, and it's mutually exclusive with
8463 * "LRI Post Sync Operation". So more than one bit set would be illegal.
8464 */
8465 assert(util_bitcount(flags) <= 1);
8466
8467 return flags;
8468 }
8469
8470 #define IS_COMPUTE_PIPELINE(batch) (batch->name == CROCUS_BATCH_COMPUTE)
8471
8472 /**
8473 * Emit a series of PIPE_CONTROL commands, taking into account any
8474 * workarounds necessary to actually accomplish the caller's request.
8475 *
8476 * Unless otherwise noted, spec quotations in this function come from:
8477 *
8478 * Synchronization of the 3D Pipeline > PIPE_CONTROL Command > Programming
8479 * Restrictions for PIPE_CONTROL.
8480 *
8481 * You should not use this function directly. Use the helpers in
8482 * crocus_pipe_control.c instead, which may split the pipe control further.
8483 */
8484 static void
8485 crocus_emit_raw_pipe_control(struct crocus_batch *batch,
8486 const char *reason,
8487 uint32_t flags,
8488 struct crocus_bo *bo,
8489 uint32_t offset,
8490 uint64_t imm)
8491 {
8492 UNUSED const struct intel_device_info *devinfo = &batch->screen->devinfo;
8493 enum pipe_control_flags post_sync_flags = get_post_sync_flags(flags);
8494 UNUSED enum pipe_control_flags non_lri_post_sync_flags =
8495 post_sync_flags & ~PIPE_CONTROL_LRI_POST_SYNC_OP;
8496
8497 /* Recursive PIPE_CONTROL workarounds --------------------------------
8498 * (http://knowyourmeme.com/memes/xzibit-yo-dawg)
8499 *
8500 * We do these first because we want to look at the original operation,
8501 * rather than any workarounds we set.
8502 */
8503
8504 /* "Flush Types" workarounds ---------------------------------------------
8505 * We do these now because they may add post-sync operations or CS stalls.
8506 */
8507
8508 if (GFX_VER == 6 && (flags & PIPE_CONTROL_RENDER_TARGET_FLUSH)) {
8509 /* Hardware workaround: SNB B-Spec says:
8510 *
8511 * "[Dev-SNB{W/A}]: Before a PIPE_CONTROL with Write Cache Flush
8512 * Enable = 1, a PIPE_CONTROL with any non-zero post-sync-op is
8513 * required."
8514 */
8515 crocus_emit_post_sync_nonzero_flush(batch);
8516 }
8517
8518 #if GFX_VER == 8
8519 if (flags & PIPE_CONTROL_VF_CACHE_INVALIDATE) {
8520 /* Project: BDW, SKL+ (stopping at CNL) / Argument: VF Invalidate
8521 *
8522 * "'Post Sync Operation' must be enabled to 'Write Immediate Data' or
8523 * 'Write PS Depth Count' or 'Write Timestamp'."
8524 */
8525 if (!bo) {
8526 flags |= PIPE_CONTROL_WRITE_IMMEDIATE;
8527 post_sync_flags |= PIPE_CONTROL_WRITE_IMMEDIATE;
8528 non_lri_post_sync_flags |= PIPE_CONTROL_WRITE_IMMEDIATE;
8529 bo = batch->ice->workaround_bo;
8530 offset = batch->ice->workaround_offset;
8531 }
8532 }
8533 #endif
8534
8535 #if GFX_VERx10 < 75
8536 if (flags & PIPE_CONTROL_DEPTH_STALL) {
8537 /* Project: PRE-HSW / Argument: Depth Stall
8538 *
8539 * "The following bits must be clear:
8540 * - Render Target Cache Flush Enable ([12] of DW1)
8541 * - Depth Cache Flush Enable ([0] of DW1)"
8542 */
8543 assert(!(flags & (PIPE_CONTROL_RENDER_TARGET_FLUSH |
8544 PIPE_CONTROL_DEPTH_CACHE_FLUSH)));
8545 }
8546 #endif
8547 if (GFX_VER >= 6 && (flags & PIPE_CONTROL_DEPTH_STALL)) {
8548 /* From the PIPE_CONTROL instruction table, bit 13 (Depth Stall Enable):
8549 *
8550 * "This bit must be DISABLED for operations other than writing
8551 * PS_DEPTH_COUNT."
8552 *
8553 * This seems like nonsense. An Ivybridge workaround requires us to
8554 * emit a PIPE_CONTROL with a depth stall and write immediate post-sync
8555 * operation. Gen8+ requires us to emit depth stalls and depth cache
8556 * flushes together. So, it's hard to imagine this means anything other
8557 * than "we originally intended this to be used for PS_DEPTH_COUNT".
8558 *
8559 * We ignore the supposed restriction and do nothing.
8560 */
8561 }
8562
8563 if (GFX_VERx10 < 75 && (flags & PIPE_CONTROL_DEPTH_CACHE_FLUSH)) {
8564 /* Project: PRE-HSW / Argument: Depth Cache Flush
8565 *
8566 * "Depth Stall must be clear ([13] of DW1)."
8567 */
8568 assert(!(flags & PIPE_CONTROL_DEPTH_STALL));
8569 }
8570
8571 if (flags & (PIPE_CONTROL_RENDER_TARGET_FLUSH |
8572 PIPE_CONTROL_STALL_AT_SCOREBOARD)) {
8573 /* From the PIPE_CONTROL instruction table, bit 12 and bit 1:
8574 *
8575 * "This bit must be DISABLED for End-of-pipe (Read) fences,
8576 * PS_DEPTH_COUNT or TIMESTAMP queries."
8577 *
8578 * TODO: Implement end-of-pipe checking.
8579 */
8580 assert(!(post_sync_flags & (PIPE_CONTROL_WRITE_DEPTH_COUNT |
8581 PIPE_CONTROL_WRITE_TIMESTAMP)));
8582 }
8583
8584 if (flags & PIPE_CONTROL_STALL_AT_SCOREBOARD) {
8585 /* From the PIPE_CONTROL instruction table, bit 1:
8586 *
8587 * "This bit is ignored if Depth Stall Enable is set.
8588 * Further, the render cache is not flushed even if Write Cache
8589 * Flush Enable bit is set."
8590 *
8591 * We assert that the caller doesn't do this combination, to try and
8592 * prevent mistakes. It shouldn't hurt the GPU, though.
8593 *
8594 * We skip this check on Gen11+ as the "Stall at Pixel Scoreboard"
8595 * and "Render Target Flush" combo is explicitly required for BTI
8596 * update workarounds.
8597 */
8598 assert(!(flags & (PIPE_CONTROL_DEPTH_STALL |
8599 PIPE_CONTROL_RENDER_TARGET_FLUSH)));
8600 }
8601
8602 /* PIPE_CONTROL page workarounds ------------------------------------- */
8603
8604 if (GFX_VER >= 7 && (flags & PIPE_CONTROL_STATE_CACHE_INVALIDATE)) {
8605 /* From the PIPE_CONTROL page itself:
8606 *
8607 * "IVB, HSW, BDW
8608 * Restriction: Pipe_control with CS-stall bit set must be issued
8609 * before a pipe-control command that has the State Cache
8610 * Invalidate bit set."
8611 */
8612 flags |= PIPE_CONTROL_CS_STALL;
8613 }
8614
8615 if ((GFX_VERx10 == 75)) {
8616 /* From the PIPE_CONTROL page itself:
8617 *
8618 * "HSW - Programming Note: PIPECONTROL with RO Cache Invalidation:
8619 * Prior to programming a PIPECONTROL command with any of the RO
8620 * cache invalidation bit set, program a PIPECONTROL flush command
8621 * with “CS stall” bit and “HDC Flush” bit set."
8622 *
8623 * TODO: Actually implement this. What's an HDC Flush?
8624 */
8625 }
8626
8627 if (flags & PIPE_CONTROL_FLUSH_LLC) {
8628 /* From the PIPE_CONTROL instruction table, bit 26 (Flush LLC):
8629 *
8630 * "Project: ALL
8631 * SW must always program Post-Sync Operation to "Write Immediate
8632 * Data" when Flush LLC is set."
8633 *
8634 * For now, we just require the caller to do it.
8635 */
8636 assert(flags & PIPE_CONTROL_WRITE_IMMEDIATE);
8637 }
8638
8639 /* "Post-Sync Operation" workarounds -------------------------------- */
8640
8641 /* Project: All / Argument: Global Snapshot Count Reset [19]
8642 *
8643 * "This bit must not be exercised on any product.
8644 * Requires stall bit ([20] of DW1) set."
8645 *
8646 * We don't use this, so we just assert that it isn't used. The
8647 * PIPE_CONTROL instruction page indicates that they intended this
8648 * as a debug feature and don't think it is useful in production,
8649 * but it may actually be usable, should we ever want to.
8650 */
8651 assert((flags & PIPE_CONTROL_GLOBAL_SNAPSHOT_COUNT_RESET) == 0);
8652
8653 if (flags & (PIPE_CONTROL_MEDIA_STATE_CLEAR |
8654 PIPE_CONTROL_INDIRECT_STATE_POINTERS_DISABLE)) {
8655 /* Project: All / Arguments:
8656 *
8657 * - Generic Media State Clear [16]
8658 * - Indirect State Pointers Disable [16]
8659 *
8660 * "Requires stall bit ([20] of DW1) set."
8661 *
8662 * Also, the PIPE_CONTROL instruction table, bit 16 (Generic Media
8663 * State Clear) says:
8664 *
8665 * "PIPECONTROL command with “Command Streamer Stall Enable” must be
8666 * programmed prior to programming a PIPECONTROL command with "Media
8667 * State Clear" set in GPGPU mode of operation"
8668 *
8669 * This is a subset of the earlier rule, so there's nothing to do.
8670 */
8671 flags |= PIPE_CONTROL_CS_STALL;
8672 }
8673
8674 if (flags & PIPE_CONTROL_STORE_DATA_INDEX) {
8675 /* Project: All / Argument: Store Data Index
8676 *
8677 * "Post-Sync Operation ([15:14] of DW1) must be set to something other
8678 * than '0'."
8679 *
8680 * For now, we just assert that the caller does this. We might want to
8681 * automatically add a write to the workaround BO...
8682 */
8683 assert(non_lri_post_sync_flags != 0);
8684 }
8685
8686 if (flags & PIPE_CONTROL_SYNC_GFDT) {
8687 /* Project: All / Argument: Sync GFDT
8688 *
8689 * "Post-Sync Operation ([15:14] of DW1) must be set to something other
8690 * than '0' or 0x2520[13] must be set."
8691 *
8692 * For now, we just assert that the caller does this.
8693 */
8694 assert(non_lri_post_sync_flags != 0);
8695 }
8696
8697 if (GFX_VER >= 6 && GFX_VER < 8 && (flags & PIPE_CONTROL_TLB_INVALIDATE)) {
8698 /* Project: SNB, IVB, HSW / Argument: TLB inv
8699 *
8700 * "{All SKUs}{All Steppings}: Post-Sync Operation ([15:14] of DW1)
8701 * must be set to something other than '0'."
8702 *
8703 * For now, we just assert that the caller does this.
8704 */
8705 assert(non_lri_post_sync_flags != 0);
8706 }
8707
8708 if (GFX_VER >= 7 && (flags & PIPE_CONTROL_TLB_INVALIDATE)) {
8709 /* Project: IVB+ / Argument: TLB inv
8710 *
8711 * "Requires stall bit ([20] of DW1) set."
8712 *
8713 * Also, from the PIPE_CONTROL instruction table:
8714 *
8715 * "Project: SKL+
8716 * Post Sync Operation or CS stall must be set to ensure a TLB
8717 * invalidation occurs. Otherwise no cycle will occur to the TLB
8718 * cache to invalidate."
8719 *
8720 * This is not a subset of the earlier rule, so there's nothing to do.
8721 */
8722 flags |= PIPE_CONTROL_CS_STALL;
8723 }
8724 #if GFX_VER == 8
8725 if (IS_COMPUTE_PIPELINE(batch)) {
8726 if (post_sync_flags ||
8727 (flags & (PIPE_CONTROL_NOTIFY_ENABLE |
8728 PIPE_CONTROL_DEPTH_STALL |
8729 PIPE_CONTROL_RENDER_TARGET_FLUSH |
8730 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
8731 PIPE_CONTROL_DATA_CACHE_FLUSH))) {
8732 /* Project: BDW / Arguments:
8733 *
8734 * - LRI Post Sync Operation [23]
8735 * - Post Sync Op [15:14]
8736 * - Notify En [8]
8737 * - Depth Stall [13]
8738 * - Render Target Cache Flush [12]
8739 * - Depth Cache Flush [0]
8740 * - DC Flush Enable [5]
8741 *
8742 * "Requires stall bit ([20] of DW) set for all GPGPU and Media
8743 * Workloads."
8744 *
8745 * (The docs have separate table rows for each bit, with essentially
8746 * the same workaround text. We've combined them here.)
8747 */
8748 flags |= PIPE_CONTROL_CS_STALL;
8749
8750 /* Also, from the PIPE_CONTROL instruction table, bit 20:
8751 *
8752 * "Project: BDW
8753 * This bit must be always set when PIPE_CONTROL command is
8754 * programmed by GPGPU and MEDIA workloads, except for the cases
8755 * when only Read Only Cache Invalidation bits are set (State
8756 * Cache Invalidation Enable, Instruction cache Invalidation
8757 * Enable, Texture Cache Invalidation Enable, Constant Cache
8758 * Invalidation Enable). This is to WA FFDOP CG issue, this WA
8759 * need not implemented when FF_DOP_CG is disable via "Fixed
8760 * Function DOP Clock Gate Disable" bit in RC_PSMI_CTRL register."
8761 *
8762 * It sounds like we could avoid CS stalls in some cases, but we
8763 * don't currently bother. This list isn't exactly the list above,
8764 * either...
8765 */
8766 }
8767 }
8768 #endif
8769 /* Implement the WaCsStallAtEveryFourthPipecontrol workaround on IVB, BYT:
8770 *
8771 * "Every 4th PIPE_CONTROL command, not counting the PIPE_CONTROL with
8772 * only read-cache-invalidate bit(s) set, must have a CS_STALL bit set."
8773 *
8774 * Note that the kernel does CS stalls between batches, so we only need
8775 * to count them within a batch. We currently naively count every 4, and
8776 * don't skip the ones with only read-cache-invalidate bits set. This
8777 * may or may not be a problem...
8778 */
8779 if (GFX_VER == 7 && !(GFX_VERx10 == 75)) {
8780 if (flags & PIPE_CONTROL_CS_STALL) {
8781 /* If we're doing a CS stall, reset the counter and carry on. */
8782 batch->pipe_controls_since_last_cs_stall = 0;
8783 }
8784
8785 /* If this is the fourth pipe control without a CS stall, do one now. */
8786 if (++batch->pipe_controls_since_last_cs_stall == 4) {
8787 batch->pipe_controls_since_last_cs_stall = 0;
8788 flags |= PIPE_CONTROL_CS_STALL;
8789 }
8790 }
8791
8792 /* "Stall" workarounds ----------------------------------------------
8793 * These have to come after the earlier ones because we may have added
8794 * some additional CS stalls above.
8795 */
8796
8797 if (flags & PIPE_CONTROL_CS_STALL) {
8798 /* Project: PRE-SKL, VLV, CHV
8799 *
8800 * "[All Stepping][All SKUs]:
8801 *
8802 * One of the following must also be set:
8803 *
8804 * - Render Target Cache Flush Enable ([12] of DW1)
8805 * - Depth Cache Flush Enable ([0] of DW1)
8806 * - Stall at Pixel Scoreboard ([1] of DW1)
8807 * - Depth Stall ([13] of DW1)
8808 * - Post-Sync Operation ([13] of DW1)
8809 * - DC Flush Enable ([5] of DW1)"
8810 *
8811 * If we don't already have one of those bits set, we choose to add
8812 * "Stall at Pixel Scoreboard". Some of the other bits require a
8813 * CS stall as a workaround (see above), which would send us into
8814 * an infinite recursion of PIPE_CONTROLs. "Stall at Pixel Scoreboard"
8815 * appears to be safe, so we choose that.
8816 */
8817 const uint32_t wa_bits = PIPE_CONTROL_RENDER_TARGET_FLUSH |
8818 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
8819 PIPE_CONTROL_WRITE_IMMEDIATE |
8820 PIPE_CONTROL_WRITE_DEPTH_COUNT |
8821 PIPE_CONTROL_WRITE_TIMESTAMP |
8822 PIPE_CONTROL_STALL_AT_SCOREBOARD |
8823 PIPE_CONTROL_DEPTH_STALL |
8824 PIPE_CONTROL_DATA_CACHE_FLUSH;
8825 if (!(flags & wa_bits))
8826 flags |= PIPE_CONTROL_STALL_AT_SCOREBOARD;
8827 }
8828
8829 /* Emit --------------------------------------------------------------- */
8830
8831 if (INTEL_DEBUG(DEBUG_PIPE_CONTROL)) {
8832 fprintf(stderr,
8833 " PC [%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%"PRIx64"]: %s\n",
8834 (flags & PIPE_CONTROL_FLUSH_ENABLE) ? "PipeCon " : "",
8835 (flags & PIPE_CONTROL_CS_STALL) ? "CS " : "",
8836 (flags & PIPE_CONTROL_STALL_AT_SCOREBOARD) ? "Scoreboard " : "",
8837 (flags & PIPE_CONTROL_VF_CACHE_INVALIDATE) ? "VF " : "",
8838 (flags & PIPE_CONTROL_RENDER_TARGET_FLUSH) ? "RT " : "",
8839 (flags & PIPE_CONTROL_CONST_CACHE_INVALIDATE) ? "Const " : "",
8840 (flags & PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE) ? "TC " : "",
8841 (flags & PIPE_CONTROL_DATA_CACHE_FLUSH) ? "DC " : "",
8842 (flags & PIPE_CONTROL_DEPTH_CACHE_FLUSH) ? "ZFlush " : "",
8843 (flags & PIPE_CONTROL_DEPTH_STALL) ? "ZStall " : "",
8844 (flags & PIPE_CONTROL_STATE_CACHE_INVALIDATE) ? "State " : "",
8845 (flags & PIPE_CONTROL_TLB_INVALIDATE) ? "TLB " : "",
8846 (flags & PIPE_CONTROL_INSTRUCTION_INVALIDATE) ? "Inst " : "",
8847 (flags & PIPE_CONTROL_MEDIA_STATE_CLEAR) ? "MediaClear " : "",
8848 (flags & PIPE_CONTROL_NOTIFY_ENABLE) ? "Notify " : "",
8849 (flags & PIPE_CONTROL_GLOBAL_SNAPSHOT_COUNT_RESET) ?
8850 "SnapRes" : "",
8851 (flags & PIPE_CONTROL_INDIRECT_STATE_POINTERS_DISABLE) ?
8852 "ISPDis" : "",
8853 (flags & PIPE_CONTROL_WRITE_IMMEDIATE) ? "WriteImm " : "",
8854 (flags & PIPE_CONTROL_WRITE_DEPTH_COUNT) ? "WriteZCount " : "",
8855 (flags & PIPE_CONTROL_WRITE_TIMESTAMP) ? "WriteTimestamp " : "",
8856 imm, reason);
8857 }
8858
8859 crocus_emit_cmd(batch, GENX(PIPE_CONTROL), pc) {
8860 #if GFX_VER >= 7
8861 pc.LRIPostSyncOperation = NoLRIOperation;
8862 pc.PipeControlFlushEnable = flags & PIPE_CONTROL_FLUSH_ENABLE;
8863 pc.DCFlushEnable = flags & PIPE_CONTROL_DATA_CACHE_FLUSH;
8864 #endif
8865 #if GFX_VER >= 6
8866 pc.StoreDataIndex = 0;
8867 pc.CommandStreamerStallEnable = flags & PIPE_CONTROL_CS_STALL;
8868 pc.GlobalSnapshotCountReset =
8869 flags & PIPE_CONTROL_GLOBAL_SNAPSHOT_COUNT_RESET;
8870 pc.TLBInvalidate = flags & PIPE_CONTROL_TLB_INVALIDATE;
8871 pc.GenericMediaStateClear = flags & PIPE_CONTROL_MEDIA_STATE_CLEAR;
8872 pc.StallAtPixelScoreboard = flags & PIPE_CONTROL_STALL_AT_SCOREBOARD;
8873 pc.RenderTargetCacheFlushEnable =
8874 flags & PIPE_CONTROL_RENDER_TARGET_FLUSH;
8875 pc.DepthCacheFlushEnable = flags & PIPE_CONTROL_DEPTH_CACHE_FLUSH;
8876 pc.StateCacheInvalidationEnable =
8877 flags & PIPE_CONTROL_STATE_CACHE_INVALIDATE;
8878 pc.VFCacheInvalidationEnable = flags & PIPE_CONTROL_VF_CACHE_INVALIDATE;
8879 pc.ConstantCacheInvalidationEnable =
8880 flags & PIPE_CONTROL_CONST_CACHE_INVALIDATE;
8881 #else
8882 pc.WriteCacheFlush = flags & PIPE_CONTROL_RENDER_TARGET_FLUSH;
8883 #endif
8884 pc.PostSyncOperation = flags_to_post_sync_op(flags);
8885 pc.DepthStallEnable = flags & PIPE_CONTROL_DEPTH_STALL;
8886 pc.InstructionCacheInvalidateEnable =
8887 flags & PIPE_CONTROL_INSTRUCTION_INVALIDATE;
8888 pc.NotifyEnable = flags & PIPE_CONTROL_NOTIFY_ENABLE;
8889 #if GFX_VER >= 5 || GFX_VERx10 == 45
8890 pc.IndirectStatePointersDisable =
8891 flags & PIPE_CONTROL_INDIRECT_STATE_POINTERS_DISABLE;
8892 #endif
8893 #if GFX_VER >= 6
8894 pc.TextureCacheInvalidationEnable =
8895 flags & PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
8896 #elif GFX_VER == 5 || GFX_VERx10 == 45
8897 pc.TextureCacheFlushEnable =
8898 flags & PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
8899 #endif
8900 pc.Address = ggtt_bo(bo, offset);
8901 if (GFX_VER < 7 && bo)
8902 pc.DestinationAddressType = DAT_GGTT;
8903 pc.ImmediateData = imm;
8904 }
8905 }
8906
8907 #if GFX_VER == 6
8908 void
8909 genX(crocus_upload_urb)(struct crocus_batch *batch,
8910 unsigned vs_size,
8911 bool gs_present,
8912 unsigned gs_size)
8913 {
8914 struct crocus_context *ice = batch->ice;
8915 int nr_vs_entries, nr_gs_entries;
8916 int total_urb_size = ice->urb.size * 1024; /* in bytes */
8917 const struct intel_device_info *devinfo = &batch->screen->devinfo;
8918
8919 /* Calculate how many entries fit in each stage's section of the URB */
8920 if (gs_present) {
8921 nr_vs_entries = (total_urb_size/2) / (vs_size * 128);
8922 nr_gs_entries = (total_urb_size/2) / (gs_size * 128);
8923 } else {
8924 nr_vs_entries = total_urb_size / (vs_size * 128);
8925 nr_gs_entries = 0;
8926 }
8927
8928 /* Then clamp to the maximum allowed by the hardware */
8929 if (nr_vs_entries > devinfo->urb.max_entries[MESA_SHADER_VERTEX])
8930 nr_vs_entries = devinfo->urb.max_entries[MESA_SHADER_VERTEX];
8931
8932 if (nr_gs_entries > devinfo->urb.max_entries[MESA_SHADER_GEOMETRY])
8933 nr_gs_entries = devinfo->urb.max_entries[MESA_SHADER_GEOMETRY];
8934
8935 /* Finally, both must be a multiple of 4 (see 3DSTATE_URB in the PRM). */
8936 ice->urb.nr_vs_entries = ROUND_DOWN_TO(nr_vs_entries, 4);
8937 ice->urb.nr_gs_entries = ROUND_DOWN_TO(nr_gs_entries, 4);
8938
8939 assert(ice->urb.nr_vs_entries >=
8940 devinfo->urb.min_entries[MESA_SHADER_VERTEX]);
8941 assert(ice->urb.nr_vs_entries % 4 == 0);
8942 assert(ice->urb.nr_gs_entries % 4 == 0);
8943 assert(vs_size <= 5);
8944 assert(gs_size <= 5);
8945
8946 crocus_emit_cmd(batch, GENX(3DSTATE_URB), urb) {
8947 urb.VSNumberofURBEntries = ice->urb.nr_vs_entries;
8948 urb.VSURBEntryAllocationSize = vs_size - 1;
8949
8950 urb.GSNumberofURBEntries = ice->urb.nr_gs_entries;
8951 urb.GSURBEntryAllocationSize = gs_size - 1;
8952 };
8953 /* From the PRM Volume 2 part 1, section 1.4.7:
8954 *
8955 * Because of a urb corruption caused by allocating a previous gsunit’s
8956 * urb entry to vsunit software is required to send a "GS NULL
8957 * Fence"(Send URB fence with VS URB size == 1 and GS URB size == 0) plus
8958 * a dummy DRAW call before any case where VS will be taking over GS URB
8959 * space.
8960 *
8961 * It is not clear exactly what this means ("URB fence" is a command that
8962 * doesn't exist on Gen6). So for now we just do a full pipeline flush as
8963 * a workaround.
8964 */
8965 if (ice->urb.gs_present && !gs_present)
8966 crocus_emit_mi_flush(batch);
8967 ice->urb.gs_present = gs_present;
8968 }
8969 #endif
8970
/**
 * vtbl hook invoked when a batch's GPU state is lost.
 *
 * Intentionally a no-op for this generation: no extra per-genxml state
 * needs restoring beyond the generic dirty flagging done elsewhere.
 */
static void
crocus_lost_genx_state(struct crocus_context *ice, struct crocus_batch *batch)
{
}
8975
/**
 * Emit MI_REPORT_PERF_COUNT, snapshotting the performance counters into
 * \p bo at \p offset_in_bytes, tagged with \p report_id.
 *
 * The command is only emitted on Gen7+; on older generations this is a
 * no-op.
 */
static void
crocus_emit_mi_report_perf_count(struct crocus_batch *batch,
                                 struct crocus_bo *bo,
                                 uint32_t offset_in_bytes,
                                 uint32_t report_id)
{
#if GFX_VER >= 7
   crocus_emit_cmd(batch, GENX(MI_REPORT_PERF_COUNT), mi_rpc) {
      mi_rpc.MemoryAddress = rw_bo(bo, offset_in_bytes);
      mi_rpc.ReportID = report_id;
   }
#endif
}
8989
8990 /**
8991 * From the PRM, Volume 2a:
8992 *
8993 * "Indirect State Pointers Disable
8994 *
8995 * At the completion of the post-sync operation associated with this pipe
8996 * control packet, the indirect state pointers in the hardware are
8997 * considered invalid; the indirect pointers are not saved in the context.
8998 * If any new indirect state commands are executed in the command stream
8999 * while the pipe control is pending, the new indirect state commands are
9000 * preserved.
9001 *
9002 * [DevIVB+]: Using Invalidate State Pointer (ISP) only inhibits context
9003 * restoring of Push Constant (3DSTATE_CONSTANT_*) commands. Push Constant
9004 * commands are only considered as Indirect State Pointers. Once ISP is
9005 * issued in a context, SW must initialize by programming push constant
9006 * commands for all the shaders (at least to zero length) before attempting
9007 * any rendering operation for the same context."
9008 *
9009 * 3DSTATE_CONSTANT_* packets are restored during a context restore,
9010 * even though they point to a BO that has been already unreferenced at
9011 * the end of the previous batch buffer. This has been fine so far since
 * we are protected by the scratch page (every address not covered by
9013 * a BO should be pointing to the scratch page). But on CNL, it is
9014 * causing a GPU hang during context restore at the 3DSTATE_CONSTANT_*
9015 * instruction.
9016 *
9017 * The flag "Indirect State Pointers Disable" in PIPE_CONTROL tells the
9018 * hardware to ignore previous 3DSTATE_CONSTANT_* packets during a
9019 * context restore, so the mentioned hang doesn't happen. However,
9020 * software must program push constant commands for all stages prior to
9021 * rendering anything, so we flag them as dirty.
9022 *
9023 * Finally, we also make sure to stall at pixel scoreboard to make sure the
 * constants have been loaded into the EUs prior to disabling the push constants
9025 * so that it doesn't hang a previous 3DPRIMITIVE.
9026 */
9027 #if GFX_VER >= 7
static void
gen7_emit_isp_disable(struct crocus_batch *batch)
{
   /* Stall first (pixel scoreboard + CS) so in-flight work finishes
    * consuming the current push constants before they are invalidated.
    */
   crocus_emit_raw_pipe_control(batch, "isp disable",
                                PIPE_CONTROL_STALL_AT_SCOREBOARD |
                                PIPE_CONTROL_CS_STALL,
                                NULL, 0, 0);
   /* Then invalidate the indirect (push constant) state pointers. */
   crocus_emit_raw_pipe_control(batch, "isp disable",
                                PIPE_CONTROL_INDIRECT_STATE_POINTERS_DISABLE |
                                PIPE_CONTROL_CS_STALL,
                                NULL, 0, 0);

   /* After ISP, push constants must be re-programmed for every stage
    * before any rendering, so flag them all dirty.
    */
   struct crocus_context *ice = batch->ice;
   ice->state.stage_dirty |= (CROCUS_STAGE_DIRTY_CONSTANTS_VS |
                              CROCUS_STAGE_DIRTY_CONSTANTS_TCS |
                              CROCUS_STAGE_DIRTY_CONSTANTS_TES |
                              CROCUS_STAGE_DIRTY_CONSTANTS_GS |
                              CROCUS_STAGE_DIRTY_CONSTANTS_FS);
}
9047 #endif
9048
9049 #if GFX_VER >= 7
/**
 * Emit any commands that must go at the end of every batch:
 * a Haswell CC_STATE_POINTERS workaround (render batches only) and the
 * indirect-state-pointers disable sequence.
 */
static void
crocus_state_finish_batch(struct crocus_batch *batch)
{
#if GFX_VERx10 == 75
   if (batch->name == CROCUS_BATCH_RENDER) {
      /* HSW workaround: flush, re-emit the color calc state pointer,
       * then flush the render target cache with a CS stall.
       */
      crocus_emit_mi_flush(batch);
      crocus_emit_cmd(batch, GENX(3DSTATE_CC_STATE_POINTERS), ptr) {
         ptr.ColorCalcStatePointer = batch->ice->shaders.cc_offset;
      }

      crocus_emit_pipe_control_flush(batch, "hsw wa", PIPE_CONTROL_RENDER_TARGET_FLUSH |
                                     PIPE_CONTROL_CS_STALL);
   }
#endif
   gen7_emit_isp_disable(batch);
}
9066 #endif
9067
9068 static void
9069 crocus_batch_reset_dirty(struct crocus_batch *batch)
9070 {
9071 /* unreference any index buffer so it get reemitted. */
9072 pipe_resource_reference(&batch->ice->state.index_buffer.res, NULL);
9073
9074 /* for GEN4/5 need to reemit anything that ends up in the state batch that points to anything in the state batch
9075 * as the old state batch won't still be available.
9076 */
9077 batch->ice->state.dirty |= CROCUS_DIRTY_DEPTH_BUFFER |
9078 CROCUS_DIRTY_COLOR_CALC_STATE;
9079
9080 batch->ice->state.dirty |= CROCUS_DIRTY_VERTEX_ELEMENTS | CROCUS_DIRTY_VERTEX_BUFFERS;
9081
9082 batch->ice->state.stage_dirty |= CROCUS_ALL_STAGE_DIRTY_BINDINGS;
9083 batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_SAMPLER_STATES_VS;
9084 batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_SAMPLER_STATES_TES;
9085 batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_SAMPLER_STATES_TCS;
9086 batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_SAMPLER_STATES_GS;
9087 batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_SAMPLER_STATES_PS;
9088 batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_SAMPLER_STATES_CS;
9089
9090 batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_VS;
9091 batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_TES;
9092 batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_TCS;
9093 batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_GS;
9094 batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_FS;
9095 batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_CS;
9096
9097 batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_VS;
9098 batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_GS;
9099 batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CS;
9100 batch->ice->state.dirty |= CROCUS_DIRTY_CC_VIEWPORT | CROCUS_DIRTY_SF_CL_VIEWPORT;
9101
9102 #if GFX_VER >= 6
9103 /* SCISSOR_STATE */
9104 batch->ice->state.dirty |= CROCUS_DIRTY_GEN6_BLEND_STATE;
9105 batch->ice->state.dirty |= CROCUS_DIRTY_GEN6_SCISSOR_RECT;
9106 batch->ice->state.dirty |= CROCUS_DIRTY_GEN6_WM_DEPTH_STENCIL;
9107
9108 #endif
9109 #if GFX_VER <= 5
9110 /* dirty the SF state on gen4/5 */
9111 batch->ice->state.dirty |= CROCUS_DIRTY_RASTER;
9112 batch->ice->state.dirty |= CROCUS_DIRTY_GEN4_CURBE;
9113 batch->ice->state.dirty |= CROCUS_DIRTY_CLIP;
9114 batch->ice->state.dirty |= CROCUS_DIRTY_WM;
9115 #endif
9116 #if GFX_VER >= 7
9117 /* Streamout dirty */
9118 batch->ice->state.dirty |= CROCUS_DIRTY_STREAMOUT;
9119 batch->ice->state.dirty |= CROCUS_DIRTY_SO_DECL_LIST;
9120 batch->ice->state.dirty |= CROCUS_DIRTY_GEN7_SO_BUFFERS;
9121 #endif
9122 }
9123
9124 #if GFX_VERx10 == 75
/**
 * Accessor returning the pipe_rasterizer_state embedded in the currently
 * bound rasterizer CSO (Haswell only).
 */
struct pipe_rasterizer_state *crocus_get_rast_state(struct crocus_context *ice)
{
   return &ice->state.cso_rast->cso;
}
9129 #endif
9130
9131 #if GFX_VER >= 6
9132 static void update_so_strides(struct crocus_context *ice,
9133 uint16_t *strides)
9134 {
9135 for (int i = 0; i < PIPE_MAX_SO_BUFFERS; i++) {
9136 struct crocus_stream_output_target *so = (void *)ice->state.so_target[i];
9137 if (so)
9138 so->stride = strides[i] * sizeof(uint32_t);
9139 }
9140 }
9141 #endif
9142
9143 static void crocus_fill_clamp_mask(const struct crocus_sampler_state *samp,
9144 int s,
9145 uint32_t *clamp_mask)
9146 {
9147 #if GFX_VER < 8
9148 if (samp->pstate.min_img_filter != PIPE_TEX_FILTER_NEAREST &&
9149 samp->pstate.mag_img_filter != PIPE_TEX_FILTER_NEAREST) {
9150 if (samp->pstate.wrap_s == PIPE_TEX_WRAP_CLAMP)
9151 clamp_mask[0] |= (1 << s);
9152 if (samp->pstate.wrap_t == PIPE_TEX_WRAP_CLAMP)
9153 clamp_mask[1] |= (1 << s);
9154 if (samp->pstate.wrap_r == PIPE_TEX_WRAP_CLAMP)
9155 clamp_mask[2] |= (1 << s);
9156 }
9157 #endif
9158 }
9159
9160 static void
9161 crocus_set_frontend_noop(struct pipe_context *ctx, bool enable)
9162 {
9163 struct crocus_context *ice = (struct crocus_context *) ctx;
9164
9165 if (crocus_batch_prepare_noop(&ice->batches[CROCUS_BATCH_RENDER], enable)) {
9166 ice->state.dirty |= CROCUS_ALL_DIRTY_FOR_RENDER;
9167 ice->state.stage_dirty |= CROCUS_ALL_STAGE_DIRTY_FOR_RENDER;
9168 }
9169
9170 if (ice->batch_count == 1)
9171 return;
9172
9173 if (crocus_batch_prepare_noop(&ice->batches[CROCUS_BATCH_COMPUTE], enable)) {
9174 ice->state.dirty |= CROCUS_ALL_DIRTY_FOR_COMPUTE;
9175 ice->state.stage_dirty |= CROCUS_ALL_STAGE_DIRTY_FOR_COMPUTE;
9176 }
9177 }
9178
/**
 * Fill out the screen's per-generation virtual function table.
 *
 * This file is compiled once per generation, so each entry points at the
 * GFX_VER-specialized implementation.  Entries inside GFX_VER guards only
 * exist on hardware that has the corresponding commands.
 */
void
genX(crocus_init_screen_state)(struct crocus_screen *screen)
{
   /* Make sure the genxml build we're running matches the device. */
   assert(screen->devinfo.verx10 == GFX_VERx10);
   assert(screen->devinfo.ver == GFX_VER);
   screen->vtbl.destroy_state = crocus_destroy_state;
   screen->vtbl.init_render_context = crocus_init_render_context;
   screen->vtbl.upload_render_state = crocus_upload_render_state;
#if GFX_VER >= 7
   /* Compute is only supported on Gen7+. */
   screen->vtbl.init_compute_context = crocus_init_compute_context;
   screen->vtbl.upload_compute_state = crocus_upload_compute_state;
#endif
   screen->vtbl.emit_raw_pipe_control = crocus_emit_raw_pipe_control;
   screen->vtbl.emit_mi_report_perf_count = crocus_emit_mi_report_perf_count;
   screen->vtbl.rebind_buffer = crocus_rebind_buffer;
#if GFX_VERx10 >= 75
   /* Register/memory store helpers available on Haswell and later. */
   screen->vtbl.load_register_reg32 = crocus_load_register_reg32;
   screen->vtbl.load_register_reg64 = crocus_load_register_reg64;
   screen->vtbl.load_register_imm32 = crocus_load_register_imm32;
   screen->vtbl.load_register_imm64 = crocus_load_register_imm64;
   screen->vtbl.store_data_imm32 = crocus_store_data_imm32;
   screen->vtbl.store_data_imm64 = crocus_store_data_imm64;
#endif
#if GFX_VER >= 7
   screen->vtbl.load_register_mem32 = crocus_load_register_mem32;
   screen->vtbl.load_register_mem64 = crocus_load_register_mem64;
   screen->vtbl.copy_mem_mem = crocus_copy_mem_mem;
   screen->vtbl.create_so_decl_list = crocus_create_so_decl_list;
#endif
   screen->vtbl.update_surface_base_address = crocus_update_surface_base_address;
#if GFX_VER >= 6
   screen->vtbl.store_register_mem32 = crocus_store_register_mem32;
   screen->vtbl.store_register_mem64 = crocus_store_register_mem64;
#endif
   /* Shader key population, per stage. */
   screen->vtbl.populate_vs_key = crocus_populate_vs_key;
   screen->vtbl.populate_tcs_key = crocus_populate_tcs_key;
   screen->vtbl.populate_tes_key = crocus_populate_tes_key;
   screen->vtbl.populate_gs_key = crocus_populate_gs_key;
   screen->vtbl.populate_fs_key = crocus_populate_fs_key;
   screen->vtbl.populate_cs_key = crocus_populate_cs_key;
   screen->vtbl.lost_genx_state = crocus_lost_genx_state;
#if GFX_VER >= 7
   screen->vtbl.finish_batch = crocus_state_finish_batch;
#endif
#if GFX_VER <= 5
   /* Gen4/5 program the URB with explicit fences. */
   screen->vtbl.upload_urb_fence = crocus_upload_urb_fence;
   screen->vtbl.calculate_urb_fence = crocus_calculate_urb_fence;
#endif
   screen->vtbl.fill_clamp_mask = crocus_fill_clamp_mask;
   screen->vtbl.batch_reset_dirty = crocus_batch_reset_dirty;
   screen->vtbl.translate_prim_type = translate_prim_type;
#if GFX_VER >= 6
   /* Stream output hooks (Gen6+). */
   screen->vtbl.update_so_strides = update_so_strides;
   screen->vtbl.get_so_offset = crocus_get_so_offset;
#endif

   genX(crocus_init_blt)(screen);
}
9237
/**
 * Install the gallium pipe_context state hooks and initialize the
 * context's default state values.
 */
void
genX(crocus_init_state)(struct crocus_context *ice)
{
   struct pipe_context *ctx = &ice->ctx;

   /* CSO create/bind/delete hooks. */
   ctx->create_blend_state = crocus_create_blend_state;
   ctx->create_depth_stencil_alpha_state = crocus_create_zsa_state;
   ctx->create_rasterizer_state = crocus_create_rasterizer_state;
   ctx->create_sampler_state = crocus_create_sampler_state;
   ctx->create_sampler_view = crocus_create_sampler_view;
   ctx->create_surface = crocus_create_surface;
   ctx->create_vertex_elements_state = crocus_create_vertex_elements;
   ctx->bind_blend_state = crocus_bind_blend_state;
   ctx->bind_depth_stencil_alpha_state = crocus_bind_zsa_state;
   ctx->bind_sampler_states = crocus_bind_sampler_states;
   ctx->bind_rasterizer_state = crocus_bind_rasterizer_state;
   ctx->bind_vertex_elements_state = crocus_bind_vertex_elements_state;
   ctx->delete_blend_state = crocus_delete_state;
   ctx->delete_depth_stencil_alpha_state = crocus_delete_state;
   ctx->delete_rasterizer_state = crocus_delete_state;
   ctx->delete_sampler_state = crocus_delete_state;
   ctx->delete_vertex_elements_state = crocus_delete_state;
   /* Dynamic (set_*) state hooks. */
   ctx->set_blend_color = crocus_set_blend_color;
   ctx->set_clip_state = crocus_set_clip_state;
   ctx->set_constant_buffer = crocus_set_constant_buffer;
   ctx->set_shader_buffers = crocus_set_shader_buffers;
   ctx->set_shader_images = crocus_set_shader_images;
   ctx->set_sampler_views = crocus_set_sampler_views;
   ctx->set_tess_state = crocus_set_tess_state;
   ctx->set_patch_vertices = crocus_set_patch_vertices;
   ctx->set_framebuffer_state = crocus_set_framebuffer_state;
   ctx->set_polygon_stipple = crocus_set_polygon_stipple;
   ctx->set_sample_mask = crocus_set_sample_mask;
   ctx->set_scissor_states = crocus_set_scissor_states;
   ctx->set_stencil_ref = crocus_set_stencil_ref;
   ctx->set_vertex_buffers = crocus_set_vertex_buffers;
   ctx->set_viewport_states = crocus_set_viewport_states;
   ctx->sampler_view_destroy = crocus_sampler_view_destroy;
   ctx->surface_destroy = crocus_surface_destroy;
   ctx->draw_vbo = crocus_draw_vbo;
   ctx->launch_grid = crocus_launch_grid;

   ctx->set_frontend_noop = crocus_set_frontend_noop;

#if GFX_VER >= 6
   /* Stream output (transform feedback) hooks, Gen6+. */
   ctx->create_stream_output_target = crocus_create_stream_output_target;
   ctx->stream_output_target_destroy = crocus_stream_output_target_destroy;
   ctx->set_stream_output_targets = crocus_set_stream_output_targets;
#endif

   /* Start with everything dirty so the first draw emits all state. */
   ice->state.dirty = ~0ull;
   ice->state.stage_dirty = ~0ull;

   ice->state.statistics_counters_enabled = true;

   ice->state.sample_mask = 0xff;
   ice->state.num_viewports = 1;
   /* MESA_PRIM_COUNT marks the prim mode as "not yet set". */
   ice->state.prim_mode = MESA_PRIM_COUNT;
   ice->state.reduced_prim_mode = MESA_PRIM_COUNT;
   ice->state.genx = calloc(1, sizeof(struct crocus_genx_state));
   ice->draw.derived_params.drawid = -1;

   /* Default all scissor rectangles to be empty regions. */
   for (int i = 0; i < CROCUS_MAX_VIEWPORTS; i++) {
      ice->state.scissors[i] = (struct pipe_scissor_state) {
         .minx = 1, .maxx = 0, .miny = 1, .maxy = 0,
      };
   }
}
9307