/*
 * Copyright © 2016 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "anv_private.h"

#include "genxml/gen_macros.h"
#include "genxml/genX_pack.h"

#include "common/intel_l3_config.h"

/**
 * This file implements some lightweight memcpy/memset operations on the GPU
 * using a vertex buffer and streamout.
 */
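
/*
 * Typical use from a command buffer (a sketch; size must be a multiple of
 * 4 bytes):
 *
 *    genX(cmd_buffer_so_memcpy)(cmd_buffer, dst, src, size);
 *
 * For a standalone batch, callers are expected to bracket one or more
 * copies with the init/fini/end helpers defined below:
 *
 *    struct anv_memcpy_state state;
 *    genX(emit_so_memcpy_init)(&state, device, NULL, batch);
 *    genX(emit_so_memcpy)(&state, dst, src, size);
 *    genX(emit_so_memcpy_fini)(&state);
 *    genX(emit_so_memcpy_end)(&state);
 */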

/**
 * Returns the greatest common divisor of a and b that is a power of two.
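 *
 * For example, gcd_pow2_u64(24, 16) == 8: the lowest set bits are 1 << 3
 * and 1 << 4, and 1 << 3 divides both values.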
 */
static uint64_t
gcd_pow2_u64(uint64_t a, uint64_t b)
{
   assert(a > 0 || b > 0);

   unsigned a_log2 = ffsll(a) - 1;
   unsigned b_log2 = ffsll(b) - 1;

   /* If either a or b is 0, then a_log2 or b_log2 will be UINT_MAX, in
    * which case the MIN2() will take the other one. If both are 0 then we
    * will hit the assert above.
    */
   return 1ull << MIN2(a_log2, b_log2);
}

static void
emit_common_so_memcpy(struct anv_memcpy_state *state,
                      const struct intel_urb_config *urb_cfg_in,
                      const struct intel_l3_config *l3_config)
{
   struct anv_batch *batch = state->batch;
   struct anv_device *device = state->device;

   if (state->cmd_buffer) {
      /* Wa_14015814527 */
      genX(apply_task_urb_workaround)(state->cmd_buffer);

      genX(cmd_buffer_apply_pipe_flushes)(state->cmd_buffer);

      genX(flush_pipeline_select_3d)(state->cmd_buffer);

#if GFX_VER == 9
      genX(cmd_buffer_update_dirty_vbs_for_gfx8_vb_flush)(
         state->cmd_buffer, SEQUENTIAL, 1ull << 32);
#endif
   }

   anv_batch_emit(batch, GENX(3DSTATE_VF_INSTANCING), vfi) {
      vfi.InstancingEnable = false;
      vfi.VertexElementIndex = 0;
   }
   anv_batch_emit(batch, GENX(3DSTATE_VF_SGVS), sgvs);
#if GFX_VER >= 11
   anv_batch_emit(batch, GENX(3DSTATE_VF_SGVS_2), sgvs);
#endif

   /* Disable all shader stages */
   anv_batch_emit(batch, GENX(3DSTATE_VS), vs);
   anv_batch_emit(batch, GENX(3DSTATE_HS), hs);
   anv_batch_emit(batch, GENX(3DSTATE_TE), te);
   anv_batch_emit(batch, GENX(3DSTATE_DS), ds);
   anv_batch_emit(batch, GENX(3DSTATE_GS), gs);
   anv_batch_emit(batch, GENX(3DSTATE_PS), ps);

#if GFX_VERx10 >= 125
   /* Disable mesh; it cannot be enabled at the same time as streamout. */
   if (device->vk.enabled_extensions.EXT_mesh_shader) {
      anv_batch_emit(batch, GENX(3DSTATE_MESH_CONTROL), mesh);
      anv_batch_emit(batch, GENX(3DSTATE_TASK_CONTROL), task);
   }
#endif

#if INTEL_WA_16013994831_GFX_VER
   /* Wa_16013994831 - Disable preemption during streamout. */
   if (intel_needs_workaround(device->info, 16013994831))
      genX(batch_set_preemption)(batch, device->info, _3D, false);
#endif

   anv_batch_emit(batch, GENX(3DSTATE_SBE), sbe) {
      sbe.VertexURBEntryReadOffset = 1;
      sbe.NumberofSFOutputAttributes = 1;
      sbe.VertexURBEntryReadLength = 1;
      sbe.ForceVertexURBEntryReadLength = true;
      sbe.ForceVertexURBEntryReadOffset = true;

      for (unsigned i = 0; i < 32; i++)
         sbe.AttributeActiveComponentFormat[i] = ACF_XYZW;
   }

   /* Emit URB setup. We tell it that the VS is active because we want it to
    * allocate space for the VS. Even though one isn't run, we need VUEs to
    * store the data that VF is going to pass to SOL.
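    *
    * The entry sizes below are in 64-byte allocation units, so
    * DIV_ROUND_UP(32, 64) = 1 reserves a single unit per VUE for the 32
    * bytes of data each vertex carries.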
    */
   const unsigned entry_size[4] = { DIV_ROUND_UP(32, 64), 1, 1, 1 };
   memcpy(state->urb_cfg.size, &entry_size, sizeof(entry_size));

   genX(emit_urb_setup)(device, batch, l3_config,
                        VK_SHADER_STAGE_VERTEX_BIT, urb_cfg_in, &state->urb_cfg,
                        NULL);

#if GFX_VER >= 12
   /* Disable Primitive Replication. */
   anv_batch_emit(batch, GENX(3DSTATE_PRIMITIVE_REPLICATION), pr);
#endif

   anv_batch_emit(batch, GENX(3DSTATE_VF_TOPOLOGY), topo) {
      topo.PrimitiveTopologyType = _3DPRIM_POINTLIST;
   }

   anv_batch_emit(batch, GENX(3DSTATE_VF_STATISTICS), vf) {
      vf.StatisticsEnable = false;
   }
}

static void
emit_so_memcpy(struct anv_memcpy_state *state,
               struct anv_address dst, struct anv_address src,
               uint32_t size)
{
   struct anv_batch *batch = state->batch;
   struct anv_device *device = state->device;

   /* The maximum copy block size is 4 32-bit components at a time. */
   assert(size % 4 == 0);
   unsigned bs = gcd_pow2_u64(16, size);
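   /* For example, size == 24 yields bs == 8: each point vertex carries two
    * dwords (R32G32_UINT below) and the 3DPRIMITIVE at the end issues
    * 24 / 8 = 3 vertices.
    */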

   enum isl_format format;
   switch (bs) {
   case 4:  format = ISL_FORMAT_R32_UINT;          break;
   case 8:  format = ISL_FORMAT_R32G32_UINT;       break;
   case 16: format = ISL_FORMAT_R32G32B32A32_UINT; break;
   default:
      unreachable("Invalid size");
   }

   uint32_t *dw;
   dw = anv_batch_emitn(batch, 5, GENX(3DSTATE_VERTEX_BUFFERS));
   GENX(VERTEX_BUFFER_STATE_pack)(batch, dw + 1,
      &(struct GENX(VERTEX_BUFFER_STATE)) {
         .VertexBufferIndex = 32, /* Reserved for this */
         .AddressModifyEnable = true,
         .BufferStartingAddress = src,
         .BufferPitch = bs,
         .MOCS = anv_mocs(device, src.bo, 0),
#if GFX_VER >= 12
         .L3BypassDisable = true,
#endif
         .BufferSize = size,
      });

   dw = anv_batch_emitn(batch, 3, GENX(3DSTATE_VERTEX_ELEMENTS));
   GENX(VERTEX_ELEMENT_STATE_pack)(batch, dw + 1,
      &(struct GENX(VERTEX_ELEMENT_STATE)) {
         .VertexBufferIndex = 32,
         .Valid = true,
         .SourceElementFormat = format,
         .SourceElementOffset = 0,
         .Component0Control = (bs >= 4) ? VFCOMP_STORE_SRC : VFCOMP_STORE_0,
         .Component1Control = (bs >= 8) ? VFCOMP_STORE_SRC : VFCOMP_STORE_0,
         .Component2Control = (bs >= 12) ? VFCOMP_STORE_SRC : VFCOMP_STORE_0,
         .Component3Control = (bs >= 16) ? VFCOMP_STORE_SRC : VFCOMP_STORE_0,
      });

   /* Wa_16011411144:
    *
    * SW must insert a PIPE_CONTROL cmd before and after the
    * 3dstate_so_buffer_index_0/1/2/3 states to ensure so_buffer_index_*
    * state is not combined with other state changes.
    */
   if (intel_needs_workaround(device->info, 16011411144))
      genx_batch_emit_pipe_control(batch, device->info, _3D, ANV_PIPE_CS_STALL_BIT);

   anv_batch_emit(batch, GENX(3DSTATE_SO_BUFFER), sob) {
#if GFX_VER < 12
      sob.SOBufferIndex = 0;
#else
      sob._3DCommandOpcode = 0;
      sob._3DCommandSubOpcode = SO_BUFFER_INDEX_0_CMD;
#endif
      sob.MOCS = anv_mocs(device, dst.bo, ISL_SURF_USAGE_STREAM_OUT_BIT);
      sob.SurfaceBaseAddress = dst;

      sob.SOBufferEnable = true;
      sob.SurfaceSize = size / 4 - 1;

      /* As SOL writes out data, it updates the SO_WRITE_OFFSET registers with
       * the end position of the stream. We need to reset this value to 0 at
       * the beginning of the run or else SOL will start at the offset from
       * the previous draw.
       */
      sob.StreamOffsetWriteEnable = true;
      sob.StreamOffset = 0;
   }

   /* Wa_16011411144: also CS_STALL after touching the SO_BUFFER state */
   if (intel_needs_workaround(device->info, 16011411144))
      genx_batch_emit_pipe_control(batch, device->info, _3D, ANV_PIPE_CS_STALL_BIT);

   dw = anv_batch_emitn(batch, 5, GENX(3DSTATE_SO_DECL_LIST),
                        .StreamtoBufferSelects0 = (1 << 0),
                        .NumEntries0 = 1);
   GENX(SO_DECL_ENTRY_pack)(batch, dw + 3,
      &(struct GENX(SO_DECL_ENTRY)) {
         .Stream0Decl = {
            .OutputBufferSlot = 0,
            .RegisterIndex = 0,
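            /* bs is 4, 8, or 16 bytes, i.e. 1, 2, or 4 dwords, so this
             * mask writes out exactly the components the VF fetched above.
             */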
            .ComponentMask = (1 << (bs / 4)) - 1,
         },
      });

#if GFX_VERx10 == 125
   /* Wa_14015946265: Send PC with CS stall after SO_DECL. */
   genx_batch_emit_pipe_control(batch, device->info, _3D, ANV_PIPE_CS_STALL_BIT);
#endif

   anv_batch_emit(batch, GENX(3DSTATE_STREAMOUT), so) {
      so.SOFunctionEnable = true;
      so.RenderingDisable = true;
      so.Stream0VertexReadOffset = 0;
      so.Stream0VertexReadLength = DIV_ROUND_UP(32, 64);
      so.Buffer0SurfacePitch = bs;
   }

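   /* The draw is the copy: each of the size / bs point vertices fetches bs
    * bytes from the source vertex buffer, and SOL streams them back out to
    * the destination starting at the (zeroed) stream offset.
    */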
   genX(emit_breakpoint)(batch, device, true);
   anv_batch_emit(batch, GENX(3DPRIMITIVE), prim) {
      prim.VertexAccessType = SEQUENTIAL;
      prim.VertexCountPerInstance = size / bs;
      prim.StartVertexLocation = 0;
      prim.InstanceCount = 1;
      prim.StartInstanceLocation = 0;
      prim.BaseVertexLocation = 0;
   }

   genX(batch_emit_post_3dprimitive_was)(batch,
                                         device,
                                         _3DPRIM_POINTLIST, size / bs);

   genX(emit_breakpoint)(batch, device, false);
}

void
genX(emit_so_memcpy_init)(struct anv_memcpy_state *state,
                          struct anv_device *device,
                          struct anv_cmd_buffer *cmd_buffer,
                          struct anv_batch *batch)
{
   memset(state, 0, sizeof(*state));

   state->cmd_buffer = cmd_buffer;
   state->batch = batch;
   state->device = device;

   if (state->cmd_buffer) {
      if (!cmd_buffer->state.current_l3_config) {
         genX(cmd_buffer_config_l3)(cmd_buffer,
                                    intel_get_default_l3_config(device->info));
      }
      emit_common_so_memcpy(state,
                            &state->cmd_buffer->state.gfx.urb_cfg,
                            cmd_buffer->state.current_l3_config);
   } else {
      const struct intel_l3_config *cfg = intel_get_default_l3_config(device->info);
      genX(emit_l3_config)(batch, device, cfg);
      genX(emit_pipeline_select)(batch, _3D, device);

      /* Dummy URB config, will trigger URB reemission */
      struct intel_urb_config urb_cfg_in = { 0 };
      emit_common_so_memcpy(state, &urb_cfg_in, cfg);
   }
}

void
genX(emit_so_memcpy_fini)(struct anv_memcpy_state *state)
{
   genX(emit_apply_pipe_flushes)(state->batch, state->device, _3D,
                                 ANV_PIPE_END_OF_PIPE_SYNC_BIT,
                                 NULL);

   if (state->cmd_buffer) {
      /* Flag all the instructions emitted by the memcpy. */
      struct anv_gfx_dynamic_state *hw_state =
         &state->cmd_buffer->state.gfx.dyn_state;

#if INTEL_WA_14018283232_GFX_VER
      genX(cmd_buffer_ensure_wa_14018283232)(state->cmd_buffer, false);
#endif

      BITSET_SET(hw_state->dirty, ANV_GFX_STATE_URB);
      BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VF_STATISTICS);
      BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VF);
      BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VF_TOPOLOGY);
      BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VERTEX_INPUT);
      BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VF_SGVS);
#if GFX_VER >= 11
      BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VF_SGVS_2);
#endif
#if GFX_VER >= 12
      BITSET_SET(hw_state->dirty, ANV_GFX_STATE_PRIMITIVE_REPLICATION);
#endif
      BITSET_SET(hw_state->dirty, ANV_GFX_STATE_SO_DECL_LIST);
      BITSET_SET(hw_state->dirty, ANV_GFX_STATE_STREAMOUT);
      BITSET_SET(hw_state->dirty, ANV_GFX_STATE_SAMPLE_MASK);
      BITSET_SET(hw_state->dirty, ANV_GFX_STATE_MULTISAMPLE);
      BITSET_SET(hw_state->dirty, ANV_GFX_STATE_SF);
      BITSET_SET(hw_state->dirty, ANV_GFX_STATE_SBE);
      BITSET_SET(hw_state->dirty, ANV_GFX_STATE_VS);
      BITSET_SET(hw_state->dirty, ANV_GFX_STATE_HS);
      BITSET_SET(hw_state->dirty, ANV_GFX_STATE_DS);
      BITSET_SET(hw_state->dirty, ANV_GFX_STATE_TE);
      BITSET_SET(hw_state->dirty, ANV_GFX_STATE_GS);
      BITSET_SET(hw_state->dirty, ANV_GFX_STATE_PS);
      if (state->cmd_buffer->device->vk.enabled_extensions.EXT_mesh_shader) {
         BITSET_SET(hw_state->dirty, ANV_GFX_STATE_MESH_CONTROL);
         BITSET_SET(hw_state->dirty, ANV_GFX_STATE_TASK_CONTROL);
      }

      state->cmd_buffer->state.gfx.dirty |= ~(ANV_CMD_DIRTY_PIPELINE |
                                              ANV_CMD_DIRTY_INDEX_BUFFER);

      memcpy(&state->cmd_buffer->state.gfx.urb_cfg, &state->urb_cfg,
             sizeof(struct intel_urb_config));
   }
}

void
genX(emit_so_memcpy_end)(struct anv_memcpy_state *state)
{
   if (intel_needs_workaround(state->device->info, 16013994831))
      genX(batch_set_preemption)(state->batch, state->device->info, _3D, true);

   anv_batch_emit(state->batch, GENX(MI_BATCH_BUFFER_END), end);

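   /* If MI_BATCH_BUFFER_END left the batch on a 4-byte boundary, pad with
    * an MI_NOOP to keep the total batch length QWord aligned.
    */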
   if ((state->batch->next - state->batch->start) & 4)
      anv_batch_emit(state->batch, GENX(MI_NOOP), noop);
}

void
genX(emit_so_memcpy)(struct anv_memcpy_state *state,
                     struct anv_address dst, struct anv_address src,
                     uint32_t size)
{
   if (GFX_VER == 9 &&
       anv_gfx8_9_vb_cache_range_needs_workaround(&state->vb_bound,
                                                  &state->vb_dirty,
                                                  src, size)) {
      genX(emit_apply_pipe_flushes)(state->batch, state->device, _3D,
                                    ANV_PIPE_CS_STALL_BIT |
                                    ANV_PIPE_VF_CACHE_INVALIDATE_BIT,
                                    NULL);
      memset(&state->vb_dirty, 0, sizeof(state->vb_dirty));
   }

   emit_so_memcpy(state, dst, src, size);
}

void
genX(cmd_buffer_so_memcpy)(struct anv_cmd_buffer *cmd_buffer,
                           struct anv_address dst, struct anv_address src,
                           uint32_t size)
{
   if (size == 0)
      return;

   struct anv_memcpy_state state;
   genX(emit_so_memcpy_init)(&state,
                             cmd_buffer->device,
                             cmd_buffer,
                             &cmd_buffer->batch);
   emit_so_memcpy(&state, dst, src, size);
   genX(emit_so_memcpy_fini)(&state);
}