/*
 * Copyright © 2022 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "anv_private.h"

#include "compiler/intel_nir.h"
#include "compiler/brw_compiler.h"
#include "compiler/brw_nir.h"
#include "compiler/nir/nir.h"
#include "compiler/nir/nir_builder.h"
#include "dev/intel_debug.h"
#include "intel/compiler/intel_nir.h"
#include "util/macros.h"

#include "vk_nir.h"

#include "anv_internal_kernels.h"

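/* Internal kernels are never dispatched with a base workgroup ID, so
 * replace nir_intrinsic_load_base_workgroup_id with a constant 0.
 */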
static bool
lower_base_workgroup_id(nir_builder *b, nir_intrinsic_instr *intrin,
                        UNUSED void *data)
{
   if (intrin->intrinsic != nir_intrinsic_load_base_workgroup_id)
      return false;

   b->cursor = nir_instr_remove(&intrin->instr);
   nir_def_rewrite_uses(&intrin->def, nir_imm_zero(b, 3, 32));
   return true;
}

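/* Link the libanv functions into the given shader, inline them, drop
 * everything but the entrypoint and lower the library's variables & explicit
 * memory accesses to the generic 62bit address format.
 */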
static void
link_libanv(nir_shader *nir, const nir_shader *libanv)
{
   nir_link_shader_functions(nir, libanv);
   NIR_PASS_V(nir, nir_inline_functions);
   NIR_PASS_V(nir, nir_remove_non_entrypoints);
   NIR_PASS_V(nir, nir_lower_vars_to_explicit_types, nir_var_function_temp,
              glsl_get_cl_type_size_align);
   NIR_PASS_V(nir, nir_opt_deref);
   NIR_PASS_V(nir, nir_lower_vars_to_ssa);
   NIR_PASS_V(nir, nir_lower_explicit_io,
              nir_var_shader_temp | nir_var_function_temp | nir_var_mem_shared |
              nir_var_mem_global,
              nir_address_format_62bit_generic);
}

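/* Build the NIR of an internal kernel, run it through the brw compiler and
 * upload the resulting binary into the device's internal shader cache.
 */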
static struct anv_shader_bin *
compile_shader(struct anv_device *device,
               const nir_shader *libanv,
               enum anv_internal_kernel_name shader_name,
               gl_shader_stage stage,
               const char *name,
               const void *hash_key,
               uint32_t hash_key_size,
               uint32_t sends_count_expectation)
{
   const nir_shader_compiler_options *nir_options =
      device->physical->compiler->nir_options[stage];

   nir_builder b = nir_builder_init_simple_shader(stage, nir_options,
                                                  "%s", name);

   uint32_t uniform_size =
      anv_genX(device->info, call_internal_shader)(&b, shader_name);

   nir_shader *nir = b.shader;

   link_libanv(nir, libanv);

   if (INTEL_DEBUG(DEBUG_SHADER_PRINT)) {
      nir_lower_printf_options printf_opts = {
         .ptr_bit_size = 64,
         .use_printf_base_identifier = true,
      };
      NIR_PASS_V(nir, nir_lower_printf, &printf_opts);
   }

   NIR_PASS_V(nir, nir_lower_vars_to_ssa);
   NIR_PASS_V(nir, nir_opt_cse);
   NIR_PASS_V(nir, nir_opt_gcm, true);
   NIR_PASS_V(nir, nir_opt_peephole_select, 1, false, false);

   NIR_PASS_V(nir, nir_lower_variable_initializers, ~0);

   NIR_PASS_V(nir, nir_split_var_copies);
   NIR_PASS_V(nir, nir_split_per_member_structs);

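   /* Internal compute kernels always use a fixed 16x1x1 workgroup size. */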
   if (stage == MESA_SHADER_COMPUTE) {
      nir->info.workgroup_size[0] = 16;
      nir->info.workgroup_size[1] = 1;
      nir->info.workgroup_size[2] = 1;
   }

   struct brw_compiler *compiler = device->physical->compiler;
   struct brw_nir_compiler_opts opts = {};
   brw_preprocess_nir(compiler, nir, &opts);

   NIR_PASS_V(nir, nir_propagate_invariant, false);

   if (stage == MESA_SHADER_FRAGMENT) {
      NIR_PASS_V(nir, nir_lower_input_attachments,
                 &(nir_input_attachment_options) {
                    .use_fragcoord_sysval = true,
                    .use_layer_id_sysval = true,
                 });
   } else {
      nir_lower_compute_system_values_options options = {
         .has_base_workgroup_id = true,
         .lower_cs_local_id_to_index = true,
         .lower_workgroup_id_to_index = gl_shader_stage_is_mesh(stage),
      };
      NIR_PASS_V(nir, nir_lower_compute_system_values, &options);
      NIR_PASS_V(nir, nir_shader_intrinsics_pass, lower_base_workgroup_id,
                 nir_metadata_control_flow, NULL);
   }

   /* Reset sizes before gathering information */
   nir->global_mem_size = 0;
   nir->scratch_size = 0;
   nir->info.shared_size = 0;
   nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));

   NIR_PASS_V(nir, nir_copy_prop);
   NIR_PASS_V(nir, nir_opt_constant_folding);
   NIR_PASS_V(nir, nir_opt_dce);

   union brw_any_prog_key key;
   memset(&key, 0, sizeof(key));

   union brw_any_prog_data prog_data;
   memset(&prog_data, 0, sizeof(prog_data));

   if (stage == MESA_SHADER_COMPUTE) {
      NIR_PASS_V(nir, brw_nir_lower_cs_intrinsics,
                 device->info, &prog_data.cs);
   }

   /* Do the load/store vectorization here. For some reason it just doesn't
    * work when done later in the backend.
    */
   nir_load_store_vectorize_options options = {
      .modes = nir_var_mem_ubo | nir_var_mem_ssbo | nir_var_mem_global,
      .callback = brw_nir_should_vectorize_mem,
      .robust_modes = (nir_variable_mode)0,
   };
   NIR_PASS_V(nir, nir_opt_load_store_vectorize, &options);

   nir->num_uniforms = uniform_size;

   prog_data.base.nr_params = nir->num_uniforms / 4;

   brw_nir_analyze_ubo_ranges(compiler, nir, prog_data.base.ubo_ranges);

   void *temp_ctx = ralloc_context(NULL);

   const unsigned *program;
   if (stage == MESA_SHADER_FRAGMENT) {
      struct brw_compile_stats stats[3];
      struct brw_compile_fs_params params = {
         .base = {
            .nir = nir,
            .log_data = device,
            .debug_flag = DEBUG_WM,
            .stats = stats,
            .mem_ctx = temp_ctx,
         },
         .key = &key.wm,
         .prog_data = &prog_data.wm,
      };
      program = brw_compile_fs(compiler, &params);

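      /* Shader printing adds its own send messages, so the send counts can
       * only be validated when it is disabled.
       */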
      if (!INTEL_DEBUG(DEBUG_SHADER_PRINT)) {
         unsigned stat_idx = 0;
         if (prog_data.wm.dispatch_8) {
            assert(stats[stat_idx].spills == 0);
            assert(stats[stat_idx].fills == 0);
            assert(stats[stat_idx].sends == sends_count_expectation);
            stat_idx++;
         }
         if (prog_data.wm.dispatch_16) {
            assert(stats[stat_idx].spills == 0);
            assert(stats[stat_idx].fills == 0);
            assert(stats[stat_idx].sends == sends_count_expectation);
            stat_idx++;
         }
         if (prog_data.wm.dispatch_32) {
            assert(stats[stat_idx].spills == 0);
            assert(stats[stat_idx].fills == 0);
            assert(stats[stat_idx].sends ==
                   sends_count_expectation *
                   (device->info->ver < 20 ? 2 : 1));
            stat_idx++;
         }
      }
   } else {
      struct brw_compile_stats stats;
      struct brw_compile_cs_params params = {
         .base = {
            .nir = nir,
            .stats = &stats,
            .log_data = device,
            .debug_flag = DEBUG_CS,
            .mem_ctx = temp_ctx,
         },
         .key = &key.cs,
         .prog_data = &prog_data.cs,
      };
      program = brw_compile_cs(compiler, &params);

      if (!INTEL_DEBUG(DEBUG_SHADER_PRINT)) {
         assert(stats.spills == 0);
         assert(stats.fills == 0);
         assert(stats.sends == sends_count_expectation);
      }
   }

   assert(prog_data.base.total_scratch == 0);
   assert(program != NULL);
   struct anv_shader_bin *kernel = NULL;
   if (program == NULL)
      goto exit;

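   /* Internal kernels use no descriptor sets or push descriptors, so the
    * bind map and push descriptor info can stay empty.
    */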
   struct anv_pipeline_bind_map empty_bind_map = {};
   struct anv_push_descriptor_info empty_push_desc_info = {};
   struct anv_shader_upload_params upload_params = {
      .stage = nir->info.stage,
      .key_data = hash_key,
      .key_size = hash_key_size,
      .kernel_data = program,
      .kernel_size = prog_data.base.program_size,
      .prog_data = &prog_data.base,
      .prog_data_size = sizeof(prog_data),
      .bind_map = &empty_bind_map,
      .push_desc_info = &empty_push_desc_info,
   };

   kernel = anv_device_upload_kernel(device, device->internal_cache, &upload_params);

exit:
   ralloc_free(temp_ctx);
   ralloc_free(nir);

   return kernel;
}

VkResult
anv_device_get_internal_shader(struct anv_device *device,
                               enum anv_internal_kernel_name name,
                               struct anv_shader_bin **out_bin)
{
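   /* Per-kernel table giving the cache key (which doubles as the shader
    * name), the stage to compile for and the number of send messages
    * expected in the final binary.
    */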
   const struct {
      struct {
         char name[40];
      } key;

      gl_shader_stage stage;

      uint32_t send_count;
   } internal_kernels[] = {
      [ANV_INTERNAL_KERNEL_GENERATED_DRAWS] = {
         .key = {
            .name = "anv-generated-indirect-draws",
         },
         .stage = MESA_SHADER_FRAGMENT,
         .send_count = (device->info->ver == 9 ?
                        /* 1 load +
                         * 4 stores +
                         * 2 * (2 loads + 2 stores) +
                         * 3 stores
                         */
                        16 :
                        /* 1 load +
                         * 2 * (2 loads + 3 stores) +
                         * 3 stores
                         */
                        14),
      },
      [ANV_INTERNAL_KERNEL_COPY_QUERY_RESULTS_COMPUTE] = {
         .key = {
            .name = "anv-copy-query-compute",
         },
         .stage = MESA_SHADER_COMPUTE,
         .send_count = device->info->verx10 >= 125 ?
                       9 /* 4 loads + 4 stores + 1 EOT */ :
                       8 /* 3 loads + 4 stores + 1 EOT */,
      },
      [ANV_INTERNAL_KERNEL_COPY_QUERY_RESULTS_FRAGMENT] = {
         .key = {
            .name = "anv-copy-query-fragment",
         },
         .stage = MESA_SHADER_FRAGMENT,
         .send_count = 8 /* 3 loads + 4 stores + 1 EOT */,
      },
      [ANV_INTERNAL_KERNEL_MEMCPY_COMPUTE] = {
         .key = {
            .name = "anv-memcpy-compute",
         },
         .stage = MESA_SHADER_COMPUTE,
         .send_count = device->info->verx10 >= 125 ?
                       10 /* 5 loads (1 pull constants) + 4 stores + 1 EOT */ :
                       9 /* 4 loads + 4 stores + 1 EOT */,
      },
   };

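   /* Fast path: the kernel has already been compiled on this device. */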
   struct anv_shader_bin *bin =
      p_atomic_read(&device->internal_kernels[name]);
   if (bin != NULL) {
      *out_bin = bin;
      return VK_SUCCESS;
   }

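   /* Otherwise look for the kernel in the internal pipeline cache before
    * compiling it from scratch.
    */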
   bin =
      anv_device_search_for_kernel(device,
                                   device->internal_cache,
                                   &internal_kernels[name].key,
                                   sizeof(internal_kernels[name].key),
                                   NULL);
   if (bin != NULL) {
      p_atomic_set(&device->internal_kernels[name], bin);
      *out_bin = bin;
      return VK_SUCCESS;
   }

   void *mem_ctx = ralloc_context(NULL);

   nir_shader *libanv_shaders =
      anv_genX(device->info, load_libanv_shader)(device, mem_ctx);

   bin = compile_shader(device,
                        libanv_shaders,
                        name,
                        internal_kernels[name].stage,
                        internal_kernels[name].key.name,
                        &internal_kernels[name].key,
                        sizeof(internal_kernels[name].key),
                        internal_kernels[name].send_count);
   /* The libanv shaders are no longer needed once compilation is done. */
   ralloc_free(mem_ctx);

   if (bin == NULL)
      return vk_errorf(device, VK_ERROR_OUT_OF_HOST_MEMORY,
                       "Unable to compile internal kernel");

   /* The cache already has a reference and it's not going anywhere so
    * there is no need to hold a second reference.
    */
   anv_shader_bin_unref(device, bin);

   p_atomic_set(&device->internal_kernels[name], bin);

   *out_bin = bin;
   return VK_SUCCESS;
}

VkResult
anv_device_init_internal_kernels(struct anv_device *device)
{
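   /* Select an L3 configuration that weights the data cache (no SLM needed)
    * for the internal kernels.
    */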
   const struct intel_l3_weights w =
      intel_get_default_l3_weights(device->info,
                                   true /* wants_dc_cache */,
                                   false /* needs_slm */);
   device->internal_kernels_l3_config = intel_get_l3_config(device->info, w);

   return VK_SUCCESS;
}

void
anv_device_finish_internal_kernels(struct anv_device *device)
{
}