/*
 * Copyright (c) 2020 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "brw_nir_rt.h"
#include "brw_nir_rt_builder.h"

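/* Multiply a vec3 by a column-major 3x4 matrix given as an array of four
 * column vectors.  The upper 3x3 part is always applied; when translation is
 * set, the fourth (translation) column is added in as well, i.e. the vector
 * is transformed as a point rather than as a direction.
 */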
static nir_def *
nir_build_vec3_mat_mult_col_major(nir_builder *b, nir_def *vec,
                                  nir_def *matrix[], bool translation)
{
   nir_def *result_components[3] = {
      nir_channel(b, matrix[3], 0),
      nir_channel(b, matrix[3], 1),
      nir_channel(b, matrix[3], 2),
   };
   for (unsigned i = 0; i < 3; ++i) {
      for (unsigned j = 0; j < 3; ++j) {
         nir_def *v = nir_fmul(b, nir_channels(b, vec, 1 << j), nir_channels(b, matrix[j], 1 << i));
         result_components[i] = (translation || j) ? nir_fadd(b, result_components[i], v) : v;
      }
   }
   return nir_vec(b, result_components, 3);
}

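/* Returns true when the current hit's leaf is procedural (AABB) geometry
 * rather than a triangle, folding to a constant in stages where the answer
 * is known at compile time.
 */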
static nir_def *
build_leaf_is_procedural(nir_builder *b, struct brw_nir_rt_mem_hit_defs *hit)
{
   switch (b->shader->info.stage) {
   case MESA_SHADER_ANY_HIT:
      /* Any-hit shaders are always compiled into intersection shaders for
       * procedural geometry.  If we got here in an any-hit shader, it's for
       * triangles.
       */
      return nir_imm_false(b);

   case MESA_SHADER_INTERSECTION:
      return nir_imm_true(b);

   default:
      return nir_ieq_imm(b, hit->leaf_type,
                         BRW_RT_BVH_NODE_TYPE_PROCEDURAL);
   }
}

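/* Per-impl worker: materialize the values backing the RT system values once
 * at the top of the shader, then rewrite each intrinsic below to use them.
 */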
static void
lower_rt_intrinsics_impl(nir_function_impl *impl,
                         const struct intel_device_info *devinfo)
{
   bool progress = false;

   nir_builder build = nir_builder_at(nir_before_impl(impl));
   nir_builder *b = &build;

   struct brw_nir_rt_globals_defs globals;
   brw_nir_rt_load_globals(b, &globals);

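   /* The SW hotzone is a small per-thread record: component 0 holds the
    * current soft-stack offset and components 1-3 hold the ray launch ID
    * (see the launch_id case below).
    */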
   nir_def *hotzone_addr = brw_nir_rt_sw_hotzone_addr(b, devinfo);
   nir_def *hotzone = nir_load_global(b, hotzone_addr, 16, 4, 32);

   gl_shader_stage stage = b->shader->info.stage;
   struct brw_nir_rt_mem_ray_defs world_ray_in = {};
   struct brw_nir_rt_mem_ray_defs object_ray_in = {};
   struct brw_nir_rt_mem_hit_defs hit_in = {};
   switch (stage) {
   case MESA_SHADER_ANY_HIT:
   case MESA_SHADER_CLOSEST_HIT:
   case MESA_SHADER_INTERSECTION:
      brw_nir_rt_load_mem_hit(b, &hit_in,
                              stage == MESA_SHADER_CLOSEST_HIT);
      brw_nir_rt_load_mem_ray(b, &object_ray_in,
                              BRW_RT_BVH_LEVEL_OBJECT);
      FALLTHROUGH;

   case MESA_SHADER_MISS:
      brw_nir_rt_load_mem_ray(b, &world_ray_in,
                              BRW_RT_BVH_LEVEL_WORLD);
      break;

   default:
      break;
   }

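   /* Each thread's soft stack lives at a fixed base address; the hotzone's
    * first component is the offset of the current frame within it, so the
    * scratch base pointer seen by the shader is the sum of the two.
    */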
   nir_def *thread_stack_base_addr = brw_nir_rt_sw_stack_addr(b, devinfo);
   nir_def *stack_base_offset = nir_channel(b, hotzone, 0);
   nir_def *stack_base_addr =
      nir_iadd(b, thread_stack_base_addr, nir_u2u64(b, stack_base_offset));
   ASSERTED bool seen_scratch_base_ptr_load = false;
   ASSERTED bool found_resume = false;

   nir_foreach_block(block, impl) {
      nir_foreach_instr_safe(instr, block) {
         if (instr->type != nir_instr_type_intrinsic)
            continue;

         nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);

         b->cursor = nir_after_instr(&intrin->instr);

         nir_def *sysval = NULL;
         switch (intrin->intrinsic) {
         case nir_intrinsic_load_scratch_base_ptr:
            assert(nir_intrinsic_base(intrin) == 1);
            seen_scratch_base_ptr_load = true;
            sysval = stack_base_addr;
            break;

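         /* A stack push bumps the soft-stack offset stored in the hotzone so
          * that the callee starts on a fresh region; the intrinsic itself is
          * consumed here.
          */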
         case nir_intrinsic_btd_stack_push_intel: {
            int32_t stack_size = nir_intrinsic_stack_size(intrin);
            if (stack_size > 0) {
               nir_def *child_stack_offset =
                  nir_iadd_imm(b, stack_base_offset, stack_size);
               nir_store_global(b, hotzone_addr, 16, child_stack_offset, 0x1);
            }
            nir_instr_remove(instr);
            break;
         }

         case nir_intrinsic_rt_resume:
            /* This is the first "interesting" instruction */
            assert(block == nir_start_block(impl));
            assert(!seen_scratch_base_ptr_load);
            found_resume = true;

            int32_t stack_size = nir_intrinsic_stack_size(intrin);
            if (stack_size > 0) {
               stack_base_offset =
                  nir_iadd_imm(b, stack_base_offset, -stack_size);
               nir_store_global(b, hotzone_addr, 16, stack_base_offset, 0x1);
               stack_base_addr = nir_iadd(b, thread_stack_base_addr,
                                          nir_u2u64(b, stack_base_offset));
            }
            nir_instr_remove(instr);
            break;

         case nir_intrinsic_load_uniform: {
            /* We don't want to lower this in the launch trampoline. */
            if (stage == MESA_SHADER_COMPUTE)
               break;

            sysval = brw_nir_load_global_const(b, intrin,
                        nir_load_btd_global_arg_addr_intel(b),
                        BRW_RT_PUSH_CONST_OFFSET);

            break;
         }

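         /* The launch ID lives in components 1-3 of the hotzone (channel
          * mask 0xe); component 0 is the stack offset handled above.
          */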
         case nir_intrinsic_load_ray_launch_id:
            sysval = nir_channels(b, hotzone, 0xe);
            break;

         case nir_intrinsic_load_ray_launch_size:
            sysval = globals.launch_size;
            break;

         case nir_intrinsic_load_ray_world_origin:
            sysval = world_ray_in.orig;
            break;

         case nir_intrinsic_load_ray_world_direction:
            sysval = world_ray_in.dir;
            break;

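         /* In closest-hit shaders we rebuild the object-space ray from the
          * world ray and the hit instance's world_to_object transform
          * (presumably because the object-level MemRay may no longer match
          * the committed hit); other stages read it straight from the
          * object-level MemRay.
          */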
         case nir_intrinsic_load_ray_object_origin:
            if (stage == MESA_SHADER_CLOSEST_HIT) {
               struct brw_nir_rt_bvh_instance_leaf_defs leaf;
               brw_nir_rt_load_bvh_instance_leaf(b, &leaf, hit_in.inst_leaf_ptr);

               sysval = nir_build_vec3_mat_mult_col_major(
                  b, world_ray_in.orig, leaf.world_to_object, true);
            } else {
               sysval = object_ray_in.orig;
            }
            break;

         case nir_intrinsic_load_ray_object_direction:
            if (stage == MESA_SHADER_CLOSEST_HIT) {
               struct brw_nir_rt_bvh_instance_leaf_defs leaf;
               brw_nir_rt_load_bvh_instance_leaf(b, &leaf, hit_in.inst_leaf_ptr);

               sysval = nir_build_vec3_mat_mult_col_major(
                  b, world_ray_in.dir, leaf.world_to_object, false);
            } else {
               sysval = object_ray_in.dir;
            }
            break;

         case nir_intrinsic_load_ray_t_min:
            /* It shouldn't matter which we pull this from */
            sysval = world_ray_in.t_near;
            break;

         case nir_intrinsic_load_ray_t_max:
            if (stage == MESA_SHADER_MISS)
               sysval = world_ray_in.t_far;
            else
               sysval = hit_in.t;
            break;

         case nir_intrinsic_load_primitive_id:
            sysval = brw_nir_rt_load_primitive_id_from_hit(b,
                        build_leaf_is_procedural(b, &hit_in),
                        &hit_in);
            break;

         case nir_intrinsic_load_instance_id: {
            struct brw_nir_rt_bvh_instance_leaf_defs leaf;
            brw_nir_rt_load_bvh_instance_leaf(b, &leaf, hit_in.inst_leaf_ptr);
            sysval = leaf.instance_index;
            break;
         }

         case nir_intrinsic_load_ray_object_to_world: {
            struct brw_nir_rt_bvh_instance_leaf_defs leaf;
            brw_nir_rt_load_bvh_instance_leaf(b, &leaf, hit_in.inst_leaf_ptr);
            sysval = leaf.object_to_world[nir_intrinsic_column(intrin)];
            break;
         }

         case nir_intrinsic_load_ray_world_to_object: {
            struct brw_nir_rt_bvh_instance_leaf_defs leaf;
            brw_nir_rt_load_bvh_instance_leaf(b, &leaf, hit_in.inst_leaf_ptr);
            sysval = leaf.world_to_object[nir_intrinsic_column(intrin)];
            break;
         }

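         /* For triangles the hit kind encodes front/back facing; for
          * procedural geometry it is whatever kind the intersection shader
          * reported, which the hit record carries in aabb_hit_kind.
          */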
         case nir_intrinsic_load_ray_hit_kind: {
            nir_def *tri_hit_kind =
               nir_bcsel(b, hit_in.front_face,
                         nir_imm_int(b, BRW_RT_HIT_KIND_FRONT_FACE),
                         nir_imm_int(b, BRW_RT_HIT_KIND_BACK_FACE));
            sysval = nir_bcsel(b, build_leaf_is_procedural(b, &hit_in),
                               hit_in.aabb_hit_kind, tri_hit_kind);
            break;
         }

         case nir_intrinsic_load_ray_flags:
            /* We need to fetch the original ray flags we stored in the
             * leaf pointer, because the actual ray flags we get here
             * will include any flags passed on the pipeline at creation
             * time, and the spec for IncomingRayFlagsKHR says:
             *
             *    "Setting pipeline flags on the raytracing pipeline must not
             *     cause any corresponding flags to be set in variables with
             *     this decoration."
             */
            sysval = nir_u2u32(b, world_ray_in.inst_leaf_ptr);
            break;

         case nir_intrinsic_load_cull_mask:
            sysval = nir_u2u32(b, world_ray_in.ray_mask);
            break;

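         /* The geometry index is packed into the low 29 bits of the second
          * dword of the primitive leaf.
          */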
         case nir_intrinsic_load_ray_geometry_index: {
            nir_def *geometry_index_dw =
               nir_load_global(b, nir_iadd_imm(b, hit_in.prim_leaf_ptr, 4), 4,
                               1, 32);
            sysval = nir_iand_imm(b, geometry_index_dw, BITFIELD_MASK(29));
            break;
         }

         case nir_intrinsic_load_ray_instance_custom_index: {
            struct brw_nir_rt_bvh_instance_leaf_defs leaf;
            brw_nir_rt_load_bvh_instance_leaf(b, &leaf, hit_in.inst_leaf_ptr);
            sysval = leaf.instance_id;
            break;
         }

         case nir_intrinsic_load_shader_record_ptr:
            /* We can't handle this intrinsic in resume shaders because the
             * handle we get there won't be from the original SBT.  The
             * shader call lowering/splitting pass should have ensured that
             * this value was spilled from the initial shader and unspilled
             * in any resume shaders that need it.
             */
            assert(!found_resume);
            sysval = nir_load_btd_local_arg_addr_intel(b);
            break;

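         /* The *_intel queries below are straight reads from
          * RT_DISPATCH_GLOBALS, loaded once at the top of the shader; the
          * stack sizes are stored there in units of 64B.
          */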
         case nir_intrinsic_load_ray_base_mem_addr_intel:
            sysval = globals.base_mem_addr;
            break;

         case nir_intrinsic_load_ray_hw_stack_size_intel:
            sysval = nir_imul_imm(b, globals.hw_stack_size, 64);
            break;

         case nir_intrinsic_load_ray_sw_stack_size_intel:
            sysval = nir_imul_imm(b, globals.sw_stack_size, 64);
            break;

         case nir_intrinsic_load_ray_num_dss_rt_stacks_intel:
            sysval = globals.num_dss_rt_stacks;
            break;

         case nir_intrinsic_load_ray_hit_sbt_addr_intel:
            sysval = globals.hit_sbt_addr;
            break;

         case nir_intrinsic_load_ray_hit_sbt_stride_intel:
            sysval = globals.hit_sbt_stride;
            break;

         case nir_intrinsic_load_ray_miss_sbt_addr_intel:
            sysval = globals.miss_sbt_addr;
            break;

         case nir_intrinsic_load_ray_miss_sbt_stride_intel:
            sysval = globals.miss_sbt_stride;
            break;

         case nir_intrinsic_load_callable_sbt_addr_intel:
            sysval = globals.call_sbt_addr;
            break;

         case nir_intrinsic_load_callable_sbt_stride_intel:
            sysval = globals.call_sbt_stride;
            break;

         case nir_intrinsic_load_btd_resume_sbt_addr_intel:
            sysval = nir_pack_64_2x32_split(b,
               nir_load_reloc_const_intel(b, BRW_SHADER_RELOC_RESUME_SBT_ADDR_LOW),
               nir_load_reloc_const_intel(b, BRW_SHADER_RELOC_RESUME_SBT_ADDR_HIGH));
            break;

         case nir_intrinsic_load_leaf_procedural_intel:
            sysval = build_leaf_is_procedural(b, &hit_in);
            break;

         case nir_intrinsic_load_ray_triangle_vertex_positions: {
            struct brw_nir_rt_bvh_primitive_leaf_positions_defs pos;
            brw_nir_rt_load_bvh_primitive_leaf_positions(b, &pos, hit_in.prim_leaf_ptr);
            sysval = pos.positions[nir_intrinsic_column(intrin)];
            break;
         }

         case nir_intrinsic_load_leaf_opaque_intel: {
            if (stage == MESA_SHADER_INTERSECTION) {
               /* In intersection shaders, the opaque bit is passed to us in
                * the front_face bit.
                */
               sysval = hit_in.front_face;
            } else {
               nir_def *flags_dw =
                  nir_load_global(b, nir_iadd_imm(b, hit_in.prim_leaf_ptr, 4), 4,
                                  1, 32);
               sysval = nir_i2b(b, nir_iand_imm(b, flags_dw, 1u << 30));
            }
            break;
         }

         default:
            continue;
         }

         progress = true;

         if (sysval) {
            nir_def_replace(&intrin->def, sysval);
         }
      }
   }

   nir_metadata_preserve(impl,
                         progress ? nir_metadata_none :
                                    nir_metadata_control_flow);
}

/** Lower ray-tracing system values and intrinsics
 *
 * In most 3D shader stages, intrinsics are a fairly thin wrapper around
 * hardware functionality and system values represent magic bits that come
 * into the shader from fixed-function hardware.  Ray-tracing, however, looks
 * a bit more like the OpenGL 1.0 world where the underlying hardware is
 * simple and most of the API implementation is software.
 *
 * In particular, most things that are treated as system values (or built-ins
 * in SPIR-V) don't get magically dropped into registers for us.  Instead, we
 * have to fetch them from the relevant data structures shared with the
 * ray-tracing hardware.  Most come from either the RT_DISPATCH_GLOBALS or
 * from one of the MemHit data structures.  Some, such as primitive_id,
 * require us to fetch the leaf address from the MemHit struct and then
 * manually read the data out of the BVH.  Instead of trying to emit all this
 * code deep in the back-end where we can't effectively optimize it, we lower
 * it all to global memory access in NIR.
 *
 * Once this pass is complete, the only real system values left are the two
 * argument pointer system values for BTD dispatch: btd_local_arg_addr and
 * btd_global_arg_addr.
 */
void
brw_nir_lower_rt_intrinsics(nir_shader *nir,
                            const struct intel_device_info *devinfo)
{
   nir_foreach_function_impl(impl, nir) {
      lower_rt_intrinsics_impl(impl, devinfo);
   }
}