xref: /aosp_15_r20/external/mesa3d/src/intel/compiler/brw_nir_lower_rt_intrinsics.c (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 /*
2  * Copyright (c) 2020 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
24 #include "brw_nir_rt.h"
25 #include "brw_nir_rt_builder.h"
26 
27 static nir_def *
nir_build_vec3_mat_mult_col_major(nir_builder * b,nir_def * vec,nir_def * matrix[],bool translation)28 nir_build_vec3_mat_mult_col_major(nir_builder *b, nir_def *vec,
29                                   nir_def *matrix[], bool translation)
30 {
31    nir_def *result_components[3] = {
32       nir_channel(b, matrix[3], 0),
33       nir_channel(b, matrix[3], 1),
34       nir_channel(b, matrix[3], 2),
35    };
36    for (unsigned i = 0; i < 3; ++i) {
37       for (unsigned j = 0; j < 3; ++j) {
38          nir_def *v = nir_fmul(b, nir_channels(b, vec, 1 << j), nir_channels(b, matrix[j], 1 << i));
39          result_components[i] = (translation || j) ? nir_fadd(b, result_components[i], v) : v;
40       }
41    }
42    return nir_vec(b, result_components, 3);
43 }
44 
45 static nir_def *
build_leaf_is_procedural(nir_builder * b,struct brw_nir_rt_mem_hit_defs * hit)46 build_leaf_is_procedural(nir_builder *b, struct brw_nir_rt_mem_hit_defs *hit)
47 {
48    switch (b->shader->info.stage) {
49    case MESA_SHADER_ANY_HIT:
50       /* Any-hit shaders are always compiled into intersection shaders for
51        * procedural geometry.  If we got here in an any-hit shader, it's for
52        * triangles.
53        */
54       return nir_imm_false(b);
55 
56    case MESA_SHADER_INTERSECTION:
57       return nir_imm_true(b);
58 
59    default:
60       return nir_ieq_imm(b, hit->leaf_type,
61                             BRW_RT_BVH_NODE_TYPE_PROCEDURAL);
62    }
63 }
64 
/* Lower ray-tracing intrinsics and system values in @impl to global-memory
 * loads from the RT dispatch globals, the MemHit/MemRay structures, and BVH
 * leaf data, using the layout helpers for @devinfo.  All shared replacement
 * values are emitted once at the top of the impl; each intrinsic use is then
 * rewritten to the precomputed SSA value.
 */
static void
lower_rt_intrinsics_impl(nir_function_impl *impl,
                         const struct intel_device_info *devinfo)
{
   bool progress = false;

   /* Emit all the shared loads before the first real instruction. */
   nir_builder build = nir_builder_at(nir_before_impl(impl));
   nir_builder *b = &build;

   struct brw_nir_rt_globals_defs globals;
   brw_nir_rt_load_globals(b, &globals);

   /* The SW hotzone is a 4-dword record: channel 0 is the per-thread SW
    * stack offset, channels 1-3 hold the ray launch ID (see the 0xe mask
    * in the load_ray_launch_id case below).
    */
   nir_def *hotzone_addr = brw_nir_rt_sw_hotzone_addr(b, devinfo);
   nir_def *hotzone = nir_load_global(b, hotzone_addr, 16, 4, 32);

   gl_shader_stage stage = b->shader->info.stage;
   struct brw_nir_rt_mem_ray_defs world_ray_in = {};
   struct brw_nir_rt_mem_ray_defs object_ray_in = {};
   struct brw_nir_rt_mem_hit_defs hit_in = {};
   switch (stage) {
   case MESA_SHADER_ANY_HIT:
   case MESA_SHADER_CLOSEST_HIT:
   case MESA_SHADER_INTERSECTION:
      /* Hit-group stages have a hit record and an object-space ray in
       * addition to the world-space ray loaded below (fallthrough).
       */
      brw_nir_rt_load_mem_hit(b, &hit_in,
                              stage == MESA_SHADER_CLOSEST_HIT);
      brw_nir_rt_load_mem_ray(b, &object_ray_in,
                              BRW_RT_BVH_LEVEL_OBJECT);
      FALLTHROUGH;

   case MESA_SHADER_MISS:
      brw_nir_rt_load_mem_ray(b, &world_ray_in,
                              BRW_RT_BVH_LEVEL_WORLD);
      break;

   default:
      break;
   }

   /* Current SW stack pointer = per-thread stack base + offset stored in
    * hotzone channel 0.  This may be re-derived below if an rt_resume pops
    * the stack.
    */
   nir_def *thread_stack_base_addr = brw_nir_rt_sw_stack_addr(b, devinfo);
   nir_def *stack_base_offset = nir_channel(b, hotzone, 0);
   nir_def *stack_base_addr =
      nir_iadd(b, thread_stack_base_addr, nir_u2u64(b, stack_base_offset));
   ASSERTED bool seen_scratch_base_ptr_load = false;
   ASSERTED bool found_resume = false;

   nir_foreach_block(block, impl) {
      nir_foreach_instr_safe(instr, block) {
         if (instr->type != nir_instr_type_intrinsic)
            continue;

         nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);

         /* Insert replacement code right after the intrinsic so any value
          * it feeds can be rewritten in place.
          */
         b->cursor = nir_after_instr(&intrin->instr);

         nir_def *sysval = NULL;
         switch (intrin->intrinsic) {
         case nir_intrinsic_load_scratch_base_ptr:
            assert(nir_intrinsic_base(intrin) == 1);
            seen_scratch_base_ptr_load = true;
            sysval = stack_base_addr;
            break;

         case nir_intrinsic_btd_stack_push_intel: {
            /* Bump the hotzone stack offset so a child shader sees its own
             * stack frame above ours.
             */
            int32_t stack_size = nir_intrinsic_stack_size(intrin);
            if (stack_size > 0) {
               nir_def *child_stack_offset =
                  nir_iadd_imm(b, stack_base_offset, stack_size);
               nir_store_global(b, hotzone_addr, 16, child_stack_offset, 0x1);
            }
            nir_instr_remove(instr);
            break;
         }

         case nir_intrinsic_rt_resume:
            /* This is the first "interesting" instruction */
            assert(block == nir_start_block(impl));
            assert(!seen_scratch_base_ptr_load);
            found_resume = true;

            /* Pop our frame back off the SW stack and re-derive the stack
             * base address from the new offset.
             */
            int32_t stack_size = nir_intrinsic_stack_size(intrin);
            if (stack_size > 0) {
               stack_base_offset =
                  nir_iadd_imm(b, stack_base_offset, -stack_size);
               nir_store_global(b, hotzone_addr, 16, stack_base_offset, 0x1);
               stack_base_addr = nir_iadd(b, thread_stack_base_addr,
                                          nir_u2u64(b, stack_base_offset));
            }
            nir_instr_remove(instr);
            break;

         case nir_intrinsic_load_uniform: {
            /* We don't want to lower this in the launch trampoline. */
            if (stage == MESA_SHADER_COMPUTE)
               break;

            /* Push constants live at a fixed offset from the BTD global
             * argument pointer.
             */
            sysval = brw_nir_load_global_const(b, intrin,
                        nir_load_btd_global_arg_addr_intel(b),
                        BRW_RT_PUSH_CONST_OFFSET);

            break;
         }

         case nir_intrinsic_load_ray_launch_id:
            /* Launch ID is stored in hotzone channels 1-3. */
            sysval = nir_channels(b, hotzone, 0xe);
            break;

         case nir_intrinsic_load_ray_launch_size:
            sysval = globals.launch_size;
            break;

         case nir_intrinsic_load_ray_world_origin:
            sysval = world_ray_in.orig;
            break;

         case nir_intrinsic_load_ray_world_direction:
            sysval = world_ray_in.dir;
            break;

         case nir_intrinsic_load_ray_object_origin:
            /* In closest-hit the object-space ray isn't committed, so
             * transform the world-space origin through the instance leaf's
             * world-to-object matrix (with translation) instead.
             */
            if (stage == MESA_SHADER_CLOSEST_HIT) {
               struct brw_nir_rt_bvh_instance_leaf_defs leaf;
               brw_nir_rt_load_bvh_instance_leaf(b, &leaf, hit_in.inst_leaf_ptr);

               sysval = nir_build_vec3_mat_mult_col_major(
                  b, world_ray_in.orig, leaf.world_to_object, true);
            } else {
               sysval = object_ray_in.orig;
            }
            break;

         case nir_intrinsic_load_ray_object_direction:
            /* Same as the origin above, but directions do not pick up the
             * translation column.
             */
            if (stage == MESA_SHADER_CLOSEST_HIT) {
               struct brw_nir_rt_bvh_instance_leaf_defs leaf;
               brw_nir_rt_load_bvh_instance_leaf(b, &leaf, hit_in.inst_leaf_ptr);

               sysval = nir_build_vec3_mat_mult_col_major(
                  b, world_ray_in.dir, leaf.world_to_object, false);
            } else {
               sysval = object_ray_in.dir;
            }
            break;

         case nir_intrinsic_load_ray_t_min:
            /* It shouldn't matter which we pull this from */
            sysval = world_ray_in.t_near;
            break;

         case nir_intrinsic_load_ray_t_max:
            /* Miss shaders have no hit record; elsewhere TMax is the hit T. */
            if (stage == MESA_SHADER_MISS)
               sysval = world_ray_in.t_far;
            else
               sysval = hit_in.t;
            break;

         case nir_intrinsic_load_primitive_id:
            sysval = brw_nir_rt_load_primitive_id_from_hit(b,
                                                           build_leaf_is_procedural(b, &hit_in),
                                                           &hit_in);
            break;

         case nir_intrinsic_load_instance_id: {
            struct brw_nir_rt_bvh_instance_leaf_defs leaf;
            brw_nir_rt_load_bvh_instance_leaf(b, &leaf, hit_in.inst_leaf_ptr);
            sysval = leaf.instance_index;
            break;
         }

         case nir_intrinsic_load_ray_object_to_world: {
            /* One matrix column per intrinsic, selected by the column index. */
            struct brw_nir_rt_bvh_instance_leaf_defs leaf;
            brw_nir_rt_load_bvh_instance_leaf(b, &leaf, hit_in.inst_leaf_ptr);
            sysval = leaf.object_to_world[nir_intrinsic_column(intrin)];
            break;
         }

         case nir_intrinsic_load_ray_world_to_object: {
            struct brw_nir_rt_bvh_instance_leaf_defs leaf;
            brw_nir_rt_load_bvh_instance_leaf(b, &leaf, hit_in.inst_leaf_ptr);
            sysval = leaf.world_to_object[nir_intrinsic_column(intrin)];
            break;
         }

         case nir_intrinsic_load_ray_hit_kind: {
            /* For triangles the kind is front/back facing; procedural hits
             * carry their own kind in the hit record.
             */
            nir_def *tri_hit_kind =
               nir_bcsel(b, hit_in.front_face,
                            nir_imm_int(b, BRW_RT_HIT_KIND_FRONT_FACE),
                            nir_imm_int(b, BRW_RT_HIT_KIND_BACK_FACE));
            sysval = nir_bcsel(b, build_leaf_is_procedural(b, &hit_in),
                                  hit_in.aabb_hit_kind, tri_hit_kind);
            break;
         }

         case nir_intrinsic_load_ray_flags:
            /* We need to fetch the original ray flags we stored in the
             * leaf pointer, because the actual ray flags we get here
             * will include any flags passed on the pipeline at creation
             * time, and the spec for IncomingRayFlagsKHR says:
             *   Setting pipeline flags on the raytracing pipeline must not
             *   cause any corresponding flags to be set in variables with
             *   this decoration.
             */
            sysval = nir_u2u32(b, world_ray_in.inst_leaf_ptr);
            break;

         case nir_intrinsic_load_cull_mask:
            sysval = nir_u2u32(b, world_ray_in.ray_mask);
            break;

         case nir_intrinsic_load_ray_geometry_index: {
            /* The geometry index is the low 29 bits of the dword at
             * prim_leaf_ptr + 4.
             */
            nir_def *geometry_index_dw =
               nir_load_global(b, nir_iadd_imm(b, hit_in.prim_leaf_ptr, 4), 4,
                               1, 32);
            sysval = nir_iand_imm(b, geometry_index_dw, BITFIELD_MASK(29));
            break;
         }

         case nir_intrinsic_load_ray_instance_custom_index: {
            struct brw_nir_rt_bvh_instance_leaf_defs leaf;
            brw_nir_rt_load_bvh_instance_leaf(b, &leaf, hit_in.inst_leaf_ptr);
            sysval = leaf.instance_id;
            break;
         }

         case nir_intrinsic_load_shader_record_ptr:
            /* We can't handle this intrinsic in resume shaders because the
             * handle we get there won't be from the original SBT.  The shader
             * call lowering/splitting pass should have ensured that this
             * value was spilled from the initial shader and unspilled in any
             * resume shaders that need it.
             */
            assert(!found_resume);
            sysval = nir_load_btd_local_arg_addr_intel(b);
            break;

         case nir_intrinsic_load_ray_base_mem_addr_intel:
            sysval = globals.base_mem_addr;
            break;

         case nir_intrinsic_load_ray_hw_stack_size_intel:
            /* Stack sizes in the globals are in 64-byte units. */
            sysval = nir_imul_imm(b, globals.hw_stack_size, 64);
            break;

         case nir_intrinsic_load_ray_sw_stack_size_intel:
            sysval = nir_imul_imm(b, globals.sw_stack_size, 64);
            break;

         case nir_intrinsic_load_ray_num_dss_rt_stacks_intel:
            sysval = globals.num_dss_rt_stacks;
            break;

         case nir_intrinsic_load_ray_hit_sbt_addr_intel:
            sysval = globals.hit_sbt_addr;
            break;

         case nir_intrinsic_load_ray_hit_sbt_stride_intel:
            sysval = globals.hit_sbt_stride;
            break;

         case nir_intrinsic_load_ray_miss_sbt_addr_intel:
            sysval = globals.miss_sbt_addr;
            break;

         case nir_intrinsic_load_ray_miss_sbt_stride_intel:
            sysval = globals.miss_sbt_stride;
            break;

         case nir_intrinsic_load_callable_sbt_addr_intel:
            sysval = globals.call_sbt_addr;
            break;

         case nir_intrinsic_load_callable_sbt_stride_intel:
            sysval = globals.call_sbt_stride;
            break;

         case nir_intrinsic_load_btd_resume_sbt_addr_intel:
            /* The resume SBT address is patched in at upload time via
             * relocations; reassemble the 64-bit pointer from the two
             * 32-bit reloc constants.
             */
            sysval = nir_pack_64_2x32_split(b,
               nir_load_reloc_const_intel(b, BRW_SHADER_RELOC_RESUME_SBT_ADDR_LOW),
               nir_load_reloc_const_intel(b, BRW_SHADER_RELOC_RESUME_SBT_ADDR_HIGH));
            break;

         case nir_intrinsic_load_leaf_procedural_intel:
            sysval = build_leaf_is_procedural(b, &hit_in);
            break;

         case nir_intrinsic_load_ray_triangle_vertex_positions: {
            struct brw_nir_rt_bvh_primitive_leaf_positions_defs pos;
            brw_nir_rt_load_bvh_primitive_leaf_positions(b, &pos, hit_in.prim_leaf_ptr);
            sysval = pos.positions[nir_intrinsic_column(intrin)];
            break;
         }

         case nir_intrinsic_load_leaf_opaque_intel: {
            if (stage == MESA_SHADER_INTERSECTION) {
               /* In intersection shaders, the opaque bit is passed to us in
                * the front_face bit.
                */
               sysval = hit_in.front_face;
            } else {
               /* Otherwise read the opaque flag (bit 30) out of the
                * primitive leaf's flags dword.
                */
               nir_def *flags_dw =
                  nir_load_global(b, nir_iadd_imm(b, hit_in.prim_leaf_ptr, 4), 4,
                                  1, 32);
               sysval = nir_i2b(b, nir_iand_imm(b, flags_dw, 1u << 30));
            }
            break;
         }

         default:
            continue;
         }

         progress = true;

         /* sysval is NULL for the cases that only remove the intrinsic. */
         if (sysval) {
            nir_def_replace(&intrin->def, sysval);
         }
      }
   }

   nir_metadata_preserve(impl,
                         progress ?
                         nir_metadata_none :
                         (nir_metadata_control_flow));
}
387 
388 /** Lower ray-tracing system values and intrinsics
389  *
390  * In most 3D shader stages, intrinsics are a fairly thin wrapper around
391  * hardware functionality and system values represent magic bits that come
392  * into the shader from FF hardware.  Ray-tracing, however, looks a bit more
393  * like the OpenGL 1.0 world where the underlying hardware is simple and most
394  * of the API implementation is software.
395  *
396  * In particular, most things that are treated as system values (or built-ins
397  * in SPIR-V) don't get magically dropped into registers for us.  Instead, we
398  * have to fetch them from the relevant data structures shared with the
399  * ray-tracing hardware.  Most come from either the RT_DISPATCH_GLOBALS or
400  * from one of the MemHit data structures.  Some, such as primitive_id require
401  * us to fetch the leaf address from the MemHit struct and then manually read
402  * the data out of the BVH.  Instead of trying to emit all this code deep in
403  * the back-end where we can't effectively optimize it, we lower it all to
404  * global memory access in NIR.
405  *
406  * Once this pass is complete, the only real system values left are the two
407  * argument pointer system values for BTD dispatch: btd_local_arg_addr and
408  * btd_global_arg_addr.
409  */
410 void
brw_nir_lower_rt_intrinsics(nir_shader * nir,const struct intel_device_info * devinfo)411 brw_nir_lower_rt_intrinsics(nir_shader *nir,
412                             const struct intel_device_info *devinfo)
413 {
414    nir_foreach_function_impl(impl, nir) {
415       lower_rt_intrinsics_impl(impl, devinfo);
416    }
417 }
418