/*
 * Copyright © 2023 Valve Corporation
 *
 * SPDX-License-Identifier: MIT
 */
6
7 #include "ac_gpu_info.h"
8 #include "ac_nir.h"
9 #include "nir.h"
10 #include "nir_builder.h"
11 #include "radv_constants.h"
12 #include "radv_nir.h"
13 #include "radv_shader.h"
14 #include "radv_shader_args.h"
15
/* Read-only context handed to every instruction callback of the VS input lowering pass. */
typedef struct {
   /* Shader argument layout; used with ac_nir_load_arg() to read vs_inputs[] and vertex_buffers. */
   const struct radv_shader_args *args;

   /* Shader info; the pass reads vs.dynamic_inputs, vs.use_per_attribute_vb_descs
    * and vs.vb_desc_usage_mask.
    */
   const struct radv_shader_info *info;

   /* Graphics state key; the pass reads the per-attribute vertex input state in vi.*. */
   const struct radv_graphics_state_key *gfx_state;

   /* GPU description; the pass reads gfx_level, family and address32_hi. */
   const struct radeon_info *gpu_info;
} lower_vs_inputs_state;
22
23 static nir_def *
lower_load_vs_input_from_prolog(nir_builder * b,nir_intrinsic_instr * intrin,lower_vs_inputs_state * s)24 lower_load_vs_input_from_prolog(nir_builder *b, nir_intrinsic_instr *intrin, lower_vs_inputs_state *s)
25 {
26 nir_src *offset_src = nir_get_io_offset_src(intrin);
27 assert(nir_src_is_const(*offset_src));
28
29 const nir_io_semantics io_sem = nir_intrinsic_io_semantics(intrin);
30 const unsigned base_offset = nir_src_as_uint(*offset_src);
31 const unsigned location = io_sem.location + base_offset - VERT_ATTRIB_GENERIC0;
32 const unsigned component = nir_intrinsic_component(intrin);
33 const unsigned bit_size = intrin->def.bit_size;
34 const unsigned num_components = intrin->def.num_components;
35
36 /* 64-bit inputs: they occupy twice as many 32-bit components.
37 * 16-bit inputs: they occupy a 32-bit component (not packed).
38 */
39 const unsigned arg_bit_size = MAX2(bit_size, 32);
40
41 unsigned num_input_args = 1;
42 nir_def *input_args[2] = {ac_nir_load_arg(b, &s->args->ac, s->args->vs_inputs[location]), NULL};
43 if (component * 32 + arg_bit_size * num_components > 128) {
44 assert(bit_size == 64);
45
46 num_input_args++;
47 input_args[1] = ac_nir_load_arg(b, &s->args->ac, s->args->vs_inputs[location + 1]);
48 }
49
50 nir_def *extracted = nir_extract_bits(b, input_args, num_input_args, component * 32, num_components, arg_bit_size);
51
52 if (bit_size < arg_bit_size) {
53 assert(bit_size == 16);
54
55 if (nir_alu_type_get_base_type(nir_intrinsic_dest_type(intrin)) == nir_type_float)
56 return nir_f2f16(b, extracted);
57 else
58 return nir_u2u16(b, extracted);
59 }
60
61 return extracted;
62 }
63
64 static nir_def *
calc_vs_input_index_instance_rate(nir_builder * b,unsigned location,lower_vs_inputs_state * s)65 calc_vs_input_index_instance_rate(nir_builder *b, unsigned location, lower_vs_inputs_state *s)
66 {
67 const uint32_t divisor = s->gfx_state->vi.instance_rate_divisors[location];
68 nir_def *start_instance = nir_load_base_instance(b);
69
70 if (divisor == 0)
71 return start_instance;
72
73 nir_def *instance_id = nir_udiv_imm(b, nir_load_instance_id(b), divisor);
74 return nir_iadd(b, start_instance, instance_id);
75 }
76
77 static nir_def *
calc_vs_input_index(nir_builder * b,unsigned location,lower_vs_inputs_state * s)78 calc_vs_input_index(nir_builder *b, unsigned location, lower_vs_inputs_state *s)
79 {
80 if (s->gfx_state->vi.instance_rate_inputs & BITFIELD_BIT(location))
81 return calc_vs_input_index_instance_rate(b, location, s);
82
83 return nir_iadd(b, nir_load_first_vertex(b), nir_load_vertex_id_zero_base(b));
84 }
85
86 static bool
can_use_untyped_load(const struct util_format_description * f,const unsigned bit_size)87 can_use_untyped_load(const struct util_format_description *f, const unsigned bit_size)
88 {
89 /* All components must have same size and type. */
90 if (!f->is_array)
91 return false;
92
93 const struct util_format_channel_description *c = &f->channel[0];
94 return c->size == bit_size && bit_size >= 32;
95 }
96
97 static nir_def *
oob_input_load_value(nir_builder * b,const unsigned channel_idx,const unsigned bit_size,const bool is_float)98 oob_input_load_value(nir_builder *b, const unsigned channel_idx, const unsigned bit_size, const bool is_float)
99 {
100 /* 22.1.1. Attribute Location and Component Assignment of Vulkan 1.3 specification:
101 * For 64-bit data types, no default attribute values are provided. Input variables
102 * must not use more components than provided by the attribute.
103 */
104 if (bit_size == 64)
105 return nir_undef(b, 1, bit_size);
106
107 if (channel_idx == 3) {
108 if (is_float)
109 return nir_imm_floatN_t(b, 1.0, bit_size);
110 else
111 return nir_imm_intN_t(b, 1, bit_size);
112 }
113
114 return nir_imm_intN_t(b, 0, bit_size);
115 }
116
117 static unsigned
count_format_bytes(const struct util_format_description * f,const unsigned first_channel,const unsigned num_channels)118 count_format_bytes(const struct util_format_description *f, const unsigned first_channel, const unsigned num_channels)
119 {
120 if (!num_channels)
121 return 0;
122
123 const unsigned last_channel = first_channel + num_channels - 1;
124 assert(last_channel < f->nr_channels);
125 unsigned bits = 0;
126 for (unsigned i = first_channel; i <= last_channel; ++i) {
127 bits += f->channel[i].size;
128 }
129
130 assert(bits % 8 == 0);
131 return bits / 8;
132 }
133
134 static bool
format_needs_swizzle(const struct util_format_description * f)135 format_needs_swizzle(const struct util_format_description *f)
136 {
137 for (unsigned i = 0; i < f->nr_channels; ++i) {
138 if (f->swizzle[i] != PIPE_SWIZZLE_X + i)
139 return true;
140 }
141
142 return false;
143 }
144
145 static unsigned
first_used_swizzled_channel(const struct util_format_description * f,const unsigned mask,const bool backwards)146 first_used_swizzled_channel(const struct util_format_description *f, const unsigned mask, const bool backwards)
147 {
148 unsigned first_used = backwards ? 0 : f->nr_channels;
149 const unsigned it_mask = mask & BITFIELD_MASK(f->nr_channels);
150
151 u_foreach_bit (b, it_mask) {
152 assert(f->swizzle[b] != PIPE_SWIZZLE_0 && f->swizzle[b] != PIPE_SWIZZLE_1);
153 const unsigned c = f->swizzle[b] - PIPE_SWIZZLE_X;
154 first_used = backwards ? MAX2(first_used, c) : MIN2(first_used, c);
155 }
156
157 return first_used;
158 }
159
160 static nir_def *
adjust_vertex_fetch_alpha(nir_builder * b,enum ac_vs_input_alpha_adjust alpha_adjust,nir_def * alpha)161 adjust_vertex_fetch_alpha(nir_builder *b, enum ac_vs_input_alpha_adjust alpha_adjust, nir_def *alpha)
162 {
163 if (alpha_adjust == AC_ALPHA_ADJUST_SSCALED)
164 alpha = nir_f2u32(b, alpha);
165
166 /* For the integer-like cases, do a natural sign extension.
167 *
168 * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0 and happen to contain 0, 1, 2, 3 as
169 * the two LSBs of the exponent.
170 */
171 unsigned offset = alpha_adjust == AC_ALPHA_ADJUST_SNORM ? 23u : 0u;
172
173 alpha = nir_ibfe_imm(b, alpha, offset, 2u);
174
175 /* Convert back to the right type. */
176 if (alpha_adjust == AC_ALPHA_ADJUST_SNORM) {
177 alpha = nir_i2f32(b, alpha);
178 alpha = nir_fmax(b, alpha, nir_imm_float(b, -1.0f));
179 } else if (alpha_adjust == AC_ALPHA_ADJUST_SSCALED) {
180 alpha = nir_i2f32(b, alpha);
181 }
182
183 return alpha;
184 }
185
186 static nir_def *
lower_load_vs_input(nir_builder * b,nir_intrinsic_instr * intrin,lower_vs_inputs_state * s)187 lower_load_vs_input(nir_builder *b, nir_intrinsic_instr *intrin, lower_vs_inputs_state *s)
188 {
189 nir_src *offset_src = nir_get_io_offset_src(intrin);
190 assert(nir_src_is_const(*offset_src));
191
192 const nir_io_semantics io_sem = nir_intrinsic_io_semantics(intrin);
193 const unsigned base_offset = nir_src_as_uint(*offset_src);
194 const unsigned location = io_sem.location + base_offset - VERT_ATTRIB_GENERIC0;
195 const unsigned bit_size = intrin->def.bit_size;
196 const unsigned dest_num_components = intrin->def.num_components;
197
198 /* Convert the component offset to bit_size units.
199 * (Intrinsic component offset is in 32-bit units.)
200 *
201 * Small bitsize inputs consume the same space as 32-bit inputs,
202 * but 64-bit inputs consume twice as many.
203 * 64-bit variables must not have a component of 1 or 3.
204 * (See VK spec 15.1.5 "Component Assignment")
205 */
206 const unsigned component = nir_intrinsic_component(intrin) / (MAX2(32, bit_size) / 32);
207
208 /* Bitmask of components in bit_size units
209 * of the current input load that are actually used.
210 */
211 const unsigned dest_use_mask = nir_def_components_read(&intrin->def) << component;
212
213 /* If the input is entirely unused, just replace it with undef.
214 * This is just in case we debug this pass without running DCE first.
215 */
216 if (!dest_use_mask)
217 return nir_undef(b, dest_num_components, bit_size);
218
219 const uint32_t attrib_binding = s->gfx_state->vi.vertex_attribute_bindings[location];
220 const uint32_t attrib_offset = s->gfx_state->vi.vertex_attribute_offsets[location];
221 const uint32_t attrib_stride = s->gfx_state->vi.vertex_attribute_strides[location];
222 const enum pipe_format attrib_format = s->gfx_state->vi.vertex_attribute_formats[location];
223 const struct util_format_description *f = util_format_description(attrib_format);
224 const struct ac_vtx_format_info *vtx_info =
225 ac_get_vtx_format_info(s->gpu_info->gfx_level, s->gpu_info->family, attrib_format);
226 const unsigned binding_index = s->info->vs.use_per_attribute_vb_descs ? location : attrib_binding;
227 const unsigned desc_index = util_bitcount(s->info->vs.vb_desc_usage_mask & u_bit_consecutive(0, binding_index));
228
229 nir_def *vertex_buffers_arg = ac_nir_load_arg(b, &s->args->ac, s->args->ac.vertex_buffers);
230 nir_def *vertex_buffers = nir_pack_64_2x32_split(b, vertex_buffers_arg, nir_imm_int(b, s->gpu_info->address32_hi));
231 nir_def *descriptor = nir_load_smem_amd(b, 4, vertex_buffers, nir_imm_int(b, desc_index * 16));
232 nir_def *base_index = calc_vs_input_index(b, location, s);
233 nir_def *zero = nir_imm_int(b, 0);
234
235 /* We currently implement swizzling for all formats in shaders.
236 * Note, it is possible to specify swizzling in the DST_SEL fields of descriptors,
237 * but we don't use that because typed loads using the MTBUF instruction format
238 * don't support DST_SEL, so it's simpler to just handle it all in shaders.
239 */
240 const bool needs_swizzle = format_needs_swizzle(f);
241
242 /* We need to adjust the alpha channel as loaded by the HW,
243 * for example sign extension and normalization may be necessary.
244 */
245 const enum ac_vs_input_alpha_adjust alpha_adjust = vtx_info->alpha_adjust;
246
247 /* Try to shrink the load format by skipping unused components from the start.
248 * Beneficial because the backend may be able to emit fewer HW instructions.
249 * Only possible with array formats.
250 */
251 const unsigned first_used_channel = first_used_swizzled_channel(f, dest_use_mask, false);
252 const unsigned skipped_start = f->is_array ? first_used_channel : 0;
253
254 /* Number of channels we actually use and load.
255 * Don't shrink the format here because this might allow the backend to
256 * emit fewer (but larger than needed) HW instructions.
257 */
258 const unsigned first_trailing_unused_channel = first_used_swizzled_channel(f, dest_use_mask, true) + 1;
259 const unsigned max_loaded_channels = MIN2(first_trailing_unused_channel, f->nr_channels);
260 const unsigned fetch_num_channels =
261 first_used_channel >= max_loaded_channels ? 0 : max_loaded_channels - skipped_start;
262
263 /* Load VS inputs from VRAM.
264 *
265 * For the vast majority of cases this will only create 1x load_(typed)_buffer_amd
266 * intrinsic and the backend is responsible for further splitting that
267 * to as many HW instructions as needed based on alignment.
268 *
269 * Take care to prevent loaded components from failing the range check,
270 * by emitting several load intrinsics with different index sources.
271 * This is necessary because the backend can't further roll the const offset
272 * into the index source of MUBUF / MTBUF instructions.
273 */
274 nir_def *loads[NIR_MAX_VEC_COMPONENTS] = {0};
275 unsigned num_loads = 0;
276 for (unsigned x = 0, channels; x < fetch_num_channels; x += channels) {
277 channels = fetch_num_channels - x;
278 const unsigned start = skipped_start + x;
279 enum pipe_format fetch_format = attrib_format;
280 nir_def *index = base_index;
281
282 /* Add excess constant offset to the index. */
283 unsigned const_off = attrib_offset + count_format_bytes(f, 0, start);
284 if (attrib_stride && const_off >= attrib_stride) {
285 index = nir_iadd_imm(b, base_index, const_off / attrib_stride);
286 const_off %= attrib_stride;
287 }
288
289 /* Reduce the number of loaded channels until we can pass the range check.
290 * Only for array formats. VK spec mandates proper alignment for packed formats.
291 * Note, NONE seems to occur in real use and is considered an array format.
292 */
293 if (f->is_array && fetch_format != PIPE_FORMAT_NONE) {
294 while (channels > 1 && attrib_stride && (const_off + count_format_bytes(f, start, channels)) > attrib_stride) {
295 channels--;
296 }
297
298 /* Keep the fetch format as large as possible to let the backend emit
299 * larger load instructions when it deems them beneficial.
300 */
301 fetch_format = util_format_get_array(f->channel[0].type, f->channel[0].size, f->nr_channels - start,
302 f->is_unorm || f->is_snorm, f->channel[0].pure_integer);
303 }
304
305 assert(f->is_array || channels == fetch_num_channels);
306
307 /* Prefer using untyped buffer loads if possible, to avoid potential alignment issues.
308 * Typed loads can cause GPU hangs when used with improper alignment.
309 */
310 if (can_use_untyped_load(f, bit_size)) {
311 loads[num_loads++] = nir_load_buffer_amd(b, channels, bit_size, descriptor, zero, zero, index,
312 .base = const_off, .memory_modes = nir_var_shader_in);
313 } else {
314 const unsigned align_mul = MAX2(1, s->gfx_state->vi.vertex_binding_align[attrib_binding]);
315 const unsigned align_offset = const_off % align_mul;
316
317 loads[num_loads++] = nir_load_typed_buffer_amd(
318 b, channels, bit_size, descriptor, zero, zero, index, .base = const_off, .format = fetch_format,
319 .align_mul = align_mul, .align_offset = align_offset, .memory_modes = nir_var_shader_in);
320 }
321 }
322
323 nir_def *load = loads[0];
324
325 /* Extract the channels we actually need when we couldn't skip starting
326 * components or had to emit more than one load intrinsic.
327 */
328 if (num_loads > 0 && (first_used_channel > skipped_start || num_loads != 1))
329 load = nir_extract_bits(b, loads, num_loads, (first_used_channel - skipped_start) * bit_size,
330 max_loaded_channels - first_used_channel, bit_size);
331
332 /* Return early if possible to avoid generating unnecessary IR. */
333 if (num_loads > 0 && first_used_channel == component && load->num_components == dest_num_components &&
334 !needs_swizzle && alpha_adjust == AC_ALPHA_ADJUST_NONE)
335 return load;
336
337 /* Fill unused and OOB components.
338 * Apply swizzle and alpha adjust according to the format.
339 */
340 const nir_alu_type dst_type = nir_alu_type_get_base_type(nir_intrinsic_dest_type(intrin));
341 nir_def *channels[NIR_MAX_VEC_COMPONENTS] = {0};
342 for (unsigned i = 0; i < dest_num_components; ++i) {
343 const unsigned c = i + component;
344
345 if (!(dest_use_mask & BITFIELD_BIT(c))) {
346 /* Fill unused channels with zero. */
347 channels[i] = nir_imm_zero(b, 1, bit_size);
348 continue;
349 }
350
351 const unsigned sw = f->swizzle[c];
352 assert(sw >= first_used_channel);
353 const unsigned loaded_channel = sw - first_used_channel;
354
355 if (load && loaded_channel < load->num_components) {
356 /* Use channels that were loaded from VRAM. */
357 channels[i] = nir_channel(b, load, loaded_channel);
358
359 if (alpha_adjust != AC_ALPHA_ADJUST_NONE && c == 3)
360 channels[i] = adjust_vertex_fetch_alpha(b, alpha_adjust, channels[i]);
361 } else {
362 /* Handle input loads that are larger than their format. */
363 channels[i] = oob_input_load_value(b, c, bit_size, dst_type == nir_type_float);
364 }
365 }
366
367 return nir_vec(b, channels, dest_num_components);
368 }
369
370 static bool
lower_vs_input_instr(nir_builder * b,nir_intrinsic_instr * intrin,void * state)371 lower_vs_input_instr(nir_builder *b, nir_intrinsic_instr *intrin, void *state)
372 {
373 if (intrin->intrinsic != nir_intrinsic_load_input)
374 return false;
375
376 lower_vs_inputs_state *s = (lower_vs_inputs_state *)state;
377
378 b->cursor = nir_before_instr(&intrin->instr);
379
380 nir_def *replacement = NULL;
381
382 if (s->info->vs.dynamic_inputs) {
383 replacement = lower_load_vs_input_from_prolog(b, intrin, s);
384 } else {
385 replacement = lower_load_vs_input(b, intrin, s);
386 }
387
388 nir_def_replace(&intrin->def, replacement);
389 nir_instr_free(&intrin->instr);
390
391 return true;
392 }
393
394 bool
radv_nir_lower_vs_inputs(nir_shader * shader,const struct radv_shader_stage * vs_stage,const struct radv_graphics_state_key * gfx_state,const struct radeon_info * gpu_info)395 radv_nir_lower_vs_inputs(nir_shader *shader, const struct radv_shader_stage *vs_stage,
396 const struct radv_graphics_state_key *gfx_state, const struct radeon_info *gpu_info)
397 {
398 assert(shader->info.stage == MESA_SHADER_VERTEX);
399
400 lower_vs_inputs_state state = {
401 .info = &vs_stage->info,
402 .args = &vs_stage->args,
403 .gfx_state = gfx_state,
404 .gpu_info = gpu_info,
405 };
406
407 return nir_shader_intrinsics_pass(shader, lower_vs_input_instr, nir_metadata_control_flow, &state);
408 }
409