/*
 * Copyright © 2019 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "nir.h"
#include "nir_builder.h"
#include "nir_deref.h"

#include "util/bitscan.h"
#include "util/list.h"
#include "util/u_math.h"

/* Combine stores of vectors to the same deref into a single store.
 *
 * This per-block pass keeps track of stores of vectors to the same
 * destination and combines them into the last store of the sequence. Dead
 * stores (or dead parts of a store) found during the process are removed.
 *
 * A pending combination becomes an actual combination in various situations:
 * at the end of the block, when another instruction uses the memory, or due
 * to barriers.
 *
 * Besides vectors, the pass also looks at array derefs of vectors. A direct
 * array deref is treated as a store with a write mask for the given
 * component. For an indirect access there's no way to know beforehand which
 * component it will overlap with, so any pending combination is finished --
 * the indirect store itself remains unmodified.
 */
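
/* A rough sketch of the transformation, with schematic NIR and illustrative
 * names (not tied to any particular shader): given
 *
 *    store_deref &color, ssa_1, wrmask=x
 *    store_deref &color, ssa_2, wrmask=yz
 *
 * the pass tracks both stores and, when the group is flushed (end of block,
 * aliasing access, barrier, ...), rewrites the last store to cover all live
 * components and removes the now-dead first store:
 *
 *    store_deref &color, vec3(ssa_1.x, ssa_2.y, ssa_2.z), wrmask=xyz
 */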

/* Keep track of a group of stores that can be combined. All stores share the
 * same destination.
 */
struct combined_store {
   struct list_head link;

   nir_component_mask_t write_mask;
   nir_deref_instr *dst;

   /* Latest store added. It is reused when combining. */
   nir_intrinsic_instr *latest;

   /* Original store for each component. The number of times a store appears
    * in this array is kept in the store's pass_flags.
    */
   nir_intrinsic_instr *stores[NIR_MAX_VEC_COMPONENTS];
};

struct combine_stores_state {
   nir_variable_mode modes;

   /* Pending store combinations. */
   struct list_head pending;

   /* Per function impl state. */
   nir_builder b;
   bool progress;

   /* Allocator and freelist to reuse structs between functions. */
   linear_ctx *lin_ctx;
   struct list_head freelist;
};

static struct combined_store *
alloc_combined_store(struct combine_stores_state *state)
{
   struct combined_store *result;
   if (list_is_empty(&state->freelist)) {
      result = linear_zalloc_child(state->lin_ctx, sizeof(*result));
   } else {
      result = list_first_entry(&state->freelist,
                                struct combined_store,
                                link);
      list_del(&result->link);
      memset(result, 0, sizeof(*result));
   }
   return result;
}

static void
free_combined_store(struct combine_stores_state *state,
                    struct combined_store *combo)
{
   list_del(&combo->link);
   combo->write_mask = 0;
   list_add(&combo->link, &state->freelist);
}

static void
combine_stores(struct combine_stores_state *state,
               struct combined_store *combo)
{
   assert(combo->latest);
   assert(combo->latest->intrinsic == nir_intrinsic_store_deref);

   /* If the combined writemask is the same as the latest store, we know there
    * is only one store in the combination, so nothing to combine.
    */
   if ((combo->write_mask & nir_intrinsic_write_mask(combo->latest)) ==
       combo->write_mask)
      return;

   state->b.cursor = nir_before_instr(&combo->latest->instr);

   /* Build a new vec, to be used as source for the combined store. As it
    * gets built, remove previous stores that are not needed anymore.
    */
   nir_scalar comps[NIR_MAX_VEC_COMPONENTS] = { 0 };
   unsigned num_components = glsl_get_vector_elements(combo->dst->type);
   unsigned bit_size = combo->latest->src[1].ssa->bit_size;
   for (unsigned i = 0; i < num_components; i++) {
      nir_intrinsic_instr *store = combo->stores[i];
      if (combo->write_mask & (1 << i)) {
         assert(store);

         /* If store->num_components == 1 then we are in the deref-of-vec case
          * and store->src[1] is a scalar. Otherwise, we're a regular vector
          * store and we have to pick off a component.
          */
         comps[i] = nir_get_scalar(store->src[1].ssa,
                                   store->num_components == 1 ? 0 : i);

         assert(store->instr.pass_flags > 0);
         if (--store->instr.pass_flags == 0 && store != combo->latest)
            nir_instr_remove(&store->instr);
      } else {
         comps[i] = nir_get_scalar(nir_undef(&state->b, 1, bit_size), 0);
      }
   }
   assert(combo->latest->instr.pass_flags == 0);
   nir_def *vec = nir_vec_scalars(&state->b, comps, num_components);

   /* Fix the latest store with the combined information. */
   nir_intrinsic_instr *store = combo->latest;

   /* In this case, our store is an array deref of a vector, so we need to
    * rewrite it to use a deref to the whole vector.
    */
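   /* For example (sketch, with an illustrative vec4 variable "color"): a
    * pending "store_deref &color[2], ssa_3" becomes a store to "&color"
    * whose combined write mask includes the .z bit.
    */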
   if (store->num_components == 1) {
      store->num_components = num_components;
      nir_src_rewrite(&store->src[0], &combo->dst->def);
   }

   assert(store->num_components == num_components);
   nir_intrinsic_set_write_mask(store, combo->write_mask);
   nir_src_rewrite(&store->src[1], vec);
   state->progress = true;
}

static void
combine_stores_with_deref(struct combine_stores_state *state,
                          nir_deref_instr *deref)
{
   if (!nir_deref_mode_may_be(deref, state->modes))
      return;

   list_for_each_entry_safe(struct combined_store, combo, &state->pending, link) {
      if (nir_compare_derefs(combo->dst, deref) & nir_derefs_may_alias_bit) {
         combine_stores(state, combo);
         free_combined_store(state, combo);
      }
   }
}

static void
combine_stores_with_modes(struct combine_stores_state *state,
                          nir_variable_mode modes)
{
   if ((state->modes & modes) == 0)
      return;

   list_for_each_entry_safe(struct combined_store, combo, &state->pending, link) {
      if (nir_deref_mode_may_be(combo->dst, modes)) {
         combine_stores(state, combo);
         free_combined_store(state, combo);
      }
   }
}

static struct combined_store *
find_matching_combined_store(struct combine_stores_state *state,
                             nir_deref_instr *deref)
{
   list_for_each_entry(struct combined_store, combo, &state->pending, link) {
      if (nir_compare_derefs(combo->dst, deref) & nir_derefs_equal_bit)
         return combo;
   }
   return NULL;
}

static void
update_combined_store(struct combine_stores_state *state,
                      nir_intrinsic_instr *intrin)
{
   nir_deref_instr *dst = nir_src_as_deref(intrin->src[0]);
   if (!nir_deref_mode_may_be(dst, state->modes))
      return;

   unsigned vec_mask;
   nir_deref_instr *vec_dst;

   if (glsl_type_is_vector(dst->type)) {
      vec_mask = nir_intrinsic_write_mask(intrin);
      vec_dst = dst;
   } else {
      /* Besides vectors, only direct array derefs of vectors are handled. */
      if (dst->deref_type != nir_deref_type_array ||
          !nir_src_is_const(dst->arr.index) ||
          !glsl_type_is_vector(nir_deref_instr_parent(dst)->type)) {
         combine_stores_with_deref(state, dst);
         return;
      }

      uint64_t index = nir_src_as_uint(dst->arr.index);
      vec_dst = nir_deref_instr_parent(dst);

      if (index >= glsl_get_vector_elements(vec_dst->type)) {
         /* Storing to an invalid index is a no-op. */
         nir_instr_remove(&intrin->instr);
         state->progress = true;
         return;
      }

      vec_mask = 1 << index;
   }

   struct combined_store *combo = find_matching_combined_store(state, vec_dst);
   if (!combo) {
      combo = alloc_combined_store(state);
      combo->dst = vec_dst;
      list_add(&combo->link, &state->pending);
   }

   /* Use pass_flags to reference count the store based on how many
    * components are still used by the combination.
    */
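   /* For instance (sketch): a store with write mask .xz starts with
    * pass_flags == 2, and it is removed once all of its components have
    * either been overwritten by later stores or folded into the combined
    * store.
    */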
   intrin->instr.pass_flags = util_bitcount(vec_mask);
   combo->latest = intrin;

   /* Update the combined_store, clearing up older overlapping references. */
   combo->write_mask |= vec_mask;
   while (vec_mask) {
      unsigned i = u_bit_scan(&vec_mask);
      nir_intrinsic_instr *prev_store = combo->stores[i];

      if (prev_store) {
         if (--prev_store->instr.pass_flags == 0) {
            nir_instr_remove(&prev_store->instr);
         } else {
            assert(glsl_type_is_vector(
               nir_src_as_deref(prev_store->src[0])->type));
            nir_component_mask_t prev_mask = nir_intrinsic_write_mask(prev_store);
            nir_intrinsic_set_write_mask(prev_store, prev_mask & ~(1 << i));
         }
         state->progress = true;
      }
      combo->stores[i] = combo->latest;
   }
}

static void
combine_stores_block(struct combine_stores_state *state, nir_block *block)
{
   nir_foreach_instr_safe(instr, block) {
      if (instr->type == nir_instr_type_call) {
         combine_stores_with_modes(state, nir_var_shader_out |
                                             nir_var_shader_temp |
                                             nir_var_function_temp |
                                             nir_var_mem_ssbo |
                                             nir_var_mem_shared |
                                             nir_var_mem_global);
         continue;
      }

      if (instr->type != nir_instr_type_intrinsic)
         continue;

      nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
      switch (intrin->intrinsic) {
      case nir_intrinsic_store_deref:
         if (nir_intrinsic_access(intrin) & ACCESS_VOLATILE) {
            nir_deref_instr *dst = nir_src_as_deref(intrin->src[0]);
            /* When we see a volatile store, we go ahead and combine all
             * previous non-volatile stores which touch that address and
             * specifically don't add the volatile store to the list. This
             * way we guarantee that the volatile store isn't combined with
             * anything and no non-volatile stores are combined across a
             * volatile store.
             */
            combine_stores_with_deref(state, dst);
         } else {
            update_combined_store(state, intrin);
         }
         break;

      case nir_intrinsic_barrier:
         if (nir_intrinsic_memory_semantics(intrin) & NIR_MEMORY_RELEASE) {
            combine_stores_with_modes(state,
                                      nir_intrinsic_memory_modes(intrin));
         }
         break;

      case nir_intrinsic_emit_vertex:
      case nir_intrinsic_emit_vertex_with_counter:
         combine_stores_with_modes(state, nir_var_shader_out);
         break;

      case nir_intrinsic_report_ray_intersection:
         combine_stores_with_modes(state, nir_var_mem_ssbo |
                                             nir_var_mem_global |
                                             nir_var_shader_call_data |
                                             nir_var_ray_hit_attrib);
         break;

      case nir_intrinsic_ignore_ray_intersection:
      case nir_intrinsic_terminate_ray:
         combine_stores_with_modes(state, nir_var_mem_ssbo |
                                             nir_var_mem_global |
                                             nir_var_shader_call_data);
         break;

      case nir_intrinsic_load_deref: {
         nir_deref_instr *src = nir_src_as_deref(intrin->src[0]);
         combine_stores_with_deref(state, src);
         break;
      }

      case nir_intrinsic_load_deref_block_intel:
      case nir_intrinsic_store_deref_block_intel: {
         /* Combine all the stores that may alias with the whole variable (or
          * cast).
          */
         nir_deref_instr *operand = nir_src_as_deref(intrin->src[0]);
         while (nir_deref_instr_parent(operand))
            operand = nir_deref_instr_parent(operand);
         assert(operand->deref_type == nir_deref_type_var ||
                operand->deref_type == nir_deref_type_cast);

         combine_stores_with_deref(state, operand);
         break;
      }

      case nir_intrinsic_copy_deref:
      case nir_intrinsic_memcpy_deref: {
         nir_deref_instr *dst = nir_src_as_deref(intrin->src[0]);
         nir_deref_instr *src = nir_src_as_deref(intrin->src[1]);
         combine_stores_with_deref(state, dst);
         combine_stores_with_deref(state, src);
         break;
      }

      case nir_intrinsic_trace_ray:
      case nir_intrinsic_execute_callable:
      case nir_intrinsic_rt_trace_ray:
      case nir_intrinsic_rt_execute_callable: {
         nir_deref_instr *payload =
            nir_src_as_deref(*nir_get_shader_call_payload_src(intrin));
         combine_stores_with_deref(state, payload);
         break;
      }

      case nir_intrinsic_deref_atomic:
      case nir_intrinsic_deref_atomic_swap: {
         nir_deref_instr *dst = nir_src_as_deref(intrin->src[0]);
         combine_stores_with_deref(state, dst);
         break;
      }

      default:
         break;
      }
   }

   /* At the end of the block, try all the remaining combinations. */
   combine_stores_with_modes(state, state->modes);
}

static bool
combine_stores_impl(struct combine_stores_state *state, nir_function_impl *impl)
{
   state->progress = false;
   state->b = nir_builder_create(impl);

   nir_foreach_block(block, impl)
      combine_stores_block(state, block);

   if (state->progress) {
      nir_metadata_preserve(impl, nir_metadata_control_flow);
   } else {
      nir_metadata_preserve(impl, nir_metadata_all);
   }

   return state->progress;
}

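/* A typical invocation from a backend's optimization loop might look like
 * the following (sketch only; the mode set depends on the driver):
 *
 *    NIR_PASS(progress, shader, nir_opt_combine_stores,
 *             nir_var_mem_ssbo | nir_var_mem_shared | nir_var_function_temp);
 */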
bool
nir_opt_combine_stores(nir_shader *shader, nir_variable_mode modes)
{
   void *mem_ctx = ralloc_context(NULL);
   struct combine_stores_state state = {
      .modes = modes,
      .lin_ctx = linear_context(mem_ctx),
   };

   list_inithead(&state.pending);
   list_inithead(&state.freelist);

   bool progress = false;

   nir_foreach_function_impl(impl, shader) {
      progress |= combine_stores_impl(&state, impl);
   }

   ralloc_free(mem_ctx);
   return progress;
}