xref: /aosp_15_r20/external/mesa3d/src/intel/compiler/brw_simd_selection.cpp (revision 6104692788411f58d303aa86923a9ff6ecaded22)
/*
 * Copyright © 2021 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */
23 
24 #include "brw_private.h"
25 #include "compiler/shader_info.h"
26 #include "intel/dev/intel_debug.h"
27 #include "intel/dev/intel_device_info.h"
28 #include "util/ralloc.h"
29 
30 unsigned
brw_required_dispatch_width(const struct shader_info * info)31 brw_required_dispatch_width(const struct shader_info *info)
32 {
33    if ((int)info->subgroup_size >= (int)SUBGROUP_SIZE_REQUIRE_8) {
34       assert(gl_shader_stage_uses_workgroup(info->stage));
35       /* These enum values are expressly chosen to be equal to the subgroup
36        * size that they require.
37        */
38       return (unsigned)info->subgroup_size;
39    } else {
40       return 0;
41    }
42 }
43 
44 unsigned
brw_geometry_stage_dispatch_width(const struct intel_device_info * devinfo)45 brw_geometry_stage_dispatch_width(const struct intel_device_info *devinfo)
46 {
47    if (devinfo->ver >= 20)
48       return 16;
49    return 8;
50 }
51 
52 static inline bool
test_bit(unsigned mask,unsigned bit)53 test_bit(unsigned mask, unsigned bit) {
54    return mask & (1u << bit);
55 }
56 
57 namespace {
58 
59 struct brw_cs_prog_data *
get_cs_prog_data(brw_simd_selection_state & state)60 get_cs_prog_data(brw_simd_selection_state &state)
61 {
62    if (std::holds_alternative<struct brw_cs_prog_data *>(state.prog_data))
63       return std::get<struct brw_cs_prog_data *>(state.prog_data);
64    else
65       return nullptr;
66 }
67 
68 struct brw_stage_prog_data *
get_prog_data(brw_simd_selection_state & state)69 get_prog_data(brw_simd_selection_state &state)
70 {
71    if (std::holds_alternative<struct brw_cs_prog_data *>(state.prog_data))
72       return &std::get<struct brw_cs_prog_data *>(state.prog_data)->base;
73    else if (std::holds_alternative<struct brw_bs_prog_data *>(state.prog_data))
74       return &std::get<struct brw_bs_prog_data *>(state.prog_data)->base;
75    else
76       return nullptr;
77 }
78 
79 }
80 
81 bool
brw_simd_should_compile(brw_simd_selection_state & state,unsigned simd)82 brw_simd_should_compile(brw_simd_selection_state &state, unsigned simd)
83 {
84    assert(simd < SIMD_COUNT);
85    assert(!state.compiled[simd]);
86 
87    const auto cs_prog_data = get_cs_prog_data(state);
88    const auto prog_data = get_prog_data(state);
89    const unsigned width = 8u << simd;
90 
91    /* For shaders with variable size workgroup, in most cases we can compile
92     * all the variants (exceptions are bindless dispatch & ray queries), since
93     * the choice will happen only at dispatch time.
94     */
95    const bool workgroup_size_variable = cs_prog_data && cs_prog_data->local_size[0] == 0;
96 
97    if (!workgroup_size_variable) {
98       if (state.spilled[simd]) {
99          state.error[simd] = "Would spill";
100          return false;
101       }
102 
103       if (state.required_width && state.required_width != width) {
104          state.error[simd] = "Different than required dispatch width";
105          return false;
106       }
107 
108       if (cs_prog_data) {
109          const unsigned workgroup_size = cs_prog_data->local_size[0] *
110                                          cs_prog_data->local_size[1] *
111                                          cs_prog_data->local_size[2];
112 
113          unsigned max_threads = state.devinfo->max_cs_workgroup_threads;
114 
115          const unsigned min_simd = state.devinfo->ver >= 20 ? 1 : 0;
116          if (simd > min_simd && state.compiled[simd - 1] &&
117             workgroup_size <= (width / 2)) {
118             state.error[simd] = "Workgroup size already fits in smaller SIMD";
119             return false;
120          }
121 
122          if (DIV_ROUND_UP(workgroup_size, width) > max_threads) {
123             state.error[simd] = "Would need more than max_threads to fit all invocations";
124             return false;
125          }
126       }
127 
128       /* The SIMD32 is only enabled for cases it is needed unless forced.
129        *
130        * TODO: Use performance_analysis and drop this rule.
131        */
132       if (width == 32 && state.devinfo->ver < 20) {
133          if (!INTEL_DEBUG(DEBUG_DO32) && (state.compiled[0] || state.compiled[1])) {
134             state.error[simd] = "SIMD32 not required (use INTEL_DEBUG=do32 to force)";
135             return false;
136          }
137       }
138    }
139 
140    if (width == 8 && state.devinfo->ver >= 20) {
141       state.error[simd] = "SIMD8 not supported on Xe2+";
142       return false;
143    }
144 
145    if (width == 32 && cs_prog_data && cs_prog_data->base.ray_queries > 0) {
146       state.error[simd] = "Ray queries not supported";
147       return false;
148    }
149 
150    if (width == 32 && cs_prog_data && cs_prog_data->uses_btd_stack_ids) {
151       state.error[simd] = "Bindless shader calls not supported";
152       return false;
153    }
154 
155    uint64_t start;
156    switch (prog_data->stage) {
157    case MESA_SHADER_COMPUTE:
158       start = DEBUG_CS_SIMD8;
159       break;
160    case MESA_SHADER_TASK:
161       start = DEBUG_TS_SIMD8;
162       break;
163    case MESA_SHADER_MESH:
164       start = DEBUG_MS_SIMD8;
165       break;
166    case MESA_SHADER_RAYGEN:
167    case MESA_SHADER_ANY_HIT:
168    case MESA_SHADER_CLOSEST_HIT:
169    case MESA_SHADER_MISS:
170    case MESA_SHADER_INTERSECTION:
171    case MESA_SHADER_CALLABLE:
172       start = DEBUG_RT_SIMD8;
173       break;
174    default:
175       unreachable("unknown shader stage in brw_simd_should_compile");
176    }
177 
178    const bool env_skip[] = {
179       (intel_simd & (start << 0)) == 0,
180       (intel_simd & (start << 1)) == 0,
181       (intel_simd & (start << 2)) == 0,
182    };
183 
184    static_assert(ARRAY_SIZE(env_skip) == SIMD_COUNT);
185 
186    if (unlikely(env_skip[simd])) {
187       state.error[simd] = "Disabled by INTEL_DEBUG environment variable";
188       return false;
189    }
190 
191    return true;
192 }
193 
194 void
brw_simd_mark_compiled(brw_simd_selection_state & state,unsigned simd,bool spilled)195 brw_simd_mark_compiled(brw_simd_selection_state &state, unsigned simd, bool spilled)
196 {
197    assert(simd < SIMD_COUNT);
198    assert(!state.compiled[simd]);
199 
200    auto cs_prog_data = get_cs_prog_data(state);
201 
202    state.compiled[simd] = true;
203    if (cs_prog_data)
204       cs_prog_data->prog_mask |= 1u << simd;
205 
206    /* If a SIMD spilled, all the larger ones would spill too. */
207    if (spilled) {
208       for (unsigned i = simd; i < SIMD_COUNT; i++) {
209          state.spilled[i] = true;
210          if (cs_prog_data)
211             cs_prog_data->prog_spilled |= 1u << i;
212       }
213    }
214 }
215 
216 int
brw_simd_select(const struct brw_simd_selection_state & state)217 brw_simd_select(const struct brw_simd_selection_state &state)
218 {
219    for (int i = SIMD_COUNT - 1; i >= 0; i--) {
220       if (state.compiled[i] && !state.spilled[i])
221          return i;
222    }
223    for (int i = SIMD_COUNT - 1; i >= 0; i--) {
224       if (state.compiled[i])
225          return i;
226    }
227    return -1;
228 }
229 
230 int
brw_simd_select_for_workgroup_size(const struct intel_device_info * devinfo,const struct brw_cs_prog_data * prog_data,const unsigned * sizes)231 brw_simd_select_for_workgroup_size(const struct intel_device_info *devinfo,
232                                    const struct brw_cs_prog_data *prog_data,
233                                    const unsigned *sizes)
234 {
235    if (!sizes || (prog_data->local_size[0] == sizes[0] &&
236                   prog_data->local_size[1] == sizes[1] &&
237                   prog_data->local_size[2] == sizes[2])) {
238       brw_simd_selection_state simd_state{
239          .prog_data = const_cast<struct brw_cs_prog_data *>(prog_data),
240       };
241 
242       /* Propagate the prog_data information back to the simd_state,
243        * so we can use select() directly.
244        */
245       for (int i = 0; i < SIMD_COUNT; i++) {
246          simd_state.compiled[i] = test_bit(prog_data->prog_mask, i);
247          simd_state.spilled[i] = test_bit(prog_data->prog_spilled, i);
248       }
249 
250       return brw_simd_select(simd_state);
251    }
252 
253    struct brw_cs_prog_data cloned = *prog_data;
254    for (unsigned i = 0; i < 3; i++)
255       cloned.local_size[i] = sizes[i];
256 
257    cloned.prog_mask = 0;
258    cloned.prog_spilled = 0;
259 
260    brw_simd_selection_state simd_state{
261       .devinfo = devinfo,
262       .prog_data = &cloned,
263    };
264 
265    for (unsigned simd = 0; simd < SIMD_COUNT; simd++) {
266       /* We are not recompiling, so use original results of prog_mask and
267        * prog_spilled as they will already contain all possible compilations.
268        */
269       if (brw_simd_should_compile(simd_state, simd) &&
270           test_bit(prog_data->prog_mask, simd)) {
271          brw_simd_mark_compiled(simd_state, simd, test_bit(prog_data->prog_spilled, simd));
272       }
273    }
274 
275    return brw_simd_select(simd_state);
276 }
277