xref: /aosp_15_r20/external/mesa3d/src/intel/vulkan/grl/gpu/msb_radix_bitonic_sort.grl (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1//
2// Copyright (C) 2009-2021 Intel Corporation
3//
4// SPDX-License-Identifier: MIT
5//
6//
7
// Module name under which the structs, constants and metakernels of this file are declared.
8module msb_radix_bitonic_sort;
9
// Binds the kernel names used by the metakernels in this file to their
// kernelFunction entry points in "morton_msb_radix_bitonic_sort.cl".
10kernel_module msb_radix_sort ("morton_msb_radix_bitonic_sort.cl")
11{
12    links lsc_intrinsics;
13
// The two kernels below are declared but not dispatched by any metakernel visible in this file.
14    kernel opencl_debug_print                                    < kernelFunction="debug_print_kernel">;
15    kernel opencl_check_bls                                      < kernelFunction="check_bls_sort">;
16
// Dispatched as a single (1,1,1) workgroup by `sort_bottom_level` and by the small-input path of `sort`.
17    kernel opencl_bottom_level_sort_single_wg                    < kernelFunction="sort_morton_codes_bottom_level_single_wg">;
18
// First dispatch of the full path in `sort`.
19    kernel opencl_build_morton_kernel_sort_msb_init              < kernelFunction="sort_morton_codes_msb_begin">;
20
// Dispatched once per iteration of the build loop in `sort`; the loop reads workgroup counts from the scheduler struct afterwards.
21    kernel opencl_build_morton_kernel_sort_msb_scheduler         < kernelFunction="scheduler">;
22
// Dispatched indirectly by the build loop in `sort` with X = num_wgs_bls.
23    kernel opencl_build_morton_kernel_sort_bottom_level          < kernelFunction="sort_morton_codes_bottom_level">;
24
// Count/bin pair: `sort` always dispatches count_items first, waits on its postsync, then dispatches bin_items.
25    kernel opencl_build_morton_kernel_sort_msb_count_items       < kernelFunction="sort_morton_codes_msb_count_items">;
26    kernel opencl_build_morton_kernel_sort_msb_bin_items         < kernelFunction="sort_morton_codes_msb_bin_items">;
27
// Dispatched indirectly by `batched_bls_dispatch`.
28    kernel opencl_build_morton_kernel_sort_batched_bls_dispatch  < kernelFunction="sort_morton_codes_batched_BLS_dispatch">;
29}
30
31
// X dimension of the opencl_build_morton_kernel_sort_msb_init dispatch issued by `sort`.
32const MSB_RADIX_NUM_VCONTEXTS  = 8;
// `sort` takes the single-workgroup path when the primitive count is below this value.
33const BOTTOM_LEVEL_SORT_THRESHOLD  = 512;
34
// Scheduler state addressed through MSBRadixArgs.p_scheduler.
// `sort` reads the first two dwords with a single load_qword and uses
// scheduler_postsync as the semaphore between command streamer and kernels.
35struct MSBRadixScheduler
36{
37    dword num_wgs_msb; // low dword of the qword loaded by `sort`; X dimension of the next count_items/bin_items dispatches
38    dword num_wgs_bls; // high dword of that qword; X dimension of the bottom_level dispatch; `sort` also polls it until it reads 0
39
40    dword scheduler_postsync; // written by postsync store_dword (values 0, 1, 2) and polled by semaphore_wait in `sort`
41    dword _pad1; // also zeroed by the store_qword `sort` issues at scheduler_postsync (assuming 4-byte dwords)
42};
43
// Argument bundle passed to the `sort` metakernel.
44struct MSBRadixArgs
45{
46    qword p_scheduler; // address of an MSBRadixScheduler; `sort` adds offsetof(MSBRadixScheduler.*) to it
47    qword p_num_primitives; // address from which `sort` loads the primitive count as a dword
48};
49
50
51
52
// One 16-byte record appended by `add_bls_dispatch`: two consecutive qword stores,
// first the data pointer, then the element count.
53struct BatchedBLSDispatchEntry
54{
55    qword p_data_buffer; // receives the p_data argument of add_bls_dispatch
56    qword num_elements; // number of elements in p_data_buffer
57};
58
59
60
61
// Resets the batched bottom-level-sort (BLS) bookkeeping registers:
//   REG14 (REG_numWgs)    = 0          -- number of entries recorded so far
//   REG15 (REG_p_storage) = p_storage  -- address where the next entry will be written
// `add_bls_dispatch` and `batched_bls_dispatch` alias the same REG14/REG15.
// NOTE(review): this relies on REG14/REG15 keeping their values between separate
// metakernel invocations -- confirm against the command-stream generator.
//
// p_storage: address of the first BatchedBLSDispatchEntry slot.
62metakernel add_bls_dispatch_init(qword p_storage)
63{
64    define REG_numWgs         REG14;
65    define REG_p_storage      REG15;
66
67    REG_numWgs = 0;
68    REG_p_storage = p_storage;
69}
70
71
72
73
74// basically this code does:
75// bls_args_for_dispatches[dispatchID] = { bls_new_pointer, numPrimitives };
76// dispatchId++;
77//
// Appends one { p_data, *p_num_primitives } record at REG_p_storage (REG15),
// advances REG_p_storage by 16 bytes and increments REG_numWgs (REG14).
// Does nothing when the loaded primitive count is below 2.
//
// p_data:            value stored as the record's first qword.
// p_num_primitives:  address from which the element count is loaded as a dword.
//
// REG14/REG15 are expected to have been set up by add_bls_dispatch_init.
78metakernel add_bls_dispatch(
79    qword p_data,
80    qword p_num_primitives
81)
82{
83    define C_1                                REG0;
84    define C_8                                REG1;
85
86    define C_MIN_PRIMREFS                     REG2;
87
88    define REG_p_data                         REG3;
89    define REG_num_prims                      REG4;
90    define REG_no_dispatch                    REG5;
91
92    define REG_numWgs                         REG14;
93    define REG_p_storage                      REG15;
94
95    C_MIN_PRIMREFS = 2;
96
// Clear the whole register first so the upper dword is zero; the full register
// is later compared and stored as a qword.
97    REG_num_prims = 0;
98    REG_num_prims.lo = load_dword(p_num_primitives);
99
100    REG_no_dispatch  = REG_num_prims < C_MIN_PRIMREFS;
101
// Fewer than 2 elements: record nothing and leave REG14/REG15 untouched.
102    goto l_finish if(REG_no_dispatch.lo);
103
104    C_1 = 1;
105    C_8 = 8;
106
107    // pseudocode: BatchedBLSDispatchEntry.p_data_buffer = p_data
108    REG_p_data = p_data;
109    store_qword( REG_p_storage, REG_p_data ); // store the data pointer
110
111    REG_p_storage = REG_p_storage + C_8; // point to next member in BatchedBLSDispatchEntry struct
112
113    // pseudocode: BatchedBLSDispatchEntry.num_elements = *p_num_primitives
114    store_qword( REG_p_storage, REG_num_prims );
115
116    REG_p_storage = REG_p_storage + C_8; // point to next BatchedBLSDispatchEntry instance
117
// One more recorded entry; batched_bls_dispatch uses this count as DISPATCHDIM_X.
118    REG_numWgs = REG_numWgs + C_1;
119
120l_finish:
121
122}
123
124
125
126
// Issues one indirect dispatch of opencl_build_morton_kernel_sort_batched_bls_dispatch
// with dimensions (REG14, 1, 1), i.e. one workgroup per entry counted by add_bls_dispatch.
//
// private_mem: forwarded unchanged as the kernel's only argument.
// NOTE(review): presumably the same address that was given to add_bls_dispatch_init
// as p_storage -- confirm against the caller.
127metakernel batched_bls_dispatch(
128    qword private_mem
129)
130{
131    define REG_numWgs REG14;
132
133    DISPATCHDIM_X = REG_numWgs;
134    DISPATCHDIM_Y = 1;
135    DISPATCHDIM_Z = 1;
136
137    dispatch_indirect opencl_build_morton_kernel_sort_batched_bls_dispatch args(private_mem);
138}
139
140
141
142
// Dispatches the single-workgroup bottom-level sort once, unless the primitive
// count loaded from p_num_primitives is below 2. Unlike `sort`, no upper bound
// on the count is checked here.
//
// build_globals:     forwarded as the kernel's first argument.
// input:             forwarded as both the second and third kernel argument.
//                    NOTE(review): presumably source and destination are the same
//                    buffer (in-place sort) -- confirm against the kernel signature.
// p_num_primitives:  address from which the primitive count is loaded as a dword.
143metakernel sort_bottom_level(
144    qword build_globals,
145    qword input,
146    qword p_num_primitives)
147{
148    define REG_num_prims       REG0;
149    define C_MIN_PRIMREFS      REG1;
150    define REG_no_dispatch     REG2;
151
// NOTE(review): add_bls_dispatch zeroes the register and loads into .lo; here the
// load is assigned to the whole register -- confirm the upper dword ends up zero
// before the 64-bit compare below.
152    REG_num_prims  = load_dword( p_num_primitives );
153
154    C_MIN_PRIMREFS = 2;
155
156    REG_no_dispatch  = REG_num_prims < C_MIN_PRIMREFS;
157
// Fewer than 2 primitives: skip the dispatch.
158    goto l_finish if(REG_no_dispatch.lo);
159
160    dispatch opencl_bottom_level_sort_single_wg (1, 1, 1) args(build_globals, input, input);
161
162l_finish:
163
164}
165
166
167
168
// Top-level sort driver. Based on the primitive count N loaded from
// sort_args.p_num_primitives it takes one of three paths:
//   N < 2                            -> nothing is dispatched
//   N < BOTTOM_LEVEL_SORT_THRESHOLD  -> one single-workgroup bottom-level sort
//   otherwise                        -> msb_init, count_items, bin_items, then a loop
//                                       of scheduler / bottom_level / count_items /
//                                       bin_items that ends when the scheduler
//                                       reports zero MSB workgroups
// The dword at MSBRadixScheduler.scheduler_postsync is the semaphore: kernels write
// it through postsync store_dword, and the command stream polls it with semaphore_wait.
//
// build_globals:  forwarded to the single-workgroup kernel and to msb_init.
// input, tmp:     buffers forwarded to the kernels (see the individual dispatches).
// sort_args:      scheduler address and primitive-count address (MSBRadixArgs).
169metakernel sort(
170    qword build_globals,
171    qword input,
172    qword tmp,
173    MSBRadixArgs sort_args)
174{
175    define REG_num_prims       REG0;
176    {
177        define C_MIN_PRIMREFS           REG1;
178        define C_MAX_PRIMREFS           REG2;
179        define REG_no_dispatch          REG3;
180        define REG_dispatch_single_wg   REG4;
181
// NOTE(review): the load is assigned to the whole 64-bit register (add_bls_dispatch
// zeroes first and loads into .lo) -- confirm the upper dword is zero.
182        REG_num_prims  = load_dword( sort_args.p_num_primitives );
183        C_MIN_PRIMREFS = 2;
184        C_MAX_PRIMREFS = BOTTOM_LEVEL_SORT_THRESHOLD;
185
186        REG_no_dispatch  = REG_num_prims < C_MIN_PRIMREFS;
187        REG_dispatch_single_wg = REG_num_prims < C_MAX_PRIMREFS;
188
// Order matters: the "< 2" test is taken before the "< threshold" test.
189        goto l_sort_finish if(REG_no_dispatch.lo);
190        goto l_dispatch_single_wg if(REG_dispatch_single_wg.lo);
191        goto l_full_sort;
192    }
193
194l_dispatch_single_wg:
195
196    {
// `input` is passed as both the second and third kernel argument.
197        dispatch opencl_bottom_level_sort_single_wg (1, 1, 1) args(build_globals, input, input);
198        goto l_sort_finish;
199    }
200
201l_full_sort:
202
// Address expressions for the scheduler struct and two of its fields.
203    define p_scheduler                  sort_args.p_scheduler;
204    define p_scheduler_postsync        (sort_args.p_scheduler + offsetof(MSBRadixScheduler.scheduler_postsync) );
205    define p_num_wgs_bls               (sort_args.p_scheduler + offsetof(MSBRadixScheduler.num_wgs_bls) );
206
// REG3 and REG4 were used for the path-selection flags above; those flags have
// already been consumed by the gotos, so the registers are reused here.
207    define REG_scheduler_postsync    REG3;
208    REG_scheduler_postsync = p_scheduler_postsync;
209
210    define C_0    REG4;
211    define C_8    REG5;
212    define C_255  REG6;
213    C_0 = 0;
214    C_8 = 8;
215    C_255 = 255;
216
// A qword store: zeroes scheduler_postsync and the 4 bytes after it (the _pad1 field).
217    store_qword( REG_scheduler_postsync, C_0 ); // initialize scheduler semaphore
218
// (N + 255) >> 8, i.e. ceil(N / 256).
219    REG_num_prims = REG_num_prims + C_255;
220    REG_num_prims = REG_num_prims >> C_8;
221
// These DISPATCHDIM values are the last ones written before the first
// count_items / bin_items dispatch_indirect pair below.
222    DISPATCHDIM_X = REG_num_prims.lo;
223    DISPATCHDIM_Y = 1;
224    DISPATCHDIM_Z = 1;
225
226    control( cs_store_fence ); // commit the semaphore write
227
228    // initialize the whole execution
229    dispatch opencl_build_morton_kernel_sort_msb_init (MSB_RADIX_NUM_VCONTEXTS, 1, 1) args(build_globals, sort_args.p_scheduler, input, tmp)
230        postsync store_dword( p_scheduler_postsync, 1 );
231
232    // wait on msb_init kernel (its postsync stores 1)
233    semaphore_wait while( *p_scheduler_postsync != 1 );
234
235    dispatch_indirect opencl_build_morton_kernel_sort_msb_count_items args(sort_args.p_scheduler)
236        postsync store_dword( p_scheduler_postsync, 2 );
237
238    // wait on count_items kernel
239    semaphore_wait while( *p_scheduler_postsync != 2 );
240
// The postsync value 0 written here is what the first semaphore_wait of the build loop waits for.
241    dispatch_indirect opencl_build_morton_kernel_sort_msb_bin_items args(sort_args.p_scheduler, input)
242        postsync store_dword( p_scheduler_postsync, 0 );
243
// Reuses REG4 (previously C_0). Despite the name, this mask keeps the LOW dword
// of a qword, which is where num_wgs_msb lives in the value loaded below.
244    define C_MASK_HI REG4;
245    C_MASK_HI = 0x00000000ffffffff;
246
247    l_build_loop:
248    {
// wait on the previous bin_items dispatch (its postsync stores 0)
249        semaphore_wait while( *p_scheduler_postsync != 0 );
250        {
251            dispatch opencl_build_morton_kernel_sort_msb_scheduler(1,1,1) args( sort_args.p_scheduler, input, tmp )
252                postsync store_dword( p_scheduler_postsync, 1 );
253
254            // wait on scheduler kernel
255            semaphore_wait while( *p_scheduler_postsync != 1 );
256        }
257
258        // load and process the scheduler results
// REG0 previously held REG_num_prims, which is no longer needed at this point.
// The qword at p_scheduler covers the first two dword fields of MSBRadixScheduler:
// .lo is used as num_wgs_msb and .hi as num_wgs_bls.
259        define REG_wg_counts    REG0;
260        define REG_num_msb_wgs  REG0.lo;
261        define REG_num_bls_wgs  REG0.hi;
262        define REG_p_scheduler  REG1;
263        define REG_no_msb_wgs   REG2;
264        {
265            REG_p_scheduler = p_scheduler;
266            REG_wg_counts    = load_qword( REG_p_scheduler );
267
// REG_no_msb_wgs = (num_wgs_msb == 0)
268            REG_no_msb_wgs = REG_wg_counts  & C_MASK_HI;
269            REG_no_msb_wgs = REG_no_msb_wgs == 0;
270        }
271
272        // dispatch new bls WGs
// No postsync on this dispatch; the loop waits on *p_num_wgs_bls further down instead.
273        DISPATCHDIM_X = REG_num_bls_wgs;
274        dispatch_indirect opencl_build_morton_kernel_sort_bottom_level args( p_scheduler, input );
275
276        // jump out if there are no msb WGs
// (tests the whole register here, whereas the other conditional gotos in this file test .lo)
277        goto l_sort_finish if (REG_no_msb_wgs);
278
279        DISPATCHDIM_X = REG_num_msb_wgs;
280        dispatch_indirect opencl_build_morton_kernel_sort_msb_count_items args(sort_args.p_scheduler)
281            postsync store_dword( p_scheduler_postsync, 2 );
282
283        // wait on count_items kernel
284        semaphore_wait while( *p_scheduler_postsync != 2 );
285
// Postsync value 0 releases the semaphore_wait at the top of the next loop iteration.
286        dispatch_indirect opencl_build_morton_kernel_sort_msb_bin_items args(sort_args.p_scheduler, input)
287            postsync store_dword( p_scheduler_postsync, 0 );
288
289        // wait till all BLS finished launching
// NOTE(review): this polls num_wgs_bls until it reads 0; presumably a kernel clears
// it -- which one is not visible in this file.
290        semaphore_wait while( *p_num_wgs_bls != 0 );
291
292        goto l_build_loop;
293    }
294
295l_sort_finish:
296
297}
298