//
// Copyright (C) 2009-2021 Intel Corporation
//
// SPDX-License-Identifier: MIT
//
//

// Metakernels that drive the morton-code sort: a single-workgroup path for
// small inputs, a multi-pass MSB radix path for larger inputs, and helpers
// that batch several small sorts into one indirect dispatch.

module msb_radix_bitonic_sort;

// Binds the kernel names used by the metakernels below to kernel functions
// in "morton_msb_radix_bitonic_sort.cl".
kernel_module msb_radix_sort ("morton_msb_radix_bitonic_sort.cl")
{
    links lsc_intrinsics;

    kernel opencl_debug_print < kernelFunction="debug_print_kernel">;
    kernel opencl_check_bls < kernelFunction="check_bls_sort">;

    kernel opencl_bottom_level_sort_single_wg < kernelFunction="sort_morton_codes_bottom_level_single_wg">;

    kernel opencl_build_morton_kernel_sort_msb_init < kernelFunction="sort_morton_codes_msb_begin">;

    kernel opencl_build_morton_kernel_sort_msb_scheduler < kernelFunction="scheduler">;

    kernel opencl_build_morton_kernel_sort_bottom_level < kernelFunction="sort_morton_codes_bottom_level">;

    kernel opencl_build_morton_kernel_sort_msb_count_items < kernelFunction="sort_morton_codes_msb_count_items">;
    kernel opencl_build_morton_kernel_sort_msb_bin_items < kernelFunction="sort_morton_codes_msb_bin_items">;

    kernel opencl_build_morton_kernel_sort_batched_bls_dispatch < kernelFunction="sort_morton_codes_batched_BLS_dispatch">;
}


// X dimension of the msb_init dispatch issued by the 'sort' metakernel.
const MSB_RADIX_NUM_VCONTEXTS = 8;
// 'sort' uses the single-workgroup kernel when the primitive count is below
// this value, and the full MSB radix path otherwise.
const BOTTOM_LEVEL_SORT_THRESHOLD = 512;

// Scheduler state shared between the 'sort' metakernel and the kernels it
// dispatches.  'sort' reads the first two dwords with a single qword load:
// the low dword is num_wgs_msb and the high dword is num_wgs_bls.
struct MSBRadixScheduler
{
    dword num_wgs_msb;
    dword num_wgs_bls;

    dword scheduler_postsync; // written by postsync stores, polled by semaphore_wait in 'sort'
    dword _pad1;
};

// Arguments of the 'sort' metakernel.
struct MSBRadixArgs
{
    qword p_scheduler;      // address of an MSBRadixScheduler (fields are addressed via offsetof)
    qword p_num_primitives; // address of a dword holding the primitive count
};




// One record written by add_bls_dispatch: two consecutive qwords.
struct BatchedBLSDispatchEntry
{
    qword p_data_buffer;
    qword num_elements; // number of elements in p_data_buffer
};




// Starts a new batch of bottom-level-sort dispatches: zeroes the workgroup
// counter in REG14 and points REG15 at the start of the entry storage.
// add_bls_dispatch and batched_bls_dispatch read the same register numbers.
// NOTE(review): this relies on REG14/REG15 keeping their values between
// metakernel invocations - confirm with the metakernel runtime.
metakernel add_bls_dispatch_init(qword p_storage)
{
    define REG_numWgs REG14;
    define REG_p_storage REG15;

    REG_numWgs = 0;
    REG_p_storage = p_storage;
}




// Appends one BatchedBLSDispatchEntry to the storage addressed by REG15 and
// increments the workgroup counter in REG14.  Does nothing when
// *p_num_primitives is less than 2.
//
// basically this code does:
// bls_args_for_dispatches[dispatchID] = { bls_new_pointer, numPrimitives };
// dispatchId++;
//
metakernel add_bls_dispatch(
    qword p_data,
    qword p_num_primitives
)
{
    define C_1 REG0;
    define C_8 REG1;

    define C_MIN_PRIMREFS REG2;

    define REG_p_data REG3;
    define REG_num_prims REG4;
    define REG_no_dispatch REG5;

    define REG_numWgs REG14;
    define REG_p_storage REG15;

    C_MIN_PRIMREFS = 2;

    // clear the whole register first so the upper half is zero, then load
    // the dword count into the lower half
    REG_num_prims = 0;
    REG_num_prims.lo = load_dword(p_num_primitives);

    REG_no_dispatch = REG_num_prims < C_MIN_PRIMREFS;

    // fewer than C_MIN_PRIMREFS primitives: no entry is recorded
    goto l_finish if(REG_no_dispatch.lo);

    C_1 = 1;
    C_8 = 8;

    // pseudocode: BatchedBLSDispatchEntry.p_data_buffer = p_data
    REG_p_data = p_data;
    store_qword( REG_p_storage, REG_p_data ); // store the data pointer

    REG_p_storage = REG_p_storage + C_8; // point to next member in BatchedBLSDispatchEntry struct

    // pseudocode: BatchedBLSDispatchEntry.num_elements = *p_num_primitives
    store_qword( REG_p_storage, REG_num_prims );

    REG_p_storage = REG_p_storage + C_8; // point to next BatchedBLSDispatchEntry instance

    // one more workgroup for the batched dispatch
    REG_numWgs = REG_numWgs + C_1;

l_finish:

}




// Launches the batched bottom-level-sort kernel with an indirect dispatch of
// (REG14, 1, 1) workgroups, i.e. the count accumulated by add_bls_dispatch.
// private_mem is forwarded to the kernel as its only argument.
metakernel batched_bls_dispatch(
    qword private_mem
)
{
    define REG_numWgs REG14;

    DISPATCHDIM_X = REG_numWgs;
    DISPATCHDIM_Y = 1;
    DISPATCHDIM_Z = 1;

    dispatch_indirect opencl_build_morton_kernel_sort_batched_bls_dispatch args(private_mem);
}




// Sorts with the single-workgroup kernel only.  Nothing is dispatched when
// *p_num_primitives is less than 2.  'input' is passed to the kernel for
// both its second and third argument.
metakernel sort_bottom_level(
    qword build_globals,
    qword input,
    qword p_num_primitives)
{
    define REG_num_prims REG0;
    define C_MIN_PRIMREFS REG1;
    define REG_no_dispatch REG2;

    REG_num_prims = load_dword( p_num_primitives );

    C_MIN_PRIMREFS = 2;

    REG_no_dispatch = REG_num_prims < C_MIN_PRIMREFS;

    // fewer than C_MIN_PRIMREFS primitives: skip the dispatch
    goto l_finish if(REG_no_dispatch.lo);

    dispatch opencl_bottom_level_sort_single_wg (1, 1, 1) args(build_globals, input, input);

l_finish:

}




// Full sort entry point.  Based on the primitive count loaded from
// sort_args.p_num_primitives:
//   count < 2                           : nothing is dispatched
//   count < BOTTOM_LEVEL_SORT_THRESHOLD : one single-workgroup sort
//   otherwise                           : MSB radix path (init, count_items,
//                                         bin_items, then a scheduler loop)
// Passes are ordered through the scheduler_postsync dword: each dispatch
// writes a value to it as a postsync operation and the metakernel waits on
// that value before issuing the next step.
metakernel sort(
    qword build_globals,
    qword input,
    qword tmp,
    MSBRadixArgs sort_args)
{
    define REG_num_prims REG0;
    {
        define C_MIN_PRIMREFS REG1;
        define C_MAX_PRIMREFS REG2;
        define REG_no_dispatch REG3;
        define REG_dispatch_single_wg REG4;

        REG_num_prims = load_dword( sort_args.p_num_primitives );
        C_MIN_PRIMREFS = 2;
        C_MAX_PRIMREFS = BOTTOM_LEVEL_SORT_THRESHOLD;

        REG_no_dispatch = REG_num_prims < C_MIN_PRIMREFS;
        REG_dispatch_single_wg = REG_num_prims < C_MAX_PRIMREFS;

        // select one of the three paths described above
        goto l_sort_finish if(REG_no_dispatch.lo);
        goto l_dispatch_single_wg if(REG_dispatch_single_wg.lo);
        goto l_full_sort;
    }

l_dispatch_single_wg:

    {
        // 'input' is passed for both the second and third kernel argument
        dispatch opencl_bottom_level_sort_single_wg (1, 1, 1) args(build_globals, input, input);
        goto l_sort_finish;
    }

l_full_sort:

    // addresses of the scheduler struct and of two of its fields
    define p_scheduler sort_args.p_scheduler;
    define p_scheduler_postsync (sort_args.p_scheduler + offsetof(MSBRadixScheduler.scheduler_postsync) );
    define p_num_wgs_bls (sort_args.p_scheduler + offsetof(MSBRadixScheduler.num_wgs_bls) );

    // REG3..REG6 are given new names here; REG3 and REG4 held the path
    // selection flags above, which are no longer needed on this path
    define REG_scheduler_postsync REG3;
    REG_scheduler_postsync = p_scheduler_postsync;

    define C_0 REG4;
    define C_8 REG5;
    define C_255 REG6;
    C_0 = 0;
    C_8 = 8;
    C_255 = 255;

    store_qword( REG_scheduler_postsync, C_0 ); // initialize scheduler semaphore

    // (num_prims + 255) >> 8, i.e. ceil(num_prims / 256); used as the X
    // dimension of the first count_items / bin_items indirect dispatches
    REG_num_prims = REG_num_prims + C_255;
    REG_num_prims = REG_num_prims >> C_8;

    DISPATCHDIM_X = REG_num_prims.lo;
    DISPATCHDIM_Y = 1;
    DISPATCHDIM_Z = 1;

    control( cs_store_fence ); // commit the semaphore write

    // initialize the whole execution
    dispatch opencl_build_morton_kernel_sort_msb_init (MSB_RADIX_NUM_VCONTEXTS, 1, 1) args(build_globals, sort_args.p_scheduler, input, tmp)
        postsync store_dword( p_scheduler_postsync, 1 );

    // wait on init kernel (its postsync writes 1)
    semaphore_wait while( *p_scheduler_postsync != 1 );

    dispatch_indirect opencl_build_morton_kernel_sort_msb_count_items args(sort_args.p_scheduler)
        postsync store_dword( p_scheduler_postsync, 2 );

    // wait on count_items kernel
    semaphore_wait while( *p_scheduler_postsync != 2 );

    dispatch_indirect opencl_build_morton_kernel_sort_msb_bin_items args(sort_args.p_scheduler, input)
        postsync store_dword( p_scheduler_postsync, 0 );

    // REG4 (C_0 above) is reused.  Despite the name, this mask keeps the LOW
    // dword of a qword, which is num_wgs_msb in the scheduler counts.
    define C_MASK_HI REG4;
    C_MASK_HI = 0x00000000ffffffff;

    l_build_loop:
    {
        // wait on the preceding bin_items kernel (its postsync writes 0)
        semaphore_wait while( *p_scheduler_postsync != 0 );
        {
            dispatch opencl_build_morton_kernel_sort_msb_scheduler(1,1,1) args( sort_args.p_scheduler, input, tmp )
                postsync store_dword( p_scheduler_postsync, 1 );

            // wait on scheduler kernel
            semaphore_wait while( *p_scheduler_postsync != 1 );
        }

        // load and process the scheduler results
        // REG0 (REG_num_prims above) is reused: one qword load brings in
        // num_wgs_msb (low half) and num_wgs_bls (high half)
        define REG_wg_counts REG0;
        define REG_num_msb_wgs REG0.lo;
        define REG_num_bls_wgs REG0.hi;
        define REG_p_scheduler REG1;
        define REG_no_msb_wgs REG2;
        {
            REG_p_scheduler = p_scheduler;
            REG_wg_counts = load_qword( REG_p_scheduler );

            // true when the low dword (num_wgs_msb) is zero
            REG_no_msb_wgs = REG_wg_counts & C_MASK_HI;
            REG_no_msb_wgs = REG_no_msb_wgs == 0;
        }

        // dispatch new bls WGs
        // (DISPATCHDIM_Y / DISPATCHDIM_Z are not written again after being
        // set to 1 before the loop)
        DISPATCHDIM_X = REG_num_bls_wgs;
        dispatch_indirect opencl_build_morton_kernel_sort_bottom_level args( p_scheduler, input );

        // jump out if there are no msb WGs
        goto l_sort_finish if (REG_no_msb_wgs);

        DISPATCHDIM_X = REG_num_msb_wgs;
        dispatch_indirect opencl_build_morton_kernel_sort_msb_count_items args(sort_args.p_scheduler)
            postsync store_dword( p_scheduler_postsync, 2 );

        // wait on count_items kernel
        semaphore_wait while( *p_scheduler_postsync != 2 );

        dispatch_indirect opencl_build_morton_kernel_sort_msb_bin_items args(sort_args.p_scheduler, input)
            postsync store_dword( p_scheduler_postsync, 0 );

        // wait till all BLS finished launching
        // NOTE(review): nothing in this file writes num_wgs_bls; presumably
        // the dispatched kernels reset it to 0 - confirm in the .cl source
        semaphore_wait while( *p_num_wgs_bls != 0 );

        goto l_build_loop;
    }

l_sort_finish:

}