/*
 * Copyright © 2020 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "anv_private.h"

#include <math.h>

#include "util/u_debug.h"
#include "util/half_float.h"
#include "util/u_atomic.h"

#include "genxml/gen_macros.h"
#include "genxml/genX_pack.h"
#include "genxml/genX_rt_pack.h"

#include "ds/intel_tracepoints.h"

#if GFX_VERx10 >= 125
#include "grl/grl_structs.h"

/* Wait for the previous dispatches to finish and flush their data port
 * writes.
 */
#define ANV_GRL_FLUSH_FLAGS (ANV_PIPE_END_OF_PIPE_SYNC_BIT | \
                             ANV_PIPE_DATA_CACHE_FLUSH_BIT | \
                             ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT)

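/* The Vulkan build info provides geometries either as a flat array
 * (pGeometries) or as an array of pointers (ppGeometries); per the spec only
 * one of the two is non-NULL, so pick whichever the application supplied.
 */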
static const VkAccelerationStructureGeometryKHR *
get_geometry(const VkAccelerationStructureBuildGeometryInfoKHR *pInfo,
             uint32_t index)
{
   return pInfo->pGeometries ? &pInfo->pGeometries[index] :
                               pInfo->ppGeometries[index];
}

static size_t align_transient_size(size_t bytes)
{
   return align_uintptr(bytes, 64);
}

static size_t align_private_size(size_t bytes)
{
   return align_uintptr(bytes, 64);
}

static size_t get_scheduler_size(size_t num_builds)
{
    size_t scheduler_size = sizeof(union SchedulerUnion);
    /* add more memory for qnode creation stage if needed */
    if (num_builds > QNODE_GLOBAL_ROOT_BUFFER_MIN_ENTRIES_NUM) {
        scheduler_size += (num_builds - QNODE_GLOBAL_ROOT_BUFFER_MIN_ENTRIES_NUM) * 2 *
           sizeof(struct QNodeGlobalRootBufferEntry);
    }

    return align_private_size(scheduler_size);
}

static size_t
get_batched_binnedsah_transient_mem_size(size_t num_builds)
{
   if (num_builds == 0)
      return 0;
   return num_builds * (sizeof(struct SAHBuildBuffersInfo) + sizeof(gpuva_t));
}

static size_t
get_batched_binnedsah_private_mem_size(size_t num_builds)
{
   if (num_builds == 0)
      return 0;

   size_t globals_size = align_private_size(num_builds * sizeof(struct SAHBuildGlobals));
   return globals_size + get_scheduler_size(num_builds);
}

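/* Rough upper bound on the number of 6-wide QBVH6 internal nodes needed for
 * N leaves.  The lowest level is assumed to hold only 2 leaves per node and
 * each upper level is conservatively sized from it; the last term
 * overestimates whatever levels remain above that.
 */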
static uint32_t
estimate_qbvh6_nodes(const uint32_t N)
{
   const uint32_t W = 6;
   const uint32_t N0 = N / 2 + N % 2; // lowest level with 2 leaves per QBVH6 node
   const uint32_t N1 = N0 / W + (N0 % W ? 1 : 0); // filled level
   const uint32_t N2 = N0 / W + (N1 % W ? 1 : 0); // filled level
   const uint32_t N3 = N0 / W + (N2 % W ? 1 : 0); // filled level
   const uint32_t N4 = N3; // overestimate remaining nodes
   return N0 + N1 + N2 + N3 + N4;
}

/* Estimates the worst-case number of QBVH6 nodes for a top-down BVH build
 * that is guaranteed to produce subtrees with N >= K primitives, from each
 * of which a single QBVH6 node is created.
 */
static uint32_t
estimate_qbvh6_nodes_minK(const uint32_t N, uint32_t K)
{
    const uint32_t N0 = N / K + (N % K ? 1 : 0); // lowest level of nodes with K leaves minimally
    return N0 + estimate_qbvh6_nodes(N0);
}

static size_t
estimate_qbvh6_fatleafs(const size_t P)
{
   return P;
}

static size_t
estimate_qbvh6_nodes_worstcase(const size_t P)
{
   const size_t F = estimate_qbvh6_fatleafs(P);

   // worst-case each inner node having 5 fat-leaf children.
   //  number of inner nodes is F/5 and number of fat-leaves is F
   return F + ceil(F/5.0);
}

#define sizeof_PrimRef      32
#define sizeof_HwInstanceLeaf (GENX(RT_BVH_INSTANCE_LEAF_length) * 4)
#define sizeof_InternalNode   (GENX(RT_BVH_INTERNAL_NODE_length) * 4)
#define sizeof_Procedural     (GENX(RT_BVH_PROCEDURAL_LEAF_length) * 4)
#define sizeof_Quad           (GENX(RT_BVH_QUAD_LEAF_length) * 4)

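/* Compute a worst-case size estimate and internal layout for the destination
 * acceleration structure.  The buffer is laid out, in order and 64-byte
 * aligned: BVHBase header, internal (QBVH6) nodes, leaf data (instance
 * leaves for a TLAS, quad then procedural leaves for a BLAS), the
 * InstanceDesc array, the GeoMetaData array and, if enabled, back pointers.
 */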
static struct MKSizeEstimate
get_gpu_size_estimate(const VkAccelerationStructureBuildGeometryInfoKHR *pInfo,
                      const VkAccelerationStructureBuildRangeInfoKHR *pBuildRangeInfos,
                      const uint32_t *pMaxPrimitiveCounts)
{
   uint32_t num_triangles = 0, num_aabbs = 0, num_instances = 0;
   for (unsigned g = 0; g < pInfo->geometryCount; g++) {
      const VkAccelerationStructureGeometryKHR *pGeometry =
         get_geometry(pInfo, g);
      uint32_t prim_count = pBuildRangeInfos != NULL ?
         pBuildRangeInfos[g].primitiveCount : pMaxPrimitiveCounts[g];

      switch (pGeometry->geometryType) {
      case VK_GEOMETRY_TYPE_TRIANGLES_KHR:
         num_triangles += prim_count;
         break;
      case VK_GEOMETRY_TYPE_AABBS_KHR:
         num_aabbs += prim_count;
         break;
      case VK_GEOMETRY_TYPE_INSTANCES_KHR:
         num_instances += prim_count;
         break;
      default:
         unreachable("Unsupported geometry type");
      }
   }
   const uint32_t num_primitives = num_triangles + num_aabbs + num_instances;

   struct MKSizeEstimate est = {};

   uint64_t size = sizeof(BVHBase);
   size = align64(size, 64);

   /* Must immediately follow BVHBase because we use fixed offset to nodes. */
   est.node_data_start = size;

   switch (pInfo->type) {
   case VK_ACCELERATION_STRUCTURE_TYPE_TOP_LEVEL_KHR: {
      assert(num_triangles == 0 && num_aabbs == 0);

      est.numPrimitives = num_instances;
      est.numPrimitivesToSplit = 0;
      est.numBuildPrimitives = est.numPrimitives + est.numPrimitivesToSplit;

      est.min_primitives = est.numPrimitives;
      est.max_primitives = est.numPrimitives + est.numPrimitivesToSplit;

      unsigned int sizeInnerNodes =
         (unsigned int) estimate_qbvh6_nodes_worstcase(est.numBuildPrimitives) *
         sizeof_InternalNode;
      if (sizeInnerNodes == 0)
         sizeInnerNodes = sizeof_InternalNode;

      est.max_inner_nodes = sizeInnerNodes / sizeof_InternalNode;

      size += sizeInnerNodes;
      STATIC_ASSERT(sizeof_InternalNode % 64 == 0);

      est.leaf_data_start = size;
      size += est.numBuildPrimitives * sizeof_HwInstanceLeaf;
      STATIC_ASSERT(sizeof_HwInstanceLeaf % 64 == 0);

      est.leaf_data_size = est.numBuildPrimitives * sizeof_HwInstanceLeaf;

      break;
   }

   case VK_ACCELERATION_STRUCTURE_TYPE_BOTTOM_LEVEL_KHR: {
      assert(num_instances == 0);

      /* RT: TODO */
      const float split_factor = 0.0f;
      uint32_t num_prims_to_split = 0;
      if (false)
         num_prims_to_split = num_triangles + (double)split_factor;

      const uint32_t num_build_triangles = num_triangles + num_prims_to_split;
      const uint32_t num_build_primitives = num_build_triangles + num_aabbs;

      est.numPrimitives = num_primitives;
      est.numTriangles = num_triangles;
      est.numProcedurals = num_aabbs;
      est.numMeshes = pInfo->geometryCount;
      est.numBuildPrimitives = num_build_primitives;
      est.numPrimitivesToSplit = num_prims_to_split;
      est.max_instance_leafs = 0;

      est.min_primitives = (size_t)(num_build_triangles * 0.5f + num_aabbs);
      est.max_primitives = num_build_triangles + num_aabbs;

      size_t nodeBytes = 0;
      nodeBytes += estimate_qbvh6_nodes_worstcase(num_build_triangles) * sizeof_InternalNode;
      nodeBytes += estimate_qbvh6_nodes_worstcase(num_aabbs) * sizeof_InternalNode;
      if (nodeBytes == 0) // for case with 0 primitives
         nodeBytes = sizeof_InternalNode;
      nodeBytes = MAX2(nodeBytes, 8 * (size_t)num_build_primitives); // for primref_index0/1 buffers

      est.max_inner_nodes = nodeBytes / sizeof_InternalNode;

      size += nodeBytes;
      STATIC_ASSERT(sizeof_InternalNode % 64 == 0);

      est.leaf_data_start = size;
      size += num_build_triangles * sizeof_Quad;
      STATIC_ASSERT(sizeof_Quad % 64 == 0);

      est.procedural_data_start = size;
      size += num_aabbs * sizeof_Procedural;
      STATIC_ASSERT(sizeof_Procedural % 64 == 0);

      est.leaf_data_size = num_build_triangles * sizeof_Quad +
                           num_aabbs * sizeof_Procedural;

      if (num_build_primitives == 0)
         size += MAX2(sizeof_Quad, sizeof_Procedural);
      break;
   }

   default:
      unreachable("Unsupported acceleration structure type");
   }

   size = align64(size, 64);
   est.instance_descs_start = size;
   size += sizeof(struct InstanceDesc) * num_instances;

   est.geo_meta_data_start = size;
   size += sizeof(struct GeoMetaData) * pInfo->geometryCount;
   size = align64(size, 64);

   assert(size == align64(size, 64));
   est.back_pointer_start = size;

   const bool alloc_backpointers = false; /* RT TODO */
   if (alloc_backpointers) {
      size += est.max_inner_nodes * sizeof(uint32_t);
      size = align64(size, 64);
   }

   assert(size < UINT32_MAX);
   est.sizeTotal = align64(size, 64);

   return est;
}

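/* Layout of the caller-provided build scratch buffer: Globals, then the
 * PrimRef array, then the per-primitive leaf index buffers; the NEW_SAH
 * builder additionally carves out a temporary BVH2 buffer and a qnode
 * buffer at the end.
 */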
struct scratch_layout {
   gpuva_t base;
   uint32_t total_size;

   gpuva_t primrefs;
   gpuva_t globals;
   gpuva_t leaf_index_buffers;
   uint32_t leaf_index_buffer_stride;

   /* new_sah */
   gpuva_t qnode_buffer;
   gpuva_t bvh2_buffer;
};

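/* Size of the temporary binary BVH2 used by the NEW_SAH builder: a binary
 * tree over num_primitives leaves has at most (2 * N - 1) nodes.
 */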
static size_t
get_bvh2_size(uint32_t num_primitives)
{
   if (num_primitives == 0)
      return 0;
   return sizeof(struct BVH2) +
      (2 * num_primitives - 1) * sizeof(struct BVH2Node);
}

static struct scratch_layout
get_gpu_scratch_layout(struct anv_address base,
                       struct MKSizeEstimate est,
                       enum anv_rt_bvh_build_method build_method)
{
   struct scratch_layout scratch = {
      .base = anv_address_physical(base),
   };
   gpuva_t current = anv_address_physical(base);

   scratch.globals = current;
   current += sizeof(struct Globals);

   scratch.primrefs = intel_canonical_address(current);
   current += est.numBuildPrimitives * sizeof_PrimRef;

   scratch.leaf_index_buffers = intel_canonical_address(current);
   current += est.numBuildPrimitives * sizeof(uint32_t) * 2;
   scratch.leaf_index_buffer_stride = sizeof(uint32_t);

   switch (build_method) {
   case ANV_BVH_BUILD_METHOD_TRIVIAL:
      break;

   case ANV_BVH_BUILD_METHOD_NEW_SAH: {
      size_t bvh2_size = get_bvh2_size(est.numBuildPrimitives);
      if (est.leaf_data_size < bvh2_size) {
         scratch.bvh2_buffer = intel_canonical_address(current);
         current += bvh2_size;
      }

      scratch.qnode_buffer = intel_canonical_address(current);
      current += 2 * sizeof(dword) * est.max_inner_nodes;
      break;
   }

   default:
      unreachable("invalid build");
   }

   assert((current - scratch.base) < UINT32_MAX);
   scratch.total_size = current - scratch.base;

   return scratch;
}

static void
anv_get_gpu_acceleration_structure_size(
   UNUSED struct anv_device                   *device,
   VkAccelerationStructureBuildTypeKHR         buildType,
   const VkAccelerationStructureBuildGeometryInfoKHR* pBuildInfo,
   const uint32_t*                             pMaxPrimitiveCounts,
   VkAccelerationStructureBuildSizesInfoKHR*   pSizeInfo)
{

   struct MKSizeEstimate est = get_gpu_size_estimate(pBuildInfo, NULL,
                                                     pMaxPrimitiveCounts);
   struct scratch_layout scratch = get_gpu_scratch_layout(ANV_NULL_ADDRESS, est,
                                                          device->bvh_build_method);

   pSizeInfo->accelerationStructureSize = est.sizeTotal;
   pSizeInfo->buildScratchSize = scratch.total_size;
   pSizeInfo->updateScratchSize = scratch.total_size; /* TODO */
}

void
genX(GetAccelerationStructureBuildSizesKHR)(
    VkDevice                                    _device,
    VkAccelerationStructureBuildTypeKHR         buildType,
    const VkAccelerationStructureBuildGeometryInfoKHR* pBuildInfo,
    const uint32_t*                             pMaxPrimitiveCounts,
    VkAccelerationStructureBuildSizesInfoKHR*   pSizeInfo)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   assert(pSizeInfo->sType ==
          VK_STRUCTURE_TYPE_ACCELERATION_STRUCTURE_BUILD_SIZES_INFO_KHR);

   VkAccelerationStructureBuildSizesInfoKHR gpu_size_info;
   anv_get_gpu_acceleration_structure_size(device, buildType, pBuildInfo,
                                           pMaxPrimitiveCounts,
                                           &gpu_size_info);

   pSizeInfo->accelerationStructureSize =
      gpu_size_info.accelerationStructureSize;
   pSizeInfo->buildScratchSize = gpu_size_info.buildScratchSize;
   pSizeInfo->updateScratchSize = gpu_size_info.updateScratchSize;
}

void
genX(GetDeviceAccelerationStructureCompatibilityKHR)(
    VkDevice                                    _device,
    const VkAccelerationStructureVersionInfoKHR* pVersionInfo,
    VkAccelerationStructureCompatibilityKHR*    pCompatibility)
{
   ANV_FROM_HANDLE(anv_device, device, _device);

   if (memcmp(pVersionInfo->pVersionData,
              device->physical->rt_uuid,
              sizeof(device->physical->rt_uuid)) == 0) {
      *pCompatibility = VK_ACCELERATION_STRUCTURE_COMPATIBILITY_COMPATIBLE_KHR;
   } else {
      *pCompatibility = VK_ACCELERATION_STRUCTURE_COMPATIBILITY_INCOMPATIBLE_KHR;
   }
}

static inline uint8_t
vk_to_grl_GeometryFlags(VkGeometryFlagsKHR flags)
{
   uint8_t grl_flags = GEOMETRY_FLAG_NONE;
   unsigned mask = flags;
   while (mask) {
      int i = u_bit_scan(&mask);
      switch ((VkGeometryFlagBitsKHR)(1u << i)) {
      case VK_GEOMETRY_OPAQUE_BIT_KHR:
         grl_flags |= GEOMETRY_FLAG_OPAQUE;
         break;
      case VK_GEOMETRY_NO_DUPLICATE_ANY_HIT_INVOCATION_BIT_KHR:
         grl_flags |= GEOMETRY_FLAG_NO_DUPLICATE_ANYHIT_INVOCATION;
         break;
      default:
         unreachable("Unsupported acceleration structure build flag");
      }
   }
   return grl_flags;
}

static inline IndexFormat
vk_to_grl_IndexFormat(VkIndexType type)
{
   switch (type) {
   case VK_INDEX_TYPE_NONE_KHR:  return INDEX_FORMAT_NONE;
   case VK_INDEX_TYPE_UINT8_KHR: unreachable("No UINT8 support yet");
   case VK_INDEX_TYPE_UINT16:    return INDEX_FORMAT_R16_UINT;
   case VK_INDEX_TYPE_UINT32:    return INDEX_FORMAT_R32_UINT;
   default:
      unreachable("Unsupported index type");
   }
}

static inline VertexFormat
vk_to_grl_VertexFormat(VkFormat format)
{
   switch (format) {
   case VK_FORMAT_R32G32_SFLOAT:       return VERTEX_FORMAT_R32G32_FLOAT;
   case VK_FORMAT_R32G32B32_SFLOAT:    return VERTEX_FORMAT_R32G32B32_FLOAT;
   case VK_FORMAT_R16G16_SFLOAT:       return VERTEX_FORMAT_R16G16_FLOAT;
   case VK_FORMAT_R16G16B16A16_SFLOAT: return VERTEX_FORMAT_R16G16B16A16_FLOAT;
   case VK_FORMAT_R16G16_SNORM:        return VERTEX_FORMAT_R16G16_SNORM;
   case VK_FORMAT_R16G16B16A16_SNORM:  return VERTEX_FORMAT_R16G16B16A16_SNORM;
   case VK_FORMAT_R16G16B16A16_UNORM:  return VERTEX_FORMAT_R16G16B16A16_UNORM;
   case VK_FORMAT_R16G16_UNORM:        return VERTEX_FORMAT_R16G16_UNORM;
   /* case VK_FORMAT_R10G10B10A2_UNORM:   return VERTEX_FORMAT_R10G10B10A2_UNORM; */
   case VK_FORMAT_R8G8B8A8_UNORM:      return VERTEX_FORMAT_R8G8B8A8_UNORM;
   case VK_FORMAT_R8G8_UNORM:          return VERTEX_FORMAT_R8G8_UNORM;
   case VK_FORMAT_R8G8B8A8_SNORM:      return VERTEX_FORMAT_R8G8B8A8_SNORM;
   case VK_FORMAT_R8G8_SNORM:          return VERTEX_FORMAT_R8G8_SNORM;
   default:
      unreachable("Unsupported vertex format");
   }
}

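/* Translate a Vulkan geometry description into the GRL Geo descriptor the
 * build kernels consume, applying the per-build-range transform, primitive
 * and first-vertex offsets to the device addresses.
 */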
static struct Geo
vk_to_grl_Geo(const VkAccelerationStructureGeometryKHR *pGeometry,
              uint32_t prim_count,
              uint32_t transform_offset,
              uint32_t primitive_offset,
              uint32_t first_vertex)
{
   struct Geo geo = {
      .Flags = vk_to_grl_GeometryFlags(pGeometry->flags),
   };

   switch (pGeometry->geometryType) {
   case VK_GEOMETRY_TYPE_TRIANGLES_KHR: {
      const VkAccelerationStructureGeometryTrianglesDataKHR *vk_tri =
         &pGeometry->geometry.triangles;

      geo.Type = GEOMETRY_TYPE_TRIANGLES;

      geo.Desc.Triangles.pTransformBuffer =
         vk_tri->transformData.deviceAddress;
      geo.Desc.Triangles.pIndexBuffer =
         vk_tri->indexData.deviceAddress;
      geo.Desc.Triangles.pVertexBuffer =
         vk_tri->vertexData.deviceAddress;
      geo.Desc.Triangles.VertexBufferByteStride = vk_tri->vertexStride;

      if (geo.Desc.Triangles.pTransformBuffer)
         geo.Desc.Triangles.pTransformBuffer += transform_offset;

      if (vk_tri->indexType == VK_INDEX_TYPE_NONE_KHR) {
         geo.Desc.Triangles.IndexCount = 0;
         geo.Desc.Triangles.VertexCount = prim_count * 3;
         geo.Desc.Triangles.IndexFormat = INDEX_FORMAT_NONE;
         geo.Desc.Triangles.pVertexBuffer += primitive_offset;
      } else {
         geo.Desc.Triangles.IndexCount = prim_count * 3;
         geo.Desc.Triangles.VertexCount = vk_tri->maxVertex;
         geo.Desc.Triangles.IndexFormat =
            vk_to_grl_IndexFormat(vk_tri->indexType);
         geo.Desc.Triangles.pIndexBuffer += primitive_offset;
      }

      geo.Desc.Triangles.VertexFormat =
         vk_to_grl_VertexFormat(vk_tri->vertexFormat);
      geo.Desc.Triangles.pVertexBuffer += vk_tri->vertexStride * first_vertex;
      break;
   }

   case VK_GEOMETRY_TYPE_AABBS_KHR: {
      const VkAccelerationStructureGeometryAabbsDataKHR *vk_aabbs =
         &pGeometry->geometry.aabbs;
      geo.Type = GEOMETRY_TYPE_PROCEDURAL;
      geo.Desc.Procedural.pAABBs_GPUVA =
         vk_aabbs->data.deviceAddress + primitive_offset;
      geo.Desc.Procedural.AABBByteStride = vk_aabbs->stride;
      geo.Desc.Procedural.AABBCount = prim_count;
      break;
   }

   default:
      unreachable("Invalid geometry type");
   }

   return geo;
}

#include "grl/grl_metakernel_copy.h"
#include "grl/grl_metakernel_misc.h"
#include "grl/grl_metakernel_build_primref.h"
#include "grl/grl_metakernel_new_sah_builder.h"
#include "grl/grl_metakernel_build_leaf.h"

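/* Per-info bookkeeping for one acceleration structure build within a batched
 * vkCmdBuildAccelerationStructuresKHR call: size estimate, scratch layout,
 * GRL kernel state, and the transient-buffer offsets used while recording.
 */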
struct build_state {
   enum anv_rt_bvh_build_method build_method;

   struct MKSizeEstimate estimate;
   struct scratch_layout scratch;
   struct MKBuilderState state;

   struct anv_address bvh_addr;

   size_t geom_size_prefix_sum_buffer;
   size_t transient_size;

   uint32_t leaf_type;
   uint32_t leaf_size;

   uint32_t num_geometries;
   uint32_t num_instances;

   uint64_t instances_addr;
   bool array_of_instances_ptr;

   const VkAccelerationStructureGeometryKHR *vk_geoms;
};

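/* Pick the NEW_SAH scratch buffers for one build.  The temporary BVH2 either
 * lives in the dedicated scratch slot or, when the destination BVH's leaf
 * data area is large enough, reuses that area (mirroring the decision made
 * in get_gpu_scratch_layout).
 */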
static void
get_binnedsah_scratch_buffers(struct build_state *bs,
                              uint64_t *p_qnode_buffer,
                              uint64_t *p_primref_indices,
                              uint64_t *p_bvh2)
{
    if (bs->estimate.numBuildPrimitives == 0)
    {
        *p_bvh2 = 0;
        *p_qnode_buffer = 0;
        *p_primref_indices = 0;
        return;
    }

    size_t bvh2_size = get_bvh2_size(bs->estimate.numBuildPrimitives);
    if (bs->estimate.leaf_data_size < bvh2_size) {
       assert(bs->scratch.bvh2_buffer != 0);
       *p_bvh2 = bs->scratch.bvh2_buffer;
    } else {
       *p_bvh2 = intel_canonical_address(bs->state.bvh_buffer +
                                         bs->estimate.leaf_data_start);
    }

    assert(bs->scratch.qnode_buffer != 0);
    *p_qnode_buffer = bs->scratch.qnode_buffer;

    assert(bs->scratch.leaf_index_buffers != 0);
    *p_primref_indices = bs->scratch.leaf_index_buffers;
}

static void
write_memory(struct anv_cmd_alloc alloc, size_t offset, const void *data, size_t data_len)
{
   assert((offset + data_len) < alloc.size);
   memcpy(alloc.map + offset, data, data_len);
}

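/* Record all the GPU work for a batch of acceleration structure builds.
 * The flow is: per-build size/scratch estimation, allocation of private and
 * transient (CPU-mapped) memory, host-side setup of Geo descriptors and
 * globals, then a series of GRL metakernel dispatches separated by flushes:
 * init_globals, instance/geometry metadata copies, primref generation,
 * internal node construction (trivial or NEW_SAH), and finally leaf writing.
 */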
static void
cmd_build_acceleration_structures(
   struct anv_cmd_buffer *cmd_buffer,
   uint32_t infoCount,
   const VkAccelerationStructureBuildGeometryInfoKHR *pInfos,
   const VkAccelerationStructureBuildRangeInfoKHR *const *ppBuildRangeInfos,
   const VkDeviceAddress *pIndirectDeviceAddresses,
   const uint32_t *pIndirectStrides,
   const uint32_t *const *ppMaxPrimitiveCounts)
{
   struct anv_device *device = cmd_buffer->device;
   VK_MULTIALLOC(ma);

   struct build_state *builds;
   vk_multialloc_add(&ma, &builds, struct build_state, infoCount);

   if (!vk_multialloc_zalloc(&ma,
                             &cmd_buffer->device->vk.alloc,
                             VK_SYSTEM_ALLOCATION_SCOPE_COMMAND)) {
      anv_batch_set_error(&cmd_buffer->batch, VK_ERROR_OUT_OF_HOST_MEMORY);
      return;
   }

   trace_intel_begin_as_build(&cmd_buffer->trace);

   /* TODO: Indirect */
   assert(ppBuildRangeInfos != NULL);

   size_t transient_mem_init_globals_size = 0;
   size_t transient_mem_init_globals_offset = 0;

   size_t transient_total = 0;

   size_t private_mem_total = 0;

   size_t num_trivial_builds = 0;
   size_t num_new_sah_builds = 0;

   /* Prepare a bunch of data for the kernels we have to run. */
   for (uint32_t i = 0; i < infoCount; i++) {
      struct build_state *bs = &builds[i];

      const VkAccelerationStructureBuildGeometryInfoKHR *pInfo = &pInfos[i];
      struct anv_address scratch_addr =
         anv_address_from_u64(pInfo->scratchData.deviceAddress);

      const VkAccelerationStructureBuildRangeInfoKHR *pBuildRangeInfos =
         ppBuildRangeInfos ? ppBuildRangeInfos[i] : NULL;
      const uint32_t *pMaxPrimitiveCounts =
         ppMaxPrimitiveCounts ? ppMaxPrimitiveCounts[i] : NULL;

      ANV_FROM_HANDLE(vk_acceleration_structure, dst_accel,
                      pInfo->dstAccelerationStructure);

      bs->build_method = device->bvh_build_method;

      bs->bvh_addr = anv_address_from_u64(vk_acceleration_structure_get_va(dst_accel));

      bs->estimate = get_gpu_size_estimate(pInfo, pBuildRangeInfos,
                                           pMaxPrimitiveCounts);
      bs->scratch = get_gpu_scratch_layout(scratch_addr, bs->estimate,
                                           bs->build_method);

      uint32_t leaf_size, leaf_type;

      switch (pInfo->type) {
      case VK_ACCELERATION_STRUCTURE_TYPE_TOP_LEVEL_KHR: {
         assert(pInfo->geometryCount == 1);

         const VkAccelerationStructureGeometryKHR *pGeometry =
            get_geometry(pInfo, 0);
         assert(pGeometry->geometryType == VK_GEOMETRY_TYPE_INSTANCES_KHR);

         const VkAccelerationStructureGeometryInstancesDataKHR *instances =
            &pGeometry->geometry.instances;

         bs->num_instances = pBuildRangeInfos[0].primitiveCount;
         bs->instances_addr = instances->data.deviceAddress;
         bs->array_of_instances_ptr = instances->arrayOfPointers;
         leaf_type = NODE_TYPE_INSTANCE;
         leaf_size = GENX(RT_BVH_INSTANCE_LEAF_length) * 4;
         break;
      }

      case VK_ACCELERATION_STRUCTURE_TYPE_BOTTOM_LEVEL_KHR: {
         bs->num_geometries = pInfo->geometryCount;
         leaf_type = NODE_TYPE_QUAD;
         leaf_size = GENX(RT_BVH_QUAD_LEAF_length) * 4;
         break;
      }

      default:
         unreachable("Unsupported acceleration structure type");
      }

      size_t geom_struct_size = bs->num_geometries * sizeof(struct Geo);
      size_t geom_prefix_sum_size = align_uintptr(sizeof(uint32_t) * (bs->num_geometries + 1), 64);

      bs->transient_size = geom_prefix_sum_size + geom_struct_size;

      bs->geom_size_prefix_sum_buffer = transient_total + 0;

      bs->state = (struct MKBuilderState) {
         .geomDesc_buffer = bs->geom_size_prefix_sum_buffer +
                            geom_prefix_sum_size,
         .build_primref_buffer = bs->scratch.primrefs,
         .build_globals = bs->scratch.globals,
         .bvh_buffer = anv_address_physical(bs->bvh_addr),
         .leaf_type = leaf_type,
         .leaf_size = leaf_size,
      };

      transient_total += bs->transient_size;

      switch (device->bvh_build_method) {
      case ANV_BVH_BUILD_METHOD_TRIVIAL:
         num_trivial_builds++;
         break;
      case ANV_BVH_BUILD_METHOD_NEW_SAH:
         num_new_sah_builds++;
         break;
      default:
         unreachable("invalid BVH build method");
      }

      transient_mem_init_globals_size += sizeof(struct BatchedInitGlobalsData);
   }

   transient_total = align_transient_size(transient_total);
   transient_mem_init_globals_offset = transient_total;
   transient_total += align_transient_size(transient_mem_init_globals_size);

   size_t transient_mem_binnedsah_size = 0;
   size_t transient_mem_binnedsah_offset = 0;
   size_t private_mem_binnedsah_size = 0;
   size_t private_mem_binnedsah_offset = 0;

   transient_mem_binnedsah_size = get_batched_binnedsah_transient_mem_size(num_new_sah_builds);
   transient_mem_binnedsah_offset = transient_total;
   transient_total += align_transient_size(transient_mem_binnedsah_size);

   private_mem_binnedsah_size = get_batched_binnedsah_private_mem_size(num_new_sah_builds);
   private_mem_binnedsah_offset = private_mem_total;
   private_mem_total += align_private_size(private_mem_binnedsah_size);

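   /* Private memory is allocated without a CPU mapping and reused across
    * builds recorded into this command buffer (cached in cmd_buffer->state.rt);
    * transient memory is allocated per call and CPU-mapped so the Geo
    * descriptors and kernel arguments can be written from the host below.
    */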
   /* Allocate required memory, unless we already have a suitable buffer */
   struct anv_cmd_alloc private_mem_alloc;
   if (private_mem_total > cmd_buffer->state.rt.build_priv_mem_size) {
      private_mem_alloc =
         anv_cmd_buffer_alloc_space(cmd_buffer, private_mem_total, 64,
                                    false /* mapped */);
      if (anv_cmd_alloc_is_empty(private_mem_alloc)) {
         anv_batch_set_error(&cmd_buffer->batch, VK_ERROR_OUT_OF_DEVICE_MEMORY);
         goto error;
      }

      cmd_buffer->state.rt.build_priv_mem_addr = private_mem_alloc.address;
      cmd_buffer->state.rt.build_priv_mem_size = private_mem_alloc.size;
   } else {
      private_mem_alloc = (struct anv_cmd_alloc) {
         .address = cmd_buffer->state.rt.build_priv_mem_addr,
         .map     = anv_address_map(cmd_buffer->state.rt.build_priv_mem_addr),
         .size    = cmd_buffer->state.rt.build_priv_mem_size,
      };
   }

   struct anv_cmd_alloc transient_mem_alloc =
      anv_cmd_buffer_alloc_space(cmd_buffer, transient_total, 64,
                                 true /* mapped */);
   if (transient_total > 0 && anv_cmd_alloc_is_empty(transient_mem_alloc)) {
      anv_batch_set_error(&cmd_buffer->batch, VK_ERROR_OUT_OF_DEVICE_MEMORY);
      goto error;
   }

   uint64_t private_base = anv_address_physical(private_mem_alloc.address);
   uint64_t transient_base = anv_address_physical(transient_mem_alloc.address);

   /* Prepare transient memory */
   for (uint32_t i = 0; i < infoCount; i++) {
      struct build_state *bs = &builds[i];

      const VkAccelerationStructureBuildGeometryInfoKHR *pInfo = &pInfos[i];

      const VkAccelerationStructureBuildRangeInfoKHR *pBuildRangeInfos =
         ppBuildRangeInfos ? ppBuildRangeInfos[i] : NULL;

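      /* Convert each Vulkan geometry into a GRL Geo descriptor and record a
       * running prefix sum of per-geometry primitive counts, with the total
       * stored in the final slot.
       */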
      struct Geo *geos = transient_mem_alloc.map + bs->state.geomDesc_buffer;
      uint32_t *prefixes = transient_mem_alloc.map + bs->geom_size_prefix_sum_buffer;
      uint32_t prefix_sum = 0;
      for (unsigned g = 0; g < bs->num_geometries; g++) {
         const VkAccelerationStructureGeometryKHR *pGeometry = get_geometry(pInfo, g);
         uint32_t prim_count = pBuildRangeInfos[g].primitiveCount;
         geos[g] = vk_to_grl_Geo(pGeometry, prim_count,
                                 pBuildRangeInfos[g].transformOffset,
                                 pBuildRangeInfos[g].primitiveOffset,
                                 pBuildRangeInfos[g].firstVertex);

         prefixes[g] = prefix_sum;
         prefix_sum += prim_count;
      }

      prefixes[bs->num_geometries] = prefix_sum;

      bs->geom_size_prefix_sum_buffer =
         intel_canonical_address(bs->geom_size_prefix_sum_buffer +
                                 transient_base);
      bs->state.geomDesc_buffer =
         intel_canonical_address(bs->state.geomDesc_buffer +
                                 transient_base);

      struct BatchedInitGlobalsData data = {
         .p_build_globals = bs->scratch.globals,
         .p_bvh_buffer = anv_address_physical(bs->bvh_addr),

         .numPrimitives = 0,
         .numGeometries = bs->num_geometries,
         .numInstances = bs->num_instances,

         .instance_descs_start = bs->estimate.instance_descs_start,
         .geo_meta_data_start = bs->estimate.geo_meta_data_start,
         .node_data_start = bs->estimate.node_data_start,
         .leaf_data_start = bs->estimate.leaf_data_start,
         .procedural_data_start = bs->estimate.procedural_data_start,
         .back_pointer_start = bs->estimate.back_pointer_start,
         .sizeTotal = bs->estimate.sizeTotal,

         .leafType = bs->state.leaf_type,
         .leafSize = bs->state.leaf_size,
      };

      write_memory(transient_mem_alloc,
                   transient_mem_init_globals_offset + i * sizeof(data),
                   &data, sizeof(data));
   }

   genX(flush_pipeline_select_gpgpu)(cmd_buffer);

   /* Due to the nature of GRL and its heavy use of jumps/predication, we
    * cannot tell exactly in what order the CFE_STATE commands we insert are
    * going to be executed. So always use the largest possible scratch size.
    */
   genX(cmd_buffer_ensure_cfe_state)(
      cmd_buffer,
      cmd_buffer->device->physical->max_grl_scratch_size);

   /* Round 1 : init_globals kernel */
   genX(grl_misc_batched_init_globals)(
      cmd_buffer,
      intel_canonical_address(transient_base +
                              transient_mem_init_globals_offset),
      infoCount);

   anv_add_pending_pipe_bits(cmd_buffer,
                             ANV_GRL_FLUSH_FLAGS,
                             "building accel struct");

   /* Round 2 : Copy instance/geometry data from the application-provided
    *           buffers into the acceleration structures.
    */
   for (uint32_t i = 0; i < infoCount; i++) {
      struct build_state *bs = &builds[i];

      /* Metadata */
      if (bs->num_instances) {
         assert(bs->num_geometries == 0);

         const uint64_t copy_size = bs->num_instances * sizeof(InstanceDesc);
         /* This must be calculated in the same way as
          * groupCountForGeoMetaDataCopySize
          */
         const uint32_t num_threads = (copy_size >> 8) + 3;

         if (bs->array_of_instances_ptr) {
            genX(grl_misc_copy_instance_ptrs)(
               cmd_buffer,
               anv_address_physical(anv_address_add(bs->bvh_addr,
                                                    bs->estimate.instance_descs_start)),
               bs->instances_addr,
               copy_size, num_threads);
         } else {
            genX(grl_misc_copy_instances)(
               cmd_buffer,
               anv_address_physical(anv_address_add(bs->bvh_addr,
                                                    bs->estimate.instance_descs_start)),
               bs->instances_addr,
               copy_size, num_threads);
         }
      }

      if (bs->num_geometries) {
         assert(bs->num_instances == 0);
         const uint64_t copy_size = bs->num_geometries * sizeof(struct GeoMetaData);

         /* This must be calculated in the same way as
          * groupCountForGeoMetaDataCopySize
          */
         const uint32_t num_threads = (copy_size >> 6) + 1;

         genX(grl_misc_copy_geo_meta_data)(
            cmd_buffer,
            anv_address_physical(anv_address_add(bs->bvh_addr,
                                                 bs->estimate.geo_meta_data_start)),
            bs->state.geomDesc_buffer,
            copy_size,
            num_threads);
      }

      /* Primrefs */
      if (bs->num_instances) {
         if (bs->array_of_instances_ptr) {
            genX(grl_build_primref_buildPrimirefsFromInstancesArrOfPtrs)(
               cmd_buffer,
               bs->instances_addr,
               PREFIX_MK_SIZE(grl_build_primref, bs->estimate),
               PREFIX_MK_STATE(grl_build_primref, bs->state),
               false /* allowUpdate */);
         } else {
            genX(grl_build_primref_buildPrimirefsFromInstances)(
               cmd_buffer,
               bs->instances_addr,
               PREFIX_MK_SIZE(grl_build_primref, bs->estimate),
               PREFIX_MK_STATE(grl_build_primref, bs->state),
               false /* allowUpdate */);
         }
      }

      if (bs->num_geometries) {
         const VkAccelerationStructureBuildGeometryInfoKHR *pInfo = &pInfos[i];
         const VkAccelerationStructureBuildRangeInfoKHR *pBuildRangeInfos =
            ppBuildRangeInfos ? ppBuildRangeInfos[i] : NULL;

         assert(pInfo->geometryCount == bs->num_geometries);
         for (unsigned g = 0; g < pInfo->geometryCount; g++) {
            const VkAccelerationStructureGeometryKHR *pGeometry =
               get_geometry(pInfo, g);

            switch (pGeometry->geometryType) {
            case VK_GEOMETRY_TYPE_TRIANGLES_KHR:
               genX(grl_build_primref_primrefs_from_tris)(
                  cmd_buffer,
                  PREFIX_MK_STATE(grl_build_primref, bs->state),
                  PREFIX_MK_SIZE(grl_build_primref, bs->estimate),
                  bs->state.geomDesc_buffer + g * sizeof(struct Geo),
                  g,
                  vk_to_grl_GeometryFlags(pGeometry->flags),
                  /* TODO: Indirect */
                  pBuildRangeInfos[g].primitiveCount);
               break;

            case VK_GEOMETRY_TYPE_AABBS_KHR:
               genX(grl_build_primref_primrefs_from_proc)(
                  cmd_buffer,
                  PREFIX_MK_STATE(grl_build_primref, bs->state),
                  PREFIX_MK_SIZE(grl_build_primref, bs->estimate),
                  bs->state.geomDesc_buffer + g * sizeof(struct Geo),
                  g,
                  vk_to_grl_GeometryFlags(pGeometry->flags),
                  /* TODO: Indirect */
                  pBuildRangeInfos[g].primitiveCount);
               break;

            default:
               unreachable("Invalid geometry type");
            }
         }
      }
   }

   anv_add_pending_pipe_bits(cmd_buffer,
                             ANV_GRL_FLUSH_FLAGS,
                             "building accel struct");

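   /* Round 3 : Build the internal node hierarchy.  TRIVIAL builds each run a
    * single-pass binned-SAH kernel on their own; NEW_SAH builds are batched
    * into one dispatch with per-build argument blocks written to transient
    * memory.
    */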
   /* Dispatch trivial builds */
   if (num_trivial_builds) {
      for (uint32_t i = 0; i < infoCount; i++) {
         struct build_state *bs = &builds[i];

         if (bs->build_method != ANV_BVH_BUILD_METHOD_TRIVIAL)
            continue;

         genX(grl_new_sah_builder_single_pass_binsah)(
            cmd_buffer,
            bs->scratch.globals,
            bs->state.bvh_buffer,
            bs->state.build_primref_buffer,
            bs->scratch.leaf_index_buffers,
            false /* alloc_backpointers */);
      }
   }

   /* Dispatch new SAH builds */
   if (num_new_sah_builds) {
      size_t global_ptrs_offset  = transient_mem_binnedsah_offset;
      size_t buffers_info_offset = transient_mem_binnedsah_offset + sizeof(gpuva_t) * num_new_sah_builds;

      size_t scheduler_offset   = private_mem_binnedsah_offset;
      size_t sah_globals_offset = private_mem_binnedsah_offset + get_scheduler_size(num_new_sah_builds);

      struct SAHBuildArgsBatchable args = {
         .num_builds                               = infoCount,
         .p_globals_ptrs                           = intel_canonical_address(transient_base + global_ptrs_offset),
         .p_buffers_info                           = intel_canonical_address(transient_base + buffers_info_offset),
         .p_scheduler                              = intel_canonical_address(private_base + scheduler_offset),
         .p_sah_globals                            = intel_canonical_address(private_base + sah_globals_offset),
         .num_max_qnode_global_root_buffer_entries = MAX2(num_new_sah_builds, QNODE_GLOBAL_ROOT_BUFFER_MIN_ENTRIES_NUM),
      };

      for (uint32_t i = 0; i < infoCount; i++) {
         struct build_state *bs = &builds[i];

         if (bs->build_method != ANV_BVH_BUILD_METHOD_NEW_SAH)
            continue;

         uint64_t p_build_primref_index_buffers;
         uint64_t p_bvh2;
         uint64_t p_qnode_child_buffer;

         get_binnedsah_scratch_buffers(bs,
                                       &p_qnode_child_buffer,
                                       &p_build_primref_index_buffers,
                                       &p_bvh2);

         struct SAHBuildBuffersInfo buffers = {
            .p_primref_index_buffers  = bs->scratch.leaf_index_buffers,
            .p_bvh_base               = bs->state.bvh_buffer,
            .p_primrefs_buffer        = bs->state.build_primref_buffer,
            .p_bvh2                   = p_bvh2,
            .p_qnode_root_buffer      = p_qnode_child_buffer,
            .sah_globals_flags        = 0,
         };

         write_memory(transient_mem_alloc, buffers_info_offset, &buffers, sizeof(buffers));
         buffers_info_offset += sizeof(buffers);

         write_memory(transient_mem_alloc, global_ptrs_offset, &bs->state.build_globals,
                      sizeof(bs->state.build_globals));
         global_ptrs_offset += sizeof(bs->state.build_globals);
      }

      genX(grl_new_sah_builder_new_sah_build_batchable)(
         cmd_buffer, PREFIX_MK_SAH_BUILD_ARGS_BATCHABLE(grl_new_sah_builder, args));
   }

   if (num_new_sah_builds == 0)
      anv_add_pending_pipe_bits(cmd_buffer,
                                ANV_GRL_FLUSH_FLAGS,
                                "building accel struct");

   /* Finally write the leaves. */
   for (uint32_t i = 0; i < infoCount; i++) {
      struct build_state *bs = &builds[i];

      if (bs->num_instances) {
         assert(bs->num_geometries == 0);
         if (bs->array_of_instances_ptr) {
            genX(grl_leaf_builder_buildLeafDXR_instances_pointers)(cmd_buffer,
               PREFIX_MK_STATE(grl_leaf_builder, bs->state),
               bs->scratch.leaf_index_buffers,
               bs->instances_addr,
               bs->scratch.leaf_index_buffer_stride,
               0 /* offset */,
               bs->estimate.numBuildPrimitives);
         } else {
            genX(grl_leaf_builder_buildLeafDXR_instances)(cmd_buffer,
               PREFIX_MK_STATE(grl_leaf_builder, bs->state),
               bs->scratch.leaf_index_buffers,
               bs->instances_addr,
               bs->scratch.leaf_index_buffer_stride,
               0 /* offset */,
               bs->estimate.numBuildPrimitives);
         }
      }

      if (bs->num_geometries) {
         assert(bs->num_instances == 0);
         const uint64_t p_numPrimitives =
            bs->state.build_globals + offsetof(struct Globals, numPrimitives);

         assert(bs->estimate.numProcedurals == 0 ||
                bs->estimate.numTriangles == 0);
         if (bs->estimate.numProcedurals) {
            genX(grl_leaf_builder_buildLeafDXR_procedurals)(
               cmd_buffer,
               PREFIX_MK_STATE(grl_leaf_builder, bs->state),
               bs->scratch.leaf_index_buffers,
               bs->scratch.leaf_index_buffer_stride,
               0 /* offset */,
               p_numPrimitives);
         } else {
            genX(grl_leaf_builder_buildLeafDXR_quads)(
               cmd_buffer,
               PREFIX_MK_STATE(grl_leaf_builder, bs->state),
               bs->scratch.leaf_index_buffers,
               bs->scratch.leaf_index_buffer_stride,
               0 /* offset */,
               p_numPrimitives,
               false /* allow_updates */);
         }
      }
   }

   anv_add_pending_pipe_bits(cmd_buffer,
                             ANV_GRL_FLUSH_FLAGS,
                             "building accel struct");

   trace_intel_end_as_build(&cmd_buffer->trace);

 error:
   vk_free(&cmd_buffer->device->vk.alloc, builds);
}

void
genX(CmdBuildAccelerationStructuresKHR)(
    VkCommandBuffer                             commandBuffer,
    uint32_t                                    infoCount,
    const VkAccelerationStructureBuildGeometryInfoKHR* pInfos,
    const VkAccelerationStructureBuildRangeInfoKHR* const* ppBuildRangeInfos)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);

   if (anv_batch_has_error(&cmd_buffer->batch))
      return;

   cmd_build_acceleration_structures(cmd_buffer, infoCount, pInfos,
                                     ppBuildRangeInfos, NULL, NULL, NULL);
}

void
genX(CmdBuildAccelerationStructuresIndirectKHR)(
    VkCommandBuffer                             commandBuffer,
    uint32_t                                    infoCount,
    const VkAccelerationStructureBuildGeometryInfoKHR* pInfos,
    const VkDeviceAddress*                      pIndirectDeviceAddresses,
    const uint32_t*                             pIndirectStrides,
    const uint32_t* const*                      ppMaxPrimitiveCounts)
{
   unreachable("Unimplemented");
}

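/* GPU-side copies between acceleration structures.  CLONE reads the copy
 * size from the source's BVHBase header on the GPU (hence the "indirect"
 * kernel); COMPACT rewrites the source into a more tightly packed
 * destination.
 */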
void
genX(CmdCopyAccelerationStructureKHR)(
    VkCommandBuffer                             commandBuffer,
    const VkCopyAccelerationStructureInfoKHR*   pInfo)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(vk_acceleration_structure, src_accel, pInfo->src);
   ANV_FROM_HANDLE(vk_acceleration_structure, dst_accel, pInfo->dst);

   assert(pInfo->mode == VK_COPY_ACCELERATION_STRUCTURE_MODE_COMPACT_KHR ||
          pInfo->mode == VK_COPY_ACCELERATION_STRUCTURE_MODE_CLONE_KHR);

   if (pInfo->mode == VK_COPY_ACCELERATION_STRUCTURE_MODE_CLONE_KHR) {
      uint64_t src_size_addr =
         vk_acceleration_structure_get_va(src_accel) +
         offsetof(struct BVHBase, Meta.allocationSize);
      genX(grl_copy_clone_indirect)(
         cmd_buffer,
         vk_acceleration_structure_get_va(dst_accel),
         vk_acceleration_structure_get_va(src_accel),
         src_size_addr);
   } else {
      genX(grl_copy_compact)(
         cmd_buffer,
         vk_acceleration_structure_get_va(dst_accel),
         vk_acceleration_structure_get_va(src_accel));
   }

   anv_add_pending_pipe_bits(cmd_buffer,
                             ANV_PIPE_END_OF_PIPE_SYNC_BIT,
                             "after copy acceleration struct");
}

void
genX(CmdCopyAccelerationStructureToMemoryKHR)(
    VkCommandBuffer                             commandBuffer,
    const VkCopyAccelerationStructureToMemoryInfoKHR* pInfo)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(vk_acceleration_structure, src_accel, pInfo->src);
   struct anv_device *device = cmd_buffer->device;
   uint64_t src_size_addr =
      vk_acceleration_structure_get_va(src_accel) +
      offsetof(struct BVHBase, Meta.allocationSize);

   assert(pInfo->mode == VK_COPY_ACCELERATION_STRUCTURE_MODE_SERIALIZE_KHR);

   genX(grl_copy_serialize_indirect)(
      cmd_buffer,
      pInfo->dst.deviceAddress,
      vk_acceleration_structure_get_va(src_accel),
      anv_address_physical(device->rt_uuid_addr),
      src_size_addr);

   anv_add_pending_pipe_bits(cmd_buffer,
                             ANV_PIPE_END_OF_PIPE_SYNC_BIT,
                             "after copy acceleration struct");
}

void
genX(CmdCopyMemoryToAccelerationStructureKHR)(
    VkCommandBuffer                             commandBuffer,
    const VkCopyMemoryToAccelerationStructureInfoKHR* pInfo)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(vk_acceleration_structure, dst_accel, pInfo->dst);

   assert(pInfo->mode == VK_COPY_ACCELERATION_STRUCTURE_MODE_DESERIALIZE_KHR);

   uint64_t src_size_addr = pInfo->src.deviceAddress +
      offsetof(struct SerializationHeader, DeserializedSizeInBytes);
   genX(grl_copy_deserialize_indirect)(
      cmd_buffer,
      vk_acceleration_structure_get_va(dst_accel),
      pInfo->src.deviceAddress,
      src_size_addr);

   anv_add_pending_pipe_bits(cmd_buffer,
                             ANV_PIPE_END_OF_PIPE_SYNC_BIT,
                             "after copy acceleration struct");
}

/* TODO: Host commands */

VkResult
genX(BuildAccelerationStructuresKHR)(
    VkDevice                                    _device,
    VkDeferredOperationKHR                      deferredOperation,
    uint32_t                                    infoCount,
    const VkAccelerationStructureBuildGeometryInfoKHR* pInfos,
    const VkAccelerationStructureBuildRangeInfoKHR* const* ppBuildRangeInfos)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   unreachable("Unimplemented");
   return vk_error(device, VK_ERROR_FEATURE_NOT_PRESENT);
}

VkResult
genX(CopyAccelerationStructureKHR)(
    VkDevice                                    _device,
    VkDeferredOperationKHR                      deferredOperation,
    const VkCopyAccelerationStructureInfoKHR*   pInfo)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   unreachable("Unimplemented");
   return vk_error(device, VK_ERROR_FEATURE_NOT_PRESENT);
}

VkResult
genX(CopyAccelerationStructureToMemoryKHR)(
    VkDevice                                    _device,
    VkDeferredOperationKHR                      deferredOperation,
    const VkCopyAccelerationStructureToMemoryInfoKHR* pInfo)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   unreachable("Unimplemented");
   return vk_error(device, VK_ERROR_FEATURE_NOT_PRESENT);
}

VkResult
genX(CopyMemoryToAccelerationStructureKHR)(
    VkDevice                                    _device,
    VkDeferredOperationKHR                      deferredOperation,
    const VkCopyMemoryToAccelerationStructureInfoKHR* pInfo)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   unreachable("Unimplemented");
   return vk_error(device, VK_ERROR_FEATURE_NOT_PRESENT);
}

VkResult
genX(WriteAccelerationStructuresPropertiesKHR)(
    VkDevice                                    _device,
    uint32_t                                    accelerationStructureCount,
    const VkAccelerationStructureKHR*           pAccelerationStructures,
    VkQueryType                                 queryType,
    size_t                                      dataSize,
    void*                                       pData,
    size_t                                      stride)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   unreachable("Unimplemented");
   return vk_error(device, VK_ERROR_FEATURE_NOT_PRESENT);
}

#endif /* GFX_VERx10 >= 125 */