1 // Copyright 2024 The Android Open Source Project
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "DeviceLostHelper.h"
16 
17 #include "host-common/logging.h"
18 
19 namespace gfxstream {
20 namespace vk {
21 
enableWithNvidiaDeviceDiagnosticCheckpoints()22 void DeviceLostHelper::enableWithNvidiaDeviceDiagnosticCheckpoints() { mEnabled = true; }
23 
createMarkerForCommandBuffer(const VkCommandBuffer & commandBuffer,MarkerType type)24 const void* DeviceLostHelper::createMarkerForCommandBuffer(const VkCommandBuffer& commandBuffer,
25                                                            MarkerType type) {
26     std::lock_guard<std::mutex> lock(mMarkersMutex);
27 
28     auto it = mMarkers.insert(CheckpointMarker{commandBuffer, type});
29 
30     // References and pointers to data stored in the container are only
31     // invalidated by erasing that element, even when the corresponding
32     // iterator is invalidated.
33     return reinterpret_cast<const void*>(&(*it.first));
34 }
35 
removeMarkersForCommandBuffer(const VkCommandBuffer & commandBuffer)36 void DeviceLostHelper::removeMarkersForCommandBuffer(const VkCommandBuffer& commandBuffer) {
37     std::lock_guard<std::mutex> lock(mMarkersMutex);
38     mMarkers.erase(CheckpointMarker{
39         .commandBuffer = commandBuffer,
40         .type = MarkerType::kBegin,
41     });
42     mMarkers.erase(CheckpointMarker{
43         .commandBuffer = commandBuffer,
44         .type = MarkerType::kEnd,
45     });
46 }
47 
addNeededDeviceExtensions(std::vector<const char * > * deviceExtensions)48 void DeviceLostHelper::addNeededDeviceExtensions(std::vector<const char*>* deviceExtensions) {
49     if (mEnabled) {
50         deviceExtensions->push_back(VK_NV_DEVICE_DIAGNOSTIC_CHECKPOINTS_EXTENSION_NAME);
51     }
52 }
53 
onBeginCommandBuffer(const VkCommandBuffer & commandBuffer,const VulkanDispatch * vk)54 void DeviceLostHelper::onBeginCommandBuffer(const VkCommandBuffer& commandBuffer,
55                                             const VulkanDispatch* vk) {
56     if (!mEnabled) {
57         return;
58     }
59 
60     const void* marker = createMarkerForCommandBuffer(commandBuffer, MarkerType::kBegin);
61     vk->vkCmdSetCheckpointNV(commandBuffer, marker);
62 }
63 
onEndCommandBuffer(const VkCommandBuffer & commandBuffer,const VulkanDispatch * vk)64 void DeviceLostHelper::onEndCommandBuffer(const VkCommandBuffer& commandBuffer,
65                                           const VulkanDispatch* vk) {
66     if (!mEnabled) {
67         return;
68     }
69 
70     const void* marker = createMarkerForCommandBuffer(commandBuffer, MarkerType::kEnd);
71     vk->vkCmdSetCheckpointNV(commandBuffer, marker);
72 }
73 
onResetCommandBuffer(const VkCommandBuffer & commandBuffer)74 void DeviceLostHelper::onResetCommandBuffer(const VkCommandBuffer& commandBuffer) {
75     if (!mEnabled) {
76         return;
77     }
78 
79     removeMarkersForCommandBuffer(commandBuffer);
80 }
81 
onFreeCommandBuffer(const VkCommandBuffer & commandBuffer)82 void DeviceLostHelper::onFreeCommandBuffer(const VkCommandBuffer& commandBuffer) {
83     if (!mEnabled) {
84         return;
85     }
86 
87     removeMarkersForCommandBuffer(commandBuffer);
88 }
89 
onDeviceLost(const std::vector<DeviceWithQueues> & devicesWithQueues)90 void DeviceLostHelper::onDeviceLost(const std::vector<DeviceWithQueues>& devicesWithQueues) {
91     if (!mEnabled) {
92         return;
93     }
94 
95     ERR("DeviceLostHelper starting lost device checks...");
96 
97     for (const DeviceWithQueues& deviceWithQueues : devicesWithQueues) {
98         const auto& device = deviceWithQueues.device;
99         const auto* deviceDispatch = deviceWithQueues.deviceDispatch;
100         if (deviceDispatch->vkDeviceWaitIdle(device) != VK_ERROR_DEVICE_LOST) {
101             continue;
102         }
103         ERR("VkDevice:%p was lost, checking for unfinished VkCommandBuffers...", device);
104 
105         struct CommandBufferOnQueue {
106             VkCommandBuffer commandBuffer = VK_NULL_HANDLE;
107             VkQueue queue = VK_NULL_HANDLE;
108         };
109         std::vector<CommandBufferOnQueue> unfinishedCommandBuffers;
110 
111         for (const VkQueue& queue : deviceWithQueues.queues) {
112             std::vector<VkCheckpointDataNV> checkpointDatas;
113 
114             uint32_t checkpointDataCount = 0;
115             deviceDispatch->vkGetQueueCheckpointDataNV(queue, &checkpointDataCount, nullptr);
116             if (checkpointDataCount == 0) continue;
117 
118             checkpointDatas.resize(
119                 static_cast<size_t>(checkpointDataCount),
120                 VkCheckpointDataNV{
121                     .sType = VK_STRUCTURE_TYPE_CHECKPOINT_DATA_NV,
122                 });
123             deviceDispatch->vkGetQueueCheckpointDataNV(queue, &checkpointDataCount,
124                                                        checkpointDatas.data());
125 
126             std::unordered_set<VkCommandBuffer> unfinishedCommandBuffersForQueue;
127             for (const VkCheckpointDataNV& checkpointData : checkpointDatas) {
128                 const auto& marker =
129                     *reinterpret_cast<const CheckpointMarker*>(checkpointData.pCheckpointMarker);
130                 if (marker.type == MarkerType::kBegin) {
131                     unfinishedCommandBuffersForQueue.insert(marker.commandBuffer);
132                 } else {
133                     unfinishedCommandBuffersForQueue.erase(marker.commandBuffer);
134                 }
135             }
136 
137             for (const VkCommandBuffer commandBuffer : unfinishedCommandBuffersForQueue) {
138                 unfinishedCommandBuffers.push_back(CommandBufferOnQueue{
139                     .commandBuffer = commandBuffer,
140                     .queue = queue,
141                 });
142             }
143         }
144 
145         if (unfinishedCommandBuffers.empty()) {
146             ERR("VkDevice:%p has no outstanding VkCommandBuffers.", device);
147         } else {
148             ERR("VkDevice:%p has outstanding VkCommandBuffers:", device);
149             for (const CommandBufferOnQueue& unfinished : unfinishedCommandBuffers) {
150                 ERR("   - VkCommandBuffer:%p on VkQueue:%p", unfinished.commandBuffer,
151                     unfinished.queue);
152             }
153         }
154     }
155 
156     ERR("DeviceLostHelper finished lost device checks.");
157 }
158 
159 }  // namespace vk
160 }  // namespace gfxstream