1 // Copyright 2024 The Android Open Source Project
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "DeviceLostHelper.h"
16
17 #include "host-common/logging.h"
18
19 namespace gfxstream {
20 namespace vk {
21
enableWithNvidiaDeviceDiagnosticCheckpoints()22 void DeviceLostHelper::enableWithNvidiaDeviceDiagnosticCheckpoints() { mEnabled = true; }
23
createMarkerForCommandBuffer(const VkCommandBuffer & commandBuffer,MarkerType type)24 const void* DeviceLostHelper::createMarkerForCommandBuffer(const VkCommandBuffer& commandBuffer,
25 MarkerType type) {
26 std::lock_guard<std::mutex> lock(mMarkersMutex);
27
28 auto it = mMarkers.insert(CheckpointMarker{commandBuffer, type});
29
30 // References and pointers to data stored in the container are only
31 // invalidated by erasing that element, even when the corresponding
32 // iterator is invalidated.
33 return reinterpret_cast<const void*>(&(*it.first));
34 }
35
removeMarkersForCommandBuffer(const VkCommandBuffer & commandBuffer)36 void DeviceLostHelper::removeMarkersForCommandBuffer(const VkCommandBuffer& commandBuffer) {
37 std::lock_guard<std::mutex> lock(mMarkersMutex);
38 mMarkers.erase(CheckpointMarker{
39 .commandBuffer = commandBuffer,
40 .type = MarkerType::kBegin,
41 });
42 mMarkers.erase(CheckpointMarker{
43 .commandBuffer = commandBuffer,
44 .type = MarkerType::kEnd,
45 });
46 }
47
addNeededDeviceExtensions(std::vector<const char * > * deviceExtensions)48 void DeviceLostHelper::addNeededDeviceExtensions(std::vector<const char*>* deviceExtensions) {
49 if (mEnabled) {
50 deviceExtensions->push_back(VK_NV_DEVICE_DIAGNOSTIC_CHECKPOINTS_EXTENSION_NAME);
51 }
52 }
53
onBeginCommandBuffer(const VkCommandBuffer & commandBuffer,const VulkanDispatch * vk)54 void DeviceLostHelper::onBeginCommandBuffer(const VkCommandBuffer& commandBuffer,
55 const VulkanDispatch* vk) {
56 if (!mEnabled) {
57 return;
58 }
59
60 const void* marker = createMarkerForCommandBuffer(commandBuffer, MarkerType::kBegin);
61 vk->vkCmdSetCheckpointNV(commandBuffer, marker);
62 }
63
onEndCommandBuffer(const VkCommandBuffer & commandBuffer,const VulkanDispatch * vk)64 void DeviceLostHelper::onEndCommandBuffer(const VkCommandBuffer& commandBuffer,
65 const VulkanDispatch* vk) {
66 if (!mEnabled) {
67 return;
68 }
69
70 const void* marker = createMarkerForCommandBuffer(commandBuffer, MarkerType::kEnd);
71 vk->vkCmdSetCheckpointNV(commandBuffer, marker);
72 }
73
onResetCommandBuffer(const VkCommandBuffer & commandBuffer)74 void DeviceLostHelper::onResetCommandBuffer(const VkCommandBuffer& commandBuffer) {
75 if (!mEnabled) {
76 return;
77 }
78
79 removeMarkersForCommandBuffer(commandBuffer);
80 }
81
onFreeCommandBuffer(const VkCommandBuffer & commandBuffer)82 void DeviceLostHelper::onFreeCommandBuffer(const VkCommandBuffer& commandBuffer) {
83 if (!mEnabled) {
84 return;
85 }
86
87 removeMarkersForCommandBuffer(commandBuffer);
88 }
89
onDeviceLost(const std::vector<DeviceWithQueues> & devicesWithQueues)90 void DeviceLostHelper::onDeviceLost(const std::vector<DeviceWithQueues>& devicesWithQueues) {
91 if (!mEnabled) {
92 return;
93 }
94
95 ERR("DeviceLostHelper starting lost device checks...");
96
97 for (const DeviceWithQueues& deviceWithQueues : devicesWithQueues) {
98 const auto& device = deviceWithQueues.device;
99 const auto* deviceDispatch = deviceWithQueues.deviceDispatch;
100 if (deviceDispatch->vkDeviceWaitIdle(device) != VK_ERROR_DEVICE_LOST) {
101 continue;
102 }
103 ERR("VkDevice:%p was lost, checking for unfinished VkCommandBuffers...", device);
104
105 struct CommandBufferOnQueue {
106 VkCommandBuffer commandBuffer = VK_NULL_HANDLE;
107 VkQueue queue = VK_NULL_HANDLE;
108 };
109 std::vector<CommandBufferOnQueue> unfinishedCommandBuffers;
110
111 for (const VkQueue& queue : deviceWithQueues.queues) {
112 std::vector<VkCheckpointDataNV> checkpointDatas;
113
114 uint32_t checkpointDataCount = 0;
115 deviceDispatch->vkGetQueueCheckpointDataNV(queue, &checkpointDataCount, nullptr);
116 if (checkpointDataCount == 0) continue;
117
118 checkpointDatas.resize(
119 static_cast<size_t>(checkpointDataCount),
120 VkCheckpointDataNV{
121 .sType = VK_STRUCTURE_TYPE_CHECKPOINT_DATA_NV,
122 });
123 deviceDispatch->vkGetQueueCheckpointDataNV(queue, &checkpointDataCount,
124 checkpointDatas.data());
125
126 std::unordered_set<VkCommandBuffer> unfinishedCommandBuffersForQueue;
127 for (const VkCheckpointDataNV& checkpointData : checkpointDatas) {
128 const auto& marker =
129 *reinterpret_cast<const CheckpointMarker*>(checkpointData.pCheckpointMarker);
130 if (marker.type == MarkerType::kBegin) {
131 unfinishedCommandBuffersForQueue.insert(marker.commandBuffer);
132 } else {
133 unfinishedCommandBuffersForQueue.erase(marker.commandBuffer);
134 }
135 }
136
137 for (const VkCommandBuffer commandBuffer : unfinishedCommandBuffersForQueue) {
138 unfinishedCommandBuffers.push_back(CommandBufferOnQueue{
139 .commandBuffer = commandBuffer,
140 .queue = queue,
141 });
142 }
143 }
144
145 if (unfinishedCommandBuffers.empty()) {
146 ERR("VkDevice:%p has no outstanding VkCommandBuffers.", device);
147 } else {
148 ERR("VkDevice:%p has outstanding VkCommandBuffers:", device);
149 for (const CommandBufferOnQueue& unfinished : unfinishedCommandBuffers) {
150 ERR(" - VkCommandBuffer:%p on VkQueue:%p", unfinished.commandBuffer,
151 unfinished.queue);
152 }
153 }
154 }
155
156 ERR("DeviceLostHelper finished lost device checks.");
157 }
158
159 } // namespace vk
160 } // namespace gfxstream