xref: /aosp_15_r20/external/crosvm/devices/src/virtio/vhost/user/device/gpu.rs (revision bb4ee6a4ae7042d18b07a98463b9c8b875e44b39)
// Copyright 2021 The ChromiumOS Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

pub mod sys;

use std::cell::RefCell;
use std::rc::Rc;
use std::sync::Arc;

use anyhow::anyhow;
use anyhow::bail;
use anyhow::Context;
use base::error;
use base::warn;
use base::Tube;
use cros_async::EventAsync;
use cros_async::Executor;
use cros_async::TaskHandle;
use futures::FutureExt;
use futures::StreamExt;
use sync::Mutex;
pub use sys::run_gpu_device;
pub use sys::Options;
use vm_memory::GuestMemory;
use vmm_vhost::message::VhostUserProtocolFeatures;
use vmm_vhost::VHOST_USER_F_PROTOCOL_FEATURES;

use crate::virtio::device_constants::gpu::NUM_QUEUES;
use crate::virtio::gpu;
use crate::virtio::gpu::QueueReader;
use crate::virtio::vhost::user::device::handler::Error as DeviceError;
use crate::virtio::vhost::user::device::handler::VhostBackendReqConnection;
use crate::virtio::vhost::user::device::handler::VhostUserDevice;
use crate::virtio::vhost::user::device::handler::WorkerState;
use crate::virtio::DescriptorChain;
use crate::virtio::Gpu;
use crate::virtio::Queue;
use crate::virtio::SharedMemoryMapper;
use crate::virtio::SharedMemoryRegion;
use crate::virtio::VirtioDevice;

const MAX_QUEUE_NUM: usize = NUM_QUEUES;

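/// Cloneable handle to a refcounted virtio queue, allowing the control queue to be shared between
/// the fence handler and the queue-processing task.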
#[derive(Clone)]
struct SharedReader {
    queue: Arc<Mutex<Queue>>,
}

impl gpu::QueueReader for SharedReader {
    fn pop(&self) -> Option<DescriptorChain> {
        self.queue.lock().pop()
    }

    fn add_used(&self, desc_chain: DescriptorChain, len: u32) {
        self.queue.lock().add_used(desc_chain, len)
    }

    fn signal_used(&self) {
        self.queue.lock().trigger_interrupt();
    }
}

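/// Handles kicks on the GPU control queue, forwarding available descriptor chains to the GPU
/// frontend and signaling the guest when any were used.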
async fn run_ctrl_queue(
    reader: SharedReader,
    mem: GuestMemory,
    kick_evt: EventAsync,
    state: Rc<RefCell<gpu::Frontend>>,
) {
    loop {
        if let Err(e) = kick_evt.next_val().await {
            error!("Failed to read kick event for ctrl queue: {}", e);
            break;
        }

        let mut state = state.borrow_mut();
        let needs_interrupt = state.process_queue(&mem, &reader);

        if needs_interrupt {
            reader.signal_used();
        }
    }
}

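/// vhost-user device backend for virtio-gpu.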
struct GpuBackend {
    ex: Executor,
    gpu: Rc<RefCell<Gpu>>,
    resource_bridges: Arc<Mutex<Vec<Tube>>>,
    state: Option<Rc<RefCell<gpu::Frontend>>>,
    fence_state: Arc<Mutex<gpu::FenceState>>,
    queue_workers: [Option<WorkerState<Arc<Mutex<Queue>>, ()>>; MAX_QUEUE_NUM],
    // In the downstream, we may add platform workers after start_platform_workers returns.
    platform_worker_tx: futures::channel::mpsc::UnboundedSender<TaskHandle<()>>,
    platform_worker_rx: futures::channel::mpsc::UnboundedReceiver<TaskHandle<()>>,
    shmem_mapper: Arc<Mutex<Option<Box<dyn SharedMemoryMapper>>>>,
}

impl GpuBackend {
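    /// Drains the platform worker channel and cancels every worker queued so far.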
    fn stop_non_queue_workers(&mut self) -> anyhow::Result<()> {
        self.ex
            .run_until(async {
                while let Some(Some(handle)) = self.platform_worker_rx.next().now_or_never() {
                    handle.cancel().await;
                }
            })
            .context("stopping the non-queue workers for GPU")?;
        Ok(())
    }
}

impl VhostUserDevice for GpuBackend {
    fn max_queue_num(&self) -> usize {
        MAX_QUEUE_NUM
    }

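    /// Reports the GPU device's virtio features together with the vhost-user protocol features
    /// bit.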
    fn features(&self) -> u64 {
        self.gpu.borrow().features() | 1 << VHOST_USER_F_PROTOCOL_FEATURES
    }

    fn ack_features(&mut self, value: u64) -> anyhow::Result<()> {
        self.gpu.borrow_mut().ack_features(value);
        Ok(())
    }

    fn protocol_features(&self) -> VhostUserProtocolFeatures {
        VhostUserProtocolFeatures::CONFIG
            | VhostUserProtocolFeatures::BACKEND_REQ
            | VhostUserProtocolFeatures::MQ
            | VhostUserProtocolFeatures::SHARED_MEMORY_REGIONS
            | VhostUserProtocolFeatures::DEVICE_STATE
    }

    fn read_config(&self, offset: u64, dst: &mut [u8]) {
        self.gpu.borrow().read_config(offset, dst)
    }

    fn write_config(&self, offset: u64, data: &[u8]) {
        self.gpu.borrow_mut().write_config(offset, data)
    }

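    /// Starts a worker for the queue at `idx`. The control queue (0) gets a task that services
    /// guest requests; the cursor queue (1) only gets a placeholder worker so the queue can still
    /// be returned by `stop_queue`.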
    fn start_queue(&mut self, idx: usize, queue: Queue, mem: GuestMemory) -> anyhow::Result<()> {
        if self.queue_workers[idx].is_some() {
            warn!("Starting new queue handler without stopping old handler");
            self.stop_queue(idx)?;
        }

        let doorbell = queue.interrupt().clone();

        // Create a refcounted queue. The GPU control queue uses a SharedReader which allows us to
        // handle fences in the RutabagaFenceHandler, and also handle queue messages in
        // `run_ctrl_queue`.
        // For the cursor queue, we still create the refcounted queue to support retrieving the
        // queue for snapshotting (but don't handle any messages).
        let queue = Arc::new(Mutex::new(queue));

        // Spawn a worker for the queue.
        let queue_task = match idx {
            0 => {
                // Set up worker for the control queue.
                let kick_evt = queue
                    .lock()
                    .event()
                    .try_clone()
                    .context("failed to clone queue event")?;
                let kick_evt = EventAsync::new(kick_evt, &self.ex)
                    .context("failed to create EventAsync for kick_evt")?;
                let reader = SharedReader {
                    queue: queue.clone(),
                };

                let state = if let Some(s) = self.state.as_ref() {
                    s.clone()
                } else {
                    let fence_handler_resources =
                        Arc::new(Mutex::new(Some(gpu::FenceHandlerActivationResources {
                            mem: mem.clone(),
                            ctrl_queue: reader.clone(),
                        })));
                    let fence_handler = gpu::create_fence_handler(
                        fence_handler_resources,
                        self.fence_state.clone(),
                    );

                    let state = Rc::new(RefCell::new(
                        self.gpu
                            .borrow_mut()
                            .initialize_frontend(
                                self.fence_state.clone(),
                                fence_handler,
                                Arc::clone(&self.shmem_mapper),
                            )
                            .ok_or_else(|| anyhow!("failed to initialize gpu frontend"))?,
                    ));
                    self.state = Some(state.clone());
                    state
                };

                // Start handling platform-specific workers.
                self.start_platform_workers(doorbell)?;

                // Start handling the control queue.
                self.ex
                    .spawn_local(run_ctrl_queue(reader, mem, kick_evt, state))
            }
            1 => {
                // For the cursor queue, spawn an empty worker, as we don't process it at all.
                // We don't handle the cursor queue because no current users of vhost-user GPU pass
                // any messages on it.
                self.ex.spawn_local(async {})
            }
            _ => bail!("attempted to start unknown queue: {}", idx),
        };

        self.queue_workers[idx] = Some(WorkerState { queue_task, queue });
        Ok(())
    }

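    /// Cancels the worker for the queue at `idx` and returns the queue. For the control queue,
    /// the platform workers and the frontend state are torn down first so that the only remaining
    /// reference to the queue is the one held by the worker.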
    fn stop_queue(&mut self, idx: usize) -> anyhow::Result<Queue> {
        if let Some(worker) = self.queue_workers.get_mut(idx).and_then(Option::take) {
            // Wait for queue_task to be aborted.
            let _ = self.ex.run_until(worker.queue_task.cancel());

            if idx == 0 {
                // Stop the non-queue workers if this is the control queue (where we start them).
                self.stop_non_queue_workers()?;

                // After we stop all workers, we have only one reference left to self.state.
                // Clearing it allows the GPU state to be destroyed, which gets rid of the
                // remaining control queue reference from RutabagaFenceHandler.
                // This allows our worker.queue to be recovered as it has no further references.
                self.state = None;
            }

            let queue = match Arc::try_unwrap(worker.queue) {
                Ok(queue_mutex) => queue_mutex.into_inner(),
                Err(_) => panic!("failed to recover queue from worker"),
            };

            Ok(queue)
        } else {
            Err(anyhow::Error::new(DeviceError::WorkerNotFound))
        }
    }

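    /// Entering the suspended state only requires stopping the platform workers here; queue
    /// workers are stopped individually through `stop_queue`.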
    fn enter_suspended_state(&mut self) -> anyhow::Result<()> {
        self.stop_non_queue_workers()?;
        Ok(())
    }

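    /// Stops the platform workers and any queue workers that are still running.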
    fn reset(&mut self) {
        self.stop_non_queue_workers()
            .expect("Failed to stop platform workers.");

        for queue_num in 0..self.max_queue_num() {
            // The cursor queue is never used, so we should check if the queue is set before
            // stopping.
            if self.queue_workers[queue_num].is_some() {
                if let Err(e) = self.stop_queue(queue_num) {
                    error!("Failed to stop_queue during reset: {}", e);
                }
            }
        }
    }

    fn get_shared_memory_region(&self) -> Option<SharedMemoryRegion> {
        self.gpu.borrow().get_shared_memory_region()
    }

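    /// Stores the shared memory mapper taken from the frontend connection; it is handed to the
    /// GPU frontend when the control queue is started.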
    fn set_backend_req_connection(&mut self, conn: Arc<VhostBackendReqConnection>) {
        if self
            .shmem_mapper
            .lock()
            .replace(conn.take_shmem_mapper().unwrap())
            .is_some()
        {
            warn!("Connection already established. Overwriting shmem_mapper");
        }
    }

    fn snapshot(&mut self) -> anyhow::Result<serde_json::Value> {
        // TODO(b/289431114): Snapshot more fields if needed. Right now we just need a bare bones
        // snapshot of the GPU to create a POC.
        Ok(serde_json::Value::Null)
    }

    fn restore(&mut self, data: serde_json::Value) -> anyhow::Result<()> {
        anyhow::ensure!(
            data.is_null(),
            "unexpected snapshot data: should be null, got {}",
            data
        );
        Ok(())
    }
}

impl Drop for GpuBackend {
    fn drop(&mut self) {
        // Workers are detached and will leak unless they are aborted. Aborting marks the
        // Abortable task, then wakes it up. This means the executor should be asked to continue
        // running for one more step after the backend is destroyed.
        self.reset();
    }
}