xref: /aosp_15_r20/external/crosvm/devices/src/virtio/fs/passthrough.rs (revision bb4ee6a4ae7042d18b07a98463b9c8b875e44b39)
1 // Copyright 2019 The ChromiumOS Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 use std::borrow::Cow;
6 use std::cell::RefCell;
7 use std::cmp;
8 use std::collections::btree_map;
9 use std::collections::BTreeMap;
10 use std::ffi::CStr;
11 use std::ffi::CString;
12 #[cfg(feature = "fs_runtime_ugid_map")]
13 use std::ffi::OsStr;
14 use std::fs::File;
15 use std::io;
16 use std::mem;
17 use std::mem::size_of;
18 use std::mem::MaybeUninit;
19 use std::os::raw::c_int;
20 use std::os::raw::c_long;
21 #[cfg(feature = "fs_runtime_ugid_map")]
22 use std::os::unix::ffi::OsStrExt;
23 #[cfg(feature = "fs_runtime_ugid_map")]
24 use std::path::Path;
25 use std::ptr;
26 use std::ptr::addr_of;
27 use std::ptr::addr_of_mut;
28 use std::sync::atomic::AtomicBool;
29 use std::sync::atomic::AtomicU64;
30 use std::sync::atomic::Ordering;
31 use std::sync::Arc;
32 use std::sync::MutexGuard;
33 #[cfg(feature = "fs_permission_translation")]
34 use std::sync::RwLock;
35 use std::time::Duration;
36 
37 #[cfg(feature = "arc_quota")]
38 use base::debug;
39 use base::error;
40 use base::ioctl_ior_nr;
41 use base::ioctl_iow_nr;
42 use base::ioctl_iowr_nr;
43 use base::ioctl_with_mut_ptr;
44 use base::ioctl_with_ptr;
45 use base::syscall;
46 use base::unix::FileFlags;
47 use base::warn;
48 use base::AsRawDescriptor;
49 use base::FromRawDescriptor;
50 use base::IoctlNr;
51 use base::Protection;
52 use base::RawDescriptor;
53 use fuse::filesystem::Context;
54 use fuse::filesystem::DirectoryIterator;
55 use fuse::filesystem::Entry;
56 use fuse::filesystem::FileSystem;
57 use fuse::filesystem::FsOptions;
58 use fuse::filesystem::GetxattrReply;
59 use fuse::filesystem::IoctlFlags;
60 use fuse::filesystem::IoctlReply;
61 use fuse::filesystem::ListxattrReply;
62 use fuse::filesystem::OpenOptions;
63 use fuse::filesystem::RemoveMappingOne;
64 use fuse::filesystem::SetattrValid;
65 use fuse::filesystem::ZeroCopyReader;
66 use fuse::filesystem::ZeroCopyWriter;
67 use fuse::filesystem::ROOT_ID;
68 use fuse::sys::WRITE_KILL_PRIV;
69 use fuse::Mapper;
70 #[cfg(feature = "arc_quota")]
71 use protobuf::Message;
72 use sync::Mutex;
73 #[cfg(feature = "arc_quota")]
74 use system_api::client::OrgChromiumSpaced;
75 #[cfg(feature = "arc_quota")]
76 use system_api::spaced::SetProjectIdReply;
77 #[cfg(feature = "arc_quota")]
78 use system_api::spaced::SetProjectInheritanceFlagReply;
79 use zerocopy::AsBytes;
80 use zerocopy::FromBytes;
81 use zerocopy::FromZeroes;
82 
83 #[cfg(feature = "arc_quota")]
84 use crate::virtio::fs::arc_ioctl::FsPathXattrDataBuffer;
85 #[cfg(feature = "arc_quota")]
86 use crate::virtio::fs::arc_ioctl::FsPermissionDataBuffer;
87 #[cfg(feature = "arc_quota")]
88 use crate::virtio::fs::arc_ioctl::XattrData;
89 use crate::virtio::fs::caps::Capability;
90 use crate::virtio::fs::caps::Caps;
91 use crate::virtio::fs::caps::Set as CapSet;
92 use crate::virtio::fs::caps::Value as CapValue;
93 use crate::virtio::fs::config::CachePolicy;
94 use crate::virtio::fs::config::Config;
95 #[cfg(feature = "fs_permission_translation")]
96 use crate::virtio::fs::config::PermissionData;
97 use crate::virtio::fs::expiring_map::ExpiringMap;
98 use crate::virtio::fs::multikey::MultikeyBTreeMap;
99 use crate::virtio::fs::read_dir::ReadDir;
100 
// Empty path; `stat()` passes it together with `AT_EMPTY_PATH` so that `fstatat64` operates on the
// descriptor itself.
const EMPTY_CSTR: &CStr = c"";
// Path that `PassthroughFs::new` opens and keeps as its `proc` descriptor.
const PROC_CSTR: &CStr = c"/proc";
const UNLABELED_CSTR: &CStr = c"unlabeled";

// Extended attribute name prefixes / names, stored as raw bytes without a trailing NUL.
const USER_VIRTIOFS_XATTR: &[u8] = b"user.virtiofs.";
const SECURITY_XATTR: &[u8] = b"security.";
const SELINUX_XATTR: &[u8] = b"security.selinux";

// Length in bytes of `fscrypt_policy_v1::_master_key_descriptor`.
const FSCRYPT_KEY_DESCRIPTOR_SIZE: usize = 8;
// Length in bytes of `fscrypt_policy_v2::master_key_identifier`.
const FSCRYPT_KEY_IDENTIFIER_SIZE: usize = 16;

// NOTE(review): named after the kernel's project-inheritance inode flag; confirm the value
// against `linux/fs.h` before changing it.
#[cfg(feature = "arc_quota")]
const FS_PROJINHERIT_FL: c_int = 0x20000000;

// 25 seconds is the default timeout for dbus-send.
#[cfg(feature = "arc_quota")]
const DEFAULT_DBUS_TIMEOUT: Duration = Duration::from_secs(25);
118 
/// Internal utility wrapper for `cros_tracing::trace_event!()` macro with VirtioFS calls.
///
/// Expands to `cros_tracing::trace_event!(VirtioFs, $name, $tag, args...)`: the category is
/// always `VirtioFs`, and the event name is forwarded *before* the tag even though callers pass
/// the tag first. At least one trailing argument is required (`+` repetition in the matcher).
macro_rules! fs_trace {
    ($tag:expr, $name:expr, $($arg:expr),+) => {
        cros_tracing::trace_event!(VirtioFs, $name, $tag, $($arg),*)
    };
}
125 
/// Version 1 encryption policy record: four one-byte fields followed by an 8-byte master key
/// descriptor (12 bytes in total, `#[repr(C)]`).
///
/// NOTE(review): the name and fields suggest this mirrors the kernel's
/// `struct fscrypt_policy_v1`; confirm against `linux/fscrypt.h` before changing the layout.
#[repr(C)]
#[derive(Clone, Copy, AsBytes, FromZeroes, FromBytes)]
struct fscrypt_policy_v1 {
    _version: u8,
    _contents_encryption_mode: u8,
    _filenames_encryption_mode: u8,
    _flags: u8,
    _master_key_descriptor: [u8; FSCRYPT_KEY_DESCRIPTOR_SIZE],
}
135 
/// Version 2 encryption policy record: four one-byte fields, four reserved bytes and a 16-byte
/// master key identifier (24 bytes in total, `#[repr(C)]`).
///
/// NOTE(review): the name and fields suggest this mirrors the kernel's
/// `struct fscrypt_policy_v2`; confirm against `linux/fscrypt.h` before changing the layout.
#[repr(C)]
#[derive(Clone, Copy, AsBytes, FromZeroes, FromBytes)]
struct fscrypt_policy_v2 {
    _version: u8,
    _contents_encryption_mode: u8,
    _filenames_encryption_mode: u8,
    _flags: u8,
    __reserved: [u8; 4],
    master_key_identifier: [u8; FSCRYPT_KEY_IDENTIFIER_SIZE],
}
146 
/// Storage large enough for either policy version.
///
/// All members start at offset 0, so `_version` overlays the leading version byte of both
/// `_v1` and `_v2`. The size of this union is the upper bound that the byte-slice conversion of
/// `fscrypt_get_policy_ex_arg` checks `policy_size` against.
#[repr(C)]
#[derive(Copy, Clone, FromZeroes, FromBytes)]
union fscrypt_policy {
    _version: u8,
    _v1: fscrypt_policy_v1,
    _v2: fscrypt_policy_v2,
}
154 
/// Argument block exchanged with the "get encryption policy (extended)" ioctl: an 8-byte size
/// field immediately followed by the policy storage.
#[repr(C)]
#[derive(Copy, Clone, FromZeroes, FromBytes)]
struct fscrypt_get_policy_ex_arg {
    // Number of bytes of `policy` that are meaningful.
    policy_size: u64,       /* input/output */
    policy: fscrypt_policy, /* output */
}
161 
162 impl From<&fscrypt_get_policy_ex_arg> for &[u8] {
from(value: &fscrypt_get_policy_ex_arg) -> Self163     fn from(value: &fscrypt_get_policy_ex_arg) -> Self {
164         assert!(value.policy_size <= size_of::<fscrypt_policy>() as u64);
165         let data_raw: *const fscrypt_get_policy_ex_arg = value;
166         // SAFETY: the length of the output slice is asserted to be within the struct it points to
167         unsafe {
168             std::slice::from_raw_parts(
169                 data_raw.cast(),
170                 value.policy_size as usize + size_of::<u64>(),
171             )
172         }
173     }
174 }
175 
// Read/write ioctl number ('f', 22). The payload type is declared as `[u8; 9]` rather than
// `fscrypt_get_policy_ex_arg`.
// NOTE(review): presumably this matches the kernel header's `__u8[9]` (size + version)
// declaration, since the payload size is part of the ioctl number; confirm against
// `linux/fscrypt.h`.
ioctl_iowr_nr!(FS_IOC_GET_ENCRYPTION_POLICY_EX, 'f' as u32, 22, [u8; 9]);
177 
/// Payload of the `FS_IOC_FSGETXATTR` / `FS_IOC_FSSETXATTR` ioctls: five `u32` fields plus
/// eight bytes of padding (28 bytes in total, `#[repr(C)]`).
#[repr(C)]
#[derive(Clone, Copy, AsBytes, FromZeroes, FromBytes)]
struct fsxattr {
    fsx_xflags: u32,     /* xflags field value (get/set) */
    fsx_extsize: u32,    /* extsize field value (get/set) */
    fsx_nextents: u32,   /* nextents field value (get) */
    fsx_projid: u32,     /* project identifier (get/set) */
    fsx_cowextsize: u32, /* CoW extsize field value (get/set) */
    fsx_pad: [u8; 8],
}
188 
// Get/set the `fsxattr` record of a file.
ioctl_ior_nr!(FS_IOC_FSGETXATTR, 'X' as u32, 31, fsxattr);
ioctl_iow_nr!(FS_IOC_FSSETXATTR, 'X' as u32, 32, fsxattr);

// Get/set inode flags with a native `long` payload.
ioctl_ior_nr!(FS_IOC_GETFLAGS, 'f' as u32, 1, c_long);
ioctl_iow_nr!(FS_IOC_SETFLAGS, 'f' as u32, 2, c_long);

// Same type/number as above but declared with a fixed 32-bit payload.
ioctl_ior_nr!(FS_IOC32_GETFLAGS, 'f' as u32, 1, u32);
ioctl_iow_nr!(FS_IOC32_SETFLAGS, 'f' as u32, 2, u32);

// Same type/number as above but declared with a fixed 64-bit payload.
ioctl_ior_nr!(FS_IOC64_GETFLAGS, 'f' as u32, 1, u64);
ioctl_iow_nr!(FS_IOC64_SETFLAGS, 'f' as u32, 2, u64);

// Both requests below use type 'f' and number 1; they differ only in the payload type.
// NOTE(review): presumably the differing payload sizes keep the resulting request values
// distinct; confirm against the definitions in `arc_ioctl`.
#[cfg(feature = "arc_quota")]
ioctl_iow_nr!(FS_IOC_SETPERMISSION, 'f' as u32, 1, FsPermissionDataBuffer);
#[cfg(feature = "arc_quota")]
ioctl_iow_nr!(FS_IOC_SETPATHXATTR, 'f' as u32, 1, FsPathXattrDataBuffer);
205 
/// Payload of the `FS_IOC_ENABLE_VERITY` ioctl (128 bytes, `#[repr(C)]`).
///
/// `salt_ptr` / `sig_ptr` are 64-bit values paired with `salt_size` / `sig_size`.
/// NOTE(review): presumably these are user-space addresses of out-of-line buffers, as in the
/// kernel's `struct fsverity_enable_arg`; confirm against `linux/fsverity.h`.
#[repr(C)]
#[derive(Clone, Copy, AsBytes, FromZeroes, FromBytes)]
struct fsverity_enable_arg {
    _version: u32,
    _hash_algorithm: u32,
    _block_size: u32,
    salt_size: u32,
    salt_ptr: u64,
    sig_size: u32,
    __reserved1: u32,
    sig_ptr: u64,
    __reserved2: [u64; 11],
}
219 
/// Fixed-size header of the `FS_IOC_MEASURE_VERITY` payload (4 bytes, `#[repr(C)]`).
///
/// As the trailing comment indicates, the C definition continues with a flexible array of
/// digest bytes that is not represented in this struct; `digest_size` carries its length.
#[repr(C)]
#[derive(Clone, Copy, AsBytes, FromZeroes, FromBytes)]
struct fsverity_digest {
    _digest_algorithm: u16,
    digest_size: u16,
    // __u8 digest[];
}
227 
// Write-only request carrying a `fsverity_enable_arg`.
ioctl_iow_nr!(FS_IOC_ENABLE_VERITY, 'f' as u32, 133, fsverity_enable_arg);
// Read/write request whose payload starts with a `fsverity_digest` header.
ioctl_iowr_nr!(FS_IOC_MEASURE_VERITY, 'f' as u32, 134, fsverity_digest);
230 
/// Identifier of an inode tracked by the file system (primary key of `PassthroughFs::inodes`).
pub type Inode = u64;
/// Identifier of an open file or directory (key of `PassthroughFs::handles`).
type Handle = u64;
233 
/// Alternate key of the inode table in `PassthroughFs::inodes`: an (`ino`, `dev`) pair.
#[derive(Clone, Copy, Debug, PartialOrd, Ord, PartialEq, Eq)]
struct InodeAltKey {
    ino: libc::ino64_t,
    dev: libc::dev_t,
}
239 
/// Coarse classification of a file derived from its mode bits (see the `From<libc::mode_t>`
/// implementation): regular file, directory, or anything else.
#[derive(PartialEq, Eq, Debug)]
enum FileType {
    Regular,
    Directory,
    Other,
}
246 
247 impl From<libc::mode_t> for FileType {
from(mode: libc::mode_t) -> Self248     fn from(mode: libc::mode_t) -> Self {
249         match mode & libc::S_IFMT {
250             libc::S_IFREG => FileType::Regular,
251             libc::S_IFDIR => FileType::Directory,
252             _ => FileType::Other,
253         }
254     }
255 }
256 
/// Book-keeping for one inode tracked in `PassthroughFs::inodes`.
#[derive(Debug)]
struct InodeData {
    // Identifier of this entry; also used as the key of the casefold lookup caches.
    inode: Inode,
    // (File, open_flags)
    file: Mutex<(File, libc::c_int)>,
    // NOTE(review): presumably counts outstanding references from the guest; the code that
    // updates it is outside this part of the file.
    refcount: AtomicU64,
    filetype: FileType,
    // NOTE(review): presumably the path of the file relative to the served root; confirm
    // against the code that constructs `InodeData`.
    path: String,
}
266 
267 impl AsRawDescriptor for InodeData {
as_raw_descriptor(&self) -> RawDescriptor268     fn as_raw_descriptor(&self) -> RawDescriptor {
269         self.file.lock().0.as_raw_descriptor()
270     }
271 }
272 
/// Book-keeping for one open handle tracked in `PassthroughFs::handles`.
#[derive(Debug)]
struct HandleData {
    // Inode this handle belongs to.
    inode: Inode,
    // The open file backing the handle.
    file: Mutex<File>,
}
278 
279 impl AsRawDescriptor for HandleData {
as_raw_descriptor(&self) -> RawDescriptor280     fn as_raw_descriptor(&self) -> RawDescriptor {
281         self.file.lock().as_raw_descriptor()
282     }
283 }
284 
/// Generates a guard type `$name` that temporarily changes one credential of the current
/// thread.
///
/// * `$ty` is the id type (`uid_t` / `gid_t`).
/// * `$syscall_nr` is the raw syscall number invoked as `syscall(nr, -1, id, -1)`, i.e. only
///   the middle (effective) id of the three arguments is changed.
///
/// `$name::new(val, old)` returns `Ok(None)` when nothing has to change, `Ok(Some(guard))` after
/// a successful switch, and the OS error otherwise. Dropping the guard switches back to `old`
/// and logs an error if that fails.
macro_rules! scoped_cred {
    ($name:ident, $ty:ty, $syscall_nr:expr) => {
        #[derive(Debug)]
        struct $name {
            old: $ty,
        }

        impl $name {
            // Changes the effective uid/gid of the current thread to `val`. Changes the thread's
            // credentials back to `old` when the returned struct is dropped.
            fn new(val: $ty, old: $ty) -> io::Result<Option<$name>> {
                if val == old {
                    // Nothing to do since we already have the correct value.
                    return Ok(None);
                }

                // We want credential changes to be per-thread because otherwise
                // we might interfere with operations being carried out on other
                // threads with different uids/gids.  However, posix requires that
                // all threads in a process share the same credentials.  To do this
                // libc uses signals to ensure that when one thread changes its
                // credentials the other threads do the same thing.
                //
                // So instead we invoke the syscall directly in order to get around
                // this limitation.  Another option is to use the setfsuid and
                // setfsgid systems calls.   However since those calls have no way to
                // return an error, it's preferable to do this instead.

                // SAFETY: this call is safe because it doesn't modify any memory and we
                // check the return value.
                let res = unsafe { libc::syscall($syscall_nr, -1, val, -1) };
                if res == 0 {
                    Ok(Some($name { old }))
                } else {
                    Err(io::Error::last_os_error())
                }
            }
        }

        impl Drop for $name {
            fn drop(&mut self) {
                // Best effort: a failure to switch back is logged rather than propagated
                // because `drop` cannot return an error.
                // SAFETY: trivially safe
                let res = unsafe { libc::syscall($syscall_nr, -1, self.old, -1) };
                if res < 0 {
                    error!(
                        "failed to change credentials back to {}: {}",
                        self.old,
                        io::Error::last_os_error(),
                    );
                }
            }
        }
    };
}
// Guard types for the effective uid and gid. 32-bit ARM is given the `*32` syscall numbers.
// NOTE(review): presumably because the un-suffixed syscalls on that architecture only handle
// 16-bit ids; confirm against the kernel syscall table.
#[cfg(not(target_arch = "arm"))]
scoped_cred!(ScopedUid, libc::uid_t, libc::SYS_setresuid);
#[cfg(target_arch = "arm")]
scoped_cred!(ScopedUid, libc::uid_t, libc::SYS_setresuid32);

#[cfg(not(target_arch = "arm"))]
scoped_cred!(ScopedGid, libc::gid_t, libc::SYS_setresgid);
#[cfg(target_arch = "arm")]
scoped_cred!(ScopedGid, libc::gid_t, libc::SYS_setresgid32);

// Syscall numbers used to query the effective uid/gid, selected per architecture in the same
// way as above.
#[cfg(not(target_arch = "arm"))]
const SYS_GETEUID: libc::c_long = libc::SYS_geteuid;
#[cfg(target_arch = "arm")]
const SYS_GETEUID: libc::c_long = libc::SYS_geteuid32;

#[cfg(not(target_arch = "arm"))]
const SYS_GETEGID: libc::c_long = libc::SYS_getegid;
#[cfg(target_arch = "arm")]
const SYS_GETEGID: libc::c_long = libc::SYS_getegid32;
358 
// Effective uid/gid of each thread, obtained with a raw syscall when the thread first accesses
// the value. `set_creds` reads these as the ids to restore once a credential guard is dropped.
thread_local! {
    // SAFETY: both calls take no parameters and only return an integer value. The kernel also
    // guarantees that they can never fail.
    static THREAD_EUID: libc::uid_t = unsafe { libc::syscall(SYS_GETEUID) as libc::uid_t };
    // SAFETY: both calls take no parameters and only return an integer value. The kernel also
    // guarantees that they can never fail.
    static THREAD_EGID: libc::gid_t = unsafe { libc::syscall(SYS_GETEGID) as libc::gid_t };
}
367 
set_creds( uid: libc::uid_t, gid: libc::gid_t, ) -> io::Result<(Option<ScopedUid>, Option<ScopedGid>)>368 fn set_creds(
369     uid: libc::uid_t,
370     gid: libc::gid_t,
371 ) -> io::Result<(Option<ScopedUid>, Option<ScopedGid>)> {
372     let olduid = THREAD_EUID.with(|uid| *uid);
373     let oldgid = THREAD_EGID.with(|gid| *gid);
374 
375     // We have to change the gid before we change the uid because if we change the uid first then we
376     // lose the capability to change the gid.  However changing back can happen in any order.
377     ScopedGid::new(gid, oldgid).and_then(|gid| Ok((ScopedUid::new(uid, olduid)?, gid)))
378 }
379 
// Per-thread handle to `/proc/thread-self/attr/fscreate`. Starts out as `None` and is opened on
// first use by `ScopedSecurityContext::new`.
thread_local!(static THREAD_FSCREATE: RefCell<Option<File>> = const { RefCell::new(None) });
381 
382 // Opens and returns a write-only handle to /proc/thread-self/attr/fscreate. Panics if it fails to
383 // open the file.
open_fscreate(proc: &File) -> File384 fn open_fscreate(proc: &File) -> File {
385     let fscreate = c"thread-self/attr/fscreate";
386 
387     // SAFETY: this doesn't modify any memory and we check the return value.
388     let raw_descriptor = unsafe {
389         libc::openat(
390             proc.as_raw_descriptor(),
391             fscreate.as_ptr(),
392             libc::O_CLOEXEC | libc::O_WRONLY,
393         )
394     };
395 
396     // We don't expect this to fail and we're not in a position to return an error here so just
397     // panic.
398     if raw_descriptor < 0 {
399         panic!(
400             "Failed to open /proc/thread-self/attr/fscreate: {}",
401             io::Error::last_os_error()
402         );
403     }
404 
405     // SAFETY: safe because we just opened this descriptor.
406     unsafe { File::from_raw_descriptor(raw_descriptor) }
407 }
408 
409 struct ScopedSecurityContext;
410 
411 impl ScopedSecurityContext {
new(proc: &File, ctx: &CStr) -> io::Result<ScopedSecurityContext>412     fn new(proc: &File, ctx: &CStr) -> io::Result<ScopedSecurityContext> {
413         THREAD_FSCREATE.with(|thread_fscreate| {
414             let mut fscreate = thread_fscreate.borrow_mut();
415             let file = fscreate.get_or_insert_with(|| open_fscreate(proc));
416             // SAFETY: this doesn't modify any memory and we check the return value.
417             let ret = unsafe {
418                 libc::write(
419                     file.as_raw_descriptor(),
420                     ctx.as_ptr() as *const libc::c_void,
421                     ctx.to_bytes_with_nul().len(),
422                 )
423             };
424             if ret < 0 {
425                 Err(io::Error::last_os_error())
426             } else {
427                 Ok(ScopedSecurityContext)
428             }
429         })
430     }
431 }
432 
433 impl Drop for ScopedSecurityContext {
drop(&mut self)434     fn drop(&mut self) {
435         THREAD_FSCREATE.with(|thread_fscreate| {
436             // expect is safe here because the thread local would have been initialized by the call
437             // to `new` above.
438             let fscreate = thread_fscreate.borrow();
439             let file = fscreate
440                 .as_ref()
441                 .expect("Uninitialized thread-local when dropping ScopedSecurityContext");
442 
443             // SAFETY: this doesn't modify any memory and we check the return value.
444             let ret = unsafe { libc::write(file.as_raw_descriptor(), ptr::null(), 0) };
445 
446             if ret < 0 {
447                 warn!(
448                     "Failed to restore security context: {}",
449                     io::Error::last_os_error()
450                 );
451             }
452         })
453     }
454 }
455 
456 struct ScopedUmask {
457     old: libc::mode_t,
458     mask: libc::mode_t,
459 }
460 
461 impl ScopedUmask {
new(mask: libc::mode_t) -> ScopedUmask462     fn new(mask: libc::mode_t) -> ScopedUmask {
463         ScopedUmask {
464             // SAFETY: this doesn't modify any memory and always succeeds.
465             old: unsafe { libc::umask(mask) },
466             mask,
467         }
468     }
469 }
470 
471 impl Drop for ScopedUmask {
drop(&mut self)472     fn drop(&mut self) {
473         // SAFETY: this doesn't modify any memory and always succeeds.
474         let previous = unsafe { libc::umask(self.old) };
475         debug_assert_eq!(
476             previous, self.mask,
477             "umask changed while holding ScopedUmask"
478         );
479     }
480 }
481 
482 struct ScopedFsetid(Caps);
483 impl Drop for ScopedFsetid {
drop(&mut self)484     fn drop(&mut self) {
485         if let Err(e) = raise_cap_fsetid(&mut self.0) {
486             error!(
487                 "Failed to restore CAP_FSETID: {}.  Some operations may be broken.",
488                 e
489             )
490         }
491     }
492 }
493 
raise_cap_fsetid(c: &mut Caps) -> io::Result<()>494 fn raise_cap_fsetid(c: &mut Caps) -> io::Result<()> {
495     c.update(&[Capability::Fsetid], CapSet::Effective, CapValue::Set)?;
496     c.apply()
497 }
498 
499 // Drops CAP_FSETID from the effective set for the current thread and returns an RAII guard that
500 // adds the capability back when it is dropped.
drop_cap_fsetid() -> io::Result<ScopedFsetid>501 fn drop_cap_fsetid() -> io::Result<ScopedFsetid> {
502     let mut caps = Caps::for_current_thread()?;
503     caps.update(&[Capability::Fsetid], CapSet::Effective, CapValue::Clear)?;
504     caps.apply()?;
505     Ok(ScopedFsetid(caps))
506 }
507 
ebadf() -> io::Error508 fn ebadf() -> io::Error {
509     io::Error::from_raw_os_error(libc::EBADF)
510 }
511 
eexist() -> io::Error512 fn eexist() -> io::Error {
513     io::Error::from_raw_os_error(libc::EEXIST)
514 }
515 
stat<F: AsRawDescriptor + ?Sized>(f: &F) -> io::Result<libc::stat64>516 fn stat<F: AsRawDescriptor + ?Sized>(f: &F) -> io::Result<libc::stat64> {
517     let mut st: MaybeUninit<libc::stat64> = MaybeUninit::<libc::stat64>::zeroed();
518 
519     // SAFETY: the kernel will only write data in `st` and we check the return value.
520     syscall!(unsafe {
521         libc::fstatat64(
522             f.as_raw_descriptor(),
523             EMPTY_CSTR.as_ptr(),
524             st.as_mut_ptr(),
525             libc::AT_EMPTY_PATH | libc::AT_SYMLINK_NOFOLLOW,
526         )
527     })?;
528 
529     // SAFETY: the kernel guarantees that the struct is now fully initialized.
530     Ok(unsafe { st.assume_init() })
531 }
532 
statat<D: AsRawDescriptor>(dir: &D, name: &CStr) -> io::Result<libc::stat64>533 fn statat<D: AsRawDescriptor>(dir: &D, name: &CStr) -> io::Result<libc::stat64> {
534     let mut st = MaybeUninit::<libc::stat64>::zeroed();
535 
536     // SAFETY: the kernel will only write data in `st` and we check the return value.
537     syscall!(unsafe {
538         libc::fstatat64(
539             dir.as_raw_descriptor(),
540             name.as_ptr(),
541             st.as_mut_ptr(),
542             libc::AT_SYMLINK_NOFOLLOW,
543         )
544     })?;
545 
546     // SAFETY: the kernel guarantees that the struct is now fully initialized.
547     Ok(unsafe { st.assume_init() })
548 }
549 
550 #[cfg(feature = "arc_quota")]
/// Returns `true` if `project_id` falls into one of the project ID ranges used by Android.
///
/// The ranges are taken from `android_filesystem_config.h` in the Android codebase.
fn is_android_project_id(project_id: u32) -> bool {
    match project_id {
        // Project IDs reserved for Android files on external storage: 100 IDs starting at
        // PROJECT_ID_EXT_DEFAULT (1000).
        1000..=1099 => true,
        // Project IDs reserved for Android apps. The lower limit is PROJECT_ID_EXT_DATA_START.
        // The upper limit differs before and after T; the larger one from T
        // (PROJECT_ID_APP_CACHE_END) is used here.
        20000..=69999 => true,
        _ => false,
    }
}
568 
569 /// Per-directory cache for `PassthroughFs::ascii_casefold_lookup()`.
570 ///
571 /// The key of the underlying `BTreeMap` is a lower-cased file name in the direcoty.
572 /// The value is the case-sensitive file name stored in the host file system.
573 /// We assume that if PassthroughFs has exclusive access to the filesystem, this cache exhaustively
574 ///  covers all file names that exist within the directory.
575 /// So every `PassthroughFs`'s handler that adds or removes files in the directory is expected to
576 /// update this cache.
577 struct CasefoldCache(BTreeMap<Vec<u8>, CString>);
578 
579 impl CasefoldCache {
new(dir: &InodeData) -> io::Result<Self>580     fn new(dir: &InodeData) -> io::Result<Self> {
581         let mut mp = BTreeMap::new();
582 
583         let mut buf = [0u8; 1024];
584         let mut offset = 0;
585         loop {
586             let mut read_dir = ReadDir::new(dir, offset, &mut buf[..])?;
587             if read_dir.remaining() == 0 {
588                 break;
589             }
590 
591             while let Some(entry) = read_dir.next() {
592                 offset = entry.offset as libc::off64_t;
593                 let entry_name = entry.name;
594                 mp.insert(
595                     entry_name.to_bytes().to_ascii_lowercase(),
596                     entry_name.to_owned(),
597                 );
598             }
599         }
600         Ok(Self(mp))
601     }
602 
insert(&mut self, name: &CStr)603     fn insert(&mut self, name: &CStr) {
604         let lower_case = name.to_bytes().to_ascii_lowercase();
605         self.0.insert(lower_case, name.into());
606     }
607 
lookup(&self, name: &[u8]) -> Option<CString>608     fn lookup(&self, name: &[u8]) -> Option<CString> {
609         let lower = name.to_ascii_lowercase();
610         self.0.get(&lower).cloned()
611     }
612 
remove(&mut self, name: &CStr)613     fn remove(&mut self, name: &CStr) {
614         let lower_case = name.to_bytes().to_ascii_lowercase();
615         self.0.remove(&lower_case);
616     }
617 }
618 
619 /// Time expiring mapping from an inode of a directory to `CasefoldCache` for the directory.
620 /// Each entry will be expired after `timeout`.
621 /// When ascii_casefold is disabled, this struct does nothing.
622 struct ExpiringCasefoldLookupCaches {
623     inner: ExpiringMap<Inode, CasefoldCache>,
624 }
625 
626 impl ExpiringCasefoldLookupCaches {
new(timeout: Duration) -> Self627     fn new(timeout: Duration) -> Self {
628         Self {
629             inner: ExpiringMap::new(timeout),
630         }
631     }
632 
insert(&mut self, parent: Inode, name: &CStr)633     fn insert(&mut self, parent: Inode, name: &CStr) {
634         if let Some(dir_cache) = self.inner.get_mut(&parent) {
635             dir_cache.insert(name);
636         }
637     }
638 
remove(&mut self, parent: Inode, name: &CStr)639     fn remove(&mut self, parent: Inode, name: &CStr) {
640         if let Some(dir_cache) = self.inner.get_mut(&parent) {
641             dir_cache.remove(name);
642         }
643     }
644 
forget(&mut self, parent: Inode)645     fn forget(&mut self, parent: Inode) {
646         self.inner.remove(&parent);
647     }
648 
649     /// Get `CasefoldCache` for the given directory.
650     /// If the cache doesn't exist, generate it by fetching directory information with
651     /// `getdents64()`.
get(&mut self, parent: &InodeData) -> io::Result<&CasefoldCache>652     fn get(&mut self, parent: &InodeData) -> io::Result<&CasefoldCache> {
653         self.inner
654             .get_or_insert_with(&parent.inode, || CasefoldCache::new(parent))
655     }
656 
657     #[cfg(test)]
exists_in_cache(&mut self, parent: Inode, name: &CStr) -> bool658     fn exists_in_cache(&mut self, parent: Inode, name: &CStr) -> bool {
659         if let Some(dir_cache) = self.inner.get(&parent) {
660             dir_cache.lookup(name.to_bytes()).is_some()
661         } else {
662             false
663         }
664     }
665 }
666 
#[cfg(feature = "fs_permission_translation")]
impl PermissionData {
    /// Returns `true` when `path` begins with this entry's `perm_path`.
    ///
    /// NOTE(review): this is a plain string-prefix test, so a `perm_path` of `/a/b` also
    /// matches `/a/bc`; confirm whether matching on path-component boundaries is intended.
    pub(crate) fn need_set_permission(&self, path: &str) -> bool {
        let prefix = &self.perm_path;
        path.starts_with(prefix)
    }
}
673 
/// A file system that simply "passes through" all requests it receives to the underlying file
/// system. To keep the implementation simple it serves the contents of its root directory. Users
/// that wish to serve only a specific directory should set up the environment so that that
/// directory ends up as the root of the file system process. One way to accomplish this is via a
/// combination of mount namespaces and the pivot_root system call.
pub struct PassthroughFs {
    // Mutex that must be acquired before executing a process-wide operation such as fchdir.
    process_lock: Mutex<()>,
    // virtio-fs tag that the guest uses when mounting. This is only used for debugging
    // when tracing is enabled.
    tag: String,

    // File descriptors for various points in the file system tree.
    inodes: Mutex<MultikeyBTreeMap<Inode, InodeAltKey, Arc<InodeData>>>,
    // Source of new inode ids; initialized to `ROOT_ID + 1` in `new`.
    next_inode: AtomicU64,

    // File descriptors for open files and directories. Unlike the fds in `inodes`, these _can_ be
    // used for reading and writing data.
    handles: Mutex<BTreeMap<Handle, Arc<HandleData>>>,
    // Source of new handle ids; initialized to 1 in `new`.
    next_handle: AtomicU64,

    // File descriptor pointing to the `/proc` directory. This is used to convert an fd from
    // `inodes` into one that can go into `handles`. This is accomplished by reading the
    // `self/fd/{}` symlink. We keep an open fd here in case the file system tree that we are meant
    // to be serving doesn't have access to `/proc`.
    proc: File,

    // Whether writeback caching is enabled for this directory. This will only be true when
    // `cfg.writeback` is true and `init` was called with `FsOptions::WRITEBACK_CACHE`.
    writeback: AtomicBool,

    // Whether zero message opens are supported by the kernel driver.
    zero_message_open: AtomicBool,

    // Whether zero message opendir is supported by the kernel driver.
    zero_message_opendir: AtomicBool,

    // Used to communicate with other processes using D-Bus.
    // Both fields are `None` when `cfg.privileged_quota_uids` is empty.
    #[cfg(feature = "arc_quota")]
    dbus_connection: Option<Mutex<dbus::blocking::Connection>>,
    // Descriptor of the D-Bus channel's watch, captured in `new`.
    #[cfg(feature = "arc_quota")]
    dbus_fd: Option<std::os::unix::io::RawFd>,

    // Time-expiring cache for `ascii_casefold_lookup()`.
    // The key is an inode of a directory, and the value is a cache for the directory.
    // Each value will be expired `cfg.timeout` after it's created.
    // `None` when `cfg.ascii_casefold` is false.
    //
    // TODO(b/267748212): Instead of per-device Mutex, we might want to have per-directory Mutex
    // if we use PassthroughFs in multi-threaded environments.
    expiring_casefold_lookup_caches: Option<Mutex<ExpiringCasefoldLookupCaches>>,

    // paths and corresponding permission setting set by `crosvm_client_fs_permission_set` API
    #[cfg(feature = "fs_permission_translation")]
    permission_paths: RwLock<Vec<PermissionData>>,

    // paths and corresponding xattr setting set by `crosvm_client_fs_xattr_set` API
    #[cfg(feature = "arc_quota")]
    xattr_paths: RwLock<Vec<XattrData>>,

    cfg: Config,

    // Set the root directory when pivot root isn't enabled for jailed process.
    //
    // virtio-fs typically uses mount namespaces and pivot_root for file system isolation,
    // making the jailed process's root directory "/".
    //
    // However, Android's security model prevents crosvm from having the necessary SYS_ADMIN
    // capability for mount namespaces and pivot_root. This lack of isolation means that
    // root_dir defaults to the path provided via "--shared-dir".
    root_dir: String,
}
745 
746 impl std::fmt::Debug for PassthroughFs {
fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result747     fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
748         f.debug_struct("PassthroughFs")
749             .field("tag", &self.tag)
750             .field("next_inode", &self.next_inode)
751             .field("next_handle", &self.next_handle)
752             .field("proc", &self.proc)
753             .field("writeback", &self.writeback)
754             .field("zero_message_open", &self.zero_message_open)
755             .field("zero_message_opendir", &self.zero_message_opendir)
756             .field("cfg", &self.cfg)
757             .finish()
758     }
759 }
760 
761 impl PassthroughFs {
new(tag: &str, cfg: Config) -> io::Result<PassthroughFs>762     pub fn new(tag: &str, cfg: Config) -> io::Result<PassthroughFs> {
763         // SAFETY: this doesn't modify any memory and we check the return value.
764         let raw_descriptor = syscall!(unsafe {
765             libc::openat64(
766                 libc::AT_FDCWD,
767                 PROC_CSTR.as_ptr(),
768                 libc::O_PATH | libc::O_NOFOLLOW | libc::O_CLOEXEC,
769             )
770         })?;
771 
772         // Privileged UIDs can use D-Bus to perform some operations.
773         #[cfg(feature = "arc_quota")]
774         let (dbus_connection, dbus_fd) = if cfg.privileged_quota_uids.is_empty() {
775             (None, None)
776         } else {
777             let mut channel = dbus::channel::Channel::get_private(dbus::channel::BusType::System)
778                 .map_err(|e| io::Error::new(io::ErrorKind::Other, e))?;
779             channel.set_watch_enabled(true);
780             let dbus_fd = channel.watch().fd;
781             channel.set_watch_enabled(false);
782             (
783                 Some(Mutex::new(dbus::blocking::Connection::from(channel))),
784                 Some(dbus_fd),
785             )
786         };
787 
788         // SAFETY: safe because we just opened this descriptor.
789         let proc = unsafe { File::from_raw_descriptor(raw_descriptor) };
790 
791         let expiring_casefold_lookup_caches = if cfg.ascii_casefold {
792             Some(Mutex::new(ExpiringCasefoldLookupCaches::new(cfg.timeout)))
793         } else {
794             None
795         };
796 
797         #[allow(unused_mut)]
798         let mut passthroughfs = PassthroughFs {
799             process_lock: Mutex::new(()),
800             tag: tag.to_string(),
801             inodes: Mutex::new(MultikeyBTreeMap::new()),
802             next_inode: AtomicU64::new(ROOT_ID + 1),
803 
804             handles: Mutex::new(BTreeMap::new()),
805             next_handle: AtomicU64::new(1),
806 
807             proc,
808 
809             writeback: AtomicBool::new(false),
810             zero_message_open: AtomicBool::new(false),
811             zero_message_opendir: AtomicBool::new(false),
812 
813             #[cfg(feature = "arc_quota")]
814             dbus_connection,
815             #[cfg(feature = "arc_quota")]
816             dbus_fd,
817             expiring_casefold_lookup_caches,
818             #[cfg(feature = "fs_permission_translation")]
819             permission_paths: RwLock::new(Vec::new()),
820             #[cfg(feature = "arc_quota")]
821             xattr_paths: RwLock::new(Vec::new()),
822             cfg,
823             root_dir: "/".to_string(),
824         };
825 
826         #[cfg(feature = "fs_runtime_ugid_map")]
827         passthroughfs.set_permission_path();
828 
829         cros_tracing::trace_simple_print!(
830             VirtioFs,
831             "New PassthroughFS initialized: {:?}",
832             passthroughfs
833         );
834         Ok(passthroughfs)
835     }
836 
837     #[cfg(feature = "fs_runtime_ugid_map")]
set_permission_path(&mut self)838     fn set_permission_path(&mut self) {
839         if !self.cfg.ugid_map.is_empty() {
840             let mut write_lock = self
841                 .permission_paths
842                 .write()
843                 .expect("Failed to acquire write lock on permission_paths");
844             *write_lock = self.cfg.ugid_map.clone();
845         }
846     }
847 
848     #[cfg(feature = "fs_runtime_ugid_map")]
set_root_dir(&mut self, shared_dir: String) -> io::Result<()>849     pub fn set_root_dir(&mut self, shared_dir: String) -> io::Result<()> {
850         let canonicalized_root = match std::fs::canonicalize(shared_dir) {
851             Ok(path) => path,
852             Err(e) => {
853                 return Err(io::Error::new(
854                     io::ErrorKind::InvalidInput,
855                     format!("Failed to canonicalize root_dir: {}", e),
856                 ));
857             }
858         };
859         self.root_dir = canonicalized_root.to_string_lossy().to_string();
860         Ok(())
861     }
862 
cfg(&self) -> &Config863     pub fn cfg(&self) -> &Config {
864         &self.cfg
865     }
866 
keep_rds(&self) -> Vec<RawDescriptor>867     pub fn keep_rds(&self) -> Vec<RawDescriptor> {
868         #[cfg_attr(not(feature = "arc_quota"), allow(unused_mut))]
869         let mut keep_rds = vec![self.proc.as_raw_descriptor()];
870         #[cfg(feature = "arc_quota")]
871         if let Some(fd) = self.dbus_fd {
872             keep_rds.push(fd);
873         }
874         keep_rds
875     }
876 
rewrite_xattr_name<'xattr>(&self, name: &'xattr CStr) -> Cow<'xattr, CStr>877     fn rewrite_xattr_name<'xattr>(&self, name: &'xattr CStr) -> Cow<'xattr, CStr> {
878         if !self.cfg.rewrite_security_xattrs {
879             return Cow::Borrowed(name);
880         }
881 
882         // Does not include nul-terminator.
883         let buf = name.to_bytes();
884         if !buf.starts_with(SECURITY_XATTR) || buf == SELINUX_XATTR {
885             return Cow::Borrowed(name);
886         }
887 
888         let mut newname = USER_VIRTIOFS_XATTR.to_vec();
889         newname.extend_from_slice(buf);
890 
891         // The unwrap is safe here because the prefix doesn't contain any interior nul-bytes and the
892         // to_bytes() call above will not return a byte slice with any interior nul-bytes either.
893         Cow::Owned(CString::new(newname).expect("Failed to re-write xattr name"))
894     }
895 
find_inode(&self, inode: Inode) -> io::Result<Arc<InodeData>>896     fn find_inode(&self, inode: Inode) -> io::Result<Arc<InodeData>> {
897         self.inodes.lock().get(&inode).cloned().ok_or_else(ebadf)
898     }
899 
find_handle(&self, handle: Handle, inode: Inode) -> io::Result<Arc<HandleData>>900     fn find_handle(&self, handle: Handle, inode: Inode) -> io::Result<Arc<HandleData>> {
901         self.handles
902             .lock()
903             .get(&handle)
904             .filter(|hd| hd.inode == inode)
905             .cloned()
906             .ok_or_else(ebadf)
907     }
908 
open_fd(&self, fd: RawDescriptor, flags: i32) -> io::Result<File>909     fn open_fd(&self, fd: RawDescriptor, flags: i32) -> io::Result<File> {
910         let pathname = CString::new(format!("self/fd/{}", fd))
911             .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
912 
913         // SAFETY: this doesn't modify any memory and we check the return value. We don't really
914         // check `flags` because if the kernel can't handle poorly specified flags then we have
915         // much bigger problems. Also, clear the `O_NOFOLLOW` flag if it is set since we need
916         // to follow the `/proc/self/fd` symlink to get the file.
917         let raw_descriptor = syscall!(unsafe {
918             libc::openat64(
919                 self.proc.as_raw_descriptor(),
920                 pathname.as_ptr(),
921                 (flags | libc::O_CLOEXEC) & !(libc::O_NOFOLLOW | libc::O_DIRECT),
922             )
923         })?;
924 
925         // SAFETY: safe because we just opened this descriptor.
926         Ok(unsafe { File::from_raw_descriptor(raw_descriptor) })
927     }
928 
929     /// Modifies the provided open flags based on the writeback caching configuration.
930     /// Return the updated open flags.
update_open_flags(&self, mut flags: i32) -> i32931     fn update_open_flags(&self, mut flags: i32) -> i32 {
932         // When writeback caching is enabled, the kernel may send read requests even if the
933         // userspace program opened the file write-only. So we need to ensure that we have opened
934         // the file for reading as well as writing.
935         let writeback = self.writeback.load(Ordering::Relaxed);
936         if writeback && flags & libc::O_ACCMODE == libc::O_WRONLY {
937             flags &= !libc::O_ACCMODE;
938             flags |= libc::O_RDWR;
939         }
940 
941         // When writeback caching is enabled the kernel is responsible for handling `O_APPEND`.
942         // However, this breaks atomicity as the file may have changed on disk, invalidating the
943         // cached copy of the data in the kernel and the offset that the kernel thinks is the end of
944         // the file. Just allow this for now as it is the user's responsibility to enable writeback
945         // caching only for directories that are not shared. It also means that we need to clear the
946         // `O_APPEND` flag.
947         if writeback && flags & libc::O_APPEND != 0 {
948             flags &= !libc::O_APPEND;
949         }
950 
951         flags
952     }
953 
open_inode(&self, inode: &InodeData, mut flags: i32) -> io::Result<File>954     fn open_inode(&self, inode: &InodeData, mut flags: i32) -> io::Result<File> {
955         // handle writeback caching cases
956         flags = self.update_open_flags(flags);
957 
958         self.open_fd(inode.as_raw_descriptor(), flags)
959     }
960 
961     // Increases the inode refcount and returns the inode.
increase_inode_refcount(&self, inode_data: &InodeData) -> Inode962     fn increase_inode_refcount(&self, inode_data: &InodeData) -> Inode {
963         // Matches with the release store in `forget`.
964         inode_data.refcount.fetch_add(1, Ordering::Acquire);
965         inode_data.inode
966     }
967 
968     // Creates a new entry for `f` or increases the refcount of the existing entry for `f`.
969     // The inodes mutex lock must not be already taken by the same thread otherwise this
970     // will deadlock.
add_entry( &self, f: File, #[cfg_attr(not(feature = "fs_permission_translation"), allow(unused_mut))] mut st: libc::stat64, open_flags: libc::c_int, path: String, ) -> Entry971     fn add_entry(
972         &self,
973         f: File,
974         #[cfg_attr(not(feature = "fs_permission_translation"), allow(unused_mut))]
975         mut st: libc::stat64,
976         open_flags: libc::c_int,
977         path: String,
978     ) -> Entry {
979         #[cfg(feature = "arc_quota")]
980         self.set_permission(&mut st, &path);
981         #[cfg(feature = "fs_runtime_ugid_map")]
982         self.set_ugid_permission(&mut st, &path);
983         let mut inodes = self.inodes.lock();
984 
985         let altkey = InodeAltKey {
986             ino: st.st_ino,
987             dev: st.st_dev,
988         };
989 
990         let inode = if let Some(data) = inodes.get_alt(&altkey) {
991             self.increase_inode_refcount(data)
992         } else {
993             let inode = self.next_inode.fetch_add(1, Ordering::Relaxed);
994             inodes.insert(
995                 inode,
996                 altkey,
997                 Arc::new(InodeData {
998                     inode,
999                     file: Mutex::new((f, open_flags)),
1000                     refcount: AtomicU64::new(1),
1001                     filetype: st.st_mode.into(),
1002                     path,
1003                 }),
1004             );
1005 
1006             inode
1007         };
1008 
1009         Entry {
1010             inode,
1011             generation: 0,
1012             attr: st,
1013             // We use the same timeout for the attribute and the entry.
1014             attr_timeout: self.cfg.timeout,
1015             entry_timeout: self.cfg.timeout,
1016         }
1017     }
1018 
1019     /// Acquires lock of `expiring_casefold_lookup_caches` if `ascii_casefold` is enabled.
lock_casefold_lookup_caches(&self) -> Option<MutexGuard<'_, ExpiringCasefoldLookupCaches>>1020     fn lock_casefold_lookup_caches(&self) -> Option<MutexGuard<'_, ExpiringCasefoldLookupCaches>> {
1021         self.expiring_casefold_lookup_caches
1022             .as_ref()
1023             .map(|c| c.lock())
1024     }
1025 
1026     // Returns an actual case-sensitive file name that matches with the given `name`.
1027     // Returns `Ok(None)` if no file matches with the give `name`.
1028     // This function will panic if casefold is not enabled.
get_case_unfolded_name( &self, parent: &InodeData, name: &[u8], ) -> io::Result<Option<CString>>1029     fn get_case_unfolded_name(
1030         &self,
1031         parent: &InodeData,
1032         name: &[u8],
1033     ) -> io::Result<Option<CString>> {
1034         let mut caches = self
1035             .lock_casefold_lookup_caches()
1036             .expect("casefold must be enabled");
1037         let dir_cache = caches.get(parent)?;
1038         Ok(dir_cache.lookup(name))
1039     }
1040 
1041     // Performs an ascii case insensitive lookup.
ascii_casefold_lookup(&self, parent: &InodeData, name: &[u8]) -> io::Result<Entry>1042     fn ascii_casefold_lookup(&self, parent: &InodeData, name: &[u8]) -> io::Result<Entry> {
1043         match self.get_case_unfolded_name(parent, name)? {
1044             None => Err(io::Error::from_raw_os_error(libc::ENOENT)),
1045             Some(actual_name) => self.do_lookup(parent, &actual_name),
1046         }
1047     }
1048 
1049     #[cfg(test)]
exists_in_casefold_cache(&self, parent: Inode, name: &CStr) -> bool1050     fn exists_in_casefold_cache(&self, parent: Inode, name: &CStr) -> bool {
1051         let mut cache = self
1052             .lock_casefold_lookup_caches()
1053             .expect("casefold must be enabled");
1054         cache.exists_in_cache(parent, name)
1055     }
1056 
do_lookup(&self, parent: &InodeData, name: &CStr) -> io::Result<Entry>1057     fn do_lookup(&self, parent: &InodeData, name: &CStr) -> io::Result<Entry> {
1058         #[cfg_attr(not(feature = "fs_permission_translation"), allow(unused_mut))]
1059         let mut st = statat(parent, name)?;
1060 
1061         let altkey = InodeAltKey {
1062             ino: st.st_ino,
1063             dev: st.st_dev,
1064         };
1065 
1066         let path = format!(
1067             "{}/{}",
1068             parent.path.clone(),
1069             name.to_str().unwrap_or("<non UTF-8 str>")
1070         );
1071 
1072         // Check if we already have an entry before opening a new file.
1073         if let Some(data) = self.inodes.lock().get_alt(&altkey) {
1074             // Return the same inode with the reference counter increased.
1075             #[cfg(feature = "arc_quota")]
1076             self.set_permission(&mut st, &path);
1077             #[cfg(feature = "fs_runtime_ugid_map")]
1078             self.set_ugid_permission(&mut st, &path);
1079             return Ok(Entry {
1080                 inode: self.increase_inode_refcount(data),
1081                 generation: 0,
1082                 attr: st,
1083                 // We use the same timeout for the attribute and the entry.
1084                 attr_timeout: self.cfg.timeout,
1085                 entry_timeout: self.cfg.timeout,
1086             });
1087         }
1088 
1089         // Open a regular file with O_RDONLY to store in `InodeData` so explicit open requests can
1090         // be skipped later if the ZERO_MESSAGE_{OPEN,OPENDIR} features are enabled.
1091         // If the crosvm process doesn't have a read permission, fall back to O_PATH below.
1092         let mut flags = libc::O_RDONLY | libc::O_NOFOLLOW | libc::O_CLOEXEC;
1093         match FileType::from(st.st_mode) {
1094             FileType::Regular => {}
1095             FileType::Directory => flags |= libc::O_DIRECTORY,
1096             FileType::Other => flags |= libc::O_PATH,
1097         };
1098 
1099         // SAFETY: this doesn't modify any memory and we check the return value.
1100         let fd = match unsafe {
1101             syscall!(libc::openat64(
1102                 parent.as_raw_descriptor(),
1103                 name.as_ptr(),
1104                 flags
1105             ))
1106         } {
1107             Ok(fd) => fd,
1108             Err(e) if e.errno() == libc::EACCES => {
1109                 // If O_RDONLY is unavailable, fall back to O_PATH to get an FD to store in
1110                 // `InodeData`.
1111                 // Note that some operations which should be allowed without read permissions
1112                 // require syscalls that don't support O_PATH fds. For those syscalls, we will
1113                 // need to fall back to their path-based equivalents with /self/fd/${FD}.
1114                 // e.g. `fgetxattr()` for an O_PATH FD fails while `getxaattr()` for /self/fd/${FD}
1115                 // works.
1116                 flags |= libc::O_PATH;
1117                 // SAFETY: this doesn't modify any memory and we check the return value.
1118                 unsafe {
1119                     syscall!(libc::openat64(
1120                         parent.as_raw_descriptor(),
1121                         name.as_ptr(),
1122                         flags
1123                     ))
1124                 }?
1125             }
1126             Err(e) => {
1127                 return Err(e.into());
1128             }
1129         };
1130 
1131         // SAFETY: safe because we own the fd.
1132         let f = unsafe { File::from_raw_descriptor(fd) };
1133         // We made sure the lock acquired for `self.inodes` is released automatically when
1134         // the if block above is exited, so a call to `self.add_entry()` should not cause a deadlock
1135         // here. This would not be the case if this were executed in an else block instead.
1136         Ok(self.add_entry(f, st, flags, path))
1137     }
1138 
get_cache_open_options(&self, flags: u32) -> OpenOptions1139     fn get_cache_open_options(&self, flags: u32) -> OpenOptions {
1140         let mut opts = OpenOptions::empty();
1141         match self.cfg.cache_policy {
1142             // We only set the direct I/O option on files.
1143             CachePolicy::Never => opts.set(
1144                 OpenOptions::DIRECT_IO,
1145                 flags & (libc::O_DIRECTORY as u32) == 0,
1146             ),
1147             CachePolicy::Always => {
1148                 opts |= if flags & (libc::O_DIRECTORY as u32) == 0 {
1149                     OpenOptions::KEEP_CACHE
1150                 } else {
1151                     OpenOptions::CACHE_DIR
1152                 }
1153             }
1154             _ => {}
1155         };
1156         opts
1157     }
1158 
1159     // Performs lookup using original name first, if it fails and ascii_casefold is enabled,
1160     // it tries to unfold the name and do lookup again.
do_lookup_with_casefold_fallback( &self, parent: &InodeData, name: &CStr, ) -> io::Result<Entry>1161     fn do_lookup_with_casefold_fallback(
1162         &self,
1163         parent: &InodeData,
1164         name: &CStr,
1165     ) -> io::Result<Entry> {
1166         let mut res = self.do_lookup(parent, name);
1167         // If `ascii_casefold` is enabled, fallback to `ascii_casefold_lookup()`.
1168         if res.is_err() && self.cfg.ascii_casefold {
1169             res = self.ascii_casefold_lookup(parent, name.to_bytes());
1170         }
1171         res
1172     }
1173 
do_open(&self, inode: Inode, flags: u32) -> io::Result<(Option<Handle>, OpenOptions)>1174     fn do_open(&self, inode: Inode, flags: u32) -> io::Result<(Option<Handle>, OpenOptions)> {
1175         let inode_data = self.find_inode(inode)?;
1176 
1177         let file = Mutex::new(self.open_inode(&inode_data, flags as i32)?);
1178 
1179         let handle = self.next_handle.fetch_add(1, Ordering::Relaxed);
1180         let data = HandleData { inode, file };
1181 
1182         self.handles.lock().insert(handle, Arc::new(data));
1183 
1184         let opts = self.get_cache_open_options(flags);
1185 
1186         Ok((Some(handle), opts))
1187     }
1188 
do_open_at( &self, parent_data: Arc<InodeData>, name: &CStr, inode: Inode, flags: u32, ) -> io::Result<(Option<Handle>, OpenOptions)>1189     fn do_open_at(
1190         &self,
1191         parent_data: Arc<InodeData>,
1192         name: &CStr,
1193         inode: Inode,
1194         flags: u32,
1195     ) -> io::Result<(Option<Handle>, OpenOptions)> {
1196         let open_flags = self.update_open_flags(flags as i32);
1197 
1198         let fd_open = syscall!(
1199             // SAFETY: return value is checked.
1200             unsafe {
1201                 libc::openat64(
1202                     parent_data.as_raw_descriptor(),
1203                     name.as_ptr(),
1204                     (open_flags | libc::O_CLOEXEC) & !(libc::O_NOFOLLOW | libc::O_DIRECT),
1205                 )
1206             }
1207         )?;
1208 
1209         // SAFETY: fd_open is valid
1210         let file_open = unsafe { File::from_raw_descriptor(fd_open) };
1211         let handle = self.next_handle.fetch_add(1, Ordering::Relaxed);
1212         let data = HandleData {
1213             inode,
1214             file: Mutex::new(file_open),
1215         };
1216 
1217         self.handles.lock().insert(handle, Arc::new(data));
1218 
1219         let opts = self.get_cache_open_options(open_flags as u32);
1220         Ok((Some(handle), opts))
1221     }
1222 
do_release(&self, inode: Inode, handle: Handle) -> io::Result<()>1223     fn do_release(&self, inode: Inode, handle: Handle) -> io::Result<()> {
1224         let mut handles = self.handles.lock();
1225 
1226         if let btree_map::Entry::Occupied(e) = handles.entry(handle) {
1227             if e.get().inode == inode {
1228                 // We don't need to close the file here because that will happen automatically when
1229                 // the last `Arc` is dropped.
1230                 e.remove();
1231                 return Ok(());
1232             }
1233         }
1234 
1235         Err(ebadf())
1236     }
1237 
do_getattr(&self, inode: &InodeData) -> io::Result<(libc::stat64, Duration)>1238     fn do_getattr(&self, inode: &InodeData) -> io::Result<(libc::stat64, Duration)> {
1239         #[allow(unused_mut)]
1240         let mut st = stat(inode)?;
1241 
1242         #[cfg(feature = "arc_quota")]
1243         self.set_permission(&mut st, &inode.path);
1244         #[cfg(feature = "fs_runtime_ugid_map")]
1245         self.set_ugid_permission(&mut st, &inode.path);
1246         Ok((st, self.cfg.timeout))
1247     }
1248 
do_unlink(&self, parent: &InodeData, name: &CStr, flags: libc::c_int) -> io::Result<()>1249     fn do_unlink(&self, parent: &InodeData, name: &CStr, flags: libc::c_int) -> io::Result<()> {
1250         // SAFETY: this doesn't modify any memory and we check the return value.
1251         syscall!(unsafe { libc::unlinkat(parent.as_raw_descriptor(), name.as_ptr(), flags) })?;
1252         Ok(())
1253     }
1254 
do_fsync<F: AsRawDescriptor>(&self, file: &F, datasync: bool) -> io::Result<()>1255     fn do_fsync<F: AsRawDescriptor>(&self, file: &F, datasync: bool) -> io::Result<()> {
1256         // SAFETY: this doesn't modify any memory and we check the return value.
1257         syscall!(unsafe {
1258             if datasync {
1259                 libc::fdatasync(file.as_raw_descriptor())
1260             } else {
1261                 libc::fsync(file.as_raw_descriptor())
1262             }
1263         })?;
1264 
1265         Ok(())
1266     }
1267 
1268     // Changes the CWD to `self.proc`, runs `f`, and then changes the CWD back to the root
1269     // directory. This effectively emulates an *at syscall starting at /proc, which is useful when
1270     // there is no *at syscall available. Panics if any of the fchdir calls fail or if there is no
1271     // root inode.
1272     //
1273     // NOTE: this method acquires an `self`-wide lock. If any locks are acquired in `f`, care must
1274     // be taken to avoid the risk of deadlocks.
with_proc_chdir<F, T>(&self, f: F) -> T where F: FnOnce() -> T,1275     fn with_proc_chdir<F, T>(&self, f: F) -> T
1276     where
1277         F: FnOnce() -> T,
1278     {
1279         let root = self.find_inode(ROOT_ID).expect("failed to find root inode");
1280 
1281         // Acquire a lock for `fchdir`.
1282         let _proc_lock = self.process_lock.lock();
1283         // SAFETY: this doesn't modify any memory and we check the return value. Since the
1284         // fchdir should never fail we just use debug_asserts.
1285         let proc_cwd = unsafe { libc::fchdir(self.proc.as_raw_descriptor()) };
1286         debug_assert_eq!(
1287             proc_cwd,
1288             0,
1289             "failed to fchdir to /proc: {}",
1290             io::Error::last_os_error()
1291         );
1292 
1293         let res = f();
1294 
1295         // SAFETY: this doesn't modify any memory and we check the return value. Since the
1296         // fchdir should never fail we just use debug_asserts.
1297         let root_cwd = unsafe { libc::fchdir(root.as_raw_descriptor()) };
1298         debug_assert_eq!(
1299             root_cwd,
1300             0,
1301             "failed to fchdir back to root directory: {}",
1302             io::Error::last_os_error()
1303         );
1304 
1305         res
1306     }
1307 
do_getxattr(&self, inode: &InodeData, name: &CStr, value: &mut [u8]) -> io::Result<usize>1308     fn do_getxattr(&self, inode: &InodeData, name: &CStr, value: &mut [u8]) -> io::Result<usize> {
1309         let file = inode.file.lock();
1310         let o_path_file = (file.1 & libc::O_PATH) != 0;
1311         let res = if o_path_file {
1312             // For FDs opened with `O_PATH`, we cannot call `fgetxattr` normally. Instead we
1313             // emulate an _at syscall by changing the CWD to /proc, running the path based syscall,
1314             //  and then setting the CWD back to the root directory.
1315             let path = CString::new(format!("self/fd/{}", file.0.as_raw_descriptor()))
1316                 .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
1317 
1318             // SAFETY: this will only modify `value` and we check the return value.
1319             self.with_proc_chdir(|| unsafe {
1320                 libc::getxattr(
1321                     path.as_ptr(),
1322                     name.as_ptr(),
1323                     value.as_mut_ptr() as *mut libc::c_void,
1324                     value.len() as libc::size_t,
1325                 )
1326             })
1327         } else {
1328             // For regular files and directories, we can just use fgetxattr.
1329             // SAFETY: this will only write to `value` and we check the return value.
1330             unsafe {
1331                 libc::fgetxattr(
1332                     file.0.as_raw_descriptor(),
1333                     name.as_ptr(),
1334                     value.as_mut_ptr() as *mut libc::c_void,
1335                     value.len() as libc::size_t,
1336                 )
1337             }
1338         };
1339 
1340         if res < 0 {
1341             Err(io::Error::last_os_error())
1342         } else {
1343             Ok(res as usize)
1344         }
1345     }
1346 
get_encryption_policy_ex<R: io::Read>( &self, inode: Inode, handle: Handle, mut r: R, ) -> io::Result<IoctlReply>1347     fn get_encryption_policy_ex<R: io::Read>(
1348         &self,
1349         inode: Inode,
1350         handle: Handle,
1351         mut r: R,
1352     ) -> io::Result<IoctlReply> {
1353         let data: Arc<dyn AsRawDescriptor> = if self.zero_message_open.load(Ordering::Relaxed) {
1354             self.find_inode(inode)?
1355         } else {
1356             self.find_handle(handle, inode)?
1357         };
1358 
1359         // SAFETY: this struct only has integer fields and any value is valid.
1360         let mut arg = unsafe { MaybeUninit::<fscrypt_get_policy_ex_arg>::zeroed().assume_init() };
1361         r.read_exact(arg.policy_size.as_bytes_mut())?;
1362 
1363         let policy_size = cmp::min(arg.policy_size, size_of::<fscrypt_policy>() as u64);
1364         arg.policy_size = policy_size;
1365 
1366         let res =
1367             // SAFETY: the kernel will only write to `arg` and we check the return value.
1368             unsafe { ioctl_with_mut_ptr(&*data, FS_IOC_GET_ENCRYPTION_POLICY_EX, &mut arg) };
1369         if res < 0 {
1370             Ok(IoctlReply::Done(Err(io::Error::last_os_error())))
1371         } else {
1372             let len = size_of::<u64>() + arg.policy_size as usize;
1373             Ok(IoctlReply::Done(Ok(<&[u8]>::from(&arg)[..len].to_vec())))
1374         }
1375     }
1376 
get_fsxattr(&self, inode: Inode, handle: Handle) -> io::Result<IoctlReply>1377     fn get_fsxattr(&self, inode: Inode, handle: Handle) -> io::Result<IoctlReply> {
1378         let data: Arc<dyn AsRawDescriptor> = if self.zero_message_open.load(Ordering::Relaxed) {
1379             self.find_inode(inode)?
1380         } else {
1381             self.find_handle(handle, inode)?
1382         };
1383 
1384         let mut buf = MaybeUninit::<fsxattr>::zeroed();
1385 
1386         // SAFETY: the kernel will only write to `buf` and we check the return value.
1387         let res = unsafe { ioctl_with_mut_ptr(&*data, FS_IOC_FSGETXATTR, buf.as_mut_ptr()) };
1388         if res < 0 {
1389             Ok(IoctlReply::Done(Err(io::Error::last_os_error())))
1390         } else {
1391             // SAFETY: the kernel guarantees that the policy is now initialized.
1392             let xattr = unsafe { buf.assume_init() };
1393             Ok(IoctlReply::Done(Ok(xattr.as_bytes().to_vec())))
1394         }
1395     }
1396 
set_fsxattr<R: io::Read>( &self, #[cfg_attr(not(feature = "arc_quota"), allow(unused_variables))] ctx: Context, inode: Inode, handle: Handle, mut r: R, ) -> io::Result<IoctlReply>1397     fn set_fsxattr<R: io::Read>(
1398         &self,
1399         #[cfg_attr(not(feature = "arc_quota"), allow(unused_variables))] ctx: Context,
1400         inode: Inode,
1401         handle: Handle,
1402         mut r: R,
1403     ) -> io::Result<IoctlReply> {
1404         let data: Arc<dyn AsRawDescriptor> = if self.zero_message_open.load(Ordering::Relaxed) {
1405             self.find_inode(inode)?
1406         } else {
1407             self.find_handle(handle, inode)?
1408         };
1409 
1410         let mut in_attr = fsxattr::new_zeroed();
1411         r.read_exact(in_attr.as_bytes_mut())?;
1412 
1413         #[cfg(feature = "arc_quota")]
1414         let st = stat(&*data)?;
1415 
1416         // Changing quota project ID requires CAP_FOWNER or being file owner.
1417         // Here we use privileged_quota_uids because we cannot perform a CAP_FOWNER check.
1418         #[cfg(feature = "arc_quota")]
1419         if ctx.uid == st.st_uid || self.cfg.privileged_quota_uids.contains(&ctx.uid) {
1420             // Get the current fsxattr.
1421             let mut buf = MaybeUninit::<fsxattr>::zeroed();
1422             // SAFETY: the kernel will only write to `buf` and we check the return value.
1423             let res = unsafe { ioctl_with_mut_ptr(&*data, FS_IOC_FSGETXATTR, buf.as_mut_ptr()) };
1424             if res < 0 {
1425                 return Ok(IoctlReply::Done(Err(io::Error::last_os_error())));
1426             }
1427             // SAFETY: the kernel guarantees that the policy is now initialized.
1428             let current_attr = unsafe { buf.assume_init() };
1429 
1430             // Project ID cannot be changed inside a user namespace.
1431             // Use Spaced to avoid this restriction.
1432             if current_attr.fsx_projid != in_attr.fsx_projid {
1433                 let connection = self.dbus_connection.as_ref().unwrap().lock();
1434                 let proxy = connection.with_proxy(
1435                     "org.chromium.Spaced",
1436                     "/org/chromium/Spaced",
1437                     DEFAULT_DBUS_TIMEOUT,
1438                 );
1439                 let project_id = in_attr.fsx_projid;
1440                 if !is_android_project_id(project_id) {
1441                     return Err(io::Error::from_raw_os_error(libc::EINVAL));
1442                 }
1443                 let file_clone = base::SafeDescriptor::try_from(&*data)?;
1444                 match proxy.set_project_id(file_clone.into(), project_id) {
1445                     Ok(r) => {
1446                         let r = SetProjectIdReply::parse_from_bytes(&r)
1447                             .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
1448                         if !r.success {
1449                             return Ok(IoctlReply::Done(Err(io::Error::from_raw_os_error(
1450                                 r.error,
1451                             ))));
1452                         }
1453                     }
1454                     Err(e) => {
1455                         return Err(io::Error::new(io::ErrorKind::Other, e));
1456                     }
1457                 };
1458             }
1459         }
1460 
1461         //  SAFETY: this doesn't modify any memory and we check the return value.
1462         let res = unsafe { ioctl_with_ptr(&*data, FS_IOC_FSSETXATTR, &in_attr) };
1463         if res < 0 {
1464             Ok(IoctlReply::Done(Err(io::Error::last_os_error())))
1465         } else {
1466             Ok(IoctlReply::Done(Ok(Vec::new())))
1467         }
1468     }
1469 
get_flags(&self, inode: Inode, handle: Handle) -> io::Result<IoctlReply>1470     fn get_flags(&self, inode: Inode, handle: Handle) -> io::Result<IoctlReply> {
1471         let data: Arc<dyn AsRawDescriptor> = if self.zero_message_open.load(Ordering::Relaxed) {
1472             self.find_inode(inode)?
1473         } else {
1474             self.find_handle(handle, inode)?
1475         };
1476 
1477         // The ioctl encoding is a long but the parameter is actually an int.
1478         let mut flags: c_int = 0;
1479 
1480         // SAFETY: the kernel will only write to `flags` and we check the return value.
1481         let res = unsafe { ioctl_with_mut_ptr(&*data, FS_IOC_GETFLAGS, &mut flags) };
1482         if res < 0 {
1483             Ok(IoctlReply::Done(Err(io::Error::last_os_error())))
1484         } else {
1485             Ok(IoctlReply::Done(Ok(flags.to_ne_bytes().to_vec())))
1486         }
1487     }
1488 
set_flags<R: io::Read>( &self, #[cfg_attr(not(feature = "arc_quota"), allow(unused_variables))] ctx: Context, inode: Inode, handle: Handle, mut r: R, ) -> io::Result<IoctlReply>1489     fn set_flags<R: io::Read>(
1490         &self,
1491         #[cfg_attr(not(feature = "arc_quota"), allow(unused_variables))] ctx: Context,
1492         inode: Inode,
1493         handle: Handle,
1494         mut r: R,
1495     ) -> io::Result<IoctlReply> {
1496         let data: Arc<dyn AsRawDescriptor> = if self.zero_message_open.load(Ordering::Relaxed) {
1497             self.find_inode(inode)?
1498         } else {
1499             self.find_handle(handle, inode)?
1500         };
1501 
1502         // The ioctl encoding is a long but the parameter is actually an int.
1503         let mut in_flags: c_int = 0;
1504         r.read_exact(in_flags.as_bytes_mut())?;
1505 
1506         #[cfg(feature = "arc_quota")]
1507         let st = stat(&*data)?;
1508 
1509         // Only privleged uid can perform FS_IOC_SETFLAGS through cryptohome.
1510         #[cfg(feature = "arc_quota")]
1511         if ctx.uid == st.st_uid || self.cfg.privileged_quota_uids.contains(&ctx.uid) {
1512             // Get the current flag.
1513             let mut buf = MaybeUninit::<c_int>::zeroed();
1514             // SAFETY: the kernel will only write to `buf` and we check the return value.
1515             let res = unsafe { ioctl_with_mut_ptr(&*data, FS_IOC_GETFLAGS, buf.as_mut_ptr()) };
1516             if res < 0 {
1517                 return Ok(IoctlReply::Done(Err(io::Error::last_os_error())));
1518             }
1519             // SAFETY: the kernel guarantees that the policy is now initialized.
1520             let current_flags = unsafe { buf.assume_init() };
1521 
1522             // Project inheritance flag cannot be changed inside a user namespace.
1523             // Use Spaced to avoid this restriction.
1524             if (in_flags & FS_PROJINHERIT_FL) != (current_flags & FS_PROJINHERIT_FL) {
1525                 let connection = self.dbus_connection.as_ref().unwrap().lock();
1526                 let proxy = connection.with_proxy(
1527                     "org.chromium.Spaced",
1528                     "/org/chromium/Spaced",
1529                     DEFAULT_DBUS_TIMEOUT,
1530                 );
1531                 // If the input flags contain FS_PROJINHERIT_FL, then it is a set. Otherwise it is a
1532                 // reset.
1533                 let enable = (in_flags & FS_PROJINHERIT_FL) == FS_PROJINHERIT_FL;
1534                 let file_clone = base::SafeDescriptor::try_from(&*data)?;
1535                 match proxy.set_project_inheritance_flag(file_clone.into(), enable) {
1536                     Ok(r) => {
1537                         let r = SetProjectInheritanceFlagReply::parse_from_bytes(&r)
1538                             .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
1539                         if !r.success {
1540                             return Ok(IoctlReply::Done(Err(io::Error::from_raw_os_error(
1541                                 r.error,
1542                             ))));
1543                         }
1544                     }
1545                     Err(e) => {
1546                         return Err(io::Error::new(io::ErrorKind::Other, e));
1547                     }
1548                 };
1549             }
1550         }
1551 
1552         // SAFETY: this doesn't modify any memory and we check the return value.
1553         let res = unsafe { ioctl_with_ptr(&*data, FS_IOC_SETFLAGS, &in_flags) };
1554         if res < 0 {
1555             Ok(IoctlReply::Done(Err(io::Error::last_os_error())))
1556         } else {
1557             Ok(IoctlReply::Done(Ok(Vec::new())))
1558         }
1559     }
1560 
enable_verity<R: io::Read>( &self, inode: Inode, handle: Handle, mut r: R, ) -> io::Result<IoctlReply>1561     fn enable_verity<R: io::Read>(
1562         &self,
1563         inode: Inode,
1564         handle: Handle,
1565         mut r: R,
1566     ) -> io::Result<IoctlReply> {
1567         let inode_data = self.find_inode(inode)?;
1568 
1569         // These match the return codes from `fsverity_ioctl_enable` in the kernel.
1570         match inode_data.filetype {
1571             FileType::Regular => {}
1572             FileType::Directory => return Err(io::Error::from_raw_os_error(libc::EISDIR)),
1573             FileType::Other => return Err(io::Error::from_raw_os_error(libc::EINVAL)),
1574         }
1575 
1576         {
1577             // We cannot enable verity while holding a writable fd so get a new one, if necessary.
1578             let mut file = inode_data.file.lock();
1579             let mut flags = file.1;
1580             match flags & libc::O_ACCMODE {
1581                 libc::O_WRONLY | libc::O_RDWR => {
1582                     flags &= !libc::O_ACCMODE;
1583                     flags |= libc::O_RDONLY;
1584 
1585                     // We need to get a read-only handle for this file.
1586                     let newfile = self.open_fd(file.0.as_raw_descriptor(), libc::O_RDONLY)?;
1587                     *file = (newfile, flags);
1588                 }
1589                 libc::O_RDONLY => {}
1590                 _ => panic!("Unexpected flags: {:#x}", flags),
1591             }
1592         }
1593 
1594         let data: Arc<dyn AsRawDescriptor> = if self.zero_message_open.load(Ordering::Relaxed) {
1595             inode_data
1596         } else {
1597             let data = self.find_handle(handle, inode)?;
1598 
1599             {
1600                 // We can't enable verity while holding a writable fd. We don't know whether the
1601                 // file was opened for writing so check it here. We don't expect
1602                 // this to be a frequent operation so the extra latency should be
1603                 // fine.
1604                 let mut file = data.file.lock();
1605                 let flags = FileFlags::from_file(&*file).map_err(io::Error::from)?;
1606                 match flags {
1607                     FileFlags::ReadWrite | FileFlags::Write => {
1608                         // We need to get a read-only handle for this file.
1609                         *file = self.open_fd(file.as_raw_descriptor(), libc::O_RDONLY)?;
1610                     }
1611                     FileFlags::Read => {}
1612                 }
1613             }
1614 
1615             data
1616         };
1617 
1618         let mut arg = fsverity_enable_arg::new_zeroed();
1619         r.read_exact(arg.as_bytes_mut())?;
1620 
1621         let mut salt;
1622         if arg.salt_size > 0 {
1623             if arg.salt_size > self.max_buffer_size() {
1624                 return Ok(IoctlReply::Done(Err(io::Error::from_raw_os_error(
1625                     libc::ENOMEM,
1626                 ))));
1627             }
1628             salt = vec![0; arg.salt_size as usize];
1629             r.read_exact(&mut salt)?;
1630             arg.salt_ptr = salt.as_ptr() as usize as u64;
1631         } else {
1632             arg.salt_ptr = 0;
1633         }
1634 
1635         let mut sig;
1636         if arg.sig_size > 0 {
1637             if arg.sig_size > self.max_buffer_size() {
1638                 return Ok(IoctlReply::Done(Err(io::Error::from_raw_os_error(
1639                     libc::ENOMEM,
1640                 ))));
1641             }
1642             sig = vec![0; arg.sig_size as usize];
1643             r.read_exact(&mut sig)?;
1644             arg.sig_ptr = sig.as_ptr() as usize as u64;
1645         } else {
1646             arg.sig_ptr = 0;
1647         }
1648 
1649         // SAFETY: this doesn't modify any memory and we check the return value.
1650         let res = unsafe { ioctl_with_ptr(&*data, FS_IOC_ENABLE_VERITY, &arg) };
1651         if res < 0 {
1652             Ok(IoctlReply::Done(Err(io::Error::last_os_error())))
1653         } else {
1654             Ok(IoctlReply::Done(Ok(Vec::new())))
1655         }
1656     }
1657 
measure_verity<R: io::Read>( &self, inode: Inode, handle: Handle, mut r: R, out_size: u32, ) -> io::Result<IoctlReply>1658     fn measure_verity<R: io::Read>(
1659         &self,
1660         inode: Inode,
1661         handle: Handle,
1662         mut r: R,
1663         out_size: u32,
1664     ) -> io::Result<IoctlReply> {
1665         let data: Arc<dyn AsRawDescriptor> = if self.zero_message_open.load(Ordering::Relaxed) {
1666             self.find_inode(inode)?
1667         } else {
1668             self.find_handle(handle, inode)?
1669         };
1670 
1671         let mut digest = fsverity_digest::new_zeroed();
1672         r.read_exact(digest.as_bytes_mut())?;
1673 
1674         // Taken from fs/verity/fsverity_private.h.
1675         const FS_VERITY_MAX_DIGEST_SIZE: u16 = 64;
1676 
1677         // This digest size is what the fsverity command line utility uses.
1678         const DIGEST_SIZE: u16 = FS_VERITY_MAX_DIGEST_SIZE * 2 + 1;
1679         const BUFLEN: usize = size_of::<fsverity_digest>() + DIGEST_SIZE as usize;
1680         const ROUNDED_LEN: usize =
1681             (BUFLEN + size_of::<fsverity_digest>() - 1) / size_of::<fsverity_digest>();
1682 
1683         // Make sure we get a properly aligned allocation.
1684         let mut buf = [MaybeUninit::<fsverity_digest>::uninit(); ROUNDED_LEN];
1685 
1686         // SAFETY: we are only writing data and not reading uninitialized memory.
1687         unsafe {
1688             // TODO: Replace with `MaybeUninit::slice_as_mut_ptr` once it is stabilized.
1689             addr_of_mut!((*(buf.as_mut_ptr() as *mut fsverity_digest)).digest_size)
1690                 .write(DIGEST_SIZE)
1691         };
1692 
1693         // SAFETY: this will only modify `buf` and we check the return value.
1694         let res = unsafe { ioctl_with_mut_ptr(&*data, FS_IOC_MEASURE_VERITY, buf.as_mut_ptr()) };
1695         if res < 0 {
1696             Ok(IoctlReply::Done(Err(io::Error::last_os_error())))
1697         } else {
1698             let digest_size =
1699                 // SAFETY: this value was initialized by us already and then overwritten by the kernel.
1700                 // TODO: Replace with `MaybeUninit::slice_as_ptr` once it is stabilized.
1701                 unsafe { addr_of!((*(buf.as_ptr() as *const fsverity_digest)).digest_size).read() };
1702             let outlen = size_of::<fsverity_digest>() as u32 + u32::from(digest_size);
1703 
1704             // The kernel guarantees this but it doesn't hurt to be paranoid.
1705             debug_assert!(outlen <= (ROUNDED_LEN * size_of::<fsverity_digest>()) as u32);
1706             if digest.digest_size < digest_size || out_size < outlen {
1707                 return Ok(IoctlReply::Done(Err(io::Error::from_raw_os_error(
1708                     libc::EOVERFLOW,
1709                 ))));
1710             }
1711 
1712             let buf: [MaybeUninit<u8>; ROUNDED_LEN * size_of::<fsverity_digest>()] =
1713                 // SAFETY: any bit pattern is valid for `MaybeUninit<u8>` and `fsverity_digest`
1714                 // doesn't contain any references.
1715                 unsafe { mem::transmute(buf) };
1716 
1717             let buf =
1718                 // SAFETY: Casting to `*const [u8]` is safe because the kernel guarantees that the
1719                 // first `outlen` bytes of `buf` are initialized and `MaybeUninit<u8>` is guaranteed
1720                 // to have the same layout as `u8`.
1721                 // TODO: Replace with `MaybeUninit::slice_assume_init_ref` once it is stabilized.
1722                 unsafe { &*(&buf[..outlen as usize] as *const [MaybeUninit<u8>] as *const [u8]) };
1723             Ok(IoctlReply::Done(Ok(buf.to_vec())))
1724         }
1725     }
1726 }
1727 
1728 #[cfg(feature = "fs_runtime_ugid_map")]
1729 impl PassthroughFs {
find_and_set_ugid_permission( &self, st: &mut libc::stat64, path: &str, is_root_path: bool, ) -> bool1730     fn find_and_set_ugid_permission(
1731         &self,
1732         st: &mut libc::stat64,
1733         path: &str,
1734         is_root_path: bool,
1735     ) -> bool {
1736         for perm_data in self
1737             .permission_paths
1738             .read()
1739             .expect("acquire permission_paths read lock")
1740             .iter()
1741         {
1742             if (is_root_path && perm_data.perm_path == "/")
1743                 || (!is_root_path
1744                     && perm_data.perm_path != "/"
1745                     && perm_data.need_set_permission(path))
1746             {
1747                 self.set_permission_from_data(st, perm_data);
1748                 return true;
1749             }
1750         }
1751         false
1752     }
1753 
set_permission_from_data(&self, st: &mut libc::stat64, perm_data: &PermissionData)1754     fn set_permission_from_data(&self, st: &mut libc::stat64, perm_data: &PermissionData) {
1755         st.st_uid = perm_data.guest_uid;
1756         st.st_gid = perm_data.guest_gid;
1757         st.st_mode = (st.st_mode & libc::S_IFMT) | (0o777 & !perm_data.umask);
1758     }
1759 
1760     /// Set permission according to path
set_ugid_permission(&self, st: &mut libc::stat64, path: &str)1761     fn set_ugid_permission(&self, st: &mut libc::stat64, path: &str) {
1762         let is_root_path = path.is_empty();
1763 
1764         if self.find_and_set_ugid_permission(st, path, is_root_path) {
1765             return;
1766         }
1767 
1768         if let Some(perm_data) = self
1769             .permission_paths
1770             .read()
1771             .expect("acquire permission_paths read lock")
1772             .iter()
1773             .find(|pd| pd.perm_path == "/")
1774         {
1775             self.set_permission_from_data(st, perm_data);
1776         }
1777     }
1778 
1779     /// Set host uid/gid to configured value according to path
change_ugid_creds(&self, ctx: &Context, parent_data: &InodeData, name: &CStr) -> (u32, u32)1780     fn change_ugid_creds(&self, ctx: &Context, parent_data: &InodeData, name: &CStr) -> (u32, u32) {
1781         let path = format!(
1782             "{}/{}",
1783             parent_data.path.clone(),
1784             name.to_str().unwrap_or("<non UTF-8 str>")
1785         );
1786 
1787         let is_root_path = path.is_empty();
1788 
1789         if self.find_ugid_creds_for_path(&path, is_root_path).is_some() {
1790             return self.find_ugid_creds_for_path(&path, is_root_path).unwrap();
1791         }
1792 
1793         if let Some(perm_data) = self
1794             .permission_paths
1795             .read()
1796             .expect("acquire permission_paths read lock")
1797             .iter()
1798             .find(|pd| pd.perm_path == "/")
1799         {
1800             return (perm_data.host_uid, perm_data.host_gid);
1801         }
1802 
1803         (ctx.uid, ctx.gid)
1804     }
1805 
find_ugid_creds_for_path(&self, path: &str, is_root_path: bool) -> Option<(u32, u32)>1806     fn find_ugid_creds_for_path(&self, path: &str, is_root_path: bool) -> Option<(u32, u32)> {
1807         for perm_data in self
1808             .permission_paths
1809             .read()
1810             .expect("acquire permission_paths read lock")
1811             .iter()
1812         {
1813             if (is_root_path && perm_data.perm_path == "/")
1814                 || (!is_root_path
1815                     && perm_data.perm_path != "/"
1816                     && perm_data.need_set_permission(path))
1817             {
1818                 return Some((perm_data.host_uid, perm_data.host_gid));
1819             }
1820         }
1821         None
1822     }
1823 }
1824 
1825 #[cfg(feature = "arc_quota")]
1826 impl PassthroughFs {
1827     /// Convert u8 slice to string
string_from_u8_slice(&self, buf: &[u8]) -> io::Result<String>1828     fn string_from_u8_slice(&self, buf: &[u8]) -> io::Result<String> {
1829         match CStr::from_bytes_until_nul(buf).map(|s| s.to_string_lossy().to_string()) {
1830             Ok(s) => Ok(s),
1831             Err(e) => {
1832                 error!("fail to convert u8 slice to string: {}", e);
1833                 Err(io::Error::from_raw_os_error(libc::EINVAL))
1834             }
1835         }
1836     }
1837 
1838     /// Set permission according to path
set_permission(&self, st: &mut libc::stat64, path: &str)1839     fn set_permission(&self, st: &mut libc::stat64, path: &str) {
1840         for perm_data in self
1841             .permission_paths
1842             .read()
1843             .expect("acquire permission_paths read lock")
1844             .iter()
1845         {
1846             if perm_data.need_set_permission(path) {
1847                 st.st_uid = perm_data.guest_uid;
1848                 st.st_gid = perm_data.guest_gid;
1849                 st.st_mode = (st.st_mode & libc::S_IFMT) | (0o777 & !perm_data.umask);
1850             }
1851         }
1852     }
1853 
1854     /// Set host uid/gid to configured value according to path
change_creds(&self, ctx: &Context, parent_data: &InodeData, name: &CStr) -> (u32, u32)1855     fn change_creds(&self, ctx: &Context, parent_data: &InodeData, name: &CStr) -> (u32, u32) {
1856         let path = format!(
1857             "{}/{}",
1858             parent_data.path.clone(),
1859             name.to_str().unwrap_or("<non UTF-8 str>")
1860         );
1861 
1862         for perm_data in self
1863             .permission_paths
1864             .read()
1865             .expect("acquire permission_paths read lock")
1866             .iter()
1867         {
1868             if perm_data.need_set_permission(&path) {
1869                 return (perm_data.host_uid, perm_data.host_gid);
1870             }
1871         }
1872 
1873         (ctx.uid, ctx.gid)
1874     }
1875 
read_permission_data<R: io::Read>(&self, mut r: R) -> io::Result<PermissionData>1876     fn read_permission_data<R: io::Read>(&self, mut r: R) -> io::Result<PermissionData> {
1877         let mut fs_permission_data = FsPermissionDataBuffer::new_zeroed();
1878         r.read_exact(fs_permission_data.as_bytes_mut())?;
1879 
1880         let perm_path = self.string_from_u8_slice(&fs_permission_data.perm_path)?;
1881         if !perm_path.starts_with('/') {
1882             error!("FS_IOC_SETPERMISSION: perm path must start with '/'");
1883             return Err(io::Error::from_raw_os_error(libc::EINVAL));
1884         }
1885         Ok(PermissionData {
1886             guest_uid: fs_permission_data.guest_uid,
1887             guest_gid: fs_permission_data.guest_gid,
1888             host_uid: fs_permission_data.host_uid,
1889             host_gid: fs_permission_data.host_gid,
1890             umask: fs_permission_data.umask,
1891             perm_path,
1892         })
1893     }
1894 
1895     /// Sets uid/gid/umask for all files and directories under a specific path.
1896     ///
1897     /// This ioctl does not correspond to any upstream FUSE feature. It is used for arcvm
1898     /// It associates the specified path with the provide uid, gid, and umask values within the
1899     /// filesystem metadata.
1900     ///
1901     /// During subsequent lookup operations, the stored uid/gid/umask values are retrieved and
1902     /// applied to all files and directories found under the registered path. Before sending
1903     /// file stat information to the client, the uid and gid are substituted by `guest_uid` and
1904     /// `guest_gid` if the file falls under the registered path. The file mode is masked by the
1905     ///  umask.
1906     ///
1907     /// When the guest creates a file within the specified path, the file gid/uid stat in host
1908     /// will be overwritten to `host_uid` and `host_gid` values.
1909     ///
1910     /// This functionality enables dynamic configuration of ownership and permissions for a
1911     /// specific directory hierarchy within the filesystem.
1912     ///
1913     /// # Notes
1914     /// - This method affects all existing and future files under the registered path.
1915     /// - The original file ownership and permissions are overridden by the provided values.
1916     /// - The registered path should not be renamed
1917     /// - Refer go/remove-mount-passthrough-fuse for more design details
set_permission_by_path<R: io::Read>(&self, r: R) -> IoctlReply1918     fn set_permission_by_path<R: io::Read>(&self, r: R) -> IoctlReply {
1919         if self
1920             .permission_paths
1921             .read()
1922             .expect("acquire permission_paths read lock")
1923             .len()
1924             >= self.cfg.max_dynamic_perm
1925         {
1926             error!(
1927                 "FS_IOC_SETPERMISSION exceeds limits of max_dynamic_perm: {}",
1928                 self.cfg.max_dynamic_perm
1929             );
1930             return IoctlReply::Done(Err(io::Error::from_raw_os_error(libc::EPERM)));
1931         }
1932 
1933         let perm_data = match self.read_permission_data(r) {
1934             Ok(data) => data,
1935             Err(e) => {
1936                 error!("fail to read permission data: {}", e);
1937                 return IoctlReply::Done(Err(e));
1938             }
1939         };
1940 
1941         self.permission_paths
1942             .write()
1943             .expect("acquire permission_paths write lock")
1944             .push(perm_data);
1945 
1946         IoctlReply::Done(Ok(Vec::new()))
1947     }
1948 
1949     // Get xattr value according to path and name
get_xattr_by_path(&self, path: &str, name: &str) -> Option<String>1950     fn get_xattr_by_path(&self, path: &str, name: &str) -> Option<String> {
1951         self.xattr_paths
1952             .read()
1953             .expect("acquire permission_paths read lock")
1954             .iter()
1955             .find(|data| data.need_set_guest_xattr(path, name))
1956             .map(|data| data.xattr_value.clone())
1957     }
1958 
skip_host_set_xattr(&self, path: &str, name: &str) -> bool1959     fn skip_host_set_xattr(&self, path: &str, name: &str) -> bool {
1960         self.get_xattr_by_path(path, name).is_some()
1961     }
1962 
read_xattr_data<R: io::Read>(&self, mut r: R) -> io::Result<XattrData>1963     fn read_xattr_data<R: io::Read>(&self, mut r: R) -> io::Result<XattrData> {
1964         let mut fs_path_xattr_data = FsPathXattrDataBuffer::new_zeroed();
1965         r.read_exact(fs_path_xattr_data.as_bytes_mut())?;
1966 
1967         let xattr_path = self.string_from_u8_slice(&fs_path_xattr_data.path)?;
1968         if !xattr_path.starts_with('/') {
1969             error!("FS_IOC_SETPATHXATTR: perm path must start with '/'");
1970             return Err(io::Error::from_raw_os_error(libc::EINVAL));
1971         }
1972         let xattr_name = self.string_from_u8_slice(&fs_path_xattr_data.xattr_name)?;
1973         let xattr_value = self.string_from_u8_slice(&fs_path_xattr_data.xattr_value)?;
1974 
1975         Ok(XattrData {
1976             xattr_path,
1977             xattr_name,
1978             xattr_value,
1979         })
1980     }
1981 
1982     /// Sets xattr value for all files and directories under a specific path.
1983     ///
1984     /// This ioctl does not correspond to any upstream FUSE feature. It is used for arcvm.
1985     /// It associates the specified path and xattr name with a value.
1986     ///
1987     /// When the getxattr is called for the specified path and name, the predefined
1988     /// value is returned.
1989     ///
1990     /// # Notes
1991     /// - This method affects all existing and future files under the registered path.
1992     /// - The SECURITY_CONTEXT feature will be disabled if this ioctl is enabled.
1993     /// - The registered path should not be renamed
1994     /// - Refer go/remove-mount-passthrough-fuse for more design details
set_xattr_by_path<R: io::Read>(&self, r: R) -> IoctlReply1995     fn set_xattr_by_path<R: io::Read>(&self, r: R) -> IoctlReply {
1996         if self
1997             .xattr_paths
1998             .read()
1999             .expect("acquire xattr_paths read lock")
2000             .len()
2001             >= self.cfg.max_dynamic_xattr
2002         {
2003             error!(
2004                 "FS_IOC_SETPATHXATTR exceeds limits of max_dynamic_xattr: {}",
2005                 self.cfg.max_dynamic_xattr
2006             );
2007             return IoctlReply::Done(Err(io::Error::from_raw_os_error(libc::EPERM)));
2008         }
2009 
2010         let xattr_data = match self.read_xattr_data(r) {
2011             Ok(data) => data,
2012             Err(e) => {
2013                 error!("fail to read xattr data: {}", e);
2014                 return IoctlReply::Done(Err(e));
2015             }
2016         };
2017 
2018         self.xattr_paths
2019             .write()
2020             .expect("acquire xattr_paths write lock")
2021             .push(xattr_data);
2022 
2023         IoctlReply::Done(Ok(Vec::new()))
2024     }
2025 
do_getxattr_with_filter( &self, data: Arc<InodeData>, name: Cow<CStr>, buf: &mut [u8], ) -> io::Result<usize>2026     fn do_getxattr_with_filter(
2027         &self,
2028         data: Arc<InodeData>,
2029         name: Cow<CStr>,
2030         buf: &mut [u8],
2031     ) -> io::Result<usize> {
2032         let res: usize = match self.get_xattr_by_path(&data.path, &name.to_string_lossy()) {
2033             Some(predifined_xattr) => {
2034                 let x = predifined_xattr.into_bytes();
2035                 if x.len() > buf.len() {
2036                     return Err(io::Error::from_raw_os_error(libc::ERANGE));
2037                 }
2038                 buf[..x.len()].copy_from_slice(&x);
2039                 x.len()
2040             }
2041             None => self.do_getxattr(&data, &name, &mut buf[..])?,
2042         };
2043         Ok(res)
2044     }
2045 }
2046 
2047 /// Decrements the refcount of the inode.
2048 /// Returns `true` if the refcount became 0.
forget_one( inodes: &mut MultikeyBTreeMap<Inode, InodeAltKey, Arc<InodeData>>, inode: Inode, count: u64, ) -> bool2049 fn forget_one(
2050     inodes: &mut MultikeyBTreeMap<Inode, InodeAltKey, Arc<InodeData>>,
2051     inode: Inode,
2052     count: u64,
2053 ) -> bool {
2054     if let Some(data) = inodes.get(&inode) {
2055         // Acquiring the write lock on the inode map prevents new lookups from incrementing the
2056         // refcount but there is the possibility that a previous lookup already acquired a
2057         // reference to the inode data and is in the process of updating the refcount so we need
2058         // to loop here until we can decrement successfully.
2059         loop {
2060             let refcount = data.refcount.load(Ordering::Relaxed);
2061 
2062             // Saturating sub because it doesn't make sense for a refcount to go below zero and
2063             // we don't want misbehaving clients to cause integer overflow.
2064             let new_count = refcount.saturating_sub(count);
2065 
2066             // Synchronizes with the acquire load in `do_lookup`.
2067             if data
2068                 .refcount
2069                 .compare_exchange_weak(refcount, new_count, Ordering::Release, Ordering::Relaxed)
2070                 .is_ok()
2071             {
2072                 if new_count == 0 {
2073                     // We just removed the last refcount for this inode. There's no need for an
2074                     // acquire fence here because we hold a write lock on the inode map and any
2075                     // thread that is waiting to do a forget on the same inode will have to wait
2076                     // until we release the lock. So there's is no other release store for us to
2077                     // synchronize with before deleting the entry.
2078                     inodes.remove(&inode);
2079                     return true;
2080                 }
2081                 break;
2082             }
2083         }
2084     }
2085     false
2086 }
2087 
2088 // Strips any `user.virtiofs.` prefix from `buf`. If buf contains one or more nul-bytes, each
2089 // nul-byte-separated slice is treated as a C string and the prefix is stripped from each one.
strip_xattr_prefix(buf: &mut Vec<u8>)2090 fn strip_xattr_prefix(buf: &mut Vec<u8>) {
2091     fn next_cstr(b: &[u8], start: usize) -> Option<&[u8]> {
2092         if start >= b.len() {
2093             return None;
2094         }
2095 
2096         let end = b[start..]
2097             .iter()
2098             .position(|&c| c == b'\0')
2099             .map(|p| start + p + 1)
2100             .unwrap_or(b.len());
2101 
2102         Some(&b[start..end])
2103     }
2104 
2105     let mut pos = 0;
2106     while let Some(name) = next_cstr(buf, pos) {
2107         if !name.starts_with(USER_VIRTIOFS_XATTR) {
2108             pos += name.len();
2109             continue;
2110         }
2111 
2112         let newlen = name.len() - USER_VIRTIOFS_XATTR.len();
2113         buf.drain(pos..pos + USER_VIRTIOFS_XATTR.len());
2114         pos += newlen;
2115     }
2116 }
2117 
2118 impl FileSystem for PassthroughFs {
2119     type Inode = Inode;
2120     type Handle = Handle;
2121     type DirIter = ReadDir<Box<[u8]>>;
2122 
init(&self, capable: FsOptions) -> io::Result<FsOptions>2123     fn init(&self, capable: FsOptions) -> io::Result<FsOptions> {
2124         let root = CString::new(self.root_dir.clone())
2125             .map_err(|e| io::Error::new(io::ErrorKind::InvalidInput, e))?;
2126 
2127         let flags = libc::O_DIRECTORY | libc::O_NOFOLLOW | libc::O_CLOEXEC;
2128         // SAFETY: this doesn't modify any memory and we check the return value.
2129         let raw_descriptor = unsafe { libc::openat64(libc::AT_FDCWD, root.as_ptr(), flags) };
2130         if raw_descriptor < 0 {
2131             return Err(io::Error::last_os_error());
2132         }
2133 
2134         // SAFETY: safe because we just opened this descriptor above.
2135         let f = unsafe { File::from_raw_descriptor(raw_descriptor) };
2136 
2137         let st = stat(&f)?;
2138 
2139         // SAFETY: this doesn't modify any memory and there is no need to check the return
2140         // value because this system call always succeeds. We need to clear the umask here because
2141         // we want the client to be able to set all the bits in the mode.
2142         unsafe { libc::umask(0o000) };
2143 
2144         let mut inodes = self.inodes.lock();
2145 
2146         // Not sure why the root inode gets a refcount of 2 but that's what libfuse does.
2147         inodes.insert(
2148             ROOT_ID,
2149             InodeAltKey {
2150                 ino: st.st_ino,
2151                 dev: st.st_dev,
2152             },
2153             Arc::new(InodeData {
2154                 inode: ROOT_ID,
2155                 file: Mutex::new((f, flags)),
2156                 refcount: AtomicU64::new(2),
2157                 filetype: st.st_mode.into(),
2158                 path: "".to_string(),
2159             }),
2160         );
2161 
2162         let mut opts = FsOptions::DO_READDIRPLUS
2163             | FsOptions::READDIRPLUS_AUTO
2164             | FsOptions::EXPORT_SUPPORT
2165             | FsOptions::DONT_MASK
2166             | FsOptions::CACHE_SYMLINKS;
2167 
2168         // Device using dynamic xattr feature will have different security context in
2169         // host and guests. The SECURITY_CONTEXT feature should not be enabled in the
2170         // device.
2171         if self.cfg.max_dynamic_xattr == 0 && self.cfg.security_ctx {
2172             opts |= FsOptions::SECURITY_CONTEXT;
2173         }
2174 
2175         if self.cfg.posix_acl {
2176             opts |= FsOptions::POSIX_ACL;
2177         }
2178         if self.cfg.writeback && capable.contains(FsOptions::WRITEBACK_CACHE) {
2179             opts |= FsOptions::WRITEBACK_CACHE;
2180             self.writeback.store(true, Ordering::Relaxed);
2181         }
2182         if self.cfg.cache_policy == CachePolicy::Always {
2183             if capable.contains(FsOptions::ZERO_MESSAGE_OPEN) {
2184                 opts |= FsOptions::ZERO_MESSAGE_OPEN;
2185                 self.zero_message_open.store(true, Ordering::Relaxed);
2186             }
2187             if capable.contains(FsOptions::ZERO_MESSAGE_OPENDIR) {
2188                 opts |= FsOptions::ZERO_MESSAGE_OPENDIR;
2189                 self.zero_message_opendir.store(true, Ordering::Relaxed);
2190             }
2191         }
2192         Ok(opts)
2193     }
2194 
destroy(&self)2195     fn destroy(&self) {
2196         cros_tracing::trace_simple_print!(VirtioFs, "{:?}: destroy", self);
2197         self.handles.lock().clear();
2198         self.inodes.lock().clear();
2199     }
2200 
statfs(&self, _ctx: Context, inode: Inode) -> io::Result<libc::statvfs64>2201     fn statfs(&self, _ctx: Context, inode: Inode) -> io::Result<libc::statvfs64> {
2202         let _trace = fs_trace!(self.tag, "statfs", inode);
2203         let data = self.find_inode(inode)?;
2204 
2205         let mut out = MaybeUninit::<libc::statvfs64>::zeroed();
2206 
2207         // SAFETY: this will only modify `out` and we check the return value.
2208         syscall!(unsafe { libc::fstatvfs64(data.as_raw_descriptor(), out.as_mut_ptr()) })?;
2209 
2210         // SAFETY: the kernel guarantees that `out` has been initialized.
2211         Ok(unsafe { out.assume_init() })
2212     }
2213 
lookup(&self, _ctx: Context, parent: Inode, name: &CStr) -> io::Result<Entry>2214     fn lookup(&self, _ctx: Context, parent: Inode, name: &CStr) -> io::Result<Entry> {
2215         let data = self.find_inode(parent)?;
2216         #[allow(unused_variables)]
2217         let path = format!(
2218             "{}/{}",
2219             data.path,
2220             name.to_str().unwrap_or("<non UTF-8 path>")
2221         );
2222         let _trace = fs_trace!(self.tag, "lookup", parent, path);
2223 
2224         let mut res = self.do_lookup_with_casefold_fallback(&data, name);
2225 
2226         // FUSE takes a inode=0 as a request to do negative dentry cache.
2227         // So, if `negative_timeout` is set, return success with the timeout value and inode=0 as a
2228         // response.
2229         if let Err(e) = &res {
2230             if e.kind() == std::io::ErrorKind::NotFound && !self.cfg.negative_timeout.is_zero() {
2231                 res = Ok(Entry::new_negative(self.cfg.negative_timeout));
2232             }
2233         }
2234 
2235         res
2236     }
2237 
forget(&self, _ctx: Context, inode: Inode, count: u64)2238     fn forget(&self, _ctx: Context, inode: Inode, count: u64) {
2239         let _trace = fs_trace!(self.tag, "forget", inode, count);
2240         let mut inodes = self.inodes.lock();
2241         let caches = self.lock_casefold_lookup_caches();
2242         if forget_one(&mut inodes, inode, count) {
2243             if let Some(mut c) = caches {
2244                 c.forget(inode);
2245             }
2246         }
2247     }
2248 
batch_forget(&self, _ctx: Context, requests: Vec<(Inode, u64)>)2249     fn batch_forget(&self, _ctx: Context, requests: Vec<(Inode, u64)>) {
2250         let mut inodes = self.inodes.lock();
2251         let mut caches = self.lock_casefold_lookup_caches();
2252         for (inode, count) in requests {
2253             if forget_one(&mut inodes, inode, count) {
2254                 if let Some(c) = caches.as_mut() {
2255                     c.forget(inode);
2256                 }
2257             }
2258         }
2259     }
2260 
opendir( &self, _ctx: Context, inode: Inode, flags: u32, ) -> io::Result<(Option<Handle>, OpenOptions)>2261     fn opendir(
2262         &self,
2263         _ctx: Context,
2264         inode: Inode,
2265         flags: u32,
2266     ) -> io::Result<(Option<Handle>, OpenOptions)> {
2267         let _trace = fs_trace!(self.tag, "opendir", inode, flags);
2268         if self.zero_message_opendir.load(Ordering::Relaxed) {
2269             Err(io::Error::from_raw_os_error(libc::ENOSYS))
2270         } else {
2271             self.do_open(inode, flags | (libc::O_DIRECTORY as u32))
2272         }
2273     }
2274 
releasedir( &self, _ctx: Context, inode: Inode, _flags: u32, handle: Handle, ) -> io::Result<()>2275     fn releasedir(
2276         &self,
2277         _ctx: Context,
2278         inode: Inode,
2279         _flags: u32,
2280         handle: Handle,
2281     ) -> io::Result<()> {
2282         let _trace = fs_trace!(self.tag, "releasedir", inode, handle);
2283         if self.zero_message_opendir.load(Ordering::Relaxed) {
2284             Ok(())
2285         } else {
2286             self.do_release(inode, handle)
2287         }
2288     }
2289 
mkdir( &self, ctx: Context, parent: Inode, name: &CStr, mode: u32, umask: u32, security_ctx: Option<&CStr>, ) -> io::Result<Entry>2290     fn mkdir(
2291         &self,
2292         ctx: Context,
2293         parent: Inode,
2294         name: &CStr,
2295         mode: u32,
2296         umask: u32,
2297         security_ctx: Option<&CStr>,
2298     ) -> io::Result<Entry> {
2299         let _trace = fs_trace!(self.tag, "mkdir", parent, name, mode, umask, security_ctx);
2300         let data = self.find_inode(parent)?;
2301 
2302         let _ctx = security_ctx
2303             .filter(|ctx| *ctx != UNLABELED_CSTR)
2304             .map(|ctx| ScopedSecurityContext::new(&self.proc, ctx))
2305             .transpose()?;
2306 
2307         #[allow(unused_variables)]
2308         #[cfg(feature = "arc_quota")]
2309         let (uid, gid) = self.change_creds(&ctx, &data, name);
2310         #[cfg(feature = "fs_runtime_ugid_map")]
2311         let (uid, gid) = self.change_ugid_creds(&ctx, &data, name);
2312         #[cfg(not(feature = "fs_permission_translation"))]
2313         let (uid, gid) = (ctx.uid, ctx.gid);
2314 
2315         let (_uid, _gid) = set_creds(uid, gid)?;
2316         {
2317             let casefold_cache = self.lock_casefold_lookup_caches();
2318             let _scoped_umask = ScopedUmask::new(umask);
2319 
2320             // SAFETY: this doesn't modify any memory and we check the return value.
2321             syscall!(unsafe { libc::mkdirat(data.as_raw_descriptor(), name.as_ptr(), mode) })?;
2322             if let Some(mut c) = casefold_cache {
2323                 c.insert(data.inode, name);
2324             }
2325         }
2326         self.do_lookup(&data, name)
2327     }
2328 
rmdir(&self, _ctx: Context, parent: Inode, name: &CStr) -> io::Result<()>2329     fn rmdir(&self, _ctx: Context, parent: Inode, name: &CStr) -> io::Result<()> {
2330         let _trace = fs_trace!(self.tag, "rmdir", parent, name);
2331         let data = self.find_inode(parent)?;
2332         let casefold_cache = self.lock_casefold_lookup_caches();
2333         // TODO(b/278691962): If ascii_casefold is enabled, we need to call
2334         // `get_case_unfolded_name()` to get the actual name to be unlinked.
2335         self.do_unlink(&data, name, libc::AT_REMOVEDIR)?;
2336         if let Some(mut c) = casefold_cache {
2337             c.remove(data.inode, name);
2338         }
2339         Ok(())
2340     }
2341 
readdir( &self, _ctx: Context, inode: Inode, handle: Handle, size: u32, offset: u64, ) -> io::Result<Self::DirIter>2342     fn readdir(
2343         &self,
2344         _ctx: Context,
2345         inode: Inode,
2346         handle: Handle,
2347         size: u32,
2348         offset: u64,
2349     ) -> io::Result<Self::DirIter> {
2350         let _trace = fs_trace!(self.tag, "readdir", inode, handle, size, offset);
2351         let buf = vec![0; size as usize].into_boxed_slice();
2352 
2353         if self.zero_message_opendir.load(Ordering::Relaxed) {
2354             let data = self.find_inode(inode)?;
2355             ReadDir::new(&*data, offset as libc::off64_t, buf)
2356         } else {
2357             let data = self.find_handle(handle, inode)?;
2358 
2359             let dir = data.file.lock();
2360 
2361             ReadDir::new(&*dir, offset as libc::off64_t, buf)
2362         }
2363     }
2364 
open( &self, _ctx: Context, inode: Inode, flags: u32, ) -> io::Result<(Option<Handle>, OpenOptions)>2365     fn open(
2366         &self,
2367         _ctx: Context,
2368         inode: Inode,
2369         flags: u32,
2370     ) -> io::Result<(Option<Handle>, OpenOptions)> {
2371         if self.zero_message_open.load(Ordering::Relaxed) {
2372             let _trace = fs_trace!(self.tag, "open (zero-message)", inode, flags);
2373             Err(io::Error::from_raw_os_error(libc::ENOSYS))
2374         } else {
2375             let _trace = fs_trace!(self.tag, "open", inode, flags);
2376             self.do_open(inode, flags)
2377         }
2378     }
2379 
release( &self, _ctx: Context, inode: Inode, _flags: u32, handle: Handle, _flush: bool, _flock_release: bool, _lock_owner: Option<u64>, ) -> io::Result<()>2380     fn release(
2381         &self,
2382         _ctx: Context,
2383         inode: Inode,
2384         _flags: u32,
2385         handle: Handle,
2386         _flush: bool,
2387         _flock_release: bool,
2388         _lock_owner: Option<u64>,
2389     ) -> io::Result<()> {
2390         if self.zero_message_open.load(Ordering::Relaxed) {
2391             let _trace = fs_trace!(self.tag, "release (zero-message)", inode, handle);
2392             Ok(())
2393         } else {
2394             let _trace = fs_trace!(self.tag, "release", inode, handle);
2395             self.do_release(inode, handle)
2396         }
2397     }
2398 
chromeos_tmpfile( &self, ctx: Context, parent: Self::Inode, mode: u32, umask: u32, security_ctx: Option<&CStr>, ) -> io::Result<Entry>2399     fn chromeos_tmpfile(
2400         &self,
2401         ctx: Context,
2402         parent: Self::Inode,
2403         mode: u32,
2404         umask: u32,
2405         security_ctx: Option<&CStr>,
2406     ) -> io::Result<Entry> {
2407         let _trace = fs_trace!(
2408             self.tag,
2409             "chromeos_tempfile",
2410             parent,
2411             mode,
2412             umask,
2413             security_ctx
2414         );
2415         let data = self.find_inode(parent)?;
2416 
2417         let _ctx = security_ctx
2418             .filter(|ctx| *ctx != UNLABELED_CSTR)
2419             .map(|ctx| ScopedSecurityContext::new(&self.proc, ctx))
2420             .transpose()?;
2421 
2422         let tmpflags = libc::O_RDWR | libc::O_TMPFILE | libc::O_CLOEXEC | libc::O_NOFOLLOW;
2423 
2424         let current_dir = c".";
2425 
2426         #[allow(unused_variables)]
2427         #[cfg(feature = "arc_quota")]
2428         let (uid, gid) = self.change_creds(&ctx, &data, current_dir);
2429         #[cfg(feature = "fs_runtime_ugid_map")]
2430         let (uid, gid) = self.change_ugid_creds(&ctx, &data, current_dir);
2431         #[cfg(not(feature = "fs_permission_translation"))]
2432         let (uid, gid) = (ctx.uid, ctx.gid);
2433 
2434         let (_uid, _gid) = set_creds(uid, gid)?;
2435 
2436         let fd = {
2437             let _scoped_umask = ScopedUmask::new(umask);
2438 
2439             // SAFETY: this doesn't modify any memory and we check the return value.
2440             syscall!(unsafe {
2441                 libc::openat64(
2442                     data.as_raw_descriptor(),
2443                     current_dir.as_ptr(),
2444                     tmpflags,
2445                     mode,
2446                 )
2447             })?
2448         };
2449         // No need to add casefold_cache becuase we created an anonymous file.
2450 
2451         // SAFETY: safe because we just opened this fd.
2452         let tmpfile = unsafe { File::from_raw_descriptor(fd) };
2453         let st = stat(&tmpfile)?;
2454         let path = format!(
2455             "{}/{}",
2456             data.path.clone(),
2457             current_dir.to_str().unwrap_or("<non UTF-8 str>")
2458         );
2459         Ok(self.add_entry(tmpfile, st, tmpflags, path))
2460     }
2461 
create( &self, ctx: Context, parent: Inode, name: &CStr, mode: u32, flags: u32, umask: u32, security_ctx: Option<&CStr>, ) -> io::Result<(Entry, Option<Handle>, OpenOptions)>2462     fn create(
2463         &self,
2464         ctx: Context,
2465         parent: Inode,
2466         name: &CStr,
2467         mode: u32,
2468         flags: u32,
2469         umask: u32,
2470         security_ctx: Option<&CStr>,
2471     ) -> io::Result<(Entry, Option<Handle>, OpenOptions)> {
2472         let _trace = fs_trace!(
2473             self.tag,
2474             "create",
2475             parent,
2476             name,
2477             mode,
2478             flags,
2479             umask,
2480             security_ctx
2481         );
2482         let data = self.find_inode(parent)?;
2483 
2484         let _ctx = security_ctx
2485             .filter(|ctx| *ctx != UNLABELED_CSTR)
2486             .map(|ctx| ScopedSecurityContext::new(&self.proc, ctx))
2487             .transpose()?;
2488 
2489         #[allow(unused_variables)]
2490         #[cfg(feature = "arc_quota")]
2491         let (uid, gid) = self.change_creds(&ctx, &data, name);
2492         #[cfg(feature = "fs_runtime_ugid_map")]
2493         let (uid, gid) = self.change_ugid_creds(&ctx, &data, name);
2494         #[cfg(not(feature = "fs_permission_translation"))]
2495         let (uid, gid) = (ctx.uid, ctx.gid);
2496 
2497         let (_uid, _gid) = set_creds(uid, gid)?;
2498 
2499         let flags = self.update_open_flags(flags as i32);
2500         let create_flags =
2501             (flags | libc::O_CREAT | libc::O_CLOEXEC | libc::O_NOFOLLOW) & !libc::O_DIRECT;
2502 
2503         let fd = {
2504             let _scoped_umask = ScopedUmask::new(umask);
2505             let casefold_cache = self.lock_casefold_lookup_caches();
2506 
2507             // SAFETY: this doesn't modify any memory and we check the return value. We don't really
2508             // check `flags` because if the kernel can't handle poorly specified flags then we have
2509             // much bigger problems.
2510             // TODO(b/278691962): If ascii_casefold is enabled, we need to call
2511             // `get_case_unfolded_name()` to get the actual name to be created.
2512             let fd = syscall!(unsafe {
2513                 libc::openat64(data.as_raw_descriptor(), name.as_ptr(), create_flags, mode)
2514             })?;
2515             if let Some(mut c) = casefold_cache {
2516                 c.insert(parent, name);
2517             }
2518             fd
2519         };
2520 
2521         // SAFETY: safe because we just opened this fd.
2522         let file = unsafe { File::from_raw_descriptor(fd) };
2523 
2524         let st = stat(&file)?;
2525         let path = format!(
2526             "{}/{}",
2527             data.path.clone(),
2528             name.to_str().unwrap_or("<non UTF-8 str>")
2529         );
2530         let entry = self.add_entry(file, st, create_flags, path);
2531 
2532         let (handle, opts) = if self.zero_message_open.load(Ordering::Relaxed) {
2533             (None, OpenOptions::KEEP_CACHE)
2534         } else {
2535             self.do_open_at(
2536                 data,
2537                 name,
2538                 entry.inode,
2539                 flags as u32 & !((libc::O_CREAT | libc::O_EXCL | libc::O_NOCTTY) as u32),
2540             )
2541             .inspect_err(|_e| {
2542                 // Don't leak the entry.
2543                 self.forget(ctx, entry.inode, 1);
2544             })?
2545         };
2546         Ok((entry, handle, opts))
2547     }
2548 
unlink(&self, _ctx: Context, parent: Inode, name: &CStr) -> io::Result<()>2549     fn unlink(&self, _ctx: Context, parent: Inode, name: &CStr) -> io::Result<()> {
2550         let _trace = fs_trace!(self.tag, "unlink", parent, name);
2551         let data = self.find_inode(parent)?;
2552         let casefold_cache = self.lock_casefold_lookup_caches();
2553         // TODO(b/278691962): If ascii_casefold is enabled, we need to call
2554         // `get_case_unfolded_name()` to get the actual name to be unlinked.
2555         self.do_unlink(&data, name, 0)?;
2556         if let Some(mut c) = casefold_cache {
2557             c.remove(data.inode, name);
2558         }
2559         Ok(())
2560     }
2561 
read<W: io::Write + ZeroCopyWriter>( &self, _ctx: Context, inode: Inode, handle: Handle, mut w: W, size: u32, offset: u64, _lock_owner: Option<u64>, _flags: u32, ) -> io::Result<usize>2562     fn read<W: io::Write + ZeroCopyWriter>(
2563         &self,
2564         _ctx: Context,
2565         inode: Inode,
2566         handle: Handle,
2567         mut w: W,
2568         size: u32,
2569         offset: u64,
2570         _lock_owner: Option<u64>,
2571         _flags: u32,
2572     ) -> io::Result<usize> {
2573         if self.zero_message_open.load(Ordering::Relaxed) {
2574             let _trace = fs_trace!(self.tag, "read (zero-message)", inode, handle, size, offset);
2575             let data = self.find_inode(inode)?;
2576 
2577             let mut file = data.file.lock();
2578             let mut flags = file.1;
2579             match flags & libc::O_ACCMODE {
2580                 libc::O_WRONLY => {
2581                     flags &= !libc::O_WRONLY;
2582                     flags |= libc::O_RDWR;
2583 
2584                     // We need to get a readable handle for this file.
2585                     let newfile = self.open_fd(file.0.as_raw_descriptor(), libc::O_RDWR)?;
2586                     *file = (newfile, flags);
2587                 }
2588                 libc::O_RDONLY | libc::O_RDWR => {}
2589                 _ => panic!("Unexpected flags: {:#x}", flags),
2590             }
2591 
2592             w.write_from(&mut file.0, size as usize, offset)
2593         } else {
2594             let _trace = fs_trace!(self.tag, "read", inode, handle, size, offset);
2595             let data = self.find_handle(handle, inode)?;
2596 
2597             let mut f = data.file.lock();
2598             w.write_from(&mut f, size as usize, offset)
2599         }
2600     }
2601 
write<R: io::Read + ZeroCopyReader>( &self, _ctx: Context, inode: Inode, handle: Handle, mut r: R, size: u32, offset: u64, _lock_owner: Option<u64>, _delayed_write: bool, flags: u32, ) -> io::Result<usize>2602     fn write<R: io::Read + ZeroCopyReader>(
2603         &self,
2604         _ctx: Context,
2605         inode: Inode,
2606         handle: Handle,
2607         mut r: R,
2608         size: u32,
2609         offset: u64,
2610         _lock_owner: Option<u64>,
2611         _delayed_write: bool,
2612         flags: u32,
2613     ) -> io::Result<usize> {
2614         // When the WRITE_KILL_PRIV flag is set, drop CAP_FSETID so that the kernel will
2615         // automatically clear the setuid and setgid bits for us.
2616         let _fsetid = if flags & WRITE_KILL_PRIV != 0 {
2617             Some(drop_cap_fsetid()?)
2618         } else {
2619             None
2620         };
2621 
2622         if self.zero_message_open.load(Ordering::Relaxed) {
2623             let _trace = fs_trace!(
2624                 self.tag,
2625                 "write (zero-message)",
2626                 inode,
2627                 handle,
2628                 size,
2629                 offset
2630             );
2631 
2632             let data = self.find_inode(inode)?;
2633 
2634             let mut file = data.file.lock();
2635             let mut flags = file.1;
2636             match flags & libc::O_ACCMODE {
2637                 libc::O_RDONLY => {
2638                     flags &= !libc::O_RDONLY;
2639                     flags |= libc::O_RDWR;
2640 
2641                     // We need to get a writable handle for this file.
2642                     let newfile = self.open_fd(file.0.as_raw_descriptor(), libc::O_RDWR)?;
2643                     *file = (newfile, flags);
2644                 }
2645                 libc::O_WRONLY | libc::O_RDWR => {}
2646                 _ => panic!("Unexpected flags: {:#x}", flags),
2647             }
2648 
2649             r.read_to(&mut file.0, size as usize, offset)
2650         } else {
2651             let _trace = fs_trace!(self.tag, "write", inode, handle, size, offset);
2652 
2653             let data = self.find_handle(handle, inode)?;
2654 
2655             let mut f = data.file.lock();
2656             r.read_to(&mut f, size as usize, offset)
2657         }
2658     }
2659 
getattr( &self, _ctx: Context, inode: Inode, _handle: Option<Handle>, ) -> io::Result<(libc::stat64, Duration)>2660     fn getattr(
2661         &self,
2662         _ctx: Context,
2663         inode: Inode,
2664         _handle: Option<Handle>,
2665     ) -> io::Result<(libc::stat64, Duration)> {
2666         let _trace = fs_trace!(self.tag, "getattr", inode, _handle);
2667 
2668         let data = self.find_inode(inode)?;
2669         self.do_getattr(&data)
2670     }
2671 
setattr( &self, _ctx: Context, inode: Inode, attr: libc::stat64, handle: Option<Handle>, valid: SetattrValid, ) -> io::Result<(libc::stat64, Duration)>2672     fn setattr(
2673         &self,
2674         _ctx: Context,
2675         inode: Inode,
2676         attr: libc::stat64,
2677         handle: Option<Handle>,
2678         valid: SetattrValid,
2679     ) -> io::Result<(libc::stat64, Duration)> {
2680         let _trace = fs_trace!(self.tag, "setattr", inode, handle);
2681         let inode_data = self.find_inode(inode)?;
2682 
2683         enum Data<'a> {
2684             Handle(MutexGuard<'a, File>),
2685             ProcPath(CString),
2686         }
2687 
2688         // If we have a handle then use it otherwise get a new fd from the inode.
2689         let hd;
2690         let data = if let Some(handle) = handle.filter(|&h| h != 0) {
2691             hd = self.find_handle(handle, inode)?;
2692             Data::Handle(hd.file.lock())
2693         } else {
2694             let pathname = CString::new(format!("self/fd/{}", inode_data.as_raw_descriptor()))
2695                 .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
2696             Data::ProcPath(pathname)
2697         };
2698 
2699         if valid.contains(SetattrValid::MODE) {
2700             // SAFETY: this doesn't modify any memory and we check the return value.
2701             syscall!(unsafe {
2702                 match data {
2703                     Data::Handle(ref fd) => libc::fchmod(fd.as_raw_descriptor(), attr.st_mode),
2704                     Data::ProcPath(ref p) => {
2705                         libc::fchmodat(self.proc.as_raw_descriptor(), p.as_ptr(), attr.st_mode, 0)
2706                     }
2707                 }
2708             })?;
2709         }
2710 
2711         if valid.intersects(SetattrValid::UID | SetattrValid::GID) {
2712             let uid = if valid.contains(SetattrValid::UID) {
2713                 attr.st_uid
2714             } else {
2715                 // Cannot use -1 here because these are unsigned values.
2716                 u32::MAX
2717             };
2718             let gid = if valid.contains(SetattrValid::GID) {
2719                 attr.st_gid
2720             } else {
2721                 // Cannot use -1 here because these are unsigned values.
2722                 u32::MAX
2723             };
2724 
2725             // SAFETY: this doesn't modify any memory and we check the return value.
2726             syscall!(unsafe {
2727                 libc::fchownat(
2728                     inode_data.as_raw_descriptor(),
2729                     EMPTY_CSTR.as_ptr(),
2730                     uid,
2731                     gid,
2732                     libc::AT_EMPTY_PATH | libc::AT_SYMLINK_NOFOLLOW,
2733                 )
2734             })?;
2735         }
2736 
2737         if valid.contains(SetattrValid::SIZE) {
2738             syscall!(match data {
2739                 Data::Handle(ref fd) => {
2740                     // SAFETY: this doesn't modify any memory and we check the return value.
2741                     unsafe { libc::ftruncate64(fd.as_raw_descriptor(), attr.st_size) }
2742                 }
2743                 _ => {
2744                     // There is no `ftruncateat` so we need to get a new fd and truncate it.
2745                     let f = self.open_inode(&inode_data, libc::O_NONBLOCK | libc::O_RDWR)?;
2746                     // SAFETY: this doesn't modify any memory and we check the return value.
2747                     unsafe { libc::ftruncate64(f.as_raw_descriptor(), attr.st_size) }
2748                 }
2749             })?;
2750         }
2751 
2752         if valid.intersects(SetattrValid::ATIME | SetattrValid::MTIME) {
2753             let mut tvs = [
2754                 libc::timespec {
2755                     tv_sec: 0,
2756                     tv_nsec: libc::UTIME_OMIT,
2757                 },
2758                 libc::timespec {
2759                     tv_sec: 0,
2760                     tv_nsec: libc::UTIME_OMIT,
2761                 },
2762             ];
2763 
2764             if valid.contains(SetattrValid::ATIME_NOW) {
2765                 tvs[0].tv_nsec = libc::UTIME_NOW;
2766             } else if valid.contains(SetattrValid::ATIME) {
2767                 tvs[0].tv_sec = attr.st_atime;
2768                 tvs[0].tv_nsec = attr.st_atime_nsec;
2769             }
2770 
2771             if valid.contains(SetattrValid::MTIME_NOW) {
2772                 tvs[1].tv_nsec = libc::UTIME_NOW;
2773             } else if valid.contains(SetattrValid::MTIME) {
2774                 tvs[1].tv_sec = attr.st_mtime;
2775                 tvs[1].tv_nsec = attr.st_mtime_nsec;
2776             }
2777 
2778             // SAFETY: this doesn't modify any memory and we check the return value.
2779             syscall!(unsafe {
2780                 match data {
2781                     Data::Handle(ref fd) => libc::futimens(fd.as_raw_descriptor(), tvs.as_ptr()),
2782                     Data::ProcPath(ref p) => {
2783                         libc::utimensat(self.proc.as_raw_descriptor(), p.as_ptr(), tvs.as_ptr(), 0)
2784                     }
2785                 }
2786             })?;
2787         }
2788 
2789         self.do_getattr(&inode_data)
2790     }
2791 
rename( &self, _ctx: Context, olddir: Inode, oldname: &CStr, newdir: Inode, newname: &CStr, flags: u32, ) -> io::Result<()>2792     fn rename(
2793         &self,
2794         _ctx: Context,
2795         olddir: Inode,
2796         oldname: &CStr,
2797         newdir: Inode,
2798         newname: &CStr,
2799         flags: u32,
2800     ) -> io::Result<()> {
2801         let _trace = fs_trace!(self.tag, "rename", olddir, oldname, newdir, newname, flags);
2802 
2803         let old_inode = self.find_inode(olddir)?;
2804         let new_inode = self.find_inode(newdir)?;
2805         {
2806             let casefold_cache = self.lock_casefold_lookup_caches();
2807 
2808             // SAFETY: this doesn't modify any memory and we check the return value.
2809             // TODO: Switch to libc::renameat2 once https://github.com/rust-lang/libc/pull/1508 lands
2810             // and we have glibc 2.28.
2811             syscall!(unsafe {
2812                 libc::syscall(
2813                     libc::SYS_renameat2,
2814                     old_inode.as_raw_descriptor(),
2815                     oldname.as_ptr(),
2816                     new_inode.as_raw_descriptor(),
2817                     newname.as_ptr(),
2818                     flags,
2819                 )
2820             })?;
2821             if let Some(mut c) = casefold_cache {
2822                 c.remove(olddir, oldname);
2823                 c.insert(newdir, newname);
2824             }
2825         }
2826 
2827         Ok(())
2828     }
2829 
mknod( &self, ctx: Context, parent: Inode, name: &CStr, mode: u32, rdev: u32, umask: u32, security_ctx: Option<&CStr>, ) -> io::Result<Entry>2830     fn mknod(
2831         &self,
2832         ctx: Context,
2833         parent: Inode,
2834         name: &CStr,
2835         mode: u32,
2836         rdev: u32,
2837         umask: u32,
2838         security_ctx: Option<&CStr>,
2839     ) -> io::Result<Entry> {
2840         let _trace = fs_trace!(
2841             self.tag,
2842             "mknod",
2843             parent,
2844             name,
2845             mode,
2846             rdev,
2847             umask,
2848             security_ctx
2849         );
2850         let data = self.find_inode(parent)?;
2851 
2852         let _ctx = security_ctx
2853             .filter(|ctx| *ctx != UNLABELED_CSTR)
2854             .map(|ctx| ScopedSecurityContext::new(&self.proc, ctx))
2855             .transpose()?;
2856 
2857         #[allow(unused_variables)]
2858         #[cfg(feature = "arc_quota")]
2859         let (uid, gid) = self.change_creds(&ctx, &data, name);
2860         #[cfg(feature = "fs_runtime_ugid_map")]
2861         let (uid, gid) = self.change_ugid_creds(&ctx, &data, name);
2862         #[cfg(not(feature = "fs_permission_translation"))]
2863         let (uid, gid) = (ctx.uid, ctx.gid);
2864 
2865         let (_uid, _gid) = set_creds(uid, gid)?;
2866         {
2867             let _scoped_umask = ScopedUmask::new(umask);
2868             let casefold_cache = self.lock_casefold_lookup_caches();
2869 
2870             // SAFETY: this doesn't modify any memory and we check the return value.
2871             syscall!(unsafe {
2872                 libc::mknodat(
2873                     data.as_raw_descriptor(),
2874                     name.as_ptr(),
2875                     mode as libc::mode_t,
2876                     rdev as libc::dev_t,
2877                 )
2878             })?;
2879             if let Some(mut c) = casefold_cache {
2880                 c.insert(parent, name);
2881             }
2882         }
2883 
2884         self.do_lookup(&data, name)
2885     }
2886 
link( &self, _ctx: Context, inode: Inode, newparent: Inode, newname: &CStr, ) -> io::Result<Entry>2887     fn link(
2888         &self,
2889         _ctx: Context,
2890         inode: Inode,
2891         newparent: Inode,
2892         newname: &CStr,
2893     ) -> io::Result<Entry> {
2894         let _trace = fs_trace!(self.tag, "link", inode, newparent, newname);
2895         let data = self.find_inode(inode)?;
2896         let new_inode = self.find_inode(newparent)?;
2897 
2898         let path = CString::new(format!("self/fd/{}", data.as_raw_descriptor()))
2899             .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
2900 
2901         {
2902             let casefold_cache = self.lock_casefold_lookup_caches();
2903             // SAFETY: this doesn't modify any memory and we check the return value.
2904             syscall!(unsafe {
2905                 libc::linkat(
2906                     self.proc.as_raw_descriptor(),
2907                     path.as_ptr(),
2908                     new_inode.as_raw_descriptor(),
2909                     newname.as_ptr(),
2910                     libc::AT_SYMLINK_FOLLOW,
2911                 )
2912             })?;
2913             if let Some(mut c) = casefold_cache {
2914                 c.insert(newparent, newname);
2915             }
2916         }
2917 
2918         self.do_lookup(&new_inode, newname)
2919     }
2920 
symlink( &self, ctx: Context, linkname: &CStr, parent: Inode, name: &CStr, security_ctx: Option<&CStr>, ) -> io::Result<Entry>2921     fn symlink(
2922         &self,
2923         ctx: Context,
2924         linkname: &CStr,
2925         parent: Inode,
2926         name: &CStr,
2927         security_ctx: Option<&CStr>,
2928     ) -> io::Result<Entry> {
2929         let _trace = fs_trace!(self.tag, "symlink", parent, linkname, name, security_ctx);
2930         let data = self.find_inode(parent)?;
2931 
2932         let _ctx = security_ctx
2933             .filter(|ctx| *ctx != UNLABELED_CSTR)
2934             .map(|ctx| ScopedSecurityContext::new(&self.proc, ctx))
2935             .transpose()?;
2936 
2937         #[allow(unused_variables)]
2938         #[cfg(feature = "arc_quota")]
2939         let (uid, gid) = self.change_creds(&ctx, &data, name);
2940         #[cfg(feature = "fs_runtime_ugid_map")]
2941         let (uid, gid) = self.change_ugid_creds(&ctx, &data, name);
2942         #[cfg(not(feature = "fs_permission_translation"))]
2943         let (uid, gid) = (ctx.uid, ctx.gid);
2944 
2945         let (_uid, _gid) = set_creds(uid, gid)?;
2946         {
2947             let casefold_cache = self.lock_casefold_lookup_caches();
2948             // SAFETY: this doesn't modify any memory and we check the return value.
2949             syscall!(unsafe {
2950                 libc::symlinkat(linkname.as_ptr(), data.as_raw_descriptor(), name.as_ptr())
2951             })?;
2952             if let Some(mut c) = casefold_cache {
2953                 c.insert(parent, name);
2954             }
2955         }
2956 
2957         self.do_lookup(&data, name)
2958     }
2959 
readlink(&self, _ctx: Context, inode: Inode) -> io::Result<Vec<u8>>2960     fn readlink(&self, _ctx: Context, inode: Inode) -> io::Result<Vec<u8>> {
2961         let _trace = fs_trace!(self.tag, "readlink", inode);
2962         let data = self.find_inode(inode)?;
2963 
2964         let mut buf = vec![0; libc::PATH_MAX as usize];
2965 
2966         // SAFETY: this will only modify the contents of `buf` and we check the return value.
2967         let res = syscall!(unsafe {
2968             libc::readlinkat(
2969                 data.as_raw_descriptor(),
2970                 EMPTY_CSTR.as_ptr(),
2971                 buf.as_mut_ptr() as *mut libc::c_char,
2972                 buf.len(),
2973             )
2974         })?;
2975 
2976         buf.resize(res as usize, 0);
2977 
2978         #[cfg(feature = "fs_runtime_ugid_map")]
2979         {
2980             let link_target = Path::new(OsStr::from_bytes(&buf[..res as usize]));
2981             if !link_target.starts_with(&self.root_dir) {
2982                 return Err(io::Error::new(
2983                     io::ErrorKind::InvalidInput,
2984                     "Symbolic link points outside of root_dir",
2985                 ));
2986             }
2987         }
2988         Ok(buf)
2989     }
2990 
flush( &self, _ctx: Context, inode: Inode, handle: Handle, _lock_owner: u64, ) -> io::Result<()>2991     fn flush(
2992         &self,
2993         _ctx: Context,
2994         inode: Inode,
2995         handle: Handle,
2996         _lock_owner: u64,
2997     ) -> io::Result<()> {
2998         let _trace = fs_trace!(self.tag, "flush", inode, handle);
2999         let data: Arc<dyn AsRawDescriptor> = if self.zero_message_open.load(Ordering::Relaxed) {
3000             self.find_inode(inode)?
3001         } else {
3002             self.find_handle(handle, inode)?
3003         };
3004 
3005         // SAFETY:
3006         // Since this method is called whenever an fd is closed in the client, we can emulate that
3007         // behavior by doing the same thing (dup-ing the fd and then immediately closing it). Safe
3008         // because this doesn't modify any memory and we check the return values.
3009         unsafe {
3010             let newfd = syscall!(libc::fcntl(
3011                 data.as_raw_descriptor(),
3012                 libc::F_DUPFD_CLOEXEC,
3013                 0
3014             ))?;
3015 
3016             syscall!(libc::close(newfd))?;
3017         }
3018         Ok(())
3019     }
3020 
fsync(&self, _ctx: Context, inode: Inode, datasync: bool, handle: Handle) -> io::Result<()>3021     fn fsync(&self, _ctx: Context, inode: Inode, datasync: bool, handle: Handle) -> io::Result<()> {
3022         if self.zero_message_open.load(Ordering::Relaxed) {
3023             let _trace = fs_trace!(self.tag, "fsync (zero-message)", inode, datasync, handle);
3024             let data = self.find_inode(inode)?;
3025             self.do_fsync(&*data, datasync)
3026         } else {
3027             let _trace = fs_trace!(self.tag, "fsync", inode, datasync, handle);
3028             let data = self.find_handle(handle, inode)?;
3029 
3030             let file = data.file.lock();
3031             self.do_fsync(&*file, datasync)
3032         }
3033     }
3034 
fsyncdir( &self, _ctx: Context, inode: Inode, datasync: bool, handle: Handle, ) -> io::Result<()>3035     fn fsyncdir(
3036         &self,
3037         _ctx: Context,
3038         inode: Inode,
3039         datasync: bool,
3040         handle: Handle,
3041     ) -> io::Result<()> {
3042         if self.zero_message_opendir.load(Ordering::Relaxed) {
3043             let _trace = fs_trace!(self.tag, "fsyncdir (zero-message)", inode, datasync, handle);
3044             let data = self.find_inode(inode)?;
3045             self.do_fsync(&*data, datasync)
3046         } else {
3047             let _trace = fs_trace!(self.tag, "fsyncdir", inode, datasync, handle);
3048             let data = self.find_handle(handle, inode)?;
3049 
3050             let file = data.file.lock();
3051             self.do_fsync(&*file, datasync)
3052         }
3053     }
3054 
access(&self, ctx: Context, inode: Inode, mask: u32) -> io::Result<()>3055     fn access(&self, ctx: Context, inode: Inode, mask: u32) -> io::Result<()> {
3056         let _trace = fs_trace!(self.tag, "access", inode, mask);
3057         let data = self.find_inode(inode)?;
3058 
3059         let st = stat(&*data)?;
3060         let mode = mask as i32 & (libc::R_OK | libc::W_OK | libc::X_OK);
3061 
3062         if mode == libc::F_OK {
3063             // The file exists since we were able to call `stat(2)` on it.
3064             return Ok(());
3065         }
3066 
3067         if (mode & libc::R_OK) != 0 {
3068             if ctx.uid != 0
3069                 && (st.st_uid != ctx.uid || st.st_mode & 0o400 == 0)
3070                 && (st.st_gid != ctx.gid || st.st_mode & 0o040 == 0)
3071                 && st.st_mode & 0o004 == 0
3072             {
3073                 return Err(io::Error::from_raw_os_error(libc::EACCES));
3074             }
3075         }
3076 
3077         if (mode & libc::W_OK) != 0 {
3078             if ctx.uid != 0
3079                 && (st.st_uid != ctx.uid || st.st_mode & 0o200 == 0)
3080                 && (st.st_gid != ctx.gid || st.st_mode & 0o020 == 0)
3081                 && st.st_mode & 0o002 == 0
3082             {
3083                 return Err(io::Error::from_raw_os_error(libc::EACCES));
3084             }
3085         }
3086 
3087         // root can only execute something if it is executable by one of the owner, the group, or
3088         // everyone.
3089         if (mode & libc::X_OK) != 0 {
3090             if (ctx.uid != 0 || st.st_mode & 0o111 == 0)
3091                 && (st.st_uid != ctx.uid || st.st_mode & 0o100 == 0)
3092                 && (st.st_gid != ctx.gid || st.st_mode & 0o010 == 0)
3093                 && st.st_mode & 0o001 == 0
3094             {
3095                 return Err(io::Error::from_raw_os_error(libc::EACCES));
3096             }
3097         }
3098 
3099         Ok(())
3100     }
3101 
setxattr( &self, _ctx: Context, inode: Inode, name: &CStr, value: &[u8], flags: u32, ) -> io::Result<()>3102     fn setxattr(
3103         &self,
3104         _ctx: Context,
3105         inode: Inode,
3106         name: &CStr,
3107         value: &[u8],
3108         flags: u32,
3109     ) -> io::Result<()> {
3110         let _trace = fs_trace!(self.tag, "setxattr", inode, name, flags);
3111         // We can't allow the VM to set this xattr because an unprivileged process may use it to set
3112         // a privileged xattr.
3113         if self.cfg.rewrite_security_xattrs && name.to_bytes().starts_with(USER_VIRTIOFS_XATTR) {
3114             return Err(io::Error::from_raw_os_error(libc::EPERM));
3115         }
3116 
3117         let data = self.find_inode(inode)?;
3118         let name = self.rewrite_xattr_name(name);
3119 
3120         #[cfg(feature = "arc_quota")]
3121         if self.skip_host_set_xattr(&data.path, &name.to_string_lossy()) {
3122             debug!(
3123                 "ignore setxattr for path:{} xattr_name:{}",
3124                 &data.path,
3125                 &name.to_string_lossy()
3126             );
3127             return Ok(());
3128         }
3129 
3130         let file = data.file.lock();
3131         let o_path_file = (file.1 & libc::O_PATH) != 0;
3132         if o_path_file {
3133             // For FDs opened with `O_PATH`, we cannot call `fsetxattr` normally. Instead we emulate
3134             // an _at syscall by changing the CWD to /proc, running the path based syscall, and then
3135             // setting the CWD back to the root directory.
3136             let path = CString::new(format!("self/fd/{}", file.0.as_raw_descriptor()))
3137                 .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
3138 
3139             syscall!(self.with_proc_chdir(|| {
3140                 // SAFETY: this doesn't modify any memory and we check the return value.
3141                 unsafe {
3142                     libc::setxattr(
3143                         path.as_ptr(),
3144                         name.as_ptr(),
3145                         value.as_ptr() as *const libc::c_void,
3146                         value.len() as libc::size_t,
3147                         flags as c_int,
3148                     )
3149                 }
3150             }))?;
3151         } else {
3152             syscall!(
3153                 // For regular files and directories, we can just use fsetxattr.
3154                 // SAFETY: this doesn't modify any memory and we check the return value.
3155                 unsafe {
3156                     libc::fsetxattr(
3157                         file.0.as_raw_descriptor(),
3158                         name.as_ptr(),
3159                         value.as_ptr() as *const libc::c_void,
3160                         value.len() as libc::size_t,
3161                         flags as c_int,
3162                     )
3163                 }
3164             )?;
3165         }
3166 
3167         Ok(())
3168     }
3169 
getxattr( &self, _ctx: Context, inode: Inode, name: &CStr, size: u32, ) -> io::Result<GetxattrReply>3170     fn getxattr(
3171         &self,
3172         _ctx: Context,
3173         inode: Inode,
3174         name: &CStr,
3175         size: u32,
3176     ) -> io::Result<GetxattrReply> {
3177         let _trace = fs_trace!(self.tag, "getxattr", inode, name, size);
3178         // We don't allow the VM to set this xattr so we also pretend there is no value associated
3179         // with it.
3180         if self.cfg.rewrite_security_xattrs && name.to_bytes().starts_with(USER_VIRTIOFS_XATTR) {
3181             return Err(io::Error::from_raw_os_error(libc::ENODATA));
3182         }
3183 
3184         let data = self.find_inode(inode)?;
3185         let name = self.rewrite_xattr_name(name);
3186         let mut buf = vec![0u8; size as usize];
3187 
3188         #[cfg(feature = "arc_quota")]
3189         let res = self.do_getxattr_with_filter(data, name, &mut buf)?;
3190 
3191         #[cfg(not(feature = "arc_quota"))]
3192         let res = self.do_getxattr(&data, &name, &mut buf[..])?;
3193 
3194         if size == 0 {
3195             Ok(GetxattrReply::Count(res as u32))
3196         } else {
3197             buf.truncate(res);
3198             Ok(GetxattrReply::Value(buf))
3199         }
3200     }
3201 
listxattr(&self, _ctx: Context, inode: Inode, size: u32) -> io::Result<ListxattrReply>3202     fn listxattr(&self, _ctx: Context, inode: Inode, size: u32) -> io::Result<ListxattrReply> {
3203         let _trace = fs_trace!(self.tag, "listxattr", inode, size);
3204         let data = self.find_inode(inode)?;
3205 
3206         let mut buf = vec![0u8; size as usize];
3207 
3208         let file = data.file.lock();
3209         let o_path_file = (file.1 & libc::O_PATH) != 0;
3210         let res = if o_path_file {
3211             // For FDs opened with `O_PATH`, we cannot call `flistxattr` normally. Instead we
3212             // emulate an _at syscall by changing the CWD to /proc, running the path based syscall,
3213             // and then setting the CWD back to the root directory.
3214             let path = CString::new(format!("self/fd/{}", file.0.as_raw_descriptor()))
3215                 .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
3216 
3217             // SAFETY: this will only modify `buf` and we check the return value.
3218             syscall!(self.with_proc_chdir(|| unsafe {
3219                 libc::listxattr(
3220                     path.as_ptr(),
3221                     buf.as_mut_ptr() as *mut libc::c_char,
3222                     buf.len() as libc::size_t,
3223                 )
3224             }))?
3225         } else {
3226             // For regular files and directories, we can just flistxattr.
3227             // SAFETY: this will only write to `buf` and we check the return value.
3228             syscall!(unsafe {
3229                 libc::flistxattr(
3230                     file.0.as_raw_descriptor(),
3231                     buf.as_mut_ptr() as *mut libc::c_char,
3232                     buf.len() as libc::size_t,
3233                 )
3234             })?
3235         };
3236 
3237         if size == 0 {
3238             Ok(ListxattrReply::Count(res as u32))
3239         } else {
3240             buf.truncate(res as usize);
3241 
3242             if self.cfg.rewrite_security_xattrs {
3243                 strip_xattr_prefix(&mut buf);
3244             }
3245             Ok(ListxattrReply::Names(buf))
3246         }
3247     }
3248 
removexattr(&self, _ctx: Context, inode: Inode, name: &CStr) -> io::Result<()>3249     fn removexattr(&self, _ctx: Context, inode: Inode, name: &CStr) -> io::Result<()> {
3250         let _trace = fs_trace!(self.tag, "removexattr", inode, name);
3251         // We don't allow the VM to set this xattr so we also pretend there is no value associated
3252         // with it.
3253         if self.cfg.rewrite_security_xattrs && name.to_bytes().starts_with(USER_VIRTIOFS_XATTR) {
3254             return Err(io::Error::from_raw_os_error(libc::ENODATA));
3255         }
3256 
3257         let data = self.find_inode(inode)?;
3258         let name = self.rewrite_xattr_name(name);
3259 
3260         let file = data.file.lock();
3261         let o_path_file = (file.1 & libc::O_PATH) != 0;
3262         if o_path_file {
3263             // For files opened with `O_PATH`, we cannot call `fremovexattr` normally. Instead we
3264             // emulate an _at syscall by changing the CWD to /proc, running the path based syscall,
3265             // and then setting the CWD back to the root directory.
3266             let path = CString::new(format!("self/fd/{}", file.0.as_raw_descriptor()))
3267                 .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
3268 
3269             syscall!(self.with_proc_chdir(||
3270                     // SAFETY: this doesn't modify any memory and we check the return value.
3271                     unsafe { libc::removexattr(path.as_ptr(), name.as_ptr()) }))?;
3272         } else {
3273             // For regular files and directories, we can just use fremovexattr.
3274             syscall!(
3275                 // SAFETY: this doesn't modify any memory and we check the return value.
3276                 unsafe { libc::fremovexattr(file.0.as_raw_descriptor(), name.as_ptr()) }
3277             )?;
3278         }
3279 
3280         Ok(())
3281     }
3282 
fallocate( &self, _ctx: Context, inode: Inode, handle: Handle, mode: u32, offset: u64, length: u64, ) -> io::Result<()>3283     fn fallocate(
3284         &self,
3285         _ctx: Context,
3286         inode: Inode,
3287         handle: Handle,
3288         mode: u32,
3289         offset: u64,
3290         length: u64,
3291     ) -> io::Result<()> {
3292         let _trace = fs_trace!(self.tag, "fallocate", inode, handle, mode, offset, length);
3293 
3294         let data: Arc<dyn AsRawDescriptor> = if self.zero_message_open.load(Ordering::Relaxed) {
3295             let data = self.find_inode(inode)?;
3296 
3297             {
3298                 // fallocate needs a writable fd
3299                 let mut file = data.file.lock();
3300                 let mut flags = file.1;
3301                 match flags & libc::O_ACCMODE {
3302                     libc::O_RDONLY => {
3303                         flags &= !libc::O_RDONLY;
3304                         flags |= libc::O_RDWR;
3305 
3306                         // We need to get a writable handle for this file.
3307                         let newfile = self.open_fd(file.0.as_raw_descriptor(), libc::O_RDWR)?;
3308                         *file = (newfile, flags);
3309                     }
3310                     libc::O_WRONLY | libc::O_RDWR => {}
3311                     _ => panic!("Unexpected flags: {:#x}", flags),
3312                 }
3313             }
3314 
3315             data
3316         } else {
3317             self.find_handle(handle, inode)?
3318         };
3319 
3320         let fd = data.as_raw_descriptor();
3321         // SAFETY: this doesn't modify any memory and we check the return value.
3322         syscall!(unsafe {
3323             libc::fallocate64(
3324                 fd,
3325                 mode as libc::c_int,
3326                 offset as libc::off64_t,
3327                 length as libc::off64_t,
3328             )
3329         })?;
3330 
3331         Ok(())
3332     }
3333 
3334     #[allow(clippy::unnecessary_cast)]
ioctl<R: io::Read>( &self, ctx: Context, inode: Inode, handle: Handle, _flags: IoctlFlags, cmd: u32, _arg: u64, in_size: u32, out_size: u32, r: R, ) -> io::Result<IoctlReply>3335     fn ioctl<R: io::Read>(
3336         &self,
3337         ctx: Context,
3338         inode: Inode,
3339         handle: Handle,
3340         _flags: IoctlFlags,
3341         cmd: u32,
3342         _arg: u64,
3343         in_size: u32,
3344         out_size: u32,
3345         r: R,
3346     ) -> io::Result<IoctlReply> {
3347         let _trace = fs_trace!(self.tag, "ioctl", inode, handle, cmd, in_size, out_size);
3348 
3349         match cmd as IoctlNr {
3350             FS_IOC_GET_ENCRYPTION_POLICY_EX => self.get_encryption_policy_ex(inode, handle, r),
3351             FS_IOC_FSGETXATTR => {
3352                 if out_size < size_of::<fsxattr>() as u32 {
3353                     Err(io::Error::from_raw_os_error(libc::ENOMEM))
3354                 } else {
3355                     self.get_fsxattr(inode, handle)
3356                 }
3357             }
3358             FS_IOC_FSSETXATTR => {
3359                 if in_size < size_of::<fsxattr>() as u32 {
3360                     Err(io::Error::from_raw_os_error(libc::EINVAL))
3361                 } else {
3362                     self.set_fsxattr(ctx, inode, handle, r)
3363                 }
3364             }
3365             FS_IOC32_GETFLAGS | FS_IOC64_GETFLAGS => {
3366                 if out_size < size_of::<c_int>() as u32 {
3367                     Err(io::Error::from_raw_os_error(libc::ENOMEM))
3368                 } else {
3369                     self.get_flags(inode, handle)
3370                 }
3371             }
3372             FS_IOC32_SETFLAGS | FS_IOC64_SETFLAGS => {
3373                 if in_size < size_of::<c_int>() as u32 {
3374                     Err(io::Error::from_raw_os_error(libc::ENOMEM))
3375                 } else {
3376                     self.set_flags(ctx, inode, handle, r)
3377                 }
3378             }
3379             FS_IOC_ENABLE_VERITY => {
3380                 if in_size < size_of::<fsverity_enable_arg>() as u32 {
3381                     Err(io::Error::from_raw_os_error(libc::ENOMEM))
3382                 } else {
3383                     self.enable_verity(inode, handle, r)
3384                 }
3385             }
3386             FS_IOC_MEASURE_VERITY => {
3387                 if in_size < size_of::<fsverity_digest>() as u32
3388                     || out_size < size_of::<fsverity_digest>() as u32
3389                 {
3390                     Err(io::Error::from_raw_os_error(libc::ENOMEM))
3391                 } else {
3392                     self.measure_verity(inode, handle, r, out_size)
3393                 }
3394             }
3395             // The following is ARCVM-specific ioctl
3396             // Refer go/remove-mount-passthrough-fuse for more design details
3397             #[cfg(feature = "arc_quota")]
3398             FS_IOC_SETPERMISSION => {
3399                 if in_size != size_of::<FsPermissionDataBuffer>() as u32 {
3400                     Err(io::Error::from_raw_os_error(libc::EINVAL))
3401                 } else {
3402                     Ok(self.set_permission_by_path(r))
3403                 }
3404             }
3405             #[cfg(feature = "arc_quota")]
3406             FS_IOC_SETPATHXATTR => {
3407                 if in_size != size_of::<FsPathXattrDataBuffer>() as u32 {
3408                     Err(io::Error::from_raw_os_error(libc::EINVAL))
3409                 } else {
3410                     Ok(self.set_xattr_by_path(r))
3411                 }
3412             }
3413             _ => Err(io::Error::from_raw_os_error(libc::ENOTTY)),
3414         }
3415     }
3416 
copy_file_range( &self, ctx: Context, inode_src: Inode, handle_src: Handle, offset_src: u64, inode_dst: Inode, handle_dst: Handle, offset_dst: u64, length: u64, flags: u64, ) -> io::Result<usize>3417     fn copy_file_range(
3418         &self,
3419         ctx: Context,
3420         inode_src: Inode,
3421         handle_src: Handle,
3422         offset_src: u64,
3423         inode_dst: Inode,
3424         handle_dst: Handle,
3425         offset_dst: u64,
3426         length: u64,
3427         flags: u64,
3428     ) -> io::Result<usize> {
3429         let _trace = fs_trace!(
3430             self.tag,
3431             "copy_file_range",
3432             inode_src,
3433             handle_src,
3434             offset_src,
3435             inode_dst,
3436             handle_dst,
3437             offset_dst,
3438             length,
3439             flags
3440         );
3441         // We need to change credentials during a write so that the kernel will remove setuid or
3442         // setgid bits from the file if it was written to by someone other than the owner.
3443         let (_uid, _gid) = set_creds(ctx.uid, ctx.gid)?;
3444         let (src_data, dst_data): (Arc<dyn AsRawDescriptor>, Arc<dyn AsRawDescriptor>) =
3445             if self.zero_message_open.load(Ordering::Relaxed) {
3446                 (self.find_inode(inode_src)?, self.find_inode(inode_dst)?)
3447             } else {
3448                 (
3449                     self.find_handle(handle_src, inode_src)?,
3450                     self.find_handle(handle_dst, inode_dst)?,
3451                 )
3452             };
3453 
3454         let src = src_data.as_raw_descriptor();
3455         let dst = dst_data.as_raw_descriptor();
3456 
3457         Ok(syscall!(
3458             // SAFETY: this call is safe because it doesn't modify any memory and we
3459             // check the return value.
3460             unsafe {
3461                 libc::syscall(
3462                     libc::SYS_copy_file_range,
3463                     src,
3464                     &offset_src,
3465                     dst,
3466                     &offset_dst,
3467                     length,
3468                     flags,
3469                 )
3470             }
3471         )? as usize)
3472     }
3473 
set_up_mapping<M: Mapper>( &self, _ctx: Context, inode: Self::Inode, _handle: Self::Handle, file_offset: u64, mem_offset: u64, size: usize, prot: u32, mapper: M, ) -> io::Result<()>3474     fn set_up_mapping<M: Mapper>(
3475         &self,
3476         _ctx: Context,
3477         inode: Self::Inode,
3478         _handle: Self::Handle,
3479         file_offset: u64,
3480         mem_offset: u64,
3481         size: usize,
3482         prot: u32,
3483         mapper: M,
3484     ) -> io::Result<()> {
3485         let _trace = fs_trace!(
3486             self.tag,
3487             "set_up_mapping",
3488             inode,
3489             file_offset,
3490             mem_offset,
3491             size,
3492             prot
3493         );
3494         if !self.cfg.use_dax {
3495             return Err(io::Error::from_raw_os_error(libc::ENOSYS));
3496         }
3497 
3498         let read = prot & libc::PROT_READ as u32 != 0;
3499         let write = prot & libc::PROT_WRITE as u32 != 0;
3500         let (mmap_flags, prot) = match (read, write) {
3501             (true, true) => (libc::O_RDWR, Protection::read_write()),
3502             (true, false) => (libc::O_RDONLY, Protection::read()),
3503             // Write-only is mapped to O_RDWR since mmap always requires an fd opened for reading.
3504             (false, true) => (libc::O_RDWR, Protection::write()),
3505             (false, false) => return Err(io::Error::from_raw_os_error(libc::EINVAL)),
3506         };
3507 
3508         let data = self.find_inode(inode)?;
3509 
3510         if self.zero_message_open.load(Ordering::Relaxed) {
3511             let mut file = data.file.lock();
3512             let mut open_flags = file.1;
3513             match (mmap_flags, open_flags & libc::O_ACCMODE) {
3514                 (libc::O_RDONLY, libc::O_WRONLY)
3515                 | (libc::O_RDWR, libc::O_RDONLY)
3516                 | (libc::O_RDWR, libc::O_WRONLY) => {
3517                     // We have a read-only or write-only fd and we need to upgrade it.
3518                     open_flags &= !libc::O_ACCMODE;
3519                     open_flags |= libc::O_RDWR;
3520 
3521                     let newfile = self.open_fd(file.0.as_raw_descriptor(), libc::O_RDWR)?;
3522                     *file = (newfile, open_flags);
3523                 }
3524                 (libc::O_RDONLY, libc::O_RDONLY)
3525                 | (libc::O_RDONLY, libc::O_RDWR)
3526                 | (libc::O_RDWR, libc::O_RDWR) => {}
3527                 (m, o) => panic!(
3528                     "Unexpected combination of access flags: ({:#x}, {:#x})",
3529                     m, o
3530                 ),
3531             }
3532             mapper.map(mem_offset, size, &file.0, file_offset, prot)
3533         } else {
3534             let file = self.open_inode(&data, mmap_flags | libc::O_NONBLOCK)?;
3535             mapper.map(mem_offset, size, &file, file_offset, prot)
3536         }
3537     }
3538 
remove_mapping<M: Mapper>(&self, msgs: &[RemoveMappingOne], mapper: M) -> io::Result<()>3539     fn remove_mapping<M: Mapper>(&self, msgs: &[RemoveMappingOne], mapper: M) -> io::Result<()> {
3540         let _trace = fs_trace!(self.tag, "remove_mapping", msgs);
3541         if !self.cfg.use_dax {
3542             return Err(io::Error::from_raw_os_error(libc::ENOSYS));
3543         }
3544 
3545         for RemoveMappingOne { moffset, len } in msgs {
3546             mapper.unmap(*moffset, *len)?;
3547         }
3548         Ok(())
3549     }
3550 
atomic_open( &self, ctx: Context, parent: Self::Inode, name: &CStr, mode: u32, flags: u32, umask: u32, security_ctx: Option<&CStr>, ) -> io::Result<(Entry, Option<Self::Handle>, OpenOptions)>3551     fn atomic_open(
3552         &self,
3553         ctx: Context,
3554         parent: Self::Inode,
3555         name: &CStr,
3556         mode: u32,
3557         flags: u32,
3558         umask: u32,
3559         security_ctx: Option<&CStr>,
3560     ) -> io::Result<(Entry, Option<Self::Handle>, OpenOptions)> {
3561         let _trace = fs_trace!(
3562             self.tag,
3563             "atomic_open",
3564             parent,
3565             name,
3566             mode,
3567             flags,
3568             umask,
3569             security_ctx
3570         );
3571         // Perform lookup but not create negative dentry
3572         let data = self.find_inode(parent)?;
3573 
3574         #[allow(unused_variables)]
3575         #[cfg(feature = "arc_quota")]
3576         let (uid, gid) = self.change_creds(&ctx, &data, name);
3577         #[cfg(feature = "fs_runtime_ugid_map")]
3578         let (uid, gid) = self.change_ugid_creds(&ctx, &data, name);
3579         #[cfg(not(feature = "fs_permission_translation"))]
3580         let (uid, gid) = (ctx.uid, ctx.gid);
3581 
3582         let (_uid, _gid) = set_creds(uid, gid)?;
3583 
3584         // This lookup serves two purposes:
3585         // 1. If the O_CREATE flag is not set, it retrieves the d_entry for the file.
3586         // 2. If the O_CREATE flag is set, it checks whether the file exists.
3587         let res = self.do_lookup_with_casefold_fallback(&data, name);
3588 
3589         if let Err(e) = res {
3590             if e.kind() == std::io::ErrorKind::NotFound && (flags as i32 & libc::O_CREAT) != 0 {
3591                 // If the file did not exist & O_CREAT is set,
3592                 // create file & set FILE_CREATED bits in open options
3593                 let (entry, handler, mut opts) =
3594                     self.create(ctx, parent, name, mode, flags, umask, security_ctx)?;
3595                 opts |= OpenOptions::FILE_CREATED;
3596                 return Ok((entry, handler, opts));
3597             } else if e.kind() == std::io::ErrorKind::NotFound
3598                 && !self.cfg.negative_timeout.is_zero()
3599             {
3600                 return Ok((
3601                     Entry::new_negative(self.cfg.negative_timeout),
3602                     None,
3603                     OpenOptions::empty(),
3604                 ));
3605             }
3606             return Err(e);
3607         }
3608 
3609         // SAFETY: checked res is not error before
3610         let entry = res.unwrap();
3611 
3612         if entry.attr.st_mode & libc::S_IFMT == libc::S_IFLNK {
3613             return Ok((entry, None, OpenOptions::empty()));
3614         }
3615 
3616         if (flags as i32 & (libc::O_CREAT | libc::O_EXCL)) == (libc::O_CREAT | libc::O_EXCL) {
3617             return Err(eexist());
3618         }
3619 
3620         let (handler, opts) = if self.zero_message_open.load(Ordering::Relaxed) {
3621             (None, OpenOptions::KEEP_CACHE)
3622         } else {
3623             let (handler, opts) = self.do_open(entry.inode, flags)?;
3624             (handler, opts)
3625         };
3626         Ok((entry, handler, opts))
3627     }
3628 }
3629 
3630 #[cfg(test)]
3631 mod tests {
3632     use std::path::Path;
3633 
3634     use named_lock::NamedLock;
3635     use tempfile::TempDir;
3636 
3637     use super::*;
3638     #[cfg(feature = "arc_quota")]
3639     use crate::virtio::fs::arc_ioctl::FS_IOCTL_PATH_MAX_LEN;
3640     #[cfg(feature = "arc_quota")]
3641     use crate::virtio::fs::arc_ioctl::FS_IOCTL_XATTR_NAME_MAX_LEN;
3642     #[cfg(feature = "arc_quota")]
3643     use crate::virtio::fs::arc_ioctl::FS_IOCTL_XATTR_VALUE_MAX_LEN;
3644 
3645     const UNITTEST_LOCK_NAME: &str = "passthroughfs_unittest_lock";
3646 
3647     // Create an instance of `Context` with valid uid, gid, and pid.
3648     // The correct ids are necessary for test cases where new files are created.
get_context() -> Context3649     fn get_context() -> Context {
3650         // SAFETY: both calls take no parameters and only return an integer value. The kernel also
3651         // guarantees that they can never fail.
3652         let uid = unsafe { libc::syscall(SYS_GETEUID) as libc::uid_t };
3653         // SAFETY: both calls take no parameters and only return an integer value. The kernel also
3654         // guarantees that they can never fail.
3655         let gid = unsafe { libc::syscall(SYS_GETEGID) as libc::gid_t };
3656         let pid = std::process::id() as libc::pid_t;
3657         Context { uid, gid, pid }
3658     }
3659 
3660     /// Creates the given directories and files under `temp_dir`.
create_test_data(temp_dir: &TempDir, dirs: &[&str], files: &[&str])3661     fn create_test_data(temp_dir: &TempDir, dirs: &[&str], files: &[&str]) {
3662         let path = temp_dir.path();
3663 
3664         for d in dirs {
3665             std::fs::create_dir_all(path.join(d)).unwrap();
3666         }
3667 
3668         for f in files {
3669             File::create(path.join(f)).unwrap();
3670         }
3671     }
3672 
3673     /// Looks up the given `path` in `fs`.
lookup(fs: &PassthroughFs, path: &Path) -> io::Result<Inode>3674     fn lookup(fs: &PassthroughFs, path: &Path) -> io::Result<Inode> {
3675         let mut inode = 1;
3676         let ctx = get_context();
3677         for name in path.iter() {
3678             let name = CString::new(name.to_str().unwrap()).unwrap();
3679             let ent = match fs.lookup(ctx, inode, &name) {
3680                 Ok(ent) => ent,
3681                 Err(e) => {
3682                     return Err(e);
3683                 }
3684             };
3685             inode = ent.inode;
3686         }
3687         Ok(inode)
3688     }
3689 
3690     /// Looks up the given `path` in `fs`.
3691     #[cfg(feature = "arc_quota")]
lookup_ent(fs: &PassthroughFs, path: &Path) -> io::Result<Entry>3692     fn lookup_ent(fs: &PassthroughFs, path: &Path) -> io::Result<Entry> {
3693         let mut inode = 1;
3694         let ctx = get_context();
3695         let mut entry = Entry::new_negative(Duration::from_secs(10));
3696         for name in path.iter() {
3697             let name = CString::new(name.to_str().unwrap()).unwrap();
3698             entry = match fs.lookup(ctx, inode, &name) {
3699                 Ok(ent) => ent,
3700                 Err(e) => {
3701                     return Err(e);
3702                 }
3703             };
3704             inode = entry.inode;
3705         }
3706         Ok(entry)
3707     }
3708 
3709     /// Creates a file at the given `path`.
create(fs: &PassthroughFs, path: &Path) -> io::Result<Entry>3710     fn create(fs: &PassthroughFs, path: &Path) -> io::Result<Entry> {
3711         let parent = path.parent().unwrap();
3712         let filename = CString::new(path.file_name().unwrap().to_str().unwrap()).unwrap();
3713         let parent_inode = lookup(fs, parent)?;
3714         let ctx = get_context();
3715         let security_ctx = None;
3716         fs.create(
3717             ctx,
3718             parent_inode,
3719             &filename,
3720             0o666,
3721             libc::O_RDWR as u32,
3722             0,
3723             security_ctx,
3724         )
3725         .map(|(entry, _, _)| entry)
3726     }
3727 
3728     /// Removes a file at the given `path`.
unlink(fs: &PassthroughFs, path: &Path) -> io::Result<()>3729     fn unlink(fs: &PassthroughFs, path: &Path) -> io::Result<()> {
3730         let parent = path.parent().unwrap();
3731         let filename = CString::new(path.file_name().unwrap().to_str().unwrap()).unwrap();
3732         let parent_inode = lookup(fs, parent)?;
3733         let ctx = get_context();
3734         fs.unlink(ctx, parent_inode, &filename)
3735     }
3736 
3737     /// Forgets cache.
forget(fs: &PassthroughFs, path: &Path) -> io::Result<()>3738     fn forget(fs: &PassthroughFs, path: &Path) -> io::Result<()> {
3739         let ctx = get_context();
3740         let inode = lookup(fs, path)?;
3741         // Pass `u64::MAX` to ensure that the refcount goes to 0 and we forget inode.
3742         fs.forget(ctx, inode, u64::MAX);
3743         Ok(())
3744     }
3745 
3746     /// Looks up and open the given `path` in `fs`.
atomic_open( fs: &PassthroughFs, path: &Path, mode: u32, flags: u32, umask: u32, security_ctx: Option<&CStr>, ) -> io::Result<(Entry, Option<Handle>, OpenOptions)>3747     fn atomic_open(
3748         fs: &PassthroughFs,
3749         path: &Path,
3750         mode: u32,
3751         flags: u32,
3752         umask: u32,
3753         security_ctx: Option<&CStr>,
3754     ) -> io::Result<(Entry, Option<Handle>, OpenOptions)> {
3755         let mut inode = 1;
3756         let ctx = get_context();
3757 
3758         let path_vec: Vec<_> = path.iter().collect();
3759         let vec_len = path_vec.len();
3760 
3761         // Do lookup before util (vec_len-1)-th pathname, this operation is to simulate
3762         // the behavior of VFS, since when VFS call atomic_open only at last look up.
3763         for name in &path_vec[0..vec_len - 1] {
3764             let name = CString::new(name.to_str().unwrap()).unwrap();
3765             let ent = fs.lookup(ctx, inode, &name)?;
3766             inode = ent.inode;
3767         }
3768 
3769         let name = CString::new(path_vec[vec_len - 1].to_str().unwrap()).unwrap();
3770 
3771         fs.atomic_open(ctx, inode, &name, mode, flags, umask, security_ctx)
3772     }
3773 
symlink( fs: &PassthroughFs, linkname: &Path, name: &Path, security_ctx: Option<&CStr>, ) -> io::Result<Entry>3774     fn symlink(
3775         fs: &PassthroughFs,
3776         linkname: &Path,
3777         name: &Path,
3778         security_ctx: Option<&CStr>,
3779     ) -> io::Result<Entry> {
3780         let inode = 1;
3781         let ctx = get_context();
3782         let name = CString::new(name.to_str().unwrap()).unwrap();
3783         let linkname = CString::new(linkname.to_str().unwrap()).unwrap();
3784         fs.symlink(ctx, &linkname, inode, &name, security_ctx)
3785     }
3786 
3787     // In this ioctl inode,handle,flags,arg and out_size is irrelavant, set to empty value.
3788     #[cfg(feature = "arc_quota")]
fs_ioc_setpermission<R: io::Read>( fs: &PassthroughFs, in_size: u32, r: R, ) -> io::Result<IoctlReply>3789     fn fs_ioc_setpermission<R: io::Read>(
3790         fs: &PassthroughFs,
3791         in_size: u32,
3792         r: R,
3793     ) -> io::Result<IoctlReply> {
3794         let ctx = get_context();
3795         fs.ioctl(
3796             ctx,
3797             0,
3798             0,
3799             IoctlFlags::empty(),
3800             FS_IOC_SETPERMISSION as u32,
3801             0,
3802             in_size,
3803             0,
3804             r,
3805         )
3806     }
3807 
3808     // In this ioctl inode,handle,flags,arg and out_size is irrelavant, set to empty value.
3809     #[cfg(feature = "arc_quota")]
fs_ioc_setpathxattr<R: io::Read>( fs: &PassthroughFs, in_size: u32, r: R, ) -> io::Result<IoctlReply>3810     fn fs_ioc_setpathxattr<R: io::Read>(
3811         fs: &PassthroughFs,
3812         in_size: u32,
3813         r: R,
3814     ) -> io::Result<IoctlReply> {
3815         let ctx = get_context();
3816         fs.ioctl(
3817             ctx,
3818             0,
3819             0,
3820             IoctlFlags::empty(),
3821             FS_IOC_SETPATHXATTR as u32,
3822             0,
3823             in_size,
3824             0,
3825             r,
3826         )
3827     }
3828 
3829     #[test]
rewrite_xattr_names()3830     fn rewrite_xattr_names() {
3831         // Since PassthroughFs may executes process-wide operations such as `fchdir`, acquire
3832         // `NamedLock` before starting each unit test creating a `PassthroughFs` instance.
3833         let lock = NamedLock::create(UNITTEST_LOCK_NAME).expect("create named lock");
3834         let _guard = lock.lock().expect("acquire named lock");
3835 
3836         let cfg = Config {
3837             rewrite_security_xattrs: true,
3838             ..Default::default()
3839         };
3840 
3841         let p = PassthroughFs::new("tag", cfg).expect("Failed to create PassthroughFs");
3842 
3843         // Selinux shouldn't get overwritten.
3844         let selinux = c"security.selinux";
3845         assert_eq!(p.rewrite_xattr_name(selinux).to_bytes(), selinux.to_bytes());
3846 
3847         // user, trusted, and system should not be changed either.
3848         let user = c"user.foobar";
3849         assert_eq!(p.rewrite_xattr_name(user).to_bytes(), user.to_bytes());
3850         let trusted = c"trusted.foobar";
3851         assert_eq!(p.rewrite_xattr_name(trusted).to_bytes(), trusted.to_bytes());
3852         let system = c"system.foobar";
3853         assert_eq!(p.rewrite_xattr_name(system).to_bytes(), system.to_bytes());
3854 
3855         // sehash should be re-written.
3856         let sehash = c"security.sehash";
3857         assert_eq!(
3858             p.rewrite_xattr_name(sehash).to_bytes(),
3859             b"user.virtiofs.security.sehash"
3860         );
3861     }
3862 
3863     #[test]
strip_xattr_names()3864     fn strip_xattr_names() {
3865         let only_nuls = b"\0\0\0\0\0";
3866         let mut actual = only_nuls.to_vec();
3867         strip_xattr_prefix(&mut actual);
3868         assert_eq!(&actual[..], &only_nuls[..]);
3869 
3870         let no_nuls = b"security.sehashuser.virtiofs";
3871         let mut actual = no_nuls.to_vec();
3872         strip_xattr_prefix(&mut actual);
3873         assert_eq!(&actual[..], &no_nuls[..]);
3874 
3875         let empty = b"";
3876         let mut actual = empty.to_vec();
3877         strip_xattr_prefix(&mut actual);
3878         assert_eq!(&actual[..], &empty[..]);
3879 
3880         let no_strippable_names = b"security.selinux\0user.foobar\0system.test\0";
3881         let mut actual = no_strippable_names.to_vec();
3882         strip_xattr_prefix(&mut actual);
3883         assert_eq!(&actual[..], &no_strippable_names[..]);
3884 
3885         let only_strippable_names = b"user.virtiofs.security.sehash\0user.virtiofs.security.wat\0";
3886         let mut actual = only_strippable_names.to_vec();
3887         strip_xattr_prefix(&mut actual);
3888         assert_eq!(&actual[..], b"security.sehash\0security.wat\0");
3889 
3890         let mixed_names = b"user.virtiofs.security.sehash\0security.selinux\0user.virtiofs.security.wat\0user.foobar\0";
3891         let mut actual = mixed_names.to_vec();
3892         strip_xattr_prefix(&mut actual);
3893         let expected = b"security.sehash\0security.selinux\0security.wat\0user.foobar\0";
3894         assert_eq!(&actual[..], &expected[..]);
3895 
3896         let no_nul_with_prefix = b"user.virtiofs.security.sehash";
3897         let mut actual = no_nul_with_prefix.to_vec();
3898         strip_xattr_prefix(&mut actual);
3899         assert_eq!(&actual[..], b"security.sehash");
3900     }
3901 
3902     #[test]
lookup_files()3903     fn lookup_files() {
3904         // Since PassthroughFs may executes process-wide operations such as `fchdir`, acquire
3905         // `NamedLock` before starting each unit test creating a `PassthroughFs` instance.
3906         let lock = NamedLock::create(UNITTEST_LOCK_NAME).expect("create named lock");
3907         let _guard = lock.lock().expect("acquire named lock");
3908 
3909         let temp_dir = TempDir::new().unwrap();
3910         create_test_data(&temp_dir, &["dir"], &["a.txt", "dir/b.txt"]);
3911 
3912         let cfg = Default::default();
3913         let fs = PassthroughFs::new("tag", cfg).unwrap();
3914 
3915         let capable = FsOptions::empty();
3916         fs.init(capable).unwrap();
3917 
3918         assert!(lookup(&fs, &temp_dir.path().join("a.txt")).is_ok());
3919         assert!(lookup(&fs, &temp_dir.path().join("dir")).is_ok());
3920         assert!(lookup(&fs, &temp_dir.path().join("dir/b.txt")).is_ok());
3921 
3922         assert_eq!(
3923             lookup(&fs, &temp_dir.path().join("nonexistent-file"))
3924                 .expect_err("file must not exist")
3925                 .kind(),
3926             io::ErrorKind::NotFound
3927         );
3928         // "A.txt" is different from "a.txt".
3929         assert_eq!(
3930             lookup(&fs, &temp_dir.path().join("A.txt"))
3931                 .expect_err("file must not exist")
3932                 .kind(),
3933             io::ErrorKind::NotFound
3934         );
3935     }
3936 
3937     #[test]
lookup_files_ascii_casefold()3938     fn lookup_files_ascii_casefold() {
3939         // Since PassthroughFs may executes process-wide operations such as `fchdir`, acquire
3940         // `NamedLock` before starting each unit test creating a `PassthroughFs` instance.
3941         let lock = NamedLock::create(UNITTEST_LOCK_NAME).expect("create named lock");
3942         let _guard = lock.lock().expect("acquire named lock");
3943 
3944         let temp_dir = TempDir::new().unwrap();
3945         create_test_data(&temp_dir, &["dir"], &["a.txt", "dir/b.txt"]);
3946 
3947         let cfg = Config {
3948             ascii_casefold: true,
3949             ..Default::default()
3950         };
3951         let fs = PassthroughFs::new("tag", cfg).unwrap();
3952 
3953         let capable = FsOptions::empty();
3954         fs.init(capable).unwrap();
3955 
3956         // Ensure that "A.txt" is equated with "a.txt".
3957         let a_inode = lookup(&fs, &temp_dir.path().join("a.txt")).expect("a.txt must be found");
3958         assert_eq!(
3959             lookup(&fs, &temp_dir.path().join("A.txt")).expect("A.txt must exist"),
3960             a_inode
3961         );
3962 
3963         let dir_inode = lookup(&fs, &temp_dir.path().join("dir")).expect("dir must be found");
3964         assert_eq!(
3965             lookup(&fs, &temp_dir.path().join("DiR")).expect("DiR must exist"),
3966             dir_inode
3967         );
3968 
3969         let b_inode =
3970             lookup(&fs, &temp_dir.path().join("dir/b.txt")).expect("dir/b.txt must be found");
3971         assert_eq!(
3972             lookup(&fs, &temp_dir.path().join("dIr/B.TxT")).expect("dIr/B.TxT must exist"),
3973             b_inode
3974         );
3975 
3976         assert_eq!(
3977             lookup(&fs, &temp_dir.path().join("nonexistent-file"))
3978                 .expect_err("file must not exist")
3979                 .kind(),
3980             io::ErrorKind::NotFound
3981         );
3982     }
3983 
test_create_and_remove(ascii_casefold: bool)3984     fn test_create_and_remove(ascii_casefold: bool) {
3985         // Since PassthroughFs may executes process-wide operations such as `fchdir`, acquire
3986         // `NamedLock` before starting each unit test creating a `PassthroughFs` instance.
3987         let lock = NamedLock::create(UNITTEST_LOCK_NAME).expect("create named lock");
3988         let _guard = lock.lock().expect("acquire named lock");
3989 
3990         let temp_dir = TempDir::new().unwrap();
3991         let timeout = Duration::from_millis(10);
3992         let cfg = Config {
3993             timeout,
3994             cache_policy: CachePolicy::Auto,
3995             ascii_casefold,
3996             ..Default::default()
3997         };
3998         let fs = PassthroughFs::new("tag", cfg).unwrap();
3999 
4000         let capable = FsOptions::empty();
4001         fs.init(capable).unwrap();
4002 
4003         // Create a.txt and b.txt.
4004         let a_path = temp_dir.path().join("a.txt");
4005         let b_path = temp_dir.path().join("b.txt");
4006         let a_entry = create(&fs, &a_path).expect("create a.txt");
4007         let b_entry = create(&fs, &b_path).expect("create b.txt");
4008         assert_eq!(
4009             a_entry.inode,
4010             lookup(&fs, &a_path).expect("lookup a.txt"),
4011             "Created file 'a.txt' must be looked up"
4012         );
4013         assert_eq!(
4014             b_entry.inode,
4015             lookup(&fs, &b_path).expect("lookup b.txt"),
4016             "Created file 'b.txt' must be looked up"
4017         );
4018 
4019         // Remove a.txt only
4020         unlink(&fs, &a_path).expect("Remove");
4021         assert_eq!(
4022             lookup(&fs, &a_path)
4023                 .expect_err("file must not exist")
4024                 .kind(),
4025             io::ErrorKind::NotFound,
4026             "a.txt must be removed"
4027         );
4028         // "A.TXT" must not be found regardless of whether casefold is enabled or not.
4029         let upper_a_path = temp_dir.path().join("A.TXT");
4030         assert_eq!(
4031             lookup(&fs, &upper_a_path)
4032                 .expect_err("file must not exist")
4033                 .kind(),
4034             io::ErrorKind::NotFound,
4035             "A.txt must be removed"
4036         );
4037 
4038         // Check if the host file system doesn't have a.txt but does b.txt.
4039         assert!(!a_path.exists(), "a.txt must be removed");
4040         assert!(b_path.exists(), "b.txt must exist");
4041     }
4042 
4043     #[test]
create_and_remove()4044     fn create_and_remove() {
4045         test_create_and_remove(false /* casefold */);
4046     }
4047 
4048     #[test]
create_and_remove_casefold()4049     fn create_and_remove_casefold() {
4050         test_create_and_remove(true /* casefold */);
4051     }
4052 
test_create_and_forget(ascii_casefold: bool)4053     fn test_create_and_forget(ascii_casefold: bool) {
4054         // Since PassthroughFs may executes process-wide operations such as `fchdir`, acquire
4055         // `NamedLock` before starting each unit test creating a `PassthroughFs` instance.
4056         let lock = NamedLock::create(UNITTEST_LOCK_NAME).expect("create named lock");
4057         let _guard = lock.lock().expect("acquire named lock");
4058 
4059         let temp_dir = TempDir::new().unwrap();
4060         let timeout = Duration::from_millis(10);
4061         let cfg = Config {
4062             timeout,
4063             cache_policy: CachePolicy::Auto,
4064             ascii_casefold,
4065             ..Default::default()
4066         };
4067         let fs = PassthroughFs::new("tag", cfg).unwrap();
4068 
4069         let capable = FsOptions::empty();
4070         fs.init(capable).unwrap();
4071 
4072         // Create a.txt.
4073         let a_path = temp_dir.path().join("a.txt");
4074         let a_entry = create(&fs, &a_path).expect("create a.txt");
4075         assert_eq!(
4076             a_entry.inode,
4077             lookup(&fs, &a_path).expect("lookup a.txt"),
4078             "Created file 'a.txt' must be looked up"
4079         );
4080 
4081         // Forget a.txt's inode from PassthroughFs's internal cache.
4082         forget(&fs, &a_path).expect("forget a.txt");
4083 
4084         if ascii_casefold {
4085             let upper_a_path = temp_dir.path().join("A.TXT");
4086             let new_a_inode = lookup(&fs, &upper_a_path).expect("lookup a.txt");
4087             assert_ne!(
4088                 a_entry.inode, new_a_inode,
4089                 "inode must be changed after forget()"
4090             );
4091             assert_eq!(
4092                 new_a_inode,
4093                 lookup(&fs, &a_path).expect("lookup a.txt"),
4094                 "inode must be same for a.txt and A.TXT"
4095             );
4096         } else {
4097             assert_ne!(
4098                 a_entry.inode,
4099                 lookup(&fs, &a_path).expect("lookup a.txt"),
4100                 "inode must be changed after forget()"
4101             );
4102         }
4103     }
4104 
4105     #[test]
create_and_forget()4106     fn create_and_forget() {
4107         test_create_and_forget(false /* ascii_casefold */);
4108     }
4109 
4110     #[test]
create_and_forget_casefold()4111     fn create_and_forget_casefold() {
4112         test_create_and_forget(true /* ascii_casefold */);
4113     }
4114 
4115     #[test]
casefold_lookup_cache()4116     fn casefold_lookup_cache() {
4117         let temp_dir = TempDir::new().unwrap();
4118         // Prepare `a.txt` before starting the test.
4119         create_test_data(&temp_dir, &[], &["a.txt"]);
4120 
4121         let cfg = Config {
4122             ascii_casefold: true,
4123             ..Default::default()
4124         };
4125         let fs = PassthroughFs::new("tag", cfg).unwrap();
4126 
4127         let capable = FsOptions::empty();
4128         fs.init(capable).unwrap();
4129 
4130         let parent = lookup(&fs, temp_dir.path()).expect("lookup temp_dir");
4131 
4132         // Since `a.txt` exists, "A.TXT" must exist.
4133         let large_a_path = temp_dir.path().join("A.TXT");
4134         // Looking up "A.TXT" must create a CasefoldCache entry.
4135         lookup(&fs, &large_a_path).expect("A.TXT must exist");
4136         assert!(fs.exists_in_casefold_cache(parent, &CString::new("A.TXT").unwrap()));
4137 
4138         // Create b.txt.
4139         let b_path = temp_dir.path().join("b.txt");
4140         create(&fs, &b_path).expect("create b.txt");
4141         // Then, b.txt must exists in the cache.
4142         assert!(fs.exists_in_casefold_cache(parent, &CString::new("B.TXT").unwrap()));
4143         // When removing b.txt, it must be removed from the cache as well.
4144         unlink(&fs, &b_path).expect("remove b.txt");
4145         assert!(!fs.exists_in_casefold_cache(parent, &CString::new("B.TXT").unwrap()));
4146     }
4147 
4148     #[test]
lookup_negative_cache()4149     fn lookup_negative_cache() {
4150         let temp_dir = TempDir::new().unwrap();
4151         // Prepare `a.txt` before starting the test.
4152         create_test_data(&temp_dir, &[], &[]);
4153 
4154         let cfg = Config {
4155             negative_timeout: Duration::from_secs(5),
4156             ..Default::default()
4157         };
4158         let fs = PassthroughFs::new("tag", cfg).unwrap();
4159 
4160         let capable = FsOptions::empty();
4161         fs.init(capable).unwrap();
4162 
4163         let a_path = temp_dir.path().join("a.txt");
4164         // a.txt hasn't existed yet.
4165         // Since negative_timeout is enabled, success with inode=0 is expected.
4166         assert_eq!(
4167             0,
4168             lookup(&fs, &a_path).expect("lookup a.txt"),
4169             "Entry with inode=0 is expected for non-existing file 'a.txt'"
4170         );
4171         // Create a.txt
4172         let a_entry = create(&fs, &a_path).expect("create a.txt");
4173         assert_eq!(
4174             a_entry.inode,
4175             lookup(&fs, &a_path).expect("lookup a.txt"),
4176             "Created file 'a.txt' must be looked up"
4177         );
4178         // Remove a.txt
4179         unlink(&fs, &a_path).expect("Remove");
4180         assert_eq!(
4181             0,
4182             lookup(&fs, &a_path).expect("lookup a.txt"),
4183             "Entry with inode=0 is expected for the removed file 'a.txt'"
4184         );
4185     }
4186     #[test]
test_atomic_open_existing_file()4187     fn test_atomic_open_existing_file() {
4188         atomic_open_existing_file(false);
4189     }
4190 
4191     #[test]
test_atomic_open_existing_file_zero_message()4192     fn test_atomic_open_existing_file_zero_message() {
4193         atomic_open_existing_file(true);
4194     }
4195 
atomic_open_existing_file(zero_message_open: bool)4196     fn atomic_open_existing_file(zero_message_open: bool) {
4197         // Since PassthroughFs may executes process-wide operations such as `fchdir`, acquire
4198         // `NamedLock` before starting each unit test creating a `PassthroughFs` instance.
4199         let lock = NamedLock::create(UNITTEST_LOCK_NAME).expect("create named lock");
4200         let _guard = lock.lock().expect("acquire named lock");
4201 
4202         let temp_dir = TempDir::new().unwrap();
4203         create_test_data(&temp_dir, &["dir"], &["a.txt", "dir/b.txt", "dir/c.txt"]);
4204 
4205         let cache_policy = match zero_message_open {
4206             true => CachePolicy::Always,
4207             false => CachePolicy::Auto,
4208         };
4209 
4210         let cfg = Config {
4211             cache_policy,
4212             ..Default::default()
4213         };
4214         let fs = PassthroughFs::new("tag", cfg).unwrap();
4215 
4216         let capable = FsOptions::ZERO_MESSAGE_OPEN;
4217         fs.init(capable).unwrap();
4218 
4219         // atomic_open with flag O_RDWR, should return positive dentry and file handler
4220         let res = atomic_open(
4221             &fs,
4222             &temp_dir.path().join("a.txt"),
4223             0o666,
4224             libc::O_RDWR as u32,
4225             0,
4226             None,
4227         );
4228         assert!(res.is_ok());
4229         let (entry, handler, open_options) = res.unwrap();
4230         assert_ne!(entry.inode, 0);
4231 
4232         if zero_message_open {
4233             assert!(handler.is_none());
4234             assert_eq!(open_options, OpenOptions::KEEP_CACHE);
4235         } else {
4236             assert!(handler.is_some());
4237             assert_ne!(
4238                 open_options & OpenOptions::FILE_CREATED,
4239                 OpenOptions::FILE_CREATED
4240             );
4241         }
4242 
4243         // atomic_open with flag O_RDWR |  O_CREATE, should return positive dentry and file handler
4244         let res = atomic_open(
4245             &fs,
4246             &temp_dir.path().join("dir/b.txt"),
4247             0o666,
4248             (libc::O_RDWR | libc::O_CREAT) as u32,
4249             0,
4250             None,
4251         );
4252         assert!(res.is_ok());
4253         let (entry, handler, open_options) = res.unwrap();
4254         assert_ne!(entry.inode, 0);
4255 
4256         if zero_message_open {
4257             assert!(handler.is_none());
4258             assert_eq!(open_options, OpenOptions::KEEP_CACHE);
4259         } else {
4260             assert!(handler.is_some());
4261             assert_ne!(
4262                 open_options & OpenOptions::FILE_CREATED,
4263                 OpenOptions::FILE_CREATED
4264             );
4265         }
4266 
4267         // atomic_open with flag O_RDWR | O_CREATE | O_EXCL, should return positive dentry and file
4268         // handler
4269         let res = atomic_open(
4270             &fs,
4271             &temp_dir.path().join("dir/c.txt"),
4272             0o666,
4273             (libc::O_RDWR | libc::O_CREAT | libc::O_EXCL) as u32,
4274             0,
4275             None,
4276         );
4277         assert!(res.is_err());
4278         let err_kind = res.unwrap_err().kind();
4279         assert_eq!(err_kind, io::ErrorKind::AlreadyExists);
4280     }
4281 
4282     #[test]
test_atomic_open_non_existing_file()4283     fn test_atomic_open_non_existing_file() {
4284         atomic_open_non_existing_file(false);
4285     }
4286 
4287     #[test]
test_atomic_open_non_existing_file_zero_message()4288     fn test_atomic_open_non_existing_file_zero_message() {
4289         atomic_open_non_existing_file(true);
4290     }
4291 
atomic_open_non_existing_file(zero_message_open: bool)4292     fn atomic_open_non_existing_file(zero_message_open: bool) {
4293         // Since PassthroughFs may executes process-wide operations such as `fchdir`, acquire
4294         // `NamedLock` before starting each unit test creating a `PassthroughFs` instance.
4295         let lock = NamedLock::create(UNITTEST_LOCK_NAME).expect("create named lock");
4296         let _guard = lock.lock().expect("acquire named lock");
4297 
4298         let temp_dir = TempDir::new().unwrap();
4299 
4300         let cache_policy = match zero_message_open {
4301             true => CachePolicy::Always,
4302             false => CachePolicy::Auto,
4303         };
4304 
4305         let cfg = Config {
4306             cache_policy,
4307             ..Default::default()
4308         };
4309         let fs = PassthroughFs::new("tag", cfg).unwrap();
4310 
4311         let capable = FsOptions::ZERO_MESSAGE_OPEN;
4312         fs.init(capable).unwrap();
4313 
4314         // atomic_open with flag O_RDWR, should return NO_EXIST error
4315         let res = atomic_open(
4316             &fs,
4317             &temp_dir.path().join("a.txt"),
4318             0o666,
4319             libc::O_RDWR as u32,
4320             0,
4321             None,
4322         );
4323         assert!(res.is_err());
4324         let err_kind = res.unwrap_err().kind();
4325         assert_eq!(err_kind, io::ErrorKind::NotFound);
4326 
4327         // atomic_open with flag O_RDWR | O_CREATE, should return positive dentry and file handler
4328         let res = atomic_open(
4329             &fs,
4330             &temp_dir.path().join("b.txt"),
4331             0o666,
4332             (libc::O_RDWR | libc::O_CREAT) as u32,
4333             0,
4334             None,
4335         );
4336         assert!(res.is_ok());
4337         let (entry, handler, open_options) = res.unwrap();
4338         assert_ne!(entry.inode, 0);
4339 
4340         if zero_message_open {
4341             assert!(handler.is_none());
4342             assert_eq!(
4343                 open_options & OpenOptions::KEEP_CACHE,
4344                 OpenOptions::KEEP_CACHE
4345             );
4346         } else {
4347             assert!(handler.is_some());
4348         }
4349         assert_eq!(
4350             open_options & OpenOptions::FILE_CREATED,
4351             OpenOptions::FILE_CREATED
4352         );
4353     }
4354 
4355     #[test]
atomic_open_symbol_link()4356     fn atomic_open_symbol_link() {
4357         // Since PassthroughFs may executes process-wide operations such as `fchdir`, acquire
4358         // `NamedLock` before starting each unit test creating a `PassthroughFs` instance.
4359         let lock = NamedLock::create(UNITTEST_LOCK_NAME).expect("create named lock");
4360         let _guard = lock.lock().expect("acquire named lock");
4361 
4362         let temp_dir = TempDir::new().unwrap();
4363         create_test_data(&temp_dir, &["dir"], &["a.txt"]);
4364 
4365         let cfg = Default::default();
4366         let fs = PassthroughFs::new("tag", cfg).unwrap();
4367 
4368         let capable = FsOptions::empty();
4369         fs.init(capable).unwrap();
4370 
4371         // atomic open the link destination file
4372         let res_dst = atomic_open(
4373             &fs,
4374             &temp_dir.path().join("a.txt"),
4375             0o666,
4376             libc::O_RDWR as u32,
4377             0,
4378             None,
4379         );
4380         assert!(res_dst.is_ok());
4381         let (entry_dst, handler_dst, _) = res_dst.unwrap();
4382         assert_ne!(entry_dst.inode, 0);
4383         assert!(handler_dst.is_some());
4384 
4385         // create depth 1 symbol link
4386         let sym1_res = symlink(
4387             &fs,
4388             &temp_dir.path().join("a.txt"),
4389             &temp_dir.path().join("blink"),
4390             None,
4391         );
4392         assert!(sym1_res.is_ok());
4393         let sym1_entry = sym1_res.unwrap();
4394         assert_ne!(sym1_entry.inode, 0);
4395 
4396         // atomic_open symbol link, should return dentry with no handler
4397         let res = atomic_open(
4398             &fs,
4399             &temp_dir.path().join("blink"),
4400             0o666,
4401             libc::O_RDWR as u32,
4402             0,
4403             None,
4404         );
4405         assert!(res.is_ok());
4406         let (entry, handler, open_options) = res.unwrap();
4407         assert_eq!(entry.inode, sym1_entry.inode);
4408         assert!(handler.is_none());
4409         assert_eq!(open_options, OpenOptions::empty());
4410 
4411         // delete link destination
4412         unlink(&fs, &temp_dir.path().join("a.txt")).expect("Remove");
4413         assert_eq!(
4414             lookup(&fs, &temp_dir.path().join("a.txt"))
4415                 .expect_err("file must not exist")
4416                 .kind(),
4417             io::ErrorKind::NotFound,
4418             "a.txt must be removed"
4419         );
4420 
4421         // after link destination removed, should still return valid dentry
4422         let res = atomic_open(
4423             &fs,
4424             &temp_dir.path().join("blink"),
4425             0o666,
4426             libc::O_RDWR as u32,
4427             0,
4428             None,
4429         );
4430         assert!(res.is_ok());
4431         let (entry, handler, open_options) = res.unwrap();
4432         assert_eq!(entry.inode, sym1_entry.inode);
4433         assert!(handler.is_none());
4434         assert_eq!(open_options, OpenOptions::empty());
4435     }
4436 
4437     #[test]
4438     #[cfg(feature = "arc_quota")]
set_permission_ioctl_valid_data()4439     fn set_permission_ioctl_valid_data() {
4440         // Since PassthroughFs may executes process-wide operations such as `fchdir`, acquire
4441         // `NamedLock` before starting each unit test creating a `PassthroughFs` instance.
4442         let lock = NamedLock::create(UNITTEST_LOCK_NAME).expect("create named lock");
4443         let _guard = lock.lock().expect("acquire named lock");
4444 
4445         let cfg = Config {
4446             max_dynamic_perm: 1,
4447             ..Default::default()
4448         };
4449         let p = PassthroughFs::new("tag", cfg).expect("Failed to create PassthroughFs");
4450 
4451         let perm_path_string = String::from("/test");
4452         let fs_permission_data_buffer = FsPermissionDataBuffer {
4453             guest_uid: 1,
4454             guest_gid: 2,
4455             host_uid: 3,
4456             host_gid: 4,
4457             umask: 5,
4458             pad: 0,
4459             perm_path: {
4460                 let mut perm_path: [u8; FS_IOCTL_PATH_MAX_LEN] = [0; FS_IOCTL_PATH_MAX_LEN];
4461                 perm_path[..perm_path_string.len()].copy_from_slice(perm_path_string.as_bytes());
4462                 perm_path
4463             },
4464         };
4465         let r = std::io::Cursor::new(fs_permission_data_buffer.as_bytes());
4466 
4467         let res = fs_ioc_setpermission(
4468             &p,
4469             mem::size_of_val(&fs_permission_data_buffer) as u32,
4470             r.clone(),
4471         )
4472         .expect("valid input should get IoctlReply");
4473         assert!(matches!(res, IoctlReply::Done(Ok(data)) if data.is_empty()));
4474 
4475         let read_guard = p
4476             .permission_paths
4477             .read()
4478             .expect("read permission_paths failed");
4479         let permission_data = read_guard
4480             .first()
4481             .expect("permission path should not be empty");
4482 
4483         // Check expected data item is added to permission_paths.
4484         let expected_data = PermissionData {
4485             guest_uid: 1,
4486             guest_gid: 2,
4487             host_uid: 3,
4488             host_gid: 4,
4489             umask: 5,
4490             perm_path: perm_path_string,
4491         };
4492         assert_eq!(*permission_data, expected_data);
4493 
4494         // Second ioctl should not succeed since max_dynamic_perm is set to 1
4495         let res = fs_ioc_setpermission(
4496             &p,
4497             mem::size_of_val(&fs_permission_data_buffer) as u32,
4498             r.clone(),
4499         )
4500         .expect("valid input should get IoctlReply");
4501         assert!(
4502             matches!(res, IoctlReply::Done(Err(err)) if err.raw_os_error().is_some_and(|errno| {
4503                 errno == libc::EPERM
4504             }))
4505         );
4506     }
4507 
4508     #[test]
4509     #[cfg(feature = "arc_quota")]
set_permission_ioctl_invalid_data()4510     fn set_permission_ioctl_invalid_data() {
4511         // Since PassthroughFs may executes process-wide operations such as `fchdir`, acquire
4512         // `NamedLock` before starting each unit test creating a `PassthroughFs` instance.
4513         let lock = NamedLock::create(UNITTEST_LOCK_NAME).expect("create named lock");
4514         let _guard = lock.lock().expect("acquire named lock");
4515 
4516         let cfg = Config {
4517             max_dynamic_perm: 1,
4518             ..Default::default()
4519         };
4520         let p = PassthroughFs::new("tag", cfg).expect("Failed to create PassthroughFs");
4521 
4522         // The perm_path is not valid since it does not start with /.
4523         let perm_path_string = String::from("test");
4524         let fs_permission_data_buffer = FsPermissionDataBuffer {
4525             guest_uid: 1,
4526             guest_gid: 2,
4527             host_uid: 3,
4528             host_gid: 4,
4529             umask: 5,
4530             pad: 0,
4531             perm_path: {
4532                 let mut perm_path: [u8; FS_IOCTL_PATH_MAX_LEN] = [0; FS_IOCTL_PATH_MAX_LEN];
4533                 perm_path[..perm_path_string.len()].copy_from_slice(perm_path_string.as_bytes());
4534                 perm_path
4535             },
4536         };
4537 
4538         let r = std::io::Cursor::new(fs_permission_data_buffer.as_bytes());
4539         // In this ioctl inode,handle,flags,arg and out_size is irrelavant, set to empty value.
4540         // This call is supposed to get EINVAL ioctlReply, since the perm_path is invalid.
4541         let res = fs_ioc_setpermission(&p, mem::size_of_val(&fs_permission_data_buffer) as u32, r)
4542             .expect("invalid perm_path should get IoctlReply");
4543         assert!(
4544             matches!(res, IoctlReply::Done(Err(err)) if err.raw_os_error().is_some_and(|errno| {
4545                 errno == libc::EINVAL
4546             }))
4547         );
4548 
4549         let fake_data_buffer: [u8; 128] = [0; 128];
4550         let r = std::io::Cursor::new(fake_data_buffer.as_bytes());
4551 
4552         // This call is supposed to get EINVAL ioctlReply, since the in_size is not the size of
4553         // struct FsPermissionDataBuffer.
4554         let res = fs_ioc_setpermission(&p, mem::size_of_val(&fake_data_buffer) as u32, r)
4555             .expect_err("invalid in_size should get Error");
4556         assert!(res
4557             .raw_os_error()
4558             .is_some_and(|errno| { errno == libc::EINVAL }));
4559     }
4560 
4561     #[test]
4562     #[cfg(feature = "arc_quota")]
permission_data_path_matching()4563     fn permission_data_path_matching() {
4564         let ctx = get_context();
4565         let temp_dir = TempDir::new().unwrap();
4566         // Prepare `a.txt` before starting the test.
4567         create_test_data(&temp_dir, &["dir"], &["a.txt", "dir/a.txt"]);
4568 
4569         let cfg = Config {
4570             max_dynamic_perm: 1,
4571             ..Default::default()
4572         };
4573         let fs = PassthroughFs::new("tag", cfg).unwrap();
4574 
4575         let capable = FsOptions::empty();
4576         fs.init(capable).unwrap();
4577 
4578         const BY_PATH_UID: u32 = 655360;
4579         const BY_PATH_GID: u32 = 655361;
4580         const BY_PATH_UMASK: u32 = 0o007;
4581 
4582         let dir_path = temp_dir.path().join("dir");
4583         let permission_data = PermissionData {
4584             guest_uid: BY_PATH_UID,
4585             guest_gid: BY_PATH_GID,
4586             host_uid: ctx.uid,
4587             host_gid: ctx.gid,
4588             umask: BY_PATH_UMASK,
4589             perm_path: dir_path.to_string_lossy().into_owned(),
4590         };
4591         fs.permission_paths
4592             .write()
4593             .expect("permission_path lock must be acquired")
4594             .push(permission_data);
4595 
4596         // a_path is the path with out set permission by path
4597         let a_path = temp_dir.path().join("a.txt");
4598         let in_dir_a_path = dir_path.join("a.txt");
4599 
4600         // a.txt should not be set with guest_uid/guest_uid/umask by path
4601         let a_entry = lookup_ent(&fs, &a_path).expect("a.txt must exist");
4602         assert_ne!(a_entry.attr.st_uid, BY_PATH_UID);
4603         assert_ne!(a_entry.attr.st_gid, BY_PATH_GID);
4604 
4605         // a.txt in dir should be set guest_uid/guest_uid/umask by path
4606         let in_dir_a_entry = lookup_ent(&fs, &in_dir_a_path).expect("dir/a.txt must exist");
4607         assert_eq!(in_dir_a_entry.attr.st_uid, BY_PATH_UID);
4608         assert_eq!(in_dir_a_entry.attr.st_gid, BY_PATH_GID);
4609         assert_eq!(in_dir_a_entry.attr.st_mode & 0o777, !BY_PATH_UMASK & 0o777);
4610 
4611         // Create dir/b.txt.
4612         let in_dir_b_path = dir_path.join("b.txt");
4613         create(&fs, &in_dir_b_path).expect("create b.txt");
4614 
4615         // newly created b.txt in dir should be set guest_uid/guest_uid/umask by path
4616         let in_dir_b_entry = lookup_ent(&fs, &in_dir_a_path).expect("dir/b.txt must exist");
4617         assert_eq!(in_dir_b_entry.attr.st_uid, BY_PATH_UID);
4618         assert_eq!(in_dir_b_entry.attr.st_gid, BY_PATH_GID);
4619         assert_eq!(in_dir_b_entry.attr.st_mode & 0o777, !BY_PATH_UMASK & 0o777);
4620     }
4621 
4622     #[test]
4623     #[cfg(feature = "arc_quota")]
set_path_xattr_ioctl_valid_data()4624     fn set_path_xattr_ioctl_valid_data() {
4625         // Since PassthroughFs may executes process-wide operations such as `fchdir`, acquire
4626         // `NamedLock` before starting each unit test creating a `PassthroughFs` instance.
4627         let lock = NamedLock::create(UNITTEST_LOCK_NAME).expect("create named lock");
4628         let _guard = lock.lock().expect("acquire named lock");
4629 
4630         let cfg: Config = Config {
4631             max_dynamic_xattr: 1,
4632             ..Default::default()
4633         };
4634         let p = PassthroughFs::new("tag", cfg).expect("Failed to create PassthroughFs");
4635 
4636         let path_string = String::from("/test");
4637         let xattr_name_string = String::from("test_name");
4638         let xattr_value_string = String::from("test_value");
4639         let fs_path_xattr_data_buffer = FsPathXattrDataBuffer {
4640             path: {
4641                 let mut path: [u8; FS_IOCTL_PATH_MAX_LEN] = [0; FS_IOCTL_PATH_MAX_LEN];
4642                 path[..path_string.len()].copy_from_slice(path_string.as_bytes());
4643                 path
4644             },
4645             xattr_name: {
4646                 let mut xattr_name: [u8; FS_IOCTL_XATTR_NAME_MAX_LEN] =
4647                     [0; FS_IOCTL_XATTR_NAME_MAX_LEN];
4648                 xattr_name[..xattr_name_string.len()].copy_from_slice(xattr_name_string.as_bytes());
4649                 xattr_name
4650             },
4651             xattr_value: {
4652                 let mut xattr_value: [u8; FS_IOCTL_XATTR_VALUE_MAX_LEN] =
4653                     [0; FS_IOCTL_XATTR_VALUE_MAX_LEN];
4654                 xattr_value[..xattr_value_string.len()]
4655                     .copy_from_slice(xattr_value_string.as_bytes());
4656                 xattr_value
4657             },
4658         };
4659         let r = std::io::Cursor::new(fs_path_xattr_data_buffer.as_bytes());
4660 
4661         let res = fs_ioc_setpathxattr(
4662             &p,
4663             mem::size_of_val(&fs_path_xattr_data_buffer) as u32,
4664             r.clone(),
4665         )
4666         .expect("valid input should get IoctlReply");
4667         assert!(matches!(res, IoctlReply::Done(Ok(data)) if data.is_empty()));
4668 
4669         let read_guard = p.xattr_paths.read().expect("read xattr_paths failed");
4670         let xattr_data = read_guard.first().expect("xattr_paths should not be empty");
4671 
4672         // Check expected data item is added to permission_paths.
4673         let expected_data = XattrData {
4674             xattr_path: path_string,
4675             xattr_name: xattr_name_string,
4676             xattr_value: xattr_value_string,
4677         };
4678         assert_eq!(*xattr_data, expected_data);
4679 
4680         // Second ioctl should not succeed since max_dynamic_perm is set to 1
4681         let res = fs_ioc_setpathxattr(
4682             &p,
4683             mem::size_of_val(&fs_path_xattr_data_buffer) as u32,
4684             r.clone(),
4685         )
4686         .expect("valid input should get IoctlReply");
4687         assert!(
4688             matches!(res, IoctlReply::Done(Err(err)) if err.raw_os_error().is_some_and(|errno| {
4689                 errno == libc::EPERM
4690             }))
4691         );
4692     }
4693     #[test]
4694     #[cfg(feature = "arc_quota")]
set_path_xattr_ioctl_invalid_data()4695     fn set_path_xattr_ioctl_invalid_data() {
4696         // Since PassthroughFs may executes process-wide operations such as `fchdir`, acquire
4697         // `NamedLock` before starting each unit test creating a `PassthroughFs` instance.
4698         let lock = NamedLock::create(UNITTEST_LOCK_NAME).expect("create named lock");
4699         let _guard = lock.lock().expect("acquire named lock");
4700 
4701         let cfg: Config = Config {
4702             max_dynamic_xattr: 1,
4703             ..Default::default()
4704         };
4705         let p = PassthroughFs::new("tag", cfg).expect("Failed to create PassthroughFs");
4706 
4707         let path_string = String::from("test");
4708         let xattr_name_string = String::from("test_name");
4709         let xattr_value_string = String::from("test_value");
4710         let fs_path_xattr_data_buffer = FsPathXattrDataBuffer {
4711             path: {
4712                 let mut path: [u8; FS_IOCTL_PATH_MAX_LEN] = [0; FS_IOCTL_PATH_MAX_LEN];
4713                 path[..path_string.len()].copy_from_slice(path_string.as_bytes());
4714                 path
4715             },
4716             xattr_name: {
4717                 let mut xattr_name: [u8; FS_IOCTL_XATTR_NAME_MAX_LEN] =
4718                     [0; FS_IOCTL_XATTR_NAME_MAX_LEN];
4719                 xattr_name[..xattr_name_string.len()].copy_from_slice(xattr_name_string.as_bytes());
4720                 xattr_name
4721             },
4722             xattr_value: {
4723                 let mut xattr_value: [u8; FS_IOCTL_XATTR_VALUE_MAX_LEN] =
4724                     [0; FS_IOCTL_XATTR_VALUE_MAX_LEN];
4725                 xattr_value[..xattr_value_string.len()]
4726                     .copy_from_slice(xattr_value_string.as_bytes());
4727                 xattr_value
4728             },
4729         };
4730         let r = std::io::Cursor::new(fs_path_xattr_data_buffer.as_bytes());
4731 
4732         // This call is supposed to get EINVAL ioctlReply, since the perm_path is invalid.
4733         let res = fs_ioc_setpathxattr(
4734             &p,
4735             mem::size_of_val(&fs_path_xattr_data_buffer) as u32,
4736             r.clone(),
4737         )
4738         .expect("valid input should get IoctlReply");
4739         assert!(
4740             matches!(res, IoctlReply::Done(Err(err)) if err.raw_os_error().is_some_and(|errno| {
4741                 errno == libc::EINVAL
4742             }))
4743         );
4744 
4745         let fake_data_buffer: [u8; 128] = [0; 128];
4746         let r = std::io::Cursor::new(fake_data_buffer.as_bytes());
4747         // This call is supposed to get EINVAL ioctlReply, since the in_size is not the size of
4748         // struct FsPathXattrDataBuffer.
4749         let res = fs_ioc_setpathxattr(&p, mem::size_of_val(&fake_data_buffer) as u32, r.clone())
4750             .expect_err("valid input should get IoctlReply");
4751         assert!(res
4752             .raw_os_error()
4753             .is_some_and(|errno| { errno == libc::EINVAL }));
4754     }
4755 
4756     #[test]
4757     #[cfg(feature = "arc_quota")]
xattr_data_path_matching()4758     fn xattr_data_path_matching() {
4759         let ctx = get_context();
4760         let temp_dir = TempDir::new().unwrap();
4761         // Prepare `a.txt` before starting the test.
4762         create_test_data(&temp_dir, &["dir"], &["a.txt", "dir/a.txt"]);
4763 
4764         let cfg = Config {
4765             max_dynamic_xattr: 1,
4766             ..Default::default()
4767         };
4768         let fs = PassthroughFs::new("tag", cfg).unwrap();
4769 
4770         let capable = FsOptions::empty();
4771         fs.init(capable).unwrap();
4772 
4773         let dir_path = temp_dir.path().join("dir");
4774         let xattr_name_string = String::from("test_name");
4775         let xattr_name_cstring = CString::new(xattr_name_string.clone()).expect("create c string");
4776         let xattr_value_string = String::from("test_value");
4777         let xattr_value_bytes = xattr_value_string.clone().into_bytes();
4778 
4779         let xattr_data = XattrData {
4780             xattr_name: xattr_name_string,
4781             xattr_value: xattr_value_string,
4782             xattr_path: dir_path.to_string_lossy().into_owned(),
4783         };
4784         fs.xattr_paths
4785             .write()
4786             .expect("xattr_paths lock must be acquired")
4787             .push(xattr_data);
4788 
4789         // a_path is the path with out set xattr by path
4790         let a_path: std::path::PathBuf = temp_dir.path().join("a.txt");
4791         let in_dir_a_path = dir_path.join("a.txt");
4792 
4793         let a_node = lookup(&fs, a_path.as_path()).expect("lookup a node");
4794         // a.txt should not be set with xattr by path
4795         assert!(fs
4796             .getxattr(
4797                 ctx,
4798                 a_node,
4799                 &xattr_name_cstring,
4800                 xattr_value_bytes.len() as u32
4801             )
4802             .is_err());
4803 
4804         let in_dir_a_node = lookup(&fs, in_dir_a_path.as_path()).expect("lookup in dir a node");
4805         // a.txt in dir should be set xattr by path
4806         let in_dir_a_reply = fs
4807             .getxattr(
4808                 ctx,
4809                 in_dir_a_node,
4810                 &xattr_name_cstring,
4811                 xattr_value_bytes.len() as u32,
4812             )
4813             .expect("Getxattr should success");
4814         assert!(matches!(in_dir_a_reply, GetxattrReply::Value(v) if v == xattr_value_bytes));
4815         // Create dir/b.txt.
4816         let in_dir_b_path = dir_path.join("b.txt");
4817         create(&fs, &in_dir_b_path).expect("create b.txt");
4818 
4819         // newly created b.txt in dir should be set xattr by path
4820         let in_dir_b_node = lookup(&fs, in_dir_a_path.as_path()).expect("lookup in dir b node");
4821         let in_dir_b_reply = fs
4822             .getxattr(
4823                 ctx,
4824                 in_dir_b_node,
4825                 &xattr_name_cstring,
4826                 xattr_value_bytes.len() as u32,
4827             )
4828             .expect("Getxattr should success");
4829         assert!(matches!(in_dir_b_reply, GetxattrReply::Value(v) if v == xattr_value_bytes));
4830     }
4831 
4832     /// Creates and open a new file by atomic_open with O_APPEND flag.
4833     /// We check O_APPEND is properly handled, depending on writeback cache is enabled or not.
atomic_open_create_o_append(writeback: bool)4834     fn atomic_open_create_o_append(writeback: bool) {
4835         // Since PassthroughFs may executes process-wide operations such as `fchdir`, acquire
4836         // `NamedLock` before starting each unit test creating a `PassthroughFs` instance.
4837         let lock = NamedLock::create(UNITTEST_LOCK_NAME).expect("create named lock");
4838         let _guard = lock.lock().expect("acquire named lock");
4839 
4840         let temp_dir = TempDir::new().unwrap();
4841 
4842         let cfg = Config {
4843             cache_policy: CachePolicy::Always,
4844             writeback,
4845             ..Default::default()
4846         };
4847         let fs = PassthroughFs::new("tag", cfg).unwrap();
4848 
4849         let capable = FsOptions::ZERO_MESSAGE_OPEN | FsOptions::WRITEBACK_CACHE;
4850         fs.init(capable).unwrap();
4851 
4852         let (entry, _, _) = atomic_open(
4853             &fs,
4854             &temp_dir.path().join("a.txt"),
4855             0o666,
4856             (libc::O_RDWR | libc::O_CREAT | libc::O_APPEND) as u32,
4857             0,
4858             None,
4859         )
4860         .expect("atomic_open");
4861         assert_ne!(entry.inode, 0);
4862 
4863         let inodes = fs.inodes.lock();
4864         let data = inodes.get(&entry.inode).unwrap();
4865         let flags = data.file.lock().1;
4866         if writeback {
4867             // When writeback is enabled, O_APPEND must be handled by the guest kernel.
4868             // So, it must be cleared.
4869             assert_eq!(flags & libc::O_APPEND, 0);
4870         } else {
4871             // Without writeback cache, O_APPEND must not be cleared.
4872             assert_eq!(flags & libc::O_APPEND, libc::O_APPEND);
4873         }
4874     }
4875 
4876     #[test]
test_atomic_open_create_o_append_no_writeback()4877     fn test_atomic_open_create_o_append_no_writeback() {
4878         atomic_open_create_o_append(false);
4879     }
4880 
4881     #[test]
test_atomic_open_create_o_append_writeback()4882     fn test_atomic_open_create_o_append_writeback() {
4883         atomic_open_create_o_append(true);
4884     }
4885 }
4886