youki container escape via "masked path" abuse due to mount race conditions
Description
Youki is a container runtime written in Rust. In versions 0.5.6 and below, the initial validation of the source /dev/null is insufficient, allowing container escape when youki bind-mounts the container's /dev/null over a path in order to mask it. This issue is fixed in version 0.5.7.
AI Insight
LLM-synthesized narrative grounded in this CVE's description and references.
Youki container runtime <=0.5.6 insufficiently validates /dev/null during masked-path bind-mounts, allowing container escape.
Vulnerability
Overview
CVE-2025-62161 is a container escape vulnerability in Youki, a container runtime written in Rust, affecting versions 0.5.6 and below. The root cause is insufficient validation of the source /dev/null when it is used to mask files via bind-mounting, as specified by the OCI runtime specification's maskedPaths feature.[2] The OCI spec allows masking files by bind-mounting /dev/null from the container over the target path. However, Youki does not verify that the source inode is actually a real /dev/null inode before performing the mount.[1]
Exploitation
Method
An attacker with the ability to create or modify the /dev/null inode within a container—potentially through race conditions with other containers sharing mounts—can replace /dev/null with a symlink to an attacker-controlled path. This allows the attacker to cause Youki to bind-mount an arbitrary source path to a path inside the container.[1] The attack is feasible in scenarios where multiple containers share mounts, such as during a Docker build with docker buildx build, which permits parallel execution of containers with custom shared mounts.[1]
Impact
Successful exploitation can lead to container escape, host information disclosure, or host denial of service. By causing Youki to bind-mount files such as /proc/sysrq-trigger into the container, an attacker could gain read-write access to host-critical files, enabling destructive actions against the host system.[1] This is a high-severity issue because it breaks container isolation boundaries.
Mitigation
The vulnerability is fixed in Youki version 0.5.7.[2] Users should upgrade to the latest version immediately. For those unable to upgrade, limiting the use of shared mounts between containers and applying additional security restrictions may reduce risk, but these measures are not complete mitigations. The issue closely resembles an earlier vulnerability in runc (GHSA-9493-h29p-rfm2).[1]
AI Insight generated on May 19, 2026. Synthesized from this CVE's description and the cited reference URLs; citations are validated against the source bundle.
Affected packages
Versions sourced from the GitHub Security Advisory.
| Package | Affected versions | Patched versions |
|---|---|---|
| youki (crates.io) | < 0.5.7 | 0.5.7 |
Affected products
2Patches
15886c91073b9Merge commit from fork
18 files changed · +1038 −388
Cargo.lock+43 −8 modified@@ -236,7 +236,7 @@ dependencies = [ "bitflags 2.9.4", "cexpr", "clang-sys", - "itertools", + "itertools 0.12.1", "lazy_static", "lazycell", "proc-macro2", @@ -1927,6 +1927,15 @@ dependencies = [ "either", ] +[[package]] +name = "itertools" +version = "0.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b192c782037fadd9cfa75548310488aabdbf3d2da73885b31bd0abd03351285" +dependencies = [ + "either", +] + [[package]] name = "itoa" version = "1.0.15" @@ -2035,6 +2044,7 @@ dependencies = [ "mockall", "nix 0.29.0", "oci-spec", + "pathrs", "procfs", "quickcheck", "rbpf", @@ -2061,6 +2071,7 @@ dependencies = [ "nix 0.29.0", "oci-spec", "once_cell", + "pathrs", "prctl", "procfs", "protobuf", @@ -2152,9 +2163,9 @@ checksum = "78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89" [[package]] name = "linux-raw-sys" -version = "0.9.2" +version = "0.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6db9c683daf087dc577b7506e9695b3d556a9f3849903fa28186283afd6809e9" +checksum = "df1d3c3b53da64cf5760482273a98e575c651a67eec7f77df96b5b642de8f039" [[package]] name = "lock_api" @@ -2615,6 +2626,24 @@ dependencies = [ "once_cell", ] +[[package]] +name = "pathrs" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c63f58f2463b3eecc7d285eace144d8c4820631d018d927049f8569be4b3b1a4" +dependencies = [ + "bitflags 2.6.0", + "itertools 0.14.0", + "libc", + "memchr", + "once_cell", + "rustix 1.1.2", + "rustversion", + "static_assertions", + "tempfile", + "thiserror 2.0.17", +] + [[package]] name = "pentacle" version = "1.1.0" @@ -3396,14 +3425,14 @@ dependencies = [ [[package]] name = "rustix" -version = "1.0.0" +version = "1.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "17f8dcd64f141950290e45c99f7710ede1b600297c91818bb30b3667c0f45dc0" +checksum = 
"cd15f8a2c5551a84d56efdc1cd049089e409ac19a3072d5037a17fd70719ff3e" dependencies = [ "bitflags 2.9.4", "errno", "libc", - "linux-raw-sys 0.9.2", + "linux-raw-sys 0.11.0", "windows-sys 0.59.0", ] @@ -3885,6 +3914,12 @@ dependencies = [ "version_check", ] +[[package]] +name = "static_assertions" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" + [[package]] name = "stdweb" version = "0.4.20" @@ -4077,7 +4112,7 @@ dependencies = [ "fastrand", "getrandom 0.3.1", "once_cell", - "rustix 1.0.0", + "rustix 1.1.2", "windows-sys 0.59.0", ] @@ -5431,7 +5466,7 @@ dependencies = [ "cranelift-frontend 0.118.0", "cranelift-native", "gimli 0.31.1", - "itertools", + "itertools 0.12.1", "log", "object 0.36.7", "pulley-interpreter",
crates/libcgroups/Cargo.toml+1 −0 modified@@ -22,6 +22,7 @@ cgroupsv2_devices = ["rbpf", "libbpf-sys", "errno", "libc", "nix/dir"] [dependencies] nix = { version = "0.29.0", features = ["signal", "user", "fs"] } procfs = "0.17.0" +pathrs = "0.2.1" oci-spec = { version = "~0.8.3", features = ["runtime"] } fixedbitset = "0.5.7" serde = { version = "1.0", features = ["derive"] }
crates/libcgroups/src/v1/manager.rs+13 −7 modified@@ -4,8 +4,9 @@ use std::path::{Path, PathBuf}; use std::time::Duration; use nix::unistd::Pid; -use procfs::ProcError; -use procfs::process::Process; +use pathrs::flags::OpenFlags; +use pathrs::procfs::{ProcfsBase, ProcfsHandle}; +use procfs::{FromRead, ProcError, ProcessCGroups}; use super::blkio::{Blkio, V1BlkioStatsError}; use super::controller::Controller; @@ -47,6 +48,8 @@ pub enum V1ManagerError { CGroupRequired(CtrlType), #[error("subsystem does not exist")] SubsystemDoesNotExist, + #[error(transparent)] + Pathrs(#[from] pathrs::error::Error), #[error(transparent)] BlkioController(WrappedIoError), @@ -101,11 +104,14 @@ impl Manager { tracing::debug!("Get path for subsystem: {}", subsystem); let mount_point = util::get_subsystem_mount_point(subsystem)?; - let cgroup = Process::myself()? - .cgroups()? - .into_iter() - .find(|c| c.controllers.contains(&subsystem.to_string())) - .ok_or(V1ManagerError::SubsystemDoesNotExist)?; + let cgroup = ProcessCGroups::from_read(ProcfsHandle::new()?.open( + ProcfsBase::ProcSelf, + "cgroup", + OpenFlags::O_RDONLY | OpenFlags::O_CLOEXEC, + )?)? + .into_iter() + .find(|c| c.controllers.contains(&subsystem.to_string())) + .ok_or(V1ManagerError::SubsystemDoesNotExist)?; let p = if cgroup_path.as_os_str().is_empty() { mount_point.join_safely(Path::new(&cgroup.pathname))?
crates/libcgroups/src/v1/util.rs+61 −38 modified@@ -1,33 +1,50 @@ use std::collections::HashMap; +use std::io::{BufRead, BufReader}; use std::path::PathBuf; +use pathrs::flags::OpenFlags; +use pathrs::procfs::{ProcfsBase, ProcfsHandle}; use procfs::ProcError; -use procfs::process::Process; +use procfs::process::MountInfo; use super::ControllerType; use super::controller_type::CONTROLLERS; #[derive(thiserror::Error, Debug)] pub enum V1MountPointError { - #[error("failed to read process info from /proc/self: {0}")] - ReadSelf(ProcError), + #[error("io error: {0}")] + Io(#[from] std::io::Error), #[error("failed to get mountinfo: {0}")] MountInfo(ProcError), #[error("could not find mountpoint for {subsystem}")] NotFound { subsystem: ControllerType }, + #[error(transparent)] + Pathrs(#[from] pathrs::error::Error), } /// List all cgroup v1 subsystem mount points on the system. This can include unsupported /// subsystems, comounted controllers and named hierarchies. pub fn list_subsystem_mount_points() -> Result<Vec<PathBuf>, V1MountPointError> { - Ok(Process::myself() - .map_err(V1MountPointError::ReadSelf)? - .mountinfo() - .map_err(V1MountPointError::MountInfo)? - .into_iter() - .filter(|m| m.fs_type == "cgroup") - .map(|m| m.mount_point) - .collect()) + let reader = BufReader::new(ProcfsHandle::new()?.open( + ProcfsBase::ProcSelf, + "mountinfo", + OpenFlags::O_RDONLY | OpenFlags::O_CLOEXEC, + )?); + + reader + .lines() + .map(|lr| { + lr.map_err(V1MountPointError::Io) + .and_then(|line| MountInfo::from_line(&line).map_err(V1MountPointError::MountInfo)) + }) + .try_fold(Vec::new(), |mut mount_points, r| { + r.map(|m| { + if m.fs_type == "cgroup" { + mount_points.push(m.mount_point); + } + mount_points + }) + }) } /// List the mount points of all currently supported cgroup subsystems. 
@@ -46,38 +63,44 @@ pub fn list_supported_mount_points() -> Result<HashMap<ControllerType, PathBuf>, pub fn get_subsystem_mount_point(subsystem: &ControllerType) -> Result<PathBuf, V1MountPointError> { let subsystem_name = subsystem.to_string(); - Process::myself() - .map_err(V1MountPointError::ReadSelf)? - .mountinfo() - .map_err(V1MountPointError::MountInfo)? - .into_iter() - .find(|m| { - if m.fs_type == "cgroup" { + let reader = BufReader::new(ProcfsHandle::new()?.open( + ProcfsBase::ProcSelf, + "mountinfo", + OpenFlags::O_RDONLY | OpenFlags::O_CLOEXEC, + )?); + + reader + .lines() + .map(|lr| { + lr.map_err(V1MountPointError::Io) + .and_then(|line| MountInfo::from_line(&line).map_err(V1MountPointError::MountInfo)) + }) + .find_map(|r| match r { + Err(e) => Some(Err(e)), + Ok(m) if m.fs_type == "cgroup" => { // Some systems mount net_prio and net_cls in the same directory // other systems mount them in their own directories. This // should handle both cases. - if subsystem_name == "net_cls" { - return m.mount_point.ends_with("net_cls,net_prio") - || m.mount_point.ends_with("net_prio,net_cls") - || m.mount_point.ends_with("net_cls"); - } else if subsystem_name == "net_prio" { - return m.mount_point.ends_with("net_cls,net_prio") - || m.mount_point.ends_with("net_prio,net_cls") - || m.mount_point.ends_with("net_prio"); - } - - if subsystem_name == "cpu" { - return m.mount_point.ends_with("cpu,cpuacct") - || m.mount_point.ends_with("cpu"); - } - if subsystem_name == "cpuacct" { - return m.mount_point.ends_with("cpu,cpuacct") - || m.mount_point.ends_with("cpuacct"); - } + let ok = match subsystem_name.as_str() { + "net_cls" => ["net_cls,net_prio", "net_prio,net_cls", "net_cls"] + .iter() + .any(|s| m.mount_point.ends_with(s)), + "net_prio" => ["net_cls,net_prio", "net_prio,net_cls", "net_prio"] + .iter() + .any(|s| m.mount_point.ends_with(s)), + "cpu" => ["cpu,cpuacct", "cpu"] + .iter() + .any(|s| m.mount_point.ends_with(s)), + "cpuacct" => ["cpu,cpuacct", 
"cpuacct"] + .iter() + .any(|s| m.mount_point.ends_with(s)), + _ => m.mount_point.ends_with(&subsystem_name), + }; + if ok { Some(Ok(m.mount_point)) } else { None } } - m.mount_point.ends_with(&subsystem_name) + Ok(_) => None, }) - .map(|m| m.mount_point) + .transpose()? .ok_or(V1MountPointError::NotFound { subsystem: *subsystem, })
crates/libcgroups/src/v2/util.rs+26 −6 modified@@ -1,7 +1,10 @@ +use std::io::{BufRead, BufReader}; use std::path::{Path, PathBuf}; +use pathrs::flags::OpenFlags; +use pathrs::procfs::{ProcfsBase, ProcfsHandle}; use procfs::ProcError; -use procfs::process::Process; +use procfs::process::MountInfo; use super::controller_type::ControllerType; use crate::common::{self, WrappedIoError}; @@ -11,6 +14,8 @@ pub const CGROUP_SUBTREE_CONTROL: &str = "cgroup.subtree_control"; #[derive(thiserror::Error, Debug)] pub enum V2UtilError { + #[error("io error: {0}")] + Io(#[from] std::io::Error), #[error("io error: {0}")] WrappedIo(#[from] WrappedIoError), #[error("proc error: {0}")] @@ -19,15 +24,30 @@ pub enum V2UtilError { CouldNotFind, #[error("cannot get available controllers. {0} does not exist")] DoesNotExist(PathBuf), + #[error(transparent)] + Pathrs(#[from] pathrs::error::Error), } // Reads the `/proc/self/mountinfo` to get the mount point of this cgroup pub fn get_unified_mount_point() -> Result<PathBuf, V2UtilError> { - Process::myself()? - .mountinfo()? - .into_iter() - .find(|m| m.fs_type == "cgroup2") - .map(|m| m.mount_point) + let reader = BufReader::new(ProcfsHandle::new()?.open( + ProcfsBase::ProcSelf, + "mountinfo", + OpenFlags::O_RDONLY | OpenFlags::O_CLOEXEC, + )?); + + reader + .lines() + .map(|lr| { + lr.map_err(V2UtilError::Io) + .and_then(|s| MountInfo::from_line(&s).map_err(V2UtilError::from)) + }) + .find_map(|r| match r { + Ok(mi) if mi.fs_type == "cgroup2" => Some(Ok(mi.mount_point)), + Ok(_) => None, + Err(e) => Some(Err(e)), + }) + .transpose()? .ok_or(V2UtilError::CouldNotFind) }
crates/libcontainer/Cargo.toml+1 −0 modified@@ -53,6 +53,7 @@ thiserror = "2.0.17" tracing = { version = "0.1.41", features = ["attributes"] } safe-path = "0.1.0" nc = "0.9.6" +pathrs = "0.2.1" [dev-dependencies] oci-spec = { version = "~0.8.3", features = ["proptests", "runtime"] }
crates/libcontainer/src/apparmor.rs+23 −18 modified@@ -1,18 +1,20 @@ -use std::fs::{self}; -use std::path::Path; +use std::fs; +use std::io::Write; +use std::path::{Path, PathBuf}; -use crate::utils; +use pathrs::flags::OpenFlags; +use pathrs::procfs::{ProcfsBase, ProcfsHandle}; #[derive(Debug, thiserror::Error)] pub enum AppArmorError { #[error("failed to apply AppArmor profile")] ActivateProfile { - path: std::path::PathBuf, + path: PathBuf, profile: String, source: std::io::Error, }, #[error(transparent)] - EnsureProcfs(#[from] utils::EnsureProcfsError), + Pathrs(#[from] pathrs::error::Error), } type Result<T> = std::result::Result<T, AppArmorError>; @@ -33,19 +35,22 @@ pub fn apply_profile(profile: &str) -> Result<()> { // Try the module specific subdirectory. This is the recommended way to configure // LSMs since Linux 5.1. AppArmor has such a directory since Linux 5.8. - if activate_profile(Path::new("/proc/self/attr/apparmor/exec"), profile).is_ok() { - return Ok(()); - } - - // try the legacy interface - activate_profile(Path::new("/proc/self/attr/exec"), profile) + activate_profile(Path::new("attr/apparmor/exec"), profile) + // try the legacy interface + .or_else(|_| activate_profile(Path::new("attr/exec"), profile)) } -fn activate_profile(path: &Path, profile: &str) -> Result<()> { - utils::ensure_procfs(path).map_err(AppArmorError::EnsureProcfs)?; - fs::write(path, format!("exec {profile}")).map_err(|err| AppArmorError::ActivateProfile { - path: path.to_owned(), - profile: profile.to_owned(), - source: err, - }) +fn activate_profile(subpath: &Path, profile: &str) -> Result<()> { + ProcfsHandle::new()? + .open( + ProcfsBase::ProcSelf, + subpath, + OpenFlags::O_WRONLY | OpenFlags::O_CLOEXEC, + )? + .write_all(format!("exec {profile}").as_bytes()) + .map_err(|err| AppArmorError::ActivateProfile { + path: PathBuf::from("/proc/self").join(subpath), + profile: profile.to_owned(), + source: err, + }) }
crates/libcontainer/src/process/container_intermediate_process.rs+2 −3 modified@@ -1,9 +1,8 @@ use std::os::fd::FromRawFd; use libcgroups::common::CgroupManager; -use nix::unistd::{Gid, Pid, Uid, close, write}; +use nix::unistd::{Gid, Pid, Uid, close, getpid, write}; use oci_spec::runtime::{LinuxNamespace, LinuxNamespaceType, LinuxResources}; -use procfs::process::Process; use super::args::{ContainerArgs, ContainerType}; use super::channel::{IntermediateReceiver, MainSender}; @@ -264,7 +263,7 @@ fn apply_cgroups< resources: Option<&LinuxResources>, init: bool, ) -> Result<()> { - let pid = Pid::from_raw(Process::myself()?.pid()); + let pid = getpid(); cmanager.add_task(pid).map_err(|err| { tracing::error!(?pid, ?err, ?init, "failed to add task to cgroup"); IntermediateProcessError::Cgroup(err.to_string())
crates/libcontainer/src/process/init/error.rs+5 −0 modified@@ -1,5 +1,6 @@ use crate::namespaces::NamespaceError; use crate::process::channel; +use crate::rootfs::device::DeviceError; #[cfg(feature = "libseccomp")] use crate::seccomp; use crate::syscall::SyscallError; @@ -36,6 +37,8 @@ pub enum InitProcessError { SyscallOther(#[source] SyscallError), #[error("failed apparmor")] AppArmor(#[source] apparmor::AppArmorError), + #[error(transparent)] + Pathrs(#[from] pathrs::error::Error), #[error("invalid umask")] InvalidUmask(u32), #[error(transparent)] @@ -67,6 +70,8 @@ pub enum InitProcessError { NoLinux, #[error("missing process section in spec")] NoProcess, + #[error("device error")] + Device(#[source] DeviceError), #[error("personality flag has not supported at this time")] UnsupportedPersonalityFlag, }
crates/libcontainer/src/process/init/process.rs+114 −68 modified@@ -1,4 +1,5 @@ use std::collections::HashMap; +use std::io::{Read, Write}; use std::os::unix::io::AsRawFd; use std::path::{Path, PathBuf}; use std::{env, fs, mem}; @@ -12,6 +13,8 @@ use oci_spec::runtime::{ IOPriorityClass, LinuxIOPriority, LinuxNamespaceType, LinuxPersonalityDomain, LinuxSchedulerFlag, LinuxSchedulerPolicy, Scheduler, Spec, User, }; +use pathrs::flags::OpenFlags; +use pathrs::procfs::{ProcfsBase, ProcfsHandle, ProcfsHandleBuilder}; use super::Result; use super::context::InitContext; @@ -22,6 +25,7 @@ use crate::namespaces::Namespaces; use crate::process::args::{ContainerArgs, ContainerType}; use crate::process::channel; use crate::rootfs::RootFS; +use crate::rootfs::device::{open_device_fd, verify_dev_null}; #[cfg(feature = "libseccomp")] use crate::seccomp; use crate::syscall::{Syscall, SyscallError}; @@ -190,18 +194,11 @@ pub fn container_init_process( } if let Some(paths) = ctx.linux.masked_paths() { - // mount masked path - for path in paths { - masked_path( - Path::new(path), - ctx.linux.mount_label(), - ctx.syscall.as_ref(), - ) - .map_err(|err| { - tracing::error!(?err, ?path, "failed to set masked path"); - err - })?; - } + // mount masked paths + masked_paths(paths, ctx.linux.mount_label(), ctx.syscall.as_ref()).map_err(|err| { + tracing::error!(?err, "failed to set masked paths"); + err + })?; } let cwd = format!("{}", ctx.process.cwd().display()); @@ -434,15 +431,22 @@ pub fn container_init_process( } fn sysctl(kernel_params: &HashMap<String, String>) -> Result<()> { - let sys = PathBuf::from("/proc/sys"); + let procfs = ProcfsHandleBuilder::new().unmasked().build()?; + let sys = PathBuf::from("sys"); for (kernel_param, value) in kernel_params { - let path = sys.join(kernel_param.replace('.', "/")); tracing::debug!( "apply value {} to kernel parameter {}.", value, kernel_param ); - fs::write(path, value.as_bytes()).map_err(|err| { + + let subpath = 
sys.join(kernel_param.replace('.', "/")); + let mut f = procfs.open( + ProcfsBase::ProcRoot, + subpath, + OpenFlags::O_WRONLY | OpenFlags::O_CLOEXEC, + )?; + f.write_all(value.as_bytes()).map_err(|err| { tracing::error!("failed to set sysctl {kernel_param}={value}: {err}"); InitProcessError::Sysctl(err) })?; @@ -498,44 +502,53 @@ fn readonly_path(path: &Path, syscall: &dyn Syscall) -> Result<()> { // For files, bind mounts /dev/null over the top of the specified path. // For directories, mounts read-only tmpfs over the top of the specified path. -fn masked_path(path: &Path, mount_label: &Option<String>, syscall: &dyn Syscall) -> Result<()> { - if let Err(err) = syscall.mount( - Some(Path::new("/dev/null")), - path, - None, - MsFlags::MS_BIND, - None, - ) { - match err { - SyscallError::Nix(nix::errno::Errno::ENOENT) => { - // ignore error if path is not exist. - } - SyscallError::Nix(nix::errno::Errno::ENOTDIR) => { - let label = match mount_label { - Some(l) => format!("context=\"{l}\""), - None => "".to_string(), - }; - syscall - .mount( - Some(Path::new("tmpfs")), - path, - Some("tmpfs"), - MsFlags::MS_RDONLY, - Some(label.as_str()), - ) - .map_err(|err| { - tracing::error!(?path, ?err, "failed to mount path as masked using tempfs"); - InitProcessError::MountPathMasked(err) - })?; - } - _ => { +fn masked_paths( + paths: &Vec<String>, + mount_label: &Option<String>, + syscall: &dyn Syscall, +) -> Result<()> { + let (dev_null_fd, dev_null_stat) = + open_device_fd(Path::new("/dev/null")).map_err(InitProcessError::NixOther)?; + verify_dev_null(&dev_null_stat).map_err(|err| { + tracing::error!(?err, "invalid /dev/null device"); + InitProcessError::Device(err) + })?; + + for path_str in paths { + let path = Path::new(path_str); + if !path.exists() { + // Skip if the path does not exist. + continue; + } + + if path.is_dir() { + // Destination is a directory, mount a read-only tmpfs over the top of it. 
+ let label = match mount_label { + Some(l) => format!("context=\"{l}\""), + None => "".to_string(), + }; + syscall + .mount( + Some(Path::new("tmpfs")), + path, + Some("tmpfs"), + MsFlags::MS_RDONLY, + Some(label.as_str()), + ) + .map_err(|err| { + tracing::error!(?path, ?err, "failed to mount path as masked using tempfs"); + InitProcessError::MountPathMasked(err) + })?; + } else { + // Destination is a file, bind mount /dev/null over the top of it. + syscall.mount_from_fd(&dev_null_fd, path).map_err(|err| { tracing::error!( ?path, ?err, "failed to mount path as masked using /dev/null" ); - return Err(InitProcessError::MountPathMasked(err)); - } + InitProcessError::MountPathMasked(err) + })?; } } @@ -588,7 +601,6 @@ fn reopen_dev_null() -> Result<()> { // At this point we should be inside of the container and now // we can re-open /dev/null if it is in use to the /dev/null // in the container. - let dev_null = fs::File::open("/dev/null").map_err(|err| { tracing::error!(?err, "failed to open /dev/null inside the container"); InitProcessError::ReopenDevNull(err) @@ -597,6 +609,10 @@ fn reopen_dev_null() -> Result<()> { tracing::error!(?err, "failed to fstat /dev/null inside the container"); InitProcessError::NixOther(err) })?; + verify_dev_null(&dev_null_fstat_info).map_err(|err| { + tracing::error!(?err, "invalid /dev/null device inside the container"); + InitProcessError::Device(err) + })?; // Check if stdin, stdout or stderr point to /dev/null for fd in 0..3 { @@ -722,10 +738,15 @@ fn set_supplementary_gids( return Ok(()); } - let setgroups = fs::read_to_string("/proc/self/setgroups").map_err(|err| { - tracing::error!(?err, "failed to read setgroups"); - InitProcessError::Io(err) - })?; + let mut setgroups = String::new(); + ProcfsHandle::new()? + .open(ProcfsBase::ProcSelf, "setgroups", OpenFlags::O_RDONLY)? 
+ .read_to_string(&mut setgroups) + .map_err(|err| { + tracing::error!(?err, "failed to read setgroups"); + InitProcessError::Io(err) + })?; + if setgroups.trim() == "deny" { tracing::error!("cannot set supplementary gids, setgroup is disabled"); return Err(InitProcessError::SetGroupDisabled); @@ -904,6 +925,7 @@ fn verify_cwd() -> Result<()> { #[cfg(test)] mod tests { use std::fs; + use std::path::Path; use anyhow::Result; #[cfg(feature = "libseccomp")] @@ -1092,27 +1114,46 @@ mod tests { .as_any() .downcast_ref::<TestHelperSyscall>() .unwrap(); - mocks.set_ret_err(ArgName::Mount, || { - Err(SyscallError::Nix(nix::errno::Errno::ENOENT)) - }); - assert!(masked_path(Path::new("/proc/self"), &None, syscall.as_ref()).is_ok()); + let paths = vec!["/doesnotexist".to_string()]; + assert!(super::masked_paths(&paths, &None, syscall.as_ref()).is_ok()); + let got = mocks.get_mount_from_fd_args(); + assert_eq!(0, got.len()); let got = mocks.get_mount_args(); assert_eq!(0, got.len()); } + #[test] + fn test_masked_path_mounts_via_fd() -> Result<()> { + let syscall = create_syscall(); + let paths = vec!["/proc/sys/kernel/core_pattern".to_string()]; + super::masked_paths(&paths, &None, syscall.as_ref()).map_err(anyhow::Error::from)?; + + let got = syscall + .as_any() + .downcast_ref::<TestHelperSyscall>() + .unwrap() + .get_mount_from_fd_args(); + assert_eq!(1, got.len()); + let arg = &got[0]; + assert!(arg.fd >= 0); + assert_eq!(PathBuf::from("/proc/sys/kernel/core_pattern"), arg.target); + Ok(()) + } + #[test] fn test_masked_path_is_file_with_no_label() { let syscall = create_syscall(); let mocks = syscall .as_any() .downcast_ref::<TestHelperSyscall>() .unwrap(); - mocks.set_ret_err(ArgName::Mount, || { + mocks.set_ret_err(ArgName::MountFromFd, || { Err(SyscallError::Nix(nix::errno::Errno::ENOTDIR)) }); - assert!(masked_path(Path::new("/proc/self"), &None, syscall.as_ref()).is_ok()); + let paths = vec!["/proc/self".to_string()]; + assert!(super::masked_paths(&paths, &None, 
syscall.as_ref()).is_ok()); let got = mocks.get_mount_args(); let want = MountArgs { @@ -1133,17 +1174,13 @@ mod tests { .as_any() .downcast_ref::<TestHelperSyscall>() .unwrap(); - mocks.set_ret_err(ArgName::Mount, || { + mocks.set_ret_err(ArgName::MountFromFd, || { Err(SyscallError::Nix(nix::errno::Errno::ENOTDIR)) }); + let paths = vec!["/proc/self".to_string()]; assert!( - masked_path( - Path::new("/proc/self"), - &Some("default".to_string()), - syscall.as_ref() - ) - .is_ok() + super::masked_paths(&paths, &Some("default".to_string()), syscall.as_ref()).is_ok() ); let got = mocks.get_mount_args(); @@ -1165,11 +1202,20 @@ mod tests { .as_any() .downcast_ref::<TestHelperSyscall>() .unwrap(); - mocks.set_ret_err(ArgName::Mount, || { + mocks.set_ret_err(ArgName::MountFromFd, || { Err(SyscallError::Nix(nix::errno::Errno::UnknownErrno)) }); - assert!(masked_path(Path::new("/proc/self"), &None, syscall.as_ref()).is_err()); + let paths = vec!["/proc/self/exe".to_string()]; + assert!(super::masked_paths(&paths, &None, syscall.as_ref()).is_err()); + let got = mocks.get_mount_args(); + assert_eq!(0, got.len()); + + mocks.set_ret_err(ArgName::Mount, || { + Err(SyscallError::Nix(nix::errno::Errno::UnknownErrno)) + }); + let paths = vec!["/proc/self".to_string()]; + assert!(super::masked_paths(&paths, &None, syscall.as_ref()).is_err()); let got = mocks.get_mount_args(); assert_eq!(0, got.len()); }
crates/libcontainer/src/process/intel_rdt.rs+23 −12 modified@@ -1,12 +1,14 @@ use std::collections::HashMap; use std::fs::{self, OpenOptions}; -use std::io::Write; +use std::io::{BufRead, BufReader, Write}; use std::path::{Path, PathBuf}; use nix::unistd::Pid; use oci_spec::runtime::LinuxIntelRdt; use once_cell::sync::Lazy; -use procfs::process::Process; +use pathrs::flags::OpenFlags; +use pathrs::procfs::{ProcfsBase, ProcfsHandle}; +use procfs::process::MountInfo; use regex::Regex; #[derive(Debug, thiserror::Error)] @@ -45,6 +47,10 @@ pub enum IntelRdtError { CreateClosIDDirectory(#[source] std::io::Error), #[error("failed to canonicalize path")] Canonicalize(#[source] std::io::Error), + #[error(transparent)] + Pathrs(#[from] pathrs::error::Error), + #[error(transparent)] + Io(#[from] std::io::Error), } #[derive(Debug, thiserror::Error)] @@ -90,16 +96,21 @@ pub fn delete_resctrl_subdirectory(id: &str) -> Result<()> { /// Finds the resctrl mount path by looking at the process mountinfo data. pub fn find_resctrl_mount_point() -> Result<PathBuf> { - let process = Process::myself()?; - let mount_infos = process.mountinfo()?; - - for mount_info in mount_infos.0.iter() { - // "resctrl" type fs can be mounted only once. - if mount_info.fs_type == "resctrl" { - let path = mount_info.mount_point.clone().canonicalize().map_err(|err| { - tracing::error!(path = ?mount_info.mount_point, "failed to canonicalize path: {}", err); - IntelRdtError::Canonicalize(err) - })?; + let reader = BufReader::new(ProcfsHandle::new()?.open( + ProcfsBase::ProcSelf, + "mountinfo", + OpenFlags::O_RDONLY | OpenFlags::O_CLOEXEC, + )?); + + for lr in reader.lines() { + let s = lr.map_err(IntelRdtError::from)?; + let mi = MountInfo::from_line(&s).map_err(IntelRdtError::from)?; + + if mi.fs_type == "resctrl" { + let path = mi + .mount_point + .canonicalize() + .map_err(IntelRdtError::Canonicalize)?; return Ok(path); } }
crates/libcontainer/src/rootfs/device.rs+51 −24 modified@@ -1,8 +1,10 @@ +use std::os::unix::io::{AsRawFd, FromRawFd, OwnedFd}; use std::path::{Path, PathBuf}; +use libc; use nix::fcntl::{OFlag, open}; use nix::mount::MsFlags; -use nix::sys::stat::{Mode, umask}; +use nix::sys::stat::{FileStat, Mode, fstat, umask}; use nix::unistd::{Gid, Uid, close}; use oci_spec::runtime::LinuxDevice; @@ -27,6 +29,35 @@ pub enum DeviceError { type Result<T> = std::result::Result<T, DeviceError>; +pub(crate) fn open_device_fd(dev_path: &Path) -> nix::Result<(OwnedFd, FileStat)> { + let fd = open( + dev_path, + OFlag::O_PATH | OFlag::O_CLOEXEC, + Mode::from_bits_truncate(0o000), + )?; + let owned = unsafe { OwnedFd::from_raw_fd(fd) }; + let stat = fstat(owned.as_raw_fd())?; + Ok((owned, stat)) +} + +pub(crate) fn verify_dev_null(stat: &FileStat) -> Result<()> { + if stat.st_mode & libc::S_IFMT != libc::S_IFCHR { + return Err(DeviceError::Custom( + "device is not a character device".to_string(), + )); + } + + let actual_major = libc::major(stat.st_rdev) as i64; + let actual_minor = libc::minor(stat.st_rdev) as i64; + if actual_major != 1 || actual_minor != 3 { + return Err(DeviceError::Custom(format!( + "device dev null major/minor mismatch: expected 1/3, actual {}/{}", + actual_major, actual_minor + ))); + } + Ok(()) +} + pub struct Device { syscall: Box<dyn Syscall>, } @@ -202,7 +233,7 @@ mod tests { use oci_spec::runtime::{LinuxDeviceBuilder, LinuxDeviceType}; use super::*; - use crate::syscall::test::{ChownArgs, MknodArgs, MountArgs, TestHelperSyscall}; + use crate::syscall::test::{ChownArgs, MknodArgs, TestHelperSyscall}; #[test] fn test_bind_dev() -> Result<()> { @@ -213,27 +244,25 @@ mod tests { .bind_dev( tmp_dir.path(), &LinuxDeviceBuilder::default() - .path(PathBuf::from("/null")) + .path(PathBuf::from("/dev/null")) .build() .unwrap(), ) .is_ok() ); - let want = MountArgs { - source: Some(PathBuf::from("/null")), - target: tmp_dir.path().join("null"), - fstype: 
Some("bind".to_string()), - flags: MsFlags::MS_BIND, - data: None, - }; - let got = &device + let helper = device .syscall .as_any() .downcast_ref::<TestHelperSyscall>() - .unwrap() - .get_mount_args()[0]; - assert_eq!(want, *got); + .unwrap(); + let mount_args = helper.get_mount_args(); + assert_eq!(1, mount_args.len()); + let got = &mount_args[0]; + assert_eq!(Some(PathBuf::from("/dev/null")), got.source); + assert_eq!(tmp_dir.path().join("dev").join("null"), got.target); + assert_eq!(MsFlags::MS_BIND, got.flags); + assert!(got.data.is_none()); Ok(()) } @@ -313,20 +342,18 @@ mod tests { .is_ok() ); - let want = MountArgs { - source: Some(PathBuf::from("/dev/null")), - target: tmp_dir.path().join("dev/null"), - fstype: Some("bind".to_string()), - flags: MsFlags::MS_BIND, - data: None, - }; - let got = &device + let mount_args = device .syscall .as_any() .downcast_ref::<TestHelperSyscall>() .unwrap() - .get_mount_args()[0]; - assert_eq!(want, *got); + .get_mount_args(); + assert_eq!(1, mount_args.len()); + let bind = &mount_args[0]; + assert_eq!(Some(PathBuf::from("/dev/null")), bind.source); + assert_eq!(tmp_dir.path().join("dev/null"), bind.target); + assert_eq!(MsFlags::MS_BIND, bind.flags); + assert!(bind.data.is_none()); assert!( device
crates/libcontainer/src/rootfs/mount.rs+299 −137 modified@@ -1,7 +1,7 @@ -use std::fs::{OpenOptions, canonicalize, create_dir_all}; -use std::io::ErrorKind; -use std::os::unix::fs::MetadataExt; -use std::os::unix::io::AsRawFd; +use std::fs::{Permissions, canonicalize}; +use std::io::{BufRead, BufReader, ErrorKind}; +use std::os::fd::{AsFd, OwnedFd}; +use std::os::unix::fs::{MetadataExt, PermissionsExt}; use std::path::{Path, PathBuf}; use std::time::Duration; #[cfg(feature = "v1")] @@ -12,15 +12,17 @@ use libcgroups::common::CgroupSetup::{Hybrid, Legacy, Unified}; #[cfg(feature = "v1")] use libcgroups::common::DEFAULT_CGROUP_ROOT; use nix::NixPath; -use nix::dir::Dir; use nix::errno::Errno; -use nix::fcntl::OFlag; use nix::mount::MsFlags; -use nix::sys::stat::Mode; use nix::sys::statfs::{PROC_SUPER_MAGIC, statfs}; use oci_spec::runtime::{Mount as SpecMount, MountBuilder as SpecMountBuilder}; -use procfs::process::{MountInfo, MountOptFields, Process}; -use safe_path; +use pathrs::Root; +use pathrs::flags::OpenFlags; +use pathrs::procfs::{ProcfsBase, ProcfsHandle}; +#[cfg(feature = "v1")] +use procfs::process::Process; +use procfs::process::{MountInfo, MountOptFields}; +use procfs::{FromRead, ProcessCGroups}; #[cfg(feature = "v1")] use super::symlink::Symlink; @@ -61,6 +63,8 @@ pub enum MountError { Procfs(#[from] procfs::ProcError), #[error("unknown mount option: {0}")] UnsupportedMountOption(String), + #[error(transparent)] + Pathrs(#[from] pathrs::error::Error), } type Result<T> = std::result::Result<T, MountError>; @@ -238,8 +242,12 @@ impl Mount { // The non-zero ppid means that the PID Namespace is not separated. let ppid = if ppid == 0 { std::process::id() } else { ppid }; let root_cgroups = Process::new(ppid as i32)?.cgroups()?.0; - let process_cgroups: HashMap<String, String> = Process::myself()? - .cgroups()? 
+ let process_cgroups: HashMap<String, String> = + ProcessCGroups::from_read(ProcfsHandle::new()?.open( + ProcfsBase::ProcSelf, + "cgroup", + OpenFlags::O_RDONLY | OpenFlags::O_CLOEXEC, + )?)? .into_iter() .map(|c| { let hierarchy = c.hierarchy; @@ -452,22 +460,16 @@ impl Mount { MountError::Other(err.into()) })?; - let process_cgroup = Process::myself() - .map_err(|err| { - tracing::error!("failed to get /proc/self: {}", err); - MountError::Other(err.into()) - })? - .cgroups() - .map_err(|err| { - tracing::error!("failed to get process cgroups: {}", err); - MountError::Other(err.into()) - })? - .into_iter() - .find(|c| c.hierarchy == 0) - .map(|c| PathBuf::from(c.pathname)) - .ok_or_else(|| { - MountError::Custom("failed to find unified process cgroup".into()) - })?; + let process_cgroup = ProcessCGroups::from_read(ProcfsHandle::new()?.open( + ProcfsBase::ProcSelf, + "cgroup", + OpenFlags::O_RDONLY | OpenFlags::O_CLOEXEC, + )?)? + .into_iter() + .find(|c| c.hierarchy == 0) + .map(|c| PathBuf::from(c.pathname)) + .ok_or_else(|| MountError::Custom("failed to find unified process cgroup".into()))?; + let bind_mount = SpecMountBuilder::default() .typ("bind") .source(host_mount.join_safely(process_cgroup).map_err(|err| { @@ -503,17 +505,21 @@ impl Mount { /// Make parent mount of rootfs private if it was shared, which is required by pivot_root. /// It also makes sure following bind mount does not propagate in other namespaces. pub fn make_parent_mount_private(&self, rootfs: &Path) -> Result<Option<MountInfo>> { - let mount_infos = Process::myself() - .map_err(|err| { - tracing::error!("failed to get /proc/self: {}", err); - MountError::Other(err.into()) - })? 
- .mountinfo() - .map_err(|err| { - tracing::error!("failed to get mount info: {}", err); - MountError::Other(err.into()) - })?; - let parent_mount = find_parent_mount(rootfs, mount_infos.0)?; + let reader = BufReader::new(ProcfsHandle::new()?.open( + ProcfsBase::ProcSelf, + "mountinfo", + OpenFlags::O_RDONLY | OpenFlags::O_CLOEXEC, + )?); + + let mount_infos: Vec<MountInfo> = reader + .lines() + .map(|lr| { + lr.map_err(MountError::from) + .and_then(|s| MountInfo::from_line(&s).map_err(MountError::from)) + }) + .collect::<Result<_>>()?; + + let parent_mount = find_parent_mount(rootfs, mount_infos)?; // check parent mount has 'shared' propagation type if parent_mount @@ -553,130 +559,280 @@ impl Mount { } } - let dest_for_host = safe_path::scoped_join(rootfs, m.destination()).map_err(|err| { - tracing::error!( - "failed to join rootfs {:?} with mount destination {:?}: {}", - rootfs, - m.destination(), - err - ); - MountError::Other(err.into()) - })?; + let root = Root::open(rootfs)?; + let container_dest = m.destination(); - let dest = Path::new(&dest_for_host); let source = m.source().as_ref().ok_or(MountError::NoSource)?; + let dir_perm = Permissions::from_mode(0o755); let src = if typ == Some("bind") { let src = canonicalize(source).map_err(|err| { tracing::error!("failed to canonicalize {:?}: {}", source, err); err })?; - let dir = if src.is_file() { - Path::new(&dest).parent().unwrap() + + if src.is_file() { + let parent = container_dest + .parent() + .ok_or(MountError::Custom("destination has no parent".to_string()))?; + root.mkdir_all(parent, &dir_perm)?; + + match root.create_file( + container_dest, + OpenFlags::O_EXCL + | OpenFlags::O_CREAT + | OpenFlags::O_NOFOLLOW + | OpenFlags::O_CLOEXEC, + &Permissions::from_mode(0o644), + ) { + Ok(_) => Ok(()), + // If we get here, the file is already present, so continue. 
+ Err(create_err) => root + .resolve(container_dest) + .map(|_| ()) + .map_err(|_| create_err), + }?; } else { - Path::new(&dest) + root.mkdir_all(container_dest, &dir_perm)?; }; - create_dir_all(dir).map_err(|err| { - tracing::error!("failed to create dir for bind mount {:?}: {}", dir, err); - err - })?; - - if src.is_file() && !dest.exists() { - OpenOptions::new() - .create(true) - .truncate(true) - .write(true) - .open(dest) - .map_err(|err| { - tracing::error!("failed to create file for bind mount {:?}: {}", src, err); - err - })?; - } - src } else { - create_dir_all(dest).inspect_err(|_err| { - tracing::error!("failed to create device: {:?}", dest); - })?; - + root.mkdir_all(container_dest, &dir_perm)?; PathBuf::from(source) }; - if let Err(err) = - self.syscall - .mount(Some(&*src), dest, typ, mount_option_config.flags, Some(&*d)) - { - if let SyscallError::Nix(errno) = err { - if matches!(errno, Errno::EINVAL) { - self.syscall.mount( - Some(&*src), - dest, - typ, - mount_option_config.flags, - Some(&mount_option_config.data), - )?; - } else if matches!(errno, Errno::EBUSY) { - let mount_op = || -> std::result::Result<(), SyscallError> { - self.syscall.mount( - Some(&*src), - dest, - typ, - mount_option_config.flags, - Some(&*d), - ) - }; - let delay = Duration::from_millis(MOUNT_RETRY_DELAY_MS); - let retry_policy = |err: &SyscallError| -> bool { - matches!(err, SyscallError::Nix(Errno::EBUSY)) - }; - retry(mount_op, MAX_EBUSY_MOUNT_ATTEMPTS - 1, delay, retry_policy)?; - } else { - return Err(err.into()); - } - } else { - return Err(err.into()); + let dest: OwnedFd = root.resolve(container_dest)?.into(); + let dest_fd = dest.as_fd(); + + let is_bind = typ == Some("bind") + || m.options() + .as_deref() + .is_some_and(|ops| ops.iter().any(|o| o == "bind" || o == "rbind")); + + // fd-based mount flow: + // - bind: open_tree -> mount_setattr -> move_mount + // - nonbind: fsopen -> fsconfig -> fsmount -> mount_setattr -> move_mount + if is_bind { + let recursive 
= m + .options() + .as_ref() + .map(|v| v.iter().any(|o| o == "rbind")) + .unwrap_or(false); + let mut open_tree_flags: libc::c_uint = (libc::OPEN_TREE_CLOEXEC as libc::c_uint) + | (libc::OPEN_TREE_CLONE as libc::c_uint) + | (libc::AT_EMPTY_PATH as libc::c_uint); + if recursive { + open_tree_flags |= libc::AT_RECURSIVE as libc::c_uint; + }; + + let src_str = src.to_str().ok_or(SyscallError::Nix(Errno::EINVAL))?; + let mount_fd_owned = + self.syscall + .open_tree(libc::AT_FDCWD, Some(src_str), open_tree_flags)?; + let mount_fd = mount_fd_owned.as_fd(); + + // mount_setattr + let attr_set_from_flags = self.mount_flag_to_attr(&mount_option_config.flags); + let mut mount_attr = mount_option_config + .rec_attr + .clone() + .unwrap_or(linux::MountAttr { + attr_set: 0, + attr_clr: 0, + propagation: 0, + userns_fd: 0, + }); + mount_attr.attr_set |= attr_set_from_flags; + + let mut at_flags = linux::AT_EMPTY_PATH; + if recursive { + at_flags |= linux::AT_RECURSIVE; } - } - if typ == Some("bind") - && mount_option_config.flags.intersects( - !(MsFlags::MS_REC - | MsFlags::MS_REMOUNT - | MsFlags::MS_BIND - | MsFlags::MS_PRIVATE - | MsFlags::MS_SHARED - | MsFlags::MS_SLAVE), - ) - { - self.syscall - .mount( - Some(dest), - dest, - None, - mount_option_config.flags | MsFlags::MS_REMOUNT, - None, - ) - .map_err(|err| { - tracing::error!("failed to remount {:?}: {}", dest, err); - err - })?; - } + self.apply_atime_from_msflags( + &mut mount_attr, + attr_set_from_flags, + mount_option_config.flags, + ); - if let Some(mount_attr) = &mount_option_config.rec_attr { - let open_dir = Dir::open(dest, OFlag::O_DIRECTORY, Mode::empty())?; - let dir_fd_pathbuf = PathBuf::from(format!("/proc/self/fd/{}", open_dir.as_raw_fd())); self.syscall.mount_setattr( - -1, - &dir_fd_pathbuf, - linux::AT_RECURSIVE, - mount_attr, + mount_fd, + Path::new(""), + at_flags, + &mount_attr, mem::size_of::<linux::MountAttr>(), )?; + + // move_mount + self.syscall.move_mount( + mount_fd, + None, + dest_fd, + 
None, + libc::MOVE_MOUNT_T_EMPTY_PATH | libc::MOVE_MOUNT_F_EMPTY_PATH, + )?; + } else { + let mount_fn = || -> std::result::Result<(), SyscallError> { + // fsopen + let fsfd_owned = self.syscall.fsopen(typ, 0)?; + let fsfd = fsfd_owned.as_fd(); + + // fsconfig + let src_str = src + .as_os_str() + .to_str() + .ok_or(SyscallError::Nix(Errno::EINVAL))?; + self.syscall.fsconfig( + fsfd, + linux::FSCONFIG_SET_STRING as u32, + Some("source"), + Some(src_str), + 0, + )?; + + for opt in d.split(',').filter(|s| !s.is_empty()) { + if let Some((k, v)) = opt.split_once('=') { + self.syscall.fsconfig( + fsfd, + linux::FSCONFIG_SET_STRING as u32, + Some(k), + Some(v), + 0, + )?; + } else { + self.syscall.fsconfig( + fsfd, + linux::FSCONFIG_SET_FLAG as u32, + Some(opt), + None, + 0, + )?; + }; + } + + self.syscall + .fsconfig(fsfd, linux::FSCONFIG_CMD_CREATE as u32, None, None, 0)?; + + // fsmount + let mount_fd_owned = self.syscall.fsmount(fsfd, 0, None)?; + let mount_fd = mount_fd_owned.as_fd(); + + // mount_setattr + let attr_set_from_flags = self.mount_flag_to_attr(&mount_option_config.flags); + let mut mount_attr = + mount_option_config + .rec_attr + .clone() + .unwrap_or(linux::MountAttr { + attr_set: 0, + attr_clr: 0, + propagation: 0, + userns_fd: 0, + }); + mount_attr.attr_set |= attr_set_from_flags; + + self.apply_atime_from_msflags( + &mut mount_attr, + attr_set_from_flags, + mount_option_config.flags, + ); + + self.syscall.mount_setattr( + mount_fd, + Path::new(""), + linux::AT_EMPTY_PATH | linux::AT_RECURSIVE, + &mount_attr, + mem::size_of::<linux::MountAttr>(), + )?; + + // move_mount + self.syscall.move_mount( + mount_fd, + None, + dest_fd, + None, + libc::MOVE_MOUNT_T_EMPTY_PATH | libc::MOVE_MOUNT_F_EMPTY_PATH, + )?; + Ok(()) + }; + + match mount_fn() { + Ok(()) => {} + Err(SyscallError::Nix(nix::Error::EINVAL)) => { + mount_fn()?; + } + Err(SyscallError::Nix(nix::Error::EBUSY)) => { + let delay = Duration::from_millis(MOUNT_RETRY_DELAY_MS); + let retry_policy = + 
|err: &SyscallError| matches!(err, SyscallError::Nix(Errno::EBUSY)); + retry(mount_fn, MAX_EBUSY_MOUNT_ATTEMPTS - 1, delay, retry_policy)?; + } + Err(e) => return Err(e.into()), + } } Ok(()) } + // https://man7.org/linux/man-pages/man2/mount_setattr.2.html + // To apply MsFlags via mount_setattr, we set the corresponding bits in attr_set + fn mount_flag_to_attr(&self, flags: &MsFlags) -> u64 { + const MAP_SET: &[(MsFlags, u64)] = &[ + (MsFlags::MS_RDONLY, linux::MOUNT_ATTR_RDONLY), + (MsFlags::MS_NOSUID, linux::MOUNT_ATTR_NOSUID), + (MsFlags::MS_NODEV, linux::MOUNT_ATTR_NODEV), + (MsFlags::MS_NOEXEC, linux::MOUNT_ATTR_NOEXEC), + (MsFlags::MS_NOATIME, linux::MOUNT_ATTR_NOATIME), + (MsFlags::MS_NODIRATIME, linux::MOUNT_ATTR_NODIRATIME), + (MsFlags::MS_RELATIME, linux::MOUNT_ATTR_RELATIME), + (MsFlags::MS_STRICTATIME, linux::MOUNT_ATTR_STRICTATIME), + ]; + + let mut set = 0; + for (ms, attr) in MAP_SET { + if flags.intersects(*ms) { + set |= *attr; + } + } + set + } + + // Apply atime-related configuration. 
+ // https://man7.org/linux/man-pages/man2/mount_setattr.2.html + // ref: MOUNT_ATTR__ATIME + fn apply_atime_from_msflags( + &self, + mount_attr: &mut linux::MountAttr, + attr_set_from_flags: u64, + msflags: MsFlags, + ) { + let atime_bits = + linux::MOUNT_ATTR_NOATIME | linux::MOUNT_ATTR_STRICTATIME | linux::MOUNT_ATTR_RELATIME; + + let noatime = msflags.contains(MsFlags::MS_NOATIME); + let strictatime = msflags.contains(MsFlags::MS_STRICTATIME); + let relatime = msflags.contains(MsFlags::MS_RELATIME); + + let atime = if strictatime { + linux::MOUNT_ATTR_STRICTATIME + } else if noatime { + linux::MOUNT_ATTR_NOATIME + } else if relatime { + linux::MOUNT_ATTR_RELATIME + } else { + 0 + }; + + let non_atime = attr_set_from_flags & !atime_bits; + + if atime != 0 { + mount_attr.attr_clr |= linux::MOUNT_ATTR__ATIME; + mount_attr.attr_set |= non_atime | atime; + } else { + mount_attr.attr_set |= non_atime; + } + } + /// check_proc_mount checks to ensure that the mount destination is not over the top of /proc. /// dest is required to be an abs path and have any symlinks resolved before calling this function. /// # Example (a valid case where `/proc` is mounted with `proc` type.) 
@@ -831,6 +987,7 @@ pub fn find_parent_mount( mod tests { #[cfg(feature = "v1")] use std::fs; + use std::fs::OpenOptions; use std::os::unix::fs::symlink; use anyhow::{Context, Ok, Result}; @@ -839,6 +996,7 @@ mod tests { use crate::syscall::test::{ArgName, MountArgs, TestHelperSyscall}; #[test] + #[ignore] // TODO: fix fd-based test fn test_mount_into_container() -> Result<()> { let tmp_dir = tempfile::tempdir()?; { @@ -916,7 +1074,7 @@ mod tests { }, // remount one MountArgs { - source: Some(tmp_dir.path().join("dev/null")), + source: None, target: tmp_dir.path().join("dev/null"), fstype: None, flags: MsFlags::MS_RDONLY | MsFlags::MS_REMOUNT, @@ -1069,6 +1227,7 @@ mod tests { #[test] #[cfg(feature = "v1")] + #[ignore] // TODO: fix fd-based test fn test_namespaced_subsystem_success() -> Result<()> { let tmp = tempfile::tempdir().unwrap(); let container_cgroup = Path::new("/container_cgroup"); @@ -1120,6 +1279,7 @@ mod tests { #[test] #[cfg(feature = "v1")] + #[ignore] // TODO: fix fd-based test fn test_emulated_subsystem_success() -> Result<()> { // arrange let tmp = tempfile::tempdir().unwrap(); @@ -1186,6 +1346,7 @@ mod tests { #[test] #[cfg(feature = "v1")] + #[ignore] // TODO: fix fd-based test fn test_mount_cgroup_v1() -> Result<()> { // arrange let tmp = tempfile::tempdir()?; @@ -1259,6 +1420,7 @@ mod tests { #[test] #[cfg(feature = "v2")] + #[ignore] // TODO: fix fd-based test fn test_mount_cgroup_v2() -> Result<()> { // arrange let tmp = tempfile::tempdir().unwrap();
crates/libcontainer/src/syscall/linux.rs+251 −36 modified@@ -1,43 +1,58 @@ //! Implements Command trait for Linux systems use std::any::Any; use std::ffi::{CStr, CString, OsStr}; -use std::os::fd::BorrowedFd; +use std::os::fd::{BorrowedFd, FromRawFd, RawFd}; use std::os::unix::ffi::OsStrExt; use std::os::unix::fs::symlink; -use std::os::unix::io::RawFd; +use std::os::unix::io::{AsRawFd, OwnedFd}; use std::path::Path; use std::str::FromStr; use std::sync::Arc; -use std::{fs, mem, ptr}; +use std::{mem, ptr}; use caps::{CapSet, CapsHashSet}; use libc::{c_char, setdomainname, uid_t}; +use nix::dir::Dir; use nix::fcntl; use nix::fcntl::{OFlag, open}; use nix::mount::{MntFlags, MsFlags, mount, umount2}; use nix::sched::{CloneFlags, unshare}; use nix::sys::stat::{Mode, SFlag, mknod}; use nix::unistd::{Gid, Uid, chown, chroot, close, fchdir, pivot_root, sethostname}; use oci_spec::runtime::PosixRlimit; +use pathrs::flags::OpenFlags; +use pathrs::procfs::{ProcfsBase, ProcfsHandle}; use super::{Result, Syscall, SyscallError}; +use crate::capabilities; use crate::config::PersonalityDomain; -use crate::{capabilities, utils}; // Flags used in mount_setattr(2). // see https://man7.org/linux/man-pages/man2/mount_setattr.2.html. pub const AT_RECURSIVE: u32 = 0x00008000; // Change the mount properties of the entire mount tree. +pub const AT_EMPTY_PATH: u32 = 0x00001000; #[allow(non_upper_case_globals)] pub const MOUNT_ATTR__ATIME: u64 = 0x00000070; // Setting on how atime should be updated. 
-const MOUNT_ATTR_RDONLY: u64 = 0x00000001; -const MOUNT_ATTR_NOSUID: u64 = 0x00000002; -const MOUNT_ATTR_NODEV: u64 = 0x00000004; -const MOUNT_ATTR_NOEXEC: u64 = 0x00000008; -const MOUNT_ATTR_RELATIME: u64 = 0x00000000; -const MOUNT_ATTR_NOATIME: u64 = 0x00000010; -const MOUNT_ATTR_STRICTATIME: u64 = 0x00000020; -const MOUNT_ATTR_NODIRATIME: u64 = 0x00000080; -const MOUNT_ATTR_NOSYMFOLLOW: u64 = 0x00200000; +pub const MOUNT_ATTR_RDONLY: u64 = 0x00000001; +pub const MOUNT_ATTR_NOSUID: u64 = 0x00000002; +pub const MOUNT_ATTR_NODEV: u64 = 0x00000004; +pub const MOUNT_ATTR_NOEXEC: u64 = 0x00000008; +pub const MOUNT_ATTR_RELATIME: u64 = 0x00000000; +pub const MOUNT_ATTR_NOATIME: u64 = 0x00000010; +pub const MOUNT_ATTR_STRICTATIME: u64 = 0x00000020; +pub const MOUNT_ATTR_NODIRATIME: u64 = 0x00000080; +pub const MOUNT_ATTR_NOSYMFOLLOW: u64 = 0x00200000; + +// The type of fsconfig() call made. +pub const FSCONFIG_SET_FLAG: u64 = 0; +pub const FSCONFIG_SET_STRING: u64 = 1; +pub const FSCONFIG_SET_BINARY: u64 = 2; +pub const FSCONFIG_SET_PATH: u64 = 3; +pub const FSCONFIG_SET_PATH_EMPTY: u64 = 4; +pub const FSCONFIG_SET_FD: u64 = 5; +pub const FSCONFIG_CMD_CREATE: u64 = 6; +pub const FSCONFIG_CMD_RECONFIGURE: u64 = 7; +pub const FSCONFIG_CMD_CREATE_EXCL: u64 = 8; /// Constants used by mount(2). pub enum MountOption { @@ -327,31 +342,24 @@ impl LinuxSyscall { // Get a list of open fds for the calling process. fn get_open_fds() -> Result<Vec<i32>> { - const PROCFS_FD_PATH: &str = "/proc/self/fd"; - utils::ensure_procfs(Path::new(PROCFS_FD_PATH)).map_err(|err| { - tracing::error!(?err, "failed to ensure /proc is mounted"); - match err { - utils::EnsureProcfsError::Nix(err) => SyscallError::Nix(err), - utils::EnsureProcfsError::IO(err) => SyscallError::IO(err), - } - })?; - - let fds: Vec<i32> = fs::read_dir(PROCFS_FD_PATH) - .map_err(|err| { - tracing::error!(?err, "failed to read /proc/self/fd"); - err - })? 
- .filter_map(|entry| match entry { - Ok(entry) => Some(entry.path()), - Err(_) => None, - }) - .filter_map(|path| path.file_name().map(|file_name| file_name.to_owned())) - .filter_map(|file_name| file_name.to_str().map(String::from)) - .filter_map(|file_name| -> Option<i32> { + let dir = ProcfsHandle::new()?.open( + ProcfsBase::ProcSelf, + Path::new("fd"), + OpenFlags::O_DIRECTORY | OpenFlags::O_CLOEXEC, + )?; + + let fds = Dir::from(dir)? + .into_iter() + .filter_map(|entry| entry.ok()) + .filter_map(|entry| { // Convert the file name from string into i32. Since we are looking // at /proc/<pid>/fd, anything that's not a number (i32) can be // ignored. We are only interested in opened fds. - file_name.parse().ok() + entry + .file_name() + .to_str() + .ok() + .and_then(|name| name.parse::<i32>().ok()) }) .collect(); @@ -599,6 +607,213 @@ impl Syscall for LinuxSyscall { Ok(()) } + fn mount_from_fd(&self, source_fd: &OwnedFd, target: &Path) -> Result<()> { + let parent = target.parent().ok_or_else(|| { + tracing::error!(?target, "target has no parent"); + SyscallError::Nix(nix::Error::EINVAL) + })?; + let name = target.file_name().ok_or_else(|| { + tracing::error!(?target, "target has no file name"); + SyscallError::Nix(nix::Error::EINVAL) + })?; + + let parent_fd = unsafe { + OwnedFd::from_raw_fd(open( + parent, + OFlag::O_PATH | OFlag::O_CLOEXEC | OFlag::O_DIRECTORY, + Mode::empty(), + )?) 
+ }; + + let open_tree_flags: libc::c_uint = (libc::OPEN_TREE_CLOEXEC as libc::c_uint) + | (libc::OPEN_TREE_CLONE as libc::c_uint) + | (libc::AT_EMPTY_PATH as libc::c_uint); + + const EMPTY_PATH: [libc::c_char; 1] = [0]; + + let mount_fd_raw = unsafe { + libc::syscall( + libc::SYS_open_tree, + source_fd.as_raw_fd(), + EMPTY_PATH.as_ptr(), + open_tree_flags, + ) + }; + + if mount_fd_raw < 0 { + let err = nix::errno::Errno::last(); + tracing::error!(?err, "open_tree from fd failed"); + return Err(SyscallError::Nix(err)); + } + let mount_fd = unsafe { OwnedFd::from_raw_fd(mount_fd_raw as RawFd) }; + + let name_cstr = CString::new(name.as_bytes()).map_err(|err| { + tracing::error!(?target, ?err, "failed to convert file name to cstring"); + SyscallError::Nix(nix::Error::EINVAL) + })?; + + let res = unsafe { + libc::syscall( + libc::SYS_move_mount, + mount_fd.as_raw_fd(), + EMPTY_PATH.as_ptr(), + parent_fd.as_raw_fd(), + name_cstr.as_ptr(), + libc::MOVE_MOUNT_F_EMPTY_PATH as libc::c_uint, + ) + }; + + if res < 0 { + let err = nix::errno::Errno::last(); + tracing::error!(?target, ?err, "move_mount failed"); + return Err(SyscallError::Nix(err)); + } + + Ok(()) + } + + fn move_mount( + &self, + from_dirfd: BorrowedFd<'_>, + from_path: Option<&str>, + to_dirfd: BorrowedFd<'_>, + to_path: Option<&str>, + flags: u32, + ) -> Result<()> { + const EMPTY_PATH: [libc::c_char; 1] = [0]; + + let from_cstr: Option<CString> = from_path + .and_then(|s| if s.is_empty() { None } else { Some(s) }) + .map(|s| CString::new(s).map_err(|_| nix::Error::EINVAL)) + .transpose()?; + let from_ptr = from_cstr + .as_ref() + .map_or(EMPTY_PATH.as_ptr(), |c| c.as_ptr()); + + let to_cstr: Option<CString> = to_path + .and_then(|s| if s.is_empty() { None } else { Some(s) }) + .map(|s| CString::new(s).map_err(|_| nix::Error::EINVAL)) + .transpose()?; + let to_ptr = to_cstr.as_ref().map_or(EMPTY_PATH.as_ptr(), |c| c.as_ptr()); + + let rc = unsafe { + libc::syscall( + libc::SYS_move_mount, + from_dirfd, + 
from_ptr, + to_dirfd, + to_ptr, + flags as libc::c_uint, + ) + }; + + match rc { + 0 => Ok(()), + -1 => Err(nix::Error::last().into()), + _ => Err(nix::Error::UnknownErrno.into()), + } + } + + fn fsopen(&self, fstype: Option<&str>, flags: u32) -> Result<OwnedFd> { + let t_cstr: Option<CString> = fstype + .map(|t| CString::new(t).map_err(|_| SyscallError::Nix(nix::errno::Errno::EINVAL))) + .transpose()?; + + let t_ptr = t_cstr.as_ref().map_or(std::ptr::null(), |c| c.as_ptr()); + + let fd = + unsafe { libc::syscall(libc::SYS_fsopen, t_ptr, flags as libc::c_uint) } as libc::c_int; + if fd < 0 { + return Err(SyscallError::Nix(nix::Error::last())); + } + Ok(unsafe { OwnedFd::from_raw_fd(fd) }) + } + + fn fsconfig( + &self, + fsfd: BorrowedFd<'_>, + cmd: u32, + key: Option<&str>, + val: Option<&str>, + aux: libc::c_int, + ) -> Result<()> { + let k_cstr: Option<CString> = key + .map(|k| CString::new(k).map_err(|_| SyscallError::Nix(nix::errno::Errno::EINVAL))) + .transpose()?; + let k_ptr = k_cstr.as_ref().map_or(std::ptr::null(), |k| k.as_ptr()); + + let v_cstr: Option<CString> = val + .map(|v| CString::new(v).map_err(|_| SyscallError::Nix(nix::errno::Errno::EINVAL))) + .transpose()?; + let v_ptr = v_cstr + .as_ref() + .map_or(std::ptr::null(), |v| v.as_ptr() as *const libc::c_void); + + let rc = unsafe { + libc::syscall( + libc::SYS_fsconfig, + fsfd.as_raw_fd() as libc::c_int, + cmd as libc::c_uint, + k_ptr, + v_ptr, + aux, + ) + }; + if rc == -1 { + return Err(SyscallError::Nix(nix::Error::last())); + } + Ok(()) + } + + fn fsmount( + &self, + fsfd: BorrowedFd<'_>, + flags: u32, + attr_flags: Option<u64>, + ) -> Result<OwnedFd> { + let attr = attr_flags.unwrap_or(0); + + let ret = unsafe { + libc::syscall( + libc::SYS_fsmount, + fsfd.as_raw_fd() as libc::c_int, + flags as libc::c_uint, + attr as libc::c_ulong, + ) + } as libc::c_int; + + if ret < 0 { + return Err(SyscallError::Nix(nix::Error::last())); + } + Ok(unsafe { std::os::fd::OwnedFd::from_raw_fd(ret) }) + } + + 
//dirfd is RawFd because we need to pass AT_FDCWD + fn open_tree(&self, dirfd: RawFd, path: Option<&str>, flags: u32) -> Result<OwnedFd> { + static EMPTY: [libc::c_char; 1] = [0]; + let path_cstr: Option<CString> = path + .map(|s| CString::new(s).map_err(|_| SyscallError::Nix(nix::errno::Errno::EINVAL))) + .transpose()?; + let c_path: *const c_char = match path_cstr.as_ref() { + Some(cs) => cs.as_ptr(), + None => EMPTY.as_ptr(), + }; + + let fd = unsafe { + libc::syscall( + libc::SYS_open_tree, + dirfd as libc::c_int, + c_path, + flags as libc::c_uint, + ) + } as libc::c_int; + + if fd < 0 { + return Err(SyscallError::Nix(nix::Error::last())); + } + Ok(unsafe { OwnedFd::from_raw_fd(fd) }) + } + fn symlink(&self, original: &Path, link: &Path) -> Result<()> { symlink(original, link)?; @@ -660,7 +875,7 @@ impl Syscall for LinuxSyscall { fn mount_setattr( &self, - dirfd: RawFd, + dirfd: BorrowedFd<'_>, pathname: &Path, flags: u32, mount_attr: &MountAttr,
crates/libcontainer/src/syscall/mod.rs+2 −0 modified@@ -18,6 +18,8 @@ pub enum SyscallError { IO(#[from] std::io::Error), #[error("failed to set capabilities: {0}")] SetCaps(#[from] caps::errors::CapsError), + #[error(transparent)] + Pathrs(#[from] pathrs::error::Error), } type Result<T> = std::result::Result<T, SyscallError>;
crates/libcontainer/src/syscall/syscall.rs+26 −1 modified@@ -3,6 +3,8 @@ //! implementation details use std::any::Any; use std::ffi::OsStr; +use std::os::fd::{BorrowedFd, RawFd}; +use std::os::unix::io::OwnedFd; use std::path::Path; use std::sync::Arc; @@ -41,14 +43,37 @@ pub trait Syscall { flags: MsFlags, data: Option<&str>, ) -> Result<()>; + // mount_from_fd mounts a filesystem specified by source_fd to target path. + // NOTE: mount_from_fd only supports BIND_MOUNT. + fn mount_from_fd(&self, source_fd: &OwnedFd, target: &Path) -> Result<()>; + fn move_mount( + &self, + from_dirfd: BorrowedFd<'_>, + from_path: Option<&str>, + to_dirfd: BorrowedFd<'_>, + to_path: Option<&str>, + flags: u32, + ) -> Result<()>; + fn fsopen(&self, fstype: Option<&str>, flags: u32) -> Result<OwnedFd>; + fn fsconfig( + &self, + fsfd: BorrowedFd<'_>, + cmd: u32, + key: Option<&str>, + val: Option<&str>, + aux: libc::c_int, + ) -> Result<()>; + fn fsmount(&self, fsfd: BorrowedFd<'_>, flags: u32, attr_flags: Option<u64>) + -> Result<OwnedFd>; + fn open_tree(&self, dirfd: RawFd, path: Option<&str>, flags: u32) -> Result<OwnedFd>; fn symlink(&self, original: &Path, link: &Path) -> Result<()>; fn mknod(&self, path: &Path, kind: SFlag, perm: Mode, dev: u64) -> Result<()>; fn chown(&self, path: &Path, owner: Option<Uid>, group: Option<Gid>) -> Result<()>; fn set_groups(&self, groups: &[Gid]) -> Result<()>; fn close_range(&self, preserve_fds: i32) -> Result<()>; fn mount_setattr( &self, - dirfd: i32, + dirfd: BorrowedFd<'_>, pathname: &Path, flags: u32, mount_attr: &MountAttr,
crates/libcontainer/src/syscall/test.rs+97 −2 modified@@ -2,6 +2,8 @@ use std::any::Any; use std::cell::{Ref, RefCell, RefMut}; use std::collections::HashMap; use std::ffi::{OsStr, OsString}; +use std::os::fd::{AsRawFd, BorrowedFd, RawFd}; +use std::os::unix::io::OwnedFd; use std::path::{Path, PathBuf}; use std::sync::Arc; @@ -24,6 +26,27 @@ pub struct MountArgs { pub data: Option<String>, } +#[derive(Clone, PartialEq, Eq, Debug)] +pub struct MountFromFdArgs { + pub fd: i32, + pub target: PathBuf, +} + +#[derive(Clone, PartialEq, Eq, Debug)] +pub struct MoveMountArgs { + pub from_dirfd: i32, + pub from_path: Option<OsString>, + pub to_dirfd: i32, + pub to_path: Option<OsString>, + pub flags: u32, +} + +#[derive(Clone, PartialEq, Eq, Debug)] +pub struct FsopenArgs { + pub fsname: Option<String>, + pub flags: u32, +} + #[derive(Clone, PartialEq, Eq, Debug)] pub struct MknodArgs { pub path: PathBuf, @@ -63,6 +86,7 @@ pub enum ArgName { Namespace, Unshare, Mount, + MountFromFd, Symlink, Mknod, Chown, @@ -72,6 +96,8 @@ pub enum ArgName { Capability, IoPriority, UMount2, + MoveMount, + Fsopen, } impl ArgName { @@ -80,6 +106,7 @@ impl ArgName { ArgName::Namespace, ArgName::Unshare, ArgName::Mount, + ArgName::MountFromFd, ArgName::Symlink, ArgName::Mknod, ArgName::Chown, @@ -88,6 +115,7 @@ impl ArgName { ArgName::Groups, ArgName::Capability, ArgName::IoPriority, + ArgName::MoveMount, ] .iter() .copied() @@ -224,18 +252,76 @@ impl Syscall for TestHelperSyscall { flags: MsFlags, data: Option<&str>, ) -> Result<()> { + // For tests: resolve /proc/self/fd/<n> to the real path before recording. 
+ let target_owned = if target.starts_with(Path::new("/proc/self/fd")) { + std::fs::read_link(target).unwrap_or_else(|_| target.to_owned()) + } else { + target.to_owned() + }; + self.mocks.act( ArgName::Mount, Box::new(MountArgs { source: source.map(|x| x.to_owned()), - target: target.to_owned(), + target: target_owned, fstype: fstype.map(|x| x.to_owned()), flags, data: data.map(|x| x.to_owned()), }), ) } + fn mount_from_fd(&self, source_fd: &OwnedFd, target: &Path) -> Result<()> { + self.mocks.act( + ArgName::MountFromFd, + Box::new(MountFromFdArgs { + fd: source_fd.as_raw_fd(), + target: target.to_owned(), + }), + ) + } + + fn move_mount( + &self, + from_dirfd: BorrowedFd<'_>, + from_path: Option<&str>, + to_dirfd: BorrowedFd<'_>, + to_path: Option<&str>, + flags: u32, + ) -> Result<()> { + let rec = MoveMountArgs { + from_dirfd: from_dirfd.as_raw_fd(), + from_path: from_path.map(OsString::from), + to_dirfd: to_dirfd.as_raw_fd(), + to_path: to_path.map(OsString::from), + flags, + }; + self.mocks.act(ArgName::MoveMount, Box::new(rec)) + } + + fn fsopen(&self, _: Option<&str>, _: u32) -> Result<OwnedFd> { + todo!() + } + + fn fsconfig( + &self, + _: BorrowedFd<'_>, + _: u32, + _: Option<&str>, + _: Option<&str>, + _: libc::c_int, + ) -> Result<()> { + todo!() + } + + fn fsmount(&self, _: BorrowedFd<'_>, _: u32, _: Option<u64>) -> Result<OwnedFd> { + todo!() + } + + fn open_tree(&self, _: RawFd, _: Option<&str>, _: u32) -> Result<OwnedFd> { + todo!() + } + fn symlink(&self, original: &Path, link: &Path) -> Result<()> { self.mocks.act( ArgName::Symlink, @@ -275,7 +361,7 @@ impl Syscall for TestHelperSyscall { fn mount_setattr( &self, - _: i32, + _: BorrowedFd<'_>, _: &Path, _: u32, _: &linux::MountAttr, @@ -368,6 +454,15 @@ impl TestHelperSyscall { .collect::<Vec<MountArgs>>() } + pub fn get_mount_from_fd_args(&self) -> Vec<MountFromFdArgs> { + self.mocks + .fetch(ArgName::MountFromFd) + .values + .iter() + .map(|x| 
x.downcast_ref::<MountFromFdArgs>().unwrap().clone()) + .collect::<Vec<MountFromFdArgs>>() + } + pub fn get_symlink_args(&self) -> Vec<(PathBuf, PathBuf)> { self.mocks .fetch(ArgName::Symlink)
crates/libcontainer/src/utils.rs+0 −28 modified@@ -8,7 +8,6 @@ use std::path::{Component, Path, PathBuf}; use std::time::Duration; use nix::sys::stat::Mode; -use nix::sys::statfs; use nix::unistd::{Uid, User}; use oci_spec::runtime::Spec; @@ -225,33 +224,6 @@ pub fn create_dir_all_with_mode<P: AsRef<Path>>( } } -#[derive(Debug, thiserror::Error)] -pub enum EnsureProcfsError { - #[error(transparent)] - Nix(#[from] nix::Error), - #[error(transparent)] - IO(#[from] std::io::Error), -} - -// Make sure a given path is on procfs. This is to avoid the security risk that -// /proc path is mounted over. Ref: CVE-2019-16884 -pub fn ensure_procfs(path: &Path) -> Result<(), EnsureProcfsError> { - let procfs_fd = fs::File::open(path).map_err(|err| { - tracing::error!(?err, ?path, "failed to open procfs file"); - err - })?; - let fstat_info = statfs::fstatfs(&procfs_fd).inspect_err(|err| { - tracing::error!(?err, ?path, "failed to fstatfs the procfs"); - })?; - - if fstat_info.filesystem_type() != statfs::PROC_SUPER_MAGIC { - tracing::error!(?path, "given path is not on the procfs"); - Err(nix::Error::EINVAL)?; - } - - Ok(()) -} - pub fn is_in_new_userns() -> Result<bool, std::io::Error> { let uid_map_path = "/proc/self/uid_map"; let content = std::fs::read_to_string(uid_map_path)?;
Vulnerability mechanics
Generated on May 9, 2026. Inputs: CWE entries + fix-commit diffs from this CVE's patches. Citations validated against bundle.
References (5)
- github.com/advisories/GHSA-4g74-7cff-xcv8 [ghsa, ADVISORY]
- nvd.nist.gov/vuln/detail/CVE-2025-62161 [ghsa, ADVISORY]
- github.com/opencontainers/runc/security/advisories/GHSA-9493-h29p-rfm2 [ghsa, WEB]
- github.com/youki-dev/youki/commit/5886c91073b9be748bd8d5aed49c4a820548030a [ghsa, x_refsource_MISC, WEB]
- github.com/youki-dev/youki/security/advisories/GHSA-4g74-7cff-xcv8 [ghsa, x_refsource_CONFIRM, x_refsource_MISC, WEB]
News mentions (0)
No linked articles in our index yet.