Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions Cargo.lock
Original file line number Diff line number Diff line change
Expand Up @@ -8634,6 +8634,7 @@ dependencies = [
"inspect_counters",
"libc",
"mesh",
"nix 0.30.1",
"pal_async",
"pal_event",
"parking_lot",
Expand Down Expand Up @@ -8714,6 +8715,7 @@ dependencies = [
"libc",
"nix 0.30.1",
"pal_async",
"tracelimit",
"tracing",
"vfio-bindings",
]
Expand Down
2 changes: 2 additions & 0 deletions vm/devices/user_driver/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ vfio = [
"dep:trycopy",
"dep:inspect_counters",
"dep:mesh",
"dep:nix",
"dep:pci_core",
"dep:tracing",
"dep:uevent",
Expand All @@ -39,6 +40,7 @@ futures-concurrency.workspace = true
inspect_counters = { workspace = true, optional = true }
libc.workspace = true
mesh = { workspace = true, optional = true }
nix = { workspace = true, optional = true }
pal_event.workspace = true
pci_core = { workspace = true, optional = true }
tracing = { workspace = true, optional = true }
Expand Down
49 changes: 43 additions & 6 deletions vm/devices/user_driver/src/vfio.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ use futures::FutureExt;
use futures_concurrency::future::Race;
use inspect::Inspect;
use inspect_counters::SharedCounter;
use nix::errno::Errno;
use pal_async::task::Spawn;
use pal_async::task::Task;
use pal_async::wait::PolledWait;
Expand Down Expand Up @@ -123,22 +124,58 @@ impl VfioDevice {
tracing::info!(pci_id, keepalive, "device arrived");
vfio_sys::print_relevant_params();

let driver = driver_source.simple();
let retry = vfio_sys::VfioRetry::new(&driver);

let is_not_found = |e: &anyhow::Error| {
e.chain().any(|cause| {
cause
.downcast_ref::<std::io::Error>()
.is_some_and(|io_err| io_err.kind() == std::io::ErrorKind::NotFound)
})
};
let is_enodev = |e: &anyhow::Error| {
e.chain().any(|cause| {
cause
.downcast_ref::<Errno>()
.is_some_and(|e| *e == Errno::ENODEV)
})
};

let container = vfio_sys::Container::new()?;
let group_id = vfio_sys::Group::find_group_for_device(&path)?;
let group = vfio_sys::Group::open_noiommu(group_id)?;
let group_id = retry
.retry(
|| vfio_sys::Group::find_group_for_device(&path),
&is_not_found,
"find_group_for_device",
)
.await?;
let group = retry
.retry(
|| vfio_sys::Group::open_noiommu(group_id),
&is_not_found,
"open_noiommu",
)
.await?;
group.set_container(&container)?;
if !group.status()?.viable() {
anyhow::bail!("group is not viable");
}

let driver = driver_source.simple();
container.set_iommu(IommuType::NoIommu)?;
if keepalive {
// Prevent physical hardware interaction when restoring.
group.set_keep_alive(pci_id, &driver).await?;
retry
.retry(
|| group.set_keep_alive(pci_id),
&is_enodev,
"set_keep_alive",
)
.await?;
}
tracing::debug!(pci_id, "about to open device");
let device = group.open_device(pci_id, &driver).await?;
let device = retry
.retry(|| group.open_device(pci_id), &is_enodev, "open_device")
.await?;
let msix_info = device.irq_info(vfio_bindings::bindings::vfio::VFIO_PCI_MSIX_IRQ_INDEX)?;
if msix_info.flags.noresize() {
anyhow::bail!("unsupported: kernel does not support dynamic msix allocation");
Expand Down
1 change: 1 addition & 0 deletions vm/devices/user_driver/vfio_sys/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ anyhow.workspace = true
bitfield-struct.workspace = true
libc.workspace = true
nix = { workspace = true, features = ["ioctl"] }
tracelimit.workspace = true
tracing.workspace = true
vfio-bindings.workspace = true

Expand Down
108 changes: 59 additions & 49 deletions vm/devices/user_driver/vfio_sys/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ use std::io::BufRead;
use std::io::BufReader;
use std::os::unix::prelude::*;
use std::path::Path;
use std::time::Duration;
use vfio_bindings::bindings::vfio::VFIO_IRQ_SET_ACTION_TRIGGER;
use vfio_bindings::bindings::vfio::VFIO_IRQ_SET_DATA_EVENTFD;
use vfio_bindings::bindings::vfio::VFIO_IRQ_SET_DATA_NONE;
Expand Down Expand Up @@ -151,30 +152,12 @@ impl Group {
Ok(group)
}

pub async fn open_device(
&self,
device_id: &str,
driver: &(impl ?Sized + Driver),
) -> anyhow::Result<Device> {
pub fn open_device(&self, device_id: &str) -> anyhow::Result<Device> {
let id = CString::new(device_id)?;
// SAFETY: The file descriptor is valid and the string is null-terminated.
let file = unsafe {
let fd = ioctl::vfio_group_get_device_fd(self.file.as_raw_fd(), id.as_ptr());
// There is a small race window in the 6.1 kernel between when the
// vfio device is visible to userspace, and when it is added to its
// internal list. Try one more time on ENODEV failure after a brief
// sleep.
let fd = match fd {
Err(nix::errno::Errno::ENODEV) => {
tracing::warn!(pci_id = device_id, "Retrying vfio open_device after delay");
PolledTimer::new(driver)
.sleep(std::time::Duration::from_millis(250))
.await;
ioctl::vfio_group_get_device_fd(self.file.as_raw_fd(), id.as_ptr())
}
_ => fd,
};
let fd = fd.with_context(|| format!("failed to get device fd for {device_id}"))?;
let fd = ioctl::vfio_group_get_device_fd(self.file.as_raw_fd(), id.as_ptr())
.with_context(|| format!("failed to get device fd for {device_id}"))?;
File::from_raw_fd(fd)
};

Expand Down Expand Up @@ -206,39 +189,66 @@ impl Group {
/// Skip VFIO device reset when kernel is reloaded during servicing.
/// This feature is non-upstream version of our kernel and will be
/// eventually replaced with iommufd.
pub async fn set_keep_alive(
&self,
device_id: &str,
driver: &(impl ?Sized + Driver),
) -> anyhow::Result<()> {
pub fn set_keep_alive(&self, device_id: &str) -> anyhow::Result<()> {
let id = CString::new(device_id)?;
// SAFETY: The file descriptor is valid and a correctly constructed struct is being passed.
unsafe {
let id = CString::new(device_id)?;
let r = ioctl::vfio_group_set_keep_alive(self.file.as_raw_fd(), id.as_ptr());
match r {
Ok(_) => Ok(()),
Err(nix::errno::Errno::ENODEV) => {
// There is a small race window in the kernel between when the
// vfio device is visible to userspace, and when it is added to its
// internal list. Try one more time on ENODEV failure after a brief
// sleep.
tracing::warn!(
pci_id = device_id,
"vfio keepalive got ENODEV, retrying after delay"
ioctl::vfio_group_set_keep_alive(self.file.as_raw_fd(), id.as_ptr())
.with_context(|| format!("failed to set keep-alive for {device_id}"))?;
}
Ok(())
}
}

/// Retry wrapper for VFIO operations that may transiently fail
pub struct VfioRetry<'a> {
driver: &'a dyn Driver,
sleep_duration: Duration,
max_retries: u32,
}

impl<'a> VfioRetry<'a> {
const SLEEP_DURATION: Duration = Duration::from_millis(250);
const MAX_RETRIES: u32 = 1;

pub fn new(driver: &'a dyn Driver) -> Self {
Self {
driver,
sleep_duration: Self::SLEEP_DURATION,
max_retries: Self::MAX_RETRIES,
}
}

/// Retry `op` when `should_retry` returns true for the error, up to
/// `max_retries` times with a sleep between attempts.
pub async fn retry<T, E>(
&self,
mut op: impl FnMut() -> Result<T, E>,
should_retry: impl Fn(&E) -> bool,
context: &str,
) -> Result<T, E>
where
E: std::fmt::Display,
{
let mut attempt = 0;
loop {
match op() {
Ok(val) => return Ok(val),
Err(err) => {
if attempt >= self.max_retries || !should_retry(&err) {
return Err(err);
}
attempt += 1;
tracelimit::warn_ratelimited!(
operation = context,
attempt,
"retrying after transient error: {err}"
);
Comment on lines +242 to 246
Copy link

Copilot AI Apr 2, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

VfioRetry::retry logs a ratelimited warning on retries, but the log fields don’t include any identifier for the VFIO device/group being operated on. The earlier per-callsite retry logic logged pci_id, and losing that context will make these warnings hard to action when multiple VFIO devices are present. Consider extending retry to accept an optional identifier (e.g., pci_id/path) and include it as a structured field in the warning, or ensure callers enter a span that carries pci_id before invoking retry so the emitted warnings are attributable.

Copilot uses AI. Check for mistakes.
PolledTimer::new(driver)
.sleep(std::time::Duration::from_millis(250))
.await;
ioctl::vfio_group_set_keep_alive(self.file.as_raw_fd(), id.as_ptr())
.with_context(|| {
format!("failed to set keep-alive after delay for {device_id}")
})
.map(|_| ())
}
Err(_) => r
.with_context(|| format!("failed to set keep-alive for {device_id}"))
.map(|_| ()),
}
PolledTimer::new(self.driver)
.sleep(self.sleep_duration)
.await;
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ use petri::vtl2_settings::ControllerType;
use petri::vtl2_settings::Vtl2LunBuilder;
use petri::vtl2_settings::Vtl2StorageBackingDeviceBuilder;
use petri::vtl2_settings::Vtl2StorageControllerBuilder;
use petri_artifacts_vmm_test::artifacts::openhcl_igvm::LATEST_LINUX_DIRECT_TEST_X64;
use vmm_test_macros::openvmm_test;
use zerocopy::FromBytes;

Expand Down Expand Up @@ -79,8 +80,8 @@ async fn mana_nic_shared_pool(
}

/// Test an OpenHCL Linux direct VM with many NVMe devices assigned to VTL2 and vmbus relay.
///#[openvmm_test(openhcl_linux_direct_x64 [LATEST_LINUX_DIRECT_TEST_X64])]
async fn _many_nvme_devices_servicing_very_heavy(
#[openvmm_test(openhcl_linux_direct_x64 [LATEST_LINUX_DIRECT_TEST_X64])]
async fn many_nvme_devices_servicing_very_heavy(
config: PetriVmBuilder<OpenVmmPetriBackend>,
(igvm_file,): (ResolvedArtifact<impl petri_artifacts_common::tags::IsOpenhclIgvm>,),
) -> Result<(), anyhow::Error> {
Expand Down
Loading