diff --git a/.buildkite/pipeline_pr.py b/.buildkite/pipeline_pr.py index 94a92772aa7..96ddb924214 100755 --- a/.buildkite/pipeline_pr.py +++ b/.buildkite/pipeline_pr.py @@ -99,6 +99,15 @@ ), ) + pipeline.build_group( + "vfio", + pipeline.devtool_test( + devtool_opts="--vfio-nvme-device /dev/sdf --first-vfio-nvme-device -c 1-10", + pytest_opts="-m vfio integration_tests/functional/", + ), + **DEFAULTS_PERF, + ) + pipeline.build_group( "performance", pipeline.devtool_test( diff --git a/CHANGELOG.md b/CHANGELOG.md index 1b2a76e523d..b66b7e8a76c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,10 @@ and this project adheres to ### Added +- [#5870](https://github.com/firecracker-microvm/firecracker/pull/5870): Add + basic VFIO support allowing for PCIe device passthrough into VM. See + [documentation](docs/vfio.md) for instructions and current limitations. + ### Changed ### Deprecated diff --git a/Cargo.lock b/Cargo.lock index cb1194958c4..0ccaae51832 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1575,6 +1575,29 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "vfio-bindings" +version = "0.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "188dac3057a0cbc94470085204c84b82ff7ec5dac629a514323cd133d1f9abe0" + +[[package]] +name = "vfio-ioctls" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d4b1d98dff7f0d219278e406323e7eda4d426447bd203c7828189baf0d8c07b7" +dependencies = [ + "byteorder", + "kvm-bindings", + "kvm-ioctls", + "libc", + "log", + "thiserror 2.0.18", + "vfio-bindings", + "vm-memory", + "vmm-sys-util", +] + [[package]] name = "vhost" version = "0.15.0" @@ -1660,6 +1683,8 @@ dependencies = [ "userfaultfd", "utils", "uuid", + "vfio-bindings", + "vfio-ioctls", "vhost", "vm-allocator", "vm-fdt", diff --git a/docs/device-hotplug.md b/docs/device-hotplug.md index d15fb956f98..b73ec553ea9 100644 --- a/docs/device-hotplug.md +++ b/docs/device-hotplug.md @@ -12,6 +12,7 @@ 
running microVM without requiring a reboot. Supported device types are: - `virtio-block` - `virtio-pmem` - `virtio-net` +- `vfio` ## Prerequisites diff --git a/docs/vfio.md b/docs/vfio.md new file mode 100644 index 00000000000..b5b596093ac --- /dev/null +++ b/docs/vfio.md @@ -0,0 +1,148 @@ +# VFIO Device Passthrough + +## What is VFIO + +VFIO (Virtual Function I/O) is a Linux kernel framework that allows userspace +programs to directly access physical devices in a secure, IOMMU-protected +environment. Firecracker uses VFIO to pass through PCI devices from the host +into the guest, giving the guest near-native performance access to physical +hardware such as GPUs, network adapters, and NVMe drives. + +## Prerequisites + +VFIO passthrough requires: + +- Firecracker must be started with the `--enable-pci` flag since VFIO devices + are PCI devices. +- An IOMMU (Intel VT-d, AMD-Vi, or ARM SMMU) must be enabled on the host. +- The host must have the `vfio` and `vfio-pci` kernel modules loaded. +- The target PCI device must be unbound from its native kernel driver and bound + to the `vfio-pci` driver. +- All devices in the same IOMMU group must be bound to `vfio-pci`. + +## How to bind device to `vfio-pci` driver + +To bind a device (e.g. 
`0000:11:22.3`) to `vfio-pci`: + +```bash +# Unbind from current driver +echo "0000:11:22.3" > /sys/bus/pci/devices/0000:11:22.3/driver/unbind +# Bind to vfio-pci +echo "vfio-pci" > /sys/bus/pci/devices/0000:11:22.3/driver_override +echo "0000:11:22.3" > /sys/bus/pci/drivers/vfio-pci/bind +``` + +## Configuration + +Firecracker exposes the following configuration options for VFIO devices: + +- `id` - unique identifier for the device +- `sbdf` - host PCI device identifier, accepted in many forms: + - full sysfs path: `/sys/bus/pci/devices/0000:01:02.03` + - full SBDF: `0000:01:02.03` + - short BDF: `01:02.03` + - hex integer: `0x010203` + - decimal integer: `66051` + +### Config file + +```json +"vfio": [ + { + "id": "device0", + "sbdf": "/sys/bus/pci/devices/0000:11:22.3" + } +] +``` + +### API + +#### Add device + +The same `PUT /vfio/{id}` endpoint works both before and after boot: + +```console +curl --unix-socket $socket_location -i \ + -X PUT 'http://localhost/vfio/device0' \ + -H 'Accept: application/json' \ + -H 'Content-Type: application/json' \ + -d "{ + \"id\": \"device0\", + \"sbdf\": \"/sys/bus/pci/devices/0000:01:02.03\" + }" +``` + +#### Remove device + +A VFIO device can be removed at runtime via `DELETE /vfio/{id}`: + +```console +curl --unix-socket $socket_location -i \ + -X DELETE 'http://localhost/vfio/device0' +``` + +Hot-unplug is only valid after the microVM has booted. The device is detached +from the guest PCI bus and all associated resources (DMA mappings, interrupts, +BAR memory) are released. + +## Booting from a VFIO device + +A passthrough block device (e.g. an NVMe SSD bound to `vfio-pci`) can serve as +the guest's root filesystem instead of a virtio-block drive. Firecracker does +not auto-detect this; you must point the guest kernel at the right device via +the boot arguments. + +1. Configure the VFIO device as usual (see [Configuration](#configuration)) and + make sure no `is_root_device: true` virtio drive is configured. + +1. 
In the boot source, set `boot_args` so that `root=` names the block device + the guest kernel will see for the passthrough device. For an NVMe namespace + that will appear as `/dev/nvme0n1`: + + ```json + "boot-source": { + "kernel_image_path": "/path/to/vmlinux", + "boot_args": "console=ttyS0 reboot=k panic=1 root=/dev/nvme0n1 ro" + } + ``` + + Use `root=/dev/nvme0n1p1` (or similar) if the rootfs lives on a partition, + and adjust the device name for non-NVMe devices (`/dev/sda`, etc.). + +Notes: + +- The guest kernel must include the driver for the passthrough device (e.g. + `CONFIG_BLK_DEV_NVME=y`) and any filesystem it uses, either built-in or + available as an initrd-loadable module. +- If the device is hot-plugged after boot it cannot be the root device — the + kernel has already mounted root by then. Use cold-boot configuration for the + root device. + +## Security + +- **IOMMU is mandatory.** Without an IOMMU, a passthrough device could DMA to + arbitrary host memory. +- **IOMMU groups.** All devices in the same IOMMU group must be assigned to the + same VM. Splitting a group across VMs would break DMA isolation. Linux already + enforces this behaviour. + +## Snapshot support + +VFIO devices do not support snapshots. Device state is opaque to the VMM and +cannot be serialized or restored. VMs with VFIO devices attached cannot be +snapshotted. + +## Limitations + +| Limitation | Details | +| :-------------------------- | :----------------------------------------------------------------------------- | +| No memory over-subscription | All the memory of the guest will be paged in and pinned by the kernel | +| No snapshots | Device state is opaque and cannot be saved/restored. | +| No BAR relocation | BAR addresses are assigned at init and cannot be moved. | +| No BAR resizing | Resizable BAR capability is masked from the guest. | +| No IO BARs | IO-type BARs are skipped. Devices relying solely on IO BARs will not work. 
| +| No ROM BAR | Expansion ROM BAR is not handled. | +| No MSI (non-X) | Only MSI-X interrupts are supported. Devices without MSI-X fail to initialize. | +| No INTx | Legacy pin-based interrupts are not supported. | +| No SR-IOV | SR-IOV capability is masked. Virtual Functions cannot be created. | +| No virtio-iommu | The guest has no IOMMU. DMA isolation relies entirely on the host IOMMU. | diff --git a/resources/guest_configs/nvme.config b/resources/guest_configs/nvme.config new file mode 100644 index 00000000000..740a25e4a33 --- /dev/null +++ b/resources/guest_configs/nvme.config @@ -0,0 +1,2 @@ +CONFIG_NVME_CORE=y +CONFIG_BLK_DEV_NVME=y diff --git a/resources/rebuild.sh b/resources/rebuild.sh index eb75f944a6a..955a8cf0054 100755 --- a/resources/rebuild.sh +++ b/resources/rebuild.sh @@ -225,15 +225,16 @@ function build_al_kernels { clone_amazon_linux_repo CI_CONFIG="$PWD/guest_configs/ci.config" + NVME_CONFIG="$PWD/guest_configs/nvme.config" if [[ "$KERNEL_VERSION" == @(all|5.10) ]]; then - build_al_kernel $PWD/guest_configs/microvm-kernel-ci-$ARCH-5.10.config "$CI_CONFIG" + build_al_kernel $PWD/guest_configs/microvm-kernel-ci-$ARCH-5.10.config "$CI_CONFIG" "$NVME_CONFIG" fi if [[ $ARCH == "x86_64" && "$KERNEL_VERSION" == @(all|5.10-no-acpi) ]]; then - build_al_kernel $PWD/guest_configs/microvm-kernel-ci-$ARCH-5.10-no-acpi.config "$CI_CONFIG" + build_al_kernel $PWD/guest_configs/microvm-kernel-ci-$ARCH-5.10-no-acpi.config "$CI_CONFIG" "$NVME_CONFIG" fi if [[ "$KERNEL_VERSION" == @(all|6.1) ]]; then - build_al_kernel $PWD/guest_configs/microvm-kernel-ci-$ARCH-6.1.config "$CI_CONFIG" + build_al_kernel $PWD/guest_configs/microvm-kernel-ci-$ARCH-6.1.config "$CI_CONFIG" "$NVME_CONFIG" fi # Build debug kernels @@ -242,11 +243,11 @@ function build_al_kernels { OUTPUT_DIR=$OUTPUT_DIR/debug mkdir -pv $OUTPUT_DIR if [[ "$KERNEL_VERSION" == @(all|5.10) ]]; then - build_al_kernel "$PWD/guest_configs/microvm-kernel-ci-$ARCH-5.10.config" "$CI_CONFIG" "$FTRACE_CONFIG" 
"$DEBUG_CONFIG" + build_al_kernel "$PWD/guest_configs/microvm-kernel-ci-$ARCH-5.10.config" "$CI_CONFIG" "$FTRACE_CONFIG" "$NVME_CONFIG" "$DEBUG_CONFIG" vmlinux_split_debuginfo $OUTPUT_DIR/vmlinux-5.10.* fi if [[ "$KERNEL_VERSION" == @(all|6.1) ]]; then - build_al_kernel "$PWD/guest_configs/microvm-kernel-ci-$ARCH-6.1.config" "$CI_CONFIG" "$FTRACE_CONFIG" "$DEBUG_CONFIG" + build_al_kernel "$PWD/guest_configs/microvm-kernel-ci-$ARCH-6.1.config" "$CI_CONFIG" "$FTRACE_CONFIG" "$NVME_CONFIG" "$DEBUG_CONFIG" vmlinux_split_debuginfo $OUTPUT_DIR/vmlinux-6.1.* fi } diff --git a/resources/seccomp/aarch64-unknown-linux-musl.json b/resources/seccomp/aarch64-unknown-linux-musl.json index 1e0047266e6..65d90f3e4ba 100644 --- a/resources/seccomp/aarch64-unknown-linux-musl.json +++ b/resources/seccomp/aarch64-unknown-linux-musl.json @@ -711,7 +711,219 @@ "val": 0, "comment": "Ensure PROT_EXEC is not set" } + ], + "comment": "Used by memory hotplug to protect access to underlying host memory" + }, + { + "syscall": "dup", + "comment": "Used by vfio-ioctls to duplicate file descriptors for VFIO device setup" + }, + { + "syscall": "ioctl", + "comment": "Used by vfio-ioctls during VfioDevice drop to detach group from container", + "args": [ + { + "index": 1, + "type": "dword", + "op": "eq", + "val": 15209, + "comment": "VFIO_GROUP_UNSET_CONTAINER" + } + ] + }, + { + "syscall": "ioctl", + "comment": "Used by vfio-ioctls during VFIO device cleanup to unmap DMA regions", + "args": [ + { + "index": 1, + "type": "dword", + "op": "eq", + "val": 15218, + "comment": "VFIO_IOMMU_UNMAP_DMA" + } ] + }, + { + "syscall": "pread64", + "comment": "Used by VFIO passthrough for reading device PCI config and BAR regions" + }, + { + "syscall": "pwrite64", + "comment": "Used by VFIO passthrough for writing device PCI config and BAR regions" + }, + { + "syscall": "ioctl", + "comment": "Used by VFIO hotplug to create the KVM VFIO device", + "args": [ + { + "index": 1, + "type": "dword", + "op": "eq", + 
"val": 3222056672, + "comment": "KVM_CREATE_DEVICE" + } + ] + }, + { + "syscall": "ioctl", + "comment": "Used by VFIO hotplug to query the VFIO API version", + "args": [ + { + "index": 1, + "type": "dword", + "op": "eq", + "val": 15204, + "comment": "VFIO_GET_API_VERSION" + } + ] + }, + { + "syscall": "ioctl", + "comment": "Used by VFIO hotplug to check VFIO IOMMU type extension", + "args": [ + { + "index": 1, + "type": "dword", + "op": "eq", + "val": 15205, + "comment": "VFIO_CHECK_EXTENSION" + } + ] + }, + { + "syscall": "ioctl", + "comment": "Used by VFIO hotplug to set the IOMMU type on the container", + "args": [ + { + "index": 1, + "type": "dword", + "op": "eq", + "val": 15206, + "comment": "VFIO_SET_IOMMU" + } + ] + }, + { + "syscall": "ioctl", + "comment": "Used by VFIO hotplug to get the VFIO group status", + "args": [ + { + "index": 1, + "type": "dword", + "op": "eq", + "val": 15207, + "comment": "VFIO_GROUP_GET_STATUS" + } + ] + }, + { + "syscall": "ioctl", + "comment": "Used by VFIO hotplug to bind the VFIO group to the container", + "args": [ + { + "index": 1, + "type": "dword", + "op": "eq", + "val": 15208, + "comment": "VFIO_GROUP_SET_CONTAINER" + } + ] + }, + { + "syscall": "ioctl", + "comment": "Used by VFIO hotplug to obtain a fd for a device in the group", + "args": [ + { + "index": 1, + "type": "dword", + "op": "eq", + "val": 15210, + "comment": "VFIO_GROUP_GET_DEVICE_FD" + } + ] + }, + { + "syscall": "ioctl", + "comment": "Used by VFIO hotplug to get general device info", + "args": [ + { + "index": 1, + "type": "dword", + "op": "eq", + "val": 15211, + "comment": "VFIO_DEVICE_GET_INFO" + } + ] + }, + { + "syscall": "ioctl", + "comment": "Used by VFIO hotplug to query device region (BAR) info", + "args": [ + { + "index": 1, + "type": "dword", + "op": "eq", + "val": 15212, + "comment": "VFIO_DEVICE_GET_REGION_INFO" + } + ] + }, + { + "syscall": "ioctl", + "comment": "Used by VFIO hotplug to query device IRQ info (MSI-X)", + "args": [ + { + 
"index": 1, + "type": "dword", + "op": "eq", + "val": 15213, + "comment": "VFIO_DEVICE_GET_IRQ_INFO" + } + ] + }, + { + "syscall": "ioctl", + "comment": "Used by VFIO hotplug to set up MSI-X interrupts on the device", + "args": [ + { + "index": 1, + "type": "dword", + "op": "eq", + "val": 15214, + "comment": "VFIO_DEVICE_SET_IRQS" + } + ] + }, + { + "syscall": "ioctl", + "comment": "Used by VFIO hotplug to reset the device after attach", + "args": [ + { + "index": 1, + "type": "dword", + "op": "eq", + "val": 15215, + "comment": "VFIO_DEVICE_RESET" + } + ] + }, + { + "syscall": "ioctl", + "comment": "Used by VFIO hotplug to map guest memory and BAR regions for DMA", + "args": [ + { + "index": 1, + "type": "dword", + "op": "eq", + "val": 15217, + "comment": "VFIO_IOMMU_MAP_DMA" + } + ] + }, + { + "syscall": "readlinkat", + "comment": "Used by VFIO hotplug to resolve the iommu_group symlink under the device sysfs path" } ] }, @@ -1335,6 +1547,14 @@ "comment": "KVM_IRQFD" } ] + }, + { + "syscall": "pread64", + "comment": "Used by VFIO passthrough to read device PCI config and BAR regions on vCPU MMIO/PIO exits" + }, + { + "syscall": "pwrite64", + "comment": "Used by VFIO passthrough to write device PCI config and BAR regions on vCPU MMIO/PIO exits" } ] } diff --git a/resources/seccomp/x86_64-unknown-linux-musl.json b/resources/seccomp/x86_64-unknown-linux-musl.json index ea4d49e98b5..3a2300fff7a 100644 --- a/resources/seccomp/x86_64-unknown-linux-musl.json +++ b/resources/seccomp/x86_64-unknown-linux-musl.json @@ -723,7 +723,232 @@ "val": 0, "comment": "Ensure PROT_EXEC is not set" } + ], + "comment": "Used by memory hotplug to protect access to underlying host memory" + }, + { + "syscall": "dup", + "comment": "Used by vfio-ioctls to duplicate file descriptors for VFIO device setup" + }, + { + "syscall": "ioctl", + "comment": "Used by vfio-ioctls during VfioDevice drop to detach group from container", + "args": [ + { + "index": 1, + "type": "dword", + "op": "eq", + 
"val": 15209, + "comment": "VFIO_GROUP_UNSET_CONTAINER" + } + ] + }, + { + "syscall": "ioctl", + "comment": "Used by vfio-ioctls during VFIO device cleanup to unmap DMA regions", + "args": [ + { + "index": 1, + "type": "dword", + "op": "eq", + "val": 15218, + "comment": "VFIO_IOMMU_UNMAP_DMA" + } + ] + }, + { + "syscall": "pread64", + "comment": "Used by VFIO passthrough for reading device PCI config and BAR regions" + }, + { + "syscall": "pwrite64", + "comment": "Used by VFIO passthrough for writing device PCI config and BAR regions" + }, + { + "syscall": "ioctl", + "comment": "Used by VFIO hotplug to create the KVM VFIO device", + "args": [ + { + "index": 1, + "type": "dword", + "op": "eq", + "val": 3222056672, + "comment": "KVM_CREATE_DEVICE" + } + ] + }, + { + "syscall": "ioctl", + "comment": "Used by VFIO hotplug to attach the VFIO group to the KVM VFIO device", + "args": [ + { + "index": 1, + "type": "dword", + "op": "eq", + "val": 1075359457, + "comment": "KVM_SET_DEVICE_ATTR" + } + ] + }, + { + "syscall": "ioctl", + "comment": "Used by VFIO hotplug to query the VFIO API version", + "args": [ + { + "index": 1, + "type": "dword", + "op": "eq", + "val": 15204, + "comment": "VFIO_GET_API_VERSION" + } + ] + }, + { + "syscall": "ioctl", + "comment": "Used by VFIO hotplug to check VFIO IOMMU type extension", + "args": [ + { + "index": 1, + "type": "dword", + "op": "eq", + "val": 15205, + "comment": "VFIO_CHECK_EXTENSION" + } ] + }, + { + "syscall": "ioctl", + "comment": "Used by VFIO hotplug to set the IOMMU type on the container", + "args": [ + { + "index": 1, + "type": "dword", + "op": "eq", + "val": 15206, + "comment": "VFIO_SET_IOMMU" + } + ] + }, + { + "syscall": "ioctl", + "comment": "Used by VFIO hotplug to get the VFIO group status", + "args": [ + { + "index": 1, + "type": "dword", + "op": "eq", + "val": 15207, + "comment": "VFIO_GROUP_GET_STATUS" + } + ] + }, + { + "syscall": "ioctl", + "comment": "Used by VFIO hotplug to bind the VFIO group to the 
container", + "args": [ + { + "index": 1, + "type": "dword", + "op": "eq", + "val": 15208, + "comment": "VFIO_GROUP_SET_CONTAINER" + } + ] + }, + { + "syscall": "ioctl", + "comment": "Used by VFIO hotplug to obtain a fd for a device in the group", + "args": [ + { + "index": 1, + "type": "dword", + "op": "eq", + "val": 15210, + "comment": "VFIO_GROUP_GET_DEVICE_FD" + } + ] + }, + { + "syscall": "ioctl", + "comment": "Used by VFIO hotplug to get general device info", + "args": [ + { + "index": 1, + "type": "dword", + "op": "eq", + "val": 15211, + "comment": "VFIO_DEVICE_GET_INFO" + } + ] + }, + { + "syscall": "ioctl", + "comment": "Used by VFIO hotplug to query device region (BAR) info", + "args": [ + { + "index": 1, + "type": "dword", + "op": "eq", + "val": 15212, + "comment": "VFIO_DEVICE_GET_REGION_INFO" + } + ] + }, + { + "syscall": "ioctl", + "comment": "Used by VFIO hotplug to query device IRQ info (MSI-X)", + "args": [ + { + "index": 1, + "type": "dword", + "op": "eq", + "val": 15213, + "comment": "VFIO_DEVICE_GET_IRQ_INFO" + } + ] + }, + { + "syscall": "ioctl", + "comment": "Used by VFIO hotplug to set up MSI-X interrupts on the device", + "args": [ + { + "index": 1, + "type": "dword", + "op": "eq", + "val": 15214, + "comment": "VFIO_DEVICE_SET_IRQS" + } + ] + }, + { + "syscall": "ioctl", + "comment": "Used by VFIO hotplug to reset the device after attach", + "args": [ + { + "index": 1, + "type": "dword", + "op": "eq", + "val": 15215, + "comment": "VFIO_DEVICE_RESET" + } + ] + }, + { + "syscall": "ioctl", + "comment": "Used by VFIO hotplug to map guest memory and BAR regions for DMA", + "args": [ + { + "index": 1, + "type": "dword", + "op": "eq", + "val": 15217, + "comment": "VFIO_IOMMU_MAP_DMA" + } + ] + }, + { + "syscall": "readlink", + "comment": "Used by VFIO hotplug to resolve the iommu_group symlink under the device sysfs path" } ] }, @@ -1467,6 +1692,14 @@ "comment": "KVM_IRQFD" } ] + }, + { + "syscall": "pread64", + "comment": "Used by VFIO 
passthrough to read device PCI config and BAR regions on vCPU MMIO/PIO exits" + }, + { + "syscall": "pwrite64", + "comment": "Used by VFIO passthrough to write device PCI config and BAR regions on vCPU MMIO/PIO exits" } ] } diff --git a/src/firecracker/src/api_server/parsed_request.rs b/src/firecracker/src/api_server/parsed_request.rs index 2946b197067..480916e8be3 100644 --- a/src/firecracker/src/api_server/parsed_request.rs +++ b/src/firecracker/src/api_server/parsed_request.rs @@ -28,11 +28,12 @@ use super::request::net::{parse_patch_net, parse_put_net}; use super::request::pmem::{parse_patch_pmem, parse_put_pmem}; use super::request::snapshot::{parse_patch_vm_state, parse_put_snapshot}; use super::request::version::parse_get_version; +use super::request::vfio::parse_put_vfio; use super::request::vsock::parse_put_vsock; use crate::api_server::request::hotplug::memory::{ parse_get_memory_hotplug, parse_patch_memory_hotplug, parse_put_memory_hotplug, }; -use crate::api_server::request::hotplug::parse_unplug_device; +use crate::api_server::request::hotplug::{parse_unplug_device, parse_unplug_vfio_device}; use crate::api_server::request::serial::parse_put_serial; #[derive(Debug)] @@ -110,6 +111,7 @@ impl TryFrom<&Request> for ParsedRequest { } (Method::Put, "snapshot", Some(body)) => parse_put_snapshot(body, path_tokens.next()), (Method::Put, "vsock", Some(body)) => parse_put_vsock(body), + (Method::Put, "vfio", Some(body)) => parse_put_vfio(body, path_tokens.next()), (Method::Put, "entropy", Some(body)) => parse_put_entropy(body), (Method::Put, "hotplug", Some(body)) if path_tokens.next() == Some("memory") => { parse_put_memory_hotplug(body) @@ -137,6 +139,7 @@ impl TryFrom<&Request> for ParsedRequest { (Method::Delete, "network-interfaces", None) => { parse_unplug_device(VirtioDeviceType::Net, path_tokens.next()) } + (Method::Delete, "vfio", None) => parse_unplug_vfio_device(path_tokens.next()), (Method::Delete, _, Some(_)) => method_to_error(Method::Delete), 
(method, unknown_uri, _) => Err(RequestError::InvalidPathMethod( unknown_uri.to_string(), diff --git a/src/firecracker/src/api_server/request/hotplug/mod.rs b/src/firecracker/src/api_server/request/hotplug/mod.rs index 13bbdbfde96..bc4fbfefcf9 100644 --- a/src/firecracker/src/api_server/request/hotplug/mod.rs +++ b/src/firecracker/src/api_server/request/hotplug/mod.rs @@ -18,3 +18,12 @@ pub(crate) fn parse_unplug_device( id.to_string(), )))) } + +pub(crate) fn parse_unplug_vfio_device( + id_from_path: Option<&str>, +) -> Result { + let id = checked_id(id_from_path.ok_or(RequestError::EmptyID)?)?; + Ok(ParsedRequest::new_sync(VmmAction::HotUnplugVfioDevice( + id.to_string(), + ))) +} diff --git a/src/firecracker/src/api_server/request/mod.rs b/src/firecracker/src/api_server/request/mod.rs index 9be4617bd8e..45cb75e538f 100644 --- a/src/firecracker/src/api_server/request/mod.rs +++ b/src/firecracker/src/api_server/request/mod.rs @@ -18,5 +18,6 @@ pub mod pmem; pub mod serial; pub mod snapshot; pub mod version; +pub mod vfio; pub mod vsock; pub use micro_http::{Body, Method, StatusCode}; diff --git a/src/firecracker/src/api_server/request/vfio.rs b/src/firecracker/src/api_server/request/vfio.rs new file mode 100644 index 00000000000..ccb1b54e266 --- /dev/null +++ b/src/firecracker/src/api_server/request/vfio.rs @@ -0,0 +1,80 @@ +// Copyright 2026 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +use vmm::logger::{IncMetric, METRICS}; +use vmm::rpc_interface::VmmAction; +use vmm::vmm_config::vfio::VfioConfig; + +use super::super::parsed_request::{ParsedRequest, RequestError, checked_id}; +use super::{Body, StatusCode}; + +pub(crate) fn parse_put_vfio( + body: &Body, + id_from_path: Option<&str>, +) -> Result { + METRICS.put_api_requests.vfio_count.inc(); + let id = if let Some(id) = id_from_path { + checked_id(id)? 
+ } else { + METRICS.put_api_requests.vfio_fails.inc(); + return Err(RequestError::EmptyID); + }; + + let device_cfg = serde_json::from_slice::(body.raw()).inspect_err(|_| { + METRICS.put_api_requests.vfio_fails.inc(); + })?; + + if id != device_cfg.id { + METRICS.put_api_requests.vfio_fails.inc(); + Err(RequestError::Generic( + StatusCode::BadRequest, + "The id from the path does not match the id from the body!".to_string(), + )) + } else { + Ok(ParsedRequest::new_sync(VmmAction::InsertVfioDevice( + device_cfg, + ))) + } +} + +#[cfg(test)] +mod tests { + use vmm::pci::PciSBDF; + + use super::*; + use crate::api_server::parsed_request::tests::vmm_action_from_request; + + #[test] + fn test_parse_put_vfio_request() { + // No body, no id + parse_put_vfio(&Body::new("invalid_payload"), None).unwrap_err(); + // Invalid body, with id + parse_put_vfio(&Body::new("invalid_payload"), Some("id")).unwrap_err(); + + // Mismatched ids + let body = r#"{ + "id": "bar", + "sbdf": "/sys/bus/pci/devices/0000:00:1f.0" + }"#; + parse_put_vfio(&Body::new(body), Some("foo")).unwrap_err(); + + // Missing required field + let body = r#"{ + "id": "dev0" + }"#; + parse_put_vfio(&Body::new(body), Some("dev0")).unwrap_err(); + + // Valid request + let body = r#"{ + "id": "dev0", + "sbdf": "/sys/bus/pci/devices/0000:00:1f.0" + }"#; + let r = vmm_action_from_request(parse_put_vfio(&Body::new(body), Some("dev0")).unwrap()); + + let expected_config = VfioConfig { + id: "dev0".to_string(), + sbdf: PciSBDF::new(0x0, 0x0, 0x1f, 0x0), + }; + assert_eq!(r, VmmAction::InsertVfioDevice(expected_config)); + } +} diff --git a/src/firecracker/swagger/firecracker.yaml b/src/firecracker/swagger/firecracker.yaml index d1ac91bdb6a..fdd0be44d85 100644 --- a/src/firecracker/swagger/firecracker.yaml +++ b/src/firecracker/swagger/firecracker.yaml @@ -339,6 +339,64 @@ paths: schema: $ref: "#/definitions/Error" + /vfio/{id}: + put: + summary: Creates or updates a VFIO passthrough device. 
+ description: + This API can be used pre-boot and at runtime. During pre-boot phase it + creates new VFIO passthrough device with ID specified by id parameter. + If a VFIO device with the specified ID already exists, it updates + device state based on new input. During runtime it can be called to + hot-plug new VFIO device. Updates to existing VFIO devices during + runtime are not supported. + operationId: putGuestVfioByID + parameters: + - name: id + in: path + description: The id of the VFIO passthrough device + required: true + type: string + - name: body + in: body + description: VFIO device properties + required: true + schema: + $ref: "#/definitions/Vfio" + responses: + 204: + description: VFIO device is created/updated + 400: + description: VFIO device cannot be created/updated due to bad input + schema: + $ref: "#/definitions/Error" + default: + description: Internal server error. + schema: + $ref: "#/definitions/Error" + delete: + summary: Hot-unplugs a VFIO passthrough device. Runtime only. + description: + Removes a VFIO passthrough device with the specified ID from the guest. + Only valid after the microVM has booted. + operationId: deleteGuestVfioByID + parameters: + - name: id + in: path + description: The id of the VFIO passthrough device + required: true + type: string + responses: + 204: + description: VFIO device is removed + 400: + description: VFIO device cannot be removed due to bad input + schema: + $ref: "#/definitions/Error" + default: + description: Internal server error. + schema: + $ref: "#/definitions/Error" + /pmem/{id}: put: summary: Creates or updates a pmem device. Pre-boot only. @@ -1287,6 +1345,23 @@ definitions: rate_limiter: $ref: "#/definitions/RateLimiter" + Vfio: + type: object + required: + - id + - sbdf + properties: + id: + type: string + description: + Identifier for this VFIO passthrough device. + sbdf: + type: string + description: + Host PCI device identifier. 
Accepted formats are full sysfs path + ("/sys/bus/pci/devices/0000:11:22.3"), full SBDF ("0000:11:22.3"), + short BDF ("11:22.3"), hex integer ("0x1113"), or decimal integer. + Error: type: object properties: @@ -1333,6 +1408,11 @@ definitions: $ref: "#/definitions/Vsock" entropy: $ref: "#/definitions/EntropyDevice" + vfio: + type: array + description: Configurations for all VFIO passthrough devices. + items: + $ref: "#/definitions/Vfio" InstanceActionInfo: type: object diff --git a/src/vmm/Cargo.toml b/src/vmm/Cargo.toml index 50cd9b59134..a72d5aa384c 100644 --- a/src/vmm/Cargo.toml +++ b/src/vmm/Cargo.toml @@ -11,13 +11,13 @@ bench = false [features] default = [] tracing = ["log-instrument"] -gdb = ["arrayvec", "gdbstub", "gdbstub_arch"] +gdb = ["gdbstub", "gdbstub_arch"] fuzzing = [] [dependencies] acpi_tables = { path = "../acpi-tables" } -arrayvec = { version = "0.7.6", optional = true } +arrayvec = { version = "0.7.6" } aws-lc-rs = "1.17.0" base64 = "0.22.1" bitcode = { version = "0.6.9", features = ["serde"] } @@ -49,6 +49,8 @@ thiserror = "2.0.18" userfaultfd = "0.9.0" utils = { path = "../utils" } uuid = "1.23.2" +vfio-bindings = "0.6.2" +vfio-ioctls = { version = "0.6.0", default-features = false, features = ["kvm"] } vhost = { version = "0.15.0", features = ["vhost-user-frontend"] } vm-allocator = { version = "0.1.4", features = ["serde"] } vm-memory = { version = "0.17.1", features = [ diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index fbdf24d80f1..51e990f8c81 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -44,7 +44,7 @@ use crate::logger::debug; #[cfg(target_arch = "aarch64")] use crate::logger::warn; use crate::persist::{MicrovmState, MicrovmStateError}; -use crate::resources::VmResources; +use crate::resources::{ResourcesError, VmResources}; use crate::seccomp::BpfThreadMap; use crate::snapshot::Persist; use crate::utils::mib_to_bytes; @@ -55,6 +55,7 @@ use crate::vmm_config::instance_info::{InstanceInfo, 
VmState}; use crate::vmm_config::machine_config::MachineConfigError; use crate::vmm_config::memory_hotplug::MemoryHotplugConfig; use crate::vmm_config::pmem::PmemConfig; +use crate::vmm_config::vfio::VfioConfig; use crate::vstate::kvm::{Kvm, KvmError}; use crate::vstate::memory::GuestRegionMmap; #[cfg(target_arch = "aarch64")] @@ -66,6 +67,8 @@ use crate::{EventManager, Vmm, VmmError}; /// Errors associated with starting the instance. #[derive(Debug, thiserror::Error, displaydoc::Display)] pub enum StartMicrovmError { + /// Incompatible device configuration: {0} + IncompatibleDeviceConfiguration(#[from] ResourcesError), /// Unable to attach block device to Vmm: {0} AttachBlockDevice(io::Error), /// Could not attach device: {0} @@ -153,6 +156,7 @@ pub fn build_microvm_for_boot( ) -> Result>, StartMicrovmError> { // Timestamp for measuring microVM boot duration. let request_ts = TimestampUs::default(); + vm_resources.validate()?; let boot_config = vm_resources .boot_source @@ -292,6 +296,8 @@ pub fn build_microvm_for_boot( )?; } + attach_vfio_devices(&mut device_manager, &vm, &vm_resources.vfio.configs)?; + #[cfg(target_arch = "aarch64")] device_manager.attach_legacy_devices_aarch64( &kvm_vm, @@ -392,6 +398,8 @@ pub fn build_and_boot_microvm( /// Error type for [`build_microvm_from_snapshot`]. #[derive(Debug, thiserror::Error, displaydoc::Display)] pub enum BuildMicrovmFromSnapshotError { + /// Incompatible device configuration: {0} + IncompatibleDeviceConfiguration(#[from] ResourcesError), /// Failed to create microVM and vCPUs: {0} CreateMicrovmAndVcpus(#[from] StartMicrovmError), /// Could not access KVM: {0} @@ -439,6 +447,7 @@ pub fn build_microvm_from_snapshot( ) -> Result>, BuildMicrovmFromSnapshotError> { // Build Vmm. 
debug!("event_start: build microvm from snapshot"); + vm_resources.validate()?; let kvm = Kvm::new(microvm_state.kvm_state.kvm_cap_modifiers.clone()) .map_err(StartMicrovmError::Kvm)?; @@ -624,6 +633,22 @@ fn allocate_virtio_mem_address( Ok(GuestAddress(addr)) } +fn attach_vfio_devices( + device_manager: &mut DeviceManager, + vm: &Vm, + configs: &[VfioConfig], +) -> Result<(), StartMicrovmError> { + let kvm_vm = vm.as_kvm().ok_or(AttachDeviceError::NotSupported)?; + for config in configs.iter() { + device_manager + .pci_devices + // NOTE: technically, there is no reason to clone, + // but unfortunately vm_resources are not mutable. + .attach_vfio_device(kvm_vm, config.clone())?; + } + Ok(()) +} + fn attach_virtio_mem_device( device_manager: &mut DeviceManager, vm: &Vm, diff --git a/src/vmm/src/device_manager/mod.rs b/src/vmm/src/device_manager/mod.rs index 95c8877eead..2ddd0f82cc8 100644 --- a/src/vmm/src/device_manager/mod.rs +++ b/src/vmm/src/device_manager/mod.rs @@ -48,7 +48,7 @@ use crate::devices::virtio::transport::pci::device::CAPABILITY_BAR_SIZE; use crate::devices::virtio::vsock::{VsockError, VsockUnixBackendError}; use crate::logger::{error, info, warn}; use crate::rate_limiter::TokenBucket; -use crate::resources::VmResources; +use crate::resources::{ResourcesError, VmResources}; use crate::rpc_interface::VmmActionError; use crate::snapshot::Persist; use crate::utils::open_file_nonblock; @@ -57,6 +57,7 @@ use crate::vmm_config::drive::{BlockDeviceConfig, DriveError}; use crate::vmm_config::mmds::MmdsConfigError; use crate::vmm_config::net::{NetBuilder, NetworkInterfaceConfig, NetworkInterfaceError}; use crate::vmm_config::pmem::{PmemConfig, PmemConfigError}; +use crate::vmm_config::vfio::VfioConfig; use crate::vstate::bus::BusError; use crate::vstate::memory::GuestMemoryMmap; use crate::vstate::vm::{KvmVm, Vm}; @@ -500,6 +501,20 @@ impl DeviceManager { } let dev_type = config.device_type(); + match dev_type { + VirtioDeviceType::Balloon if 
!self.pci_devices.vfio_devices.is_empty() => { + return Err(VmmActionError::IncompatibleDeviceConfiguration( + ResourcesError::VfioWithBalloon, + )); + } + VirtioDeviceType::Mem if !self.pci_devices.vfio_devices.is_empty() => { + return Err(VmmActionError::IncompatibleDeviceConfiguration( + ResourcesError::VfioWithMemHotplug, + )); + } + _ => {} + } + let dev_id = config.device_id().to_string(); if self @@ -521,6 +536,33 @@ impl DeviceManager { Ok(()) } + /// Attaches a device after VM start + pub fn hotplug_device_vfio( + &mut self, + vm: &Arc, + config: VfioConfig, + ) -> Result<(), VmmActionError> { + if !self.is_pci_enabled() { + return Err(VmmActionError::PciNotEnabled); + } + + for (virtio_type, _) in self.pci_devices.virtio_devices.keys() { + if *virtio_type == VirtioDeviceType::Balloon { + return Err(VmmActionError::IncompatibleDeviceConfiguration( + ResourcesError::VfioWithBalloon, + )); + } + if *virtio_type == VirtioDeviceType::Mem { + return Err(VmmActionError::IncompatibleDeviceConfiguration( + ResourcesError::VfioWithMemHotplug, + )); + } + } + + self.pci_devices.attach_vfio_device(vm, config)?; + Ok(()) + } + fn hotplug_make_block( config: BlockDeviceConfig, ) -> Result>, VmmActionError> { @@ -629,6 +671,15 @@ impl DeviceManager { Ok(()) } + + /// Detaches a device after VM start + pub fn hot_unplug_vfio_device(&mut self, vm: &KvmVm, id: String) -> Result<(), VmmActionError> { + if !self.is_pci_enabled() { + return Err(VmmActionError::PciNotEnabled); + } + self.pci_devices.detach_vfio_device(vm, id)?; + Ok(()) + } } #[derive(Debug, Default, Clone, Serialize, Deserialize)] @@ -787,9 +838,9 @@ impl<'a> Persist<'a> for DeviceManager { #[cfg(test)] pub(crate) mod tests { - use super::*; use vmm_sys_util::tempfile::TempFile; + use super::*; use crate::builder::tests::{ CustomBlockConfig, default_kernel_cmdline, default_vmm, insert_block_devices, }; diff --git a/src/vmm/src/device_manager/pci_mngr.rs b/src/vmm/src/device_manager/pci_mngr.rs index 
95e9290cdc9..fe4b51b1314 100644 --- a/src/vmm/src/device_manager/pci_mngr.rs +++ b/src/vmm/src/device_manager/pci_mngr.rs @@ -38,22 +38,38 @@ use crate::pci::PciSBDF; use crate::pci::bus::PciRootError; use crate::resources::VmResources; use crate::snapshot::Persist; +use crate::vfio::{VfioContainer, VfioDeviceBundle, VfioError, deinit_vfio_device}; use crate::vmm_config::memory_hotplug::MemoryHotplugConfig; +use crate::vmm_config::vfio::VfioConfig; use crate::vstate::bus::BusError; use crate::vstate::interrupts::InterruptError; use crate::vstate::memory::GuestMemoryMmap; use crate::vstate::vm::KvmVm; -#[derive(Debug, Default)] +#[derive(Default)] pub struct PciDevices { /// PCIe segment of the VMM, if PCI is enabled. We currently support a single PCIe segment. pub pci_segment: Option, /// All VirtIO PCI devices of the system pub virtio_devices: HashMap>>, + + pub vfio_container: Option>, + // All Vfio PCI devices + pub vfio_devices: Vec>>, +} + +impl std::fmt::Debug for PciDevices { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("PciDevices").finish() + } } #[derive(Debug, thiserror::Error, displaydoc::Display)] pub enum PciManagerError { + /// Trying to add new device with id: {0}, but device with this id is already present + AdddingDuplicatedDevice(String), + /// Device not found + DeviceNotFound, /// Resource allocation error: {0} ResourceAllocation(#[from] vm_allocator::Error), /// Bus error: {0} @@ -66,6 +82,8 @@ pub enum PciManagerError { VirtioPciDevice(#[from] VirtioPciDeviceError), /// KVM error: {0} Kvm(#[from] vmm_sys_util::errno::Error), + /// Vfio error: {0} + Vfio(#[from] VfioError), } impl PciDevices { @@ -174,6 +192,110 @@ impl PciDevices { self.attach_common(vm, device_type, id, sbdf, virtio_device, event_manager) } + pub fn attach_vfio_device( + &mut self, + vm: &Arc, + config: VfioConfig, + ) -> Result<(), PciManagerError> { + for device in self.vfio_devices.iter() { + let device = device.lock().unwrap(); 
+ if device.config.id == config.id { + return Err(PciManagerError::AdddingDuplicatedDevice(config.id)); + } + if device.config.sbdf == config.sbdf { + return Err(PciManagerError::AdddingDuplicatedDevice(config.id)); + } + } + + let pci_segment = self.pci_segment.as_ref().unwrap(); + let pci_device_bdf = pci_segment.next_device_sbdf()?; + debug!("VFIO: Allocating BDF: {pci_device_bdf:?} for device"); + + if self.vfio_container.is_none() { + let container = crate::vfio::init_kvm_vfio_device_and_vfio_container(vm.as_ref())?; + self.vfio_container = Some(container); + } + let container = self.vfio_container.as_ref().unwrap(); + let is_first_device = self.vfio_devices.is_empty(); + + let device = match crate::vfio::init_vfio_device(container, vm, config, pci_device_bdf) { + Ok(d) => d, + Err(e) => { + if is_first_device { + self.vfio_container = None; + } + return Err(e.into()); + } + }; + + #[allow(clippy::collapsible_if)] + if is_first_device { + if let Err(e) = crate::vfio::dma_map_guest_memory(container, vm.guest_memory()) { + crate::vfio::deinit_vfio_device(container, vm, &device.lock().unwrap()); + self.vfio_container = None; + return Err(e.into()); + } + } + + // This is for config space + pci_segment + .pci_bus + .lock() + .expect("Poisoned lock") + // SAFETY: we should never add 2 devices with same device id + .add_device(pci_device_bdf.device(), device.clone()) + .unwrap(); + + self.vfio_devices.push(device); + + Ok(()) + } + + pub fn detach_vfio_device(&mut self, vm: &KvmVm, id: String) -> Result<(), PciManagerError> { + if self.pci_segment.is_none() { + return Ok(()); + } + if self.vfio_container.is_none() { + // SAFETY: vfio devices cannot exist without vfio_container + assert!(self.vfio_devices.is_empty()); + return Ok(()); + } + + let pci_segment = self.pci_segment.as_ref().unwrap(); + let mut pci_bus = pci_segment.pci_bus.lock().expect("Poisoned lock"); + let container = self.vfio_container.as_ref().unwrap(); + + let last_vfio_device = 
self.vfio_devices.len() == 1; + + let mut idx = None; + for (i, device) in self.vfio_devices.iter().enumerate() { + let device = device.lock().unwrap(); + if device.config.id == id { + idx = Some(i); + pci_bus.remove_device(device.sbdf.device()); + deinit_vfio_device(container, vm, &device); + break; + } + } + if let Some(idx) = idx { + let device = self.vfio_devices.swap_remove(idx); + assert_eq!(Arc::strong_count(&device), 1); + // Drop device explicitly to have a guaranteed destroy order with the vfio_container + drop(device); + + if last_vfio_device { + crate::vfio::dma_unmap_guest_memory(container, vm.guest_memory()); + + assert_eq!(Arc::strong_count(container), 1); + self.vfio_container = None; + } + + Ok(()) + } else { + Err(PciManagerError::DeviceNotFound) + } + } + fn restore_pci_device( &mut self, vm: &Arc, @@ -835,7 +957,8 @@ mod tests { "total_size_mib": 1024, "block_size_mib": 2, "slot_size_mib": 128 - }} + }}, + "vfio": [] }}"#, _block_files.last().unwrap().as_path().to_str().unwrap(), tmp_sock_file.as_path().to_str().unwrap(), diff --git a/src/vmm/src/device_manager/persist.rs b/src/vmm/src/device_manager/persist.rs index 82f02ede962..adba019356e 100644 --- a/src/vmm/src/device_manager/persist.rs +++ b/src/vmm/src/device_manager/persist.rs @@ -859,7 +859,8 @@ mod tests { "total_size_mib": 1024, "block_size_mib": 2, "slot_size_mib": 128 - }} + }}, + "vfio": [] }}"#, _block_files.last().unwrap().as_path().to_str().unwrap(), tmp_sock_file.as_path().to_str().unwrap(), diff --git a/src/vmm/src/lib.rs b/src/vmm/src/lib.rs index e914814e7ac..4bfbf673dc2 100644 --- a/src/vmm/src/lib.rs +++ b/src/vmm/src/lib.rs @@ -108,6 +108,8 @@ pub mod snapshot; pub mod test_utils; /// Utility functions and struct pub mod utils; +/// VFIO device configuration and emulation +pub mod vfio; /// Wrappers over structures used to configure the VMM. pub mod vmm_config; /// Module with virtual state structs. 
@@ -158,6 +160,7 @@ use crate::vmm_config::machine_config::MachineConfig; use crate::vmm_config::memory_hotplug::MemoryHotplugConfig; use crate::vmm_config::mmds::MmdsConfig; use crate::vmm_config::net::NetworkInterfaceConfig; +use crate::vmm_config::vfio::VfioConfig; use crate::vmm_config::vsock::VsockDeviceConfig; pub use crate::vstate::kvm::Kvm; use crate::vstate::memory::{GuestMemory, GuestMemoryMmap, GuestMemoryRegion}; @@ -353,6 +356,7 @@ impl Vmm { let mut memory_hotplug = None; let mut mmds_ipv4_address = None; let mut mmds_ref = None; + let mut vfio = Vec::new(); self.device_manager .for_each_virtio_device(|device_type, device| match device_type { @@ -400,6 +404,12 @@ impl Vmm { } }); + if self.device_manager.is_pci_enabled() { + for device in self.device_manager.pci_devices.vfio_devices.iter() { + vfio.push(device.lock().unwrap().config.clone()); + } + } + let mmds_config = mmds_ref.map(|mmds| { let mmds = mmds.lock().expect("Poisoned lock"); MmdsConfig { @@ -429,6 +439,7 @@ impl Vmm { // serial_config is marked serde(skip) so that it doesnt end up in snapshots serial_config: None, memory_hotplug, + vfio, } } @@ -444,6 +455,9 @@ impl Vmm { tuples.push(("vhost-user-block", b.id().to_owned())); } }); + for device in self.device_manager.pci_devices.vfio_devices.iter() { + tuples.push(("vfio", device.lock().unwrap().config.id.clone())); + } if tuples.is_empty() { Ok(()) } else { @@ -718,6 +732,17 @@ impl Vmm { .hotplug_device(kvm_vm, config, event_manager) } + /// Attaches a VFIO device after VM start + #[inline] + pub fn hotplug_device_vfio(&mut self, config: VfioConfig) -> Result<(), VmmActionError> { + log_dev_preview_warning("VFIO device hotplug", None); + let kvm_vm = self + .vm + .as_kvm() + .ok_or_else(|| VmmActionError::NotSupported("Operation requires KVM".to_string()))?; + self.device_manager.hotplug_device_vfio(kvm_vm, config) + } + /// Detaches a device after VM start #[inline] pub fn hot_unplug_device( @@ -734,6 +759,17 @@ impl Vmm { 
self.device_manager .hot_unplug_device(kvm_vm, device_id, event_manager) } + + /// Detaches a VFIO device after VM start + #[inline] + pub fn hot_unplug_vfio_device(&mut self, id: String) -> Result<(), VmmActionError> { + log_dev_preview_warning("VFIO device hot-unplug", None); + let kvm_vm = self + .vm + .as_kvm() + .ok_or_else(|| VmmActionError::NotSupported("Operation requires KVM".to_string()))?; + self.device_manager.hot_unplug_vfio_device(kvm_vm, id) + } } /// Process the content of the MPIDR_EL1 register in order to be able to pass it to KVM diff --git a/src/vmm/src/logger/metrics.rs b/src/vmm/src/logger/metrics.rs index ca96fe8b5b3..c2efb1a121e 100644 --- a/src/vmm/src/logger/metrics.rs +++ b/src/vmm/src/logger/metrics.rs @@ -432,6 +432,10 @@ pub struct PutRequestsMetrics { pub hotplug_memory_count: SharedIncMetric, /// Number of failed PUTs to /hotplug/memory pub hotplug_memory_fails: SharedIncMetric, + /// Number of PUTs triggering a vfio device attach. + pub vfio_count: SharedIncMetric, + /// Number of failures in attaching a vfio device. + pub vfio_fails: SharedIncMetric, } impl PutRequestsMetrics { /// Const default construction. 
@@ -463,6 +467,8 @@ impl PutRequestsMetrics { serial_fails: SharedIncMetric::new(), hotplug_memory_count: SharedIncMetric::new(), hotplug_memory_fails: SharedIncMetric::new(), + vfio_count: SharedIncMetric::new(), + vfio_fails: SharedIncMetric::new(), } } } diff --git a/src/vmm/src/pci/configuration.rs b/src/vmm/src/pci/configuration.rs index b838262a0a4..283115149d2 100644 --- a/src/vmm/src/pci/configuration.rs +++ b/src/vmm/src/pci/configuration.rs @@ -72,6 +72,18 @@ pub struct Bar { pub about_to_be_read: bool, } +impl Bar { + /// Is this BAR used + pub fn used(&self) -> bool { + !(self.encoded_addr == 0 && self.encoded_size == 0) + } + /// Is this a 64bit BAR + /// Must be called only on lower register of the BAR + pub fn is_64bit(&self) -> bool { + (self.encoded_addr & 0b100) == 0b100 + } +} + /// Specifies if the BAR is prefetchable #[derive(Debug, Clone, Copy)] #[repr(u8)] @@ -82,6 +94,14 @@ pub enum BarPrefetchable { Yes = 1, } +impl From for BarPrefetchable { + fn from(value: bool) -> Self { + // SAFETY: BarPrefetchable is #[repr(u8)] with No = 0 and Yes = 1, + // which matches the guaranteed bool representation (false = 0, true = 1). 
+ unsafe { std::mem::transmute(value) } + } +} + /// Type to handle basic interactions with BARs region #[derive(Debug, Default, Clone, Copy, Serialize, Deserialize)] pub struct Bars { @@ -89,6 +109,23 @@ pub struct Bars { pub bars: [Bar; NUM_BAR_REGS as usize], } impl Bars { + /// Set a BAR slot as a single 32bit bar + pub fn set_bar_32(&mut self, bar_idx: u8, addr: u32, size: u32, prefetchable: BarPrefetchable) { + assert_ne!(size, 0); + assert!(size.is_power_of_two()); + assert!(addr & 0b1111 == 0); + addr.checked_add(size - 1).unwrap(); + assert!(bar_idx < NUM_BAR_REGS); + + // Unused BARs will have address and size of 0 + assert_eq!(self.bars[bar_idx as usize].encoded_addr, 0); + assert_eq!(self.bars[bar_idx as usize].encoded_size, 0); + + let size = encode_32_bits_bar_size(size); + let prefetchable = (prefetchable as u32) << 3; + self.bars[bar_idx as usize].encoded_addr = addr | prefetchable; + self.bars[bar_idx as usize].encoded_size = size; + } /// Set 2 consecutive BAR slots as a single 64bit bar pub fn set_bar_64(&mut self, bar_idx: u8, addr: u64, size: u64, prefetchable: BarPrefetchable) { assert_ne!(size, 0); @@ -114,6 +151,34 @@ impl Bars { self.bars[(bar_idx + 1) as usize].encoded_addr = addr_hi; self.bars[(bar_idx + 1) as usize].encoded_size = size_hi; } + /// Get the address of the 32bit or 64bit BAR + pub fn get_bar_addr(&self, bar_idx: u8) -> u64 { + assert!(bar_idx < NUM_BAR_REGS); + let bar = &self.bars[bar_idx as usize]; + if bar.is_64bit() { + self.get_bar_addr_64(bar_idx) + } else { + if 0 < bar_idx { + let previous_bar = &self.bars[bar_idx as usize - 1]; + assert!(!previous_bar.is_64bit()); + } + u64::from(bar.encoded_addr & !0b1111) + } + } + /// Get the size of the 32bit or 64bit BAR + pub fn get_bar_size(&self, bar_idx: u8) -> u64 { + assert!(bar_idx < NUM_BAR_REGS); + let bar = &self.bars[bar_idx as usize]; + if bar.is_64bit() { + self.get_bar_size_64(bar_idx) + } else { + if 0 < bar_idx { + let previous_bar = &self.bars[bar_idx as usize 
- 1]; + assert!(!previous_bar.is_64bit()); + } + u64::from(decode_32_bits_bar_size(bar.encoded_size)) + } + } /// Get the address of the 64bit bar pub fn get_bar_addr_64(&self, bar_idx: u8) -> u64 { assert!(bar_idx < NUM_BAR_REGS - 1); @@ -121,6 +186,13 @@ impl Bars { let addr_lo = self.bars[bar_idx as usize].encoded_addr & !0b1111; (addr_hi as u64) << 32 | (addr_lo as u64) } + /// Get the size of the 64bit bar + pub fn get_bar_size_64(&self, bar_idx: u8) -> u64 { + assert!(bar_idx < NUM_BAR_REGS - 1); + let size_hi = self.bars[(bar_idx + 1) as usize].encoded_size; + let size_lo = self.bars[bar_idx as usize].encoded_size; + decode_64_bits_bar_size(size_hi, size_lo) + } /// Writes into a given BAR register at the given offset pub fn write(&mut self, bar_idx: u8, offset: u8, data: &[u8]) { // There are only 6 registers each 4 bytes long @@ -167,6 +239,18 @@ pub trait PciCapability { fn id(&self) -> PciCapabilityId; } +// This encodes the BAR size as expected by the software running inside the guest. +// It assumes that bar_size is not 0 +fn encode_32_bits_bar_size(bar_size: u32) -> u32 { + assert_ne!(bar_size, 0); + !(bar_size - 1) +} + +/// Decode the BAR size for 32 bit +pub fn decode_32_bits_bar_size(encoded_size: u32) -> u32 { + (!encoded_size).wrapping_add(1) +} + // This encodes the BAR size as expected by the software running inside the guest. 
// It assumes that bar_size is not 0 fn encode_64_bits_bar_size(bar_size: u64) -> (u32, u32) { @@ -177,6 +261,12 @@ fn encode_64_bits_bar_size(bar_size: u64) -> (u32, u32) { (result_hi, result_lo) } +/// Decode the BAR size for 64 bit +pub fn decode_64_bits_bar_size(encoded_size_hi: u32, encoded_size_lo: u32) -> u64 { + let result = (u64::from(encoded_size_hi) << 32) | (u64::from(encoded_size_lo)); + (!result).wrapping_add(1) +} + /// PCI configuration space state for (de)serialization #[derive(Debug, Clone, Serialize, Deserialize)] pub struct PciConfigurationState { @@ -693,35 +783,56 @@ mod tests { #[test] #[should_panic] - fn test_bars_size_no_power_of_two() { + fn test_bars_set_bar_32_size_no_power_of_two() { + let mut bars = Bars::default(); + bars.set_bar_32(0, 0x1000, 0x1001, BarPrefetchable::No); + } + + #[test] + #[should_panic] + fn test_bars_set_bar_32_bad_bar_index() { + let mut bars = Bars::default(); + bars.set_bar_32(NUM_BAR_REGS, 0x1000, 0x1000, BarPrefetchable::No); + } + + #[test] + #[should_panic] + fn test_bars_set_bar_32_bar_size_overflows() { + let mut bars = Bars::default(); + bars.set_bar_32(0, u32::MAX, 0x2, BarPrefetchable::No); + } + + #[test] + #[should_panic] + fn test_bars_set_bar_64_size_no_power_of_two() { let mut bars = Bars::default(); bars.set_bar_64(0, 0x1000, 0x1001, BarPrefetchable::No); } #[test] #[should_panic] - fn test_bars_bad_bar_index() { + fn test_bars_set_bar_64_invalid_bar_index() { let mut bars = Bars::default(); bars.set_bar_64(NUM_BAR_REGS, 0x1000, 0x1000, BarPrefetchable::No); } #[test] #[should_panic] - fn test_bars_bad_64bit_bar_index() { + fn test_bars_set_bar_64_bad_bar_index() { let mut bars = Bars::default(); bars.set_bar_64(NUM_BAR_REGS - 1, 0x1000, 0x1000, BarPrefetchable::No); } #[test] #[should_panic] - fn test_bars_bar_size_overflows() { + fn test_bars_set_bar_64_bar_size_overflows() { let mut bars = Bars::default(); bars.set_bar_64(0, u64::MAX, 0x2, BarPrefetchable::No); } #[test] #[should_panic] - fn 
test_bars_lower_bar_free_upper_used() { + fn test_bars_set_bar_64_lower_bar_free_upper_used() { let mut bars = Bars::default(); bars.set_bar_64(1, 0x1000, 0x1000, BarPrefetchable::No); bars.set_bar_64(0, 0x1000, 0x1000, BarPrefetchable::No); @@ -729,7 +840,7 @@ mod tests { #[test] #[should_panic] - fn test_bars_lower_bar_used() { + fn test_bars_set_bar_64_lower_bar_used() { let mut bars = Bars::default(); bars.set_bar_64(0, 0x1000, 0x1000, BarPrefetchable::No); bars.set_bar_64(0, 0x1000, 0x1000, BarPrefetchable::No); @@ -737,7 +848,7 @@ mod tests { #[test] #[should_panic] - fn test_bars_upper_bar_used() { + fn test_bars_set_bar_64_upper_bar_used() { let mut bars = Bars::default(); bars.set_bar_64(0, 0x1000, 0x1000, BarPrefetchable::No); bars.set_bar_64(1, 0x1000, 0x1000, BarPrefetchable::No); @@ -747,11 +858,21 @@ mod tests { fn test_bars_add_pci_bar() { let mut bars = Bars::default(); bars.set_bar_64(0, 0x1_0000_0000, 0x1000, BarPrefetchable::No); + assert!(bars.bars[0].used()); + assert!(bars.bars[0].is_64bit()); + assert!(bars.bars[1].used()); assert_eq!(bars.get_bar_addr_64(0), 0x1_0000_0000); + assert_eq!(bars.get_bar_size(0), 0x1000); let mut v: u32 = 0; bars.read(0, 0, v.as_mut_bytes()); assert_eq!(v & 0xffff_fff0, 0x0); bars.read(1, 0, v.as_mut_bytes()); assert_eq!(v, 1); + + bars.set_bar_32(2, 0x2_0000, 0x2000, BarPrefetchable::Yes); + assert!(bars.bars[2].used()); + assert!(!bars.bars[2].is_64bit()); + assert_eq!(bars.get_bar_addr(2), 0x2_0000); + assert_eq!(bars.get_bar_size(2), 0x2000); } } diff --git a/src/vmm/src/pci/mod.rs b/src/vmm/src/pci/mod.rs index 89f2129edfd..9d146ce6ed9 100644 --- a/src/vmm/src/pci/mod.rs +++ b/src/vmm/src/pci/mod.rs @@ -88,6 +88,95 @@ impl PciSBDF { | (function & 0x7) as u32, ) } + + const SYSFS_PCI_PREFIX: &str = "/sys/bus/pci/devices/"; + + /// Parse SBDF from string + /// Accepts the following formats: + /// - Full sysfs path: "/sys/bus/pci/devices/0000:f4:00.0" + /// - Full SBDF: "0000:f4:00.0" + /// - Short BDF: 
"f4:00.0" + /// - Hex integer: "0x0308" or "0x00f40000" + /// - Decimal integer: "776" + pub fn new_from_str(s: &str) -> Option { + let s = s.trim(); + if s.is_empty() { + return None; + } + + let stripped = s + .strip_prefix(Self::SYSFS_PCI_PREFIX) + .unwrap_or(s) + .trim_end_matches('/'); + + if let Some(sbdf) = Self::parse_sbdf(stripped) { + return Some(sbdf); + } + + if let Some(hex_str) = stripped + .strip_prefix("0x") + .or_else(|| stripped.strip_prefix("0X")) + { + if let Ok(val) = u32::from_str_radix(hex_str, 16) { + return Some(val.into()); + } else { + return None; + }; + } + + if let Ok(val) = stripped.parse::() { + return Some(val.into()); + } + + None + } + + /// Parse "SSSS:BB:DD.F" or "BB:DD.F" + fn parse_sbdf(s: &str) -> Option { + let mut segment: u16 = 0; + let mut bdf_str = s; + if let Some((seg_str, rest)) = Self::split_segment_part(s) { + segment = u16::from_str_radix(seg_str, 16).ok()?; + bdf_str = rest + } + + let (bus_str, devfn) = bdf_str.split_once(':')?; + let (dev_str, fn_str) = devfn.split_once('.')?; + + let bus = u8::from_str_radix(bus_str, 16).ok()?; + let dev = u8::from_str_radix(dev_str, 16).ok()?; + let func = u8::from_str_radix(fn_str, 16).ok()?; + + // `dev` can only occupy 5 bits + // `func` can only occuppy 3 bits + if 0x1f < dev || 0x7 < func { + return None; + } + + Some(PciSBDF::new(segment, bus, dev, func)) + } + + fn split_segment_part(s: &str) -> Option<(&str, &str)> { + let first_colon = s.find(':')?; + let rest = &s[first_colon + 1..]; + if rest.contains(':') { + Some((&s[..first_colon], rest)) + } else { + None + } + } + + /// Full sysfs path for this device (e.g. 
"/sys/bus/pci/devices/0000:f4:00.0") + pub fn sysfs_path(&self) -> String { + format!( + "{}{:04x}:{:02x}:{:02x}.{:x}", + Self::SYSFS_PCI_PREFIX, + self.segment(), + self.bus(), + self.device(), + self.function() + ) + } } impl From for PciSBDF { @@ -423,4 +512,28 @@ mod tests { assert_eq!(sbdf.device(), 0x0f); assert_eq!(sbdf.function(), 0x0); } + + #[test] + fn test_pci_bdf_parse() { + let sbdf = PciSBDF::new_from_str("/sys/bus/pci/devices/0000:f4:00.0").unwrap(); + assert_eq!(sbdf, PciSBDF::new(0, 0xf4, 0, 0)); + + let sbdf = PciSBDF::new_from_str("0000:03:01.0").unwrap(); + assert_eq!(sbdf, PciSBDF::new(0, 3, 1, 0)); + + let sbdf = PciSBDF::new_from_str("03:01.0").unwrap(); + assert_eq!(sbdf, PciSBDF::new(0, 3, 1, 0)); + + let sbdf = PciSBDF::new_from_str("0x0308").unwrap(); + assert_eq!(sbdf.0, 0x0308); + + let sbdf = PciSBDF::new_from_str("776").unwrap(); + assert_eq!(sbdf.0, 776); + + assert!(PciSBDF::new_from_str("").is_none()); + // 0x1f < device + assert!(PciSBDF::new_from_str("00:20.0").is_none()); + // 7 < function + assert!(PciSBDF::new_from_str("00:00.8").is_none()); + } } diff --git a/src/vmm/src/pci/msix.rs b/src/vmm/src/pci/msix.rs index 0d7c3e75fe1..a05992018d0 100644 --- a/src/vmm/src/pci/msix.rs +++ b/src/vmm/src/pci/msix.rs @@ -466,7 +466,7 @@ impl MsixConfig { #[repr(C, packed)] #[derive(Debug, Clone, Copy, Default, Serialize, Deserialize)] -/// MSI-X PCI capability +/// 7.7.2 MSI-X Capability and Table Structure pub struct MsixCap { /// Message Control Register /// 10-0: MSI-X Table size @@ -517,6 +517,55 @@ impl MsixCap { pba: (pba_off & 0xffff_fff8u32) | u32::from(pba_pci_bar & 0x7u8), } } + + /// Is MASKED bit set + pub fn masked(&self) -> bool { + (self.msg_ctl >> FUNCTION_MASK_BIT) & 0x1 == 0x1 + } + + /// Is ENABLED bit set + pub fn enabled(&self) -> bool { + (self.msg_ctl >> MSIX_ENABLE_BIT) & 0x1 == 0x1 + } + + /// Table offset + pub fn table_offset(&self) -> u32 { + self.table & 0xffff_fff8 + } + + /// Pba offset + pub fn 
pba_offset(&self) -> u32 { + self.pba & 0xffff_fff8 + } + + /// Table BAR idx + pub fn table_bir(&self) -> u8 { + (self.table & 0x7) as u8 + } + + /// PBA BAR idx + pub fn pba_bir(&self) -> u8 { + (self.pba & 0x7) as u8 + } + + /// Table size in entries (masked `msg_ctl` field plus one) + pub fn table_size(&self) -> u16 { + (self.msg_ctl & 0x7ff) + 1 + } + + /// Table BAR offset and size in bytes + pub fn table_range(&self) -> (u64, u64) { + // The table takes 16 bytes per entry. + let size = self.table_size() as u64 * 16; + (self.table_offset() as u64, size) + } + + /// PBA BAR offset and size in bytes + pub fn pba_range(&self) -> (u64, u64) { + // The PBA takes 1 bit per entry, rounded up to a multiple of 8 bytes. + let size = (self.table_size() as u64).div_ceil(64) * 8; + (self.pba_offset() as u64, size) + } } #[cfg(test)] diff --git a/src/vmm/src/resources.rs b/src/vmm/src/resources.rs index 632db13f259..71dd295e511 100644 --- a/src/vmm/src/resources.rs +++ b/src/vmm/src/resources.rs @@ -31,6 +31,7 @@ use crate::vmm_config::mmds::{MmdsConfig, MmdsConfigError}; use crate::vmm_config::net::*; use crate::vmm_config::pmem::{PmemBuilder, PmemConfig, PmemConfigError}; use crate::vmm_config::serial::SerialConfig; +use crate::vmm_config::vfio::{VfioConfig, VfioConfigError, VfioConfigs}; use crate::vmm_config::vsock::*; use crate::vstate::memory; use crate::vstate::memory::{GuestRegionMmap, MemoryError}; @@ -68,6 +69,14 @@ pub enum ResourcesError { PmemConfig(#[from] PmemConfigError), /// Memory hotplug config error: {0} MemoryHotplugConfig(#[from] MemoryHotplugConfigError), + /// VFIO config error: {0} + VfioConfig(#[from] VfioConfigError), + /// VFIO devices attached, but PCI disabled + VfioWithoutPci, + /// VFIO devices are not compatible with memory hot-plugging device + VfioWithMemHotplug, + /// VFIO devices are not compatible with memory balloon device + VfioWithBalloon, } #[derive(Serialize, Deserialize, PartialEq, Eq, Debug)] @@ -100,6 +109,8 @@ pub struct VmmConfig { #[serde(skip)] pub serial_config: Option, pub 
memory_hotplug: Option, + #[serde(default)] + pub vfio: Vec, } /// A data structure that encapsulates the device configurations @@ -138,6 +149,8 @@ pub struct VmResources { pub serial_out_path: Option, /// Optional rate limiter config for serial output. pub serial_rate_limiter_cfg: Option, + /// VFIO passthrough configuration. + pub vfio: VfioConfigs, } impl VmResources { @@ -239,9 +252,29 @@ impl VmResources { resources.set_memory_hotplug_config(memory_hotplug_config)?; } + for config in vmm_config.vfio { + resources.vfio.add(config)?; + } + Ok(resources) } + /// Validate the VM configuration for incompatibilities + pub fn validate(&self) -> Result<(), ResourcesError> { + if !self.vfio.configs.is_empty() { + if !self.pci_enabled { + return Err(ResourcesError::VfioWithoutPci); + } + if self.memory_hotplug.is_some() { + return Err(ResourcesError::VfioWithMemHotplug); + } + if self.balloon.get().is_some() { + return Err(ResourcesError::VfioWithBalloon); + } + } + Ok(()) + } + /// If not initialised, create the mmds data store with the default config. pub fn mmds_or_default(&mut self) -> Result<&Arc>, MmdsConfigError> { Ok(self @@ -533,6 +566,11 @@ impl VmResources { .pop() .unwrap()) } + + /// Adds a VFIO passthrough device configuration. + pub fn set_vfio_device(&mut self, config: VfioConfig) -> Result<(), VfioConfigError> { + self.vfio.add(config) + } } impl From<&VmResources> for VmmConfig { @@ -553,6 +591,7 @@ impl From<&VmResources> for VmmConfig { // serial_config is marked serde(skip) so that it doesnt end up in snapshots. 
serial_config: None, memory_hotplug: resources.memory_hotplug.clone(), + vfio: resources.vfio.configs.clone(), } } } @@ -665,6 +704,7 @@ mod tests { serial_out_path: None, serial_rate_limiter_cfg: None, memory_hotplug: Default::default(), + vfio: Default::default(), } } diff --git a/src/vmm/src/rpc_interface.rs b/src/vmm/src/rpc_interface.rs index 7693b305f65..776100267ac 100644 --- a/src/vmm/src/rpc_interface.rs +++ b/src/vmm/src/rpc_interface.rs @@ -9,7 +9,7 @@ use utils::time::{ClockType, get_time_us}; use super::builder::build_and_boot_microvm; use super::persist::{create_snapshot, restore_from_snapshot}; -use super::resources::VmResources; +use super::resources::{ResourcesError, VmResources}; use super::{Vmm, VmmError}; use crate::EventManager; use crate::builder::StartMicrovmError; @@ -23,7 +23,6 @@ use crate::mmds::data_store::{self, Mmds, MmdsDatastoreError}; use crate::persist::{CreateSnapshotError, RestoreFromSnapshotError, VmInfo}; use crate::resources::VmmConfig; use crate::seccomp::BpfThreadMap; -use crate::vmm_config::HotplugDeviceConfig; use crate::vmm_config::balloon::{ BalloonConfigError, BalloonDeviceConfig, BalloonStats, BalloonUpdateConfig, BalloonUpdateStatsConfig, @@ -44,8 +43,9 @@ use crate::vmm_config::net::{ use crate::vmm_config::pmem::{PmemConfig, PmemConfigError, PmemDeviceUpdateConfig}; use crate::vmm_config::serial::SerialConfig; use crate::vmm_config::snapshot::{CreateSnapshotParams, LoadSnapshotParams, SnapshotType}; +use crate::vmm_config::vfio::{VfioConfig, VfioConfigError}; use crate::vmm_config::vsock::{VsockConfigError, VsockDeviceConfig}; -use crate::vmm_config::{self, RateLimiterUpdate}; +use crate::vmm_config::{self, HotplugDeviceConfig, RateLimiterUpdate}; /// This enum represents the public interface of the VMM. Each action contains various /// bits of information (ids, paths, etc.). @@ -119,6 +119,9 @@ pub enum VmmAction { /// Set the entropy device using `EntropyDeviceConfig` as input. 
This action can only be called /// before the microVM has booted. SetEntropyDevice(EntropyDeviceConfig), + /// Add a VFIO passthrough device using `VfioConfig` as input. Before boot the device is added + /// to the VM configuration; after boot it is hot-plugged. + InsertVfioDevice(VfioConfig), /// Get the memory hotplug device configuration and status. GetMemoryHotplugStatus, /// Set the memory hotplug device using `MemoryHotplugConfig` as input. This action can only be @@ -153,6 +156,8 @@ pub enum VmmAction { UpdateMachineConfiguration(MachineConfigUpdate), /// Hot-unplug a device. HotUnplugDevice(VirtioDeviceId), + /// Hot-unplug a VFIO device. + HotUnplugVfioDevice(String), } /// Wrapper for all errors associated with VMM actions. @@ -218,6 +223,10 @@ pub enum VmmActionError { PciNotEnabled, /// PCI manager error: {0} PciManager(#[from] PciManagerError), + /// VFIO config error: {0} + VfioConfig(#[from] VfioConfigError), + /// Incompatible device configuration: {0} + IncompatibleDeviceConfiguration(#[from] ResourcesError), } /// The enum represents the response sent by the VMM in case of success. The response is either @@ -498,6 +507,7 @@ impl<'a> PrebootApiController<'a> { StartMicroVm => self.start_microvm(), UpdateMachineConfiguration(config) => self.update_machine_config(config), SetEntropyDevice(config) => self.set_entropy_device(config), + InsertVfioDevice(config) => self.insert_vfio_device(config), SetMemoryHotplugDevice(config) => self.set_memory_hotplug_device(config), // Operations not allowed pre-boot. 
CreateSnapshot(_) @@ -516,6 +526,7 @@ impl<'a> PrebootApiController<'a> { | GetFreePageHintingStatus | StopFreePageHinting | HotUnplugDevice(_) => Err(VmmActionError::OperationNotSupportedPreBoot), + HotUnplugVfioDevice(_) => Err(VmmActionError::OperationNotSupportedPreBoot), #[cfg(target_arch = "x86_64")] SendCtrlAltDel => Err(VmmActionError::OperationNotSupportedPreBoot), } @@ -557,11 +568,16 @@ impl<'a> PrebootApiController<'a> { } fn set_balloon_device(&mut self, cfg: BalloonDeviceConfig) -> Result { + if !self.vm_resources.vfio.configs.is_empty() { + return Err(VmmActionError::IncompatibleDeviceConfiguration( + ResourcesError::VfioWithBalloon, + )); + } self.boot_path = true; self.vm_resources .set_balloon_device(cfg) - .map(|()| VmmData::Empty) - .map_err(VmmActionError::BalloonConfig) + .map_err(VmmActionError::BalloonConfig)?; + Ok(VmmData::Empty) } fn set_boot_source(&mut self, cfg: BootSourceConfig) -> Result { @@ -613,10 +629,40 @@ impl<'a> PrebootApiController<'a> { Ok(VmmData::Empty) } + fn insert_vfio_device(&mut self, cfg: VfioConfig) -> Result { + if !self.vm_resources.pci_enabled { + return Err(VmmActionError::IncompatibleDeviceConfiguration( + ResourcesError::VfioWithoutPci, + )); + } + if self.vm_resources.memory_hotplug.is_some() { + return Err(VmmActionError::IncompatibleDeviceConfiguration( + ResourcesError::VfioWithMemHotplug, + )); + } + if self.vm_resources.balloon.get().is_some() { + return Err(VmmActionError::IncompatibleDeviceConfiguration( + ResourcesError::VfioWithBalloon, + )); + } + + log_dev_preview_warning("VFIO", None); + self.boot_path = true; + self.vm_resources + .set_vfio_device(cfg) + .map_err(VmmActionError::VfioConfig)?; + Ok(VmmData::Empty) + } + fn set_memory_hotplug_device( &mut self, cfg: MemoryHotplugConfig, ) -> Result { + if !self.vm_resources.vfio.configs.is_empty() { + return Err(VmmActionError::IncompatibleDeviceConfiguration( + ResourcesError::VfioWithMemHotplug, + )); + } self.boot_path = true; 
self.vm_resources.set_memory_hotplug_config(cfg)?; Ok(VmmData::Empty) @@ -779,12 +825,24 @@ impl RuntimeApiController { .expect("Poisoned lock") .hotplug_device(HotplugDeviceConfig::Net(config), event_manager) .map(|()| VmmData::Empty), + InsertVfioDevice(config) => self + .vmm + .lock() + .expect("Poisoned lock") + .hotplug_device_vfio(config) + .map(|()| VmmData::Empty), HotUnplugDevice(device_id) => self .vmm .lock() .expect("Poisoned lock") .hot_unplug_device(device_id, event_manager) .map(|()| VmmData::Empty), + HotUnplugVfioDevice(id) => self + .vmm + .lock() + .expect("Poisoned lock") + .hot_unplug_vfio_device(id) + .map(|()| VmmData::Empty), Pause => self.pause(), PutMMDS(value) => mmds_put_data( self.vmm diff --git a/src/vmm/src/utils/mod.rs b/src/vmm/src/utils/mod.rs index 291263e9980..827f949549f 100644 --- a/src/vmm/src/utils/mod.rs +++ b/src/vmm/src/utils/mod.rs @@ -17,6 +17,8 @@ use std::path::Path; use libc::O_NONBLOCK; +use crate::arch::host_page_size; + /// How many bits to left-shift by to convert MiB to bytes const MIB_TO_BYTES_SHIFT: usize = 20; @@ -66,6 +68,25 @@ pub const fn align_down(addr: u64, align: u64) -> u64 { addr & !(align - 1) } +/// Calculate the difference between the current address +/// and the nearest lower page boundary. +pub fn offset_from_lower_host_page(addr: u64) -> u64 { + let align = usize_to_u64(host_page_size()); + addr & (align - 1) +} + +/// Align address up to the host page boundary. +pub fn align_up_host_page(addr: u64) -> u64 { + let align = usize_to_u64(host_page_size()); + (addr + align - 1) & !(align - 1) +} + +/// Align address down to the host page boundary. +pub fn align_down_host_page(addr: u64) -> u64 { + let align = usize_to_u64(host_page_size()); + addr & !(align - 1) +} + /// Create and open a file for both reading and writing to it with a O_NONBLOCK flag. /// In case we open a FIFO, we need all READ, WRITE and O_NONBLOCK in order to not block the process /// if nobody is consuming the message. 
Otherwise opening the FIFO with only WRITE and O_NONBLOCK diff --git a/src/vmm/src/vfio.rs b/src/vmm/src/vfio.rs new file mode 100644 index 00000000000..43b14953b13 --- /dev/null +++ b/src/vmm/src/vfio.rs @@ -0,0 +1,1898 @@ +// Copyright 2026 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +use std::ops::DerefMut; +use std::os::fd::AsRawFd; +use std::path::Path; +use std::sync::{Arc, Barrier, Mutex}; + +use arrayvec::ArrayVec; +use bitflags::bitflags; +use kvm_bindings::{ + kvm_create_device, kvm_device_type_KVM_DEV_TYPE_VFIO, kvm_userspace_memory_region, +}; +use vfio_bindings::bindings::vfio::*; +pub use vfio_ioctls::{ + VfioContainer, VfioDevice, VfioDeviceFd, VfioRegionInfoCap, VfioRegionInfoCapSparseMmap, + VfioRegionSparseMmapArea, +}; +use vm_allocator::{AllocPolicy, RangeInclusive}; +use vm_memory::{GuestMemory, GuestMemoryRegion}; +use vmm_sys_util::eventfd::EventFd; +use zerocopy::IntoBytes; + +use crate::arch::host_page_size; +use crate::logger::{debug, error, trace, warn}; +use crate::pci::configuration::{ + BAR0_REG_IDX, Bars, NUM_BAR_REGS, decode_32_bits_bar_size, decode_64_bits_bar_size, +}; +use crate::pci::msix::{MsixCap, MsixConfig}; +use crate::pci::{PciCapabilityId, PciDevice, PciExpressCapabilityId, PciSBDF}; +use crate::utils::{ + align_down_host_page, align_up_host_page, offset_from_lower_host_page, u64_to_usize, + usize_to_u64, +}; +use crate::vmm_config::vfio::VfioConfig; +use crate::vstate::bus::BusDevice; +use crate::vstate::interrupts::InterruptError; +use crate::vstate::memory::{GuestMemoryMmap, GuestRegionType}; +use crate::vstate::resources::ResourceAllocator; +use crate::vstate::vm::KvmVm; + +// First BAR offset in the PCI config space. +const PCI_CONFIG_BAR_OFFSET: u32 = 0x10; +// Capability register offset in the PCI config space. +const PCI_CONFIG_CAPABILITY_OFFSET: u32 = 0x34; +// Extended capabilities register offset in the PCI config space. 
+const PCI_CONFIG_EXTENDED_CAPABILITY_OFFSET: u16 = 0x100; +// IO BAR when first BAR bit is 1. +const PCI_CONFIG_IO_BAR: u32 = 1 << 0; +// 64-bit memory bar flag. +const PCI_CONFIG_MEMORY_BAR_64BIT: u32 = 1 << 2; +// Prefetchable BAR bit +const PCI_CONFIG_BAR_PREFETCHABLE: u32 = 1 << 3; + +/// VfioError +#[derive(Debug, thiserror::Error, displaydoc::Display)] +pub enum VfioError { + /// Failed to allocate guest address for BAR + BarAllocation, + /// mmap failed + Mmap, + /// Failed to allocate KVM slot + KvmSlot, + /// Failed to set KVM user memory region: {0} + SetUserMemoryRegion(String), + /// Cannot create Msix vector group: {0} + MsixConfig(#[from] InterruptError), + /// Device does not provide MSIx irq + NoMsixIrq, + /// KVM failed to create KVM_DEV_TYPE_VFIO device: {0} + KVMCreateVfioDevice(kvm_ioctls::Error), + /// vfio-ioctls crate error: {0} + VfioIoctls(#[from] vfio_ioctls::VfioError), + /// BAR{0} MSI-X table at offset {1:#x} size {2:#x} does not fit in region of size {3:#x} + MsixTableOutOfRange(u8, u64, u64, u64), + /// BAR{0} MSI-X PBA at offset {1:#x} size {2:#x} does not fit in region of size {3:#x} + MsixPbaOutOfRange(u8, u64, u64, u64), + /// BAR{0} sparse mmap area at offset {1:#x} size {2:#x} does not fit in region of size {3:#x} + SparseMmapAreaOutOfRange(u8, u64, u64, u64), +} + +#[derive(Debug, Clone)] +struct VfioRegionInfo { + pub flags: u32, + pub size: u64, + pub offset: u64, + pub caps: Vec, +} + +/// Mask for specific register in the configuration space +#[derive(Debug)] +pub struct RegisterMask { + /// register + pub register: u16, + /// applied as (R & mask) | value + pub mask: u32, + /// value + pub value: u32, +} + +bitflags! { + /// Type of the hole in the bar. A single hole can contain both + /// the MSI-X table and PBA when their host-page-aligned ranges overlap. 
+ #[derive(Debug, Copy, Clone, PartialEq, Eq)] + pub struct BarHoleInfoUsage: u8 { + /// The hole contains MSIx table + const TABLE = 1 << 0; + /// The hole contains MSIx pba + const PBA = 1 << 1; + } +} + +/// Information about the location of the hole in the bar +#[derive(Debug, Copy, Clone)] +pub struct BarHoleInfo { + /// Guest location of the hole + pub gpa: u64, + /// Size of the hole + pub size: u64, + /// What does the hole contain + pub usage: BarHoleInfoUsage, +} + +/// Information about the bar mapping +#[derive(Debug, Copy, Clone)] +pub struct BarMapping { + /// KVM slot assigned to the mapping + pub slot: u32, + /// Guest physical address + pub iova: u64, + /// Size + pub size: u64, + /// Host virtual address + pub hva: u64, +} + +/// Container for everything MSIx related +#[derive(Debug)] +pub struct MsixState { + /// Register idx where the capability is in the configuration space + pub register: u8, + /// The actual capability (without first 2 bytes) + pub cap: MsixCap, + /// Info about Table and Pba holes + pub bar_hole_infos: ArrayVec, + /// Config + pub config: MsixConfig, +} + +/// The VFIO device bundle +pub struct VfioDeviceBundle { + /// Configuration with which the device was created + pub config: VfioConfig, + /// SBDF of the device in the configuration space + pub sbdf: PciSBDF, + /// Device + pub device: VfioDevice, + /// Information about BARs + pub bars: Bars, + /// DMA mapped BARs + pub bar_mappings: Vec, + /// MSIx state + pub msix_state: MsixState, + /// Masks for configuration space registers + pub masks: Vec, + /// Vm + pub vm: Arc, +} + +impl std::fmt::Debug for VfioDeviceBundle { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("VfioDeviceBundle") + .field("config", &self.config) + .field("sbdf", &self.sbdf) + .finish() + } +} + +macro_rules! 
handle_bar_access { + ($state:expr, $device:expr, $base:expr, $offset:expr, $data:expr, + $table_fn:ident, $pba_fn:ident, $region_method:ident) => {{ + let mut name = "----"; + let mut handled = false; + let data_start = $offset; + let data_end = $offset + $data.len() as u64; + for hole in $state.bar_hole_infos.iter() { + if hole.gpa == $base { + if hole.usage.contains(BarHoleInfoUsage::TABLE) { + let (t_off, t_size) = $state.cap.table_range(); + let t_start = offset_from_lower_host_page(t_off); + let t_end = t_start + t_size; + if t_start <= data_start && data_end <= t_end { + $state.config.$table_fn($offset - t_start, $data); + handled = true; + name = "MsiTable"; + break; + } + // Reject partial overlap with table. + // This should not happen in normal operations, but malicious + // driver can try this. + // In this case it should be fine to ignore the access all together + if data_start < t_end && t_start < data_end { + handled = true; + break; + } + } + + if hole.usage.contains(BarHoleInfoUsage::PBA) { + let (p_off, p_size) = $state.cap.pba_range(); + let p_start = offset_from_lower_host_page(p_off); + let p_end = p_start + p_size; + if p_start <= data_start && data_end <= p_end { + $state.config.$pba_fn($offset - p_start, $data); + handled = true; + name = "PbaTable"; + break; + } + // Reject partial overlap with pba. + // This should not happen in normal operations, but malicious + // driver can try this. 
+ // In this case it should be fine to ignore the access all together + if data_start < p_end && p_start < data_end { + handled = true; + break; + } + } + + let (region_idx, hole_off_in_region) = + if hole.usage.contains(BarHoleInfoUsage::TABLE) { + ( + $state.cap.table_bir(), + align_down_host_page($state.cap.table_offset() as u64), + ) + } else { + ( + $state.cap.pba_bir(), + align_down_host_page($state.cap.pba_offset() as u64), + ) + }; + let in_region_off = hole_off_in_region + $offset; + let region_size = $device.get_region_size(region_idx as u32); + if in_region_off + ($data.len() as u64) <= region_size { + $device.$region_method(region_idx as u32, $data, in_region_off); + } else { + // Again, if access is partially out of the region boundaries + // just ignore it + } + handled = true; + break; + } + } + (name, handled) + }}; +} + +// This should only serve BARs +impl BusDevice for VfioDeviceBundle { + fn read(&mut self, base: u64, offset: u64, data: &mut [u8]) { + let (name, handled) = handle_bar_access!( + self.msix_state, + self.device, + base, + offset, + data, + read_table, + read_pba, + region_read + ); + if !handled { + warn!( + "[{}] BusDevice::read not handled: base: {base:#x} offset: {offset:#x}", + self.config.id + ); + data.fill(0); + } + trace!( + "[{}] base: {base:<#10x} offset: {offset:<#5x} data: {data:<4?} name: {name} handled: \ + {handled}", + self.config.id, + ); + } + + fn write(&mut self, base: u64, offset: u64, data: &[u8]) -> Option> { + let (name, handled) = handle_bar_access!( + self.msix_state, + self.device, + base, + offset, + data, + write_table, + write_pba, + region_write + ); + if !handled { + warn!( + "[{}] BusDevice::write not handled: base: {base:#x} offset: {offset:#x}", + self.config.id + ); + } + trace!( + "[{}] base: {base:<#10x} offset: {offset:<#5x} data: {data:<4?} table_name: {name}, \ + handled: {handled}", + self.config.id + ); + None + } +} + +// This should only serve config space +impl PciDevice for 
VfioDeviceBundle { + fn write_config_register( + &mut self, + reg_idx: u16, + offset: u8, + data: &[u8], + ) -> Option> { + let mut name = "----"; + let mut handled: bool = false; + + if BAR0_REG_IDX <= reg_idx && reg_idx < BAR0_REG_IDX + u16::from(NUM_BAR_REGS) { + // reg_idx is in [BAR0_REG, BAR0_REG+NUM_BAR_REGS), so the difference is 0..5. + #[allow(clippy::cast_possible_truncation)] + let bar_idx = (reg_idx - BAR0_REG_IDX) as u8; + // offset is within a 4-byte PCI config register (0..3). + self.bars.write(bar_idx, offset, data); + name = "BAR"; + handled = true; + } else if reg_idx == u16::from(self.msix_state.register) { + // offset is within a 4-byte PCI config register (0..3). + self.msix_state.config.write_msg_ctl_register(offset, data); + name = "MSIX_CAP"; + // Don't set `handled` since we need to passthrough write + // to the msg_ctl register to the device, so it will enable Msix + // interrupts + } + let config_offset = reg_idx * 4 + u16::from(offset); + if !handled { + self.device + .region_write(VFIO_PCI_CONFIG_REGION_INDEX, data, u64::from(config_offset)); + } + trace!( + "[{}] reg: {reg_idx:>3}({config_offset:>#6x}) data: {data:<4?} name: {name}", + self.config.id + ); + None + } + fn read_config_register(&mut self, reg_idx: u16) -> u32 { + let mut name = "----"; + let config_offset = reg_idx as u64 * 4; + let mut result: u32 = 0; + if BAR0_REG_IDX <= reg_idx && reg_idx < BAR0_REG_IDX + u16::from(NUM_BAR_REGS) { + // reg_idx is in [BAR0_REG, BAR0_REG+NUM_BAR_REGS), so the difference is 0..5. 
+ #[allow(clippy::cast_possible_truncation)] + let bar_idx = (reg_idx - BAR0_REG_IDX) as u8; + self.bars.read(bar_idx, 0, result.as_mut_bytes()); + name = "BAR"; + } else { + self.device.region_read( + VFIO_PCI_CONFIG_REGION_INDEX, + result.as_mut_bytes(), + config_offset, + ); + if reg_idx == u16::from(self.msix_state.register) { + result = (result & !(1 << 31 | 1 << 30)) + | ((self.msix_state.config.enabled as u32) << 31) + | ((self.msix_state.config.masked as u32) << 30); + name = "MSIX_CAP"; + } + for mask in self.masks.iter() { + if mask.register == reg_idx { + result = (result & mask.mask) | mask.value; + name = "MASK"; + break; + } + } + } + trace!( + "[{}] reg: {reg_idx:>3}({config_offset:>#6x}) data: {:<4?} name: {name}", + self.config.id, + result.as_bytes() + ); + result + } +} + +#[allow(clippy::type_complexity)] +fn vfio_device_get_pci_capabilities( + device: &VfioDevice, +) -> Result<(Option<(MsixCap, u8)>, Vec), VfioError> { + let mut next_cap_offset: u8 = 0; + device.region_read( + VFIO_PCI_CONFIG_REGION_INDEX, + next_cap_offset.as_mut_bytes(), + PCI_CONFIG_CAPABILITY_OFFSET as u64, + ); + debug!("PCI CAPS offset: {}", next_cap_offset); + + let mut has_pci_express_cap = false; + let mut msix_cap_and_register = None; + // The legacy region with PCI capis is 256 bytes long and + // split into 4 byte registers. 
+ const LOOP_UPPER_BOUND: u32 = 256 / 4; + let mut loop_bound: u32 = 0; + while next_cap_offset != 0 && loop_bound < LOOP_UPPER_BOUND { + loop_bound += 1; + + let mut cap_id_and_next_ptr: u16 = 0; + device.region_read( + VFIO_PCI_CONFIG_REGION_INDEX, + cap_id_and_next_ptr.as_mut_bytes(), + next_cap_offset as u64, + ); + // clear low 2 bits just in case to get 4 byte aligned address + next_cap_offset &= 0xfc; + + let current_cap_offset = next_cap_offset; + + // 7.5.3.1 PCI Express Capability List Register + // | 2 bytes | 1 byte | 1 byte | + // | Cap register | Capability ID | Next Capability Pointer | + let cap_id: u8 = (cap_id_and_next_ptr & 0xff) as u8; + next_cap_offset = ((cap_id_and_next_ptr & 0xff00) >> 8) as u8; + debug!("PCI CAP id: {cap_id} next offset: {next_cap_offset:#x}"); + + let cap = PciCapabilityId::from(cap_id); + let register = current_cap_offset / 4; + debug!("Found pci cap: {cap:?} at offset: {current_cap_offset:#x}({register})"); + + match cap { + PciCapabilityId::PciExpress => { + has_pci_express_cap = true; + } + PciCapabilityId::MsiX => { + if let Some(irq_info) = device.get_irq_info(VFIO_PCI_MSIX_IRQ_INDEX) { + if irq_info.count != 0 { + // 7.7.2 MSI-X Capability and Table Structure + let mut msg_ctl: u16 = 0; + let mut table: u32 = 0; + let mut pba: u32 = 0; + device.region_read( + VFIO_PCI_CONFIG_REGION_INDEX, + msg_ctl.as_mut_bytes(), + (current_cap_offset as u64) + 2, + ); + device.region_read( + VFIO_PCI_CONFIG_REGION_INDEX, + table.as_mut_bytes(), + (current_cap_offset as u64) + 4, + ); + device.region_read( + VFIO_PCI_CONFIG_REGION_INDEX, + pba.as_mut_bytes(), + (current_cap_offset as u64) + 8, + ); + msix_cap_and_register = Some(( + MsixCap { + msg_ctl, + table, + pba, + }, + register, + )); + } else { + debug!( + "Found MSI-X cap, but the device does not support MSI-X interrupts." 
+ ); + } + } + } + _ => {} + }; + } + + let mut masks = Vec::new(); + if has_pci_express_cap { + let mut next_cap_offset: u16 = PCI_CONFIG_EXTENDED_CAPABILITY_OFFSET; + + // The PCIe region is 4K in size and split into 4 byte registers + const LOOP_UPPER_BOUND: u32 = 4096 / 4; + let mut loop_bound: u32 = 0; + while next_cap_offset != 0 && loop_bound < LOOP_UPPER_BOUND { + loop_bound += 1; + + let mut cap_id_and_next_ptr: u32 = 0; + device.region_read( + VFIO_PCI_CONFIG_REGION_INDEX, + cap_id_and_next_ptr.as_mut_bytes(), + next_cap_offset as u64, + ); + // clear low 2 bits just in case to get 4 byte aligned address + next_cap_offset &= 0xfffc; + + let current_cap_offset = next_cap_offset; + + // 7.7.3.1 Secondary PCI Express Extended Capability Header + // | 31-20 | 19-16 | 15-0 | + // | Next capability offset | Capability Version | PCIe Capability ID | + let cap_id: u16 = (cap_id_and_next_ptr & 0xffff) as u16; + next_cap_offset = (cap_id_and_next_ptr >> 20) as u16; + + let pci_cap = PciExpressCapabilityId::from(cap_id); + let register = current_cap_offset / 4; + debug!( + "Found pci ext cap: {pci_cap:?} cap at offset: {current_cap_offset:#x}({register})" + ); + + // Find registers which contain the headers of PCIe caps we want to filter out of + // the capability list. The "filtering" is done by changing the "PCI Express Cap ID" + // part of the register (the first byte) to 0. 0 represents the "null" capability in + // the PCIe spec. The actual chain of capabilities is not broken by this action. When + // guest driver encounters this capability it just jumps to the next one since the + // "Next Cap Pointer" (second byte) is intact. + // + // NOTE: the list of capabilities is hardcoded for now. In the future this + // may be configurable from the user side. 
+ match pci_cap { + PciExpressCapabilityId::AlternativeRoutingIdentificationInterpretation + | PciExpressCapabilityId::ResizeableBar + | PciExpressCapabilityId::SingleRootIoVirtualization => { + debug!( + "Found cap to be masked at register: {register}({current_cap_offset:#x})" + ); + masks.push(RegisterMask { + register, + mask: 0xffff_0000, + value: 0x0000_0000, + }) + } + _ => {} + } + } + } + Ok((msix_cap_and_register, masks)) +} + +fn vfio_device_get_single_bar_info( + device: &VfioDevice, + bar_idx: u8, +) -> Result<(u32, u32), VfioError> { + // 7.5.1.2.1 Base Address Registers + // IMPLEMENTATION NOTE: SIZING A 32-BIT BASE ADDRESS REGISTER + let bar_offset = u64::from(PCI_CONFIG_BAR_OFFSET) + u64::from(bar_idx) * 4; + let mut value: u32 = 0; + let mut size: u32 = 0; + device.region_read( + VFIO_PCI_CONFIG_REGION_INDEX, + value.as_mut_bytes(), + bar_offset, + ); + device.region_write( + VFIO_PCI_CONFIG_REGION_INDEX, + 0xffff_ffff_u32.as_bytes(), + bar_offset, + ); + device.region_read( + VFIO_PCI_CONFIG_REGION_INDEX, + size.as_mut_bytes(), + bar_offset, + ); + device.region_write(VFIO_PCI_CONFIG_REGION_INDEX, value.as_bytes(), bar_offset); + Ok((value, size)) +} + +fn vfio_device_allocate_bars( + device: &VfioDevice, + resource_allocator: &mut ResourceAllocator, +) -> Result { + let mut bars = Bars::default(); + let mut bar_idx = 0; + while bar_idx < NUM_BAR_REGS { + let (bar_info, mut lower_size) = vfio_device_get_single_bar_info(device, bar_idx)?; + + let is_io_bar = bar_info & PCI_CONFIG_IO_BAR != 0; + let is_64_bits = bar_info & PCI_CONFIG_MEMORY_BAR_64BIT != 0; + let is_prefetchable = bar_info & PCI_CONFIG_BAR_PREFETCHABLE != 0; + + if is_64_bits && bar_idx == NUM_BAR_REGS - 1 { + warn!("BAR{bar_idx} is last BAR but marked as 64bit. 
Skipping"); + break; + } + + let size = if is_io_bar { + lower_size &= !0b11; + u64::from(decode_32_bits_bar_size(lower_size)) + } else if !is_64_bits { + lower_size &= !0b1111; + u64::from(decode_32_bits_bar_size(lower_size)) + } else { + lower_size &= !0b1111; + let (_, upper_size) = vfio_device_get_single_bar_info(device, bar_idx + 1)?; + decode_64_bits_bar_size(upper_size, lower_size) + }; + if size != 0 { + fn calculate_alignment(size: u64) -> u64 { + // 7.5.1.2.1 Base Address Registers + // This design implies that all address spaces used are a power of two + // in size and are naturally aligned. + let alignment = std::cmp::max(host_page_size(), 1 << size.trailing_zeros()); + usize_to_u64(alignment) + } + + let idx = bar_idx; + let gpa; + if is_io_bar { + warn!( + "BAR{bar_idx} size: {size:>#10x} io_bar: {is_io_bar} 64bits: {is_64_bits} \ + prefetchable: {is_prefetchable} Skipping IO BAR" + ); + bar_idx += 1; + continue; + } else if is_64_bits { + let alignment = calculate_alignment(size); + gpa = resource_allocator + .mmio64_memory + .allocate(size, alignment, AllocPolicy::FirstMatch) + .map_err(|_| VfioError::BarAllocation)? + .start(); + bars.set_bar_64(idx, gpa, size, is_prefetchable.into()); + } else { + let alignment = calculate_alignment(size); + gpa = resource_allocator + .mmio32_memory + .allocate(size, alignment, AllocPolicy::FirstMatch) + .map_err(|_| VfioError::BarAllocation)? 
+ .start(); + assert!(gpa < u64::from(u32::MAX)); + assert!(size < u64::from(u32::MAX)); + #[allow(clippy::cast_possible_truncation)] + let gpa = gpa as u32; + #[allow(clippy::cast_possible_truncation)] + let size = size as u32; + bars.set_bar_32(idx, gpa, size, is_prefetchable.into()); + } + debug!( + "BAR{bar_idx} gpa: [{:#x}..{:#x}] size: {size:>#10x} io_bar: {is_io_bar} 64bits: \ + {is_64_bits} prefetchable: {is_prefetchable}", + gpa, + gpa + size + ); + } else { + debug!( + "BAR{bar_idx} size: {size:>#10x} io_bar: {is_io_bar} 64bits: {is_64_bits} \ + prefetchable: {is_prefetchable}" + ); + } + if is_64_bits { + bar_idx += 1; + } + bar_idx += 1; + } + Ok(bars) +} + +fn deallocate_bars(resource_allocator: &mut ResourceAllocator, bars: &Bars) { + let mut bar_idx = 0; + while bar_idx < NUM_BAR_REGS { + if bars.bars[bar_idx as usize].used() { + let start = bars.get_bar_addr(bar_idx); + let size = bars.get_bar_size(bar_idx); + let range = RangeInclusive::new(start, start + size - 1).unwrap(); + if bars.bars[bar_idx as usize].is_64bit() { + resource_allocator.mmio64_memory.free(&range).unwrap(); + bar_idx += 2; + } else { + resource_allocator.mmio32_memory.free(&range).unwrap(); + bar_idx += 1; + } + } else { + bar_idx += 1; + } + } +} + +/// Intermediate type to store areas needed to be mmaped for the device +#[derive(Debug, Clone, Copy)] +struct BarArea { + /// BAR gpa + bar_gpa: u64, + /// Offset into VFIO region + region_offset: u64, + /// Offset within BAR + offset: u64, + /// Size + size: u64, + /// Prot flags + prot: i32, +} + +/// Calculate areas needed to be mmaped for the device BARs including any BAR holes caused +/// by MSI-X table/pba +fn calculate_bar_areas( + bars: &Bars, + region_infos: &[VfioRegionInfo], + msix_cap: Option<&MsixCap>, +) -> Result<(Vec, ArrayVec), VfioError> { + // There are 6 BARs with maximum of 2 holes in one or two of them + // The only reasons to use Vec instead of ArrayVec here is because this vector can be populated + // from 
the `sparse_mmap_cap` which can contiains different number of areas. + // But in any case the size here is limited by the `nr_areas` field in the + // `vfio_region_info_cap_sparse_mmap` struct. This field has type of `u32`. + let mut areas = Vec::with_capacity(8); + let mut bar_hole_infos = ArrayVec::::new(); + let mut bar_idx: u8 = 0; + while bar_idx < NUM_BAR_REGS { + let bar_gpa = bars.get_bar_addr(bar_idx); + if bar_gpa != 0 { + let region_info = ®ion_infos[bar_idx as usize]; + let mut has_msix_mappable = false; + let mut sparse_mmap_cap = None; + for cap in region_info.caps.iter() { + match cap { + VfioRegionInfoCap::SparseMmap(cap) => sparse_mmap_cap = Some(cap), + VfioRegionInfoCap::MsixMappable => has_msix_mappable = true, + _ => {} + } + } + let mut contain_msix_table: bool = false; + let mut msix_table_offset = 0; + let mut msix_table_size = 0; + + let mut contain_msix_pba: bool = false; + let mut msix_pba_offset = 0; + let mut msix_pba_size = 0; + + if let Some(msix_cap) = msix_cap { + contain_msix_table = bar_idx == msix_cap.table_bir(); + if contain_msix_table { + let (offset, size) = msix_cap.table_range(); + let offset_in_hole = offset_from_lower_host_page(offset); + + msix_table_offset = align_down_host_page(offset); + msix_table_size = align_up_host_page(offset_in_hole + size); + + if msix_table_offset + .checked_add(msix_table_size) + .is_none_or(|end| end > region_info.size) + { + return Err(VfioError::MsixTableOutOfRange( + bar_idx, + msix_table_offset, + msix_table_size, + region_info.size, + )); + } + + debug!( + "BAR{} msix_table hole: [{:#x}..{:#x}] actual table: [{:#x} ..{:#x}]", + bar_idx, + bar_gpa + msix_table_offset, + bar_gpa + msix_table_offset + msix_table_size, + bar_gpa + offset_in_hole, + bar_gpa + offset_in_hole + size, + ); + + let info = BarHoleInfo { + gpa: bar_gpa + msix_table_offset, + size: msix_table_size, + usage: BarHoleInfoUsage::TABLE, + }; + bar_hole_infos.push(info); + } + + contain_msix_pba = bar_idx == 
msix_cap.pba_bir(); + if contain_msix_pba { + let (offset, size) = msix_cap.pba_range(); + let offset_in_hole = offset_from_lower_host_page(offset); + + msix_pba_offset = align_down_host_page(offset); + msix_pba_size = align_up_host_page(offset_in_hole + size); + + if msix_pba_offset + .checked_add(msix_pba_size) + .is_none_or(|end| end > region_info.size) + { + return Err(VfioError::MsixPbaOutOfRange( + bar_idx, + msix_pba_offset, + msix_pba_size, + region_info.size, + )); + } + + debug!( + "BAR{} pba_table hole: [{:#x} ..{:#x}] actual table: [{:#x} ..{:#x}]", + bar_idx, + bar_gpa + msix_pba_offset, + bar_gpa + msix_pba_offset + msix_pba_size, + bar_gpa + offset_in_hole, + bar_gpa + offset_in_hole + size, + ); + + let pba_gpa = bar_gpa + msix_pba_offset; + // The table hole, if present, was just pushed above, so + // the PBA hole can only coincide with the last entry. + // Merge into it so we don't register the same MMIO range + // twice. + if let Some(last) = bar_hole_infos.last_mut() + && last.gpa == pba_gpa + { + last.usage |= BarHoleInfoUsage::PBA; + // In case PBA table is weirdly located at the page boundary which forces + // the size to become 2 pages instead of 1, just extend the region + last.size = last.size.max(msix_pba_size); + } else { + let info = BarHoleInfo { + gpa: pba_gpa, + size: msix_pba_size, + usage: BarHoleInfoUsage::PBA, + }; + bar_hole_infos.push(info); + } + } + } + + if (contain_msix_table || contain_msix_pba) + && !has_msix_mappable + && sparse_mmap_cap.is_none() + { + debug!( + "BAR{} contains msix_table: {} msix_pba: {}, but mappable is {} and \ + sparse_mmap_cap is {}. 
Skipping", + bar_idx, + contain_msix_table, + contain_msix_pba, + has_msix_mappable, + sparse_mmap_cap.is_some() + ); + } else { + let can_mmap = region_info.flags & VFIO_REGION_INFO_FLAG_MMAP != 0; + if can_mmap || sparse_mmap_cap.is_some() { + let mut prot = 0; + if region_info.flags & VFIO_REGION_INFO_FLAG_READ != 0 { + prot |= libc::PROT_READ; + } + if region_info.flags & VFIO_REGION_INFO_FLAG_WRITE != 0 { + prot |= libc::PROT_WRITE; + } + let region_size = region_info.size; + + if let Some(cap) = sparse_mmap_cap { + for area in cap.areas.iter() { + if area + .offset + .checked_add(area.size) + .is_none_or(|end| end > region_size) + { + return Err(VfioError::SparseMmapAreaOutOfRange( + bar_idx, + area.offset, + area.size, + region_size, + )); + } + areas.push(BarArea { + bar_gpa, + region_offset: region_info.offset, + offset: area.offset, + size: area.size, + prot, + }); + } + } else if has_msix_mappable { + let mut first_gap_offset = msix_table_offset; + let mut first_gap_size = msix_table_size; + let mut second_gap_offset = msix_pba_offset; + let mut second_gap_size = msix_pba_size; + if second_gap_offset < first_gap_offset { + second_gap_offset = msix_table_offset; + second_gap_size = msix_table_size; + first_gap_offset = msix_pba_offset; + first_gap_size = msix_pba_size; + } + let mut offset = 0; + if first_gap_size != 0 { + let area_size = first_gap_offset - offset; + if area_size != 0 { + areas.push(BarArea { + bar_gpa, + region_offset: region_info.offset, + offset, + size: area_size, + prot, + }); + } + offset = first_gap_offset + first_gap_size; + } + if second_gap_size != 0 { + if offset < second_gap_offset { + let area_size = second_gap_offset - offset; + if area_size != 0 { + areas.push(BarArea { + bar_gpa, + region_offset: region_info.offset, + offset, + size: area_size, + prot, + }); + } + } + offset = offset.max(second_gap_offset + second_gap_size); + } + let area_size = region_size - offset; + if area_size != 0 { + areas.push(BarArea { + bar_gpa, 
+ region_offset: region_info.offset, + offset, + size: area_size, + prot, + }); + } + } else { + areas.push(BarArea { + bar_gpa, + region_offset: region_info.offset, + offset: 0, + size: region_size, + prot, + }); + } + } + } + } + if bars.bars[bar_idx as usize].is_64bit() { + bar_idx += 1; + } + bar_idx += 1; + } + Ok((areas, bar_hole_infos)) +} + +/// Establish DMA mapping of the Dram region of the guest memory with the vfio container +pub fn dma_map_guest_memory( + container: &VfioContainer, + guest_memory: &GuestMemoryMmap, +) -> Result<(), VfioError> { + for (i, region) in guest_memory.iter().enumerate() { + if region.region_type == GuestRegionType::Dram { + let region = ®ion.inner; + let hva = region.as_ptr(); + let iova = region.start_addr().0; + let size = region.size(); + debug!( + "DMA map guest memory: [{:#x}..{:#x}]", + iova, + iova + size as u64 + ); + // SAFETY: all arguments are from the existing guest memory region + // After this operation, virtual memory will have a pinned physical pages backing it + if let Err(e) = unsafe { container.vfio_dma_map(iova, size, hva) } { + // Try to remove DMA mapping if anything fails. If unmap also fails, just log it + // since there is nothing we can do about it. 
+ // Since the failed region is at index 'i', we only care about [0..i) regions + for region in guest_memory.iter().take(i) { + if region.region_type == GuestRegionType::Dram { + let iova = region.start_addr().0; + let size = region.size(); + if let Err(ee) = container.vfio_dma_unmap(iova, size) { + error!("Failed to unmap DMA from guest memory: {ee}"); + } + } + } + return Err(VfioError::VfioIoctls(e)); + } + } + } + Ok(()) +} + +/// Tear down DMA mapping of the Dram guest memory from the vfio container +pub fn dma_unmap_guest_memory(container: &VfioContainer, guest_memory: &GuestMemoryMmap) { + for region in guest_memory.iter() { + if region.region_type == GuestRegionType::Dram { + let iova = region.start_addr().0; + let size = region.size(); + if let Err(ee) = container.vfio_dma_unmap(iova, size) { + error!("Failed to unmap DMA from guest memory: {ee}"); + } + } + } +} + +fn map_bar_mapping( + container: &VfioContainer, + device: &VfioDevice, + vm: &KvmVm, + area: &BarArea, + slot: u32, +) -> Result { + // SAFETY: FFI call to mmap with valid fd and offset. The returned pointer is checked + // against MAP_FAILED before use. + let hva_ptr = unsafe { + libc::mmap( + std::ptr::null_mut(), + #[allow(clippy::cast_possible_truncation)] + { + area.size as usize + }, + area.prot, + libc::MAP_SHARED, + device.as_raw_fd(), + #[allow(clippy::cast_possible_wrap)] + { + (area.region_offset + area.offset) as i64 + }, + ) + }; + + if hva_ptr == libc::MAP_FAILED { + return Err(VfioError::Mmap); + } + + let iova = area.bar_gpa + area.offset; + let size = area.size; + let hva = hva_ptr as u64; + + let kvm_memory_region = kvm_userspace_memory_region { + slot, + flags: 0, + guest_phys_addr: iova, + memory_size: size, + userspace_addr: hva, + }; + if let Err(e) = vm.set_user_memory_region(kvm_memory_region) { + // SAFETY: hva_ptr was returned by a successful mmap call above with the given size. 
+ let r = unsafe { libc::munmap(hva_ptr.cast(), u64_to_usize(size)) }; + if r < 0 { + error!( + "Error on unmapping host memory on VFIO device creation failure: {r:?}. \ + Continuing with other regions removal." + ); + } + return Err(VfioError::SetUserMemoryRegion(e.to_string())); + } + + // NOTE: the `vfio_dma_map` always maps with `VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE` + // which does not respect the `region_info.flags`/`area.prot`. + if let Err(e) = + // SAFETY: hva_ptr was returned by a successful mmap call with the given size. + unsafe { container.vfio_dma_map(iova, u64_to_usize(size), hva_ptr.cast::()) } + { + let kvm_memory_region = kvm_userspace_memory_region { + slot, + flags: 0, + guest_phys_addr: iova, + memory_size: 0, + userspace_addr: hva, + }; + if let Err(ee) = vm.set_user_memory_region(kvm_memory_region) { + error!( + "Error on removing KVM region on VFIO device creation failure: {ee:?}. Continuing \ + with other regions removal." + ); + } + // SAFETY: hva_ptr was returned by a successful mmap call with the given size. + let r = unsafe { libc::munmap(hva_ptr.cast(), u64_to_usize(size)) }; + if r < 0 { + error!( + "Error on unmapping host memory on VFIO device creation failure: {r:?}. \ + Continuing with other regions removal." + ); + } + return Err(e.into()); + } + Ok(BarMapping { + slot, + iova, + size, + hva, + }) +} + +fn unmap_bar_mapping(container: &VfioContainer, vm: &KvmVm, mapping: &BarMapping) { + let kvm_memory_region = kvm_userspace_memory_region { + slot: mapping.slot, + flags: 0, + guest_phys_addr: mapping.iova, + memory_size: 0, + userspace_addr: mapping.hva, + }; + if let Err(ee) = vm.set_user_memory_region(kvm_memory_region) { + error!( + "Error on removing KVM region on VFIO device creation failure: {ee:?}. Continuing \ + with other regions removal." 
+ ); + } + + if let Err(ee) = container.vfio_dma_unmap(mapping.iova, u64_to_usize(mapping.size)) { + error!( + "Error on unmapping DMA region on VFIO device creation failure: {ee:?}. Continuing \ + with other regions removal." + ); + } + + // SAFETY: host_addr was obtained from a successful mmap call with the given size. + let r = unsafe { libc::munmap(mapping.hva as *mut libc::c_void, u64_to_usize(mapping.size)) }; + if r < 0 { + error!( + "Error on unmapping host memory on VFIO device creation failure: {r:?}. Continuing \ + with other regions removal." + ); + } +} + +// There is no direct access to `regions` in `VfioDevice`, so need to work around this +fn extract_bar_region_infos(device: &VfioDevice) -> Vec { + (0..NUM_BAR_REGS as u32) + .map(|i| VfioRegionInfo { + flags: device.get_region_flags(i), + size: device.get_region_size(i), + offset: device.get_region_offset(i), + caps: device.get_region_caps(i), + }) + .collect() +} + +/// Create KVM_DEV_TYPE_VFIO device +fn create_kvm_vfio_device(vm: &KvmVm) -> Result { + let mut vfio_dev = kvm_create_device { + type_: kvm_device_type_KVM_DEV_TYPE_VFIO, + fd: 0, + flags: 0, + }; + vm.fd() + .create_device(&mut vfio_dev) + .map_err(VfioError::KVMCreateVfioDevice) +} + +/// Create a VfioContainer wrapper around both KVM vfio device and VFIO container +pub fn init_kvm_vfio_device_and_vfio_container( + vm: &KvmVm, +) -> Result, VfioError> { + let kvm_device_fd = create_kvm_vfio_device(vm)?; + let device_fd = VfioDeviceFd::new_from_kvm(kvm_device_fd); + let container = VfioContainer::new(Some(Arc::new(device_fd)))?; + Ok(Arc::new(container)) +} + +#[allow(clippy::type_complexity)] +fn prepare_vfio_device( + container: &Arc, + vm: &Arc, + sysfs_path: &Path, + sbdf: PciSBDF, +) -> Result< + ( + VfioDevice, + Bars, + Vec, + MsixState, + Vec, + ), + VfioError, +> { + let device = VfioDevice::new( + sysfs_path, + container.clone() as Arc, + )?; + device.reset(); + + let (msix_cap_and_register, masks) = 
vfio_device_get_pci_capabilities(&device)?; + + // Only devices with MSI-X cap and irqs are supported + let Some((msix_cap, msix_register)) = msix_cap_and_register else { + return Err(VfioError::NoMsixIrq); + }; + let Some(msix_irq_info) = device.get_irq_info(VFIO_PCI_MSIX_IRQ_INDEX) else { + return Err(VfioError::NoMsixIrq); + }; + + // SAFETY: maximum msix table size is 1 << 11 = 2048 (it has 11 bits in the control register + // and encoded as N - 1) + // This fits into u16 without issues + #[allow(clippy::cast_possible_truncation)] + let msix_num = msix_irq_info.count as u16; + let msix_vectors = + KvmVm::create_msix_group(vm.clone(), msix_num).map_err(VfioError::MsixConfig)?; + let msix_config = MsixConfig::new(Arc::new(msix_vectors), sbdf); + + // We set VFIO irqs here on device setup. There is no reason to add additional tracking + // for driver MSIx configuration since those are handled by the MsixState. + // If anything after this call fails, we don't need to do anything since the kernel will + // clean up these irqs when `device` file is closed. + let fds: Vec<&EventFd> = msix_config + .vectors + .vectors + .iter() + .map(|v| &v.event_fd) + .collect(); + device.enable_msix(fds)?; + + let bars = { + let mut resource_allocator_lock = vm.resource_allocator(); + let resource_allocator = resource_allocator_lock.deref_mut(); + vfio_device_allocate_bars(&device, resource_allocator)?
+ }; + + let bar_region_infos = extract_bar_region_infos(&device); + let (areas, bar_hole_infos) = match calculate_bar_areas( + &bars, + &bar_region_infos, + msix_cap_and_register.as_ref().map(|(v, _)| v), + ) { + Ok(v) => v, + Err(e) => { + let mut resource_allocator_lock = vm.resource_allocator(); + let resource_allocator = resource_allocator_lock.deref_mut(); + deallocate_bars(resource_allocator, &bars); + return Err(e); + } + }; + let first_area_slot = match vm.next_kvm_slot( + // SAFETY: areas.len() is bound to fit in u32 + #[allow(clippy::cast_possible_truncation)] + { + areas.len() as u32 + }, + ) { + Some(v) => v, + None => { + let mut resource_allocator_lock = vm.resource_allocator(); + let resource_allocator = resource_allocator_lock.deref_mut(); + deallocate_bars(resource_allocator, &bars); + return Err(VfioError::KvmSlot); + } + }; + + // Same as with areas, usually there only should be 6 BARs and one of them can be split into 3 + // regions + let mut bar_mappings = Vec::with_capacity(8); + for (i, area) in areas.iter().enumerate() { + match map_bar_mapping( + container, + &device, + vm.as_ref(), + area, + first_area_slot + { + // TODO i can fit into u32, but the sum might not. But the probability that this + // should ever happen is 0, so this should be ok.
+ #[allow(clippy::cast_possible_truncation)] + { + i as u32 + } + }, + ) { + Ok(mapping) => { + debug!( + "BAR area{} kvm gpa: [{:#x} ..{:#x}]", + i, + mapping.iova, + mapping.iova + mapping.size + ); + bar_mappings.push(mapping); + } + Err(e) => { + let mut resource_allocator_lock = vm.resource_allocator(); + let resource_allocator = resource_allocator_lock.deref_mut(); + deallocate_bars(resource_allocator, &bars); + + for mapping in bar_mappings.iter() { + unmap_bar_mapping(container, vm.as_ref(), mapping); + } + return Err(e); + } + } + } + + let msix_state = MsixState { + register: msix_register, + cap: msix_cap, + bar_hole_infos, + config: msix_config, + }; + Ok((device, bars, bar_mappings, msix_state, masks)) +} + +/// This will open a VFIO device, attach its group both to the KVM VFIO device and to the VFIO +/// container. It will set up MSIx irqs and BAR DMAs. +pub fn init_vfio_device( + container: &Arc, + vm: &Arc, + config: VfioConfig, + sbdf: PciSBDF, +) -> Result>, VfioError> { + let sysfs_path = config.sbdf.sysfs_path(); + debug!("Opening device at path: {}", sysfs_path); + let (device, bars, bar_mappings, msix_state, masks) = + prepare_vfio_device(container, vm, Path::new(&sysfs_path), sbdf)?; + + let vfio_device_bundle = Arc::new(Mutex::new(VfioDeviceBundle { + config, + sbdf, + device, + bars, + bar_mappings, + msix_state, + masks, + vm: vm.clone(), + })); + + for hole in vfio_device_bundle + .lock() + .unwrap() + .msix_state + .bar_hole_infos + .iter() + { + vm.common + .mmio_bus + .insert(vfio_device_bundle.clone(), hole.gpa, hole.size) + // SAFETY: the hole gpa and size were allocated from internal allocator. We must never + // receive overlapping regions from it.
+ .unwrap(); + } + Ok(vfio_device_bundle) +} + +/// Performs cleanup of all VFIO device resources allocated by `init_vfio_device` +pub fn deinit_vfio_device(container: &Arc, vm: &KvmVm, device: &VfioDeviceBundle) { + for hole in device.msix_state.bar_hole_infos.iter() { + vm.common.mmio_bus.remove(hole.gpa, hole.size).unwrap(); + } + for mapping in device.bar_mappings.iter() { + unmap_bar_mapping(container, vm, mapping); + } + + device.device.reset(); + + let mut resource_allocator_lock = vm.resource_allocator(); + let resource_allocator = resource_allocator_lock.deref_mut(); + deallocate_bars(resource_allocator, &device.bars); +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::pci::configuration::BarPrefetchable; + + fn make_region(size: u64, caps: Vec) -> VfioRegionInfo { + let flags = if size != 0 { + VFIO_REGION_INFO_FLAG_READ | VFIO_REGION_INFO_FLAG_WRITE | VFIO_REGION_INFO_FLAG_MMAP + } else { + 0 + }; + VfioRegionInfo { + flags, + size, + offset: 0, + caps, + } + } + + #[test] + fn test_calculate_bar_areas_no_msix() { + let mut bars = Bars::default(); + bars.set_bar_64(0, 0x4000_0000_0000, 0x10_0000, BarPrefetchable::No); + let region_infos = [make_region(0x10_0000, vec![])]; + + let (areas, holes) = calculate_bar_areas(&bars, ®ion_infos, None).unwrap(); + assert_eq!(areas.len(), 1); + assert_eq!(areas[0].bar_gpa, 0x4000_0000_0000); + assert_eq!(areas[0].size, 0x10_0000); + assert_eq!(areas[0].offset, 0); + assert!(holes.is_empty()); + } + + #[test] + fn test_calculate_bar_areas_msix_table_and_pba_different_bars() { + let mut bars = Bars::default(); + bars.set_bar_64(0, 0x4000_0000_0000, 0x10_0000, BarPrefetchable::No); + bars.set_bar_64(2, 0x4000_0010_0000, 0x1_0000, BarPrefetchable::No); + + let region_infos = [ + make_region(0x10_0000, vec![VfioRegionInfoCap::MsixMappable]), + // BAR 0 is 64-bit so slot 1 is its high half and is never indexed. 
+ make_region(0, vec![]), + make_region(0x1_0000, vec![VfioRegionInfoCap::MsixMappable]), + ]; + + let msix_cap = MsixCap::new(0, 32, 0, 2, 0); + + let (areas, holes) = calculate_bar_areas(&bars, ®ion_infos, Some(&msix_cap)).unwrap(); + assert_eq!(holes.len(), 2); + assert!(!areas.is_empty()); + } + + #[test] + fn test_calculate_bar_areas_msix_table_and_pba_same_bar() { + let mut bars = Bars::default(); + bars.set_bar_64(0, 0x4000_0000_0000, 0x10_0000, BarPrefetchable::No); + + let region_infos = [make_region( + 0x10_0000, + vec![VfioRegionInfoCap::MsixMappable], + )]; + + let msix_cap = MsixCap::new(0, 32, 0, 0, 0x1000); + + let (areas, holes) = calculate_bar_areas(&bars, ®ion_infos, Some(&msix_cap)).unwrap(); + assert_eq!(holes.len(), 2); + // Table hole [0x0..0x1000), PBA hole [0x1000..0x2000), one area after both. + assert_eq!(areas.len(), 1); + assert_eq!(areas[0].offset, 0x2000); + assert_eq!(areas[0].size, 0x10_0000 - 0x2000); + } + + #[test] + fn test_calculate_bar_areas_sparse_mmap() { + let mut bars = Bars::default(); + bars.set_bar_64(0, 0x4000_0000_0000, 0x10_0000, BarPrefetchable::No); + + let sparse_areas = vec![ + VfioRegionSparseMmapArea { + offset: 0, + size: 0x8_0000, + }, + VfioRegionSparseMmapArea { + offset: 0xC_0000, + size: 0x4_0000, + }, + ]; + let region_infos = [make_region( + 0x10_0000, + vec![VfioRegionInfoCap::SparseMmap(VfioRegionInfoCapSparseMmap { + areas: sparse_areas, + })], + )]; + + let msix_cap = MsixCap::new(0, 32, 0x8_0000, 0, 0xB_0000); + + let (areas, _holes) = calculate_bar_areas(&bars, ®ion_infos, Some(&msix_cap)).unwrap(); + assert_eq!(areas.len(), 2); + assert_eq!(areas[0].offset, 0); + assert_eq!(areas[0].size, 0x8_0000); + assert_eq!(areas[1].offset, 0xC_0000); + assert_eq!(areas[1].size, 0x4_0000); + } + + #[test] + fn test_calculate_bar_areas_zero_size_bar() { + let bars = Bars::default(); + let region_infos: [VfioRegionInfo; 0] = []; + + let (areas, holes) = calculate_bar_areas(&bars, ®ion_infos, None).unwrap(); + 
assert!(areas.is_empty()); + assert!(holes.is_empty()); + } + + #[test] + fn test_calculate_bar_areas_overlapping_msix_holes() { + let mut bars = Bars::default(); + bars.set_bar_64(0, 0x4000_0000_0000, 0x10_0000, BarPrefetchable::No); + + let region_infos = [make_region( + 0x10_0000, + vec![VfioRegionInfoCap::MsixMappable], + )]; + + // Both tables create the same hole [0x0..0x1000) + let msix_cap = MsixCap::new(0, 32, 0x0, 0, 0x200); + let (areas, holes) = calculate_bar_areas(&bars, ®ion_infos, Some(&msix_cap)).unwrap(); + + assert_eq!(areas.len(), 1); + assert_eq!(areas[0].offset, 0x1000); + assert_eq!(areas[0].size, 0x10_0000 - 0x1000); + + assert_eq!(holes.len(), 1); + assert_eq!( + holes[0].usage, + BarHoleInfoUsage::TABLE | BarHoleInfoUsage::PBA + ); + } + + /// Table and PBA share the same starting page (so the same hole `gpa`), + /// but the PBA contents straddle the page boundary, so its host-page- + /// aligned size is larger than the table's. The two holes must merge + /// into one - taking the larger size - otherwise `init_vfio_device` + /// would panic on `mmio_bus.insert` for an overlapping range. + #[test] + fn test_calculate_bar_areas_same_gpa_different_size_msix_holes() { + let mut bars = Bars::default(); + bars.set_bar_64(0, 0x4000_0000_0000, 0x10_0000, BarPrefetchable::No); + + let region_infos = [make_region( + 0x10_0000, + vec![VfioRegionInfoCap::MsixMappable], + )]; + + // table at offset 0, 128 entries (0x800 bytes) -> hole [0x0, 0x1000) + // PBA at offset 0xff8, 16 bytes -> straddles 0x1000 -> hole [0x0, 0x2000) + // Same gpa (bar_gpa + 0), different sizes. + let msix_cap = MsixCap::new(0, 128, 0, 0, 0xff8); + + let (areas, holes) = calculate_bar_areas(&bars, ®ion_infos, Some(&msix_cap)).unwrap(); + + // One merged hole, sized to the larger of the two (PBA's 0x2000). 
+ assert_eq!(holes.len(), 1); + assert_eq!(holes[0].gpa, 0x4000_0000_0000); + assert_eq!(holes[0].size, 0x2000); + assert_eq!( + holes[0].usage, + BarHoleInfoUsage::TABLE | BarHoleInfoUsage::PBA + ); + + // One area covering the BAR after the merged hole. + assert_eq!(areas.len(), 1); + assert_eq!(areas[0].offset, 0x2000); + assert_eq!(areas[0].size, 0x10_0000 - 0x2000); + } + + /// MSI-X table claimed at the very end of the BAR. The page-aligned hole + /// would extend past `region_size` and the gap arithmetic would underflow. + /// `calculate_bar_areas` must reject this with `MsixTableOutOfRange` + /// instead of proceeding with corrupted offsets. + #[test] + fn test_calculate_bar_areas_msix_table_past_region_end() { + let mut bars = Bars::default(); + bars.set_bar_64(0, 0x4000_0000_0000, 0x4000, BarPrefetchable::No); + + let region_infos = [make_region(0x4000, vec![VfioRegionInfoCap::MsixMappable])]; + + // table at offset 0x3ff8, 1 entry (16 bytes). The page-aligned hole is + // [0x3000, 0x5000) (offset 0x3000, size 0x2000), whose end 0x5000 is + // past the BAR's region_size of 0x4000. + let msix_cap = MsixCap::new(0, 1, 0x3ff8, 0, 0); + + let err = calculate_bar_areas(&bars, ®ion_infos, Some(&msix_cap)).unwrap_err(); + assert!(matches!( + err, + VfioError::MsixTableOutOfRange(0, 0x3000, 0x2000, 0x4000) + )); + } + + /// MSI-X PBA claimed past the end of the BAR - same underflow risk as the + /// table case but on the PBA path. + #[test] + fn test_calculate_bar_areas_msix_pba_past_region_end() { + let mut bars = Bars::default(); + bars.set_bar_64(0, 0x4000_0000_0000, 0x4000, BarPrefetchable::No); + + let region_infos = [make_region(0x4000, vec![VfioRegionInfoCap::MsixMappable])]; + + // PBA at offset 0x4000 (= region_size), 1 entry. The page-aligned hole is + // [0x4000, 0x5000) (offset 0x4000, size 0x1000), whose end 0x5000 is past + // region_size 0x4000. 
+ let msix_cap = MsixCap::new(0, 1, 0, 0, 0x4000); + + let err = calculate_bar_areas(&bars, ®ion_infos, Some(&msix_cap)).unwrap_err(); + assert!(matches!( + err, + VfioError::MsixPbaOutOfRange(0, 0x4000, 0x1000, 0x4000) + )); + } + + /// A sparse mmap area extending past the region end must also be rejected. + #[test] + fn test_calculate_bar_areas_sparse_mmap_area_past_region_end() { + let mut bars = Bars::default(); + bars.set_bar_64(0, 0x4000_0000_0000, 0x4000, BarPrefetchable::No); + + let sparse_areas = vec![VfioRegionSparseMmapArea { + offset: 0x3000, + size: 0x2000, // 0x3000 + 0x2000 = 0x5000, past region_size 0x4000 + }]; + let region_infos = [make_region( + 0x4000, + vec![VfioRegionInfoCap::SparseMmap(VfioRegionInfoCapSparseMmap { + areas: sparse_areas, + })], + )]; + + let err = calculate_bar_areas(&bars, ®ion_infos, None).unwrap_err(); + assert!(matches!( + err, + VfioError::SparseMmapAreaOutOfRange(0, 0x3000, 0x2000, 0x4000) + )); + } + + /// MSI-X table offset+size that overflows u64 on addition must also be + /// rejected, not silently wrap. + #[test] + fn test_calculate_bar_areas_msix_table_offset_overflow() { + let mut bars = Bars::default(); + bars.set_bar_64(0, 0x4000_0000_0000, 0x4000, BarPrefetchable::No); + + let region_infos = [make_region(0x4000, vec![VfioRegionInfoCap::MsixMappable])]; + + // MsixCap stores table_offset in 32 bits, so it cannot itself overflow. + // But the masked-off table_offset is 0xffff_fff8; with 32 entries the + // table_range size = 32 * 16 = 0x200, end = 0x1_0000_01f8, past region_size. 
+ let msix_cap = MsixCap::new(0, 32, 0xffff_fff8, 0, 0); + + let err = calculate_bar_areas(&bars, ®ion_infos, Some(&msix_cap)).unwrap_err(); + assert!(matches!(err, VfioError::MsixTableOutOfRange(0, _, _, _))); + } + + #[derive(Debug, Default)] + struct MockMsixConfig { + table_read: Option<(u64, usize)>, + table_write: Option<(u64, Vec)>, + pba_read: Option<(u64, usize)>, + pba_write: Option<(u64, Vec)>, + } + + impl MockMsixConfig { + fn read_table(&mut self, offset: u64, data: &mut [u8]) { + assert!( + self.table_read.is_none(), + "read_table called more than once" + ); + self.table_read = Some((offset, data.len())); + data.fill(0xAA); + } + fn write_table(&mut self, offset: u64, data: &[u8]) { + assert!( + self.table_write.is_none(), + "write_table called more than once" + ); + self.table_write = Some((offset, data.to_vec())); + } + fn read_pba(&mut self, offset: u64, data: &mut [u8]) { + assert!(self.pba_read.is_none(), "read_pba called more than once"); + self.pba_read = Some((offset, data.len())); + data.fill(0xBB); + } + fn write_pba(&mut self, offset: u64, data: &[u8]) { + assert!(self.pba_write.is_none(), "write_pba called more than once"); + self.pba_write = Some((offset, data.to_vec())); + } + } + + #[derive(Debug)] + struct MockVfioDevice { + region_size: u64, + read: Option<(u32, u64, usize)>, + write: Option<(u32, u64, Vec)>, + } + + impl MockVfioDevice { + fn new(region_size: u64) -> Self { + Self { + region_size, + read: None, + write: None, + } + } + fn get_region_size(&self, _index: u32) -> u64 { + self.region_size + } + fn region_read(&mut self, index: u32, data: &mut [u8], offset: u64) { + assert!(self.read.is_none(), "region_read called more than once"); + self.read = Some((index, offset, data.len())); + data.fill(0xCC); + } + fn region_write(&mut self, index: u32, data: &[u8], offset: u64) { + assert!(self.write.is_none(), "region_write called more than once"); + self.write = Some((index, offset, data.to_vec())); + } + } + + struct 
MockMsixState { + bar_hole_infos: ArrayVec, + cap: MsixCap, + config: MockMsixConfig, + } + + fn drive_read( + state: &mut MockMsixState, + device: &mut MockVfioDevice, + base: u64, + offset: u64, + data: &mut [u8], + ) -> (&'static str, bool) { + handle_bar_access!( + state, + device, + base, + offset, + data, + read_table, + read_pba, + region_read + ) + } + + fn drive_write( + state: &mut MockMsixState, + device: &mut MockVfioDevice, + base: u64, + offset: u64, + data: &[u8], + ) -> (&'static str, bool) { + handle_bar_access!( + state, + device, + base, + offset, + data, + write_table, + write_pba, + region_write + ) + } + + const BAR_GPA: u64 = 0x4000_0000_0000; + const REGION_SIZE: u64 = 0x10_0000; + + /// Hole-only-table at the start of the BAR. 4-entry table at offset 0 + /// (= 64 bytes). Hole spans the page [0x0..0x1000). + fn state_table_only() -> MockMsixState { + let cap = MsixCap::new(0, 4, 0, 0, 0x800); + let mut bar_hole_infos = ArrayVec::new(); + bar_hole_infos.push(BarHoleInfo { + gpa: BAR_GPA, + size: 0x1000, + usage: BarHoleInfoUsage::TABLE, + }); + MockMsixState { + bar_hole_infos, + cap, + config: MockMsixConfig::default(), + } + } + + /// Hole-only-PBA at the start of the BAR. PBA at offset 0x100, 8 bytes + /// (one word for a 4-entry table). Hole spans the page [0x0..0x1000). + fn state_pba_only() -> MockMsixState { + let cap = MsixCap::new(0, 4, 0x800, 0, 0x100); + let mut bar_hole_infos = ArrayVec::new(); + bar_hole_infos.push(BarHoleInfo { + gpa: BAR_GPA, + size: 0x1000, + usage: BarHoleInfoUsage::PBA, + }); + MockMsixState { + bar_hole_infos, + cap, + config: MockMsixConfig::default(), + } + } + + /// Merged hole: table at offset 0 (4 entries = 64 bytes) and PBA at + /// offset 0x200 (8 bytes), both in the same host page -> single hole + /// [0x0..0x1000) flagged as TABLE|PBA. 
+ fn state_merged() -> MockMsixState { + let cap = MsixCap::new(0, 4, 0, 0, 0x200); + let mut bar_hole_infos = ArrayVec::new(); + bar_hole_infos.push(BarHoleInfo { + gpa: BAR_GPA, + size: 0x1000, + usage: BarHoleInfoUsage::TABLE | BarHoleInfoUsage::PBA, + }); + MockMsixState { + bar_hole_infos, + cap, + config: MockMsixConfig::default(), + } + } + + #[test] + fn test_handle_bar_access_table_inside_table_range() { + // Read at offset 0x10 (second table entry, vector ctl). + let mut state = state_table_only(); + let mut device = MockVfioDevice::new(REGION_SIZE); + let mut data = [0u8; 4]; + let (name, handled) = drive_read(&mut state, &mut device, BAR_GPA, 0x10, &mut data); + assert!(handled); + assert_eq!(name, "MsiTable"); + assert_eq!(data, [0xAA, 0xAA, 0xAA, 0xAA]); + assert_eq!(state.config.table_read, Some((0x10, 4))); + assert!(device.read.is_none()); + } + + #[test] + fn test_handle_bar_access_table_outside_table_range_forwards_to_device() { + // Table is 64 bytes (4 entries). Access at offset 0x100 is in the + // hole padding -> forward to device region. + let mut state = state_table_only(); + let mut device = MockVfioDevice::new(REGION_SIZE); + let mut data = [0u8; 4]; + let (_name, handled) = drive_read(&mut state, &mut device, BAR_GPA, 0x100, &mut data); + assert!(handled); + assert_eq!(data, [0xCC, 0xCC, 0xCC, 0xCC]); + assert!(state.config.table_read.is_none()); + // table_bir = 0, in_region_offset = (0 - 0) + 0x100 = 0x100. + assert_eq!(device.read, Some((0, 0x100, 4))); + } + + #[test] + fn test_handle_bar_access_pba_inside_pba_range() { + // PBA at offset 0x100, size 8. Access at 0x100 -> handled by PBA. + // Relative offset passed to read_pba is access_offset - pba_start = 0. 
+ let mut state = state_pba_only(); + let mut device = MockVfioDevice::new(REGION_SIZE); + let mut data = [0u8; 4]; + let (name, handled) = drive_read(&mut state, &mut device, BAR_GPA, 0x100, &mut data); + assert!(handled); + assert_eq!(name, "PbaTable"); + assert_eq!(data, [0xBB, 0xBB, 0xBB, 0xBB]); + assert_eq!(state.config.pba_read, Some((0, 4))); + assert!(device.read.is_none()); + } + + #[test] + fn test_handle_bar_access_pba_outside_pba_range_forwards_to_device() { + // Hole [0x0..0x1000), PBA [0x100..0x108). Access at 0x800 forwards. + let mut state = state_pba_only(); + let mut device = MockVfioDevice::new(REGION_SIZE); + let mut data = [0u8; 4]; + let (_name, handled) = drive_read(&mut state, &mut device, BAR_GPA, 0x800, &mut data); + assert!(handled); + assert_eq!(device.read, Some((0, 0x800, 4))); + } + + #[test] + fn test_handle_bar_access_merged_hits_table() { + // Merged hole. Access at 0x10 -> table. + let mut state = state_merged(); + let mut device = MockVfioDevice::new(REGION_SIZE); + let mut data = [0u8; 4]; + let (name, handled) = drive_read(&mut state, &mut device, BAR_GPA, 0x10, &mut data); + assert!(handled); + assert_eq!(name, "MsiTable"); + assert_eq!(state.config.table_read, Some((0x10, 4))); + assert!(state.config.pba_read.is_none()); + } + + #[test] + fn test_handle_bar_access_merged_hits_pba() { + // Merged hole. Access at 0x200 -> PBA. Relative offset = 0. + let mut state = state_merged(); + let mut device = MockVfioDevice::new(REGION_SIZE); + let mut data = [0u8; 4]; + let (name, handled) = drive_read(&mut state, &mut device, BAR_GPA, 0x200, &mut data); + assert!(handled); + assert_eq!(name, "PbaTable"); + assert!(state.config.table_read.is_none()); + assert_eq!(state.config.pba_read, Some((0, 4))); + } + + #[test] + fn test_handle_bar_access_merged_padding_forwards_to_device() { + // Merged hole padding (between PBA end at 0x208 and page end 0x1000). 
+ let mut state = state_merged(); + let mut device = MockVfioDevice::new(REGION_SIZE); + let mut data = [0u8; 4]; + let (_name, handled) = drive_read(&mut state, &mut device, BAR_GPA, 0x800, &mut data); + assert!(handled); + assert_eq!(device.read, Some((0, 0x800, 4))); + } + + #[test] + fn test_handle_bar_access_unrelated_base_is_unhandled() { + // base != hole.gpa -> handled stays false. + let mut state = state_table_only(); + let mut device = MockVfioDevice::new(REGION_SIZE); + let mut data = [0u8; 4]; + let (_name, handled) = drive_read(&mut state, &mut device, BAR_GPA + 0x1000, 0, &mut data); + assert!(!handled); + assert!(state.config.table_read.is_none()); + assert!(device.read.is_none()); + } + + #[test] + fn test_handle_bar_access_write_table() { + let mut state = state_table_only(); + let mut device = MockVfioDevice::new(REGION_SIZE); + let buf = [1u8, 2, 3, 4]; + let (name, handled) = drive_write(&mut state, &mut device, BAR_GPA, 0, &buf); + assert!(handled); + assert_eq!(name, "MsiTable"); + assert_eq!(state.config.table_write, Some((0, vec![1, 2, 3, 4]))); + } + + #[test] + fn test_handle_bar_access_write_pba_padding_forwards_to_device() { + // PBA-padding write -> forwarded to device region_write. + let mut state = state_pba_only(); + let mut device = MockVfioDevice::new(REGION_SIZE); + let buf = [9u8, 8, 7, 6]; + let (_name, handled) = drive_write(&mut state, &mut device, BAR_GPA, 0x800, &buf); + assert!(handled); + assert_eq!(device.write, Some((0, 0x800, vec![9, 8, 7, 6]))); + } + + #[test] + fn test_handle_bar_access_forward_beyond_region_size_is_dropped() { + // Region is 0x100 bytes; access at 0x800 in the hole padding is + // past region_size -> device call must be skipped. 
+ let mut state = state_table_only(); + let mut device = MockVfioDevice::new(0x100); + let mut data = [0u8; 4]; + let (_name, handled) = drive_read(&mut state, &mut device, BAR_GPA, 0x800, &mut data); + assert!(handled); + assert!(device.read.is_none()); + assert!(state.config.table_read.is_none()); + } +} diff --git a/src/vmm/src/vmm_config/mod.rs b/src/vmm/src/vmm_config/mod.rs index b67a486c2f9..a9d7740bb90 100644 --- a/src/vmm/src/vmm_config/mod.rs +++ b/src/vmm/src/vmm_config/mod.rs @@ -37,6 +37,8 @@ pub mod pmem; /// Wrapper for configuring microVM snapshots and the microVM state. pub mod serial; pub mod snapshot; +/// Wrapper for configuring the VFIO passthrough devices. +pub mod vfio; /// Wrapper for configuring the vsock devices attached to the microVM. pub mod vsock; diff --git a/src/vmm/src/vmm_config/vfio.rs b/src/vmm/src/vmm_config/vfio.rs new file mode 100644 index 00000000000..406371c5a92 --- /dev/null +++ b/src/vmm/src/vmm_config/vfio.rs @@ -0,0 +1,120 @@ +// Copyright 2026 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +use serde::{Deserialize, Deserializer, Serialize, Serializer}; + +use crate::pci::PciSBDF; + +/// Errors for VFIO device configuration. 
+#[derive(Debug, thiserror::Error, displaydoc::Display)] +pub enum VfioConfigError { + /// Duplicate VFIO SBDF: {0} + DuplicateSBDF(PciSBDF), + /// Invalid VFIO SBDF: {0} + InvalidSBDF(String), +} + +fn serialize_sbdf_as_str(sbdf: &PciSBDF, serializer: S) -> Result { + serializer.serialize_str(&format!("{sbdf}")) +} + +fn deserialize_sbdf_from_str<'de, D: Deserializer<'de>>( + deserializer: D, +) -> Result { + let s = String::deserialize(deserializer)?; + PciSBDF::new_from_str(&s) + .ok_or_else(|| serde::de::Error::custom(VfioConfigError::InvalidSBDF(s.to_string()))) +} + +/// Config for VFIO device +#[derive(Clone, Debug, Default, PartialEq, Eq, Deserialize, Serialize)] +#[serde(deny_unknown_fields)] +pub struct VfioConfig { + /// ID of the device + pub id: String, + /// Host identifier for the PCI device + #[serde( + serialize_with = "serialize_sbdf_as_str", + deserialize_with = "deserialize_sbdf_from_str" + )] + pub sbdf: PciSBDF, +} + +/// Config for VFIO passthrough devices +#[derive(Clone, Debug, Default, PartialEq, Eq, Deserialize, Serialize)] +#[serde(deny_unknown_fields)] +pub struct VfioConfigs { + /// VFIO configs + pub configs: Vec, +} + +impl VfioConfigs { + /// Add config to the set. Overwrite existing one if + /// ids are same. 
+ pub fn add(&mut self, config: VfioConfig) -> Result<(), VfioConfigError> { + if self + .configs + .iter() + .any(|b| b.sbdf == config.sbdf && b.id != config.id) + { + return Err(VfioConfigError::DuplicateSBDF(config.sbdf)); + } + if let Some(old_config) = self.configs.iter_mut().find(|b| b.id == config.id) { + old_config.sbdf = config.sbdf; + } else { + self.configs.push(config); + } + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_add_vfio_config_and_overwrite() { + let id1 = PciSBDF::new_from_str("01:00.0").unwrap(); + let id2 = PciSBDF::new_from_str("02:00.0").unwrap(); + + let mut configs = VfioConfigs::default(); + + configs + .add(VfioConfig { + id: "dev0".to_string(), + sbdf: id1, + }) + .unwrap(); + assert_eq!(configs.configs.len(), 1); + assert_eq!(configs.configs[0].sbdf, id1); + + configs + .add(VfioConfig { + id: "dev0".to_string(), + sbdf: id2, + }) + .unwrap(); + assert_eq!(configs.configs.len(), 1); + assert_eq!(configs.configs[0].sbdf, id2); + + configs + .add(VfioConfig { + id: "dev1".to_string(), + sbdf: id1, + }) + .unwrap(); + assert_eq!(configs.configs.len(), 2); + assert_eq!(configs.configs[0].sbdf, id2); + assert_eq!(configs.configs[1].sbdf, id1); + + configs + .add(VfioConfig { + id: "dev1".to_string(), + sbdf: id2, + }) + .unwrap_err(); + assert_eq!(configs.configs.len(), 2); + assert_eq!(configs.configs[0].sbdf, id2); + assert_eq!(configs.configs[1].sbdf, id1); + } +} diff --git a/tests/framework/http_api.py b/tests/framework/http_api.py index a23fbf07e30..f280bec6172 100644 --- a/tests/framework/http_api.py +++ b/tests/framework/http_api.py @@ -200,3 +200,4 @@ def __init__(self, api_usocket_full_name, *, validate=True, on_error=None): self.pmem = Resource(self, "/pmem", "id") self.serial = Resource(self, "/serial") self.memory_hotplug = Resource(self, "/hotplug/memory") + self.vfio = Resource(self, "/vfio", "id") diff --git a/tests/framework/microvm.py b/tests/framework/microvm.py index 
8339ae1695e..250c24f18a1 100644 --- a/tests/framework/microvm.py +++ b/tests/framework/microvm.py @@ -447,7 +447,10 @@ def _validate_api_response_times(self): "Got API call duration log entry before request entry" ) - if current_call.url not in ["/snapshot/create", "/snapshot/load"]: + if current_call.url not in [ + "/snapshot/create", + "/snapshot/load", + ] and not current_call.url.startswith("/vfio"): exec_time = float(match.group("execution_time")) / 1000.0 assert ( diff --git a/tests/framework/vm_config.json b/tests/framework/vm_config.json index b2bac4066d5..89676b4596c 100644 --- a/tests/framework/vm_config.json +++ b/tests/framework/vm_config.json @@ -33,5 +33,6 @@ "mmds-config": null, "entropy": null, "pmem": [], - "memory-hotplug": null + "memory-hotplug": null, + "vfio": [] } diff --git a/tests/host_tools/fcmetrics.py b/tests/host_tools/fcmetrics.py index dbbc9c4f24a..5fbf621bbd3 100644 --- a/tests/host_tools/fcmetrics.py +++ b/tests/host_tools/fcmetrics.py @@ -241,6 +241,8 @@ def validate_fc_metrics(metrics): "serial_fails", "hotplug_memory_count", "hotplug_memory_fails", + "vfio_count", + "vfio_fails", ], "seccomp": [ "num_faults", diff --git a/tests/integration_tests/functional/test_api.py b/tests/integration_tests/functional/test_api.py index 5fc32105231..38a30d87ef9 100644 --- a/tests/integration_tests/functional/test_api.py +++ b/tests/integration_tests/functional/test_api.py @@ -1406,6 +1406,9 @@ def test_get_full_config_after_restoring_snapshot(microvm_factory, uvm_configure # We should expect a null entropy device expected_cfg["entropy"] = None + # No VFIO devices configured + expected_cfg["vfio"] = [] + # Validate full vm configuration post-restore. 
response = uvm2.api.vm_config.get().json() assert response != setup_cfg @@ -1536,6 +1539,9 @@ def test_get_full_config(uvm): # We should expect a null entropy device expected_cfg["entropy"] = None + # No VFIO devices configured + expected_cfg["vfio"] = [] + # Getting full vm configuration should be available pre-boot. response = test_microvm.api.vm_config.get() assert response.json() == expected_cfg diff --git a/tests/integration_tests/functional/test_vfio.py b/tests/integration_tests/functional/test_vfio.py new file mode 100644 index 00000000000..66fa0b6f153 --- /dev/null +++ b/tests/integration_tests/functional/test_vfio.py @@ -0,0 +1,278 @@ +# Copyright 2026 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: Apache-2.0 + +"""Integration tests for VFIO passthrough.""" + +import json +import os +import re +import stat +from pathlib import Path + +import pytest + +from framework.artifacts import ACPI_GUEST_KERNELS, pin_guest_kernel + +VFIO_SBDF = os.environ.get("FC_VFIO_PCI_SBDF") +VFIO_SYSFS = os.environ.get("FC_VFIO_PCI_SYSFS_PATH") + +# Skip tests if no VFIO device was passed through env variables. 
Also use +# `xdist_group` to make sure all tests here run sequentially since we only have +# 1 device to work with +pytestmark = [ + pytest.mark.skipif( + VFIO_SBDF is None, reason="No VFIO device configured (set FC_VFIO_PCI_SBDF)" + ), + pytest.mark.xdist_group("vfio"), + pytest.mark.vfio, +] + + +@pytest.fixture +def uvm_with_vfio(microvm_factory, guest_kernel, rootfs): + """Boot a microVM with the VFIO NVMe device attached.""" + vm = microvm_factory.build(guest_kernel, rootfs, pci=True) + + # Set up the jailer chroot directory before spawning + vm.jailer.setup() + + chroot = Path(vm.jailer.chroot_path()) + dev_sysfs = chroot / f"sys/bus/pci/devices/{VFIO_SBDF}" + dev_sysfs.mkdir(parents=True, exist_ok=True) + + # Create VFIO device nodes inside the jailer chroot + group_id = os.readlink(f"{VFIO_SYSFS}/iommu_group").split("/")[-1] + vfio_dir = chroot / "dev" / "vfio" + vfio_dir.mkdir(parents=True, exist_ok=True) + for name in ["vfio", group_id]: + src = Path(f"/dev/vfio/{name}") + dst = vfio_dir / name + st = src.stat() + os.mknod(dst, stat.S_IFCHR | 0o600, st.st_rdev) + os.chown(dst, vm.jailer.uid, vm.jailer.gid) + + # Create iommu_group symlink for the VFIO device. + # The VFIO code readlink()s this to get the group ID from the basename. + (dev_sysfs / "iommu_group").symlink_to(f"../iommu_groups/{group_id}") + os.lchown(dev_sysfs / "iommu_group", vm.jailer.uid, vm.jailer.gid) + + vm.spawn() + vm.basic_config(mem_size_mib=512) + vm.add_net_iface() + return vm + + +def test_api_vfio(microvm_factory, guest_kernel, rootfs): + """ + Test VFIO passthrough API commands. 
+ """ + + vm = microvm_factory.build(guest_kernel, rootfs, pci=True) + vm.spawn() + vm.basic_config() + + # Missing required field 'sbdf' + expected_msg = re.escape("missing field `sbdf`") + with pytest.raises(RuntimeError, match=expected_msg): + vm.api.vfio.put(id="dev0") + + # Valid VFIO device configs and overwrites + vm.api.vfio.put(id="nvme0", sbdf="/sys/bus/pci/devices/0000:01:02.03") + vm.api.vfio.put(id="nvme0", sbdf="0000:01:02.03") + vm.api.vfio.put(id="nvme0", sbdf="01:02.03") + vm.api.vfio.put(id="nvme0", sbdf="0x00010203") + vm.api.vfio.put(id="nvme0", sbdf=f"{0x00010203}") # converts hex to dec + + # Adding a second device should be OK + vm.api.vfio.put(id="nvme1", sbdf="0000:01:02.03") + + # Empty id should fail + expected_msg = re.escape("The ID cannot be empty.") + with pytest.raises(RuntimeError, match=expected_msg): + vm.api.vfio.put(id="", sbdf="0000:01:02.03") + + +def test_vfio_incompatible_devices_no_pci(microvm_factory, guest_kernel, rootfs): + """ + Test that adding VFIO device without PCI fails at API level. + """ + vm = microvm_factory.build(guest_kernel, rootfs, pci=False) + vm.jailer.setup() + vm.spawn() + vm.basic_config() + + expected_msg = re.escape("VFIO devices attached, but PCI disabled") + with pytest.raises(RuntimeError, match=expected_msg): + vm.api.vfio.put(id="nvme0", sbdf="0000:01:02.03") + + +def test_vfio_incompatible_devices_vfio_balloon(microvm_factory, guest_kernel, rootfs): + """ + Test that adding balloon after VFIO fails at API level.
+ """ + vm = microvm_factory.build(guest_kernel, rootfs, pci=True) + vm.jailer.setup() + vm.spawn() + vm.basic_config() + + vm.api.vfio.put(id="nvme0", sbdf="0000:01:02.03") + expected_msg = re.escape( + "VFIO devices are not compatible with memory balloon device" + ) + with pytest.raises(RuntimeError, match=expected_msg): + vm.api.balloon.put( + amount_mib=0, deflate_on_oom=False, stats_polling_interval_s=1 + ) + + +def test_vfio_incompatible_devices_balloon_vfio(microvm_factory, guest_kernel, rootfs): + """ + Test that adding VFIO after balloon fails at API level. + """ + vm = microvm_factory.build(guest_kernel, rootfs, pci=True) + vm.jailer.setup() + vm.spawn() + vm.basic_config() + + vm.api.balloon.put(amount_mib=0, deflate_on_oom=False, stats_polling_interval_s=1) + expected_msg = re.escape( + "VFIO devices are not compatible with memory balloon device" + ) + with pytest.raises(RuntimeError, match=expected_msg): + vm.api.vfio.put(id="nvme0", sbdf="0000:01:02.03") + + +def test_vfio_incompatible_devices_vfio_mem_hot_plug( + microvm_factory, guest_kernel, rootfs +): + """ + Test that adding memory hotplug after VFIO fails at API level. + """ + vm = microvm_factory.build(guest_kernel, rootfs, pci=True) + vm.jailer.setup() + vm.spawn() + vm.basic_config() + + vm.api.vfio.put(id="nvme0", sbdf="0000:01:02.03") + expected_msg = re.escape( + "VFIO devices are not compatible with memory hot-plugging device" + ) + with pytest.raises(RuntimeError, match=expected_msg): + vm.api.memory_hotplug.put( + total_size_mib=256, slot_size_mib=256, block_size_mib=64 + ) + + +def test_vfio_incompatible_devices_mem_hot_plug_vfio( + microvm_factory, guest_kernel, rootfs +): + """ + Test that adding VFIO after memory hotplug fails at API level. 
+ """ + vm = microvm_factory.build(guest_kernel, rootfs, pci=True) + vm.jailer.setup() + vm.spawn() + vm.basic_config() + + vm.api.memory_hotplug.put(total_size_mib=256, slot_size_mib=256, block_size_mib=64) + expected_msg = re.escape( + "VFIO devices are not compatible with memory hot-plugging device" + ) + with pytest.raises(RuntimeError, match=expected_msg): + vm.api.vfio.put(id="nvme0", sbdf="0000:01:02.03") + + +def test_vfio_nvme_not_present_without_config(microvm_factory, guest_kernel, rootfs): + """NVMe device does NOT appear when no VFIO device is configured.""" + vm = microvm_factory.build(guest_kernel, rootfs, pci=True) + vm.spawn() + vm.basic_config(mem_size_mib=512) + vm.add_net_iface() + vm.start() + + rc, _, _ = vm.ssh.run("test -e /dev/nvme0n1") + assert rc != 0 + + _, stdout, _ = vm.ssh.check_output("lspci -nn") + assert "Non-Volatile memory controller" not in stdout + + +def test_vfio_nvme_visible(uvm_with_vfio): + """The passthrough device appears on the guest PCI bus.""" + vm = uvm_with_vfio + vm.api.vfio.put(id="nvme0", sbdf=VFIO_SBDF) + vm.start() + + _, stdout, _ = vm.ssh.check_output("lspci -nn") + assert "Non-Volatile memory controller" in stdout + + vm.ssh.check_output("test -d /sys/class/nvme/nvme0") + vm.ssh.check_output("test -b /dev/nvme0n1") + + _, stdout, _ = vm.ssh.check_output("lsblk -Jb") + blocks = json.loads(stdout)["blockdevices"] + nvme = [b for b in blocks if b["name"] == "nvme0n1"] + assert len(nvme) == 1 + assert int(nvme[0]["size"]) > 0 + + +@pin_guest_kernel(ACPI_GUEST_KERNELS) +def test_vfio_nvme_hotplug_unplug_cycle(uvm_with_vfio): + """The passthrough device can be hot-plugged into a running guest and unplugged again.""" + vm = uvm_with_vfio + vm.start() + + _, lspci_before, _ = vm.ssh.check_output("lspci -n") + + vm.api.vfio.put(id="nvme0", sbdf=VFIO_SBDF) + vm.ssh.check_output("echo 1 > /sys/bus/pci/rescan") + + _, lspci_after, _ = vm.ssh.check_output("lspci -n") + new_entries = set(lspci_after.splitlines()) - set(lspci_before.splitlines()) +
assert len(new_entries) == 1 + entry = new_entries.pop() + bdf = entry.split()[0] + + vm.ssh.check_output("test -d /sys/class/nvme/nvme0") + vm.ssh.check_output("test -b /dev/nvme0n1") + + _, stdout, _ = vm.ssh.check_output("lsblk -Jb") + blocks = json.loads(stdout)["blockdevices"] + nvme = [b for b in blocks if b["name"] == "nvme0n1"] + assert len(nvme) == 1 + assert int(nvme[0]["size"]) > 0 + + vm.ssh.check_output(f"echo 1 > /sys/bus/pci/devices/0000:{bdf}/remove") + vm.api.vfio.delete("nvme0") + vm.ssh.check_output("echo 1 > /sys/bus/pci/rescan") + _, lspci_after_unplug, _ = vm.ssh.check_output("lspci -n") + assert lspci_after_unplug == lspci_before + + +def test_vfio_nvme_read(uvm_with_vfio): + """The guest can read data from the passthrough NVMe device.""" + vm = uvm_with_vfio + vm.api.vfio.put(id="nvme0", sbdf=VFIO_SBDF) + vm.start() + + _, stdout, _ = vm.ssh.check_output( + "dd if=/dev/nvme0n1 of=/dev/null bs=4k count=256 2>&1" + ) + assert "256+0 records in" in stdout + + +def test_vfio_nvme_write_readback(uvm_with_vfio): + """Write data and read it back to confirm DMA in both directions.""" + vm = uvm_with_vfio + vm.api.vfio.put(id="nvme0", sbdf=VFIO_SBDF) + vm.start() + + vm.ssh.check_output("dd if=/dev/urandom of=/tmp/pattern bs=4k count=1") + vm.ssh.check_output("dd if=/tmp/pattern of=/dev/nvme0n1 bs=4k count=1 oflag=direct") + vm.ssh.check_output( + "dd if=/dev/nvme0n1 of=/tmp/readback bs=4k count=1 iflag=direct" + ) + # There is no `cmp` binary in AL2023 rootfs, so use a workaround + _, stdout, _ = vm.ssh.check_output("md5sum /tmp/pattern /tmp/readback") + hashes = [line.split()[0] for line in stdout.strip().splitlines()] + assert hashes[0] == hashes[1], "write/readback mismatch" diff --git a/tests/pytest.ini b/tests/pytest.ini index 5656c8eee4d..a2cc0d6fbf0 100644 --- a/tests/pytest.ini +++ b/tests/pytest.ini @@ -11,6 +11,7 @@ addopts = markers = no_block_pr: tests whose failure does not block PR merging. nonci: mark test as nonci. 
+ vfio: tests requiring a VFIO passthrough device. ; Overwrite the default norecursedirs, which includes 'build'. norecursedirs = .* diff --git a/tools/devtool b/tools/devtool index d138564c370..cdf815e56f3 100755 --- a/tools/devtool +++ b/tools/devtool @@ -144,23 +144,24 @@ KVM_MODULE_LOCKFILE="/tmp/.kvm_module_lock" # Query default S3 bucket with artifacts and return the most recient path get_newest_s3_artifacts() { - local bucket="spec.ccfc.min" - local base_prefix="firecracker-ci/" - - # Query all files in the `firecracker-ci` directory, check files contianing "vmlinux", - # sort them by the `LastModified` date, and return the last one (newest). - # We need to do it this way as S3 doesn't store `LastModified`` date for directories, - # so we need to list all files. - local newest_dir=$(aws s3api list-objects-v2 \ - --bucket "$bucket" --prefix "$base_prefix" --no-sign-request \ - --query 'sort_by(Contents[?contains(Key, `vmlinux`)], &LastModified)[-1].Key' | - tr -d '"' | - sed "s|^$base_prefix||" | - cut -d'/' -f1 - ) - [ -z "$newest_dir" ] && die "Could not find newest artifacts in S3." - - echo "$DEFAULT_ARTIFACTS_S3_BUCKET/$newest_dir" + echo "s3://spec.ccfc.min/firecracker-ci-custom/vfio-0" + # local bucket="spec.ccfc.min" + # local base_prefix="firecracker-ci/" + # + # # Query all files in the `firecracker-ci` directory, check files contianing "vmlinux", + # # sort them by the `LastModified` date, and return the last one (newest). + # # We need to do it this way as S3 doesn't store `LastModified`` date for directories, + # # so we need to list all files. + # local newest_dir=$(aws s3api list-objects-v2 \ + # --bucket "$bucket" --prefix "$base_prefix" --no-sign-request \ + # --query 'sort_by(Contents[?contains(Key, `vmlinux`)], &LastModified)[-1].Key' | + # tr -d '"' | + # sed "s|^$base_prefix||" | + # cut -d'/' -f1 + # ) + # [ -z "$newest_dir" ] && die "Could not find newest artifacts in S3." 
+ # + # echo "$DEFAULT_ARTIFACTS_S3_BUCKET/$newest_dir" } # Function to return local path to artifacts. Accepts the url from function above @@ -463,6 +464,14 @@ cmd_help() { echo " --no-archive Skip archiving of 'test_result' after the test is done." echo " --no-kvm-check Skip checking for '/dev/kvm' presence." echo " --no-artifacts-check Skip checking existing artifacts." + echo " --vfio-nvme-device DEVICE Prepare an NVMe PCI device for VFIO passthrough and pass it to tests." + echo " CI tests expect this to be an NVMe device." + echo " DEVICE can be an NVMe block device path (e.g. /dev/nvme1n1), a SCSI" + echo " block device path (e.g. /dev/sdf), or a PCI SBDF (e.g. 0003:16:00.0)." + echo " Binds the device to vfio-pci and sets FC_VFIO_PCI_SBDF and" + echo " FC_VFIO_PCI_SYSFS_PATH env vars in the container." + echo " --first-vfio-nvme-device Fallback: if --vfio-nvme-device fails to find the specified device," + echo " search for the first NVMe PCI device already bound to vfio-pci." echo "" echo " build_ci_artifacts [all|rootfs|kernels]" echo " Builds the rootfs and guest kernel artifacts we use for our CI." @@ -922,6 +931,81 @@ unapply_performance_tweaks() { } +# Find the first NVMe PCI device bound to vfio-pci. +# Prints the PCI SBDF on success, returns 1 if none found. +find_first_vfio_nvme_device() { + for dev in /sys/bus/pci/devices/*; do + local driver=$(basename "$(readlink -f "$dev/driver" 2>/dev/null)" 2>/dev/null || echo "none") + [ "$driver" = "vfio-pci" ] || continue + local class=$(cat "$dev/class" 2>/dev/null || echo "") + # 0x010802 is NVMe + [[ "$class" == "0x010802" ]] || continue + basename "$dev" + return 0 + done + return 1 +} + +# Ensure vfio and vfio-pci kernel modules are loaded +load_vfio_kernel_modules() { + for mod in vfio vfio-pci; do + if ! lsmod | grep -q "^${mod//-/_} "; then + say "Loading $mod kernel module..." 
+ sudo modprobe "$mod" || die "Failed to load $mod kernel module" + fi + done +} + +# Resolve an NVMe device path or PCI SBDF to a PCI device, bind it to vfio-pci, +# and verify the binding. Sets VFIO_SBDF and VFIO_PCI_PATH on success. +# Returns non-zero on failure since we might need to retry this +setup_vfio_nvme_device() { + local vfio_device="$1" + VFIO_SBDF="" + VFIO_PCI_PATH="" + + # Check if it looks like a PCI SBDF (e.g. 0003:16:00.0) + if [[ "$vfio_device" =~ ^[0-9a-fA-F]{4}:[0-9a-fA-F]{2}:[0-9a-fA-F]{2}\.[0-9a-fA-F]$ ]]; then + VFIO_SBDF="$vfio_device" + VFIO_PCI_PATH="/sys/bus/pci/devices/$VFIO_SBDF" + elif [[ "$vfio_device" == /dev/* ]]; then + # Resolve symlinks (e.g. /dev/sdf -> /dev/nvme1n1) + local resolved_dev=$(readlink -f "$vfio_device") + local block_name=$(basename "$resolved_dev") + # For NVMe namespaces (nvmeXnY), the PCI device is on the controller (nvmeX) + local ctrl_name=$(echo "$block_name" | sed 's/n[0-9]*$//') + if [ -d "/sys/class/nvme/$ctrl_name" ]; then + VFIO_PCI_PATH=$(readlink -f "/sys/class/nvme/$ctrl_name/device") + elif [ -d "/sys/block/$block_name/device" ]; then + VFIO_PCI_PATH=$(readlink -f "/sys/block/$block_name/device") + else + say_err "Cannot resolve $vfio_device to a PCI device" + return 1 + fi + VFIO_SBDF=$(basename "$VFIO_PCI_PATH") + else + say_err "Invalid --vfio-nvme-device value: $vfio_device (expected /dev/... or PCI SBDF)" + return 1 + fi + + [ -d "$VFIO_PCI_PATH" ] || { say_err "PCI device not found: $VFIO_PCI_PATH"; return 1; } + + # Bind to vfio-pci if not already bound + local current_driver=$(basename "$(readlink -f "$VFIO_PCI_PATH/driver" 2>/dev/null)" 2>/dev/null || echo "none") + if [ "$current_driver" != "vfio-pci" ]; then + say "Binding $VFIO_SBDF to vfio-pci (currently: $current_driver)..."
+ echo "$VFIO_SBDF" | sudo tee "$VFIO_PCI_PATH/driver/unbind" > /dev/null 2>&1 || true + echo "vfio-pci" | sudo tee "$VFIO_PCI_PATH/driver_override" > /dev/null + echo "$VFIO_SBDF" | sudo tee /sys/bus/pci/drivers/vfio-pci/bind > /dev/null + fi + + # Verify the device is attached to vfio-pci + local bound_driver=$(basename "$(readlink -f "$VFIO_PCI_PATH/driver" 2>/dev/null)" 2>/dev/null || echo "none") + [ "$bound_driver" = "vfio-pci" ] || { say_err "Failed to bind $VFIO_SBDF to vfio-pci (driver is: $bound_driver)"; return 1; } + + say "VFIO device ready: $VFIO_SBDF ($VFIO_PCI_PATH)" +} + # `$0 test` - run integration tests # Please see `$0 help` for more information. # @@ -932,6 +1016,8 @@ cmd_test() { do_kvm_check=1 do_build_dir_check=1 do_artifacts_check=1 + local vfio_nvme_device="" + local first_vfio_nvme_device=0 # Parse any command line args. while [ $# -gt 0 ]; do case "$1" in @@ -969,6 +1055,13 @@ cmd_test() { "--no-artifacts-check") do_artifacts_check=0 ;; + "--vfio-nvme-device") + shift + vfio_nvme_device="$1" + ;; + "--first-vfio-nvme-device") + first_vfio_nvme_device=1 + ;; "--") { shift; break; } ;; *) die "Unknown argument: $1. Please use --help for help." @@ -1043,6 +1136,24 @@ cmd_test() { test_script="./tools/ab_test.py" fi + # Prepare VFIO device if requested + local vfio_env_args=() + if [ -n "$vfio_nvme_device" ]; then + load_vfio_kernel_modules + if ! setup_vfio_nvme_device "$vfio_nvme_device"; then + if [ "$first_vfio_nvme_device" -eq 1 ]; then + say "Falling back to searching for first NVMe vfio-pci device..." 
+ local found_sbdf + found_sbdf=$(find_first_vfio_nvme_device) || die "No NVMe PCI device bound to vfio-pci found" + say "Found NVMe vfio-pci device: $found_sbdf" + setup_vfio_nvme_device "$found_sbdf" || die "Failed to set up fallback VFIO device $found_sbdf" + else + die "Failed to set up VFIO device '$vfio_nvme_device' and --first-vfio-nvme-device not specified" + fi + fi + vfio_env_args=(--env "FC_VFIO_PCI_SBDF=$VFIO_SBDF" --env "FC_VFIO_PCI_SYSFS_PATH=$VFIO_PCI_PATH") + fi + # Testing (running Firecracker via the jailer) needs root access, # in order to set-up the Firecracker jail (manipulating cgroups, net # namespaces, etc). @@ -1057,6 +1168,7 @@ cmd_test() { --cpuset-cpus="$cpuset_cpus" \ --cpuset-mems="$cpuset_mems" \ --env-file env.list \ + "${vfio_env_args[@]}" \ -- \ $test_script "$@"