From 8a4f71cd70f8200d214d03c7ff48fa16a1637370 Mon Sep 17 00:00:00 2001 From: Jonas Savulionis Date: Fri, 29 May 2026 19:29:20 +0300 Subject: [PATCH 1/5] virtio-blk: add discard config space fields Extend the virtio-blk config space through the discard limit fields so the device can report discard capability details when the feature is enabled. Signed-off-by: Jonas Savulionis --- .../src/devices/virtio/block/virtio/device.rs | 53 ++++++++++++++++--- .../src/devices/virtio/block/virtio/mod.rs | 9 ++++ 2 files changed, 54 insertions(+), 8 deletions(-) diff --git a/src/vmm/src/devices/virtio/block/virtio/device.rs b/src/vmm/src/devices/virtio/block/virtio/device.rs index ec7ec468505..c8e9b6651fd 100644 --- a/src/vmm/src/devices/virtio/block/virtio/device.rs +++ b/src/vmm/src/devices/virtio/block/virtio/device.rs @@ -21,7 +21,10 @@ use vmm_sys_util::eventfd::EventFd; use super::io::async_io; use super::request::*; -use super::{BLOCK_QUEUE_SIZES, SECTOR_SHIFT, SECTOR_SIZE, VirtioBlockError, io as block_io}; +use super::{ + BLOCK_QUEUE_SIZES, DISCARD_SECTOR_ALIGNMENT, MAX_DISCARD_SECTORS, MAX_DISCARD_SEG, + SECTOR_SHIFT, SECTOR_SIZE, VirtioBlockError, io as block_io, +}; use crate::devices::virtio::ActivateError; use crate::devices::virtio::block::CacheType; use crate::devices::virtio::block::virtio::metrics::{BlockDeviceMetrics, BlockMetricsPerDevice}; @@ -163,11 +166,39 @@ impl DiskProperties { #[repr(C)] pub struct ConfigSpace { pub capacity: u64, + pub size_max: u32, + pub seg_max: u32, + pub cylinders: u16, + pub heads: u8, + pub sectors: u8, + pub blk_size: u32, + pub physical_block_exp: u8, + pub alignment_offset: u8, + pub min_io_size: u16, + pub opt_io_size: u32, + pub wce: u8, + pub unused: u8, + pub num_queues: u16, + pub max_discard_sectors: u32, + pub max_discard_seg: u32, + pub discard_sector_alignment: u32, } // SAFETY: `ConfigSpace` contains only PODs in `repr(C)` or `repr(transparent)`, without padding. unsafe impl ByteValued for ConfigSpace {} +impl ConfigSpace { + pub(crate) fn new(capacity_sectors: u64) -> Self { + Self { + capacity: capacity_sectors, + max_discard_sectors: MAX_DISCARD_SECTORS, + max_discard_seg: MAX_DISCARD_SEG, + discard_sector_alignment: DISCARD_SECTOR_ALIGNMENT, + ..Default::default() + } + } +} + /// Use this structure to set up the Block Device before booting the kernel. #[derive(Debug, PartialEq, Eq, Deserialize, Serialize)] #[serde(deny_unknown_fields)] @@ -311,9 +342,7 @@ impl VirtioBlock { let queues = BLOCK_QUEUE_SIZES.iter().map(|&s| Queue::new(s)).collect(); - let config_space = ConfigSpace { - capacity: disk_properties.nsectors.to_le(), - }; + let config_space = ConfigSpace::new(disk_properties.nsectors); Ok(VirtioBlock { avail_features, @@ -536,7 +565,7 @@ impl VirtioBlock { /// Update the backing file and the config space of the block device. pub fn update_disk_image(&mut self, disk_image_path: String) -> Result<(), VirtioBlockError> { self.disk.update(disk_image_path, self.read_only)?; - self.config_space.capacity = self.disk.nsectors.to_le(); // virtio_block_config_space(); + self.config_space = ConfigSpace::new(self.disk.nsectors); // Kick the driver to pick up the changes. (But only if the device is already activated). if self.is_activated() { @@ -824,11 +853,14 @@ mod tests { // This will read the number of sectors. // The block's backing file size is 0x1000, so there are 8 (4096/512) sectors. // The config space is little endian. - let expected_config_space = ConfigSpace { capacity: 8 }; + let expected_config_space = ConfigSpace::new(8); assert_eq!(actual_config_space, expected_config_space); // Invalid read. - let expected_config_space = ConfigSpace { capacity: 696969 }; + let expected_config_space = ConfigSpace { + capacity: 696969, + ..Default::default() + }; actual_config_space = expected_config_space; block.read_config( std::mem::size_of::() as u64 + 1, @@ -845,7 +877,10 @@ mod tests { for engine in [FileEngineType::Sync, FileEngineType::Async] { let mut block = default_block(engine); - let expected_config_space = ConfigSpace { capacity: 696969 }; + let expected_config_space = ConfigSpace { + capacity: 696969, + ..Default::default() + }; block.write_config(0, expected_config_space.as_slice()); let mut actual_config_space = ConfigSpace::default(); @@ -855,6 +890,7 @@ mod tests { // If privileged user writes to `/dev/mem`, in block config space - byte by byte. let expected_config_space = ConfigSpace { capacity: 0x1122334455667788, + ..Default::default() }; let expected_config_space_slice = expected_config_space.as_slice(); for (i, b) in expected_config_space_slice.iter().enumerate() { @@ -866,6 +902,7 @@ mod tests { // Invalid write. let new_config_space = ConfigSpace { capacity: 0xDEADBEEF, + ..Default::default() }; block.write_config(5, new_config_space.as_slice()); // Make sure nothing got written. diff --git a/src/vmm/src/devices/virtio/block/virtio/mod.rs b/src/vmm/src/devices/virtio/block/virtio/mod.rs index 9e97d6d3897..52ddde77875 100644 --- a/src/vmm/src/devices/virtio/block/virtio/mod.rs +++ b/src/vmm/src/devices/virtio/block/virtio/mod.rs @@ -22,6 +22,15 @@ use crate::devices::virtio::queue::FIRECRACKER_MAX_QUEUE_SIZE; pub const SECTOR_SHIFT: u8 = 9; /// Size of block sector. pub const SECTOR_SIZE: u32 = (0x01_u32) << SECTOR_SHIFT; +/// Maximum sectors accepted in a single discard range. +/// +/// Keep this bounded so guest fstrim is split into predictable chunks instead +/// of letting one large discard ioctl block the VMM thread for too long. +pub const MAX_DISCARD_SECTORS: u32 = (128_u32 << 20) / SECTOR_SIZE; +/// Maximum number of segments accepted in a single discard request. +pub const MAX_DISCARD_SEG: u32 = 1; +/// Discard alignment, expressed in 512-byte sectors. +pub const DISCARD_SECTOR_ALIGNMENT: u32 = 1; /// The number of queues of block device. pub const BLOCK_NUM_QUEUES: usize = 1; pub const BLOCK_QUEUE_SIZES: [u16; BLOCK_NUM_QUEUES] = [FIRECRACKER_MAX_QUEUE_SIZE]; From e7a7be44583d610614ec3d6be5baa3126eaf670c Mon Sep 17 00:00:00 2001 From: Jonas Savulionis Date: Fri, 29 May 2026 19:29:35 +0300 Subject: [PATCH 2/5] virtio-blk: add discard request support Add an opt-in discard drive flag and parse a single validated discard range without re-reading guest memory after validation. Execute discard with hole punching for regular files and BLKDISCARD for block devices on the sync engine. Reject discard with the async engine until an async implementation can be added. Signed-off-by: Jonas Savulionis --- CHANGELOG.md | 3 + docs/api_requests/block-discard.md | 43 ++++ src/firecracker/swagger/firecracker.yaml | 8 + src/vmm/src/builder.rs | 1 + src/vmm/src/device_manager/mod.rs | 1 + .../devices/virtio/block/vhost_user/device.rs | 7 +- .../src/devices/virtio/block/virtio/device.rs | 108 ++++++++- .../virtio/block/virtio/io/async_io.rs | 6 + .../src/devices/virtio/block/virtio/io/mod.rs | 23 ++ .../devices/virtio/block/virtio/io/sync_io.rs | 90 +++++++ .../src/devices/virtio/block/virtio/mod.rs | 2 + .../devices/virtio/block/virtio/persist.rs | 6 +- .../devices/virtio/block/virtio/request.rs | 228 +++++++++++++++++- .../devices/virtio/block/virtio/test_utils.rs | 1 + src/vmm/src/resources.rs | 1 + src/vmm/src/vmm_config/drive.rs | 20 ++ src/vmm/tests/integration_tests.rs | 1 + 17 files changed, 528 insertions(+), 21 deletions(-) create mode 100644 docs/api_requests/block-discard.md diff --git a/CHANGELOG.md b/CHANGELOG.md index 5c1be039674..96f66441c83 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -59,6 +59,9 @@ and this project adheres to - Added support for Linux 6.18 host kernels alongside the existing 5.10 and 6.1 host kernels. See the [kernel support policy](docs/kernel-policy.md) for details. +- [#5908](https://github.com/firecracker-microvm/firecracker/pull/5908): Add + opt-in virtio-blk discard support for writable `Sync` IO engine drives through + the `discard` drive configuration field. ### Changed diff --git a/docs/api_requests/block-discard.md b/docs/api_requests/block-discard.md new file mode 100644 index 00000000000..288a3b6aed8 --- /dev/null +++ b/docs/api_requests/block-discard.md @@ -0,0 +1,43 @@ +# Block device discard + +Firecracker can expose virtio-blk discard support to Linux guests. When enabled, +the guest can issue discard/TRIM requests, for example through `fstrim`, and +Firecracker forwards those requests to the backing storage. + +Discard is configured per virtio-block device through the `discard` field in the +`PUT /drives/{drive_id}` request. It is disabled by default. + +## Supported configuration + +Discard is currently supported only for writable virtio-block devices using the +`Sync` IO engine. It is not supported for: + +- read-only drives; +- `Async` IO engine drives; +- vhost-user block devices. + +For regular backing files, Firecracker uses hole punching. For block-device +backends, Firecracker uses `BLKDISCARD`. + +## Example configuration + +```bash +curl --unix-socket ${socket} -i \ + -X PUT "http://localhost/drives/rootfs" \ + -H "accept: application/json" \ + -H "Content-Type: application/json" \ + -d "{ + \"drive_id\": \"rootfs\", + \"path_on_host\": \"${drive_path}\", + \"is_root_device\": true, + \"is_read_only\": false, + \"io_engine\": \"Sync\", + \"discard\": true + }" +``` + +After the guest boots, Linux guests can usually issue discard requests with: + +```bash +fstrim -av +``` diff --git a/src/firecracker/swagger/firecracker.yaml b/src/firecracker/swagger/firecracker.yaml index d1ac91bdb6a..231b5b7752f 100644 --- a/src/firecracker/swagger/firecracker.yaml +++ b/src/firecracker/swagger/firecracker.yaml @@ -1238,6 +1238,14 @@ definitions: description: Is block read only. This field is required for virtio-block config and should be omitted for vhost-user-block configuration. + discard: + type: boolean + description: + Enables virtio-blk discard support. When enabled, the guest can issue + discard/TRIM requests for this drive. Only supported for writable + virtio-block devices using the Sync IO engine. + This field is optional for virtio-block config and should be omitted for vhost-user-block configuration. + default: false path_on_host: type: string description: diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index 45dd3865155..cdcbd53b5c1 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -894,6 +894,7 @@ pub(crate) mod tests { cache_type: custom_block_cfg.cache_type, is_read_only: Some(custom_block_cfg.is_read_only), + discard: None, path_on_host: Some( block_files .last() diff --git a/src/vmm/src/device_manager/mod.rs b/src/vmm/src/device_manager/mod.rs index 202c457b648..094b5466a70 100644 --- a/src/vmm/src/device_manager/mod.rs +++ b/src/vmm/src/device_manager/mod.rs @@ -886,6 +886,7 @@ pub(crate) mod tests { is_root_device: is_root, cache_type: CacheType::Unsafe, is_read_only: Some(false), + discard: None, path_on_host: Some(f.as_path().to_str().unwrap().to_string()), rate_limiter: None, file_engine_type: None, diff --git a/src/vmm/src/devices/virtio/block/vhost_user/device.rs b/src/vmm/src/devices/virtio/block/vhost_user/device.rs index 7df600722ab..143a393d2d1 100644 --- a/src/vmm/src/devices/virtio/block/vhost_user/device.rs +++ b/src/vmm/src/devices/virtio/block/vhost_user/device.rs @@ -67,9 +67,10 @@ impl TryFrom<&BlockDeviceConfig> for VhostUserBlockConfig { type Error = VhostUserBlockError; fn try_from(value: &BlockDeviceConfig) -> Result { - if let (Some(socket), None, None, None, None) = ( + if let (Some(socket), None, None, None, None, None) = ( &value.socket, &value.is_read_only, + &value.discard, &value.path_on_host, &value.rate_limiter, &value.file_engine_type, @@ -97,6 +98,7 @@ impl From for BlockDeviceConfig { cache_type: value.cache_type, is_read_only: None, + discard: None, path_on_host: None, rate_limiter: None, file_engine_type: None, @@ -409,6 +411,7 @@ mod tests { cache_type: CacheType::Unsafe, is_read_only: None, + discard: None, path_on_host: None, rate_limiter: None, file_engine_type: None, @@ -424,6 +427,7 @@ mod tests { cache_type: CacheType::Unsafe, is_read_only: Some(true), + discard: None, path_on_host: Some("path".to_string()), rate_limiter: None, file_engine_type: Some(FileEngineType::Sync), @@ -439,6 +443,7 @@ mod tests { cache_type: CacheType::Unsafe, is_read_only: Some(true), + discard: None, path_on_host: Some("path".to_string()), rate_limiter: None, file_engine_type: Some(FileEngineType::Sync), diff --git a/src/vmm/src/devices/virtio/block/virtio/device.rs b/src/vmm/src/devices/virtio/block/virtio/device.rs index c8e9b6651fd..70704cb6bb7 100644 --- a/src/vmm/src/devices/virtio/block/virtio/device.rs +++ b/src/vmm/src/devices/virtio/block/virtio/device.rs @@ -30,7 +30,7 @@ use crate::devices::virtio::block::CacheType; use crate::devices::virtio::block::virtio::metrics::{BlockDeviceMetrics, BlockMetricsPerDevice}; use crate::devices::virtio::device::{ActiveState, DeviceState, VirtioDevice, VirtioDeviceType}; use crate::devices::virtio::generated::virtio_blk::{ - VIRTIO_BLK_F_FLUSH, VIRTIO_BLK_F_RO, VIRTIO_BLK_ID_BYTES, + VIRTIO_BLK_F_DISCARD, VIRTIO_BLK_F_FLUSH, VIRTIO_BLK_F_RO, VIRTIO_BLK_ID_BYTES, }; use crate::devices::virtio::generated::virtio_config::VIRTIO_F_VERSION_1; use crate::devices::virtio::generated::virtio_ring::VIRTIO_RING_F_EVENT_IDX; @@ -220,6 +220,8 @@ pub struct VirtioBlockConfig { /// If set to true, the drive is opened in read-only mode. Otherwise, the /// drive is opened as read-write. pub is_read_only: bool, + /// If set to true, the device advertises discard support to the guest. + pub discard: bool, /// Path of the backing file on the host pub path_on_host: String, /// Rate Limiter for I/O operations. @@ -242,6 +244,7 @@ impl TryFrom<&BlockDeviceConfig> for VirtioBlockConfig { cache_type: value.cache_type, is_read_only: value.is_read_only.unwrap_or(false), + discard: value.discard.unwrap_or(false), path_on_host: path_on_host.clone(), rate_limiter: value.rate_limiter, file_engine_type: value.file_engine_type.unwrap_or_default(), @@ -261,6 +264,7 @@ impl From for BlockDeviceConfig { cache_type: value.cache_type, is_read_only: Some(value.is_read_only), + discard: Some(value.discard), path_on_host: Some(value.path_on_host), rate_limiter: value.rate_limiter, file_engine_type: Some(value.file_engine_type), @@ -315,6 +319,10 @@ impl VirtioBlock { /// /// The given file must be seekable and sizable. pub fn new(config: VirtioBlockConfig) -> Result { + if config.discard && config.file_engine_type == FileEngineType::Async { + return Err(VirtioBlockError::DiscardAsyncUnsupported); + } + let disk_properties = DiskProperties::new( config.path_on_host, config.is_read_only, @@ -336,6 +344,8 @@ impl VirtioBlock { if config.is_read_only { avail_features |= 1u64 << VIRTIO_BLK_F_RO; + } else if config.discard { + avail_features |= 1u64 << VIRTIO_BLK_F_DISCARD; }; let queue_evts = [EventFd::new(libc::EFD_NONBLOCK).map_err(VirtioBlockError::EventFd)?]; @@ -376,6 +386,7 @@ impl VirtioBlock { is_root_device: self.root_device, partuuid: self.partuuid.clone(), is_read_only: self.read_only, + discard: self.avail_features & (1u64 << VIRTIO_BLK_F_DISCARD) != 0, cache_type: self.cache_type, rate_limiter: rl.into_option(), file_engine_type: self.file_engine_type(), @@ -440,6 +451,7 @@ impl VirtioBlock { head.index, &active_state.mem, &self.metrics, + self.acked_features & (1u64 << VIRTIO_BLK_F_DISCARD) != 0, ) } Err(err) => { @@ -734,9 +746,9 @@ mod tests { use crate::check_metric_after_block; use crate::devices::virtio::block::virtio::IO_URING_NUM_ENTRIES; use crate::devices::virtio::block::virtio::test_utils::{ - default_block, read_blk_req_descriptors, set_queue, set_rate_limiter, - simulate_async_completion_event, simulate_queue_and_async_completion_events, - simulate_queue_event, + RequestDescriptorChain, default_block, read_blk_req_descriptors, set_queue, + set_rate_limiter, simulate_async_completion_event, + simulate_queue_and_async_completion_events, simulate_queue_event, }; use crate::devices::virtio::queue::{VIRTQ_DESC_F_NEXT, VIRTQ_DESC_F_WRITE}; use crate::devices::virtio::test_utils::{VirtQueue, default_interrupt, default_mem}; @@ -752,6 +764,7 @@ mod tests { cache_type: CacheType::Unsafe, is_read_only: Some(true), + discard: None, path_on_host: Some("path".to_string()), rate_limiter: None, file_engine_type: Default::default(), @@ -767,6 +780,7 @@ mod tests { cache_type: CacheType::Unsafe, is_read_only: None, + discard: None, path_on_host: None, rate_limiter: None, file_engine_type: Default::default(), @@ -782,6 +796,7 @@ mod tests { cache_type: CacheType::Unsafe, is_read_only: Some(true), + discard: None, path_on_host: Some("path".to_string()), rate_limiter: None, file_engine_type: Default::default(), @@ -843,6 +858,91 @@ mod tests { } } + #[test] + fn test_discard_feature() { + let f = TempFile::new().unwrap(); + f.as_file().set_len(0x1000).unwrap(); + let path = f.as_path().to_str().unwrap().to_string(); + let config = VirtioBlockConfig { + drive_id: "test".to_string(), + path_on_host: path.clone(), + is_root_device: false, + partuuid: None, + is_read_only: false, + discard: true, + cache_type: CacheType::Unsafe, + rate_limiter: None, + file_engine_type: FileEngineType::Sync, + }; + + let block = VirtioBlock::new(config).unwrap(); + assert_eq!( + block.avail_features & (1u64 << VIRTIO_BLK_F_DISCARD), + 1u64 << VIRTIO_BLK_F_DISCARD + ); + + let async_config = VirtioBlockConfig { + drive_id: "test".to_string(), + path_on_host: path, + is_root_device: false, + partuuid: None, + is_read_only: false, + discard: true, + cache_type: CacheType::Unsafe, + rate_limiter: None, + file_engine_type: FileEngineType::Async, + }; + assert!(matches!( + VirtioBlock::new(async_config), + Err(VirtioBlockError::DiscardAsyncUnsupported) + )); + } + + #[test] + fn test_discard_requires_negotiated_feature() { + let f = TempFile::new().unwrap(); + f.as_file().set_len(0x1000).unwrap(); + let config = VirtioBlockConfig { + drive_id: "test".to_string(), + path_on_host: f.as_path().to_str().unwrap().to_string(), + is_root_device: false, + partuuid: None, + is_read_only: false, + discard: true, + cache_type: CacheType::Unsafe, + rate_limiter: None, + file_engine_type: FileEngineType::Sync, + }; + + let mut block = VirtioBlock::new(config).unwrap(); + let mem = default_mem(); + let interrupt = default_interrupt(); + let vq = VirtQueue::new(GuestAddress(0), &mem, 16); + set_queue(&mut block, 0, vq.create_queue()); + block.activate(mem.clone(), interrupt).unwrap(); + + let chain = RequestDescriptorChain::new(&vq); + chain.set_header(RequestHeader::new(VIRTIO_BLK_T_DISCARD, 0)); + chain.data_desc.flags.set(VIRTQ_DESC_F_NEXT); + chain.data_desc.len.set(16); + + let mut discard_range = [0_u8; 16]; + discard_range[8..12].copy_from_slice(&1_u32.to_le_bytes()); + mem.write_slice(&discard_range, GuestAddress(chain.data_desc.addr.get())) + .unwrap(); + + simulate_queue_event(&mut block, Some(true)); + + assert_eq!(vq.used.idx.get(), 1); + assert_eq!(vq.used.ring[0].get().id, 0); + assert_eq!(vq.used.ring[0].get().len, 1); + assert_eq!( + mem.read_obj::(GuestAddress(chain.status_desc.addr.get())) + .unwrap(), + VIRTIO_BLK_S_UNSUPP + ); + } + #[test] fn test_virtio_read_config() { for engine in [FileEngineType::Sync, FileEngineType::Async] { diff --git a/src/vmm/src/devices/virtio/block/virtio/io/async_io.rs b/src/vmm/src/devices/virtio/block/virtio/io/async_io.rs index f83d9dea1df..de35cf4a703 100644 --- a/src/vmm/src/devices/virtio/block/virtio/io/async_io.rs +++ b/src/vmm/src/devices/virtio/block/virtio/io/async_io.rs @@ -19,6 +19,8 @@ use crate::vstate::memory::{GuestAddress, GuestMemory, GuestMemoryExtension, Gue #[derive(Debug, thiserror::Error, displaydoc::Display)] pub enum AsyncIoError { + /// Not implemented + NotImplemented, /// IO: {0} IO(std::io::Error), /// IoUring: {0} @@ -198,6 +200,10 @@ impl AsyncFileEngine { }) } + pub fn discard(&mut self, _range: (u64, u64)) -> Result { + Err(AsyncIoError::NotImplemented) + } + pub fn kick_submission_queue(&mut self) -> Result<(), AsyncIoError> { self.ring .submit() diff --git a/src/vmm/src/devices/virtio/block/virtio/io/mod.rs b/src/vmm/src/devices/virtio/block/virtio/io/mod.rs index b7aa8061d76..158d9d150e8 100644 --- a/src/vmm/src/devices/virtio/block/virtio/io/mod.rs +++ b/src/vmm/src/devices/virtio/block/virtio/io/mod.rs @@ -157,6 +157,29 @@ impl FileEngine { } } + pub fn discard( + &mut self, + range: (u64, u64), + req: PendingRequest, + ) -> Result> { + match self { + FileEngine::Async(engine) => match engine.discard(range) { + Ok(count) => Ok(FileEngineOk::Executed(RequestOk { req, count })), + Err(err) => Err(RequestError { + req, + error: BlockIoError::Async(err), + }), + }, + FileEngine::Sync(engine) => match engine.discard(range) { + Ok(count) => Ok(FileEngineOk::Executed(RequestOk { req, count })), + Err(err) => Err(RequestError { + req, + error: BlockIoError::Sync(err), + }), + }, + } + } + pub fn drain(&mut self, discard: bool) -> Result<(), BlockIoError> { match self { FileEngine::Async(engine) => engine.drain(discard).map_err(BlockIoError::Async), diff --git a/src/vmm/src/devices/virtio/block/virtio/io/sync_io.rs b/src/vmm/src/devices/virtio/block/virtio/io/sync_io.rs index eec3b3d8b8d..3389b90e29f 100644 --- a/src/vmm/src/devices/virtio/block/virtio/io/sync_io.rs +++ b/src/vmm/src/devices/virtio/block/virtio/io/sync_io.rs @@ -3,6 +3,8 @@ use std::fs::File; use std::io::{Seek, SeekFrom, Write}; +use std::os::fd::AsRawFd; +use std::os::unix::fs::FileTypeExt; use vm_memory::{GuestMemoryError, ReadVolatile, WriteVolatile}; @@ -10,6 +12,8 @@ use crate::vstate::memory::{GuestAddress, GuestMemory, GuestMemoryMmap}; #[derive(Debug, thiserror::Error, displaydoc::Display)] pub enum SyncIoError { + /// Discard: {0} + Discard(std::io::Error), /// Flush: {0} Flush(std::io::Error), /// Seek: {0} @@ -28,6 +32,50 @@ pub struct SyncFileEngine { // SAFETY: `File` is send and ultimately a POD. unsafe impl Send for SyncFileEngine {} +const BLKDISCARD: libc::Ioctl = 0x1277; +const FALLOC_FL_KEEP_SIZE: libc::c_int = 0x01; +const FALLOC_FL_PUNCH_HOLE: libc::c_int = 0x02; + +pub(super) fn discard_file(file: &File, range: (u64, u64)) -> Result { + let file_type = file.metadata()?.file_type(); + let (offset, len) = range; + + if len == 0 { + return Ok(0); + } + + if file_type.is_block_device() { + let mut range = [offset, len]; + // SAFETY: file is a valid fd, BLKDISCARD expects a pointer to two u64 values + // representing byte offset and byte length. + let ret = unsafe { libc::ioctl(file.as_raw_fd(), BLKDISCARD, range.as_mut_ptr()) }; + if ret < 0 { + return Err(std::io::Error::last_os_error()); + } + } else { + let off = libc::off_t::try_from(offset).map_err(|_| { + std::io::Error::new(std::io::ErrorKind::InvalidInput, "discard offset overflow") + })?; + let len = libc::off_t::try_from(len).map_err(|_| { + std::io::Error::new(std::io::ErrorKind::InvalidInput, "discard length overflow") + })?; + // SAFETY: file is a valid fd and fallocate does not retain the passed values. + let ret = unsafe { + libc::fallocate( + file.as_raw_fd(), + FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, + off, + len, + ) + }; + if ret < 0 { + return Err(std::io::Error::last_os_error()); + } + } + + Ok(u32::try_from(len).unwrap_or(u32::MAX)) +} + impl SyncFileEngine { pub fn from_file(file: File) -> SyncFileEngine { SyncFileEngine { file } @@ -81,4 +129,46 @@ impl SyncFileEngine { // Sync data out to physical media on host. self.file.sync_all().map_err(SyncIoError::SyncAll) } + + pub fn discard(&mut self, range: (u64, u64)) -> Result { + discard_file(&self.file, range).map_err(SyncIoError::Discard) + } +} + +#[cfg(test)] +mod tests { + use std::io::{Read, Seek, SeekFrom, Write}; + + use vmm_sys_util::tempfile::TempFile; + + use super::discard_file; + + #[test] + fn test_discard_regular_file() { + let mut file = TempFile::new().unwrap().into_file(); + file.write_all(&vec![0x5a; 4096]).unwrap(); + + match discard_file(&file, (1024, 2048)) { + Ok(discarded) => assert_eq!(discarded, 2048), + // Some filesystems do not support hole punching; that is a host/filesystem + // capability limitation, not a test failure for the regular discard path. + Err(err) + if matches!( + err.raw_os_error(), + Some(libc::EOPNOTSUPP) | Some(libc::ENOSYS) + ) => + { + return; + } + Err(err) => panic!("discard failed: {err}"), + } + + file.seek(SeekFrom::Start(0)).unwrap(); + let mut data = vec![0u8; 4096]; + file.read_exact(&mut data).unwrap(); + + assert_eq!(&data[..1024], vec![0x5a; 1024].as_slice()); + assert_eq!(&data[1024..3072], vec![0; 2048].as_slice()); + assert_eq!(&data[3072..], vec![0x5a; 1024].as_slice()); + } } diff --git a/src/vmm/src/devices/virtio/block/virtio/mod.rs b/src/vmm/src/devices/virtio/block/virtio/mod.rs index 52ddde77875..e3cdcaab605 100644 --- a/src/vmm/src/devices/virtio/block/virtio/mod.rs +++ b/src/vmm/src/devices/virtio/block/virtio/mod.rs @@ -56,6 +56,8 @@ pub enum VirtioBlockError { InvalidDataLength, /// The requested operation would cause a seek beyond disk end. InvalidOffset, + /// Discard is not supported with the async IO engine. + DiscardAsyncUnsupported, /// Guest gave us a read only descriptor that protocol says to write to. UnexpectedReadOnlyDescriptor, /// Guest gave us a write only descriptor that protocol says to read from. diff --git a/src/vmm/src/devices/virtio/block/virtio/persist.rs b/src/vmm/src/devices/virtio/block/virtio/persist.rs index c4288460a56..189ef35347e 100644 --- a/src/vmm/src/devices/virtio/block/virtio/persist.rs +++ b/src/vmm/src/devices/virtio/block/virtio/persist.rs @@ -110,9 +110,7 @@ impl Persist<'_> for VirtioBlock { let avail_features = state.virtio_state.avail_features; let acked_features = state.virtio_state.acked_features; - let config_space = ConfigSpace { - capacity: disk_properties.nsectors.to_le(), - }; + let config_space = ConfigSpace::new(disk_properties.nsectors); Ok(VirtioBlock { avail_features, @@ -159,6 +157,7 @@ mod tests { is_root_device: false, partuuid: None, is_read_only: false, + discard: false, cache_type: CacheType::Writeback, rate_limiter: None, file_engine_type: FileEngineType::default(), @@ -200,6 +199,7 @@ mod tests { is_root_device: false, partuuid: None, is_read_only: false, + discard: false, cache_type: CacheType::Unsafe, rate_limiter: None, file_engine_type: FileEngineType::default(), diff --git a/src/vmm/src/devices/virtio/block/virtio/request.rs b/src/vmm/src/devices/virtio/block/virtio/request.rs index feaa811e221..d2a1cddc49a 100644 --- a/src/vmm/src/devices/virtio/block/virtio/request.rs +++ b/src/vmm/src/devices/virtio/block/virtio/request.rs @@ -6,24 +6,27 @@ // found in the THIRD-PARTY file. use std::convert::From; +use std::mem::size_of; use vm_memory::GuestMemoryError; -use super::{SECTOR_SHIFT, SECTOR_SIZE, VirtioBlockError, io as block_io}; +use super::{MAX_DISCARD_SECTORS, SECTOR_SHIFT, SECTOR_SIZE, VirtioBlockError, io as block_io}; use crate::devices::virtio::block::virtio::device::DiskProperties; use crate::devices::virtio::block::virtio::metrics::BlockDeviceMetrics; pub use crate::devices::virtio::generated::virtio_blk::{ VIRTIO_BLK_ID_BYTES, VIRTIO_BLK_S_IOERR, VIRTIO_BLK_S_OK, VIRTIO_BLK_S_UNSUPP, - VIRTIO_BLK_T_FLUSH, VIRTIO_BLK_T_GET_ID, VIRTIO_BLK_T_IN, VIRTIO_BLK_T_OUT, + VIRTIO_BLK_T_DISCARD, VIRTIO_BLK_T_FLUSH, VIRTIO_BLK_T_GET_ID, VIRTIO_BLK_T_IN, + VIRTIO_BLK_T_OUT, }; use crate::devices::virtio::queue::DescriptorChain; use crate::logger::{IncMetric, error}; use crate::rate_limiter::{RateLimiter, TokenType}; -use crate::vstate::memory::{ByteValued, Bytes, GuestAddress, GuestMemoryMmap}; +use crate::vstate::memory::{Address, ByteValued, Bytes, GuestAddress, GuestMemoryMmap}; -#[derive(Debug, derive_more::From)] +#[derive(Debug)] pub enum IoErr { GetId(GuestMemoryError), + UnsupportedRequest(u32), PartialTransfer { completed: u32, expected: u32 }, FileEngine(block_io::BlockIoError), } @@ -32,6 +35,7 @@ pub enum IoErr { pub enum RequestType { In, Out, + Discard, Flush, GetDeviceID, Unsupported(u32), @@ -42,6 +46,7 @@ impl From for RequestType { match value { VIRTIO_BLK_T_IN => RequestType::In, VIRTIO_BLK_T_OUT => RequestType::Out, + VIRTIO_BLK_T_DISCARD => RequestType::Discard, VIRTIO_BLK_T_FLUSH => RequestType::Flush, VIRTIO_BLK_T_GET_ID => RequestType::GetDeviceID, t => RequestType::Unsupported(t), @@ -167,6 +172,9 @@ impl PendingRequest { } status } + (Ok(_), RequestType::Discard) => Status::Ok { + num_bytes_to_mem: 0, + }, (Ok(_), RequestType::Flush) => { block_metrics.flush_count.inc(); Status::Ok { @@ -176,7 +184,9 @@ impl PendingRequest { (Ok(transferred_data_len), RequestType::GetDeviceID) => { Status::from_data(self.data_len, transferred_data_len, true) } - (_, RequestType::Unsupported(op)) => Status::Unsupported { op }, + (Err(IoErr::UnsupportedRequest(op)), _) | (_, RequestType::Unsupported(op)) => { + Status::Unsupported { op } + } (Err(err), _) => Status::IoErr { num_bytes_to_mem: 0, err, @@ -207,6 +217,21 @@ pub struct RequestHeader { // SAFETY: Safe because RequestHeader only contains plain data. unsafe impl ByteValued for RequestHeader {} +#[derive(Debug, Copy, Clone, Default)] +#[repr(C)] +pub struct DiscardWriteZeroes { + sector: u64, + num_sectors: u32, + flags: u32, +} + +// SAFETY: Safe because DiscardWriteZeroes only contains plain data and has no padding. +unsafe impl ByteValued for DiscardWriteZeroes {} + +fn discard_range_size() -> u32 { + u32::try_from(size_of::()).expect("discard range size fits in u32") +} + impl RequestHeader { pub fn new(request_type: u32, sector: u64) -> RequestHeader { RequestHeader { @@ -238,6 +263,7 @@ pub struct Request { pub status_addr: GuestAddress, sector: u64, data_addr: GuestAddress, + discard_range: Option<(u64, u64)>, } impl Request { @@ -256,6 +282,7 @@ impl Request { r#type: RequestType::from(request_header.request_type), sector: request_header.sector, data_addr: GuestAddress(0), + discard_range: None, data_len: 0, status_addr: GuestAddress(0), }; @@ -278,7 +305,9 @@ impl Request { .next_descriptor() .ok_or(VirtioBlockError::DescriptorChainTooShort)?; - if data_desc.is_write_only() && req.r#type == RequestType::Out { + if data_desc.is_write_only() + && (req.r#type == RequestType::Out || req.r#type == RequestType::Discard) + { return Err(VirtioBlockError::UnexpectedWriteOnlyDescriptor); } if !data_desc.is_write_only() && req.r#type == RequestType::In { @@ -311,6 +340,9 @@ impl Request { RequestType::GetDeviceID if req.data_len < VIRTIO_BLK_ID_BYTES => { return Err(VirtioBlockError::InvalidDataLength); } + RequestType::Discard => { + req.validate_discard_ranges(mem, num_disk_sectors)?; + } _ => {} } @@ -328,6 +360,41 @@ impl Request { Ok(req) } + fn validate_discard_ranges( + &mut self, + mem: &GuestMemoryMmap, + num_disk_sectors: u64, + ) -> Result<(), VirtioBlockError> { + if self.data_len != discard_range_size() { + return Err(VirtioBlockError::InvalidDataLength); + } + + let range: DiscardWriteZeroes = mem + .read_obj(self.data_addr) + .map_err(VirtioBlockError::GuestMemory)?; + if range.num_sectors == 0 || range.num_sectors > MAX_DISCARD_SECTORS || range.flags != 0 { + return Err(VirtioBlockError::InvalidDataLength); + } + let top_sector = range + .sector + .checked_add(u64::from(range.num_sectors)) + .ok_or(VirtioBlockError::InvalidOffset)?; + if top_sector > num_disk_sectors { + return Err(VirtioBlockError::InvalidOffset); + } + + let byte_offset = range + .sector + .checked_mul(u64::from(SECTOR_SIZE)) + .ok_or(VirtioBlockError::InvalidOffset)?; + let byte_len = u64::from(range.num_sectors) + .checked_mul(u64::from(SECTOR_SIZE)) + .ok_or(VirtioBlockError::InvalidDataLength)?; + self.discard_range = Some((byte_offset, byte_len)); + + Ok(()) + } + pub(crate) fn rate_limit(&self, rate_limiter: &mut RateLimiter) -> bool { // If limiter.consume() fails it means there is no more TokenType::Ops // budget and rate limiting is in effect. @@ -367,6 +434,7 @@ impl Request { desc_idx: u16, mem: &GuestMemoryMmap, block_metrics: &BlockDeviceMetrics, + discard_supported: bool, ) -> ProcessingResult { let pending = self.to_pending_request(desc_idx); let res = match self.r#type { @@ -380,6 +448,21 @@ impl Request { disk.file_engine .write(self.offset(), mem, self.data_addr, self.data_len, pending) } + RequestType::Discard => { + if !discard_supported { + return ProcessingResult::Executed(pending.finish( + mem, + Err(IoErr::UnsupportedRequest(VIRTIO_BLK_T_DISCARD)), + block_metrics, + )); + } + + disk.file_engine.discard( + self.discard_range + .expect("discard request missing validated range"), + pending, + ) + } RequestType::Flush => disk.file_engine.flush(pending), RequestType::GetDeviceID => { let res = mem @@ -445,6 +528,7 @@ mod tests { let supported_request_types = vec![ VIRTIO_BLK_T_IN, VIRTIO_BLK_T_OUT, + VIRTIO_BLK_T_DISCARD, VIRTIO_BLK_T_FLUSH, VIRTIO_BLK_T_GET_ID, ]; @@ -468,6 +552,10 @@ mod tests { fn test_request_type_from() { assert_eq!(RequestType::from(VIRTIO_BLK_T_IN), RequestType::In); assert_eq!(RequestType::from(VIRTIO_BLK_T_OUT), RequestType::Out); + assert_eq!( + RequestType::from(VIRTIO_BLK_T_DISCARD), + RequestType::Discard + ); assert_eq!(RequestType::from(VIRTIO_BLK_T_FLUSH), RequestType::Flush); assert_eq!( RequestType::from(VIRTIO_BLK_T_GET_ID), @@ -643,6 +731,74 @@ mod tests { chain.check_parse(false); } + #[test] + fn test_parse_discard() { + let mem = &default_mem(); + let queue = VirtQueue::new(GuestAddress(0), mem, 16); + let chain = RequestDescriptorChain::new(&queue); + + let request_header = RequestHeader::new(VIRTIO_BLK_T_DISCARD, 0); + chain.set_header(request_header); + + // Write only data descriptor for Discard. + chain + .data_desc + .flags + .set(VIRTQ_DESC_F_NEXT | VIRTQ_DESC_F_WRITE); + chain.check_parse_err(VirtioBlockError::UnexpectedWriteOnlyDescriptor); + + chain.data_desc.flags.set(VIRTQ_DESC_F_NEXT); + chain.data_desc.len.set(15); + chain.check_parse_err(VirtioBlockError::InvalidDataLength); + + let range = DiscardWriteZeroes { + sector: 0, + num_sectors: 0, + flags: 0, + }; + chain.data_desc.len.set(discard_range_size()); + mem.write_obj(range, GuestAddress(chain.data_desc.addr.get())) + .unwrap(); + chain.check_parse_err(VirtioBlockError::InvalidDataLength); + + let range = DiscardWriteZeroes { + sector: 0, + num_sectors: 1, + flags: 1, + }; + mem.write_obj(range, GuestAddress(chain.data_desc.addr.get())) + .unwrap(); + chain.check_parse_err(VirtioBlockError::InvalidDataLength); + + let range = DiscardWriteZeroes { + sector: 0, + num_sectors: MAX_DISCARD_SECTORS + 1, + flags: 0, + }; + mem.write_obj(range, GuestAddress(chain.data_desc.addr.get())) + .unwrap(); + chain.check_parse_err(VirtioBlockError::InvalidDataLength); + + let range = DiscardWriteZeroes { + sector: NUM_DISK_SECTORS - 1, + num_sectors: 2, + flags: 0, + }; + chain.data_desc.len.set(discard_range_size()); + mem.write_obj(range, GuestAddress(chain.data_desc.addr.get())) + .unwrap(); + chain.check_parse_err(VirtioBlockError::InvalidOffset); + + let range = DiscardWriteZeroes { + sector: NUM_DISK_SECTORS - 1, + num_sectors: 1, + flags: 0, + }; + mem.write_obj(range, GuestAddress(chain.data_desc.addr.get())) + .unwrap(); + chain.check_parse(true); + } + #[test] fn test_parse_get_id() { let mem = &default_mem(); @@ -695,6 +851,7 @@ mod tests { (u32, std::sync::Arc Self>), (u32, std::sync::Arc Self>), (u32, std::sync::Arc Self>), + (u32, std::sync::Arc Self>), ( u32, std::sync::Arc::Strategy, fn(u32) -> Self>>, @@ -702,20 +859,25 @@ mod tests { )>; fn arbitrary_with(_: Self::Parameters) -> Self::Strategy { + const FIRST_UNSUPPORTED_REQUEST_ID: u32 = VIRTIO_BLK_T_DISCARD + 1; + // All strategies have the same weight, there is no reson currently to skew // the rations to increase the odds of a specific request type. TupleUnion::new(( (1u32, std::sync::Arc::new(|| RequestType::In {})), (1u32, std::sync::Arc::new(|| RequestType::Out {})), + (1u32, std::sync::Arc::new(|| RequestType::Discard {})), (1u32, std::sync::Arc::new(|| RequestType::Flush {})), (1u32, std::sync::Arc::new(|| RequestType::GetDeviceID {})), ( 1u32, std::sync::Arc::new(Strategy::prop_map(any::(), |id| { - // Random unsupported requests for our implementation start at - // VIRTIO_BLK_T_GET_ID + 1 = 9. - // This can be further refined to include unsupported requests ids < 9. - RequestType::Unsupported(id.checked_add(9).unwrap_or(9)) + // Random unsupported requests for our implementation start above the + // highest request id we support. + RequestType::Unsupported( + id.checked_add(FIRST_UNSUPPORTED_REQUEST_ID) + .unwrap_or(FIRST_UNSUPPORTED_REQUEST_ID), + ) })), ), )) @@ -727,6 +889,7 @@ mod tests { match request_type { RequestType::In => VIRTIO_BLK_T_IN, RequestType::Out => VIRTIO_BLK_T_OUT, + RequestType::Discard => VIRTIO_BLK_T_DISCARD, RequestType::Flush => VIRTIO_BLK_T_FLUSH, RequestType::GetDeviceID => VIRTIO_BLK_T_GET_ID, RequestType::Unsupported(id) => id, @@ -738,7 +901,7 @@ mod tests { fn request_type_flags(request_type: RequestType) -> u16 { match request_type { RequestType::In => VIRTQ_DESC_F_NEXT | VIRTQ_DESC_F_WRITE, - RequestType::Out => VIRTQ_DESC_F_NEXT, + RequestType::Out | RequestType::Discard => VIRTQ_DESC_F_NEXT, RequestType::Flush => VIRTQ_DESC_F_NEXT, RequestType::GetDeviceID => VIRTQ_DESC_F_NEXT | VIRTQ_DESC_F_WRITE, RequestType::Unsupported(_) => VIRTQ_DESC_F_NEXT, @@ -823,7 +986,11 @@ mod tests { // Make sure that data_len is a multiple of 512 // and that 512 <= data_len <= (4096 + 512). - let valid_data_len = ((data_len & 4096) | (SECTOR_SIZE - 1)) + 1; + let valid_data_len = if request_type == RequestType::Discard { + discard_range_size() + } else { + ((data_len & 4096) | (SECTOR_SIZE - 1)) + 1 + }; let sectors_len = u64::from(valid_data_len / SECTOR_SIZE); // Craft a random request with the randomized parameters. let mut request = Request { @@ -832,6 +999,7 @@ mod tests { status_addr, sector: sector & (NUM_DISK_SECTORS - sectors_len), data_addr, + discard_range: None, }; let mut request_header = RequestHeader::new(virtio_request_id, request.sector); @@ -851,6 +1019,23 @@ mod tests { request_type_flags(request.r#type), 2, ); + + if request.r#type == RequestType::Discard { + let discard_sector = sector % NUM_DISK_SECTORS; + mem.write_obj( + DiscardWriteZeroes { + sector: discard_sector, + num_sectors: 1, + flags: 0, + }, + request.data_addr, + ) + .unwrap(); + request.discard_range = Some(( + discard_sector * u64::from(SECTOR_SIZE), + u64::from(SECTOR_SIZE), + )); + } } chain @@ -889,7 +1074,7 @@ mod tests { if *coins.next().unwrap() { match request.r#type { // Readonly buffer is writable. - RequestType::Out => { + RequestType::Out | RequestType::Discard => { data_desc_flags.set(data_desc_flags.get() | VIRTQ_DESC_F_WRITE); return (Err(VirtioBlockError::UnexpectedWriteOnlyDescriptor), mem, q); } @@ -913,6 +1098,11 @@ mod tests { .set(valid_data_len + (data_len % 511) + 1); return (Err(VirtioBlockError::InvalidDataLength), mem, q); } + RequestType::Discard => { + // data_len is not a multiple of discard range size. + chain.data_desc.len.set(valid_data_len + 1); + return (Err(VirtioBlockError::InvalidDataLength), mem, q); + } RequestType::GetDeviceID => { // data_len is < VIRTIO_BLK_ID_BYTES chain @@ -933,6 +1123,18 @@ mod tests { chain.set_header(request_header); return (Err(VirtioBlockError::InvalidOffset), mem, q); } + RequestType::Discard => { + mem.write_obj( + DiscardWriteZeroes { + sector: NUM_DISK_SECTORS, + num_sectors: 1, + flags: 0, + }, + request.data_addr, + ) + .unwrap(); + return (Err(VirtioBlockError::InvalidOffset), mem, q); + } _ => {} }; } diff --git a/src/vmm/src/devices/virtio/block/virtio/test_utils.rs b/src/vmm/src/devices/virtio/block/virtio/test_utils.rs index e4f23c6a038..8567d091d79 100644 --- a/src/vmm/src/devices/virtio/block/virtio/test_utils.rs +++ b/src/vmm/src/devices/virtio/block/virtio/test_utils.rs @@ -43,6 +43,7 @@ pub fn default_block_with_path(path: String, file_engine_type: FileEngineType) - is_root_device: false, partuuid: None, is_read_only: false, + discard: false, cache_type: CacheType::Unsafe, // Rate limiting is enabled but with a high operation rate (10 million ops/s). rate_limiter: Some(RateLimiterConfig { diff --git a/src/vmm/src/resources.rs b/src/vmm/src/resources.rs index 64254170e76..cf23ba25730 100644 --- a/src/vmm/src/resources.rs +++ b/src/vmm/src/resources.rs @@ -621,6 +621,7 @@ mod tests { cache_type: CacheType::Unsafe, is_read_only: Some(false), + discard: None, path_on_host: Some(tmp_file.as_path().to_str().unwrap().to_string()), rate_limiter: Some(RateLimiterConfig::default()), file_engine_type: None, diff --git a/src/vmm/src/vmm_config/drive.rs b/src/vmm/src/vmm_config/drive.rs index 2d3fddac830..397d75e954a 100644 --- a/src/vmm/src/vmm_config/drive.rs +++ b/src/vmm/src/vmm_config/drive.rs @@ -51,6 +51,8 @@ pub struct BlockDeviceConfig { /// If set to true, the drive is opened in read-only mode. Otherwise, the /// drive is opened as read-write. pub is_read_only: Option, + /// If set to true, the drive advertises discard support to the guest. + pub discard: Option, /// Path of the drive. pub path_on_host: Option, /// Rate Limiter for I/O operations. @@ -208,6 +210,7 @@ mod tests { partuuid: self.partuuid.clone(), is_root_device: self.is_root_device, is_read_only: self.is_read_only, + discard: self.discard, cache_type: self.cache_type, path_on_host: self.path_on_host.clone(), @@ -237,6 +240,7 @@ mod tests { cache_type: CacheType::Writeback, is_read_only: Some(false), + discard: None, path_on_host: Some(dummy_path), rate_limiter: None, file_engine_type: None, @@ -271,6 +275,7 @@ mod tests { cache_type: CacheType::Unsafe, is_read_only: Some(true), + discard: None, path_on_host: Some(dummy_path), rate_limiter: None, file_engine_type: None, @@ -303,6 +308,7 @@ mod tests { cache_type: CacheType::Unsafe, is_read_only: Some(true), + discard: None, path_on_host: Some(dummy_path), rate_limiter: None, file_engine_type: None, @@ -332,6 +338,7 @@ mod tests { cache_type: CacheType::Unsafe, is_read_only: Some(false), + discard: None, path_on_host: Some(dummy_path_1), rate_limiter: None, file_engine_type: None, @@ -348,6 +355,7 @@ mod tests { cache_type: CacheType::Unsafe, is_read_only: Some(false), + discard: None, path_on_host: Some(dummy_path_2), rate_limiter: None, file_engine_type: None, @@ -375,6 +383,7 @@ mod tests { cache_type: CacheType::Unsafe, is_read_only: Some(false), + discard: None, path_on_host: Some(dummy_path_1), rate_limiter: None, file_engine_type: None, @@ -391,6 +400,7 @@ mod tests { cache_type: CacheType::Unsafe, is_read_only: Some(false), + discard: None, path_on_host: Some(dummy_path_2), rate_limiter: None, file_engine_type: None, @@ -407,6 +417,7 @@ mod tests { cache_type: CacheType::Unsafe, is_read_only: Some(false), + discard: None, path_on_host: Some(dummy_path_3), rate_limiter: None, file_engine_type: None, @@ -448,6 +459,7 @@ mod tests { cache_type: CacheType::Unsafe, is_read_only: Some(false), + discard: None, path_on_host: Some(dummy_path_1), rate_limiter: None, file_engine_type: None, @@ -464,6 +476,7 @@ mod tests { cache_type: CacheType::Unsafe, is_read_only: Some(false), + discard: None, path_on_host: Some(dummy_path_2), rate_limiter: None, file_engine_type: None, @@ -480,6 +493,7 @@ mod tests { cache_type: CacheType::Unsafe, is_read_only: Some(false), + discard: None, path_on_host: Some(dummy_path_3), rate_limiter: None, file_engine_type: None, @@ -522,6 +536,7 @@ mod tests { cache_type: CacheType::Unsafe, is_read_only: Some(false), + discard: None, path_on_host: Some(dummy_path_1.clone()), rate_limiter: None, file_engine_type: None, @@ -538,6 +553,7 @@ mod tests { cache_type: CacheType::Unsafe, is_read_only: Some(false), + discard: None, path_on_host: Some(dummy_path_2.clone()), rate_limiter: None, file_engine_type: None, @@ -610,6 +626,7 @@ mod tests { cache_type: CacheType::Unsafe, is_read_only: Some(false), + discard: None, path_on_host: Some(dummy_path_1), rate_limiter: None, file_engine_type: None, @@ -626,6 +643,7 @@ mod tests { cache_type: CacheType::Unsafe, is_read_only: Some(false), + discard: None, path_on_host: Some(dummy_path_2), rate_limiter: None, file_engine_type: None, @@ -652,6 +670,7 @@ mod tests { cache_type: CacheType::Unsafe, is_read_only: Some(true), + discard: None, path_on_host: Some(dummy_file.as_path().to_str().unwrap().to_string()), rate_limiter: None, file_engine_type: Some(FileEngineType::Sync), @@ -682,6 +701,7 @@ mod tests { cache_type: CacheType::default(), is_read_only: Some(true), + discard: None, path_on_host: Some(backing_file.as_path().to_str().unwrap().to_string()), rate_limiter: None, file_engine_type: None, diff --git a/src/vmm/tests/integration_tests.rs b/src/vmm/tests/integration_tests.rs index 45bc3301cf6..6758a7d4a05 100644 --- a/src/vmm/tests/integration_tests.rs +++ b/src/vmm/tests/integration_tests.rs @@ -433,6 +433,7 @@ fn test_preboot_load_snap_disallowed_after_boot_resources() { cache_type: CacheType::Unsafe, is_read_only: Some(false), + discard: None, path_on_host: Some(tmp_file), rate_limiter: None, file_engine_type: None, From 560d2f89b6607cf4ffb6a7680c8f97f6c4bd7935 Mon Sep 17 00:00:00 2001 From: Jonas Savulionis Date: Fri, 29 May 2026 19:29:40 +0300 Subject: [PATCH 3/5] seccomp: allow discard syscalls for virtio-blk Allow fallocate hole punching and BLKDISCARD ioctl calls needed by the sync virtio-blk discard implementation. Signed-off-by: Jonas Savulionis --- .../seccomp/aarch64-unknown-linux-musl.json | 26 +++++++++++++++++++ .../seccomp/x86_64-unknown-linux-musl.json | 26 +++++++++++++++++++ 2 files changed, 52 insertions(+) diff --git a/resources/seccomp/aarch64-unknown-linux-musl.json b/resources/seccomp/aarch64-unknown-linux-musl.json index 1e0047266e6..53fd6fff4a4 100644 --- a/resources/seccomp/aarch64-unknown-linux-musl.json +++ b/resources/seccomp/aarch64-unknown-linux-musl.json @@ -42,6 +42,19 @@ { "syscall": "fsync" }, + { + "syscall": "fallocate", + "comment": "Used by the VirtIO block device to punch holes for discard on regular backing files", + "args": [ + { + "index": 1, + "type": "dword", + "op": "eq", + "val": 3, + "comment": "FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE" + } + ] + }, { "syscall": "close" }, @@ -519,6 +532,19 @@ } ] }, + { + "syscall": "ioctl", + "comment": "Used by the VirtIO block device to pass discard through to block-device backing files", + "args": [ + { + "index": 1, + "type": "dword", + "op": "eq", + "val": 4727, + "comment": "BLKDISCARD" + } + ] + }, { "syscall": "ioctl", "comment": "Used to make vsock UDS nonblocking", diff --git a/resources/seccomp/x86_64-unknown-linux-musl.json b/resources/seccomp/x86_64-unknown-linux-musl.json index ea4d49e98b5..0007318a5a1 100644 --- a/resources/seccomp/x86_64-unknown-linux-musl.json +++ b/resources/seccomp/x86_64-unknown-linux-musl.json @@ -42,6 +42,19 @@ { "syscall": "fsync" }, + { + "syscall": "fallocate", + "comment": "Used by the VirtIO block device to punch holes for discard on regular backing files", + "args": [ + { + "index": 1, + "type": "dword", + "op": "eq", + "val": 3, + "comment": "FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE" + } + ] + }, { "syscall": "close" }, @@ -571,6 +584,19 @@ } ] }, + { + "syscall": "ioctl", + "comment": "Used by the VirtIO block device to pass discard through to block-device backing files", + "args": [ + { + "index": 1, + "type": "dword", + "op": "eq", + "val": 4727, + "comment": "BLKDISCARD" + } + ] + }, { "syscall": "ioctl", "args": [ From 6b74e51f25dab73ef2df7dc3642984168e0c432d Mon Sep 17 00:00:00 2001 From: Jonas Savulionis Date: Sun, 7 Jun 2026 22:21:51 +0300 Subject: [PATCH 4/5] virtio-blk: reuse request fields for discard range Store the validated discard byte offset and length in the existing Request fields instead of carrying a separate discard range. Signed-off-by: Jonas Savulionis --- .../devices/virtio/block/virtio/io/sync_io.rs | 5 ++- .../devices/virtio/block/virtio/request.rs | 43 ++++++++++--------- 2 files changed, 27 insertions(+), 21 deletions(-) diff --git a/src/vmm/src/devices/virtio/block/virtio/io/sync_io.rs b/src/vmm/src/devices/virtio/block/virtio/io/sync_io.rs index 3389b90e29f..5c1b0f9316d 100644 --- a/src/vmm/src/devices/virtio/block/virtio/io/sync_io.rs +++ b/src/vmm/src/devices/virtio/block/virtio/io/sync_io.rs @@ -43,6 +43,9 @@ pub(super) fn discard_file(file: &File, range: (u64, u64)) -> Result Result, } impl Request { @@ -282,7 +281,6 @@ impl Request { r#type: RequestType::from(request_header.request_type), sector: request_header.sector, data_addr: GuestAddress(0), - discard_range: None, data_len: 0, status_addr: GuestAddress(0), }; @@ -390,7 +388,10 @@ impl Request { let byte_len = u64::from(range.num_sectors) .checked_mul(u64::from(SECTOR_SIZE)) .ok_or(VirtioBlockError::InvalidDataLength)?; - self.discard_range = Some((byte_offset, byte_len)); + let byte_len = u32::try_from(byte_len).map_err(|_| VirtioBlockError::InvalidDataLength)?; + + self.sector = byte_offset; + self.data_len = byte_len; Ok(()) } @@ -457,11 +458,8 @@ impl Request { )); } - disk.file_engine.discard( - self.discard_range - .expect("discard request missing validated range"), - pending, - ) + disk.file_engine + .discard((self.sector, u64::from(self.data_len)), pending) } RequestType::Flush => disk.file_engine.flush(pending), RequestType::GetDeviceID => { @@ -502,7 +500,7 @@ mod tests { use super::*; use crate::devices::virtio::queue::{Queue, VIRTQ_DESC_F_NEXT, VIRTQ_DESC_F_WRITE}; - use crate::devices::virtio::test_utils::{VirtQueue, default_mem}; + use crate::devices::virtio::test_utils::{default_mem, VirtQueue}; use crate::vstate::memory::{Address, GuestAddress, GuestMemory}; const NUM_DISK_SECTORS: u64 = 1024; @@ -586,9 +584,17 @@ mod tests { request.r#type, RequestType::from(expected_header.request_type) ); - assert_eq!(request.sector, expected_header.sector); + if request.r#type == RequestType::Discard { + let range: DiscardWriteZeroes = memory + .read_obj(GuestAddress(self.data_desc.addr.get())) + .unwrap(); + assert_eq!(request.sector, range.sector * u64::from(SECTOR_SIZE)); + assert_eq!(request.data_len, range.num_sectors * SECTOR_SIZE); + } else { + assert_eq!(request.sector, expected_header.sector); + } - if check_data { + if check_data && request.r#type != RequestType::Discard { assert_eq!(request.data_addr.raw_value(), self.data_desc.addr.get()); assert_eq!(request.data_len, self.data_desc.len.get()); } @@ -909,8 +915,8 @@ mod tests { } #[allow(clippy::let_with_type_underscore)] - fn random_request_parse() - -> impl Strategy, GuestMemoryMmap, Queue)> { + fn random_request_parse( + ) -> impl Strategy, GuestMemoryMmap, Queue)> { // In this strategy we are going to generate random Requests/Errors and map them // to an input descriptor chain. // @@ -999,7 +1005,6 @@ mod tests { status_addr, sector: sector & (NUM_DISK_SECTORS - sectors_len), data_addr, - discard_range: None, }; let mut request_header = RequestHeader::new(virtio_request_id, request.sector); @@ -1031,10 +1036,8 @@ mod tests { request.data_addr, ) .unwrap(); - request.discard_range = Some(( - discard_sector * u64::from(SECTOR_SIZE), - u64::from(SECTOR_SIZE), - )); + request.sector = discard_sector * u64::from(SECTOR_SIZE); + request.data_len = SECTOR_SIZE; } } From 1692379048b0ea859bf2798ccb5949a0a228250c Mon Sep 17 00:00:00 2001 From: Jonas Savulionis Date: Thu, 11 Jun 2026 11:03:23 +0300 Subject: [PATCH 5/5] tests: cover virtio-blk discard API --- .../src/devices/virtio/block/virtio/device.rs | 22 +++++++ .../src/devices/virtio/block/virtio/mod.rs | 2 + .../devices/virtio/block/virtio/request.rs | 10 +-- tests/framework/microvm.py | 2 + .../integration_tests/functional/test_api.py | 64 +++++++++++++++++++ .../functional/test_drive_virtio.py | 48 +++++++++++++- 6 files changed, 142 insertions(+), 6 deletions(-) diff --git a/src/vmm/src/devices/virtio/block/virtio/device.rs b/src/vmm/src/devices/virtio/block/virtio/device.rs index 70704cb6bb7..cee79257d5f 100644 --- a/src/vmm/src/devices/virtio/block/virtio/device.rs +++ b/src/vmm/src/devices/virtio/block/virtio/device.rs @@ -319,6 +319,10 @@ impl VirtioBlock { /// /// The given file must be seekable and sizable. pub fn new(config: VirtioBlockConfig) -> Result { + if config.discard && config.is_read_only { + return Err(VirtioBlockError::DiscardReadOnlyUnsupported); + } + if config.discard && config.file_engine_type == FileEngineType::Async { return Err(VirtioBlockError::DiscardAsyncUnsupported); } @@ -896,6 +900,24 @@ mod tests { VirtioBlock::new(async_config), Err(VirtioBlockError::DiscardAsyncUnsupported) )); + + let f = TempFile::new().unwrap(); + f.as_file().set_len(0x1000).unwrap(); + let read_only_config = VirtioBlockConfig { + drive_id: "test".to_string(), + path_on_host: f.as_path().to_str().unwrap().to_string(), + is_root_device: false, + partuuid: None, + is_read_only: true, + discard: true, + cache_type: CacheType::Unsafe, + rate_limiter: None, + file_engine_type: FileEngineType::Sync, + }; + assert!(matches!( + VirtioBlock::new(read_only_config), + Err(VirtioBlockError::DiscardReadOnlyUnsupported) + )); } #[test] diff --git a/src/vmm/src/devices/virtio/block/virtio/mod.rs b/src/vmm/src/devices/virtio/block/virtio/mod.rs index e3cdcaab605..7f6e97d6d0f 100644 --- a/src/vmm/src/devices/virtio/block/virtio/mod.rs +++ b/src/vmm/src/devices/virtio/block/virtio/mod.rs @@ -58,6 +58,8 @@ pub enum VirtioBlockError { InvalidOffset, /// Discard is not supported with the async IO engine. DiscardAsyncUnsupported, + /// Discard is not supported with read-only drives. + DiscardReadOnlyUnsupported, /// Guest gave us a read only descriptor that protocol says to write to. UnexpectedReadOnlyDescriptor, /// Guest gave us a write only descriptor that protocol says to read from. diff --git a/src/vmm/src/devices/virtio/block/virtio/request.rs b/src/vmm/src/devices/virtio/block/virtio/request.rs index 372b09c3ed3..b5a526a52b7 100644 --- a/src/vmm/src/devices/virtio/block/virtio/request.rs +++ b/src/vmm/src/devices/virtio/block/virtio/request.rs @@ -10,7 +10,7 @@ use std::mem::size_of; use vm_memory::GuestMemoryError; -use super::{io as block_io, VirtioBlockError, MAX_DISCARD_SECTORS, SECTOR_SHIFT, SECTOR_SIZE}; +use super::{MAX_DISCARD_SECTORS, SECTOR_SHIFT, SECTOR_SIZE, VirtioBlockError, io as block_io}; use crate::devices::virtio::block::virtio::device::DiskProperties; use crate::devices::virtio::block::virtio::metrics::BlockDeviceMetrics; pub use crate::devices::virtio::generated::virtio_blk::{ @@ -19,7 +19,7 @@ pub use crate::devices::virtio::generated::virtio_blk::{ VIRTIO_BLK_T_OUT, }; use crate::devices::virtio::queue::DescriptorChain; -use crate::logger::{error, IncMetric}; +use crate::logger::{IncMetric, error}; use crate::rate_limiter::{RateLimiter, TokenType}; use crate::vstate::memory::{Address, ByteValued, Bytes, GuestAddress, GuestMemoryMmap}; @@ -500,7 +500,7 @@ mod tests { use super::*; use crate::devices::virtio::queue::{Queue, VIRTQ_DESC_F_NEXT, VIRTQ_DESC_F_WRITE}; - use crate::devices::virtio::test_utils::{default_mem, VirtQueue}; + use crate::devices::virtio::test_utils::{VirtQueue, default_mem}; use crate::vstate::memory::{Address, GuestAddress, GuestMemory}; const NUM_DISK_SECTORS: u64 = 1024; @@ -915,8 +915,8 @@ mod tests { } #[allow(clippy::let_with_type_underscore)] - fn random_request_parse( - ) -> impl Strategy, GuestMemoryMmap, Queue)> { + fn random_request_parse() + -> impl Strategy, GuestMemoryMmap, Queue)> { // In this strategy we are going to generate random Requests/Errors and map them // to an input descriptor chain. // diff --git a/tests/framework/microvm.py b/tests/framework/microvm.py index 8d59882e162..438f2c6c842 100644 --- a/tests/framework/microvm.py +++ b/tests/framework/microvm.py @@ -910,6 +910,7 @@ def add_drive( partuuid=None, cache_type=None, io_engine=None, + discard=None, ): """Add a block device.""" @@ -922,6 +923,7 @@ def add_drive( partuuid=partuuid, cache_type=cache_type, io_engine=io_engine, + discard=discard, ) self.disks[drive_id] = path_on_host diff --git a/tests/integration_tests/functional/test_api.py b/tests/integration_tests/functional/test_api.py index 5fc32105231..3d13ca3c655 100644 --- a/tests/integration_tests/functional/test_api.py +++ b/tests/integration_tests/functional/test_api.py @@ -635,6 +635,65 @@ def test_rate_limiters_api_config(uvm, io_engine): ) +@pin_guest_kernel(GUEST_KERNEL_DEFAULT) +def test_discard_drive_api_config(uvm): + """ + Test the discard API configuration and unsupported combinations. + """ + test_microvm = uvm + test_microvm.spawn() + + test_microvm.basic_config(add_root_device=False) + + fs = drive_tools.FilesystemFile(os.path.join(test_microvm.fsfiles, "discard")) + test_microvm.api.drive.put( + drive_id="discard", + path_on_host=test_microvm.create_jailed_resource(fs.path), + is_root_device=False, + is_read_only=False, + io_engine="Sync", + discard=True, + ) + + vm_config = test_microvm.api.vm_config.get().json() + discard_drive = next( + drive for drive in vm_config["drives"] if drive["drive_id"] == "discard" + ) + assert discard_drive["discard"] is True + + fs_read_only = drive_tools.FilesystemFile( + os.path.join(test_microvm.fsfiles, "discard_read_only") + ) + with pytest.raises( + RuntimeError, + match="Discard is not supported with read-only drives", + ): + test_microvm.api.drive.put( + drive_id="discard_read_only", + path_on_host=test_microvm.create_jailed_resource(fs_read_only.path), + is_root_device=False, + is_read_only=True, + io_engine="Sync", + discard=True, + ) + + fs_async = drive_tools.FilesystemFile( + os.path.join(test_microvm.fsfiles, "discard_async") + ) + with pytest.raises( + RuntimeError, + match="Discard is not supported with the async IO engine", + ): + test_microvm.api.drive.put( + drive_id="discard_async", + path_on_host=test_microvm.create_jailed_resource(fs_async.path), + is_root_device=False, + is_read_only=False, + io_engine="Async", + discard=True, + ) + + @pin_guest_kernel(GUEST_KERNEL_DEFAULT) def test_api_patch_pre_boot(uvm, io_engine): """ @@ -890,6 +949,7 @@ def _drive_patch(test_microvm, io_engine): "is_root_device": True, "cache_type": "Unsafe", "is_read_only": True, + "discard": False, "path_on_host": "/" + test_microvm.rootfs_file.name, "rate_limiter": None, "io_engine": "Sync", @@ -901,6 +961,7 @@ def _drive_patch(test_microvm, io_engine): "is_root_device": False, "cache_type": "Unsafe", "is_read_only": False, + "discard": False, "path_on_host": "/scratch_new.ext4", "rate_limiter": { "bandwidth": {"size": 5000, "one_time_burst": None, "refill_time": 100}, @@ -915,6 +976,7 @@ def _drive_patch(test_microvm, io_engine): "is_root_device": False, "cache_type": "Unsafe", "is_read_only": None, + "discard": None, "path_on_host": None, "rate_limiter": None, "io_engine": None, @@ -1304,6 +1366,7 @@ def test_get_full_config_after_restoring_snapshot(microvm_factory, uvm_configure "is_root_device": True, "cache_type": "Unsafe", "is_read_only": True, + "discard": False, "path_on_host": f"/{uvm_configured.rootfs_file.name}", "rate_limiter": None, "io_engine": "Sync", @@ -1444,6 +1507,7 @@ def test_get_full_config(uvm): "is_root_device": True, "cache_type": "Unsafe", "is_read_only": True, + "discard": False, "path_on_host": "/" + test_microvm.rootfs_file.name, "rate_limiter": None, "io_engine": "Sync", diff --git a/tests/integration_tests/functional/test_drive_virtio.py b/tests/integration_tests/functional/test_drive_virtio.py index 6830dad3bd1..78c60c2a475 100644 --- a/tests/integration_tests/functional/test_drive_virtio.py +++ b/tests/integration_tests/functional/test_drive_virtio.py @@ -8,7 +8,12 @@ import host_tools.drive as drive_tools from framework import utils -from framework.artifacts import GUEST_KERNEL_DEFAULT, pin_guest_kernel, pin_rootfs_mode +from framework.artifacts import ( + GUEST_KERNEL_DEFAULT, + pin_guest_kernel, + pin_pci, + pin_rootfs_mode, +) from framework.utils_drive import partuuid_and_disk_path MB = 1024 * 1024 @@ -324,6 +329,47 @@ def test_no_flush(uvm, io_engine): assert fc_metrics["block"]["flush_count"] == 0 +@pin_guest_kernel(GUEST_KERNEL_DEFAULT) +@pin_pci(False) +def test_discard(uvm): + """ + Verify discard is advertised and punches holes in a file-backed drive. + """ + test_microvm = uvm + test_microvm.spawn() + + test_microvm.basic_config(vcpu_count=1) + test_microvm.add_net_iface() + + fs = drive_tools.FilesystemFile( + os.path.join(test_microvm.fsfiles, "discard"), size=64 + ) + test_microvm.add_drive("discard", fs.path, io_engine="Sync", discard=True) + + test_microvm.start() + + _, stdout, stderr = test_microvm.ssh.run( + "cat /sys/block/vdb/queue/discard_granularity" + ) + assert stderr == "" + assert int(stdout.strip()) > 0 + + _, _, stderr = test_microvm.ssh.run( + "dd if=/dev/zero of=/dev/vdb bs=1M count=32 oflag=direct conv=fsync status=none", + timeout=30.0, + ) + assert stderr == "" + + blocks_before = os.stat(fs.path).st_blocks + assert blocks_before > 0 + + _, _, stderr = test_microvm.ssh.run("blkdiscard -f /dev/vdb", timeout=30.0) + assert stderr in ("", "blkdiscard: Operation forced, data will be lost!\n") + + blocks_after = os.stat(fs.path).st_blocks + assert blocks_after < blocks_before + + @pin_guest_kernel(GUEST_KERNEL_DEFAULT) @pin_rootfs_mode("rw") def test_flush(uvm, io_engine):