From 0428a63e6ee7623677b0ae8bc28c379ae1597c91 Mon Sep 17 00:00:00 2001 From: Jonas Savulionis Date: Wed, 27 May 2026 19:59:19 +0300 Subject: [PATCH 1/2] virtio-blk: add direct write mode Add an optional direct_write mode for block devices. Aligned guest writes use an O_DIRECT host file descriptor. Reads stay buffered and unaligned writes use the regular path. Keep direct writes opt-in for selected storage backends. This avoids changing the default buffered path. Signed-off-by: Jonas Savulionis --- CHANGELOG.md | 4 + docs/api_requests/block-io-engine.md | 19 +++ .../src/api_server/parsed_request.rs | 2 +- .../src/api_server/request/drive.rs | 1 + src/firecracker/swagger/firecracker.yaml | 8 ++ src/vmm/src/builder.rs | 2 + src/vmm/src/device_manager/mod.rs | 2 + .../devices/virtio/block/vhost_user/device.rs | 10 +- .../src/devices/virtio/block/virtio/device.rs | 75 +++++++++-- .../virtio/block/virtio/io/async_io.rs | 47 +++++-- .../src/devices/virtio/block/virtio/io/mod.rs | 125 ++++++++++++++++-- .../devices/virtio/block/virtio/io/sync_io.rs | 33 +++-- .../devices/virtio/block/virtio/persist.rs | 6 + .../devices/virtio/block/virtio/test_utils.rs | 1 + src/vmm/src/resources.rs | 2 + src/vmm/src/vmm_config/drive.rs | 37 ++++++ src/vmm/tests/integration_tests.rs | 1 + 17 files changed, 339 insertions(+), 36 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5c1be039674..e3e965f6483 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,10 @@ and this project adheres to ### Added +- [#5910](https://github.com/firecracker-microvm/firecracker/pull/5910): Add + optional `direct_write` support for virtio-block devices. When enabled, + aligned guest writes use host direct I/O while reads remain buffered. + ### Changed ### Deprecated diff --git a/docs/api_requests/block-io-engine.md b/docs/api_requests/block-io-engine.md index edc544bbfb6..fb5dcc0012e 100644 --- a/docs/api_requests/block-io-engine.md +++ b/docs/api_requests/block-io-engine.md @@ -42,10 +42,29 @@ curl --unix-socket ${socket} -i \ \"path_on_host\": \"${drive_path}\", \"is_root_device\": true, \"is_read_only\": false, + \"direct_write\": false, \"io_engine\": \"Sync\" }" ``` +## Direct write mode + +The optional `direct_write` field enables direct I/O for guest write requests +only when they satisfy direct I/O alignment requirements, while keeping reads on +the regular buffered file descriptor. The guest write offset and length must be +multiples of 4096 bytes (4 KiB), and the guest memory buffer address must also +be suitably aligned for direct I/O. Writes that do not meet these requirements +automatically fall back to the buffered path. + +This mode can reduce host page-cache overhead for write-heavy block devices +whose backing storage performs better with direct writes. It is disabled by +default to preserve existing behavior. + +Direct I/O support and alignment requirements are filesystem and kernel +dependent. Validate this mode with the target backing storage before enabling it +for production workloads, especially when the backing file is also accessed by +host tooling through buffered I/O. + ## Host requirements Firecracker requires a minimum host kernel version of 5.10.51 for the `Async` IO diff --git a/src/firecracker/src/api_server/parsed_request.rs b/src/firecracker/src/api_server/parsed_request.rs index 2946b197067..54abec2fa1e 100644 --- a/src/firecracker/src/api_server/parsed_request.rs +++ b/src/firecracker/src/api_server/parsed_request.rs @@ -807,7 +807,7 @@ pub mod tests { let mut connection = HttpConnection::new(receiver); let body = "{ \"drive_id\": \"string\", \"path_on_host\": \"string\", \"is_root_device\": \ true, \"partuuid\": \"string\", \"is_read_only\": true, \"cache_type\": \ - \"Unsafe\", \"io_engine\": \"Sync\", \"rate_limiter\": { \"bandwidth\": { \ + \"Unsafe\", \"direct_write\": true, \"io_engine\": \"Sync\", \"rate_limiter\": { \"bandwidth\": { \ \"size\": 0, \"one_time_burst\": 0, \"refill_time\": 0 }, \"ops\": { \ \"size\": 0, \"one_time_burst\": 0, \"refill_time\": 0 } } }"; sender diff --git a/src/firecracker/src/api_server/request/drive.rs b/src/firecracker/src/api_server/request/drive.rs index 10ca8fd3256..8fae665eb71 100644 --- a/src/firecracker/src/api_server/request/drive.rs +++ b/src/firecracker/src/api_server/request/drive.rs @@ -228,6 +228,7 @@ mod tests { "partuuid": "string", "is_read_only": true, "cache_type": "Unsafe", + "direct_write": true, "io_engine": "Sync", "rate_limiter": { "bandwidth": { diff --git a/src/firecracker/swagger/firecracker.yaml b/src/firecracker/swagger/firecracker.yaml index d1ac91bdb6a..e40c953927b 100644 --- a/src/firecracker/swagger/firecracker.yaml +++ b/src/firecracker/swagger/firecracker.yaml @@ -1245,6 +1245,14 @@ definitions: This field is required for virtio-block config and should be omitted for vhost-user-block configuration. rate_limiter: $ref: "#/definitions/RateLimiter" + direct_write: + type: boolean + description: + If true, aligned guest write requests use direct I/O on the host while + reads remain buffered. Unaligned writes fall back to the buffered file + descriptor. Host direct I/O alignment requirements depend on the + backing filesystem and kernel. + default: false io_engine: type: string description: diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index 45dd3865155..ae46952ea64 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -904,6 +904,8 @@ pub(crate) mod tests { .to_string(), ), rate_limiter: None, + direct_write: None, + file_engine_type: None, socket: None, diff --git a/src/vmm/src/device_manager/mod.rs b/src/vmm/src/device_manager/mod.rs index 202c457b648..ca3fabc7ce8 100644 --- a/src/vmm/src/device_manager/mod.rs +++ b/src/vmm/src/device_manager/mod.rs @@ -888,6 +888,8 @@ pub(crate) mod tests { is_read_only: Some(false), path_on_host: Some(f.as_path().to_str().unwrap().to_string()), rate_limiter: None, + direct_write: None, + file_engine_type: None, socket: None, } diff --git a/src/vmm/src/devices/virtio/block/vhost_user/device.rs b/src/vmm/src/devices/virtio/block/vhost_user/device.rs index 7df600722ab..6ee93fddd8e 100644 --- a/src/vmm/src/devices/virtio/block/vhost_user/device.rs +++ b/src/vmm/src/devices/virtio/block/vhost_user/device.rs @@ -67,11 +67,12 @@ impl TryFrom<&BlockDeviceConfig> for VhostUserBlockConfig { type Error = VhostUserBlockError; fn try_from(value: &BlockDeviceConfig) -> Result { - if let (Some(socket), None, None, None, None) = ( + if let (Some(socket), None, None, None, None, None) = ( &value.socket, &value.is_read_only, &value.path_on_host, &value.rate_limiter, + &value.direct_write, &value.file_engine_type, ) { Ok(Self { @@ -99,6 +100,7 @@ impl From for BlockDeviceConfig { is_read_only: None, path_on_host: None, rate_limiter: None, + direct_write: None, file_engine_type: None, socket: Some(value.socket), @@ -411,6 +413,8 @@ mod tests { is_read_only: None, path_on_host: None, rate_limiter: None, + direct_write: None, + file_engine_type: None, socket: Some("sock".to_string()), @@ -426,6 +430,8 @@ mod tests { is_read_only: Some(true), path_on_host: Some("path".to_string()), rate_limiter: None, + direct_write: None, + file_engine_type: Some(FileEngineType::Sync), socket: None, @@ -441,6 +447,8 @@ mod tests { is_read_only: Some(true), path_on_host: Some("path".to_string()), rate_limiter: None, + direct_write: None, + file_engine_type: Some(FileEngineType::Sync), socket: Some("sock".to_string()), diff --git a/src/vmm/src/devices/virtio/block/virtio/device.rs b/src/vmm/src/devices/virtio/block/virtio/device.rs index ec7ec468505..63c844d1544 100644 --- a/src/vmm/src/devices/virtio/block/virtio/device.rs +++ b/src/vmm/src/devices/virtio/block/virtio/device.rs @@ -11,7 +11,7 @@ use std::fs::{File, OpenOptions}; use std::io::{Seek, SeekFrom}; use std::ops::Deref; use std::os::linux::fs::MetadataExt; -use std::path::PathBuf; +use std::os::unix::fs::OpenOptionsExt; use std::sync::Arc; use block_io::FileEngine; @@ -58,6 +58,7 @@ pub struct DiskProperties { pub file_engine: FileEngine, pub nsectors: u64, pub image_id: [u8; VIRTIO_BLK_ID_BYTES as usize], + pub direct_write: bool, } impl DiskProperties { @@ -66,7 +67,25 @@ impl DiskProperties { OpenOptions::new() .read(true) .write(!is_disk_read_only) - .open(PathBuf::from(&disk_image_path)) + .open(disk_image_path) + .map_err(|x| VirtioBlockError::BackingFile(x, disk_image_path.to_string())) + } + + fn open_direct_file( + disk_image_path: &str, + is_disk_read_only: bool, + direct_write: bool, + ) -> Result, VirtioBlockError> { + if !direct_write || is_disk_read_only { + return Ok(None); + } + + OpenOptions::new() + .read(true) + .write(true) + .custom_flags(libc::O_DIRECT) + .open(disk_image_path) + .map(Some) .map_err(|x| VirtioBlockError::BackingFile(x, disk_image_path.to_string())) } @@ -94,17 +113,22 @@ impl DiskProperties { disk_image_path: String, is_disk_read_only: bool, file_engine_type: FileEngineType, + direct_write: bool, ) -> Result { let mut disk_image = Self::open_file(&disk_image_path, is_disk_read_only)?; + let direct_disk_image = + Self::open_direct_file(&disk_image_path, is_disk_read_only, direct_write)?; + let direct_write = direct_disk_image.is_some(); let disk_size = Self::file_size(&disk_image_path, &mut disk_image)?; let image_id = Self::build_disk_image_id(&disk_image); Ok(Self { file_path: disk_image_path, - file_engine: FileEngine::from_file(disk_image, file_engine_type) + file_engine: FileEngine::from_file(disk_image, direct_disk_image, file_engine_type) .map_err(VirtioBlockError::FileEngine)?, nsectors: disk_size >> SECTOR_SHIFT, image_id, + direct_write, }) } @@ -115,11 +139,13 @@ impl DiskProperties { is_disk_read_only: bool, ) -> Result<(), VirtioBlockError> { let mut disk_image = Self::open_file(&disk_image_path, is_disk_read_only)?; + let direct_disk_image = + Self::open_direct_file(&disk_image_path, is_disk_read_only, self.direct_write)?; let disk_size = Self::file_size(&disk_image_path, &mut disk_image)?; self.image_id = Self::build_disk_image_id(&disk_image); self.file_engine - .update_file_path(disk_image) + .update_file_path(disk_image, direct_disk_image) .map_err(VirtioBlockError::FileEngine)?; self.nsectors = disk_size >> SECTOR_SHIFT; self.file_path = disk_image_path; @@ -193,6 +219,9 @@ pub struct VirtioBlockConfig { pub path_on_host: String, /// Rate Limiter for I/O operations. pub rate_limiter: Option, + /// If true, aligned guest writes use host direct I/O while reads remain buffered. + #[serde(default)] + pub direct_write: bool, /// The type of IO engine used by the device. #[serde(default)] #[serde(rename = "io_engine")] @@ -213,6 +242,7 @@ impl TryFrom<&BlockDeviceConfig> for VirtioBlockConfig { is_read_only: value.is_read_only.unwrap_or(false), path_on_host: path_on_host.clone(), rate_limiter: value.rate_limiter, + direct_write: value.direct_write.unwrap_or(false), file_engine_type: value.file_engine_type.unwrap_or_default(), }) } else { @@ -232,6 +262,7 @@ impl From for BlockDeviceConfig { is_read_only: Some(value.is_read_only), path_on_host: Some(value.path_on_host), rate_limiter: value.rate_limiter, + direct_write: Some(value.direct_write), file_engine_type: Some(value.file_engine_type), socket: None, @@ -288,6 +319,7 @@ impl VirtioBlock { config.path_on_host, config.is_read_only, config.file_engine_type, + config.direct_write, )?; let rate_limiter = config @@ -349,6 +381,7 @@ impl VirtioBlock { is_read_only: self.read_only, cache_type: self.cache_type, rate_limiter: rl.into_option(), + direct_write: self.disk.direct_write, file_engine_type: self.file_engine_type(), } } @@ -725,11 +758,17 @@ mod tests { is_read_only: Some(true), path_on_host: Some("path".to_string()), rate_limiter: None, + direct_write: Some(true), + file_engine_type: Default::default(), socket: None, }; - VirtioBlockConfig::try_from(&block_config).unwrap(); + assert!( + VirtioBlockConfig::try_from(&block_config) + .unwrap() + .direct_write + ); let block_config = BlockDeviceConfig { drive_id: "".to_string(), @@ -740,6 +779,8 @@ mod tests { is_read_only: None, path_on_host: None, rate_limiter: None, + direct_write: None, + file_engine_type: Default::default(), socket: Some("sock".to_string()), @@ -755,6 +796,8 @@ mod tests { is_read_only: Some(true), path_on_host: Some("path".to_string()), rate_limiter: None, + direct_write: None, + file_engine_type: Default::default(), socket: Some("sock".to_string()), @@ -770,16 +813,30 @@ mod tests { f.as_file().set_len(size).unwrap(); for engine in [FileEngineType::Sync, FileEngineType::Async] { - let disk_properties = - DiskProperties::new(String::from(f.as_path().to_str().unwrap()), true, engine) - .unwrap(); + let disk_properties = DiskProperties::new( + String::from(f.as_path().to_str().unwrap()), + true, + engine, + false, + ) + .unwrap(); assert_eq!(size, u64::from(SECTOR_SIZE) * num_sectors); assert_eq!(disk_properties.nsectors, num_sectors); // Testing `backing_file.virtio_block_disk_image_id()` implies // duplicating that logic in tests, so skipping it. - let res = DiskProperties::new("invalid-disk-path".to_string(), true, engine); + let disk_properties = DiskProperties::new( + String::from(f.as_path().to_str().unwrap()), + true, + engine, + true, + ) + .unwrap(); + assert!(!disk_properties.direct_write); + assert!(disk_properties.file_engine.direct_file().is_none()); + + let res = DiskProperties::new("invalid-disk-path".to_string(), true, engine, false); assert!( matches!(res, Err(VirtioBlockError::BackingFile(_, _))), "{:?}", diff --git a/src/vmm/src/devices/virtio/block/virtio/io/async_io.rs b/src/vmm/src/devices/virtio/block/virtio/io/async_io.rs index f83d9dea1df..fe542cb47d4 100644 --- a/src/vmm/src/devices/virtio/block/virtio/io/async_io.rs +++ b/src/vmm/src/devices/virtio/block/virtio/io/async_io.rs @@ -9,7 +9,9 @@ use std::os::unix::io::AsRawFd; use vm_memory::GuestMemoryError; use vmm_sys_util::eventfd::EventFd; -use crate::devices::virtio::block::virtio::io::RequestError; +use crate::devices::virtio::block::virtio::io::{ + DIRECT_WRITE_FD, RequestError, direct_io_eligible, +}; use crate::devices::virtio::block::virtio::{IO_URING_NUM_ENTRIES, PendingRequest}; use crate::io_uring::operation::{Cqe, OpCode, Operation}; use crate::io_uring::restriction::Restriction; @@ -36,6 +38,7 @@ pub enum AsyncIoError { #[derive(Debug)] pub struct AsyncFileEngine { file: File, + direct_file: Option, ring: IoUring, completion_evt: EventFd, } @@ -70,11 +73,17 @@ impl WrappedRequest { impl AsyncFileEngine { fn new_ring( file: &File, + direct_file: Option<&File>, completion_fd: RawFd, ) -> Result, IoUringError> { + let mut files = vec![file]; + if let Some(direct_file) = direct_file { + files.push(direct_file); + } + IoUring::new( u32::from(IO_URING_NUM_ENTRIES), - vec![file], + files, vec![ // Make sure we only allow operations on pre-registered fds. Restriction::RequireFixedFds, @@ -87,26 +96,35 @@ impl AsyncFileEngine { ) } - pub fn from_file(file: File) -> Result { + pub fn from_file( + file: File, + direct_file: Option, + ) -> Result { log_dev_preview_warning("Async file IO", Option::None); let completion_evt = EventFd::new(libc::EFD_NONBLOCK).map_err(AsyncIoError::EventFd)?; - let ring = - Self::new_ring(&file, completion_evt.as_raw_fd()).map_err(AsyncIoError::IoUring)?; + let ring = Self::new_ring(&file, direct_file.as_ref(), completion_evt.as_raw_fd()) + .map_err(AsyncIoError::IoUring)?; Ok(AsyncFileEngine { file, + direct_file, ring, completion_evt, }) } - pub fn update_file(&mut self, file: File) -> Result<(), AsyncIoError> { - let ring = Self::new_ring(&file, self.completion_evt.as_raw_fd()) + pub fn update_file( + &mut self, + file: File, + direct_file: Option, + ) -> Result<(), AsyncIoError> { + let ring = Self::new_ring(&file, direct_file.as_ref(), self.completion_evt.as_raw_fd()) .map_err(AsyncIoError::IoUring)?; - self.file = file; self.ring = ring; + self.file = file; + self.direct_file = direct_file; Ok(()) } @@ -115,6 +133,11 @@ impl AsyncFileEngine { &self.file } + #[cfg(test)] + pub fn direct_file(&self) -> Option<&File> { + self.direct_file.as_ref() + } + pub fn completion_evt(&self) -> &EventFd { &self.completion_evt } @@ -173,9 +196,15 @@ impl AsyncFileEngine { let wrapped_user_data = WrappedRequest::new(req); + let fd = if self.direct_file.is_some() && direct_io_eligible(buf as usize, offset, count) { + DIRECT_WRITE_FD + } else { + 0 + }; + self.ring .push(Operation::write( - 0, + fd, buf as usize, count, offset, diff --git a/src/vmm/src/devices/virtio/block/virtio/io/mod.rs b/src/vmm/src/devices/virtio/block/virtio/io/mod.rs index b7aa8061d76..1cdf146c2e6 100644 --- a/src/vmm/src/devices/virtio/block/virtio/io/mod.rs +++ b/src/vmm/src/devices/virtio/block/virtio/io/mod.rs @@ -13,6 +13,16 @@ use crate::devices::virtio::block::virtio::PendingRequest; use crate::devices::virtio::block::virtio::device::FileEngineType; use crate::vstate::memory::{GuestAddress, GuestMemoryMmap}; +const DIRECT_IO_ALIGNMENT: u64 = 4096; +const DIRECT_WRITE_FD: u32 = 1; + +fn direct_io_eligible(buf: usize, offset: u64, count: u32) -> bool { + count != 0 + && (buf as u64).is_multiple_of(DIRECT_IO_ALIGNMENT) + && offset.is_multiple_of(DIRECT_IO_ALIGNMENT) + && u64::from(count) % DIRECT_IO_ALIGNMENT == 0 +} + #[derive(Debug)] pub struct RequestOk { pub req: PendingRequest, @@ -57,19 +67,32 @@ pub enum FileEngine { } impl FileEngine { - pub fn from_file(file: File, engine_type: FileEngineType) -> Result { + pub fn from_file( + file: File, + direct_file: Option, + engine_type: FileEngineType, + ) -> Result { match engine_type { FileEngineType::Async => Ok(FileEngine::Async( - AsyncFileEngine::from_file(file).map_err(BlockIoError::Async)?, + AsyncFileEngine::from_file(file, direct_file).map_err(BlockIoError::Async)?, )), - FileEngineType::Sync => Ok(FileEngine::Sync(SyncFileEngine::from_file(file))), + FileEngineType::Sync => Ok(FileEngine::Sync(SyncFileEngine::from_file( + file, + direct_file, + ))), } } - pub fn update_file_path(&mut self, file: File) -> Result<(), BlockIoError> { + pub fn update_file_path( + &mut self, + file: File, + direct_file: Option, + ) -> Result<(), BlockIoError> { match self { - FileEngine::Async(engine) => engine.update_file(file).map_err(BlockIoError::Async)?, - FileEngine::Sync(engine) => engine.update_file(file), + FileEngine::Async(engine) => engine + .update_file(file, direct_file) + .map_err(BlockIoError::Async)?, + FileEngine::Sync(engine) => engine.update_file(file, direct_file), }; Ok(()) @@ -83,6 +106,14 @@ impl FileEngine { } } + #[cfg(test)] + pub fn direct_file(&self) -> Option<&File> { + match self { + FileEngine::Async(engine) => engine.direct_file(), + FileEngine::Sync(engine) => engine.direct_file(), + } + } + pub fn read( &mut self, offset: u64, @@ -177,6 +208,7 @@ impl FileEngine { #[cfg(test)] pub mod tests { #![allow(clippy::undocumented_unsafe_blocks)] + use std::io::{Read, Seek, SeekFrom}; use std::os::unix::ffi::OsStrExt; use vm_memory::GuestMemoryRegion; @@ -236,6 +268,13 @@ pub mod tests { .unwrap() } + fn read_file_prefix(file: &mut std::fs::File, len: usize) -> Vec { + let mut data = vec![0; len]; + file.seek(SeekFrom::Start(0)).unwrap(); + file.read_exact(&mut data).unwrap(); + data + } + fn check_dirty_mem(mem: &GuestMemoryMmap, addr: GuestAddress, len: u32) { let bitmap = mem.find_region(addr).unwrap().bitmap(); for offset in addr.0..addr.0 + u64::from(len) { @@ -250,12 +289,82 @@ pub mod tests { } } + #[test] + fn test_direct_io_alignment() { + assert!(direct_io_eligible(0x1000, 0x2000, 0x3000)); + assert!(!direct_io_eligible(0x1001, 0x2000, 0x3000)); + assert!(!direct_io_eligible(0x1000, 0x2001, 0x3000)); + assert!(!direct_io_eligible(0x1000, 0x2000, 0x3001)); + assert!(!direct_io_eligible(0x1000, 0x2000, 0)); + } + + #[test] + fn test_direct_write_file_engine() { + for engine_type in [FileEngineType::Sync, FileEngineType::Async] { + let file = TempFile::new().unwrap().into_file(); + let direct_file = file.try_clone().unwrap(); + let engine = FileEngine::from_file(file, Some(direct_file), engine_type).unwrap(); + + assert!(engine.direct_file().is_some()); + } + } + + #[test] + fn test_sync_direct_write_selects_expected_file() { + const DIRECT_WRITE_LEN: u32 = 4096; + + let mem = create_mem(); + let addr = GuestAddress(0); + let first_write = vec![0x5a; DIRECT_WRITE_LEN as usize]; + mem.write(&first_write, addr).unwrap(); + + let mut buffered_file = TempFile::new().unwrap().into_file(); + let mut direct_file = TempFile::new().unwrap().into_file(); + buffered_file.set_len(MEM_LEN as u64).unwrap(); + direct_file.set_len(MEM_LEN as u64).unwrap(); + let mut buffered_check = buffered_file.try_clone().unwrap(); + let mut direct_check = direct_file.try_clone().unwrap(); + + let slice = mem.get_slice(addr, DIRECT_WRITE_LEN as usize).unwrap(); + let buf = slice.ptr_guard().as_ptr() as usize; + assert!(direct_io_eligible(buf, 0, DIRECT_WRITE_LEN)); + + let mut engine = SyncFileEngine::from_file(buffered_file, Some(direct_file)); + assert_eq!( + engine.write(0, &mem, addr, DIRECT_WRITE_LEN).unwrap(), + DIRECT_WRITE_LEN + ); + assert_eq!( + read_file_prefix(&mut direct_check, DIRECT_WRITE_LEN as usize), + first_write + ); + assert_eq!( + read_file_prefix(&mut buffered_check, DIRECT_WRITE_LEN as usize), + vec![0; DIRECT_WRITE_LEN as usize] + ); + + let second_write = vec![0xa5; DIRECT_WRITE_LEN as usize]; + mem.write(&second_write, addr).unwrap(); + assert_eq!( + engine.write(1, &mem, addr, DIRECT_WRITE_LEN).unwrap(), + DIRECT_WRITE_LEN + ); + + let buffered_data = read_file_prefix(&mut buffered_check, DIRECT_WRITE_LEN as usize + 1); + assert_eq!(buffered_data[0], 0); + assert_eq!(&buffered_data[1..], second_write.as_slice()); + assert_eq!( + read_file_prefix(&mut direct_check, DIRECT_WRITE_LEN as usize), + first_write + ); + } + #[test] fn test_sync() { let mem = create_mem(); // Create backing file. let file = TempFile::new().unwrap().into_file(); - let mut engine = FileEngine::from_file(file, FileEngineType::Sync).unwrap(); + let mut engine = FileEngine::from_file(file, None, FileEngineType::Sync).unwrap(); let data = vmm_sys_util::rand::rand_alphanumerics(FILE_LEN as usize) .as_bytes() @@ -339,7 +448,7 @@ pub mod tests { fn test_async() { // Create backing file. let file = TempFile::new().unwrap().into_file(); - let mut engine = FileEngine::from_file(file, FileEngineType::Async).unwrap(); + let mut engine = FileEngine::from_file(file, None, FileEngineType::Async).unwrap(); let data = vmm_sys_util::rand::rand_alphanumerics(FILE_LEN as usize) .as_bytes() diff --git a/src/vmm/src/devices/virtio/block/virtio/io/sync_io.rs b/src/vmm/src/devices/virtio/block/virtio/io/sync_io.rs index eec3b3d8b8d..0b5a4edcc75 100644 --- a/src/vmm/src/devices/virtio/block/virtio/io/sync_io.rs +++ b/src/vmm/src/devices/virtio/block/virtio/io/sync_io.rs @@ -8,6 +8,8 @@ use vm_memory::{GuestMemoryError, ReadVolatile, WriteVolatile}; use crate::vstate::memory::{GuestAddress, GuestMemory, GuestMemoryMmap}; +use super::direct_io_eligible; + #[derive(Debug, thiserror::Error, displaydoc::Display)] pub enum SyncIoError { /// Flush: {0} @@ -23,14 +25,15 @@ pub enum SyncIoError { #[derive(Debug)] pub struct SyncFileEngine { file: File, + direct_file: Option, } // SAFETY: `File` is send and ultimately a POD. unsafe impl Send for SyncFileEngine {} impl SyncFileEngine { - pub fn from_file(file: File) -> SyncFileEngine { - SyncFileEngine { file } + pub fn from_file(file: File, direct_file: Option) -> SyncFileEngine { + SyncFileEngine { file, direct_file } } #[cfg(test)] @@ -38,9 +41,15 @@ impl SyncFileEngine { &self.file } + #[cfg(test)] + pub fn direct_file(&self) -> Option<&File> { + self.direct_file.as_ref() + } + /// Update the backing file of the engine - pub fn update_file(&mut self, file: File) { - self.file = file + pub fn update_file(&mut self, file: File, direct_file: Option) { + self.file = file; + self.direct_file = direct_file; } pub fn read( @@ -66,11 +75,19 @@ impl SyncFileEngine { addr: GuestAddress, count: u32, ) -> Result { - self.file - .seek(SeekFrom::Start(offset)) + let slice = mem + .get_slice(addr, count as usize) + .map_err(SyncIoError::Transfer)?; + let buf = slice.ptr_guard().as_ptr() as usize; + let file = match self.direct_file.as_mut() { + Some(direct_file) if direct_io_eligible(buf, offset, count) => direct_file, + _ => &mut self.file, + }; + + file.seek(SeekFrom::Start(offset)) .map_err(SyncIoError::Seek)?; - mem.get_slice(addr, count as usize) - .and_then(|slice| Ok(self.file.write_all_volatile(&slice)?)) + file.write_all_volatile(&slice) + .map_err(GuestMemoryError::from) .map_err(SyncIoError::Transfer)?; Ok(count) } diff --git a/src/vmm/src/devices/virtio/block/virtio/persist.rs b/src/vmm/src/devices/virtio/block/virtio/persist.rs index c4288460a56..bf53541b521 100644 --- a/src/vmm/src/devices/virtio/block/virtio/persist.rs +++ b/src/vmm/src/devices/virtio/block/virtio/persist.rs @@ -59,6 +59,8 @@ pub struct VirtioBlockState { disk_path: String, pub virtio_state: VirtioDeviceState, rate_limiter_state: RateLimiterState, + #[serde(default)] + direct_write: bool, file_engine_type: FileEngineTypeState, } @@ -77,6 +79,7 @@ impl Persist<'_> for VirtioBlock { disk_path: self.disk.file_path.clone(), virtio_state: VirtioDeviceState::from_device(self), rate_limiter_state: self.rate_limiter.save(), + direct_write: self.disk.direct_write, file_engine_type: FileEngineTypeState::from(self.file_engine_type()), } } @@ -93,6 +96,7 @@ impl Persist<'_> for VirtioBlock { state.disk_path.clone(), is_read_only, state.file_engine_type.into(), + state.direct_write, )?; let queue_evts = [EventFd::new(libc::EFD_NONBLOCK).map_err(VirtioBlockError::EventFd)?]; @@ -162,6 +166,7 @@ mod tests { cache_type: CacheType::Writeback, rate_limiter: None, file_engine_type: FileEngineType::default(), + direct_write: false, }; let block = VirtioBlock::new(config).unwrap(); @@ -203,6 +208,7 @@ mod tests { cache_type: CacheType::Unsafe, rate_limiter: None, file_engine_type: FileEngineType::default(), + direct_write: false, }; let block = VirtioBlock::new(config).unwrap(); diff --git a/src/vmm/src/devices/virtio/block/virtio/test_utils.rs b/src/vmm/src/devices/virtio/block/virtio/test_utils.rs index e4f23c6a038..41900e346e2 100644 --- a/src/vmm/src/devices/virtio/block/virtio/test_utils.rs +++ b/src/vmm/src/devices/virtio/block/virtio/test_utils.rs @@ -57,6 +57,7 @@ pub fn default_block_with_path(path: String, file_engine_type: FileEngineType) - refill_time: 10, }), }), + direct_write: false, file_engine_type, }; diff --git a/src/vmm/src/resources.rs b/src/vmm/src/resources.rs index 64254170e76..c508e05ef26 100644 --- a/src/vmm/src/resources.rs +++ b/src/vmm/src/resources.rs @@ -623,6 +623,8 @@ mod tests { is_read_only: Some(false), path_on_host: Some(tmp_file.as_path().to_str().unwrap().to_string()), rate_limiter: Some(RateLimiterConfig::default()), + direct_write: None, + file_engine_type: None, socket: None, diff --git a/src/vmm/src/vmm_config/drive.rs b/src/vmm/src/vmm_config/drive.rs index 2d3fddac830..53ae3303c24 100644 --- a/src/vmm/src/vmm_config/drive.rs +++ b/src/vmm/src/vmm_config/drive.rs @@ -55,6 +55,8 @@ pub struct BlockDeviceConfig { pub path_on_host: Option, /// Rate Limiter for I/O operations. pub rate_limiter: Option, + /// If true, aligned guest writes use host direct I/O while reads remain buffered. + pub direct_write: Option, /// The type of IO engine used by the device. // #[serde(default)] // #[serde(rename = "io_engine")] @@ -212,6 +214,7 @@ mod tests { path_on_host: self.path_on_host.clone(), rate_limiter: self.rate_limiter, + direct_write: self.direct_write, file_engine_type: self.file_engine_type, socket: self.socket.clone(), @@ -239,6 +242,8 @@ mod tests { is_read_only: Some(false), path_on_host: Some(dummy_path), rate_limiter: None, + direct_write: None, + file_engine_type: None, socket: None, @@ -273,6 +278,8 @@ mod tests { is_read_only: Some(true), path_on_host: Some(dummy_path), rate_limiter: None, + direct_write: None, + file_engine_type: None, socket: None, @@ -305,6 +312,8 @@ mod tests { is_read_only: Some(true), path_on_host: Some(dummy_path), rate_limiter: None, + direct_write: None, + file_engine_type: None, socket: None, @@ -334,6 +343,8 @@ mod tests { is_read_only: Some(false), path_on_host: Some(dummy_path_1), rate_limiter: None, + direct_write: None, + file_engine_type: None, socket: None, @@ -350,6 +361,8 @@ mod tests { is_read_only: Some(false), path_on_host: Some(dummy_path_2), rate_limiter: None, + direct_write: None, + file_engine_type: None, socket: None, @@ -377,6 +390,8 @@ mod tests { is_read_only: Some(false), path_on_host: Some(dummy_path_1), rate_limiter: None, + direct_write: None, + file_engine_type: None, socket: None, @@ -393,6 +408,8 @@ mod tests { is_read_only: Some(false), path_on_host: Some(dummy_path_2), rate_limiter: None, + direct_write: None, + file_engine_type: None, socket: None, @@ -409,6 +426,8 @@ mod tests { is_read_only: Some(false), path_on_host: Some(dummy_path_3), rate_limiter: None, + direct_write: None, + file_engine_type: None, socket: None, @@ -450,6 +469,8 @@ mod tests { is_read_only: Some(false), path_on_host: Some(dummy_path_1), rate_limiter: None, + direct_write: None, + file_engine_type: None, socket: None, @@ -466,6 +487,8 @@ mod tests { is_read_only: Some(false), path_on_host: Some(dummy_path_2), rate_limiter: None, + direct_write: None, + file_engine_type: None, socket: None, @@ -482,6 +505,8 @@ mod tests { is_read_only: Some(false), path_on_host: Some(dummy_path_3), rate_limiter: None, + direct_write: None, + file_engine_type: None, socket: None, @@ -524,6 +549,8 @@ mod tests { is_read_only: Some(false), path_on_host: Some(dummy_path_1.clone()), rate_limiter: None, + direct_write: None, + file_engine_type: None, socket: None, @@ -540,6 +567,8 @@ mod tests { is_read_only: Some(false), path_on_host: Some(dummy_path_2.clone()), rate_limiter: None, + direct_write: None, + file_engine_type: None, socket: None, @@ -612,6 +641,8 @@ mod tests { is_read_only: Some(false), path_on_host: Some(dummy_path_1), rate_limiter: None, + direct_write: None, + file_engine_type: None, socket: None, @@ -628,6 +659,8 @@ mod tests { is_read_only: Some(false), path_on_host: Some(dummy_path_2), rate_limiter: None, + direct_write: None, + file_engine_type: None, socket: None, @@ -654,6 +687,8 @@ mod tests { is_read_only: Some(true), path_on_host: Some(dummy_file.as_path().to_str().unwrap().to_string()), rate_limiter: None, + direct_write: None, + file_engine_type: Some(FileEngineType::Sync), socket: None, @@ -684,6 +719,8 @@ mod tests { is_read_only: Some(true), path_on_host: Some(backing_file.as_path().to_str().unwrap().to_string()), rate_limiter: None, + direct_write: None, + file_engine_type: None, socket: None, diff --git a/src/vmm/tests/integration_tests.rs b/src/vmm/tests/integration_tests.rs index 45bc3301cf6..73262c14181 100644 --- a/src/vmm/tests/integration_tests.rs +++ b/src/vmm/tests/integration_tests.rs @@ -435,6 +435,7 @@ fn test_preboot_load_snap_disallowed_after_boot_resources() { is_read_only: Some(false), path_on_host: Some(tmp_file), rate_limiter: None, + direct_write: None, file_engine_type: None, socket: None, From 3ea3dcdfc3632ea616a16cb732de049883e3f3a2 Mon Sep 17 00:00:00 2001 From: Jonas Savulionis Date: Mon, 1 Jun 2026 15:10:37 +0300 Subject: [PATCH 2/2] virtio-blk: preserve default direct write config Signed-off-by: Jonas Savulionis --- src/vmm/src/devices/virtio/block/virtio/device.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/vmm/src/devices/virtio/block/virtio/device.rs b/src/vmm/src/devices/virtio/block/virtio/device.rs index 63c844d1544..bbb2059ccc5 100644 --- a/src/vmm/src/devices/virtio/block/virtio/device.rs +++ b/src/vmm/src/devices/virtio/block/virtio/device.rs @@ -262,7 +262,7 @@ impl From for BlockDeviceConfig { is_read_only: Some(value.is_read_only), path_on_host: Some(value.path_on_host), rate_limiter: value.rate_limiter, - direct_write: Some(value.direct_write), + direct_write: value.direct_write.then_some(true), file_engine_type: Some(value.file_engine_type), socket: None,