Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,10 @@ and this project adheres to

### Added

- [#5910](https://github.com/firecracker-microvm/firecracker/pull/5910): Add
optional `direct_write` support for virtio-block devices. When enabled,
aligned guest writes use host direct I/O while reads remain buffered.

### Changed

### Deprecated
Expand Down
19 changes: 19 additions & 0 deletions docs/api_requests/block-io-engine.md
Original file line number Diff line number Diff line change
Expand Up @@ -42,10 +42,29 @@ curl --unix-socket ${socket} -i \
\"path_on_host\": \"${drive_path}\",
\"is_root_device\": true,
\"is_read_only\": false,
\"direct_write\": false,
\"io_engine\": \"Sync\"
}"
```

## Direct write mode

The optional `direct_write` field enables direct I/O for guest write requests
only when they satisfy direct I/O alignment requirements, while keeping reads on
the regular buffered file descriptor. The guest write offset and length must be
multiples of 4096 bytes (4 KiB), and the guest memory buffer address must also
be suitably aligned for direct I/O. Writes that do not meet these requirements
automatically fall back to the buffered path.

This mode can reduce host page-cache overhead for write-heavy block devices
whose backing storage performs better with direct writes. It is disabled by
default to preserve existing behavior.

Direct I/O support and alignment requirements are filesystem and kernel
dependent. Validate this mode with the target backing storage before enabling it
for production workloads, especially when the backing file is also accessed by
host tooling through buffered I/O.

## Host requirements

Firecracker requires a minimum host kernel version of 5.10.51 for the `Async` IO
Expand Down
2 changes: 1 addition & 1 deletion src/firecracker/src/api_server/parsed_request.rs
Original file line number Diff line number Diff line change
Expand Up @@ -807,7 +807,7 @@ pub mod tests {
let mut connection = HttpConnection::new(receiver);
let body = "{ \"drive_id\": \"string\", \"path_on_host\": \"string\", \"is_root_device\": \
true, \"partuuid\": \"string\", \"is_read_only\": true, \"cache_type\": \
\"Unsafe\", \"io_engine\": \"Sync\", \"rate_limiter\": { \"bandwidth\": { \
\"Unsafe\", \"direct_write\": true, \"io_engine\": \"Sync\", \"rate_limiter\": { \"bandwidth\": { \
\"size\": 0, \"one_time_burst\": 0, \"refill_time\": 0 }, \"ops\": { \
\"size\": 0, \"one_time_burst\": 0, \"refill_time\": 0 } } }";
sender
Expand Down
1 change: 1 addition & 0 deletions src/firecracker/src/api_server/request/drive.rs
Original file line number Diff line number Diff line change
Expand Up @@ -228,6 +228,7 @@ mod tests {
"partuuid": "string",
"is_read_only": true,
"cache_type": "Unsafe",
"direct_write": true,
"io_engine": "Sync",
"rate_limiter": {
"bandwidth": {
Expand Down
8 changes: 8 additions & 0 deletions src/firecracker/swagger/firecracker.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1245,6 +1245,14 @@ definitions:
This field is required for virtio-block config and should be omitted for vhost-user-block configuration.
rate_limiter:
$ref: "#/definitions/RateLimiter"
direct_write:
type: boolean
description:
If true, aligned guest write requests use direct I/O on the host while
reads remain buffered. Unaligned writes fall back to the buffered file
descriptor. Host direct I/O alignment requirements depend on the
backing filesystem and kernel.
default: false
io_engine:
type: string
description:
Expand Down
2 changes: 2 additions & 0 deletions src/vmm/src/builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -904,6 +904,8 @@ pub(crate) mod tests {
.to_string(),
),
rate_limiter: None,
direct_write: None,

file_engine_type: None,

socket: None,
Expand Down
2 changes: 2 additions & 0 deletions src/vmm/src/device_manager/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -888,6 +888,8 @@ pub(crate) mod tests {
is_read_only: Some(false),
path_on_host: Some(f.as_path().to_str().unwrap().to_string()),
rate_limiter: None,
direct_write: None,

file_engine_type: None,
socket: None,
}
Expand Down
10 changes: 9 additions & 1 deletion src/vmm/src/devices/virtio/block/vhost_user/device.rs
Original file line number Diff line number Diff line change
Expand Up @@ -67,11 +67,12 @@ impl TryFrom<&BlockDeviceConfig> for VhostUserBlockConfig {
type Error = VhostUserBlockError;

fn try_from(value: &BlockDeviceConfig) -> Result<Self, Self::Error> {
if let (Some(socket), None, None, None, None) = (
if let (Some(socket), None, None, None, None, None) = (
&value.socket,
&value.is_read_only,
&value.path_on_host,
&value.rate_limiter,
&value.direct_write,
&value.file_engine_type,
) {
Ok(Self {
Expand Down Expand Up @@ -99,6 +100,7 @@ impl From<VhostUserBlockConfig> for BlockDeviceConfig {
is_read_only: None,
path_on_host: None,
rate_limiter: None,
direct_write: None,
file_engine_type: None,

socket: Some(value.socket),
Expand Down Expand Up @@ -411,6 +413,8 @@ mod tests {
is_read_only: None,
path_on_host: None,
rate_limiter: None,
direct_write: None,

file_engine_type: None,

socket: Some("sock".to_string()),
Expand All @@ -426,6 +430,8 @@ mod tests {
is_read_only: Some(true),
path_on_host: Some("path".to_string()),
rate_limiter: None,
direct_write: None,

file_engine_type: Some(FileEngineType::Sync),

socket: None,
Expand All @@ -441,6 +447,8 @@ mod tests {
is_read_only: Some(true),
path_on_host: Some("path".to_string()),
rate_limiter: None,
direct_write: None,

file_engine_type: Some(FileEngineType::Sync),

socket: Some("sock".to_string()),
Expand Down
75 changes: 66 additions & 9 deletions src/vmm/src/devices/virtio/block/virtio/device.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ use std::fs::{File, OpenOptions};
use std::io::{Seek, SeekFrom};
use std::ops::Deref;
use std::os::linux::fs::MetadataExt;
use std::path::PathBuf;
use std::os::unix::fs::OpenOptionsExt;
use std::sync::Arc;

use block_io::FileEngine;
Expand Down Expand Up @@ -58,6 +58,7 @@ pub struct DiskProperties {
pub file_engine: FileEngine,
pub nsectors: u64,
pub image_id: [u8; VIRTIO_BLK_ID_BYTES as usize],
pub direct_write: bool,
}

impl DiskProperties {
Expand All @@ -66,7 +67,25 @@ impl DiskProperties {
OpenOptions::new()
.read(true)
.write(!is_disk_read_only)
.open(PathBuf::from(&disk_image_path))
.open(disk_image_path)
.map_err(|x| VirtioBlockError::BackingFile(x, disk_image_path.to_string()))
}

fn open_direct_file(
disk_image_path: &str,
is_disk_read_only: bool,
direct_write: bool,
) -> Result<Option<File>, VirtioBlockError> {
if !direct_write || is_disk_read_only {
return Ok(None);
}

OpenOptions::new()
.read(true)
.write(true)
.custom_flags(libc::O_DIRECT)
.open(disk_image_path)
.map(Some)
.map_err(|x| VirtioBlockError::BackingFile(x, disk_image_path.to_string()))
}

Expand Down Expand Up @@ -94,17 +113,22 @@ impl DiskProperties {
disk_image_path: String,
is_disk_read_only: bool,
file_engine_type: FileEngineType,
direct_write: bool,
) -> Result<Self, VirtioBlockError> {
let mut disk_image = Self::open_file(&disk_image_path, is_disk_read_only)?;
let direct_disk_image =
Self::open_direct_file(&disk_image_path, is_disk_read_only, direct_write)?;
let direct_write = direct_disk_image.is_some();
let disk_size = Self::file_size(&disk_image_path, &mut disk_image)?;
let image_id = Self::build_disk_image_id(&disk_image);

Ok(Self {
file_path: disk_image_path,
file_engine: FileEngine::from_file(disk_image, file_engine_type)
file_engine: FileEngine::from_file(disk_image, direct_disk_image, file_engine_type)
.map_err(VirtioBlockError::FileEngine)?,
nsectors: disk_size >> SECTOR_SHIFT,
image_id,
direct_write,
})
}

Expand All @@ -115,11 +139,13 @@ impl DiskProperties {
is_disk_read_only: bool,
) -> Result<(), VirtioBlockError> {
let mut disk_image = Self::open_file(&disk_image_path, is_disk_read_only)?;
let direct_disk_image =
Self::open_direct_file(&disk_image_path, is_disk_read_only, self.direct_write)?;
let disk_size = Self::file_size(&disk_image_path, &mut disk_image)?;

self.image_id = Self::build_disk_image_id(&disk_image);
self.file_engine
.update_file_path(disk_image)
.update_file_path(disk_image, direct_disk_image)
.map_err(VirtioBlockError::FileEngine)?;
self.nsectors = disk_size >> SECTOR_SHIFT;
self.file_path = disk_image_path;
Expand Down Expand Up @@ -193,6 +219,9 @@ pub struct VirtioBlockConfig {
pub path_on_host: String,
/// Rate Limiter for I/O operations.
pub rate_limiter: Option<RateLimiterConfig>,
/// If true, aligned guest writes use host direct I/O while reads remain buffered.
#[serde(default)]
pub direct_write: bool,
/// The type of IO engine used by the device.
#[serde(default)]
#[serde(rename = "io_engine")]
Expand All @@ -213,6 +242,7 @@ impl TryFrom<&BlockDeviceConfig> for VirtioBlockConfig {
is_read_only: value.is_read_only.unwrap_or(false),
path_on_host: path_on_host.clone(),
rate_limiter: value.rate_limiter,
direct_write: value.direct_write.unwrap_or(false),
file_engine_type: value.file_engine_type.unwrap_or_default(),
})
} else {
Expand All @@ -232,6 +262,7 @@ impl From<VirtioBlockConfig> for BlockDeviceConfig {
is_read_only: Some(value.is_read_only),
path_on_host: Some(value.path_on_host),
rate_limiter: value.rate_limiter,
direct_write: value.direct_write.then_some(true),
file_engine_type: Some(value.file_engine_type),

socket: None,
Expand Down Expand Up @@ -288,6 +319,7 @@ impl VirtioBlock {
config.path_on_host,
config.is_read_only,
config.file_engine_type,
config.direct_write,
)?;

let rate_limiter = config
Expand Down Expand Up @@ -349,6 +381,7 @@ impl VirtioBlock {
is_read_only: self.read_only,
cache_type: self.cache_type,
rate_limiter: rl.into_option(),
direct_write: self.disk.direct_write,
file_engine_type: self.file_engine_type(),
}
}
Expand Down Expand Up @@ -725,11 +758,17 @@ mod tests {
is_read_only: Some(true),
path_on_host: Some("path".to_string()),
rate_limiter: None,
direct_write: Some(true),

file_engine_type: Default::default(),

socket: None,
};
VirtioBlockConfig::try_from(&block_config).unwrap();
assert!(
VirtioBlockConfig::try_from(&block_config)
.unwrap()
.direct_write
);

let block_config = BlockDeviceConfig {
drive_id: "".to_string(),
Expand All @@ -740,6 +779,8 @@ mod tests {
is_read_only: None,
path_on_host: None,
rate_limiter: None,
direct_write: None,

file_engine_type: Default::default(),

socket: Some("sock".to_string()),
Expand All @@ -755,6 +796,8 @@ mod tests {
is_read_only: Some(true),
path_on_host: Some("path".to_string()),
rate_limiter: None,
direct_write: None,

file_engine_type: Default::default(),

socket: Some("sock".to_string()),
Expand All @@ -770,16 +813,30 @@ mod tests {
f.as_file().set_len(size).unwrap();

for engine in [FileEngineType::Sync, FileEngineType::Async] {
let disk_properties =
DiskProperties::new(String::from(f.as_path().to_str().unwrap()), true, engine)
.unwrap();
let disk_properties = DiskProperties::new(
String::from(f.as_path().to_str().unwrap()),
true,
engine,
false,
)
.unwrap();

assert_eq!(size, u64::from(SECTOR_SIZE) * num_sectors);
assert_eq!(disk_properties.nsectors, num_sectors);
// Testing `backing_file.virtio_block_disk_image_id()` implies
// duplicating that logic in tests, so skipping it.

let res = DiskProperties::new("invalid-disk-path".to_string(), true, engine);
let disk_properties = DiskProperties::new(
String::from(f.as_path().to_str().unwrap()),
true,
engine,
true,
)
.unwrap();
assert!(!disk_properties.direct_write);
assert!(disk_properties.file_engine.direct_file().is_none());

let res = DiskProperties::new("invalid-disk-path".to_string(), true, engine, false);
assert!(
matches!(res, Err(VirtioBlockError::BackingFile(_, _))),
"{:?}",
Expand Down
Loading