Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
15 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -63,3 +63,23 @@ env-fast

# TLS certificates — local only, never commit (paths to certs are in .env)
.certs/

# Benchmark simulation output files
sim_*.tsv
sim_*.tsv.zst

# Sweep run logs and results (local benchmark output)
sweep_logs/
sweep_flux_master.log
results/

# Test scripts and helpers not part of the benchmark suite
test_s3dlio_gen_direct.py

# Hydra runtime output (created in cwd when running workloads with hydra config)
hydra_log/

# Timestamped sweep run logs written to repo root by sweep_*.sh scripts
sweep_unet3d_*.log
sweep_dlrm_*.log
sweep_flux_*.log
8 changes: 4 additions & 4 deletions configs/dlio/workload/dlrm_b200.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ dataset:
data_folder: data/dlrm/
format: parquet
num_files_train: 1024 # Number of training files to generate
num_samples_per_file: 4718592 # Samples per parquet file
num_samples_per_file: 1536000 # 250 RGs × 6144 → ~3.1 MiB footer (under s3-ultra 4 MiB limit)
record_length_bytes: 761
compression: none # Options: snappy, gzip, lz4, zstd, none

Expand Down Expand Up @@ -627,12 +627,12 @@ dataset:
reader:
data_loader: pytorch
batch_size: 12288
prefetch_size: 2 # Increase from default 2 for better I/O overlap
read_threads: 4 # Increase parallelism
prefetch_size: 0
read_threads: 0 # single-process, no IPC overhead; ThreadPoolExecutor handles I/O
file_shuffle: seed

train:
epochs: 1
epochs: 2
computation_time: 0.000375

metric:
Expand Down
5 changes: 3 additions & 2 deletions configs/dlio/workload/dlrm_datagen.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,14 @@ dataset:
data_folder: data/dlrm/
format: parquet
num_files_train: 1024 # Number of training files to generate
num_samples_per_file: 4718592 # Samples per parquet file
num_samples_per_file: 1536000 # Samples per parquet file (250 RGs × 6144 → ~3.1 MiB footer, under s3-ultra 4 MiB limit)
record_length_bytes: 761
compression: none # Options: snappy, gzip, lz4, zstd, none

# Parquet-specific configuration
parquet:
row_group_size: 8192
use_s3dlio_gen: true
row_group_size: 6144 # Match batch_size for optimal caching
read_mode: row_group

columns:
Expand Down
2 changes: 2 additions & 0 deletions configs/dlio/workload/flux_datagen.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ dataset:
record_length: 2164832

parquet:
use_s3dlio_gen: true
row_group_size: 48
# Parquet-specific field specifications
columns:
- name: t5_encodings
Expand Down
40 changes: 40 additions & 0 deletions configs/dlio/workload/unet3d_b200.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# DLIO workload config for unet3d. Only the training phase is enabled in the
# workflow section; data generation and checkpointing are switched off.
model:
name: unet3d
type: cnn
model_size: 499153191

framework: pytorch

# Phase switches: train is the only phase set to True.
workflow:
generate_data: False
train: True
checkpoint: False

# generate_data is False above, so the dataset is presumably expected to
# already exist under data_folder — confirm before running.
dataset:
data_folder: data/unet3d/
format: npz
num_files_train: 7200 # ~984 GiB: 7200 × ~140 MiB avg file size
# One sample per file, so the file count above is also the sample count.
num_samples_per_file: 1
# 146600628 bytes ≈ 139.8 MiB, consistent with the ~140 MiB average above.
record_length_bytes: 146600628
# 68341808 bytes ≈ 65.2 MiB.
record_length_bytes_stdev: 68341808
# 2097152 bytes = 2 MiB.
record_length_bytes_resize: 2097152

reader:
data_loader: pytorch
batch_size: 7
read_threads: 4
# Both shuffle settings use the same value, "seed".
file_shuffle: seed
sample_shuffle: seed

train:
epochs: 5
# B200 computation_time = H100 (0.323 s) ÷ 2 (B200 is ~2× faster than H100)
# 0.323 / 2 = 0.1615, rounded to 0.162 below.
computation_time: 0.162

# NOTE(review): the workflow section sets checkpoint: False, so these settings
# presumably have no effect for this workload — confirm. checkpoint_after_epoch
# (5) equals train epochs (5).
checkpoint:
checkpoint_folder: checkpoints/unet3d
checkpoint_after_epoch: 5
epochs_between_checkpoints: 2

# NOTE(review): "au" is presumably the accelerator-utilization threshold
# (0.90 = 90%) — confirm against the DLIO documentation.
metric:
au: 0.90
Loading