NVlabs · drbh · Mar 25, 2026 · Apr 1, 2026 · Apr 1, 2026 · Apr 1, 2026
diff --git a/.gitignore b/.gitignore
@@ -5,6 +5,7 @@
 target
 
 **/*.bc
+**/*.mlir
 **/*.cubin
 **/core
 scratch

diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -63,11 +63,11 @@ convert_case = "0.8"
 
 # ── Intra-workspace ──────────────────────────────────────────────────────────
 cuda-bindings = { path = "cuda-bindings", version = "0.0.1" }
-cuda-core = { path = "cuda-core", version = "0.0.1" }
+cuda-core = { path = "cuda-core", version = "0.0.1", default-features = false }
 cuda-async = { path = "cuda-async", version = "0.0.1" }
 cuda-tile-rs = { path = "cuda-tile-rs", version = "0.0.1" }
 cutile-compiler = { path = "cutile-compiler", version = "0.0.1" }
 cutile-macro = { path = "cutile-macro", version = "0.0.1" }
-cutile = { path = "cutile", version = "0.0.1" }
+cutile = { path = "cutile", version = "0.0.1", default-features = false }
 cutile-examples = { path = "cutile-examples", version = "0.0.1" }
 cutile-benchmarks = { path = "cutile-benchmarks", version = "0.0.1" }
diff --git a/cuda-async/Cargo.toml b/cuda-async/Cargo.toml
@@ -10,7 +10,7 @@ readme = "README.md"
 description = "Safe Async CUDA support via Async Rust."
 
 [dependencies]
-cuda-core = { workspace = true }
+cuda-core = { workspace = true, features = ["cuda"] }
 futures = { workspace = true }
 anyhow = { workspace = true }
 thiserror = { workspace = true }
diff --git a/cuda-async/src/device_context.rs b/cuda-async/src/device_context.rs
@@ -8,6 +8,9 @@
 use crate::error::{device_assert, device_error, DeviceError};
 use crate::scheduling_policies::{GlobalSchedulingPolicy, SchedulingPolicy, StreamPoolRoundRobin};
 use cuda_core::{CudaContext, CudaFunction, CudaModule, CudaStream};
+pub use cuda_core::{
+    PointerParamType, ScalarParamType, TensorParamType, ValidParamType, Validator,
+};
 use std::cell::Cell;
 use std::collections::HashMap;
 use std::hash::{DefaultHasher, Hash, Hasher};
@@ -36,36 +39,6 @@ pub trait FunctionKey: Hash {
     }
 }
 
-#[derive(Debug, Clone)]
-pub enum ValidParamType {
-    Scalar(ScalarParamType),
-    Pointer(PointerParamType),
-    Tensor(TensorParamType),
-}
-
-#[derive(Debug, Clone)]
-pub struct ScalarParamType {
-    pub element_type: String,
-}
-
-#[derive(Debug, Clone)]
-pub struct PointerParamType {
-    pub mutable: bool,
-    pub element_type: String,
-}
-
-// TODO (hme): This is note entirely tile-agnostic with this param type.
-#[derive(Debug, Clone)]
-pub struct TensorParamType {
-    pub element_type: String,
-    pub shape: Vec<i32>,
-}
-
-#[derive(Debug, Clone)]
-pub struct Validator {
-    pub params: Vec<ValidParamType>,
-}
-
 type DeviceFunctions = HashMap<String, (Arc<CudaModule>, Arc<CudaFunction>)>;
 type DeviceFunctionValidators = HashMap<String, Arc<Validator>>;
 

diff --git a/cuda-core/Cargo.toml b/cuda-core/Cargo.toml
@@ -9,7 +9,11 @@ authors.workspace = true
 repository.workspace = true
 readme = "README.md"
 
+[features]
+default = ["cuda"]
+cuda = ["dep:cuda-bindings"]
+
 [dependencies]
-cuda-bindings = { workspace = true }
+cuda-bindings = { workspace = true, optional = true }
 anyhow = { workspace = true }
 half = { workspace = true }
diff --git a/cuda-core/src/lib.rs b/cuda-core/src/lib.rs
@@ -5,13 +5,26 @@
 
 //! Low-level CUDA driver API bindings and safe wrappers.
 
+// Always available (no CUDA required)
+mod dtype;
+mod validator;
+
+pub use dtype::*;
+pub use validator::*;
+
+// CUDA-dependent modules (require the `cuda` feature)
+#[cfg(feature = "cuda")]
 mod api;
+#[cfg(feature = "cuda")]
 mod cudarc_shim;
-mod dtype;
+#[cfg(feature = "cuda")]
 mod error;
 
+#[cfg(feature = "cuda")]
 pub use api::*;
+#[cfg(feature = "cuda")]
 pub use cuda_bindings as sys;
+#[cfg(feature = "cuda")]
 pub use cudarc_shim::*;
-pub use dtype::*;
+#[cfg(feature = "cuda")]
 pub use error::*;
diff --git a/cuda-core/src/validator.rs b/cuda-core/src/validator.rs
@@ -0,0 +1,50 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+//! Shared kernel interface metadata used at the compiler/runtime boundary.
+
+/// Description of one kernel parameter: a scalar, a pointer, or a tensor.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub enum ValidParamType {
+    /// Parameter described by a [`ScalarParamType`].
+    Scalar(ScalarParamType),
+    /// Parameter described by a [`PointerParamType`].
+    Pointer(PointerParamType),
+    /// Parameter described by a [`TensorParamType`].
+    Tensor(TensorParamType),
+}
+
+/// Metadata for a scalar parameter.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct ScalarParamType {
+    /// Element type, stored as a string.
+    pub element_type: String,
+}
+
+/// Metadata for a pointer parameter.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct PointerParamType {
+    /// Mutability flag of the pointer.
+    pub mutable: bool,
+    /// Element type, stored as a string.
+    pub element_type: String,
+}
+
+// TODO (hme): This is not entirely tile-agnostic with this param type.
+/// Metadata for a tensor parameter.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct TensorParamType {
+    /// Element type, stored as a string.
+    pub element_type: String,
+    /// Shape entries as `i32` values (presumably one per dimension; confirm with producers).
+    pub shape: Vec<i32>,
+}
+
+/// Collection of parameter descriptions making up one kernel interface.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct Validator {
+    /// Parameter descriptions (presumably in declaration order; confirm with producers).
+    pub params: Vec<ValidParamType>,
+}
diff --git a/cutile-benchmarks/Cargo.toml b/cutile-benchmarks/Cargo.toml
@@ -39,5 +39,5 @@ tokio = { workspace = true }
 cutile-compiler = { workspace = true }
 cuda-async = { workspace = true }
 cuda-core = { workspace = true }
-cutile = { workspace = true }
+cutile = { workspace = true, features = ["cuda"] }
 cutile-examples = { workspace = true }
diff --git a/cutile-book/tutorials/04-matrix-multiplication.md b/cutile-book/tutorials/04-matrix-multiplication.md
@@ -57,7 +57,7 @@ use cuda_core::CudaContext;
 use std::sync::Arc;
 use cutile;
 use cutile::api;
-use cutile::candle_core::WithDType;
+use candle_core::WithDType;
 use cutile::error::Error;
 use cutile::tensor::{IntoPartition, Tensor, ToHostVec, Unpartition};
 use cutile::tile_kernel::TileKernel;

diff --git a/cutile-compiler/Cargo.toml b/cutile-compiler/Cargo.toml
@@ -10,6 +10,10 @@ repository.workspace = true
 readme = "README.md"
 keywords = ["mlir", "cutile", "compiler"]
 
+[features]
+default = []
+cuda = ["cuda-core/cuda"]
+
 [dependencies]
 melior = { workspace = true }
 syn = { workspace = true }
@@ -23,5 +27,4 @@ mlir-sys = { workspace = true }
 anyhow = { workspace = true }
 stacker = { workspace = true }
 cuda-tile-rs = { workspace = true }
-cuda-core = { workspace = true }
-cuda-async = { workspace = true }
+cuda-core = { workspace = true, default-features = false }
diff --git a/cutile-compiler/src/compiler/_function.rs b/cutile-compiler/src/compiler/_function.rs
@@ -25,7 +25,7 @@ use crate::kernel_entry_generator::generate_entry_point;
 use crate::kernel_naming::KernelNaming;
 use crate::syn_utils::*;
 use crate::types::*;
-use cuda_async::device_context::Validator;
+use cuda_core::Validator;
 use cuda_tile_rs::operation_parse;
 use melior::ir::attribute::StringAttribute;
 use melior::ir::operation::{OperationBuilder, OperationLike};

diff --git a/cutile-compiler/src/kernel_entry_generator.rs b/cutile-compiler/src/kernel_entry_generator.rs
@@ -13,9 +13,7 @@ use crate::generics::{GenericVars, TypeInstance};
 use crate::kernel_naming::KernelNaming;
 use crate::syn_utils::{get_fn_arg_var_name, get_ident_from_path_expr, get_ident_generic_args};
 use crate::types::{get_primitives_attrs, get_type_mutability};
-use cuda_async::device_context::{
-    PointerParamType, ScalarParamType, TensorParamType, ValidParamType, Validator,
-};
+use cuda_core::{PointerParamType, ScalarParamType, TensorParamType, ValidParamType, Validator};
 use proc_macro2::Ident;
 use proc_macro2::Span;
 use quote::ToTokens;

diff --git a/cutile-compiler/src/lib.rs b/cutile-compiler/src/lib.rs
@@ -9,6 +9,7 @@
 extern crate core;
 
 pub use cuda_tile_rs::cuda_tile;
+pub use cuda_tile_rs::cuda_tile_write_bytecode_to_buffer;
 use cuda_tile_rs::register_cuda_tile_dialects;
 use melior::{
     dialect::DialectRegistry,
@@ -18,6 +19,7 @@ use melior::{
 
 pub mod ast;
 mod bounds;
+#[cfg(feature = "cuda")]
 pub mod cuda_tile_runtime_utils;
 pub mod error;
 pub mod generics;

diff --git a/cutile-examples/Cargo.toml b/cutile-examples/Cargo.toml
@@ -10,11 +10,14 @@ repository.workspace = true
 readme = "README.md"
 publish = false
 
+[features]
+default = ["cuda"]
+cuda = ["cutile/cuda", "dep:cuda-core", "dep:cuda-async", "dep:tokio", "dep:candle-core", "dep:candle-nn"]
+
 [dependencies]
-cutile-compiler = { workspace = true }
 cutile = { workspace = true }
-cuda-core = { workspace = true }
-cuda-async = { workspace = true }
-tokio = { workspace = true }
-candle-core = { workspace = true }
-candle-nn = { workspace = true }
+cuda-core = { workspace = true, optional = true }
+cuda-async = { workspace = true, optional = true }
+tokio = { workspace = true, optional = true }
+candle-core = { workspace = true, optional = true }
+candle-nn = { workspace = true, optional = true }
diff --git a/cutile-examples/examples/async_mlp_fused.rs b/cutile-examples/examples/async_mlp_fused.rs
@@ -8,12 +8,12 @@ use cuda_async::device_operation::*;
 use cuda_async::launch::AsyncKernelLaunch;
 use cuda_async::scheduling_policies::WithDeviceId;
 use cuda_core::LaunchConfig;
+use cutile::cutile_compiler::compiler::{CUDATileFunctionCompiler, CUDATileModules};
+use cutile::cutile_compiler::cuda_tile::ModuleOperation;
+use cutile::cutile_compiler::cuda_tile_runtime_utils::{compile_module, get_gpu_name};
 use cutile::tensor::{Tensor, ToHostVec};
 use cutile::tile_kernel::IntoDeviceOperationPartition;
 use cutile::{api, error::Error};
-use cutile_compiler::compiler::{CUDATileFunctionCompiler, CUDATileModules};
-use cutile_compiler::cuda_tile::ModuleOperation;
-use cutile_compiler::cuda_tile_runtime_utils::{compile_module, get_gpu_name};
 use std::sync::Arc;
 
 #[cutile::module]

diff --git a/cutile-examples/examples/compile_only.rs b/cutile-examples/examples/compile_only.rs
@@ -0,0 +1,122 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Example: Compile a kernel to MLIR and bytecode without requiring a GPU to run on
+ *
+ * Run with: cargo run -p cutile-examples --example compile_only --no-default-features
+ */
+
+use cutile::cutile_compiler::compiler::{CUDATileFunctionCompiler, CUDATileModules};
+use cutile::cutile_compiler::cuda_tile_write_bytecode_to_buffer;
+use std::env;
+use std::process::ExitCode;
+use std::slice;
+
+// Build with --no-default-features to compile kernels without a GPU.
+#[cutile::module]
+mod my_kernels {
+    use cutile::core::*;
+
+    /// Simple kernel that does tile math without dynamic tensor inputs
+    #[cutile::entry()]
+    fn tile_math<const S: [i32; 1]>(output: &mut Tensor<f32, S>, scalar: f32) {
+        // Get block ID and create tiles
+        let _pid = get_tile_block_id().0;
+        let scalar_tile: Tile<f32, S> = broadcast_scalar(scalar, output.shape());
+        let ones: Tile<f32, S> = broadcast_scalar(1.0f32, output.shape());
+
+        // Simple computation
+        let result = scalar_tile + ones;
+        output.store(result);
+    }
+}
+
+/// Compiles `my_kernels::tile_math` for `gpu_name`, prints the generated MLIR and a
+/// bytecode preview, then writes `output.mlir` and `output.bc` to the working directory.
+///
+/// # Errors
+///
+/// Returns a message naming the step that failed (module setup, compiler creation,
+/// compilation, or writing an output file).
+fn run(gpu_name: String) -> Result<(), String> {
+    // Get the module ASTs from the generated code
+    let module_asts = my_kernels::_module_asts();
+
+    // Create the modules container
+    let modules = CUDATileModules::new(module_asts)
+        .map_err(|e| format!("Failed to create modules: {:?}", e))?;
+
+    // Compile with specific generic args (tile size = 32)
+    let module_name = "my_kernels";
+    let function_name = "tile_math";
+    let function_generics = vec!["32".to_string()];
+    // Stride args for the output tensor (1D tensor with stride 1)
+    let output_strides: [i32; 1] = [1];
+    let stride_args: Vec<(&str, &[i32])> = vec![("output", &output_strides)];
+    let const_grid: Option<(u32, u32, u32)> = None;
+
+    println!("Compiling {module_name}::{function_name}");
+
+    let compiler = CUDATileFunctionCompiler::new(
+        &modules,
+        module_name,
+        function_name,
+        &function_generics,
+        &stride_args,
+        const_grid,
+        gpu_name,
+    )
+    .map_err(|e| format!("Failed to create compiler: {:?}", e))?;
+
+    let module_op = compiler
+        .compile()
+        .map_err(|e| format!("Compilation failed: {:?}", e))?;
+
+    // Print human readable MLIR IR
+    let mlir_string = module_op.as_operation().to_string();
+    println!("Generated MLIR IR:\n");
+    println!("{}", mlir_string);
+
+    // Get compiled bytecode
+    let bytecode = cuda_tile_write_bytecode_to_buffer(&module_op);
+    let raw = bytecode.to_raw();
+    let data = raw.data as *const u8;
+    let bytes: &[u8] = if data.is_null() || raw.length == 0 {
+        // `slice::from_raw_parts` requires a non-null pointer even for an empty slice.
+        &[]
+    } else {
+        // SAFETY: `data` is non-null here. This relies on `raw` describing a live buffer of
+        // `raw.length` readable bytes that `bytecode` keeps alive until the end of this
+        // function (NOTE(review): confirm against the cuda-tile-rs buffer API).
+        unsafe { slice::from_raw_parts(data, raw.length) }
+    };
+
+    println!("\nCompiled bytecode: {} bytes", bytes.len());
+    println!(
+        "First 32 bytes (hex): {:02x?}",
+        &bytes[..bytes.len().min(32)]
+    );
+
+    // Write MLIR and bytecode to files
+    std::fs::write("output.mlir", mlir_string)
+        .map_err(|e| format!("Failed to write output.mlir: {e}"))?;
+    std::fs::write("output.bc", bytes)
+        .map_err(|e| format!("Failed to write output.bc: {e}"))?;
+    Ok(())
+}
+
+/// Reads the optional target GPU name from the first CLI argument and exits non-zero on failure.
+fn main() -> ExitCode {
+    // Default to sm_90 (Hopper) if not specified
+    let gpu_name = env::args().nth(1).unwrap_or_else(|| "sm_90".to_string());
+    println!("Target GPU: {}", gpu_name);
+
+    match run(gpu_name) {
+        Ok(()) => ExitCode::SUCCESS,
+        Err(message) => {
+            eprintln!("{message}");
+            ExitCode::FAILURE
+        }
+    }
+}
-Original file line number
+Diff line change
@@ Expand Up / @@ -5,6 +5,7 @@ @@
     target
     **/*.bc
+    **/*.mlir
     **/*.cubin
     **/core
     scratch
@@ Expand Down @@