Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
target

**/*.bc
**/*.mlir
**/*.cubin
**/core
scratch
Expand Down
2 changes: 0 additions & 2 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -63,11 +63,11 @@ convert_case = "0.8"

# ── Intra-workspace ──────────────────────────────────────────────────────────
cuda-bindings = { path = "cuda-bindings", version = "0.0.1" }
cuda-core = { path = "cuda-core", version = "0.0.1" }
cuda-core = { path = "cuda-core", version = "0.0.1", default-features = false }
cuda-async = { path = "cuda-async", version = "0.0.1" }
cuda-tile-rs = { path = "cuda-tile-rs", version = "0.0.1" }
cutile-compiler = { path = "cutile-compiler", version = "0.0.1" }
cutile-macro = { path = "cutile-macro", version = "0.0.1" }
cutile = { path = "cutile", version = "0.0.1" }
cutile = { path = "cutile", version = "0.0.1", default-features = false }
cutile-examples = { path = "cutile-examples", version = "0.0.1" }
cutile-benchmarks = { path = "cutile-benchmarks", version = "0.0.1" }
2 changes: 1 addition & 1 deletion cuda-async/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ readme = "README.md"
description = "Safe Async CUDA support via Async Rust."

[dependencies]
cuda-core = { workspace = true }
cuda-core = { workspace = true, features = ["cuda"] }
futures = { workspace = true }
anyhow = { workspace = true }
thiserror = { workspace = true }
33 changes: 3 additions & 30 deletions cuda-async/src/device_context.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@
use crate::error::{device_assert, device_error, DeviceError};
use crate::scheduling_policies::{GlobalSchedulingPolicy, SchedulingPolicy, StreamPoolRoundRobin};
use cuda_core::{CudaContext, CudaFunction, CudaModule, CudaStream};
pub use cuda_core::{
PointerParamType, ScalarParamType, TensorParamType, ValidParamType, Validator,
};
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The main thing we need here is to preserve the generality of cuda-async. cutile-compiler is tile-specific, whereas cuda-async should be usable by both tile and custom CUDA kernels written outside of tile. At least that is the intent :)

Perhaps the intent behind your changes here is that the validator bit is closer, from a separation-of-concerns point of view, to the compiler? I am okay with moving the validator stuff to the compiler, but we ought to avoid having cuda-async depend on cutile-compiler.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

very good points, I'm going to rework the changes to remove this dependency.

yea the original intent was to avoid depending on cuda-async inside of cutile-compiler since the cutile-compiler only seemed to need the validator structs from cuda-async

will push updates shortly

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

One option is to move the validator code to cuda-core. I haven't thought through the gpu-dependence implications of this (there are a lot of moving parts right now), but I think it should work as a solution which provides visibility of the validator code to both cuda-async and cutile-compiler.

The dependence graph should look something like this (I'll add it to the README):

cutile-compiler
├── cuda-tile-rs
├── cuda-async
└── cuda-core

cuda-async
└── cuda-core

cuda-core
└── cuda-bindings

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Okay, I've opted to move the type validation structs into their own crate, and now the deps are:

$ wtree
cuda-core
└── cuda-bindings

cutile-compiler
├── cuda-kernel-interface
└── cuda-tile-rs

cuda-async
├── cuda-core
│   └── cuda-bindings
└── cuda-kernel-interface

cutile-examples
├── cuda-async
│   ├── cuda-core
│   │   └── cuda-bindings
│   └── cuda-kernel-interface
├── cuda-core
│   └── cuda-bindings
└── cutile
    ├── cuda-async
    │   ├── cuda-core
    │   │   └── cuda-bindings
    │   └── cuda-kernel-interface
    ├── cuda-core
    │   └── cuda-bindings
    ├── cutile-compiler
    │   ├── cuda-core
    │   │   └── cuda-bindings
    │   ├── cuda-kernel-interface
    │   └── cuda-tile-rs
    └── cutile-macro
        └── cutile-compiler
            ├── cuda-kernel-interface
            └── cuda-tile-rs

and when --no-default-features is added

(namely the examples do not depend on either core or async in this case)

$ wtree --no-default-features
cuda-core
└── cuda-bindings

cutile-compiler
├── cuda-kernel-interface
└── cuda-tile-rs

cuda-async
├── cuda-core
│   └── cuda-bindings
└── cuda-kernel-interface

cutile-examples
└── cutile
    ├── cutile-compiler
    │   ├── cuda-kernel-interface
    │   └── cuda-tile-rs
    └── cutile-macro
        └── cutile-compiler
            ├── cuda-kernel-interface
            └── cuda-tile-rs

ps the helper used to dump the dep tree is

wtree() {
  for p in cuda-core cutile-compiler cuda-async cutile-examples; do
    cargo tree -p "$p" --edges normal --depth workspace --no-dedupe --format "{p}" "$@" \
    | sed -E 's/ v[0-9].*//'
    echo
  done
}

use std::cell::Cell;
use std::collections::HashMap;
use std::hash::{DefaultHasher, Hash, Hasher};
Expand Down Expand Up @@ -36,36 +39,6 @@ pub trait FunctionKey: Hash {
}
}

/// One entry of a [`Validator`]'s parameter list.
///
/// Each variant wraps the description struct for that kind of parameter.
#[derive(Debug, Clone)]
pub enum ValidParamType {
/// A scalar parameter, described by [`ScalarParamType`].
Scalar(ScalarParamType),
/// A pointer parameter, described by [`PointerParamType`].
Pointer(PointerParamType),
/// A tensor parameter, described by [`TensorParamType`].
Tensor(TensorParamType),
}

/// Description of a scalar parameter.
#[derive(Debug, Clone)]
pub struct ScalarParamType {
/// Element type, carried as a string (the naming scheme is not visible here).
pub element_type: String,
}

/// Description of a pointer parameter.
#[derive(Debug, Clone)]
pub struct PointerParamType {
/// Mutability flag — presumably whether the pointee may be written; confirm
/// against the code that constructs this value.
pub mutable: bool,
/// Element type, carried as a string (the naming scheme is not visible here).
pub element_type: String,
}

// TODO (hme): This is not entirely tile-agnostic with this param type.
/// Description of a tensor parameter.
#[derive(Debug, Clone)]
pub struct TensorParamType {
/// Element type, carried as a string (the naming scheme is not visible here).
pub element_type: String,
/// Shape entries as `i32`; the meaning of individual entries is not shown here.
pub shape: Vec<i32>,
}

/// Ordered list of parameter descriptions.
///
/// NOTE(review): presumably one `Validator` describes one kernel's signature —
/// confirm against where these are stored and checked.
#[derive(Debug, Clone)]
pub struct Validator {
/// The parameter descriptions, one [`ValidParamType`] per entry.
pub params: Vec<ValidParamType>,
}

type DeviceFunctions = HashMap<String, (Arc<CudaModule>, Arc<CudaFunction>)>;
type DeviceFunctionValidators = HashMap<String, Arc<Validator>>;

Expand Down
6 changes: 5 additions & 1 deletion cuda-core/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,11 @@ authors.workspace = true
repository.workspace = true
readme = "README.md"

[features]
default = ["cuda"]
cuda = ["dep:cuda-bindings"]

[dependencies]
cuda-bindings = { workspace = true }
cuda-bindings = { workspace = true, optional = true }
anyhow = { workspace = true }
half = { workspace = true }
17 changes: 15 additions & 2 deletions cuda-core/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,26 @@

//! Low-level CUDA driver API bindings and safe wrappers.

// Always available (no CUDA required)
mod dtype;
mod validator;

pub use dtype::*;
pub use validator::*;

// CUDA-dependent modules (requires cuda feature)
#[cfg(feature = "cuda")]
mod api;
#[cfg(feature = "cuda")]
mod cudarc_shim;
mod dtype;
#[cfg(feature = "cuda")]
mod error;

#[cfg(feature = "cuda")]
pub use api::*;
#[cfg(feature = "cuda")]
pub use cuda_bindings as sys;
#[cfg(feature = "cuda")]
pub use cudarc_shim::*;
pub use dtype::*;
#[cfg(feature = "cuda")]
pub use error::*;
35 changes: 35 additions & 0 deletions cuda-core/src/validator.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*/

//! Shared kernel interface metadata used at the compiler/runtime boundary.

/// One entry of a [`Validator`]'s parameter list.
///
/// Each variant wraps the description struct for that kind of parameter.
#[derive(Debug, Clone)]
pub enum ValidParamType {
/// A scalar parameter, described by [`ScalarParamType`].
Scalar(ScalarParamType),
/// A pointer parameter, described by [`PointerParamType`].
Pointer(PointerParamType),
/// A tensor parameter, described by [`TensorParamType`].
Tensor(TensorParamType),
}

/// Description of a scalar parameter.
#[derive(Debug, Clone)]
pub struct ScalarParamType {
/// Element type, carried as a string (the naming scheme is not visible here).
pub element_type: String,
}

/// Description of a pointer parameter.
#[derive(Debug, Clone)]
pub struct PointerParamType {
/// Mutability flag — presumably whether the pointee may be written; confirm
/// against the code that constructs this value.
pub mutable: bool,
/// Element type, carried as a string (the naming scheme is not visible here).
pub element_type: String,
}

/// Description of a tensor parameter.
#[derive(Debug, Clone)]
pub struct TensorParamType {
/// Element type, carried as a string (the naming scheme is not visible here).
pub element_type: String,
/// Shape entries as `i32`; the meaning of individual entries is not shown here.
pub shape: Vec<i32>,
}

/// Ordered list of parameter descriptions.
///
/// NOTE(review): presumably one `Validator` describes one kernel's signature —
/// confirm against where these are stored and checked.
#[derive(Debug, Clone)]
pub struct Validator {
/// The parameter descriptions, one [`ValidParamType`] per entry.
pub params: Vec<ValidParamType>,
}
2 changes: 1 addition & 1 deletion cutile-benchmarks/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -39,5 +39,5 @@ tokio = { workspace = true }
cutile-compiler = { workspace = true }
cuda-async = { workspace = true }
cuda-core = { workspace = true }
cutile = { workspace = true }
cutile = { workspace = true, features = ["cuda"] }
cutile-examples = { workspace = true }
2 changes: 1 addition & 1 deletion cutile-book/tutorials/04-matrix-multiplication.md
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ use cuda_core::CudaContext;
use std::sync::Arc;
use cutile;
use cutile::api;
use cutile::candle_core::WithDType;
use candle_core::WithDType;
use cutile::error::Error;
use cutile::tensor::{IntoPartition, Tensor, ToHostVec, Unpartition};
use cutile::tile_kernel::TileKernel;
Expand Down
7 changes: 5 additions & 2 deletions cutile-compiler/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,10 @@ repository.workspace = true
readme = "README.md"
keywords = ["mlir", "cutile", "compiler"]

[features]
default = []
cuda = ["cuda-core/cuda"]

[dependencies]
melior = { workspace = true }
syn = { workspace = true }
Expand All @@ -23,5 +27,4 @@ mlir-sys = { workspace = true }
anyhow = { workspace = true }
stacker = { workspace = true }
cuda-tile-rs = { workspace = true }
cuda-core = { workspace = true }
cuda-async = { workspace = true }
cuda-core = { workspace = true, default-features = false }
2 changes: 1 addition & 1 deletion cutile-compiler/src/compiler/_function.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ use crate::kernel_entry_generator::generate_entry_point;
use crate::kernel_naming::KernelNaming;
use crate::syn_utils::*;
use crate::types::*;
use cuda_async::device_context::Validator;
use cuda_core::Validator;
use cuda_tile_rs::operation_parse;
use melior::ir::attribute::StringAttribute;
use melior::ir::operation::{OperationBuilder, OperationLike};
Expand Down
4 changes: 1 addition & 3 deletions cutile-compiler/src/kernel_entry_generator.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,7 @@ use crate::generics::{GenericVars, TypeInstance};
use crate::kernel_naming::KernelNaming;
use crate::syn_utils::{get_fn_arg_var_name, get_ident_from_path_expr, get_ident_generic_args};
use crate::types::{get_primitives_attrs, get_type_mutability};
use cuda_async::device_context::{
PointerParamType, ScalarParamType, TensorParamType, ValidParamType, Validator,
};
use cuda_core::{PointerParamType, ScalarParamType, TensorParamType, ValidParamType, Validator};
use proc_macro2::Ident;
use proc_macro2::Span;
use quote::ToTokens;
Expand Down
2 changes: 2 additions & 0 deletions cutile-compiler/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
extern crate core;

pub use cuda_tile_rs::cuda_tile;
pub use cuda_tile_rs::cuda_tile_write_bytecode_to_buffer;
use cuda_tile_rs::register_cuda_tile_dialects;
use melior::{
dialect::DialectRegistry,
Expand All @@ -18,6 +19,7 @@ use melior::{

pub mod ast;
mod bounds;
#[cfg(feature = "cuda")]
pub mod cuda_tile_runtime_utils;
pub mod error;
pub mod generics;
Expand Down
15 changes: 9 additions & 6 deletions cutile-examples/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,14 @@ repository.workspace = true
readme = "README.md"
publish = false

[features]
default = ["cuda"]
cuda = ["cutile/cuda", "dep:cuda-core", "dep:cuda-async", "dep:tokio", "dep:candle-core", "dep:candle-nn"]

[dependencies]
cutile-compiler = { workspace = true }
cutile = { workspace = true }
cuda-core = { workspace = true }
cuda-async = { workspace = true }
tokio = { workspace = true }
candle-core = { workspace = true }
candle-nn = { workspace = true }
cuda-core = { workspace = true, optional = true }
cuda-async = { workspace = true, optional = true }
tokio = { workspace = true, optional = true }
candle-core = { workspace = true, optional = true }
candle-nn = { workspace = true, optional = true }
Comment thread
elibol marked this conversation as resolved.
6 changes: 3 additions & 3 deletions cutile-examples/examples/async_mlp_fused.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,12 @@ use cuda_async::device_operation::*;
use cuda_async::launch::AsyncKernelLaunch;
use cuda_async::scheduling_policies::WithDeviceId;
use cuda_core::LaunchConfig;
use cutile::cutile_compiler::compiler::{CUDATileFunctionCompiler, CUDATileModules};
use cutile::cutile_compiler::cuda_tile::ModuleOperation;
use cutile::cutile_compiler::cuda_tile_runtime_utils::{compile_module, get_gpu_name};
use cutile::tensor::{Tensor, ToHostVec};
use cutile::tile_kernel::IntoDeviceOperationPartition;
use cutile::{api, error::Error};
use cutile_compiler::compiler::{CUDATileFunctionCompiler, CUDATileModules};
use cutile_compiler::cuda_tile::ModuleOperation;
use cutile_compiler::cuda_tile_runtime_utils::{compile_module, get_gpu_name};
use std::sync::Arc;

#[cutile::module]
Expand Down
105 changes: 105 additions & 0 deletions cutile-examples/examples/compile_only.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Example: Compile to MLIR and bytecode without requiring a GPU to run on
Comment thread
elibol marked this conversation as resolved.
*
* Run with: cargo run -p cutile-examples --example compile_only --no-default-features
*/

use cutile::cutile_compiler::compiler::{CUDATileFunctionCompiler, CUDATileModules};
use cutile::cutile_compiler::cuda_tile_write_bytecode_to_buffer;
use std::env;
use std::slice;

// Build with --no-default-features to compile kernels without a GPU.
#[cutile::module]
mod my_kernels {
use cutile::core::*;

/// Simple kernel that does tile math without dynamic tensor inputs
#[cutile::entry()]
Comment thread
elibol marked this conversation as resolved.
// Builds one tile from `scalar` and one from the constant 1.0 (both via
// `broadcast_scalar` with `output.shape()`), adds them, and stores the sum
// into `output`.
fn tile_math<const S: [i32; 1]>(output: &mut Tensor<f32, S>, scalar: f32) {
// Get block ID and create tiles
// NOTE(review): `_pid` is not read again anywhere in this body.
let _pid = get_tile_block_id().0;
let scalar_tile: Tile<f32, S> = broadcast_scalar(scalar, output.shape());
let ones: Tile<f32, S> = broadcast_scalar(1.0f32, output.shape());

// Simple computation
let result = scalar_tile + ones;
// Write the sum back through the output tensor.
output.store(result);
}
}

/// Entry point: compiles `my_kernels::tile_math` without launching anything.
///
/// The first command-line argument selects the target GPU name; `sm_90` is
/// used when it is absent. On any failure the message is printed to stderr and
/// the process exits with status 1, so callers and scripts can detect it.
fn main() {
    // Default to sm_90 (Hopper) if not specified
    let gpu_name = env::args().nth(1).unwrap_or_else(|| "sm_90".to_string());
    println!("Target GPU: {}", gpu_name);

    if let Err(message) = compile_and_dump(gpu_name) {
        eprintln!("{message}");
        // All locals of `compile_and_dump` are already dropped at this point,
        // so exiting here skips no pending cleanup.
        std::process::exit(1);
    }
}

/// Compiles the kernel for `gpu_name`, prints the MLIR text and a bytecode
/// preview, and writes `output.mlir` and `output.bc` to the current directory.
///
/// # Errors
///
/// Returns a printable message when module creation, compiler construction,
/// compilation, or either file write fails.
fn compile_and_dump(gpu_name: String) -> Result<(), String> {
    // Get the module ASTs from the generated code
    let module_asts = my_kernels::_module_asts();

    // Create the modules container
    let modules = CUDATileModules::new(module_asts)
        .map_err(|e| format!("Failed to create modules: {:?}", e))?;

    // Compile with specific generic args (tile size = 32)
    let module_name = "my_kernels";
    let function_name = "tile_math";
    let function_generics = vec!["32".to_string()];
    // Stride args for the output tensor (1D tensor with stride 1)
    let output_strides: [i32; 1] = [1];
    let stride_args: Vec<(&str, &[i32])> = vec![("output", &output_strides)];
    let const_grid: Option<(u32, u32, u32)> = None;

    println!("Compiling {module_name}::{function_name}");

    let compiler = CUDATileFunctionCompiler::new(
        &modules,
        module_name,
        function_name,
        &function_generics,
        &stride_args,
        const_grid,
        // `gpu_name` is not needed afterwards, so it is moved rather than cloned.
        gpu_name,
    )
    .map_err(|e| format!("Failed to create compiler: {:?}", e))?;

    let module_op = compiler
        .compile()
        .map_err(|e| format!("Compilation failed: {:?}", e))?;

    // Print human readable MLIR IR
    let mlir_string = module_op.as_operation().to_string();
    println!("Generated MLIR IR:\n");
    println!("{}", mlir_string);

    // Get compiled bytecode
    let bytecode = cuda_tile_write_bytecode_to_buffer(&module_op);
    let raw = bytecode.to_raw();
    let data = raw.data as *const u8;
    // `slice::from_raw_parts` requires a non-null pointer even for length 0,
    // so an empty or null buffer is mapped to an empty slice instead.
    let bytes: &[u8] = if data.is_null() || raw.length == 0 {
        &[]
    } else {
        // SAFETY: assumes `raw` describes `raw.length` initialized, readable
        // bytes owned by `bytecode`, which stays alive until this function
        // returns — TODO confirm against `cuda_tile_write_bytecode_to_buffer`.
        unsafe { slice::from_raw_parts(data, raw.length) }
    };

    println!("\nCompiled bytecode: {} bytes", bytes.len());
    println!(
        "First 32 bytes (hex): {:02x?}",
        &bytes[..bytes.len().min(32)]
    );

    // Write MLIR and bytecode to files
    std::fs::write("output.mlir", &mlir_string)
        .map_err(|e| format!("Failed to write output.mlir: {e}"))?;
    std::fs::write("output.bc", bytes).map_err(|e| format!("Failed to write output.bc: {e}"))?;

    Ok(())
}
Loading
Loading