diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml
index 5229955a..4b79385f 100644
--- a/.github/workflows/pr.yml
+++ b/.github/workflows/pr.yml
@@ -47,7 +47,6 @@ jobs:
           # Install rust
           curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
           source "${HOME}/.cargo/env"
-          rustup default nightly
           wget https://apt.llvm.org/llvm.sh
           bash ./llvm.sh 21
           apt install -y --no-install-recommends libmlir-21-dev mlir-21-tools libpolly-21-dev
@@ -66,7 +65,7 @@ jobs:
       - name: Clippy
         run: |
           source "${HOME}/.cargo/env"
-          cargo clippy
+          cargo clippy -- --deny clippy::all --allow clippy::missing_safety_doc --allow clippy::type_complexity
 
       - name: Test (compile only)
         run: |
diff --git a/cuda-async/src/device_future.rs b/cuda-async/src/device_future.rs
index 9f78b34e..2b20acfc 100644
--- a/cuda-async/src/device_future.rs
+++ b/cuda-async/src/device_future.rs
@@ -33,7 +33,7 @@ pub enum DeviceFutureState {
 }
 
 /// Shared state between a CUDA stream callback and the async waker.
-#[derive(Debug)]
+#[derive(Debug, Default)]
 pub struct StreamCallbackState {
     pub(crate) waker: AtomicWaker,
     pub(crate) complete: AtomicBool,
@@ -42,10 +42,7 @@ pub struct StreamCallbackState {
 impl StreamCallbackState {
     /// Creates a new callback state with the completion flag unset.
     pub fn new() -> Self {
-        Self {
-            waker: AtomicWaker::new(),
-            complete: AtomicBool::new(false),
-        }
+        Self::default()
     }
     /// Marks the operation as complete and wakes the associated task.
     pub fn signal(&self) {
diff --git a/cuda-async/src/device_operation.rs b/cuda-async/src/device_operation.rs
index 6a226499..45fce080 100644
--- a/cuda-async/src/device_operation.rs
+++ b/cuda-async/src/device_operation.rs
@@ -391,7 +391,7 @@ pub trait DeviceOp:
 /// - `dup`, `copy_host_vec_to_device`
 ///
 /// See [`Scope`](crate::cuda_graph::Scope) for the full safety proof.
-pub trait GraphNode: DeviceOp {}
+pub trait GraphNode {}
 
 // Arc
 
@@ -987,10 +987,12 @@ where
         if !self.computed.load(Ordering::Acquire) {
             // Safety: This block is guaranteed to execute at most once.
             // Put the input in a box so the pointer is dropped when this block exits.
-            let input = unsafe { (&mut *self.input.get()).take() }.ok_or(device_error(
-                context.get_device_id(),
-                "Select operation failed.",
-            ))?;
+            let cell = self.input.get();
+            let cell = unsafe { cell.as_mut() };
+            let input = cell
+                .unwrap()
+                .take()
+                .ok_or_else(|| device_error(context.get_device_id(), "Select operation failed."))?;
             let (left, right) = input.execute(context)?;
             // Update internal state.
             unsafe {
@@ -1002,12 +1004,14 @@ where
         Ok(())
     }
     unsafe fn left(&self) -> T1 {
-        let left = unsafe { (&mut *self.left.get()).take() }.unwrap();
-        left
+        let cell = self.left.get();
+        let cell = unsafe { cell.as_mut() };
+        cell.unwrap().take().unwrap()
     }
     unsafe fn right(&self) -> T2 {
-        let right = unsafe { (&mut *self.right.get()).take() }.unwrap();
-        right
+        let cell = self.right.get();
+        let cell = unsafe { cell.as_mut() };
+        cell.unwrap().take().unwrap()
     }
 }
 
diff --git a/cuda-async/src/launch.rs b/cuda-async/src/launch.rs
index a7abe072..ceb0db7c 100644
--- a/cuda-async/src/launch.rs
+++ b/cuda-async/src/launch.rs
@@ -33,9 +33,9 @@ impl Drop for AsyncKernelLaunch {
         let _ = self
             .args
             .iter()
-            .map(|arg| {
+            .map(|&arg| {
                 // Reconstruct the boxes. Pointers will be dropped when they go out of scope.
-                unsafe { Box::from_raw(*arg) }
+                unsafe { Box::from_raw(arg as *mut usize) }
             })
             .collect::<Vec<_>>();
     }
diff --git a/cuda-core/src/cudarc_shim.rs b/cuda-core/src/cudarc_shim.rs
index 7afaac98..18491992 100644
--- a/cuda-core/src/cudarc_shim.rs
+++ b/cuda-core/src/cudarc_shim.rs
@@ -40,7 +40,6 @@ pub(crate) mod primary_ctx {
 }
 
 /// Low-level device query operations.
-
 #[allow(dead_code)]
 pub(crate) mod device {
 
@@ -191,7 +190,6 @@ pub(crate) mod ctx {
 }
 
 /// Low-level CUDA stream operations.
-
 #[allow(dead_code)]
 pub(crate) mod stream {
     use super::{DriverError, IntoResult};
diff --git a/cutile-ir/src/bytecode/encoding.rs b/cutile-ir/src/bytecode/encoding.rs
index f575b9e1..32b72cdc 100644
--- a/cutile-ir/src/bytecode/encoding.rs
+++ b/cutile-ir/src/bytecode/encoding.rs
@@ -20,16 +20,13 @@ pub struct EncodingWriter {
 
 impl EncodingWriter {
     pub fn new() -> Self {
-        Self {
-            buf: Vec::new(),
-            required_alignment: 1,
-        }
+        Self::default()
     }
 
     pub fn with_capacity(cap: usize) -> Self {
         Self {
             buf: Vec::with_capacity(cap),
-            required_alignment: 1,
+            ..Self::default()
         }
     }
 
@@ -223,6 +220,15 @@ impl EncodingWriter {
     }
 }
 
+impl Default for EncodingWriter {
+    fn default() -> Self {
+        Self {
+            buf: Vec::new(),
+            required_alignment: 1,
+        }
+    }
+}
+
 /// Patch a `u32` value at `offset` in the buffer (little-endian).
 pub fn patch_u32(buf: &mut [u8], offset: usize, value: u32) {
     buf[offset..offset + 4].copy_from_slice(&value.to_le_bytes());
@@ -270,7 +276,7 @@ fn convert_to_f8(
     // Handle special values.
     if f64_exp == 0x7FF {
         // Inf or NaN
-        if f64_man != 0 || (nan_only_all_ones && f64_man == 0) {
+        if f64_man != 0 || nan_only_all_ones {
             // NaN (or Inf mapped to NaN for formats without infinities)
             if nan_only_all_ones {
                 return (sign << 7) | ((max_exp as u8) << man_bits) | man_mask;
diff --git a/cutile-ir/src/ir/fmt.rs b/cutile-ir/src/ir/fmt.rs
index b6dfd1f6..d17641f3 100644
--- a/cutile-ir/src/ir/fmt.rs
+++ b/cutile-ir/src/ir/fmt.rs
@@ -1072,7 +1072,7 @@ impl<'a> ModulePrinter<'a> {
         let pad = " ".repeat(self.indent);
 
         // Operands: [lb, ub, step, init_values...]
-        let lb = op.operands.get(0).map(|v| v.index());
+        let lb = op.operands.first().map(|v| v.index());
         let ub = op.operands.get(1).map(|v| v.index());
         let step = op.operands.get(2).map(|v| v.index());
         let init_values = &op.operands[3.min(op.operands.len())..];
@@ -2601,7 +2601,7 @@ fn format_dense_i32_array(attr: &Attribute) -> String {
                 .collect();
             format!("[{}]", elems.join(", "))
         }
-        _ => format!("{}", format_attr(attr)),
+        _ => format_attr(attr).to_string(),
     }
 }
 
diff --git a/cutile-ir/src/ir/types.rs b/cutile-ir/src/ir/types.rs
index 8bcce05a..1b572053 100644
--- a/cutile-ir/src/ir/types.rs
+++ b/cutile-ir/src/ir/types.rs
@@ -266,11 +266,7 @@ fn strip_prefix_suffix<'a>(s: &'a str, prefix: &str, _suffix: &str) -> Option<&'
         }
     }
     // No matching close — try without nesting (just strip last char if it's '>').
-    if after_prefix.ends_with('>') {
-        Some(&after_prefix[..after_prefix.len() - 1])
-    } else {
-        None
-    }
+    after_prefix.strip_suffix('>')
 }
 
 fn parse_scalar(s: &str) -> Option<ScalarType> {
@@ -314,7 +310,7 @@ fn parse_tile(inner: &str) -> Option<Type> {
             before
                 .trim_end_matches('x')
                 .split('x')
-                .map(|d| parse_dim(d))
+                .map(parse_dim)
                 .collect()
         };
         let ptr_inner_start = ptr_start + "ptr<".len();
@@ -368,7 +364,7 @@ fn parse_tensor_view(inner: &str) -> Option<Type> {
 
     let strides = if let Some(sp) = strides_part {
         let sp = sp.trim_start_matches('[').trim_end_matches(']');
-        sp.split(',').map(|s| parse_dim(s)).collect()
+        sp.split(',').map(parse_dim).collect()
     } else {
         vec![DYNAMIC; shape.len()]
     };
diff --git a/cutile-macro/src/_module.rs b/cutile-macro/src/_module.rs
index 6bc66756..20585200 100644
--- a/cutile-macro/src/_module.rs
+++ b/cutile-macro/src/_module.rs
@@ -411,7 +411,7 @@ pub fn trait_(mut item: ItemTrait) -> Result<TokenStream, Error> {
     );
     let res = match attributes {
         Some(attributes)
-            if attributes.name_as_str().as_deref() == Some("cuda_tile :: variadic_trait") =>
+            if attributes.name_as_str().is_some_and(|n| n == "cuda_tile :: variadic_trait") =>
         {
             desugar_variadic_trait_decl(&item)?
         }
diff --git a/cutile-macro/src/rank_instantiation.rs b/cutile-macro/src/rank_instantiation.rs
index f7d66b19..cb14990f 100644
--- a/cutile-macro/src/rank_instantiation.rs
+++ b/cutile-macro/src/rank_instantiation.rs
@@ -1109,9 +1109,7 @@ impl RankInstantiator {
     /// Rewrite a free-fn signature (generics, args, return) and its body.
     pub fn rewrite_function(mut self, item: &ItemFn) -> Result<ItemFn, Error> {
         let mut item = item.clone();
-        if let Err(e) = rewrite_fn_sig(&mut item.sig, &self.bindings) {
-            return Err(e);
-        }
+        rewrite_fn_sig(&mut item.sig, &self.bindings)?;
         self.visit_block_mut(&mut item.block);
         self.into_result(item)
     }
@@ -1125,9 +1123,7 @@ impl RankInstantiator {
             Ok(t) => *item.self_ty = t,
             Err(e) => return Err(e),
         }
-        if let Err(e) = rewrite_generics_for_rank(&mut item.generics, &self.bindings) {
-            return Err(e);
-        }
+        rewrite_generics_for_rank(&mut item.generics, &self.bindings)?;
         if let Some(trait_) = &mut item.trait_ {
             let path = &mut trait_.1;
             if path.segments.is_empty() {
@@ -1138,9 +1134,7 @@ impl RankInstantiator {
             }
             let last_seg = path.segments.last_mut().unwrap();
             if let PathArguments::AngleBracketed(path_args) = &mut last_seg.arguments {
-                if let Err(e) = rewrite_generic_args_for_rank(path_args, &self.bindings) {
-                    return Err(e);
-                }
+                rewrite_generic_args_for_rank(path_args, &self.bindings)?;
             }
         }
 
@@ -1164,8 +1158,8 @@ impl RankInstantiator {
                     }
                     let mut result = fn_impl.clone();
                     self.rewrite_impl_method(&original_self_ty, &mut result);
-                    if self.error.is_some() {
-                        return Err(self.error.unwrap());
+                    if let Some(error) = self.error {
+                        return Err(error);
                     }
                     impl_items.push(ImplItem::Fn(result));
                 }
diff --git a/cutile-macro/src/shadow_dispatch.rs b/cutile-macro/src/shadow_dispatch.rs
index c6bd1193..d4f1c499 100644
--- a/cutile-macro/src/shadow_dispatch.rs
+++ b/cutile-macro/src/shadow_dispatch.rs
@@ -477,10 +477,8 @@ impl RankPolyOpSpec {
         }
         // Rank-dependent non-shape arg types as trait generics (e.g. `Idx0`
         // for `idx: [i32; N]`). Caller's array literal pins them.
-        for slot in &self.rank_dep_arg_idents {
-            if let Some(id) = slot {
-                all_trait_params.push(quote! { #id });
-            }
+        for id in self.rank_dep_arg_idents.iter().flatten() {
+            all_trait_params.push(quote! { #id });
         }
         if let Some(ref out) = extra_out_trait_param {
             all_trait_params.push(out.clone());
@@ -565,7 +563,7 @@ impl RankPolyOpSpec {
         let mut return_concrete = rewrite_ty_for_rank(&self.return_type, combo, &self.cgas);
         for (orig, replacement) in self.dead_lifetimes.iter().zip(self.dead_lt_idents.iter()) {
             return_concrete =
-                replace_lifetimes_with(&return_concrete, &[orig.clone()], replacement);
+                replace_lifetimes_with(&return_concrete, std::slice::from_ref(orig), replacement);
         }
 
         let mut trait_instantiation_args: Vec<TokenStream2> = Vec::new();
@@ -773,10 +771,8 @@ impl RankPolyOpSpec {
             trait_args.push(quote! { #i });
         }
         // Rank-dep arg generics, matching trait declaration ordering.
-        for slot in &self.rank_dep_arg_idents {
-            if let Some(id) = slot {
-                trait_args.push(quote! { #id });
-            }
+        for id in self.rank_dep_arg_idents.iter().flatten() {
+            trait_args.push(quote! { #id });
         }
         if use_free_out {
             trait_args.push(quote! { #out_ident });
@@ -827,10 +823,8 @@ impl RankPolyOpSpec {
         for i in &extra_shape_generic_idents {
             all_wrapper_generics.push(quote! { #i });
         }
-        for slot in &self.rank_dep_arg_idents {
-            if let Some(id) = slot {
-                all_wrapper_generics.push(quote! { #id });
-            }
+        for id in self.rank_dep_arg_idents.iter().flatten() {
+            all_wrapper_generics.push(quote! { #id });
         }
         if use_free_out {
             all_wrapper_generics.push(quote! { #out_ident });
@@ -1744,7 +1738,7 @@ pub fn desugar_variadic_trait_decl(item: &ItemTrait) -> Result<TokenStream2, Err
     for param in &item.generics.params {
         let drop_it = matches!(
             param,
-            GenericParam::Const(c) if cga_idents.iter().any(|i| *i == c.ident)
+            GenericParam::Const(c) if cga_idents.contains(&c.ident)
         );
         if !drop_it {
             new_params.push(param.clone());
@@ -1942,7 +1936,7 @@ fn emit_variadic_trait_impl_for_rank(
     for param in &item.generics.params {
         let skip = matches!(
             param,
-            GenericParam::Const(c) if cga_idents.iter().any(|i| *i == c.ident)
+            GenericParam::Const(c) if cga_idents.contains(&c.ident)
         );
         if !skip {
             all_impl_params.push(quote! { #param });
@@ -2012,7 +2006,7 @@ fn rewrite_trait_method_for_rank_poly(
                 .map(|(i, _)| i);
             if let Some(i) = cga_idx {
                 if let CgaRole::ShapeBound { sh_ident } = &shape.roles[i] {
-                    pt.ty = Box::new(syn::parse_quote! { #sh_ident });
+                    *pt.ty = syn::parse_quote! { #sh_ident };
                 }
                 // Free CGAs aren't in args by definition (classify_cgas's
                 // post-condition), so reaching this branch with a Free role
@@ -2031,7 +2025,7 @@ fn rewrite_trait_method_for_rank_poly(
             } else {
                 syn::parse_quote! { Self::Out }
             };
-            *ret = Box::new(new_ret);
+            **ret = new_ret;
         }
     }
 }
@@ -2110,13 +2104,13 @@ fn rewrite_impl_method_body_for_rank(
         if let FnArg::Typed(pt) = arg {
             let new_ty = rewrite_ty_for_rank(&pt.ty, combo, cgas);
             let new_ty = bind_anon_lifetimes_to(&new_ty, recv_lt);
-            pt.ty = Box::new(new_ty);
+            *pt.ty = new_ty;
         }
     }
     if let ReturnType::Type(_, ret) = &mut new_sig.output {
         let new_ret = rewrite_ty_for_rank(ret, combo, cgas);
         let new_ret = bind_anon_lifetimes_to(&new_ret, recv_lt);
-        *ret = Box::new(new_ret);
+        **ret = new_ret;
     }
     let muted_args: Vec<TokenStream2> = new_sig
         .inputs
@@ -2212,10 +2206,8 @@ fn type_uses_lifetime(ty: &Type) -> bool {
                     for arg in ab.args.iter() {
                         match arg {
                             GenericArgument::Lifetime(_) => return true,
-                            GenericArgument::Type(t) => {
-                                if type_uses_lifetime(t) {
-                                    return true;
-                                }
+                            GenericArgument::Type(t) if type_uses_lifetime(t) => {
+                                return true;
                             }
                             _ => {}
                         }
@@ -2236,11 +2228,10 @@ fn filter_cuda_tile_attrs(attrs: &[syn::Attribute]) -> Vec<syn::Attribute> {
     attrs
         .iter()
         .filter(|a| {
-            let path = a.path();
-            !path
+            a.path()
                 .segments
                 .first()
-                .is_some_and(|s| s.ident == "cuda_tile")
+                .is_none_or(|s| s.ident != "cuda_tile")
         })
         .cloned()
         .collect()
diff --git a/cutile/src/api.rs b/cutile/src/api.rs
index d767477a..0c7b6469 100644
--- a/cutile/src/api.rs
+++ b/cutile/src/api.rs
@@ -279,7 +279,6 @@ pub fn memcpy<T: DType>(dst: &mut Tensor<T>, src: &Tensor<T>) -> Memcpy {
 /// the destination is borrowed immutably but written to through the device
 /// pointer during graph replay.
 ///
-
 pub struct Memcpy {
     src_ptr: cuda_core::sys::CUdeviceptr,
     dst_ptr: cuda_core::sys::CUdeviceptr,
@@ -637,6 +636,7 @@ pub fn convert<FromType: DType, ToType: DType>(
 /// - `std`: Standard deviation
 /// - `shape`: Tensor shape
 /// - `seed`: Optional random seed for reproducibility
+///
 /// Generates a tensor with values from a normal distribution.
 ///
 /// Supports `f32` and `f64` natively via cuRAND. For `f16`, generates `f32`
@@ -674,7 +674,7 @@ pub fn randn_f16<const RANK: usize>(
             let res = value((Arc::new(src_tensor), dst))
                 .then(convert_apply)
                 .unzip();
-            res.1.unpartition().reshape(&shape.to_vec())
+            res.1.unpartition().reshape(shape.as_ref())
         })
     })
 }
diff --git a/cutile/src/tensor.rs b/cutile/src/tensor.rs
index d8a482ce..e7b19b69 100644
--- a/cutile/src/tensor.rs
+++ b/cutile/src/tensor.rs
@@ -578,7 +578,7 @@ impl<T: DType> Tensor<T> {
             return tensor_error_result("Reinterpret shape must preserve total byte size.");
         }
         let alignment = align_of::<U>() as u64;
-        if alignment > 1 && self.cu_deviceptr() % alignment != 0 {
+        if alignment > 1 && !self.cu_deviceptr().is_multiple_of(alignment) {
             return tensor_error_result(
                 "Tensor storage alignment is incompatible with reinterpret target type.",
             );
@@ -826,7 +826,7 @@ impl<T: DType> Reshape for Tensor<T> {
     }
 }
 
-impl<'a, T: DType> Reshape for &'a Arc<Tensor<T>> {
+impl<T: DType> Reshape for &Arc<Tensor<T>> {
     type Output = Arc<Tensor<T>>;
     fn reshape(self, shape: &[usize]) -> Result<Arc<Tensor<T>>, Error> {
         self.reshape_shared(shape)
@@ -1052,7 +1052,7 @@ impl<'a, T: DType> PartitionMut<'a, T> for &'a mut Tensor<T> {
     }
 }
 
-impl<'a, T: DType> Partition<&'a mut Tensor<T>> {
+impl<T: DType> Partition<&mut Tensor<T>> {
     pub fn dtype_str(&self) -> &'static str {
         T::DTYPE.as_str()
     }
@@ -1318,7 +1318,7 @@ impl<T: DType> KernelOutputStored<T> for Partition<Tensor<T>> {
     }
 }
 
-impl<'a, T: DType> KernelOutputStored<T> for Partition<&'a mut Tensor<T>> {
+impl<T: DType> KernelOutputStored<T> for Partition<&mut Tensor<T>> {
     fn push_kernel_args(&self, launcher: &mut AsyncKernelLaunch) {
         unsafe {
             launcher.push_device_ptr(self.object.cu_deviceptr());
@@ -1462,7 +1462,7 @@ impl<T: DType> KernelInputStored for Arc<Tensor<T>> {
     }
 }
 
-impl<'a, T: DType + Sync> KernelInputStored for &'a Tensor<T> {
+impl<T: DType + Sync> KernelInputStored for &Tensor<T> {
     fn push_kernel_args(&self, launcher: &mut AsyncKernelLaunch) {
         unsafe {
             launcher.push_device_ptr(self.cu_deviceptr());
diff --git a/cutile/src/tile_kernel.rs b/cutile/src/tile_kernel.rs
index 72345fc4..ee9d77df 100644
--- a/cutile/src/tile_kernel.rs
+++ b/cutile/src/tile_kernel.rs
@@ -48,6 +48,7 @@ pub struct TileFunctionKey {
 }
 
 impl TileFunctionKey {
+    #[allow(clippy::too_many_arguments)]
     pub fn new(
         module_name: String,
         function_name: String,
@@ -580,7 +581,7 @@ impl<T: DType> KernelArgument for &Partition<Tensor<T>> {
 }
 
 /// Same as above but for borrowed mutable tensor partitions.
-impl<'a, T: DType> KernelArgument for &Partition<&'a mut Tensor<T>> {
+impl<T: DType> KernelArgument for &Partition<&mut Tensor<T>> {
     fn push_arg(self, launcher: &mut AsyncKernelLaunch) {
         unsafe {
             launcher.push_device_ptr(self.object.cu_deviceptr());