diff --git a/CMakeLists.txt b/CMakeLists.txt index a6aacfb..5e01282 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -51,6 +51,10 @@ option(DECREASE_RANSAC_AREA "Do not use 10% tracks for RANSAC near the image bor option(CUVSLAM_BUILD_SHARED_LIB "Build shared library version of cuVSLAM" TRUE) option(USE_RERUN "Use Rerun for visualization" OFF) +if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES) + set(CMAKE_CUDA_ARCHITECTURES "all" CACHE STRING "CUDA architectures to compile for (e.g. 87, 86, all)") +endif() + include(cmake/cuVSLAMUtils.cmake) setup_cuvslam_settings() diff --git a/README.md b/README.md index cf049e9..92b8962 100644 --- a/README.md +++ b/README.md @@ -145,6 +145,42 @@ make -j ``` 2. Update SRC & DST paths in `build_release.sh` +### Build natively on Jetson (aarch64) + +For building directly on a Jetson Orin device (e.g. Orin Nano, Orin NX, AGX Orin): + +1. Install build dependencies (JetPack provides CUDA runtime but not all dev packages): + ```bash + sudo apt-get update + sudo apt-get install g++ cmake git git-lfs python3-dev libcublas-dev-12-6 libcusolver-dev-12-6 + ``` + `libcublas-dev` and `libcusolver-dev` provide the headers, unversioned linker symlinks, and cmake config files needed at build time. JetPack only ships the runtime libraries by default. + +2. Clone the repository and pull LFS data (test images and datasets are stored with Git LFS): + ```bash + git clone https://github.com/nvidia-isaac/cuVSLAM.git + cd cuVSLAM + git lfs install + git lfs pull + ``` + +3. Set source and build paths (add to `~/.bashrc` for persistence): + ```bash + export CUVSLAM_SRC_DIR=~/cuVSLAM + export CUVSLAM_DST_DIR=~/cuVSLAM/build + ``` + +4. Build targeting your specific GPU architecture: + ```bash + ./build_release.sh --cuda_arch=87 + ``` + Use `--cuda_arch=87` for Orin Nano/NX/AGX (SM_87, Ampere). Omit for the default (`all` architectures). Building for a single architecture reduces binary size and compile time. + +5. 
Run tests to verify the build: + ```bash + ./build_release.sh --cuda_arch=87 --modules_test + ``` + ### Build on remote ARM Requires SSH access to the remote device. diff --git a/build_release.sh b/build_release.sh index 9c1944e..10194ff 100755 --- a/build_release.sh +++ b/build_release.sh @@ -14,6 +14,7 @@ # --build_lib Build cuvslam library and python bindings # --build_docs Build documentation # --build_type=TYPE Set CMake build type (Debug|Release[default]|RelWithDebInfo|MinSizeRel) +# --cuda_arch=ARCH Set CUDA architecture target (e.g. 87 for Orin Nano, default: all) # --jobs=N Set number of parallel jobs (default: 8) # # Environment variables (optional): @@ -35,6 +36,7 @@ APITESTS=false LIBBUILD=false BUILDDOCS=false BUILD_TYPE="Release" +CUDA_ARCH="" USE_RERUN=OFF SRC=/cuvslam/src @@ -69,6 +71,9 @@ while [ "$#" -gt 0 ]; do --jobs=*) MAKE_JOBS="${1#*=}" ;; + --cuda_arch=*) + CUDA_ARCH="${1#*=}" + ;; --build_type=*) BUILD_TYPE="${1#*=}" ;; @@ -95,14 +100,18 @@ set -v # echo each command mkdir -p $DST cd $DST -cmake -DUSE_RERUN=$USE_RERUN -DCMAKE_BUILD_TYPE=$BUILD_TYPE -S $SRC -B $DST +CMAKE_ARGS="-DUSE_RERUN=$USE_RERUN -DCMAKE_BUILD_TYPE=$BUILD_TYPE" +if [ -n "$CUDA_ARCH" ]; then + CMAKE_ARGS="$CMAKE_ARGS -DCMAKE_CUDA_ARCHITECTURES=$CUDA_ARCH" +fi +cmake $CMAKE_ARGS -S $SRC -B $DST # Build all CMake targets regardless of the flags make -j${MAKE_JOBS} -C $DST # Step 1: Run module tests if is_true "$MODULETESTS"; then echo "Module tests executed." - GTEST_FILTER=-*SpeedUp* ctest --output-on-failure || exit 1 + GTEST_FILTER=-*SpeedUp*:*Speedup* ctest --output-on-failure || exit 1 else echo "Module tests skipped." 
fi diff --git a/libs/cuda_modules/cuda_kernels/CMakeLists.txt b/libs/cuda_modules/cuda_kernels/CMakeLists.txt index 20d91f4..7310599 100644 --- a/libs/cuda_modules/cuda_kernels/CMakeLists.txt +++ b/libs/cuda_modules/cuda_kernels/CMakeLists.txt @@ -31,15 +31,13 @@ set(SOURCES add_library(cuda_kernels STATIC ${SOURCES}) set_target_properties(cuda_kernels PROPERTIES - # CMAKE_CUDA_ARCHITECTURES is set to some unusable value by default, so we can't use it to choose architectures - # CUDA_ARCHITECTURES "${CMAKE_CUDA_ARCHITECTURES}" + CUDA_ARCHITECTURES "${CMAKE_CUDA_ARCHITECTURES}" CUDA_SEPARABLE_COMPILATION ON CUDA_RESOLVE_DEVICE_SYMBOLS ON ) target_compile_options(cuda_kernels PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:--compiler-options=-fPIC,-fvisibility=hidden> - $<$<COMPILE_LANGUAGE:CUDA>:-arch=all> # instead of CMAKE_CUDA_ARCHITECTURES # Enable device debug info for compute-sanitizer support $<$<AND:$<COMPILE_LANGUAGE:CUDA>,$<CONFIG:Debug>>:-G> $<$<AND:$<COMPILE_LANGUAGE:CUDA>,$<NOT:$<CONFIG:Debug>>>:-lineinfo>