From f563b7cf6fa02dbc3f49db90c3317ee7865fd6f3 Mon Sep 17 00:00:00 2001 From: James Schloss Date: Wed, 6 Sep 2023 16:12:31 +0200 Subject: [PATCH 01/24] Adding KernelAbstractions tooling for Molly and tests --- .github/workflows/CI.yml | 1 + .gitignore | 2 +- Project.toml | 8 +- benchmark/benchmarks.jl | 52 +- benchmark/protein.jl | 22 +- docs/src/documentation.md | 15 +- ext/MollyCUDAEnzymeExt.jl | 13 + ext/MollyCUDAExt.jl | 1235 ++++++++++++++++++++++++++ ext/MollyEnzymeExt.jl | 3 - ext/MollyGLMakieExt.jl | 2 +- ext/MollyPythonCallExt.jl | 6 +- src/Molly.jl | 5 +- src/analysis.jl | 3 +- src/coupling.jl | 6 +- src/energy.jl | 10 +- src/force.jl | 23 +- src/interactions/implicit_solvent.jl | 144 ++- src/kernels.jl | 371 ++++++++ src/neighbors.jl | 43 +- src/setup.jl | 102 +-- src/simulators.jl | 20 +- src/spatial.jl | 22 +- src/types.jl | 93 +- test/Project.toml | 2 + test/basic.jl | 43 +- test/energy_conservation.jl | 20 +- test/gradients.jl | 57 +- test/minimization.jl | 14 +- test/protein.jl | 14 +- test/runtests.jl | 30 +- test/simulation.jl | 94 +- 31 files changed, 2059 insertions(+), 416 deletions(-) create mode 100644 ext/MollyCUDAEnzymeExt.jl create mode 100644 ext/MollyCUDAExt.jl create mode 100644 src/kernels.jl diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index 7bb822ad9..3145e136e 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -29,6 +29,7 @@ jobs: - NotGradients - Gradients steps: + - run: export UCX_ERROR_SIGNALS="SIGILL,SIGBUS,SIGFPE" - uses: actions/checkout@v4 - uses: julia-actions/setup-julia@v2 with: diff --git a/.gitignore b/.gitignore index 293442edd..697b70410 100644 --- a/.gitignore +++ b/.gitignore @@ -2,7 +2,7 @@ *.jl.*.cov *.jl.mem docs/build -/Manifest.toml +*Manifest.toml benchmark/tune.json benchmark/results .vscode/settings.json diff --git a/Project.toml b/Project.toml index 0c895314b..a19459ed9 100644 --- a/Project.toml +++ b/Project.toml @@ -8,7 +8,6 @@ Atomix = "a9b6321e-bd34-4604-b9c9-b65b8de01458" AtomsBase = "a963bdd2-2df7-4f54-a1ee-49d51e6be12a" AtomsCalculators = "a3e0e189-c65a-42c1-833c-339540406eb1" BioStructures = "de9282ab-8554-53be-b2d6-f6c222edabfc" -CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" CellListMap = "69e1c6dd-3888-40e6-b3c8-31ac5f578864" Chemfiles = "46823bd8-5fb3-5f92-9aa0-96921f3dd015" Combinatorics = "861a8166-3701-5b0c-9a16-15d98fcdc6aa" @@ -17,7 +16,9 @@ Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7" Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" EzXML = "8f5d6c58-4d21-5cfd-889c-e3ad7ee6a615" FLoops = "cc61a311-1640-44b5-9fba-1b764f453329" +GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" Graphs = "86223c79-3864-5bf0-83f7-82e725a168b6" +KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" NearestNeighbors = "b8a86587-4115-5ab1-83bc-aa920d37bbce" PeriodicTable = "7b2266bf-644c-5ea3-82d8-af4bbd25a884" @@ -32,6 +33,7 @@ UnitfulAtomic = "a7773ee8-282e-5fa2-be4e-bd808c38a91a" UnsafeAtomicsLLVM = "d80eeb9a-aca5-4d75-85e5-170c8b632249" [weakdeps] +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" Colors = "5ae59095-9a9b-59fe-a467-6f913c188581" Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9" GLMakie = "e9467ef8-e4e7-5192-8a1a-b1aee30e663a" @@ -39,7 +41,9 @@ KernelDensity = "5ab0869b-81aa-558d-bb23-cbf5423bbe9b" PythonCall = "6099a3de-0909-46bc-b1f4-468b9a2dfc0d" [extensions] +MollyCUDAExt = "CUDA" MollyEnzymeExt = "Enzyme" +MollyCUDAEnzymeExt = ["CUDA", "Enzyme"] MollyGLMakieExt = ["GLMakie", "Colors"] MollyKernelDensityExt = 
"KernelDensity" MollyPythonCallExt = "PythonCall" @@ -61,7 +65,9 @@ Enzyme = "0.13.20" EzXML = "1" FLoops = "0.2" GLMakie = "0.8, 0.9, 0.10" +GPUArrays = "10" Graphs = "1.8" +KernelAbstractions = "0.9" KernelDensity = "0.5, 0.6" LinearAlgebra = "1.9" NearestNeighbors = "0.4" diff --git a/benchmark/benchmarks.jl b/benchmark/benchmarks.jl index 08e6c5b4a..e3974c07c 100644 --- a/benchmark/benchmarks.jl +++ b/benchmark/benchmarks.jl @@ -62,7 +62,8 @@ const starting_velocities = [random_velocity(atom_mass, 1.0u"K") for i in 1:n_at const starting_coords_f32 = [Float32.(c) for c in starting_coords] const starting_velocities_f32 = [Float32.(c) for c in starting_velocities] -function test_sim(nl::Bool, parallel::Bool, f32::Bool, gpu::Bool) +function test_sim(nl::Bool, parallel::Bool, f32::Bool, + array_type::Type{AT}) where AT <: AbstractArray n_atoms = 400 n_steps = 200 atom_mass = f32 ? 10.0f0u"g/mol" : 10.0u"g/mol" @@ -72,9 +73,9 @@ function test_sim(nl::Bool, parallel::Bool, f32::Bool, gpu::Bool) r0 = f32 ? 0.2f0u"nm" : 0.2u"nm" bonds = [HarmonicBond(k=k, r0=r0) for i in 1:(n_atoms ÷ 2)] specific_inter_lists = (InteractionList2Atoms( - gpu ? CuArray(Int32.(collect(1:2:n_atoms))) : Int32.(collect(1:2:n_atoms)), - gpu ? CuArray(Int32.(collect(2:2:n_atoms))) : Int32.(collect(2:2:n_atoms)), - gpu ? CuArray(bonds) : bonds, + array_type(Int32.(collect(1:2:n_atoms))), + array_type(Int32.(collect(2:2:n_atoms))), + array_type(bonds), ),) neighbor_finder = NoNeighborFinder() @@ -82,24 +83,17 @@ function test_sim(nl::Bool, parallel::Bool, f32::Bool, gpu::Bool) pairwise_inters = (LennardJones(use_neighbors=false, cutoff=cutoff),) if nl neighbor_finder = DistanceNeighborFinder( - eligible=gpu ? CuArray(trues(n_atoms, n_atoms)) : trues(n_atoms, n_atoms), + eligible=array_type(trues(n_atoms, n_atoms)), n_steps=10, dist_cutoff=f32 ? 1.5f0u"nm" : 1.5u"nm", ) pairwise_inters = (LennardJones(use_neighbors=true, cutoff=cutoff),) end - if gpu - coords = CuArray(copy(f32 ? starting_coords_f32 : starting_coords)) - velocities = CuArray(copy(f32 ? starting_velocities_f32 : starting_velocities)) - atoms = CuArray([Atom(mass=atom_mass, charge=f32 ? 0.0f0 : 0.0, σ=f32 ? 0.2f0u"nm" : 0.2u"nm", - ϵ=f32 ? 0.2f0u"kJ * mol^-1" : 0.2u"kJ * mol^-1") for i in 1:n_atoms]) - else - coords = copy(f32 ? starting_coords_f32 : starting_coords) - velocities = copy(f32 ? starting_velocities_f32 : starting_velocities) - atoms = [Atom(mass=atom_mass, charge=f32 ? 0.0f0 : 0.0, σ=f32 ? 0.2f0u"nm" : 0.2u"nm", - ϵ=f32 ? 0.2f0u"kJ * mol^-1" : 0.2u"kJ * mol^-1") for i in 1:n_atoms] - end + coords = array_type(deepcopy(f32 ? starting_coords_f32 : starting_coords)) + velocities = array_type(deepcopy(f32 ? starting_velocities_f32 : starting_velocities)) + atoms = array_type([Atom(charge=f32 ? 0.0f0 : 0.0, mass=atom_mass, σ=f32 ? 0.2f0u"nm" : 0.2u"nm", + ϵ=f32 ? 
0.2f0u"kJ * mol^-1" : 0.2u"kJ * mol^-1") for i in 1:n_atoms]) sys = System( atoms=atoms, @@ -117,22 +111,22 @@ function test_sim(nl::Bool, parallel::Bool, f32::Bool, gpu::Bool) end runs = [ - ("CPU" , [false, false, false, false]), - ("CPU f32" , [false, false, true , false]), - ("CPU NL" , [true , false, false, false]), - ("CPU f32 NL", [true , false, true , false]), + ("CPU" , [false, false, false, Array]), + ("CPU f32" , [false, false, true , Array]), + ("CPU NL" , [true , false, false, Array]), + ("CPU f32 NL", [true , false, true , Array]), ] if run_parallel_tests - push!(runs, ("CPU parallel" , [false, true , false, false])) - push!(runs, ("CPU parallel f32" , [false, true , true , false])) - push!(runs, ("CPU parallel NL" , [true , true , false, false])) - push!(runs, ("CPU parallel f32 NL", [true , true , true , false])) + push!(runs, ("CPU parallel" , [false, true , false, Array])) + push!(runs, ("CPU parallel f32" , [false, true , true , Array])) + push!(runs, ("CPU parallel NL" , [true , true , false, Array])) + push!(runs, ("CPU parallel f32 NL", [true , true , true , Array])) end -if run_gpu_tests - push!(runs, ("GPU" , [false, false, false, true])) - push!(runs, ("GPU f32" , [false, false, true , true])) - push!(runs, ("GPU NL" , [true , false, false, true])) - push!(runs, ("GPU f32 NL", [true , false, true , true])) +if run_cuda_tests + push!(runs, ("GPU" , [false, false, false, CuArray])) + push!(runs, ("GPU f32" , [false, false, true , CuArray])) + push!(runs, ("GPU NL" , [true , false, false, CuArray])) + push!(runs, ("GPU f32 NL", [true , false, true , CuArray])) end for (name, args) in runs diff --git a/benchmark/protein.jl b/benchmark/protein.jl index 30f512c07..131d77917 100644 --- a/benchmark/protein.jl +++ b/benchmark/protein.jl @@ -11,7 +11,7 @@ const data_dir = normpath(dirname(pathof(Molly)), "..", "data") const ff_dir = joinpath(data_dir, "force_fields") const openmm_dir = joinpath(data_dir, "openmm_6mrr") -function setup_system(gpu::Bool, f32::Bool, units::Bool) +function setup_system(array_type::AbstractArray, f32::Bool, units::Bool) T = f32 ? Float32 : Float64 ff = MolecularForceField( T, @@ -27,7 +27,7 @@ function setup_system(gpu::Bool, f32::Bool, units::Bool) sys = System( joinpath(data_dir, "6mrr_equil.pdb"), ff; - velocities=gpu ? CuArray(velocities) : velocities, + velocities=array_type(velocities), units=units, gpu=gpu, dist_cutoff=(units ? 
dist_cutoff * u"nm" : dist_cutoff), @@ -42,15 +42,15 @@ end runs = [ # run_name gpu parr f32 units - ("CPU 1 thread" , false, false, false, true ), - ("CPU 1 thread f32" , false, false, true , true ), - ("CPU 1 thread f32 nounits" , false, false, true , false), - ("CPU $n_threads threads" , false, true , false, true ), - ("CPU $n_threads threads f32" , false, true , true , true ), - ("CPU $n_threads threads f32 nounits", false, true , true , false), - ("GPU" , true , false, false, true ), - ("GPU f32" , true , false, true , true ), - ("GPU f32 nounits" , true , false, true , false), + ("CPU 1 thread" , Array, false, false, true ), + ("CPU 1 thread f32" , Array, false, true , true ), + ("CPU 1 thread f32 nounits" , Array, false, true , false), + ("CPU $n_threads threads" , Array, true , false, true ), + ("CPU $n_threads threads f32" , Array, true , true , true ), + ("CPU $n_threads threads f32 nounits", Array, true , true , false), + ("GPU" , CuArray, false, false, true ), + ("GPU f32" , CuArray, false, true , true ), + ("GPU f32 nounits" , CuArray, false, true , false), ] for (run_name, gpu, parallel, f32, units) in runs diff --git a/docs/src/documentation.md b/docs/src/documentation.md index 4cbe9a38b..45d2bf383 100644 --- a/docs/src/documentation.md +++ b/docs/src/documentation.md @@ -135,11 +135,21 @@ visualize(sys.loggers.coords, boundary, "sim_lj.mp4") ## GPU acceleration -To run simulations on the GPU you will need to have a CUDA-compatible device. -[CUDA.jl](https://github.com/JuliaGPU/CUDA.jl) is used to run on the device. +To run simulations on the GPU you will need to have a GPU available and then load the appropriate package: + +| Hardware Available | Necessary Package | Array Type | +| ------------------ | ----------------- | ---------- | +| Parallel CPU | none | Array | +| NVIDIA GPU | CUDA | CuArray | +| AMD GPU | AMDGPU | ROCArray | +| Intel GPU | oneAPI | oneArray | +| Apple Silicon | Metal | MtlArray | + +As an important note, Metal / Apple Silicon devices can only run with 32 bit precision, so be sure to use `Float32` (for example) where necessary. Simulation setup is similar to above, but with the coordinates, velocities and atoms moved to the GPU. This example also shows setting up a simulation to run with `Float32`, which gives better performance on GPUs. Of course, you will need to determine whether this level of numerical accuracy is appropriate in your case. +Here is an example script for an NVIDIA GPU using CUDA: ```julia using Molly using CUDA @@ -168,6 +178,7 @@ sys = System( simulate!(deepcopy(sys), simulator, 20) # Compile function simulate!(sys, simulator, 1_000) ``` +To use another GPU package, just swap out `CUDA` for your desired package and `CuArray` for your desired array type. The device to run on can be changed with `device!`, e.g. `device!(1)`. The GPU code path is currently designed to be compatible with differentiable simulation and runs slower than related software, but this is an active area of development. Nonetheless, GPU performance is significantly better than CPU performance and is good enough for many applications. diff --git a/ext/MollyCUDAEnzymeExt.jl b/ext/MollyCUDAEnzymeExt.jl new file mode 100644 index 000000000..c88ebd144 --- /dev/null +++ b/ext/MollyCUDAEnzymeExt.jl @@ -0,0 +1,13 @@ +module MollyCUDAEnzymeExt + +using Molly +using CUDA +using Enzyme + +ext = Base.get_extension(Molly,:MollyCUDAExt) + +EnzymeRules.inactive(::typeof(ext.cuda_threads_blocks_pairwise), args...) 
= nothing +EnzymeRules.inactive(::typeof(ext.cuda_threads_blocks_specific), args...) = nothing + + +end diff --git a/ext/MollyCUDAExt.jl b/ext/MollyCUDAExt.jl new file mode 100644 index 000000000..0adc59795 --- /dev/null +++ b/ext/MollyCUDAExt.jl @@ -0,0 +1,1235 @@ +module MollyCUDAExt + +using Molly +using CUDA +using Atomix +using KernelAbstractions + +CUDA.Const(nl::Molly.NoNeighborList) = nl + +# CUDA.jl kernels +const WARPSIZE = UInt32(32) + +macro shfl_multiple_sync(mask, target, width, vars...) + all_lines = map(vars) do v + Expr(:(=), v, + Expr(:call, :shfl_sync, + mask, v, target, width + ) + ) + end + return esc(Expr(:block, all_lines...)) +end + +CUDA.shfl_recurse(op, x::Quantity) = op(x.val) * unit(x) +CUDA.shfl_recurse(op, x::SVector{1, C}) where C = SVector{1, C}(op(x[1])) +CUDA.shfl_recurse(op, x::SVector{2, C}) where C = SVector{2, C}(op(x[1]), op(x[2])) +CUDA.shfl_recurse(op, x::SVector{3, C}) where C = SVector{3, C}(op(x[1]), op(x[2]), op(x[3])) + +function cuda_threads_blocks_pairwise(n_neighbors) + n_threads_gpu = min(n_neighbors, parse(Int, get(ENV, "MOLLY_GPUNTHREADS_PAIRWISE", "512"))) + n_blocks = cld(n_neighbors, n_threads_gpu) + return n_threads_gpu, n_blocks +end + +function cuda_threads_blocks_specific(n_inters) + n_threads_gpu = parse(Int, get(ENV, "MOLLY_GPUNTHREADS_SPECIFIC", "128")) + n_blocks = cld(n_inters, n_threads_gpu) + return n_threads_gpu, n_blocks +end + +function pairwise_force_gpu!(buffers, sys::System{D, AT, T}, pairwise_inters, nbs, step_n) where {D, AT <: CuArray, T} + if typeof(nbs) == NoNeighborList + kernel = @cuda launch=false pairwise_force_kernel_nonl!( + buffers.fs_mat, sys.coords, sys.velocities, sys.atoms, sys.boundary, pairwise_inters, step_n, + Val(D), Val(sys.force_units)) + conf = launch_configuration(kernel.fun) + threads_basic = parse(Int, get(ENV, "MOLLY_GPUNTHREADS_PAIRWISE", "512")) + nthreads = min(length(sys.atoms), threads_basic, conf.threads) + nthreads = cld(nthreads, WARPSIZE) * WARPSIZE + n_blocks_i = cld(length(sys.atoms), WARPSIZE) + n_blocks_j = cld(length(sys.atoms), nthreads) + kernel(buffers.fs_mat, sys.coords, sys.velocities, sys.atoms, sys.boundary, pairwise_inters, step_n, Val(D), + Val(sys.force_units); threads=nthreads, blocks=(n_blocks_i, n_blocks_j)) + else + N = length(sys.coords) + n_blocks = cld(N, WARPSIZE) + r_cut = sys.neighbor_finder.dist_cutoff + if step_n % sys.neighbor_finder.n_steps_reorder == 0 || !sys.neighbor_finder.initialized + Morton_bits = 4 + w = r_cut - typeof(ustrip(r_cut))(0.1) * unit(r_cut) + Morton_seq_cpu = sorted_Morton_seq(Array(sys.coords), w, Morton_bits) + copyto!(buffers.Morton_seq, Morton_seq_cpu) + CUDA.@sync @cuda blocks=(cld(N, WARPSIZE),) threads=(32,) kernel_min_max!(buffers.Morton_seq, buffers.box_mins, buffers.box_maxs, sys.coords, Val(N), sys.boundary, Val(D)) + sys.neighbor_finder.initialized = true + CUDA.@sync @cuda blocks=(n_blocks, n_blocks) threads=(32, 1) always_inline=true compress_boolean_matrices!(buffers.Morton_seq, + sys.neighbor_finder.eligible, sys.neighbor_finder.special, buffers.compressed_eligible, buffers.compressed_special, Val(N)) + end + CUDA.@sync @cuda blocks=(n_blocks, n_blocks) threads=(32, 1) always_inline=true force_kernel!(buffers.Morton_seq, buffers.fs_mat, buffers.box_mins, buffers.box_maxs, sys.coords, sys.velocities, sys.atoms, Val(N), r_cut, Val(sys.force_units), pairwise_inters, sys.boundary, step_n, buffers.compressed_special, buffers.compressed_eligible, Val(T), Val(D)) + end + return buffers +end + +function 
pairwise_pe_gpu!(pe_vec_nounits, buffers, sys::System{D, AT, T}, pairwise_inters, nbs, step_n) where {D, AT <: CuArray, T} + if typeof(nbs) == NoNeighborList + n_threads_gpu, n_blocks = cuda_threads_blocks_pairwise(length(nbs)) + CUDA.@sync @cuda threads=n_threads_gpu blocks=n_blocks pairwise_pe_kernel!( + pe_vec_nounits, sys.coords, sys.velocities, sys.atoms, sys.boundary, pairwise_inters, nbs, step_n, Val(sys.energy_units)) + else + N = length(sys.coords) + n_blocks = cld(N, WARPSIZE) + r_cut = sys.neighbor_finder.dist_cutoff + Morton_bits = 4 + w = r_cut - typeof(ustrip(r_cut))(0.1) * unit(r_cut) + Morton_seq_cpu = sorted_Morton_seq(Array(sys.coords), w, Morton_bits) + copyto!(buffers.Morton_seq, Morton_seq_cpu) + CUDA.@sync @cuda blocks=(cld(N, WARPSIZE),) threads=(32,) kernel_min_max!(buffers.Morton_seq, buffers.box_mins, buffers.box_maxs, sys.coords, Val(N), sys.boundary, Val(D)) + sys.neighbor_finder.initialized = true + CUDA.@sync @cuda blocks=(n_blocks, n_blocks) threads=(32, 1) always_inline=true energy_kernel!(buffers.Morton_seq, + pe_vec_nounits, buffers.box_mins, buffers.box_maxs, sys.coords, sys.velocities, sys.atoms, Val(N), r_cut, Val(sys.energy_units), + pairwise_inters, sys.boundary, step_n, sys.neighbor_finder.special, sys.neighbor_finder.eligible, Val(T), Val(D)) + end + return pe_vec_nounits +end + +function sorted_Morton_seq(positions, w, bits::Int) + N = length(positions) + D = length(positions[1]) + Morton_sequence = Vector{Int32}(undef, N) + for i in 1:N + scaled_coords = floor.(Int32, positions[i] ./ w) + Morton_sequence[i] = generalized_Morton_code(scaled_coords, bits, D) + end + sort = Int32.(sortperm(Morton_sequence)) + return sort +end + +function generalized_Morton_code(indices, bits::Int, D::Int) + code = 0 + for bit in 0:(bits-1) + for d in 1:D + code |= ((indices[d] >> bit) & 1) << (D * bit + (d - 1)) + end + end + return Int32(code) +end + +function boxes_dist(x1_min::D, x1_max::D, x2_min::D, x2_max::D, Lx::D) where D + + a = abs(vector_1D(x2_max, x1_min, Lx)) + b = abs(vector_1D(x1_max, x2_min, Lx)) + + return ifelse( + x1_min - x2_max <= zero(D) && x2_min - x1_max <= zero(D), + zero(D), + ifelse(a < b, a, b) + ) +end + +function kernel_min_max!( + sorted_seq, + mins::AbstractArray{C}, + maxs::AbstractArray{C}, + coords, + ::Val{n}, + boundary, + ::Val{D}) where {n, C, D} + + D32 = Int32(32) + a = Int32(1) + b = Int32(D) + r = Int32(n % D32) + i = threadIdx().x + (blockIdx().x - a) * blockDim().x + local_i = threadIdx().x + mins_smem = CuStaticSharedArray(C, (D32, b)) + maxs_smem = CuStaticSharedArray(C, (D32, b)) + r_smem = CuStaticSharedArray(C, (r, b)) + + if i <= n - r && local_i <= D32 + for k in a:b + s_i = sorted_seq[i] + mins_smem[local_i, k] = coords[s_i][k] + maxs_smem[local_i, k] = coords[s_i][k] + end + end + sync_threads() + if i <= n - r && local_i <= D32 + for p in a:Int32(log2(D32)) + for k in a:b + @inbounds begin + if local_i % Int32(2^p) == Int32(0) + if mins_smem[local_i, k] > mins_smem[local_i - Int32(2^(p - 1)), k] + mins_smem[local_i, k] = mins_smem[local_i - Int32(2^(p - 1)), k] + end + if maxs_smem[local_i, k] < maxs_smem[local_i - Int32(2^(p - 1)), k] + maxs_smem[local_i, k] = maxs_smem[local_i - Int32(2^(p - 1)), k] + end + end + end + end + end + if local_i == D32 + for k in a:b + mins[blockIdx().x, k] = mins_smem[local_i, k] + maxs[blockIdx().x, k] = maxs_smem[local_i, k] + end + end + + end + + # Since the remainder array is low-dimensional, we do the scan + if i > n - r && i <= n && local_i <= r + for k in a:b + 
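+            # copy this remainder atom's coordinates into shared memory; lane 1 scans them serially below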
r_smem[local_i, k] = coords[sorted_seq[i]][k] + end + end + xyz_min = CuStaticSharedArray(C, b) + xyz_max = CuStaticSharedArray(C, b) + for k in a:b + xyz_min[k] = 10 * boundary.side_lengths[k] # very large (arbitrary) value + xyz_max[k] = -10 * boundary.side_lengths[k] + end + if local_i == a + for j in a:r + @inbounds begin + for k in a:b + if r_smem[j, k] < xyz_min[k] + xyz_min[k] = r_smem[j, k] + end + if r_smem[j, k] > xyz_max[k] + xyz_max[k] = r_smem[j, k] + end + end + end + end + if blockIdx().x == Int32(ceil(n/D32)) && r != Int32(0) + for k in a:b + mins[blockIdx().x, k] = xyz_min[k] + maxs[blockIdx().x, k] = xyz_max[k] + end + end + end + + return nothing +end + +function compress_boolean_matrices!(sorted_seq, eligible_matrix, special_matrix, compressed_eligible, compressed_special, ::Val{N}) where N + + a = Int32(1) + n_blocks = Int32(ceil(N / 32)) + r = Int32((N - 1) % 32 + 1) + i = blockIdx().x + j = blockIdx().y + i_0_tile = (i - a) * warpsize() + j_0_tile = (j - a) * warpsize() + index_i = i_0_tile + laneid() + index_j = j_0_tile + laneid() + + if j < n_blocks && i <= j + s_idx_i = sorted_seq[index_i] + eligible_bitmask = UInt32(0) + special_bitmask = UInt32(0) + for m in a:warpsize() + s_idx_j = sorted_seq[j_0_tile + m] + eligible_bitmask = (eligible_bitmask << 1) | UInt32(eligible_matrix[s_idx_i, s_idx_j]) + special_bitmask = (special_bitmask << 1) | UInt32(special_matrix[s_idx_i, s_idx_j]) + end + compressed_eligible[laneid(), i, j] = eligible_bitmask + compressed_special[laneid(), i, j] = special_bitmask + end + + if j == n_blocks && i < j + s_idx_i = sorted_seq[index_i] + eligible_bitmask = UInt32(0) + special_bitmask = UInt32(0) + for m in a:r + s_idx_j = sorted_seq[j_0_tile + m] + eligible_bitmask = (eligible_bitmask << 1) | UInt32(eligible_matrix[s_idx_i, s_idx_j]) + special_bitmask = (special_bitmask << 1) | UInt32(special_matrix[s_idx_i, s_idx_j]) + end + eligible_bitmask = (eligible_bitmask >> r) | (eligible_bitmask << (warpsize() - r)) + special_bitmask = (special_bitmask >> r) | (special_bitmask << (warpsize() - r)) + compressed_eligible[laneid(), i, j] = eligible_bitmask + compressed_special[laneid(), i, j] = special_bitmask + end + + if j == n_blocks && i == j && laneid() <= r + s_idx_i = sorted_seq[index_i] + eligible_bitmask = UInt32(0) + special_bitmask = UInt32(0) + for m in a:r + s_idx_j = sorted_seq[j_0_tile + m] + eligible_bitmask = (eligible_bitmask << 1) | UInt32(eligible_matrix[s_idx_i, s_idx_j]) + special_bitmask = (special_bitmask << 1) | UInt32(special_matrix[s_idx_i, s_idx_j]) + end + eligible_bitmask = (eligible_bitmask >> r) | (eligible_bitmask << (warpsize() - r)) + special_bitmask = (special_bitmask >> r) | (special_bitmask << (warpsize() - r)) + compressed_eligible[laneid(), i, j] = eligible_bitmask + compressed_special[laneid(), i, j] = special_bitmask + end + return nothing +end + + +#= +**The No-neighborlist pairwise force summation kernel (algorithm by Eastman, see https://onlinelibrary.wiley.com/doi/full/10.1002/jcc.21413)**: +1. Case j < n_blocks && i < j, i.e., `WARPSIZE`×`WARPSIZE` tiles: For such tiles each row is assiged to a different thread in a warp which calculates the +forces for the entire row in `WARPSIZE` steps. This is done such that some data can be shuffled from `i+1`'th thread to `i`'th thread in each +subsequent iteration of the force calculation in a row. 
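+In other words, each lane loads one `j` atom of the tile into registers, computes the interaction with its own `i` atom, and then receives the next
+`j` atom from the neighbouring lane via a register shuffle; after `WARPSIZE` rotations every lane has seen every `j` atom in the tile without touching
+global memory again, and at every step the lanes hold distinct `j` atoms, so the opposite (Newton's third law) forces accumulate into distinct rows of
+the shared `j` buffer. A rough per-lane sketch of the idea (illustrative only: `my_i`, `my_j`, `j_idx`, `pair_force` and the accumulators are
+placeholder names; the real kernel below shuffles the individual atom fields with `CUDA.shfl_sync` and accumulates into `force_smem` and
+`opposites_sum`):
+```
+for step in 1:WARPSIZE
+    f = pair_force(my_i, my_j)          # interaction of this lane's i atom with its current j atom
+    row_sum += f                        # accumulate the force on the i atom
+    col_sum[j_idx] -= f                 # and the opposite force on the j atom it currently holds
+    my_j  = shfl(my_j,  laneid() + 1)   # rotate: take the j data held by the next lane
+    j_idx = shfl(j_idx, laneid() + 1)
+end
+```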
If `a, b, ...` are different atoms and `1, 2, ...` are order in which each thread calculates +the interatomic forces, then we can represent this scenario as (considering `WARPSIZE=8`): +``` + × | i j k l m n o p + -------------------- + a | 1 2 3 4 5 6 7 8 + b | 8 1 2 3 4 5 6 7 + c | 7 8 1 2 3 4 5 6 + d | 6 7 8 1 2 3 4 5 + e | 5 6 7 8 1 2 3 4 + f | 4 5 6 7 8 1 2 3 + g | 3 4 5 6 7 8 1 2 + h | 2 3 4 5 6 7 8 1 +``` + +2. Cases j == n_blocks && i < n_blocks, i == j && i < n_blocks, i == n_blocks && j == n_blocks: In such cases, it is not possible to shuffle data generally +so there is no need to order calculations for each thread diagonally and it is also a bit more complicated to do so. +That's why the calculations are done in the following order: +``` + × | i j k l m n + ---------------- + a | 1 2 3 4 5 6 + b | 1 2 3 4 5 6 + c | 1 2 3 4 5 6 + d | 1 2 3 4 5 6 + e | 1 2 3 4 5 6 + f | 1 2 3 4 5 6 + g | 1 2 3 4 5 6 + h | 1 2 3 4 5 6 +``` +=# + +function force_kernel!( + sorted_seq, + forces_nounits, + mins::AbstractArray{C}, + maxs::AbstractArray{C}, + coords, + velocities, + atoms, + ::Val{N}, + r_cut, + ::Val{force_units}, + inters_tuple, + boundary, + step_n, + special_compressed, + eligible_compressed, + ::Val{T}, + ::Val{D}) where {N, C, force_units, T, D} + + a = Int32(1) + b = Int32(D) + n_blocks = Int32(ceil(N / 32)) + i = blockIdx().x + j = blockIdx().y + i_0_tile = (i - a) * warpsize() + j_0_tile = (j - a) * warpsize() + index_i = i_0_tile + laneid() + index_j = j_0_tile + laneid() + force_smem = CuStaticSharedArray(T, (32, 3)) + opposites_sum = CuStaticSharedArray(T, (32, 3)) + r = Int32((N - 1) % 32 + 1) + @inbounds for k in a:b + force_smem[laneid(), k] = zero(T) + opposites_sum[laneid(), k] = zero(T) + end + + # The code is organised in 4 mutually excluding parts + if j < n_blocks && i < j + d_block = zero(C) + dist_block = zero(C) * zero(C) + @inbounds for k in a:b + d_block = boxes_dist(mins[i, k], maxs[i, k], mins[j, k], maxs[j, k], boundary.side_lengths[k]) + dist_block += d_block * d_block + end + if dist_block <= r_cut * r_cut + s_idx_i = sorted_seq[index_i] + coords_i = coords[s_idx_i] + vel_i = velocities[s_idx_i] + atoms_i = atoms[s_idx_i] + d_pb = zero(C) + dist_pb = zero(C) * zero(C) + @inbounds for k in a:b + d_pb = boxes_dist(coords_i[k], coords_i[k], mins[j, k], maxs[j, k], boundary.side_lengths[k]) + dist_pb += d_pb * d_pb + end + + Bool_excl = dist_pb <= r_cut * r_cut + s_idx_j = sorted_seq[index_j] + coords_j = coords[s_idx_j] + vel_j = velocities[s_idx_j] + shuffle_idx = laneid() + atoms_j = atoms[s_idx_j] + atype_j = atoms_j.atom_type + aindex_j = atoms_j.index + amass_j = atoms_j.mass + acharge_j = atoms_j.charge + aσ_j = atoms_j.σ + aϵ_j = atoms_j.ϵ + eligible_bitmask = UInt32(0) + special_bitmask = UInt32(0) + eligible_bitmask = eligible_compressed[laneid(), i, j] + special_bitmask = special_compressed[laneid(), i, j] + + # Shuffle + for m in a:warpsize() + sync_warp() + coords_j = CUDA.shfl_sync(0xFFFFFFFF, coords_j, laneid() + a, warpsize()) + vel_j = CUDA.shfl_sync(0xFFFFFFFF, vel_j, laneid() + a, warpsize()) + shuffle_idx = CUDA.shfl_sync(0xFFFFFFFF, shuffle_idx, laneid() + a, warpsize()) + atype_j = CUDA.shfl_sync(0xFFFFFFFF, atype_j, laneid() + a, warpsize()) + aindex_j = CUDA.shfl_sync(0xFFFFFFFF, aindex_j, laneid() + a, warpsize()) + amass_j = CUDA.shfl_sync(0xFFFFFFFF, amass_j, laneid() + a, warpsize()) + acharge_j = CUDA.shfl_sync(0xFFFFFFFF, acharge_j, laneid() + a, warpsize()) + aσ_j = CUDA.shfl_sync(0xFFFFFFFF, aσ_j, laneid() + a, warpsize()) + 
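+                # ϵ is shuffled below in the same way; after these shuffles the lane holds every field of the next j atom in the tile and rebuilds it as an Atom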
aϵ_j = CUDA.shfl_sync(0xFFFFFFFF, aϵ_j, laneid() + a, warpsize()) + + atoms_j_shuffle = Atom(atype_j, aindex_j, amass_j, acharge_j, aσ_j, aϵ_j) + dr = vector(coords_j, coords_i, boundary) + r2 = sum(abs2, dr) + excl = (eligible_bitmask >> (warpsize() - shuffle_idx)) | (eligible_bitmask << shuffle_idx) + spec = (special_bitmask >> (warpsize() - shuffle_idx)) | (special_bitmask << shuffle_idx) + condition = (excl & 0x1) == true && r2 <= r_cut * r_cut + + f = condition ? sum_pairwise_forces( + inters_tuple, + atoms_i, atoms_j_shuffle, + Val(force_units), + (spec & 0x1) == true, + coords_i, coords_j, + boundary, + vel_i, vel_j, + step_n) : zero(SVector{D, T}) + + @inbounds for k in a:b + force_smem[laneid(), k] += ustrip(f[k]) + opposites_sum[shuffle_idx, k] -= ustrip(f[k]) + end + end + sync_threads() + @inbounds for k in a:b + CUDA.atomic_add!( + pointer(forces_nounits, s_idx_i * b - (b - k)), + -force_smem[laneid(), k] + ) + CUDA.atomic_add!( + pointer(forces_nounits, s_idx_j * b - (b - k)), + -opposites_sum[laneid(), k] + ) + end + end + end + + if j == n_blocks && i < n_blocks + d_block = zero(C) + dist_block = zero(C) * zero(C) + @inbounds for k in a:b + d_block = boxes_dist(mins[i, k], maxs[i, k], mins[j, k], maxs[j, k], boundary.side_lengths[k]) + dist_block += d_block * d_block + end + + if dist_block <= r_cut * r_cut + s_idx_i = sorted_seq[index_i] + coords_i = coords[s_idx_i] + vel_i = velocities[s_idx_i] + atoms_i = atoms[s_idx_i] + d_pb = zero(C) + dist_pb = zero(C) * zero(C) + @inbounds for k in a:b + d_pb = boxes_dist(coords_i[k], coords_i[k], mins[j, k], maxs[j, k], boundary.side_lengths[k]) + dist_pb += d_pb * d_pb + end + Bool_excl = dist_pb <= r_cut * r_cut + eligible_bitmask = UInt32(0) + special_bitmask = UInt32(0) + eligible_bitmask = eligible_compressed[laneid(), i, j] + special_bitmask = special_compressed[laneid(), i, j] + + for m in a:r + s_idx_j = sorted_seq[j_0_tile + m] + coords_j = coords[s_idx_j] + vel_j = velocities[s_idx_j] + atoms_j = atoms[s_idx_j] + dr = vector(coords_j, coords_i, boundary) + r2 = sum(abs2, dr) + excl = (eligible_bitmask >> (warpsize() - m)) | (eligible_bitmask << m) + spec = (special_bitmask >> (warpsize() - m)) | (special_bitmask << m) + condition = (excl & 0x1) == true && r2 <= r_cut * r_cut + + f = condition ? 
sum_pairwise_forces( + inters_tuple, + atoms_i, atoms_j, + Val(force_units), + (spec & 0x1) == true, + coords_i, coords_j, + boundary, + vel_i, vel_j, + step_n) : zero(SVector{D, T}) + + @inbounds for k in a:b + force_smem[laneid(), k] += ustrip(f[k]) + CUDA.atomic_add!( + pointer(forces_nounits, s_idx_j * b - (b - k)), + ustrip(f[k]) + ) + end + end + + # Sum contributions of the r-block to the other standard blocks + @inbounds for k in a:b + CUDA.atomic_add!( + pointer(forces_nounits, s_idx_i * b - (b - k)), + -force_smem[laneid(), k] + ) + end + end + end + + if i == j && i < n_blocks + s_idx_i = sorted_seq[index_i] + coords_i = coords[s_idx_i] + vel_i = velocities[s_idx_i] + atoms_i = atoms[s_idx_i] + eligible_bitmask = UInt32(0) + special_bitmask = UInt32(0) + eligible_bitmask = eligible_compressed[laneid(), i, j] + special_bitmask = special_compressed[laneid(), i, j] + + for m in (laneid() + a) : warpsize() + s_idx_j = sorted_seq[j_0_tile + m] + coords_j = coords[s_idx_j] + vel_j = velocities[s_idx_j] + atoms_j = atoms[s_idx_j] + dr = vector(coords_j, coords_i, boundary) + r2 = sum(abs2, dr) + excl = (eligible_bitmask >> (warpsize() - m)) | (eligible_bitmask << m) + spec = (special_bitmask >> (warpsize() - m)) | (special_bitmask << m) + condition = (excl & 0x1) == true && r2 <= r_cut * r_cut + + f = condition ? sum_pairwise_forces( + inters_tuple, + atoms_i, atoms_j, + Val(force_units), + (spec & 0x1) == true, + coords_i, coords_j, + boundary, + vel_i, vel_j, + step_n) : zero(SVector{D, T}) + + @inbounds for k in a:b + force_smem[laneid(), k] += ustrip(f[k]) + opposites_sum[m, k] -= ustrip(f[k]) + end + end + + @inbounds for k in a:b + # In this case i == j, so we can call atomic_add! only once + CUDA.atomic_add!( + pointer(forces_nounits, s_idx_i * b - (b - k)), + -force_smem[laneid(), k] - opposites_sum[laneid(), k] + ) + end + end + + if i == n_blocks && j == n_blocks + if laneid() <= r + s_idx_i = sorted_seq[index_i] + coords_i = coords[s_idx_i] + vel_i = velocities[s_idx_i] + atoms_i = atoms[s_idx_i] + eligible_bitmask = UInt32(0) + special_bitmask = UInt32(0) + eligible_bitmask = eligible_compressed[laneid(), i, j] + special_bitmask = special_compressed[laneid(), i, j] + + for m in (laneid() + a) : r + s_idx_j = sorted_seq[j_0_tile + m] + coords_j = coords[s_idx_j] + vel_j = velocities[s_idx_j] + atoms_j = atoms[s_idx_j] + dr = vector(coords_j, coords_i, boundary) + r2 = sum(abs2, dr) + excl = (eligible_bitmask >> (warpsize() - m)) | (eligible_bitmask << m) + spec = (special_bitmask >> (warpsize() - m)) | (special_bitmask << m) + condition = (excl & 0x1) == true && r2 <= r_cut * r_cut + + f = condition ? 
sum_pairwise_forces( + inters_tuple, + atoms_i, atoms_j, + Val(force_units), + (spec & 0x1) == true, + coords_i, coords_j, + boundary, + vel_i, vel_j, + step_n) : zero(SVector{D, T}) + + @inbounds for k in a:b + force_smem[laneid(), k] += ustrip(f[k]) + opposites_sum[m, k] -= ustrip(f[k]) + end + end + @inbounds for k in a:b + CUDA.atomic_add!( + pointer(forces_nounits, s_idx_i * b - (b - k)), + -force_smem[laneid(), k] - opposites_sum[laneid(), k] + ) + end + end + end + + return nothing +end + + +function energy_kernel!( + sorted_seq, + energy_nounits, + mins::AbstractArray{C}, + maxs::AbstractArray{C}, + coords, + velocities, + atoms, + ::Val{N}, + r_cut, + ::Val{energy_units}, + inters_tuple, + boundary, + step_n, + special_matrix, + eligible_matrix, + ::Val{T}, + ::Val{D}) where {N, C, energy_units, T, D} + + a = Int32(1) + b = Int32(D) + n_blocks = Int32(ceil(N / 32)) + r = Int32((N - 1) % 32 + 1) + i = blockIdx().x + j = blockIdx().y + i_0_tile = (i - 1) * warpsize() + j_0_tile = (j - 1) * warpsize() + index_i = i_0_tile + laneid() + index_j = j_0_tile + laneid() + E_smem = CuStaticSharedArray(T, 32) + E_smem[laneid()] = zero(T) + eligible = CuStaticSharedArray(Bool, (32, 32)) + special = CuStaticSharedArray(Bool, (32, 32)) + + # The code is organised in 4 mutually excluding parts + if j < n_blocks && i < j + d_block = zero(C) + dist_block = zero(C) * zero(C) + @inbounds for k in a:b + d_block = boxes_dist(mins[i, k], maxs[i, k], mins[j, k], maxs[j, k], boundary.side_lengths[k]) + dist_block += d_block * d_block + end + if dist_block <= r_cut * r_cut + s_idx_i = sorted_seq[index_i] + coords_i = coords[s_idx_i] + vel_i = velocities[s_idx_i] + atoms_i = atoms[s_idx_i] + d_pb = zero(C) + dist_pb = zero(C) * zero(C) + @inbounds for k in a:b + d_pb = boxes_dist(coords_i[k], coords_i[k], mins[j, k], maxs[j, k], boundary.side_lengths[k]) + dist_pb += d_pb * d_pb + end + Bool_excl = dist_pb <= r_cut * r_cut + s_idx_j = sorted_seq[index_j] + coords_j = coords[s_idx_j] + vel_j = velocities[s_idx_j] + shuffle_idx = laneid() + atoms_j = atoms[s_idx_j] + atype_j = atoms_j.atom_type + aindex_j = atoms_j.index + amass_j = atoms_j.mass + acharge_j = atoms_j.charge + aσ_j = atoms_j.σ + aϵ_j = atoms_j.ϵ + @inbounds for m in a:warpsize() + eligible[laneid(), m] = eligible_matrix[s_idx_i, sorted_seq[j_0_tile + m]] + special[laneid(), m] = special_matrix[s_idx_i, sorted_seq[j_0_tile + m]] + end + + # Shuffle + for m in a:warpsize() + sync_warp() + coords_j = CUDA.shfl_sync(0xFFFFFFFF, coords_j, laneid() + a, warpsize()) + vel_j = CUDA.shfl_sync(0xFFFFFFFF, vel_j, laneid() + a, warpsize()) + s_idx_j = CUDA.shfl_sync(0xFFFFFFFF, s_idx_j, laneid() + a, warpsize()) + shuffle_idx = CUDA.shfl_sync(0xFFFFFFFF, shuffle_idx, laneid() + a, warpsize()) + atype_j = CUDA.shfl_sync(0xFFFFFFFF, atype_j, laneid() + a, warpsize()) + aindex_j = CUDA.shfl_sync(0xFFFFFFFF, aindex_j, laneid() + a, warpsize()) + amass_j = CUDA.shfl_sync(0xFFFFFFFF, amass_j, laneid() + a, warpsize()) + acharge_j = CUDA.shfl_sync(0xFFFFFFFF, acharge_j, laneid() + a, warpsize()) + aσ_j = CUDA.shfl_sync(0xFFFFFFFF, aσ_j, laneid() + a, warpsize()) + aϵ_j = CUDA.shfl_sync(0xFFFFFFFF, aϵ_j, laneid() + a, warpsize()) + + atoms_j_shuffle = Atom(atype_j, aindex_j, amass_j, acharge_j, aσ_j, aϵ_j) + dr = vector(coords_j, coords_i, boundary) + r2 = sum(abs2, dr) + condition = eligible[laneid(), shuffle_idx] && Bool_excl && r2 <= r_cut * r_cut + + pe = condition ? 
sum_pairwise_potentials( + inters_tuple, + atoms_i, atoms_j_shuffle, + Val(energy_units), + special[laneid(), shuffle_idx], + coords_i, coords_j, + boundary, + vel_i, vel_j, + step_n) : zero(SVector{1, T}) + + E_smem[laneid()] += ustrip(pe[1]) + end + end + end + + if j == n_blocks && i < n_blocks + d_block = zero(C) + dist_block = zero(C) * zero(C) + @inbounds for k in a:b + d_block = boxes_dist(mins[i, k], maxs[i, k], mins[j, k], maxs[j, k], boundary.side_lengths[k]) + dist_block += d_block * d_block + end + if dist_block <= r_cut * r_cut + s_idx_i = sorted_seq[index_i] + coords_i = coords[s_idx_i] + vel_i = velocities[s_idx_i] + atoms_i = atoms[s_idx_i] + d_pb = zero(C) + dist_pb = zero(C) * zero(C) + @inbounds for k in a:b + d_pb = boxes_dist(coords_i[k], coords_i[k], mins[j, k], maxs[j, k], boundary.side_lengths[k]) + dist_pb += d_pb * d_pb + end + Bool_excl = dist_pb <= r_cut * r_cut + @inbounds for m in a:r + s_idx_j = sorted_seq[j_0_tile + m] + eligible[laneid(), m] = eligible_matrix[s_idx_i, s_idx_j] + special[laneid(), m] = special_matrix[s_idx_i, s_idx_j] + end + + for m in a:r + s_idx_j = sorted_seq[j_0_tile + m] + coords_j = coords[s_idx_j] + vel_j = velocities[s_idx_j] + atoms_j = atoms[s_idx_j] + dr = vector(coords_j, coords_i, boundary) + r2 = sum(abs2, dr) + condition = eligible[laneid(), m] && Bool_excl && r2 <= r_cut * r_cut + + pe = condition ? sum_pairwise_potentials( + inters_tuple, + atoms_i, atoms_j, + Val(energy_units), + special[laneid(), m], + coords_i, coords_j, + boundary, + vel_i, vel_j, + step_n) : zero(SVector{1, T}) + + E_smem[laneid()] += ustrip(pe[1]) + end + end + end + + if i == j && i < n_blocks + s_idx_i = sorted_seq[index_i] + coords_i = coords[s_idx_i] + vel_i = velocities[s_idx_i] + atoms_i = atoms[s_idx_i] + @inbounds for m in a:warpsize() + s_idx_j = sorted_seq[j_0_tile + m] + eligible[laneid(), m] = eligible_matrix[s_idx_i, s_idx_j] + special[laneid(), m] = special_matrix[s_idx_i, s_idx_j] + end + @inbounds for m in (laneid() + a) : warpsize() + s_idx_j = sorted_seq[j_0_tile + m] + coords_j = coords[s_idx_j] + vel_j = velocities[s_idx_j] + atoms_j = atoms[s_idx_j] + dr = vector(coords_j, coords_i, boundary) + r2 = sum(abs2, dr) + condition = eligible[laneid(), m] && r2 <= r_cut * r_cut + + pe = condition ? sum_pairwise_potentials( + inters_tuple, + atoms_i, atoms_j, + Val(energy_units), + special[laneid(), m], + coords_i, coords_j, + boundary, + vel_i, vel_j, + step_n) : zero(SVector{1, T}) + + E_smem[laneid()] += ustrip(pe[1]) + end + end + + if i == n_blocks && j == n_blocks + if laneid() <= r + s_idx_i = sorted_seq[index_i] + coords_i = coords[s_idx_i] + vel_i = velocities[s_idx_i] + atoms_i = atoms[s_idx_i] + @inbounds for m in a:r + s_idx_j = sorted_seq[j_0_tile + m] + eligible[laneid(), m] = eligible_matrix[s_idx_i, s_idx_j] + special[laneid(), m] = special_matrix[s_idx_i, s_idx_j] + end + + @inbounds for m in (laneid() + a) : r + s_idx_j = sorted_seq[j_0_tile + m] + coords_j = coords[s_idx_j] + vel_j = velocities[s_idx_j] + atoms_j = atoms[s_idx_j] + dr = vector(coords_j, coords_i, boundary) + r2 = sum(abs2, dr) + condition = eligible[laneid(), m] && r2 <= r_cut * r_cut + + pe = condition ? 
sum_pairwise_potentials( + inters_tuple, + atoms_i, atoms_j, + Val(energy_units), + special[laneid(), m], + coords_i, coords_j, + boundary, + vel_i, vel_j, + step_n) : zero(SVector{1, T}) + + E_smem[laneid()] += ustrip(pe[1]) + end + end + end + + if threadIdx().x == a + sum_E = zero(T) + for k in a:warpsize() + sum_E += E_smem[k] + end + CUDA.atomic_add!(pointer(energy_nounits), sum_E) + end + return nothing +end + + + +function pairwise_force_kernel_nonl!(forces::AbstractArray{T}, coords_var, velocities_var, + atoms_var, boundary, inters, step_n, ::Val{D}, ::Val{F}) where {T, D, F} + coords = CUDA.Const(coords_var) + velocities = CUDA.Const(velocities_var) + atoms = CUDA.Const(atoms_var) + n_atoms = length(atoms) + + tidx = threadIdx().x + i_0_tile = (blockIdx().x - 1) * warpsize() + j_0_block = (blockIdx().y - 1) * blockDim().x + warpidx = cld(tidx, warpsize()) + j_0_tile = j_0_block + (warpidx - 1) * warpsize() + i = i_0_tile + laneid() + + forces_shmem = CuStaticSharedArray(T, (3, 1024)) + @inbounds for dim in 1:3 + forces_shmem[dim, tidx] = zero(T) + end + + if i_0_tile + warpsize() > n_atoms || j_0_tile + warpsize() > n_atoms + @inbounds if i <= n_atoms + njs = min(warpsize(), n_atoms - j_0_tile) + atom_i, coord_i, vel_i = atoms[i], coords[i], velocities[i] + for del_j in 1:njs + j = j_0_tile + del_j + if i != j + atom_j, coord_j, vel_j = atoms[j], coords[j], velocities[j] + f = sum_pairwise_forces(inters, atom_i, atom_j, Val(F), false, coord_i, coord_j, + boundary, vel_i, vel_j, step_n) + for dim in 1:D + forces_shmem[dim, tidx] += -ustrip(f[dim]) + end + end + end + + for dim in 1:D + Atomix.@atomic :monotonic forces[dim, i] += forces_shmem[dim, tidx] + end + end + else + j = j_0_tile + laneid() + tilesteps = warpsize() + if i_0_tile == j_0_tile # To not compute i-i forces + j = j_0_tile + laneid() % warpsize() + 1 + tilesteps -= 1 + end + + atom_i, coord_i, vel_i = atoms[i], coords[i], velocities[i] + coord_j, vel_j = coords[j], velocities[j] + @inbounds for _ in 1:tilesteps + sync_warp() + atom_j = atoms[j] + f = sum_pairwise_forces(inters, atom_i, atom_j, Val(F), false, coord_i, coord_j, + boundary, vel_i, vel_j, step_n) + for dim in 1:D + forces_shmem[dim, tidx] += -ustrip(f[dim]) + end + @shfl_multiple_sync(FULL_MASK, laneid() + 1, warpsize(), j, coord_j) + end + + @inbounds for dim in 1:D + Atomix.@atomic :monotonic forces[dim, i] += forces_shmem[dim, tidx] + end + end + + return nothing +end + +function pairwise_pe_kernel!(energy, coords_var, velocities_var, atoms_var, boundary, inters, + neighbors_var, step_n, ::Val{E}) where E + coords = CUDA.Const(coords_var) + velocities = CUDA.Const(velocities_var) + atoms = CUDA.Const(atoms_var) + neighbors = CUDA.Const(neighbors_var) + + inter_i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + + @inbounds if inter_i <= length(neighbors) + i, j, special = neighbors[inter_i] + coord_i, coord_j, vel_i, vel_j = coords[i], coords[j], velocities[i], velocities[j] + dr = vector(coord_i, coord_j, boundary) + pe = potential_energy_gpu(inters[1], dr, atoms[i], atoms[j], E, special, coord_i, coord_j, + boundary, vel_i, vel_j, step_n) + for inter in inters[2:end] + pe += potential_energy_gpu(inter, dr, atoms[i], atoms[j], E, special, coord_i, coord_j, + boundary, vel_i, vel_j, step_n) + end + if unit(pe) != E + error("wrong energy unit returned, was expecting $E but got $(unit(pe))") + end + Atomix.@atomic :monotonic energy[1] += ustrip(pe) + end + return nothing +end + +@inline function sum_pairwise_forces(inters, atom_i, atom_j, ::Val{F}, 
special, coord_i, coord_j, + boundary, vel_i, vel_j, step_n) where F + dr = vector(coord_i, coord_j, boundary) + f_tuple = ntuple(length(inters)) do inter_type_i + force_gpu(inters[inter_type_i], dr, atom_i, atom_j, F, special, coord_i, coord_j, boundary, + vel_i, vel_j, step_n) + end + f = sum(f_tuple) + if unit(f[1]) != F + # This triggers an error but it isn't printed + # See https://discourse.julialang.org/t/error-handling-in-cuda-kernels/79692 + # for how to throw a more meaningful error + error("wrong force unit returned, was expecting $F but got $(unit(f[1]))") + end + return f +end + +@inline function sum_pairwise_potentials(inters, atom_i, atom_j, ::Val{E}, special, coord_i, coord_j, + boundary, vel_i, vel_j, step_n) where E + dr = vector(coord_i, coord_j, boundary) + + pe_tuple = ntuple(length(inters)) do inter_type_i + SVector(potential_energy_gpu(inters[inter_type_i], dr, atom_i, atom_j, E, special, coord_i, coord_j, boundary, + vel_i, vel_j, step_n)) + # SVector was required to avoid a GPU error occurring with scalars (like the quantity returned by potential_energy_gpu) + end + pe = sum(pe_tuple) + if unit(pe[1]) != E + # This triggers an error but it isn't printed + # See https://discourse.julialang.org/t/error-handling-in-cuda-kernels/79692 + # for how to throw a more meaningful error + error("wrong force unit returned, was expecting $E but got $(unit(pe[1]))") + end + return pe +end + +function specific_force_gpu!(fs_mat, inter_list::InteractionList1Atoms, coords::AbstractArray{SVector{D, C}}, + velocities, atoms, boundary, step_n, force_units, ::Val{T}) where {D, C, T} + n_threads_gpu, n_blocks = cuda_threads_blocks_specific(length(inter_list)) + CUDA.@sync @cuda threads=n_threads_gpu blocks=n_blocks specific_force_1_atoms_kernel!(fs_mat, + coords, velocities, atoms, boundary, step_n, inter_list.is, inter_list.inters, + Val(D), Val(force_units)) + return fs_mat +end + +function specific_force_gpu!(fs_mat, inter_list::InteractionList2Atoms, coords::AbstractArray{SVector{D, C}}, + velocities, atoms, boundary, step_n, force_units, ::Val{T}) where {D, C, T} + n_threads_gpu, n_blocks = cuda_threads_blocks_specific(length(inter_list)) + CUDA.@sync @cuda threads=n_threads_gpu blocks=n_blocks specific_force_2_atoms_kernel!(fs_mat, + coords, velocities, atoms, boundary, step_n, inter_list.is, inter_list.js, + inter_list.inters, Val(D), Val(force_units)) + return fs_mat +end + +function specific_force_gpu!(fs_mat, inter_list::InteractionList3Atoms, coords::AbstractArray{SVector{D, C}}, + velocities, atoms, boundary, step_n, force_units, ::Val{T}) where {D, C, T} + n_threads_gpu, n_blocks = cuda_threads_blocks_specific(length(inter_list)) + CUDA.@sync @cuda threads=n_threads_gpu blocks=n_blocks specific_force_3_atoms_kernel!(fs_mat, + coords, velocities, atoms, boundary, step_n, inter_list.is, inter_list.js, + inter_list.ks, inter_list.inters, Val(D), Val(force_units)) + return fs_mat +end + +function specific_force_gpu!(fs_mat, inter_list::InteractionList4Atoms, coords::AbstractArray{SVector{D, C}}, + velocities, atoms, boundary, step_n, force_units, ::Val{T}) where {D, C, T} + n_threads_gpu, n_blocks = cuda_threads_blocks_specific(length(inter_list)) + CUDA.@sync @cuda threads=n_threads_gpu blocks=n_blocks specific_force_4_atoms_kernel!(fs_mat, + coords, velocities, atoms, boundary, step_n, inter_list.is, inter_list.js, + inter_list.ks, inter_list.ls, inter_list.inters, Val(D), Val(force_units)) + return fs_mat +end + +function specific_force_1_atoms_kernel!(forces, coords_var, 
velocities_var, atoms_var, boundary, + step_n, is_var, inters_var, ::Val{D}, ::Val{F}) where {D, F} + coords = CUDA.Const(coords_var) + velocities = CUDA.Const(velocities_var) + atoms = CUDA.Const(atoms_var) + is = CUDA.Const(is_var) + inters = CUDA.Const(inters_var) + + inter_i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + + @inbounds if inter_i <= length(is) + i = is[inter_i] + fs = force_gpu(inters[inter_i], coords[i], boundary, atoms[i], F, velocities[i], step_n) + if unit(fs.f1[1]) != F + error("wrong force unit returned, was expecting $F") + end + for dim in 1:D + Atomix.@atomic :monotonic forces[dim, i] += ustrip(fs.f1[dim]) + end + end + return nothing +end + +function specific_force_2_atoms_kernel!(forces, coords_var, velocities_var, atoms_var, boundary, + step_n, is_var, js_var, inters_var, ::Val{D}, ::Val{F}) where {D, F} + coords = CUDA.Const(coords_var) + velocities = CUDA.Const(velocities_var) + atoms = CUDA.Const(atoms_var) + is = CUDA.Const(is_var) + js = CUDA.Const(js_var) + inters = CUDA.Const(inters_var) + + inter_i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + + @inbounds if inter_i <= length(is) + i, j = is[inter_i], js[inter_i] + fs = force_gpu(inters[inter_i], coords[i], coords[j], boundary, atoms[i], atoms[j], F, + velocities[i], velocities[j], step_n) + if unit(fs.f1[1]) != F || unit(fs.f2[1]) != F + error("wrong force unit returned, was expecting $F") + end + for dim in 1:D + Atomix.@atomic :monotonic forces[dim, i] += ustrip(fs.f1[dim]) + Atomix.@atomic :monotonic forces[dim, j] += ustrip(fs.f2[dim]) + end + end + return nothing +end + +function specific_force_3_atoms_kernel!(forces, coords_var, velocities_var, atoms_var, boundary, + step_n, is_var, js_var, ks_var, inters_var, ::Val{D}, ::Val{F}) where {D, F} + coords = CUDA.Const(coords_var) + velocities = CUDA.Const(velocities_var) + atoms = CUDA.Const(atoms_var) + is = CUDA.Const(is_var) + js = CUDA.Const(js_var) + ks = CUDA.Const(ks_var) + inters = CUDA.Const(inters_var) + + inter_i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + + @inbounds if inter_i <= length(is) + i, j, k = is[inter_i], js[inter_i], ks[inter_i] + fs = force_gpu(inters[inter_i], coords[i], coords[j], coords[k], boundary, atoms[i], + atoms[j], atoms[k], F, velocities[i], velocities[j], velocities[k], step_n) + if unit(fs.f1[1]) != F || unit(fs.f2[1]) != F || unit(fs.f3[1]) != F + error("wrong force unit returned, was expecting $F") + end + for dim in 1:D + Atomix.@atomic :monotonic forces[dim, i] += ustrip(fs.f1[dim]) + Atomix.@atomic :monotonic forces[dim, j] += ustrip(fs.f2[dim]) + Atomix.@atomic :monotonic forces[dim, k] += ustrip(fs.f3[dim]) + end + end + return nothing +end + +function specific_force_4_atoms_kernel!(forces, coords_var, velocities_var, atoms_var, boundary, + step_n, is_var, js_var, ks_var, ls_var, inters_var, + ::Val{D}, ::Val{F}) where {D, F} + coords = CUDA.Const(coords_var) + velocities = CUDA.Const(velocities_var) + atoms = CUDA.Const(atoms_var) + is = CUDA.Const(is_var) + js = CUDA.Const(js_var) + ks = CUDA.Const(ks_var) + ls = CUDA.Const(ls_var) + inters = CUDA.Const(inters_var) + + inter_i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + + @inbounds if inter_i <= length(is) + i, j, k, l = is[inter_i], js[inter_i], ks[inter_i], ls[inter_i] + fs = force_gpu(inters[inter_i], coords[i], coords[j], coords[k], coords[l], boundary, + atoms[i], atoms[j], atoms[k], atoms[l], F, velocities[i], velocities[j], + velocities[k], velocities[l], step_n) + if unit(fs.f1[1]) != F || unit(fs.f2[1]) != F 
|| unit(fs.f3[1]) != F || unit(fs.f4[1]) != F + error("wrong force unit returned, was expecting $F") + end + for dim in 1:D + Atomix.@atomic :monotonic forces[dim, i] += ustrip(fs.f1[dim]) + Atomix.@atomic :monotonic forces[dim, j] += ustrip(fs.f2[dim]) + Atomix.@atomic :monotonic forces[dim, k] += ustrip(fs.f3[dim]) + Atomix.@atomic :monotonic forces[dim, l] += ustrip(fs.f4[dim]) + end + end + return nothing +end + + +function specific_pe_gpu!(pe_vec_nounits, inter_list::InteractionList1Atoms, coords::AbstractArray{SVector{D, C}}, + velocities, atoms, boundary, step_n, energy_units, ::Val{T}) where {D, C, T} + n_threads_gpu, n_blocks = cuda_threads_blocks_specific(length(inter_list)) + CUDA.@sync @cuda threads=n_threads_gpu blocks=n_blocks specific_pe_1_atoms_kernel!( + pe_vec_nounits, coords, velocities, atoms, boundary, step_n, inter_list.is, + inter_list.inters, Val(energy_units)) + return pe_vec_nounits +end + +function specific_pe_gpu!(pe_vec_nounits, inter_list::InteractionList2Atoms, coords::AbstractArray{SVector{D, C}}, + velocities, atoms, boundary, step_n, energy_units, ::Val{T}) where {D, C, T} + n_threads_gpu, n_blocks = cuda_threads_blocks_specific(length(inter_list)) + CUDA.@sync @cuda threads=n_threads_gpu blocks=n_blocks specific_pe_2_atoms_kernel!( + pe_vec_nounits, coords, velocities, atoms, boundary, step_n, inter_list.is, + inter_list.js, inter_list.inters, Val(energy_units)) + return pe_vec_nounits +end + +function specific_pe_gpu!(pe_vec_nounits, inter_list::InteractionList3Atoms, coords::AbstractArray{SVector{D, C}}, + velocities, atoms, boundary, step_n, energy_units, ::Val{T}) where {D, C, T} + n_threads_gpu, n_blocks = cuda_threads_blocks_specific(length(inter_list)) + CUDA.@sync @cuda threads=n_threads_gpu blocks=n_blocks specific_pe_3_atoms_kernel!( + pe_vec_nounits, coords, velocities, atoms, boundary, step_n, inter_list.is, + inter_list.js, inter_list.ks, inter_list.inters, Val(energy_units)) + return pe_vec_nounits +end + +function specific_pe_gpu!(pe_vec_nounits, inter_list::InteractionList4Atoms, coords::AbstractArray{SVector{D, C}}, + velocities, atoms, boundary, step_n, energy_units, ::Val{T}) where {D, C, T} + n_threads_gpu, n_blocks = cuda_threads_blocks_specific(length(inter_list)) + CUDA.@sync @cuda threads=n_threads_gpu blocks=n_blocks specific_pe_4_atoms_kernel!( + pe_vec_nounits, coords, velocities, atoms, boundary, step_n, inter_list.is, + inter_list.js, inter_list.ks, inter_list.ls, inter_list.inters, Val(energy_units)) + return pe_vec_nounits +end + +function specific_pe_1_atoms_kernel!(energy, coords_var, velocities_var, atoms_var, boundary, + step_n, is_var, inters_var, ::Val{E}) where E + coords = CUDA.Const(coords_var) + velocities = CUDA.Const(velocities_var) + atoms = CUDA.Const(atoms_var) + is = CUDA.Const(is_var) + inters = CUDA.Const(inters_var) + + inter_i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + + @inbounds if inter_i <= length(is) + i = is[inter_i] + pe = potential_energy_gpu(inters[inter_i], coords[i], boundary, atoms[i], E, + velocities[i], step_n) + if unit(pe) != E + error("wrong energy unit returned, was expecting $E but got $(unit(pe))") + end + Atomix.@atomic :monotonic energy[1] += ustrip(pe) + end + return nothing +end + +function specific_pe_2_atoms_kernel!(energy, coords_var, velocities_var, atoms_var, boundary, + step_n, is_var, js_var, inters_var, ::Val{E}) where E + coords = CUDA.Const(coords_var) + velocities = CUDA.Const(velocities_var) + atoms = CUDA.Const(atoms_var) + is = CUDA.Const(is_var) + js = 
CUDA.Const(js_var) + inters = CUDA.Const(inters_var) + + inter_i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + + @inbounds if inter_i <= length(is) + i, j = is[inter_i], js[inter_i] + pe = potential_energy_gpu(inters[inter_i], coords[i], coords[j], boundary, atoms[i], + atoms[j], E, velocities[i], velocities[j], step_n) + if unit(pe) != E + error("wrong energy unit returned, was expecting $E but got $(unit(pe))") + end + Atomix.@atomic :monotonic energy[1] += ustrip(pe) + end + return nothing +end + +function specific_pe_3_atoms_kernel!(energy, coords_var, velocities_var, atoms_var, boundary, + step_n, is_var, js_var, ks_var, inters_var, ::Val{E}) where E + coords = CUDA.Const(coords_var) + velocities = CUDA.Const(velocities_var) + atoms = CUDA.Const(atoms_var) + is = CUDA.Const(is_var) + js = CUDA.Const(js_var) + ks = CUDA.Const(ks_var) + inters = CUDA.Const(inters_var) + + inter_i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + + @inbounds if inter_i <= length(is) + i, j, k = is[inter_i], js[inter_i], ks[inter_i] + pe = potential_energy_gpu(inters[inter_i], coords[i], coords[j], coords[k], boundary, + atoms[i], atoms[j], atoms[k], E, velocities[i], velocities[j], + velocities[k], step_n) + if unit(pe) != E + error("wrong energy unit returned, was expecting $E but got $(unit(pe))") + end + Atomix.@atomic :monotonic energy[1] += ustrip(pe) + end + return nothing +end + +function specific_pe_4_atoms_kernel!(energy, coords_var, velocities_var, atoms_var, boundary, + step_n, is_var, js_var, ks_var, ls_var, inters_var, ::Val{E}) where E + coords = CUDA.Const(coords_var) + velocities = CUDA.Const(velocities_var) + atoms = CUDA.Const(atoms_var) + is = CUDA.Const(is_var) + js = CUDA.Const(js_var) + ks = CUDA.Const(ks_var) + ls = CUDA.Const(ls_var) + inters = CUDA.Const(inters_var) + + inter_i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + + @inbounds if inter_i <= length(is) + i, j, k, l = is[inter_i], js[inter_i], ks[inter_i], ls[inter_i] + pe = potential_energy_gpu(inters[inter_i], coords[i], coords[j], coords[k], coords[l], + boundary, atoms[i], atoms[j], atoms[k], atoms[l], E, + velocities[i], velocities[j], velocities[k], velocities[l], + step_n) + if unit(pe) != E + error("wrong energy unit returned, was expecting $E but got $(unit(pe))") + end + Atomix.@atomic :monotonic energy[1] += ustrip(pe) + end + return nothing +end diff --git a/ext/MollyEnzymeExt.jl b/ext/MollyEnzymeExt.jl index 90e015390..26fd0e882 100644 --- a/ext/MollyEnzymeExt.jl +++ b/ext/MollyEnzymeExt.jl @@ -11,13 +11,10 @@ EnzymeRules.inactive(::typeof(Molly.n_infinite_dims), args...) = nothing EnzymeRules.inactive(::typeof(random_velocity), args...) = nothing EnzymeRules.inactive(::typeof(random_velocities), args...) = nothing EnzymeRules.inactive(::typeof(random_velocities!), args...) = nothing -EnzymeRules.inactive(::typeof(Molly.cuda_threads_blocks_pairwise), args...) = nothing -EnzymeRules.inactive(::typeof(Molly.cuda_threads_blocks_specific), args...) = nothing EnzymeRules.inactive(::typeof(Molly.check_force_units), args...) = nothing EnzymeRules.inactive(::typeof(Molly.check_energy_units), args...) = nothing EnzymeRules.inactive(::typeof(Molly.atoms_bonded_to_N), args...) = nothing EnzymeRules.inactive(::typeof(Molly.lookup_table), args...) = nothing -EnzymeRules.inactive(::typeof(Molly.cuda_threads_blocks_gbsa), args...) = nothing EnzymeRules.inactive(::typeof(find_neighbors), args...) 
= nothing EnzymeRules.inactive_type(::Type{DistanceNeighborFinder}) = nothing EnzymeRules.inactive(::typeof(visualize), args...) = nothing diff --git a/ext/MollyGLMakieExt.jl b/ext/MollyGLMakieExt.jl index 5509dddec..fa7a49096 100644 --- a/ext/MollyGLMakieExt.jl +++ b/ext/MollyGLMakieExt.jl @@ -6,8 +6,8 @@ module MollyGLMakieExt using Molly import AtomsBase using GLMakie -using Colors using Unitful +using Colors using LinearAlgebra diff --git a/ext/MollyPythonCallExt.jl b/ext/MollyPythonCallExt.jl index e1afaeb82..9d0a26bf0 100644 --- a/ext/MollyPythonCallExt.jl +++ b/ext/MollyPythonCallExt.jl @@ -6,7 +6,7 @@ module MollyPythonCallExt using Molly using PythonCall import AtomsCalculators -using CUDA +using GPUArrays using StaticArrays using Unitful @@ -91,7 +91,7 @@ end uconvert_vec(x...) = uconvert.(x...) -function AtomsCalculators.forces(sys::System{D, G, T}, +function AtomsCalculators.forces(sys::System{D, AT, T}, ase_calc::ASECalculator; kwargs...) where {D, G, T} update_ase_calc!(ase_calc, sys) @@ -105,7 +105,7 @@ function AtomsCalculators.forces(sys::System{D, G, T}, else fs_unit = uconvert_vec.(sys.force_units, fs * u"eV/Å") end - return G ? CuArray(fs_unit) : fs_unit + return AT <: AbstractGPUArray ? AT(fs_unit) : fs_unit end function AtomsCalculators.potential_energy(sys::System{D, G, T}, diff --git a/src/Molly.jl b/src/Molly.jl index 19664debc..08026b186 100644 --- a/src/Molly.jl +++ b/src/Molly.jl @@ -11,7 +11,8 @@ import BioStructures # Imported to avoid clashing names using CellListMap import Chemfiles using Combinatorics -using CUDA +using KernelAbstractions +using GPUArrays using DataStructures using Distances using Distributions @@ -34,7 +35,7 @@ include("types.jl") include("units.jl") include("spatial.jl") include("cutoffs.jl") -include("cuda.jl") +include("kernels.jl") include("force.jl") include("interactions/lennard_jones.jl") include("interactions/soft_sphere.jl") diff --git a/src/analysis.jl b/src/analysis.jl index 01429b5a2..1c69fa656 100644 --- a/src/analysis.jl +++ b/src/analysis.jl @@ -88,8 +88,7 @@ Calculate the hydrodynamic radius of a set of coordinates. """ function hydrodynamic_radius(coords::AbstractArray{SVector{D, T}}, boundary) where {D, T} n_atoms = length(coords) - diag_cpu = Diagonal(ones(T, n_atoms)) - diag = isa(coords, CuArray) ? 
CuArray(diag_cpu) : diag_cpu + diag = get_array_type(coords)(Diagonal(ones(T, n_atoms))) dists = distances(coords, boundary) .+ diag sum_inv_dists = sum(inv.(dists)) - sum(inv(diag)) inv_R_hyd = sum_inv_dists / (2 * n_atoms^2) diff --git a/src/coupling.jl b/src/coupling.jl index da30dfae1..c47cc4b99 100644 --- a/src/coupling.jl +++ b/src/coupling.jl @@ -58,7 +58,7 @@ struct AndersenThermostat{T, C} coupling_const::C end -function apply_coupling!(sys::System{D, false}, thermostat::AndersenThermostat, sim, +function apply_coupling!(sys::System{D}, thermostat::AndersenThermostat, sim, neighbors=nothing, step_n::Integer=0; n_threads::Integer=Threads.nthreads(), rng=Random.default_rng()) where D @@ -71,10 +71,10 @@ function apply_coupling!(sys::System{D, false}, thermostat::AndersenThermostat, return false end -function apply_coupling!(sys::System{D, true, T}, thermostat::AndersenThermostat, sim, +function apply_coupling!(sys::System{D, AT, T}, thermostat::AndersenThermostat, sim, neighbors=nothing, step_n::Integer=0; n_threads::Integer=Threads.nthreads(), - rng=Random.default_rng()) where {D, T} + rng=Random.default_rng()) where {D, AT <: AbstractGPUArray, T} atoms_to_bump = T.(rand(rng, length(sys)) .< (sim.dt / thermostat.coupling_const)) atoms_to_leave = one(T) .- atoms_to_bump atoms_to_bump_dev = move_array(atoms_to_bump, sys) diff --git a/src/energy.jl b/src/energy.jl index 6fdd265c2..7427cef2d 100644 --- a/src/energy.jl +++ b/src/energy.jl @@ -78,8 +78,8 @@ function potential_energy(sys; n_threads::Integer=Threads.nthreads()) return potential_energy(sys, find_neighbors(sys; n_threads=n_threads); n_threads=n_threads) end -function potential_energy(sys::System{D, false, T}, neighbors, step_n::Integer=0; - n_threads::Integer=Threads.nthreads()) where {D, T} +function potential_energy(sys::System{D, AT, T}, neighbors, step_n::Integer=0; + n_threads::Integer=Threads.nthreads()) where {D, AT, T} pairwise_inters_nonl = filter(!use_neighbors, values(sys.pairwise_inters)) pairwise_inters_nl = filter( use_neighbors, values(sys.pairwise_inters)) sils_1_atoms = filter(il -> il isa InteractionList1Atoms, values(sys.specific_inter_lists)) @@ -253,9 +253,9 @@ function specific_pe(atoms, coords, velocities, boundary, energy_units, sils_1_a return pe end -function potential_energy(sys::System{D, true, T}, neighbors, step_n::Integer=0; - n_threads::Integer=Threads.nthreads()) where {D, T} - pe_vec_nounits = CUDA.zeros(T, 1) +function potential_energy(sys::System{D, AT, T}, neighbors, step_n::Integer=0; + n_threads::Integer=Threads.nthreads()) where {D, AT <: AbstractGPUArray, T} + n_atoms = length(sys) val_ft = Val(T) buffers = init_forces_buffer!(sys, ustrip_vec.(zero(sys.coords)), 1) diff --git a/src/force.jl b/src/force.jl index 0b54db195..1edd12b03 100644 --- a/src/force.jl +++ b/src/force.jl @@ -132,16 +132,17 @@ struct ForcesBuffer{F, C, M, R} compressed_special::R end -function init_forces_buffer!(sys, forces_nounits::CuArray{SVector{D, T}}, n_threads) where {D, T} +function init_forces_buffer!(sys, forces_nounits::AbstractGPUArray{SVector{D, T}}, n_threads) where {D, T} N = length(forces_nounits) C = eltype(eltype(sys.coords)) n_blocks = cld(N, 32) - fs_mat = CUDA.zeros(T, D, N) - box_mins = CUDA.zeros(C, n_blocks, D) - box_maxs = CUDA.zeros(C, n_blocks, D) - Morton_seq = CUDA.zeros(Int32, N) - compressed_eligible = CUDA.zeros(UInt32, 32, n_blocks, n_blocks) - compressed_special = CUDA.zeros(UInt32, 32, n_blocks, n_blocks) + backend = get_backend(forces_nounits) + fs_mat = 
KernelAbstractions.zeros(backend, T, D, N) + box_mins = KernelAbstractions.zeros(backend, C, n_blocks, D) + box_maxs = KernelAbstractions.zeros(backend, C, n_blocks, D) + Morton_seq = KernelAbstractions.zeros(backend, Int32, N) + compressed_eligible = KernelAbstractions.zeros(backend, UInt32, 32, n_blocks, n_blocks) + compressed_special = KernelAbstractions.zeros(backend, UInt32, 32, n_blocks, n_blocks) if sys.neighbor_finder isa GPUNeighborFinder sys.neighbor_finder.initialized = false end @@ -165,8 +166,8 @@ function forces(sys, neighbors, step_n::Integer=0; n_threads::Integer=Threads.nt return forces_nounits .* sys.force_units end -function forces_nounits!(fs_nounits, sys::System{D, false}, neighbors, fs_chunks=nothing, - step_n::Integer=0; n_threads::Integer=Threads.nthreads()) where D +function forces_nounits!(fs_nounits, sys::System{D, AT}, neighbors, fs_chunks=nothing, + step_n::Integer=0; n_threads::Integer=Threads.nthreads()) where {D, AT <: AbstractArray} pairwise_inters_nonl = filter(!use_neighbors, values(sys.pairwise_inters)) pairwise_inters_nl = filter( use_neighbors, values(sys.pairwise_inters)) sils_1_atoms = filter(il -> il isa InteractionList1Atoms, values(sys.specific_inter_lists)) @@ -367,9 +368,9 @@ function specific_forces!(fs_nounits, atoms, coords, velocities, boundary, force return fs_nounits end -function forces_nounits!(fs_nounits, sys::System{D, true, T}, neighbors, +function forces_nounits!(fs_nounits, sys::System{D, AT, T}, neighbors, buffers, step_n::Integer=0; - n_threads::Integer=Threads.nthreads()) where {D, T} + n_threads::Integer=Threads.nthreads()) where {D, AT <: AbstractGPUArray, T} fill!(buffers.fs_mat, zero(T)) val_ft = Val(T) diff --git a/src/interactions/implicit_solvent.jl b/src/interactions/implicit_solvent.jl index 44e81da85..c05222722 100644 --- a/src/interactions/implicit_solvent.jl +++ b/src/interactions/implicit_solvent.jl @@ -411,10 +411,11 @@ function ImplicitSolventOBC(atoms::AbstractArray{Atom{TY, M, T, D, E}}, factor_solvent = zero(T(coulomb_const_units)) end - if isa(atoms, CuArray) - or = CuArray(offset_radii) - sor = CuArray(scaled_offset_radii) - is, js = CuArray(inds_i), CuArray(inds_j) + if isa(atoms, AbstractGPUArray) + array_type = get_array_type(atoms) + or = array_type(offset_radii) + sor = array_type(scaled_offset_radii) + is, js = array_type(inds_i), array_type(inds_j) else or = offset_radii sor = scaled_offset_radii @@ -563,12 +564,13 @@ function ImplicitSolventGBN2(atoms::AbstractArray{Atom{TY, M, T, D, E}}, factor_solvent = zero(T(coulomb_const_units)) end - if isa(atoms, CuArray) - or = CuArray(offset_radii) - sor = CuArray(scaled_offset_radii) - is, js = CuArray(inds_i), CuArray(inds_j) - d0s, m0s = CuArray(table_d0), CuArray(table_m0) - αs, βs, γs = CuArray(αs_cpu), CuArray(βs_cpu), CuArray(γs_cpu) + if isa(atoms, AbstractGPUArray) + array_type = get_array_type(atoms) + or = array_type(offset_radii) + sor = array_type(scaled_offset_radii) + is, js = array_type(inds_i), array_type(inds_j) + d0s, m0s = array_type(table_d0), array_type(table_m0) + αs, βs, γs = array_type(αs_cpu), array_type(βs_cpu), array_type(γs_cpu) else or = offset_radii sor = scaled_offset_radii @@ -694,7 +696,7 @@ function born_radii_and_grad(inter::ImplicitSolventOBC{T}, coords, boundary) whe return Bs, B_grads, I_grads end -function born_radii_and_grad(inter::ImplicitSolventOBC, coords::CuArray, boundary) +function born_radii_and_grad(inter::ImplicitSolventOBC, coords::AbstractGPUArray, boundary) coords_i = @view coords[inter.is] coords_j = @view 
coords[inter.js] loop_res = born_radii_loop_OBC.(coords_i, coords_j, inter.oris, inter.srjs, @@ -766,7 +768,7 @@ function born_radii_and_grad(inter::ImplicitSolventGBN2{T}, coords, boundary) wh return Bs, B_grads, I_grads end -function born_radii_and_grad(inter::ImplicitSolventGBN2{T}, coords::CuArray, boundary) where T +function born_radii_and_grad(inter::ImplicitSolventGBN2{T}, coords::AbstractGPUArray, boundary) where T Is, I_grads = gbsa_born_gpu(coords, inter.offset_radii, inter.scaled_offset_radii, inter.dist_cutoff, inter.offset, inter.neck_scale, inter.neck_cut, inter.d0s, inter.m0s, boundary, Val(T)) @@ -778,42 +780,41 @@ function born_radii_and_grad(inter::ImplicitSolventGBN2{T}, coords::CuArray, bou return Bs, B_grads, I_grads end -function cuda_threads_blocks_gbsa(n_inters) +function gpu_threads_blocks_gbsa(n_inters) n_threads_gpu = parse(Int, get(ENV, "MOLLY_GPUNTHREADS_IMPLICIT", "512")) - n_blocks = cld(n_inters, n_threads_gpu) - return n_threads_gpu, n_blocks + return n_threads_gpu end function gbsa_born_gpu(coords::AbstractArray{SVector{D, C}}, offset_radii, scaled_offset_radii, dist_cutoff, offset, neck_scale, neck_cut, d0s, m0s, boundary, ::Val{T}) where {D, C, T} + backend = get_backend(coords) n_atoms = length(coords) - Is_nounits = CUDA.zeros(T, n_atoms) - I_grads_nounits = CUDA.zeros(T, n_atoms, n_atoms) + Is_nounits = KernelAbstractions.zeros(backend, T, n_atoms) + I_grads_nounits = KernelAbstractions.zeros(backend, T, n_atoms, n_atoms) n_inters = n_atoms ^ 2 - n_threads_gpu, n_blocks = cuda_threads_blocks_gbsa(n_inters) + n_threads_gpu = gpu_threads_blocks_gbsa(n_inters) - CUDA.@sync @cuda threads=n_threads_gpu blocks=n_blocks gbsa_born_kernel!( - Is_nounits, I_grads_nounits, coords, offset_radii, scaled_offset_radii, - dist_cutoff, offset, neck_scale, neck_cut, d0s, m0s, boundary, Val(C)) + kernel! 
= gbsa_born_kernel!(backend, n_threads_gpu) + kernel!(Is_nounits, I_grads_nounits, coords, offset_radii, + scaled_offset_radii, dist_cutoff, offset, neck_scale, + neck_cut, d0s, m0s, boundary, Val(C), ndrange = n_inters) Is = Is_nounits * unit(dist_cutoff)^-1 I_grads = I_grads_nounits * unit(dist_cutoff)^-2 return Is, I_grads end -function gbsa_born_kernel!(Is, I_grads, coords_var, offset_radii_var, scaled_offset_radii_var, - dist_cutoff, offset, neck_scale, neck_cut, d0s_var, m0s_var, boundary, - ::Val{C}) where C - coords = CUDA.Const(coords_var) - offset_radii = CUDA.Const(offset_radii_var) - scaled_offset_radii = CUDA.Const(scaled_offset_radii_var) - d0s = CUDA.Const(d0s_var) - m0s = CUDA.Const(m0s_var) +@kernel function gbsa_born_kernel!(Is, I_grads, @Const(coords), + @Const(offset_radii), + @Const(scaled_offset_radii), + dist_cutoff, offset, neck_scale, neck_cut, + @Const(d0s), @Const(m0s), boundary, + ::Val{C}) where C n_atoms = length(coords) n_inters = n_atoms ^ 2 - inter_i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + inter_i = @index(Global, Linear) @inbounds if inter_i <= n_inters i = cld(inter_i, n_atoms) @@ -849,12 +850,11 @@ function gbsa_born_kernel!(Is, I_grads, coords_var, offset_radii_var, scaled_off numer = 2 * r_d0_strip + 9 * r_d0_strip^5 / 5 I_grad -= 10 * neck_scale * m0 * numer / (denom^2 * unit(dist_cutoff)) end - Atomix.@atomic :monotonic Is[i] += ustrip(unit(dist_cutoff)^-1, I) + Atomix.@atomic Is[i] += ustrip(unit(dist_cutoff)^-1, I) I_grads[i, j] += ustrip(unit(dist_cutoff)^-2, I_grad) end end end - return nothing end function gb_force_loop_1(coord_i, coord_j, i, j, charge_i, charge_j, Bi, Bj, dist_cutoff, @@ -948,8 +948,8 @@ function forces_gbsa(sys, inter, Bs, B_grads, I_grads, born_forces, atom_charges return fs end -function forces_gbsa(sys::System{D, true, T}, inter, Bs, B_grads, I_grads, born_forces, - atom_charges) where {D, T} +function forces_gbsa(sys::System{D, AT, T}, inter, Bs, B_grads, I_grads, born_forces, + atom_charges) where {D, AT <: AbstractGPUArray, T} fs_mat_1, born_forces_mod_ustrip = gbsa_force_1_gpu(sys.coords, sys.boundary, inter.dist_cutoff, inter.factor_solute, inter.factor_solvent, inter.kappa, Bs, atom_charges, sys.force_units) @@ -965,16 +965,17 @@ end function gbsa_force_1_gpu(coords::AbstractArray{SVector{D, C}}, boundary, dist_cutoff, factor_solute, factor_solvent, kappa, Bs, atom_charges::AbstractArray{T}, force_units) where {D, C, T} + backend = get_backend(coords) n_atoms = length(coords) - fs_mat = CUDA.zeros(T, D, n_atoms) - born_forces_mod_ustrip = CUDA.zeros(T, n_atoms) + fs_mat = KernelAbstractions.zeros(backend, T, D, n_atoms) + born_forces_mod_ustrip = KernelAbstractions.zeros(backend, T, n_atoms) n_inters = n_atoms_to_n_pairs(n_atoms) + n_atoms - n_threads_gpu, n_blocks = cuda_threads_blocks_gbsa(n_inters) + n_threads_gpu = gpu_threads_blocks_gbsa(n_inters) - CUDA.@sync @cuda threads=n_threads_gpu blocks=n_blocks gbsa_force_1_kernel!( - fs_mat, born_forces_mod_ustrip, coords, boundary, dist_cutoff, - factor_solute, factor_solvent, kappa, Bs, atom_charges, - Val(D), Val(force_units)) + kernel! 
= gbsa_force_1_kernel!(backend, n_threads_gpu) + kernel!(fs_mat, born_forces_mod_ustrip, coords, boundary, dist_cutoff, + factor_solute, factor_solvent, kappa, Bs, atom_charges, + Val(D), Val(force_units), ndrange = n_inters) return fs_mat, born_forces_mod_ustrip end @@ -982,29 +983,30 @@ end function gbsa_force_2_gpu(coords::AbstractArray{SVector{D, C}}, boundary, dist_cutoff, Bs, B_grads, I_grads, born_forces, offset_radii, scaled_offset_radii, force_units, ::Val{T}) where {D, C, T} + backend = get_backend(coords) n_atoms = length(coords) - fs_mat = CUDA.zeros(T, D, n_atoms) + fs_mat = KernelAbstractions.zeros(backend, T, D, n_atoms) n_inters = n_atoms ^ 2 - n_threads_gpu, n_blocks = cuda_threads_blocks_gbsa(n_inters) + n_threads_gpu = gpu_threads_blocks_gbsa(n_inters) - CUDA.@sync @cuda threads=n_threads_gpu blocks=n_blocks gbsa_force_2_kernel!( - fs_mat, born_forces, coords, boundary, dist_cutoff, offset_radii, - scaled_offset_radii, Bs, B_grads, I_grads, Val(D), Val(force_units)) + kernel! = gbsa_force_2_kernel!(backend, n_threads_gpu) + kernel!(fs_mat, born_forces, coords, boundary, dist_cutoff, offset_radii, + scaled_offset_radii, Bs, B_grads, I_grads, Val(D), Val(force_units), + ndrange = n_inters) return fs_mat end -function gbsa_force_1_kernel!(forces, born_forces_mod_ustrip, coords_var, boundary, dist_cutoff, - factor_solute, factor_solvent, kappa, Bs_var, atom_charges_var, - ::Val{D}, ::Val{F}) where {D, F} - coords = CUDA.Const(coords_var) - Bs = CUDA.Const(Bs_var) - atom_charges = CUDA.Const(atom_charges_var) +@kernel function gbsa_force_1_kernel!(forces, born_forces_mod_ustrip, + @Const(coords), boundary, dist_cutoff, + factor_solute, factor_solvent, kappa, + @Const(Bs), @Const(atom_charges), + ::Val{D}, ::Val{F}) where {D, F} n_atoms = length(coords) n_inters_not_self = n_atoms_to_n_pairs(n_atoms) n_inters = n_inters_not_self + n_atoms - inter_i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + inter_i = @index(Global, Linear) @inbounds if inter_i <= n_inters if inter_i <= n_inters_not_self @@ -1034,38 +1036,33 @@ function gbsa_force_1_kernel!(forces, born_forces_mod_ustrip, coords_var, bounda dGpol_dalpha2_ij = -Gpol * exp_term * (1 + D_term) / (2 * denominator2) change_born_force_i = dGpol_dalpha2_ij * Bj - Atomix.@atomic :monotonic born_forces_mod_ustrip[i] += ustrip(change_born_force_i) + Atomix.@atomic born_forces_mod_ustrip[i] += ustrip(change_born_force_i) if i != j change_born_force_j = dGpol_dalpha2_ij * Bi - Atomix.@atomic :monotonic born_forces_mod_ustrip[j] += ustrip(change_born_force_j) + Atomix.@atomic born_forces_mod_ustrip[j] += ustrip(change_born_force_j) fdr = dr * dGpol_dr if unit(fdr[1]) != F error("wrong force unit returned, was expecting $F but got $(unit(fdr[1]))") end for dim in 1:D fval = ustrip(fdr[dim]) - Atomix.@atomic :monotonic forces[dim, i] += fval - Atomix.@atomic :monotonic forces[dim, j] += -fval + Atomix.@atomic forces[dim, i] += fval + Atomix.@atomic forces[dim, j] += -fval end end end end - return nothing end -function gbsa_force_2_kernel!(forces, born_forces, coords_var, boundary, dist_cutoff, or_var, - sor_var, Bs_var, B_grads_var, I_grads_var, ::Val{D}, - ::Val{F}) where {D, F} - coords = CUDA.Const(coords_var) - or = CUDA.Const(or_var) - sor = CUDA.Const(sor_var) - Bs = CUDA.Const(Bs_var) - B_grads = CUDA.Const(B_grads_var) - I_grads = CUDA.Const(I_grads_var) +@kernel function gbsa_force_2_kernel!(forces, born_forces, @Const(coords), + boundary, dist_cutoff, @Const(or), + @Const(sor), @Const(Bs), + @Const(B_grads), 
@Const(I_grads), + ::Val{D}, ::Val{F}) where {D, F} n_atoms = length(coords) n_inters = n_atoms ^ 2 - inter_i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + inter_i = @index(Global, Linear) @inbounds if inter_i <= n_inters i = cld(inter_i, n_atoms) @@ -1091,14 +1088,13 @@ function gbsa_force_2_kernel!(forces, born_forces, coords_var, boundary, dist_cu end for dim in 1:D fval = ustrip(fdr[dim]) - Atomix.@atomic :monotonic forces[dim, i] += fval - Atomix.@atomic :monotonic forces[dim, j] += -fval + Atomix.@atomic forces[dim, i] += fval + Atomix.@atomic forces[dim, j] += -fval end end end end end - return nothing end function AtomsCalculators.forces(sys, inter::AbstractGBSA; kwargs...) @@ -1153,8 +1149,8 @@ function gb_energy_loop(coord_i, coord_j, i, j, charge_i, charge_j, Bi, Bj, ori, end end -function AtomsCalculators.potential_energy(sys::System{<:Any, false, T}, inter::AbstractGBSA; - kwargs...) where T +function AtomsCalculators.potential_energy(sys::System{<:Any, AT, T}, inter::AbstractGBSA; + kwargs...) where {AT, T} coords, boundary = sys.coords, sys.boundary Bs, B_grads, I_grads = born_radii_and_grad(inter, coords, boundary) atom_charges = charge.(sys.atoms) @@ -1173,7 +1169,7 @@ function AtomsCalculators.potential_energy(sys::System{<:Any, false, T}, inter:: return E end -function AtomsCalculators.potential_energy(sys::System{<:Any, true}, inter::AbstractGBSA; kwargs...) +function AtomsCalculators.potential_energy(sys::System{<:Any, AT}, inter::AbstractGBSA; kwargs...) where AT <: AbstractGPUArray coords, atoms, boundary = sys.coords, sys.atoms, sys.boundary Bs, B_grads, I_grads = born_radii_and_grad(inter, coords, boundary) diff --git a/src/kernels.jl b/src/kernels.jl new file mode 100644 index 000000000..1863ea74e --- /dev/null +++ b/src/kernels.jl @@ -0,0 +1,371 @@ +# KernelAbstractions.jl kernels + +function get_array_type(a::AT) where AT <: AbstractArray + return AT.name.wrapper +end + +@inline function sum_pairwise_forces(inters, atom_i, atom_j, ::Val{F}, special, coord_i, coord_j, + boundary, vel_i, vel_j, step_n) where F + dr = vector(coord_i, coord_j, boundary) + f_tuple = ntuple(length(inters)) do inter_type_i + force_gpu(inters[inter_type_i], dr, atom_i, atom_j, F, special, coord_i, coord_j, boundary, + vel_i, vel_j, step_n) + end + f = sum(f_tuple) + if unit(f[1]) != F + # This triggers an error but it isn't printed + # See https://discourse.julialang.org/t/error-handling-in-cuda-kernels/79692 + # for how to throw a more meaningful error + error("wrong force unit returned, was expecting $F but got $(unit(f[1]))") + end + return f +end + +function gpu_threads_pairwise(n_neighbors) + n_threads_gpu = parse(Int, get(ENV, "MOLLY_GPUNTHREADS_PAIRWISE", "512")) + return n_threads_gpu +end + +function gpu_threads_specific(n_inters) + n_threads_gpu = parse(Int, get(ENV, "MOLLY_GPUNTHREADS_SPECIFIC", "128")) + return n_threads_gpu +end + +function pairwise_force_gpu!(buffers, sys::System{D, AT, T}, + pairwise_inters, nbs, step_n) where {D, AT <: AbstractGPUArray, T} + backend = get_backend(coords) + if typeof(nbs) == NoNeighborList + n_threads_gpu = gpu_threads_pairwise(length(atoms)) + kernel! = pairwise_force_kernel_nonl!(backend, n_threads_gpu) + kernel!(buffers.fs_mat, sys.coords, sys.velocities, sys.atoms, sys.boundary, pairwise_inters, step_n, Val(D), Val(force_units); ndrange = length(atoms)) + else + n_threads_gpu = gpu_threads_pairwise(length(nbs)) + kernel! 
= pairwise_force_kernel_nl!(backend, n_threads_gpu) + kernel!(buffers.fs_mat, sys.coords, sys.velocities, sys.atoms, sys.boundary, pairwise_inters, + nbs, step_n, Val(D), Val(force_units); ndrange = length(nbs)) + end + return fs_mat +end + +@kernel function pairwise_force_kernel_nl!(forces, @Const(coords), + @Const(velocities), @Const(atoms), + boundary, inters, + @Const(neighbors), step_n, ::Val{D}, + ::Val{F}) where {D, F} + + inter_i = @index(Global, Linear) + + @inbounds if inter_i <= length(neighbors) + i, j, special = neighbors[inter_i] + f = sum_pairwise_forces(inters, atoms[i], atoms[j], Val(F), special, coords[i], coords[j], boundary, velocities[i], velocities[j], step_n) + for dim in 1:D + fval = ustrip(f[dim]) + Atomix.@atomic forces[dim, i] = forces[dim, i] - fval + Atomix.@atomic forces[dim, j] = forces[dim, j] + fval + end + end +end + +@kernel function pairwise_force_kernel_nonl!(forces, @Const(coords), + @Const(velocities), @Const(atoms), + boundary, inters, + step_n, ::Val{D}, + ::Val{F}) where {D, F} + + i = @index(Global, Linear) + + @inbounds for j = 1:i + if i != j + f = sum_pairwise_forces(inters, atoms[i], atoms[j], Val(F), false, coords[i], coords[j], boundary, velocities[i], velocities[j], step_n) + for dim in 1:D + fval = ustrip(f[dim]) + Atomix.@atomic forces[dim, i] = forces[dim, i] - fval + Atomix.@atomic forces[dim, j] = forces[dim, j] + fval + end + end + end +end + +function specific_force_gpu!(fs_mat, inter_list::InteractionList1Atoms, coords::AbstractArray{SVector{D, C}}, + velocities, atoms, boundary, step_n, force_units, ::Val{T}) where {D, C, T} + backend = get_backend(coords) + n_threads_gpu = gpu_threads_specific(length(inter_list)) + kernel! = specific_force_1_atoms_kernel!(backend, n_threads_gpu) + kernel!(fs_mat, coords, velocities, atoms, boundary, step_n, inter_list.is, + inter_list.inters, Val(D), Val(force_units); + ndrange = length(inter_list)) + return fs_mat +end + +function specific_force_gpu!(fs_mat, inter_list::InteractionList2Atoms, coords::AbstractArray{SVector{D, C}}, + velocities, atoms, boundary, step_n, force_units, ::Val{T}) where {D, C, T} + backend = get_backend(coords) + n_threads_gpu = gpu_threads_specific(length(inter_list)) + kernel! = specific_force_2_atoms_kernel!(backend, n_threads_gpu) + kernel!(fs_mat, coords, velocities, atoms, boundary, step_n, inter_list.is, + inter_list.js, inter_list.inters, Val(D), Val(force_units); + ndrange = length(inter_list)) + return fs_mat +end + +function specific_force_gpu!(fs_mat, inter_list::InteractionList3Atoms, coords::AbstractArray{SVector{D, C}}, + velocities, atoms, boundary, step_n, force_units, ::Val{T}) where {D, C, T} + backend = get_backend(coords) + n_threads_gpu = gpu_threads_specific(length(inter_list)) + kernel! = specific_force_3_atoms_kernel!(backend, n_threads_gpu) + kernel!(fs_mat, coords, velocities, atoms, boundary, step_n, inter_list.is, + inter_list.js, inter_list.ks, inter_list.inters, Val(D), + Val(force_units); ndrange = length(inter_list)) + return fs_mat +end + +function specific_force_gpu!(fs_mat, inter_list::InteractionList4Atoms, coords::AbstractArray{SVector{D, C}}, + velocities, atoms, boundary, step_n, force_units, ::Val{T}) where {D, C, T} + backend = get_backend(coords) + n_threads_gpu = gpu_threads_specific(length(inter_list)) + kernel! 
= specific_force_4_atoms_kernel!(backend, n_threads_gpu) + kernel!(fs_mat, coords, velocities, atoms, boundary, step_n, inter_list.is, + inter_list.js, inter_list.ks, inter_list.ls, inter_list.inters, + Val(D), Val(force_units); ndrange = length(inter_list)) + return fs_mat +end + +@kernel function specific_force_1_atoms_kernel!(forces, @Const(coords), + @Const(velocities), + @Const(atoms), boundary, + step_n, @Const(is), + @Const(inters), ::Val{D}, + ::Val{F}) where {D, F} + + inter_i = @index(Global, Linear) + + @inbounds if inter_i <= length(is) + i = is[inter_i] + fs = force_gpu(inters[inter_i], coords[i], boundary, atoms[i], F, velocities[i], step_n) + if unit(fs.f1[1]) != F + error("wrong force unit returned, was expecting $F") + end + for dim in 1:D + Atomix.@atomic forces[dim, i] += ustrip(fs.f1[dim]) + end + end +end + +@kernel function specific_force_2_atoms_kernel!(forces, @Const(coords), + @Const(velocities), + @Const(atoms), boundary, + step_n, @Const(is), @Const(js), + @Const(inters), ::Val{D}, + ::Val{F}) where {D, F} + + inter_i = @index(Global, Linear) + + @inbounds if inter_i <= length(is) + i, j = is[inter_i], js[inter_i] + fs = force_gpu(inters[inter_i], coords[i], coords[j], boundary, atoms[i], atoms[j], F, + velocities[i], velocities[j], step_n) + if unit(fs.f1[1]) != F || unit(fs.f2[1]) != F + error("wrong force unit returned, was expecting $F") + end + for dim in 1:D + Atomix.@atomic forces[dim, i] += ustrip(fs.f1[dim]) + Atomix.@atomic forces[dim, j] += ustrip(fs.f2[dim]) + end + end +end + +@kernel function specific_force_3_atoms_kernel!(forces, @Const(coords), + @Const(velocities), + @Const(atoms), boundary, + step_n, @Const(is), + @Const(js), @Const(ks), + @Const(inters), ::Val{D}, + ::Val{F}) where {D, F} + + inter_i = @index(Global, Linear) + + @inbounds if inter_i <= length(is) + i, j, k = is[inter_i], js[inter_i], ks[inter_i] + fs = force_gpu(inters[inter_i], coords[i], coords[j], coords[k], boundary, atoms[i], + atoms[j], atoms[k], F, velocities[i], velocities[j], velocities[k], step_n) + if unit(fs.f1[1]) != F || unit(fs.f2[1]) != F || unit(fs.f3[1]) != F + error("wrong force unit returned, was expecting $F") + end + for dim in 1:D + Atomix.@atomic forces[dim, i] += ustrip(fs.f1[dim]) + Atomix.@atomic forces[dim, j] += ustrip(fs.f2[dim]) + Atomix.@atomic forces[dim, k] += ustrip(fs.f3[dim]) + end + end +end + +@kernel function specific_force_4_atoms_kernel!(forces, @Const(coords), + @Const(velocities), + @Const(atoms), boundary, + step_n, @Const(is), + @Const(js), @Const(ks), + @Const(ls), + @Const(inters), ::Val{D}, + ::Val{F}) where {D, F} + + inter_i = @index(Global, Linear) + + @inbounds if inter_i <= length(is) + i, j, k, l = is[inter_i], js[inter_i], ks[inter_i], ls[inter_i] + fs = force_gpu(inters[inter_i], coords[i], coords[j], coords[k], coords[l], boundary, + atoms[i], atoms[j], atoms[k], atoms[l], F, velocities[i], velocities[j], + velocities[k], velocities[l], step_n) + if unit(fs.f1[1]) != F || unit(fs.f2[1]) != F || unit(fs.f3[1]) != F || unit(fs.f4[1]) != F + error("wrong force unit returned, was expecting $F") + end + for dim in 1:D + Atomix.@atomic forces[dim, i] += ustrip(fs.f1[dim]) + Atomix.@atomic forces[dim, j] += ustrip(fs.f2[dim]) + Atomix.@atomic forces[dim, k] += ustrip(fs.f3[dim]) + Atomix.@atomic forces[dim, l] += ustrip(fs.f4[dim]) + end + end +end + +function pairwise_pe_gpu!(pe_vec_nounits, coords::AbstractArray{SVector{D, C}}, velocities, atoms, boundary, + pairwise_inters, nbs, step_n, energy_units, ::Val{T}) where {D, C, 
T} + backend = get_backend(coords) + n_threads_gpu = gpu_threads_pairwise(length(nbs)) + kernel! = pairwise_pe_kernel!(backend, n_threads_gpu) + kernel!(pe_vec_nounits, coords, velocities, atoms, boundary, pairwise_inters, nbs, step_n, Val(energy_units); ndrange = length(nbs)) + return pe_vec_nounits +end + +@kernel function pairwise_pe_kernel!(energy, @Const(coords), @Const(velocities), + @Const(atoms), boundary, inters, + @Const(neighbors), step_n, ::Val{E}) where E + + inter_i = @index(Global, Linear) + + @inbounds if inter_i <= length(neighbors) + i, j, special = neighbors[inter_i] + coord_i, coord_j, vel_i, vel_j = coords[i], coords[j], velocities[i], velocities[j] + dr = vector(coord_i, coord_j, boundary) + pe = potential_energy_gpu(inters[1], dr, atoms[i], atoms[j], E, special, coord_i, coord_j, + boundary, vel_i, vel_j, step_n) + for inter in inters[2:end] + pe += potential_energy_gpu(inter, dr, atoms[i], atoms[j], E, special, coord_i, coord_j, + boundary, vel_i, vel_j, step_n) + end + if unit(pe) != E + error("wrong energy unit returned, was expecting $E but got $(unit(pe))") + end + Atomix.@atomic energy[1] += ustrip(pe) + end +end + +function specific_pe_gpu!(pe_vec_nounits, inter_list::InteractionList1Atoms, coords::AbstractArray{SVector{D, C}}, + velocities, atoms, boundary, step_n, energy_units, ::Val{T}) where {D, C, T} + backend = get_backend(coords) + n_threads_gpu = gpu_threads_specific(length(inter_list)) + kernel! = specific_pe_1_atoms_kernel!(backend, n_threads_gpu) + kernel!(pe_vec_nounits, coords, velocities, atoms, boundary, step_n, inter_list.is, + inter_list.inters, Val(energy_units); ndrange = length(inter_list)) + return pe_vec_nounits +end + +function specific_pe_gpu!(pe_vec_nounits, inter_list::InteractionList2Atoms, coords::AbstractArray{SVector{D, C}}, + velocities, atoms, boundary, step_n, energy_units, ::Val{T}) where {D, C, T} + + backend = get_backend(coords) + n_threads_gpu = gpu_threads_specific(length(inter_list)) + kernel! = specific_pe_2_atoms_kernel!(backend, n_threads_gpu) + kernel!(pe_vec_nounits, coords, velocities, atoms, boundary, step_n, inter_list.is, + inter_list.js, inter_list.inters, Val(energy_units); ndrange = length(inter_list)) + return pe_vec_nounits +end + +function specific_pe_gpu!(pe_vec_nounits, inter_list::InteractionList3Atoms, coords::AbstractArray{SVector{D, C}}, + velocities, atoms, boundary, step_n, energy_units, ::Val{T}) where {D, C, T} + backend = get_backend(coords) + n_threads_gpu = gpu_threads_specific(length(inter_list)) + kernel! = specific_pe_3_atoms_kernel!(backend, n_threads_gpu) + kernel!(pe_vec_nounits, coords, velocities, atoms, boundary, step_n, inter_list.is, + inter_list.js, inter_list.ks, inter_list.inters, Val(energy_units); + ndrange = length(inter_list)) + return pe_vec_nounits +end + +function specific_pe_gpu!(pe_vec_nounits, inter_list::InteractionList4Atoms, coords::AbstractArray{SVector{D, C}}, + velocities, atoms, boundary, step_n, energy_units, ::Val{T}) where {D, C, T} + backend = get_backend(coords) + n_threads_gpu = gpu_threads_specific(length(inter_list)) + kernel! 
= specific_pe_4_atoms_kernel!(backend, n_threads_gpu) + kernel!(pe_vec_nounits, coords, velocities, atoms, boundary, step_n, inter_list.is, + inter_list.js, inter_list.ks, inter_list.ls, inter_list.inters, Val(energy_units); + ndrange = length(inter_list)) + return pe_vec_nounits +end + +@kernel function specific_pe_1_atoms_kernel!(energy, @Const(coords), @Const(velocities), @Const(atoms), boundary, + step_n, @Const(is), @Const(inters), ::Val{E}) where E + + inter_i = @index(Global, Linear) + + @inbounds if inter_i <= length(is) + i = is[inter_i] + pe = potential_energy_gpu(inters[inter_i], coords[i], boundary, atoms[i], E, + velocities[i], step_n) + if unit(pe) != E + error("wrong energy unit returned, was expecting $E but got $(unit(pe))") + end + Atomix.@atomic energy[1] += ustrip(pe) + end +end + +@kernel function specific_pe_2_atoms_kernel!(energy, @Const(coords), @Const(velocities), @Const(atoms), boundary, + step_n, @Const(is), @Const(js), @Const(inters), ::Val{E}) where E + + + inter_i = @index(Global, Linear) + + @inbounds if inter_i <= length(is) + i, j = is[inter_i], js[inter_i] + pe = potential_energy_gpu(inters[inter_i], coords[i], coords[j], boundary, atoms[i], + atoms[j], E, velocities[i], velocities[j], step_n) + if unit(pe) != E + error("wrong energy unit returned, was expecting $E but got $(unit(pe))") + end + Atomix.@atomic energy[1] += ustrip(pe) + end +end + +@kernel function specific_pe_3_atoms_kernel!(energy, @Const(coords), @Const(velocities), @Const(atoms), boundary, + step_n, @Const(is), @Const(js), @Const(ks), @Const(inters), ::Val{E}) where E + + inter_i = @index(Global, Linear) + + @inbounds if inter_i <= length(is) + i, j, k = is[inter_i], js[inter_i], ks[inter_i] + pe = potential_energy_gpu(inters[inter_i], coords[i], coords[j], coords[k], boundary, + atoms[i], atoms[j], atoms[k], E, velocities[i], velocities[j], + velocities[k], step_n) + if unit(pe) != E + error("wrong energy unit returned, was expecting $E but got $(unit(pe))") + end + Atomix.@atomic energy[1] += ustrip(pe) + end +end + +@kernel function specific_pe_4_atoms_kernel!(energy, @Const(coords), @Const(velocities), @Const(atoms), boundary, + step_n, @Const(is), @Const(js), @Const(ks), @Const(ls), @Const(inters), ::Val{E}) where E + + inter_i = @index(Global, Linear) + + @inbounds if inter_i <= length(is) + i, j, k, l = is[inter_i], js[inter_i], ks[inter_i], ls[inter_i] + pe = potential_energy_gpu(inters[inter_i], coords[i], coords[j], coords[k], coords[l], + boundary, atoms[i], atoms[j], atoms[k], atoms[l], E, + velocities[i], velocities[j], velocities[k], velocities[l], + step_n) + if unit(pe) != E + error("wrong energy unit returned, was expecting $E but got $(unit(pe))") + end + Atomix.@atomic energy[1] += ustrip(pe) + end +end diff --git a/src/neighbors.jl b/src/neighbors.jl index 1c2288435..27210a093 100644 --- a/src/neighbors.jl +++ b/src/neighbors.jl @@ -92,12 +92,12 @@ function DistanceNeighborFinder(; eligible, dist_cutoff, special, n_steps, zero(eligible)) end -function find_neighbors(sys::System{D, false}, +function find_neighbors(sys::System{D, AT}, nf::DistanceNeighborFinder, current_neighbors=nothing, step_n::Integer=0, force_recompute::Bool=false; - n_threads::Integer=Threads.nthreads()) where D + n_threads::Integer=Threads.nthreads()) where {D, AT} if !force_recompute && !iszero(step_n % nf.n_steps) return current_neighbors end @@ -120,20 +120,19 @@ function find_neighbors(sys::System{D, false}, return NeighborList(length(neighbors_list), neighbors_list) end -function 
cuda_threads_blocks_dnf(n_inters) +function gpu_threads_blocks_dnf(n_inters) n_threads_gpu = parse(Int, get(ENV, "MOLLY_GPUNTHREADS_DISTANCENF", "512")) - n_blocks = cld(n_inters, n_threads_gpu) - return n_threads_gpu, n_blocks + return n_threads_gpu end -function distance_neighbor_finder_kernel!(neighbors, coords_var, eligible_var, - boundary, sq_dist_neighbors) - coords = CUDA.Const(coords_var) - eligible = CUDA.Const(eligible_var) +@kernel function distance_neighbor_finder_kernel!(neighbors, + @Const(coords), + @Const(eligible), + boundary, sq_dist_neighbors) n_atoms = length(coords) n_inters = n_atoms_to_n_pairs(n_atoms) - inter_i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + inter_i = @index(Global, Linear) @inbounds if inter_i <= n_inters i, j = pair_index(n_atoms, inter_i) @@ -145,28 +144,28 @@ function distance_neighbor_finder_kernel!(neighbors, coords_var, eligible_var, end end end - return nothing end lists_to_tuple_list(i, j, w) = (Int32(i), Int32(j), w) -function find_neighbors(sys::System{D, true}, +function find_neighbors(sys::System{D, AT}, nf::DistanceNeighborFinder, current_neighbors=nothing, step_n::Integer=0, force_recompute::Bool=false; - kwargs...) where D + kwargs...) where {D, AT <: AbstractGPUArray} if !force_recompute && !iszero(step_n % nf.n_steps) return current_neighbors end nf.neighbors .= false n_inters = n_atoms_to_n_pairs(length(sys)) - n_threads_gpu, n_blocks = cuda_threads_blocks_dnf(n_inters) + n_threads_gpu = gpu_threads_blocks_dnf(n_inters) - CUDA.@sync @cuda threads=n_threads_gpu blocks=n_blocks distance_neighbor_finder_kernel!( - nf.neighbors, sys.coords, nf.eligible, sys.boundary, nf.dist_cutoff^2, - ) + backend = get_backend(sys.coords) + kernel! = distance_neighbor_finder_kernel!(backend, n_threads_gpu) + kernel!(nf.neighbors, sys.coords, nf.eligible, sys.boundary, + nf.dist_cutoff^2, ndrange = n_inters) pairs = findall(nf.neighbors) nbsi, nbsj = getindex.(pairs, 1), getindex.(pairs, 2) @@ -335,19 +334,19 @@ function reduce_pairs(neighbors::NeighborList, neighbors_threaded::Vector{Neighb return neighbors end -function find_neighbors(sys::System{D, G}, +function find_neighbors(sys::System{D, AT}, nf::CellListMapNeighborFinder, current_neighbors=nothing, step_n::Integer=0, force_recompute::Bool=false; - n_threads::Integer=Threads.nthreads()) where {D, G} + n_threads::Integer=Threads.nthreads()) where {D, AT} if !force_recompute && !iszero(step_n % nf.n_steps) return current_neighbors end if isnothing(current_neighbors) neighbors = NeighborList() - elseif G + elseif AT <: AbstractGPUArray neighbors = NeighborList(current_neighbors.n, Array(current_neighbors.list)) else neighbors = current_neighbors @@ -379,8 +378,8 @@ function find_neighbors(sys::System{D, G}, ) nf.cl = cl - if G - return NeighborList(neighbors.n, CuArray(neighbors.list)) + if AT <: AbstractGPUArray + return NeighborList(neighbors.n, AT(neighbors.list)) else return neighbors end diff --git a/src/setup.jl b/src/setup.jl index e0b5efbe8..ad75f9164 100644 --- a/src/setup.jl +++ b/src/setup.jl @@ -428,8 +428,8 @@ are not available when reading Gromacs files. - `loggers=()`: the loggers that record properties of interest during a simulation. - `units::Bool=true`: whether to use Unitful quantities. -- `gpu::Bool=false`: whether to move the relevant parts of the system onto - the GPU. +- `array_type::AbstractArray = Array`: The array_type desired for the simulation + (for GPU support, use CuArray or ROCArray) - `dist_cutoff=1.0u"nm"`: cutoff distance for long-range interactions. 
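  A minimal usage sketch of the new keyword (it replaces the old `gpu=true` flag; the
  file and force field names below are illustrative, not fixed paths):

    using Molly, CUDA
    ff = MolecularForceField("ff99SBildn.xml", "tip3p_standard.xml", "his.xml")
    sys_cpu = System("protein.pdb", ff)                      # default, Array
    sys_gpu = System("protein.pdb", ff; array_type=CuArray)  # CUDA arrays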
- `dist_neighbors=1.2u"nm"`: cutoff distance for the neighbor list, should be greater than `dist_cutoff`. @@ -452,7 +452,7 @@ function System(coord_file::AbstractString, velocities=nothing, loggers=(), units::Bool=true, - gpu::Bool=false, + array_type::Type{AT} where AT <: AbstractArray = Array, dist_cutoff=units ? 1.0u"nm" : 1.0, dist_neighbors=units ? 1.2u"nm" : 1.2, center_coords::Bool=true, @@ -824,9 +824,9 @@ function System(coord_file::AbstractString, specific_inter_array = [] if length(bonds.is) > 0 push!(specific_inter_array, InteractionList2Atoms( - gpu ? CuArray(bonds.is) : bonds.is, - gpu ? CuArray(bonds.js) : bonds.js, - gpu ? CuArray([bonds.inters...]) : [bonds.inters...], + array_type(bonds.is), + array_type(bonds.js), + array_type([bonds.inters...]), bonds.types, )) topology = MolecularTopology(bonds.is, bonds.js, n_atoms) @@ -835,30 +835,30 @@ function System(coord_file::AbstractString, end if length(angles.is) > 0 push!(specific_inter_array, InteractionList3Atoms( - gpu ? CuArray(angles.is) : angles.is, - gpu ? CuArray(angles.js) : angles.js, - gpu ? CuArray(angles.ks) : angles.ks, - gpu ? CuArray([angles.inters...]) : [angles.inters...], + array_type(angles.is), + array_type(angles.js), + array_type(angles.ks), + array_type([angles.inters...]), angles.types, )) end if length(torsions.is) > 0 push!(specific_inter_array, InteractionList4Atoms( - gpu ? CuArray(torsions.is) : torsions.is, - gpu ? CuArray(torsions.js) : torsions.js, - gpu ? CuArray(torsions.ks) : torsions.ks, - gpu ? CuArray(torsions.ls) : torsions.ls, - gpu ? CuArray(torsion_inters_pad) : torsion_inters_pad, + array_type(torsions.is), + array_type(torsions.js), + array_type(torsions.ks), + array_type(torsions.ls), + array_type(torsion_inters_pad), torsions.types, )) end if length(impropers.is) > 0 push!(specific_inter_array, InteractionList4Atoms( - gpu ? CuArray(impropers.is) : impropers.is, - gpu ? CuArray(impropers.js) : impropers.js, - gpu ? CuArray(impropers.ks) : impropers.ks, - gpu ? CuArray(impropers.ls) : impropers.ls, - gpu ? CuArray(improper_inters_pad) : improper_inters_pad, + array_type(impropers.is), + array_type(impropers.js), + array_type(impropers.ks), + array_type(impropers.ls), + array_type(improper_inters_pad), impropers.types, )) end @@ -887,11 +887,11 @@ function System(coord_file::AbstractString, end coords = wrap_coords.(coords, (boundary_used,)) - if gpu - neighbor_finder = GPUNeighborFinder( - eligible=CuArray(eligible), + if (array_type <: AbstractGPUArray) + neighbor_finder = DistanceNeighborFinder( + eligible=array_type(eligible), dist_cutoff=T(dist_neighbors), - special=CuArray(special), + special=array_type(special), n_steps_reorder=10, initialized=false, ) @@ -912,13 +912,9 @@ function System(coord_file::AbstractString, dist_cutoff=T(dist_neighbors), ) end - if gpu - atoms = CuArray([atoms_abst...]) - coords_dev = CuArray(coords) - else - atoms = [atoms_abst...] - coords_dev = coords - end + + atoms = array_type([atoms_abst...]) + coords_dev = array_type(coords) if isnothing(velocities) if units @@ -973,7 +969,7 @@ function System(T::Type, velocities=nothing, loggers=(), units::Bool=true, - gpu::Bool=false, + array_type::Type{AT} where AT <: AbstractArray = Array, dist_cutoff=units ? 1.0u"nm" : 1.0, dist_neighbors=units ? 1.2u"nm" : 1.2, center_coords::Bool=true, @@ -1254,9 +1250,9 @@ function System(T::Type, specific_inter_array = [] if length(bonds.is) > 0 push!(specific_inter_array, InteractionList2Atoms( - gpu ? CuArray(bonds.is) : bonds.is, - gpu ? 
CuArray(bonds.js) : bonds.js, - gpu ? CuArray([bonds.inters...]) : [bonds.inters...], + array_type(bonds.is), + array_type(bonds.js), + array_type([bonds.inters...]), bonds.types, )) topology = MolecularTopology(bonds.is, bonds.js, n_atoms) @@ -1265,30 +1261,30 @@ function System(T::Type, end if length(angles.is) > 0 push!(specific_inter_array, InteractionList3Atoms( - gpu ? CuArray(angles.is) : angles.is, - gpu ? CuArray(angles.js) : angles.js, - gpu ? CuArray(angles.ks) : angles.ks, - gpu ? CuArray([angles.inters...]) : [angles.inters...], + array_type(angles.is), + array_type(angles.js), + array_type(angles.ks), + array_type([angles.inters...]), angles.types, )) end if length(torsions.is) > 0 push!(specific_inter_array, InteractionList4Atoms( - gpu ? CuArray(torsions.is) : torsions.is, - gpu ? CuArray(torsions.js) : torsions.js, - gpu ? CuArray(torsions.ks) : torsions.ks, - gpu ? CuArray(torsions.ls) : torsions.ls, - gpu ? CuArray([torsions.inters...]) : [torsions.inters...], + array_type(torsions.is), + array_type(torsions.js), + array_type(torsions.ks), + array_type(torsions.ls), + array_type([torsions.inters...]), torsions.types, )) end specific_inter_lists = tuple(specific_inter_array...) - if gpu - neighbor_finder = GPUNeighborFinder( - eligible=CuArray(eligible), + if array_type <: AbstractGPUArray + neighbor_finder = DistanceNeighborFinder( + eligible=array_type(eligible), dist_cutoff=T(dist_neighbors), - special=CuArray(special), + special=array_type(special), n_steps_reorder=10, initialized=false, ) @@ -1309,13 +1305,9 @@ function System(T::Type, dist_cutoff=T(dist_neighbors), ) end - if gpu - atoms = CuArray([atoms_abst...]) - coords_dev = CuArray(coords) - else - atoms = [atoms_abst...] - coords_dev = coords - end + + atoms = array_type([atoms_abst...]) + coords_dev = array_type(coords) if isnothing(velocities) if units diff --git a/src/simulators.jl b/src/simulators.jl index 288d2870d..ddacb3fca 100644 --- a/src/simulators.jl +++ b/src/simulators.jl @@ -829,12 +829,12 @@ Attempt an exchange of replicas `n` and `m` in a [`ReplicaSystem`](@ref) during Successful exchanges should exchange coordinates and velocities as appropriate. Returns acceptance quantity `Δ` and a `Bool` indicating whether the exchange was successful. 
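For reference, the standard temperature-REMD criterion this corresponds to can be sketched
as follows (an illustrative sketch with plain unitless numbers, not the exact implementation;
`E_n` and `E_m` are the potential energies of the two replicas):

    β_n, β_m = inv(k * T_n), inv(k * T_m)
    Δ = (β_n - β_m) * (E_m - E_n)
    accept = Δ <= zero(Δ) || rand() < exp(-Δ)

Coordinates and velocities are swapped between the replicas when `accept` is true.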
""" -function remd_exchange!(sys::ReplicaSystem{D, G, T}, +function remd_exchange!(sys::ReplicaSystem{D, AT, T}, sim::TemperatureREMD, n::Integer, m::Integer; n_threads::Integer=Threads.nthreads(), - rng=Random.default_rng()) where {D, G, T} + rng=Random.default_rng()) where {D, AT, T} T_n, T_m = sim.temperatures[n], sim.temperatures[m] β_n, β_m = inv(sys.k * T_n), inv(sys.k * T_m) neighbors_n = find_neighbors(sys.replicas[n], sys.replicas[n].neighbor_finder; @@ -920,12 +920,12 @@ function simulate!(sys::ReplicaSystem, return simulate_remd!(sys, sim, n_steps; n_threads=n_threads, run_loggers=run_loggers, rng=rng) end -function remd_exchange!(sys::ReplicaSystem{D, G, T}, +function remd_exchange!(sys::ReplicaSystem{D, AT, T}, sim::HamiltonianREMD, n::Integer, m::Integer; n_threads::Integer=Threads.nthreads(), - rng=Random.default_rng()) where {D, G, T} + rng=Random.default_rng()) where {D, AT, T} T_sim = sim.temperature β_sim = inv(sys.k * T_sim) neighbors_n = find_neighbors(sys.replicas[n], sys.replicas[n].neighbor_finder; @@ -1045,12 +1045,12 @@ function MetropolisMonteCarlo(; temperature, trial_moves, trial_args=Dict()) return MetropolisMonteCarlo(temperature, trial_moves, trial_args) end -@inline function simulate!(sys::System{D, G, T}, +@inline function simulate!(sys::System{D, AT, T}, sim::MetropolisMonteCarlo, n_steps::Integer; n_threads::Integer=Threads.nthreads(), run_loggers=true, - rng=Random.default_rng()) where {D, G, T} + rng=Random.default_rng()) where {D, AT, T} neighbors = find_neighbors(sys, sys.neighbor_finder; n_threads=n_threads) E_old = potential_energy(sys, neighbors; n_threads=n_threads) coords_old = similar(sys.coords) @@ -1088,9 +1088,9 @@ Performs a random translation of the coordinates of a randomly selected atom in The translation is generated using a uniformly selected direction and uniformly selected length in range [0, 1) scaled by `shift_size` which should have appropriate length units. """ -function random_uniform_translation!(sys::System{D, G, T}; +function random_uniform_translation!(sys::System{D, AT, T}; shift_size=oneunit(eltype(eltype(sys.coords))), - rng=Random.default_rng()) where {D, G, T} + rng=Random.default_rng()) where {D, AT, T} rand_idx = rand(rng, eachindex(sys)) direction = random_unit_vector(T, D, rng) magnitude = rand(rng, T) * shift_size @@ -1108,9 +1108,9 @@ The translation is generated using a uniformly chosen direction and length selec the standard normal distribution i.e. with mean 0 and standard deviation 1, scaled by `shift_size` which should have appropriate length units. 
""" -function random_normal_translation!(sys::System{D, G, T}; +function random_normal_translation!(sys::System{D, AT, T}; shift_size=oneunit(eltype(eltype(sys.coords))), - rng=Random.default_rng()) where {D, G, T} + rng=Random.default_rng()) where {D, AT, T} rand_idx = rand(rng, eachindex(sys)) direction = random_unit_vector(T, D, rng) magnitude = randn(rng, T) * shift_size diff --git a/src/spatial.jl b/src/spatial.jl index f918a827a..3895ec1ba 100644 --- a/src/spatial.jl +++ b/src/spatial.jl @@ -613,12 +613,12 @@ function random_velocities(sys::AtomsBase.AbstractSystem{2}, temp; rng=Random.de return random_velocity_2D.(masses(sys), temp, sys.k, rng) end -function random_velocities(sys::System{3, true}, temp; rng=Random.default_rng()) - return CuArray(random_velocity_3D.(Array(masses(sys)), temp, sys.k, rng)) +function random_velocities(sys::System{3, AT}, temp; rng=Random.default_rng()) where AT <: AbstractGPUArray + return AT(random_velocity_3D.(Array(masses(sys)), temp, sys.k, rng)) end -function random_velocities(sys::System{2, true}, temp; rng=Random.default_rng()) - return CuArray(random_velocity_2D.(Array(masses(sys)), temp, sys.k, rng)) +function random_velocities(sys::System{2, AT}, temp; rng=Random.default_rng()) where AT <: AbstractGPUArray + return AT(random_velocity_2D.(Array(masses(sys)), temp, sys.k, rng)) end """ @@ -634,6 +634,7 @@ function random_velocities!(sys, temp; rng=Random.default_rng()) end function random_velocities!(vels, sys::AbstractSystem, temp; rng=Random.default_rng()) + vs = random_velocities(sys, temp; rng=rng) vels .= random_velocities(sys, temp; rng=rng) return vels end @@ -738,9 +739,9 @@ function virial(sys, neighbors, step_n::Integer=0; n_threads::Integer=Threads.nt return v end -function virial(sys::System{D, G, T}, neighbors_dev, step_n, pairwise_inters_nonl, - pairwise_inters_nl) where {D, G, T} - if G +function virial(sys::System{D, AT, T}, neighbors_dev, step_n, pairwise_inters_nonl, + pairwise_inters_nl) where {D, AT, T} + if AT <: AbstractGPUArray coords, velocities, atoms = Array(sys.coords), Array(sys.velocities), Array(sys.atoms) if isnothing(neighbors_dev) neighbors = neighbors_dev @@ -792,7 +793,7 @@ function virial(sys::System{D, G, T}, neighbors_dev, step_n, pairwise_inters_non end # Default for general interactions -function virial(inter, sys::System{D, G, T}, args...; kwargs...) where {D, G, T} +function virial(inter, sys::System{D, AT, T}, args...; kwargs...) 
where {D, AT, T} return zero(T) * sys.energy_units end @@ -874,8 +875,9 @@ function molecule_centers(coords::AbstractArray{SVector{D, C}}, boundary, topolo end end -function molecule_centers(coords::CuArray, boundary, topology) - return CuArray(molecule_centers(Array(coords), boundary, topology)) +function molecule_centers(coords::AbstractGPUArray, boundary, topology) + array_type = get_array_type(coords) + return array_type(molecule_centers(Array(coords), boundary, topology)) end # Allows scaling multiple vectors at once by broadcasting this function diff --git a/src/types.jl b/src/types.jl index 225d9cff3..817ad29f3 100644 --- a/src/types.jl +++ b/src/types.jl @@ -20,7 +20,8 @@ export masses, charges, MollyCalculator, - ASECalculator + ASECalculator, + NoNeighborList const DefaultFloat = Float64 @@ -182,39 +183,23 @@ function Base.:+(il1::InteractionList4Atoms{I, T}, il2::InteractionList4Atoms{I, ) end -function inject_interaction_list(inter::InteractionList1Atoms, params_dic, gpu) - if gpu - inters_grad = CuArray(inject_interaction.(Array(inter.inters), inter.types, (params_dic,))) - else - inters_grad = inject_interaction.(inter.inters, inter.types, (params_dic,)) - end +function inject_interaction_list(inter::InteractionList1Atoms, params_dic, array_type) + inters_grad = array_type(inject_interaction.(Array(inter.inters), inter.types, (params_dic,))) InteractionList1Atoms(inter.is, inters_grad, inter.types) end -function inject_interaction_list(inter::InteractionList2Atoms, params_dic, gpu) - if gpu - inters_grad = CuArray(inject_interaction.(Array(inter.inters), inter.types, (params_dic,))) - else - inters_grad = inject_interaction.(inter.inters, inter.types, (params_dic,)) - end +function inject_interaction_list(inter::InteractionList2Atoms, params_dic, array_type) + inters_grad = array_type(inject_interaction.(Array(inter.inters), inter.types, (params_dic,))) InteractionList2Atoms(inter.is, inter.js, inters_grad, inter.types) end -function inject_interaction_list(inter::InteractionList3Atoms, params_dic, gpu) - if gpu - inters_grad = CuArray(inject_interaction.(Array(inter.inters), inter.types, (params_dic,))) - else - inters_grad = inject_interaction.(inter.inters, inter.types, (params_dic,)) - end +function inject_interaction_list(inter::InteractionList3Atoms, params_dic, array_type) + inters_grad = array_type(inject_interaction.(Array(inter.inters), inter.types, (params_dic,))) InteractionList3Atoms(inter.is, inter.js, inter.ks, inters_grad, inter.types) end -function inject_interaction_list(inter::InteractionList4Atoms, params_dic, gpu) - if gpu - inters_grad = CuArray(inject_interaction.(Array(inter.inters), inter.types, (params_dic,))) - else - inters_grad = inject_interaction.(inter.inters, inter.types, (params_dic,)) - end +function inject_interaction_list(inter::InteractionList4Atoms, params_dic, array_type) + inters_grad = array_type(inject_interaction.(Array(inter.inters), inter.types, (params_dic,))) InteractionList4Atoms(inter.is, inter.js, inter.ks, inter.ls, inters_grad, inter.types) end @@ -431,8 +416,6 @@ Base.firstindex(::NoNeighborList) = 1 Base.lastindex(nl::NoNeighborList) = length(nl) Base.eachindex(nl::NoNeighborList) = Base.OneTo(length(nl)) -CUDA.Const(nl::NoNeighborList) = nl - """ System(; ) @@ -481,8 +464,8 @@ interface described there. modified in some simulations. `k` is chosen based on the `energy_units` given. - `data::DA=nothing`: arbitrary data associated with the system. 
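The second type parameter is now the array type `AT` rather than a `gpu::Bool`, so
backend-specific methods dispatch on it. A schematic example of the pattern (the function
name here is hypothetical):

    using GPUArrays: AbstractGPUArray

    backend_name(::System{D, AT}) where {D, AT <: Array} = "CPU"
    backend_name(::System{D, AT}) where {D, AT <: AbstractGPUArray} = "GPU"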
""" -mutable struct System{D, G, T, A, C, B, V, AD, TO, PI, SI, GI, CN, NF, - L, F, E, K, M, DA} <: AtomsBase.AbstractSystem{D} +mutable struct System{D, AT, T, A, C, B, V, AD, TO, PI, SI, GI, CN, NF, + L, F, E, K, M, DA} <: AbstractSystem{D} atoms::A coords::C boundary::B @@ -521,7 +504,7 @@ function System(; k=default_k(energy_units), data=nothing) D = AtomsBase.n_dimensions(boundary) - G = isa(coords, CuArray) + AT = get_array_type(coords) T = float_type(boundary) A = typeof(atoms) C = typeof(coords) @@ -567,19 +550,19 @@ function System(; end end - if isa(atoms, CuArray) && !isa(coords, CuArray) + if isa(atoms, AbstractGPUArray) && !isa(coords, AbstractGPUArray) throw(ArgumentError("the atoms are on the GPU but the coordinates are not")) end - if isa(coords, CuArray) && !isa(atoms, CuArray) + if isa(coords, AbstractGPUArray) && !isa(atoms, AbstractGPUArray) throw(ArgumentError("the coordinates are on the GPU but the atoms are not")) end - if isa(atoms, CuArray) && !isa(vels, CuArray) + if isa(atoms, AbstractGPUArray) && !isa(vels, AbstractGPUArray) throw(ArgumentError("the atoms are on the GPU but the velocities are not")) end - if isa(vels, CuArray) && !isa(atoms, CuArray) + if isa(vels, AbstractGPUArray) && !isa(atoms, AbstractGPUArray) throw(ArgumentError("the velocities are on the GPU but the atoms are not")) end - if isa(atoms, CuArray) && length(constraints) > 0 + if isa(atoms, AbstractGPUArray) && length(constraints) > 0 @warn "Constraints are not currently compatible with simulation on the GPU" end @@ -596,7 +579,7 @@ function System(; check_units(atoms, coords, vels, energy_units, force_units, pairwise_inters, specific_inter_lists, general_inters, boundary) - return System{D, G, T, A, C, B, V, AD, TO, PI, SI, GI, CN, NF, L, F, E, K, M, DA}( + return System{D, AT, T, A, C, B, V, AD, TO, PI, SI, GI, CN, NF, L, F, E, K, M, DA}( atoms, coords, boundary, vels, atoms_data, topology, pairwise_inters, specific_inter_lists, general_inters, constraints, neighbor_finder, loggers, df, force_units, energy_units, k_converted, atom_masses, data) @@ -721,19 +704,15 @@ Allows gradients for individual parameters to be tracked. Returns atoms, pairwise interactions, specific interaction lists and general interactions. """ -function inject_gradients(sys::System{D, G}, params_dic) where {D, G} - if G - atoms_grad = CuArray(inject_atom.(Array(sys.atoms), sys.atoms_data, (params_dic,))) - else - atoms_grad = inject_atom.(sys.atoms, sys.atoms_data, (params_dic,)) - end +function inject_gradients(sys::System{D, AT}, params_dic) where {D, AT} + atoms_grad = AT(inject_atom.(Array(sys.atoms), sys.atoms_data, (params_dic,))) if length(sys.pairwise_inters) > 0 pis_grad = inject_interaction.(sys.pairwise_inters, (params_dic,)) else pis_grad = sys.pairwise_inters end if length(sys.specific_inter_lists) > 0 - sis_grad = inject_interaction_list.(sys.specific_inter_lists, (params_dic,), G) + sis_grad = inject_interaction_list.(sys.specific_inter_lists, (params_dic,), AT) else sis_grad = sys.specific_inter_lists end @@ -847,7 +826,7 @@ construction where `n` is the number of threads to be used per replica. modified in some simulations. `k` is chosen based on the `energy_units` given. - `data::DA=nothing`: arbitrary data associated with the replica system. 
""" -mutable struct ReplicaSystem{D, G, T, A, AD, EL, F, E, K, R, DA} <: AtomsBase.AbstractSystem{D} +mutable struct ReplicaSystem{D, AT, T, A, AD, EL, F, E, K, R, DA} <: AbstractSystem{D} atoms::A n_replicas::Int atoms_data::AD @@ -884,7 +863,8 @@ function ReplicaSystem(; k=default_k(energy_units), data=nothing) D = AtomsBase.n_dimensions(boundary) - G = isa(replica_coords[1], CuArray) + D = n_dimensions(boundary) + AT = get_array_type(replica_coords[1]) T = float_type(boundary) A = typeof(atoms) AD = typeof(atoms_data) @@ -995,25 +975,25 @@ function ReplicaSystem(; throw(ArgumentError("there are $(length(atoms)) atoms but $(length(atoms_data)) atom data entries")) end - n_cuarray = sum(y -> isa(y, CuArray), replica_coords) + n_cuarray = sum(y -> isa(y, AbstractGPUArray), replica_coords) if !(n_cuarray == n_replicas || n_cuarray == 0) throw(ArgumentError("the coordinates for $n_cuarray out of $n_replicas replicas are on GPU")) end - if isa(atoms, CuArray) && n_cuarray != n_replicas + if isa(atoms, AbstractGPUArray) && n_cuarray != n_replicas throw(ArgumentError("the atoms are on the GPU but the coordinates are not")) end - if n_cuarray == n_replicas && !isa(atoms, CuArray) + if n_cuarray == n_replicas && !isa(atoms, AbstractGPUArray) throw(ArgumentError("the coordinates are on the GPU but the atoms are not")) end - n_cuarray = sum(y -> isa(y, CuArray), replica_velocities) + n_cuarray = sum(y -> isa(y, AbstractGPUArray), replica_velocities) if !(n_cuarray == n_replicas || n_cuarray == 0) throw(ArgumentError("the velocities for $n_cuarray out of $n_replicas replicas are on GPU")) end - if isa(atoms, CuArray) && n_cuarray != n_replicas + if isa(atoms, AbstractGPUArray) && n_cuarray != n_replicas throw(ArgumentError("the atoms are on the GPU but the velocities are not")) end - if n_cuarray == n_replicas && !isa(atoms, CuArray) + if n_cuarray == n_replicas && !isa(atoms, AbstractGPUArray) throw(ArgumentError("the velocities are on the GPU but the atoms are not")) end @@ -1023,7 +1003,7 @@ function ReplicaSystem(; k_converted = convert_k_units(T, k, energy_units) K = typeof(k_converted) - replicas = Tuple(System{D, G, T, A, C, B, V, AD, TO, typeof(replica_pairwise_inters[i]), + replicas = Tuple(System{D, AT, T, A, C, B, V, AD, TO, typeof(replica_pairwise_inters[i]), typeof(replica_specific_inter_lists[i]), typeof(replica_general_inters[i]), typeof(replica_constraints[i]), NF, typeof(replica_loggers[i]), F, E, K, M, Nothing}( @@ -1034,7 +1014,7 @@ function ReplicaSystem(; force_units, energy_units, k_converted, atom_masses, nothing) for i in 1:n_replicas) R = typeof(replicas) - return ReplicaSystem{D, G, T, A, AD, EL, F, E, K, R, DA}( + return ReplicaSystem{D, AT, T, A, AD, EL, F, E, K, R, DA}( atoms, n_replicas, atoms_data, exchange_logger, force_units, energy_units, k_converted, replicas, data) end @@ -1044,7 +1024,7 @@ end Whether a [`System`](@ref) or [`ReplicaSystem`](@ref) is on the GPU. """ -is_on_gpu(::Union{System{D, G}, ReplicaSystem{D, G}}) where {D, G} = G +is_on_gpu(::Union{System{D, AT}, ReplicaSystem{D, AT}}) where {D, AT} = AT <: AbstractGPUArray """ float_type(sys) @@ -1052,7 +1032,7 @@ is_on_gpu(::Union{System{D, G}, ReplicaSystem{D, G}}) where {D, G} = G The float type a [`System`](@ref), [`ReplicaSystem`](@ref) or bounding box uses. 
""" -float_type(::Union{System{D, G, T}, ReplicaSystem{D, G, T}}) where {D, G, T} = T +float_type(::Union{System{D, AT, T}, ReplicaSystem{D, AT, T}}) where {D, AT, T} = T """ masses(sys) @@ -1071,8 +1051,7 @@ charges(s::Union{System, ReplicaSystem}) = charge.(s.atoms) charge(s::Union{System, ReplicaSystem}, i::Integer) = charge(s.atoms[i]) # Move an array to the GPU depending on whether the system is on the GPU -move_array(arr, ::System{D, false}) where {D} = arr -move_array(arr, ::System{D, true }) where {D} = CuArray(arr) +move_array(arr, ::System{D, AT}) where {D, AT} = AT(arr) Base.getindex(s::Union{System, ReplicaSystem}, i::Union{Integer, AbstractVector}) = s.atoms[i] Base.length(s::Union{System, ReplicaSystem}) = length(s.atoms) diff --git a/test/Project.toml b/test/Project.toml index 3901cc98f..69fec6609 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -1,4 +1,5 @@ [deps] +AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e" Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595" AtomsBase = "a963bdd2-2df7-4f54-a1ee-49d51e6be12a" AtomsBaseTesting = "ed7c10db-df7e-4efa-a7be-4f4190f7f227" @@ -9,6 +10,7 @@ DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab" Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9" FiniteDifferences = "26cc04aa-876d-5657-8c51-4c34ba976000" GLMakie = "e9467ef8-e4e7-5192-8a1a-b1aee30e663a" +GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" KernelDensity = "5ab0869b-81aa-558d-bb23-cbf5423bbe9b" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" diff --git a/test/basic.jl b/test/basic.jl index b9273cf0f..61dfa18a6 100644 --- a/test/basic.jl +++ b/test/basic.jl @@ -176,22 +176,22 @@ @test mcs == [SVector(0.05, 0.0), SVector(1.0, 1.0)] ff = MolecularForceField(joinpath.(ff_dir, ["ff99SBildn.xml", "tip3p_standard.xml", "his.xml"])...) - for gpu in gpu_list - sys = System(joinpath(data_dir, "6mrr_equil.pdb"), ff; gpu=gpu, use_cell_list=false) + for array_type in array_list + sys = System(joinpath(data_dir, "6mrr_equil.pdb"), ff; array_type=array_type, use_cell_list=false) mcs = molecule_centers(sys.coords, sys.boundary, sys.topology) - @test isapprox(Array(mcs)[1], mean(sys.coords[1:1170]); atol=0.04u"nm") + @test isapprox(Array(mcs)[1], mean(sys.coords[1:1170]); atol=0.08u"nm") # Mark all pairs as ineligible for pairwise interactions and check that the # potential energy from the specific interactions does not change on scaling no_nbs = falses(length(sys), length(sys)) - if gpu + if array_type <: AbstractGPUArray sys.neighbor_finder = GPUNeighborFinder( - eligible=(gpu ? CuArray(no_nbs) : no_nbs), + eligible=array_type(no_nbs), dist_cutoff=1.0u"nm", ) else sys.neighbor_finder = DistanceNeighborFinder( - eligible=(gpu ? 
CuArray(no_nbs) : no_nbs), + eligible=array_type(no_nbs), dist_cutoff=1.0u"nm", ) end @@ -317,8 +317,9 @@ end end end - if run_gpu_tests - sys_gpu = System(joinpath(data_dir, "6mrr_equil.pdb"), ff; gpu=true) + if run_cuda_tests + sys_gpu = System(joinpath(data_dir, "6mrr_equil.pdb"), ff; + array_type=CuArray) for neighbor_finder in (DistanceNeighborFinder,) nf_gpu = neighbor_finder( eligible=sys_gpu.neighbor_finder.eligible, @@ -327,7 +328,25 @@ end ) neighbors_gpu = find_neighbors(sys_gpu, nf_gpu) @test length(neighbors_gpu) == n_neighbors_ref - CUDA.allowscalar() do + GPUArrays.allowscalar() do + @test neighbors_gpu[10] isa Tuple{Int32, Int32, Bool} + end + @test identical_neighbors(neighbors_gpu, neighbors_ref) + end + end + + if run_rocm_tests + sys_gpu = System(joinpath(data_dir, "6mrr_equil.pdb"), ff; + array_type=ROCArray) + for neighbor_finder in (DistanceNeighborFinder,) + nf_gpu = neighbor_finder( + eligible=sys_gpu.neighbor_finder.eligible, + special=sys_gpu.neighbor_finder.special, + dist_cutoff=dist_cutoff, + ) + neighbors_gpu = find_neighbors(sys_gpu, nf_gpu) + @test length(neighbors_gpu) == n_neighbors_ref + GPUArrays.allowscalar() do @test neighbors_gpu[10] isa Tuple{Int32, Int32, Bool} end @test identical_neighbors(neighbors_gpu, neighbors_ref) @@ -343,9 +362,13 @@ end coords_1 = SVector{3, Float64}.(eachcol(cm_1)) / 10 * u"nm" coords_2 = SVector{3, Float64}.(eachcol(cm_2)) / 10 * u"nm" @test rmsd(coords_1, coords_2) ≈ 2.54859467758795u"Å" - if run_gpu_tests + if run_cuda_tests @test rmsd(CuArray(coords_1), CuArray(coords_2)) ≈ 2.54859467758795u"Å" end + if run_rocm_tests + @test rmsd(ROCArray(coords_1), + ROCArray(coords_2)) ≈ 2.54859467758795u"Å" + end bb_atoms = BioStructures.collectatoms(struc[1], BioStructures.backboneselector) coords = SVector{3, Float64}.(eachcol(BioStructures.coordarray(bb_atoms))) / 10 * u"nm" diff --git a/test/energy_conservation.jl b/test/energy_conservation.jl index f29aca2b4..c2a423bae 100644 --- a/test/energy_conservation.jl +++ b/test/energy_conservation.jl @@ -6,7 +6,7 @@ using CUDA using Test @testset "Lennard-Jones energy conservation" begin - function test_energy_conservation(nl::Bool, gpu::Bool, n_threads::Integer, n_steps::Integer) + function test_energy_conservation(nl::Bool, array_type::AbstractArray, n_threads::Integer, n_steps::Integer) n_atoms = 2_000 atom_mass = 40.0u"g/mol" temp = 1.0u"K" @@ -41,8 +41,8 @@ using Test end sys = System( - atoms=(gpu ? CuArray(atoms) : atoms), - coords=(gpu ? 
CuArray(coords) : coords), + atoms=(array_type(atoms) : atoms), + coords=(array_type(coords) : coords), boundary=boundary, pairwise_inters=(LennardJones(cutoff=cutoff, use_neighbors=ifelse(nl, true, false)),), neighbor_finder=neighbor_finder, @@ -72,15 +72,15 @@ using Test end end - test_energy_conservation(true, false, 1, 10_000) - test_energy_conservation(false, false, 1, 10_000) + test_energy_conservation(true, Array, 1, 10_000) + test_energy_conservation(false, Array, 1, 10_000) if Threads.nthreads() > 1 - test_energy_conservation(true, false, Threads.nthreads(), 50_000) - test_energy_conservation(false, false, Threads.nthreads(), 50_000) + test_energy_conservation(true, Array, Threads.nthreads(), 50_000) + test_energy_conservation(false, Array, Threads.nthreads(), 50_000) end - if CUDA.functional() - test_energy_conservation(true, true, 1, 100_000) - test_energy_conservation(false, true, 1, 100_000) + for array_type in array_list[2:end] + test_energy_conservation(true, array_type, 1, 100_000) + test_energy_conservation(false, array_type, 1, 100_000) end end diff --git a/test/gradients.jl b/test/gradients.jl index cce785a6b..1013ef9ae 100644 --- a/test/gradients.jl +++ b/test/gradients.jl @@ -36,24 +36,31 @@ end @testset "Differentiable simulation" begin runs = [ # gpu par fwd f32 obc2 gbn2 tol_σ tol_r0 - ("CPU" , false, false, false, false, false, false, 1e-4, 1e-4), - ("CPU forward" , false, false, true , false, false, false, 0.5 , 0.1 ), - ("CPU f32" , false, false, false, true , false, false, 0.01, 5e-4), - ("CPU obc2" , false, false, false, false, true , false, 1e-4, 1e-4), - ("CPU gbn2" , false, false, false, false, false, true , 1e-4, 1e-4), - ("CPU gbn2 forward", false, false, true , false, false, true , 0.5 , 0.1 ), + ("CPU" , Array, false, false, false, false, false, 1e-4, 1e-4), + ("CPU forward" , Array, false, true , false, false, false, 0.5 , 0.1 ), + ("CPU f32" , Array, false, false, true , false, false, 0.01, 5e-4), + ("CPU obc2" , Array, false, false, false, true , false, 1e-4, 1e-4), + ("CPU gbn2" , Array, false, false, false, false, true , 1e-4, 1e-4), + ("CPU gbn2 forward", Array, false, true , false, false, true , 0.5 , 0.1 ), ] if run_parallel_tests # gpu par fwd f32 obc2 gbn2 tol_σ tol_r0 - push!(runs, ("CPU parallel" , false, true , false, false, false, false, 1e-4, 1e-4)) - push!(runs, ("CPU parallel forward", false, true , true , false, false, false, 0.5 , 0.1 )) - push!(runs, ("CPU parallel f32" , false, true , false, true , false, false, 0.01, 5e-4)) + push!(runs, ("CPU parallel" , Array, true , false, false, false, false, 1e-4, 1e-4)) + push!(runs, ("CPU parallel forward", Array, true , true , false, false, false, 0.5 , 0.1 )) + push!(runs, ("CPU parallel f32" , Array, true , false, true , false, false, 0.01, 5e-4)) end - if run_gpu_tests # gpu par fwd f32 obc2 gbn2 tol_σ tol_r0 - push!(runs, ("GPU" , true , false, false, false, false, false, 0.25, 20.0)) - push!(runs, ("GPU forward" , true , false, true , false, false, false, 0.25, 20.0)) - push!(runs, ("GPU f32" , true , false, false, true , false, false, 0.5 , 50.0)) - push!(runs, ("GPU obc2" , true , false, false, false, true , false, 0.25, 20.0)) - push!(runs, ("GPU gbn2" , true , false, false, false, false, true , 0.25, 20.0)) + if run_cuda_tests # gpu par fwd f32 obc2 gbn2 tol_σ tol_r0 + push!(runs, ("CUDA" , CuArray, false, false, false, false, false, 0.25, 20.0)) + push!(runs, ("CUDA forward" , CuArray, false, true , false, false, false, 0.25, 20.0)) + push!(runs, ("CUDA f32" , CuArray, false, 
false, true , false, false, 0.5 , 50.0)) + push!(runs, ("CUDA obc2" , CuArray, false, false, false, true , false, 0.25, 20.0)) + push!(runs, ("CUDA gbn2" , CuArray, false, false, false, false, true , 0.25, 20.0)) + end + if run_rocm_tests # gpu par fwd f32 obc2 gbn2 tol_σ tol_r0 + push!(runs, ("ROCM" , ROCArray, false, false, false, false, false, 0.25, 20.0)) + push!(runs, ("ROCM forward" , ROCArray, false, true , false, false, false, 0.25, 20.0)) + push!(runs, ("ROCM f32" , ROCArray, false, false, true , false, false, 0.5 , 50.0)) + push!(runs, ("ROCM obc2" , ROCArray, false, false, false, true , false, 0.25, 20.0)) + push!(runs, ("ROCM gbn2" , ROCArray, false, false, false, false, true , 0.25, 20.0)) end function mean_min_separation(coords, boundary, ::Val{T}) where T @@ -103,9 +110,8 @@ end return mean_min_separation(sys.coords, boundary, Val(T)) end - for (name, gpu, parallel, forward, f32, obc2, gbn2, tol_σ, tol_r0) in runs + for (name, AT, parallel, forward, f32, obc2, gbn2, tol_σ, tol_r0) in runs T = f32 ? Float32 : Float64 - AT = gpu ? CuArray : Array σ = T(0.4) r0 = T(1.0) n_atoms = 50 @@ -245,13 +251,13 @@ end end @testset "Differentiable protein" begin - function create_sys(gpu::Bool) + function create_sys(array_type) ff = MolecularForceField(joinpath.(ff_dir, ["ff99SBildn.xml", "his.xml"])...; units=false) return System( joinpath(data_dir, "6mrr_nowater.pdb"), ff; units=false, - gpu=gpu, + array_type=array_type, implicit_solvent="gbn2", kappa=0.7, ) @@ -402,10 +408,13 @@ end platform_runs = [("CPU", false, false)] if run_parallel_tests - push!(platform_runs, ("CPU parallel", false, true)) + push!(platform_runs, ("CPU parallel", Array, true)) + end + if run_cuda_tests + push!(platform_runs, ("CUDA", CuArray, false)) end - if run_gpu_tests - push!(platform_runs, ("GPU", true, false)) + if run_rocm_tests + push!(platform_runs, ("ROCM", ROCArray, false)) end test_runs = [ ("Energy", test_energy_grad, 1e-8), @@ -423,8 +432,8 @@ end ) for (test_name, test_fn, test_tol) in test_runs - for (platform, gpu, parallel) in platform_runs - sys_ref = create_sys(gpu) + for (platform, AT, parallel) in platform_runs + sys_ref = create_sys(AT) n_threads = parallel ? 
Threads.nthreads() : 1 grads_enzyme = Dict(k => 0.0 for k in keys(params_dic)) autodiff( diff --git a/test/minimization.jl b/test/minimization.jl index 83a10f0e8..5a75a0e27 100644 --- a/test/minimization.jl +++ b/test/minimization.jl @@ -42,14 +42,14 @@ @test isapprox(potential_energy(sys; n_threads=1) * u"kJ * mol^-1", -3.0u"kJ * mol^-1"; atol=1e-4u"kJ * mol^-1") - if run_gpu_tests - coords = CuArray([ + for array_type in array_list[2:end] + coords = array_type([ SVector(1.0, 1.0, 1.0)u"nm", SVector(1.6, 1.0, 1.0)u"nm", SVector(1.4, 1.6, 1.0)u"nm", ]) sys = System( - atoms=CuArray([Atom(σ=(0.4 / (2 ^ (1 / 6)))u"nm", ϵ=1.0u"kJ * mol^-1") for i in 1:3]), + atoms=array_type([Atom(σ=(0.4 / (2 ^ (1 / 6)))u"nm", ϵ=1.0u"kJ * mol^-1") for i in 1:3]), coords=coords, boundary=CubicBoundary(5.0u"nm"), pairwise_inters=(LennardJones(),), @@ -57,10 +57,12 @@ sim = SteepestDescentMinimizer(tol=1.0u"kJ * mol^-1 * nm^-1") simulate!(sys, sim) - dists = distances(sys.coords, sys.boundary) + dists = Array(distances(sys.coords, sys.boundary)) dists_flat = dists[triu(trues(3, 3), 1)] - @test all(x -> isapprox(x, 0.4u"nm"; atol=1e-3u"nm"), dists_flat) + + # GPU tolerances are more lenient (possibly for f32 shenanigans) + @test all(x -> isapprox(x, 0.4u"nm"; atol=1e-2u"nm"), dists_flat) @test isapprox(potential_energy(sys), -3.0u"kJ * mol^-1"; - atol=1e-4u"kJ * mol^-1") + atol=1e-2u"kJ * mol^-1") end end diff --git a/test/protein.jl b/test/protein.jl index 9d7ab007e..4327e37da 100644 --- a/test/protein.jl +++ b/test/protein.jl @@ -179,12 +179,12 @@ end @test pis_grad == sys_nounits.pairwise_inters # Test the same simulation on the GPU - if run_gpu_tests + for array_type in array_list[2:end] sys = System( joinpath(data_dir, "6mrr_equil.pdb"), ff; - velocities=CuArray(copy(velocities_start)), - gpu=true, + velocities=array_type(deepcopy(velocities_start)), + array_type = array_type, center_coords=false, ) @test kinetic_energy(sys) ≈ 65521.87288132431u"kJ * mol^-1" @@ -211,9 +211,9 @@ end sys_nounits = System( joinpath(data_dir, "6mrr_equil.pdb"), ff_nounits; - velocities=CuArray(copy(ustrip_vec.(velocities_start))), + velocities=array_type(deepcopy(ustrip_vec.(velocities_start))), units=false, - gpu=true, + array_type = array_type, center_coords=false, ) @test kinetic_energy(sys_nounits)u"kJ * mol^-1" ≈ 65521.87288132431u"kJ * mol^-1" @@ -248,13 +248,13 @@ end @testset "Implicit solvent" begin ff = MolecularForceField(joinpath.(ff_dir, ["ff99SBildn.xml", "his.xml"])...) 
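# Hedged sketch, not part of the patch: how the `array_list` used by the tests above is
# assumed to be assembled (see the test/runtests.jl changes in this series), and why the
# GPU-only test variants index it with array_list[2:end]. Array always comes first and a
# GPU array type is appended only when its backend is functional, so [2:end] keeps just
# the GPU backends. The coordinate data here is a placeholder.
using CUDA, AMDGPU, StaticArrays

array_list = (Array,)
if get(ENV, "GPUTESTS", "1") != "0" && CUDA.functional()
    array_list = (array_list..., CuArray)
end
if get(ENV, "GPUTESTS", "1") != "0" && AMDGPU.functional()
    array_list = (array_list..., ROCArray)
end

coords_cpu = [SVector(rand(3)...) for _ in 1:10]
for array_type in array_list[2:end]        # GPU backends only
    coords_dev = array_type(coords_cpu)    # copy the test data onto that device
end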
- for gpu in gpu_list + for array_type in array_list for solvent_model in ("obc2", "gbn2") sys = System( joinpath(data_dir, "6mrr_nowater.pdb"), ff; boundary=CubicBoundary(100.0u"nm"), - gpu=gpu, + array_type = array_type, dist_cutoff=5.0u"nm", dist_neighbors=5.0u"nm", implicit_solvent=solvent_model, diff --git a/test/runtests.jl b/test/runtests.jl index b18d4d73c..68cf4ce28 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -7,6 +7,8 @@ using AtomsCalculators.AtomsCalculatorsTesting import BioStructures # Imported to avoid clashing names using CUDA using Enzyme +using AMDGPU +using GPUArrays using FiniteDifferences using KernelDensity import SimpleCrystals @@ -34,7 +36,7 @@ if running_CI @warn "Some CPU gradient tests will not be run as this is CI" end -const run_visualize_tests = get(ENV, "VISTESTS", "1") != "0" +const run_visualize_tests = false#get(ENV, "VISTESTS", "1") != "0" if run_visualize_tests import GLMakie else @@ -50,17 +52,27 @@ else end # Allow CUDA device to be specified -const DEVICE = parse(Int, get(ENV, "DEVICE", "0")) +const DEVICE = 2#parse(Int, get(ENV, "DEVICE", "0")) -const run_gpu_tests = get(ENV, "GPUTESTS", "1") != "0" && CUDA.functional() -const gpu_list = (run_gpu_tests ? (false, true) : (false,)) -if run_gpu_tests +const run_cuda_tests = get(ENV, "GPUTESTS", "1") != "0" && CUDA.functional() +const run_rocm_tests = get(ENV, "GPUTESTS", "1") != "0" && AMDGPU.functional() + +array_list = (Array,) + +if run_cuda_tests + array_list = (array_list..., CuArray) device!(DEVICE) - @info "The GPU tests will be run on device $DEVICE" -elseif get(ENV, "GPUTESTS", "1") == "0" - @warn "The GPU tests will not be run as GPUTESTS is set to 0" + @info "The CUDA tests will be run on device $DEVICE" +else + @warn "The CUDA tests will not be run as a CUDA-enabled device is not available" +end + +if run_rocm_tests + array_list = (array_list..., ROCArray) + AMDGPU.device!(AMDGPU.device(DEVICE+1)) + @info "The ROCM tests will be run on device $DEVICE" else - @warn "The GPU tests will not be run as a CUDA-enabled device is not available" + @warn "The ROCM tests will not be run as a ROCM-enabled device is not available" end const data_dir = normpath(@__DIR__, "..", "data") diff --git a/test/simulation.jl b/test/simulation.jl index ebd99051b..8667ef050 100644 --- a/test/simulation.jl +++ b/test/simulation.jl @@ -565,7 +565,7 @@ end end @testset "Position restraints" begin - for gpu in gpu_list + for array_type in array_list n_atoms = 10 n_atoms_res = n_atoms ÷ 2 n_steps = 2_000 @@ -576,8 +576,8 @@ end sim = Langevin(dt=0.001u"ps", temperature=300.0u"K", friction=1.0u"ps^-1") sys = System( - atoms=(gpu ? CuArray(atoms) : atoms), - coords=(gpu ? CuArray(copy(starting_coords)) : copy(starting_coords)), + atoms=array_type(atoms), + coords=array_type(deepcopy(starting_coords)), boundary=boundary, atoms_data=atoms_data, pairwise_inters=(LennardJones(),), @@ -1067,15 +1067,14 @@ end vvand_baro = VelocityVerlet(dt=dt, coupling=(AndersenThermostat(temp, 1.0u"ps"), barostat)) for sim in (lang_baro, vvand_baro) - for gpu in gpu_list - if gpu && sim == vvand_baro + for array_type in array_list + if array_type <: AbstractGPUArray && sim == vvand_baro continue end - AT = gpu ? CuArray : Array sys = System( - atoms=AT(atoms), - coords=AT(copy(coords)), + atoms=array_type(atoms), + coords=array_type(deepcopy(coords)), boundary=boundary, pairwise_inters=(LennardJones(),), loggers=( @@ -1131,16 +1130,15 @@ end SVector(nothing , nothing , nothing ), # Uncoupled ) - for gpu in gpu_list - AT = gpu ? 
CuArray : Array + for array_type in array_list for (press_i, press) in enumerate(pressure_test_set) - if gpu && press_i != 2 + if array_type <: AbstractGPUArray && press_i != 2 continue end sys = System( - atoms=AT(atoms), - coords=AT(copy(coords)), + atoms=array_type(atoms), + coords=array_type(deepcopy(coords)), boundary=boundary, pairwise_inters=(LennardJones(),), loggers=( @@ -1200,16 +1198,15 @@ end MonteCarloMembraneBarostat(press, tens, temp, boundary; z_axis_fixed=true), ) - for gpu in gpu_list - AT = gpu ? CuArray : Array + for array_type in array_list for (barostat_i, barostat) in enumerate(barostat_test_set) - if gpu && barostat_i != 2 + if array_type <: AbstractGPUArray && barostat_i != 2 continue end sys = System( - atoms=AT(atoms), - coords=AT(copy(coords)), + atoms=array_type(atoms), + coords=array_type(deepcopy(coords)), boundary=boundary, pairwise_inters=(LennardJones(),), loggers=( @@ -1323,7 +1320,8 @@ end starting_coords_f32 = [Float32.(c) for c in starting_coords] starting_velocities_f32 = [Float32.(c) for c in starting_velocities] - function test_sim(nl::Bool, parallel::Bool, f32::Bool, gpu::Bool) + function test_sim(nl::Bool, parallel::Bool, f32::Bool, + array_type::Type{AT}) where AT <: AbstractArray n_atoms = 400 n_steps = 200 atom_mass = f32 ? 10.0f0u"g/mol" : 10.0u"g/mol" @@ -1333,9 +1331,9 @@ end r0 = f32 ? 0.2f0u"nm" : 0.2u"nm" bonds = [HarmonicBond(k=k, r0=r0) for i in 1:(n_atoms ÷ 2)] specific_inter_lists = (InteractionList2Atoms( - gpu ? CuArray(Int32.(collect(1:2:n_atoms))) : Int32.(collect(1:2:n_atoms)), - gpu ? CuArray(Int32.(collect(2:2:n_atoms))) : Int32.(collect(2:2:n_atoms)), - gpu ? CuArray(bonds) : bonds, + array_type(Int32.(collect(1:2:n_atoms))), + array_type(Int32.(collect(2:2:n_atoms))), + array_type(bonds), ),) neighbor_finder = NoNeighborFinder() @@ -1351,7 +1349,7 @@ end end if nl && !gpu neighbor_finder = DistanceNeighborFinder( - eligible=gpu ? CuArray(trues(n_atoms, n_atoms)) : trues(n_atoms, n_atoms), + eligible=array_type(trues(n_atoms, n_atoms)), n_steps=10, dist_cutoff=f32 ? 1.5f0u"nm" : 1.5u"nm", ) @@ -1359,17 +1357,10 @@ end end show(devnull, neighbor_finder) - if gpu - coords = CuArray(copy(f32 ? starting_coords_f32 : starting_coords)) - velocities = CuArray(copy(f32 ? starting_velocities_f32 : starting_velocities)) - atoms = CuArray([Atom(mass=atom_mass, charge=f32 ? 0.0f0 : 0.0, σ=f32 ? 0.2f0u"nm" : 0.2u"nm", - ϵ=f32 ? 0.2f0u"kJ * mol^-1" : 0.2u"kJ * mol^-1") for i in 1:n_atoms]) - else - coords = copy(f32 ? starting_coords_f32 : starting_coords) - velocities = copy(f32 ? starting_velocities_f32 : starting_velocities) - atoms = [Atom(mass=atom_mass, charge=f32 ? 0.0f0 : 0.0, σ=f32 ? 0.2f0u"nm" : 0.2u"nm", - ϵ=f32 ? 0.2f0u"kJ * mol^-1" : 0.2u"kJ * mol^-1") for i in 1:n_atoms] - end + coords = array_type(deepcopy(f32 ? starting_coords_f32 : starting_coords)) + velocities = array_type(deepcopy(f32 ? starting_velocities_f32 : starting_velocities)) + atoms = array_type([Atom(charge=f32 ? 0.0f0 : 0.0, mass=atom_mass, σ=f32 ? 0.2f0u"nm" : 0.2u"nm", + ϵ=f32 ? 0.2f0u"kJ * mol^-1" : 0.2u"kJ * mol^-1") for i in 1:n_atoms]) s = System( atoms=atoms, @@ -1381,7 +1372,7 @@ end neighbor_finder=neighbor_finder, ) - @test is_on_gpu(s) == gpu + @test is_on_gpu(s) == (array_type <: AbstractGPUArray) @test float_type(s) == (f32 ? Float32 : Float64) n_threads = parallel ? 
Threads.nthreads() : 1 @@ -1392,24 +1383,31 @@ end end runs = [ - ("CPU" , [false, false, false, false]), - ("CPU f32" , [false, false, true , false]), - ("CPU NL" , [true , false, false, false]), - ("CPU f32 NL", [true , false, true , false]), + ("CPU" , [false, false, false, Array]), + ("CPU f32" , [false, false, true , Array]), + ("CPU NL" , [true , false, false, Array]), + ("CPU f32 NL", [true , false, true , Array]), ] if run_parallel_tests - push!(runs, ("CPU parallel" , [false, true , false, false])) - push!(runs, ("CPU parallel f32" , [false, true , true , false])) - push!(runs, ("CPU parallel NL" , [true , true , false, false])) - push!(runs, ("CPU parallel f32 NL", [true , true , true , false])) + push!(runs, ("CPU parallel" , [false, true , false, Array])) + push!(runs, ("CPU parallel f32" , [false, true , true , Array])) + push!(runs, ("CPU parallel NL" , [true , true , false, Array])) + push!(runs, ("CPU parallel f32 NL", [true , true , true , Array])) end - if run_gpu_tests - push!(runs, ("GPU" , [false, false, false, true])) - push!(runs, ("GPU f32" , [false, false, true , true])) - push!(runs, ("GPU NL" , [true , false, false, true])) - push!(runs, ("GPU f32 NL", [true , false, true , true])) + if run_cuda_tests + push!(runs, ("GPU" , [false, false, false, CuArray])) + push!(runs, ("GPU f32" , [false, false, true , CuArray])) + push!(runs, ("GPU NL" , [true , false, false, CuArray])) + push!(runs, ("GPU f32 NL", [true , false, true , CuArray])) + end + if run_rocm_tests + push!(runs, ("GPU" , [false, false, false, ROCArray])) + push!(runs, ("GPU f32" , [false, false, true , ROCArray])) + push!(runs, ("GPU NL" , [true , false, false, ROCArray])) + push!(runs, ("GPU f32 NL", [true , false, true , ROCArray])) end + final_coords_ref, E_start_ref = test_sim(runs[1][2]...) # Check all simulations give the same result to within some error for (name, args) in runs From 60a532d9868024939350b41a738645045df90536 Mon Sep 17 00:00:00 2001 From: James Schloss Date: Fri, 17 Jan 2025 14:01:58 +0100 Subject: [PATCH 02/24] modifying all gpu = {true | false} statements in docs --- docs/src/documentation.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/src/documentation.md b/docs/src/documentation.md index 45d2bf383..58ac03fd0 100644 --- a/docs/src/documentation.md +++ b/docs/src/documentation.md @@ -327,7 +327,7 @@ sys = System( energy=TotalEnergyLogger(10), writer=StructureWriter(10, "traj_6mrr_1ps.pdb", ["HOH"]), ), - gpu=false, + array_type=Array, ) minimizer = SteepestDescentMinimizer() @@ -363,7 +363,7 @@ Residue patches, virtual sites, file includes and any force types other than `Ha Some PDB files that read in fine can be found [here](https://github.com/greener-group/GB99dms/tree/main/structures/training/conf_1). -To run on the GPU, set `gpu=true`. +To run on the GPU, set `array_type=GPUArrayType`, where `GPUArrayType` is the array type for your GPU backend (for example, `CuArray` for NVIDIA or `ROCArray` for AMD). You can use an implicit solvent method by giving the `implicit_solvent` keyword argument to [`System`](@ref). The options are `"obc1"`, `"obc2"` and `"gbn2"`, corresponding to the Onufriev-Bashford-Case GBSA model with parameter set I or II and the GB-Neck2 model. Other options include overriding the boundary dimensions in the file (`boundary`) and modifying the non-bonded interaction and neighbor list cutoff distances (`dist_cutoff` and `dist_neighbors`). 
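For illustration, a minimal sketch of the new keyword argument described above (the force field and PDB file names are placeholders, and the relevant GPU package must be loaded for its array type to be available):

```julia
using Molly
using CUDA      # provides CuArray (NVIDIA); use AMDGPU and ROCArray for AMD GPUs

ff = MolecularForceField("ff99SBildn.xml", "tip3p_standard.xml", "his.xml")

sys_cpu = System("protein.pdb", ff)                         # CPU (default, array_type=Array)
sys_gpu = System("protein.pdb", ff; array_type=CuArray)     # NVIDIA GPU
# sys_amd = System("protein.pdb", ff; array_type=ROCArray)  # AMD GPU
```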
From 7fbda307eb2b63cabe1395357635a81457a80b68 Mon Sep 17 00:00:00 2001 From: James Schloss Date: Fri, 17 Jan 2025 15:44:20 +0100 Subject: [PATCH 03/24] some tests are running --- ext/MollyCUDAExt.jl | 1 + src/energy.jl | 1 + src/kernels.jl | 8 ++++---- src/setup.jl | 4 ++-- test/basic.jl | 2 +- test/runtests.jl | 2 +- 6 files changed, 10 insertions(+), 8 deletions(-) diff --git a/ext/MollyCUDAExt.jl b/ext/MollyCUDAExt.jl index 0adc59795..ddedb1bdc 100644 --- a/ext/MollyCUDAExt.jl +++ b/ext/MollyCUDAExt.jl @@ -1233,3 +1233,4 @@ function specific_pe_4_atoms_kernel!(energy, coords_var, velocities_var, atoms_v end return nothing end +end diff --git a/src/energy.jl b/src/energy.jl index 7427cef2d..f842ab852 100644 --- a/src/energy.jl +++ b/src/energy.jl @@ -257,6 +257,7 @@ function potential_energy(sys::System{D, AT, T}, neighbors, step_n::Integer=0; n_threads::Integer=Threads.nthreads()) where {D, AT <: AbstractGPUArray, T} n_atoms = length(sys) val_ft = Val(T) + pe_vec_nounits = KernelAbstractions.zeros(get_backend(sys.coords), T, 1) buffers = init_forces_buffer!(sys, ustrip_vec.(zero(sys.coords)), 1) pairwise_inters_nonl = filter(!use_neighbors, values(sys.pairwise_inters)) diff --git a/src/kernels.jl b/src/kernels.jl index 1863ea74e..a7ea220dc 100644 --- a/src/kernels.jl +++ b/src/kernels.jl @@ -227,12 +227,12 @@ end end end -function pairwise_pe_gpu!(pe_vec_nounits, coords::AbstractArray{SVector{D, C}}, velocities, atoms, boundary, - pairwise_inters, nbs, step_n, energy_units, ::Val{T}) where {D, C, T} - backend = get_backend(coords) +function pairwise_pe_gpu!(pe_vec_nounits, buffers, sys::System{D, AT, T}, + pairwise_inters, nbs, step_n) where {D, AT <: AbstractGPUArray, T} + backend = get_backend(sys.coords) n_threads_gpu = gpu_threads_pairwise(length(nbs)) kernel! = pairwise_pe_kernel!(backend, n_threads_gpu) - kernel!(pe_vec_nounits, coords, velocities, atoms, boundary, pairwise_inters, nbs, step_n, Val(energy_units); ndrange = length(nbs)) + kernel!(pe_vec_nounits, sys.coords, sys.velocities, sys.atoms, sys.boundary, pairwise_inters, nbs, step_n, Val(energy_units); ndrange = length(nbs)) return pe_vec_nounits end diff --git a/src/setup.jl b/src/setup.jl index ad75f9164..95ff28eb3 100644 --- a/src/setup.jl +++ b/src/setup.jl @@ -888,7 +888,7 @@ function System(coord_file::AbstractString, coords = wrap_coords.(coords, (boundary_used,)) if (array_type <: AbstractGPUArray) - neighbor_finder = DistanceNeighborFinder( + neighbor_finder = GPUNeighborFinder( eligible=array_type(eligible), dist_cutoff=T(dist_neighbors), special=array_type(special), @@ -1281,7 +1281,7 @@ function System(T::Type, specific_inter_lists = tuple(specific_inter_array...) if array_type <: AbstractGPUArray - neighbor_finder = DistanceNeighborFinder( + neighbor_finder = GPUNeighborFinder( eligible=array_type(eligible), dist_cutoff=T(dist_neighbors), special=array_type(special), diff --git a/test/basic.jl b/test/basic.jl index 61dfa18a6..fe24454d7 100644 --- a/test/basic.jl +++ b/test/basic.jl @@ -191,7 +191,7 @@ ) else sys.neighbor_finder = DistanceNeighborFinder( - eligible=array_type(no_nbs), + eligible=(array_type <: AbstractGPUArray ? 
array_type(no_nbs) : no_nbs), dist_cutoff=1.0u"nm", ) end diff --git a/test/runtests.jl b/test/runtests.jl index 68cf4ce28..8d12c38bb 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -69,7 +69,7 @@ end if run_rocm_tests array_list = (array_list..., ROCArray) - AMDGPU.device!(AMDGPU.device(DEVICE+1)) + AMDGPU.device!(AMDGPU.device(DEVICE)) @info "The ROCM tests will be run on device $DEVICE" else @warn "The ROCM tests will not be run as a ROCM-enabled device is not available" From 865f4d8bee11966f0fb79fae91264a4cb3dac5de Mon Sep 17 00:00:00 2001 From: Joe Greener Date: Thu, 23 Jan 2025 15:14:05 +0000 Subject: [PATCH 04/24] pass neighbors to force/pe functions --- src/cuda.jl | 119 ++++++++++++++++++++++++++----------------------- src/energy.jl | 2 +- src/force.jl | 2 +- src/kernels.jl | 21 +++++---- 4 files changed, 78 insertions(+), 66 deletions(-) diff --git a/src/cuda.jl b/src/cuda.jl index e751f51a5..93118131e 100644 --- a/src/cuda.jl +++ b/src/cuda.jl @@ -29,71 +29,78 @@ function cuda_threads_blocks_specific(n_inters) return n_threads_gpu, n_blocks end -function pairwise_force_gpu!(buffers, sys::System{D, true, T}, pairwise_inters, nbs, step_n) where {D, T} - if typeof(nbs) == NoNeighborList - kernel = @cuda launch=false pairwise_force_kernel_nonl!( - buffers.fs_mat, sys.coords, sys.velocities, sys.atoms, sys.boundary, pairwise_inters, step_n, - Val(D), Val(sys.force_units)) - conf = launch_configuration(kernel.fun) - threads_basic = parse(Int, get(ENV, "MOLLY_GPUNTHREADS_PAIRWISE", "512")) - nthreads = min(length(sys.atoms), threads_basic, conf.threads) - nthreads = cld(nthreads, WARPSIZE) * WARPSIZE - n_blocks_i = cld(length(sys.atoms), WARPSIZE) - n_blocks_j = cld(length(sys.atoms), nthreads) - kernel(buffers.fs_mat, sys.coords, sys.velocities, sys.atoms, sys.boundary, pairwise_inters, - step_n, Val(D), Val(sys.force_units); threads=nthreads, - blocks=(n_blocks_i, n_blocks_j)) - else - N = length(sys.coords) - n_blocks = cld(N, WARPSIZE) - r_cut = sys.neighbor_finder.dist_cutoff - if step_n % sys.neighbor_finder.n_steps_reorder == 0 || !sys.neighbor_finder.initialized - Morton_bits = 4 - w = r_cut - typeof(ustrip(r_cut))(0.1) * unit(r_cut) - Morton_seq_cpu = sorted_Morton_seq(Array(sys.coords), w, Morton_bits) - copyto!(buffers.Morton_seq, Morton_seq_cpu) - CUDA.@sync @cuda blocks=(cld(N, WARPSIZE),) threads=(32,) kernel_min_max!( - buffers.Morton_seq, buffers.box_mins, buffers.box_maxs, sys.coords, Val(N), - sys.boundary, Val(D)) - sys.neighbor_finder.initialized = true - CUDA.@sync @cuda blocks=(n_blocks, n_blocks) threads=(32, 1) always_inline=true compress_boolean_matrices!( - buffers.Morton_seq, sys.neighbor_finder.eligible, sys.neighbor_finder.special, - buffers.compressed_eligible, buffers.compressed_special, Val(N)) - end - CUDA.@sync @cuda blocks=(n_blocks, n_blocks) threads=(32, 1) always_inline=true force_kernel!( - buffers.Morton_seq, buffers.fs_mat, buffers.box_mins, buffers.box_maxs, sys.coords, - sys.velocities, sys.atoms, Val(N), r_cut, Val(sys.force_units), pairwise_inters, - sys.boundary, step_n, buffers.compressed_special, buffers.compressed_eligible, - Val(T), Val(D)) - end +function pairwise_force_gpu!(buffers, sys::System{D, true, T}, pairwise_inters, nbs::NoNeighborList, + step_n) where {D, T} + kernel = @cuda launch=false pairwise_force_kernel_nonl!( + buffers.fs_mat, sys.coords, sys.velocities, sys.atoms, sys.boundary, pairwise_inters, step_n, + Val(D), Val(sys.force_units)) + conf = launch_configuration(kernel.fun) + threads_basic = parse(Int, 
get(ENV, "MOLLY_GPUNTHREADS_PAIRWISE", "512")) + nthreads = min(length(sys.atoms), threads_basic, conf.threads) + nthreads = cld(nthreads, WARPSIZE) * WARPSIZE + n_blocks_i = cld(length(sys.atoms), WARPSIZE) + n_blocks_j = cld(length(sys.atoms), nthreads) + kernel(buffers.fs_mat, sys.coords, sys.velocities, sys.atoms, sys.boundary, pairwise_inters, + step_n, Val(D), Val(sys.force_units); threads=nthreads, + blocks=(n_blocks_i, n_blocks_j)) return buffers end -function pairwise_pe_gpu!(pe_vec_nounits, buffers, sys::System{D, true, T}, pairwise_inters, nbs, step_n) where {D, T} - if typeof(nbs) == NoNeighborList - n_threads_gpu, n_blocks = cuda_threads_blocks_pairwise(length(nbs)) - CUDA.@sync @cuda threads=n_threads_gpu blocks=n_blocks pairwise_pe_kernel!( - pe_vec_nounits, sys.coords, sys.velocities, sys.atoms, sys.boundary, pairwise_inters, nbs, step_n, Val(sys.energy_units)) - else - # The ordering is always recomputed for potential energy - # Different buffers are used to the forces case, so sys.neighbor_finder.initialized - # is not updated - N = length(sys.coords) - n_blocks = cld(N, WARPSIZE) - r_cut = sys.neighbor_finder.dist_cutoff +function pairwise_force_gpu!(buffers, sys::System{D, true, T}, pairwise_inters, nbs::Nothing, + step_n) where {D, T} + N = length(sys.coords) + n_blocks = cld(N, WARPSIZE) + r_cut = sys.neighbor_finder.dist_cutoff + if step_n % sys.neighbor_finder.n_steps_reorder == 0 || !sys.neighbor_finder.initialized Morton_bits = 4 w = r_cut - typeof(ustrip(r_cut))(0.1) * unit(r_cut) Morton_seq_cpu = sorted_Morton_seq(Array(sys.coords), w, Morton_bits) copyto!(buffers.Morton_seq, Morton_seq_cpu) CUDA.@sync @cuda blocks=(cld(N, WARPSIZE),) threads=(32,) kernel_min_max!( - buffers.Morton_seq, buffers.box_mins, buffers.box_maxs, sys.coords, - Val(N), sys.boundary, Val(D)) - CUDA.@sync @cuda blocks=(n_blocks, n_blocks) threads=(32, 1) always_inline=true energy_kernel!( - buffers.Morton_seq, pe_vec_nounits, buffers.box_mins, buffers.box_maxs, sys.coords, - sys.velocities, sys.atoms, Val(N), r_cut, Val(sys.energy_units), pairwise_inters, - sys.boundary, step_n, sys.neighbor_finder.special, sys.neighbor_finder.eligible, - Val(T), Val(D)) + buffers.Morton_seq, buffers.box_mins, buffers.box_maxs, sys.coords, Val(N), + sys.boundary, Val(D)) + sys.neighbor_finder.initialized = true + CUDA.@sync @cuda blocks=(n_blocks, n_blocks) threads=(32, 1) always_inline=true compress_boolean_matrices!( + buffers.Morton_seq, sys.neighbor_finder.eligible, sys.neighbor_finder.special, + buffers.compressed_eligible, buffers.compressed_special, Val(N)) end + CUDA.@sync @cuda blocks=(n_blocks, n_blocks) threads=(32, 1) always_inline=true force_kernel!( + buffers.Morton_seq, buffers.fs_mat, buffers.box_mins, buffers.box_maxs, sys.coords, + sys.velocities, sys.atoms, Val(N), r_cut, Val(sys.force_units), pairwise_inters, + sys.boundary, step_n, buffers.compressed_special, buffers.compressed_eligible, + Val(T), Val(D)) + return buffers +end + +function pairwise_pe_gpu!(pe_vec_nounits, buffers, sys::System{D, true, T}, pairwise_inters, + nbs::NoNeighborList, step_n) where {D, T} + n_threads_gpu, n_blocks = cuda_threads_blocks_pairwise(length(nbs)) + CUDA.@sync @cuda threads=n_threads_gpu blocks=n_blocks pairwise_pe_kernel!( + pe_vec_nounits, sys.coords, sys.velocities, sys.atoms, sys.boundary, pairwise_inters, + nbs, step_n, Val(sys.energy_units)) + return pe_vec_nounits +end + +function pairwise_pe_gpu!(pe_vec_nounits, buffers, sys::System{D, true, T}, pairwise_inters, + nbs::Nothing, step_n) where 
{D, T} + # The ordering is always recomputed for potential energy + # Different buffers are used to the forces case, so sys.neighbor_finder.initialized + # is not updated + N = length(sys.coords) + n_blocks = cld(N, WARPSIZE) + r_cut = sys.neighbor_finder.dist_cutoff + Morton_bits = 4 + w = r_cut - typeof(ustrip(r_cut))(0.1) * unit(r_cut) + Morton_seq_cpu = sorted_Morton_seq(Array(sys.coords), w, Morton_bits) + copyto!(buffers.Morton_seq, Morton_seq_cpu) + CUDA.@sync @cuda blocks=(cld(N, WARPSIZE),) threads=(32,) kernel_min_max!( + buffers.Morton_seq, buffers.box_mins, buffers.box_maxs, sys.coords, + Val(N), sys.boundary, Val(D)) + CUDA.@sync @cuda blocks=(n_blocks, n_blocks) threads=(32, 1) always_inline=true energy_kernel!( + buffers.Morton_seq, pe_vec_nounits, buffers.box_mins, buffers.box_maxs, sys.coords, + sys.velocities, sys.atoms, Val(N), r_cut, Val(sys.energy_units), pairwise_inters, + sys.boundary, step_n, sys.neighbor_finder.special, sys.neighbor_finder.eligible, + Val(T), Val(D)) return pe_vec_nounits end diff --git a/src/energy.jl b/src/energy.jl index f842ab852..be8d5cdbe 100644 --- a/src/energy.jl +++ b/src/energy.jl @@ -268,7 +268,7 @@ function potential_energy(sys::System{D, AT, T}, neighbors, step_n::Integer=0; pairwise_inters_nl = filter(use_neighbors, values(sys.pairwise_inters)) if length(pairwise_inters_nl) > 0 - pairwise_pe_gpu!(pe_vec_nounits, buffers, sys, pairwise_inters_nl, nothing, step_n) + pairwise_pe_gpu!(pe_vec_nounits, buffers, sys, pairwise_inters_nl, neighbors, step_n) end for inter_list in values(sys.specific_inter_lists) diff --git a/src/force.jl b/src/force.jl index 1edd12b03..bf1adf886 100644 --- a/src/force.jl +++ b/src/force.jl @@ -383,7 +383,7 @@ function forces_nounits!(fs_nounits, sys::System{D, AT, T}, neighbors, pairwise_inters_nl = filter(use_neighbors, values(sys.pairwise_inters)) if length(pairwise_inters_nl) > 0 - pairwise_force_gpu!(buffers, sys, pairwise_inters_nl, nothing, step_n) + pairwise_force_gpu!(buffers, sys, pairwise_inters_nl, neighbors, step_n) end for inter_list in values(sys.specific_inter_lists) diff --git a/src/kernels.jl b/src/kernels.jl index a7ea220dc..1aca5f16d 100644 --- a/src/kernels.jl +++ b/src/kernels.jl @@ -32,13 +32,14 @@ function gpu_threads_specific(n_inters) end function pairwise_force_gpu!(buffers, sys::System{D, AT, T}, - pairwise_inters, nbs, step_n) where {D, AT <: AbstractGPUArray, T} + pairwise_inters, neighbors, step_n) where {D, AT <: AbstractGPUArray, T} backend = get_backend(coords) - if typeof(nbs) == NoNeighborList + if typeof(neighbors) == NoNeighborList n_threads_gpu = gpu_threads_pairwise(length(atoms)) kernel! = pairwise_force_kernel_nonl!(backend, n_threads_gpu) kernel!(buffers.fs_mat, sys.coords, sys.velocities, sys.atoms, sys.boundary, pairwise_inters, step_n, Val(D), Val(force_units); ndrange = length(atoms)) - else + elseif length(neighbors) > 0 + nbs = @view neighbors.list[1:neighbors.n] n_threads_gpu = gpu_threads_pairwise(length(nbs)) kernel! = pairwise_force_kernel_nl!(backend, n_threads_gpu) kernel!(buffers.fs_mat, sys.coords, sys.velocities, sys.atoms, sys.boundary, pairwise_inters, @@ -228,11 +229,15 @@ end end function pairwise_pe_gpu!(pe_vec_nounits, buffers, sys::System{D, AT, T}, - pairwise_inters, nbs, step_n) where {D, AT <: AbstractGPUArray, T} - backend = get_backend(sys.coords) - n_threads_gpu = gpu_threads_pairwise(length(nbs)) - kernel! 
= pairwise_pe_kernel!(backend, n_threads_gpu) - kernel!(pe_vec_nounits, sys.coords, sys.velocities, sys.atoms, sys.boundary, pairwise_inters, nbs, step_n, Val(energy_units); ndrange = length(nbs)) + pairwise_inters, neighbors, step_n) where {D, AT <: AbstractGPUArray, T} + if length(neighbors) > 0 + backend = get_backend(sys.coords) + nbs = @view neighbors.list[1:neighbors.n] + n_threads_gpu = gpu_threads_pairwise(length(nbs)) + kernel! = pairwise_pe_kernel!(backend, n_threads_gpu) + kernel!(pe_vec_nounits, sys.coords, sys.velocities, sys.atoms, sys.boundary, + pairwise_inters, nbs, step_n, Val(energy_units); ndrange=length(nbs)) + end return pe_vec_nounits end From 09ec1306455d51f7ebe28aa5664fd5740c0bd340 Mon Sep 17 00:00:00 2001 From: Joe Greener Date: Thu, 23 Jan 2025 15:20:49 +0000 Subject: [PATCH 05/24] move all CUDA code to extension --- ext/MollyCUDAExt.jl | 114 ++-- src/cuda.jl | 1248 ------------------------------------------- 2 files changed, 68 insertions(+), 1294 deletions(-) delete mode 100644 src/cuda.jl diff --git a/ext/MollyCUDAExt.jl b/ext/MollyCUDAExt.jl index ddedb1bdc..2c9e7afa6 100644 --- a/ext/MollyCUDAExt.jl +++ b/ext/MollyCUDAExt.jl @@ -5,11 +5,10 @@ using CUDA using Atomix using KernelAbstractions -CUDA.Const(nl::Molly.NoNeighborList) = nl - -# CUDA.jl kernels const WARPSIZE = UInt32(32) +CUDA.Const(nl::Molly.NoNeighborList) = nl + macro shfl_multiple_sync(mask, target, width, vars...) all_lines = map(vars) do v Expr(:(=), v, @@ -38,57 +37,78 @@ function cuda_threads_blocks_specific(n_inters) return n_threads_gpu, n_blocks end -function pairwise_force_gpu!(buffers, sys::System{D, AT, T}, pairwise_inters, nbs, step_n) where {D, AT <: CuArray, T} - if typeof(nbs) == NoNeighborList - kernel = @cuda launch=false pairwise_force_kernel_nonl!( - buffers.fs_mat, sys.coords, sys.velocities, sys.atoms, sys.boundary, pairwise_inters, step_n, - Val(D), Val(sys.force_units)) - conf = launch_configuration(kernel.fun) - threads_basic = parse(Int, get(ENV, "MOLLY_GPUNTHREADS_PAIRWISE", "512")) - nthreads = min(length(sys.atoms), threads_basic, conf.threads) - nthreads = cld(nthreads, WARPSIZE) * WARPSIZE - n_blocks_i = cld(length(sys.atoms), WARPSIZE) - n_blocks_j = cld(length(sys.atoms), nthreads) - kernel(buffers.fs_mat, sys.coords, sys.velocities, sys.atoms, sys.boundary, pairwise_inters, step_n, Val(D), - Val(sys.force_units); threads=nthreads, blocks=(n_blocks_i, n_blocks_j)) - else - N = length(sys.coords) - n_blocks = cld(N, WARPSIZE) - r_cut = sys.neighbor_finder.dist_cutoff - if step_n % sys.neighbor_finder.n_steps_reorder == 0 || !sys.neighbor_finder.initialized - Morton_bits = 4 - w = r_cut - typeof(ustrip(r_cut))(0.1) * unit(r_cut) - Morton_seq_cpu = sorted_Morton_seq(Array(sys.coords), w, Morton_bits) - copyto!(buffers.Morton_seq, Morton_seq_cpu) - CUDA.@sync @cuda blocks=(cld(N, WARPSIZE),) threads=(32,) kernel_min_max!(buffers.Morton_seq, buffers.box_mins, buffers.box_maxs, sys.coords, Val(N), sys.boundary, Val(D)) - sys.neighbor_finder.initialized = true - CUDA.@sync @cuda blocks=(n_blocks, n_blocks) threads=(32, 1) always_inline=true compress_boolean_matrices!(buffers.Morton_seq, - sys.neighbor_finder.eligible, sys.neighbor_finder.special, buffers.compressed_eligible, buffers.compressed_special, Val(N)) - end - CUDA.@sync @cuda blocks=(n_blocks, n_blocks) threads=(32, 1) always_inline=true force_kernel!(buffers.Morton_seq, buffers.fs_mat, buffers.box_mins, buffers.box_maxs, sys.coords, sys.velocities, sys.atoms, Val(N), r_cut, Val(sys.force_units), 
pairwise_inters, sys.boundary, step_n, buffers.compressed_special, buffers.compressed_eligible, Val(T), Val(D)) - end +function pairwise_force_gpu!(buffers, sys::System{D, AT, T}, pairwise_inters, nbs::NoNeighborList, + step_n) where {D, AT <: CuArray, T} + kernel = @cuda launch=false pairwise_force_kernel_nonl!( + buffers.fs_mat, sys.coords, sys.velocities, sys.atoms, sys.boundary, pairwise_inters, step_n, + Val(D), Val(sys.force_units)) + conf = launch_configuration(kernel.fun) + threads_basic = parse(Int, get(ENV, "MOLLY_GPUNTHREADS_PAIRWISE", "512")) + nthreads = min(length(sys.atoms), threads_basic, conf.threads) + nthreads = cld(nthreads, WARPSIZE) * WARPSIZE + n_blocks_i = cld(length(sys.atoms), WARPSIZE) + n_blocks_j = cld(length(sys.atoms), nthreads) + kernel(buffers.fs_mat, sys.coords, sys.velocities, sys.atoms, sys.boundary, pairwise_inters, + step_n, Val(D), Val(sys.force_units); threads=nthreads, + blocks=(n_blocks_i, n_blocks_j)) return buffers end -function pairwise_pe_gpu!(pe_vec_nounits, buffers, sys::System{D, AT, T}, pairwise_inters, nbs, step_n) where {D, AT <: CuArray, T} - if typeof(nbs) == NoNeighborList - n_threads_gpu, n_blocks = cuda_threads_blocks_pairwise(length(nbs)) - CUDA.@sync @cuda threads=n_threads_gpu blocks=n_blocks pairwise_pe_kernel!( - pe_vec_nounits, sys.coords, sys.velocities, sys.atoms, sys.boundary, pairwise_inters, nbs, step_n, Val(sys.energy_units)) - else - N = length(sys.coords) - n_blocks = cld(N, WARPSIZE) - r_cut = sys.neighbor_finder.dist_cutoff +function pairwise_force_gpu!(buffers, sys::System{D, AT, T}, pairwise_inters, nbs::Nothing, + step_n) where {D, AT <: CuArray, T} + N = length(sys.coords) + n_blocks = cld(N, WARPSIZE) + r_cut = sys.neighbor_finder.dist_cutoff + if step_n % sys.neighbor_finder.n_steps_reorder == 0 || !sys.neighbor_finder.initialized Morton_bits = 4 w = r_cut - typeof(ustrip(r_cut))(0.1) * unit(r_cut) Morton_seq_cpu = sorted_Morton_seq(Array(sys.coords), w, Morton_bits) copyto!(buffers.Morton_seq, Morton_seq_cpu) - CUDA.@sync @cuda blocks=(cld(N, WARPSIZE),) threads=(32,) kernel_min_max!(buffers.Morton_seq, buffers.box_mins, buffers.box_maxs, sys.coords, Val(N), sys.boundary, Val(D)) + CUDA.@sync @cuda blocks=(cld(N, WARPSIZE),) threads=(32,) kernel_min_max!( + buffers.Morton_seq, buffers.box_mins, buffers.box_maxs, sys.coords, Val(N), + sys.boundary, Val(D)) sys.neighbor_finder.initialized = true - CUDA.@sync @cuda blocks=(n_blocks, n_blocks) threads=(32, 1) always_inline=true energy_kernel!(buffers.Morton_seq, - pe_vec_nounits, buffers.box_mins, buffers.box_maxs, sys.coords, sys.velocities, sys.atoms, Val(N), r_cut, Val(sys.energy_units), - pairwise_inters, sys.boundary, step_n, sys.neighbor_finder.special, sys.neighbor_finder.eligible, Val(T), Val(D)) + CUDA.@sync @cuda blocks=(n_blocks, n_blocks) threads=(32, 1) always_inline=true compress_boolean_matrices!( + buffers.Morton_seq, sys.neighbor_finder.eligible, sys.neighbor_finder.special, + buffers.compressed_eligible, buffers.compressed_special, Val(N)) end + CUDA.@sync @cuda blocks=(n_blocks, n_blocks) threads=(32, 1) always_inline=true force_kernel!( + buffers.Morton_seq, buffers.fs_mat, buffers.box_mins, buffers.box_maxs, sys.coords, + sys.velocities, sys.atoms, Val(N), r_cut, Val(sys.force_units), pairwise_inters, + sys.boundary, step_n, buffers.compressed_special, buffers.compressed_eligible, + Val(T), Val(D)) + return buffers +end + +function pairwise_pe_gpu!(pe_vec_nounits, buffers, sys::System{D, AT, T}, pairwise_inters, + nbs::NoNeighborList, step_n) 
where {D, AT <: CuArray, T} + n_threads_gpu, n_blocks = cuda_threads_blocks_pairwise(length(nbs)) + CUDA.@sync @cuda threads=n_threads_gpu blocks=n_blocks pairwise_pe_kernel!( + pe_vec_nounits, sys.coords, sys.velocities, sys.atoms, sys.boundary, pairwise_inters, + nbs, step_n, Val(sys.energy_units)) + return pe_vec_nounits +end + +function pairwise_pe_gpu!(pe_vec_nounits, buffers, sys::System{D, AT, T}, pairwise_inters, + nbs::Nothing, step_n) where {D, AT <: CuArray, T} + # The ordering is always recomputed for potential energy + # Different buffers are used to the forces case, so sys.neighbor_finder.initialized + # is not updated + N = length(sys.coords) + n_blocks = cld(N, WARPSIZE) + r_cut = sys.neighbor_finder.dist_cutoff + Morton_bits = 4 + w = r_cut - typeof(ustrip(r_cut))(0.1) * unit(r_cut) + Morton_seq_cpu = sorted_Morton_seq(Array(sys.coords), w, Morton_bits) + copyto!(buffers.Morton_seq, Morton_seq_cpu) + CUDA.@sync @cuda blocks=(cld(N, WARPSIZE),) threads=(32,) kernel_min_max!( + buffers.Morton_seq, buffers.box_mins, buffers.box_maxs, sys.coords, + Val(N), sys.boundary, Val(D)) + CUDA.@sync @cuda blocks=(n_blocks, n_blocks) threads=(32, 1) always_inline=true energy_kernel!( + buffers.Morton_seq, pe_vec_nounits, buffers.box_mins, buffers.box_maxs, sys.coords, + sys.velocities, sys.atoms, Val(N), r_cut, Val(sys.energy_units), pairwise_inters, + sys.boundary, step_n, sys.neighbor_finder.special, sys.neighbor_finder.eligible, + Val(T), Val(D)) return pe_vec_nounits end @@ -213,7 +233,8 @@ function kernel_min_max!( return nothing end -function compress_boolean_matrices!(sorted_seq, eligible_matrix, special_matrix, compressed_eligible, compressed_special, ::Val{N}) where N +function compress_boolean_matrices!(sorted_seq, eligible_matrix, special_matrix, + compressed_eligible, compressed_special, ::Val{N}) where N a = Int32(1) n_blocks = Int32(ceil(N / 32)) @@ -1233,4 +1254,5 @@ function specific_pe_4_atoms_kernel!(energy, coords_var, velocities_var, atoms_v end return nothing end + end diff --git a/src/cuda.jl b/src/cuda.jl deleted file mode 100644 index 93118131e..000000000 --- a/src/cuda.jl +++ /dev/null @@ -1,1248 +0,0 @@ -# CUDA.jl kernels -const WARPSIZE = UInt32(32) - -macro shfl_multiple_sync(mask, target, width, vars...) 
- all_lines = map(vars) do v - Expr(:(=), v, - Expr(:call, :shfl_sync, - mask, v, target, width - ) - ) - end - return esc(Expr(:block, all_lines...)) -end - -CUDA.shfl_recurse(op, x::Quantity) = op(x.val) * unit(x) -CUDA.shfl_recurse(op, x::SVector{1, C}) where C = SVector{1, C}(op(x[1])) -CUDA.shfl_recurse(op, x::SVector{2, C}) where C = SVector{2, C}(op(x[1]), op(x[2])) -CUDA.shfl_recurse(op, x::SVector{3, C}) where C = SVector{3, C}(op(x[1]), op(x[2]), op(x[3])) - -function cuda_threads_blocks_pairwise(n_neighbors) - n_threads_gpu = min(n_neighbors, parse(Int, get(ENV, "MOLLY_GPUNTHREADS_PAIRWISE", "512"))) - n_blocks = cld(n_neighbors, n_threads_gpu) - return n_threads_gpu, n_blocks -end - -function cuda_threads_blocks_specific(n_inters) - n_threads_gpu = parse(Int, get(ENV, "MOLLY_GPUNTHREADS_SPECIFIC", "128")) - n_blocks = cld(n_inters, n_threads_gpu) - return n_threads_gpu, n_blocks -end - -function pairwise_force_gpu!(buffers, sys::System{D, true, T}, pairwise_inters, nbs::NoNeighborList, - step_n) where {D, T} - kernel = @cuda launch=false pairwise_force_kernel_nonl!( - buffers.fs_mat, sys.coords, sys.velocities, sys.atoms, sys.boundary, pairwise_inters, step_n, - Val(D), Val(sys.force_units)) - conf = launch_configuration(kernel.fun) - threads_basic = parse(Int, get(ENV, "MOLLY_GPUNTHREADS_PAIRWISE", "512")) - nthreads = min(length(sys.atoms), threads_basic, conf.threads) - nthreads = cld(nthreads, WARPSIZE) * WARPSIZE - n_blocks_i = cld(length(sys.atoms), WARPSIZE) - n_blocks_j = cld(length(sys.atoms), nthreads) - kernel(buffers.fs_mat, sys.coords, sys.velocities, sys.atoms, sys.boundary, pairwise_inters, - step_n, Val(D), Val(sys.force_units); threads=nthreads, - blocks=(n_blocks_i, n_blocks_j)) - return buffers -end - -function pairwise_force_gpu!(buffers, sys::System{D, true, T}, pairwise_inters, nbs::Nothing, - step_n) where {D, T} - N = length(sys.coords) - n_blocks = cld(N, WARPSIZE) - r_cut = sys.neighbor_finder.dist_cutoff - if step_n % sys.neighbor_finder.n_steps_reorder == 0 || !sys.neighbor_finder.initialized - Morton_bits = 4 - w = r_cut - typeof(ustrip(r_cut))(0.1) * unit(r_cut) - Morton_seq_cpu = sorted_Morton_seq(Array(sys.coords), w, Morton_bits) - copyto!(buffers.Morton_seq, Morton_seq_cpu) - CUDA.@sync @cuda blocks=(cld(N, WARPSIZE),) threads=(32,) kernel_min_max!( - buffers.Morton_seq, buffers.box_mins, buffers.box_maxs, sys.coords, Val(N), - sys.boundary, Val(D)) - sys.neighbor_finder.initialized = true - CUDA.@sync @cuda blocks=(n_blocks, n_blocks) threads=(32, 1) always_inline=true compress_boolean_matrices!( - buffers.Morton_seq, sys.neighbor_finder.eligible, sys.neighbor_finder.special, - buffers.compressed_eligible, buffers.compressed_special, Val(N)) - end - CUDA.@sync @cuda blocks=(n_blocks, n_blocks) threads=(32, 1) always_inline=true force_kernel!( - buffers.Morton_seq, buffers.fs_mat, buffers.box_mins, buffers.box_maxs, sys.coords, - sys.velocities, sys.atoms, Val(N), r_cut, Val(sys.force_units), pairwise_inters, - sys.boundary, step_n, buffers.compressed_special, buffers.compressed_eligible, - Val(T), Val(D)) - return buffers -end - -function pairwise_pe_gpu!(pe_vec_nounits, buffers, sys::System{D, true, T}, pairwise_inters, - nbs::NoNeighborList, step_n) where {D, T} - n_threads_gpu, n_blocks = cuda_threads_blocks_pairwise(length(nbs)) - CUDA.@sync @cuda threads=n_threads_gpu blocks=n_blocks pairwise_pe_kernel!( - pe_vec_nounits, sys.coords, sys.velocities, sys.atoms, sys.boundary, pairwise_inters, - nbs, step_n, Val(sys.energy_units)) - return 
pe_vec_nounits -end - -function pairwise_pe_gpu!(pe_vec_nounits, buffers, sys::System{D, true, T}, pairwise_inters, - nbs::Nothing, step_n) where {D, T} - # The ordering is always recomputed for potential energy - # Different buffers are used to the forces case, so sys.neighbor_finder.initialized - # is not updated - N = length(sys.coords) - n_blocks = cld(N, WARPSIZE) - r_cut = sys.neighbor_finder.dist_cutoff - Morton_bits = 4 - w = r_cut - typeof(ustrip(r_cut))(0.1) * unit(r_cut) - Morton_seq_cpu = sorted_Morton_seq(Array(sys.coords), w, Morton_bits) - copyto!(buffers.Morton_seq, Morton_seq_cpu) - CUDA.@sync @cuda blocks=(cld(N, WARPSIZE),) threads=(32,) kernel_min_max!( - buffers.Morton_seq, buffers.box_mins, buffers.box_maxs, sys.coords, - Val(N), sys.boundary, Val(D)) - CUDA.@sync @cuda blocks=(n_blocks, n_blocks) threads=(32, 1) always_inline=true energy_kernel!( - buffers.Morton_seq, pe_vec_nounits, buffers.box_mins, buffers.box_maxs, sys.coords, - sys.velocities, sys.atoms, Val(N), r_cut, Val(sys.energy_units), pairwise_inters, - sys.boundary, step_n, sys.neighbor_finder.special, sys.neighbor_finder.eligible, - Val(T), Val(D)) - return pe_vec_nounits -end - -function sorted_Morton_seq(positions, w, bits::Int) - N = length(positions) - D = length(positions[1]) - Morton_sequence = Vector{Int32}(undef, N) - for i in 1:N - scaled_coords = floor.(Int32, positions[i] ./ w) - Morton_sequence[i] = generalized_Morton_code(scaled_coords, bits, D) - end - sort = Int32.(sortperm(Morton_sequence)) - return sort -end - -function generalized_Morton_code(indices, bits::Int, D::Int) - code = 0 - for bit in 0:(bits-1) - for d in 1:D - code |= ((indices[d] >> bit) & 1) << (D * bit + (d - 1)) - end - end - return Int32(code) -end - -function boxes_dist(x1_min::D, x1_max::D, x2_min::D, x2_max::D, Lx::D) where D - - a = abs(vector_1D(x2_max, x1_min, Lx)) - b = abs(vector_1D(x1_max, x2_min, Lx)) - - return ifelse( - x1_min - x2_max <= zero(D) && x2_min - x1_max <= zero(D), - zero(D), - ifelse(a < b, a, b) - ) -end - -function kernel_min_max!( - sorted_seq, - mins::AbstractArray{C}, - maxs::AbstractArray{C}, - coords, - ::Val{n}, - boundary, - ::Val{D}) where {n, C, D} - - D32 = Int32(32) - a = Int32(1) - b = Int32(D) - r = Int32(n % D32) - i = threadIdx().x + (blockIdx().x - a) * blockDim().x - local_i = threadIdx().x - mins_smem = CuStaticSharedArray(C, (D32, b)) - maxs_smem = CuStaticSharedArray(C, (D32, b)) - r_smem = CuStaticSharedArray(C, (r, b)) - - if i <= n - r && local_i <= D32 - for k in a:b - s_i = sorted_seq[i] - mins_smem[local_i, k] = coords[s_i][k] - maxs_smem[local_i, k] = coords[s_i][k] - end - end - sync_threads() - if i <= n - r && local_i <= D32 - for p in a:Int32(log2(D32)) - for k in a:b - @inbounds begin - if local_i % Int32(2^p) == Int32(0) - if mins_smem[local_i, k] > mins_smem[local_i - Int32(2^(p - 1)), k] - mins_smem[local_i, k] = mins_smem[local_i - Int32(2^(p - 1)), k] - end - if maxs_smem[local_i, k] < maxs_smem[local_i - Int32(2^(p - 1)), k] - maxs_smem[local_i, k] = maxs_smem[local_i - Int32(2^(p - 1)), k] - end - end - end - end - end - if local_i == D32 - for k in a:b - mins[blockIdx().x, k] = mins_smem[local_i, k] - maxs[blockIdx().x, k] = maxs_smem[local_i, k] - end - end - - end - - # Since the remainder array is low-dimensional, we do the scan - if i > n - r && i <= n && local_i <= r - for k in a:b - r_smem[local_i, k] = coords[sorted_seq[i]][k] - end - end - xyz_min = CuStaticSharedArray(C, b) - xyz_max = CuStaticSharedArray(C, b) - for k in a:b - xyz_min[k] = 10 * 
boundary.side_lengths[k] # very large (arbitrary) value - xyz_max[k] = -10 * boundary.side_lengths[k] - end - if local_i == a - for j in a:r - @inbounds begin - for k in a:b - if r_smem[j, k] < xyz_min[k] - xyz_min[k] = r_smem[j, k] - end - if r_smem[j, k] > xyz_max[k] - xyz_max[k] = r_smem[j, k] - end - end - end - end - if blockIdx().x == Int32(ceil(n/D32)) && r != Int32(0) - for k in a:b - mins[blockIdx().x, k] = xyz_min[k] - maxs[blockIdx().x, k] = xyz_max[k] - end - end - end - - return nothing -end - -function compress_boolean_matrices!(sorted_seq, eligible_matrix, special_matrix, - compressed_eligible, compressed_special, ::Val{N}) where N - - a = Int32(1) - n_blocks = Int32(ceil(N / 32)) - r = Int32((N - 1) % 32 + 1) - i = blockIdx().x - j = blockIdx().y - i_0_tile = (i - a) * warpsize() - j_0_tile = (j - a) * warpsize() - index_i = i_0_tile + laneid() - index_j = j_0_tile + laneid() - - if j < n_blocks && i <= j - s_idx_i = sorted_seq[index_i] - eligible_bitmask = UInt32(0) - special_bitmask = UInt32(0) - for m in a:warpsize() - s_idx_j = sorted_seq[j_0_tile + m] - eligible_bitmask = (eligible_bitmask << 1) | UInt32(eligible_matrix[s_idx_i, s_idx_j]) - special_bitmask = (special_bitmask << 1) | UInt32(special_matrix[s_idx_i, s_idx_j]) - end - compressed_eligible[laneid(), i, j] = eligible_bitmask - compressed_special[laneid(), i, j] = special_bitmask - end - - if j == n_blocks && i < j - s_idx_i = sorted_seq[index_i] - eligible_bitmask = UInt32(0) - special_bitmask = UInt32(0) - for m in a:r - s_idx_j = sorted_seq[j_0_tile + m] - eligible_bitmask = (eligible_bitmask << 1) | UInt32(eligible_matrix[s_idx_i, s_idx_j]) - special_bitmask = (special_bitmask << 1) | UInt32(special_matrix[s_idx_i, s_idx_j]) - end - eligible_bitmask = (eligible_bitmask >> r) | (eligible_bitmask << (warpsize() - r)) - special_bitmask = (special_bitmask >> r) | (special_bitmask << (warpsize() - r)) - compressed_eligible[laneid(), i, j] = eligible_bitmask - compressed_special[laneid(), i, j] = special_bitmask - end - - if j == n_blocks && i == j && laneid() <= r - s_idx_i = sorted_seq[index_i] - eligible_bitmask = UInt32(0) - special_bitmask = UInt32(0) - for m in a:r - s_idx_j = sorted_seq[j_0_tile + m] - eligible_bitmask = (eligible_bitmask << 1) | UInt32(eligible_matrix[s_idx_i, s_idx_j]) - special_bitmask = (special_bitmask << 1) | UInt32(special_matrix[s_idx_i, s_idx_j]) - end - eligible_bitmask = (eligible_bitmask >> r) | (eligible_bitmask << (warpsize() - r)) - special_bitmask = (special_bitmask >> r) | (special_bitmask << (warpsize() - r)) - compressed_eligible[laneid(), i, j] = eligible_bitmask - compressed_special[laneid(), i, j] = special_bitmask - end - return nothing -end - - -#= -**The No-neighborlist pairwise force summation kernel (algorithm by Eastman, see https://onlinelibrary.wiley.com/doi/full/10.1002/jcc.21413)**: -1. Case j < n_blocks && i < j, i.e., `WARPSIZE`×`WARPSIZE` tiles: For such tiles each row is assiged to a different thread in a warp which calculates the -forces for the entire row in `WARPSIZE` steps. This is done such that some data can be shuffled from `i+1`'th thread to `i`'th thread in each -subsequent iteration of the force calculation in a row. 
If `a, b, ...` are different atoms and `1, 2, ...` are order in which each thread calculates -the interatomic forces, then we can represent this scenario as (considering `WARPSIZE=8`): -``` - × | i j k l m n o p - -------------------- - a | 1 2 3 4 5 6 7 8 - b | 8 1 2 3 4 5 6 7 - c | 7 8 1 2 3 4 5 6 - d | 6 7 8 1 2 3 4 5 - e | 5 6 7 8 1 2 3 4 - f | 4 5 6 7 8 1 2 3 - g | 3 4 5 6 7 8 1 2 - h | 2 3 4 5 6 7 8 1 -``` - -2. Cases j == n_blocks && i < n_blocks, i == j && i < n_blocks, i == n_blocks && j == n_blocks: In such cases, it is not possible to shuffle data generally -so there is no need to order calculations for each thread diagonally and it is also a bit more complicated to do so. -That's why the calculations are done in the following order: -``` - × | i j k l m n - ---------------- - a | 1 2 3 4 5 6 - b | 1 2 3 4 5 6 - c | 1 2 3 4 5 6 - d | 1 2 3 4 5 6 - e | 1 2 3 4 5 6 - f | 1 2 3 4 5 6 - g | 1 2 3 4 5 6 - h | 1 2 3 4 5 6 -``` -=# - -function force_kernel!( - sorted_seq, - forces_nounits, - mins::AbstractArray{C}, - maxs::AbstractArray{C}, - coords, - velocities, - atoms, - ::Val{N}, - r_cut, - ::Val{force_units}, - inters_tuple, - boundary, - step_n, - special_compressed, - eligible_compressed, - ::Val{T}, - ::Val{D}) where {N, C, force_units, T, D} - - a = Int32(1) - b = Int32(D) - n_blocks = Int32(ceil(N / 32)) - i = blockIdx().x - j = blockIdx().y - i_0_tile = (i - a) * warpsize() - j_0_tile = (j - a) * warpsize() - index_i = i_0_tile + laneid() - index_j = j_0_tile + laneid() - force_smem = CuStaticSharedArray(T, (32, 3)) - opposites_sum = CuStaticSharedArray(T, (32, 3)) - r = Int32((N - 1) % 32 + 1) - @inbounds for k in a:b - force_smem[laneid(), k] = zero(T) - opposites_sum[laneid(), k] = zero(T) - end - - # The code is organised in 4 mutually excluding parts - if j < n_blocks && i < j - d_block = zero(C) - dist_block = zero(C) * zero(C) - @inbounds for k in a:b - d_block = boxes_dist(mins[i, k], maxs[i, k], mins[j, k], maxs[j, k], boundary.side_lengths[k]) - dist_block += d_block * d_block - end - if dist_block <= r_cut * r_cut - s_idx_i = sorted_seq[index_i] - coords_i = coords[s_idx_i] - vel_i = velocities[s_idx_i] - atoms_i = atoms[s_idx_i] - d_pb = zero(C) - dist_pb = zero(C) * zero(C) - @inbounds for k in a:b - d_pb = boxes_dist(coords_i[k], coords_i[k], mins[j, k], maxs[j, k], boundary.side_lengths[k]) - dist_pb += d_pb * d_pb - end - - Bool_excl = dist_pb <= r_cut * r_cut - s_idx_j = sorted_seq[index_j] - coords_j = coords[s_idx_j] - vel_j = velocities[s_idx_j] - shuffle_idx = laneid() - atoms_j = atoms[s_idx_j] - atype_j = atoms_j.atom_type - aindex_j = atoms_j.index - amass_j = atoms_j.mass - acharge_j = atoms_j.charge - aσ_j = atoms_j.σ - aϵ_j = atoms_j.ϵ - eligible_bitmask = UInt32(0) - special_bitmask = UInt32(0) - eligible_bitmask = eligible_compressed[laneid(), i, j] - special_bitmask = special_compressed[laneid(), i, j] - - # Shuffle - for m in a:warpsize() - sync_warp() - coords_j = CUDA.shfl_sync(0xFFFFFFFF, coords_j, laneid() + a, warpsize()) - vel_j = CUDA.shfl_sync(0xFFFFFFFF, vel_j, laneid() + a, warpsize()) - shuffle_idx = CUDA.shfl_sync(0xFFFFFFFF, shuffle_idx, laneid() + a, warpsize()) - atype_j = CUDA.shfl_sync(0xFFFFFFFF, atype_j, laneid() + a, warpsize()) - aindex_j = CUDA.shfl_sync(0xFFFFFFFF, aindex_j, laneid() + a, warpsize()) - amass_j = CUDA.shfl_sync(0xFFFFFFFF, amass_j, laneid() + a, warpsize()) - acharge_j = CUDA.shfl_sync(0xFFFFFFFF, acharge_j, laneid() + a, warpsize()) - aσ_j = CUDA.shfl_sync(0xFFFFFFFF, aσ_j, laneid() + a, warpsize()) - 
aϵ_j = CUDA.shfl_sync(0xFFFFFFFF, aϵ_j, laneid() + a, warpsize()) - - atoms_j_shuffle = Atom(atype_j, aindex_j, amass_j, acharge_j, aσ_j, aϵ_j) - dr = vector(coords_j, coords_i, boundary) - r2 = sum(abs2, dr) - excl = (eligible_bitmask >> (warpsize() - shuffle_idx)) | (eligible_bitmask << shuffle_idx) - spec = (special_bitmask >> (warpsize() - shuffle_idx)) | (special_bitmask << shuffle_idx) - condition = (excl & 0x1) == true && r2 <= r_cut * r_cut - - f = condition ? sum_pairwise_forces( - inters_tuple, - atoms_i, atoms_j_shuffle, - Val(force_units), - (spec & 0x1) == true, - coords_i, coords_j, - boundary, - vel_i, vel_j, - step_n) : zero(SVector{D, T}) - - @inbounds for k in a:b - force_smem[laneid(), k] += ustrip(f[k]) - opposites_sum[shuffle_idx, k] -= ustrip(f[k]) - end - end - sync_threads() - @inbounds for k in a:b - CUDA.atomic_add!( - pointer(forces_nounits, s_idx_i * b - (b - k)), - -force_smem[laneid(), k] - ) - CUDA.atomic_add!( - pointer(forces_nounits, s_idx_j * b - (b - k)), - -opposites_sum[laneid(), k] - ) - end - end - end - - if j == n_blocks && i < n_blocks - d_block = zero(C) - dist_block = zero(C) * zero(C) - @inbounds for k in a:b - d_block = boxes_dist(mins[i, k], maxs[i, k], mins[j, k], maxs[j, k], boundary.side_lengths[k]) - dist_block += d_block * d_block - end - - if dist_block <= r_cut * r_cut - s_idx_i = sorted_seq[index_i] - coords_i = coords[s_idx_i] - vel_i = velocities[s_idx_i] - atoms_i = atoms[s_idx_i] - d_pb = zero(C) - dist_pb = zero(C) * zero(C) - @inbounds for k in a:b - d_pb = boxes_dist(coords_i[k], coords_i[k], mins[j, k], maxs[j, k], boundary.side_lengths[k]) - dist_pb += d_pb * d_pb - end - Bool_excl = dist_pb <= r_cut * r_cut - eligible_bitmask = UInt32(0) - special_bitmask = UInt32(0) - eligible_bitmask = eligible_compressed[laneid(), i, j] - special_bitmask = special_compressed[laneid(), i, j] - - for m in a:r - s_idx_j = sorted_seq[j_0_tile + m] - coords_j = coords[s_idx_j] - vel_j = velocities[s_idx_j] - atoms_j = atoms[s_idx_j] - dr = vector(coords_j, coords_i, boundary) - r2 = sum(abs2, dr) - excl = (eligible_bitmask >> (warpsize() - m)) | (eligible_bitmask << m) - spec = (special_bitmask >> (warpsize() - m)) | (special_bitmask << m) - condition = (excl & 0x1) == true && r2 <= r_cut * r_cut - - f = condition ? 
sum_pairwise_forces( - inters_tuple, - atoms_i, atoms_j, - Val(force_units), - (spec & 0x1) == true, - coords_i, coords_j, - boundary, - vel_i, vel_j, - step_n) : zero(SVector{D, T}) - - @inbounds for k in a:b - force_smem[laneid(), k] += ustrip(f[k]) - CUDA.atomic_add!( - pointer(forces_nounits, s_idx_j * b - (b - k)), - ustrip(f[k]) - ) - end - end - - # Sum contributions of the r-block to the other standard blocks - @inbounds for k in a:b - CUDA.atomic_add!( - pointer(forces_nounits, s_idx_i * b - (b - k)), - -force_smem[laneid(), k] - ) - end - end - end - - if i == j && i < n_blocks - s_idx_i = sorted_seq[index_i] - coords_i = coords[s_idx_i] - vel_i = velocities[s_idx_i] - atoms_i = atoms[s_idx_i] - eligible_bitmask = UInt32(0) - special_bitmask = UInt32(0) - eligible_bitmask = eligible_compressed[laneid(), i, j] - special_bitmask = special_compressed[laneid(), i, j] - - for m in (laneid() + a) : warpsize() - s_idx_j = sorted_seq[j_0_tile + m] - coords_j = coords[s_idx_j] - vel_j = velocities[s_idx_j] - atoms_j = atoms[s_idx_j] - dr = vector(coords_j, coords_i, boundary) - r2 = sum(abs2, dr) - excl = (eligible_bitmask >> (warpsize() - m)) | (eligible_bitmask << m) - spec = (special_bitmask >> (warpsize() - m)) | (special_bitmask << m) - condition = (excl & 0x1) == true && r2 <= r_cut * r_cut - - f = condition ? sum_pairwise_forces( - inters_tuple, - atoms_i, atoms_j, - Val(force_units), - (spec & 0x1) == true, - coords_i, coords_j, - boundary, - vel_i, vel_j, - step_n) : zero(SVector{D, T}) - - @inbounds for k in a:b - force_smem[laneid(), k] += ustrip(f[k]) - opposites_sum[m, k] -= ustrip(f[k]) - end - end - - @inbounds for k in a:b - # In this case i == j, so we can call atomic_add! only once - CUDA.atomic_add!( - pointer(forces_nounits, s_idx_i * b - (b - k)), - -force_smem[laneid(), k] - opposites_sum[laneid(), k] - ) - end - end - - if i == n_blocks && j == n_blocks - if laneid() <= r - s_idx_i = sorted_seq[index_i] - coords_i = coords[s_idx_i] - vel_i = velocities[s_idx_i] - atoms_i = atoms[s_idx_i] - eligible_bitmask = UInt32(0) - special_bitmask = UInt32(0) - eligible_bitmask = eligible_compressed[laneid(), i, j] - special_bitmask = special_compressed[laneid(), i, j] - - for m in (laneid() + a) : r - s_idx_j = sorted_seq[j_0_tile + m] - coords_j = coords[s_idx_j] - vel_j = velocities[s_idx_j] - atoms_j = atoms[s_idx_j] - dr = vector(coords_j, coords_i, boundary) - r2 = sum(abs2, dr) - excl = (eligible_bitmask >> (warpsize() - m)) | (eligible_bitmask << m) - spec = (special_bitmask >> (warpsize() - m)) | (special_bitmask << m) - condition = (excl & 0x1) == true && r2 <= r_cut * r_cut - - f = condition ? 
sum_pairwise_forces( - inters_tuple, - atoms_i, atoms_j, - Val(force_units), - (spec & 0x1) == true, - coords_i, coords_j, - boundary, - vel_i, vel_j, - step_n) : zero(SVector{D, T}) - - @inbounds for k in a:b - force_smem[laneid(), k] += ustrip(f[k]) - opposites_sum[m, k] -= ustrip(f[k]) - end - end - @inbounds for k in a:b - CUDA.atomic_add!( - pointer(forces_nounits, s_idx_i * b - (b - k)), - -force_smem[laneid(), k] - opposites_sum[laneid(), k] - ) - end - end - end - - return nothing -end - - -function energy_kernel!( - sorted_seq, - energy_nounits, - mins::AbstractArray{C}, - maxs::AbstractArray{C}, - coords, - velocities, - atoms, - ::Val{N}, - r_cut, - ::Val{energy_units}, - inters_tuple, - boundary, - step_n, - special_matrix, - eligible_matrix, - ::Val{T}, - ::Val{D}) where {N, C, energy_units, T, D} - - a = Int32(1) - b = Int32(D) - n_blocks = Int32(ceil(N / 32)) - r = Int32((N - 1) % 32 + 1) - i = blockIdx().x - j = blockIdx().y - i_0_tile = (i - 1) * warpsize() - j_0_tile = (j - 1) * warpsize() - index_i = i_0_tile + laneid() - index_j = j_0_tile + laneid() - E_smem = CuStaticSharedArray(T, 32) - E_smem[laneid()] = zero(T) - eligible = CuStaticSharedArray(Bool, (32, 32)) - special = CuStaticSharedArray(Bool, (32, 32)) - - # The code is organised in 4 mutually excluding parts - if j < n_blocks && i < j - d_block = zero(C) - dist_block = zero(C) * zero(C) - @inbounds for k in a:b - d_block = boxes_dist(mins[i, k], maxs[i, k], mins[j, k], maxs[j, k], boundary.side_lengths[k]) - dist_block += d_block * d_block - end - if dist_block <= r_cut * r_cut - s_idx_i = sorted_seq[index_i] - coords_i = coords[s_idx_i] - vel_i = velocities[s_idx_i] - atoms_i = atoms[s_idx_i] - d_pb = zero(C) - dist_pb = zero(C) * zero(C) - @inbounds for k in a:b - d_pb = boxes_dist(coords_i[k], coords_i[k], mins[j, k], maxs[j, k], boundary.side_lengths[k]) - dist_pb += d_pb * d_pb - end - Bool_excl = dist_pb <= r_cut * r_cut - s_idx_j = sorted_seq[index_j] - coords_j = coords[s_idx_j] - vel_j = velocities[s_idx_j] - shuffle_idx = laneid() - atoms_j = atoms[s_idx_j] - atype_j = atoms_j.atom_type - aindex_j = atoms_j.index - amass_j = atoms_j.mass - acharge_j = atoms_j.charge - aσ_j = atoms_j.σ - aϵ_j = atoms_j.ϵ - @inbounds for m in a:warpsize() - eligible[laneid(), m] = eligible_matrix[s_idx_i, sorted_seq[j_0_tile + m]] - special[laneid(), m] = special_matrix[s_idx_i, sorted_seq[j_0_tile + m]] - end - - # Shuffle - for m in a:warpsize() - sync_warp() - coords_j = CUDA.shfl_sync(0xFFFFFFFF, coords_j, laneid() + a, warpsize()) - vel_j = CUDA.shfl_sync(0xFFFFFFFF, vel_j, laneid() + a, warpsize()) - s_idx_j = CUDA.shfl_sync(0xFFFFFFFF, s_idx_j, laneid() + a, warpsize()) - shuffle_idx = CUDA.shfl_sync(0xFFFFFFFF, shuffle_idx, laneid() + a, warpsize()) - atype_j = CUDA.shfl_sync(0xFFFFFFFF, atype_j, laneid() + a, warpsize()) - aindex_j = CUDA.shfl_sync(0xFFFFFFFF, aindex_j, laneid() + a, warpsize()) - amass_j = CUDA.shfl_sync(0xFFFFFFFF, amass_j, laneid() + a, warpsize()) - acharge_j = CUDA.shfl_sync(0xFFFFFFFF, acharge_j, laneid() + a, warpsize()) - aσ_j = CUDA.shfl_sync(0xFFFFFFFF, aσ_j, laneid() + a, warpsize()) - aϵ_j = CUDA.shfl_sync(0xFFFFFFFF, aϵ_j, laneid() + a, warpsize()) - - atoms_j_shuffle = Atom(atype_j, aindex_j, amass_j, acharge_j, aσ_j, aϵ_j) - dr = vector(coords_j, coords_i, boundary) - r2 = sum(abs2, dr) - condition = eligible[laneid(), shuffle_idx] && Bool_excl && r2 <= r_cut * r_cut - - pe = condition ? 
sum_pairwise_potentials( - inters_tuple, - atoms_i, atoms_j_shuffle, - Val(energy_units), - special[laneid(), shuffle_idx], - coords_i, coords_j, - boundary, - vel_i, vel_j, - step_n) : zero(SVector{1, T}) - - E_smem[laneid()] += ustrip(pe[1]) - end - end - end - - if j == n_blocks && i < n_blocks - d_block = zero(C) - dist_block = zero(C) * zero(C) - @inbounds for k in a:b - d_block = boxes_dist(mins[i, k], maxs[i, k], mins[j, k], maxs[j, k], boundary.side_lengths[k]) - dist_block += d_block * d_block - end - if dist_block <= r_cut * r_cut - s_idx_i = sorted_seq[index_i] - coords_i = coords[s_idx_i] - vel_i = velocities[s_idx_i] - atoms_i = atoms[s_idx_i] - d_pb = zero(C) - dist_pb = zero(C) * zero(C) - @inbounds for k in a:b - d_pb = boxes_dist(coords_i[k], coords_i[k], mins[j, k], maxs[j, k], boundary.side_lengths[k]) - dist_pb += d_pb * d_pb - end - Bool_excl = dist_pb <= r_cut * r_cut - @inbounds for m in a:r - s_idx_j = sorted_seq[j_0_tile + m] - eligible[laneid(), m] = eligible_matrix[s_idx_i, s_idx_j] - special[laneid(), m] = special_matrix[s_idx_i, s_idx_j] - end - - for m in a:r - s_idx_j = sorted_seq[j_0_tile + m] - coords_j = coords[s_idx_j] - vel_j = velocities[s_idx_j] - atoms_j = atoms[s_idx_j] - dr = vector(coords_j, coords_i, boundary) - r2 = sum(abs2, dr) - condition = eligible[laneid(), m] && Bool_excl && r2 <= r_cut * r_cut - - pe = condition ? sum_pairwise_potentials( - inters_tuple, - atoms_i, atoms_j, - Val(energy_units), - special[laneid(), m], - coords_i, coords_j, - boundary, - vel_i, vel_j, - step_n) : zero(SVector{1, T}) - - E_smem[laneid()] += ustrip(pe[1]) - end - end - end - - if i == j && i < n_blocks - s_idx_i = sorted_seq[index_i] - coords_i = coords[s_idx_i] - vel_i = velocities[s_idx_i] - atoms_i = atoms[s_idx_i] - @inbounds for m in a:warpsize() - s_idx_j = sorted_seq[j_0_tile + m] - eligible[laneid(), m] = eligible_matrix[s_idx_i, s_idx_j] - special[laneid(), m] = special_matrix[s_idx_i, s_idx_j] - end - @inbounds for m in (laneid() + a) : warpsize() - s_idx_j = sorted_seq[j_0_tile + m] - coords_j = coords[s_idx_j] - vel_j = velocities[s_idx_j] - atoms_j = atoms[s_idx_j] - dr = vector(coords_j, coords_i, boundary) - r2 = sum(abs2, dr) - condition = eligible[laneid(), m] && r2 <= r_cut * r_cut - - pe = condition ? sum_pairwise_potentials( - inters_tuple, - atoms_i, atoms_j, - Val(energy_units), - special[laneid(), m], - coords_i, coords_j, - boundary, - vel_i, vel_j, - step_n) : zero(SVector{1, T}) - - E_smem[laneid()] += ustrip(pe[1]) - end - end - - if i == n_blocks && j == n_blocks - if laneid() <= r - s_idx_i = sorted_seq[index_i] - coords_i = coords[s_idx_i] - vel_i = velocities[s_idx_i] - atoms_i = atoms[s_idx_i] - @inbounds for m in a:r - s_idx_j = sorted_seq[j_0_tile + m] - eligible[laneid(), m] = eligible_matrix[s_idx_i, s_idx_j] - special[laneid(), m] = special_matrix[s_idx_i, s_idx_j] - end - - @inbounds for m in (laneid() + a) : r - s_idx_j = sorted_seq[j_0_tile + m] - coords_j = coords[s_idx_j] - vel_j = velocities[s_idx_j] - atoms_j = atoms[s_idx_j] - dr = vector(coords_j, coords_i, boundary) - r2 = sum(abs2, dr) - condition = eligible[laneid(), m] && r2 <= r_cut * r_cut - - pe = condition ? 
sum_pairwise_potentials( - inters_tuple, - atoms_i, atoms_j, - Val(energy_units), - special[laneid(), m], - coords_i, coords_j, - boundary, - vel_i, vel_j, - step_n) : zero(SVector{1, T}) - - E_smem[laneid()] += ustrip(pe[1]) - end - end - end - - if threadIdx().x == a - sum_E = zero(T) - for k in a:warpsize() - sum_E += E_smem[k] - end - CUDA.atomic_add!(pointer(energy_nounits), sum_E) - end - return nothing -end - - - -function pairwise_force_kernel_nonl!(forces::AbstractArray{T}, coords_var, velocities_var, - atoms_var, boundary, inters, step_n, ::Val{D}, ::Val{F}) where {T, D, F} - coords = CUDA.Const(coords_var) - velocities = CUDA.Const(velocities_var) - atoms = CUDA.Const(atoms_var) - n_atoms = length(atoms) - - tidx = threadIdx().x - i_0_tile = (blockIdx().x - 1) * warpsize() - j_0_block = (blockIdx().y - 1) * blockDim().x - warpidx = cld(tidx, warpsize()) - j_0_tile = j_0_block + (warpidx - 1) * warpsize() - i = i_0_tile + laneid() - - forces_shmem = CuStaticSharedArray(T, (3, 1024)) - @inbounds for dim in 1:3 - forces_shmem[dim, tidx] = zero(T) - end - - if i_0_tile + warpsize() > n_atoms || j_0_tile + warpsize() > n_atoms - @inbounds if i <= n_atoms - njs = min(warpsize(), n_atoms - j_0_tile) - atom_i, coord_i, vel_i = atoms[i], coords[i], velocities[i] - for del_j in 1:njs - j = j_0_tile + del_j - if i != j - atom_j, coord_j, vel_j = atoms[j], coords[j], velocities[j] - f = sum_pairwise_forces(inters, atom_i, atom_j, Val(F), false, coord_i, coord_j, - boundary, vel_i, vel_j, step_n) - for dim in 1:D - forces_shmem[dim, tidx] += -ustrip(f[dim]) - end - end - end - - for dim in 1:D - Atomix.@atomic :monotonic forces[dim, i] += forces_shmem[dim, tidx] - end - end - else - j = j_0_tile + laneid() - tilesteps = warpsize() - if i_0_tile == j_0_tile # To not compute i-i forces - j = j_0_tile + laneid() % warpsize() + 1 - tilesteps -= 1 - end - - atom_i, coord_i, vel_i = atoms[i], coords[i], velocities[i] - coord_j, vel_j = coords[j], velocities[j] - @inbounds for _ in 1:tilesteps - sync_warp() - atom_j = atoms[j] - f = sum_pairwise_forces(inters, atom_i, atom_j, Val(F), false, coord_i, coord_j, - boundary, vel_i, vel_j, step_n) - for dim in 1:D - forces_shmem[dim, tidx] += -ustrip(f[dim]) - end - @shfl_multiple_sync(FULL_MASK, laneid() + 1, warpsize(), j, coord_j) - end - - @inbounds for dim in 1:D - Atomix.@atomic :monotonic forces[dim, i] += forces_shmem[dim, tidx] - end - end - - return nothing -end - -function pairwise_pe_kernel!(energy, coords_var, velocities_var, atoms_var, boundary, inters, - neighbors_var, step_n, ::Val{E}) where E - coords = CUDA.Const(coords_var) - velocities = CUDA.Const(velocities_var) - atoms = CUDA.Const(atoms_var) - neighbors = CUDA.Const(neighbors_var) - - inter_i = (blockIdx().x - 1) * blockDim().x + threadIdx().x - - @inbounds if inter_i <= length(neighbors) - i, j, special = neighbors[inter_i] - coord_i, coord_j, vel_i, vel_j = coords[i], coords[j], velocities[i], velocities[j] - dr = vector(coord_i, coord_j, boundary) - pe = potential_energy_gpu(inters[1], dr, atoms[i], atoms[j], E, special, coord_i, coord_j, - boundary, vel_i, vel_j, step_n) - for inter in inters[2:end] - pe += potential_energy_gpu(inter, dr, atoms[i], atoms[j], E, special, coord_i, coord_j, - boundary, vel_i, vel_j, step_n) - end - if unit(pe) != E - error("wrong energy unit returned, was expecting $E but got $(unit(pe))") - end - Atomix.@atomic :monotonic energy[1] += ustrip(pe) - end - return nothing -end - -@inline function sum_pairwise_forces(inters, atom_i, atom_j, ::Val{F}, 
special, coord_i, coord_j, - boundary, vel_i, vel_j, step_n) where F - dr = vector(coord_i, coord_j, boundary) - f_tuple = ntuple(length(inters)) do inter_type_i - force_gpu(inters[inter_type_i], dr, atom_i, atom_j, F, special, coord_i, coord_j, boundary, - vel_i, vel_j, step_n) - end - f = sum(f_tuple) - if unit(f[1]) != F - # This triggers an error but it isn't printed - # See https://discourse.julialang.org/t/error-handling-in-cuda-kernels/79692 - # for how to throw a more meaningful error - error("wrong force unit returned, was expecting $F but got $(unit(f[1]))") - end - return f -end - -@inline function sum_pairwise_potentials(inters, atom_i, atom_j, ::Val{E}, special, coord_i, coord_j, - boundary, vel_i, vel_j, step_n) where E - dr = vector(coord_i, coord_j, boundary) - - pe_tuple = ntuple(length(inters)) do inter_type_i - SVector(potential_energy_gpu(inters[inter_type_i], dr, atom_i, atom_j, E, special, coord_i, coord_j, boundary, - vel_i, vel_j, step_n)) - # SVector was required to avoid a GPU error occurring with scalars (like the quantity returned by potential_energy_gpu) - end - pe = sum(pe_tuple) - if unit(pe[1]) != E - # This triggers an error but it isn't printed - # See https://discourse.julialang.org/t/error-handling-in-cuda-kernels/79692 - # for how to throw a more meaningful error - error("wrong force unit returned, was expecting $E but got $(unit(pe[1]))") - end - return pe -end - -function specific_force_gpu!(fs_mat, inter_list::InteractionList1Atoms, coords::AbstractArray{SVector{D, C}}, - velocities, atoms, boundary, step_n, force_units, ::Val{T}) where {D, C, T} - n_threads_gpu, n_blocks = cuda_threads_blocks_specific(length(inter_list)) - CUDA.@sync @cuda threads=n_threads_gpu blocks=n_blocks specific_force_1_atoms_kernel!(fs_mat, - coords, velocities, atoms, boundary, step_n, inter_list.is, inter_list.inters, - Val(D), Val(force_units)) - return fs_mat -end - -function specific_force_gpu!(fs_mat, inter_list::InteractionList2Atoms, coords::AbstractArray{SVector{D, C}}, - velocities, atoms, boundary, step_n, force_units, ::Val{T}) where {D, C, T} - n_threads_gpu, n_blocks = cuda_threads_blocks_specific(length(inter_list)) - CUDA.@sync @cuda threads=n_threads_gpu blocks=n_blocks specific_force_2_atoms_kernel!(fs_mat, - coords, velocities, atoms, boundary, step_n, inter_list.is, inter_list.js, - inter_list.inters, Val(D), Val(force_units)) - return fs_mat -end - -function specific_force_gpu!(fs_mat, inter_list::InteractionList3Atoms, coords::AbstractArray{SVector{D, C}}, - velocities, atoms, boundary, step_n, force_units, ::Val{T}) where {D, C, T} - n_threads_gpu, n_blocks = cuda_threads_blocks_specific(length(inter_list)) - CUDA.@sync @cuda threads=n_threads_gpu blocks=n_blocks specific_force_3_atoms_kernel!(fs_mat, - coords, velocities, atoms, boundary, step_n, inter_list.is, inter_list.js, - inter_list.ks, inter_list.inters, Val(D), Val(force_units)) - return fs_mat -end - -function specific_force_gpu!(fs_mat, inter_list::InteractionList4Atoms, coords::AbstractArray{SVector{D, C}}, - velocities, atoms, boundary, step_n, force_units, ::Val{T}) where {D, C, T} - n_threads_gpu, n_blocks = cuda_threads_blocks_specific(length(inter_list)) - CUDA.@sync @cuda threads=n_threads_gpu blocks=n_blocks specific_force_4_atoms_kernel!(fs_mat, - coords, velocities, atoms, boundary, step_n, inter_list.is, inter_list.js, - inter_list.ks, inter_list.ls, inter_list.inters, Val(D), Val(force_units)) - return fs_mat -end - -function specific_force_1_atoms_kernel!(forces, coords_var, 
velocities_var, atoms_var, boundary, - step_n, is_var, inters_var, ::Val{D}, ::Val{F}) where {D, F} - coords = CUDA.Const(coords_var) - velocities = CUDA.Const(velocities_var) - atoms = CUDA.Const(atoms_var) - is = CUDA.Const(is_var) - inters = CUDA.Const(inters_var) - - inter_i = (blockIdx().x - 1) * blockDim().x + threadIdx().x - - @inbounds if inter_i <= length(is) - i = is[inter_i] - fs = force_gpu(inters[inter_i], coords[i], boundary, atoms[i], F, velocities[i], step_n) - if unit(fs.f1[1]) != F - error("wrong force unit returned, was expecting $F") - end - for dim in 1:D - Atomix.@atomic :monotonic forces[dim, i] += ustrip(fs.f1[dim]) - end - end - return nothing -end - -function specific_force_2_atoms_kernel!(forces, coords_var, velocities_var, atoms_var, boundary, - step_n, is_var, js_var, inters_var, ::Val{D}, ::Val{F}) where {D, F} - coords = CUDA.Const(coords_var) - velocities = CUDA.Const(velocities_var) - atoms = CUDA.Const(atoms_var) - is = CUDA.Const(is_var) - js = CUDA.Const(js_var) - inters = CUDA.Const(inters_var) - - inter_i = (blockIdx().x - 1) * blockDim().x + threadIdx().x - - @inbounds if inter_i <= length(is) - i, j = is[inter_i], js[inter_i] - fs = force_gpu(inters[inter_i], coords[i], coords[j], boundary, atoms[i], atoms[j], F, - velocities[i], velocities[j], step_n) - if unit(fs.f1[1]) != F || unit(fs.f2[1]) != F - error("wrong force unit returned, was expecting $F") - end - for dim in 1:D - Atomix.@atomic :monotonic forces[dim, i] += ustrip(fs.f1[dim]) - Atomix.@atomic :monotonic forces[dim, j] += ustrip(fs.f2[dim]) - end - end - return nothing -end - -function specific_force_3_atoms_kernel!(forces, coords_var, velocities_var, atoms_var, boundary, - step_n, is_var, js_var, ks_var, inters_var, ::Val{D}, ::Val{F}) where {D, F} - coords = CUDA.Const(coords_var) - velocities = CUDA.Const(velocities_var) - atoms = CUDA.Const(atoms_var) - is = CUDA.Const(is_var) - js = CUDA.Const(js_var) - ks = CUDA.Const(ks_var) - inters = CUDA.Const(inters_var) - - inter_i = (blockIdx().x - 1) * blockDim().x + threadIdx().x - - @inbounds if inter_i <= length(is) - i, j, k = is[inter_i], js[inter_i], ks[inter_i] - fs = force_gpu(inters[inter_i], coords[i], coords[j], coords[k], boundary, atoms[i], - atoms[j], atoms[k], F, velocities[i], velocities[j], velocities[k], step_n) - if unit(fs.f1[1]) != F || unit(fs.f2[1]) != F || unit(fs.f3[1]) != F - error("wrong force unit returned, was expecting $F") - end - for dim in 1:D - Atomix.@atomic :monotonic forces[dim, i] += ustrip(fs.f1[dim]) - Atomix.@atomic :monotonic forces[dim, j] += ustrip(fs.f2[dim]) - Atomix.@atomic :monotonic forces[dim, k] += ustrip(fs.f3[dim]) - end - end - return nothing -end - -function specific_force_4_atoms_kernel!(forces, coords_var, velocities_var, atoms_var, boundary, - step_n, is_var, js_var, ks_var, ls_var, inters_var, - ::Val{D}, ::Val{F}) where {D, F} - coords = CUDA.Const(coords_var) - velocities = CUDA.Const(velocities_var) - atoms = CUDA.Const(atoms_var) - is = CUDA.Const(is_var) - js = CUDA.Const(js_var) - ks = CUDA.Const(ks_var) - ls = CUDA.Const(ls_var) - inters = CUDA.Const(inters_var) - - inter_i = (blockIdx().x - 1) * blockDim().x + threadIdx().x - - @inbounds if inter_i <= length(is) - i, j, k, l = is[inter_i], js[inter_i], ks[inter_i], ls[inter_i] - fs = force_gpu(inters[inter_i], coords[i], coords[j], coords[k], coords[l], boundary, - atoms[i], atoms[j], atoms[k], atoms[l], F, velocities[i], velocities[j], - velocities[k], velocities[l], step_n) - if unit(fs.f1[1]) != F || unit(fs.f2[1]) != F 
|| unit(fs.f3[1]) != F || unit(fs.f4[1]) != F - error("wrong force unit returned, was expecting $F") - end - for dim in 1:D - Atomix.@atomic :monotonic forces[dim, i] += ustrip(fs.f1[dim]) - Atomix.@atomic :monotonic forces[dim, j] += ustrip(fs.f2[dim]) - Atomix.@atomic :monotonic forces[dim, k] += ustrip(fs.f3[dim]) - Atomix.@atomic :monotonic forces[dim, l] += ustrip(fs.f4[dim]) - end - end - return nothing -end - - -function specific_pe_gpu!(pe_vec_nounits, inter_list::InteractionList1Atoms, coords::AbstractArray{SVector{D, C}}, - velocities, atoms, boundary, step_n, energy_units, ::Val{T}) where {D, C, T} - n_threads_gpu, n_blocks = cuda_threads_blocks_specific(length(inter_list)) - CUDA.@sync @cuda threads=n_threads_gpu blocks=n_blocks specific_pe_1_atoms_kernel!( - pe_vec_nounits, coords, velocities, atoms, boundary, step_n, inter_list.is, - inter_list.inters, Val(energy_units)) - return pe_vec_nounits -end - -function specific_pe_gpu!(pe_vec_nounits, inter_list::InteractionList2Atoms, coords::AbstractArray{SVector{D, C}}, - velocities, atoms, boundary, step_n, energy_units, ::Val{T}) where {D, C, T} - n_threads_gpu, n_blocks = cuda_threads_blocks_specific(length(inter_list)) - CUDA.@sync @cuda threads=n_threads_gpu blocks=n_blocks specific_pe_2_atoms_kernel!( - pe_vec_nounits, coords, velocities, atoms, boundary, step_n, inter_list.is, - inter_list.js, inter_list.inters, Val(energy_units)) - return pe_vec_nounits -end - -function specific_pe_gpu!(pe_vec_nounits, inter_list::InteractionList3Atoms, coords::AbstractArray{SVector{D, C}}, - velocities, atoms, boundary, step_n, energy_units, ::Val{T}) where {D, C, T} - n_threads_gpu, n_blocks = cuda_threads_blocks_specific(length(inter_list)) - CUDA.@sync @cuda threads=n_threads_gpu blocks=n_blocks specific_pe_3_atoms_kernel!( - pe_vec_nounits, coords, velocities, atoms, boundary, step_n, inter_list.is, - inter_list.js, inter_list.ks, inter_list.inters, Val(energy_units)) - return pe_vec_nounits -end - -function specific_pe_gpu!(pe_vec_nounits, inter_list::InteractionList4Atoms, coords::AbstractArray{SVector{D, C}}, - velocities, atoms, boundary, step_n, energy_units, ::Val{T}) where {D, C, T} - n_threads_gpu, n_blocks = cuda_threads_blocks_specific(length(inter_list)) - CUDA.@sync @cuda threads=n_threads_gpu blocks=n_blocks specific_pe_4_atoms_kernel!( - pe_vec_nounits, coords, velocities, atoms, boundary, step_n, inter_list.is, - inter_list.js, inter_list.ks, inter_list.ls, inter_list.inters, Val(energy_units)) - return pe_vec_nounits -end - -function specific_pe_1_atoms_kernel!(energy, coords_var, velocities_var, atoms_var, boundary, - step_n, is_var, inters_var, ::Val{E}) where E - coords = CUDA.Const(coords_var) - velocities = CUDA.Const(velocities_var) - atoms = CUDA.Const(atoms_var) - is = CUDA.Const(is_var) - inters = CUDA.Const(inters_var) - - inter_i = (blockIdx().x - 1) * blockDim().x + threadIdx().x - - @inbounds if inter_i <= length(is) - i = is[inter_i] - pe = potential_energy_gpu(inters[inter_i], coords[i], boundary, atoms[i], E, - velocities[i], step_n) - if unit(pe) != E - error("wrong energy unit returned, was expecting $E but got $(unit(pe))") - end - Atomix.@atomic :monotonic energy[1] += ustrip(pe) - end - return nothing -end - -function specific_pe_2_atoms_kernel!(energy, coords_var, velocities_var, atoms_var, boundary, - step_n, is_var, js_var, inters_var, ::Val{E}) where E - coords = CUDA.Const(coords_var) - velocities = CUDA.Const(velocities_var) - atoms = CUDA.Const(atoms_var) - is = CUDA.Const(is_var) - js = 
CUDA.Const(js_var) - inters = CUDA.Const(inters_var) - - inter_i = (blockIdx().x - 1) * blockDim().x + threadIdx().x - - @inbounds if inter_i <= length(is) - i, j = is[inter_i], js[inter_i] - pe = potential_energy_gpu(inters[inter_i], coords[i], coords[j], boundary, atoms[i], - atoms[j], E, velocities[i], velocities[j], step_n) - if unit(pe) != E - error("wrong energy unit returned, was expecting $E but got $(unit(pe))") - end - Atomix.@atomic :monotonic energy[1] += ustrip(pe) - end - return nothing -end - -function specific_pe_3_atoms_kernel!(energy, coords_var, velocities_var, atoms_var, boundary, - step_n, is_var, js_var, ks_var, inters_var, ::Val{E}) where E - coords = CUDA.Const(coords_var) - velocities = CUDA.Const(velocities_var) - atoms = CUDA.Const(atoms_var) - is = CUDA.Const(is_var) - js = CUDA.Const(js_var) - ks = CUDA.Const(ks_var) - inters = CUDA.Const(inters_var) - - inter_i = (blockIdx().x - 1) * blockDim().x + threadIdx().x - - @inbounds if inter_i <= length(is) - i, j, k = is[inter_i], js[inter_i], ks[inter_i] - pe = potential_energy_gpu(inters[inter_i], coords[i], coords[j], coords[k], boundary, - atoms[i], atoms[j], atoms[k], E, velocities[i], velocities[j], - velocities[k], step_n) - if unit(pe) != E - error("wrong energy unit returned, was expecting $E but got $(unit(pe))") - end - Atomix.@atomic :monotonic energy[1] += ustrip(pe) - end - return nothing -end - -function specific_pe_4_atoms_kernel!(energy, coords_var, velocities_var, atoms_var, boundary, - step_n, is_var, js_var, ks_var, ls_var, inters_var, ::Val{E}) where E - coords = CUDA.Const(coords_var) - velocities = CUDA.Const(velocities_var) - atoms = CUDA.Const(atoms_var) - is = CUDA.Const(is_var) - js = CUDA.Const(js_var) - ks = CUDA.Const(ks_var) - ls = CUDA.Const(ls_var) - inters = CUDA.Const(inters_var) - - inter_i = (blockIdx().x - 1) * blockDim().x + threadIdx().x - - @inbounds if inter_i <= length(is) - i, j, k, l = is[inter_i], js[inter_i], ks[inter_i], ls[inter_i] - pe = potential_energy_gpu(inters[inter_i], coords[i], coords[j], coords[k], coords[l], - boundary, atoms[i], atoms[j], atoms[k], atoms[l], E, - velocities[i], velocities[j], velocities[k], velocities[l], - step_n) - if unit(pe) != E - error("wrong energy unit returned, was expecting $E but got $(unit(pe))") - end - Atomix.@atomic :monotonic energy[1] += ustrip(pe) - end - return nothing -end From fa72697a683188d28dee2e4ea78d170c3567e527 Mon Sep 17 00:00:00 2001 From: Joe Greener Date: Thu, 23 Jan 2025 15:22:20 +0000 Subject: [PATCH 06/24] remove CUDA/Enzyme extension --- Project.toml | 1 - ext/MollyCUDAEnzymeExt.jl | 13 ------------- 2 files changed, 14 deletions(-) delete mode 100644 ext/MollyCUDAEnzymeExt.jl diff --git a/Project.toml b/Project.toml index 90a1e24a9..239ddf267 100644 --- a/Project.toml +++ b/Project.toml @@ -43,7 +43,6 @@ PythonCall = "6099a3de-0909-46bc-b1f4-468b9a2dfc0d" [extensions] MollyCUDAExt = "CUDA" MollyEnzymeExt = "Enzyme" -MollyCUDAEnzymeExt = ["CUDA", "Enzyme"] MollyGLMakieExt = ["GLMakie", "Colors"] MollyKernelDensityExt = "KernelDensity" MollyPythonCallExt = "PythonCall" diff --git a/ext/MollyCUDAEnzymeExt.jl b/ext/MollyCUDAEnzymeExt.jl deleted file mode 100644 index c88ebd144..000000000 --- a/ext/MollyCUDAEnzymeExt.jl +++ /dev/null @@ -1,13 +0,0 @@ -module MollyCUDAEnzymeExt - -using Molly -using CUDA -using Enzyme - -ext = Base.get_extension(Molly,:MollyCUDAExt) - -EnzymeRules.inactive(::typeof(ext.cuda_threads_blocks_pairwise), args...) 
= nothing -EnzymeRules.inactive(::typeof(ext.cuda_threads_blocks_specific), args...) = nothing - - -end From 2280cc5dc3b641f22c70ed8ff55c97810a4aa540 Mon Sep 17 00:00:00 2001 From: Joe Greener Date: Thu, 23 Jan 2025 17:19:12 +0000 Subject: [PATCH 07/24] formatting changes, minor fixes --- Project.toml | 2 +- benchmark/benchmarks.jl | 23 ++++--- benchmark/protein.jl | 18 +++--- docs/src/documentation.md | 20 +++--- ext/MollyCUDAExt.jl | 4 +- ext/MollyGLMakieExt.jl | 2 +- ext/MollyPythonCallExt.jl | 8 +-- src/Molly.jl | 4 +- src/coupling.jl | 20 +++--- src/energy.jl | 3 +- src/force.jl | 6 +- src/interactions/implicit_solvent.jl | 33 +++++----- src/kernels.jl | 35 +++++----- src/neighbors.jl | 13 ++-- src/setup.jl | 96 ++++++++++++++-------------- src/simulators.jl | 12 ++-- src/spatial.jl | 9 ++- src/types.jl | 34 +++++----- test/basic.jl | 10 +-- test/energy_conservation.jl | 13 ++-- test/gradients.jl | 4 +- test/minimization.jl | 6 +- test/protein.jl | 14 ++-- test/simulation.jl | 49 +++++++------- 24 files changed, 217 insertions(+), 221 deletions(-) diff --git a/Project.toml b/Project.toml index 239ddf267..469f820ea 100644 --- a/Project.toml +++ b/Project.toml @@ -64,7 +64,7 @@ Enzyme = "0.13.20" EzXML = "1" FLoops = "0.2" GLMakie = "0.8, 0.9, 0.10, 0.11" -GPUArrays = "10" +GPUArrays = "11" Graphs = "1.8" KernelAbstractions = "0.9" KernelDensity = "0.5, 0.6" diff --git a/benchmark/benchmarks.jl b/benchmark/benchmarks.jl index e3974c07c..38e16bd41 100644 --- a/benchmark/benchmarks.jl +++ b/benchmark/benchmarks.jl @@ -20,8 +20,8 @@ end # Allow CUDA device to be specified const DEVICE = get(ENV, "DEVICE", "0") -const run_gpu_tests = CUDA.functional() -if run_gpu_tests +const run_cuda_tests = CUDA.functional() +if run_cuda_tests device!(parse(Int, DEVICE)) @info "The GPU benchmarks will be run on device $DEVICE" else @@ -62,8 +62,7 @@ const starting_velocities = [random_velocity(atom_mass, 1.0u"K") for i in 1:n_at const starting_coords_f32 = [Float32.(c) for c in starting_coords] const starting_velocities_f32 = [Float32.(c) for c in starting_velocities] -function test_sim(nl::Bool, parallel::Bool, f32::Bool, - array_type::Type{AT}) where AT <: AbstractArray +function test_sim(nl::Bool, parallel::Bool, f32::Bool, ::Type{AT}) where AT n_atoms = 400 n_steps = 200 atom_mass = f32 ? 10.0f0u"g/mol" : 10.0u"g/mol" @@ -73,9 +72,9 @@ function test_sim(nl::Bool, parallel::Bool, f32::Bool, r0 = f32 ? 0.2f0u"nm" : 0.2u"nm" bonds = [HarmonicBond(k=k, r0=r0) for i in 1:(n_atoms ÷ 2)] specific_inter_lists = (InteractionList2Atoms( - array_type(Int32.(collect(1:2:n_atoms))), - array_type(Int32.(collect(2:2:n_atoms))), - array_type(bonds), + AT(Int32.(collect(1:2:n_atoms))), + AT(Int32.(collect(2:2:n_atoms))), + AT(bonds), ),) neighbor_finder = NoNeighborFinder() @@ -83,17 +82,17 @@ function test_sim(nl::Bool, parallel::Bool, f32::Bool, pairwise_inters = (LennardJones(use_neighbors=false, cutoff=cutoff),) if nl neighbor_finder = DistanceNeighborFinder( - eligible=array_type(trues(n_atoms, n_atoms)), + eligible=AT(trues(n_atoms, n_atoms)), n_steps=10, dist_cutoff=f32 ? 1.5f0u"nm" : 1.5u"nm", ) pairwise_inters = (LennardJones(use_neighbors=true, cutoff=cutoff),) end - coords = array_type(deepcopy(f32 ? starting_coords_f32 : starting_coords)) - velocities = array_type(deepcopy(f32 ? starting_velocities_f32 : starting_velocities)) - atoms = array_type([Atom(charge=f32 ? 0.0f0 : 0.0, mass=atom_mass, σ=f32 ? 0.2f0u"nm" : 0.2u"nm", - ϵ=f32 ? 
0.2f0u"kJ * mol^-1" : 0.2u"kJ * mol^-1") for i in 1:n_atoms]) + coords = AT(copy(f32 ? starting_coords_f32 : starting_coords)) + velocities = AT(copy(f32 ? starting_velocities_f32 : starting_velocities)) + atoms = AT([Atom(charge=f32 ? 0.0f0 : 0.0, mass=atom_mass, σ=f32 ? 0.2f0u"nm" : 0.2u"nm", + ϵ=f32 ? 0.2f0u"kJ * mol^-1" : 0.2u"kJ * mol^-1") for i in 1:n_atoms]) sys = System( atoms=atoms, diff --git a/benchmark/protein.jl b/benchmark/protein.jl index 131d77917..0d2f86614 100644 --- a/benchmark/protein.jl +++ b/benchmark/protein.jl @@ -11,7 +11,7 @@ const data_dir = normpath(dirname(pathof(Molly)), "..", "data") const ff_dir = joinpath(data_dir, "force_fields") const openmm_dir = joinpath(data_dir, "openmm_6mrr") -function setup_system(array_type::AbstractArray, f32::Bool, units::Bool) +function setup_system(::Type{AT}, f32::Bool, units::Bool) where AT T = f32 ? Float32 : Float64 ff = MolecularForceField( T, @@ -27,7 +27,7 @@ function setup_system(array_type::AbstractArray, f32::Bool, units::Bool) sys = System( joinpath(data_dir, "6mrr_equil.pdb"), ff; - velocities=array_type(velocities), + velocities=AT(velocities), units=units, gpu=gpu, dist_cutoff=(units ? dist_cutoff * u"nm" : dist_cutoff), @@ -41,13 +41,13 @@ function setup_system(array_type::AbstractArray, f32::Bool, units::Bool) end runs = [ - # run_name gpu parr f32 units - ("CPU 1 thread" , Array, false, false, true ), - ("CPU 1 thread f32" , Array, false, true , true ), - ("CPU 1 thread f32 nounits" , Array, false, true , false), - ("CPU $n_threads threads" , Array, true , false, true ), - ("CPU $n_threads threads f32" , Array, true , true , true ), - ("CPU $n_threads threads f32 nounits", Array, true , true , false), + # run_name gpu parr f32 units + ("CPU 1 thread" , Array , false, false, true ), + ("CPU 1 thread f32" , Array , false, true , true ), + ("CPU 1 thread f32 nounits" , Array , false, true , false), + ("CPU $n_threads threads" , Array , true , false, true ), + ("CPU $n_threads threads f32" , Array , true , true , true ), + ("CPU $n_threads threads f32 nounits", Array , true , true , false), ("GPU" , CuArray, false, false, true ), ("GPU f32" , CuArray, false, true , true ), ("GPU f32 nounits" , CuArray, false, true , false), diff --git a/docs/src/documentation.md b/docs/src/documentation.md index 96866b19b..f2cd85ad9 100644 --- a/docs/src/documentation.md +++ b/docs/src/documentation.md @@ -139,13 +139,13 @@ To run simulations on the GPU you will need to have a GPU available and then loa | Hardware Available | Necessary Package | Array Type | | ------------------ | ----------------- | ---------- | -| Parallel CPU | none | Array | -| NVIDIA GPU | CUDA | CuArray | -| AMD GPU | AMDGPU | ROCArray | -| Intel GPU | oneAPI | oneArray | -| Apple Silicon | Metal | MtlArray | +| Parallel CPU | none | `Array` | +| NVIDIA GPU | CUDA | `CuArray` | +| AMD GPU | AMDGPU | `ROCArray` | +| Intel GPU | oneAPI | `oneArray` | +| Apple Silicon | Metal | `MtlArray` | -As an important note, Metal / Apple Silicon devices can only run with 32 bit precision, so be sure to use `Float32` (for example) where necessary. +As an important note, Metal/Apple Silicon devices can only run with 32 bit precision, so be sure to use `Float32` (for example) where necessary. Simulation setup is similar to above, but with the coordinates, velocities and atoms moved to the GPU. This example also shows setting up a simulation to run with `Float32`, which gives much better performance on GPUs. 
Of course, you will need to determine whether this level of numerical accuracy is appropriate in your case. @@ -363,7 +363,7 @@ Residue patches, virtual sites, file includes and any force types other than `Ha Some PDB files that read in fine can be found [here](https://github.com/greener-group/GB99dms/tree/main/structures/training/conf_1). -To run on the GPU, set `array_type=GPUArrayType`, where `GPUArrayType` is the array type for your GPU backend (for example, `CuArray` for NVIDIA or `ROCArray` for AMD). +To run on the GPU, set `array_type=GPUArrayType`, where `GPUArrayType` is the array type for your GPU backend (for example `CuArray` for NVIDIA or `ROCArray` for AMD). You can use an implicit solvent method by giving the `implicit_solvent` keyword argument to [`System`](@ref). The options are `"obc1"`, `"obc2"` and `"gbn2"`, corresponding to the Onufriev-Bashford-Case GBSA model with parameter set I or II and the GB-Neck2 model. Other options include overriding the boundary dimensions in the file (`boundary`) and modifying the non-bonded interaction and neighbor list cutoff distances (`dist_cutoff` and `dist_neighbors`). @@ -1028,10 +1028,10 @@ function Molly.simulate!(sys::ReplicaSystem, end ``` -Under the hood there are two implementations for the [`forces`](@ref) function, used by [`accelerations`](@ref), and for [`potential_energy`](@ref): a version geared towards CPUs and parallelism, and a version geared towards GPUs. -You can define different versions of a simulator for CPU and GPU systems by dispatching on `System{D, false}` or `System{D, true}` respectively. +Under the hood there are multiple implementations for the [`forces`](@ref) function, used by [`accelerations`](@ref), and for [`potential_energy`](@ref): a version geared towards CPUs and parallelism, a CUDA version, and a version for other GPU backends. +You can define different versions of a simulator for CPU, CUDA and generic GPU systems by dispatching on `System{D, Array}` or `System{D, CuArray}` and `System{D, AT} where AT <: AbstractGPUArray` respectively. This also applies to coupling methods, neighbor finders and analysis functions. -You do not have to define two versions though: you may only intend to use the simulator one way, or one version may be performant in all cases. +You do not have to define different versions though: you may only intend to use the simulator one way, or one version may be performant in all cases. 
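+As a minimal sketch of this dispatch pattern (the function `mean_speed` and its body are illustrative only, not part of Molly), an analysis function could be specialised for GPU systems as shown below.
+
+```julia
+using Molly, LinearAlgebra
+using GPUArrays: AbstractGPUArray
+
+# Illustrative only: generic version, used for CPU systems
+function mean_speed(sys::System)
+    return sum(norm, sys.velocities) / length(sys)
+end
+
+# Illustrative only: specialisation for any GPU backend, broadcasting keeps the
+# reduction on the device rather than moving the velocities to the host
+function mean_speed(sys::System{D, AT}) where {D, AT <: AbstractGPUArray}
+    return sum(norm.(sys.velocities)) / length(sys)
+end
+```
+
+The same two-method approach can be used for custom simulators or coupling methods by dispatching on the system argument in the same way.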
## Coupling diff --git a/ext/MollyCUDAExt.jl b/ext/MollyCUDAExt.jl index 2c9e7afa6..22fbdb53f 100644 --- a/ext/MollyCUDAExt.jl +++ b/ext/MollyCUDAExt.jl @@ -37,7 +37,7 @@ function cuda_threads_blocks_specific(n_inters) return n_threads_gpu, n_blocks end -function pairwise_force_gpu!(buffers, sys::System{D, AT, T}, pairwise_inters, nbs::NoNeighborList, +function pairwise_force_gpu!(buffers, sys::System{D, AT, T}, pairwise_inters, nbs::Molly.NoNeighborList, step_n) where {D, AT <: CuArray, T} kernel = @cuda launch=false pairwise_force_kernel_nonl!( buffers.fs_mat, sys.coords, sys.velocities, sys.atoms, sys.boundary, pairwise_inters, step_n, @@ -81,7 +81,7 @@ function pairwise_force_gpu!(buffers, sys::System{D, AT, T}, pairwise_inters, nb end function pairwise_pe_gpu!(pe_vec_nounits, buffers, sys::System{D, AT, T}, pairwise_inters, - nbs::NoNeighborList, step_n) where {D, AT <: CuArray, T} + nbs::Molly.NoNeighborList, step_n) where {D, AT <: CuArray, T} n_threads_gpu, n_blocks = cuda_threads_blocks_pairwise(length(nbs)) CUDA.@sync @cuda threads=n_threads_gpu blocks=n_blocks pairwise_pe_kernel!( pe_vec_nounits, sys.coords, sys.velocities, sys.atoms, sys.boundary, pairwise_inters, diff --git a/ext/MollyGLMakieExt.jl b/ext/MollyGLMakieExt.jl index fa7a49096..5509dddec 100644 --- a/ext/MollyGLMakieExt.jl +++ b/ext/MollyGLMakieExt.jl @@ -6,8 +6,8 @@ module MollyGLMakieExt using Molly import AtomsBase using GLMakie -using Unitful using Colors +using Unitful using LinearAlgebra diff --git a/ext/MollyPythonCallExt.jl b/ext/MollyPythonCallExt.jl index 9d0a26bf0..acbb6c675 100644 --- a/ext/MollyPythonCallExt.jl +++ b/ext/MollyPythonCallExt.jl @@ -93,7 +93,7 @@ uconvert_vec(x...) = uconvert.(x...) function AtomsCalculators.forces(sys::System{D, AT, T}, ase_calc::ASECalculator; - kwargs...) where {D, G, T} + kwargs...) where {D, AT, T} update_ase_calc!(ase_calc, sys) forces_py = ase_calc.ase_atoms.get_forces() forces_flat = reshape(transpose(pyconvert(Matrix{T}, forces_py)), length(sys) * D) @@ -105,12 +105,12 @@ function AtomsCalculators.forces(sys::System{D, AT, T}, else fs_unit = uconvert_vec.(sys.force_units, fs * u"eV/Å") end - return AT <: AbstractGPUArray ? AT(fs_unit) : fs_unit + return AT(fs_unit) end -function AtomsCalculators.potential_energy(sys::System{D, G, T}, +function AtomsCalculators.potential_energy(sys::System{D, AT, T}, ase_calc::ASECalculator; - kwargs...) where {D, G, T} + kwargs...) 
where {D, AT, T} update_ase_calc!(ase_calc, sys) pe_py = ase_calc.ase_atoms.get_potential_energy() pe = pyconvert(T, pe_py) diff --git a/src/Molly.jl b/src/Molly.jl index 08026b186..e57e17ced 100644 --- a/src/Molly.jl +++ b/src/Molly.jl @@ -11,14 +11,14 @@ import BioStructures # Imported to avoid clashing names using CellListMap import Chemfiles using Combinatorics -using KernelAbstractions -using GPUArrays using DataStructures using Distances using Distributions using EzXML using FLoops +using GPUArrays using Graphs +using KernelAbstractions using NearestNeighbors using PeriodicTable using SimpleCrystals diff --git a/src/coupling.jl b/src/coupling.jl index c47cc4b99..ae4fc7f55 100644 --- a/src/coupling.jl +++ b/src/coupling.jl @@ -58,10 +58,10 @@ struct AndersenThermostat{T, C} coupling_const::C end -function apply_coupling!(sys::System{D}, thermostat::AndersenThermostat, sim, +function apply_coupling!(sys::System, thermostat::AndersenThermostat, sim, neighbors=nothing, step_n::Integer=0; n_threads::Integer=Threads.nthreads(), - rng=Random.default_rng()) where D + rng=Random.default_rng()) for i in eachindex(sys) if rand(rng) < (sim.dt / thermostat.coupling_const) sys.velocities[i] = random_velocity(mass(sys.atoms[i]), thermostat.temperature, sys.k; @@ -77,8 +77,8 @@ function apply_coupling!(sys::System{D, AT, T}, thermostat::AndersenThermostat, rng=Random.default_rng()) where {D, AT <: AbstractGPUArray, T} atoms_to_bump = T.(rand(rng, length(sys)) .< (sim.dt / thermostat.coupling_const)) atoms_to_leave = one(T) .- atoms_to_bump - atoms_to_bump_dev = move_array(atoms_to_bump, sys) - atoms_to_leave_dev = move_array(atoms_to_leave, sys) + atoms_to_bump_dev = AT(atoms_to_bump) + atoms_to_leave_dev = AT(atoms_to_leave) vs = random_velocities(sys, thermostat.temperature; rng=rng) sys.velocities .= sys.velocities .* atoms_to_leave_dev .+ vs .* atoms_to_bump_dev return false @@ -231,9 +231,9 @@ function MonteCarloBarostat(P, T, boundary; n_steps=30, n_iterations=1, scale_fa max_volume_frac, trial_find_neighbors, 0, 0) end -function apply_coupling!(sys::System{D, G, T}, barostat::MonteCarloBarostat, sim, neighbors=nothing, +function apply_coupling!(sys::System{D, AT, T}, barostat::MonteCarloBarostat, sim, neighbors=nothing, step_n::Integer=0; n_threads::Integer=Threads.nthreads(), - rng=Random.default_rng()) where {D, G, T} + rng=Random.default_rng()) where {D, AT, T} if !iszero(step_n % barostat.n_steps) return false end @@ -371,13 +371,13 @@ function MonteCarloAnisotropicBarostat(pressure::SVector{D}, ) end -function apply_coupling!(sys::System{D, G, T}, +function apply_coupling!(sys::System{D, AT, T}, barostat::MonteCarloAnisotropicBarostat{D}, sim, neighbors=nothing, step_n::Integer=0; n_threads::Integer=Threads.nthreads(), - rng=Random.default_rng()) where {D, G, T} + rng=Random.default_rng()) where {D, AT, T} !iszero(step_n % barostat.n_steps) && return false all(isnothing, barostat.pressure) && return false @@ -546,13 +546,13 @@ function MonteCarloMembraneBarostat(pressure, ) end -function apply_coupling!(sys::System{D, G, T}, +function apply_coupling!(sys::System{D, AT, T}, barostat::MonteCarloMembraneBarostat, sim, neighbors=nothing, step_n::Integer=0; n_threads::Integer=Threads.nthreads(), - rng=Random.default_rng()) where {D, G, T} + rng=Random.default_rng()) where {D, AT, T} !iszero(step_n % barostat.n_steps) && return false kT = energy_remove_mol(sys.k * barostat.temperature) diff --git a/src/energy.jl b/src/energy.jl index be8d5cdbe..c9b590b53 100644 --- a/src/energy.jl +++ 
b/src/energy.jl @@ -33,7 +33,7 @@ E_k = \frac{1}{2} \sum_{i} m_i v_i^2 ``` where ``m_i`` is the mass and ``v_i`` is the velocity of atom ``i``. """ -function kinetic_energy(sys::System{D, G, T}) where {D, G, T} +function kinetic_energy(sys::System) ke = kinetic_energy_noconvert(sys) return uconvert(sys.energy_units, ke) end @@ -255,7 +255,6 @@ end function potential_energy(sys::System{D, AT, T}, neighbors, step_n::Integer=0; n_threads::Integer=Threads.nthreads()) where {D, AT <: AbstractGPUArray, T} - n_atoms = length(sys) val_ft = Val(T) pe_vec_nounits = KernelAbstractions.zeros(get_backend(sys.coords), T, 1) buffers = init_forces_buffer!(sys, ustrip_vec.(zero(sys.coords)), 1) diff --git a/src/force.jl b/src/force.jl index bf1adf886..0ee26d682 100644 --- a/src/force.jl +++ b/src/force.jl @@ -166,8 +166,8 @@ function forces(sys, neighbors, step_n::Integer=0; n_threads::Integer=Threads.nt return forces_nounits .* sys.force_units end -function forces_nounits!(fs_nounits, sys::System{D, AT}, neighbors, fs_chunks=nothing, - step_n::Integer=0; n_threads::Integer=Threads.nthreads()) where {D, AT <: AbstractArray} +function forces_nounits!(fs_nounits, sys::System, neighbors, fs_chunks=nothing, + step_n::Integer=0; n_threads::Integer=Threads.nthreads()) pairwise_inters_nonl = filter(!use_neighbors, values(sys.pairwise_inters)) pairwise_inters_nl = filter( use_neighbors, values(sys.pairwise_inters)) sils_1_atoms = filter(il -> il isa InteractionList1Atoms, values(sys.specific_inter_lists)) @@ -402,5 +402,3 @@ function forces_nounits!(fs_nounits, sys::System{D, AT, T}, neighbors, return fs_nounits end - - diff --git a/src/interactions/implicit_solvent.jl b/src/interactions/implicit_solvent.jl index c05222722..668bf1682 100644 --- a/src/interactions/implicit_solvent.jl +++ b/src/interactions/implicit_solvent.jl @@ -412,10 +412,10 @@ function ImplicitSolventOBC(atoms::AbstractArray{Atom{TY, M, T, D, E}}, end if isa(atoms, AbstractGPUArray) - array_type = get_array_type(atoms) - or = array_type(offset_radii) - sor = array_type(scaled_offset_radii) - is, js = array_type(inds_i), array_type(inds_j) + AT = get_array_type(atoms) + or = AT(offset_radii) + sor = AT(scaled_offset_radii) + is, js = AT(inds_i), AT(inds_j) else or = offset_radii sor = scaled_offset_radii @@ -565,12 +565,12 @@ function ImplicitSolventGBN2(atoms::AbstractArray{Atom{TY, M, T, D, E}}, end if isa(atoms, AbstractGPUArray) - array_type = get_array_type(atoms) - or = array_type(offset_radii) - sor = array_type(scaled_offset_radii) - is, js = array_type(inds_i), array_type(inds_j) - d0s, m0s = array_type(table_d0), array_type(table_m0) - αs, βs, γs = array_type(αs_cpu), array_type(βs_cpu), array_type(γs_cpu) + AT = get_array_type(atoms) + or = AT(offset_radii) + sor = AT(scaled_offset_radii) + is, js = AT(inds_i), AT(inds_j) + d0s, m0s = AT(table_d0), AT(table_m0) + αs, βs, γs = AT(αs_cpu), AT(βs_cpu), AT(γs_cpu) else or = offset_radii sor = scaled_offset_radii @@ -798,7 +798,7 @@ function gbsa_born_gpu(coords::AbstractArray{SVector{D, C}}, offset_radii, scale kernel! = gbsa_born_kernel!(backend, n_threads_gpu) kernel!(Is_nounits, I_grads_nounits, coords, offset_radii, scaled_offset_radii, dist_cutoff, offset, neck_scale, - neck_cut, d0s, m0s, boundary, Val(C), ndrange = n_inters) + neck_cut, d0s, m0s, boundary, Val(C), ndrange=n_inters) Is = Is_nounits * unit(dist_cutoff)^-1 I_grads = I_grads_nounits * unit(dist_cutoff)^-2 @@ -975,7 +975,7 @@ function gbsa_force_1_gpu(coords::AbstractArray{SVector{D, C}}, boundary, dist_c kernel! 
= gbsa_force_1_kernel!(backend, n_threads_gpu) kernel!(fs_mat, born_forces_mod_ustrip, coords, boundary, dist_cutoff, factor_solute, factor_solvent, kappa, Bs, atom_charges, - Val(D), Val(force_units), ndrange = n_inters) + Val(D), Val(force_units), ndrange=n_inters) return fs_mat, born_forces_mod_ustrip end @@ -992,7 +992,7 @@ function gbsa_force_2_gpu(coords::AbstractArray{SVector{D, C}}, boundary, dist_c kernel! = gbsa_force_2_kernel!(backend, n_threads_gpu) kernel!(fs_mat, born_forces, coords, boundary, dist_cutoff, offset_radii, scaled_offset_radii, Bs, B_grads, I_grads, Val(D), Val(force_units), - ndrange = n_inters) + ndrange=n_inters) return fs_mat end @@ -1149,8 +1149,8 @@ function gb_energy_loop(coord_i, coord_j, i, j, charge_i, charge_j, Bi, Bj, ori, end end -function AtomsCalculators.potential_energy(sys::System{<:Any, AT, T}, inter::AbstractGBSA; - kwargs...) where {AT, T} +function AtomsCalculators.potential_energy(sys::System{<:Any, <:Any, T}, inter::AbstractGBSA; + kwargs...) where T coords, boundary = sys.coords, sys.boundary Bs, B_grads, I_grads = born_radii_and_grad(inter, coords, boundary) atom_charges = charge.(sys.atoms) @@ -1169,7 +1169,8 @@ function AtomsCalculators.potential_energy(sys::System{<:Any, AT, T}, inter::Abs return E end -function AtomsCalculators.potential_energy(sys::System{<:Any, AT}, inter::AbstractGBSA; kwargs...) where AT <: AbstractGPUArray +function AtomsCalculators.potential_energy(sys::System{<:Any, AT}, inter::AbstractGBSA; + kwargs...) where AT <: AbstractGPUArray coords, atoms, boundary = sys.coords, sys.atoms, sys.boundary Bs, B_grads, I_grads = born_radii_and_grad(inter, coords, boundary) diff --git a/src/kernels.jl b/src/kernels.jl index 1aca5f16d..6f620e9fa 100644 --- a/src/kernels.jl +++ b/src/kernels.jl @@ -1,9 +1,5 @@ # KernelAbstractions.jl kernels -function get_array_type(a::AT) where AT <: AbstractArray - return AT.name.wrapper -end - @inline function sum_pairwise_forces(inters, atom_i, atom_j, ::Val{F}, special, coord_i, coord_j, boundary, vel_i, vel_j, step_n) where F dr = vector(coord_i, coord_j, boundary) @@ -13,9 +9,6 @@ end end f = sum(f_tuple) if unit(f[1]) != F - # This triggers an error but it isn't printed - # See https://discourse.julialang.org/t/error-handling-in-cuda-kernels/79692 - # for how to throw a more meaningful error error("wrong force unit returned, was expecting $F but got $(unit(f[1]))") end return f @@ -37,7 +30,8 @@ function pairwise_force_gpu!(buffers, sys::System{D, AT, T}, if typeof(neighbors) == NoNeighborList n_threads_gpu = gpu_threads_pairwise(length(atoms)) kernel! 
= pairwise_force_kernel_nonl!(backend, n_threads_gpu) - kernel!(buffers.fs_mat, sys.coords, sys.velocities, sys.atoms, sys.boundary, pairwise_inters, step_n, Val(D), Val(force_units); ndrange = length(atoms)) + kernel!(buffers.fs_mat, sys.coords, sys.velocities, sys.atoms, sys.boundary, + pairwise_inters, step_n, Val(D), Val(force_units); ndrange = length(atoms)) elseif length(neighbors) > 0 nbs = @view neighbors.list[1:neighbors.n] n_threads_gpu = gpu_threads_pairwise(length(nbs)) @@ -58,7 +52,8 @@ end @inbounds if inter_i <= length(neighbors) i, j, special = neighbors[inter_i] - f = sum_pairwise_forces(inters, atoms[i], atoms[j], Val(F), special, coords[i], coords[j], boundary, velocities[i], velocities[j], step_n) + f = sum_pairwise_forces(inters, atoms[i], atoms[j], Val(F), special, coords[i], coords[j], + boundary, velocities[i], velocities[j], step_n) for dim in 1:D fval = ustrip(f[dim]) Atomix.@atomic forces[dim, i] = forces[dim, i] - fval @@ -77,7 +72,8 @@ end @inbounds for j = 1:i if i != j - f = sum_pairwise_forces(inters, atoms[i], atoms[j], Val(F), false, coords[i], coords[j], boundary, velocities[i], velocities[j], step_n) + f = sum_pairwise_forces(inters, atoms[i], atoms[j], Val(F), false, coords[i], coords[j], + boundary, velocities[i], velocities[j], step_n) for dim in 1:D fval = ustrip(f[dim]) Atomix.@atomic forces[dim, i] = forces[dim, i] - fval @@ -307,8 +303,8 @@ function specific_pe_gpu!(pe_vec_nounits, inter_list::InteractionList4Atoms, coo return pe_vec_nounits end -@kernel function specific_pe_1_atoms_kernel!(energy, @Const(coords), @Const(velocities), @Const(atoms), boundary, - step_n, @Const(is), @Const(inters), ::Val{E}) where E +@kernel function specific_pe_1_atoms_kernel!(energy, @Const(coords), @Const(velocities), + @Const(atoms), boundary, step_n, @Const(is), @Const(inters), ::Val{E}) where E inter_i = @index(Global, Linear) @@ -323,8 +319,9 @@ end end end -@kernel function specific_pe_2_atoms_kernel!(energy, @Const(coords), @Const(velocities), @Const(atoms), boundary, - step_n, @Const(is), @Const(js), @Const(inters), ::Val{E}) where E +@kernel function specific_pe_2_atoms_kernel!(energy, @Const(coords), @Const(velocities), + @Const(atoms), boundary, step_n, @Const(is), @Const(js), @Const(inters), + ::Val{E}) where E inter_i = @index(Global, Linear) @@ -340,8 +337,9 @@ end end end -@kernel function specific_pe_3_atoms_kernel!(energy, @Const(coords), @Const(velocities), @Const(atoms), boundary, - step_n, @Const(is), @Const(js), @Const(ks), @Const(inters), ::Val{E}) where E +@kernel function specific_pe_3_atoms_kernel!(energy, @Const(coords), @Const(velocities), + @Const(atoms), boundary, step_n, @Const(is), @Const(js), @Const(ks), + @Const(inters), ::Val{E}) where E inter_i = @index(Global, Linear) @@ -357,8 +355,9 @@ end end end -@kernel function specific_pe_4_atoms_kernel!(energy, @Const(coords), @Const(velocities), @Const(atoms), boundary, - step_n, @Const(is), @Const(js), @Const(ks), @Const(ls), @Const(inters), ::Val{E}) where E +@kernel function specific_pe_4_atoms_kernel!(energy, @Const(coords), @Const(velocities), + @Const(atoms), boundary, step_n, @Const(is), @Const(js), @Const(ks), + @Const(ls), @Const(inters), ::Val{E}) where E inter_i = @index(Global, Linear) diff --git a/src/neighbors.jl b/src/neighbors.jl index 61cf066a9..166630c1b 100644 --- a/src/neighbors.jl +++ b/src/neighbors.jl @@ -93,12 +93,12 @@ function DistanceNeighborFinder(; eligible, dist_cutoff, special, n_steps, zero(eligible)) end -function find_neighbors(sys::System{D, AT}, 
+function find_neighbors(sys::System, nf::DistanceNeighborFinder, current_neighbors=nothing, step_n::Integer=0, force_recompute::Bool=false; - n_threads::Integer=Threads.nthreads()) where {D, AT} + n_threads::Integer=Threads.nthreads()) if !force_recompute && !iszero(step_n % nf.n_steps) return current_neighbors end @@ -130,7 +130,6 @@ end @Const(coords), @Const(eligible), boundary, sq_dist_neighbors) - n_atoms = length(coords) n_inters = n_atoms_to_n_pairs(n_atoms) inter_i = @index(Global, Linear) @@ -166,7 +165,7 @@ function find_neighbors(sys::System{D, AT}, backend = get_backend(sys.coords) kernel! = distance_neighbor_finder_kernel!(backend, n_threads_gpu) kernel!(nf.neighbors, sys.coords, nf.eligible, sys.boundary, - nf.dist_cutoff^2, ndrange = n_inters) + nf.dist_cutoff^2, ndrange=n_inters) pairs = findall(nf.neighbors) nbsi, nbsj = getindex.(pairs, 1), getindex.(pairs, 2) @@ -198,12 +197,12 @@ function TreeNeighborFinder(; return TreeNeighborFinder{typeof(dist_cutoff)}(eligible, dist_cutoff, special, n_steps) end -function find_neighbors(sys::System, +function find_neighbors(sys::System{<:Any, AT}, nf::TreeNeighborFinder, current_neighbors=nothing, step_n::Integer=0, force_recompute::Bool=false; - n_threads::Integer=Threads.nthreads()) + n_threads::Integer=Threads.nthreads()) where AT if !force_recompute && !iszero(step_n % nf.n_steps) return current_neighbors end @@ -226,7 +225,7 @@ function find_neighbors(sys::System, end end - return NeighborList(length(neighbors_list), move_array(neighbors_list, sys)) + return NeighborList(length(neighbors_list), AT(neighbors_list)) end """ diff --git a/src/setup.jl b/src/setup.jl index 95ff28eb3..f12b4140f 100644 --- a/src/setup.jl +++ b/src/setup.jl @@ -428,8 +428,8 @@ are not available when reading Gromacs files. - `loggers=()`: the loggers that record properties of interest during a simulation. - `units::Bool=true`: whether to use Unitful quantities. -- `array_type::AbstractArray = Array`: The array_type desired for the simulation - (for GPU support, use CuArray or ROCArray) +- `array_type=Array`: the array type for the simulation, for example + use `CuArray` or `ROCArray` for GPU support. - `dist_cutoff=1.0u"nm"`: cutoff distance for long-range interactions. - `dist_neighbors=1.2u"nm"`: cutoff distance for the neighbor list, should be greater than `dist_cutoff`. @@ -452,7 +452,7 @@ function System(coord_file::AbstractString, velocities=nothing, loggers=(), units::Bool=true, - array_type::Type{AT} where AT <: AbstractArray = Array, + ::Type{AT}=Array, dist_cutoff=units ? 1.0u"nm" : 1.0, dist_neighbors=units ? 
1.2u"nm" : 1.2, center_coords::Bool=true, @@ -460,7 +460,7 @@ function System(coord_file::AbstractString, data=nothing, implicit_solvent=nothing, kappa=0.0u"nm^-1", - rename_terminal_res::Bool=true) + rename_terminal_res::Bool=true) where AT <: AbstractArray T = typeof(force_field.weight_14_coulomb) # Chemfiles uses zero-based indexing, be careful @@ -824,9 +824,9 @@ function System(coord_file::AbstractString, specific_inter_array = [] if length(bonds.is) > 0 push!(specific_inter_array, InteractionList2Atoms( - array_type(bonds.is), - array_type(bonds.js), - array_type([bonds.inters...]), + AT(bonds.is), + AT(bonds.js), + AT([bonds.inters...]), bonds.types, )) topology = MolecularTopology(bonds.is, bonds.js, n_atoms) @@ -835,30 +835,30 @@ function System(coord_file::AbstractString, end if length(angles.is) > 0 push!(specific_inter_array, InteractionList3Atoms( - array_type(angles.is), - array_type(angles.js), - array_type(angles.ks), - array_type([angles.inters...]), + AT(angles.is), + AT(angles.js), + AT(angles.ks), + AT([angles.inters...]), angles.types, )) end if length(torsions.is) > 0 push!(specific_inter_array, InteractionList4Atoms( - array_type(torsions.is), - array_type(torsions.js), - array_type(torsions.ks), - array_type(torsions.ls), - array_type(torsion_inters_pad), + AT(torsions.is), + AT(torsions.js), + AT(torsions.ks), + AT(torsions.ls), + AT(torsion_inters_pad), torsions.types, )) end if length(impropers.is) > 0 push!(specific_inter_array, InteractionList4Atoms( - array_type(impropers.is), - array_type(impropers.js), - array_type(impropers.ks), - array_type(impropers.ls), - array_type(improper_inters_pad), + AT(impropers.is), + AT(impropers.js), + AT(impropers.ks), + AT(impropers.ls), + AT(improper_inters_pad), impropers.types, )) end @@ -887,11 +887,11 @@ function System(coord_file::AbstractString, end coords = wrap_coords.(coords, (boundary_used,)) - if (array_type <: AbstractGPUArray) + if AT <: AbstractGPUArray neighbor_finder = GPUNeighborFinder( - eligible=array_type(eligible), + eligible=AT(eligible), dist_cutoff=T(dist_neighbors), - special=array_type(special), + special=AT(special), n_steps_reorder=10, initialized=false, ) @@ -913,8 +913,8 @@ function System(coord_file::AbstractString, ) end - atoms = array_type([atoms_abst...]) - coords_dev = array_type(coords) + atoms = AT([atoms_abst...]) + coords_dev = AT(coords) if isnothing(velocities) if units @@ -969,12 +969,12 @@ function System(T::Type, velocities=nothing, loggers=(), units::Bool=true, - array_type::Type{AT} where AT <: AbstractArray = Array, + ::Type{AT}=Array, dist_cutoff=units ? 1.0u"nm" : 1.0, dist_neighbors=units ? 
1.2u"nm" : 1.2, center_coords::Bool=true, use_cell_list::Bool=true, - data=nothing) + data=nothing) where AT <: AbstractArray # Read force field and topology file atomtypes = Dict{String, Atom}() bondtypes = Dict{String, HarmonicBond}() @@ -1250,9 +1250,9 @@ function System(T::Type, specific_inter_array = [] if length(bonds.is) > 0 push!(specific_inter_array, InteractionList2Atoms( - array_type(bonds.is), - array_type(bonds.js), - array_type([bonds.inters...]), + AT(bonds.is), + AT(bonds.js), + AT([bonds.inters...]), bonds.types, )) topology = MolecularTopology(bonds.is, bonds.js, n_atoms) @@ -1261,30 +1261,30 @@ function System(T::Type, end if length(angles.is) > 0 push!(specific_inter_array, InteractionList3Atoms( - array_type(angles.is), - array_type(angles.js), - array_type(angles.ks), - array_type([angles.inters...]), + AT(angles.is), + AT(angles.js), + AT(angles.ks), + AT([angles.inters...]), angles.types, )) end if length(torsions.is) > 0 push!(specific_inter_array, InteractionList4Atoms( - array_type(torsions.is), - array_type(torsions.js), - array_type(torsions.ks), - array_type(torsions.ls), - array_type([torsions.inters...]), + AT(torsions.is), + AT(torsions.js), + AT(torsions.ks), + AT(torsions.ls), + AT([torsions.inters...]), torsions.types, )) end specific_inter_lists = tuple(specific_inter_array...) - if array_type <: AbstractGPUArray + if AT <: AbstractGPUArray neighbor_finder = GPUNeighborFinder( - eligible=array_type(eligible), + eligible=AT(eligible), dist_cutoff=T(dist_neighbors), - special=array_type(special), + special=AT(special), n_steps_reorder=10, initialized=false, ) @@ -1306,8 +1306,8 @@ function System(T::Type, ) end - atoms = array_type([atoms_abst...]) - coords_dev = array_type(coords) + atoms = AT([atoms_abst...]) + coords_dev = AT(coords) if isnothing(velocities) if units @@ -1374,10 +1374,10 @@ The `atom_selector` function takes in each atom and atom data and determines whe that atom. For example, [`is_heavy_atom`](@ref) means non-hydrogen atoms are restrained. """ -function add_position_restraints(sys, +function add_position_restraints(sys::System{<:Any, AT}, k; atom_selector::Function=is_any_atom, - restrain_coords=sys.coords) + restrain_coords=sys.coords) where AT k_array = isa(k, AbstractArray) ? k : fill(k, length(sys)) if length(k_array) != length(sys) throw(ArgumentError("the system has $(length(sys)) atoms but there are $(length(k_array)) k values")) @@ -1394,7 +1394,7 @@ function add_position_restraints(sys, push!(inters, HarmonicPositionRestraint(k_res, x0)) end end - restraints = InteractionList1Atoms(move_array(is, sys), move_array([inters...], sys), types) + restraints = InteractionList1Atoms(AT(is), AT([inters...]), types) sis = (sys.specific_inter_lists..., restraints) return System( atoms=deepcopy(sys.atoms), diff --git a/src/simulators.jl b/src/simulators.jl index 748e08690..141fc0a47 100644 --- a/src/simulators.jl +++ b/src/simulators.jl @@ -831,12 +831,12 @@ Attempt an exchange of replicas `n` and `m` in a [`ReplicaSystem`](@ref) during Successful exchanges should exchange coordinates and velocities as appropriate. Returns acceptance quantity `Δ` and a `Bool` indicating whether the exchange was successful. 
""" -function remd_exchange!(sys::ReplicaSystem{D, AT, T}, +function remd_exchange!(sys::ReplicaSystem, sim::TemperatureREMD, n::Integer, m::Integer; n_threads::Integer=Threads.nthreads(), - rng=Random.default_rng()) where {D, AT, T} + rng=Random.default_rng()) T_n, T_m = sim.temperatures[n], sim.temperatures[m] β_n, β_m = inv(sys.k * T_n), inv(sys.k * T_m) neighbors_n = find_neighbors(sys.replicas[n], sys.replicas[n].neighbor_finder; @@ -922,12 +922,12 @@ function simulate!(sys::ReplicaSystem, return simulate_remd!(sys, sim, n_steps; n_threads=n_threads, run_loggers=run_loggers, rng=rng) end -function remd_exchange!(sys::ReplicaSystem{D, AT, T}, +function remd_exchange!(sys::ReplicaSystem, sim::HamiltonianREMD, n::Integer, m::Integer; n_threads::Integer=Threads.nthreads(), - rng=Random.default_rng()) where {D, AT, T} + rng=Random.default_rng()) T_sim = sim.temperature β_sim = inv(sys.k * T_sim) neighbors_n = find_neighbors(sys.replicas[n], sys.replicas[n].neighbor_finder; @@ -1047,12 +1047,12 @@ function MetropolisMonteCarlo(; temperature, trial_moves, trial_args=Dict()) return MetropolisMonteCarlo(temperature, trial_moves, trial_args) end -@inline function simulate!(sys::System{D, AT, T}, +@inline function simulate!(sys::System, sim::MetropolisMonteCarlo, n_steps::Integer; n_threads::Integer=Threads.nthreads(), run_loggers=true, - rng=Random.default_rng()) where {D, AT, T} + rng=Random.default_rng()) neighbors = find_neighbors(sys, sys.neighbor_finder; n_threads=n_threads) E_old = potential_energy(sys, neighbors; n_threads=n_threads) coords_old = similar(sys.coords) diff --git a/src/spatial.jl b/src/spatial.jl index 3895ec1ba..797f6ef5c 100644 --- a/src/spatial.jl +++ b/src/spatial.jl @@ -634,7 +634,6 @@ function random_velocities!(sys, temp; rng=Random.default_rng()) end function random_velocities!(vels, sys::AbstractSystem, temp; rng=Random.default_rng()) - vs = random_velocities(sys, temp; rng=rng) vels .= random_velocities(sys, temp; rng=rng) return vels end @@ -876,8 +875,8 @@ function molecule_centers(coords::AbstractArray{SVector{D, C}}, boundary, topolo end function molecule_centers(coords::AbstractGPUArray, boundary, topology) - array_type = get_array_type(coords) - return array_type(molecule_centers(Array(coords), boundary, topology)) + AT = get_array_type(coords) + return AT(molecule_centers(Array(coords), boundary, topology)) end # Allows scaling multiple vectors at once by broadcasting this function @@ -897,7 +896,7 @@ This can be disabled with `ignore_molecules=true`. Not currently compatible with [`TriclinicBoundary`](@ref) if the topology is set. 
""" -function scale_coords!(sys, scale_factor; ignore_molecules=false) +function scale_coords!(sys::System{<:Any, AT}, scale_factor; ignore_molecules=false) where AT if ignore_molecules || isnothing(sys.topology) sys.boundary = scale_boundary(sys.boundary, scale_factor) sys.coords .= scale_vec.(sys.coords, Ref(scale_factor)) @@ -928,7 +927,7 @@ function scale_coords!(sys, scale_factor; ignore_molecules=false) coords_nounits[i] = wrap_coords( coords_nounits[i] .+ shift_vecs[mi] .- center_shifts[mi], boundary_nounits) end - sys.coords .= move_array(coords_nounits .* coord_units, sys) + sys.coords .= AT(coords_nounits .* coord_units) end return sys end diff --git a/src/types.jl b/src/types.jl index 817ad29f3..93c6ae7a1 100644 --- a/src/types.jl +++ b/src/types.jl @@ -20,8 +20,7 @@ export masses, charges, MollyCalculator, - ASECalculator, - NoNeighborList + ASECalculator const DefaultFloat = Float64 @@ -183,23 +182,23 @@ function Base.:+(il1::InteractionList4Atoms{I, T}, il2::InteractionList4Atoms{I, ) end -function inject_interaction_list(inter::InteractionList1Atoms, params_dic, array_type) - inters_grad = array_type(inject_interaction.(Array(inter.inters), inter.types, (params_dic,))) +function inject_interaction_list(inter::InteractionList1Atoms, params_dic, AT) + inters_grad = AT(inject_interaction.(Array(inter.inters), inter.types, (params_dic,))) InteractionList1Atoms(inter.is, inters_grad, inter.types) end -function inject_interaction_list(inter::InteractionList2Atoms, params_dic, array_type) - inters_grad = array_type(inject_interaction.(Array(inter.inters), inter.types, (params_dic,))) +function inject_interaction_list(inter::InteractionList2Atoms, params_dic, AT) + inters_grad = AT(inject_interaction.(Array(inter.inters), inter.types, (params_dic,))) InteractionList2Atoms(inter.is, inter.js, inters_grad, inter.types) end -function inject_interaction_list(inter::InteractionList3Atoms, params_dic, array_type) - inters_grad = array_type(inject_interaction.(Array(inter.inters), inter.types, (params_dic,))) +function inject_interaction_list(inter::InteractionList3Atoms, params_dic, AT) + inters_grad = AT(inject_interaction.(Array(inter.inters), inter.types, (params_dic,))) InteractionList3Atoms(inter.is, inter.js, inter.ks, inters_grad, inter.types) end -function inject_interaction_list(inter::InteractionList4Atoms, params_dic, array_type) - inters_grad = array_type(inject_interaction.(Array(inter.inters), inter.types, (params_dic,))) +function inject_interaction_list(inter::InteractionList4Atoms, params_dic, AT) + inters_grad = AT(inject_interaction.(Array(inter.inters), inter.types, (params_dic,))) InteractionList4Atoms(inter.is, inter.js, inter.ks, inter.ls, inters_grad, inter.types) end @@ -465,7 +464,7 @@ interface described there. - `data::DA=nothing`: arbitrary data associated with the system. """ mutable struct System{D, AT, T, A, C, B, V, AD, TO, PI, SI, GI, CN, NF, - L, F, E, K, M, DA} <: AbstractSystem{D} + L, F, E, K, M, DA} <: AtomsBase.AbstractSystem{D} atoms::A coords::C boundary::B @@ -826,7 +825,7 @@ construction where `n` is the number of threads to be used per replica. modified in some simulations. `k` is chosen based on the `energy_units` given. - `data::DA=nothing`: arbitrary data associated with the replica system. 
""" -mutable struct ReplicaSystem{D, AT, T, A, AD, EL, F, E, K, R, DA} <: AbstractSystem{D} +mutable struct ReplicaSystem{D, AT, T, A, AD, EL, F, E, K, R, DA} <: AtomsBase.AbstractSystem{D} atoms::A n_replicas::Int atoms_data::AD @@ -863,7 +862,6 @@ function ReplicaSystem(; k=default_k(energy_units), data=nothing) D = AtomsBase.n_dimensions(boundary) - D = n_dimensions(boundary) AT = get_array_type(replica_coords[1]) T = float_type(boundary) A = typeof(atoms) @@ -1019,6 +1017,13 @@ function ReplicaSystem(; energy_units, k_converted, replicas, data) end +# Rename, export, docstring +function get_array_type(::AT) where AT + return AT.name.wrapper +end + +get_array_type(::Union{System{D, AT}, ReplicaSystem{D, AT}}) where {D, AT} = AT + """ is_on_gpu(sys) @@ -1050,9 +1055,6 @@ The partial charges of the atoms in a [`System`](@ref) or [`ReplicaSystem`](@ref charges(s::Union{System, ReplicaSystem}) = charge.(s.atoms) charge(s::Union{System, ReplicaSystem}, i::Integer) = charge(s.atoms[i]) -# Move an array to the GPU depending on whether the system is on the GPU -move_array(arr, ::System{D, AT}) where {D, AT} = AT(arr) - Base.getindex(s::Union{System, ReplicaSystem}, i::Union{Integer, AbstractVector}) = s.atoms[i] Base.length(s::Union{System, ReplicaSystem}) = length(s.atoms) Base.eachindex(s::Union{System, ReplicaSystem}) = Base.OneTo(length(s)) diff --git a/test/basic.jl b/test/basic.jl index fe24454d7..288051856 100644 --- a/test/basic.jl +++ b/test/basic.jl @@ -176,22 +176,22 @@ @test mcs == [SVector(0.05, 0.0), SVector(1.0, 1.0)] ff = MolecularForceField(joinpath.(ff_dir, ["ff99SBildn.xml", "tip3p_standard.xml", "his.xml"])...) - for array_type in array_list - sys = System(joinpath(data_dir, "6mrr_equil.pdb"), ff; array_type=array_type, use_cell_list=false) + for AT in array_list + sys = System(joinpath(data_dir, "6mrr_equil.pdb"), ff; array_type=AT, use_cell_list=false) mcs = molecule_centers(sys.coords, sys.boundary, sys.topology) @test isapprox(Array(mcs)[1], mean(sys.coords[1:1170]); atol=0.08u"nm") # Mark all pairs as ineligible for pairwise interactions and check that the # potential energy from the specific interactions does not change on scaling no_nbs = falses(length(sys), length(sys)) - if array_type <: AbstractGPUArray + if AT <: AbstractGPUArray sys.neighbor_finder = GPUNeighborFinder( - eligible=array_type(no_nbs), + eligible=AT(no_nbs), dist_cutoff=1.0u"nm", ) else sys.neighbor_finder = DistanceNeighborFinder( - eligible=(array_type <: AbstractGPUArray ? 
array_type(no_nbs) : no_nbs), + eligible=AT(no_nbs), dist_cutoff=1.0u"nm", ) end diff --git a/test/energy_conservation.jl b/test/energy_conservation.jl index c2a423bae..48fba4a98 100644 --- a/test/energy_conservation.jl +++ b/test/energy_conservation.jl @@ -6,7 +6,8 @@ using CUDA using Test @testset "Lennard-Jones energy conservation" begin - function test_energy_conservation(nl::Bool, array_type::AbstractArray, n_threads::Integer, n_steps::Integer) + function test_energy_conservation(nl::Bool, ::Type{AT}, n_threads::Integer, + n_steps::Integer) where AT n_atoms = 2_000 atom_mass = 40.0u"g/mol" temp = 1.0u"K" @@ -41,8 +42,8 @@ using Test end sys = System( - atoms=(array_type(atoms) : atoms), - coords=(array_type(coords) : coords), + atoms=AT(atoms), + coords=AT(coords), boundary=boundary, pairwise_inters=(LennardJones(cutoff=cutoff, use_neighbors=ifelse(nl, true, false)),), neighbor_finder=neighbor_finder, @@ -78,9 +79,9 @@ using Test test_energy_conservation(true, Array, Threads.nthreads(), 50_000) test_energy_conservation(false, Array, Threads.nthreads(), 50_000) end - for array_type in array_list[2:end] - test_energy_conservation(true, array_type, 1, 100_000) - test_energy_conservation(false, array_type, 1, 100_000) + for AT in array_list[2:end] + test_energy_conservation(true, AT, 1, 100_000) + test_energy_conservation(false, AT, 1, 100_000) end end diff --git a/test/gradients.jl b/test/gradients.jl index 1013ef9ae..196d4e740 100644 --- a/test/gradients.jl +++ b/test/gradients.jl @@ -251,13 +251,13 @@ end end @testset "Differentiable protein" begin - function create_sys(array_type) + function create_sys(AT) ff = MolecularForceField(joinpath.(ff_dir, ["ff99SBildn.xml", "his.xml"])...; units=false) return System( joinpath(data_dir, "6mrr_nowater.pdb"), ff; units=false, - array_type=array_type, + array_type=AT, implicit_solvent="gbn2", kappa=0.7, ) diff --git a/test/minimization.jl b/test/minimization.jl index 5a75a0e27..7baaaa1d3 100644 --- a/test/minimization.jl +++ b/test/minimization.jl @@ -42,14 +42,14 @@ @test isapprox(potential_energy(sys; n_threads=1) * u"kJ * mol^-1", -3.0u"kJ * mol^-1"; atol=1e-4u"kJ * mol^-1") - for array_type in array_list[2:end] - coords = array_type([ + for AT in array_list[2:end] + coords = AT([ SVector(1.0, 1.0, 1.0)u"nm", SVector(1.6, 1.0, 1.0)u"nm", SVector(1.4, 1.6, 1.0)u"nm", ]) sys = System( - atoms=array_type([Atom(σ=(0.4 / (2 ^ (1 / 6)))u"nm", ϵ=1.0u"kJ * mol^-1") for i in 1:3]), + atoms=AT([Atom(σ=(0.4 / (2 ^ (1 / 6)))u"nm", ϵ=1.0u"kJ * mol^-1") for i in 1:3]), coords=coords, boundary=CubicBoundary(5.0u"nm"), pairwise_inters=(LennardJones(),), diff --git a/test/protein.jl b/test/protein.jl index 4327e37da..c68016527 100644 --- a/test/protein.jl +++ b/test/protein.jl @@ -179,12 +179,12 @@ end @test pis_grad == sys_nounits.pairwise_inters # Test the same simulation on the GPU - for array_type in array_list[2:end] + for AT in array_list[2:end] sys = System( joinpath(data_dir, "6mrr_equil.pdb"), ff; - velocities=array_type(deepcopy(velocities_start)), - array_type = array_type, + velocities=AT(copy(velocities_start)), + array_type=AT, center_coords=false, ) @test kinetic_energy(sys) ≈ 65521.87288132431u"kJ * mol^-1" @@ -211,9 +211,9 @@ end sys_nounits = System( joinpath(data_dir, "6mrr_equil.pdb"), ff_nounits; - velocities=array_type(deepcopy(ustrip_vec.(velocities_start))), + velocities=AT(copy(ustrip_vec.(velocities_start))), units=false, - array_type = array_type, + array_type=AT, center_coords=false, ) @test kinetic_energy(sys_nounits)u"kJ * 
mol^-1" ≈ 65521.87288132431u"kJ * mol^-1" @@ -248,13 +248,13 @@ end @testset "Implicit solvent" begin ff = MolecularForceField(joinpath.(ff_dir, ["ff99SBildn.xml", "his.xml"])...) - for array_type in array_list + for AT in array_list for solvent_model in ("obc2", "gbn2") sys = System( joinpath(data_dir, "6mrr_nowater.pdb"), ff; boundary=CubicBoundary(100.0u"nm"), - array_type = array_type, + array_type=AT, dist_cutoff=5.0u"nm", dist_neighbors=5.0u"nm", implicit_solvent=solvent_model, diff --git a/test/simulation.jl b/test/simulation.jl index eb0bc5516..9ebb6cbcf 100644 --- a/test/simulation.jl +++ b/test/simulation.jl @@ -574,7 +574,7 @@ end end @testset "Position restraints" begin - for array_type in array_list + for AT in array_list n_atoms = 10 n_atoms_res = n_atoms ÷ 2 n_steps = 2_000 @@ -585,8 +585,8 @@ end sim = Langevin(dt=0.001u"ps", temperature=300.0u"K", friction=1.0u"ps^-1") sys = System( - atoms=array_type(atoms), - coords=array_type(deepcopy(starting_coords)), + atoms=AT(atoms), + coords=AT(copy(starting_coords)), boundary=boundary, atoms_data=atoms_data, pairwise_inters=(LennardJones(),), @@ -1077,14 +1077,14 @@ end vvand_baro = VelocityVerlet(dt=dt, coupling=(AndersenThermostat(temp, 1.0u"ps"), barostat)) for sim in (lang_baro, vvand_baro) - for array_type in array_list - if array_type <: AbstractGPUArray && sim == vvand_baro + for AT in array_list + if AT <: AbstractGPUArray && sim == vvand_baro continue end sys = System( - atoms=array_type(atoms), - coords=array_type(deepcopy(coords)), + atoms=AT(atoms), + coords=AT(copy(coords)), boundary=boundary, pairwise_inters=(LennardJones(),), loggers=( @@ -1140,15 +1140,15 @@ end SVector(nothing , nothing , nothing ), # Uncoupled ) - for array_type in array_list + for AT in array_list for (press_i, press) in enumerate(pressure_test_set) - if array_type <: AbstractGPUArray && press_i != 2 + if AT <: AbstractGPUArray && press_i != 2 continue end sys = System( - atoms=array_type(atoms), - coords=array_type(deepcopy(coords)), + atoms=AT(atoms), + coords=AT(copy(coords)), boundary=boundary, pairwise_inters=(LennardJones(),), loggers=( @@ -1208,15 +1208,15 @@ end MonteCarloMembraneBarostat(press, tens, temp, boundary; z_axis_fixed=true), ) - for array_type in array_list + for AT in array_list for (barostat_i, barostat) in enumerate(barostat_test_set) - if array_type <: AbstractGPUArray && barostat_i != 2 + if AT <: AbstractGPUArray && barostat_i != 2 continue end sys = System( - atoms=array_type(atoms), - coords=array_type(deepcopy(coords)), + atoms=AT(atoms), + coords=AT(copy(coords)), boundary=boundary, pairwise_inters=(LennardJones(),), loggers=( @@ -1330,8 +1330,7 @@ end starting_coords_f32 = [Float32.(c) for c in starting_coords] starting_velocities_f32 = [Float32.(c) for c in starting_velocities] - function test_sim(nl::Bool, parallel::Bool, f32::Bool, - array_type::Type{AT}) where AT <: AbstractArray + function test_sim(nl::Bool, parallel::Bool, f32::Bool, ::Type{AT}) where AT n_atoms = 400 n_steps = 200 atom_mass = f32 ? 10.0f0u"g/mol" : 10.0u"g/mol" @@ -1341,9 +1340,9 @@ end r0 = f32 ? 
0.2f0u"nm" : 0.2u"nm" bonds = [HarmonicBond(k=k, r0=r0) for i in 1:(n_atoms ÷ 2)] specific_inter_lists = (InteractionList2Atoms( - array_type(Int32.(collect(1:2:n_atoms))), - array_type(Int32.(collect(2:2:n_atoms))), - array_type(bonds), + AT(Int32.(collect(1:2:n_atoms))), + AT(Int32.(collect(2:2:n_atoms))), + AT(bonds), ),) neighbor_finder = NoNeighborFinder() @@ -1359,7 +1358,7 @@ end end if nl && !gpu neighbor_finder = DistanceNeighborFinder( - eligible=array_type(trues(n_atoms, n_atoms)), + eligible=AT(trues(n_atoms, n_atoms)), n_steps=10, dist_cutoff=f32 ? 1.5f0u"nm" : 1.5u"nm", ) @@ -1367,9 +1366,9 @@ end end show(devnull, neighbor_finder) - coords = array_type(deepcopy(f32 ? starting_coords_f32 : starting_coords)) - velocities = array_type(deepcopy(f32 ? starting_velocities_f32 : starting_velocities)) - atoms = array_type([Atom(charge=f32 ? 0.0f0 : 0.0, mass=atom_mass, σ=f32 ? 0.2f0u"nm" : 0.2u"nm", + coords = AT(copy(f32 ? starting_coords_f32 : starting_coords)) + velocities = AT(copy(f32 ? starting_velocities_f32 : starting_velocities)) + atoms = AT([Atom(charge=f32 ? 0.0f0 : 0.0, mass=atom_mass, σ=f32 ? 0.2f0u"nm" : 0.2u"nm", ϵ=f32 ? 0.2f0u"kJ * mol^-1" : 0.2u"kJ * mol^-1") for i in 1:n_atoms]) s = System( @@ -1382,7 +1381,7 @@ end neighbor_finder=neighbor_finder, ) - @test is_on_gpu(s) == (array_type <: AbstractGPUArray) + @test is_on_gpu(s) == (AT <: AbstractGPUArray) @test float_type(s) == (f32 ? Float32 : Float64) n_threads = parallel ? Threads.nthreads() : 1 From fc52f529edb32e31c4d8d7dbf0a3e47f3a2cfbef Mon Sep 17 00:00:00 2001 From: James Schloss Date: Fri, 24 Jan 2025 13:50:53 +0100 Subject: [PATCH 08/24] small changes, still broken --- src/kernels.jl | 38 +++++++++++++++++++++++++++++++++++--- 1 file changed, 35 insertions(+), 3 deletions(-) diff --git a/src/kernels.jl b/src/kernels.jl index a7ea220dc..03ec29f87 100644 --- a/src/kernels.jl +++ b/src/kernels.jl @@ -230,10 +230,42 @@ end function pairwise_pe_gpu!(pe_vec_nounits, buffers, sys::System{D, AT, T}, pairwise_inters, nbs, step_n) where {D, AT <: AbstractGPUArray, T} backend = get_backend(sys.coords) - n_threads_gpu = gpu_threads_pairwise(length(nbs)) - kernel! = pairwise_pe_kernel!(backend, n_threads_gpu) - kernel!(pe_vec_nounits, sys.coords, sys.velocities, sys.atoms, sys.boundary, pairwise_inters, nbs, step_n, Val(energy_units); ndrange = length(nbs)) + if typeof(nbs) == Nothing + n_threads_gpu = gpu_threads_pairwise(length(sys.coords)) + kernel! = pairwise_pe_kernel_nonl!(backend, n_threads_gpu) + kernel!(pe_vec_nounits, sys.coords, sys.velocities, sys.atoms, sys.boundary, pairwise_inters, step_n, Val(energy_units); ndrange = length(sys.coords)) + else + n_threads_gpu = gpu_threads_pairwise(length(nbs)) + kernel! 
= pairwise_pe_kernel!(backend, n_threads_gpu) + kernel!(pe_vec_nounits, sys.coords, sys.velocities, sys.atoms, sys.boundary, pairwise_inters, nbs, step_n, Val(energy_units); ndrange = length(nbs)) + end return pe_vec_nounits + +end + +@kernel function pairwise_pe_kernel_nonl!(energy, @Const(coords), + @Const(velocities), + @Const(atoms), boundary, inters, + step_n, + ::Val{E}) where E + + i = @index(Global, Linear) + + for j = i+1:length(coords) + special = false + coord_i, coord_j, vel_i, vel_j = coords[i], coords[j], velocities[i], velocities[j] + dr = vector(coord_i, coord_j, boundary) + pe = potential_energy_gpu(inters[1], dr, atoms[i], atoms[j], E, special, coord_i, coord_j, + boundary, vel_i, vel_j, step_n) + for inter in inters[2:end] + pe += potential_energy_gpu(inter, dr, atoms[i], atoms[j], E, special, coord_i, coord_j, + boundary, vel_i, vel_j, step_n) + end + if unit(pe) != E + error("wrong energy unit returned, was expecting $E but got $(unit(pe))") + end + Atomix.@atomic energy[1] += ustrip(pe) + end end @kernel function pairwise_pe_kernel!(energy, @Const(coords), @Const(velocities), From 8785dc481137003b371ae3dd5089d4c7d333f9b2 Mon Sep 17 00:00:00 2001 From: Joe Greener Date: Fri, 24 Jan 2025 17:03:28 +0000 Subject: [PATCH 09/24] various changes --- benchmark/protein.jl | 6 +- docs/src/documentation.md | 2 +- src/analysis.jl | 2 +- src/interactions/implicit_solvent.jl | 4 +- src/kernels.jl | 54 ++++++--------- src/neighbors.jl | 4 +- src/setup.jl | 4 +- src/spatial.jl | 2 +- src/types.jl | 40 ++++++----- test/basic.jl | 28 ++------ test/energy_conservation.jl | 50 ++++++++------ test/gradients.jl | 12 ++-- test/minimization.jl | 2 - test/runtests.jl | 8 +-- test/simulation.jl | 100 ++++++++++++--------------- 15 files changed, 145 insertions(+), 173 deletions(-) diff --git a/benchmark/protein.jl b/benchmark/protein.jl index 0d2f86614..7ff549c22 100644 --- a/benchmark/protein.jl +++ b/benchmark/protein.jl @@ -29,7 +29,7 @@ function setup_system(::Type{AT}, f32::Bool, units::Bool) where AT ff; velocities=AT(velocities), units=units, - gpu=gpu, + array_type=AT, dist_cutoff=(units ? dist_cutoff * u"nm" : dist_cutoff), dist_neighbors=(units ? dist_neighbors * u"nm" : dist_neighbors), ) @@ -53,9 +53,9 @@ runs = [ ("GPU f32 nounits" , CuArray, false, true , false), ] -for (run_name, gpu, parallel, f32, units) in runs +for (run_name, AT, parallel, f32, units) in runs n_threads_used = parallel ? n_threads : 1 - sys, sim = setup_system(gpu, f32, units) + sys, sim = setup_system(AT, f32, units) simulate!(deepcopy(sys), sim, 20; n_threads=n_threads_used) println(run_name) @time simulate!(sys, sim, n_steps; n_threads=n_threads_used) diff --git a/docs/src/documentation.md b/docs/src/documentation.md index f2cd85ad9..ce6d7c242 100644 --- a/docs/src/documentation.md +++ b/docs/src/documentation.md @@ -1332,7 +1332,7 @@ The available neighbor finders are: - [`DistanceNeighborFinder`](@ref) - [`TreeNeighborFinder`](@ref) -The recommended neighbor finder is [`CellListMapNeighborFinder`](@ref) on CPU and [`GPUNeighborFinder`](@ref) on GPU. +The recommended neighbor finder is [`CellListMapNeighborFinder`](@ref) on CPU, [`GPUNeighborFinder`](@ref) on NVIDIA GPUs and [`DistanceNeighborFinder`](@ref) on other GPUs. When using a neighbor finder you should in general also use an interaction cutoff (see [Cutoffs](@ref)) with a cutoff distance less than the neighbor finder distance. 
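As a minimal sketch of that pairing (values are illustrative and `DistanceNeighborFinder` is shown for brevity; the same two distances apply to the other finders):

```julia
n_atoms = 100
pairwise_inters = (LennardJones(cutoff=DistanceCutoff(1.0u"nm"), use_neighbors=true),)
neighbor_finder = DistanceNeighborFinder(
    eligible=trues(n_atoms, n_atoms), # all pairs eligible for the neighbor list
    n_steps=10,                       # rebuild the list every 10 steps
    dist_cutoff=1.2u"nm",             # neighbor distance above the 1.0 nm interaction cutoff
)
```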
The difference between the two should be larger than an atom can move in the time of the `n_steps` defined by the neighbor finder. The exception is [`GPUNeighborFinder`](@ref), which uses the algorithm from [Eastman and Pande 2010](https://doi.org/10.1002/jcc.21413) to avoid calculating a neighbor list and should have `dist_cutoff` set to the interaction cutoff distance. diff --git a/src/analysis.jl b/src/analysis.jl index 1c69fa656..5ad057b97 100644 --- a/src/analysis.jl +++ b/src/analysis.jl @@ -88,7 +88,7 @@ Calculate the hydrodynamic radius of a set of coordinates. """ function hydrodynamic_radius(coords::AbstractArray{SVector{D, T}}, boundary) where {D, T} n_atoms = length(coords) - diag = get_array_type(coords)(Diagonal(ones(T, n_atoms))) + diag = array_type(coords)(Diagonal(ones(T, n_atoms))) dists = distances(coords, boundary) .+ diag sum_inv_dists = sum(inv.(dists)) - sum(inv(diag)) inv_R_hyd = sum_inv_dists / (2 * n_atoms^2) diff --git a/src/interactions/implicit_solvent.jl b/src/interactions/implicit_solvent.jl index 668bf1682..860314a91 100644 --- a/src/interactions/implicit_solvent.jl +++ b/src/interactions/implicit_solvent.jl @@ -412,7 +412,7 @@ function ImplicitSolventOBC(atoms::AbstractArray{Atom{TY, M, T, D, E}}, end if isa(atoms, AbstractGPUArray) - AT = get_array_type(atoms) + AT = array_type(atoms) or = AT(offset_radii) sor = AT(scaled_offset_radii) is, js = AT(inds_i), AT(inds_j) @@ -565,7 +565,7 @@ function ImplicitSolventGBN2(atoms::AbstractArray{Atom{TY, M, T, D, E}}, end if isa(atoms, AbstractGPUArray) - AT = get_array_type(atoms) + AT = array_type(atoms) or = AT(offset_radii) sor = AT(scaled_offset_radii) is, js = AT(inds_i), AT(inds_j) diff --git a/src/kernels.jl b/src/kernels.jl index 6f620e9fa..d8e284a91 100644 --- a/src/kernels.jl +++ b/src/kernels.jl @@ -26,20 +26,23 @@ end function pairwise_force_gpu!(buffers, sys::System{D, AT, T}, pairwise_inters, neighbors, step_n) where {D, AT <: AbstractGPUArray, T} - backend = get_backend(coords) + if isnothing(neighbors) + error("neighbors is nothing, if you are using GPUNeighborFinder on a non-NVIDIA GPU you " * + "should use DistanceNeighborFinder instead") + end if typeof(neighbors) == NoNeighborList - n_threads_gpu = gpu_threads_pairwise(length(atoms)) - kernel! = pairwise_force_kernel_nonl!(backend, n_threads_gpu) - kernel!(buffers.fs_mat, sys.coords, sys.velocities, sys.atoms, sys.boundary, - pairwise_inters, step_n, Val(D), Val(force_units); ndrange = length(atoms)) - elseif length(neighbors) > 0 + nbs = neighbors + else nbs = @view neighbors.list[1:neighbors.n] + end + if length(neighbors) > 0 + backend = get_backend(coords) n_threads_gpu = gpu_threads_pairwise(length(nbs)) kernel! 
= pairwise_force_kernel_nl!(backend, n_threads_gpu) kernel!(buffers.fs_mat, sys.coords, sys.velocities, sys.atoms, sys.boundary, pairwise_inters, - nbs, step_n, Val(D), Val(force_units); ndrange = length(nbs)) + nbs, step_n, Val(D), Val(force_units); ndrange=length(nbs)) end - return fs_mat + return buffers end @kernel function pairwise_force_kernel_nl!(forces, @Const(coords), @@ -62,27 +65,6 @@ end end end -@kernel function pairwise_force_kernel_nonl!(forces, @Const(coords), - @Const(velocities), @Const(atoms), - boundary, inters, - step_n, ::Val{D}, - ::Val{F}) where {D, F} - - i = @index(Global, Linear) - - @inbounds for j = 1:i - if i != j - f = sum_pairwise_forces(inters, atoms[i], atoms[j], Val(F), false, coords[i], coords[j], - boundary, velocities[i], velocities[j], step_n) - for dim in 1:D - fval = ustrip(f[dim]) - Atomix.@atomic forces[dim, i] = forces[dim, i] - fval - Atomix.@atomic forces[dim, j] = forces[dim, j] + fval - end - end - end -end - function specific_force_gpu!(fs_mat, inter_list::InteractionList1Atoms, coords::AbstractArray{SVector{D, C}}, velocities, atoms, boundary, step_n, force_units, ::Val{T}) where {D, C, T} backend = get_backend(coords) @@ -224,11 +206,19 @@ end end end -function pairwise_pe_gpu!(pe_vec_nounits, buffers, sys::System{D, AT, T}, - pairwise_inters, neighbors, step_n) where {D, AT <: AbstractGPUArray, T} +function pairwise_pe_gpu!(pe_vec_nounits, buffers, sys::System{D, AT}, + pairwise_inters, neighbors, step_n) where {D, AT <: AbstractGPUArray} + if isnothing(neighbors) + error("neighbors is nothing, if you are using GPUNeighborFinder on a non-NVIDIA GPU you " * + "should use DistanceNeighborFinder instead") + end + if typeof(neighbors) == NoNeighborList + nbs = neighbors + else + nbs = @view neighbors.list[1:neighbors.n] + end if length(neighbors) > 0 backend = get_backend(sys.coords) - nbs = @view neighbors.list[1:neighbors.n] n_threads_gpu = gpu_threads_pairwise(length(nbs)) kernel! = pairwise_pe_kernel!(backend, n_threads_gpu) kernel!(pe_vec_nounits, sys.coords, sys.velocities, sys.atoms, sys.boundary, diff --git a/src/neighbors.jl b/src/neighbors.jl index 166630c1b..415e08801 100644 --- a/src/neighbors.jl +++ b/src/neighbors.jl @@ -49,7 +49,7 @@ find_neighbors(sys::System, nf::NoNeighborFinder, args...; kwargs...) = nothing Use the non-bonded forces/potential energy algorithm from [Eastman and Pande 2010](https://doi.org/10.1002/jcc.21413) to avoid calculating a neighbor list. -This is the recommended neighbor finder on GPU. +This is the recommended neighbor finder on NVIDIA GPUs. """ mutable struct GPUNeighborFinder{B, D} eligible::B @@ -75,6 +75,8 @@ find_neighbors(sys::System, nf::GPUNeighborFinder, args...; kwargs...) = nothing DistanceNeighborFinder(; eligible, dist_cutoff, special, n_steps) Find close atoms by distance. + +This is the recommended neighbor finder on non-NVIDIA GPUs. """ struct DistanceNeighborFinder{B, D} eligible::B diff --git a/src/setup.jl b/src/setup.jl index f12b4140f..ac371ca75 100644 --- a/src/setup.jl +++ b/src/setup.jl @@ -452,7 +452,7 @@ function System(coord_file::AbstractString, velocities=nothing, loggers=(), units::Bool=true, - ::Type{AT}=Array, + array_type::Type{AT}=Array, dist_cutoff=units ? 1.0u"nm" : 1.0, dist_neighbors=units ? 1.2u"nm" : 1.2, center_coords::Bool=true, @@ -969,7 +969,7 @@ function System(T::Type, velocities=nothing, loggers=(), units::Bool=true, - ::Type{AT}=Array, + array_type::Type{AT}=Array, dist_cutoff=units ? 1.0u"nm" : 1.0, dist_neighbors=units ? 
1.2u"nm" : 1.2, center_coords::Bool=true, diff --git a/src/spatial.jl b/src/spatial.jl index 797f6ef5c..728577f48 100644 --- a/src/spatial.jl +++ b/src/spatial.jl @@ -875,7 +875,7 @@ function molecule_centers(coords::AbstractArray{SVector{D, C}}, boundary, topolo end function molecule_centers(coords::AbstractGPUArray, boundary, topology) - AT = get_array_type(coords) + AT = array_type(coords) return AT(molecule_centers(Array(coords), boundary, topology)) end diff --git a/src/types.jl b/src/types.jl index 93c6ae7a1..023883d33 100644 --- a/src/types.jl +++ b/src/types.jl @@ -15,6 +15,7 @@ export inject_gradients, extract_parameters, ReplicaSystem, + array_type, is_on_gpu, float_type, masses, @@ -503,7 +504,7 @@ function System(; k=default_k(energy_units), data=nothing) D = AtomsBase.n_dimensions(boundary) - AT = get_array_type(coords) + AT = array_type(coords) T = float_type(boundary) A = typeof(atoms) C = typeof(coords) @@ -635,7 +636,7 @@ Construct a `System` from a SimpleCrystals.jl `Crystal` struct. Properties unused in the simulation or in analysis can be left with their default values. -`atoms`, `atoms_data`, `coords` and `boundary` are automatically calcualted from +`atoms`, `atoms_data`, `coords` and `boundary` are automatically calculated from the `Crystal` struct. Extra atom paramaters like `σ` have to be added manually after construction using the convenience constructor `System(sys; )`. @@ -862,7 +863,7 @@ function ReplicaSystem(; k=default_k(energy_units), data=nothing) D = AtomsBase.n_dimensions(boundary) - AT = get_array_type(replica_coords[1]) + AT = array_type(replica_coords[1]) T = float_type(boundary) A = typeof(atoms) AD = typeof(atoms_data) @@ -973,25 +974,25 @@ function ReplicaSystem(; throw(ArgumentError("there are $(length(atoms)) atoms but $(length(atoms_data)) atom data entries")) end - n_cuarray = sum(y -> isa(y, AbstractGPUArray), replica_coords) - if !(n_cuarray == n_replicas || n_cuarray == 0) - throw(ArgumentError("the coordinates for $n_cuarray out of $n_replicas replicas are on GPU")) + n_gpu_array = sum(y -> isa(y, AbstractGPUArray), replica_coords) + if !(n_gpu_array == n_replicas || n_gpu_array == 0) + throw(ArgumentError("the coordinates for $n_gpu_array out of $n_replicas replicas are on GPU")) end - if isa(atoms, AbstractGPUArray) && n_cuarray != n_replicas + if isa(atoms, AbstractGPUArray) && n_gpu_array != n_replicas throw(ArgumentError("the atoms are on the GPU but the coordinates are not")) end - if n_cuarray == n_replicas && !isa(atoms, AbstractGPUArray) + if n_gpu_array == n_replicas && !isa(atoms, AbstractGPUArray) throw(ArgumentError("the coordinates are on the GPU but the atoms are not")) end - n_cuarray = sum(y -> isa(y, AbstractGPUArray), replica_velocities) - if !(n_cuarray == n_replicas || n_cuarray == 0) - throw(ArgumentError("the velocities for $n_cuarray out of $n_replicas replicas are on GPU")) + n_gpu_array = sum(y -> isa(y, AbstractGPUArray), replica_velocities) + if !(n_gpu_array == n_replicas || n_gpu_array == 0) + throw(ArgumentError("the velocities for $n_gpu_array out of $n_replicas replicas are on GPU")) end - if isa(atoms, AbstractGPUArray) && n_cuarray != n_replicas + if isa(atoms, AbstractGPUArray) && n_gpu_array != n_replicas throw(ArgumentError("the atoms are on the GPU but the velocities are not")) end - if n_cuarray == n_replicas && !isa(atoms, AbstractGPUArray) + if n_gpu_array == n_replicas && !isa(atoms, AbstractGPUArray) throw(ArgumentError("the velocities are on the GPU but the atoms are not")) end @@ -1017,12 
+1018,15 @@ function ReplicaSystem(; energy_units, k_converted, replicas, data) end -# Rename, export, docstring -function get_array_type(::AT) where AT - return AT.name.wrapper -end +""" + array_type(sys) + array_type(arr) -get_array_type(::Union{System{D, AT}, ReplicaSystem{D, AT}}) where {D, AT} = AT +The array type of a [`System`](@ref), [`ReplicaSystem`](@ref) or array, for example +`Array` for systems on CPU or `CuArray` for systems on a NVIDIA GPU. +""" +array_type(::AT) where AT = AT.name.wrapper +array_type(::Union{System{D, AT}, ReplicaSystem{D, AT}}) where {D, AT} = AT """ is_on_gpu(sys) diff --git a/test/basic.jl b/test/basic.jl index 288051856..b6a218f3a 100644 --- a/test/basic.jl +++ b/test/basic.jl @@ -191,7 +191,7 @@ ) else sys.neighbor_finder = DistanceNeighborFinder( - eligible=AT(no_nbs), + eligible=no_nbs, dist_cutoff=1.0u"nm", ) end @@ -317,27 +317,8 @@ end end end - if run_cuda_tests - sys_gpu = System(joinpath(data_dir, "6mrr_equil.pdb"), ff; - array_type=CuArray) - for neighbor_finder in (DistanceNeighborFinder,) - nf_gpu = neighbor_finder( - eligible=sys_gpu.neighbor_finder.eligible, - special=sys_gpu.neighbor_finder.special, - dist_cutoff=dist_cutoff, - ) - neighbors_gpu = find_neighbors(sys_gpu, nf_gpu) - @test length(neighbors_gpu) == n_neighbors_ref - GPUArrays.allowscalar() do - @test neighbors_gpu[10] isa Tuple{Int32, Int32, Bool} - end - @test identical_neighbors(neighbors_gpu, neighbors_ref) - end - end - - if run_rocm_tests - sys_gpu = System(joinpath(data_dir, "6mrr_equil.pdb"), ff; - array_type=ROCArray) + for AT in array_list[2:end] + sys_gpu = System(joinpath(data_dir, "6mrr_equil.pdb"), ff; array_type=AT) for neighbor_finder in (DistanceNeighborFinder,) nf_gpu = neighbor_finder( eligible=sys_gpu.neighbor_finder.eligible, @@ -366,8 +347,7 @@ end @test rmsd(CuArray(coords_1), CuArray(coords_2)) ≈ 2.54859467758795u"Å" end if run_rocm_tests - @test rmsd(ROCArray(coords_1), - ROCArray(coords_2)) ≈ 2.54859467758795u"Å" + @test rmsd(ROCArray(coords_1), ROCArray(coords_2)) ≈ 2.54859467758795u"Å" end bb_atoms = BioStructures.collectatoms(struc[1], BioStructures.backboneselector) diff --git a/test/energy_conservation.jl b/test/energy_conservation.jl index 48fba4a98..fbd636281 100644 --- a/test/energy_conservation.jl +++ b/test/energy_conservation.jl @@ -1,6 +1,8 @@ # Energy conservation test using Molly +using AbstractGPUArray +using AMDGPU using CUDA using Test @@ -25,20 +27,22 @@ using Test for cutoff in cutoffs coords = place_atoms(n_atoms, boundary; min_dist=0.1u"nm") - neighbor_finder = NoNeighborFinder() - if nl && gpu - neighbor_finder=GPUNeighborFinder( - eligible=CuArray(trues(n_atoms, n_atoms)), - n_steps_reorder=10, - dist_cutoff=dist_cutoff, - ) - end - if nl && !gpu - neighbor_finder=DistanceNeighborFinder( - eligible=trues(n_atoms, n_atoms), - n_steps=10, - dist_cutoff=dist_cutoff, - ) + if nl + if AT <: CuArray + neighbor_finder=GPUNeighborFinder( + eligible=AT(trues(n_atoms, n_atoms)), + n_steps_reorder=10, + dist_cutoff=dist_cutoff, + ) + else + neighbor_finder=DistanceNeighborFinder( + eligible=trues(n_atoms, n_atoms), + n_steps=10, + dist_cutoff=dist_cutoff, + ) + end + else + neighbor_finder = NoNeighborFinder() end sys = System( @@ -62,7 +66,7 @@ using Test @test isapprox(Es[1], E0; atol=1e-7u"kJ * mol^-1") max_ΔE = maximum(abs.(Es .- E0)) - platform_str = gpu ? "GPU" : "CPU $n_threads thread(s)" + platform_str = (AT <: AbstractGPUArray ? 
"$AT" : "CPU $n_threads thread(s)") cutoff_str = Base.typename(typeof(cutoff)).wrapper @info "$platform_str - $cutoff_str - max energy difference $max_ΔE" @test max_ΔE < 5e-4u"kJ * mol^-1" @@ -73,16 +77,18 @@ using Test end end - test_energy_conservation(true, Array, 1, 10_000) + test_energy_conservation(true , Array, 1, 10_000) test_energy_conservation(false, Array, 1, 10_000) if Threads.nthreads() > 1 - test_energy_conservation(true, Array, Threads.nthreads(), 50_000) + test_energy_conservation(true , Array, Threads.nthreads(), 50_000) test_energy_conservation(false, Array, Threads.nthreads(), 50_000) end - for AT in array_list[2:end] - test_energy_conservation(true, AT, 1, 100_000) - test_energy_conservation(false, AT, 1, 100_000) + if CUDA.functional() + test_energy_conservation(true , CuArray, 1, 100_000) + test_energy_conservation(false, CuArray, 1, 100_000) + end + if AMDGPU.functional() + test_energy_conservation(true , ROCArray, 1, 100_000) + test_energy_conservation(false, ROCArray, 1, 100_000) end end - - diff --git a/test/gradients.jl b/test/gradients.jl index 196d4e740..752148e26 100644 --- a/test/gradients.jl +++ b/test/gradients.jl @@ -43,19 +43,19 @@ end ("CPU gbn2" , Array, false, false, false, false, true , 1e-4, 1e-4), ("CPU gbn2 forward", Array, false, true , false, false, true , 0.5 , 0.1 ), ] - if run_parallel_tests # gpu par fwd f32 obc2 gbn2 tol_σ tol_r0 - push!(runs, ("CPU parallel" , Array, true , false, false, false, false, 1e-4, 1e-4)) - push!(runs, ("CPU parallel forward", Array, true , true , false, false, false, 0.5 , 0.1 )) - push!(runs, ("CPU parallel f32" , Array, true , false, true , false, false, 0.01, 5e-4)) + if run_parallel_tests # gpu par fwd f32 obc2 gbn2 tol_σ tol_r0 + push!(runs, ("CPU parallel" , Array , true , false, false, false, false, 1e-4, 1e-4)) + push!(runs, ("CPU parallel forward", Array , true , true , false, false, false, 0.5 , 0.1 )) + push!(runs, ("CPU parallel f32" , Array , true , false, true , false, false, 0.01, 5e-4)) end - if run_cuda_tests # gpu par fwd f32 obc2 gbn2 tol_σ tol_r0 + if run_cuda_tests # gpu par fwd f32 obc2 gbn2 tol_σ tol_r0 push!(runs, ("CUDA" , CuArray, false, false, false, false, false, 0.25, 20.0)) push!(runs, ("CUDA forward" , CuArray, false, true , false, false, false, 0.25, 20.0)) push!(runs, ("CUDA f32" , CuArray, false, false, true , false, false, 0.5 , 50.0)) push!(runs, ("CUDA obc2" , CuArray, false, false, false, true , false, 0.25, 20.0)) push!(runs, ("CUDA gbn2" , CuArray, false, false, false, false, true , 0.25, 20.0)) end - if run_rocm_tests # gpu par fwd f32 obc2 gbn2 tol_σ tol_r0 + if run_rocm_tests # gpu par fwd f32 obc2 gbn2 tol_σ tol_r0 push!(runs, ("ROCM" , ROCArray, false, false, false, false, false, 0.25, 20.0)) push!(runs, ("ROCM forward" , ROCArray, false, true , false, false, false, 0.25, 20.0)) push!(runs, ("ROCM f32" , ROCArray, false, false, true , false, false, 0.5 , 50.0)) diff --git a/test/minimization.jl b/test/minimization.jl index 7baaaa1d3..c3baa0826 100644 --- a/test/minimization.jl +++ b/test/minimization.jl @@ -59,8 +59,6 @@ simulate!(sys, sim) dists = Array(distances(sys.coords, sys.boundary)) dists_flat = dists[triu(trues(3, 3), 1)] - - # GPU tolerances are more lenient (possibly for f32 shenanigans) @test all(x -> isapprox(x, 0.4u"nm"; atol=1e-2u"nm"), dists_flat) @test isapprox(potential_energy(sys), -3.0u"kJ * mol^-1"; atol=1e-2u"kJ * mol^-1") diff --git a/test/runtests.jl b/test/runtests.jl index 8d12c38bb..cfdf775c1 100644 --- a/test/runtests.jl +++ 
b/test/runtests.jl @@ -1,4 +1,5 @@ using Molly +using AMDGPU using Aqua import AtomsBase using AtomsBaseTesting @@ -7,9 +8,8 @@ using AtomsCalculators.AtomsCalculatorsTesting import BioStructures # Imported to avoid clashing names using CUDA using Enzyme -using AMDGPU -using GPUArrays using FiniteDifferences +using GPUArrays using KernelDensity import SimpleCrystals @@ -36,7 +36,7 @@ if running_CI @warn "Some CPU gradient tests will not be run as this is CI" end -const run_visualize_tests = false#get(ENV, "VISTESTS", "1") != "0" +const run_visualize_tests = get(ENV, "VISTESTS", "1") != "0" if run_visualize_tests import GLMakie else @@ -52,7 +52,7 @@ else end # Allow CUDA device to be specified -const DEVICE = 2#parse(Int, get(ENV, "DEVICE", "0")) +const DEVICE = parse(Int, get(ENV, "DEVICE", "0")) const run_cuda_tests = get(ENV, "GPUTESTS", "1") != "0" && CUDA.functional() const run_rocm_tests = get(ENV, "GPUTESTS", "1") != "0" && AMDGPU.functional() diff --git a/test/simulation.jl b/test/simulation.jl index 9ebb6cbcf..7e350f688 100644 --- a/test/simulation.jl +++ b/test/simulation.jl @@ -1,6 +1,5 @@ @testset "Lennard-Jones 2D" begin - for gpu in gpu_list - AT = gpu ? CuArray : Array + for AT in array_list n_atoms = 10 n_steps = 20_000 temp = 100.0u"K" @@ -8,7 +7,7 @@ simulator = VelocityVerlet(dt=0.001u"ps", coupling=AndersenThermostat(temp, 10.0u"ps")) gen_temp_wrapper(s, args...; kwargs...) = temperature(s) - if gpu + if AT <: CuArray neighbor_finder = GPUNeighborFinder( eligible=eligible=AT(trues(n_atoms, n_atoms)), n_steps_reorder=10, @@ -221,39 +220,32 @@ end OverdampedLangevin(dt=0.002u"ps", temperature=temp, friction=10.0u"ps^-1"), ] - s = System( - atoms=[Atom(mass=10.0u"g/mol", charge=0.0, σ=0.3u"nm", ϵ=0.2u"kJ * mol^-1") for i in 1:n_atoms], - coords=coords, - boundary=boundary, - pairwise_inters=(LennardJones(use_neighbors=true),), - neighbor_finder=DistanceNeighborFinder( - eligible=trues(n_atoms, n_atoms), - n_steps=10, - dist_cutoff=2.0u"nm", - ), - loggers=(coords=CoordinatesLogger(100),), - ) - random_velocities!(s, temp) - - if run_gpu_tests - s_gpu = System( - atoms=CuArray([Atom(mass=10.0u"g/mol", charge=0.0, σ=0.3u"nm", ϵ=0.2u"kJ * mol^-1") for i in 1:n_atoms]), - coords=CuArray(coords), - boundary=boundary, - pairwise_inters=(LennardJones(use_neighbors=true),), - neighbor_finder=GPUNeighborFinder( - eligible=CuArray(trues(n_atoms, n_atoms)), + for AT in array_list + if AT <: CuArray + neighbor_finder = GPUNeighborFinder( + eligible=AT(trues(n_atoms, n_atoms)), n_steps_reorder=10, dist_cutoff=2.0u"nm", - ), + ) + else + neighbor_finder = DistanceNeighborFinder( + eligible=AT(trues(n_atoms, n_atoms)), + n_steps=10, + dist_cutoff=2.0u"nm", + ) + end + s = System( + atoms=AT([Atom(mass=10.0u"g/mol", charge=0.0, σ=0.3u"nm", ϵ=0.2u"kJ * mol^-1") + for i in 1:n_atoms]), + coords=AT(coords), + boundary=boundary, + pairwise_inters=(LennardJones(use_neighbors=true),), + neighbor_finder=neighbor_finder, loggers=(coords=CoordinatesLogger(100),), ) - end - - for simulator in simulators - @time simulate!(s, simulator, n_steps; n_threads=1) - if run_gpu_tests - @time simulate!(s_gpu, simulator, n_steps; n_threads=1) + random_velocities!(s, temp) + for simulator in simulators + @time simulate!(s, simulator, n_steps; n_threads=1) end end end @@ -285,7 +277,7 @@ end loggers=(coords=CoordinatesLogger(100),), ) - if run_gpu_tests + if run_cuda_tests s_gpu = System( atoms=CuArray([Atom(mass=10.0u"g/mol", charge=0.0, σ=0.1u"nm", ϵ=0.2u"kJ * mol^-1") for i in 1:n_atoms]), 
coords=CuArray(coords), @@ -303,7 +295,7 @@ end for simulator in simulators @time simulate!(s, simulator, n_steps; n_threads=1) - if run_gpu_tests + if run_cuda_tests @time simulate!(s_gpu, simulator, n_steps; n_threads=1) coord_diff = sum(sum(map(x -> abs.(x), s.coords .- Array(s_gpu.coords)))) / (3 * n_atoms) E_diff = abs(potential_energy(s) - potential_energy(s_gpu)) @@ -437,7 +429,7 @@ end neighbor_finder = NoNeighborFinder() end - if run_gpu_tests + if run_cuda_tests neighbor_finder_gpu = GPUNeighborFinder(eligible=CuArray(trues(n_atoms, n_atoms)), n_steps_reorder=10, dist_cutoff=1.2u"nm") end @@ -457,7 +449,7 @@ end E0 = potential_energy(s) @time simulate!(s, simulator, n_steps) - if run_gpu_tests + if run_cuda_tests s_gpu = System( atoms=CuArray(atoms), coords=CuArray(coords), @@ -1344,25 +1336,26 @@ end AT(Int32.(collect(2:2:n_atoms))), AT(bonds), ),) - - neighbor_finder = NoNeighborFinder() cutoff = DistanceCutoff(f32 ? 1.0f0u"nm" : 1.0u"nm") - pairwise_inters = (LennardJones(use_neighbors=false, cutoff=cutoff),) - if nl && gpu - neighbor_finder = GPUNeighborFinder( - eligible=gpu ? CuArray(trues(n_atoms, n_atoms)) : trues(n_atoms, n_atoms), - n_steps_reorder=10, - dist_cutoff=f32 ? 1.5f0u"nm" : 1.5u"nm", - ) - pairwise_inters = (LennardJones(use_neighbors=true, cutoff=cutoff),) - end - if nl && !gpu - neighbor_finder = DistanceNeighborFinder( - eligible=AT(trues(n_atoms, n_atoms)), - n_steps=10, - dist_cutoff=f32 ? 1.5f0u"nm" : 1.5u"nm", - ) + + if nl + if AT <: CuArray + neighbor_finder = GPUNeighborFinder( + eligible=AT(trues(n_atoms, n_atoms), + n_steps_reorder=10, + dist_cutoff=f32 ? 1.5f0u"nm" : 1.5u"nm", + ) + else + neighbor_finder = DistanceNeighborFinder( + eligible=AT(trues(n_atoms, n_atoms)), + n_steps=10, + dist_cutoff=f32 ? 1.5f0u"nm" : 1.5u"nm", + ) + end pairwise_inters = (LennardJones(use_neighbors=true, cutoff=cutoff),) + else + neighbor_finder = NoNeighborFinder() + pairwise_inters = (LennardJones(use_neighbors=false, cutoff=cutoff),) end show(devnull, neighbor_finder) @@ -1416,7 +1409,6 @@ end push!(runs, ("GPU f32 NL", [true , false, true , ROCArray])) end - final_coords_ref, E_start_ref = test_sim(runs[1][2]...) 
# Check all simulations give the same result to within some error for (name, args) in runs From 6bb91d629ca2d2cf62e91b620be723272ab333e7 Mon Sep 17 00:00:00 2001 From: Joe Greener Date: Fri, 24 Jan 2025 17:03:44 +0000 Subject: [PATCH 10/24] remove unused CUDA kernels --- ext/MollyCUDAExt.jl | 348 +------------------------------------------- 1 file changed, 6 insertions(+), 342 deletions(-) diff --git a/ext/MollyCUDAExt.jl b/ext/MollyCUDAExt.jl index 22fbdb53f..5d0018fc9 100644 --- a/ext/MollyCUDAExt.jl +++ b/ext/MollyCUDAExt.jl @@ -37,8 +37,8 @@ function cuda_threads_blocks_specific(n_inters) return n_threads_gpu, n_blocks end -function pairwise_force_gpu!(buffers, sys::System{D, AT, T}, pairwise_inters, nbs::Molly.NoNeighborList, - step_n) where {D, AT <: CuArray, T} +function Molly.pairwise_force_gpu!(buffers, sys::System{D, AT, T}, pairwise_inters, + nbs::Molly.NoNeighborList, step_n) where {D, AT <: CuArray, T} kernel = @cuda launch=false pairwise_force_kernel_nonl!( buffers.fs_mat, sys.coords, sys.velocities, sys.atoms, sys.boundary, pairwise_inters, step_n, Val(D), Val(sys.force_units)) @@ -54,8 +54,8 @@ function pairwise_force_gpu!(buffers, sys::System{D, AT, T}, pairwise_inters, nb return buffers end -function pairwise_force_gpu!(buffers, sys::System{D, AT, T}, pairwise_inters, nbs::Nothing, - step_n) where {D, AT <: CuArray, T} +function Molly.pairwise_force_gpu!(buffers, sys::System{D, AT, T}, pairwise_inters, nbs::Nothing, + step_n) where {D, AT <: CuArray, T} N = length(sys.coords) n_blocks = cld(N, WARPSIZE) r_cut = sys.neighbor_finder.dist_cutoff @@ -80,17 +80,8 @@ function pairwise_force_gpu!(buffers, sys::System{D, AT, T}, pairwise_inters, nb return buffers end -function pairwise_pe_gpu!(pe_vec_nounits, buffers, sys::System{D, AT, T}, pairwise_inters, - nbs::Molly.NoNeighborList, step_n) where {D, AT <: CuArray, T} - n_threads_gpu, n_blocks = cuda_threads_blocks_pairwise(length(nbs)) - CUDA.@sync @cuda threads=n_threads_gpu blocks=n_blocks pairwise_pe_kernel!( - pe_vec_nounits, sys.coords, sys.velocities, sys.atoms, sys.boundary, pairwise_inters, - nbs, step_n, Val(sys.energy_units)) - return pe_vec_nounits -end - -function pairwise_pe_gpu!(pe_vec_nounits, buffers, sys::System{D, AT, T}, pairwise_inters, - nbs::Nothing, step_n) where {D, AT <: CuArray, T} +function Molly.pairwise_pe_gpu!(pe_vec_nounits, buffers, sys::System{D, AT, T}, pairwise_inters, + nbs::Nothing, step_n) where {D, AT <: CuArray, T} # The ordering is always recomputed for potential energy # Different buffers are used to the forces case, so sys.neighbor_finder.initialized # is not updated @@ -291,7 +282,6 @@ function compress_boolean_matrices!(sorted_seq, eligible_matrix, special_matrix, return nothing end - #= **The No-neighborlist pairwise force summation kernel (algorithm by Eastman, see https://onlinelibrary.wiley.com/doi/full/10.1002/jcc.21413)**: 1. 
Case j < n_blocks && i < j, i.e., `WARPSIZE`×`WARPSIZE` tiles: For such tiles each row is assiged to a different thread in a warp which calculates the @@ -610,7 +600,6 @@ function force_kernel!( return nothing end - function energy_kernel!( sorted_seq, energy_nounits, @@ -843,8 +832,6 @@ function energy_kernel!( return nothing end - - function pairwise_force_kernel_nonl!(forces::AbstractArray{T}, coords_var, velocities_var, atoms_var, boundary, inters, step_n, ::Val{D}, ::Val{F}) where {T, D, F} coords = CUDA.Const(coords_var) @@ -913,50 +900,6 @@ function pairwise_force_kernel_nonl!(forces::AbstractArray{T}, coords_var, veloc return nothing end -function pairwise_pe_kernel!(energy, coords_var, velocities_var, atoms_var, boundary, inters, - neighbors_var, step_n, ::Val{E}) where E - coords = CUDA.Const(coords_var) - velocities = CUDA.Const(velocities_var) - atoms = CUDA.Const(atoms_var) - neighbors = CUDA.Const(neighbors_var) - - inter_i = (blockIdx().x - 1) * blockDim().x + threadIdx().x - - @inbounds if inter_i <= length(neighbors) - i, j, special = neighbors[inter_i] - coord_i, coord_j, vel_i, vel_j = coords[i], coords[j], velocities[i], velocities[j] - dr = vector(coord_i, coord_j, boundary) - pe = potential_energy_gpu(inters[1], dr, atoms[i], atoms[j], E, special, coord_i, coord_j, - boundary, vel_i, vel_j, step_n) - for inter in inters[2:end] - pe += potential_energy_gpu(inter, dr, atoms[i], atoms[j], E, special, coord_i, coord_j, - boundary, vel_i, vel_j, step_n) - end - if unit(pe) != E - error("wrong energy unit returned, was expecting $E but got $(unit(pe))") - end - Atomix.@atomic :monotonic energy[1] += ustrip(pe) - end - return nothing -end - -@inline function sum_pairwise_forces(inters, atom_i, atom_j, ::Val{F}, special, coord_i, coord_j, - boundary, vel_i, vel_j, step_n) where F - dr = vector(coord_i, coord_j, boundary) - f_tuple = ntuple(length(inters)) do inter_type_i - force_gpu(inters[inter_type_i], dr, atom_i, atom_j, F, special, coord_i, coord_j, boundary, - vel_i, vel_j, step_n) - end - f = sum(f_tuple) - if unit(f[1]) != F - # This triggers an error but it isn't printed - # See https://discourse.julialang.org/t/error-handling-in-cuda-kernels/79692 - # for how to throw a more meaningful error - error("wrong force unit returned, was expecting $F but got $(unit(f[1]))") - end - return f -end - @inline function sum_pairwise_potentials(inters, atom_i, atom_j, ::Val{E}, special, coord_i, coord_j, boundary, vel_i, vel_j, step_n) where E dr = vector(coord_i, coord_j, boundary) @@ -976,283 +919,4 @@ end return pe end -function specific_force_gpu!(fs_mat, inter_list::InteractionList1Atoms, coords::AbstractArray{SVector{D, C}}, - velocities, atoms, boundary, step_n, force_units, ::Val{T}) where {D, C, T} - n_threads_gpu, n_blocks = cuda_threads_blocks_specific(length(inter_list)) - CUDA.@sync @cuda threads=n_threads_gpu blocks=n_blocks specific_force_1_atoms_kernel!(fs_mat, - coords, velocities, atoms, boundary, step_n, inter_list.is, inter_list.inters, - Val(D), Val(force_units)) - return fs_mat -end - -function specific_force_gpu!(fs_mat, inter_list::InteractionList2Atoms, coords::AbstractArray{SVector{D, C}}, - velocities, atoms, boundary, step_n, force_units, ::Val{T}) where {D, C, T} - n_threads_gpu, n_blocks = cuda_threads_blocks_specific(length(inter_list)) - CUDA.@sync @cuda threads=n_threads_gpu blocks=n_blocks specific_force_2_atoms_kernel!(fs_mat, - coords, velocities, atoms, boundary, step_n, inter_list.is, inter_list.js, - inter_list.inters, Val(D), 
Val(force_units)) - return fs_mat -end - -function specific_force_gpu!(fs_mat, inter_list::InteractionList3Atoms, coords::AbstractArray{SVector{D, C}}, - velocities, atoms, boundary, step_n, force_units, ::Val{T}) where {D, C, T} - n_threads_gpu, n_blocks = cuda_threads_blocks_specific(length(inter_list)) - CUDA.@sync @cuda threads=n_threads_gpu blocks=n_blocks specific_force_3_atoms_kernel!(fs_mat, - coords, velocities, atoms, boundary, step_n, inter_list.is, inter_list.js, - inter_list.ks, inter_list.inters, Val(D), Val(force_units)) - return fs_mat -end - -function specific_force_gpu!(fs_mat, inter_list::InteractionList4Atoms, coords::AbstractArray{SVector{D, C}}, - velocities, atoms, boundary, step_n, force_units, ::Val{T}) where {D, C, T} - n_threads_gpu, n_blocks = cuda_threads_blocks_specific(length(inter_list)) - CUDA.@sync @cuda threads=n_threads_gpu blocks=n_blocks specific_force_4_atoms_kernel!(fs_mat, - coords, velocities, atoms, boundary, step_n, inter_list.is, inter_list.js, - inter_list.ks, inter_list.ls, inter_list.inters, Val(D), Val(force_units)) - return fs_mat -end - -function specific_force_1_atoms_kernel!(forces, coords_var, velocities_var, atoms_var, boundary, - step_n, is_var, inters_var, ::Val{D}, ::Val{F}) where {D, F} - coords = CUDA.Const(coords_var) - velocities = CUDA.Const(velocities_var) - atoms = CUDA.Const(atoms_var) - is = CUDA.Const(is_var) - inters = CUDA.Const(inters_var) - - inter_i = (blockIdx().x - 1) * blockDim().x + threadIdx().x - - @inbounds if inter_i <= length(is) - i = is[inter_i] - fs = force_gpu(inters[inter_i], coords[i], boundary, atoms[i], F, velocities[i], step_n) - if unit(fs.f1[1]) != F - error("wrong force unit returned, was expecting $F") - end - for dim in 1:D - Atomix.@atomic :monotonic forces[dim, i] += ustrip(fs.f1[dim]) - end - end - return nothing -end - -function specific_force_2_atoms_kernel!(forces, coords_var, velocities_var, atoms_var, boundary, - step_n, is_var, js_var, inters_var, ::Val{D}, ::Val{F}) where {D, F} - coords = CUDA.Const(coords_var) - velocities = CUDA.Const(velocities_var) - atoms = CUDA.Const(atoms_var) - is = CUDA.Const(is_var) - js = CUDA.Const(js_var) - inters = CUDA.Const(inters_var) - - inter_i = (blockIdx().x - 1) * blockDim().x + threadIdx().x - - @inbounds if inter_i <= length(is) - i, j = is[inter_i], js[inter_i] - fs = force_gpu(inters[inter_i], coords[i], coords[j], boundary, atoms[i], atoms[j], F, - velocities[i], velocities[j], step_n) - if unit(fs.f1[1]) != F || unit(fs.f2[1]) != F - error("wrong force unit returned, was expecting $F") - end - for dim in 1:D - Atomix.@atomic :monotonic forces[dim, i] += ustrip(fs.f1[dim]) - Atomix.@atomic :monotonic forces[dim, j] += ustrip(fs.f2[dim]) - end - end - return nothing -end - -function specific_force_3_atoms_kernel!(forces, coords_var, velocities_var, atoms_var, boundary, - step_n, is_var, js_var, ks_var, inters_var, ::Val{D}, ::Val{F}) where {D, F} - coords = CUDA.Const(coords_var) - velocities = CUDA.Const(velocities_var) - atoms = CUDA.Const(atoms_var) - is = CUDA.Const(is_var) - js = CUDA.Const(js_var) - ks = CUDA.Const(ks_var) - inters = CUDA.Const(inters_var) - - inter_i = (blockIdx().x - 1) * blockDim().x + threadIdx().x - - @inbounds if inter_i <= length(is) - i, j, k = is[inter_i], js[inter_i], ks[inter_i] - fs = force_gpu(inters[inter_i], coords[i], coords[j], coords[k], boundary, atoms[i], - atoms[j], atoms[k], F, velocities[i], velocities[j], velocities[k], step_n) - if unit(fs.f1[1]) != F || unit(fs.f2[1]) != F || unit(fs.f3[1]) != 
F - error("wrong force unit returned, was expecting $F") - end - for dim in 1:D - Atomix.@atomic :monotonic forces[dim, i] += ustrip(fs.f1[dim]) - Atomix.@atomic :monotonic forces[dim, j] += ustrip(fs.f2[dim]) - Atomix.@atomic :monotonic forces[dim, k] += ustrip(fs.f3[dim]) - end - end - return nothing -end - -function specific_force_4_atoms_kernel!(forces, coords_var, velocities_var, atoms_var, boundary, - step_n, is_var, js_var, ks_var, ls_var, inters_var, - ::Val{D}, ::Val{F}) where {D, F} - coords = CUDA.Const(coords_var) - velocities = CUDA.Const(velocities_var) - atoms = CUDA.Const(atoms_var) - is = CUDA.Const(is_var) - js = CUDA.Const(js_var) - ks = CUDA.Const(ks_var) - ls = CUDA.Const(ls_var) - inters = CUDA.Const(inters_var) - - inter_i = (blockIdx().x - 1) * blockDim().x + threadIdx().x - - @inbounds if inter_i <= length(is) - i, j, k, l = is[inter_i], js[inter_i], ks[inter_i], ls[inter_i] - fs = force_gpu(inters[inter_i], coords[i], coords[j], coords[k], coords[l], boundary, - atoms[i], atoms[j], atoms[k], atoms[l], F, velocities[i], velocities[j], - velocities[k], velocities[l], step_n) - if unit(fs.f1[1]) != F || unit(fs.f2[1]) != F || unit(fs.f3[1]) != F || unit(fs.f4[1]) != F - error("wrong force unit returned, was expecting $F") - end - for dim in 1:D - Atomix.@atomic :monotonic forces[dim, i] += ustrip(fs.f1[dim]) - Atomix.@atomic :monotonic forces[dim, j] += ustrip(fs.f2[dim]) - Atomix.@atomic :monotonic forces[dim, k] += ustrip(fs.f3[dim]) - Atomix.@atomic :monotonic forces[dim, l] += ustrip(fs.f4[dim]) - end - end - return nothing -end - - -function specific_pe_gpu!(pe_vec_nounits, inter_list::InteractionList1Atoms, coords::AbstractArray{SVector{D, C}}, - velocities, atoms, boundary, step_n, energy_units, ::Val{T}) where {D, C, T} - n_threads_gpu, n_blocks = cuda_threads_blocks_specific(length(inter_list)) - CUDA.@sync @cuda threads=n_threads_gpu blocks=n_blocks specific_pe_1_atoms_kernel!( - pe_vec_nounits, coords, velocities, atoms, boundary, step_n, inter_list.is, - inter_list.inters, Val(energy_units)) - return pe_vec_nounits -end - -function specific_pe_gpu!(pe_vec_nounits, inter_list::InteractionList2Atoms, coords::AbstractArray{SVector{D, C}}, - velocities, atoms, boundary, step_n, energy_units, ::Val{T}) where {D, C, T} - n_threads_gpu, n_blocks = cuda_threads_blocks_specific(length(inter_list)) - CUDA.@sync @cuda threads=n_threads_gpu blocks=n_blocks specific_pe_2_atoms_kernel!( - pe_vec_nounits, coords, velocities, atoms, boundary, step_n, inter_list.is, - inter_list.js, inter_list.inters, Val(energy_units)) - return pe_vec_nounits -end - -function specific_pe_gpu!(pe_vec_nounits, inter_list::InteractionList3Atoms, coords::AbstractArray{SVector{D, C}}, - velocities, atoms, boundary, step_n, energy_units, ::Val{T}) where {D, C, T} - n_threads_gpu, n_blocks = cuda_threads_blocks_specific(length(inter_list)) - CUDA.@sync @cuda threads=n_threads_gpu blocks=n_blocks specific_pe_3_atoms_kernel!( - pe_vec_nounits, coords, velocities, atoms, boundary, step_n, inter_list.is, - inter_list.js, inter_list.ks, inter_list.inters, Val(energy_units)) - return pe_vec_nounits -end - -function specific_pe_gpu!(pe_vec_nounits, inter_list::InteractionList4Atoms, coords::AbstractArray{SVector{D, C}}, - velocities, atoms, boundary, step_n, energy_units, ::Val{T}) where {D, C, T} - n_threads_gpu, n_blocks = cuda_threads_blocks_specific(length(inter_list)) - CUDA.@sync @cuda threads=n_threads_gpu blocks=n_blocks specific_pe_4_atoms_kernel!( - pe_vec_nounits, coords, velocities, atoms, 
boundary, step_n, inter_list.is, - inter_list.js, inter_list.ks, inter_list.ls, inter_list.inters, Val(energy_units)) - return pe_vec_nounits -end - -function specific_pe_1_atoms_kernel!(energy, coords_var, velocities_var, atoms_var, boundary, - step_n, is_var, inters_var, ::Val{E}) where E - coords = CUDA.Const(coords_var) - velocities = CUDA.Const(velocities_var) - atoms = CUDA.Const(atoms_var) - is = CUDA.Const(is_var) - inters = CUDA.Const(inters_var) - - inter_i = (blockIdx().x - 1) * blockDim().x + threadIdx().x - - @inbounds if inter_i <= length(is) - i = is[inter_i] - pe = potential_energy_gpu(inters[inter_i], coords[i], boundary, atoms[i], E, - velocities[i], step_n) - if unit(pe) != E - error("wrong energy unit returned, was expecting $E but got $(unit(pe))") - end - Atomix.@atomic :monotonic energy[1] += ustrip(pe) - end - return nothing -end - -function specific_pe_2_atoms_kernel!(energy, coords_var, velocities_var, atoms_var, boundary, - step_n, is_var, js_var, inters_var, ::Val{E}) where E - coords = CUDA.Const(coords_var) - velocities = CUDA.Const(velocities_var) - atoms = CUDA.Const(atoms_var) - is = CUDA.Const(is_var) - js = CUDA.Const(js_var) - inters = CUDA.Const(inters_var) - - inter_i = (blockIdx().x - 1) * blockDim().x + threadIdx().x - - @inbounds if inter_i <= length(is) - i, j = is[inter_i], js[inter_i] - pe = potential_energy_gpu(inters[inter_i], coords[i], coords[j], boundary, atoms[i], - atoms[j], E, velocities[i], velocities[j], step_n) - if unit(pe) != E - error("wrong energy unit returned, was expecting $E but got $(unit(pe))") - end - Atomix.@atomic :monotonic energy[1] += ustrip(pe) - end - return nothing -end - -function specific_pe_3_atoms_kernel!(energy, coords_var, velocities_var, atoms_var, boundary, - step_n, is_var, js_var, ks_var, inters_var, ::Val{E}) where E - coords = CUDA.Const(coords_var) - velocities = CUDA.Const(velocities_var) - atoms = CUDA.Const(atoms_var) - is = CUDA.Const(is_var) - js = CUDA.Const(js_var) - ks = CUDA.Const(ks_var) - inters = CUDA.Const(inters_var) - - inter_i = (blockIdx().x - 1) * blockDim().x + threadIdx().x - - @inbounds if inter_i <= length(is) - i, j, k = is[inter_i], js[inter_i], ks[inter_i] - pe = potential_energy_gpu(inters[inter_i], coords[i], coords[j], coords[k], boundary, - atoms[i], atoms[j], atoms[k], E, velocities[i], velocities[j], - velocities[k], step_n) - if unit(pe) != E - error("wrong energy unit returned, was expecting $E but got $(unit(pe))") - end - Atomix.@atomic :monotonic energy[1] += ustrip(pe) - end - return nothing -end - -function specific_pe_4_atoms_kernel!(energy, coords_var, velocities_var, atoms_var, boundary, - step_n, is_var, js_var, ks_var, ls_var, inters_var, ::Val{E}) where E - coords = CUDA.Const(coords_var) - velocities = CUDA.Const(velocities_var) - atoms = CUDA.Const(atoms_var) - is = CUDA.Const(is_var) - js = CUDA.Const(js_var) - ks = CUDA.Const(ks_var) - ls = CUDA.Const(ls_var) - inters = CUDA.Const(inters_var) - - inter_i = (blockIdx().x - 1) * blockDim().x + threadIdx().x - - @inbounds if inter_i <= length(is) - i, j, k, l = is[inter_i], js[inter_i], ks[inter_i], ls[inter_i] - pe = potential_energy_gpu(inters[inter_i], coords[i], coords[j], coords[k], coords[l], - boundary, atoms[i], atoms[j], atoms[k], atoms[l], E, - velocities[i], velocities[j], velocities[k], velocities[l], - step_n) - if unit(pe) != E - error("wrong energy unit returned, was expecting $E but got $(unit(pe))") - end - Atomix.@atomic :monotonic energy[1] += ustrip(pe) - end - return nothing -end - end 
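Editor's note: the tiled pairwise summation described in the comment block kept above (Eastman's algorithm) can be hard to follow through the warp shuffles and exclusion bitmasks. Below is a serial Julia sketch of just the block/tile decomposition for orientation; tiled_force_sum and pair_force are illustrative names that do not exist in Molly, and the sketch omits the Morton ordering, bounding-box culling, bitmasks and warp shuffles that the real kernel depends on.

function tiled_force_sum(coords, pair_force; tile=32)
    n = length(coords)
    fs = [zero(first(coords)) for _ in 1:n]
    n_blocks = cld(n, tile)
    for bi in 1:n_blocks, bj in bi:n_blocks
        is = ((bi - 1) * tile + 1):min(bi * tile, n)
        js = ((bj - 1) * tile + 1):min(bj * tile, n)
        for i in is, j in js
            # On-diagonal tiles cover only the upper triangle so that each
            # unordered pair is visited exactly once
            (bi == bj && j <= i) && continue
            f = pair_force(coords[i], coords[j])
            fs[i] -= f  # sign convention matching the kernels above
            fs[j] += f
        end
    end
    return fs
end

Each (bi, bj) pair corresponds to one WARPSIZE x WARPSIZE tile in the GPU kernel, where a single warp handles the whole tile and shuffles atom j data between threads instead of re-reading it from memory.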
From 1a27b4910cdd538880ecff4c03958f26400e0bb3 Mon Sep 17 00:00:00 2001 From: Joe Greener Date: Tue, 28 Jan 2025 18:10:25 +0000 Subject: [PATCH 11/24] fix tests --- ext/MollyCUDAExt.jl | 20 ++++++++++---------- src/kernels.jl | 4 ++-- test/simulation.jl | 2 +- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/ext/MollyCUDAExt.jl b/ext/MollyCUDAExt.jl index 5d0018fc9..89efba5f3 100644 --- a/ext/MollyCUDAExt.jl +++ b/ext/MollyCUDAExt.jl @@ -411,7 +411,7 @@ function force_kernel!( spec = (special_bitmask >> (warpsize() - shuffle_idx)) | (special_bitmask << shuffle_idx) condition = (excl & 0x1) == true && r2 <= r_cut * r_cut - f = condition ? sum_pairwise_forces( + f = condition ? Molly.sum_pairwise_forces( inters_tuple, atoms_i, atoms_j_shuffle, Val(force_units), @@ -476,7 +476,7 @@ function force_kernel!( spec = (special_bitmask >> (warpsize() - m)) | (special_bitmask << m) condition = (excl & 0x1) == true && r2 <= r_cut * r_cut - f = condition ? sum_pairwise_forces( + f = condition ? Molly.sum_pairwise_forces( inters_tuple, atoms_i, atoms_j, Val(force_units), @@ -526,7 +526,7 @@ function force_kernel!( spec = (special_bitmask >> (warpsize() - m)) | (special_bitmask << m) condition = (excl & 0x1) == true && r2 <= r_cut * r_cut - f = condition ? sum_pairwise_forces( + f = condition ? Molly.sum_pairwise_forces( inters_tuple, atoms_i, atoms_j, Val(force_units), @@ -573,7 +573,7 @@ function force_kernel!( spec = (special_bitmask >> (warpsize() - m)) | (special_bitmask << m) condition = (excl & 0x1) == true && r2 <= r_cut * r_cut - f = condition ? sum_pairwise_forces( + f = condition ? Molly.sum_pairwise_forces( inters_tuple, atoms_i, atoms_j, Val(force_units), @@ -859,8 +859,8 @@ function pairwise_force_kernel_nonl!(forces::AbstractArray{T}, coords_var, veloc j = j_0_tile + del_j if i != j atom_j, coord_j, vel_j = atoms[j], coords[j], velocities[j] - f = sum_pairwise_forces(inters, atom_i, atom_j, Val(F), false, coord_i, coord_j, - boundary, vel_i, vel_j, step_n) + f = Molly.sum_pairwise_forces(inters, atom_i, atom_j, Val(F), false, coord_i, + coord_j, boundary, vel_i, vel_j, step_n) for dim in 1:D forces_shmem[dim, tidx] += -ustrip(f[dim]) end @@ -884,7 +884,7 @@ function pairwise_force_kernel_nonl!(forces::AbstractArray{T}, coords_var, veloc @inbounds for _ in 1:tilesteps sync_warp() atom_j = atoms[j] - f = sum_pairwise_forces(inters, atom_i, atom_j, Val(F), false, coord_i, coord_j, + f = Molly.sum_pairwise_forces(inters, atom_i, atom_j, Val(F), false, coord_i, coord_j, boundary, vel_i, vel_j, step_n) for dim in 1:D forces_shmem[dim, tidx] += -ustrip(f[dim]) @@ -905,9 +905,9 @@ end dr = vector(coord_i, coord_j, boundary) pe_tuple = ntuple(length(inters)) do inter_type_i - SVector(potential_energy_gpu(inters[inter_type_i], dr, atom_i, atom_j, E, special, coord_i, coord_j, boundary, - vel_i, vel_j, step_n)) - # SVector was required to avoid a GPU error occurring with scalars (like the quantity returned by potential_energy_gpu) + # SVector was required to avoid a GPU error occurring with scalars + SVector(Molly.potential_energy_gpu(inters[inter_type_i], dr, atom_i, atom_j, E, special, + coord_i, coord_j, boundary, vel_i, vel_j, step_n)) end pe = sum(pe_tuple) if unit(pe[1]) != E diff --git a/src/kernels.jl b/src/kernels.jl index d8e284a91..da5257ab2 100644 --- a/src/kernels.jl +++ b/src/kernels.jl @@ -40,7 +40,7 @@ function pairwise_force_gpu!(buffers, sys::System{D, AT, T}, n_threads_gpu = gpu_threads_pairwise(length(nbs)) kernel! 
= pairwise_force_kernel_nl!(backend, n_threads_gpu) kernel!(buffers.fs_mat, sys.coords, sys.velocities, sys.atoms, sys.boundary, pairwise_inters, - nbs, step_n, Val(D), Val(force_units); ndrange=length(nbs)) + nbs, step_n, Val(D), Val(sys.force_units); ndrange=length(nbs)) end return buffers end @@ -222,7 +222,7 @@ function pairwise_pe_gpu!(pe_vec_nounits, buffers, sys::System{D, AT}, n_threads_gpu = gpu_threads_pairwise(length(nbs)) kernel! = pairwise_pe_kernel!(backend, n_threads_gpu) kernel!(pe_vec_nounits, sys.coords, sys.velocities, sys.atoms, sys.boundary, - pairwise_inters, nbs, step_n, Val(energy_units); ndrange=length(nbs)) + pairwise_inters, nbs, step_n, Val(sys.energy_units); ndrange=length(nbs)) end return pe_vec_nounits end diff --git a/test/simulation.jl b/test/simulation.jl index 7e350f688..12ac7ac4d 100644 --- a/test/simulation.jl +++ b/test/simulation.jl @@ -1341,7 +1341,7 @@ end if nl if AT <: CuArray neighbor_finder = GPUNeighborFinder( - eligible=AT(trues(n_atoms, n_atoms), + eligible=AT(trues(n_atoms, n_atoms)), n_steps_reorder=10, dist_cutoff=f32 ? 1.5f0u"nm" : 1.5u"nm", ) From a6e394e59d8cce43e16374f076eaa07f34d74950 Mon Sep 17 00:00:00 2001 From: Joe Greener Date: Wed, 29 Jan 2025 18:25:05 +0000 Subject: [PATCH 12/24] buffer energy don't reset --- src/energy.jl | 2 +- src/force.jl | 5 +++-- src/kernels.jl | 2 +- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/energy.jl b/src/energy.jl index c9b590b53..b257c6fa7 100644 --- a/src/energy.jl +++ b/src/energy.jl @@ -257,7 +257,7 @@ function potential_energy(sys::System{D, AT, T}, neighbors, step_n::Integer=0; n_threads::Integer=Threads.nthreads()) where {D, AT <: AbstractGPUArray, T} val_ft = Val(T) pe_vec_nounits = KernelAbstractions.zeros(get_backend(sys.coords), T, 1) - buffers = init_forces_buffer!(sys, ustrip_vec.(zero(sys.coords)), 1) + buffers = init_forces_buffer!(sys, ustrip_vec.(zero(sys.coords)), 1, true) pairwise_inters_nonl = filter(!use_neighbors, values(sys.pairwise_inters)) if length(pairwise_inters_nonl) > 0 diff --git a/src/force.jl b/src/force.jl index 0ee26d682..bbb67e38c 100644 --- a/src/force.jl +++ b/src/force.jl @@ -132,7 +132,8 @@ struct ForcesBuffer{F, C, M, R} compressed_special::R end -function init_forces_buffer!(sys, forces_nounits::AbstractGPUArray{SVector{D, T}}, n_threads) where {D, T} +function init_forces_buffer!(sys, forces_nounits::AbstractGPUArray{SVector{D, T}}, n_threads, + for_pe::Bool=false) where {D, T} N = length(forces_nounits) C = eltype(eltype(sys.coords)) n_blocks = cld(N, 32) @@ -143,7 +144,7 @@ function init_forces_buffer!(sys, forces_nounits::AbstractGPUArray{SVector{D, T} Morton_seq = KernelAbstractions.zeros(backend, Int32, N) compressed_eligible = KernelAbstractions.zeros(backend, UInt32, 32, n_blocks, n_blocks) compressed_special = KernelAbstractions.zeros(backend, UInt32, 32, n_blocks, n_blocks) - if sys.neighbor_finder isa GPUNeighborFinder + if !for_pe && sys.neighbor_finder isa GPUNeighborFinder sys.neighbor_finder.initialized = false end return ForcesBuffer(fs_mat, box_mins, box_maxs, Morton_seq, compressed_eligible, compressed_special) diff --git a/src/kernels.jl b/src/kernels.jl index da5257ab2..0f743f942 100644 --- a/src/kernels.jl +++ b/src/kernels.jl @@ -210,7 +210,7 @@ function pairwise_pe_gpu!(pe_vec_nounits, buffers, sys::System{D, AT}, pairwise_inters, neighbors, step_n) where {D, AT <: AbstractGPUArray} if isnothing(neighbors) error("neighbors is nothing, if you are using GPUNeighborFinder on a non-NVIDIA GPU you " * - "should use 
DistanceNeighborFinder instead") + "should use DistanceNeighborFinder instead") end if typeof(neighbors) == NoNeighborList nbs = neighbors From 1022af7395cb6b4025cc80f2b0836ede6e417875 Mon Sep 17 00:00:00 2001 From: Joe Greener Date: Wed, 29 Jan 2025 19:06:47 +0000 Subject: [PATCH 13/24] more test fixes --- src/kernels.jl | 2 +- src/setup.jl | 18 ++++++++---------- test/basic.jl | 6 +++--- 3 files changed, 12 insertions(+), 14 deletions(-) diff --git a/src/kernels.jl b/src/kernels.jl index 0f743f942..ea70e1b8a 100644 --- a/src/kernels.jl +++ b/src/kernels.jl @@ -36,7 +36,7 @@ function pairwise_force_gpu!(buffers, sys::System{D, AT, T}, nbs = @view neighbors.list[1:neighbors.n] end if length(neighbors) > 0 - backend = get_backend(coords) + backend = get_backend(sys.coords) n_threads_gpu = gpu_threads_pairwise(length(nbs)) kernel! = pairwise_force_kernel_nl!(backend, n_threads_gpu) kernel!(buffers.fs_mat, sys.coords, sys.velocities, sys.atoms, sys.boundary, pairwise_inters, diff --git a/src/setup.jl b/src/setup.jl index ac371ca75..2dc91e3cb 100644 --- a/src/setup.jl +++ b/src/setup.jl @@ -887,15 +887,14 @@ function System(coord_file::AbstractString, end coords = wrap_coords.(coords, (boundary_used,)) - if AT <: AbstractGPUArray + if Symbol(AT) == :CuArray neighbor_finder = GPUNeighborFinder( eligible=AT(eligible), dist_cutoff=T(dist_neighbors), special=AT(special), n_steps_reorder=10, - initialized=false, ) - elseif use_cell_list + elseif use_cell_list && !(AT <: AbstractGPUArray) neighbor_finder = CellListMapNeighborFinder( eligible=eligible, special=special, @@ -906,8 +905,8 @@ function System(coord_file::AbstractString, ) else neighbor_finder = DistanceNeighborFinder( - eligible=eligible, - special=special, + eligible=AT(eligible), + special=AT(special), n_steps=10, dist_cutoff=T(dist_neighbors), ) @@ -1280,15 +1279,14 @@ function System(T::Type, end specific_inter_lists = tuple(specific_inter_array...) - if AT <: AbstractGPUArray + if Symbol(AT) == :CuArray neighbor_finder = GPUNeighborFinder( eligible=AT(eligible), dist_cutoff=T(dist_neighbors), special=AT(special), n_steps_reorder=10, - initialized=false, ) - elseif use_cell_list + elseif use_cell_list && !(AT <: AbstractGPUArray) neighbor_finder = CellListMapNeighborFinder( eligible=eligible, special=special, @@ -1299,8 +1297,8 @@ function System(T::Type, ) else neighbor_finder = DistanceNeighborFinder( - eligible=eligible, - special=special, + eligible=AT(eligible), + special=AT(special), n_steps=10, dist_cutoff=T(dist_neighbors), ) diff --git a/test/basic.jl b/test/basic.jl index b6a218f3a..51542c1b7 100644 --- a/test/basic.jl +++ b/test/basic.jl @@ -184,14 +184,14 @@ # Mark all pairs as ineligible for pairwise interactions and check that the # potential energy from the specific interactions does not change on scaling no_nbs = falses(length(sys), length(sys)) - if AT <: AbstractGPUArray + if AT <: CuArray sys.neighbor_finder = GPUNeighborFinder( eligible=AT(no_nbs), dist_cutoff=1.0u"nm", ) - else + else sys.neighbor_finder = DistanceNeighborFinder( - eligible=no_nbs, + eligible=(AT <: Array ? 
no_nbs : AT(no_nbs)), dist_cutoff=1.0u"nm", ) end From 63b2aec18b79ee9cc5a23fae814696b64c09dec3 Mon Sep 17 00:00:00 2001 From: Joe Greener Date: Wed, 29 Jan 2025 19:19:59 +0000 Subject: [PATCH 14/24] neighbor finder test fix --- test/basic.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/basic.jl b/test/basic.jl index 51542c1b7..8c1d6fd85 100644 --- a/test/basic.jl +++ b/test/basic.jl @@ -191,7 +191,7 @@ ) else sys.neighbor_finder = DistanceNeighborFinder( - eligible=(AT <: Array ? no_nbs : AT(no_nbs)), + eligible=AT(no_nbs), dist_cutoff=1.0u"nm", ) end From f005fc2ba62bbc2503926e9c8d62ac68fb8f3f57 Mon Sep 17 00:00:00 2001 From: Joe Greener Date: Thu, 30 Jan 2025 11:41:13 +0000 Subject: [PATCH 15/24] function to determine GPU NF compat --- ext/MollyCUDAExt.jl | 2 ++ src/neighbors.jl | 3 +++ src/setup.jl | 4 ++-- 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/ext/MollyCUDAExt.jl b/ext/MollyCUDAExt.jl index 89efba5f3..de68d0807 100644 --- a/ext/MollyCUDAExt.jl +++ b/ext/MollyCUDAExt.jl @@ -7,6 +7,8 @@ using KernelAbstractions const WARPSIZE = UInt32(32) +Molly.uses_gpu_neighbor_finder(::Type{AT}) where {AT <: CuArray} = true + CUDA.Const(nl::Molly.NoNeighborList) = nl macro shfl_multiple_sync(mask, target, width, vars...) diff --git a/src/neighbors.jl b/src/neighbors.jl index 415e08801..dbece0938 100644 --- a/src/neighbors.jl +++ b/src/neighbors.jl @@ -43,6 +43,9 @@ find_neighbors(sys::System; kwargs...) = find_neighbors(sys, sys.neighbor_finder find_neighbors(sys::System, nf::NoNeighborFinder, args...; kwargs...) = nothing +# Indicates whether an array type is compatible with GPUNeighborFinder +uses_gpu_neighbor_finder(AT) = false + """ GPUNeighborFinder(; eligible, dist_cutoff, special, n_steps_reorder, initialized) diff --git a/src/setup.jl b/src/setup.jl index 2dc91e3cb..35b151f45 100644 --- a/src/setup.jl +++ b/src/setup.jl @@ -887,7 +887,7 @@ function System(coord_file::AbstractString, end coords = wrap_coords.(coords, (boundary_used,)) - if Symbol(AT) == :CuArray + if uses_gpu_neighbor_finder(AT) neighbor_finder = GPUNeighborFinder( eligible=AT(eligible), dist_cutoff=T(dist_neighbors), @@ -1279,7 +1279,7 @@ function System(T::Type, end specific_inter_lists = tuple(specific_inter_array...) 
- if Symbol(AT) == :CuArray + if uses_gpu_neighbor_finder(AT) neighbor_finder = GPUNeighborFinder( eligible=AT(eligible), dist_cutoff=T(dist_neighbors), From a1ab539a10ec17273dcc76ab79dd04ce3349514e Mon Sep 17 00:00:00 2001 From: Joe Greener Date: Thu, 30 Jan 2025 14:23:34 +0000 Subject: [PATCH 16/24] test on all available backends --- benchmark/benchmarks.jl | 10 +++++----- test/Project.toml | 7 ++++++- test/basic.jl | 7 ++----- test/gradients.jl | 32 +++++++++++--------------------- test/runtests.jl | 32 ++++++++++++++++++++++++++------ test/simulation.jl | 16 +++++----------- 6 files changed, 55 insertions(+), 49 deletions(-) diff --git a/benchmark/benchmarks.jl b/benchmark/benchmarks.jl index 38e16bd41..c790b4463 100644 --- a/benchmark/benchmarks.jl +++ b/benchmark/benchmarks.jl @@ -17,15 +17,15 @@ else @warn "The parallel benchmarks will not be run as Julia is running on 1 thread" end -# Allow CUDA device to be specified -const DEVICE = get(ENV, "DEVICE", "0") +# Allow GPU device to be specified +const DEVICE = parse(Int, get(ENV, "DEVICE", "0")) const run_cuda_tests = CUDA.functional() if run_cuda_tests - device!(parse(Int, DEVICE)) - @info "The GPU benchmarks will be run on device $DEVICE" + device!(DEVICE) + @info "The CUDA benchmarks will be run on device $DEVICE" else - @warn "The GPU benchmarks will not be run as a CUDA-enabled device is not available" + @warn "The CUDA benchmarks will not be run as a CUDA-enabled device is not available" end const SUITE = BenchmarkGroup( diff --git a/test/Project.toml b/test/Project.toml index 69fec6609..bbe5c8dc2 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -13,15 +13,20 @@ GLMakie = "e9467ef8-e4e7-5192-8a1a-b1aee30e663a" GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" KernelDensity = "5ab0869b-81aa-558d-bb23-cbf5423bbe9b" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +Metal = "dde4c033-4e86-420c-a63e-0dd931031962" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" SimpleCrystals = "64031d72-e220-11ed-1a7e-43a2532b2fa8" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" +oneAPI = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b" [compat] +AMDGPU = "1.2" Aqua = "0.8" AtomsBaseTesting = "0.4" +CUDA = "5" DelimitedFiles = "1.9" FiniteDifferences = "0.12" -GLMakie = "0.9, 0.10" +Metal = "1.5" Test = "1.9" +oneAPI = "2" diff --git a/test/basic.jl b/test/basic.jl index 8c1d6fd85..e510bf6e3 100644 --- a/test/basic.jl +++ b/test/basic.jl @@ -343,11 +343,8 @@ end coords_1 = SVector{3, Float64}.(eachcol(cm_1)) / 10 * u"nm" coords_2 = SVector{3, Float64}.(eachcol(cm_2)) / 10 * u"nm" @test rmsd(coords_1, coords_2) ≈ 2.54859467758795u"Å" - if run_cuda_tests - @test rmsd(CuArray(coords_1), CuArray(coords_2)) ≈ 2.54859467758795u"Å" - end - if run_rocm_tests - @test rmsd(ROCArray(coords_1), ROCArray(coords_2)) ≈ 2.54859467758795u"Å" + for AT in array_list[2:end] + @test rmsd(AT(coords_1), AT(coords_2)) ≈ 2.54859467758795u"Å" end bb_atoms = BioStructures.collectatoms(struc[1], BioStructures.backboneselector) diff --git a/test/gradients.jl b/test/gradients.jl index 752148e26..bdda204aa 100644 --- a/test/gradients.jl +++ b/test/gradients.jl @@ -44,23 +44,16 @@ end ("CPU gbn2 forward", Array, false, true , false, false, true , 0.5 , 0.1 ), ] if run_parallel_tests # gpu par fwd f32 obc2 gbn2 tol_σ tol_r0 - push!(runs, ("CPU parallel" , Array , true , false, false, false, false, 1e-4, 1e-4)) - push!(runs, ("CPU parallel forward", Array , true , true , false, false, false, 0.5 , 0.1 )) - push!(runs, ("CPU 
parallel f32" , Array , true , false, true , false, false, 0.01, 5e-4)) + push!(runs, ("CPU parallel" , Array, true , false, false, false, false, 1e-4, 1e-4)) + push!(runs, ("CPU parallel forward", Array, true , true , false, false, false, 0.5 , 0.1 )) + push!(runs, ("CPU parallel f32" , Array, true , false, true , false, false, 0.01, 5e-4)) end - if run_cuda_tests # gpu par fwd f32 obc2 gbn2 tol_σ tol_r0 - push!(runs, ("CUDA" , CuArray, false, false, false, false, false, 0.25, 20.0)) - push!(runs, ("CUDA forward" , CuArray, false, true , false, false, false, 0.25, 20.0)) - push!(runs, ("CUDA f32" , CuArray, false, false, true , false, false, 0.5 , 50.0)) - push!(runs, ("CUDA obc2" , CuArray, false, false, false, true , false, 0.25, 20.0)) - push!(runs, ("CUDA gbn2" , CuArray, false, false, false, false, true , 0.25, 20.0)) - end - if run_rocm_tests # gpu par fwd f32 obc2 gbn2 tol_σ tol_r0 - push!(runs, ("ROCM" , ROCArray, false, false, false, false, false, 0.25, 20.0)) - push!(runs, ("ROCM forward" , ROCArray, false, true , false, false, false, 0.25, 20.0)) - push!(runs, ("ROCM f32" , ROCArray, false, false, true , false, false, 0.5 , 50.0)) - push!(runs, ("ROCM obc2" , ROCArray, false, false, false, true , false, 0.25, 20.0)) - push!(runs, ("ROCM gbn2" , ROCArray, false, false, false, false, true , 0.25, 20.0)) + for AT in array_list[2:end] # gpu par fwd f32 obc2 gbn2 tol_σ tol_r0 + push!(runs, ("$AT" , AT , false, false, false, false, false, 0.25, 20.0)) + push!(runs, ("$AT forward" , AT , false, true , false, false, false, 0.25, 20.0)) + push!(runs, ("$AT f32" , AT , false, false, true , false, false, 0.5 , 50.0)) + push!(runs, ("$AT obc2" , AT , false, false, false, true , false, 0.25, 20.0)) + push!(runs, ("$AT gbn2" , AT , false, false, false, false, true , 0.25, 20.0)) end function mean_min_separation(coords, boundary, ::Val{T}) where T @@ -410,11 +403,8 @@ end if run_parallel_tests push!(platform_runs, ("CPU parallel", Array, true)) end - if run_cuda_tests - push!(platform_runs, ("CUDA", CuArray, false)) - end - if run_rocm_tests - push!(platform_runs, ("ROCM", ROCArray, false)) + for AT in array_list[2:end] + push!(platform_runs, ("$AT", AT, false)) end test_runs = [ ("Energy", test_energy_grad, 1e-8), diff --git a/test/runtests.jl b/test/runtests.jl index cfdf775c1..c60fd9c31 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -11,6 +11,8 @@ using Enzyme using FiniteDifferences using GPUArrays using KernelDensity +using Metal +using oneAPI import SimpleCrystals using DelimitedFiles @@ -51,17 +53,20 @@ else @warn "The parallel tests will not be run as Julia is running on 1 thread" end -# Allow CUDA device to be specified +const run_gpu_tests = get(ENV, "GPUTESTS", "1") != "0" +# Allow GPU device to be specified const DEVICE = parse(Int, get(ENV, "DEVICE", "0")) -const run_cuda_tests = get(ENV, "GPUTESTS", "1") != "0" && CUDA.functional() -const run_rocm_tests = get(ENV, "GPUTESTS", "1") != "0" && AMDGPU.functional() +const run_cuda_tests = run_gpu_tests && CUDA.functional() +const run_rocm_tests = run_gpu_tests && AMDGPU.functional() +const run_oneapi_tests = run_gpu_tests && oneAPI.functional() +const run_metal_tests = run_gpu_tests && Metal.functional() array_list = (Array,) if run_cuda_tests array_list = (array_list..., CuArray) - device!(DEVICE) + CUDA.device!(DEVICE) @info "The CUDA tests will be run on device $DEVICE" else @warn "The CUDA tests will not be run as a CUDA-enabled device is not available" @@ -70,9 +75,24 @@ end if run_rocm_tests array_list = 
(array_list..., ROCArray) AMDGPU.device!(AMDGPU.device(DEVICE)) - @info "The ROCM tests will be run on device $DEVICE" + @info "The AMDGPU tests will be run on device $DEVICE" else - @warn "The ROCM tests will not be run as a ROCM-enabled device is not available" + @warn "The AMDGPU tests will not be run as a AMDGPU-enabled device is not available" +end + +if run_oneapi_tests + array_list = (array_list..., oneArray) + oneAPI.device!(DEVICE) + @info "The oneAPI tests will be run on device $DEVICE" +else + @warn "The oneAPI tests will not be run as a oneAPI-enabled device is not available" +end + +if run_metal_tests + array_list = (array_list..., MtlArray) + @info "The Metal tests will be run" +else + @warn "The Metal tests will not be run as a Metal-enabled device is not available" end const data_dir = normpath(@__DIR__, "..", "data") diff --git a/test/simulation.jl b/test/simulation.jl index 12ac7ac4d..19b3af8bd 100644 --- a/test/simulation.jl +++ b/test/simulation.jl @@ -1396,17 +1396,11 @@ end push!(runs, ("CPU parallel NL" , [true , true , false, Array])) push!(runs, ("CPU parallel f32 NL", [true , true , true , Array])) end - if run_cuda_tests - push!(runs, ("GPU" , [false, false, false, CuArray])) - push!(runs, ("GPU f32" , [false, false, true , CuArray])) - push!(runs, ("GPU NL" , [true , false, false, CuArray])) - push!(runs, ("GPU f32 NL", [true , false, true , CuArray])) - end - if run_rocm_tests - push!(runs, ("GPU" , [false, false, false, ROCArray])) - push!(runs, ("GPU f32" , [false, false, true , ROCArray])) - push!(runs, ("GPU NL" , [true , false, false, ROCArray])) - push!(runs, ("GPU f32 NL", [true , false, true , ROCArray])) + for AT in array_list[2:end] + push!(runs, ("$AT" , [false, false, false, AT])) + push!(runs, ("$AT f32" , [false, false, true , AT])) + push!(runs, ("$AT NL" , [true , false, false, AT])) + push!(runs, ("$AT f32 NL", [true , false, true , AT])) end final_coords_ref, E_start_ref = test_sim(runs[1][2]...) From 66e08479fcedb9fedc58eec442cd00b460c37594 Mon Sep 17 00:00:00 2001 From: Joe Greener Date: Thu, 30 Jan 2025 14:24:41 +0000 Subject: [PATCH 17/24] fix neighbor finding on Metal devices --- README.md | 2 +- src/types.jl | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 2db2fa5d8..182d52f52 100644 --- a/README.md +++ b/README.md @@ -34,7 +34,7 @@ Implemented features include: - [Unitful.jl](https://github.com/PainterQubits/Unitful.jl) compatibility so numbers have physical meaning. - Set up crystal systems using [SimpleCrystals.jl](https://github.com/ejmeitz/SimpleCrystals.jl). - Automatic multithreading. -- GPU acceleration on CUDA-enabled devices. +- GPU acceleration on all backends supported by [KernelAbstractions.jl](https://github.com/JuliaGPU/KernelAbstractions.jl), with better performance on CUDA-enabled devices. - Run with Float64, Float32 or other float types. - Some analysis functions, e.g. RDF. - Visualise simulations as animations with [Makie.jl](https://makie.juliaplots.org/stable). 
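Editor's note: the README line added above, and the kernels moved into src/kernels.jl earlier in this series, rely on the standard KernelAbstractions pattern: define a kernel with @kernel (here with inbounds=true, as a later patch in the series moves Molly's kernels to), instantiate it for a backend and workgroup size, then launch it over an ndrange. A self-contained toy example of that pattern follows; the kernel name, arrays and the scale-and-add operation are invented for illustration, and only the KernelAbstractions calls themselves mirror what the Molly kernels use.

using KernelAbstractions

@kernel inbounds=true function scale_add_kernel!(y, @Const(x), a)
    i = @index(Global, Linear)
    if i <= length(y)
        y[i] += a * x[i]
    end
end

backend = CPU()  # CUDABackend(), ROCBackend(), oneAPIBackend() or MetalBackend()
                 # work the same way once the matching GPU package is loaded
x = rand(Float32, 1_000)
y = zeros(Float32, 1_000)
kernel! = scale_add_kernel!(backend, 256)  # specialise for backend and workgroup size
kernel!(y, x, 2.0f0; ndrange=length(y))    # launch over the whole index range
KernelAbstractions.synchronize(backend)

Because the same kernel definition compiles for every backend, only the array allocation and the backend object differ between devices, which is why the tests in this series can loop over array_list rather than special-casing CUDA.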
diff --git a/src/types.jl b/src/types.jl index 023883d33..0d049dc3f 100644 --- a/src/types.jl +++ b/src/types.jl @@ -399,8 +399,10 @@ n_atoms_to_n_pairs(n_atoms::Integer) = (n_atoms * (n_atoms - 1)) ÷ 2 Base.length(nl::NoNeighborList) = n_atoms_to_n_pairs(nl.n_atoms) function pair_index(n_atoms::Integer, ind::Integer) + T = Float32 # Float32 for compatibility with Metal devices kz = ind - 1 - iz = n_atoms - 2 - Int(floor(sqrt(-8 * kz + 4 * n_atoms * (n_atoms - 1) - 7) / 2 - 0.5)) + sq = sqrt(T(-8 * kz + 4 * n_atoms * (n_atoms - 1) - 7)) + iz = n_atoms - 2 - unsafe_trunc(Int, sq * T(0.5) - T(0.5)) jz = kz + iz + 1 - (n_atoms * (n_atoms - 1)) ÷ 2 + ((n_atoms - iz) * ((n_atoms - iz) - 1)) ÷ 2 i = iz + 1 j = jz + 1 From a6ffe0c8ed3d7ce61f1d9afebc316840dff8f95c Mon Sep 17 00:00:00 2001 From: Joe Greener Date: Thu, 30 Jan 2025 14:37:49 +0000 Subject: [PATCH 18/24] only run f32 tests on Metal --- test/Project.toml | 2 +- test/runtests.jl | 1 - test/simulation.jl | 4 ++++ 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/test/Project.toml b/test/Project.toml index bbe5c8dc2..c183b5756 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -21,7 +21,7 @@ Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" oneAPI = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b" [compat] -AMDGPU = "1.2" +AMDGPU = "1" Aqua = "0.8" AtomsBaseTesting = "0.4" CUDA = "5" diff --git a/test/runtests.jl b/test/runtests.jl index c60fd9c31..306628320 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -89,7 +89,6 @@ else end if run_metal_tests - array_list = (array_list..., MtlArray) @info "The Metal tests will be run" else @warn "The Metal tests will not be run as a Metal-enabled device is not available" diff --git a/test/simulation.jl b/test/simulation.jl index 19b3af8bd..ad4d2f125 100644 --- a/test/simulation.jl +++ b/test/simulation.jl @@ -1402,6 +1402,10 @@ end push!(runs, ("$AT NL" , [true , false, false, AT])) push!(runs, ("$AT f32 NL", [true , false, true , AT])) end + if run_metal_tests + push!(runs, ("$AT f32" , [false, false, true , AT])) + push!(runs, ("$AT f32 NL", [true , false, true , AT])) + end final_coords_ref, E_start_ref = test_sim(runs[1][2]...) 
# Check all simulations give the same result to within some error From bb12e414e00ab878e386771a5e1f1728c229fb9a Mon Sep 17 00:00:00 2001 From: James Schloss Date: Fri, 31 Jan 2025 16:32:32 +0100 Subject: [PATCH 19/24] adding a +1 to run on AMD --- test/runtests.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/runtests.jl b/test/runtests.jl index 306628320..3c602a14c 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -74,7 +74,7 @@ end if run_rocm_tests array_list = (array_list..., ROCArray) - AMDGPU.device!(AMDGPU.device(DEVICE)) + AMDGPU.device!(AMDGPU.device(DEVICE+1)) @info "The AMDGPU tests will be run on device $DEVICE" else @warn "The AMDGPU tests will not be run as a AMDGPU-enabled device is not available" From bd9a2bce90abafd3d3f1d0bcaae397ace5e4a5d0 Mon Sep 17 00:00:00 2001 From: Joe Greener Date: Fri, 31 Jan 2025 16:11:01 +0000 Subject: [PATCH 20/24] revert pair_index change --- src/types.jl | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/types.jl b/src/types.jl index 0d049dc3f..023883d33 100644 --- a/src/types.jl +++ b/src/types.jl @@ -399,10 +399,8 @@ n_atoms_to_n_pairs(n_atoms::Integer) = (n_atoms * (n_atoms - 1)) ÷ 2 Base.length(nl::NoNeighborList) = n_atoms_to_n_pairs(nl.n_atoms) function pair_index(n_atoms::Integer, ind::Integer) - T = Float32 # Float32 for compatibility with Metal devices kz = ind - 1 - sq = sqrt(T(-8 * kz + 4 * n_atoms * (n_atoms - 1) - 7)) - iz = n_atoms - 2 - unsafe_trunc(Int, sq * T(0.5) - T(0.5)) + iz = n_atoms - 2 - Int(floor(sqrt(-8 * kz + 4 * n_atoms * (n_atoms - 1) - 7) / 2 - 0.5)) jz = kz + iz + 1 - (n_atoms * (n_atoms - 1)) ÷ 2 + ((n_atoms - iz) * ((n_atoms - iz) - 1)) ÷ 2 i = iz + 1 j = jz + 1 From b32da720e69e7b4ac40175df8cefa96a6bcb0ce0 Mon Sep 17 00:00:00 2001 From: James Schloss Date: Sat, 1 Feb 2025 00:34:41 +0100 Subject: [PATCH 21/24] ensuring the distance neighborfinder uses the right arraytype for GPU tests --- test/energy_conservation.jl | 2 +- test/simulation.jl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/test/energy_conservation.jl b/test/energy_conservation.jl index fbd636281..22d020c4b 100644 --- a/test/energy_conservation.jl +++ b/test/energy_conservation.jl @@ -36,7 +36,7 @@ using Test ) else neighbor_finder=DistanceNeighborFinder( - eligible=trues(n_atoms, n_atoms), + eligible=AT(trues(n_atoms, n_atoms)), n_steps=10, dist_cutoff=dist_cutoff, ) diff --git a/test/simulation.jl b/test/simulation.jl index ad4d2f125..490bf30a9 100644 --- a/test/simulation.jl +++ b/test/simulation.jl @@ -15,7 +15,7 @@ ) else neighbor_finder = DistanceNeighborFinder( - eligible=trues(n_atoms, n_atoms), + eligible=AT(trues(n_atoms, n_atoms)), n_steps=10, dist_cutoff=2.0u"nm", ) From a01f914c154262ef6e82af0d7e87f4852894c633 Mon Sep 17 00:00:00 2001 From: James Schloss Date: Sat, 1 Feb 2025 19:48:40 +0100 Subject: [PATCH 22/24] moving inbounds propagation to be at the kernel level --- src/kernels.jl | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/src/kernels.jl b/src/kernels.jl index ea70e1b8a..b797356fc 100644 --- a/src/kernels.jl +++ b/src/kernels.jl @@ -45,7 +45,7 @@ function pairwise_force_gpu!(buffers, sys::System{D, AT, T}, return buffers end -@kernel function pairwise_force_kernel_nl!(forces, @Const(coords), +@kernel inbounds=true function pairwise_force_kernel_nl!(forces, @Const(coords), @Const(velocities), @Const(atoms), boundary, inters, @Const(neighbors), step_n, ::Val{D}, @@ -53,7 +53,7 @@ end inter_i = 
@index(Global, Linear) - @inbounds if inter_i <= length(neighbors) + if inter_i <= length(neighbors) i, j, special = neighbors[inter_i] f = sum_pairwise_forces(inters, atoms[i], atoms[j], Val(F), special, coords[i], coords[j], boundary, velocities[i], velocities[j], step_n) @@ -109,7 +109,7 @@ function specific_force_gpu!(fs_mat, inter_list::InteractionList4Atoms, coords:: return fs_mat end -@kernel function specific_force_1_atoms_kernel!(forces, @Const(coords), +@kernel inbounds=true function specific_force_1_atoms_kernel!(forces, @Const(coords), @Const(velocities), @Const(atoms), boundary, step_n, @Const(is), @@ -118,7 +118,7 @@ end inter_i = @index(Global, Linear) - @inbounds if inter_i <= length(is) + if inter_i <= length(is) i = is[inter_i] fs = force_gpu(inters[inter_i], coords[i], boundary, atoms[i], F, velocities[i], step_n) if unit(fs.f1[1]) != F @@ -130,7 +130,7 @@ end end end -@kernel function specific_force_2_atoms_kernel!(forces, @Const(coords), +@kernel inbounds=true function specific_force_2_atoms_kernel!(forces, @Const(coords), @Const(velocities), @Const(atoms), boundary, step_n, @Const(is), @Const(js), @@ -139,7 +139,7 @@ end inter_i = @index(Global, Linear) - @inbounds if inter_i <= length(is) + if inter_i <= length(is) i, j = is[inter_i], js[inter_i] fs = force_gpu(inters[inter_i], coords[i], coords[j], boundary, atoms[i], atoms[j], F, velocities[i], velocities[j], step_n) @@ -153,7 +153,7 @@ end end end -@kernel function specific_force_3_atoms_kernel!(forces, @Const(coords), +@kernel inbounds=true function specific_force_3_atoms_kernel!(forces, @Const(coords), @Const(velocities), @Const(atoms), boundary, step_n, @Const(is), @@ -163,7 +163,7 @@ end inter_i = @index(Global, Linear) - @inbounds if inter_i <= length(is) + if inter_i <= length(is) i, j, k = is[inter_i], js[inter_i], ks[inter_i] fs = force_gpu(inters[inter_i], coords[i], coords[j], coords[k], boundary, atoms[i], atoms[j], atoms[k], F, velocities[i], velocities[j], velocities[k], step_n) @@ -178,7 +178,7 @@ end end end -@kernel function specific_force_4_atoms_kernel!(forces, @Const(coords), +@kernel inbounds=true function specific_force_4_atoms_kernel!(forces, @Const(coords), @Const(velocities), @Const(atoms), boundary, step_n, @Const(is), @@ -189,7 +189,7 @@ end inter_i = @index(Global, Linear) - @inbounds if inter_i <= length(is) + if inter_i <= length(is) i, j, k, l = is[inter_i], js[inter_i], ks[inter_i], ls[inter_i] fs = force_gpu(inters[inter_i], coords[i], coords[j], coords[k], coords[l], boundary, atoms[i], atoms[j], atoms[k], atoms[l], F, velocities[i], velocities[j], @@ -227,13 +227,13 @@ function pairwise_pe_gpu!(pe_vec_nounits, buffers, sys::System{D, AT}, return pe_vec_nounits end -@kernel function pairwise_pe_kernel!(energy, @Const(coords), @Const(velocities), +@kernel inbounds=true function pairwise_pe_kernel!(energy, @Const(coords), @Const(velocities), @Const(atoms), boundary, inters, @Const(neighbors), step_n, ::Val{E}) where E inter_i = @index(Global, Linear) - @inbounds if inter_i <= length(neighbors) + if inter_i <= length(neighbors) i, j, special = neighbors[inter_i] coord_i, coord_j, vel_i, vel_j = coords[i], coords[j], velocities[i], velocities[j] dr = vector(coord_i, coord_j, boundary) @@ -293,12 +293,12 @@ function specific_pe_gpu!(pe_vec_nounits, inter_list::InteractionList4Atoms, coo return pe_vec_nounits end -@kernel function specific_pe_1_atoms_kernel!(energy, @Const(coords), @Const(velocities), +@kernel inbounds=true function specific_pe_1_atoms_kernel!(energy, @Const(coords), 
@Const(velocities), @Const(atoms), boundary, step_n, @Const(is), @Const(inters), ::Val{E}) where E inter_i = @index(Global, Linear) - @inbounds if inter_i <= length(is) + if inter_i <= length(is) i = is[inter_i] pe = potential_energy_gpu(inters[inter_i], coords[i], boundary, atoms[i], E, velocities[i], step_n) @@ -309,14 +309,14 @@ end end end -@kernel function specific_pe_2_atoms_kernel!(energy, @Const(coords), @Const(velocities), +@kernel inbounds=true function specific_pe_2_atoms_kernel!(energy, @Const(coords), @Const(velocities), @Const(atoms), boundary, step_n, @Const(is), @Const(js), @Const(inters), ::Val{E}) where E inter_i = @index(Global, Linear) - @inbounds if inter_i <= length(is) + if inter_i <= length(is) i, j = is[inter_i], js[inter_i] pe = potential_energy_gpu(inters[inter_i], coords[i], coords[j], boundary, atoms[i], atoms[j], E, velocities[i], velocities[j], step_n) @@ -327,13 +327,13 @@ end end end -@kernel function specific_pe_3_atoms_kernel!(energy, @Const(coords), @Const(velocities), +@kernel inbounds=true function specific_pe_3_atoms_kernel!(energy, @Const(coords), @Const(velocities), @Const(atoms), boundary, step_n, @Const(is), @Const(js), @Const(ks), @Const(inters), ::Val{E}) where E inter_i = @index(Global, Linear) - @inbounds if inter_i <= length(is) + if inter_i <= length(is) i, j, k = is[inter_i], js[inter_i], ks[inter_i] pe = potential_energy_gpu(inters[inter_i], coords[i], coords[j], coords[k], boundary, atoms[i], atoms[j], atoms[k], E, velocities[i], velocities[j], @@ -345,13 +345,13 @@ end end end -@kernel function specific_pe_4_atoms_kernel!(energy, @Const(coords), @Const(velocities), +@kernel inbounds=true function specific_pe_4_atoms_kernel!(energy, @Const(coords), @Const(velocities), @Const(atoms), boundary, step_n, @Const(is), @Const(js), @Const(ks), @Const(ls), @Const(inters), ::Val{E}) where E inter_i = @index(Global, Linear) - @inbounds if inter_i <= length(is) + if inter_i <= length(is) i, j, k, l = is[inter_i], js[inter_i], ks[inter_i], ls[inter_i] pe = potential_energy_gpu(inters[inter_i], coords[i], coords[j], coords[k], coords[l], boundary, atoms[i], atoms[j], atoms[k], atoms[l], E, From abbd81b799cf521d8687d7ddbd0e170bf7b5d463 Mon Sep 17 00:00:00 2001 From: Joe Greener Date: Mon, 3 Feb 2025 17:19:59 +0000 Subject: [PATCH 23/24] fix atomic kernel usage --- src/kernels.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/kernels.jl b/src/kernels.jl index b797356fc..92cb16f50 100644 --- a/src/kernels.jl +++ b/src/kernels.jl @@ -59,8 +59,8 @@ end boundary, velocities[i], velocities[j], step_n) for dim in 1:D fval = ustrip(f[dim]) - Atomix.@atomic forces[dim, i] = forces[dim, i] - fval - Atomix.@atomic forces[dim, j] = forces[dim, j] + fval + Atomix.@atomic forces[dim, i] += -fval + Atomix.@atomic forces[dim, j] += fval end end end From 1ccb6273178df7e7b22437494e9faeabceb7aef9 Mon Sep 17 00:00:00 2001 From: Joe Greener Date: Mon, 3 Feb 2025 17:55:12 +0000 Subject: [PATCH 24/24] disable membrane barostat test on non-CUDA backends --- test/simulation.jl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/simulation.jl b/test/simulation.jl index 490bf30a9..6b277609e 100644 --- a/test/simulation.jl +++ b/test/simulation.jl @@ -1200,7 +1200,8 @@ end MonteCarloMembraneBarostat(press, tens, temp, boundary; z_axis_fixed=true), ) - for AT in array_list + if run_cuda_tests + AT = CuArray for (barostat_i, barostat) in enumerate(barostat_test_set) if AT <: AbstractGPUArray && barostat_i != 2 continue